Ejemplo n.º 1
0
def fasta_get_freq(seq,start= 0,end= 0,step= 1,ksize=3,bases= 'ATCG'):
    '''
    Return count of kmer across fasta region.

    Windows of length `ksize` are read from `seq` at offsets
    start, start + step, ... and each one increments a counter in the
    container created by `kmer_dict_init(ksize, bases)`. The counter is
    addressed through `get_by_path` using every character of the window but
    the last, then keyed by the last character.

    Parameters:
    - seq: sequence to scan; it is sliced as seq[i:i + ksize].
    - start: offset of the first window.
    - end: exclusive upper bound on window offsets. 0 (the default) means
        "scan up to and including the last full window". Values past the
        last full window are clamped to it.
    - step: distance between consecutive window offsets.
    - ksize: window (k-mer) length.
    - bases: alphabet forwarded to `kmer_dict_init`.

    Returns:
    - the container built by `kmer_dict_init`, with counts incremented.

    Windows containing 'N' are skipped and not counted.
    '''
    kmer_dict= kmer_dict_init(ksize= ksize,bases=bases)

    # The last full window starts at len(seq) - ksize; range() excludes its
    # stop value, so the bound has to be one past that offset. Clamping a
    # caller-supplied end to the same bound keeps truncated windows (shorter
    # than ksize) from being indexed as if they were complete k-mers.
    last_bound= len(seq) - ksize + 1
    if end == 0 or end > last_bound:
        end= last_bound

    for ki in range(start,end,step):
        kmer= seq[ki:ki+ksize]
        if 'N' in kmer:
            continue
        get_by_path(kmer_dict, kmer[:-1])[kmer[-1]] += 1

    return kmer_dict
Ejemplo n.º 2
0
def kmer_freq_balance(kmer_dict, mutations, fasta_len= 10000, bases= 'ACGT',ksize= 3):
    '''
    Return, for every entry of `mutations`, a looked-up value scaled by the
    window count.

    For each entry, its first element is used as the path handed to
    `get_by_path(kmer_dict, ...)`; the value found there is divided by
    `fasta_len - ksize`.

    Parameters:
    - kmer_dict: container queried through `get_by_path`.
    - mutations: sequence of entries; only element [0] of each is read.
    - fasta_len: sequence length used for the denominator.
    - bases: accepted but not used in this function.
    - ksize: k-mer length used for the denominator.

    Returns:
    - numpy array of shape (1, len(mutations)), in the order of `mutations`.

    NOTE(review): a sequence of length L holds L - ksize + 1 windows of
    length ksize, while the denominator here is fasta_len - ksize. Confirm
    this matches how kmer_dict was tallied.
    '''
    n_windows= fasta_len - ksize

    scaled= [get_by_path(kmer_dict, entry[0]) / n_windows for entry in mutations]

    return np.array(scaled).reshape(1,-1)
Ejemplo n.º 3
0
def vcf_muts_matrix_v1(refseq,summary,start= 0,end= 0,ksize= 3,bases='ATCG', collapse= True):
    '''
    Return matrix of mutation contexts by SNP in genotype array.

    Each row of `summary` whose position falls inside [start, end] is encoded
    as a vector holding a single 1 at the index of its mutation (reference
    k-mer followed by the substituted allele). The vectors are stacked and
    transposed, so each column of the result is one variant.
    - v1 determines if alternative allele = reference allele in fasta.
        if so, allele is switched, position idx is flagged.

    Parameters:
    - refseq: reference sequence, sliced with 0-based positions.
    - summary: table exposing .POS, .REF and .ALT (indexed by row number)
        and .shape. POS is converted with int() and shifted by -1.
    - start, end: inclusive bounds on the 0-based position. end == 0 means
        max(summary.POS).
    - ksize: context length; the variant sits at index ksize // 2 of it.
    - bases: alphabet forwarded to `get_mutations`.
    - collapse: if True, indices come from the first value returned by
        `kmer_comp_index` and vectors have length len(kmer_idx). Otherwise
        indices come from `get_by_path` on the `kmer_mut_index` result and
        vectors have length len(mutations).

    Returns (pos_mut, flag_reverse, flag_remove):
    - pos_mut: transposed numpy array of the per-variant vectors.
    - flag_reverse: row numbers where ALT matched the reference base, so REF
        was used as the substituted allele instead.
    - flag_remove: row numbers skipped because their context contains 'N'.
        These rows have no column in pos_mut.

    Rows outside [start, end] are skipped without being recorded. Rows whose
    context cannot be built (the window runs past either end of refseq, or
    context plus allele is not ksize + 1 characters) get an all-zero column,
    and diagnostics are printed.
    '''

    mutations= get_mutations(bases= bases,ksize= ksize)
    kmers, kmer_idx= kmer_comp_index(mutations)

    mut_lib= kmer_mut_index(mutations)

    # max() raises on an empty table; with no rows the bound is never used.
    if end == 0 and summary.shape[0] > 0:
        end= max(summary.POS)

    k5= ksize // 2
    k3= ksize - k5
    # Vector length depends only on `collapse`, so compute it once.
    n_features= len(kmer_idx) if collapse else len(mutations)

    pos_mut= []
    flag_reverse= []
    flag_remove= []

    for x in range(summary.shape[0]):
        pos= int(summary.POS[x]) - 1
        if pos < start or pos > end:
            continue

        # A negative slice start would wrap around to the tail of refseq and
        # return unrelated bases, so such a window is treated as unavailable.
        win_start= pos - k5
        kmer= refseq[win_start: pos + k3] if win_start >= 0 else ''
        if 'N' in kmer:
            flag_remove.append(x)
            continue

        alt_allele= summary.ALT[x]
        mut= kmer + alt_allele

        # The variant base is at index k5 of the window. The length guard
        # keeps a truncated window from raising IndexError.
        if len(kmer) > k5 and kmer[k5] == alt_allele:
            flag_reverse.append(x)
            mut= kmer+summary.REF[x]

        # Checking the k-mer length separately stops a truncated window plus
        # a multi-character allele from passing as a valid mutation.
        if len(kmer) != ksize or len(mut) != ksize + 1:
            print(kmer)
            print(summary.REF[x],summary.ALT[x])
            print(x,pos)
            print(len(refseq),summary.shape[0])
            pos_mut.append(np.zeros(n_features))
            continue

        if collapse:
            mut_index= kmers[mut]
        else:
            mut_index= get_by_path(mut_lib, list(mut))

        mut_array= np.zeros(n_features)
        mut_array[mut_index]= 1
        pos_mut.append(mut_array)

    pos_mut= np.array(pos_mut).T

    return pos_mut, flag_reverse, flag_remove