def group_clone_VJ_cdr3(dico_same_VJ, dicoSeq, Clone_threshold):
    """Group sequences that share a V/J identifier by CDR3 similarity.

    Sequences of each V/J identifier are visited in order. A sequence either
    joins an existing group (keyed by the CDR3 of the sequence that founded
    it) or founds a new group keyed by its own CDR3.

    A group is a candidate for a sequence when the similarity between the
    sequence's CDR3 and the group's key is at least ``Clone_threshold``:

    * equal lengths: ``1 - hamming_distance / len(key)``;
    * different lengths: only considered when the text before the first
      ``"*"`` in field 1 of the sequence record ends with ``"6"``
      (presumably a J6 gene call -- confirm against the caller); the
      similarity is then ``1 - levenshtein_distance / max(len)``.

    When several groups are candidates, the one with the highest global
    pairwise protein alignment score wins; ties go to the oldest group.

    Parameters
    ----------
    dico_same_VJ : mapping
        Maps a V/J identifier to an iterable of sequence identifiers.
        Identifiers are looked up in ``dicoSeq`` after ``rstrip()`` but are
        stored in the result exactly as given.
    dicoSeq : mapping
        Maps a stripped sequence identifier to an indexable record; index 1
        must be a string and index 2 is used as the CDR3 string.
    Clone_threshold : float
        Minimum similarity for a sequence to be attached to a group.

    Returns
    -------
    dict
        ``{VJ_ID: {group CDR3: [sequence identifiers]}}``.
    """
    VJ_ID_diff_CDR3 = {}
    for VJ_ID in dico_same_VJ:
        groups = {}
        VJ_ID_diff_CDR3[VJ_ID] = groups
        for seq in dico_same_VJ[VJ_ID]:
            record = dicoSeq[seq.rstrip()]
            CDR3_seq = record[2]

            # Collect every existing group whose key is similar enough.
            # The distance functions are only called once the matching
            # length condition holds, and each similarity is computed once.
            candidates = []
            for g in groups:
                if len(CDR3_seq) == len(g):
                    similarity = 1 - hamming_distance(CDR3_seq, g) / float(len(g))
                elif record[1].split("*")[0][-1] == "6":
                    length = max(len(CDR3_seq), len(g))
                    similarity = 1 - levenshtein_distance(CDR3_seq, g) / float(length)
                else:
                    continue
                if similarity >= Clone_threshold:
                    candidates.append(g)

            if not candidates:
                # No similar group (or no group yet): found a new one.
                groups[CDR3_seq] = [seq]
                continue

            # Break ties between candidates with a global protein alignment.
            # The third positional argument (25) is the gap-open penalty and
            # element 1 of the returned tuple is the alignment score.
            query = skbio.Protein(CDR3_seq, metadata={'id': "CDR3_seq"})
            alignment_scores = {}
            for g in candidates:
                target = skbio.Protein(g, metadata={'id': "key"})
                result = skbio.alignment.global_pairwise_align_protein(
                    query, target, 25)
                alignment_scores[g] = float(result[1])

            # max() keeps the first maximum, i.e. the oldest group on ties.
            best_group = max(alignment_scores, key=alignment_scores.get)
            groups[best_group].append(seq)
    return VJ_ID_diff_CDR3
def trim(OGid):
    """Trim the alignment identified by ``OGid`` and write it as Stockholm.

    The alignment is read from ``../align_fastas1/out/{OGid}.mfa``, falling
    back to ``../align_fastas2-2/out/{OGid}.mfa`` when the first file does
    not exist. Symbols removed by either ``trim_conserved`` or
    ``trim_insertions`` become gaps; columns that end up gap-only are then
    restored from the untrimmed alignment and marked ``'.'`` in the ``RF``
    positional metadata (all other columns are marked ``'x'``). The result
    is written to ``out/{OGid}.sto``.

    Trimming parameters come from the module-level ``tp`` mapping and the
    module-level ``matrix``. Returns ``None``.
    """

    def gap_mask(records):
        # Boolean array with True wherever a record holds '-'.
        # The column count is taken from the first record.
        flags = np.full((len(records), len(records[0][1])), False)
        for row, (_, symbols) in enumerate(records):
            for col, symbol in enumerate(symbols):
                if symbol == '-':
                    flags[row, col] = True
        return flags

    # 0 Load MSA
    try:
        records = read_fasta(f'../align_fastas1/out/{OGid}.mfa')
    except FileNotFoundError:
        records = read_fasta(f'../align_fastas2-2/out/{OGid}.mfa')

    # 1 Calculate shared variables
    gaps_array = gap_mask(records)
    scores = gaps_array.sum(axis=0)  # Number of gaps in each column
    proteins = []
    for header, seq in records:
        proteins.append(skbio.Protein(seq, metadata={'description': header}))
    msa1 = skbio.TabularMSA(proteins)

    # 2 Get trims (segments and columns)
    conserved_keys = ('con_frac', 'con_window', 'con_minlen', 'con_rate', 'con_minsig')
    insertion_keys = ('gap_num', 'gap_rate', 'gap_minsig', 'nongap_frac', 'nongap_minlen',
                      'gp_sigma', 'gd_window', 'indel1_rate', 'indel2_rate', 'weights',
                      'threshold')
    syms_list1 = trim_conserved(msa1, scores, gaps_array,
                                *(tp[key] for key in conserved_keys))
    syms_list2, _ = trim_insertions(msa1, scores, gaps_array,
                                    *(tp[key] for key in insertion_keys), matrix)

    # 3 Combine trims (segments and columns) to yield final alignment
    msa2 = []
    for seq, syms1, syms2 in zip(msa1, syms_list1, syms_list2):
        # The two versions only differ where one of them was converted to a gap
        merged = [sym1 if sym1 == sym2 else '-' for sym1, sym2 in zip(syms1, syms2)]
        msa2.append((seq.metadata['description'], merged))

    # 4 Restore gap only columns
    gaps_array = gap_mask(msa2)
    scores = gaps_array.sum(axis=0)
    rf = ['x'] * len(msa2[0][1])  # Metadata for marking consensus columns in profile HMM
    labeled, _ = ndimage.label(scores == len(msa2))
    for (region,) in ndimage.find_objects(labeled):
        rf[region] = ['.'] * (region.stop - region.start)
        for row, (_, merged) in enumerate(msa2):
            merged[region] = list(str(msa1[row, region]))

    # 5 Write to file
    final_sequences = []
    for header, merged in msa2:
        final_sequences.append(skbio.Protein(''.join(merged), metadata={'description': header}))
    final_msa = skbio.TabularMSA(final_sequences, positional_metadata={'RF': rf})
    final_msa.write(f'out/{OGid}.sto', 'stockholm')
def _series_to_fasta_format(ff, data, sequence_type="DNA"): with ff.open() as f: for id_, seq in data.iteritems(): if sequence_type == "protein": sequence = skbio.Protein(seq, metadata={'id': id_}) elif sequence_type == "DNA": sequence = skbio.DNA(seq, metadata={'id': id_}) elif sequence_type == "RNA": sequence = skbio.RNA(seq, metadata={'id': id_}) else: raise NotImplementedError( "pd.Series can only be converted to DNA or " "protein FASTA format.") skbio.io.write(sequence, format='fasta', into=f)
# Build one record per segment reported by get_segments() for every OGid.
rows = []
for OGid in OGids:
    # Read the alignment, falling back to the second directory when the
    # file is missing from the first one.
    try:
        msa = read_fasta(f'../align_fastas1/out/{OGid}.mfa')
    except FileNotFoundError:
        msa = read_fasta(f'../align_fastas2-2/out/{OGid}.mfa')

    # Boolean array with True wherever a sequence holds a gap symbol;
    # the column count is taken from the first record.
    gaps_array = np.full((len(msa), len(msa[0][1])), False)
    for row_idx, (_, symbols) in enumerate(msa):
        for col_idx, symbol in enumerate(symbols):
            if symbol != '-':
                continue
            gaps_array[row_idx, col_idx] = True
    scores = gaps_array.sum(axis=0)  # Number of gaps in each column

    proteins = []
    for header, symbols in msa:
        proteins.append(skbio.Protein(symbols, metadata={'description': header}))
    msa = skbio.TabularMSA(proteins)

    # Label contiguous runs of columns in which at most tp['gap_num']
    # sequences carry a non-gap symbol.
    non_gap_counts = len(msa) - scores
    mask, _ = ndimage.label(non_gap_counts <= tp['gap_num'])
    regions = []
    for (region,) in ndimage.find_objects(mask):
        regions.append(region)

    for region in regions:
        for segment in get_segments(msa, region, matrix):
            span = segment['region']
            row = {
                'OGid': OGid,
                'start': span.start,
                'stop': span.stop,
                'index': segment['index'],
                # Total number of columns covered by the segment's slices
                'length': sum(piece.stop - piece.start for piece in segment['slices']),
            }
            rows.append(row)