def abundance_greedy_clustering(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    # Get all the non-chimeric sequences
    generator = chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size)
    # Add the first sequence to the list of OTUs
    otu_final = []
    otu_final.append(next(generator))
    # For each other sequence
    for sequence, occ in generator:
        is_otu = True
        # Compare it to each sequence in the final list
        # (because the other sequences have a lower occurrence)
        for sequence2, occ2 in otu_final:
            alignement = nw.global_align(sequence, sequence2, gap_open=-1, gap_extend=-1,
                                         matrix=os.path.abspath(
                                             os.path.join(os.path.dirname(__file__), "MATCH")))
            if occ2 > occ and get_identity(alignement) > 97:
                is_otu = False
                break
        if is_otu:
            otu_final.append((sequence, occ))
    return otu_final
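These examples lean on a get_identity helper that is never shown. A minimal sketch, assuming nw.global_align returns a pair of equal-length gapped strings and that identity is the percentage of matching columns:

def get_identity(alignment):
    # alignment is assumed to be the (gapped_seq1, gapped_seq2) pair
    # returned by nw.global_align; both strings have the same length.
    gapped1, gapped2 = alignment[0], alignment[1]
    matches = sum(base1 == base2 for base1, base2 in zip(gapped1, gapped2))
    return matches / len(gapped1) * 100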
def init_pop(self, lines_list):
    pop = []
    for c in range(self.chromosomes):
        # New chromosome
        lines_list_aux = []
        # Use nwalign to compute the pairwise alignments
        # with the Needleman-Wunsch algorithm
        for i in range(len(lines_list)):
            alignments = []
            # Compute the pairwise alignments
            for j in range(len(lines_list)):
                if i != j:
                    curr_alignment = nw.global_align(lines_list[i], lines_list[j])
                    alignments.append(curr_alignment)
            # Randomly select an alignment and keep its first gapped sequence
            alignment = random.choice(alignments)
            alignment = alignment[0]
            lines_list_aux.append(alignment)
        # Add the generated chromosome and print it
        lines_list_aux = Utils.add_gaps(lines_list_aux)
        pop.append({"chromosome": lines_list_aux, "evaluation": 0})
        print("\nChromosome " + str(c + 1) + ":")
        Utils.print_chromosome(lines_list_aux)
    # Initial population
    return pop
def chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """Yield the non-chimeric sequences as [sequence, count]."""
    dfr_lst = list(dereplication_fulllength(amplicon_file, minseqlen, mincount))
    # Index every dereplicated sequence by its k-mers
    kmer_dict = {}
    for seq_id, seq in enumerate(dfr_lst):
        kmer_dict = get_kmer_dict(kmer_dict, seq[0], seq_id, kmer_size)
    for candidate in dfr_lst:
        chunks = get_chunks(candidate[0], chunk_size)
        # Ids of the sequences sharing k-mers with each segment
        chunk_mates = [search_mates(kmer_dict, chunk, kmer_size) for chunk in chunks]
        # Keep the ids common to every segment: the potential parents
        com = chunk_mates[0]
        for mates in chunk_mates[1:]:
            com = [mate for mate in com if mate in mates]
        is_chimera = False
        if len(com) >= 2:
            # Per-segment identity against the two best parents
            perc_identity_matrix = [[] for _ in range(len(chunks))]
            for parent_id in com[0:2]:
                parent_chunks = get_chunks(dfr_lst[parent_id][0], chunk_size)
                for k, chunk in enumerate(chunks):
                    align = nw.global_align(chunk, parent_chunks[k])
                    perc_identity_matrix[k].append(get_identity(align))
            is_chimera = detect_chimera(perc_identity_matrix)
        if not is_chimera:
            yield candidate
def abundance_greedy_clustering(
    amplicon_file, minseqlen, mincount, chunk_size, kmer_size
):
    output = []
    not_chimeric = list(
        chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size)
    )
    for index, sequence in enumerate(not_chimeric):
        # Sequences before this index all have a higher abundance
        abund_sequences = not_chimeric[:index]
        if len(abund_sequences) < 1:
            output += [sequence]
        else:
            valid = True
            for abund_seq in abund_sequences:
                alignment_list = nw.global_align(
                    sequence[0],
                    abund_seq[0],
                    matrix=os.path.abspath(
                        os.path.join(os.path.dirname(__file__), "MATCH")
                    ),
                )
                similarity = get_identity(alignment_list)
                if similarity > 97:
                    valid = False
                    break
            if valid:
                output += [sequence]
    return output
def abundance_greedy_clustering(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """
    abundance_greedy_clustering calls chimera_removal and measures
    identity with get_identity. It returns a list of OTUs; each entry
    carries the sequence and its occurrence count.
    """
    otu = []
    for i, seq in enumerate(
            chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size)):
        if i == 0:
            otu.append(seq)
        else:
            is_otu = True
            for seq_otu in otu:
                idt = get_identity(
                    nw.global_align(seq_otu[0], seq[0], gap_open=-1, gap_extend=-1,
                                    matrix=os.path.abspath(
                                        os.path.join(os.path.dirname(__file__),
                                                     '../agc', 'MATCH'))))
                if idt > 97:
                    is_otu = False
                    break
            if is_otu:
                otu.append(seq)
    return otu
def abundance_greedy_clustering(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """
    @brief : Greedy clustering of sequences.
    @param amplicon_file : string, path to the input file.
    @param minseqlen : int, minimum sequence length.
    @param mincount : int, minimum sequence count.
    @param chunk_size : int, segment size.
    @param kmer_size : int, k-mer size.
    @returns : list, list of OTUs.
    """
    otu_list = []
    non_chimeras = list(
        chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size))
    for i in range(len(non_chimeras)):
        otu = True
        # A sequence is not an OTU if a more abundant one is >97% identical
        for j in range(i):
            if get_identity(nw.global_align(non_chimeras[i][0], non_chimeras[j][0])) > 97 and \
                    non_chimeras[j][1] >= non_chimeras[i][1]:
                otu = False
                break
        if otu:
            otu_list.append(non_chimeras[i])
    return otu_list
def run_NW_align(seq1: str, seq2: str):
    # Global alignment with a specified penalty for gap open and extend
    out_align = nw.global_align(seq1, seq2, gap_open=-10, gap_extend=-5,
                                match=12, matrix='BLOSUM62')
    return out_align
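For reference, a minimal call without a scoring matrix. The return value is a pair of gapped strings; the exact gapping depends on the penalties and the nwalign version in use:

import nwalign3 as nw

# global_align returns two equal-length gapped strings.
aligned1, aligned2 = nw.global_align("CEELECANTH", "PELICAN")
print(aligned1)  # CEELECANTH
print(aligned2)  # with default scoring, something like -PE-LICAN-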
def _align(self):
    matrix = self._get_matrix_file(match=self._match,
                                   mismatch=self._mismatch,
                                   matrix=self._matrix)
    aln = nw.global_align(self.query.sequence,
                          self.target.sequence,
                          gap_open=self._gap_open,
                          gap_extend=self._gap_extend,
                          matrix=matrix)
    return aln
def chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """Returns a generator of non-chimeric sequences

    :Parameters:
        amplicon_file: Path to the amplicon file
        minseqlen: Minimal length of sequences (int)
        mincount: Minimum count (int)
        chunk_size: Sub-sequence length (int)
        kmer_size: Size of k-mers (int)
    Returns: generator of non-chimeric sequences
    """
    non_chimera_seq_list = []
    id_seq = 0
    kmer_dict = {}
    # Sequence generator
    read = dereplication_fulllength(amplicon_file, minseqlen, mincount)
    # Evaluate each sequence
    for seq, value in read:
        mate_seq_list_id = []
        # List of segments
        chunk_list = list(get_chunks(seq, chunk_size))
        # Build a list of ids of non-chimeric mate sequences for each segment
        for chunk in chunk_list:
            mate_seq_list_id.append(search_mates(kmer_dict, chunk, kmer_size))
        # Find parent sequences, if any
        parent_seq_list_id = common(common(mate_seq_list_id[0], mate_seq_list_id[1]),
                                    common(mate_seq_list_id[2], mate_seq_list_id[3]))
        perc_identity_matrix = [[] for _ in range(len(chunk_list))]
        chimera = False
        # If there are at least 2 parents
        if len(parent_seq_list_id) >= 2:
            # Then compute the matrix of identity percentages
            for parent in parent_seq_list_id[:2]:
                # List of segments of the parent
                chunk_list_p = list(get_chunks(non_chimera_seq_list[parent][0],
                                               chunk_size))
                for i in range(len(chunk_list)):
                    # Align the two segments
                    alignment_list = nw.global_align(
                        chunk_list[i], chunk_list_p[i], gap_open=-1, gap_extend=-1,
                        matrix=os.path.abspath(
                            os.path.join(os.path.dirname(__file__), "MATCH")))
                    # Compute their identity
                    identity = get_identity(alignment_list)
                    perc_identity_matrix[i].append(identity)
            # Finally, check whether the candidate sequence is a chimera
            chimera = detect_chimera(perc_identity_matrix)
        # If it is not
        if not chimera:
            # Add it to the non-chimeric sequence list
            non_chimera_seq_list.append([seq, value])
            # Also add its k-mers to kmer_dict
            for chunk in chunk_list:
                kmer_dict = get_unique_kmer(kmer_dict, chunk, id_seq, kmer_size)
            id_seq += 1
            # And yield it with its count
            yield [seq, value]
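detect_chimera is assumed above but never shown. A sketch consistent with the inlined logic in a later example (mean of the per-segment standard deviations above 5, with segments disagreeing on the closest parent); the thresholds are assumptions:

import statistics

def detect_chimera(perc_identity_matrix):
    # One row per segment, one column per parent.
    std_list = [statistics.stdev(row) for row in perc_identity_matrix]
    # Do different segments favour different parents?
    closest_parents = {row.index(max(row)) for row in perc_identity_matrix}
    return statistics.mean(std_list) > 5 and len(closest_parents) >= 2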
def chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """Return a generator of non-chimeric sequences. Format: yield [seq, count]"""
    gen_seq = list(dereplication_fulllength(amplicon_file, minseqlen, mincount))
    seq_candidate1 = gen_seq[0]
    seq_candidate2 = gen_seq[1]
    seq_candidate1_chunks = get_chunks(seq_candidate1[0], chunk_size)
    seq_candidate2_chunks = get_chunks(seq_candidate2[0], chunk_size)
    # Count, for every other sequence, how many of its chunk k-mers also
    # occur in the two candidate sequences
    kmers_candidate1 = {kmer for chunk in seq_candidate1_chunks
                        for kmer in cut_kmer(chunk, kmer_size)}
    kmers_candidate2 = {kmer for chunk in seq_candidate2_chunks
                        for kmer in cut_kmer(chunk, kmer_size)}
    subseq_list = [get_chunks(gen_seq[i][0], chunk_size)
                   for i in range(1, len(gen_seq))]
    dict_simil = {}
    for index, subseq in enumerate(subseq_list):
        if len(dict_simil) >= 8:
            break
        for chunk in subseq:
            for kmer in cut_kmer(chunk, kmer_size):
                if kmer in kmers_candidate1 or kmer in kmers_candidate2:
                    dict_simil[index] = dict_simil.get(index, 0) + 1
    # Keep at most two potential parents (the most similar sequences)
    seq_parents = []
    for index in sorted(dict_simil, key=dict_simil.get, reverse=True):
        if len(seq_parents) >= 2:
            break
        seq_parents.append(gen_seq[index + 1][0])
    # Per-segment identity of each parent against both candidates
    identity_list = []
    for elm in seq_parents:
        parent_chunks = get_chunks(elm, chunk_size)
        for i in range(len(parent_chunks)):
            alignment_list = nw.global_align(
                parent_chunks[i], seq_candidate1_chunks[i], gap_open=-1, gap_extend=-1,
                matrix=os.path.abspath(os.path.join(os.path.dirname(__file__), "MATCH")))
            identityc1 = get_identity(alignment_list)
            alignment_list = nw.global_align(
                parent_chunks[i], seq_candidate2_chunks[i], gap_open=-1, gap_extend=-1,
                matrix=os.path.abspath(os.path.join(os.path.dirname(__file__), "MATCH")))
            identityc2 = get_identity(alignment_list)
            identity_list.append([identityc1, identityc2])
    # Note: identity_list is computed but never used to filter; as written,
    # every dereplicated sequence is yielded.
    for elm in gen_seq:
        yield [elm[0], elm[1]]
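cut_kmer, used above, is also left undefined in these examples. A minimal sketch, assuming it yields every overlapping k-mer:

def cut_kmer(sequence, kmer_size):
    # Yield every overlapping k-mer of the sequence.
    for i in range(len(sequence) - kmer_size + 1):
        yield sequence[i:i + kmer_size]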
def abundance_greedy_clustering(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """Return the list of OTUs"""
    otu_list = []
    non_chimeras = []
    for elm in chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
        non_chimeras.append(elm)
    for i in range(len(non_chimeras)):
        alignement_list = nw.global_align(
            non_chimeras[i][0], non_chimeras[0][0], gap_open=-1, gap_extend=-1,
            matrix=os.path.abspath(os.path.join(os.path.dirname(__file__), "MATCH")))
        identity = get_identity(alignement_list)
        # get_identity returns a percentage, so compare against 97, not 0.97
        if identity >= 97:
            otu_list.append([non_chimeras[i][0], non_chimeras[i][1]])
    return otu_list
def get_identity_matrix(chunks, parents, sequence_bank, chunk_size):
    """Get the identity matrix between a sequence and 2 parents.

    :Parameters:
        chunks: Chunks from the candidate sequence.
        parents: Parent sequences from the candidate sequence.
        sequence_bank: List of sequences that are not chimeras.
        chunk_size: Size of the chunks.
    """
    perc_identity_matrix = [[] for _ in range(len(chunks))]
    for parent in parents:
        parent_chunks = get_chunks(sequence_bank[parent], chunk_size)
        for index, chunk in enumerate(chunks):
            alignment = nw.global_align(chunk, parent_chunks[index])
            identity = get_identity(alignment)
            perc_identity_matrix[index].append(identity)
    return perc_identity_matrix
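get_chunks appears in almost every snippet without being defined. A minimal sketch, assuming non-overlapping segments and that a partial trailing segment is dropped:

def get_chunks(sequence, chunk_size):
    # Non-overlapping, full-length segments only; the trailing partial
    # segment (if any) is discarded.
    return [sequence[i:i + chunk_size]
            for i in range(0, len(sequence) - chunk_size + 1, chunk_size)]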
def _pairwise_align(consensus_info, ref_genome_info,
                    ref_first_orf_start_1based, ref_last_orf_end_1based):
    ref_first_orf_start_0based = ref_first_orf_start_1based - 1
    ref_last_orf_end_0based = ref_last_orf_end_1based - 1

    ref_gapped_seq, consensus_gapped_seq = nw.global_align(
        ref_genome_info[1], consensus_info[1])

    num_ref_gaps_in_orf_region = num_cons_gaps_in_orf_region = 0
    curr_ref_index = curr_cons_index = -1
    cons_first_orf_start_0based = cons_last_orf_end_0based = None
    for gapped_index in range(len(ref_gapped_seq)):
        curr_cons_base = consensus_gapped_seq[gapped_index]
        if curr_cons_base != "-":
            curr_cons_index += 1
        else:
            # if we are within the ORF-containing region, keep
            # track of how many gap bases we see
            if cons_first_orf_start_0based is not None and \
                    cons_last_orf_end_0based is None:
                num_cons_gaps_in_orf_region += 1

        curr_ref_base = ref_gapped_seq[gapped_index]
        if curr_ref_base != "-":
            curr_ref_index += 1
            if curr_ref_index == ref_first_orf_start_0based:
                cons_first_orf_start_0based = curr_cons_index
            elif curr_ref_index == ref_last_orf_end_0based:
                cons_last_orf_end_0based = curr_cons_index
                break
        else:
            if cons_first_orf_start_0based is not None and \
                    cons_last_orf_end_0based is None:
                num_ref_gaps_in_orf_region += 1

    result = {}
    result[CONS_SEQ_NAME] = consensus_info[0]
    result[REF_SEQ_NAME] = ref_genome_info[0]
    result[REF_ALIGNMENT] = ref_gapped_seq
    result[CONS_ALIGNMENT] = consensus_gapped_seq
    result[CONS_FIRST_ORF_START_0B] = cons_first_orf_start_0based
    result[CONS_LAST_ORF_END_0B] = cons_last_orf_end_0based
    result[NUM_INSERTS] = num_ref_gaps_in_orf_region
    result[NUM_DELS] = num_cons_gaps_in_orf_region
    return result
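A usage sketch with toy inputs: the (name, sequence) tuple layout and the result-key constants (NUM_DELS, etc.) come from the function body and are assumed to be defined in the same module; the sequences and ORF bounds are made up:

# Hypothetical 1-based ORF bounds spanning a toy reference.
toy_ref = ("toy_ref", "ATGAAACCCGGGTTTTAA")
toy_cons = ("toy_cons", "ATGAAACCGGGTTTTAA")  # one base deleted
result = _pairwise_align(toy_cons, toy_ref,
                         ref_first_orf_start_1based=1,
                         ref_last_orf_end_1based=18)
print(result[NUM_DELS])  # expected: 1 deletion inside the ORF region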
def compute_id_matrix(chunk_chim, parents):
    """Compute the per-chunk identity matrix between a candidate sequence
    and its potential parent sequences."""
    identity_percentage_matrix = np.zeros((len(chunk_chim), len(parents)))
    for chk_index, chunk in enumerate(chunk_chim):
        for i, parent in enumerate(parents):
            alignment_list = nw.global_align(
                chunk,
                parent["chunks"][chk_index],
                gap_open=-1,
                gap_extend=-1,
                matrix=os.path.abspath(
                    os.path.join(os.path.dirname(__file__), "MATCH")
                ),
            )
            identity_percentage_matrix[chk_index, i] = round(
                get_identity(alignment_list), 2
            )
    return identity_percentage_matrix
def chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """
    @brief : Collect the non-chimeric sequences from the given file.
    @param amplicon_file : string, path to the input file.
    @param minseqlen : int, minimum sequence length.
    @param mincount : int, minimum sequence count.
    @param chunk_size : int, segment size.
    @param kmer_size : int, k-mer size.
    @returns : generator, generator of non-chimeric sequences.
    """
    kmer_dict = {}
    non_chimera = []
    id_seq = 0
    for seq, count in dereplication_fulllength(amplicon_file, minseqlen, mincount):
        chunks = get_chunks(seq, chunk_size)[:4]
        mates = [
            search_mates(kmer_dict, sub_seq, kmer_size)
            for sub_seq in chunks
        ]
        # Intersect the mate lists of all segments to find potential parents
        parents = mates[0]
        for mate in mates[1:]:
            parents = common(parents, mate)
        is_chimera = False
        if len(parents) >= 2:
            perc_id_matrix = [[] for _ in range(len(chunks))]
            for parent in parents:
                parent_chunks = get_chunks(non_chimera[parent], chunk_size)
                for index, chunk in enumerate(chunks):
                    alignment = nw.global_align(chunk, parent_chunks[index])
                    identity = get_identity(alignment)
                    perc_id_matrix[index].append(identity)
            is_chimera = detect_chimera(perc_id_matrix)
        if not is_chimera:
            kmer_dict = get_unique_kmer(kmer_dict, seq, id_seq, kmer_size)
            non_chimera.append(seq)
            id_seq += 1
            yield [seq, count]
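search_mates is assumed above; a sketch consistent with the inlined Counter expression in a later chimera_removal example (the 8 sequence ids sharing the most k-mers with the chunk), reusing the cut_kmer sketch from earlier:

from collections import Counter

def search_mates(kmer_dict, chunk, kmer_size):
    # Count how often each registered sequence id shares a k-mer
    # with the chunk, and keep the 8 best ids.
    counts = Counter(seq_id
                     for kmer in cut_kmer(chunk, kmer_size)
                     if kmer in kmer_dict
                     for seq_id in kmer_dict[kmer])
    return [seq_id for seq_id, _ in counts.most_common(8)]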
def calcul_identity_matrix(chunks_courant, parents, chunk_size, list_non_chimere):
    """
    input:
        - chunks_courant: chunks of the current sequence
        - parents: the 2 possible parent sequences
        - chunk_size: chunk size
        - list_non_chimere: list of non-chimeric sequences
    output:
        - perc_identity_matrix: matrix giving, per segment, the identity
          rate between the candidate sequence and the two parent sequences
    """
    perc_identity_matrix = [[] for nb_chunk in range(len(chunks_courant))]
    for parent in parents:
        chunk_ref = get_chunks(list_non_chimere[parent], chunk_size)
        for element, chunk in enumerate(chunks_courant):
            res_alignement = nw.global_align(chunk, chunk_ref[element])
            res_identite = get_identity(res_alignement)
            perc_identity_matrix[element].append(res_identite)
    return perc_identity_matrix
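The common helper used by several of these functions is also never shown; a one-line sketch, assuming it is a plain list intersection:

def common(lst1, lst2):
    # Intersection of two id lists, preserving lst1's order.
    return [element for element in lst1 if element in lst2]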
def phnSequenceAlignment(phns_teacher, phns_student):
    """
    Align two phone sequences
    :param phns_teacher:
    :param phns_student:
    :return:
    """
    # convert phonemes to letters
    phns_teacher_letters, phns_student_letters, dict_letters2syl = \
        convertSyl2Letters(syllables0=phns_teacher, syllables1=phns_student)

    # global alignment, because of the mismatch between the teacher and student phone lists
    phns_teacher_aligned, phns_student_aligned = \
        nw.global_align(phns_teacher_letters, phns_student_letters)

    # output the insertion and deletion indices, and the teacher phones
    # corresponding to the student phones
    dict_student_idx_2_teacher_phn, insertion_indices_student, deletion_indices_teacher, teacher_student_indices_pair = \
        identifyInsertionDeletionIdx(phns_teacher_aligned, phns_student_aligned, dict_letters2syl)

    return insertion_indices_student, deletion_indices_teacher, teacher_student_indices_pair, dict_student_idx_2_teacher_phn
def compute_similarity_matrix(chunks, parents_sequence, non_chimeric_list, chunk_size):
    """
    Compute the similarity matrix.

    Parameters:
        chunks: (List) Chunks of the current sequence
        parents_sequence: (List) Parent sequences of the sequence
        non_chimeric_list: (List) List of non-chimeric sequences
        chunk_size: (Int) Size of a chunk
    Returns:
        The similarity matrix
    """
    # Initialize the matrix
    perc_identity_matrix = [[] for _ in range(len(chunks))]
    for parent_sequence in parents_sequence:
        # Get sub-sequences of size chunk_size from the non-chimeric sequence list
        non_chimeric_chunk = get_chunks(non_chimeric_list[parent_sequence], chunk_size)
        for index, chunk in enumerate(chunks):
            # Align the current chunk against the chunk from the non-chimeric list
            global_alignement = nw.global_align(
                chunk, non_chimeric_chunk[index], gap_open=-1, gap_extend=-1,
                matrix=os.path.abspath(
                    os.path.join(os.path.dirname(__file__), "MATCH")))
            # Compute the similarity rate from the alignment
            similarity_rate = get_identity(global_alignement)
            # Add the result to the matrix
            perc_identity_matrix[index].append(similarity_rate)
    return perc_identity_matrix
def abundance_greedy_clustering(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """Returns a list of OTUs

    :Parameters:
        amplicon_file: Path to the amplicon file
        minseqlen: Minimal length of sequences (int)
        mincount: Minimum count (int)
        chunk_size: Sub-sequence length (int)
        kmer_size: Size of k-mers (int)
    Returns: list of OTUs (list)
    """
    result = []
    # Non-chimeric sequence list (in ascending order of count)
    read = sorted(list(chimera_removal(amplicon_file, minseqlen, mincount,
                                       chunk_size, kmer_size)),
                  key=lambda x: x[1])
    # Run through the list (skipping the last sequence, which is necessarily an OTU)
    cpt = 1
    for seq1, value1 in read[:-1]:
        otu = True
        # Compare the sequence with all the others (which have a bigger count)
        for seq2, value2 in read[cpt:]:
            # Align the two sequences
            alignment_list = nw.global_align(seq1, seq2, gap_open=-1, gap_extend=-1,
                                             matrix=os.path.abspath(os.path.join(
                                                 os.path.dirname(__file__), "MATCH")))
            # Check if the sequence is an OTU
            if seq1 != seq2 and get_identity(alignment_list) > 97 and value2 > value1:
                otu = False
                break
        # Add the sequence and its count to the result if it is an OTU
        if otu:
            result.append([seq1, value1])
        cpt += 1
    # Add the last sequence to the list of OTUs
    result.append([read[-1][0], read[-1][1]])
    # Sort the result in descending order of count
    result = sorted(result, key=lambda x: x[1], reverse=True)
    return result
def aln(s1, s2):
    aln1, aln2 = nw.global_align(s1, s2)
    return aln1, aln2
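Usage is direct; the example strings come from the nwalign README:

a1, a2 = aln("CEELECANTH", "PELICAN")
# a1 and a2 are equal-length gapped strings,
# e.g. 'CEELECANTH' and '-PE-LICAN-'.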
def chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """Consume the generator provided by dereplication_fulllength and return
    a generator of non-chimeric sequences in the format: yield [sequence, count]
    """
    kmer_dict = {}
    seq_list = []
    chim_id = 0
    for occurence_list in dereplication_fulllength(amplicon_file, minseqlen, mincount):
        chim = False
        chunk_list = get_chunks(occurence_list[0], chunk_size)
        # For each segment, the ids of the 8 sequences sharing the most k-mers
        chunk_match = []
        for chunk in chunk_list:
            chunk_match.append([
                i[0] for i in Counter([
                    ids for kmer in cut_kmer(chunk, kmer_size)
                    if kmer in kmer_dict for ids in kmer_dict[kmer]
                ]).most_common(8)
            ])
        # Ids common to every segment: the potential parents
        com_seq = common(chunk_match[0], chunk_match[1])
        for j in range(2, len(chunk_match)):
            com_seq = common(com_seq, chunk_match[j])
        if len(com_seq) > 1:
            # Per-segment identity against the two best parents
            perc_identity_matrix = [[] for _ in range(len(chunk_list))]
            for seq in com_seq[0:2]:
                seq_chunk_list = get_chunks(seq_list[seq], chunk_size)
                for l, chunk in enumerate(chunk_list):
                    perc_identity_matrix[l].append(
                        get_identity(
                            nw.global_align(chunk, seq_chunk_list[l],
                                            gap_open=-1, gap_extend=-1,
                                            matrix="MATCH")))
            # A chimera shows strongly varying identity across segments and
            # segments that disagree on the closest parent
            std_list = []
            flag_similarity = 0
            val0, val1 = perc_identity_matrix[0][0], perc_identity_matrix[0][1]
            for line in perc_identity_matrix:
                std_list.append(statistics.stdev([line[0], line[1]]))
                if flag_similarity == 0 and val0 != line[0] and val1 != line[1]:
                    flag_similarity = 1
            std_mean = statistics.mean(std_list)
            chim = std_mean > 5 and flag_similarity == 1
        if not chim:
            # kmer_dict = get_unique_kmer(kmer_dict, occurence_list[0], chim_id, kmer_size)
            for kmer in cut_kmer(occurence_list[0], kmer_size):
                if kmer not in kmer_dict:
                    kmer_dict[kmer] = [chim_id]
                elif chim_id not in kmer_dict[kmer]:
                    kmer_dict[kmer].append(chim_id)
            seq_list.append(occurence_list[0])
            chim_id += 1
            yield occurence_list
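The commented-out get_unique_kmer call above hints at the helper the inline loop replaces; a sketch matching that loop:

def get_unique_kmer(kmer_dict, sequence, id_seq, kmer_size):
    # Register the sequence id under every k-mer it contains, once.
    for kmer in cut_kmer(sequence, kmer_size):
        if kmer not in kmer_dict:
            kmer_dict[kmer] = [id_seq]
        elif id_seq not in kmer_dict[kmer]:
            kmer_dict[kmer].append(id_seq)
    return kmer_dict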
res_one_1_filled = "test" #MsaHandler.fillup_wildcarded_result(res_one_1, pivot_msa, '@') res_three_2_filled = "test" # MsaHandler.fillup_wildcarded_result(res_three_2, pivot_msa, '@') res_final_1 = res_one_1_filled res_final_2 = pivot_msa res_final_3 = res_three_2 #res_final_3 = res_three_2_filled return res_final_1, res_final_2, res_final_3 except Exception as ex: tr = inspect.trace() print("Exception raised in %s" % tr[-1][3]) import nwalign3 as nw reto = nw.global_align("CEELECANTH", "PELICAN") reto2 = nw.global_align("(Westf.), Grevener", "††††††(Westf.), Grevener") reto3 = nw.global_align("(Westf.), Grevener", "††††††(Westf.), Grevener", gap_open=-5, gap_extend=-2) #import seqanpy #print(seqanpy.align_global('ACCGGT', 'CCG')) from alignment.sequence import Sequence from alignment.vocabulary import Vocabulary from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner # Create sequences to be aligned. a = Sequence('what a beautiful day'.split())
def chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    # Get all the sequences
    generator = dereplication_fulllength(amplicon_file, minseqlen, mincount)
    # Add the first 2 sequences to the list of references (non-chimeric sequences)
    references = []
    for line in generator:
        references.append(line)
        if len(references) == 2:
            break
    # Yield the first 2 sequences
    for ref in references:
        yield ref
    # For all the other sequences
    for candidate, occurence in generator:
        is_chimera = False
        # Get the segments of all the references
        segments_reference_list = []
        for reference, _ in references:
            segments_reference_list.append(get_chunks(reference, chunk_size))
        # Get the k-mers of the segments of the references
        kmers_segments_reference_list = []
        for segments_reference in segments_reference_list:
            kmers_segments_reference = []
            for segment in segments_reference:
                kmers_reference = list(cut_kmer(segment, kmer_size))
                kmers_segments_reference.append(kmers_reference)
            kmers_segments_reference_list.append(kmers_segments_reference)
        # 1. Divide each candidate into 4 segments of size L = chunk_size
        segments_candidate = get_chunks(candidate, chunk_size)
        # 2. For each segment, identify 8 sequences with similar k-mers
        # Get the k-mers of the segments of the candidate
        kmers_segments_candidate = []
        for segment in segments_candidate:
            kmers_candidate = list(cut_kmer(segment, kmer_size))
            kmers_segments_candidate.append(kmers_candidate)
        list_mates = []
        # For each list of k-mers for each segment
        for k, kmer_list in enumerate(kmers_segments_candidate):
            kmer_dict = dict()
            # For each k-mer of segment k
            for kmer in kmer_list:
                # For each segment number k of each reference
                for i, seg in enumerate(
                        [a[k] for a in kmers_segments_reference_list]):
                    if kmer in seg:
                        if kmer in kmer_dict.keys():
                            if i not in kmer_dict[kmer]:
                                kmer_dict[kmer].append(i)
                        else:
                            kmer_dict[kmer] = [i]
            # For each segment, identify 8 sequences with similar k-mers
            list_mates.append(set(search_mates(kmer_dict, candidate, kmer_size)))
        # 3. Find two parents
        parents_ids = set.intersection(*list_mates)
        # 4. Compute the similarities
        if len(parents_ids) > 1:
            # If we have more than 2 parents, test all combinations of parents
            for parent1, parent2 in itertools.combinations(parents_ids, 2):
                # Create the matrix
                perc_identity_matrix = [[] for k in range(4)]
                for k in range(4):
                    for id_parent in [parent1, parent2]:
                        segments_parent = get_chunks(references[id_parent][0],
                                                     chunk_size)
                        alignement = nw.global_align(
                            segments_candidate[k], segments_parent[k],
                            gap_open=-1, gap_extend=-1,
                            matrix=os.path.abspath(
                                os.path.join(os.path.dirname(__file__), "MATCH")))
                        perc_identity_matrix[k].append(get_identity(alignement))
                # Detect a chimera
                is_chimera = detect_chimera(perc_identity_matrix)
                if is_chimera:
                    break
        # If this is not a chimera, add it to the references and yield it
        if not is_chimera:
            references.append((candidate, occurence))
            yield candidate, occurence