Example #1
def abundance_greedy_clustering(amplicon_file, minseqlen, mincount, chunk_size,
                                kmer_size):
    # Get all the non chimeric sequences
    generator = chimera_removal(amplicon_file, minseqlen, mincount, chunk_size,
                                kmer_size)

    # Add the first sequence to the list of OTUs
    otu_final = []
    otu_final.append(next(generator))

    # For each other sequence
    for sequence, occ in generator:
        is_otu = True
        # Compare it to each sequence in the final list
        # (because the other sequences have a lower occurrence)
        for sequence2, occ2 in otu_final:
            alignment = nw.global_align(sequence,
                                        sequence2,
                                        gap_open=-1,
                                        gap_extend=-1,
                                        matrix=os.path.abspath(
                                            os.path.join(
                                                os.path.dirname(__file__),
                                                "MATCH")))
            if occ2 > occ and get_identity(alignment) > 97:
                is_otu = False
                break
        if is_otu:
            otu_final.append((sequence, occ))

    return otu_final
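
These snippets all lean on a get_identity helper that is never shown on this page. A minimal sketch, assuming the alignment is a pair of equal-length gapped strings and that identity is returned as a percentage (which the > 97 comparisons above suggest):

def get_identity(alignment_list):
    """Percentage of identical positions between two aligned sequences."""
    seq_a, seq_b = alignment_list[0], alignment_list[1]
    matches = sum(1 for base_a, base_b in zip(seq_a, seq_b)
                  if base_a == base_b)
    return matches / len(seq_a) * 100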
Example #2
File: ga.py Project: Nicolik/msa-ga
    def init_pop(self, lines_list):
        pop = []
        for c in range(self.chromosomes):

            # New chromosome
            lines_list_aux = []

            # Use nwalign to compute the pairwise alignments
            # by the Needleman-Wunsch algorithm
            for i in range(len(lines_list)):
                alignments = []

                # Compute the pairwise alignments
                for j in range(len(lines_list)):
                    if i != j:
                        curr_alignment = nw.global_align(
                            lines_list[i], lines_list[j])
                        alignments.append(curr_alignment)

                # Randomly select an alignment
                alignment = random.choice(alignments)
                alignment = alignment[0]
                lines_list_aux.append(alignment)

            # Add the generated chromosome and print it
            lines_list_aux = Utils.add_gaps(lines_list_aux)
            pop.append({"chromosome": lines_list_aux, "evaluation": 0})
            print("\nChromosome " + str(c + 1) + ":")
            Utils.print_chromosome(lines_list_aux)

        # Initial population
        return pop
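
For context on the alignment[0] indexing above: nw.global_align returns the two input strings padded with gap characters. A small demonstration, reusing the CEELECANTH/PELICAN pair that appears further down this page (exact gap placement depends on the scoring parameters):

import nwalign3 as nw

aln = nw.global_align("CEELECANTH", "PELICAN")
# aln is a tuple of two equal-length gapped strings,
# e.g. ("CEELECANTH", "-PEL-ICAN-")
print(aln[0])
print(aln[1])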
Example #3
def chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """Yield the non-chimeric sequences as [sequence, count]."""
    kmer_dict = {}
    no_chimere = []
    seq_id = 0
    for l in dereplication_fulllength(amplicon_file, minseqlen, mincount):
        chunks = get_chunks(l[0], chunk_size)
        # For each chunk, collect the ids of the known sequences sharing its k-mers
        chunk_mates = []
        for seq in chunks:
            mates = search_mates(kmer_dict, seq, kmer_size)
            chunk_mates.append(mates)
        # Parent candidates are the ids shared by every chunk
        com = chunk_mates[0]
        for mates in chunk_mates[1:]:
            com = common(com, mates)
        chimera = False
        if len(com) >= 2:
            perc_identity_matrix = [[] for _ in chunks]
            for f in com[0:2]:
                sequ = get_chunks(no_chimere[f], chunk_size)
                for k, chunk in enumerate(chunks):
                    align = nw.global_align(chunk, sequ[k])
                    identite = get_identity(align)
                    perc_identity_matrix[k].append(identite)
            chimera = detect_chimera(perc_identity_matrix)
        if not chimera:
            kmer_dict = get_kmer_dict(kmer_dict, l[0], seq_id, kmer_size)
            no_chimere.append(l[0])
            seq_id += 1
            yield l
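
get_chunks is another helper these snippets assume. A minimal sketch, on the assumption that it returns consecutive non-overlapping segments of exactly chunk_size characters (callers above expect at least four of them):

def get_chunks(sequence, chunk_size):
    """Consecutive non-overlapping chunks of exactly chunk_size characters."""
    return [sequence[i:i + chunk_size]
            for i in range(0, len(sequence) - chunk_size + 1, chunk_size)]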
Example #4
def abundance_greedy_clustering(
    amplicon_file, minseqlen, mincount, chunk_size, kmer_size
):
    output = []
    not_chimeric = list(
        chimera_removal(
            amplicon_file, minseqlen, mincount, chunk_size, kmer_size
        )
    )

    for index, sequence in enumerate(not_chimeric):
        # Sequences earlier in the list have a higher abundance
        abund_sequences = not_chimeric[:index]
        if len(abund_sequences) < 1:
            output += [sequence]
        else:
            valid = True
            for abund_seq in abund_sequences:
                alignment_list = nw.global_align(
                    sequence[0],
                    abund_seq[0],
                    matrix=os.path.abspath(
                        os.path.join(os.path.dirname(__file__), "MATCH")
                    ),
                )
                similarity = get_identity(alignment_list)

                if similarity > 97:
                    valid = False
                    break
            if valid:
                output += [sequence]

    return output
Example #5
def abundance_greedy_clustering(amplicon_file, minseqlen, mincount, chunk_size,
                                kmer_size):
    """
    abundance_greedy_clustering fait appel à chimera_removal et réalise également des mesures d’identité à l’aide de get_identity.
    Elle retourne une liste d’OTU, cette liste indiquera pour chaque séquence son occurrence (count).

    """
    otu = []
    for i, seq in enumerate(
            chimera_removal(amplicon_file, minseqlen, mincount, chunk_size,
                            kmer_size)):
        if i == 0:
            otu.append(seq)
            print(seq)
        else:
            for seq_otu in otu:
                idt = get_identity(
                    nw.global_align(seq_otu[0],
                                    seq[0],
                                    gap_open=-1,
                                    gap_extend=-1,
                                    matrix=os.path.abspath(
                                        os.path.join(os.path.dirname(__file__),
                                                     '../agc')) + "/MATCH"))
                if idt <= 97:
                    otu.append(seq)
                else:
                    pass
    return otu
Example #6
def abundance_greedy_clustering(amplicon_file, minseqlen, mincount, chunk_size,
                                kmer_size):
    """
    @brief : Regroupement glouton de sequences.
    @param amplicon_file : string, lien du fichier d'entrée.
    @param minseqlen : int, longueur minimale des séquences.
    @param mincount : int, comptage minimal des séquences.
    @param chunk_size : int, taille des segments.
    @kmer_size : int, taille des kmers.
    @returns : list, liste d'OTU.
    """

    otu_list = []
    sequences = list(
        chimera_removal(amplicon_file, minseqlen, mincount, chunk_size,
                        kmer_size))
    for i in range(len(sequences)):
        otu = True
        # Compare only with the more abundant sequences already examined
        for j in range(i):
            if get_identity(nw.global_align(sequences[i][0],
                                            sequences[j][0])) > 97:
                otu = False
                break
        if otu:
            otu_list.append(sequences[i])
    return otu_list
Example #7
def run_NW_align(seq1: str, seq2: str):
    # Global alignment with specified gap-open and gap-extend penalties
    out_align = nw.global_align(seq1,
                                seq2,
                                gap_open=-10,
                                gap_extend=-5,
                                match=12,
                                matrix='BLOSUM62')
    return out_align
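
Note that nwalign resolves the matrix argument as a path to a scoring-matrix file, so 'BLOSUM62' above must exist on disk relative to the working directory. A hypothetical call (the input peptides are arbitrary):

aligned_a, aligned_b = run_NW_align("HEAGAWGHEE", "PAWHEAE")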
Example #8
    def _align(self):
        matrix = self._get_matrix_file(match=self._match,
                                       mismatch=self._mismatch,
                                       matrix=self._matrix)
        aln = nw.global_align(self.query.sequence,
                              self.target.sequence,
                              gap_open=self._gap_open,
                              gap_extend=self._gap_extend,
                              matrix=matrix)
        return aln
Example #9
def chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """Returns a generator of non chimera sequences
      :Parameters:
          amplicon_file: Path to the amplicon_file
          minseqlen: Minimal length of sequences (int)
          mincount: Minimum counting (int)
          chunk_size: Sub sequences length (int)
          kmer_size: Size of kmers (int)
      Returns: generator of non chimera sequences
    """
    non_chimera_seq_list = []
    id_seq = 0
    kmer_dict = {}
    # Sequence generator
    read = dereplication_fulllength(amplicon_file, minseqlen, mincount)
    # Evaluate each sequence
    for seq, value in read:
        mate_seq_list_id = []
        # List of segments
        chunk_list = list(get_chunks(seq, chunk_size))
        # Build a list of ids of mate non chimera sequences for each segment
        for chunk in chunk_list:
            mate_seq_list_id.append(search_mates(kmer_dict, chunk, kmer_size))
        # Find parent sequences, if any
        parent_seq_list_id = common(common(mate_seq_list_id[0], mate_seq_list_id[1]),
                                    common(mate_seq_list_id[2], mate_seq_list_id[3]))
        perc_identity_matrix = [[], [], [], []]
        chimera = False
        # If there are at least 2 parents
        if len(parent_seq_list_id) >= 2:
            # Then we compute the matrix with the percentages of identity
            for parent in parent_seq_list_id[:2]:
                # List of segments of the parent
                chunk_list_p = list(get_chunks(non_chimera_seq_list[parent][0], chunk_size))
                for i in range(len(chunk_list)):
                    # Make alignment between two segments
                    alignment_list = nw.global_align(chunk_list[i], chunk_list_p[i], gap_open=-1,
                                                     gap_extend=-1, matrix=os.path.abspath(
                                                         os.path.join(os.path.dirname(__file__),
                                                                      "MATCH")))
                    # Compute their identity
                    identity = get_identity(alignment_list)
                    perc_identity_matrix[i].append(identity)
            # Finally we check if the candidate sequence is a chimera or not
            chimera = detect_chimera(perc_identity_matrix)
        # If it is not
        if not chimera:
            # We add it to the non chimera sequences list
            non_chimera_seq_list.append([seq, value])
            # We also add it to kmer_dict
            for chunk in chunk_list:
                kmer_dict = get_unique_kmer(kmer_dict, chunk, id_seq, kmer_size)
            id_seq += 1
            # And we yield it with its count
            yield [seq, value]
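
detect_chimera is also left undefined in these snippets. Example #21 below inlines the same rule, which suggests a sketch along these lines (assuming one row per chunk and one column per parent):

import statistics

def detect_chimera(perc_identity_matrix):
    """Chimera if the mean per-chunk standard deviation exceeds 5 and
    different chunks are closest to different parents."""
    std_mean = statistics.mean(statistics.stdev(row)
                               for row in perc_identity_matrix)
    closer_to_first = any(row[0] > row[1] for row in perc_identity_matrix)
    closer_to_second = any(row[1] > row[0] for row in perc_identity_matrix)
    return std_mean > 5 and closer_to_first and closer_to_second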
Example #10
def chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """Return a generator of non-chimeric sequences. Format: yield [seq, count]"""
    gen_seq = []
    for elm in dereplication_fulllength(amplicon_file, minseqlen, mincount):
        gen_seq.append(elm)
    kmer_dict = {}
    non_chimeric = []
    seq_id = 0
    for seq, count in gen_seq:
        seq_chunks = get_chunks(seq, chunk_size)
        # For each chunk, the ids of the known sequences sharing its k-mers
        mates_per_chunk = [search_mates(kmer_dict, chunk, kmer_size)
                           for chunk in seq_chunks]
        seq_parents = mates_per_chunk[0]
        for mates in mates_per_chunk[1:]:
            seq_parents = common(seq_parents, mates)
        chimera = False
        if len(seq_parents) >= 2:
            identity_list = [[] for _ in seq_chunks]
            for parent in seq_parents[:2]:
                parent_chunks = get_chunks(non_chimeric[parent][0], chunk_size)
                for i, chunk in enumerate(seq_chunks):
                    alignment_list = nw.global_align(
                        chunk, parent_chunks[i], gap_open=-1, gap_extend=-1,
                        matrix=os.path.abspath(os.path.join(
                            os.path.dirname(__file__), "MATCH")))
                    identity_list[i].append(get_identity(alignment_list))
            chimera = detect_chimera(identity_list)
        if not chimera:
            kmer_dict = get_unique_kmer(kmer_dict, seq, seq_id, kmer_size)
            non_chimeric.append([seq, count])
            seq_id += 1
            yield [seq, count]
Example #11
def abundance_greedy_clustering(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """Return the list of OTUs."""
    otu_list = []
    candidates = []
    for elm in chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
        candidates.append(elm)
    for i in range(len(candidates)):
        is_otu = True
        # Compare with the OTUs already kept (all of them are more abundant)
        for otu in otu_list:
            alignment_list = nw.global_align(candidates[i][0], otu[0], gap_open=-1, gap_extend=-1, matrix=os.path.abspath(os.path.join(os.path.dirname(__file__), "MATCH")))
            identity = get_identity(alignment_list)
            if identity > 97:
                is_otu = False
                break
        if is_otu:
            otu_list.append([candidates[i][0], candidates[i][1]])
    return otu_list
Example #12
def get_identity_matrix(chunks, parents, sequence_bank, chunk_size):
    """Get the identity matrix between a sequence and 2 parents.
      :Parameters:
          chunks: Chunks from the candidate sequence.
          parents: Parent sequences from the candidate sequence.
          sequence_bank: List of sequences that are not chimeras.
          chunk_size:  Size of the chunks.
    """
    perc_identity_matrix = [[] for chunk_index in range(len(chunks))]
    for parent in parents:
        parent_chunks = get_chunks(sequence_bank[parent], chunk_size)
        for index, chunk in enumerate(chunks):
            alignment = nw.global_align(chunk, parent_chunks[index])
            identity = get_identity(alignment)
            perc_identity_matrix[index].append(identity)
    return perc_identity_matrix
Example #13
def _pairwise_align(consensus_info, ref_genome_info,
                    ref_first_orf_start_1based, ref_last_orf_end_1based):
    ref_first_orf_start_0based = ref_first_orf_start_1based - 1
    ref_last_orf_end_0based = ref_last_orf_end_1based - 1

    ref_gapped_seq, consensus_gapped_seq = nw.global_align(
        ref_genome_info[1], consensus_info[1])

    num_ref_gaps_in_orf_region = num_cons_gaps_in_orf_region = 0
    curr_ref_index = curr_cons_index = -1
    cons_first_orf_start_0based = cons_last_orf_end_0based = None
    for gapped_index in range(len(ref_gapped_seq)):
        curr_cons_base = consensus_gapped_seq[gapped_index]

        if curr_cons_base != "-":
            curr_cons_index += 1
        else:
            # if we are within the orf-containing region, keep
            # track of how many gap bases we see
            if cons_first_orf_start_0based is not None and \
                    cons_last_orf_end_0based is None:
                num_cons_gaps_in_orf_region += 1

        curr_ref_base = ref_gapped_seq[gapped_index]
        if curr_ref_base != "-":
            curr_ref_index += 1
            if curr_ref_index == ref_first_orf_start_0based:
                cons_first_orf_start_0based = curr_cons_index
            elif curr_ref_index == ref_last_orf_end_0based:
                cons_last_orf_end_0based = curr_cons_index
                break
        else:
            if cons_first_orf_start_0based is not None and \
                    cons_last_orf_end_0based is None:
                num_ref_gaps_in_orf_region += 1

    result = {}
    result[CONS_SEQ_NAME] = consensus_info[0]
    result[REF_SEQ_NAME] = ref_genome_info[0]
    result[REF_ALIGNMENT] = ref_gapped_seq
    result[CONS_ALIGNMENT] = consensus_gapped_seq
    result[CONS_FIRST_ORF_START_0B] = cons_first_orf_start_0based
    result[CONS_LAST_ORF_END_0B] = cons_last_orf_end_0based
    result[NUM_INSERTS] = num_ref_gaps_in_orf_region
    result[NUM_DELS] = num_cons_gaps_in_orf_region

    return result
Example #14
def compute_id_matrix(chunk_chim, parents):
    """TODO"""
    identity_percentage_matrix = np.zeros((len(chunk_chim), len(parents)))
    for chk_index, chunk in enumerate(chunk_chim):
        for i, parent in enumerate(parents):
            alignment_list = nw.global_align(
                chunk,
                parent["chunks"][chk_index],
                gap_open=-1,
                gap_extend=-1,
                matrix=os.path.abspath(
                    os.path.join(os.path.dirname(__file__), "MATCH")
                ),
            )
            identity_percentage_matrix[chk_index, i] = round(
                get_identity(alignment_list), 2
            )
    return identity_percentage_matrix
Example #15
def chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """
    @brief : Récupère les séquences non chimériques du fichier donné.
    @param amplicon_file : string, lien du fichier d'entrée.
    @param minseqlen : int, longueur minimale des séquences.
    @param mincount : int, comptage minimal des séquences.
    @param chunk_size : int, taille des segments.
    @kmer_size : int, taille des kmers.
    @returns : generator, générateur de séquences non chimériques.
    """

    kmer_dict = {}
    non_chimera = []
    perc_id_matrix = []
    id_seq = 0

    for seq, count in dereplication_fulllength(amplicon_file, minseqlen,
                                               mincount):
        chunks = get_chunks(seq, chunk_size)[:4]

        mates = [
            search_mates(kmer_dict, sub_seq, kmer_size) for sub_seq in chunks
        ]
        parents = []

        for mate, _ in enumerate(mates):
            parents = common(parents, mates[mate])

        if len(parents) >= 2:

            perc_id_matrix = [[] for _ in range(len(chunks))]
            for parent in parents:
                parent_chunks = get_chunks(non_chimera[parent], chunk_size)
                for index, chunk in chunks:
                    alignment = nw.global_align(chunk, parent_chunks[index])
                    identity = get_identity(alignment)
                    perc_id_matrix[index].append(identity)

        if not detect_chimera(perc_id_matrix):
            kmer_dict = get_unique_kmer(kmer_dict, seq, id_seq, kmer_size)
            non_chimera.append(seq)
            id_seq += 1
            yield [seq, count]
Example #16
File: agc.py Project: mymisou2/agc
def calcul_identity_matrix(chunks_courant, parents, chunk_size,
                           list_non_chimere):
    """
    input:
        - chunks_courant: chunks de la séquence courante
        - parents: les 2 séquences parentes possibles
        - chunk_size: taille du chunk
        - list_non_chimere: liste de séquences non chimériques
    output:
        - perc_identity_matrix: matrice donnant par segment le taux d’identité
        entre la séquence candidate et deux séquences parente
    """
    perc_identity_matrix = [[] for nb_chunk in range(len(chunks_courant))]
    for parent in parents:
        chunk_ref = get_chunks(list_non_chimere[parent], chunk_size)
        for element, chunk in enumerate(chunks_courant):
            res_alignement = nw.global_align(chunk, chunk_ref[element])
            res_identite = get_identity(res_alignement)
            perc_identity_matrix[element].append(res_identite)
    return perc_identity_matrix
Example #17
def phnSequenceAlignment(phns_teacher, phns_student):
    """
    Align two phn sequences
    :param phns_teacher:
    :param phn_student:
    :return:
    """
    # convert phonemes to letters
    phns_teacher_letters, phns_student_letters, dict_letters2syl = \
        convertSyl2Letters(syllables0=phns_teacher, syllables1=phns_student)

    # global alignment, because of the mismatch between the teacher and student phone lists
    phns_teacher_aligned, phns_student_aligned = \
        nw.global_align(phns_teacher_letters, phns_student_letters)

    # output the insertion and deletion indices, and the teacher's phones corresponding to the student's phones
    dict_student_idx_2_teacher_phn, insertion_indices_student, deletion_indices_teacher, teacher_student_indices_pair = \
        identifyInsertionDeletionIdx(phns_teacher_aligned, phns_student_aligned, dict_letters2syl)

    return insertion_indices_student, deletion_indices_teacher, teacher_student_indices_pair, dict_student_idx_2_teacher_phn
Example #18
def compute_similarity_matrix(chunks, parents_sequence, non_chimeric_list,
                              chunk_size):
    """
    Compute the similarity matrix.

    Parameters:
        chunks: (List) Chunks of the current sequence
        parents_sequence: (List) Parent sequences from the sequence
        non_chimeric_list: (List) List of non_chimeric sequences
        chunk_size: (Int) Size of a chunk

    Returns: The similarity matrix
    """

    # Initialize the matrix
    perc_identity_matrix = [[] for _ in range(len(chunks))]

    for parent_sequence in parents_sequence:
        # Get sub sequences from non_chimeric_sequence list with a size of chunk_size
        non_chimeric_chunk = get_chunks(non_chimeric_list[parent_sequence],
                                        chunk_size)

        for index, chunk in enumerate(chunks):
            # Compute the alignment between the current chunk and the chunk
            # from the non-chimeric list
            global_alignment = nw.global_align(
                chunk,
                non_chimeric_chunk[index],
                gap_open=-1,
                gap_extend=-1,
                matrix=os.path.abspath(
                    os.path.join(os.path.dirname(__file__), "MATCH")))

            # Compute the similarity rate from the alignment
            similarity_rate = get_identity(global_alignment)

            # Add the result to the matrix
            perc_identity_matrix[index].append(similarity_rate)

    return perc_identity_matrix
Example #19
def abundance_greedy_clustering(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """Returns a list of OTU
      :Parameters:
          amplicon_file: Path to the amplicon_file
          minseqlen: Minimal length of sequences (int)
          mincount: Minimum counting (int)
          chunk_size: Sub sequences length (int)
          kmer_size: Size of kmers (int)
      Returns: list of OTU (list)
    """
    result = []
    # Non chimera sequences list (in ascending order)
    read = sorted(list(chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size)),
                  key=lambda x: x[1])
    # Walk the list (skipping the last sequence, which is necessarily an OTU)
    cpt = 1
    for seq1, value1 in read[:-1]:
        otu = True
        # Compare the sequence with all the later ones (which have a bigger count)
        for seq2, value2 in read[cpt:]:
            # Make alignment between two sequences
            alignment_list = nw.global_align(seq1, seq2, gap_open=-1, gap_extend=-1,
                                             matrix=os.path.abspath(os.path.join(
                                                 os.path.dirname(__file__), "MATCH")))
            # Check if the sequence is an OTU
            if seq1 != seq2 and get_identity(alignment_list) > 97 and value2 > value1:
                otu = False
                break
        # Add the sequence and its count to the result if it is an OTU
        if otu:
            result.append([seq1, value1])
        cpt += 1
    # Add the last sequence to the list of OTU
    result.append([read[-1][0], read[-1][1]])
    # Sort the result in descending order
    result = sorted(result, key=lambda x: x[1], reverse=True)
    return result
Example #20
File: enz.py Project: UoMMIB/enz
def aln(s1, s2):
    aln1, aln2 = nw.global_align(s1, s2)
    return aln1, aln2
Example #21
def chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    """Consume the generator provided by dereplication_fulllength and
    return a generator of the non-chimeric sequences in the format:
    yield [sequence, count]
    """
    kmer_dict = {}
    seq_list = []
    chim_id = 0
    for occurence_list in dereplication_fulllength(amplicon_file, minseqlen,
                                                   mincount):
        chim = False
        chunk_list = get_chunks(occurence_list[0], chunk_size)
        # For each chunk, keep the ids of the 8 known sequences sharing
        # the most k-mers with it
        chunk_match = []
        for chunk in chunk_list:
            chunk_match.append([
                i[0] for i in Counter([
                    ids for kmer in cut_kmer(chunk, kmer_size)
                    if kmer in kmer_dict for ids in kmer_dict[kmer]
                ]).most_common(8)
            ])
        com_seq = common(chunk_match[0], chunk_match[1])
        for j in range(2, len(chunk_match)):
            com_seq = common(com_seq, chunk_match[j])
        if len(com_seq) > 1:
            perc_identity_matrix = [[] for _ in range(len(chunk_list))]
            for seq in com_seq[0:2]:
                seq_chunk_list = get_chunks(seq_list[seq], chunk_size)
                for l, chunk in enumerate(chunk_list):
                    perc_identity_matrix[l].append(
                        get_identity(
                            nw.global_align(chunk,
                                            seq_chunk_list[l],
                                            gap_open=-1,
                                            gap_extend=-1,
                                            matrix=os.path.abspath(
                                                os.path.join(
                                                    os.path.dirname(__file__),
                                                    "MATCH")))))
            # Inline chimera detection: flag when the mean per-chunk
            # standard deviation exceeds 5 and the chunks do not all
            # share the same identity profile
            std_list = []
            flag_similarity = 0
            prev_line = None
            for line in perc_identity_matrix:
                std_list.append(statistics.stdev([line[0], line[1]]))
                if prev_line is not None and (line[0] != prev_line[0]
                                              or line[1] != prev_line[1]):
                    flag_similarity = 1
                prev_line = line
            if statistics.mean(std_list) > 5 and flag_similarity == 1:
                chim = True
        if not chim:
            # Equivalent to: kmer_dict = get_unique_kmer(kmer_dict,
            #                    occurence_list[0], chim_id, kmer_size)
            for kmer in cut_kmer(occurence_list[0], kmer_size):
                if kmer not in kmer_dict:
                    kmer_dict[kmer] = [chim_id]
                elif chim_id not in kmer_dict[kmer]:
                    kmer_dict[kmer].append(chim_id)
            seq_list.append(occurence_list[0])
            chim_id += 1
            yield occurence_list
Example #22
        res_one_1_filled = "test"  # MsaHandler.fillup_wildcarded_result(res_one_1, pivot_msa, '@')
        res_three_2_filled = "test"  # MsaHandler.fillup_wildcarded_result(res_three_2, pivot_msa, '@')

        res_final_1 = res_one_1_filled
        res_final_2 = pivot_msa
        res_final_3 = res_three_2
        #res_final_3 = res_three_2_filled
        return res_final_1, res_final_2, res_final_3

    except Exception as ex:
        tr = inspect.trace()
        print("Exception raised in %s" % tr[-1][3])


import nwalign3 as nw
reto = nw.global_align("CEELECANTH", "PELICAN")
reto2 = nw.global_align("(Westf.), Grevener", "††††††(Westf.), Grevener")
reto3 = nw.global_align("(Westf.), Grevener",
                        "††††††(Westf.), Grevener",
                        gap_open=-5,
                        gap_extend=-2)

#import seqanpy
#print(seqanpy.align_global('ACCGGT', 'CCG'))

from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

# Create sequences to be aligned.
a = Sequence('what a beautiful day'.split())
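# The snippet breaks off here. Based on the alignment package's documented
# example, the demo presumably continues along these lines (a sketch, not
# the original file's code):
b = Sequence('what a disappointingly bad day'.split())

# Create a vocabulary and encode the sequences.
v = Vocabulary()
a_encoded = v.encodeSequence(a)
b_encoded = v.encodeSequence(b)

# Align using simple match/mismatch scoring and a gap penalty.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(a_encoded, b_encoded, backtrace=True)
for encoded in encodeds:
    print(v.decodeSequenceAlignment(encoded))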
Example #23
def chimera_removal(amplicon_file, minseqlen, mincount, chunk_size, kmer_size):
    # Get all the sequences
    generator = dereplication_fulllength(amplicon_file, minseqlen, mincount)

    # Add the first 2 sequences to the list of references (non-chimeric sequences)
    references = []
    for i, line in enumerate(generator):
        references.append(line)
        if i == 1:
            break

    # Yield the first 2 sequences
    for ref in references:
        yield ref

    # For all the other sequences
    for candidate, occurence in generator:

        is_chimera = False

        # Get the segments of all the references
        segments_reference_list = []
        for reference, _ in references:
            segments_reference_list.append(get_chunks(reference, chunk_size))

        # Get the kmers of the segments of the references
        kmers_segments_reference_list = []
        for segments_reference in segments_reference_list:
            kmers_segments_reference = []
            for segment in segments_reference:
                kmers_reference = list(cut_kmer(segment, kmer_size))
                kmers_segments_reference.append(kmers_reference)
            kmers_segments_reference_list.append(kmers_segments_reference)

        # 1. Divide each candidate in 4 segments of size L= chunk_size
        segments_candidate = get_chunks(candidate, chunk_size)

        # 2. For each segment, identify 8 sequences with similar kmers
        # Get the kmers of the segments of the candidate
        kmers_segments_candidate = []
        for segment in segments_candidate:
            kmers_candidate = list(cut_kmer(segment, kmer_size))
            kmers_segments_candidate.append(kmers_candidate)

        list_mates = []

        # For each list of kmers for each segment
        for k, kmer_list in enumerate(kmers_segments_candidate):
            kmer_dict = dict()
            # For each kmer of segment k
            for kmer in kmer_list:
                # For segment k of each reference
                for i, seg in enumerate(
                    [a[k] for a in kmers_segments_reference_list]):
                    if kmer in seg:
                        if kmer in kmer_dict.keys():
                            if i not in kmer_dict[kmer]:
                                kmer_dict[kmer].append(i)
                        else:
                            kmer_dict[kmer] = [i]
            # For each segment, identify 8 sequences with similar kmers
            list_mates.append(
                set(search_mates(kmer_dict, candidate, kmer_size)))

        # 3. Find two parents
        parents_ids = set.intersection(*list_mates)

        # 4. Compute the similarities
        if len(parents_ids) > 1:
            # If we have at least 2 parents, test all the combinations of parents
            for parent1, parent2 in itertools.combinations(parents_ids, 2):

                # Create the matrix
                perc_identity_matrix = [[] for k in range(4)]
                for k in range(4):
                    for id_parent in [parent1, parent2]:
                        segments_parent = get_chunks(references[id_parent][0],
                                                     chunk_size)

                        alignement = nw.global_align(
                            segments_candidate[k],
                            segments_parent[k],
                            gap_open=-1,
                            gap_extend=-1,
                            matrix=os.path.abspath(
                                os.path.join(os.path.dirname(__file__),
                                             "MATCH")))

                        perc_identity_matrix[k].append(
                            get_identity(alignement))

                # Detect a chimera
                is_chimera = detect_chimera(perc_identity_matrix)
                if is_chimera:
                    break

        # If this is not a chimera, add it to the references and yield
        if not is_chimera:
            references.append((candidate, occurence))
            yield candidate, occurence
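
Finally, the recurring helpers common and search_mates are never defined on this page either. Minimal sketches consistent with how they are called above (the Counter logic mirrors the inline version in Example #21; cut_kmer is assumed to yield the k-mers of a sequence):

from collections import Counter

def common(lst1, lst2):
    """Elements present in both lists, in the order of the first."""
    return [element for element in lst1 if element in lst2]

def search_mates(kmer_dict, sequence, kmer_size):
    """Ids of the 8 known sequences sharing the most k-mers with sequence."""
    return [seq_id for seq_id, _ in Counter(
        ids for kmer in cut_kmer(sequence, kmer_size)
        if kmer in kmer_dict for ids in kmer_dict[kmer]
    ).most_common(8)]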