Example #1
0
def load_csv_file(file, delimiter=";"):
    """
    Load a "Primer" CSV file.

    The first row must be a header naming exactly the seven expected columns
    (id, forwardPrimer, reversePrimer, fPDNA, rPDNA, ampliconMinLength,
    ampliconMaxLength), in any order. Each following row describes one
    primer pair. Rows with the wrong number of fields, and pairs rejected by
    check_primer_pair_integrity, are skipped with a logged warning.

    @param file: Path of the CSV file to read
    @param delimiter: Field delimiter used in the file
    @returns: Dict mapping each primer pair id to its PrimerPair instance
    @raises ValueError: If the header row is missing, has the wrong number
        of columns, names an unknown column or names a column twice
    """
    expected = ("id", "forwardPrimer", "reversePrimer", "fPDNA", "rPDNA",
                "ampliconMinLength", "ampliconMaxLength")
    header_len = len(expected)
    pos = {}
    primer_dict = {}
    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=delimiter)
        # An empty file has no header row at all.
        headers = next(csvreader, None)
        if headers is None or len(headers) != header_len:
            raise ValueError("Wrong header")
        for column, header in enumerate(headers):
            if header not in expected:
                raise ValueError("Unknown header " + header)
            if header in pos:
                # A repeated name would otherwise leave another expected
                # column without a position.
                raise ValueError("Duplicate header " + header)
            pos[header] = column

        # The header is line 1, so data rows are numbered from 2.
        for line_number, row in enumerate(csvreader, start=2):
            if len(row) != header_len:
                logging.warning("Wrong primer pair in line " +
                                str(line_number))
                continue

            fprimer = SeqRecord(
                Seq(row[pos["fPDNA"]], IUPAC.IUPACAmbiguousDNA()))
            fprimer.id = row[pos["forwardPrimer"]]

            rprimer = SeqRecord(
                Seq(row[pos["rPDNA"]], IUPAC.IUPACAmbiguousDNA()))
            # TODO: the reverse primer is currently always reverse
            # complemented; the original code marked this as unfinished.
            rprimer = rprimer.reverse_complement()
            rprimer.id = row[pos["reversePrimer"]]

            pair_id = row[pos["id"]]
            primer_pair = PrimerPair(pair_id, fprimer, rprimer,
                                     int(row[pos["ampliconMinLength"]]),
                                     int(row[pos["ampliconMaxLength"]]))
            if check_primer_pair_integrity(primer_pair):
                primer_dict[pair_id] = primer_pair
            else:
                logging.warning("Skipping primer pair " + primer_pair.id +
                                ", bad sequence")

    return primer_dict
def stage_one_trimming(alignment, window_size, proportion, threshold, min_len):
    """
    ---------------------------------------------------------------------
    MODIFIED FUNCTION FROM PHYLUCE: generic_align.py
    ---------------------------------------------------------------------
    First stage alignment trimming to find and trim edges of a given
    alignment.  Calls running_average to determine the start and end
    trimming positions for the entire alignment block.

    Every sequence is sliced to [start:end] and collected in a new
    alignment.  None is returned instead as soon as the trim positions are
    unusable, or one trimmed sequence consists only of '-' or only of '?',
    or is shorter than min_len.

    Note: the alphabet of each visited input sequence is overwritten with
    IUPACAmbiguousDNA as a side effect.
    """
    start, end = running_average(alignment, window_size, proportion, threshold)
    s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    for sequence in alignment:
        sequence.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
        if not (start >= 0 and end):
            return None
        trim = sequence[start:end]
        symbols = set(trim)
        # The original compared against the list ['?'], which a set never
        # equals, so all-'?' rows were never rejected.
        if symbols == {'-'} or symbols == {'?'} or len(trim) < min_len:
            return None
        s1_trimmed.append(trim)

    return s1_trimmed
Example #3
0
def genome_to_seqrecord(phage_genome):
    """Creates a SeqRecord object from a pdm_utils Genome object.

    The record's sequence alphabet is set to IUPACAmbiguousDNA. Its id is
    taken from the genome's accession only when that accession is not an
    empty string.

    :param phage_genome: A pdm_utils Genome object.
    :type phage_genome: Genome
    :returns: A BioPython SeqRecord object
    :rtype: SeqRecord
    :raises AssertionError: If phage_genome is None.
    :raises AttributeError: Re-raised, after printing a hint, when the
        sequence cannot be wrapped in a SeqRecord.
    """

    # Identity check: a custom __eq__/__ne__ on the object cannot affect it.
    assert phage_genome is not None,\
    "Genome object passed is None and not initialized"
    try:
        record = SeqRecord(phage_genome.seq)
        record.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
    except AttributeError:
        print("Genome object failed to be converted to SeqRecord.",
              "Genome valid attribute 'seq' is required to",
              "convert to SeqRecord object.")
        raise
    record.name = phage_genome.name
    if phage_genome.accession != "":
        record.id = phage_genome.accession
    record.features = get_seqrecord_features(phage_genome)
    record.description = get_seqrecord_description(phage_genome)
    record.annotations = get_seqrecord_annotations(phage_genome)

    return record
Example #4
0
def find_gapped_columns(align, cfg):
    """Drop every alignment column whose gap proportion is too high.

    A column's gap proportion is the number of "N" (case-insensitive) and
    "-" characters divided by the number of sequences. Columns whose
    proportion is strictly greater than cfg["max_gap_proportion"] are
    removed.

    Returns the input alignment object unchanged when no column is removed;
    otherwise returns a list of new SeqRecord objects (same name and id,
    empty description) holding only the kept columns.
    """
    max_gap_proportion = cfg["max_gap_proportion"]
    nbr_sequences = len(align)
    len_align = align.get_alignment_length()
    # A set keeps the membership test below O(1) per column.
    columns_to_remove = set()
    for index in range(len_align):
        column = str(align[:, index]).replace("n", "N")
        gap_freq = (column.count("N") + column.count("-")) / nbr_sequences
        if gap_freq > max_gap_proportion:
            columns_to_remove.add(index)
    if not columns_to_remove:
        return align

    idxs = [x for x in range(len_align) if x not in columns_to_remove]
    trimmed_records = []
    for rec in align:
        letters = list(rec.seq)
        new_seq = "".join([letters[i] for i in idxs])
        trimmed_records.append(
            SeqRecord(
                Seq(new_seq, IUPAC.IUPACAmbiguousDNA()),
                name=rec.name,
                id=rec.id,
                description="",
            ))
    return trimmed_records
Example #5
0
def main():
    """Generate mutated sequence fragments from a FASTA file.

    Command line options: --fasta (input file), --num_fragments,
    --mean_frag_size / --frag_size_std, --mean_mutation_rate /
    --mutation_rate_std and --output (FASTA file to write).
    """
    logging.basicConfig()
    parser = argparse.ArgumentParser()
    parser.add_argument('--fasta', dest='fasta_file', metavar='STRING', required=True, type=str)
    parser.add_argument('--num_fragments', dest='num_fragments', metavar='int', required=True, type=int)
    parser.add_argument('--mean_frag_size', dest='frag_size_mu', metavar='int', required=True, type=int)
    parser.add_argument('--frag_size_std', dest='frag_size_sigma', metavar='int', required=True, type=int)
    parser.add_argument('--mean_mutation_rate', dest='mutation_rate_mu', metavar='float', required=True, type=float)
    parser.add_argument('--mutation_rate_std', dest='mutation_rate_sigma', metavar='float', required=True, type=float)
    parser.add_argument('--output', dest='output_file', metavar='string', required=True, type=str)
    args = parser.parse_args()

    generated_seqs = []
    # The context manager closes the output file even if parsing or
    # writing raises.
    with open(args.output_file, 'w') as outhandle:
        for record in SeqIO.parse(args.fasta_file, 'fasta'):
            base_id = record.id
            base_seq = str(record.seq)
            # NOTE(review): the quota counts fragments across all records,
            # so once it is reached later records contribute nothing.
            # The loop also retries forever if generation always raises.
            while len(generated_seqs) < args.num_fragments:
                try:
                    mutation_rate = rnd.gauss(args.mutation_rate_mu, args.mutation_rate_sigma)
                    subsequence = subselect_sequence(base_seq, args.frag_size_mu, args.frag_size_sigma)
                    mutated_subsequence = mutate(subsequence, mutation_rate)
                    new_id = '%s__mut_%.2f__len_%i' % (base_id, mutation_rate, len(subsequence))
                    generated_seqs.append(SeqRecord(Seq(mutated_subsequence, IUPAC.IUPACAmbiguousDNA()),
                                                    id=new_id, name=new_id, description=''))
                except Exception as e:
                    print(e)

        SeqIO.write(generated_seqs, outhandle, 'fasta')
Example #6
0
def get_sine_forward(sine_fname):
    """Return the SINE sequence from a FASTA file, in the file's direction.

    The file must hold exactly one record; the unpacking below fails
    otherwise.
    """
    parsed = SeqIO.parse(sine_fname,
                         "fasta",
                         alphabet=IUPAC.IUPACAmbiguousDNA())
    (only_record,) = parsed
    # TODO: If we return it as dumb string, why did we bother about the alphabet?
    # TODO: The reference SINEs do contain a couple ambiguous chars - N, Y.
    return str(only_record.seq)
Example #7
0
def get_sines(sine_fname):
    """Yield each FASTA record's sequence, then its reverse complement.

    Both are yielded as plain strings. After each pair has been consumed,
    the two sequences are also printed.
    """
    for sine_record in SeqIO.parse(sine_fname, "fasta"):
        forward = Seq(str(sine_record.seq), IUPAC.IUPACAmbiguousDNA())
        yield str(forward)
        backward = forward.reverse_complement()
        yield str(backward)
        print(forward, backward, '\n ======================')
Example #8
0
    def set_primer_seqs(self, fwd_sequence, rev_sequence):
        """Set the primer sequences.

        Set the primer sequences from the given forward and reverse sequences.
        Both are stored together as a (forward, reverse) tuple of Seq objects
        using the IUPAC ambiguous DNA alphabet.

        Parameters
        ----------
        fwd_sequence : string
            forward primer sequence - ambiguities allowed.
        rev_sequence : string
            reverse primer sequence - ambiguities allowed.

        """
        fwd_primer = Seq(fwd_sequence, IUPAC.IUPACAmbiguousDNA())
        rev_primer = Seq(rev_sequence, IUPAC.IUPACAmbiguousDNA())
        # The log message previously misspelled "forward".
        self.logger.info("Setting forward primer to %s", fwd_sequence)
        self.logger.info("Setting reverse primer to %s", rev_sequence)
        self._primer_pair = (fwd_primer, rev_primer)
Example #9
0
def gapCdsToProteins(proteinAlignment, extraDnaSeqs=None):
    """Build a DNA alignment that mirrors the gaps of a protein alignment.

    Intended to replace proteinToCodonAlignment(). The DNA of every protein
    id is fetched through patric_api and optionally overridden by entries of
    extraDnaSeqs. For each protein position a '---' is written where the
    protein has a gap, otherwise the next three DNA characters are written.

    :param proteinAlignment: aligned protein records (iterable, provides
        get_alignment_length()).
    :param extraDnaSeqs: optional mapping of sequence id to a DNA record
        that takes precedence over the fetched one.
    :returns: the alignment read back from the generated FASTA text.
    :raises Exception: if the protein ids and the DNA ids differ.
    """
    protSeqDict = {seqRecord.id: seqRecord for seqRecord in proteinAlignment}
    dnaFasta = patric_api.getSequenceOfFeatures(protSeqDict.keys(), 'dna')

    dnaSeqDict = SeqIO.to_dict(
        SeqIO.parse(StringIO(dnaFasta),
                    "fasta",
                    alphabet=IUPAC.IUPACAmbiguousDNA()))
    for seqId in protSeqDict:
        if extraDnaSeqs and seqId in extraDnaSeqs:
            dnaSeqDict[seqId] = extraDnaSeqs[seqId]
            if Debug:
                LOG.write("appending extra DNA seq %s\n" % seqId)
    if set(dnaSeqDict.keys()) != set(protSeqDict.keys()):
        raise Exception(
            "Protein and DNA sets differ:\nProteins: %s\nDNA: %s\n" %
            (", ".join(sorted(protSeqDict)), ", ".join(sorted(dnaSeqDict))))
    dnaAlignFasta = StringIO()
    prot_align_len = proteinAlignment.get_alignment_length()
    full_dna_len = 3 * prot_align_len
    for seqId in dnaSeqDict:
        dnaSeq = dnaSeqDict[seqId].seq
        if len(dnaSeq) < full_dna_len:
            # Handles proteins whose DNA is missing or too short: pad with
            # gaps up to three characters per alignment column, so every
            # codon slice below is complete. The previous padding mixed
            # codon and nucleotide counts and could leave the DNA short.
            dnaSeq += '-' * (full_dna_len - len(dnaSeq))
        protSeq = protSeqDict[seqId].seq
        dnaAlignFasta.write(">" + seqId + "\n")
        dnaSeqPos = 0
        for protPos in range(0, len(protSeq)):
            if protSeq[protPos] == '-':
                codon = '---'
            else:
                #  TODO: in future use a codon table to check correct matching
                codon = str(dnaSeq[dnaSeqPos:dnaSeqPos + 3])
                dnaSeqPos += 3
            dnaAlignFasta.write(codon)
        # Number of protein positions written. Set explicitly so an empty
        # protein sequence does not reuse a stale or unbound loop variable.
        protPos = len(protSeq)  # should now be equal to prot_align_len
        if Debug:
            LOG.write(
                seqId +
                " protPos={0}, dnaSeqPos={1}, orig_DNA_len={2}, orig_prot_len={3}\n"
                .format(protPos, dnaSeqPos, len(dnaSeq), len(protSeq)))
        if protPos < prot_align_len:
            dnaAlignFasta.write("---" * (prot_align_len - protPos))
            LOG.write(
                "padding short seq {0}, of {1} pos out to {2}, orig_DNA_len={3}, orig_prot_len={4}\n"
                .format(seqId, protPos, prot_align_len, len(dnaSeq),
                        len(protSeq)))
        dnaAlignFasta.write("\n")
    dnaAlignFasta_text = dnaAlignFasta.getvalue()
    retval = AlignIO.read(StringIO(dnaAlignFasta_text), 'fasta')
    return retval
Example #10
0
def write_fasta(sequences, filename, field=""):
    """Write the selected view of each sequence entry to a FASTA file.

    field == "original": both raw reads of every entry.
    field == "trimmed", or the entry has no merged sequence: both trimmed
    reads, described as "(<field>)" (or "(unmerged)" when "merged" was
    requested).
    field == "merged": the merged sequence, with the joined read names as id.
    Entries matching none of these contribute nothing.
    """

    def read_pair(entry, key, description):
        # One record per read of the pair, taken from entry[key].
        return [
            SeqRecord(Seq(entry[key][k], IUPAC.IUPACAmbiguousDNA()),
                      id=entry["names"][k],
                      description=description) for k in range(2)
        ]

    records = []
    for sq in sequences:
        if field == "original":
            records.extend(read_pair(sq, "seqs", ""))
            continue
        if field == "trimmed" or sq["merged_seq"] is None:
            label = "unmerged" if field == "merged" else field
            records.extend(
                read_pair(sq, "trimmed_seqs", "({})".format(label)))
            continue
        if field == "merged":
            merged = Seq(sq["merged_seq"], IUPAC.IUPACAmbiguousDNA())
            records.append(
                SeqRecord(merged,
                          id=", ".join(sq["names"]),
                          description="(merged)"))
    SeqIO.write(records, filename, "fasta")
Example #11
0
    def to_seq_record(self):
        """Convert the Gene to a SeqRecord"""
        # Features, converted one by one.
        seq_features = []
        for feature in self.features.all():
            seq_features.append(feature.to_seq_feature())

        # Each annotation writes itself into the shared dictionary; the
        # references are stored under their own key.
        annotations = {}
        for annotation in self.annotations.all():
            annotation.to_ann(annotations)
        annotations['references'] = [
            reference.to_ref() for reference in self.references.all()
        ]

        sequence = Seq(self.sequence, IUPAC.IUPACAmbiguousDNA())
        return SeqRecord(seq=sequence,
                         name=self.name,
                         description=self.description,
                         features=seq_features,
                         annotations=annotations)
Example #12
0
 def translate(self, align, offset):
     """
     Translate every record of the alignment in the reading frame that
         starts at 'offset'; trailing columns that do not fill a whole
         codon are ignored. Gaps are turned into "N" before translating.
     """
     whole_codons = (align.get_alignment_length() - offset) // 3
     end = whole_codons * 3 + offset
     t_align = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
     for rec in align:
         letters = str(rec.seq).upper()
         letters = letters.replace("-", "N")
         letters = letters.replace("n", "N")
         in_frame = Seq(letters, IUPAC.IUPACAmbiguousDNA())[offset:end]
         t_align.append(
             SeqRecord(in_frame.translate(),
                       name=rec.name,
                       id=rec.id,
                       description=""))
     return t_align
Example #13
0
 def test_reverse_complements(self):
     """Reverse-complementing twice must give back the original letters."""
     rna_letters = "".join(sorted(ambiguous_rna_values))
     dna_letters = "".join(sorted(ambiguous_dna_values))
     cases = [
         Seq.Seq(rna_letters),
         Seq.Seq(dna_letters),
         Seq.Seq(rna_letters, Alphabet.generic_rna),
         Seq.Seq(dna_letters, Alphabet.generic_dna),
         Seq.Seq(rna_letters.replace("X", ""), IUPAC.IUPACAmbiguousRNA()),
         Seq.Seq(dna_letters.replace("X", ""), IUPAC.IUPACAmbiguousDNA()),
         Seq.Seq("AWGAARCKG"),  # Note no U or T
     ]
     for sequence in cases:
         once = sequence.reverse_complement()
         twice = once.reverse_complement()
         self.assertEqual(str(sequence), str(twice))
Example #14
0
def cds_to_seqrecord(cds):
    """Create a SeqRecord for a single gene from a Cds object.

    The record's sequence alphabet is set to IUPACAmbiguousDNA, its name is
    the Cds id, and its id is the locus tag when that is not an empty
    string. The Cds' seqfeature is (re)built and attached as the only
    feature.

    :param cds: Cds object providing seq, id, locus_tag and seqfeature.
    :returns: The populated SeqRecord.
    :raises AttributeError: Re-raised, after printing a hint, when the
        sequence cannot be wrapped in a SeqRecord.
    """
    try:
        record = SeqRecord(cds.seq)
        record.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
    except AttributeError:
        print("Cds object failed to be converted to SeqRecord.\n"
              "Cds valid attribute 'seq' is required to "
              "convert to SeqRecord object.")
        # Previously execution continued here and failed later on the
        # unbound or unusable record.
        raise

    record.name = cds.id
    if cds.locus_tag != "":
        record.id = cds.locus_tag
    cds.set_seqfeature()
    record.features = [cds.seqfeature]
    record.description = f"Single gene {cds.id}"
    record.annotations = get_cds_seqrecord_annotations(cds)

    return record
 def stage_one_trimming(self,
                        alignment,
                        window_size,
                        proportion,
                        threshold,
                        min_len,
                        replace_ends=False):
     """
     First stage (of 3) alignment trimming to find and trim edges of a given
     alignment.  Calls running_average to determine the start and end
     trimming positions for the entire alignment block.

     Returns a new alignment of the trimmed sequences, or None as soon as
     the trim positions are unusable or one trimmed sequence consists only
     of '-' or only of '?', or is shorter than min_len.  With replace_ends
     the trimmed text is passed through _replace_ends and re-wrapped by
     _record_formatter before being added.

     Note: the alphabet of each visited input sequence is overwritten with
     IUPACAmbiguousDNA as a side effect.
     """
     # get the trim positions that we determine begin and end "good"
     # alignments
     start, end = self.running_average(alignment, window_size, proportion,
                                       threshold)
     # create a new alignment object to hold our alignment
     s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna,
                                                  "-?"))
     for sequence in alignment:
         # ensure correct sequence alphabet or we'll get a conflict when
         # we try to generate a consensus
         sequence.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
         if not (start >= 0 and end):
             return None
         trim = sequence[start:end]
         symbols = set(trim)
         # ensure we don't just add a taxon with only gaps/missing
         # data and that alignments are >= min_len.  The original compared
         # against the list ['?'], which a set never equals, so all-'?'
         # rows were never rejected.
         if symbols == {'-'} or symbols == {'?'} or len(trim) < min_len:
             return None
         if not replace_ends:
             s1_trimmed.append(trim)
         else:
             # replace end gaps with missing data character ?
             # called on third iteration of trimming
             repl = self._replace_ends(str(trim.seq))
             s1_trimmed.append(self._record_formatter(repl, sequence.id))
     return s1_trimmed
Example #16
0
 def _extract_clusters(self, tag, qual='ugene_name'):
     """Collect the features whose qualifier matches a regular expression.

     Every feature of every record loaded from self.genomes_files is
     inspected. If it carries the qualifier `qual` and the space-joined
     qualifier values match `tag` at the start, the feature is extracted
     from its record, named after the qualifier text and run through
     _process_features.

     Returns a dict mapping the extracted record's id to the record; a
     later feature with the same id replaces an earlier one.
     """
     tagre = re.compile(tag)
     clusters = {}
     records = SeqView()
     records.load(self.genomes_files)
     for record in records:
         for f in record.features:
             if qual not in f.qualifiers:
                 continue
             q = ' '.join(f.qualifiers[qual])
             if not tagre.match(q):
                 continue
             c = f.extract(record)
             c.id = c.name = q
             c.description = record.description
             # Only fall back to ambiguous DNA when the alphabet is neither
             # a nucleotide nor a protein alphabet. The original test
             # ("is not A or is not B") was always true and overwrote
             # every alphabet.
             if not isinstance(c.seq.alphabet,
                               (NucleotideAlphabet, ProteinAlphabet)):
                 c.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
             self._process_features(c)
             clusters[c.id] = c
     return clusters
def save_record_fasta(record_df, save_to_dir):
    """Write one FASTA file per gene from a table of sequence records.

    Rows are grouped by their 'HGNC' column. For each group a directory
    save_to_dir/<gene> is created if needed and the group's rows are
    written to <gene>_sequences.fasta inside it, using the Sequence,
    entrez_id, HGNC and Description columns.

    :param record_df: DataFrame with HGNC, Sequence, entrez_id and
        Description columns.
    :param save_to_dir: existing directory (path object supporting `/`).
    :returns: the SeqRecord list of the last group written, or an empty
        list when the table has no groups.
    """
    records_grouped = record_df.groupby('HGNC')
    # Stays empty (instead of unbound) when there is nothing to write.
    seq_list = []
    for g in list(records_grouped.groups.keys()):
        out_dir = save_to_dir / g
        out_dir.mkdir(exist_ok=True)
        out_file = out_dir / '{}_sequences.fasta'.format(g)
        seq_list = [
            SeqRecord(Seq(r.Sequence, IUPAC.IUPACAmbiguousDNA()),
                      id=r.entrez_id,
                      name=r.HGNC,
                      description=r.Description)
            for _, r in records_grouped.get_group(g).iterrows()
        ]
        # Write once per gene rather than rewriting the file for each row.
        SeqIO.write(seq_list, out_file, 'fasta')
    return seq_list
Example #18
0
def ageSequence(rec, outfile, freq, end_length, seed, logger=None):
    """Randomly substitute bases of a record and append it to a FASTA file.

    round(len * freq) positions are drawn by shuffling all indexes with the
    given seed. When end_length is given (non-zero), positions in
    [end_length, len - end_length) are left untouched, so only the first
    and last end_length positions can change. Each remaining position is
    passed to substitute_base.

    :param rec: record providing .seq and .id.
    :param outfile: path of the FASTA file to append to.
    :param freq: fraction of positions selected for substitution.
    :param end_length: width of the mutable edges; None or 0 makes every
        position eligible.
    :param seed: seed for the random module.
    :param logger: logger to report to (required).
    :raises AssertionError: if no logger is given.
    :raises ValueError: if the two edges would leave at most one position
        between them.
    """
    assert logger is not None, "must use logging"
    logger.info("frequency of mutation: %f", freq)
    alph = ["A", "T", "C", "G"]
    seqlen = len(rec.seq)
    # Equality, not identity: "is 0" relies on int caching and misses zero
    # values of other numeric types.
    if end_length is None or end_length == 0:
        ignore_region = set()
    elif not seqlen - (2 * end_length) > 1:
        raise ValueError("Edge width cannot be greater than half the " +
                         "length of the sequence ")
    else:
        ignore_region = set(range(end_length, seqlen - end_length))
    newseqlist = list(rec.seq)
    seq_len = len(newseqlist)
    random.seed(seed)
    idxs = list(range(0, seq_len))
    random.shuffle(idxs)
    subst_idxs = idxs[0:int(round(seq_len * freq))]
    # ignore the indexes in the regions we are leaving unchanged
    executed_subst_idxs = [x for x in subst_idxs if x not in ignore_region]
    for i in executed_subst_idxs:
        substitute_base(strlist=newseqlist, position=i, alph=alph)
    logger.info("Changed %d of %d bases", len(executed_subst_idxs), seq_len)

    # Check before writing so a corrupted sequence never reaches the file.
    assert len(newseqlist) == len(rec.seq), \
        "something bad happened! unequal lengths of input and output sequences"

    newrec = SeqRecord(
        id=rec.id,
        seq=Seq("".join(newseqlist), IUPAC.IUPACAmbiguousDNA()))

    with open(outfile, "a") as o:
        SeqIO.write(newrec, o, "fasta")
Example #19
0
def search_sines2(sine_f, r1_f, to_check=frozenset((0, 1, 2)), step_print=10000, nlines=100000):
    """Print a histogram of longest-common-substring lengths against SINEs.

    The FASTA records of sine_f whose position is in to_check are loaded in
    both orientations (as given and reverse complemented). For each of
    these SINE strings, up to nlines items of r1_f are compared with
    difflib, and the length of the longest matching block is counted.
    Progress is printed every step_print items and at nlines.

    NOTE(review): r1_f is iterated once per SINE string, so a one-shot
    iterator (e.g. an open file) is exhausted after the first one; the
    counts in `stats` also accumulate across all SINE strings. Confirm
    this is intended.
    """
    sine_set = []
    stats = collections.Counter()

    for (i, sine_record) in enumerate(SeqIO.parse(sine_f, "fasta")):
        if (i in to_check):
            cur_seq = Seq(str(sine_record.seq), IUPAC.IUPACAmbiguousDNA())
            cur_seq_rc = cur_seq.reverse_complement()
            sine_set.append(str(cur_seq))
            sine_set.append(str(cur_seq_rc))
            print(cur_seq, cur_seq_rc, '''\n ======================''')

    for sine in sine_set:
        matcher = difflib.SequenceMatcher(isjunk=None, a=sine)

        total = 0
        start_time = time()
        print('''sequences for sine = ''')

        for cur_seq in r1_f:
            total += 1
            matcher.set_seq2(cur_seq)
            res = matcher.find_longest_match(0, len(sine), 0, len(cur_seq))
            d = res[2]

            stats[d] += 1

            if (total % step_print == 0 or total == nlines):
                print('''distances for first''', total, '''segments \n''')
                print('''========================''')
                print('''time elapsed''', (time() - start_time)/60.0, '''minutes''')
                for k in sorted(stats):
                    # The denominator used to be a counter that was never
                    # incremented, so it always printed 0.
                    print('longest common =', k, 'num matches =', stats[k], '''/''', total)

            if (total == nlines):
                break
Example #20
0
 def setUp(self):
     """Build the sequences, over a range of alphabets, used by the tests."""
     rna_letters = "".join(ambiguous_rna_values)
     dna_letters = "".join(ambiguous_dna_values)
     make = Seq.Seq
     make_mutable = Seq.MutableSeq
     self.test_seqs = [
         make("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna),
         # No alphabet given
         make("ATGAAACTG"),
         make("ATGAARCTG"),
         make("AWGAARCKG"),  # Note no U or T
         make(rna_letters),
         make(dna_letters),
         # All ambiguity letters, generic then IUPAC alphabets
         make(rna_letters, Alphabet.generic_rna),
         make(dna_letters, Alphabet.generic_dna),
         make(rna_letters, IUPAC.IUPACAmbiguousRNA()),
         make(dna_letters, IUPAC.IUPACAmbiguousDNA()),
         # Short sequences with explicit alphabets
         make("AWGAARCKG", Alphabet.generic_dna),
         make("AUGAAACUG", Alphabet.generic_rna),
         make("ATGAAACTG", IUPAC.unambiguous_dna),
         make("ATGAAACTGWN", IUPAC.ambiguous_dna),
         make("AUGAAACUG", Alphabet.generic_rna),
         make("AUGAAACUG", IUPAC.unambiguous_rna),
         make("AUGAAACUGWN", IUPAC.ambiguous_rna),
         make("ATGAAACTG", Alphabet.generic_nucleotide),
         # Mutable sequences
         make_mutable("ATGAAACTG", Alphabet.generic_dna),
         make_mutable("AUGaaaCUG", IUPAC.unambiguous_rna),
     ]
Example #21
0
def concat_genome(input_dir, ext, outpath, verbose=False):
    """Concatenate the first record of several FASTA files into one record.

    Files matching the glob pattern input_dir + ext are processed in sorted
    order. The first sequence of each is appended to a combined sequence,
    which is written to outpath as a single record with the id
    "concatenated_genome".

    :param input_dir: directory prefix of the glob pattern.
    :param ext: glob suffix appended to input_dir.
    :param outpath: path of the FASTA file to write.
    :param verbose: print progress details and caught write errors.
    :returns: 0 on success; 1 when no file matched or writing failed.
    :raises IndexError: if a matched file contains no FASTA record (the
        exception type of the original implementation is kept).
    """
    fastas = sorted(glob.glob(str(input_dir + ext)))
    if not fastas:
        if verbose:
            print("No files found!")
        return 1
    if verbose:
        print(str("combining the following files matching extension " +
                  "{0}:{1}".format(ext, " ".join(fastas))))
    # Collect the pieces and join once instead of growing a string.
    parts = []
    total_len = 0
    for filen in fastas:
        print("Adding %s to combined sequence" % filen)
        with open(filen, 'r') as i_file:
            # Only the first record is used, so stop parsing after it.
            seq_rec = next(SeqIO.parse(i_file, 'fasta'), None)
        if seq_rec is None:
            raise IndexError("no FASTA record found in %s" % filen)
        piece = str(seq_rec.seq)
        parts.append(piece)
        total_len += len(piece)
        if verbose:
            print(str("Len of sequence:{0}\nLen of concatenated " +
                      "sequence:{1}").format(len(seq_rec), total_len))
    new_seq = "".join(parts)
    try:
        with open(outpath, 'w') as o_file:
            success = SeqIO.write(
                SeqRecord(
                    seq=Seq(new_seq, IUPAC.IUPACAmbiguousDNA()),
                    id="concatenated_genome"), o_file, 'fasta')
    except Exception as e:
        if verbose:
            print(e)
        return 1
    if success:
        print("wrote out concatenated file!")
        return 0
    # Nothing was written: report failure explicitly rather than
    # returning None.
    return 1
Example #22
0
def cds_to_seqrecord(cds, parent_genome):
    """Build a SeqRecord from a Cds and the Genome it belongs to.

    :param cds: A populated Cds object.
    :type cds: Cds
    :param parent_genome: Populated parent Genome object of the Cds object.
    :returns: Filled Biopython SeqRecord object.
    :rtype: SeqRecord
    """
    record = SeqRecord(cds.translation)
    # NOTE(review): the record wraps cds.translation yet is tagged with a
    # DNA alphabet - confirm this is intended.
    record.seq.alphabet = IUPAC.IUPACAmbiguousDNA()

    record.name = cds.id
    locus_tag = cds.locus_tag
    if locus_tag != "":
        record.id = locus_tag

    # Rebuild the feature on the Cds before attaching it to the record.
    cds.set_seqfeature()
    record.features = [cds.seqfeature]

    description = "{} [{} phage {}]".format(
        cds.description, parent_genome.host_genus, cds.genome_id)
    record.description = description
    record.annotations = get_cds_seqrecord_annotations(cds, parent_genome)

    return record
Example #23
0
    print "%s={%s} --> {%s}=%s" % \
        (ambig_char, values, compl_values, ambiguous_rna_complement[ambig_char])
    assert set(compl_values) == set(
        ambiguous_rna_values[ambiguous_rna_complement[ambig_char]])

print
print "Reverse complements:"
for sequence in [
        Seq.Seq("".join(sorted(ambiguous_rna_values))),
        Seq.Seq("".join(sorted(ambiguous_dna_values))),
        Seq.Seq("".join(sorted(ambiguous_rna_values)), Alphabet.generic_rna),
        Seq.Seq("".join(sorted(ambiguous_dna_values)), Alphabet.generic_dna),
        Seq.Seq("".join(sorted(ambiguous_rna_values)).replace("X", ""),
                IUPAC.IUPACAmbiguousRNA()),
        Seq.Seq("".join(sorted(ambiguous_dna_values)).replace("X", ""),
                IUPAC.IUPACAmbiguousDNA()),
        Seq.Seq("AWGAARCKG")
]:  # Note no U or T
    print "%s -> %s" \
          % (repr(sequence), repr(Seq.reverse_complement(sequence)))
    assert str(sequence) \
       == str(Seq.reverse_complement(Seq.reverse_complement(sequence))), \
       "Dobule reverse complement didn't preserve the sequence!"
print

###########################################################################

test_seqs = [
    s,
    t,
    u,
Example #24
0
outputName = sys.argv[3]

refDict = {}
strainDict = {}
chrListRef = []

#read in fasta
#build dictionary with chr as key and sequence as value

refGenome = open(refGenomeName, 'r')
for seq_record in SeqIO.parse(refGenome, "fasta"):
    refDict[seq_record.id] = seq_record.seq
    idStr = str(seq_record.id)
    chrListRef.append(idStr)
    seqStr = str(seq_record.seq)
    strainDict[idStr] = MutableSeq(seqStr, IUPAC.IUPACAmbiguousDNA())

outIndelName = outputName + ".indel"
outIndelFile = open(outIndelName, 'w')

vcf = open(strainVCF, 'r')
lines = vcf.readlines()
counter = 0
for line in lines:
    currentLine = line.strip('\n')
    if re.match('^#', currentLine):  #Looks for lines that start with #
        header = currentLine
        #print "this is just the header"
    else:
        SNPline = currentLine.split()  #Split based on tabs
        chr = SNPline[0]
Example #25
0
def proteinToCodonAlignment(proteinAlignment, extraDnaSeqs=None):
    """Build a codon alignment for a protein alignment via codonalign.

    The DNA of every protein id is fetched through patric_api, optionally
    overridden by entries of extraDnaSeqs, and handed to codonalign.build
    in the order of the protein alignment. Annotations of each protein
    record are copied onto the matching result record.

    Raises Exception when the protein ids and the DNA ids differ.
    """
    protSeqDict = {seqRecord.id: seqRecord for seqRecord in proteinAlignment}
    dnaFasta = patric_api.getSequenceOfFeatures(protSeqDict.keys(), 'dna')

    fastaRecords = SeqIO.parse(StringIO(dnaFasta),
                               "fasta",
                               alphabet=IUPAC.IUPACAmbiguousDNA())
    dnaSeqDict = SeqIO.to_dict(fastaRecords)

    # Caller-supplied DNA takes precedence over the fetched sequences.
    for seqId in protSeqDict:
        if not extraDnaSeqs or seqId not in extraDnaSeqs:
            continue
        dnaSeqDict[seqId] = extraDnaSeqs[seqId]
        if Debug:
            LOG.write("appending extra DNA seq %s\n" % seqId)

    if set(dnaSeqDict.keys()) != set(protSeqDict.keys()):
        proteinIds = ", ".join(sorted(protSeqDict))
        dnaIds = ", ".join(sorted(dnaSeqDict))
        raise Exception(
            "Protein and DNA sets differ:\nProteins: %s\nDNA: %s\n" %
            (proteinIds, dnaIds))

    # Empty DNA sequences are reported but kept.
    for seqId in dnaSeqDict:
        if len(dnaSeqDict[seqId].seq) == 0:
            LOG.write("warning: seqId %s length of dna was zero\n" % seqId)

    dnaSeqRecords = [dnaSeqDict[proteinSeq.id] for proteinSeq in proteinAlignment]

    if Debug:
        LOG.write("dna seqs has %d seqs\n" % (len(dnaSeqRecords)))

    # The default codon table is used; earlier experiments with an
    # ambiguous codon table and DNA padding were disabled in the original.
    returnValue = codonalign.build(pro_align=proteinAlignment,
                                   nucl_seqs=dnaSeqRecords,
                                   max_score=1000)
    for dnaSeq in returnValue:
        sourceAnnotations = protSeqDict[dnaSeq.id].annotations
        if sourceAnnotations:
            dnaSeq.annotations = sourceAnnotations.copy()

    return returnValue
Example #26
0
print "RNA Ambiguity mapping:", sorted_dict(ambiguous_rna_values)
print "RNA Complement mapping:", sorted_dict(ambiguous_rna_complement)
for ambig_char, values in sorted(ambiguous_rna_values.iteritems()):
    compl_values = complement(values).replace("T","U")  # need to help as no alphabet
    print("%s={%s} --> {%s}=%s" % \
        (ambig_char, values, compl_values, ambiguous_rna_complement[ambig_char]))
    assert set(compl_values) == set(ambiguous_rna_values[ambiguous_rna_complement[ambig_char]])

print
print("Reverse complements:")
for sequence in [Seq.Seq("".join(sorted(ambiguous_rna_values))),
            Seq.Seq("".join(sorted(ambiguous_dna_values))),
            Seq.Seq("".join(sorted(ambiguous_rna_values)), Alphabet.generic_rna),
            Seq.Seq("".join(sorted(ambiguous_dna_values)), Alphabet.generic_dna),
            Seq.Seq("".join(sorted(ambiguous_rna_values)).replace("X",""), IUPAC.IUPACAmbiguousRNA()),
            Seq.Seq("".join(sorted(ambiguous_dna_values)).replace("X",""), IUPAC.IUPACAmbiguousDNA()),
            Seq.Seq("AWGAARCKG")]:  # Note no U or T
        print("%s -> %s" \
              % (repr(sequence), repr(Seq.reverse_complement(sequence))))
        assert str(sequence) \
           == str(Seq.reverse_complement(Seq.reverse_complement(sequence))), \
           "Dobule reverse complement didn't preserve the sequence!"
print

###########################################################################

test_seqs = [s,t,u,
             Seq.Seq("ATGAAACTG"),
             "ATGAAACtg",
             #TODO - Fix ambiguous translation
             #Seq.Seq("ATGAARCTG"),
Example #27
0
)
from Bio.Data.CodonTable import TranslationError, standard_dna_table
from Bio.Seq import MutableSeq

# Fixture list of sequences covering the combinations exercised by the tests:
# no alphabet, generic DNA/RNA/nucleotide alphabets, IUPAC ambiguous and
# unambiguous alphabets, gapped sequences, mutable sequences and a protein.
# NOTE(review): the consumers of this list are not visible here; the order of
# the entries may matter to them, so it is left untouched.
test_seqs = [
    Seq.Seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna),
    Seq.Seq("T", IUPAC.ambiguous_dna),
    # No explicit alphabet given for the next five entries.
    Seq.Seq("ATGAAACTG"),
    Seq.Seq("ATGAARCTG"),
    Seq.Seq("AWGAARCKG"),  # Note no U or T
    Seq.Seq("".join(ambiguous_rna_values)),
    Seq.Seq("".join(ambiguous_dna_values)),
    # Every ambiguity code, joined in dict iteration order (not sorted).
    Seq.Seq("".join(ambiguous_rna_values), Alphabet.generic_rna),
    Seq.Seq("".join(ambiguous_dna_values), Alphabet.generic_dna),
    Seq.Seq("".join(ambiguous_rna_values), IUPAC.IUPACAmbiguousRNA()),
    Seq.Seq("".join(ambiguous_dna_values), IUPAC.IUPACAmbiguousDNA()),
    Seq.Seq("AWGAARCKG", Alphabet.generic_dna),
    Seq.Seq("AUGAAACUG", Alphabet.generic_rna),
    Seq.Seq("ATGAAACTG", IUPAC.unambiguous_dna),
    Seq.Seq("ATGAAA-CTG", Alphabet.generic_dna),  # contains a "-" character
    Seq.Seq("ATGAAACTGWN", IUPAC.ambiguous_dna),
    Seq.Seq("AUGAAACUG", Alphabet.generic_rna),  # NOTE(review): duplicate of an earlier entry - confirm intentional
    Seq.Seq("AUGAAA==CUG", Alphabet.generic_rna),  # contains "=" characters
    Seq.Seq("AUGAAACUG", IUPAC.unambiguous_rna),
    Seq.Seq("AUGAAACUGWN", IUPAC.ambiguous_rna),
    Seq.Seq("ATGAAACTG", Alphabet.generic_nucleotide),
    Seq.Seq("AUGAAACTG", Alphabet.generic_nucleotide),  # U and T
    Seq.MutableSeq("ATGAAACTG", Alphabet.generic_dna),
    Seq.MutableSeq("AUGaaaCUG", IUPAC.unambiguous_rna),  # mixed case
    Seq.Seq("ACTGTCGTCT", Alphabet.generic_protein),  # nucleotide-looking letters with a protein alphabet
]
Example #28
0
    def get_biop_motif(self, cluster_num, motif_num, option='sites'):
        """Export the specified motif to a biopython motif object.

        Parameters:
        - cluster_num: bicluster number
        - motif_num: motif number
        - option: how to translate - 'sites': jaspar 'sites' file;
          'pfm': jaspar 'pfm' file

        Returns:
        - the object produced by ``motifs.read`` from the generated text

        Raises:
        - ValueError: if ``option`` is neither 'sites' nor 'pfm'
        """
        if option not in ('pfm', 'sites'):
            # An unknown option used to fall through both branches and fail
            # with an UnboundLocalError on ``mot`` at the return statement.
            raise ValueError("option must be 'sites' or 'pfm', got %r" % (option,))

        rowid = self.__get_motif_id(cluster_num, motif_num)
        site_table = self.tables['meme_motif_sites']
        mot_sites = site_table[site_table.motif_info_id == rowid]

        # NOTE: writing a MEME file and parsing it with motifs.read(..., 'meme')
        # was tried as well, but Bio.motifs can't parse the incomplete MEME file.
        output = StringIO()
        try:
            if option == 'pfm':
                # Create a jaspar 'pfm' file from the pssm.
                motif_pssm_rows = self.tables['motif_pssm_rows']
                pssm = motif_pssm_rows[(motif_pssm_rows.iteration == self.iteration) &
                                       (motif_pssm_rows.motif_info_id == rowid)]
                # ``axis`` is passed by keyword: the positional form was
                # removed in pandas 2.0.
                pssm = pssm.drop(['motif_info_id', 'iteration', 'row'], axis=1)

                # Scale the per-position values by the number of sites and
                # round to get a count matrix, one row per remaining column.
                counts = np.round(pssm * mot_sites.shape[0]).transpose()
                counts.to_string(output, header=False, index=False)
                output.seek(0)
                mot = motifs.read(output, 'pfm')
            else:
                # Create a jaspar 'sites' file: one fasta record per site,
                # with lower-cased flanks around the site sequence.
                seqs = {}
                for i in mot_sites.index.values:
                    # ``.loc`` replaces ``.ix`` (removed in pandas 1.0); ``i``
                    # is an index label, so the lookup is label based.
                    site = mot_sites.loc[i]
                    flank_left = site.flank_left
                    flank_left = Seq(flank_left if flank_left is not None else "",
                                     IUPAC.IUPACAmbiguousDNA()).lower()
                    seq = Seq(site.seq, IUPAC.IUPACAmbiguousDNA())
                    flank_right = site.flank_right
                    flank_right = Seq(flank_right if flank_right is not None else "",
                                      IUPAC.IUPACAmbiguousDNA()).lower()
                    full_seq = flank_left + seq + flank_right
                    seqs[i] = SeqRecord(full_seq, id=site.seq_name)

                SeqIO.write(seqs.values(), output, 'fasta')
                output.seek(0)
                mot = motifs.read(output, 'sites')
        finally:
            # Release the buffer even when building or parsing the text fails.
            output.close()

        # Note Bio.motifs.weblogo() uses the weblogo server (slow? requires
        # connection), e.g. mot.weblogo('file.png', color_scheme='color_classic').
        return mot
Example #29
0
def search_sines(sine_f, r1_f, override = 0, upper_mut_dist = 30, step_print = 10000, nlines = 500000, sine_l = 80):
    """Fuzzy-search reads for SINE prefixes and print edit-distance statistics.

    A single alternation regexp is built from the first ``sine_l`` bases of
    every record in ``sine_f`` plus their reverse complements, and each read
    of ``r1_f`` is matched against it with the ``tre`` approximate matcher.

    Parameters:
    - sine_f: FASTA source handed to ``SeqIO.parse``.
    - r1_f: iterable of read strings. It is consumed incrementally, so the
      reads used or skipped for ``override > 1`` are only excluded from the
      main scan when this is an iterator (presumably an open file - confirm
      against the caller).
    - override: 0 searches for the SINEs of ``sine_f``; 1 searches for one
      random sequence of length ``sine_l`` and its reverse complement instead;
      a value above 1 searches for the prefixes of the next three reads
      instead, after first consuming ``override`` reads when ``override > 2``.
    - upper_mut_dist: ``maxerr`` passed to ``tre.Fuzzyness``.
    - step_print: statistics are printed every ``step_print`` reads.
    - nlines: the scan stops after this many reads.
    - sine_l: number of leading bases of each pattern sequence that is used.

    Side effects: resets and fills the module globals ``bar_codes``,
    ``detailed_stats`` and ``distances_from_combined_regexp`` and prints
    progress to stdout. Returns None.
    """
    print ('override =',override)
    sine_set = []
    stats = collections.Counter()

    # The results are published through module-level globals.
    global bar_codes
    bar_codes = {}

    global detailed_stats
    detailed_stats = collections.Counter()

    global distances_from_combined_regexp
    distances_from_combined_regexp = {}

    def compile_alternatives(patterns):
        # One regexp that matches any of the given sequences.
        return tre.compile('''|'''.join(patterns), tre.EXTENDED)

    for sine_record in SeqIO.parse(sine_f, "fasta"):
        cur_seq = Seq(str(sine_record.seq)[:sine_l], IUPAC.IUPACAmbiguousDNA())
        cur_seq_rc = cur_seq.reverse_complement()
        sine_set.append(str(cur_seq))
        sine_set.append(str(cur_seq_rc))
        print(cur_seq, cur_seq_rc, '''\n ======================''')

    p = compile_alternatives(sine_set)

    if override == 1:
        bases = ['A','C','G','T']
        ind_list = [random.randrange(4) for i in range(sine_l)]
        r_sine = ''.join(bases[ind] for ind in ind_list)
        # bases[3 - ind] is the complement of bases[ind]; the indices are
        # walked backwards so the result is the *reverse* complement, like
        # every other pattern pair in this function (the complement used to
        # be emitted unreversed).
        r_sine_rc = ''.join(bases[3 - ind] for ind in reversed(ind_list))
        sine_set = [r_sine, r_sine_rc]
        p = compile_alternatives(sine_set)

    # Also specifies the shift  range
    if override > 1:
        if override > 2:
            d = override - 1 #random.randrange(2, override)
            print('skipping ',d)
            # Consumes reads 0..d from r1_f when it is an iterator.
            for (i,cur_seq) in enumerate(r1_f):
                if i == d:
                    break

        # Use the prefixes of the next three reads as the patterns.
        sine_set = []
        for (i,s) in enumerate(r1_f):
            cur_seq = Seq(s[:sine_l], IUPAC.IUPACAmbiguousDNA())
            cur_seq_rc = cur_seq.reverse_complement()
            sine_set.append(str(cur_seq))
            sine_set.append(str(cur_seq_rc))
            if i == 2:
                break

        p = compile_alternatives(sine_set)

    total = 0
    cnt = 0
    start_time = time()
    print('''sequences = ''')

    # Number of bases directly upstream of the match that are kept as the
    # bar code; a match must start further in than this to be counted.
    flank_len = 40
    fuzzyness = tre.Fuzzyness(maxerr = upper_mut_dist)
    for cur_seq in r1_f:
        total += 1
        m = p.search(cur_seq, fuzzyness)
        if m:
            res = m.group(0)
            d = m.cost
            span = m.groups()[0]
            # Filter out strings that were cut out. Approximate by max-length matches
            # 10 is arbitrary, not very small
            if (span[1] < len(cur_seq) - 10) and (span[0] > flank_len):
                cnt += 1
                stats[d] += 1

                bar_code = cur_seq[span[0] - flank_len : span[0]]
                bar_codes[bar_code] = bar_codes.get(bar_code, 0) + 1

            # Recorded for every match, including the filtered-out ones.
            detailed_stats[res] += 1
            distances_from_combined_regexp[res] = d

        if (total % step_print == 0 or total == nlines):
            print('''distances for first''', total, '''segments \n''')
            print('''========================''')
            print('''time elapsed''', (time() - start_time)/60.0, '''minutes''')
            for k in sorted(stats):
                print('edit distance =', k, 'matches =', stats[k], '''/''',cnt)

        if (total == nlines):
            break
Example #30
0
def assemble_alleles(fasta, cfg, **kargs):
    """
    Main function for STEP7:
    Allele fragments are assembled in order to recover the full length
    transcript.

    Parameters:
        fasta (str)     Path of the fasta alignment to process.
        cfg (mapping)   Settings, read with the keys listed below.
        **kargs         Accepted for call compatibility; not used.

    Keys read from cfg:
        input_format                    Suffix of the input file; the text
                                        before it is the output basename.
        output_suffix                   Inserted before ".fasta" in the
                                        output file name.
        output_folder                   Folder the output file is moved to.
        taxa_without_alleles            Taxa excluded from the allele search
                                        (a false value means none).
        dna_model                       Must be one of 0, 1, 2, 3, 4, 5; passed
                                        on to the distance calculation.
        overlap_threshold (int)         Parameter that plays two roles in the
                                        analysis:
                                        1. two fragments from the same origin
                                        are only investigated if they do not
                                        overlap by more than this value.
                                        2. in order to assemble two fragments
                                        f0, f1, there must be a fragment t from
                                        a different taxon that overlaps each of
                                        the two sequences by
                                        overlap_threshold / 2. f0 and f1 are
                                        assembled if they are both in the
                                        overlap region among the closest
                                        sequences from t.
        kmer (int)                      The number of non ambiguous consecutive
                                        bases to mark the beginning and end of
                                        a sequence. Must be below 50.
        overlap_proportion (float)      When testing for homology of two non
                                        (low) overlapping sequences f0, f1 from
                                        taxon t0, we search the set of
                                        sequences from the other taxa that
                                        largely overlap both f0 and f1.
                                        The number of taxa to be searched is
                                        (number seq that overlap) *
                                        overlap_proportion
        max_alleles_distance (float)    Max distance allowed between 2 alleles.
        mistmatch_cost (int)            Penalty cost for mismatch in the
                                        overlapping flanking regions of
                                        putative alleles.
        overlap_cost (int)              Penalty cost for overlap in the
                                        flanking regions of putative alleles.

    Returns:
        A tuple (info message, detail message). When 'overlap_threshold' is
        not below the alignment length the tuple is (error message, None).

    Raises:
        ValueError  if 'kmer' is not below 50 or 'dna_model' is not one of
                    the accepted values.
        Exception   if the alignment cannot be read.

    Side effects: writes <basename><output_suffix>.fasta and moves it into
    the output folder unless a file of that name already exists there.
    """

    def is_below(value, limit):
        # A value that cannot be compared at all (e.g. None) counts as
        # invalid instead of surfacing as a TypeError.
        try:
            return value < limit
        except TypeError:
            return False

    in_format = cfg["input_format"]
    out_suffix = cfg["output_suffix"]
    out_folder = cfg["output_folder"]
    taxa_no_allele = cfg["taxa_without_alleles"] or []
    kmer = cfg["kmer"]
    overlap_ratio = cfg["overlap_proportion"]
    overlap_threshold = cfg["overlap_threshold"]
    max_allele_dist = cfg["max_alleles_distance"]
    overlap_cost = cfg["overlap_cost"]
    mistmatch_cost = cfg["mistmatch_cost"]
    basename = fasta.split(in_format)[0]

    # Explicit checks instead of ``assert``: asserts vanish under ``python -O``.
    if not is_below(kmer, 50):
        raise ValueError("'kmer' > 50, unrealistic parameter value")

    # This check used to be a bare expression inside ``try`` and therefore
    # never rejected anything.
    try:
        dna_model = cfg["dna_model"]
    except KeyError:
        dna_model = None
    if dna_model not in [0, 1, 2, 3, 4, 5]:
        raise ValueError(
            "Wrong parameters: 'dna_model' must be in [0, 1, 2, 3, 4, 5]")

    try:
        align = AlignIO.read(fasta, "fasta")
    except Exception as err:
        exception = "[Error] Problem opening fasta alignment {}\n\
Are all sequences the same length?".format(fasta)
        raise Exception(exception) from err

    align_length = len(align)
    if align_length < 3:
        s_i = "STEP7 Done with fasta: {}".format(fasta)
        s_d = "No transformation performed on fasta {0} \n\
which only contains {1} sequences".format(fasta, align_length)
        return (s_i, s_d)

    length = align.get_alignment_length()
    if not is_below(overlap_threshold, length):
        exception = "[Error]: for fasta {} 'overlap_threshold'\
         exceeds the length of the alignment".format(fasta)
        return (exception, None)

    #  Calculate distance matrix
    calculate_distance_matrix(fasta, {
        "input_suffix": in_format,
        "dna_model": dna_model
    })
    distance_matrix = basename + "_for_dnadist.fasta.distmat"
    dist = distance_matrix_parser(distance_matrix)
    dist_dict = {tuple(sorted([x[0], x[1]])): x[2]["distance"] for x in dist}
    #  Every sequence name that appears in the distance matrix
    #   (a set: it is only used for membership tests).
    non_empty_seqs = set(chain.from_iterable(dist_dict))

    #  Find records with alleles that have not been excluded in the settings
    rec_alleles = [
        rec for rec in align if rec.name.split("|")[0] not in taxa_no_allele
        and rec.name in non_empty_seqs
    ]
    align_dict = {
        rec.name: str(rec.seq)
        for rec in align if rec.name in non_empty_seqs
    }
    taxa = {rec.name: str(rec.seq) for rec in rec_alleles}

    #  For each sequence we recode 0 for '-', 1 otherwise,
    #   into a dictionary {name_number: profile}
    taxa_profiles = sequence_profiler(taxa)

    #  Find beginning and end for each sequence.
    #  If no beginning/end is found do not include the sequence.
    start_end_dict = {}
    for rec, sequence in taxa.items():
        bounds = find_start_end(sequence, kmer)
        if bounds is not None:
            start_end_dict[rec] = bounds

    #  Nbr overlapping sequences used for calculation
    overlapping_seq = int(overlap_ratio * len(taxa))

    #  Dictionary that contains the taxa (i.e. the transcriptomes) as keys
    #   and the names of all sequences corresponding to the key as value.
    taxa_contigs = {taxon: [] for taxon in {name.split("|")[0] for name in taxa}}
    for rec in taxa:
        if rec in start_end_dict:
            taxa_contigs[rec.split("|")[0]].append(rec)

    #  Dict with items of overlapping_seq as keys and as values the list
    #   of all overlapping seqs. Dict used for finding the seqs from
    #   a different taxon that overlap the sequences from a given taxon.
    dict_overlap_seq = {}
    for seq in taxa:
        dict_overlap_seq[seq] = [
            x for x in taxa if x != seq and (get_overlap_score(
                tuple(sorted([x, seq])), taxa_profiles) > overlap_threshold)
        ]

    #  Obtain a dictionary that keeps for each allele pair the longest and
    #   assigns None to the sequences that are not paired.
    #  NOTE(review): the taxon is visited once per sequence, so the allele
    #   search is repeated for taxa with several sequences; kept as is
    #   because 'assertain_allele' is not visible from here.
    alleles_dict = {}
    para_dict = {}
    for taxon in [x.split("|")[0] for x in taxa]:
        contigs = taxa_contigs[taxon]
        if len(contigs) > 1:
            L, P = assertain_allele(
                contigs,
                dist_dict,
                taxa_profiles,
                dict_overlap_seq,
                max_allele_dist,
                overlapping_seq,
            )
            para_dict.update(P)
            for (k, v) in L:
                alleles_dict[k] = v
        elif contigs:
            alleles_dict[contigs[0]] = None
        #  A taxon without any usable contig (no start/end found) is left
        #   out here instead of failing with an IndexError; its sequences
        #   are added unchanged at the end.

    #  Obtain a new dictionary of taxon as key and sequences without alleles
    #   as values. These are the sequences that are not alleles so they could
    #   be assembled into full length transcripts.
    taxa_contigs_no_alleles = {
        k: [seq for seq in v if seq in alleles_dict]
        for k, v in taxa_contigs.items()
    }

    #  Dict that for each taxon finds the seq pairs with overlap below the
    #   threshold and that have a common sequence that overlaps both
    #   sequences as the best match as obtained from the distance analysis.
    taxa_with_pair = {}
    out_seqs = []
    used_seq_in_out = []

    def emit_unassembled(name):
        #  Output a sequence that takes part in no assembly: either its
        #   allele group or, without one, the sequence itself.
        alleles = alleles_dict[name]
        if alleles and len(alleles) > 1:
            out_seqs.extend([(x, align_dict[x].upper().replace("N", "-"))
                             for x in alleles])
            used_seq_in_out.extend(alleles)
        else:
            out_seqs.append(
                (name, align_dict[name].upper().replace("N", "-")))
            #  Record the name itself; 'alleles_dict[name]' (None or a
            #   short list) used to be appended here by mistake.
            used_seq_in_out.append(name)

    for taxon, seqs in list(taxa_contigs_no_alleles.items()):

        #  Case 1. there is only one sequence, so no pair can be inferred.
        if len(seqs) == 1:
            emit_unassembled(seqs[0])
            continue

        #  Case 2. there are several sequences, try to find ways to
        #   concatenate them.
        L = []
        for p in combinations(seqs, 2):
            overlap = get_overlap_score(p, taxa_profiles)
            start_0, end_0 = start_end_dict[p[0]]
            start_1, end_1 = start_end_dict[p[1]]
            s_p = sorted([(p[0], start_0), (p[1], start_1)],
                         key=lambda x: x[1])
            sorted_pair = (s_p[0], s_p[1])
            if not overlap:
                seq_coverage = (end_0 - start_0) + (end_1 - start_1)
            else:
                seq_coverage = (max(end_0, end_1) - min(end_0, end_1) +
                                min(end_0, end_1) - max(start_0, start_1) +
                                max(start_0, start_1) - min(start_0, start_1))
            #  If there is little overlap between two seqs of the same taxon
            #   and they share the closest overlapping sequence, create
            #   an edge between the two sequences.
            if overlap <= overlap_threshold and closest_match(
                    p,
                    taxon,
                    align_dict,
                    taxa_profiles,
                    dist_dict,
                    overlap_threshold,
                    overlap_ratio,
                    alleles_dict,
            ):
                L.append((
                    sorted_pair,
                    {
                        "overlap": overlap,
                        "sequence_coverage": seq_coverage,
                        "start": min(start_0, start_1),
                    },
                ))
        if L:
            taxa_with_pair[taxon] = sorted(L, key=lambda x: x[1]["start"])
        else:
            #  No pair could be calculated: output every sequence once
            #   (each sequence is a member of at least one pair).
            for item in seqs:
                emit_unassembled(item)

    #  Case there are no sequences that can be paired:
    #  Return the original alignment.
    if not taxa_with_pair:
        clean_align = [rec for rec in align if rec.name in non_empty_seqs]
        out_name = basename + out_suffix + ".fasta"
        SeqIO.write(clean_align, out_name, "fasta")
        if not os.path.isfile(os.path.join(out_folder, out_name)):
            shutil.move(out_name, out_folder)
        s_i = "STEP7 Done with fasta: {}".format(fasta)
        s_d = "No full size allele could be reconstructed for fasta: {}".format(
            fasta)

        return (s_i, s_d)

    def score_path(path):
        #  Path length in nucleotides minus the weighted penalties.
        path_length = get_path_length(path, start_end_dict)
        penalties = count_mismatches(path, align)
        return (path_length - overlap_cost * penalties[0] -
                mistmatch_cost * penalties[1])

    def terminal_nodes(graph):
        #  Nodes without incoming edges and nodes without outgoing edges.
        sources = [k for k, v in list(graph.in_degree(graph.nodes())) if v == 0]
        sinks = [k for k, v in list(graph.out_degree(graph.nodes())) if v == 0]
        return sources, sinks

    def add_terminal_paths(graph, sources, sinks, path_scores):
        #  Score every simple path from a source to a sink.
        for source, sink in product(sources, sinks):
            for path in nx.all_simple_paths(graph, source=source, target=sink):
                path_scores[tuple(path)] = score_path(path)

    def drop_node(graph, node, path_scores):
        #  Remove a node together with every candidate path through it.
        graph.remove_node(node)
        for path in list(path_scores):
            if node in path:
                del path_scores[path]

    #  Case some sequences are paired:
    #  1. Generate all possible paths between nodes that have an indegree of
    #     zero and nodes that have an outdegree of zero.
    #  2. The paths are scored according to their length in terms of number
    #     of nucleotides minus the number of weighted mismatches, minus
    #     the number of weighted overlaps.
    #  3. The best path is selected.
    #  4. Then new paths are recalculated by removing the selected terminal
    #     nodes and recalculating the paths, adding them to the path pool
    #     for selection.
    #  5. Continue until all nodes have been used at least once.
    for taxon in list(taxa_with_pair):
        #  Nodes are (sequence name, start) tuples.
        G = nx.DiGraph()
        G.add_edges_from([(x[0][0], x[0][1], x[1])
                          for x in taxa_with_pair[taxon]])
        in_nodes, out_nodes = terminal_nodes(G)

        #   Initialize the search procedure with the first path
        used_seqs = []
        possible_path_dict = {}
        add_terminal_paths(G, in_nodes, out_nodes, possible_path_dict)

        #  sorted(...)[-1] rather than max(): on equal scores the path that
        #   sorts last is the one that is kept.
        best_path = sorted(list(possible_path_dict.items()),
                           key=lambda x: x[1])[-1]
        sequences_best_path = list(best_path[0])
        all_best_paths = [best_path]

        #  Remove the sequences of the best path from the graph only
        #   if there is a paralog sequence that is overlapping them.
        for seq in sequences_best_path:
            used_seqs.append(seq[0])
            if seq[0] in para_dict:
                drop_node(G, seq, possible_path_dict)

        remaining_seqs = list(G.nodes())

        while len(remaining_seqs) > 1:
            #  Retrieve 5' and 3' sequences.
            in_nodes, out_nodes = terminal_nodes(G)
            if in_nodes == out_nodes:
                break

            #  Find all paths that originate in one in_node and end in one
            #   out_node
            add_terminal_paths(G, in_nodes, out_nodes, possible_path_dict)

            #  Select the path with the best score:
            #   if the putative path is entirely included in a previously
            #   selected path, it is rejected and the loop ends.
            #  'best_path' still holds the previous selection on entry; the
            #   statements below depend on that and on their exact order.
            for path in sorted(list(possible_path_dict.items()),
                               key=lambda x: x[1],
                               reverse=True):
                for b_path in [set(x[0]) for x in all_best_paths]:
                    if set(path[0]).intersection(b_path) == set(path[0]):
                        best_path = None
                        break
                if path not in all_best_paths and best_path is not None:
                    best_path = path
                    break
                else:
                    best_path = None
            if best_path is None:
                break

            all_best_paths.append(best_path)
            sequences_best_path = list(best_path[0])
            #  Switch that records whether the path is entirely made of
            #   sequences lacking paralogs
            no_paralogue = True

            #  1- Remove all sequences in the best path that have a paralog
            for seq in sequences_best_path:
                used_seqs.append(seq[0])
                if seq[0] in para_dict:
                    drop_node(G, seq, possible_path_dict)
                    no_paralogue = False

            #  2- Remove all sequences in the path if only made out of
            #     sequences lacking a paralog
            if no_paralogue:
                for seq in sequences_best_path:
                    drop_node(G, seq, possible_path_dict)

            #  A list, not the node view: it is indexed by position below.
            remaining_seqs = list(G.nodes())

        if remaining_seqs and remaining_seqs[0][0] not in used_seqs:
            all_best_paths.append(((remaining_seqs[0], ), None))

        #  Keep only the sequence names of every selected path.
        all_best_paths_seq = [[y[0] for y in x[0]] for x in all_best_paths]

        final_paths = []
        for path in all_best_paths_seq:
            final_paths.append(path)
            alleles_path = []
            #  Switch that ensures that there is at least one seq with alleles
            switch = False
            for p in path:
                if alleles_dict[p] is None and len(path) > 1:
                    alleles_path.append(p)
                elif alleles_dict[p] is not None:
                    alternative_allele = [x for x in alleles_dict[p] if x != p]
                    alleles_path.extend(alternative_allele)
                    switch = True
            if switch:
                final_paths.append(alleles_path)

        for path in final_paths:
            used_seq_in_out.extend(path)
            out_seqs.append(make_consensus(path, align))

    #  Add sequences that have not been included in any path.
    #  NOTE(review): unlike the other outputs these are not upper-cased
    #   before "N" is replaced - confirm this is intended.
    emitted_names = {x[0] for x in out_seqs}
    for item in set(align_dict).difference(used_seq_in_out):
        if item not in emitted_names:
            out_seqs.append((item, align_dict[item].replace("N", "-")))

    #  Drop duplicates, order by name and rename.
    final_records = []
    for entry in sorted(set(out_seqs), key=lambda x: x[0]):
        name = rename(entry[0])
        final_records.append(
            SeqRecord(Seq(entry[1], IUPAC.IUPACAmbiguousDNA()),
                      name=name,
                      id=name,
                      description=""))

    out_name = basename + out_suffix + ".fasta"

    SeqIO.write(final_records, out_name, "fasta")
    if not os.path.isfile(os.path.join(out_folder, out_name)):
        shutil.move(out_name, out_folder)

    s_i = "STEP7 Done with fasta: {}".format(fasta)
    s_d = "Alignment size was reduced from {0} to {1} sequences".format(
        len(align), len(final_records))
    return (s_i, s_d)