Ejemplo n.º 1
0
 def setUp(self):
     """Build one codon alignment per test data set.

     Every ``TEST_ALIGN_FILE*`` entry is read as ``entry[0]`` (file paths)
     and ``entry[1]`` (a mode string: "parse", "index" or "id").  The mode
     selects how the nucleotide file is loaded; the resulting alignments
     are stored, in order, in ``self.alns``.
     """
     self.aln_file = [
         TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
         TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6,
     ]
     built = []
     for entry in self.aln_file:
         mode = entry[1]
         if mode == "parse":
             paths = entry[0]
             # Nucleotide records are streamed from the FASTA file.
             nucl = SeqIO.parse(paths[0], "fasta",
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], "clustal", alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 caln = codonalign.build(
                     prot, nucl,
                     alphabet=codonalign.default_codon_alphabet)
         elif mode == "index":
             paths = entry[0]
             # Random-access FASTA index; the protein alignment is read
             # with generic_protein here instead of IUPAC.protein.
             nucl = SeqIO.index(paths[0], "fasta",
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], "clustal", alphabet=generic_protein)
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 caln = codonalign.build(
                     prot, nucl,
                     alphabet=codonalign.default_codon_alphabet,
                     max_score=20)
             # Release the indexed FASTA file once the alignment is built.
             nucl.close()
         elif mode == "id":
             paths = entry[0]
             nucl = SeqIO.parse(paths[0], "fasta",
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], "clustal", alphabet=IUPAC.protein)
             # Third file: whitespace-separated pairs, first two columns
             # of each line become key and value of the corr_dict.
             corr = {}
             with open(paths[2]) as handle:
                 for line in handle:
                     fields = line.split()
                     corr[fields[0]] = fields[1]
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 caln = codonalign.build(
                     prot, nucl, corr_dict=corr,
                     alphabet=codonalign.default_codon_alphabet)
         built.append(caln)
     self.alns = built
Ejemplo n.º 2
0
def main():
    """Scan sequences with every loaded PWM and print the hits as TSV.

    With ``opts.testseq`` set, only that single sequence is scanned.
    Otherwise every record of the FASTA file named by ``args[0]`` is
    scanned and one table is printed per record.  RNA input
    (``opts.seqtype == 'RNA'``) is back-transcribed to DNA before scanning.
    Progress and timing messages go to stderr.
    """
    (opts, args) = getoptions()

    # Load PWMs
    pssms = load_motifs(opts.pwm_dir, opts.pseudocount)

    if opts.testseq is not None:
        if opts.seqtype == 'RNA':
            seq = Seq(opts.testseq,
                      IUPAC.IUPACUnambiguousRNA()).back_transcribe()
            seq.alphabet = IUPAC.IUPACUnambiguousDNA()
        else:
            seq = Seq(opts.testseq, IUPAC.IUPACUnambiguousDNA())
        final = scan_all(pssms, seq, opts)
        # print() with a single argument behaves the same on Python 2 and 3.
        print(final.to_csv(sep="\t", index=False))
    else:
        # Scan in sequence
        sys.stderr.write("Scanning sequences ")
        tic = time.time()
        # The context manager closes the FASTA file even if a scan fails.
        with open(args[0]) as handle:
            for seqrecord in SeqIO.parse(handle, "fasta"):
                seq = seqrecord.seq
                if opts.seqtype == "RNA":
                    seq = seq.back_transcribe()
                seq.alphabet = IUPAC.IUPACUnambiguousDNA()

                final = scan_all(pssms, seq, opts)
                print(final.to_csv(sep="\t", index=False))

        toc = time.time()
        sys.stderr.write("done in %0.2f seconds!\n" % (float(toc - tic)))
Ejemplo n.º 3
0
def get_seq_record(record, start, stop, description):
    """Return a SeqRecord for the query bases aligned between start and stop.

    Given a sam record, look up the query positions aligned to the reference
    positions ``start`` and ``stop`` and build a SeqRecord from the query
    slice between them, in the reference orientation.

    Args:
        record: aligned read providing ``get_aligned_pairs``,
            ``get_forward_sequence``, ``is_reverse``, ``query_length`` and
            ``query_name``.
        start: reference position of the first base.
        stop: reference position marking the end of the slice (the slice is
            half-open, as with normal Python slicing).
        description: text placed before the orientation tag in the record
            description.

    Returns:
        A SeqRecord whose description is ``"<description>|f"`` for forward
        reads or ``"<description>|rc"`` for reverse reads, or ``None`` when
        either endpoint is not aligned to a query base.
    """
    # get the query positions of the bases mapped to start, stop
    # currently only extracts from reads where both endpoints are mapped
    positions = record.get_aligned_pairs(matches_only=True)
    first_position = [item[0] for item in positions if item[1] == start]
    last_position = [item[0] for item in positions if item[1] == stop]
    if not (first_position and last_position):
        return None

    first, last = first_position[0], last_position[0]
    # fetch and reorient sequence
    if not record.is_reverse:
        seq = Seq(record.get_forward_sequence()[first:last],
                  IUPAC.IUPACUnambiguousDNA())
        direction = 'f'
    else:
        # The forward sequence is in read orientation, so the aligned
        # coordinates are mirrored before slicing and the slice is then
        # reverse-complemented.
        length = record.query_length
        seq = Seq(record.get_forward_sequence()[length - last:length - first],
                  IUPAC.IUPACUnambiguousDNA()).reverse_complement()
        direction = 'rc'
    # Use the tag itself: indexing the string with the boolean would turn
    # 'rc' into 'c'.
    return SeqRecord(seq,
                     id=record.query_name,
                     description='|'.join([description, direction]))
Ejemplo n.º 4
0
    def chgAlpha(self, newAlpha):
        """Re-type ``self.seq`` with a new alphabet.

        Accepts 'DNA', 'RNA' or 'protein', or an alphabet object.  The
        chosen alphabet is stored in ``self.typ``, ``self.seq`` is rebuilt
        with it and ``self.checkAlpha()`` is called afterwards.

        Raises:
            NameError: if ``newAlpha`` is ``None`` or a string other than
                'DNA', 'RNA' or 'protein'.
        """
        # Validate first so bad input fails before anything else happens.
        if newAlpha is None or (
            isinstance(newAlpha, str)
            and newAlpha not in ("DNA", "RNA", "protein")
        ):
            raise NameError("type not 'DNA', 'RNA', or 'protein'")

        from Bio.Seq import Seq
        from Bio.Alphabet import IUPAC

        if isinstance(newAlpha, str):
            if newAlpha == "DNA":
                alpha = IUPAC.IUPACUnambiguousDNA()
            elif newAlpha == "RNA":
                # RNA gets the RNA alphabet (previously mapped to DNA).
                alpha = IUPAC.IUPACUnambiguousRNA()
            else:
                alpha = IUPAC.IUPACProtein()
        else:
            # Anything that is not a string is taken to be an alphabet object.
            alpha = newAlpha
        self.typ = alpha

        # str() works on every Biopython version; Seq.tostring() was removed.
        self.seq = Seq(str(self.seq), alpha)

        self.checkAlpha()
Ejemplo n.º 5
0
 def setUp(self):
     """Build one codon alignment per test data set.

     Every ``TEST_ALIGN_FILE*`` entry is read as ``entry[0]`` (file paths)
     and ``entry[1]`` (a mode string: 'parse', 'index' or 'id').  The
     alignments are collected in ``self.alns``; ``close()`` is called on the
     nucleotide source after each alignment has been stored.
     """
     self.aln_file = [
         TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
         TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6,
     ]
     collected = []
     for entry in self.aln_file:
         mode = entry[1]
         if mode == 'parse':
             paths = entry[0]
             nucl = SeqIO.parse(paths[0], 'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(
                     prot, nucl,
                     alphabet=codonalign.default_codon_alphabet)
         elif mode == 'index':
             paths = entry[0]
             nucl = SeqIO.index(paths[0], 'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             # Deliberately a richer protein alphabet (gapped, extended
             # IUPAC) than in the other branches.
             fancy_protein = Gapped(IUPAC.ExtendedIUPACProtein())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=fancy_protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(
                     prot, nucl,
                     alphabet=codonalign.default_codon_alphabet,
                     max_score=20)
         elif mode == 'id':
             paths = entry[0]
             nucl = SeqIO.parse(paths[0], 'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
             # First two whitespace-separated columns of each line form
             # the corr_dict key and value.
             corr = {}
             with open(paths[2]) as handle:
                 for line in handle:
                     fields = line.split()
                     corr[fields[0]] = fields[1]
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(
                     prot, nucl, corr_dict=corr,
                     alphabet=codonalign.default_codon_alphabet)
         collected.append(caln)
         # Runs for every mode, not only for the indexed FASTA file.
         nucl.close()
     self.alns = collected
Ejemplo n.º 6
0
 def setUp(self):
     """Create two protein alignments with their matching nucleotide records.

     Stores ``self.aln1`` / ``self.seqlist1`` (test set 1) and
     ``self.aln2`` / ``self.seqlist2`` (test set 2).
     """
     def dna_record(letters, name):
         # Nucleotide record typed with a fresh unambiguous DNA alphabet.
         return SeqRecord(Seq(letters, alphabet=IUPAC.IUPACUnambiguousDNA()),
                          id=name)

     def protein_record(letters, name):
         # Protein record typed with the shared IUPAC protein alphabet.
         return SeqRecord(Seq(letters, alphabet=IUPAC.protein), id=name)

     # Test set 1
     seq1 = dna_record(
         'TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG',
         'pro1')
     seq2 = dna_record(
         'TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG',
         'pro2')
     pro1 = protein_record('SGTARTKLLLLLAALCAAGGALE', 'pro1')
     pro2 = protein_record('SGTSRTKRLLLLAALGAAGGALE', 'pro2')
     self.aln1 = MultipleSeqAlignment([pro1, pro2])
     self.seqlist1 = [seq1, seq2]

     # Test set 2
     # Codon-by-codon translation of the start of seq3:
     #                      M  K  K  H  E L(F)L  C  Q  G  T  S  N  K  L  T  Q(L)L  G  T  F  E  D  H  F  L  S  L  Q  R  M  F  N  N  C  E  V  V
     # NOTE(review): seq4 and seq5 are seq3 with a few bases removed
     # (presumably frameshift test data) -- confirm against the test intent.
     seq3 = dna_record(
         'ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
         'pro1')
     seq4 = dna_record(
         'ATGAAAAAGCACGAGTTCTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAATGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
         'pro2')
     seq5 = dna_record(
         'ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
         'pro3')
     pro3 = protein_record(
         'MKKHELLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
         'pro1')
     pro4 = protein_record(
         'MKKHEFLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
         'pro2')
     pro5 = protein_record(
         'MKKHELLCQGTSNKLTLLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
         'pro3')
     self.aln2 = MultipleSeqAlignment([pro3, pro4, pro5])
     self.seqlist2 = [seq3, seq4, seq5]
Ejemplo n.º 7
0
def sequence(ungapped, position, length):
    """
    Return ``length`` nucleotides of ``ungapped.seq`` starting at ``position``.

    Given an ungapped sequence record and a positive or negative number
    (position), return the nucleotide at that position plus the following
    nucleotides in the positive direction, ``length`` in total (fewer when
    the sequence ends first).

    A non-negative position returns the slice as a plain string.  A negative
    position counts from the end of the sequence; that slice is passed
    through ``getNegative`` before being returned.
    """
    bases = str(ungapped.seq)
    end = position + length
    if position >= 0:
        return bases[position:end]
    # With a negative start, a window that reaches the end of the sequence
    # gives an end index >= 0; used literally it would produce an empty or
    # truncated slice, so such windows run to the end of the sequence.
    window = bases[position:end] if end < 0 else bases[position:]
    return getNegative(window)
Ejemplo n.º 8
0
 def setUp(self):
     """Build the codon alignment for TEST_ALIGN_FILE6 and keep it in ``self.aln``.

     The third file of the data set supplies the ``corr_dict`` passed to
     ``CodonAlign.build``: the first two whitespace-separated columns of
     each line become key and value.
     """
     paths = TEST_ALIGN_FILE6[0]
     nucl = SeqIO.parse(paths[0], 'fasta',
                        alphabet=IUPAC.IUPACUnambiguousDNA())
     prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
     id_corr = {}
     with open(paths[2]) as handle:
         for line in handle:
             fields = line.split()
             id_corr[fields[0]] = fields[1]
     self.aln = CodonAlign.build(
         prot, nucl,
         corr_dict=id_corr,
         alphabet=CodonAlign.default_codon_alphabet)
Ejemplo n.º 9
0
 def gb(self):
     """Return this object rendered as a GenBank-formatted string.

     The record id and name are the first eight characters of
     ``self.name``.  When ``self.shape == 'c'`` and a feature ends past the
     sequence length, the feature is emitted as a two-part compound
     location split at the end of the sequence.
     """
     total = self.length()
     record = SeqRecord(
         Seq(self.sequence(), IUPAC.IUPACUnambiguousDNA()),
         id=self.name[0:8],
         name=self.name[0:8],
         description=self.description,
     )
     record.features = []
     for feat in self.features():
         strand = 1 if feat.direction == 'f' else -1
         if self.shape == 'c' and feat.end > total:
             # Part up to the end of the sequence, and the part that
             # continues from position 0.
             to_end = FeatureLocation(
                 ExactPosition(feat.start), ExactPosition(total), strand)
             from_zero = FeatureLocation(
                 ExactPosition(0), ExactPosition(feat.end - total), strand)
             # Part order depends on the strand.
             parts = [to_end, from_zero] if strand == 1 else [from_zero, to_end]
             location = CompoundLocation(parts)
         else:
             location = FeatureLocation(
                 ExactPosition(feat.start), ExactPosition(feat.end), strand)
         qualifiers = {q.name: q.data for q in feat.qualifiers.all()}
         record.features.append(
             SeqFeature(location, feat.type, qualifiers=qualifiers))
     return record.format('genbank')
Ejemplo n.º 10
0
    def _guess_consensus_alphabet(self, ambiguous):
        """Pick an (ungapped) alphabet for an alignment consensus sequence (PRIVATE).

        Starts from the ungapped alignment alphabet, checks that every
        sequence's ungapped alphabet is an instance of the same class, and
        then makes sure the ``ambiguous`` character can be represented.  If
        it is not among the alphabet's letters, a broader alphabet is
        returned: the ambiguous/extended IUPAC alphabet when it contains the
        character, otherwise the generic alphabet of the same molecule type.

        Raises:
            ValueError: if a sequence has an incompatible alphabet.
        """
        # Start with the (un-gapped version of) the alignment alphabet
        a = Alphabet._get_base_alphabet(self.alignment._alphabet)

        # Now check it is compatible with all the rest of the sequences
        for record in self.alignment:
            # Get the (un-gapped version of) the sequence's alphabet
            alt = Alphabet._get_base_alphabet(record.seq.alphabet)
            if not isinstance(alt, a.__class__):
                raise ValueError(
                    "Alignment contains a sequence with an incompatible alphabet."
                )

        # Check the ambiguous character we are going to use in the consensus
        # is in the alphabet's list of valid letters (if defined).
        if (
            hasattr(a, "letters")
            and a.letters is not None
            and ambiguous not in a.letters
        ):
            # We'll need to pick a more generic alphabet.  The candidates
            # are the *ambiguous* IUPAC alphabets: re-testing the
            # unambiguous one that just failed could never succeed.
            if isinstance(a, IUPAC.IUPACUnambiguousDNA):
                if ambiguous in IUPAC.IUPACAmbiguousDNA().letters:
                    a = IUPAC.IUPACAmbiguousDNA()
                else:
                    a = Alphabet.generic_dna
            elif isinstance(a, IUPAC.IUPACUnambiguousRNA):
                if ambiguous in IUPAC.IUPACAmbiguousRNA().letters:
                    a = IUPAC.IUPACAmbiguousRNA()
                else:
                    a = Alphabet.generic_rna
            elif isinstance(a, IUPAC.IUPACProtein):
                if ambiguous in IUPAC.ExtendedIUPACProtein().letters:
                    a = IUPAC.ExtendedIUPACProtein()
                else:
                    a = Alphabet.generic_protein
            else:
                a = Alphabet.single_letter_alphabet
        return a
Ejemplo n.º 11
0
 def setUp(self):
     """Build the codon alignment for TEST_ALIGN_FILE6 and keep it in ``self.aln``.

     The third file of the data set supplies the ``corr_dict`` (first two
     whitespace-separated columns of each line).  BiopythonWarning is
     silenced while the alignment is built.
     """
     paths = TEST_ALIGN_FILE6[0]
     nucl = SeqIO.parse(paths[0], 'fasta',
                        alphabet=IUPAC.IUPACUnambiguousDNA())
     prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
     id_corr = {}
     with open(paths[2]) as handle:
         for line in handle:
             fields = line.split()
             id_corr[fields[0]] = fields[1]
     with warnings.catch_warnings():
         warnings.simplefilter('ignore', BiopythonWarning)
         self.aln = codonalign.build(
             prot, nucl,
             corr_dict=id_corr,
             alphabet=codonalign.default_codon_alphabet)
Ejemplo n.º 12
0
 def setUp(self):
     """Build one codon alignment per test data set and store them in ``self.alns``.

     Every ``TEST_ALIGN_FILE*`` entry is read as ``entry[0]`` (file paths)
     and ``entry[1]`` (a mode string: 'parse', 'index' or 'id').

     Raises:
         ValueError: if an entry carries an unknown mode string.
     """
     self.aln_file = [
         TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
         TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6
     ]
     alns = []
     for entry in self.aln_file:
         paths, mode = entry[0], entry[1]
         if mode == 'parse':
             nucl = SeqIO.parse(paths[0],
                                'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = CodonAlign.build(
                     prot, nucl, alphabet=CodonAlign.default_codon_alphabet)
         elif mode == 'index':
             nucl = SeqIO.index(paths[0],
                                'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
             try:
                 with warnings.catch_warnings():
                     warnings.simplefilter('ignore')
                     caln = CodonAlign.build(
                         prot,
                         nucl,
                         alphabet=CodonAlign.default_codon_alphabet,
                         max_score=20)
             finally:
                 # Always release the indexed FASTA file.
                 nucl.close()
         elif mode == 'id':
             nucl = SeqIO.parse(paths[0],
                                'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
             # Read the id mapping inside a context manager so the file
             # handle is closed; first two columns are key and value.
             corr_dict = {}
             with open(paths[2]) as handle:
                 for line in handle:
                     fields = line.split()
                     corr_dict[fields[0]] = fields[1]
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = CodonAlign.build(
                     prot,
                     nucl,
                     corr_dict=corr_dict,
                     alphabet=CodonAlign.default_codon_alphabet)
         else:
             # Fail loudly instead of silently re-using the previous
             # iteration's alignment.
             raise ValueError("Unknown alignment test mode: %r" % (mode,))
         alns.append(caln)
     self.alns = alns
Ejemplo n.º 13
0
def cast_sequence(ungapped_sequence):
    """
    Re-create every record's sequence as an unambiguous-DNA Seq.

    ungapped_sequence: an iterable of records, each exposing a ``seq``
    attribute.

    Returns a list with one ``Seq`` per input record, in input order, built
    from the string form of the record's sequence and typed with
    ``IUPAC.IUPACUnambiguousDNA``.
    """
    return [
        Seq(str(entry.seq), IUPAC.IUPACUnambiguousDNA())
        for entry in ungapped_sequence
    ]
Ejemplo n.º 14
0
	def gb(self):
		"""Return this object rendered as a GenBank-formatted string.

		The record id and name are the first eight characters of
		``self.name``; every entry of ``self.features()`` becomes a
		SeqFeature carrying its qualifiers as a name -> data mapping.
		"""
		record = SeqRecord(
			Seq(self.sequence(), IUPAC.IUPACUnambiguousDNA()),
			id=self.name[0:8],
			name=self.name[0:8],
			description=self.description,
		)
		converted = []
		for feat in self.features():
			# NOTE(review): start is shifted by -1, presumably converting a
			# 1-based stored start to a 0-based location -- confirm.
			location = FeatureLocation(
				ExactPosition(feat.start - 1), ExactPosition(feat.end))
			qualifiers = {q.name: q.data for q in feat.qualifiers.all()}
			converted.append(
				SeqFeature(location, feat.type, qualifiers=qualifiers))
		record.features = converted
		return record.format('genbank')
Ejemplo n.º 15
0
 def pwm_scan(self, left=0, right=0):
     """Search the motif PSSM in the genome sequence around every peak.

     Prints the motif consensus, PSSM and motif first, then one summary
     line per peak (gene, peak enrichment, number of matches), followed by
     the match list when there is at least one match.

     Args:
         left: number of bases to include before each peak start.
         right: number of bases to include after each peak end.
     """
     records = SeqIO.index(self.genome, 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
     try:
         print(self.motif.consensus)
         print(self.motif.pssm)
         print(self.motif)
         # Look the PSSM up once instead of once per peak.
         pssm = self.motif.pssm
         for gene, peaks in self._peaks.items():
             for peak in peaks:
                 chrom = str(peak.chrom).replace('chr', '')
                 # Clamp at 0: a negative start would be interpreted as an
                 # offset from the end of the chromosome.
                 start = max(0, int(peak.start) - left)
                 end = int(peak.end) + right
                 seq = records[chrom][start:end].seq
                 matches = list(pssm.search(seq))
                 print("Gene: " + str(gene) + ", height: " + str(peak.enrichment) + ", sites: " + str(len(matches)))
                 if matches:
                     print(matches)
     finally:
         # Release the indexed genome file even if a lookup fails.
         records.close()
Ejemplo n.º 16
0
 def test_mk(self):
     """Check the value of ``codonalign.mktest`` on TEST_ALIGN_FILE7.

     The codon alignment is split into three groups of rows (1-11, 12-15
     and 16 onwards) and the result must equal 0.0021 to four places.
     """
     paths = TEST_ALIGN_FILE7[0]
     indexed = SeqIO.index(paths[0], "fasta",
                           alphabet=IUPAC.IUPACUnambiguousDNA())
     pro_aln = AlignIO.read(paths[1], "clustal", alphabet=IUPAC.protein)
     codon_aln = codonalign.build(pro_aln, indexed)
     # The indexed FASTA file is no longer needed once the alignment exists.
     indexed.close()
     groups = [codon_aln[1:12], codon_aln[12:16], codon_aln[16:]]
     result = codonalign.mktest(groups)
     self.assertAlmostEqual(result, 0.0021, places=4)
Ejemplo n.º 17
0
def tntable(table=11):
    """Translate all 64 codons with the given translation table.

    Args:
        table: translation table identifier passed to ``Seq.translate``.

    Returns:
        A pair ``(_table, translation)``: ``_table`` maps each upper-cased
        translation result to a dict keyed by the codons that produce it
        (each value is an empty dict), and ``translation`` maps each codon
        to its upper-cased translation result.
    """
    _table = {}
    translation = {}
    for bases in itertools.product('ACTG', repeat=3):
        codon = ''.join(bases)
        aa = str(Seq(
            codon, IUPAC.IUPACUnambiguousDNA()).translate(table=table)).upper()
        translation[codon] = aa
        # setdefault creates the per-residue dict on first use; no
        # exception handling is needed for initialisation.
        _table.setdefault(aa, {})[codon] = {}
    return _table, translation
Ejemplo n.º 18
0
    def setUp(self):
        """Create three protein alignments with their matching nucleotide records.

        Stores ``self.aln1``..``self.aln3`` and ``self.seqlist1``..
        ``self.seqlist3``; test set 3 additionally stores the codon table
        with id 3 in ``self.codontable3``.
        """
        def dna_record(letters, name):
            # Nucleotide record typed with a fresh unambiguous DNA alphabet.
            return SeqRecord(
                Seq(letters, alphabet=IUPAC.IUPACUnambiguousDNA()), id=name)

        def protein_record(letters, name):
            # Protein record typed with the shared IUPAC protein alphabet.
            return SeqRecord(Seq(letters, alphabet=IUPAC.protein), id=name)

        # Test set 1
        seq1 = dna_record(
            'TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG',
            'pro1')
        seq2 = dna_record(
            'TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG',
            'pro2')
        pro1 = protein_record('SGTARTKLLLLLAALCAAGGALE', 'pro1')
        pro2 = protein_record('SGTSRTKRLLLLAALGAAGGALE', 'pro2')
        self.aln1 = MultipleSeqAlignment([pro1, pro2])
        self.seqlist1 = [seq1, seq2]

        # Test set 2
        # Codon-by-codon translation of the start of seq3:
        #                      M  K  K  H  E L(F)L  C  Q  G  T  S  N  K  L  T  Q(L)L  G  T  F  E  D  H  F  L  S  L  Q  R  M  F  N  N  C  E  V  V
        # NOTE(review): seq4 and seq5 are seq3 with a few bases removed
        # (presumably frameshift test data) -- confirm against the test intent.
        seq3 = dna_record(
            'ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
            'pro1')
        seq4 = dna_record(
            'ATGAAAAAGCACGAGTTCTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAATGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
            'pro2')
        seq5 = dna_record(
            'ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
            'pro3')
        pro3 = protein_record(
            'MKKHELLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
            'pro1')
        pro4 = protein_record(
            'MKKHEFLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
            'pro2')
        pro5 = protein_record(
            'MKKHELLCQGTSNKLTLLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
            'pro3')
        self.aln2 = MultipleSeqAlignment([pro3, pro4, pro5])
        self.seqlist2 = [seq3, seq4, seq5]

        # Test set 3
        # use Yeast mitochondrial codon table
        seq6 = dna_record(
            'ATGGCAAGGGACCACCCAGTTGGGCACTGATATGATCGGGTGTATTTGCAGAGTAGTAACCTTTCTTTTCTCAAGACCATCCAG',
            'pro6')
        seq7 = dna_record(
            'ATGGCAAGGCACCATCCAGTTGAGCACTGATATGATCGGGTGTATTTGCAGAGTAGTAACGTGTCTCTGCTCAAGACCATCCAG',
            'pro7')
        seq8 = dna_record(
            'ATGGCAGGGGACCACCCAGTTGGGCACTGATATGATCGTGTGTATCTGCAGAGTAGTAACCACTCTTTTCTCATGACCATCCAG',
            'pro8')
        pro6 = protein_record('MARDHPVGHWYDRVYLQSSNTSFTKTIQ', 'pro6')
        pro7 = protein_record('MARHHPVEHWYDRVYLQSSNVSTTKTIQ', 'pro7')
        pro8 = protein_record('MAGDHPVGHWYDRVYTQSSNHSFTMTIQ', 'pro8')
        self.aln3 = MultipleSeqAlignment([pro6, pro7, pro8])
        self.seqlist3 = [seq6, seq7, seq8]
        self.codontable3 = CodonTable.unambiguous_dna_by_id[3]
def get_seq_record(record, start, stop, description):
    """Return a SeqRecord (with qualities) for the query between start and stop.

    Given a sam record, look up the query positions aligned to the reference
    positions ``start`` and ``stop`` and build a SeqRecord from the forward
    sequence slice between them.  The matching slice of the forward
    qualities is attached as the ``phred_quality`` letter annotation.

    Returns None when either endpoint is not aligned to a query base.
    """
    # Query positions of the bases mapped to start and stop.
    # TODO: handle cases where no base is mapped (None)
    pairs = record.get_aligned_pairs(matches_only=True)
    first_position = [pair[0] for pair in pairs if pair[1] == start]
    last_position = [pair[0] for pair in pairs if pair[1] == stop]
    if not (first_position and last_position):
        return None

    name = record.query_name
    lo, hi = first_position[0], last_position[0]
    if record.is_reverse:
        # Mirror the coordinates for reverse reads before slicing the
        # forward sequence and qualities.
        length = record.query_length
        lo, hi = length - hi, length - lo
    seq = Seq(record.get_forward_sequence()[lo:hi],
              IUPAC.IUPACUnambiguousDNA())
    qual = list(record.get_forward_qualities())[lo:hi]

    # NOTE(review): `direction` is not defined in this function; presumably
    # a module-level sequence indexed by the is_reverse flag -- confirm.
    tag = direction[record.is_reverse]
    rec = SeqRecord(seq, id=name, description='|'.join([description, tag]))
    rec.letter_annotations['phred_quality'] = qual
    return rec
Ejemplo n.º 20
0
def _translate_ungapped(dna):
    """Translate ``dna`` (gap characters removed) up to the first stop codon."""
    ungapped = Seq.Seq(str(dna).replace("-", ""),
                       alphabet=IUPAC.IUPACUnambiguousDNA())
    return ungapped.translate(to_stop=True)


def filter_genes(input_fasta, reference_hash):
    """Write the names of genes that fail the translation checks.

    For every entry of ``input_fasta`` the upper-cased sequence is expanded
    with ``extend_ambiguous_dna`` and each resulting option is translated up
    to the first stop codon.  A gene is flagged when

    * the reference translation length plus one differs from a third of the
      reference length,
    * an option's translation length plus one differs from a third of the
      option's length, or
    * an option's translation length is <= 90% of the reference translation
      length, or 90% of it is >= the reference translation length.

    Flagged names are written, one per line and sorted, to
    ``input_fasta + ".filt_genes"``.  If the expansion of a gene fails, the
    gene sequence is printed and the process exits with status 1.

    Args:
        input_fasta: path of the FASTA file to check.
        reference_hash: mapping from gene name to reference sequence.
    """
    bad_genes = set()
    in_fasta = pyfasta.Fasta(input_fasta)
    for keys in in_fasta.keys():
        tmp_gene = str(in_fasta[keys]).upper()
        try:
            tmp_gene = analysis_functions_introgressions.extend_ambiguous_dna(
                tmp_gene)
        except Exception:
            # Show the offending sequence, then abort as before.
            print(tmp_gene)
            sys.exit(1)
        ref_gene = reference_hash[keys]
        ref_gene_translate = _translate_ungapped(ref_gene)
        if (len(ref_gene_translate) + 1) != len(ref_gene) / 3:
            bad_genes.add(keys)
        for options in tmp_gene:
            tmp_gene_translate = _translate_ungapped(options)
            # The former "strict" and non-strict branches performed the
            # same comparison, so a single check covers both.
            if (len(tmp_gene_translate) + 1) != len(options) / 3:
                bad_genes.add(keys)
            if (len(tmp_gene_translate) <= len(ref_gene_translate) * .9 or
                    len(tmp_gene_translate) * .9 >= len(ref_gene_translate)):
                bad_genes.add(keys)
    with open(input_fasta + ".filt_genes", "w") as output_filt:
        # Sorted so the output file is reproducible between runs.
        for bad_gene in sorted(bad_genes):
            output_filt.write(bad_gene + "\n")
Ejemplo n.º 21
0
def pwm2pssm(file, pseudocount):
    """
    Load a position weight matrix from a table file and convert it to a
    PSSM (log-odds).

    The table is read with ``pandas.read_table``; a ``U`` column is renamed
    to ``T`` and the ``Pos`` column is dropped before the remaining columns
    are used as motif counts.

    Args:
        file: path (or file-like object) of the PWM table.
        pseudocount: pseudocount passed to ``counts.normalize``.

    Returns:
        The log-odds matrix of the normalized counts.
    """
    table = pd.read_table(file)
    # Assuming we are doing RNA motif scanning. Need to replace U with T
    # as Biopython's motif scanner only does DNA
    table.rename(columns={'U': 'T'}, inplace=True)
    # `axis` must be given by keyword: pandas 2.x rejects the positional form.
    counts = table.drop("Pos", axis=1).to_dict(orient='list')
    motif = motifs.Motif(alphabet=IUPAC.IUPACUnambiguousDNA(), counts=counts)
    normalized = motif.counts.normalize(pseudocount)

    # Can optionally add background, but for now assuming uniform probability.
    # NOTE: the values are returned exactly as log_odds() computes them; a
    # previously planned replacement of -inf entries is not performed.
    return normalized.log_odds()
Ejemplo n.º 22
0
def get_sequences_from_fasta(fasta_string, limit=None, return_record=False, dna_alphabet=False):
    """Parse FASTA text and return the sequences it contains.

    Args:
        fasta_string: the contents of the FASTA file
        limit: maximum number of sequences to return; a falsy value means
            no limit
        return_record: when true, return the parsed record objects instead
            of ``(id, sequence)`` string tuples
        dna_alphabet: whether to force the usage of the DNA alphabet
            (IUPAC.IUPACUnambiguousDNA) while parsing

    Returns:
        a list of sequences, in the requested format

    Raises:
        Exception: if an upper-cased sequence contains a character other
            than A, C, G, T or U
    """
    allowed = ('A', 'C', 'G', 'T', 'U')
    handle = StringIO(fasta_string)
    if dna_alphabet:
        records = SeqIO.parse(handle, 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
    else:
        records = SeqIO.parse(handle, 'fasta')

    collected = []
    for count, entry in enumerate(records, 1):
        # Upper-cased sequence text and identifier of the record.
        letters = str(entry.seq).upper()
        identifier = str(entry.id)

        # Reject anything that is not a plain nucleotide letter.
        for base in letters:
            if base not in allowed:
                raise Exception('A sequence contains invalid character: %s' % base)

        collected.append(entry if return_record else (identifier, letters))
        # Stop once the requested number of sequences has been collected.
        if limit and count >= limit:
            break

    return collected
Ejemplo n.º 23
0
 def _load_pwms(self):
     """Loads and returns position weight matrices.

     Reads every ``*.txt`` file in ``data/cisbp_rna/pwms`` next to this
     module.  Files that cannot be turned into a matrix are skipped.

     Returns:
         a dictionary of pwms, where the key is the CISBP id code
         (the file name without its extension)
     """
     pwms = {}
     dir_path = os.path.dirname(os.path.realpath(__file__))
     pattern = os.path.join(dir_path, "data", "cisbp_rna", "pwms", "*.txt")
     for file in glob.glob(pattern):
         pwm_id = os.path.splitext(os.path.basename(file))[0]
         try:
             pwm = pd.read_csv(file, sep="\t", header=0, index_col=0)
             # biopython can only handle DNA motifs so we replace U with T
             pwm.rename(columns={"U": "T"}, inplace=True)
             pwm = motifs.Motif(alphabet=IUPAC.IUPACUnambiguousDNA(), counts=pwm.to_dict(orient="list"))
             pwm = pwm.counts.normalize(pseudocounts=0.00001)
         except Exception:
             # some pwm files are empty - we skip these.  Catching Exception
             # (not a bare except) lets KeyboardInterrupt/SystemExit through.
             continue
         pwms[pwm_id] = pwm
     return pwms
Ejemplo n.º 24
0
 def test_mk(self):
     """Check the value of ``CodonAlign.mktest`` on TEST_ALIGN_FILE7.

     The check is not run (a warning is issued instead) on Python 2.6 or
     when numpy is not installed.
     """
     ver = sys.version_info
     if ver[0] == 2 and ver[1] == 6:
         warnings.warn('Python 2.6 detected. Skip testing MK method')
         return
     from run_tests import is_numpy
     if not is_numpy():
         warnings.warn('Numpy not installed. Skip MK test.')
         return
     p = SeqIO.index(TEST_ALIGN_FILE7[0][0],
                     'fasta',
                     alphabet=IUPAC.IUPACUnambiguousDNA())
     pro_aln = AlignIO.read(TEST_ALIGN_FILE7[0][1],
                            'clustal',
                            alphabet=IUPAC.protein)
     codon_aln = CodonAlign.build(pro_aln, p)
     p.close()  # Close indexed FASTA file
     # assertAlmostEqual: the assertAlmostEquals alias was removed in
     # Python 3.12.
     self.assertAlmostEqual(round(
         CodonAlign.mktest(
             [codon_aln[1:12], codon_aln[12:16], codon_aln[16:]]),
         4),
                            0.0021,
                            places=4)
Ejemplo n.º 25
0
    def read_tag_file(self, tag_file_name):
        """Process the tags file.

        Process the tag file, which has the format: TagName ForwardTag.
        Each non-empty line adds one entry to ``self._tag_dict`` mapping the
        tag name to the forward tag as an unambiguous-DNA ``Seq``.

        Parameters
        ----------
        tag_file_name : string
            input tag file name - one sample per line, with format:
            TagName ForwardTag

        Raises
        ------
        IOError
            when a line does not have exactly two fields, or when a tag
            name is repeated.

        """
        # The context manager closes the file even when a malformed line
        # raises IOError part-way through.
        with open(tag_file_name) as tag_file:
            for line in tag_file:
                line = line.strip()
                if len(line) == 0:
                    continue
                tokens = line.split()
                if len(tokens) != 2:
                    self.logger.error("Line does not have the correct format.")
                    raise IOError("Line:" + line +
                                  "\ndoes not have the correct format.")
                if tokens[0] in self._tag_dict:
                    self.logger.error("Repeat tag name: " + tokens[0])
                    raise IOError(tokens[0] + " already present in file.")
                forwardTag = Seq(tokens[1], IUPAC.IUPACUnambiguousDNA())
                self._tag_dict[tokens[0]] = forwardTag
        # Single-line message: the former backslash continuation embedded
        # the source indentation in the logged text.
        self.logger.info("Read " + str(len(self._tag_dict)) +
                         " valid tag combinations.")
Ejemplo n.º 26
0
 def __init__(self, _number, _seq):
     """Store the number, the sequence and its complement.

     ``_seq`` is wrapped in an unambiguous-DNA ``Seq``; ``self.rseq`` holds
     its complement (not the reverse complement) and ``self.features``
     starts out as an empty list.
     """
     self.number = _number
     forward = Seq(_seq, IUPAC.IUPACUnambiguousDNA())
     self.seq = forward
     self.rseq = forward.complement()
     self.features = []
Ejemplo n.º 27
0
# Base names of the two command-line inputs (alignment file, motif file);
# used later when marking the output file.
alignment_file_name =  os.path.basename(sys.argv[1])
motif_file_name =  os.path.basename(sys.argv[2])

print ("alignment file: " + alignment_file_name)
print ("motif file: " + motif_file_name)

# Strip the "-" gap characters from every aligned record, keeping its id.
# NOTE(review): `alignment` is defined outside this fragment.
raw_sequences = []
for record in alignment:
    raw_sequences.append(SeqRecord(record.seq.ungap("-"), id = record.id))

## Re-create each ungapped sequence as a Seq typed with
## IUPAC.IUPACUnambiguousDNA()
raw_sequences_2 = []
for seq in raw_sequences:
    raw_sequences_2.append(Seq(str(seq.seq), IUPAC.IUPACUnambiguousDNA()))

#####################
## Motifs
#####################

# NOTE(review): `motif` is defined outside this fragment.
pwm = motif.counts.normalize(pseudocounts=0.0) # Zero pseudocounts: plain normalization of the counts
pssm = pwm.log_odds()
motif_length = len(motif) # for later retrieval of the nucleotide sequence

######################
## Searching for Motifs in Sequences
######################

## Returns a list of arrays with a score for each position
## This gives the score for each position
Ejemplo n.º 28
0
    seqs = []
    header = None
    for seq_record in SeqIO.parse(fastafile, "fasta"):
        seq_record.seq.alphabet = IUPAC.unambiguous_dna
        seqs.append(seq_record)

    return seqs


# Load the sequences and the table of coding regions.  The "coding" column
# is parsed with literal_eval and rows are indexed by the "chr" column.
fasta = ReadFASTA(args.file[0])
regions = pd.read_csv(
    args.csv[0], converters={"coding": literal_eval}, index_col="chr"
)  # the chromosomes are the indexes here - that's fine they should be unique

# Cycle through the FASTA records and build, for each one, a copy whose
# sequence is the concatenation of its coding regions.
OR = []
for gene in fasta:
    seg = gene.id
    coding_regions = regions.loc[seg, "coding"]
    open_reading = copy.deepcopy(gene)
    open_reading.seq = Seq('', alphabet=IUPAC.IUPACUnambiguousDNA())
    for splice in coding_regions:
        # Shift the start by -1 for Python numbering and slicing.
        open_reading.seq = open_reading.seq + gene.seq[
            splice[0] - 1:splice[1]]

    OR.append(open_reading)

# The context manager closes the output file even if writing fails.
with open(args.out_fa[0], "w") as output_handle:
    SeqIO.write(OR, output_handle, "fasta")
def _strip_gap_columns(first, second):
    """Return both sequences as strings, dropping columns where either has "-"."""
    kept = [(a, b) for a, b in zip(first, second) if a != "-" and b != "-"]
    return "".join(a for a, _ in kept), "".join(b for _, b in kept)


def _count_mismatches(first, second):
    """Count the positions at which the two (zip-truncated) sequences differ."""
    return sum(1 for a, b in zip(first, second) if a != b)


def _print_row(*fields):
    """Print the fields on one line of stdout, separated by " ; "."""
    print(" ; ".join(str(field) for field in fields))


def divergence():
    """Print divergence statistics for paired DNA/protein FASTA files.

    Command-line arguments (read from ``sys.argv``):
        1, 2  DNA FASTA files of species 1 and species 2
        3, 4  protein FASTA files of species 1 and species 2
        5     output file path (table format, sep = ";")
        6     method name passed to ``cal_dn_ds``
        7     path to the MUSCLE executable

    Records are paired by position. For each pair the proteins are aligned
    with MUSCLE (through the scratch files ``outfile_unaligned.fa`` and
    ``outfile_aligned.aln`` in the working directory), a codon alignment is
    built, and one row is printed with dN, dS, the third-position and raw
    mismatch proportions, both sequence lengths, both GC contents, their mean
    and the shorter length.

    NOTE(review): the output file (argument 5) is opened for writing, and so
    created or truncated, but every row is printed to stdout; nothing is
    written to the file here.
    NOTE(review): the last column is headed "Mean_length" but carries the
    shorter of the two sequence lengths.

    A failure on one pair prints the traceback and a message, then the loop
    moves on to the next pair.
    """
    fic1dna = sys.argv[1]  # DNA sequences of species 1
    fic2dna = sys.argv[2]  # DNA sequences of species 2
    fic1prot = sys.argv[3]  # protein sequences of species 1
    fic2prot = sys.argv[4]  # protein sequences of species 2
    outfile_path = sys.argv[5]  # output file, table format, sep = ";"
    method = sys.argv[6]  # method used by cal_dn_ds
    muscle_exe = sys.argv[7]  # path to the MUSCLE executable

    # dN/dS placeholder when the estimate cannot be computed (the original
    # comment attributes this to saturation being too high).
    saturated = 9.999

    # The context manager closes the handle however the function exits.
    with open(outfile_path, "w", encoding='utf-8'):
        # Load every record up front so pairs can be addressed by index.
        seq1dna = list(
            SeqIO.parse(fic1dna, "fasta",
                        alphabet=IUPAC.IUPACUnambiguousDNA()))
        seq2dna = list(
            SeqIO.parse(fic2dna, "fasta",
                        alphabet=IUPAC.IUPACUnambiguousDNA()))
        seq1prot = list(SeqIO.parse(fic1prot, "fasta", alphabet=IUPAC.protein))
        seq2prot = list(SeqIO.parse(fic2prot, "fasta", alphabet=IUPAC.protein))

        print("Nombre de paires de sequences a analyser: ", len(seq1dna))

        # Header row of the table.
        _print_row("seq.id", "dN", "dS", "Dist_third_pos", "Dist_brute",
                   "Length_seq_1", "Length_seq2", "GC_content_seq1",
                   "GC_content_seq2", "GC", "Mean_length")

        # Loop over each pair of sequences.
        for u, dna1 in enumerate(seq1dna):
            try:
                # --- Alignment of the current pair ---
                raw_nuc1 = str(dna1.seq)
                raw_nuc2 = str(seq2dna[u].seq)
                raw_prot1 = str(seq1prot[u].seq)
                raw_prot2 = str(seq2prot[u].seq)

                protein1 = SeqRecord(Seq(raw_prot1, alphabet=IUPAC.protein),
                                     id='protein1')
                protein2 = SeqRecord(Seq(raw_prot2, alphabet=IUPAC.protein),
                                     id='protein2')

                # FASTA file holding the two unaligned proteins: MUSCLE input.
                with open("outfile_unaligned.fa", "w",
                          encoding='utf-8') as output_handle:
                    SeqIO.write([protein1, protein2], output_handle, "fasta")

                # MUSCLE reads the unaligned file and writes the aligned one.
                muscle_cline = MuscleCommandline(
                    muscle_exe,
                    input="outfile_unaligned.fa",
                    out="outfile_aligned.aln")
                muscle_cline()
                alns = AlignIO.read("outfile_aligned.aln", "fasta")

                # Aligned protein sequences, re-wrapped as SeqRecords.
                prot1 = SeqRecord(
                    Seq(str(alns[0].seq), alphabet=IUPAC.protein), id='pro1')
                prot2 = SeqRecord(
                    Seq(str(alns[1].seq), alphabet=IUPAC.protein), id='pro2')

                nuc1 = SeqRecord(
                    Seq(raw_nuc1, alphabet=IUPAC.IUPACUnambiguousDNA()),
                    id='nuc1')
                nuc2 = SeqRecord(
                    Seq(raw_nuc2, alphabet=IUPAC.IUPACUnambiguousDNA()),
                    id='nuc2')

                # Codon alignment built from the aligned proteins and the DNA.
                aln = MultipleSeqAlignment([prot1, prot2])
                codon_aln = codonalign.build(aln, [nuc1, nuc2])

                lengthseq1 = len(nuc1.seq)
                lengthseq2 = len(nuc2.seq)
                GCcontentseq1 = GC(nuc1.seq)
                GCcontentseq2 = GC(nuc2.seq)
                GC_mean = ((GCcontentseq1 + GCcontentseq2) / 2)
                Min_length = min(lengthseq1, lengthseq2)

                # --- Divergence indices ---

                # Remove every column that is a gap in either sequence.
                seq1, seq2 = _strip_gap_columns(codon_aln[0], codon_aln[1])

                # Raw proportion of differing sites.
                distance_brute = round(
                    float(_count_mismatches(seq1, seq2) / len(seq1)), 3)

                # Proportion of differing sites at third codon positions,
                # relative to the number of third positions kept in seq2.
                seq1_third_pos = "".join(
                    c for c in seq1[2::3] if c.isalpha())
                seq2_third_pos = "".join(
                    c for c in seq2[2::3] if c.isalpha())
                distance_third_pos = round(
                    float(
                        _count_mismatches(seq1_third_pos, seq2_third_pos) /
                        len(seq2_third_pos)), 3)

                # dN and dS with the requested method; fall back to the
                # sentinel for the three error types the estimator is
                # expected to raise.
                try:
                    dN, dS = cal_dn_ds(codon_aln[0], codon_aln[1],
                                       method=method)
                except (ValueError, ZeroDivisionError, KeyError):
                    dN = dS = saturated

                _print_row(dna1.id, dN, dS, distance_third_pos,
                           distance_brute, lengthseq1, lengthseq2,
                           GCcontentseq1, GCcontentseq2, GC_mean, Min_length)

            except Exception:
                # Report the failing pair and carry on with the next one.
                # KeyboardInterrupt/SystemExit are deliberately not caught so
                # the run can still be interrupted.
                traceback.print_exc()
                print("Une erreur est survenue pour la sequence: ", dna1.id,
                      "vs", seq2dna[u].id)
Ejemplo n.º 30
0
    }
    #Lists necessary for converting ambiguous nucleotides
    sequence5f = list()  #5' cutting point
    sequence3f = list()  #3' end cutting point
    sequence5c = list()  #5' complementary cutting point

    #Creating the 5' and 3' cutting points
    for nucleotide in sequence5:
        sequence5f.append(
            DIUPAC[nucleotide])  #Changes the nucleotide key for its value
    sequence5f = "".join(sequence5f)  #Joins the list into a string
    for nucleotide in sequence3:
        sequence3f.append(DIUPAC[nucleotide])
    sequence3f = "".join(sequence3f)
    #Obtaining complementary of 5' target sequence (cutting point) to be able to check whether a sequence is the forward or the reverse
    check = Seq(sequence5, IUPAC.IUPACUnambiguousDNA())
    checkseq = check.reverse_complement()
    for nucleotide in checkseq:
        sequence5c.append(DIUPAC[nucleotide])
    sequence5c = "".join(sequence5c)

    log.write(
        "Target string used to cut 5' end: %s\nTarget string used to cut 3' end: %s\n\n"
        % (sequence5f, sequence3f))
    Preverse = list()  #Necessary for cutting
    Cutfiles = list()

    def cutfile(filename, sequence5f, sequence3f
                ):  #Function to cut the fastq files in the indicated points
        global cut  #The variable can also be used outside the function
        global uncut