Ejemplo n.º 1
0
def calcAbundance(alignmentFile, consensusFile, abundanceFile, abundancePercentFile, verbose):
	print('Calculating the abundance matrix...')
	alignment = AlignIO.read(alignmentFile, "fasta")
	summary = SummaryInfo(alignment)
	consensusSeq = SeqIO.read(consensusFile, 'fasta')
	if (len(consensusSeq) == alignment.get_alignment_length()):
		abundanceMatrix = summary.pos_specific_score_matrix(consensusSeq)
	else:
		with open(consensusFile, "w") as f:			
			SeqIO.write(SeqRecord( summary.dumb_consensus(), id='consensus'), f, "fasta")
		abundanceMatrix = summary.pos_specific_score_matrix()
	if verbose:
		print("Abundance matrix (absolute values):")
		print(str(abundanceMatrix))
	with open(abundanceFile, 'w') as f:
		f.write(str(abundanceMatrix))
	for pos, abundance in enumerate(abundanceMatrix):
		for res, value in abundance.items():
			abundanceMatrix[pos][res] = 100.0 * float(value) / float(len(alignment))
	if verbose:
		print("Abundance matrix (percentages):")
		print(str(abundanceMatrix))
	with open(abundancePercentFile, 'w') as f:
		f.write(str(abundanceMatrix))
	print('OK')
    def test_proteins(self):
        alpha = HasStopCodon(Gapped(generic_protein, "-"), "*")
        a = MultipleSeqAlignment([
            SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-", alpha),
                      id="ID001"),
            SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*", alpha),
                      id="ID002"),
            SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*", alpha),
                      id="ID003")
        ])
        self.assertEqual(32, a.get_alignment_length())

        s = SummaryInfo(a)

        c = s.dumb_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")

        c = s.gap_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")

        m = s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
        self.assertEqual(
            str(m),
            """    A   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   W   Y
M  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
H  0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0
X  2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
F  0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
L  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
K  0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
R  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
P  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0
E  0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
N  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
W  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
""")

        ic = s.information_content(chars_to_ignore=['-', '*'])
        self.assertAlmostEqual(ic, 133.061475107, places=6)
Ejemplo n.º 3
0
    def test_nucleotides(self):
        filename = "GFF/multi.fna"
        format = "fasta"
        alignment = AlignIO.read(filename, format, alphabet=unambiguous_dna)
        summary = SummaryInfo(alignment)

        c = summary.dumb_consensus(ambiguous="N")
        self.assertEqual(str(c), "NNNNNNNN")

        c = summary.gap_consensus(ambiguous="N")
        self.assertEqual(str(c), "NNNNNNNN")

        expected = {"A": 0.25, "G": 0.25, "T": 0.25, "C": 0.25}

        m = summary.pos_specific_score_matrix(chars_to_ignore=["-"],
                                              axis_seq=c)
        self.assertEqual(
            str(m), """    A   C   G   T
N  2.0 0.0 1.0 0.0
N  1.0 1.0 1.0 0.0
N  1.0 0.0 2.0 0.0
N  0.0 1.0 1.0 1.0
N  1.0 2.0 0.0 0.0
N  0.0 2.0 1.0 0.0
N  1.0 2.0 0.0 0.0
N  0.0 2.0 1.0 0.0
""")

        # Have a generic alphabet, without a declared gap char, so must tell
        # provide the frequencies and chars to ignore explicitly.
        ic = summary.information_content(e_freq_table=expected,
                                         chars_to_ignore=["-"])
        self.assertAlmostEqual(ic, 7.32029999423075, places=6)
Ejemplo n.º 4
0
def _run_hmmer_single_thread(sequences, iter=3):
    pbar = tqdm(range(len(sequences)), desc="sequences processed")
    database = "data/Uniprot/uniprot_sprot.fasta"
    outpth = os.path.join(tmp_dir, "align.sto")
    for seqid, seq in sequences.items():
        pbar.update(1)
        cline = "jackhmmer -N %d --acc --noali -A %s - %s" % (iter, outpth, database)

        child = subprocess.Popen(cline,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 universal_newlines=True,
                                 shell=(sys.platform != "win32"))

        stdout, _ = child.communicate(input=">%s\n%s" % (seqid, seq))
        assert child.returncode == 0

        info = SummaryInfo(AlignIO.read(outpth, "stockholm"))
        pssm = info.pos_specific_score_matrix(chars_to_ignore=[GAP])

        db.pssm.update_one({
            "_id": seqid}, {
            '$set': {"pssm": pssm.pssm, "seq": seq}
        }, upsert=True)

    pbar.close()
Ejemplo n.º 5
0
    def test_proteins(self):
        a = MultipleSeqAlignment([
            SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-"), id="ID001"),
            SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*"), id="ID002"),
            SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*"), id="ID003")
        ])
        self.assertEqual(32, a.get_alignment_length())

        s = SummaryInfo(a)

        c = s.dumb_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")

        c = s.gap_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")

        m = s.pos_specific_score_matrix(chars_to_ignore=["-", "*"], axis_seq=c)
        self.assertEqual(
            str(m),
            """    A   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   W   Y
M  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
H  0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0
X  2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
F  0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
L  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
K  0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
R  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
P  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0
E  0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
N  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
W  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
""")

        letters = IUPACData.protein_letters
        base_freq = 1.0 / len(letters)
        e_freq_table = {letter: base_freq for letter in letters}
        ic = s.information_content(e_freq_table=e_freq_table,
                                   chars_to_ignore=["-", "*"])
        self.assertAlmostEqual(ic, 133.061475107, places=6)
Ejemplo n.º 6
0
    def test_proteins(self):
        alpha = HasStopCodon(Gapped(generic_protein, "-"), "*")
        a = MultipleSeqAlignment([
                SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-", alpha), id="ID001"),
                SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*", alpha), id="ID002"),
                SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*", alpha), id="ID003")])
        self.assertEqual(32, a.get_alignment_length())

        s = SummaryInfo(a)

        c = s.dumb_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")

        c = s.gap_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")

        m = s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
        self.assertEqual(str(m), """    A   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   W   Y
M  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
H  0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0
X  2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
F  0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
L  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
K  0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
R  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
P  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0
E  0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
N  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
W  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
""")

        ic = s.information_content(chars_to_ignore=['-', '*'])
        self.assertAlmostEqual(ic, 133.061475107, places=6)
Ejemplo n.º 7
0
def make_consensus(alignment_file, format_alignment):
    from Bio import AlignIO
    from Bio.Align.AlignInfo import SummaryInfo
    format = format_alignment
    alignment = AlignIO.read(alignment_file, format)
    summary = SummaryInfo(alignment)
    mat = summary.pos_specific_score_matrix()
    consensus = ""
    l = list()
    for row in mat:
        key, freq = max(row.items(), key=operator.itemgetter(1))
        consensus = "".join([consensus, key])
        l.append(freq / len(alignment._records))
    consensus = consensus.strip("-")
    return consensus, l
Ejemplo n.º 8
0
    def test_nucleotides(self):
        filename = "GFF/multi.fna"
        format = "fasta"
        alignment = AlignIO.read(filename, format, alphabet=unambiguous_dna)
        summary = SummaryInfo(alignment)

        c = summary.dumb_consensus(ambiguous="N")
        self.assertEqual(str(c), 'NNNNNNNN')
        self.assertNotEqual(c.alphabet, unambiguous_dna)
        self.assertTrue(isinstance(c.alphabet, DNAAlphabet))

        c = summary.gap_consensus(ambiguous="N")
        self.assertEqual(str(c), 'NNNNNNNN')
        self.assertNotEqual(c.alphabet, unambiguous_dna)
        self.assertTrue(isinstance(c.alphabet, DNAAlphabet))

        expected = FreqTable({"A": 0.25, "G": 0.25, "T": 0.25, "C": 0.25},
                             FREQ, unambiguous_dna)

        m = summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                              axis_seq=c)
        self.assertEqual(str(m), """    A   C   G   T
N  2.0 0.0 1.0 0.0
N  1.0 1.0 1.0 0.0
N  1.0 0.0 2.0 0.0
N  0.0 1.0 1.0 1.0
N  1.0 2.0 0.0 0.0
N  0.0 2.0 1.0 0.0
N  1.0 2.0 0.0 0.0
N  0.0 2.0 1.0 0.0
""")

        # Have a generic alphabet, without a declared gap char, so must tell
        # provide the frequencies and chars to ignore explicitly.
        ic = summary.information_content(e_freq_table=expected,
                                         chars_to_ignore=['-'])
        self.assertAlmostEqual(ic, 7.32029999423075, places=6)