Esempio n. 1
0
def compute_silent_contamination(fna, faa, hits):
    """Summarise how many contigs (and bases) carry proteins and hits.

    Args:
        fna: Path handed to ``Fasta``; every record contributes its name
            and sequence length.
        faa: Path handed to ``Fasta``; record names are reduced to a
            contig identifier (see ``contig_of`` below).
        hits: Iterable of mappings with a ``"target"`` entry, reduced to a
            contig identifier the same way.

    Returns:
        dict with the keys ``contigs``, ``contigs_w_hits``, ``bp_w_hits``,
        ``contigs_w_proteins`` and ``total_bp``.
    """

    def contig_of(identifier):
        # Drop everything up to the first "_" and everything from the
        # last "_" onwards, e.g. "x_contig1_3" -> "contig1".
        return identifier.split("_", 1)[1].rsplit("_", 1)[0]

    contigs_with_proteins = {contig_of(seq.name) for seq in Fasta(faa)}
    contigs_with_hits = {contig_of(row["target"]) for row in hits}

    # total_bp is accumulated per record (not per unique name), so a
    # repeated record name still counts towards the total.
    lengths = {}
    total_bp = 0
    for rec in Fasta(fna):
        n_bases = len(rec.seq)
        lengths[rec.name] = n_bases
        total_bp += n_bases

    hit_sizes = [
        size for name, size in lengths.items() if name in contigs_with_hits
    ]
    n_with_proteins = sum(
        1 for name in lengths if name in contigs_with_proteins
    )

    return {
        "contigs": len(lengths),
        "contigs_w_hits": len(hit_sizes),
        "bp_w_hits": sum(hit_sizes),
        "contigs_w_proteins": n_with_proteins,
        "total_bp": total_bp,
    }
Esempio n. 2
0
    def test_metaeuk_cleaner(self):
        """Cleaned MetaEuk output must match the reference record by record.

        The temporary output file is removed even when an assertion or a
        parsing error fires, so a failing run does not leave it behind.
        """
        tmp_path = ".test_tmp_metaeuk_file"
        cleaned_path = clean_metaeuk_fasta(METAEUK_dirty, tmp_path)
        try:
            cleaned = Fasta(cleaned_path)
            clean = Fasta(METAEUK_clean)

            # zip stops at the shorter input; only paired records are
            # compared.
            for seq_1, seq_2 in zip(clean, cleaned):
                self.assertEqual(seq_1.name, seq_2.name)
                self.assertEqual(seq_1.seq, seq_2.seq)
        finally:
            os.remove(tmp_path)
Esempio n. 3
0
 def test_open_invalid(self):
     """Reading a non-FASTA or a corrupt file must raise ValueError."""
     for bad_input in (NOT_A_FASTA, CORRUPT_FILE):
         with self.assertRaises(ValueError):
             # The error may surface on construction or while iterating,
             # so both happen inside the assertRaises context.
             reader = Fasta(bad_input)
             for record in reader:
                 print(record)
Esempio n. 4
0
 def test_Fasta_extract(self):
     """The "TEST" record must survive ``reduce_fasta`` unchanged.

     The record is looked up in the source file and again in the file
     written by ``reduce_fasta``; sequence, name and long name must agree.
     The output file is deleted even if one of the assertions fails.
     """
     fa = Fasta(TESTDATA_FASTA)
     for seq in fa:
         if seq.name == "TEST":
             break
     outfile = ".test_fasta"
     reduce_fasta(TESTDATA_FASTA, outfile, ["TEST"])
     try:
         fa2 = Fasta(outfile)
         for seq2 in fa2:
             if seq2.name == "TEST":
                 break
         self.assertEqual(seq.seq, seq2.seq)
         self.assertEqual(seq.name, seq2.name)
         self.assertEqual(seq.long_name, seq2.long_name)
     finally:
         # Clean up regardless of the test outcome.
         os.remove(outfile)
Esempio n. 5
0
def split_contig_faa(path, workdir, delim="_binsep_"):
    """Split a combined protein FASTA into one file per bin.

    Each record name is expected to look like ``<bin><delim><contig>``.
    Records are written to ``<workdir>/faa/<bin>`` with the ``<bin><delim>``
    prefix stripped from the header.

    Args:
        path: Path of the combined FASTA, read with ``Fasta``.
        workdir: Directory in which the ``faa`` output folder lives.
        delim: Separator between bin and contig in the record name.

    Returns:
        A list of absolute paths of all entries in the ``faa`` folder, or
        ``None`` if the folder could not be prepared.

    Raises:
        IndexError: If a record name does not contain ``delim``.
    """
    faa_dir = os.path.join(workdir, "faa")

    def list_outputs():
        # Absolute path of every entry currently in the output folder.
        return [
            os.path.abspath(os.path.join(faa_dir, x))
            for x in os.listdir(faa_dir)
        ]

    if os.path.exists(faa_dir):
        # A previous run left results behind: reuse them as-is.
        logging.info(
            "Faa folder exists, remove if you want to rerun this analsys, will reuse it for now"
        )
        return list_outputs()

    # NOTE(review): `file.isdir` is a helper defined outside this block;
    # presumably it creates the folder and reports success -- confirm.
    if not file.isdir(faa_dir):
        logging.warning("Could not create fasta split dir: {}".format(faa_dir))
        return None

    fls = {}
    try:
        for seq in Fasta(path):
            parts = seq.name.split(delim, 1)
            b = parts[0]
            contig = parts[1]
            if b not in fls:
                fls[b] = open(os.path.join(faa_dir, b), "w")
            fls[b].write(">{name}\n{seq}\n".format(name=contig, seq=seq.seq))
    finally:
        # Close every handle even if reading or writing failed part-way,
        # so no file descriptors are leaked.
        for fl in fls.values():
            fl.close()
    return list_outputs()
Esempio n. 6
0
 def read_fasta(path):
     """Return a dict mapping each record name in ``path`` to its sequence.

     If a name occurs more than once, the last record wins.
     """
     return {entry.name: entry.seq for entry in Fasta(path)}
Esempio n. 7
0
 def test_multiline(self):
     """Records in the multi-line fixture must yield the expected sequences."""
     wanted_sequences = ["AAGGCCTT", "CCTTAAGG", "AAGGCCTT"]
     records = Fasta(MULTILINE_FILE)
     # Pairs are compared in file order; zip stops at the shorter input.
     for wanted, record in zip(wanted_sequences, records):
         self.assertEqual(wanted, record.seq)
Esempio n. 8
0
 def test_Fasta(self):
     """Every record named "TEST" in the fixture must have sequence "HELLO"."""
     for record in Fasta(TESTDATA_FASTA):
         if record.name != "TEST":
             continue
         self.assertEqual(record.seq, "HELLO")