def compute_silent_contamination(fna, faa, hits):
    """Summarise how many contigs of ``fna`` carry proteins and hits.

    Contig names are recovered from protein names (records of ``faa``) and
    from ``row["target"]`` of every row in ``hits`` by dropping everything up
    to the first underscore and everything from the last underscore on.

    Returns a dict with the keys ``contigs``, ``contigs_w_hits``,
    ``bp_w_hits``, ``contigs_w_proteins`` and ``total_bp``.
    """

    def contig_of(identifier):
        # "<prefix>_<contig>_<suffix>" -> "<contig>"
        without_prefix = identifier.split("_", 1)[1]
        return without_prefix.rsplit("_", 1)[0]

    protein_contigs = {contig_of(entry.name) for entry in Fasta(faa)}
    hit_contigs = {contig_of(hit["target"]) for hit in hits}

    # total_bp is summed per record (not per unique name), so a repeated
    # record name still contributes every time it occurs.
    lengths = {}
    total_bp = 0
    for record in Fasta(fna):
        n_bases = len(record.seq)
        lengths[record.name] = n_bases
        total_bp += n_bases

    hit_sizes = [size for name, size in lengths.items() if name in hit_contigs]
    n_with_proteins = sum(1 for name in lengths if name in protein_contigs)

    return {
        "contigs": len(lengths),
        "contigs_w_hits": len(hit_sizes),
        "bp_w_hits": sum(hit_sizes),
        "contigs_w_proteins": n_with_proteins,
        "total_bp": total_bp,
    }
def test_metaeuk_cleaner(self):
    """Cleaning METAEUK_dirty must yield the same records as METAEUK_clean.

    Records are compared pairwise by ``name`` and ``seq``. The temporary
    output file is removed even when an assertion fails, so a failing run
    does not leave ``.test_tmp_metaeuk_file`` behind.
    """
    tmp_path = ".test_tmp_metaeuk_file"
    try:
        cleaned = Fasta(clean_metaeuk_fasta(METAEUK_dirty, tmp_path))
        clean = Fasta(METAEUK_clean)
        # NOTE(review): zip stops at the shorter input, so a difference in
        # the number of records is not detected by this loop.
        for seq_1, seq_2 in zip(clean, cleaned):
            self.assertEqual(seq_1.name, seq_2.name)
            self.assertEqual(seq_1.seq, seq_2.seq)
    finally:
        # Guard the removal so a failure before the file was written is not
        # masked by a FileNotFoundError from the cleanup itself.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def test_open_invalid(self):
    """Reading NOT_A_FASTA or CORRUPT_FILE must raise ValueError."""

    def read_all(path):
        # Constructing and fully iterating both happen inside the caller's
        # assertRaises block, so the error may come from either step.
        for entry in Fasta(path):
            print(entry)

    with self.assertRaises(ValueError):
        read_all(NOT_A_FASTA)

    with self.assertRaises(ValueError):
        read_all(CORRUPT_FILE)
def test_Fasta_extract(self):
    """reduce_fasta must write the "TEST" record unchanged.

    The "TEST" record is read from TESTDATA_FASTA, extracted into a
    temporary file with ``reduce_fasta`` and read back; ``seq``, ``name`` and
    ``long_name`` must match. The test fails explicitly when either file has
    no "TEST" record (instead of comparing whatever record came last), and
    the temporary file is removed even when an assertion fails.
    """

    def find_test_record(path):
        # Stops at the first record named "TEST"; None when there is none.
        return next((rec for rec in Fasta(path) if rec.name == "TEST"), None)

    outfile = ".test_fasta"
    try:
        seq = find_test_record(TESTDATA_FASTA)
        self.assertIsNotNone(seq, "no record named 'TEST' in the input file")

        reduce_fasta(TESTDATA_FASTA, outfile, ["TEST"])

        seq2 = find_test_record(outfile)
        self.assertIsNotNone(seq2, "no record named 'TEST' in the reduced file")

        self.assertEqual(seq.seq, seq2.seq)
        self.assertEqual(seq.name, seq2.name)
        self.assertEqual(seq.long_name, seq2.long_name)
    finally:
        # Guard the removal so an earlier failure is not masked by a
        # FileNotFoundError from the cleanup itself.
        if os.path.exists(outfile):
            os.remove(outfile)
def split_contig_faa(path, workdir, delim="_binsep_"):
    """Split a FASTA file into one file per bin under ``<workdir>/faa``.

    Every record name in ``path`` is split once at ``delim``: the part before
    it names the output file, the part after it becomes the record name in
    that file. Records are written as ``>name`` followed by the sequence on a
    single line.

    If ``<workdir>/faa`` already exists, nothing is written and the files
    already in it are returned.

    Returns:
        A list of absolute paths of the files in ``<workdir>/faa``, or
        ``None`` (after logging a warning) when ``file.isdir`` reports
        failure for that directory.

    Raises:
        IndexError: if a record name does not contain ``delim``.
    """
    # Local import so this block does not depend on the file's import header.
    from contextlib import ExitStack

    faa_dir = os.path.join(workdir, "faa")

    def list_outputs():
        return [
            os.path.abspath(os.path.join(faa_dir, x))
            for x in os.listdir(faa_dir)
        ]

    if os.path.exists(faa_dir):
        logging.info(
            "Faa folder exists, remove if you want to rerun this analsys, will reuse it for now"
        )
        return list_outputs()

    # NOTE(review): file.isdir is a project helper not visible here; since
    # faa_dir does not exist at this point and the warning below talks about
    # creation, it presumably creates the directory -- confirm.
    if not file.isdir(faa_dir):
        logging.warning("Could not create fasta split dir: {}".format(faa_dir))
        return None

    # ExitStack closes every opened handle even if reading or writing fails
    # part-way through (e.g. a record name without the delimiter).
    with ExitStack() as stack:
        handles = {}
        for seq in Fasta(path):
            parts = seq.name.split(delim, 1)
            # Indexing (rather than unpacking) keeps the IndexError that
            # callers get for names lacking the delimiter.
            bin_name, contig = parts[0], parts[1]
            handle = handles.get(bin_name)
            if handle is None:
                handle = stack.enter_context(
                    open(os.path.join(faa_dir, bin_name), "w")
                )
                handles[bin_name] = handle
            handle.write(">{name}\n{seq}\n".format(name=contig, seq=seq.seq))

    return list_outputs()
def read_fasta(path):
    """Return a dict mapping each record's ``name`` to its ``seq``.

    Records are read with ``Fasta(path)``; when a name occurs more than
    once, the last record with that name wins.
    """
    return {entry.name: entry.seq for entry in Fasta(path)}
def test_multiline(self):
    """The first records of MULTILINE_FILE must have the expected sequences.

    The sequences are collected into a list before comparing, so the test
    also fails when the file yields fewer records than expected (a plain
    ``zip`` would stop early and pass without checking anything).
    """
    # Local import so this block does not depend on the file's import header.
    from itertools import islice

    expected = ["AAGGCCTT", "CCTTAAGG", "AAGGCCTT"]
    # Only the first len(expected) records are read, as before.
    observed = [
        record.seq
        for record in islice(Fasta(MULTILINE_FILE), len(expected))
    ]
    self.assertEqual(expected, observed)
def test_Fasta(self):
    """Every record named "TEST" in TESTDATA_FASTA must have sequence "HELLO".

    The test also requires that at least one such record exists; otherwise
    the loop would never assert anything and the test would pass vacuously.
    """
    matches = [
        record.seq
        for record in Fasta(TESTDATA_FASTA)
        if record.name == "TEST"
    ]
    self.assertTrue(matches, "no record named 'TEST' found")
    for sequence in matches:
        self.assertEqual(sequence, "HELLO")