def main(args):
    """Flag contigs with BLASTN hits to known contaminant genomes (hg38, phiX).

    Writes one flagged contig id per line to <tmp_dir>/flagged_contigs.
    """
    # Fix: add_tmp_dir() was previously called twice; once is sufficient.
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["blastn"])
    utilities.check_database(args)
    print("\u001b[1m" + "• Searching database with BLASTN" + "\u001b[0m")
    for target in ["hg38", "phix"]:
        db = f"{args['db']}/known-contam/{target}/{target}"
        out = f"{args['tmp_dir']}/{target}.m8"
        run_blastn(
            args["fna"],
            db,
            out,
            args["threads"],
            args["qcov"],
            args["pid"],
            args["evalue"],
        )
    print("\u001b[1m" + "\n• Identifying contigs with hits to database" + "\u001b[0m")
    # Any contig with at least one hit to either contaminant target is flagged.
    flagged = set()
    for target in ["hg38", "phix"]:
        out = f"{args['tmp_dir']}/{target}.m8"
        for r in utilities.parse_blast(out):
            flagged.add(r["qname"])
    flagged = list(flagged)
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f" {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
def main(args):
    """Flag contigs whose PhyEco phylogenetic marker genes are taxonomic outliers.

    Pipeline: Prodigal gene calls -> HMMER marker search -> pairwise BLAST of
    markers against the database -> outlier detection. Writes one flagged
    contig id per line to <tmp_dir>/flagged_contigs.
    """
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["prodigal", "hmmsearch", "blastp", "blastn"])
    utilities.check_database(args)
    tmp = args["tmp_dir"]

    def announce(message):
        # Bold section header on the terminal.
        print("\u001b[1m" + message + "\u001b[0m")

    announce("• Calling genes with Prodigal")
    utilities.run_prodigal(args["fna"], tmp)
    print(f" all genes: {tmp}/genes.[ffn|faa]")

    announce("\n• Identifying PhyEco phylogenetic marker genes with HMMER")
    utilities.run_hmmsearch(args["db"], tmp, tmp, args["threads"])
    extract_homologs(tmp)
    print(f" hmm results: {tmp}/phyeco.hmmsearch")
    print(f" marker genes: {tmp}/markers")

    announce("\n• Performing pairwise BLAST alignment of marker genes against database")
    align_homologs(args["db"], tmp, args["seq_type"], args["threads"])
    print(f" blast results: {tmp}/alns")

    announce("\n• Finding taxonomic outliers")
    flagged_ids = flag_contigs(args["db"], tmp, args)
    report = f"{tmp}/flagged_contigs"
    print(f" {len(flagged_ids)} flagged contigs: {report}")
    with open(report, "w") as handle:
        handle.writelines(contig + "\n" for contig in flagged_ids)
def main(args):
    """Flag contigs whose read coverage deviates strongly from the bin mean.

    Coverage is computed with CoverM from the supplied BAM files; when more
    than one sample is present, the sample with the largest mean coverage is
    used for outlier detection. Writes one flagged contig id per line to
    <tmp_dir>/flagged_contigs.
    """
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["coverm"])
    print("\u001b[1m" + "• Computing contig coverage" + "\u001b[0m")
    utilities.run_coverm(args["bams"], args["tmp_dir"], args["threads"])
    coverage_df = pd.read_csv(f"{args['tmp_dir']}/coverage.tsv", sep="\t", index_col=0)
    contig_id_list = []
    contig_length_list = []
    for id, seq in utilities.parse_fasta(args["fna"]):
        contig_id_list.append(id)
        contig_length_list.append(len(seq))
    contig_coverage_df = coverage_df.loc[contig_id_list]
    largest_mean_coverage_sample = contig_coverage_df.mean(axis=0).idxmax()
    if contig_coverage_df.shape[1] > 1:
        print(
            "\u001b[1m"
            + f"\n• Sample being used for outlier detection: {largest_mean_coverage_sample.split()[0]}"
            + "\u001b[0m")
    # Fix: always reduce to a single-sample Series. Previously this selection
    # only happened when >1 samples were present, leaving a DataFrame whose
    # scalar comparison below raises ("truth value of a Series is ambiguous")
    # and whose 2-D values break np.average(..., weights=1-D).
    contig_coverage_df = contig_coverage_df.loc[:, largest_mean_coverage_sample]
    if contig_coverage_df.mean() < 1:
        sys.exit(
            "\nError: The average coverage is less than 1 in all the supplied BAM files"
        )
    if args["weighted_mean"]:
        print(
            "\u001b[1m"
            + "\n• Computing per-contig deviation from the weighted mean coverage"
            + "\u001b[0m")
        # Weight each contig's coverage by its length.
        reference = np.average(contig_coverage_df.values, weights=contig_length_list)
    else:
        print("\u001b[1m" + "\n• Computing per-contig deviation from the mean coverage" + "\u001b[0m")
        reference = contig_coverage_df.mean()
    # Outliers deviate from the reference by more than max_deviation-fold
    # in either direction.
    outliers = ((contig_coverage_df / reference) >= args["max_deviation"]) | (
        (contig_coverage_df / reference) <= 1 / args["max_deviation"])
    print("\u001b[1m" + "\n• Identifying outlier contigs" + "\u001b[0m")
    flagged = contig_coverage_df.loc[outliers].index.tolist()
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f" {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
def main(args):
    """Flag contigs whose GC content deviates from the bin-wide mean.

    Writes one flagged contig id per line to <tmp_dir>/flagged_contigs.
    """
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    use_weighted = args["weighted_mean"]
    if use_weighted:
        print("\u001b[1m" + "• Computing weighted mean contig GC content" + "\u001b[0m")
    else:
        print("\u001b[1m" + "• Computing mean contig GC content" + "\u001b[0m")
    contigs = {}
    lengths = []
    for contig_id, seq in utilities.parse_fasta(args["fna"]):
        record = Contig()
        record.id = contig_id
        record.seq = str(seq)
        record.gc = round(SeqUtils.GC(seq), 2)
        contigs[contig_id] = record
        lengths.append(len(seq))
    gc_values = [record.gc for record in contigs.values()]
    if use_weighted:
        print("\u001b[1m" + "\n• Computing per-contig deviation from weighted mean" + "\u001b[0m")
        # Length-weighted mean: long contigs dominate the reference GC.
        reference = np.average(gc_values, weights=lengths)
    else:
        print("\u001b[1m" + "\n• Computing per-contig deviation from mean" + "\u001b[0m")
        reference = np.average(gc_values)
    for record in contigs.values():
        record.values = {"delta": abs(record.gc - reference)}
    print("\u001b[1m" + "\n• Identifying outlier contigs" + "\u001b[0m")
    flagged = [
        record.id
        for record in contigs.values()
        if record.values["delta"] > args["cutoff"]
    ]
    report = f"{args['tmp_dir']}/flagged_contigs"
    print(f" {len(flagged)} flagged contigs: {report}")
    with open(report, "w") as handle:
        for contig_id in flagged:
            handle.write(contig_id + "\n")
def main(args):
    """Merge the flagged-contig lists from all modules and write the cleaned bin.

    Reads <out>/<program>/flagged_contigs for each upstream module, unions the
    flagged ids, removes them from the input FASTA, and writes the remainder
    (wrapped at 70 columns) to args['out_fna'].
    """
    utilities.check_input(args)
    print("\u001b[1m" + "• Reading genome bin" + "\u001b[0m")
    genome = dict(utilities.parse_fasta(args["fna"]))
    genome_kbp = round(sum(len(seq) for seq in genome.values()) / 1000, 2)
    print(f" genome length: {len(genome)} contigs, {genome_kbp} Kbp")
    print("\u001b[1m" + "\n• Reading flagged contigs" + "\u001b[0m")
    flagged_contigs = []
    programs = [
        "phylo-markers",
        "clade-markers",
        "conspecific",
        "tetra-freq",
        "gc-content",
        "coverage",
        "known-contam",
    ]
    for program in programs:
        path = f"{args['out']}/{program}/flagged_contigs"
        if not os.path.exists(path):
            print(f" {program}: no output file found")
        else:
            contig_ids = [line.rstrip() for line in open(path)]
            kbp = round(sum(len(genome[cid]) for cid in contig_ids) / 1000, 2)
            flagged_contigs += contig_ids
            print(f" {program}: {len(contig_ids)} contigs, {kbp} Kbp")
    # Union of all modules' flags.
    flagged_contigs = list(set(flagged_contigs))
    flagged_kbp = round(
        sum(len(genome[cid]) for cid in flagged_contigs) / 1000, 2)
    print("\u001b[1m" + "\n• Removing flagged contigs" + "\u001b[0m")
    flagged_set = set(flagged_contigs)
    clean = {cid: seq for cid, seq in genome.items() if cid not in flagged_set}
    clean_kbp = round(sum(len(seq) for seq in clean.values()) / 1000, 2)
    print(f" removed: {len(flagged_contigs)} contigs, {flagged_kbp} Kbp")
    print(f" remains: {len(clean)} contigs, {clean_kbp} Kbp")
    with open(args['out_fna'], "w") as handle:
        for cid, seq in clean.items():
            handle.write(">" + cid + "\n" + textwrap.fill(seq, 70) + "\n")
    print(f" cleaned bin: {args['out_fna']}")
def main(args):
    """Flag contigs with no alignments to any conspecific genome in the database.

    Uses Mash to find genomes within args['mash_dist'] of the bin, aligns the
    bin's contigs against them, and flags contigs that hit none. Writes one
    flagged contig id per line to <tmp_dir>/flagged_contigs.
    """
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["mash"])
    tmp = args["tmp_dir"]

    def announce(message):
        # Bold section header on the terminal.
        print("\u001b[1m" + message + "\u001b[0m")

    def write_genome_list(path, genome_pairs):
        # Persist (genome_id, mash_dist) pairs as a two-column TSV.
        with open(path, "w") as handle:
            handle.write("genome_id\tmash_dist\n")
            for genome_id, mash_dist in genome_pairs:
                handle.write(genome_id + "\t" + str(mash_dist) + "\n")

    if not os.path.exists(args["mash_sketch"]):
        sys.exit(f"\nError: mash sketch '{args['mash_sketch']}' not found\n")

    announce("• Finding conspecific genomes in database")
    run_mash(args["mash_sketch"], args["fna"], tmp, args["threads"])
    genomes = find_conspecific(tmp, args["mash_dist"], args["exclude"])
    print(f" {len(genomes)} genomes within {args['mash_dist']} mash-dist")
    listing = f"{tmp}/conspecific.list"
    write_genome_list(listing, genomes)
    print(f" list of genomes: {listing}")
    print(f" mash output: {tmp}/mash.dist")

    if len(genomes) < args["min_genomes"]:
        sys.exit("\nError: insufficient number of conspecific genomes\n")
    if len(genomes) > args["max_genomes"]:
        announce(f"\n• Selecting top {args['max_genomes']} most-similar genomes")
        genomes = genomes[0:args["max_genomes"]]
        listing = f"{tmp}/conspecific_subset.list"
        write_genome_list(listing, genomes)
        print(f" list of genomes: {listing}")

    announce("\n• Performing pairwise alignment of contigs in bin to database genomes")
    alignments = align_contigs(args, genomes)
    total_alignments = sum(len(block.split("\n")) for block in alignments)
    print(f" total alignments: {total_alignments}")

    announce("\n• Summarizing alignments")
    contigs = find_contig_targets(args, genomes, alignments)
    hits_path = f"{tmp}/contig_hits.tsv"
    with open(hits_path, "w") as handle:
        handle.write("contig_id\tlength\talignment_rate\n")
        for contig_id, values in contigs.items():
            fields = [
                contig_id,
                str(values["len"]),
                f"{values['hits']}/{len(genomes)}",
            ]
            handle.write("\t".join(fields) + "\n")
    print(f" contig features: {hits_path}")

    announce("\n• Identifying contigs with no conspecific alignments")
    flagged = flag_contigs(args, contigs)
    flagged_path = f"{tmp}/flagged_contigs"
    with open(flagged_path, "w") as handle:
        for contig_id in flagged:
            handle.write(contig_id + "\n")
    print(f" {len(flagged)} flagged contigs: {flagged_path}")
def main(args):
    """Flag contigs that disagree with the bin's consensus taxonomy, using the
    MetaPhlAn2 database of clade-specific marker genes.

    Fixes relative to the previous version:
    - when a higher-scoring alignment is found for a gene, the stored
      alignment is now replaced together with its taxonomy (previously only
      ref_taxa was updated, so best-score comparisons and the pid/qcov/tcov
      filters below used a stale alignment);
    - per-rank counters are pre-initialized over `ranks`, so the summary
      print loops cannot raise KeyError for ranks with zero classified
      genes/contigs;
    - qcov/tcov are converted with float() for consistency with pid.
    """
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_database(args)
    print("\u001b[1m" + "• Reading database info" + "\u001b[0m")
    ref_taxonomy = read_ref_taxonomy(args["db"])
    # Map every individual taxon to its full "|"-delimited lineage string.
    taxon_to_taxonomy = {}
    for taxonomy in set(ref_taxonomy.values()):
        for taxon in taxonomy.split("|"):
            taxon_to_taxonomy[taxon] = taxonomy
    # Minimum percent identity required to classify a gene at each rank.
    min_pid = {"k": 57, "p": 77, "c": 82, "o": 86, "f": 87, "g": 91, "s": 96}
    if args["min_genes"] is not None:
        # A single user-supplied threshold applies to every rank.
        args["min_genes"] = {r: args["min_genes"] for r in ranks}
    else:
        # Rank-specific defaults.
        args["min_genes"] = {
            "k": 237,
            "p": 44,
            "c": 30,
            "o": 24,
            "f": 22,
            "g": 20,
            "s": 19,
        }
    print("\u001b[1m" + "\n• Calling genes with Prodigal" + "\u001b[0m")
    utilities.run_prodigal(args["fna"], args["tmp_dir"])
    print(f" all genes: {args['tmp_dir']}/genes.[ffn|faa]")
    print(
        "\u001b[1m"
        + "\n• Performing pairwise alignment of genes against MetaPhlan2 database of clade-specific genes"
        + "\u001b[0m"
    )
    utilities.run_lastal(args["db"], args["tmp_dir"], args["threads"])
    print(f" alignments: {args['tmp_dir']}/genes.m8")
    print("\u001b[1m" + "\n• Finding top hits to database" + "\u001b[0m")
    genes = {}
    for aln in utilities.parse_last(args["tmp_dir"] + "/genes.m8"):
        # clade exclusion
        ref_taxa = ref_taxonomy[aln["tid"]].split("|")
        if args["exclude_clades"] and any(
            taxon in ref_taxa for taxon in args["exclude_clades"]
        ):
            continue
        # initialize gene
        if aln["qid"] not in genes:
            gene = Gene()
            gene.id = aln["qid"]
            gene.contig_id = aln["qid"].rsplit("_", 1)[0]
            genes[aln["qid"]] = gene
        gene = genes[aln["qid"]]
        # Keep the best-scoring alignment. Fix: update .aln together with
        # .ref_taxa so downstream filters see the alignment they belong to.
        if gene.aln is None or float(aln["score"]) > float(gene.aln["score"]):
            gene.aln = aln
            gene.ref_taxa = ref_taxa
    print(" %s genes with a database hit" % len(genes))
    print("\u001b[1m" + "\n• Classifying genes at each taxonomic rank" + "\u001b[0m")
    # Pre-initialize so the summary loop below never KeyErrors on empty ranks.
    counts = {rank: 0 for rank in ranks}
    for gene in genes.values():
        for ref_taxon in gene.ref_taxa:
            rank = ref_taxon.split("__")[0]
            if rank not in counts:
                counts[rank] = 0
            # Strain level ("t") is never classified; other ranks must pass
            # identity and (query/target) coverage thresholds.
            if rank == "t":
                continue
            elif float(gene.aln["pid"]) < min_pid[rank]:
                continue
            elif float(gene.aln["qcov"]) < 0.4:
                continue
            elif float(gene.aln["tcov"]) < 0.4:
                continue
            gene.taxa[rank] = ref_taxon
            counts[rank] += 1
    for rank in ranks:
        print(f" {rank_names[rank]}: {counts[rank]} classified genes")
    print("\u001b[1m" + "\n• Taxonomically classifying contigs" + "\u001b[0m")
    contigs = {}
    for id, seq in utilities.parse_fasta(args["fna"]):
        contigs[id] = Contig()
        contigs[id].id = id
        contigs[id].length = len(seq)
    # aggregate hits by contig
    for gene in genes.values():
        contigs[gene.contig_id].genes.append(gene)
    # classify contigs at each level
    for contig in contigs.values():
        contig.classify()
    # summarize
    counts = {rank: 0 for rank in ranks}
    for contig in contigs.values():
        for rank, taxon in contig.cons_taxa.items():
            if rank not in counts:
                counts[rank] = 0
            if taxon is not None:
                counts[rank] += 1
    print(" total contigs: %s" % len(contigs))
    for rank in ranks:
        print(f" {rank_names[rank]}: {counts[rank]} classified contigs")
    print("\u001b[1m" + "\n• Taxonomically classifying genome" + "\u001b[0m")
    bin = Bin()
    bin.classify(
        contigs,
        args["min_bin_fract"],
        args["min_contig_fract"],
        args["min_gene_fract"],
        args["min_genes"],
        args["lowest_rank"],
    )
    print(f" consensus taxon: {bin.cons_taxon}")
    print("\u001b[1m" + "\n• Identifying taxonomically discordant contigs" + "\u001b[0m")
    if bin.cons_taxon is not None:
        # Truncate the full lineage at the consensus taxon's rank.
        bin.rank_index = (
            taxon_to_taxonomy[bin.cons_taxon].split("|").index(bin.cons_taxon)
        )
        bin.taxonomy = taxon_to_taxonomy[bin.cons_taxon].split("|")[
            0 : bin.rank_index + 1
        ]
        flag_contigs(contigs, bin)
    flagged = [contig.id for contig in contigs.values() if contig.flagged]
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f" {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
def main(args):
    """Flag contigs whose tetranucleotide frequency profile is an outlier.

    Counts canonical (strand-collapsed) 4-mers per contig, normalizes to
    percentages, projects contigs onto the first principal component, and
    flags contigs whose deviation from the reference PC value exceeds
    args['cutoff']. Writes one flagged contig id per line to
    <tmp_dir>/flagged_contigs.
    """
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    # Fix: dropped the spurious check_dependencies(["blastn"]) call — this
    # module never invokes BLAST; it only does in-process kmer counting + PCA.
    print("\u001b[1m" + "• Counting tetranucleotides" + "\u001b[0m")
    # init data
    contigs = {}
    contig_length_list = []
    for id, seq in utilities.parse_fasta(args["fna"]):
        contig = Contig()
        contig.id = id
        contig.seq = str(seq)
        contig.kmers = init_kmers()
        contigs[id] = contig
        contig_length_list.append(len(seq))
    # count kmers, collapsing each kmer with its reverse complement
    for contig in contigs.values():
        for i in range(len(contig.seq) - 3):
            kmer_fwd = contig.seq[i : i + 4]
            if kmer_fwd in contig.kmers:
                contig.kmers[kmer_fwd] += 1
            else:
                kmer_rev = utilities.reverse_complement(kmer_fwd)
                # Fix: skip kmers containing ambiguous bases (e.g. 'N') —
                # neither orientation is in the table, which previously
                # raised KeyError here.
                if kmer_rev in contig.kmers:
                    contig.kmers[kmer_rev] += 1
    print("\u001b[1m" + "\n• Normalizing counts" + "\u001b[0m")
    # Convert raw counts to per-contig percentages.
    for contig in contigs.values():
        total = float(sum(contig.kmers.values()))
        for kmer, count in contig.kmers.items():
            contig.kmers[kmer] = 100 * count / total if total > 0 else 0.0
    print("\u001b[1m" + "\n• Performing PCA" + "\u001b[0m")
    # Columns are contigs, rows are kmers; components_[0] has one loading
    # per contig (df column).
    df = pd.DataFrame({c.id: c.kmers for c in contigs.values()})
    pca = PCA(n_components=1)
    pca.fit(df)
    pc1 = pca.components_[0]
    if args["weighted_mean"]:
        print(
            "\u001b[1m"
            + "\n• Computing per-contig deviation from the weighted mean along the first principal component"
            + "\u001b[0m"
        )
        reference_pc = np.average(pc1, weights=contig_length_list)
    else:
        print(
            "\u001b[1m"
            + "\n• Computing per-contig deviation from the mean along the first principal component"
            + "\u001b[0m"
        )
        reference_pc = np.average(pc1)
    for contig_id, contig_pc in zip(list(df.columns), pc1):
        contigs[contig_id].pc = contig_pc
        contigs[contig_id].values = {}
        contigs[contig_id].values["delta"] = abs(contig_pc - reference_pc)
    print("\u001b[1m" + "\n• Identifying outlier contigs" + "\u001b[0m")
    flagged = [
        contig.id
        for contig in contigs.values()
        if contig.values["delta"] > args["cutoff"]
    ]
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f" {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")