Example #1
def main(args):
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["blastn"])
    utilities.check_database(args)
    print("\u001b[1m" + "• Searching database with BLASTN" + "\u001b[0m")
    # screen contigs against each known-contaminant database (human hg38, PhiX)
    for target in ["hg38", "phix"]:
        db = f"{args['db']}/known-contam/{target}/{target}"
        out = f"{args['tmp_dir']}/{target}.m8"
        run_blastn(
            args["fna"],
            db,
            out,
            args["threads"],
            args["qcov"],
            args["pid"],
            args["evalue"],
        )
    print("\u001b[1m" + "\n• Identifying contigs with hits to database" + "\u001b[0m")
    flagged = set([])
    for target in ["hg38", "phix"]:
        out = f"{args['tmp_dir']}/{target}.m8"
        for r in utilities.parse_blast(out):
            flagged.add(r["qname"])
    flagged = list(flagged)
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
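The `run_blastn` helper is not shown in this example. As a point of reference, here is a minimal sketch of what such a wrapper might look like, assuming it shells out to NCBI BLAST+ with tabular output (the `.m8` extension suggests `-outfmt 6`); the argument order follows the call site above, but the exact flags are an assumption:

import subprocess

def run_blastn(query, db, out, threads, qcov, pid, evalue):
    # Hypothetical sketch: align contigs against a contaminant database
    # and write tabular (".m8" / -outfmt 6) hits to `out`.
    cmd = [
        "blastn",
        "-query", query,
        "-db", db,
        "-out", out,
        "-outfmt", "6",               # tabular: qseqid sseqid pident ... bitscore
        "-num_threads", str(threads),
        "-evalue", str(evalue),
        "-perc_identity", str(pid),   # minimum percent identity
        "-qcov_hsp_perc", str(qcov),  # minimum query coverage per HSP
    ]
    subprocess.run(cmd, check=True)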
Example #2
def main(args):
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["prodigal", "hmmsearch", "blastp", "blastn"])
    utilities.check_database(args)
    print("\u001b[1m" + "• Calling genes with Prodigal" + "\u001b[0m")
    utilities.run_prodigal(args["fna"], args["tmp_dir"])
    print(f"  all genes: {args['tmp_dir']}/genes.[ffn|faa]")
    print("\u001b[1m" +
          "\n• Identifying PhyEco phylogenetic marker genes with HMMER" +
          "\u001b[0m")
    utilities.run_hmmsearch(args["db"], args["tmp_dir"], args["tmp_dir"],
                            args["threads"])
    extract_homologs(args["tmp_dir"])
    print(f"  hmm results: {args['tmp_dir']}/phyeco.hmmsearch")
    print(f"  marker genes: {args['tmp_dir']}/markers")
    print(
        "\u001b[1m" +
        "\n• Performing pairwise BLAST alignment of marker genes against database"
        + "\u001b[0m")
    align_homologs(args["db"], args["tmp_dir"], args["seq_type"],
                   args["threads"])
    print(f"  blast results: {args['tmp_dir']}/alns")
    print("\u001b[1m" + "\n• Finding taxonomic outliers" + "\u001b[0m")
    flagged = flag_contigs(args["db"], args["tmp_dir"], args)
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
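`utilities.run_prodigal` is presumably a thin wrapper around the Prodigal CLI. A minimal sketch under that assumption; the `-p meta` mode and the `genes.ffn`/`genes.faa` output names are inferred from the print statement above:

import subprocess

def run_prodigal(fna, out_dir):
    # Hypothetical sketch: predict genes, writing nucleotide (.ffn) and
    # protein (.faa) sequences into the temporary directory.
    cmd = [
        "prodigal",
        "-i", fna,
        "-d", f"{out_dir}/genes.ffn",  # predicted genes, nucleotide
        "-a", f"{out_dir}/genes.faa",  # predicted genes, protein
        "-p", "meta",                  # anonymous/metagenome mode
    ]
    subprocess.run(cmd, check=True, capture_output=True)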
Example #3
def main(args):
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["coverm"])
    print("\u001b[1m" + "• Computing contig coverage" + "\u001b[0m")
    utilities.run_coverm(args["bams"], args["tmp_dir"], args["threads"])
    coverage_df = pd.read_csv(f"{args['tmp_dir']}/coverage.tsv",
                              sep="\t",
                              index_col=0)
    contig_id_list = []
    contig_length_list = []
    for id, seq in utilities.parse_fasta(args["fna"]):
        contig_id_list.append(id)
        contig_length_list.append(len(seq))
    contig_coverage_df = coverage_df.loc[contig_id_list]
    # use the sample with the highest mean coverage for outlier detection
    largest_mean_coverage_sample = contig_coverage_df.mean(axis=0).idxmax()
    if contig_coverage_df.shape[1] > 1:
        print(
            "\u001b[1m" +
            f"\n• Sample being used for outlier detection: {largest_mean_coverage_sample.split()[0]}"
            + "\u001b[0m")
    contig_coverage_df = contig_coverage_df.loc[:,
                                                largest_mean_coverage_sample]
    if contig_coverage_df.mean() < 1:
        sys.exit(
            "\nError: The average coverage is less than 1 in all the supplied BAM files"
        )
    if args["weighted_mean"]:
        print(
            "\u001b[1m" +
            "\n• Computing per-contig deviation from the weighted mean coverage"
            + "\u001b[0m")
        reference = np.average(contig_coverage_df.values,
                               weights=contig_length_list)
    else:
        print("\u001b[1m" +
              "\n• Computing per-contig deviation from the mean coverage" +
              "\u001b[0m")
        reference = contig_coverage_df.mean()
    outliers = ((contig_coverage_df / reference) >= args["max_deviation"]) | (
        (contig_coverage_df / reference) <= 1 / args["max_deviation"])
    print("\u001b[1m" + "\n• Identifying outlier contigs" + "\u001b[0m")
    flagged = contig_coverage_df.loc[outliers].index.tolist()
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
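Several of these examples iterate over `utilities.parse_fasta(args["fna"])`. A minimal sketch of such a generator, assuming uncompressed FASTA input and `(id, sequence)` tuples as the call sites suggest:

def parse_fasta(path):
    # Hypothetical sketch: yield (id, sequence) tuples from a FASTA file;
    # the id is the first whitespace-delimited token of the header line.
    seq_id, chunks = None, []
    with open(path) as fh:
        for line in fh:
            line = line.rstrip()
            if line.startswith(">"):
                if seq_id is not None:
                    yield seq_id, "".join(chunks)
                seq_id, chunks = line[1:].split()[0], []
            elif line:
                chunks.append(line)
    if seq_id is not None:
        yield seq_id, "".join(chunks)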
Example #4
def main(args):
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    if args["weighted_mean"]:
        print("\u001b[1m" + "• Computing weighted mean contig GC content" +
              "\u001b[0m")
    else:
        print("\u001b[1m" + "• Computing mean contig GC content" + "\u001b[0m")
    contigs = {}
    contig_length_list = []
    for id, seq in utilities.parse_fasta(args["fna"]):
        contig = Contig()
        contig.id = id
        contig.seq = str(seq)
        contig.gc = round(SeqUtils.GC(seq), 2)
        contigs[id] = contig
        contig_length_list.append(len(seq))
    if args["weighted_mean"]:
        print("\u001b[1m" +
              "\n• Computing per-contig deviation from weighted mean" +
              "\u001b[0m")
        reference = np.average([c.gc for c in contigs.values()],
                               weights=contig_length_list)
    else:
        print("\u001b[1m" + "\n• Computing per-contig deviation from mean" +
              "\u001b[0m")
        reference = np.average([c.gc for c in contigs.values()])
    for contig in contigs.values():
        contig.values = {}
        contig.values["delta"] = abs(contig.gc - reference)
    print("\u001b[1m" + "\n• Identifying outlier contigs" + "\u001b[0m")
    flagged = [
        contig.id for contig in contigs.values()
        if contig.values["delta"] > args["cutoff"]
    ]

    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
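Note that `SeqUtils.GC` was deprecated in Biopython 1.80 and later removed in favor of `gc_fraction`, which returns a fraction in [0, 1] rather than a percentage. On a recent Biopython, a drop-in replacement for the call above could look like:

from Bio.SeqUtils import gc_fraction

def gc_percent(seq):
    # gc_fraction returns 0..1; scale to the 0..100 range that the old
    # SeqUtils.GC returned, rounded as in the example above.
    return round(100 * gc_fraction(seq), 2)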
Example #5
def main(args):
    utilities.check_input(args)
    print("\u001b[1m" + "• Reading genome bin" + "\u001b[0m")
    bin = {id: seq for id, seq in utilities.parse_fasta(args["fna"])}
    bin_length = round(sum(len(_) for _ in bin.values()) / 1000, 2)
    print(f"  genome length: {len(bin)} contigs, {bin_length} Kbp")
    print("\u001b[1m" + "\n• Reading flagged contigs" + "\u001b[0m")
    flagged_contigs = []
    programs = [
        "phylo-markers",
        "clade-markers",
        "conspecific",
        "tetra-freq",
        "gc-content",
        "coverage",
        "known-contam",
    ]
    for program in programs:
        path = f"{args['out']}/{program}/flagged_contigs"
        if not os.path.exists(path):
            print(f"  {program}: no output file found")
        else:
            with open(path) as fh:
                contigs = [line.rstrip() for line in fh]
            bases = round(sum(len(bin[id]) for id in contigs) / 1000, 2)
            flagged_contigs += contigs
            print(f"  {program}: {len(contigs)} contigs, {bases} Kbp")
    flagged_contigs = list(set(flagged_contigs))
    flagged_length = round(
        sum(len(bin[id]) for id in flagged_contigs) / 1000, 2)
    print("\u001b[1m" + "\n• Removing flagged contigs" + "\u001b[0m")
    clean = bin.copy()
    for id in flagged_contigs:
        del clean[id]
    clean_length = round(sum(len(_) for _ in clean.values()) / 1000, 2)
    print(f"  removed: {len(flagged_contigs)} contigs, {flagged_length} Kbp")
    print(f"  remains: {len(clean)} contigs, {clean_length} Kbp")
    with open(args['out_fna'], "w") as f:
        for id, seq in clean.items():
            f.write(">" + id + "\n" + textwrap.fill(seq, 70) + "\n")
    print(f"  cleaned bin: {args['out_fna']}")
Example #6
def main(args):
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["mash"])
    if not os.path.exists(args["mash_sketch"]):
        sys.exit(f"\nError: mash sketch '{args['mash_sketch']}' not found\n")
    print("\u001b[1m" + "• Finding conspecific genomes in database" +
          "\u001b[0m")
    run_mash(args["mash_sketch"], args["fna"], args["tmp_dir"],
             args["threads"])
    genomes = find_conspecific(args["tmp_dir"], args["mash_dist"],
                               args["exclude"])
    print(f"  {len(genomes)} genomes within {args['mash_dist']} mash-dist")
    out = f"{args['tmp_dir']}/conspecific.list"
    with open(out, "w") as f:
        f.write("genome_id\tmash_dist\n")
        for genome_id, mash_dist in genomes:
            f.write(genome_id + "\t" + str(mash_dist) + "\n")
    print(f"  list of genomes: {out}")
    print(f"  mash output: {args['tmp_dir']}/mash.dist")
    if len(genomes) < args["min_genomes"]:
        sys.exit("\nError: insufficient number of conspecific genomes\n")
    if len(genomes) > args["max_genomes"]:
        print("\u001b[1m" +
              f"\n• Selecting top {args['max_genomes']} most-similar genomes" +
              "\u001b[0m")
        genomes = genomes[0:args["max_genomes"]]
        out = f"{args['tmp_dir']}/conspecific_subset.list"
        with open(out, "w") as f:
            f.write("genome_id\tmash_dist\n")
            for genome_id, mash_dist in genomes:
                f.write(genome_id + "\t" + str(mash_dist) + "\n")
        print(f"  list of genomes: {out}")
    print(
        "\u001b[1m" +
        "\n• Performing pairwise alignment of contigs in bin to database genomes"
        + "\u001b[0m")
    alignments = align_contigs(args, genomes)
    num_alns = sum(len(_.split("\n")) for _ in alignments)
    print(f"  total alignments: {num_alns}")
    print("\u001b[1m" + "\n• Summarizing alignments" + "\u001b[0m")
    contigs = find_contig_targets(args, genomes, alignments)
    out = f"{args['tmp_dir']}/contig_hits.tsv"
    with open(out, "w") as f:
        f.write("contig_id\tlength\talignment_rate\n")
        for contig, values in contigs.items():
            row = [
                contig,
                str(values["len"]), f"{values['hits']}/{len(genomes)}"
            ]
            f.write("\t".join(row) + "\n")
    print(f"  contig features: {out}")
    print("\u001b[1m" +
          "\n• Identifying contigs with no conspecific alignments" +
          "\u001b[0m")
    flagged = flag_contigs(args, contigs)
    out = f"{args['tmp_dir']}/flagged_contigs"
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
    print(f"  {len(flagged)} flagged contigs: {out}")
Example #7
def main(args):
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_database(args)
    print("\u001b[1m" + "• Reading database info" + "\u001b[0m")
    ref_taxonomy = read_ref_taxonomy(args["db"])
    taxon_to_taxonomy = {}
    for taxonomy in set(ref_taxonomy.values()):
        for taxon in taxonomy.split("|"):
            taxon_to_taxonomy[taxon] = taxonomy
    min_pid = {"k": 57, "p": 77, "c": 82, "o": 86, "f": 87, "g": 91, "s": 96}
    if args["min_genes"] is not None:
        args["min_genes"] = dict([(r, args["min_genes"]) for r in ranks])
    else:
        args["min_genes"] = {
            "k": 237,
            "p": 44,
            "c": 30,
            "o": 24,
            "f": 22,
            "g": 20,
            "s": 19,
        }
    print("\u001b[1m" + "\n• Calling genes with Prodigal" + "\u001b[0m")
    utilities.run_prodigal(args["fna"], args["tmp_dir"])
    print(f"  all genes: {args['tmp_dir']}/genes.[ffn|faa]")
    print(
        "\u001b[1m"
        + "\n• Performing pairwise alignment of genes against MetaPhlan2 database of clade-specific genes"
        + "\u001b[0m"
    )
    utilities.run_lastal(args["db"], args["tmp_dir"], args["threads"])
    print(f"  alignments: {args['tmp_dir']}/genes.m8")

    print("\u001b[1m" + "\n• Finding top hits to database" + "\u001b[0m")
    genes = {}
    for aln in utilities.parse_last(args["tmp_dir"] + "/genes.m8"):
        # clade exclusion
        ref_taxa = ref_taxonomy[aln["tid"]].split("|")
        if args["exclude_clades"] and any(
            taxon in ref_taxa for taxon in args["exclude_clades"]
        ):
            continue
        # initialize gene
        if aln["qid"] not in genes:
            genes[aln["qid"]] = Gene()
            genes[aln["qid"]].id = aln["qid"]
            genes[aln["qid"]].contig_id = aln["qid"].rsplit("_", 1)[0]

        # keep only the top-scoring alignment for each gene
        if genes[aln["qid"]].aln is None:
            genes[aln["qid"]].aln = aln
            genes[aln["qid"]].ref_taxa = ref_taxa
        elif float(aln["score"]) > float(genes[aln["qid"]].aln["score"]):
            # update both the alignment and its taxa when a better hit is found
            genes[aln["qid"]].aln = aln
            genes[aln["qid"]].ref_taxa = ref_taxa
    print(f"  {len(genes)} genes with a database hit")
    print("\u001b[1m" + "\n• Classifying genes at each taxonomic rank" + "\u001b[0m")
    # pre-initialize every rank so the summary below never hits a missing key
    counts = {rank: 0 for rank in ranks}
    for gene in genes.values():
        for ref_taxon in gene.ref_taxa:
            rank = ref_taxon.split("__")[0]
            if rank == "t":
                continue
            elif float(gene.aln["pid"]) < min_pid[rank]:
                continue
            elif float(gene.aln["qcov"]) < 0.4:
                continue
            elif float(gene.aln["tcov"]) < 0.4:
                continue
            gene.taxa[rank] = ref_taxon
            counts[rank] += 1
    for rank in ranks:
        print(f"  {rank_names[rank]}: {counts[rank]} classified genes")
    print("\u001b[1m" + "\n• Taxonomically classifying contigs" + "\u001b[0m")
    contigs = {}
    for id, seq in utilities.parse_fasta(args["fna"]):
        contigs[id] = Contig()
        contigs[id].id = id
        contigs[id].length = len(seq)
    # aggregate hits by contig
    for gene in genes.values():
        contigs[gene.contig_id].genes.append(gene)
    # classify contigs at each level
    for contig in contigs.values():
        contig.classify()
    # summarize
    counts = {rank: 0 for rank in ranks}
    for contig in contigs.values():
        for rank, taxon in contig.cons_taxa.items():
            if rank in counts and taxon is not None:
                counts[rank] += 1
    print(f"  total contigs: {len(contigs)}")
    for rank in ranks:
        print(f"  {rank_names[rank]}: {counts[rank]} classified contigs")

    print("\u001b[1m" + "\n• Taxonomically classifying genome" + "\u001b[0m")
    bin = Bin()
    bin.classify(
        contigs,
        args["min_bin_fract"],
        args["min_contig_fract"],
        args["min_gene_fract"],
        args["min_genes"],
        args["lowest_rank"],
    )
    print(f"  consensus taxon: {bin.cons_taxon}")
    print("\u001b[1m" + "\n• Identifying taxonomically discordant contigs" + "\u001b[0m")
    if bin.cons_taxon is not None:
        bin.rank_index = (
            taxon_to_taxonomy[bin.cons_taxon].split("|").index(bin.cons_taxon)
        )
        bin.taxonomy = taxon_to_taxonomy[bin.cons_taxon].split("|")[
            0 : bin.rank_index + 1
        ]
        flag_contigs(contigs, bin)
    flagged = [contig.id for contig in contigs.values() if contig.flagged]
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
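`Contig.classify` is not shown; from the way `cons_taxa` is consumed above, a per-rank majority vote over each contig's classified genes would be consistent. A hedged sketch (the rank list and the >50% threshold are assumptions, not necessarily what the original uses):

from collections import Counter

ranks = ["k", "p", "c", "o", "f", "g", "s"]  # assumed module-level rank order

class Contig:
    def __init__(self):
        self.genes = []
        self.cons_taxa = {}
        self.flagged = False

    def classify(self):
        # Hypothetical sketch: at each rank, keep the taxon supported by a
        # majority of this contig's classified genes, otherwise None.
        for rank in ranks:
            taxa = [g.taxa[rank] for g in self.genes if rank in g.taxa]
            if not taxa:
                self.cons_taxa[rank] = None
                continue
            taxon, count = Counter(taxa).most_common(1)[0]
            self.cons_taxa[rank] = taxon if count / len(taxa) > 0.5 else None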
Example #8
def main(args):
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["blastn"])

    print("\u001b[1m" + "• Counting tetranucleotides" + "\u001b[0m")
    # init data
    contigs = {}
    contig_length_list = []
    for id, seq in utilities.parse_fasta(args["fna"]):
        contig = Contig()
        contig.id = id
        contig.seq = str(seq)
        contig.kmers = init_kmers()
        contigs[id] = contig
        contig_length_list.append(len(seq))

    # count canonical 4-mers: try the forward k-mer, fall back to its reverse
    # complement, and skip k-mers matching neither key (e.g. those containing N)
    for contig in contigs.values():
        for i in range(len(contig.seq) - 3):
            kmer_fwd = contig.seq[i : i + 4]
            if kmer_fwd in contig.kmers:
                contig.kmers[kmer_fwd] += 1
            else:
                kmer_rev = utilities.reverse_complement(kmer_fwd)
                if kmer_rev in contig.kmers:
                    contig.kmers[kmer_rev] += 1

    print("\u001b[1m" + "\n• Normalizing counts" + "\u001b[0m")
    for contig in contigs.values():
        total = float(sum(contig.kmers.values()))
        for kmer, count in contig.kmers.items():
            contig.kmers[kmer] = 100 * count / total if total > 0 else 0.0
    print("\u001b[1m" + "\n• Performing PCA" + "\u001b[0m")
    # columns = contigs, rows = normalized 4-mer frequencies
    df = pd.DataFrame({c.id: c.kmers for c in contigs.values()})
    pca = PCA(n_components=1)
    pca.fit(df)
    pc1 = pca.components_[0]
    if args["weighted_mean"]:
        print(
            "\u001b[1m"
            + "\n• Computing per-contig deviation from the weighted mean along the first principal component"
            + "\u001b[0m"
        )
        reference_pc = np.average(pc1, weights=contig_length_list)
    else:
        print(
            "\u001b[1m"
            + "\n• Computing per-contig deviation from the mean along the first principal component"
            + "\u001b[0m"
        )
        reference_pc = np.average(pc1)
    for contig_id, contig_pc in zip(list(df.columns), pc1):
        contigs[contig_id].pc = contig_pc
        contigs[contig_id].values = {}
        contigs[contig_id].values["delta"] = abs(contig_pc - reference_pc)

    print("\u001b[1m" + "\n• Identifying outlier contigs" + "\u001b[0m")
    flagged = [
        contig.id
        for contig in contigs.values()
        if contig.values["delta"] > args["cutoff"]
    ]

    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
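`init_kmers` and `utilities.reverse_complement` are not shown. Here is a sketch consistent with the counting loop above, which expects exactly one key per forward/reverse-complement pair; keeping the lexicographically smaller k-mer of each pair is one common convention and is assumed here:

from itertools import product

COMPLEMENT = str.maketrans("ACGT", "TGCA")

def reverse_complement(kmer):
    # Reverse-complement an A/C/G/T string; other characters (e.g. N) pass
    # through unchanged, so such 4-mers simply miss both dictionary lookups.
    return kmer.translate(COMPLEMENT)[::-1]

def init_kmers():
    # Hypothetical sketch: one counter per canonical 4-mer, i.e. the
    # lexicographically smaller of each forward/reverse-complement pair
    # (136 keys out of the 256 possible 4-mers).
    kmers = {}
    for tup in product("ACGT", repeat=4):
        kmer = "".join(tup)
        canonical = min(kmer, reverse_complement(kmer))
        kmers[canonical] = 0
    return kmers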