Example #1
def main(args):
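    # Pipeline: call genes (Prodigal) -> find PhyEco marker genes (HMMER) ->
    # BLAST markers against the database -> flag taxonomic outlier contigs.
    # Assumes module-local helpers: utilities, extract_homologs,
    # align_homologs, flag_contigs.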
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["prodigal", "hmmsearch", "blastp", "blastn"])
    utilities.check_database(args)
    print("\u001b[1m" + "• Calling genes with Prodigal" + "\u001b[0m")
    utilities.run_prodigal(args["fna"], args["tmp_dir"])
    print(f"  all genes: {args['tmp_dir']}/genes.[ffn|faa]")
    print("\u001b[1m" +
          "\n• Identifying PhyEco phylogenetic marker genes with HMMER" +
          "\u001b[0m")
    utilities.run_hmmsearch(args["db"], args["tmp_dir"], args["tmp_dir"],
                            args["threads"])
    extract_homologs(args["tmp_dir"])
    print(f"  hmm results: {args['tmp_dir']}/phyeco.hmmsearch")
    print(f"  marker genes: {args['tmp_dir']}/markers")
    print(
        "\u001b[1m" +
        "\n• Performing pairwise BLAST alignment of marker genes against database"
        + "\u001b[0m")
    align_homologs(args["db"], args["tmp_dir"], args["seq_type"],
                   args["threads"])
    print(f"  blast results: {args['tmp_dir']}/alns")
    print("\u001b[1m" + "\n• Finding taxonomic outliers" + "\u001b[0m")
    flagged = flag_contigs(args["db"], args["tmp_dir"], args)
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
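
Note: `utilities.run_prodigal` above is assumed to be a thin wrapper around the Prodigal CLI. A minimal sketch, using Prodigal's standard flags (`-i` input, `-a` protein FASTA, `-d` nucleotide FASTA) and matching the genes.[ffn|faa] paths printed by main():

import subprocess

def run_prodigal(fna_path, tmp_dir):
    # Hypothetical wrapper: predict genes on the input contigs and write
    # genes.faa / genes.ffn into tmp_dir, as expected by main() above.
    subprocess.run(
        [
            "prodigal",
            "-i", fna_path,                # input contigs (FASTA)
            "-a", f"{tmp_dir}/genes.faa",  # predicted proteins
            "-d", f"{tmp_dir}/genes.ffn",  # predicted gene sequences
        ],
        check=True,
        capture_output=True,  # keep Prodigal's coordinate output off stdout
    )
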
Example #2
def main(args):
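    # Screen the bin against known-contaminant references (human hg38, phiX)
    # with BLASTN and flag any contig with a hit.
    # Assumes module-local helpers: utilities, run_blastn.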
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["blastn"])
    utilities.check_database(args)
    print("\u001b[1m" + "• Searching database with BLASTN" + "\u001b[0m")
    for target in ["hg38", "phix"]:
        db = f"{args['db']}/known-contam/{target}/{target}"
        out = f"{args['tmp_dir']}/{target}.m8"
        run_blastn(
            args["fna"],
            db,
            out,
            args["threads"],
            args["qcov"],
            args["pid"],
            args["evalue"],
        )
    print("\u001b[1m" + "\n• Identifying contigs with hits to database" + "\u001b[0m")
    flagged = set()
    for target in ["hg38", "phix"]:
        out = f"{args['tmp_dir']}/{target}.m8"
        for r in utilities.parse_blast(out):
            flagged.add(r["qname"])
    flagged = list(flagged)
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
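
Note: `utilities.parse_blast` is assumed to yield one record per line of tabular BLAST output (m8 / `-outfmt 6`), with the query name in the first column; that is the only field main() uses here. A minimal sketch under that assumption:

def parse_blast(path):
    # Hypothetical reader for 12-column tabular BLAST output.
    fields = [
        "qname", "tname", "pid", "aln", "mis", "gap",
        "qstart", "qend", "tstart", "tend", "evalue", "score",
    ]
    with open(path) as f:
        for line in f:
            yield dict(zip(fields, line.rstrip("\n").split("\t")))
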
Example #3
import sys

import numpy as np
import pandas as pd


def main(args):
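    # Flag contigs whose read coverage deviates from the bin-wide (optionally
    # length-weighted) mean coverage by more than max_deviation-fold.
    # Assumes a module-local `utilities` helper.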
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["coverm"])
    print("\u001b[1m" + "• Computing contig coverage" + "\u001b[0m")
    utilities.run_coverm(args["bams"], args["tmp_dir"], args["threads"])
    coverage_df = pd.read_csv(f"{args['tmp_dir']}/coverage.tsv",
                              sep="\t",
                              index_col=0)
    contig_id_list = []
    contig_length_list = []
    for contig_id, seq in utilities.parse_fasta(args["fna"]):
        contig_id_list.append(contig_id)
        contig_length_list.append(len(seq))
    contig_coverage_df = coverage_df.loc[contig_id_list]
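    # use the sample (BAM) with the highest mean contig coverage for outlier detection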
    largest_mean_coverage_sample = contig_coverage_df.mean(axis=0).idxmax()
    if contig_coverage_df.shape[1] > 1:
        print(
            "\u001b[1m" +
            f"\n• Sample being used for outlier detection: {largest_mean_coverage_sample.split()[0]}"
            + "\u001b[0m")
    contig_coverage_df = contig_coverage_df.loc[:,
                                                largest_mean_coverage_sample]
    if contig_coverage_df.mean() < 1:
        sys.exit(
            "\nError: The average coverage is less than 1 in all the supplied BAM files"
        )
    if args["weighted_mean"]:
        print(
            "\u001b[1m" +
            "\n• Computing per-contig deviation from the weighted mean coverage"
            + "\u001b[0m")
        reference = np.average(contig_coverage_df.values,
                               weights=contig_length_list)
    else:
        print("\u001b[1m" +
              "\n• Computing per-contig deviation from the mean coverage" +
              "\u001b[0m")
        reference = contig_coverage_df.mean()
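    # a contig is an outlier if its coverage differs from the reference by at
    # least max_deviation-fold in either direction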
    outliers = ((contig_coverage_df / reference) >= args["max_deviation"]) | (
        (contig_coverage_df / reference) <= 1 / args["max_deviation"])
    print("\u001b[1m" + "\n• Identifying outlier contigs" + "\u001b[0m")
    flagged = contig_coverage_df.loc[outliers].index.tolist()
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
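
Note: `utilities.run_coverm` is assumed to wrap the CoverM CLI. A minimal sketch that writes the coverage.tsv read above (mean per-contig coverage, one column per BAM; CoverM's "sample Mean"-style column headers explain the .split()[0] when printing the sample name):

import subprocess

def run_coverm(bam_paths, tmp_dir, threads):
    # Hypothetical wrapper: mean per-contig coverage across all BAMs,
    # written to the coverage.tsv consumed by main() above.
    with open(f"{tmp_dir}/coverage.tsv", "w") as out:
        subprocess.run(
            ["coverm", "contig",
             "--bam-files", *bam_paths,
             "--methods", "mean",
             "--threads", str(threads)],
            stdout=out,
            check=True,
        )
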
Example #4
import os
import sys


def main(args):
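    # Find conspecific genomes with Mash, align the bin's contigs against
    # them, and flag contigs with no alignments to any close relative.
    # Assumes module-local helpers: utilities, run_mash, find_conspecific,
    # align_contigs, find_contig_targets, flag_contigs.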
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["mash"])
    if not os.path.exists(args["mash_sketch"]):
        sys.exit(f"\nError: mash sketch '{args['mash_sketch']}' not found\n")
    print("\u001b[1m" + "• Finding conspecific genomes in database" +
          "\u001b[0m")
    run_mash(args["mash_sketch"], args["fna"], args["tmp_dir"],
             args["threads"])
    genomes = find_conspecific(args["tmp_dir"], args["mash_dist"],
                               args["exclude"])
    print(f"  {len(genomes)} genomes within {args['mash_dist']} mash-dist")
    out = f"{args['tmp_dir']}/conspecific.list"
    with open(out, "w") as f:
        f.write("genome_id\tmash_dist\n")
        for genome_id, mash_dist in genomes:
            f.write(genome_id + "\t" + str(mash_dist) + "\n")
    print(f"  list of genomes: {out}")
    print(f"  mash output: {args['tmp_dir']}/mash.dist")
    if len(genomes) < args["min_genomes"]:
        sys.exit("\nError: insufficient number of conspecific genomes\n")
    if len(genomes) > args["max_genomes"]:
        print("\u001b[1m" +
              f"\n• Selecting top {args['max_genomes']} most-similar genomes" +
              "\u001b[0m")
        genomes = genomes[:args["max_genomes"]]
        out = f"{args['tmp_dir']}/conspecific_subset.list"
        with open(out, "w") as f:
            f.write("genome_id\tmash_dist\n")
            for genome_id, mash_dist in genomes:
                f.write(genome_id + "\t" + str(mash_dist) + "\n")
        print(f"  list of genomes: {out}")
    print(
        "\u001b[1m" +
        "\n• Performing pairwise alignment of contigs in bin to database genomes"
        + "\u001b[0m")
    alignments = align_contigs(args, genomes)
    num_alns = sum(len(aln.splitlines()) for aln in alignments)
    print(f"  total alignments: {num_alns}")
    print("\u001b[1m" + "\n• Summarizing alignments" + "\u001b[0m")
    contigs = find_contig_targets(args, genomes, alignments)
    out = f"{args['tmp_dir']}/contig_hits.tsv"
    with open(out, "w") as f:
        f.write("contig_id\tlength\talignment_rate\n")
        for contig, values in contigs.items():
            row = [
                contig,
                str(values["len"]), f"{values['hits']}/{len(genomes)}"
            ]
            f.write("\t".join(row) + "\n")
    print(f"  contig features: {out}")
    print("\u001b[1m" +
          "\n• Identifying contigs with no conspecific alignments" +
          "\u001b[0m")
    flagged = flag_contigs(args, contigs)
    out = f"{args['tmp_dir']}/flagged_contigs"
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
    print(f"  {len(flagged)} flagged contigs: {out}")
Example #5
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA  # assumed source of the PCA used below


def main(args):
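    # Flag contigs whose tetranucleotide composition is an outlier along the
    # first principal component of canonical 4-mer frequencies.
    # Assumes module-local helpers: utilities, Contig, init_kmers.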
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["blastn"])

    print("\u001b[1m" + "• Counting tetranucleotides" + "\u001b[0m")
    # init data
    contigs = {}
    contig_length_list = []
    for contig_id, seq in utilities.parse_fasta(args["fna"]):
        contig = Contig()
        contig.id = contig_id
        contig.seq = str(seq)
        contig.kmers = init_kmers()
        contigs[contig_id] = contig
        contig_length_list.append(len(seq))

    # count kmers
    for contig in contigs.values():
        for i in range(len(contig.seq) - 3):
            kmer_fwd = contig.seq[i : i + 4]
            if kmer_fwd in contig.kmers:
                contig.kmers[kmer_fwd] += 1
            else:
                # skip 4-mers containing ambiguous bases (e.g. N): neither the
                # forward k-mer nor its reverse complement is a canonical key
                kmer_rev = utilities.reverse_complement(kmer_fwd)
                if kmer_rev in contig.kmers:
                    contig.kmers[kmer_rev] += 1

    print("\u001b[1m" + "\n• Normalizing counts" + "\u001b[0m")
    for contig in contigs.values():
        total = float(sum(contig.kmers.values()))
        for kmer, count in contig.kmers.items():
            contig.kmers[kmer] = 100 * count / total if total > 0 else 0.0
    print("\u001b[1m" + "\n• Performing PCA" + "\u001b[0m")
    df = pd.DataFrame({c.id: c.kmers for c in contigs.values()})
    pca = PCA(n_components=1)
    pca.fit(df)
    pc1 = pca.components_[0]
    if args["weighted_mean"]:
        print(
            "\u001b[1m"
            + "\n• Computing per-contig deviation from the weighted mean along the first principal component"
            + "\u001b[0m"
        )
        reference_pc = np.average(pc1, weights=contig_length_list)
    else:
        print(
            "\u001b[1m"
            + "\n• Computing per-contig deviation from the mean along the first principal component"
            + "\u001b[0m"
        )
        reference_pc = np.average(pc1)
    for contig_id, contig_pc in zip(df.columns, pc1):
        contigs[contig_id].pc = contig_pc
        contigs[contig_id].values = {}
        contigs[contig_id].values["delta"] = abs(contig_pc - reference_pc)

    print("\u001b[1m" + "\n• Identifying outlier contigs" + "\u001b[0m")
    flagged = [
        contig.id
        for contig in contigs.values()
        if contig.values["delta"] > args["cutoff"]
    ]

    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
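
Note: `init_kmers` is assumed to return a zeroed counter for each canonical tetranucleotide (the lexicographically smaller of a 4-mer and its reverse complement; 136 keys in total), which is why the counting loop above falls back to the reverse complement. A minimal sketch:

import itertools

def init_kmers():
    # Hypothetical helper: one zeroed counter per canonical 4-mer.
    complement = str.maketrans("ACGT", "TGCA")
    kmers = {}
    for tup in itertools.product("ACGT", repeat=4):
        kmer = "".join(tup)
        rev_comp = kmer[::-1].translate(complement)
        kmers[min(kmer, rev_comp)] = 0
    return kmers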