Exemple #1
0
def main( ):
    """Run blastn on the query sequences and log the executed command."""
    args = get_args( )
    # default output path: derived from the query file name, written to cwd
    if args.out is None:
        args.out = wu.name2path( wu.path2name( args.query ), ".", ".blastout" )
    # assemble the blastn invocation from a whitespace-joined template
    template = " ".join( [
        "{BLASTN}",
        "-query {QUERY}",
        "-db {DB}",
        "-out {OUTFILE}",
        "-max_target_seqs {MAXTAR}",
        "-num_threads {THREADS}",
        "-outfmt '{FORMAT}'",
        ] )
    command = template.format(
        BLASTN=args.blastn,
        QUERY=args.query,
        DB=args.db,
        OUTFILE=args.out,
        MAXTAR=wu.c_max_target_seqs,
        THREADS=args.threads,
        FORMAT=wu.c_blast_format_string,
        )
    wu.say( "Executing command:", command )
    os.system( command )
    wu.say( "Finished successfully." )
Exemple #2
0
def write_detailed_output(
    basename=None,
    outdir=None,
    contig_coverage=None,
    contig_hits=None,
):
    """Write optional per-site coverage and per-gene-pair hit tables.

    basename        : prefix for output file names
    outdir          : directory for the output files
    contig_coverage : mapping contig -> per-site depth array
    contig_hits     : mapping contig -> {(gene1, gene2): hit count}
    """

    # first file can get pretty big, hence gzip
    p_site_hits = wu.name2path(basename, outdir, ".site_hits.tsv.gz")
    p_gene_hits = wu.name2path(basename, outdir, ".gene_hits.tsv")

    # write: site_hits
    wu.say("Writing site hits.")
    with wu.try_open(p_site_hits, "w") as site_fh:
        # header row (no rowdict)
        wu.write_rowdict(
            format=c_formats["site_hits"],
            file=site_fh,
        )
        for contig in sorted(contig_coverage):
            depths = contig_coverage[contig]
            wu.write_rowdict(
                rowdict={
                    "contig": contig,
                    "mean": np.mean(depths),
                    "stdev": np.std(depths),
                    "depths": " ".join(["{:.0f}".format(d) for d in depths]),
                },
                format=c_formats["site_hits"],
                file=site_fh,
            )

    # write: gene_hits
    wu.say("Writing gene-pair hits.")
    with wu.try_open(p_gene_hits, "w") as gene_fh:
        # header row (no rowdict)
        wu.write_rowdict(
            format=c_formats["gene_hits"],
            file=gene_fh,
        )
        for contig in sorted(contig_hits):
            for gene1, gene2 in sorted(contig_hits[contig]):
                # pairs are stored symmetrically; emit each unordered pair once
                if gene2 > gene1:
                    continue
                wu.write_rowdict(
                    rowdict={
                        "contig": contig,
                        "gene1": gene1,
                        "gene2": gene2,
                        "hits": contig_hits[contig][(gene1, gene2)],
                    },
                    format=c_formats["gene_hits"],
                    file=gene_fh,
                )
Exemple #3
0
def bowtie2_build(
    p_bowtie2_build=None,
    p_contigs=None,
    p_index=None,
    args=None,
):
    """Build a bowtie2 index for the contigs (or reuse one under --resume)."""
    fields = {
        "PROG": p_bowtie2_build,
        "CONTIGS": p_contigs,
        "INDEX": p_index,
    }
    # with --resume, skip the build when the first index shard already exists
    if args.resume and os.path.exists(p_index + ".1.bt2"):
        wu.say("RESUMING: The index <{INDEX}> already exists.".format(**fields))
        return None
    wu.say("Indexing <{CONTIGS}> to <{INDEX}>.".format(**fields))
    command = " ".join(["{PROG}", "{CONTIGS}", "{INDEX}"]).format(**fields)
    os.system(command)
    wu.say("Build complete.")
    return None
def main( ):
    """Call genes from blast hits and emit them as GFF records."""
    args = get_args( )
    # default gff path: derived from the blast output name, written to cwd
    if args.gff is None:
        args.gff = wu.name2path( wu.path2name( args.blastout ), ".", ".gff" )

    with wu.try_open( args.gff, "w" ) as fh_gff:
        writer = csv.writer( fh_gff, csv.excel_tab )
        for contig, hits in wu.iter_contig_hits( args.blastout ):
            intervals = hits2ints(
                hits,
                args.min_scov,
                )
            intervals = overlap_intervals(
                intervals,
                args.min_overlap,
                args.stranded == "on",
                )
            for start, stop, strand in intervals:
                # interval coordinates are inclusive
                if stop - start + 1 < args.min_gene_length:
                    continue
                row = [
                    contig,
                    "waafle_genecaller",
                    "gene",
                    start,
                    stop,
                    ".",
                    strand,
                    0,
                    ".",
                    ]
                writer.writerow( [str( field ) for field in row] )

    wu.say( "Finished successfully." )
Exemple #5
0
def concordant_hits(p_sam=None, ):
    """Yield [mate1, mate2] pairs of concordant SAM hits.

    Consecutive alignments are paired; a pair is kept only when both mates
    share the same query id and align to the same subject (contig).
    """
    counter = 0
    earlier = None
    later = None
    for hit in wu.iter_sam_hits(p_sam):
        # progress
        counter += 1
        if counter % int(1e5) == 0:
            wu.say("  SAM alignments processed: {:.1f}M".format(counter / 1e6))
        # weave: compare each hit against the one before it
        earlier, later = later, hit
        # skip the very first hit (no predecessor yet)
        if earlier is None:
            continue
        # reject non-mate and discordant pairs in one shot
        if earlier.qseqid != later.qseqid or earlier.sseqid != later.sseqid:
            continue
        yield [earlier, later]
Exemple #6
0
def bowtie2_align(
    p_bowtie2=None,
    p_reads1=None,
    p_reads2=None,
    p_index=None,
    p_sam=None,
    args=None,
):
    """Align paired reads to the contig index (or reuse a SAM under --resume)."""
    fields = {
        "PROG": p_bowtie2,
        "READS1": p_reads1,
        "READS2": p_reads2,
        "INDEX": p_index,
        "SAM": p_sam,
        "THREADS": args.threads,
    }
    # with --resume, skip alignment when the SAM file is already present
    if args.resume and os.path.exists(p_sam):
        wu.say(
            "RESUMING: A sam mapping <{SAM}> already exists.".format(**fields))
        return None
    wu.say("Performing bowtie2 alignment.")
    pieces = [
        "{PROG}",
        "-x {INDEX}",
        "-1 {READS1}",
        "-2 {READS2}",
        "-S {SAM}",
        "--threads {THREADS}",
        "--no-mixed",
        "--no-discordant",
    ]
    os.system(" ".join(pieces).format(**fields))
    wu.say("Alignment complete.")
    return None
Exemple #7
0
def main():
    """Quantify read support across gene junctions on each contig.

    Workflow: resolve output paths; obtain a SAM alignment (either a
    user-supplied --sam file or a fresh bowtie2 build + align pass over the
    paired reads); stream concordant read pairs to accumulate per-site
    coverage and per-gene-pair hit counts; finally write the junction
    report (plus optional detailed outputs).
    """

    # begin
    args = get_args()
    p_contigs = args.contigs
    p_gff = args.gff

    # define files
    p_outdir = args.outdir
    p_tmpdir = args.tmpdir
    basename = args.basename
    if basename is None:
        # default basename derives from the contigs file name
        basename = wu.path2name(p_contigs)
    p_index = wu.name2path(basename, p_tmpdir, ".index")
    p_sam = wu.name2path(basename, p_tmpdir, ".sam")
    p_junctions = wu.name2path(basename, p_outdir, ".junctions.tsv")

    # alignment workflow
    if args.sam is not None:
        # reuse an existing alignment rather than running bowtie2
        p_sam = args.sam
        wu.say("Using specified SAM file:", p_sam)
    elif args.reads1 is not None and args.reads2 is not None:
        # build process
        bowtie2_build(
            p_bowtie2_build=args.bowtie2_build,
            p_contigs=args.contigs,
            p_index=p_index,
            args=args,
        )
        # alignment process
        bowtie2_align(
            p_bowtie2=args.bowtie2,
            p_reads1=args.reads1,
            p_reads2=args.reads2,
            p_index=p_index,
            p_sam=p_sam,
            args=args,
        )
    else:
        wu.die("Must provide READS or SAM file.")

    # load contig data
    wu.say("Loading contig lengths.")
    contig_lengths = wu.read_contig_lengths(p_contigs)
    # one zero-initialized depth counter per contig position
    contig_coverage = {}
    for name, length in contig_lengths.items():
        contig_coverage[name] = np.zeros(length)
    wu.say("Loading contig gene coordinates.")
    contig_loci = {}
    for name, loci in wu.iter_contig_loci(p_gff):
        contig_loci[name] = loci
    # contig -> Counter over (gene1, gene2) read-pair hit counts
    contig_hits = {}

    # post-processing workflow
    wu.say("Processing SAM file.")
    for mate1, mate2 in concordant_hits(p_sam):
        contig = mate1.sseqid
        inner = contig_hits.setdefault(contig, Counter())
        # update pers-site coverage (note: base-0 start and pythonic end)
        coords = [mate1.sstart, mate1.send, mate2.sstart, mate2.send]
        L = min(coords) - 1
        R = max(coords) - 1
        # increment depth across the full fragment span (inclusive of R)
        contig_coverage[contig][L:R + 1] += 1
        # find hit loci
        hits = find_hit_loci(
            mate1=mate1,
            mate2=mate2,
            loci=contig_loci.get(contig, []),
            args=args,
        )
        # attach self counts
        for code in hits:
            inner[(code, code)] += 1
        # attach pair counts (note: symmetric storage for safer lookup)
        for code1 in hits:
            for code2 in hits:
                if code1 != code2:
                    inner[(code1, code2)] += 1

    # detailed output?
    if args.write_detailed_output:
        write_detailed_output(
            basename=basename,
            outdir=p_outdir,
            contig_coverage=contig_coverage,
            contig_hits=contig_hits,
        )

    # write junction report
    wu.say("Writing junction report.")
    with wu.try_open(p_junctions, "w") as fh:
        # header row (no rowdict)
        wu.write_rowdict(
            format=c_formats["junctions"],
            file=fh,
        )
        for c in sorted(contig_lengths):
            rowdicts = evaluate_contig(
                loci=contig_loci.get(c, []),
                coverage=contig_coverage[c],
                gene_hits=contig_hits.get(c, {}),
                args=args,
            )
            for rowdict in rowdicts:
                rowdict["contig"] = c
                wu.write_rowdict(
                    rowdict=rowdict,
                    format=c_formats["junctions"],
                    file=fh,
                )

    # end
    wu.say("Finished successfully.")
Exemple #8
0
def main():
    """Filter contig-profile rows by junction read support.

    Loads the junctions report, then copies contig-profile rows whose
    AB/BA gene-pair junctions have sufficient support (hit count OR
    coverage ratio) into a new ".qc_pass" file; logs the failure rate.
    """

    args = get_args()

    # load junctions data: per-contig junction hit counts and coverage ratios
    hits = {}
    covs = {}
    wu.say("Loading junctions report.")
    F = wu.Frame(args.junctions)
    # loop over junctions
    for R in F.iter_rowdicts():
        contig = R["CONTIG"]
        gene1 = R["GENE1"]
        gene2 = R["GENE2"]
        hits.setdefault(contig, {})[(gene1, gene2)] = int(R["JUNCTION_HITS"])
        covs.setdefault(contig, {})[(gene1, gene2)] = float(R["RATIO"])

    # filter contigs
    total = 0
    failed = 0
    outfile = args.outfile
    if outfile is None:
        outfile = args.contig_profile + ".qc_pass"
    # load results, open new file, write headers
    F = wu.Frame(args.contig_profile)
    # FIX: close the output handle deterministically (was never closed)
    with wu.try_open(outfile, "w") as fh:
        wu.write_rowdict(None, F.headers, file=fh)
        # loop over contigs
        for R in F.iter_rowdicts():
            total += 1
            contig = R["CONTIG_NAME"]
            # contig-level filters: require junction data for the contig
            if contig not in hits or contig not in covs:
                failed += 1
                wu.say("Missing junction data for contig:", contig)
                continue
            loci = R["LOCI"].split("|")
            synteny = R["SYNTENY"]
            qc_pass = True
            for i in range(len(loci) - 1):
                spair = synteny[i] + synteny[i + 1]
                # only A/B transitions constitute putative LGT junctions
                if spair not in ["AB", "BA"]:
                    continue
                gpair = (loci[i], loci[i + 1])
                # a junction passes if EITHER support criterion is met;
                # missing junctions default to -1 (always below threshold)
                my_hits = hits[contig].get(gpair, -1) >= args.min_junction_hits
                my_covs = covs[contig].get(gpair, -1) >= args.min_junction_ratio
                qc_pass = qc_pass and (my_hits or my_covs)
            if not qc_pass:
                failed += 1
                wu.say("Failed QC:", contig)
            else:
                wu.write_rowdict(R, F.headers, file=fh)

    # wrap-up
    # FIX: guard against ZeroDivisionError when the profile has no contigs
    pct = 100 * failed / float(total) if total else 0.0
    wu.say("Failure rate: {} of {} ({:.1f}%)".format(failed, total, pct))
    wu.say("Finished successfully.")
Exemple #9
0
def main():
    """Classify contigs from blast hits, a GFF, and a taxonomy.

    Initializes one Contig object per input sequence, attaches gene
    coordinates from the GFF, scores each contig's blast hits (optionally
    after raising hit taxonomy), then writes the main output files and an
    optional gzipped details table.
    """

    args = get_args()
    wu.say("Loading taxonomy.")
    taxonomy = wu.Taxonomy(args.taxonomy)

    # initialize contigs
    wu.say("Initializing contigs.")
    contigs = {}
    contig_lengths = wu.read_contig_lengths(args.contigs)
    # 1-based running index, used only for progress reporting below
    index = 0
    for contig_name, length in contig_lengths.items():
        C = Contig(contig_name, args)
        C.length = length
        index += 1
        C.index = index
        contigs[contig_name] = C

    # process gff
    wu.say("Adding gene coordinates.")
    for contig_name, loci in wu.iter_contig_loci(args.gff,
                                                 attach_annotations=False):
        if contig_name not in contigs:
            wu.say("  Unknown contig in <gff> file", contig_name)
            continue
        C = contigs[contig_name]
        C.attach_loci(loci)

    # check basename in preparation for writing output
    if args.basename is None:
        # default: contigs file name up to its first "."
        args.basename = os.path.split(args.contigs)[1].split(".")[0]

    # prepare details file
    details = None
    if args.write_details:
        details = wu.try_open(
            os.path.join(args.outdir, args.basename + ".details.tsv.gz"), "w")
        # headers
        wu.write_rowdict(None, c_formats["details"], file=details)

    # parse hits, process contigs
    wu.say("Analyzing contigs.")

    # major contig loop
    for contig_name, hits in wu.iter_contig_hits(args.blastout):
        if contig_name not in contigs:
            wu.say("  Unknown contig in <blastout> file", contig_name)
            continue
        # this is a good contig
        C = contigs[contig_name]
        if not args.quiet:
            wu.say("  #{:>7,} of {:>7,}".format(C.index, len(contigs)))
        # attach hits to genes
        C.attach_hits(hits)
        C.update_gene_scores()
        # initial jumps?
        if args.jump_taxonomy is not None:
            # raise hit taxonomy by args.jump_taxonomy levels before evaluation
            for j in range(args.jump_taxonomy):
                C.raise_taxonomy(taxonomy)
        # evaluate; note: the 'ignore' option can result in "empty" contigs
        if not all([L.ignore for L in C.loci]):
            evaluate_contig(C, taxonomy, details, args)

    # wrap up
    write_main_output_files(contigs, taxonomy, args)
    wu.say("Finished successfully.")
    if details is not None:
        details.close()
Exemple #10
0
def write_main_output_files(contigs, taxonomy, args):
    """Write the lgt, no_lgt, and unclassified result tables.

    contigs  : mapping of contig name -> evaluated Contig object
    taxonomy : object providing get_lineage / get_lca lookups
    args     : parsed CLI args (uses basename and outdir)

    NOTE(review): this appends annotation columns to the module-level
    c_formats lists in place; calling it a second time in one process
    would duplicate those columns.
    """

    # open output file handles
    wu.say("Initializing outputs.")
    handles = {}
    for option in ["lgt", "no_lgt", "unclassified"]:
        file_name = ".".join([args.basename, option, "tsv"])
        handles[option] = open(os.path.join(args.outdir, file_name), "w")

    # determine possible function annotation systems
    systems = set()
    for contig in contigs.values():
        for locus in contig.loci:
            for system in locus.annotations:
                systems.add(system)
    # presumably c_main_formats shares its keys with c_formats — the
    # annotation columns are appended to the c_formats entries (verify)
    for option in c_main_formats:
        for s in sorted(systems):
            c_formats[option].append(c_annotation_prefix + s)

    # print headers
    for name in handles:
        wu.write_rowdict(None, c_main_formats[name], file=handles[name])

    # write results (sorted loop over contigs)
    for contig_name in sorted(contigs):
        contig = contigs[contig_name]
        best_one = contig.best_one
        best_two = contig.best_two
        # unclassified: neither the one-clade nor two-clade model passed
        if not_ok(best_one) and not_ok(best_two):
            rowdict = {
                "contig_name": contig_name,
                "call": "unclassified",
                "contig_length": contig.length,
                "loci": make_loci_field(contig.loci),
            }
            attach_rowdict_functions(rowdict, contig, systems)
            wu.write_rowdict(rowdict, c_formats["unclassified"],
                             handles["unclassified"])
        # no_lgt: a single clade explains the contig
        elif is_ok(best_one):
            clade = best_one.clade1
            rowdict = {
                "contig_name": contig_name,
                "call": "no_lgt",
                "contig_length": contig.length,
                "min_score": best_one.crit,
                "avg_score": best_one.rank,
                "synteny": best_one.synteny,
                "clade": clade,
                "taxonomy": c_delim2.join(taxonomy.get_lineage(clade)),
                "melded": make_tails_field(best_one.tails1),
                "loci": make_loci_field(contig.loci),
            }
            attach_rowdict_functions(rowdict, contig, systems)
            wu.write_rowdict(rowdict, c_formats["no_lgt"], handles["no_lgt"])
        # lgt: the two-clade model passed (only reached if no_lgt did not)
        elif is_ok(best_two):
            clade1, clade2 = best_two.clade1, best_two.clade2
            rowdict = {
                "contig_name": contig_name,
                "call": "lgt",
                "contig_length": contig.length,
                "min_max_score": best_two.crit,
                "avg_max_score": best_two.rank,
                "synteny": best_two.synteny,
                "direction": best_two.direction,
                "clade_A": clade1,
                "clade_B": clade2,
                "lca": taxonomy.get_lca(clade1, clade2),
                "taxonomy_A": c_delim2.join(taxonomy.get_lineage(clade1)),
                "taxonomy_B": c_delim2.join(taxonomy.get_lineage(clade2)),
                "melded_A": make_tails_field(best_two.tails1),
                "melded_B": make_tails_field(best_two.tails2),
                "loci": make_loci_field(contig.loci),
            }
            attach_rowdict_functions(rowdict, contig, systems)
            wu.write_rowdict(rowdict, c_formats["lgt"], handles["lgt"])

    # wrap up
    for h in handles.values():
        h.close()