def loadHitRegions(blastFile, minLength, options):
    """
    Parse a hit table into a map from read names to lists of (start,end,annot)
    """
    hitMap = {}
    with InputFile(blastFile) as m8stream:
        params = FilterParams.create_from_arguments(options)
        hitcount = 0
        readcount = 0
        keepcount = 0
        for (read, hits) in filterM8Stream(m8stream, params,
                                           return_lines=False):
            readcount += 1
            hitTuples = []
            for hit in hits:
                hitcount += 1
                if abs(hit.qstart - hit.qend) + 1 < minLength:
                    continue

                keepcount += 1
                if hit.format == GFF:
                    annot = "# %d # %d # %s # %s;evalue=%s" % \
                            (hit.qstart, hit.qend,
                             hit.strand, hit.hitDesc, hit.evalue)
                else:
                    try:
                        annot = "%s [%d,%d] %0.1f%% %d bits" % \
                                (hit.hit, hit.hstart, hit.hend,
                                 hit.pctid, hit.score)
                    except AttributeError:
                        annot = "%s [%d,%d] score: %d" % (
                            hit.hit, hit.hstart, hit.hend, hit.score)

                if hit.format == GFF:
                    reverse = hit.strand != "+"
                else:
                    reverse = hit.hstart > hit.hend

                if reverse:
                    # reverse if hit is backwards
                    hitTuples.append((hit.qend, hit.qstart, annot))
                else:
                    hitTuples.append((hit.qstart, hit.qend, annot))
            hitMap[read] = hitTuples

        logging.debug(
            "Kept %d of %d hits from %d lines to %d reads" %
            (keepcount, hitcount, m8stream.lines, readcount))
    return hitMap
Example #2
0
def loadHitRegions(blastFile, minLength, options):
    """
    Parse a hit table into a map from read names to lists of (start,end,annot)
    """
    hitMap = {}
    params = FilterParams.create_from_arguments(options)
    m8stream = M8Stream(blastFile)
    hitcount = 0
    readcount = 0
    keepcount = 0
    for (read, hits) in filterM8Stream(m8stream, params, returnLines=False):
        readcount += 1
        hitTuples = []
        for hit in hits:
            hitcount += 1
            if abs(hit.qstart - hit.qend) + 1 < minLength:
                continue

            keepcount += 1
            if hit.format == GFF:
                annot = "# %d # %d # %s # %s;evalue=%s" % (
                    hit.qstart, hit.qend, hit.strand, hit.hitDesc, hit.evalue)
            else:
                try:
                    annot = "%s [%d,%d] %0.1f%% %d bits" % (
                        hit.hit, hit.hstart, hit.hend, hit.pctid, hit.score)
                except AttributeError:
                    annot = "%s [%d,%d] score: %d" % (
                        hit.hit, hit.hstart, hit.hend, hit.score)

            if hit.format == GFF:
                reverse = hit.strand != "+"
            else:
                reverse = hit.hstart > hit.hend

            if reverse:
                # reverse if hit is backwards
                hitTuples.append((hit.qend, hit.qstart, annot))
            else:
                hitTuples.append((hit.qstart, hit.qend, annot))
        hitMap[read] = hitTuples

    logging.debug(
        "Kept %d of %d hits to %d reads" %
        (keepcount, hitcount, readcount))
    return hitMap
Example #3
0
def main():
    description = """
    Take a blast result table and output a subset of hits based on the
    chosen filtering options. If more than one blast file given, use -O
    to get multiple output files, otherwise all output data will be
    concatenated into one output.
    """

    # command line arguments
    parser = argparse.ArgumentParser(description=description,
                                     conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    parser.add_argument("-o",
                        "--outfilenome",
                        dest="outfilename",
                        default=None,
                        metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    parser.add_argument(
        '-O',
        '--autoOutName',
        default=False,
        action='store_true',
        help="Automatically generate output file name from input name "
        "and options. Overridden by -o, cannot be used with data "
        "from STDIN.")
    parser.add_argument('-G',
                        '--gff',
                        default=False,
                        action='store_true',
                        help="output GFF format instead of input format")
    parser.add_argument('hit_table',
                        nargs='*',
                        type=argparse.FileType('rU'),
                        default=[
                            sys.stdin,
                        ],
                        help="Table of search results to be filtered. "
                        "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # check that we have blast file as argument

    # if we're not doing auto file names, wriate all outputs to same file
    if not arguments.autoOutName:
        if arguments.outfilename is not None:
            logging.info("Writing data to %s" % (arguments.outfilename))
            outfile_handle = open(arguments.outfilename, 'w')
        else:
            logging.info("writing data to STDOUT")
            outfile_handle = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    # loop over inputs
    for infile_handle in arguments.hit_table:
        logging.info("reading data from %s" % (infile_handle.name))
        if arguments.autoOutName:
            outfile_handle = open(getOutputFile(infile_handle.name, arguments),
                                  'w')

        # filter
        params = FilterParams.create_from_arguments(arguments)
        filterM8(infile_handle, outfile_handle, params, to_gff=arguments.gff)

        if arguments.autoOutName:
            outfile_handle.close()
        infile_handle.close()
Example #4
0
def main():
    description = """
    Take a blast result table and output a subset of hits based on the
    chosen filtering options. If more than one blast file given, use -O
    to get multiple output files, otherwise all output data will be
    concatenated into one output.
    """

# command line arguments
    parser = argparse.ArgumentParser(
        description=description,
        conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    parser.add_argument(
        "-o",
        "--outfilenome",
        dest="outfilename",
        default=None,
        metavar="OUTFILENAME",
        help="Write masked fasta output to OUTFILENAME.")
    parser.add_argument(
        '-O',
        '--autoOutName',
        default=False,
        action='store_true',
        help="Automatically generate output file name from input name "
             "and options. Overridden by -o, cannot be used with data "
             "from STDIN.")
    parser.add_argument('-G', '--gff', default=False, action='store_true',
                        help="output GFF format instead of input format")
    parser.add_argument('hit_table', nargs='*',
                        type=argparse.FileType('rU'), default=[sys.stdin, ],
                        help="Table of search results to be filtered. "
                             "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # check that we have blast file as argument

    # if we're not doing auto file names, wriate all outputs to same file
    if not arguments.autoOutName:
        if arguments.outfilename is not None:
            logging.info("Writing data to %s" % (arguments.outfilename))
            outfile_handle = open(arguments.outfilename, 'w')
        else:
            logging.info("writing data to STDOUT")
            outfile_handle = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    # loop over inputs
    for infile_handle in arguments.hit_table:
        logging.info("reading data from %s" % (infile_handle.name))
        if arguments.autoOutName:
            outfile_handle = open(
                getOutputFile(
                    infile_handle.name,
                    arguments),
                'w')

        # filter
        params = FilterParams.create_from_arguments(arguments)
        filterM8(infile_handle, outfile_handle, params, to_gff=arguments.gff)

        if arguments.autoOutName:
            outfile_handle.close()
        infile_handle.close()
Example #5
0
def main():
    # command line arguments
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler='resolve')

    # default to non-overlapping=0
    add_hit_table_arguments(parser,
                            flags='all',
                            defaults={'nonoverlapping': 0})
    parser.add_argument("-o",
                        "--outfilenome",
                        dest="outfilename",
                        default=None,
                        metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    parser.add_argument('hit_table',
                        nargs='?',
                        type=argparse.FileType('rU'),
                        default=sys.stdin,
                        help="Table of search results to be filtered. "
                        "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # output file or STDOUT
    if arguments.outfilename is not None:
        logging.info("Writing data to %s" % (arguments.outfilename))
        outfile_handle = open(arguments.outfilename, 'w')
    else:
        logging.info("writing data to STDOUT")
        outfile_handle = sys.stdout

    # input file or STDIN (handled by argparse)
    infile_handle = arguments.hit_table
    logging.info("reading data from %s" % (infile_handle.name))

    # filter, but don't apply nonoverlapping yet
    # non-overlapping should be applied per-reference only
    params = FilterParams.create_from_arguments(arguments)
    # save user supplied value for later
    overlap_buffer = params.nonoverlapping
    # turn off for now
    params.set_nonoverlapping(-1)

    # merge
    hit_iter = filterM8Stream(infile_handle, params, return_lines=False)
    for query, query_hits in hit_iter:
        # group by reference hit
        hits_by_ref = defaultdict(list)
        for hit in query_hits:
            hits_by_ref[hit.hit].append(hit)

        # one output for query/reference pair
        for ref, ref_hits in hits_by_ref.items():

            # remove overlaps unless the buffer has been set to <0
            if overlap_buffer >= 0:
                ref_hits = remove_overlapping_hits(
                    ref_hits, on_hit=True, buffer=params.nonoverlapping)
                ref_hits = remove_overlapping_hits(
                    ref_hits, on_hit=False, buffer=params.nonoverlapping)

            # aggregate values
            length, score, identities = 0, 0, 0
            for hit in ref_hits:
                length += hit.mlen
                score += hit.score
                try:
                    # this will be off by 100x
                    identities += hit.pctid * hit.mlen
                except:
                    # just report pctid=0 if no pctid column in input
                    pass

            outfile_handle.write(
                "%s\t%s\t%d\t%d\t%0.2f\n" %
                (query, ref, length, score, identities / length))

    outfile_handle.close()
    infile_handle.close()