コード例 #1
0
ファイル: hits.py プロジェクト: jmeppley/py-metagenomics
def add_taxon_arguments(parser, defaults=None, choices=None):
    """Add taxon-parsing command-line options to an argparse parser.

    parser:   argparse.ArgumentParser (or argument group) to extend
    defaults: optional dict overriding per-option default values
    choices:  optional dict overriding per-option choice tuples
    """
    # BUG FIX: use None sentinels instead of shared mutable {} defaults
    if defaults is None:
        defaults = {}
    if choices is None:
        choices = {}

    # get format and filter_top_pct options from blastm8
    add_hit_table_arguments(parser, defaults,
                            flags=['format', 'filter_top_pct'])

    # specific to taxon parsing:
    parser.add_argument(
        "-m",
        "--mapFile",
        dest="mapFile",
        default=defaults.get("mapFile", None),
        metavar="MAPFILE",
        help="Location of file containing table of with db hit name "
             "as first column and taxa or taxonids in second column. "
             "Defaults to '%s'" % (defaults.get("mapFile", None)))
    parser.add_argument(
        "-p",
        "--parseStyle",
        default=defaults.get("parseStyle", ACCS),
        choices=[ACCS, GIS, ORGS, HITID, HITDESC],
        help="What should be parsed from the hit table: accessions('accs'), "
             "'gis', organsim names in brackets ('orgs'), the full hit "
             "name('hitid'), or the full hit description('hitdesc'). "
             # BUG FIX: key was misspelled "parseStyles", so the help text
             # could advertise a default different from the one actually used
             "(defaults to '%s')" % (defaults.get("parseStyle", ACCS)))
    parser.add_argument(
        "-C",
        "--countMethod",
        dest="countMethod",
        default=defaults.get("countMethod", "first"),
        choices=choices.get(
            'countMethod',
            ('first', 'most', 'all', 'LCA', 'consensus')),
        help="How to deal with counts from multiple hits. (first, most: "
             "can return multiple hits in case of a tie, LCA: MEGAN-like, "
             "all: return every hit, consensus: return None unless all "
             "the same). Default is %s" % (defaults.get("countMethod",
                                                        "first")),
        metavar="COUNTMETHOD")
    add_taxonomy_dir_argument(parser, defaults)
コード例 #2
0
ファイル: hits.py プロジェクト: Piplopp/py-metagenomics
def add_taxon_arguments(parser, defaults=None, choices=None):
    """Add taxon-parsing command-line options to an argparse parser.

    parser:   argparse.ArgumentParser (or argument group) to extend
    defaults: optional dict overriding per-option default values
    choices:  optional dict overriding per-option choice tuples
    """
    # BUG FIX: use None sentinels instead of shared mutable {} defaults
    if defaults is None:
        defaults = {}
    if choices is None:
        choices = {}

    # get format and filter_top_pct options from blastm8
    add_hit_table_arguments(parser, defaults,
                            flags=['format', 'filter_top_pct'])

    # specific to taxon parsing:
    parser.add_argument(
        "-m",
        "--mapFile",
        dest="mapFile",
        default=defaults.get("mapFile", None),
        metavar="MAPFILE",
        help="Location of file containing table of with db hit name "
             "as first column and taxa or taxonids in second column. "
             "Defaults to '%s'" % (defaults.get("mapFile", None)))
    parser.add_argument(
        "-p",
        "--parseStyle",
        default=defaults.get("parseStyle", ACCS),
        choices=[ACCS, GIS, ORGS, HITID, HITDESC],
        help="What should be parsed from the hit table: accessions('accs'), "
             "'gis', organsim names in brackets ('orgs'), the full hit "
             "name('hitid'), or the full hit description('hitdesc'). "
             # BUG FIX: key was misspelled "parseStyles", so the help text
             # could advertise a default different from the one actually used
             "(defaults to '%s')" % (defaults.get("parseStyle", ACCS)))
    parser.add_argument(
        "-C",
        "--countMethod",
        dest="countMethod",
        default=defaults.get("countMethod", "first"),
        choices=choices.get(
            'countMethod',
            ('first', 'most', 'all', 'LCA', 'consensus')),
        help="How to deal with counts from multiple hits. (first, most: "
             "can return multiple hits in case of a tie, LCA: MEGAN-like, "
             "all: return every hit, consensus: return None unless all "
             "the same). Default is %s" % (defaults.get("countMethod",
                                                        "first")),
        metavar="COUNTMETHOD")
    add_taxonomy_dir_argument(parser, defaults)
コード例 #3
0
def main():
    """Filter blast hit table(s) and write the passing subset.

    Reads one or more hit tables (or STDIN), applies the filtering
    options parsed from the command line, and writes results to a
    single output (-o / STDOUT) or to per-input files (-O).
    """
    description = """
    Take a blast result table and output a subset of hits based on the
    chosen filtering options. If more than one blast file given, use -O
    to get multiple output files, otherwise all output data will be
    concatenated into one output.
    """

    # command line arguments
    parser = argparse.ArgumentParser(description=description,
                                     conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    parser.add_argument("-o",
                        "--outfilenome",
                        dest="outfilename",
                        default=None,
                        metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    parser.add_argument(
        '-O',
        '--autoOutName',
        default=False,
        action='store_true',
        help="Automatically generate output file name from input name "
        "and options. Overridden by -o, cannot be used with data "
        "from STDIN.")
    parser.add_argument('-G',
                        '--gff',
                        default=False,
                        action='store_true',
                        help="output GFF format instead of input format")
    # BUG FIX: FileType('rU') -- the 'U' mode flag was removed in
    # Python 3.11; plain 'r' already opens in universal-newline text mode
    parser.add_argument('hit_table',
                        nargs='*',
                        type=argparse.FileType('r'),
                        default=[
                            sys.stdin,
                        ],
                        help="Table of search results to be filtered. "
                        "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # if we're not doing auto file names, write all outputs to same file
    if not arguments.autoOutName:
        if arguments.outfilename is not None:
            logging.info("Writing data to %s" % (arguments.outfilename))
            outfile_handle = open(arguments.outfilename, 'w')
        else:
            logging.info("writing data to STDOUT")
            outfile_handle = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    # loop over inputs
    for infile_handle in arguments.hit_table:
        logging.info("reading data from %s" % (infile_handle.name))
        if arguments.autoOutName:
            outfile_handle = open(getOutputFile(infile_handle.name, arguments),
                                  'w')

        # filter
        params = FilterParams.create_from_arguments(arguments)
        filterM8(infile_handle, outfile_handle, params, to_gff=arguments.gff)

        if arguments.autoOutName:
            outfile_handle.close()
        infile_handle.close()
コード例 #4
0
def add_path_arguments(parser, defaults=None, choices=None, helps=None):
    """Add gene-family/pathway mapping options to an argparse parser.

    parser:   argparse.ArgumentParser to extend
    defaults: optional dict overriding per-option default values
    choices:  optional dict overriding per-option choice tuples
    helps:    optional dict overriding per-option help strings
    """
    # BUG FIX: use None sentinels instead of shared mutable {} defaults
    defaults = {} if defaults is None else defaults
    choices = {} if choices is None else choices
    helps = {} if helps is None else helps

    # get format and filter_top_pct arguments from blastm8
    from edl.hits import HITID, ACCS, GIS, KEGG, HITDESC, PFAM
    from edl.blastm8 import add_hit_table_arguments
    add_hit_table_arguments(parser, defaults, flags=['format',
                                                     'filter_top_pct',
                                                     'sort'
                                                    ])

    # specific to pathway parsing:
    pgroup = parser.add_argument_group(
        "Pathway Arguments",
        "These arguments control the mapping of hits to gene "
        "function heirarchies like KEGG or SEED""")
    pgroup.add_argument(
        "-m",
        "--mapFile",
        dest="mapFile",
        default=defaults.get("mapFile", None),
        metavar="MAPFILE",
        help="Location of file containing table of with db hit name as "
             "first column and geneIDs (Knumber) in second column.")
    pgroup.add_argument(
        "-M",
        "--mapStyle",
        default='auto',
        choices=['auto', 'kegg', 'tab', 'seed'],
        help="What type of mapping file are you using: simple tab "
             "separated list of IDs and kos/subsystems/domains, the "
             "genes_ko.list file from KEGG (which adds ko: to the K "
             "numbers and can have multiple records for each gene id), "
             "or the 3 column file from SEED. By default, this script "
             "will inspect the file and guess, but you can force 'kegg', "
             "'seed' or 'tab' with this argument.")
    default = defaults.get('tab_map_delim', None)
    pgroup.add_argument("--tab_map_delim",
                        default=default,
                        help=("Delimiter to parse multiple assignments in "
                              "map from ids to ko/path/fam. Only used for "
                              "tabular mapping tables. Defaults to {}"
                              .format(str(default))))
    pgroup.add_argument(
        "-p",
        "--parseStyle",
        default=defaults.get("parseStyle", HITID),
        choices=[ACCS, GIS, KEGG, HITID, HITDESC, PFAM],
        help="What should be parsed from the hit table: accessions('accs'), "
             "'gis', K numbers in description ('kegg'), the full hit "
             "name('hitid'), or the full hit description('hitdesc'). "
             "(defaults to '%s')" % (defaults.get("parseStyle",
                                                  HITID)))
    pgroup.add_argument(
        "-C",
        "--countMethod",
        dest="countMethod",
        default=defaults.get("countMethod", "first"),
        choices=choices.get(
            'countMethod',
            ('first', 'most', 'all', 'consensus')),
        help=helps.get(
            "countMethod",
            "How to deal with counts from multiple hits. (first, most: "
            "can return multiple hits, all: return every hit, consensus: "
            "return None unless all the same). Do not use most or consensus "
            "with more than one level at a time. Default is %s" %
            (defaults.get("countMethod", "first"))),
        metavar="COUNTMETHOD")
    # flip the -r flag's sense (and its help text) when the caller asks
    # for path-filtering to be on by default
    if defaults.get("filter_for_path", False):
        action = 'store_false'
        default = True
        helpstr = 'Consider all hits. By deafult, only hits with path \
assignments are used.'
    else:
        action = 'store_true'
        default = False
        helpstr = 'Ignore hits with no entry in pathway map (-m). By default \
all hits are used and if the best hit(s) is(are) to sequences with no path, \
then the read will not be assigned to a path'
    pgroup.add_argument(
        "-r",
        "--filter_for_path",
        action=action,
        dest="mappedHitsOnly",
        default=default,
        help=helpstr)
    add_pathways_argument(pgroup, defaults)
    # BUG FIX: removed trailing "parser.add_argument_group(pgroup)" --
    # add_argument_group() expects a title string, so passing the group
    # object registered a second, bogus empty group; pgroup was already
    # attached to the parser when it was created above.
コード例 #5
0
def main():
    """Filter blast hit table(s) and write the passing subset.

    Reads one or more hit tables (or STDIN), applies the filtering
    options parsed from the command line, and writes results to a
    single output (-o / STDOUT) or to per-input files (-O).
    """
    description = """
    Take a blast result table and output a subset of hits based on the
    chosen filtering options. If more than one blast file given, use -O
    to get multiple output files, otherwise all output data will be
    concatenated into one output.
    """

# command line arguments
    parser = argparse.ArgumentParser(
        description=description,
        conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    parser.add_argument(
        "-o",
        "--outfilenome",
        dest="outfilename",
        default=None,
        metavar="OUTFILENAME",
        help="Write masked fasta output to OUTFILENAME.")
    parser.add_argument(
        '-O',
        '--autoOutName',
        default=False,
        action='store_true',
        help="Automatically generate output file name from input name "
             "and options. Overridden by -o, cannot be used with data "
             "from STDIN.")
    parser.add_argument('-G', '--gff', default=False, action='store_true',
                        help="output GFF format instead of input format")
    # BUG FIX: FileType('rU') -- the 'U' mode flag was removed in
    # Python 3.11; plain 'r' already opens in universal-newline text mode
    parser.add_argument('hit_table', nargs='*',
                        type=argparse.FileType('r'), default=[sys.stdin, ],
                        help="Table of search results to be filtered. "
                             "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # if we're not doing auto file names, write all outputs to same file
    if not arguments.autoOutName:
        if arguments.outfilename is not None:
            logging.info("Writing data to %s" % (arguments.outfilename))
            outfile_handle = open(arguments.outfilename, 'w')
        else:
            logging.info("writing data to STDOUT")
            outfile_handle = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    # loop over inputs
    for infile_handle in arguments.hit_table:
        logging.info("reading data from %s" % (infile_handle.name))
        if arguments.autoOutName:
            outfile_handle = open(
                getOutputFile(infile_handle.name, arguments),
                'w')

        # filter
        params = FilterParams.create_from_arguments(arguments)
        filterM8(infile_handle, outfile_handle, params, to_gff=arguments.gff)

        if arguments.autoOutName:
            outfile_handle.close()
        infile_handle.close()
コード例 #6
0
def main():
    """Merge all hits for each query/reference pair into one line.

    For every query, groups its hits by reference sequence, optionally
    removes overlapping hits, and writes one tab-separated record per
    pair: query, ref, total aligned length, total score, mean pctid.
    """
    # command line arguments
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler='resolve')

    # default to non-overlapping=0
    add_hit_table_arguments(parser,
                            flags='all',
                            defaults={'nonoverlapping': 0})
    parser.add_argument("-o",
                        "--outfilenome",
                        dest="outfilename",
                        default=None,
                        metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    # BUG FIX: FileType('rU') -- the 'U' mode flag was removed in
    # Python 3.11; plain 'r' already opens in universal-newline text mode
    parser.add_argument('hit_table',
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="Table of search results to be filtered. "
                        "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # output file or STDOUT
    if arguments.outfilename is not None:
        logging.info("Writing data to %s" % (arguments.outfilename))
        outfile_handle = open(arguments.outfilename, 'w')
    else:
        logging.info("writing data to STDOUT")
        outfile_handle = sys.stdout

    # input file or STDIN (handled by argparse)
    infile_handle = arguments.hit_table
    logging.info("reading data from %s" % (infile_handle.name))

    # filter, but don't apply nonoverlapping yet
    # non-overlapping should be applied per-reference only
    params = FilterParams.create_from_arguments(arguments)
    # save user supplied value for later
    overlap_buffer = params.nonoverlapping
    # turn off for now
    params.set_nonoverlapping(-1)

    # merge
    hit_iter = filterM8Stream(infile_handle, params, return_lines=False)
    for query, query_hits in hit_iter:
        # group by reference hit
        hits_by_ref = defaultdict(list)
        for hit in query_hits:
            hits_by_ref[hit.hit].append(hit)

        # one output for query/reference pair
        for ref, ref_hits in hits_by_ref.items():

            # remove overlaps unless the buffer has been set to <0
            if overlap_buffer >= 0:
                ref_hits = remove_overlapping_hits(
                    ref_hits, on_hit=True, buffer=params.nonoverlapping)
                ref_hits = remove_overlapping_hits(
                    ref_hits, on_hit=False, buffer=params.nonoverlapping)

            # aggregate values
            length, score, identities = 0, 0, 0
            for hit in ref_hits:
                length += hit.mlen
                score += hit.score
                try:
                    # this will be off by 100x
                    identities += hit.pctid * hit.mlen
                # BUG FIX: bare "except:" also swallowed KeyboardInterrupt
                # and SystemExit; only the missing/None pctid cases belong
                # here
                except (AttributeError, TypeError):
                    # just report pctid=0 if no pctid column in input
                    pass

            # BUG FIX: guard against a zero total length so a degenerate
            # record cannot crash the whole run with ZeroDivisionError
            mean_pctid = identities / length if length else 0.0
            outfile_handle.write(
                "%s\t%s\t%d\t%d\t%0.2f\n" %
                (query, ref, length, score, mean_pctid))

    # BUG FIX: don't close sys.stdout when no output file was given
    if outfile_handle is not sys.stdout:
        outfile_handle.close()
    infile_handle.close()
コード例 #7
0
def main():
    """Extract hit (or non-hit) sequence fragments from a fasta file.

    Loads hit regions from a hit table, then walks the fasta input and
    writes either the matched fragments or (with -M) the unmatched
    remainder, optionally translated to amino acids.
    """
    description = __doc__

# command line options
    # BUG FIX: description was passed positionally, which argparse binds
    # to "prog" (the program name), leaving the real description unset
    parser = argparse.ArgumentParser(description=description,
                                     conflict_handler='resolve')
    parser.add_argument("input_files", nargs=1,
                        default=[],
                        metavar="INFILE",
                        help="Hit table to process")
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        metavar="OUTFILE",
        help="Write masked fasta output to OUTFILE (default is STDOUT).")
    parser.add_argument(
        "-i",
        "--infile",
        dest="fasta",
        metavar="FILE",
        help=" File containing the fasta (defaults to STDIN)")
    parser.add_argument(
        "-M",
        "--mask",
        dest="keep",
        default=True,
        action="store_false",
        help="Return unmatched sequence fragments instead of hits.")
    parser.add_argument("-m", "--minLength", dest="minLength", type=int,
                        metavar="BASES", default=1,
                        help="minimum number of bases for sequences in output")
    parser.add_argument(
        "-n",
        "--numbering_prefix",
        default=None,
        help="If given, name extracted sequence with this scring followed "
             "by a sinmple counting index of all extracted sequences. For "
             "example, -n \"r\" would add _r1 to the end of the first "
             "extracted sequence and _r2 to the second, and so on. By "
             "default, extracted sequences are named with start_end "
             "positions.")

    parser.add_argument(
        "-t",
        "--translate",
        default=False,
        action='store_true',
        help="Transalte to Amino Acid sequences")

    add_hit_table_arguments(parser, flags='all')

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check that we have blast file as argument
    if len(arguments.input_files) != 1:
        parser.error(
            "Please supply the name of a hit table as the only argument")
    blastFile = arguments.input_files[0]

    # set up input/output streams
    if arguments.fasta is None:
        fastaHandle = sys.stdin
        fastaStr = 'STDIN'
    else:
        # BUG FIX: mode "rU" was removed in Python 3.11; "rt" is the
        # equivalent (and matches the sibling version of this script)
        fastaHandle = open(arguments.fasta, "rt")
        fastaStr = arguments.fasta
    logging.info(
        "Extrating sequence fragments from %s based on hits in %s" %
        (fastaStr, blastFile))

    if arguments.outfile is None:
        logging.info("Writing %s sequences to STDOUT" % ('fasta'))
        outputHandle = sys.stdout
    else:
        logging.info(
            "Writing %s sequences to %s" %
            ('fasta', arguments.outfile))
        outputHandle = open(arguments.outfile, 'w')

    # load hit regions
    if arguments.keep:
        minHitLength = arguments.minLength
    else:
        minHitLength = 1
    readHits = loadHitRegions(blastFile, minHitLength, arguments)
    logging.info("Found hits for %d reads" % (len(readHits)))

    # process the fasta file with hits
    extractHits(
        fastaHandle,
        outputHandle,
        readHits,
        arguments.translate,
        arguments.minLength,
        arguments.keep,
        arguments.numbering_prefix)
コード例 #8
0
def main():
    """Extract hit (or non-hit) sequence fragments from a fasta file.

    Loads hit regions from a hit table, then walks the fasta input and
    writes either the matched fragments or (with -M) the unmatched
    remainder, optionally translated to amino acids.
    """
    description = __doc__

# command line options
    # BUG FIX: description was passed positionally, which argparse binds
    # to "prog" (the program name), leaving the real description unset
    parser = argparse.ArgumentParser(description=description,
                                     conflict_handler='resolve')
    parser.add_argument("input_files", nargs=1,
                        default=[],
                        metavar="INFILE",
                        help="Hit table to process")
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        metavar="OUTFILE",
        help="Write masked fasta output to OUTFILE (default is STDOUT).")
    parser.add_argument(
        "-i",
        "--infile",
        dest="fasta",
        metavar="FILE",
        help=" File containing the fasta (defaults to STDIN)")
    parser.add_argument(
        "-M",
        "--mask",
        dest="keep",
        default=True,
        action="store_false",
        help="Return unmatched sequence fragments instead of hits.")
    parser.add_argument("-m", "--minLength", dest="minLength", type=int,
                        metavar="BASES", default=1,
                        help="minimum number of bases for sequences in output")
    parser.add_argument(
        "-n",
        "--numbering_prefix",
        default=None,
        help="If given, name extracted sequence with this scring followed "
             "by a sinmple counting index of all extracted sequences. For "
             "example, -n \"r\" would add _r1 to the end of the first "
             "extracted sequence and _r2 to the second, and so on. By "
             "default, extracted sequences are named with start_end "
             "positions.")

    parser.add_argument(
        "-t",
        "--translate",
        default=False,
        action='store_true',
        help="Transalte to Amino Acid sequences")

    add_hit_table_arguments(parser, flags='all')

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check that we have blast file as argument
    if len(arguments.input_files) != 1:
        parser.error(
            "Please supply the name of a hit table as the only argument")
    blastFile = arguments.input_files[0]

    # set up input/output streams
    if arguments.fasta is None:
        fastaHandle = sys.stdin
        fastaStr = 'STDIN'
    else:
        fastaHandle = open(arguments.fasta, "rt")
        fastaStr = arguments.fasta
    logging.info(
        "Extrating sequence fragments from %s based on hits in %s" %
        (fastaStr, blastFile))

    if arguments.outfile is None:
        logging.info("Writing %s sequences to STDOUT" % ('fasta'))
        outputHandle = sys.stdout
    else:
        logging.info(
            "Writing %s sequences to %s" %
            ('fasta', arguments.outfile))
        outputHandle = open(arguments.outfile, 'w')

    # load hit regions
    if arguments.keep:
        minHitLength = arguments.minLength
    else:
        minHitLength = 1
    readHits = loadHitRegions(blastFile, minHitLength, arguments)
    logging.info("Found hits for %d reads" % (len(readHits)))

    # process the fasta file with hits
    extractHits(
        fastaHandle,
        outputHandle,
        readHits,
        arguments.translate,
        arguments.minLength,
        arguments.keep,
        arguments.numbering_prefix)
コード例 #9
0
ファイル: kegg.py プロジェクト: jmeppley/py-metagenomics
def add_path_arguments(parser, defaults=None, choices=None, helps=None):
    """Add gene-family/pathway mapping options to an argparse parser.

    parser:   argparse.ArgumentParser to extend
    defaults: optional dict overriding per-option default values
    choices:  optional dict overriding per-option choice tuples
    helps:    optional dict overriding per-option help strings
    """
    # BUG FIX: use None sentinels instead of shared mutable {} defaults
    defaults = {} if defaults is None else defaults
    choices = {} if choices is None else choices
    helps = {} if helps is None else helps

    # get format and filter_top_pct arguments from blastm8
    from edl.hits import HITID, ACCS, GIS, KEGG, HITDESC, PFAM
    from edl.blastm8 import add_hit_table_arguments
    add_hit_table_arguments(parser, defaults, flags=['format',
                                                     'filter_top_pct',
                                                     'sort'
                                                    ])

    # specific to pathway parsing:
    pgroup = parser.add_argument_group(
        "Pathway Arguments",
        "These arguments control the mapping of hits to gene "
        "function heirarchies like KEGG or SEED""")
    pgroup.add_argument(
        "-m",
        "--mapFile",
        dest="mapFile",
        default=defaults.get("mapFile", None),
        metavar="MAPFILE",
        help="Location of file containing table of with db hit name as "
             "first column and geneIDs (Knumber) in second column.")
    pgroup.add_argument(
        "-M",
        "--mapStyle",
        default='auto',
        choices=['auto', 'kegg', 'tab', 'seed'],
        help="What type of mapping file are you using: simple tab "
             "separated list of IDs and kos/subsystems/domains, the "
             "genes_ko.list file from KEGG (which adds ko: to the K "
             "numbers and can have multiple records for each gene id), "
             "or the 3 column file from SEED. By default, this script "
             "will inspect the file and guess, but you can force 'kegg', "
             "'seed' or 'tab' with this argument.")
    default = defaults.get('tab_map_delim', None)
    pgroup.add_argument("--tab_map_delim",
                        default=default,
                        help=("Delimiter to parse multiple assignments in "
                              "map from ids to ko/path/fam. Only used for "
                              "tabular mapping tables. Defaults to {}"
                              .format(str(default))))
    pgroup.add_argument(
        "-p",
        "--parseStyle",
        default=defaults.get("parseStyle", HITID),
        choices=[ACCS, GIS, KEGG, HITID, HITDESC, PFAM],
        help="What should be parsed from the hit table: accessions('accs'), "
             "'gis', K numbers in description ('kegg'), the full hit "
             "name('hitid'), or the full hit description('hitdesc'). "
             "(defaults to '%s')" % (defaults.get("parseStyle",
                                                  HITID)))
    pgroup.add_argument(
        "-C",
        "--countMethod",
        dest="countMethod",
        default=defaults.get("countMethod", "first"),
        choices=choices.get(
            'countMethod',
            ('first', 'most', 'all', 'consensus')),
        help=helps.get(
            "countMethod",
            "How to deal with counts from multiple hits. (first, most: "
            "can return multiple hits, all: return every hit, consensus: "
            "return None unless all the same). Do not use most or consensus "
            "with more than one level at a time. Default is %s" %
            (defaults.get("countMethod", "first"))),
        metavar="COUNTMETHOD")
    # flip the -r flag's sense (and its help text) when the caller asks
    # for path-filtering to be on by default
    if defaults.get("filter_for_path", False):
        action = 'store_false'
        default = True
        helpstr = 'Consider all hits. By deafult, only hits with path \
assignments are used.'
    else:
        action = 'store_true'
        default = False
        helpstr = 'Ignore hits with no entry in pathway map (-m). By default \
all hits are used and if the best hit(s) is(are) to sequences with no path, \
then the read will not be assigned to a path'
    pgroup.add_argument(
        "-r",
        "--filter_for_path",
        action=action,
        dest="mappedHitsOnly",
        default=default,
        help=helpstr)
    add_pathways_argument(pgroup, defaults)
    # BUG FIX: removed trailing "parser.add_argument_group(pgroup)" --
    # add_argument_group() expects a title string, so passing the group
    # object registered a second, bogus empty group; pgroup was already
    # attached to the parser when it was created above.