Esempio n. 1
0
def main():
    """Entrypoint for blobtools remove.

    Collects the IDs of every field removed from the dataset metadata, either
    through a field-type flag listed in ``FIELDS``, an explicit ``--field``
    value, or ``--all`` (which keeps only ``identifiers``). Read metadata and
    static plots are then cleaned up, and, when at least one field was
    removed, the matching JSON files are deleted and ``meta.json`` rewritten.
    """
    args = docopt(__doc__)
    directory = args['DIRECTORY']
    meta = fetch_metadata(directory, **args)
    remove_all = args['--all']

    removed = []
    for field in FIELDS:
        if args[field['flag']] or remove_all:
            removed.extend(field['module'].remove_from_meta(meta=meta))
    for requested_id in args['--field']:
        removed.extend(remove_field(meta, requested_id))
    if remove_all:
        # Sweep up whatever is still listed, but never drop the identifiers.
        for remaining_id in meta.list_fields():
            if remaining_id == 'identifiers':
                continue
            removed.extend(remove_field(meta, remaining_id))

    if meta.reads:
        remove_read_metadata(meta, removed)

    # Static plots are removed when there is no plot configuration at all or
    # when any one of the four axes is no longer assigned.
    plot = meta.plot
    if not plot or any(axis not in plot for axis in ('x', 'y', 'z', 'cat')):
        remove_static_plots(meta, directory)

    if not removed:
        return
    file_io.delete_file("%s/CHECKSUM" % directory)
    file_io.delete_file("%s/summary.json" % directory)
    for removed_id in removed:
        file_io.delete_file("%s/%s.json" % (directory, removed_id))
    file_io.write_file("%s/meta.json" % directory, meta.to_dict())
Esempio n. 2
0
def main():
    """Entrypoint for blobtools add.

    For every entry in ``FIELDS`` whose flag was supplied, the fields it
    depends on are fetched, the input is parsed by the entry's module, and
    each resulting field is registered in the metadata and written to
    ``<DIRECTORY>/<field_id>.json``. Links, keys and a taxid are then added
    to the metadata before ``meta.json`` is written.
    """
    args = docopt(__doc__)
    directory = args['DIRECTORY']
    meta = fetch_metadata(directory, **args)
    if args['--fasta']:
        meta.assembly.update({'file': args['--fasta']})

    taxdump = None
    dependencies = {}
    for field in FIELDS:
        flag = field['flag']
        if not args[flag]:
            continue
        # Fetch each dependency at most once; retry ones that came back empty.
        for dep in field['depends']:
            if not dependencies.get(dep):
                dependencies[dep] = fetch_field(directory, dep, meta)
        # The taxdump is only loaded when hits are being parsed.
        if flag == '--hits' and not taxdump:
            taxdump = fetch_taxdump(args['--taxdump'])
        module = field['module']
        parents = module.parent()
        params = {key: args[key] for key in PARAMS}
        parsed = module.parse(args[flag],
                              **params,
                              taxdump=taxdump,
                              dependencies=dependencies,
                              meta=meta)
        results = parsed if isinstance(parsed, list) else [parsed]
        for data in results:
            # Without --replace, a field flagged by has_field_warning is skipped.
            if not args['--replace'] and has_field_warning(meta, data.field_id):
                continue
            for parent in data.parents:
                if 'range' not in parent:
                    continue
                parent_meta = meta.field_meta(parent['id'])
                if parent_meta and 'range' in parent_meta:
                    # Widen the parent range so it also covers the stored one.
                    new_range = parent['range']
                    old_range = parent_meta['range']
                    new_range[0] = min(new_range[0], old_range[0])
                    new_range[1] = max(new_range[1], old_range[1])
            meta.add_field(parents + data.parents, **data.meta)
            if isinstance(data, Identifier):
                meta.records = len(data.values)
            file_io.write_file("%s/%s.json" % (directory, data.field_id),
                               data.values_to_dict())
            # Make the new field available to fields parsed later.
            dependencies[data.field_id] = data

    if 'identifiers' not in dependencies:
        dependencies['identifiers'] = fetch_field(directory, 'identifiers',
                                                  meta)
    for link_string in args['--link']:
        link.add(link_string, meta, dependencies['identifiers'].values,
                 args['--skip-link-test'])
    for key_string in args['--key']:
        key.add(key_string, meta)
    if args['--taxid']:
        if not taxdump:
            taxdump = fetch_taxdump(args['--taxdump'])
        taxid.add(args['--taxid'], taxdump, meta)
    file_io.write_file("%s/meta.json" % directory, meta.to_dict())
Esempio n. 3
0
def create_filtered_dataset(dataset_meta, indir, outdir, indices):
    """Write the records selected by ``indices`` to a new dataset in ``outdir``.

    The new metadata starts from a copy of ``dataset_meta`` with an empty
    field list, ``origin`` set to the source dataset ID, ``records`` set to
    ``len(indices)`` and the ``id`` entry dropped. Every field without
    children is loaded from ``indir``, subset to ``indices``, registered in
    the new metadata and written to ``<outdir>/<field_id>.json``. Finally
    ``<outdir>/meta.json`` is written.
    """
    base_meta = dataset_meta.to_dict()
    base_meta.update({
        "fields": [],
        "origin": dataset_meta.dataset_id,
        "records": len(indices)
    })
    base_meta.pop("id")
    meta = fetch_metadata(outdir, meta=base_meta)

    for field_id in dataset_meta.list_fields():
        field_meta = dataset_meta.field_meta(field_id)
        # Only fields without children are written out.
        if field_meta.get("children"):
            continue
        field_meta.pop("data", False)
        keys = slot = headers = None
        full_field = fetch_field(indir, field_id, dataset_meta)

        if isinstance(full_field, (Variable, Identifier)):
            values = [full_field.values[i] for i in indices]
            if isinstance(full_field, Variable):
                # The range is recomputed from the retained values.
                field_meta.update({"range": [min(values), max(values)]})
                if field_id == "length":
                    meta.assembly.update({
                        "span": sum(values),
                        "scaffold-count": len(values)
                    })
        elif isinstance(full_field, Category):
            expanded = full_field.expand_values()
            values = [expanded[i] for i in indices]
        else:
            expanded = full_field.expand_values()
            values = [expanded[i] for i in indices]
            slot = full_field.category_slot
            # Not every field type defines headers.
            headers = getattr(full_field, "headers", None)
            parent_id = field_meta.get("parent")
            if parent_id:
                parent_field = fetch_field(outdir, parent_id, dataset_meta)
                if parent_field:
                    keys = parent_field.keys

        # Rebuild a field of the same class around the subset of values.
        filtered_field = type(full_field)(
            field_id,
            meta=field_meta,
            values=values,
            fixed_keys=keys,
            category_slot=slot,
            headers=headers,
        )
        parents = dataset_meta.field_parent_list(field_id)
        meta.add_field(parents, **field_meta, field_id=field_id)
        file_io.write_file("%s/%s.json" % (outdir, filtered_field.field_id),
                           filtered_field.values_to_dict())

    file_io.write_file("%s/meta.json" % outdir, meta.to_dict())
Esempio n. 4
0
def main():
    """Entrypoint for blobtools filter.

    Builds the list of record indices that pass the parameter and JSON
    filters, then, depending on the options given:

    * ``--output``: writes a filtered copy of the dataset;
    * flags listed in ``FIELDS``: applies the module's filter to the
      retained identifiers, provided every flag the entry requires is set;
    * ``--table``: writes a table of the requested fields, one row per
      retained record, with ``field=alias`` entries renaming column headers;
    * ``--summary``: writes summary statistics for each ``SUMMARY`` section
      plus derived ``noHit``, ``target`` and ``spanOverN50`` values.
    """
    args = docopt(__doc__)
    dataset = args['DATASET']
    meta = fetch_metadata(dataset, **args)
    params = parse_params(args, meta)
    identifiers = fetch_field(dataset, 'identifiers', meta)
    indices = list(range(len(identifiers.values)))
    invert = args['--invert']
    if params:
        indices = filter_by_params(meta, dataset, indices, params, invert)
    if args['--json']:
        indices = filter_by_json(identifiers.values, indices, args['--json'],
                                 invert)
    if args['--output']:
        create_filtered_dataset(meta, dataset, args['--output'], indices)
    ids = [identifiers.values[i] for i in indices]

    for field in FIELDS:
        if not args[field['flag']]:
            continue
        requirements = True
        for flag in field.get('requires') or []:
            # Test the value, not key membership: a flag that is present in
            # args but unset must still count as missing.
            if not args.get(flag):
                print("WARN: '%s' must be set to use option '%s'" %
                      (flag, field['flag']))
                requirements = False
        if not requirements:
            continue
        field['module'].apply_filter(ids, args[field['flag']], **args)

    if args['--table']:
        full_field_ids = args['--table-fields'].split(',')
        expanded_ids = ['index', 'identifiers']
        field_ids = []
        # Maps each column's field ID to the header name used in the table.
        alt_ids = {field_id: field_id for field_id in expanded_ids}
        for full_id in full_field_ids:
            try:
                field_id, alt_id = full_id.split('=')
            except ValueError:
                # No (or more than one) '=': use the entry as ID and header.
                field_id = alt_id = full_id
            field_ids.append(field_id)
            alt_ids[field_id] = alt_id
        fields = {
            'identifiers': fetch_field(dataset, 'identifiers', meta)
        }
        for field_id in field_ids:
            if field_id == 'plot':
                # 'plot' expands to whichever plot axes are assigned.
                for axis in ['x', 'z', 'y', 'cat']:
                    if axis in meta.plot:
                        plot_id = meta.plot[axis]
                        expanded_ids.append(plot_id)
                        # setdefault keeps an alias the user already gave
                        # for this field instead of overwriting it.
                        alt_ids.setdefault(plot_id, plot_id)
                        fields[plot_id] = fetch_field(dataset, plot_id, meta)
            else:
                # alt_ids[field_id] was set while parsing --table-fields and
                # must not be reset here, or the alias would be lost.
                expanded_ids.append(field_id)
                fields[field_id] = fetch_field(dataset, field_id, meta)
        table = [[alt_ids[field_id] for field_id in expanded_ids]]
        for i in indices:
            record = []
            for field_id in expanded_ids:
                if field_id == 'index':
                    record.append(i)
                else:
                    value = fields[field_id].values[i]
                    if fields[field_id].keys:
                        # Keyed fields store an index into their key list.
                        value = fields[field_id].keys[value]
                    record.append(value)
            table.append(record)
        file_io.write_file(args['--table'], table)

    if args['--summary']:
        summary_stats = {}
        for section in SUMMARY:
            requirements = True
            if section.get('requires'):
                for flag in section['requires']:
                    if not args[flag]:
                        print(
                            "WARN: '%s' must be set to generate '%s' summary" %
                            (flag, section['title']))
                        requirements = False
            if not requirements:
                continue
            fields = {}
            if section.get('depends'):
                for field in section['depends']:
                    fields[field] = fetch_field(dataset, field, meta)
            if section['title'] == 'hits':
                field = "%s_%s" % (args['--taxrule'], args['--summary-rank'])
                fields['hits'] = fetch_field(dataset, field, meta)
                if 'y' in meta.plot:
                    fields['cov'] = fetch_field(dataset, meta.plot['y'], meta)
            if section['title'] == 'busco':
                lineages = []
                for field in meta.list_fields():
                    if field.endswith('_busco'):
                        lineages.append(field)
                        fields[field] = fetch_field(dataset, field, meta)
                fields['lineages'] = lineages
            if section['title'] == 'readMapping':
                libraries = []
                for field in meta.list_fields():
                    if field.endswith('_read_cov'):
                        # Strip only the suffix; str.replace would also drop
                        # any '_read_cov' inside the library name.
                        library = field[:-len('_read_cov')]
                        libraries.append(library)
                        cov_id = "%s_cov" % library
                        fields[cov_id] = fetch_field(dataset, cov_id, meta)
                        fields[field] = fetch_field(dataset, field, meta)
                fields['libraries'] = libraries
            summary_stats.update({
                section['title']:
                section['module'].summarise(indices,
                                            fields,
                                            **args,
                                            meta=meta,
                                            stats=summary_stats)
            })
        stats = {}
        if 'hits' in summary_stats:
            nohit_span = 0
            span = summary_stats['hits']['total']['span']
            if 'no-hit' in summary_stats['hits']:
                nohit_span = summary_stats['hits']['no-hit']['span']
                stats.update({'noHit': float("%.3f" % (nohit_span / span))})
            else:
                stats.update({'noHit': 0})
            if 'taxonomy' in summary_stats and 'target' in summary_stats[
                    'taxonomy']:
                if summary_stats['taxonomy']['target'] in summary_stats[
                        'hits']:
                    target_span = summary_stats['hits'][
                        summary_stats['taxonomy']['target']]['span']
                    stats.update({
                        'target':
                        float("%.3f" % (target_span / (span - nohit_span)))
                    })
                elif 'target' in summary_stats['hits']:
                    target_span = summary_stats['hits']['target']['span']
                    stats.update({
                        'target':
                        float("%.3f" % (target_span / (span - nohit_span)))
                    })
                    del summary_stats['hits']['target']
                else:
                    stats.update({'target': 0})
            ratio = summary_stats['hits']['total']['span'] / summary_stats[
                'hits']['total']['n50']
            # Three significant figures; whole numbers from 100 upwards.
            if ratio >= 100:
                ratio = int(float('%.3g' % ratio))
            else:
                ratio = float('%.3g' % ratio)
            stats.update({'spanOverN50': ratio})
        summary_stats.update({'stats': stats})
        file_io.write_file(args['--summary'], {'summaryStats': summary_stats})
Esempio n. 5
0
def main():
    """Entrypoint for blobtools add.

    For every entry in ``FIELDS`` whose flag was supplied, the fields it
    depends on are fetched (exiting with an error if any is missing), its
    optional dependencies are fetched when available, the input is parsed by
    the entry's module, and each resulting field is registered in the
    metadata and written to ``<DIRECTORY>/<field_id>.json``. Links, keys and
    a taxid are then added to the metadata before ``meta.json`` is written.
    """
    args = docopt(__doc__)
    directory = args["DIRECTORY"]
    meta = fetch_metadata(directory, **args)
    if args["--fasta"]:
        meta.assembly.update({"file": args["--fasta"]})
    taxdump = None
    dependencies = {}
    for field in FIELDS:
        flag = field["flag"]
        if not args[flag]:
            continue
        required = field.get("depends", [])
        for dep in required:
            if not dependencies.get(dep):
                dependencies[dep] = fetch_field(directory, dep, meta)
        # Only this field's required dependencies are fatal when missing.
        # Scanning the whole dict would also trip over optional dependencies
        # that an earlier field looked for and did not find.
        for dep in required:
            if not dependencies[dep]:
                print("ERROR: '%s.json' was not found in the BlobDir." % dep)
                print(
                    "ERROR: You may need to rebuild the BlobDir to run this command."
                )
                sys.exit(1)
        # The taxdump is only loaded when hits are being parsed.
        if flag == "--hits" and not taxdump:
            taxdump = fetch_taxdump(args["--taxdump"])
        parents = field["module"].parent()
        # Optional dependencies are passed on when present; a missing one is
        # stored as whatever falsy value fetch_field returned.
        for dep in field.get("optional", []):
            if not dependencies.get(dep):
                dependencies[dep] = fetch_field(directory, dep, meta)
        parsed = field["module"].parse(
            args[flag],
            **{key: args[key]
               for key in args},
            taxdump=taxdump,
            dependencies=dependencies,
            meta=meta)
        if not isinstance(parsed, list):
            parsed = [parsed]
        for data in parsed:
            # Without --replace, a field flagged by has_field_warning is skipped.
            if not args["--replace"] and has_field_warning(
                    meta, data.field_id):
                continue
            for parent in data.parents:
                if "range" not in parent:
                    continue
                parent_meta = meta.field_meta(parent["id"])
                if parent_meta and "range" in parent_meta:
                    # Widen the parent range so it also covers the stored one.
                    parent["range"][0] = min(parent["range"][0],
                                             parent_meta["range"][0])
                    parent["range"][1] = max(parent["range"][1],
                                             parent_meta["range"][1])
            meta.add_field(parents + data.parents, **data.meta)
            if isinstance(data, Identifier):
                meta.records = len(data.values)
            json_file = "%s/%s.json" % (directory, data.field_id)
            file_io.write_file(json_file, data.values_to_dict())
            # Make the new field available to fields parsed later.
            dependencies[data.field_id] = data
    if "identifiers" not in dependencies:
        dependencies["identifiers"] = fetch_field(directory, "identifiers",
                                                  meta)
    for string in args["--link"]:
        link.add(string, meta, dependencies["identifiers"].values,
                 args["--skip-link-test"])
    for string in args["--key"]:
        key.add(string, meta, args["--replace"])
    if args["--taxid"]:
        if not taxdump:
            taxdump = fetch_taxdump(args["--taxdump"])
        taxid.add(args["--taxid"], taxdump, meta)
    file_io.write_file("%s/meta.json" % directory, meta.to_dict())
Esempio n. 6
0
def main():
    """Entrypoint for blobtools filter.

    Builds the list of record indices that pass the parameter and JSON
    filters, then, depending on the options given:

    * ``--output``: writes a filtered copy of the dataset;
    * flags listed in ``FIELDS``: applies the module's filter to the
      retained identifiers, provided every flag the entry requires is set;
    * ``--print-names``: prints the retained identifiers;
    * ``--summary``: writes summary statistics for each ``SUMMARY`` section
      plus derived ``noHit``, ``target`` and ``spanOverN50`` values.
    """
    args = docopt(__doc__)
    dataset = args['DATASET']
    meta = fetch_metadata(dataset, **args)
    params = parse_params(args, meta)
    identifiers = fetch_field(dataset, 'identifiers', meta)
    indices = list(range(len(identifiers.values)))
    invert = args['--invert']
    if params:
        indices = filter_by_params(meta, dataset, indices, params, invert)
    if args['--json']:
        indices = filter_by_json(identifiers.values, indices, args['--json'],
                                 invert)
    if args['--output']:
        create_filtered_dataset(meta, dataset, args['--output'], indices)
    ids = [identifiers.values[i] for i in indices]

    for field in FIELDS:
        if not args[field['flag']]:
            continue
        requirements = True
        if field.get('requires'):
            for flag in field['requires']:
                if not args[flag]:
                    print("WARN: '%s' must be set to use option '%s'" %
                          (flag, field['flag']))
                    requirements = False
        if not requirements:
            continue
        field['module'].apply_filter(ids, args[field['flag']], **args)

    if args['--print-names']:
        print(ids)

    if args['--summary']:
        summary_stats = {}
        for section in SUMMARY:
            requirements = True
            if section.get('requires'):
                for flag in section['requires']:
                    if not args[flag]:
                        print(
                            "WARN: '%s' must be set to generate '%s' summary" %
                            (flag, section['title']))
                        requirements = False
            if not requirements:
                continue
            fields = {}
            if section.get('depends'):
                for field in section['depends']:
                    fields[field] = fetch_field(dataset, field, meta)
            if section['title'] == 'hits':
                field = "%s_%s" % (args['--taxrule'], args['--summary-rank'])
                fields['hits'] = fetch_field(dataset, field, meta)
                if 'y' in meta.plot:
                    fields['cov'] = fetch_field(dataset, meta.plot['y'], meta)
            if section['title'] == 'busco':
                lineages = []
                for field in meta.list_fields():
                    if field.endswith('_busco'):
                        lineages.append(field)
                        fields[field] = fetch_field(dataset, field, meta)
                fields['lineages'] = lineages
            if section['title'] == 'readMapping':
                libraries = []
                for field in meta.list_fields():
                    if field.endswith('_read_cov'):
                        # Strip only the suffix; str.replace would also drop
                        # any '_read_cov' inside the library name.
                        library = field[:-len('_read_cov')]
                        libraries.append(library)
                        cov_id = "%s_cov" % library
                        fields[cov_id] = fetch_field(dataset, cov_id, meta)
                        fields[field] = fetch_field(dataset, field, meta)
                fields['libraries'] = libraries
            summary_stats.update({
                section['title']:
                section['module'].summarise(indices, fields, **args, meta=meta)
            })

        stats = {}
        if 'hits' in summary_stats:
            hits = summary_stats['hits']
            # Read the total span up front: the target ratio below needs it
            # even when there is no 'no-hit' entry.
            span = hits['total']['span']
            nohit_span = 0
            if 'no-hit' in hits:
                nohit_span = hits['no-hit']['span']
                stats.update({'noHit': float("%.3f" % (nohit_span / span))})
            if 'taxonomy' in summary_stats and 'target' in summary_stats[
                    'taxonomy']:
                target = summary_stats['taxonomy']['target']
                if target in hits:
                    target_span = hits[target]['span']
                    stats.update({
                        'target':
                        float("%.3f" % (target_span / (span - nohit_span)))
                    })
                else:
                    # The target taxon has no entry in the hits summary.
                    stats.update({'target': 0})
            ratio = hits['total']['span'] / hits['total']['n50']
            # Three significant figures; whole numbers from 100 upwards.
            if ratio >= 100:
                ratio = int(float('%.3g' % ratio))
            else:
                ratio = float('%.3g' % ratio)
            stats.update({'spanOverN50': ratio})
        summary_stats.update({'stats': stats})
        file_io.write_file(args['--summary'], {'summaryStats': summary_stats})
Esempio n. 7
0
def main():
    """Entrypoint for blobtools filter.

    Builds the list of record indices that pass the parameter and JSON
    filters, then, depending on the options given:

    * ``--output``: writes a filtered copy of the dataset;
    * flags listed in ``FIELDS``: applies the module's filter to the
      retained identifiers, provided every flag the entry requires is set;
    * ``--table``: writes a table of the requested fields, one row per
      retained record, with ``field=alias`` entries renaming column headers;
    * ``--summary``: writes summary statistics for each ``SUMMARY`` section
      plus derived ``noHit``, ``target`` and ``spanOverN50`` values. When
      ``--taxrule`` is not given, it is derived from the plot category
      field, and the hits section is skipped if there is none.
    """
    args = docopt(__doc__)
    dataset = args["DATASET"]
    meta = fetch_metadata(dataset, **args)
    params = parse_params(args, meta)
    identifiers = fetch_field(dataset, "identifiers", meta)
    indices = list(range(len(identifiers.values)))
    invert = args["--invert"]
    if params:
        indices = filter_by_params(meta, dataset, indices, params, invert)
    if args["--json"]:
        indices = filter_by_json(identifiers.values, indices, args["--json"],
                                 invert)
    if args["--output"]:
        create_filtered_dataset(meta, dataset, args["--output"], indices)
    ids = [identifiers.values[i] for i in indices]

    for field in FIELDS:
        if not args[field["flag"]]:
            continue
        requirements = True
        for flag in field.get("requires") or []:
            # Test the value, not key membership: a flag that is present in
            # args but unset must still count as missing.
            if not args.get(flag):
                print("WARN: '%s' must be set to use option '%s'" %
                      (flag, field["flag"]))
                requirements = False
        if not requirements:
            continue
        field["module"].apply_filter(ids, args[field["flag"]], **args)

    if args["--table"]:
        full_field_ids = args["--table-fields"].split(",")
        expanded_ids = ["index", "identifiers"]
        field_ids = []
        # Maps each column's field ID to the header name used in the table.
        alt_ids = {field_id: field_id for field_id in expanded_ids}
        for full_id in full_field_ids:
            try:
                field_id, alt_id = full_id.split("=")
            except ValueError:
                # No (or more than one) '=': use the entry as ID and header.
                field_id = alt_id = full_id
            field_ids.append(field_id)
            alt_ids[field_id] = alt_id
        fields = {
            "identifiers": fetch_field(dataset, "identifiers", meta)
        }
        for field_id in field_ids:
            if field_id == "plot":
                # 'plot' expands to whichever plot axes are assigned.
                for axis in ["x", "z", "y", "cat"]:
                    if axis in meta.plot:
                        plot_id = meta.plot[axis]
                        expanded_ids.append(plot_id)
                        # setdefault keeps an alias the user already gave
                        # for this field instead of overwriting it.
                        alt_ids.setdefault(plot_id, plot_id)
                        fields[plot_id] = fetch_field(dataset, plot_id, meta)
            else:
                # alt_ids[field_id] was set while parsing --table-fields and
                # must not be reset here, or the alias would be lost.
                expanded_ids.append(field_id)
                fields[field_id] = fetch_field(dataset, field_id, meta)
        table = [[alt_ids[field_id] for field_id in expanded_ids]]
        for i in indices:
            record = []
            for field_id in expanded_ids:
                if field_id == "index":
                    record.append(i)
                else:
                    value = fields[field_id].values[i]
                    if fields[field_id].keys:
                        # Keyed fields store an index into their key list.
                        value = fields[field_id].keys[value]
                    record.append(value)
            table.append(record)
        file_io.write_file(args["--table"], table)

    if args["--summary"]:
        summary_stats = {}
        for section in SUMMARY:
            requirements = True
            if section.get("requires"):
                for flag in section["requires"]:
                    if not args[flag]:
                        print(
                            "WARN: '%s' must be set to generate '%s' summary" %
                            (flag, section["title"]))
                        requirements = False
            if not requirements:
                continue
            fields = {}
            if section.get("depends"):
                for field in section["depends"]:
                    fields[field] = fetch_field(dataset, field, meta)
            if section["title"] == "hits":
                taxrule = args.get("--taxrule", None)
                if taxrule is None:
                    # Fall back to the plot category field with its final
                    # '_<suffix>' removed; skip the section if there is none.
                    taxrule = meta.plot.get("cat", None)
                    if taxrule is not None:
                        taxrule = re.sub(r"_[^_]+$", "", taxrule)
                        args["--taxrule"] = taxrule
                    else:
                        continue
                field = "%s_%s" % (taxrule, args["--summary-rank"])
                fields["hits"] = fetch_field(dataset, field, meta)
                if "y" in meta.plot:
                    fields["cov"] = fetch_field(dataset, meta.plot["y"], meta)
            if section["title"] == "busco":
                lineages = []
                for field in meta.list_fields():
                    if field.endswith("_busco"):
                        lineages.append(field)
                        fields[field] = fetch_field(dataset, field, meta)
                fields["lineages"] = lineages
            if section["title"] == "readMapping":
                libraries = []
                for field in meta.list_fields():
                    if field.endswith(
                            "_cov") and not field.endswith("_read_cov"):
                        # Strip only the suffix; str.replace would also drop
                        # any '_cov' inside the library name.
                        library = field[:-len("_cov")]
                        libraries.append(library)
                        fields[field] = fetch_field(dataset, field, meta)
                fields["libraries"] = libraries
            summary_stats.update({
                section["title"]:
                section["module"].summarise(indices,
                                            fields,
                                            **args,
                                            meta=meta,
                                            stats=summary_stats)
            })
        stats = {}
        if "hits" in summary_stats:
            nohit_span = 0
            span = summary_stats["hits"]["total"]["span"]
            if "no-hit" in summary_stats["hits"]:
                nohit_span = summary_stats["hits"]["no-hit"]["span"]
                stats.update({"noHit": float("%.3f" % (nohit_span / span))})
            else:
                stats.update({"noHit": 0})
            if "taxonomy" in summary_stats and "target" in summary_stats[
                    "taxonomy"]:
                if summary_stats["taxonomy"]["target"] in summary_stats[
                        "hits"]:
                    target_span = summary_stats["hits"][
                        summary_stats["taxonomy"]["target"]]["span"]
                    stats.update({
                        "target":
                        float("%.3f" % (target_span / (span - nohit_span)))
                    })
                elif "target" in summary_stats["hits"]:
                    target_span = summary_stats["hits"]["target"]["span"]
                    stats.update({
                        "target":
                        float("%.3f" % (target_span / (span - nohit_span)))
                    })
                    del summary_stats["hits"]["target"]
                else:
                    stats.update({"target": 0})
            ratio = (summary_stats["hits"]["total"]["span"] /
                     summary_stats["hits"]["total"]["n50"])
            # Three significant figures; whole numbers from 100 upwards.
            if ratio >= 100:
                ratio = int(float("%.3g" % ratio))
            else:
                ratio = float("%.3g" % ratio)
            stats.update({"spanOverN50": ratio})
        summary_stats.update({"stats": stats})
        file_io.write_file(args["--summary"], {"summaryStats": summary_stats})