def apply_filter(ids, text_file, **kwargs):
    """Filter text file."""
    suffix = kwargs['--suffix']
    path = pathlib.Path(text_file)
    outfile = str(path.parent / (path.stem + '.' + suffix + path.suffix))
    data = file_io.read_file(text_file)
    lines = data.split('\n')
    delimiter = kwargs['--text-delimiter']
    if delimiter == 'whitespace':
        delimit = re.compile(r'\s+')
    else:
        delimit = re.compile(r"%s" % delimiter)
    id_col = int(kwargs['--text-id-column']) - 1
    output = []
    if kwargs['--text-header']:
        header_row = lines.pop(0)
        header_row = header_row.rstrip()
        output.append(header_row)
    for line in lines:
        row = re.split(delimit, line.replace('"', ''))
        try:
            if row[id_col] in ids:
                output.append(line)
        except IndexError:
            # Keep lines with too few columns to test against the id column.
            output.append(line)
    file_io.write_file(outfile, output, plain=True)

def main(): """Entrypoint for blobtools remove.""" args = docopt(__doc__) meta = fetch_metadata(args['DIRECTORY'], **args) field_ids = [] for field in FIELDS: if args[field['flag']] or args['--all']: field_ids += field['module'].remove_from_meta(meta=meta) for field_id in args['--field']: field_ids += remove_field(meta, field_id) if args['--all']: for field_id in meta.list_fields(): if field_id != 'identifiers': field_ids += remove_field(meta, field_id) if meta.reads: remove_read_metadata(meta, field_ids) if meta.plot: axes = ['x', 'y', 'z', 'cat'] for axis in axes: if axis not in meta.plot: remove_static_plots(meta, args['DIRECTORY']) break else: remove_static_plots(meta, args['DIRECTORY']) if field_ids: file_io.delete_file("%s/CHECKSUM" % args['DIRECTORY']) file_io.delete_file("%s/summary.json" % args['DIRECTORY']) for field_id in field_ids: file_io.delete_file("%s/%s.json" % (args['DIRECTORY'], field_id)) file_io.write_file("%s/meta.json" % args['DIRECTORY'], meta.to_dict())
def get_accounts_from_file():
    if os.path.isfile(CONFIG_ACCOUNTS_FILE):
        file_content = read_file(CONFIG_ACCOUNTS_FILE)
    else:
        file_content = read_file(CONFIG_DEFAULT_ACCOUNTS_FILE)
        write_file(CONFIG_ACCOUNTS_FILE, file_content)
    return ast.literal_eval(file_content)

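# Illustrative usage sketch (not part of the original source). get_accounts_from_file
# parses the accounts file with ast.literal_eval, so the file must contain a single
# Python literal; the structure shown below is a hypothetical example of such content.
def _example_accounts_round_trip():
    """Hedged sketch of the accounts-file round trip; file contents are assumed."""
    example_literal = "[{'name': 'example', 'token': 'xxxx'}]"  # hypothetical accounts literal
    write_file(CONFIG_ACCOUNTS_FILE, example_literal)
    accounts = get_accounts_from_file()
    return accounts
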
def main(): """Entrypoint for blobtools add.""" args = docopt(__doc__) meta = fetch_metadata(args['DIRECTORY'], **args) if args['--fasta']: meta.assembly.update({'file': args['--fasta']}) taxdump = None dependencies = {} for field in FIELDS: if args[field['flag']]: for dep in field['depends']: if dep not in dependencies or not dependencies[dep]: dependencies[dep] = fetch_field(args['DIRECTORY'], dep, meta) if field['flag'] == '--hits': if not taxdump: taxdump = fetch_taxdump(args['--taxdump']) parents = field['module'].parent() parsed = field['module'].parse( args[field['flag']], **{key: args[key] for key in PARAMS}, taxdump=taxdump, dependencies=dependencies, meta=meta) if not isinstance(parsed, list): parsed = [parsed] for data in parsed: if not args['--replace']: if has_field_warning(meta, data.field_id): continue for parent in data.parents: if 'range' in parent: parent_meta = meta.field_meta(parent['id']) if parent_meta and 'range' in parent_meta: parent['range'][0] = min(parent['range'][0], parent_meta['range'][0]) parent['range'][1] = max(parent['range'][1], parent_meta['range'][1]) meta.add_field(parents + data.parents, **data.meta) if isinstance(data, Identifier): meta.records = len(data.values) json_file = "%s/%s.json" % (args['DIRECTORY'], data.field_id) file_io.write_file(json_file, data.values_to_dict()) dependencies[data.field_id] = data if 'identifiers' not in dependencies: dependencies['identifiers'] = fetch_field(args['DIRECTORY'], 'identifiers', meta) for string in args['--link']: link.add(string, meta, dependencies['identifiers'].values, args['--skip-link-test']) for string in args['--key']: key.add(string, meta) if args['--taxid']: if not taxdump: taxdump = fetch_taxdump(args['--taxdump']) taxid.add(args['--taxid'], taxdump, meta) file_io.write_file("%s/meta.json" % args['DIRECTORY'], meta.to_dict())
def create_filtered_dataset(dataset_meta, indir, outdir, indices):
    """Write filtered records to new dataset."""
    meta = dataset_meta.to_dict()
    meta.update({
        "fields": [],
        "origin": dataset_meta.dataset_id,
        "records": len(indices)
    })
    meta.pop("id")
    meta = fetch_metadata(outdir, meta=meta)
    # meta = fetch_metadata(outdir, **args)
    for field_id in dataset_meta.list_fields():
        field_meta = dataset_meta.field_meta(field_id)
        if not field_meta.get("children"):
            field_meta.pop("data", False)
            keys = None
            slot = None
            headers = None
            full_field = fetch_field(indir, field_id, dataset_meta)
            if isinstance(full_field, (Variable, Identifier)):
                values = [full_field.values[i] for i in indices]
                if isinstance(full_field, Variable):
                    field_meta.update({"range": [min(values), max(values)]})
                if field_id == "length":
                    meta.assembly.update({"span": sum(values)})
                    meta.assembly.update({"scaffold-count": len(values)})
            elif isinstance(full_field, Category):
                full_values = full_field.expand_values()
                values = [full_values[i] for i in indices]
            else:
                full_values = full_field.expand_values()
                values = [full_values[i] for i in indices]
                slot = full_field.category_slot
                try:
                    headers = full_field.headers
                except AttributeError:
                    pass
            if field_meta.get("parent"):
                parent_field = fetch_field(outdir, field_meta["parent"], dataset_meta)
                if parent_field:
                    keys = parent_field.keys
            field = type(full_field)(
                field_id,
                meta=field_meta,
                values=values,
                fixed_keys=keys,
                category_slot=slot,
                headers=headers,
            )
            parents = dataset_meta.field_parent_list(field_id)
            meta.add_field(parents, **field_meta, field_id=field_id)
            json_file = "%s/%s.json" % (outdir, field.field_id)
            file_io.write_file(json_file, field.values_to_dict())
    file_io.write_file("%s/meta.json" % outdir, meta.to_dict())

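# Illustrative usage sketch (not part of the original source). create_filtered_dataset
# is driven from main() with a source BlobDir, an output directory, and the indices of
# records that survived filtering; the paths and indices below are hypothetical.
def _example_create_filtered_dataset(dataset_meta):
    """Hedged sketch; dataset_meta is assumed to come from fetch_metadata on the source BlobDir."""
    kept_indices = [0, 2, 5]  # hypothetical indices of records that passed the filters
    create_filtered_dataset(dataset_meta, "BlobDir", "BlobDir_filtered", kept_indices)
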
def fetch_taxdump(path_to_taxdump):
    """Load Taxdump from file."""
    json_file = "%s/taxdump.json" % path_to_taxdump
    if not Path(json_file).exists():
        print("Parsing taxdump")
    else:
        print("Loading parsed taxdump")
    data = file_io.load_yaml(json_file)
    if data is None:
        taxdump = Taxdump(path_to_taxdump)
        file_io.write_file(json_file, taxdump.values_to_dict())
    else:
        taxdump = Taxdump(path_to_taxdump, **data)
    return taxdump

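# Illustrative usage sketch (not part of the original source). fetch_taxdump caches the
# parsed taxdump as taxdump.json inside the supplied directory, so a second call loads
# the cache instead of re-parsing; the path below is hypothetical.
def _example_fetch_taxdump():
    """Hedged sketch; assumes an NCBI taxdump directory exists at the given path."""
    taxdump = fetch_taxdump("/path/to/taxdump")  # first call parses and writes taxdump.json
    cached = fetch_taxdump("/path/to/taxdump")   # subsequent calls load the cached JSON
    return taxdump, cached
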
def apply_filter(ids, text_file, **kwargs):
    """Filter text file."""
    suffix = kwargs["--suffix"]
    path = pathlib.Path(text_file)
    outfile = str(path.parent / (path.stem + "." + suffix + path.suffix))
    data = file_io.read_file(text_file)
    lines = data.split("\n")
    delimiter = kwargs["--text-delimiter"]
    delimit = set_delimiter(delimiter, sample=lines[0])
    id_col = int(kwargs["--text-id-column"]) - 1
    output = []
    if kwargs["--text-header"]:
        header_row = lines.pop(0)
        header_row = header_row.rstrip()
        output.append(header_row)
    for line in lines:
        row = re.split(delimit, line.replace('"', ""))
        try:
            if row[id_col] in ids:
                output.append(line)
        except IndexError:
            # Keep lines with too few columns to test against the id column.
            output.append(line)
    file_io.write_file(outfile, output, plain=True)

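# Illustrative usage sketch (not part of the original source). apply_filter receives
# docopt-style flags via **kwargs; the identifiers, file name, and flag values below
# are hypothetical.
def _example_apply_filter():
    """Hedged sketch of the text-file filter defined above."""
    kept_ids = {"contig_1", "contig_2"}
    flags = {
        "--suffix": "filtered",           # output written as <stem>.filtered<ext>
        "--text-delimiter": "whitespace", # split rows on runs of whitespace
        "--text-id-column": "1",          # 1-based column holding the identifiers
        "--text-header": True,            # keep the header row in the output
    }
    apply_filter(kept_ids, "assembly.stats.tsv", **flags)
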
def main(): """Entrypoint for blobtools filter.""" args = docopt(__doc__) meta = fetch_metadata(args['DATASET'], **args) params = parse_params(args, meta) identifiers = fetch_field(args['DATASET'], 'identifiers', meta) indices = [index for index, value in enumerate(identifiers.values)] invert = args['--invert'] if params: indices = filter_by_params(meta, args['DATASET'], indices, params, invert) if args['--json']: indices = filter_by_json(identifiers.values, indices, args['--json'], invert) if args['--output']: create_filtered_dataset(meta, args['DATASET'], args['--output'], indices) ids = [identifiers.values[i] for i in indices] for field in FIELDS: if args[field['flag']]: requirements = True if field.get('requires'): for flag in field['requires']: if flag not in args: print("WARN: '%s' must be set to use option '%s'" % (flag, field['flag'])) requirements = False if not requirements: continue field['module'].apply_filter(ids, args[field['flag']], **args) if args['--table']: full_field_ids = args['--table-fields'].split(',') expanded_ids = ['index', 'identifiers'] field_ids = [] alt_ids = {field_id: field_id for field_id in expanded_ids} for full_id in full_field_ids: try: field_id, alt_id = full_id.split('=') field_ids.append(field_id) alt_ids[field_id] = alt_id except ValueError: field_ids.append(full_id) alt_ids[full_id] = full_id fields = { 'identifiers': fetch_field(args['DATASET'], 'identifiers', meta) } for field_id in field_ids: if field_id == 'plot': for axis in ['x', 'z', 'y', 'cat']: if axis in meta.plot: expanded_ids.append(meta.plot[axis]) alt_ids.update({meta.plot[axis]: meta.plot[axis]}) fields[meta.plot[axis]] = fetch_field( args['DATASET'], meta.plot[axis], meta) else: expanded_ids.append(field_id) alt_ids.update({field_id: field_id}) fields[field_id] = fetch_field(args['DATASET'], field_id, meta) table = [[alt_ids[field_id] for field_id in expanded_ids]] for i in indices: record = [] for field_id in expanded_ids: if field_id == 'index': record.append(i) else: value = fields[field_id].values[i] if fields[field_id].keys: value = fields[field_id].keys[value] record.append(value) table.append(record) file_io.write_file(args['--table'], table) if args['--summary']: summary_stats = {} for section in SUMMARY: requirements = True if section.get('requires'): for flag in section['requires']: if not args[flag]: print( "WARN: '%s' must be set to generate '%s' summary" % (flag, section['title'])) requirements = False if not requirements: continue fields = {} if section.get('depends'): for field in section['depends']: fields.update( {field: fetch_field(args['DATASET'], field, meta)}) if section['title'] == 'hits': field = "%s_%s" % (args['--taxrule'], args['--summary-rank']) fields.update( {'hits': fetch_field(args['DATASET'], field, meta)}) if 'y' in meta.plot: fields.update({ 'cov': fetch_field(args['DATASET'], meta.plot['y'], meta) }) if section['title'] == 'busco': lineages = [] for field in meta.list_fields(): if field.endswith('_busco'): lineages.append(field) fields.update( {field: fetch_field(args['DATASET'], field, meta)}) fields.update({'lineages': lineages}) if section['title'] == 'readMapping': libraries = [] for field in meta.list_fields(): if field.endswith('_read_cov'): library = field.replace('_read_cov', '') libraries.append(library) fields.update({ "%s_cov" % library: fetch_field(args['DATASET'], "%s_cov" % library, meta) }) fields.update( {field: fetch_field(args['DATASET'], field, meta)}) fields.update({'libraries': libraries}) summary_stats.update({ 
section['title']: section['module'].summarise(indices, fields, **args, meta=meta, stats=summary_stats) }) stats = {} if 'hits' in summary_stats: nohit_span = 0 span = summary_stats['hits']['total']['span'] if 'no-hit' in summary_stats['hits']: nohit_span = summary_stats['hits']['no-hit']['span'] stats.update({'noHit': float("%.3f" % (nohit_span / span))}) else: stats.update({'noHit': 0}) if 'taxonomy' in summary_stats and 'target' in summary_stats[ 'taxonomy']: if summary_stats['taxonomy']['target'] in summary_stats[ 'hits']: target_span = summary_stats['hits'][ summary_stats['taxonomy']['target']]['span'] stats.update({ 'target': float("%.3f" % (target_span / (span - nohit_span))) }) elif 'target' in summary_stats['hits']: target_span = summary_stats['hits']['target']['span'] stats.update({ 'target': float("%.3f" % (target_span / (span - nohit_span))) }) del summary_stats['hits']['target'] else: stats.update({'target': 0}) ratio = summary_stats['hits']['total']['span'] / summary_stats[ 'hits']['total']['n50'] if ratio >= 100: ratio = int(float('%.3g' % ratio)) else: ratio = float('%.3g' % ratio) stats.update({'spanOverN50': ratio}) summary_stats.update({'stats': stats}) file_io.write_file(args['--summary'], {'summaryStats': summary_stats})
def main(): """Entrypoint for blobtools add.""" args = docopt(__doc__) meta = fetch_metadata(args["DIRECTORY"], **args) if args["--fasta"]: meta.assembly.update({"file": args["--fasta"]}) taxdump = None dependencies = {} for field in FIELDS: if args[field["flag"]]: if "depends" in field: for dep in field["depends"]: if dep not in dependencies or not dependencies[dep]: dependencies[dep] = fetch_field( args["DIRECTORY"], dep, meta) for dep_key, dep_value in dependencies.items(): if not dep_value: print("ERROR: '%s.json' was not found in the BlobDir." % dep_key) print( "ERROR: You may need to rebuild the BlobDir to run this command." ) sys.exit(1) if field["flag"] == "--hits": if not taxdump: taxdump = fetch_taxdump(args["--taxdump"]) parents = field["module"].parent() if "optional" in field: for dep in field["optional"]: if dep not in dependencies or not dependencies[dep]: dependencies[dep] = fetch_field( args["DIRECTORY"], dep, meta) parsed = field["module"].parse( args[field["flag"]], **{key: args[key] for key in args.keys()}, taxdump=taxdump, dependencies=dependencies, meta=meta) if not isinstance(parsed, list): parsed = [parsed] for data in parsed: if not args["--replace"]: if has_field_warning(meta, data.field_id): continue for parent in data.parents: if "range" in parent: parent_meta = meta.field_meta(parent["id"]) if parent_meta and "range" in parent_meta: parent["range"][0] = min(parent["range"][0], parent_meta["range"][0]) parent["range"][1] = max(parent["range"][1], parent_meta["range"][1]) meta.add_field(parents + data.parents, **data.meta) if isinstance(data, Identifier): meta.records = len(data.values) json_file = "%s/%s.json" % (args["DIRECTORY"], data.field_id) file_io.write_file(json_file, data.values_to_dict()) dependencies[data.field_id] = data if "identifiers" not in dependencies: dependencies["identifiers"] = fetch_field(args["DIRECTORY"], "identifiers", meta) for string in args["--link"]: link.add(string, meta, dependencies["identifiers"].values, args["--skip-link-test"]) for string in args["--key"]: key.add(string, meta, args["--replace"]) if args["--taxid"]: if not taxdump: taxdump = fetch_taxdump(args["--taxdump"]) taxid.add(args["--taxid"], taxdump, meta) file_io.write_file("%s/meta.json" % args["DIRECTORY"], meta.to_dict())
def main(): """Entrypoint for blobtools filter.""" args = docopt(__doc__) meta = fetch_metadata(args['DATASET'], **args) params = parse_params(args, meta) identifiers = fetch_field(args['DATASET'], 'identifiers', meta) indices = [index for index, value in enumerate(identifiers.values)] invert = args['--invert'] if params: indices = filter_by_params(meta, args['DATASET'], indices, params, invert) if args['--json']: indices = filter_by_json(identifiers.values, indices, args['--json'], invert) if args['--output']: create_filtered_dataset(meta, args['DATASET'], args['--output'], indices) ids = [identifiers.values[i] for i in indices] for field in FIELDS: if args[field['flag']]: requirements = True if field.get('requires'): for flag in field['requires']: if not args[flag]: print("WARN: '%s' must be set to use option '%s'" % (flag, field['flag'])) requirements = False if not requirements: continue field['module'].apply_filter(ids, args[field['flag']], **args) if args['--print-names']: print(ids) # gc = fetch_field(args['DATASET'], 'gc', meta) # print([gc.values[i] for i in indices]) # busco = fetch_field(args['DATASET'], 'eukaryota_odb9_busco', meta) # print([busco.values[i] for i in indices if busco.values[i]]) # trna = fetch_field(args['DATASET'], 'trnascan_bacterial', meta) # values = [trna.values[i] for i in indices if trna.values[i]] # seen = {} # unique = [] # for v in values: # for t in v: # string = ','.join(t) # if string not in seen: # unique.append(t) # seen[string] = 1 # print(unique) if args['--summary']: summary_stats = {} for section in SUMMARY: requirements = True if section.get('requires'): for flag in section['requires']: if not args[flag]: print( "WARN: '%s' must be set to generate '%s' summary" % (flag, section['title'])) requirements = False if not requirements: continue fields = {} if section.get('depends'): for field in section['depends']: fields.update( {field: fetch_field(args['DATASET'], field, meta)}) if section['title'] == 'hits': field = "%s_%s" % (args['--taxrule'], args['--summary-rank']) fields.update( {'hits': fetch_field(args['DATASET'], field, meta)}) if 'y' in meta.plot: fields.update({ 'cov': fetch_field(args['DATASET'], meta.plot['y'], meta) }) if section['title'] == 'busco': lineages = [] for field in meta.list_fields(): if field.endswith('_busco'): lineages.append(field) fields.update( {field: fetch_field(args['DATASET'], field, meta)}) fields.update({'lineages': lineages}) if section['title'] == 'readMapping': libraries = [] for field in meta.list_fields(): if field.endswith('_read_cov'): library = field.replace('_read_cov', '') libraries.append(library) fields.update({ "%s_cov" % library: fetch_field(args['DATASET'], "%s_cov" % library, meta) }) fields.update( {field: fetch_field(args['DATASET'], field, meta)}) fields.update({'libraries': libraries}) summary_stats.update({ section['title']: section['module'].summarise(indices, fields, **args, meta=meta) }) stats = {} if 'hits' in summary_stats: nohit_span = 0 if 'no-hit' in summary_stats['hits']: nohit_span = summary_stats['hits']['no-hit']['span'] span = summary_stats['hits']['total']['span'] stats.update({'noHit': float("%.3f" % (nohit_span / span))}) if 'taxonomy' in summary_stats and 'target' in summary_stats[ 'taxonomy']: target_span = summary_stats['hits'][summary_stats['taxonomy'] ['target']]['span'] stats.update({ 'target': float("%.3f" % (target_span / (span - nohit_span))) }) ratio = summary_stats['hits']['total']['span'] / summary_stats[ 'hits']['total']['n50'] if ratio >= 100: ratio = 
int(float('%.3g' % ratio)) else: ratio = float('%.3g' % ratio) stats.update({'spanOverN50': ratio}) summary_stats.update({'stats': stats}) file_io.write_file(args['--summary'], {'summaryStats': summary_stats})
def save_accounts_list(self):
    write_file(CONFIG_ACCOUNTS_FILE, self.accounts_list)

def main(): """Entrypoint for blobtools filter.""" args = docopt(__doc__) meta = fetch_metadata(args["DATASET"], **args) params = parse_params(args, meta) identifiers = fetch_field(args["DATASET"], "identifiers", meta) indices = [index for index, value in enumerate(identifiers.values)] invert = args["--invert"] if params: indices = filter_by_params(meta, args["DATASET"], indices, params, invert) if args["--json"]: indices = filter_by_json(identifiers.values, indices, args["--json"], invert) if args["--output"]: create_filtered_dataset(meta, args["DATASET"], args["--output"], indices) ids = [identifiers.values[i] for i in indices] for field in FIELDS: if args[field["flag"]]: requirements = True if field.get("requires"): for flag in field["requires"]: if flag not in args: print("WARN: '%s' must be set to use option '%s'" % (flag, field["flag"])) requirements = False if not requirements: continue field["module"].apply_filter(ids, args[field["flag"]], **args) if args["--table"]: full_field_ids = args["--table-fields"].split(",") expanded_ids = ["index", "identifiers"] field_ids = [] alt_ids = {field_id: field_id for field_id in expanded_ids} for full_id in full_field_ids: try: field_id, alt_id = full_id.split("=") field_ids.append(field_id) alt_ids[field_id] = alt_id except ValueError: field_ids.append(full_id) alt_ids[full_id] = full_id fields = { "identifiers": fetch_field(args["DATASET"], "identifiers", meta) } for field_id in field_ids: if field_id == "plot": for axis in ["x", "z", "y", "cat"]: if axis in meta.plot: expanded_ids.append(meta.plot[axis]) alt_ids.update({meta.plot[axis]: meta.plot[axis]}) fields[meta.plot[axis]] = fetch_field( args["DATASET"], meta.plot[axis], meta) else: expanded_ids.append(field_id) alt_ids.update({field_id: field_id}) fields[field_id] = fetch_field(args["DATASET"], field_id, meta) table = [[alt_ids[field_id] for field_id in expanded_ids]] for i in indices: record = [] for field_id in expanded_ids: if field_id == "index": record.append(i) else: value = fields[field_id].values[i] if fields[field_id].keys: value = fields[field_id].keys[value] record.append(value) table.append(record) file_io.write_file(args["--table"], table) if args["--summary"]: summary_stats = {} for section in SUMMARY: requirements = True if section.get("requires"): for flag in section["requires"]: if not args[flag]: print( "WARN: '%s' must be set to generate '%s' summary" % (flag, section["title"])) requirements = False if not requirements: continue fields = {} if section.get("depends"): for field in section["depends"]: fields.update( {field: fetch_field(args["DATASET"], field, meta)}) if section["title"] == "hits": taxrule = args.get("--taxrule", None) if taxrule is None: taxrule = meta.plot.get("cat", None) if taxrule is not None: taxrule = re.sub(r"_[^_]+$", "", taxrule) args["--taxrule"] = taxrule else: continue field = "%s_%s" % (taxrule, args["--summary-rank"]) fields.update( {"hits": fetch_field(args["DATASET"], field, meta)}) if "y" in meta.plot: fields.update({ "cov": fetch_field(args["DATASET"], meta.plot["y"], meta) }) if section["title"] == "busco": lineages = [] for field in meta.list_fields(): if field.endswith("_busco"): lineages.append(field) fields.update( {field: fetch_field(args["DATASET"], field, meta)}) fields.update({"lineages": lineages}) if section["title"] == "readMapping": libraries = [] for field in meta.list_fields(): if field.endswith( "_cov") and not field.endswith("_read_cov"): library = field.replace("_cov", "") libraries.append(library) fields.update( 
{field: fetch_field(args["DATASET"], field, meta)}) fields.update({"libraries": libraries}) summary_stats.update({ section["title"]: section["module"].summarise(indices, fields, **args, meta=meta, stats=summary_stats) }) stats = {} if "hits" in summary_stats: nohit_span = 0 span = summary_stats["hits"]["total"]["span"] if "no-hit" in summary_stats["hits"]: nohit_span = summary_stats["hits"]["no-hit"]["span"] stats.update({"noHit": float("%.3f" % (nohit_span / span))}) else: stats.update({"noHit": 0}) if "taxonomy" in summary_stats and "target" in summary_stats[ "taxonomy"]: if summary_stats["taxonomy"]["target"] in summary_stats[ "hits"]: target_span = summary_stats["hits"][ summary_stats["taxonomy"]["target"]]["span"] stats.update({ "target": float("%.3f" % (target_span / (span - nohit_span))) }) elif "target" in summary_stats["hits"]: target_span = summary_stats["hits"]["target"]["span"] stats.update({ "target": float("%.3f" % (target_span / (span - nohit_span))) }) del summary_stats["hits"]["target"] else: stats.update({"target": 0}) ratio = (summary_stats["hits"]["total"]["span"] / summary_stats["hits"]["total"]["n50"]) if ratio >= 100: ratio = int(float("%.3g" % ratio)) else: ratio = float("%.3g" % ratio) stats.update({"spanOverN50": ratio}) summary_stats.update({"stats": stats}) file_io.write_file(args["--summary"], {"summaryStats": summary_stats})
    return report


if __name__ == '__main__':
    set_args = set(sys.argv[1:])
    if not set_args:
        help()
        sys.exit()
    domains = all_domains
    if 'all' not in set_args:
        domains = list(set_args & set(all_domains))
        if not domains:
            domain_not_supported()
            sys.exit()
    for domain in domains:
        print('Training LogisticRegression on {} domain \n'.format(domain))
        classifier, report = train(domain)
        write_file('data_train/' + domain + '/ensemble-report.txt', report)
        report = test(domain, classifier)
        write_file('data_test/' + domain + '/ensemble-report.txt', report)