def filter_by_json(identifiers, indices, json_file, invert):
    """Restrict *indices* using the identifier list in a JSON file.

    Keeps indices whose identifier appears in the file's 'identifiers'
    list, or — when *invert* is truthy — those whose identifier does not.
    """
    data = file_io.load_yaml(json_file)
    wanted = set(data['identifiers'])
    if invert:
        return [index for index in indices if identifiers[index] not in wanted]
    return [index for index in indices if identifiers[index] in wanted]
def fetch_metadata(path_to_dataset, **kwargs):
    """
    Load Metadata from file.

    fetch_metadata('tests/files/dataset')
    """
    dataset_id = path_to_dataset.split("/").pop()
    new_meta = {}
    meta = None
    if not os.path.exists(path_to_dataset):
        os.makedirs(path_to_dataset)
    if kwargs.get("--meta"):
        # User-supplied metadata overrides to merge into the stored metadata.
        new_meta = file_io.load_yaml(kwargs["--meta"])
    if (kwargs["--bed"] or kwargs["--fasta"]) and kwargs["--replace"]:
        # Rebuilding from raw inputs: clear any previously generated files.
        files = glob.glob("%s/*" % kwargs["DIRECTORY"])
        for file in files:
            os.remove(file)
    try:
        meta = kwargs["meta"]
    except KeyError:
        try:
            # NOTE(review): despite the name, load_yaml is used on JSON here
            # too — presumably it handles both formats; confirm in file_io.
            meta = file_io.load_yaml("%s/meta.json" % path_to_dataset)
        except ValueError:
            # Malformed meta file: fall through to a fresh metadata dict.
            pass
    if meta is None:
        meta = {}
    if "id" not in meta:
        meta["id"] = dataset_id
        meta["name"] = dataset_id
    # Merge overrides: dicts merge key-wise, lists extend, scalars replace.
    for key, value in new_meta.items():
        if isinstance(value, dict):
            try:
                meta[key].update({k: v for k, v in value.items()})
            except KeyError:
                meta[key] = value
        elif isinstance(value, list):
            # BUGFIX: `meta[key] += value` raised KeyError when the key was
            # absent; handle a missing key the same way as the dict branch.
            try:
                meta[key] += value
            except KeyError:
                meta[key] = value
        else:
            meta[key] = value
    return Metadata(dataset_id, **meta)
def parse_json_cov(json_file, **kwargs):
    """Parse coverage from JSON cov file.

    *json_file* has the form 'path/to/file.json=field_name'. Returns a dict
    with '<type>_id', '<type>_range' and '<type>' entries, or None when the
    field name has no recognised coverage suffix.
    """
    parts = json_file.split('=')
    base_name = parts[1]
    data = load_yaml(parts[0])
    covs = []
    if 'values' in data:
        for value in data['values']:
            # Round to 4 decimal places to keep output compact.
            covs.append(float("%.4f" % value))
    # Classify the field by suffix; also avoids shadowing builtin `type`.
    if base_name.endswith('_read_cov'):
        field_type = 'read_cov'
        parent = 'read_coverage'
        datatype = 'float'
        clamp = 1
    elif base_name.endswith('_cov'):
        field_type = 'cov'
        parent = 'base_coverage'
        datatype = 'integer'
        clamp = 0.01
    else:
        return None
    field_id = base_name
    fields = {}
    fields["%s_id" % field_type] = field_id
    # Widen the caller-supplied range to cover the new values.
    fields["%s_range" % field_type] = [
        min(covs + [kwargs["%s_range" % field_type][0]]),
        max(covs + [kwargs["%s_range" % field_type][1]])
    ]
    if kwargs['meta'].has_field(field_id):
        file_name = kwargs['meta'].field_meta(field_id)['file']
    else:
        file_name = json_file
    fields[field_type] = Variable(field_id,
                                  values=covs,
                                  meta={
                                      'field_id': field_id,
                                      'file': file_name
                                  },
                                  parents=[
                                      'children',
                                      {
                                          'id': parent,
                                          # BUGFIX: datatype was hard-coded to
                                          # 'integer', mistyping read_cov
                                          # parents; use the value selected
                                          # above ('float' for read_cov).
                                          'datatype': datatype,
                                          'clamp': clamp if fields["%s_range" % field_type][0] == 0 else False,
                                          'range': fields["%s_range" % field_type]
                                      },
                                      'children'
                                  ])
    return fields
def parse_json_cov(json_file, **kwargs):
    """Parse coverage from JSON cov file.

    *json_file* has the form "path/to/file.json=field_name". Returns a dict
    with "<type>_id", "<type>_range" and "<type>" entries, or None when the
    field name has no recognised coverage suffix.
    """
    parts = json_file.split("=")
    base_name = parts[1]
    data = load_yaml(parts[0])
    covs = []
    if "values" in data:
        for value in data["values"]:
            # Round to 4 decimal places to keep output compact.
            covs.append(float("%.4f" % value))
    # Classify the field by suffix; also avoids shadowing builtin `type`.
    if base_name.endswith("_read_cov"):
        field_type = "read_cov"
        parent = "read_coverage"
        datatype = "float"
        clamp = 1
    elif base_name.endswith("_cov"):
        field_type = "cov"
        parent = "base_coverage"
        datatype = "integer"
        clamp = 0.01
    else:
        return None
    field_id = base_name
    fields = {}
    fields["%s_id" % field_type] = field_id
    # Widen the caller-supplied range to cover the new values.
    fields["%s_range" % field_type] = [
        min(covs + [kwargs["%s_range" % field_type][0]]),
        max(covs + [kwargs["%s_range" % field_type][1]]),
    ]
    if kwargs["meta"].has_field(field_id):
        file_name = kwargs["meta"].field_meta(field_id)["file"]
    else:
        file_name = json_file
    fields[field_type] = Variable(
        field_id,
        values=covs,
        meta={
            "field_id": field_id,
            "file": file_name
        },
        parents=[
            "children",
            {
                "id": parent,
                # BUGFIX: datatype was hard-coded to "integer", mistyping
                # read_cov parents; use the value selected above.
                "datatype": datatype,
                "clamp": clamp if fields["%s_range" % field_type][0] == 0 else False,
                "range": fields["%s_range" % field_type],
            },
            "children",
        ],
    )
    return fields
def fetch_taxdump(path_to_taxdump):
    """Load Taxdump from file.

    Reads a previously parsed taxdump.json if present; otherwise parses the
    raw taxdump and caches the parsed values back to taxdump.json.
    """
    json_file = "%s/taxdump.json" % path_to_taxdump
    if Path(json_file).exists():
        print("Loading parsed taxdump")
    else:
        print("Parsing taxdump")
    data = file_io.load_yaml(json_file)
    if data is None:
        # No cached parse available: build from the raw files and cache it.
        taxdump = Taxdump(path_to_taxdump)
        file_io.write_file(json_file, taxdump.values_to_dict())
        return taxdump
    return Taxdump(path_to_taxdump, **data)
def fetch_metadata(path_to_dataset, **kwargs):
    """
    Load Metadata from file.

    fetch_metadata('tests/files/dataset')
    """
    dataset_id = path_to_dataset.split('/').pop()
    new_meta = {}
    if not os.path.exists(path_to_dataset):
        os.makedirs(path_to_dataset)
    if kwargs.get('--meta'):
        # User-supplied metadata overrides to merge into the stored metadata.
        new_meta = file_io.load_yaml(kwargs['--meta'])
    if kwargs['--replace']:
        # Replacing the dataset: clear any previously generated files.
        files = glob.glob("%s/*" % kwargs['DIRECTORY'])
        for file in files:
            os.remove(file)
    try:
        meta = kwargs['meta']
    except KeyError:
        try:
            # NOTE(review): despite the name, load_yaml is used on JSON here
            # too — presumably it handles both formats; confirm in file_io.
            meta = file_io.load_yaml("%s/meta.json" % path_to_dataset)
        except ValueError:
            # BUGFIX: a malformed meta.json previously propagated ValueError;
            # fall back to an empty metadata dict instead.
            meta = None
    if not meta:
        meta = {}
    if 'id' not in meta:
        meta['id'] = dataset_id
        meta['name'] = dataset_id
    # Merge overrides: dicts merge key-wise, lists extend, scalars replace.
    for key, value in new_meta.items():
        if isinstance(value, dict):
            try:
                meta[key].update({k: v for k, v in value.items()})
            except KeyError:
                meta[key] = value
        elif isinstance(value, list):
            # BUGFIX: `meta[key] += value` raised KeyError when the key was
            # absent; handle a missing key the same way as the dict branch.
            try:
                meta[key] += value
            except KeyError:
                meta[key] = value
        else:
            meta[key] = value
    return Metadata(dataset_id, **meta)
def fetch_field(path_to_dataset, field_id, meta=None):
    """
    Load fields from file.

    fetch_field('tests/files/dataset', 'identifiers', meta)

    Returns the constructed field object, or False when the field file is
    missing/empty or the field cannot be constructed.
    NOTE(review): *meta* defaults to None but `meta.field_meta` is called
    unconditionally — callers presumably always pass a Metadata; confirm.
    """
    field_meta = meta.field_meta(field_id)
    try:
        data = file_io.load_yaml("%s/%s.json" % (path_to_dataset, field_id))
        if data is None:
            # BUGFIX: make the missing/empty file case explicit instead of
            # relying on `**None` raising (which could leave `field` unbound
            # depending on block structure).
            return False
        data.update({"meta": field_meta})
        # TYPES maps a field type name to its constructor class.
        field = TYPES[field_meta["type"]](field_id, **data)
    except (TypeError, KeyError):
        # Unknown type or malformed data: report the field as unavailable.
        field = False
    return field
def parse(file, **kwargs):
    """Parse all synonym files.

    Convert a legacy blobDB (JSON) file into a list of parsed field objects
    (Identifier, Variable, Category and MultiArray instances), updating the
    dataset metadata (assembly stats and default plot axes) as a side effect.

    file   -- path to the blobDB JSON file.
    kwargs -- expects 'meta' (dataset metadata object with .assembly/.plot)
              and 'dependencies' (dict with an 'identifiers' field or falsy).
    """
    blob_db = file_io.load_yaml(file)
    # Record the source assembly filename in the dataset metadata.
    kwargs['meta'].assembly.update({'file': blob_db['assembly_f']})
    parsed = []
    identifiers = kwargs['dependencies']['identifiers']
    if not identifiers:
        # No identifiers supplied by the caller: derive them from the blobDB.
        identifiers = Identifier('identifiers',
                                 meta={'field_id': 'identifiers'},
                                 values=blob_db['order_of_blobs'],
                                 parents=[])
        kwargs['meta'].assembly.update(
            {'scaffold-count': len(identifiers.values)})
        parsed.append(identifiers)
    values = values_from_blob_db(blob_db)
    kwargs['meta'].assembly.update({'span': sum(values['lengths'])})
    # Per-sequence GC proportion.
    parsed.append(
        Variable('gc',
                 meta={
                     'preload': True,
                     'scale': 'scaleLinear',
                     'field_id': 'gc',
                     'name': 'GC',
                     'datatype': 'float',
                     'range': [min(values['gcs']), max(values['gcs'])]
                 },
                 values=values['gcs'],
                 parents=[]))
    _min = min(values['lengths'])
    # Sequence lengths; clamp the log scale at 100 if any length is zero.
    parsed.append(
        Variable('length',
                 meta={
                     'field_id': 'length',
                     'preload': True,
                     'scale': 'scaleLog',
                     'name': 'Length',
                     'clamp': 100 if _min == 0 else False,
                     'datatype': 'integer',
                     'range': [_min, max(values['lengths'])]
                 },
                 parents=[],
                 values=values['lengths']))
    # Count of N bases per sequence.
    parsed.append(
        Variable('ncount',
                 meta={
                     'field_id': 'ncount',
                     'scale': 'scaleLinear',
                     'name': 'N count',
                     'datatype': 'integer',
                     'range': [min(values['n_counts']), max(values['n_counts'])]
                 },
                 values=values['n_counts'],
                 parents=[]))
    # Default plot axes, only when the caller has not set them already.
    if 'z' not in kwargs['meta'].plot:
        kwargs['meta'].plot.update({'z': 'length'})
    if 'x' not in kwargs['meta'].plot:
        kwargs['meta'].plot.update({'x': 'gc'})
    # Running global min/max across all coverage libraries.
    cov_range = [math.inf, -math.inf]
    read_cov_range = [math.inf, -math.inf]
    for cov_lib, cov_meta in blob_db['covLibs'].items():
        cov_file_name = field_name_from_path(blob_db['covLibs'][cov_lib]['f'])
        covs = values["%s_cov" % cov_lib]
        read_covs = values["%s_read_cov" % cov_lib]
        # Widen the running ranges to include this library's values.
        cov_range = [min(covs + [cov_range[0]]), max(covs + [cov_range[1]])]
        read_cov_range = [
            min(read_covs + [read_cov_range[0]]),
            max(read_covs + [read_cov_range[1]])
        ]
        if 'y' not in kwargs['meta'].plot:
            kwargs['meta'].plot.update({'y': "%s_cov" % cov_file_name})
        # NOTE(review): `cov` and `hits` used below are presumably
        # module-level parser objects imported elsewhere in this file —
        # confirm they are in scope.
        parsed.append(
            Variable("%s_cov" % cov_file_name,
                     values=covs,
                     meta={
                         'field_id': "%s_cov" % cov_file_name,
                         'file': cov_meta['f']
                     },
                     parents=cov.parent() + [
                         'children',
                         {
                             'id': 'base_coverage',
                             'clamp': 1 if cov_range[0] == 0 else False,
                             'range': cov_range
                         },
                         'children'
                     ]))
        parsed.append(
            Variable("%s_read_cov" % cov_file_name,
                     values=read_covs,
                     meta={
                         'field_id': "%s_read_cov" % cov_file_name,
                         'file': cov_meta['f'],
                         'reads_mapped': cov_meta['reads_mapped'],
                         'reads_unmapped': cov_meta['reads_unmapped']
                     },
                     parents=cov.parent() + [
                         'children',
                         {
                             'id': 'read_coverage',
                             'datatype': 'integer',
                             'clamp': 1 if read_cov_range[0] == 0 else False,
                             'range': read_cov_range
                         },
                         'children'
                     ]))
    # Taxonomic ranks come from the first blob under the first taxrule;
    # assumes all blobs share the same rank keys — TODO confirm.
    ranks = blob_db['dict_of_blobs'][identifiers.values[0]]['taxonomy'][
        blob_db['taxrules'][0]].keys()
    for tax_rule in blob_db['taxrules']:
        if 'cat' not in kwargs['meta'].plot:
            kwargs['meta'].plot.update({'cat': "%s_phylum" % tax_rule})
        hit_list = hits_from_blob_db(blob_db, tax_rule)
        # Raw hit records (taxid/score pairs) for this taxrule.
        parsed.append(
            MultiArray("%s_hits" % tax_rule,
                       values=hit_list,
                       meta={
                           'field_id': "%s_hits" % tax_rule,
                           'type': 'multiarray',
                           'datatype': 'mixed',
                           'preload': False,
                           'active': False,
                           'files': [m['f'] for x, m in blob_db['hitLibs'].items()]
                       },
                       parents=hits.parent() + ['children', {
                           'id': tax_rule
                       }, 'children'],
                       category_slot=None,
                       headers=['taxid', 'score']))
        for rank in ranks:
            # Assigned taxon name at this rank.
            field_id = "%s_%s" % (tax_rule, rank)
            parsed.append(
                Category(field_id,
                         values=values[field_id],
                         meta={'field_id': field_id},
                         parents=hits.parent() + ['children', {
                             'id': tax_rule
                         }, 'children']))
            parents = hits.parent() + [
                'children', {
                    'id': tax_rule
                }, 'children', {
                    'id': field_id
                }, 'data'
            ]
            # Taxonomy c-index at this rank (field_id is rebound here).
            field_id = "%s_%s_cindex" % (tax_rule, rank)
            parsed.append(
                Variable(field_id,
                         values=values[field_id],
                         meta={
                             'scale': 'scaleLinear',
                             'field_id': field_id,
                             'datatype': 'integer',
                             'range': [min(values[field_id]), max(values[field_id])],
                             'preload': False,
                             'active': False
                         },
                         parents=parents))
            # Best hit score at this rank; clamp log scale when min is zero.
            field_id = "%s_%s_score" % (tax_rule, rank)
            _min = min(values[field_id])
            parsed.append(
                Variable(field_id,
                         values=values[field_id],
                         meta={
                             'scale': 'scaleLog',
                             'field_id': field_id,
                             'clamp': 1 if _min == 0 else False,
                             'datatype': 'float',
                             'range': [_min, max(values[field_id])],
                             'preload': False,
                             'active': False
                         },
                         parents=parents))
    return parsed