Ejemplo n.º 1
0
def parse_trnascan(trnascan_file, identifiers):
    """Parse tRNAscan results into a MultiArray.

    Header lines are scanned for the program version and run mode (the
    mode determines the field id); data rows contribute one
    [tRNA_type, anticodon] pair per contig.

    Raises:
        UserWarning: If contig names do not match the dataset identifiers.
    """
    raw = file_io.read_file(trnascan_file)
    in_header = True
    meta = {'file': trnascan_file}
    per_contig = defaultdict(list)
    for line in raw.split('\n'):
        if in_header:
            fields = re.split(' +', line)
            if len(fields) < 2:
                continue
            if fields[1].startswith('v.'):
                meta['version'] = fields[1]
            elif fields[1] == 'Mode:':
                meta['mode'] = fields[2]
                meta['field_id'] = "trnascan_%s" % fields[2].lower()
            elif fields[1].startswith('------'):
                # The dashed rule ends the header section.
                in_header = False
        else:
            fields = re.split(r' +|\t', line)
            # Data rows have exactly nine columns; type is col 4, anticodon col 5.
            if len(fields) == 9:
                per_contig[fields[0]].append([fields[4], fields[5]])
    if not identifiers.validate_list(list(per_contig.keys())):
        raise UserWarning('Contig names in the tRNAScan file did not match dataset identifiers.')
    # One entry per identifier, empty list where a contig has no tRNAs.
    values = [per_contig.get(seq_id, []) for seq_id in identifiers.values]
    return MultiArray(meta['field_id'],
                      values=values,
                      meta=meta,
                      headers=('tRNA_type', 'Anticodon'),
                      parents=['children'])
Ejemplo n.º 2
0
def parse_busco(busco_file, identifiers):  # pylint: disable=too-many-locals
    """Parse BUSCO full-table results into a MultiArray.

    Handles both pre-v4 and v4+ BUSCO table layouts, which differ in
    header position, column names, and how the sequence name is written.

    Args:
        busco_file: Path to a BUSCO full table file.
        identifiers: Dataset identifiers used to validate and order contigs.

    Returns:
        MultiArray with one list of [busco_id, status] pairs per contig,
        ordered to match identifiers.values.

    Raises:
        UserWarning: If contig names do not match the dataset identifiers.
    """
    data = file_io.read_file(busco_file)
    lines = data.split("\n")
    # Line 0 carries the BUSCO version, line 1 the lineage-set summary.
    version = lines[0].split(":")[1].strip()
    desc = re.split(r":\s*|\(|\)\s*|,\s*", lines[1])
    meta = {
        "version": version,
        "set": desc[1].strip(),
        # The summary line carries two counts; keep the larger.
        "count": max(int(desc[5].strip()), int(desc[7].strip())),
        "file": busco_file,
    }
    # Reduce to the major version for layout selection.
    version = int(version.split(".")[0])
    if version < 4:
        # Pre-v4: data rows start at line 5, column headers on line 4.
        rows = [re.split("\t", line) for line in lines[5:]]
        # Recover the lineage set (e.g. 'eukaryota_odb9') from the '-l'
        # argument recorded in the command line on line 2.
        meta["set"] = re.search(
            r"-l\s.*?\/*(\w+_odb\d+)\/", lines[2].split(":")[1].strip()
        )[1]
        columns = re.split(r"# |\t", lines[4])[1:]
        # The sequence column was renamed between releases; try both.
        try:
            contig_index = columns.index("Contig")
        except ValueError:
            contig_index = columns.index("Sequence")
    else:
        # v4+: data rows start at line 3, column headers on line 2.
        rows = [re.split("\t", line) for line in lines[3:]]
        columns = re.split(r"# |\t", lines[2])[1:]
        contig_index = columns.index("Sequence")
    meta["field_id"] = "%s_busco" % meta["set"]
    busco_index = columns.index("Busco id")
    status_index = columns.index("Status")
    results = defaultdict(list)
    for row in rows:
        # Ignore rows too short to carry a sequence name.
        if len(row) > contig_index:
            if version < 4:
                contig = row[contig_index]
            else:
                # Strip anything after ':' from the sequence name.
                contig = row[contig_index].split(":")[0]
            results[contig].append([row[busco_index], row[status_index]])
    if not identifiers.validate_list(list(results.keys())):
        raise UserWarning(
            "Contig names in the Busco file did not match dataset identifiers."
        )
    # One entry per identifier, empty list where a contig has no BUSCOs.
    values = [results[id] if id in results else [] for id in identifiers.values]
    busco_field = MultiArray(
        meta["field_id"],
        values=values,
        meta=meta,
        headers=("Busco id", "Status"),
        parents=["children"],
        category_slot=1,
    )
    return busco_field
Ejemplo n.º 3
0
def parse_busco(busco_file, identifiers):
    """Parse BUSCO results into a MultiArray.

    Reads a pre-v4 BUSCO full table: metadata from the first three lines,
    column headers from line 4, data rows from line 5 on.

    Raises:
        UserWarning: If contig names do not match the dataset identifiers.
    """
    raw = file_io.read_file(busco_file)
    lines = raw.split('\n')
    # Line 1 holds the lineage-set summary; split once and reuse.
    summary = re.split(r':|\(|\)', lines[1])
    meta = {
        'version': lines[0].split(':')[1].strip(),
        'set': summary[1].strip(),
        'count': int(summary[5].strip()),
        'command': lines[2].split(':')[1].strip(),
        'file': busco_file
    }
    # Replace the display name with the odb set parsed from the '-l' argument.
    meta['set'] = re.search(r'-l\s.*?\/*(\w+_odb\d+)\/', meta['command'])[1]
    meta['field_id'] = "%s_busco" % meta['set']
    column_names = re.split(r'# |\t', lines[4])[1:]
    busco_col = column_names.index('Busco id')
    status_col = column_names.index('Status')
    contig_col = column_names.index('Contig')
    per_contig = defaultdict(list)
    for line in lines[5:]:
        fields = line.split('\t')
        # Rows too short to carry a contig name are skipped.
        if len(fields) > contig_col:
            per_contig[fields[contig_col]].append(
                [fields[busco_col], fields[status_col]])
    if not identifiers.validate_list(list(per_contig.keys())):
        raise UserWarning(
            'Contig names in the Busco file did not match dataset identifiers.'
        )
    # One entry per identifier, empty list where a contig has no BUSCOs.
    values = [per_contig.get(seq_id, []) for seq_id in identifiers.values]
    return MultiArray(meta['field_id'],
                      values=values,
                      meta=meta,
                      headers=('Busco id', 'Status'),
                      parents=['children'],
                      category_slot=1)
Ejemplo n.º 4
0
def parse(file, **kwargs):
    """Parse a blobDB YAML file into a list of Fields.

    Loads the blobDB with file_io.load_yaml and emits identifiers, GC,
    length, N-count, per-library coverage and per-taxrule taxonomy fields,
    updating assembly and plot metadata in kwargs['meta'] as it goes.

    Args:
        file: Path to the blobDB YAML file.
        **kwargs: Pipeline options; uses 'dependencies' (identifiers) and
            'meta' (assembly/plot metadata, mutated in place).

    Returns:
        List of parsed Field objects.
    """
    blob_db = file_io.load_yaml(file)
    kwargs['meta'].assembly.update({'file': blob_db['assembly_f']})
    parsed = []
    identifiers = kwargs['dependencies']['identifiers']
    if not identifiers:
        # No identifiers supplied; derive them from the blobDB record order.
        identifiers = Identifier('identifiers',
                                 meta={'field_id': 'identifiers'},
                                 values=blob_db['order_of_blobs'],
                                 parents=[])
        kwargs['meta'].assembly.update(
            {'scaffold-count': len(identifiers.values)})
        parsed.append(identifiers)
    values = values_from_blob_db(blob_db)
    kwargs['meta'].assembly.update({'span': sum(values['lengths'])})
    parsed.append(
        Variable('gc',
                 meta={
                     'preload': True,
                     'scale': 'scaleLinear',
                     'field_id': 'gc',
                     'name': 'GC',
                     'datatype': 'float',
                     'range': [min(values['gcs']),
                               max(values['gcs'])]
                 },
                 values=values['gcs'],
                 parents=[]))
    _min = min(values['lengths'])
    parsed.append(
        Variable('length',
                 meta={
                     'field_id': 'length',
                     'preload': True,
                     'scale': 'scaleLog',
                     'name': 'Length',
                     # Log scale cannot show zero-length records; clamp instead.
                     'clamp': 100 if _min == 0 else False,
                     'datatype': 'integer',
                     'range': [_min, max(values['lengths'])]
                 },
                 parents=[],
                 values=values['lengths']))
    parsed.append(
        Variable('ncount',
                 meta={
                     'field_id': 'ncount',
                     'scale': 'scaleLinear',
                     'name': 'N count',
                     'datatype': 'integer',
                     'range':
                     [min(values['n_counts']),
                      max(values['n_counts'])]
                 },
                 values=values['n_counts'],
                 parents=[]))
    # Default plot axes if not already chosen by an earlier parser.
    if 'z' not in kwargs['meta'].plot:
        kwargs['meta'].plot.update({'z': 'length'})
    if 'x' not in kwargs['meta'].plot:
        kwargs['meta'].plot.update({'x': 'gc'})
    # Accumulate global ranges across all coverage libraries.
    cov_range = [math.inf, -math.inf]
    read_cov_range = [math.inf, -math.inf]
    for cov_lib, cov_meta in blob_db['covLibs'].items():
        cov_file_name = field_name_from_path(blob_db['covLibs'][cov_lib]['f'])
        covs = values["%s_cov" % cov_lib]
        read_covs = values["%s_read_cov" % cov_lib]
        cov_range = [min(covs + [cov_range[0]]), max(covs + [cov_range[1]])]
        read_cov_range = [
            min(read_covs + [read_cov_range[0]]),
            max(read_covs + [read_cov_range[1]])
        ]
        # First coverage library becomes the default y axis.
        if 'y' not in kwargs['meta'].plot:
            kwargs['meta'].plot.update({'y': "%s_cov" % cov_file_name})
        parsed.append(
            Variable("%s_cov" % cov_file_name,
                     values=covs,
                     meta={
                         'field_id': "%s_cov" % cov_file_name,
                         'file': cov_meta['f']
                     },
                     parents=cov.parent() + [
                         'children', {
                             'id': 'base_coverage',
                             'clamp': 1 if cov_range[0] == 0 else False,
                             'range': cov_range
                         }, 'children'
                     ]))
        parsed.append(
            Variable("%s_read_cov" % cov_file_name,
                     values=read_covs,
                     meta={
                         'field_id': "%s_read_cov" % cov_file_name,
                         'file': cov_meta['f'],
                         'reads_mapped': cov_meta['reads_mapped'],
                         'reads_unmapped': cov_meta['reads_unmapped']
                     },
                     parents=cov.parent() + [
                         'children', {
                             'id': 'read_coverage',
                             'datatype': 'integer',
                             'clamp': 1 if read_cov_range[0] == 0 else False,
                             'range': read_cov_range
                         }, 'children'
                     ]))
    # Taxonomic ranks are read from the first blob's taxonomy entry.
    # NOTE(review): assumes the blobDB has at least one blob and one taxrule.
    ranks = blob_db['dict_of_blobs'][identifiers.values[0]]['taxonomy'][
        blob_db['taxrules'][0]].keys()
    for tax_rule in blob_db['taxrules']:
        if 'cat' not in kwargs['meta'].plot:
            kwargs['meta'].plot.update({'cat': "%s_phylum" % tax_rule})
        hit_list = hits_from_blob_db(blob_db, tax_rule)
        parsed.append(
            MultiArray("%s_hits" % tax_rule,
                       values=hit_list,
                       meta={
                           'field_id': "%s_hits" % tax_rule,
                           'type': 'multiarray',
                           'datatype': 'mixed',
                           'preload': False,
                           'active': False,
                           'files':
                           [m['f'] for x, m in blob_db['hitLibs'].items()]
                       },
                       parents=hits.parent() +
                       ['children', {
                           'id': tax_rule
                       }, 'children'],
                       category_slot=None,
                       headers=['taxid', 'score']))
        # For each rank: a Category plus cindex/score Variables nested
        # under that category's 'data' node.
        for rank in ranks:
            field_id = "%s_%s" % (tax_rule, rank)
            parsed.append(
                Category(field_id,
                         values=values[field_id],
                         meta={'field_id': field_id},
                         parents=hits.parent() +
                         ['children', {
                             'id': tax_rule
                         }, 'children']))
            parents = hits.parent() + [
                'children', {
                    'id': tax_rule
                }, 'children', {
                    'id': field_id
                }, 'data'
            ]
            field_id = "%s_%s_cindex" % (tax_rule, rank)
            parsed.append(
                Variable(field_id,
                         values=values[field_id],
                         meta={
                             'scale':
                             'scaleLinear',
                             'field_id':
                             field_id,
                             'datatype':
                             'integer',
                             'range':
                             [min(values[field_id]),
                              max(values[field_id])],
                             'preload':
                             False,
                             'active':
                             False
                         },
                         parents=parents))
            field_id = "%s_%s_score" % (tax_rule, rank)
            _min = min(values[field_id])
            parsed.append(
                Variable(field_id,
                         values=values[field_id],
                         meta={
                             'scale': 'scaleLog',
                             'field_id': field_id,
                             # Log scale cannot show zero scores; clamp instead.
                             'clamp': 1 if _min == 0 else False,
                             'datatype': 'float',
                             'range': [_min, max(values[field_id])],
                             'preload': False,
                             'active': False
                         },
                         parents=parents))

    return parsed
Ejemplo n.º 5
0
def create_fields(results, taxrule, files, fields=None):
    """Store BLAST results as Fields.

    Builds one shared hit-positions MultiArray (from the first result),
    then per-result Category, cindex/score Variables, a positions
    MultiArray linked to the shared hits, and any windows_* MultiArrays.

    Args:
        results: Parsed BLAST results; results[0]['data']['hits'] supplies
            the shared hit positions.
        taxrule: Taxrule name used as the parent field id.
        files: Source file names recorded in the hits metadata.
        fields: Optional list to append to; a new list is created if None.

    Returns:
        The list of Field objects.
    """
    if fields is None:
        fields = []
    hits_id = "%s_%s" % (taxrule, "positions")
    fields.append(
        MultiArray(
            hits_id,
            values=results[0]["data"]["hits"],
            meta={
                "field_id": hits_id,
                "name": hits_id,
                "type": "multiarray",
                "datatype": "mixed",
                "preload": False,
                "active": False,
                "files": files,
            },
            parents=["children", {
                "id": taxrule
            }, "children"],
            category_slot=None,
            headers=[
                "taxid", "start", "end", "score", "subject", "index", "title"
            ],
        ))
    for result in results:
        main = Category(
            result["field_id"],
            values=result["values"],
            meta={
                "field_id": result["field_id"],
                "name": result["field_id"]
            },
            parents=["children", {
                "id": taxrule
            }, "children"],
        )
        fields.append(main)
        # Nested variables live under the category's 'data' node.
        parents = [
            "children",
            {
                "id": taxrule
            },
            "children",
            {
                "id": result["field_id"]
            },
            "data",
        ]
        field_id = "%s_%s" % (result["field_id"], "cindex")
        fields.append(
            Variable(
                field_id,
                values=result["data"]["cindex"],
                meta={
                    "scale": "scaleLinear",
                    "field_id": field_id,
                    "name": field_id,
                    "datatype": "integer",
                    "range": [
                        min(result["data"]["cindex"]),
                        max(result["data"]["cindex"]),
                    ],
                    "preload": False,
                    "active": False,
                },
                parents=parents,
            ))
        field_id = "%s_%s" % (result["field_id"], "score")
        _min = min(result["data"]["score"])
        fields.append(
            Variable(
                field_id,
                values=result["data"]["score"],
                meta={
                    "scale": "scaleLog",
                    "field_id": field_id,
                    "name": field_id,
                    # Log scale cannot display zero; clamp when scores reach 0.
                    "clamp": 1 if _min == 0 else False,
                    "datatype": "float",
                    "range": [_min, max(result["data"]["score"])],
                    "preload": False,
                    "active": False,
                },
                parents=parents,
            ))
        subfield = "positions"
        field_id = "%s_%s" % (result["field_id"], subfield)
        # FIX: both branches of the original if/else assigned ["name"], so
        # the length test was dead code; a single assignment is equivalent.
        headers = ["name"]
        fields.append(
            MultiArray(
                field_id,
                values=result["data"][subfield],
                fixed_keys=main.keys,
                meta={
                    "field_id": field_id,
                    "name": field_id,
                    "type": "multiarray",
                    "datatype": "string",
                    "preload": False,
                    "active": False,
                    "linked_field": hits_id,
                },
                parents=parents,
                category_slot=0,
                headers=headers,
            ))
        for subfield in result["data"]:
            if subfield.startswith("windows"):
                field_id = "%s_%s" % (result["field_id"], subfield)
                # FIX: identical dead if/else collapsed here as well.
                headers = ["name"]
                fields.append(
                    MultiArray(
                        field_id,
                        values=result["data"][subfield],
                        fixed_keys=main.keys,
                        meta={
                            "field_id": field_id,
                            "name": field_id,
                            "type": "array",
                            "datatype": "string",
                            "preload": False,
                            "active": False,
                        },
                        parents=parents,
                        category_slot=0,
                        headers=headers,
                    ))

    return fields
Ejemplo n.º 6
0
def parse(files, **kwargs):
    """Parse BED or TSV window/variable files into Fields.

    Reads one or more BED/TSV files (or every matching file in a
    directory), then builds a Variable per parsed field plus optional
    per-sequence stats Arrays and windowed MultiArrays.

    Args:
        files: List of file paths or a directory containing them.
        **kwargs: Pipeline options; uses 'dependencies' (identifiers) and
            'meta' (assembly/plot metadata, mutated in place).

    Returns:
        List of parsed Field objects.
    """
    # FIX: the original condition tested the same flag twice
    # ('"--bedtsvdir" in kwargs or "--bedtsvdir" in kwargs');
    # a single membership test is equivalent.
    if "--bedtsvdir" in kwargs:
        if isinstance(files, str) and path.isdir(files):
            print("Reading all TSV files in %s" % files)
            files = glob("%s/*.tsv" % files)
        filename, all_windows, full = parse_tsvfiles(files)
        filenames = {"all": filename}
    else:
        if isinstance(files, str) and path.isdir(files):
            print("Reading all BED files in %s" % files)
            files = glob("%s/*.bed" % files)
        filenames, all_windows, full = parse_bedfiles(files)
    # The parsers may return either plain value dicts or
    # {"values", "sd", "n"} bundles; normalise to separate dicts.
    full_n = {}
    full_sd = {}
    if isinstance(full, dict):
        full_sd = full["sd"]
        full_n = full["n"]
        full = full["values"]
    all_windows_n = {}
    all_windows_sd = {}
    if isinstance(all_windows, dict):
        all_windows_n = all_windows["n"]
        all_windows_sd = all_windows["sd"]
        all_windows = all_windows["values"]
    parsed = []
    settings = field_settings()
    identifiers = kwargs["dependencies"]["identifiers"]
    keys = []
    if "length" in full:
        keys = list(full["length"].keys())
        lengths = list(full["length"].values())
        kwargs["meta"].assembly.update({"span": sum(lengths)})
        if "z" not in kwargs["meta"].plot:
            kwargs["meta"].plot.update({"z": "length"})
    if "gc" in full and "x" not in kwargs["meta"].plot:
        kwargs["meta"].plot.update({"x": "gc"})
    if not identifiers:
        # No identifiers from dependencies; fall back to the length keys.
        if not keys:
            print("ERROR: Unable to set identifiers")
            sys.exit(1)
        identifiers = Identifier(
            "identifiers",
            meta={"field_id": "identifiers"},
            values=keys,
            parents=[],
        )
        kwargs["meta"].assembly.update({"scaffold-count": len(identifiers.values)})
        parsed.append(identifiers)
    # Track, per field suffix, the widest value range seen so far so only
    # one field per suffix keeps preload=True.
    ranges = {
        key: {"range": [math.inf, -math.inf], "meta": {}} for key in settings.keys()
    }
    for field, data in full.items():
        filename = filenames.get(field, filenames.get("all", ""))
        if data:
            values = []
            for seq_id in identifiers.values:
                values.append(data[seq_id] if seq_id in data else 0)
            if values:
                meta = {}
                parents = []
                suffix = field.split("_")[-1]
                if suffix in settings:
                    # Deep copy so per-field edits don't leak into settings.
                    meta = deepcopy(settings[suffix]["meta"])
                    meta.update(
                        {
                            "field_id": field,
                            "file": filename,
                        }
                    )
                    if meta["datatype"] == "integer":
                        values = [int(value) for value in values]
                    value_range = [min(values), max(values)]
                    if "clamp" in meta and value_range[0] >= meta["clamp"]:
                        meta["clamp"] = False
                    parent_range = False
                    if "parents" in settings[suffix]:
                        parents = settings[suffix]["parents"]
                        # Widen any shared parent range in place.
                        for parent in parents:
                            if "range" in parent:
                                parent_range = True
                                parent["range"][0] = min(
                                    parent["range"][0], value_range[0]
                                )
                                parent["range"][1] = max(
                                    parent["range"][1], value_range[1]
                                )
                    if not parent_range:
                        if "range" in meta:
                            meta["range"][0] = min(meta["range"][0], value_range[0])
                            meta["range"][1] = max(meta["range"][1], value_range[1])
                        else:
                            meta["range"] = value_range
                    if meta["range"][1] <= meta["range"][0]:
                        # Constant-valued field: skip it entirely.
                        continue
                    if "preload" in meta and meta["preload"] == 1:
                        # Keep preload only on the field with the largest
                        # maximum value for this suffix; demote any earlier
                        # winner.
                        if value_range[1] > ranges[suffix]["range"][1]:
                            meta["preload"] = True
                            if "plot_axis" in settings[suffix]:
                                kwargs["meta"].plot.update(
                                    {settings[suffix]["plot_axis"]: field}
                                )
                            if "preload" in ranges[suffix]["meta"]:
                                ranges[suffix]["meta"]["preload"] = False
                            ranges[suffix].update({"range": value_range, "meta": meta})
                        else:
                            meta["preload"] = False
                    if field.endswith("_%s" % suffix):
                        meta["name"] = "%s %s" % (
                            field.replace("_%s" % suffix, ""),
                            meta["name"],
                        )
                parsed.append(
                    Variable(
                        field,
                        meta=meta,
                        values=values,
                        parents=parents,
                    )
                )
                if field in full_sd:
                    # NOTE(review): meta["name"] below assumes the suffix was
                    # recognised in settings; a stats field with an unknown
                    # suffix would raise KeyError — confirm upstream guarantees.
                    stats_values = []
                    for seq_id in identifiers.values:
                        values = (
                            [full_sd[field][seq_id], full_n[field][seq_id]]
                            if seq_id in data
                            else []
                        )
                        stats_values.append(values)
                    parsed.append(
                        Array(
                            "%s_stats" % field,
                            meta={
                                "field_id": "%s_stats" % field,
                                "name": "%s stats" % meta["name"],
                                "type": "array",
                                "datatype": "mixed",
                            },
                            values=stats_values,
                            parents=parents,
                            headers=["sd", "n"],
                        )
                    )
                for window, windows in all_windows.items():
                    windows_sd = all_windows_sd.get(window, {})
                    windows_n = all_windows_n.get(window, {})
                    if field in windows:
                        window_values = []
                        headers = [field]
                        if field in windows_sd:
                            headers += ["sd", "n"]
                        for seq_id in identifiers.values:
                            seq_values = []
                            if seq_id in data:
                                for idx, value in enumerate(windows[field][seq_id]):
                                    if meta["datatype"] == "integer":
                                        value = int(value)
                                    if field in windows_sd:
                                        value = [
                                            value,
                                            windows_sd[field][seq_id][idx],
                                            windows_n[field][seq_id][idx],
                                        ]
                                    else:
                                        value = [value]
                                    seq_values.append(value)
                            window_values.append(seq_values)
                        windows_field = "%s_windows" % field
                        # The default 0.1 window keeps the bare field id.
                        if str(window) != "0.1":
                            windows_field += "_%s" % str(window)
                        parsed.append(
                            MultiArray(
                                windows_field,
                                meta={
                                    "field_id": windows_field,
                                    "name": "%s windows %s" % (meta["name"], window),
                                    "type": "multiarray",
                                    "datatype": "mixed",
                                },
                                values=window_values,
                                parents=parents,
                                headers=headers,
                            )
                        )
    return parsed
Ejemplo n.º 7
0
def create_fields(results, taxrule, files, fields=None):
    """Store BLAST results as Fields.

    Builds one shared hit-positions MultiArray (from the first result),
    then per-result Category, cindex/score Variables and a positions
    MultiArray linked to the shared hits.

    Args:
        results: Parsed BLAST results; results[0]['data']['hits'] supplies
            the shared hit positions.
        taxrule: Taxrule name used as the parent field id.
        files: Source file names recorded in the hits metadata.
        fields: Optional list to append to; a new list is created if None.

    Returns:
        The list of Field objects.
    """
    if fields is None:
        fields = []
    hits_id = "%s_%s" % (taxrule, 'positions')
    fields.append(
        MultiArray(
            hits_id,
            values=results[0]['data']['hits'],
            meta={
                'field_id': hits_id,
                'name': hits_id,
                'type': 'multiarray',
                'datatype': 'mixed',
                'preload': False,
                'active': False,
                'files': files
            },
            parents=['children', {
                'id': taxrule
            }, 'children'],
            category_slot=None,
            headers=['taxid', 'start', 'end', 'score', 'subject', 'index']))
    for result in results:
        main = Category(result['field_id'],
                        values=result['values'],
                        meta={
                            'field_id': result['field_id'],
                            'name': result['field_id']
                        },
                        parents=['children', {
                            'id': taxrule
                        }, 'children'])
        fields.append(main)
        # Nested variables live under the category's 'data' node.
        parents = [
            'children', {
                'id': taxrule
            }, 'children', {
                'id': result['field_id']
            }, 'data'
        ]
        field_id = "%s_%s" % (result['field_id'], 'cindex')
        fields.append(
            Variable(field_id,
                     values=result['data']['cindex'],
                     meta={
                         'scale': 'scaleLinear',
                         'field_id': field_id,
                         'name': field_id,
                         'datatype': 'integer',
                         'range': [
                             min(result['data']['cindex']),
                             max(result['data']['cindex'])
                         ],
                         'preload': False,
                         'active': False
                     },
                     parents=parents))
        field_id = "%s_%s" % (result['field_id'], 'score')
        _min = min(result['data']['score'])
        fields.append(
            Variable(field_id,
                     values=result['data']['score'],
                     meta={
                         'scale': 'scaleLog',
                         'field_id': field_id,
                         'name': field_id,
                         # Log scale cannot display zero; clamp when needed.
                         'clamp': 1 if _min == 0 else False,
                         'datatype': 'float',
                         'range': [_min, max(result['data']['score'])],
                         'preload': False,
                         'active': False
                     },
                     parents=parents))
        subfield = 'positions'
        field_id = "%s_%s" % (result['field_id'], subfield)
        # FIX: both branches of the original if/else assigned ['name'], so
        # the length test was dead code; a single assignment is equivalent.
        headers = ['name']
        fields.append(
            MultiArray(field_id,
                       values=result['data'][subfield],
                       fixed_keys=main.keys,
                       meta={
                           'field_id': field_id,
                           'name': field_id,
                           'type': 'multiarray',
                           'datatype': 'string',
                           'preload': False,
                           'active': False,
                           'linked_field': hits_id
                       },
                       parents=parents,
                       category_slot=0,
                       headers=headers))
    return fields