Esempio n. 1
0
def parse_json_cov(json_file, **kwargs):
    """Parse coverage values from a JSON coverage file.

    Args:
        json_file: Specifier of the form ``<path>=<field name>``. The field
            name has to end in ``_read_cov`` or ``_cov``.
        **kwargs: Must provide ``meta`` (an object exposing ``has_field`` and
            ``field_meta``) and the running ``read_cov_range`` or
            ``cov_range`` pair that matches the field name.

    Returns:
        A dict holding ``<kind>_id``, ``<kind>_range`` and ``<kind>`` (where
        ``kind`` is ``read_cov`` or ``cov``), or ``None`` when the field name
        carries neither suffix.

    Raises:
        IndexError: If ``json_file`` contains no ``=`` separator.
    """
    parts = json_file.split('=')
    base_name = parts[1]
    # Classify the field before reading anything: an unsupported name is
    # rejected without loading a file whose contents would be thrown away.
    # '_read_cov' has to be tested first because it also ends in '_cov'.
    if base_name.endswith('_read_cov'):
        kind, parent, clamp = 'read_cov', 'read_coverage', 1
    elif base_name.endswith('_cov'):
        kind, parent, clamp = 'cov', 'base_coverage', 0.01
    else:
        return None
    data = load_yaml(parts[0])
    covs = []
    if 'values' in data:
        # Formatting and re-parsing keeps four decimal places per value.
        covs = [float("%.4f" % value) for value in data['values']]
    field_id = base_name
    range_key = "%s_range" % kind
    # Widen the range passed in by the caller with the values just read.
    value_range = [
        min(covs + [kwargs[range_key][0]]),
        max(covs + [kwargs[range_key][1]])
    ]
    fields = {"%s_id" % kind: field_id, range_key: value_range}
    meta = kwargs['meta']
    if meta.has_field(field_id):
        # Keep the file already recorded for an existing field.
        file_name = meta.field_meta(field_id)['file']
    else:
        file_name = json_file
    fields[kind] = Variable(
        field_id,
        values=covs,
        meta={
            'field_id': field_id,
            'file': file_name
        },
        parents=[
            'children', {
                'id': parent,
                # NOTE(review): both kinds are tagged 'integer'; the original
                # computed a per-kind datatype but never used it - confirm
                # which value is intended before changing this.
                'datatype': 'integer',
                # The clamp only applies when the range starts at zero.
                'clamp': clamp if value_range[0] == 0 else False,
                'range': value_range
            }, 'children'
        ])
    return fields
Esempio n. 2
0
def parse_json_cov(json_file, **kwargs):
    """Parse coverage values from a JSON coverage file.

    Args:
        json_file: Specifier of the form ``<path>=<field name>``. The field
            name has to end in ``_read_cov`` or ``_cov``.
        **kwargs: Must provide ``meta`` (an object exposing ``has_field`` and
            ``field_meta``) and the running ``read_cov_range`` or
            ``cov_range`` pair that matches the field name.

    Returns:
        A dict holding ``<kind>_id``, ``<kind>_range`` and ``<kind>`` (where
        ``kind`` is ``read_cov`` or ``cov``), or ``None`` when the field name
        carries neither suffix.

    Raises:
        IndexError: If ``json_file`` contains no ``=`` separator.
    """
    parts = json_file.split("=")
    base_name = parts[1]
    # Resolve the field kind first so that an unsupported name returns before
    # any file is read. Order matters: "_read_cov" also ends in "_cov".
    known_kinds = (
        ("_read_cov", "read_cov", "read_coverage", 1),
        ("_cov", "cov", "base_coverage", 0.01),
    )
    for suffix, kind, parent, clamp in known_kinds:
        if base_name.endswith(suffix):
            break
    else:
        return None
    data = load_yaml(parts[0])
    covs = []
    if "values" in data:
        # Formatting and re-parsing keeps four decimal places per value.
        covs = [float("%.4f" % value) for value in data["values"]]
    field_id = base_name
    range_key = "%s_range" % kind
    # Extend the caller's running range with the values from this file.
    value_range = [
        min(covs + [kwargs[range_key][0]]),
        max(covs + [kwargs[range_key][1]]),
    ]
    fields = {"%s_id" % kind: field_id, range_key: value_range}
    meta = kwargs["meta"]
    if meta.has_field(field_id):
        # Keep the file already recorded for an existing field.
        file_name = meta.field_meta(field_id)["file"]
    else:
        file_name = json_file
    fields[kind] = Variable(
        field_id,
        values=covs,
        meta={
            "field_id": field_id,
            "file": file_name
        },
        parents=[
            "children",
            {
                "id": parent,
                # NOTE(review): both kinds are tagged "integer"; the original
                # computed a per-kind datatype but never used it - confirm
                # which value is intended before changing this.
                "datatype": "integer",
                # The clamp only applies when the range starts at zero.
                "clamp": clamp if value_range[0] == 0 else False,
                "range": value_range,
            },
            "children",
        ],
    )
    return fields
Esempio n. 3
0
def parse(file, **kwargs):
    """Load sequences from a FASTA file and build GC, length and N-count fields.

    Each record streamed by ``file_io.stream_fasta`` contributes its length
    and the two values returned by ``base_composition``. When no identifiers
    dependency is supplied, an ``Identifier`` field is created from the
    sequence ids in file order and the scaffold count is recorded.

    Args:
        file: Path of the sequence file to stream.
        **kwargs: Requires ``dependencies`` (providing ``identifiers``) and
            ``meta`` (its ``assembly`` and ``plot`` mappings are updated).

    Returns:
        List of created fields: the optional identifiers field followed by the
        ``gc``, ``length`` and ``ncount`` variables.
    """
    print("Loading sequences from %s" % file)
    # seq_id -> (length, GC portion, N count), in file order.
    per_sequence = OrderedDict()
    progress = tqdm(file_io.stream_fasta(file))
    for seq_id, seq_str in progress:
        progress.set_description(" - processing %s" % seq_id)
        gc_portion, n_count = base_composition(seq_str)
        per_sequence[seq_id] = (len(seq_str), gc_portion, n_count)

    parsed = []
    dataset_meta = kwargs["meta"]
    identifiers = kwargs["dependencies"]["identifiers"]
    if not identifiers:
        identifiers = Identifier(
            "identifiers",
            meta={"field_id": "identifiers"},
            values=list(per_sequence),
            parents=[],
        )
        dataset_meta.assembly.update({"scaffold-count": len(identifiers.values)})
        parsed.append(identifiers)

    # Align the statistics with the identifier order; ids missing from the
    # file get zeros for all three values.
    absent = (0, 0, 0)
    rows = [per_sequence.get(seq_id, absent) for seq_id in identifiers.values]
    lengths = [row[0] for row in rows]
    gc_portions = [row[1] for row in rows]
    n_counts = [row[2] for row in rows]
    dataset_meta.assembly.update({"span": sum(lengths)})

    gc_field = Variable(
        "gc",
        meta={
            "field_id": "gc",
            "preload": True,
            "scale": "scaleLinear",
            "name": "GC",
            "datatype": "float",
            "range": [min(gc_portions), max(gc_portions)],
        },
        values=gc_portions,
        parents=[],
    )
    parsed.append(gc_field)

    shortest = min(lengths)
    length_field = Variable(
        "length",
        meta={
            "preload": True,
            "scale": "scaleLog",
            "field_id": "length",
            "name": "Length",
            # The clamp is only set when the shortest length is zero.
            "clamp": 1 if shortest == 0 else False,
            "datatype": "integer",
            "range": [shortest, max(lengths)],
        },
        values=lengths,
        parents=[],
    )
    parsed.append(length_field)

    ncount_field = Variable(
        "ncount",
        meta={
            "scale": "scaleLinear",
            "field_id": "ncount",
            "name": "N count",
            "datatype": "integer",
            "range": [min(n_counts), max(n_counts)],
        },
        values=n_counts,
        parents=[],
    )
    parsed.append(ncount_field)

    # Only fill plot axes that have not been assigned yet.
    for axis, default_field in (("x", "gc"), ("z", "length")):
        if axis not in dataset_meta.plot:
            dataset_meta.plot.update({axis: default_field})
    return parsed
Esempio n. 4
0
def parse(file, **kwargs):
    """Build dataset fields from a blob_db file.

    The file is read with ``file_io.load_yaml`` and converted into an
    identifiers field (only when none is supplied), ``gc``/``length``/
    ``ncount`` variables, a coverage and a read-coverage variable for every
    entry of ``covLibs`` and, for every tax rule, a hits array followed by a
    category, cindex and score field per rank.

    Args:
        file: Path handed to ``file_io.load_yaml``.
        **kwargs: Requires ``meta`` (its ``assembly`` and ``plot`` mappings are
            updated in place) and ``dependencies`` (providing ``identifiers``).

    Returns:
        List of the created field objects in creation order.
    """
    blob_db = file_io.load_yaml(file)
    dataset_meta = kwargs['meta']
    dataset_meta.assembly.update({'file': blob_db['assembly_f']})
    parsed = []
    identifiers = kwargs['dependencies']['identifiers']
    if not identifiers:
        identifiers = Identifier('identifiers',
                                 meta={'field_id': 'identifiers'},
                                 values=blob_db['order_of_blobs'],
                                 parents=[])
        dataset_meta.assembly.update(
            {'scaffold-count': len(identifiers.values)})
        parsed.append(identifiers)
    values = values_from_blob_db(blob_db)

    lengths = values['lengths']
    dataset_meta.assembly.update({'span': sum(lengths)})

    gcs = values['gcs']
    gc_meta = {
        'preload': True,
        'scale': 'scaleLinear',
        'field_id': 'gc',
        'name': 'GC',
        'datatype': 'float',
        'range': [min(gcs), max(gcs)]
    }
    parsed.append(Variable('gc', meta=gc_meta, values=gcs, parents=[]))

    shortest = min(lengths)
    length_meta = {
        'field_id': 'length',
        'preload': True,
        'scale': 'scaleLog',
        'name': 'Length',
        # The clamp is only set when the shortest length is zero.
        'clamp': 100 if shortest == 0 else False,
        'datatype': 'integer',
        'range': [shortest, max(lengths)]
    }
    parsed.append(
        Variable('length', meta=length_meta, parents=[], values=lengths))

    n_counts = values['n_counts']
    ncount_meta = {
        'field_id': 'ncount',
        'scale': 'scaleLinear',
        'name': 'N count',
        'datatype': 'integer',
        'range': [min(n_counts), max(n_counts)]
    }
    parsed.append(
        Variable('ncount', meta=ncount_meta, values=n_counts, parents=[]))

    # Only fill plot axes that have not been assigned yet.
    for axis, default_field in (('z', 'length'), ('x', 'gc')):
        if axis not in dataset_meta.plot:
            dataset_meta.plot.update({axis: default_field})

    # Running ranges across all coverage libraries; each iteration builds new
    # list objects, so every field keeps the range as it stood at that point.
    cov_range = [math.inf, -math.inf]
    read_cov_range = [math.inf, -math.inf]
    for cov_lib, cov_meta in blob_db['covLibs'].items():
        cov_file_name = field_name_from_path(cov_meta['f'])
        cov_id = "%s_cov" % cov_file_name
        read_cov_id = "%s_read_cov" % cov_file_name
        covs = values["%s_cov" % cov_lib]
        read_covs = values["%s_read_cov" % cov_lib]
        cov_range = [min(covs + [cov_range[0]]), max(covs + [cov_range[1]])]
        read_cov_range = [
            min(read_covs + [read_cov_range[0]]),
            max(read_covs + [read_cov_range[1]])
        ]
        # The first coverage library becomes the default y axis.
        if 'y' not in dataset_meta.plot:
            dataset_meta.plot.update({'y': cov_id})
        base_cov_field = Variable(
            cov_id,
            values=covs,
            meta={
                'field_id': cov_id,
                'file': cov_meta['f']
            },
            parents=cov.parent() + [
                'children', {
                    'id': 'base_coverage',
                    'clamp': 1 if cov_range[0] == 0 else False,
                    'range': cov_range
                }, 'children'
            ])
        parsed.append(base_cov_field)
        read_cov_field = Variable(
            read_cov_id,
            values=read_covs,
            meta={
                'field_id': read_cov_id,
                'file': cov_meta['f'],
                'reads_mapped': cov_meta['reads_mapped'],
                'reads_unmapped': cov_meta['reads_unmapped']
            },
            parents=cov.parent() + [
                'children', {
                    'id': 'read_coverage',
                    'datatype': 'integer',
                    'clamp': 1 if read_cov_range[0] == 0 else False,
                    'range': read_cov_range
                }, 'children'
            ])
        parsed.append(read_cov_field)

    # Rank names are read from the first identifier's entry under the first
    # tax rule and reused for every tax rule.
    first_blob = blob_db['dict_of_blobs'][identifiers.values[0]]
    ranks = first_blob['taxonomy'][blob_db['taxrules'][0]].keys()
    for tax_rule in blob_db['taxrules']:

        def rule_parents():
            # A fresh list per field, exactly as with the inline expressions.
            return hits.parent() + ['children', {'id': tax_rule}, 'children']

        if 'cat' not in dataset_meta.plot:
            dataset_meta.plot.update({'cat': "%s_phylum" % tax_rule})
        hit_list = hits_from_blob_db(blob_db, tax_rule)
        hits_id = "%s_hits" % tax_rule
        parsed.append(
            MultiArray(hits_id,
                       values=hit_list,
                       meta={
                           'field_id': hits_id,
                           'type': 'multiarray',
                           'datatype': 'mixed',
                           'preload': False,
                           'active': False,
                           'files': [
                               lib['f']
                               for lib in blob_db['hitLibs'].values()
                           ]
                       },
                       parents=rule_parents(),
                       category_slot=None,
                       headers=['taxid', 'score']))
        for rank in ranks:
            category_id = "%s_%s" % (tax_rule, rank)
            parsed.append(
                Category(category_id,
                         values=values[category_id],
                         meta={'field_id': category_id},
                         parents=rule_parents()))
            # The cindex and score fields nest under the category's data and
            # share this one list.
            parents = hits.parent() + [
                'children', {
                    'id': tax_rule
                }, 'children', {
                    'id': category_id
                }, 'data'
            ]

            cindex_id = "%s_%s_cindex" % (tax_rule, rank)
            cindex_values = values[cindex_id]
            parsed.append(
                Variable(cindex_id,
                         values=cindex_values,
                         meta={
                             'scale': 'scaleLinear',
                             'field_id': cindex_id,
                             'datatype': 'integer',
                             'range':
                             [min(cindex_values),
                              max(cindex_values)],
                             'preload': False,
                             'active': False
                         },
                         parents=parents))

            score_id = "%s_%s_score" % (tax_rule, rank)
            score_values = values[score_id]
            lowest_score = min(score_values)
            parsed.append(
                Variable(score_id,
                         values=score_values,
                         meta={
                             'scale': 'scaleLog',
                             'field_id': score_id,
                             'clamp': 1 if lowest_score == 0 else False,
                             'datatype': 'float',
                             'range': [lowest_score,
                                       max(score_values)],
                             'preload': False,
                             'active': False
                         },
                         parents=parents))

    return parsed
Esempio n. 5
0
def parse_bam(bam_file, **kwargs):
    """Parse base and read coverage from an alignment file into Variables.

    Args:
        bam_file: Specifier of the form ``<path>=<name>``; ``<name>`` prefixes
            the generated field ids.
        **kwargs: Requires ``dependencies`` (providing ``identifiers``,
            ``length`` and ``ncount`` fields) plus the running ``cov_range``
            and ``read_cov_range`` pairs.

    Returns:
        Dict with ``cov_id``, ``cov_range``, ``cov``, ``read_cov_range`` and
        ``read_cov``.

    Raises:
        UserWarning: If the contig names found in the alignment are rejected
            by ``identifiers.validate_list``.
    """
    dependencies = kwargs['dependencies']
    identifiers = dependencies['identifiers']
    ids = identifiers.values
    lengths = dependencies['length'].values
    ncounts = dependencies['ncount'].values
    parts = bam_file.split('=')
    base_name = parts[1]
    # First letter of the file extension selects the read mode ("rb", ...).
    f_char = Path(parts[0]).suffix[1]
    index_file = Path("%s.bai" % parts[0])
    # Only an index created here is removed again; a pre-existing one is kept.
    created_index = not index_file.is_file()
    if created_index:
        pysam.index(parts[0])
    print("Loading mapping data from %s as %s" % (parts[0], parts[1]))
    try:
        with pysam.AlignmentFile(parts[0], "r%s" % f_char) as aln:
            stats = {'mapped': aln.mapped, 'unmapped': aln.unmapped}
            _covs, _read_covs = calculate_coverage(aln, aln.mapped)
    finally:
        # Clean up even when reading fails, and never let the cleanup itself
        # raise if no index ended up at the expected path.
        if created_index and index_file.is_file():
            os.remove(index_file)
    if not identifiers.validate_list(list(_covs.keys())):
        raise UserWarning(
            'Contig names in the coverage file did not match dataset identifiers.'
        )
    covs = []
    read_covs = []
    for index, seq_id in enumerate(ids):
        # Coverage is normalised by the non-N length of the sequence.
        acgt_count = lengths[index] - ncounts[index]
        if seq_id in _covs and acgt_count > 0:
            covs.append(float("%.4f" % (_covs[seq_id] / acgt_count)))
        else:
            # Sequences absent from the alignment, or without any non-N
            # bases (which would otherwise divide by zero), get zero coverage.
            covs.append(0)
        read_covs.append(_read_covs.get(seq_id, 0))

    field_id = "%s_cov" % base_name
    fields = {'cov_id': field_id}
    cov_range = [
        min(covs + [kwargs['cov_range'][0]]),
        max(covs + [kwargs['cov_range'][1]])
    ]
    fields['cov_range'] = cov_range
    fields['cov'] = Variable(
        field_id,
        values=covs,
        meta={
            'field_id': field_id,
            'file': bam_file
        },
        parents=[
            'children', {
                'id': 'base_coverage',
                # The clamp only applies when the range starts at zero.
                'clamp': 0.01 if cov_range[0] == 0 else False,
                'range': cov_range
            }, 'children'
        ])

    field_id = "%s_read_cov" % base_name
    read_cov_range = [
        min(read_covs + [kwargs['read_cov_range'][0]]),
        max(read_covs + [kwargs['read_cov_range'][1]])
    ]
    fields['read_cov_range'] = read_cov_range
    fields['read_cov'] = Variable(
        field_id,
        values=read_covs,
        meta={
            'field_id': field_id,
            'file': bam_file,
            'reads_mapped': stats['mapped'],
            'reads_unmapped': stats['unmapped']
        },
        parents=[
            'children', {
                'id': 'read_coverage',
                'datatype': 'integer',
                'clamp': 1 if read_cov_range[0] == 0 else False,
                'range': read_cov_range
            }, 'children'
        ])
    return fields
Esempio n. 6
0
def parse(file, **kwargs):
    """Read a FASTA file and return GC, length and N-count fields.

    Records come from ``file_io.stream_fasta``; for each one the sequence
    length and the pair returned by ``base_composition`` are stored. If the
    identifiers dependency is empty, an ``Identifier`` field is built from the
    sequence ids in file order and the scaffold count is recorded.

    Args:
        file: Path of the sequence file to stream.
        **kwargs: Requires ``dependencies`` (providing ``identifiers``) and
            ``meta`` (its ``assembly`` and ``plot`` mappings are updated).

    Returns:
        List of created fields: the optional identifiers field followed by the
        ``gc``, ``length`` and ``ncount`` variables.
    """
    length_by_id = OrderedDict()
    gc_by_id = OrderedDict()
    n_count_by_id = OrderedDict()
    print("Loading sequences from %s" % file)
    records = tqdm(file_io.stream_fasta(file))
    for seq_id, seq_str in records:
        records.set_description(" - processing %s" % seq_id)
        length_by_id[seq_id] = len(seq_str)
        gc_by_id[seq_id], n_count_by_id[seq_id] = base_composition(seq_str)

    parsed = []
    dataset_meta = kwargs['meta']
    identifiers = kwargs['dependencies']['identifiers']
    if not identifiers:
        identifiers = Identifier('identifiers',
                                 meta={'field_id': 'identifiers'},
                                 values=list(length_by_id),
                                 parents=[])
        dataset_meta.assembly.update(
            {'scaffold-count': len(identifiers.values)})
        parsed.append(identifiers)

    # Re-order the statistics to follow the identifiers; ids that were not in
    # the file fall back to zero.
    ordered_ids = identifiers.values
    lengths = [length_by_id.get(seq_id, 0) for seq_id in ordered_ids]
    gc_portions = [gc_by_id.get(seq_id, 0) for seq_id in ordered_ids]
    n_counts = [n_count_by_id.get(seq_id, 0) for seq_id in ordered_ids]
    dataset_meta.assembly.update({'span': sum(lengths)})

    gc_meta = {
        'field_id': 'gc',
        'preload': True,
        'scale': 'scaleLinear',
        'name': 'GC',
        'datatype': 'float',
        'range': [min(gc_portions), max(gc_portions)]
    }
    parsed.append(Variable('gc', meta=gc_meta, values=gc_portions, parents=[]))

    shortest = min(lengths)
    length_meta = {
        'preload': True,
        'scale': 'scaleLog',
        'field_id': 'length',
        'name': 'Length',
        # The clamp is only set when the shortest length is zero.
        'clamp': 1 if shortest == 0 else False,
        'datatype': 'integer',
        'range': [shortest, max(lengths)]
    }
    parsed.append(
        Variable('length', meta=length_meta, values=lengths, parents=[]))

    ncount_meta = {
        'scale': 'scaleLinear',
        'field_id': 'ncount',
        'name': 'N count',
        'datatype': 'integer',
        'range': [min(n_counts), max(n_counts)]
    }
    parsed.append(
        Variable('ncount', meta=ncount_meta, values=n_counts, parents=[]))

    # Default plot axes are only set when not already chosen.
    if 'x' not in dataset_meta.plot:
        dataset_meta.plot.update({'x': 'gc'})
    if 'z' not in dataset_meta.plot:
        dataset_meta.plot.update({'z': 'length'})
    return parsed
Esempio n. 7
0
def create_fields(results, taxrule, files, fields=None):
    """Store BLAST results as Fields.

    Args:
        results: Sequence of result dicts, each with ``field_id``, ``values``
            and a ``data`` mapping holding ``cindex``, ``score``,
            ``positions`` and optional ``windows*`` entries. The first result
            additionally supplies ``data["hits"]``.
        taxrule: Tax rule name; used as the parent id and as the prefix of the
            shared hits field.
        files: Stored under ``files`` in the hits field metadata.
        fields: Optional list to append to; a new list is created when
            omitted.

    Returns:
        The list of fields, extended with one hits array plus, per result, a
        category, cindex and score variable, a positions array and one array
        per ``windows*`` entry. With no results the list is returned
        unchanged.
    """
    if fields is None:
        fields = []
    if not results:
        # Nothing to store; avoids indexing into an empty sequence below.
        return fields

    def rule_parents():
        # Fresh list for every field that hangs directly off the tax rule.
        return ["children", {"id": taxrule}, "children"]

    hits_id = "%s_%s" % (taxrule, "positions")
    fields.append(
        MultiArray(
            hits_id,
            values=results[0]["data"]["hits"],
            meta={
                "field_id": hits_id,
                "name": hits_id,
                "type": "multiarray",
                "datatype": "mixed",
                "preload": False,
                "active": False,
                "files": files,
            },
            parents=rule_parents(),
            category_slot=None,
            headers=[
                "taxid", "start", "end", "score", "subject", "index", "title"
            ],
        ))
    for result in results:
        result_id = result["field_id"]
        data = result["data"]
        main = Category(
            result_id,
            values=result["values"],
            meta={
                "field_id": result_id,
                "name": result_id
            },
            parents=rule_parents(),
        )
        fields.append(main)
        # All remaining fields for this result nest under the category's data
        # and share this one parents list.
        parents = [
            "children",
            {
                "id": taxrule
            },
            "children",
            {
                "id": result_id
            },
            "data",
        ]

        field_id = "%s_%s" % (result_id, "cindex")
        cindex = data["cindex"]
        fields.append(
            Variable(
                field_id,
                values=cindex,
                meta={
                    "scale": "scaleLinear",
                    "field_id": field_id,
                    "name": field_id,
                    "datatype": "integer",
                    "range": [min(cindex), max(cindex)],
                    "preload": False,
                    "active": False,
                },
                parents=parents,
            ))

        field_id = "%s_%s" % (result_id, "score")
        scores = data["score"]
        _min = min(scores)
        fields.append(
            Variable(
                field_id,
                values=scores,
                meta={
                    "scale": "scaleLog",
                    "field_id": field_id,
                    "name": field_id,
                    # The clamp is only set when the lowest score is zero.
                    "clamp": 1 if _min == 0 else False,
                    "datatype": "float",
                    "range": [_min, max(scores)],
                    "preload": False,
                    "active": False,
                },
                parents=parents,
            ))

        # The header list was ["name"] on both branches of the former length
        # check, so it is now passed directly.
        field_id = "%s_%s" % (result_id, "positions")
        fields.append(
            MultiArray(
                field_id,
                values=data["positions"],
                fixed_keys=main.keys,
                meta={
                    "field_id": field_id,
                    "name": field_id,
                    "type": "multiarray",
                    "datatype": "string",
                    "preload": False,
                    "active": False,
                    "linked_field": hits_id,
                },
                parents=parents,
                category_slot=0,
                headers=["name"],
            ))

        for subfield, window_values in data.items():
            if not subfield.startswith("windows"):
                continue
            field_id = "%s_%s" % (result_id, subfield)
            fields.append(
                MultiArray(
                    field_id,
                    values=window_values,
                    fixed_keys=main.keys,
                    meta={
                        "field_id": field_id,
                        "name": field_id,
                        "type": "array",
                        "datatype": "string",
                        "preload": False,
                        "active": False,
                    },
                    parents=parents,
                    category_slot=0,
                    headers=["name"],
                ))

    return fields
Esempio n. 8
0
def parse(files, **kwargs):
    """Parse per-sequence and windowed values from BED or TSV files.

    Args:
        files: A list of files, or a directory path that is expanded to its
            ``*.tsv`` or ``*.bed`` files.
        **kwargs: Requires ``dependencies`` (providing ``identifiers``) and
            ``meta`` (its ``assembly`` and ``plot`` mappings are updated). The
            presence of ``--bedtsvdir`` selects the TSV parser.

    Returns:
        List of created fields: an identifiers field when none was supplied,
        then for every non-empty field a ``Variable``, an optional
        ``<field>_stats`` ``Array`` and one ``MultiArray`` per window size
        that contains the field.
    """
    # NOTE(review): the original condition repeated "--bedtsvdir" on both
    # sides of an `or`; the second operand was presumably meant to be another
    # flag (e.g. a single-file TSV option) - confirm against the CLI options.
    if "--bedtsvdir" in kwargs:
        if isinstance(files, str) and path.isdir(files):
            print("Reading all TSV files in %s" % files)
            files = glob("%s/*.tsv" % files)
        filename, all_windows, full = parse_tsvfiles(files)
        filenames = {"all": filename}
    else:
        if isinstance(files, str) and path.isdir(files):
            print("Reading all BED files in %s" % files)
            files = glob("%s/*.bed" % files)
        filenames, all_windows, full = parse_bedfiles(files)
    # The parsers may return either plain value mappings or a dict that
    # bundles values with their "sd" and "n" companions.
    full_n = {}
    full_sd = {}
    if isinstance(full, dict):
        full_sd = full["sd"]
        full_n = full["n"]
        full = full["values"]
    all_windows_n = {}
    all_windows_sd = {}
    if isinstance(all_windows, dict):
        all_windows_n = all_windows["n"]
        all_windows_sd = all_windows["sd"]
        all_windows = all_windows["values"]
    parsed = []
    settings = field_settings()
    identifiers = kwargs["dependencies"]["identifiers"]
    keys = []
    if "length" in full:
        keys = list(full["length"].keys())
        lengths = list(full["length"].values())
        kwargs["meta"].assembly.update({"span": sum(lengths)})
        if "z" not in kwargs["meta"].plot:
            kwargs["meta"].plot.update({"z": "length"})
    if "gc" in full and "x" not in kwargs["meta"].plot:
        kwargs["meta"].plot.update({"x": "gc"})
    if not identifiers:
        # Identifiers can only be derived from the keys of the length field.
        if not keys:
            print("ERROR: Unable to set identifiers")
            sys.exit(1)
        identifiers = Identifier(
            "identifiers",
            meta={"field_id": "identifiers"},
            values=keys,
            parents=[],
        )
        kwargs["meta"].assembly.update({"scaffold-count": len(identifiers.values)})
        parsed.append(identifiers)
    # Per-suffix record of the widest range seen so far and the meta dict of
    # the field currently holding the preload slot.
    ranges = {key: {"range": [math.inf, -math.inf], "meta": {}} for key in settings}
    for field, data in full.items():
        filename = filenames.get(field, filenames.get("all", ""))
        if not data:
            continue
        # Sequences missing from this field default to zero.
        values = [
            data[seq_id] if seq_id in data else 0 for seq_id in identifiers.values
        ]
        if not values:
            continue
        meta = {}
        parents = []
        suffix = field.split("_")[-1]
        if suffix in settings:
            meta = deepcopy(settings[suffix]["meta"])
            meta.update(
                {
                    "field_id": field,
                    "file": filename,
                }
            )
            if meta["datatype"] == "integer":
                values = [int(value) for value in values]
            value_range = [min(values), max(values)]
            # A clamp is dropped when no value falls below it.
            if "clamp" in meta and value_range[0] >= meta["clamp"]:
                meta["clamp"] = False
            parent_range = False
            if "parents" in settings[suffix]:
                # These parent entries come straight from `settings` and are
                # updated in place, so a parent's range accumulates across
                # every field that uses the same suffix.
                parents = settings[suffix]["parents"]
                for parent in parents:
                    if "range" in parent:
                        parent_range = True
                        parent["range"][0] = min(parent["range"][0], value_range[0])
                        parent["range"][1] = max(parent["range"][1], value_range[1])
            if not parent_range:
                if "range" in meta:
                    meta["range"][0] = min(meta["range"][0], value_range[0])
                    meta["range"][1] = max(meta["range"][1], value_range[1])
                else:
                    meta["range"] = value_range
            # Fields with an empty or inverted range are skipped entirely.
            if meta["range"][1] <= meta["range"][0]:
                continue
            if "preload" in meta and meta["preload"] == 1:
                # Among fields sharing a suffix, only the one with the largest
                # maximum stays preloaded (and owns the plot axis).
                if value_range[1] > ranges[suffix]["range"][1]:
                    meta["preload"] = True
                    if "plot_axis" in settings[suffix]:
                        kwargs["meta"].plot.update(
                            {settings[suffix]["plot_axis"]: field}
                        )
                    if "preload" in ranges[suffix]["meta"]:
                        ranges[suffix]["meta"]["preload"] = False
                    ranges[suffix].update({"range": value_range, "meta": meta})
                else:
                    meta["preload"] = False
            if field.endswith("_%s" % suffix):
                # Strip only the trailing "_<suffix>"; str.replace would also
                # remove earlier occurrences inside the field name.
                prefix = field[: -(len(suffix) + 1)]
                meta["name"] = "%s %s" % (prefix, meta["name"])
        # A field whose suffix has no settings entry keeps an empty meta, so
        # fall back to the field id / non-integer values instead of raising
        # KeyError further down.
        display_name = meta.get("name", field)
        as_integer = meta.get("datatype") == "integer"
        parsed.append(
            Variable(
                field,
                meta=meta,
                values=values,
                parents=parents,
            )
        )
        if field in full_sd:
            stats_values = [
                [full_sd[field][seq_id], full_n[field][seq_id]]
                if seq_id in data
                else []
                for seq_id in identifiers.values
            ]
            parsed.append(
                Array(
                    "%s_stats" % field,
                    meta={
                        "field_id": "%s_stats" % field,
                        "name": "%s stats" % display_name,
                        "type": "array",
                        "datatype": "mixed",
                    },
                    values=stats_values,
                    parents=parents,
                    headers=["sd", "n"],
                )
            )
        for window, windows in all_windows.items():
            if field not in windows:
                continue
            windows_sd = all_windows_sd.get(window, {})
            windows_n = all_windows_n.get(window, {})
            has_stats = field in windows_sd
            headers = [field]
            if has_stats:
                headers += ["sd", "n"]
            window_values = []
            for seq_id in identifiers.values:
                seq_values = []
                if seq_id in data:
                    for idx, value in enumerate(windows[field][seq_id]):
                        if as_integer:
                            value = int(value)
                        if has_stats:
                            value = [
                                value,
                                windows_sd[field][seq_id][idx],
                                windows_n[field][seq_id][idx],
                            ]
                        else:
                            value = [value]
                        seq_values.append(value)
                window_values.append(seq_values)
            windows_field = "%s_windows" % field
            # Window size "0.1" keeps the unsuffixed field id.
            if str(window) != "0.1":
                windows_field += "_%s" % str(window)
            parsed.append(
                MultiArray(
                    windows_field,
                    meta={
                        "field_id": windows_field,
                        "name": "%s windows %s" % (display_name, window),
                        "type": "multiarray",
                        "datatype": "mixed",
                    },
                    values=window_values,
                    parents=parents,
                    headers=headers,
                )
            )
    return parsed
Esempio n. 9
0
def parse_bam(bam_file, **kwargs):
    """Parse coverage from an alignment file into Variable fields.

    Args:
        bam_file: string of the form ``<path>=<base_name>``; the part before
            ``=`` is the alignment file to read and the part after is used as
            the prefix of the generated field ids.
        **kwargs: must provide
            ``dependencies`` with ``identifiers``, ``length`` and ``ncount``
            entries (each exposing ``.values``), plus ``cov_range`` and
            ``read_cov_range`` two-item ``[min, max]`` sequences that are
            widened to include the values parsed here.

    Returns:
        dict with the keys ``cov_id``, ``cov_range``, ``cov``,
        ``read_cov_range`` and ``read_cov``; ``cov`` and ``read_cov`` are
        Variable fields.

    Raises:
        UserWarning: if the sequence names found in the alignment file are
            rejected by ``identifiers.validate_list``.
    """
    identifiers = kwargs["dependencies"]["identifiers"]
    ids = identifiers.values
    lengths = kwargs["dependencies"]["length"].values
    ncounts = kwargs["dependencies"]["ncount"].values
    parts = bam_file.split("=")
    base_name = parts[1]
    # Character after the dot in the file suffix (e.g. "b" for ".bam") is
    # appended to "r" to build the pysam open mode.
    f_char = Path(parts[0]).suffix[1]
    index_file = Path("%s.csi" % parts[0])
    # Only an index created by this call is removed afterwards; an index that
    # already existed is left in place.
    created_index = not index_file.is_file()
    if created_index:
        pysam.index("-c", "-m", "14", parts[0])
    print("Loading mapping data from %s as %s" % (parts[0], parts[1]))
    try:
        with pysam.AlignmentFile(parts[0], "r%s" % f_char) as aln:
            stats = {"mapped": aln.mapped, "unmapped": aln.unmapped}
            _covs, _read_covs = calculate_coverage(aln, aln.mapped)
    finally:
        # Clean up the temporary index even when reading the file fails.
        if created_index:
            os.remove(index_file)
    if not identifiers.validate_list(list(_covs)):
        raise UserWarning(
            "Contig names in the coverage file did not match dataset identifiers."
        )
    covs = []
    read_covs = []
    for index, seq_id in enumerate(ids):
        # Normalise by the number of non-N bases in the sequence.
        acgt_count = lengths[index] - ncounts[index]
        if seq_id in _covs and acgt_count > 0:
            covs.append(float("%.4f" % (_covs[seq_id] / acgt_count)))
        else:
            # Sequence absent from the alignment, or made up entirely of Ns
            # (dividing would raise ZeroDivisionError): report zero coverage.
            covs.append(0)
        read_covs.append(_read_covs.get(seq_id, 0) if hasattr(_read_covs, "get")
                         else (_read_covs[seq_id] if seq_id in _read_covs else 0))
    field_id = "%s_cov" % base_name
    fields = {"cov_id": field_id}
    fields["cov_range"] = [
        min(covs + [kwargs["cov_range"][0]]),
        max(covs + [kwargs["cov_range"][1]]),
    ]
    fields["cov"] = Variable(
        field_id,
        values=covs,
        meta={
            "field_id": field_id,
            "file": bam_file
        },
        parents=[
            "children",
            {
                "id": "base_coverage",
                # A clamp value is only set when the range starts at zero.
                "clamp": 0.01 if fields["cov_range"][0] == 0 else False,
                "range": fields["cov_range"],
            },
            "children",
        ],
    )
    field_id = "%s_read_cov" % base_name
    fields["read_cov_range"] = [
        min(read_covs + [kwargs["read_cov_range"][0]]),
        max(read_covs + [kwargs["read_cov_range"][1]]),
    ]
    fields["read_cov"] = Variable(
        field_id,
        values=read_covs,
        meta={
            "field_id": field_id,
            "file": bam_file,
            "reads_mapped": stats["mapped"],
            "reads_unmapped": stats["unmapped"],
        },
        parents=[
            "children",
            {
                "id": "read_coverage",
                "datatype": "integer",
                "clamp": 1 if fields["read_cov_range"][0] == 0 else False,
                "range": fields["read_cov_range"],
            },
            "children",
        ],
    )
    return fields
Esempio n. 10
0
def create_fields(results, taxrule, files, fields=None):
    """Store BLAST results as Fields.

    Appends one shared ``<taxrule>_positions`` MultiArray built from the hits
    of the first result, followed, for every result, by a Category and its
    ``cindex``, ``score`` and ``positions`` child fields.

    Args:
        results: list of result dicts. Each is read for ``field_id``,
            ``values`` and ``data`` (with ``cindex``, ``score`` and
            ``positions`` entries); the first result's ``data`` must also
            hold ``hits``.
        taxrule: taxrule name, used as the parent id of every field and as the
            prefix of the shared positions field id.
        files: stored unchanged under ``files`` in the shared field's meta.
        fields: optional list to append to; a new list is created when None.

    Returns:
        The list of fields with the new fields appended. When ``results`` is
        empty the list is returned without any additions.
    """
    if fields is None:
        fields = []
    if not results:
        # Nothing to store; results[0] below would otherwise raise IndexError.
        return fields
    hits_id = "%s_%s" % (taxrule, 'positions')
    fields.append(
        MultiArray(
            hits_id,
            values=results[0]['data']['hits'],
            meta={
                'field_id': hits_id,
                'name': hits_id,
                'type': 'multiarray',
                'datatype': 'mixed',
                'preload': False,
                'active': False,
                'files': files
            },
            parents=['children', {
                'id': taxrule
            }, 'children'],
            category_slot=None,
            headers=['taxid', 'start', 'end', 'score', 'subject', 'index']))
    for result in results:
        data = result['data']
        main = Category(result['field_id'],
                        values=result['values'],
                        meta={
                            'field_id': result['field_id'],
                            'name': result['field_id']
                        },
                        parents=['children', {
                            'id': taxrule
                        }, 'children'])
        fields.append(main)
        # The per-result child fields are attached under the category's data.
        parents = [
            'children', {
                'id': taxrule
            }, 'children', {
                'id': result['field_id']
            }, 'data'
        ]
        field_id = "%s_%s" % (result['field_id'], 'cindex')
        fields.append(
            Variable(field_id,
                     values=data['cindex'],
                     meta={
                         'scale': 'scaleLinear',
                         'field_id': field_id,
                         'name': field_id,
                         'datatype': 'integer',
                         'range': [min(data['cindex']),
                                   max(data['cindex'])],
                         'preload': False,
                         'active': False
                     },
                     parents=parents))
        field_id = "%s_%s" % (result['field_id'], 'score')
        _min = min(data['score'])
        fields.append(
            Variable(field_id,
                     values=data['score'],
                     meta={
                         'scale': 'scaleLog',
                         'field_id': field_id,
                         'name': field_id,
                         # A clamp value is only set when the minimum score
                         # is zero.
                         'clamp': 1 if _min == 0 else False,
                         'datatype': 'float',
                         'range': [_min, max(data['score'])],
                         'preload': False,
                         'active': False
                     },
                     parents=parents))
        subfield = 'positions'
        field_id = "%s_%s" % (result['field_id'], subfield)
        fields.append(
            MultiArray(field_id,
                       values=data[subfield],
                       fixed_keys=main.keys,
                       meta={
                           'field_id': field_id,
                           'name': field_id,
                           'type': 'multiarray',
                           'datatype': 'string',
                           'preload': False,
                           'active': False,
                           'linked_field': hits_id
                       },
                       parents=parents,
                       category_slot=0,
                       # The header is the same however many positions exist.
                       headers=['name']))
    return fields