m = re.search( "Q score distribution\s*\n(?P<header>[a-zA-Z\s]+)\n" + "[\s\-]+\n(?P<data>.*)\n\s+L\s+PctRecs", data, re.DOTALL) indexcol = "Q" elif key == "Truncate at first Q": m = re.search( "Truncate at first Q\s*\n(?P<header>[0-9=a-zA-Z\s]+)\n" + "[\s\-]+\n(?P<data>.*)\n\n\s+\d+\s+Recs", data, re.DOTALL) indexcol = "Len" else: logger.warn("No such key '{}'".format(key)) try: header = re.split("\s+", m.group("header").strip()) if key == "Read length distribution": d = [re.split("\s+", re.sub("[><=]+", "", x).strip()) for x in re.split("\n", m.group("data").strip())] else: d = [re.split("\s+", x.strip()) for x in re.split("\n", m.group("data").strip())] df = DataFrame.from_records(d, columns=header) df = df.apply(pd.to_numeric, errors='ignore') df = df.set_index(indexcol) except: raise return df aggregate = utils.aggregate_factory("vsearch")
@resource.register(config['genes']['pattern'],
                   priority=config['genes']['priority'])
@pivot
@annotate_by_uri
def resource_genes_results(uri, **kwargs):
    """Parse rsem gene-level results file.

    Args:
      uri (str): filename

    Returns:
      DataFrame: gene-level results indexed by gene_id
    """
    # Pass the open handle to read_csv instead of opening the file
    # twice: the original opened uri, discarded the handle, and let
    # read_csv reopen it by path.
    with open(uri) as fh:
        data = pd.read_csv(fh, sep="\t", header=0, comment="#",
                           index_col=["gene_id"])
    return data


@resource.register(config['isoforms']['pattern'],
                   priority=config['isoforms']['priority'])
@pivot
@annotate_by_uri
def resource_isoforms_results(uri, **kwargs):
    """Parse rsem isoform-level results file.

    Args:
      uri (str): filename

    Returns:
      DataFrame: isoform-level results indexed by transcript_id
    """
    # Same fix as resource_genes_results: read through the open handle
    # rather than opening the file a second time.
    with open(uri) as fh:
        data = pd.read_csv(fh, sep="\t", header=0, comment="#",
                           index_col=["transcript_id"])
    return data


aggregate = utils.aggregate_factory("rsem")
d[h] = DataFrame.from_records( [re.split("\s+", x.strip()) for x in sec.split("\n") if x], columns=COVERAGE_PER_CONTIG_COLUMNS, index="chr") d[h] = d[h].apply(pd.to_numeric) elif h in ["Coverage", "Header"]: pass else: d[h] = DataFrame.from_records( [_split_x(x) for x in sec.split("\n") if x], columns=["statistic", "value"], index="statistic") if h not in ["Input"]: d[h] = d[h].apply(pd.to_numeric) return d[key] @resource.register(config['data_frame']['pattern'], priority=config['data_frame']['priority']) @pivot @annotate_by_uri def resource_read_data_frame(uri, **kwargs): d = pd.read_table(uri) columns = list(d.columns) columns[0] = re.sub("#", "", columns[0]).strip() d.columns = columns return d aggregate = utils.aggregate_factory("qualimap")
Args: uri (str): filename Returns: DataFrame: DataFrame for requested section """ def _parse(): data = [] with open(uri) as fh: for x in fh.readlines()[5:]: if x.startswith("\n"): continue x = re.sub("Read (\d+)", "Read_\\1", x) x = re.sub("(^\t|:|'|\s+$)", "", x) x = re.sub("\s+(\d+)", "\t\\1", x).split("\t") data.append(x) return data data = _parse() df = DataFrame.from_records(data) df.columns = ["statistic", "value", "percent"] df['percent'].replace("[\(\)%]", "", inplace=True, regex=True) df["percent"] = pd.to_numeric(df['percent'], errors="ignore") df["value"] = pd.to_numeric(df['value'], errors="ignore") df.set_index("statistic", inplace=True, drop=False) return df aggregate = utils.aggregate_factory("bamtools")
# Copyright (C) 2015 by Per Unneberg
import pandas as pd
import bioodo
from bioodo import resource, annotate_by_uri, pivot, utils
import logging

logger = logging.getLogger(__name__)
config = bioodo.__RESOURCE_CONFIG__['star']


@resource.register(config['log_final']['pattern'],
                   priority=config['log_final']['priority'])
@pivot
@annotate_by_uri
def resource_star_log(uri, **kwargs):
    """Parse a STAR Log.final.out log file.

    Args:
      uri (str): filename

    Returns:
      DataFrame: one row per statistic, indexed by statistic name
    """
    # Strip the padding STAR puts around the '|' separator and recast
    # each value to a numeric type where possible.
    df = (
        pd.read_table(uri, sep="|", names=["name", "value"])
        .assign(
            name=lambda d: [entry.strip() for entry in d["name"]],
            value=lambda d: [utils.recast(entry) for entry in d["value"]],
        )
        .set_index("name")
    )
    return df


aggregate = utils.aggregate_factory("star")
return _hist_reader(uri) @resource.register(config['dup_metrics']['pattern'], priority=config['dup_metrics']['priority']) @pivot @annotate_by_uri def resource_dup_metrics(uri, key="metrics", **kwargs): """Parse picard DuplicationMetrics text output file. Args: uri (str): filename key (str): result section to return (hist or metrics) Returns: DataFrame for requested section """ (_metrics, hist) = _hist_reader(uri) metrics = _metrics[_metrics.columns.difference(["LIBRARY" ])].apply(pd.to_numeric, axis=0) if hist is not None: hist = hist.apply(pd.to_numeric, axis=0) if key == "metrics": return metrics elif key == "hist": return hist aggregate = utils.aggregate_factory("picard")
columns=["type", "statistic", "value"]) df = df.set_index('statistic') df["value"] = df["value"].apply(pd.to_numeric, errors="ignore") return df @resource.register(config['filter']['pattern'], priority=config['filter']['priority']) @pivot @annotate_by_uri def resource_sga_filter(uri, **kwargs): """Parse sga filter log output file. Args: uri (str): filename Returns: DataFrame: DataFrame for requested section """ logger.debug("Parsing {} in resource_sga_filter".format(uri)) with open(uri) as fh: data = [[y for y in x.strip().split(":")] for x in fh.readlines() if x.startswith("Reads")] df = DataFrame.from_records(data, columns=["statistic", "value"]) df = df.set_index('statistic') df["value"] = df["value"].apply(pd.to_numeric, errors="ignore") return df aggregate = utils.aggregate_factory("sga")
@pivot
@annotate_by_uri
def resource_bwa_mem(uri, **kwargs):
    """Parse bwa mem log output file.

    Args:
      uri (str): filename

    Returns:
      DataFrame: parameter and preprocess statistics indexed by statistic
    """
    # Fixed: the debug message previously named the wrong function
    # ("resource_bwa_preprocess").
    logger.debug("Parsing {} in resource_bwa_mem".format(uri))
    with open(uri) as fh:
        data = "".join(fh)
    # The log has two parts: a "Parameters:" section followed by a
    # "Preprocess stats:" section.
    sections = re.split("Preprocess stats:\n", data)
    parameters = [["parameter"] + [x.strip() for x in y.strip().split(":")]
                  for y in re.sub("Parameters:\n", "", sections[0]).split("\n")
                  if ":" in y]
    # Drop trailing "(N.N)" fractions and wall-time lines before
    # tabulating the preprocess statistics.  Raw string fixes the
    # invalid escape sequences of the original pattern; the regex is
    # unchanged.
    preprocess = [["preprocess stats"] +
                  [x.strip() for x in y.strip().split(":")]
                  for y in re.sub(r"\([0-9.]+\)", "", sections[1]).split("\n")
                  if y and "wall" not in y]
    df = DataFrame.from_records(parameters + preprocess,
                                columns=["type", "statistic", "value"])
    df = df.set_index('statistic')
    df["value"] = df["value"].apply(pd.to_numeric, errors="ignore")
    return df


aggregate = utils.aggregate_factory("bwa")
"""Parse mapdamage lgdistribution.txt Args: uri (str): filename Returns: DataFrame: DataFrame representation of lgdistribution """ df = pd.read_table(uri, sep="\s+", comment="#") return df @resource.register(config['misincorporation']['pattern'], priority=config['misincorporation']['priority']) @pivot @annotate_by_uri def resource_mapdamage_misincorporation(uri, **kwargs): """Parse mapdamage misincorporation.txt Args: uri (str): filename Returns: DataFrame: DataFrame representation of misincorporation """ df = pd.read_table(uri, sep="\t", comment="#") return df aggregate = utils.aggregate_factory("mapdamage")
return DataFrame.from_records([ re.sub(" ", "_", x).split("\t") for x in re.findall(">>(.+)", data) ], columns=["Statistic", "Value"], index="Statistic") for h, sec in zip(headings, sections): if not h == key: continue logger.debug("Parsing section {}".format(h)) if h == "Header": d = DataFrame.from_records( [[re.sub("#", "", x) for x in re.split("\t", sec.strip())]]) else: i = 1 if h.startswith("Sequence_Duplication") else 0 columns = [ re.sub("#", "", x) for x in re.split("\t", sec.split("\n")[i].strip()) ] d = DataFrame.from_records([ re.split("\t", x.strip()) for x in sec.split("\n") if x and not x.startswith("#") ], columns=columns, index=columns[0]) return d aggregate = utils.aggregate_factory("fastqc")
# Copyright (C) 2015 by Per Unneberg
import pandas as pd
import bioodo
from bioodo import resource, annotate_by_uri, pivot, utils

config = bioodo.__RESOURCE_CONFIG__['rpkmforgenes']


@resource.register(config['rpkmforgenes']['pattern'],
                   priority=config['rpkmforgenes']['priority'])
@pivot
@annotate_by_uri
def resource_rpkmforgenes(uri, **kwargs):
    """Parse rpkmforgenes output file.

    Args:
      uri (str): filename

    Returns:
      DataFrame: expression values indexed by gene_id
    """
    # Pass the open handle to read_csv rather than opening uri twice:
    # the original opened the file, discarded the handle, and had
    # read_csv reopen it by path.
    with open(uri) as fh:
        data = pd.read_csv(fh, sep="\t", header=None, comment="#",
                           names=["gene_id", "transcript_id",
                                  "FPKM", "TPM"],
                           index_col=["gene_id"])
    return data


aggregate = utils.aggregate_factory("rpkmforgenes")
# Match the "=== ... Adapter 'X' ===" section headers of cutadapt
# metrics output.  Raw strings fix the invalid escape sequences
# (e.g. "\s") the original patterns relied on; re interprets r"\n"
# as a newline, so matching is unchanged.
regex = r"\n===\s*(?P<Read>(First read|Second read)?):?" + \
    r"\s+Adapter\s+'(?P<Adapter>[^\s]+)'\s+==="
adapter_re = re.compile(regex)
# Strips "(N.N%)" percentages, commas, spaces and "bp" suffixes from
# value strings.
re_trim = re.compile(r'(\([0-9.]+%\)|,| |bp)')


def _split_x(x, delim=":"):
    """Split a 'statistic: value' line and trim noise from the value."""
    y = x.strip().split(delim)
    return [y[0], re_trim.sub("", y[1])]


# For now only return the summary section
@resource.register(config['metrics']['pattern'],
                   priority=config['metrics']['priority'])
@pivot
@annotate_by_uri
def resource_cutadapt_metrics(uri, **kwargs):
    """Parse cutadapt metrics output file.

    Args:
      uri (str): filename

    Returns:
      DataFrame: summary statistics indexed by statistic
    """
    with open(uri) as fh:
        data = "".join(fh)
    # Sections are delimited by "=== ... ===" header lines; index 1 is
    # the summary section.
    sections = re.split(r"\n===.*===\n", data)
    df = DataFrame.from_records(
        [_split_x(x) for x in sections[1].split("\n") if x],
        index=["statistic"],
        columns=["statistic", "value"])
    df["value"] = pd.to_numeric(df["value"])
    return df


aggregate = utils.aggregate_factory("cutadapt")