def resource_cutadapt_metrics(uri, **kwargs):
    """Load one section of a cutadapt report into a DataFrame.

    The file is read in full and cut at every heading line of the form
    ``=== ... ===``.  Only the chunk at position 1 (the text following the
    first heading) is parsed: each non-empty line is handed to ``_split_x``
    and the results become the rows of the table.

    Args:
        uri: Path of the report file to read.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        DataFrame indexed by ``statistic`` with a numeric ``value`` column.

    Raises:
        IndexError: If the file contains no ``=== ... ===`` heading.
    """
    with open(uri) as handle:
        text = handle.read()

    chunks = re.split("\n===.*===\n", text)

    # Keep only non-empty lines of the first headed section.
    records = []
    for line in chunks[1].split("\n"):
        if line:
            records.append(_split_x(line))

    frame = DataFrame.from_records(
        records,
        index=["statistic"],
        columns=["statistic", "value"],
    )
    frame["value"] = pd.to_numeric(frame["value"])
    return frame
def _reader(uri):
    """Parse the metrics table of a tab-separated metrics file.

    Blank lines are dropped and every remaining line is split on tabs.  The
    first row whose leading field starts with ``## METRICS CLASS`` marks the
    table: the row after it supplies the column names and all rows after
    that, up to the end of the file, are taken as records.

    Args:
        uri: Path of the metrics file to read.

    Returns:
        Tuple ``(metrics, None)`` where ``metrics`` is a DataFrame indexed
        by its ``CATEGORY`` column.  The second element is always ``None``.

    Raises:
        IndexError: If no ``## METRICS CLASS`` row is present, or it is the
            last non-blank row.
    """
    rows = []
    with open(uri) as handle:
        for line in handle:
            if line.strip() == "":
                continue
            rows.append(line.strip("\n").split("\t"))

    marker_rows = [
        pos for pos, fields in enumerate(rows)
        if fields[0].startswith("## METRICS CLASS")
    ]
    header_pos = marker_rows[0] + 1
    first_record = header_pos + 1

    metrics = DataFrame.from_records(
        rows[first_record:],
        columns=rows[header_pos],
        index="CATEGORY",
    )
    return (metrics, None)
def _hist_reader(uri):
    """Parse the metrics table and the optional histogram of a metrics file.

    Blank lines are dropped and every remaining line is split on tabs.  Rows
    whose leading field starts with ``## METRICS CLASS`` or ``## HISTOGRAM``
    delimit the sections; within a section the row after the marker holds
    the column names and the rows after that hold the records.

    Args:
        uri: Path of the metrics file to read.

    Returns:
        Tuple ``(metrics, hist)`` of DataFrames.  ``hist`` is ``None`` when
        the histogram section is missing or cannot be parsed; a warning is
        logged in that case.

    Raises:
        IndexError: If the file contains no section marker at all.
    """
    with open(uri) as fh:
        data = [x.strip("\n").split("\t") for x in fh if x.strip()]

    indices = [
        i for i, val in enumerate(data)
        if val[0].startswith(("## METRICS CLASS", "## HISTOGRAM"))
    ]
    # With a single marker there is no histogram: let the metrics section
    # run to the end of the file.
    if len(indices) == 1:
        indices.append(len(data))

    metrics = DataFrame.from_records(
        data[indices[0] + 2:indices[1]],
        columns=data[indices[0] + 1],
    )

    # We could be missing the histogram.  Parsing stays best-effort, but
    # only ordinary errors are caught so that KeyboardInterrupt and
    # SystemExit still propagate.
    try:
        hist = DataFrame.from_records(
            data[indices[1] + 2:],
            columns=data[indices[1] + 1],
        )
    except Exception:
        logger.warning("No histogram data for {}".format(uri))
        hist = None
    return (metrics, hist)
def resource_genome_results(uri, key="Globals", **kwargs):
    """Parse a sectioned results file and return one section as a DataFrame.

    Sections are introduced by a heading made of one or more ``>``
    characters, whitespace, and a name of letters and spaces.  Spaces in the
    name are replaced by underscores to form the section key; the text
    before the first heading is named ``Header``.
    NOTE(review): the layout looks like Qualimap's genome_results.txt;
    confirm against the producer of these files.

    Every section is parsed on each call, then the one named by ``key`` is
    returned:

    * ``Coverage_per_contig``: whitespace-separated rows with columns
      ``COVERAGE_PER_CONTIG_COLUMNS``, indexed by ``chr``, all numeric.
    * ``Header`` and ``Coverage``: skipped, never available as ``key``.
    * any other section: lines parsed by ``_split_x`` into a table indexed
      by ``statistic`` with a ``value`` column, converted to numeric for
      every section except ``Input``.

    Args:
        uri: Path of the results file to read.
        key: Name of the section to return.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        DataFrame for the requested section.

    Raises:
        KeyError: If ``key`` is not one of the parsed sections.
    """
    with open(uri) as fh:
        data = "".join(fh)

    # Raw strings: "\s" inside a plain literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    sections = re.split(r">+\s+[a-zA-Z ]+", data)
    section_names = ["Header"] + [
        x.replace(" ", "_") for x in re.findall(r">+\s+([a-zA-Z ]+)", data)
    ]

    d = {}
    for h, sec in zip(section_names, sections):
        if h in ("Coverage", "Header"):
            continue
        lines = [x for x in sec.split("\n") if x]
        if h == "Coverage_per_contig":
            d[h] = DataFrame.from_records(
                [re.split(r"\s+", x.strip()) for x in lines],
                columns=COVERAGE_PER_CONTIG_COLUMNS,
                index="chr",
            )
            d[h] = d[h].apply(pd.to_numeric)
        else:
            d[h] = DataFrame.from_records(
                [_split_x(x) for x in lines],
                columns=["statistic", "value"],
                index="statistic",
            )
            # "Input" is the only key/value section left unconverted.
            if h != "Input":
                d[h] = d[h].apply(pd.to_numeric)
    return d[key]