Example 1
        m = re.search(
            r"Q score distribution\s*\n(?P<header>[a-zA-Z\s]+)\n" +
            r"[\s\-]+\n(?P<data>.*)\n\s+L\s+PctRecs",
            data, re.DOTALL)
        indexcol = "Q"
    elif key == "Truncate at first Q":
        m = re.search(
            r"Truncate at first Q\s*\n(?P<header>[0-9=a-zA-Z\s]+)\n" +
            r"[\s\-]+\n(?P<data>.*)\n\n\s+\d+\s+Recs",
            data, re.DOTALL)
        indexcol = "Len"
    else:
        logger.warning("No such key '{}'".format(key))
    try:
        header = re.split(r"\s+", m.group("header").strip())
        if key == "Read length distribution":
            d = [re.split(r"\s+", re.sub("[><=]+", "", x).strip())
                 for x in re.split("\n", m.group("data").strip())]
        else:
            d = [re.split(r"\s+", x.strip())
                 for x in re.split("\n", m.group("data").strip())]
        df = DataFrame.from_records(d, columns=header)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.set_index(indexcol)
    except:
        raise
    return df


aggregate = utils.aggregate_factory("vsearch")
Example 2
@resource.register(config['genes']['pattern'],
                   priority=config['genes']['priority'])
@pivot
@annotate_by_uri
def resource_genes_results(uri, **kwargs):
    with open(uri) as fh:
        data = pd.read_csv(fh,
                           sep="\t",
                           header=0,
                           comment="#",
                           index_col=["gene_id"])
    return data


@resource.register(config['isoforms']['pattern'],
                   priority=config['isoforms']['priority'])
@pivot
@annotate_by_uri
def resource_isoforms_results(uri, **kwargs):
    with open(uri) as fh:
        data = pd.read_csv(fh,
                           sep="\t",
                           header=0,
                           comment="#",
                           index_col=["transcript_id"])
    return data


aggregate = utils.aggregate_factory("rsem")
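
A possible usage sketch for the two RSEM parsers above (not part of the module): the file names are hypothetical placeholders, the column names follow standard RSEM output, and the @pivot/@annotate_by_uri decorators are assumed to pass the frame through when called without extra keyword arguments.

# Hypothetical RSEM result files from rsem-calculate-expression.
genes = resource_genes_results("sample.genes.results")
isoforms = resource_isoforms_results("sample.isoforms.results")
# Standard RSEM columns assumed; genes is indexed by gene_id,
# isoforms by transcript_id.
print(genes[["TPM", "FPKM"]].head())
print(isoforms[["TPM", "FPKM"]].head())
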
Example 3
            d[h] = DataFrame.from_records(
                [re.split(r"\s+", x.strip()) for x in sec.split("\n") if x],
                columns=COVERAGE_PER_CONTIG_COLUMNS,
                index="chr")
            d[h] = d[h].apply(pd.to_numeric)
        elif h in ["Coverage", "Header"]:
            pass
        else:
            d[h] = DataFrame.from_records(
                [_split_x(x) for x in sec.split("\n") if x],
                columns=["statistic", "value"],
                index="statistic")
            if h not in ["Input"]:
                d[h] = d[h].apply(pd.to_numeric)
    return d[key]


@resource.register(config['data_frame']['pattern'],
                   priority=config['data_frame']['priority'])
@pivot
@annotate_by_uri
def resource_read_data_frame(uri, **kwargs):
    d = pd.read_table(uri)
    columns = list(d.columns)
    columns[0] = re.sub("#", "", columns[0]).strip()
    d.columns = columns
    return d


aggregate = utils.aggregate_factory("qualimap")
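
A short usage sketch for the generic data frame reader above (file name hypothetical): it shows that the leading "#" is stripped from the first column header.

# Hypothetical tab-separated qualimap table whose first header starts with '#'.
df = resource_read_data_frame("coverage_across_reference.txt")
print(df.columns[0])  # '#' prefix removed by the parser
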
Example 4
    Args:
      uri (str): filename

    Returns:
      DataFrame: DataFrame for requested section
    """
    def _parse():
        data = []
        with open(uri) as fh:
            for x in fh.readlines()[5:]:
                if x.startswith("\n"):
                    continue
                x = re.sub(r"Read (\d+)", "Read_\\1", x)
                x = re.sub(r"(^\t|:|'|\s+$)", "", x)
                x = re.sub(r"\s+(\d+)", "\t\\1", x).split("\t")
                data.append(x)
        return data

    data = _parse()
    df = DataFrame.from_records(data)
    df.columns = ["statistic", "value", "percent"]
    df["percent"] = df["percent"].replace(r"[\(\)%]", "", regex=True)
    df["percent"] = pd.to_numeric(df['percent'], errors="ignore")
    df["value"] = pd.to_numeric(df['value'], errors="ignore")
    df.set_index("statistic", inplace=True, drop=False)
    return df


aggregate = utils.aggregate_factory("bamtools")
Example 5
# Copyright (C) 2015 by Per Unneberg
import pandas as pd
import bioodo
from bioodo import resource, annotate_by_uri, pivot, utils
import logging


logger = logging.getLogger(__name__)
config = bioodo.__RESOURCE_CONFIG__['star']


@resource.register(config['log_final']['pattern'],
                   priority=config['log_final']['priority'])
@pivot
@annotate_by_uri
def resource_star_log(uri, **kwargs):
    """Parse Star Log.final.out log file"""
    df = pd.read_table(uri, sep="|", names=["name", "value"])
    df["name"] = [x.strip() for x in df["name"]]
    df["value"] = [utils.recast(x) for x in df["value"]]
    df = df.set_index("name")
    return df


aggregate = utils.aggregate_factory("star")
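
A minimal sketch of calling the STAR parser directly (the path is a placeholder; the decorators are assumed to be pass-through without extra keyword arguments, and the field name below is the one that appears in a typical Log.final.out).

# Log.final.out is the standard STAR summary log name.
df = resource_star_log("Log.final.out")
# Statistic names form the index; values are recast by utils.recast.
print(df.loc["Uniquely mapped reads %"])
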
Example 6
    return _hist_reader(uri)


@resource.register(config['dup_metrics']['pattern'],
                   priority=config['dup_metrics']['priority'])
@pivot
@annotate_by_uri
def resource_dup_metrics(uri, key="metrics", **kwargs):
    """Parse picard DuplicationMetrics text output file.

    Args:
      uri (str): filename
      key (str): result section to return (hist or metrics)

    Returns:
      DataFrame for requested section
    """
    (_metrics, hist) = _hist_reader(uri)
    metrics = _metrics[_metrics.columns.difference(["LIBRARY"])].apply(
        pd.to_numeric, axis=0)
    if hist is not None:
        hist = hist.apply(pd.to_numeric, axis=0)
    if key == "metrics":
        return metrics
    elif key == "hist":
        return hist


aggregate = utils.aggregate_factory("picard")
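
A sketch of retrieving the two sections from a hypothetical Picard MarkDuplicates metrics file; PERCENT_DUPLICATION is a standard DuplicationMetrics column, assumed to be present.

metrics = resource_dup_metrics("sample.dup_metrics", key="metrics")
hist = resource_dup_metrics("sample.dup_metrics", key="hist")
print(metrics["PERCENT_DUPLICATION"])
if hist is not None:  # the histogram section may be absent
    print(hist.head())
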
Example 7
                                columns=["type", "statistic", "value"])
    df = df.set_index('statistic')
    df["value"] = df["value"].apply(pd.to_numeric, errors="ignore")
    return df


@resource.register(config['filter']['pattern'],
                   priority=config['filter']['priority'])
@pivot
@annotate_by_uri
def resource_sga_filter(uri, **kwargs):
    """Parse sga filter log output file.

    Args:
      uri (str): filename

    Returns:
      DataFrame: DataFrame for requested section
    """
    logger.debug("Parsing {} in resource_sga_filter".format(uri))
    with open(uri) as fh:
        data = [[y for y in x.strip().split(":")] for x in fh.readlines()
                if x.startswith("Reads")]
    df = DataFrame.from_records(data, columns=["statistic", "value"])
    df = df.set_index('statistic')
    df["value"] = df["value"].apply(pd.to_numeric, errors="ignore")
    return df


aggregate = utils.aggregate_factory("sga")
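
A usage sketch for the filter parser (log name hypothetical); only lines starting with "Reads" are collected, indexed by statistic name.

df = resource_sga_filter("sample.sga_filter.log")
print(df["value"])
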
Example 8
@pivot
@annotate_by_uri
def resource_bwa_mem(uri, **kwargs):
    """Parse bwa mem log output file.

    Args:
      uri (str): filename

    Returns:
      DataFrame: DataFrame for requested section
    """
    logger.debug("Parsing {} in resource_bwa_mem".format(uri))
    with open(uri) as fh:
        data = "".join(fh)
    sections = re.split("Preprocess stats:\n", data)
    parameters = [["parameter"] + [x.strip() for x in y.strip().split(":")]
                  for y in re.sub("Parameters:\n", "", sections[0]).split("\n")
                  if ":" in y]
    preprocess = [["preprocess stats"] +
                  [x.strip() for x in y.strip().split(":")]
                  for y in re.sub(r"\([0-9\.]+\)", "", sections[1]).split("\n")
                  if y and "wall" not in y]
    df = DataFrame.from_records(parameters + preprocess,
                                columns=["type", "statistic", "value"])
    df = df.set_index('statistic')
    df["value"] = df["value"].apply(pd.to_numeric, errors="ignore")
    return df


aggregate = utils.aggregate_factory("bwa")
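
A sketch for the log parser above (log path hypothetical); the frame keeps both the parameter block and the preprocessing statistics, distinguished by the "type" column.

# Hypothetical log containing "Parameters:" and "Preprocess stats:" sections.
df = resource_bwa_mem("sample.bwa_mem.log")
print(df[df["type"] == "preprocess stats"])
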
Example 9
    """Parse mapdamage lgdistribution.txt

    Args:
      uri (str): filename

    Returns:
      DataFrame: DataFrame representation of lgdistribution
    """
    df = pd.read_table(uri, sep=r"\s+", comment="#")
    return df


@resource.register(config['misincorporation']['pattern'],
                   priority=config['misincorporation']['priority'])
@pivot
@annotate_by_uri
def resource_mapdamage_misincorporation(uri, **kwargs):
    """Parse mapdamage misincorporation.txt

    Args:
      uri (str): filename

    Returns:
      DataFrame: DataFrame representation of misincorporation
    """
    df = pd.read_table(uri, sep="\t", comment="#")
    return df


aggregate = utils.aggregate_factory("mapdamage")
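
A quick sketch of reading the misincorporation table (directory placeholder; misincorporation.txt is the standard mapDamage output name).

df = resource_mapdamage_misincorporation("results_sample/misincorporation.txt")
print(df.head())
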
Example 10
        return DataFrame.from_records(
            [re.sub(" ", "_", x).split("\t")
             for x in re.findall(">>(.+)", data)],
            columns=["Statistic", "Value"],
            index="Statistic")
    for h, sec in zip(headings, sections):
        if not h == key:
            continue
        logger.debug("Parsing section {}".format(h))
        if h == "Header":
            d = DataFrame.from_records(
                [[re.sub("#", "", x) for x in re.split("\t", sec.strip())]])
        else:
            i = 1 if h.startswith("Sequence_Duplication") else 0
            columns = [re.sub("#", "", x)
                       for x in re.split("\t", sec.split("\n")[i].strip())]
            d = DataFrame.from_records(
                [re.split("\t", x.strip())
                 for x in sec.split("\n") if x and not x.startswith("#")],
                columns=columns,
                index=columns[0])
    return d


aggregate = utils.aggregate_factory("fastqc")
Example 11
# Copyright (C) 2015 by Per Unneberg
import pandas as pd
import bioodo
from bioodo import resource, annotate_by_uri, pivot, utils

config = bioodo.__RESOURCE_CONFIG__['rpkmforgenes']


@resource.register(config['rpkmforgenes']['pattern'],
                   priority=config['rpkmforgenes']['priority'])
@pivot
@annotate_by_uri
def resource_rpkmforgenes(uri, **kwargs):
    with open(uri) as fh:
        data = pd.read_csv(fh,
                           sep="\t",
                           header=None,
                           comment="#",
                           names=["gene_id", "transcript_id", "FPKM", "TPM"],
                           index_col=["gene_id"])
    return data


aggregate = utils.aggregate_factory("rpkmforgenes")
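
A minimal usage sketch (file name hypothetical), assuming the decorators pass the frame through when called without extra keyword arguments; the column names come from the parser itself.

df = resource_rpkmforgenes("sample.rpkmforgenes.txt")
print(df[["FPKM", "TPM"]].head())  # indexed by gene_id
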
Example 12
regex = r"\n===\s*(?P<Read>(First read|Second read)?):?" + \
        r"\s+Adapter\s+'(?P<Adapter>[^\s]+)'\s+==="
adapter_re = re.compile(regex)

re_trim = re.compile(r'(\([0-9.]+%\)|,| |bp)')


def _split_x(x, delim=":"):
    y = x.strip().split(delim)
    return [y[0], re_trim.sub("", y[1])]


# For now only return the summary section
@resource.register(config['metrics']['pattern'],
                   priority=config['metrics']['priority'])
@pivot
@annotate_by_uri
def resource_cutadapt_metrics(uri, **kwargs):
    with open(uri) as fh:
        data = "".join(fh)
    sections = re.split("\n===.*===\n", data)
    df = DataFrame.from_records(
        [_split_x(x) for x in sections[1].split("\n") if x],
        index=["statistic"],
        columns=["statistic", "value"])
    df["value"] = pd.to_numeric(df["value"])
    return df


aggregate = utils.aggregate_factory("cutadapt")
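
A short sketch for the cutadapt metrics parser (report name hypothetical); only the summary section is parsed, with thousands separators, percentages and "bp" suffixes stripped before numeric conversion.

# Any cutadapt report with a "=== Summary ===" section should match.
df = resource_cutadapt_metrics("sample.cutadapt_metrics.txt")
print(df["value"])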