Code example #1
File: multiqc.py Project: matthdsm/bcbio-nextgen
def _merge_fastqc(samples):
    """
    merge all fastqc samples into one by module
    """
    fastqc_list = collections.defaultdict(list)
    seen = set()
    for data in samples:
        name = dd.get_sample_name(data)
        if name in seen:
            continue
        seen.add(name)
        fns = glob.glob(os.path.join(dd.get_work_dir(data), "qc", dd.get_sample_name(data), "fastqc") + "/*")
        for fn in fns:
            if fn.endswith("tsv"):
                metric = os.path.basename(fn)
                fastqc_list[metric].append([name, fn])

    for metric in fastqc_list:
        dt_by_sample = []
        for fn in fastqc_list[metric]:
            dt = pd.read_csv(fn[1], sep="\t")
            dt['sample'] = fn[0]
            dt_by_sample.append(dt)
        dt = utils.rbind(dt_by_sample)
        dt.to_csv(metric, sep="\t", index=False, mode='w')
    return samples
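
utils.rbind, used throughout these examples, is bcbio's helper for row-binding a list of DataFrames (named after R's rbind). A minimal sketch of what it plausibly does, assuming a thin wrapper over pandas.concat; the real helper may differ in how it treats indexes and mismatched columns (the combine_sailfish examples below rely on the index surviving the bind, since they later read df.index back out):

import pandas as pd

def rbind(dfs):
    # Hypothetical stand-in for bcbio's utils.rbind: stack the frames
    # row-wise, keeping the union of columns and the original indexes.
    return pd.concat(dfs)
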
Code example #2
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples),
                                         True)
    if not to_combine:
        return samples

    out_file = os.path.join(work_dir, "sailfish", "combined.sf")
    if not file_exists(out_file):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(out_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_combined(data, out_file)
        updated_samples.append([data])
    return updated_samples
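
_sailfish_expression_parser is not shown in these listings; from how its result is used (a frame indexed by transcript with "tpm" and "sample" columns), a plausible sketch looks like the following. This is a hypothetical reconstruction, not bcbio's actual parser:

import pandas as pd

def _sailfish_expression_parser(sailfish_file, samplename):
    # Hypothetical sketch only; the real parser may handle
    # version-specific quirks of the Sailfish output format.
    df = pd.read_csv(sailfish_file, sep="\t")
    df.columns = [c.lower() for c in df.columns]  # e.g. Name/TPM -> name/tpm
    df = df.set_index("name")
    df["sample"] = samplename                     # tag every row with its sample
    return df
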
Code example #3
File: qcsummary.py Project: fw1121/bcbio-nextgen
def _get_module(fastq_list, module, wide=True):
    """Collect one FastQC module across samples into a single long DataFrame."""
    dt_together = []
    for sample in fastq_list:
        dt = []
        itern = fastq_list[sample].clean_data(module)
        header = itern[0]
        total = fastq_list[sample].clean_data("Basic Statistics")[4][1]
        for data in itern[1:]:
            if data[0].startswith("#"):
                header = data
                continue
            if wide:
                if data[0].find("-") > -1:
                    f, s = map(int, data[0].split("-"))
                    for pos in range(f, s):
                        dt.append([str(pos)] + data[1:])
                else:
                    dt.append(data)
        dt = pd.DataFrame(dt)
        dt.columns = [h.replace(" ", "_") for h in header]
        dt['sample'] = sample
        dt['total'] = total
        dt_together.append(dt)
    dt_together = utils.rbind(dt_together)
    return dt_together
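
A note on the wide branch above: FastQC groups later base positions into ranges like "10-14", and the loop expands each range back into one row per position. In miniature:

# Illustrative only: expanding a grouped FastQC row as the wide branch does.
data = ["10-14", "31.8", "34.0"]   # position range plus metric values
f, s = map(int, data[0].split("-"))
rows = [[str(pos)] + data[1:] for pos in range(f, s)]
# rows == [['10', '31.8', '34.0'], ['11', ...], ['12', ...], ['13', ...]]
# range(f, s) stops before the upper bound, so the last position in each
# group ('14' here) is not emitted.
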
Code example #4
def _merge_metrics(samples):
    """
    parse project.yaml file to get metrics for each bam
    """
    out_file = os.path.join("metrics", "metrics.tsv")
    dt_together = []
    cov = {}
    with file_transaction(out_file) as out_tx:
        for s in samples:
            s = s[0]
            m = tz.get_in(['summary', 'metrics'], s)
            if m:
                for me in m:
                    if isinstance(m[me], list):
                        m[me] = ":".join(m[me])
                dt = pd.DataFrame(m, index=['1'])
                dt['avg_coverage_per_region'] = _get_coverage_per_region(s['description'])
                cov[s['description']] = dt['avg_coverage_per_region'][0]
                # dt = pd.DataFrame.from_dict(m)
                dt.columns = [k.replace(" ", "_").replace("(", "").replace(")", "") for k in dt.columns]
                dt['sample'] = s['description']
                dt_together.append(dt)
        if len(dt_together) > 0:
            dt_together = utils.rbind(dt_together)
            dt_together.to_csv(out_tx, index=False, sep="\t")

    for i, s in enumerate(samples):
        if s[0]['description'] in cov:
            samples[i][0]['summary']['metrics']['avg_coverage_per_region'] = cov[s[0]['description']]
    return samples
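
The core move here: a flat metrics dict becomes a one-row DataFrame via an explicit index, after lists are collapsed to colon-joined strings, and column names are sanitized of spaces and parentheses. A toy run of the same steps, with a hypothetical metrics dict:

import pandas as pd

m = {"Percent Duplication": 12.5, "On target bases (%)": 88.1,
     "Quality format": ["standard", "illumina"]}
for me in m:
    if isinstance(m[me], list):
        m[me] = ":".join(m[me])           # lists become "standard:illumina"
dt = pd.DataFrame(m, index=['1'])         # one row; scalar values need an index
dt.columns = [k.replace(" ", "_").replace("(", "").replace(")", "")
              for k in dt.columns]
# columns: Percent_Duplication, On_target_bases_%, Quality_format
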
Code example #5
File: multiqc.py Project: zhangj5/bcbio-nextgen
def _merge_fastqc(samples):
    """
    merge all fastqc samples into one by module
    """
    fastqc_list = collections.defaultdict(list)
    seen = set()
    for data in samples:
        name = dd.get_sample_name(data)
        if name in seen:
            continue
        seen.add(name)
        fns = glob.glob(
            os.path.join(dd.get_work_dir(data), "qc", dd.get_sample_name(data),
                         "fastqc") + "/*")
        for fn in fns:
            if fn.endswith("tsv"):
                metric = os.path.basename(fn)
                fastqc_list[metric].append([name, fn])

    for metric in fastqc_list:
        dt_by_sample = []
        for fn in fastqc_list[metric]:
            dt = pd.read_csv(fn[1], sep="\t")
            dt['sample'] = fn[0]
            dt_by_sample.append(dt)
        dt = utils.rbind(dt_by_sample)
        dt.to_csv(metric, sep="\t", index=False, mode='w')
    return samples
Code example #6
File: spikein.py Project: zhangj5/bcbio-nextgen
def combine_spikein(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "spikein")
    dont_combine, to_combine = partition(dd.get_spikein_counts,
                                         dd.sample_data_iterator(samples),
                                         True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "spikein.sf")
    if not file_exists(tidy_file):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_spikein_counts(data)
            samplename = dd.get_sample_name(data)
            new_df = sailfish._sailfish_expression_parser(
                sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_spikein_counts(data, tidy_file)
        updated_samples.append([data])
    return updated_samples
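
Every writer in these examples goes through bcbio's file_transaction, so a partially written file never lands at the final path. A minimal sketch of that pattern under the simplest assumptions; bcbio's real context manager also handles config arguments, multiple output files, and cluster-safe temporary directories:

import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def file_transaction(out_file):
    # Hypothetical minimal analogue: hand the caller a temporary path and
    # promote it to the real location only if the block succeeds.
    tmp_dir = tempfile.mkdtemp()
    tx_file = os.path.join(tmp_dir, os.path.basename(out_file))
    try:
        yield tx_file
        shutil.move(tx_file, out_file)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
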
Code example #7
File: spikein.py Project: DoaneAS/bcbio-nextgen
def combine_spikein(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "spikein")
    dont_combine, to_combine = partition(dd.get_spikein_counts,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "spikein.sf")
    if not file_exists(tidy_file):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_spikein_counts(data)
            samplename = dd.get_sample_name(data)
            new_df = sailfish._sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_spikein_counts(data, tidy_file)
        updated_samples.append([data])
    return updated_samples
Code example #8
File: sailfish.py Project: pansapiens/bcbio-nextgen
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples),
                                         True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    if not all([
            file_exists(x)
            for x in [gene_tpm_file, tidy_file, transcript_tpm_file, tx2gene]
    ]):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot("id", "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot("id", "sample", "tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
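
The pivoting at the end is the interesting part: the tidy transcript/sample/tpm table becomes a wide TPM matrix, then collapses to genes through the tx2gene mapping. A toy version of the same reshaping (written with keyword arguments; the positional df.pivot(...) calls above are the older pandas API):

import pandas as pd

df = pd.DataFrame({"id": ["tx1", "tx2", "tx1", "tx2"],
                   "sample": ["A", "A", "B", "B"],
                   "tpm": [5.0, 3.0, 6.0, 2.0]})
pivot = df.pivot(index="id", columns="sample", values="tpm")  # transcripts x samples
tx2gene = pd.DataFrame({"gene_id": ["g1", "g1"]}, index=["tx1", "tx2"])
gene_tpm = pivot.join(tx2gene).groupby("gene_id").sum()
# gene_tpm:
#            A    B
# gene_id
# g1       8.0  8.0
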
Code example #9
File: coverage.py Project: vhuarui/bcbio-nextgen
def _combine_regional_coverage(in_bams, samplenames, chrom, start, end, work_dir):
    """
    given a list of bam files, sample names and a region, calculate the
    coverage in the region for each of the samples and return a tidy pandas
    dataframe of the format:

    chrom position coverage name
    """
    dfs = [_calc_regional_coverage(bam, chrom, start, end, sample, work_dir) for bam, sample
           in zip(in_bams, samplenames)]
    return rbind(dfs)
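
For reference, the tidy shape the docstring describes, and how the per-sample frames stack (pd.concat standing in for rbind, as sketched earlier):

import pandas as pd

a = pd.DataFrame({"chrom": ["chr1", "chr1"], "position": [100, 101],
                  "coverage": [30, 32], "name": ["sampleA", "sampleA"]})
b = pd.DataFrame({"chrom": ["chr1", "chr1"], "position": [100, 101],
                  "coverage": [25, 27], "name": ["sampleB", "sampleB"]})
combined = pd.concat([a, b], ignore_index=True)  # one long tidy frame
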
Code example #10
File: multiqc.py Project: Yixf-Self/bcbio-nextgen
def _merge_metrics(samples):
    """
    parse project.yaml file to get metrics for each bam
    """
    out_file = os.path.join("metrics", "metrics.tsv")
    dt_together = []
    cov = {}
    with file_transaction(out_file) as out_tx:
        for s in samples:
            sample_name = dd.get_sample_name(s)
            s = _add_disambiguate(s)
            if sample_name in cov:
                continue
            m = tz.get_in(['summary', 'metrics'], s)
            sample_file = os.path.abspath(
                os.path.join("metrics", "%s_bcbio.txt" % sample_name))
            if not tz.get_in(['summary', 'qc'], s):
                s.setdefault('summary', {})['qc'] = {}  # add qc without clobbering existing metrics
            if m:
                for me in list(m):  # snapshot keys; popping while iterating breaks on Python 3
                    if isinstance(m[me], (list, dict, tuple)):
                        m.pop(me, None)
                dt = pd.DataFrame(m, index=['1'])
                dt['avg_coverage_per_region'] = _get_coverage_per_region(s)
                cov[sample_name] = dt['avg_coverage_per_region'][0]
                dt.columns = [
                    k.replace(" ", "_").replace("(", "").replace(")", "")
                    for k in dt.columns
                ]
                dt['sample'] = sample_name
                dt['rRNA_rate'] = m.get('rRNA_rate', "NA")
                dt = _fix_duplicated_rate(dt)  # keep the corrected frame; the result was being discarded
                dt.transpose().to_csv(sample_file, sep="\t", header=False)
                dt_together.append(dt)
                s['summary']['qc'].update(
                    {'bcbio': {
                        'base': sample_file,
                        'secondary': []
                    }})
        if len(dt_together) > 0:
            dt_together = utils.rbind(dt_together)
            dt_together.to_csv(out_tx, index=False, sep="\t")

    out = []
    for s in samples:
        sample_name = dd.get_sample_name(s)  # recompute per sample; the loop above leaves a stale value
        if sample_name in cov:
            s['summary']['metrics']['avg_coverage_per_region'] = cov[sample_name]
        out.append(s)
    return out
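
The per-sample file written above is just the one-row frame turned sideways: dt.transpose().to_csv(..., header=False) emits one name<TAB>value line per metric, and the sample's 'qc' entry then points at that file. A toy illustration with hypothetical metric names:

import pandas as pd

dt = pd.DataFrame({"Mapped_reads": [1000000], "Duplicates_pct": [12.5],
                   "sample": ["S1"]}, index=['1'])
dt.transpose().to_csv("S1_bcbio.txt", sep="\t", header=False)
# S1_bcbio.txt now contains:
# Mapped_reads\t1000000
# Duplicates_pct\t12.5
# sample\tS1
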
Code example #11
File: coverage.py Project: roryk/exomeCov
def bcbio_metrics(args):
    """
    parse project.yaml file to get metrics for each bam
    """
    with open(args.bams[0]) as in_handle:
        project = yaml.safe_load(in_handle)  # safe_load avoids executing arbitrary YAML tags
    out_dir = safe_makedir(args.out)
    out_file = op.join(out_dir, "metrics.tsv")
    dt_together = []
    with file_transaction(out_file) as out_tx:
        for s in project['samples']:
            m = s['summary']['metrics']
            dt = pd.DataFrame.from_dict(m)
            dt.columns = [k.replace(" ", "_").replace("(", "").replace(")", "") for k in dt.columns]
            dt['sample'] = s['description']
            dt_together.append(dt)
        dt_together = rbind(dt_together)
        dt_together.to_csv(out_tx, index=False, sep="\t")
Code example #12
File: sailfish.py Project: Kange2014/bcbio-nextgen
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(work_dir, "sailfish", "combined.sf")
    transcript_tpm_file = os.path.join(work_dir, "sailfish",
                                       "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(work_dir, "sailfish",
                                 "combined.gene.sf.tpm")
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file,
                                         transcript_tpm_file]]):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot(None, "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot(None, "sample", "tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        updated_samples.append([data])
    return updated_samples
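
One subtlety in this older variant: df.pivot(None, "sample", "tpm") passes index=None, which tells pandas to key the pivot on the existing index (the transcript names set by the parser) rather than on a materialized "id" column. A two-row demonstration:

import pandas as pd

df = pd.DataFrame({"sample": ["A", "B"], "tpm": [5.0, 6.0]},
                  index=pd.Index(["tx1", "tx1"], name="name"))
wide = df.pivot(columns="sample", values="tpm")  # index omitted: reuse "name"
# wide:
#           A    B
# name
# tx1     5.0  6.0
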
Code example #13
File: sailfish.py Project: samesun/bcbio-nextgen
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(work_dir, "sailfish", "combined.sf")
    transcript_tpm_file = os.path.join(work_dir, "sailfish",
                                       "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(work_dir, "sailfish",
                                 "combined.gene.sf.tpm")
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file,
                                         transcript_tpm_file]]):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot(None, "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot(None, "sample", "tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        updated_samples.append([data])
    return updated_samples
Code example #14
def _merge_metrics(yaml_data):
    """
    parse project.yaml file to get metrics for each bam
    """
    project = yaml_data
    out_file = os.path.join("metrics", "metrics.tsv")
    dt_together = []
    with file_transaction(out_file) as out_tx:
        for s in project['samples']:
            m = s['summary']['metrics']
            for me in m:
                if isinstance(m[me], list):
                    m[me] = ":".join(m[me])
            dt = pd.DataFrame(m, index=['1'])
            # dt = pd.DataFrame.from_dict(m)
            dt.columns = [k.replace(" ", "_").replace("(", "").replace(")", "") for k in dt.columns]
            dt['sample'] = s['description']
            dt_together.append(dt)
        dt_together = utils.rbind(dt_together)
        dt_together.to_csv(out_tx, index=False, sep="\t")
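
The commented-out from_dict line hints at why the explicit index is needed: pandas refuses to build a frame from a dict of scalars without one.

import pandas as pd

m = {"Mapped_reads": 1000000, "Duplicates_pct": 12.5}
# pd.DataFrame.from_dict(m)        # ValueError: all scalar values need an index
dt = pd.DataFrame(m, index=['1'])  # works: a single row labelled '1'
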
Code example #15
File: multiqc.py Project: hliang/bcbio-nextgen
def _merge_metrics(samples):
    """
    parse project.yaml file to get metrics for each bam
    """
    out_file = os.path.join("metrics", "metrics.tsv")
    dt_together = []
    cov = {}
    with file_transaction(out_file) as out_tx:
        for s in samples:
            sample_name = dd.get_sample_name(s)
            s = _add_disambiguate(s)
            if sample_name in cov:
                continue
            m = tz.get_in(['summary', 'metrics'], s)
            sample_file = os.path.abspath(os.path.join("metrics", "%s_bcbio.txt" % sample_name))
            if not tz.get_in(['summary', 'qc'], s):
                s.setdefault('summary', {})['qc'] = {}  # add qc without clobbering existing metrics
            if m:
                for me in list(m):  # snapshot keys; popping while iterating breaks on Python 3
                    if isinstance(m[me], (list, dict, tuple)):
                        m.pop(me, None)
                dt = pd.DataFrame(m, index=['1'])
                dt['avg_coverage_per_region'] = _get_coverage_per_region(s)
                cov[sample_name] = dt['avg_coverage_per_region'][0]
                dt.columns = [k.replace(" ", "_").replace("(", "").replace(")", "") for k in dt.columns]
                dt['sample'] = sample_name
                dt['rRNA_rate'] = m.get('rRNA_rate', "NA")
                dt = _fix_duplicated_rate(dt)  # keep the corrected frame; the result was being discarded
                dt.transpose().to_csv(sample_file, sep="\t", header=False)
                dt_together.append(dt)
                s['summary']['qc'].update({'bcbio': {'base': sample_file, 'secondary': []}})
        if len(dt_together) > 0:
            dt_together = utils.rbind(dt_together)
            dt_together.to_csv(out_tx, index=False, sep="\t")

    out = []
    for s in samples:
        sample_name = dd.get_sample_name(s)  # recompute per sample; the loop above leaves a stale value
        if sample_name in cov:
            s['summary']['metrics']['avg_coverage_per_region'] = cov[sample_name]
        out.append(s)
    return out
Code example #16
def _merge_fastqc(data):
    """
    merge all fastqc samples into one by module
    """
    fastqc_list = defaultdict(list)
    for sample in data:
        name = dd.get_sample_name(sample[0])
        fns = glob.glob(os.path.join(dd.get_work_dir(sample[0]), "qc", dd.get_sample_name(sample[0]), "fastqc") + "/*")
        for fn in fns:
            if fn.endswith("tsv"):
                metric = os.path.basename(fn)
                fastqc_list[metric].append([name, fn])
    for metric in fastqc_list:
        dt_by_sample = []
        for fn in fastqc_list[metric]:
            dt = pd.read_csv(fn[1], sep="\t")
            dt['sample'] = fn[0]
            dt_by_sample.append(dt)
        dt = utils.rbind(dt_by_sample)
        dt.to_csv(metric, sep="\t", index=False, mode='w')
    return [data]
Code example #17
def _merge_metrics(samples):
    """
    parse project.yaml file to get metrics for each bam
    """
    out_file = os.path.join("metrics", "metrics.tsv")
    dt_together = []
    cov = {}
    with file_transaction(out_file) as out_tx:
        for s in samples:
            s = s[0]
            if s['description'] in cov:
                continue
            m = tz.get_in(['summary', 'metrics'], s)
            if m:
                for me in m:
                    if isinstance(m[me], list):
                        m[me] = ":".join(m[me])
                dt = pd.DataFrame(m, index=['1'])
                dt['avg_coverage_per_region'] = _get_coverage_per_region(
                    s['description'])
                cov[s['description']] = dt['avg_coverage_per_region'][0]
                # dt = pd.DataFrame.from_dict(m)
                dt.columns = [
                    k.replace(" ", "_").replace("(", "").replace(")", "")
                    for k in dt.columns
                ]
                dt['sample'] = s['description']
                dt_together.append(dt)
        if len(dt_together) > 0:
            dt_together = utils.rbind(dt_together)
            dt_together.to_csv(out_tx, index=False, sep="\t")

    for i, s in enumerate(samples):
        if s[0]['description'] in cov:
            samples[i][0]['summary']['metrics'][
                'avg_coverage_per_region'] = cov[s[0]['description']]
    return samples
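
Finally, a recurring idiom worth naming: bcbio's pipeline passes its sample list as single-element lists, which is why these functions unwrap with s = s[0] and re-wrap results with updated_samples.append([data]). In miniature:

# Illustrative only: the [[data], [data], ...] nesting used throughout.
samples = [[{"description": "S1"}], [{"description": "S2"}]]
updated = []
for s in samples:
    data = s[0]             # unwrap the one-element list
    data["seen"] = True
    updated.append([data])  # re-wrap for the next pipeline step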