def _merge_fastqc(samples):
    """Merge the per-sample fastqc tsv outputs into one table per module.

    Each fastqc module tsv found under <work>/qc/<sample>/fastqc is stacked
    across samples (with a 'sample' column added) and written out under the
    module's basename. Returns the input samples unchanged.
    """
    by_metric = collections.defaultdict(list)
    processed = set()
    for data in samples:
        sample_name = dd.get_sample_name(data)
        # the same sample can appear multiple times; merge it only once
        if sample_name in processed:
            continue
        processed.add(sample_name)
        qc_glob = os.path.join(dd.get_work_dir(data), "qc",
                               dd.get_sample_name(data), "fastqc") + "/*"
        for fn in glob.glob(qc_glob):
            if fn.endswith("tsv"):
                by_metric[os.path.basename(fn)].append([sample_name, fn])
    for metric in by_metric:
        frames = []
        for sample_name, fn in by_metric[metric]:
            frame = pd.read_csv(fn, sep="\t")
            frame['sample'] = sample_name
            frames.append(frame)
        merged = utils.rbind(frames)
        merged.to_csv(metric, sep="\t", index=False, mode='w')
    return samples
def combine_sailfish(samples):
    """Combine the per-sample sailfish quantifications into one combined.sf.

    Samples without a sailfish result are left alone; the combined path is
    recorded on every sample via dd.set_sailfish_combined.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    out_file = os.path.join(work_dir, "sailfish", "combined.sf")
    if not file_exists(out_file):
        combined = pd.DataFrame()
        for data in to_combine:
            parsed = _sailfish_expression_parser(dd.get_sailfish(data),
                                                 dd.get_sample_name(data))
            combined = parsed if combined.empty else rbind([combined, parsed])
        with file_transaction(out_file) as tx_out_file:
            combined.to_csv(tx_out_file, sep="\t", index_label="name")
    return [[dd.set_sailfish_combined(data, out_file)]
            for data in dd.sample_data_iterator(samples)]
def _get_module(fastq_list, module, wide=True):
    """Extract one FastQC module across samples into a single DataFrame.

    ``fastq_list`` maps sample name -> a FastQC parser object exposing
    ``clean_data(module)`` (first row is the header; "#"-prefixed rows
    replace it). When ``wide`` is True, position ranges such as "10-14"
    are expanded to one row per position. Each row is annotated with the
    sample name and its total count from the Basic Statistics module.
    """
    dt_together = []
    for sample in fastq_list:
        dt = []
        itern = fastq_list[sample].clean_data(module)
        header = itern[0]
        # total sequences, taken from the Basic Statistics module
        # (assumes row 4 holds that value — matches the FastQC layout)
        total = fastq_list[sample].clean_data("Basic Statistics")[4][1]
        for data in itern[1:]:
            if data[0].startswith("#"):
                # a later "#"-prefixed row becomes the effective header
                header = data
                continue
            if wide:
                if data[0].find("-") > -1:
                    f, s = map(int, data[0].split("-"))
                    # FastQC ranges are inclusive; the previous
                    # range(f, s) dropped the last position of each range
                    for pos in range(f, s + 1):
                        dt.append([str(pos)] + data[1:])
                else:
                    dt.append(data)
        dt = pd.DataFrame(dt)
        dt.columns = [h.replace(" ", "_") for h in header]
        dt['sample'] = sample
        dt['total'] = total
        dt_together.append(dt)
    dt_together = utils.rbind(dt_together)
    return dt_together
def _merge_metrics(samples):
    """Collect per-sample metrics from the run data and write metrics/metrics.tsv.

    ``samples`` is a list of one-element lists wrapping sample dictionaries.
    Also writes each sample's average regional coverage back into its
    ``summary.metrics`` and returns the (mutated) input list.
    """
    # relative path: assumes the current working directory is the work dir
    out_file = os.path.join("metrics", "metrics.tsv")
    dt_together = []
    cov = {}
    with file_transaction(out_file) as out_tx:
        for s in samples:
            s = s[0]
            m = tz.get_in(['summary', 'metrics'], s)
            if m:
                # flatten list-valued metrics into ":"-joined strings so
                # every cell in the frame is a scalar (mutates the sample dict)
                for me in m:
                    if isinstance(m[me], list):
                        m[me] = ":".join(m[me])
                dt = pd.DataFrame(m, index=['1'])
                dt['avg_coverage_per_region'] = _get_coverage_per_region(s['description'])
                cov[s['description']] = dt['avg_coverage_per_region'][0]
                # dt = pd.DataFrame.from_dict(m)
                # normalize column names: drop spaces and parentheses
                dt.columns = [k.replace(" ", "_").replace("(", "").replace(")", "")
                              for k in dt.columns]
                dt['sample'] = s['description']
                dt_together.append(dt)
        if len(dt_together) > 0:
            dt_together = utils.rbind(dt_together)
            dt_together.to_csv(out_tx, index=False, sep="\t")
    # propagate the computed coverage back onto the original sample dicts
    for i, s in enumerate(samples):
        if s[0]['description'] in cov:
            samples[i][0]['summary']['metrics']['avg_coverage_per_region'] = cov[s[0]['description']]
    return samples
def _merge_fastqc(samples):
    """Write one merged tsv per fastqc module, combining every sample.

    Deduplicates samples by name, gathers the module tsvs from each
    sample's <work>/qc/<sample>/fastqc directory, stacks them with a
    'sample' column, and writes each merged table to the module basename.
    """
    per_metric = collections.defaultdict(list)
    handled = set()
    for data in samples:
        name = dd.get_sample_name(data)
        if name not in handled:
            handled.add(name)
            fastqc_dir = os.path.join(dd.get_work_dir(data), "qc",
                                      dd.get_sample_name(data), "fastqc")
            tsvs = [f for f in glob.glob(fastqc_dir + "/*") if f.endswith("tsv")]
            for tsv in tsvs:
                per_metric[os.path.basename(tsv)].append([name, tsv])
    for metric in per_metric:
        pieces = []
        for name, tsv in per_metric[metric]:
            piece = pd.read_csv(tsv, sep="\t")
            piece['sample'] = name
            pieces.append(piece)
        utils.rbind(pieces).to_csv(metric, sep="\t", index=False, mode='w')
    return samples
def combine_spikein(samples):
    """Merge per-sample spike-in counts into a single tidy spikein.sf file.

    Skips work when no sample has spike-in counts or the combined file
    already exists; records the tidy path on every sample.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    spikein_dir = os.path.join(work_dir, "spikein")
    dont_combine, to_combine = partition(dd.get_spikein_counts,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(spikein_dir, "spikein.sf")
    if not file_exists(tidy_file):
        logger.info("Combining count files into %s." % tidy_file)
        combined = pd.DataFrame()
        for data in to_combine:
            counts_file = dd.get_spikein_counts(data)
            name = dd.get_sample_name(data)
            parsed = sailfish._sailfish_expression_parser(counts_file, name)
            combined = parsed if combined.empty else rbind([combined, parsed])
        combined["id"] = combined.index
        # some versions of the transcript annotations can have duplicated entries
        combined = combined.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            combined.to_csv(tx_out_file, sep="\t", index_label="name")
        logger.info("Finished combining count files into %s." % tidy_file)
    updated = []
    for data in dd.sample_data_iterator(samples):
        updated.append([dd.set_spikein_counts(data, tidy_file)])
    return updated
def combine_spikein(samples):
    """Combine spike-in count files from all samples into spikein/spikein.sf."""
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    out_dir = os.path.join(work_dir, "spikein")
    dont_combine, with_counts = partition(dd.get_spikein_counts,
                                          dd.sample_data_iterator(samples), True)
    if not with_counts:
        return samples
    tidy_file = os.path.join(out_dir, "spikein.sf")
    if not file_exists(tidy_file):
        logger.info("Combining count files into %s." % tidy_file)
        frames = [sailfish._sailfish_expression_parser(dd.get_spikein_counts(data),
                                                       dd.get_sample_name(data))
                  for data in with_counts]
        # fold the frames together pairwise, matching the incremental merge
        combined = frames[0]
        for frame in frames[1:]:
            combined = rbind([combined, frame])
        combined["id"] = combined.index
        # some versions of the transcript annotations can have duplicated entries
        combined = combined.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out:
            combined.to_csv(tx_out, sep="\t", index_label="name")
        logger.info("Finished combining count files into %s." % tidy_file)
    return [[dd.set_spikein_counts(data, tidy_file)]
            for data in dd.sample_data_iterator(samples)]
def combine_sailfish(samples):
    """Combine per-sample sailfish results into tidy, TPM and tx2gene files.

    Writes four artifacts under <work>/sailfish: a tidy long-format table,
    transcript- and gene-level TPM matrices, and a transcript->gene map;
    records each path on every sample and returns the updated samples
    (each wrapped in a one-element list).
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    # only rebuild if any of the four output files is missing
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file,
                                         transcript_tpm_file, tx2gene]]):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            # NOTE(review): positional pivot(index, columns, values) — only
            # accepted by older pandas; newer releases require keywords
            df.pivot("id", "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot("id", "sample", "tpm")
            # map transcript ids to gene ids and sum TPMs per gene
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
        # rebinds tx2gene from the target path to whatever tx2genefile returns
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
def _combine_regional_coverage(in_bams, samplenames, chrom, start, end, work_dir):
    """Calculate per-sample coverage over one region and stack the results.

    Given parallel lists of bam files and sample names plus a region,
    returns a tidy pandas dataframe of the format:

    chrom position coverage name
    """
    per_sample = []
    for bam_file, sample in zip(in_bams, samplenames):
        per_sample.append(_calc_regional_coverage(bam_file, chrom, start, end,
                                                  sample, work_dir))
    return rbind(per_sample)
def combine_sailfish(samples):
    """Combine per-sample sailfish quantifications into shared output files.

    Produces, under <work>/sailfish: a tidy long-format table, transcript-
    and gene-level TPM matrices, and a transcript->gene mapping file; then
    stores each path on every sample via the dd setters. Returns the
    samples wrapped in one-element lists.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    # regenerate everything if any output is missing
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file,
                                         transcript_tpm_file, tx2gene]]):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            # NOTE(review): positional pivot(index, columns, values) is only
            # valid on older pandas; keywords are required in 2.x
            df.pivot("id", "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot("id", "sample", "tpm")
            # join transcript->gene ids and aggregate TPMs per gene
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
        # tx2gene is rebound from the target path to tx2genefile's return value
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
def _merge_metrics(samples):
    """Merge QC metrics for all samples into metrics/metrics.tsv.

    Also writes a transposed per-sample metrics file
    (``metrics/<name>_bcbio.txt``), registers it under ``summary.qc.bcbio``
    and stores ``avg_coverage_per_region`` back into each sample's
    ``summary.metrics``. Returns the updated sample dicts.
    """
    out_file = os.path.join("metrics", "metrics.tsv")
    dt_together = []
    cov = {}
    with file_transaction(out_file) as out_tx:
        for s in samples:
            sample_name = dd.get_sample_name(s)
            s = _add_disambiguate(s)
            # a sample can appear more than once; only merge each name once
            if sample_name in cov:
                continue
            m = tz.get_in(['summary', 'metrics'], s)
            sample_file = os.path.abspath(
                os.path.join("metrics", "%s_bcbio.txt" % sample_name))
            if not tz.get_in(['summary', 'qc'], s):
                s['summary'] = {"qc": {}}
            if m:
                # drop non-scalar metrics; iterate over a snapshot of the
                # keys so popping during the loop is safe on Python 3
                for me in list(m.keys()):
                    if isinstance(m[me], (list, dict, tuple)):
                        m.pop(me, None)
                dt = pd.DataFrame(m, index=['1'])
                dt['avg_coverage_per_region'] = _get_coverage_per_region(s)
                cov[sample_name] = dt['avg_coverage_per_region'][0]
                # normalize column names: drop spaces and parentheses
                dt.columns = [k.replace(" ", "_").replace("(", "").replace(")", "")
                              for k in dt.columns]
                dt['sample'] = sample_name
                dt['rRNA_rate'] = m.get('rRNA_rate', "NA")
                # previously the corrected frame was assigned to an unused
                # variable (df), silently discarding the duplication-rate fix
                dt = _fix_duplicated_rate(dt)
                dt.transpose().to_csv(sample_file, sep="\t", header=False)
                dt_together.append(dt)
                s['summary']['qc'].update({'bcbio': {'base': sample_file,
                                                     'secondary': []}})
        if len(dt_together) > 0:
            dt_together = utils.rbind(dt_together)
            dt_together.to_csv(out_tx, index=False, sep="\t")
    out = []
    for s in samples:
        # look up each sample's own name: the old code reused the stale
        # sample_name left over from the merge loop above, so every sample
        # got the last sample's coverage decision
        name = dd.get_sample_name(s)
        if name in cov:
            s['summary']['metrics']['avg_coverage_per_region'] = cov[name]
        out.append(s)
    return out
def bcbio_metrics(args):
    """Parse a bcbio project.yaml file and write a merged metrics.tsv.

    ``args.bams[0]`` is the project yaml path; ``args.out`` is the output
    directory (created if missing).
    """
    # safe_load avoids executing arbitrary tags from the yaml file, and the
    # context manager closes the handle (the old code leaked the open file)
    with open(args.bams[0]) as in_handle:
        project = yaml.safe_load(in_handle)
    out_dir = safe_makedir(args.out)
    out_file = op.join(out_dir, "metrics.tsv")
    dt_together = []
    with file_transaction(out_file) as out_tx:
        for s in project['samples']:
            m = s['summary']['metrics']
            dt = pd.DataFrame.from_dict(m)
            # normalize column names: drop spaces and parentheses
            dt.columns = [k.replace(" ", "_").replace("(", "").replace(")", "")
                          for k in dt.columns]
            dt['sample'] = s['description']
            dt_together.append(dt)
        dt_together = rbind(dt_together)
        dt_together.to_csv(out_tx, index=False, sep="\t")
def combine_sailfish(samples):
    """Combine sailfish results into tidy and TPM matrix files.

    Writes <work>/sailfish/combined.sf plus transcript- and gene-level TPM
    matrices, records the paths on every sample, and returns the samples
    wrapped in one-element lists.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(work_dir, "sailfish", "combined.sf")
    transcript_tpm_file = os.path.join(work_dir, "sailfish",
                                       "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(work_dir, "sailfish", "combined.gene.sf.tpm")
    # rebuild only if any of the three outputs is missing
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file,
                                         transcript_tpm_file]]):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            # index=None pivots on the existing index (transcript ids);
            # positional pivot args only work on older pandas
            df.pivot(None, "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot(None, "sample", "tpm")
            # join transcript->gene ids and sum TPM values per gene
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        updated_samples.append([data])
    return updated_samples
def _merge_metrics(yaml_data):
    """Write metrics/metrics.tsv from a parsed project.yaml dictionary.

    Takes the loaded project yaml, flattens each sample's summary metrics
    into one row and writes the stacked table through a file transaction.
    """
    out_file = os.path.join("metrics", "metrics.tsv")
    frames = []
    with file_transaction(out_file) as out_tx:
        for sample in yaml_data['samples']:
            metrics = sample['summary']['metrics']
            # collapse list values into ":"-joined strings (mutates the dict)
            for key in metrics:
                if isinstance(metrics[key], list):
                    metrics[key] = ":".join(metrics[key])
            frame = pd.DataFrame(metrics, index=['1'])
            frame.columns = [c.replace(" ", "_").replace("(", "").replace(")", "")
                             for c in frame.columns]
            frame['sample'] = sample['description']
            frames.append(frame)
        merged = utils.rbind(frames)
        merged.to_csv(out_tx, index=False, sep="\t")
def _merge_metrics(samples):
    """Merge per-sample QC metrics into metrics/metrics.tsv.

    Also emits ``metrics/<name>_bcbio.txt`` per sample, registers it under
    ``summary.qc.bcbio``, and copies ``avg_coverage_per_region`` back into
    each sample's ``summary.metrics``. Returns the updated sample dicts.
    """
    out_file = os.path.join("metrics", "metrics.tsv")
    dt_together = []
    cov = {}
    with file_transaction(out_file) as out_tx:
        for s in samples:
            sample_name = dd.get_sample_name(s)
            s = _add_disambiguate(s)
            # skip repeats of a sample already merged
            if sample_name in cov:
                continue
            m = tz.get_in(['summary', 'metrics'], s)
            sample_file = os.path.abspath(
                os.path.join("metrics", "%s_bcbio.txt" % sample_name))
            if not tz.get_in(['summary', 'qc'], s):
                s['summary'] = {"qc": {}}
            if m:
                # remove non-scalar metrics; snapshot the keys first so the
                # pop-while-iterating does not raise on Python 3
                for me in list(m.keys()):
                    if isinstance(m[me], (list, dict, tuple)):
                        m.pop(me, None)
                dt = pd.DataFrame(m, index=['1'])
                dt['avg_coverage_per_region'] = _get_coverage_per_region(s)
                cov[sample_name] = dt['avg_coverage_per_region'][0]
                # normalize column names: drop spaces and parentheses
                dt.columns = [k.replace(" ", "_").replace("(", "").replace(")", "")
                              for k in dt.columns]
                dt['sample'] = sample_name
                dt['rRNA_rate'] = m.get('rRNA_rate', "NA")
                # the old code stored the corrected frame in an unused df
                # variable, throwing the duplication-rate fix away
                dt = _fix_duplicated_rate(dt)
                dt.transpose().to_csv(sample_file, sep="\t", header=False)
                dt_together.append(dt)
                s['summary']['qc'].update({'bcbio': {'base': sample_file,
                                                     'secondary': []}})
        if len(dt_together) > 0:
            dt_together = utils.rbind(dt_together)
            dt_together.to_csv(out_tx, index=False, sep="\t")
    out = []
    for s in samples:
        # resolve each sample's own name; the old code tested the stale
        # sample_name from the previous loop for every sample
        name = dd.get_sample_name(s)
        if name in cov:
            s['summary']['metrics']['avg_coverage_per_region'] = cov[name]
        out.append(s)
    return out
def _merge_fastqc(data):
    """Merge fastqc tsv outputs across samples, one combined file per module.

    ``data`` is a list of one-element lists wrapping sample dicts; each
    combined module table gains a 'sample' column and is written to the
    module's basename. Returns the input wrapped in a list.
    """
    modules = defaultdict(list)
    for sample in data:
        sample_name = dd.get_sample_name(sample[0])
        pattern = os.path.join(dd.get_work_dir(sample[0]), "qc",
                               dd.get_sample_name(sample[0]), "fastqc") + "/*"
        for path in glob.glob(pattern):
            if path.endswith("tsv"):
                modules[os.path.basename(path)].append([sample_name, path])
    for module in modules:
        tables = []
        for sample_name, path in modules[module]:
            table = pd.read_csv(path, sep="\t")
            table['sample'] = sample_name
            tables.append(table)
        utils.rbind(tables).to_csv(module, sep="\t", index=False, mode='w')
    return [data]
def _merge_metrics(samples):
    """Merge summary metrics across samples into metrics/metrics.tsv.

    Also writes avg_coverage_per_region back into each sample's
    summary.metrics and returns the (mutated) input list.
    """
    out_file = os.path.join("metrics", "metrics.tsv")
    frames = []
    coverage = {}
    with file_transaction(out_file) as out_tx:
        for item in samples:
            sample = item[0]
            metrics = tz.get_in(['summary', 'metrics'], sample)
            if not metrics:
                continue
            # collapse list-valued metrics into ":"-joined strings
            for key in metrics:
                if isinstance(metrics[key], list):
                    metrics[key] = ":".join(metrics[key])
            frame = pd.DataFrame(metrics, index=['1'])
            frame['avg_coverage_per_region'] = _get_coverage_per_region(
                sample['description'])
            coverage[sample['description']] = frame['avg_coverage_per_region'][0]
            frame.columns = [c.replace(" ", "_").replace("(", "").replace(")", "")
                             for c in frame.columns]
            frame['sample'] = sample['description']
            frames.append(frame)
        if frames:
            merged = utils.rbind(frames)
            merged.to_csv(out_tx, index=False, sep="\t")
    for i, item in enumerate(samples):
        description = item[0]['description']
        if description in coverage:
            samples[i][0]['summary']['metrics']['avg_coverage_per_region'] = \
                coverage[description]
    return samples