def combine_spikein(samples):
    """Merge the per-sample spike-in quantifications into one tidy table.

    The merged table is written to ``spikein/spikein.sf`` under the work
    directory, unless that file already exists, and every sample is then
    pointed at it.

    Args:
        samples: sample collection accepted by ``dd.sample_data_iterator``.

    Returns:
        ``samples`` untouched when no sample has spike-in counts. Otherwise a
        new list holding one single-element list per sample, each sample
        updated through ``dd.set_spikein_counts`` with the combined file.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    spikein_dir = os.path.join(work_dir, "spikein")
    # NOTE(review): partition(..., True) is assumed to return
    # (samples failing the predicate, samples passing it) -- confirm.
    _, with_counts = partition(dd.get_spikein_counts,
                               dd.sample_data_iterator(samples), True)
    if not with_counts:
        return samples

    tidy_file = os.path.join(spikein_dir, "spikein.sf")
    if not file_exists(tidy_file):
        logger.info("Combining count files into %s." % tidy_file)
        combined = pd.DataFrame()
        for sample in with_counts:
            counts_file = dd.get_spikein_counts(sample)
            name = dd.get_sample_name(sample)
            parsed = sailfish._sailfish_expression_parser(counts_file, name)
            # The first non-empty table seeds the result; later ones are
            # appended row-wise.
            combined = parsed if combined.empty else rbind([combined, parsed])
        combined["id"] = combined.index
        # some versions of the transcript annotations can have duplicated
        # entries
        combined = combined.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            combined.to_csv(tx_out_file, sep="\t", index_label="name")
        logger.info("Finished combining count files into %s." % tidy_file)

    return [[dd.set_spikein_counts(sample, tidy_file)]
            for sample in dd.sample_data_iterator(samples)]
def combine_spikein(samples):
    """Build the project-wide spike-in table and attach it to every sample.

    Samples that report spike-in counts are parsed and stacked into a single
    tab-separated file, ``spikein/spikein.sf`` inside the work directory.
    An existing file is reused rather than rebuilt.

    Args:
        samples: sample collection accepted by ``dd.sample_data_iterator``.

    Returns:
        The input ``samples`` when nothing has spike-in counts; otherwise a
        list of single-element lists, one per sample, each sample updated
        with the path of the combined file.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    out_dir = os.path.join(work_dir, "spikein")
    # NOTE(review): the second element is presumed to hold the samples for
    # which dd.get_spikein_counts is truthy -- verify against partition().
    _skipped, quantified = partition(dd.get_spikein_counts,
                                     dd.sample_data_iterator(samples), True)
    if not quantified:
        return samples

    tidy_file = os.path.join(out_dir, "spikein.sf")
    if not file_exists(tidy_file):
        logger.info("Combining count files into %s." % tidy_file)
        table = pd.DataFrame()
        for entry in quantified:
            per_sample = sailfish._sailfish_expression_parser(
                dd.get_spikein_counts(entry), dd.get_sample_name(entry))
            if table.empty:
                # Nothing accumulated yet: start from this sample's table.
                table = per_sample
                continue
            table = rbind([table, per_sample])
        table["id"] = table.index
        # some versions of the transcript annotations can have duplicated
        # entries
        table = table.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            table.to_csv(tx_out_file, sep="\t", index_label="name")
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for entry in dd.sample_data_iterator(samples):
        updated_samples.append([dd.set_spikein_counts(entry, tidy_file)])
    return updated_samples
def combine_sailfish(samples):
    """Stack the Sailfish output of all samples into one combined file.

    The combined table is written tab-separated to ``sailfish/combined.sf``
    inside the work directory unless the file is already present.

    Args:
        samples: sample collection accepted by ``dd.sample_data_iterator``.

    Returns:
        ``samples`` unchanged when no sample has Sailfish output; otherwise a
        list of single-element lists, one per sample, each sample updated
        through ``dd.set_sailfish_combined`` with the combined file.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    # NOTE(review): partition(..., True) is assumed to return
    # (samples failing the predicate, samples passing it) -- confirm.
    _, quantified = partition(dd.get_sailfish,
                              dd.sample_data_iterator(samples), True)
    if not quantified:
        return samples

    out_file = os.path.join(work_dir, "sailfish", "combined.sf")
    if not file_exists(out_file):
        combined = pd.DataFrame()
        for sample in quantified:
            quant_file = dd.get_sailfish(sample)
            name = dd.get_sample_name(sample)
            parsed = _sailfish_expression_parser(quant_file, name)
            # Seed with the first non-empty table, then append row-wise.
            combined = parsed if combined.empty else rbind([combined, parsed])
        with file_transaction(out_file) as tx_out_file:
            combined.to_csv(tx_out_file, sep="\t", index_label="name")

    return [[dd.set_sailfish_combined(sample, out_file)]
            for sample in dd.sample_data_iterator(samples)]
def _expand_dirs(in_files):
    """Replace every directory in ``in_files`` with the entries it contains.

    An entry counts as a directory when, after ``~`` expansion, it names an
    existing directory. Its non-hidden contents (everything matched by
    ``*``) are returned in its place. All other entries are passed through
    exactly as given.

    Args:
        in_files: iterable of file and/or directory paths.

    Returns:
        A list with the directory contents first (contents of later
        directories ahead of earlier ones), followed by the non-directory
        entries in input order.
    """
    plain_files = []
    per_dir_matches = []
    for in_file in in_files:
        expanded = os.path.expanduser(in_file)
        if os.path.isdir(expanded):
            # Escape the directory part so glob metacharacters in its name
            # ("[", "?", "*") are taken literally; only the trailing "*"
            # acts as a pattern.
            wildcard = os.path.join(glob.escape(expanded), "*")
            per_dir_matches.append(glob.glob(wildcard))
        else:
            plain_files.append(in_file)
    # Each directory's matches are placed ahead of everything collected
    # before it, so the groups are emitted in reverse encounter order.
    dir_files = itertools.chain.from_iterable(reversed(per_dir_matches))
    return list(itertools.chain(dir_files, plain_files))
def _expand_wildcards(in_files):
    """Expand entries containing ``*`` into the paths they match.

    Entries without a ``*`` are returned as given. Each entry with one is
    ``~``-expanded and globbed; a pattern that matches nothing contributes
    no paths.

    Args:
        in_files: iterable of paths and/or glob patterns.

    Returns:
        A list with the glob matches first (matches of later patterns ahead
        of earlier ones), followed by the remaining entries.
    """
    # NOTE(review): utils.partition is assumed to return
    # (entries failing the predicate, entries passing it) -- confirm.
    literal_paths, patterns = utils.partition(lambda path: "*" in path,
                                              in_files)
    for pattern in patterns:
        matches = glob.glob(os.path.expanduser(pattern))
        # Each pattern's matches go in front of everything gathered so far.
        literal_paths = itertools.chain(matches, literal_paths)
    return list(literal_paths)
def _expand_dirs(in_files, known_exts):
    """Replace directories in ``in_files`` with their files of known type.

    An entry counts as a directory when, after ``~`` expansion, it names an
    existing directory. It is replaced by the files inside it whose names
    end in one of ``known_exts``. All other entries are passed through
    exactly as given.

    Args:
        in_files: iterable of file and/or directory paths.
        known_exts: mapping (or any iterable) whose keys are the file
            extensions to pick up, e.g. ``".bam"``.

    Returns:
        A list with the directory matches first, followed by the
        non-directory entries in input order. A file matched by several
        overlapping extensions (such as ``.gz`` and ``.fastq.gz``) is listed
        once.
    """
    plain_files = []
    per_pattern_matches = []
    for in_file in in_files:
        expanded = os.path.expanduser(in_file)
        if not os.path.isdir(expanded):
            plain_files.append(in_file)
            continue
        # Escape the directory part so glob metacharacters in its name
        # ("[", "?", "*") are taken literally; only "*" + ext is a pattern.
        dir_pattern = glob.escape(expanded)
        for ext in known_exts:
            wildcard = os.path.join(dir_pattern, "*" + ext)
            per_pattern_matches.append(glob.glob(wildcard))

    # Each pattern's matches are placed ahead of everything collected before
    # it, so the groups are emitted in reverse encounter order.
    seen = set()
    dir_files = []
    for match in itertools.chain.from_iterable(reversed(per_pattern_matches)):
        if match not in seen:
            seen.add(match)
            dir_files.append(match)
    return dir_files + plain_files
def combine_sailfish(samples):
    """Combine per-sample Sailfish output into project-level tables.

    Four files are produced in the ``sailfish`` directory under the work
    directory, unless all of them already exist:

    * ``combined.sf`` -- all samples stacked in tidy (long) form
    * ``combined.isoform.sf.tpm`` -- transcript x sample TPM matrix
    * ``combined.gene.sf.tpm`` -- gene x sample matrix, transcript TPMs
      summed per gene
    * ``tx2gene.csv`` -- written by ``gtf.tx2genefile``

    Args:
        samples: sample collection accepted by ``dd.sample_data_iterator``.

    Returns:
        ``samples`` unchanged when no sample has Sailfish output; otherwise a
        list of single-element lists, one per sample, each sample updated
        with the paths of the four files.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    # NOTE(review): partition(..., True) is assumed to return
    # (samples failing the predicate, samples passing it) -- confirm.
    _, to_combine = partition(dd.get_sailfish,
                              dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    outputs = [gene_tpm_file, tidy_file, transcript_tpm_file, tx2gene]
    if not all(file_exists(x) for x in outputs):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated
        # entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        # pivot() must be called with keywords: recent pandas releases no
        # longer accept index/columns/values positionally. The matrix is
        # built once and shared by the transcript and gene level outputs.
        transcript_tpm = df.pivot(index="id", columns="sample", values="tpm")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            transcript_tpm.to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            # One row per transcript id with its gene id in a single column.
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            # join() returns a new frame, so transcript_tpm is not modified.
            gene_tpm = transcript_tpm.join(tdf).groupby("gene_id").sum()
            gene_tpm.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
def _merge_sailfish_quant(to_combine):
    """Stack the parsed Sailfish output of each sample into one tidy frame."""
    df = pd.DataFrame()
    for data in to_combine:
        sailfish_file = dd.get_sailfish(data)
        samplename = dd.get_sample_name(data)
        new_df = _sailfish_expression_parser(sailfish_file, samplename)
        # The first non-empty table seeds the result; later ones are
        # appended row-wise.
        df = new_df if df.empty else rbind([df, new_df])
    df["id"] = df.index
    # some versions of the transcript annotations can have duplicated entries
    return df.drop_duplicates(["id", "sample"])


def combine_sailfish(samples):
    """Write the project-level Sailfish tables and attach them to samples.

    Unless all of them already exist, four files are written to the
    ``sailfish`` directory under the work directory: the tidy table
    ``combined.sf``, the transcript x sample TPM matrix
    ``combined.isoform.sf.tpm``, the gene x sample matrix
    ``combined.gene.sf.tpm`` (transcript TPMs summed per gene) and
    ``tx2gene.csv`` (written by ``gtf.tx2genefile``).

    Args:
        samples: sample collection accepted by ``dd.sample_data_iterator``.

    Returns:
        ``samples`` unchanged when no sample has Sailfish output; otherwise a
        list of single-element lists, one per sample, each sample updated
        with the paths of the four files.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    # NOTE(review): partition(..., True) is assumed to return
    # (samples failing the predicate, samples passing it) -- confirm.
    _, to_combine = partition(dd.get_sailfish,
                              dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    expected = [gene_tpm_file, tidy_file, transcript_tpm_file, tx2gene]
    if not all(file_exists(x) for x in expected):
        logger.info("Combining count files into %s." % tidy_file)
        df = _merge_sailfish_quant(to_combine)
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        # Keyword arguments are required here: recent pandas releases reject
        # positional index/columns/values. Built once, used for both the
        # transcript and gene level matrices.
        transcript_tpm = df.pivot(index="id", columns="sample", values="tpm")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            transcript_tpm.to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            # One row per transcript id with its gene id in a single column.
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            # join() returns a new frame, so transcript_tpm is not modified.
            gene_tpm = transcript_tpm.join(tdf).groupby("gene_id").sum()
            gene_tpm.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
def combine_sailfish(samples):
    """Combine per-sample Sailfish output into project-level tables.

    Unless all of them already exist, three files are written to the
    ``sailfish`` directory under the work directory:

    * ``combined.sf`` -- all samples stacked in tidy (long) form
    * ``combined.isoform.sf.tpm`` -- transcript x sample TPM matrix
    * ``combined.gene.sf.tpm`` -- gene x sample matrix, transcript TPMs
      summed per gene

    Args:
        samples: sample collection accepted by ``dd.sample_data_iterator``.

    Returns:
        ``samples`` unchanged when no sample has Sailfish output; otherwise a
        list of single-element lists, one per sample, each sample updated
        with the paths of the three files.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    # NOTE(review): partition(..., True) is assumed to return
    # (samples failing the predicate, samples passing it) -- confirm.
    _, to_combine = partition(dd.get_sailfish,
                              dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(work_dir, "sailfish", "combined.sf")
    transcript_tpm_file = os.path.join(work_dir, "sailfish",
                                       "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(work_dir, "sailfish", "combined.gene.sf.tpm")
    outputs = [gene_tpm_file, tidy_file, transcript_tpm_file]
    if not all(file_exists(x) for x in outputs):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        # Leaving ``index`` out makes pivot() use the frame's existing index.
        # Keywords are required: recent pandas releases reject positional
        # arguments, and an explicit ``None`` index is no longer treated as
        # "use the current index". Built once for both outputs below.
        transcript_tpm = df.pivot(columns="sample", values="tpm")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            transcript_tpm.to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            # One row per transcript id with its gene id in a single column.
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            # join() returns a new frame, so transcript_tpm is not modified.
            gene_tpm = transcript_tpm.join(tdf).groupby("gene_id").sum()
            gene_tpm.to_csv(tx_out_file, sep="\t")

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        updated_samples.append([data])
    return updated_samples