import logging
import multiprocessing
import os
import re
import sys
from tempfile import NamedTemporaryFile

import numpy as np
import pandas as pd
import pysam
import qnorm
from sklearn.preprocessing import scale
from tqdm import tqdm

# load_heatmap_data is assumed to come from the biofluff package
from fluff.fluffio import load_heatmap_data

logger = logging.getLogger(__name__)


def load_data(featurefile, datafiles, amount_bins, extend_dyn_up,
              extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam,
              guard=None, ncpus=12):
    if guard is None:
        guard = []
    # Calculate the profile data
    data = {}
    regions = []
    print("Loading data")
    try:
        # Load data in parallel, one job per data file
        pool = multiprocessing.Pool(processes=ncpus)
        jobs = []
        for datafile in datafiles:
            jobs.append(pool.apply_async(
                load_heatmap_data,
                args=(featurefile, datafile, amount_bins, extend_dyn_up,
                      extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize,
                      dynam, guard)))
        for job in jobs:
            track, regions, profile, guard = job.get()
            data[track] = profile
    except Exception as e:
        # Parallel loading failed; fall back to loading the files one by one
        sys.stderr.write("Error loading data in parallel, trying serial\n")
        sys.stderr.write("Error: {}\n".format(e))
        for datafile in datafiles:
            track, regions, profile, guard = load_heatmap_data(
                featurefile, datafile, amount_bins, extend_dyn_up,
                extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize,
                dynam, guard)
            data[track] = profile
    return data, regions, guard
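
# A minimal usage sketch for load_data (not part of the original module).
# The file names below are hypothetical, and load_heatmap_data is assumed
# to come from fluff (fluff.fluffio). Each track's binned profile is keyed
# by track name in the returned dict.
def _example_load_data():
    profiles, regions, _guard = load_data(
        "peaks.bed",                     # hypothetical BED file with regions
        ["sample1.bam", "sample2.bam"],  # hypothetical indexed BAM files
        amount_bins=100,       # split each region into 100 bins
        extend_dyn_up=5000,    # extend regions 5 kb upstream
        extend_dyn_down=5000,  # extend regions 5 kb downstream
        rmdup=True,            # remove duplicate reads
        rpkm=False,            # keep raw counts instead of RPKM
        rmrepeats=True,        # filter repeat-derived reads
        fragmentsize=200,      # extend reads to 200 bp fragments
        dynam=False,           # no dynamic region extension
    )
    for track, profile in profiles.items():
        print(track, profile.shape)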
def _load_bams(self, bams, title, window=200):
    tmp = pd.DataFrame(index=self.regions)
    with NamedTemporaryFile(mode="w") as f_out:
        # Write the regions as a BED file for load_heatmap_data
        for region in self.regions:
            print("{}\t{}\t{}".format(*re.split("[:-]", region)), file=f_out)
        f_out.flush()

        for bam in bams:
            result = load_heatmap_data(
                f_out.name,
                bam,
                bins=1,
                up=window // 2,
                down=window // 2,
                rmdup=True,
                rmrepeats=True,
            )
            tmp[result[0]] = result[2].T[0]

    fname = f"{self.data_dir}/{title}.qnorm.ref.txt.gz"
    if os.path.exists(fname):
        logger.debug(f"quantile normalization for {title}")
        qnorm_ref = pd.read_table(fname, index_col=0)["qnorm_ref"].values
        if len(self.regions) != len(qnorm_ref):
            qnorm_ref = np.random.choice(
                qnorm_ref, size=len(self.regions), replace=True)
        tmp = qnorm.quantile_normalize(tmp, target=qnorm_ref)
    else:
        tmp = np.log1p(tmp)

    # Limit memory usage by using float16
    tmp = tmp.mean(1).astype("float16").to_frame(title)

    fname = f"{self.data_dir}/{title}.mean.ref.txt.gz"
    if self.region_type == "reference" and os.path.exists(fname):
        mean_ref = pd.read_table(fname, index_col=0)
        if mean_ref.shape[0] == tmp.shape[0]:
            mean_ref.index = tmp.index
            tmp[f"{title}.relative"] = (
                tmp[title] - mean_ref.loc[tmp.index]["mean_ref"].values)
            tmp[f"{title}.relative"] = scale(tmp[f"{title}.relative"])
        else:
            logger.debug(
                f"Regions of {fname} are not the same as input regions.")
            logger.debug("Skipping calculation of relative values.")

    tmp[title] = tmp[title] / tmp[title].max()
    return tmp
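
# A usage sketch for _load_bams. The enclosing class is not shown in this
# excerpt, so _ExampleContainer below is a hypothetical stand-in that
# provides the attributes the method actually uses: `regions` in
# "chrom:start-end" form, a `data_dir` that may hold *.qnorm.ref.txt.gz /
# *.mean.ref.txt.gz reference tables, and a `region_type`.
class _ExampleContainer:
    def __init__(self, regions, data_dir, region_type="custom"):
        self.regions = regions
        self.data_dir = data_dir
        self.region_type = region_type


_ExampleContainer._load_bams = _load_bams  # attach the method to the stand-in


def _example_load_bams():
    container = _ExampleContainer(
        regions=["chr1:1000-1200", "chr1:5000-5200"],  # hypothetical regions
        data_dir="reference",                          # hypothetical directory
    )
    # Returns one column "H3K27ac" with the per-region mean signal scaled to
    # a 0-1 range (plus an "H3K27ac.relative" column when a mean reference
    # table is present and matches the input regions).
    return container._load_bams(["sample.bam"], title="H3K27ac", window=200)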
def coverage_table(
    peakfile,
    datafiles,
    window,
    log_transform=True,
    normalization="none",
    top=0,
    topmethod="var",
    rmdup=True,
    rmrepeats=True,
    ncpus=12,
):
    for x in datafiles:
        if not os.path.isfile(x):
            print("ERROR: Data file '{0}' does not exist".format(x))
            sys.exit(1)
    for x in datafiles:
        if ".bam" in x and not os.path.isfile("{0}.bai".format(x)):
            print("Data file '{0}' does not have an index file."
                  " Creating an index file for {0}.".format(x))
            pysam.index(x)

    logger.info("Loading data")
    data = {}
    try:
        # Load data in parallel, one job per data file
        pool = multiprocessing.Pool(processes=ncpus)
        jobs = []
        for datafile in datafiles:
            jobs.append(
                pool.apply_async(
                    load_heatmap_data,
                    args=(
                        peakfile,
                        datafile,
                        1,            # amount_bins
                        window // 2,  # extend_dyn_up
                        window // 2,  # extend_dyn_down
                        rmdup,
                        False,        # rpkm
                        rmrepeats,
                        None,         # fragmentsize
                        False,        # dynam
                        None,         # guard
                    ),
                ))
        for job in tqdm(jobs):
            track, regions, profile, guard = job.get()
            data[os.path.splitext(track)[0]] = profile[:, 0]
    except Exception as e:
        # Parallel loading failed; fall back to loading the files one by one
        sys.stderr.write("Error loading data in parallel, trying serial\n")
        sys.stderr.write("Error: {}\n".format(e))
        for datafile in tqdm(datafiles):
            track, regions, profile, guard = load_heatmap_data(
                peakfile, datafile, 1, window // 2, window // 2,
                rmdup, False, rmrepeats, None, False, None,
            )
            data[os.path.splitext(track)[0]] = profile[:, 0]

    # Create DataFrame with regions ("chrom:start-end") as index
    regions = ["{}:{}-{}".format(*region[:3]) for region in regions]
    df = pd.DataFrame(data, index=regions)

    if log_transform:
        logger.info("Log transform")
        df = np.log1p(df)
    if normalization == "scale":
        logger.info("Normalization by scaling")
        df[:] = scale(df, axis=0)
    elif normalization == "quantile":
        logger.info("Normalization by quantile normalization")
        df = qnorm.quantile_normalize(df)
    else:
        logger.info("No normalization")

    if top > 0:
        # Keep the `top` regions with the highest variance, standard
        # deviation or mean across samples, or a random sample of regions
        if topmethod == "var":
            idx = df.var(1).sort_values().tail(top).index
        elif topmethod == "std":
            idx = df.std(1).sort_values().tail(top).index
        elif topmethod == "mean":
            idx = df.mean(1).sort_values().tail(top).index
        elif topmethod == "random":
            idx = df.sample(top).index
        else:
            raise ValueError(
                "unknown method {} for selecting regions".format(topmethod))
        df = df.loc[idx]

    return df
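
# A usage sketch for coverage_table (file names are hypothetical): build a
# regions-by-samples read-count table in a 200 bp window around each peak,
# log-transform it, quantile-normalize it, and keep the 1000 most variable
# regions.
def _example_coverage_table():
    df = coverage_table(
        "peaks.bed",                     # hypothetical BED file with peaks
        ["sample1.bam", "sample2.bam"],  # hypothetical indexed BAM files
        window=200,
        log_transform=True,
        normalization="quantile",
        top=1000,
        topmethod="var",
    )
    return df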