# Shared imports for the snippets below
import numpy as np
import pandas as pd

import qnorm


def test_011_axis_pandas(self):
    """
    test pandas axis support
    """
    df = pd.DataFrame(
        {
            "C1": {"A": 5.0, "B": 2.0, "C": 3.0, "D": 4.0},
            "C2": {"A": 4.0, "B": 1.0, "C": 4.0, "D": 2.0},
            "C3": {"A": 3.0, "B": 4.0, "C": 6.0, "D": 8.0},
        }
    )

    np.testing.assert_array_almost_equal(
        qnorm.quantile_normalize(df.T, axis=0).T,
        qnorm.quantile_normalize(df, axis=1),
    )
    np.testing.assert_array_almost_equal(
        qnorm.quantile_normalize(df, axis=1),
        qnorm.quantile_normalize(df.T, axis=0).T,
    )
def test_001_pandas(self):
    """
    test pandas support
    """
    df = pd.DataFrame(
        {
            "C1": {"A": 5.0, "B": 2.0, "C": 3.0, "D": 4.0},
            "C2": {"A": 4.0, "B": 1.0, "C": 4.0, "D": 2.0},
            "C3": {"A": 3.0, "B": 4.0, "C": 6.0, "D": 8.0},
        }
    )

    qnorm.quantile_normalize(df)
def test_010_axis_numpy(self):
    """
    test numpy axis support
    """
    arr = np.random.normal(size=(50, 4))

    np.testing.assert_array_almost_equal(
        qnorm.quantile_normalize(arr.T, axis=0).T,
        qnorm.quantile_normalize(arr, axis=1),
    )
    np.testing.assert_array_almost_equal(
        qnorm.quantile_normalize(arr, axis=1),
        qnorm.quantile_normalize(arr.T, axis=0).T,
    )
def test_021_from_hdf_largefile(self):
    """
    test whether or not incremental_quantile_normalize works with a
    larger random file
    """
    np.random.seed(42)
    df1 = pd.DataFrame(
        index=range(5000),
        columns=["sample" + str(col) for col in range(100)],
        dtype=int,
    )
    df1[:] = np.random.randint(0, 100, size=df1.shape)
    df1.to_hdf("test_large.hdf", key="qnorm", format="table", data_columns=True)

    qnorm.incremental_quantile_normalize(
        "test_large.hdf",
        "test_large_out.hdf",
        rowchunksize=11,
        colchunksize=11,
    )
    df2 = pd.read_hdf("test_large_out.hdf")

    np.testing.assert_almost_equal(
        qnorm.quantile_normalize(df1), df2.values, decimal=4
    )
def test_027_from_parquet_largefile(self):
    """
    test whether or not incremental_quantile_normalize works with a
    larger random file
    """
    np.random.seed(42)
    df1 = pd.DataFrame(
        index=range(5000),
        columns=["sample" + str(col) for col in range(100)],
    )
    df1[:] = np.random.randint(0, 100, size=df1.shape)
    df1 = df1.astype(float)
    df1.to_parquet("test_large.parquet")

    qnorm.incremental_quantile_normalize(
        "test_large.parquet",
        "test_large_out.parquet",
        rowchunksize=11,
        colchunksize=11,
    )
    df2 = pd.read_parquet("test_large_out.parquet")

    np.testing.assert_almost_equal(
        qnorm.quantile_normalize(df1), df2.values, decimal=4
    )
def test_028(self):
    """
    Test another array, not just the wiki example.
    """
    df = pd.DataFrame(
        {
            "C1": {"A": 2.0, "B": 2.0, "C": 2.0, "D": 2.0, "E": 6.0, "F": 1.0},
            "C2": {"A": 2.0, "B": 2.0, "C": 1.0, "D": 3.5, "E": 5.0, "F": 1.0},
        }
    )
    np.testing.assert_almost_equal(
        qnorm.quantile_normalize(df).values,
        np.array(
            [
                [2.0625, 2.0],
                [2.0625, 2.0],
                [2.0625, 1.25],
                [2.0625, 2.75],
                [5.5, 5.5],
                [1.0, 1.25],
            ]
        ),
    )
def test_002_wiki(self):
    """
    test the wiki example
    https://en.wikipedia.org/wiki/Quantile_normalization
    """
    df = pd.DataFrame(
        {
            "C1": {"A": 5.0, "B": 2.0, "C": 3.0, "D": 4.0},
            "C2": {"A": 4.0, "B": 1.0, "C": 4.0, "D": 2.0},
            "C3": {"A": 3.0, "B": 4.0, "C": 6.0, "D": 8.0},
        }
    )

    result = np.array(
        [
            [5.66666667, 5.16666667, 2.0],
            [2.0, 2.0, 3.0],
            [3.0, 5.16666667, 4.66666667],
            [4.66666667, 3.0, 5.66666667],
        ]
    )

    np.testing.assert_array_almost_equal(
        qnorm.quantile_normalize(df).values, result
    )
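# A hand-rolled sketch (an illustration, not qnorm's implementation) of the
# computation test_002_wiki verifies: sort each column, average across the
# sorted rows, then map the averages back via within-column ranks, with ties
# getting the mean of the values they span. Assumes scipy is available.
def _wiki_qnorm_sketch(x):
    from scipy.stats import rankdata

    means = np.sort(x, axis=0).mean(axis=1)  # mean of each sorted row
    ranks = rankdata(x, axis=0)  # 1-based ranks, ties averaged
    # linear interpolation handles tied (half-integer) ranks
    return np.interp(ranks, np.arange(1, x.shape[0] + 1), means)


# _wiki_qnorm_sketch(df.values) reproduces the `result` array above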
def test_009_wiki_ncpus(self):
    """
    test if the wiki example also works with multiple cpus
    """
    df = pd.DataFrame(
        {
            "C1": {"A": 5.0, "B": 2.0, "C": 3.0, "D": 4.0},
            "C2": {"A": 4.0, "B": 1.0, "C": 4.0, "D": 2.0},
            "C3": {"A": 3.0, "B": 4.0, "C": 6.0, "D": 8.0},
        }
    )

    result = np.array(
        [
            [5.66666667, 5.16666667, 2.0],
            [2.0, 2.0, 3.0],
            [3.0, 5.16666667, 4.66666667],
            [4.66666667, 3.0, 5.66666667],
        ]
    )

    np.testing.assert_array_almost_equal(
        qnorm.quantile_normalize(df, ncpus=10).values, result
    )
def test_005_single(self):
    """
    if dtype is single, return single
    """
    arr = np.random.normal(0, 1, size=(20, 3))
    arr = arr.astype(np.float32)
    qnorm_arr = qnorm.quantile_normalize(arr)
    assert qnorm_arr.dtype == np.float32
def test_017_from_hdf(self):
    """
    test the basic incremental_quantile_normalize functionality
    """
    qnorm.incremental_quantile_normalize("test.hdf", "test_out.hdf")
    df1 = pd.read_hdf("test.hdf")
    df2 = pd.read_hdf("test_out.hdf")

    np.testing.assert_almost_equal(
        qnorm.quantile_normalize(df1), df2.values, decimal=5
    )
def test_006_target(self):
    """
    test if the target is used instead of the qnorm values
    """
    arr = np.array([np.arange(0, 10), np.arange(0, 10)]).T
    np.random.shuffle(arr)
    target = np.arange(10, 20)
    qnorm_arr = qnorm.quantile_normalize(arr, target=target)
    for val in target:
        assert (
            val in qnorm_arr[:, 0] and val in qnorm_arr[:, 1]
        ), f"value {val} not in qnorm array"
def _load_bams(self, bams, title, window=200):
    tmp = pd.DataFrame(index=self.regions)
    with NamedTemporaryFile(mode="w") as f_out:
        for region in self.regions:
            print("{}\t{}\t{}".format(*re.split("[:-]", region)), file=f_out)
        f_out.flush()

        for bam in bams:
            result = load_heatmap_data(
                f_out.name,
                bam,
                bins=1,
                up=window // 2,
                down=window // 2,
                rmdup=True,
                rmrepeats=True,
            )
            tmp[result[0]] = result[2].T[0]

    fname = f"{self.data_dir}/{title}.qnorm.ref.txt.gz"
    if os.path.exists(fname):
        logger.debug(f"quantile normalization for {title}")
        qnorm_ref = pd.read_table(fname, index_col=0)["qnorm_ref"].values
        if len(self.regions) != len(qnorm_ref):
            qnorm_ref = np.random.choice(
                qnorm_ref, size=len(self.regions), replace=True
            )
        tmp = qnorm.quantile_normalize(tmp, target=qnorm_ref)
    else:
        tmp = np.log1p(tmp)

    # Limit memory usage by using float16
    tmp = tmp.mean(1).astype("float16").to_frame(title)

    fname = f"{self.data_dir}/{title}.mean.ref.txt.gz"
    if self.region_type == "reference" and os.path.exists(fname):
        mean_ref = pd.read_table(fname, index_col=0)
        if mean_ref.shape[0] == tmp.shape[0]:
            mean_ref.index = tmp.index
            tmp[f"{title}.relative"] = (
                tmp[title] - mean_ref.loc[tmp.index]["mean_ref"].values
            )
            tmp[f"{title}.relative"] = scale(tmp[f"{title}.relative"])
        else:
            logger.debug(f"Regions of {fname} are not the same as input regions.")
            logger.debug("Skipping calculation of relative values.")

    tmp[title] = tmp[title] / tmp[title].max()

    return tmp
def test_007_target_notsorted(self):
    """
    make sure an unsorted target gets sorted first
    """
    arr = np.array([np.arange(0, 10), np.arange(0, 10)]).T
    np.random.shuffle(arr)
    # take the reverse, which should be sorted by qnorm
    target = np.arange(10, 20)[::-1]
    qnorm_arr = qnorm.quantile_normalize(arr, target=target)
    for val in target:
        assert (
            val in qnorm_arr[:, 0] and val in qnorm_arr[:, 1]
        ), f"value {val} not in qnorm array"
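# A small hedged sketch of the `target` semantics the two tests above rely
# on: with a target of the same length as the input columns, every output
# column becomes the sorted target values placed according to each column's
# original ranks, so the output distribution is fixed regardless of the
# input values. The array below is invented for illustration.
_arr = np.array([[3.0, 30.0], [1.0, 10.0], [2.0, 20.0]])
_out = qnorm.quantile_normalize(_arr, target=np.array([9.0, 0.0, 5.0]))
# each column of _out is now [9.0, 0.0, 5.0]: the (sorted) target {0, 5, 9}
# arranged by the within-column ranks of _arr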
def test_003_no_change(self):
    """
    no sorting should happen here
    """
    arr = np.empty(shape=(20, 3))
    for col in range(arr.shape[1]):
        vals = np.arange(arr.shape[0])
        np.random.shuffle(vals)
        arr[:, col] = vals

    qnorm_arr = qnorm.quantile_normalize(arr)
    np.testing.assert_array_almost_equal(arr, qnorm_arr)
def test_023_from_parquet(self):
    """
    test the basic incremental_quantile_normalize functionality
    """
    qnorm.incremental_quantile_normalize("test.parquet", "test_out.parquet")
    df1 = pd.read_parquet("test.parquet")
    df2 = pd.read_parquet("test_out.parquet")

    np.testing.assert_almost_equal(
        qnorm.quantile_normalize(df1), df2.values, decimal=5
    )
def extend(self, outdir: str, data_files: List[str]) -> T:
    if self.schema_version == "0.0.0":
        raise ValueError("dataset does not support custom sources")

    outdir = Path(outdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    meanstd = pd.read_table(self.meanstd_file)
    bed = meanstd["index"].str.replace("[:-]", "\t").to_frame()

    logger.info("Processing BAM files")
    with NamedTemporaryFile() as f:
        bed.to_csv(f.name, index=False, header=False)
        # create coverage_table
        df = coverage_table(
            peakfile=f.name, datafiles=data_files, window=self.window, ncpus=12
        )

    target = np.load(self.target_file)["target"]
    df = qnorm.quantile_normalize(df, target=target)
    df.index = meanstd["index"]
    df = df.sub(meanstd["mean"].values, axis=0)
    df = df.div(meanstd["std"].values, axis=0)

    genes = _create_gene_table(
        df,
        self.meanstd_file,
        self.gene_file,
        self.gene_mapping,
        genome=self.genome,
        link_file=self.link_file,
    )

    logger.info(f"Writing reference to {outdir}")
    df.reset_index().to_feather(outdir / "enhancers.feather")
    genes.to_csv(outdir / "genes.txt", sep="\t")

    info = {
        "genes": "genes.txt",
        "enhancers": "enhancers.feather",
        "source": self.name,
        "genome": self.genome,
        "schema_version": __schema_version__,
    }
    with open(outdir / "info.yaml", "w") as f:
        yaml.dump(info, f)

    return ScepiaDataset(outdir)
def test_013_from_csv_rowchunk(self):
    """
    test the incremental_quantile_normalize with rowchunks functionality
    """
    df1 = pd.read_csv("test.csv", index_col=0, header=0)

    for rowchunksize in range(1, 10):
        qnorm.incremental_quantile_normalize(
            "test.csv", "test_out.csv", rowchunksize=rowchunksize
        )
        df2 = pd.read_csv("test_out.csv", index_col=0, header=0)

        np.testing.assert_almost_equal(
            qnorm.quantile_normalize(df1), df2.values, decimal=5
        )
def test_019_from_hdf_colchunk(self):
    """
    test the incremental_quantile_normalize with colchunks functionality
    """
    df1 = pd.read_hdf("test.hdf")

    for colchunksize in range(1, 10):
        qnorm.incremental_quantile_normalize(
            "test.hdf", "test_out.hdf", colchunksize=colchunksize
        )
        df2 = pd.read_hdf("test_out.hdf")

        np.testing.assert_almost_equal(
            qnorm.quantile_normalize(df1), df2.values, decimal=5
        )
def tpm_normalization(
    tpms: pd.DataFrame,
    column_order: list,
    minimum_value: int = None,
) -> pd.DataFrame:
    """filter and order a tpm table, then quantile normalize and log transform"""
    bc = tpms[column_order]  # filter & order samples
    if minimum_value:
        b4 = bc.shape[0]
        bc = bc[bc.max(axis=1) >= minimum_value]  # filter genes
        aft = b4 - bc.shape[0]
        print(
            f"Genes with TPM below {minimum_value}: "
            f"{aft} of {b4} ({round(100 * aft / b4, 0)}%)"
        )
    bc = quantile_normalize(bc, axis=1)  # normalize
    bc = np.log2(bc + 1)  # transform
    return bc
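# Hedged usage sketch for tpm_normalization; the gene/sample names and the
# threshold below are invented for illustration.
_tpms = pd.DataFrame(
    {"s1": [0.0, 5.0, 100.0], "s2": [0.5, 4.0, 80.0], "s3": [0.0, 6.0, 90.0]},
    index=["geneA", "geneB", "geneC"],
)
# geneA is dropped (max TPM < 1); the remaining rows are quantile
# normalized, then log2(x + 1) transformed
_normed = tpm_normalization(_tpms, column_order=["s1", "s2", "s3"], minimum_value=1)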
def test_025_from_parquet_colchunk(self):
    """
    test the incremental_quantile_normalize with colchunks functionality
    """
    df1 = pd.read_parquet("test.parquet")

    for colchunksize in range(1, 10):
        qnorm.incremental_quantile_normalize(
            "test.parquet", "test_out.parquet", colchunksize=colchunksize
        )
        df2 = pd.read_parquet("test_out.parquet")

        np.testing.assert_almost_equal(
            qnorm.quantile_normalize(df1), df2.values, decimal=5
        )
def peaks_merge(coverage_files, bed_output, ncore=1):
    """
    averages all peaks_count outputs
    uses quantile normalization to normalize for read depth

    returns one BED 3+1 file
    """
    ncore = min(4, ncore)
    bed = pd.read_csv(coverage_files[0], header=None, sep="\t")
    if len(coverage_files) > 1:
        for file in coverage_files[1:]:
            scores = pd.read_csv(file, header=None, sep="\t")[3]
            bed = pd.concat([bed, scores], axis=1)

    scores = bed.iloc[:, 3:]
    scores = qnorm.quantile_normalize(scores, axis=1, ncpus=ncore)
    scores = scores.mean(axis=1)

    bed = pd.concat([bed.iloc[:, :3], scores], axis=1)
    bed.to_csv(bed_output, sep="\t", header=False, index=False)
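# Hedged usage sketch for peaks_merge; the file names are invented.
# peaks_merge(["s1.coverage.bed", "s2.coverage.bed"], "merged.bed", ncore=4)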
def test_020_from_hdf_colrowchunk(self):
    """
    test the incremental_quantile_normalize with both row and colchunks
    """
    df1 = pd.read_hdf("test.hdf")

    for colchunksize in range(1, 10):
        for rowchunksize in range(1, 10):
            qnorm.incremental_quantile_normalize(
                "test.hdf",
                "test_out.hdf",
                rowchunksize=rowchunksize,
                colchunksize=colchunksize,
            )
            df2 = pd.read_hdf("test_out.hdf")

            np.testing.assert_almost_equal(
                qnorm.quantile_normalize(df1), df2.values, decimal=5
            )
def test_026_from_parquet_colrowchunk(self):
    """
    test the incremental_quantile_normalize with both row and colchunks
    """
    df1 = pd.read_parquet("test.parquet")

    for colchunksize in range(1, 10):
        for rowchunksize in range(1, 10):
            qnorm.incremental_quantile_normalize(
                "test.parquet",
                "test_out.parquet",
                rowchunksize=rowchunksize,
                colchunksize=colchunksize,
            )
            df2 = pd.read_parquet("test_out.parquet")

            np.testing.assert_almost_equal(
                qnorm.quantile_normalize(df1), df2.values, decimal=5
            )
import argparse


def main():
    """Console script for qnorm."""
    parser = argparse.ArgumentParser(
        description="Quantile normalization from the CLI!"
    )
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version=f"qnorm: v{qnorm.__version__}",
    )
    parser.add_argument(
        "table", help="input csv/tsv file which will be quantile normalized"
    )
    args = parser.parse_args()

    delimiter = get_delim(args.table)
    df = pd.read_csv(args.table, index_col=0, sep=delimiter, comment="#")
    qnorm_df = qnorm.quantile_normalize(df)
    print(qnorm_df.to_csv(sep=delimiter))
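# main() calls a get_delim helper that is not shown in this snippet. A
# plausible minimal sketch (an assumption for illustration, not qnorm's
# actual implementation) sniffs the delimiter from the first non-comment
# line of the table:
def get_delim(table):
    # hypothetical helper: guess comma vs tab from the first data line
    with open(table) as f:
        for line in f:
            if not line.startswith("#"):
                return "\t" if line.count("\t") >= line.count(",") else ","
    return ","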
def test_016_from_csv_largefile(self):
    """
    test whether or not incremental_quantile_normalize works with a
    larger random file
    """
    np.random.seed(42)
    df1 = pd.DataFrame(index=range(5000), columns=range(100))
    df1[:] = np.random.randint(0, 100, size=df1.shape)
    df1.to_csv("test_large.csv")

    qnorm.incremental_quantile_normalize(
        "test_large.csv",
        "test_large_out.csv",
        rowchunksize=11,
        colchunksize=11,
    )
    df2 = pd.read_csv("test_large_out.csv", index_col=0, header=0)

    np.testing.assert_almost_equal(
        qnorm.quantile_normalize(df1), df2.values, decimal=4
    )
import pandas as pd

import qnorm

df = pd.read_csv(snakemake.input[0], comment="#", index_col=0, sep="\t")

# cpm normalization
df = df * 1_000_000 / df.sum(axis=0)

# quantile normalize
df_qn = qnorm.quantile_normalize(df)

with open(str(snakemake.output[0]), "w") as f:
    f.write(
        "# The number of reads under each peak, cpm quantile normalized\n"
        + df_qn.to_csv(index_label="loc", index=True, header=True, sep="\t")
    )
if plt_x_ax < 4:
    plt_x_ax = plt_x_ax + 1
elif plt_x_ax == 4:
    plt_x_ax = 0
    plt_y_ax = plt_y_ax + 1

fig.savefig(f"{exp_path}/Distribution_graphs_notNorm.pdf")


# #### Quantile normalization on samples per mark

# In[21]:


dict_norm_dfs = {}
for mark in marks:
    df = dict_of_dfs[mark]
    norm_df = qnorm.quantile_normalize(df, axis=1, ncpus=20)
    dict_norm_dfs[mark] = norm_df


# In[22]:


warnings.filterwarnings("ignore")
plt.style.use("seaborn")
sns.set(rc={"figure.figsize": (23, 16)})
fig, axes = plt.subplots(3, 5)
plt_x_ax = 0
plt_y_ax = 0
for mark in marks:
    samples = list(dict_norm_dfs[mark])
def normalize(exp: pd.DataFrame, transpose: bool = False) -> pd.DataFrame:
    if transpose:
        exp = exp.transpose()
    exp = pd.DataFrame(np.log2(exp + 1))
    exp = qnorm.quantile_normalize(exp)
    return exp
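# Hedged usage sketch for normalize; the expression matrix is invented.
_exp = pd.DataFrame(
    {"sampleA": [0, 7, 15], "sampleB": [1, 3, 31]}, index=["g1", "g2", "g3"]
)
_norm = normalize(_exp)  # log2(x + 1), then quantile normalization over columns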
def coverage_table(
    peakfile,
    datafiles,
    window,
    log_transform=True,
    normalization="none",
    top=0,
    topmethod="var",
    rmdup=True,
    rmrepeats=True,
    ncpus=12,
):
    for x in datafiles:
        if not os.path.isfile(x):
            print("ERROR: Data file '{0}' does not exist".format(x))
            sys.exit(1)
    for x in datafiles:
        if ".bam" in x and not os.path.isfile("{0}.bai".format(x)):
            print(
                "Data file '{0}' does not have an index file."
                " Creating an index file for {0}.".format(x)
            )
            pysam.index(x)

    logger.info("Loading data")
    data = {}
    try:
        # Load data in parallel
        pool = multiprocessing.Pool(processes=ncpus)
        jobs = []
        for datafile in datafiles:
            jobs.append(
                pool.apply_async(
                    load_heatmap_data,
                    args=(
                        peakfile,
                        datafile,
                        1,
                        window // 2,
                        window // 2,
                        rmdup,
                        False,
                        rmrepeats,
                        None,
                        False,
                        None,
                    ),
                )
            )
        for job in tqdm(jobs):
            track, regions, profile, guard = job.get()
            data[os.path.splitext(track)[0]] = profile[:, 0]
    except Exception as e:
        sys.stderr.write("Error loading data in parallel, trying serial\n")
        sys.stderr.write("Error: {}\n".format(e))
        for datafile in tqdm(datafiles):
            track, regions, profile, guard = load_heatmap_data(
                peakfile,
                datafile,
                1,
                window // 2,
                window // 2,
                rmdup,
                False,
                rmrepeats,
                None,
                False,
                None,
            )
            data[os.path.splitext(track)[0]] = profile[:, 0]

    # Create DataFrame with regions as index
    regions = ["{}:{}-{}".format(*region[:3]) for region in regions]
    df = pd.DataFrame(data, index=regions)

    if log_transform:
        logger.info("Log transform")
        df = np.log1p(df)
    # use an if/elif chain so "scale" does not also log "No normalization"
    if normalization == "scale":
        logger.info("Normalization by scaling")
        df[:] = scale(df, axis=0)
    elif normalization == "quantile":
        logger.info("Normalization by quantile normalization")
        df = qnorm.quantile_normalize(df)
    else:
        logger.info("No normalization")

    if top > 0:
        if topmethod == "var":
            idx = df.var(1).sort_values().tail(top).index
        elif topmethod == "std":
            idx = df.std(1).sort_values().tail(top).index
        elif topmethod == "mean":
            idx = df.mean(1).sort_values().tail(top).index
        elif topmethod == "random":
            idx = df.sample(top).index
        else:
            raise ValueError(
                "unknown method {} for selecting regions".format(topmethod)
            )
        df = df.loc[idx]

    return df
def test_000_numpy(self):
    """
    test numpy support
    """
    arr = np.random.normal(size=(20, 2))
    qnorm.quantile_normalize(arr)