def testConvertUnconvertToFromLog2(self):
    """
    Round-trip check: util.unconvertFromLog2 applied to the result of
    util.convertToLog2 must reproduce the values of the module fixture SER.
    Exercised for the Series fixture (SER) and the DataFrame fixture (DF).
    """
    if IGNORE_TEST:
        return

    def test(pd_obj):
        # Only the *type* of pd_obj is used: it selects which module-level
        # fixture (DF or SER) is pushed through the round trip.
        is_frame = isinstance(pd_obj, pd.DataFrame)
        fixture = DF if is_frame else SER
        restored = util.unconvertFromLog2(util.convertToLog2(fixture))
        # For a DataFrame, column "a" is the one compared against SER.
        ser2 = restored["a"] if is_frame else restored
        # Entry 0 is forced to 0 before comparing.
        # NOTE(review): presumably SER[0] is 0 and does not survive the
        # log2 round trip -- confirm against the fixture.
        ser2.loc[0] = 0
        matches = [np.isclose(actual, expected)
                   for actual, expected in zip(ser2, SER)]
        self.assertTrue(all(matches))
    #
    test(SER)
    test(DF)
    # Same round trip done inline for the Series fixture.
    ser_log2 = util.convertToLog2(SER)
    ser_back = util.unconvertFromLog2(ser_log2)
    ser_back.loc[0] = 0
    matches = [np.isclose(actual, expected)
               for actual, expected in zip(ser_back, SER)]
    self.assertTrue(all(matches))
def _makeNormalizedDF(self):
    """
    Transformation of the "normalized Read Counts" processed by DESeq2.
    Each gene (row) is expressed relative to its reference value in log2
    units, with results floored at MIN_LOG2_VALUE.
    Drops rows where all columns are at the minimum value.
    Assumes that self.df_gene_expression_state has been initialized.
    When self._is_only_qgenes is set, only genes in the index of
    self.df_gene_expression_state are retained.
    :return pd.DataFrame:
        rows: gene
        columns: time
    """
    def defaultCalcRef(df):
        # Not referenced in this method; the reference comes from self.calcRef.
        return df[cn.TIME_0]
    #
    df = self._getLog2NormalizedReadcounts()
    # The reference is calculated from un-logged counts and then
    # converted back to log2 units.
    ser_ref = util.convertToLog2(self.calcRef(util.unconvertFromLog2(df)))
    # Normalize each gene w.r.t. its reference, flooring at MIN_LOG2_VALUE
    floor_only_rows = []  # Rows whose values are all at the floor
    for gene in df.index:
        shifted = df.loc[gene, :] - ser_ref.loc[gene]
        df.loc[gene, :] = [max(MIN_LOG2_VALUE, value) for value in shifted]
        # Read the row back from df so the check sees the stored values.
        if all(value <= MIN_LOG2_VALUE for value in df.loc[gene, :]):
            floor_only_rows.append(gene)
    df = df.drop(index=floor_only_rows)
    # Find genes to keep
    if self._is_only_qgenes:
        is_kept = df.index.isin(self.df_gene_expression_state.index)
        df = df[is_kept]
    #
    return df
def initialize(self):
    """
    Construct the feature vectors for the samples.

    For every entry of SAMPLE_DESCRIPTOR_DCT the sample CSV is read,
    restricted to the rows selected by the descriptor, sorted, put in
    log2 units, compared with the reference chosen by self.ref_type and
    converted to trinary values. The result is stored on self under the
    name returned by self.getDataframeAttributeName(sample_name).
    The stored value is None when the reference type is 'self' and the
    descriptor has no selection.

    :raises RuntimeError: the descriptor is not gene-normalized, or
        self.ref_type is not a recognized reference type
    """
    # Iterate across all samples
    for sample_name, descriptor in SAMPLE_DESCRIPTOR_DCT.items():
        attribute_name = self.getDataframeAttributeName(sample_name)
        ###
        # Construct a data frame that is normalized for gene and library
        # and has log2 units
        ###
        # Select indices that are for the conditions/times considered
        df = transform_data.readGeneCSV(descriptor.csv).T
        sel = [any(d in i for d in descriptor.cnm) for i in df.index]
        df = df[sel]
        # Sort the instances. Rows are reordered by position so that each
        # row keeps its own label; assigning the sorted labels to df.index
        # would relabel the rows without moving their data. sorted() is
        # stable, so equal keys keep their original relative order.
        sort_keys = [self._makeSortKey(sample_name, v) for v in df.index]
        order = sorted(range(len(sort_keys)), key=sort_keys.__getitem__)
        df = df.iloc[order]
        # Complete initial processing
        if not descriptor.nrml:
            raise RuntimeError(
                "Do gene normalization for sample %s" % sample_name)
        if not descriptor.log2:
            df = util.convertToLog2(df)
        ###
        # Convert to trinary values. This takes into account the reference
        # values for gene expression
        ###
        if self.ref_type == REF_TYPE_BIOREACTOR:
            ser_ref = transform_data.makeBioreactorT0ReferenceData()
        elif self.ref_type == REF_TYPE_POOLED:
            ser_ref = df.mean(axis=0)
        elif self.ref_type == REF_TYPE_SELF:
            if descriptor.sel is None:
                print("***%s: no selection for reference type 'self'. Result is None."
                      % sample_name)
                df = None
            else:
                ser_ref = self._calcRefFromIndices(df, descriptor.sel)
        else:
            raise RuntimeError(
                "%s is an invalid reference type" % self.ref_type)
        if df is not None:
            ###
            # Average replicas if requested
            ###
            if self.is_average:
                df = self.averageReplicas(df, descriptor.cnm)
            ###
            # Convert to trinary values
            ###
            df = transform_data.calcTrinaryComparison(
                df.T, ser_ref, is_convert_log2=False).T
            ###
            # Restrict to regulators?
            ###
            if self.is_regulator:
                # NOTE(review): the return value is ignored, so this
                # presumably modifies df in place -- confirm.
                trinary_data.subsetToRegulators(df)
        #
        setattr(self, attribute_name, df)
def makeBioreactorT0ReferenceData():
    """
    Creates the T0 reference data in log units: the adjusted read count
    data frames of PROVIDER are averaged element-wise, the cn.TIME_0
    column is selected and converted with util.convertToLog2.
    :return Series:
    """
    # Work on a deep copy so the provider's data frames are not renamed.
    replica_dfs = copy.deepcopy(PROVIDER.dfs_adjusted_read_count)
    for replica_df in replica_dfs:
        replica_df.columns = stripReplicaString(replica_df.columns)
    df_mean = sum(replica_dfs) / len(PROVIDER.dfs_adjusted_read_count)
    return util.convertToLog2(df_mean[cn.TIME_0])
def test(pd_obj):
    """
    Round-trips a module fixture through util.convertToLog2 and
    util.unconvertFromLog2 and asserts the result matches SER.
    Uses `self` from the enclosing scope for the assertion.
    :param pd.DataFrame/pd.Series pd_obj: only its type is used, to choose
        between the DF and SER fixtures
    """
    is_frame = isinstance(pd_obj, pd.DataFrame)
    fixture = DF if is_frame else SER
    restored = util.unconvertFromLog2(util.convertToLog2(fixture))
    # For a DataFrame, column "a" is the one compared against SER.
    ser2 = restored["a"] if is_frame else restored
    # Entry 0 is forced to 0 before comparing.
    # NOTE(review): presumably SER[0] is 0 and does not survive the
    # log2 round trip -- confirm against the fixture.
    ser2.loc[0] = 0
    matches = [np.isclose(actual, expected)
               for actual, expected in zip(ser2, SER)]
    self.assertTrue(all(matches))
def calcTrinaryComparison(df, ser_ref, threshold=1, is_convert_log2=True):
    """
    Calculates trinary values of a DataFrame w.r.t. a reference in
    log2 units.

    Only index values present in both df and ser_ref are compared; the
    rows of the result follow the order in which they appear in df.

    :param pd.DataFrame df: comparison values; columns are instances,
        has same index as ser_ref
    :param pd.Series ser_ref: reference values
    :param float threshold: comparison threshold, passed to
        makeTrinaryData as min_abs
    :param bool is_convert_log2: convert df and ser_ref to log2; use
        False when they are already in log2 units
    :return pd.DataFrame: trinary values resulting from comparisons
        -1: df is less than 2**threshold*ser_ref
         1: df is greater than 2**threshold*ser_ref
         0: otherwise
        NOTE(review): the values are produced by makeTrinaryData on the
        log2 difference (df - ser_ref); confirm the exact boundaries there.
    :raises RuntimeError: ser_ref is None
    """
    if ser_ref is None:
        raise RuntimeError("ser_ref cannot be None.")
    if is_convert_log2:
        ser_ref_log = util.convertToLog2(ser_ref)
        df_log = util.convertToLog2(df)
    else:
        df_log = df.copy()
        ser_ref_log = ser_ref.copy()
    # Find the common indices. A list is used because pandas does not
    # accept a set as an indexer; dict.fromkeys removes repeated labels
    # while keeping the order of df.
    ref_labels = set(ser_ref_log.index)
    indices = list(dict.fromkeys(
        label for label in df_log.index if label in ref_labels))
    df_log = df_log.loc[indices, :]
    ser_ref_log = ser_ref_log.loc[indices]
    # Subtract the reference from every instance (row of the transpose)
    df_comp_T = df_log.T - ser_ref_log
    # Drop the nan columns, those genes for which there is no reference
    df_comp = (df_comp_T.dropna(axis=1, how='all')).T
    df_result = makeTrinaryData(df=df_comp, min_abs=threshold,
                                is_include_nan=False)
    return df_result
def testRemoveGenesWithExcessiveReplicationVariance(self):
    """
    Checks that calling removeGenesWithExcessiveReplicationVariance with
    max_var of 1, 2 or 3 never yields more columns than the call without
    max_var. Also repeats the log2 round-trip check on the SER fixture.
    """
    if IGNORE_TEST:
        return
    trinary = TrinaryData(is_averaged=False, is_dropT1=False,
                          is_regulator=False)
    remove_genes = transform_data.removeGenesWithExcessiveReplicationVariance
    df_base = remove_genes(trinary.df_X)
    for max_var in (1, 2, 3):
        df_limited = remove_genes(trinary.df_X, max_var=max_var)
        self.assertGreaterEqual(len(df_base.columns),
                                len(df_limited.columns))
    # Round trip of the Series fixture through log2 and back.
    ser_log2 = util.convertToLog2(SER)
    ser_back = util.unconvertFromLog2(ser_log2)
    ser_back.loc[0] = 0
    matches = [np.isclose(actual, expected)
               for actual, expected in zip(ser_back, SER)]
    self.assertTrue(all(matches))