def testGetStateNames(self):
    """Check the names returned by getStateNames for states 0 and 4."""
    if not IGNORE_TEST:
        expected_names = ["Transition", "Resuscitation"]
        names = TrinaryData().getStateNames([0, 4])
        # Compare position by position against the expected names.
        for position, expected in enumerate(expected_names):
            self.assertEqual(names[position], expected)
def testSubsetToStates(self):
    """Check that subsetting to one state reduces the positive labels
    and restricts the feature columns to the requested genes."""
    if IGNORE_TEST:
        return

    def countPositive(ser):
        # Number of entries whose value is strictly greater than 0
        return len(ser[ser > 0])

    full = TrinaryData()
    subset = full.subsetToStates(["Transition"], genes=GENES)
    num_full = countPositive(full.ser_y)
    num_subset = countPositive(subset.ser_y)
    self.assertGreater(num_full, num_subset)
    self.assertEqual(len(GENES), len(subset.df_X.columns))
def testRegulator(self):
    """Check that is_regulator=True yields fewer feature columns
    than is_regulator=False."""
    if IGNORE_TEST:
        return
    # Built in the same order as before: non-regulator first, then regulator.
    all_data, regulator_data = [
        TrinaryData(is_averaged=False, is_dropT1=False,
                    is_regulator=is_regulator)
        for is_regulator in (False, True)
    ]
    num_all = len(all_data.df_X.columns)
    num_regulator = len(regulator_data.df_X.columns)
    self.assertGreater(num_all, num_regulator)
def testIsStageAveraged(self):
    """Check that stage-averaged data has one label per feature row."""
    if IGNORE_TEST:
        return
    option_sets = (
        {"is_stage_averaged": True},
        {"is_dropT1": False, "is_stage_averaged": True},
    )
    for options in option_sets:
        data = TrinaryData(**options)
        self.assertEqual(len(data.df_X), len(data.ser_y))
def testPlotExpressionLevels(self):
    """Smoke test: call plotExpressionLevels with several option sets."""
    if IGNORE_TEST:
        return
    trinary = TrinaryData()

    def plot(**options):
        # Every call shares the gene list and the plotting switch.
        trinary.plotExpressionLevels(GENES, is_plot=IS_PLOT, **options)

    plot(title="title")
    plot(df_X=trinary.df_X)
    plot(df_X=trinary.df_X, ser_y=trinary.ser_y)
    plot(title="title", is_color_bar=False)
def testTrinaryRefPooled(self):
    """Check that the pooled reference changes the feature data.

    The data sets differ if some column of the default data is either
    absent from the pooled-reference data or holds different values.
    """
    if IGNORE_TEST:
        return
    default_data = TrinaryData(is_reinitialize=True)
    pooled_data = TrinaryData(calcRef=PROVIDER.calcRefPooled,
                              is_reinitialize=True)
    # any() stops at the first differing column, like the original break.
    is_different = any(
        (column not in pooled_data.df_X.columns)
        or (not default_data.df_X[column].equals(pooled_data.df_X[column]))
        for column in default_data.df_X.columns
    )
    self.assertTrue(is_different)
def testTrinaryReadsDF2(self): return # Checks that trinary values computed directly from reads # are the same as those of normalized samples. # Get raw value of read counts provider = DataProvider() provider.do() # def calcTrinaryTimeSample(time_index): """ Calculates the trinary value of a time sample :param str time_index: name of time value """ int_index = int(time_index[1:]) df0 = provider.dfs_read_count[0] num = len(provider.dfs_read_count) ser = pd.Series(np.repeat(0, len(df0.index)), index=df0.index) for idx in range(num): ser += provider.dfs_read_count[idx][int_index] df = pd.DataFrame(ser/num) df_result = transform_data.trinaryReadsDF(df_sample=df) return df_result.T # data = TrinaryData() data.df_X.columns = data.features for time_index in data.df_X.index: df_result = calcTrinaryTimeSample(time_index) import pdb; pdb.set_trace()
def make(num_features=NUM_FEATURES, num_classifiers=NUM_CLASSIFIERS,
         file_path=cn.ENSEMBLE_PATH, is_force=True, **kwargs):
    """
    Builds an SVM classifier ensemble from the time course data,
    fits it, and serializes it.

    :param int num_features: number of features in the ensemble
    :param int num_classifiers: number of classifiers
    :param str file_path: path where the classifier is exported
    :param bool is_force: build a new classifier even if one
        has already been serialized
    :param dict kwargs: options passed to TrinaryData
    :return ClassifierEnsemble:
    """
    ensemble_cls = classifier_ensemble.ClassifierEnsemble
    if not is_force:
        # Reuse a serialized classifier when one can be read; a ValueError
        # from deserialize falls through to building a new one.
        try:
            return ensemble_cls.deserialize(file_path)
        except ValueError:
            pass
    # Build, fit, and export a new ensemble
    descriptor = classifier_ensemble.ClassifierDescriptorSVM()
    ensemble = ensemble_cls(descriptor,
                            filter_high_rank=num_features,
                            size=num_classifiers)
    trinary = TrinaryData(**kwargs)
    trinary.df_X.columns = trinary.features
    ensemble.fit(trinary.df_X, trinary.ser_y)
    ensemble.serialize(file_path)
    return ensemble
def testNonAveraged(self):
    """Compare replicated (non-averaged) data against averaged data."""
    if IGNORE_TEST:
        return

    def checkData(trinary):
        # NOTE(review): the return value of isConsistentState is not
        # examined; presumably it raises on inconsistency -- confirm.
        isConsistentState(trinary.ser_y)
        self.assertEqual(len(trinary.df_X), len(trinary.ser_y))

    replicated = TrinaryData(is_averaged=False, is_dropT1=False)
    averaged = TrinaryData(is_averaged=True, is_dropT1=False)
    checkData(replicated)
    checkData(averaged)
    # Non-averaged data has more rows and more columns
    self.assertGreater(len(replicated.df_X), len(averaged.df_X))
    self.assertGreater(len(replicated.df_X.columns),
                       len(averaged.df_X.columns))
    # Replicated data should have NUM_REPL Normoxia (state 0) entries
    num_normoxia = replicated.ser_y[replicated.ser_y == 0].count()
    self.assertEqual(num_normoxia, NUM_REPL)
def __init__(self, data=None):
    """
    Stores the data to use and clears the scores.

    :param NormalizedData data: data to use; a default TrinaryData
        is constructed when omitted
    """
    self.data = TrinaryData() if data is None else data
    self.scores = None
def _getData():
    """
    Obtains non-averaged trinary features and labels.

    When IS_ONLY_TFS is set, the feature columns are restricted to
    those also present in the provider's ``tfs``. In that case the
    column order is that of set iteration, not of the DataFrame.

    :return pd.DataFrame, pd.Series: feature matrix, labels
    """
    data_provider = DataProvider()
    data_provider.do()
    data = TrinaryData(is_averaged=False, is_dropT1=False)
    all_columns = data.df_X.columns
    selected = (set(all_columns).intersection(data_provider.tfs)
                if IS_ONLY_TFS else all_columns)
    return data.df_X[list(selected)], data.ser_y
def _getData(state, columns=None, **kwargs):
    """
    Obtains data for a binary classifier of one state.

    :param int state: state for which classification is done
    :param list columns: feature columns to keep; all if None
    :param dict kwargs: options for TrinaryData
    :returns pd.DataFrame, pd.Series: features, labels that are
        1 where the state matches and 0 otherwise
    """
    data = TrinaryData(**kwargs)

    def isState(value):
        # Binary label: 1 for the target state, 0 otherwise
        if value == state:
            return 1
        return 0

    labels = data.ser_y.apply(isState)
    features = data.df_X if columns is None else data.df_X[columns].copy()
    return features, labels
def testGetStateNames(self):
    """Check getStateNames on plain and on suffixed index labels.

    First checks that one name is returned per distinct state value.
    Then re-labels a copy of the state series with a ".0" suffix on
    every index entry and checks that the same names are returned.
    """
    if IGNORE_TEST:
        return
    self.provider.do()
    trinary = TrinaryData()
    result1s = self.provider.getStateNames(trinary.ser_y)
    count = len(set(trinary.ser_y.values))
    self.assertEqual(len(result1s), count)
    # Re-label a copy so that trinary.ser_y itself is left untouched.
    ser_y = trinary.ser_y.copy()
    ser_y.index = [i + ".0" for i in ser_y.index]
    # Fix: query with the re-labelled copy. The previous code passed
    # trinary.ser_y again, so the comparison below was trivially true.
    result2s = self.provider.getStateNames(ser_y)
    # zip() truncates to the shorter input, so compare lengths first.
    self.assertEqual(len(result1s), len(result2s))
    self.assertTrue(all(v1 == v2 for v1, v2 in zip(result1s, result2s)))
def _getData(state):
    """
    Obtains data for a binary classifier of one state, using only
    the feature columns that are also in the provider's ``tfs``.

    The column order is that of set iteration, not of the DataFrame.

    :param int state: state for which classification is done
    :returns pd.DataFrame, pd.Series: features, labels that are
        1 where the state matches and 0 otherwise
    """
    data_provider = DataProvider()
    data_provider.do()
    data = TrinaryData(is_averaged=False, is_dropT1=False)
    shared = set(data.df_X.columns).intersection(data_provider.tfs)
    tf_columns = list(shared)

    def isState(value):
        # Binary label: 1 for the target state, 0 otherwise
        if value == state:
            return 1
        return 0

    labels = data.ser_y.apply(isState)
    return data.df_X[tf_columns], labels
def testRemoveGenesWithExcessiveReplicationVariance(self):
    """Check gene removal by replication variance, then a log2 round trip.

    For each max_var tried, the result must have no more columns than
    the result obtained with the default setting. Afterwards SER is
    converted to log2 and back and compared with the original values.
    """
    if IGNORE_TEST:
        return
    removeGenes = transform_data.removeGenesWithExcessiveReplicationVariance
    trinary = TrinaryData(is_averaged=False, is_dropT1=False,
                          is_regulator=False)
    df_default = removeGenes(trinary.df_X)
    for max_var in (1, 2, 3):
        df_limited = removeGenes(trinary.df_X, max_var=max_var)
        self.assertGreaterEqual(len(df_default.columns),
                                len(df_limited.columns))
    # Round trip through log2 units
    ser_restored = util.unconvertFromLog2(util.convertToLog2(SER))
    # NOTE(review): entry 0 is forced to 0 before comparing; presumably
    # SER is 0 there and the round trip does not reproduce it -- confirm.
    ser_restored.loc[0] = 0
    is_closes = [np.isclose(restored, original)
                 for restored, original in zip(ser_restored, SER)]
    self.assertTrue(all(is_closes))
def setUp(self):
    """Create the trinary data and an initialized provider for a test."""
    self.trinary = TrinaryData()
    provider = DataProvider()
    self.provider = provider
    provider.do()
def testPlotFeatureSignificanceByState(self):
    """Smoke test: plot feature significance on non-averaged data."""
    if not IGNORE_TEST:
        data = TrinaryData(is_averaged=False, is_dropT1=False)
        data.plotFeatureSignificanceByState(is_plot=IS_PLOT)
def initialize(self):
    """
    Initializes the data.
    Defines and initializes all names added to globals().

    Every value built here is registered through ``self._addName``.
    The steps, in order: reference-type names, the data provider and
    default trinary data, gene groups, stage names, trinary data for the
    T0 and pooled references, sample data, one fitted classifier per
    (reference, gene group) pair, the "top12" gene groups taken from the
    all-genes classifiers, a few derived values, and the accuracy table.

    NOTE(review): several objects are mutated after being registered
    (see the inline notes). Whether the registered value reflects the
    later mutation depends on how ``_addName`` stores it, which is not
    visible here -- do not reorder these statements without checking.
    """
    # Names of the two references used for the bioreactor data
    T0 = "T0"
    POOLED = "pooled"
    self._addName("T0", "T0")
    self._addName("POOLED", "pooled")
    self._addName("REF_TYPE_POOLED", REF_TYPE_POOLED)
    self._addName("REF_TYPE_BIOREACTOR", REF_TYPE_BIOREACTOR)
    self._addName("REF_TYPE_SELF", REF_TYPE_SELF)
    # Provider (registered before do() is called on it)
    PROVIDER = DataProvider()
    self._addName("PROVIDER", PROVIDER)
    PROVIDER.do()
    TRINARY = TrinaryData()
    self._addName("TRINARY", TRINARY)
    # Gene Classes: all feature columns of the default trinary data
    ALL_GENES = list(TRINARY.df_X.columns)
    self._addName("ALL_GENES", ALL_GENES)
    # Gene groupings. GENE_DCT is registered later so that it can
    # include the top12 groups obtained from the classifiers.
    MYCOBACTIN_GENES = [
        "Rv2377c", "Rv2378c", "Rv2379c", "Rv2380c", "Rv2381c",
        "Rv2382c", "Rv2383c", "Rv2384", "Rv2385", "Rv2386c",
    ]
    self._addName("MYCOBACTIN_GENES", MYCOBACTIN_GENES)
    BACTERIOFERRITIN_GENES = [
        "Rv2341", "Rv3841",
    ]
    self._addName("BACTERIOFERRITIN_GENES", BACTERIOFERRITIN_GENES)
    # Combined group: a copy of the mycobactin list, extended below.
    # NOTE(review): the list is extended AFTER it is registered.
    MYCOBACTIN_BACTERIOFERRIN_GENES = list(MYCOBACTIN_GENES)
    self._addName("MYCOBACTIN_BACTERIOFERRIN_GENES",
        MYCOBACTIN_BACTERIOFERRIN_GENES)
    MYCOBACTIN_BACTERIOFERRIN_GENES.extend(BACTERIOFERRITIN_GENES)
    # Keys of the gene groups
    MYCOBACTIN_BACTERIOFERRITIN = "mycobactin_bacterioferritin"
    BACTERIOFERRITIN = "bacterioferritin"
    MYCOBACTIN = "mycobactin"
    ALL = "all"
    GENE_DCT = {
        MYCOBACTIN: MYCOBACTIN_GENES,
        BACTERIOFERRITIN: BACTERIOFERRITIN_GENES,
        MYCOBACTIN_BACTERIOFERRITIN: MYCOBACTIN_BACTERIOFERRIN_GENES,
        ALL: ALL_GENES,
    }
    # Define the stage names: the state names without "Normoxia".
    # NOTE(review): the list is registered, then "Normoxia" is removed
    # from that same list; the local name is then rebound to an array,
    # which is what the classifiers below receive as class_names.
    STAGE_NAMES = list(cn.STATE_NAMES)
    self._addName("STAGE_NAMES", STAGE_NAMES)
    STAGE_NAMES.remove("Normoxia")
    STAGE_NAMES = np.array(STAGE_NAMES)
    # Bioreactor data calculated with two different references
    DATA_DCT = {
        T0: TrinaryData(is_regulator=False, is_dropT1=True,
            is_averaged=True),
        POOLED: TrinaryData(is_regulator=False, is_dropT1=True,
            is_averaged=True, calcRef=PROVIDER.calcRefPooled)
    }
    self._addName("DATA_DCT", DATA_DCT)
    # Labels for each reference
    SER_Y_DCT = {k: t.ser_y for k, t in DATA_DCT.items()}
    self._addName("SER_Y_DCT",
        SER_Y_DCT)
    # Feature vectors are specific to the gene subsets;
    # here they are restricted to the mycobactin genes.
    DF_X_DCT = {k: t.df_X.copy() for k, t in DATA_DCT.items()}
    DF_X_DCT = {k: df[MYCOBACTIN_GENES] for k, df in DF_X_DCT.items()}
    self._addName("DF_X_DCT", DF_X_DCT)
    # Sample data, one entry per reference type
    SAMPLE_DCT = {
        r: sample_data.getSampleData(ref_type=r, is_regulator=False)
        for r in [REF_TYPE_BIOREACTOR, REF_TYPE_SELF, REF_TYPE_POOLED]
    }
    self._addName("SAMPLE_DCT", SAMPLE_DCT)
    # Same as SAMPLE_DCT, but requested with is_average=True
    SAMPLE_AVG_DCT = {
        r: sample_data.getSampleData(ref_type=r, is_regulator=False,
            is_average=True)
        for r in [REF_TYPE_BIOREACTOR, REF_TYPE_SELF, REF_TYPE_POOLED]
    }
    self._addName("SAMPLE_AVG_DCT", SAMPLE_AVG_DCT)
    # Classifiers. The feature limit is the size of the combined gene
    # group (10 mycobactin + 2 bacterioferritin = 12 at this point).
    num_feature = len(MYCOBACTIN_BACTERIOFERRIN_GENES)
    CLASSIFIER_BASE = classifier_ensemble.ClassifierEnsemble(
        classifier_ensemble.ClassifierDescriptorSVM(),
        filter_high_rank=num_feature, size=NUM_CLASSIFIER_IN_ENSEMBLE)
    self._addName("CLASSIFIER_BASE", CLASSIFIER_BASE)
    # NOTE(review): registered while still empty; filled by the loop below.
    CLASSIFIER_DCT = {}
    self._addName("CLASSIFIER_DCT", CLASSIFIER_DCT)
    # Fit one copy of the base classifier per (reference, gene group)
    for trinary_key, trinary in DATA_DCT.items():
        for gene_key, gene_list in GENE_DCT.items():
            classifier = copy.deepcopy(CLASSIFIER_BASE)
            # Not all genes may be present in TrinaryData since they may be correlated or unvarying.
            df_X = dataframe.subset(trinary.df_X, gene_list, axis=1)
            classifier.fit(df_X, trinary.ser_y, class_names=STAGE_NAMES)
            CLASSIFIER_DCT[(trinary_key, gene_key)] = classifier
    # Calculate the rest of the gene groups and add them. These are the
    # ``columns`` of the classifiers that were fitted on all genes.
    TOP12_T0 = "top12_T0"
    TOP12_POOLED = "top12_pooled"
    TOP12_T0_GENES = list(CLASSIFIER_DCT[(T0, ALL)].columns)
    TOP12_POOLED_GENES = list(CLASSIFIER_DCT[(POOLED, ALL)].columns)
    GENE_DCT[TOP12_T0] = TOP12_T0_GENES
    GENE_DCT[TOP12_POOLED] = TOP12_POOLED_GENES
    GENE_GROUPS = list(GENE_DCT.keys())
    self._addName("GENE_GROUPS", GENE_GROUPS)
    for name in GENE_GROUPS:
        self._addName(name.upper(), name)  # Add the name of each group
    self._addName("GENE_DCT", GENE_DCT)
    # Construct derivative structures (defaults use the T0 reference)
    self._addName("DF_X", DF_X_DCT[T0])
    self._addName("SER_Y", SER_Y_DCT[T0])
    self._addName("SAMPLE_DATA_DCT", SAMPLE_DCT[REF_TYPE_BIOREACTOR])
    # The literal key equals (T0, MYCOBACTIN)
    self._addName("CLASSIFIER", CLASSIFIER_DCT[('T0', 'mycobactin')])
    # The literal key equals (T0, MYCOBACTIN_BACTERIOFERRITIN)
    key = (T0, "mycobactin_bacterioferritin")
    self._addName("GENES", CLASSIFIER_DCT[key].features)
    # Accuracy calculations for classifiers
    DF_ACCURACY = self.calcAccuracy()
    self._addName("DF_ACCURACY", DF_ACCURACY)