Example #1
0
 def testGetStateNames(self):
     """
     Checks that getStateNames translates the state indices 0 and 4
     into the expected state names, in the same order.
     """
     if IGNORE_TEST:
         return
     expected_names = ["Transition", "Resuscitation"]
     state_names = TrinaryData().getStateNames([0, 4])
     for position, expected_name in enumerate(expected_names):
         self.assertEqual(state_names[position], expected_name)
Example #2
0
 def testSubsetToStates(self):
     """
     Checks that subsetting to the "Transition" state with genes=GENES
     yields fewer entries of ser_y that are greater than 0 and exactly
     len(GENES) feature columns.
     """
     if IGNORE_TEST:
         return

     def countPositive(ser_y):
         # Number of entries whose value exceeds 0
         return len(ser_y[ser_y > 0])

     full = TrinaryData()
     subset = full.subsetToStates(["Transition"], genes=GENES)
     num_full = countPositive(full.ser_y)
     num_subset = countPositive(subset.ser_y)
     self.assertGreater(num_full, num_subset)
     self.assertEqual(len(GENES), len(subset.df_X.columns))
Example #3
0
 def testRegulator(self):
     """
     Checks that restricting to regulators (is_regulator=True) gives
     fewer feature columns than the unrestricted data.
     """
     if IGNORE_TEST:
         return
     # Options shared by both constructions
     shared_options = dict(is_averaged=False, is_dropT1=False)
     full = TrinaryData(is_regulator=False, **shared_options)
     regulator = TrinaryData(is_regulator=True, **shared_options)
     num_full = len(full.df_X.columns)
     num_regulator = len(regulator.df_X.columns)
     self.assertGreater(num_full, num_regulator)
Example #4
0
    def testIsStageAveraged(self):
        """
        Checks that stage-averaged data has one ser_y entry per row
        of df_X, with and without is_dropT1=False.
        """
        if IGNORE_TEST:
            return
        option_sets = [
            dict(is_stage_averaged=True),
            dict(is_dropT1=False, is_stage_averaged=True),
        ]
        for options in option_sets:
            data = TrinaryData(**options)
            self.assertEqual(len(data.df_X), len(data.ser_y))
Example #5
0
 def testPlotExpressionLevels(self):
     """
     Smoke test: calls plotExpressionLevels with several combinations
     of optional arguments. No assertions are made on the result.
     """
     if IGNORE_TEST:
         return
     trinary = TrinaryData()
     plot = trinary.plotExpressionLevels
     # Title only
     plot(GENES, title="title", is_plot=IS_PLOT)
     # Explicit feature matrix
     plot(GENES, is_plot=IS_PLOT, df_X=trinary.df_X)
     # Explicit feature matrix and state series
     plot(GENES, is_plot=IS_PLOT, df_X=trinary.df_X, ser_y=trinary.ser_y)
     # Title with the color bar turned off
     plot(GENES, is_color_bar=False, title="title", is_plot=IS_PLOT)
Example #6
0
 def testTrinaryRefPooled(self):
     """
     Checks that data calculated with the pooled reference differs
     from data calculated with the default reference in at least
     one column (missing column or unequal values).
     """
     if IGNORE_TEST:
         return
     default_ref = TrinaryData(is_reinitialize=True)
     pooled_ref = TrinaryData(calcRef=PROVIDER.calcRefPooled,
                              is_reinitialize=True)

     def differs(column):
         # A column differs if it is absent from the pooled data
         # or if its values are not equal.
         if column not in pooled_ref.df_X.columns:
             return True
         return not default_ref.df_X[column].equals(
             pooled_ref.df_X[column])

     self.assertTrue(
         any(differs(column) for column in default_ref.df_X.columns))
Example #7
0
 def testTrinaryReadsDF2(self):
   return
   # Checks that trinary values computed directly from reads
   # are the same as those of normalized samples.
   # Get raw value of read counts
   provider = DataProvider()
   provider.do()
   #
   def calcTrinaryTimeSample(time_index):
       """
       Calculates the trinary value of a time sample
       :param str time_index: name of time value
       """
       int_index = int(time_index[1:])
       df0 = provider.dfs_read_count[0]
       num = len(provider.dfs_read_count)
       ser = pd.Series(np.repeat(0, len(df0.index)), index=df0.index)
       for idx in range(num):
           ser += provider.dfs_read_count[idx][int_index]
       df = pd.DataFrame(ser/num)
       df_result = transform_data.trinaryReadsDF(df_sample=df)
       return df_result.T
   #
   data = TrinaryData()
   data.df_X.columns = data.features
   for time_index in data.df_X.index:
     df_result = calcTrinaryTimeSample(time_index)
     import pdb; pdb.set_trace()
Example #8
0
def make(num_features=NUM_FEATURES,
    num_classifiers=NUM_CLASSIFIERS,
    file_path=cn.ENSEMBLE_PATH, is_force=True,
    **kwargs):
  """
  Builds an SVM classifier ensemble from the time course data,
  fits it, and exports it to file_path.
  When is_force is False, a previously exported ensemble is returned
  instead if it can be deserialized from file_path.
  :param int num_features: number of features in the ensemble
  :param int num_classifiers: number of classifiers
  :param str file_path: path where classifier is exported
  :param bool is_force: Create new classifier even
      if one exists already.
  :param dict kwargs: Options passed to TrinaryData
  :return  ClassifierEnsemble:
  """
  if not is_force:
    # Try to reuse an ensemble that was exported earlier
    try:
      existing = classifier_ensemble.ClassifierEnsemble.deserialize(
          file_path)
    except ValueError:
      # Could not be read back; fall through and build a new one
      pass
    else:
      return existing
  # Build the ensemble
  descriptor = classifier_ensemble.ClassifierDescriptorSVM()
  ensemble = classifier_ensemble.ClassifierEnsemble(
      descriptor, filter_high_rank=num_features, size=num_classifiers)
  # Fit on the trinary data, with columns named by the features
  trinary = TrinaryData(**kwargs)
  trinary.df_X.columns = trinary.features
  ensemble.fit(trinary.df_X, trinary.ser_y)
  # Export so that later calls with is_force=False can reuse it
  ensemble.serialize(file_path)
  return ensemble
Example #9
0
    def testNonAveraged(self):
        """
        Compares non-averaged data with averaged data: both must be
        internally consistent, and the non-averaged data must have
        more rows and more columns.
        """
        if IGNORE_TEST:
            return
        replicated = TrinaryData(is_averaged=False, is_dropT1=False)
        averaged = TrinaryData(is_averaged=True, is_dropT1=False)
        # Each data set has consistent states and one state per row
        for data in (replicated, averaged):
            isConsistentState(data.ser_y)
            self.assertEqual(len(data.df_X), len(data.ser_y))
        # Non-averaged data is larger in both dimensions
        self.assertGreater(len(replicated.df_X), len(averaged.df_X))
        self.assertGreater(len(replicated.df_X.columns),
                           len(averaged.df_X.columns))
        # Non-averaged data should have NUM_REPL entries with state 0
        is_state_zero = replicated.ser_y == 0
        self.assertEqual(replicated.ser_y[is_state_zero].count(), NUM_REPL)
Example #10
0
 def __init__(self, data=None):
   """
   :param NormalizedData data: data to use; a TrinaryData with
       default options is constructed when this is None
   """
   self.data = TrinaryData() if data is None else data
   # No scores until they are calculated
   self.scores = None
def _getData():
  """
  Obtains the non-averaged feature matrix and state series,
  keeping only columns that are in provider.tfs when IS_ONLY_TFS is set.
  :return pd.DataFrame, pd.Series:
  """
  provider = DataProvider()
  provider.do()
  trinary = TrinaryData(is_averaged=False, is_dropT1=False)
  all_columns = trinary.df_X.columns
  if IS_ONLY_TFS:
    selected = set(all_columns).intersection(provider.tfs)
  else:
    selected = all_columns
  return trinary.df_X[list(selected)], trinary.ser_y
def _getData(state, columns=None, **kwargs):
    """
    Obtains data for a binary classifier for the class.
    Labels are 1 where the state equals ``state`` and 0 otherwise.
    :param int state: state for which classification is done
    :param list columns: columns of the feature matrix to keep;
        all columns are used if None
    :param kwargs: dict: Options for TrinaryData
    :returns pd.DataFrame, pd.Series:
    """
    def toBinary(value):
        # 1 for the state of interest, 0 for every other state
        return 1 if value == state else 0

    trinary = TrinaryData(**kwargs)
    ser_y = trinary.ser_y.apply(toBinary)
    if columns is not None:
        # Copy of the requested columns
        return trinary.df_X[columns].copy(), ser_y
    return trinary.df_X, ser_y
Example #13
0
 def testGetStateNames(self):
     """
     Checks DataProvider.getStateNames in two ways:
     1. one name is returned per distinct state value in ser_y;
     2. the same names are returned when every index label of ser_y
        has ".0" appended.
     """
     if IGNORE_TEST:
         return
     self.provider.do()
     trinary = TrinaryData()
     result1s = self.provider.getStateNames(trinary.ser_y)
     count = len(set(trinary.ser_y.values))
     self.assertEqual(len(result1s), count)
     # Repeat with ".0" appended to each index label
     ser_y = trinary.ser_y.copy()
     ser_y.index = [i + ".0" for i in ser_y.index]
     # Pass the re-indexed copy. Passing trinary.ser_y again would
     # compare the first result with itself.
     result2s = self.provider.getStateNames(ser_y)
     # zip() truncates silently, so the lengths are compared first
     self.assertEqual(len(result1s), len(result2s))
     self.assertTrue(all(v1 == v2 for v1, v2 in zip(result1s, result2s)))
def _getData(state):
  """
  Obtains data for a binary classifier for the class.
  Features are the columns of the non-averaged data that are also in
  provider.tfs; labels are 1 where the state equals ``state``, else 0.
  :param int state: state for which classification is done
  :returns pd.DataFrame, pd.Series:
  """
  def toBinary(value):
    # 1 for the state of interest, 0 for every other state
    return 1 if value == state else 0

  provider = DataProvider()
  provider.do()
  trinary = TrinaryData(is_averaged=False, is_dropT1=False)
  tf_columns = list(
      set(trinary.df_X.columns).intersection(provider.tfs))
  ser_y = trinary.ser_y.apply(toBinary)
  return trinary.df_X[tf_columns], ser_y
Example #15
0
  def testRemoveGenesWithExcessiveReplicationVariance(self):
    """
    Checks that calling removeGenesWithExcessiveReplicationVariance
    with max_var of 1, 2 or 3 never keeps more columns than calling
    it without max_var, then checks a log2 conversion round trip on SER.
    """
    if IGNORE_TEST:
      return
    remove = transform_data.removeGenesWithExcessiveReplicationVariance
    trinary = TrinaryData(is_regulator=False, is_averaged=False,
        is_dropT1=False)
    df_base = remove(trinary.df_X)
    for max_var in (1, 2, 3):
      df = remove(trinary.df_X, max_var=max_var)
      self.assertGreaterEqual(len(df_base.columns), len(df.columns))

    # Round trip through log2 and back
    ser_log2 = util.convertToLog2(SER)
    ser_restored = util.unconvertFromLog2(ser_log2)
    # The entry labelled 0 is overwritten with 0 before comparing
    # (presumably SER holds a 0 there that does not survive the
    # round trip -- TODO confirm)
    ser_restored.loc[0] = 0
    self.assertTrue(all(
        np.isclose(v1, v2) for v1, v2 in zip(ser_restored, SER)))
Example #16
0
 def setUp(self):
     """
     Creates the TrinaryData and the DataProvider (with do() already
     called) that the tests use.
     """
     self.trinary = TrinaryData()
     provider = DataProvider()
     self.provider = provider
     provider.do()
Example #17
0
 def testPlotFeatureSignificanceByState(self):
     """
     Smoke test: plots feature significance by state for non-averaged
     data. No assertions are made on the result.
     """
     if IGNORE_TEST:
         return
     options = dict(is_averaged=False, is_dropT1=False)
     TrinaryData(**options).plotFeatureSignificanceByState(is_plot=IS_PLOT)
Example #18
0
 def initialize(self):
     """
     Initializes the data. Defines and initializes all names added to globals().

     Each value is built as a local and handed to self._addName together
     with the name it is published under. The steps, in order, are:
     reference-type names, the data provider and default trinary data,
     gene groups, stage names, trinary data per reference, sample data,
     one fitted classifier per (reference, gene group) pair, gene groups
     derived from those classifiers, and the accuracy table.

     Several lists are passed to _addName and mutated in place afterwards
     (see the notes below), so the statement order matters.
     """
     # Keys for the two references used to calculate bioreactor data
     T0 = "T0"
     POOLED = "pooled"
     self._addName("T0", "T0")
     self._addName("POOLED", "pooled")
     self._addName("REF_TYPE_POOLED", REF_TYPE_POOLED)
     self._addName("REF_TYPE_BIOREACTOR", REF_TYPE_BIOREACTOR)
     self._addName("REF_TYPE_SELF", REF_TYPE_SELF)
     # Provider
     PROVIDER = DataProvider()
     self._addName("PROVIDER", PROVIDER)
     PROVIDER.do()
     TRINARY = TrinaryData()
     self._addName("TRINARY", TRINARY)
     # Gene Classes
     ALL_GENES = list(TRINARY.df_X.columns)
     self._addName("ALL_GENES", ALL_GENES)
     # Gene groupings. Added later so can include top12 from classifier
     MYCOBACTIN_GENES = [
         "Rv2377c",
         "Rv2378c",
         "Rv2379c",
         "Rv2380c",
         "Rv2381c",
         "Rv2382c",
         "Rv2383c",
         "Rv2384",
         "Rv2385",
         "Rv2386c",
     ]
     self._addName("MYCOBACTIN_GENES", MYCOBACTIN_GENES)
     BACTERIOFERRITIN_GENES = [
         "Rv2341",
         "Rv3841",
     ]
     self._addName("BACTERIOFERRITIN_GENES", BACTERIOFERRITIN_GENES)
     # Combined group: a copy of the mycobactin list that is extended in
     # place AFTER it has been passed to _addName.
     # NOTE(review): the published value only contains the bacterioferritin
     # genes if _addName keeps a reference to this list -- confirm.
     MYCOBACTIN_BACTERIOFERRIN_GENES = list(MYCOBACTIN_GENES)
     self._addName("MYCOBACTIN_BACTERIOFERRIN_GENES",
                   MYCOBACTIN_BACTERIOFERRIN_GENES)
     MYCOBACTIN_BACTERIOFERRIN_GENES.extend(BACTERIOFERRITIN_GENES)
     # Keys of the gene groups
     MYCOBACTIN_BACTERIOFERRITIN = "mycobactin_bacterioferritin"
     BACTERIOFERRITIN = "bacterioferritin"
     MYCOBACTIN = "mycobactin"
     ALL = "all"
     # Gene group key -> list of genes. Published at the end, after the
     # classifier-derived groups have been added.
     GENE_DCT = {
         MYCOBACTIN: MYCOBACTIN_GENES,
         BACTERIOFERRITIN: BACTERIOFERRITIN_GENES,
         MYCOBACTIN_BACTERIOFERRITIN: MYCOBACTIN_BACTERIOFERRIN_GENES,
         ALL: ALL_GENES,
     }
     # Define the stage names
     # The list is passed to _addName before "Normoxia" is removed from it
     # in place. The rebinding to a numpy array on the last line affects
     # only the local that is later passed as class_names to fit().
     STAGE_NAMES = list(cn.STATE_NAMES)
     self._addName("STAGE_NAMES", STAGE_NAMES)
     STAGE_NAMES.remove("Normoxia")
     STAGE_NAMES = np.array(STAGE_NAMES)
     # Bioreactor data calculated with two different references
     DATA_DCT = {
         T0:
         TrinaryData(is_regulator=False, is_dropT1=True, is_averaged=True),
         POOLED:
         TrinaryData(is_regulator=False,
                     is_dropT1=True,
                     is_averaged=True,
                     calcRef=PROVIDER.calcRefPooled)
     }
     self._addName("DATA_DCT", DATA_DCT)
     # State series per reference
     SER_Y_DCT = {k: t.ser_y for k, t in DATA_DCT.items()}
     self._addName("SER_Y_DCT", SER_Y_DCT)
     # Feature vectors are specific to the gene subsets
     # Here the copies are restricted to the MYCOBACTIN_GENES columns.
     DF_X_DCT = {k: t.df_X.copy() for k, t in DATA_DCT.items()}
     DF_X_DCT = {k: df[MYCOBACTIN_GENES] for k, df in DF_X_DCT.items()}
     self._addName("DF_X_DCT", DF_X_DCT)
     # Sample data
     # One entry per reference type, first as returned by getSampleData
     # with is_regulator=False, then again with is_average=True.
     SAMPLE_DCT = {
         r: sample_data.getSampleData(ref_type=r, is_regulator=False)
         for r in [REF_TYPE_BIOREACTOR, REF_TYPE_SELF, REF_TYPE_POOLED]
     }
     self._addName("SAMPLE_DCT", SAMPLE_DCT)
     SAMPLE_AVG_DCT = {
         r: sample_data.getSampleData(ref_type=r,
                                      is_regulator=False,
                                      is_average=True)
         for r in [REF_TYPE_BIOREACTOR, REF_TYPE_SELF, REF_TYPE_POOLED]
     }
     self._addName("SAMPLE_AVG_DCT", SAMPLE_AVG_DCT)
     # Classifiers
     # num_feature is 12 here: 10 mycobactin genes plus 2 bacterioferritin
     # genes (the list was extended above).
     num_feature = len(MYCOBACTIN_BACTERIOFERRIN_GENES)
     CLASSIFIER_BASE = classifier_ensemble.ClassifierEnsemble(
         classifier_ensemble.ClassifierDescriptorSVM(),
         filter_high_rank=num_feature,
         size=NUM_CLASSIFIER_IN_ENSEMBLE)
     self._addName("CLASSIFIER_BASE", CLASSIFIER_BASE)
     # Filled in below with one fitted classifier per
     # (reference key, gene group key) pair; passed to _addName while
     # still empty.
     CLASSIFIER_DCT = {}
     self._addName("CLASSIFIER_DCT", CLASSIFIER_DCT)
     for trinary_key, trinary in DATA_DCT.items():
         for gene_key, gene_list in GENE_DCT.items():
             # Fit an independent copy so CLASSIFIER_BASE is not fitted
             classifier = copy.deepcopy(CLASSIFIER_BASE)
             # Not all genes may be present in TrinaryData since they may be correlated or unvarying.
             df_X = dataframe.subset(trinary.df_X, gene_list, axis=1)
             classifier.fit(df_X, trinary.ser_y, class_names=STAGE_NAMES)
             CLASSIFIER_DCT[(trinary_key, gene_key)] = classifier
     # Calculate the rest of the gene groups and add them
     # These come from the .columns attribute of the classifiers that were
     # fitted on ALL genes, one per reference.
     TOP12_T0 = "top12_T0"
     TOP12_POOLED = "top12_pooled"
     TOP12_T0_GENES = list(CLASSIFIER_DCT[(T0, ALL)].columns)
     TOP12_POOLED_GENES = list(CLASSIFIER_DCT[(POOLED, ALL)].columns)
     GENE_DCT[TOP12_T0] = TOP12_T0_GENES
     GENE_DCT[TOP12_POOLED] = TOP12_POOLED_GENES
     GENE_GROUPS = list(GENE_DCT.keys())
     self._addName("GENE_GROUPS", GENE_GROUPS)
     for name in GENE_GROUPS:
         self._addName(name.upper(), name)  # Add the name of each group
     self._addName("GENE_DCT", GENE_DCT)
     # Construct derivative structures
     # Defaults use the T0 reference. The literal key ('T0', 'mycobactin')
     # has the same value as (T0, MYCOBACTIN).
     self._addName("DF_X", DF_X_DCT[T0])
     self._addName("SER_Y", SER_Y_DCT[T0])
     self._addName("SAMPLE_DATA_DCT", SAMPLE_DCT[REF_TYPE_BIOREACTOR])
     self._addName("CLASSIFIER", CLASSIFIER_DCT[('T0', 'mycobactin')])
     key = (T0, "mycobactin_bacterioferritin")
     self._addName("GENES", CLASSIFIER_DCT[key].features)
     # Accuracy calculations for classifiers
     DF_ACCURACY = self.calcAccuracy()
     self._addName("DF_ACCURACY", DF_ACCURACY)