Ejemplo n.º 1
0
    def testConvertUnconvertToFromLog2(self):
        """
        Checks that util.unconvertFromLog2 undoes util.convertToLog2.

        The round trip is run on the SER and DF fixtures (defined outside
        this method) and the result is compared element-wise with SER.
        """
        if IGNORE_TEST:
            return

        def test(pd_obj):
            # pd_obj only selects which fixture is round-tripped.
            is_frame = isinstance(pd_obj, pd.DataFrame)
            fixture = DF if is_frame else SER
            logged = util.convertToLog2(fixture)
            restored = util.unconvertFromLog2(logged)
            # For a DataFrame, column "a" is the one compared against SER.
            ser_restored = restored["a"] if is_frame else restored
            # The entry labelled 0 is forced to 0 before comparing
            # (presumably it does not survive the round trip - TODO confirm).
            ser_restored.loc[0] = 0
            matches = [np.isclose(got, expected)
                       for got, expected in zip(ser_restored, SER)]
            self.assertTrue(all(matches))

        # Exercise both container types.
        for fixture_obj in (SER, DF):
            test(fixture_obj)

        # Direct round trip on the Series fixture.
        logged_ser = util.convertToLog2(SER)
        round_trip = util.unconvertFromLog2(logged_ser)
        round_trip.loc[0] = 0
        matches = [np.isclose(got, expected)
                   for got, expected in zip(round_trip, SER)]
        self.assertTrue(all(matches))
Ejemplo n.º 2
0
    def _makeNormalizedDF(self):
        """
        Normalizes the log2 read counts of each gene against a reference.

        The reference is obtained by applying self.calcRef to the un-logged
        counts and converting the result back to log2. It is subtracted
        from every row (gene), and each resulting value is floored at
        MIN_LOG2_VALUE. Rows whose values are all at or below
        MIN_LOG2_VALUE are dropped. If self._is_only_qgenes is set, only
        genes present in the index of self.df_gene_expression_state are
        kept (so that attribute must already be initialized).

        The original notes describe the input as the "normalized Read
        Counts" processed by DESeq2.

        :return pd.DataFrame:
            rows: gene
            columns: time
        """
        df = self._getLog2NormalizedReadcounts()
        # The reference is computed in un-logged units, then returned to log2.
        df_unlog2 = util.unconvertFromLog2(df)
        ser_ref_unlog2 = self.calcRef(df_unlog2)
        ser_ref = util.convertToLog2(ser_ref_unlog2)
        # Normalize w.r.t. the reference, one gene (row) at a time.
        # NOTE: rows are overwritten in place on the frame returned by
        # _getLog2NormalizedReadcounts.
        drops = []  # Rows to drop
        for idx in df.index:
            values = df.loc[idx, :] - ser_ref.loc[idx]
            df.loc[idx, :] = [max(MIN_LOG2_VALUE, v) for v in values]
            if all(v <= MIN_LOG2_VALUE for v in df.loc[idx, :]):
                drops.append(idx)
        df = df.drop(index=drops)  # Drop the rows that are entirely minimal
        # Find genes to keep
        if self._is_only_qgenes:
            keep_genes = self.df_gene_expression_state.index
            df = df[df.index.isin(keep_genes)]
        #
        return df
Ejemplo n.º 3
0
 def initialize(self):
   """
   Construct the feature vectors for the samples.

   For every entry of SAMPLE_DESCRIPTOR_DCT a DataFrame is built and
   stored on self under the name returned by getDataframeAttributeName.
   The attribute is set to None when the reference type is 'self' and
   the descriptor has no selection.

   :raises RuntimeError: a sample is not gene normalized, or
       self.ref_type is not a recognized reference type
   """
   # Iterate across all samples
   for sample_name, descriptor in SAMPLE_DESCRIPTOR_DCT.items():
     attribute_name = self.getDataframeAttributeName(sample_name)
     ###
     # Construct a data frame that is normalized for gene and library
     # and has log2 units
     ###
     # Select indices that are for the conditions/times considered
     df = transform_data.readGeneCSV(descriptor.csv).T
     sel = [any(d in i for d in descriptor.cnm) for i in df.index]
     df = df[sel]
     # Sort the instances. Rows are reordered positionally so that each
     # label stays attached to its own data; assigning the sorted labels
     # to df.index would relabel the rows without moving them.
     labels = list(df.index)
     order = sorted(range(len(labels)),
         key=lambda pos: self._makeSortKey(sample_name, labels[pos]))
     df = df.iloc[order]
     if not descriptor.nrml:
       raise RuntimeError("Do gene normalization for sample %s" % sample_name)
     if not descriptor.log2:
       df = util.convertToLog2(df)
     ###
     # Convert to trinary values. This takes into account the reference values
     # for gene expression
     ###
     if self.ref_type == REF_TYPE_BIOREACTOR:
       ser_ref = transform_data.makeBioreactorT0ReferenceData()
     elif self.ref_type == REF_TYPE_POOLED:
       ser_ref = df.mean(axis=0)
     elif self.ref_type == REF_TYPE_SELF:
       if descriptor.sel is None:
         print("***%s: no selection for reference type 'self'. Result is None."
             % sample_name)
         df = None
       else:
         ser_ref = self._calcRefFromIndices(df, descriptor.sel)
     else:
       raise RuntimeError("%s is an invalid reference type" % self.ref_type)
     if df is not None:
       ###
       # Average replicas if requested
       ###
       if self.is_average:
         df = self.averageReplicas(df, descriptor.cnm)
       ###
       # Convert to trinary values
       ###
       df = transform_data.calcTrinaryComparison(df.T, ser_ref,
           is_convert_log2=False).T
       ###
       # Restrict to regulators?
       ###
       if self.is_regulator:
         # The return value is not used; presumably subsetToRegulators
         # modifies df in place - TODO confirm.
         trinary_data.subsetToRegulators(df)
     #
     setattr(self, attribute_name, df)
Ejemplo n.º 4
0
def makeBioreactorT0ReferenceData():
    """
    Creates the T0 reference data in log units.

    Works on a deep copy of PROVIDER.dfs_adjusted_read_count: the column
    names of each frame are passed through stripReplicaString, the frames
    are averaged, and the cn.TIME_0 column of the average is converted
    with util.convertToLog2.

    :return Series:
    """
    replica_dfs = copy.deepcopy(PROVIDER.dfs_adjusted_read_count)
    for replica_df in replica_dfs:
        replica_df.columns = stripReplicaString(replica_df.columns)
    num_replica = len(PROVIDER.dfs_adjusted_read_count)
    df_mean = sum(replica_dfs) / num_replica
    return util.convertToLog2(df_mean[cn.TIME_0])
Ejemplo n.º 5
0
 def test(pd_obj):
     """
     Round-trips a fixture through convertToLog2 / unconvertFromLog2 and
     asserts that the result is element-wise close to SER.

     :param pd_obj: only its type is used; a pd.DataFrame selects the DF
         fixture, anything else selects SER
     """
     is_frame = isinstance(pd_obj, pd.DataFrame)
     fixture = DF if is_frame else SER
     logged = util.convertToLog2(fixture)
     restored = util.unconvertFromLog2(logged)
     # For a DataFrame, column "a" is the one compared against SER.
     ser_restored = restored["a"] if is_frame else restored
     # The entry labelled 0 is forced to 0 before comparing
     # (presumably it does not survive the round trip - TODO confirm).
     ser_restored.loc[0] = 0
     matches = [np.isclose(got, expected)
                for got, expected in zip(ser_restored, SER)]
     self.assertTrue(all(matches))
Ejemplo n.º 6
0
def calcTrinaryComparison(df, ser_ref, threshold=1, is_convert_log2=True):
    """
    Calculates trinary values of a DataFrame w.r.t. a reference in
    log2 units.

    Only labels present in both df.index and ser_ref.index are compared.
    The log2 difference df - ser_ref is handed to makeTrinaryData with
    min_abs=threshold and is_include_nan=False.

    :param pd.DataFrame df: comparison values; columns are instances,
        has same index as ser_ref
    :param pd.Series ser_ref: reference values
    :param float threshold: comparison threshold (in log2 units)
    :param bool is_convert_log2: convert df and ser_ref to log2 before
        comparing; if False they are assumed to be in log2 units already
    :return pd.DataFrame: trinary values resulting from comparisons,
        as produced by makeTrinaryData; per the original notes:
       -1: df is less than the reference by more than the threshold
        1: df is greater than the reference by more than the threshold
        0: otherwise
    :raises RuntimeError: ser_ref is None
    """
    if ser_ref is None:
        raise RuntimeError("ser_ref cannot be None.")
    if is_convert_log2:
        ser_ref_log = util.convertToLog2(ser_ref)
        df_log = util.convertToLog2(df)
    else:
        df_log = df.copy()
        ser_ref_log = ser_ref.copy()
    # Find the common indices. An Index is used rather than a set because
    # pandas no longer accepts a set as an indexer.
    indices = df_log.index.intersection(ser_ref_log.index)
    df_log = df_log.loc[indices, :]
    ser_ref_log = ser_ref_log.loc[indices]
    # Transposed so that the reference aligns with the columns.
    df_comp_T = df_log.T - ser_ref_log
    # Drop the nan columns, those genes for which there is no reference
    df_comp = (df_comp_T.dropna(axis=1, how='all')).T
    df_result = makeTrinaryData(df=df_comp,
                                min_abs=threshold,
                                is_include_nan=False)
    return df_result
Ejemplo n.º 7
0
  def testRemoveGenesWithExcessiveReplicationVariance(self):
    """
    Checks that calling removeGenesWithExcessiveReplicationVariance with
    max_var of 1, 2 or 3 never yields more columns than the call with the
    default max_var. Also repeats the log2 round-trip check on SER.
    """
    if IGNORE_TEST:
      return
    trinary = TrinaryData(is_averaged=False, is_dropT1=False,
        is_regulator=False)
    df_base = transform_data.removeGenesWithExcessiveReplicationVariance(
        trinary.df_X)
    for max_var in (1, 2, 3):
      df_limited = transform_data.removeGenesWithExcessiveReplicationVariance(
          trinary.df_X, max_var=max_var)
      num_base = len(df_base.columns)
      num_limited = len(df_limited.columns)
      self.assertGreaterEqual(num_base, num_limited)

    # Round trip of the SER fixture through the log2 conversion helpers.
    logged_ser = util.convertToLog2(SER)
    round_trip = util.unconvertFromLog2(logged_ser)
    round_trip.loc[0] = 0
    matches = [np.isclose(got, expected)
               for got, expected in zip(round_trip, SER)]
    self.assertTrue(all(matches))