Example #1
# TODO: Fix so working with the same transformation of features,
#       either all gene features or all gene-groups.
def testTrinaryReadsDF2(self):
  return
  # Checks that trinary values computed directly from reads
  # are the same as those of normalized samples.
  # Get raw values of read counts
  provider = DataProvider()
  provider.do()
  #
  def calcTrinaryTimeSample(time_index):
    """
    Calculates the trinary values of a time sample.
    :param str time_index: name of the time value
    """
    int_index = int(time_index[1:])
    df0 = provider.dfs_read_count[0]
    num = len(provider.dfs_read_count)
    ser = pd.Series(np.repeat(0, len(df0.index)), index=df0.index)
    for idx in range(num):
      ser += provider.dfs_read_count[idx][int_index]
    df = pd.DataFrame(ser / num)
    df_result = transform_data.trinaryReadsDF(df_sample=df)
    return df_result.T
  #
  data = TrinaryData()
  data.df_X.columns = data.features
  for time_index in data.df_X.index:
    df_result = calcTrinaryTimeSample(time_index)
    # TODO: compare df_result with the corresponding row of data.df_X
Example #2
def trinaryReadsDF(csv_file=None, df_sample=None,
    csv_dir=cn.SAMPLES_DIR, is_display_errors=True):
  """
  Creates trinary values for read counts w.r.t. data provider.
  (a) adjusting for gene length, (b) library size,
  (c) log2, (d) ratio w.r.t. T0.
  Data may come from an existing dataframe or a CSV file.
  :param str csv_file: File in "samples" directory.
      columns are: "GENE_ID", instance ids
  :param pd.DataFrame df_sample: columns are genes,
      index are instances, values are raw readcounts
  :param str csv_dir: directory where csv file is found
  :return pd.DataFrame: columns are genes, 
      indexes are instances, trinary values
  At least one of df_sample and csv_file must be non-null
  """
  provider = DataProvider(is_display_errors=is_display_errors)
  provider.do()
  if df_sample is None:
    path = os.path.join(csv_dir, csv_file)
    df_sample = pd.read_csv(path)
    df_sample.index = df_sample['GENE_ID']
    del df_sample['GENE_ID']
  #
  df_normalized = provider.normalizeReadsDF(df_sample)
  # Compute trinary values relative to original reads
  df_ref = sum(provider.dfs_adjusted_read_count)  \
      / len(provider.dfs_adjusted_read_count)  # Mean values
  ser_ref = df_ref[cn.REF_TIME]
  return calcTrinaryComparison(df_normalized, ser_ref=ser_ref)
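A minimal usage sketch (the CSV name matches the one used in the tests on this page; the samples directory must contain it):

import common.transform_data as transform_data

df_trinary = transform_data.trinaryReadsDF(
    csv_file="AM_MDM_Mtb_transcripts_DEseq.csv",
    is_display_errors=False)
# Result: columns are genes, rows are instances, values in {-1, 0, 1}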
Example #3
class NormalizedData(object):
    """Exposes normalized read counts and state labels."""

    def __init__(self, is_display_errors=True, is_averaged=True):
        """
        :param bool is_display_errors: show errors encountered
        :param bool is_averaged: use averaged read counts
        Public instance variables:
          df_X - normalized read counts
          state_dict - mapping of literal state name to numeric value
          ser_y - numeric value of the state for each row in df_X
        """
        self._is_display_errors = is_display_errors
        self.provider = DataProvider(is_display_errors=self._is_display_errors)
        self.provider.do()
        self.df_X = self.provider.df_normalized.T
        self.df_X = self.df_X.drop(index="T0")
        self.features = self.df_X.columns.tolist()
        self.df_X.columns = range(len(self.features))
        # Create class information
        ser_y = self.provider.df_stage_matrix[cn.STAGE_NAME]
        ser_y = ser_y.drop(index="T0").copy()
        # Relabel Normoxia as Resuscitation
        ser_y[ser_y == 'Normoxia'] = 'Resuscitation'
        # Create converter from state name to numeric index
        states = ser_y.unique()
        self.state_dict = {k: v for v, k in enumerate(states)}
        self.ser_y = ser_y.apply(lambda k: self.state_dict[k])
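A short usage sketch (assumes the data files read by DataProvider are available locally):

data = NormalizedData(is_display_errors=False)
X, y = data.df_X, data.ser_y   # features and numeric state labels
print(data.state_dict)         # e.g. maps 'Resuscitation' to a numeric index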
Example #4
def aggregateGenes(df=None, provider=None):
  """
  Combines genes that are perfectly correlated in time for trinary
  values.
  :param DataFrame df: dataframe to transform
  :param DataProvider provider: uses df_normalized
  :return pd.DataFrame: names are combined for aggregated
      genes; calculates trinary values
  """
  if df is None:
    if provider is None:
      provider = DataProvider()
      provider.do()
    df = provider.df_normalized
  df_trinary = makeTrinaryData(df, is_include_nan=False)
  dfg = df_trinary.groupby(df_trinary.columns.tolist())
  groups = dfg.groups
  data = {}
  for key, genes in groups.items():
    label = "--".join(genes)
    data[label] = list(key)
  df = pd.DataFrame(data)
  df_result = df.T
  df_result.columns = df_trinary.columns
  return df_result
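A usage sketch that reuses a single provider, as the tests on this page do:

provider = DataProvider()
provider.do()
df_agg = aggregateGenes(provider=provider)
# Index labels such as "Rv0981--Rv1332--Rv1828" name groups of perfectly
# correlated genes; the row values are the group's shared trinary profile.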
Example #5
def getProvider(provider):
    """
    Returns the provider passed in, or constructs and initializes
    a default DataProvider if provider is None.
    """
    if provider is None:
        provider = DataProvider()
        provider.do()
    return provider
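This is the provider idiom used throughout this page; a sketch of both paths:

provider = getProvider(None)   # constructs and initializes a DataProvider
same = getProvider(provider)   # an existing provider is returned unchanged
assert same is provider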
Example #6
def testAggregateGenes(self):
  if IGNORE_TEST:
    return
  provider = DataProvider()
  provider.do()
  df = transform_data.aggregateGenes(provider=provider)
  self.assertTrue(helpers.isValidDataFrame(df,
      provider.df_normalized.columns))
Example #7
def __init__(self, df_trinary=None):
    """
    :param pd.DataFrame df_trinary: trinary-valued DF (values -1, 0, 1)
    """
    if df_trinary is None:
        # makeTrinaryData builds its own DataProvider when df is None
        df_trinary = transform_data.makeTrinaryData(is_include_nan=False)
    self.df_trinary = df_trinary
    self.df_group = None  # Dataframe describing groups
    self.df_gene_group = None  # Genes by group
Example #8
def _getData():
  provider = DataProvider()
  provider.do()
  trinary = TrinaryData(is_averaged=False,
      is_dropT1=False)
  if IS_ONLY_TFS:
    columns = set(trinary.df_X.columns).intersection(
        provider.tfs)
  else:
    columns = trinary.df_X.columns
  columns = list(columns)
  return trinary.df_X[columns], trinary.ser_y
Example #9
class CVPlotter():

  def __init__(self, provider=None, is_plot=True):
    if provider is None:
      self._provider = DataProvider()
      self._provider.do()
    else:
      self._provider = provider
    self._is_plot = is_plot

  def heatMap(self, min_cv=0):
    """
    Plots a heatmap of the coefficient of variations.
    :param pd.DataFrame df_cv: CVs
    :param float min_cv: minimum CV to consider
    """
    plt.figure(figsize=(16, 10))
    df = self._provider.df_cv
    # Label the x axis ticks with the time columns
    ax = plt.gca()
    ax.set_xticks(np.arange(len(df.columns))+0.5)
    ax.set_xticklabels(df.columns)
    df = df.applymap(lambda v: v if v >= min_cv else np.nan)
    heatmap = plt.pcolor(df, cmap='jet')
    plt.colorbar(heatmap)
    plt.xlabel("times")
    plt.ylabel("gene")
    plt.title("Coefficient of Variation > %d percent" % min_cv)
    if self._is_plot:
      plt.show()

  def readsAndDO(self):
    """
    Plots the following lines for the hours of the experiments:
      Average CV of genes
      CV of dissolved oxygen (DO)
      Avg dissolved oxygen
    """
    hours = self._provider.df_hypoxia[cn.HOURS]
    means = self._provider.df_hypoxia[cn.MEAN]
    error_bars = [2*s for s in self._provider.df_hypoxia[cn.STD]]
    plt.errorbar(hours, means, yerr=error_bars, marker="o")
    ax = plt.gca()
    # Plot CVs of genes
    ser = self._provider.df_cv.mean()  # Average over genes
    ax.plot(hours, ser.values, linestyle='dashed', marker="o",
        color='r')
    plt.xlabel("hours")
    plt.ylabel("DO/CV")
    plt.legend(["CV for read counts", "DO +/- 2 std"])
    if self._is_plot:
      plt.show()
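A usage sketch (is_plot=False suppresses the matplotlib windows, e.g. in tests):

plotter = CVPlotter(is_plot=False)
plotter.heatMap(min_cv=50)   # show only genes whose CV exceeds 50 percent
plotter.readsAndDO()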
Example #10
class TestFunctions(unittest.TestCase):
    def setUp(self):
        self.trinary = TrinaryData()
        self.provider = DataProvider()
        self.provider.do()

    def testCountTerms(self):
        if IGNORE_TEST:
            return
        TERM = "DNA replication"
        EXPECTED_COUNT = 2

        def test(terms, expected_count, fset=None):
            df_gene = self.provider.df_go_terms
            if fset is None:
                fset = FeatureSet(df_gene[xcn.GENE_ID][1:3])
            count = util_classifier.countTerms(fset, terms)
            self.assertEqual(count, expected_count)

        #
        test([TERM], EXPECTED_COUNT)
        test([TERM, TERM], 2 * EXPECTED_COUNT)
        test(["DUMMY"], 0)
        #
        fset = FeatureSet(['Rv0981--Rv1332--Rv1828'])
        test(["DUMMY"], 0, fset=fset)

    def testCountTerms2(self):
        if IGNORE_TEST:
            return
        TERMS = ["a"]
        fset = FeatureSet(["Rv2009"])
        count1 = util_classifier.countTerms(fset,
                                            TERMS,
                                            is_include_module=False)
        count2 = util_classifier.countTerms(fset,
                                            TERMS,
                                            is_include_module=True)
        self.assertGreater(count2, count1)

    def testExtractAggregatedGene(self):
        if IGNORE_TEST:
            return
        GENES = ['Rv0981', 'Rv1332', 'Rv1828']
        AGGREGATED_GENE = xcn.GENE_SEPARATOR.join(GENES)
        genes = util_classifier.extractAggregatedGene(AGGREGATED_GENE)
        diff = set(GENES).symmetric_difference(genes)
        self.assertEqual(len(diff), 0)
Example #11
def testTrinaryReadsDF1(self):
    if IGNORE_TEST:
        return
    provider = DataProvider()
    provider.do()
    df = provider.dfs_read_count[0]
    df_result = transform_data.trinaryReadsDF(df_sample=df)
    # See if the number of "-1" values is excessive
    dff = df_result + df_result.applymap(lambda v: -np.abs(v))
    frac_minus1 = -dff.sum().sum()  \
        / (2 * len(df_result) * len(df_result.columns))
    self.assertLess(frac_minus1, 0.25)
    # Smoke test for CSV input
    df_result = transform_data.trinaryReadsDF(
        csv_file="AM_MDM_Mtb_transcripts_DEseq.csv",
        is_display_errors=False)
Example #12
def _getData(state):
  """
  Obtains data for a binary classifier for the class.
  :param int state: state for which classification is done
  :param pd.DataFrame, pd.Series:
  """
  provider = DataProvider()
  provider.do()
  trinary = TrinaryData(is_averaged=False,
      is_dropT1=False)
  columns = set(trinary.df_X.columns).intersection(
      provider.tfs)
  columns = list(columns)
  ser_y = trinary.ser_y.apply(lambda v:
    1 if v == state else 0)
  return trinary.df_X[columns], ser_y
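A usage sketch; the state index 0 is illustrative and should be a value appearing in TrinaryData().ser_y:

df_X, ser_y = _getData(state=0)
assert set(ser_y.unique()) <= {0, 1}   # one-vs-rest binary labels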
Example #13
class TestTermAnalyzer(unittest.TestCase):

  def setUp(self):
    self.provider = DataProvider()
    self.provider.do()
    self.analyzer = term_analyzer.TermAnalyzer(
        self.provider.df_ec_terms, is_plot=IS_PLOT)

  def testConstructor(self):
    if IGNORE_TEST:
      return
    self.assertTrue(helpers.isValidDataFrame(self.analyzer.df_term,
        self.analyzer.df_term.columns))

  def testPlotTermHeatmap(self):
    if IGNORE_TEST:
      return
    self.analyzer.plotTermHeatmap(is_plot=IS_PLOT)
Example #14
def makeTrinaryData(df=None, min_abs=1.0, is_include_nan=True):
    """
  Thresholds data based on its absolute magnitude.
  Values are assigned as -1, 0, 1
  :param pd.DataFrame df: default is provider.df_normalized
    values are in log2 units
  :param float min_abs: minimal absolute value to threshold.
  :param bool is_include_nan: Include nan values; else set to 0
  :return pd.DataFrame: same index and columns as df
  """
    if df is None:
        provider = DataProvider()
        provider.do()
        df = provider.df_normalized
    df_result = df.copy()
    df_result = df_result.applymap(lambda v: 0 if np.abs(v) < min_abs else -1
                                   if v < 0 else 1)
    if is_include_nan:
        df_result = df_result.applymap(lambda v: np.nan if v == 0 else v)
    return df_result
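A self-contained sketch on a small frame (values in log2 units, per the docstring):

import pandas as pd

df = pd.DataFrame({'g1': [1.5, -0.2, -2.0]})
print(makeTrinaryData(df=df, is_include_nan=False))
# Expected output:
#    g1
# 0   1
# 1   0
# 2  -1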
Example #15
def countTerms(fset, terms, is_include_module=True):
    """ 
  Counts the occurrences of terms in the GO terms
  of genes in the FeatureSet.

  Parameters
  ----------
  fset: FeatureSet
  terms: list-str
  is_include_module: bool
      consider all genes in modules activated by a
      gene in fset

  Returns
  -------
  int
  """
    provider = DataProvider()
    provider.do()
    # Extract the genes
    genes = []
    for feature in fset.list:
        genes.extend(extractAggregatedGene(feature))
    if is_include_module:
        new_genes = []
        tfs = list(set(provider.df_trn_unsigned[xcn.TF].tolist()))
        for gene in genes:
            if gene in tfs:
                sel = provider.df_trn_unsigned[xcn.TF] == gene
                df = provider.df_trn_unsigned[sel]
                new_genes.extend(df[xcn.GENE_ID].tolist())
        genes.extend(new_genes)
        genes = list(set(genes))
    # Compile the string of go terms for the genes
    df = provider.df_go_terms
    indices = [df[df[xcn.GENE_ID] == g].index.tolist() for g in genes]
    indices = [t for l in indices for t in l]
    go_terms = df.loc[indices, xcn.GO_TERM].to_list()
    go_str = "****".join(go_terms)
    count = sum([go_str.count(t) for t in terms])
    return count
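A usage sketch mirroring the tests above (the gene and term appear in the test code on this page):

fset = FeatureSet(["Rv2009"])
count = countTerms(fset, ["DNA replication"], is_include_module=False)
# count is the number of occurrences of the term in the GO strings of fset's genes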
Example #16
class TermMatrix(object):
    """
  The core dataframe is the term matrix, self.df_matrix. Its columns
  are terms; the rows are groups of correlated genes. A group is
  a tuple of trinary values indicating when that terms is expressed.
  """
    def __init__(self, term_column=cn.GO_TERM, is_plot=True, **kwargs):
        """
        :param str term_column: column in go_terms to use for text
        :param dict **kwargs: arguments to DataGrouper
        """
        self._is_plot = is_plot
        self.provider = DataProvider()
        self.provider.do()
        self.grouper = DataGrouper(**kwargs)
        self.grouper.do(min_size=1)
        self.df_matrix = self._makeMatrix(term_column)
        self.df_gene_term = self._makeGeneTerm()

    def _makeTermGroup(self, term_column=cn.GO_TERM):
        """
        :param str term_column: column in go_terms to use for text
        :return pd.DataFrame:
          index - group (time intervals with trinary values)
          column - Term
        """
        df = self.grouper.df_gene_group.merge(self.provider.df_go_terms,
                                              left_index=True,
                                              right_index=True,
                                              how='inner')
        if term_column == cn.INDEX:
            df_term = df[[cn.GROUP]].copy()
            df_term[term_column] = df.index
        else:
            df_term = df[[cn.GROUP, term_column]].copy()
        df_term = df_term.set_index(cn.GROUP)
        return df_term

    def _makeMatrix(self, term_column=cn.GO_TERM):
        """
        :param str term_column: column in go_terms to use for text
        :return pd.DataFrame: matrix with the terms
        """
        df_term = self._makeTermGroup(term_column=term_column)
        df_result = util_text.makeTermMatrix(df_term[term_column])
        return df_result

    def _makeGeneTerm(self):
        """
        Finds the genes and terms that co-occur at the same times.
        :return pd.DataFrame:
            cn.GROUP - trinary values for genes at times
            cn.TERM - list of GO terms
            cn.GENE_ID - list of genes
            cn.CNT_TERM - count of GO terms
            cn.CNT_GENE - count of genes
            cn.CNT_REGULATED - count of times up- or down-regulated
        """
        def makeGroupedDF(df):
            df = df.reset_index()
            return df.groupby(cn.GROUP)

        def extract(df, key, col):
            sel = df.index == key
            result = df[sel][col].values.tolist()
            return result

        #
        df_term = self._makeTermGroup()
        df_gene = self.grouper.df_gene_group
        df_gene = df_gene.reset_index()
        df_gene = df_gene.set_index(cn.GROUP)
        dfg_term = makeGroupedDF(df_term)
        dfg_gene = makeGroupedDF(self.grouper.df_gene_group)
        # Find the keys in common
        keys_term = [k for k in dfg_term.groups]
        keys_gene = [k for k in dfg_gene.groups]
        keys_common = set(keys_term).intersection(keys_gene)
        dict_df = {cn.GROUP: [], cn.TERM: [], cn.GENE_ID: []}
        for key in keys_common:
            dict_df[cn.GROUP].append(key)
            dict_df[cn.TERM].append(extract(df_term, key, cn.GO_TERM))
            dict_df[cn.GENE_ID].append(extract(df_gene, key, cn.GENE_ID))
        df_result = pd.DataFrame(dict_df)
        df_result[cn.CNT_GENE] = [len(g) for g in df_result[cn.GENE_ID]]
        df_result[cn.CNT_TERM] = [len(t) for t in df_result[cn.TERM]]
        df_result[cn.CNT_REGULATED] =   \
            [sum([np.abs(x) for x in g]) for g in df_result[cn.GROUP]]
        return df_result

    def makeAggregationMatrix(self, predicates):
        """
        Creates a matrix with the same columns as df_matrix
        and row i that is the summation of the values in rows
        that satisfy predicate i.
        :param list-BooleanFunc predicates: predicates on group tuples
        """
        columns = self.df_matrix.columns
        column_values = {c.strip(): [] for c in columns}
        for pos, predicate in enumerate(predicates):
            row = np.repeat(0.0, len(columns))
            row = row.reshape(1, len(columns))
            # TODO: Fix predicates
            if False:
                for group in self.df_matrix.index:
                    if predicate(group):
                        values = np.array(self.df_matrix.loc[[group], :])
                        row += values
            for group in self.df_matrix.index:
                if group[pos] == 1:
                    values = np.array(self.df_matrix.loc[[group], :])
                    row += values
            row = row.reshape(len(columns))
            # Add the row for this predicate (keys were stripped above)
            for idx, col in enumerate(columns):
                column_values[col.strip()].append(row[idx])
        return pd.DataFrame(column_values)

    # TODO: Fix use of predicates
    def plotAggregation(self, predicates, min_val=0, is_include_ylabels=True):
        df = self.makeAggregationMatrix(predicates)
        df = df.applymap(lambda v: 0 if v < min_val else v)
        df = df.applymap(lambda v: np.nan if np.isclose(v, 0) else v)
        # Drop columns that are all nans
        for col in df.columns:
            if all([np.isnan(v) for v in df[col]]):
                del df[col]
        # Construct the plot
        plt.subplot(1, 2, 2)
        heatmap = plt.pcolor(df.transpose(), cmap='jet')
        if is_include_ylabels:
            ax = plt.gca()
            ax.set_yticks(np.arange(len(df.columns)) + 0.5)
            ax.set_yticklabels(df.columns, fontsize=8)
        plt.title("Term Counts")
        plt.colorbar(heatmap)
        plt.show()

    def makeTimeAggregationMatrix(self, is_up_regulated=True):
        """
        Creates a matrix with the same columns as df_matrix
        and row t that is the summation of the values in rows
        up- (or down-) regulated at time t.
        :param bool is_up_regulated:
        """
        if is_up_regulated:
            direction = 1
        else:
            direction = -1
        columns = self.df_matrix.columns
        column_values = {c.strip(): [] for c in columns}
        for time in range(cn.NUM_TIMES):
            row = np.repeat(0.0, len(columns))
            row = row.reshape(1, len(columns))
            for group in self.df_matrix.index:
                if group[time] == direction:
                    values = np.array(self.df_matrix.loc[[group], :])
                    row += values
            row = row.reshape(len(columns))
            # Add the row for this time (keys were stripped above)
            for idx, col in enumerate(columns):
                column_values[col.strip()].append(row[idx])
        return pd.DataFrame(column_values)

    def calcClusters(self, max_distance=1, is_up_regulated=True):
        """
        Calculates log significance levels and clusters.
        :param float max_distance: maximum distance between clusters,
            otherwise merged
        :return pd.DataFrame, ndarray, pd.Series:
           df_log - log of significance level
           row_linkage - linkage matrix
               See https://stackoverflow.com/questions/9838861/scipy-linkage-format
           ser_cluster - cn.GROUP (indexed by term)
        """
        df = self.makeTimeAggregationMatrix(is_up_regulated=is_up_regulated)
        # Remove rows with zero variance
        df_filtered = util_statistics.filterZeroVarianceRows(df.T)
        # Compute significance levels
        df_log = util_statistics.calcLogSL(df_filtered, round_decimal=3)
        df_log = df_log.applymap(lambda v: HIGH_SL if np.isnan(v) else v)
        # Compute the clusters
        log_arrays = np.asarray(df_log)
        row_linkage = linkage(distance.pdist(log_arrays), method='average')
        # NOTE: clusters are cut at a fixed distance of 0.1;
        # the max_distance parameter is currently unused
        ser_cluster = pd.Series(
            fcluster(row_linkage, 0.1, criterion="distance"))
        ser_cluster.index = df_log.index
        #
        return df_log, row_linkage, ser_cluster

    # Include state transitions
    # Note how clusters relate to state observations
    def plotTimeAggregation(self, is_up_regulated=True):
        """
        Plots aggregation of groups over time.
        :param bool is_up_regulated:
        """
        df_log, row_linkage, ser_cluster =  \
            self.calcClusters(is_up_regulated=is_up_regulated)
        # Heatmap
        cg = sns.clustermap(df_log,
                            row_linkage=row_linkage,
                            col_cluster=False,
                            cbar_kws={"ticks": [0, 5]},
                            cmap="Blues")
        # Construct a cluster map
        #cg = sns.clustermap(df_log, col_cluster=False,
        #    cbar_kws={"ticks":[0,5]}, cmap="Blues")
        # Set the labels
        cg.ax_heatmap.set_xlabel("Time")
        if is_up_regulated:
            direction = "Up"
        else:
            direction = "Down"
        title = "-log10 zscores of %s-regulated term counts" % (direction)
        cg.ax_heatmap.set_title(title)
        xticks = cg.ax_heatmap.get_xticks() - 0.5  # Correct tick position
        cg.ax_heatmap.set_xticks(xticks)
        cg.ax_heatmap.set_yticks([])
        cg.ax_heatmap.set_yticklabels([])
        # Add the state transitions
        util_plots.plotStateTransitions(ymax=len(df_log),
                                        ax=cg.ax_heatmap,
                                        is_plot=False)
        #
        if self._is_plot:
            plt.show()
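A usage sketch (DataGrouper options pass through **kwargs; is_plot=False keeps it batch-friendly):

matrix = TermMatrix(is_plot=False)
df_counts = matrix.makeTimeAggregationMatrix(is_up_regulated=True)
matrix.plotTimeAggregation(is_up_regulated=True)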
Example #17
from common import constants as cn  # assumed location of the cn constants used below
from common import data_provider
import common.transform_data as transform_data
from common_python.classifier import util_classifier

import collections
import copy
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns

T1_INDEX = "T1"
MIN_NUM_NORMOXIA = 2  # Minimum number of normoxia states
PROVIDER = data_provider.DataProvider(is_display_errors=False)
PROVIDER.do()


################## FUNCTIONS ###############
def subsetToRegulators(df):
    regulators = PROVIDER.df_trn_unsigned[cn.TF]
    regulators = list(set(regulators))
    regulator_cols = list(set(df.columns).intersection(regulators))
    for column in df.columns:
        if column not in regulator_cols:
            del df[column]


def convertToTrinary(df, threshold_low=-1, threshold_high=1):
    """
  Converts the dataframe to trinary values using the indicated thresholds.
Example #18
class NormalizedData(object):
    """Exposes normalized read counts and state labels."""

    def __init__(self, is_averaged=True, is_regulator=False, **kwargs):
        """
        :param bool is_averaged: use averaged read counts
        :param bool is_regulator: restrict genes to TRN regulators
        :param dict kwargs: options passed to DataProvider

        Public instance variables:
          df_X - normalized read counts;
              instances are either times (begin with T) or stages (S)
          ser_y - numeric value of the state for each row in df_X
          state_dct - key: state name; value: state index
          features - list of gene names
        """
        self.provider = DataProvider(**kwargs)
        self.provider.do()
        if is_averaged:
            self.df_X = self.provider.df_normalized.T
        else:
            # Use the adjusted values for each replication
            dfs = [
                df.copy()
                for df in self.provider.dfs_adjusted_read_count_wrtT0_log2
            ]
            self.df_X = pd.concat([df.T for df in dfs])
        drop_indices = self._getDropIndices(self.df_X.index)
        self.df_X = self.df_X.drop(drop_indices)
        if is_regulator:
            subsetToRegulators(self.df_X)
        self.features = self.df_X.columns.tolist()
        # Create class information
        ser_y = self.provider.df_stage_matrix[cn.STAGE_NAME]
        if not is_averaged:
            # Replica information has a special time format
            num_repl = len(self.provider.dfs_read_count)
            sers = []
            for idx in range(num_repl):
                new_ser_y = ser_y.copy()
                new_ser_y.index = self.provider.makeTimes(suffix=idx)
                sers.append(new_ser_y)
            ser_y = pd.concat(sers)
        states = list(cn.STATE_NAMES)
        ser_y = ser_y.drop(self._getDropIndices(ser_y.index))
        if len(ser_y[ser_y == cn.STATE_NORMOXIA])  \
            <= MIN_NUM_NORMOXIA:
            ser_y[ser_y == cn.STATE_NORMOXIA] = cn.STATE_RESCUSCITATION
            states.remove("Normoxia")
        # Create converter from state name to numeric index
        self.state_dct = {k: v for v, k in enumerate(states)}
        self.ser_y = ser_y.apply(lambda k: self.state_dct[k])

    def _getDropIndices(self, indices, drop_index=cn.TIME_0):
        """
        Handles dropping the time index when replicas are present.
        """
        result = []
        for idx in indices:
            splits = idx.split(data_provider.SEPARATOR)
            if splits[0] == drop_index:
                result.append(idx)
        return result
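A sketch contrasting the two read-count modes (requires the local data files used by DataProvider):

avg = NormalizedData(is_averaged=True)
repl = NormalizedData(is_averaged=False)
print(len(avg.df_X), len(repl.df_X))   # replicated data keeps one row per replica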
Example #19
class TestFunctions(unittest.TestCase):

  def setUp(self):
    if IGNORE_TEST:
      return
    self._init()

  def _init(self):
    self.provider = DataProvider()
    self.provider.do()

  def testMakeTrinaryData(self):
    if IGNORE_TEST:
      return
    df = transform_data.makeTrinaryData(
        df=self.provider.df_normalized)
    columns = self.provider.df_normalized.columns
    self.assertTrue(helpers.isValidDataFrame(df, columns))

  def testAggregateGenes(self):
    if IGNORE_TEST:
      return
    provider = DataProvider()
    provider.do()
    df = transform_data.aggregateGenes(provider=provider)
    self.assertTrue(helpers.isValidDataFrame(df,
        provider.df_normalized.columns))

  def testTrinaryReadsDF1(self):
    if IGNORE_TEST:
      return
    provider = DataProvider()
    provider.do()
    df = provider.dfs_read_count[0]
    df_result = transform_data.trinaryReadsDF(
        df_sample=df)
    # See if number of "-1" is excessive
    dff = df_result + df_result.applymap(lambda v: -np.abs(v))
    frac_minus1 = -dff.sum().sum()  \
        /(2*len(df_result)*len(df_result.columns))
    self.assertLess(frac_minus1, 0.25)
    # Smoke tests for csv
    df_result = transform_data.trinaryReadsDF(
        csv_file="AM_MDM_Mtb_transcripts_DEseq.csv",
        is_time_columns=False)

  # TODO: Fix so working with the same transformation of features,
  #       either all genes features or all gene-groups.
  def testTrinaryReadsDF2(self):
    return
    # Checks that trinary values computed directly from reads
    # are the same as those of normalized samples.
    # Get raw value of read counts
    provider = DataProvider()
    provider.do()
    #
    def calcTrinaryTimeSample(time_index):
        """
        Calculates the trinary value of a time sample
        :param str time_index: name of time value
        """
        int_index = int(time_index[1:])
        df0 = provider.dfs_read_count[0]
        num = len(provider.dfs_read_count)
        ser = pd.Series(np.repeat(0, len(df0.index)), index=df0.index)
        for idx in range(num):
            ser += provider.dfs_read_count[idx][int_index]
        df = pd.DataFrame(ser/num)
        df_result = transform_data.trinaryReadsDF(df_sample=df)
        return df_result.T
    #
    data = TrinaryData()
    data.df_X.columns = data.features
    for time_index in data.df_X.index:
      df_result = calcTrinaryTimeSample(time_index)
      # TODO: compare df_result with the corresponding row of data.df_X
        
  def testCalcTrinaryComparison(self):
    if IGNORE_TEST:
      return
    df_in = pd.DataFrame({'a': [4, 0.20, 1]})
    df_expected = pd.DataFrame({'a': [1, -1, 0]})
    ser_ref = pd.Series(np.repeat(1, len(df_in)))
    df_out = transform_data.calcTrinaryComparison(df_in, ser_ref,
        is_convert_log2=True)
    self.assertTrue(df_out.equals(df_expected))

  def testStripReplicaString(self):
    if IGNORE_TEST:
      return
    TIME = "TO"
    SIZE = 3
    names = ["%s.%d" % (TIME, n) for n in range(SIZE)]
    result = transform_data.stripReplicaString(names)
    self.assertEqual(result[0], TIME)
    self.assertEqual(len(result), SIZE)

  def testRemoveGenesWithExcessiveReplicationVariance(self):
    if IGNORE_TEST:
      return
    trinary = TrinaryData(is_averaged=False, is_dropT1=False,
        is_regulator=False)
    df_base = transform_data.removeGenesWithExcessiveReplicationVariance(
        trinary.df_X)
    for max_var in [1, 2, 3]:
      df = transform_data.removeGenesWithExcessiveReplicationVariance(
          trinary.df_X, max_var=max_var)
      self.assertGreaterEqual(len(df_base.columns), len(df.columns))

  def testConvertToLog2(self):
    # Restored enclosing test (assumed name) for the log2 round-trip checks
    if IGNORE_TEST:
      return
    ser = util.convertToLog2(SER)
    ser1 = util.unconvertFromLog2(ser)
    ser1.loc[0] = 0
    trues = [np.isclose(v1, v2) for v1, v2 in zip(ser1, SER)]
    self.assertTrue(all(trues))

  def testMakeBioreactorT0ReferenceData(self):
    if IGNORE_TEST:
      return
    ser = transform_data.makeBioreactorT0ReferenceData()
    self.assertTrue(isinstance(ser, pd.Series))
    self.assertGreater(ser.min(), 0)
    self.assertGreater(len(ser), 10)
Example #20
def _runState(arguments):
    """
    Does case evaluation for all instances for a single state.
    Run in multiple processes concurrently.

    Parameters
    ----------
    arguments: object with the attributes
        state: int
        df: pd.DataFrame
            instances of feature vectors
        num_fset: int

    Returns
    -------
    pd.DataFrame
        FEATURE_VECTOR
        SIGLVL: significance level of FRAC
        STATE: state analyzed
        INSTANCE: instance from the data feature vector
        COUNT: number of cases
        FRAC: fraction of positive cases
    """
    state = arguments.state
    df_instance = arguments.df
    num_fset = arguments.num_fset
    #
    shared_data = SharedData()
    fset_selector = lambda f: True
    dfs = []
    for instance in df_instance.index:
        ser_X = df_instance.loc[instance, :]
        collection = shared_data.collection_dct[state]
        df = collection.getFVEvaluations(ser_X,
                                         fset_selector=fset_selector,
                                         num_fset=num_fset,
                                         max_sl=MAX_SL)
        if len(df) > 0:
            df[cn.STATE] = state
            df[INSTANCE] = instance
        dfs.append(df)
    df_result = pd.concat(dfs)
    df_result.index = range(len(df_result.index))
    # Augment the dataframe with gene descriptions
    provider = DataProvider()
    provider.do()
    df_go = provider.df_go_terms
    descriptions = []
    for stg in df_result[ccn.FEATURE_VECTOR]:
        if not isinstance(stg, str):
            descriptions.append("")
        else:
            feature_vector = FeatureVector.make(stg)
            features = feature_vector.fset.set
            description = []
            for feature in features:
                df_sub = df_go[df_go[cn.GENE_ID] == feature]
                this_desc = [
                    "%s: %s " % (feature, f) for f in df_sub[cn.GO_TERM]
                ]
                description.extend(this_desc)
            description = "\n".join(description)
            descriptions.append(description)
    #
    df_result[cn.GENE_DESCRIPTION] = descriptions
    return df_result
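A usage sketch; arguments only needs the three attributes read at the top of _runState, and df_instances below stands for a feature-vector DataFrame you supply:

from types import SimpleNamespace

args = SimpleNamespace(state=1, df=df_instances, num_fset=10)
df_eval = _runState(args)   # one row per evaluated feature vector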
Example #21
class TestDataTransformer(unittest.TestCase):
    def setUp(self):
        if IGNORE_TEST:
            return
        self._init()

    def _init(self):
        self.provider = DataProvider()
        self.provider.do()

    def testMakeTrinaryData(self):
        if IGNORE_TEST:
            return
        df = transform_data.makeTrinaryData(df=self.provider.df_normalized)
        columns = self.provider.df_normalized.columns
        self.assertTrue(helpers.isValidDataFrame(df, columns))

    def testAggregateGenes(self):
        if IGNORE_TEST:
            return
        provider = DataProvider()
        provider.do()
        df = transform_data.aggregateGenes(provider=provider)
        self.assertTrue(
            helpers.isValidDataFrame(df, provider.df_normalized.columns))

    def testTrinaryReadsDF1(self):
        if IGNORE_TEST:
            return
        provider = DataProvider()
        provider.do()
        df = provider.dfs_read_count[0]
        df_result = transform_data.trinaryReadsDF(df_sample=df)
        # See if number of "-1" is excessive
        dff = df_result + df_result.applymap(lambda v: -np.abs(v))
        frac_minus1 = -dff.sum().sum()  \
            /(2*len(df_result)*len(df_result.columns))
        self.assertLess(frac_minus1, 0.25)
        # Smoke tests for csv
        df_result = transform_data.trinaryReadsDF(
            csv_file="AM_MDM_Mtb_transcripts_DEseq.csv",
            is_display_errors=False)

    # TODO: Fix so working with the same transformation of features,
    #       either all genes features or all gene-groups.
    def testTrinaryReadsDF2(self):
        return
        # Checks that trinary values computed directly from reads
        # are the same as those of normalized samples.
        # Get raw value of read counts
        provider = DataProvider()
        provider.do()

        #
        def calcTrinaryTimeSample(time_index):
            """
        Calculates the trinary value of a time sample
        :param str time_index: name of time value
        """
            int_index = int(time_index[1:])
            df0 = provider.dfs_read_count[0]
            num = len(provider.dfs_read_count)
            ser = pd.Series(np.repeat(0, len(df0.index)), index=df0.index)
            for idx in range(num):
                ser += provider.dfs_read_count[idx][int_index]
            df = pd.DataFrame(ser / num)
            df_result = transform_data.trinaryReadsDF(df_sample=df)
            return df_result.T

        #
        data = TrinaryData()
        data.df_X.columns = data.features
        for time_index in data.df_X.index:
            df_result = calcTrinaryTimeSample(time_index)
            # TODO: compare df_result with the corresponding row of data.df_X

    def testCalcTrinaryComparison(self):
        if IGNORE_TEST:
            return
        df_in = pd.DataFrame({'a': [4, 0.20, 1]})
        df_expected = pd.DataFrame({'a': [1, -1, 0]})
        df_out = transform_data.calcTrinaryComparison(df_in)
        self.assertTrue(df_out.equals(df_expected))
        #
        df_out = transform_data.calcTrinaryComparison(df_in,
                                                      ser_ref=df_in['a'])
        trues = [v == 0 for v in df_out['a']]
        self.assertTrue(all(trues))
Example #22
 def initialize(self):
     """
      Initializes the data. Defines and initializes all names added to globals().
 """
     #
     T0 = "T0"
     POOLED = "pooled"
     self._addName("T0", "T0")
     self._addName("POOLED", "pooled")
     self._addName("REF_TYPE_POOLED", REF_TYPE_POOLED)
     self._addName("REF_TYPE_BIOREACTOR", REF_TYPE_BIOREACTOR)
     self._addName("REF_TYPE_SELF", REF_TYPE_SELF)
     # Provider
     PROVIDER = DataProvider()
     self._addName("PROVIDER", PROVIDER)
     PROVIDER.do()
     TRINARY = TrinaryData()
     self._addName("TRINARY", TRINARY)
     # Gene Classes
     ALL_GENES = list(TRINARY.df_X.columns)
     self._addName("ALL_GENES", ALL_GENES)
     # Gene groupings. Added later so can include top12 from classifier
     MYCOBACTIN_GENES = [
         "Rv2377c",
         "Rv2378c",
         "Rv2379c",
         "Rv2380c",
         "Rv2381c",
         "Rv2382c",
         "Rv2383c",
         "Rv2384",
         "Rv2385",
         "Rv2386c",
     ]
     self._addName("MYCOBACTIN_GENES", MYCOBACTIN_GENES)
     BACTERIOFERRITIN_GENES = [
         "Rv2341",
         "Rv3841",
     ]
     self._addName("BACTERIOFERRITIN_GENES", BACTERIOFERRITIN_GENES)
     MYCOBACTIN_BACTERIOFERRIN_GENES = list(MYCOBACTIN_GENES)
     self._addName("MYCOBACTIN_BACTERIOFERRIN_GENES",
                   MYCOBACTIN_BACTERIOFERRIN_GENES)
     MYCOBACTIN_BACTERIOFERRIN_GENES.extend(BACTERIOFERRITIN_GENES)
     MYCOBACTIN_BACTERIOFERRITIN = "mycobactin_bacterioferritin"
     BACTERIOFERRITIN = "bacterioferritin"
     MYCOBACTIN = "mycobactin"
     ALL = "all"
     GENE_DCT = {
         MYCOBACTIN: MYCOBACTIN_GENES,
         BACTERIOFERRITIN: BACTERIOFERRITIN_GENES,
         MYCOBACTIN_BACTERIOFERRITIN: MYCOBACTIN_BACTERIOFERRIN_GENES,
         ALL: ALL_GENES,
     }
     # Define the stage names
     STAGE_NAMES = list(cn.STATE_NAMES)
     self._addName("STAGE_NAMES", STAGE_NAMES)
     STAGE_NAMES.remove("Normoxia")
     STAGE_NAMES = np.array(STAGE_NAMES)
     # Bioreactor data calculated with two different references
     DATA_DCT = {
         T0:
         TrinaryData(is_regulator=False, is_dropT1=True, is_averaged=True),
         POOLED:
         TrinaryData(is_regulator=False,
                     is_dropT1=True,
                     is_averaged=True,
                     calcRef=PROVIDER.calcRefPooled)
     }
     self._addName("DATA_DCT", DATA_DCT)
     SER_Y_DCT = {k: t.ser_y for k, t in DATA_DCT.items()}
     self._addName("SER_Y_DCT", SER_Y_DCT)
     # Feature vectors are specific to the gene subsets
     DF_X_DCT = {k: t.df_X.copy() for k, t in DATA_DCT.items()}
     DF_X_DCT = {k: df[MYCOBACTIN_GENES] for k, df in DF_X_DCT.items()}
     self._addName("DF_X_DCT", DF_X_DCT)
     # Sample data
     SAMPLE_DCT = {
         r: sample_data.getSampleData(ref_type=r, is_regulator=False)
         for r in [REF_TYPE_BIOREACTOR, REF_TYPE_SELF, REF_TYPE_POOLED]
     }
     self._addName("SAMPLE_DCT", SAMPLE_DCT)
     SAMPLE_AVG_DCT = {
         r: sample_data.getSampleData(ref_type=r,
                                      is_regulator=False,
                                      is_average=True)
         for r in [REF_TYPE_BIOREACTOR, REF_TYPE_SELF, REF_TYPE_POOLED]
     }
     self._addName("SAMPLE_AVG_DCT", SAMPLE_AVG_DCT)
     # Classifiers
     num_feature = len(MYCOBACTIN_BACTERIOFERRIN_GENES)
     CLASSIFIER_BASE = classifier_ensemble.ClassifierEnsemble(
         classifier_ensemble.ClassifierDescriptorSVM(),
         filter_high_rank=num_feature,
         size=NUM_CLASSIFIER_IN_ENSEMBLE)
     self._addName("CLASSIFIER_BASE", CLASSIFIER_BASE)
     CLASSIFIER_DCT = {}
     self._addName("CLASSIFIER_DCT", CLASSIFIER_DCT)
     for trinary_key, trinary in DATA_DCT.items():
         for gene_key, gene_list in GENE_DCT.items():
             classifier = copy.deepcopy(CLASSIFIER_BASE)
             # Not all genes may be present in TrinaryData since they may be correlated or unvarying.
             df_X = dataframe.subset(trinary.df_X, gene_list, axis=1)
             classifier.fit(df_X, trinary.ser_y, class_names=STAGE_NAMES)
             CLASSIFIER_DCT[(trinary_key, gene_key)] = classifier
     # Calculate the rest of the gene groups and add them
     TOP12_T0 = "top12_T0"
     TOP12_POOLED = "top12_pooled"
     TOP12_T0_GENES = list(CLASSIFIER_DCT[(T0, ALL)].columns)
     TOP12_POOLED_GENES = list(CLASSIFIER_DCT[(POOLED, ALL)].columns)
     GENE_DCT[TOP12_T0] = TOP12_T0_GENES
     GENE_DCT[TOP12_POOLED] = TOP12_POOLED_GENES
     GENE_GROUPS = list(GENE_DCT.keys())
     self._addName("GENE_GROUPS", GENE_GROUPS)
     for name in GENE_GROUPS:
         self._addName(name.upper(), name)  # Add the name of each group
     self._addName("GENE_DCT", GENE_DCT)
     # Construct derivative structures
     self._addName("DF_X", DF_X_DCT[T0])
     self._addName("SER_Y", SER_Y_DCT[T0])
     self._addName("SAMPLE_DATA_DCT", SAMPLE_DCT[REF_TYPE_BIOREACTOR])
     self._addName("CLASSIFIER", CLASSIFIER_DCT[('T0', 'mycobactin')])
     key = (T0, "mycobactin_bacterioferritin")
     self._addName("GENES", CLASSIFIER_DCT[key].features)
     # Accuracy calculations for classifiers
     DF_ACCURACY = self.calcAccuracy()
     self._addName("DF_ACCURACY", DF_ACCURACY)