def testTrinaryReadsDF2(self): return # Checks that trinary values computed directly from reads # are the same as those of normalized samples. # Get raw value of read counts provider = DataProvider() provider.do() # def calcTrinaryTimeSample(time_index): """ Calculates the trinary value of a time sample :param str time_index: name of time value """ int_index = int(time_index[1:]) df0 = provider.dfs_read_count[0] num = len(provider.dfs_read_count) ser = pd.Series(np.repeat(0, len(df0.index)), index=df0.index) for idx in range(num): ser += provider.dfs_read_count[idx][int_index] df = pd.DataFrame(ser/num) df_result = transform_data.trinaryReadsDF(df_sample=df) return df_result.T # data = TrinaryData() data.df_X.columns = data.features for time_index in data.df_X.index: df_result = calcTrinaryTimeSample(time_index) import pdb; pdb.set_trace()
def trinaryReadsDF(csv_file=None, df_sample=None,
    csv_dir=cn.SAMPLES_DIR, is_display_errors=True):
  """
  Creates trinary values for read counts w.r.t. data provider.
  (a) adjusting for gene length, (b) library size,
  (c) log2, (d) ratio w.r.t. T0.
  Data may come from an existing dataframe or a CSV file.
  :param str csv_file: File in "samples" directory.
      columns are: "GENE_ID", instance ids
  :param pd.DataFrame df_sample: columns are genes,
      index are instances, values are raw readcounts
      NOTE(review): the CSV path below indexes the frame by "GENE_ID";
      confirm the orientation expected by normalizeReadsDF.
  :param str csv_dir: directory where csv file is found
  :param bool is_display_errors: passed to DataProvider
  :return pd.DataFrame: columns are genes,
      indexes are instances, trinary values
  :raises ValueError: neither df_sample nor csv_file is provided
  At least one of df_sample and csv_file must be non-null
  """
  # Fail fast, before the provider is loaded, instead of failing
  # later inside os.path.join with an uninformative TypeError.
  if (df_sample is None) and (csv_file is None):
    raise ValueError(
        "At least one of df_sample and csv_file must be non-null.")
  provider = DataProvider(is_display_errors=is_display_errors)
  provider.do()
  if df_sample is None:
    path = os.path.join(csv_dir, csv_file)
    df_sample = pd.read_csv(path)
    # Index the samples by gene and drop the now redundant column
    df_sample.index = df_sample['GENE_ID']
    del df_sample['GENE_ID']
  #
  df_normalized = provider.normalizeReadsDF(df_sample)
  # Compute trinary values relative to original reads
  dfs_adjusted = provider.dfs_adjusted_read_count
  df_ref = sum(dfs_adjusted) / len(dfs_adjusted)  # Mean values
  ser_ref = df_ref[cn.REF_TIME]
  return calcTrinaryComparison(df_normalized, ser_ref=ser_ref)
class NormalizedData(object):
  """ Exposes values described above. """

  def __init__(self, is_display_errors=True, is_averaged=True):
    """
    :param bool is_display_errors: Shows errors encountered
    :param bool is_averaged: Use averaged read counts
        (accepted but not used by this constructor)
    Public instance variables:
      df_X - normalized read counts; rows are instances ("T0" is
             dropped), columns are renumbered 0..n-1
      features - original column names of df_X, in column order
      state_dict - mapping of literal to numeric values of state
      ser_y - numeric value of state corresponding to each row in df_X
    """
    self._is_display_errors = is_display_errors
    self.provider = DataProvider(is_display_errors=self._is_display_errors)
    self.provider.do()
    # Feature matrix: instances in rows, without the T0 row
    df_instances = self.provider.df_normalized.T
    self.df_X = df_instances.drop(index="T0")
    self.features = self.df_X.columns.tolist()
    self.df_X.columns = range(len(self.features))
    # Create class information
    ser_stage = self.provider.df_stage_matrix[cn.STAGE_NAME]
    ser_stage = ser_stage.drop(index="T0").copy()
    is_normoxia = ser_stage == 'Normoxia'
    ser_stage[is_normoxia] = 'Resuscitation'
    # Create converter from state name to numeric index
    state_names = ser_stage.unique()
    self.state_dict = {
        name: number for number, name in enumerate(state_names)}
    self.ser_y = ser_stage.apply(lambda name: self.state_dict[name])
def aggregateGenes(df=None, provider=None):
  """
  Combines genes that are perfectly correlated in time for trinary
  values.
  :param DataFrame df: dataframe to transform; when None,
      provider.df_normalized is used
  :param DataProvider provider: uses df_normalized; constructed and
      loaded here when both df and provider are None
  :return pd.DataFrame: one row per group of genes with identical
      trinary values; the row label joins the gene names with "--";
      columns are those of the trinary data
  """
  if df is None:
    if provider is None:
      provider = DataProvider()
      provider.do()
    df = provider.df_normalized
  df_trinary = makeTrinaryData(df, is_include_nan=False)
  # Genes whose rows agree in every column fall in the same group
  all_columns = df_trinary.columns.tolist()
  gene_groups = df_trinary.groupby(all_columns).groups
  label_to_values = {
      "--".join(members): list(values)
      for values, members in gene_groups.items()
  }
  df_result = pd.DataFrame(label_to_values).T
  df_result.columns = df_trinary.columns
  return df_result
def getProvider(provider):
  """
  Returns a provider.
  :param DataProvider provider: returned unchanged unless it is None
  :return DataProvider: the argument, or a newly constructed provider
      on which do() has been called
  """
  if provider is not None:
    return provider
  new_provider = DataProvider()
  new_provider.do()
  return new_provider
def testAggregateGenes(self):
  """
  Verifies that aggregateGenes returns a valid dataframe having the
  columns of the provider's normalized data.
  """
  if IGNORE_TEST:
    return
  data_provider = DataProvider()
  data_provider.do()
  df_aggregated = transform_data.aggregateGenes(provider=data_provider)
  expected_columns = data_provider.df_normalized.columns
  is_valid = helpers.isValidDataFrame(df_aggregated, expected_columns)
  self.assertTrue(is_valid)
def __init__(self, df_trinary=None):
  """
  :param pd.DataFrame df_trinary: trinary valued DF
      (has values -1, 0, 1). When None, it is computed from the
      normalized data of a newly loaded DataProvider.
  """
  if df_trinary is None:
    provider = DataProvider()
    provider.do()
    # Pass the data loaded here; calling makeTrinaryData without df
    # makes it construct and load a second provider.
    df_trinary = transform_data.makeTrinaryData(
        df=provider.df_normalized, is_include_nan=False)
  self.df_trinary = df_trinary
  self.df_group = None  # Dataframe describing groups
  self.df_gene_group = None  # Genes by group
def _getData():
  """
  Obtains the feature matrix and state values of the replicated
  (non-averaged) trinary data.
  :return pd.DataFrame, pd.Series: feature matrix, restricted to the
      columns also present in provider.tfs when IS_ONLY_TFS is set;
      state values
  """
  provider = DataProvider()
  provider.do()
  trinary = TrinaryData(is_averaged=False, is_dropT1=False)
  df_X = trinary.df_X
  selected = df_X.columns
  if IS_ONLY_TFS:
    selected = set(selected).intersection(provider.tfs)
  return df_X[list(selected)], trinary.ser_y
class CVPlotter():
  """Plots based on the coefficients of variation (CV) of a provider."""

  def __init__(self, provider=None, is_plot=True):
    """
    :param DataProvider provider: source of df_cv and df_hypoxia;
        constructed and loaded here when None
    :param bool is_plot: show the figures
    """
    if provider is not None:
      self._provider = provider
    else:
      self._provider = DataProvider()
      self._provider.do()
    self._is_plot = is_plot

  def heatMap(self, min_cv=0):
    """
    Plots a heatmap of the coefficient of variations.
    :param float min_cv: minimum CV to consider; smaller values are
        replaced by nan before plotting
    """
    plt.figure(figsize=(16, 10))
    df_cv = self._provider.df_cv
    # Label the x axis with the column names, centered on the cells
    axis = plt.gca()
    tick_positions = np.arange(len(df_cv.columns))+0.5
    axis.set_xticks(tick_positions)
    axis.set_xticklabels(df_cv.columns)

    def keepLarge(value):
      if value >= min_cv:
        return value
      return np.nan

    df_plot = df_cv.applymap(keepLarge)
    heatmap = plt.pcolor(df_plot, cmap='jet')
    plt.colorbar(heatmap)
    plt.xlabel("times")
    plt.ylabel("gene")
    plt.title("Coefficient of Variation > %d percent" % min_cv)
    if self._is_plot:
      plt.show()

  def readsAndDO(self):
    """
    Plots the following lines for the hours of the experiments:
      Average CV of genes
      CV of dissolved oxygen (DO)
      Avg dissolved oxygen
    """
    df_hypoxia = self._provider.df_hypoxia
    hours = df_hypoxia[cn.HOURS]
    means = df_hypoxia[cn.MEAN]
    # Error bars are two standard deviations
    error_bars = [2*std for std in df_hypoxia[cn.STD]]
    plt.errorbar(hours, means, yerr=error_bars, marker="o")
    axis = plt.gca()
    # Plot CVs of genes
    ser_cv = self._provider.df_cv.mean()  # Average over genes
    axis.plot(hours, ser_cv.values, linestyle='dashed',
        marker="o", color='r')
    plt.xlabel("hours")
    plt.ylabel("DO/CV")
    plt.legend(["CV for read counts", "DO +/- 2 std"])
    if self._is_plot:
      plt.show()
class TestFunctions(unittest.TestCase):
  """Tests for functions in util_classifier."""

  def setUp(self):
    self.trinary = TrinaryData()
    self.provider = DataProvider()
    self.provider.do()

  def testCountTerms(self):
    """Checks term counts for default and explicit feature sets."""
    if IGNORE_TEST:
      return
    TERM = "DNA replication"
    EXPECTED_COUNT = 2
    def test(terms, expected_count, fset=None):
      """
      :param list-str terms: terms to count
      :param int expected_count: expected result of countTerms
      :param FeatureSet fset: feature set to use; defaults to one
          built from two gene ids of the GO term table
      """
      df_gene = self.provider.df_go_terms
      # Only construct the default when the caller supplied nothing,
      # so that an explicit fset is the one actually tested.
      if fset is None:
        fset = FeatureSet(df_gene[xcn.GENE_ID][1:3])
      count = util_classifier.countTerms(fset, terms)
      self.assertEqual(count, expected_count)
    #
    test([TERM], EXPECTED_COUNT)
    test([TERM, TERM], 2 * EXPECTED_COUNT)
    test(["DUMMY"], 0)
    #
    fset = FeatureSet(['Rv0981--Rv1332--Rv1828'])
    test(["DUMMY"], 0, fset=fset)

  def testCountTerms2(self):
    """Including module genes must increase the count."""
    if IGNORE_TEST:
      return
    TERMS = ["a"]
    fset = FeatureSet(["Rv2009"])
    count1 = util_classifier.countTerms(fset, TERMS,
        is_include_module=False)
    count2 = util_classifier.countTerms(fset, TERMS,
        is_include_module=True)
    self.assertGreater(count2, count1)

  def testExtractAggregatedGene(self):
    """An aggregated gene name splits back into its member genes."""
    if IGNORE_TEST:
      return
    GENES = ['Rv0981', 'Rv1332', 'Rv1828']
    AGGREGATED_GENE = xcn.GENE_SEPARATOR.join(GENES)
    genes = util_classifier.extractAggregatedGene(AGGREGATED_GENE)
    diff = set(GENES).symmetric_difference(genes)
    self.assertEqual(len(diff), 0)
def testTrinaryReadsDF1(self):
  """
  Checks that the fraction of -1 values produced from the first
  replication of read counts is below 0.25, then smoke tests the
  CSV input path.
  """
  if IGNORE_TEST:
    return
  provider = DataProvider()
  provider.do()
  df_reads = provider.dfs_read_count[0]
  df_result = transform_data.trinaryReadsDF(df_sample=df_reads)
  # See if number of "-1" is excessive.
  # v - |v| is -2 for v == -1 and 0 for v in {0, 1}.
  df_negated_abs = df_result.applymap(lambda value: -np.abs(value))
  df_minus_two = df_result + df_negated_abs
  num_rows = len(df_result)
  num_columns = len(df_result.columns)
  total = df_minus_two.sum().sum()
  frac_minus1 = -total/(2*num_rows*num_columns)
  self.assertLess(frac_minus1, 0.25)
  # Smoke tests for csv
  df_result = transform_data.trinaryReadsDF(
      csv_file="AM_MDM_Mtb_transcripts_DEseq.csv",
      is_display_errors=False)
def _getData(state):
  """
  Obtains data for a binary classifier for the class.
  :param int state: state for which classification is done
  :return pd.DataFrame, pd.Series: feature matrix restricted to the
      columns also present in provider.tfs; 1 where the state value
      equals state, otherwise 0
  """
  provider = DataProvider()
  provider.do()
  trinary = TrinaryData(is_averaged=False, is_dropT1=False)
  df_X = trinary.df_X
  tf_columns = list(set(df_X.columns).intersection(provider.tfs))

  def toBinary(value):
    if value == state:
      return 1
    return 0

  ser_y = trinary.ser_y.apply(toBinary)
  return df_X[tf_columns], ser_y
class TestTermAnalyzer(unittest.TestCase):
  """Tests for term_analyzer.TermAnalyzer using the EC terms."""

  def setUp(self):
    self.provider = DataProvider()
    self.provider.do()
    df_terms = self.provider.df_ec_terms
    self.analyzer = term_analyzer.TermAnalyzer(df_terms, is_plot=IS_PLOT)

  def testConstructor(self):
    """The constructor produces a valid term dataframe."""
    if IGNORE_TEST:
      return
    df_term = self.analyzer.df_term
    is_valid = helpers.isValidDataFrame(df_term, df_term.columns)
    self.assertTrue(is_valid)

  def testPlotTermHeatmap(self):
    """Smoke test for the heatmap plot."""
    if IGNORE_TEST:
      return
    self.analyzer.plotTermHeatmap(is_plot=IS_PLOT)
def makeTrinaryData(df=None, min_abs=1.0, is_include_nan=True):
  """
  Thresholds data based on its absolute magnitude.
  Values are assigned as -1, 0, 1. A value is 0 if its magnitude is
  less than min_abs or if it is nan; otherwise it is -1 if negative
  and 1 if not. The input dataframe is not modified.
  :param pd.DataFrame df: default is provider.df_normalized
                          values are in log2 units
  :param float min_abs: minimal absolute value to threshold.
  :param bool is_include_nan: Report the 0 entries as nan;
                              else they are left as 0
  :return pd.DataFrame: same index and columns as df
  """
  if df is None:
    provider = DataProvider()
    provider.do()
    df = provider.df_normalized
  values = df.to_numpy(dtype=float)
  # Sign of each value; a value of exactly 0 that passes the
  # threshold (min_abs <= 0) is assigned 1.
  trinary = np.where(values < 0, -1, 1)
  # nan compares False with everything, so it must be tested
  # explicitly or it would be classified as 1.
  is_zero = np.isnan(values) | (np.abs(values) < min_abs)
  trinary[is_zero] = 0
  df_result = pd.DataFrame(trinary, index=df.index, columns=df.columns)
  if is_include_nan:
    df_result = df_result.where(df_result != 0)
  return df_result
def countTerms(fset, terms, is_include_module=True):
  """
  Counts the occurrences of terms in the GO terms
  of genes in the FeatureSet.

  Parameters
  ----------
  fset: FeatureSet
  terms: list-str
  is_include_module: bool
      consider all genes in modules activated by a gene in fset

  Returns
  -------
  int
      sum over terms of the number of times the term occurs as a
      substring in the GO terms of the genes
  """
  provider = DataProvider()
  provider.do()
  # Extract the genes
  genes = []
  for feature in fset.list:
    genes.extend(extractAggregatedGene(feature))
  if is_include_module:
    df_trn = provider.df_trn_unsigned
    # Set membership avoids scanning a list for every gene
    tfs = set(df_trn[xcn.TF].tolist())
    new_genes = []
    for gene in genes:
      if gene in tfs:
        df_module = df_trn[df_trn[xcn.TF] == gene]
        new_genes.extend(df_module[xcn.GENE_ID].tolist())
    genes.extend(new_genes)
    genes = list(set(genes))
  # Compile the string of go terms for the genes
  df_go = provider.df_go_terms
  ser_gene_id = df_go[xcn.GENE_ID]
  indices = []
  for gene in genes:
    indices.extend(df_go.index[ser_gene_id == gene].tolist())
  go_terms = df_go.loc[indices, xcn.GO_TERM].to_list()
  go_str = "****".join(go_terms)
  return sum(go_str.count(term) for term in terms)
class TermMatrix(object):
  """
  The core dataframe is the term matrix, self.df_matrix.
  Its columns are terms; the rows are groups of correlated genes.
  A group is a tuple of trinary values indicating when that
  terms is expressed.
  """

  def __init__(self, term_column=cn.GO_TERM, is_plot=True, **kwargs):
    """
    :param str term_column: column in go_terms to use for text
    :param bool is_plot: show the plots that are constructed
    :param dict **kwargs: arguments to DataGrouper
    """
    self._is_plot = is_plot
    self.provider = DataProvider()
    self.provider.do()
    self.grouper = DataGrouper(**kwargs)
    self.grouper.do(min_size=1)
    self.df_matrix = self._makeMatrix(term_column)
    self.df_gene_term = self._makeGeneTerm()

  def _makeTermGroup(self, term_column=cn.GO_TERM):
    """
    :param str term_column: column in go_terms to use for text
    :return pd.DataFrame:
        index - group (time intervals with trinary values)
        column - Term
    """
    df = self.grouper.df_gene_group.merge(self.provider.df_go_terms,
        left_index=True, right_index=True, how='inner')
    if term_column == cn.INDEX:
      # The term is the index of the merged frame
      df_term = df[[cn.GROUP]].copy()
      df_term[term_column] = df.index
    else:
      df_term = df[[cn.GROUP, term_column]].copy()
    df_term = df_term.set_index(cn.GROUP)
    return df_term

  def _makeMatrix(self, term_column=cn.GO_TERM):
    """
    :param str term_column: column in go_terms to use for text
    :return pd.DataFrame: matrix with the terms
    """
    df_term = self._makeTermGroup(term_column=term_column)
    df_result = util_text.makeTermMatrix(df_term[term_column])
    return df_result

  def _makeGeneTerm(self):
    """
    Finds the genes and terms that co-occur at the same times.

    :return pd.DataFrame:
        cn.GROUP - trinary values for genes at times
        cn.TERM - list of GO terms
        cn.GENE_ID - list of genes
        cn.CNT_TERM - count of GO terms
        cn.CNT_GENE - count of genes
        cn.CNT_REGULATED - count of times up- down-regulated
    """
    def makeGroupedDF(df):
      df = df.reset_index()
      return df.groupby(cn.GROUP)
    def extract(df, key, col):
      sel = df.index == key
      result = df[sel][col].values.tolist()
      return result
    #
    df_term = self._makeTermGroup()
    df_gene = self.grouper.df_gene_group
    df_gene = df_gene.reset_index()
    df_gene = df_gene.set_index(cn.GROUP)
    dfg_term = makeGroupedDF(df_term)
    dfg_gene = makeGroupedDF(self.grouper.df_gene_group)
    # Find the keys in common
    keys_term = [k for k in dfg_term.groups]
    keys_gene = [k for k in dfg_gene.groups]
    keys_common = set(keys_term).intersection(keys_gene)
    dict_df = {cn.GROUP: [], cn.TERM: [], cn.GENE_ID: []}
    for key in keys_common:
      dict_df[cn.GROUP].append(key)
      dict_df[cn.TERM].append(extract(df_term, key, cn.GO_TERM))
      dict_df[cn.GENE_ID].append(extract(df_gene, key, cn.GENE_ID))
    df_result = pd.DataFrame(dict_df)
    df_result[cn.CNT_GENE] = [len(g) for g in df_result[cn.GENE_ID]]
    df_result[cn.CNT_TERM] = [len(t) for t in df_result[cn.TERM]]
    # Number of times at which the group is not 0
    df_result[cn.CNT_REGULATED] = \
        [sum([np.abs(x) for x in g]) for g in df_result[cn.GROUP]]
    return df_result

  def makeAggregationMatrix(self, predicates):
    """
    Creates a matrix with columns the same as df_matrix and
    row i that is the summation of the values in rows that
    satisfy predicate i.

    :param list-BooleanFunc predicates: predicate on group tuples
    :return pd.DataFrame: one row per predicate
    """
    columns = self.df_matrix.columns
    column_values = {c.strip(): [] for c in columns}
    for pos, _ in enumerate(predicates):
      row = np.repeat(0.0, len(columns))
      row = row.reshape(1, len(columns))
      # TODO: Fix predicates. The predicate itself is not evaluated;
      # a group is selected for row i when its i-th element is 1.
      # The intended selection is predicate(group).
      for group in self.df_matrix.index:
        if group[pos] == 1:
          values = np.array(self.df_matrix.loc[[group], :])
          row += values
      row = row.reshape(len(columns))
      # Add the row for this predicate
      for idx, col in enumerate(columns):
        column_values[col].append(row[idx])
    return pd.DataFrame(column_values)

  # TODO: Fix use of predicates
  def plotAggregation(self, predicates, min_val=0,
      is_include_ylabels=True):
    """
    Plots a heatmap of the aggregation matrix in the second panel of
    a 1x2 subplot grid.

    :param list-BooleanFunc predicates: predicate on group tuples
    :param float min_val: smaller values are not displayed
    :param bool is_include_ylabels: label the y axis with the terms
    """
    df = self.makeAggregationMatrix(predicates)
    df = df.applymap(lambda v: 0 if v < min_val else v)
    df = df.applymap(lambda v: np.nan if np.isclose(v, 0) else v)
    # Drop columns that are all nans
    df = df.dropna(axis=1, how='all')
    # Construct the plot
    plt.subplot(1, 2, 2)
    heatmap = plt.pcolor(df.transpose(), cmap='jet')
    if is_include_ylabels:
      ax = plt.gca()
      ax.set_yticks(np.arange(len(df.columns)) + 0.5)
      ax.set_yticklabels(df.columns, fontsize=8)
    plt.title("Term Counts")
    plt.colorbar(heatmap)
    # Honor is_plot, as plotTimeAggregation does
    if self._is_plot:
      plt.show()

  def makeTimeAggregationMatrix(self, is_up_regulated=True):
    """
    Creates a matrix with columns the same as df_matrix and
    row i that is the summation of the values in rows whose
    group has the selected direction at time i.

    :param bool is_up_regulated: select groups with value 1 at the
        time; otherwise select groups with value -1
    :return pd.DataFrame: one row per time (cn.NUM_TIMES rows)
    """
    if is_up_regulated:
      direction = 1
    else:
      direction = -1
    columns = self.df_matrix.columns
    column_values = {c.strip(): [] for c in columns}
    for time in range(cn.NUM_TIMES):
      row = np.repeat(0.0, len(columns))
      row = row.reshape(1, len(columns))
      for group in self.df_matrix.index:
        if group[time] == direction:
          values = np.array(self.df_matrix.loc[[group], :])
          row += values
      row = row.reshape(len(columns))
      # Add the row for this time
      for idx, col in enumerate(columns):
        column_values[col].append(row[idx])
    return pd.DataFrame(column_values)

  def calcClusters(self, max_distance=1, is_up_regulated=True):
    """
    Calculates log significance levels and clusters.

    :param float max_distance: maximum distance between clusters,
        otherwise merged
        NOTE(review): this argument is not used; the clustering
        threshold below is fixed at 0.1. Confirm which is intended.
    :param bool is_up_regulated: passed to makeTimeAggregationMatrix
    :return pd.DataFrame, ndarray, pd.Series:
        df_log - log of significance level
        row_linkage - linkage matrix
          See https://stackoverflow.com/questions/9838861/scipy-linkage-format
        ser_cluster - cn.GROUP (indexed by term)
    """
    df = self.makeTimeAggregationMatrix(is_up_regulated=is_up_regulated)
    # Remove rows with zero variance
    df_filtered = util_statistics.filterZeroVarianceRows(df.T)
    # Compute significance levels
    df_log = util_statistics.calcLogSL(df_filtered, round_decimal=3)
    df_log = df_log.applymap(lambda v: HIGH_SL if np.isnan(v) else v)
    # Compute the clusters
    log_arrays = np.asarray(df_log)
    row_linkage = linkage(distance.pdist(log_arrays), method='average')
    ser_cluster = pd.Series(
        fcluster(row_linkage, 0.1, criterion="distance"))
    ser_cluster.index = df_log.index
    #
    return df_log, row_linkage, ser_cluster

  # Include state transitions
  # Note how clusters relate to state observations
  def plotTimeAggregation(self, is_up_regulated=True):
    """
    Plots aggregation of groups over time.

    :param bool is_up_regulated: plot up-regulated counts;
        otherwise down-regulated counts
    """
    df_log, row_linkage, _ = \
        self.calcClusters(is_up_regulated=is_up_regulated)
    # Heatmap
    cg = sns.clustermap(df_log, row_linkage=row_linkage,
        col_cluster=False, cbar_kws={"ticks": [0, 5]}, cmap="Blues")
    # Set the labels
    cg.ax_heatmap.set_xlabel("Time")
    if is_up_regulated:
      direction = "Up"
    else:
      direction = "Down"
    title = "-log10 zscores of %s-regulated term counts" % (direction)
    cg.ax_heatmap.set_title(title)
    xticks = cg.ax_heatmap.get_xticks() - 0.5  # Correct tick position
    cg.ax_heatmap.set_xticks(xticks)
    cg.ax_heatmap.set_yticks([])
    cg.ax_heatmap.set_yticklabels([])
    # Add the state transitions
    util_plots.plotStateTransitions(ymax=len(df_log),
        ax=cg.ax_heatmap, is_plot=False)
    #
    if self._is_plot:
      plt.show()
from common import data_provider import common.transform_data as transform_data from common_python.classifier import util_classifier import collections import copy import matplotlib.pyplot as plt import numpy as np import os import pandas as pd import seaborn as sns T1_INDEX = "T1" MIN_NUM_NORMOXIA = 2 # Minimum number of normoxia states PROVIDER = DataProvider(is_display_errors=False) PROVIDER.do() ################## FUNCTIONS ############### def subsetToRegulators(df): regulators = PROVIDER.df_trn_unsigned[cn.TF] regulators = list(set(regulators)) regulator_cols = list(set(df.columns).intersection(regulators)) for column in df.columns: if not column in regulator_cols: del df[column] def convertToTrinary(df, threshold_low=-1, threshold_high=1): """ Converts the dataframe to trinary values using the indicated thresholds.
class NormalizedData(object):
  """ Exposes values described above. """

  def __init__(self, is_averaged=True, is_regulator=False, **kwargs):
    """
    :param bool is_averaged: Use averaged read counts
    :param bool is_regulator: use regulators for TRN
    :param dict kwargs: options passed to DataProvider
    Public instance variables:
      df_X are normalized read counts
        instances are either times (begin with T) for stage (S)
      ser_y - numeric value of state corresponding to each row in df_X
      self.state_dct:
          key: state name
          value: state index
      self.features: list of names of gene
    """
    self.provider = DataProvider(**kwargs)
    self.provider.do()
    if not is_averaged:
      # Use the adjusted values for each replication
      df_replicas = [
          df.copy()
          for df in self.provider.dfs_adjusted_read_count_wrtT0_log2
      ]
      self.df_X = pd.concat([df.T for df in df_replicas])
    else:
      self.df_X = self.provider.df_normalized.T
    self.df_X = self.df_X.drop(self._getDropIndices(self.df_X.index))
    if is_regulator:
      subsetToRegulators(self.df_X)
    self.features = self.df_X.columns.tolist()
    # Create class information
    ser_y = self.provider.df_stage_matrix[cn.STAGE_NAME]
    if not is_averaged:
      # Replica information has a special time format
      def makeReplicaSer(replica):
        ser = ser_y.copy()
        ser.index = self.provider.makeTimes(suffix=replica)
        return ser

      num_repl = len(self.provider.dfs_read_count)
      ser_y = pd.concat([makeReplicaSer(n) for n in range(num_repl)])
    states = list(cn.STATE_NAMES)
    ser_y = ser_y.drop(self._getDropIndices(ser_y.index))
    is_normoxia = ser_y == cn.STATE_NORMOXIA
    if len(ser_y[is_normoxia]) <= MIN_NUM_NORMOXIA:
      # Too few normoxia instances: relabel them and drop the state
      ser_y[is_normoxia] = cn.STATE_RESCUSCITATION
      states.remove("Normoxia")
    # Create converter from state name to numeric index
    self.state_dct = {name: idx for idx, name in enumerate(states)}
    self.ser_y = ser_y.apply(lambda name: self.state_dct[name])

  def _getDropIndices(self, indices, drop_index=cn.TIME_0):
    """
    Handles dropping time index when replicas are present
    :param iterable-str indices: index values to examine
    :param str drop_index: time to drop
    :return list-str: indices whose text before the first
        data_provider.SEPARATOR equals drop_index
    """
    return [
        index for index in indices
        if index.split(data_provider.SEPARATOR)[0] == drop_index
    ]
class TestFunctions(unittest.TestCase):
  """Tests for functions in transform_data."""

  def setUp(self):
    if IGNORE_TEST:
      return
    self._init()

  def _init(self):
    self.provider = DataProvider()
    self.provider.do()

  def testMakeTrinaryData(self):
    if IGNORE_TEST:
      return
    df = transform_data.makeTrinaryData(
        df=self.provider.df_normalized)
    columns = self.provider.df_normalized.columns
    self.assertTrue(helpers.isValidDataFrame(df, columns))

  def testAggregateGenes(self):
    if IGNORE_TEST:
      return
    provider = DataProvider()
    provider.do()
    df = transform_data.aggregateGenes(provider=provider)
    self.assertTrue(helpers.isValidDataFrame(df,
        provider.df_normalized.columns))

  def testTrinaryReadsDF1(self):
    if IGNORE_TEST:
      return
    provider = DataProvider()
    provider.do()
    df = provider.dfs_read_count[0]
    df_result = transform_data.trinaryReadsDF(
        df_sample=df)
    # See if number of "-1" is excessive.
    # v - |v| is -2 for v == -1 and 0 for v in {0, 1}.
    dff = df_result + df_result.applymap(lambda v: -np.abs(v))
    frac_minus1 = -dff.sum().sum()  \
        /(2*len(df_result)*len(df_result.columns))
    self.assertLess(frac_minus1, 0.25)
    # Smoke tests for csv
    df_result = transform_data.trinaryReadsDF(
        csv_file="AM_MDM_Mtb_transcripts_DEseq.csv",
        is_time_columns=False)

  # TODO: Fix so working with the same transformation of features,
  #       either all genes features or all gene-groups.
  def testTrinaryReadsDF2(self):
    """
    Checks that trinary values computed directly from reads
    are the same as those of normalized samples.
    Currently disabled by the return below (see the TODO above).
    """
    return
    # Get raw value of read counts
    provider = DataProvider()
    provider.do()
    #
    def calcTrinaryTimeSample(time_index):
      """
      Calculates the trinary value of a time sample
      :param str time_index: name of time value
      """
      int_index = int(time_index[1:])
      df0 = provider.dfs_read_count[0]
      num = len(provider.dfs_read_count)
      ser = pd.Series(np.repeat(0, len(df0.index)), index=df0.index)
      for idx in range(num):
        ser += provider.dfs_read_count[idx][int_index]
      df = pd.DataFrame(ser/num)
      df_result = transform_data.trinaryReadsDF(df_sample=df)
      return df_result.T
    #
    data = TrinaryData()
    data.df_X.columns = data.features
    for time_index in data.df_X.index:
      df_result = calcTrinaryTimeSample(time_index)
      # Smoke check; replaces a debugger breakpoint that was left here.
      self.assertTrue(isinstance(df_result, pd.DataFrame))

  def testCalcTrinaryComparison(self):
    if IGNORE_TEST:
      return
    df_in = pd.DataFrame({'a': [4, 0.20, 1]})
    df_expected = pd.DataFrame({'a': [1, -1, 0]})
    ser_ref = pd.Series(np.repeat(1, len(df_in)))
    df_out = transform_data.calcTrinaryComparison(df_in, ser_ref,
        is_convert_log2=True)
    self.assertTrue(df_out.equals(df_expected))

  def testStripReplicaString(self):
    if IGNORE_TEST:
      return
    TIME = "TO"
    SIZE = 3
    names = ["%s.%d" % (TIME, n) for n in range(SIZE)]
    result = transform_data.stripReplicaString(names)
    self.assertEqual(result[0], TIME)
    self.assertEqual(len(result), SIZE)

  def testRemoveGenesWithExcessiveReplicationVariance(self):
    if IGNORE_TEST:
      return
    trinary = TrinaryData(is_averaged=False, is_dropT1=False,
        is_regulator=False)
    df_base = transform_data.removeGenesWithExcessiveReplicationVariance(
        trinary.df_X)
    for max_var in [1, 2, 3]:
      df = transform_data.removeGenesWithExcessiveReplicationVariance(
          trinary.df_X, max_var=max_var)
      self.assertGreaterEqual(len(df_base.columns), len(df.columns))
    # NOTE(review): the checks below exercise the log2 round trip in
    # util rather than replication variance; confirm they belong here.
    ser = util.convertToLog2(SER)
    ser1 = util.unconvertFromLog2(ser)
    ser1.loc[0] = 0
    trues = [np.isclose(v1, v2) for v1, v2 in zip(ser1, SER)]
    self.assertTrue(all(trues))

  def testMakeBioreactorT0ReferenceData(self):
    if IGNORE_TEST:
      return
    ser = transform_data.makeBioreactorT0ReferenceData()
    self.assertTrue(isinstance(ser, pd.Series))
    self.assertGreater(ser.min(), 0)
    self.assertGreater(len(ser), 10)
def _runState(arguments):
  """
  Does case evaluation for all instances for a single state.
  Run in multiple processes concurrently.

  Parameters
  ----------
  arguments: object with the attributes
      state: int
      df: pd.DataFrame
          Instances of feature vectors
      num_fset: int

  Return
  ------
  pd.DataFrame
      FEATURE_VECTOR
      SIGLVL: significance level of FRAC
      STATE: state analyzed
      INSTANCE: from data feature vector
      COUNT: number of cases
      FRAC: fraction of positive cases
      GENE_DESCRIPTION: GO terms of the genes in the feature vector
  """
  state = arguments.state
  df_instance = arguments.df
  num_fset = arguments.num_fset
  #
  shared_data = SharedData()

  def selectAll(_):
    return True

  # Evaluate each instance; only non-empty evaluations are kept
  evaluation_dfs = []
  for instance in df_instance.index:
    ser_X = df_instance.loc[instance, :]
    collection = shared_data.collection_dct[state]
    df_eval = collection.getFVEvaluations(ser_X,
        fset_selector=selectAll, num_fset=num_fset, max_sl=MAX_SL)
    if len(df_eval) > 0:
      df_eval[cn.STATE] = state
      df_eval[INSTANCE] = instance
      evaluation_dfs.append(df_eval)
  df_result = pd.concat(evaluation_dfs)
  df_result.index = range(len(df_result.index))
  # Augment the dataframe with gene descriptions
  provider = DataProvider()
  provider.do()
  df_go = provider.df_go_terms

  def describe(stg):
    # Values that are not strings have an empty description
    if not isinstance(stg, str):
      return ""
    lines = []
    for feature in FeatureVector.make(stg).fset.set:
      df_sub = df_go[df_go[cn.GENE_ID] == feature]
      lines.extend(
          ["%s: %s " % (feature, term) for term in df_sub[cn.GO_TERM]])
    return "\n".join(lines)

  descriptions = [describe(stg) for stg in df_result[ccn.FEATURE_VECTOR]]
  #
  df_result[cn.GENE_DESCRIPTION] = descriptions
  return df_result
class TestDataTransformer(unittest.TestCase):
  """Tests for functions in transform_data."""

  def setUp(self):
    if IGNORE_TEST:
      return
    self._init()

  def _init(self):
    self.provider = DataProvider()
    self.provider.do()

  def testMakeTrinaryData(self):
    if IGNORE_TEST:
      return
    df = transform_data.makeTrinaryData(df=self.provider.df_normalized)
    columns = self.provider.df_normalized.columns
    self.assertTrue(helpers.isValidDataFrame(df, columns))

  def testAggregateGenes(self):
    if IGNORE_TEST:
      return
    provider = DataProvider()
    provider.do()
    df = transform_data.aggregateGenes(provider=provider)
    self.assertTrue(
        helpers.isValidDataFrame(df, provider.df_normalized.columns))

  def testTrinaryReadsDF1(self):
    if IGNORE_TEST:
      return
    provider = DataProvider()
    provider.do()
    df = provider.dfs_read_count[0]
    df_result = transform_data.trinaryReadsDF(df_sample=df)
    # See if number of "-1" is excessive.
    # v - |v| is -2 for v == -1 and 0 for v in {0, 1}.
    dff = df_result + df_result.applymap(lambda v: -np.abs(v))
    frac_minus1 = -dff.sum().sum()  \
        /(2*len(df_result)*len(df_result.columns))
    self.assertLess(frac_minus1, 0.25)
    # Smoke tests for csv
    df_result = transform_data.trinaryReadsDF(
        csv_file="AM_MDM_Mtb_transcripts_DEseq.csv",
        is_display_errors=False)

  # TODO: Fix so working with the same transformation of features,
  #       either all genes features or all gene-groups.
  def testTrinaryReadsDF2(self):
    """
    Checks that trinary values computed directly from reads
    are the same as those of normalized samples.
    Currently disabled by the return below (see the TODO above).
    """
    return
    # Get raw value of read counts
    provider = DataProvider()
    provider.do()
    #
    def calcTrinaryTimeSample(time_index):
      """
      Calculates the trinary value of a time sample
      :param str time_index: name of time value
      """
      int_index = int(time_index[1:])
      df0 = provider.dfs_read_count[0]
      num = len(provider.dfs_read_count)
      ser = pd.Series(np.repeat(0, len(df0.index)), index=df0.index)
      for idx in range(num):
        ser += provider.dfs_read_count[idx][int_index]
      df = pd.DataFrame(ser / num)
      df_result = transform_data.trinaryReadsDF(df_sample=df)
      return df_result.T
    #
    data = TrinaryData()
    data.df_X.columns = data.features
    for time_index in data.df_X.index:
      df_result = calcTrinaryTimeSample(time_index)
      # Smoke check; replaces a debugger breakpoint that was left here.
      self.assertTrue(isinstance(df_result, pd.DataFrame))

  def testCalcTrinaryComparison(self):
    if IGNORE_TEST:
      return
    df_in = pd.DataFrame({'a': [4, 0.20, 1]})
    df_expected = pd.DataFrame({'a': [1, -1, 0]})
    df_out = transform_data.calcTrinaryComparison(df_in)
    self.assertTrue(df_out.equals(df_expected))
    # A frame compared with its own column as reference is all 0
    df_out = transform_data.calcTrinaryComparison(df_in,
        ser_ref=df_in['a'])
    trues = [v == 0 for v in df_out['a']]
    self.assertTrue(all(trues))
def initialize(self):
  """
  Initializes the data.
  Defines and initializes all names added to globals().

  Each value is registered with self._addName(name, value). Several
  lists are registered first and then changed in place, so the
  order of the statements below matters.
  NOTE(review): this presumes _addName keeps a reference to the
  object it is given rather than a copy - confirm in _addName.
  """
  #
  T0 = "T0"
  POOLED = "pooled"
  self._addName("T0", "T0")
  self._addName("POOLED", "pooled")
  self._addName("REF_TYPE_POOLED", REF_TYPE_POOLED)
  self._addName("REF_TYPE_BIOREACTOR", REF_TYPE_BIOREACTOR)
  self._addName("REF_TYPE_SELF", REF_TYPE_SELF)
  # Provider
  PROVIDER = DataProvider()
  self._addName("PROVIDER", PROVIDER)
  PROVIDER.do()
  TRINARY = TrinaryData()
  self._addName("TRINARY", TRINARY)
  # Gene Classes
  ALL_GENES = list(TRINARY.df_X.columns)
  self._addName("ALL_GENES", ALL_GENES)
  # Gene groupings. Added later so can include top12 from classifier
  MYCOBACTIN_GENES = [
      "Rv2377c",
      "Rv2378c",
      "Rv2379c",
      "Rv2380c",
      "Rv2381c",
      "Rv2382c",
      "Rv2383c",
      "Rv2384",
      "Rv2385",
      "Rv2386c",
  ]
  self._addName("MYCOBACTIN_GENES", MYCOBACTIN_GENES)
  BACTERIOFERRITIN_GENES = [
      "Rv2341",
      "Rv3841",
  ]
  self._addName("BACTERIOFERRITIN_GENES", BACTERIOFERRITIN_GENES)
  # Copy of the mycobactin genes, extended in place after it has
  # been registered.
  MYCOBACTIN_BACTERIOFERRIN_GENES = list(MYCOBACTIN_GENES)
  self._addName("MYCOBACTIN_BACTERIOFERRIN_GENES",
      MYCOBACTIN_BACTERIOFERRIN_GENES)
  MYCOBACTIN_BACTERIOFERRIN_GENES.extend(BACTERIOFERRITIN_GENES)
  MYCOBACTIN_BACTERIOFERRITIN = "mycobactin_bacterioferritin"
  BACTERIOFERRITIN = "bacterioferritin"
  MYCOBACTIN = "mycobactin"
  ALL = "all"
  # Gene group name -> list of genes; the top12 groups are added
  # below once the classifiers have been fit.
  GENE_DCT = {
      MYCOBACTIN: MYCOBACTIN_GENES,
      BACTERIOFERRITIN: BACTERIOFERRITIN_GENES,
      MYCOBACTIN_BACTERIOFERRITIN: MYCOBACTIN_BACTERIOFERRIN_GENES,
      ALL: ALL_GENES,
  }
  # Define the stage names
  STAGE_NAMES = list(cn.STATE_NAMES)
  self._addName("STAGE_NAMES", STAGE_NAMES)
  # Removes "Normoxia" from the list object registered above; the
  # array created next is bound to the local name only.
  STAGE_NAMES.remove("Normoxia")
  STAGE_NAMES = np.array(STAGE_NAMES)
  # Bioreactor data calculated with two different references
  DATA_DCT = {
      T0: TrinaryData(is_regulator=False, is_dropT1=True,
          is_averaged=True),
      POOLED: TrinaryData(is_regulator=False, is_dropT1=True,
          is_averaged=True, calcRef=PROVIDER.calcRefPooled)
  }
  self._addName("DATA_DCT", DATA_DCT)
  SER_Y_DCT = {k: t.ser_y for k, t in DATA_DCT.items()}
  self._addName("SER_Y_DCT", SER_Y_DCT)
  # Feature vectors are specific to the gene subsets
  # (here restricted to the mycobactin genes)
  DF_X_DCT = {k: t.df_X.copy() for k, t in DATA_DCT.items()}
  DF_X_DCT = {k: df[MYCOBACTIN_GENES] for k, df in DF_X_DCT.items()}
  self._addName("DF_X_DCT", DF_X_DCT)
  # Sample data
  SAMPLE_DCT = {
      r: sample_data.getSampleData(ref_type=r, is_regulator=False)
      for r in [REF_TYPE_BIOREACTOR, REF_TYPE_SELF, REF_TYPE_POOLED]
  }
  self._addName("SAMPLE_DCT", SAMPLE_DCT)
  SAMPLE_AVG_DCT = {
      r: sample_data.getSampleData(ref_type=r, is_regulator=False,
          is_average=True)
      for r in [REF_TYPE_BIOREACTOR, REF_TYPE_SELF, REF_TYPE_POOLED]
  }
  self._addName("SAMPLE_AVG_DCT", SAMPLE_AVG_DCT)
  # Classifiers
  num_feature = len(MYCOBACTIN_BACTERIOFERRIN_GENES)
  CLASSIFIER_BASE = classifier_ensemble.ClassifierEnsemble(
      classifier_ensemble.ClassifierDescriptorSVM(),
      filter_high_rank=num_feature, size=NUM_CLASSIFIER_IN_ENSEMBLE)
  self._addName("CLASSIFIER_BASE", CLASSIFIER_BASE)
  # Registered empty and filled in below;
  # keys are (reference key, gene group key).
  CLASSIFIER_DCT = {}
  self._addName("CLASSIFIER_DCT", CLASSIFIER_DCT)
  for trinary_key, trinary in DATA_DCT.items():
    for gene_key, gene_list in GENE_DCT.items():
      # Each combination gets its own copy of the base classifier
      classifier = copy.deepcopy(CLASSIFIER_BASE)
      # Not all genes may be present in TrinaryData since they may be correlated or unvarying.
      df_X = dataframe.subset(trinary.df_X, gene_list, axis=1)
      classifier.fit(df_X, trinary.ser_y, class_names=STAGE_NAMES)
      CLASSIFIER_DCT[(trinary_key, gene_key)] = classifier
  # Calculate the rest of the gene groups and add them
  TOP12_T0 = "top12_T0"
  TOP12_POOLED = "top12_pooled"
  # The columns of the classifiers fit on all genes
  TOP12_T0_GENES = list(CLASSIFIER_DCT[(T0, ALL)].columns)
  TOP12_POOLED_GENES = list(CLASSIFIER_DCT[(POOLED, ALL)].columns)
  GENE_DCT[TOP12_T0] = TOP12_T0_GENES
  GENE_DCT[TOP12_POOLED] = TOP12_POOLED_GENES
  GENE_GROUPS = list(GENE_DCT.keys())
  self._addName("GENE_GROUPS", GENE_GROUPS)
  for name in GENE_GROUPS:
    self._addName(name.upper(), name)  # Add the name of each group
  self._addName("GENE_DCT", GENE_DCT)
  # Construct derivative structures
  self._addName("DF_X", DF_X_DCT[T0])
  self._addName("SER_Y", SER_Y_DCT[T0])
  self._addName("SAMPLE_DATA_DCT", SAMPLE_DCT[REF_TYPE_BIOREACTOR])
  # The literal key equals (T0, MYCOBACTIN) as defined above
  self._addName("CLASSIFIER", CLASSIFIER_DCT[('T0', 'mycobactin')])
  key = (T0, "mycobactin_bacterioferritin")
  self._addName("GENES", CLASSIFIER_DCT[key].features)
  # Accuracy calculations for classifiers
  DF_ACCURACY = self.calcAccuracy()
  self._addName("DF_ACCURACY", DF_ACCURACY)