def test():
  persister = Persister(PERSISTER_PATH)
  self.assertTrue(persister.isExist())
  calculator = persister.get()
  self.assertTrue(isinstance(
      calculator.df_ria, pd.DataFrame))
  self.assertTrue(os.path.isfile(OUT_PATH))
def __init__(self, persister_path=None):
  """
  Parameters
  ----------
  persister_path: str
      path to persister file
  """
  if persister_path is None:
    persister_path = os.path.join(cn.DATA_DIR, DATA_FILE)
  self.persister = Persister(persister_path)
  self.namespace_dct = {}  # Items that go in the caller's namespace
class TestSharedData(unittest.TestCase):

  def deleteFiles(self):
    if os.path.isfile(PERSISTER_PATH):
      os.remove(PERSISTER_PATH)

  def setUp(self):
    self.deleteFiles()
    self.persister = Persister(PERSISTER_PATH)

  def tearDown(self):
    self.deleteFiles()

  def testConstructor(self):
    if IGNORE_TEST:
      return
    def test():
      data = shared_data.SharedData(persister=self.persister)
      self.assertTrue(isinstance(data.provider, DataProvider))
      self.assertTrue(isinstance(data.df_X, pd.DataFrame))
      self.assertTrue(isinstance(data.ser_y, pd.Series))
      self.assertTrue(isinstance(data.states, np.ndarray))
      self.assertEqual(len(data.states), len(data.collection_dct.keys()))
    # Test without persister
    test()
    self.assertTrue(self.persister.isExist())
    # Test with persister
    test()
def testRun(self):
  if IGNORE_TEST:
    return
  main.run(PERSISTER_PATH, True, max_iter=1, is_report=False,
      mcfo_kwargs=MCFO_KWARGS)
  persister = Persister(PERSISTER_PATH)
  self.assertTrue(persister.isExist())
  optimizer = persister.get()
  self.assertTrue(isinstance(optimizer.fit_result_dct, dict))
  #
  main.run(PERSISTER_PATH, False, max_iter=1, is_report=False,
      mcfo_kwargs=MCFO_KWARGS)
  optimizer2 = persister.get()
  for cls in optimizer.fit_result_dct.keys():
    self.assertTrue(
        len(optimizer.fit_result_dct[cls]) == len(
        optimizer2.fit_result_dct[cls]))
def do(self, data_dir=cn.DATA_DIR):
  """
  Assigns values to the instance data.
  """
  persister = Persister(cn.DATA_PROVIDER_PERSISTER_PATH)
  if persister.isExist():
    provider = persister.get()
    self._setValues(provider=provider)
  else:
    # Gene categorizations
    self.df_ec_terms = \
        self._makeDFFromCSV(FILENAME_EC_TERMS, is_index_geneid=True)
    self.df_ko_terms = \
        self._makeDFFromCSV(FILENAME_KO_TERMS, is_index_geneid=True)
    self.df_kegg_pathways = \
        self._makeDFFromCSV(FILENAME_KEGG_PATHWAYS, is_index_geneid=False)
    self.df_kegg_gene_pathways = \
        self._makeDFFromCSV(FILENAME_KEGG_GENE_PATHWAY, is_index_geneid=True)
    # GO Terms
    self.df_go_terms = self._makeGoTerms()
    # Gene expression for state
    self.df_gene_expression_state = self._makeDFFromCSV(
        FILENAME_GENE_EXPRESSION_STATE, is_index_geneid=True)
    # Gene description
    self.df_gene_description = self._makeGeneDescriptionDF()
    # Stages matrix
    self.df_stage_matrix = self._makeStageMatrixDF()
    # Normalized data values
    self.df_normalized = self._makeNormalizedDF()
    # Raw readcounts
    self.dfs_read_count = self._makeReadCountDFS()
    # Hypoxia data
    self.df_hypoxia = self._makeHypoxiaDF()
    # Create mean and std dataframes
    self.df_mean = self._makeMeanDF()
    self.df_std = self._makeStdDF()
    self.df_cv = 100 * self.df_std / self.df_mean
    persister.set(self)
def getPersister(path=None):
  if path is None:
    path = _makePath(filename=PERSISTER_FILE)
  return Persister(path)
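# Sketch (not from the original sources) of the Persister round trip that the
# surrounding snippets rely on. Only isExist(), get(), set(), and remove() are
# assumed; each of these calls appears in the surrounding snippets. The cache
# file name below is hypothetical.
from common_python.util.persister import Persister

persister = Persister("example_cache.pcl")  # hypothetical file name
if not persister.isExist():
  persister.set({"answer": 42})  # pickle any Python object to the file
cached = persister.get()  # later runs recover the object from the file
persister.remove()  # delete the backing file (as done in test tearDown)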
def tearDown(self):
  persister = Persister(cn.DATA_PROVIDER_PERSISTER_PATH)
  persister.remove()
class ClassificationData():
  # Data preparation constants

  def __init__(self, persister_path=None):
    """
    Parameters
    ----------
    persister_path: str
        path to persister file
    """
    if persister_path is None:
      persister_path = os.path.join(cn.DATA_DIR, DATA_FILE)
    self.persister = Persister(persister_path)
    self.namespace_dct = {}  # Items that go in the caller's namespace

  def initialize(self):
    """
    Initializes the data.
    Defines and initializes all names added to globals().
    """
    #
    T0 = "T0"
    POOLED = "pooled"
    self._addName("T0", "T0")
    self._addName("POOLED", "pooled")
    self._addName("REF_TYPE_POOLED", REF_TYPE_POOLED)
    self._addName("REF_TYPE_BIOREACTOR", REF_TYPE_BIOREACTOR)
    self._addName("REF_TYPE_SELF", REF_TYPE_SELF)
    # Provider
    PROVIDER = DataProvider()
    self._addName("PROVIDER", PROVIDER)
    PROVIDER.do()
    TRINARY = TrinaryData()
    self._addName("TRINARY", TRINARY)
    # Gene Classes
    ALL_GENES = list(TRINARY.df_X.columns)
    self._addName("ALL_GENES", ALL_GENES)
    # Gene groupings. Added later so can include top12 from classifier
    MYCOBACTIN_GENES = [
        "Rv2377c", "Rv2378c", "Rv2379c", "Rv2380c", "Rv2381c",
        "Rv2382c", "Rv2383c", "Rv2384", "Rv2385", "Rv2386c",
        ]
    self._addName("MYCOBACTIN_GENES", MYCOBACTIN_GENES)
    BACTERIOFERRITIN_GENES = [
        "Rv2341", "Rv3841",
        ]
    self._addName("BACTERIOFERRITIN_GENES", BACTERIOFERRITIN_GENES)
    MYCOBACTIN_BACTERIOFERRIN_GENES = list(MYCOBACTIN_GENES)
    self._addName("MYCOBACTIN_BACTERIOFERRIN_GENES",
        MYCOBACTIN_BACTERIOFERRIN_GENES)
    MYCOBACTIN_BACTERIOFERRIN_GENES.extend(BACTERIOFERRITIN_GENES)
    MYCOBACTIN_BACTERIOFERRITIN = "mycobactin_bacterioferritin"
    BACTERIOFERRITIN = "bacterioferritin"
    MYCOBACTIN = "mycobactin"
    ALL = "all"
    GENE_DCT = {
        MYCOBACTIN: MYCOBACTIN_GENES,
        BACTERIOFERRITIN: BACTERIOFERRITIN_GENES,
        MYCOBACTIN_BACTERIOFERRITIN: MYCOBACTIN_BACTERIOFERRIN_GENES,
        ALL: ALL_GENES,
        }
    # Define the stage names
    STAGE_NAMES = list(cn.STATE_NAMES)
    self._addName("STAGE_NAMES", STAGE_NAMES)
    STAGE_NAMES.remove("Normoxia")
    STAGE_NAMES = np.array(STAGE_NAMES)
    # Bioreactor data calculated with two different references
    DATA_DCT = {
        T0: TrinaryData(is_regulator=False, is_dropT1=True, is_averaged=True),
        POOLED: TrinaryData(is_regulator=False, is_dropT1=True,
            is_averaged=True, calcRef=PROVIDER.calcRefPooled),
        }
    self._addName("DATA_DCT", DATA_DCT)
    SER_Y_DCT = {k: t.ser_y for k, t in DATA_DCT.items()}
    self._addName("SER_Y_DCT", SER_Y_DCT)
    # Feature vectors are specific to the gene subsets
    DF_X_DCT = {k: t.df_X.copy() for k, t in DATA_DCT.items()}
    DF_X_DCT = {k: df[MYCOBACTIN_GENES] for k, df in DF_X_DCT.items()}
    self._addName("DF_X_DCT", DF_X_DCT)
    # Sample data
    SAMPLE_DCT = {r: sample_data.getSampleData(ref_type=r, is_regulator=False)
        for r in [REF_TYPE_BIOREACTOR, REF_TYPE_SELF, REF_TYPE_POOLED]}
    self._addName("SAMPLE_DCT", SAMPLE_DCT)
    SAMPLE_AVG_DCT = {r: sample_data.getSampleData(ref_type=r,
        is_regulator=False, is_average=True)
        for r in [REF_TYPE_BIOREACTOR, REF_TYPE_SELF, REF_TYPE_POOLED]}
    self._addName("SAMPLE_AVG_DCT", SAMPLE_AVG_DCT)
    # Classifiers
    num_feature = len(MYCOBACTIN_BACTERIOFERRIN_GENES)
    CLASSIFIER_BASE = classifier_ensemble.ClassifierEnsemble(
        classifier_ensemble.ClassifierDescriptorSVM(),
        filter_high_rank=num_feature, size=NUM_CLASSIFIER_IN_ENSEMBLE)
    self._addName("CLASSIFIER_BASE", CLASSIFIER_BASE)
    CLASSIFIER_DCT = {}
    self._addName("CLASSIFIER_DCT", CLASSIFIER_DCT)
    for trinary_key, trinary in DATA_DCT.items():
      for gene_key, gene_list in GENE_DCT.items():
        classifier = copy.deepcopy(CLASSIFIER_BASE)
        # Not all genes may be present in TrinaryData since they may be
        # correlated or unvarying.
        df_X = dataframe.subset(trinary.df_X, gene_list, axis=1)
        classifier.fit(df_X, trinary.ser_y, class_names=STAGE_NAMES)
        CLASSIFIER_DCT[(trinary_key, gene_key)] = classifier
    # Calculate the rest of the gene groups and add them
    TOP12_T0 = "top12_T0"
    TOP12_POOLED = "top12_pooled"
    TOP12_T0_GENES = list(CLASSIFIER_DCT[(T0, ALL)].columns)
    TOP12_POOLED_GENES = list(CLASSIFIER_DCT[(POOLED, ALL)].columns)
    GENE_DCT[TOP12_T0] = TOP12_T0_GENES
    GENE_DCT[TOP12_POOLED] = TOP12_POOLED_GENES
    GENE_GROUPS = list(GENE_DCT.keys())
    self._addName("GENE_GROUPS", GENE_GROUPS)
    for name in GENE_GROUPS:
      self._addName(name.upper(), name)  # Add the name of each group
    self._addName("GENE_DCT", GENE_DCT)
    # Construct derivative structures
    self._addName("DF_X", DF_X_DCT[T0])
    self._addName("SER_Y", SER_Y_DCT[T0])
    self._addName("SAMPLE_DATA_DCT", SAMPLE_DCT[REF_TYPE_BIOREACTOR])
    self._addName("CLASSIFIER", CLASSIFIER_DCT[('T0', 'mycobactin')])
    key = (T0, "mycobactin_bacterioferritin")
    self._addName("GENES", CLASSIFIER_DCT[key].features)
    # Accuracy calculations for classifiers
    DF_ACCURACY = self.calcAccuracy()
    self._addName("DF_ACCURACY", DF_ACCURACY)

  def _addName(self, name, value):
    """
    Adds the name and value to the namespace.

    Parameters
    ----------
    name: str
    value: object
    """
    stmt = "self.namespace_dct['%s'] = value" % name
    exec(stmt)

  def serialize(self):
    """
    Writes the current contents of self.namespace_dct to the persister.
    """
    self.persister.set(self.namespace_dct)

  def deserialize(self):
    """
    Recovers previously serialized data, initializing self.namespace_dct.
    """
    if not self.persister.isExist():
      raise ValueError(
          "Persister file %s does not exist. Use serialize first."
          % self.persister.path)
    self.namespace_dct = self.persister.get()
    return self.namespace_dct

  def setNamespace(self, globals_dct):
    """
    Sets the globals provided based on the initialized namespace.

    Parameters
    ----------
    globals_dct: dict
    """
    for name, value in self.namespace_dct.items():
      globals_dct[name] = value

  def get(self, globals_dct):
    """
    Deserializes an existing persister file and initializes the namespace.

    Parameters
    ----------
    globals_dct: dict
    """
    self.deserialize()
    self.setNamespace(globals_dct)

  def calcAccuracy(self, num_features=NUM_FEATURES, num_clf=100,
      is_debug=False):
    """
    Calculates the accuracy of classifiers using 10 iterations of
    cross validation with one holdout per state (stage).

    Parameters
    ----------
    num_features: list-int
    num_clf: number of classifiers in the ensemble
    is_debug: bool
        Creates dummy data

    Returns
    -------
    DataFrame:
        COL_REF: how reference is calculated for gene expressions
        COL_GENE_GROUP: grouping of genes used in classifier
        COL_NUM_FEATURE: number of features in classifiers
        COL_MEAN_ACCURACY: mean accuracy of the classifiers
        COL_STD_ACCURACY: standard deviation of accuracy
    """
    classifier_dct = self.namespace_dct["CLASSIFIER_DCT"]
    data_dct = self.namespace_dct["DATA_DCT"]
    gene_dct = self.namespace_dct["GENE_DCT"]
    line_dct = {r: l for r, l in zip(data_dct.keys(), ["-", "--"])}
    accuracy_dct = {c: [] for c in DF_ACCURACY_COLUMNS}
    for (ref, group), clf in classifier_dct.items():
      num_features = list(range(1, 13))
      num_features.insert(0, 1)
      trinary = copy.deepcopy(data_dct[ref])
      trinary.df_X = dataframe.subset(trinary.df_X, gene_dct[group])
      for num_feature in num_features:
        if is_debug:
          # Create a dummy value
          mean_accuracy = np.random.rand()
        else:
          mean_accuracy = clf.crossValidate(trinary, num_iter=10,
              num_holdout=1, filter_high_rank=num_feature, size=num_clf)
        accuracy_dct[COL_REF].append(ref)
        accuracy_dct[COL_GENE_GROUP].append(group)
        accuracy_dct[COL_NUM_FEATURE].append(num_feature)
        accuracy_dct[COL_MEAN_ACCURACY].append(mean_accuracy)
        std_accuracy = np.sqrt(mean_accuracy * (1 - mean_accuracy) / num_clf)
        accuracy_dct[COL_STD_ACCURACY].append(std_accuracy)
    df_accuracy = pd.DataFrame(accuracy_dct)
    return df_accuracy
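# Sketch (assumption, not taken from the original sources) of how a notebook
# might consume ClassificationData as defined above: build and cache the
# namespace once, then inject the cached names into globals() on later runs.
# The persister path below is hypothetical.
data = ClassificationData(persister_path="classification_data.pcl")
if not data.persister.isExist():
  data.initialize()  # expensive: fits classifiers and computes accuracies
  data.serialize()   # cache namespace_dct through the persister
data.get(globals())  # deserialize and bind names such as CLASSIFIER_DCT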
def getPersister(path=PERSISTER_PATH):
  return Persister(path)
def run(state, out_dir_pat=OUT_PATH_DIR_PAT, num_cross_iter=NUM_CROSS_ITER,
    is_status=False, report_interval=REPORT_INTERVAL, is_report=True,
    columns=None, is_restart=IS_RESTART, **kwargs):
  """
  Runs feature selection.

  :param int state: State being analyzed
  :param list-str columns: columns of df_X to use
  :param bool is_status: report status extracted from the persister
  :param dict kwargs: arguments for TrinaryData and optional arguments
      for FeatureAnalyzer
  """
  def calcLen(obj, func):
    if obj is None:
      return 0
    else:
      return func(obj)
  #
  CUR_LEN = "cur_len"
  MAX_LEN = "max_len"
  persister_path = PERSISTER_PATH_PAT % state
  df_X, ser_y = _getData(state, columns, **kwargs)
  if is_status:
    func = lambda d: len(d["score"])
    #
    persister = Persister(persister_path)
    analyzer = persister.get()
    pair_length = MAX_FEATURES_FOR_PAIRING*(MAX_FEATURES_FOR_PAIRING-1)/2 \
        + MAX_FEATURES_FOR_PAIRING
    dct = {
        "sfa": {
            CUR_LEN: calcLen(analyzer._sfa_dct, lambda d: len(d.keys())),
            MAX_LEN: len(df_X.columns)
            },
        "cpc": {
            CUR_LEN: calcLen(analyzer._cpc_dct, func),
            MAX_LEN: pair_length
            },
        "ipa": {
            CUR_LEN: calcLen(analyzer._ipa_dct, func),
            MAX_LEN: pair_length
            },
        }
    report_stg = "State %s: " % str(state)
    for metric in dct.keys():
      cur_length = 0
      if dct[metric][MAX_LEN] is not None:
        cur_length = dct[metric][CUR_LEN]
      frac = min(1.0, cur_length / dct[metric][MAX_LEN])
      report_stg = ("%s %s/%2.3f" % (report_stg, metric, frac))
    if is_report:
      print(report_stg)
  else:
    analyzer = feature_analyzer.FeatureAnalyzer(CLF, df_X, ser_y,
        max_features_for_pairing=MAX_FEATURES_FOR_PAIRING,
        persister_path=persister_path,
        num_cross_iter=num_cross_iter, report_interval=report_interval)
    out_dir = out_dir_pat % state
    _ = analyzer.serialize(out_dir, is_restart=is_restart)
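# Hypothetical invocations of run() as defined above: the first call performs
# feature selection for one state and serializes the results; the second only
# reports the progress recovered from the persister. The state value 1 is
# illustrative.
run(1)                  # run feature selection for state 1
run(1, is_status=True)  # print progress fractions recovered from the persister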
from common_python.util import dataframe
from common_python.util.persister import Persister
from tools import make_classification_data

import os
import shutil
import unittest

IGNORE_TEST = False
IS_PLOT = False
IS_CHANGED = False  # ClassificationData namespace has changed
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_FILE_PATH = os.path.join(TEST_DIR, make_classification_data.DATA_FILE)
DATA_FILE_PATH_TEST = os.path.join(
    TEST_DIR, "make_classification_data_save.pcl")
PERSISTER = Persister(DATA_FILE_PATH_TEST)
FILES = [DATA_FILE_PATH]
data = make_classification_data.ClassificationData(
    persister_path=DATA_FILE_PATH_TEST)
# Creates a file with the desired initializations, if necessary.
# Note: if there are changes in the state variables, you
# must set IS_CHANGED to True.
if not PERSISTER.isExist() or IS_CHANGED:
  data.initialize()
  data.serialize()


class TestClassificationData(unittest.TestCase):

  def deleteFiles(self):
    for ffile in FILES:
from common.data_provider import DataProvider
import common_python.classifier.feature_analyzer as fa
from common_python.classifier \
    import feature_set_collection as fsc
from common_python.classifier.feature_set import FeatureSet
from common import trinary_data
from common_python.util.persister import Persister

import argparse
import numpy as np
import os
import pandas as pd

DIR = os.path.dirname(os.path.abspath(__file__))
PERSISTER_PATH = os.path.join(DIR, "persister_shared_data.pcl")
PERSISTER = Persister(PERSISTER_PATH)
DIRECTORY = "feature_analyzer_averaged"


class SharedData(object):

  def __init__(self, persister=PERSISTER):
    if persister.isExist():
      shared_data = persister.get()
      for key in shared_data.__dict__.keys():
        self.__setattr__(key, shared_data.__getattribute__(key))
    else:
      self.provider = DataProvider()
      self.trinary = trinary_data.TrinaryData(is_averaged=False,
          is_dropT1=False, is_regulator=True)
      self.df_X = self.trinary.df_X
def setUp(self):
  self.deleteFiles()
  self.persister = Persister(PERSISTER_PATH)
import common.constants as cn
from common import sample_data
from common_python.util.persister import Persister

import numpy as np
import os
import pandas as pd
import unittest

IGNORE_TEST = False
IS_PLOT = False
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
PERSISTER_PATH = os.path.join(TEST_DIR, "test_sample_data_persister.pcl")
PERSISTER = Persister(PERSISTER_PATH)
got_sample = False
if PERSISTER.isExist():
  SAMPLE_DATA = PERSISTER.get()
  if SAMPLE_DATA is None:
    got_sample = False
  else:
    got_sample = True
if not got_sample:
  try:
    SAMPLE_DATA = sample_data.getSampleData()
    SAMPLE_DATA.initialize()
  except:
    SAMPLE_DATA = None
    print("***Proceeding without SAMPLE_DATA")
  PERSISTER.set(SAMPLE_DATA)
def do(self, data_dir=cn.DATA_DIR):
  """
  Assigns values to the instance data.
  """
  # Determine if can initialize from existing data
  persister = Persister(cn.DATA_PROVIDER_PERSISTER_PATH)
  is_initialized = False
  if not "is_reinitialize" in dir(self):
    self.is_reinitialize = False
  if persister.isExist():
    if not self.is_reinitialize:
      provider = persister.get()
      # See if there's a change in the calculation of reference values
      if self.calcRef == provider.calcRef:
        is_initialized = True
        self._setValues(provider=provider)
  if not is_initialized:
    # Do the initializations
    # Gene categorizations
    self.df_ec_terms = \
        self._makeDFFromCSV(FILENAME_EC_TERMS, is_index_geneid=True)
    self.df_ko_terms = \
        self._makeDFFromCSV(FILENAME_KO_TERMS, is_index_geneid=True)
    self.df_kegg_pathways = \
        self._makeDFFromCSV(FILENAME_KEGG_PATHWAYS, is_index_geneid=False)
    self.df_kegg_gene_pathways = \
        self._makeDFFromCSV(FILENAME_KEGG_GENE_PATHWAY, is_index_geneid=True)
    # Transcription Regulation Network
    self.df_trn_unsigned = self._makeDFFromCSV(FILENAME_TRN_UNSIGNED)
    self.df_trn_unsigned.columns = TRN_COLUMNS
    self.df_trn_signed = self._makeDFFromCSV(FILENAME_TRN_SIGNED)
    self.df_trn_signed.columns = TRN_COLUMNS
    # GO Terms
    self.df_go_terms = self._makeGoTerms()
    # Gene expression for state
    self.df_gene_expression_state = self._makeDFFromCSV(
        FILENAME_GENE_EXPRESSION_STATE, is_index_geneid=True)
    # Gene description
    self.df_gene_description = self._makeGeneDescriptionDF()
    # Stages matrix
    self.df_stage_matrix = self._makeStageMatrixDF()
    # Normalized data values
    self.df_normalized = self._makeNormalizedDF()
    # Raw readcounts
    self.dfs_read_count = self._makeReadCountDFS()
    # Hypoxia data
    self.df_hypoxia = self._makeHypoxiaDF()
    # Create mean and std dataframes
    self.df_mean = self._makeMeanDF()
    self.df_std = self._makeStdDF()
    self.df_cv = 100 * self.df_std / self.df_mean
    # Transcription factors
    self.tfs = self.df_trn_unsigned[cn.TF].unique()
    self.tfs = list(set(self.tfs).intersection(
        self.dfs_adjusted_read_count[0].index))
    persister.set(self)
from common.data_provider import DataProvider
from common import sample_data
from common import trinary_data
from common_python.testing import helpers
from common_python.util.persister import Persister

import numpy as np
import os
import pandas as pd
import unittest

IGNORE_TEST = False
IS_PLOT = False
NUM_REPL = 3
DIR = os.path.dirname(os.path.abspath(__file__))
TEST_SAMPLE_PATH = os.path.join(DIR, "test_trinary_data_sample.csv")
PERSISTER_PATH = os.path.join(DIR, "test_trinary_data_persister.pcl")
PERSISTER = Persister(PERSISTER_PATH)
GENES = ["Rv1927", "Rv3083"]
if PERSISTER.isExist():
  PROVIDER, SAMPLE_DATA = PERSISTER.get()
else:
  SAMPLE_DATA = sample_data.getSampleData()
  PROVIDER = DataProvider(is_reinitialize=True)
  PROVIDER.do()
  PERSISTER.set((PROVIDER, SAMPLE_DATA))


################### FUNCTIONS ############
def isConsistentState(ser_y):
  # Check consistency of states
  times = ser_y.index
  if len(times) == 0: