def __init__(self):
    Scorer.LOGGER = LoggerFactory.getLogger("Scorer")
    Scorer.updateViewerScoreNum = 0
    Scorer.updateResourceNum = 0
    Scorer.updateViewerNum = 0
    Scorer.updateAuthorNum = 0
    Scorer.updateResourceTime = 0
    Scorer.updateViewerTime = 0
    Scorer.updateAuthorTime = 0
    Scorer.dupAuthor = 0
    Scorer.dupViewer = 0
    Scorer.avgViewerScore = 0
    Scorer.updateNumLock = thread.allocate_lock()
    Scorer.authorHashSet = dict()
    Scorer.viewerHashSet = dict()
    Scorer.cacheLimitDay = 30
    self.mysqlHost = "10.163.102.88"
    self.mysqlUserName = "******"
    self.mysqlPassword = "******"
    self.mysqlDb = "networkresourcesort"
    self.viewerScoreThreadPool = None
    Scorer.scoreModelPool = Queue.Queue(10)
    for i in xrange(0, 7):
        Scorer.scoreModelPool.put(
            ScoreModel(self.mysqlHost, self.mysqlUserName, self.mysqlPassword, self.mysqlDb))
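The Queue above acts as a simple blocking connection pool. A minimal borrow/return sketch of that pattern (Python 2 style to match the snippet; ScoreModel is the project's class):

# Borrow a pooled ScoreModel, use it, and always return it so other threads can proceed.
model = Scorer.scoreModelPool.get(True)   # blocks until a pooled connection is free
try:
    pass  # ... use model's db connection here ...
finally:
    Scorer.scoreModelPool.put(model)      # return the connection to the pool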
class FileConverterIT(unittest.TestCase):
    log = LoggerFactory.createLog(__name__)

    def setUp(self):
        current_working_dir = os.getcwd()  # Should be this package.
        self.input_folder = current_working_dir + "/SampleMatlabDataFolder"
        self.createdFolder = self.input_folder + "/Trametinib_analysis"

    def tearDown(self):
        if self.input_folder != "/":
            for file in os.listdir(self.createdFolder):
                if file == "__init__.py" or ".mat" in file:
                    continue
                os.remove(self.createdFolder + "/" + file)
            os.removedirs(self.createdFolder)

    def testMatlabFileConversionProperlyFormatsMatrices(self):
        FileConverter.convertMatLabToCSV(self.input_folder)
        for generated_csv in [file for file in os.listdir(self.createdFolder) if ".csv" in file]:
            with open(self.createdFolder + "/" + generated_csv) as csv:
                try:
                    for line in csv:
                        assert "['" not in line
                        assert "']" not in line
                except ValueError as valueError:
                    self.log.error(valueError)
                finally:
                    csv.close()
def __init__(self, host, userName, password, dbName):
    self.LOGGER = LoggerFactory.getLogger("ScoreModel")
    self.host = host
    self.userName = userName
    self.password = password
    self.dbName = dbName
    self.dbConn = self._getConn()
def convertMatLabToCSV(matlab_files_directory):
    log = LoggerFactory.createLog(__name__)
    os.chdir(matlab_files_directory)
    matlab_files = glob.glob("*.mat")
    for input_file in matlab_files:
        drug_name = input_file.split("gexmutcnum.mat")[0].strip()
        new_directory = matlab_files_directory + "/" + drug_name + "_analysis"
        matlab_file = scipy.io.loadmat(input_file)
        os.mkdir(new_directory)

        format_id_string = lambda array: SafeCastUtil.safeCast(array[0], str)
        for key in SafeCastUtil.safeCast(FileConverter.VARIABLE_MATCHES.keys(), list):
            header = [format_id_string(feature_name) for feature_name in matlab_file.get(key)[0]]
            file_name = new_directory + "/" + drug_name + "_" + FileConverter.FILE_NAMES[key] + ".csv"
            cell_line_data = FileConverter.formatCellLineData(
                matlab_file.get(FileConverter.VARIABLE_MATCHES.get(key)), key)
            FileConverter.validateAndWriteCSV(cell_line_data, header, file_name, log,
                                              FileConverter.EXPECTED_TYPES[key])

        cell_line_ids = [format_id_string(cell_id) for cell_id in matlab_file.get(FileConverter.ID_FIELD)]
        results = matlab_file.get(FileConverter.RESULTS_FIELD)
        zipped_results = SafeCastUtil.safeCast(zip(cell_line_ids, results[0]), list)
        results_file = new_directory + "/" + drug_name + "_results.csv"
        FileConverter.validateAndWriteCSV(zipped_results, ["cell_line", "result"], results_file, log, float)
        log.info("The MATLAB file for %s has been successfully converted into csv files ready to be used"
                 " with the CLA software!", drug_name)
    log.info("All MATLAB files have been processed!")
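A minimal invocation sketch for the converter above; the directory path is hypothetical and the import path for FileConverter is an assumption:

# Hypothetical usage; the directory path below is illustrative only.
from FileConverter import FileConverter

FileConverter.convertMatLabToCSV("/data/matlab_drug_files")
# Each *gexmutcnum.mat file should now have a <drug>_analysis folder of CSVs plus <drug>_results.csv.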
import pycom
from helper import blink_led

pycom.heartbeat(False)  # disable the heartbeat LED
pycom.rgbled(0x552000)  # flash orange to indicate startup

# Try to mount SD card, if this fails, keep blinking red and do not proceed
try:
    from machine import SD, Pin, reset
    import os
    from loggingpycom import DEBUG
    from LoggerFactory import LoggerFactory
    from UserButton import UserButton

    # Initialise LoggerFactory and status logger
    logger_factory = LoggerFactory()
    status_logger = logger_factory.create_status_logger('status_logger', level=DEBUG, terminal_out=True,
                                                        filename='status_log.txt')

    # Initialize button interrupt on pin 14 for user interaction
    user_button = UserButton(status_logger)
    pin_14 = Pin("P14", mode=Pin.IN, pull=Pin.PULL_DOWN)
    pin_14.callback(Pin.IRQ_RISING | Pin.IRQ_FALLING, user_button.button_handler)

    # Mount SD card
    sd = SD()
    os.mount(sd, '/sd')
except Exception as e:
    # The snippet is truncated here; per the comment above, failure should keep blinking red
    # and halt. The imported blink_led helper is presumably intended for this, but its
    # signature is not shown, so this sketch signals failure via the RGB LED directly.
    print("Startup failed:", str(e))
    pycom.rgbled(0x550000)  # solid red to signal failure
# Disable the heartbeat LED and set to orange to indicate startup
import pycom

pycom.heartbeat(False)
pycom.rgbled(0x552000)

# Try to mount SD card, if this fails, keep blinking red and do not proceed
try:
    import os
    import time
    from machine import SD, Pin, reset  # needed for os.mount(SD(), ...), Pin and reset below
    from loggingpycom import DEBUG
    from LoggerFactory import LoggerFactory
    from userbutton import UserButton

    os.mount(SD(), "/sd")

    logger_factory = LoggerFactory()
    status_logger = logger_factory.create_status_logger("status_logger", level=DEBUG, terminal_out=True,
                                                        filename="status_log.txt")

    # Initialise button interrupt on pin 14 for user interaction
    user_button = UserButton(status_logger)
    pin_14 = Pin("P14", mode=Pin.IN, pull=Pin.PULL_DOWN)
    pin_14.callback(Pin.IRQ_RISING | Pin.IRQ_FALLING, user_button.button_handler)
except Exception as e:
    # If something goes wrong, blink red LED and reboot after 60 seconds
    print("Startup failed:", str(e))
    # The snippet is truncated here; the lines below are a plausible reconstruction of the
    # recovery path described in the comment above.
    pycom.rgbled(0x550000)
    time.sleep(60)
    reset()
class RandomSubsetElasticNetModelTest(unittest.TestCase):
    log = LoggerFactory.createLog(__name__)

    train_features = [[0, 0, 0, 1, 0.32, 0.25, 0.52, 0.63],
                      [0, 0, 1, 1, 1.11, 1.45, 0.31, 0.22],
                      [0, 1, 0, 0, 0.32, 0.56, 0.66, 0.25],
                      [1, 0, 0, 1, 0.32, 0.34, 0.13, 0.54]]
    test_features = [[0, 1, 0, 1, 0.11, 0.41, 0.11, 2.63],
                     [0, 0, 0, 1, 3.23, 1.45, 0.01, 1.22]]
    train_results = [0.5, 0.3, 0.9, 1.3]
    test_results = [1.5, 0.5]
    binary_feature_indices = [0, 1, 2, 3]

    def testPValueWorksAsIntended(self):
        model = self.trainModelWithExplicitNumberOfPhrases(10, True)
        for enet_model in model.models_by_phrase:
            # fake the scores so that we don't have models which tie
            enet_model.score = random.random()
        score_0 = model.score(self.test_features, self.test_results)
        score_0_redundant = model.score(self.test_features, self.test_results)
        assert score_0 == score_0_redundant

        model.p = 0.5
        score_half = model.score(self.test_features, self.test_results)
        assert score_0 != score_half

        model.p = 1.0
        score_1 = model.score(self.test_features, self.test_results)
        assert score_0 != score_1
        assert score_half != score_1

    def testExplicitModelCountWorks(self):
        model = self.trainModelWithExplicitNumberOfPhrases(5, False)
        assert len(model.models_by_phrase) == 5

    def testDuplicatePhrasesAreNotCreated(self):
        model = self.trainModelWithExplicitNumberOfPhrases(5, False)
        assert len(model.models_by_phrase) == 5
        first_phrase = copy.deepcopy(model.models_by_phrase[0].phrase)
        assert first_phrase.equals(model.models_by_phrase[0].phrase)
        assert model.currentPhraseExists(first_phrase)
        first_phrase.is_or = not first_phrase.is_or
        assert not first_phrase.equals(model.models_by_phrase[0].phrase)
        assert not model.currentPhraseExists(first_phrase)

    def trainModelWithExplicitNumberOfPhrases(self, phrase_count, at_least):
        num_phrases = 0
        model = None
        explicit_count = 0
        if not at_least:
            explicit_count = phrase_count
        while (not at_least and num_phrases != phrase_count) or (at_least and num_phrases < phrase_count):
            model = RandomSubsetElasticNet(1, 0.5, self.binary_feature_indices, upper_bound=0.5, lower_bound=0,
                                           p=0, explicit_model_count=(explicit_count - 1))
            model.fit(self.train_features, self.train_results)
            num_phrases = len(model.models_by_phrase)
        [self.assertScore(model_phrase) for model_phrase in model.models_by_phrase
         if model_phrase.phrase.value is not None]
        return model

    def assertScore(self, phrase):
        assert phrase.score > 0

    def testParameterValidationWorks(self):
        bad_explicit_phrases = [RecursiveBooleanPhrase(5, 1, False, None)]
        self.assertInvalidParams([-1, 0, 1])
        self.assertInvalidParams([0, 1, "test"])
        self.assertInvalidParams(self.binary_feature_indices, alpha=-1)
        self.assertInvalidParams(self.binary_feature_indices, l_one_ratio=-1)
        self.assertInvalidParams(self.binary_feature_indices, upper_bound=5)
        self.assertInvalidParams(self.binary_feature_indices, lower_bound=-1)
        self.assertInvalidParams(self.binary_feature_indices, lower_bound=.3, upper_bound=.1)
        self.assertInvalidParams(self.binary_feature_indices, p=100)
        self.assertInvalidParams(self.binary_feature_indices, explicit_model_count=-2)
        self.assertInvalidParams(self.binary_feature_indices, max_boolean_generation_attempts=0)
        self.assertInvalidParams(self.binary_feature_indices, default_coverage_threshold=1.4)
        self.assertInvalidParams(self.binary_feature_indices, explicit_phrases=bad_explicit_phrases)

    def assertInvalidParams(self, binary_feature_indices, alpha=1, l_one_ratio=2, upper_bound=0.5, lower_bound=0.1,
                            p=0, explicit_model_count=-1, max_boolean_generation_attempts=10,
                            default_coverage_threshold=0.8, explicit_phrases=None):
        error = ""
        try:
            RandomSubsetElasticNet(alpha, l_one_ratio, binary_feature_indices, upper_bound=upper_bound,
                                   lower_bound=lower_bound, p=p, explicit_model_count=explicit_model_count,
                                   max_boolean_generation_attempts=max_boolean_generation_attempts,
                                   coverage_threshold=default_coverage_threshold,
                                   explicit_phrases=explicit_phrases)
        except AttributeError as attributeError:
            error = SafeCastUtil.safeCast(attributeError, str)
        assert "invalid parameters" in error

    def testRSENFailsIfNonBinaryMatrixSentIn(self):
        self.train_features[0][0] = 2
        error = ""
        try:
            model = RandomSubsetElasticNet(1, 2, self.binary_feature_indices)
            model.fit(self.train_features, self.train_results)
        except ValueError as valueError:
            error = SafeCastUtil.safeCast(valueError, str)
        assert "Non-binary feature" in error
class RecommendationsServiceIT(unittest.TestCase):
    log = LoggerFactory.createLog(__name__)

    def setUp(self):
        self.current_working_dir = os.getcwd()  # Should be this package.
        self.DRUG_DIRECTORY = "DrugAnalysisResults"
        self.NUM_DRUGS = 10

    def tearDown(self):
        if self.current_working_dir != "/":
            directory = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
            for file_or_dir in os.listdir(directory):
                if file_or_dir == "__init__.py":
                    continue
                current_path = directory + "/" + file_or_dir
                if self.DRUG_DIRECTORY in file_or_dir:
                    for file in os.listdir(current_path):
                        os.remove(current_path + "/" + file)
                    os.removedirs(current_path)
                else:
                    os.remove(current_path)

    def testRecommendations(self):
        num_cell_lines = 30
        inputs = self.formatRandomizedData(False, num_cell_lines)
        target_dir = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        try:
            recs_service = RecommendationsService(inputs)
            recs_service.recommendByHoldout(target_dir)
            drug_names = SafeCastUtil.safeCast(recs_service.inputs.keys(), list)
            self.assertRecsByDrug(num_cell_lines, drug_names, target_dir)
            recs_service.writeFinalRecsResults(target_dir)
            self.assertRecsByCellLine(num_cell_lines, drug_names, target_dir)
        except KeyboardInterrupt as keyboard_interrupt:
            assert False

    def assertRecsByDrug(self, num_cell_lines, drug_names, target_dir):
        file_name = target_dir + "/" + RecommendationsService.PREDICTIONS_FILE
        num_lines = 0
        with open(file_name) as csv_file:
            try:
                for line_index, line in enumerate(csv_file):
                    num_lines += 1
                    line_split = line.split(",")
                    if line_index == 0:
                        assert line_split[0] == "Drug"
                    else:
                        assert line_split[0] in drug_names
                        assert "cell_line" in line_split[1]
                        assert SafeCastUtil.safeCast(line_split[2], float) is not None
                        assert SafeCastUtil.safeCast(line_split[3].strip(), float) is not None
            except AssertionError as error:
                self.log.error(error)
            finally:
                self.log.debug("Closing file %s", file_name)
                csv_file.close()
        assert num_lines == (num_cell_lines * self.NUM_DRUGS) + 1

    def assertRecsByCellLine(self, num_cell_lines, drug_names, target_dir):
        file_name = target_dir + "/" + RecommendationsService.PREDICTIONS_BY_CELL_LINE_FILE
        num_lines = 0
        with open(file_name) as csv_file:
            try:
                for line_index, line in enumerate(csv_file):
                    num_lines += 1
                    line_split = line.split(",")
                    if line_index == 0:
                        assert line_split[0] == "Cell Line"
                    else:
                        for i in range(0, len(line_split)):
                            if i == 0:
                                assert "cell_line" in line_split[i]
                            elif i % 2 == 0:
                                assert SafeCastUtil.safeCast(line_split[i], float) > \
                                       AbstractModelTrainer.DEFAULT_MIN_SCORE
                            elif i % 2 == 1:
                                assert line_split[i] in drug_names
            except AssertionError as error:
                self.log.error(error)
            finally:
                self.log.debug("Closing file %s", file_name)
                csv_file.close()
        assert num_lines == num_cell_lines + 1

    def testPreRecsAnalysis(self):
        num_cell_lines = 1000
        inputs = self.formatRandomizedData(False, num_cell_lines)
        for processed_arguments in inputs.values():
            sample_features = processed_arguments.features.get(RandomizedDataGenerator.CELL_LINE + "0")
            for _ in range(10):
                num_cell_lines += 1
                self.addRandomCellLine(processed_arguments, sample_features)
        target_dir = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        try:
            recs_service = RecommendationsService(inputs)
            recs_service.preRecsAnalysis(target_dir)
            file_name = target_dir + "/" + RecommendationsService.PRE_REC_ANALYSIS_FILE
            num_lines = 0
            drug_names = SafeCastUtil.safeCast(recs_service.inputs.keys(), list)
            cell_line = RandomizedDataGenerator.CELL_LINE
            with open(file_name) as csv_file:
                try:
                    for line_index, line in enumerate(csv_file):
                        num_lines += 1
                        line_split = line.split(",")
                        for i in range(0, len(line_split)):
                            value_in_csv = line_split[i].strip()
                            if line_index == 0:
                                if i == 0:
                                    assert value_in_csv == cell_line
                                else:
                                    assert value_in_csv == drug_names[i - 1]
                            else:
                                if i == 0:
                                    # Original asserted a chain of bare "or"s which was always truthy;
                                    # rewritten so each candidate substring is actually checked.
                                    assert cell_line in value_in_csv or \
                                           RecommendationsService.STD_DEVIATION in value_in_csv or \
                                           RecommendationsService.MEAN in value_in_csv or \
                                           RecommendationsService.MEDIAN in value_in_csv
                                else:
                                    assert value_in_csv == MachineLearningService.DELIMITER.strip() or \
                                           SafeCastUtil.safeCast(value_in_csv, float) > \
                                           AbstractModelTrainer.DEFAULT_MIN_SCORE
                except AssertionError as error:
                    self.log.error(error)
                finally:
                    self.log.debug("Closing file %s", file_name)
                    csv_file.close()
            assert num_lines == num_cell_lines + 4
        except KeyboardInterrupt as keyboard_interrupt:
            assert False

    def addRandomCellLine(self, processed_arguments, sample_features):
        random_string = self.randomString(16)
        processed_arguments.features[random_string] = sample_features
        processed_arguments.results.append([random_string, random.random()])

    def randomString(self, string_length):
        letters = string.hexdigits
        return ''.join(random.choice(letters) for i in range(string_length))

    def formatRandomizedData(self, is_classifier, num_cell_lines):
        randomized_data_path = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        processed_arguments = {}
        for i in range(self.NUM_DRUGS):
            drug_name = self.DRUG_DIRECTORY + SafeCastUtil.safeCast(i + 1, str)
            drug_path = randomized_data_path + "/" + drug_name
            if drug_name not in os.listdir(randomized_data_path):
                os.mkdir(drug_path)
            random_data_generator = RandomizedDataGenerator(drug_path)
            random_data_generator.generateRandomizedFiles(3, num_cell_lines, 150, is_classifier, 2, .8,
                                                          use_static_features=True)
            argument_processing_service = ArgumentProcessingService(drug_path)
            processed_args = argument_processing_service.handleInputFolder()
            processed_args.recs_config.viability_acceptance = 0.1
            processed_arguments[drug_name] = processed_args
            ml_service = MachineLearningService(processed_args)
            combos = [ml_service.generateFeatureSetString(combo) for combo in ml_service.determineGeneListCombos()]
            self.setupDrugData(combos, ml_service, drug_path)
        return processed_arguments

    def setupDrugData(self, combos, ml_service, drug_path):
        for algo in SupportedMachineLearningAlgorithms.fetchAlgorithms():
            file_name = drug_path + "/" + algo + ".csv"
            with open(file_name, 'w', newline='') as feature_file:
                writer = csv.writer(feature_file)
                header = ml_service.getCSVFileHeader(ml_service.inputs.is_classifier, algo,
                                                     ml_service.inputs.outer_monte_carlo_permutations)
                writer.writerow(header)
                for combo in combos:
                    row = RandomizedDataGenerator.generateAnalysisRowForCombo(ml_service, combo, algo)
                    writer.writerow(row)
                feature_file.close()
import fridge
import cooker
from LoggerFactory import LoggerFactory

log = LoggerFactory.getLogger('main')
log.info('Log started.')

fridge.first()
cooker.first()
fridge.second()
cooker.second()
cooker.third()
fridge.third()

log.info('Log stopped.')
def __init__(self):
    self.LOGGER = LoggerFactory.getLogger("ScoreManager")
    self.scorer = Scorer()
    self.round = 1
class AbstractModelTrainer(ABC):
    log = LoggerFactory.createLog(__name__)
    DEFAULT_MIN_SCORE = -10
    ADDITIONAL_DATA = "additional_data"
    EMPTY_MODEL_RESPONSE = DEFAULT_MIN_SCORE, 0.0

    @abstractmethod
    def __init__(self, algorithm, hyperparameters, is_classifier):
        self.algorithm = algorithm
        self.hyperparameters = hyperparameters
        self.is_classifier = is_classifier

    @abstractmethod
    def hyperparameterize(self, training_matrix, testing_matrix, results):
        pass

    @abstractmethod
    def train(self, results, features, hyperparams, feature_names):
        pass

    @abstractmethod
    def supportsHyperparams(self):
        pass

    @abstractmethod
    def fetchFeatureImportances(self, model, features_in_order):
        pass

    def preserveNonHyperparamData(self, model_data, model):
        pass

    def shouldProcessFeatureSet(self, feature_set):
        return True

    def fetchModelPhrases(self, model, gene_list_combo):
        return {}

    def logTrainingMessage(self, outer_monte_carlo_perms, inner_monte_carlo_perms, num_gene_list_combos):
        num_models = self.determineNumModelsToCreate(outer_monte_carlo_perms, inner_monte_carlo_perms,
                                                     num_gene_list_combos)
        self.log.info("Running permutations on %s different combinations of features. Requires creation of %s "
                      "different %s models.",
                      SafeCastUtil.safeCast(num_gene_list_combos, str), num_models, self.algorithm)

    def determineNumModelsToCreate(self, outer_monte_carlo_perms, inner_monte_carlo_perms, num_gene_list_combos):
        num_models = outer_monte_carlo_perms * inner_monte_carlo_perms * num_gene_list_combos
        for hyperparam_set in self.hyperparameters.values():
            num_models *= len(hyperparam_set)
        return num_models + (outer_monte_carlo_perms * num_gene_list_combos)

    def loopThroughHyperparams(self, hyperparams, training_matrix, testing_matrix, results):
        self.hyperparameters = hyperparams
        features, relevant_results = self.populateFeaturesAndResultsByCellLine(training_matrix, results)
        feature_names = training_matrix.get(ArgumentProcessingService.FEATURE_NAMES)
        hyperparam_permutations = self.fetchAllHyperparamPermutations(hyperparams)
        GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log)
        return self.hyperparameterizeInSerial(feature_names, features, hyperparam_permutations,
                                              relevant_results, results, testing_matrix)

    def hyperparameterizeInSerial(self, feature_names, features, hyperparam_permutations, relevant_results,
                                  results, testing_matrix):
        model_data = {}
        for hyperparam_set in hyperparam_permutations:
            self.buildModelAndRecordScore(feature_names, features, hyperparam_set, model_data,
                                          relevant_results, results, testing_matrix)
        return model_data

    def chunkList(self, original_list, size):
        return [original_list[i * size:(i + 1) * size]
                for i in range((len(original_list) + size - 1) // size)]

    def buildModelAndRecordScore(self, feature_names, features, hyperparam_set, model_data, relevant_results,
                                 results, testing_matrix):
        if not isinstance(model_data, dict) and model_data._address_to_local is not None:
            shared_file = SafeCastUtil.safeCast(model_data._address_to_local.keys(), list)[0]
            if not os.path.exists(shared_file):
                self.log.warning("Unable to find shared file %s, process likely ended prematurely.", shared_file)
                return
        self.log.debug("Building %s model with hyperparams %s.", self.algorithm, hyperparam_set)
        model = self.buildModel(relevant_results, features, hyperparam_set, feature_names)
        self.preserveNonHyperparamData(model_data, model)
        current_model_score = self.fetchPredictionsAndScore(model, testing_matrix, results)

        lock = threading.Lock()
        lock.acquire(True)
        try:
            model_data[DictionaryUtility.toString(hyperparam_set)] = current_model_score
        except FileNotFoundError as fnfe:
            self.log.error("Unable to write to shared model_data object for algorithm: %s.\n", fnfe)
        except AttributeError as ae:
            self.log.error("Unable to write to shared model_data object for algorithm: %s.\n", ae)
        finally:
            lock.release()
        self.log.debug("Finished building %s model with hyperparams %s.", self.algorithm, hyperparam_set)
        return model_data

    def buildModel(self, relevant_results, features, hyperparam_set, feature_names):
        model = None
        try:
            model = self.train(relevant_results, features, hyperparam_set, feature_names)
        except ValueError as valueError:
            self.log.error("Failed to create model build for %s:\n%s", self.algorithm, valueError)
        return model

    def fetchAllHyperparamPermutations(self, hyperparams):
        all_perms = []
        hyperparam_keys = SafeCastUtil.safeCast(hyperparams.keys(), list)
        zero_filled_indices = SafeCastUtil.safeCast(numpy.zeros(len(hyperparam_keys)), list)
        target_index = len(zero_filled_indices) - 1
        current_perm = zero_filled_indices[:]
        while target_index >= 0:
            current_hyperparams = OrderedDict()
            for i in range(0, len(current_perm)):
                param_name = hyperparam_keys[i]
                current_hyperparams[param_name] = hyperparams[param_name][
                    SafeCastUtil.safeCast(current_perm[i], int)]
            if current_hyperparams not in all_perms:
                clone_map = copy.deepcopy(current_hyperparams)
                all_perms.append(clone_map)

            if current_perm[target_index] < len(hyperparams[hyperparam_keys[target_index]]) - 1:
                current_perm[target_index] += 1
                while len(current_perm) > target_index + 1 and current_perm[target_index + 1] < \
                        len(hyperparams[hyperparam_keys[target_index]]):
                    target_index += 1
            else:
                target_index -= 1
                for subsequent_index in range(target_index, len(current_perm) - 1):
                    current_perm[subsequent_index + 1] = 0
        return all_perms

    def fetchPredictionsAndScore(self, model, testing_matrix, results):
        if model is None:
            return self.EMPTY_MODEL_RESPONSE
        features, relevant_results = self.populateFeaturesAndResultsByCellLine(testing_matrix, results)
        predictions = model.predict(features)
        score = AbstractModelTrainer.DEFAULT_MIN_SCORE
        try:
            score = model.score(features, relevant_results)
        except ValueError as valueError:
            self.log.error(valueError)
        if self.is_classifier:
            accuracy = accuracy_score(relevant_results, predictions)
        else:
            accuracy = mean_squared_error(relevant_results, predictions)
        del model
        return score, accuracy

    def populateFeaturesAndResultsByCellLine(self, matrix, results):
        features = []
        relevant_results = []
        for cell in matrix.keys():
            if cell == ArgumentProcessingService.FEATURE_NAMES:
                continue
            features.append(matrix[cell])
            for result in results:
                if result[0] == cell:
                    relevant_results.append(result[1])
        return features, relevant_results

    def logIfBestHyperparamsOnRangeThreshold(self, best_hyperparams, record_diagnostics, input_folder):
        if not self.supportsHyperparams() or best_hyperparams is None:
            return
        hyperparam_keys = SafeCastUtil.safeCast(self.hyperparameters.keys(), list)
        for i in range(0, len(hyperparam_keys)):
            hyperparam_set = self.hyperparameters[hyperparam_keys[i]]
            optimal_value = best_hyperparams.get(hyperparam_keys[i])
            if optimal_value is None:
                self.log.warn("Unable to determine optimal value given hyperparams: %s",
                              SafeCastUtil.safeCast(best_hyperparams, str, None))
                continue
            if optimal_value >= hyperparam_set[len(hyperparam_set) - 1]:
                message = "Best hyperparam for " + self.algorithm + " on upper threshold of provided hyperparam " \
                          "set: " + hyperparam_keys[i] + " = " + SafeCastUtil.safeCast(optimal_value, str) + "\n"
                self.log.debug(message)
                if record_diagnostics:
                    DiagnosticsFileWriter.writeToFile(input_folder, message, self.log)
            elif optimal_value <= hyperparam_set[0]:
                message = "Best hyperparam for " + self.algorithm + " on lower threshold of provided hyperparam " \
                          "set: " + hyperparam_keys[i] + " = " + SafeCastUtil.safeCast(optimal_value, str) + "\n"
                self.log.debug(message)
                if record_diagnostics:
                    DiagnosticsFileWriter.writeToFile(input_folder, message, self.log)

    def logOptimalHyperParams(self, hyperparams, feature_set_as_string, record_diagnostics, input_folder):
        message = "Optimal Hyperparameters for " + feature_set_as_string + " " + self.algorithm + " algorithm " \
                  "chosen as:\n"
        for key in SafeCastUtil.safeCast(hyperparams.keys(), list):
            message += "\t" + key + " = " + SafeCastUtil.safeCast(hyperparams[key], str) + "\n"
        self.log.info(message)
        if record_diagnostics:
            DiagnosticsFileWriter.writeToFile(input_folder, message, self.log)

    def generateFeaturesInOrder(self, gene_list_combo):
        features_in_order = []
        for feature_file in gene_list_combo:
            for feature in feature_file:
                features_in_order.append(feature)
        return features_in_order

    def normalizeCoefficients(self, coefficients, features_in_order):
        importances = {}
        absolute_sum = numpy.sum([numpy.abs(coeff) for coeff in coefficients])
        for i in range(0, len(features_in_order)):
            if absolute_sum > 0:
                importances[features_in_order[i]] = numpy.abs(coefficients[i]) / absolute_sum
            else:
                importances[features_in_order[i]] = numpy.abs(coefficients[i])  # should be 0.
        return importances
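A quick self-contained check of the normalizeCoefficients arithmetic above (a standalone re-implementation for illustration, not the project's test suite):

import numpy

def normalize(coefficients, features_in_order):
    # Same arithmetic as normalizeCoefficients: each importance is |coeff| / sum(|coeff|).
    absolute_sum = numpy.sum([numpy.abs(c) for c in coefficients])
    return {feature: (numpy.abs(c) / absolute_sum if absolute_sum > 0 else numpy.abs(c))
            for feature, c in zip(features_in_order, coefficients)}

print(normalize([2.0, -1.0, 1.0], ["geneA", "geneB", "geneC"]))
# {'geneA': 0.5, 'geneB': 0.25, 'geneC': 0.25} -- importances sum to 1 whenever any coefficient is non-zero.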
class DataFormattingService(object):
    log = LoggerFactory.createLog(__name__)

    TRAINING_MATRIX = "trainingMatrix"
    TESTING_MATRIX = "testingMatrix"  # Will either be outer testing or inner validation matrix
    P_VALUE_CUTOFF = 0.05

    def __init__(self, inputs):
        self.inputs = inputs

    def formatData(self, should_scale, should_one_hot_encode=True):
        features_df = pd.DataFrame.from_dict(self.inputs.features, orient='index')
        columns = self.inputs.features.get(ArgumentProcessingService.FEATURE_NAMES)
        features_df.columns = columns
        features_df = features_df.drop(ArgumentProcessingService.FEATURE_NAMES)
        x_train, x_test, y_train, y_test = self.testTrainSplit(features_df, self.inputs.results,
                                                               self.inputs.data_split)
        x_train_corr, x_test_corr = self.maybeFilterCorrelatedFeatures(x_train, x_test, y_train, columns,
                                                                       self.inputs.analysisType())
        if should_one_hot_encode:
            x_train_one_hot = self.oneHot(x_train_corr)
            x_test_one_hot = self.oneHot(x_test_corr)
        else:
            x_train_one_hot = x_train_corr
            x_test_one_hot = x_test_corr

        outputs = OrderedDict()
        outputs[self.TRAINING_MATRIX] = self.maybeScaleFeatures(x_train_one_hot, should_scale)
        outputs[self.TESTING_MATRIX] = self.maybeScaleFeatures(x_test_one_hot, should_scale)
        outputs[ArgumentProcessingService.FEATURE_NAMES] = SafeCastUtil.safeCast(x_train_one_hot.columns, list)
        return outputs

    def maybeFilterCorrelatedFeatures(self, x_train, x_test, y_train, feature_names, analysis_type):
        if analysis_type is not AnalysisType.NO_GENE_LISTS:
            return x_train, x_test

        results = [result[1] for result in y_train]
        spearman_p_vals = {}
        ranksum_p_vals = {}
        for feature_name in feature_names:
            try:
                feature_column = x_train.get(feature_name)
                is_categorical = all(isinstance(feature, str) for feature in feature_column)
                file = feature_name.split(".")[0]
                if is_categorical:
                    if ranksum_p_vals.get(file) is None:
                        ranksum_p_vals[file] = {}
                    ranksum = self.fetchRanksum(feature_column, results)
                    ranksum_p_vals[file][feature_name] = SafeCastUtil.safeCast(ranksum[1], float, 1)
                else:
                    if spearman_p_vals.get(file) is None:
                        spearman_p_vals[file] = {}
                    spearman_corr = spearmanr(feature_column, results)
                    spearman_p_vals[file][feature_name] = SafeCastUtil.safeCast(spearman_corr[1], float, 1)
            except ValueError as error:
                self.log.error("Exception while trying to trim features: %s", error)

        return self.trimFeatures(x_train, x_test, [ranksum_p_vals, spearman_p_vals])

    def fetchRanksum(self, feature_column, results):
        value_counts = {}
        for val in feature_column:
            if value_counts.get(val) is None:
                value_counts[val] = 1
            else:
                value_counts[val] += 1
        dominant_value = max(value_counts.items(), key=operator.itemgetter(1))[0]
        dominant_results = []
        non_dominant_results = []
        for feature_val_and_result in zip(SafeCastUtil.safeCast(feature_column, list), results):
            if feature_val_and_result[0] == dominant_value:
                dominant_results.append(feature_val_and_result[1])
            else:
                non_dominant_results.append(feature_val_and_result[1])
        return ranksums(dominant_results, non_dominant_results)

    def trimFeatures(self, x_train, x_test, p_val_sets):
        features_to_keep = []
        num_top_features = self.inputs.univariate_config.num_top_features
        for p_val_set in p_val_sets:
            for file in p_val_set:
                features_and_p_vals = [item for item in p_val_set[file].items() if not np.isnan(item[1])]
                sorted_features_and_p_vals = sorted(features_and_p_vals,
                                                    key=operator.itemgetter(1))[:num_top_features]
                [features_to_keep.append(feature_and_p_val[0])
                 for feature_and_p_val in sorted_features_and_p_vals]
        filtered_df_train = x_train.filter(features_to_keep, axis=1)
        filtered_df_test = x_test.filter(features_to_keep, axis=1)
        return filtered_df_train, filtered_df_test

    def maybeScaleFeatures(self, data_frame, should_scale):
        as_dict = data_frame.transpose().to_dict('list')
        maybe_scaled_dict = OrderedDict()
        keys_as_list = SafeCastUtil.safeCast(as_dict.keys(), list)
        for key in keys_as_list:
            maybe_scaled_dict[key] = []
        if len(keys_as_list) > 0:
            for i in range(0, len(as_dict[keys_as_list[0]])):
                array_to_maybe_scale = []
                for key in keys_as_list:
                    array_to_maybe_scale.append(as_dict[key][i])
                if should_scale:
                    maybe_scaled_array = preprocessing.scale(array_to_maybe_scale)
                else:
                    maybe_scaled_array = array_to_maybe_scale
                for j in range(0, len(keys_as_list)):
                    maybe_scaled_dict[keys_as_list[j]].append(maybe_scaled_array[j])
        return maybe_scaled_dict

    def encodeCategorical(self, array):
        if array.dtype == np.dtype('float64') or array.dtype == np.dtype('int64'):
            return array
        else:
            return preprocessing.LabelEncoder().fit_transform(array)

    # Encode sites as categorical variables
    def oneHot(self, dataframe):
        # Encode all labels
        dataframe = dataframe.apply(self.encodeCategorical)
        return dataframe

    # Binary one hot encoding
    def binaryOneHot(self, dataframe):
        dataframe_binary_pd = pd.get_dummies(dataframe)
        return dataframe_binary_pd

    def testTrainSplit(self, x_values, y_values, data_split):
        if data_split == 1.0:
            return x_values, pd.DataFrame(columns=SafeCastUtil.safeCast(x_values.columns, list)), y_values, []
        x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size=(1 - data_split))
        return x_train, x_test, y_train, y_test

    def testStratifySplit(self, x_values, y_values):
        x_train, x_split, y_train, y_split = train_test_split(x_values, y_values, test_size=0.2, random_state=42,
                                                              stratify=x_values.iloc[:, -1])
        x_test, x_validate, y_test, y_validate = train_test_split(x_split, y_split, test_size=0.5, random_state=42,
                                                                  stratify=x_split.iloc[:, -1])
        return x_train, x_validate, x_test, y_train, y_validate, y_test
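The filtering in maybeFilterCorrelatedFeatures/trimFeatures boils down to ranking features by univariate p-values and keeping the smallest. A minimal standalone sketch of that idea using the same scipy calls (the data below is made up for illustration):

from scipy.stats import spearmanr, ranksums

results = [0.2, 0.9, 0.4, 0.8, 0.1]
numeric_feature = [1.0, 5.2, 2.1, 4.9, 0.7]
dominant_group_results = [0.2, 0.9, 0.4]      # results where the dominant category value appears
non_dominant_group_results = [0.8, 0.1]       # results for the remaining rows

# Continuous features: Spearman correlation p-value (index 1 of the returned tuple).
spearman_p = spearmanr(numeric_feature, results)[1]
# Categorical features: Wilcoxon rank-sum p-value between the two groups.
ranksum_p = ranksums(dominant_group_results, non_dominant_group_results)[1]

# Lower p-value == stronger univariate signal; keep the num_top_features smallest.
print(sorted([("numeric_feature", spearman_p), ("categorical_feature", ranksum_p)], key=lambda t: t[1]))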
def main(args):
    factory = LoggerFactory()
    logger = factory.getLogger()
    logger.log("A Message to Log")
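The snippets in this collection assume several LoggerFactory shapes (static getLogger/createLog, instance-level create_status_logger). A minimal sketch of the simple instance variant used in the main above — an assumption for illustration, not the actual library:

import logging

class SimpleLogger:
    # Hypothetical wrapper exposing the .log(message) call used in main above.
    def __init__(self, name):
        logging.basicConfig(level=logging.INFO)
        self._logger = logging.getLogger(name)

    def log(self, message):
        self._logger.info(message)

class LoggerFactory:
    # A minimal sketch, assuming the factory simply hands out wrapped stdlib loggers.
    def getLogger(self, name="main"):
        return SimpleLogger(name)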
class MachineLearningServiceIT(unittest.TestCase):
    log = LoggerFactory.createLog(__name__)

    THRESHOLD_OF_SIGNIFICANCE = 0.60
    MONTE_CARLO_PERMS = 2
    INDIVIDUAL_MONTE_CARLO_PERMS = 10

    def setUp(self):
        self.current_working_dir = os.getcwd()  # Should be this package.

    def tearDown(self):
        if self.current_working_dir != "/":
            for file in os.listdir(self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER):
                if file == "__init__.py":
                    continue
                os.remove(self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER +
                          "/" + file)

    def testRandomForestRegressor(self):
        self.evaluateMachineLearningModel(RandomForestTrainer(False))

    def testRandomForestClassifier(self):
        self.evaluateMachineLearningModel(RandomForestTrainer(True))

    def testLinearSVMRegressor(self):
        self.evaluateMachineLearningModel(LinearSVMTrainer(False))

    def testLinearSVMClassifier(self):
        self.evaluateMachineLearningModel(LinearSVMTrainer(True))

    def testRadialBasisFunctionSVMRegressor(self):
        self.evaluateMachineLearningModel(RadialBasisFunctionSVMTrainer(False))

    def testRadialBasisFunctionSVMClassifier(self):
        self.evaluateMachineLearningModel(RadialBasisFunctionSVMTrainer(True))

    def testElasticNetRegressor(self):
        self.evaluateMachineLearningModel(ElasticNetTrainer(False))

    def testRidgeRegressor(self):
        self.evaluateMachineLearningModel(RidgeRegressionTrainer(False))

    def testLassoRegressor(self):
        self.evaluateMachineLearningModel(LassoRegressionTrainer(False))

    def testRandomSubsetElasticNet(self):
        ml_service = MachineLearningService(self.formatRandomizedData(False, False, False))
        ml_service.log.setLevel(logging.DEBUG)
        binary_cat_matrix = ml_service.inputs.rsen_config.binary_cat_matrix
        rsen_trainer = RandomSubsetElasticNetTrainer(False, binary_cat_matrix, 0, 0.4)
        filtered_combos = self.fetchFilteredRSENCombos(ml_service, rsen_trainer)
        trimmed_combos = filtered_combos[0:8]
        target_dir = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        ml_service.handleParallellization(trimmed_combos, target_dir, rsen_trainer)
        self.assertResults(target_dir, rsen_trainer, len(trimmed_combos) + 1, rsen_trainer.is_classifier,
                           False, False)

    def fetchFilteredRSENCombos(self, ml_service, rsen_trainer):
        filtered_combos = []
        for combo in ml_service.determineGeneListCombos():
            is_valid = True
            for feature_set in combo:
                if len([feature for feature in feature_set if "bin_cat.significant_feature" in feature]) > 0:
                    is_valid = False
            if is_valid and rsen_trainer.shouldProcessFeatureSet(combo):
                filtered_combos.append(combo)
        return filtered_combos

    def testRandomSubsetElasticNetWithCombinedGeneLists(self):
        inputs = self.formatRandomizedData(False, False, False)
        input_folder = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        inputs.rsen_config.combine_gene_lists = True
        ml_service = MachineLearningService(inputs)
        ml_service.log.setLevel(logging.DEBUG)
        binary_cat_matrix = ml_service.inputs.rsen_config.binary_cat_matrix
        rsen_trainer = RandomSubsetElasticNetTrainer(False, binary_cat_matrix, 0, 0.4)
        gene_list_combos = ml_service.determineGeneListCombos()
        combos = ml_service.fetchValidGeneListCombos(input_folder, gene_list_combos, rsen_trainer)
        assert len(combos) < len(gene_list_combos)
        for combo in combos:
            assert "ALL_GENE_LISTS" in ml_service.generateFeatureSetString(combo)

    def evaluateMachineLearningModel(self, trainer):
        ml_service = MachineLearningService(self.formatRandomizedData(trainer.is_classifier, False, False))
        ml_service.log.setLevel(logging.DEBUG)
        num_gene_list_combos = 8
        self.analyzeAndAssertResults(ml_service, num_gene_list_combos, trainer, False, False)

    def analyzeAndAssertResults(self, ml_service, num_gene_list_combos, trainer, univariate, has_static_features):
        try:
            gene_list_combos_shortened = ml_service.determineGeneListCombos()[0:num_gene_list_combos]
            target_dir = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
            ml_service.handleParallellization(gene_list_combos_shortened, target_dir, trainer)
            self.assertResults(target_dir, trainer, num_gene_list_combos + 1, trainer.is_classifier,
                               univariate, has_static_features)
        except KeyboardInterrupt as keyboardInterrupt:
            self.log.error("Interrupted manually, failing and initiating cleanup.")
            assert False

    def formatRandomizedData(self, is_classifier, analyze_all, use_static_features):
        random_data_generator = RandomizedDataGenerator(RandomizedDataGenerator.GENERATED_DATA_FOLDER)
        random_data_generator.generateRandomizedFiles(3, 1000, 150, is_classifier, self.MONTE_CARLO_PERMS, .8,
                                                      analyze_all=analyze_all,
                                                      use_static_features=use_static_features)
        input_folder = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        argument_processing_service = ArgumentProcessingService(input_folder)
        argument_processing_service.log.setLevel(logging.DEBUG)
        return argument_processing_service.handleInputFolder()

    def assertResults(self, target_dir, trainer, expected_lines, is_classifier, univariate, has_static_features):
        self.assertDiagnosticResults(target_dir, trainer, univariate)
        file_name = trainer.algorithm + ".csv"
        assert file_name in os.listdir(target_dir)
        num_lines = 0
        with open(target_dir + "/" + file_name) as csv_file:
            try:
                for line_index, line in enumerate(csv_file):
                    num_lines += 1
                    line_split = line.strip().split(",")
                    if line_index == 0:
                        assert line_split == MachineLearningService.getCSVFileHeader(
                            is_classifier, trainer.algorithm, self.MONTE_CARLO_PERMS)
                        continue
                    feature_gene_list_combo = line_split[0]
                    assert ":" in feature_gene_list_combo or \
                           (has_static_features and
                            GeneListComboUtility.ONLY_STATIC_FEATURES in feature_gene_list_combo)
                    score = SafeCastUtil.safeCast(line_split[1], float)
                    accuracy = SafeCastUtil.safeCast(line_split[2], float)
                    assert score > trainer.DEFAULT_MIN_SCORE
                    if RandomizedDataGenerator.SIGNIFICANT_GENE_LIST in feature_gene_list_combo \
                            or has_static_features:
                        assert score >= self.THRESHOLD_OF_SIGNIFICANCE
                    else:
                        assert score < self.THRESHOLD_OF_SIGNIFICANCE
                    assert accuracy > 0
                    if len(line_split) > 3:
                        top_importance = line_split[3]
                        assert top_importance is not None
            except AssertionError as error:
                self.log.error(error)
            finally:
                self.log.debug("Closing file %s", file_name)
                csv_file.close()
        assert num_lines == expected_lines

    def assertDiagnosticResults(self, target_dir, trainer, univariate):
        if trainer.supportsHyperparams():
            saved_features_logged_if_univariate = not univariate
            diagnostics_file = DiagnosticsFileWriter.FILE_NAME
            if diagnostics_file in os.listdir(target_dir):
                with open(target_dir + "/" + diagnostics_file) as open_file:
                    try:
                        for line_index, line in enumerate(open_file):
                            if "Best Hyperparam" in line:
                                assert trainer.algorithm in line
                                assert "upper" in line or "lower" in line
                            if "Monte Carlo loop" in line:
                                saved_features_logged_if_univariate = True
                    except ValueError as valueError:
                        self.log.error(valueError)
                    finally:
                        self.log.debug("Closing file %s", open_file)
                        open_file.close()
            assert saved_features_logged_if_univariate

    def testIndividualRandomForestRegressor(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.RANDOM_FOREST, "200,20", False)

    def testIndividualRandomForestClassifier(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.RANDOM_FOREST, "200,20", True)

    def testIndividualLinearSVMRegressor(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.LINEAR_SVM, "0.1,0,1", False)

    def testIndividualLinearSVMClassifier(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.LINEAR_SVM, "0.1", True)

    def testIndividualRadialBasisFunctionSVMRegressor(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.RADIAL_BASIS_FUNCTION_SVM, "0.1,0.1,0.1", False)

    def testIndividualRadialBasisFunctionSVMClassifier(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.RADIAL_BASIS_FUNCTION_SVM, "0.1,0.1,0.1", True)

    def testIndividualElasticNetRegressor(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.ELASTIC_NET, "0.1,0.1", False)

    def testIndividualRidgeRegressor(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.RIDGE_REGRESSION, "1", False)

    def testIndividualLassoRegressor(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.LASSO_REGRESSION, "1", False)

    def testIndividualRandomSubsetElasticNet(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET, "0.1,0.1", False)

    def evaluateMachineLearningModelForIndividualCombo(self, algorithm, hyperparams, is_classifier):
        input_folder = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        ml_service = MachineLearningService(
            self.formatRandomizedDataForIndividualCombo(is_classifier, algorithm, hyperparams, input_folder))
        if algorithm is SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET:
            binary_categorical_matrix = ml_service.inputs.rsen_config.binary_cat_matrix
            dummy_trainer = RandomSubsetElasticNetTrainer(False, binary_categorical_matrix, 0, 0.4)
            target_combo = self.fetchFilteredRSENCombos(ml_service, dummy_trainer)[0]
            target_combo_string = ml_service.generateFeatureSetString(target_combo)
            ml_service.inputs.individual_train_config.combo = target_combo_string
        try:
            ml_service.analyze(input_folder)
            self.assertResultsForIndividualCombo(input_folder, algorithm, 11, is_classifier)
        except KeyboardInterrupt as keyboardInterrupt:
            self.log.error("Interrupted manually, failing and initiating cleanup.")
            assert False

    def formatRandomizedDataForIndividualCombo(self, is_classifier, algorithm, hyperparams, input_folder):
        random_data_generator = RandomizedDataGenerator(RandomizedDataGenerator.GENERATED_DATA_FOLDER)
        random_data_generator.generateRandomizedFiles(3, 1000, 150, is_classifier,
                                                      self.INDIVIDUAL_MONTE_CARLO_PERMS, .8,
                                                      algorithm, hyperparams)
        argument_processing_service = ArgumentProcessingService(input_folder)
        return argument_processing_service.handleInputFolder()

    def assertResultsForIndividualCombo(self, target_dir, algorithm, expected_lines, is_classifier):
        file_name = algorithm + ".csv"
        assert file_name in os.listdir(target_dir)
        num_lines = 0
        with open(target_dir + "/" + file_name) as csv_file:
            try:
                for line_index, line in enumerate(csv_file):
                    num_lines += 1
                    line_split = line.strip().split(",")
                    if line_index == 0:
                        assert line_split == MachineLearningService.getCSVFileHeader(is_classifier, algorithm, 1)
                        continue
                    feature_gene_list_combo = line_split[0]
                    assert ":" in feature_gene_list_combo
                    score = SafeCastUtil.safeCast(line_split[1], float)
                    assert score > AbstractModelTrainer.DEFAULT_MIN_SCORE
                    if len(line_split) > 3:
                        top_importance = line_split[3]
                        assert top_importance is not None
            except AssertionError as error:
                self.log.error(error)
            finally:
                self.log.debug("Closing file %s", file_name)
                csv_file.close()
        assert num_lines == expected_lines

    def testTrimmingExistingFeatures(self):
        input_folder = self.current_working_dir + "/SampleClassifierDataFolder"
        argument_processing_service = ArgumentProcessingService(input_folder)
        inputs = argument_processing_service.handleInputFolder()
        ml_service = MachineLearningService(inputs)
        gene_list_combos = ml_service.determineGeneListCombos()
        trainer = RandomForestTrainer(True)
        trimmed_combos = ml_service.fetchValidGeneListCombos(input_folder, gene_list_combos, trainer)
        assert len(trimmed_combos) == (len(gene_list_combos) - 1)

    def testSortingByFeatureImportances(self):
        delimiter = MachineLearningService.DELIMITER
        ml_service = MachineLearningService(None)

        # All columns add up to 1. Equal number of importances for each feature.
        importances = {
            "geneA": [0.0, 0.1, 0.2, 0.4, 0.0],   # total == 0.7
            "geneB": [1.0, 0.1, 0.2, 0.1, 0.5],   # total == 1.9
            "geneC": [0.0, 0.1, 0.2, 0.1, 0.25],  # total == 0.65
            "geneD": [0.0, 0.1, 0.2, 0.3, 0.25],  # total == 0.85
            "geneE": [0.0, 0.6, 0.2, 0.1, 0.0],   # total == 0.9
        }
        sorted_importances1 = ml_service.averageAndSortImportances(importances, 5)
        assert sorted_importances1[0] == "geneB --- 0.38"
        assert sorted_importances1[1] == "geneE --- 0.18"
        assert sorted_importances1[2] == "geneD --- 0.17"
        assert sorted_importances1[3] == "geneA --- 0.14"
        assert sorted_importances1[4] == "geneC --- 0.13"
        assert numpy.sum([SafeCastUtil.safeCast(imp.split(delimiter)[1], float)
                          for imp in sorted_importances1 if imp != ""]) == 1.0

        sorted_importances2 = ml_service.averageAndSortImportances(importances, 6)
        # Original compared sorted_importances1 with itself; the intent is clearly to
        # compare against sorted_importances2.
        assert len(sorted_importances1) == len(sorted_importances2)
        for i in range(0, len(sorted_importances2)):
            split1 = sorted_importances1[i].split(delimiter)
            split2 = sorted_importances2[i].split(delimiter)
            assert split1[0] == split2[0]
            if split1 == split2:
                continue
            assert SafeCastUtil.safeCast(split1[1], float) > SafeCastUtil.safeCast(split2[1], float)
        assert numpy.sum([SafeCastUtil.safeCast(imp.split(delimiter)[1], float)
                          for imp in sorted_importances2 if imp != ""]) < 1.0

        # 6 columns. Now all the others are missing one.
        importances["geneF"] = [0, 0, 0, 0, 0, 1.0]  # total == 1.0
        sorted_importances3 = ml_service.averageAndSortImportances(importances, 6)
        assert len([imp for imp in sorted_importances3 if imp != ""]) > \
               len([imp for imp in sorted_importances1 if imp != ""])
        assert math.isclose(numpy.sum([SafeCastUtil.safeCast(imp.split(delimiter)[1], float)
                                       for imp in sorted_importances3 if imp != ""]), 1.0)

        importances["geneG"] = [0, 0, 0, 0, 0, 0, 2.0]  # total == 2.0
        sorted_importances4 = ml_service.averageAndSortImportances(importances, 7)
        assert len([imp for imp in sorted_importances4 if imp != ""]) > \
               len([imp for imp in sorted_importances3 if imp != ""])
        assert numpy.sum([SafeCastUtil.safeCast(imp.split(delimiter)[1], float)
                          for imp in sorted_importances4 if imp != ""]) > 1.0

    def testSpecifiedCombosAreSelectedProperly(self):
        arguments = self.formatRandomizedData(False, False, False)
        file_names = []
        for feature in arguments.features.get(ArgumentProcessingService.FEATURE_NAMES):
            file_name = feature.split(".")[0]
            if file_name not in file_names:
                file_names.append(file_name)
        gene_lists = SafeCastUtil.safeCast(arguments.gene_lists.keys(), list)
        self.assertSpecificComboGeneration(arguments, self.generateSpecificCombos(file_names, gene_lists, False))
        self.assertSpecificComboGeneration(arguments, self.generateSpecificCombos(file_names, gene_lists, True))

    def generateSpecificCombos(self, file_names, gene_lists, flip_order):
        specific_combos = []
        if len(file_names) > 1 and len(gene_lists) > 1:
            if flip_order:
                specific_combos.append(file_names[0] + ":" + gene_lists[1] + " " +
                                       file_names[1] + ":" + gene_lists[1])
            else:
                specific_combos.append(file_names[1] + ":" + gene_lists[1] + " " +
                                       file_names[0] + ":" + gene_lists[1])
        for file in file_names:
            for gene_list in gene_lists:
                if gene_list != "null_gene_list":
                    specific_combos.append(file + ":" + gene_list)
                if len(specific_combos) > 4:
                    return specific_combos
        return specific_combos

    def assertSpecificComboGeneration(self, arguments, specific_combos):
        arguments.specific_combos = specific_combos
        ml_service = MachineLearningService(arguments)
        gene_list_combos = ml_service.determineGeneListCombos()
        filtered_combos = ml_service.determineSpecificCombos(gene_list_combos)
        assert len(filtered_combos) == len(specific_combos)

    def testFullAnalysisSansGeneListRandomForestRegressor(self):
        self.evaluateModelFullAnalysisSansGeneList(RandomForestTrainer(False))

    def testFullAnalysisSansGeneListRandomForestClassifier(self):
        self.evaluateModelFullAnalysisSansGeneList(RandomForestTrainer(True))

    def testFullAnalysisSansGeneListLinearSVMRegressor(self):
        self.evaluateModelFullAnalysisSansGeneList(LinearSVMTrainer(False))

    def testFullAnalysisSansGeneListLinearSVMClassifier(self):
        self.evaluateModelFullAnalysisSansGeneList(LinearSVMTrainer(True))

    def testFullAnalysisSansGeneListRadialBasisFunctionSVMRegressor(self):
        self.evaluateModelFullAnalysisSansGeneList(RadialBasisFunctionSVMTrainer(False))

    def testFullAnalysisSansGeneListRadialBasisFunctionSVMClassifier(self):
        self.evaluateModelFullAnalysisSansGeneList(RadialBasisFunctionSVMTrainer(True))

    def testFullAnalysisSansGeneListElasticNetRegressor(self):
        self.evaluateModelFullAnalysisSansGeneList(ElasticNetTrainer(False))

    def testFullAnalysisSansGeneListRidgeRegressor(self):
        self.evaluateModelFullAnalysisSansGeneList(RidgeRegressionTrainer(False))

    def testFullAnalysisSansGeneListLassoRegressor(self):
        self.evaluateModelFullAnalysisSansGeneList(LassoRegressionTrainer(False))

    def evaluateModelFullAnalysisSansGeneList(self, trainer):
        processed_args = self.formatRandomizedData(trainer.is_classifier, True, False)
        processed_args.analyze_all = True
        ml_service = MachineLearningService(processed_args)
        ml_service.log.setLevel(logging.DEBUG)
        trainer.log.setLevel(logging.DEBUG)
        self.analyzeAndAssertResults(ml_service, 1, trainer, True, False)

    def testStaticFeaturesAnalysis(self):
        trainer = ElasticNetTrainer(False)
        processed_args = self.formatRandomizedData(trainer.is_classifier, False, True)
        assert len(processed_args.static_features) > 0
        ml_service = MachineLearningService(processed_args)
        ml_service.log.setLevel(logging.DEBUG)
        trainer.log.setLevel(logging.DEBUG)
        self.analyzeAndAssertResults(ml_service, 8, trainer, False, True)
class HTMLWritingService(object):
    log = LoggerFactory.createLog(__name__)

    RECORD_FILE = "FullResultsSummary.txt"
    SUMMARY_FILE = "SummaryReport.html"

    def __init__(self, input_folder, is_classifier):
        self.input_folder = input_folder
        self.is_classifier = is_classifier

    def writeSummaryFile(self):
        self.createStatsOverviewFromFile()

    def createStatsOverviewFromFile(self):
        stats_overview_object = self.generateStatsOverviewObject()
        new_file = self.generateNewReportFile(stats_overview_object)
        with open(self.input_folder + "/" + self.SUMMARY_FILE, "w") as summary_file:
            try:
                for line in new_file:
                    summary_file.write(line)
            except ValueError as valueError:
                self.log.error(valueError)
            finally:
                summary_file.close()

    def generateStatsOverviewObject(self):
        stats_overview_object = {}
        with open(self.input_folder + "/" + self.RECORD_FILE) as record_file:
            try:
                for line_index, line in enumerate(record_file):
                    line_split = [segment.strip() for segment in line.split("---")]
                    # Four segments are required below (indices 0-3), so guard against
                    # anything shorter; the original guard of < 3 would still index
                    # out of bounds on line_split[3].
                    if len(line_split) < 4:
                        self.log.warning("Line from results file not split properly: %s", line)
                        continue
                    scores = self.translateToNumericList(line_split[2])
                    accuracies = self.translateToNumericList(line_split[3])
                    if stats_overview_object.get(line_split[0]) is None:
                        stats_overview_object[line_split[0]] = {line_split[1]: [scores, accuracies]}
                    else:
                        stats_overview_object[line_split[0]][line_split[1]] = [scores, accuracies]
            except ValueError as value_error:
                self.log.error(value_error)
            finally:
                record_file.close()
        return stats_overview_object

    def translateToNumericList(self, line_split):
        return [SafeCastUtil.safeCast(val, float)
                for val in line_split.replace("[", "").replace("]", "").split(",")]

    def generateNewReportFile(self, stats_overview_object):
        path_of_this_file = os.path.realpath(__file__)
        template_path = os.path.abspath(os.path.join(path_of_this_file, os.pardir)) + \
                        "/Reports/reportTemplate.html"
        new_file = []
        with open(template_path) as template_file:
            try:
                for line_index, line in enumerate(template_file):
                    if "//INSERT DEFAULT MIN SCORE HERE" in line:
                        new_file.append("\t\t\t\tvar DEFAULT_MIN_SCORE = " +
                                        SafeCastUtil.safeCast(AbstractModelTrainer.DEFAULT_MIN_SCORE, str) + ";\n")
                    elif "//INSERT CHART DATA HERE" in line:
                        new_file.append("\t\t\t\t$scope.allData = " +
                                        SafeCastUtil.safeCast(stats_overview_object, str) + ";\n")
                    elif "//INSERT IS CLASSIFIER HERE" in line:
                        new_file.append("\t\t\t\t$scope.isClassifier = " +
                                        SafeCastUtil.safeCast(self.is_classifier, str).lower() + ";\n")
                    else:
                        new_file.append(line)
            except ValueError as valueError:
                self.log.error(valueError)
            finally:
                template_file.close()
        return new_file
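For reference, generateStatsOverviewObject expects RECORD_FILE lines with four "---"-separated segments: key, sub-key, scores, accuracies. A standalone sketch of the same parsing (the example line below is made up):

line = "mut:listA gex:listB --- RandomForestAnalysis --- [0.61, 0.58] --- [0.7, 0.69]"
segments = [segment.strip() for segment in line.split("---")]

def to_numeric_list(segment):
    # Mirrors translateToNumericList: strip brackets, split on commas, cast to float.
    return [float(val) for val in segment.replace("[", "").replace("]", "").split(",")]

scores, accuracies = to_numeric_list(segments[2]), to_numeric_list(segments[3])
print(segments[0], segments[1], scores, accuracies)
# mut:listA gex:listB RandomForestAnalysis [0.61, 0.58] [0.7, 0.69]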
class ArgumentProcessingService(object): log = LoggerFactory.createLog(__name__) ARGUMENTS_FILE = "arguments.txt" GENE_LISTS = "gene_list" UNFILLED_VALUE_PLACEHOLDER = "'0'" RESULTS = "results" IS_CLASSIFIER = "is_classifier" FEATURES = "features" FEATURE_NAMES = "featureNames" INNER_MONTE_CARLO_PERMUTATIONS = "inner_monte_carlo_permutations" OUTER_MONTE_CARLO_PERMUTATIONS = "outer_monte_carlo_permutations" DATA_SPLIT = "data_split" NUM_THREADS = "num_threads" ALGORITHM_CONFIGS = "algorithm_configs" RECORD_DIAGNOSTICS = "record_diagnostics" STATIC_FEATURES = "static_features"
# RSEN Specific Arguments
RSEN_P_VAL = "rsen_p_val" RSEN_K_VAL = "rsen_k_val" RSEN_COMBINE_GENE_LISTS = "rsen_combine_gene_lists" BINARY_CATEGORICAL_MATRIX = "binary_categorical_matrix"
# For AnalysisType.FULL_CLA_SPECIFIC_COMBO
SPECIFIC_COMBOS = "specific_combos"
# For AnalysisType.NO_GENE_LISTS
IGNORE_GENE_LISTS = "ignore_gene_lists" NUM_TOP_FEATURES = "num_top_features"
# For AnalysisType.INDIVIDUAL_TRAIN
INDIVIDUAL_TRAIN_ALGORITHM = "individual_train_algorithm" INDIVIDUAL_TRAIN_HYPERPARAMS = "individual_train_hyperparams" INDIVIDUAL_TRAIN_FEATURE_GENE_LIST_COMBO = "individual_train_combo"
# For AnalysisType.RECOMMENDATIONS
VIABILITY_ACCEPTANCE = "viability_acceptance"
def __init__(self, input_folder): self.input_folder = input_folder
def handleInputFolder(self): directory_contents = os.listdir(self.input_folder) if not self.validateDirectoryContents(directory_contents): self.log.error("Invalid directory contents, needs a %s file.", self.ARGUMENTS_FILE) return None arguments = self.fetchArguments(self.input_folder + "/" + self.ARGUMENTS_FILE) results_file = arguments.get(self.RESULTS) is_classifier = SafeCastUtil.safeCast(arguments.get(self.IS_CLASSIFIER), int) == 1 analyze_all = self.fetchOrReturnDefault(arguments.get(self.IGNORE_GENE_LISTS), bool, False) algorithm_configs = self.handleAlgorithmConfigs(arguments) if arguments.get(self.IS_CLASSIFIER) is None or results_file is None: self.log.error("Unable to perform CLA analysis. Must explicitly state is_classifier and declare the results file in the %s file.", self.ARGUMENTS_FILE) return None results_list = self.validateAndExtractResults(results_file, is_classifier) gene_lists = self.extractGeneLists() if len(gene_lists) <= 1 and not analyze_all: self.log.error("Unable to perform standard CLA analysis. No gene lists found in the target folder.") return None write_diagnostics = self.fetchOrReturnDefault(arguments.get(self.RECORD_DIAGNOSTICS), bool, False) feature_files = [file for file in os.listdir(self.input_folder) if self.fileIsFeatureFile(file, results_file)] static_feature_files = [feature_file for feature_file in self.fetchOrReturnDefault(arguments.get(self.STATIC_FEATURES), str, "").split(",") if len(feature_file.strip()) > 0] if analyze_all: feature_map = self.createAndValidateFullFeatureMatrix(results_list, feature_files) else: feature_map = self.createAndValidateFeatureMatrix(results_list, gene_lists, write_diagnostics, feature_files, static_feature_files) binary_cat_matrix = self.fetchBinaryCatMatrixIfApplicable(arguments, gene_lists, results_list, analyze_all, static_feature_files) if not feature_map or not results_list: return None inner_monte_carlo_perms = self.fetchOrReturnDefault(arguments.get(self.INNER_MONTE_CARLO_PERMUTATIONS), int, 10) outer_monte_carlo_perms = self.fetchOrReturnDefault(arguments.get(self.OUTER_MONTE_CARLO_PERMUTATIONS), int, 10) data_split = self.fetchOrReturnDefault(arguments.get(self.DATA_SPLIT), float, 0.8) num_threads = self.fetchOrReturnDefault(arguments.get(self.NUM_THREADS), int, multiprocessing.cpu_count()) individual_train_config = self.createIndividualTrainConfig(arguments) rsen_config = self.createRSENConfig(arguments, binary_cat_matrix) univariate_config = self.createUnivariateConfig(arguments, analyze_all) specific_combos = self.determineSpecificCombos(arguments.get(self.SPECIFIC_COMBOS)) recs_config = self.createRecommendationsConfig(arguments) return ProcessedArguments(results_list, is_classifier, feature_map, gene_lists, inner_monte_carlo_perms, outer_monte_carlo_perms, data_split, algorithm_configs, num_threads, write_diagnostics, individual_train_config, rsen_config, recs_config, univariate_config, specific_combos, static_feature_files)
def validateDirectoryContents(self, directory_contents): return self.ARGUMENTS_FILE in directory_contents
def fetchArguments(self, arguments_file): arguments = {} with open(arguments_file) as data_file: try: for line in data_file: line_trimmed_split = line.strip().split("=") if len(line_trimmed_split) > 1: arguments[line_trimmed_split[0]] = line_trimmed_split[1] except ValueError as value_error: self.log.error(value_error) finally: self.log.debug("Closing file %s", arguments_file) data_file.close() return arguments
def validateAndExtractResults(self, results_file, is_classifier): sample_list = [] cast_type = float if is_classifier: cast_type = int results_path = self.input_folder + "/" + results_file with open(results_path) as data_file: try: for line_index, line in enumerate(data_file): if len(re.findall(r'^\s*$', line)) > 0 or line_index == 0:
# header or whitespace
continue line_trimmed_split = line.strip().split(",") if len(line_trimmed_split) != 2: self.log.error("Each line in %s must be 2 columns. Aborting argument processing.", results_file) raise ValueError("Each line in results file must be 2 columns.") cell_line = line_trimmed_split[0] cell_result = SafeCastUtil.safeCast(line_trimmed_split[1], cast_type) if cell_line in [sample[0] for sample in sample_list]: self.log.error("Repeated cell line name: %s. Aborting argument processing.", cell_line) raise ValueError("Repeated cell line name.") else: sample_list.append([cell_line, cell_result]) except ValueError as value_error: self.log.error(value_error) finally: self.log.debug("Closing file %s", results_file) data_file.close() return sample_list
def extractGeneLists(self): gene_lists = {"null_gene_list": []} files = os.listdir(self.input_folder) for file in [f for f in files if self.GENE_LISTS in f]: file_path = self.input_folder + "/" + file with open(file_path) as gene_list_file: genes = gene_list_file.read().strip().split(",") genes_deduped = [] [genes_deduped.append(g.strip()) for g in genes if g.strip() not in genes_deduped and len(g.strip()) > 0] if len(genes_deduped) > 0: gene_lists[file.split(".csv")[0]] = genes_deduped else: self.log.warning("No genes found in gene list %s, will not process.", file) return gene_lists
def createAndValidateFullFeatureMatrix(self, results_list, feature_files): frames = [] cell_lines = [result[0] for result in results_list] for file in feature_files: self.log.info("Fetching all features for file %s", file) frames.append(self.fetchFullDataframe(cell_lines, file)) combined_frame = pandas.concat(frames, axis=1, join='inner') transposed_dict = combined_frame.T.to_dict() self.log.info("Formatting all features across all files.") return self.formatFullFeatureMatrix(SafeCastUtil.safeCast(combined_frame.columns, list), transposed_dict)
def fetchFullDataframe(self, cell_lines, file): file_name = file.split(".")[0] features_path = self.input_folder + "/" + file try: frame = pandas.read_csv(features_path) except ValueError as value_error: self.log.error("Make sure feature file %s is well formed with no superfluous commas.", file) raise value_error frame = frame.loc[:, ~frame.columns.str.contains('^Unnamed')] frame.columns = [file_name + "." + feature for feature in frame.columns] frame.index = cell_lines columns = SafeCastUtil.safeCast(frame.columns, list)
# keep only uniquely named feature columns; duplicated names are dropped entirely
frame = frame.loc[:, [columns.count(feature) == 1 for feature in columns]] return frame
def formatFullFeatureMatrix(self, feature_names, transposed_dict): feature_matrix = {self.FEATURE_NAMES: feature_names} all_cell_lines = SafeCastUtil.safeCast(transposed_dict.keys(), list) num_cell_lines = len(all_cell_lines) for i in range(num_cell_lines): values = SafeCastUtil.safeCast(transposed_dict[all_cell_lines[i]].values(), list) formatted_values = [self.formatValue(value) for value in values] feature_matrix[all_cell_lines[i]] = SafeCastUtil.safeCast(formatted_values, list) return feature_matrix
def formatValue(self, value): value_as_float = SafeCastUtil.safeCast(value, float) if value_as_float is not None: return value_as_float else: return value.strip()
def fetchUniqueFeatureNamesAndIndices(self, line_split, file_name): unvalidated_features = [file_name + "." + name.strip() for name in line_split if len(name.strip()) > 0] valid_indices = [] valid_features = [] for i in range(0, len(unvalidated_features)): if unvalidated_features.count(unvalidated_features[i]) == 1: valid_indices.append(i) for i in range(0, len(unvalidated_features)): if i in valid_indices: valid_features.append(unvalidated_features[i]) return valid_indices, valid_features
def createAndValidateFeatureMatrix(self, results_list, gene_lists, write_diagnostics, feature_files, static_feature_files): incomplete_features = [] for file in [feature_file for feature_file in feature_files if feature_file not in static_feature_files]: features_path = self.input_folder + "/" + file validated_features, num_features = self.validateGeneLists(features_path, file, gene_lists) incomplete_features.append([file, validated_features, num_features]) if write_diagnostics: self.writeDiagnostics(incomplete_features) feature_matrix = {self.FEATURE_NAMES: []} for file in feature_files: features_path = self.input_folder + "/" + file if file not in static_feature_files: self.extractFeatureMatrix(feature_matrix, features_path, file, gene_lists, results_list) else: data_frame = self.fetchFullDataframe([result[0] for result in results_list], file) feature_names = SafeCastUtil.safeCast(data_frame.columns, list) transposed_dict = data_frame.T.to_dict() formatted_matrix = self.formatFullFeatureMatrix(feature_names, transposed_dict) for key in formatted_matrix.keys(): if key in feature_matrix: [feature_matrix[key].append(value) for value in formatted_matrix[key]] else: feature_matrix[key] = formatted_matrix[key] return feature_matrix
def validateGeneLists(self, features_path, file, gene_lists): features_missing_from_files = {} num_features = 0 with open(features_path) as feature_file: try: for line_index, line in enumerate(feature_file): if line_index == 0: feature_names = line.split(",") num_features = len(feature_names) features_missing_from_files = self.validateAndTrimGeneList(feature_names, gene_lists, file) break except ValueError as value_error: self.log.error(value_error) return features_missing_from_files, num_features finally: self.log.debug("Closing file %s", feature_file) feature_file.close() return features_missing_from_files, num_features
def validateAndTrimGeneList(self, feature_list, gene_lists, file): unused_features = {} for key in gene_lists.keys(): for gene in gene_lists[key]: if gene not in [feature.strip() for feature in feature_list]: index = gene_lists[key].index(gene) if unused_features.get(key) is None: unused_features[key] = [[gene, index]] else: unused_features[key].append([gene, (index + len(unused_features[key]))]) self.log.warning("Incomplete dataset: gene %s from gene list %s not found in file %s. Will not process this gene in this file.", gene, key, file) return unused_features
def writeDiagnostics(self, features_removed): message = "" for feature_file in features_removed: message += "\nFeatures from gene list(s) not available in " + feature_file[0] + ":\n" for gene_list in feature_file[1].keys(): num_genes_missing = len(feature_file[1][gene_list]) percent_genes_missing = round((num_genes_missing / feature_file[2]) * 100, 2) message += ("\t" + SafeCastUtil.safeCast(num_genes_missing, str) + " (" + SafeCastUtil.safeCast(percent_genes_missing, str) + " %" + ") features not present in " + gene_list + ".csv:\n") for gene in feature_file[1][gene_list]: message += ("\t\t" + gene[0] + " at index " + SafeCastUtil.safeCast(gene[1], str) + "\n") message += "\n\n######################\n\n" DiagnosticsFileWriter.writeToFile(self.input_folder, message, self.log)
def extractFeatureMatrix(self, feature_matrix, features_path, file, gene_lists, results_list): self.log.info("Extracting important features for %s.", file) gene_list_features = [] for gene_list in gene_lists.values(): for gene_list_feature in gene_list: if gene_list_feature not in gene_list_features: gene_list_features.append(gene_list_feature) with open(features_path) as feature_file: try: important_feature_indices = [] for line_index, line in enumerate(feature_file): if line_index == 0: feature_names = line.split(",") for gene_list_feature in gene_list_features: important_index = None feature_name = self.determineFeatureName(gene_list_feature, file) for i in range(0, len(feature_names)): if feature_names[i].strip() == gene_list_feature.strip(): important_index = i if feature_name not in feature_matrix[self.FEATURE_NAMES]: feature_matrix[self.FEATURE_NAMES].append(feature_name) important_feature_indices.append(important_index) else: features = self.extractCastedFeatures(line, important_feature_indices) try: cell_line = results_list[line_index - 1] except IndexError as index_error: self.log.error("Index out of range. Results file is shorter than feature file [%s]: %s", feature_file, SafeCastUtil.safeCast(index_error, str)) raise ValueError("Make sure there are no extra lines (including whitespace) in ALL feature files and only feature files you want to analyze are in target folder.") if not cell_line[0] in feature_matrix: feature_matrix[cell_line[0]] = features else: feature_matrix[cell_line[0]] = feature_matrix[cell_line[0]] + features if line_index > len(results_list): self.log.error("Invalid line count for %s", file) raise ValueError("Invalid line count for " + file + ". Must be " + SafeCastUtil.safeCast(len(results_list) + 1, str) + " lines long.") except ValueError as value_error: self.log.error("Please verify results file is the same number of rows as all feature files.") self.log.error(value_error) return None finally: self.log.debug("Closing file %s", feature_file) feature_file.close()
def fileIsFeatureFile(self, file, results_file): algorithm_files = [algo + ".csv" for algo in SupportedMachineLearningAlgorithms.fetchAlgorithms()] return file != results_file and file != self.ARGUMENTS_FILE and self.GENE_LISTS not in file and file not in algorithm_files and ".csv" in file.lower()
def determineFeatureName(self, feature_name, file): return SafeCastUtil.safeCast(file.split(".")[0] + "." + feature_name.strip(), str)
def extractCastedFeatures(self, line, important_feature_indices): important_features = [] feature_values = line.strip().split(",") for index in important_feature_indices: if index is None:
# TODO: Verify that this is acceptable, it works for one hot encoding and should never vary in any model
important_features.append(self.UNFILLED_VALUE_PLACEHOLDER) else: if SafeCastUtil.safeCast(feature_values[index], float) is not None: important_features.append(SafeCastUtil.safeCast(feature_values[index].strip(), float)) else: important_features.append(SafeCastUtil.safeCast(feature_values[index].strip(), str)) return important_features
def handleAlgorithmConfigs(self, arguments): algos = SupportedMachineLearningAlgorithms.fetchAlgorithms() configs = {} default_inner_perms = self.fetchOrReturnDefault(arguments.get(self.INNER_MONTE_CARLO_PERMUTATIONS), int, 10) default_outer_perms = self.fetchOrReturnDefault(arguments.get(self.OUTER_MONTE_CARLO_PERMUTATIONS), int, 10) for algo in algos: algo_config = arguments.get(algo) if algo_config is None: configs[algo] = [True, default_inner_perms, default_outer_perms] else: config_split = [param.strip() for param in algo_config.split(",")] if len(config_split) >= 3: configs[algo] = [config_split[0] == 'True', SafeCastUtil.safeCast(config_split[1], int), SafeCastUtil.safeCast(config_split[2], int)] return configs
def createRSENConfig(self, arguments, binary_cat_matrix): rsen_p_val = self.fetchOrReturnDefault(arguments.get(self.RSEN_P_VAL), float, 0.0) rsen_k_val = self.fetchOrReturnDefault(arguments.get(self.RSEN_K_VAL), float, 0.1) rsen_combine_gene_lists = self.fetchOrReturnDefault(arguments.get(self.RSEN_COMBINE_GENE_LISTS), bool, False) rsen_config = RSENConfig(binary_cat_matrix, rsen_p_val, rsen_k_val, rsen_combine_gene_lists) return rsen_config
def createUnivariateConfig(self, arguments, analyze_all): num_top_features = self.fetchOrReturnDefault(arguments.get(self.NUM_TOP_FEATURES), int, 147) return UnivariateConfig(analyze_all, num_top_features)
def createIndividualTrainConfig(self, arguments): individual_train_algorithm = self.fetchOrReturnDefault(arguments.get(self.INDIVIDUAL_TRAIN_ALGORITHM), str, None) individual_train_hyperparams = self.fetchOrReturnDefault(arguments.get(self.INDIVIDUAL_TRAIN_HYPERPARAMS), str, "") individual_train_feature_gene_list_combo = self.fetchOrReturnDefault(arguments.get(self.INDIVIDUAL_TRAIN_FEATURE_GENE_LIST_COMBO), str, None) individual_train_config = IndividualTrainConfig(individual_train_algorithm, individual_train_hyperparams, individual_train_feature_gene_list_combo) return individual_train_config
def createRecommendationsConfig(self, arguments): viability_acceptance = self.fetchOrReturnDefault(arguments.get(self.VIABILITY_ACCEPTANCE), float, None) recs_config = RecommendationsConfig(viability_acceptance) return recs_config
def fetchBinaryCatMatrixIfApplicable(self, arguments, gene_lists, results_list, analyze_all, static_feature_files): binary_matrix_file = arguments.get(ArgumentProcessingService.BINARY_CATEGORICAL_MATRIX) if binary_matrix_file is not None: if analyze_all: return self.createAndValidateFullFeatureMatrix(results_list, [binary_matrix_file]) return self.createAndValidateFeatureMatrix(results_list, gene_lists, False, [binary_matrix_file], static_feature_files) else: return None
def fetchOrReturnDefault(self, field, to_type, default): if field: if field.lower() == 'false' and to_type is bool: return False return SafeCastUtil.safeCast(field, to_type) else: return default
def determineSpecificCombos(self, combos): if combos is None: return [] return [combo.strip().replace("\"", "") for combo in combos.split(",")]
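# Illustrative sketch (not part of the original sources): the arguments.txt contract
# that fetchArguments() above implements is one key=value pair per line. The helper
# below is a self-contained stand-in with the same parsing rule; the sample keys and
# values are hypothetical.
def parse_arguments_lines(lines):
    arguments = {}
    for line in lines:
        # split on "=" and keep the first two fields, like fetchArguments()
        line_trimmed_split = line.strip().split("=")
        if len(line_trimmed_split) > 1:
            arguments[line_trimmed_split[0]] = line_trimmed_split[1]
    return arguments

result = parse_arguments_lines(["results=results.csv", "is_classifier=1"])
print(result)  # {'results': 'results.csv', 'is_classifier': '1'}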
from LoggerFactory import LoggerFactory
log = LoggerFactory.getLogger('fridge')
friend = 'cooker'
def first(): log.info('Hi, I\'m the fridge')
def second(): log.warning('Some strange things happen near my place')
def third(): log.error('OMG, the %s exploded!', friend)
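# Hedged usage sketch for the logging demo module above (assuming it lives in a
# module named fridge and LoggerFactory attaches a standard handler):
#     import fridge
#     fridge.first()    # emits an INFO record: "Hi, I'm the fridge"
#     fridge.second()   # emits a WARNING record
#     fridge.third()    # emits an ERROR record interpolating friend -> "cooker"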
import sys
import os
from ArgumentProcessingService import ArgumentProcessingService
from LoggerFactory import LoggerFactory
from MachineLearningService import MachineLearningService
from HTMLWritingService import HTMLWritingService
from RecommendationsService import RecommendationsService
from Utilities.SafeCastUtil import SafeCastUtil
from Utilities.FileConverter import FileConverter
log = LoggerFactory.createLog(__name__)
def main(): arguments = sys.argv[1:] if len(arguments) == 0: promptUserForInput() elif len(arguments) == 2 and arguments[0] == '0': runMainCellLineAnalysis(arguments[1]) elif len(arguments) == 2 and arguments[0] == '1': FileConverter.convertMatLabToCSV(arguments[1]) elif len(arguments) == 2 and arguments[0] == '2': fetchRecommendations(arguments[1]) else: log.error("Exiting program, invalid command line arguments.") return
def promptUserForInput(): simulation_to_run = input(
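# Sketch of the command-line contract implied by main() above; the script name
# "__main__.py" is an assumption based on this module's role:
#     python __main__.py               -> promptUserForInput() (interactive)
#     python __main__.py 0 <folder>    -> runMainCellLineAnalysis(<folder>)
#     python __main__.py 1 <folder>    -> FileConverter.convertMatLabToCSV(<folder>)
#     python __main__.py 2 <folder>    -> fetchRecommendations(<folder>)
# Any other argument combination logs an error and exits.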
class RecommendationsService(object): log = LoggerFactory.createLog(__name__) PRE_REC_ANALYSIS_FILE = "PreRecAnalysis.csv" PREDICTIONS_FILE = "Predictions.csv" PREDICTIONS_BY_CELL_LINE_FILE = "PredictionsByCellLine.csv" HEADER = "header" STD_DEVIATION = "std_deviation" MEAN = "mean" MEDIAN = "median" def __init__(self, inputs): self.inputs = inputs def analyzeRecommendations(self, input_folder): self.preRecsAnalysis(input_folder) self.recommendByHoldout(input_folder) self.writeFinalRecsResults(input_folder) def preRecsAnalysis(self, input_folder): self.log.info("Performing pre-recs analysis on all drugs.") drugs = self.inputs.keys() cell_line_predictions_by_drug = OrderedDict() header = numpy.concatenate( (["cell_line"], SafeCastUtil.safeCast(drugs, list)), axis=0) cell_line_predictions_by_drug[self.HEADER] = header cell_line_predictions_by_drug[self.STD_DEVIATION] = [ self.STD_DEVIATION ] cell_line_predictions_by_drug[self.MEAN] = [self.MEAN] cell_line_predictions_by_drug[self.MEDIAN] = [self.MEDIAN] for drug in drugs: processed_arguments = self.inputs.get(drug) results = processed_arguments.results combos = self.determineGeneListCombos(processed_arguments) processed_arguments.data_split = 1.0 data_formatting_service = DataFormattingService( processed_arguments) formatted_inputs = data_formatting_service.formatData(True, True) self.log.info("Determining best combo and score for drug %s.", drug) recs_model_info = self.fetchBestModelComboAndScore( drug, input_folder, formatted_inputs, results, combos, processed_arguments) if recs_model_info is None or recs_model_info.model is None or recs_model_info.combo is None: continue self.generateMultiplePredictions(recs_model_info, formatted_inputs, results, cell_line_predictions_by_drug) for cell_line in cell_line_predictions_by_drug: while len(cell_line_predictions_by_drug[cell_line]) < \ len(cell_line_predictions_by_drug[RecommendationsService.HEADER]): cell_line_predictions_by_drug[cell_line].append( MachineLearningService.DELIMITER) self.writePreRecAnalysisFile(cell_line_predictions_by_drug, input_folder) def generateMultiplePredictions(self, recs_model_info, formatted_inputs, results, cell_line_predictions_by_drug): trimmed_matrix = GeneListComboUtility.trimMatrixByFeatureSet( DataFormattingService.TRAINING_MATRIX, recs_model_info.combo, formatted_inputs, AnalysisType.RECOMMENDATIONS) features, relevant_results = recs_model_info.trainer.populateFeaturesAndResultsByCellLine( trimmed_matrix, results) cell_lines_in_order = [ key for key in trimmed_matrix.keys() if key is not ArgumentProcessingService.FEATURE_NAMES ] predictions = recs_model_info.model.predict(features) for i in range(0, len(cell_lines_in_order)): cell_line = cell_lines_in_order[i] if cell_line_predictions_by_drug.get(cell_line) is not None: cell_line_predictions_by_drug[cell_line].append(predictions[i]) else: max_dict_length = 2 for key in cell_line_predictions_by_drug.keys(): if key == self.HEADER: continue if len(cell_line_predictions_by_drug[key] ) > max_dict_length: max_dict_length = len( cell_line_predictions_by_drug[key]) row = [cell_line] for _ in range(2, max_dict_length): row.append(MachineLearningService.DELIMITER) row.append(predictions[i]) cell_line_predictions_by_drug[cell_line] = row cell_line_predictions_by_drug[self.STD_DEVIATION].append( numpy.std(predictions)) cell_line_predictions_by_drug[self.MEAN].append( numpy.mean(predictions)) cell_line_predictions_by_drug[self.MEDIAN].append( numpy.median(predictions)) def writePreRecAnalysisFile(self, 
cell_line_predictions_by_drug, input_folder): with open(input_folder + "/" + self.PRE_REC_ANALYSIS_FILE, "w", newline='') as pre_rec_analysis_file: try: writer = csv.writer(pre_rec_analysis_file) for key in [key for key in cell_line_predictions_by_drug.keys() if key is not RecommendationsService.STD_DEVIATION and key is not RecommendationsService.MEDIAN and key is not RecommendationsService.MEAN]: writer.writerow(cell_line_predictions_by_drug.get(key)) writer.writerow(cell_line_predictions_by_drug[RecommendationsService.MEAN]) writer.writerow(cell_line_predictions_by_drug[RecommendationsService.MEDIAN]) writer.writerow(cell_line_predictions_by_drug[RecommendationsService.STD_DEVIATION]) except ValueError as error: self.log.error("Error writing to file %s. %s", pre_rec_analysis_file, error) finally: pre_rec_analysis_file.close()
def recommendByHoldout(self, input_folder):
# TODO: Support for inputs to be a dict of drug_name => input, not just one set of inputs for all drugs.
self.log.info("Starting recommendation by holdout analysis on all drugs.") max_nodes = multiprocessing.cpu_count() for drug in self.inputs.keys(): self.log.info("Starting recommendation by holdout analysis on specific drug %s.", drug) self.handleDrug(drug, input_folder, max_nodes, self.inputs.get(drug))
def handleDrug(self, drug, input_folder, max_nodes, processed_arguments): combos = self.determineGeneListCombos(processed_arguments) cell_line_map = processed_arguments.features results = processed_arguments.results cloned_inputs = copy.deepcopy(processed_arguments) cloned_inputs.data_split = 1.0 data_formatting_service = DataFormattingService(cloned_inputs) formatted_inputs = data_formatting_service.formatData(True, True) feature_names = formatted_inputs.get(ArgumentProcessingService.FEATURE_NAMES) requested_threads = processed_arguments.num_threads nodes_to_use = numpy.amin([requested_threads, max_nodes]) Parallel(n_jobs=nodes_to_use)(delayed(self.handleCellLine)(cell_line, combos, drug, feature_names, formatted_inputs, input_folder, processed_arguments, results) for cell_line in cell_line_map.keys())
def handleCellLine(self, cell_line, combos, drug, feature_names, formatted_inputs, input_folder, processed_arguments, results): if cell_line == ArgumentProcessingService.FEATURE_NAMES: return self.log.info("Holding out cell line %s for drug %s", cell_line, drug) trimmed_cell_lines, trimmed_results = self.removeNonNullCellLineFromFeaturesAndResults(cell_line, formatted_inputs, results) recs_model_info = self.fetchBestModelComboAndScore(drug, input_folder, trimmed_cell_lines, trimmed_results, combos, processed_arguments) if recs_model_info is None or recs_model_info.model is None or recs_model_info.combo is None: self.log.warning("Unable to train best model or get best combo for cell line %s and drug %s.", cell_line, drug) return prediction = self.generateSinglePrediction(recs_model_info.model, recs_model_info.combo, cell_line, feature_names, formatted_inputs) self.writeToPredictionsCsvInLock(cell_line, drug, input_folder, prediction, recs_model_info.score)
def writeToPredictionsCsvInLock(self, cell_line, drug, input_folder, prediction, score): self.log.debug("Locking current thread %s.", threading.current_thread())
# note: this lock object is created per call rather than shared, so concurrent callers are not actually serialized by it
lock = threading.Lock() lock.acquire(True) write_action = "w" if self.PREDICTIONS_FILE in os.listdir(input_folder): write_action = "a" with open(input_folder + "/" + self.PREDICTIONS_FILE, write_action, newline='') as predictions_file: try: writer = csv.writer(predictions_file) if write_action == "w": 
writer.writerow( ["Drug", "Cell_Line", "Prediction", "R2^Score"]) line = [ drug, cell_line, SafeCastUtil.safeCast(prediction, str), SafeCastUtil.safeCast(score, str) ] writer.writerow(line) except ValueError as error: self.log.error("Error writing to file %s. %s", self.PREDICTIONS_FILE, error) finally: predictions_file.close() self.log.debug("Releasing current thread %s.", threading.current_thread()) lock.release() def determineGeneListCombos(self, processed_arguments): gene_lists = processed_arguments.gene_lists feature_names = processed_arguments.features.get( ArgumentProcessingService.FEATURE_NAMES) static_features = processed_arguments.static_features combos, expected_length = GeneListComboUtility.determineCombos( gene_lists, feature_names, static_features) if len(combos) != expected_length: self.log.warning( "Unexpected number of combos detected, should be %s but instead created %s.\n%s", expected_length, len(combos), combos) return combos def removeNonNullCellLineFromFeaturesAndResults(self, cell_line, formatted_inputs, results): cloned_formatted_data = copy.deepcopy(formatted_inputs) if cell_line is not None: del cloned_formatted_data.get( DataFormattingService.TRAINING_MATRIX)[cell_line] cloned_results = [ result for result in results if result[0] is not cell_line and cell_line is not None ] return cloned_formatted_data, cloned_results def getDrugFolders(self, input_folder): folders = os.listdir(input_folder) # TODO: Figure out required phrase to mark it as a drug folder drug_folders = [f for f in folders if 'Analysis' in f] return drug_folders def fetchBestModelComboAndScore(self, drug, analysis_files_folder, trimmed_cell_lines, trimmed_results, combos, processed_arguments): # TODO: ultimately we'd want to use multiple algorithms, and make an ensemble prediction/prescription. # But for now, let's stick with one algorithm. best_combo_string = None best_scoring_algo = None optimal_hyperparams = None top_score = AbstractModelTrainer.DEFAULT_MIN_SCORE for analysis_file_name in self.fetchAnalysisFiles( drug, analysis_files_folder): file = analysis_files_folder + "/" + drug + "/" + analysis_file_name with open(file, 'rt') as analysis_file: reader = csv.reader(analysis_file) try: header = [] indices_of_outer_loops = [] line_index = -1 for row in reader: line_index += 1 if line_index == 0: header = row for i in range(0, len(row)): if MachineLearningService.SCORE_AND_HYPERPARAM_PHRASE in row[ i]: indices_of_outer_loops.append(i) continue string_combo = row[header.index( MachineLearningService.FEATURE_FILE_GENE_LIST_COMBO )] score = SafeCastUtil.safeCast( row[header.index( self.scorePhrase(processed_arguments))], float) if score is not None and score > top_score: best_scoring_algo = analysis_file_name.split( ".")[0] best_combo_string = string_combo top_score = score optimal_hyperparams = self.fetchBestHyperparams( row, indices_of_outer_loops) except ValueError as valueError: self.log.error(valueError) finally: self.log.debug("Closing file %s", analysis_file) analysis_file.close() if top_score <= 0: # TODO - Consider writing this to an explicit diagnostic file via extracting to first class service, # not just the process error log. 
self.log.error( 'Error: no method found an R2 higher than 0 for drug: %s.', drug) return None best_combo = self.determineBestComboFromString(best_combo_string, combos, processed_arguments) best_model, trainer = self.trainBestModelWithCombo( best_scoring_algo, best_combo, optimal_hyperparams, trimmed_cell_lines, trimmed_results, processed_arguments) return RecommendationsModelInfo(trainer, top_score, best_combo, best_model) def scorePhrase(self, processed_arguments): if processed_arguments.is_classifier: return MachineLearningService.PERCENT_ACCURATE_PREDICTIONS return MachineLearningService.R_SQUARED_SCORE def fetchBestHyperparams(self, row, indices_of_outer_loops): monte_carlo_results = self.getMonteCarloResults( row, indices_of_outer_loops) best_hyps = None top_score = AbstractModelTrainer.DEFAULT_MIN_SCORE max_num_occurrences = 0 best_hyps_list = [] for hyperparam in SafeCastUtil.safeCast(monte_carlo_results.keys(), list): if len(monte_carlo_results.get(hyperparam)) > max_num_occurrences: max_num_occurrences = len(monte_carlo_results.get(hyperparam)) best_hyps_list = [hyperparam] elif len(monte_carlo_results.get( hyperparam)) == max_num_occurrences: best_hyps_list.append(hyperparam) if len(best_hyps_list) == 1: best_hyps = hyperparam top_score = numpy.average(monte_carlo_results.get(hyperparam)) elif len(best_hyps_list) > 1: top_score = 0 for hyperparam in best_hyps_list: if numpy.average( monte_carlo_results.get(hyperparam)) > top_score: top_score = numpy.average( monte_carlo_results.get(hyperparam)) best_hyps = hyperparam return best_hyps def getMonteCarloResults(self, row, indices_of_outer_loops): hyperparams_to_scores = {} for i in range(0, len(row)): if i in indices_of_outer_loops: score_and_hyperparam = row[i].split( MachineLearningService.DELIMITER) score = SafeCastUtil.safeCast(score_and_hyperparam[0], float) if hyperparams_to_scores.get( score_and_hyperparam[1]) is not None: hyperparams_to_scores[score_and_hyperparam[1]].append( score) else: hyperparams_to_scores[score_and_hyperparam[1]] = [score] return hyperparams_to_scores def fetchAnalysisFiles(self, drug, input_folder): files = os.listdir(input_folder + "/" + drug) return [file for file in files if "Analysis.csv" in file] def trainBestModelWithCombo(self, best_scoring_algo, best_scoring_combo, optimal_hyperparams, trimmed_cell_lines, trimmed_results, processed_arguments): is_classifier = processed_arguments.is_classifier rsen_config = processed_arguments.rsen_config training_matrix = GeneListComboUtility.trimMatrixByFeatureSet( DataFormattingService.TRAINING_MATRIX, best_scoring_combo, trimmed_cell_lines, AnalysisType.RECOMMENDATIONS) trainer = ModelTrainerFactory.createTrainerFromTargetAlgorithm( is_classifier, best_scoring_algo, rsen_config) features, relevant_results = trainer.populateFeaturesAndResultsByCellLine( training_matrix, trimmed_results) params = DictionaryUtility.toDict(optimal_hyperparams) feature_names = training_matrix.get( ArgumentProcessingService.FEATURE_NAMES) model = trainer.buildModel(relevant_results, features, params, feature_names) return model, trainer def determineBestComboFromString(self, best_combo_string, combos, processed_arguments): gene_lists = processed_arguments.gene_lists combine_gene_lists = processed_arguments.rsen_config.combine_gene_lists analysis_type = processed_arguments.analysisType() static_features = processed_arguments.static_features for combo in combos: feature_set_string = GeneListComboUtility.generateFeatureSetString( combo, gene_lists, combine_gene_lists, analysis_type, 
static_features) if GeneListComboUtility.combosAreEquivalent(feature_set_string, best_combo_string): return combo raise ValueError("Unable to determine feature set from given combo gene list and feature file combo: " + best_combo_string + ".\n Please make sure all gene lists and feature files in the combo " + "are present in the drug folder.")
def generateSinglePrediction(self, best_model, best_combo, cell_line, all_features, formatted_inputs): omitted_cell_line = formatted_inputs.get(DataFormattingService.TRAINING_MATRIX).get(cell_line) input_wrapper = OrderedDict() input_wrapper[DataFormattingService.TRAINING_MATRIX] = OrderedDict() input_wrapper[DataFormattingService.TRAINING_MATRIX][cell_line] = omitted_cell_line input_wrapper[ArgumentProcessingService.FEATURE_NAMES] = all_features trimmed_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TRAINING_MATRIX, best_combo, input_wrapper, AnalysisType.RECOMMENDATIONS) return best_model.predict([trimmed_matrix.get(cell_line)])[0]
def writeFinalRecsResults(self, input_folder): drug_scores_by_cell_line = self.fetchDrugScoresByCellLine(input_folder) self.writePredictionsByCellLine(drug_scores_by_cell_line, input_folder)
def fetchDrugScoresByCellLine(self, input_folder): predictions_file = input_folder + "/" + RecommendationsService.PREDICTIONS_FILE drug_scores_by_cell_line = {} with open(predictions_file) as input_file: try: for line_index, line in enumerate(input_file): if line_index == 0: continue line_split = line.split(",") drug = line_split[0] cell_line = line_split[1] score = SafeCastUtil.safeCast(line_split[2], float) if not drug or not cell_line or score is None: self.log.warning("Invalid line detected for %s at line %s.", predictions_file, line_index + 1) continue if not drug_scores_by_cell_line.get(cell_line): drug_scores_by_cell_line[cell_line] = [(drug, score)] else: drug_scores_by_cell_line[cell_line].append((drug, score)) except ValueError as error: self.log.error("Error parsing predictions file %s. %s", predictions_file, error) return drug_scores_by_cell_line
def writePredictionsByCellLine(self, drug_scores_by_cell_line, input_folder): total_drugs = numpy.max([len(drugs) for drugs in drug_scores_by_cell_line.values()]) header = ["Cell Line"] best_drug = " best drug" best_drug_score = " best drug score" for i in range(1, total_drugs + 1): suffix = MachineLearningService.generateNumericalSuffix(i) header.append(SafeCastUtil.safeCast(i, str) + suffix + best_drug) header.append(SafeCastUtil.safeCast(i, str) + suffix + best_drug_score) predictions_by_cell_line_path = input_folder + "/" + RecommendationsService.PREDICTIONS_BY_CELL_LINE_FILE with open(predictions_by_cell_line_path, "w", newline='') as predictions_by_cell_line_file: try: writer = csv.writer(predictions_by_cell_line_file) writer.writerow(header) for cell_line in drug_scores_by_cell_line.keys(): drug_scores = sorted(drug_scores_by_cell_line.get(cell_line), reverse=True, key=lambda x: x[1]) row = [cell_line] for drug_and_score in drug_scores: row.append(drug_and_score[0]) row.append(drug_and_score[1]) writer.writerow(row) except ValueError as error: self.log.error("Error writing to %s. %s", predictions_by_cell_line_file, error)
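# Self-contained sketch (not from the original sources) of the Predictions.csv layout
# produced by writeToPredictionsCsvInLock() above; the drug and cell line values are
# fabricated placeholders.
import csv
import io

buffer = io.StringIO()
writer = csv.writer(buffer)
writer.writerow(["Drug", "Cell_Line", "Prediction", "R2^Score"])  # header, written once per file
writer.writerow(["drug_a", "cell_line_1", "0.42", "0.61"])        # one row per held-out cell line
print(buffer.getvalue())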
class MachineLearningService(object): log = LoggerFactory.createLog(__name__) MAXIMUM_FEATURES_RECORDED = 20 DELIMITER = " --- " #TODO: consider extracting these to a helper class. SCORE_AND_HYPERPARAM_PHRASE = "score and optimal hyperparams for outer perm " FEATURE_FILE_GENE_LIST_COMBO = "feature file: gene list combo" R_SQUARED_SCORE = "R^2 score" PERCENT_ACCURATE_PREDICTIONS = "percentage accurate predictions" def __init__(self, data): self.inputs = data def analyze(self, input_folder): gene_list_combos = self.determineGeneListCombos() is_classifier = self.inputs.is_classifier analysis_type = self.inputs.analysisType() if analysis_type is AnalysisType.INDIVIDUAL_TRAIN: self.analyzeIndividualGeneListCombo(gene_list_combos, input_folder, is_classifier) elif analysis_type is AnalysisType.FULL_CLA_SPECIFIC_COMBO: self.analyzeGeneListCombos(self.determineSpecificCombos(gene_list_combos), input_folder, is_classifier) else: self.analyzeGeneListCombos(gene_list_combos, input_folder, is_classifier) def determineGeneListCombos(self): feature_names = self.inputs.features.get(ArgumentProcessingService.FEATURE_NAMES) if self.inputs.analysisType() is AnalysisType.NO_GENE_LISTS: return [[feature_names]] gene_lists = self.inputs.gene_lists static_features = self.inputs.static_features combos, expected_length = GeneListComboUtility.determineCombos(gene_lists, feature_names, static_features) if len(combos) != expected_length: self.log.warning("Unexpected number of combos detected, should be %s but instead created %s.\n%s", expected_length, len(combos), combos) return combos def analyzeIndividualGeneListCombo(self, gene_list_combos, input_folder, is_classifier): config = self.inputs.individual_train_config target_combo = config.combo target_algorithm = config.algorithm rsen_config = self.inputs.rsen_config outer_monte_carlo_loops = self.inputs.outer_monte_carlo_permutations for gene_list_combo in gene_list_combos: plain_text_name = self.generateFeatureSetString(gene_list_combo) if plain_text_name == target_combo: trainer = ModelTrainerFactory.createTrainerFromTargetAlgorithm(is_classifier, target_algorithm, rsen_config) hyperparams = self.fetchAndCastHyperparams(config, trainer) for permutation in range(0, outer_monte_carlo_loops): results = self.inputs.results formatted_data = self.formatData(self.inputs, True, True) training_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TRAINING_MATRIX, gene_list_combo, formatted_data, self.inputs.analysisType()) testing_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TESTING_MATRIX, gene_list_combo, formatted_data, self.inputs.analysisType()) features, relevant_results = trainer.populateFeaturesAndResultsByCellLine(training_matrix, results) feature_names = training_matrix.get(ArgumentProcessingService.FEATURE_NAMES) model = trainer.buildModel(relevant_results, features, hyperparams, feature_names) model_score = trainer.fetchPredictionsAndScore(model, testing_matrix, results) score = model_score[0] accuracy = model_score[1] importances = trainer.fetchFeatureImportances(model, feature_names) for key in importances.keys(): importances[key] = [importances[key]] ordered_importances = self.averageAndSortImportances(importances, 1) ordered_phrases = self.averageAndSortImportantRSENPhrases( trainer.fetchModelPhrases(model, gene_list_combo), trainer) numbered_combo = target_combo + " RUN " + SafeCastUtil.safeCast(permutation, str) self.log.debug("Final score and accuracy of individual analysis for feature gene combo %s " 
"using algorithm %s: %s, %s", numbered_combo, target_algorithm, score, accuracy) score_and_hyperparam = [self.generateScoreAndHyperParam(score, hyperparams)] line = self.generateLine(accuracy, numbered_combo, ordered_importances, ordered_phrases, score, score_and_hyperparam) self.writeToCSVInLock(line, input_folder, target_algorithm, outer_monte_carlo_loops, 1) return self.log.info("Gene list feature file %s combo not found in current dataset.", target_combo) return def fetchAndCastHyperparams(self, config, trainer): hyperparams = config.hyperparams.split(",") hyperparam_dict = OrderedDict() keys = SafeCastUtil.safeCast(trainer.hyperparameters.keys(), list) for i in range(0, len(keys)): hyperparam_dict[keys[i]] = SafeCastUtil.safeCast(hyperparams[i], float) return hyperparam_dict def generateLine(self, accuracy, combo, ordered_importances, ordered_phrases, score, score_and_hyperparam): return numpy.concatenate([[combo, score, accuracy], score_and_hyperparam, ordered_importances, ordered_phrases]) def generateScoreAndHyperParam(self, score, hyperparam): return SafeCastUtil.safeCast(score, str) + self.DELIMITER + DictionaryUtility.toString(hyperparam) def shouldTrainAlgorithm(self, algorithm): configs = self.inputs.algorithm_configs return configs is not None and configs.get(algorithm) is not None and configs.get(algorithm)[0] def determineSpecificCombos(self, all_combos): specific_combos = self.inputs.specific_combos selected_combos = {} for specific_combo in specific_combos: for combo in all_combos: combo_string = self.generateFeatureSetString(combo) if specific_combo == combo_string and selected_combos.get(combo_string) is None: selected_combos[combo_string] = combo else: equivalent_combos = GeneListComboUtility.combosAreEquivalent(combo_string, specific_combo) if equivalent_combos and selected_combos.get(combo_string) is None: selected_combos[combo_string] = combo selected_combo_names = SafeCastUtil.safeCast(selected_combos.keys(), list) if len(selected_combo_names) < len(specific_combos): self.log.warning("Not all specified combos were available in this data folder.\n" "Specified combos: %s\n Selected combos: %s", specific_combos, selected_combo_names) else: self.log.info("Only running analysis on following combos:\n %s", selected_combo_names) return SafeCastUtil.safeCast(selected_combos.values(), list) def analyzeGeneListCombos(self, gene_list_combos, input_folder, is_classifier): rsen_config = self.inputs.rsen_config for algo in SupportedMachineLearningAlgorithms.fetchAlgorithms(): if self.shouldTrainAlgorithm(algo): trainer = None try: trainer = ModelTrainerFactory.createTrainerFromTargetAlgorithm(is_classifier, algo, rsen_config) except ValueError as valueError: self.log.error("Improper configuration for algorithm: [%s], %s.", algo, valueError) finally: if trainer is not None: trainer.logTrainingMessage(self.monteCarloPermsByAlgorithm(algo, True), self.monteCarloPermsByAlgorithm(algo, False), len(gene_list_combos)) self.handleParallellization(gene_list_combos, input_folder, trainer) def monteCarloPermsByAlgorithm(self, algorithm, outer): monte_carlo_config = self.inputs.algorithm_configs.get(algorithm) return monte_carlo_config[1] if outer else monte_carlo_config[2] def handleParallellization(self, gene_list_combos, input_folder, trainer): max_nodes = multiprocessing.cpu_count() requested_threads = self.inputs.num_threads nodes_to_use = numpy.amin([requested_threads, max_nodes]) valid_combos = self.fetchValidGeneListCombos(input_folder, gene_list_combos, trainer) 
Parallel(n_jobs=nodes_to_use)(delayed(self.runMonteCarloSelection)(feature_set, trainer, input_folder, len(valid_combos)) for feature_set in valid_combos) GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log) def fetchValidGeneListCombos(self, input_folder, gene_list_combos, trainer): valid_combos = [feature_set for feature_set in gene_list_combos if trainer.shouldProcessFeatureSet(feature_set)] rsen_config = self.inputs.rsen_config if trainer.algorithm == SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET and \ rsen_config.combine_gene_lists: all_genes = GeneListComboUtility.fetchAllGeneListGenesDeduped(self.inputs.gene_lists) # TODO: Can fail if "." in feature name. bin_cat_matrix = rsen_config.binary_cat_matrix.get(ArgumentProcessingService.FEATURE_NAMES)[0].split(".")[0] full_gene_list = [bin_cat_matrix + "." + gene for gene in all_genes if len(gene.strip()) > 0] new_combos = [] for combo in valid_combos: new_combo = [] for feature_set in combo: if bin_cat_matrix in feature_set[0]: new_combo.append(full_gene_list) else: new_combo.append(feature_set) if new_combo not in new_combos: new_combos.append(new_combo) return self.trimAnalyzedCombos(input_folder, new_combos, trainer) else: return self.trimAnalyzedCombos(input_folder, valid_combos, trainer) def trimAnalyzedCombos(self, input_folder, valid_combos, trainer): file_name = trainer.algorithm + ".csv" if file_name not in os.listdir(input_folder): return valid_combos existing_combo_strings = [] with open(input_folder + "/" + file_name) as analyzed_file: try: for line_index, line in enumerate(analyzed_file): if line_index == 0: continue existing_combo_strings.append(line.strip().split(",")[0]) except ValueError as error: self.log.error("Error reading existing combos from analysis file: %s", analyzed_file, error) finally: analyzed_file.close() trimmed_combos = [] for combo in valid_combos: if self.generateFeatureSetString(combo) not in existing_combo_strings: trimmed_combos.append(combo) return trimmed_combos def runMonteCarloSelection(self, feature_set, trainer, input_folder, num_combos): scores = [] accuracies = [] importances = {} feature_set_as_string = self.generateFeatureSetString(feature_set) outer_perms = self.monteCarloPermsByAlgorithm(trainer.algorithm, True) important_rsen_phrases = {} scores_and_hyperparams = [] for i in range(1, outer_perms + 1): self.log.info("Computing outer Monte Carlo Permutation %s for %s.\n", i, feature_set_as_string) formatted_data = self.formatData(self.inputs, True, True) if self.inputs.analysisType() is AnalysisType.NO_GENE_LISTS: self.logKeptFeatures(formatted_data, i, input_folder, trainer) self.log.info("Creating train and test matrices by feature set: %s.", feature_set_as_string) training_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TRAINING_MATRIX, feature_set, formatted_data, self.inputs.analysisType()) testing_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TESTING_MATRIX, feature_set, formatted_data, self.inputs.analysisType()) optimal_hyperparams = self.determineOptimalHyperparameters(feature_set, formatted_data, trainer) record_diagnostics = self.inputs.record_diagnostics trainer.logIfBestHyperparamsOnRangeThreshold(optimal_hyperparams, record_diagnostics, input_folder) trainer.logOptimalHyperParams(optimal_hyperparams, self.generateFeatureSetString(feature_set), record_diagnostics, input_folder) prediction_data = self.fetchOuterPermutationModelScore(feature_set, trainer, optimal_hyperparams, testing_matrix, 
training_matrix) scores.append(prediction_data[0]) accuracies.append(prediction_data[1]) for importance in prediction_data[2].keys(): if importances.get(importance) is not None: importances[importance].append(prediction_data[2].get(importance)) else: importances[importance] = [prediction_data[2].get(importance)] if len(prediction_data) == 4 and \ trainer.algorithm == SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET: for phrase in prediction_data[3].keys(): if important_rsen_phrases.get(phrase) is not None: important_rsen_phrases[phrase].append(prediction_data[3].get(phrase)) else: important_rsen_phrases[phrase] = [prediction_data[3].get(phrase)] scores_and_hyperparams.append(self.generateScoreAndHyperParam(prediction_data[0], optimal_hyperparams)) GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log) average_score = numpy.mean(scores) average_accuracy = numpy.mean(accuracies) self.log.debug("Average score and accuracy of all Monte Carlo runs for %s: %s, %s", feature_set_as_string, average_score, average_accuracy) ordered_importances = self.averageAndSortImportances(importances, outer_perms) ordered_phrases = self.averageAndSortImportantRSENPhrases(important_rsen_phrases, trainer) line = self.generateLine(average_accuracy, feature_set_as_string, ordered_importances, ordered_phrases, average_score, scores_and_hyperparams) self.writeToCSVInLock(line, input_folder, trainer.algorithm, num_combos, outer_perms) self.saveOutputToTxtFile(scores, accuracies, feature_set_as_string, input_folder, trainer.algorithm) def generateFeatureSetString(self, feature_set): return GeneListComboUtility.generateFeatureSetString(feature_set, self.inputs.gene_lists, self.inputs.rsen_config.combine_gene_lists, self.inputs.analysisType(), self.inputs.static_features) def fetchOuterPermutationModelScore(self, feature_set, trainer, optimal_hyperparams, testing_matrix, training_matrix): # TODO: Handle hyperparams with n results = self.inputs.results features, relevant_results = trainer.populateFeaturesAndResultsByCellLine(training_matrix, results) feature_names = training_matrix.get(ArgumentProcessingService.FEATURE_NAMES) model = trainer.buildModel(relevant_results, features, optimal_hyperparams, feature_names) score, accuracy = trainer.fetchPredictionsAndScore(model, testing_matrix, results) # TODO: This should be it's own class. return [score, accuracy, trainer.fetchFeatureImportances(model, feature_names), trainer.fetchModelPhrases(model, feature_set)] def averageAndSortImportances(self, importances, outer_loops): for key in importances.keys(): if len(importances[key]) < outer_loops: self.log.warning("Different amount of importances for feature %s than expected. 
Should be %s but is " "instead %s.", key, outer_loops, len(importances[key])) while len(importances[key]) < outer_loops: importances[key].append(0.0) ordered = [] [ordered.append({"feature": key, "importance": numpy.sum(importances[key]) / outer_loops}) for key in importances.keys()] ordered = sorted(ordered, key=lambda k: k["importance"], reverse=True) trimmed = ordered[:self.MAXIMUM_FEATURES_RECORDED] final_imps = [] for i in range(0, self.MAXIMUM_FEATURES_RECORDED): if i < len(trimmed): summary = trimmed[i].get("feature") + self.DELIMITER + \ SafeCastUtil.safeCast(trimmed[i].get("importance"), str) final_imps.append(summary) else: final_imps.append("") return final_imps def averageAndSortImportantRSENPhrases(self, important_rsen_phrases, trainer): if trainer.algorithm == SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET: ordered_phrases = [] [ordered_phrases.append({"phrase": key, "score": numpy.average(important_rsen_phrases[key])}) for key in important_rsen_phrases.keys()] ordered_phrases = sorted(ordered_phrases, key=lambda k: k["score"], reverse=True) trimmed = ordered_phrases[:self.MAXIMUM_FEATURES_RECORDED] final_phrases = [] for i in range(0, self.MAXIMUM_FEATURES_RECORDED): if i < len(trimmed): summary = trimmed[i].get("phrase") + self.DELIMITER + \ SafeCastUtil.safeCast(trimmed[i].get("score"), str) final_phrases.append(summary) else: final_phrases.append("") return final_phrases else: return [] def determineInnerHyperparameters(self, feature_set, formatted_data, trainer): inner_model_hyperparams = {} inner_perms = self.monteCarloPermsByAlgorithm(trainer.algorithm, False) for j in range(1, inner_perms + 1): self.log.info("Computing inner Monte Carlo Permutation %s for %s.\n", j, self.generateFeatureSetString(feature_set)) GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log) formatted_inputs = self.reformatInputsByTrainingMatrix( formatted_data.get(DataFormattingService.TRAINING_MATRIX), formatted_data.get(ArgumentProcessingService.FEATURE_NAMES)) further_formatted_data = self.formatData(formatted_inputs, False, False) inner_validation_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TESTING_MATRIX, feature_set, further_formatted_data, formatted_inputs.analysisType()) inner_train_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TRAINING_MATRIX, feature_set, further_formatted_data, formatted_inputs.analysisType()) model_data = trainer.hyperparameterize(inner_train_matrix, inner_validation_matrix, self.inputs.results) for data in model_data.keys(): if inner_model_hyperparams.get(data) is not None: inner_model_hyperparams[data].append(model_data[data]) else: inner_model_hyperparams[data] = [model_data[data]] return inner_model_hyperparams def formatData(self, inputs, should_scale, should_one_hot_encode): data_formatting_service = DataFormattingService(inputs) return data_formatting_service.formatData(should_scale, should_one_hot_encode) def logKeptFeatures(self, formatted_data, monte_carlo_perm, input_folder, trainer): features_by_file = {} for full_feature in formatted_data.get(ArgumentProcessingService.FEATURE_NAMES): feature_split = full_feature.split(".") file = feature_split[0] feature = feature_split[1] if features_by_file.get(file) is None: features_by_file[file] = [feature] else: features_by_file[file].append(feature) message = "Only using the following features for outer Monte Carlo loop " +\ SafeCastUtil.safeCast(monte_carlo_perm, str) + ". 
All other features have been removed.\n" for file in features_by_file.keys(): message += ("\t" + file + ":\n") for feature in features_by_file[file]: message += ("\t\t" + feature + "\n") self.log.info(message) if self.inputs.record_diagnostics: DiagnosticsFileWriter.writeToFile(input_folder, message, self.log) def reformatInputsByTrainingMatrix(self, training_matrix, feature_names): real_inputs = self.inputs features = {} results = [] features[ArgumentProcessingService.FEATURE_NAMES] = feature_names for training_cell in training_matrix.keys(): for input_cell in real_inputs.features.keys(): if training_cell is input_cell: features[training_cell] = training_matrix.get(training_cell) for result in real_inputs.results: if result[0] is training_cell: results.append(result) break break cloned_univariate_config = deepcopy(real_inputs.univariate_config) cloned_univariate_config.analyze_all = False return ProcessedArguments(results, real_inputs.is_classifier, features, real_inputs.gene_lists, real_inputs.inner_monte_carlo_permutations, real_inputs.outer_monte_carlo_permutations, real_inputs.data_split, real_inputs.algorithm_configs, real_inputs.num_threads, real_inputs.record_diagnostics, real_inputs.individual_train_config, real_inputs.rsen_config, real_inputs.recs_config, cloned_univariate_config, real_inputs.specific_combos, real_inputs.static_features) def determineOptimalHyperparameters(self, feature_set, formatted_data, trainer): inner_model_hyperparams = self.determineInnerHyperparameters(feature_set, formatted_data, trainer) highest_average = trainer.DEFAULT_MIN_SCORE best_hyperparam = {} for hyperparam_set in inner_model_hyperparams.keys(): if hyperparam_set == AbstractModelTrainer.ADDITIONAL_DATA: continue average = numpy.average([results[0] for results in inner_model_hyperparams[hyperparam_set]]) # raw score if average > highest_average: best_hyperparam = DictionaryUtility.toDict(hyperparam_set) highest_average = average additional_data = inner_model_hyperparams.get(AbstractModelTrainer.ADDITIONAL_DATA) if additional_data: best_hyperparam[AbstractModelTrainer.ADDITIONAL_DATA] = additional_data return best_hyperparam def writeToCSVInLock(self, line, input_folder, ml_algorithm, num_combos, outer_perms): lock = threading.Lock() lock.acquire(True) self.lockThreadMessage() file_name = ml_algorithm + ".csv" write_action = "w" if file_name in os.listdir(input_folder): write_action = "a" with open(input_folder + "/" + file_name, write_action, newline='') as csv_file: try: writer = csv.writer(csv_file) if write_action == "w": writer.writerow(self.getCSVFileHeader(self.inputs.is_classifier, ml_algorithm, outer_perms)) writer.writerow(line) except ValueError as error: self.log.error("Error writing to file %s. %s", file_name, error) finally: csv_file.close() total_lines = 0 with open(input_folder + "/" + file_name) as csv_file: try: reader = csv.reader(csv_file) total_lines += (len(SafeCastUtil.safeCast(reader, list)) - 1) except ValueError as error: self.log.error("Error reading lines from file %s. 
%s", file_name, error) finally: csv_file.close() self.logPercentDone(total_lines, num_combos, ml_algorithm) self.unlockThreadMessage() lock.release()
@staticmethod
def getCSVFileHeader(is_classifier, ml_algorithm, outer_perms): header = [MachineLearningService.FEATURE_FILE_GENE_LIST_COMBO] if is_classifier: header.append(MachineLearningService.PERCENT_ACCURATE_PREDICTIONS) header.append("accuracy score") else: header.append(MachineLearningService.R_SQUARED_SCORE) header.append("mean squared error") for i in range(1, outer_perms + 1): header.append(MachineLearningService.SCORE_AND_HYPERPARAM_PHRASE + SafeCastUtil.safeCast(i, str)) if ml_algorithm == SupportedMachineLearningAlgorithms.RADIAL_BASIS_FUNCTION_SVM: return header feature_analysis = " most important feature" for i in range(1, MachineLearningService.MAXIMUM_FEATURES_RECORDED + 1): suffix = MachineLearningService.generateNumericalSuffix(i) header.append(SafeCastUtil.safeCast(i, str) + suffix + feature_analysis) if ml_algorithm == SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET: phrase_analysis = " most significant boolean phrase" for i in range(1, MachineLearningService.MAXIMUM_FEATURES_RECORDED + 1): suffix = MachineLearningService.generateNumericalSuffix(i) header.append(SafeCastUtil.safeCast(i, str) + suffix + phrase_analysis) return header
@staticmethod
def generateNumericalSuffix(i):
# ordinal suffixes: 1st, 2nd, 3rd, 4th, ..., 11th-13th, 21st, 22nd, 23rd, ...
if i % 10 == 1 and i % 100 != 11: return "st" elif i % 10 == 2 and i % 100 != 12: return "nd" elif i % 10 == 3 and i % 100 != 13: return "rd" else: return "th"
def logPercentDone(self, total_lines, num_combos, ml_algorithm): percent_done, percentage_bar = PercentageBarUtility.calculateAndCreatePercentageBar(total_lines, num_combos) self.log.info("Total progress for %s: %s%% done:\n %s", ml_algorithm, percent_done, percentage_bar)
def saveOutputToTxtFile(self, scores, accuracies, feature_set_as_string, input_folder, algorithm):
# note: this lock object is created per call rather than shared, so concurrent callers are not actually serialized by it
lock = threading.Lock() self.lockThreadMessage() lock.acquire(True) file_name = HTMLWritingService.RECORD_FILE write_action = "w" if file_name in os.listdir(input_folder): write_action = "a" with open(input_folder + "/" + file_name, write_action) as output_file: try: output_file.write(algorithm + MachineLearningService.DELIMITER + feature_set_as_string + MachineLearningService.DELIMITER + SafeCastUtil.safeCast(scores, str) + MachineLearningService.DELIMITER + SafeCastUtil.safeCast(accuracies, str) + "\n") except ValueError as error: self.log.error("Error saving output of %s analysis to memory: %s", algorithm, error) finally: output_file.close() self.unlockThreadMessage() lock.release()
def lockThreadMessage(self): self.log.debug("Locking current thread %s.", threading.current_thread())
def unlockThreadMessage(self): self.log.debug("Releasing current thread %s.", threading.current_thread())
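# Minimal sketch of the "<score> --- <hyperparams>" cell format that
# generateScoreAndHyperParam() above writes for each outer Monte Carlo permutation.
# str() stands in for DictionaryUtility.toString(), and the hyperparameter name
# "alpha" is hypothetical.
DELIMITER = " --- "

def generate_score_and_hyperparam(score, hyperparams):
    return str(score) + DELIMITER + str(hyperparams)

print(generate_score_and_hyperparam(0.73, {"alpha": 0.5}))
# -> 0.73 --- {'alpha': 0.5}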
import os
import unittest

import numpy as np
import pandas as pd

# Project-internal imports below are reconstructed from usage; the exact module
# paths are assumed from the surrounding codebase.
from ArgumentProcessingService import ArgumentProcessingService
from DataFormattingService import DataFormattingService
from LoggerFactory import LoggerFactory
from RandomizedDataGenerator import RandomizedDataGenerator
from SafeCastUtil import SafeCastUtil


class DataFormattingServiceIT(unittest.TestCase):

    log = LoggerFactory.createLog(__name__)

    def setUp(self):
        self.current_working_dir = os.getcwd()  # Should be this package.
        input_folder = self.current_working_dir + "/SampleClassifierDataFolder"
        self.instantiateDataFormattingService(input_folder)

    def tearDown(self):
        if self.current_working_dir != "/":
            for file in os.listdir(self.current_working_dir + "/" +
                                   RandomizedDataGenerator.GENERATED_DATA_FOLDER):
                if file == "__init__.py":
                    continue
                os.remove(self.current_working_dir + "/" +
                          RandomizedDataGenerator.GENERATED_DATA_FOLDER + "/" + file)

    def instantiateDataFormattingService(self, input_folder):
        argument_processing_service = ArgumentProcessingService(input_folder)
        arguments = argument_processing_service.handleInputFolder()
        self.data_formatting_service = DataFormattingService(arguments)

    def fetchTrainAndTestData(self):
        s = self.data_formatting_service
        features = pd.read_csv('SampleClassifierDataFolder/features.csv', delimiter=',')
        results = pd.read_csv('SampleClassifierDataFolder/results.csv', delimiter=',')
        x_train, x_test, y_train, y_test = s.testTrainSplit(
            features, results, self.data_formatting_service.inputs.data_split)
        return x_test, x_train, y_test, y_train

    def testFormattingDataRandomizesMatrices(self):
        # Format the same input twice; the randomized train/test split should
        # produce a different ordering of trained cell lines between runs.
        original_outputs = self.data_formatting_service.formatData(True)
        self.validateOutput(original_outputs)

        self.instantiateDataFormattingService(self.current_working_dir +
                                              "/SampleClassifierDataFolder")
        new_outputs = self.data_formatting_service.formatData(True)
        self.validateOutput(new_outputs)

        original_trained_cells = SafeCastUtil.safeCast(
            original_outputs.get(DataFormattingService.TRAINING_MATRIX).keys(), list)
        new_trained_cells = SafeCastUtil.safeCast(
            new_outputs.get(DataFormattingService.TRAINING_MATRIX).keys(), list)
        non_identical_matrices = False
        for i in range(0, len(new_trained_cells)):
            if original_trained_cells[i] != new_trained_cells[i]:
                non_identical_matrices = True
        assert non_identical_matrices

    def testFormattingRandomizedData(self):
        self.validateOutput(self.formatRandomizedData(True))
        self.validateOutput(self.formatRandomizedData(False))

    def formatRandomizedData(self, is_classifier):
        arguments = self.processArguments(is_classifier, False, 150)
        data_formatting_service = DataFormattingService(arguments)
        return data_formatting_service.formatData(True)

    def processArguments(self, is_classifier, analyze_all, num_features):
        random_data_generator = RandomizedDataGenerator(
            RandomizedDataGenerator.GENERATED_DATA_FOLDER)
        random_data_generator.generateRandomizedFiles(5, 50, num_features, is_classifier,
                                                      10, .8, analyze_all=analyze_all)
        input_folder = self.current_working_dir + "/" + \
            RandomizedDataGenerator.GENERATED_DATA_FOLDER
        argument_processing_service = ArgumentProcessingService(input_folder)
        arguments = argument_processing_service.handleInputFolder()
        return arguments

    def testTrimmingDoesNotTrimSignificantFeatures(self):
        significant_prefix = RandomizedDataGenerator.SIGNIFICANT_FEATURE_PREFIX
        arguments = self.processArguments(True, True, 1000)
        arguments.univariate_config.analyze_all = True
        assert arguments.univariate_config.num_top_features == 147

        orig_features = arguments.features.get(ArgumentProcessingService.FEATURE_NAMES)
        orig_sig_features = [feature for feature in orig_features
                             if significant_prefix in feature]

        data_formatting_service = DataFormattingService(arguments)
        output = data_formatting_service.formatData(True)
        trimmed_features = output.get(ArgumentProcessingService.FEATURE_NAMES)
        trimmed_sig_features = [feature for feature in trimmed_features
                                if significant_prefix in feature]

        training_matrix = output.get(DataFormattingService.TRAINING_MATRIX)
        testing_matrix = output.get(DataFormattingService.TESTING_MATRIX)
        expected_feature_count = 735
        for matrix in [training_matrix, testing_matrix]:
            for cell_line in matrix:
                assert len(matrix[cell_line]) == expected_feature_count

        # Trimming must shrink the overall feature set without dropping any
        # significant features.
        assert len(orig_features) > len(trimmed_features)
        assert len(orig_sig_features) == len(trimmed_sig_features)
        assert len(trimmed_features) == expected_feature_count

    def testNumFeaturesInUnivariateModeCanBeTuned(self):
        arguments = self.processArguments(True, True, 1000)
        arguments.univariate_config.analyze_all = True
        arguments.univariate_config.num_top_features = 10
        data_formatting_service = DataFormattingService(arguments)
        output = data_formatting_service.formatData(True)
        training_matrix = output.get(DataFormattingService.TRAINING_MATRIX)
        testing_matrix = output.get(DataFormattingService.TESTING_MATRIX)
        expected_feature_count = 50
        for matrix in [training_matrix, testing_matrix]:
            for cell_line in matrix:
                assert len(matrix[cell_line]) == expected_feature_count

    @staticmethod
    def validateOutput(output):
        assert output is not None
        assert output.get(DataFormattingService.TRAINING_MATRIX) is not None
        assert output.get(DataFormattingService.TESTING_MATRIX) is not None
        num_train = len(output.get(DataFormattingService.TRAINING_MATRIX).keys())
        num_test = len(output.get(DataFormattingService.TESTING_MATRIX).keys())
        assert num_train > num_test

    def testCheckImportData(self):
        features = np.genfromtxt(self.current_working_dir +
                                 '/SampleClassifierDataFolder/features.csv',
                                 delimiter=',')
        results = np.genfromtxt(self.current_working_dir +
                                '/SampleClassifierDataFolder/results.csv',
                                delimiter=',')
        assert np.array(features[1:]).dtype == "float64"
        assert np.array(results[1:, 1]).dtype == "float64"
        assert not np.isnan(features[1:]).any()
        assert not np.isnan(results[1:, 1]).any()
        assert len(features) == len(results)

    def testCheckOneHotEncoding(self):
        s = self.data_formatting_service
        categorical_pd = pd.read_csv('SampleClassifierDataFolder/categorical.csv',
                                     delimiter=',')
        assert (s.binaryOneHot(categorical_pd).dtypes.values != np.dtype('float64')).all()
        assert (s.oneHot(categorical_pd).dtypes.values != np.dtype('float64')).all()

    def testSplit(self):
        x_test, x_train, y_test, y_train = self.fetchTrainAndTestData()
        assert len(x_train) != 0 and len(x_test) != 0 and \
            len(y_train) != 0 and len(y_test) != 0

    def testStratifySplit(self):
        x_test, x_train, y_test, y_train = self.fetchTrainAndTestData()
        assert len(x_train) != 0 and len(x_test) != 0 and \
            len(y_train) != 0 and len(y_test) != 0
        categorical_pd = pd.read_csv(self.current_working_dir +
                                     '/SampleClassifierDataFolder/categorical.csv',
                                     delimiter=',')
        data_formatting_service = DataFormattingService(None)
        categorical_onehot = data_formatting_service.oneHot(categorical_pd)
        assert (np.shape(categorical_onehot))[1] == 2

    def testFeatureOrderIsPreserved(self):
        original_input = self.data_formatting_service.inputs.features
        self.data_formatting_service.analyze_all = False  # Don't attempt trimming.
        formatted_output = self.data_formatting_service.formatData(False, False)
        self.validateMatrixOrderHasNotChanged(formatted_output, original_input,
                                              DataFormattingService.TESTING_MATRIX)
        self.validateMatrixOrderHasNotChanged(formatted_output, original_input,
                                              DataFormattingService.TRAINING_MATRIX)

    def validateMatrixOrderHasNotChanged(self, formatted_output, original_input, matrix):
        for cell_line in formatted_output.get(matrix).keys():
            formatted_features = formatted_output.get(matrix).get(cell_line)
            original_features = original_input.get(cell_line)
            assert original_features == formatted_features

    def testFeatureScaling(self):
        x_test, x_train, y_test, y_train = self.fetchTrainAndTestData()
        self.scaleFeaturesAndAssert(x_test)
        self.scaleFeaturesAndAssert(x_train)

    def scaleFeaturesAndAssert(self, x_vals):
        feature_one_orig = list(x_vals.get("feature_one"))
        feature_two_orig = list(x_vals.get("feature_two"))
        feature_three_orig = list(x_vals.get("feature_three"))
        scaled_test = self.data_formatting_service.maybeScaleFeatures(x_vals, True)
        assert scaled_test
        scaled_test_vals_as_list = SafeCastUtil.safeCast(scaled_test.values(), list)
        self.assertFeaturesScaled(feature_one_orig, scaled_test_vals_as_list, 0)
        self.assertFeaturesScaled(feature_two_orig, scaled_test_vals_as_list, 1)
        self.assertFeaturesScaled(feature_three_orig, scaled_test_vals_as_list, 2)

    def assertFeaturesScaled(self, feature, scaled_test_vals_as_list, index):
        # Scaling must be order-preserving: equal raw values stay equal, and
        # strict inequalities keep their direction after scaling.
        for i in range(0, len(feature)):
            for j in range(0, len(feature)):
                if feature[i] == feature[j]:
                    assert scaled_test_vals_as_list[i][index] == \
                        scaled_test_vals_as_list[j][index]
                elif feature[i] < feature[j]:
                    assert scaled_test_vals_as_list[i][index] < \
                        scaled_test_vals_as_list[j][index]
                else:
                    assert scaled_test_vals_as_list[i][index] > \
                        scaled_test_vals_as_list[j][index]
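# A minimal sketch for running this suite directly with the standard unittest
# runner (assumption: the project modules imported above are on the
# PYTHONPATH; IDE- or CI-based test discovery would work equally well):
if __name__ == "__main__":
    unittest.main()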
from LoggerFactory import LoggerFactory

log = LoggerFactory.getLogger('cooker')
temperature = 115


def first():
    log.info("Hi, I'm the cooker")


def second():
    log.warning('It gets too hot here')
    log.debug('Temp is: %i', temperature)


def third():
    log.critical('BOOM!!!')
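# A minimal usage sketch for the cooker module (assumption: LoggerFactory has
# already attached a handler with level DEBUG or lower, so all four messages,
# including the debug temperature readout, are emitted):
if __name__ == '__main__':
    first()
    second()
    third()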