예제 #1
0
 def __init__(self):
     Scorer.LOGGER = LoggerFactory.getLogger("Scorer")
     
     Scorer.updateViewerScoreNum = 0
     
     Scorer.updateResourceNum = 0
     Scorer.updateViewerNum = 0
     Scorer.updateAuthorNum = 0
     Scorer.updateResourceTime = 0
     Scorer.updateViewerTime = 0
     Scorer.updateAuthorTime = 0
     Scorer.dupAuthor = 0
     Scorer.dupViewer = 0
     Scorer.avgViewerScore = 0
     
     Scorer.updateNumLock = thread.allocate_lock()
     
     Scorer.authorHashSet = dict()
     Scorer.viewerHashSet = dict()
     Scorer.cacheLimitDay = 30
     
     self.mysqlHost = "10.163.102.88"
     self.mysqlUserName = "******"
     self.mysqlPassword = "******"
     self.mysqlDb = "networkresourcesort"
     
     self.viewerScoreThreadPool = None
     
     Scorer.scoreModelPool = Queue.Queue(10)
     for i in xrange(0, 7):
         Scorer.scoreModelPool.put(ScoreModel(self.mysqlHost, self.mysqlUserName, self.mysqlPassword, self.mysqlDb))
예제 #2
0
    def __init__(self):
        Scorer.LOGGER = LoggerFactory.getLogger("Scorer")

        Scorer.updateViewerScoreNum = 0

        Scorer.updateResourceNum = 0
        Scorer.updateViewerNum = 0
        Scorer.updateAuthorNum = 0
        Scorer.updateResourceTime = 0
        Scorer.updateViewerTime = 0
        Scorer.updateAuthorTime = 0
        Scorer.dupAuthor = 0
        Scorer.dupViewer = 0
        Scorer.avgViewerScore = 0

        Scorer.updateNumLock = thread.allocate_lock()

        Scorer.authorHashSet = dict()
        Scorer.viewerHashSet = dict()
        Scorer.cacheLimitDay = 30

        self.mysqlHost = "10.163.102.88"
        self.mysqlUserName = "******"
        self.mysqlPassword = "******"
        self.mysqlDb = "networkresourcesort"

        self.viewerScoreThreadPool = None

        Scorer.scoreModelPool = Queue.Queue(10)
        for i in xrange(0, 7):
            Scorer.scoreModelPool.put(
                ScoreModel(self.mysqlHost, self.mysqlUserName,
                           self.mysqlPassword, self.mysqlDb))
예제 #3
0
class FileConverterIT(unittest.TestCase):

    log = LoggerFactory.createLog(__name__)

    def setUp(self):
        current_working_dir = os.getcwd()  # Should be this package.
        self.input_folder = current_working_dir + "/SampleMatlabDataFolder"
        self.createdFolder = self.input_folder + "/Trametinib_analysis"

    def tearDown(self):
        if self.input_folder != "/":
            for file in os.listdir(self.createdFolder):
                if file == "__init__.py" or ".mat" in file:
                    continue
                os.remove(self.createdFolder + "/" + file)
            os.removedirs(self.createdFolder)

    def testMatlabFileConversionProperlyFormatsMatrices(self):
        FileConverter.convertMatLabToCSV(self.input_folder)
        for generated_csv in [
                file for file in os.listdir(self.createdFolder)
                if ".csv" in file
        ]:
            with open(self.createdFolder + "/" + generated_csv) as csv:
                try:
                    for line in csv:
                        assert "['" not in line
                        assert "']" not in line
                except ValueError as valueError:
                    self.log.error(valueError)
                finally:
                    csv.close()
예제 #4
0
    def __init__(self, host, userName, password, dbName):
        self.LOGGER = LoggerFactory.getLogger("ScoreModel")
        self.host = host
        self.userName = userName
        self.password = password
        self.dbName = dbName

        self.dbConn = self._getConn()
 def __init__(self, host, userName, password, dbName):
     self.LOGGER = LoggerFactory.getLogger("ScoreModel")
     self.host = host
     self.userName = userName
     self.password = password
     self.dbName = dbName
     
     self.dbConn = self._getConn()
예제 #6
0
    def convertMatLabToCSV(matlab_files_directory):

        log = LoggerFactory.createLog(__name__)

        os.chdir(matlab_files_directory)
        matlab_files = glob.glob("*.mat")

        for input_file in matlab_files:
            drug_name = input_file.split("gexmutcnum.mat")[0].strip()
            new_directory = matlab_files_directory + "/" + drug_name + "_analysis"
            matlab_file = scipy.io.loadmat(input_file)

            os.mkdir(new_directory)

            format_id_string = lambda array: SafeCastUtil.safeCast(
                array[0], str)
            for key in SafeCastUtil.safeCast(
                    FileConverter.VARIABLE_MATCHES.keys(), list):
                header = [
                    format_id_string(feature_name)
                    for feature_name in matlab_file.get(key)[0]
                ]
                file_name = new_directory + "/" + drug_name + "_" + FileConverter.FILE_NAMES[
                    key] + ".csv"
                cell_line_data = FileConverter.formatCellLineData(
                    matlab_file.get(FileConverter.VARIABLE_MATCHES.get(key)),
                    key)
                FileConverter.validateAndWriteCSV(
                    cell_line_data, header, file_name, log,
                    FileConverter.EXPECTED_TYPES[key])

            cell_line_ids = [
                format_id_string(cell_id)
                for cell_id in matlab_file.get(FileConverter.ID_FIELD)
            ]
            results = matlab_file.get(FileConverter.RESULTS_FIELD)
            zipped_results = SafeCastUtil.safeCast(
                zip(cell_line_ids, results[0]), list)
            results_file = new_directory + "/" + drug_name + "_results.csv"

            FileConverter.validateAndWriteCSV(zipped_results,
                                              ["cell_line", "result"],
                                              results_file, log, float)
            log.info(
                "The MATLAB file for %s has been successfully converted into csv files ready to be used"
                " with the CLA software!", drug_name)

        log.info("All MATLAB files have been processed!")
예제 #7
0
import pycom
from helper import blink_led

pycom.heartbeat(False)  # disable the heartbeat LED
pycom.rgbled(0x552000)  # flash orange to indicate startup

# Try to mount SD card, if this fails, keep blinking red and do not proceed
try:
    from machine import SD, Pin, reset
    import os
    from loggingpycom import DEBUG
    from LoggerFactory import LoggerFactory
    from UserButton import UserButton

    # Initialise LoggerFactory and status logger
    logger_factory = LoggerFactory()
    status_logger = logger_factory.create_status_logger(
        'status_logger',
        level=DEBUG,
        terminal_out=True,
        filename='status_log.txt')

    # Initialize button interrupt on pin 14 for user interaction
    user_button = UserButton(status_logger)
    pin_14 = Pin("P14", mode=Pin.IN, pull=Pin.PULL_DOWN)
    pin_14.callback(Pin.IRQ_RISING | Pin.IRQ_FALLING,
                    user_button.button_handler)

    # Mount SD card
    sd = SD()
    os.mount(sd, '/sd')
예제 #8
0
# Disable the heartbeat LED and set to orange to indicate startup
pycom.heartbeat(False)
pycom.rgbled(0x552000)

# Try to mount SD card, if this fails, keep blinking red and do not proceed
try:
    import os
    import time
    from loggingpycom import DEBUG
    from LoggerFactory import LoggerFactory
    from userbutton import UserButton

    os.mount(SD(), "/sd")

    logger_factory = LoggerFactory()
    status_logger = logger_factory.create_status_logger(
        "status_logger",
        level=DEBUG,
        terminal_out=True,
        filename="status_log.txt")

    # Initialise button interrupt on pin 14 for user interaction
    user_button = UserButton(status_logger)
    pin_14 = Pin("P14", mode=Pin.IN, pull=Pin.PULL_DOWN)
    pin_14.callback(Pin.IRQ_RISING | Pin.IRQ_FALLING,
                    user_button.button_handler)

except Exception as e:
    # If something goes wrong, blink red LED and reboot after 60 seconds
    print("Startup failed:", str(e))
class RandomSubsetElasticNetModelTest(unittest.TestCase):

    log = LoggerFactory.createLog(__name__)

    train_features = [[0, 0, 0, 1, 0.32, 0.25, 0.52, 0.63],
                      [0, 0, 1, 1, 1.11, 1.45, 0.31, 0.22],
                      [0, 1, 0, 0, 0.32, 0.56, 0.66, 0.25],
                      [1, 0, 0, 1, 0.32, 0.34, 0.13, 0.54]]

    test_features = [[0, 1, 0, 1, 0.11, 0.41, 0.11, 2.63],
                     [0, 0, 0, 1, 3.23, 1.45, 0.01, 1.22]]

    train_results = [0.5, 0.3, 0.9, 1.3]
    test_results = [1.5, 0.5]

    binary_feature_indices = [0, 1, 2, 3]

    def testPValueWorksAsIntended(self):
        model = self.trainModelWithExplicitNumberOfPhrases(10, True)

        for enet_model in model.models_by_phrase:  # fake the scores so that we don't have models which tie
            enet_model.score = random.random()

        score_0 = model.score(self.test_features, self.test_results)
        score_0_redundant = model.score(self.test_features, self.test_results)
        assert score_0 == score_0_redundant

        model.p = 0.5
        score_half = model.score(self.test_features, self.test_results)
        assert score_0 != score_half

        model.p = 1.0
        score_1 = model.score(self.test_features, self.test_results)
        assert score_0 != score_1
        assert score_half != score_1

    def testExplicitModelCountWorks(self):
        model = self.trainModelWithExplicitNumberOfPhrases(5, False)
        assert len(model.models_by_phrase) == 5

    def testDuplicatePhrasesAreNotCreated(self):
        model = self.trainModelWithExplicitNumberOfPhrases(5, False)
        assert len(model.models_by_phrase) == 5

        first_phrase = copy.deepcopy(model.models_by_phrase[0].phrase)
        assert first_phrase.equals(model.models_by_phrase[0].phrase)
        assert model.currentPhraseExists(first_phrase)

        first_phrase.is_or = not first_phrase.is_or
        assert not first_phrase.equals(model.models_by_phrase[0].phrase)
        assert not model.currentPhraseExists(first_phrase)

    def trainModelWithExplicitNumberOfPhrases(self, phrase_count, at_least):
        num_phrases = 0
        model = None
        explicit_count = 0
        if not at_least:
            explicit_count = phrase_count
        while (not at_least and num_phrases != phrase_count) or (at_least and num_phrases < phrase_count):

            model = RandomSubsetElasticNet(1, 0.5, self.binary_feature_indices, upper_bound=0.5, lower_bound=0, p=0,
                                           explicit_model_count=(explicit_count - 1))
            model.fit(self.train_features, self.train_results)
            num_phrases = len(model.models_by_phrase)
            [self.assertScore(model_phrase) for model_phrase in model.models_by_phrase if model_phrase.phrase.value is not None]

        return model

    def assertScore(self, phrase):
        assert phrase.score > 0

    def testParameterValidationWorks(self):
        bad_explicit_phrases = [RecursiveBooleanPhrase(5, 1, False, None)]
        self.assertInvalidParams([-1, 0, 1])
        self.assertInvalidParams([0, 1, "test"])
        self.assertInvalidParams(self.binary_feature_indices, alpha=-1)
        self.assertInvalidParams(self.binary_feature_indices, l_one_ratio=-1)
        self.assertInvalidParams(self.binary_feature_indices, upper_bound=5)
        self.assertInvalidParams(self.binary_feature_indices, lower_bound=-1)
        self.assertInvalidParams(self.binary_feature_indices, lower_bound=.3, upper_bound=.1)
        self.assertInvalidParams(self.binary_feature_indices, p=100)
        self.assertInvalidParams(self.binary_feature_indices, explicit_model_count=-2)
        self.assertInvalidParams(self.binary_feature_indices, max_boolean_generation_attempts=0)
        self.assertInvalidParams(self.binary_feature_indices, default_coverage_threshold=1.4)
        self.assertInvalidParams(self.binary_feature_indices, explicit_phrases=bad_explicit_phrases)

    def assertInvalidParams(self, binary_feature_indices, alpha=1, l_one_ratio=2, upper_bound=0.5, lower_bound=0.1, p=0,
                            explicit_model_count=-1, max_boolean_generation_attempts=10,
                            default_coverage_threshold=0.8, explicit_phrases=None):
        error = ""
        try:
            RandomSubsetElasticNet(alpha, l_one_ratio, binary_feature_indices, upper_bound=upper_bound,
                                   lower_bound=lower_bound, p=p, explicit_model_count=explicit_model_count,
                                   max_boolean_generation_attempts=max_boolean_generation_attempts,
                                   coverage_threshold=default_coverage_threshold, explicit_phrases=explicit_phrases)
        except AttributeError as attributeError:
            error = SafeCastUtil.safeCast(attributeError, str)
        assert "invalid parameters" in error

    def testRSENFailsIfNonBinaryMatrixSentIn(self):
        self.train_features[0][0] = 2
        error = ""
        try:
            model = RandomSubsetElasticNet(1, 2, self.binary_feature_indices)
            model.fit(self.train_features, self.train_results)
        except ValueError as valueError:
            error = SafeCastUtil.safeCast(valueError, str)
        assert "Non-binary feature" in error
class RecommendationsServiceIT(unittest.TestCase):

    log = LoggerFactory.createLog(__name__)

    def setUp(self):
        self.current_working_dir = os.getcwd()  # Should be this package.
        self.DRUG_DIRECTORY = "DrugAnalysisResults"
        self.NUM_DRUGS = 10

    def tearDown(self):
        if self.current_working_dir != "/":
            directory = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
            for file_or_dir in os.listdir(directory):
                if file_or_dir == "__init__.py":
                    continue
                current_path = directory + "/" + file_or_dir
                if self.DRUG_DIRECTORY in file_or_dir:
                    for file in os.listdir(current_path):
                        os.remove(current_path + "/" + file)
                    os.removedirs(current_path)
                else:
                    os.remove(current_path)

    def testRecommendations(self):
        num_cell_lines = 30
        inputs = self.formatRandomizedData(False, num_cell_lines)
        target_dir = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER

        try:
            recs_service = RecommendationsService(inputs)
            recs_service.recommendByHoldout(target_dir)

            drug_names = SafeCastUtil.safeCast(recs_service.inputs.keys(), list)

            self.assertRecsByDrug(num_cell_lines, drug_names, target_dir)

            recs_service.writeFinalRecsResults(target_dir)
            self.assertRecsByCellLine(num_cell_lines, drug_names, target_dir)

        except KeyboardInterrupt as keyboard_interrupt:
            assert False

    def assertRecsByDrug(self, num_cell_lines, drug_names, target_dir):
        file_name = target_dir + "/" + RecommendationsService.PREDICTIONS_FILE
        num_lines = 0
        with open(file_name) as csv_file:
            try:
                for line_index, line in enumerate(csv_file):
                    num_lines += 1
                    line_split = line.split(",")

                    if line_index == 0:
                        assert line_split[0] == "Drug"
                    else:
                        assert line_split[0] in drug_names
                        assert "cell_line" in line_split[1]
                        assert SafeCastUtil.safeCast(line_split[2], float) is not None
                        assert SafeCastUtil.safeCast(line_split[3].strip(), float) is not None
            except AssertionError as error:
                self.log.error(error)
            finally:
                self.log.debug("Closing file %s", file_name)
                csv_file.close()
                assert num_lines == (num_cell_lines * self.NUM_DRUGS) + 1

    def assertRecsByCellLine(self, num_cell_lines, drug_names, target_dir):
        file_name = target_dir + "/" + RecommendationsService.PREDICTIONS_BY_CELL_LINE_FILE
        num_lines = 0
        with open(file_name) as csv_file:
            try:
                for line_index, line in enumerate(csv_file):
                    num_lines += 1
                    line_split = line.split(",")

                    if line_index == 0:
                        assert line_split[0] == "Cell Line"
                    else:
                        for i in range(0, len(line_split)):
                            if i == 0:
                                assert "cell_line" in line_split[i]
                            elif i % 2 == 0:
                                assert SafeCastUtil.safeCast(line_split[i], float) > AbstractModelTrainer.DEFAULT_MIN_SCORE
                            elif i % 2 == 1:
                                assert line_split[i] in drug_names
            except AssertionError as error:
                self.log.error(error)
            finally:
                self.log.debug("Closing file %s", file_name)
                csv_file.close()
                assert num_lines == num_cell_lines + 1

    def testPreRecsAnalysis(self):
        num_cell_lines = 1000
        inputs = self.formatRandomizedData(False, num_cell_lines)
        for processed_arguments in inputs.values():
            sample_features = processed_arguments.features.get(RandomizedDataGenerator.CELL_LINE + "0")
            for _ in range(10):
                num_cell_lines += 1
                self.addRandomCellLine(processed_arguments, sample_features)

        target_dir = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER

        try:
            recs_service = RecommendationsService(inputs)
            recs_service.preRecsAnalysis(target_dir)

            file_name = target_dir + "/" + RecommendationsService.PRE_REC_ANALYSIS_FILE
            num_lines = 0
            drug_names = SafeCastUtil.safeCast(recs_service.inputs.keys(), list)
            cell_line = RandomizedDataGenerator.CELL_LINE
            with open(file_name) as csv_file:
                try:
                    for line_index, line in enumerate(csv_file):
                        num_lines += 1
                        line_split = line.split(",")
                        for i in range(0, len(line_split)):
                            value_in_csv = line_split[i].strip()
                            if line_index == 0:
                                if i == 0:
                                    assert value_in_csv == cell_line
                                else:
                                    assert value_in_csv == drug_names[i - 1]
                            else:
                                if i == 0:
                                    assert cell_line or RecommendationsService.STD_DEVIATION or \
                                           RecommendationsService.MEAN or RecommendationsService.MEDIAN in value_in_csv
                                else:
                                    assert value_in_csv == MachineLearningService.DELIMITER.strip() or \
                                           SafeCastUtil.safeCast(value_in_csv, float) > AbstractModelTrainer.DEFAULT_MIN_SCORE

                except AssertionError as error:
                    self.log.error(error)
                finally:
                    self.log.debug("Closing file %s", file_name)
                    csv_file.close()
                    assert num_lines == num_cell_lines + 4
        except KeyboardInterrupt as keyboard_interrupt:
            assert False

    def addRandomCellLine(self, processed_arguments, sample_features):
        random_string = self.randomString(16)
        processed_arguments.features[random_string] = sample_features
        processed_arguments.results.append([random_string, random.random()])

    def randomString(self, string_length):
        letters = string.hexdigits
        return ''.join(random.choice(letters) for i in range(string_length))

    def formatRandomizedData(self, is_classifier, num_cell_lines):
        randomized_data_path = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        processed_arguments = {}
        for i in range(self.NUM_DRUGS):
            drug_name = self.DRUG_DIRECTORY + SafeCastUtil.safeCast(i + 1, str)
            drug_path = randomized_data_path + "/" + drug_name
            if drug_name not in os.listdir(randomized_data_path):
                os.mkdir(drug_path)
            random_data_generator = RandomizedDataGenerator(drug_path)
            random_data_generator.generateRandomizedFiles(3, num_cell_lines, 150, is_classifier, 2, .8,
                                                          use_static_features=True)
            argument_processing_service = ArgumentProcessingService(drug_path)
            processed_args = argument_processing_service.handleInputFolder()
            processed_args.recs_config.viability_acceptance = 0.1
            processed_arguments[drug_name] = processed_args
            ml_service = MachineLearningService(processed_args)
            combos = [ml_service.generateFeatureSetString(combo) for combo in ml_service.determineGeneListCombos()]
            self.setupDrugData(combos, ml_service, drug_path)

        return processed_arguments

    def setupDrugData(self, combos, ml_service, drug_path):
        for algo in SupportedMachineLearningAlgorithms.fetchAlgorithms():
            file_name = drug_path + "/" + algo + ".csv"
            with open(file_name, 'w', newline='') as feature_file:
                writer = csv.writer(feature_file)
                header = ml_service.getCSVFileHeader(ml_service.inputs.is_classifier,
                                                     algo, ml_service.inputs.outer_monte_carlo_permutations)
                writer.writerow(header)
                for combo in combos:
                    row = RandomizedDataGenerator.generateAnalysisRowForCombo(ml_service, combo, algo)
                    writer.writerow(row)
                feature_file.close()
예제 #11
0
import fridge
import cooker
from LoggerFactory import LoggerFactory

log = LoggerFactory.getLogger('main')

log.info('Log started.')
fridge.first()
cooker.first()
fridge.second()
cooker.second()
cooker.third()
fridge.third()
log.info('Log stopped.')
예제 #12
0
 def __init__(self):
     self.LOGGER = LoggerFactory.getLogger("ScoreManager")
     self.scorer =Scorer()
     self.round = 1
예제 #13
0
class AbstractModelTrainer(ABC):

    log = LoggerFactory.createLog(__name__)

    DEFAULT_MIN_SCORE = -10

    ADDITIONAL_DATA = "additional_data"

    EMPTY_MODEL_RESPONSE = DEFAULT_MIN_SCORE, 0.0

    @abstractmethod
    def __init__(self, algorithm, hyperparameters, is_classifier):
        self.algorithm = algorithm
        self.hyperparameters = hyperparameters
        self.is_classifier = is_classifier

    @abstractmethod
    def hyperparameterize(self, training_matrix, testing_matrix, results):
        pass

    @abstractmethod
    def train(self, results, features, hyperparams, feature_names):
        pass

    @abstractmethod
    def supportsHyperparams(self):
        pass

    @abstractmethod
    def fetchFeatureImportances(self, model, features_in_order):
        pass

    def preserveNonHyperparamData(self, model_data, model):
        pass

    def shouldProcessFeatureSet(self, feature_set):
        return True

    def fetchModelPhrases(self, model, gene_list_combo):
        return {}

    def logTrainingMessage(self, outer_monte_carlo_perms,
                           inner_monte_carlo_perms, num_gene_list_combos):
        num_models = self.determineNumModelsToCreate(outer_monte_carlo_perms,
                                                     inner_monte_carlo_perms,
                                                     num_gene_list_combos)
        self.log.info(
            "Running permutations on %s different combinations of features. Requires creation of %s "
            "different %s models.",
            SafeCastUtil.safeCast(num_gene_list_combos, str), num_models,
            self.algorithm)

    def determineNumModelsToCreate(self, outer_monte_carlo_perms,
                                   inner_monte_carlo_perms,
                                   num_gene_list_combos):
        num_models = outer_monte_carlo_perms * inner_monte_carlo_perms * num_gene_list_combos
        for hyperparam_set in self.hyperparameters.values():
            num_models *= len(hyperparam_set)
        return num_models + (outer_monte_carlo_perms * num_gene_list_combos)

    def loopThroughHyperparams(self, hyperparams, training_matrix,
                               testing_matrix, results):
        self.hyperparameters = hyperparams

        features, relevant_results = self.populateFeaturesAndResultsByCellLine(
            training_matrix, results)
        feature_names = training_matrix.get(
            ArgumentProcessingService.FEATURE_NAMES)

        hyperparam_permutations = self.fetchAllHyperparamPermutations(
            hyperparams)
        GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log)
        return self.hyperparameterizeInSerial(feature_names, features,
                                              hyperparam_permutations,
                                              relevant_results, results,
                                              testing_matrix)

    def hyperparameterizeInSerial(self, feature_names, features,
                                  hyperparam_permutations, relevant_results,
                                  results, testing_matrix):
        model_data = {}
        for hyperparam_set in hyperparam_permutations:
            self.buildModelAndRecordScore(feature_names, features,
                                          hyperparam_set, model_data,
                                          relevant_results, results,
                                          testing_matrix)
        return model_data

    def chunkList(self, original_list, size):
        return [
            original_list[i * size:(i + 1) * size]
            for i in range((len(original_list) + size - 1) // size)
        ]

    def buildModelAndRecordScore(self, feature_names, features, hyperparam_set,
                                 model_data, relevant_results, results,
                                 testing_matrix):
        if not isinstance(model_data,
                          dict) and model_data._address_to_local is not None:
            shared_file = SafeCastUtil.safeCast(
                model_data._address_to_local.keys(), list)[0]
            if not os.path.exists(shared_file):
                self.log.warning(
                    "Unable to find shared file %s, process likely ended prematurely.",
                    shared_file)
                return

        self.log.debug("Building %s model with hyperparams %s.",
                       self.algorithm, hyperparam_set)
        model = self.buildModel(relevant_results, features, hyperparam_set,
                                feature_names)
        self.preserveNonHyperparamData(model_data, model)
        current_model_score = self.fetchPredictionsAndScore(
            model, testing_matrix, results)

        lock = threading.Lock()
        lock.acquire(True)
        try:
            model_data[DictionaryUtility.toString(
                hyperparam_set)] = current_model_score
        except FileNotFoundError as fnfe:
            self.log.error(
                "Unable to write to shared model_data object for algorithm: %s.\n",
                fnfe)
        except AttributeError as ae:
            self.log.error(
                "Unable to write to shared model_data object for algorithm: %s.\n",
                ae)
        finally:
            lock.release()

        self.log.debug("Finished building %s model with hyperparams %s.",
                       self.algorithm, hyperparam_set)
        return model_data

    def buildModel(self, relevant_results, features, hyperparam_set,
                   feature_names):
        model = None
        try:
            model = self.train(relevant_results, features, hyperparam_set,
                               feature_names)
        except ValueError as valueError:
            self.log.error("Failed to create model build for %s:\n%s",
                           self.algorithm, valueError)
        return model

    def fetchAllHyperparamPermutations(self, hyperparams):
        all_perms = []
        hyperparam_keys = SafeCastUtil.safeCast(hyperparams.keys(), list)
        zero_filled_indices = SafeCastUtil.safeCast(
            numpy.zeros(len(hyperparam_keys)), list)
        target_index = len(zero_filled_indices) - 1
        current_perm = zero_filled_indices[:]
        while target_index >= 0:
            current_hyperparams = OrderedDict()
            for i in range(0, len(current_perm)):
                param_name = hyperparam_keys[i]
                current_hyperparams[param_name] = hyperparams[param_name][
                    SafeCastUtil.safeCast(current_perm[i], int)]
            if current_hyperparams not in all_perms:
                clone_map = copy.deepcopy(current_hyperparams)
                all_perms.append(clone_map)

            if current_perm[target_index] < len(
                    hyperparams[hyperparam_keys[target_index]]) - 1:
                current_perm[target_index] += 1
                while len(current_perm) > target_index + 1 and current_perm[target_index + 1] <\
                        len(hyperparams[hyperparam_keys[target_index]]):
                    target_index += 1
            else:
                target_index -= 1
                for subsequent_index in range(target_index,
                                              len(current_perm) - 1):
                    current_perm[subsequent_index + 1] = 0
        return all_perms

    def fetchPredictionsAndScore(self, model, testing_matrix, results):
        if model is None:
            return self.EMPTY_MODEL_RESPONSE
        features, relevant_results = self.populateFeaturesAndResultsByCellLine(
            testing_matrix, results)
        predictions = model.predict(features)
        score = AbstractModelTrainer.DEFAULT_MIN_SCORE
        try:
            score = model.score(features, relevant_results)
        except ValueError as valueError:
            self.log.error(valueError)
        if self.is_classifier:
            accuracy = accuracy_score(relevant_results, predictions)
        else:
            accuracy = mean_squared_error(relevant_results, predictions)
        del model
        return score, accuracy

    def populateFeaturesAndResultsByCellLine(self, matrix, results):
        features = []
        relevant_results = []
        for cell in matrix.keys():
            if cell == ArgumentProcessingService.FEATURE_NAMES:
                continue
            features.append(matrix[cell])
            for result in results:
                if result[0] == cell:
                    relevant_results.append(result[1])
        return features, relevant_results

    def logIfBestHyperparamsOnRangeThreshold(self, best_hyperparams,
                                             record_diagnostics, input_folder):
        if not self.supportsHyperparams() or best_hyperparams is None:
            return
        hyperparam_keys = SafeCastUtil.safeCast(self.hyperparameters.keys(),
                                                list)
        for i in range(0, len(hyperparam_keys)):
            hyperparam_set = self.hyperparameters[hyperparam_keys[i]]
            optimal_value = best_hyperparams.get(hyperparam_keys[i])
            if optimal_value is None:
                self.log.warn(
                    "Unable to determine optimal value given hyperparams: %s",
                    SafeCastUtil.safeCast(best_hyperparams, str, None))
                continue
            if optimal_value >= hyperparam_set[len(hyperparam_set) - 1]:
                message = "Best hyperparam for " + self.algorithm + " on upper threshold of provided hyperparam " \
                          "set: " + hyperparam_keys[i] + " = " + SafeCastUtil.safeCast(optimal_value, str) + "\n"
                self.log.debug(message)
                if record_diagnostics:
                    DiagnosticsFileWriter.writeToFile(input_folder, message,
                                                      self.log)
            elif optimal_value <= hyperparam_set[0]:
                message = "Best hyperparam for " + self.algorithm + " on lower threshold of provided hyperparam " \
                          "set: " + hyperparam_keys[i] + " = " + SafeCastUtil.safeCast(optimal_value, str) + "\n"
                self.log.debug(message)
                if record_diagnostics:
                    DiagnosticsFileWriter.writeToFile(input_folder, message,
                                                      self.log)

    def logOptimalHyperParams(self, hyperparams, feature_set_as_string,
                              record_diagnostics, input_folder):
        message = "Optimal Hyperparameters for " + feature_set_as_string + " " + self.algorithm + " algorithm " \
                  "chosen as:\n"

        for key in SafeCastUtil.safeCast(hyperparams.keys(), list):
            message += "\t" + key + " = " + SafeCastUtil.safeCast(
                hyperparams[key], str) + "\n"
        self.log.info(message)
        if record_diagnostics:
            DiagnosticsFileWriter.writeToFile(input_folder, message, self.log)

    def generateFeaturesInOrder(self, gene_list_combo):
        features_in_order = []
        for feature_file in gene_list_combo:
            for feature in feature_file:
                features_in_order.append(feature)
        return features_in_order

    def normalizeCoefficients(self, coefficients, features_in_order):
        importances = {}
        absolute_sum = numpy.sum([numpy.abs(coeff) for coeff in coefficients])
        for i in range(0, len(features_in_order)):
            if absolute_sum > 0:
                importances[features_in_order[i]] = numpy.abs(
                    coefficients[i]) / absolute_sum
            else:
                importances[features_in_order[i]] = numpy.abs(
                    coefficients[i])  # should be 0.
        return importances
예제 #14
0
class DataFormattingService(object):

    log = LoggerFactory.createLog(__name__)

    TRAINING_MATRIX = "trainingMatrix"
    TESTING_MATRIX = "testingMatrix"  # Will either be outer testing or inner validation matrix

    P_VALUE_CUTOFF = 0.05

    def __init__(self, inputs):
        self.inputs = inputs

    def formatData(self, should_scale, should_one_hot_encode=True):
        features_df = pd.DataFrame.from_dict(self.inputs.features,
                                             orient='index')
        columns = self.inputs.features.get(
            ArgumentProcessingService.FEATURE_NAMES)
        features_df.columns = columns
        features_df = features_df.drop(ArgumentProcessingService.FEATURE_NAMES)

        x_train, x_test, y_train, y_test = self.testTrainSplit(
            features_df, self.inputs.results, self.inputs.data_split)

        x_train_corr, x_test_corr = self.maybeFilterCorrelatedFeatures(
            x_train, x_test, y_train, columns, self.inputs.analysisType())

        if should_one_hot_encode:
            x_train_one_hot = self.oneHot(x_train_corr)
            x_test_one_hot = self.oneHot(x_test_corr)
        else:
            x_train_one_hot = x_train_corr
            x_test_one_hot = x_test_corr

        outputs = OrderedDict()
        outputs[self.TRAINING_MATRIX] = self.maybeScaleFeatures(
            x_train_one_hot, should_scale)
        outputs[self.TESTING_MATRIX] = self.maybeScaleFeatures(
            x_test_one_hot, should_scale)
        outputs[
            ArgumentProcessingService.FEATURE_NAMES] = SafeCastUtil.safeCast(
                x_train_one_hot.columns, list)
        return outputs

    def maybeFilterCorrelatedFeatures(self, x_train, x_test, y_train,
                                      feature_names, analysis_type):
        if analysis_type is not AnalysisType.NO_GENE_LISTS:
            return x_train, x_test

        results = [result[1] for result in y_train]

        spearman_p_vals = {}
        ranksum_p_vals = {}

        for feature_name in feature_names:
            try:
                feature_column = x_train.get(feature_name)

                is_categorical = all(
                    isinstance(feature, str) for feature in feature_column)
                file = feature_name.split(".")[0]

                if is_categorical:
                    if ranksum_p_vals.get(file) is None:
                        ranksum_p_vals[file] = {}
                    ranksum = self.fetchRanksum(feature_column, results)
                    ranksum_p_vals[file][feature_name] = SafeCastUtil.safeCast(
                        ranksum[1], float, 1)
                else:
                    if spearman_p_vals.get(file) is None:
                        spearman_p_vals[file] = {}
                    spearman_corr = spearmanr(feature_column, results)
                    spearman_p_vals[file][
                        feature_name] = SafeCastUtil.safeCast(
                            spearman_corr[1], float, 1)

            except ValueError as error:
                self.log.error("Exception while trying to trim features: %s",
                               error)

        return self.trimFeatures(x_train, x_test,
                                 [ranksum_p_vals, spearman_p_vals])

    def fetchRanksum(self, feature_column, results):
        value_counts = {}
        for val in feature_column:
            if value_counts.get(val) is None:
                value_counts[val] = 1
            else:
                value_counts[val] += 1
        dominant_value = max(value_counts.items(),
                             key=operator.itemgetter(1))[0]
        dominant_results = []
        non_dominant_results = []
        for feature_val_and_result in zip(
                SafeCastUtil.safeCast(feature_column, list), results):
            if feature_val_and_result[0] == dominant_value:
                dominant_results.append(feature_val_and_result[1])
            else:
                non_dominant_results.append(feature_val_and_result[1])
        return ranksums(dominant_results, non_dominant_results)

    def trimFeatures(self, x_train, x_test, p_val_sets):
        features_to_keep = []
        num_top_features = self.inputs.univariate_config.num_top_features
        for p_val_set in p_val_sets:
            for file in p_val_set:
                features_and_p_vals = [
                    item for item in p_val_set[file].items()
                    if not np.isnan(item[1])
                ]
                sorted_features_and_p_vals = sorted(
                    features_and_p_vals,
                    key=operator.itemgetter(1))[:num_top_features]
                [
                    features_to_keep.append(feature_and_p_val[0])
                    for feature_and_p_val in sorted_features_and_p_vals
                ]

        filtered_df_train = x_train.filter(features_to_keep, axis=1)
        filtered_df_test = x_test.filter(features_to_keep, axis=1)
        return filtered_df_train, filtered_df_test

    def maybeScaleFeatures(self, data_frame, should_scale):
        as_dict = data_frame.transpose().to_dict('list')
        maybe_scaled_dict = OrderedDict()

        keys_as_list = SafeCastUtil.safeCast(as_dict.keys(), list)
        for key in keys_as_list:
            maybe_scaled_dict[key] = []

        if len(keys_as_list) > 0:
            for i in range(0, len(as_dict[keys_as_list[0]])):
                array_to_maybe_scale = []
                for key in keys_as_list:
                    array_to_maybe_scale.append(as_dict[key][i])
                if should_scale:
                    maybe_scaled_array = preprocessing.scale(
                        array_to_maybe_scale)
                else:
                    maybe_scaled_array = array_to_maybe_scale
                for j in range(0, len(keys_as_list)):
                    maybe_scaled_dict[keys_as_list[j]].append(
                        maybe_scaled_array[j])

        return maybe_scaled_dict

    def encodeCategorical(self, array):
        if array.dtype == np.dtype('float64') or array.dtype == np.dtype(
                'int64'):
            return array
        else:
            return preprocessing.LabelEncoder().fit_transform(array)

    # Encode sites as categorical variables
    def oneHot(self, dataframe):
        # Encode all labels
        dataframe = dataframe.apply(self.encodeCategorical)
        return dataframe

    # Binary one hot encoding
    def binaryOneHot(self, dataframe):
        dataframe_binary_pd = pd.get_dummies(dataframe)
        return dataframe_binary_pd

    def testTrainSplit(self, x_values, y_values, data_split):
        if data_split == 1.0:
            return x_values, pd.DataFrame(columns=SafeCastUtil.safeCast(
                x_values.columns, list)), y_values, []
        x_train, x_test, y_train, y_test = train_test_split(
            x_values, y_values, test_size=(1 - data_split))
        return x_train, x_test, y_train, y_test

    def testStratifySplit(self, x_values, y_values):
        x_train, x_split, y_train, y_split = train_test_split(
            x_values,
            y_values,
            test_size=0.2,
            random_state=42,
            stratify=x_values.iloc[:, -1])
        x_test, x_validate, y_test, y_validate = train_test_split(
            x_split,
            y_split,
            test_size=0.5,
            random_state=42,
            stratify=x_split.iloc[:, -1])
        return x_train, x_validate, x_test, y_train, y_validate, y_test
예제 #15
0
 def main(args):
     factory = LoggerFactory()
     logger = factory.getLogger()
     logger.log("A Message to Log")
class MachineLearningServiceIT(unittest.TestCase):

    log = LoggerFactory.createLog(__name__)

    THRESHOLD_OF_SIGNIFICANCE = 0.60

    MONTE_CARLO_PERMS = 2
    INDIVIDUAL_MONTE_CARLO_PERMS = 10

    def setUp(self):
        self.current_working_dir = os.getcwd()  # Should be this package.

    def tearDown(self):
        if self.current_working_dir != "/":
            for file in os.listdir(
                    self.current_working_dir + "/" +
                    RandomizedDataGenerator.GENERATED_DATA_FOLDER):
                if file == "__init__.py":
                    continue
                os.remove(self.current_working_dir + "/" +
                          RandomizedDataGenerator.GENERATED_DATA_FOLDER + "/" +
                          file)

    def testRandomForestRegressor(self):
        self.evaluateMachineLearningModel(RandomForestTrainer(False))

    def testRandomForestClassifier(self):
        self.evaluateMachineLearningModel(RandomForestTrainer(True))

    def testLinearSVMRegressor(self):
        self.evaluateMachineLearningModel(LinearSVMTrainer(False))

    def testLinearSVMClassifier(self):
        self.evaluateMachineLearningModel(LinearSVMTrainer(True))

    def testRadialBasisFunctionSVMRegressor(self):
        self.evaluateMachineLearningModel(RadialBasisFunctionSVMTrainer(False))

    def testRadialBasisFunctionSVMClassifier(self):
        self.evaluateMachineLearningModel(RadialBasisFunctionSVMTrainer(True))

    def testElasticNetRegressor(self):
        self.evaluateMachineLearningModel(ElasticNetTrainer(False))

    def testRidgeRegressor(self):
        self.evaluateMachineLearningModel(RidgeRegressionTrainer(False))

    def testLassoRegressor(self):
        self.evaluateMachineLearningModel(LassoRegressionTrainer(False))

    def testRandomSubsetElasticNet(self):
        ml_service = MachineLearningService(
            self.formatRandomizedData(False, False, False))
        ml_service.log.setLevel(logging.DEBUG)
        binary_cat_matrix = ml_service.inputs.rsen_config.binary_cat_matrix
        rsen_trainer = RandomSubsetElasticNetTrainer(False, binary_cat_matrix,
                                                     0, 0.4)

        filtered_combos = self.fetchFilteredRSENCombos(ml_service,
                                                       rsen_trainer)

        trimmed_combos = filtered_combos[0:8]
        target_dir = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        ml_service.handleParallellization(trimmed_combos, target_dir,
                                          rsen_trainer)

        self.assertResults(target_dir, rsen_trainer,
                           len(trimmed_combos) + 1, rsen_trainer.is_classifier,
                           False, False)

    def fetchFilteredRSENCombos(self, ml_service, rsen_trainer):
        filtered_combos = []
        for combo in ml_service.determineGeneListCombos():
            is_valid = True
            for feature_set in combo:
                if len([
                        feature for feature in feature_set
                        if "bin_cat.significant_feature" in feature
                ]) > 0:
                    is_valid = False
            if is_valid and rsen_trainer.shouldProcessFeatureSet(combo):
                filtered_combos.append(combo)
        return filtered_combos

    def testRandomSubsetElasticNetWithCombinedGeneLists(self):
        inputs = self.formatRandomizedData(False, False, False)
        input_folder = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        inputs.rsen_config.combine_gene_lists = True
        ml_service = MachineLearningService(inputs)
        ml_service.log.setLevel(logging.DEBUG)
        binary_cat_matrix = ml_service.inputs.rsen_config.binary_cat_matrix
        rsen_trainer = RandomSubsetElasticNetTrainer(False, binary_cat_matrix,
                                                     0, 0.4)
        gene_list_combos = ml_service.determineGeneListCombos()

        combos = ml_service.fetchValidGeneListCombos(input_folder,
                                                     gene_list_combos,
                                                     rsen_trainer)
        assert len(combos) < len(gene_list_combos)

        for combo in combos:
            assert "ALL_GENE_LISTS" in ml_service.generateFeatureSetString(
                combo)

    def evaluateMachineLearningModel(self, trainer):
        ml_service = MachineLearningService(
            self.formatRandomizedData(trainer.is_classifier, False, False))
        ml_service.log.setLevel(logging.DEBUG)
        num_gene_list_combos = 8
        self.analyzeAndAssertResults(ml_service, num_gene_list_combos, trainer,
                                     False, False)

    def analyzeAndAssertResults(self, ml_service, num_gene_list_combos,
                                trainer, univariate, has_static_features):
        try:
            gene_list_combos_shortened = ml_service.determineGeneListCombos(
            )[0:num_gene_list_combos]
            target_dir = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
            ml_service.handleParallellization(gene_list_combos_shortened,
                                              target_dir, trainer)
            self.assertResults(target_dir, trainer, num_gene_list_combos + 1,
                               trainer.is_classifier, univariate,
                               has_static_features)
        except KeyboardInterrupt as keyboardInterrupt:
            self.log.error(
                "Interrupted manually, failing and initiating cleanup.")
            assert False

    def formatRandomizedData(self, is_classifier, analyze_all,
                             use_static_features):
        random_data_generator = RandomizedDataGenerator(
            RandomizedDataGenerator.GENERATED_DATA_FOLDER)
        random_data_generator.generateRandomizedFiles(
            3,
            1000,
            150,
            is_classifier,
            self.MONTE_CARLO_PERMS,
            .8,
            analyze_all=analyze_all,
            use_static_features=use_static_features)
        input_folder = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        argument_processing_service = ArgumentProcessingService(input_folder)
        argument_processing_service.log.setLevel(logging.DEBUG)
        return argument_processing_service.handleInputFolder()

    def assertResults(self, target_dir, trainer, expected_lines, is_classifier,
                      univariate, has_static_features):
        self.assertDiagnosticResults(target_dir, trainer, univariate)

        file_name = trainer.algorithm + ".csv"
        assert file_name in os.listdir(target_dir)
        num_lines = 0
        with open(target_dir + "/" + file_name) as csv_file:
            try:
                for line_index, line in enumerate(csv_file):
                    num_lines += 1
                    line_split = line.strip().split(",")
                    if line_index == 0:
                        assert line_split == MachineLearningService.getCSVFileHeader(
                            is_classifier, trainer.algorithm,
                            self.MONTE_CARLO_PERMS)
                        continue
                    feature_gene_list_combo = line_split[0]

                    assert ":" in feature_gene_list_combo or \
                           (has_static_features and GeneListComboUtility.ONLY_STATIC_FEATURES in feature_gene_list_combo)
                    score = SafeCastUtil.safeCast(line_split[1], float)
                    accuracy = SafeCastUtil.safeCast(line_split[2], float)
                    assert score > trainer.DEFAULT_MIN_SCORE
                    if RandomizedDataGenerator.SIGNIFICANT_GENE_LIST in feature_gene_list_combo or has_static_features:
                        assert score >= self.THRESHOLD_OF_SIGNIFICANCE
                    else:
                        assert score < self.THRESHOLD_OF_SIGNIFICANCE
                    assert accuracy > 0
                    if len(line_split) > 3:
                        top_importance = line_split[3]
                        assert top_importance is not None
            except AssertionError as error:
                self.log.error(error)
            finally:
                self.log.debug("Closing file %s", file_name)
                csv_file.close()
                assert num_lines == expected_lines

    def assertDiagnosticResults(self, target_dir, trainer, univariate):
        if trainer.supportsHyperparams():
            saved_features_logged_if_univariate = not univariate

            diagnostics_file = DiagnosticsFileWriter.FILE_NAME
            if diagnostics_file in os.listdir(target_dir):
                with open(target_dir + "/" + diagnostics_file) as open_file:
                    try:
                        for line_index, line in enumerate(open_file):
                            if "Best Hyperparam" in line:
                                assert trainer.algorithm in line
                                assert "upper" in line or "lower" in line
                            if "Monte Carlo loop" in line:
                                saved_features_logged_if_univariate = True
                    except ValueError as valueError:
                        self.log.error(valueError)
                    finally:
                        self.log.debug("Closing file %s", open_file)
                        open_file.close()
            assert saved_features_logged_if_univariate

    def testIndividualRandomForestRegressor(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.RANDOM_FOREST, "200,20", False)

    def testIndividualRandomForestClassifier(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.RANDOM_FOREST, "200,20", True)

    def testIndividualLinearSVMRegressor(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.LINEAR_SVM, "0.1,0,1", False)

    def testIndividualLinearSVMClassifier(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.LINEAR_SVM, "0.1", True)

    def testIndividualRadialBasisFunctionSVMRegressor(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.RADIAL_BASIS_FUNCTION_SVM,
            "0.1,0.1,0.1", False)

    def testIndividualRadialBasisFunctionSVMClassifier(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.RADIAL_BASIS_FUNCTION_SVM,
            "0.1,0.1,0.1", True)

    def testIndividualElasticNetRegressor(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.ELASTIC_NET, "0.1,0.1", False)

    def testIndividualRidgeRegressor(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.RIDGE_REGRESSION, "1", False)

    def testIndividualLassoRegressor(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.LASSO_REGRESSION, "1", False)

    def testIndividualRandomSubsetElasticNet(self):
        self.evaluateMachineLearningModelForIndividualCombo(
            SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET,
            "0.1,0.1", False)

    def evaluateMachineLearningModelForIndividualCombo(self, algorithm,
                                                       hyperparams,
                                                       is_classifier):
        input_folder = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        ml_service = MachineLearningService(
            self.formatRandomizedDataForIndividualCombo(
                is_classifier, algorithm, hyperparams, input_folder))
        if algorithm is SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET:
            binary_categorical_matrix = ml_service.inputs.rsen_config.binary_cat_matrix
            dummy_trainer = RandomSubsetElasticNetTrainer(
                False, binary_categorical_matrix, 0, 0.4)
            target_combo = self.fetchFilteredRSENCombos(
                ml_service, dummy_trainer)[0]
            target_combo_string = ml_service.generateFeatureSetString(
                target_combo)
            ml_service.inputs.individual_train_config.combo = target_combo_string

        try:
            ml_service.analyze(input_folder)
            self.assertResultsForIndividualCombo(input_folder, algorithm, 11,
                                                 is_classifier)
        except KeyboardInterrupt as keyboardInterrupt:
            self.log.error(
                "Interrupted manually, failing and initiating cleanup.")
            assert False

    def formatRandomizedDataForIndividualCombo(self, is_classifier, algorithm,
                                               hyperparams, input_folder):
        random_data_generator = RandomizedDataGenerator(
            RandomizedDataGenerator.GENERATED_DATA_FOLDER)
        random_data_generator.generateRandomizedFiles(
            3, 1000, 150, is_classifier, self.INDIVIDUAL_MONTE_CARLO_PERMS, .8,
            algorithm, hyperparams)
        argument_processing_service = ArgumentProcessingService(input_folder)
        return argument_processing_service.handleInputFolder()

    def assertResultsForIndividualCombo(self, target_dir, algorithm,
                                        expected_lines, is_classifier):
        file_name = algorithm + ".csv"
        assert file_name in os.listdir(target_dir)
        num_lines = 0
        with open(target_dir + "/" + file_name) as csv_file:
            try:
                for line_index, line in enumerate(csv_file):
                    num_lines += 1
                    line_split = line.strip().split(",")
                    if line_index == 0:
                        assert line_split == MachineLearningService.getCSVFileHeader(
                            is_classifier, algorithm, 1)
                        continue
                    feature_gene_list_combo = line_split[0]
                    assert ":" in feature_gene_list_combo
                    score = SafeCastUtil.safeCast(line_split[1], float)
                    assert score > AbstractModelTrainer.DEFAULT_MIN_SCORE
                    if len(line_split) > 3:
                        top_importance = line_split[3]
                        assert top_importance is not None
            except AssertionError as error:
                self.log.error(error)
            finally:
                self.log.debug("Closing file %s", file_name)
                csv_file.close()
                assert num_lines == expected_lines

    def testTrimmingExistingFeatures(self):
        input_folder = self.current_working_dir + "/SampleClassifierDataFolder"
        argument_processing_service = ArgumentProcessingService(input_folder)
        inputs = argument_processing_service.handleInputFolder()
        ml_service = MachineLearningService(inputs)
        gene_list_combos = ml_service.determineGeneListCombos()
        trainer = RandomForestTrainer(True)
        trimmed_combos = ml_service.fetchValidGeneListCombos(
            input_folder, gene_list_combos, trainer)
        assert len(trimmed_combos) == (len(gene_list_combos) - 1)

    def testSortingByFeatureImportances(self):
        delimiter = MachineLearningService.DELIMITER
        ml_service = MachineLearningService(None)
        # All columns add up to 1. Equal number of importances for each feature.
        importances = {
            "geneA": [0.0, 0.1, 0.2, 0.4, 0.0],  # total == 0.7
            "geneB": [1.0, 0.1, 0.2, 0.1, 0.5],  # total == 1.9
            "geneC": [0.0, 0.1, 0.2, 0.1, 0.25],  # total == 0.65
            "geneD": [0.0, 0.1, 0.2, 0.3, 0.25],  # total == 0.85
            "geneE": [0.0, 0.6, 0.2, 0.1, 0.0],  # total == 0.9
        }

        sorted_importances1 = ml_service.averageAndSortImportances(
            importances, 5)
        assert sorted_importances1[0] == "geneB --- 0.38"
        assert sorted_importances1[1] == "geneE --- 0.18"
        assert sorted_importances1[2] == "geneD --- 0.17"
        assert sorted_importances1[3] == "geneA --- 0.14"
        assert sorted_importances1[4] == "geneC --- 0.13"
        assert numpy.sum([
            SafeCastUtil.safeCast(imp.split(delimiter)[1], float)
            for imp in sorted_importances1 if imp is not ""
        ]) == 1.0

        sorted_importances2 = ml_service.averageAndSortImportances(
            importances, 6)
        assert len(sorted_importances1) == len(sorted_importances1)
        for i in range(0, len(sorted_importances2)):
            split1 = sorted_importances1[i].split(delimiter)
            split2 = sorted_importances2[i].split(delimiter)
            assert split1[0] == split2[0]
            if split1 == split2:
                continue
            assert SafeCastUtil.safeCast(
                split1[1], float) > SafeCastUtil.safeCast(split2[1], float)
        assert numpy.sum([
            SafeCastUtil.safeCast(imp.split(delimiter)[1], float)
            for imp in sorted_importances2 if imp is not ""
        ]) < 1.0

        # 6 columns. Now all the others are missing one.
        importances["geneF"] = [0, 0, 0, 0, 0, 1.0]  # total == 1.0
        sorted_importances3 = ml_service.averageAndSortImportances(
            importances, 6)
        assert len([imp for imp in sorted_importances3 if imp != ""]) > len(
            [imp for imp in sorted_importances1 if imp != ""])
        assert math.isclose(
            numpy.sum([
                SafeCastUtil.safeCast(imp.split(delimiter)[1], float)
                for imp in sorted_importances3 if imp is not ""
            ]), 1.0)

        importances["geneG"] = [0, 0, 0, 0, 0, 0, 2.0]  # total == 2.0
        sorted_importances4 = ml_service.averageAndSortImportances(
            importances, 7)
        assert len([imp for imp in sorted_importances4 if imp != ""]) > len(
            [imp for imp in sorted_importances3 if imp != ""])
        assert numpy.sum([
            SafeCastUtil.safeCast(imp.split(delimiter)[1], float)
            for imp in sorted_importances4 if imp is not ""
        ]) > 1.0

    def testSpecifiedCombosAreSelectedProperly(self):
        arguments = self.formatRandomizedData(False, False, False)
        file_names = []
        for feature in arguments.features.get(
                ArgumentProcessingService.FEATURE_NAMES):
            file_name = feature.split(".")[0]
            if file_name not in file_names:
                file_names.append(file_name)

        gene_lists = SafeCastUtil.safeCast(arguments.gene_lists.keys(), list)

        self.assertSpecificComboGeneration(
            arguments,
            self.generateSpecificCombos(file_names, gene_lists, False))
        self.assertSpecificComboGeneration(
            arguments, self.generateSpecificCombos(file_names, gene_lists,
                                                   True))

    def generateSpecificCombos(self, file_names, gene_lists, flip_order):
        specific_combos = []
        if len(file_names) > 1 and len(gene_lists) > 1:
            if flip_order:
                specific_combos.append(file_names[0] + ":" + gene_lists[1] +
                                       " " + file_names[1] + ":" +
                                       gene_lists[1])
            else:
                specific_combos.append(file_names[1] + ":" + gene_lists[1] +
                                       " " + file_names[0] + ":" +
                                       gene_lists[1])

        for file in file_names:
            for gene_list in gene_lists:
                if gene_list is not "null_gene_list":
                    specific_combos.append(file + ":" + gene_list)
                    if len(specific_combos) > 4:
                        return specific_combos
        return specific_combos

    def assertSpecificComboGeneration(self, arguments, specific_combos):
        arguments.specific_combos = specific_combos
        ml_service = MachineLearningService(arguments)
        gene_list_combos = ml_service.determineGeneListCombos()
        filtered_combos = ml_service.determineSpecificCombos(gene_list_combos)
        assert len(filtered_combos) == len(specific_combos)

    def testFullAnalysisSansGeneListRandomForestRegressor(self):
        self.evaluateModelFullAnalysisSansGeneList(RandomForestTrainer(False))

    def testFullAnalysisSansGeneListRandomForestClassifier(self):
        self.evaluateModelFullAnalysisSansGeneList(RandomForestTrainer(True))

    def testFullAnalysisSansGeneListLinearSVMRegressor(self):
        self.evaluateModelFullAnalysisSansGeneList(LinearSVMTrainer(False))

    def testFullAnalysisSansGeneListLinearSVMClassifier(self):
        self.evaluateModelFullAnalysisSansGeneList(LinearSVMTrainer(True))

    def testFullAnalysisSansGeneListRadialBasisFunctionSVMRegressor(self):
        self.evaluateModelFullAnalysisSansGeneList(
            RadialBasisFunctionSVMTrainer(False))

    def testFullAnalysisSansGeneListRadialBasisFunctionSVMClassifier(self):
        self.evaluateModelFullAnalysisSansGeneList(
            RadialBasisFunctionSVMTrainer(True))

    def testFullAnalysisSansGeneListElasticNetRegressor(self):
        self.evaluateModelFullAnalysisSansGeneList(ElasticNetTrainer(False))

    def testFullAnalysisSansGeneListRidgeRegressor(self):
        self.evaluateModelFullAnalysisSansGeneList(
            RidgeRegressionTrainer(False))

    def testFullAnalysisSansGeneListLassoRegressor(self):
        self.evaluateModelFullAnalysisSansGeneList(
            LassoRegressionTrainer(False))

    def evaluateModelFullAnalysisSansGeneList(self, trainer):
        processed_args = self.formatRandomizedData(trainer.is_classifier, True,
                                                   False)
        processed_args.analyze_all = True
        ml_service = MachineLearningService(processed_args)

        ml_service.log.setLevel(logging.DEBUG)
        trainer.log.setLevel(logging.DEBUG)

        self.analyzeAndAssertResults(ml_service, 1, trainer, True, False)

    def testStaticFeaturesAnalysis(self):
        trainer = ElasticNetTrainer(False)
        processed_args = self.formatRandomizedData(trainer.is_classifier,
                                                   False, True)
        assert len(processed_args.static_features) > 0
        ml_service = MachineLearningService(processed_args)

        ml_service.log.setLevel(logging.DEBUG)
        trainer.log.setLevel(logging.DEBUG)

        self.analyzeAndAssertResults(ml_service, 8, trainer, False, True)
예제 #17
0
class HTMLWritingService(object):

    log = LoggerFactory.createLog(__name__)

    RECORD_FILE = "FullResultsSummary.txt"
    SUMMARY_FILE = "SummaryReport.html"

    def __init__(self, input_folder, is_classifier):
        self.input_folder = input_folder
        self.is_classifier = is_classifier

    def writeSummaryFile(self):
        self.createStatsOverviewFromFile()

    def createStatsOverviewFromFile(self):
        stats_overview_object = self.generateStatsOverviewObject()
        new_file = self.generateNewReportFile(stats_overview_object)

        with open(self.input_folder + "/" + self.SUMMARY_FILE,
                  "w") as summary_file:
            try:
                for line in new_file:
                    summary_file.write(line)
            except ValueError as valueError:
                self.log.error(valueError)
            finally:
                summary_file.close()

    def generateStatsOverviewObject(self):
        stats_overview_object = {}
        with open(self.input_folder + "/" + self.RECORD_FILE) as record_file:
            try:
                for line_index, line in enumerate(record_file):
                    line_split = [
                        segment.strip() for segment in line.split("---")
                    ]
                    if len(line_split) < 3:
                        self.log.warning(
                            "Line from results file not split properly: %s",
                            line)
                        continue

                    scores = self.translateToNumericList(line_split[2])
                    accuracies = self.translateToNumericList(line_split[3])
                    if stats_overview_object.get(line_split[0]) is None:
                        stats_overview_object[line_split[0]] = {
                            line_split[1]: [scores, accuracies]
                        }
                    else:
                        stats_overview_object[line_split[0]][line_split[1]] = [
                            scores, accuracies
                        ]
            except ValueError as value_error:
                self.log.error(value_error)
            finally:
                record_file.close()
        return stats_overview_object

    def translateToNumericList(self, line_split):
        return [
            SafeCastUtil.safeCast(val, float)
            for val in line_split.replace("[", "").replace("]", "").split(",")
        ]

    def generateNewReportFile(self, stats_overview_object):
        path_of_this_file = os.path.realpath(__file__)
        template_path = os.path.abspath(
            os.path.join(path_of_this_file,
                         os.pardir)) + "/Reports/reportTemplate.html"
        new_file = []
        with open(template_path) as template_file:
            try:
                for line_index, line in enumerate(template_file):
                    if "//INSERT DEFAULT MIN SCORE HERE" in line:
                        new_file.append(
                            "\t\t\t\tvar DEFAULT_MIN_SCORE = " +
                            SafeCastUtil.safeCast(
                                AbstractModelTrainer.DEFAULT_MIN_SCORE, str) +
                            ";\n")
                    elif "//INSERT CHART DATA HERE" in line:
                        new_file.append(
                            "\t\t\t\t$scope.allData = " +
                            SafeCastUtil.safeCast(stats_overview_object, str) +
                            ";\n")
                    elif "//INSERT IS CLASSIFIER HERE" in line:
                        new_file.append("\t\t\t\t$scope.isClassifier = " +
                                        SafeCastUtil.safeCast(
                                            self.is_classifier, str).lower() +
                                        ";\n")
                    else:
                        new_file.append(line)
            except ValueError as valueError:
                self.log.error(valueError)
            finally:
                template_file.close()
        return new_file
class ArgumentProcessingService(object):

    log = LoggerFactory.createLog(__name__)

    ARGUMENTS_FILE = "arguments.txt"
    GENE_LISTS = "gene_list"

    UNFILLED_VALUE_PLACEHOLDER = "'0'"

    RESULTS = "results"
    IS_CLASSIFIER = "is_classifier"
    FEATURES = "features"
    FEATURE_NAMES = "featureNames"
    INNER_MONTE_CARLO_PERMUTATIONS = "inner_monte_carlo_permutations"
    OUTER_MONTE_CARLO_PERMUTATIONS = "outer_monte_carlo_permutations"
    DATA_SPLIT = "data_split"
    NUM_THREADS = "num_threads"
    ALGORITHM_CONFIGS = "algorithm_configs"
    RECORD_DIAGNOSTICS = "record_diagnostics"
    STATIC_FEATURES = "static_features"

    # RSEN Specific Arguments
    RSEN_P_VAL = "rsen_p_val"
    RSEN_K_VAL = "rsen_k_val"
    RSEN_COMBINE_GENE_LISTS = "rsen_combine_gene_lists"
    BINARY_CATEGORICAL_MATRIX = "binary_categorical_matrix"

    # For AnalysisType.FULL_CLA_SPECIFIC_COMBO
    SPECIFIC_COMBOS = "specific_combos"

    # For AnalysisType.NO_GENE_LISTS
    IGNORE_GENE_LISTS = "ignore_gene_lists"
    NUM_TOP_FEATURES = "num_top_features"

    # For AnalysisType.INDIVIDUAL_TRAIN
    INDIVIDUAL_TRAIN_ALGORITHM = "individual_train_algorithm"
    INDIVIDUAL_TRAIN_HYPERPARAMS = "individual_train_hyperparams"
    INDIVIDUAL_TRAIN_FEATURE_GENE_LIST_COMBO = "individual_train_combo"

    # For AnalysisType.RECOMMENDATIONS
    VIABILITY_ACCEPTANCE = "viability_acceptance"

    def __init__(self, input_folder):
        self.input_folder = input_folder

    def handleInputFolder(self):
        directory_contents = os.listdir(self.input_folder)
        if not self.validateDirectoryContents(directory_contents):
            self.log.error("Invalid directory contents, needs a %s file.", self.ARGUMENTS_FILE)
            return None

        arguments = self.fetchArguments(self.input_folder + "/" + self.ARGUMENTS_FILE)
        results_file = arguments.get(self.RESULTS)
        is_classifier = SafeCastUtil.safeCast(arguments.get(self.IS_CLASSIFIER), int) == 1
        analyze_all = self.fetchOrReturnDefault(arguments.get(self.IGNORE_GENE_LISTS), bool, False)

        algorithm_configs = self.handleAlgorithmConfigs(arguments)

        if is_classifier is None or results_file is None:
            self.log.error("Unable to perform CLA analysis. Must explicitly state is_classifier and declare the results"
                           "file in the %s file.", self.ARGUMENTS_FILE)
            return None
        results_list = self.validateAndExtractResults(results_file, is_classifier)

        gene_lists = self.extractGeneLists()
        if len(gene_lists) <= 1 and not analyze_all:
            self.log.error("Unable to perform standard CLA analysis. No gene lists found in the target folder.")
            return None

        write_diagnostics = self.fetchOrReturnDefault(arguments.get(self.RECORD_DIAGNOSTICS), bool, False)
        feature_files = [file for file in os.listdir(self.input_folder) if self.fileIsFeatureFile(file, results_file)]

        static_feature_files = [feature_file for feature_file in
                                self.fetchOrReturnDefault(arguments.get(self.STATIC_FEATURES), str, "").split(",")
                                if len(feature_file.strip()) > 0]

        if analyze_all:
            feature_map = self.createAndValidateFullFeatureMatrix(results_list, feature_files)
        else:
            feature_map = self.createAndValidateFeatureMatrix(results_list, gene_lists, write_diagnostics, feature_files,
                                                              static_feature_files)
        binary_cat_matrix = self.fetchBinaryCatMatrixIfApplicable(arguments, gene_lists, results_list, analyze_all,
                                                                  static_feature_files)

        if not feature_map or not results_list:
            return None
        inner_monte_carlo_perms = self.fetchOrReturnDefault(arguments.get(self.INNER_MONTE_CARLO_PERMUTATIONS), int, 10)
        outer_monte_carlo_perms = self.fetchOrReturnDefault(arguments.get(self.OUTER_MONTE_CARLO_PERMUTATIONS), int, 10)
        data_split = self.fetchOrReturnDefault(arguments.get(self.DATA_SPLIT), float, 0.8)
        num_threads = self.fetchOrReturnDefault(arguments.get(self.NUM_THREADS), int, multiprocessing.cpu_count())

        individual_train_config = self.createIndividualTrainConfig(arguments)
        rsen_config = self.createRSENConfig(arguments, binary_cat_matrix)
        univariate_config = self.createUnivariateConfig(arguments, analyze_all)
        specific_combos = self.determineSpecificCombos(arguments.get(self.SPECIFIC_COMBOS))

        recs_config = self.createRecommendationsConfig(arguments)

        return ProcessedArguments(results_list, is_classifier, feature_map, gene_lists, inner_monte_carlo_perms,
                                  outer_monte_carlo_perms, data_split, algorithm_configs, num_threads,
                                  write_diagnostics, individual_train_config, rsen_config, recs_config, univariate_config,
                                  specific_combos, static_feature_files)

    def validateDirectoryContents(self, directory_contents):
        return self.ARGUMENTS_FILE in directory_contents

    def fetchArguments(self, arguments_file):
        arguments = {}
        with open(arguments_file) as data_file:
            try:
                for line in data_file:
                    line_trimmed_split = line.strip().split("=")
                    if len(line_trimmed_split) > 1:
                        arguments[line_trimmed_split[0]] = line_trimmed_split[1]
            except ValueError as value_error:
                self.log.error(value_error)
            finally:
                self.log.debug("Closing file %s", arguments_file)
                data_file.close()
        return arguments

    def validateAndExtractResults(self, results_file, is_classifier):
        sample_list = []
        cast_type = float
        if is_classifier:
            cast_type = int
        results_path = self.input_folder + "/" + results_file
        with open(results_path) as data_file:
            try:
                for line_index, line in enumerate(data_file):
                    if len(re.findall(r'^\s*$', line)) > 0 or line_index == 0:  # header or whitespace
                        continue
                    line_trimmed_split = line.strip().split(",")
                    if len(line_trimmed_split) != 2:
                        self.log.error("Each line in %s must be 2 columns. Aborting argument processing.",
                                       results_file)
                        raise ValueError("Each line in results file must be 2 columns.")
                    cell_line = line_trimmed_split[0]
                    cell_result = SafeCastUtil.safeCast(line_trimmed_split[1], cast_type)
                    if cell_line in sample_list:
                        self.log.error("Repeated cell line name: %s. Aborting argument processing.", cell_line)
                        raise ValueError("Repeated cell line name.")
                    else:
                        sample_list.append([cell_line, cell_result])
            except ValueError as value_error:
                self.log.error(value_error)
            finally:
                self.log.debug("Closing file %s", results_file)
                data_file.close()
        return sample_list

    def extractGeneLists(self):
        gene_lists = {"null_gene_list": []}
        files = os.listdir(self.input_folder)
        for file in [f for f in files if self.GENE_LISTS in f]:
            file_path = self.input_folder + "/" + file
            with open(file_path) as gene_list_file:
                genes = gene_list_file.read().strip().split(",")
                genes_deduped = []
                [genes_deduped.append(g.strip()) for g in genes if g not in genes_deduped and len(g.strip()) > 0]
                if len(genes_deduped) > 0:
                    gene_lists[file.split(".csv")[0]] = genes_deduped
                else:
                    self.log.warning("No genes found in gene list %s, will not process.", file)

        return gene_lists

    def createAndValidateFullFeatureMatrix(self, results_list, feature_files):
        frames = []
        cell_lines = [result[0] for result in results_list]

        for file in feature_files:
            self.log.info("Fetching all features for file %s", file)
            frames.append(self.fetchFullDataframe(cell_lines, file))

        combined_frame = pandas.concat(frames, axis=1, join='inner')
        transposed_dict = combined_frame.T.to_dict()

        self.log.info("Formatting all features across all files.")
        return self.formatFullFeatureMatrix(SafeCastUtil.safeCast(combined_frame.columns, list), transposed_dict)

    def fetchFullDataframe(self, cell_lines, file):
        file_name = file.split(".")[0]
        features_path = self.input_folder + "/" + file
        try:
            frame = pandas.read_csv(features_path)
        except ValueError as value_error:
            self.log.error("Make sure feature file %s is well formed with no superfluous commas.", file)
            raise value_error
        frame = frame.loc[:, ~frame.columns.str.contains('^Unnamed')]
        frame.columns = [file_name + "." + feature for feature in frame.columns]
        frame.index = cell_lines
        columns = SafeCastUtil.safeCast(frame.columns, list)
        [frame.drop(feature) for feature in columns if columns.count(feature) > 1]
        return frame

    def formatFullFeatureMatrix(self, feature_names, transposed_dict):
        feature_matrix = {self.FEATURE_NAMES: feature_names}
        all_cell_lines = SafeCastUtil.safeCast(transposed_dict.keys(), list)
        num_cell_lines = len(all_cell_lines)
        for i in range(num_cell_lines):
            values = SafeCastUtil.safeCast(transposed_dict[all_cell_lines[i]].values(), list)
            formatted_values = [self.formatValue(value) for value in values]
            feature_matrix[all_cell_lines[i]] = SafeCastUtil.safeCast(formatted_values, list)
        return feature_matrix

    def formatValue(self, value):
        value_as_float = SafeCastUtil.safeCast(value, float)
        if value_as_float is not None:
            return value_as_float
        else:
            return value.strip()

    def fetchUniqueFeatureNamesAndIndices(self, line_split, file_name):
        unvalidated_features = [file_name + "." + name.strip() for name in line_split if len(name.strip()) > 0]
        valid_indices = []
        valid_features = []
        for i in range(0, len(unvalidated_features)):
            if unvalidated_features.count(unvalidated_features[i]) == 1:
                valid_indices.append(i)

        for i in range(0, len(unvalidated_features)):
            if i in valid_indices:
                valid_features.append(unvalidated_features[i])
        return valid_indices, valid_features

    def createAndValidateFeatureMatrix(self, results_list, gene_lists, write_diagnostics, feature_files,
                                       static_feature_files):
        incomplete_features = []
        for file in [feature_file for feature_file in feature_files if feature_file not in static_feature_files]:
            features_path = self.input_folder + "/" + file
            validated_features, num_features = self.validateGeneLists(features_path, file, gene_lists)
            incomplete_features.append([file, validated_features, num_features])

        if write_diagnostics:
            self.writeDiagnostics(incomplete_features)

        feature_matrix = {self.FEATURE_NAMES: []}
        for file in feature_files:
            features_path = self.input_folder + "/" + file
            if file not in static_feature_files:
                self.extractFeatureMatrix(feature_matrix, features_path, file, gene_lists, results_list)
            else:
                data_frame = self.fetchFullDataframe([result[0] for result in results_list], file)
                feature_names = SafeCastUtil.safeCast(data_frame.columns, list)
                transposed_dict = data_frame.T.to_dict()
                formatted_matrix = self.formatFullFeatureMatrix(feature_names, transposed_dict)

                for key in formatted_matrix.keys():
                    if key in feature_matrix:
                        [feature_matrix[key].append(value) for value in formatted_matrix[key]]
                    else:
                        feature_matrix[key] = formatted_matrix[key]
        return feature_matrix

    def validateGeneLists(self, features_path, file, gene_lists):
        features_missing_from_files = {}
        num_features = 0
        with open(features_path) as feature_file:
            try:
                for line_index, line in enumerate(feature_file):
                    if line_index == 0:
                        feature_names = line.split(",")
                        num_features = len(feature_names)
                        features_missing_from_files = self.validateAndTrimGeneList(feature_names, gene_lists, file)
                    break
            except ValueError as value_error:
                self.log.error(value_error)
                return features_missing_from_files
            finally:
                self.log.debug("Closing file %s", feature_file)
                feature_file.close()
        return features_missing_from_files, num_features

    def validateAndTrimGeneList(self, feature_list, gene_lists, file):
        unused_features = {}
        for key in gene_lists.keys():
            for gene in gene_lists[key]:
                if gene not in [feature.strip() for feature in feature_list]:
                    index = gene_lists[key].index(gene)
                    if unused_features.get(key) is None:
                        unused_features[key] = [[gene, index]]
                    else:
                        unused_features[key].append([gene, (index + len(unused_features[key]))])
                    self.log.warning("Incomplete dataset: gene %s from gene list %s not found in file %s. "
                                     "Will not process this gene in this file.", gene, key, file)
        return unused_features

    def writeDiagnostics(self, features_removed):
        message = ""
        for feature_file in features_removed:
            message += "\nFeatures from gene list(s) not available in " + feature_file[0] + ":\n"
            for gene_list in feature_file[1].keys():
                num_genes_missing = len(feature_file[1][gene_list])
                percent_genes_missing = round((num_genes_missing / feature_file[2]) * 100, 2)
                message += ("\t" + SafeCastUtil.safeCast(num_genes_missing, str) + " (" +
                                   SafeCastUtil.safeCast(percent_genes_missing, str) + " %" +
                            ") features not present in " + gene_list + ".csv:\n")
                for gene in feature_file[1][gene_list]:
                    message += ("\t\t" + gene[0] + " at index " + SafeCastUtil.safeCast(gene[1], str) + "\n")
        message += "\n\n######################\n\n"
        DiagnosticsFileWriter.writeToFile(self.input_folder, message, self.log)

    def extractFeatureMatrix(self, feature_matrix, features_path, file, gene_lists, results_list):
        self.log.info("Extracting important features for %s.", file)
        gene_list_features = []
        for gene_list in gene_lists.values():
            for gene_list_feature in gene_list:
                if gene_list_feature not in gene_list_features:
                    gene_list_features.append(gene_list_feature)

        with open(features_path) as feature_file:
            try:
                important_feature_indices = []
                for line_index, line in enumerate(feature_file):
                    if line_index == 0:
                        feature_names = line.split(",")
                        for gene_list_feature in gene_list_features:
                            important_index = None
                            feature_name = self.determineFeatureName(gene_list_feature, file)
                            for i in range(0, len(feature_names)):
                                if feature_names[i].strip() == gene_list_feature.strip():
                                    important_index = i
                            if feature_name not in feature_matrix[self.FEATURE_NAMES]:
                                feature_matrix[self.FEATURE_NAMES].append(feature_name)
                            important_feature_indices.append(important_index)
                    else:
                        features = self.extractCastedFeatures(line, important_feature_indices)
                        try:
                            cell_line = results_list[line_index - 1]
                        except IndexError as index_error:
                            self.log.error("Index out of range. Results file is shorter than feature file [%s]: %s",
                                           feature_file, SafeCastUtil.safeCast(index_error, str))
                            raise ValueError("Make sure there are no extra lines (including whitespace) in ALL feature "
                                             "files and only feature files you want to analyze are in target folder.")
                        if not cell_line[0] in feature_matrix:
                            feature_matrix[cell_line[0]] = features
                        else:
                            feature_matrix[cell_line[0]] = feature_matrix[cell_line[0]] + features
                        if line_index > len(results_list):
                            self.log.error("Invalid line count for %s", file)
                            raise ValueError("Invalid line count for" + file + ". Must be " +
                                             SafeCastUtil.safeCast(file, str) + "lines long.")
            except ValueError as value_error:
                self.log.error("Please verify results file is the same number of rows as all feature files.")
                self.log.error(value_error)
                return None
            finally:
                feature_file.close()
                self.log.debug("Closing file %s", feature_file)

    def fileIsFeatureFile(self, file, results_file):
        algorithm_files = [algo + ".csv" for algo in SupportedMachineLearningAlgorithms.fetchAlgorithms()]

        return file != results_file and file != self.ARGUMENTS_FILE and self.GENE_LISTS not in file and\
               file not in algorithm_files and ".csv" in file.lower()

    def determineFeatureName(self, feature_name, file):
        return SafeCastUtil.safeCast(file.split(".")[0] + "." + feature_name.strip(), str)

    def extractCastedFeatures(self, line, important_feature_indices):
        important_features = []
        feature_values = line.strip().split(",")
        for index in important_feature_indices:
            if index is None:
                # TODO: Verify that this is acceptable, it works for one hot encoding and should never vary in any model
                important_features.append(self.UNFILLED_VALUE_PLACEHOLDER)
            else:
                if SafeCastUtil.safeCast(feature_values[index], float) is not None:
                    important_features.append(SafeCastUtil.safeCast(feature_values[index].strip(), float))
                else:
                    important_features.append(SafeCastUtil.safeCast(feature_values[index].strip(), str))
        return important_features

    def handleAlgorithmConfigs(self, arguments):
        algos = SupportedMachineLearningAlgorithms.fetchAlgorithms()
        configs = {}
        default_inner_perms = self.fetchOrReturnDefault(arguments.get(self.INNER_MONTE_CARLO_PERMUTATIONS), int, 10)
        default_outer_perms = self.fetchOrReturnDefault(arguments.get(self.OUTER_MONTE_CARLO_PERMUTATIONS), int, 10)

        for algo in algos:
            algo_config = arguments.get(algo)
            if algo_config is None:
                configs[algo] = [True, default_inner_perms, default_outer_perms]
            else:
                config_split = [param.strip() for param in algo_config.split(",")]
                if len(config_split) >= 3:
                    configs[algo] = [config_split[0] == 'True',
                                     SafeCastUtil.safeCast(config_split[1], int),
                                     SafeCastUtil.safeCast(config_split[2], int)]
        return configs

    def createRSENConfig(self, arguments, binary_cat_matrix):
        rsen_p_val = self.fetchOrReturnDefault(arguments.get(self.RSEN_P_VAL), float, 0.0)
        rsen_k_val = self.fetchOrReturnDefault(arguments.get(self.RSEN_P_VAL), float, 0.1)
        rsen_combine_gene_lists = self.fetchOrReturnDefault(arguments.get(self.RSEN_COMBINE_GENE_LISTS), bool, False)
        rsen_config = RSENConfig(binary_cat_matrix, rsen_p_val, rsen_k_val, rsen_combine_gene_lists)
        return rsen_config

    def createUnivariateConfig(self, arguments, analyze_all):
        num_top_features = self.fetchOrReturnDefault(arguments.get(self.NUM_TOP_FEATURES), int, 147)
        return UnivariateConfig(analyze_all, num_top_features)

    def createIndividualTrainConfig(self, arguments):
        individual_train_algorithm = self.fetchOrReturnDefault(arguments.get(self.INDIVIDUAL_TRAIN_ALGORITHM), str,
                                                               None)
        individual_train_hyperparams = self.fetchOrReturnDefault(arguments.get(self.INDIVIDUAL_TRAIN_HYPERPARAMS), str,
                                                                 "")
        individual_train_feature_gene_list_combo = self.fetchOrReturnDefault(
            arguments.get(self.INDIVIDUAL_TRAIN_FEATURE_GENE_LIST_COMBO),
            str, None)
        individual_train_config = IndividualTrainConfig(individual_train_algorithm, individual_train_hyperparams,
                                                        individual_train_feature_gene_list_combo)
        return individual_train_config

    def createRecommendationsConfig(self, arguments):
        viability_acceptance = self.fetchOrReturnDefault(arguments.get(self.VIABILITY_ACCEPTANCE), float, None)
        recs_config = RecommendationsConfig(viability_acceptance)
        return recs_config

    def fetchBinaryCatMatrixIfApplicable(self, arguments, gene_lists, results_list, analyze_all, static_feature_files):
        binary_matrix_file = arguments.get(ArgumentProcessingService.BINARY_CATEGORICAL_MATRIX)
        if binary_matrix_file is not None:
            if analyze_all:
                return self.createAndValidateFullFeatureMatrix(results_list, [binary_matrix_file])
            return self.createAndValidateFeatureMatrix(results_list, gene_lists, False, [binary_matrix_file],
                                                       static_feature_files)
        else:
            return None

    def fetchOrReturnDefault(self, field, to_type, default):
        if field:
            if field.lower() == 'false' and to_type is bool:
                return False
            return SafeCastUtil.safeCast(field, to_type)
        else:
            return default

    def determineSpecificCombos(self, combos):
        if combos is None:
            return []
        return [combo.strip().replace("\"", "") for combo in combos.split(",")]
예제 #19
0
from LoggerFactory import LoggerFactory

log = LoggerFactory.getLogger('fridge')
friend = 'cooker'

def first():
    log.info('Hi, I\'m the fridge')

def second():
    log.warning('Some strange things happen near my place')

def third():
    log.error('OMG, the %s exploded!', friend)
예제 #20
0
파일: __main__.py 프로젝트: sailfish009/cla
import sys
import os

from ArgumentProcessingService import ArgumentProcessingService
from LoggerFactory import LoggerFactory
from MachineLearningService import MachineLearningService
from HTMLWritingService import HTMLWritingService
from RecommendationsService import RecommendationsService
from Utilities.SafeCastUtil import SafeCastUtil
from Utilities.FileConverter import FileConverter

log = LoggerFactory.createLog(__name__)


def main():
    arguments = sys.argv[1:]
    if len(arguments) == 0:
        promptUserForInput()
    elif len(arguments) == 2 and arguments[0] == '0':
        runMainCellLineAnalysis(arguments[1])
    elif len(arguments) == 2 and arguments[0] == '1':
        FileConverter.convertMatLabToCSV(arguments[1])
    elif len(arguments) == 2 and arguments[0] == '2':
        fetchRecommendations(arguments[1])
    else:
        log.error("Exiting program, invalid data sent in target folder.")
    return


def promptUserForInput():
    simulation_to_run = input(
class RecommendationsService(object):

    log = LoggerFactory.createLog(__name__)

    PRE_REC_ANALYSIS_FILE = "PreRecAnalysis.csv"
    PREDICTIONS_FILE = "Predictions.csv"
    PREDICTIONS_BY_CELL_LINE_FILE = "PredictionsByCellLine.csv"

    HEADER = "header"

    STD_DEVIATION = "std_deviation"
    MEAN = "mean"
    MEDIAN = "median"

    def __init__(self, inputs):
        self.inputs = inputs

    def analyzeRecommendations(self, input_folder):
        self.preRecsAnalysis(input_folder)
        self.recommendByHoldout(input_folder)
        self.writeFinalRecsResults(input_folder)

    def preRecsAnalysis(self, input_folder):
        self.log.info("Performing pre-recs analysis on all drugs.")
        drugs = self.inputs.keys()
        cell_line_predictions_by_drug = OrderedDict()
        header = numpy.concatenate(
            (["cell_line"], SafeCastUtil.safeCast(drugs, list)), axis=0)
        cell_line_predictions_by_drug[self.HEADER] = header
        cell_line_predictions_by_drug[self.STD_DEVIATION] = [
            self.STD_DEVIATION
        ]
        cell_line_predictions_by_drug[self.MEAN] = [self.MEAN]
        cell_line_predictions_by_drug[self.MEDIAN] = [self.MEDIAN]
        for drug in drugs:
            processed_arguments = self.inputs.get(drug)
            results = processed_arguments.results
            combos = self.determineGeneListCombos(processed_arguments)

            processed_arguments.data_split = 1.0
            data_formatting_service = DataFormattingService(
                processed_arguments)
            formatted_inputs = data_formatting_service.formatData(True, True)
            self.log.info("Determining best combo and score for drug %s.",
                          drug)
            recs_model_info = self.fetchBestModelComboAndScore(
                drug, input_folder, formatted_inputs, results, combos,
                processed_arguments)

            if recs_model_info is None or recs_model_info.model is None or recs_model_info.combo is None:
                continue
            self.generateMultiplePredictions(recs_model_info, formatted_inputs,
                                             results,
                                             cell_line_predictions_by_drug)

        for cell_line in cell_line_predictions_by_drug:
            while len(cell_line_predictions_by_drug[cell_line]) < \
                    len(cell_line_predictions_by_drug[RecommendationsService.HEADER]):
                cell_line_predictions_by_drug[cell_line].append(
                    MachineLearningService.DELIMITER)
        self.writePreRecAnalysisFile(cell_line_predictions_by_drug,
                                     input_folder)

    def generateMultiplePredictions(self, recs_model_info, formatted_inputs,
                                    results, cell_line_predictions_by_drug):
        trimmed_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
            DataFormattingService.TRAINING_MATRIX, recs_model_info.combo,
            formatted_inputs, AnalysisType.RECOMMENDATIONS)

        features, relevant_results = recs_model_info.trainer.populateFeaturesAndResultsByCellLine(
            trimmed_matrix, results)
        cell_lines_in_order = [
            key for key in trimmed_matrix.keys()
            if key is not ArgumentProcessingService.FEATURE_NAMES
        ]
        predictions = recs_model_info.model.predict(features)

        for i in range(0, len(cell_lines_in_order)):
            cell_line = cell_lines_in_order[i]
            if cell_line_predictions_by_drug.get(cell_line) is not None:
                cell_line_predictions_by_drug[cell_line].append(predictions[i])
            else:
                max_dict_length = 2
                for key in cell_line_predictions_by_drug.keys():
                    if key == self.HEADER:
                        continue
                    if len(cell_line_predictions_by_drug[key]
                           ) > max_dict_length:
                        max_dict_length = len(
                            cell_line_predictions_by_drug[key])
                row = [cell_line]
                for _ in range(2, max_dict_length):
                    row.append(MachineLearningService.DELIMITER)
                row.append(predictions[i])
                cell_line_predictions_by_drug[cell_line] = row
        cell_line_predictions_by_drug[self.STD_DEVIATION].append(
            numpy.std(predictions))
        cell_line_predictions_by_drug[self.MEAN].append(
            numpy.mean(predictions))
        cell_line_predictions_by_drug[self.MEDIAN].append(
            numpy.median(predictions))

    def writePreRecAnalysisFile(self, cell_line_predictions_by_drug,
                                input_folder):
        with open(input_folder + "/" + self.PRE_REC_ANALYSIS_FILE,
                  "w",
                  newline='') as pre_rec_analysis_file:
            try:
                writer = csv.writer(pre_rec_analysis_file)
                for key in [
                        key for key in cell_line_predictions_by_drug.keys()
                        if key is not RecommendationsService.STD_DEVIATION
                        and key is not RecommendationsService.MEDIAN
                        and key is not RecommendationsService.MEAN
                ]:
                    writer.writerow(cell_line_predictions_by_drug.get(key))
                writer.writerow(
                    cell_line_predictions_by_drug[RecommendationsService.MEAN])
                writer.writerow(cell_line_predictions_by_drug[
                    RecommendationsService.MEDIAN])
                writer.writerow(cell_line_predictions_by_drug[
                    RecommendationsService.STD_DEVIATION])
            except ValueError as error:
                self.log.error("Error writing to file %s. %s",
                               pre_rec_analysis_file, error)
            finally:
                pre_rec_analysis_file.close()

    def recommendByHoldout(self, input_folder):
        # TODO: Support for inputs to be a dict of drug_name => input, not just one set of inputs for all drugs.
        self.log.info(
            "Starting recommendation by holdout analysis on all drugs.")
        max_nodes = multiprocessing.cpu_count()

        for drug in self.inputs.keys():
            self.log.info(
                "Starting recommendation by holdout analysis on specific drug %s.",
                drug)
            self.handleDrug(drug, input_folder, max_nodes,
                            self.inputs.get(drug))

    def handleDrug(self, drug, input_folder, max_nodes, processed_arguments):
        combos = self.determineGeneListCombos(processed_arguments)
        cell_line_map = processed_arguments.features
        results = processed_arguments.results
        cloned_inputs = copy.deepcopy(processed_arguments)
        cloned_inputs.data_split = 1.0
        data_formatting_service = DataFormattingService(cloned_inputs)
        formatted_inputs = data_formatting_service.formatData(True, True)
        feature_names = formatted_inputs.get(
            ArgumentProcessingService.FEATURE_NAMES)

        requested_threads = processed_arguments.num_threads
        nodes_to_use = numpy.amin([requested_threads, max_nodes])

        Parallel(n_jobs=nodes_to_use)(delayed(self.handleCellLine)(
            cell_line, combos, drug, feature_names, formatted_inputs,
            input_folder, processed_arguments, results)
                                      for cell_line in cell_line_map.keys())

    def handleCellLine(self, cell_line, combos, drug, feature_names,
                       formatted_inputs, input_folder, processed_arguments,
                       results):
        if cell_line == ArgumentProcessingService.FEATURE_NAMES:
            return
        self.log.info("Holding out cell line %s for drug %s", cell_line, drug)
        trimmed_cell_lines, trimmed_results = self.removeNonNullCellLineFromFeaturesAndResults(
            cell_line, formatted_inputs, results)
        recs_model_info = self.fetchBestModelComboAndScore(
            drug, input_folder, trimmed_cell_lines, trimmed_results, combos,
            processed_arguments)
        if recs_model_info is None or recs_model_info.model is None or recs_model_info.combo is None:
            self.log.warn(
                "Unable to train best model or get best combo for cell line %s and drug %s.",
                cell_line, drug)
            return

        prediction = self.generateSinglePrediction(recs_model_info.model,
                                                   recs_model_info.combo,
                                                   cell_line, feature_names,
                                                   formatted_inputs)

        self.writeToPredictionsCsvInLock(cell_line, drug, input_folder,
                                         prediction, recs_model_info.score)

    def writeToPredictionsCsvInLock(self, cell_line, drug, input_folder,
                                    prediction, score):
        self.log.debug("Locking current thread %s.",
                       threading.current_thread())
        lock = threading.Lock()
        lock.acquire(True)
        write_action = "w"
        if self.PREDICTIONS_FILE in os.listdir(input_folder):
            write_action = "a"
        with open(input_folder + "/" + self.PREDICTIONS_FILE,
                  write_action,
                  newline='') as predictions_file:
            try:
                writer = csv.writer(predictions_file)
                if write_action == "w":
                    writer.writerow(
                        ["Drug", "Cell_Line", "Prediction", "R2^Score"])
                line = [
                    drug, cell_line,
                    SafeCastUtil.safeCast(prediction, str),
                    SafeCastUtil.safeCast(score, str)
                ]
                writer.writerow(line)
            except ValueError as error:
                self.log.error("Error writing to file %s. %s",
                               self.PREDICTIONS_FILE, error)
            finally:
                predictions_file.close()
                self.log.debug("Releasing current thread %s.",
                               threading.current_thread())
                lock.release()

    def determineGeneListCombos(self, processed_arguments):
        gene_lists = processed_arguments.gene_lists
        feature_names = processed_arguments.features.get(
            ArgumentProcessingService.FEATURE_NAMES)
        static_features = processed_arguments.static_features
        combos, expected_length = GeneListComboUtility.determineCombos(
            gene_lists, feature_names, static_features)

        if len(combos) != expected_length:
            self.log.warning(
                "Unexpected number of combos detected, should be %s but instead created %s.\n%s",
                expected_length, len(combos), combos)
        return combos

    def removeNonNullCellLineFromFeaturesAndResults(self, cell_line,
                                                    formatted_inputs, results):
        cloned_formatted_data = copy.deepcopy(formatted_inputs)
        if cell_line is not None:
            del cloned_formatted_data.get(
                DataFormattingService.TRAINING_MATRIX)[cell_line]

        cloned_results = [
            result for result in results
            if result[0] is not cell_line and cell_line is not None
        ]
        return cloned_formatted_data, cloned_results

    def getDrugFolders(self, input_folder):
        folders = os.listdir(input_folder)
        # TODO: Figure out required phrase to mark it as a drug folder
        drug_folders = [f for f in folders if 'Analysis' in f]
        return drug_folders

    def fetchBestModelComboAndScore(self, drug, analysis_files_folder,
                                    trimmed_cell_lines, trimmed_results,
                                    combos, processed_arguments):
        # TODO: ultimately we'd want to use multiple algorithms, and make an ensemble prediction/prescription.
        # But for now, let's stick with one algorithm.
        best_combo_string = None
        best_scoring_algo = None
        optimal_hyperparams = None
        top_score = AbstractModelTrainer.DEFAULT_MIN_SCORE
        for analysis_file_name in self.fetchAnalysisFiles(
                drug, analysis_files_folder):
            file = analysis_files_folder + "/" + drug + "/" + analysis_file_name
            with open(file, 'rt') as analysis_file:
                reader = csv.reader(analysis_file)
                try:
                    header = []
                    indices_of_outer_loops = []
                    line_index = -1
                    for row in reader:
                        line_index += 1
                        if line_index == 0:
                            header = row
                            for i in range(0, len(row)):
                                if MachineLearningService.SCORE_AND_HYPERPARAM_PHRASE in row[
                                        i]:
                                    indices_of_outer_loops.append(i)
                            continue
                        string_combo = row[header.index(
                            MachineLearningService.FEATURE_FILE_GENE_LIST_COMBO
                        )]
                        score = SafeCastUtil.safeCast(
                            row[header.index(
                                self.scorePhrase(processed_arguments))], float)
                        if score is not None and score > top_score:
                            best_scoring_algo = analysis_file_name.split(
                                ".")[0]
                            best_combo_string = string_combo
                            top_score = score
                            optimal_hyperparams = self.fetchBestHyperparams(
                                row, indices_of_outer_loops)
                except ValueError as valueError:
                    self.log.error(valueError)
                finally:
                    self.log.debug("Closing file %s", analysis_file)
                    analysis_file.close()
        if top_score <= 0:
            # TODO - Consider writing this to an explicit diagnostic file via extracting to first class service,
            # not just the process error log.
            self.log.error(
                'Error: no method found an R2 higher than 0 for drug: %s.',
                drug)
            return None

        best_combo = self.determineBestComboFromString(best_combo_string,
                                                       combos,
                                                       processed_arguments)
        best_model, trainer = self.trainBestModelWithCombo(
            best_scoring_algo, best_combo, optimal_hyperparams,
            trimmed_cell_lines, trimmed_results, processed_arguments)
        return RecommendationsModelInfo(trainer, top_score, best_combo,
                                        best_model)

    def scorePhrase(self, processed_arguments):
        if processed_arguments.is_classifier:
            return MachineLearningService.PERCENT_ACCURATE_PREDICTIONS
        return MachineLearningService.R_SQUARED_SCORE

    def fetchBestHyperparams(self, row, indices_of_outer_loops):
        monte_carlo_results = self.getMonteCarloResults(
            row, indices_of_outer_loops)
        best_hyps = None
        top_score = AbstractModelTrainer.DEFAULT_MIN_SCORE
        max_num_occurrences = 0
        best_hyps_list = []
        for hyperparam in SafeCastUtil.safeCast(monte_carlo_results.keys(),
                                                list):
            if len(monte_carlo_results.get(hyperparam)) > max_num_occurrences:
                max_num_occurrences = len(monte_carlo_results.get(hyperparam))
                best_hyps_list = [hyperparam]
            elif len(monte_carlo_results.get(
                    hyperparam)) == max_num_occurrences:
                best_hyps_list.append(hyperparam)
        if len(best_hyps_list) == 1:
            best_hyps = hyperparam
            top_score = numpy.average(monte_carlo_results.get(hyperparam))
        elif len(best_hyps_list) > 1:
            top_score = 0
            for hyperparam in best_hyps_list:
                if numpy.average(
                        monte_carlo_results.get(hyperparam)) > top_score:
                    top_score = numpy.average(
                        monte_carlo_results.get(hyperparam))
                    best_hyps = hyperparam
        return best_hyps

    def getMonteCarloResults(self, row, indices_of_outer_loops):
        hyperparams_to_scores = {}
        for i in range(0, len(row)):
            if i in indices_of_outer_loops:
                score_and_hyperparam = row[i].split(
                    MachineLearningService.DELIMITER)
                score = SafeCastUtil.safeCast(score_and_hyperparam[0], float)
                if hyperparams_to_scores.get(
                        score_and_hyperparam[1]) is not None:
                    hyperparams_to_scores[score_and_hyperparam[1]].append(
                        score)
                else:
                    hyperparams_to_scores[score_and_hyperparam[1]] = [score]
        return hyperparams_to_scores

    def fetchAnalysisFiles(self, drug, input_folder):
        files = os.listdir(input_folder + "/" + drug)
        return [file for file in files if "Analysis.csv" in file]

    def trainBestModelWithCombo(self, best_scoring_algo, best_scoring_combo,
                                optimal_hyperparams, trimmed_cell_lines,
                                trimmed_results, processed_arguments):
        is_classifier = processed_arguments.is_classifier
        rsen_config = processed_arguments.rsen_config
        training_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
            DataFormattingService.TRAINING_MATRIX, best_scoring_combo,
            trimmed_cell_lines, AnalysisType.RECOMMENDATIONS)
        trainer = ModelTrainerFactory.createTrainerFromTargetAlgorithm(
            is_classifier, best_scoring_algo, rsen_config)

        features, relevant_results = trainer.populateFeaturesAndResultsByCellLine(
            training_matrix, trimmed_results)
        params = DictionaryUtility.toDict(optimal_hyperparams)
        feature_names = training_matrix.get(
            ArgumentProcessingService.FEATURE_NAMES)
        model = trainer.buildModel(relevant_results, features, params,
                                   feature_names)
        return model, trainer

    def determineBestComboFromString(self, best_combo_string, combos,
                                     processed_arguments):
        gene_lists = processed_arguments.gene_lists
        combine_gene_lists = processed_arguments.rsen_config.combine_gene_lists
        analysis_type = processed_arguments.analysisType()
        static_features = processed_arguments.static_features
        for combo in combos:
            feature_set_string = GeneListComboUtility.generateFeatureSetString(
                combo, gene_lists, combine_gene_lists, analysis_type,
                static_features)
            if GeneListComboUtility.combosAreEquivalent(
                    feature_set_string, best_combo_string):
                return combo

        raise ValueError(
            "Unable to determine feature set from given combo gene list and feature file combo: "
            + best_combo_string +
            ".\n Please make sure all gene lists and feature files in the combo "
            + "are present in the drug folder.")

    def generateSinglePrediction(self, best_model, best_combo, cell_line,
                                 all_features, formatted_inputs):
        ommited_cell_line = formatted_inputs.get(
            DataFormattingService.TRAINING_MATRIX).get(cell_line)
        input_wrapper = OrderedDict()
        input_wrapper[DataFormattingService.TRAINING_MATRIX] = OrderedDict()
        input_wrapper[DataFormattingService.
                      TRAINING_MATRIX][cell_line] = ommited_cell_line
        input_wrapper[ArgumentProcessingService.FEATURE_NAMES] = all_features
        trimmed_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
            DataFormattingService.TRAINING_MATRIX, best_combo, input_wrapper,
            AnalysisType.RECOMMENDATIONS)
        return best_model.predict([trimmed_matrix.get(cell_line)])[0]

    def writeFinalRecsResults(self, input_folder):
        drug_scores_by_cell_line = self.fetchDrugScoresByCellLine(input_folder)
        self.writePredictionsByCellLine(drug_scores_by_cell_line, input_folder)

    def fetchDrugScoresByCellLine(self, input_folder):
        predictions_file = input_folder + "/" + RecommendationsService.PREDICTIONS_FILE
        drug_scores_by_cell_line = {}
        with open(predictions_file) as input_file:
            try:
                for line_index, line in enumerate(input_file):
                    if line_index == 0:
                        continue
                    line_split = line.split(",")
                    drug = line_split[0]
                    cell_line = line_split[1]
                    score = SafeCastUtil.safeCast(line_split[2], float)
                    if not drug or not cell_line or not score:
                        self.log.warning(
                            "Invalid line detected for %s at line %s.",
                            predictions_file, line_index + 1)
                        continue
                    if not drug_scores_by_cell_line.get(cell_line):
                        drug_scores_by_cell_line[cell_line] = [(drug, score)]
                    else:
                        drug_scores_by_cell_line[cell_line].append(
                            (drug, score))
            except ValueError as error:
                self.log.error("Error parsing predictions file %s. %s",
                               predictions_file, error)
        return drug_scores_by_cell_line

    def writePredictionsByCellLine(self, drug_scores_by_cell_line,
                                   input_folder):
        total_drugs = numpy.max(
            [len(drugs) for drugs in drug_scores_by_cell_line.values()])
        header = ["Cell Line"]
        best_drug = " best drug"
        best_drug_score = " best drug score"
        for i in range(1, total_drugs + 1):
            suffix = MachineLearningService.generateNumericalSuffix(i)
            header.append(SafeCastUtil.safeCast(i, str) + suffix + best_drug)
            header.append(
                SafeCastUtil.safeCast(i, str) + suffix + best_drug_score)
        predictions_by_cell_line_path = input_folder + "/" + RecommendationsService.PREDICTIONS_BY_CELL_LINE_FILE
        with open(predictions_by_cell_line_path, "w",
                  newline='') as predictions_by_cell_line_file:
            try:
                writer = csv.writer(predictions_by_cell_line_file)
                writer.writerow(header)
                for cell_line in drug_scores_by_cell_line.keys():
                    drug_scores = sorted(
                        drug_scores_by_cell_line.get(cell_line),
                        reverse=True,
                        key=lambda x: x[1])
                    row = [cell_line]
                    for drug_and_score in drug_scores:
                        row.append(drug_and_score[0])
                        row.append(drug_and_score[1])
                    writer.writerow(row)
            except ValueError as error:
                self.log.error("Error writing to %s. %s",
                               predictions_by_cell_line_file, error)
class MachineLearningService(object):

    log = LoggerFactory.createLog(__name__)

    MAXIMUM_FEATURES_RECORDED = 20
    DELIMITER = " --- "

    #TODO: consider extracting these to a helper class.
    SCORE_AND_HYPERPARAM_PHRASE = "score and optimal hyperparams for outer perm "
    FEATURE_FILE_GENE_LIST_COMBO = "feature file: gene list combo"
    R_SQUARED_SCORE = "R^2 score"
    PERCENT_ACCURATE_PREDICTIONS = "percentage accurate predictions"

    def __init__(self, data):
        self.inputs = data

    def analyze(self, input_folder):
        gene_list_combos = self.determineGeneListCombos()

        is_classifier = self.inputs.is_classifier
        analysis_type = self.inputs.analysisType()

        if analysis_type is AnalysisType.INDIVIDUAL_TRAIN:
            self.analyzeIndividualGeneListCombo(gene_list_combos, input_folder, is_classifier)
        elif analysis_type is AnalysisType.FULL_CLA_SPECIFIC_COMBO:
            self.analyzeGeneListCombos(self.determineSpecificCombos(gene_list_combos), input_folder, is_classifier)
        else:
            self.analyzeGeneListCombos(gene_list_combos, input_folder, is_classifier)

    def determineGeneListCombos(self):
        feature_names = self.inputs.features.get(ArgumentProcessingService.FEATURE_NAMES)
        if self.inputs.analysisType() is AnalysisType.NO_GENE_LISTS:
            return [[feature_names]]

        gene_lists = self.inputs.gene_lists
        static_features = self.inputs.static_features
        combos, expected_length = GeneListComboUtility.determineCombos(gene_lists, feature_names, static_features)
        if len(combos) != expected_length:
            self.log.warning("Unexpected number of combos detected, should be %s but instead created %s.\n%s",
                             expected_length, len(combos), combos)
        return combos

    def analyzeIndividualGeneListCombo(self, gene_list_combos, input_folder, is_classifier):
        config = self.inputs.individual_train_config
        target_combo = config.combo
        target_algorithm = config.algorithm
        rsen_config = self.inputs.rsen_config

        outer_monte_carlo_loops = self.inputs.outer_monte_carlo_permutations
        for gene_list_combo in gene_list_combos:
            plain_text_name = self.generateFeatureSetString(gene_list_combo)
            if plain_text_name == target_combo:
                trainer = ModelTrainerFactory.createTrainerFromTargetAlgorithm(is_classifier, target_algorithm, rsen_config)
                hyperparams = self.fetchAndCastHyperparams(config, trainer)

                for permutation in range(0, outer_monte_carlo_loops):
                    results = self.inputs.results
                    formatted_data = self.formatData(self.inputs, True, True)
                    training_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TRAINING_MATRIX,
                                                                                  gene_list_combo, formatted_data, self.inputs.analysisType())
                    testing_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TESTING_MATRIX, gene_list_combo,
                                                                                 formatted_data, self.inputs.analysisType())
                    features, relevant_results = trainer.populateFeaturesAndResultsByCellLine(training_matrix, results)
                    feature_names = training_matrix.get(ArgumentProcessingService.FEATURE_NAMES)
                    model = trainer.buildModel(relevant_results, features, hyperparams, feature_names)
                    model_score = trainer.fetchPredictionsAndScore(model, testing_matrix, results)
                    score = model_score[0]
                    accuracy = model_score[1]
                    importances = trainer.fetchFeatureImportances(model, feature_names)
                    for key in importances.keys():
                        importances[key] = [importances[key]]
                    ordered_importances = self.averageAndSortImportances(importances, 1)
                    ordered_phrases = self.averageAndSortImportantRSENPhrases(
                                                            trainer.fetchModelPhrases(model, gene_list_combo), trainer)

                    numbered_combo = target_combo + " RUN " + SafeCastUtil.safeCast(permutation, str)
                    self.log.debug("Final score and accuracy of individual analysis for feature gene combo %s "
                                   "using algorithm %s: %s, %s", numbered_combo, target_algorithm, score, accuracy)
                    score_and_hyperparam = [self.generateScoreAndHyperParam(score, hyperparams)]
                    line = self.generateLine(accuracy, numbered_combo, ordered_importances, ordered_phrases, score,
                                             score_and_hyperparam)
                    self.writeToCSVInLock(line, input_folder, target_algorithm, outer_monte_carlo_loops, 1)
                return
        self.log.info("Gene list feature file %s combo not found in current dataset.", target_combo)
        return

    def fetchAndCastHyperparams(self, config, trainer):
        hyperparams = config.hyperparams.split(",")
        hyperparam_dict = OrderedDict()
        keys = SafeCastUtil.safeCast(trainer.hyperparameters.keys(), list)
        for i in range(0, len(keys)):
            hyperparam_dict[keys[i]] = SafeCastUtil.safeCast(hyperparams[i], float)
        return hyperparam_dict

    def generateLine(self, accuracy, combo, ordered_importances, ordered_phrases, score, score_and_hyperparam):
        return numpy.concatenate([[combo, score, accuracy], score_and_hyperparam,
                                  ordered_importances, ordered_phrases])

    def generateScoreAndHyperParam(self, score, hyperparam):
        return SafeCastUtil.safeCast(score, str) + self.DELIMITER + DictionaryUtility.toString(hyperparam)

    def shouldTrainAlgorithm(self, algorithm):
        configs = self.inputs.algorithm_configs
        return configs is not None and configs.get(algorithm) is not None and configs.get(algorithm)[0]

    def determineSpecificCombos(self, all_combos):
        specific_combos = self.inputs.specific_combos
        selected_combos = {}
        for specific_combo in specific_combos:
            for combo in all_combos:
                combo_string = self.generateFeatureSetString(combo)
                if specific_combo == combo_string and selected_combos.get(combo_string) is None:
                    selected_combos[combo_string] = combo
                else:
                    equivalent_combos = GeneListComboUtility.combosAreEquivalent(combo_string, specific_combo)
                    if equivalent_combos and selected_combos.get(combo_string) is None:
                        selected_combos[combo_string] = combo
        selected_combo_names = SafeCastUtil.safeCast(selected_combos.keys(), list)
        if len(selected_combo_names) < len(specific_combos):
            self.log.warning("Not all specified combos were available in this data folder.\n"
                             "Specified combos: %s\n Selected combos: %s", specific_combos, selected_combo_names)
        else:
            self.log.info("Only running analysis on following combos:\n %s", selected_combo_names)
        return SafeCastUtil.safeCast(selected_combos.values(), list)

    def analyzeGeneListCombos(self, gene_list_combos, input_folder, is_classifier):
        rsen_config = self.inputs.rsen_config
        for algo in SupportedMachineLearningAlgorithms.fetchAlgorithms():
            if self.shouldTrainAlgorithm(algo):
                trainer = None
                try:
                    trainer = ModelTrainerFactory.createTrainerFromTargetAlgorithm(is_classifier, algo, rsen_config)
                except ValueError as valueError:
                    self.log.error("Improper configuration for algorithm: [%s], %s.", algo, valueError)
                finally:
                    if trainer is not None:
                        trainer.logTrainingMessage(self.monteCarloPermsByAlgorithm(algo, True),
                                                   self.monteCarloPermsByAlgorithm(algo, False),
                                                   len(gene_list_combos))
                        self.handleParallellization(gene_list_combos, input_folder, trainer)


    def monteCarloPermsByAlgorithm(self, algorithm, outer):
        monte_carlo_config = self.inputs.algorithm_configs.get(algorithm)
        return monte_carlo_config[1] if outer else monte_carlo_config[2]

    def handleParallellization(self, gene_list_combos, input_folder, trainer):
        max_nodes = multiprocessing.cpu_count()
        requested_threads = self.inputs.num_threads
        nodes_to_use = numpy.amin([requested_threads, max_nodes])

        valid_combos = self.fetchValidGeneListCombos(input_folder, gene_list_combos, trainer)

        Parallel(n_jobs=nodes_to_use)(delayed(self.runMonteCarloSelection)(feature_set, trainer, input_folder,
                                                                           len(valid_combos))
                                      for feature_set in valid_combos)
        GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log)

    def fetchValidGeneListCombos(self, input_folder, gene_list_combos, trainer):
        valid_combos = [feature_set for feature_set in gene_list_combos if trainer.shouldProcessFeatureSet(feature_set)]

        rsen_config = self.inputs.rsen_config
        if trainer.algorithm == SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET and \
                rsen_config.combine_gene_lists:
            all_genes = GeneListComboUtility.fetchAllGeneListGenesDeduped(self.inputs.gene_lists)
            # TODO: Can fail if "." in feature name.
            bin_cat_matrix = rsen_config.binary_cat_matrix.get(ArgumentProcessingService.FEATURE_NAMES)[0].split(".")[0]
            full_gene_list = [bin_cat_matrix + "." + gene for gene in all_genes if len(gene.strip()) > 0]

            new_combos = []
            for combo in valid_combos:
                new_combo = []
                for feature_set in combo:
                    if bin_cat_matrix in feature_set[0]:
                        new_combo.append(full_gene_list)
                    else:
                        new_combo.append(feature_set)
                if new_combo not in new_combos:
                    new_combos.append(new_combo)
            return self.trimAnalyzedCombos(input_folder, new_combos, trainer)
        else:
            return self.trimAnalyzedCombos(input_folder, valid_combos, trainer)

    def trimAnalyzedCombos(self, input_folder, valid_combos, trainer):
        file_name = trainer.algorithm + ".csv"
        if file_name not in os.listdir(input_folder):
            return valid_combos

        existing_combo_strings = []
        with open(input_folder + "/" + file_name) as analyzed_file:
            try:
                for line_index, line in enumerate(analyzed_file):
                    if line_index == 0:
                        continue
                    existing_combo_strings.append(line.strip().split(",")[0])
            except ValueError as error:
                self.log.error("Error reading existing combos from analysis file: %s", analyzed_file, error)
            finally:
                analyzed_file.close()

        trimmed_combos = []
        for combo in valid_combos:
            if self.generateFeatureSetString(combo) not in existing_combo_strings:
                trimmed_combos.append(combo)
        return trimmed_combos

    def runMonteCarloSelection(self, feature_set, trainer, input_folder, num_combos):
        scores = []
        accuracies = []
        importances = {}
        feature_set_as_string = self.generateFeatureSetString(feature_set)
        outer_perms = self.monteCarloPermsByAlgorithm(trainer.algorithm, True)
        important_rsen_phrases = {}
        scores_and_hyperparams = []

        for i in range(1, outer_perms + 1):
            self.log.info("Computing outer Monte Carlo Permutation %s for %s.\n", i, feature_set_as_string)
            formatted_data = self.formatData(self.inputs, True, True)
            if self.inputs.analysisType() is AnalysisType.NO_GENE_LISTS:
                self.logKeptFeatures(formatted_data, i, input_folder, trainer)

            self.log.info("Creating train and test matrices by feature set: %s.", feature_set_as_string)
            training_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TRAINING_MATRIX, feature_set,
                                                                          formatted_data, self.inputs.analysisType())
            testing_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TESTING_MATRIX, feature_set,
                                                                         formatted_data, self.inputs.analysisType())

            optimal_hyperparams = self.determineOptimalHyperparameters(feature_set, formatted_data, trainer)
            record_diagnostics = self.inputs.record_diagnostics
            trainer.logIfBestHyperparamsOnRangeThreshold(optimal_hyperparams, record_diagnostics, input_folder)
            trainer.logOptimalHyperParams(optimal_hyperparams, self.generateFeatureSetString(feature_set),
                                          record_diagnostics, input_folder)

            prediction_data = self.fetchOuterPermutationModelScore(feature_set, trainer,
                                                                   optimal_hyperparams, testing_matrix,
                                                                   training_matrix)
            scores.append(prediction_data[0])
            accuracies.append(prediction_data[1])
            for importance in prediction_data[2].keys():
                if importances.get(importance) is not None:
                    importances[importance].append(prediction_data[2].get(importance))
                else:
                    importances[importance] = [prediction_data[2].get(importance)]
            if len(prediction_data) == 4 and \
                    trainer.algorithm == SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET:
                for phrase in prediction_data[3].keys():
                    if important_rsen_phrases.get(phrase) is not None:
                        important_rsen_phrases[phrase].append(prediction_data[3].get(phrase))
                    else:
                        important_rsen_phrases[phrase] = [prediction_data[3].get(phrase)]
            scores_and_hyperparams.append(self.generateScoreAndHyperParam(prediction_data[0], optimal_hyperparams))

            GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log)

        average_score = numpy.mean(scores)
        average_accuracy = numpy.mean(accuracies)
        self.log.debug("Average score and accuracy of all Monte Carlo runs for %s: %s, %s",
                       feature_set_as_string, average_score, average_accuracy)
        ordered_importances = self.averageAndSortImportances(importances, outer_perms)

        ordered_phrases = self.averageAndSortImportantRSENPhrases(important_rsen_phrases, trainer)

        line = self.generateLine(average_accuracy, feature_set_as_string, ordered_importances, ordered_phrases,
                                 average_score, scores_and_hyperparams)
        self.writeToCSVInLock(line, input_folder, trainer.algorithm, num_combos, outer_perms)
        self.saveOutputToTxtFile(scores, accuracies, feature_set_as_string, input_folder, trainer.algorithm)

    def generateFeatureSetString(self, feature_set):
        return GeneListComboUtility.generateFeatureSetString(feature_set, self.inputs.gene_lists,
                                                             self.inputs.rsen_config.combine_gene_lists,
                                                             self.inputs.analysisType(),
                                                             self.inputs.static_features)

    def fetchOuterPermutationModelScore(self, feature_set, trainer, optimal_hyperparams, testing_matrix,
                                        training_matrix):
        # TODO: Handle hyperparams with n
        results = self.inputs.results
        features, relevant_results = trainer.populateFeaturesAndResultsByCellLine(training_matrix, results)
        feature_names = training_matrix.get(ArgumentProcessingService.FEATURE_NAMES)
        model = trainer.buildModel(relevant_results, features, optimal_hyperparams, feature_names)
        score, accuracy = trainer.fetchPredictionsAndScore(model, testing_matrix, results)
        # TODO: This should be it's own class.
        return [score, accuracy, trainer.fetchFeatureImportances(model, feature_names),
                trainer.fetchModelPhrases(model, feature_set)]

    def averageAndSortImportances(self, importances, outer_loops):
        for key in importances.keys():
            if len(importances[key]) < outer_loops:
                self.log.warning("Different amount of importances for feature %s than expected. Should be %s but is "
                                 "instead %s.", key, outer_loops, len(importances[key]))
                while len(importances[key]) < outer_loops:
                    importances[key].append(0.0)
        ordered = []
        [ordered.append({"feature": key, "importance": numpy.sum(importances[key]) / outer_loops}) for key in
         importances.keys()]
        ordered = sorted(ordered, key=lambda k: k["importance"], reverse=True)
        trimmed = ordered[:self.MAXIMUM_FEATURES_RECORDED]

        final_imps = []
        for i in range(0, self.MAXIMUM_FEATURES_RECORDED):
            if i < len(trimmed):
                summary = trimmed[i].get("feature") + self.DELIMITER + \
                          SafeCastUtil.safeCast(trimmed[i].get("importance"), str)
                final_imps.append(summary)
            else:
                final_imps.append("")
        return final_imps

    def averageAndSortImportantRSENPhrases(self, important_rsen_phrases, trainer):
        if trainer.algorithm == SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET:
            ordered_phrases = []
            [ordered_phrases.append({"phrase": key, "score": numpy.average(important_rsen_phrases[key])}) for key
             in important_rsen_phrases.keys()]
            ordered_phrases = sorted(ordered_phrases, key=lambda k: k["score"], reverse=True)
            trimmed = ordered_phrases[:self.MAXIMUM_FEATURES_RECORDED]
            final_phrases = []
            for i in range(0, self.MAXIMUM_FEATURES_RECORDED):
                if i < len(trimmed):
                    summary = trimmed[i].get("phrase") + self.DELIMITER + \
                              SafeCastUtil.safeCast(trimmed[i].get("score"), str)
                    final_phrases.append(summary)
                else:
                    final_phrases.append("")
            return final_phrases
        else:
            return []

    def determineInnerHyperparameters(self, feature_set, formatted_data, trainer):
        inner_model_hyperparams = {}
        inner_perms = self.monteCarloPermsByAlgorithm(trainer.algorithm, False)
        for j in range(1, inner_perms + 1):
            self.log.info("Computing inner Monte Carlo Permutation %s for %s.\n", j,
                           self.generateFeatureSetString(feature_set))
            GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log)
            formatted_inputs = self.reformatInputsByTrainingMatrix(
                formatted_data.get(DataFormattingService.TRAINING_MATRIX),
                formatted_data.get(ArgumentProcessingService.FEATURE_NAMES))
            further_formatted_data = self.formatData(formatted_inputs, False, False)
            inner_validation_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TESTING_MATRIX,
                                                                                  feature_set, further_formatted_data,
                                                                                  formatted_inputs.analysisType())
            inner_train_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TRAINING_MATRIX,
                                                                             feature_set, further_formatted_data,
                                                                             formatted_inputs.analysisType())
            model_data = trainer.hyperparameterize(inner_train_matrix, inner_validation_matrix, self.inputs.results)
            for data in model_data.keys():
                if inner_model_hyperparams.get(data) is not None:
                    inner_model_hyperparams[data].append(model_data[data])
                else:
                    inner_model_hyperparams[data] = [model_data[data]]
        return inner_model_hyperparams

    def formatData(self, inputs, should_scale, should_one_hot_encode):
        data_formatting_service = DataFormattingService(inputs)
        return data_formatting_service.formatData(should_scale, should_one_hot_encode)

    def logKeptFeatures(self, formatted_data, monte_carlo_perm, input_folder, trainer):
        features_by_file = {}
        for full_feature in formatted_data.get(ArgumentProcessingService.FEATURE_NAMES):
            feature_split = full_feature.split(".")
            file = feature_split[0]
            feature = feature_split[1]
            if features_by_file.get(file) is None:
                features_by_file[file] = [feature]
            else:
                features_by_file[file].append(feature)

        message = "Only using the following features for outer Monte Carlo loop " +\
                  SafeCastUtil.safeCast(monte_carlo_perm, str) + ". All other features have been removed.\n"
        for file in features_by_file.keys():
            message += ("\t" + file + ":\n")
            for feature in features_by_file[file]:
                message += ("\t\t" + feature + "\n")

        self.log.info(message)
        if self.inputs.record_diagnostics:
            DiagnosticsFileWriter.writeToFile(input_folder, message, self.log)

    def reformatInputsByTrainingMatrix(self, training_matrix, feature_names):
        real_inputs = self.inputs

        features = {}
        results = []
        features[ArgumentProcessingService.FEATURE_NAMES] = feature_names
        for training_cell in training_matrix.keys():
            for input_cell in real_inputs.features.keys():
                if training_cell is input_cell:
                    features[training_cell] = training_matrix.get(training_cell)
                    for result in real_inputs.results:
                        if result[0] is training_cell:
                            results.append(result)
                            break
                    break
        cloned_univariate_config = deepcopy(real_inputs.univariate_config)
        cloned_univariate_config.analyze_all = False
        return ProcessedArguments(results, real_inputs.is_classifier, features, real_inputs.gene_lists,
                                  real_inputs.inner_monte_carlo_permutations,
                                  real_inputs.outer_monte_carlo_permutations, real_inputs.data_split,
                                  real_inputs.algorithm_configs, real_inputs.num_threads,
                                  real_inputs.record_diagnostics,
                                  real_inputs.individual_train_config, real_inputs.rsen_config, real_inputs.recs_config,
                                  cloned_univariate_config, real_inputs.specific_combos, real_inputs.static_features)

    def determineOptimalHyperparameters(self, feature_set, formatted_data, trainer):
        inner_model_hyperparams = self.determineInnerHyperparameters(feature_set, formatted_data, trainer)
        highest_average = trainer.DEFAULT_MIN_SCORE
        best_hyperparam = {}
        for hyperparam_set in inner_model_hyperparams.keys():
            if hyperparam_set == AbstractModelTrainer.ADDITIONAL_DATA:
                continue
            average = numpy.average([results[0] for results in inner_model_hyperparams[hyperparam_set]])  # raw score
            if average > highest_average:
                best_hyperparam = DictionaryUtility.toDict(hyperparam_set)
                highest_average = average
        additional_data = inner_model_hyperparams.get(AbstractModelTrainer.ADDITIONAL_DATA)
        if additional_data:
            best_hyperparam[AbstractModelTrainer.ADDITIONAL_DATA] = additional_data
        return best_hyperparam

    def writeToCSVInLock(self, line, input_folder, ml_algorithm, num_combos, outer_perms):
        lock = threading.Lock()
        lock.acquire(True)
        self.lockThreadMessage()

        file_name = ml_algorithm + ".csv"
        write_action = "w"
        if file_name in os.listdir(input_folder):
            write_action = "a"
        with open(input_folder + "/" + file_name, write_action, newline='') as csv_file:
            try:
                writer = csv.writer(csv_file)
                if write_action == "w":
                    writer.writerow(self.getCSVFileHeader(self.inputs.is_classifier, ml_algorithm, outer_perms))
                writer.writerow(line)
            except ValueError as error:
                self.log.error("Error writing to file %s. %s", file_name, error)
            finally:
                csv_file.close()

        total_lines = 0
        with open(input_folder + "/" + file_name) as csv_file:
            try:
                reader = csv.reader(csv_file)
                total_lines += (len(SafeCastUtil.safeCast(reader, list)) - 1)
            except ValueError as error:
                self.log.error("Error reading lines from file %s. %s", file_name, error)
            finally:
                csv_file.close()
                self.logPercentDone(total_lines, num_combos, ml_algorithm)

        self.unlockThreadMessage()
        lock.release()

    @staticmethod
    def getCSVFileHeader(is_classifier, ml_algorithm, outer_perms):
        header = [MachineLearningService.FEATURE_FILE_GENE_LIST_COMBO]
        if is_classifier:
            header.append(MachineLearningService.PERCENT_ACCURATE_PREDICTIONS)
            header.append("accuracy score")
        else:
            header.append(MachineLearningService.R_SQUARED_SCORE)
            header.append("mean squared error")
        for i in range(1, outer_perms + 1):
            header.append(MachineLearningService.SCORE_AND_HYPERPARAM_PHRASE + SafeCastUtil.safeCast(i, str))
        if ml_algorithm == SupportedMachineLearningAlgorithms.RADIAL_BASIS_FUNCTION_SVM:
            return header
        feature_analysis = " most important feature"
        for i in range(1, MachineLearningService.MAXIMUM_FEATURES_RECORDED + 1):
            suffix = MachineLearningService.generateNumericalSuffix(i)
            header.append(SafeCastUtil.safeCast(i, str) + suffix + feature_analysis)
        if ml_algorithm == SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET:
            phrase_analysis = " most significant boolean phrase"
            for i in range(1, MachineLearningService.MAXIMUM_FEATURES_RECORDED + 1):
                suffix = MachineLearningService.generateNumericalSuffix(i)
                header.append(SafeCastUtil.safeCast(i, str) + suffix + phrase_analysis)
        return header

    @staticmethod
    def generateNumericalSuffix(i):
        if i == 1 and i != 11:
            return "st"
        elif i == 2 and i != 12:
            return "nd"
        elif i == 3 and i != 13:
            return "rd"
        else:
            return "th"

    def logPercentDone(self, total_lines, num_combos, ml_algorithm):
        percent_done, percentage_bar = PercentageBarUtility.calculateAndCreatePercentageBar(total_lines, num_combos)
        self.log.info("Total progress for %s: %s%% done:\n %s", ml_algorithm, percent_done, percentage_bar)

    def saveOutputToTxtFile(self, scores, accuracies, feature_set_as_string, input_folder, algorithm):
        lock = threading.Lock()
        self.lockThreadMessage()
        lock.acquire(True)

        file_name = HTMLWritingService.RECORD_FILE
        write_action = "w"
        if file_name in os.listdir(input_folder):
            write_action = "a"
        with open(input_folder + "/" + file_name, write_action) as output_file:
            try:
                output_file.write(algorithm + MachineLearningService.DELIMITER + feature_set_as_string +
                                  MachineLearningService.DELIMITER + SafeCastUtil.safeCast(scores, str) +
                                  MachineLearningService.DELIMITER + SafeCastUtil.safeCast(accuracies, str)
                                  + "\n")
            except ValueError as error:
                self.log.error("Error saving output of %s analysis to memory: %s", algorithm, error)
            finally:
                output_file.close()
                self.unlockThreadMessage()
                lock.release()

    def lockThreadMessage(self):
        self.log.debug("Locking current thread %s.", threading.current_thread())

    def unlockThreadMessage(self):
        self.log.debug("Releasing current thread %s.", threading.current_thread())
class DataFormattingServiceIT(unittest.TestCase):

    log = LoggerFactory.createLog(__name__)

    def setUp(self):
        self.current_working_dir = os.getcwd()  # Should be this package.
        input_folder = self.current_working_dir + "/SampleClassifierDataFolder"
        self.instantiateDataFormattingService(input_folder)

    def tearDown(self):
        if self.current_working_dir != "/":
            for file in os.listdir(
                    self.current_working_dir + "/" +
                    RandomizedDataGenerator.GENERATED_DATA_FOLDER):
                if file == "__init__.py":
                    continue
                os.remove(self.current_working_dir + "/" +
                          RandomizedDataGenerator.GENERATED_DATA_FOLDER + "/" +
                          file)

    def instantiateDataFormattingService(self, input_folder):
        argument_processing_service = ArgumentProcessingService(input_folder)
        arguments = argument_processing_service.handleInputFolder()
        self.data_formatting_service = DataFormattingService(arguments)

    def fetchTrainAndTestData(self):
        s = self.data_formatting_service
        features = pd.read_csv('SampleClassifierDataFolder/features.csv',
                               delimiter=',')
        results = pd.read_csv('SampleClassifierDataFolder/results.csv',
                              delimiter=',')
        x_train, x_test, y_train, y_test = s.testTrainSplit(
            features, results, self.data_formatting_service.inputs.data_split)
        return x_test, x_train, y_test, y_train

    def testFormattingDataRandomizesMatrices(self):
        original_outputs = self.data_formatting_service.formatData(True)
        self.validateOutput(original_outputs)

        self.instantiateDataFormattingService(self.current_working_dir +
                                              "/SampleClassifierDataFolder")
        new_outputs = self.data_formatting_service.formatData(True)
        self.validateOutput(new_outputs)

        original_trained_cells = SafeCastUtil.safeCast(
            original_outputs.get(DataFormattingService.TRAINING_MATRIX).keys(),
            list)
        new_trained_cells = SafeCastUtil.safeCast(
            new_outputs.get(DataFormattingService.TRAINING_MATRIX).keys(),
            list)
        non_identical_matrices = False
        for i in range(0, len(new_trained_cells)):
            if original_trained_cells[i] != new_trained_cells[i]:
                non_identical_matrices = True
        assert non_identical_matrices

    def testFormattingRandomizedData(self):
        self.validateOutput(self.formatRandomizedData(True))
        self.validateOutput(self.formatRandomizedData(False))

    def formatRandomizedData(self, is_classifier):
        arguments = self.processArguments(is_classifier, False, 150)
        data_formatting_service = DataFormattingService(arguments)
        return data_formatting_service.formatData(True)

    def processArguments(self, is_classifier, analyze_all, num_features):
        random_data_generator = RandomizedDataGenerator(
            RandomizedDataGenerator.GENERATED_DATA_FOLDER)
        random_data_generator.generateRandomizedFiles(5,
                                                      50,
                                                      num_features,
                                                      is_classifier,
                                                      10,
                                                      .8,
                                                      analyze_all=analyze_all)
        input_folder = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        argument_processing_service = ArgumentProcessingService(input_folder)
        arguments = argument_processing_service.handleInputFolder()
        return arguments

    def testTrimmingDoesNotTrimSignificantFeatures(self):
        significant_prefix = RandomizedDataGenerator.SIGNIFICANT_FEATURE_PREFIX
        arguments = self.processArguments(True, True, 1000)
        arguments.univariate_config.analyze_all = True
        assert arguments.univariate_config.num_top_features == 147
        orig_features = arguments.features.get(
            ArgumentProcessingService.FEATURE_NAMES)
        orig_sig_features = [
            feature for feature in orig_features
            if significant_prefix in feature
        ]
        data_formatting_service = DataFormattingService(arguments)
        output = data_formatting_service.formatData(True)
        trimmed_features = output.get(ArgumentProcessingService.FEATURE_NAMES)
        trimmed_sig_features = [
            feature for feature in trimmed_features
            if significant_prefix in feature
        ]

        training_matrix = output.get(DataFormattingService.TRAINING_MATRIX)
        testing_matrix = output.get(DataFormattingService.TESTING_MATRIX)
        expected_feature_count = 735

        for matrix in [training_matrix, testing_matrix]:
            for cell_line in matrix:
                assert len(matrix[cell_line]) == expected_feature_count

        assert len(orig_features) > len(trimmed_features)
        assert len(orig_sig_features) == len(trimmed_sig_features)
        assert len(trimmed_features) == expected_feature_count

    def testNumFeaturesInUnivariateModeCanBeTuned(self):
        arguments = self.processArguments(True, True, 1000)
        arguments.univariate_config.analyze_all = True
        arguments.univariate_config.num_top_features = 10
        data_formatting_service = DataFormattingService(arguments)
        output = data_formatting_service.formatData(True)

        training_matrix = output.get(DataFormattingService.TRAINING_MATRIX)
        testing_matrix = output.get(DataFormattingService.TESTING_MATRIX)
        expected_feature_count = 50

        for matrix in [training_matrix, testing_matrix]:
            for cell_line in matrix:
                assert len(matrix[cell_line]) == expected_feature_count

    @staticmethod
    def validateOutput(output):
        assert output is not None
        assert output.get(DataFormattingService.TRAINING_MATRIX) is not None
        assert output.get(DataFormattingService.TESTING_MATRIX) is not None
        num_train = len(
            output.get(DataFormattingService.TRAINING_MATRIX).keys())
        num_test = len(output.get(DataFormattingService.TESTING_MATRIX).keys())
        assert num_train > num_test

    def testCheckImportData(self):
        features = np.genfromtxt(self.current_working_dir +
                                 '/SampleClassifierDataFolder/features.csv',
                                 delimiter=',')
        results = np.genfromtxt(self.current_working_dir +
                                '/SampleClassifierDataFolder/results.csv',
                                delimiter=',')
        assert np.array(features[1:]).dtype == "float64"
        assert np.array(results[1:, 1]).dtype == "float64"
        assert not np.isnan(features[1:]).any()
        assert not np.isnan(results[1:, 1]).any()
        assert len(features) == len(results)

    def testCheckOneHotEncoding(self):
        s = self.data_formatting_service
        categorical_pd = pd.read_csv(
            'SampleClassifierDataFolder/categorical.csv', delimiter=',')
        assert ((s.binaryOneHot(categorical_pd).dtypes.values !=
                 np.dtype('float64')).all())
        assert ((s.oneHot(categorical_pd).dtypes.values !=
                 np.dtype('float64')).all())

    def testSplit(self):
        x_test, x_train, y_test, y_train = self.fetchTrainAndTestData()
        assert (len(x_train) and len(x_test) and len(y_train)
                and len(y_test) != 0)

    def testStratifySplit(self):
        x_test, x_train, y_test, y_train = self.fetchTrainAndTestData()
        assert (len(x_train) and len(x_test) and len(y_train)
                and len(y_test) != 0)
        categorical_pd = pd.read_csv(
            self.current_working_dir +
            '/SampleClassifierDataFolder/categorical.csv',
            delimiter=',')
        data_formatting_service = DataFormattingService(None)
        categorical_onehot = data_formatting_service.oneHot(categorical_pd)
        assert (np.shape(categorical_onehot))[1] == 2

    def testFeatureOrderIsPreserved(self):
        original_input = self.data_formatting_service.inputs.features
        self.data_formatting_service.analyze_all = False  # don't attempt trimming
        formatted_output = self.data_formatting_service.formatData(
            False, False)
        self.validateMatrixOrderHasNotChanged(
            formatted_output, original_input,
            DataFormattingService.TESTING_MATRIX)
        self.validateMatrixOrderHasNotChanged(
            formatted_output, original_input,
            DataFormattingService.TRAINING_MATRIX)

    def validateMatrixOrderHasNotChanged(self, formatted_output,
                                         original_input, matrix):
        for cell_line in formatted_output.get(matrix).keys():
            formatted_features = formatted_output.get(matrix).get(cell_line)
            original_features = original_input.get(cell_line)
            assert original_features == formatted_features

    def testFeatureScaling(self):
        x_test, x_train, y_test, y_train = self.fetchTrainAndTestData()

        self.scaleFeaturesAndAssert(x_test)
        self.scaleFeaturesAndAssert(x_train)

    def scaleFeaturesAndAssert(self, x_vals):
        feature_one_orig = list(x_vals.get("feature_one"))
        feature_two_orig = list(x_vals.get("feature_two"))
        feature_three_orig = list(x_vals.get("feature_three"))
        scaled_test = self.data_formatting_service.maybeScaleFeatures(
            x_vals, True)
        assert scaled_test
        scaled_test_vals_as_list = SafeCastUtil.safeCast(
            scaled_test.values(), list)
        self.assertFeaturesScaled(feature_one_orig, scaled_test_vals_as_list,
                                  0)
        self.assertFeaturesScaled(feature_two_orig, scaled_test_vals_as_list,
                                  1)
        self.assertFeaturesScaled(feature_three_orig, scaled_test_vals_as_list,
                                  2)

    def assertFeaturesScaled(self, feature, scaled_test_vals_as_list, index):
        for i in range(0, len(feature)):
            for j in range(0, len(feature)):
                if feature[i] == feature[j]:
                    assert scaled_test_vals_as_list[i][
                        index] == scaled_test_vals_as_list[j][index]
                elif feature[i] < feature[j]:
                    assert scaled_test_vals_as_list[i][
                        index] < scaled_test_vals_as_list[j][index]
                else:
                    assert scaled_test_vals_as_list[i][
                        index] > scaled_test_vals_as_list[j][index]
예제 #24
0
from LoggerFactory import LoggerFactory

log = LoggerFactory.getLogger('cooker')
temperature = 115


def first():
    log.info('Hi, I\'m the cooker')


def second():
    log.warning('It gets to hot here')
    log.debug('Temp is: %i', temperature)


def third():
    log.critical('BOOM!!!')