def testTrimmingDoesNotTrimSignificantFeatures(self):
    """Feature trimming shrinks the feature set without dropping any
    significant (specially prefixed) features."""
    sig_marker = RandomizedDataGenerator.SIGNIFICANT_FEATURE_PREFIX
    processed_args = self.processArguments(True, True, 1000)
    processed_args.univariate_config.analyze_all = True
    assert processed_args.univariate_config.num_top_features == 147

    all_features = processed_args.features.get(
        ArgumentProcessingService.FEATURE_NAMES)
    significant_before = [f for f in all_features if sig_marker in f]

    formatted = DataFormattingService(processed_args).formatData(True)
    kept_features = formatted.get(ArgumentProcessingService.FEATURE_NAMES)
    significant_after = [f for f in kept_features if sig_marker in f]

    expected_count = 735
    matrices = (formatted.get(DataFormattingService.TRAINING_MATRIX),
                formatted.get(DataFormattingService.TESTING_MATRIX))
    for matrix in matrices:
        for cell_line in matrix:
            assert len(matrix[cell_line]) == expected_count

    # Trimming removed features overall, but every significant one survived.
    assert len(all_features) > len(kept_features)
    assert len(significant_before) == len(significant_after)
    assert len(kept_features) == expected_count
 def setUp(self):
     """Process the sample classifier folder and keep its formatted output."""
     working_dir = os.getcwd()  # Should be this package.
     sample_folder = working_dir + "/SampleClassifierDataFolder"
     processing_service = ArgumentProcessingService(sample_folder)
     processed_args = processing_service.handleInputFolder()
     formatter = DataFormattingService(processed_args)
     self.arguments = formatter.formatData()
 def testStratifySplit(self):
     """Split partitions are non-empty and one-hot encoding yields 2 columns."""
     x_test, x_train, y_test, y_train = self.fetchTrainAndTestData()
     for partition in (x_train, x_test, y_train):
         assert len(partition)
     assert len(y_test) != 0
     categorical_frame = pd.read_csv(
         self.current_working_dir +
         '/SampleClassifierDataFolder/categorical.csv',
         delimiter=',')
     encoded = DataFormattingService(None).oneHot(categorical_frame)
     assert np.shape(encoded)[1] == 2
    def testNumFeaturesInUnivariateModeCanBeTuned(self):
        """Tuning num_top_features changes the per-cell-line feature count."""
        processed_args = self.processArguments(True, True, 1000)
        processed_args.univariate_config.analyze_all = True
        processed_args.univariate_config.num_top_features = 10
        formatted = DataFormattingService(processed_args).formatData(True)

        expected_count = 50
        matrices = (formatted.get(DataFormattingService.TRAINING_MATRIX),
                    formatted.get(DataFormattingService.TESTING_MATRIX))
        for matrix in matrices:
            for cell_line in matrix:
                assert len(matrix[cell_line]) == expected_count
    def handleDrug(self, drug, input_folder, max_nodes, processed_arguments):
        """Fan out per-cell-line handling for one drug across worker processes."""
        gene_list_combos = self.determineGeneListCombos(processed_arguments)
        cell_lines = processed_arguments.features
        drug_results = processed_arguments.results

        # Format a deep copy with no holdout so the caller's arguments
        # (and their data_split) are left untouched.
        inputs_copy = copy.deepcopy(processed_arguments)
        inputs_copy.data_split = 1.0
        formatted_inputs = DataFormattingService(inputs_copy).formatData(
            True, True)
        feature_names = formatted_inputs.get(
            ArgumentProcessingService.FEATURE_NAMES)

        # Never spawn more workers than the caller allows.
        node_count = numpy.amin(
            [processed_arguments.num_threads, max_nodes])
        Parallel(n_jobs=node_count)(
            delayed(self.handleCellLine)(cell_line, gene_list_combos, drug,
                                         feature_names, formatted_inputs,
                                         input_folder, processed_arguments,
                                         drug_results)
            for cell_line in cell_lines.keys())
    def preRecsAnalysis(self, input_folder):
        """Run a pre-recommendations analysis over every drug in self.inputs.

        For each drug the full data set is formatted (data_split forced to
        1.0, i.e. no holdout), the best model/combo is located, and its
        per-cell-line predictions are accumulated into one table which is
        finally written out via writePreRecAnalysisFile.
        """
        self.log.info("Performing pre-recs analysis on all drugs.")
        drugs = self.inputs.keys()
        cell_line_predictions_by_drug = OrderedDict()
        # Header row: first column is the cell line name, then one column
        # per drug.
        header = numpy.concatenate(
            (["cell_line"], SafeCastUtil.safeCast(drugs, list)), axis=0)
        cell_line_predictions_by_drug[self.HEADER] = header
        # Summary rows are seeded with their own label as the first cell.
        cell_line_predictions_by_drug[self.STD_DEVIATION] = [
            self.STD_DEVIATION
        ]
        cell_line_predictions_by_drug[self.MEAN] = [self.MEAN]
        cell_line_predictions_by_drug[self.MEDIAN] = [self.MEDIAN]
        for drug in drugs:
            processed_arguments = self.inputs.get(drug)
            results = processed_arguments.results
            combos = self.determineGeneListCombos(processed_arguments)

            # NOTE(review): this mutates the shared arguments object rather
            # than a copy — later readers of data_split will see 1.0.
            processed_arguments.data_split = 1.0
            data_formatting_service = DataFormattingService(
                processed_arguments)
            formatted_inputs = data_formatting_service.formatData(True, True)
            self.log.info("Determining best combo and score for drug %s.",
                          drug)
            recs_model_info = self.fetchBestModelComboAndScore(
                drug, input_folder, formatted_inputs, results, combos,
                processed_arguments)

            # Skip drugs for which no usable model/combo was found.
            if recs_model_info is None or recs_model_info.model is None or recs_model_info.combo is None:
                continue
            self.generateMultiplePredictions(recs_model_info, formatted_inputs,
                                             results,
                                             cell_line_predictions_by_drug)

        # Pad short rows so every row has as many cells as the header.
        for cell_line in cell_line_predictions_by_drug:
            while len(cell_line_predictions_by_drug[cell_line]) < \
                    len(cell_line_predictions_by_drug[RecommendationsService.HEADER]):
                cell_line_predictions_by_drug[cell_line].append(
                    MachineLearningService.DELIMITER)
        self.writePreRecAnalysisFile(cell_line_predictions_by_drug,
                                     input_folder)
 def formatData(self, inputs, should_scale, should_one_hot_encode):
     """Delegate formatting of ``inputs`` to a fresh DataFormattingService."""
     formatter = DataFormattingService(inputs)
     return formatter.formatData(should_scale, should_one_hot_encode)
# Example 8
def handleDataFormatting(inputs):
    """Format ``inputs`` using the service's default options."""
    formatter = DataFormattingService(inputs)
    return formatter.formatData()
 def formatRandomizedData(self, is_classifier):
     """Generate randomized input data and return it fully formatted."""
     processed_args = self.processArguments(is_classifier, False, 150)
     return DataFormattingService(processed_args).formatData(True)
 def instantiateDataFormattingService(self, input_folder):
     """Process ``input_folder`` and cache a DataFormattingService on self."""
     processing_service = ArgumentProcessingService(input_folder)
     processed_args = processing_service.handleInputFolder()
     self.data_formatting_service = DataFormattingService(processed_args)
class DataFormattingServiceIT(unittest.TestCase):
    """Integration tests for DataFormattingService.

    Exercises the service against the checked-in SampleClassifierDataFolder
    and against randomized data written into
    RandomizedDataGenerator.GENERATED_DATA_FOLDER (cleaned up in tearDown).
    """

    log = LoggerFactory.createLog(__name__)

    def setUp(self):
        """Point the service at the sample classifier data folder."""
        self.current_working_dir = os.getcwd()  # Should be this package.
        input_folder = self.current_working_dir + "/SampleClassifierDataFolder"
        self.instantiateDataFormattingService(input_folder)

    def tearDown(self):
        """Delete generated data files, keeping the package's __init__.py."""
        # Guard against a pathological cwd before removing anything.
        if self.current_working_dir != "/":
            for file in os.listdir(
                    self.current_working_dir + "/" +
                    RandomizedDataGenerator.GENERATED_DATA_FOLDER):
                if file == "__init__.py":
                    continue
                os.remove(self.current_working_dir + "/" +
                          RandomizedDataGenerator.GENERATED_DATA_FOLDER + "/" +
                          file)

    def instantiateDataFormattingService(self, input_folder):
        """Process ``input_folder`` and store a DataFormattingService on self."""
        argument_processing_service = ArgumentProcessingService(input_folder)
        arguments = argument_processing_service.handleInputFolder()
        self.data_formatting_service = DataFormattingService(arguments)

    def fetchTrainAndTestData(self):
        """Split the sample features/results CSVs into train and test sets.

        Note the return order (test before train) is the reverse of what
        testTrainSplit yields.
        """
        s = self.data_formatting_service
        features = pd.read_csv('SampleClassifierDataFolder/features.csv',
                               delimiter=',')
        results = pd.read_csv('SampleClassifierDataFolder/results.csv',
                              delimiter=',')
        x_train, x_test, y_train, y_test = s.testTrainSplit(
            features, results, self.data_formatting_service.inputs.data_split)
        return x_test, x_train, y_test, y_train

    def testFormattingDataRandomizesMatrices(self):
        """Two independent formatting runs order cell lines differently."""
        original_outputs = self.data_formatting_service.formatData(True)
        self.validateOutput(original_outputs)

        self.instantiateDataFormattingService(self.current_working_dir +
                                              "/SampleClassifierDataFolder")
        new_outputs = self.data_formatting_service.formatData(True)
        self.validateOutput(new_outputs)

        original_trained_cells = SafeCastUtil.safeCast(
            original_outputs.get(DataFormattingService.TRAINING_MATRIX).keys(),
            list)
        new_trained_cells = SafeCastUtil.safeCast(
            new_outputs.get(DataFormattingService.TRAINING_MATRIX).keys(),
            list)
        # At least one position must differ between the two runs.
        # NOTE(review): could flake if both runs happen to shuffle the cell
        # lines into the same order.
        non_identical_matrices = False
        for i in range(0, len(new_trained_cells)):
            if original_trained_cells[i] != new_trained_cells[i]:
                non_identical_matrices = True
        assert non_identical_matrices

    def testFormattingRandomizedData(self):
        """Formatting succeeds for both classifier and regression data."""
        self.validateOutput(self.formatRandomizedData(True))
        self.validateOutput(self.formatRandomizedData(False))

    def formatRandomizedData(self, is_classifier):
        """Generate randomized input data and return it fully formatted."""
        arguments = self.processArguments(is_classifier, False, 150)
        data_formatting_service = DataFormattingService(arguments)
        return data_formatting_service.formatData(True)

    def processArguments(self, is_classifier, analyze_all, num_features):
        """Generate randomized input files, then process them into arguments."""
        random_data_generator = RandomizedDataGenerator(
            RandomizedDataGenerator.GENERATED_DATA_FOLDER)
        random_data_generator.generateRandomizedFiles(5,
                                                      50,
                                                      num_features,
                                                      is_classifier,
                                                      10,
                                                      .8,
                                                      analyze_all=analyze_all)
        input_folder = self.current_working_dir + "/" + RandomizedDataGenerator.GENERATED_DATA_FOLDER
        argument_processing_service = ArgumentProcessingService(input_folder)
        arguments = argument_processing_service.handleInputFolder()
        return arguments

    def testTrimmingDoesNotTrimSignificantFeatures(self):
        """Trimming reduces the feature set but keeps significant features."""
        significant_prefix = RandomizedDataGenerator.SIGNIFICANT_FEATURE_PREFIX
        arguments = self.processArguments(True, True, 1000)
        arguments.univariate_config.analyze_all = True
        assert arguments.univariate_config.num_top_features == 147
        orig_features = arguments.features.get(
            ArgumentProcessingService.FEATURE_NAMES)
        orig_sig_features = [
            feature for feature in orig_features
            if significant_prefix in feature
        ]
        data_formatting_service = DataFormattingService(arguments)
        output = data_formatting_service.formatData(True)
        trimmed_features = output.get(ArgumentProcessingService.FEATURE_NAMES)
        trimmed_sig_features = [
            feature for feature in trimmed_features
            if significant_prefix in feature
        ]

        training_matrix = output.get(DataFormattingService.TRAINING_MATRIX)
        testing_matrix = output.get(DataFormattingService.TESTING_MATRIX)
        expected_feature_count = 735

        for matrix in [training_matrix, testing_matrix]:
            for cell_line in matrix:
                assert len(matrix[cell_line]) == expected_feature_count

        # Fewer features overall, yet no significant feature was dropped.
        assert len(orig_features) > len(trimmed_features)
        assert len(orig_sig_features) == len(trimmed_sig_features)
        assert len(trimmed_features) == expected_feature_count

    def testNumFeaturesInUnivariateModeCanBeTuned(self):
        """Setting num_top_features changes the trimmed feature count."""
        arguments = self.processArguments(True, True, 1000)
        arguments.univariate_config.analyze_all = True
        arguments.univariate_config.num_top_features = 10
        data_formatting_service = DataFormattingService(arguments)
        output = data_formatting_service.formatData(True)

        training_matrix = output.get(DataFormattingService.TRAINING_MATRIX)
        testing_matrix = output.get(DataFormattingService.TESTING_MATRIX)
        expected_feature_count = 50

        for matrix in [training_matrix, testing_matrix]:
            for cell_line in matrix:
                assert len(matrix[cell_line]) == expected_feature_count

    @staticmethod
    def validateOutput(output):
        """Assert both matrices exist and training is the larger partition."""
        assert output is not None
        assert output.get(DataFormattingService.TRAINING_MATRIX) is not None
        assert output.get(DataFormattingService.TESTING_MATRIX) is not None
        num_train = len(
            output.get(DataFormattingService.TRAINING_MATRIX).keys())
        num_test = len(output.get(DataFormattingService.TESTING_MATRIX).keys())
        assert num_train > num_test

    def testCheckImportData(self):
        """Sample CSVs parse to clean floats with matching row counts."""
        features = np.genfromtxt(self.current_working_dir +
                                 '/SampleClassifierDataFolder/features.csv',
                                 delimiter=',')
        results = np.genfromtxt(self.current_working_dir +
                                '/SampleClassifierDataFolder/results.csv',
                                delimiter=',')
        # Skip the header row ([1:]) before checking dtypes and NaNs.
        assert np.array(features[1:]).dtype == "float64"
        assert np.array(results[1:, 1]).dtype == "float64"
        assert not np.isnan(features[1:]).any()
        assert not np.isnan(results[1:, 1]).any()
        assert len(features) == len(results)

    def testCheckOneHotEncoding(self):
        """Both one-hot variants produce non-float (encoded) columns."""
        s = self.data_formatting_service
        categorical_pd = pd.read_csv(
            'SampleClassifierDataFolder/categorical.csv', delimiter=',')
        assert ((s.binaryOneHot(categorical_pd).dtypes.values !=
                 np.dtype('float64')).all())
        assert ((s.oneHot(categorical_pd).dtypes.values !=
                 np.dtype('float64')).all())

    def testSplit(self):
        """All four split partitions are non-empty."""
        x_test, x_train, y_test, y_train = self.fetchTrainAndTestData()
        assert (len(x_train) and len(x_test) and len(y_train)
                and len(y_test) != 0)

    def testStratifySplit(self):
        """Split partitions are non-empty; one-hot yields two columns."""
        x_test, x_train, y_test, y_train = self.fetchTrainAndTestData()
        assert (len(x_train) and len(x_test) and len(y_train)
                and len(y_test) != 0)
        categorical_pd = pd.read_csv(
            self.current_working_dir +
            '/SampleClassifierDataFolder/categorical.csv',
            delimiter=',')
        data_formatting_service = DataFormattingService(None)
        categorical_onehot = data_formatting_service.oneHot(categorical_pd)
        assert (np.shape(categorical_onehot))[1] == 2

    def testFeatureOrderIsPreserved(self):
        """Formatting without scaling/encoding keeps feature order intact."""
        original_input = self.data_formatting_service.inputs.features
        self.data_formatting_service.analyze_all = False  # don't attempt trimming
        formatted_output = self.data_formatting_service.formatData(
            False, False)
        self.validateMatrixOrderHasNotChanged(
            formatted_output, original_input,
            DataFormattingService.TESTING_MATRIX)
        self.validateMatrixOrderHasNotChanged(
            formatted_output, original_input,
            DataFormattingService.TRAINING_MATRIX)

    def validateMatrixOrderHasNotChanged(self, formatted_output,
                                         original_input, matrix):
        """Every cell line's formatted features must equal the originals."""
        for cell_line in formatted_output.get(matrix).keys():
            formatted_features = formatted_output.get(matrix).get(cell_line)
            original_features = original_input.get(cell_line)
            assert original_features == formatted_features

    def testFeatureScaling(self):
        """Scaling preserves relative ordering in both partitions."""
        x_test, x_train, y_test, y_train = self.fetchTrainAndTestData()

        self.scaleFeaturesAndAssert(x_test)
        self.scaleFeaturesAndAssert(x_train)

    def scaleFeaturesAndAssert(self, x_vals):
        """Scale ``x_vals`` and check each feature column kept its ordering."""
        feature_one_orig = list(x_vals.get("feature_one"))
        feature_two_orig = list(x_vals.get("feature_two"))
        feature_three_orig = list(x_vals.get("feature_three"))
        scaled_test = self.data_formatting_service.maybeScaleFeatures(
            x_vals, True)
        assert scaled_test
        scaled_test_vals_as_list = SafeCastUtil.safeCast(
            scaled_test.values(), list)
        self.assertFeaturesScaled(feature_one_orig, scaled_test_vals_as_list,
                                  0)
        self.assertFeaturesScaled(feature_two_orig, scaled_test_vals_as_list,
                                  1)
        self.assertFeaturesScaled(feature_three_orig, scaled_test_vals_as_list,
                                  2)

    def assertFeaturesScaled(self, feature, scaled_test_vals_as_list, index):
        """Pairwise check: scaling must be strictly order-preserving.

        Equal originals stay equal, and any strict inequality keeps the
        same direction after scaling (O(n^2) comparisons, fine for tests).
        """
        for i in range(0, len(feature)):
            for j in range(0, len(feature)):
                if feature[i] == feature[j]:
                    assert scaled_test_vals_as_list[i][
                        index] == scaled_test_vals_as_list[j][index]
                elif feature[i] < feature[j]:
                    assert scaled_test_vals_as_list[i][
                        index] < scaled_test_vals_as_list[j][index]
                else:
                    assert scaled_test_vals_as_list[i][
                        index] > scaled_test_vals_as_list[j][index]