def test_num_features(self):
    """Check the column count of the features built from two text columns.

    Configures the feature pipeline with ``min_df=1``, fits it on the
    ``text1`` and ``text2`` columns of the text fixture together with the
    ``label`` column, transforms the same input and asserts that the
    result has 70 columns.
    """
    # TODO THIS TEST REDUNDANT: test_features_dims_two_text_columns asserts
    # the same column count (plus the row count) on the same setup.
    configure_pipeline(self.__experiment_dir, 'data', min_df=1)
    feature_pipe = PipelineFeatures(self.__experiment_dir, 'train.pkl.bz2')
    X_text = [self.__text[['text1']], self.__text[['text2']]]
    feature_pipe.fit(X_text, self.__text['label'])
    X = feature_pipe.transform(X_text, self.__text['label'])
    # No third argument: assertEqual's third positional parameter is the
    # failure message (``msg``), not a tolerance.
    self.assertEqual(X.shape[1], 70)
def test_features_dims_two_text_columns(self):
    """Check both dimensions of the features built from two text columns.

    The feature pipeline is configured with ``min_df=1``, fitted on the
    ``text1`` and ``text2`` columns with the ``label`` column, and the
    transformed output is expected to have 10 rows and 70 columns.
    """
    configure_pipeline(self.__experiment_dir, 'data', min_df=1)
    labels = self.__text['label']
    text_columns = [self.__text[['text1']], self.__text[['text2']]]

    pipeline = PipelineFeatures(self.__experiment_dir, 'train.pkl.bz2')
    pipeline.fit(text_columns, labels)
    features = pipeline.transform(text_columns, labels)

    # Read both dimensions before asserting anything.
    dims = features.shape
    row_count, col_count = dims[0], dims[1]
    self.assertEqual(row_count, 10)
    self.assertEqual(col_count, 70)
# End-to-end run: text processing, feature engineering, then model training.
# Relies on `experiment_path`, `data_path`, `classifier_name`, `MLFactory` and
# the `pt` / `pf` / `pm` modules being defined elsewhere.

# ---- Text processing --- #
pt.configure_pipeline(
    experiment_path,
    data_path,
    spell=True,
    split_words=True,
    text_headers=['RECDESC', 'EXPDESC'],
    stop_words=True,
    lemmatize=False,
    stemm=False,
)
# fit_transform on the training file, transform only on the validation file.
pipe_text = pt.PipelineText(experiment_path, 'train.pkl.bz2')
pipe_text.fit_transform()
pipe_text = pt.PipelineText(experiment_path, 'valid.pkl.bz2')
pipe_text.transform()

# ---- Feature engineering --- #
pf.configure_pipeline(
    experiment_path,
    feature_set=['frequency_matrix'],
    num_features=0,
    idf=True,
    feature_selection_type='Logistic',
    min_df=3,
    min_ngram=1,
    max_ngram=3,
)
# Use the same experiment directory that was just configured above instead
# of a hard-coded 'exp_1', so this stage agrees with every other stage.
pipe_features = pf.PipelineFeatures(experiment_path, 'train.pkl.bz2')
pipe_features.fit_transform()
pipe_features = pf.PipelineFeatures(experiment_path, 'valid.pkl.bz2')
pipe_features.transform()

pm.configure_pipeline(experiment_path)

# ---- MODEL 1 --- #
cls = MLFactory.factory(classifier_name)
pipe_ml = pm.MLTrainTest(experiment_path, 'train.pkl.bz2', classifier=cls)
pipe_ml.fit_transform()
def config_location(self):
    """Compare the features config path with a fixed absolute path.

    Calls ``configure_pipeline`` for the experiment directory, joins that
    directory with ``features/config.json`` and asserts the joined path
    equals ``/tests/exp/features/config.json``.

    NOTE(review): the name has no ``test`` prefix, so default unittest
    discovery will presumably not collect this method -- confirm whether
    that is intentional. The assertion only checks the joined string; it
    does not look at the file system.
    """
    configure_pipeline(self.__experiment_dir, 'data')
    expected_path = '/tests/exp/features/config.json'
    actual_path = os.path.join(
        self.__experiment_dir, 'features', 'config.json'
    )
    self.assertEqual(actual_path, expected_path)
def test_config_file_non_empty(self):
    """Check that the features config file has a non-zero size.

    Calls ``configure_pipeline`` for the experiment directory, then reads
    the size of ``features/config.json`` under that directory and asserts
    it is not 0. ``os.path.getsize`` raises ``OSError`` if the file is
    missing, which also fails the test.
    """
    configure_pipeline(self.__experiment_dir, 'data')
    config_file = os.path.join(
        self.__experiment_dir, 'features', 'config.json'
    )
    self.assertNotEqual(os.path.getsize(config_file), 0)