def test_punctuation(self):
    config = templates.pipeline()
    config['data_loader'] = templates.data_loader_single_item_loader()
    config['steps'].append(templates.normalize_text_remove_punctuation())
    loader = DataPreprocess(config)
    process = loader.process_item(TEST_DATA)
    self.assertEqual("This isn t a TEST sentences ", process["data"])
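# The tests in this suite rely on a module-level TEST_DATA fixture that is not
# shown here. Judging from the expected outputs (punctuation removal, stopword
# removal, contraction expansion), it is most likely the string below -- an
# inference from the assertions, not confirmed by the source:
#
#   TEST_DATA = "This isn't a TEST sentences!"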
def test_whitespace(self):
    config = templates.pipeline()
    config['data_loader'] = templates.data_loader_single_item_loader()
    config['steps'].append(templates.normalize_text_remove_whitespace())
    loader = DataPreprocess(config)
    process = loader.process_item(" remove whitespace ")
    self.assertEqual("remove whitespace", process["data"])
def test_lower(self):
    config = templates.pipeline()
    config['data_loader'] = templates.data_loader_single_item_loader()
    config['steps'].append(templates.normalize_text_lowercase())
    loader = DataPreprocess(config)
    process = loader.process_item(TEST_DATA)
    self.assertEqual(TEST_DATA.lower(), process["data"])
def test_csv_pipeline(self):
    config = {
        "data_loader": {
            "name": "data_loader",
            "type": "csv",
            "file_path": "test_data/test.csv",
            "columns": {
                "id": "id",
                "data": "text",
                "additional_columns": ["username"]
            },
        },
        "steps": [
            {
                "name": "normalize_text",
                "type": "lowercase",
                "log_level": "INFO"
            },
        ],
    }
    loader = DataPreprocess(config)
    data = []
    for batch in loader.process_data():
        for item in batch:
            data.append(item["data"])
    self.assertEqual(4, len(data))
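# The column mapping above implies test_data/test.csv has an "id" column, a
# "text" column (mapped to "data"), and a "username" column, and the assertion
# implies four data rows. A plausible fixture -- this is an assumption, the
# actual file is not shown in the source -- could look like:
#
#   id,text,username
#   1,First TEST row,alice
#   2,Second TEST row,bob
#   3,Third TEST row,carol
#   4,Fourth TEST row,dave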
def test_url(self):
    config = templates.pipeline()
    config['data_loader'] = templates.data_loader_single_item_loader()
    config['steps'].append(templates.normalize_text_remove_urls())
    loader = DataPreprocess(config)
    process = loader.process_item("remove url http://www.google.com")
    self.assertEqual("remove url ", process["data"])
def test_stopwords(self):
    config = templates.pipeline()
    config['data_loader'] = templates.data_loader_single_item_loader()
    config['steps'].append(templates.normalize_text_lowercase())
    config['steps'].append(templates.normalize_text_remove_stopwords())
    loader = DataPreprocess(config)
    process = loader.process_item(TEST_DATA)
    self.assertEqual("isn't test sentences!", process["data"])
def test_lemmatizer(self):
    config = templates.pipeline()
    config['data_loader'] = templates.data_loader_single_item_loader()
    config['steps'].append(templates.normalize_text_lowercase())
    config['steps'].append(templates.normalize_text_lemmatizer())
    loader = DataPreprocess(config)
    process = loader.process_item("How many cities are there?")
    self.assertEqual("how many city are there?", process["data"])
def test_contractions(self):
    config = templates.pipeline()
    config['data_loader'] = templates.data_loader_single_item_loader()
    config['steps'].append(templates.normalize_text_lowercase())
    config['steps'].append(templates.normalize_text_expand_contractions())
    loader = DataPreprocess(config)
    process = loader.process_item(TEST_DATA)
    self.assertEqual("this is not a test sentences!", process["data"])
def __init__(self):
    parser = argparse.ArgumentParser(
        description='Classify the text in a given file')
    parser.add_argument('--text_file', type=str, help='File path to classify')
    args = parser.parse_args()
    self.text_file_path = args.text_file
    self.df = None
    self.X = None
    self.word_to_vector_model_path = r'models/w2v.pkl'
    self.dim_reduction_path = r'models/dim_reduction.pkl'
    self.model_path = r'models/svc_model.pkl'
    self.data_preprocessing = DataPreprocess()
    self.build_features = BuildFeatures()
def main():
    ##################################################
    # parse data from original data & construct images
    ##################################################
    print("parsing data from log files which are generated by Atheros-CSI-TOOL\n")
    data_generator = DataLogParser(conf.n_timestamps, conf.D, conf.step_size,
                                   conf.ntx_max, conf.nrx_max,
                                   conf.nsubcarrier_max, conf.data_folder,
                                   conf.log_folder, conf.skip_frames,
                                   conf.time_offset_ratio, conf.day_conf,
                                   conf.label)
    data_generator.generate_image_no_label(conf.draw_date, conf.draw_label)
    # test_data: dict keyed by label, with the images under each label
    test_data = data_generator.get_data_no_label()

    ##################################################
    # apply signal processing blocks to images
    ##################################################
    print("Pre-processing data\n")
    data_process = DataPreprocess(conf.n_timestamps, conf.D, conf.step_size,
                                  conf.ntx_max, conf.ntx, conf.nrx_max,
                                  conf.nrx, conf.nsubcarrier_max,
                                  conf.nsubcarrier, conf.data_shape_to_nn,
                                  conf.data_folder, conf.label)
    data_process.add_image_no_label(test_data)
    data_process.signal_processing(conf.do_fft, conf.fft_shape)
    data_process.prepare_shape()
    final_test_data = data_process.get_data_no_label()

    ##################################################
    # test data with the neural network
    ##################################################
    nn_model = NeuralNetworkModel(conf.data_shape_to_nn, conf.abs_shape_to_nn,
                                  conf.phase_shape_to_nn, conf.total_classes)
    print("Get test result using existing model (in test mode)\n")
    nn_model.load_model(conf.model_name)
    for key in final_test_data:
        plt.figure()
        total_test = len(final_test_data[key])
        cc = 1
        for idx in final_test_data[key]:
            # to output motion probability instead of labels, set output_label=False
            result = nn_model.get_no_label_result(final_test_data[key][idx],
                                                  output_label=True)
            plt.subplot(total_test, 1, cc)
            plt.plot(result)
            plt.title(idx)
            plt.ylim(0, 1.05)
            cc = cc + 1
        plt.suptitle(key)
    nn_model.end()
    plt.show()
    print("Done!")
def test_single_item_pipeline(self):
    config = {
        "data_loader": {
            "type": "single_item"
        },
        "steps": [
            {
                "name": "normalize_text",
                "type": "lowercase",
                "log_level": "INFO"
            },
        ],
    }
    loader = DataPreprocess(config)
    process = loader.process_item(TEST_DATA)
    self.assertEqual(TEST_DATA.lower(), process["data"])
def generate_evaluate(self, file_path):
    while True:
        for list_x, list_y, list_time_x, list_time_y in self.get_feature(file_path):
            list_x = DataPreprocess.add_feature(
                file_path, list_time_x) if self.tcn_add_features else list_x
            yield np.array(list_x), np.array(list_y)
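# Note: the generator above loops forever (while True), which is the usual
# pattern for Keras-style evaluation generators; the caller has to bound it
# with a step count. A minimal usage sketch, assuming a compiled Keras model
# and a known number of batches (both assumptions, not shown in the source):
#
#   model.evaluate(self.generate_evaluate(file_path), steps=steps_per_epoch)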
def execute():
    test_data = prepare_test_data()
    preprocessing = DataPreprocess(test_data, do_load_existing_tokenizer=True)
    prediction = make_prediction(preprocessing)
    roc_auc = evaluate_roc_auc(preprocessing, prediction > 0.5)
    accuracy = evaluate_accuracy_score(preprocessing, prediction > 0.5)
    print(f'Average ROC_AUC Score on Test Data: {roc_auc}')
    print(f'Average Accuracy Score on Test Data: {accuracy}')
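# The "prediction > 0.5" comparisons above turn model scores into hard 0/1
# labels before scoring. A small, self-contained illustration of that
# thresholding step (the 0.5 cut-off mirrors the code above; the sample
# values here are made up):
import numpy as np

probabilities = np.array([0.12, 0.87, 0.51, 0.49])
hard_labels = (probabilities > 0.5).astype(int)  # -> array([0, 1, 1, 0])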
def execute():
    # Import the training data csv file and save it into a dataframe
    training_data = pd.read_csv(TRAINING_DATA_LOC)
    preprocessing = DataPreprocess(training_data)
    rnn_model, history = build_rnn_model(preprocessing.padded_data,
                                         preprocessing.target_classes,
                                         preprocessing.embedding_layer)
    plot_training_history(rnn_model, history, preprocessing.padded_data,
                          preprocessing.target_classes)
def execute(data):
    training_data = pd.read_csv(data)
    preprocessing = DataPreprocess(training_data)
    lstm_model, history = build_lstm_model(preprocessing.X_t,
                                           preprocessing.target_classes,
                                           preprocessing.embedding_layer)
    plot_training_history(lstm_model, history, preprocessing.X_t,
                          preprocessing.target_classes)
def test_list_pipeline(self):
    config = {
        "data_loader": {
            "type": "list"
        },
        "steps": [
            {
                "name": "normalize_text",
                "type": "lowercase",
                "log_level": "INFO"
            },
        ],
    }
    loader = DataPreprocess(config)
    data = []
    for batch in loader.process_data(TEST_LIST):
        for item in batch:
            data.append(item["data"])
    test = [item.lower() for item in TEST_LIST]
    self.assertEqual(test, data)
def main():
    logger.info(SEPARATOR)
    configs = get_configs()

    # Run data preprocess
    if configs.data_preprocess_active:
        logger.info(configs)
        preprocess = DataPreprocess(configs)
        preprocess.data_preprocess()
        logger.info("Data preprocess finished!")

    # Run train model
    if configs.da_rnn_model_active:
        logger.info(configs)
        da_rnn_model = DaRnnModel(configs)
        da_rnn_model.run()
        logger.info("Da_rnnModel finished!")

    if configs.xgboost_gridsearch_model_active:
        xgboost_model = XgboostGridSearchModel(configs)
        xgboost_model.run()
        logger.info("XGboost finished!")

    if configs.tcn_big_file_model_active:
        tcn_model = TcnModel(configs)
        tcn_model.run()
        logger.info("TcnModel finished!")
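# get_configs() is not shown; the flags referenced above suggest it returns an
# object with one boolean switch per pipeline stage. A minimal stand-in for
# local experimentation (attribute names are taken from the code above, the
# values are assumptions):
from types import SimpleNamespace

configs_stub = SimpleNamespace(
    data_preprocess_active=True,
    da_rnn_model_active=False,
    xgboost_gridsearch_model_active=False,
    tcn_big_file_model_active=False,
)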
def read_data_from_disk(self, queue):
    # optional pre-processing arguments
    dataproc = DataPreprocess(queue,
                              coords=self.coord,
                              dataset_catgry=self.dataset_catgry,
                              dataset_type=self.data_type)
    h, w = self.params['input_size']
    dstep = self.params['dce_dstep']
    if self.data_type == 1:
        color_img = dataproc.load_colorimg(height=h, width=w, channels=3)
        depth_gt = dataproc.load_depthgt(height=h, width=w, channels=1)
        subsample_depth = dataproc.load_subsampledepth(height=h, width=w, channels=1)
        if self.params['orig_normalizefac']:
            depth_gt = tf.cast(depth_gt, tf.float32) * 100 / 256
            subsample_depth = tf.cast(subsample_depth, tf.float32) * 100 / 256
        else:
            depth_gt = tf.cast(depth_gt, tf.float32)
            subsample_depth = tf.cast(subsample_depth, tf.float32)
        color_img = tf.slice(color_img,
                             [self.params['truncated_height_start'], 0, 0],
                             [self.params['truncated_height_end'], w, 3])
        depth_gt = tf.slice(depth_gt,
                            [self.params['truncated_height_start'], 0, 0],
                            [self.params['truncated_height_end'], w, 1])
        subsample_depth = tf.slice(subsample_depth,
                                   [self.params['truncated_height_start'], 0, 0],
                                   [self.params['truncated_height_end'], w, 1])
    if self.datainput in ['color_dc_dcclabels']:
        color_img_processed = dataproc.preprocess_color(
            color_img, self.params['coloraugmentflag'])
        depth_gt = tf.cast(depth_gt, tf.float32)
        if self.params['Gen_uniformsampflag']:
            subsample_depth = dataproc.uniform_sampling(
                depth_gt, self.params['Uniform_samp'])
        subsampledepth_dcc = dataproc.depth_2_dcc_channelsgeneralize(
            subsample_depth, dstep, self.params['depth_maxrange'],
            spatial_dim=(self.params['truncated_height_end'], w))
        depth_gt_dcc = dataproc.depth_2_dcc_channelsgeneralize(
            depth_gt, self.params['dce_dstep'], self.params['depth_maxrange'],
            spatial_dim=(self.params['truncated_height_end'], w))
        # depth_gt_dcc = dataproc.depth_2_dcc_channelsgeneralize(depth_gt, dstep, self.params['depth_maxrange'], oorFlag=False, spatial_dim=(h, w))
        depth_gt_dcc = tf.squeeze(depth_gt_dcc)
        data_processed = tf.concat(
            [color_img_processed, subsampledepth_dcc, depth_gt_dcc, depth_gt],
            axis=2)
    else:
        raise ValueError('Data-Input Type is Unrecognized. Exiting ...\n')

    if self.random_mirror:
        data_processed = data_mirroring(data_processed)
    if self.random_crop:
        data, labels = self.random_crop_and_pad_data_and_labels(
            data_processed, self.params['crop_size'][0],
            self.params['crop_size'][1])
    else:
        data, labels = self.crop_pad_data_labels(
            data_processed, self.params['crop_size'][0],
            self.params['crop_size'][1])
    return data, labels
def __init__(self):
    self.model_path = r'models/svc_model.pkl'
    self.data_preprocessing = DataPreprocess()
    self.build_features = BuildFeatures()
    self.X = None
    self.y = None
class TrainModel:
    def __init__(self):
        self.model_path = r'models/svc_model.pkl'
        self.data_preprocessing = DataPreprocess()
        self.build_features = BuildFeatures()
        self.X = None
        self.y = None

    def run(self):
        # data preprocessing pipeline
        self.data_preprocessing.load_csv()
        self.data_preprocessing.clean_conversation()
        self.data_preprocessing.extract_meaning_phrases()
        self.data_preprocessing.group_convs_by_file_id()
        self.data_preprocessing.rm_dups_phrases_in_same_conv()
        self.X, self.y = self.data_preprocessing.get_X_y()

        # with open('X.pkl', 'rb') as fp:
        #     self.X = pickle.load(fp)
        # self.X = [list(a) for a in self.X]
        #
        # with open('y.pkl', 'rb') as fp:
        #     self.y = pickle.load(fp)

        # Train and test set
        X_train, X_test, y_train, y_test = train_test_split(self.X,
                                                            self.y,
                                                            test_size=0.1,
                                                            stratify=self.y)

        # build features
        # oversampling on training data only
        X_train, y_train = self.build_features.oversampling_on_training_data(
            X_train, y_train)

        # X_train = [' '.join(a).replace('[PAD]', '').strip() for a in X_train]
        # X_test = [' '.join(a).replace('[PAD]', '').strip() for a in X_test]

        # Word to vectors
        self.build_features.word_to_vectors_model(X_train)
        X_train = self.build_features.word_to_vectors_transformed(X_train)
        X_test = self.build_features.word_to_vectors_transformed(X_test)

        # Dimension reduction technique
        self.build_features.dimension_reduction_model(X_train)
        X_train = self.build_features.dimension_reduction_transformed(X_train)
        X_test = self.build_features.dimension_reduction_transformed(X_test)

        # train model
        model = LinearSVC(random_state=25)
        model.fit(X_train, y_train)
        print('\n\n')
        print('-*-' * 20)
        print('Training accuracy: ', model.score(X_train, y_train) * 100)
        print('Accuracy on unseen documents: ', model.score(X_test, y_test) * 100)
        print('-*-' * 20)

        # save the trained model
        pickle.dump(model, open(self.model_path, 'wb'))
def main():
    args = get_input_arguments()
    training_mode = (args.mode == 'Y')
    if args.mode not in ['Y', 'N']:
        raise ValueError("Invalid input value for mode: should be either 'Y' or 'N'")
    data_folder = conf.data_folder
    if training_mode:
        label = conf.train_label
        print('in training mode')
        print('training data from {} \nvalidation data from {}\n'.format(
            conf.training_date, conf.training_validate_date))
        print('training label is {}\n'.format(label))
        data_folder += "training/"
    else:
        label = conf.test_label
        print('in test mode')
        print('test date from {}'.format(conf.test_date))
        print('test label is {}\n'.format(label))
        data_folder += "test/"

    ##################################################
    # parse data from original data & construct images
    ##################################################
    print("parsing data from log files which are generated by Atheros-CSI-TOOL\n")
    data_generator = DataLogParser(conf.n_timestamps, conf.D, conf.step_size,
                                   conf.ntx_max, conf.nrx_max,
                                   conf.nsubcarrier_max, data_folder,
                                   conf.log_folder, conf.skip_frames,
                                   conf.time_offset_ratio, conf.day_conf,
                                   label)
    train_date = conf.training_date if training_mode else []
    if training_mode:
        data_generator.generate_image(conf.training_date,
                                      conf.training_validate_date)
    else:
        data_generator.generate_image([], conf.test_date)
    # train_data, test_data: dicts keyed by label, with the images under each label
    train_data, test_data = data_generator.get_data()

    ##################################################
    # apply signal processing blocks to images
    ##################################################
    print("Pre-processing data\n")
    data_process = DataPreprocess(conf.n_timestamps, conf.D, conf.step_size,
                                  conf.ntx_max, conf.ntx, conf.nrx_max,
                                  conf.nrx, conf.nsubcarrier_max,
                                  conf.nsubcarrier, conf.data_shape_to_nn,
                                  data_folder, label)
    data_process.load_image(training_mode, False, train_data, test_data)
    data_process.signal_processing(conf.do_fft, conf.fft_shape)
    data_process.prepare_shape()
    x_train, y_train, x_test, y_test = data_process.get_data()

    ##################################################
    # train or test data with the neural network
    ##################################################
    nn_model = NeuralNetworkModel(conf.data_shape_to_nn, conf.abs_shape_to_nn,
                                  conf.phase_shape_to_nn, conf.total_classes)
    nn_model.add_data(x_train, y_train, x_test, y_test)
    if training_mode:
        print("Building a new model (in training mode)\n")
        nn_model.cnn_model_abs_phase()
        nn_model.fit_data(conf.epochs)
        nn_model.save_model(conf.model_name)
    else:
        print("Get test result using existing model (in test mode)\n")
        nn_model.load_model(conf.model_name)
        result = nn_model.get_test_result(label)
        # nn_model.save_result(result, conf.file_prefix + conf.test_result_filename)
    nn_model.end()
    print("Done!")
class Classifier:
    def __init__(self):
        parser = argparse.ArgumentParser(
            description='Classify the text in a given file')
        parser.add_argument('--text_file', type=str, help='File path to classify')
        args = parser.parse_args()
        self.text_file_path = args.text_file
        self.df = None
        self.X = None
        self.word_to_vector_model_path = r'models/w2v.pkl'
        self.dim_reduction_path = r'models/dim_reduction.pkl'
        self.model_path = r'models/svc_model.pkl'
        self.data_preprocessing = DataPreprocess()
        self.build_features = BuildFeatures()

    def read_text_file(self):
        with open(self.text_file_path) as fp:
            text = [x.strip('\r\n') for x in fp.readlines()]
        return text

    def create_dataframe(self, text):
        self.df = pd.DataFrame(text, columns=['conversation'])

    def load_model(self):
        return pickle.load(open(self.model_path, 'rb'))

    def run(self):
        text = self.read_text_file()
        self.create_dataframe(text)

        # data preprocessing pipeline
        self.data_preprocessing.test_fill_df(self.df)
        self.data_preprocessing.clean_conversation()
        self.data_preprocessing.extract_meaning_phrases()
        self.data_preprocessing.test_group_convs()
        self.data_preprocessing.rm_dups_phrases_in_same_conv()
        X_test = self.data_preprocessing.test_get_X()
        print(len(X_test))

        # Word to vectors
        X_test = self.build_features.word_to_vectors_transformed(X_test)

        # Dimension reduction technique
        X_test = self.build_features.dimension_reduction_transformed(X_test)

        model = self.load_model()
        print('-*-' * 20)
        predicted_class = model.predict(X_test)
        print('Result: ', predicted_class)
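# Example invocation, assuming this class is driven by a script named
# classifier.py (the script and input file names are assumptions; the
# --text_file flag is the one defined above):
#
#   python classifier.py --text_file conversations.txt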