def test__check_is_list(self):
    df_long = self.spark.read.csv('tests/fixtures/preprocess/long.csv',
                                  header=True)
    Preprocess(df_labels=df_long, columns=['country', 'protein'])
    with self.assertRaises(AssertionError):
        Preprocess(df_labels=df_long, columns='protein')
def test__check_is_spark_data_frame(self):
    df_simple_table = self.spark.read.csv(
        'tests/fixtures/preprocess/simple_table.csv', header=True)
    pd_df_simple_table = pd.read_csv(
        'tests/fixtures/preprocess/simple_table.csv')
    Preprocess(df_labels=df_simple_table, columns=[''])
    with self.assertRaises(AssertionError):
        Preprocess(df_labels=pd_df_simple_table, columns=[''])
def test__check_nulls_in_index_column(self):
    df_nulls = self.spark.read.csv(
        'tests/fixtures/preprocess/nulls_recipe_id.csv', header=True)
    df_no_nulls = self.spark.read.csv(
        'tests/fixtures/preprocess/no_nulls_recipe_id.csv', header=True)
    Preprocess(df_labels=df_no_nulls, columns=[''])
    with self.assertRaises(AssertionError):
        Preprocess(df_labels=df_nulls, columns=[''])
def test__remove_columns(self):
    df_long = self.spark.read.csv('tests/fixtures/preprocess/long.csv',
                                  header=True)
    preprocessor_2_columns = Preprocess(df_labels=df_long,
                                        columns=['country', 'protein'])
    preprocessor_2_columns._remove_columns()
    self.assertEqual(len(preprocessor_2_columns.df_labels.columns), 1 + 2)
    preprocessor_all = Preprocess(df_labels=df_long, columns='all')
    preprocessor_all._remove_columns()
    self.assertEqual(len(preprocessor_all.df_labels.columns), 4)
def get_environment_state(self):
    state = self.game_environment.get_state()
    img = state.screen_buffer
    img = Preprocess(img).image
    game_vars = state.game_variables
    return img, game_vars
def controler_1(loc_fp, cat_fp, mongo_username, mongo_password, dbname,
                collectionname):
    query_dict = GetQuery(loc_fp, cat_fp).get_final_query_dict_bygugun()
    i = 0
    for category_query, local_category_query_list in query_dict.items():
        i += 1
        api_id, api_secret = get_api_id_secret(i)
        display = 10
        category_query_dictlist = NaverRequests(
            api_id, api_secret, local_category_query_list,
            display).get_category_query()
        if len(category_query_dictlist) < 1:
            print("{} / {} : {}, items : 0 #### Failed".format(
                i, len(query_dict.items()), category_query))
        else:
            processed_df = Preprocess(category_query,
                                      category_query_dictlist).preprocess()
            save_to_mongodb(processed_df,
                            username=mongo_username,
                            userpassword=mongo_password,
                            dbname=dbname,
                            collectionname=collectionname)
            print("{} / {} : {}, items : {}".format(
                i, len(query_dict.items()), category_query,
                processed_df.shape[0]))
def text_preprocess(text):
    pre = Preprocess(text)
    text = pre.clean_text(text)
    text = pre.clean_contractions(text, pre.contraction_mapping)
    text = pre.correct_spelling(text, pre.mispell_dict)
    text = pre.clean_special_chars(text, pre.punct, pre.punct_mapping)
    return text
def __init__(self, data_path):
    self.path = data_path
    self.preprocess = Preprocess()
    self.gender = []
    self.userid = []
    self.model = Model()
    self.model.load_model()
def LoadData():
    print("Preprocess the dataset...", end=' ')
    preprocess = Preprocess()
    SRC, TRG, tr, valid, ts = preprocess.Build()
    print("DONE")
    return SRC, TRG, tr, valid, ts
def __init__(self):
    self.vect = TfidfVectorizer()
    self.data = None
    self.vect_data = None
    self.pre = Preprocess()
def __init__(self):
    cursor.execute("SELECT content FROM data")
    scripts = cursor.fetchall()
    fw = open('vector.txt', 'w')
    fresult = open('result.txt', 'w')
    mPreprocess = Preprocess()
    mPairedToken = PairToken()
    mConvertVector = ConvertVector()
    stanford = StanfordCoreNLP('http://localhost:9000')
    for script in scripts:
        # if type(script) is tuple:
        listToken = mPreprocess.exec(script[0])
        # else:
        #     listToken = mPreprocess.exec(script)
        listCouple = mPairedToken.exec(listToken)
        output = stanford.annotate(script[0],
                                   properties={'annotators': 'coref',
                                               'outputFormat': 'json'})
        for mCoupleToken in listCouple:
            if self.checkCoreF(output['corefs'], mCoupleToken):
                # fresult.write(str(1) + ' ' + mCoupleToken.np1.text + ' ' + mCoupleToken.np2.text)
                fresult.write(str(1))
                fresult.write('\n')
            else:
                # fresult.write(str(-1) + ' ' + mCoupleToken.np1.text + ' ' + mCoupleToken.np2.text)
                fresult.write(str(-1))
                fresult.write('\n')
            vector = mConvertVector.exec(mCoupleToken)
            fw.write(str(vector))
            fw.write('\n')
def preprocess_pipeline(self, df, upper_quantile, lower_quantile, target_name,
                        none_values, outlier_column: str = None,
                        polytrans_columns: list = None, corr_percentage=0.7,
                        test_size=0.25, validation=False):
    preprocess = Preprocess(df)
    target_feature = preprocess.target_feature(none_values, target_name)
    if outlier_column is not None:
        preprocess.drop_outliers(outlier_column, upper_quantile, lower_quantile)
    preprocess.drop_multicoll_columns(ALLOWED_CORR_PER)
    preprocess.imputer()
    if polytrans_columns is not None:
        preprocess.polytrans(polytrans_columns)
    features = preprocess.one_hot_encoder()
    if validation:
        x_train, y_train, x_test, y_test, x_validation, y_validation = train_test_split(
            features, target_feature, test_size, validation)
        return x_train, y_train, x_test, y_test, x_validation, y_validation
    x_train, y_train, x_test, y_test = preprocess.train_test_split(
        features, target_feature, test_size, validation)
    # NB: the non-validation branch returns in (x_train, x_test, y_train, y_test)
    # order, which differs from the unpacking and the validation branch above.
    return x_train, x_test, y_train, y_test
def main():
    '''Training of the model on the preprocessed data.'''
    preprocess = Preprocess()
    data = preprocess.getData(
        path="creditcard.csv",      # path of the csv file
        feature_incides=[0, 29],    # column indices of the features
        label_indices=[30],         # column indices of the labels
        training_size=0.5,          # size for the training set
        standardize=True,           # apply standardization?
        eval_set=True               # create evaluation set?
    )
    model = Model(
        batch_size=10,              # size of the training batch
        epochs=50,                  # number of training epochs
        nodes=[29, 200, 2],         # list of neurons: first entry is the number of
                                    # input neurons, last entry the number of output
                                    # neurons; the values in between are the hidden neurons
        learning_rate=0.0001,       # learning rate for the training
        hidden_activation="sigmoid",  # activation for the hidden nodes: "tanh", "sigmoid" or "relu"
        output_activation="linear",   # activation for the output nodes: "tanh", "sigmoid" or "linear"
        data=data,                  # the loaded and preprocessed data from the csv file
        do_eval=True                # measure accuracy on the evaluation set?
    )
    model.train()
def getConstraints(self, setnumber=""):
    try:
        preprocess = Preprocess()
        absolute_path = path.join(self._path, self._data['params'])
        count = []
        constraints = []
        for filename in listdir(absolute_path):
            match = re.match(self._patterns['params'], filename)
            if match:
                if match.group(2) == setnumber:
                    count.append(match.group(3))
                    with open(path.join(absolute_path, filename), "r") as c:
                        constraints.append(
                            preprocess.preprocessConstraints(
                                c.read().split("\n")))
        if constraints == []:
            return {"error": True, "message": "Something's up"}
        return {"error": False, "constraints": constraints, "count": count}
    except FileNotFoundError:
        return {
            "error": True,
            "message": """Files not found. Please make sure that there is a
            directory called 'params' in the given path, with the files named
            as params.txt or params1.txt or params1-1.txt"""
        }
def closed_form_extra_features(self):
    preprocess1 = Preprocess()
    x_set = preprocess1.matrixify(self.data, 60)
    y_set = Preprocess.get_y(self.data)
    lengths = []
    length_squared = []
    for datapoint in self.data:
        text_length = len(datapoint['text'])
        lengths.append(text_length)
    children_length_inter = []
    children_list = []
    log_children_list = []
    for datapoint in self.data:
        children_list.append(datapoint['children'])
        if datapoint['children'] != 0:
            log_children_list.append(math.log(datapoint['children']))
        else:
            log_children_list.append(0)
    for length, children in zip(lengths, children_list):
        children_length_inter.append(length * children)
    preprocess1.add_features(children_length_inter)
    x_set = preprocess1.add_features(log_children_list)
    x_set = feature_selector.backwardElimination(x_set, y_set, 0.1)
    return self.run_model(x_set, y_set)
def display_training_and_validation_error(self):
    num_words = 160
    word_nums = np.arange(num_words)
    val_error_list = []
    train_error_list = []
    preprocess1 = Preprocess()
    x_set = preprocess1.matrixify(self.data, num_words)
    y_set = Preprocess.get_y(self.data)
    for x in word_nums:
        cur = x_set[:, 3:3 + x]
        print("Running on top " + str(x) + " words")
        val_error, train_error = self.run_model(cur, y_set)
        val_error_list.append(val_error)
        train_error_list.append(train_error)
    fig, ax = plt.subplots()
    plt.scatter(word_nums, val_error_list, color='blue', s=5,
                label="Validation set")
    plt.scatter(word_nums, train_error_list, color='red', s=5,
                label="Training set")
    plt.title("MSE vs number of words used")
    ax.set_xlabel("Words Used")
    ax.set_ylabel("MSE")
    plt.legend(loc='upper right')
    plt.show()
def main():
    # Load and merge review, business and user data
    loader = DataLoader(path="../data/")
    df = loader.merge()
    preprocessor = Preprocess(df)
    # Remove weekday columns
    preprocessor.rm_weekdays()
    # Convert categorical variables to one-hot encoded variables and convert
    # strings to numerical types if possible
    preprocessor.categorical_to_numerical()
    # Fill NA observations with the mode of the given feature
    preprocessor.fill_na()
    # Unravel and one-hot encode business categories
    preprocessor.unravel_categories()
    # Sort observations by date and reindex with this ordering
    preprocessor.sort_by_date()
    # Form temporal train-val-test split
    preprocessor.split_data()
    # Write the final dataframe to a pickle file
    preprocessor.dump("../data/yelp_df.pkl")
def create_threads(detector):
    config = configparser.ConfigParser()
    config.read(FACE_DETEC_CONF)
    video_decoders = []
    for item in config['videostream']:
        preprocesser = Preprocess(config['videostream'][item],
                                  len(video_decoders), MODEL_WIDTH,
                                  MODEL_HEIGHT)
        video_decoders.append(preprocesser)
    rtsp_num = len(video_decoders)
    if rtsp_num == 0:
        log_error("No video stream name or addr configuration in ",
                  FACE_DETEC_CONF)
        return None, None
    postprocessor = Postprocess(detector)
    display_channel = int(config['display']['channel'])
    if (display_channel is None) or (display_channel >= rtsp_num):
        log_info("No video to display, display configuration: ",
                 config['display']['channel'])
    else:
        video_decoders[display_channel].set_display(True)
    ret = postprocessor.create_presenter_channel(FACE_DETEC_CONF)
    if ret == False:
        log_error("Create presenter channel failed")
        return None, None
    return video_decoders, postprocessor
def train(self, data, X_column, y_columns):
    if y_columns is None:
        _ = data.columns.to_list()
        y_columns = list(set(_) - set([X_column]))
    X = data[X_column]
    y = data.drop(X_column, axis=1)
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=42,
                                                    test_size=0.2)
    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform(ytrain[y_columns].values)
    # test_labels not used when training
    # test_labels = mlb.fit_transform(ytest[y_columns].values)
    train_cleaned = xtrain.copy(deep=True).apply(Preprocess().clean_text)
    # test_cleaned not used when training
    # test_cleaned = xtest.copy(deep=True).apply(clean_text)
    vectorizer = TfidfVectorizer()
    vectorised_train_documents = vectorizer.fit_transform(train_cleaned)
    powersetsvc = LabelPowerset(LinearSVC())
    powersetsvc.fit(vectorised_train_documents, train_labels)
    dump(powersetsvc, open("powersetsvc.pickle", "wb"))
    with open('vec.pickle', 'wb') as f1:
        dump(vectorizer, f1)
    return powersetsvc, vectorizer
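The train method above persists the fitted LabelPowerset classifier and the TfidfVectorizer to disk. As a hedged illustration (not part of the original code), loading those artifacts and reusing them for prediction could look like the sketch below; the pickle-based loading, the file names and the sample input are assumptions.

# Hypothetical inference sketch, not from the original project: it assumes the
# `dump` calls in train() came from pickle and reuses the saved file names.
# Decoding label names would additionally require persisting the
# MultiLabelBinarizer fitted during training.
from pickle import load

with open("powersetsvc.pickle", "rb") as f_model:
    powersetsvc = load(f_model)
with open("vec.pickle", "rb") as f_vec:
    vectorizer = load(f_vec)

new_docs = ["example document to classify"]   # hypothetical input text
features = vectorizer.transform(new_docs)     # same TF-IDF vocabulary as training
predicted = powersetsvc.predict(features)     # sparse binary label matrix
print(predicted.toarray())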
def gettimeparams(self, method):
    time_window = self.time_win.get()
    entries = int(int(self.sr) * float(time_window))
    k = 0
    process = Preprocess()
    if self.fftflag == True:
        l = len(self.tfdata)
        n = len(self.tfdata[0])
        m = len(self.tfdata[0][0])
        for i in range(l):
            for j in range(n):
                k = 0
                self.classes_new = []
                while k < m / entries:
                    if (all(x == self.classes[k * entries]
                            for x in self.classes[k * entries:(k + 1) * entries])):
                        self.features.append(
                            getattr(process, method)(
                                self.tfdata[i][j][k * entries:(k + 1) * entries]))
                        self.classes_new.append(self.classes[k * entries])
                        k += 1
                    elif (all(x == self.classes[k * entries]
                              for x in self.classes[k * entries:(k + 1) * entries]) == False):
                        k += 1
                self.final_df[method + "_fft_{}_{}".format(i, j)] = self.features
                self.features = []
    else:
        df = pd.read_csv(self.filename)
        [m, n] = df.shape
        df = pd.DataFrame(df.values, columns=range(n))
        self.classes = df[n - 1]
        self.classes_fin = self.classes
        df = df.drop(labels=n - 1, axis=1)
        [m, n] = df.shape
        for i in range(n):
            k = 0
            self.classes_new = []
            while k < m / entries:
                if (all(x == self.classes[k * entries]
                        for x in self.classes[k * entries:(k + 1) * entries])):
                    self.features.append(
                        getattr(process, method)(
                            df[i].iloc[k * entries:(k + 1) * entries]))
                    self.classes_new.append(self.classes[k * entries])
                    k += 1
                elif (all(x == self.classes[k * entries]
                          for x in self.classes[k * entries:(k + 1) * entries]) == False):
                    k += 1
            self.final_df[method + "_{}".format(i)] = self.features
            self.features = []
        self.classes_fin = self.classes_new
def main(model_num=1):
    preprocess = Preprocess()
    texts_train, labels_train = preprocess.preprocessData('../projet2/train.txt',
                                                          mode="train")
    texts_dev, labels_dev = preprocess.preprocessData('../projet2/dev.txt',
                                                      mode="train")
    MAX_SEQUENCE_LENGTH = 24
    LSTM_DIM = 64
    HIDDEN_LAYER_DIM = 30
    NUM_CLASSES = 4
    GAUSSIAN_NOISE = 0.1
    DROPOUT = 0.2
    DROPOUT_LSTM = 0.2
    BATCH_SIZE = 200
    X_train, X_val, y_train, y_val = train_test_split(texts_train, labels_train,
                                                      test_size=0.2,
                                                      random_state=42)
    labels_categorical_train = to_categorical(np.asarray(y_train))
    labels_categorical_val = to_categorical(np.asarray(y_val))
    labels_categorical_dev = to_categorical(np.asarray(labels_dev))
    embedding = Embedding('../projet2/emosense.300d.txt')
    embeddings = embedding.getMatrix()
    tokenizer = embedding.getTokenizer()
    message_first_message_train, message_second_message_train, message_third_message_train = get_sequences(
        X_train, MAX_SEQUENCE_LENGTH, tokenizer)
    message_first_message_val, message_second_message_val, message_third_message_val = get_sequences(
        X_val, MAX_SEQUENCE_LENGTH, tokenizer)
    message_first_message_dev, message_second_message_dev, message_third_message_dev = get_sequences(
        texts_dev, MAX_SEQUENCE_LENGTH, tokenizer)
    model = CustomModel(model_num)
    model.build(embeddings, MAX_SEQUENCE_LENGTH, LSTM_DIM, HIDDEN_LAYER_DIM,
                NUM_CLASSES, noise=GAUSSIAN_NOISE, dropout_lstm=DROPOUT_LSTM,
                dropout=DROPOUT)
    model.summary()
    history = model.train(message_first_message_train,
                          message_second_message_train,
                          message_third_message_train,
                          labels_categorical_train,
                          message_first_message_val,
                          message_second_message_val,
                          message_third_message_val,
                          labels_categorical_val)
    y_pred = model.predict([
        message_first_message_dev, message_second_message_dev,
        message_third_message_dev
    ])
def compile(self, dataset, lookahead, dense, dimension):
    prep = Preprocess(emb_file=self.emb_file, dataset=dataset,
                      lookahead=lookahead, dense=dense, dimension=dimension)
    num_classes = prep.num_classes
    x_train, y_train, x_test, y_test = prep.x_train, prep.y_train, prep.x_test, prep.y_test
    return x_train, y_train, x_test, y_test, num_classes
def test__convert_column_argument(self):
    df_long = self.spark.read.csv('tests/fixtures/preprocess/long.csv',
                                  header=True)
    preprocessor = Preprocess(df_labels=df_long, columns='all')
    self.assertEqual(len(df_long.columns) - 1, len(preprocessor.columns))
def setUp(self):
    self.test_size = 12000
    self.data = data[:self.test_size]
    self.training_set = data[:10000]
    self.validation_set = data[10000:11000]
    self.testing_set = data[11000:12000]
    preprocess1 = Preprocess()
    preprocess1.preprocess(self.training_set)
    self.x_train = preprocess1.matrixify(self.training_set)
    self.y_train = Preprocess.get_y(self.training_set)
    preprocess2 = Preprocess()
    preprocess2.preprocess(self.validation_set)
    self.x_val = preprocess2.matrixify(self.validation_set)
    self.y_val = Preprocess.get_y(self.validation_set)
def main():
    preprocess = Preprocess()
    preprocess.check_data_distribution()
    print("\n\n*********** ANALYSIS PART I *******************")
    partI_classifier = Classifiers(1)
    partI_classifier.draw_auc_curve(1)
def main():
    df = load_csv_to_df(path, input_filename)
    print(df.shape)
    pipeline = Pipeline([('apply_rules', Preprocess())])
    df = pipeline.fit_transform(df)
    dump_df_to_csv(df, path, output_filename)
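The pipeline above only works because this Preprocess exposes scikit-learn's transformer interface (fit returning self, transform returning the modified frame). The actual class is not shown in these snippets; the following is a minimal sketch under that assumption, with hypothetical cleanup rules standing in for the real ones.

# Hypothetical sketch: the real Preprocess used above is not shown here. To be
# usable inside sklearn's Pipeline it only needs fit/transform; the specific
# rules below (dropping empty rows, normalising column names) are assumptions.
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


class Preprocess(BaseEstimator, TransformerMixin):
    """Assumed rule-based cleaner compatible with sklearn pipelines."""

    def fit(self, X, y=None):
        # Nothing is learned from the data; the rules are fixed.
        return self

    def transform(self, X):
        df = X.copy()
        df = df.dropna(how="all")  # drop rows that contain no values at all
        df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
        return df


if __name__ == "__main__":
    demo = pd.DataFrame({"Col A": [1, None], "Col B": [2, None]})
    pipeline = Pipeline([("apply_rules", Preprocess())])
    print(pipeline.fit_transform(demo))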
def test_preprocess(self):
    df_recipe_info = self.spark.read.csv(
        'tests/fixtures/preprocess/recipe_info.csv', header=True)
    preprocessor_all = Preprocess(df_labels=df_recipe_info, columns='all')
    df_preprocessed_all = preprocessor_all.preprocess()
    self.assertEqual(df_preprocessed_all.count(), df_recipe_info.count() - 1)
    preprocessor_country = Preprocess(df_labels=df_recipe_info,
                                      columns=['country'])
    df_preprocessed_country = preprocessor_country.preprocess()
    self.assertEqual(df_preprocessed_country.count(),
                     df_recipe_info.count() - 1)
    self.assertEqual(len(df_preprocessed_country.columns), 1 + 4)
def test_remove_non_alpha(self):
    preprocessor = Preprocess()
    preprocessor.preprocess_remove_non_alpha(self.data)
    for point in self.data:
        for word in point['text']:
            try:
                self.assertTrue(word.isalpha())
            except AssertionError:
                print(word)
def __init__(self): """initialize dataset and load model""" self.model = load_model(config.model_path) print("[Log] Pretrained model was loaded.") self.preprocess = Preprocess(database_path=config.database_path) print("[Log] Preprocess object was created.") self.database = self.init_database()
def main():
    # df = pd.read_csv(os.path.join(path, input_filename), dtype='unicode')
    df = load_csv_to_df(path, input_filename)
    print(df.shape)
    pipeline = Pipeline([('apply_rules', Preprocess())])
    df = pipeline.fit_transform(df)
    dump_df_to_csv(df, path, output_filename)