def cv_1fold(df, df_store, with_pca=True):
    train = df.copy().iloc[::-1]
    train.Date = pd.to_datetime(train.Date)
    train_set = train[train.Date < '2015-06-19']
    test_set = train[train.Date >= '2015-06-19']
    reg_model = Regressor()
    X_train, y_train, X_PCA_train = pp.Preprocessor().transform(
        df_store, train_set)
    X_test, y_test, X_PCA_test = pp.Preprocessor().transform(
        df_store, test_set)
    if with_pca:
        X_train = X_PCA_train.copy()
        X_test = X_PCA_test.copy()
    # Dummy variables can induce differences in the schemas
    missing_test = set(X_train.columns) - set(X_test.columns)
    missing_train = set(X_test.columns) - set(X_train.columns)
    for c in missing_test:
        X_test[c] = 0
    for c in missing_train:
        X_train[c] = 0
    # Reorder to match column order in train and test
    X_test = X_test[X_train.columns]
    # Model fitting on training set
    train_model(reg_model, X_train, y_train)
    # Scoring on test set
    y_pred = reg_model.predict(X_test)
    rmse_scores = rmse(y_test, y_pred)
    r2_scores = r2(y_test, y_pred)
    print("RMSE = " + str(rmse_scores) + " | R² = " + str(r2_scores))
    results = {}
    results['RMSE'] = rmse_scores
    results['R2'] = r2_scores
    return results
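# Example call (a sketch, not part of the original code): the CSV paths are
# hypothetical placeholders for the sales table and the per-store metadata
# table that cv_1fold expects.
if __name__ == "__main__":
    import pandas as pd

    sales_df = pd.read_csv("data/train.csv")   # hypothetical path: rows with Date, Store, Sales, ...
    store_df = pd.read_csv("data/store.csv")   # hypothetical path: per-store metadata
    scores = cv_1fold(sales_df, store_df, with_pca=True)
    print(scores["RMSE"], scores["R2"])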
def get_pipeline_steps(config):
    """
    Determine whether the full pipeline / a section of it is to be run.
    Return a list of parallel steps to run, and boolean variables denoting
    whether the serial steps (batching, relation extraction) should be run.
    """
    steps = []
    partial_execution = config.getboolean('General', 'partial_execution')
    if partial_execution:
        start_step = config.getint('General', 'start_step')
        end_step = config.getint('General', 'end_step')
    else:
        start_step = 1
        end_step = 6
    # Run batching and relation extraction steps?
    batching = True if start_step == 1 else False
    rel_extraction = True if end_step == 6 else False
    # nel = True if start_step <= 4 and end_step > 4 else False
    # Parallel pipeline steps (parsing.UnstParser removed from the list)
    parallel_step_list = [
        pre.Preprocessor(config),
        ner.Ner(config),
        nel.Nel(config)
    ]
    parallel_steps = parallel_step_list[max(0, start_step - 2):end_step - 1]
    return parallel_steps, batching, rel_extraction
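# Usage sketch (assumption: the pipeline is configured through a standard
# configparser.ConfigParser; the config file name below is a hypothetical placeholder).
if __name__ == "__main__":
    import configparser

    config = configparser.ConfigParser()
    config.read("pipeline.cfg")  # hypothetical config file with a [General] section
    parallel_steps, batching, rel_extraction = get_pipeline_steps(config)
    print(len(parallel_steps), batching, rel_extraction)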
def test_serialization(self):
    """Test serialization and de-serialization code"""
    pre_orig = pp.Preprocessor()
    pre_test = pp.Preprocessor()
    # Populate orig from input file, create new object
    # from serialized JSON
    pre_orig.preprocess("input/bible_characters.txt", 5)
    txt = pre_orig.to_json()
    pre_test.from_json(txt)
    # Check non-numeric attributes
    self.assertEqual(pre_orig.get_max_length(), pre_test.get_max_length())
    self.assertEqual(pre_orig.filename, pre_test.filename)
    self.assertEqual(pre_orig.window, pre_test.window)
    self.assertEqual(pre_orig.get_targets(), pre_test.get_targets())
    # Check numeric attributes
    self.assertTrue(np.all(np.isclose(pre_orig.x_train, pre_test.x_train)))
    self.assertTrue(np.all(np.isclose(pre_orig.y_train, pre_test.y_train)))
    self.assertTrue(np.all(np.isclose(pre_orig.x_test, pre_test.x_test)))
    self.assertTrue(np.all(np.isclose(pre_orig.y_test, pre_test.y_test)))
    # Check statistics - order
    self.assertTrue(
        np.all(pre_test.statistics.get_second_df().index ==
               pre_orig.statistics.get_second_df().index))
    self.assertTrue(
        np.all(pre_test.statistics.get_second_df().columns ==
               pre_orig.statistics.get_second_df().columns))
    # Check statistics - values
    self.assertTrue(
        np.all(
            np.isclose(pre_orig.statistics.get_first_prob(),
                       pre_test.statistics.get_first_prob())))
    for test_letter in pp.LETTERS:
        self.assertTrue(
            np.all(
                np.isclose(
                    pre_orig.statistics.get_second_prob(test_letter),
                    pre_test.statistics.get_second_prob(test_letter))))
def __init__(self):
    self.EMBED_HIDDEN_SIZE = 50
    self.SENT_HIDDEN_SIZE = 100
    self.QUERY_HIDDEN_SIZE = 100
    self.BATCH_SIZE = 32
    self.EPOCHS = 5
    prep = pp.Preprocessor()
    self.x, self.tx, self.xq, self.txq, self.y, self.ty = prep.prepare_cmu_data(
        "../../Data/Question_Answer_Dataset_v1.2/S08")
    self.embedding_matrix = prep.generate_embedding_matrix(
        prep.word_idx, r"C:\Users\Anand Natu\Desktop\glove.6B")
    self.story_maxlen = prep.story_maxlen
    self.query_maxlen = prep.query_maxlen
def build_index():
    corpus_path = util.get_corpus_dir_path_from_args()
    preprocessor = preprocessing.Preprocessor(corpus_path)
    doc_to_terms: list[preprocessing.DocToTerms] = preprocessor.parse()
    indexer_ob = indexer.Indexer(doc_to_terms)
    inverted_index: dict[str, indexer.Posting] = indexer_ob.inverter_index()
    doc_id_name_index: dict[int, str] = indexer_ob.doc_id_to_doc_name_index()
    tf_idf_ranker = ranker.Ranker(inverted_index, doc_id_name_index)
    _tfidf = tf_idf_ranker.tfidf()
    print('Indexing completed..saving...')
    util.save_obj(doc_id_name_index, DOC_ID_NAME_INDEX_NAME)
    util.save_obj(inverted_index, INVERTED_INDEX_FILE_NAME)
    util.save_pandas_df_as_pickle(_tfidf, TFIDF_NAME_INDEX_NAME)
    print('Saved index for quick results for future queries')
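# Entry-point sketch (assumption: the corpus directory is passed on the command
# line and read inside util.get_corpus_dir_path_from_args, as build_index implies).
if __name__ == "__main__":
    build_index()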
def __init__(self):
    self.feature_list = []
    self.train_range = None
    self.test_range = None
    self.x_train = None
    self.y_train = None
    self.x_test = None
    self.y_test = None
    self.y_test_pred_prob = None
    self.y_test_pred = None
    self.y_var = "abandoned_y"
    self.has_prob = True
    self.preprocessor = preprocessing.Preprocessor()
    self.subsampled = False
    self.subsample_ratio = 0.0
    self.html_file = None
    self.hdf_file = '/mnt/data/infonavit/master_loan_features/master_loan_features_v4.h5'
import numpy as np
import std_models.vgg as vgg

IMG_SIZE = 150  # 224
CATEGORIES = ["Human", "NoHuman"]
DATADIR = "/mnt/HDD/Masterthesis/DB"
PARTIAL_LOAD = 0.1
PARTIAL_LOAD_STR = "01"
COLOR_MODE = pp.Colormode.GRAYSCALE  # pp.Colormode.RGB
PROCESSED_IMG_DIR = "../res/data/p{0}_s{1}_{2}".format(PARTIAL_LOAD_STR, str(IMG_SIZE), COLOR_MODE.value)

p = pp.Preprocessor(img_size=IMG_SIZE, categories=CATEGORIES, colormode=COLOR_MODE,
                    datadir=DATADIR, data_pattern=pp.DataPattern.X_X_Y_Y)
# x_train, x_test, y_train, y_test = p.run(partial_load=PARTIAL_LOAD)
# p.save(PROCESSED_IMG_DIR, (x_train, x_test, y_train, y_test))
x_train, x_test, y_train, y_test = p.load(PROCESSED_IMG_DIR)

# learner = vgg.VGGAdapter(version=vgg.VGGVersion.VGG_19, input_shape=x_train.shape[1:], output_shape=[0, 1])
# learner.model = learner.load("../res/models/model_002_001.model")
# model.train(x_train, y_train, x_val=None, y_val=None, validation_split=0.2, batch_size=2, epochs=2)
# model.evaluate(x_test, y_test)
# model.save()
model = learn.ImageClassifier(input_shape=x_train.shape[1:], model_path="../res/models/model_001_006.model")
# model.train(x_train, y_train, x_val=None, y_val=None, validation_split=0.2, batch_size=132, epochs=5)
# model.evaluate(x_test, y_test)
def setUp(self):
    """Load bible dataset for testing"""
    self.pre = pp.Preprocessor()
    self.pre.preprocess("input/testing.txt", window=3, shuffle=False)
sys.path.insert(0, unstableparserpath)
import parsing

# Get pipeline steps for full / partial execution as specified in config
steps = get_pipeline_steps(configmap)
parallel_steps = steps[0]
batching = steps[1]
rel_extraction = steps[2]

# Determine number of cores to use (based on config setting and availability)
cores = compute_cores(configmap)

# Batching and sentence segmentation
homedir = configmap.get('General', 'home')
batchgroupsfile = homedir + '/' + configmap.get('General', 'batch_groups_file')
logging.info('started batching: ' + str(datetime.now()))
if batching:
    preprocessor = pre.Preprocessor(configmap)
    preprocessor.batch_and_segment()
    # Split batches into groups according to number of cores available for parallelisation
    batchnamesfile = homedir + '/' + configmap.get('General', 'batches_file')
    batch_groups_list = hf.group_batches_for_parallel_processing(
        batchnamesfile, batchgroupsfile, cores)
else:
    # Read batch groups from file
    batch_groups_list = hf.read_group_batches(batchgroupsfile)
print(batch_groups_list)

# Implement pipeline steps for which parallelisation makes sense
for step in parallel_steps:
    # Set up a pool of workers
    pool = mp.Pool(processes=cores)
    process_batch_group_with_instance = partial(process_batch_group,
def cv_kfold(df, df_store, n_splits=10, test_size=42, with_pca=True):
    train = df.copy().iloc[::-1]
    train.Date = pd.to_datetime(train.Date)
    tscv = TimeSeriesSplit(n_splits=n_splits)
    reg_model = Regressor()
    rmse_scores = []
    r2_scores = []
    date_grouping = train.groupby(train.Date)['Store']
    date_list = [g[0] for g in list(date_grouping)[:]]
    for train_index, test_index in tscv.split(date_grouping):
        # Fixed test set cardinality (in number of days)
        train_index = np.append(
            train_index,
            list(range(len(train_index), 1 + int(test_index[-1] - test_size))))
        test_index = test_index[(1 + int(train_index[-1] - test_index[0])):]
        train_dates = [date_list[train_index[0]], date_list[train_index[-1]]]
        test_dates = [date_list[test_index[0]], date_list[test_index[-1]]]
        train_mask = (train.Date >= train_dates[0]) & (train.Date <= train_dates[1])
        test_mask = (train.Date >= test_dates[0]) & (train.Date <= test_dates[1])
        # Train and test sets
        X_train, y_train, X_PCA_train = pp.Preprocessor().transform(
            df_store, train.loc[train_mask])
        X_test, y_test, X_PCA_test = pp.Preprocessor().transform(
            df_store, train.loc[test_mask])
        if with_pca:
            X_train = X_PCA_train.copy()
            X_test = X_PCA_test.copy()
        # Dummy variables can induce differences in the schemas
        missing_test = set(X_train.columns) - set(X_test.columns)
        missing_train = set(X_test.columns) - set(X_train.columns)
        for c in missing_test:
            X_test[c] = 0
        for c in missing_train:
            X_train[c] = 0
        # Reorder to match column order in train and test
        X_test = X_test[X_train.columns]
        # Model fitting on training set
        train_model(reg_model, X_train, y_train)
        # Scoring on test set
        y_pred = reg_model.predict(X_test)
        rmse_scores.append(rmse(y_test, y_pred))
        r2_scores.append(r2(y_test, y_pred))
    # Final display
    for i in range(n_splits):
        print("FOLD " + str(i + 1) + ": " + "RMSE = " + str(rmse_scores[i]) +
              " | R² = " + str(r2_scores[i]))
    results = {}
    results['RMSE'] = rmse_scores
    results['R2'] = r2_scores
    # Overall scores
    w = [1 + 0.5 * i for i in range(1, n_splits + 1)]
    print("--- OVERALL ---")
    print("RMSE = " + '{0:.2f}'.format(np.average(rmse_scores, weights=w)) +
          " | R² = " + '{0:.2f}'.format(np.average(r2_scores, weights=w)))
    return results
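# Example call (a sketch mirroring the cv_1fold example above; CSV paths are hypothetical).
if __name__ == "__main__":
    import pandas as pd

    sales_df = pd.read_csv("data/train.csv")   # hypothetical path
    store_df = pd.read_csv("data/store.csv")   # hypothetical path
    results = cv_kfold(sales_df, store_df, n_splits=5, test_size=42, with_pca=True)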
# Train the model.
import models
import preprocessing
import numpy as np
import joblib

# Preprocess and split the data
pre = preprocessing.Preprocessor()
try:
    mfcc_x, mfcc_y, ft_x, ft_y = joblib.load('dataset/data.joblib')
except:
    raise Exception('Please run embedding.py first.')
y = ft_y.reshape((-1))
x = np.array(list(zip(mfcc_x, ft_x)))
xtrain, xtest, ytrain, ytest = pre.preprocess(xydata=[x, y])  # Preprocessing
mfcc_xtrain, ft_xtrain = xtrain[:, 0], xtrain[:, 1]
mfcc_xtest, ft_xtest = xtest[:, 0], xtest[:, 1]
mfcc_xtrain = mfcc_xtrain.reshape((-1, 30, 100))
mfcc_xtest = mfcc_xtest.reshape((-1, 30, 100))
ft_xtrain = ft_xtrain.reshape((-1, 30, 100))
ft_xtest = ft_xtest.reshape((-1, 30, 100))

# Use class_weight to address class imbalance
class_weight = [(1 / pre.get_count(ytrain, [1, 0])) * len(ytrain) / 2.0,
                (1 / pre.get_count(ytrain, [0, 1])) * len(ytrain) / 2.0]

# Model definition
def setup_processor():
    macros = load_pokered_macros()
    processor = preprocessing.Preprocessor(config, macros)
    return processor
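# Usage sketch (assumption: `config` and load_pokered_macros() are defined at module
# level, as setup_processor implies; the call commented below is hypothetical since
# the Preprocessor API is not shown here).
processor = setup_processor()
# processor.preprocess(...)  # hypothetical call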