def test_base_comparison():
    """Run the base comparison experiment for all methods on the Helpdesk log.

    Each (method, setting) pair is trained and evaluated once; results that
    already exist on disk are skipped.  DIMAURO is handled separately because
    it is evaluated with k-fold validation instead of a single split.
    """
    tests = [
        ("LIN", setting.LIN),
        ("TAX", setting.TAX),
        ("CAMARGO", setting.CAMARGO),
        ("PASQUADIBISCEGLIE", setting.PASQUADIBISCEGLIE),
        ("SDL", setting.STANDARD),
        # NOTE(review): DBN reuses the CAMARGO setting here — confirm intended.
        ("DBN", setting.CAMARGO),
        ("TAYMOURI", setting.TAYMOURI),
    ]
    for method_name, s in tests:
        print("Test", method_name)
        d = get_data("Helpdesk")
        m = Methods.get_prediction_method(method_name)
        if method_name == "LIN":
            s.filter_cases = 3
        if result_exists("Helpdesk", m, s):
            continue
        d.prepare(s)
        r = m.test(m.train(d.train), d.test_orig)
        save_results(r, d.name, m.name, s)

    # Di Mauro k-fold test
    d = get_data("Helpdesk")
    m = Methods.get_prediction_method("DIMAURO")
    s = setting.DIMAURO
    if result_exists("Helpdesk", m, s):
        return
    d.prepare(s)
    r = m.k_fold_validation(d)
    save_results(r, d.name, m.name, s)
def Logistic_regression(modelpath=None):
    """Train (or load) a logistic-regression classifier and report test metrics.

    The decision threshold is chosen on the validation set via the PR curve,
    then applied to the test probabilities.  Prints precision/recall/F1/
    accuracy and AUC.

    Args:
        modelpath: optional path to a previously saved joblib model; when
            None, a new model is trained and dumped to ``params/LR/LR.model``.
    """
    train_data, train_flag, val_data, val_flag, test_data, test_flag = get_data()
    if modelpath is not None:
        lr = joblib.load(modelpath)
    else:
        lr = LogisticRegression(penalty='l2', solver='liblinear',
                                class_weight="balanced")
        lr.fit(train_data, train_flag)
        joblib.dump(lr, "params/LR/LR.model")

    # val: pick the operating threshold from the validation PR curve.
    val_output_prob = np.array(lr.predict_proba(val_data))[:, 1]
    _, _, threshold = Measure().get_pr_curve(val_flag, val_output_prob)

    # test: binarize the positive-class probabilities at that threshold.
    test_output_proba = np.array(lr.predict_proba(test_data))[:, 1]
    test_output = np.zeros(test_output_proba.shape)
    test_output[test_output_proba > threshold] = 1

    precision = Measure().Precision(test_flag, test_output)
    recall = Measure().Recall(test_flag, test_output)
    f1 = Measure().F1_score(test_flag, test_output)
    acc = Measure().Accuracy(test_flag, test_output)
    # Single-argument print(...) form is valid under both Python 2 and 3
    # (the original py2-only print statements broke on Python 3).
    print("precision:%.2f\nrecall:%.2f\nf1:%.2f\nacc:%.2f\n"
          % (precision, recall, f1, acc))
    print("auc:%.2f" % roc_auc_score(test_flag, test_output_proba))
def SVM(modelpath=None):  #modelpath="params/SVM/svm_balanced.model"):
    """Train (or load) an RBF-kernel SVM and report test metrics.

    Training uses only the first 40000 samples (SVC training is quadratic in
    the number of samples).  The decision threshold is chosen on the
    validation PR curve, then applied to the test probabilities.

    Args:
        modelpath: optional path to a previously saved joblib model; when
            None, a new model is trained and dumped to
            ``params/SVM/svm_balanced.model``.
    """
    train_data, train_flag, val_data, val_flag, test_data, test_flag = get_data()
    train_data = train_data[:40000]
    train_flag = train_flag[:40000]
    if modelpath is not None:
        svm_ = joblib.load(modelpath)
    else:
        svm_ = SVC(kernel='rbf', probability=True)
        svm_.fit(train_data, train_flag)
        joblib.dump(svm_, "params/SVM/svm_balanced.model")

    #val
    val_output_prob = np.array(svm_.predict_proba(val_data))[:, 1]
    _, _, threshold = Measure().get_pr_curve(val_flag, val_output_prob)

    #test
    test_output_prob = np.array(svm_.predict_proba(test_data))[:, 1]
    test_output = np.zeros(test_output_prob.shape)
    test_output[test_output_prob > threshold] = 1

    precision = Measure().Precision(test_flag, test_output)
    recall = Measure().Recall(test_flag, test_output)
    f1 = Measure().F1_score(test_flag, test_output)
    acc = Measure().Accuracy(test_flag, test_output)
    # Single-argument print(...) form is valid under both Python 2 and 3
    # (the original py2-only print statements broke on Python 3).
    print("model:SVM:\nprecision:%.2f\nrecall:%.2f\nf1:%.2f\nacc:%.2f\n"
          % (precision, recall, f1, acc))
    print("auc:%.2f" % roc_auc_score(test_flag, test_output_prob))
def test_standard(dataset, m):
    """Train and evaluate method `m` on `dataset` using the STANDARD setting.

    Skips the run entirely if a result file already exists.
    """
    event_data = get_data(dataset)
    print(get_full_filename(dataset, m, setting.STANDARD))
    if result_exists(dataset, m, setting.STANDARD):
        return
    event_data.prepare(setting.STANDARD)
    model = m.train(event_data.train)
    results = m.test(model, event_data.test_orig)
    save_results(results, event_data.name, m.name, setting.STANDARD)
def test_stability():
    """Train/evaluate every method 10 times on Helpdesk and print the accuracies.

    Measures run-to-run variability: the data split is fixed, only training
    is repeated.  One tab-separated line of accuracies is printed per method.
    """
    method_names = ["SDL", "CAMARGO", "DIMAURO", "LIN",
                    "PASQUADIBISCEGLIE", "TAX", "TAYMOURI"]
    d = get_data("Helpdesk")
    d.prepare(setting.STANDARD)

    results = {}
    for method_name in method_names:
        m = Methods.get_prediction_method(method_name)
        accuracies = []
        for _ in range(10):
            r = m.test(m.train(d.train), d.test_orig)
            accuracies.append(ACCURACY.calculate(r))
        results[method_name] = accuracies

    for name in results:
        print(name, "\t".join(str(a) for a in results[name]))
def test_model(inputx=Rx, inputy=Ry, error=rnn_Error, steps=20, testdir='test'):
    """Evaluate the RNN error op over `steps` batches from `testdir`.

    Prints each batch error and the average over all batches.  Relies on the
    module-level TF session `sess` and batching globals (`max_t`, `b_size`,
    `stack`).
    """
    testx, testy = get_data(testdir)
    testy = testy.reshape((-1, 1))
    testx /= 255.0  # scale pixel values to [0, 1]

    avg = 0
    for step in range(steps):
        bx, by = rnn_batchdata(testx, testy, max_t, b_size, step, 84, 84, stack)
        batch_error = sess.run(error, {inputx: bx, inputy: by})
        print(batch_error)
        avg += batch_error / steps
    print('avg {0}'.format(avg))
def test_end_event(dataset, m):
    """Evaluate `m` on `dataset` both with and without an artificial end event.

    Mutates a copy of the STANDARD setting so the global default is untouched.
    Existing results are skipped.
    """
    basic_setting = copy(setting.STANDARD)
    for end_event in (True, False):
        d = get_data(dataset)
        basic_setting.add_end = end_event
        print(get_full_filename(dataset, m, basic_setting))
        if result_exists(dataset, m, basic_setting):
            continue
        d.prepare(basic_setting)
        model = m.train(d.train)
        r = m.test(model, d.test_orig)
        save_results(r, d.name, m.name, basic_setting)
def test_split_cases(dataset, m):
    """Evaluate `m` on `dataset` with case-splitting enabled and disabled.

    Mutates a copy of the STANDARD setting so the global default is untouched.
    Existing results are skipped.
    """
    basic_setting = copy(setting.STANDARD)
    for split_case in (True, False):
        d = get_data(dataset)
        basic_setting.split_cases = split_case
        print(get_full_filename(dataset, m, basic_setting))
        if result_exists(dataset, m, basic_setting):
            continue
        d.prepare(basic_setting)
        model = m.train(d.train)
        r = m.test(model, d.test_orig)
        save_results(r, d.name, m.name, basic_setting)
def test_percentage(dataset, m):
    """Evaluate `m` on `dataset` for several train-set percentages.

    Tries 60/66/70/80% training splits on a copy of the STANDARD setting.
    Existing results are skipped.
    """
    basic_setting = copy(setting.STANDARD)
    for percentage in (60, 66, 70, 80):
        d = get_data(dataset)
        basic_setting.train_percentage = percentage
        print(get_full_filename(dataset, m, basic_setting))
        if result_exists(dataset, m, basic_setting):
            continue
        d.prepare(basic_setting)
        model = m.train(d.train)
        r = m.test(model, d.test_orig)
        save_results(r, d.name, m.name, basic_setting)
def ranking_experiments():
    """Run every method under every setting on BPIC11 and save the results.

    A failure in one method/setting combination is logged via traceback and
    does not abort the remaining experiments.
    """
    for d in ["BPIC11"]:
        for s in ALL_SETTINGS:
            event_data = get_data(d)
            event_data.prepare(s)
            for method in ALL_METHODS:
                try:
                    m = Methods.get_prediction_method(method)
                    if result_exists(d, m, s):
                        continue
                    if s.train_split == "k-fold":
                        r = m.k_fold_validation(event_data)
                    else:
                        r = m.test(m.train(event_data.train),
                                   event_data.test_orig)
                    save_results(r, d, m.name, s)
                # Bug fix: the original bare `except:` also swallowed
                # KeyboardInterrupt/SystemExit, making the loop unstoppable.
                except Exception:
                    traceback.print_exc()
def test_filter(dataset, m):
    """Evaluate `m` on `dataset` with and without case filtering.

    Tries no filter and a filter of 5 (3 for Helpdesk, which is smaller) on a
    copy of the STANDARD setting.  Existing results are skipped.
    """
    basic_setting = copy(setting.STANDARD)
    # Renamed the loop variable: the original `filter` shadowed the builtin.
    for filter_cases in [None, 5]:
        d = get_data(dataset)
        basic_setting.filter_cases = filter_cases
        if filter_cases == 5 and dataset == "Helpdesk":
            basic_setting.filter_cases = 3
        print(get_full_filename(dataset, m, basic_setting))
        if result_exists(dataset, m, basic_setting):
            continue
        d.prepare(basic_setting)
        r = m.test(m.train(d.train), d.test_orig)
        save_results(r, d.name, m.name, basic_setting)
def test_k(dataset, m):
    """Evaluate `m` on `dataset` with k-fold validation (k = 10).

    Existing results are skipped.  A failed validation run is reported and
    skipped instead of crashing.
    """
    basic_setting = copy(setting.STANDARD)
    basic_setting.train_split = "k-fold"
    for k in [10]:
        d = get_data(dataset)
        basic_setting.train_k = k
        print(get_full_filename(dataset, m, basic_setting))
        if result_exists(dataset, m, basic_setting):
            continue
        d.prepare(basic_setting)
        try:
            r = m.k_fold_validation(d)
        except Exception as exc:
            # Bug fix: the original `except: pass` left `r` unbound and then
            # crashed with a NameError in save_results; skip saving instead.
            print("k-fold validation failed for", dataset, ":", exc)
            continue
        save_results(r, d.name, m.name, basic_setting)
def Decision_tree():
    """Train a depth-6 balanced decision tree and report test metrics.

    The decision threshold is chosen on the validation PR curve, then applied
    to the test probabilities.  Prints precision/recall/F1/accuracy and AUC.
    """
    train_data, train_flag, val_data, val_flag, test_data, test_flag = get_data()
    dt = tree.DecisionTreeClassifier(max_depth=6, class_weight="balanced")
    dt = dt.fit(train_data, train_flag)

    # val: pick the operating threshold from the validation PR curve.
    val_output_prob = np.array(dt.predict_proba(val_data))[:, 1]
    _, _, threshold = Measure().get_pr_curve(val_flag, val_output_prob)

    # test: binarize the positive-class probabilities at that threshold.
    test_output_prob = np.array(dt.predict_proba(test_data))[:, 1]
    test_output = np.zeros(test_output_prob.shape)
    test_output[test_output_prob > threshold] = 1

    precision = Measure().Precision(test_flag, test_output)
    recall = Measure().Recall(test_flag, test_output)
    f1 = Measure().F1_score(test_flag, test_output)
    acc = Measure().Accuracy(test_flag, test_output)
    # Single-argument print(...) form is valid under both Python 2 and 3
    # (the original py2-only print statements broke on Python 3).
    print("precision:%.2f\nrecall:%.2f\nf1:%.2f\nacc:%.2f\n"
          % (precision, recall, f1, acc))
    print("auc:%.2f" % roc_auc_score(test_flag, test_output_prob))
def test_split(dataset, m):
    """Evaluate `m` on `dataset` for each train/test split strategy.

    Tries train-test, test-train, random, and k-fold (k = 3) splits on a copy
    of the STANDARD setting.  Existing results are skipped.
    """
    basic_setting = copy(setting.STANDARD)
    for split in ("train-test", "test-train", "random", "k-fold"):
        d = get_data(dataset)
        basic_setting.train_split = split
        if split == "k-fold":
            basic_setting.train_k = 3
        print(get_full_filename(dataset, m, basic_setting))
        if result_exists(dataset, m, basic_setting):
            continue
        d.prepare(basic_setting)
        if split == "k-fold":
            r = m.k_fold_validation(d)
        else:
            r = m.test(m.train(d.train), d.test_orig)
        save_results(r, d.name, m.name, basic_setting)
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 28 05:28:18 2018

@author: Pranjal

Grid-search the LightGBM `max_depth` hyper-parameter with 2-fold CV,
scored by negative mean absolute error.
"""
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from Data import get_data

x_train, x_test, y_train, y_test, submit_Id, submit_x = get_data()

lg = lgb.LGBMRegressor(silent=False, learning_rate=0.1, n_estimators=200,
                       num_leaves=300)
param_dist = {"max_depth": [15, 16, 18]}
grid_search = GridSearchCV(lg, n_jobs=-1, param_grid=param_dist, cv=2,
                           scoring="neg_mean_absolute_error", verbose=5)
grid_search.fit(x_train, y_train)
# Bug fix: the original evaluated this attribute and discarded the result;
# print it so the selected estimator is actually reported.
print(grid_search.best_estimator_)
def main(argv):
    """Evaluate a previously trained next-event prediction model.

    Expects ``argv = [METHOD, DATA, (optional method-specific argument)]``.
    Loads the saved model for the chosen method, evaluates it on the prepared
    data, appends the accuracy to ``results_suffix.log`` and records start/end
    timings in ``timings_next_event.log``.
    """
    from Data import get_data
    from Predictions.setting import Setting
    from Methods import get_prediction_method
    from Predictions.metric import ACCURACY

    if len(argv) < 2:
        print("Missing arguments, expected: METHOD and DATA")
        return

    method = argv[0]
    data = argv[1]

    ###
    # Load data, setting and method
    ###
    basic_setting = Setting(None, "test-train", False, True, 70, filter_cases=5)
    m = get_prediction_method(method)
    d = get_data(data)
    model_folder = os.path.join(OUTPUT_FOLDER, str.lower(d.name), "models",
                                str.lower(method))

    ###
    # Check if all required folders exist
    ###
    if not os.path.exists(OUTPUT_FOLDER):
        os.mkdir(OUTPUT_FOLDER)
    # NOTE(review): model_folder itself is assumed to already exist (the
    # training script creates it); this script only writes into it.

    # Perform some method specific checks
    if method == "DBN":
        if len(argv) >= 3:
            basic_setting.prefixsize = int(argv[2])
        else:
            basic_setting.prefixsize = 2
    elif method == "CAMARGO":
        if len(argv) < 3:
            print(
                "Please indicate the architecture to use: shared_cat or specialized"
            )
            return
        architecture = str.lower(argv[2])
        m.def_params["model_type"] = architecture
        basic_setting.prefixsize = 5
    elif method == "LIN":
        # Bug fix: the original assigned prefixsize = 5 and immediately
        # overwrote it with 2; the dead first assignment is removed.
        # NOTE(review): the companion training script uses prefixsize = 5 for
        # LIN — confirm which value is intended.
        basic_setting.prefixsize = 2

    # Perform some data specific checks
    if data == "Helpdesk":
        basic_setting.filter_cases = 3

    d.prepare(basic_setting)

    ###
    # Register Start time
    ###
    start_time = time.mktime(time.localtime())
    start_time_str = time.strftime("%d-%m-%y %H:%M:%S", time.localtime())
    time_output = open(os.path.join(model_folder, "timings_next_event.log"), 'a')
    time_output.write("Starting time: %s\n" % start_time_str)

    try:
        ###
        # Execute chosen method
        ###
        print("EXPERIMENT NEXT ACTIVITY PREDICTION:", argv)

        # Load model
        if method == "DBN":
            model_file = os.path.join(model_folder, "model")
            with open(model_file, "rb") as pickle_file:
                model = pickle.load(pickle_file)
        else:
            from tensorflow.python.keras.models import load_model
            if method == "LIN":
                import Methods
                # LIN needs its custom Modulator layer registered to load.
                model = load_model(
                    os.path.join(model_folder, "model.h5"),
                    custom_objects={"Modulator": Methods.Lin.Modulator.Modulator})
            else:
                model = load_model(os.path.join(model_folder, "model.h5"))

        # Evaluate model and calculate accuracy
        if method == "DBN":
            acc = test_edbn(model, d)
        elif method == "CAMARGO":
            acc = test_camargo(model, d)
        elif method == "LIN":
            acc = test_lin(model, d)
        elif method == "DIMAURO":
            acc = test_dimauro(model, d)
        elif method == "TAX":
            acc = test_tax(model, d)

        with open(os.path.join(model_folder, "results_suffix.log"), "a") as fout:
            fout.write("Accuracy: (%s) %s\n"
                       % (time.strftime("%d-%m-%y %H:%M:%S", time.localtime()),
                          acc))

        ###
        # Register End time
        ###
        current_time = time.mktime(time.localtime())
        current_time_str = time.strftime("%d-%m-%y %H:%M:%S", time.localtime())
        time_output.write("End time: %s\n" % current_time_str)
        time_output.write("Duration: %fs\n\n" % (current_time - start_time))
    finally:
        # Bug fix: the timing-log handle was never closed in the original.
        time_output.close()
def get_data(self):
    # Thin delegating accessor: despite the shared name, the call below
    # resolves to the module-level get_data() (presumably the imported
    # data-loading helper), not to this method — so this is not recursion.
    # NOTE(review): confirm the module-level name is the intended target.
    return get_data()
    # Tail of the enclosing model-builder function (its `def` lies above this
    # chunk): assemble and compile the network from the in/out tensors.
    model = Model(inputs=[inputs], outputs=[outputs])
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=[mean_iou])
    model.summary()
    return model


if __name__ == "__main__":
    # Train a 32x32 single-channel U-Net detector with early stopping and
    # best-model checkpointing.
    h, w, ch = 32, 32, 1
    size = None  # None -> load the full dataset (no length cap)
    X_train, Y_train = get_data("../../../data/detector/", length=size,
                                size=(h, w, ch))
    model = model_Unet(h, w, ch)
    earlystopper = EarlyStopping(patience=10, verbose=1)
    checkpointer = ModelCheckpoint('Unet(32x32).h5', verbose=1,
                                   save_best_only=True)
    model.fit(X_train, Y_train, validation_split=0.1, batch_size=16,
              epochs=50, callbacks=[earlystopper, checkpointer])
# Build the RNN regression head (TF1 graph mode) and train it.
rnn_outputs, rnn_states = tf.nn.dynamic_rnn(lstm_cell, rconv_flat,
                                            dtype=tf.float32)
rdense = tf.layers.dense(rnn_outputs, units=1)
rprediction = tf.nn.tanh(rdense)
rnn_Error = tf.losses.mean_squared_error(labels=Ry, predictions=rprediction)
# NOTE(review): beta1=0.99 / beta2=0.9999 differ from Adam's defaults
# (0.9 / 0.999) — confirm these are intentional.
roptimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.99, beta2=0.9999)
# Bug fix: the original minimized with `optimizer`, not the `roptimizer`
# defined just above for this head.
train_rnn = roptimizer.minimize(rnn_Error)

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

print('loading data..')
train_x, train_y = get_data('train', stacked=stack)
train_x /= 255.0  # scale pixel values to [0, 1]
train_y = train_y.reshape(-1, 1)

print('begin training for {0} steps..'.format(n_steps))
aloss = 0
for step in range(n_steps):
    # NOTE(review): `b` is computed but never used; kept for parity.
    b = (step * b_size) % train_x.shape[0]
    tx, ty = rnn_batchdata(train_x, train_y, max_t, b_size, step, 84, 84,
                           stack)
    _, l = sess.run([train_rnn, rnn_Error], {Rx: tx, Ry: ty})
    aloss += l * 0.05  # running (scaled) loss accumulator; never reset
    if (step % 20 == 0):
        print('step {0} | avg loss {1}'.format(step, round(aloss, 3)))
"""
Created on Mon Sep 23 15:38:36 2019

@author: chetanjawlae

Sales forecasting data processing: compute the final unit amount and the
contract duration in days/years.
"""
# Sales Forecasting Data Processing
#%%
# Imports
import pandas as pd
import numpy as np
import re

# FUNCTION IMPORTS
from Data import get_data

#%%
df = get_data()

# AVERAGE UNIT PRICE For Month and Year
df.UnitPrice = df.UnitPrice.astype(float)
df.DiscountAmount = df.DiscountAmount.astype(float)
df.MarginAmount = df.MarginAmount.astype(float)
df['final_amount'] = df.UnitPrice - df.DiscountAmount - df.MarginAmount

# Contract duration.  Bug fix: the original parsed the day count out of each
# Timedelta's *string representation* with a regex (fragile, breaks on any
# repr change); Series.dt.days yields the same integer day count directly.
contract_period = df.ContractTo - df.ContractFrom
df['contract_period_days'] = contract_period.dt.days.astype(int)
df['contract_period_years'] = round(df['contract_period_days'] / 365)

#Correlation = df.DiscountAmount.apply(lambda x : pd.factorize(x)[0]).corr(method='pearson', min_periods=1)
def main(argv):
    """Train and save a next-event prediction model.

    Expects ``argv = [METHOD, DATA, (optional method-specific argument)]``.
    Creates the output folder hierarchy if needed, trains the chosen method on
    the prepared data, saves the model (pickle for DBN, ``.h5`` otherwise) and
    records start/end timings in ``timings_train.log``.
    """
    from Predictions.setting import Setting
    from Methods import get_prediction_method
    from Data import get_data

    if len(argv) < 2:
        print("Missing arguments, expected: METHOD and DATA")
        return

    method = argv[0]
    data = argv[1]

    ###
    # Load data, setting and method
    ###
    basic_setting = Setting(None, "test-train", False, True, 70, filter_cases=5)
    m = get_prediction_method(method)
    d = get_data(data)
    model_folder = os.path.join(OUTPUT_FOLDER, str.lower(d.name), "models",
                                str.lower(method))

    ###
    # Check if all required folders exist
    ###
    if not os.path.exists(OUTPUT_FOLDER):
        os.mkdir(OUTPUT_FOLDER)
    if not os.path.exists(os.path.join(OUTPUT_FOLDER, str.lower(d.name))):
        os.mkdir(os.path.join(OUTPUT_FOLDER, str.lower(d.name)))
    if not os.path.exists(
            os.path.join(OUTPUT_FOLDER, str.lower(d.name), "models")):
        os.mkdir(os.path.join(OUTPUT_FOLDER, str.lower(d.name), "models"))
    if not os.path.exists(model_folder):
        os.mkdir(model_folder)

    # Perform some method specific checks
    if method == "DBN":
        if len(argv) >= 3:
            basic_setting.prefixsize = int(argv[2])
        else:
            basic_setting.prefixsize = 2
    elif method == "CAMARGO":
        if len(argv) < 3:
            print(
                "Please indicate the architecture to use: shared_cat or specialized"
            )
            return
        architecture = str.lower(argv[2])
        m.def_params["model_type"] = architecture
        basic_setting.prefixsize = 5
    elif method == "LIN":
        basic_setting.prefixsize = 5

    # Perform some data specific checks
    if data == "Helpdesk":
        basic_setting.filter_cases = 3

    d.prepare(basic_setting)

    ###
    # Register Start time
    ###
    start_time = time.mktime(time.localtime())
    start_time_str = time.strftime("%d-%m-%y %H:%M:%S", time.localtime())
    time_output = open(os.path.join(model_folder, "timings_train.log"), 'a')
    time_output.write("Starting time: %s\n" % start_time_str)

    try:
        ###
        # Train and save model using chosen data and method
        ###
        print("EXPERIMENT TRAINING MODEL:", argv)

        # Train model
        model = m.train(d.train)

        # Save model
        if method == "DBN":
            with open(os.path.join(model_folder, "model"), "wb") as pickle_file:
                pickle.dump(model, pickle_file)
        else:
            model.save(os.path.join(model_folder, "model.h5"))

        ###
        # Register End time
        ###
        current_time = time.mktime(time.localtime())
        current_time_str = time.strftime("%d-%m-%y %H:%M:%S", time.localtime())
        time_output.write("End time: %s\n" % current_time_str)
        time_output.write("Duration: %fs\n\n" % (current_time - start_time))
    finally:
        # Bug fix: the timing-log handle was never closed in the original.
        time_output.close()