Exemple #1
0
def test_base_comparison():
    """Run the base comparison experiment for every method on the Helpdesk log."""
    # (method name, setting) pairs to evaluate; DBN reuses the CAMARGO setting.
    configs = [("LIN", setting.LIN), ("TAX", setting.TAX), ("CAMARGO", setting.CAMARGO),
               ("PASQUADIBISCEGLIE", setting.PASQUADIBISCEGLIE), ("SDL", setting.STANDARD),
               ("DBN", setting.CAMARGO), ("TAYMOURI", setting.TAYMOURI)]

    for name, cfg in configs:
        print("Test", name)
        data = get_data("Helpdesk")
        method = Methods.get_prediction_method(name)
        if name == "LIN":
            cfg.filter_cases = 3

        if result_exists("Helpdesk", method, cfg):
            continue

        data.prepare(cfg)

        predictions = method.test(method.train(data.train), data.test_orig)

        save_results(predictions, data.name, method.name, cfg)

    # Di Mauro is evaluated with k-fold validation instead of a single split.
    data = get_data("Helpdesk")
    method = Methods.get_prediction_method("DIMAURO")
    cfg = setting.DIMAURO

    if result_exists("Helpdesk", method, cfg):
        return

    data.prepare(cfg)

    predictions = method.k_fold_validation(data)
    save_results(predictions, data.name, method.name, cfg)
def Logistic_regression(modelpath=None):
    """Train (or load) an L2 logistic-regression classifier and report test metrics.

    Parameters:
        modelpath: optional path to a previously saved joblib model; when
            given, the model is loaded instead of retrained.
    """
    train_data, train_flag, val_data, val_flag, test_data, test_flag = get_data(
    )
    if modelpath is not None:
        lr = joblib.load(modelpath)
    else:
        lr = LogisticRegression(penalty='l2',
                                solver='liblinear',
                                class_weight="balanced")
        lr.fit(train_data, train_flag)
        joblib.dump(lr, "params/LR/LR.model")

    # Pick the decision threshold from the precision/recall curve on validation.
    val_output_prob = lr.predict_proba(val_data)
    val_output_prob = np.array(val_output_prob)[:, 1]
    _, _, thresold = Measure().get_pr_curve(val_flag, val_output_prob)

    test_output_proba = lr.predict_proba(test_data)
    test_output_proba = np.array(test_output_proba)[:, 1]
    test_output = np.zeros(test_output_proba.shape)
    test_output[test_output_proba > thresold] = 1

    precision = Measure().Precision(test_flag, test_output)
    recall = Measure().Recall(test_flag, test_output)
    f1 = Measure().F1_score(test_flag, test_output)
    acc = Measure().Accuracy(test_flag, test_output)
    # Converted from Python 2 print statements so the file parses under Python 3,
    # consistent with the print() calls used elsewhere in this file.
    print("precision:%.2f\nrecall:%.2f\nf1:%.2f\nacc:%.2f\n" % (
        precision, recall, f1, acc))
    print("auc:%.2f" % roc_auc_score(test_flag, test_output_proba))
Exemple #3
0
def SVM(modelpath=None):  #modelpath="params/SVM/svm_balanced.model"):
    """Train (or load) an RBF-kernel SVM and report test-set metrics.

    Parameters:
        modelpath: optional path to a previously saved joblib model; when
            given, the model is loaded instead of retrained.
    """
    train_data, train_flag, val_data, val_flag, test_data, test_flag = get_data(
    )
    # Cap training samples at 40k -- presumably to bound SVC training cost.
    # TODO confirm the cap is intentional.
    train_data = train_data[:40000]
    train_flag = train_flag[:40000]
    if modelpath is not None:
        svm_ = joblib.load(modelpath)
    else:
        svm_ = SVC(kernel='rbf', probability=True)
        svm_.fit(train_data, train_flag)
        joblib.dump(svm_, "params/SVM/svm_balanced.model")

    # Pick the decision threshold from the precision/recall curve on validation.
    val_output_prob = svm_.predict_proba(val_data)
    val_output_prob = np.array(val_output_prob)[:, 1]
    _, _, thresold = Measure().get_pr_curve(val_flag, val_output_prob)

    # test
    test_output_prob = svm_.predict_proba(test_data)
    test_output_prob = np.array(test_output_prob)[:, 1]
    test_output = np.zeros(test_output_prob.shape)
    test_output[test_output_prob > thresold] = 1

    precision = Measure().Precision(test_flag, test_output)
    recall = Measure().Recall(test_flag, test_output)
    f1 = Measure().F1_score(test_flag, test_output)
    acc = Measure().Accuracy(test_flag, test_output)
    # Converted from Python 2 print statements so the file parses under Python 3.
    print("model:SVM:\nprecision:%.2f\nrecall:%.2f\nf1:%.2f\nacc:%.2f\n" % (
        precision, recall, f1, acc))
    print("auc:%.2f" % roc_auc_score(test_flag, test_output_prob))
Exemple #4
0
def test_standard(dataset, m):
    """Evaluate method *m* on *dataset* under the STANDARD setting."""
    event_data = get_data(dataset)

    print(get_full_filename(dataset, m, setting.STANDARD))
    if result_exists(dataset, m, setting.STANDARD):
        return

    event_data.prepare(setting.STANDARD)
    predictions = m.test(m.train(event_data.train), event_data.test_orig)
    save_results(predictions, event_data.name, m.name, setting.STANDARD)
Exemple #5
0
def test_stability():
    """Train/test each method 10 times on Helpdesk and print per-run accuracies."""
    results = {}
    d = get_data("Helpdesk")
    d.prepare(setting.STANDARD)

    for method_name in ["SDL", "CAMARGO", "DIMAURO", "LIN", "PASQUADIBISCEGLIE", "TAX", "TAYMOURI"]:
        method = Methods.get_prediction_method(method_name)
        accuracies = []
        for _ in range(10):
            predictions = method.test(method.train(d.train), d.test_orig)
            accuracies.append(ACCURACY.calculate(predictions))
        results[method_name] = accuracies

    # One line per method: name followed by the tab-separated accuracies.
    for name, accs in results.items():
        print(name, "\t".join(str(a) for a in accs))
def test_model(inputx=Rx,
               inputy=Ry,
               error=rnn_Error,
               steps=20,
               testdir='test'):
    """Average the RNN error tensor over *steps* batches from *testdir*."""
    testx, testy = get_data(testdir)
    testy = testy.reshape((-1, 1))
    testx /= 255.0  # scale pixel values to [0, 1]
    avg = 0
    for step in range(steps):
        batch_x, batch_y = rnn_batchdata(testx, testy, max_t, b_size, step,
                                         84, 84, stack)
        batch_err = sess.run(error, {inputx: batch_x, inputy: batch_y})
        print(batch_err)
        avg += batch_err / steps
    print('avg {0}'.format(avg))
Exemple #7
0
def test_end_event(dataset, m):
    """Evaluate *m* on *dataset* with and without an artificial end event."""
    basic_setting = copy(setting.STANDARD)

    for add_end in (True, False):
        d = get_data(dataset)
        basic_setting.add_end = add_end

        print(get_full_filename(dataset, m, basic_setting))
        if result_exists(dataset, m, basic_setting):
            continue

        d.prepare(basic_setting)
        predictions = m.test(m.train(d.train), d.test_orig)
        save_results(predictions, d.name, m.name, basic_setting)
Exemple #8
0
def test_split_cases(dataset, m):
    """Evaluate *m* on *dataset* with case splitting enabled and disabled."""
    basic_setting = copy(setting.STANDARD)

    for use_split in (True, False):
        d = get_data(dataset)
        basic_setting.split_cases = use_split

        print(get_full_filename(dataset, m, basic_setting))
        if result_exists(dataset, m, basic_setting):
            continue

        d.prepare(basic_setting)
        predictions = m.test(m.train(d.train), d.test_orig)
        save_results(predictions, d.name, m.name, basic_setting)
Exemple #9
0
def test_percentage(dataset, m):
    """Evaluate *m* on *dataset* for several training-set percentages."""
    basic_setting = copy(setting.STANDARD)

    for percentage in (60, 66, 70, 80):
        d = get_data(dataset)
        basic_setting.train_percentage = percentage

        print(get_full_filename(dataset, m, basic_setting))
        if result_exists(dataset, m, basic_setting):
            continue

        d.prepare(basic_setting)
        predictions = m.test(m.train(d.train), d.test_orig)
        save_results(predictions, d.name, m.name, basic_setting)
Exemple #10
0
def ranking_experiments():
    """Run every method under every setting on the BPIC11 log.

    Failures of individual methods are logged and do not abort the sweep.
    """
    for d in ["BPIC11"]:
        for s in ALL_SETTINGS:
            event_data = get_data(d)
            event_data.prepare(s)

            for method in ALL_METHODS:
                try:
                    m = Methods.get_prediction_method(method)
                    if result_exists(d, m, s):
                        continue
                    # k-fold settings use the dedicated validation entry point.
                    if s.train_split == "k-fold":
                        r = m.k_fold_validation(event_data)
                    else:
                        r = m.test(m.train(event_data.train), event_data.test_orig)
                    save_results(r, d, m.name, s)
                except Exception:
                    # Narrowed from a bare `except:` so KeyboardInterrupt and
                    # SystemExit still abort the sweep; other errors are
                    # logged and the next method is tried.
                    traceback.print_exc()
Exemple #11
0
def test_filter(dataset, m):
    """Evaluate *m* on *dataset* with and without case-length filtering.

    The Helpdesk dataset overrides the filter value of 5 with 3.
    """
    # Renamed from `filter`, which shadowed the builtin.
    filter_values = [None, 5]
    basic_setting = copy(setting.STANDARD)

    for filter_value in filter_values:
        d = get_data(dataset)
        basic_setting.filter_cases = filter_value
        if filter_value == 5 and dataset == "Helpdesk":
            basic_setting.filter_cases = 3

        print(get_full_filename(dataset, m, basic_setting))
        if result_exists(dataset, m, basic_setting):
            continue

        d.prepare(basic_setting)

        r = m.test(m.train(d.train), d.test_orig)

        save_results(r, d.name, m.name, basic_setting)
Exemple #12
0
def test_k(dataset, m):
    """Evaluate *m* on *dataset* using k-fold cross-validation for each k."""
    ks = [10]
    basic_setting = copy(setting.STANDARD)
    basic_setting.train_split = "k-fold"

    for k in ks:
        d = get_data(dataset)
        basic_setting.train_k = k

        print(get_full_filename(dataset, m, basic_setting))
        if result_exists(dataset, m, basic_setting):
            continue

        d.prepare(basic_setting)
        try:
            r = m.k_fold_validation(d)
        except Exception:
            # Previously `except: pass`, which left `r` unbound and crashed
            # with a NameError at save_results; log the failure and skip.
            traceback.print_exc()
            continue

        save_results(r, d.name, m.name, basic_setting)
def Decision_tree():
    """Train a depth-limited, class-balanced decision tree and report test metrics."""
    train_data, train_flag, val_data, val_flag, test_data, test_flag = get_data()
    dt = tree.DecisionTreeClassifier(max_depth=6, class_weight="balanced")
    dt = dt.fit(train_data, train_flag)

    # Pick the decision threshold from the precision/recall curve on validation.
    val_output_prob = dt.predict_proba(val_data)
    val_output_prob = np.array(val_output_prob)[:, 1]
    _, _, thresold = Measure().get_pr_curve(val_flag, val_output_prob)

    # test
    test_output_prob = dt.predict_proba(test_data)
    test_output_prob = np.array(test_output_prob)[:, 1]
    test_output = np.zeros(test_output_prob.shape)
    test_output[test_output_prob > thresold] = 1

    precision = Measure().Precision(test_flag, test_output)
    recall = Measure().Recall(test_flag, test_output)
    f1 = Measure().F1_score(test_flag, test_output)
    acc = Measure().Accuracy(test_flag, test_output)
    # Converted from Python 2 print statements so the file parses under Python 3.
    print("precision:%.2f\nrecall:%.2f\nf1:%.2f\nacc:%.2f\n" % (precision, recall, f1, acc))
    print("auc:%.2f" % roc_auc_score(test_flag, test_output_prob))
Exemple #14
0
def test_split(dataset, m):
    """Evaluate *m* on *dataset* under different train/test split strategies."""
    basic_setting = copy(setting.STANDARD)

    for split in ("train-test", "test-train", "random", "k-fold"):
        d = get_data(dataset)
        basic_setting.train_split = split
        if split == "k-fold":
            basic_setting.train_k = 3

        print(get_full_filename(dataset, m, basic_setting))
        if result_exists(dataset, m, basic_setting):
            continue

        d.prepare(basic_setting)

        # k-fold settings use the dedicated validation entry point.
        if split == "k-fold":
            predictions = m.k_fold_validation(d)
        else:
            predictions = m.test(m.train(d.train), d.test_orig)

        save_results(predictions, d.name, m.name, basic_setting)
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 28 05:28:18 2018

@author: Pranjal
"""

import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from Data import get_data

# Load the train/test split plus the submission identifiers and features.
x_train, x_test, y_train, y_test, submit_Id, submit_x = get_data()

# Base LightGBM regressor; max_depth is tuned by the grid search below.
lg = lgb.LGBMRegressor(silent=False,
                       learning_rate=0.1,
                       n_estimators=200,
                       num_leaves=300)

# Only max_depth is searched.
param_dist = {"max_depth": [15, 16, 18]}

# 2-fold CV grid search scored on negative mean absolute error, using all cores.
grid_search = GridSearchCV(lg,
                           n_jobs=-1,
                           param_grid=param_dist,
                           cv=2,
                           scoring="neg_mean_absolute_error",
                           verbose=5)

grid_search.fit(x_train, y_train)

# NOTE(review): bare attribute access has no effect outside an interactive
# session -- presumably meant to be printed or assigned; confirm intent.
grid_search.best_estimator_
def main(argv):
    """Evaluate a previously trained next-activity model for METHOD on DATA.

    Expected arguments: METHOD DATA [EXTRA], where EXTRA is the prefix size
    for DBN or the architecture (shared_cat/specialized) for CAMARGO.
    Appends the accuracy to results_suffix.log and wall-clock timings to
    timings_next_event.log inside the method's model folder.
    """
    from Data import get_data
    from Predictions.setting import Setting
    from Methods import get_prediction_method

    if len(argv) < 2:
        print("Missing arguments, expected: METHOD and DATA")
        return

    method = argv[0]
    data = argv[1]

    ###
    # Load data, setting and method
    ###
    basic_setting = Setting(None,
                            "test-train",
                            False,
                            True,
                            70,
                            filter_cases=5)
    m = get_prediction_method(method)
    d = get_data(data)

    model_folder = os.path.join(OUTPUT_FOLDER, str.lower(d.name), "models",
                                str.lower(method))

    ###
    # Check if all required folders exist.  The timing log below is opened
    # inside model_folder, so the whole path must be present (previously only
    # OUTPUT_FOLDER itself was created, which failed on a fresh run).
    ###
    os.makedirs(model_folder, exist_ok=True)

    # Perform some method specific checks
    if method == "DBN":
        # Prefix size may be given as the third argument.
        if len(argv) >= 3:
            basic_setting.prefixsize = int(argv[2])
        else:
            basic_setting.prefixsize = 2
    elif method == "CAMARGO":
        if len(argv) < 3:
            print(
                "Please indicate the architecture to use: shared_cat or specialized"
            )
            return
        architecture = str.lower(argv[2])
        m.def_params["model_type"] = architecture
        basic_setting.prefixsize = 5
    elif method == "LIN":
        basic_setting.prefixsize = 5

    # NOTE(review): an unconditional `basic_setting.prefixsize = 2` used to
    # follow here, clobbering the method-specific values above; removed as an
    # apparent debugging leftover (the matching training script has no such
    # override) -- confirm against trained-model prefix sizes.

    # Perform some data specific checks
    if data == "Helpdesk":
        basic_setting.filter_cases = 3

    d.prepare(basic_setting)

    ###
    # Register Start time
    ###
    start_time = time.mktime(time.localtime())
    start_time_str = time.strftime("%d-%m-%y %H:%M:%S", time.localtime())
    time_output = open(os.path.join(model_folder, "timings_next_event.log"),
                       'a')
    try:
        time_output.write("Starting time: %s\n" % start_time_str)

        ###
        # Execute chosen method
        ###
        print("EXPERIMENT NEXT ACTIVITY PREDICTION:", argv)
        # Load model: DBN models are plain pickles, the rest are Keras models.
        if method == "DBN":
            model_file = os.path.join(model_folder, "model")
            with open(model_file, "rb") as pickle_file:
                model = pickle.load(pickle_file)
        else:
            from tensorflow.python.keras.models import load_model

            if method == "LIN":
                import Methods

                # LIN needs its custom Modulator layer registered to deserialize.
                model = load_model(
                    os.path.join(model_folder, "model.h5"),
                    custom_objects={"Modulator": Methods.Lin.Modulator.Modulator})
            else:
                model = load_model(os.path.join(model_folder, "model.h5"))

        # Evaluate model and calculate accuracy
        if method == "DBN":
            acc = test_edbn(model, d)
        elif method == "CAMARGO":
            acc = test_camargo(model, d)
        elif method == "LIN":
            acc = test_lin(model, d)
        elif method == "DIMAURO":
            acc = test_dimauro(model, d)
        elif method == "TAX":
            acc = test_tax(model, d)
        else:
            # Previously `acc` stayed unbound for unknown methods and the
            # write below raised a NameError; fail with a clear message.
            raise ValueError("No evaluation available for method: %s" % method)

        with open(os.path.join(model_folder, "results_suffix.log"), "a") as fout:
            fout.write("Accuracy: (%s) %s\n" %
                       (time.strftime("%d-%m-%y %H:%M:%S", time.localtime()), acc))

        ###
        # Register End time
        ###
        current_time = time.mktime(time.localtime())
        current_time_str = time.strftime("%d-%m-%y %H:%M:%S", time.localtime())
        time_output.write("End time: %s\n" % current_time_str)
        time_output.write("Duration: %fs\n\n" % (current_time - start_time))
    finally:
        # The timing-log handle was previously leaked; always close it.
        time_output.close()
Exemple #17
0
 def get_data(self):
     # Delegates to the module-level get_data() helper (the global function,
     # not this method) and returns its result unchanged.
     return get_data()
    model = Model(inputs=[inputs], outputs=[outputs])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=[mean_iou])
    model.summary()

    return model


if __name__ == "__main__":
    # Input patch dimensions: 32x32 single-channel images.
    h, w, ch = 32, 32, 1
    size = None  # None presumably means "load the entire dataset" -- confirm

    X_train, Y_train = get_data("../../../data/detector/",
                                length=size,
                                size=(h, w, ch))

    model = model_Unet(h, w, ch)

    # Stop once validation stops improving for 10 epochs; keep only the best
    # checkpoint on disk.
    earlystopper = EarlyStopping(patience=10, verbose=1)
    checkpointer = ModelCheckpoint('Unet(32x32).h5',
                                   verbose=1,
                                   save_best_only=True)

    model.fit(X_train,
              Y_train,
              validation_split=0.1,
              batch_size=16,
              epochs=50,
              callbacks=[earlystopper, checkpointer])
# RNN head: unroll the LSTM over the flattened conv features.
rnn_outputs, rnn_states = tf.nn.dynamic_rnn(lstm_cell,
                                            rconv_flat,
                                            dtype=tf.float32)

# Single-unit dense readout squashed to [-1, 1].
rdense = tf.layers.dense(rnn_outputs, units=1)
rprediction = tf.nn.tanh(rdense)

rnn_Error = tf.losses.mean_squared_error(labels=Ry, predictions=rprediction)
roptimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.99, beta2=0.9999)
# Fixed: the minimize step previously referenced `optimizer`, while the
# `roptimizer` created on the line above was never used.
train_rnn = roptimizer.minimize(rnn_Error)

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

print('loading data..')
train_x, train_y = get_data('train', stacked=stack)

# Scale pixel values to [0, 1].
train_x /= 255.0

train_y = train_y.reshape(-1, 1)

print('begin training for {0} steps..'.format(n_steps))
aloss = 0
for step in range(n_steps):
    # NOTE(review): `b` is unused within this fragment -- confirm whether it
    # is needed further down the file before removing.
    b = (step * b_size) % train_x.shape[0]
    tx, ty = rnn_batchdata(train_x, train_y, max_t, b_size, step, 84, 84,
                           stack)
    _, l = sess.run([train_rnn, rnn_Error], {Rx: tx, Ry: ty})
    # Running loss accumulator (each batch contributes with weight 0.05).
    aloss += l * 0.05
    if (step % 20 == 0):
        print('step {0} | avg loss {1}'.format(step, round(aloss, 3)))
Created on Mon Sep 23 15:38:36 2019

@author: chetanjawlae
"""
# Sales Forecasting Data Processing

#%%
# Imports
import pandas as pd
import numpy as np
import re

# FUNCTION IMPORTS
from Data import get_data
#%%
df = get_data()

# AVERAGE UNIT PRICE For Month and Year
df.UnitPrice = df.UnitPrice.astype(float)
df.DiscountAmount = df.DiscountAmount.astype(float)
df.MarginAmount = df.MarginAmount.astype(float)

# Net amount after subtracting discount and margin from the unit price.
df['final_amount'] = df.UnitPrice - df.DiscountAmount - df.MarginAmount

# Contract duration in days.  Uses the timedelta `.dt.days` accessor instead
# of the old regex over str(timedelta) ("N days ..."), which was fragile for
# negative or sub-day periods and relied on int() stripping whitespace.
contract_period = df.ContractTo - df.ContractFrom
df['contract_period_days'] = contract_period.dt.days
df['contract_period_years'] = round(df['contract_period_days'] / 365)

#Correlation = df.DiscountAmount.apply(lambda x : pd.factorize(x)[0]).corr(method='pearson', min_periods=1)
Exemple #21
0
def main(argv):
    """Train a prediction model for METHOD on DATA and save it to disk.

    Expected arguments: METHOD DATA [EXTRA], where EXTRA is the prefix size
    for DBN or the architecture (shared_cat/specialized) for CAMARGO.
    Appends wall-clock timings to timings_train.log inside the model folder.
    """
    from Predictions.setting import Setting
    from Methods import get_prediction_method
    from Data import get_data

    if len(argv) < 2:
        print("Missing arguments, expected: METHOD and DATA")
        return

    method = argv[0]
    data = argv[1]

    ###
    # Load data, setting and method
    ###
    basic_setting = Setting(None,
                            "test-train",
                            False,
                            True,
                            70,
                            filter_cases=5)
    m = get_prediction_method(method)
    d = get_data(data)

    model_folder = os.path.join(OUTPUT_FOLDER, str.lower(d.name), "models",
                                str.lower(method))

    ###
    # Check if all required folders exist.  Replaces the level-by-level
    # exists/mkdir chain; creates every missing intermediate directory.
    ###
    os.makedirs(model_folder, exist_ok=True)

    # Perform some method specific checks
    if method == "DBN":
        # Prefix size may be given as the third argument.
        if len(argv) >= 3:
            basic_setting.prefixsize = int(argv[2])
        else:
            basic_setting.prefixsize = 2
    elif method == "CAMARGO":
        if len(argv) < 3:
            print(
                "Please indicate the architecture to use: shared_cat or specialized"
            )
            return
        architecture = str.lower(argv[2])
        m.def_params["model_type"] = architecture
        basic_setting.prefixsize = 5
    elif method == "LIN":
        basic_setting.prefixsize = 5

    # Perform some data specific checks
    if data == "Helpdesk":
        basic_setting.filter_cases = 3

    d.prepare(basic_setting)

    ###
    # Register Start time
    ###
    start_time = time.mktime(time.localtime())
    start_time_str = time.strftime("%d-%m-%y %H:%M:%S", time.localtime())
    time_output = open(os.path.join(model_folder, "timings_train.log"), 'a')
    try:
        time_output.write("Starting time: %s\n" % start_time_str)

        ###
        # Train and save model using chosen data and method
        ###
        print("EXPERIMENT TRAINING MODEL:", argv)
        # Train model
        model = m.train(d.train)

        # Save model: DBN models are plain pickles, the rest are Keras models.
        if method == "DBN":
            with open(os.path.join(model_folder, "model"), "wb") as pickle_file:
                pickle.dump(model, pickle_file)
        else:
            model.save(os.path.join(model_folder, "model.h5"))

        ###
        # Register End time
        ###
        current_time = time.mktime(time.localtime())
        current_time_str = time.strftime("%d-%m-%y %H:%M:%S", time.localtime())
        time_output.write("End time: %s\n" % current_time_str)
        time_output.write("Duration: %fs\n\n" % (current_time - start_time))
    finally:
        # The timing-log handle was previously leaked; always close it.
        time_output.close()