Example #1
    # inner objective for BayesianOptimization; names like tree_depths, val_repeat,
    # random_state and get_results are captured from the enclosing bayesian_tuning scope
    def oct_target(alpha):
        print('Solving ILP for hyperparameter tuning...')
        all_results = []
        tree_depth = tree_depths[0]
        for r in range(val_repeat):
            train_df, val_df = preprocessing.train_test_split(
                train_val_df, split=train_val_ratio, random_state=random_state)
            preprocessing.normalize(train_df, norm_cols=norm_cols)
            preprocessing.normalize(val_df, norm_cols=norm_cols)
            all_results.append(
                get_results(train_df=train_df,
                            test_df=val_df,
                            alpha=alpha,
                            tree_depth=tree_depth,
                            max_time_per_run=max_time_per_run,
                            threads=threads,
                            print_status=print_status,
                            warm_start=warm_start))

        results_df = pd.concat(all_results)
        all_results_df.append(results_df)
        aggregated = calc_mean_accuracy_per_alpha(results_df)
        all_aggregated_df.append(aggregated)
        best_alpha_acc = aggregated.max()['testing_accuracy']
        return best_alpha_acc
Example #2
def _load_raw_data(args, seed=42):
    """Load raw data from data directory (data_path) into pandas DataFrames,
    split train into train/val, create word_to_id dictionary, convert
    data frame words to ids.
    """
    # create dictionary mapping words to number id
    word_to_id = _build_vocab(SEINFELD_FILE, args)

    train_raw_data, valid_raw_data, test_raw_data = [], [], []
    for speaker in args.speakers:
        train_path = os.path.join(args.data_path, speaker + '_train.txt')
        test_path = os.path.join(args.data_path, speaker + '_test.txt')

        # read text files into pandas DataFrames
        df = pd.read_csv(train_path, header=None)
        # split train into train/val pandas data frames
        df_train, df_val = pp.train_test_split(df, train_pct=0.85, seed=seed)
        df_test = pd.read_csv(test_path, header=None)  # match the header handling of the train file

        # append each character's text to raw data lists
        train_raw_data.append(_df_to_word_ids(df_train, word_to_id))
        valid_raw_data.append(_df_to_word_ids(df_val, word_to_id))
        test_raw_data.append(_df_to_word_ids(df_test, word_to_id))

    return train_raw_data, valid_raw_data, test_raw_data, word_to_id
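
None of these examples use scikit-learn's splitter; each project ships its own preprocessing.train_test_split. A minimal sketch of the variant called above (the signature is taken from the call, the implementation is an assumption):

import pandas as pd

def train_test_split(df, train_pct=0.85, seed=None):
    # shuffle rows reproducibly, then slice off the first train_pct as the training set
    shuffled = df.sample(frac=1.0, random_state=seed)
    n_train = int(len(shuffled) * train_pct)
    return shuffled.iloc[:n_train], shuffled.iloc[n_train:]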
Example #3
def plot_results(predicted_data, full_dataset, fraction):
    true_data = pp.train_test_split(full_dataset, fraction)
    xs = range(len(full_dataset))
    fig = plt.figure(facecolor='white', figsize=(8, 5))
    ax = fig.add_subplot(111)
    ax.plot(xs[:len(true_data[2])], true_data[2][:, 0], label='Train Data')
    ax.plot(xs[len(true_data[2]):], true_data[3][:, 0], label='Test Data')
    ax.plot(xs[len(true_data[2]):], predicted_data, label='Predicted')
    plt.legend()
    plt.show()
Example #4
def setup_plot(full_dataset, fraction):
    plt.ion()
    true_data = pp.train_test_split(full_dataset, fraction)
    xs = range(len(full_dataset))
    fig = plt.figure(facecolor='white', figsize=(8, 5))
    ax = fig.add_subplot(111)
    ax.plot(xs[:len(true_data[2])], true_data[2][:, 0], label='Train Data')
    ax.plot(xs[len(true_data[2]):], true_data[3][:, 0], label='Test Data')
    ax.scatter(0, 0, label='Prediction', c='g', s=1)
    plt.legend()
    plt.pause(0.01)
    return xs, ax, len(true_data[2])
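
setup_plot returns the x range, the axes and the train/test split index so a caller can stream predictions into the interactive figure. A hypothetical update loop (predictions is an assumed iterable of model outputs):

xs, ax, split = setup_plot(full_dataset, fraction)
for step, pred in enumerate(predictions):
    ax.scatter(split + step, pred, c='g', s=1)  # append each new prediction to the live plot
    plt.pause(0.01)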
Example #5
def main():
    starter_time = timer(None)
    df_total = pd.read_csv('train.csv', index_col=None, header=0, memory_map=True)
    df_total = df_total.drop(['ID_code'], axis=1)
    df_total = df_total.sample(1000)
    df_total.index = range(len(df_total))
    frame_train, frame_test = pp.train_test_split(df_total, 'target', 0.3)
    frame_train = pp.normalization(frame_train, 'target')
    frame_test = pp.normalization(frame_test, 'target')
    X = frame_train.drop(['target'], axis=1)
    y = frame_train['target']
    X_pred = frame_test.drop(['target'], axis=1)
    y_truth = frame_test['target']
    print('Data loading complete')
    base_learners = constant.base_learners
    print('Base learners loaded, starting to train base learners')
    df_single_output, P = single_model_test(base_learners, X, y, X_pred, y_truth)
    plot_roc_curve(y_truth, P.values, list(P.columns))
    print('Base learners trained, starting hyperparameter tuning')
    base_param_dicts = constant.base_param_dicts
    df_params_base = base_hyperparam_tuning(X, y, base_learners, base_param_dicts, n_iterations=50)
    df_params_base.to_csv('params_base.csv')
    print('Hyperparameter tuning finished, starting to train the intermediate layer')
    layer1_learners = constant.layer1_learners
    layer1_param_dicts = constant.layer1_param_dicts
    print('Starting hyperparameter tuning for the intermediate layer')

    #in_layer_1, df_params_1 = layer_hyperparam_tuning(X, y, pre_layer_learners=base_learners, local_layer_learners=layer1_learners, param_dicts_layer=layer1_param_dicts, n_iterations=50, pre_params='params_base.csv')
    #df_params_1.to_csv('params1.csv')

    # set learner parameters and choose the meta-learner
    print('Starting to train the meta-learner')
    meta_learner = constant.meta_learner
    meta_param_dicts = constant.meta_param_dicts
    meta_layer_model, df_params_meta = layer_hyperparam_tuning(
        X, y, pre_layer_learners=layer1_learners, local_layer_learners=meta_learner,
        param_dicts_layer=meta_param_dicts, n_iterations=50,
        pre_params='params_base.csv')
    df_params_meta.to_csv('paramsMeta.csv')
    params_pre = pd.read_csv('paramsMeta.csv')
    params_pre.set_index(['Unnamed: 0'], inplace=True)
    for case_name, params in params_pre["params"].items():
        case_est = case_name
        params = eval(params)  # params are stored as dict literals; ast.literal_eval would be safer
        for est_name, est in meta_learner:
            if est_name == case_est:
                est.set_params(**params)
    layer_list = constant.layer_list
    pred_proba_1, stacking_model = stacking_training(X, y, X_pred, layer_list=layer_list, meta_learner=meta_learner)
    print(roc_auc_score(y_truth, pred_proba_1[:, 1]))
    timer(starter_time)
    return pred_proba_1, stacking_model
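
Since the params column holds plain dict literals read back from the CSV, ast.literal_eval is a safer drop-in for the eval call above; a minimal sketch (the dict contents are illustrative):

import ast

# parses a dict literal without executing arbitrary code
params = ast.literal_eval("{'n_estimators': 100, 'max_depth': 3}")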
Example #6
def bayesian_tuning(train_val_df, train_val_ratio, tree_depths, target_col_name,
                    val_repeat=8, print_status=True, max_time_per_run=300,
                    threads=None, warm_start=False):
    
    from bayes_opt import BayesianOptimization
    print('Starting bayesian optimization...')
    norm_cols = [col for col in train_val_df.columns if col != target_col_name]

    # the target function for Bayesian optimization has to be defined here as a
    # closure so it can capture the surrounding variables
    all_results_df = []
    all_aggregated_df = []
    
    def oct_target(alpha):
        print('Solving ILP for hyperparameter tuning...')
        all_results = []
        tree_depth = tree_depths[0]
        for r in range(val_repeat):
            train_df, val_df = preprocessing.train_test_split(train_val_df, split=train_val_ratio)
            preprocessing.normalize(train_df, norm_cols=norm_cols)
            preprocessing.normalize(val_df, norm_cols=norm_cols)
            all_results.append(get_results(train_df=train_df,
                               test_df=val_df,
                               alpha=alpha,
                               tree_depth=tree_depth, 
                               max_time_per_run=max_time_per_run,
                               threads=threads,
                               print_status=print_status,
                               warm_start=warm_start))
        
        results_df = pd.concat(all_results)
        all_results_df.append(results_df)
        aggregated = calc_mean_accuracy_per_alpha(results_df)
        all_aggregated_df.append(aggregated)
        best_alpha_acc = aggregated.max()['testing_accuracy']
        return best_alpha_acc
    
    alpha_min = 0
    # split once for an initial guess on the maximum alpha
    train_df, val_df = preprocessing.train_test_split(train_val_df, split=train_val_ratio)
    l_hat, mis_points = baseline_accuracy(train_df, target_col_name)
    alpha_max = mis_points / l_hat

    bo = BayesianOptimization(oct_target, {'alpha': (alpha_min, alpha_max)})
    n_iter = int(alpha_max * (2 / 15)) + 1
    if n_iter < 5:
        n_iter = 10
    bo.maximize(init_points=2, n_iter=n_iter, kappa=2)

    # note: bo.res['max'] is the result format of bayes_opt versions before 1.0
    return pd.concat(all_results_df), pd.concat(all_aggregated_df), bo.res['max']['max_params']['alpha']
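
For reference, bayes_opt changed its interface in version 1.0; a sketch of the equivalent calls against the newer API, reusing the oct_target closure and bounds from above:

# bayes_opt >= 1.0: bounds are passed as `pbounds`, the best result is read from `optimizer.max`
optimizer = BayesianOptimization(f=oct_target, pbounds={'alpha': (alpha_min, alpha_max)})
optimizer.maximize(init_points=2, n_iter=n_iter)
best_alpha = optimizer.max['params']['alpha']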
Example #7
def plot_event_predictions(  # pylint: disable=too-many-arguments, too-many-locals
        model,
        data,
        start_time,
        stop_time,
        x_fields,
        y_fields,
        n_points_behind=15,
        n_points_ahead=3):
    """Show predictions along-side real data.

    Args:
        model (keras.Sequential|models.ModelWrapper): The trained model.
        data (pd.DataFrame): The dataframe with the actual data.
        start_time (pd.Timestamp): The time that the event starts.
        stop_time (pd.Timestamp): The time that the event ends.
        x_fields (list): List of column names used as model inputs.
        y_fields (list): List of column names the model predicts.
        n_points_behind (int): input vector size.
        n_points_ahead (int): output vector size.
    """

    event = data[(data.index > start_time) & (data.index < stop_time)]

    x, _, _, _ = preprocessing.train_test_split(  # pylint: disable=invalid-name
        event,
        x_fields,
        y_fields,
        percent=1,
        n_points_behind=n_points_behind,
        n_points_ahead=n_points_ahead)

    times = event.index.values  # .get_values() was removed in pandas 1.0
    n_samples = len(event) - (n_points_behind + n_points_ahead)
    times = [
        times[i + n_points_behind:i + n_points_behind + n_points_ahead]
        for i in range(n_samples)  # xrange is Python 2 only
    ]
    results = model.predict(x)

    event[y_fields].plot()
    for time, result in zip(times, results):
        index = pd.DatetimeIndex(time)
        series = pd.Series(result, index=index)
        series.plot(c='green')
Example #8
import pandas as pd
from sklearn import svm
from sklearn import metrics
import preprocessing as pre
import joblib

# Accessing dataset
data = pd.read_csv("data.csv")
# print(data.head())
print("Initial shape:", data.shape)
data = pd.DataFrame(data)

data, labels = pre.preprocess(data)
print("Shape after preprocessing:", data.shape, "and length of target array:",
      len(labels))

train, train_labels, test, test_labels = pre.train_test_split(data, labels)
print("Train data:", train.shape, len(train_labels))
print("Test data:", test.shape, len(test_labels))

# SVM Classifier
clf = svm.SVC(kernel="rbf", C=1.0)  # rbf kernel

# Train the model using the training sets
clf.fit(train, train_labels)

# Predict the response for test dataset
y_pred = clf.predict(test)

print("Accuracy:", metrics.accuracy_score(test_labels, y_pred))

prec = metrics.precision_score(test_labels, y_pred)
print("Precision:", prec)
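
Beyond the single precision figure, scikit-learn's classification_report summarises precision, recall and F1 per class in one call:

from sklearn.metrics import classification_report

print(classification_report(test_labels, y_pred))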
Example #9
from hyperopt import tpe
from hyperopt import fmin
import ast
from hyperopt import Trials
import lightgbm as lgb
import numpy as np
import pandas as pd
import preprocessing as pp
import feature_selection as fs

MAX_EVALS = 500
N_FOLDS = 10

df_total = pd.read_csv('train.csv', index_col=None, header=0, memory_map=True)
df_total = df_total.drop(['ID_code'], axis=1)
#df_total = df_total.sample(1000)
#df_total.index = range(len(df_total))
frame_train, frame_test = pp.train_test_split(df_total, 'target', 0.3)
frame_train = pp.normalization(frame_train, 'target')
frame_test = pp.normalization(frame_test, 'target')
X = frame_train.drop(['target'], axis=1)
y = frame_train['target']
X_pred = frame_test.drop(['target'], axis=1)
y_truth = frame_test['target']
X = np.array(X)
X_pred = np.array(X_pred)
train_set = lgb.Dataset(X, label=y)


def objective(params, n_folds=N_FOLDS):
    """Objective function for Gradient Boosting Machine Hyperparameter Optimization"""

    # Keep track of evals
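
The body of objective is truncated above. A minimal sketch of how such a hyperopt objective is commonly completed with lgb.cv, assuming AUC as the metric (the result key, here 'auc-mean', varies across LightGBM versions):

from hyperopt import STATUS_OK

def objective(params, n_folds=N_FOLDS):
    # cross-validate the candidate parameters on the training set
    cv_results = lgb.cv(params, train_set, num_boost_round=100,
                        nfold=n_folds, metrics='auc', seed=50)
    # hyperopt minimizes, so report 1 - best mean AUC as the loss
    best_score = max(cv_results['auc-mean'])
    return {'loss': 1 - best_score, 'params': params, 'status': STATUS_OK}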
Example #10
    #---------------------------------------------------------------------------

    print('\n' + '-------------------------Starting PREPROCESSING-------------------------------' + '\n')

    start_timer_preprocess = timeit.default_timer()

    print('Step 1 : Loading data')
    df = preprocessing.load_data(path=code_path + '/Data/')

    print('Step 2 : Dropping redundant features')
    df = preprocessing.drop_features(df)

    print('Step 3 : Cleaning up missing value tags : -999.0')
    df = preprocessing.drop_missing_values(df)

    train, test = preprocessing.train_test_split(df, perc=0.80)

    elapsed = timer(start_timer_preprocess)

    print('\n' + '---------------------Finished PREPROCESSING stage, took ' + str(
        elapsed) + ' seconds----------' + '\n')

    #----------------------------------------------------------------------------
    # ----------------------SAMPLING : Training with 2 types of samples---------
    # 1) Uniform sample
    # 2) Choice sample
    #----------------------------------------------------------------------------

    print('--------------------------Starting SAMPLING-----------------------------------' + '\n')

    start_timer_sampling = timeit.default_timer()
Example #11
def uci_experiment(loc,
                   target_col,
                   hot_encode_cols,
                   tree_depths,
                   alphas_tuning,
                   repeat,
                   val_repeat=3,
                   train_test_ratio=0.8,
                   train_val_ratio=0.66,
                   header=None,
                   max_time_per_run=300,
                   threads=None,
                   save_to_file=True,
                   print_status=False,
                   f_name=None,
                   character_encoding='utf-8',
                   warm_start=False):
    """
    TODO: currently only numerical datasets are supported (preprocessing needs to be adjusted)
        input checks need to be added
        
    loc: location of dataset (string)
    target_col: number of target column  (to predict)
    tree_depths: list of tree depths to run experiments with
    alphas: list of tree complexity parameters to run experiments with
    repeat: integer indicating how often experiment should be repeated
    train_test_ratio: percentage (between zero and one) indicating how much of data is used for training and validation (rest for testing)
    train_val_ratio: percentage indicating how much of data of training and validation is used for training (rest for validation)
    header: whether or not data under url has a header to load. if no header: set to None, if header: integer indicating row number
    max_time_per_run: how much time is spend for one optimization run
    threads: how many threads in gurobi optimization (None falls back to gurobi default)
    save_to_file: boolean indicating whether results are saved to a file
    filename: filename to save results in (if none and save to file: time will be used as filename)
    characer_encoding: (string) how to decode characters
    """

    df = None

    if is_url(loc):
        #read dataframe from url
        html = requests.get(loc).content
        s = io.StringIO(html.decode(character_encoding))
        df = pd.read_csv(s, header=header)
    else:
        df = pd.read_csv(loc)

    # hot encode if needed
    if hot_encode_cols is not None:
        df, target_col = preprocessing.hot_encode(df, target_col,
                                                  hot_encode_cols)

    #split into training (+validation) and testing
    train_val_df, test_df = preprocessing.train_test_split(
        df,
        split=train_test_ratio)  #test remains untouched until alpha is chosen

    target_col_name = df.columns[target_col]
    norm_cols = [col for col in df.columns if col != target_col_name]

    #all_results = [] #all (repeat) experimental results for different values of alpha, tree depths

    results_df, aggregated, best_alpha = hyperparameter_tuning(
        method=alphas_tuning,
        train_val_df=train_val_df,
        train_val_ratio=train_val_ratio,
        tree_depths=tree_depths,
        target_col_name=target_col_name,
        val_repeat=val_repeat,
        warm_start=warm_start)

    print('Validation done. Best alpha: {0}'.format(best_alpha))

    #get final result/ accuracy
    final_results = []
    for tree_depth in tree_depths:
        for r in range(1):  # a single final run per tree depth
            train_df, val_df = preprocessing.train_test_split(train_val_df,
                                                              split=0.66)
            preprocessing.normalize(train_df, norm_cols=norm_cols)
            preprocessing.normalize(test_df, norm_cols=norm_cols)

            final_results.append(
                get_results(train_df=train_df,
                            test_df=test_df,
                            alpha=best_alpha,
                            tree_depth=tree_depth,
                            max_time_per_run=max_time_per_run,
                            threads=threads,
                            print_status=print_status,
                            warm_start=warm_start))  #list of dataframes

    final_results_df = pd.concat(final_results)
    aggregated_final = calc_mean_accuracy_per_alpha(final_results_df)

    if save_to_file:
        dir_name = 'experiments'
        persist_results(dir_name=dir_name,
                        f_name=f_name + '_validation_',
                        results_df=results_df,
                        aggregated=aggregated)
        persist_results(dir_name=dir_name,
                        f_name=f_name + '_final_',
                        results_df=final_results_df,
                        aggregated=aggregated_final)

    return results_df
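
A hypothetical invocation, reusing the local iris data seen in Example #14 (all argument values here are illustrative assumptions, including the name of the tuning method):

results = uci_experiment(loc='../data/iris/iris.data',
                         target_col=4,
                         hot_encode_cols=None,
                         tree_depths=[2],
                         alphas_tuning='bayesian',
                         repeat=1,
                         f_name='iris')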
Example #12
def gd_tuning(train_val_df,
              train_val_ratio,
              tree_depths,
              target_col_name,
              val_repeat=8,
              decrease_threshold=0.05,
              p=0.02,
              print_status=True,
              max_time_per_run=300,
              threads=None,
              warm_start=False):
    """
    stop if accuracy is worse than best_accuracy-decrease_threshold
    p: after running algorithm, calculate alpha by taking take mean of all alphas that achieved accuracy within range of p of best acc
    """
    print('Starting parameter tuning.')
    train_df, val_df = preprocessing.train_test_split(train_val_df,
                                                      split=train_val_ratio)
    l_hat, mis_points = baseline_accuracy(train_df, target_col_name)
    alpha_max = mis_points / l_hat
    alpha_min = 0
    #alpha_min = 9.92419825072886
    #alpha_max = 9.92419825072886
    test_n_alphas = 50
    #test_n_alphas = 1
    print(
        'Testing maximum of {0} values for alpha between {1} and {2}.'.format(
            test_n_alphas, alpha_min, alpha_max))
    alphas = np.linspace(alpha_min, alpha_max, test_n_alphas)

    all_results = []
    norm_cols = [
        col for col in train_val_df.columns if not col == target_col_name
    ]

    for no, alpha in enumerate(alphas):
        print('Testing alpha={0}'.format(alpha))

        for tree_depth in tree_depths:
            for r in range(val_repeat):
                #create new train/val
                train_df, val_df = preprocessing.train_test_split(
                    train_val_df, split=train_val_ratio)
                #preprocessing
                #normalize
                preprocessing.normalize(train_df, norm_cols=norm_cols)
                preprocessing.normalize(val_df, norm_cols=norm_cols)

                all_results.append(
                    get_results(train_df=train_df,
                                test_df=val_df,
                                alpha=alpha,
                                tree_depth=tree_depth,
                                max_time_per_run=max_time_per_run,
                                threads=threads,
                                print_status=print_status,
                                warm_start=warm_start))  #list of dataframes

        if alpha != 0:
            results_df = pd.concat(all_results)
            aggregated = calc_mean_accuracy_per_alpha(results_df)
            best_alpha = aggregated.idxmax()[
                'testing_accuracy']  #df is indexed by alpha
            best_alpha_acc = aggregated.max()['testing_accuracy']

            #check whether last tested alpha decreased significantly
            alpha_acc = aggregated['testing_accuracy'][
                alpha]  #accuracy for current alpha

            if alpha_acc < best_alpha_acc - decrease_threshold:
                print(
                    'Accuracy for alpha={0}: {1} is worse than best accuracy for alpha={2}: {3}.\nStopping criterion is met...'
                    .format(alpha, alpha_acc, best_alpha, best_alpha_acc))
                break

    #take mean of top_n best alphas
    #best_alpha = np.mean(aggregated.sort_values(by='testing_accuracy', ascending=False).index[:top_n])

    #take mean of all alphas that achieved accuracy within p of best acc
    #p = 0.02
    best_alpha = np.mean(
        aggregated[aggregated['testing_accuracy'] > best_alpha_acc - p].index)

    return results_df, aggregated, best_alpha
Example #13
def bayesian_tuning(train_val_df,
                    train_val_ratio,
                    tree_depths,
                    target_col_name,
                    val_repeat=8,
                    print_status=True,
                    max_time_per_run=300,
                    threads=None,
                    warm_start=False):

    from bayes_opt import BayesianOptimization
    print('Starting bayesian optimization...')
    norm_cols = [
        col for col in train_val_df.columns if col != target_col_name
    ]

    #the target function for Bayesian optimization has to be defined here as a closure so it can capture the surrounding variables
    all_results_df = []
    all_aggregated_df = []

    train_df, val_df = preprocessing.train_test_split(train_val_df,
                                                      split=train_val_ratio)
    if val_repeat == 1:
        random_state = np.random.randint(low=0, high=100)
    else:
        random_state = None

    def oct_target(alpha):
        print('Solving ILP for hyperparameter tuning...')
        all_results = []
        tree_depth = tree_depths[0]
        for r in range(val_repeat):
            train_df, val_df = preprocessing.train_test_split(
                train_val_df, split=train_val_ratio, random_state=random_state)
            preprocessing.normalize(train_df, norm_cols=norm_cols)
            preprocessing.normalize(val_df, norm_cols=norm_cols)
            all_results.append(
                get_results(train_df=train_df,
                            test_df=val_df,
                            alpha=alpha,
                            tree_depth=tree_depth,
                            max_time_per_run=max_time_per_run,
                            threads=threads,
                            print_status=print_status,
                            warm_start=warm_start))

        results_df = pd.concat(all_results)
        all_results_df.append(results_df)
        aggregated = calc_mean_accuracy_per_alpha(results_df)
        all_aggregated_df.append(aggregated)
        best_alpha_acc = aggregated.max()['testing_accuracy']
        return best_alpha_acc

    train_df, val_df = preprocessing.train_test_split(
        train_val_df, split=train_val_ratio
    )  # need to split for an initial guess on max alpha
    l_hat, mis_points = baseline_accuracy(train_df, target_col_name)
    alpha_max = mis_points / l_hat
    max_splits = np.power(2, tree_depths[0]) - 1  # a full binary tree of depth D has 2**D - 1 split nodes
    alpha_max = alpha_max * ((max_splits - 1) / max_splits)
    alpha_min = alpha_max * (1.0 / max_splits)
    bo = BayesianOptimization(oct_target, {'alpha': (alpha_min, alpha_max)})
    n_iter = int((alpha_max - alpha_min) * (1 / 15))
    #n_iter = 5
    if n_iter < 4:
        n_iter = 4
    if tree_depths[0] > 2:
        if n_iter > max_splits:
            n_iter = max_splits
    print(
        '***\nAlpha min: {0}\nAlpha max: {1}\nTesting {2} values for alpha\n***'
        .format(alpha_min, alpha_max, n_iter))
    bo.maximize(init_points=2, n_iter=n_iter, kappa=3)

    return pd.concat(all_results_df), pd.concat(
        all_aggregated_df), bo.res['max']['max_params']['alpha']
Example #14
                self.N_k_t[i][j].start = ln.n_k_t[j]
                self.c_k_t[i][j].start = ln.c_k_t[j]
            self.L_t[i].start = ln.l_t


if __name__ == '__main__':
    # data
    target = 4
    df = pd.read_csv('../data/iris/iris.data')
    target_name = df.columns[target]
    norm_cols = [col for col in df.columns if not col == target_name]
    preprocessing.normalize(df, norm_cols=norm_cols)

    # Parameters:
    tree_complexity = 1
    tree_depth = 2
    df_train, df_test = preprocessing.train_test_split(df, split=0.8)
    print('Training samples: {0}'.format(len(df_train)))
    print('Testing samples: {0}'.format(len(df_test)))
    o = OCTH(df_train, target, tree_complexity, tree_depth, warm_start=True)
    o.fit()

    # print('*' * 10)
    # print('SOLUTION')
    # print('*' * 10)
    # print(o.tree)
    preds = o.predict(df, feat_cols=norm_cols)
    print('Training accuracy: {0}'.format(o.training_accuracy()))
    print('Testing accuracy: {0}'.format(o.accuracy_on_test(df_test, target)))
    # o.model.MIPGap
Example #15
def main(*args):

    import pandas as pd
    import logging

    ##################################################################

    # preparations : define argument variables and configure logging
    # logging level : DEBUG, INFO, WARNING, ERROR, CRITICAL

    data_address = args[0]
    log_address = args[1]
    model_output_address = args[2]

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("log")
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler(log_address + "/log.log")
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logging.basicConfig()

    logger.info("Program Start..")

    ##################################################################

    # First : Read data from the directory -> rawData

    import read_data

    logger.info("read data start..")

    rawData = read_data.read_csv(data_address)

    # show more columns in the console output
    pd.set_option('display.max_columns', None)
    # pd.set_option('display.max_rows', None)

    #print(rawData)

    logger.info("read data end..")

    ##################################################################

    # Second : Preprocess data from rawData to Train & Test set
    # Step : 1.remove Outlier 2.Feature Scaling 3.Test/Train Split & Shuffle

    import preprocessing

    logger.info("preprocessing start..")

    # Preprocess rawData according to data characteristics

    # For regression (ex. Demand/Time forecasting) --> preprocessing_data

    # step 1 : remove outliers (from EDA or domain knowledge)
    preprocessing_data = rawData

    # step 2 : feature scaling

    # For classification (ex. Image, Natural language) --> preprocessing_data

    preprocessing_data = rawData

    # X (Independent variable) & Y (Dependent variable) Split

    independent_var = preprocessing_data.loc[:, [
        "Temperature", "Humidity", "Light", "CO2", "HumidityRatio"
    ]]
    dependent_var = preprocessing_data.loc[:, ["Occupancy"]]

    # Train & Test Split (독립변수, 의존변수, Shuffle 유무, Test Set 사이즈)

    x_train, x_test, y_train, y_test = preprocessing.train_test_split(
        independent_var, dependent_var, True, 0.2)

    logger.info("preprocessing end..")

    ##################################################################

    # Third : Build Model

    logger.info("build Model start..")

    import logistic_regression
    import randomforest
    import gradientboosting

    x_train = x_train.values
    y_train = y_train.values.ravel()

    logistic_model = logistic_regression.regression(x_train, y_train)
    randomforest_model = randomforest.randomforest(x_train, y_train, 20, 0)
    xgboost_model = gradientboosting.xgb(x_train, y_train, 0.02, 20)

    logger.info("build Model end..")

    ##################################################################

    # Fourth : Test & Tuning Model

    logger.info("test start..")

    from sklearn import metrics

    y_pred_logistic_model = logistic_model.predict(x_test)
    print('logistic_model accuracy :',
          metrics.accuracy_score(y_test, y_pred_logistic_model))

    y_pred_randomforest_model = randomforest_model.predict(x_test)
    print('randomforest_model accuracy :',
          metrics.accuracy_score(y_test, y_pred_randomforest_model))

    y_pred_xgboost_model = xgboost_model.predict(x_test)
    print('xgboost_model accuracy :',
          metrics.accuracy_score(y_test.values, y_pred_xgboost_model))

    confusion = metrics.confusion_matrix(y_test.values, y_pred_xgboost_model)
    print(confusion)

    logger.info("test end..")

    ##################################################################

    # Fifth : clear memory & Save Output

    logger.info("save start..")

    import joblib
    joblib.dump(logistic_model, model_output_address + "/logistic_model.pkl")

    logger.info("save end..")
    logger.info("Program End..")
Example #16
OUTPUT = 'Close'
INPUTS = ['Close']

# open data
data = preprocessing.read_data(FILE_PATH)
# select time scope
data = preprocessing.select_data(data, "2015-01-01", "")
# reformat the time step; a no-op if the data is already at the right frequency (to verify)
data = preprocessing.format_time_step(data, "H")

# set timestamp as index
data = data.set_index("Timestamp")
data = data[INPUTS]

# train/test split
train, test = preprocessing.train_test_split(data, 0.7)
train = train[0]  # this split apparently returns tuples; keep the first element
test = test[0]

time_indices = [1420680240, 1421120280]
truth = train.loc[time_indices, OUTPUT]

# dummy echo model
model = dummy_echo.Model(1)
model.train(train)
prediction = model.predict([1420680240])

train_gen = KerasGenerator(train[INPUTS].values, train[OUTPUT].values, 10, 1,
                           10, 1, 32)
test_gen = KerasGenerator(test[INPUTS].values, test[OUTPUT].values, 10, 1, 10,
                          1, 32)