Example no. 1
0
def main(args):
    """Train a cuML random forest on the HIGGS dataset (SageMaker entry point).

    Reads hyper-parameters and data/model locations from the parsed CLI
    arguments, loads the CSV onto the GPU with cuDF, fits the forest, and
    prints the held-out accuracy.
    """
    # Forward the CLI hyper-parameters to the cuML RandomForest constructor.
    rf_params = dict(
        n_estimators=args.n_estimators,
        max_depth=args.max_depth,
        n_bins=args.n_bins,
        split_criterion=args.split_criterion,
        split_algo=args.split_algo,
        bootstrap=args.bootstrap,
        bootstrap_features=args.bootstrap_features,
        max_leaves=args.max_leaves,
        max_features=args.max_features,
    )

    # SageMaker-provided locations.
    model_dir = args.model_dir
    data_dir = args.data_dir

    # One integer 'label' column followed by 28 float feature columns.
    col_names = ['label']
    dtypes_ls = ['int32']
    for i in range(2, 30):
        col_names.append("col-{}".format(i))
        dtypes_ls.append('float32')

    # GPU CSV read; assumes data_dir already ends with a path separator.
    data = cudf.read_csv(data_dir + 'HIGGS.csv', names=col_names, dtype=dtypes_ls)

    # 70/30 train/test split directly on the GPU dataframe.
    X_train, X_test, y_train, y_test = train_test_split(data, 'label', train_size=0.70)

    classifier = cuRF(**rf_params)
    classifier.fit(X_train, y_train)

    print("test_acc:", accuracy_score(classifier.predict(X_test), y_test.to_gpu_array()))
def main(args):
    """Train a cuML random forest on HIGGS using SageMaker-supplied
    hyper-parameters (fetched via ``get_hyperparameters()``, which returns
    string values that must be converted to their native types).
    """
    raw = get_hyperparameters()

    def _flag(name, default):
        # SageMaker delivers booleans as the strings 'true'/'false'.
        return raw.get(name, default) == 'true'

    hyperparams = {
        'n_estimators': int(raw.get("n_estimators", 20)),
        'max_depth': int(raw.get("max_depth", 10)),
        'n_bins': int(raw.get("n_bins", 8)),
        'split_criterion': int(raw.get("split_criterion", 0)),  # GINI:0, ENTROPY:1
        'split_algo': int(raw.get("split_algo", 0)),            # HIST:0, GLOBAL_QUANTILE:1
        'bootstrap': _flag("bootstrap", 'true'),                # sample with replacement
        'bootstrap_features': _flag("bootstrap_features", 'false'),  # sample without replacement
        'max_leaves': int(raw.get("max_leaves", -1)),           # -1 == unlimited leaves
        'max_features': float(raw.get("max_features", 0.2)),
    }

    # SageMaker options
    model_dir = args.model_dir
    data_dir = args.data_dir

    # One integer 'label' column plus 28 float32 feature columns.
    col_names = ['label'] + ["col-{}".format(i) for i in range(2, 30)]
    dtypes_ls = ['int32'] + ['float32'] * 28
    data = cudf.read_csv(data_dir + 'HIGGS.csv', names=col_names, dtype=dtypes_ls)

    # 70/30 train/test split on the GPU dataframe.
    X_train, X_test, y_train, y_test = train_test_split(data, 'label', train_size=0.70)

    model = cuRF(**hyperparams)
    model.fit(X_train, y_train)

    print("test_acc:", accuracy_score(model.predict(X_test), y_test.to_gpu_array()))
Example no. 3
0
def main():
    """Train a cuML random forest on the 20M-row airline dataset with
    5-fold pseudo cross-validation, logging timings and best accuracy to
    the externally supplied AzureML ``run`` object.
    """
    start_script = time.time()

    parser = argparse.ArgumentParser()

    parser.add_argument('--data_dir', type=str, help='location of data')
    parser.add_argument('--n_estimators', type=int, default=100, help='Number of trees in RF')
    parser.add_argument('--max_depth', type=int, default=16, help='Max depth of each tree')
    parser.add_argument('--n_bins', type=int, default=8, help='Number of bins used in split point calculation')
    parser.add_argument('--max_features', type=float, default=1.0, help='Number of features for best split')

    args = parser.parse_args()
    data_dir = args.data_dir

    print('\n---->>>> cuDF version <<<<----\n', cudf.__version__)
    print('\n---->>>> cuML version <<<<----\n', cuml.__version__)

    # Load the dataset directly onto the GPU, timing the read.
    t1 = time.time()
    df = cudf.read_parquet(os.path.join(data_dir, 'airline_20m.parquet'))
#     df = cudf.read_orc(os.path.join(data_dir, 'airline_20000000.orc'))
    t2 = time.time()
    print('\n---->>>> cuDF time: {:.2f} <<<<----\n'.format(t2-t1))

    # Features: every column except the raw and binarized delay targets.
    X = df[df.columns.difference(['ArrDelay', 'ArrDelayBinary'])]
    y = df['ArrDelayBinary'].astype(np.int32)
    del df  # release GPU memory before training

    # FIX: np.int / np.float / np.str are removed NumPy aliases
    # (deprecated in 1.20, removed in 1.24) — use the builtins instead.
    n_estimators = args.n_estimators
    run.log('n_estimators', int(args.n_estimators))
    max_depth = args.max_depth
    run.log('max_depth', int(args.max_depth))
    n_bins = args.n_bins
    run.log('n_bins', int(args.n_bins))
    max_features = args.max_features
    run.log('max_features', str(args.max_features))

    print('\n---->>>> Training using GPUs <<<<----\n')

    # ----------------------------------------------------------------------------------------------------
    # cross-validation folds
    # ----------------------------------------------------------------------------------------------------
    accuracy_per_fold = []
    train_time_per_fold = []
    infer_time_per_fold = []
    trained_model = None
    global_best_model = None
    global_best_test_accuracy = 0

    traintime = time.time()
    # optional cross-validation w/ model_params['n_train_folds'] > 1
    for i_train_fold in range(5):
        print( f"\n CV fold { i_train_fold } of { 5 }\n" )

        # split data; fold index seeds the split for reproducibility
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i_train_fold, shuffle = True)

        # train model
        cu_rf = cuRF(n_estimators=n_estimators, max_depth=max_depth, n_bins=n_bins, max_features=max_features)
        start1 = time.time()
        trained_model = cu_rf.fit(X_train, y_train)
        training_time = time.time() - start1
        train_time_per_fold += [ round( training_time, 4) ]

        # evaluate perf
        start2 = time.time()
        cuml_pred = cu_rf.predict(X_test)
        infer_time = time.time() - start2

        cuml_accuracy = accuracy_score(cuml_pred, y_test) * 100

        accuracy_per_fold += [ round( cuml_accuracy, 4) ]
        infer_time_per_fold += [ round( infer_time, 4) ]

        # update best model [ assumes maximization of perf metric ]
        if cuml_accuracy > global_best_test_accuracy:
            global_best_test_accuracy = cuml_accuracy
            global_best_model = trained_model  # FIX: best model was never stored

    total_train_inference_time = time.time() - traintime
    run.log('Total training inference time', float(total_train_inference_time))
    run.log('Accuracy', float(global_best_test_accuracy))
    print( '\n Accuracy             :', global_best_test_accuracy)
    print( '\n accuracy per fold    :', accuracy_per_fold)
    print( '\n train-time per fold  :', train_time_per_fold)
    print( '\n train-time all folds  :', sum(train_time_per_fold))
    print( '\n infer-time per fold  :', infer_time_per_fold)
    print( '\n infer-time all folds  :', sum(infer_time_per_fold))

    end_script = time.time()
    print('Total runtime: {:.2f}'.format(end_script-start_script))
    run.log('Total runtime', float(end_script-start_script))

    print('\n Exiting script')
    # ------------------------------------------------------------------
    # NOTE(review): everything from here down appears to be a fragment of
    # a DIFFERENT example (a pandas + mlflow anomaly-detection script)
    # fused into this file by the scraper: it re-parses args and relies on
    # names not defined in this file (`pd`, `process_data`, `split_data`,
    # `compute_f1`, `RAND_STATE`, `DEBUG`, `mlflow`). Kept verbatim —
    # verify against the original source before using.
    # ------------------------------------------------------------------
    args = parser.parse_args()

    # process data
    df = pd.read_csv(args.data_path, index_col=None, header=None)  # read it
    df = process_data(df)

    ## TRAIN
    # get parameters
    label_map = {'normal.': 0, 'anomaly.': 1}
    params = {'random_state':RAND_STATE, 'n_estimators':2500, 'max_depth':200,
              'n_bins':20, 'max_samples':1.0, 'max_features':0.4, 'n_streams':1}
    mlflow.log_params(params)
    if(DEBUG): print(f'Random Forest with {params}')
    
    # train model
    model = cuRF(**params)
    
    # pseudo-cross-validation: repeatedly refit on random 75% subsamples
    subsample_perc = 0.75  # take 75% of the data for all of the training subsets
    f1_train_norms,f1_train_anoms,f1_test_norms,f1_test_anoms,run_times = [],[],[],[],[]
    for random_state in range(1000):  # number of cross-validations
        np.random.seed(random_state)
        valid_idxs = np.random.choice(df.index, size=round(subsample_perc*df.shape[0]), replace=False)
        # split data
        train, train_norm, train_anom, test_norm, test_anom = split_data(df.loc[valid_idxs])
        X_train, y_train = train  # unpack training data
        
        # train model; labels mapped to ints via label_map
        start_time = time.time()  # mark start
        model.fit(X_train, np.vectorize(label_map.get)(y_train))
    # NOTE(review): the loop body above seems truncated — the dedent back
    # to function level mid-iteration suggests lines are missing from the
    # scrape; the next line re-unpacks `train` outside the loop.
    X_train, y_train = train  # unpack training data

    # score data
    label_map = {'normal.': 0, 'anomaly.': 1}
    score_df = pd.DataFrame(columns=['n_estimators','max_depth','n_bins','max_samples','max_features','run_time',
                                     'f1_train_norm','f1_train_anom','f1_test_norm','f1_test_anom'])
    # Exhaustive grid search over RF hyper-parameters; max_samples and
    # max_features iterate over ints 2..10 step 2 and are scaled by /10
    # into fractions 0.2..1.0 inside the loop.
    for n_estimators in range(25,251,25):
        for max_depth in range(2,21,2):
            for n_bins in range(2,21,2):
                for max_samples in range(2,11,2):
                    max_samples /= 10
                    for max_features in range(2,11,2):
                        # convert parameters
                        max_features /= 10
                        # train model
                        model = cuRF(random_state=RAND_STATE, n_estimators=n_estimators, max_depth=max_depth, n_bins=n_bins, max_samples=max_samples, max_features=max_features, n_streams=1)
                        start_time = time.time()  # mark start
                        model.fit(X_train, np.vectorize(label_map.get)(y_train))

                        # score model: per-class F1 on train/test normal/anomaly splits
                        f1_train_norm = compute_f1(model, train_norm, 0)
                        f1_train_anom = compute_f1(model, train_anom, 1)
                        f1_test_norm = compute_f1(model, test_norm, 0)
                        f1_test_anom = compute_f1(model, test_anom, 1)

                        # log time
                        end_time = time.time()  # mark end
                        run_time = end_time - start_time  # calculate runtime based on fitting and scoring

                        # save metrics
                        idx = score_df.shape[0]
                        # NOTE(review): fragment ends here mid-loop; the
                        # rest of the metric-saving code is not in this
                        # excerpt.