Esempio n. 1
0
def fit_base_learner(X_tra, y_tra, X_val, y_val, test_fold, X_test, y_test,
                     keras_model, random_state=None, description=''):
    """Fit a Keras base learner on sampled batches and score three splits.

    Training and validation batches are drawn with ``get_random_batch`` using
    one shared ``RandomUnderSampler``. Every feature array is reshaped to
    ``(-1, 9, 9, 1)`` before it reaches the network and every label array is
    flattened with ``reshape(-1)``.

    The checkpoint callback writes the best model to
    ``_data/model.best.hdf5``, but the metrics below are computed with the
    in-memory model as it stands after the last epoch (the file is never
    reloaded here).

    Returns:
        ``(df_train, df_val, df_test, history, model)`` where the three
        ``df_*`` values are whatever ``evaluate_metrics`` returns for the
        sampled training batch, the sampled validation batch and the full
        test set respectively.
    """
    sampler = RandomUnderSampler(random_state=random_state)
    X_train_batch, y_train_batch = get_random_batch(X_tra, y_tra, sampler)
    X_val_batch, y_val_batch = get_random_batch(X_val, y_val, sampler)
    model = KerasClassifier(keras_model)

    def as_network_input(features):
        # The network consumes 9x9 single-channel inputs.
        return features.reshape(-1, 9, 9, 1)

    def score_split(features, labels):
        # Hard predictions first, then probabilities, then the metric table.
        inputs = as_network_input(features)
        y_pred = model.predict(inputs, batch_size=1024)
        y_pred_proba = model.predict_proba(inputs, batch_size=1024)
        return evaluate_metrics(labels.reshape(-1), y_pred,
                                y_pred_proba[:, 1],
                                'CNN', test_fold, description)

    checkpointer = ModelCheckpoint(filepath='_data/model.best.hdf5',
                                   verbose=0, save_best_only=True)
    validation_pair = (as_network_input(X_val_batch),
                       y_val_batch.reshape(-1))
    history = model.fit(as_network_input(X_train_batch),
                        y_train_batch.reshape(-1),
                        validation_data=validation_pair,
                        batch_size=1024, epochs=100, verbose=0,
                        callbacks=[checkpointer], shuffle=True)

    df_train = score_split(X_train_batch, y_train_batch)
    df_val = score_split(X_val_batch, y_val_batch)
    df_test = score_split(X_test, y_test)
    return df_train, df_val, df_test, history, model
def trainNNmodel(X, Y):
    """Train a small dense binary classifier and report hold-out metrics.

    The data is split 70/30 with ``train_test_split``; a three-hidden-layer
    network is trained through ``KerasClassifier`` with early stopping on the
    ``accuracy`` metric, then evaluated on the held-out 30%.

    Returns:
        A list ``[accuracy, auc, precision, recall, f1, y_test, y_pred,
        y_pred_rt, keras_model]``. The five metrics are returned as strings;
        ``y_pred`` is a flat Python list of predicted labels and ``y_pred_rt``
        is column 1 of ``predict_proba``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

    def create_keras_model():
        """Build and compile the network (factory required by the wrapper)."""
        stack = [
            Dense(64,
                  input_dim=X.shape[1],
                  kernel_initializer='glorot_normal',
                  activation='relu'),
            BatchNormalization(),
            Dropout(0.5),
            Dense(128, kernel_initializer='glorot_normal', activation='relu'),
            BatchNormalization(),
            Dropout(0.5),
            Dense(16, kernel_initializer='glorot_normal', activation='relu'),
            Dense(1, activation='sigmoid'),
        ]
        network = Sequential()
        for layer in stack:
            network.add(layer)
        network.compile(optimizer="adam",
                        loss='binary_crossentropy',
                        metrics=['accuracy'])
        return network

    # The same callback list is handed to both the wrapper and fit().
    callbacks_list = [
        callbacks.EarlyStopping(monitor="accuracy", patience=50, mode='max')
    ]
    estimator = KerasClassifier(build_fn=create_keras_model,
                                epochs=200,
                                batch_size=12,
                                verbose=0,
                                callbacks=callbacks_list)
    estimator.fit(X_train,
                  y_train,
                  batch_size=12,
                  epochs=200,
                  verbose=1,
                  callbacks=callbacks_list)

    # predict() yields one row per sample; flatten the rows into one list.
    y_pred = []
    for row in estimator.predict(X_test):
        y_pred.extend(row)
    y_pred_rt = estimator.predict_proba(X_test)[:, 1]

    accuracy = str(accuracy_score(y_test, y_pred))
    fpr, tpr, _thresholds = roc_curve(y_test, y_pred_rt)
    auc_value = str(auc(fpr, tpr))
    precision = str(precision_score(y_test, y_pred))
    recall = str(recall_score(y_test, y_pred))
    f1score = str(f1_score(y_test, y_pred, average="weighted"))

    return [
        accuracy, auc_value, precision, recall, f1score, y_test, y_pred,
        y_pred_rt, estimator.model
    ]
Esempio n. 3
0
 def model_probs(self, classifier=None):
     """Return ``predict_proba`` output for ``self.X_test``.

     When no (truthy) classifier is supplied, a ``KerasClassifier`` built from
     ``self.model_build`` is trained on ``self.X_train`` / ``self.y_train``
     (200 epochs, batch size 5) and used instead.
     """
     if classifier:
         return classifier.predict_proba(self.X_test)

     fitted = KerasClassifier(build_fn=self.model_build,
                              epochs=200,
                              batch_size=5)
     fitted.fit(self.X_train, self.y_train)
     return fitted.predict_proba(self.X_test)
Esempio n. 4
0
def perform_keras(name, mongo_host):
    """Cross-validate a Keras baseline and report the per-fold log loss.

    The first 10000 rows of the training frame are cleaned, then split with a
    5-fold shuffled ``StratifiedKFold`` (seed 42). For each fold the two
    partitions go through ``oversample``, the target column is removed,
    features are standardised (scaler fitted on the training partition only)
    and a ``KerasClassifier`` built by ``create_baseline`` is trained. The
    log loss of its predicted probabilities is passed to ``out_loss``, and
    the mean over folds is reported at the end.

    ``name`` and ``mongo_host`` are accepted but not used by this function.

    Side effects:
        Sets the module-level ``NUM_FETURES`` to the number of feature
        columns — presumably read by ``create_baseline``; confirm there.
    """
    global NUM_FETURES

    df = load_train_all_xgb().head(10000)
    del_trash_cols(df)
    blja_nan(df)
    folds = 5
    seed = 42

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    losses = []
    # Only the target is needed for stratification, so a dummy X is passed.
    for big_ind, small_ind in skf.split(np.zeros(len(df)), df[TARGET]):
        big = df.iloc[big_ind]
        small = df.iloc[small_ind]

        # print(x) with a single argument behaves the same on Python 2 and 3.
        print(explore_target_ratio(big))
        print(explore_target_ratio(small))

        big, small = oversample(big, small, seed)

        print(explore_target_ratio(big))
        print(explore_target_ratio(small))

        train_target = big[TARGET]
        del big[TARGET]
        train_arr = big

        test_target = small[TARGET]
        del small[TARGET]
        test_arr = small

        NUM_FETURES = len(train_arr.columns)

        # Fit the scaler on the training partition only to avoid leakage.
        scaller = StandardScaler()
        train_arr = scaller.fit_transform(train_arr)
        test_arr = scaller.transform(test_arr)

        estimator = KerasClassifier(build_fn=create_baseline,
                                    epochs=100,
                                    batch_size=5,
                                    verbose=10)

        print(len(train_arr))
        print(len(test_arr))
        estimator.fit(train_arr, train_target)

        proba = estimator.predict_proba(test_arr)

        loss = log_loss(test_target, proba)
        out_loss(loss)
        losses.append(loss)

    out_loss('avg = {}'.format(np.mean(losses)))
Esempio n. 5
0
    def _model_build(self, *arg):
        """Grid-search hyper-parameters, then train and evaluate a final model.

        A 5-fold ``GridSearchCV`` over optimizer, epoch count, batch size and
        weight initialiser is run on ``self.x_train`` / ``self.y_train``. The
        winning parameters are then used to train a fresh 12-8-1 dense network
        (8 inputs). Rounded predictions, true labels and class probabilities
        for ``self.x_test`` are stored on ``self.y_pred``, ``self.y_true`` and
        ``self.prob`` before ``self._analyse_result()`` is called.

        Positional arguments are accepted and ignored.
        """
        self._prepare_test_data()

        # --- hyper-parameter search -------------------------------------
        search_space = {
            'optimizer': ['adam'],
            'nb_epoch': [100, 150],
            'batch_size': [5, 10],
            'init': ['normal', 'uniform'],
        }
        wrapped = KerasClassifier(build_fn=self.create_model, verbose=0)
        grid = GridSearchCV(estimator=wrapped, param_grid=search_space, cv=5)
        grid_result = grid.fit(self.x_train, self.y_train)
        best = grid_result.best_params_
        print("Best: %f using %s" % (grid_result.best_score_, best))

        # --- final training with the best parameters --------------------
        initialiser = best['init']
        model = Sequential()
        model.add(Dense(12, input_dim=8, init=initialiser, activation='relu'))
        model.add(Dense(8, init=initialiser, activation='relu'))
        model.add(Dense(1, init=initialiser, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer=best['optimizer'],
                      metrics=['accuracy'])
        model.fit(self.x_train,
                  self.y_train,
                  nb_epoch=best['nb_epoch'],
                  batch_size=best['batch_size'])

        # --- evaluation --------------------------------------------------
        raw_scores = model.predict(self.x_test)
        self.y_pred = [np.round(score) for score in raw_scores]
        self.y_true = self.y_test
        self.prob = model.predict_proba(self.x_test)
        self._analyse_result()
Esempio n. 6
0
def cnn_result(feature_data, plot=0):
    """Train the CNN classifier, save its ROC curve and print the AUC.

    Data comes from ``load_data.cnn(feature=feature_data)``. The test split
    doubles as validation data during training. The ROC points are written to
    ``../E017_new_cnn.txt``; when ``plot`` is truthy the curve is also shown
    with matplotlib.
    """
    train_split, test_split, _ = load_data.cnn(feature=feature_data)
    x_train_cnn, y_train_cnn = train_split
    x_test_cnn, y_test_cnn = test_split

    clf_cnn = KerasClassifier(build_fn=cnn_model, epochs=epochs,
                              batch_size=batch_size, verbose=0)
    clf_cnn.fit(x_train_cnn, y_train_cnn,
                validation_data=(x_test_cnn, y_test_cnn))

    # Column 1 of both the probabilities and the labels is the positive class.
    y_pred_cnn = clf_cnn.predict_proba(x_test_cnn)[:, 1]
    positive_labels = np.array(y_test_cnn[:, 1], dtype='uint8')
    fpr_cnn, tpr_cnn, _ = roc_curve(positive_labels, y_pred_cnn, pos_label=1)
    np.savetxt("../E017_new_cnn.txt", [fpr_cnn, tpr_cnn], fmt='%.8f')

    banner = '*******' * 3
    area = auc(fpr_cnn, tpr_cnn)
    print(banner, '\n\t AUC = ', area, '\n', banner)

    if not plot:
        return
    plt.plot(fpr_cnn, tpr_cnn, label='CNN AUC=%0.3f' % (area))
    plt.legend()
    plt.show()
 def model_probs(self, classifier=None):
     """
     Return the class probabilities for every item in ``self.X``.

     Accepts a classifier exposing ``predict_proba``. When none (or a falsy
     value) is given, a ``KerasClassifier`` built from ``self.model_build`` is
     trained silently on ``self.X_train`` / ``self.y_train`` (200 epochs,
     batch size 5) and used for the prediction.
     """
     if classifier:
         return classifier.predict_proba(self.X)

     fitted = KerasClassifier(build_fn=self.model_build,
                              epochs=200,
                              batch_size=5,
                              verbose=0)
     fitted.fit(self.X_train, self.y_train)
     return fitted.predict_proba(self.X)
Esempio n. 8
0
def get_probability_labels(x, y):
    """Collect out-of-fold positive-class probabilities.

    For every ``(train, test)`` split produced by the module-level ``cv_5``
    splitter, the Keras estimator is fitted on the training rows and column 1
    of ``predict_proba`` on the test rows is appended to the result.

    Args:
        x: feature table supporting ``.iloc`` row selection.
        y: target supporting ``.iloc`` row selection.

    Returns:
        A flat list of probabilities, ordered fold by fold (i.e. in the order
        the splitter yields test indices, not in original row order). The
        list is empty if the splitter yields no folds.
    """
    estimator = KerasClassifier(batch_size=32, epochs=100, optimizer='Adam',
                                build_fn=baseline_model, verbose=0)
    all_predictions = []
    for train_index, test_index in cv_5.split(x, y):
        estimator.fit(x.iloc[train_index], y.iloc[train_index])
        fold_proba = estimator.predict_proba(x.iloc[test_index])
        # Extend in place rather than re-flattening a nested list every fold.
        all_predictions.extend(fold_proba[:, 1])
    return all_predictions
Esempio n. 9
0
def main():
    """Train an MLP on the leaf data set and write ``submit.csv``.

    Steps: load the train/test CSVs, impute missing values, encode the
    categorical columns, cross-validate a ``StandardScaler`` + Keras MLP
    pipeline with a 10-fold stratified split, then fit that same pipeline on
    all training rows and write its class probabilities for the submission
    rows to ``submit.csv``.

    The final model is fitted through the pipeline (not the bare Keras
    wrapper) so the submission is produced with the same scaling that the
    reported cross-validation score was measured with.
    """
    np.random.seed(seed)
    trainfile = r"/Arun/ML/Practice/Leaves/train.csv"
    testfile = r"/Arun/ML/Practice/Leaves/test.csv"  # File for submission
    traindata = pd.read_csv(trainfile)
    testdata = pd.read_csv(testfile)  # Data for prediction and submission
    # print(x) with a single argument behaves the same on Python 2 and 3.
    print("\nDataset has {1} columns and {0} Rows".format(
        traindata.shape[0], traindata.shape[1]))
    print("\nBasic Stats: \n{0}".format(traindata.describe()))

    # ---- Missing value treatment ----
    tr_mv_vars = getMissingVars(traindata)
    if len(tr_mv_vars) > 0:
        replaceMVwithMode(traindata, tr_mv_vars)
    te_mv_vars = getMissingVars(testdata)
    if len(te_mv_vars) > 0:
        replaceMVwithMode(testdata, te_mv_vars)

    # ---- X, y assignment ----
    tr_cat_vars = list(getCatVars(traindata))
    te_cat_vars = list(getCatVars(testdata))
    target_var = 'species'
    X = traindata.values[:, 2:]
    y = traindata.values[:, 1]  # target_var's values
    X_submit = testdata.values[:, 1:]  # the submission file has no target
    submission_ids = testdata['id']

    # ---- Encoding ----
    # Stays None when the target is not categorical; DataFrame then falls
    # back to default integer column labels instead of raising NameError.
    y_classes = None
    if target_var in tr_cat_vars:
        tr_cat_vars.remove(target_var)
        y, y_classes = encodeY(y)
    if len(tr_cat_vars) > 0:
        encodeCatVariables(X, tr_cat_vars)
    if len(te_cat_vars) > 0:
        encodeCatVariables(X_submit, te_cat_vars)

    # ---- Modeling ----
    # Scaling is done inside the pipeline so it is re-fitted per CV fold.
    model = KerasClassifier(build_fn=create_nn_model, verbose=0)
    leaf_pipeline = Pipeline([
        ('normalization', StandardScaler()),
        ('MLP', model),  # Multi Layer Perceptron
    ])
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    result = cross_val_score(leaf_pipeline, X, y, cv=kfold, n_jobs=-1)
    print("\nResult: Acc{0}% std({1}%)".format(result.mean() * 100,
                                               result.std() * 100))

    # ---- Final fit and submission ----
    leaf_pipeline.fit(X, y)
    y_prob = leaf_pipeline.predict_proba(X_submit)
    submission = pd.DataFrame(y_prob, index=submission_ids, columns=y_classes)
    submission.to_csv('submit.csv')
Esempio n. 10
0
def main():
    """Smoke-test a Keras wrapper alone and inside a bagging ensemble.

    Loads and scales the Otto training data, trains the wrapped network on
    one-hot labels for 20 epochs, prints its training score and the shapes of
    its predictions/probabilities, then fits a 3-member ``BaggingClassifier``
    around the same wrapper on the integer labels and prints its score.
    """
    code_dir = '/home/john/git/kaggle/OttoGroup/'
    data_dir = '/home/john/data/otto/'
    training_file = 'train.csv'

    os.chdir(code_dir)
    np.random.seed(1337)

    print('Starting script...')

    print('Loading data...')
    X, labels = load_training_data(data_dir, training_file)

    print('Pre-processing...')
    X = apply_scaler(X, create_scaler(X))
    y, y_onehot, _encoder = preprocess_labels(labels)
    num_features, num_classes = X.shape[1], y_onehot.shape[1]
    print('Features = {}'.format(num_features))
    print('Classes = {}'.format(num_classes))

    print('Building model...')
    model = define_model(num_features, num_classes)
    print('Complete.')

    print('Training model...')
    wrapper = KerasClassifier(model)
    wrapper.fit(X, y_onehot, nb_epoch=20)
    print('Complete.')

    print('Training score = {}'.format(wrapper.score(X, y_onehot)))

    preds = wrapper.predict(X)
    print('Predictions shape = {}'.format(preds.shape))

    proba = wrapper.predict_proba(X)
    print('Probabilities shape = {}'.format(proba.shape))

    print('Building ensemble...')
    ensemble = BaggingClassifier(wrapper, n_estimators=3,
                                 max_samples=1.0, max_features=1.0)
    print('Complete.')

    print('Training ensemble...')
    ensemble.fit(X, y)
    print('Complete.')

    print('Ensemble score = {}'.format(ensemble.score(X, y)))

    print('Script complete.')
Esempio n. 11
0
def mlp_result(feature_data):
    """Train the MLP classifier, save its ROC curve and print the AUC.

    Data comes from ``load_data.mlp(feature=feature_data)``. The test split
    doubles as validation data during training, and the ROC points are
    written to ``../E017_new_mlp.txt``.
    """
    train_split, test_split = load_data.mlp(feature=feature_data)
    x_train_mlp, y_train_mlp = train_split
    x_test_mlp, y_test_mlp = test_split

    clf_mlp = KerasClassifier(build_fn=mlp, epochs=epochs,
                              batch_size=batch_size, verbose=0)
    clf_mlp.fit(x_train_mlp, y_train_mlp,
                validation_data=(x_test_mlp, y_test_mlp))

    # Column 1 of predict_proba is used as the positive-class score.
    y_pred_mlp = clf_mlp.predict_proba(x_test_mlp)[:, 1]
    fpr_mlp, tpr_mlp, _ = roc_curve(y_test_mlp, y_pred_mlp)
    np.savetxt("../E017_new_mlp.txt", [fpr_mlp, tpr_mlp], fmt='%.8f')

    banner = '*******' * 3
    print(banner, '\n\t AUC = ', auc(fpr_mlp, tpr_mlp), '\n', banner)
Esempio n. 12
0
def tune_epochs(estimator, epoch_num, train, y):
    """Score a freshly trained baseline model for each epoch count 1..epoch_num.

    For every epoch count a new ``KerasClassifier`` (built by
    ``create_baseline``, batch size 5000) is trained on the module-level
    ``xtr`` / ``ytr`` and its positive-class probabilities on ``xval`` are
    scored with ``roc_auc_score``.

    Args:
        estimator: unused; a new estimator is built for every epoch count.
        epoch_num: largest epoch count to try (inclusive).
        train: unused; training data is read from module-level ``xtr``/``ytr``.
        y: unused.

    Returns:
        DataFrame with columns ``epochs`` and ``val_scores``.
    """
    gc.collect()
    epoch_list = list(range(1, epoch_num + 1))
    epoch_scores = []
    for epoch in epoch_list:
        model = KerasClassifier(build_fn=create_baseline, epochs=epoch,
                                batch_size=5000, verbose=1)
        model.fit(xtr, ytr)
        # Column 1 of predict_proba is the positive-class probability.
        y_pred_val = model.predict_proba(xval)[:, 1]
        # Release the model before the next one is built to cap peak memory.
        del model
        gc.collect()
        # NOTE(review): assumes validation labels live in a module-level
        # `yval` next to `xval` — confirm the name against the rest of the
        # file. The original appended the roc_auc_score function itself.
        epoch_scores.append(roc_auc_score(yval, y_pred_val))

    return pd.DataFrame({'epochs': epoch_list, 'val_scores': epoch_scores})
Esempio n. 13
0
def main():
    """Build features, cross-validate a Keras model and write predictions.

    Pipeline: read the six CSV tables, turn the labelled categorical columns
    (e.g. ``"location 118"``) into float32 numbers, left-join the side tables
    onto train and test by ``id`` (keeping the first row per id), append the
    degree-2 and degree-3 outputs of ``create_features``, standardise, reduce
    to 35 principal components, run a 10-fold cross-validation scored by
    negative log loss, then fit on all training rows and write the three class
    probabilities per test id to a CSV file.

    Relies on module-level ``seed`` and ``StartTime``.
    """

    def digits_as_float32(column):
        """Keep only the digit characters of each value; cast to float32."""
        return column.map(
            lambda value: ''.join(ch for ch in str(value) if ch.isdigit())
        ).astype('float32')

    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')
    event_type = pd.read_csv('data/event_type.csv')
    log_feature = pd.read_csv('data/log_feature.csv')
    resource_type = pd.read_csv('data/resource_type.csv')
    severity_type = pd.read_csv('data/severity_type.csv')

    X = train.copy()

    # Element-wise conversion replaces the former per-row loops, which parsed
    # the printed repr of one-row slices and wrote back through .loc.
    print("Removing non-numerics")
    X['location'] = digits_as_float32(train['location'])
    test['location'] = digits_as_float32(test['location'])
    event_type['event_type'] = digits_as_float32(event_type['event_type'])
    log_feature['log_feature'] = digits_as_float32(log_feature['log_feature'])
    log_feature['volume'] = log_feature['volume'].astype('float32')
    resource_type['resource_type'] = digits_as_float32(
        resource_type['resource_type'])
    severity_type['severity_type'] = digits_as_float32(
        severity_type['severity_type'])

    side_tables = (event_type, log_feature, resource_type, severity_type)

    print("Merging train dataset")
    for table in side_tables:
        X = pd.merge(X, table, how='left', on='id')
    X['id'] = X['id'].astype('float32')
    # The joins fan out to several rows per id; keep only the first one.
    X.drop_duplicates(subset=['id'], inplace=True, keep='first')
    Y = X['fault_severity']
    del X['fault_severity']

    X = X.values
    X_2 = create_features(X, degree=2)
    X_3 = create_features(X, degree=3)
    X = np.hstack((X, X_2, X_3))
    dummy_y = np_utils.to_categorical(Y)

    print("Merging test dataset")
    for table in side_tables:
        test = pd.merge(test, table, how='left', on='id')
    test.drop_duplicates(subset=['id'], inplace=True, keep='first')
    ID_vals = test['id']

    test = test.values
    test_2 = create_features(test, degree=2)
    test_3 = create_features(test, degree=3)
    test = np.hstack((test, test_2, test_3))

    print("Transforming with StandardScaler")
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    test = scaler.transform(test)

    # Principal Component Analysis (fitted on the training rows only)
    ncomponents = 35
    pca = PCA(n_components=ncomponents)
    feature_fit = pca.fit(X)
    X = feature_fit.transform(X)
    test = feature_fit.transform(test)
    X_dim = ncomponents

    estimator = KerasClassifier(build_fn=create_model,
                                input_dim=X_dim,
                                batch_size=80,
                                epochs=10)

    print("Starting KFold cross validation")
    kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
    results = cross_val_score(estimator,
                              X,
                              dummy_y,
                              cv=kfold,
                              scoring='neg_log_loss')
    print("Baseline: %.2f (%.2f%%)" % (results.mean(), results.std() * 100))

    estimator.fit(X, dummy_y)
    predictions = estimator.predict_proba(test)

    submissions = pd.DataFrame(
        data={
            'id': ID_vals,
            'predict_0': predictions[:, 0],
            'predict_1': predictions[:, 1],
            'predict_2': predictions[:, 2]
        })
    submissions.to_csv(
        'output_predictions_extrafeats_featselect_pca35_orig_algo_.csv',
        index=False)

    EndTime = (time.time() - StartTime) / 60.0
    print("Program took %.2f minutes" % (EndTime))
class Classifier():
    """Three-class text classifier built from two identically configured Keras models.

    Text is cleaned with ``clean_text_simple``, tokenised, and padded to the
    longest training statement. At prediction time each row takes the
    probability vector of whichever of the two models is more confident
    (higher maximum probability).
    """

    # Number of target classes used for the one-hot training labels.
    _N_CLASSES = 3

    def __init__(self):
        self.min_occur = 1    # minimum corpus count for a token to be kept
        self.max_length = -1  # set in fit(): longest training statement
        self.vocab_size = -1  # set in fit(): tokenizer vocabulary size + 1
        self.tokenizer = Tokenizer()  # create the tokenizer

    def fit(self, X, y):
        """Fit the tokenizer and both Keras models.

        Args:
            X: iterable of raw text statements.
            y: array-like of integer class indices (0..2), one per statement.

        Returns:
            ``self``, so calls can be chained.
        """
        # Positional one-hot encoding; np.asarray avoids label-based lookup
        # when y is a pandas Series with a non-default index.
        labels = np.asarray(y).astype(int)
        ytrain = np.zeros((labels.shape[0], self._N_CLASSES))
        ytrain[np.arange(labels.shape[0]), labels] = 1

        statements = pd.Series(X).apply(clean_text_simple)

        vocab = Counter()
        for statement in statements:
            vocab.update(statement)
        # A set gives O(1) membership tests in the filter below.
        tokens = {k for k, c in vocab.items() if c >= self.min_occur}
        statements = statements.apply(lambda x: [w for w in x if w in tokens])

        statements = statements.apply(lambda x: ' '.join(x))
        train_statements = list(statements.values)

        # fit the tokenizer on the statements
        self.tokenizer.fit_on_texts(train_statements)
        self.max_length = max(len(s.split()) for s in train_statements)
        self.vocab_size = len(self.tokenizer.word_index) + 1

        encoded_statements = self.tokenizer.texts_to_sequences(
            train_statements)
        Xtrain = pad_sequences(encoded_statements,
                               maxlen=self.max_length,
                               padding='post')

        self.clf2 = KerasClassifier(build_fn=create_model1,
                                    max_length=self.max_length,
                                    vocab_size=self.vocab_size,
                                    epochs=12)
        self.clf = KerasClassifier(build_fn=create_model1,
                                   max_length=self.max_length,
                                   vocab_size=self.vocab_size,
                                   epochs=12)

        self.clf.fit(Xtrain, ytrain, epochs=12)
        self.clf2.fit(Xtrain, ytrain, epochs=12)
        return self

    def predict(self, X):
        """Return the index of the most probable class for each statement."""
        y_proba = self.predict_proba(X)
        y = np.argmax(y_proba, axis=1)
        return y

    def predict_proba(self, X):
        """Return per-class probabilities from the more confident model, row by row."""
        statements = pd.Series(X).apply(clean_text_simple)
        statements = statements.apply(lambda x: ' '.join(x))
        statements = list(statements.values)
        encoded_statements = self.tokenizer.texts_to_sequences(statements)
        Xtest = pad_sequences(encoded_statements,
                              maxlen=self.max_length,
                              padding='post')
        y1 = self.clf.predict_proba(Xtest)
        y2 = self.clf2.predict_proba(Xtest)

        # (n, 1) mask broadcasts across the class axis, whatever its width.
        first_is_more_confident = (np.max(y1, axis=1, keepdims=True) >
                                   np.max(y2, axis=1, keepdims=True))
        return np.where(first_is_more_confident, y1, y2)
Esempio n. 15
0
# Smoke test for the KerasClassifier wrapper: fit, score, predict,
# predict_proba, get/set params and fitted attributes are exercised in turn
# and their results printed. `model`, `X_train`, `Y_train`, `X_test`,
# `batch_size` and `nb_epoch` are expected to be defined earlier in the script.
print('Creating wrapper')
classifier = KerasClassifier(model)

print('Fitting model')
classifier.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch)

# Score is computed on the training data itself.
print('Testing score function')
score = classifier.score(X_train, Y_train)
print('Score: ', score)

print('Testing predict function')
preds = classifier.predict(X_test)
print('Preds.shape: ', preds.shape)

print('Testing predict proba function')
proba = classifier.predict_proba(X_test)
print('Proba.shape: ', proba.shape)

print('Testing get params')
print(classifier.get_params())

# Parameters are printed again after the update so the change is visible.
print('Testing set params')
classifier.set_params(optimizer='sgd', loss='mse')
print(classifier.get_params())

print('Testing attributes')
print('Classes')
print(classifier.classes_)
print('Config')
print(classifier.config_)
# NOTE(review): only the 'Weights' label is printed in the visible snippet;
# the statement printing the weights appears to be cut off.
print('Weights')
Esempio n. 16
0
class FinalModelATC(BaseEstimator, TransformerMixin):


    def __init__(self, model, model_name=None, ml_for_analytics=False, type_of_estimator='classifier', output_column=None, name=None, _scorer=None, training_features=None, column_descriptions=None, feature_learning=False, uncertainty_model=None, uc_results=None):
        """Store the wrapped model and its configuration on the instance.

        Every argument except ``output_column`` is stored under an attribute
        of the same name; ``output_column`` is accepted but not stored.
        ``_scorer`` is stored unchanged regardless of ``type_of_estimator``.
        """
        # The wrapped estimator and how it is identified.
        self.model = model
        self.model_name = model_name
        self.ml_for_analytics = ml_for_analytics
        self.type_of_estimator = type_of_estimator
        self.name = name

        # Feature metadata captured at training time.
        self.training_features = training_features
        self.column_descriptions = column_descriptions
        self.feature_learning = feature_learning

        # Optional uncertainty model and its results.
        self.uncertainty_model = uncertainty_model
        self.uc_results = uc_results

        # Classifiers and regressors use the scorer exactly as passed in.
        self._scorer = _scorer


    def get(self, prop_name, default=None):
        """Return the attribute named ``prop_name``, or ``default`` if it is not set."""
        return getattr(self, prop_name, default)


    def fit(self, X, y):
        """Fit the wrapped model on ``X`` / ``y`` and return ``self``.

        Behaviour depends on the name reported by ``get_name_from_model``:

        * names starting with ``DeepLearning``: the Keras wrapper is rebuilt
          with the number of input columns and trained with early stopping
          on the training loss (patience 25);
        * names starting with ``GradientBoosting``: trees are added in
          warm-started increments and scored on a random 15% holdout until
          the score has not improved for 20 consecutive rounds; the best
          checkpointed copy is kept;
        * anything else: a plain ``self.model.fit``.

        A ``TypeError`` from any of these paths triggers one retry on
        densified input; a ``KeyboardInterrupt`` stops training and the
        model is kept in whatever state it reached.
        """
        self.model_name = get_name_from_model(self.model)

        X_fit = X

        # Sparse input is densified up front for DeepLearning models and the estimators listed here.
        if self.model_name[:12] == 'DeepLearning' or self.model_name in ['BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit', 'ARDRegression', 'Perceptron', 'PassiveAggressiveClassifier', 'SGDClassifier', 'RidgeClassifier', 'LogisticRegression']:
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()

            if self.model_name[:12] == 'DeepLearning':

                # For Keras, we need to tell it how many input nodes to expect, which is our num_cols
                num_cols = X_fit.shape[1]

                # Carry over every existing wrapper parameter except build_fn, which is replaced below
                model_params = self.model.get_params()
                del model_params['build_fn']

                if self.type_of_estimator == 'regressor':
                    self.model = KerasRegressor(build_fn=utils_models.make_deep_learning_model, num_cols=num_cols, feature_learning=self.feature_learning, **model_params)
                elif self.type_of_estimator == 'classifier':
                    self.model = KerasClassifier(build_fn=utils_models.make_deep_learning_classifier, num_cols=num_cols, feature_learning=self.feature_learning, **model_params)

        try:
            if self.model_name[:12] == 'DeepLearning':

                # NOTE(review): the message says "training accuracy" but the callback below monitors 'loss'
                print('\nWe will stop training early if we have not seen an improvement in training accuracy in 25 epochs')
                from keras.callbacks import EarlyStopping
                early_stopping = EarlyStopping(monitor='loss', patience=25, verbose=1)
                self.model.fit(X_fit, y, callbacks=[early_stopping])

            elif self.model_name[:16] == 'GradientBoosting':
                if scipy.sparse.issparse(X_fit):
                    X_fit = X_fit.todense()

                patience = 20
                # Despite the name this is a score where higher is better (compared with > below)
                best_val_loss = -10000000000
                num_worse_rounds = 0
                best_model = deepcopy(self.model)
                # Hold out 15% of the training data to decide when to stop adding trees
                X_fit, X_test, y, y_test = train_test_split(X_fit, y, test_size=0.15)

                # Add a variable number of trees each time, depending how far into the process we are
                num_iters = list(range(1, 50, 1)) + list(range(50, 100, 2)) + list(range(100, 250, 3)) + list(range(250, 500, 5)) + list(range(500, 1000, 10)) + list(range(1000, 2000, 20)) + list(range(2000, 10000, 100))

                try:
                    for num_iter in num_iters:
                        # Only the very first fit starts from scratch; later fits extend the existing ensemble
                        warm_start = True
                        if num_iter == 1:
                            warm_start = False

                        self.model.set_params(n_estimators=num_iter, warm_start=warm_start)
                        self.model.fit(X_fit, y)

                        # Prefer the configured scorer; fall back to the model's own score() if it raises
                        try:
                            val_loss = self._scorer.score(self, X_test, y_test)
                        except Exception as e:
                            val_loss = self.model.score(X_test, y_test)

                        if val_loss > best_val_loss:
                            best_val_loss = val_loss
                            num_worse_rounds = 0
                            best_model = deepcopy(self.model)
                        else:
                            num_worse_rounds += 1

                        if num_worse_rounds >= patience:
                            break
                except KeyboardInterrupt:
                    print('Heard KeyboardInterrupt. Stopping training, and using the best checkpointed GradientBoosting model')
                    pass

                self.model = best_model
                print('The number of estimators that were the best for this training dataset: ' + str(self.model.get_params()['n_estimators']))
                print('The best score on a random 15 percent holdout set of the training data: ' + str(best_val_loss))

            else:
                self.model.fit(X_fit, y)

        # Retry once on dense input when fitting raised a TypeError
        except TypeError as e:
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()
            self.model.fit(X_fit, y)

        except KeyboardInterrupt as e:
            print('Stopping training at this point because we heard a KeyboardInterrupt')
            print('If the model is functional at this point, we will output the model in its latest form')
            print('Note that not all models can be interrupted and still used, and that this feature generally is an unofficial beta-release feature that is known to fail on occasion')
            pass

        return self

    def remove_categorical_values(self, features):
        """Return the set of base feature names with any ``=value`` suffix removed.

        A name such as ``"color=red"`` collapses to ``"color"`` (everything
        before the first ``=``); names without ``=`` are kept unchanged.
        """
        # partition() returns the whole string as the head when '=' is absent.
        return {feature.partition('=')[0] for feature in features}

    def verify_features(self, X, raw_features_only=False):

        if self.column_descriptions is None:
            print('This feature is not enabled by default. Depending on the shape of the training data, it can add hundreds of KB to the saved file size.')
            print('Please pass in `ml_predictor.train(data, verify_features=True)` when training a model, and we will enable this function, at the cost of a potentially larger file size.')
            warnings.warn('Please pass verify_features=True when invoking .train() on the ml_predictor instance.')
            return None

        print('\n\nNow verifying consistency between training features and prediction features')
        if isinstance(X, dict):
            prediction_features = set(X.keys())
        elif isinstance(X, pd.DataFrame):
            prediction_features = set(X.columns)

        # If the user passed in categorical features, we will effectively one-hot-encode them ourselves here
        # Note that this assumes we're using the "=" as the separater in DictVectorizer/DataFrameVectorizer
        date_col_names = []
        categorical_col_names = []
        for key, value in self.column_descriptions.items():
            if value == 'categorical' and 'day_part' not in key:
                try:
                    # This covers the case that the user passes in a value in column_descriptions that is not present in their prediction data
                    column_vals = X[key].unique()
                    for val in column_vals:
                        prediction_features.add(key + '=' + str(val))

                    categorical_col_names.append(key)
                except:
                    print('\nFound a column in your column_descriptions that is not present in your prediction data:')
                    print(key)

            elif 'day_part' in key:
                # We have found a date column. Make sure this date column is in our prediction data
                # It is outside the scope of this function to make sure that the same date parts are available in both our training and testing data
                raw_date_col_name = key[:key.index('day_part') - 1]
                date_col_names.append(raw_date_col_name)

            elif value == 'output':
                try:
                    prediction_features.remove(key)
                except KeyError:
                    pass

        # Now that we've added in all the one-hot-encoded categorical columns (name=val1, name=val2), remove the base name from our prediction data
        prediction_features = prediction_features - set(categorical_col_names)

        # Get only the unique raw_date_col_names
        date_col_names = set(date_col_names)

        training_features = set(self.training_features)

        # Remove all of the transformed date column feature names from our training data
        features_to_remove = []
        for feature in training_features:
            for raw_date_col_name in date_col_names:
                if raw_date_col_name in feature:
                    features_to_remove.append(feature)
        training_features = training_features - set(features_to_remove)

        # Make sure the raw_date_col_name is in our training data after we have removed all the transformed feature names
        training_features = training_features | date_col_names

        # MVP means ignoring text features
        print_nlp_warning = False
        nlp_example = None
        for feature in training_features:
            if 'nlp_' in feature:
                print_nlp_warning = True
                nlp_example = feature
                training_features.remove(feature)

        if print_nlp_warning == True:
            print('\n\nWe found an NLP column in the training data')
            print('verify_features() currently does not support checking all of the values within an NLP column, so if the text of your NLP column has dramatically changed, you will have to check that yourself.')
            print('Here is one example of an NLP feature in the training data:')
            print(nlp_example)

        training_not_prediction = training_features - prediction_features

        if raw_features_only == True:
            training_not_prediction = self.remove_categorical_values(training_not_prediction)

        if len(training_not_prediction) > 0:

            print('\n\nHere are the features this model was trained on that were not present in this prediction data:')
            print(sorted(list(training_not_prediction)))
        else:
            print('All of the features this model was trained on are included in the prediction data')

        prediction_not_training = prediction_features - training_features
        if raw_features_only == True:
            prediction_not_training = self.remove_categorical_values(prediction_not_training)

        if len(prediction_not_training) > 0:

            # Separate out those values we were told to ignore by column_descriptions
            ignored_features = []
            for feature in prediction_not_training:
                if self.column_descriptions.get(feature, 'False') == 'ignore':
                    ignored_features.append(feature)
            prediction_not_training = prediction_not_training - set(ignored_features)

            print('\n\nHere are the features available in the prediction data that were not part of the training data:')
            print(sorted(list(prediction_not_training)))

            if len(ignored_features) > 0:
                print('\n\nAdditionally, we found features in the prediction data that we were told to ignore in the training data')
                print(sorted(list(ignored_features)))

        else:
            print('All of the features in the prediction data were in this model\'s training data')

        print('\n\n')
        return {
            'training_not_prediction': training_not_prediction
            , 'prediction_not_training': prediction_not_training
        }


    def score(self, X, y, verbose=False):
        """Score the wrapped model on ``X`` against the targets ``y``.

        When a custom ``_scorer`` is configured it is used for both regressors
        and classifiers; otherwise the wrapped model's own ``score`` method is
        called. ``verbose`` is accepted but not used here.

        Returns:
            The score, or ``None`` when a custom scorer is set but
            ``type_of_estimator`` is neither 'regressor' nor 'classifier'.
        """
        # At the time of writing this, GradientBoosting does not support sparse matrices for predictions
        dense_only = (
            self.model_name[:16] == 'GradientBoosting'
            or self.model_name in ['BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit', 'ARDRegression']
        )
        if dense_only and scipy.sparse.issparse(X):
            X = X.todense()

        # No custom scorer: defer to the wrapped model.
        if self._scorer is None:
            return self.model.score(X, y)

        # Both estimator types are scored the same way by the custom scorer.
        if self.type_of_estimator in ('regressor', 'classifier'):
            return self._scorer.score(self, X, y)

        return None


    def predict_proba(self, X, verbose=False):
        """Return class probabilities for ``X`` in a uniform ``[p0, p1]``-per-row format.

        Falls back to ``model.predict`` when calling ``predict_proba`` raises
        AttributeError, and retries with a dense ``X`` when a TypeError is raised.
        Plain label predictions are converted to ``[0, 1]`` / ``[1, 0]`` pairs and
        single-probability rows to ``[1 - p, p]``. ``verbose`` is accepted but unused.

        Returns:
            The list of per-row predictions, or just the first row when
            ``X.shape[0] == 1``.
        """
        dense_only = (
            self.model_name[:16] == 'GradientBoosting'
            or self.model_name[:12] == 'DeepLearning'
            or self.model_name in ['BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit', 'ARDRegression']
        )
        if dense_only and scipy.sparse.issparse(X):
            X = X.todense()

        try:
            probas = self.model.predict_proba(X)
        except AttributeError:
            # predict_proba is unavailable: fall back on hard predictions.
            try:
                probas = self.model.predict(X)
            except TypeError:
                if scipy.sparse.issparse(X):
                    X = X.todense()
                probas = self.model.predict(X)
        except TypeError:
            # Retry once with a dense matrix.
            if scipy.sparse.issparse(X):
                X = X.todense()
            probas = self.model.predict_proba(X)

        # If this model does not have predict_proba, and we have fallen back on predict, give results back in
        # the format a user expects from predict_proba: a list of predicted probabilities per class.
        # Note that this DOES NOT WORK for multi-label problems, or problems that are not reduced to 0,1
        first_row = probas[0]
        if isinstance(first_row, str) or not hasattr(first_row, '__iter__'):
            probas = [[0, 1] if label == 1 else [1, 0] for label in probas]

        # Libraries like Keras can return a single probability per row for binary problems;
        # expand it to the probability of both classes.
        if len(probas[0]) == 1:
            probas = [[1 - row[0], row[0]] for row in probas]

        return probas[0] if X.shape[0] == 1 else probas

    def predict(self, X, verbose=False):
        """Predict with the wrapped model and unwrap single-item results.

        Sparse input is densified first for the model families listed below.
        numpy results are converted to plain Python values; a result holding
        exactly one prediction is returned as that bare prediction rather than
        a one-element list. ``verbose`` is accepted but unused.
        """
        dense_only = (
            self.model_name[:16] == 'GradientBoosting'
            or self.model_name[:12] == 'DeepLearning'
            or self.model_name in ['BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit', 'ARDRegression']
        )
        X_predict = X.todense() if dense_only and scipy.sparse.issparse(X) else X

        result = self.model.predict(X_predict)

        if isinstance(result, np.ndarray):
            result = result.tolist()
            # tolist() on a 0-d array already yields a bare scalar.
            if isinstance(result, (float, int, str)):
                return result

        # It makes a cleaner interface to get just the single prediction back,
        # rather than a list with the prediction hidden inside.
        return result[0] if len(result) == 1 else result

    # transform is initially designed to be used with feature_learning
    def transform(self, X):
        """Append this model's predictions to ``X`` as extra columns.

        The output of ``self.predict(X)`` is turned into a list and horizontally
        stacked to the right of ``X``; the result is a CSR sparse matrix.
        """
        learned_features = list(self.predict(X))
        return scipy.sparse.hstack([X, learned_features], format='csr')

    def predict_uncertainty(self, X):
        """Predict with the base model and estimate how uncertain each prediction is.

        The base prediction is appended to ``X`` as one extra column and passed to
        ``self.uncertainty_model.predict_proba``. When ``self.uc_results``
        (calibration buckets keyed by bucket number, each holding a 'max_proba'
        upper bound plus calibration statistics) is available, the statistics of
        the matching bucket are attached to the result.

        Args:
            X: Feature matrix accepted by ``self.predict`` and
                ``scipy.sparse.hstack``.

        Returns:
            For a batch of predictions, a DataFrame with 'base_prediction',
            'uncertainty_prediction' and, if calibration data exists, one column
            per calibration statistic. For a single (non-iterable) prediction, a
            dict with the same keys.

        Raises:
            ValueError: if no uncertainty model was trained.
        """
        if self.uncertainty_model is None:
            print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            print('This model was not trained to predict uncertainties')
            print('Please follow the documentation to tell this model at training time to learn how to predict uncertainties')
            print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            raise ValueError('This model was not trained to predict uncertainties')

        def bucket_for(proba, buckets):
            # Walk the buckets in ascending key order and take the first one whose
            # upper bound covers proba. A proba above every bound falls into the
            # last bucket instead of running off the end of the dict.
            chosen = None
            for bucket_num in sorted(buckets):
                chosen = buckets[bucket_num]
                if proba <= chosen['max_proba']:
                    break
            return chosen

        base_predictions = self.predict(X)
        is_batch = isinstance(base_predictions, Iterable)

        if is_batch:
            base_predictions_col = np.array([[val] for val in base_predictions])
        else:
            # Explicit 1x1 column so it stacks next to a single-row X.
            base_predictions_col = [[base_predictions]]

        X_combined = scipy.sparse.hstack([X, base_predictions_col], format='csr')

        uncertainty_predictions = self.uncertainty_model.predict_proba(X_combined)

        results = {
            'base_prediction': base_predictions,
            'uncertainty_prediction': uncertainty_predictions,
        }

        # An empty calibration dict is treated the same as no calibration data.
        has_calibration = bool(self.uc_results)

        if is_batch:
            # Keep only the second entry of each probability row.
            probas = [row[1] for row in uncertainty_predictions]
            results['uncertainty_prediction'] = probas
            results = pd.DataFrame.from_dict(results, orient='columns')

            if has_calibration:
                matched = [bucket_for(proba, self.uc_results) for proba in probas]
                first_bucket = self.uc_results[sorted(self.uc_results)[0]]
                # 'max_proba' only delimits the bucket; it is not reported back.
                fields = [key for key in first_bucket if key != 'max_proba']
                calibration_results = {
                    field: [bucket[field] for bucket in matched] for field in fields
                }
                df_calibration_results = pd.DataFrame.from_dict(calibration_results, orient='columns')
                results = pd.concat([results, df_calibration_results], axis=1)

            return results

        if has_calibration:
            # NOTE(review): assumes the last value of the returned probabilities is the
            # one the batch branch reads as row[1] -- confirm against uncertainty_model.
            proba = float(np.asarray(uncertainty_predictions, dtype=float).ravel()[-1])
            bucket = bucket_for(proba, self.uc_results)
            results.update({key: value for key, value in bucket.items() if key != 'max_proba'})

        return results


    def score_uncertainty(self, X, y, verbose=False):
        """Score the uncertainty model on ``X``/``y``.

        The ``verbose`` argument is not forwarded: the uncertainty model is
        always scored with ``verbose=False``.
        """
        uncertainty_score = self.uncertainty_model.score(X, y, verbose=False)
        return uncertainty_score
Esempio n. 17
0
                    classifier.add(Dense(1, activation='sigmoid'))

                    classifier.compile(optimizer=optimizer,
                                       loss='mean_squared_error')

                    mlp_clf = KerasClassifier(build_fn=lambda: classifier)
                    mlp_clf.fit(X_train,
                                y_train,
                                batch_size=bs,
                                verbose=0,
                                callbacks=[EarlyStopping(patience=5)],
                                epochs=100000,
                                validation_data=(X_val, y_val))

                    y_pred = mlp_clf.predict(X_test)
                    y_pred_scores = mlp_clf.predict_proba(X_test)
                    mse, accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(
                        y_test, y_pred.squeeze(), y_pred_scores)
                    results['BS'].append(bs)
                    results['HL'].append(hl)
                    results['HN'].append(hn)
                    results['LR'].append(lr)
                    results['OPTIM'].append(optimizer)
                    results['MSE'].append(mse)
                    results['ACC'].append(accuracy)
                    results['REC'].append(recall)
                    results['PREC'].append(precision)
                    results['F1'].append(f1)
                    results['AUROC'].append(auroc)
                    results['AUPR'].append(aupr)
Esempio n. 18
0
    # Compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


# Wrap the Keras model factory in a scikit-learn style classifier.
estimator = KerasClassifier(build_fn=baseline_model,
                            epochs=20,
                            batch_size=50,
                            verbose=1)

#X_train, X_test, Y_train, Y_test = train_test_split(X_train, dummy_y, test_size=0.33, random_state=seed)
# Fit on the whole training set; validation_split=0.33 is passed through fit
# (presumably Keras holds out that share for validation -- verify).
estimator.fit(X_train, dummy_y, validation_split=0.33)
predictions = estimator.predict(test_x)

# Persist the predicted probabilities for the test set.
keras_prob = estimator.predict_proba(test_x)
with open('keras_prob_all.pkl', 'wb') as output:
    pickle.dump(keras_prob, output)

# Submission frame: one predicted grade per building id.
sub = pd.DataFrame({
    'building_id': test_df['building_id'],
    'grade': predictions
})

# Lookup from the numeric predicted class to its label string.
d1 = {0: 'Grade 1', 1: 'Grade 2', 2: 'Grade 3', 3: 'Grade 4', 4: 'Grade 5'}

sub['damage_grade'] = sub['grade'].map(d1)

# Only the id and the label column are written out.
sub[['building_id', 'damage_grade']].to_csv("sub.csv", index=False)
    gc.collect()

    start = time.time()
    #def f():
    classifier.fit(X_train, Y_train, callbacks=switch_case_callbacks(x=True))
    #   return

    #mem_usage = memory_usage(f, max_usage=True)
    #print('Maximum memory usage: %s' % max(mem_usage))

    end = time.time()
    time_completion = (end - start) / 60
    print('Model completion time: {0:.2f} minutes'.format(time_completion))

#probability of different outcomes
y_prob = classifier.predict_proba(X_test)

# result will probably always be a wildtype(0)
y_pred = classifier.predict(X_test)
#y_pred = y_pred.reshape(-1,1)

#Index of top 5 indels
# Sorting the negated probabilities ascending gives a descending order of the
# originals; the first five columns are therefore the five most probable classes.
encoded_top5 = (-y_prob).argsort()[:, 0:5]

#Probability of top 5 indels
# Multiplying by -100 undoes the negation and scales the values by 100.
top5_prob = np.sort(-y_prob)[:, 0:5] * -100

# Column index of the largest entry in each row of Y_test
# (presumably the true class of a one-hot encoded target -- verify).
encoded_y_true = np.argsort(Y_test)[:, -1]


# Vectorise function to map back to true values
Esempio n. 20
0
                  learn_rate=learn_rate,
                  dropout_rate=dropout_rate,
                  neurons1=neurons1)
# Hyper-parameter search over param_grid, 3-fold CV, scored with micro-averaged F1.
grid = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    scoring="f1_micro",
                    cv=3)
grid_search = grid.fit(train_X, train_y)
print(grid_search.best_score_)
print(grid_search.best_params_)

# From here on `model` is the best estimator found by the search.
model = grid_search.best_estimator_

# Plots
# Plot Train/Test curve
plotTrainTestLines("MLP(" + str(number_of_columns) + " features)", model,
                   train_X.values, train_y, validation_X.values, validation_y)

# Get probabilities from model
probs = model.predict_proba(validation_X.values)

# Calculate precision/recall values
prec_rec_dict = precision_recall_values(probs, validation_y.values)

# Plot Precision/Recall curve
plotPrecRecCurve(train_X.values, train_y, validation_X.values, validation_y,
                 {'MLP': prec_rec_dict})

# Test score
print("MLP test score : " + str(model.score(test_X.values, test_y.values)))
                            batch_size=5,
                            verbose=0)

# NOTE(review): kfold and the train/test split below are created but not used
# in the visible part of this script -- the model is fitted and evaluated on all of X.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    dummy_y,
                                                    test_size=0.33,
                                                    random_state=seed)
'''print (len(dummy_y))
print(len(X_train))'''

# Fit on the full data set and predict on the very same rows.
estimator.fit(X, dummy_y)

predictions = estimator.predict(X)
probabilities = estimator.predict_proba(X)

print(predictions)
print(probabilities)

#print(encoder.inverse_transform(predictions))

from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score

from sklearn.metrics import classification_report

actual = y1
#print(Y_test)
print(actual)
# NOTE(review): predictDATA is only assigned further down in this fragment;
# presumably these two lines were run after it in an interactive session -- verify.
print(log_loss(Y[:50400], np.ndarray.tolist(predictDATA[:50400])))
print(log_loss(Y[50400:], np.ndarray.tolist(predictDATA[50400:72325])))

# evaluate model with standardized dataset
estimator = KerasClassifier(build_fn=create_baseline,
                            nb_epoch=100,
                            batch_size=64,
                            verbose=0)
# The same stratified 10-fold splitter is reused for both cross-validation runs.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
# First 50400 rows and the remaining rows are cross-validated separately.
results = cross_val_score(estimator, X_NN[:50400, :, 1], Y[:50400], cv=kfold)
results_test = cross_val_score(estimator,
                               X_NN[50400:, :, 1],
                               Y[50400:],
                               cv=kfold)
# Final fit uses only the rows from index 50400 on; probabilities are then computed for X.
fitDATA = estimator.fit(X_NN[50400:, :, 1], Y[50400:])
predictDATA = estimator.predict_proba(X)
print("Results: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))
print("Results: %.2f%% (%.2f%%)" %
      (results_test.mean() * 100, results_test.std() * 100))
# Log loss on column 1 of the predicted probabilities, per row range.
print(log_loss(Y[:50400], predictDATA[:50400, 1]))
print(log_loss(Y[50400:], predictDATA[50400:, 1]))

#####################################################################################################################################

### Classifiers
# Zero-filled column vectors, one row per entry of X_new
# (presumably filled with each classifier's predictions later -- not visible here).
KNC2p = np.zeros((len(X_new), 1))
KNC4p = np.zeros((len(X_new), 1))
KNC8p = np.zeros((len(X_new), 1))
KNC16p = np.zeros((len(X_new), 1))
KNC_logp = np.zeros((len(X_new), 1))
GBCp = np.zeros((len(X_new), 1))
Esempio n. 23
0
# Smoke test of the KerasClassifier wrapper: each step prints a banner and then
# exercises one part of the scikit-learn style interface.
print('Creating wrapper')
classifier = KerasClassifier(model)

print('Fitting model')
classifier.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch)

# Note: the score is computed on the training data itself.
print('Testing score function')
score = classifier.score(X_train, Y_train)
print('Score: ', score)

print('Testing predict function')
preds = classifier.predict(X_test)
print('Preds.shape: ', preds.shape)

print('Testing predict proba function')
proba = classifier.predict_proba(X_test)
print('Proba.shape: ', proba.shape)

print('Testing get params')
print(classifier.get_params())

# Change two parameters and print the parameters again to show the update.
print('Testing set params')
classifier.set_params(optimizer='sgd', loss='mse')
print(classifier.get_params())

print('Testing attributes')
print('Classes')
print(classifier.classes_)
print('Config')
print(classifier.config_)
print('Weights')
class NNClassifier(object):
    """Single-hidden-layer Keras binary classifier on standardized dense features.

    The constructor fits a ``StandardScaler`` on ``X_train`` and keeps the scaled
    matrix in ``self.all_data``; ``train_predict_all`` trains on separately
    supplied data and predicts for every row of ``self.all_data``.
    """
    name = 'NN'

    def __init__(self, X_train, X_test, use_scale=True, feature_names=None):
        """Scale ``X_train``, keep it for later prediction and dump it to /tmp/X.p.

        Args:
            X_train: sparse matrix (must provide ``toarray()``).
            X_test: accepted for interface compatibility; not used here.
            use_scale: stored on the instance; scaling is applied regardless.
            feature_names: accepted for interface compatibility; not used here.
        """
        self.params = {}
        self.model = {}
        self.use_scale = use_scale

        self.error_fraction = {}

        self.name = NNClassifier.name

        # toarray() returns a plain ndarray; todense() returns np.matrix, which
        # current scikit-learn releases refuse as estimator input.
        self.all_data = X_train.toarray()

        from sklearn.preprocessing import StandardScaler
        self.sc = StandardScaler()
        self.all_data = self.sc.fit_transform(self.all_data)

        # Context manager guarantees the dump file is flushed and closed.
        with open("/tmp/X.p", "w+b") as dump_file:
            pickle.dump(self.all_data, dump_file)

    def run_cross_validation(self, train, train_target, folds):
        """Placeholder; cross-validation is not implemented for this model."""
        pass

    def train_predict_all(self, x, y):
        """Train on ``x``/``y`` and predict for all rows stored at construction.

        Args:
            x: sparse training matrix (must provide ``toarray()``); it is scaled
                with the scaler fitted in ``__init__``.
            y: training labels.

        Returns:
            Tuple ``(probability_prediction, class_prediction)``: the predicted
            probability per row of ``self.all_data`` and its ``> 0.5`` thresholding.
        """
        # Same ndarray-vs-np.matrix consideration as in __init__.
        new_x = x.toarray()

        new_x = self.sc.transform(new_x)

        def create_baseline():
            # Model factory handed to KerasClassifier: one 512-unit hidden
            # layer and a single sigmoid output unit.
            model = Sequential()

            #model.add(Dropout(0.2, input_shape=(new_x.shape[1],)))
            model.add(
                Dense(units=512, activation='relu',
                      input_dim=new_x.shape[1]))  #best
            #model.add(Dense(units=256, activation='relu', input_dim=new_x.shape[1]))
            # model.add(Dense(units=128, activation='relu'))
            model.add(Dense(units=1, activation='sigmoid'))

            model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])

            return model

        self.model = KerasClassifier(build_fn=create_baseline,
                                     epochs=500,
                                     batch_size=5,
                                     verbose=1)
        self.model.fit(new_x, y)

        probability_prediction_all = self.model.predict_proba(self.all_data)

        # Pick the probability column that belongs to the True class; the explicit
        # comparison with True is kept so only a label equal to True selects column 1.
        if self.model.classes_[1] == True:
            probability_prediction = probability_prediction_all[:, 1]
        else:
            probability_prediction = probability_prediction_all[:, 0]
        class_prediction = probability_prediction > 0.5

        return probability_prediction, class_prediction
Esempio n. 25
0
# Keras classifier wrapped for scikit-learn; fit-time options are given up front.
model = KerasClassifier(make_model,
                        validation_split=0.15,
                        epochs=100,
                        batch_size=512,
                        callbacks=callbacks_list,
                        verbose=0)
scaler = NDStandardScaler()

# From here on `model` is the scaler + classifier pipeline.
model = make_pipeline(scaler, model)
model.fit(trainX, trainY)
import pickle as pkl

# Round-trip the fitted pipeline through pickle and continue with the reloaded copy.
# NOTE(review): the file handles opened here are never explicitly closed.
pkl.dump(model, open('../../data/sklearn_model_fitted.pkl', 'wb'))
model = pkl.load(open('../../data/sklearn_model_fitted.pkl', 'rb'))

print(testX.shape)
output = model.predict_proba(testX)
print(len(record_list), len(output), len(testY.argmax(axis=1)))
print(testY.argmax(axis=1))
print(output.argmax(axis=1))
# One row per test sample: its record id, predicted class index and true class index.
summed = pd.DataFrame({
    'record': record_list,
    'predictions': output.argmax(axis=1),
    'label': testY.argmax(axis=1)
})
# Average per record, then flag a record when its mean predicted class index exceeds 0.5.
summed = summed.groupby('record').mean()
summed["predicted label"] = summed['predictions'] > 0.5

print(confusion_matrix(testY.argmax(axis=1), output.argmax(axis=1)))
Esempio n. 26
0
class FinalModelATC(BaseEstimator, TransformerMixin):
    def __init__(self,
                 model,
                 model_name=None,
                 ml_for_analytics=False,
                 type_of_estimator='classifier',
                 output_column=None,
                 name=None,
                 _scorer=None,
                 training_features=None,
                 column_descriptions=None,
                 feature_learning=False,
                 uncertainty_model=None,
                 uc_results=None,
                 training_prediction_intervals=False,
                 min_step_improvement=0.0001,
                 interval_predictors=None,
                 keep_cat_features=False,
                 is_hp_search=None,
                 X_test=None,
                 y_test=None):
        """Store the wrapped model together with its training configuration.

        Every constructor argument is kept, unmodified, as an attribute of the
        same name. That includes ``output_column``: scikit-learn's
        ``get_params``/``clone`` read each ``__init__`` parameter back from the
        instance, so a parameter that is not stored breaks them.
        """
        self.model = model
        self.model_name = model_name
        self.ml_for_analytics = ml_for_analytics
        self.type_of_estimator = type_of_estimator
        self.output_column = output_column
        self.name = name
        # The scorer is stored the same way for classifiers and regressors.
        self._scorer = _scorer
        self.training_features = training_features
        self.column_descriptions = column_descriptions
        self.feature_learning = feature_learning
        self.uncertainty_model = uncertainty_model
        self.uc_results = uc_results
        self.training_prediction_intervals = training_prediction_intervals
        self.min_step_improvement = min_step_improvement
        self.interval_predictors = interval_predictors
        self.is_hp_search = is_hp_search
        self.keep_cat_features = keep_cat_features
        self.X_test = X_test
        self.y_test = y_test

    def get(self, prop_name, default=None):
        """Return the attribute named ``prop_name``, or ``default`` if it does not exist."""
        return getattr(self, prop_name, default)

    def fit(self, X, y):
        global keras_imported, KerasRegressor, KerasClassifier, EarlyStopping, ModelCheckpoint, TerminateOnNaN, keras_load_model
        self.model_name = get_name_from_model(self.model)

        X_fit = X

        if self.model_name[:12] == 'DeepLearning' or self.model_name in [
                'BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit',
                'ARDRegression', 'Perceptron', 'PassiveAggressiveClassifier',
                'SGDClassifier', 'RidgeClassifier', 'LogisticRegression'
        ]:
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()

            if self.model_name[:12] == 'DeepLearning':
                if keras_imported == False:
                    # Suppress some level of logs
                    os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                    from keras.callbacks import EarlyStopping, ModelCheckpoint, TerminateOnNaN
                    from keras.models import load_model as keras_load_model
                    from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier

                    keras_imported = True

                # For Keras, we need to tell it how many input nodes to expect, which is our num_cols
                num_cols = X_fit.shape[1]

                model_params = self.model.get_params()
                del model_params['build_fn']
                try:
                    del model_params['feature_learning']
                except:
                    pass
                try:
                    del model_params['num_cols']
                except:
                    pass

                if self.type_of_estimator == 'regressor':
                    self.model = KerasRegressor(
                        build_fn=utils_models.make_deep_learning_model,
                        num_cols=num_cols,
                        feature_learning=self.feature_learning,
                        **model_params)
                elif self.type_of_estimator == 'classifier':
                    self.model = KerasClassifier(
                        build_fn=utils_models.make_deep_learning_classifier,
                        num_cols=num_cols,
                        feature_learning=self.feature_learning,
                        **model_params)

        if self.model_name[:12] == 'DeepLearning':
            try:

                if self.is_hp_search == True:
                    patience = 5
                    verbose = 0
                else:
                    patience = 25
                    verbose = 2

                X_fit, y, X_test, y_test = self.get_X_test(X_fit, y)
                try:
                    X_test = X_test.toarray()
                except AttributeError as e:
                    pass
                if not self.is_hp_search:
                    print(
                        '\nWe will stop training early if we have not seen an improvement in validation accuracy in {} epochs'
                        .format(patience))
                    print(
                        'To measure validation accuracy, we will split off a random 10 percent of your training data set'
                    )

                early_stopping = EarlyStopping(monitor='val_loss',
                                               patience=patience,
                                               verbose=verbose)
                terminate_on_nan = TerminateOnNaN()

                now_time = datetime.datetime.now()
                time_string = str(now_time.year) + '_' + str(
                    now_time.month) + '_' + str(now_time.day) + '_' + str(
                        now_time.hour) + '_' + str(now_time.minute)

                temp_file_name = 'tmp_dl_model_checkpoint_' + time_string + str(
                    random.random()) + '.h5'
                model_checkpoint = ModelCheckpoint(temp_file_name,
                                                   monitor='val_loss',
                                                   save_best_only=True,
                                                   mode='min',
                                                   period=1)

                callbacks = [early_stopping, terminate_on_nan]
                if not self.is_hp_search:
                    callbacks.append(model_checkpoint)

                self.model.fit(X_fit,
                               y,
                               callbacks=callbacks,
                               validation_data=(X_test, y_test),
                               verbose=verbose)

                # TODO: give some kind of logging on how the model did here! best epoch, best accuracy, etc.

                if self.is_hp_search is False:
                    self.model = keras_load_model(temp_file_name)

                try:
                    os.remove(temp_file_name)
                except OSError as e:
                    pass
            except KeyboardInterrupt as e:
                print(
                    'Stopping training at this point because we heard a KeyboardInterrupt'
                )
                print(
                    'If the deep learning model is functional at this point, we will output the model in its latest form'
                )
                print(
                    'Note that this feature is an unofficial beta-release feature that is known to fail on occasion'
                )

                if self.is_hp_search is False:
                    self.model = keras_load_model(temp_file_name)
                try:
                    os.remove(temp_file_name)
                except OSError as e:
                    pass

        elif self.model_name[:4] == 'LGBM':
            X_fit = X.toarray()

            X_fit, y, X_test, y_test = self.get_X_test(X_fit, y)

            try:
                X_test = X_test.toarray()
            except AttributeError as e:
                pass

            if self.type_of_estimator == 'regressor':
                eval_metric = 'rmse'
            elif self.type_of_estimator == 'classifier':
                if len(set(y_test)) > 2:
                    eval_metric = 'multi_logloss'
                else:
                    eval_metric = 'binary_logloss'

            verbose = True
            if self.is_hp_search == True:
                verbose = False

            if self.X_test is not None:
                eval_name = 'X_test_the_user_passed_in'
            else:
                eval_name = 'random_holdout_set_from_training_data'

            cat_feature_indices = self.get_categorical_feature_indices()
            if cat_feature_indices is None:
                self.model.fit(X_fit,
                               y,
                               eval_set=[(X_test, y_test)],
                               early_stopping_rounds=100,
                               eval_metric=eval_metric,
                               eval_names=[eval_name],
                               verbose=verbose)
            else:
                self.model.fit(X_fit,
                               y,
                               eval_set=[(X_test, y_test)],
                               early_stopping_rounds=100,
                               eval_metric=eval_metric,
                               eval_names=[eval_name],
                               categorical_feature=cat_feature_indices,
                               verbose=verbose)

        elif self.model_name[:8] == 'CatBoost':
            X_fit = X_fit.toarray()

            if self.type_of_estimator == 'classifier' and len(
                    pd.Series(y).unique()) > 2:
                # TODO: we might have to modify the format of the y values, converting them all to ints, then back again (sklearn has a useful inverse_transform on some preprocessing classes)
                self.model.set_params(loss_function='MultiClass')

            cat_feature_indices = self.get_categorical_feature_indices()

            self.model.fit(X_fit, y, cat_features=cat_feature_indices)

        elif self.model_name[:16] == 'GradientBoosting':
            if not sklearn_version > '0.18.1':
                X_fit = X_fit.toarray()

            patience = 20
            best_val_loss = -10000000000
            num_worse_rounds = 0
            best_model = deepcopy(self.model)
            X_fit, y, X_test, y_test = self.get_X_test(X_fit, y)

            # Add a variable number of trees each time, depending how far into the process we are
            if os.environ.get('is_test_suite', False) == 'True':
                num_iters = list(range(1, 50, 1)) + list(range(
                    50, 100, 2)) + list(range(100, 250, 3))
            else:
                num_iters = list(range(
                    1, 50, 1)) + list(range(50, 100, 2)) + list(
                        range(100, 250, 3)) + list(range(250, 500, 5)) + list(
                            range(500, 1000, 10)) + list(range(
                                1000, 2000, 20)) + list(range(
                                    2000, 10000, 100))
            # TODO: get n_estimators from the model itself, and reduce this list to only those values that come under the value from the model

            try:
                for num_iter in num_iters:
                    warm_start = True
                    if num_iter == 1:
                        warm_start = False

                    self.model.set_params(n_estimators=num_iter,
                                          warm_start=warm_start)
                    self.model.fit(X_fit, y)

                    if self.training_prediction_intervals == True:
                        val_loss = self.model.score(X_test, y_test)
                    else:
                        try:
                            val_loss = self._scorer.score(self, X_test, y_test)
                        except Exception as e:
                            val_loss = self.model.score(X_test, y_test)

                    if val_loss - self.min_step_improvement > best_val_loss:
                        best_val_loss = val_loss
                        num_worse_rounds = 0
                        best_model = deepcopy(self.model)
                    else:
                        num_worse_rounds += 1
                    print(
                        '[' + str(num_iter) +
                        '] random_holdout_set_from_training_data\'s score is: '
                        + str(round(val_loss, 3)))
                    if num_worse_rounds >= patience:
                        break
            except KeyboardInterrupt:
                print(
                    'Heard KeyboardInterrupt. Stopping training, and using the best checkpointed GradientBoosting model'
                )
                pass

            self.model = best_model
            print(
                'The number of estimators that were the best for this training dataset: '
                + str(self.model.get_params()['n_estimators']))
            print('The best score on the holdout set: ' + str(best_val_loss))

        else:
            self.model.fit(X_fit, y)

        if self.X_test is not None:
            del self.X_test
            del self.y_test
        return self

    def remove_categorical_values(self, features):
        """Collapse ``column=value`` feature names to their base column names.

        Args:
            features: Iterable of feature-name strings.

        Returns:
            A set containing, for every name, the text before its first ``=``
            (or the whole name when it contains no ``=``).
        """
        # str.partition yields the full string as its head when the separator
        # is absent, so both kinds of name are handled by one expression.
        return {name.partition('=')[0] for name in features}

    def verify_features(self, X, raw_features_only=False):
        """Report how the features in ``X`` differ from the training features.

        Categorical columns named in ``self.column_descriptions`` are expanded
        to ``name=value`` entries (the "=" separator is assumed to match the
        one used by the vectorizer at training time), date-derived training
        features are collapsed back to their raw column name, and ``nlp_``
        training features are ignored.

        Args:
            X: A dict or a pandas DataFrame holding the prediction data.
            raw_features_only: When True, ``name=value`` entries in the
                reported differences are reduced to ``name``.

        Returns:
            None when ``self.column_descriptions`` is None; otherwise a dict
            with the sets ``training_not_prediction`` and
            ``prediction_not_training``.

        Raises:
            TypeError: If ``X`` is neither a dict nor a pandas DataFrame.
        """
        if self.column_descriptions is None:
            print(
                'This feature is not enabled by default. Depending on the shape of the training data, it can add hundreds of KB to the saved file size.'
            )
            print(
                'Please pass in `ml_predictor.train(data, verify_features=True)` when training a model, and we will enable this function, at the cost of a potentially larger file size.'
            )
            warnings.warn(
                'Please pass verify_features=True when invoking .train() on the ml_predictor instance.'
            )
            return None

        print(
            '\n\nNow verifying consistency between training features and prediction features'
        )
        if isinstance(X, dict):
            prediction_features = set(X.keys())
        elif isinstance(X, pd.DataFrame):
            prediction_features = set(X.columns)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError; fail early with an actionable message.
            raise TypeError(
                'verify_features expects a dict or a pandas DataFrame, but received: '
                + str(type(X)))

        # If the user passed in categorical features, we will effectively one-hot-encode them ourselves here
        # Note that this assumes we're using the "=" as the separator in DictVectorizer/DataFrameVectorizer
        date_col_names = set()
        categorical_col_names = []
        for key, value in self.column_descriptions.items():
            if value == 'categorical' and 'day_part' not in key:
                try:
                    # This covers the case that the user passes in a value in column_descriptions that is not present in their prediction data
                    column_vals = X[key].unique()
                    for val in column_vals:
                        prediction_features.add(key + '=' + str(val))

                    categorical_col_names.append(key)
                except (KeyError, AttributeError, TypeError):
                    # Best-effort: a missing column (or a value without
                    # .unique(), e.g. a scalar inside a dict) is reported,
                    # not fatal. KeyboardInterrupt is no longer swallowed.
                    print(
                        '\nFound a column in your column_descriptions that is not present in your prediction data:'
                    )
                    print(key)

            elif 'day_part' in key:
                # We have found a date column. Make sure this date column is in our prediction data
                # It is outside the scope of this function to make sure that the same date parts are available in both our training and testing data
                raw_date_col_name = key[:key.index('day_part') - 1]
                date_col_names.add(raw_date_col_name)

            elif value == 'output':
                prediction_features.discard(key)

        # Now that we've added in all the one-hot-encoded categorical columns (name=val1, name=val2), remove the base name from our prediction data
        prediction_features = prediction_features - set(categorical_col_names)

        training_features = set(self.training_features)

        # Remove all of the transformed date column feature names from our training data
        features_to_remove = {
            feature for feature in training_features
            if any(raw_date_col_name in feature
                   for raw_date_col_name in date_col_names)
        }
        training_features = training_features - features_to_remove

        # Make sure the raw_date_col_name is in our training data after we have removed all the transformed feature names
        training_features = training_features | date_col_names

        # MVP means ignoring text features.
        # Collect the NLP features first and subtract them afterwards:
        # removing from a set while iterating over it raises RuntimeError.
        nlp_features = sorted(
            feature for feature in training_features if 'nlp_' in feature)
        training_features = training_features - set(nlp_features)

        if nlp_features:
            print('\n\nWe found an NLP column in the training data')
            print(
                'verify_features() currently does not support checking all of the values within an NLP column, so if the text of your NLP column has dramatically changed, you will have to check that yourself.'
            )
            print(
                'Here is one example of an NLP feature in the training data:')
            print(nlp_features[0])

        training_not_prediction = training_features - prediction_features

        if raw_features_only == True:
            training_not_prediction = self.remove_categorical_values(
                training_not_prediction)

        if len(training_not_prediction) > 0:

            print(
                '\n\nHere are the features this model was trained on that were not present in this prediction data:'
            )
            print(sorted(list(training_not_prediction)))
        else:
            print(
                'All of the features this model was trained on are included in the prediction data'
            )

        prediction_not_training = prediction_features - training_features
        if raw_features_only == True:
            prediction_not_training = self.remove_categorical_values(
                prediction_not_training)

        if len(prediction_not_training) > 0:

            # Separate out those values we were told to ignore by column_descriptions
            ignored_features = [
                feature for feature in prediction_not_training
                if self.column_descriptions.get(feature, 'False') == 'ignore'
            ]
            prediction_not_training = prediction_not_training - set(
                ignored_features)

            print(
                '\n\nHere are the features available in the prediction data that were not part of the training data:'
            )
            print(sorted(list(prediction_not_training)))

            if len(ignored_features) > 0:
                print(
                    '\n\nAdditionally, we found features in the prediction data that we were told to ignore in the training data'
                )
                print(sorted(list(ignored_features)))

        else:
            print(
                'All of the features in the prediction data were in this model\'s training data'
            )

        print('\n\n')
        return {
            'training_not_prediction': training_not_prediction,
            'prediction_not_training': prediction_not_training
        }

    def score(self, X, y, verbose=False):
        """Score the wrapped model on ``X``/``y``.

        Args:
            X: Feature matrix; a scipy sparse matrix is densified for model
                names that are listed below as needing dense input.
            y: Target values.
            verbose: Not used by this method.

        Returns:
            ``self._scorer.score(self, X, y)`` when a scorer is set and
            ``self.type_of_estimator`` is 'regressor' or 'classifier';
            ``self.model.score(X, y)`` when no scorer is set; otherwise None.
        """
        # At the time of writing this, GradientBoosting does not support sparse matrices for predictions
        dense_only_names = [
            'BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit',
            'ARDRegression'
        ]
        wants_dense = (self.model_name[:16] == 'GradientBoosting'
                       or self.model_name in dense_only_names)
        if wants_dense and scipy.sparse.issparse(X):
            X = X.todense()

        if self._scorer is None:
            return self.model.score(X, y)

        # Regressors and classifiers are scored the same way; any other
        # estimator type yields None.
        if self.type_of_estimator in ('regressor', 'classifier'):
            return self._scorer.score(self, X, y)
        return None

    def predict_proba(self, X, verbose=False):
        """Return per-class probabilities from the wrapped model.

        Falls back to ``self.model.predict`` when the model raises
        AttributeError on ``predict_proba``, and retries with densified input
        when a call raises TypeError.

        Args:
            X: Feature matrix exposing ``.shape``; sparse input is converted
                for the model names that are listed below.
            verbose: Not used by this method.

        Returns:
            The predictions, or only the first prediction when ``X`` has
            exactly one row.
        """
        name = self.model_name
        dense_only = (name[:16] == 'GradientBoosting'
                      or name[:12] == 'DeepLearning'
                      or name in [
                          'BayesianRidge', 'LassoLars',
                          'OrthogonalMatchingPursuit', 'ARDRegression'
                      ])
        array_only = name[:8] == 'CatBoost' or name[:4] == 'LGBM'

        if scipy.sparse.issparse(X):
            if dense_only:
                X = X.todense()
            elif array_only:
                X = X.toarray()

        try:
            if name[:4] == 'LGBM':
                # The attribute name differs between LightGBM interfaces.
                try:
                    best_iteration = self.model.best_iteration
                except AttributeError:
                    best_iteration = self.model.best_iteration_
                predictions = self.model.predict_proba(
                    X, num_iteration=best_iteration)
            else:
                predictions = self.model.predict_proba(X)

        except AttributeError:
            try:
                predictions = self.model.predict(X)
            except TypeError:
                if scipy.sparse.issparse(X):
                    X = X.todense()
                predictions = self.model.predict(X)

        except TypeError:
            if scipy.sparse.issparse(X):
                X = X.todense()
            predictions = self.model.predict_proba(X)

        # If this model does not have predict_proba, and we have fallen back on predict, give results back in the format expected from predict_proba: each prediction is a list of per-class probabilities.
        # Note that this DOES NOT WORK for multi-label problems, or problems that are not reduced to 0,1
        first = predictions[0]
        row_like = hasattr(first, '__iter__') and not isinstance(first, str)
        if not row_like:
            predictions = [[0, 1] if label == 1 else [1, 0]
                           for label in predictions]

        # Libraries like Keras may return a single probability per row for a binary problem; expand it to both classes.
        if len(predictions[0]) == 1:
            predictions = [[1 - row[0], row[0]] for row in predictions]

        return predictions[0] if X.shape[0] == 1 else predictions

    def predict(self, X, verbose=False):
        """Predict with the wrapped model and unwrap trivial containers.

        Args:
            X: Feature matrix; sparse input is converted for the model names
                that are listed below.
            verbose: Not used by this method.

        Returns:
            A list of predictions, or the bare prediction when there is
            exactly one (or when the model returned a 0-d array).
        """
        name = self.model_name
        dense_only = (name[:16] == 'GradientBoosting'
                      or name[:12] == 'DeepLearning'
                      or name in [
                          'BayesianRidge', 'LassoLars',
                          'OrthogonalMatchingPursuit', 'ARDRegression'
                      ])

        X_predict = X
        if dense_only and scipy.sparse.issparse(X):
            X_predict = X.todense()
        elif name[:8] == 'CatBoost' and scipy.sparse.issparse(X):
            X_predict = X.toarray()

        if name[:4] != 'LGBM':
            predictions = self.model.predict(X_predict)
        else:
            # The attribute name differs between LightGBM interfaces.
            try:
                best_iteration = self.model.best_iteration
            except AttributeError:
                best_iteration = self.model.best_iteration_
            # LGBM names never match the conversions above, so X_predict is X.
            predictions = self.model.predict(X_predict,
                                             num_iteration=best_iteration)

        # Handle cases of getting a prediction for a single item.
        # It makes a cleaner interface to hand back the single prediction itself rather than a list hiding it.
        if isinstance(predictions, np.ndarray):
            predictions = predictions.tolist()
            # tolist() on a 0-d array yields a bare scalar.
            if isinstance(predictions, (float, int, str)):
                return predictions

        # Flatten a column of one-element rows into a flat list.
        if isinstance(predictions[0], list) and len(predictions[0]) == 1:
            predictions = [row[0] for row in predictions]

        return predictions[0] if len(predictions) == 1 else predictions

    def predict_intervals(self, X, return_type=None):
        """Return the base prediction together with each interval prediction.

        Args:
            X: Input rows; row count is ``X.shape[0]`` for scipy sparse input
                and ``len(X)`` otherwise.
            return_type: One of None, 'dict', 'df', 'dataframe' or 'list'.
                With None, a single row yields a dict and more than one row
                yields a DataFrame.

        Returns:
            A dict, a pandas DataFrame or a list, depending on
            ``return_type``.

        Raises:
            ValueError: If ``self.interval_predictors`` is None, or if no
                branch matches ``return_type`` (including None with zero
                rows).
        """
        if self.interval_predictors is None:
            print(
                '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
            )
            print('This model was not trained to predict intervals')
            print(
                'Please follow the documentation to tell this model at training time to learn how to predict intervals'
            )
            print(
                '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
            )
            raise ValueError('This model was not trained to predict intervals')

        base_prediction = self.predict(X)

        # Each entry of interval_predictors is indexed as (name, predictor).
        result = {'prediction': base_prediction}
        result.update(
            {tup[0]: tup[1].predict(X)
             for tup in self.interval_predictors})

        len_input = X.shape[0] if scipy.sparse.issparse(X) else len(X)

        if return_type == 'dict' or (return_type is None and len_input == 1):
            return result

        if (return_type == 'df' or return_type == 'dataframe'
                or (return_type is None and len_input > 1)):
            return pd.DataFrame(result)

        if return_type == 'list':
            names = [tup[0] for tup in self.interval_predictors]
            if len_input == 1:
                return [base_prediction] + [result[name] for name in names]
            return [[base_prediction[idx]] +
                    [result[name][idx] for name in names]
                    for idx in range(len_input)]

        print(
            'Please pass in a return_type value of one of the following: ["dict", "dataframe", "df", "list"]'
        )
        raise ValueError(
            'Please pass in a return_type value of one of the following: ["dict", "dataframe", "df", "list"]'
        )

    # transform is initially designed to be used with feature_learning
    def transform(self, X):
        """Append this model's predictions to ``X`` as extra columns.

        Args:
            X: Feature matrix accepted by ``scipy.sparse.hstack``.

        Returns:
            A CSR matrix made of ``X`` followed by ``list(self.predict(X))``.
        """
        extra_columns = list(self.predict(X))
        return scipy.sparse.hstack([X, extra_columns], format='csr')

    # Allows the user to get the fully transformed data
    def transform_only(self, X):
        """Return ``X`` unchanged; this step applies no transformation."""
        return X

    def predict_uncertainty(self, X):
        """Predict with the base model and estimate each prediction's uncertainty.

        The base predictions are appended to ``X`` as an extra column and the
        result is passed to ``self.uncertainty_model.predict_proba``. When
        ``self.uc_results`` is set, the calibration values of the matching
        bucket (minus ``max_proba``) are attached to the output.

        ``self.uc_results`` is read as a dict keyed by consecutive bucket
        numbers starting at 1, each value a dict containing ``max_proba``.

        Args:
            X: Feature matrix accepted by ``scipy.sparse.hstack``.

        Returns:
            A pandas DataFrame when the base prediction is iterable, otherwise
            a dict. Both hold ``base_prediction`` and
            ``uncertainty_prediction`` plus any calibration values.

        Raises:
            ValueError: If ``self.uncertainty_model`` is None.
        """
        if self.uncertainty_model is None:
            print(
                '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
            )
            print('This model was not trained to predict uncertainties')
            print(
                'Please follow the documentation to tell this model at training time to learn how to predict uncertainties'
            )
            print(
                '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
            )
            raise ValueError(
                'This model was not trained to predict uncertainties')

        def _calibration_bucket(proba):
            # Return the first bucket whose max_proba is >= proba. Bucket 1
            # covers proba <= 0 and the last bucket covers values above every
            # max_proba; the old loop left a stale/undefined result in the
            # first case and raised KeyError in the second.
            bucket_num = 1
            bucket = self.uc_results[bucket_num]
            while (proba > bucket['max_proba']
                   and (bucket_num + 1) in self.uc_results):
                bucket_num += 1
                bucket = self.uc_results[bucket_num]
            return bucket

        base_predictions = self.predict(X)
        is_batch = isinstance(base_predictions, Iterable)

        if is_batch:
            base_predictions_col = np.array(
                [[val] for val in base_predictions])
        else:
            # Explicit 1x1 column so hstack always receives a 2-D block.
            base_predictions_col = [[base_predictions]]

        X_combined = scipy.sparse.hstack([X, base_predictions_col],
                                         format='csr')

        uncertainty_predictions = self.uncertainty_model.predict_proba(
            X_combined)

        results = {
            'base_prediction': base_predictions,
            'uncertainty_prediction': uncertainty_predictions
        }

        if is_batch:
            # Keep only the second entry of every probability row.
            results['uncertainty_prediction'] = [
                row[1] for row in results['uncertainty_prediction']
            ]

            results = pd.DataFrame.from_dict(results, orient='columns')

            if self.uc_results is not None:
                # One output column per property found in the first bucket.
                calibration_results = {key: [] for key in self.uc_results[1]}

                for proba in results['uncertainty_prediction']:
                    for key, value in _calibration_bucket(proba).items():
                        calibration_results[key].append(value)

                df_calibration_results = pd.DataFrame.from_dict(
                    calibration_results, orient='columns')
                del df_calibration_results['max_proba']

                results = pd.concat([results, df_calibration_results], axis=1)

        elif self.uc_results is not None:
            # This branch used to read an undefined `proba` and always raised
            # NameError.
            # NOTE(review): assumes the last value returned by predict_proba
            # for the single row is the probability that row[1] selects in the
            # batch branch - confirm against the uncertainty model's output.
            proba = np.asarray(uncertainty_predictions,
                               dtype=float).reshape(-1)[-1]
            results.update({
                key: value
                for key, value in _calibration_bucket(proba).items()
                if key != 'max_proba'
            })

        return results

    def score_uncertainty(self, X, y, verbose=False):
        """Score the uncertainty model on ``X``/``y``.

        Args:
            X: Features passed to ``self.uncertainty_model.score``.
            y: Targets passed to ``self.uncertainty_model.score``.
            verbose: Forwarded to the uncertainty model. It was previously
                accepted but replaced by a hard-coded False.

        Returns:
            Whatever ``self.uncertainty_model.score`` returns.
        """
        return self.uncertainty_model.score(X, y, verbose=verbose)

    def get_categorical_feature_indices(self):
        """Locate the categorical columns inside ``self.training_features``.

        Returns:
            None unless ``self.keep_cat_features == True``; otherwise a list
            with the position in ``self.training_features`` of every key of
            ``self.column_descriptions`` whose value is 'categorical', in
            dict order.

        Raises:
            ValueError: If a categorical column name is missing from
                ``self.training_features`` (raised by ``list.index``).
        """
        if self.keep_cat_features == True:
            positions = []
            for col_name, col_type in self.column_descriptions.items():
                if col_type == 'categorical':
                    positions.append(self.training_features.index(col_name))
            return positions

        return None

    def get_X_test(self, X_fit, y):
        """Return training data together with a holdout set.

        Args:
            X_fit: Training features.
            y: Training targets.

        Returns:
            A tuple ``(X_fit, y, X_test, y_test)``. When ``self.X_test`` is
            set, the inputs are returned untouched alongside ``self.X_test``
            and ``self.y_test``; otherwise ``train_test_split`` carves a
            holdout out of the inputs with ``test_size=0.15``.
        """
        if self.X_test is None:
            X_fit, X_test, y, y_test = train_test_split(X_fit,
                                                        y,
                                                        test_size=0.15)
            return X_fit, y, X_test, y_test

        return X_fit, y, self.X_test, self.y_test
Esempio n. 27
0
    return model

from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

# Map the raw labels in train_y to integer ids.
encoder = LabelEncoder()
encoder.fit(train_y)
encoded_y = encoder.transform(train_y)

# Expand the integer ids with Keras' to_categorical and show the resulting shape.
dummy_y = np_utils.to_categorical(encoded_y)
print(dummy_y.shape)

# NOTE(review): the keyword is spelled `nb_epochs` here - confirm the installed Keras wrapper accepts that spelling.
estimator = KerasClassifier(build_fn=baseline_model, nb_epochs=10, batch_size=64)
# Fit on the first 3321 rows of sentence_vectors, with validation_split=0.05.
estimator.fit(sentence_vectors[0:3321], dummy_y, validation_split=0.05)

# Predict class probabilities for the rows from index 3321 onwards.
y_pred = estimator.predict_proba(sentence_vectors[3321:])

""" Submission """
# One probability column per class plus the id column, written to CSV.
submission = pd.DataFrame(y_pred)
submission['id'] = test_index
submission.columns = ['class1', 'class2', 'class3', 'class4', 'class5', 'class6', 'class7', 'class8', 'class9', 'id']
submission.to_csv("submission_keras_classify.csv",index=False)





   

# Script entry point. NOTE(review): `main` is not defined in the visible part of this snippet - confirm it exists.
if __name__=='__main__':
    main()
Esempio n. 28
0
# In[27]:

# Cross-validate the Keras classifier on a shuffled StratifiedKFold with 100 splits.
estimator = KerasClassifier(build_fn=create_baseline,
                            epochs=5,
                            batch_size=100,
                            verbose=0)
kfold = StratifiedKFold(n_splits=100, shuffle=True, random_state=seed)
results = cross_val_score(estimator, xtrain, ytrain, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))

# In[28]:

# Refit a fresh classifier on the training data and plot the ROC curve for xtest.
estimator = KerasClassifier(build_fn=create_baseline,
                            epochs=5,
                            batch_size=100,
                            verbose=0)
estimator.fit(xtrain, ytrain)
y_pred_proba = estimator.predict_proba(xtest)
skplt.metrics.plot_roc_curve(ytest, y_pred_proba)
plt.show()
# NOTE(review): `results` still holds the cross-validation scores computed above, not a score for this fit.
print("Neural Network Results: %.2f%% (%.2f%%)" %
      (results.mean() * 100, results.std() * 100))

# In[29]:

# Same ROC plot, this time for the training data.
y_pred_proba = estimator.predict_proba(xtrain)
skplt.metrics.plot_roc_curve(ytrain, y_pred_proba)
plt.show()
# NOTE(review): this also prints the cross-validation `results`, not a training-set score.
print("Neural Network Results Training: %.2f%% (%.2f%%)" %
      (results.mean() * 100, results.std() * 100))
Esempio n. 29
0
                        batch_size=128,
                        verbose=0)
    acc_history = history.history['val_accuracy']
    all_acc_histories.append(acc_history)
    print(i, ' 폴드 끝남')

### Score check
# Average the per-fold validation accuracy at each epoch index.
average_acc_history = [
    np.mean([x[i] for x in all_acc_histories]) for i in range(num_epochs)
]

# Plot the averaged validation accuracy against the 1-based epoch number.
plt.plot(range(1, len(average_acc_history) + 1), average_acc_history)
plt.xlabel('Epochs')
plt.ylabel('Validation ACC')
plt.show()

### Fit the model
# NOTE(review): the keyword here is `nb_epoch` - confirm the installed Keras wrapper accepts that spelling.
model = KerasClassifier(build_fn=build_model,
                        nb_epoch=100,
                        batch_size=128,
                        verbose=0)
model.fit(x_train, one_hot_train_labels)

### Get prediction
predictions = model.predict_proba(x_test)
# The two bare expressions below only display a value in an interactive session.
predictions
predictions.shape

pd.DataFrame(predictions).to_csv('predictions.csv', index=False)
# Score of 2.36 on Kaggle; roughly 600th place overall.
Esempio n. 30
0
conn.close()

# In[ ]:

# Show a reproducible 12-row sample of the SQL output.
print(sql_output.sample(12, random_state=1960))

# # Keras Prediction

# In[ ]:

# Assemble one frame with a row KEY, Score_* columns, and Proba_* columns for every class.
keras_output = pd.DataFrame()
keras_output_key = pd.DataFrame(list(range(x_test.shape[0])), columns=['KEY'])
# The Score_* frame is created with column names only and no rows.
keras_output_score = pd.DataFrame(
    columns=['Score_' + str(x) for x in range(num_classes)])
keras_output_proba = pd.DataFrame(
    clf.predict_proba(x_test),
    columns=['Proba_' + str(x) for x in range(num_classes)])
keras_output = pd.concat(
    [keras_output_key, keras_output_score, keras_output_proba], axis=1)
# Add the natural log of each class probability as LogProba_<class>.
for class_label in range(num_classes):
    keras_output['LogProba_' + str(class_label)] = np.log(
        keras_output_proba['Proba_' + str(class_label)])
keras_output['Decision'] = clf.predict(x_test)
print(keras_output.sample(12, random_state=1960))

# # Comparing the SQL and Keras Predictions

# In[ ]:

sql_keras_join = keras_output.join(sql_output,
                                   how='left',
Esempio n. 31
0
# print "Cross validation results:", (results.mean()*100), (results.std()*100)
# NOTE: this snippet uses Python 2 print statements.
model.fit(features_train.values, labels_train.values)

print "Model building complete:",round((time()-t0)/60,3),"m"

# print len(np.unique(train.user_id)), len(np.unique(test.user_id))

# features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features_train, labels_train, test_size=0.60)

# # neigh = neighbors.KNeighborsClassifier(weights='distance', n_jobs=-1).fit(train[features], train['hotel_cluster'])
# forest = ensemble.RandomForestClassifier(n_estimators=10, n_jobs=-1).fit(train[features], train['hotel_cluster'])
# # bayes = naive_bayes.GaussianNB().fit(train[features], train['hotel_cluster'])

# Restart the timer for the prediction phase.
t0 = time()
print "Predicting probabilities..."
probs = pd.DataFrame(model.predict_proba(features_test.values, batch_size=32))

# probs = pd.DataFrame(forest.predict_proba(test[features]))
# Label the probability columns with the sorted unique training labels.
probs.columns = np.unique(labels_train.sort_values().values)
# probs.columns = np.unique(labels_train.sort_values().values)
# probs.columns = np.unique(labels_train.values)
# For every row keep the labels of the five highest probabilities.
preds = pd.DataFrame([list([r.sort_values(ascending=False)[:5].index.values]) for i,r in probs.iterrows()])
print "Mapk score for model:", mapk([[l] for l in labels_test], preds[0], 5)
print "Probablity prediction complete:",round((time()-t0)/60,3),"m"

# t0 = time()
# print "Creating submission..."
# submission = pd.DataFrame()
# submission['id'] = test['id']
# submission['hotel_cluster'] = [' '.join(str(x) for x in y) for y in preds.values]
# submission.sort_values(by='id', inplace=True)
Esempio n. 32
0
class FinalModelATC(BaseEstimator, TransformerMixin):
    def __init__(self,
                 model,
                 model_name=None,
                 ml_for_analytics=False,
                 type_of_estimator='classifier',
                 output_column=None,
                 name=None,
                 scoring_method=None,
                 training_features=None,
                 column_descriptions=None):

        self.model = model
        self.model_name = model_name
        self.ml_for_analytics = ml_for_analytics
        self.type_of_estimator = type_of_estimator
        self.name = name
        self.training_features = training_features
        self.column_descriptions = column_descriptions

        if self.type_of_estimator == 'classifier':
            self._scorer = scoring_method
        else:
            self._scorer = scoring_method

    def fit(self, X, y):
        """Train the wrapped model on ``X`` and ``y`` and return ``self``.

        ``model_name`` is refreshed from the model first. The model families
        listed in the body are always given dense input. For ``DeepLearning*``
        models the Keras wrapper is rebuilt with the number of input columns,
        and training stops early once the training loss stops improving.

        Args:
            X: Feature matrix; may be a scipy sparse matrix.
            y: Training targets.

        Returns:
            This instance.

        Raises:
            TypeError: If a DeepLearning model is requested while Keras is
                not available, or if the retry on dense input fails too.

        A KeyboardInterrupt raised during training is caught on purpose, so
        an interrupted call still returns this instance.
        """
        self.model_name = get_name_from_model(self.model)
        is_deep_learning = self.model_name.startswith('DeepLearning')

        X_fit = X

        if is_deep_learning or self.model_name in [
                'BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit',
                'ARDRegression', 'Perceptron', 'PassiveAggressiveClassifier',
                'SGDClassifier', 'RidgeClassifier', 'LogisticRegression'
        ]:
            if scipy.sparse.issparse(X_fit):
                # toarray() gives a plain ndarray; todense() would give the
                # deprecated np.matrix type.
                X_fit = X_fit.toarray()

            if is_deep_learning:
                if not keras_installed:
                    print(
                        'WARNING: We did not detect that Keras was available.')
                    raise TypeError(
                        'A DeepLearning model was requested, but Keras was not available to import'
                    )

                # For Keras, we need to tell it how many input nodes to
                # expect, which is our number of columns.
                num_cols = X_fit.shape[1]

                model_params = self.model.get_params()
                del model_params['build_fn']

                if self.type_of_estimator == 'regressor':
                    self.model = KerasRegressor(
                        build_fn=utils_models.make_deep_learning_model,
                        num_cols=num_cols,
                        **model_params)
                elif self.type_of_estimator == 'classifier':
                    self.model = KerasClassifier(
                        build_fn=utils_models.make_deep_learning_classifier,
                        num_cols=num_cols,
                        **model_params)

        try:
            if is_deep_learning:
                print(
                    'Stopping training early if we have not seen an improvement in training loss in 25 epochs'
                )
                from keras.callbacks import EarlyStopping
                early_stopping = EarlyStopping(monitor='loss',
                                               patience=25,
                                               verbose=1)
                self.model.fit(X_fit, y, callbacks=[early_stopping])
            else:
                self.model.fit(X_fit, y)

        except TypeError:
            # Retry once on a dense copy of the data. The retry is a plain
            # fit() call, without the early-stopping callback.
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.toarray()
            self.model.fit(X_fit, y)

        except KeyboardInterrupt:
            # Deliberate best-effort: let the user cut training short.
            print(
                'Training was interrupted by the user; continuing with the model as it is'
            )

        return self

    def remove_categorical_values(self, features):
        """Collapse one-hot feature names to their raw column names.

        A name shaped like ``column=value`` is cut at its first ``=`` and
        reduced to ``column``; a name without ``=`` is kept whole. The result
        is a set, so several values of one column collapse into one entry.
        """
        return {name.partition('=')[0] for name in features}

    def verify_features(self, X, raw_features_only=False):

        if self.column_descriptions is None:
            print(
                'This feature is not enabled by default. Depending on the shape of the training data, it can add hundreds of KB to the saved file size.'
            )
            print(
                'Please pass in `ml_predictor.train(data, verify_features=True)` when training a model, and we will enable this function, at the cost of a potentially larger file size.'
            )
            warnings.warn(
                'Please pass verify_features=True when invoking .train() on the ml_predictor instance.'
            )
            return None

        print(
            '\n\nNow verifying consistency between training features and prediction features'
        )
        if isinstance(X, dict):
            prediction_features = set(X.keys())
        elif isinstance(X, pd.DataFrame):
            prediction_features = set(X.columns)

        # If the user passed in categorical features, we will effectively one-hot-encode them ourselves here
        # Note that this assumes we're using the "=" as the separater in DictVectorizer/DataFrameVectorizer
        date_col_names = []
        categorical_col_names = []
        for key, value in self.column_descriptions.items():
            if value == 'categorical' and 'day_part' not in key:
                try:
                    # This covers the case that the user passes in a value in column_descriptions that is not present in their prediction data
                    column_vals = X[key].unique()
                    for val in column_vals:
                        prediction_features.add(key + '=' + str(val))

                    categorical_col_names.append(key)
                except:
                    print(
                        '\nFound a column in your column_descriptions that is not present in your prediction data:'
                    )
                    print(key)

            elif 'day_part' in key:
                # We have found a date column. Make sure this date column is in our prediction data
                # It is outside the scope of this function to make sure that the same date parts are available in both our training and testing data
                raw_date_col_name = key[:key.index('day_part') - 1]
                date_col_names.append(raw_date_col_name)

            elif value == 'output':
                try:
                    prediction_features.remove(key)
                except KeyError:
                    pass

        # Now that we've added in all the one-hot-encoded categorical columns (name=val1, name=val2), remove the base name from our prediction data
        prediction_features = prediction_features - set(categorical_col_names)

        # Get only the unique raw_date_col_names
        date_col_names = set(date_col_names)

        training_features = set(self.training_features)

        # Remove all of the transformed date column feature names from our training data
        features_to_remove = []
        for feature in training_features:
            for raw_date_col_name in date_col_names:
                if raw_date_col_name in feature:
                    features_to_remove.append(feature)
        training_features = training_features - set(features_to_remove)

        # Make sure the raw_date_col_name is in our training data after we have removed all the transformed feature names
        training_features = training_features | date_col_names

        # MVP means ignoring text features
        print_nlp_warning = False
        nlp_example = None
        for feature in training_features:
            if 'nlp_' in feature:
                print_nlp_warning = True
                nlp_example = feature
                training_features.remove(feature)

        if print_nlp_warning == True:
            print('\n\nWe found an NLP column in the training data')
            print(
                'verify_features() currently does not support checking all of the values within an NLP column, so if the text of your NLP column has dramatically changed, you will have to check that yourself.'
            )
            print(
                'Here is one example of an NLP feature in the training data:')
            print(nlp_example)

        training_not_prediction = training_features - prediction_features

        if raw_features_only == True:
            training_not_prediction = self.remove_categorical_values(
                training_not_prediction)

        if len(training_not_prediction) > 0:

            print(
                '\n\nHere are the features this model was trained on that were not present in this prediction data:'
            )
            print(sorted(list(training_not_prediction)))
        else:
            print(
                'All of the features this model was trained on are included in the prediction data'
            )

        prediction_not_training = prediction_features - training_features
        if raw_features_only == True:
            prediction_not_training = self.remove_categorical_values(
                prediction_not_training)

        if len(prediction_not_training) > 0:

            # Separate out those values we were told to ignore by column_descriptions
            ignored_features = []
            for feature in prediction_not_training:
                if self.column_descriptions.get(feature, 'False') == 'ignore':
                    ignored_features.append(feature)
            prediction_not_training = prediction_not_training - set(
                ignored_features)

            print(
                '\n\nHere are the features available in the prediction data that were not part of the training data:'
            )
            print(sorted(list(prediction_not_training)))

            if len(ignored_features) > 0:
                print(
                    '\n\nAdditionally, we found features in the prediction data that we were told to ignore in the training data'
                )
                print(sorted(list(ignored_features)))

        else:
            print(
                'All of the features in the prediction data were in this model\'s training data'
            )

        print('\n\n')
        return {
            'training_not_prediction': training_not_prediction,
            'prediction_not_training': prediction_not_training
        }

    def score(self, X, y, verbose=False):
        # At the time of writing this, GradientBoosting does not support sparse matrices for predictions
        if (self.model_name[:16] == 'GradientBoosting' or self.model_name in [
                'BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit',
                'ARDRegression'
        ]) and scipy.sparse.issparse(X):
            X = X.todense()

        if self._scorer is not None:
            if self.type_of_estimator == 'regressor':
                return self._scorer.score(self, X, y)
            elif self.type_of_estimator == 'classifier':
                return self._scorer.score(self, X, y)

        else:
            return self.model.score(X, y)

    def predict_proba(self, X, verbose=False):

        # if self.model_name[:3] == 'XGB' and scipy.sparse.issparse(X):
        #     ones = [[1] for x in range(X.shape[0])]
        #     # Trying to force XGBoost to play nice with sparse matrices
        #     X = scipy.sparse.hstack((X, ones))

        if (self.model_name[:16] == 'GradientBoosting' or self.model_name[:12]
                == 'DeepLearning' or self.model_name in [
                    'BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit',
                    'ARDRegression'
                ]) and scipy.sparse.issparse(X):
            X = X.todense()

        try:
            predictions = self.model.predict_proba(X)

        except AttributeError as e:
            # print('This model has no predict_proba method. Returning results of .predict instead.')
            try:
                predictions = self.model.predict(X)
            except TypeError as e:
                if scipy.sparse.issparse(X):
                    X = X.todense()
                predictions = self.model.predict(X)

        except TypeError as e:
            if scipy.sparse.issparse(X):
                X = X.todense()
            predictions = self.model.predict_proba(X)

        # If this model does not have predict_proba, and we have fallen back on predict, we want to make sure we give results back in the same format the user would expect for predict_proba, namely each prediction is a list of predicted probabilities for each class.
        # Note that this DOES NOT WORK for multi-label problems, or problems that are not reduced to 0,1
        # If this is not an iterable (ignoring strings, which might be iterable), then we will want to turn our predictions into tupled predictions
        if not (hasattr(predictions[0], '__iter__')
                and not isinstance(predictions[0], str)):
            tupled_predictions = []
            for prediction in predictions:
                if prediction == 1:
                    tupled_predictions.append([0, 1])
                else:
                    tupled_predictions.append([1, 0])
            predictions = tupled_predictions

        # This handles an annoying edge case with libraries like Keras that, for a binary classification problem, with return a single predicted probability in a list, rather than the probability of both classes in a list
        if len(predictions[0]) == 1:
            tupled_predictions = []
            for prediction in predictions:
                tupled_predictions.append([1 - prediction[0], prediction[0]])
            predictions = tupled_predictions

        if X.shape[0] == 1:
            return predictions[0]
        else:
            return predictions

    def predict(self, X, verbose=False):

        # if self.model_name[:3] == 'XGB' and scipy.sparse.issparse(X):
        #     ones = [[1] for x in range(X.shape[0])]
        #     # Trying to force XGBoost to play nice with sparse matrices
        #     X_predict = scipy.sparse.hstack((X, ones))

        if (self.model_name[:16] == 'GradientBoosting' or self.model_name[:12]
                == 'DeepLearning' or self.model_name in [
                    'BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit',
                    'ARDRegression'
                ]) and scipy.sparse.issparse(X):
            X_predict = X.todense()

        else:
            X_predict = X

        prediction = self.model.predict(X_predict)
        # Handle cases of getting a prediction for a single item.
        # It makes a cleaner interface just to get just the single prediction back, rather than a list with the prediction hidden inside.
        if len(prediction) == 1:
            return prediction[0]
        else:
            return prediction
Esempio n. 33
0
# Instantiate the stratified K-fold splitter and the prediction placeholders:
# one column per fold for the test set, one slot per training sample for the
# out-of-fold probabilities.
k = 5
kf = StratifiedKFold(k)
predictions = np.zeros((X_test.shape[0], k))
predictions_test = np.zeros((X_test.shape[0], k))
predictions_train = np.zeros(X_train.shape[0])

# For each fold, refit the network on the training part and store its
# predictions for the test set and for the held-out part.
test_score = 0.0
for i, (train_index, test_index) in enumerate(kf.split(X_train, Y_train)):
    nn.fit(X_train[train_index], Y_train[train_index])
    # Y_pred / Y_pred_train only feed the (currently disabled) per-fold
    # score report below.
    Y_pred = nn.predict(X_train[test_index])[:, 0]
    Y_pred_train = nn.predict(X_train[train_index])[:, 0]
    predictions[:, i] = nn.predict(X_test)[:, 0]
    predictions_test[:, i] = nn.predict_proba(X_test)[:, 1]
    predictions_train[test_index] = nn.predict_proba(X_train[test_index])[:, 1]
    # current_test_score = f1_score(Y_train[test_index], Y_pred)[:, 0]
    # test_score += current_test_score
    # print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
    # print("test: " + str(current_test_score))
# print("CV test score: "+str(test_score/k))

# save submission file
# Majority vote across the folds: a sample is labelled 1 when more than half
# of the k per-fold predictions are 1. The threshold follows k rather than
# being fixed at 2.5; 2.0 keeps the division a float under Python 2 as well.
Y_test = (np.sum(predictions, axis=1) > k / 2.0).astype(int)
submission = pd.DataFrame(Y_test)
submission.to_csv(path_or_buf=path_to_submissions +
                  "-".join(my_features_acronym) + "nn_deep.csv",
                  index=True,
                  index_label="id",