Example #1
def run_pipeline_anova_workflow():
    name = "pipeline scikit example"
    author = "srinidhi"
    description = "anova filter pipeline"
    syncer_obj = Syncer(
        NewOrExistingProject(name, author, description),
        DefaultExperiment(),
        NewExperimentRun("Abc"))

    # import some data to play with
    X, y = samples_generator.make_classification(
        n_informative=5, n_redundant=0, random_state=42)

    x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
        X, y, test_size=0.3, random_state=0)
    syncer_obj.add_tag(X, "samples generated data")
    syncer_obj.add_tag(x_train, "training data")
    syncer_obj.add_tag(x_test, "testing data")

    # ANOVA SVM-C
    # 1) anova filter, take 5 best ranked features
    anova_filter = SelectKBest(f_regression, k=5)
    syncer_obj.add_tag(anova_filter, "Anova filter, with k=5")
    # 2) svm
    clf = svm.SVC(kernel='linear')
    syncer_obj.add_tag(clf, "SVC with linear kernel")
    anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)])

    syncer_obj.add_tag(anova_svm, "Pipeline with anova_filter and SVC")

    # Fit the pipeline on the training set
    anova_svm.fit_sync(x_train, y_train)
    y_pred = anova_svm.predict(x_test)
    # Compute metrics for the model on the testing set
    f1 = SyncableMetrics.compute_metrics(
        anova_svm, f1_score, y_test, y_pred, x_test, "predictionCol",
        'label_col')
    precision = SyncableMetrics.compute_metrics(
        anova_svm, precision_score, y_test, y_pred, x_test, "predictionCol",
        'label_col')
    syncer_obj.sync()
    return syncer_obj, f1, precision, x_train, x_test
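For reference, this is what the workflow does once the ModelDB *_sync instrumentation is stripped away; a minimal plain scikit-learn sketch (module paths assume a recent scikit-learn):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score

# Generate the same toy classification data and split it 70/30
X, y = make_classification(n_informative=5, n_redundant=0, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

# ANOVA filter (5 best-ranked features) followed by a linear SVM
anova_svm = Pipeline([('anova', SelectKBest(f_regression, k=5)),
                      ('svc', SVC(kernel='linear'))])
anova_svm.fit(x_train, y_train)
y_pred = anova_svm.predict(x_test)
print('f1:', f1_score(y_test, y_pred),
      'precision:', precision_score(y_test, y_pred))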
Example #2
def run_linear_model_workflow():
    """
    Sample workflow using OneHotEncoder and LinearRegression.
    """
    syncer_obj = Syncer.create_syncer("test1", "test_user",
                                      "pandas-linear-regression")

    data, target = load_pandas_dataset()
    syncer_obj.add_tag(data, "occupation dataset")

    # Hot encode occupation column of data
    hot_enc = preprocessing.OneHotEncoder()
    syncer_obj.add_tag(hot_enc, "Hot encoding occupation column")

    hot_enc.fit_sync(data['occupation'].reshape(-1, 1))
    hot_enc_rows = hot_enc.transform_sync(data['occupation'].reshape(-1, 1))
    hot_enc_df = pd.DataFrame(hot_enc_rows.toarray())

    # Drop column as it is now encoded
    dropped_data = data.drop_sync('occupation', axis=1)
    # Join the hot encoded rows with the rest of the data
    data = dropped_data.join(hot_enc_df)

    x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
        data, target, test_size=0.3, random_state=1)

    syncer_obj.add_tag(x_train, "training data - 70%")
    syncer_obj.add_tag(x_test, "testing data - 30%")

    model = linear_model.LinearRegression()
    syncer_obj.add_tag(model, "Basic linear reg")

    model.fit_sync(x_train, y_train)
    y_pred = model.predict_sync(x_test)

    mean_error = SyncableMetrics.compute_metrics(model, mean_squared_error,
                                                 y_test, y_pred, x_test, "",
                                                 'affairs')

    # Sync all the events to database
    syncer_obj.sync()

    # Certain variables are returned so they can be used for unittests below.
    return syncer_obj, x_test, mean_error, dropped_data
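Newer pandas versions no longer provide Series.reshape, which the function above uses on the occupation column; on a current pandas the same step goes through the underlying numpy array instead. A minimal sketch, reusing data and hot_enc from the function (this is an adaptation, not part of the original example):

occupation = data['occupation'].values.reshape(-1, 1)  # 2-D array with one feature column
hot_enc.fit_sync(occupation)
hot_enc_rows = hot_enc.transform_sync(occupation)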
Example #3
orig = pd.read_csv_sync(DATA_PATH + 'adult_with_colnames.csv', index_col=0)
[train, test] = cross_validation.train_test_split_sync(orig,
                                                       test_size=0.3,
                                                       random_state=501)

[lb, train] = oneHotEncoding(None, "workclass", train)
cols = [col for col in train.columns if "workclass_" in col]
[lb2, train] = oneHotEncoding(None, "sex", train)
cols = [col for col in train.columns if "sex_" in col]
train = train.drop(["workclass", "sex"], axis=1)
new_cols = [
    col for col in train.columns if "workclass_" in col or "sex_" in col
]

logreg = linear_model.LogisticRegression(C=10)
features = ['capital-gain', 'capital-loss', 'age'] + new_cols
logreg.fit_sync(train[features], train.income)

[lb, test] = oneHotEncoding(lb, "workclass", test)
[lb2, test] = oneHotEncoding(lb2, "sex", test)
test = test.drop(["workclass", "sex"], axis=1)

test_pred = logreg.predict_sync(test[features])
test_proba = logreg.predict_proba(test[features])

accuracy = SyncableMetrics.compute_metrics(logreg, accuracy_score, test.income,
                                           test_pred, test[features],
                                           "predictionCol", 'income_level')

syncer_obj.sync()
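The oneHotEncoding helper called above is not included in this excerpt. Its call pattern (pass None to fit on the training set, then pass the fitted binarizer back in for the test set so both get identical dummy columns) suggests an implementation along these lines; the sketch below is an assumption built on sklearn's LabelBinarizer, not the original helper:

from sklearn.preprocessing import LabelBinarizer
import pandas as pd

def oneHotEncoding(binarizer, column, df):
    # Fit a new binarizer on the first (training) call, reuse it afterwards.
    if binarizer is None:
        binarizer = LabelBinarizer()
        binarizer.fit(df[column])
    encoded = binarizer.transform(df[column])
    # LabelBinarizer emits a single column for binary features (e.g. "sex")
    # and one column per class otherwise.
    classes = binarizer.classes_ if encoded.shape[1] > 1 else binarizer.classes_[1:]
    dummies = pd.DataFrame(encoded,
                           columns=[column + "_" + str(c) for c in classes],
                           index=df.index)
    return [binarizer, df.join(dummies)]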
Example #4
"""
Cross Validation
"""
# Create the classifier
decision_tree_classifier = DecisionTreeClassifier()
syncer_obj.add_tag(decision_tree_classifier, "decision tree")

# Train the classifier on the training set
decision_tree_classifier.fit_sync(training_inputs, training_classes)

# Validate the classifier on the testing set using classification accuracy
#decision_tree_classifier.score(testing_inputs, testing_classes)

# NOTE: score is equivalent to sklearn.metrics.accuracy_score.
SyncableMetrics.compute_metrics(
    decision_tree_classifier, accuracy_score, testing_classes,
    decision_tree_classifier.predict(testing_inputs), testing_inputs, "", "")
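# A quick illustration of that equivalence (same variables as above); the two
# calls should print the same number:
print(decision_tree_classifier.score(testing_inputs, testing_classes))
print(accuracy_score(testing_classes,
                     decision_tree_classifier.predict(testing_inputs)))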

# cross_val_score returns a list of the scores, which we can visualize
# to get a reasonable estimate of our classifier's performance
cv_scores = cross_validation.cross_val_score_sync(decision_tree_classifier,
                                                  all_inputs,
                                                  all_classes,
                                                  cv=10)
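# The visualization itself is not part of this excerpt; a minimal summary of
# cv_scores (assuming numpy and matplotlib are importable) could be:
import numpy as np
import matplotlib.pyplot as plt
print("CV accuracy: %0.3f +/- %0.3f" % (np.mean(cv_scores), np.std(cv_scores)))
plt.hist(cv_scores, bins=10)
plt.show()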
"""
Parameter-tuning
"""
parameter_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5],
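The excerpt breaks off inside parameter_grid. For completeness, this is how a grid like this is typically plugged into a grid search; a minimal sketch that reuses all_inputs / all_classes from the snippet above and repeats only the keys that are visible (plain scikit-learn, recent module layout assumed):

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

parameter_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5],
}
grid_search = GridSearchCV(DecisionTreeClassifier(), parameter_grid, cv=10)
grid_search.fit(all_inputs, all_classes)
print(grid_search.best_params_, grid_search.best_score_)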
Example #5
def run_otto_workflow():
    name = "test1"
    author = "author"
    description = "kaggle-otto-script"
    # Creating a new project
    syncer_obj = Syncer(NewOrExistingProject(name, author, description),
                        NewOrExistingExperiment("expName", "expDesc"),
                        NewExperimentRun("otto test"))

    # Import Data
    # Note: This dataset is not included in the repo because of Kaggle
    # restrictions.
    # It can be downloaded from
    # https://www.kaggle.com/c/otto-group-product-classification-challenge/data
    X = pd.read_csv_sync(DATA_PATH + 'otto-train.csv')
    syncer_obj.add_tag(X, "original otto csv data")
    X = X.drop_sync('id', axis=1)

    syncer_obj.add_tag(X, "dropped id column")
    # Extract target
    # Encode it to make it manageable by ML algo
    y = X.target.values

    y = LabelEncoder().fit_transform_sync(y)

    # Remove target from train, else it's too easy ...
    X = X.drop_sync('target', axis=1)

    syncer_obj.add_tag(X, "data with dropped id and target columns")

    # Split Train / Test
    x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
        X, y, test_size=0.20, random_state=36)

    syncer_obj.add_tag(x_test, "testing data")
    syncer_obj.add_tag(x_train, "training data")
    # First, we will train and apply a Random Forest WITHOUT calibration.
    # We use a BaggingClassifier to make 5 predictions and average them,
    # because that is what CalibratedClassifierCV does behind the scenes,
    # and we want to compare things fairly, i.e. be sure that averaging several
    # models is not what explains any performance difference between no
    # calibration and calibration.

    clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)

    clfbag = BaggingClassifier(clf, n_estimators=5)
    clfbag.fit_sync(x_train, y_train)

    y_preds = clfbag.predict_proba_sync(x_test)

    SyncableMetrics.compute_metrics(clfbag,
                                    log_loss,
                                    y_test,
                                    y_preds,
                                    x_test,
                                    "",
                                    "",
                                    eps=1e-15,
                                    normalize=True)
    # print("loss WITHOUT calibration : ", log_loss(
    #     ytest, ypreds, eps=1e-15, normalize=True))

    # Now, we train and apply a Random Forest WITH calibration.
    # In our case, 'isotonic' worked better than the default 'sigmoid'; that is
    # not always so, and depending on the dataset you may have to try both
    # (see the short comparison sketch after this function).

    clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
    calibrated_clf.fit_sync(x_train, y_train)
    y_preds = calibrated_clf.predict_proba_sync(x_test)
    SyncableMetrics.compute_metrics(calibrated_clf,
                                    log_loss,
                                    y_test,
                                    y_preds,
                                    x_test,
                                    "",
                                    "",
                                    eps=1e-15,
                                    normalize=True)

    # print("loss WITH calibration : ", log_loss(
    #     ytest, ypreds, eps=1e-15, normalize=True))

    print(" ")
    print("Conclusion : in our case, calibration improved"
          "performance a lot ! (reduced loss)")
    syncer_obj.sync()
    return syncer_obj, x_train, x_test
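As the comment inside the function notes, 'isotonic' is not always the better calibration method. A minimal way to check both settings on the same split, reusing the variable names from the function above (plain scikit-learn calls rather than the *_sync wrappers), could be:

for method in ('sigmoid', 'isotonic'):
    cal = CalibratedClassifierCV(
        RandomForestClassifier(n_estimators=50, n_jobs=-1), method=method, cv=5)
    cal.fit(x_train, y_train)
    print(method, 'log loss:', log_loss(y_test, cal.predict_proba(x_test)))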
Example #6
#    NewExperimentRun("my_experiment_id"))

# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset in two equal parts
x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5)
clf.fit_sync(x_train, y_train)

print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
y_pred = clf.predict_sync(x_test)
mean_error = SyncableMetrics.compute_metrics(
    clf, accuracy_score, y_test, y_pred, x_test, '', '')

syncer_obj.sync()
Example #7
df['income_level'] = df['income_level'].replace(['<=50K'], [0.0])
df['income_level'] = df['income_level'].replace(['>50K'], [1.0])

# Run the LabelEncoder (le) over every object-dtype column, collecting the
# encoded columns (and the untouched numeric ones) into new_df
for coltype, colname in zip(df.dtypes, df.columns):
    if coltype == 'object':
        le.fit_sync(df[colname])
        transformed_vals = le.transform_sync(df[colname])
        new_df[colname + "_index"] = transformed_vals
    else:
        new_df[colname] = df[colname]

lr = linear_model.LogisticRegression()

x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
    new_df, new_df['income_level'], test_size=0.3, random_state=0)

# We don't want to include our label (income_level) when fitting
partial_training = x_train[x_train.columns[:-1]]
partial_testing = x_test[x_test.columns[:-1]]
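# The positional slice above relies on income_level being the last column;
# dropping the label by name is an equivalent, more explicit form:
partial_training = x_train.drop('income_level', axis=1)
partial_testing = x_test.drop('income_level', axis=1)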
lr.fit_sync(partial_training, y_train)
y_pred = lr.predict_sync(partial_testing)
SyncableMetrics.compute_metrics(
    lr, precision_score, y_test, y_pred, partial_testing, "predictionCol",
    'income_level')
SyncableMetrics.compute_metrics(
    lr, recall_score, y_test, y_pred, partial_testing, "predictionCol",
    'income_level')

syncer_obj.sync()
Example #8
# modeldb start
df = pd.read_csv_sync(DATA_PATH + 'credit-default.csv', skiprows=[0])
# modeldb end

target = df['default payment next month']
df = df[["LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE"]]

x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
    df, target, test_size=0.3)

lr = linear_model.LogisticRegression(C=2)

# modeldb start
lr.fit_sync(x_train, y_train)
# modeldb end

# modeldb start
y_pred = lr.predict_sync(x_test)
# modeldb end

# modeldb start
score = SyncableMetrics.compute_metrics(lr, accuracy_score, y_test, y_pred,
                                        x_test, "features",
                                        'default payment next month')
# modeldb end

# modeldb start
syncer_obj.sync()
# modeldb end
Example #9
# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset in two equal parts
x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000]
}, {
    'kernel': ['linear'],
    'C': [1, 10, 100, 1000]
}]

clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5)
clf.fit_sync(x_train, y_train)

print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
y_pred = clf.predict_sync(x_test)
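# Note: digits is a 10-class problem, so recent scikit-learn versions need an
# explicit average= for precision_score (e.g. average='weighted'); extra keyword
# arguments passed to compute_metrics are forwarded to the metric, as in the
# log_loss example earlier.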
mean_error = SyncableMetrics.compute_metrics(clf, precision_score, y_test,
                                             y_pred, x_test, '', '')

syncer_obj.sync()
Example #10
            'hidden_layer_sizes': [(1,), (1,), (1, 1, 1)]
        }
        mlp = MLPClassifier(verbose=10, learning_rate='adaptive')
        clf = GridSearchCV(mlp, params, verbose=10, n_jobs=1, cv=2)
        clf.fit_sync(x_train, y_train)
        print(
            'Finished with grid search with best mean cross-validated score:',
            clf.best_score_)
        print('Best params appeared to be', clf.best_params_)
        joblib.dump(clf, PATH)
        y_pred = clf.predict_sync(x_test)
        score = SyncableMetrics.compute_metrics(clf, accuracy_score, y_test,
                                                y_pred, x_test, "", "")
        clf = clf.best_estimator_

    print('Test accuracy:', clf.score(x_test, y_test))

    #    datasets = {
    #        "train" : Dataset("/path/to/train", {"num_cols" : 15, "dist" : "random"}),
    #        "test" : Dataset("/path/to/test", {"num_cols" : 15, "dist" : "gaussian"})
    #    }
    #    model = "model_obj"
    #    model_type = "NN"
    #    mdb_model1 = Model(model_type, model, "./model.pkl")
    #    model_config1 = ModelConfig(model_type, {"l1" : 10})
    #    model_metrics1 = ModelMetrics({"accuracy" : 0.8})
    #    syncer_obj.sync_datasets(datasets)
    #    syncer_obj.sync_model("train", model_config1, mdb_model1)