Example #1
    def setUp(self):
        name = "logistic-test"
        author = "srinidhi"
        description = "income-level logistic regression"
        syncer_obj = SyncerTest(
            NewOrExistingProject(name, author, description),
            DefaultExperiment(), NewExperimentRun("Abc"),
            ThriftConfig(None, None))

        # Creating the pipeline
        pca = decomposition.PCA()
        lr = linear_model.LinearRegression()
        pipe = Pipeline(steps=[('pca', pca), ('logistic', lr)])
        model = linear_model.LinearRegression()  # unused in this snippet
        np.random.seed(0)
        X = pd.DataFrame(np.random.randint(0, 100, size=(100, 2)),
                         columns=list('AB'))
        y = pd.DataFrame(np.random.randint(0, 100, size=(100, 1)),
                         columns=['output'])

        # Add tags for models / dataframes
        syncer_obj.add_tag(X, "digits-dataset")
        syncer_obj.add_tag(pipe, "pipeline with pca + logistic")
        syncer_obj.add_tag(pca, "decomposition PCA")
        syncer_obj.add_tag(lr, "basic linear reg")

        syncer_obj.clear_buffer()
        pipe.fit_sync(X, y)
        events = syncer_obj.sync()
        self.pipeline_event = events[0]
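
    # Hypothetical follow-up (not part of the original example): a test method
    # could inspect the PipelineEvent captured during setUp.
    def test_pipeline_event_captured(self):
        self.assertIsNotNone(self.pipeline_event)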
Example #2
def run_pipeline_anova_workflow():
    name = "pipeline scikit example"
    author = "srinidhi"
    description = "anova filter pipeline"
    syncer_obj = Syncer(
        NewOrExistingProject(name, author, description),
        DefaultExperiment(),
        NewExperimentRun("Abc"))

    # import some data to play with
    X, y = samples_generator.make_classification(
        n_informative=5, n_redundant=0, random_state=42)

    x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
        X, y, test_size=0.3, random_state=0)
    syncer_obj.add_tag(X, "samples generated data")
    syncer_obj.add_tag(x_train, "training data")
    syncer_obj.add_tag(x_test, "testing data")

    # ANOVA SVM-C
    # 1) anova filter, take 5 best ranked features
    anova_filter = SelectKBest(f_regression, k=5)
    syncer_obj.add_tag(anova_filter, "Anova filter, with k=5")
    # 2) svm
    clf = svm.SVC(kernel='linear')
    syncer_obj.add_tag(clf, "SVC with linear kernel")
    anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)])

    syncer_obj.add_tag(anova_svm, "Pipeline with anova_filter and SVC")

    # Fit the pipeline on the training set
    anova_svm.fit_sync(x_train, y_train)
    y_pred = anova_svm.predict(x_test)
    # Compute metrics for the model on the testing set
    f1 = SyncableMetrics.compute_metrics(
        anova_svm, f1_score, y_test, y_pred, x_test, "predictionCol",
        'label_col')
    precision = SyncableMetrics.compute_metrics(
        anova_svm, precision_score, y_test, y_pred, x_test, "predictionCol",
        'label_col')
    syncer_obj.sync()
    return syncer_obj, f1, precision, x_train, x_test
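
# Possible invocation (an assumed entry point, not part of the original
# example): run the workflow and print the metrics it logged.
if __name__ == '__main__':
    syncer, f1, precision, x_train, x_test = run_pipeline_anova_workflow()
    print("f1 score:", f1, "precision:", precision)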
Example #3
        # Tail of a column-preprocessing loop (loop head not shown): encoded
        # values go into "<colname>_index"; other columns are copied as-is.
        new_df[colname + "_index"] = transformed_vals
    else:
        new_df[colname] = df[colname]

# Creating the pipeline
pca = decomposition.PCA()
lr = linear_model.LogisticRegression()
pipe = Pipeline(steps=[('pca', pca), ('logistic', lr)])

# Separating dataset into training and testing sets
x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
    new_df, new_df['income_level'], test_size=0.3, random_state=0)

# We don't want to include our label (income_level) when fitting
partial_training = x_train[x_train.columns[:-1]]
partial_testing = x_test[x_test.columns[:-1]]

# Fit the pipeline
pipe.fit_sync(partial_training, y_train)

y_pred = pipe.predict(partial_testing)
# Compute various metrics on the testing set
SyncableMetrics.compute_metrics(
    pipe, f1_score, y_test, y_pred, partial_testing, "predictionCol",
    'income_level')
SyncableMetrics.compute_metrics(
    pipe, precision_score, y_test, y_pred, partial_testing, "predictionCol",
    'income_level')
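
# A further metric could be logged the same way before syncing; recall_score
# is an assumption here (imported from sklearn.metrics, not shown above).
SyncableMetrics.compute_metrics(
    pipe, recall_score, y_test, y_pred, partial_testing, "predictionCol",
    'income_level')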

syncer_obj.sync()