def setUp(self):
    """Fit a tagged PCA + linear-regression pipeline and keep its first event.

    A ``SyncerTest`` is created for the "logistic-test" project, the input
    frame and the estimators are tagged, and the pipeline is fitted with
    ``fit_sync`` on seeded random integer data.  The first element returned
    by ``syncer_obj.sync()`` is stored on ``self.pipeline_event`` for the
    test methods to inspect.
    """
    name = "logistic-test"
    author = "srinidhi"
    description = "income-level logistic regression"
    syncer_obj = SyncerTest(
        NewOrExistingProject(name, author, description),
        DefaultExperiment(),
        NewExperimentRun("Abc"),
        ThriftConfig(None, None))

    # Creating the pipeline
    # NOTE(review): the second step is labelled 'logistic' although the
    # estimator is LinearRegression; the label is a runtime string and is
    # left untouched here.
    pca = decomposition.PCA()
    lr = linear_model.LinearRegression()
    pipe = Pipeline(steps=[('pca', pca), ('logistic', lr)])

    # Seed NumPy's global RNG so the generated frames are reproducible.
    np.random.seed(0)
    X = pd.DataFrame(
        np.random.randint(0, 100, size=(100, 2)), columns=list('AB'))
    y = pd.DataFrame(
        np.random.randint(0, 100, size=(100, 1)), columns=['output'])

    # Add tags for models / dataframes
    syncer_obj.add_tag(X, "digits-dataset")
    syncer_obj.add_tag(pipe, "pipeline with pca + logistic")
    syncer_obj.add_tag(pca, "decomposition PCA")
    syncer_obj.add_tag(lr, "basic linear reg")

    # Presumably empties any previously buffered events so that events[0]
    # below comes from the pipeline fit -- confirm against SyncerTest.
    syncer_obj.clear_buffer()
    pipe.fit_sync(X, y)
    events = syncer_obj.sync()
    self.pipeline_event = events[0]
def run_pipeline_anova_workflow():
    """Run the ANOVA-filter + linear-SVC pipeline example through a Syncer.

    Generates a classification dataset, splits it with
    ``train_test_split_sync``, tags the data and the estimators, fits the
    pipeline with ``fit_sync``, computes F1 and precision on the test split
    via ``SyncableMetrics.compute_metrics`` and finally calls
    ``syncer_obj.sync()``.

    Returns:
        A tuple ``(syncer_obj, f1, precision, x_train, x_test)``.
    """
    syncer_obj = Syncer(
        NewOrExistingProject(
            "pipeline scikit example", "srinidhi", "anova filter pipeline"),
        DefaultExperiment(),
        NewExperimentRun("Abc"))

    # Synthetic data to play with.
    features, labels = samples_generator.make_classification(
        n_informative=5, n_redundant=0, random_state=42)
    split = cross_validation.train_test_split_sync(
        features, labels, test_size=0.3, random_state=0)
    x_train, x_test, y_train, y_test = split

    # Tag the datasets in the same order as before: full, train, test.
    tagged_data = (
        (features, "samples generated data"),
        (x_train, "training data"),
        (x_test, "testing data"),
    )
    for dataset, tag in tagged_data:
        syncer_obj.add_tag(dataset, tag)

    # ANOVA SVM-C
    # Step 1: anova filter keeping the 5 best ranked features.
    selector = SelectKBest(f_regression, k=5)
    syncer_obj.add_tag(selector, "Anova filter, with k=5")

    # Step 2: the support vector classifier.
    classifier = svm.SVC(kernel='linear')
    syncer_obj.add_tag(classifier, "SVC with linear kernel")

    anova_svm = Pipeline([('anova', selector), ('svc', classifier)])
    syncer_obj.add_tag(anova_svm, "Pipeline with anova_filter and SVC")

    # Train on the training split, then predict the held-out split.
    anova_svm.fit_sync(x_train, y_train)
    predictions = anova_svm.predict(x_test)

    # Score the test-set predictions: F1 first, then precision.
    f1, precision = [
        SyncableMetrics.compute_metrics(
            anova_svm, scorer, y_test, predictions, x_test,
            "predictionCol", 'label_col')
        for scorer in (f1_score, precision_score)
    ]

    syncer_obj.sync()
    return syncer_obj, f1, precision, x_train, x_test
new_df[colname + "_index"] = transformed_vals else: new_df[colname] = df[colname] # Creating the pipeline pca = decomposition.PCA() lr = linear_model.LogisticRegression() pipe = Pipeline(steps=[('pca', pca), ('logistic', lr)]) # Separating dataset into training and testing sets x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync( new_df, new_df['income_level'], test_size=0.3, random_state=0) # We don't want to include our label (income_level) when fitting partial_training = x_train[x_train.columns[:-1]] partial_testing = x_test[x_test.columns[:-1]] # Fit the pipeline pipe.fit_sync(partial_training, y_train) y_pred = pipe.predict(partial_testing) # Compute various metrics on the testing set SyncableMetrics.compute_metrics( pipe, f1_score, y_test, y_pred, partial_testing, "predictionCol", 'income_level') SyncableMetrics.compute_metrics( pipe, precision_score, y_test, y_pred, partial_testing, "predictionCol", 'income_level') syncer_obj.sync()