Example 1
 def test_two_estimators_predict_proba1(self):
     pipeline = (
         StandardScaler() >>
         (PCA() & Nystroem() & PassiveAggressiveClassifier()) >>
         ConcatFeatures() >> NoOp() >> PassiveAggressiveClassifier())
     pipeline.fit(self.X_train, self.y_train)
     pipeline.predict_proba(self.X_test)
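The `>>` and `&` operators here are lale-style pipeline combinators rather than plain sklearn. As a hedged sketch, assuming `&` maps to a FeatureUnion and `>>` to a Pipeline step, the structure corresponds roughly to the following (the PassiveAggressiveClassifier branch of the union is omitted, since plain sklearn does not accept a non-transformer inside a FeatureUnion, and PassiveAggressiveClassifier itself exposes no predict_proba):

# Rough plain-sklearn analogue of the combinator pipeline above (a sketch,
# not the lale implementation).
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import PassiveAggressiveClassifier

pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('union', FeatureUnion([('pca', PCA()), ('nystroem', Nystroem())])),
    ('clf', PassiveAggressiveClassifier()),
])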
Example 2
def score_solution(model, save=0):
    '''
    Added model and save parameters:
        model ~ the classification model to score
        save ~ flag; when set, persist the fitted pipeline to disk with joblib
    '''
    import joblib
    import sklearn.metrics
    import sklearn.pipeline
    # Ask the solution for the model pipeline.
    import solution
    pipeline = solution.get_pipeline(model)
    error_message = 'Your `solution.get_pipeline` implementation should ' \
        'return an `sklearn.pipeline.Pipeline`.'
    assert isinstance(pipeline, sklearn.pipeline.Pipeline), error_message
    # Train the model on the training DataFrame.
    X_train, y_train = get_data(subset='train')
    pipeline.fit(X_train, y_train)
    # Apply the model to the test DataFrame.
    X_test, y_test = get_data(subset='test')
    y_pred = pipeline.predict_proba(X_test)
    # Check that the predicted probabilities have an sklearn-compatible shape.
    assert (y_pred.ndim == 1) or \
        (y_pred.ndim == 2 and y_pred.shape[1] == 2), \
        "The predicted probabilities should match sklearn's " \
        '`predict_proba` output shape.'
    y_pred = y_pred if y_pred.ndim == 1 else y_pred[:, 1]
    if save == 1:
        joblib.dump(pipeline, 'Best_Estimator.sav')
    # Evaluate the predictions with the AUC of the ROC curve.
    return sklearn.metrics.roc_auc_score(y_test, y_pred)
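The scorer assumes a companion `solution` module. A minimal sketch of the interface it asserts on, with an illustrative estimator that is not from the source:

# solution.py -- hypothetical minimal implementation of the expected interface.
import sklearn.pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

def get_pipeline(model=None):
    # Any classifier whose predict_proba yields two columns passes the
    # shape assertion in score_solution above.
    clf = model if model is not None else LogisticRegression()
    return sklearn.pipeline.Pipeline([('scale', StandardScaler()),
                                      ('clf', clf)])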
Example 3
def estimate_probability_multilabel(vectorizer, model, streamer):
    """
    Generate probabilities for a multilabel binary estimator

    Arguments:
        * vectorizer: a sklearn Vectorizer (or pipeline)
        * model: a quantgov.estimator.Estimator
        * streamer: a quantgov.corpora.CorpusStreamer

    Yields:
        2-tuples of docindex, probability

    """
    pipeline = get_pipeline(vectorizer, model)
    texts = (doc.text for doc in streamer)
    try:
        truecols = tuple(
            list(int(i) for i in label_classes).index(1)
            for label_classes in model.model.classes_)
    except AttributeError:
        truecols = tuple(
            list(int(i) for i in label_classes).index(1) for label_classes in (
                est.classes_ for est in model.model.steps[-1][-1].estimators_))
    predicted = pipeline.predict_proba(texts)
    for i, docidx in enumerate(streamer.index):
        yield docidx, tuple(label_predictions[i, truecols[j]]
                            for j, label_predictions in enumerate(predicted))
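The truecols lookup relies on predict_proba columns following each estimator's `classes_` ordering. A minimal illustration, using a plain sklearn classifier as a stand-in for the quantgov model:

import numpy as np
from sklearn.linear_model import LogisticRegression

# predict_proba columns follow clf.classes_, so .index(1) locates the
# positive-class column regardless of label order in the training data.
clf = LogisticRegression().fit(np.array([[0.], [1.], [2.], [3.]]), [1, 0, 1, 0])
truecol = list(int(i) for i in clf.classes_).index(1)
print(clf.classes_)                            # [0 1]
print(clf.predict_proba([[2.5]])[0, truecol])  # P(label == 1)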
Example 4
 def test_multiple_estimators_predict_predict_proba(self):
     pipeline = (StandardScaler() >>
                 (LogisticRegression() & PCA()) >> ConcatFeatures() >>
                 (NoOp() & LinearSVC()) >> ConcatFeatures() >>
                 KNeighborsClassifier())
     pipeline.fit(self.X_train, self.y_train)
     _ = pipeline.predict_proba(self.X_test)
     _ = pipeline.predict(self.X_test)
Example 5
def estimate_probability_multiclass(vectorizer, model, streamer):
    """
    Generate probabilities for a one-label, multiclass estimator

    Arguments:
        * vectorizer: a sklearn Vectorizer (or pipeline)
        * model: a quantgov.estimator.Estimator
        * streamer: a quantgov.corpora.CorpusStreamer

    Yields:
        2-tuples of docindex, probability

    """
    pipeline = get_pipeline(vectorizer, model)
    texts = (doc.text for doc in streamer)
    yield from zip(streamer.index, pipeline.predict_proba(texts))
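In the multiclass case the probabilities are forwarded unreduced: each yielded value is a full row over classes. A quick shape check with a plain sklearn classifier, again a stand-in rather than the quantgov Estimator:

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.], [1.], [2.]])
clf = LogisticRegression().fit(X, [0, 1, 2])
probs = clf.predict_proba(X)
print(probs.shape)        # (3, 3): one row per document, one column per class
print(probs.sum(axis=1))  # each row sums to 1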
Example 6
import numpy as np
import pandas as pd
from sklearn import metrics, preprocessing
from sklearn.model_selection import StratifiedKFold


def run_pipeline(df, pipeline, pipeline_name=''):
    X = pd.Series(df["text"])
    y = preprocessing.LabelEncoder().fit_transform(df.author.values)

    # random_state requires shuffle=True in StratifiedKFold.
    rskf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    losses = []
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict_proba(X_test)
        log_loss = metrics.log_loss(y_test, predictions)
        losses.append(log_loss)
        print(" Log loss: " + str(log_loss))
        print(" Accuracy : %0.3f " % calculate_accuracy(y_test, predictions))

    # pd.np was deprecated and removed; use numpy directly.
    print(f'{pipeline_name} mean log loss: {round(np.mean(losses), 3)}')
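A hypothetical usage sketch; calculate_accuracy is not defined in the snippet above, so an argmax-accuracy stand-in is assumed here, and the toy DataFrame is purely illustrative:

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def calculate_accuracy(y_true, probs):
    # Assumed helper: accuracy of the most probable class.
    return metrics.accuracy_score(y_true, np.argmax(probs, axis=1))

df = pd.DataFrame({
    "text": ["the raven", "annabel lee", "tell-tale heart", "usher", "ligeia",
             "dagon", "innsmouth", "dunwich", "erich zann", "cthulhu"],
    "author": ["EAP"] * 5 + ["HPL"] * 5,
})
clf = Pipeline([("tfidf", TfidfVectorizer()), ("lr", LogisticRegression())])
run_pipeline(df, clf, pipeline_name="tfidf+lr")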
Example 7
def estimate_probability(vectorizer, model, streamer):
    """
    Generate probabilities for a one-label estimator

    Arguments:
        * vectorizer: a sklearn Vectorizer (or pipeline)
        * model: a quantgov.estimator.Estimator
        * streamer: a quantgov.corpora.CorpusStreamer

    Yields:
        2-tuples of docindex, probability

    """
    pipeline = get_pipeline(vectorizer, model)
    texts = (doc.text for doc in streamer)
    truecol = list(int(i) for i in model.model.classes_).index(1)
    predicted = (i[truecol] for i in pipeline.predict_proba(texts))
    yield from zip(streamer.index, predicted)
Example 8
def estimate_probability_multilabel_multiclass(vectorizer, model, streamer):
    """
    Generate probabilities for a multilabel, multiclass estimator

    Arguments:
        * vectorizer: a sklearn Vectorizer (or pipeline)
        * model: a quantgov.estimator.Estimator
        * streamer: a quantgov.corpora.CorpusStreamer

    Yields:
        2-tuples of docindex, probability

    """
    pipeline = get_pipeline(vectorizer, model)
    texts = (doc.text for doc in streamer)
    predicted = pipeline.predict_proba(texts)
    for i, docidx in enumerate(streamer.index):
        yield docidx, tuple(label_predictions[i]
                            for label_predictions in predicted)
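Here predict_proba is assumed to return one array per label, as sklearn's multioutput wrappers do. A minimal shape illustration, with MultiOutputClassifier as a hypothetical stand-in for the quantgov model:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

X = np.array([[0.], [1.], [2.], [3.]])
Y = np.array([[0, 1], [1, 0], [0, 2], [1, 1]])  # two labels per sample
clf = MultiOutputClassifier(LogisticRegression()).fit(X, Y)
probs = clf.predict_proba(X)  # list: one (n_samples, n_classes) array per label
print(len(probs), probs[0].shape, probs[1].shape)  # 2 (4, 2) (4, 3)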
Example 9
def score_solution():
    import sklearn.metrics
    import sklearn.pipeline
    # Ask the solution for the model pipeline.
    import solution
    pipeline = solution.get_pipeline()
    error_message = 'Your `solution.get_pipeline` implementation should ' \
        'return an `sklearn.pipeline.Pipeline`.'
    assert isinstance(pipeline, sklearn.pipeline.Pipeline), error_message
    # Train the model on the training DataFrame.
    X_train, y_train = get_data(subset='train')
    pipeline.fit(X_train, y_train)
    # Apply the model to the test DataFrame.
    X_test, y_test = get_data(subset='test')
    y_pred = pipeline.predict_proba(X_test)
    # Check that the predicted probabilities have an sklearn-compatible shape.
    assert (y_pred.ndim == 1) or \
        (y_pred.ndim == 2 and y_pred.shape[1] == 2), \
        "The predicted probabilities should match sklearn's " \
        '`predict_proba` output shape.'
    y_pred = y_pred if y_pred.ndim == 1 else y_pred[:, 1]
    # Evaluate the predictions with the AUC of the ROC curve.
    return sklearn.metrics.roc_auc_score(y_test, y_pred)
Example 10
import onnxruntime as rt
import joblib
from numpy import load
import sklearn.pipeline

sess = rt.InferenceSession("output/model.onnx")
# The original snippet never loads the sklearn pipeline it compares against;
# the .joblib path below is an assumption, not from the source.
pipeline = joblib.load("output/model.joblib")
train_data = load("train_data.npy", allow_pickle=True)

print('---', train_data[0])
inputs = {'input': train_data[:1]}
pred_onx = sess.run(None, inputs)

print("onnx predict_proba")
print("predict", pred_onx[0])
print("predict_proba", pred_onx[1])
print("skl predict_proba")
print("predict", pipeline.predict(train_data[:1]))
print("predict_proba", pipeline.predict_proba(train_data[:1]))
Example 11
param_grid = dict(
    clf__min_samples_leaf=range(2, 4, 1),
    clf__min_weight_fraction_leaf=[0],
)
#grid_search = sklearn.grid_search.GridSearchCV(
#    pipeline, n_jobs=1, param_grid=param_grid, verbose=100,
#    scoring=youdenJ,score_func=youdenJ,
#    cv=sklearn.cross_validation.PredefinedSplit(testidx))
#grid_search.fit(trainFact[:,rfecv.support_], labels)
#results1=([sklearn.metrics.confusion_matrix(labels,grid_search.best_estimator_.predict(train))])
#grid_search_results1=(grid_search.grid_scores_)
#kwargs=grid_search.best_params_
#pipeline.set_params(**kwargs)

pipeline.fit(train[train.columns[rfecv.support_]], labels)

predictions = (pipeline.predict_proba(
    test[train.columns[rfecv.support_]])[:, 1] >= 0.02) * 1
predictionstrain = (pipeline.predict_proba(
    train[train.columns[rfecv.support_]])[:, 1] >= 0.02) * 1

print(Youdens_func(labels, predictionstrain))

# create predictions and submission file
sample['WnvPresent'] = predictions
sample.to_csv('testpredicts5.csv', index=False)

print(sum(predictions))

#%%
##########################ROC Plots ###########################################
for yr in [2007, 2009, 2011, 2013]:
    pipeline.fit(train[train.year != yr][train.columns[rfecv.support_]],
print("Classifying unlabeled data done in: %fs" % (time()-t0))
print(report)

kfeatures = np.asarray(selector.get_support(indices=True))
print(np.asarray(vectorizer.get_feature_names())[kfeatures])

#################################################################
###### 3. Use classifier on unlabelled data

pred_unlab = pipeline.predict(X_matrix_unlab).tolist()

directory = 'results'
if not os.path.exists(directory):
    os.makedirs(directory)

probs = np.asarray(pipeline.predict_proba(X_matrix_unlab))

for i in range(len(unlabeled_titles)):
    # Route low-confidence predictions to an 'unsorted' bucket.
    if probs[i].max() < .5:
        filename = directory + '/unsorted'
    else:
        filename = directory + '/{}'.format(pred_unlab[i])
    with open(filename, 'a') as output_file:
        output_file.write("%s\n" % unlabeled_titles[i])

Example 13
                    else:
                        X_resampled = X_train
                        y_resampled = y_train

                    t0 = time.perf_counter()  # time.clock() was removed in Python 3.8
                    pipeline.fit(X_resampled, y_resampled)
                    time_to_fit = (time.perf_counter() - t0)
                    print("done fitting in {}".format(time_to_fit))

                    '''
                    Predictions
                    '''
                    predicted = pipeline.predict(X_test)

                    try:
                        predicted_prob = pipeline.predict_proba(X_test)
                        predicted_prob = predicted_prob[:, 1]  # probability that label is 1

                    except AttributeError:  # model lacks a predict_proba method
                        print("Model has no predict_proba method")

                    '''
                    Evaluation Statistics
                    '''
                    print()
                    print("Evaluation Statistics")
                    if model_name=='KNN':
                        print("Getting feature support")
                        features = pipeline.named_steps['feat']
                        print(X_train.columns[features.transform(np.arange(
                            len(X_train.columns)))])