def test_bagging_classifier_with_missing_inputs():
    """Check that BaggingClassifier accepts X with missing/infinite data.

    The first half wraps the tree in a pipeline whose first step is a
    ``FunctionTransformer(replace, validate=False)`` (``replace`` is defined
    outside this block) and checks that fitting and predicting succeed.  The
    second half uses a pipeline without that step and checks that the
    ``ValueError`` of the wrapped estimator surfaces through the bagging
    classifier.
    """
    # None / nan / +inf / -inf all appear in the second column.
    # ``-np.inf`` is used rather than ``np.NINF``: the alias was removed in
    # NumPy 2.0 while ``-np.inf`` works on every NumPy version.
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, -np.inf, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(
        FunctionTransformer(replace, validate=False),
        classifier
    )
    # The pipeline on its own must cope with the data ...
    pipeline.fit(X, y).predict(X)
    # ... and so must the bagging classifier wrapped around it.
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert_equal(y.shape, y_hat.shape)
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
Beispiel #2
0
    def preprocess(self, any_set, is_train):
        """Turn the 'title' and 'description' columns into TF-IDF features.

        Parameters
        ----------
        any_set : object accepted by ``ColumnSelector``
            Data set holding a 'title' and a 'description' column.
        is_train : bool
            If true, a new feature union is built, stored on
            ``self.pipeline`` and fitted on ``any_set``.  Otherwise the
            pipeline stored by an earlier training call is reused.

        Returns
        -------
        The result of ``fit_transform`` (training) or ``transform``
        (otherwise) of the title/description feature union.
        """
        if not is_train:
            # Relies on an earlier is_train=True call having set self.pipeline.
            return self.pipeline.transform(any_set)

        # Candidate token patterns; only 'match_word1' is used below.  Raw
        # strings keep the regexes readable and avoid the invalid '\w' escape
        # the non-raw 'match_word_punct' literal contained.
        dico_pattern = {
            'match_lowercase_only': r'\b[a-z]+\b',
            'match_word': r'\w{2,}',
            'match_word1': r'(?u)\b\w+\b',
            'match_word_punct': r'\w+|[,.?!;]',
            'match_NNP': r'\b[A-Z][a-z]+\b|\b[A-Z]+\b',
            'match_punct': "[,.?!;'-]",
        }

        def make_vectorizer():
            # Title and description share one configuration but each needs
            # its own vectorizer instance (each learns its own vocabulary).
            return TfidfVectorizer(
                lowercase=True, stop_words='english',
                token_pattern=dico_pattern["match_word1"],
                ngram_range=(1, 2), max_df=1.0, min_df=2, max_features=None,
                vocabulary=None, binary=True, norm=u'l2',
                use_idf=True, smooth_idf=True, sublinear_tf=True)

        title_pipe = make_pipeline(ColumnSelector(key='title'),
                                   make_vectorizer())
        desc_pipe = make_pipeline(ColumnSelector(key='description'),
                                  make_vectorizer())
        self.pipeline = make_union(title_pipe, desc_pipe)

        return self.pipeline.fit_transform(any_set)
def test_pipeline_ducktyping():
    """Check which methods a pipeline exposes depending on its steps.

    Each case lists, in order, the attributes that must be reachable
    (plain attribute access must not raise) and those that must be absent.
    """
    def check(pipe, expectations):
        # expectations: ordered (attribute name, must_exist) pairs.
        for attr, must_exist in expectations:
            if must_exist:
                getattr(pipe, attr)
            else:
                assert_false(hasattr(pipe, attr))

    everything = [('predict', True),
                  ('transform', True),
                  ('inverse_transform', True)]
    no_predict = [('predict', False),
                  ('transform', True),
                  ('inverse_transform', True)]
    transform_only = [('predict', False),
                      ('transform', True),
                      ('inverse_transform', False)]

    check(make_pipeline(Mult(5)), everything)
    check(make_pipeline(Transf()), no_predict)
    check(make_pipeline(None), no_predict)
    check(make_pipeline(Transf(), NoInvTransf()), transform_only)
    check(make_pipeline(NoInvTransf(), Transf()), transform_only)
Beispiel #4
0
def get_pipeline(fsmethods, clfmethod):
    """Return a sklearn Pipeline made of feature-selection step(s) and a
    final estimator.

    A list of transformers is joined in a FeatureUnion, which is then put in
    a Pipeline in front of ``clfmethod``.

    Parameters
    ----------
    fsmethods: list of estimators, or a single transformer
        All estimators must be transformers (i.e. must have a transform
        method). A single transformer is used as-is; an empty list means
        "no feature selection" and yields a pipeline holding only
        ``clfmethod``.

    clfmethod: classifier
        The last estimator may be any type (transformer, classifier, etc.).

    Returns
    -------
    pipe

    Raises
    ------
    ValueError
        If ``fsmethods`` is neither a list nor an object with a
        ``transform`` attribute.
    """
    if isinstance(fsmethods, list):
        # An empty list would otherwise build an empty FeatureUnion; skip the
        # union step entirely in that case.
        feat_union = make_union(*fsmethods) if fsmethods else None
    elif hasattr(fsmethods, 'transform'):
        feat_union = fsmethods
    else:
        raise ValueError('fsmethods expected to be either a list or a transformer method')

    if feat_union is None:
        return make_pipeline(clfmethod)
    return make_pipeline(feat_union, clfmethod)
def test_pipeline_ducktyping():
    """Check which methods a pipeline exposes depending on its steps.

    Each case lists, in order, the attributes that must be reachable
    (plain attribute access must not raise) and those that must be absent.
    """
    def check(pipe, expectations):
        # expectations: ordered (attribute name, must_exist) pairs.
        for attr, must_exist in expectations:
            if must_exist:
                getattr(pipe, attr)
            else:
                assert not hasattr(pipe, attr)

    everything = [('predict', True),
                  ('transform', True),
                  ('inverse_transform', True)]
    no_predict = [('predict', False),
                  ('transform', True),
                  ('inverse_transform', True)]
    transform_only = [('predict', False),
                      ('transform', True),
                      ('inverse_transform', False)]

    check(make_pipeline(Mult(5)), everything)
    check(make_pipeline(Transf()), no_predict)

    # A 'passthrough' step is named after itself.
    passthrough_pipe = make_pipeline('passthrough')
    assert passthrough_pipe.steps[0] == ('passthrough', 'passthrough')
    check(passthrough_pipe, no_predict)

    check(make_pipeline(Transf(), NoInvTransf()), transform_only)
    check(make_pipeline(NoInvTransf(), Transf()), transform_only)
    def __init__(self, **config):
        """Build the feature pipeline and the classifier from ``config``.

        Every name in ``_configuration_options`` must be present in
        ``config``; the keys read here are "sparse_features",
        "dense_features", "classifier" and "classifier_args".

        Raises
        ------
        ValueError
            If a configuration option is missing or the requested classifier
            is not a key of ``_valid_classifiers``.
        """
        # Validate options are present (the first missing one is reported)
        absent = [name for name in _configuration_options
                  if name not in config]
        if absent:
            raise ValueError("Missing configuration "
                             "option {!r}".format(absent[0]))

        # Feature extraction: the sparse features go through a classifier
        # used as a feature, and are joined with the dense vectorizer.
        sparse_features = parse_features(config["sparse_features"])
        sparse_vectorizer = Vectorizer(sparse_features, sparse=True)
        densifier = make_pipeline(sparse_vectorizer, ClassifierAsFeature())
        dense_features = parse_features(config["dense_features"])
        dense_vectorizer = Vectorizer(dense_features, sparse=False)
        vectorization = make_union(densifier, dense_vectorizer)

        # Classifier
        requested = config["classifier"]
        try:
            classifier_factory = _valid_classifiers[requested]
        except KeyError:
            raise ValueError("Unknown classification algorithm "
                             "{!r}".format(requested))

        self.pipeline = make_pipeline(vectorization, StandardScaler())
        self.classifier = classifier_factory(**config["classifier_args"])
def analysis(name, typ, condition=None, query=None, title=None):
    """Wrapper to ensure that we attribute the same function for each type
    of analyses: e.g. categorical, regression, circular regression.

    Parameters
    ----------
    name : str
        Name of the analysis; also used as ``condition`` when none is given.
    typ : {'categorize', 'regress', 'circ_regress'}
        Selects the estimator, the scorer and the chance level.
    condition, query, title : optional
        Stored unchanged in the returned dict (``condition`` defaults to
        ``name``).

    Returns
    -------
    dict
        Keys: name, condition, query, clf, scorer, chance, erf_function, cv,
        typ, title, single_trial.

    Raises
    ------
    ValueError
        If ``typ`` is not one of the three supported analysis types.
    """
    # Define univariate analysis
    erf_function = None  # Default is fast_mannwhitneyu
    # /!\ for categorical analyses, the contrast is min(y) - max(y)
    # e.g. target_present==False - target_present==True

    if typ == 'categorize':
        # estimator is normalization + l2 Logistic Regression
        clf = make_pipeline(
            StandardScaler(),
            force_predict(LogisticRegression(class_weight='balanced'), axis=1))
        scorer = scorer_auc
        chance = .5
    elif typ == 'regress':
        # estimator is normalization + l2 Ridge
        clf = make_pipeline(StandardScaler(), Ridge())
        scorer = scorer_spearman
        chance = 0.
    elif typ == 'circ_regress':
        # estimator is normalization + Ridge wrapped in PolarRegression
        clf = make_pipeline(StandardScaler(), PolarRegression(Ridge()))
        scorer = scorer_angle
        chance = 0.
        # The univariate analysis needs a different scorer
        erf_function = scorer_circlin
    else:
        # Without this guard clf/scorer/chance would be unbound below and the
        # caller would get an opaque UnboundLocalError.
        raise ValueError("Unknown analysis type %r; expected 'categorize', "
                         "'regress' or 'circ_regress'" % (typ,))
    if condition is None:
        condition = name
    return dict(name=name, condition=condition, query=query, clf=clf,
                scorer=scorer, chance=chance, erf_function=erf_function,
                cv=8, typ=typ, title=title, single_trial=True)
Beispiel #8
0
def main(met_fname, gday_outfname, var):
    """Emulate one GDAY output variable from met forcing and plot the result.

    A StandardScaler + KNeighborsRegressor pipeline is tuned with a 5-fold
    GridSearchCV on the first 4000 rows, used to predict every row of the met
    data, and the predictions for rows 4000..4382 are plotted against the
    GDAY values.

    Parameters
    ----------
    met_fname : str
        Met forcing file, passed through ``remove_comments_from_header``
        before being read as CSV.
    gday_outfname : str
        GDAY output CSV file.
    var : str
        Name of the GDAY output column to emulate.
    """
    n_train = 4000   # rows used to fit the emulator
    plot_end = 4383  # end (exclusive) of the plotted window

    # Load met data
    s = remove_comments_from_header(met_fname)
    df_met = pd.read_csv(s, parse_dates=[[0,1]], skiprows=4, index_col=0,
                         sep=",", keep_date_col=True,
                         date_parser=date_converter)

    # Need to build numpy array, so drop year, doy cols.
    # ``.iloc`` replaces ``.ix``, which has been removed from pandas.
    # NOTE(review): assumes the original ``.ix`` slices were positional (true
    # when the index holds parsed dates rather than integers) -- confirm.
    met_data = df_met.iloc[:, 2:].values
    met_data_train = df_met.iloc[0:n_train, 2:].values

    # Load GDAY outputs
    df = pd.read_csv(gday_outfname, skiprows=3, sep=",", skipinitialspace=True)
    df['date'] = make_data_index(df)
    df = df.set_index('date')

    target = df[var][0:n_train].values

    # BUILD MODELS
    param_KNR = { "n_neighbors": [20], "weights": ['distance'] }

    # Other regressors (DecisionTreeRegressor, RandomForestRegressor, SVR)
    # can be swapped in here.
    regmod = KNeighborsRegressor()

    regmod_p = make_pipeline(StandardScaler(), regmod)
    # Grid-search keys must be prefixed with the pipeline's step name.
    modlab = regmod_p.steps[-1][0]
    # ``items`` works on Python 2 and 3 (``iteritems`` is Python 2 only).
    par_grid = {'{0}__{1}'.format(modlab, parkey): pardat
                for (parkey, pardat) in param_KNR.items()}

    emulator = GridSearchCV(regmod_p, param_grid=par_grid, cv=5)

    emulator.fit(met_data_train, target)
    predict = emulator.predict(met_data)

    df = pd.DataFrame({'DT': df.index, 'emu': predict, 'gday': df[var]})

    plt.plot_date(df.index[n_train:plot_end], df['emu'][n_train:plot_end], 'o',
                  label='Emulator')
    plt.plot_date(df.index[n_train:plot_end], df['gday'][n_train:plot_end], 'o',
                  label='GDAY')
    plt.ylabel('GPP (g C m$^{-2}$ s$^{-1}$)')
    plt.legend()
    plt.show()
def cross_validation_LR(X,Y, n_folds, C_seq, K_seq, verbose = False):
    '''
        To classify Y using X, we first use a univariate filter to choose K
        dimensions in X, where the difference between different Ys are
        highest, then run a logistic regression classifier with
        regularization parameter C on the K dimensions.

        To quantify how well X can classify Y, without specifying training and
        testing partition, we do n_folds cross validation.
        In each fold, during training, we do an inner loop cross validation to
        select C and K that give the best classification accuracy from a given
        range; and then we use this to classify the held-out testing data.

        Inputs:
            X, [n, p], n trials of p dimensional data, used for classification
            Y, [n], class labels
            n_folds, integer, split the data into n_folds for cross validation
            C_seq, a sequence of regularization parameters for logistic
                    regression classifiers, smaller values specify stronger
                    regularization.
                    e.g. C_seq = 10.0** np.arange(-3,1,1)
            K_seq, a sequence of integers,
                    e.g.  K_seq = (np.floor(np.arange(0.2,1,0.2)*p)).astype(int)
            verbose: boolean, if true, print the best C and K chosen
        Output:
            averaged classification accuracy of the n_folds

        NOTE(review): the filter scores features with ``f_regression``; for a
        classification ANOVA ``f_classif`` is the usual choice -- confirm.
    '''
    cv0 = StratifiedKFold(Y,n_folds = n_folds)
    cv_acc = np.zeros(n_folds)
    for i in range(n_folds):
        ind_test = cv0.test_folds == i
        ind_train = cv0.test_folds != i
        tmpX_train = X[ind_train,:]
        tmpY_train = Y[ind_train]
        tmpX_test = X[ind_test,:]
        tmpY_test = Y[ind_test]

        # grid search; the inner splitter only depends on the training
        # labels, so it is built once per outer fold
        cv1 = StratifiedKFold(tmpY_train,n_folds = n_folds)
        tmp_cv_score = np.zeros([len(C_seq), len(K_seq)])
        for j, C in enumerate(C_seq):
            for k, K in enumerate(K_seq):
                anova_filter = SelectKBest(f_regression, k = K)
                clf = LogisticRegression(C = C, penalty = "l2")
                anova_clf = make_pipeline(anova_filter, clf)
                tmp_cv_score[j,k] = cross_val_score(anova_clf, tmpX_train,
                                  tmpY_train, scoring = "accuracy",  cv = cv1).mean()

        best_ind = np.argmax(tmp_cv_score.ravel())
        best_j, best_k = np.unravel_index(best_ind, tmp_cv_score.shape)

        # Refit with the *selected* hyper-parameters.  The previous code
        # reused the loop indices j/k and therefore always evaluated the last
        # (C, K) pair of the grid instead of the best one.
        anova_filter = SelectKBest(f_regression, k = K_seq[best_k])
        clf = LogisticRegression(C = C_seq[best_j], penalty = "l2")
        anova_clf = make_pipeline(anova_filter, clf)
        tmpY_predict = anova_clf.fit(tmpX_train, tmpY_train).predict(tmpX_test)
        if verbose:
            # Same output as the former print statement, valid on Python 2 and 3.
            print("%s %s" % (C_seq[best_j], K_seq[best_k]))
        cv_acc[i] =  np.mean(tmpY_test == tmpY_predict)
    return np.mean(cv_acc)
 def fit(self, X, y):
     """Fit one model per split, then a final model on their joined output.

     The last column of ``X`` is not a feature: it is handed to
     ``subject_splitter`` (presumably subject ids -- verify against the
     caller) to partition the remaining columns and ``y``.  A
     ``LogisticRegressionCV`` pipeline is fitted on each partition, the
     fitted pipelines are wrapped and joined in a feature union, and a final
     ``LogisticRegressionCV`` is fitted on top.  The result is stored in
     ``self.clf_``.
     """
     # Filthy hack: the grouping column travels as the last column of X.
     features = X[:, :-1]
     sids = X[:, -1]
     fitted_models = []
     for X_s, y_s in subject_splitter(features, y, sids):
         fitted_models.append(make_pipeline(LogisticRegressionCV()).fit(X_s, y_s))
     wrapped = [FeatureUnionWrapper(model) for model in fitted_models]
     stacked = make_pipeline(make_union(*wrapped), LogisticRegressionCV())
     self.clf_ = stacked.fit(features, y)
     return self
Beispiel #11
0
 def test_generator_ok(self):
     """FakeGenerator yields a (20, 3) frame and is reproducible per seed."""
     def generate():
         # A fresh generator with the same seed on every call.
         generator = FakeGenerator(fakes=['job', 'name', 'address'], nb_sample=20, random_state=40)
         return make_pipeline(generator).fit_transform(None)

     first = generate()
     self.assertEqual(first.shape, (20, 3))
     second = generate()
     # Testing the seed
     assert_frame_equal(first, second)
Beispiel #12
0
def test_make_pipeline_memory():
    """Check that make_pipeline forwards ``memory`` and defaults it to None."""
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir)
        pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
        assert_true(pipeline.memory is memory)
        pipeline = make_pipeline(DummyTransf(), SVC())
        assert_true(pipeline.memory is None)
    finally:
        # Remove the temporary cache directory even when an assertion fails.
        shutil.rmtree(cachedir)
Beispiel #13
0
 def test_generator_ok(self):
     """TestGenerator yields a (100, 2) frame and is reproducible per seed."""
     def generate():
         # A fresh generator with the same seed on every call.
         generator = TestGenerator(nb_sample=100, random_state=40, num_sample=(1, 3),
                                   categ_sample=['green', 'blue'])
         return make_pipeline(generator).fit_transform(None)

     first = generate()
     self.assertEqual(first.shape, (100, 2))
     # With num_sample=(1, 3) the 'number' column spans 1..2.
     numbers = first['number']
     self.assertEqual(numbers.min(), 1)
     self.assertEqual(numbers.max(), 2)
     second = generate()
     # Testing the seed
     assert_frame_equal(first, second)
Beispiel #14
0
def test_classes_property():
    """``classes_`` exists only on a fitted pipeline ending in a classifier."""
    dataset = load_iris()
    X, y = dataset.data, dataset.target

    # A regressor pipeline never exposes classes_, even once fitted.
    regressor = make_pipeline(SelectKBest(k=1), LinearRegression())
    regressor.fit(X, y)
    assert_raises(AttributeError, getattr, regressor, "classes_")

    # A classifier pipeline exposes classes_ only after fitting.
    classifier = make_pipeline(SelectKBest(k=1),
                               LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, classifier, "classes_")
    classifier.fit(X, y)
    assert_array_equal(classifier.classes_, np.unique(y))
def test_make_pipeline():
    """make_pipeline returns a Pipeline with auto-generated step names."""
    def check_names(pipe, expected_names):
        assert_true(isinstance(pipe, Pipeline))
        for position, expected in enumerate(expected_names):
            assert_equal(pipe.steps[position][0], expected)

    first, second = Transf(), Transf()

    # Two steps of the same class get numbered names.
    check_names(make_pipeline(first, second), ["transf-1", "transf-2"])

    # A step whose class is unique keeps the plain lower-cased class name.
    check_names(make_pipeline(first, second, FitParamT()),
                ["transf-1", "transf-2", "fitparamt"])
Beispiel #16
0
def test_make_pipeline_memory():
    """Check that make_pipeline forwards ``memory`` and defaults it to None."""
    cachedir = mkdtemp()
    try:
        if LooseVersion(joblib_version) < LooseVersion('0.12'):
            # Deal with change of API in joblib
            memory = Memory(cachedir=cachedir, verbose=10)
        else:
            memory = Memory(location=cachedir, verbose=10)
        pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
        assert_true(pipeline.memory is memory)
        pipeline = make_pipeline(DummyTransf(), SVC())
        assert_true(pipeline.memory is None)
    finally:
        # Remove the temporary cache directory even when an assertion fails.
        shutil.rmtree(cachedir)
 def __init__(self):
     """Create five independent copies of each of the four model types.

     ``clf1``: Imputer + gradient boosting, ``clf2``: median Imputer + extra
     trees, ``clf3``: Imputer + linear SVR, ``clf``: plain linear regression.
     """
     n_models = 5

     def gradient_boosting():
         return make_pipeline(Imputer(),
                              GradientBoostingRegressor(n_estimators=5000, max_depth=8))

     def extra_trees():
         return make_pipeline(Imputer(strategy='median'),
                              ExtraTreesRegressor(n_estimators=5000, criterion='mse', max_depth=8,
                                                  min_samples_split=10, min_samples_leaf=1,
                                                  min_weight_fraction_leaf=0.0,
                                                  max_features='auto', max_leaf_nodes=None, bootstrap=False,
                                                  oob_score=False,
                                                  n_jobs=1, random_state=42, verbose=0, warm_start=True))

     def linear_svr():
         return make_pipeline(Imputer(), svm.LinearSVR())

     self.clf1 = [gradient_boosting() for _ in range(n_models)]
     self.clf2 = [extra_trees() for _ in range(n_models)]
     self.clf3 = [linear_svr() for _ in range(n_models)]
     self.clf = [linear_model.LinearRegression() for _ in range(n_models)]
def get_results(dataset):
    """Compare regression scores with and without imputed missing values.

    Four random-forest set-ups are cross-validated with negative mean squared
    error: the full data, the data with "missing" entries set to 0, mean
    imputation plus a missing indicator, and chained imputation plus a
    missing indicator.  The module-level ``rng`` drives the choice of the
    missing entries.

    Parameters
    ----------
    dataset : object with ``data`` and ``target`` attributes

    Returns
    -------
    tuple of four (mean, std) pairs
        In the order: full, zero-imputed, mean-imputed, chained-imputed.
    """
    X_full, y_full = dataset.data, dataset.target
    n_samples = X_full.shape[0]
    n_features = X_full.shape[1]

    # Estimate the score on the entire dataset, with no missing values
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    full_scores = cross_val_score(estimator, X_full, y_full,
                                  scoring='neg_mean_squared_error')

    # Add missing values in 75% of the lines.  The builtin ``bool`` replaces
    # the ``np.bool`` alias, which was removed in NumPy 1.24.
    missing_rate = 0.75
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                          dtype=bool),
                                 np.ones(n_missing_samples,
                                         dtype=bool)))
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)

    # One corrupted copy (missing entries encoded as 0) serves every
    # imputation strategy below.
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()

    # Estimate the score after replacing missing values by 0
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after imputation (mean strategy) of the missing values
    estimator = make_pipeline(
        make_union(SimpleImputer(missing_values=0, strategy="mean"),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after chained imputation of the missing values
    estimator = make_pipeline(
        make_union(ChainedImputer(missing_values=0, random_state=0),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    chained_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                            scoring='neg_mean_squared_error')

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (chained_impute_scores.mean(), chained_impute_scores.std()))
def build_text_extraction(binary, min_df, ngram, stopwords,useTfIdf ):
    """Return a vectorizer + ``ClassifierOvOAsFeatures`` pipeline.

    With ``useTfIdf`` a TfidfVectorizer is used; it only takes ``min_df``
    from the arguments (its n-gram range is fixed to (1, 3)).  Otherwise a
    CountVectorizer is configured from ``binary``, ``min_df``, ``ngram`` and
    ``stopwords`` and splits documents on whitespace.
    """
    if useTfIdf:
        vectorizer = TfidfVectorizer(
            min_df=min_df,
            max_df=0.8,
            sublinear_tf=True,
            use_idf=True,
            ngram_range=(1, 3),
        )
    else:
        vectorizer = CountVectorizer(
            binary=binary,
            tokenizer=lambda x: x.split(),
            min_df=min_df,
            ngram_range=(1, ngram),
            stop_words=stopwords,
        )
    return make_pipeline(vectorizer, ClassifierOvOAsFeatures())
Beispiel #20
0
def out_fold_pred(params, X, y_array, y_ix, reps):
    """Return out-of-fold class-1 probabilities for one target column.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``LogisticRegression``.
    X : array indexed as ``X[rows, :]``
        Feature matrix.
    y_array : 2-D array
        Targets; column ``y_ix`` is the one being predicted.
    y_ix : int
        Index of the target column.
    reps
        Passed to ``makeKFold`` together with 5 and the target column.

    Returns
    -------
    numpy array with one entry per row of ``y_array``
        Rows that never appear in a test fold keep the value 0.
    """
    y = y_array[:, y_ix]

    # cross validation here
    preds = np.zeros((y_array.shape[0]))

    for train_ix, test_ix in makeKFold(5, y, reps):
        # A fresh, unfitted pipeline per fold (the extra pipeline that used
        # to be built before the loop was never used and has been removed).
        clf = make_pipeline(StandardScaler(), LogisticRegression(**params))
        clf.fit(X[train_ix, :], y[train_ix])
        preds[test_ix] = clf.predict_proba(X[test_ix, :])[:, 1]
    return preds
def build_synset_extraction(binary, min_df, ngram, useTfIdf):
    """Return a ``MapToSynsets`` + vectorizer + ``ClassifierOvOAsFeatures``
    pipeline.

    With ``useTfIdf`` a TfidfVectorizer is used; it only takes ``min_df``
    from the arguments (its n-gram range is fixed to (1, 3)).  Otherwise a
    CountVectorizer is configured from ``binary``, ``min_df`` and ``ngram``
    and splits documents on whitespace.
    """
    synset_mapper = MapToSynsets()
    if useTfIdf:
        vectorizer = TfidfVectorizer(
            min_df=min_df,
            max_df=0.8,
            sublinear_tf=True,
            use_idf=True,
            ngram_range=(1, 3),
        )
    else:
        vectorizer = CountVectorizer(
            binary=binary,
            tokenizer=lambda x: x.split(),
            min_df=min_df,
            ngram_range=(1, ngram),
        )
    return make_pipeline(synset_mapper, vectorizer, ClassifierOvOAsFeatures())
def check_pipeline_consistency(name, Estimator):
    """Check that ``make_pipeline(est)`` gives the same results as ``est``.

    Both the bare estimator and a one-step pipeline around it are fitted on
    the same blobs; ``score`` and ``fit_transform`` (when the estimator has
    them) must then agree to numerical precision.
    """
    unstable_on_32bit = ('CCA', 'LocallyLinearEmbedding', 'KernelPCA')
    if name in unstable_on_32bit and _is_32bit():
        # Those transformers yield non-deterministic output when executed on
        # a 32bit Python. The same transformers are stable on 64bit Python.
        # FIXME: try to isolate a minimalistic reproduction case only depending
        # scipy and/or maybe generate a test dataset that does not
        # cause such unstable behaviors.
        raise SkipTest(name + ' is non deterministic on 32bit Python')

    # check that make_pipeline(est) gives same score as est
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X -= X.min()
    y = multioutput_estimator_convert_y_2d(name, y)

    estimator = Estimator()
    set_fast_parameters(estimator)
    set_random_state(estimator)
    pipeline = make_pipeline(estimator)
    estimator.fit(X, y)
    pipeline.fit(X, y)

    for func_name in ("score", "fit_transform"):
        func = getattr(estimator, func_name, None)
        if func is None:
            # The estimator does not offer this method; nothing to compare.
            continue
        func_pipeline = getattr(pipeline, func_name)
        result = func(X, y)
        result_pipe = func_pipeline(X, y)
        assert_array_almost_equal(result, result_pipe)
def test_estimators_samples_deterministic():
    """Regression test: a bagged pipeline with a random step is reproducible.

    With a random step (e.g. SparseRandomProjection) and a given random
    state, refitting a sub-estimator on the samples/features recorded on the
    ensemble must give back the coefficients obtained at fit time.  Check
    issue #9524 for full discussion.
    """
    dataset = load_iris()
    X, y = dataset.data, dataset.target

    base_pipeline = make_pipeline(SparseRandomProjection(n_components=2),
                                  LogisticRegression())
    clf = BaggingClassifier(base_estimator=base_pipeline,
                            max_samples=0.5,
                            random_state=0)
    clf.fit(X, y)

    estimator = clf.estimators_[0]
    # Copy the coefficients before the refit below replaces them.
    coef_at_fit_time = estimator.steps[-1][1].coef_.copy()

    sample_idx = clf.estimators_samples_[0]
    feature_idx = clf.estimators_features_[0]
    X_train = X[sample_idx][:, feature_idx]
    y_train = y[sample_idx]

    estimator.fit(X_train, y_train)
    assert_array_equal(estimator.steps[-1][1].coef_, coef_at_fit_time)
def scipy_algo(dataset, abstract=False):
    """Cluster the documents of ``dataset`` hierarchically and score them.

    The TF-IDF matrix from ``DocumentsProcessor.get_data`` is densified,
    clustered with average linkage, and every non-leaf node of the resulting
    tree is treated as a cluster.  The clusters are scored against
    ``f_score_dict`` and the average F-score is printed and returned.

    Parameters
    ----------
    dataset
        Passed to ``dp.DocumentsProcessor``.
    abstract : bool
        Passed to ``get_data``.

    Returns
    -------
    The value returned by ``average_f_score``.
    """
    doc_proc = dp.DocumentsProcessor(dataset)
    tfidf_matrix, f_score_dict = doc_proc.get_data(abstract)

    # No LSA step is applied here; the SVD/normalizer pipeline that used to
    # be built at this point was never used and has been removed.

    # print(...) with a single pre-formatted string behaves the same on
    # Python 2 and Python 3.
    print('starting clustering after lsa: found %s document and %s features'
          % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]))

    linkage_matrix = hr.average(tfidf_matrix.toarray())

    # With rd=True, to_tree returns (root, list of all nodes).
    t = hr.to_tree(linkage_matrix, rd=True)

    clusters = {}

    for node in t[1]:
        if not node.is_leaf():
            leaves = []
            clusters[node.get_id()] = collect_leaf_nodes(node, leaves)

    f = f_score(clusters, f_score_dict)

    print_f_score_dict(f)

    avg_f_score = average_f_score(f, tfidf_matrix.shape[0])
    print('average f_score: %s' % avg_f_score)
    return avg_f_score
Beispiel #25
0
def train(param_search=False):
    """Train a TF-IDF + logistic regression review classifier.

    Parameters
    ----------
    param_search : bool
        If true, run a 5-fold grid search over the n-gram range and the
        regularisation strength ``C`` and return the refitted best estimator.
        Otherwise fit the default pipeline directly.

    Returns
    -------
    The fitted pipeline.
    """
    data = load_files(download())
    y = [data.target_names[t] for t in data.target]

    # The random state on the LR estimator is fixed to the most arbitrary value
    # that I could come up with. It is biased toward the middle number keys on
    # my keyboard.
    clf = make_pipeline(TfidfVectorizer(min_df=2, dtype=float,
                                        sublinear_tf=True,
                                        ngram_range=(1, 2),
                                        strip_accents='unicode'),
                        LogisticRegression(random_state=623, C=5000))

    if param_search:
        # make_pipeline names each step after its lower-cased class name, so
        # the grid keys must use those names; the former 'tfidf__...' and
        # 'lr__...' keys do not match any step of this pipeline.
        params = {'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
                  'logisticregression__C': [1000, 5000, 10000]}

        print("Starting parameter search for review sentiment classification")
        # We ignore the original folds in the data, preferring a simple 5-fold
        # CV instead; this is intended to get a working model, not results for
        # publication.
        gs = GridSearchCV(clf, params, cv=5, refit=True, n_jobs=-1, verbose=2)
        gs.fit(data.data, y)

        print("Parameters found:")
        pprint(gs.best_params_)
        print("Cross-validation accuracy: %.3f" % gs.best_score_)

        return gs.best_estimator_

    else:
        print("Training logistic regression for movie review polarity")
        return clf.fit(data.data, y)
def cluster_dandelion_2(dataset, gamma=0.91, filter=False):
    """Cluster the Dandelion-processed documents and return the linkage matrix.

    Duplicate of the clustering routine; it exists only to hand back the
    linkage matrix.  The F-score values are computed and written into the
    local ``params`` dict, which is not returned.
    """
    doc_proc = dp.DocumentsProcessor(dataset)
    if gamma:
        data = doc_proc.get_data_with_dandelion(gamma=gamma, filter=filter)
    else:
        # A falsy gamma falls back to the processor's own defaults.
        data = doc_proc.get_data_with_dandelion()
    tfidf_matrix, f_score_dict, params = data

    # LSA: truncated SVD followed by in-place normalisation.
    n_components = tfidf_matrix.shape[0]
    lsa = make_pipeline(TruncatedSVD(n_components), Normalizer(copy=False))
    tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    linkage_matrix = hr.average(tfidf_matrix)

    # With rd=True, to_tree returns (root, list of all nodes).
    all_nodes = hr.to_tree(linkage_matrix, rd=True)[1]

    # Every non-leaf node of the tree is treated as a cluster.
    clusters = {}
    for node in all_nodes:
        if node.is_leaf():
            continue
        leaves = []
        clusters[node.get_id()] = collect_leaf_nodes(node, leaves)

    f = f_score(clusters, f_score_dict)

    all_scores = print_f_score_dict(f)

    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = all_scores

    return linkage_matrix
Beispiel #27
0
def test_bagging_with_pipeline():
    """The last step of a bagged pipeline ends up with an integer seed."""
    base_pipeline = make_pipeline(SelectKBest(k=1), DecisionTreeClassifier())
    estimator = BaggingClassifier(base_pipeline, max_features=2)
    estimator.fit(iris.data, iris.target)
    # Inspect the final step of the first fitted sub-estimator.
    final_step = estimator[0].steps[-1][1]
    assert_true(isinstance(final_step.random_state, int))
def main():
    """Run the LSA -> de-duplication -> K-Means -> distance-matrix pipeline.

    Every stage caches its result under a path taken from the module-level
    ``args``; a stage is skipped when its output file(s) already exist:

    1. TF-IDF + truncated SVD + normalisation of the corpus (``X``).
    2. De-duplication of the rows of ``X`` and K-Means on the unique rows.
    3. One distance matrix per K-Means label.
    4. The corpus index, written as JSON.
    """
    if os.path.exists(args.out_svd_result_matrix):
        print("Loading SVD matrix from file")
        X = np.load(args.out_svd_result_matrix)
        print("Loading corpus")
        _, file_index = LoadCorpus(args.training_dir)
    else:
        print("Loading corpus")
        corpus, file_index = LoadCorpus(args.training_dir)
        print("Building TF-IDF")
        tf_idf = TfidfVectorizer(input="content", lowercase=False)
        X = tf_idf.fit_transform(corpus)
        # The raw corpus is no longer needed once vectorised.
        del corpus
        print("Running LSA")
        svd = TruncatedSVD(args.dimentionality)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)
        print("Saving SVD results")
        np.save(args.out_svd_result_matrix, X)
    if (
        os.path.exists(args.out_inv_idx)
        and os.path.exists(args.out_unique_kmeans_labels)
        and os.path.exists(args.out_idx)
    ):
        print("Loading labels")
        unique_labels = np.load(args.out_unique_kmeans_labels)
        inv_idx = np.load(args.out_inv_idx)
        idx = np.load(args.out_idx)
        unique_X = X[idx]
    else:
        print("Unique matrix")
        # View every row as one opaque void item so np.unique compares whole
        # rows at once.
        b = np.ascontiguousarray(X).view(np.dtype((np.void, X.dtype.itemsize * X.shape[1])))
        _, idx, inv_idx = np.unique(b, return_index=True, return_inverse=True)
        print("Saving inv_idx")
        np.save(args.out_inv_idx, inv_idx)
        print("Saving idx")
        np.save(args.out_idx, idx)
        unique_X = X[idx]
        print("Running K-Means")
        unique_labels, _ = KMeans(unique_X)
        print("Save unique K-Means labels")
        np.save(args.out_unique_kmeans_labels, unique_labels)
    print("Re-label non-unique")
    # Map the labels of the unique rows back onto every original row.
    labels = unique_labels[inv_idx]

    for label in range(unique_labels.max() + 1):
        out_filename = args.out_unique_distance_matrix_prefix + str(label) + ".npy"
        if os.path.exists(out_filename):
            continue
        print("Calculating distance matrix for label:", label)
        D = CalcDistances(unique_labels, label, unique_X)
        print("Saving to distance matrix to file")
        np.save(out_filename, D)

    if not os.path.exists(args.out_corpus_index):
        print("Calculating corpus index")
        corpus_index = GetCorpusIndex(file_index, labels, unique_labels, inv_idx)
        print("Saving corpus index")
        # The context manager closes (and flushes) the file deterministically;
        # the previous bare open() left that to the garbage collector.
        with open(args.out_corpus_index, "w") as index_file:
            json.dump(corpus_index, index_file)
Beispiel #29
0
def get_input_vector(fields, vec_name, data):
    """Transform the input and create a 2D vector to cluster.

    The transformer built by ``create_input_transformer`` is chained with a
    default ``TruncatedSVD`` and the pipeline is fitted on ``data``.
    """
    steps = (create_input_transformer(fields, vec_name), TruncatedSVD())
    pipeline = make_pipeline(*steps)

    # Logged once the pipeline object exists, i.e. before it is fitted.
    log_info('Transformation pipeline complete.')
    return pipeline.fit_transform(data)
def build_lex_extraction(binary, min_df, ngram):
    """Return an ``InquirerLexTransform`` + CountVectorizer + ``Densifier``
    pipeline.

    The CountVectorizer splits documents on whitespace and counts n-grams of
    length 1 to ``ngram``.
    """
    lexicon = InquirerLexTransform()
    counter = CountVectorizer(
        binary=binary,
        tokenizer=lambda x: x.split(),
        min_df=min_df,
        ngram_range=(1, ngram),
    )
    return make_pipeline(lexicon, counter, Densifier())
# Split X: the last `forecast_out` rows (X_lately) are held back, the
# remaining rows (X_train) are used to fit the models below.
X_lately = X[-forecast_out:]
X_train = X[:-forecast_out]

# Take the 'label' column as y and drop its last `forecast_out` entries so
# that y_train lines up with X_train.
y = np.array(dfreg['label'])
y_train = y[:-forecast_out]

# Model generation

# Linear regression
clfreg = LinearRegression(n_jobs=-1)
clfreg.fit(X_train, y_train)

# Degree-2 (quadratic) polynomial features + Ridge regression
clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
clfpoly2.fit(X_train, y_train)

# Degree-3 (cubic) polynomial features + Ridge regression
clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
clfpoly3.fit(X_train, y_train)

# KNN regression with 2 neighbours
clfknn = KNeighborsRegressor(n_neighbors=2)
clfknn.fit(X_train, y_train)

# Evaluation (disabled: X_test / y_test are not defined in this section)
#confidencereg = clfreg.score(X_test, y_test)
#confidencepoly2 = clfpoly2.score(X_test,y_test)
#confidencepoly3 = clfpoly3.score(X_test,y_test)
#confidenceknn = clfknn.score(X_test, y_test)
Beispiel #32
0
 def create_pipeline(search_space):
     """Build an imputer + logistic-regression pipeline from ``search_space``.

     ``search_space['simpleimputer']`` and
     ``search_space['logisticregression']`` are keyword-argument mappings
     for the respective steps; the classifier always uses the
     ``'liblinear'`` solver.
     """
     imputer = SimpleImputer(**search_space['simpleimputer'])
     classifier = LogisticRegression(
         solver='liblinear', **search_space['logisticregression'])
     return make_pipeline(imputer, classifier)
# epochs = epochs.apply_baseline(baseline=(-0.2, 0))

######################
# MVPA
# Init Cross-validation instance
logo = LeaveOneGroupOut()
# Init Xdawn, classifier and time_decoder
xdawn = mne.preprocessing.Xdawn(n_components=6, reg='diagonal_fixed')

# Init classifier
class_weight = 'balanced'  # None or 'balanced'
lr = LogisticRegression(solver='lbfgs', class_weight=class_weight)
svm = svm.SVC(gamma='scale', kernel='rbf', class_weight=class_weight)

# Init decoder
clf = make_pipeline(StandardScaler(), svm)
# Raw 3-D data decoder
raw_decoder = make_pipeline(mne.decoding.Vectorizer(), StandardScaler(), clf)
# Time resolution data decoder
time_decoder = SlidingEstimator(clf, n_jobs=n_jobs, scoring='f1', verbose=1)

# Init time information
times = epochs.times
n_times = len(times)
y_true = epochs.events[:, 2]
n_samples = len(y_true)
# Time window information
sfreq = epochs.info['sfreq']
window_info = {}
y_pred_timewindow = {}
for window_length in [0.1, 0.2, 0.3, 0.4]:
Beispiel #34
0
def test_in_pipeline():
    """Smoke test: a transformer + LogisticRegression pipeline can be fitted.

    Passes as long as fitting on the generated data does not raise.
    """
    from sklearn.pipeline import make_pipeline

    features, target = make_classification(
        n_samples=100, n_features=5, chunksize=10)
    steps = (DoNothingTransformer(), LogisticRegression())
    make_pipeline(*steps).fit(features, target)
Beispiel #35
0
def _read_labelled_features(lines):
    """Parse tab-separated lines into parallel feature and label lists.

    Every field except the last is converted to a float (the feature
    vector); the last field is converted to an int (the label).
    """
    feats = []
    labels = []
    for line in lines:
        parts = line.rstrip("\n").split("\t")
        feats.append([float(v) for v in parts[:-1]])
        labels.append(int(parts[-1]))
    return feats, labels


def train_classifier(input_features, test_features, classifier_type,
                     classifier_output, feat_names):
    """Train a classifier, save it, and histogram its scores on a test set.

    Args:
        input_features: iterable of tab-separated training lines; the last
            field of each line is the integer label, the rest are floats.
        test_features: iterable of test lines in the same format.
        classifier_type: one of "svm", "mlp", "extra_trees", "nn", "nn1",
            "adaboost" or "random_forest". Any other value logs an error
            and exits the process with status 1.
        classifier_output: destination handed to ``joblib.dump``.
        feat_names: feature names, paired positionally with the fitted
            feature importances when those are logged.

    Returns:
        A pair of lists ``(good, wrong)``: histogram counts (bin edges
        ``np.arange(0, 1.1, 0.1)``) of the second ``predict_proba`` column
        for test samples labelled 1 and for all other test samples.
    """
    train_feats, train_labels = _read_labelled_features(input_features)
    train_data = np.array(train_feats)
    train_target = np.array(train_labels)

    # Hyper-parameter grid; stays None for classifiers trained with fixed
    # settings, in which case no grid search is performed.
    parameters = None

    # Select the classifier
    if classifier_type == "svm":
        clf = make_pipeline(MinMaxScaler(),
                            svm.SVC(gamma=0.001, C=100., probability=True))
    elif classifier_type == "mlp":
        clf = MLPClassifier(verbose=True,
                            solver='adam',
                            alpha=1e-5,
                            hidden_layer_sizes=(100, ),
                            random_state=1,
                            shuffle=True,
                            early_stopping=True,
                            validation_fraction=0.1)
    elif classifier_type in ("extra_trees", "random_forest"):
        # Both tree ensembles share the same fixed settings and search grid.
        parameters = {
            'criterion': ('gini', 'entropy'),
            'n_estimators': [100, 200, 300, 400, 500],
        }
        if classifier_type == "extra_trees":
            ensemble_cls = ExtraTreesClassifier
        else:
            ensemble_cls = RandomForestClassifier
        clf = ensemble_cls(bootstrap=True,
                           class_weight=None,
                           criterion='gini',
                           max_depth=None,
                           max_features='auto',
                           max_leaf_nodes=None,
                           min_impurity_decrease=0.0,
                           min_impurity_split=None,
                           min_samples_leaf=1,
                           min_samples_split=2,
                           min_weight_fraction_leaf=0.0,
                           n_estimators=200,
                           n_jobs=1,
                           oob_score=False,
                           random_state=0,
                           verbose=0,
                           warm_start=False)
    elif classifier_type == "nn":
        clf = neighbors.KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
    elif classifier_type == "nn1":
        clf = neighbors.KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
    elif classifier_type == "adaboost":
        clf = AdaBoostClassifier(n_estimators=100)
    else:
        logging.error("Unknown classifier: %s", classifier_type)
        sys.exit(1)

    # Wrap in a grid search only when a grid was defined above
    if parameters is not None:
        clf = GridSearchCV(clf, parameters, n_jobs=-1)

    clf.fit(train_data, train_target)

    # Log sorted feature importances with their names
    if classifier_type in ('random_forest', 'adaboost', 'extra_trees'):
        if isinstance(clf, GridSearchCV):
            importances = clf.best_estimator_.feature_importances_
        else:
            importances = clf.feature_importances_
        feat_dict = dict(zip(feat_names, importances))

        logging.info("Top 10 important features: ")
        sorted_f = sorted(feat_dict.items(),
                          key=lambda item: item[1],
                          reverse=True)
        for feat in sorted_f[:10]:
            logging.info("\t{:.5f}: {}".format(feat[1], feat[0]))

    # Save classifier and log best params found by grid search
    if isinstance(clf, GridSearchCV):
        joblib.dump(clf.best_estimator_, classifier_output, compress=3)
        logging.info('Best classifier parameters found:')
        for k, v in clf.best_params_.items():
            logging.info('\t {}: {}'.format(k, v))
    else:
        joblib.dump(clf, classifier_output, compress=3)

    # Score the test set and split the scores by true label
    test_feats, test_labels = _read_labelled_features(test_features)
    prediction = clf.predict_proba(np.array(test_feats))

    good = []
    wrong = []
    for label, pred in zip(test_labels, prediction):
        if label == 1:
            good.append(pred[1])
        else:
            wrong.append(pred[1])

    bins = np.arange(0, 1.1, 0.1)
    hgood = np.histogram(good, bins=bins)
    hwrong = np.histogram(wrong, bins=bins)

    return hgood[0].tolist(), hwrong[0].tolist()
    print 'Accuracy = %s' % (float(equal)/len(Y_pred))


# Load the red-wine quality dataset (semicolon-separated CSV)
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url, sep=';')

# Split into training and test sets: 'quality' is the target, every other
# column is a feature; 25% is held out, stratified on the target
Y = data.quality
X = data.drop('quality', axis=1)
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.25, random_state=123, stratify=Y)

# Build the model: standardise the features, then a 500-tree random forest
# using the entropy criterion
pipeline = make_pipeline(preprocessing.StandardScaler(),
                         RandomForestClassifier(n_estimators=500, criterion='entropy'))

# Hyper-parameter grid; keys are '<step name>__<parameter>', where the step
# name is the one make_pipeline derives from the class name
hyperparameters = {'randomforestclassifier__max_features': [
    'auto', 'sqrt', 'log2'], 'randomforestclassifier__max_depth': [None, 5, 3, 1, 7]}

# Fit the classifier with a 5-fold cross-validated grid search
clf = GridSearchCV(pipeline, hyperparameters, cv=5)
clf.fit(X_train, Y_train)

# Predict on the test set
Y_pred = clf.predict(X_test)

# Report accuracy (accuracy() is defined outside this excerpt)
accuracy(Y_test.tolist(), Y_pred)
Beispiel #37
0
from sklearn.linear_model import LogisticRegression
import mglearn
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

data = pd.read_excel("Data/Data_TrainProcessed.xlsx",
                     error_bad_lines=False,
                     encoding='utf-8')
y_score = (data['Rate'].values).reshape(-1, 1)
binaray = Binarizer(threshold=3)
y = binaray.fit_transform(y_score)
y_train = np.array(y).flatten()

pipe = make_pipeline(
    TfidfVectorizer(min_df=5, max_df=0.8, max_features=3000,
                    sublinear_tf=True), LogisticRegression())
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(data['Review'].values.astype('U'), y_train)
vectorizer = grid.best_estimator_.named_steps["tfidfvectorizer"]
# transform the training dataset
X_train = vectorizer.transform(data['Review'].values.astype('U'))
# find maximum value for each of the features over the dataset
max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()
# get feature names
feature_names = np.array(vectorizer.get_feature_names())
mglearn.tools.visualize_coefficients(
    grid.best_estimator_.named_steps["logisticregression"].coef_,
    feature_names,
            X_array = np.asarray(X)

            X_array = np.asarray(X)
            is_all_zero = np.all(X_array == 0)
            if is_all_zero:
                print('array is all zeros')
            else:
                print('Array is good')
                choice_length = np.count_nonzero(~np.isnan(labels))

                X, y = shuffle(X_array, labels)
                X = X[:choice_length]
                y = y[:choice_length].fillna(0)

                scaler = MinMaxScaler(feature_range=(-1, 1))
                mm = make_pipeline(MinMaxScaler(), Normalizer())
                X = mm.fit_transform(X)
                rbf_feature = RBFSampler(gamma=1.5, random_state=10)
                ps = PolynomialCountSketch(degree=11, random_state=1)
                X_rbf_features = rbf_feature.fit_transform(X)
                X_poly_features = ps.fit_transform(X)
                # We want to get TSNE embedding with 2 dimensions
                n_components = 3
                tsne = TSNE(n_components)
                tsne_result = tsne.fit_transform(X_rbf_features)
                locationFileName = os.path.join(
                    figuresDestination,
                    str(sorted(symbols)[symbolIdx]) + '_idx_' + str(idx) +
                    'date_' + str(dateIdx) + '_' + str(labelName) +
                    '_tsne_rbf_kernelised.png')
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn import metrics
import os

train_prefix = "temp/final/train_5/"
test_prefix = "temp/final/test/"

# NOTE(review): `input` shadows the builtin of the same name; the names are
# kept unchanged here so any later code that reads them keeps working.
input = []
label = []

# Read every CSV file in the training directory: all columns but the last
# are float features, the last column is the (float) label.
for file in os.listdir(train_prefix):
    # The context manager closes each file as soon as it has been read;
    # newline="" is what the csv module expects for its input files.
    with open(train_prefix + file, "r", newline="") as data:
        data_read = csv.reader(data)
        for lines in data_read:
            input.append([float(elem) for elem in lines[0:-1]])
            label.append(float(lines[-1]))

X = np.array(input)
Y = np.array(label)

h = .02  # step size in the mesh

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                    test_size=0.2)
# Standardise the features, then fit a logistic regression with C=1e5
std_clf = make_pipeline(StandardScaler(), linear_model.LogisticRegression(C=1e5))
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)

# Print the test accuracy as a percentage
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))
Beispiel #40
0
    data_cfg, **repr_cfg[utils_section])
# change y in case of classification
if 'classification' == task_cfg[utils_section]['task']:
    log_scale = True if 'log' == data_cfg[csv_section]['scale'].lower().strip(
    ) else False
    y = task_cfg[utils_section]['cutoffs'](y, log_scale)
    test_y = task_cfg[utils_section]['cutoffs'](test_y, log_scale)

training_features = x
training_target = y
testing_features = test_x

# Average CV score on the training set was: 0.8734423037820145
exported_pipeline = make_pipeline(
    Binarizer(threshold=0.15000000000000002),
    ExtraTreesClassifier(bootstrap=False,
                         criterion="entropy",
                         max_depth=20,
                         max_features=0.15000000000000002,
                         max_samples=None,
                         min_samples_leaf=2,
                         min_samples_split=7,
                         n_estimators=500))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 666)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

print('Success.')
    def calculate_score(deg, X, y):
        """Fit a degree-``deg`` polynomial Bayesian ridge model and score it.

        The pipeline scales the inputs, expands them to polynomial features
        of degree ``deg`` and fits ``BayesianRidge``. The returned value is
        the pipeline's ``score`` on the same ``X`` and ``y`` it was fitted on.
        """
        stages = [
            StandardScaler(),
            PolynomialFeatures(deg),
            BayesianRidge(normalize=False),
        ]
        model = make_pipeline(*stages)  # type: Pipeline
        return model.fit(X, y).score(X, y)
Beispiel #42
0
#Validation function
n_folds = 5


def rmsle_cv(model):
    """Return the per-fold cross-validated RMSE of ``model``.

    The model is evaluated on the module-level ``train`` frame and
    ``y_train`` target with ``n_folds`` shuffled folds. Each fold's score is
    the square root of the negated ``neg_mean_squared_error``. The array of
    fold scores is printed and returned.
    """
    # Pass the splitter itself. ``KFold.get_n_splits`` only returns the
    # number of folds, and handing that integer to ``cross_val_score`` would
    # silently drop ``shuffle=True`` and ``random_state=42``.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(
        model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf))
    print("rmse", rmse)
    return rmse


# 模型
# LASSO Regression :
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
# Elastic Net Regression
ENet = make_pipeline(
    RobustScaler(), ElasticNet(
        alpha=0.0005, l1_ratio=.9, random_state=3))
# Kernel Ridge Regression
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
# Gradient Boosting Regression
GBoost = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
test = rated_df.loc[~rated_df.index.isin(train.index)]
print("Train rows: {}".format(len(train.index)))
print("Test rows: {}".format(len(test.index)))

# In[#    Train Model with Data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

sc = StandardScaler()

mlc = MLPClassifier(activation='relu', random_state=1, nesterovs_momentum=True)
# NOTE(review): `loo` is not used below -- the grid search runs with cv=5.
loo = LeaveOneOut()
pipe = make_pipeline(sc, mlc)

# Train the model and check which of the parameters works best
parameters = {
    "mlpclassifier__hidden_layer_sizes": [(300, ), (500, )],
    "mlpclassifier__solver": ("sgd", "lbfgs"),
    "mlpclassifier__max_iter": [500, 1000, 2000],
    "mlpclassifier__learning_rate_init": [0.001, 0.1]
}
MLPClassifierModel = GridSearchCV(pipe, parameters, n_jobs=-1, cv=5)
MLPClassifierModel.fit(train[features], train[target])

# Save the fitted search to a file so it can be used later. The context
# manager closes the file even if pickling raises.
with open("test3_k_t_o_MLPClassifierModel_10_comp.pkl", 'wb') as file:
    pickle.dump(MLPClassifierModel, file)
# %%
# Scikit-learn provides an estimator called
# :class:`~sklearn.linear_model.LassoLarsIC` that uses either Akaike's
# information criterion (AIC) or the Bayesian information criterion (BIC) to
# select the best model. Before fitting this model, we will scale the
# dataset.
#
# In the following, we are going to fit two models to compare the values
# reported by AIC and BIC.
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoLarsIC
from sklearn.pipeline import make_pipeline

lasso_lars_ic = make_pipeline(StandardScaler(),
                              LassoLarsIC(criterion="aic",
                                          normalize=False)).fit(X, y)


# %%
# To be in line with the definition in [ZHT2007]_, we need to rescale the
# AIC and the BIC. Indeed, Zou et al. are ignoring some constant terms
# compared to the original definition of AIC derived from the maximum
# log-likelihood of a linear model. You can refer to
# :ref:`mathematical detail section for the User Guide <lasso_lars_ic>`.
def zou_et_al_criterion_rescaling(criterion, n_samples, noise_variance):
    """Rescale an information criterion to the Zou et al. definition.

    Subtracts ``n_samples * log(2 * pi * noise_variance)`` and then
    ``n_samples`` from ``criterion``.
    """
    log_term = n_samples * np.log(2 * np.pi * noise_variance)
    rescaled = criterion - log_term - n_samples
    return rescaled

data_path = os.path.join(os.path.expanduser('~'), 'kaggle/Caterpillar/data')

dataset = 'br6lin'

train = pd.read_csv(os.path.join(stage1, dataset, 'training.csv'))
target = train.cost
train.drop(['cost'], axis=1, inplace=True)
tube_assembly_ids = pd.read_csv(os.path.join(data_path, 'train_set.csv'), usecols=['tube_assembly_id'])
train['tube_assembly_id'] = tube_assembly_ids

test = pd.read_csv(os.path.join(stage1, dataset, 'testing.csv'))
test.drop(['cost'], axis=1, inplace=True)


preprocess = make_pipeline(
                ScaleContinuousOnly(),
#                StandardScaler(),
)

cwd = os.getcwd()
refit_train_val_dir = 'refit_train_val'
refit_train_dir = 'refit_train'

#epoch_save_range = range(30, 101, 5)
epoch_save_range = list(range(10, 11, 5))

t0 = time.time()
n_folds = 3
do_folds = 3

def KLabelFold(labels, n_folds=3, shuffle=False, random_state=None):
    kfold = KFold(labels.nunique(), n_folds=n_folds, shuffle=shuffle, random_state=random_state)
y -= y.mean()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    random_state=0)

##############################################################################
# Partial Dependence computation for multi-layer perceptron
# ---------------------------------------------------------
#
# Let's fit a MLPRegressor and compute single-variable partial dependence
# plots

print("Training MLPRegressor...")
tic = time()
est = make_pipeline(QuantileTransformer(),
                    MLPRegressor(hidden_layer_sizes=(50, 50),
                                 learning_rate_init=0.01,
                                 early_stopping=True))
est.fit(X_train, y_train)
print("done in {:.3f}s".format(time() - tic))
print("Test R2 score: {:.2f}".format(est.score(X_test, y_test)))

##############################################################################
# We configured a pipeline to scale the numerical input features and tuned the
# neural network size and learning rate to get a reasonable compromise between
# training time and predictive performance on a test set.
#
# Importantly, this tabular dataset has very different dynamic ranges for its
# features. Neural networks tend to be very sensitive to features with varying
# scales and forgetting to preprocess the numeric feature would lead to a very
# poor model.
#
                 "classifier__solver":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty
                 },
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_depth":[5,8,15,25,30,None],
                 "classifier__min_samples_leaf":[1,2,5,10,15,100],
                 "classifier__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))

#MakePipelines In SKLearn

from sklearn.pipeline import make_pipeline
# Create a single-step pipeline; make_pipeline names the step after the
# lowercased class name ('randomforestclassifier')
pipe = make_pipeline((RandomForestClassifier()))

# Search grid: the first key swaps the estimator used for the
# 'randomforestclassifier' step, the remaining keys tune its hyperparameters
grid_param = [
                {"randomforestclassifier": [RandomForestClassifier()],
                 "randomforestclassifier__n_estimators": [10, 100, 1000],
                 "randomforestclassifier__max_depth":[5,8,15,25,30,None],
                 "randomforestclassifier__min_samples_leaf":[1,2,5,10,15,100],
                 "randomforestclassifier__max_leaf_nodes": [2, 5,10]}]
# Create a grid search over the pipeline, then fit it to find the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # 5-fold CV on all cores
# fit() returns the search object itself, so best_model is the GridSearchCV
best_model = gridsearch.fit(X_train,y_train)
# Score on the held-out data (the value is not stored or printed)
best_model.score(X_test,y_test)
Beispiel #48
0
    print("%d sentences" % len(dataset.data))
    print("%d categories" % len(dataset.target_names))
    print()

print(
    "Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english',
                                   alternate_sign=False,
                                   norm=None,
                                   binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer(norm=opts.norm))
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       alternate_sign=False,
                                       norm=opts.norm,
                                       binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 max_features=opts.n_features,
                                 min_df=2,
                                 stop_words='english',
                                 use_idf=opts.use_idf,
                                 norm=opts.norm)
X = vectorizer.fit_transform(dataset.data)
Beispiel #49
0
# commented out after the run
#pipeline_optimizer.fit(train, train_labels)

# export optimized code
# commented out after the run
#pipeline_optimizer.export('tpot_titanic_pipeline.py')

# import libraries
from sklearn.pipeline import make_pipeline

# create the pipeline from TPOT
# the original pipeline included a Binarizer and RBFSampler, which scored
# only 0.78947; only the random forest step is kept here
exported_pipeline = make_pipeline(
    RandomForestClassifier(bootstrap=False,
                           criterion="gini",
                           max_features=0.45,
                           min_samples_leaf=14,
                           min_samples_split=13,
                           n_estimators=100))

# fit the pipeline on the train data
exported_pipeline.fit(train, train_labels)

# predict on the test data
results = exported_pipeline.predict(test)

# In[ ]:

# make a submission dataframe: PassengerId from df_test plus the predictions
# NOTE(review): assumes the rows of `test` are in the same order as df_test
submit = df_test.loc[:, ['PassengerId']]
submit.loc[:, 'Survived'] = results
Beispiel #50
0
                    km.labels_,
                    sample_size=1000,
                    random_state=random_state)
                logger.debug("Silhouette Coefficient: %0.3f" %
                             silhouette_score)
                logger.debug("Homogeneity: %0.3f" % homogeneity)
                text_to_display = 'homogeneity: %.2f\nsilhouette: %.2f' % (
                    homogeneity, silhouette_score)
            elif do_lsa:
                topic_model_name = 'LSA'
                n_components = 300
                n_components = min(100, tfidf.shape[1] - 1)

                svd = TruncatedSVD(n_components, random_state=random_state)
                normalizer = Normalizer(copy=False)
                lsa = make_pipeline(svd, normalizer)

                lsa_results = lsa.fit_transform(tfidf)

                explained_variance = svd.explained_variance_ratio_.sum()
                logger.debug('Explained variance of the SVD step: %d' %
                             int(explained_variance * 100))

                true_k = tfidf.shape[0] * tfidf.shape[1] / tfidf.nnz
                logger.debug('we are looking for %d clusters' % true_k)
                verbose = False
                if minibatch:
                    km = MiniBatchKMeans(n_clusters=true_k,
                                         init='k-means++',
                                         n_init=1,
                                         init_size=1000,
Beispiel #51
0
# 3. Load red wine data (semicolon-separated CSV).
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url, sep=';')

# 4. Split data into training and test sets ('quality' is the target; 20% is
#    held out, stratified on the target)
y = data.quality
X = data.drop('quality', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123,
                                                    stratify=y)

# 5. Declare data preprocessing steps and the model (standardise the
#    features, then a 100-tree random forest regressor)
pipeline = make_pipeline(preprocessing.StandardScaler(),
                         RandomForestRegressor(n_estimators=100))

# 6. Declare hyperparameters to tune; keys are '<step name>__<parameter>'
hyperparameters = {
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [None, 5, 3, 1]
}

# 7. Tune model using a 10-fold cross-validated grid search over the pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv=10)

clf.fit(X_train, y_train)

# 8. Refit on the entire training set
# No additional code needed if clf.refit == True (default is True)
Beispiel #52
0
#   or indices provided. We will obtain as many subsets as the number of
#   transformers passed into the `ColumnTransformer`.
# * It **transforms each subset**. A specific transformer is applied to
#   each subset: it will internally call `fit_transform` or `transform`. The
#   output of this step is a set of transformed datasets.
# * It then **concatenates the transformed datasets** into a single dataset.

# The important thing is that `ColumnTransformer` is like any other
# scikit-learn transformer. In particular it can be combined with a classifier
# in a `Pipeline`:

# %%
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))

# %% [markdown]
# Starting from `scikit-learn 0.23`, the notebooks can display an interactive
# view of the pipelines.

# %%
from sklearn import set_config
set_config(display='diagram')
model

# %% [markdown]
# The final model is more complex than the previous models but still follows
# the same API (the same set of methods that can be called by the user):
#
# - the `fit` method is called to preprocess the data and then train the
def PolynomialLasso(degree=1, alpha=1):
    """Return a pipeline: polynomial features -> standard scaling -> Lasso.

    ``degree`` sets the polynomial expansion (built without the bias
    column) and ``alpha`` is forwarded to ``Lasso``.
    """
    expansion = PolynomialFeatures(degree=degree, include_bias=False)
    scaler = StandardScaler()
    regressor = Lasso(alpha=alpha)
    return make_pipeline(expansion, scaler, regressor)
def PolynomialRegression(degree=1):
    """Return a pipeline: polynomial features -> ordinary linear regression.

    ``degree`` sets the polynomial expansion, which is built without the
    bias column.
    """
    expansion = PolynomialFeatures(degree=degree, include_bias=False)
    regressor = LinearRegression()
    return make_pipeline(expansion, regressor)
Beispiel #55
0
# Create separate object for target variable
y = df.<feature>
# Create separate object for input features
X = df.drop('<target feature>', axis = 1)

#Use to split data into train and test data
train_test_split

# Function for creating model pipelines
from sklearn.pipeline import make_pipeline
# For standardization
from sklearn.preprocessing import StandardScaler

#Good practice: create dictionary holding different algos
pipelines = {
	'lasso': make_pipeline(StandardScaler(), lasso()),
	...
}

#Do same thing for hyperparameters grid; one per algo
lasso_hyperparameters = { 
    'lasso__alpha' : [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10] 
}

# Create hyperparameters dictionary
hyperparameters = {
    'lasso': lasso_hyperparameters,
    ...
}

# List tuneable hyperparameters of our Lasso pipeline
# Class labels are taken from the last column of the epochs' event array
labels = epochs.events[:, -1]
evoked = epochs.average()

###############################################################################
# Decoding in tangent space with a logistic regression

n_components = 2  # pick some components

# Define a shuffled 10-fold cross-validation generator with a fixed seed
cv = KFold(n_splits=10, shuffle=True, random_state=42)
epochs_data = epochs.get_data()

# Xdawn covariances -> tangent-space projection -> logistic regression
clf = make_pipeline(
    XdawnCovariances(n_components),
    TangentSpace(metric="riemann"),
    LogisticRegression(),
)

# Out-of-fold predictions, filled in one test fold at a time
preds = np.zeros(len(labels))

for train_idx, test_idx in cv.split(epochs_data):
    # y_test is unpacked but not used; accuracy is computed after the loop
    y_train, y_test = labels[train_idx], labels[test_idx]

    clf.fit(epochs_data[train_idx], y_train)
    preds[test_idx] = clf.predict(epochs_data[test_idx])

# Printing the results: fraction of out-of-fold predictions matching labels
acc = np.mean(preds == labels)
print("Classification accuracy: %f " % (acc))
Beispiel #57
0
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Normalizer
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
# 'PATH/TO/DATA/FILE' and 'COLUMN_SEPARATOR' are placeholders that must be
# replaced before this script can run.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
# Features are every column except 'target'
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.032380308137753055
# Pipeline: a random forest wrapped in TPOT's StackingEstimator, then L1
# normalisation, then a 45-neighbour KNN classifier with p=1
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=RandomForestClassifier(bootstrap=False,
                                                       criterion="gini",
                                                       max_features=0.55,
                                                       min_samples_leaf=2,
                                                       min_samples_split=4,
                                                       n_estimators=100)),
    Normalizer(norm="l1"),
    KNeighborsClassifier(n_neighbors=45, p=1, weights="uniform"))

# Fit on the training split and predict the held-out split
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def test_bagging_with_pipeline():
    """Smoke test: fit a BaggingClassifier whose base estimator is a pipeline.

    The pipeline selects a single feature (SelectKBest with k=1) before a
    decision tree, while the bagging wrapper is configured with
    max_features=2. The test passes if fitting on the iris data raises
    nothing.
    """
    base_pipeline = make_pipeline(SelectKBest(k=1), DecisionTreeClassifier())
    estimator = BaggingClassifier(base_pipeline, max_features=2)
    estimator.fit(iris.data, iris.target)
Beispiel #59
0
# Random forest whose fitted state is handed to SelectFromModel below.
forest_params = dict(
    n_estimators=400,
    verbose=100,
    n_jobs=-1,
    random_state=0,
    max_samples=5000,
)
rf = RandomForestClassifier(**forest_params)
rf.fit(X_train_rf, y_train)

# prefit=True: the selector reuses the already-fitted forest instead of
# refitting it; at most 100000 features are kept.
feature_selector = SelectFromModel(rf, prefit=True, max_features=100000)

# Draw a reproducible 100000-row subsample of the training data for the SVC.
svc_sample = pd.concat([X_train, y_train], axis=1).sample(100000, random_state=0)
sample_reviews = svc_sample['review']
sample_labels = svc_sample['label']

# Vectorize the sampled reviews, then prune the columns with the selector.
sample_features = feature_selector.transform(vectorizer.transform(sample_reviews))

svc = SVC(cache_size=1000, random_state=0)
svc.fit(sample_features, sample_labels)

# Bundle the already-fitted steps so raw reviews can be scored end to end.
final_pipe = make_pipeline(vectorizer, feature_selector, svc)

# 95.92% accuracy
final_pipe.score(X_test, y_test)

dump(final_pipe, 'trained_models/good_svc_pruned')
# Load the Wisconsin breast-cancer (wdbc) data straight from the UCI
# repository. header=None: the columns get integer labels 0, 1, 2, ...
df = pd.read_csv('https://archive.ics.uci.edu/ml/'
                 'machine-learning-databases'
                 '/breast-cancer-wisconsin/wdbc.data', header=None)

# Columns labeled 2 and onward are the features; column 1 is the class label.
X = df.loc[:, 2:].values
y = df.loc[:, 1].values
# Encode the class labels as integers.
le = LabelEncoder()
y = le.fit_transform(y)
# Bare expression: has no effect when run as a script (its value is only
# displayed in an interactive session / notebook).
le.classes_

# Same here: shows the integer codes assigned to 'M' and 'B' interactively.
le.transform(['M', 'B'])

# Hold out 20% of the samples for testing, stratified on the class label;
# random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=1)

# Standardize the features, then fit an L2-penalized logistic regression.
pipe_lr = make_pipeline(StandardScaler(),
                        LogisticRegression(solver='liblinear',
                                           penalty='l2',
                                           random_state=1))

# Evaluate the pipeline at 10 training-set sizes, evenly spaced from 10% to
# 100% of the training data, using 10-fold cross-validation in one process.
train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_lr,
                                                        X=X_train,
                                                        y=y_train,
                                                        train_sizes=np.linspace(0.1, 1.0, 10),
                                                        cv=10,
                                                        n_jobs=1)

# Reduce along axis=1 to get one mean and one standard deviation per
# training size (per scikit-learn's docs the score arrays have one row per
# training size and one column per CV fold).
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(train_sizes, train_mean,