def neural_net(features,target,test_size_percent=0.2,cv_split=3,n_iter=100,learning_rate=0.01):
    '''Train a small feed-forward neural-network regressor and report
    cross-validated R^2 scores.

    features -> Pandas DataFrame with attributes as columns
    target -> Pandas DataFrame with target column for prediction
    test_size_percent -> fraction of data points to be used for testing
    cv_split -> number of TimeSeriesSplit folds for cross-validation
    n_iter -> number of training iterations for the network
    learning_rate -> SGD learning rate for the network
    '''
    scale = preprocessing.MinMaxScaler()
    X_array = scale.fit_transform(features)
    y_array = scale.fit_transform(target)
    mlp = Regressor(layers=[Layer("Rectifier", units=5),  # Hidden Layer1
                            Layer("Rectifier", units=3),  # Hidden Layer2
                            Layer("Linear")],             # Output Layer
                    # BUG FIX: learning_rate was hard-coded to 0.01 here,
                    # silently ignoring the function parameter.
                    n_iter=n_iter, learning_rate=learning_rate)
    X_train, X_test, y_train, y_test = train_test_split(
        X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    mlp.fit(X_train, y_train)
    test_prediction = mlp.predict(X_test)
    tscv = TimeSeriesSplit(cv_split)

    training_score = cross_val_score(mlp, X_train, y_train, cv=tscv.n_splits)
    testing_score = cross_val_score(mlp, X_test, y_test, cv=tscv.n_splits)
    # FIX: Python 2 print statements replaced with the print() function
    # used elsewhere in this file.
    print("Cross-val Training score:", training_score.mean())
    training_predictions = cross_val_predict(mlp, X_train, y_train, cv=tscv.n_splits)
    testing_predictions = cross_val_predict(mlp, X_test, y_test, cv=tscv.n_splits)

    training_accuracy = metrics.r2_score(y_train, training_predictions)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)

    print("Test-predictions accuracy:", test_accuracy)

    plot_model(target, y_train, y_test, training_predictions, testing_predictions)
    return mlp
# Example #2
def main():
    """Compare 5-fold CV accuracy of ECOBELM and ELM on the iris dataset,
    averaged over 10 repetitions."""
    from sklearn import preprocessing
    from sklearn.datasets import fetch_mldata
    from sklearn.model_selection import cross_val_score

    def _mean_cv_accuracy(estimator, data, target, repeats=10):
        # Average 5-fold CV accuracy over several repetitions to smooth
        # out run-to-run variation of the stochastic estimators.
        total = 0.0
        for _ in range(repeats):
            scores = cross_val_score(
                estimator, data, target, cv=5, scoring='accuracy')
            total += scores.mean()
        return total / repeats

    db_name = 'iris'
    hid_num = 1000
    # NOTE(review): fetch_mldata was removed in scikit-learn 0.22; migrate
    # to fetch_openml when upgrading.
    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.scale(data_set.data)

    print(db_name)
    print('ECOBELM', hid_num)
    print("Accuracy: %0.2f " % _mean_cv_accuracy(
        ECOBELM(hid_num, c=2**5), data_set.data, data_set.target))

    print('ELM', hid_num)
    print("Accuracy: %0.2f " % _mean_cv_accuracy(
        ELM(hid_num), data_set.data, data_set.target))
def test_cross_val_score_fit_params():
    # cross_val_score must forward fit_params (arrays, sparse matrices and
    # arbitrary objects alike) untouched to the estimator's fit method.
    clf = MockClassifier()
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    sparse_weight = coo_matrix((np.array([1]), (np.array([1]), np.array([0]))),
                               shape=(10, 1))
    sparse_param = coo_matrix(np.eye(5))

    dummy_int = 42
    dummy_str = '42'
    dummy_obj = object()

    def check_fit_params(est):
        # Runs inside fit: the non-array values must arrive unchanged.
        assert_equal(est.dummy_int, dummy_int)
        assert_equal(est.dummy_str, dummy_str)
        assert_equal(est.dummy_obj, dummy_obj)

    cross_val_score(clf, X, y, fit_params=dict(
        sample_weight=np.ones(n_samples),
        class_prior=np.ones(n_classes) / n_classes,
        sparse_sample_weight=sparse_weight,
        sparse_param=sparse_param,
        dummy_int=dummy_int,
        dummy_str=dummy_str,
        dummy_obj=dummy_obj,
        callback=check_fit_params))
def test_score_memmap():
    # Ensure a scalar score of memmap type is accepted
    iris = load_iris()
    X, y = iris.data, iris.target
    clf = MockClassifier()
    # Back both memmaps with a small throwaway file on disk.
    tf = tempfile.NamedTemporaryFile(mode='wb', delete=False)
    tf.write(b'Hello world!!!!!')
    tf.close()
    scores = np.memmap(tf.name, dtype=np.float64)  # 1-D memmap: not a scalar
    score = np.memmap(tf.name, shape=(), mode='r', dtype=np.float64)  # 0-d memmap scalar
    try:
        # A 0-d (scalar) memmap score must be accepted...
        cross_val_score(clf, X, y, scoring=lambda est, X, y: score)
        # non-scalar should still fail
        assert_raises(ValueError, cross_val_score, clf, X, y,
                      scoring=lambda est, X, y: scores)
    finally:
        # Best effort to release the mmap file handles before deleting the
        # backing file under Windows
        scores, score = None, None
        for _ in range(3):
            try:
                os.unlink(tf.name)
                break
            except WindowsError:
                # File still mapped by the OS; give it a moment and retry.
                sleep(1.)
# Example #5
    def _cross_validation(self, sentences, labels, intent_features, spacy_nlp, max_ngrams):
        """choose the best number of ngrams to include in bow.

        Given an intent classification problem and a set of ordered ngrams (ordered in terms
        of importance by pick_applicable_ngrams) we choose the best number of ngrams to include
        in our bow vecs by cross validation."""

        from sklearn import preprocessing
        from sklearn.linear_model import LogisticRegression
        from sklearn.model_selection import cross_val_score
        import numpy as np

        classifier = LogisticRegression(class_weight='balanced')
        encoder = preprocessing.LabelEncoder()
        encoder.fit(labels)
        y = encoder.transform(labels)
        # One fold per example of the rarest intent, capped at 10.
        cv_splits = min(10, np.min(np.bincount(y))) if y.size > 0 else 0
        if cv_splits < 3:
            # Guard clause: too little data per intent to cross-validate.
            warnings.warn("Can't cross-validate ngram featurizer. There aren't enough examples per intent (at least 3)")
            return max_ngrams

        logger.debug("Started ngram cross-validation to find best number of ngrams to use...")
        candidate_counts = np.unique(list(map(int, np.floor(np.linspace(1, max_ngrams, 8)))))

        def mean_cv_score(ngram_count):
            # Mean CV accuracy of the classifier on bow vectors built with
            # the given number of ngrams.
            vecs = self._create_bow_vecs(intent_features, sentences, spacy_nlp, max_ngrams=ngram_count)
            return np.mean(cross_val_score(classifier, vecs, y, cv=cv_splits))

        baseline_score = mean_cv_score(0)
        scores = []
        for count in candidate_counts:
            score = mean_cv_score(count)
            scores.append(score)
            logger.debug("Evaluating usage of {} ngrams. Score: {}".format(count, score))
        best_count = candidate_counts[np.argmax(scores)]
        logger.debug("Score without ngrams: {}".format(baseline_score))
        logger.info("Best score with {} ngrams: {}".format(best_count, np.max(scores)))
        return best_count
# Example #6
    def three_models_combined(self, intrusion_features, avoidance_features, hypertension_features):
        """Train three per-symptom cutoff classifiers (intrusion, avoidance,
        hypertension) on a shared train/test split and combine their test
        predictions by elementwise product (logical AND of 0/1 labels).

        Each *_features argument is the feature-column list for that model.
        Prints per-model 5-fold CV precision and the combined test metrics.
        """
        # Keep only rows where all three cutoff labels are present.
        # ('hypertention' is the spelling used by the data columns.)
        self.df = self.df[~self.df['intrusion_cutoff'].isna()]
        self.df = self.df[~self.df['avoidance_cutoff'].isna()]
        self.df = self.df[~self.df['hypertention_cutoff'].isna()]
        print("self.df.shape", self.df.shape)
        X = self.df
        Y = self.df[self.target]  # strict target, used only for stratification
        all_Y = [self.target, "intrusion_cutoff", "avoidance_cutoff", "hypertention_cutoff"]


        # One split shared by all three models; y_* frames carry all labels.
        X_train, X_test, y_train, y_test = train_test_split(X, self.df[all_Y], test_size=0.25, random_state = 8526566, stratify=Y)

        # intrusion
        X_intrusion = X_train[intrusion_features].values
        y_intrusion = y_train["intrusion_cutoff"].apply(lambda x: int(x))
        pipe_intrusion = Pipeline(steps=[
            # NOTE(review): step named 'rfe' but holds an oversampler — confirm intent.
            ('rfe', BorderlineSMOTE()),
            ('classifier', XGBClassifier(n_estimators=100, reg_alpha=1))])
        scores = cross_val_score(pipe_intrusion, X_intrusion, y_intrusion, scoring='precision', cv=StratifiedKFold(5))
        print(f"intrusion {sum(scores)/5}")  # mean CV precision over the 5 folds
        pipe_intrusion.fit(X_intrusion, y_intrusion)

        # avoidance
        X_avoidance = X_train[avoidance_features].values
        y_avoidance = y_train["avoidance_cutoff"].apply(lambda x: int(x))
        pipe_avoidance = Pipeline(steps=[
            ('classifier', XGBClassifier(n_estimators=100, scale_pos_weight=3, reg_alpha=1))])
        scores = cross_val_score(pipe_avoidance, X_avoidance, y_avoidance, scoring='precision', cv=StratifiedKFold(5))
        print(f"avoidance {sum(scores)/5}")
        pipe_avoidance.fit(X_avoidance, y_avoidance)


        # hypertension
        X_hypertension = X_train[hypertension_features].values
        y_hypertention = y_train["hypertention_cutoff"].apply(lambda x: int(x))
        pipe_hypertension = Pipeline(steps=[
            ('classifier', BalancedBaggingClassifier(n_estimators=100))])
        scores = cross_val_score(pipe_hypertension, X_hypertension, y_hypertention, scoring='precision', cv=StratifiedKFold(5))
        print(f"hypertension {sum(scores)/5}")
        pipe_hypertension.fit(X_hypertension, y_hypertention)

        ## combine three classifiers: positive only when all three agree
        X_test_hypertension = X_test[hypertension_features].values
        X_test_avoidance = X_test[avoidance_features].values
        X_test_intrusion = X_test[intrusion_features].values

        y_pred_hypertension = pipe_hypertension.predict(X_test_hypertension)
        y_pred_avoidance = pipe_avoidance.predict(X_test_avoidance)
        y_pred_intrusion = pipe_intrusion.predict(X_test_intrusion)
        y_pred = (y_pred_hypertension * y_pred_avoidance * y_pred_intrusion)

        # NOTE(review): target hard-coded to "PCL_Strict3" while the split
        # used self.target — confirm these always name the same column.
        y_target = y_test["PCL_Strict3"].apply(lambda x: int(x))

        acc = accuracy_score(y_target, y_pred)
        f1 = f1_score(y_target, y_pred)
        recall = recall_score(y_target, y_pred)
        precision = precision_score(y_target, y_pred)
        print("test scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def Random_forest(features,target,test_size_percent=0.2,cv_split=3):
    '''Fit a RandomForestRegressor and report cross-validated R^2 scores.

    features -> Pandas DataFrame with attributes as columns
    target -> Pandas DataFrame with target column for prediction
    test_size_percent -> fraction of data points used for testing
    cv_split -> number of TimeSeriesSplit folds
    '''
    # FIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # backward-compatible equivalent.
    X_array = features.values
    y_array = target.values
    model_rdf = RandomForestRegressor()
    X_train, X_test, y_train, y_test = train_test_split(
        X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    model_rdf.fit(X_train, y_train)
    test_prediction = model_rdf.predict(X_test)
    tscv = TimeSeriesSplit(cv_split)

    training_score = cross_val_score(model_rdf, X_train, y_train, cv=tscv.n_splits)
    testing_score = cross_val_score(model_rdf, X_test, y_test, cv=tscv.n_splits)
    # FIX: Python 2 print statements replaced with the print() function.
    print("Cross-val Training score:", training_score.mean())
    training_predictions = cross_val_predict(model_rdf, X_train, y_train, cv=tscv.n_splits)
    testing_predictions = cross_val_predict(model_rdf, X_test, y_test, cv=tscv.n_splits)

    training_accuracy = metrics.r2_score(y_train, training_predictions)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)

    print("Test-predictions accuracy:", test_accuracy)

    plot_model(target, y_train, y_test, training_predictions, testing_predictions)
    return model_rdf
def svm_regressor(features,target,test_size_percent=0.2,cv_split=5):
    '''Fit an RBF-kernel SVR on min-max-scaled data and report
    cross-validated R^2 scores.

    features -> Pandas DataFrame with attributes as columns
    target -> Pandas DataFrame with target column for prediction
    test_size_percent -> fraction of data points used for testing
    cv_split -> number of TimeSeriesSplit folds
    '''
    scale = preprocessing.MinMaxScaler()
    X_array = scale.fit_transform(features)
    y_array = scale.fit_transform(target)
    X_train, X_test, y_train, y_test = train_test_split(
        X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    svr = SVR(kernel='rbf', C=10, gamma=1)
    svr.fit(X_train, y_train.ravel())
    test_prediction = svr.predict(X_test)
    tscv = TimeSeriesSplit(cv_split)

    training_score = cross_val_score(svr, X_train, y_train, cv=tscv.n_splits)
    testing_score = cross_val_score(svr, X_test, y_test, cv=tscv.n_splits)
    # FIX: Python 2 print statements replaced with the print() function.
    print("Cross-val Training score:", training_score.mean())
    training_predictions = cross_val_predict(svr, X_train, y_train, cv=tscv.n_splits)
    testing_predictions = cross_val_predict(svr, X_test, y_test, cv=tscv.n_splits)

    training_accuracy = metrics.r2_score(y_train, training_predictions)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)

    print("Test-predictions accuracy:", test_accuracy)
    return svr
def linear_regression(features,target,test_size_percent=0.2,cv_split=5):
    '''Fit an ordinary-least-squares model and report cross-validated
    R^2 scores.

    features -> Pandas DataFrame with attributes as columns
    target -> Pandas DataFrame with target column for prediction
    test_size_percent -> fraction of data points used for testing
    cv_split -> number of TimeSeriesSplit folds
    '''
    # FIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # backward-compatible equivalent.
    X_array = features.values
    y_array = target.values
    ols = linear_model.LinearRegression()
    X_train, X_test, y_train, y_test = train_test_split(
        X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    ols.fit(X_train, y_train)
    tscv = TimeSeriesSplit(cv_split)

    training_score = cross_val_score(ols, X_train, y_train, cv=tscv.n_splits)
    testing_score = cross_val_score(ols, X_test, y_test, cv=tscv.n_splits)
    # FIX: Python 2 print statements replaced with the print() function.
    print("Cross-val Training score:", training_score.mean())
    training_predictions = cross_val_predict(ols, X_train, y_train, cv=tscv.n_splits)
    testing_predictions = cross_val_predict(ols, X_test, y_test, cv=tscv.n_splits)

    training_accuracy = metrics.r2_score(y_train, training_predictions)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)

    print("Test-predictions accuracy:", test_accuracy)

    plot_model(target, y_train, y_test, training_predictions, testing_predictions)
    return ols
# Example #10
    def fit(self, X_train, y_train):
        """Fit four sub-models (three symptom-cutoff classifiers plus a PCL3
        Ridge regression) on X_train, print their 5-fold CV precision, and
        print train-set metrics for the combined AND prediction.

        y_train holds the final binary target; the per-symptom labels are
        read from columns of X_train itself.
        """

        def _fit_cutoff_pipeline(label_col, pos_weight, tag):
            # One RFE + XGBoost pipeline per symptom cutoff; prints its
            # 5-fold stratified CV precision before fitting.
            features = X_train[self.features].values
            labels = X_train[label_col].apply(lambda x: int(x))
            pipe = Pipeline(steps=[
                ('rfe', RFE(XGBClassifier(n_estimators=self.n_estimators, reg_alpha=1,
                                          scale_pos_weight=pos_weight), self.rfe)),
                ('classifier', XGBClassifier(n_estimators=self.n_estimators, reg_alpha=1,
                                             scale_pos_weight=pos_weight))])
            scores = cross_val_score(pipe, features, labels, scoring='precision',
                                     cv=StratifiedKFold(5))
            print(f"{tag} {sum(scores)/5}")
            # FIX: the original fitted each pipeline twice (before and after
            # cross_val_score); one fit suffices since cross_val_score works
            # on clones and leaves the pipeline untouched.
            pipe.fit(features, labels)
            return pipe, features

        self.pipe_intrusion, X_intrusion = _fit_cutoff_pipeline(
            "intrusion_cutoff", 3, "intrusion")
        self.pipe_avoidance, X_avoidance = _fit_cutoff_pipeline(
            "avoidance_cutoff", 6, "avoidance")
        self.pipe_hypertension, X_hypertension = _fit_cutoff_pipeline(
            "hypertention_cutoff", 4, "hypertension")

        # regression on the raw PCL3 score, thresholded at self.cutoff below
        X_regression = X_train[self.features].values
        y_regression = X_train["PCL3"]
        self.pipe_regression = Pipeline(steps=[
            ('classifier', Ridge())])
        self.pipe_regression.fit(X_regression, y_regression)

        # Combined train-set prediction: positive only when all four agree.
        y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension)
        y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance)
        y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion)
        y_pred_regression = self.pipe_regression.predict(X_regression) >= self.cutoff

        # FIX: removed the duplicated "& y_pred_regression" term (a no-op).
        y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_regression)
        y_target = y_train

        acc = accuracy_score(y_target, y_pred)
        f1 = f1_score(y_target, y_pred)
        recall = recall_score(y_target, y_pred)
        precision = precision_score(y_target, y_pred)
        # NOTE(review): these are metrics on the training split despite the
        # "test scores" label, which is kept for output compatibility.
        print("test scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
# Example #11
def test_cross_val_score_allow_nans():
    # cross_val_score must accept inputs containing NaNs as long as the
    # pipeline (here: a mean imputer) can handle them.
    features = np.arange(200, dtype=np.float64).reshape(10, -1)
    features[2, :] = np.nan
    labels = np.repeat([0, 1], features.shape[0] / 2)
    pipeline = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cross_val_score(pipeline, features, labels, cv=5)
# Example #12
def tune_spam(X_train,y_train,alpha_list):
    '''Grid-search the SVC regularisation constant C over alpha_list using
    mean 5-fold CV accuracy and return the best value.

    X_train, y_train -> training data and labels
    alpha_list -> candidate values for C
    '''
    val_accuracy = []
    for alpha in alpha_list:
        model = SVC(C=alpha)
        # FIX: the original ran the (expensive) cross-validation twice per
        # alpha — once for the list and once again just to print it.
        score = np.mean(cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy'))
        val_accuracy.append(score)
        print([score])
    max_index = val_accuracy.index(max(val_accuracy))
    # FIX: Python 2 print statements replaced with the print() function.
    print("CV_val_error:", val_accuracy)
    print("Best C:", alpha_list[max_index])
    return alpha_list[max_index]
# Example #13
def compute_scores(X):
    """Cross-validated log-likelihood of PCA and FactorAnalysis on X for
    each candidate dimensionality in the module-level ``n_components``."""
    pca = PCA(svd_solver='full')
    fa = FactorAnalysis()

    def cv_mean(estimator, dim):
        # Re-use one estimator instance, varying only its dimensionality.
        estimator.n_components = dim
        return np.mean(cross_val_score(estimator, X))

    pca_scores = [cv_mean(pca, dim) for dim in n_components]
    fa_scores = [cv_mean(fa, dim) for dim in n_components]

    return pca_scores, fa_scores
def test_k_fold_cv():
    """Test OneHotEncoder with categorical_features='auto'."""
    boston = load_boston()
    pipeline = make_pipeline(
        OneHotEncoder(categorical_features='auto',
                      sparse=False,
                      minimum_fraction=0.05),
        LinearRegression(),
    )
    # Shuffled 10-fold CV must run without error on the encoded data.
    splitter = KFold(n_splits=10, shuffle=True)
    cross_val_score(pipeline, boston.data, boston.target, cv=splitter)
def test_precomputed_cross_validation():
    # Scores obtained from a precomputed distance matrix must match the
    # scores computed from the raw coordinates, for every estimator type.
    rng = np.random.RandomState(0)
    X = rng.rand(20, 2)
    D = pairwise_distances(X, metric='euclidean')
    y = rng.randint(3, size=20)
    estimator_classes = (neighbors.KNeighborsClassifier,
                         neighbors.RadiusNeighborsClassifier,
                         neighbors.KNeighborsRegressor,
                         neighbors.RadiusNeighborsRegressor)
    for Est in estimator_classes:
        metric_score = cross_val_score(Est(), X, y)
        precomp_score = cross_val_score(Est(metric='precomputed'), D, y)
        assert_array_equal(metric_score, precomp_score)
def get_results(dataset):
    """Compare cross-validated MSE of a random forest on the full data vs.
    three missing-value strategies (zero fill, mean imputation, chained
    imputation).

    Returns four (mean, std) tuples of the neg-MSE scores, in that order.
    Relies on a module-level ``rng`` RandomState for reproducible masking.
    """
    X_full, y_full = dataset.data, dataset.target
    n_samples = X_full.shape[0]
    n_features = X_full.shape[1]

    # Estimate the score on the entire dataset, with no missing values
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    full_scores = cross_val_score(estimator, X_full, y_full,
                                  scoring='neg_mean_squared_error')

    # Add missing values in 75% of the lines
    missing_rate = 0.75
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    # FIX: the np.bool alias was deprecated and removed in NumPy 1.24;
    # the builtin bool is equivalent here.
    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                          dtype=bool),
                                 np.ones(n_missing_samples,
                                         dtype=bool)))
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)

    # The corrupted copy (missing entries set to 0) is shared by all three
    # strategies below — build it once instead of twice.
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()

    # Estimate the score after replacing missing values by 0
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after imputation (mean strategy) of the missing values
    estimator = Pipeline([("imputer", SimpleImputer(missing_values=0,
                                                    strategy="mean")),
                          ("forest", RandomForestRegressor(random_state=0,
                                                           n_estimators=100))])
    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after chained imputation of the missing values
    estimator = Pipeline([("imputer", ChainedImputer(missing_values=0,
                                                     random_state=0)),
                          ("forest", RandomForestRegressor(random_state=0,
                                                           n_estimators=100))])
    chained_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                            scoring='neg_mean_squared_error')

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (chained_impute_scores.mean(), chained_impute_scores.std()))
def test_pairwise_cross_val_score():
    # A precomputed linear kernel must give the same CV scores as the
    # linear kernel computed internally, for both multiclass wrappers.
    precomputed_svc = svm.SVC(kernel='precomputed')
    linear_svc = svm.SVC(kernel='linear')

    X, y = iris.data, iris.target
    gram = np.dot(X, X.T)

    for wrapper in (OneVsRestClassifier, OneVsOneClassifier):
        score_precomputed = cross_val_score(wrapper(precomputed_svc), gram, y)
        score_linear = cross_val_score(wrapper(linear_svc), X, y)
        assert_array_equal(score_precomputed, score_linear)
def generate_binary_crime_label():
    '''Binarise the 2013 crime counts at their median and report 10-fold CV
    accuracy of SVM and decision-tree baselines on the Corina features.

    Returns (raw counts, binary labels, feature matrix).
    '''
    y = retrieve_crime_count(2013)
    threshold = np.median(y)
    label = [1 if ele >= threshold else 0 for ele in y]
    F = generate_corina_features()
    from sklearn import svm, tree
    from sklearn.model_selection import cross_val_score
    clf1 = svm.SVC()
    scores1 = cross_val_score(clf1, F[1], label, cv=10)
    # FIX: Python 2 print statements replaced with the print() function.
    print(scores1.mean(), scores1)
    clf2 = tree.DecisionTreeClassifier()
    scores2 = cross_val_score(clf2, F[1], label, cv=10)
    print(scores2.mean(), scores2)
    # FIX: pickle output must be written in binary mode, and the file handle
    # was previously never closed.
    with open("crime-label", 'wb') as fh:
        pickle.dump(label, fh)
    return y, label, F[1]
def test_nested_cv():
    # Test if nested cross validation works with different combinations of cv
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    labels = rng.randint(0, 5, 15)

    candidate_cvs = [LeaveOneLabelOut(), LeaveOneOut(), LabelKFold(),
                     StratifiedKFold(),
                     StratifiedShuffleSplit(n_iter=10, random_state=0)]

    # Every ordered pair (inner, outer), repeats allowed, must run cleanly.
    for inner_cv, outer_cv in combinations_with_replacement(candidate_cvs, 2):
        grid = GridSearchCV(LinearSVC(random_state=0),
                            param_grid={'C': [1, 10]}, cv=inner_cv)
        cross_val_score(grid, X=X, y=y, labels=labels, cv=outer_cv,
                        fit_params={'labels': labels})
def test_cross_val_score_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    input_types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        input_types.append((Series, DataFrame))
    except ImportError:
        pass
    for target_type, feature_type in input_types:
        # X dataframe, y series
        X_df, y_ser = feature_type(X), target_type(y)

        def check_df(value):
            return isinstance(value, feature_type)

        def check_series(value):
            return isinstance(value, target_type)

        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cross_val_score(clf, X_df, y_ser)
# Example #21
def do_mlp(x,y):
    # Score a small LBFGS-trained MLP with 5-fold cross-validation,
    # reporting micro-averaged F1 and plain accuracy.
    classifier = MLPClassifier(solver='lbfgs',
                               alpha=1e-5,
                               hidden_layer_sizes=(5, 3),
                               random_state=1)

    f1_scores = cross_val_score(classifier, x, y, cv=5, scoring='f1_micro')
    print("f1: %0.2f (+/- %0.2f)" % (f1_scores.mean(), f1_scores.std() * 2))

    acc_scores = cross_val_score(classifier, x, y, cv=5, scoring='accuracy')
    print("accuracy: %0.2f (+/- %0.2f)" % (acc_scores.mean(), acc_scores.std() * 2))
# Example #22
def test_cross_val_score_multilabel():
    # Multilabel data scored with micro/macro/samples-averaged precision.
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    expected = [
        ('micro', [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3]),
        ('macro', [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]),
        ('samples', [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]),
    ]
    for average, want in expected:
        scorer = make_scorer(precision_score, average=average)
        got = cross_val_score(clf, X, y, scoring=scorer, cv=5)
        assert_almost_equal(got, want)
def test_nested_cv():
    # Test if nested cross validation works with different combinations of cv
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 5, 15)

    candidate_cvs = [LeaveOneGroupOut(), LeaveOneOut(), GroupKFold(),
                     StratifiedKFold(),
                     StratifiedShuffleSplit(n_splits=3, random_state=0)]

    # Every ordered pair (inner, outer), repeats allowed, must run cleanly.
    for inner_cv, outer_cv in combinations_with_replacement(candidate_cvs, 2):
        grid = GridSearchCV(Ridge(), param_grid={'alpha': [1, .1]},
                            cv=inner_cv)
        cross_val_score(grid, X=X, y=y, groups=groups, cv=outer_cv,
                        fit_params={'groups': groups})
# Example #24
def test_sample_weight_func():
    """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline with sample weights"""

    tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error')

    # Reify pipeline with known score

    pipeline_string = ("ExtraTreesRegressor("
        "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8,"
        "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber,"
        "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5,"
        "GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5,"
        "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25),"
        "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5,"
        "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, "
        "ExtraTreesRegressor__n_estimators=100)")
    # Compile the fixed pipeline from its string form and fit it once.
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r)

    # Recompile a fresh copy of the same pipeline for the CV runs below.
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)

    # make up a sample weight
    training_classes_r_weight = np.array(range(1, len(training_classes_r)+1))
    training_classes_r_weight_dict = set_sample_weight(tpot_obj._fitted_pipeline.steps, training_classes_r_weight)

    # Re-seed before each CV run so the three runs are comparable.
    np.random.seed(42)
    cv_score1 = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error')

    # Identical call: must reproduce cv_score1 exactly.
    np.random.seed(42)
    cv_score2 = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error')

    # Same call with sample weights: expected to change the scores.
    np.random.seed(42)
    cv_score_weight = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error', fit_params=training_classes_r_weight_dict)

    np.random.seed(42)
    tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r, **training_classes_r_weight_dict)
    # Get score from TPOT
    known_score = 12.643383517 # Assumes use of mse
    score = tpot_obj.score(testing_features_r, testing_classes_r)

    # http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        # Relative/absolute float comparison (math.isclose backport).
        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
    assert np.allclose(cv_score1, cv_score2)
    assert not np.allclose(cv_score1, cv_score_weight)
    assert isclose(known_score, score)
# Example #25
def test_cross_val_score_sparse_fit_params():
    # A sparse matrix passed through fit_params must be sliced per fold
    # without breaking scoring (the mock scores 1.0 on every fold).
    iris = load_iris()
    data, labels = iris.data, iris.target
    clf = MockClassifier()
    params = {'sparse_sample_weight': coo_matrix(np.eye(data.shape[0]))}
    scores = cross_val_score(clf, data, labels, fit_params=params)
    assert_array_equal(scores, np.ones(3))
# Example #26
 def ccv(self, bst, X, y, scorer):
     """Combined CV: pool k-fold scores with per-seed split-validation
     scores and return their overall mean and standard deviation."""
     kfold_scores = model_selection.cross_val_score(
         bst, X, y, cv=self.n_fold_, n_jobs=-2, scoring=scorer)
     jobs = [delayed(split_validate_job)(base.clone(bst), X, y, seed)
             for seed in range(self.n_fold_)]
     seed_scores = Parallel(n_jobs=-2, backend="threading")(jobs)
     # Mean over the pooled scores, weighting every score equally.
     score = (np.sum(kfold_scores) + np.sum(seed_scores)) / (len(kfold_scores) + len(seed_scores))
     std = np.std(list(kfold_scores) + list(seed_scores))
     return score, std
# Example #27
    def _evaluate_projection(self, x, y):
        """
        kNNEvaluate - evaluate class separation in the given projection using a k-NN method

        Parameters
        ----------
        x - variables to evaluate
        y - class

        Returns
        -------
        mean 3-fold cross-validation score of a 3-NN model
        """
        if self.percent_data_used != 100:
            # Optionally subsample the rows to speed the evaluation up.
            rand = np.random.choice(len(x), int(len(x) * self.percent_data_used / 100),
                                    replace=False)
            x = x[rand]
            y = y[rand]
        neigh = KNeighborsClassifier(n_neighbors=3) if self.attr_color.is_discrete else \
            KNeighborsRegressor(n_neighbors=3)
        # BUG FIX: the original asserted on np.isnan(x) twice and never
        # checked y; check y too (only when it is a float array, since
        # np.isnan is undefined for integer/object class labels).
        assert not np.isnan(x).any()
        y_arr = np.asarray(y)
        if np.issubdtype(y_arr.dtype, np.floating):
            assert not np.isnan(y_arr).any()
        neigh.fit(x, y)
        with warnings.catch_warnings():
            # k-NN warns on ties/degenerate folds; irrelevant for ranking.
            warnings.simplefilter("ignore", category=UserWarning)
            scores = cross_val_score(neigh, x, y, cv=3)
        return scores.mean()
# Example #28
def test_build_ps_owa_factory():
    """GA-OWA fuzzy pattern classifier must reach > 0.92 mean 10-fold CV
    accuracy on the bundled iris data."""
    import os
    csv_file = os.path.join(os.path.dirname(__file__), "iris.csv")
    data = np.genfromtxt(csv_file, dtype=float, delimiter=',', names=True)

    attributes = np.array([data["sepallength"], data["sepalwidth"],
                           data["petallength"], data["petalwidth"]]).T
    target = data["class"]

    from sklearn.preprocessing import MinMaxScaler
    attributes = MinMaxScaler().fit_transform(attributes)

    classifier = nfpc.FuzzyPatternClassifier(
        membership_factory=t_factory,
        aggregation_factory=nfpc.GAOWAFactory(optimizer=nfpc.ps_owa_optimizer()),
    )

    from sklearn.model_selection import cross_val_score

    mean_score = np.mean(cross_val_score(classifier, attributes, target, cv=10))

    print("mean", mean_score)

    assert 0.92 < mean_score
# Example #29
def test_rfe_estimator_tags():
    """RFE wrapping a classifier must itself be tagged a classifier, so
    cross-validation on iris is stratified and scores stay high."""
    estimator = SVC(kernel='linear')
    rfe = RFE(estimator)
    assert_equal(rfe._estimator_type, "classifier")
    # make sure that cross-validation is stratified
    data = load_iris()
    cv_scores = cross_val_score(rfe, data.data, data.target)
    assert_greater(cv_scores.min(), .7)
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

# Load iris: first four columns are features, the fifth is the species label.
base = pd.read_csv('iris.csv')
previsores = base.iloc[:, 0:4].values
classe = base.iloc[:, 4].values
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
# Integer-encode the species labels, then one-hot them for a softmax output.
classe = labelencoder.fit_transform(classe)
classe_dummy = np_utils.to_categorical(classe)
def criar_rede():
    """Build and compile the Keras classifier: 4 inputs -> two dropout-
    regularised ReLU layers of 8 units -> 3-way softmax output."""
    rede = Sequential()
    rede.add(Dense(units=8, activation='relu',
                   kernel_initializer='normal', input_dim=4))
    rede.add(Dropout(0.3))
    rede.add(Dense(units=8, activation='relu', kernel_initializer='normal'))
    rede.add(Dropout(0.3))
    rede.add(Dense(units=3, activation='softmax'))
    rede.compile(optimizer='adam', loss='categorical_crossentropy',
                 metrics=['categorical_accuracy'])
    return rede

# Wrap the Keras model so sklearn's cross_val_score can drive it.
classificador = KerasClassifier(build_fn = criar_rede,
                                epochs = 1500,
                                batch_size = 30)
# 10-fold cross-validated accuracy.
# NOTE(review): y is the integer-encoded `classe`, not `classe_dummy` —
# presumably KerasClassifier one-hot encodes internally; confirm.
resultados = cross_val_score(estimator = classificador,
                             X = previsores, y = classe,
                             cv = 10, scoring = 'accuracy')
media = resultados.mean()
desvio = resultados.std()
def model(opt):
    """Score a k-NN classifier (k read from opt["n_neighbors"]) on the
    module-level X/y with 5-fold cross-validation; returns mean accuracy."""
    clf = KNeighborsClassifier(n_neighbors=opt["n_neighbors"])
    return cross_val_score(clf, X, y, cv=5).mean()
##Train the model
regressor.fit(X_train, y_train)

print("Coefficient : ", regressor.coef_)

print("Intercept: ", regressor.intercept_)

print("Coefficient of determination R^2 <-- on train set: {}".format(
    regressor.score(X_train, y_train)))

# BUG FIX: this line scores the held-out test split, but the original label
# said "train set" (copy-paste) — corrected so the output is not misread.
print("Coefficient of determination R^2 <-- on test set: {}".format(
    regressor.score(X_test, y_test)))

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R^2 on the full data.
score = cross_val_score(regressor, X, y, cv=5)
print(score)
print("Score: ", score.mean())

##Model Evaluation

# One row per feature: the fitted linear coefficient.
coeff_df = pd.DataFrame(regressor.coef_,
                        index=X.columns,
                        columns=['Coefficient'])
print(coeff_df.head(10))
###Interpreting the Coefficients
"""1. Holding all features fixed, a 1 unit increase in T with an decrease of 10.12 in AQI PM 2.5
   2. Holding all features fixed, a 1 unit increase in TM with an increase of 3.92 in AQI PM 2.5
   3. Holding all features fixed, a 1 unit increase in VV with an decrease of 47.20 in AQI PM 2.5"""

##Prediction on test data
Example #33
0
        pipelines['Scaled' + name] = ppl
        print('done')
    print('')

    # ----------------------------------------
    # pipeline fitting and scoring
    # ----------------------------------------
    print('Pipleine fitting and scoring progress: name - mean accuracy - std accuracy')
    # Negated MAE is used for scoring so "greater is better"; the -1 factor
    # below flips the fold scores back to positive MAE values.
    scoring = 'neg_mean_absolute_error'
    pipelinenames = list()
    scores = list()
    for entry in pipelines.items():
        name = entry[0]
        print('    {0:<20}'.format(name), end = '')
        ppl = entry[1]
        score = -1 * sms.cross_val_score(ppl, Xtrain, ytrain, cv = cvsplitter, scoring = scoring)
        scores.append(score)
        pipelinenames.append(entry[0])
        print('{0:.4f} - {1:.4f}'.format(np.mean(score), np.std(score, ddof = 1)))
    print('')

    # boxplot of results: one box of per-fold MAEs per pipeline
    plt.close('all')
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.boxplot(scores)
    ax.set_xticklabels(pipelinenames)
    ax.set_xlabel('Algorithm')
    ax.set_ylabel('Mean Absolute Error')
    ax.set_title('Mean Absolute Error of Different Algorithms')
Example #34
0
X = dfForTraining.iloc[:, :].values
# %% Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20)
# %%
classifier = RandomForestClassifier(n_estimators=200,
                                    criterion='entropy',
                                    min_samples_split=.0002575)
classifier.fit(X_train, y_train)
# %% confusion matrix preds
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
# %% Kfold validation
# 10-fold CV accuracy on the training split, parallelised across all cores.
accuracies = cross_val_score(estimator=classifier,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             n_jobs=-1)
acc_mean = accuracies.mean()
acc_std = accuracies.std()
print(f'Accuracy of model: {acc_mean}')
# BUG FIX: the second print reused the "Accuracy of model" label (copy-paste)
# while actually printing the standard deviation.
print(f'Std of model: {acc_std}')
# %% Grid search
# Candidate ensemble sizes: 5 evenly spaced integers in [200, 250].
n_estimators = np.linspace(200, 250, 5)
n_estimators = [int(estimator) for estimator in n_estimators]
min_samples_split = np.linspace(.00001, .001, 5)
# %%
# NOTE(review): this dict literal is truncated here (no closing brace) —
# the remainder of the statement lies outside this chunk.
parameters = {
    'n_estimators': n_estimators,
    'min_samples_split': [.0002575],
    'criterion': ['entropy']
Example #35
0
from sklearn.datasets import load_digits
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

# For reproducibility
np.random.seed(1000)

nb_classifications = 100

if __name__ == '__main__':
    # Load dataset
    digits = load_digits()

    # Collect accuracies
    ab_accuracy = []

    # Sweep the ensemble size: 10-fold CV accuracy of AdaBoost with
    # i = 1 .. 99 weak learners on the digits data.
    for i in range(1, nb_classifications):
        a = cross_val_score(AdaBoostClassifier(n_estimators=i),
                            digits.data,
                            digits.target,
                            scoring='accuracy',
                            cv=10).mean()
        ab_accuracy.append(a)

    # Show results: accuracy as a function of ensemble size
    plt.figure(figsize=(30, 25))
    plt.xlabel('Number of trees')
    plt.ylabel('Accuracy')
    plt.grid(True)
    plt.plot(ab_accuracy)
    plt.show()
Example #36
0
# 1) Cross Validation Classification Accuracy
import warnings

warnings.filterwarnings(action="ignore")
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Pima Indians diabetes data: 8 numeric features, binary 'class' target.
filename = 'indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
]
dataframe = pd.read_csv(filename, names=names)

array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]
kfold = KFold(n_splits=10)
model = LogisticRegression()

# This is the default method of accuracy
scoringMethod = 'accuracy'

# Mean +/- std accuracy over the 10 folds, reported as percentages below.
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoringMethod)

print("Accuracy: %.3f (%.3f)" % (results.mean() * 100, results.std() * 100))
Example #37
0
# compared to Europe and Central Asia. Therefore, if you are trying to predict life expectancy,
# it would be preferable to retain the 'Region' feature. To do this, you need to binarize it by
# creating dummy variables, which is what you will do in this exercise.

# Create dummy variables: df_region
df_region = pd.get_dummies(df)

# Print the columns of df_region
print(df_region.columns)

# Create dummy variables with drop_first=True: df_region
# (drop_first drops one redundant dummy column per category, avoiding
# perfect collinearity among the dummies)
df_region = pd.get_dummies(df, drop_first=True)

# Print the new columns of df_region
print(df_region.columns)

#Regression with categorical features
# Having created the dummy variables from the 'Region' feature, you can build
# regression models as you did before. Here, you'll use ridge regression to perform 5-fold cross-validation.
# The feature array X and target variable array y have been pre-loaded.

# Instantiate a ridge regressor: ridge
# NOTE(review): Ridge(normalize=...) was removed in scikit-learn 1.2 —
# confirm the version this snippet targets.
ridge = Ridge(normalize=True, alpha=0.5)

# Perform 5-fold cross-validation: ridge_cv
ridge_cv = cross_val_score(ridge, df_region.values, y, cv=5)

# Print the cross-validated scores
print(ridge_cv)

# Combine sets and extract HOG features
from itertools import chain
X_train = np.array(
    [feature.hog(im) for im in chain(positive_patches, negative_patches)])
# Labels: 1 for the positive patches (listed first), 0 for the negatives.
y_train = np.zeros(X_train.shape[0])
y_train[:positive_patches.shape[0]] = 1

#%%
print(X_train.shape)

#%%
# training a support vecctor machine
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# Baseline: cross-validated accuracy of naive Bayes on the HOG features.
print(cross_val_score(GaussianNB(), X_train, y_train))

#%%
'''
We see that on our training data, even a simple naive Bayes algorithm gets us upwards of 90% accuracy. Let's try the support vector machine, with a grid search over a few choices of the C parameter:
'''
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(LinearSVC(), {'C': [1.0, 2.0, 4.0, 8.0]})
grid.fit(X_train, y_train)
print(grid.best_score_)

#%%
print(grid.best_params_)

#%%
Example #39
0
def clustering():
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import KFold, cross_val_score
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression, LinearRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score
    from sklearn.svm import SVC
    pd.options.mode.chained_assignment = None

    print 'Reading from CSV files....'
    train = pd.read_csv("training_data_example.csv")
    valid = pd.read_csv("validation_data_example.csv")
    test = pd.read_csv("training_data_example_and_validation.csv")

    # Transform the symbolic values into numbers suitable for the Bayes classifier
    print 'Doing some data pre-processing/cleaning....'
    train['category'] = pd.factorize(train['category'])[0]
    valid['category'] = pd.factorize(valid['category'])[0]
    test['category'] = pd.factorize(test['category'])[0]
    train['expense description'] = pd.factorize(
        train['expense description'])[0]
    valid['expense description'] = pd.factorize(
        valid['expense description'])[0]
    test['expense description'] = pd.factorize(test['expense description'])[0]
    train['tax name'] = pd.factorize(train['tax name'])[0]
    valid['tax name'] = pd.factorize(valid['tax name'])[0]
    test['tax name'] = pd.factorize(test['tax name'])[0]
    train.fillna(train.mean(), inplace=True)
    test.fillna(test.mean(), inplace=True)

    # Format the data and expected values for SKLearn
    trainData = pd.DataFrame(train[[
        'expense description', 'pre-tax amount', 'tax name', 'tax amount'
    ]])
    trainTarget = np.array(pd.DataFrame(train[['category']]))
    testData = pd.DataFrame(test[[
        'expense description', 'pre-tax amount', 'tax name', 'tax amount'
    ]])
    testTarget = pd.DataFrame(test[['category']])
    valData = pd.DataFrame(valid[[
        'expense description', 'pre-tax amount', 'tax name', 'tax amount'
    ]])
    valTarget = pd.DataFrame(valid[['category']])

    # Prepare cross-validation folds & variables
    k_fold = KFold(len(valData), shuffle=True, random_state=0)
    algoEval = 0
    winningAlgo = ""

    # Change y vectors to 1d array
    trainTarget = np.ravel(trainTarget, 'C')
    valTarget = np.ravel(valTarget, 'C')

    # Start cross-validation of candidate algorithms
    print 'Evaluating model algorithms for dataset....'
    GBCclassifier = GradientBoostingClassifier()
    GBCclassifier.fit(trainData, trainTarget)
    if algoEval < cross_val_score(
            GBCclassifier, valData, valTarget, cv=k_fold,
            n_jobs=1).mean() * 100:
        algoEval = cross_val_score(
            GBCclassifier, valData, valTarget, cv=k_fold,
            n_jobs=1).mean() * 100
        classifier = GradientBoostingClassifier()
        winningAlgo = 'Gradient Boost'
    print 'GBC:  ', cross_val_score(
        GBCclassifier, valData, valTarget, cv=k_fold, n_jobs=1).mean() * 100
    GNBclassifier = GaussianNB()
    GNBclassifier.fit(trainData, trainTarget)
    if algoEval < cross_val_score(
            GNBclassifier, valData, valTarget, cv=k_fold,
            n_jobs=1).mean() * 100:
        algoEval = cross_val_score(
            GNBclassifier, valData, valTarget, cv=k_fold,
            n_jobs=1).mean() * 100
        classifier = GNBclassifier()
        winningAlgo = 'Gaussian Naive Bayes'
    print 'GNB:  ', cross_val_score(
        GNBclassifier, valData, valTarget, cv=k_fold, n_jobs=1).mean() * 100
    SVCclassifier = SVC()
    SVCclassifier.fit(trainData, trainTarget)
    if algoEval < cross_val_score(
            SVCclassifier, valData, valTarget, cv=k_fold,
            n_jobs=1).mean() * 100:
        algoEval = cross_val_score(
            SVCclassifier, valData, valTarget, cv=k_fold,
            n_jobs=1).mean() * 100
        classifier = SVCclassifier()
        winningAlgo = 'Support Vector Machine'
    print 'SVM:  ', cross_val_score(
        SVCclassifier, valData, valTarget, cv=k_fold, n_jobs=1).mean() * 100
    LDAclassifier = LinearDiscriminantAnalysis()
    LDAclassifier.fit(trainData, trainTarget)
    if algoEval < cross_val_score(
            LDAclassifier, valData, valTarget, cv=k_fold,
            n_jobs=1).mean() * 100:
        algoEval = cross_val_score(
            LDAclassifier, valData, valTarget, cv=k_fold,
            n_jobs=1).mean() * 100
        classifier = LDAclassifier()
        winningAlgo = 'Linear Discriminant Analysis'
    print 'LDA:  ', cross_val_score(
        LDAclassifier, valData, valTarget, cv=k_fold, n_jobs=1).mean() * 100
    LinREGclassifier = LinearRegression()
    LinREGclassifier.fit(trainData, trainTarget)
    if algoEval < cross_val_score(
            LinREGclassifier, valData, valTarget, cv=k_fold,
            n_jobs=1).mean() * 100:
        algoEval = cross_val_score(
            LinREGclassifier, valData, valTarget, cv=k_fold,
            n_jobs=1).mean() * 100
        classifier = LinREGclassifier()
        winningAlgo = 'Linear Regression'
    print 'LinReg:  ', cross_val_score(
        LinREGclassifier, valData, valTarget, cv=k_fold, n_jobs=1).mean() * 100
    LogREGclassifier = LogisticRegression()
    LogREGclassifier.fit(trainData, trainTarget)
    if algoEval < cross_val_score(
            GNBclassifier, valData, valTarget, cv=k_fold,
            n_jobs=1).mean() * 100:
        algoEval = cross_val_score(
            LogREGclassifier, valData, valTarget, cv=k_fold,
            n_jobs=1).mean() * 100
        classifier = LogREGclassifier()
        winningAlgo = 'Logistic Regression'
    print 'LogReg:  ', cross_val_score(
        LogREGclassifier, valData, valTarget, cv=k_fold, n_jobs=1).mean() * 100
    print '\n the best algorithm is ' + winningAlgo + ', proceeding to model test: \n'

    classifier.fit(trainData, trainTarget)
    predictedValues = classifier.predict(testData)

    testResults = test[['employee id']]
    testResults['category'] = predictedValues

    print 'Model predicted with ', accuracy_score(testTarget, predictedValues),\
        ' accuracy, check prediction.csv for details.'

    # As this is a cluster simulation, the file will be saved on
    # the default path for local engine at /Python27/Lib/site-packages/ipyparallel
    testResults.to_csv('prediction.csv', index=False)
Example #40
0
def un_tuned_models(model, x_train, y_train, xtest):
    """Fit *model* on the training data, predict *xtest*, and report the
    per-fold cross-validated RMSE on the training data.

    Returns a tuple (y_pred, rmse) where rmse is an array of 5 fold scores.
    """
    train_model = model.fit(x_train, y_train)
    y_pred = train_model.predict(xtest)
    # BUG FIX: the original stored negated mean *squared* errors under the
    # name `rmse`; take the square root so the values actually are RMSEs.
    rmse = np.sqrt(-cross_val_score(model, x_train, y_train,
                                    scoring='neg_mean_squared_error', cv=5))
    return (y_pred, rmse)
def main():
    """PLAsTiCC-style pipeline (Python 2): either cross-validate random
    forests for extragalactic/intragalactic objects (mode 0) or train both
    models and stream batched test-set predictions to rf_feaug.csv (mode 1).
    """

    mode = 1 #0-cv, 1-predict

    print "Reading train data"
    #training_set_raw = pd.read_csv('/modules/cs342/Assignment2/training_set.csv')
    #training_set_metadata_raw = pd.read_csv('/modules/cs342/Assignment2/training_set_metadata.csv')
    training_set_raw = pd.read_csv('./train_data_aug4.csv')
    training_set_metadata_raw = pd.read_csv('./train_meta_aug4.csv')

    training_set_targets = training_set_metadata_raw['target']
    training_set_data = training_set_metadata_raw.drop('target',axis=1)

    # Up-weight classes 64 and 15.
    # NOTE(review): class_weight is built but never passed to a classifier
    # below — confirm whether it was meant to be used.
    classes = sorted(training_set_targets.unique())
    class_weight = {
    c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    training_set_data = fill_in_hostgal_specz(training_set_data)
    full_train = format(training_set_data, training_set_raw)
    extragalactic_data, extragalactic_targets, extra_ids,  intragalactic_data, intragalactic_targets, intra_ids = splitGalaxies(full_train, training_set_targets)

    initial_intra = training_set_raw.loc[training_set_raw['object_id'].isin(intra_ids)]
    print initial_intra.shape
    print training_set_raw.shape
    print len(training_set_raw['object_id'].unique())

    print training_set_metadata_raw.shape
    print len(training_set_metadata_raw['object_id'].unique())
    print "Computing periods"
    #intra_periods = do_periods(initial_intra)
    intra_periods = pd.read_csv('./periods_train_aug4.csv')
    # Fill missing period info: no period -> score 1/period 0; no score -> 0.
    intra_periods.loc[intra_periods['period'].isnull(),'period_score'] = 1
    intra_periods.loc[intra_periods['period'].isnull(),'period'] = 0
    intra_periods.loc[intra_periods['period_score'].isnull(),'period_score'] = 0

    initial_extra = training_set_raw.loc[training_set_raw['object_id'].isin(extra_ids)]

    intragalactic_data = intragalactic_data.merge(
        right=intra_periods,
        how='outer',
        on='object_id'
        )
    #print intragalactic_data

    intragalactic_data = removeExtraCols(intragalactic_data)
    #intragalactic_data['period_score'] = intragalactic_data['score']
    #intragalactic_data = intragalactic_data.drop('score',axis=1)

    extragalactic_data = extragalactic_data.drop('object_id',axis=1)

    if mode==0:
        # Cross-validation mode: report mean negative log-loss for both models.
        print "Model for extra:"
        param_grid = {
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth' : [4,5,6,7,8],
        'criterion' :['gini', 'entropy']
        }
        clf = RandomForestClassifier(n_jobs=2, max_depth=20,n_estimators=100)
        #CV_rfc = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5)
        #CV_rfc.fit(extragalactic_data, extragalactic_targets)
        #print "params"
        #print CV_rfc.best_params_
        print extragalactic_data.columns
        print cross_val_score(clf, extragalactic_data, extragalactic_targets, cv=10, scoring="neg_log_loss").mean()

        print "Model for intra:"
        clf = RandomForestClassifier(n_jobs=2, max_depth=20,n_estimators=100)
        print intragalactic_data.columns
        #print intragalactic_data
        clf.fit(intragalactic_data, intragalactic_targets.values.ravel())
        print clf.feature_importances_
        print cross_val_score(clf, intragalactic_data, intragalactic_targets, cv=10, scoring="neg_log_loss").mean()

    else:
        # Prediction mode: train both models, then stream the (huge) test set
        # object-by-object, flushing predictions to CSV every ~10000 objects.
        print "Training"
        extra_model = RandomForestClassifier(n_jobs=2, max_depth=20,n_estimators=100)
        print extragalactic_data.columns
        extra_model.fit(extragalactic_data, extragalactic_targets.values.ravel())
        intra_model = RandomForestClassifier(n_jobs=2, max_depth=20,n_estimators=100)
        intra_model.fit(intragalactic_data, intragalactic_targets.values.ravel())
        print "Finished training. Starting predictions"

        print "Reading test data"
        test_set_metadata_raw = pd.read_csv('/modules/cs342/Assignment2/test_set_metadata.csv')
        filepath = '/modules/cs342/Assignment2/test_set.csv'

        extra_classes = extra_model.classes_
        intra_classes = intra_model.classes_

        extra_ids = []
        intra_ids = []
        test_set_metadata_raw = fill_in_hostgal_specz(test_set_metadata_raw)
        extra_ids, intra_ids = splitTestGalaxies(test_set_metadata_raw)


        # Output columns: object_id, one probability column per class of each
        # model, plus the synthetic "unknown" class_99.
        column_names = []
        column_names.append('object_id')
        for classi in extra_classes:
            className = "class_" + str(classi)
            column_names.append(className)
        for classi in intra_classes:
            className = "class_" + str(classi)
            column_names.append(className)
        column_names.append("class_99")

        #print column_names
        count = 0
        batch_no = 0
        batch_extra_dataFrame = pd.DataFrame()
        batch_intra_dataFrame = pd.DataFrame()
        myextrabatchlist = []
        myintrabatchlist = []


        print " >Starting new batch 0"
        my_extra_data_list = []
        my_intra_data_list = []

        # Sets give O(1) membership tests inside the per-object loop.
        extra_idss = set(extra_ids)
        intra_idss = set(intra_ids)
        cc=-1
        for obj_id, d in get_objects_by_id(filepath):
            cc=cc+1
            #combined = format(test_set_metadata_raw.loc[test_set_metadata_raw['object_id']==obj_id],d)
            if (obj_id in extra_idss):
                my_extra_data_list.append(d) # = np.append(my_extra_data_list, d)
            else:
                my_intra_data_list.append(d) #= np.append(my_intra_data_list, d)
            if(count == 10000):
                print " >>Formatting batch objects"
                arr = my_predict(column_names,my_extra_data_list, my_intra_data_list, test_set_metadata_raw, extra_model, intra_model)

                print " >>Write to csv"
                finish = pd.DataFrame(arr, columns=column_names)
                finish["class_99"] = (1-finish.drop("object_id", axis=1)).product(axis=1) #Adding values to class_99
                #Below is a very messy way of making all rows sum to 1 despite the above
                finish.loc[:,finish.columns!="object_id"] = finish.loc[:,finish.columns!="object_id"].div(finish.loc[:,finish.columns!="object_id"].sum(axis=1), axis=0)

                if(batch_no==0):
                    finish.to_csv("rf_feaug.csv", index = False, header = True)
                else:
                    with open("rf_feaug.csv", 'a') as f:
                        finish.to_csv(f, index = False, header=False)

                print " >Starting new batch " + str(batch_no + 1)
                batch_no = batch_no + 1
                lst = 0
                count = 0
                my_extra_data_list = []
                my_intra_data_list = []

            else:
                count = count + 1
        # Flush whatever is left after the final full batch.
        print "!Remaining objects: " + str(count)
        print " >>Formatting batch objects"
        arr = my_predict(column_names,my_extra_data_list, my_intra_data_list, test_set_metadata_raw, extra_model, intra_model)

        print " >>Write to csv"
        finish = pd.DataFrame(arr, columns=column_names)
        finish["class_99"] = (1-finish.drop("object_id", axis=1)).product(axis=1) #Adding values to class_99
        #Below is a very messy way of making all rows sum to 1 despite the above
        finish.loc[:,finish.columns!="object_id"] = finish.loc[:,finish.columns!="object_id"].div(finish.loc[:,finish.columns!="object_id"].sum(axis=1), axis=0)
        with open("rf_feaug.csv", 'a') as f:
            finish.to_csv(f, index = False, header=False)


        print " >>Clean up."
        preds = pd.read_csv("rf_feaug.csv")
        preds['object_id']=preds['object_id'].apply(int)
	    #preds['object_id']=preds['object_id'].apply(int)
        print preds.shape
        print cc
        preds.to_csv("rf_feaug2.csv", index=False)
	    #preds.to_csv('predictions2.csv', index=False)

        print "DONE."
Example #42
0
print('\nR^2:',metrics.r2_score(yy_test,clf3.predict(xx_test)))
# Impute missing ages with the fitted regressor's predictions.
# NOTE(review): DataFrame.ix was removed in pandas 1.0 — this snippet
# targets an older pandas; use .loc/.iloc on upgrade.
data.ix[data['Age'].isnull(),'Age'] = clf3.predict(aa)
data.dropna(inplace=True)
# Label-encode string (object) columns as integers.
for i in data.columns:
    if data[i].dtype == 'object':
        le = LabelEncoder()
        data[i] = le.fit_transform(data[i])
Sur = 'Survived'
lab = [i for i in data.columns if i not in Sur]
y = data.ix[:,Sur]
x = data.ix[:,lab]
X_train, X_test, y_train, y_test = train_test_split(x,y,train_size=0.7,random_state=1)
# Compare random forest vs. gradient boosting by 5-fold CV F1 score.
forst = ensemble.RandomForestClassifier()
gbdt = ensemble.GradientBoostingClassifier()
cv1 = cross_val_score(forst,X_train,y_train,cv=5,scoring='f1')
cv2 = cross_val_score(gbdt,X_train,y_train,cv=5,scoring='f1')
print(cv1.mean())
print(cv2.mean())
print('-------------------')
#
# grid = GridSearchCV(estimator=gbdt,param_grid={'learning_rate':np.arange(0.1,1,0.1),
#                                                'n_estimators':range(20,100,10),
#                                                'subsample':(0.7,0.8,0.1),
#                                                'max_depth':range(2,10,2)
#                                                },scoring='f1')
# grid.fit(X_train,y_train)
# print(grid.best_params_)
# Final GBDT with (presumably grid-searched) hyper-parameters.
gbdt_new = ensemble.GradientBoostingClassifier(learning_rate=0.1,n_estimators=90,subsample=0.1,max_depth=4)
gbdt_new.fit(X_train,y_train)
print('\n准确率:',metrics.accuracy_score(y_test,gbdt_new.predict(X_test)))
Example #43
0
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn import svm

# Synthetic binary-classification data: 20 features, only 2 informative.
SimData = make_classification(n_samples = 100, n_features = 20, n_informative = 2, n_redundant = 2, n_repeated = 2, n_classes = 2, flip_y = 0.01) # produce a matrix of features and corresponding discrete targets

X_train, X_test, y_train, y_test = train_test_split(SimData[0], SimData[1], test_size = 0.5) # randomly sample a training set while holding out 50% of the data for testing (evaluating) our classifier
clf = svm.SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
score = clf.score(X_test, y_test)
print('Accuracy with training/test split: {}'.format(round(score, 2)))

clf = svm.SVC(kernel = 'linear', C = 1)
scores = cross_val_score(clf, SimData[0], SimData[1], cv = 5) # estimate the accuracy of a linear kernel SVM by splitting the data, fitting a model and computing the score 5 consecutive times (with different splits each time)
print('Accuracy with 5-fold cross-validation: {} (+/- {})'.format(round(scores.mean(),2), round(scores.std(),2)))

scoring = ['f1', 'roc_auc'] # specifying multiple metrics for evaluation,
scores = cross_validate(clf, SimData[0], SimData[1], scoring = scoring, cv = 5, return_train_score = False) # returns a dict containing training scores, fit-times and score-times in addition to the test score
print('f1 with 5-fold cross-validation: {}'.format(round(scores['test_f1'].mean(),2)))
print('ROC AUC with 5-fold cross-validation: {}'.format(round(scores['test_roc_auc'].mean(),2)))

cv = StratifiedKFold(n_splits = 5) # The folds are made by preserving the percentage of samples for each class.
scores = cross_val_score(clf, SimData[0], SimData[1], cv = cv)
print('Accuracy with stratified 5-fold cross-validation: {})'.format(round(scores.mean(),2)))

# Recursive feature elimination with cross-validation: how many of the 20
# features are actually worth keeping.
rfecv = RFECV(estimator = clf, step = 1, cv = cv, scoring = 'accuracy')
rfecv.fit(SimData[0], SimData[1])
print("Optimal number of features : %d" % rfecv.n_features_)
Example #44
0
# Label-encode the remaining categorical columns in place.
previsores[:, 5] = labelencoder_previsores.fit_transform(previsores[:, 5])
previsores[:, 8] = labelencoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelencoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 10] = labelencoder_previsores.fit_transform(previsores[:, 10])

# NOTE(review): OneHotEncoder(categorical_features=...) was removed in
# scikit-learn 0.22 (use ColumnTransformer) — confirm the pinned version.
onehotencoder = OneHotEncoder(categorical_features=[0, 1, 3, 5, 8, 9, 10])
previsores = onehotencoder.fit_transform(
    previsores).toarray()  # one-hot encoding turns these columns into dummy variables

def criarRede():
    """Build and compile the Keras regressor: 316 inputs -> two ReLU layers
    of 158 units -> a single linear output, optimised for MAE with Adam."""
    modelo = Sequential()
    modelo.add(Dense(units=158, activation='relu', input_dim=316))
    modelo.add(Dense(units=158, activation='relu'))
    modelo.add(Dense(units=1, activation='linear'))
    modelo.compile(loss='mean_absolute_error',
                   optimizer='adam',
                   metrics=['mean_absolute_error'])
    return modelo


# 10-fold cross-validation of the wrapped Keras regressor.
# NOTE(review): modern scikit-learn expects 'neg_mean_absolute_error' as the
# scoring string — confirm the version this snippet targets.
regressor = KerasRegressor(build_fn=criarRede, epochs=100, batch_size=300)
resultados = cross_val_score(estimator=regressor,
                             X=previsores,
                             y=preco_real,
                             cv=10,
                             scoring='mean_absolute_error')

media = resultados.mean()
# BUG FIX: the original assigned the bound method (`resultados.std`) instead
# of calling it, so `desvio` held a function, not the standard deviation.
desvio = resultados.std()
Example #45
0
def classifierMortalityUnderSampling():
    """Train an XGBoost mortality classifier, report test-set metrics and
    plots, then evaluate an undersampling pipeline with repeated
    stratified CV ROC AUC.
    """

    X = df_train.drop(['HospID', 'SiteID', 'surgid', 'Complics', 'Mortality'],
                      axis=1)
    # 'HospID_total_cardiac_surgery', 'HospID_Reop_CABG', 'HospID_total_CABG', 'surgyear',
    # 'surgid_total_cardiac_surgery','surgid_total_CABG', 'surgid_Reop_CABG'], axis=1)
    y = df_train['Mortality']  # Labels

    X_test = df_test.drop(
        ['HospID', 'SiteID', 'surgid', 'Complics', 'Mortality'], axis=1)
    y_test = df_test['Mortality']
    # define undersample strategy
    undersample = RandomUnderSampler(sampling_strategy='majority')
    # fit and apply the transform
    X_over, y_over = undersample.fit_resample(X, y)
    # summarize class distribution
    print(Counter(y_over))
    # X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2)
    xgb_model = xgb.XGBClassifier(objective='binary:logistic',
                                  eval_metric='logloss',
                                  learning_rate=0.1)
    # NOTE(review): the undersampled X_over/y_over computed above are never
    # used — the model is fitted on the full X/y. Confirm whether
    # xgb_model.fit(X_over, y_over) was intended.
    xgb_model.fit(X, y)

    y_pred = xgb_model.predict(X_test)
    preds = xgb_model.predict_proba(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(
        f"The accuracy of the model is {round(accuracy_score(y_test, y_pred), 5) * 100} %"
    )

    # Confusion-matrix heatmap for the held-out test set.
    cm = confusion_matrix(y_test, y_pred)
    labels = ['TN', 'FP', 'FN', 'TP']
    categories = ['Alive', 'Dead']
    plt = make_confusion_matrix(cm,
                                categories=categories,
                                cmap='RdPu',
                                title='Confusion Metrics Mortality:',
                                group_names=labels)
    plt.show()
    feature_importance(xgb_model, df_model_draft, X_test, y_test, 'pink',
                       'RdPu')
    make_roc_auc_curve(y_test, preds, 'ROC Curve for XGBoost with Experience')

    # example of evaluating a decision tree with random undersampling
    from numpy import mean
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import RepeatedStratifiedKFold
    from imblearn.pipeline import Pipeline

    # define pipeline
    steps = [('under', RandomUnderSampler()), ('model', XGBClassifier())]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline,
                             X,
                             y,
                             scoring='roc_auc',
                             cv=cv,
                             n_jobs=-1)
    print('Mean ROC AUC: %.5f' % mean(scores))
    print(scores)
	# Keep the key column for the submission file; everything else is features.
	keys = df_test[KEY_COL]
	x_test = df_test.drop(columns=[KEY_COL])
	logger.info('x_test: {}'.format(x_test.shape))
	del df_test

	# logger.info('--- reduce memory usage ---')
	# x_train = utils.reduce_mem_usage(x_train, logger)
	# x_test = utils.reduce_mem_usage(x_test, logger)
	# gc.collect()

	logger.info('--- cross validation ---')
	epochs = 20
	batch_size = 1000
	clf = KerasClassifier(build_fn=gen_model, epochs=epochs, batch_size=batch_size)
	kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
	# 5-fold stratified CV ROC AUC of the wrapped Keras model.
	cv_auc = cross_val_score(clf, x_train, y_train, cv=kfold, scoring='roc_auc', verbose=3)
	logger.info('auc of each cv: {}'.format(cv_auc))
	mean_auc = mean(cv_auc)
	logger.info('mean auc: {}'.format(mean_auc))

	logger.info('fitting to train data...')
	clf.fit(x_train, y_train)

	logger.info('predicting test data...')
	# Probability of the positive class for each test row.
	pred_test = clf.predict_proba(x_test, batch_size=1000, verbose=1)[:, 1]

	logger.info('--- save submission file ---')
	df_submission = pd.DataFrame({
	                            KEY_COL: keys,
	                            TGT_COL: pred_test
	                        })
Example #47
0
# coding:utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

train_df = pd.read_csv("/Users/jianjun.yue/PycharmGItHub/data/house_price/model/house_price_train.csv",index_col = 0)
test_df = pd.read_csv("/Users/jianjun.yue/PycharmGItHub/data/house_price/model/house_price_test.csv",index_col = 0)
numeric_cols = train_df.columns[train_df.dtypes != 'object']
y_train=train_df["SalePrice"]
X_train=train_df.drop(['SalePrice'],axis=1)
test_df=test_df.drop(['MSSubClass_90'],axis=1)

# Boosting is a step beyond bagging: it lines up a sequence of learners where
# each one puts extra weight on the samples its predecessor handled poorly,
# so the next learner focuses on them.

ridge = Ridge(alpha = 15)
# Sweep the AdaBoost ensemble size; each point is the mean 10-fold CV RMSE.
params = [10,15,20,25,30,35,40,45,50]
test_scores = []
for param in params:
    clf = AdaBoostRegressor(base_estimator = ridge,n_estimators = param)
    test_score = np.sqrt(-cross_val_score(clf,X_train,y_train,cv = 10,scoring = 'neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(params,test_scores)
plt.title('n_estimators vs CV Error')
plt.show()
Example #48
0
# Spot-check several classifiers with 10-fold CV and compare them in a boxplot.
# NOTE(review): fragment -- the model classes, model_selection, plt, X_train,
# Y_train and X_validation come from code above this visible chunk.
seed = 7
scoring = 'accuracy'
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
results = []
names = []
for name, model in models:
    # shuffle=True added: without it random_state has no effect, and modern
    # scikit-learn raises a ValueError for this combination.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model,
                                                 X_train,
                                                 Y_train,
                                                 cv=kfold,
                                                 scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Create the comparison figure once, after scoring (was inside the loop,
# which allocated one throwaway figure per model).
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

# Final model: fit KNN on the training set and predict the validation set.
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
Example #49
0
      ['Outlook','Wind']))
# NOTE(review): fragment -- make_column_transformer, KBinsDiscretizer,
# OneHotEncoder, df, y and ct_1 are defined above this visible chunk.
# Second preprocessing setup: discretize the numeric columns into ordinal
# bins and one-hot encode the categorical ones.
ct_2 = make_column_transformer(
      (KBinsDiscretizer(3,encode='ordinal'),
       ['Temperature']),  
      (KBinsDiscretizer(2,encode='ordinal'),
       ['Humidity']),
      (OneHotEncoder(),
      ['Outlook','Wind']))
one_X = ct_1.fit_transform(df)
two_X= ct_2.fit_transform(df)
#%%
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB, CategoricalNB
import numpy as np 
# Compare Gaussian vs Categorical Naive Bayes on both encodings, using the
# mean 5-fold cross-validated accuracy for each combination.
ngb=GaussianNB()
sc_1=cross_val_score(ngb,X=one_X,y=y,scoring='accuracy',cv=5)
print('the avg score for GassianNB in setting one is',np.mean(sc_1))
# %%
cgb=CategoricalNB()
sc_2=cross_val_score(cgb,X=one_X,y=y,scoring='accuracy',cv=5)
print('the avg score for CategoryNB in setting one is',np.mean(sc_2))
# %%
ngb=GaussianNB()
sc_1=cross_val_score(ngb,X=two_X,y=y,scoring='accuracy',cv=5)
print('the avg score for GassianNB in setting two is',np.mean(sc_1))
# %%
cgb=CategoricalNB()
sc_2=cross_val_score(cgb,X=two_X,y=y,scoring='accuracy',cv=5)
print('the avg score for CategoryNB in setting two is',np.mean(sc_2))

# %%
Example #50
0
    'intrusion_pcl1', 'q6.5_PHYS_pcl2', 'q6.13_SLEEP_pcl2', 'q6.3_FLASH_pcl2'
]
# NOTE(review): fragment -- the features_0 list is partially cut above; pd,
# train_test_split, cross_val_score and BaggingClassifier are imported
# outside the visible chunk. The path strings contain invisible RTL marks
# (a scraping artifact) -- verify them against the real file locations.
path = "C:\‏‏PycharmProjects\PTSD\Data\PTSD.xlsx"
df = pd.read_excel(path)
# Keep only subjects with a known PCL_Strict3 outcome.
df = df[~df['PCL_Strict3'].isna()]
df = df[["ID", 'PCL_Strict3']]
df_pcl3 = pd.read_excel(
    "C:\‏‏PycharmProjects\PTSD\Data\questionnaire6PCL3.xlsx")
df_pcl2 = pd.read_excel(
    "C:\‏‏PycharmProjects\PTSD\Data\questionnaire6PCL2.xlsx")
df_pcl1 = pd.read_excel(
    "C:\‏‏PycharmProjects\PTSD\Data\questionnaire6PCL1.xlsx")

# Join the three questionnaire waves by subject ID; suffixes disambiguate
# the overlapping PCL1/PCL2 column names.
df = df.merge(df_pcl1, on="ID")
df = df.merge(df_pcl2, suffixes=('_pcl1', '_pcl2'), on="ID")
df = df.merge(df_pcl3.drop(['PCL3_Strict', 'pcl3', 'PCL3_Broad'], axis=1),
              on="ID")
df = df[features_0 + ['PCL_Strict3']].dropna()
X = df[features_0]
Y = df['PCL_Strict3']
# Stratified split preserves the class balance in both parts.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=271828,
                                                    stratify=Y)
# NOTE(review): X_test/y_test are never used below in this fragment -- the
# reported F1 is the mean of 10-fold CV on the training part only.
scores = cross_val_score(BaggingClassifier(),
                         X_train,
                         y_train,
                         scoring='f1',
                         cv=10)
print(sum(scores) / len(scores))
Example #51
0
                        train_test_split(iris.data,iris.target,test_size=0.51,random_state=0)

# Analyze the data
# Create the classifier
# NOTE(review): fragment -- train_data/train_label/test_data/
# test_label_expected, the `nb` module alias, `np` and `iris` come from
# code above this visible chunk.
#classifier = nb.KNeighborsClassifier(n_neighbors=3,weights='uniform',algorithm='auto')
# NOTE(review): RadiusNeighborsClassifier's constructor takes `radius`, not
# `n_neighbors` -- confirm this call against the installed sklearn version.
classifier = nb.RadiusNeighborsClassifier(n_neighbors=2,
                                          weights='uniform',
                                          algorithm='auto')

# Train the classifier
classifier.fit(train_data, train_label)

# Predict
test_label_predicted = classifier.predict(test_data)
# Cross-validation: 10-fold on the full iris dataset
scores = cross_val_score(classifier, iris.data, iris.target, cv=10)

# Compare results: mark 1 wherever prediction disagrees with expectation
size = len(test_label_predicted)
outer = np.zeros((size), dtype=int)
for i in range(size):
    if test_label_expected[i] != test_label_predicted[i]:
        outer[i] = 1
result = np.vstack((test_label_expected, test_label_predicted, outer))
result = result.T

# Compute the accuracy rate (fraction of matches)
#classifier.score(test_data,test_label_expected)
okresult = float(np.sum(outer == 0)) / len(outer)
print(
    "Classification report for classifier %s:\n%s\n" %
# NOTE(review): fragment -- df, RandomForestClassifier and cross_val_score
# are defined/imported above this visible chunk.
# Chronological 70/30 split: rows are sliced in order, not shuffled.
train_size = int(len(df) * 0.7)
print(train_size)
data = df.loc[:, df.columns != 'DELAYED']
labels = df['DELAYED']

train_data = data[:train_size]
train_labels = labels[:train_size]
test_data = data[train_size:]
test_labels = labels[train_size:]

model = RandomForestClassifier(n_estimators=100, verbose=3)

# 5-fold cross validation of model
score_acc = (cross_val_score(model,
                             train_data,
                             train_labels,
                             cv=5,
                             scoring='accuracy'))
print("Accuracy: %0.2f (+/- %0.2f)" % (score_acc.mean(), score_acc.std() * 2))

# Same 5 folds (default unshuffled KFold is deterministic), scored with F1.
score_f1 = (cross_val_score(model,
                            train_data,
                            train_labels,
                            cv=5,
                            scoring='f1'))
print("F1 Score: %0.2f (+/- %0.2f)" % (score_f1.mean(), score_f1.std() * 2))

# Fit the classifier to the training set
model.fit(train_data, train_labels)

# Predict probabilities of classes on the test set
Example #53
0
# Titanic decision-tree pipeline: load, one-hot encode, cross-validate,
# fit on all data, and persist the model.
# NOTE(review): fragment -- os, pd, tree, model_selection and joblib are
# imported above this visible chunk.
os.chdir("E:/")

titanic_train = pd.read_csv("train.csv")

#EDA
# Bare expressions: these only display output in a REPL/notebook session.
titanic_train.shape
titanic_train.info()

#data preparation
titanic_train1 = pd.get_dummies(titanic_train, columns=['Pclass', 'Sex', 'Embarked'])
titanic_train1.shape
titanic_train1.info()
titanic_train1.head(6)

#feature engineering
# axis=1 spelled out: the positional `axis` argument was removed in pandas 2.0.
X_train = titanic_train1.drop(['PassengerId','Age','Cabin','Ticket', 'Name','Survived'], axis=1)
y_train = titanic_train['Survived']

#build the decision tree model
dt = tree.DecisionTreeClassifier()
#use cross validation to estimate performance of model.
#No model build during cross validation is not used as final model
cv_scores = model_selection.cross_val_score(dt, X_train, y_train, cv=10, verbose=1)
cv_scores.mean()

#build final model on entire train data which is used for prediction
dt.fit(X_train,y_train)

# natively deploy decision tree model(pickle format)
joblib.dump(dt, "tree1.pkl")
# NOTE(review): fragment -- x, df, x_labels and the capitalized X_train/
# X_test/y_train names come from earlier (unseen) cells; the lowercase
# x_train/x_test defined here are used only by the decision-tree part.
x = x.dropna()
y = df['Survived']

# 4)    Split the data into training and validation sets (or use
#       cross-validation). We will build a decision tree. Pick a model
#       parameter that, in your view, may affect the result, and choose
#       candidate values for it. Comment on the choice. Varying the parameter
#       in a loop, compute precision, recall and F-measure (maybe other
#       metrics?) for each case. Plot the results and interpret them.
#       Draw the best tree.

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
clf = DecisionTreeClassifier(min_samples_split=5)
clf.fit(np.array(x_train), np.array(y_train))
importances = pandas.Series(clf.feature_importances_, index=x_labels)
print(importances)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))
print(np.mean(cross_val_score(clf, X_train, y_train, cv=5)))

# 5)    Do the same for a Random Forest model and compare the results.

model = RandomForestClassifier(n_estimators = 100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
# Fixed: the line carried notebook-JSON residue (`\n",`) that made it a
# SyntaxError; two stray, mis-indented duplicate fit/predict lines after the
# loop (an IndentationError) were removed.
scores = []
for t in range(1,100):
    rfc = RandomForestClassifier(n_estimators=t)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)
    scores.append(f1_score(y_test, y_pred))
import numpy as np
import pandas
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# Abalone data: encode Sex as F=-1, I=0, everything else (M)=1.
data = pandas.read_csv('samples/abalone.csv')
data['Sex'] = data['Sex'].map(lambda code: -1 if code == 'F' else (0 if code == 'I' else 1))
features = data.drop(['Rings'], axis=1)
target = data.Rings

# Fixed shuffled 5-fold split, reused for every forest size.
cv = KFold(n_splits=5, shuffle=True, random_state=1)

# Grow forests of 1..50 trees, recording the mean cross-validated R^2.
scores_by_size = []
for n_trees in range(1, 51):
    forest = RandomForestRegressor(n_estimators=n_trees, random_state=1)
    mean_r2 = np.mean(cross_val_score(estimator=forest, X=features, y=target, cv=cv, scoring='r2'))
    scores_by_size.append([n_trees, mean_r2])
    print(n_trees, mean_r2)

# Answer: the smallest forest whose mean R^2 exceeds 0.52.
good_enough = [entry for entry in scores_by_size if entry[1] > 0.52]
print('ans =', good_enough[0][0])
Example #56
0
# Spot-check six classifiers with stratified 10-fold CV and compare them.
# NOTE(review): fragment -- X, y, the model classes, train_test_split,
# StratifiedKFold, cross_val_score and pyplot come from code above this chunk.
X_train, X_validation, Y_train, Y_validation = train_test_split(X,
                                                                y,
                                                                test_size=0.20,
                                                                random_state=1)

# spot check algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear',
                                        multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))

# evaluate each model
results = []
names = []

for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    # cv=kfold was missing: without it, cross_val_score ignored the
    # StratifiedKFold built above and silently used its own default split.
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

# compare algorithms
pyplot.boxplot(results, labels=names)
pyplot.title('alogorithm comparison')
pyplot.show()
# NOTE(review): fragment of a notebook export -- test_labels, predictions,
# data_now, label and plt come from earlier cells outside this chunk.
print(classification_report(test_labels, predictions))  
print(confusion_matrix(test_labels, predictions))


# In[ ]:


# cross validation: find the best value of k
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Try k = 1..49 and record the mean 5-fold CV accuracy for each.
k_range = range(1,50)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, data_now, label, cv = 5, scoring='accuracy')
    print("k = " + str(k) + ", score = " + str(scores) + ", mean = " +  str(scores.mean()))
    k_scores.append(scores.mean())


# In[ ]:


# Plot mean CV accuracy against k to pick the best neighborhood size.
plt.plot(k_range, k_scores)
plt.xlabel('K for KNN')
plt.ylabel('Cross Validation Accuracy')
plt.show()


# In[ ]:
# prepare the models
# NOTE(review): fragment -- the regressor classes, the `ms` alias
# (sklearn.model_selection), plt, X, Y and datosGlobal come from code above
# this visible chunk.
semilla = 7
modelos = []
modelos.append(("LR", LinearRegression()))
modelos.append(("DT", DecisionTreeRegressor()))
modelos.append(("RF", RandomForestRegressor()))
modelos.append(("SVR", SVR()))

# evaluate each model with 3-fold CV on R^2
results = []
names = []
scoring = 'r2'
for name, model in modelos:
    # shuffle=True added: without it random_state has no effect, and modern
    # scikit-learn raises a ValueError for this combination.
    kfold = ms.KFold(n_splits=3, shuffle=True, random_state=semilla)
    cv_results = ms.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# refit each model on all the data so the fitted curves can be plotted
linear = LinearRegression().fit(X, Y)
decision_tree = DecisionTreeRegressor().fit(X, Y)
random_forest = RandomForestRegressor().fit(X, Y)
svr = SVR().fit(X, Y)

plt.scatter(X, Y)
plt.plot(datosGlobal.dia.sort_values(), linear.predict(datosGlobal.dia.sort_values().values.reshape(-1, 1)), label="lineal")
plt.plot(datosGlobal.dia.sort_values(), decision_tree.predict(datosGlobal.dia.sort_values().values.reshape(-1, 1)), label="arbol de decision")
plt.plot(datosGlobal.dia.sort_values(), random_forest.predict(datosGlobal.dia.sort_values().values.reshape(-1, 1)), label="bosque aleatorio")
Example #59
0
    # NOTE(review): fragment of a larger function -- the model classes,
    # cross_val_score, food_X, food_y, name and mape_list come from the
    # enclosing (unseen) scope.
    #XGB
    XGB_model = XGBClassifier(min_child_weight=0.1,max_depth=7)
    
    #KNN
    KNN_model = KNeighborsClassifier()

    #SVM
    SVM_model = SVC(kernel = 'linear',probability = True)

    # Random forest
    RFC_model = RandomForestClassifier(n_estimators=100,n_jobs=5)

    # Logistic regression
    LR_model = LogisticRegression()

    # Score every model with 5-fold CV.
    # NOTE(review): 'neg_root_mean_squared_error' is a regression metric
    # applied here to classifiers -- confirm this is intentional.
    scores_x = cross_val_score(XGB_model,food_X,food_y,cv=5,scoring='neg_root_mean_squared_error')
    scores_k = cross_val_score(KNN_model,food_X,food_y,cv=5,scoring='neg_root_mean_squared_error')
    scores_s = cross_val_score(SVM_model,food_X,food_y,cv=5,scoring='neg_root_mean_squared_error')
    scores_r = cross_val_score(RFC_model,food_X,food_y,cv=5,scoring='neg_root_mean_squared_error')
    scores_l = cross_val_score(LR_model,food_X,food_y,cv=5,scoring='neg_root_mean_squared_error')

    # Negate to turn sklearn's negated scores back into positive RMSE values.
    sm_x = -scores_x.mean()
    sm_k = -scores_k.mean()
    sm_s = -scores_s.mean()
    sm_r = -scores_r.mean()
    sm_l = -scores_l.mean()



    mape_list.append([name,sm_x,sm_k,sm_s,sm_r,sm_l])
def main():
    """Titanic survival prediction: load the data, impute missing values,
    vectorize the features, compare RandomForest vs XGBoost with 5-fold CV,
    grid-search XGBoost hyper-parameters, and write the submission file."""
    # 1. Inspect the training and test data
    train_data = pandas.read_csv('data/train.csv')
    test_data = pandas.read_csv('data/test.csv')
    print(train_data.info())
    print(test_data.info())
    # 2. Manually select features considered useful for prediction
    selected_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare']
    x_train = train_data[selected_features]
    x_test = test_data[selected_features]

    y_train = train_data['Survived']

    # 3. Fill in missing values
    # The Embarked feature has missing values that must be completed
    print(x_train['Embarked'].value_counts())
    print(x_test['Embarked'].value_counts())

    # For categorical features, imputing with the most frequent value is one
    # way to minimise the error introduced by imputation
    x_train['Embarked'].fillna('S', inplace=True)
    x_test['Embarked'].fillna('S', inplace=True)

    x_train['Age'].fillna(x_train['Age'].mean(), inplace=True)
    x_test['Age'].fillna(x_test['Age'].mean(), inplace=True)

    x_test['Fare'].fillna(x_test['Fare'].mean(), inplace=True)
    print(x_train.info())
    print(x_test.info())

    # 4. Vectorize the features with DictVectorizer
    dict_vectorizer = DictVectorizer(sparse=False)
    # orient='records' (was 'record'): the abbreviated spelling is rejected
    # by modern pandas versions of DataFrame.to_dict.
    x_train = dict_vectorizer.fit_transform(x_train.to_dict(orient='records'))
    print(dict_vectorizer.feature_names_)
    x_test = dict_vectorizer.transform(x_test.to_dict(orient='records'))

    # 5. Train the models
    forest_classifier = RandomForestClassifier()
    xgb_classifier = XGBClassifier()

    # Evaluate performance with 5-fold cross-validation
    forest_mean_score = cross_val_score(forest_classifier, x_train, y_train, cv=5).mean()
    print(forest_mean_score)
    xgb_mean_score = cross_val_score(xgb_classifier, x_train, y_train, cv=5).mean()
    print(xgb_mean_score)

    # 6. Use a parallel grid search to find a better hyper-parameter combination
    params = {
        'max_depth': range(2, 8), 'n_estimators': range(100, 1200, 200),
        'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0]
    }
    xgbc_best = XGBClassifier()
    grid_search_cv = GridSearchCV(xgbc_best, params, n_jobs=-1, cv=5)
    grid_search_cv.fit(x_train, y_train)
    print(grid_search_cv.best_score_)
    print(grid_search_cv.best_params_)

    # 7. Predict the results and write them to a file
    predict_result = grid_search_cv.predict(x_test)
    submission_data = pandas.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': predict_result})
    submission_data.to_csv('data/submission/titanic_submission.csv', index=False)