Example #1
def grid_search(X, y):
    C_array = [0.001, 0.01, 0.1, 1, 10]
    gamma_array = [0.001, 0.01, 0.1, 1]
    hyperparameters = {'C': C_array, 'gamma': gamma_array}
    grid_search = GridSearchCV(SVC(kernel='rbf'), hyperparameters, cv=10)
    grid_search.fit(X, y)
    return grid_search.best_params_.get('C'), grid_search.best_params_.get('gamma')
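This snippet assumes GridSearchCV and SVC are imported elsewhere. A minimal self-contained sketch of driving it, assuming a modern scikit-learn where the imports live in sklearn.model_selection and sklearn.svm (the toy dataset is our addition, not part of the original):

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def grid_search_svc(X, y):
    # the same search as Example #1, with the imports spelled out
    hyperparameters = {'C': [0.001, 0.01, 0.1, 1, 10],
                       'gamma': [0.001, 0.01, 0.1, 1]}
    search = GridSearchCV(SVC(kernel='rbf'), hyperparameters, cv=10)
    search.fit(X, y)
    return search.best_params_['C'], search.best_params_['gamma']

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
print(grid_search_svc(X, y))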
Example #2
def train_model(feature_data, target_data, search_parameters):
    """
    Train a model using ExtraTreesClassifier and entropy criterion for split quality:
    http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    """
    # Train model
    model_pipeline = sklearn.pipeline.Pipeline([
        ('scale', sklearn.preprocessing.StandardScaler()),
        ('feature_select', sklearn.feature_selection.SelectPercentile(sklearn.feature_selection.f_classif)),
        ('classify', sklearn.ensemble.ExtraTreesClassifier(bootstrap=True, criterion='entropy'))
    ])

    # Create the stratified cross-validation folds; the distribution of the target
    # within each fold mirrors the distribution in the overall population.
    cv = sklearn.cross_validation.StratifiedKFold(target_data, n_folds=10)

    # Create grid searcher
    grid_search = sklearn.grid_search.GridSearchCV(
        model_pipeline,
        search_parameters,
        cv=cv,
        verbose=0,
        n_jobs=2,
        scoring=sklearn.metrics.make_scorer(
            sklearn.metrics.fbeta_score,
            beta=1.0,
            pos_label=pandas.Series(target_data).value_counts().idxmax()
        )
    )

    # Fit model in grid search
    grid_search.fit(feature_data, target_data)

    return grid_search
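Note that sklearn.cross_validation and sklearn.grid_search were removed in scikit-learn 0.20; under sklearn.model_selection the fold labels move from the StratifiedKFold constructor into fit(). A rough modern equivalent of the setup above (the toy data and the illustrative percentile grid are our assumptions):

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

model_pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('feature_select', SelectPercentile(f_classif)),
    ('classify', ExtraTreesClassifier(bootstrap=True, criterion='entropy')),
])

X, y = make_classification(n_samples=300, random_state=0)
search = GridSearchCV(
    model_pipeline,
    {'feature_select__percentile': [25, 50, 100]},  # illustrative grid only
    cv=StratifiedKFold(n_splits=10),  # labels now go to fit(), not the constructor
    scoring=make_scorer(fbeta_score, beta=1.0),
    n_jobs=2,
)
search.fit(X, y)
print(search.best_params_)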
Example #3
def optimize_svm(train_data_path='training-data-small.txt.bz2'):
    """Run grid search to determine best C or gamma for svm
    Generally, C ranges from 1 to 1000, and gamma is no larger than 0.1.
    :type train_data_path: str
    :params train_data_path: the path to training data file

    :type return: dict
    :params return: best params obtained by grid search
    """
    from sklearn import grid_search
    from sklearn import metrics

    # load training data
    train_X, train_y = load_data(data_path=train_data_path)

    # config the range of C and gamma in grid search
    param_grid = [{'C': [2**i for i in range(0, 10, 1)],  # 1 <= C <= 512
                   'gamma': [2**i for i in np.arange(-8, -3, 0.5)],  # 0 < gamma <= 0.09
                   'kernel': ['rbf']},
                  {'C': [2**i for i in range(0, 10, 1)],  # 1 <= C <= 512
                   'kernel': ['linear']}]
    method = SVC()
    grid_search = grid_search.GridSearchCV(method, param_grid, scoring='f1', n_jobs=9)
    grid_search.fit(train_X, train_y)

    return grid_search.best_params_
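For reference, the two comprehensions above expand to C in {2**0, ..., 2**9} = {1, ..., 512} and gamma in {2**-8, ..., 2**-3.5} (roughly 0.004 to 0.09), so the inline bounds comments are approximations of the docstring's ranges. A quick check:

import numpy as np

C_grid = [2**i for i in range(0, 10, 1)]
gamma_grid = [2**i for i in np.arange(-8, -3, 0.5)]
print(min(C_grid), max(C_grid))          # 1 512
print(min(gamma_grid), max(gamma_grid))  # ~0.0039 ~0.0884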
def gradient_boosting_exp(X, y, data, split_iterator, base_classifier=None):
    # if base_classifier:
    #     X = sklearn.preprocessing.scale(X)

    hyperparameter_space = {
        "learning_rate": [0.2167],
        "min_samples_leaf": [10],
        "n_estimators": [300],
        "subsample": [0.9],
    }
    # learning_rate: numpy.linspace(0.1, 0.9, 5)
    # min_samples_leaf: numpy.linspace(5, 40, 5).astype(int)
    # min_samples_split: numpy.linspace(20, 100, 5).astype(int)
    # n_estimators: 100, 200, etc (default = 100)
    # max_depth: 2, 3, 4, 5 (default = 3)
    # subsample: 0.5, 0.8, 0.9, 1. (default = 1)

    model = sklearn.ensemble.GradientBoostingClassifier(init=base_classifier)
    grid_search = sklearn.grid_search.GridSearchCV(model, hyperparameter_space, n_jobs=N_JOBS, cv=split_iterator, verbose=1)
    grid_search.fit(X, y)

    if base_classifier:
        print("Gradient boosting with base classifier")
    else:
        print("Gradient boosting classifier")

    print_tuning_scores(grid_search)
    print_feature_importances(data.drop("IsBlueWinner", axis=1).columns, grid_search.best_estimator_)
Example #5
 def svc_param_selection(self, X, y, nfolds):
     Cs = [0.001, 0.01, 0.1, 1, 10]
     gammas = [0.001, 0.01, 0.1, 1]
     param_grid = {'C': Cs, 'gamma': gammas}
     grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds, n_jobs=4, verbose=1)
     grid_search.fit(X, y)
     return grid_search.best_params_
def get_svm_param(X, y, nfolds):
    """Hyperparameter tuning for SVM via grid search."""
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_
Example #7
def svc_parameter_selection(X, y, nfolds):
    cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_
Example #8
def svc_param_selection(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 1.1, 2, 3, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    #kernels = ['linear', 'rbf', 'poly']
    param_grid = {'C': Cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search, grid_search.best_params_
def svc_param_selection(X, y, nfolds):
    Cs = [0.1, 1, 10, 100]
    gammas = [0.001, 0.01, 0.1]
    param_grid = {'C': Cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='sigmoid'),
                               param_grid,
                               cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_, grid_search.cv_results_
Example #10
def validation(data, target, constant):
    score = 0

    regressor = svm.NuSVR(kernel="poly")

    param_grid = {
        'C': np.linspace(20.0, 40.0, 10),
        'nu': np.linspace(0.0001, 1, 5)
    }

    grid_search = sklearn.grid_search.GridSearchCV(
        regressor,
        param_grid,
        scoring=sklearn.metrics.make_scorer(sklearn.metrics.mean_squared_error,
                                            greater_is_better=False),
        cv=5,
        n_jobs=-1)
    grid_search.fit(data, target)
    clf = grid_search.best_estimator_
    print(clf)

    chunk_size = len(data) // CVSize
    for x in range(CVSize):

        # These describe where to cut to get our cross-validation data
        first_step = x * chunk_size
        second_step = (x + 1) * chunk_size

        # Get the data parts we train on
        cross_data = np.vstack((data[:first_step], data[second_step:]))
        cross_target = np.append(target[:first_step], target[second_step:])

        # Refit the estimator on the training folds
        clf.fit(cross_data, cross_target)

        # Hold out this chunk for evaluation
        sample_data = data[first_step:second_step]
        sample_target = target[first_step:second_step]

        # Get scores for our model
        pred = clf.predict(sample_data)
        RMSE = mean_squared_error(sample_target, pred)**0.5
        score += RMSE

    score = score / CVSize

    print("Cross-Validation RMSE: {} ".format(score))

    # Get global score
    clf.fit(data, target)
    pred = clf.predict(data)
    RMSE = mean_squared_error(target, pred)**0.5
    print("RMSE on whole dataset {}".format(RMSE))

    return score
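The hand-rolled fold loop above reimplements k-fold cross-validation; scikit-learn's cross_val_score does the same bookkeeping. A sketch under the assumption of a modern scikit-learn (the toy regression data is ours):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.svm import NuSVR

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
# 'neg_mean_squared_error' returns negated MSE, hence the sign flip
mse_per_fold = -cross_val_score(NuSVR(kernel="poly"), X, y,
                                scoring='neg_mean_squared_error', cv=5)
print("Cross-Validation RMSE: {}".format(np.sqrt(mse_per_fold).mean()))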
def main():

    # Data Pre-Processing: Join the username table and service log table
    df1 = pd.read_csv("NewForm1.csv")
    df2 = pd.read_csv("serviceExecutionLog_dataset2.csv")
    df3 = pd.merge(df1, df2, on = ['userName', 'executionStartTime'], how = 'left')

    # Uppercase transformation
    df3['model'] = df3['model'].map(str.upper)

    # Write out to csv file
    df3.to_csv("NewForm1WithExecutionTime.csv")

    # Data Pre-Processing: Join the Climate Dataset table to feature to train
    df4 = pd.read_csv("/Users/dennis/Documents/SVM-Tasks/Climate_Datasets.csv")

    # Encoding: Grouping    
    df4['Dataset Group'] = df4['Dataset Group'].map(datasetgrouping)

    # Duplicate & Fillna
    df4['userName'] = df4['userName'].fillna('Unknown')
    df4['Users Group'] = df4['userName']
    df4['Users Group'] = df4['Users Group'].map(usergrouping)

    # Write out to FeaturesForTrain.csv
    df4.to_csv("FeaturesForTrain.csv")

    # Training/Testing Data and split  Preparation
    X, y = df4.astype(str).applymap(str.strip), df4['userName'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # Pipeline building
    pipeline = Pipeline([('vect', TfidfVectorizer(stop_words='english', lowercase=False)),
                         ('clf', SVC(kernel='rbf', gamma=0.01, C=100, max_iter=100, probability=True))])

    # Check the training data shape
    print(X_train.shape)

    # parameters setting
    parameters = {'clf__gamma': (0.01, 0.02, 0.1, 0.3, 1), 'clf__C': (0.1, 0.3, 1, 3, 10, 30)}

    # training with grid_search: parameters fillin
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')

    # training with grid_search with X_train data
    grid_search.fit(X_train, y_train)
    
    # Predictions
    predictions = grid_search.predict(X_test)
    predictions_probability = grid_search.predict_proba(X_test)

    # Prediction Results 
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print('Classification Report:', classification_report(y_test, predictions))
Example #12
 def svc_param_selection(self, X, y, nfolds):
     Cs = [0.001, 0.01, 0.1, 1, 10, 100]
     gammas = [0.001, 0.01, 0.1, 1, 10]
     param_grid = {'C': Cs, 'gamma': gammas}
     grid_search = GridSearchCV(svm.SVC(kernel='rbf'),
                                param_grid,
                                cv=nfolds)
     grid_search.fit(X, y)
     print(grid_search.best_params_)
     return grid_search.best_params_
def kernel_ridge_param_selection(X, y, nfolds):
    a = [0.4, 0.5, 0.6, 0.7]
    c = [1, 1.5, 2, 2.5, 3]
    d = [1, 2, 3]
    param_grid = {'alpha': a, 'coef0': c, 'degree': d}
    grid_search = GridSearchCV(KernelRidge(kernel='polynomial'),
                               param_grid,
                               cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_
Example #14
def validation(data, target, constant):
    score = 0

    #regressor = svm.NuSVR(kernel = "rbf", cache_size=1500, tol=1e-2)
    regressor = sklearn.linear_model.LassoLarsCV()
    #regressor = GradientBoostingClassifier()
    #param_grid = {'C':np.linspace(25.0 , 20000.0, num = 4), 'gamma':np.linspace(0.01/15.0, 0.5/15.0, 16)}
    #param_grid = {'eps':np.linspace(0.0001, 0.002, 4)}
    param_grid = {}  # empty grid: evaluates only the estimator's defaults
    grid_search = sklearn.grid_search.GridSearchCV(
        regressor,
        param_grid,
        scoring=sklearn.metrics.make_scorer(sklearn.metrics.mean_squared_error,
                                            greater_is_better=False),
        n_jobs=9)
    grid_search.fit(data, target)
    clf = grid_search.best_estimator_
    print(clf)

    chunk_size = len(data) // CVSize
    for x in range(CVSize):

        # These describe where to cut to get our cross-validation data
        first_step = x * chunk_size
        second_step = (x + 1) * chunk_size

        # Get the data parts we train on
        cross_data = np.vstack((data[:first_step], data[second_step:]))
        cross_target = np.append(target[:first_step], target[second_step:])

        # Refit the estimator on the training folds
        clf.fit(cross_data, cross_target)

        # Hold out this chunk for evaluation
        sample_data = data[first_step:second_step]
        sample_target = target[first_step:second_step]

        # Get scores for our model
        pred = clf.predict(sample_data)
        RMSE = mean_squared_error(sample_target, pred)**0.5
        score += RMSE

    score = score / CVSize

    print("Cross-Validation RMSE: {} ".format(score))

    # Get global score
    clf.fit(data, target)
    pred = clf.predict(data)
    RMSE = mean_squared_error(target, pred)**0.5
    print("RMSE on whole dataset {}".format(RMSE))

    return score
Example #15
 def svc_param_selection(self, X, y, nfolds):
     Cs = [0.001, 0.01, 0.1, 1, 10]
     gammas = [0.001, 0.01, 0.1, 1]
     param_grid = {'C': Cs, 'gamma': gammas}
     grid_search = GridSearchCV(svm.SVC(kernel='rbf'),
                                param_grid,
                                cv=nfolds,
                                n_jobs=4,
                                verbose=1)
     grid_search.fit(X, y)
     return grid_search.best_params_
Example #16
def svc_param_selection(X, y, nfolds):
    print('Using GridSearchCV for tuning the hyperparameters...')
    print()
    print('This might take some time (approx 5-10 mins)...')
    C_range = np.logspace(-2, 10, 13)
    gamma_range = np.logspace(-9, 3, 13)
    param_grid = {'C': C_range, 'gamma': gamma_range}
    model = svm.SVC(kernel='rbf')
    grid_search = GridSearchCV(model, param_grid, cv=nfolds)
    grid_search.fit(X, y)
    print("Best Params")
    print(grid_search.best_params_)
    return grid_search.best_params_
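The log-spaced ranges above expand to 13 values each, from 1e-02 to 1e+10 for C and 1e-09 to 1e+03 for gamma, i.e. 169 candidate settings per CV split, which is why the function warns about runtime. A quick check:

import numpy as np

C_range = np.logspace(-2, 10, 13)      # 1e-02, 1e-01, ..., 1e+10
gamma_range = np.logspace(-9, 3, 13)   # 1e-09, 1e-08, ..., 1e+03
print(len(C_range) * len(gamma_range))  # 169 candidate settings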
Example #17
def svc_param_selection(dataset, nfolds):
    # outcome and parameters (yPos, the target column index, is assumed to be defined elsewhere)
    y = np.array([x[yPos] for x in dataset])
    X = np.array([x[0:yPos] for x in dataset])

    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    kernels = ["linear", "rbf"]
    param_grid = {'C': Cs, 'gamma': gammas, 'kernel': kernels}
    grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_
def random_forest(X, y, data, split_iterator):
    hyperparameter_space = {
        "n_estimators": [150],
        "min_samples_split": [50],
        "min_samples_leaf": [7]
    }

    model = sklearn.ensemble.RandomForestClassifier()
    grid_search = sklearn.grid_search.GridSearchCV(model, hyperparameter_space, n_jobs=N_JOBS, cv=split_iterator, verbose=1)
    grid_search.fit(X, y)

    print "Random forest"
    print_tuning_scores(grid_search)
    print_feature_importances(data.drop("IsBlueWinner", axis=1).columns, grid_search.best_estimator_)
def neural_network(X, y, data, split_iterator):
    X = sklearn.preprocessing.scale(X)

    print "Neural network"

    hyperparameter_space = {
        "hidden_layer_sizes": [(75,)],
        "dropout": [0.5]
    }

    model = classifiers.NnWrapper(dropout=0.5, show_accuracy=True, batch_spec=((100, 1024), (100, -1)))
    grid_search = sklearn.grid_search.GridSearchCV(model, hyperparameter_space, n_jobs=3, verbose=1)
    grid_search.fit(X, y)

    print_tuning_scores(grid_search)
def SVM_Ranking_Model_Extraction_And_Encoding():

    # Pandas readin Training Samples
    Training_Table_Raw = pd.read_csv("FeatureToTrainWithoutTester.csv")
    Training_Table_Raw = Training_Table_Raw.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Dataset Start Time', 'Dataset End Time', 'executionStartTime', 'Dataset Group', 'Users Group'], axis = 1)
    Training_Table = Training_Table_Raw.copy()

    # Feature Encoding
    Training_Table = transform_features(Training_Table)

    # Training/Testing DataSet Split 
    Train_Test_Split = Training_Table.copy()
    X, y = Train_Test_Split.drop('userName', axis = 1), Train_Test_Split['userName']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # SVM configuration
    parameters = {'clf__gamma': (0.01, 0.02, 0.1, 0.3, 1), 'clf__C': (0.1, 0.3, 1, 3, 10, 30)}
    pipeline = Pipeline([('clf', SVC(kernel='rbf', gamma=0.01, C=100, max_iter=100, probability=True))])
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1, scoring='accuracy')
    Grid_Fit = grid_search.fit(X_train, y_train)
    
    predictions = grid_search.predict(X_test)

    Top_N_Recommender = Accumulation(Training_Table_Raw, Training_Table, Grid_Fit)

    # Prediction Results 
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print('Classification Report:', classification_report(y_test, predictions))

    return Top_N_Recommender
 def test_cv_pipeline(self):
     pipeline = SKL_Pipeline([
         ('vect', SKL_HashingVectorizer(n_features=20)),
         ('tfidf', SKL_TfidfTransformer(use_idf=False)),
         ('lasso', SKL_Lasso(max_iter=1))
     ])
     parameters = {
         'lasso__alpha': (0.001, 0.005, 0.01)
     }
     grid_search = GridSearchCV(self.sc, pipeline, parameters)
     data = [('hi there', 0.0),
             ('what is up', 1.0),
             ('huh', 1.0),
             ('now is the time', 5.0),
             ('for what', 0.0),
             ('the spark was there', 5.0),
             ('and so', 3.0),
             ('were many socks', 0.0),
             ('really', 1.0),
             ('too cool', 2.0)]
     df = self.sql.createDataFrame(data, ["review", "rating"]).toPandas()
     skl_gs = grid_search.fit(df.review.values, df.rating.values)
     assert len(skl_gs.grid_scores_) == len(parameters['lasso__alpha'])
     # TODO
     for gs in skl_gs.grid_scores_:
         pass # assert(gs.)
    def test_cv_lasso_with_mllib_featurization(self):
        data = [('hi there', 0.0),
                ('what is up', 1.0),
                ('huh', 1.0),
                ('now is the time', 5.0),
                ('for what', 0.0),
                ('the spark was there', 5.0),
                ('and so', 3.0),
                ('were many socks', 0.0),
                ('really', 1.0),
                ('too cool', 2.0)]
        data = self.sql.createDataFrame(data, ["review", "rating"])

        # Feature extraction using MLlib
        tokenizer = Tokenizer(inputCol="review", outputCol="words")
        hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20000)
        pipeline = Pipeline(stages=[tokenizer, hashingTF])
        data = pipeline.fit(data).transform(data)

        df = self.converter.toPandas(data.select(data.features.alias("review"), "rating"))

        pipeline = SKL_Pipeline([
            ('lasso', SKL_Lasso(max_iter=1))
        ])
        parameters = {
            'lasso__alpha': (0.001, 0.005, 0.01)
        }

        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
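GridSearchCV in these test methods is not scikit-learn's: the extra self.sc argument identifies it as the spark-sklearn wrapper, which takes a SparkContext first and distributes the per-candidate fits over the cluster. A minimal standalone sketch, assuming the (now archived) spark-sklearn package is installed:

from pyspark import SparkContext
from sklearn.linear_model import Lasso
from spark_sklearn import GridSearchCV

sc = SparkContext.getOrCreate()
# same constructor shape as in the tests above: (SparkContext, estimator, param_grid)
gs = GridSearchCV(sc, Lasso(max_iter=1), {'alpha': (0.001, 0.005, 0.01)})
# gs.fit(X, y) then behaves like the scikit-learn search object it wraps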
Example #23
    def test_cv_lasso_with_mllib_featurization(self):
        data = [('hi there', 0.0),
                ('what is up', 1.0),
                ('huh', 1.0),
                ('now is the time', 5.0),
                ('for what', 0.0),
                ('the spark was there', 5.0),
                ('and so', 3.0),
                ('were many socks', 0.0),
                ('really', 1.0),
                ('too cool', 2.0)]
        data = self.sql.createDataFrame(data, ["review", "rating"])

        # Feature extraction using MLlib
        tokenizer = Tokenizer(inputCol="review", outputCol="words")
        hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20000)
        pipeline = Pipeline(stages=[tokenizer, hashingTF])
        data = pipeline.fit(data).transform(data)

        df = self.converter.toPandas(data.select(data.features.alias("review"), "rating"))

        pipeline = SKL_Pipeline([
            ('lasso', SKL_Lasso())
        ])
        parameters = {
            'lasso__alpha': (0.001, 0.005, 0.01)
        }

        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
Example #25
def run_gridsearch(X, y, clf, cv=10):

    param_grid = {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [10, 15, 20, 40],
        "max_depth": [10, 15, 30],
        "min_samples_leaf": [15, 20, 30],
        "max_leaf_nodes": [35, 50, 60],
    }
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=cv)
    start = time()
    grid_search.fit(X, y)
    print(("\nGridSearchCV took {:.2f} "
           "seconds for {:d} candidate "
           "parameter settings.").format(time() - start,
                                         len(grid_search.grid_scores_)))
    top_params = report(grid_search.grid_scores_, 3)
    return top_params
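The report helper is not shown in the snippet; it presumably follows the classic scikit-learn example that walks the legacy grid_scores_ entries, along these lines (our reconstruction, assuming the old (parameters, mean_validation_score, cv_validation_scores) tuples):

import numpy as np
from operator import itemgetter

def report(grid_scores, n_top=3):
    # sort candidates by mean validation score, best first
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score, np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
    return top_scores[0].parameters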
Example #26
 def test_cv_linreg(self):
     pipeline = SKL_Pipeline([('lasso', SKL_Lasso(max_iter=1))])
     parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}
     grid_search = GridSearchCV(self.sc, pipeline, parameters)
     X = scipy.sparse.vstack(
         map(lambda x: self.list2csr([x, x + 1.0]), range(0, 100)))
     y = np.array(list(range(0, 100))).reshape((100, 1))
     skl_gs = grid_search.fit(X, y)
     assert len(skl_gs.cv_results_['params']) == len(
         parameters['lasso__alpha'])
Example #27
def run_gridsearch(X, y, clf, cv=10):

    param_grid = {"criterion": ["gini", "entropy"],
                  "min_samples_split": [10,15,20,40],
                  "max_depth": [10,15,30],
                  "min_samples_leaf": [30,40,50,55,100],
                  "max_leaf_nodes":  [35,50,60],
                  "min_samples_leaf" : [15,20,30]}
    grid_search = GridSearchCV(clf,
                               param_grid=param_grid,
                               cv=cv)
    start = time()
    grid_search.fit(X, y)
    print(("\nGridSearchCV took {:.2f} "
           "seconds for {:d} candidate "
           "parameter settings.").format(time() - start,
                len(grid_search.grid_scores_)))
    top_params = report(grid_search.grid_scores_, 3)
    return top_params
def decision_tree(X, y, data, split_iterator, dot_filename=None):
    hyperparameter_space = {
        "max_depth": [5, 10, 20],
        "min_samples_split": [25, 50, 100],
        "min_samples_leaf": [5, 10, 50]
    }

    model = sklearn.tree.DecisionTreeClassifier(max_depth=5)
    grid_search = sklearn.grid_search.GridSearchCV(model, hyperparameter_space, n_jobs=N_JOBS, cv=split_iterator, verbose=1)
    grid_search.fit(X, y)

    print "Decision tree tuning"
    print_tuning_scores(grid_search)

    # refit a shallow tree for visualization
    if dot_filename:
        model = sklearn.tree.DecisionTreeClassifier(max_depth=5)
        model.fit(X, y)
        sklearn.tree.export_graphviz(model, dot_filename, feature_names=data.drop("IsBlueWinner", axis=1).columns)
 def test_cv_linreg(self):
     pipeline = SKL_Pipeline([
         ('lasso', SKL_Lasso(max_iter=1))
     ])
     parameters = {
         'lasso__alpha': (0.001, 0.005, 0.01)
     }
     grid_search = GridSearchCV(self.sc, pipeline, parameters)
     X = scipy.sparse.vstack(map(lambda x: self.list2csr([x, x+1.0]), range(0, 100)))
     y = np.array(list(range(0, 100))).reshape((100,1))
     skl_gs = grid_search.fit(X, y)
     assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
Example #30
def train_model(feature_data, target_data, search_parameters):
    '''
    Train a model using ExtraTreesClassifier and entropy criterion for split quality:
    http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    '''
    # Train model
    model_pipeline = sklearn.pipeline.Pipeline([
        ('scale', sklearn.preprocessing.StandardScaler()),
        ('feature_select',
         sklearn.feature_selection.SelectPercentile(
             sklearn.feature_selection.f_classif)),
        ('classify',
         sklearn.ensemble.ExtraTreesClassifier(
             bootstrap=True,
             criterion='entropy',
         ))
    ])

    # Create the stratified cross-validation folds; the distribution of the target
    # within each fold mirrors the distribution in the overall population.
    cv = sklearn.cross_validation.StratifiedKFold(target_data, n_folds=10)

    # Create grid searcher
    grid_search = sklearn.grid_search.GridSearchCV(model_pipeline,
                                                   search_parameters,
                                                   scoring=sklearn.metrics.make_scorer(sklearn.metrics.fbeta_score,
                                                                                       beta=1.0,
                                                                                       pos_label=pandas.Series(
                                                                                           target_data) \
                                                                                       .value_counts().idxmax()),
                                                   cv=cv,
                                                   verbose=0,
                                                   n_jobs=2)

    # Fit model in grid search
    grid_search.fit(feature_data, target_data)

    return grid_search
def feature_training(X_data, y_data, feature_name):

    # Retrieve the features
    X_Train, y_Train = X_data[feature_name].values.reshape(-1, 1), y_data

    # Configuring the parameters
    parameters = {'clf__gamma': (0.01, 0.02, 0.1, 0.3, 1), 'clf__C': (0.1, 0.3, 1, 3, 10, 30)}

    pipeline = Pipeline([('clf', SVC(kernel='linear', gamma=0.01, C=100, max_iter=10))])

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1, scoring='accuracy')
    
    model = grid_search.fit(X_Train, y_Train)

    return model
pcfilter = sklearn.feature_selection.SelectPercentile(sklearn.feature_selection.f_classif, percentile=100)

pipeline = sklearn.pipeline.Pipeline([('filter', pcfilter), ('clf', clf)])

parameters = {'filter__percentile': [50, 60, 40, 70, 30, 80, 20]}

scorer = sklearn.metrics.make_scorer(sklearn.metrics.log_loss, greater_is_better=False, needs_proba=True)

grid_search = sklearn.grid_search.GridSearchCV(pipeline, parameters, scoring=scorer, n_jobs=1, cv=4, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time.time()
grid_search.fit(XX_train, yy_train)
print("done in %0.3fs" % (time.time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

try:
    joblib.dump(grid_search, '/disk/data1/s1145806/train3_model.pkl')
except Exception:
    pass  # best-effort model persistence; ignore failures
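On recent scikit-learn the scorer above is spelled differently: needs_proba was deprecated in 1.4 in favor of response_method. A sketch of the modern form (keep needs_proba=True on older versions):

from sklearn.metrics import log_loss, make_scorer

# scikit-learn >= 1.4; older versions use needs_proba=True instead
scorer = make_scorer(log_loss, greater_is_better=False,
                     response_method="predict_proba")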

Example #33
pcfilter = sklearn.feature_selection.SelectPercentile(sklearn.feature_selection.f_classif, percentile=100)

pipeline = sklearn.pipeline.Pipeline([('filter', pcfilter), ('clf', clf)])

parameters = {'filter__percentile': [75, 80, 85, 90, 95, 100]}

scorer = sklearn.metrics.make_scorer(sklearn.metrics.log_loss, greater_is_better=False, needs_proba=True)

grid_search = sklearn.grid_search.GridSearchCV(pipeline, parameters, scoring=scorer, n_jobs=1, cv=4, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time.time()
grid_search.fit(XX_train, yy_train)
print("done in %0.3fs" % (time.time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

try:
    joblib.dump(grid_search, '/disk/data1/s1145806/train_model_refined.pkl')
except Exception:
    pass  # best-effort model persistence; ignore failures

Example #34
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

param_dist = {"base_estimator__max_depth": [1,2,3],
              "base_estimator__min_samples_split": [1,2],
              "base_estimator__min_samples_leaf": [1,2],
              "n_estimators": [2,3,5],
              "learning_rate":[0.4,0.6,0.8],
              "algorithm":["SAMME","SAMME.R"]
             }

cv = cross_validation.StratifiedShuffleSplit(y_train, n_iter=4, random_state=9)
f1score = make_scorer(f1_score, pos_label="yes")

# build a classifier
dt_clf = DecisionTreeClassifier()
clf = AdaBoostClassifier(dt_clf)

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_dist, cv=cv, scoring=f1score)
gs_estimator = grid_search.fit(X_train, y_train)

print("Best model parameter:  " + str(gs_estimator.best_params_))
y_pred = grid_search.predict(X_test)
# print(y_pred)
gs_f1score = f1_score(y_test, y_pred, pos_label="yes")
print("f1 score: {:.5f}".format(gs_f1score))


# (fragment: begins mid-way through a feature-extraction loop that fills the
#  `descriptor` and `label` lists used below)
            #temp = np.vstack([bowDiction.compute(im,orb.detect(im)), h])
            descriptor.append(tempy)
            for y in range (0,18):
                if(train_names[y] in x):
                    label.append(y)

#svm = SVC(C = 4, gamma = 0.4)
svm = SVC()
#svm.fit(np.array(descriptor), np.array(label))
#svm = LinearSVC(random_state=0)
#svm.fit(np.array(descriptor),np.array(label))
#svc = SVC()
param_grid = [
  {'C': [2.1,2.3,2.5],'gamma':[0.40,0.45,0.50]}]
grid_search = GridSearchCV(svm, param_grid = param_grid)
grid_search.fit(np.array(descriptor), np.array(label))
svm = grid_search
print("three")
joblib.dump(orb, 'model_sift.pkl',protocol=2)
joblib.dump(svm, 'model_svm.pkl', protocol=2) #Save Model
joblib.dump(dic, 'bow_dic.pkl', protocol=2)
confusion = np.zeros((18,18))
count = 0
matrix = []
for x in train_name_path:
    im = cv2.imread(x,0)
    if im is not None:
        hog = cv2.HOGDescriptor()
        #hog.winSize = Size(16,32)
        img = cv2.resize(im,(64,128))
        temp = bowDiction.compute(img,orb.detect(img))
print("Testing set Accuracy ",
      accuracy_score(y_test, y_pred.round(), normalize=True))

# XXX
# TODO: Tune the hyper-parameters 'C' and 'kernel' (use rbf and linear).
#       Print the best params, using .best_params_, and print the best score, using .best_score_.
# Get the training and test set accuracy values after hyperparameter tuning.
# XXX

Cs = [1, 10, 100]
kernels = ['linear', 'rbf']

param_grid = {'C': Cs, 'kernel': kernels}

grid_search = GridSearchCV(SVC(), param_grid=param_grid, cv=5)
grid_search.fit(rescaledX, y_data)
print("best params ", grid_search.best_params_)
print("best score ", grid_search.best_score_)

tuningpredict = grid_search.predict(X_test)

print("Train Accuracy  ", accuracy_score(y_train,
                                         grid_search.predict(X_train)))
print("Test Accuracy ",
      accuracy_score(y_test, tuningpredict.round(), normalize=True))

svclassifier2 = SVC(kernel='linear', C=1)
svmd2 = svclassifier2.fit(X_train, y_train)
y_pred2 = svmd2.predict(X_test)
print("Train Accuracy  ", accuracy_score(y_train, svmd2.predict(X_train)))
print("Test Accuracy ", accuracy_score(y_test, y_pred2.round(),
Example #37
def getClassifier(data, target):
    score = 0
    temp = 0

    # Classifier to use in BaggingClassifier
    classifier1 = ensemble.ExtraTreesClassifier(min_samples_split=3,
                                                n_estimators=10,
                                                max_features=4)

    # Classifier for GridSearch
    classifier = ensemble.BaggingClassifier(classifier1)

    # Params
    param_grid = {'n_estimators': range(5, 25)}
    #param_grid = {'n_estimators' : np.linspace(10,11, num = 2)}

    # GridSearch
    grid_search = sklearn.grid_search.GridSearchCV(
        classifier,
        param_grid,
        scoring=sklearn.metrics.make_scorer(accuracy_score),
        cv=5,
        n_jobs=4)
    grid_search.fit(data, target)
    clf = grid_search.best_estimator_

    # Print Estimator
    print(clf)

    # Print Cross of Validations Scores
    print(cross_val_score(clf, data, target, cv=5, scoring='accuracy'))

    # Print Mean of Cross Validations Scores
    temp = np.mean(cross_val_score(clf, data, target, cv=5,
                                   scoring='accuracy'))
    print("Built-in Cross-Validation: {} ".format(temp))

    # Martins Version of Cross Validation
    chunk_size = len(data) // CVSize
    for x in range(CVSize):

        # These describe where to cut to get our cross-validation data
        first_step = x * chunk_size
        second_step = (x + 1) * chunk_size

        # Get the data parts we train on
        cross_data = np.vstack((data[:first_step], data[second_step:]))
        cross_target = np.append(target[:first_step], target[second_step:])

        # Refit the estimator on the training folds
        clf.fit(cross_data, cross_target)

        # Hold out this chunk for evaluation
        sample_data = data[first_step:second_step]
        sample_target = target[first_step:second_step]

        # Get the accuracy for our model on the held-out chunk
        pred = clf.predict(sample_data)
        acc = accuracy_score(sample_target, pred)
        score += acc

    score = score / CVSize

    print("Cross-Validation accuracy: {} ".format(score))

    # Get global score
    #clf.fit(data, target)
    #pred = clf.predict(data)
    #RMSE = accuracy_score(target, pred)
    #print("RMSE on whole dataset {}".format(RMSE))

    # Return estimator/classifier
    return clf
    sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
    sgd.fit(X_train_dtm, Y_train)
    y_pred_class_svm = sgd.predict(X_test_dtm)
    print(metrics.accuracy_score(Y_test, y_pred_class_svm), "SVM-SGD -countvectorizer")
    svm_t = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
    svm_t.fit(X_train_tfidf, Y_train)
    y_pred_svm_t = svm_t.predict(X_test_tfidf)
    print(metrics.accuracy_score(Y_test, y_pred_svm_t), "SVM-SGD -tfidf")
    #grid
    print("grid")
    from sklearn import svm, grid_search
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas, 'kernel':('poly', 'rbf')}
    grid_search = GridSearchCV(svm.SVC(), param_grid, cv=5)
    grid_search.fit(X_train_dtm, Y_train)
    print(grid_search.best_score_)
    print(grid_search.best_params_)
    y_grid_search_svm = grid_search.predict(X_test_dtm)
    print(metrics.accuracy_score(Y_test,y_grid_search_svm),"grid search- SVM")
    



    
    #X_train, X_test, y_train, y_test = train_test_split(corpus, labels, random_state=1, train_size=0.90)
    #X_train_tfidf, vectorizer = generate_features(X_train)
    #X_test_tfidf, vectorizer = generate_features(X_test)
Example #39
    def run(self,
            label_groupings,
            data_splits,
            feature_extractors,
            estimators,
            scores,
            cv=5,
            n_jobs=1,
            to_csv=None):

        self.label_groupings = label_groupings
        self.data_splits = data_splits
        self.feature_extractors = feature_extractors
        self.estimators = estimators
        self.scores = scores

        # total number of grid-search settings
        total_settings = (len(label_groupings) * len(feature_extractors) *
                          len(estimators) * len(scores))

        # index of the current grid-search setting
        current_setting = 0

        for l_key, l_settings in label_groupings.items():
            for f_key, f_settings in feature_extractors.items():
                for e_key, e_settings in estimators.items():
                    for s_key, s_settings in scores.items():

                        current_setting += 1

                        logger.info("Running setting " + str(current_setting) +
                                    "/" + str(total_settings) + ": " +
                                    str(l_settings.title) + " | " +
                                    str(f_settings.title) + " | " +
                                    str(e_settings.title) + " | " +
                                    str(s_settings.title))

                        pipeline = sklearn.pipeline.Pipeline([
                            ('vect', f_settings.vectorizer),
                            ('clf', e_settings.estimator)
                        ])

                        parameters = {}
                        parameters.update(f_settings.parameter_space)
                        parameters.update(e_settings.parameter_space)

                        grid_search_cv = sklearn.grid_search.GridSearchCV
                        grid_search = grid_search_cv(pipeline,
                                                     parameters,
                                                     scoring=s_settings.score,
                                                     cv=cv,
                                                     n_jobs=n_jobs)
                        grid_search.fit(self.data_splits[l_key].X_train,
                                        self.data_splits[l_key].Y_train)

                        score_train_ = grid_search.best_score_
                        score_test_ = grid_search.score(
                            self.data_splits[l_key].X_test,
                            self.data_splits[l_key].Y_test)

                        hash_key = hash(l_key + f_key + e_key + s_key)
                        self.results[hash_key] = [
                            l_key, l_settings.title, l_settings, f_key,
                            f_settings.title, f_settings, e_key,
                            e_settings.title, e_settings, s_key,
                            s_settings.title, s_settings, score_train_,
                            score_test_, grid_search.best_estimator_,
                            grid_search.best_params_, grid_search.grid_scores_
                        ]

                        # update results
                        resultrows = []
                        for key, result in self.results.items():
                            resultrows.append(result)

                        self.results_table = pd.DataFrame(resultrows,
                                                          columns=self.columns)
                        if to_csv:
                            self.results_to_csv(to_csv)

        return self.results_table
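The four nested loops above enumerate the Cartesian product of the settings dictionaries; itertools.product expresses the same traversal flat. A small sketch with stand-in dictionaries:

import itertools

label_groupings = {'l1': None}
feature_extractors = {'f1': None, 'f2': None}
estimators = {'e1': None}
scores = {'s1': None}

# one iteration per combination, same order as the nested-loop version
for (l_key, l), (f_key, f), (e_key, e), (s_key, s) in itertools.product(
        label_groupings.items(), feature_extractors.items(),
        estimators.items(), scores.items()):
    print(l_key, f_key, e_key, s_key)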
#print(X_test.shape, y_test.shape)
#print(rfe.score(X_test,y_test))
###########################################################      Logistic Regression  with grid search  ##############################################################################


#p = pd.read_pickle('C:\\Users\\Tawsif Sazid\\Desktop\\lala.xls')
#print(first.dtype)
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 2000, .0001, 1E6, .00000001]
#solver = ['newton-cg','sag']
##gammas = [0.001, 0.01, 0.1, 1, 10, 100, 200, 1000, 2000, .0001]   # gamma not needed here
max_iter = [100,1000,10000]
param_grid = {"C": Cs, 'max_iter': max_iter}

mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
grid_search = GridSearchCV(estimator=mul_lr, param_grid=param_grid, cv=12, refit=True)
grid_search.fit(yoo1, y)
print(grid_search.best_score_)


####import statsmodels.api as sm
'''
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)
print("Logistic Regression")
print(mul_lr.predict(X_test))
print(mul_lr.score(X_test,y_test))
'''
######################################################### Perform 6-fold cross validation #########################################################################
# Necessary imports:
# from sklearn.cross_validation import cross_val_score, cross_val_predict
# from sklearn import metrics
            results_recall = np.array([])
            results_f1 = np.array([])

            for train_indices, test_indices in kf:

                features_train= [features[ii] for ii in train_indices]
                features_test= [features[ii] for ii in test_indices]
                labels_train=[labels[ii] for ii in train_indices]
                labels_test=[labels[ii] for ii in test_indices]

                grid_search = GridSearchCV(pipeline,
                                           param_grid = param_grid,
                                           scoring = scoring_index,
                                           n_jobs=1)

                clf = grid_search.fit(features_train, labels_train)
                pred = clf.best_estimator_.predict(features_test)

                print('Best Estimator >>> ', grid_search.best_estimator_)
                print('Best score >>> ', grid_search.best_score_)
                print('Best scorer >>> ', grid_search.scorer_)
                print('Best parameters >>> ', grid_search.best_params_)

                results_acc = np.append(results_acc, [accuracy_score(labels_test,pred)], axis=0)
                results_precision = np.append(results_precision, [precision_score(labels_test,pred)], axis=0)
                results_recall = np.append(results_recall, [recall_score(labels_test,pred)], axis=0)
                results_f1 = np.append(results_f1, [f1_score(labels_test,pred)], axis=0)

            print('>>>>>>>>>>', clfNames)
            print("avg precision : ", np.array(results_precision).mean())
            print("avg recall : ", np.array(results_recall).mean())
Example #42
#p = pd.read_pickle('C:\\Users\\Tawsif Sazid\\Desktop\\lala.xls')
#print(first.dtype)
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 2000, .0001, 1E6, .00000001]
#solver = ['newton-cg','sag']
##gammas = [0.001, 0.01, 0.1, 1, 10, 100, 200, 1000, 2000, .0001]   # gamma not needed here
max_iter = [100, 1000, 10000]
param_grid = {"C": Cs, 'max_iter': max_iter}

mul_lr = linear_model.LogisticRegression(multi_class='multinomial',
                                         solver='newton-cg')
grid_search = GridSearchCV(estimator=mul_lr,
                           param_grid=param_grid,
                           cv=12,
                           refit=True)
grid_search.fit(yoo1, y)
print(grid_search.best_score_)

####import statsmodels.api as sm
'''
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)
print("Logistic Regression")
print(mul_lr.predict(X_test))
print(mul_lr.score(X_test,y_test))
'''
######################################################### Perform 6-fold cross validation #########################################################################
# Necessary imports:
# from sklearn.cross_validation import cross_val_score, cross_val_predict
# from sklearn import metrics
# scores = cross_val_score(mul_lr, first, y, cv=10)
Example #43
    # print grid_search.grid_scores_
    # print grid_search.best_estimator_
    #Best estimator was C=0.5.

    #Re-centering search.
    # parameters = {'C':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
    # grid_search = grid_search.GridSearchCV(model_svm, parameters)
    # grid_search.fit(X_train, Y_train)
    # print grid_search.grid_scores_
    # print grid_search.best_estimator_
    #Best estimator was C=0.5.

#Narrowing down by order of magnitude.
parameters = {'C':[0.45, 0.46, 0.47, 0.48, 0.49, 0.50, 0.51, 0.52, 0.53, 0.54, 0.55]}
grid_search = grid_search.GridSearchCV(model_svm, parameters)
grid_search.fit(X_train, Y_train)
print(grid_search.grid_scores_)
print(grid_search.best_estimator_)
#Best estimator was C=0.45. Because we already compared 0.4 to 0.5 two searches above, and 0.5 was selected, we induce that 0.45 is the optimal value without searching between 0.40 and 0.45.

#Returning model results with optimal 'C' value.
expected = Y_test
predicted = grid_search.predict(X_test)

print(classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
print(metrics.accuracy_score(expected, predicted))

#Support Vector Machine: Model fit, transform, and testing with optimized 'C' value
splits = cv.train_test_split(X_train_tfidf, dataset.target, test_size=0.2)
X_train, X_test, Y_train, Y_test = splits
Example #44
def classification(FV_N):
    " PCA reduction dimension & Random Forest Classification"

    pca = decomposition.PCA()
    RFC = RandomForestClassifier()

    estimators = [('reduce_dim', pca), ('Random_Forest', RFC)]
    pipe = Pipeline(estimators)

    # Search the best parameters for the classification
    #for i in range(100,700,100):
    #    cc=[i]+cc
    #nb_tree=[]
    #random_st=[]
    #for i in range(50,350,50):
    #    nb_tree=[i]+nb_tree
    #    random_st=[0]+random_st

    cc = [70, 80, 90]
    nb_tree = [200, 200, 200]
    random_st = [0, 0, 0]

    params = dict(reduce_dim__n_components=cc,
                  Random_Forest__n_estimators=nb_tree,
                  Random_Forest__random_state=random_st)

    grid_search = GridSearchCV(pipe, param_grid=params)

    X = FV_N

    yr = Get_true_y(Data_FRAMES)

    filename_yr = projectpath + 'io/Output/yr.npy'

    np.save(filename_yr, yr)

    yr = np.load(filename_yr)

    # Align X and yr to the same length
    X = X[:yr.shape[0]]
    yr = yr[:X.shape[0]]
    np.save(filename_yr, yr)

    yr = np.load(filename_yr)

    grid_search.fit(X, yr)

    print(grid_search.best_estimator_)

    plt.figure()
    plt.axvline(
        grid_search.best_estimator_.named_steps['reduce_dim'].n_components,
        linestyle=':',
        label='n_components chosen')
    plt.legend(prop=dict(size=12))
    plt.show()

    plt.figure()
    plt.axvline(
        grid_search.best_estimator_.named_steps['Random_Forest'].n_estimators,
        linestyle=':',
        label='n_estimators chosen')
    plt.legend(prop=dict(size=12))
    plt.show()

    n_est_rdf = grid_search.best_estimator_.named_steps[
        'Random_Forest'].n_estimators

    n_compo_pca = grid_search.best_estimator_.named_steps[
        'reduce_dim'].n_components

    pca = decomposition.PCA(n_components=n_compo_pca, svd_solver='auto')
    pca.fit(X)

    variance_Ratio = pca.explained_variance_ratio_

    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.axes([.2, .2, .7, .7])
    plt.plot(pca.explained_variance_ratio_.cumsum(), linewidth=1)
    plt.axis('tight')
    plt.xlabel('n_components')
    plt.ylabel('Cumulative Explained variance')

    M = pca.transform(X)

    plt.figure()
    plt.plot(M[yr == 1, 0], M[yr == 1, 1], 'or')
    plt.title('Astrocytes')
    plt.figure()
    plt.plot(M[yr == 2, 0], M[yr == 2, 1], 'ob')
    plt.title('Neurons')

    grid_search.predict(X)

    metrics.accuracy_score(yr, grid_search.predict(X))

    RFC = RandomForestClassifier(n_estimators=n_est_rdf, random_state=0)

    predictedVAL = cross_val_predict(RFC, X, yr, n_jobs=-1)

    metrics.accuracy_score(yr, predictedVAL)

    Conf_Mat = confusion_matrix(yr, predictedVAL)

    import seaborn as sns
    sns.heatmap(Conf_Mat.T, square=True, annot=True, cbar=False)
    plt.xlabel('True label')
    plt.ylabel('Predicted label')

    return
Example #45
# for reproducibility
clf.set_params(adaboostclassifier__random_state=42)

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

import sklearn.grid_search
# grid search for best params
parameters = {"adaboostclassifier__n_estimators" : (10, 15, 25, 40, 70),
              "adaboostclassifier__learning_rate" : (.1, .2, .4, .7, 1)}
# score recall because that is the lower value for this dataset/ classifier
grid_search = sklearn.grid_search.GridSearchCV(clf, parameters,
                                               scoring="recall", cv=6)
grid_search.fit(features, labels)
# apply found params
clf.set_params(adaboostclassifier__n_estimators=
               grid_search.best_params_["adaboostclassifier__n_estimators"],
               adaboostclassifier__learning_rate=
               grid_search.best_params_["adaboostclassifier__learning_rate"])

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
tfidf = np.hstack([tfidf_0, tfidf_1, tfidf_2, tfidf_3])
tfidf_train = tfidf[:num_train,:]
tfidf_test = tfidf[num_train:,:]
#df_all = pd.read_csv('df_all_clean.csv',sep='\t', encoding='utf-8')
df_all = pd.read_csv('df_all_clean.csv',encoding="ISO-8859-1")
df_all = df_all.drop([df_all.columns[0],\
                      'search_term','product_title','product_description','product_info','brand','attr'],axis=1)


df_train = df_all.iloc[:num_train]
y = df_train['relevance'].values
X = np.hstack([df_train.drop(['relevance'],axis=1).values,tfidf_train])
y = y[::3]
X = X[::3]
#print train.shape
#print valid.shape
#print y_train.shape
#print y_valid.shap
start_time = time.time()
clf = RandomForestRegressor(n_estimators=10000)
from sklearn.grid_search import GridSearchCV

param_grid = {"max_depth": [20,25,30,35,40],"min_samples_leaf": [1, 3,7, 10]}

grid_search = GridSearchCV(clf, param_grid=param_grid)
grid_search.fit(X, y)

print(grid_search.best_params_)
print("--- Training & Testing: %s minutes ---" % round(((time.time() - start_time)/60),2))
def SVM_Ranking_Model_Extraction_And_Encoding():

    # Pandas readin Training Samples
    df = pd.read_csv("FeatureToTrainWithoutTester.csv")
    df2 = df.copy()
    df2 = df2.drop(['Dataset Start Time', 'Dataset End Time', 'executionStartTime', 'Dataset Group', 'Users Group'], axis = 1)
    df2.head()

    # Feature Encoding
    transform_features(df2)
    df2.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis = 1)
    df2.head()
    
    # Encoded Features
    df = pd.read_csv("Transform_features.csv")

    # Training/Testing DataSet Split 
    df3 = df.copy()
    y = df3['userName']
    df3 = df3.drop(['userName'], axis = 1)
    X = df3
    X_train, X_test, y_train, y_test = X, X, y, y  # NB: no real split; train and test alias the same data

    # SVM configuration
    parameters = {'clf__gamma': (0.01, 0.02, 0.1, 0.3, 1), 'clf__C': (0.1, 0.3, 1, 3, 10, 30)}
    pipeline = Pipeline([('clf', SVC(kernel='rbf', gamma=0.01, C=100, max_iter=100, probability=True))])
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1, scoring='accuracy')
    result2 = grid_search.fit(X_train, y_train)

    #coef = (result.best_estimator_.get_params()['clf'].coef_)
    #coef2 = coef_sum(coef)
    #coef2
    
    index = ['DatasetName', 'Agency', 'Instrument', 'Physical variable', 'var',
             'Units', 'Grid Dimension', 'Variable Name in Web Interface', 'model']


    # Model Estimation
    model = []

    for i in index:
        # Features' distance/relevant to category prediction
        model.append(feature_training(X_train, y_train, i))


    # Training data distance to single column PCA
    weight_set = numpy.zeros((len(X_train), len(index)))

    for j in range(0, len(X_train)):

        dict_index = 0

        for i in index:

            # Features' distance/relevant to category prediction
            model_extraction = model[dict_index]
            sample = X_train[j:j+1]
            weight = feature_distance(sample, i, model_extraction)
            weight_set[j, dict_index] = weight

            dict_index = dict_index + 1

            print "[INFO] Data Points: ", j, "Columns Iteration: ", dict_index
            print "[INFO] Weight : ", weight

        if j % 100 == 0:
            weight_set_file = pd.DataFrame(weight_set.copy())
            weight_set_file.to_csv("weight_set.csv")


    # Delivery: Training data with Label 
    Training_matrix = pd.DataFrame(weight_set.copy())
    Training_matrix['Label'] = y_train


    # SVM Ranking Formatting
    SVM_Rank_Formatted_Training_data  = Training_matrix.copy()

    for j in range(0, len(X_train)):
        for i in range(0, 9):
            SVM_Rank_Formatted_Training_data.ix[j, i] = str(i + 1) + ":" + str(SVM_Rank_Formatted_Training_data.ix[j, i])
        # the label only needs to be set once per row, not once per column
        SVM_Rank_Formatted_Training_data.ix[j, 'Label'] = str(int(SVM_Rank_Formatted_Training_data.ix[j, 9]))

    # Columns Reorder
    Rank_format_columns = SVM_Rank_Formatted_Training_data.columns.tolist()
    Rank_format_columns = Rank_format_columns[-1:] + Rank_format_columns[:-1]
    SVM_Rank_Formatted_Training_data = SVM_Rank_Formatted_Training_data[Rank_format_columns]

    # Write to CSV format
    SVM_Rank_Formatted_Training_data.to_csv("SVM_Rank_Formatted_Training_data2.dat", index = False, sep = ' ', index_label = False, header = False)
    SVM_Rank_Formatted_Training_data.to_csv("SVM_Rank_Formatted_Training_data2.csv")

    predictions = grid_search.predict(X_test)

    # Prediction Results 
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print('Classification Report:', classification_report(y_test, predictions))
Example #48
tuned_parameters = [
    {
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    },
    {
        'kernel': ['sigmoid'],
        'C': [1, 10, 100, 1000]
    },
    {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000]
    }
]

grid_search = GridSearchCV(clf, param_grid=tuned_parameters)
grid_search.fit(X_trn, Y_trn)

print("GridSearchCV took")
report(grid_search.cv_results_)

# randomized parameter optimization
dist_parameters = {
    "kernel": ['poly', 'rbf', 'sigmoid', 'linear'],
    "C": scipy.stats.expon(
        scale=1000
    ),  #scipy.stats.expon : An exponential continuous random variable. 즉 0~1사이의 값을 랜덤하게 추출
    "degree": scipy.stats.expon(scale=10),  # scale = 1/lamda 즉 
    "gamma": scipy.stats.expon(scale=.1)
}

n_iter_search = 20
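The fragment stops before the randomized search is actually run. A self-contained sketch of the step it is leading up to (clf, the training arrays, and the integer degree distribution are our stand-ins; SVC requires an integer degree, so randint replaces the exponential draw above):

import scipy.stats
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

X_trn, Y_trn = make_classification(n_samples=200, random_state=0)
dist_parameters = {
    "kernel": ['poly', 'rbf', 'sigmoid', 'linear'],
    "C": scipy.stats.expon(scale=1000),
    "degree": scipy.stats.randint(2, 6),  # integer stand-in for the expon above
    "gamma": scipy.stats.expon(scale=.1),
}
# n_iter candidates are sampled from the distributions instead of an exhaustive grid
search = RandomizedSearchCV(SVC(), dist_parameters, n_iter=20, random_state=0)
search.fit(X_trn, Y_trn)
print(search.best_params_)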