def buildRandomForest(self, X_train, X_test, y_train, cv = 3, n_iter = 5, save = False):
        rf = RandomForestClassifier(random_state = 9)
        #Tune the model
        param_distributions = {
            'n_estimators': range(1,50,1),
            'max_depth': range(1,70,1),
            'max_features': range(6,15,1),
            'min_samples_split':[2,3,4],
            'min_samples_leaf':[1,2,3,4],
            'n_jobs':[-1]
        }

        rf_optimized = RandomizedSearchCV(
            estimator = rf,
            param_distributions = param_distributions,
            n_iter= n_iter,
            scoring = 'f1',
            cv = cv,
            random_state = 1
        )

        rf_optimized.fit(X_train, y_train)
        if save:
            joblib.dump(value = rf_optimized, filename = "rf_optimized.pkl", compress=1)

        print "Best parameter: %s"  %rf_optimized.best_params_
        print "Best average cross validated F1 score: %0.4f" %rf_optimized.best_score_
        print "--------------------------------------------"

        #predictions
        predicted_y_train = rf_optimized.predict(X_train)
        predicted_y_test = rf_optimized.predict(X_test)

        return predicted_y_train, predicted_y_test
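A side note on the search space above: `n_jobs` is an estimator constructor argument, not a hyper-parameter, so listing it in `param_distributions` merely pins it to -1; it is cleaner to set it on the estimator itself. `RandomizedSearchCV` also accepts scipy.stats distributions in place of exhaustive ranges. A minimal, self-contained sketch of that idea (synthetic data and illustrative bounds, not the original project's):

# Sketch only: same pattern as buildRandomForest, with scipy.stats distributions.
from scipy.stats import randint
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

X, y = make_classification(n_samples=200, n_features=15, random_state=0)
search = RandomizedSearchCV(
    RandomForestClassifier(random_state=9, n_jobs=-1),  # n_jobs set here, not searched
    param_distributions={
        'n_estimators': randint(1, 50),
        'max_depth': randint(1, 70),
        'min_samples_leaf': randint(1, 5),
    },
    n_iter=5, scoring='f1', cv=3, random_state=1)
search.fit(X, y)
print(search.best_params_, search.best_score_)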
def build_sample(regressor, name):

	# print estimator.get_params().keys() : specify parameters and distributions to sample from
	param_dist = {"max_depth": [3, None],
		      "max_features": sp_randint(1, 11),
		      "min_samples_split": sp_randint(1, 11),
		      "min_samples_leaf": sp_randint(1, 11)}#,
		      #"bootstrap": [True, False],
		      #"criterion": ["mse", "entropy"]}

	

	# run randomized search
	n_iter_search = 20
	random_search = RandomizedSearchCV(regressor, param_distributions=param_dist,
		                           n_iter=n_iter_search)
	
	# time...
	start = time()
	# repeat the CV procedure 10 times to get more precise results
	n = 10  
	# for each iteration, randomly hold out 10% of the data as CV set
	for i in range(n):
		X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
		      sample_X, sample_y, test_size=.10, random_state=i*SEED)
		# train with rand...
		random_search.fit(X_train, y_train)
		# train...
		#regressor = regressor.fit(X_train, y_train)
		# save model
		#store_pkl(regressor, name + ".pkl")
		# predict on train
		preds = random_search.predict(X_cv)
		# print 
		#print preds
		# create DataFrame
		#preds = DataFrame(preds, columns = ["prime_tot_ttc_preds"])
		#print preds
		#print y_cv
		# mape
		mape_r = mape(y_cv, preds)
		# print
		print "MAPE of (fold %d/%d) of %s is : %f" % (i+1 , n, name, mape_r)
		# time...
		print("RandomizedSearchCV took %.2f seconds for %d candidates"
		      " parameter settings." % ((time() - start), n_iter_search))
		report(random_search.grid_scores_)
	# predict on test
	predict_res = random_search.predict(sample_t)
	preds_on_test = DataFrame(list(zip(sample_id, predict_res)), columns = ["ID", "CODIS"])
	preds_on_test['ID'] = preds_on_test['ID'].astype(int)
	# save predictions
	store_csv(preds_on_test, name + ".csv")
	return predict_res
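build_sample targets the pre-0.18 scikit-learn API (`sklearn.cross_validation`, `grid_scores_`). On 0.18+ the same pieces live in `sklearn.model_selection`, and results are exposed as the `cv_results_` dict; a hedged sketch of the newer idiom on synthetic data:

# Sketch, assuming scikit-learn >= 0.18: grid_scores_ became cv_results_.
import pandas as pd
from scipy.stats import randint as sp_randint
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split

sample_X, sample_y = make_regression(n_samples=200, n_features=11, random_state=0)
X_train, X_cv, y_train, y_cv = train_test_split(sample_X, sample_y, test_size=.10, random_state=0)
rs = RandomizedSearchCV(RandomForestRegressor(random_state=0),
                        param_distributions={"max_depth": [3, None],
                                             "max_features": sp_randint(1, 11)},
                        n_iter=5, cv=3, random_state=0)
rs.fit(X_train, y_train)
print(rs.score(X_cv, y_cv))  # holdout R^2 of the refit best model
print(pd.DataFrame(rs.cv_results_)[["mean_test_score", "params"]].head())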
Example #3
class CVSearcher(SearcherBase):
    '''
    Cross validation searcher is not specific for time series
    '''

    def __init__(self, sklearn_model_class, params, scoring=None, method=None,
                 n_randomized_search=200, cv=5):
        super(CVSearcher, self).__init__(sklearn_model_class, params, method=method,
                                         n_randomized_search=n_randomized_search,
                                         cv=cv, scoring=scoring)

    def fit(self, X, Y):
        if self.method == 'Grid':
            self.__searcher = GridSearchCV(estimator=self.ml_class(), param_grid=self.search_space,
                                           scoring=self.scoring, cv=self.cv, refit=True)
        elif self.method == 'Randomized' or self.method is None:
            self.__searcher = RandomizedSearchCV(estimator=self.ml_class(), param_distributions=self.search_space,
                                                 scoring=self.scoring,
                                                 n_iter=self.n_randomized_search, cv=self.cv, refit=True)
        else:
            raise ValueError('CVSearcher only support GridSearch and RandomizedSearch')
        self.__searcher.fit(X, Y)
        print("Best: %s" % (self.__searcher.best_estimator_))
        return self

    def predict(self, X):
        return self.__searcher.predict(X)

    def get_scores(self):
        return self.__searcher.grid_scores_
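A hypothetical usage sketch of CVSearcher, assuming SearcherBase simply stores the constructor arguments as the `ml_class`, `search_space`, `scoring`, `cv`, and `method` attributes referenced in fit():

# Hypothetical usage; SearcherBase is defined elsewhere in the source.
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, Y = make_classification(n_samples=100, random_state=0)
searcher = CVSearcher(SVC, {'C': [0.1, 1, 10]}, scoring='accuracy', method='Grid')
searcher.fit(X, Y)
print(searcher.predict(X[:5]))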
Example #4
def Decision_tree(Xtrain, Ytrain, Xtest):

    tuned_parameters = {
        'splitter': ['best', 'random'],
        "max_features": ["log2", "sqrt"],
        'min_samples_split': np.arange(30, 60, 5),
        'min_samples_leaf': np.arange(7, 14),
        'max_depth': np.arange(700, 1389, 10)
    }
    """Randomized optimizationSearch which used cross validation to optimized best parameters for the estimator. 
    In contrast to GridSearchCV, not all parameter values are tried out, 
    but rather a fixed number of parameter settings is sampled from the specified distributions.
    The number of parameter settings that are tried is given by n_iter.
    """
    Multreg = RandomizedSearchCV(DecisionTreeRegressor(random_state=0),
                                 param_distributions=tuned_parameters,
                                 cv=10,
                                 n_iter=int(args[1]),
                                 n_jobs=-1,
                                 random_state=0)

    #Fitting decision tree model
    Multreg.fit(Xtrain, Ytrain)
    #Predicting with unseen testing set
    YMultreg = Multreg.predict(Xtest)
    # save the model to disk
    filename = 'finalized_DC.sav'
    pickle.dump(Multreg, open(filename, 'wb'))
    return YMultreg
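A companion sketch for reloading the model persisted above; the filename matches the one written by Decision_tree():

# Reload the fitted search; predict() delegates to the refit best estimator.
import pickle
with open('finalized_DC.sav', 'rb') as f:
    loaded_model = pickle.load(f)
# y_pred = loaded_model.predict(Xtest)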
def parameter_tuning_random(model,
                           params,
                           scores,
                           X_train,
                           Y_train,
                           X_test,
                           Y_test,
                           n_iter_search=10):
    """
    """
    for score in scores:
        log("# Tuning hyper-parameters for %s: " % score)
        log("", False)
        rnd_tune = RandomizedSearchCV(model,
                                      params,
                                      n_iter=n_iter_search,
                                      cv=5,
                                      scoring=score)
        rnd_tune.fit(X_train, Y_train)

        log("Best parameters set found on development set:")
        log(str(rnd_tune.best_params_), False)
        log("random search score _ TEST set:")
        log(str(rnd_tune.score(X_test, Y_test) * 100), False)
        log("", False)
        log("random search scores on development set:")
        log(str(rnd_tune.grid_scores_), False)
        log("", False)
        log("Detailed classification report:")
        log("", False)
        y_true, y_pred = Y_test, rnd_tune.predict(X_test)
        log(classification_report(y_true, y_pred), False)
        log("", False)
def main():
    data = pd.read_csv(args.dataset)
    X = data.drop(['Id', 'Class'], axis=1)
    Y = data.loc[:, 'Class']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
    estimator = [('reduce_dim', SelectFromModel(RandomForestClassifier())), ('classifier', XGBClassifier())]
    # build "<scale>*median" threshold strings for SelectFromModel
    tmp = map(str, np.arange(args.threshold[0], args.threshold[1], args.threshold[2]))
    threshold = [x + '*median' for x in tmp]
    clf = Pipeline(estimator)
    params = {}
    params['reduce_dim__estimator__n_estimators'] = list(np.arange(args.components[0], args.components[1], args.components[2]))
    params['reduce_dim__threshold'] = threshold
    params['classifier__n_estimators'] = list(np.arange(args.num_tree[0], args.num_tree[1], args.num_tree[2]))
    params['classifier__max_depth'] = list(np.arange(args.depths[0], args.depths[1], args.depths[2]))
    params['classifier__learning_rate'] = list(np.arange(args.lr[0], args.lr[1], args.lr[2]))
    params['classifier__subsample'] = list(np.arange(args.subsample[0], args.subsample[1], args.subsample[2]))
    params['classifier__colsample_bytree'] = list(np.arange(args.colsample[0], args.colsample[1], args.colsample[2]))
    # Cross_validation for grid search
    try:
        grid_search = RandomizedSearchCV(clf, param_distributions=params, n_iter=args.iter, cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
    except Exception:  # e.g. n_iter larger than the discrete grid; fall back to GridSearchCV
        grid_search = GridSearchCV(clf, param_grid=params, cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
    best_parameters, score, _ = max(grid_search.grid_scores_, key=lambda x: x[1])
    result = accuracy_score(y_test, grid_search.predict(X_test))
    print("Predict Accuracy: " + str(result))
    print("XGboost using raw pixel features:\n%s\n" % (metrics.classification_report(y_test, grid_search.predict(X_test))))
    print(best_parameters)
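The `'<scale>*median'` strings built in main() exploit the fact that SelectFromModel accepts threshold expressions such as `'0.5*median'`. A minimal sketch of that mechanism in isolation (synthetic data, illustrative values):

# Sketch: string thresholds scale the median (or mean) feature importance.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=100, n_features=20, random_state=0)
selector = SelectFromModel(RandomForestClassifier(n_estimators=50, random_state=0),
                           threshold='0.5*median').fit(X, y)
print(selector.transform(X).shape)  # keeps features above half the median importance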
Example #7
def K_NN(Xtrain, Ytrain, Xtest):

    KNNoptparam = {
        "n_neighbors": np.arange(20, 200, 10),
        "weights": ['uniform', 'distance'],
        "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
        #"leaf_size": np.arange(30, 150, 15),
        "p": [2, 3]
    }

    #Randomized search parameter optimization
    RF1 = RandomizedSearchCV(KNeighborsRegressor(),
                             param_distributions=KNNoptparam,
                             cv=10,
                             n_iter=int(args[1]),
                             n_jobs=-1,
                             random_state=0)

    RF1.fit(Xtrain, Ytrain)
    #Predicting using unseen data
    KNN_predict = RF1.predict(Xtest)
    # save the model to disk
    filename = 'finalized_KNN.sav'
    pickle.dump(RF1, open(filename, 'wb'))
    return KNN_predict
def parameter_tuning(Xn, yn, scale=1):
    
    # FEATURE SELECTION  
    print(Xn.shape)
    print(yn.shape)
    
    # FEATURE SCALING    
    if scale == 1:
        Xn = preprocessing.scale(Xn, with_mean=True)
        print('NORMALIZING')
    elif scale == 2:
        Xn = preprocessing.scale(Xn, with_mean=False)
        print('NORMALIZING')
        
    tuned_parameters = [{'kernel': ['rbf'], 'C': np.logspace(-2, 7, 10),
                         'gamma': np.logspace(-4, 2, 7)}]
    
    tuned_parameters2 = {'kernel': ['rbf'], 'C': np.logspace(-2, 7, 10),
                         'gamma': np.logspace(-4, 2, 7)}
    
    linear_parameters = [{'kernel': ['linear'], 'C': np.logspace(-3, 4, 8)}]
    
    linear_parameters2 = {'kernel': ['linear'], 'C': np.logspace(-3, 4, 8)}
    

    cv = cross_validation.StratifiedKFold(yn,shuffle=True, n_folds=3, random_state=42)

    if RBF:
        clf = RandomizedSearchCV(estimator=SVC(C=1, cache_size=1000), param_distributions=tuned_parameters2, cv=cv, scoring='accuracy', n_iter=30, verbose=1, n_jobs=2).fit(Xn, yn)

        print("Best parameters set found on development set:")
        print
        print(clf.best_estimator_)
        print(clf.best_score_)
        print()
        print confusion_matrix(yn, clf.predict(Xn))

    if LINEAR:
        clf = GridSearchCV(estimator=SVC(C=1, cache_size=1000), param_grid=linear_parameters, cv=cv, scoring='accuracy', verbose=1, n_jobs=2).fit(Xn, yn)


        print("Best parameters set found on development set:")
        print
        print(clf.best_estimator_)
        print(clf.best_score_)
        print()
        print confusion_matrix(yn, clf.predict(Xn))
def svm_tuning(features_train,labels_train,features_test,labels_test,kernel="rbf",C=[],gamma=[],randomized=False,i=50):
    if not C:
        C = list(range(1, 50))
    if not gamma:
        gamma = list(range(1, 50))
    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(features_train, labels_train, test_size=0.25, random_state=0)

    if randomized:
        if kernel == "linear":
            tuned_parameters = {"C":C}
            clf =  RandomizedSearchCV(svm.LinearSVC(class_weight="balanced"), param_distributions=tuned_parameters, cv=5,scoring="accuracy",n_iter=i)
        elif kernel == "rbf":
            tuned_parameters = {'C': C, 'gamma': gamma}
            clf = RandomizedSearchCV(svm.SVC(kernel="rbf",cache_size=1000,class_weight="balanced"), param_distributions=tuned_parameters, cv=5,scoring="accuracy",n_iter=i)
        elif kernel == "logistic":
            tuned_parameters = {"C":C}
            clf =  RandomizedSearchCV(linear_model.LogisticRegression(), param_distributions=tuned_parameters, cv=5,scoring="accuracy",n_iter=i)
    else:
        if kernel == "linear":
            tuned_parameters = [{'C': C}]
            clf = GridSearchCV(svm.LinearSVC(class_weight="balanced"), tuned_parameters, cv=5,scoring="accuracy")
        elif kernel == "rbf":
            tuned_parameters = [{'C': C, 'gamma': gamma}]
            clf = GridSearchCV(svm.SVC(kernel="rbf",cache_size=1000,class_weight="balanced"), tuned_parameters, cv=5,scoring="accuracy")
        elif kernel == "logistic":
            tuned_parameters = [{'C': C}]
            clf = GridSearchCV(linear_model.LogisticRegression(), tuned_parameters, cv=5,scoring="accuracy")

    clf.fit(X_train, y_train)

    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
    print("Best parameters set found on development set:")
    print(clf.best_params_)

    y_true, y_pred = y_test, clf.predict(X_test)
    #print(classification_report(y_true, y_pred))
    print(measures.avgF1(np.array(y_true),y_pred,0,1))

    print("FINAL C:")
    bestC = clf.best_params_["C"]
    if kernel == "linear":
        model = SVM.train(features_train,labels_train,c=bestC,k="linear")
    elif kernel == "rbf":
        bestGamma = clf.best_params_["gamma"]
        model = SVM.train(features_train,labels_train,c=bestC,g=bestGamma,k="rbf")
    elif kernel == "logistic":
        model = LogisticRegression.train(features_train,labels_train,c=bestC)


    prediction = SVM.predict(features_test,model)
    print(measures.avgF1(labels_test,prediction,0,1))
    print(" ")
    if kernel == "rbf":
        return [bestC,bestGamma]
    else:
        return bestC
Example #10
    def buildRandomForest(self,
                          X_train,
                          X_test,
                          y_train,
                          cv=3,
                          n_iter=5,
                          save=False):
        rf = RandomForestClassifier(random_state=9)
        #Tune the model
        param_distributions = {
            'n_estimators': range(1, 50, 1),
            'max_depth': range(1, 70, 1),
            'max_features': range(6, 15, 1),
            'min_samples_split': [2, 3, 4],
            'min_samples_leaf': [1, 2, 3, 4],
            'n_jobs': [-1]
        }

        rf_optimized = RandomizedSearchCV(
            estimator=rf,
            param_distributions=param_distributions,
            n_iter=n_iter,
            scoring='f1',
            cv=cv,
            random_state=1)

        rf_optimized.fit(X_train, y_train)
        if save:
            joblib.dump(value=rf_optimized,
                        filename="rf_optimized.pkl",
                        compress=1)

        print "Best parameter: %s" % rf_optimized.best_params_
        print "Best average cross validated F1 score: %0.4f" % rf_optimized.best_score_
        print "--------------------------------------------"

        #predictions
        predicted_y_train = rf_optimized.predict(X_train)
        predicted_y_test = rf_optimized.predict(X_test)

        return predicted_y_train, predicted_y_test
    def best_RandomForest(self, df=pd.DataFrame(),
                          flag_interactions=False,
                          flag_clean_features=False,
                          impute_func=None,
                          fill_test_func=None):

        if df.empty:
            df = self.df
        if impute_func:
            print('imputing data...')
            df, self.df_X_realtest = self.impute_data(df,
                                                    self.df_X_realtest,
                                                    impute_func,
                                                    fill_test_func)

        print('get X, y from training set')
        (self.X, self.y) = self.ready_for_model_train(
                                    df, flag_interactions=flag_interactions,
                                    flag_clean_features=flag_clean_features)


        clf = RandomForestClassifier(bootstrap=False)

        grid = {'n_estimators': sp_randint(170, 350),
                'min_samples_leaf': sp_randint(1, 12),
                'max_features': sp_randint(2, 50),
                'max_depth': sp_randint(5, 30),
                'criterion': ['entropy','gini']}

        clf_rfc = RandomizedSearchCV(clf, n_jobs=4,
                                    n_iter=25, cv=6,
                                    param_distributions=grid,
                                    scoring='accuracy')

        print('Finding the best parameters...')
        clf_rfc.fit(self.X, self.y.ravel())

        print('preparing X, y from test set...')
        X_test, y_test = self.ready_for_model_test(
            self.df_X_realtest, flag_interactions)

        y_hat = clf_rfc.predict(X_test)


        print('Best Params: \n')
        for k, v in clf_rfc.best_params_.items():
            print(k, v)

        print("Accuracy with Random Forest = %4.4f"  %
            accuracy_score(y_test, y_hat))
        #binarize_y_confustion_matrix(y_test, y_hat)
        return(clf_rfc.best_params_)
    def best_XGboost(self, df=pd.DataFrame(),
                     flag_interactions=False,
                     flag_clean_features=False,
                     impute_func=None,
                     fill_test_func=None):

        if df.empty:
            df = self.df
        if impute_func:
            print('imputing data...')
            df, self.df_X_realtest = self.impute_data(df,
                                                    self.df_X_realtest,
                                                    impute_func,
                                                    fill_test_func)

        print('get X, y from training set')
        (self.X, self.y) = self.ready_for_model_train(
                                    df, flag_interactions=flag_interactions,
                                    flag_clean_features=flag_clean_features)


        clf = XGBClassifier()

        grid = {'n_estimators': sp_randint(100, 600),
                'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.3, 0.5],
                'max_depth': sp_randint(5, 30),
                'min_child_weight': sp_randint(1, 5)}

        clf_rfc = RandomizedSearchCV(clf, n_jobs=3,
                                    n_iter=15, cv=4,
                                    param_distributions=grid,
                                    scoring='accuracy')

        print('Finding the best parameters...')
        clf_rfc.fit(self.X, self.y.ravel())

        print('preparing X, y from test set...')
        X_test, y_test = self.ready_for_model_test(
            self.df_X_realtest, flag_interactions)

        y_hat = clf_rfc.predict(X_test)


        print('Best Params: \n')
        for k, v in clf_rfc.best_params_.items():
            print(k, v)

        print("Accuracy with Random Forest = %4.4f"  %
            accuracy_score(y_test, y_hat))
        #binarize_y_confustion_matrix(y_test, y_hat)
        return(clf_rfc.best_params_)
Example #13
def run_grid_search(m, parameters, params, name, Xtrain, Ytrain, Xtest, Ytest):
	print('=' * 80)
	print("Training %s Model" % name)
	print('=' * 80)
	t0 = time()

	clf = RandomizedSearchCV(m, parameters, cv=3, n_jobs=4, verbose=3, error_score=0)
	clf.fit(Xtrain, Ytrain)
	Yhat = clf.predict(Xtest)
	print("\tDone in %1.2f seconds" % float(time() - t0))
	print("\tScore: %1.2f\n" % mse(Yhat, Ytest))

	print("Best Parameters" + str(clf.best_params_))
	print("Writing Solution")
	submit = pd.DataFrame(data={'id': ids, 'quality': Yhat})
	submit.to_csv('./submissions/'+name+'.csv', index = False)
Example #14
def optimize_svr(X_total_train, Y_train, X_total_test, Y_test, n_iter_search):
    svr = SVR()
    # params = [
    #     {'C': scipy.stats.expon(scale=1e-4), 'gamma': scipy.stats.expon(scale=1e-2), 'kernel' : ['rbf']},
    #     {'C': scipy.stats.expon(scale=1e-4), 'degree': [2, 3, 4, 5, 6], 'kernel' : ['poly']},
    #     {'C': scipy.stats.expon(scale=1e-4), 'kernel': ['linear']}
    # ]
   # params = {'C': scipy.stats.expon(scale=1e-4), 'degree': [1,2,3], 'kernel' : ['poly']}
    params = {'C': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10], 'degree': [1, 2, 3], 'kernel': ['poly']}

    
    random_search = RandomizedSearchCV(svr, param_distributions=params, n_iter=n_iter_search)
    random_search.fit(X_total_train, Y_train)
    result = random_search.predict(X_total_test)
    
    mse = metrics.mean_squared_error(result, Y_test)
    # hyperparams = random_search.best_params_
    return mse, random_search
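The commented-out blocks above sample C from scipy.stats.expon; with scipy >= 1.4, a log-uniform prior is often a more natural way to span the several decades of C enumerated in the explicit list. A sketch of that alternative:

# Sketch, assuming scipy >= 1.4: continuous log-uniform prior over C.
from scipy.stats import loguniform
params = {'C': loguniform(1e-10, 1e-1), 'degree': [1, 2, 3], 'kernel': ['poly']}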
Example #15
def Random_forest(Xtrain, Ytrain, Xtest):
    grid = {
        "n_estimators": np.arange(100, 1200, 50),
        "max_features": ["log2", "sqrt", "auto"],
        "max_depth": np.arange(20, 200, 10),
        "min_samples_leaf": np.arange(3, 50, 5)
    }

    #Randomized search parameter optimization
    RF = RandomizedSearchCV(RandomForestRegressor(random_state=0, oob_score=0),
                            param_distributions=grid,
                            cv=15,
                            n_iter=int(args[1]),
                            n_jobs=-1,
                            random_state=0)
    RF.fit(Xtrain, Ytrain)
    #Predicting using unseen data
    RF_predict = RF.predict(Xtest)
    # save the model to disk
    filename = 'finalized_RF.sav'
    pickle.dump(RF, open(filename, 'wb'))
    return RF_predict
Example #16
    def best_RandomForest(self, df=pd.DataFrame()):

        if df.empty:
            df = self.df

        self.df_train, self.df_test = self.split_df(df)

        X_train, y_train = self.ready_for_model_train(self.df_train)
        X_test, y_test = self.ready_for_model_test(self.df_test)

        clf = RandomForestClassifier(bootstrap=False)

        grid = {
            'n_estimators': sp_randint(250, 400),
            #'min_samples_leaf': sp_randint(1, 12),
            'max_features': sp_randint(5, 50),
            'max_depth': sp_randint(5, 30)
        }

        clf_rfc = RandomizedSearchCV(clf,
                                     n_jobs=4,
                                     n_iter=15,
                                     param_distributions=grid,
                                     scoring='accuracy')

        print("Finding the best parameters..")

        clf_rfc.fit(X_train, y_train.ravel())
        print("Getting predicts for..")
        y_hat = clf_rfc.predict(X_test)

        print('Best Params: \n')
        for k, v in clf_rfc.best_params_.items():
            print(k, v)

        print("Accuracy with Random Forest = %4.4f" %
              accuracy_score(y_test.ravel(), y_hat))
        #binarize_y_confustion_matrix(y_test, y_hat)
        return (clf_rfc.best_params_)
Example #17
def EXT_tree(Xtrain, Ytrain, Xtest):
    grid2 = {
        "n_estimators": np.arange(100, 1200, 50),
        "max_features": ["log2", "sqrt", "auto"],
        "max_depth": np.arange(20, 200, 10),
        "min_samples_leaf": np.arange(3, 50, 5)
    }

    RF3 = RandomizedSearchCV(ExtraTreesRegressor(random_state=0, oob_score=0),
                             param_distributions=grid2,
                             cv=15,
                             n_iter=int(args[1]),
                             n_jobs=-1,
                             random_state=0)
    #Model fitting
    RF3.fit(Xtrain, Ytrain)
    #Predicting using unseen data
    EXT_predict = RF3.predict(Xtest)
    # save the model to disk
    filename = 'finalized_EXT.sav'
    pickle.dump(RF3, open(filename, 'wb'))
    return EXT_predict
Example #19
def make_prediction(pipe, X_train, y_train, X_test):
    """
    Assesses the model with n_iter different sets of parameters through cross-validation, choose the best one, train
    it on the train data and predicts on the test data.
    :param pipe: main pipeline, output of prepare_pipeline()
    :param X_train: training dataset, output of prepare_dataset(raw_train)
    :param y_train: target column of the training set
    :param X_test: testing dataset, output of prepare_dataset(raw_test)
    :return: pandas dataframe with two features: PassengerId and Survived (prediction for the test set)
    """
    param_grid = {'svc__C': stats.uniform(loc=0, scale=10),
                  'svc__decision_function_shape': [None, 'ovo', 'ovr'],
                  'svc__shrinking': [True, False]
                  }
    rand = RandomizedSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_iter=100)

    # We fit to the train sets
    rand.fit(X_train, y_train)
    print('Estimated accuracy: {:.1f} %'.format(rand.best_score_*100))

    output = pd.DataFrame({'PassengerId': X_test.index, 'Survived': rand.predict(X_test)})

    return output
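The `svc__` prefixes in param_grid follow the Pipeline convention `<step name>__<parameter>`. A hedged sketch of a pipe compatible with make_prediction; prepare_pipeline() in the original source presumably returns something similar:

# Hypothetical pipeline matching the 'svc__*' parameter names above.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])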
train = pd.read_csv("./Desktop/schiz/concat_train/trainconcat.csv")
test  = pd.read_csv("./Desktop/schiz/concat_test/testconcat.csv")
train_features = train.iloc[:, 1:411] #train data features
train_label = train["Class"] #train data labels
#test = (test - test.mean()) / (test.max() - test.min())
train_features = (train_features - train_features.mean()) / (train_features.max() - train_features.min())
features = list(train.columns[1:411]) #list of train features
label = list(train["Class"])
print("Preprocessing data")
param_distributions = {'C': expon()}
logreg = LogisticRegression(penalty='l2', C=1.0, fit_intercept=True, solver='liblinear')
clf = RandomizedSearchCV(logreg, param_distributions=param_distributions, n_iter=10000)
clf.fit(train_features, label)


scores = cross_validation.cross_val_score(clf,train_features,label,cv=2,scoring='roc_auc')
print(scores)

#def get_score(clf, train_features, train_label):
#    X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_features, train_label, test_size=0.12, random_state=0)
#    clf.fit(X_train, y_train)
#    print clf.score(X_test, y_test) 

print("Training Logistic Regression")

test_feature = test[features]
print("Make predictions on the test set")
test_probs = clf.predict_proba(test_feature)[:, 1]  # probabilities, matching the column name below
submission = pd.DataFrame({"id": test["Id"], "probability": test_probs})
submission.to_csv("rf_xgboost_submission.csv", index=False)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Example #21
def XGB_Regressor(Train_DS, y, Actual_DS, Sample_DS, Parms_DS_XGB, Grid, Ensemble):

    print("***************Starting xgb Regressor (sklearn)***************")
    t0 = time()

    n_iter_search = 500

    Train_DS, y = shuffle(Train_DS, y, random_state=21)

    if Grid:
        # used for checking the best performance for the model using hyper parameters
        print("Starting model fit with Grid Search")

        # specify parameters and distributions to sample from
        param_dist = {
            "n_estimators": [10],
            "max_depth": sp_randint(1, 25),
            "min_child_weight": sp_randint(1, 25),
            "subsample": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
            "colsample_bytree": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
            "silent": [True],
            "gamma": [0.5, 0.6, 0.7, 0.8, 0.9, 1, 2],
        }

        clf = xgb.XGBRegressor(nthread=4)

        # run randomized search

        clf = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search, scoring=gini_scorer, cv=10)

        start = time()
        clf.fit(Train_DS, y)

        print(
            "RandomizedSearchCV took %.2f seconds for %d candidates"
            " parameter settings." % ((time() - start), n_iter_search)
        )

        Parms_DS_Out = report(clf.grid_scores_, n_top=n_iter_search)
        Parms_DS_Out.to_csv(file_path + "Parms_DS_XGB_1001.csv")

        Parms_DS_XGB = Parms_DS_Out

        print("Best estimator found by grid search:")
        print(clf.best_estimator_)

        # Predict actual model
        pred_Actual = clf.predict(Actual_DS)
        print("Actual Model predicted")

        # Get the predictions for actual data set
        preds = pd.DataFrame(pred_Actual, index=Sample_DS.Id.values, columns=Sample_DS.columns[1:])
        preds.to_csv(file_path + "output/Submission_Roshan_XGB_1.csv", index_label="Id")

    if Ensemble:

        print("Starting ensembling")
        Ensemble_DS = pd.DataFrame()

        for i in range(20):
            scores = []
            clf = xgb.XGBRegressor(
                n_estimators=2000,
                max_depth=Parms_DS_XGB["max_depth"][i],
                learning_rate=0.01,
                nthread=4,
                min_child_weight=Parms_DS_XGB["min_child_weight"][i],
                subsample=Parms_DS_XGB["subsample"][i],
                colsample_bytree=Parms_DS_XGB["colsample_bytree"][i],
                silent=True,
                gamma=Parms_DS_XGB["gamma"][i],
            )

            clf.fit(Train_DS, y)

            Nfold_score = Nfold_Cross_Valid(Train_DS, y, clf)
            # scores.append(Nfold_score)
            # print(" %d-iteration... %s " % (i+1,scores))

            pred_Actual = clf.predict(Actual_DS)
            Ensemble_DS[i] = pred_Actual
            print(" %d - Model Completed..." % (i + 1))

        Ensemble_DS.to_csv(file_path + "Ensemble_DS_XGB_1.csv")

    if not Grid and not Ensemble:
        # CV:0.38604935169439381, LB:0.382479
        # CV:0.38614992702270973 (with std scaler)
        # clf = xgb.XGBRegressor(n_estimators=1000,max_depth=7,learning_rate=0.01,nthread=2,min_child_weight=5,
        #                         subsample=0.8,colsample_bytree=0.8,silent=True,gamma=1)

        # CV:0.38540501304758473
        # clf = xgb.XGBRegressor(n_estimators=1000,max_depth=8,learning_rate=0.01,nthread=4,min_child_weight=5,
        #                         subsample=0.8,colsample_bytree=0.8,silent=True,gamma=1)

        # CV:0.38672594800194787
        clf = xgb.XGBRegressor(
            n_estimators=2000,
            max_depth=6,
            learning_rate=0.01,
            nthread=4,
            min_child_weight=15,
            subsample=1,
            colsample_bytree=0.5,
            silent=True,
            gamma=0.8,
        )

        # CV : 0.38594904255042506)
        # clf = xgb.XGBRegressor(n_estimators=1000,max_depth=5,learning_rate=0.02,nthread=4,min_child_weight=1,
        #                         subsample=1,colsample_bytree=0.9,silent=True,gamma=1)
        #
        # CV :  0.38335661759105549 , 0.3877 in 2000 iter
        # clf = xgb.XGBRegressor(n_estimators=2000,max_depth=5,learning_rate=0.01,nthread=4,min_child_weight=19,
        #                         subsample=1,colsample_bytree=0.3,silent=True,gamma=0.6)

        # CV :  0.3850 in 1000 , 0.3877 in 2000 iter
        clf = xgb.XGBRegressor(
            n_estimators=2000,
            max_depth=5,
            learning_rate=0.01,
            nthread=4,
            min_child_weight=20,
            subsample=0.8,
            colsample_bytree=0.4,
            silent=True,
            gamma=0.6,
        )

        Nfold_score = Nfold_Cross_Valid(Train_DS, y, clf)
        clf.fit(Train_DS, y)

        # Predict actual model
        pred_Actual = clf.predict(Actual_DS)
        print("Actual Model predicted")

        # Get the predictions for actual data set
        preds = pd.DataFrame(pred_Actual, index=Sample_DS.Id.values, columns=Sample_DS.columns[1:])
        preds.to_csv(file_path + "output/Submission_Roshan_XGB_1.csv", index_label="Id")

    print("***************Ending xgb Regressor (sklearn)***************")
    return pred_Actual
Example #22
#y_test = y_test[:100]

tuned_parameters = {
    'kernel': ['rbf'],
    'gamma': expon(scale=.1),
    'C': expon(scale=100)
}

clf = RandomizedSearchCV(SVC(C=1),
                         tuned_parameters,
                         cv=5,
                         scoring='accuracy',
                         n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.best_estimator_)
pred = clf.predict(X_test)
pred_label = le.inverse_transform(pred)

submission = DataFrame({'label': pred_label},
                       columns=['label'],
                       index=np.arange(1,
                                       len(pred) + 1))
# one-hot indicator columns per class (boolean masks avoid the length
# mismatch of assigning full-length np.ones arrays to masked subsets)
for k in range(1, 6):
    submission['Class_%d' % k] = (pred == k).astype(int)
pipe = Pipeline(steps=[('pca', pca), ('rbfSVM', rbfSVM)])


param_dist={
	"pca__n_components":sp_randint(10,700),
	"rbfSVM__C": scipy.stats.expon(scale=10),
	"rbfSVM__kernel": ["rbf"], 
	"rbfSVM__gamma": scipy.stats.expon(scale=0.01)
}
n_iter_search = 500
random_search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=n_iter_search, cv=cv, verbose=6, n_jobs=4)


random_search.fit(X_train, Y_train)
predicted_held_out = random_search.predict(X_test)
mmat = confusion_matrix(predicted_held_out, Y_test)
print(mmat)
class_map = dict(zip(set(input_kmers_counts["class"]), range(0, 4)))
kappa([class_map[x] for x in Y_test], [class_map[x] for x in predicted_held_out])


# Examine how the number of components varies among the best CV scores
all_scores = random_search.grid_scores_
all_scores.sort(key=lambda x: x.mean_validation_score)
with open("random_search_scores_1000iter_5mers.bdat", "wb") as f:
	cPickle.dump(all_scores, f)

# We generate a pandas data.frame with the results 
import pandas
Example #24
def RFC_Classifier(Train_DS, y, Actual_DS, Sample_DS, grid):

    print("***************Starting RFC Classifier***************")

    t0 = time()

    if grid:

        #use SVD (similar to PCA)
        svd = TruncatedSVD( algorithm='randomized', n_iter=5, random_state=None, tol=0.0)

        # Initialize the standard scaler
        scl = StandardScaler(copy=True, with_mean=True, with_std=True)

        #used for checking the best performance for the model using hyper parameters
        print("Starting model fit with Grid/Random Search")

        RFC_model = RandomForestClassifier(n_estimators=500, n_jobs=-1)

        # Create the pipeline
        clf = pipeline.Pipeline([('svd', svd),
                                 ('scl', scl),
                                 ('RFC', RFC_model)])

        # specify parameters and distributions to sample from
        param_dist = {
                      "svd__n_components" : [200,300,400,500,600,700],
                      "max_depth": [1, 2, 3, 4, 5, None],
                      "max_features": sp_randint(1, 40),
                      "min_samples_split": sp_randint(1, 20),
                      "min_samples_leaf": sp_randint(1, 20),
                      "bootstrap": [True, False]
                     }


        # clf = GridSearchCV(estimator = clf, param_grid=param_dist, scoring=kappa_scorer,
        #                              verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

        # run randomized search
        n_iter_search = 1000
        clf = RandomizedSearchCV(clf, param_distributions=param_dist,
                                           n_iter=n_iter_search, scoring = kappa_scorer,cv=10)

        start = time()
        clf.fit(Train_DS, y)

        print("RandomizedSearchCV took %.2f seconds for %d candidates"
                 " parameter settings." % ((time() - start), n_iter_search))
        report(clf.grid_scores_)

        print("Best estimator found by grid search:")
        print(clf.best_estimator_)
        print(clf.grid_scores_)
        print(clf.best_score_)
        print(clf.best_params_)
        print(clf.scorer_)

    else:

        #Setting singular value decomposition
        # svd = TruncatedSVD(n_components=500,algorithm='randomized', n_iter=5, random_state=None, tol=0.0)
        # svd.fit(Train_DS)
        # Train_DS = svd.transform(Train_DS)
        # Actual_DS = svd.transform(Actual_DS)
        #
        # #Setting Standard scaler for data
        # stdScaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        # stdScaler.fit(Train_DS,y)
        # Train_DS = stdScaler.transform(Train_DS)
        # Actual_DS = stdScaler.transform(Actual_DS)

        clf = RandomForestClassifier(n_jobs=-1, n_estimators=500, min_samples_split=2)
        clf = CalibratedClassifierCV(base_estimator=clf, method='sigmoid')

        Nfold_score = Nfold_Cross_Valid(Train_DS, y, clf)

        clf.fit(Train_DS, y)

    #Predict actual model
    pred_Actual = clf.predict(Actual_DS)
    print("Actual Model predicted")

    #Get the predictions for actual data set
    preds = pd.DataFrame(pred_Actual, index=Sample_DS.id.values, columns=Sample_DS.columns[1:])
    preds.to_csv(file_path+'output/Submission_Roshan_RFC.csv', index_label='id')

    print("***************Ending RFC Classifier***************")
    return pred_Actual
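The non-grid branch wraps the forest in CalibratedClassifierCV so that predict_proba returns calibrated probabilities. A minimal standalone sketch of that wrapper on synthetic data:

# Sketch: sigmoid (Platt) calibration around a random forest.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=300, random_state=0)
calibrated = CalibratedClassifierCV(RandomForestClassifier(n_estimators=50, random_state=0),
                                    method='sigmoid')
calibrated.fit(X, y)
print(calibrated.predict_proba(X[:3]))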
Example #25
#rs.fit(a_in, a_out)    
if len(X_train) != len(y_train):
    sys.stderr.write("Number of samples and number of labels do not match.")
    exit()

for t in range(N):
    crash = True
    while(crash):
        try:
            rs.fit(X_train, y_train)
            crash = False
        except RuntimeError:
            sys.stderr.write("--------------------- [Crashed by RunTimeERROR. restarting] --------------------- \n")
            crash = True
    
    sys.stderr.write("Best Parameters: %s, score: %s\n" % (str(rs.best_params_), str(rs.best_score_)))
    y_ = rs.predict(X_valid)
    y = []
    for o in y_:
        y.append(o[0])
    
    input = sys.argv[3].split("/")[-1].split(".")[0]
    y_out = {}
    y_out['estimated_output'] = y
    y_out['best_params'] = rs.best_params_
    y_out['best_score'] = rs.best_score_

    with open("nn_output_headlines_30_d2v_conv_300_m5.txt", "a") as f:
        f.write(str(y_out)+'\n')
Example #26
cv_data = train_data[:temp, :]
train_data2 = train_data[temp:, :]


forest = RandomForestClassifier(n_estimators = 25)
# run randomized search
n_iter_search = 30
random_search = RandomizedSearchCV(forest, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=6)

start = time()
random_search.fit(train_data2[:, 1:], train_data2[:, 0])
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)
train_output = random_search.predict(train_data2[:, 1:])
cv_output = random_search.predict(cv_data[:, 1:])
print("Training set accuracy: %.3f   CV set accuracy: %.3f"
      % (len(train_data2[train_output == train_data2[:, 0]])/float(len(train_data2)),
         len(cv_data[cv_output == cv_data[:, 0]])/float(len(cv_data))))



# Analyzing important features
forest = random_search.best_estimator_
feature_importance = forest.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())

feature_list = df.columns.values
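To pair the relative importances with their column names, a short follow-up sketch (it assumes feature_list aligns with the training columns, i.e. that the label column is excluded):

# Sketch: print the ten most important features, largest first.
import numpy as np
sorted_idx = np.argsort(feature_importance)[::-1]
for idx in sorted_idx[:10]:
    print("%-25s %6.1f" % (feature_list[idx], feature_importance[idx]))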
Example #27
    def run_prediction_random_gs_split(X_file, y_file, data_str):
        # fixme copied function
        def _pred_real_scatter(y_test, y_test_predicted, title_str, in_data_name):
            import os
            import pylab as plt
            from matplotlib.backends.backend_pdf import PdfPages

            plt.scatter(y_test, y_test_predicted)
            plt.plot([10, 80], [10, 80], 'k')
            plt.xlabel('real')
            plt.ylabel('predicted')
            ax = plt.gca()
            ax.set_aspect('equal')
            plt.title(title_str)
            plt.tight_layout()

            scatter_file = os.path.join(os.getcwd(), 'scatter_' + in_data_name + '.pdf')
            pp = PdfPages(scatter_file)
            pp.savefig()
            pp.close()
            return scatter_file

        import os, pickle
        import numpy as np
        from sklearn.svm import SVR
        from sklearn.cross_validation import cross_val_score, cross_val_predict, train_test_split
        from sklearn.grid_search import RandomizedSearchCV
        from sklearn.pipeline import Pipeline
        from sklearn.feature_selection import VarianceThreshold
        from sklearn.preprocessing import MinMaxScaler, StandardScaler
        from sklearn.preprocessing import Imputer
        from sklearn.feature_selection import SelectPercentile, f_regression
        from sklearn.metrics import mean_absolute_error, r2_score
        from sklearn.svm import LinearSVR
        from sklearn.decomposition import PCA
        from scipy.stats import randint as sp_randint
        from scipy.stats import expon

        X = np.load(X_file)
        y = np.load(y_file)

        # fixme add squared values to X
        # X = np.hstack([X, np.square(X)])

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

        # remove low variance features
        fill_missing = Imputer()
        var_thr = VarianceThreshold()
        normalize = StandardScaler()  # MinMaxScaler()
        selection = SelectPercentile(f_regression)

        # regression_model = LinearSVR() #SVR(kernel='linear')
        from sklearn.svm import NuSVR
        regression_model = LinearSVR()  # NuSVR(kernel='linear') #SVR(kernel='linear')

        pipe = Pipeline([
            ('fill_missing', fill_missing),
            ('var_thr', var_thr),
            ('normalize', normalize),
            ('selection', selection),
            ('regression_model', regression_model),
        ])


        param_dist = {
            'selection__percentile': sp_randint(10, 100),
            'regression_model__C': expon(scale=100), # sp_randint(.001, 14450),
            'regression_model__epsilon': sp_randint(0, 100),
            'regression_model__loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
        }


        # fixme njobs
        n_iter_search = 400
        gs = RandomizedSearchCV(pipe, param_distributions=param_dist, cv=5, scoring='mean_absolute_error', n_jobs=15, n_iter=n_iter_search)
        gs.fit(X_train, y_train)

        best_estimator = gs.best_estimator_

        grid_scores = gs.grid_scores_
        gs_file = os.path.join(os.getcwd(), 'gs_' + data_str + '.pkl')
        with open(gs_file, 'wb') as f:
            pickle.dump(grid_scores, f)

        sorted_grid_score = sorted(gs.grid_scores_, key=lambda x: x.mean_validation_score, reverse=True)
        score_str = [str(n) + ': ' + str(g) for n, g in enumerate(sorted_grid_score)]

        gs_text_file = os.path.join(os.getcwd(), 'gs_txt_' + data_str + '.txt')
        with open(gs_text_file, 'w') as f:
            f.write('\n'.join(score_str))

        # fitted_model = gs.steps[-1][1]

        # fixme pickle crashes
        model_out_file = ''
        # model_out_file = os.path.join(os.getcwd(), 'trained_model.pkl')
        # with open(model_out_file, 'w') as f:
        #     pickle.dump(gs, f)

        y_predicted = gs.predict(X_test)
        cv_scores = mean_absolute_error(y_test, y_predicted)
        cv_scores_r2 = r2_score(y_test, y_predicted)

        title_str = '{}\n mae: {:.3f}\n r2: {:.3f}'.format(
            data_str,
            cv_scores,
            cv_scores_r2)

        scatter_file = _pred_real_scatter(y_test, y_predicted, title_str, data_str)

        return model_out_file, scatter_file, gs_text_file, gs_file, best_estimator
Example #28
# run randomized search
n_iter_search = 40
random_search = RandomizedSearchCV(clf,
                                   param_distributions=QL_SVM_param_dist,
                                   n_iter=n_iter_search,
                                   cv=skf)
start = time()
random_search.fit(K_train, y_train)
print(
    "Quasi_linear kernel SVM RandomSearch took %.2f seconds for %d candidates"
    " parameter settings." % ((time() - start), n_iter_search))
print("Random_search Best estimator is :\n"), random_search.best_estimator_
report(random_search.grid_scores_, n_top=5)
# print the classification_report
y_test, y_pred = y_test, random_search.predict(K_test)
#Call predict on the estimator with the best found parameters.
print(classification_report(y_test, y_pred))
print()

# run grid search
grid_search = GridSearchCV(clf, param_grid=QL_SVM_param_dist, cv=skf)
start = time()
grid_search.fit(K_X, Y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings." %
      (time() - start, len(grid_search.grid_scores_)))
print("Grid_search Best estimator is :\n"), grid_search.best_estimator_
report(grid_search.grid_scores_, n_top=10)
# print the classification_report
y_test, y_pred = y_test, grid_search.predict(K_test)
print(classification_report(y_test, y_pred))
Example #29
#Start with data with age
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search,n_jobs=4, verbose=1)
random_search.fit(x_train_std,y_train)

print('Reporting')
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)
score = random_search.score(x_test_std, y_test)
print('Test score')
print(score)
print('Predicting')
output = random_search.predict(test_data_std)


#Finally with data without age
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search,n_jobs=4, verbose=1)
random_search.fit(x_train_std_noage,y_train_noage)

print('Reporting noage')
print("RandomizedSearchCV noage took %.2f seconds for %d candidates"
Example #30
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        pipeline = Pipeline([('data',
                              FeatureUnion([('audio', AudioLoader()),
                                            ('vad', VADLoader())])),
                             ('svm', SVC(kernel='rbf', gamma=1e-5, C=20))])
        paramdist = {
            'svm__C': np.logspace(0, 2, 50),
            'data__vad__stacksize': scipy.stats.randint(11, 51),
            'data__audio__stacksize': scipy.stats.randint(11, 51)
        }
        clf = RandomizedSearchCV(pipeline,
                                 paramdist,
                                 n_iter=500,
                                 verbose=1,
                                 cv=2,  # cv must be at least 2
                                 n_jobs=35)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        with open(
                path.join(data.BASEDIR,
                          'transcriber_rand_params_{0}.pkl'.format(monkey)),
                'wb') as fid:
            pickle.dump(clf.best_params_, fid, -1)
        with open(
                path.join(data.BASEDIR,
                          'transcriber_rand_results_{0}.pkl'.format(monkey)),
                'wb') as fid:
            pickle.dump((y_test, y_pred, labels), fid, -1)
        print(monkey)
        print classification_report(y_test, y_pred, target_names=labels)
Example #31
# note: dict keys must be unique -- duplicate 'C'/'kernel' entries would
# silently overwrite one another, so only the rbf grid is kept here
tuned_parameters = {
    'C': [1, 10, 100, 500, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
    #'degree': [2,3,4,5,6], 'C': [1,10,100,500,1000], 'kernel': ['poly']
}

from sklearn.grid_search import RandomizedSearchCV

model_svm = RandomizedSearchCV(svm_model, tuned_parameters, cv=10, scoring='accuracy', n_iter=20)
model_svm.fit(X_train, y_train)
print(model_svm.best_score_)

print(model_svm.best_params_)

y_pred = model_svm.predict(X_test)
print(metrics.accuracy_score(y_pred, y_test))
conf_mat = metrics.confusion_matrix(y_test, y_pred)  # avoid shadowing metrics.confusion_matrix
print(conf_mat)

class_report = metrics.classification_report(y_test, y_pred)
print(class_report)

auc_roc = metrics.roc_auc_score(y_test, y_pred)
print(auc_roc)
from sklearn.metrics import roc_curve, auc
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

import matplotlib.pyplot as plt
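Matplotlib is imported above but the snippet is cut off before any plotting; a sketch of the ROC curve one would typically draw from the fpr/tpr arrays computed earlier:

# Sketch: ROC curve for the predictions above.
plt.plot(false_positive_rate, true_positive_rate, label='AUC = %0.3f' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.show()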
def get_trained_clf_2(df, category, X_train_counts, Y, count_vect, clf,
                      clf_name):
    params = None
    gs_clf = None

    # set clf into grid search
    if (isinstance(clf, tree.DecisionTreeClassifier)):
        print('=' * 100)
        print('      Optimizing tree.DecisionTreeClassifier ...')
        params = {
            'criterion': ['gini', 'entropy'],
            'max_depth': [
                4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50, 70, 90, 120,
                150
            ]
        }
        gs_clf = RandomizedSearchCV(estimator=clf,
                                    param_distributions=params,
                                    cv=5,
                                    n_jobs=-1)
    elif (isinstance(clf, LogisticRegression)):
        print('=' * 100)
        print('      Optimizing Logistic Reg ...')
        params = {
            'C': [0.001, 0.01, 0.1, 1, 10, 15, 20, 30, 40, 100, 1000],
            'penalty': ['l1', 'l2']
        }
        gs_clf = RandomizedSearchCV(estimator=clf,
                                    param_distributions=params,
                                    cv=5,
                                    n_jobs=-1)
    elif (isinstance(clf, RandomForestClassifier)):
        print('=' * 100)
        print('      Optimizing Random Forest ...')
        params = {
            "max_depth": [3, 5, None],
            "max_features": [1, 2, 3, 4, 5, 7, 9],
            "min_samples_split": [1, 2, 3, 4, 5, 7, 9],
            "min_samples_leaf": [1, 2, 3, 4, 5, 7, 9],
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"]
        }
        gs_clf = RandomizedSearchCV(estimator=clf,
                                    param_distributions=params,
                                    cv=5,
                                    n_jobs=-1)
    elif (isinstance(clf, svm.SVC)):
        print('=' * 100)
        print('      Optimizing SVM ...')
        C_range = 10.0**np.arange(-4, 4)
        gamma_range = 10.0**np.arange(-4, 4)
        kernels = ['rbf', 'linear', 'poly', 'sigmoid']
        params = {
            'C': C_range.tolist(),
            'gamma': gamma_range.tolist(),
            'kernel': kernels
        }
        gs_clf = RandomizedSearchCV(estimator=clf,
                                    param_distributions=params,
                                    cv=5,
                                    n_jobs=-1)
    elif (isinstance(clf, MultinomialNB)):
        print('=' * 100)
        print('      Optimizing MultinomialNB ...')
        params = {
            'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
            'fit_prior': [True, False]
        }
        gs_clf = RandomizedSearchCV(estimator=clf,
                                    param_distributions=params,
                                    cv=5,
                                    n_jobs=-1,
                                    n_iter=10)
    elif (isinstance(clf, AdaBoostClassifier)):
        print('=' * 100)
        print('      Optimizing Ada Boost ...')
        params = {
            'learning_rate': stats.expon(scale=1.0),
            'n_estimators': stats.randint(low=20, high=100)
        }
        gs_clf = RandomizedSearchCV(estimator=clf,
                                    param_distributions=params,
                                    cv=5,
                                    n_jobs=-1,
                                    n_iter=10)
    elif (isinstance(clf, KNeighborsClassifier)):
        print('=' * 100)
        print('      Optimizing KNN Neighbors ...')
        params = {
            'n_neighbors': [i for i in range(2, 10)],
            'weights': ['uniform', 'distance']
        }
        gs_clf = RandomizedSearchCV(estimator=clf,
                                    param_distributions=params,
                                    cv=5,
                                    n_jobs=-1,
                                    n_iter=10)

    start = time.time()

    ## train classifier for recall and precision measurements
    gs_clf.fit(X_train_counts, Y)

    print("      Optimization process took %g s" % (time.time() - start))

    ## get validation score for given classifier
    print('      Cross validation for ' + clf_name)
    scores = cross_val_score(gs_clf, X_train_counts, Y, scoring='recall', cv=5)

    ## Print accuracy
    predictions = gs_clf.predict(X_train_counts)
    print('\n      Best score: ', np.mean(scores))
    print('      Prediction accuracy score: ', accuracy_score(Y, predictions))
    print('      Confusion matrix:')
    display(
        pandas.crosstab(pandas.Series(Y),
                        predictions,
                        rownames=['True'],
                        colnames=['Predicted'],
                        margins=True))
    print('\n')

    ## recall measurement
    #false_negatives = df[((df.category_full_path_mod1 == category) & (df.type == 'False Negative'))].loc[:,'description_mod1']
    #false_negatives = false_negatives.drop_duplicates()
    #X_test_counts = count_vect.transform(false_negatives)
    #Y_test = clf.predict(X_test_counts)

    ## precision measurement
    #false_positives = df[((df.category_full_path_mod1 != category) & (df.type == 'False Negative'))].loc[:,'description_mod1']
    #false_positives = false_positives.drop_duplicates()
    #X_test_counts2 = count_vect.transform(false_positives)
    #Y_test2 = clf.predict(X_test_counts2)

    ## Persist classifier and it's scores to dict
    results_dict = {}
    results_dict["Model name"] = clf_name
    #results_dict["Cross Validation Score"] = np.mean(scores)
    #results_dict["Best Score"] = gs_clf.best_score_
    results_dict["Best Score"] = np.mean(scores)
    #results_dict["Recall"] = np.sum(Y_test)*1.0/len(Y_test)
    #results_dict["Precision"] = 1 - np.sum(Y_test2)*1.0/len(Y_test2)
    results_dict["Model"] = gs_clf

    for param_name in sorted(params.keys()):
        results_dict[param_name] = gs_clf.best_params_[param_name]

    return results_dict
Example #33
pca_transformer = PCA(n_components=1000)
pca_transformer.fit(x_train)
pca_transformer.explained_variance_ratio_.sum()
x_train = pca_transformer.transform(x_train)

scaler_y = StandardScaler(copy=True, with_mean=True, with_std=True)
y_train = scaler_y.fit_transform(y_train.values.reshape(-1, 1))

grid.fit(x_train, y_train.ravel())

pd.DataFrame(grid.grid_scores_).sort_values("mean_validation_score")

x_test = poli.transform(x_test)
x_test = pca_transformer.transform(x_test)
pd.DataFrame([
    np.e**grid.predict(x_test),
    scaler_y.transform(y_test.values.reshape(-1, 1))
])
mean_squared_error(y_test, np.e**grid.predict(x_test))
# 2004,805,126
# 716,852,668
# 759,107,470
# 2057,570,962
# 1260,684,066
# 1689,874,386
# 1608,326,518
# 5405,998,897
# 715,551,980
# 778,085,588
# 804,713,150
# 938,380,884
Example #34
gbdt = GradientBoostingClassifier(verbose=1)
searchcv = RandomizedSearchCV(estimator=gbdt,
                              param_distributions=param_dist,
                              n_iter=200,
                              verbose=1)
searchcv.fit(Xtrain, ytrain)

searchcv.best_score_
searchcv.best_estimator_
searchcv.best_params_

# ---------------------- predict
titanic_test = pd.read_csv("test_processed.csv", index_col="PassengerId")
Xtest = titanic_test[feature_names]

predictions = searchcv.predict(Xtest)
submission = pd.DataFrame({
    "PassengerId": titanic_test.index,
    "Survived": predictions
})
submission.to_csv("submit_gbdt.csv", index=False)

import pickle
inf = open('gbdt.pkl', 'rb')
gbdt = pickle.load(inf)
inf.close()

sorted(zip(map(lambda x: round(x, 4), gbdt.feature_importances_),
           feature_names),
       reverse=True)
Example #35
def main(fold_num=0):
    train_ids = pickle.load(open("fold_%s_train_ids.pickle" % fold_num))
    test_ids = pickle.load(open("fold_%s_test_ids.pickle" % fold_num))

    # get the data
    with open('cui_data_sent5.csv', 'r') as f:
        r = csv.DictReader(f)
        data = [row for row in r]

    out = []
    for row in data:
        for label in row['label'].split('|'):
            new_row = row.copy()
            new_row['label'] = label
            out.append(new_row)

    data = out

    # cui graph
    with open('graph_subset.pck', 'rb') as f:
        cui_graph = pickle.load(f)

    # de-unicode all this

    for row in data:
        row["sent"] = unidecode.unidecode(row["sent"])

    # quick processing of cuis
    cui2int = lambda x: int(x[1:])
    int2cui = lambda x: "C{}".format(str(x).zfill(7))
    cui_ancestors = lambda x: list(nx.ancestors(cui_graph, cui2int(x)))

    # Generate text features
    vec = HashingVectorizer(ngram_range=(1, 3), stop_words='english')
    X_text = vec.transform((row['sent'] for row in data))

    # Generate concept features
    indptr = [0]
    indices = []
    csr_data = []

    for row in data:

        ancestors = [cui2int(row['cui'])]  # remember the index cui!
        try:
            ancestors = ancestors + cui_ancestors(row['cui'])
        except nx.NetworkXError:
            # cui not present in the graph; keep just the index cui
            pass

        for ancestor in ancestors:
            indices.append(ancestor)
            csr_data.append(1)
        indptr.append(len(indices))

    X_cuis = csr_matrix((csr_data, indices, indptr),
                        shape=(len(data), 10000000),
                        dtype=np.int64)
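    # Hedged aside, not in the original: (data, indices, indptr) is the raw CSR
    # layout -- row i owns the entries indices[indptr[i]:indptr[i+1]]. A toy
    # instance: row 0 has a 1 in column 2, row 1 has 1s in columns 0 and 5.
    toy = csr_matrix(([1, 1, 1], [2, 0, 5], [0, 1, 3]), shape=(2, 8))
    assert toy.toarray()[0, 2] == 1 and toy.toarray()[1, 5] == 1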

    # and positional features
    X_pos = np.zeros(shape=(len(data), 5))
    for i, row in enumerate(data):
        X_pos[(i, int(float(row['position']) * 4))] = 5

    # and answers
    y = np.array([row["label"] for row in data])

    # combine primary and secondary outcome for now (not sure it matters too much at this stage)
    y[y == 'secondary_outcome'] = 'outcome'
    y[y == 'primary_outcome'] = 'outcome'

    X = hstack([X_text, X_cuis, X_pos], format='csr')
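    # scipy's hstack happily mixes the sparse text/concept blocks with the
    # dense X_pos array, yielding one CSR matrix of side-by-side features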

    X_train = X[train_ids, :]
    X_test = X[test_ids, :]

    y_train = y[train_ids]
    y_test = y[test_ids]

    class_instance_indices = {}
    outcome_indices = np.where(y_train == "outcome")[0]
    interventions_indices = np.where(y_train == "interventions")[0]
    ignore_indices = np.where(y_train == "ignore")[0]
    population_indices = np.where(y_train == "population")[0]

    K = 5
    targets = ['population', 'interventions', 'outcome']

    #ftwo_scorer = make_scorer(fbeta_score, beta=2, labels=targets, average='macro') # favour recall a
    # bcw -- making comparable to CNN approach
    f_scorer = make_scorer(fbeta_score,
                           beta=1,
                           labels=targets,
                           average='macro')

    class_weights = []
    # generate hyperparameter search space: every combination of integer
    # weights 1..49 for the three target classes (49**3, roughly 118k candidates)
    weight_space = range(1, 50)

    for w1 in weight_space:
        for w2 in weight_space:
            for w3 in weight_space:
                class_weights.append(
                    {t: w
                     for t, w in zip(targets, [w1, w2, w3])})
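    # equivalent sketch with itertools (assumption: behaviour is identical):
    # class_weights = [dict(zip(targets, ws))
    #                  for ws in itertools.product(weight_space, repeat=3)]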

    parameters = {
        'alpha': np.logspace(-1, -20, 50),
        'class_weight': class_weights
    }

    clf = SGDClassifier(average=True, loss="hinge", class_weight="balanced")

    # do the random grid search thing
    grid_search = RandomizedSearchCV(clf,
                                     param_distributions=parameters,
                                     n_iter=38,
                                     verbose=3,
                                     n_jobs=19,
                                     scoring=f_scorer,
                                     cv=K)

    grid_search.fit(X_train, y_train)

    #y_hat = grid_search.predict(X_test)
    y_hat = grid_search.decision_function(X_test)
    with open("lm_raw_predictions_%s.pickle" % fold_num, 'w') as outf:
        pickle.dump(y_hat, outf)

    #import pdb; pdb.set_trace()

    y_hat = grid_search.predict(X_test)
    with open("lm_predictions_%s.pickle" % fold_num, 'w') as outf:
        pickle.dump(y_hat, outf)

    with open("lm_y_%s.pickle" % fold_num, 'w') as outf:
        pickle.dump(y_test, outf)
Example #36
def run(args):
    X_train = np.nan_to_num(
        np.genfromtxt(args.training_data, delimiter=args.delimiter))
    y_train = np.clip(np.genfromtxt(args.training_labels), 0, 1)

    X_trains = X_train
    if args.scale:
        print "Scaling features (mean removal divided by std)..."
        scaler = StandardScaler().fit(X_train)
        X_trains = scaler.transform(X_train)

    # create output folders
    outF = args.output_folder + "/" + os.path.basename(
        args.training_data) + "--FS_" + str(
        args.select_features) + "--i_" + str(args.iterations)
    buildDir(outF)
    maskF = outF + "/masks/"
    buildDir(maskF)
    #evaluation  features  first_experiments  labels  logs  masks  parameters
    #  predictions  src  suca
    paramF = outF + "/parameters/"
    buildDir(paramF)
    #featF = outF+"/features/"
    #buildDir(featF)
    #evalF = buildDir(outF+"/evaluation")

    # initializes numpy random seed
    np.random.seed(args.seed)

    # performs feature selection
    featsel_str = ".all-feats"
    if args.select_features:
        print "Performing feature selection ..."
        # initializes selection estimator
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=8, random_state=args.seed,
                                  n_resampling=1000)

        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)

        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)

        sel_feats_path = os.sep.join(
            #    [".", "masks", os.path.basename(args.training_data)])
            [maskF, os.path.basename(args.training_data)])

        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)
        featsel_str = ".randcv"

    estimator = ExtraTreesRegressor(random_state=args.seed, n_jobs=1)

    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    #rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # performs parameter optimization using random search
    print "Performing parameter optimization ... "


    param_distributions = \
        {"n_estimators": [5, 10, 50, 100, 200, 500],
         "max_depth": [3, 2, 1, None],
         "max_features": ["auto", "sqrt", "log2", int(X_trains.shape[1]/2.0)],
         "min_samples_split": sp_randint(1, 11),
         "min_samples_leaf": sp_randint(1, 11),
         "bootstrap": [True, False]}
         # "criterion": ["gini", "entropy"]}

    search = RandomizedSearchCV(estimator, param_distributions,
                                n_iter=args.iterations,
                                scoring=mae_scorer, n_jobs=8, refit=True,
                                cv=KFold(X_train.shape[0], args.folds, shuffle=True,
                                         random_state=args.seed), verbose=1,
                                random_state=args.seed)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # ................SHAHAB ........................ 
    
    models_dir = sorted(glob.glob(args.models_dir + os.sep + "*"))
    
    estimator2 = ExtraTreesRegressor(bootstrap=search.best_params_["bootstrap"], 
                                     max_depth=search.best_params_["max_depth"], 
                                     max_features=search.best_params_["max_features"],
                                     min_samples_leaf=search.best_params_["min_samples_leaf"], 
                                     min_samples_split=search.best_params_["min_samples_split"], 
                                     n_estimators=search.best_params_["n_estimators"], 
                                     verbose=1, 
                                     random_state=42, 
                                     n_jobs=8)
   
    estimator2.fit(X_trains, y_train)
    from sklearn.externals import joblib
    print "Saving fitted models to %s" % args.models_dir
    joblib.dump(estimator2, args.models_dir + "/XRT.pkl")
    if args.scale:
        joblib.dump(scaler, args.models_dir + "/scaler.pkl")
    if args.select_features:
        joblib.dump(sel_est, args.models_dir + "/sel_est.pkl")

    # ................SHAHAB ........................

    print "Best parameters: ", search.best_params_

    # saves parameters on yaml file
    #param_path = os.sep.join([".", "parameters", os.path.basename(
    param_path = os.sep.join([paramF, os.path.basename(
        args.training_data)]) + featsel_str + ".params.yaml"
    param_file = codecs.open(param_path, "w", "utf-8")
    yaml.dump(search.best_params_, stream=param_file)
    testF = os.sep.join([outF, "/test/"])
    buildDir(testF)

    m = y_train.mean()

    # evaluates model on the different test sets
    test_features = sorted(glob.glob(args.test_data + os.sep + "*"))
    test_labels = sorted(glob.glob(args.test_labels + os.sep + "*"))
    for test_feature, test_label in zip(test_features, test_labels):
        print "Evaluating on %s" % test_label
        X_test = np.nan_to_num(
            np.genfromtxt(test_feature, delimiter=args.delimiter))
        y_test = np.clip(np.genfromtxt(test_label), 0, 1)

        X_tests = X_test
        if args.scale:
            X_tests = scaler.transform(X_test)

        if args.select_features:
            X_tests = sel_est.transform(X_tests)

        # gets predictions on test set
        #y_pred = search.predict(X_tests)
        y_pred = np.clip(search.predict(X_tests), 0, 1)

        # evaluates on test set
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        print "Test MAE = %2.8f" % mae
        print "Test RMSE = %2.8f" % rmse
        print "Prediction range: [%2.4f, %2.4f]" % (y_pred.min(), y_pred.max())
        # saves evaluation
        testFX = testF + "/" + os.path.basename(test_label)
        buildDir(testFX)
        buildDir(testFX + "/evaluation/")

        eval_path = os.sep.join([testFX, "evaluation", os.path.basename(
            args.training_data)]) + featsel_str + "--" + os.path.basename(
            test_label)
        mae_eval = codecs.open(eval_path + ".mae", 'w', "utf-8")
        mae_eval.write(str(mae) + "\n")
        rmse_eval = codecs.open(eval_path + ".rmse", 'w', "utf-8")
        rmse_eval.write(str(rmse) + "\n")

        mu = m * np.ones(y_test.shape[0])  # baseline on test set
        maeB = mean_absolute_error(y_test, mu)
        rmseB = np.sqrt(mean_squared_error(y_test, mu))
        print "Test MAE Baseline= %2.8f" % maeB
        print "Test RMSE Baseline= %2.8f" % rmseB
        mae_eval = codecs.open(eval_path + ".mae.Base", 'w', "utf-8")
        mae_eval.write(str(maeB) + "\n")
        rmse_eval = codecs.open(eval_path + ".rmse.Base", 'w', "utf-8")
        rmse_eval.write(str(rmseB) + "\n")

        # saves predictions
        buildDir(testFX + "/predictions/")
        preds_path = os.sep.join([testFX, "predictions", os.path.basename(
            args.training_data)]) + featsel_str + "--" + os.path.basename(
            test_label) + ".preds"
        np.savetxt(preds_path, y_pred, fmt="%2.15f")
Example #37
Rforest = RandomForestRegressor()
grid_search = RandomizedSearchCV(Rforest,
                                 cv=3,
                                 param_distributions=paramDist,
                                 n_iter=100,
                                 n_jobs=4,
                                 scoring='mean_squared_error')

grid_search.fit(Hold_out, y_test)

scoresGrid = grid_search.grid_scores_
print grid_search.best_score_
print grid_search.best_estimator_
report(grid_search.grid_scores_)

finalpred = np.expm1(grid_search.predict(Ypredict))

pred = np.vstack(
    [np.array(mat[~np.isnan(mat['id'])]['id'], dtype=np.int16), finalpred]).T
pred = pd.DataFrame(pred)
pred.columns = ['id', 'cost']
pred['id'] = pred['id'].astype(np.int16)
ts = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d-%H%M%S')
pred.to_csv('pred06-stack' + ts + '.csv', index=False)
"""
p1 = pd.read_csv('pred04-stack.csv')
p2 = pd.read_csv('pred05-stack.csv')

pred = pd.concat([p1['id'],(p1['cost']+p2['cost'])/2],axis=1)
pred.to_csv('pred05-stack05-04.csv',index=False)
"""
Example #38
                max_depth=max_depth_dist)

gbdt = GradientBoostingClassifier(verbose=1)
searchcv = RandomizedSearchCV(estimator=gbdt, param_distributions=param_dist,n_iter=200,verbose=1)
searchcv.fit(Xtrain,ytrain)      

searchcv.best_score_                                  
searchcv.best_estimator_
searchcv.best_params_


# ---------------------- predict
titanic_test = pd.read_csv("test_processed.csv",index_col="PassengerId")
Xtest = titanic_test[feature_names]

predictions = searchcv.predict(Xtest)
submission = pd.DataFrame({
        "PassengerId": titanic_test.index,
        "Survived": predictions
    })
submission.to_csv("submit_gbdt.csv", index=False)






import pickle
inf = open('gbdt.pkl', 'rb')
gbdt = pickle.load(inf)
inf.close()
Example #39
    temp = np.size(train_data,0)/5
    cv_data = train_data[0:temp:,::]
    train_data2 = train_data[temp::,::]
    #
    # forest = RandomForestClassifier(n_estimators= 100, bootstrap = True, min_samples_leaf = 7, min_samples_split = 7,
    #                                  criterion = 'gini', max_features = 3, max_depth= None)
    # forest = forest.fit(train_data2[::,1::], train_data2[::, 0])
    random_search = RandomizedSearchCV(forest, param_distributions=param_dist,
                                   n_iter=n_iter_search)
    random_search.fit(train_data2[::,1::], train_data2[::,0])

    print "Predicting..."
    # output_cv = forest.predict(cv_data[::,1::]).astype(int)
    # output_train = forest.predict(train_data2[::,1::]).astype(int)

    output_cv = random_search.predict(cv_data[::,1::]).astype(int)
    output_train = random_search.predict(train_data2[::,1::]).astype(int)
    print "Done..."
    if (len(train_data2) != len(output_train)): print "something wrong"
    else:
        temp_cv_acc += len(output_cv[output_cv == cv_data[::,0]])/float(len(output_cv))
        temp_train_acc += len(output_train[train_data2[::,0] == output_train])/float(len(output_train))


print "RF Training Accuracy:", temp_train_acc/trials,
print "CV Accuracy:", temp_cv_acc/trials




# real test data
Example #40
def main():
    csv_file_object = csv.reader(open('Data/train.csv', 'rb')) #Load in the training csv file
    header = csv_file_object.next() #Skip the first line as it is a header
    train_data=[] #Create a variable called 'train_data'
    for row in csv_file_object: #Loop through each row in the csv file
        train_data.append(row[1:]) #adding each row to the data variable
    train_data = np.array(train_data) #Then convert from a list to an array
    
    #I need to convert all strings to integer classifiers:
    #Male = 1, female = 0:
    train_data[train_data[0::,3]=='male',3] = -1
    train_data[train_data[0::,3]=='female',3] = 1
    #embark c=0, s=1, q=2
    train_data[train_data[0::,10] =='C',10] = -1
    train_data[train_data[0::,10] =='S',10] = 0
    train_data[train_data[0::,10] =='Q',10] = 1
    #Survived
    train_data[train_data[0::,3]==1,0] = 1
    train_data[train_data[0::,3]==0,0] = -1
    
    #I need to fill in the gaps of the data and make it complete.
    #So where there is no price, I will assume price on median of that class
    #Where there is no age I will give median of all ages
    
    imp = preprocessing.Imputer(missing_values=0, strategy='median', axis=0)
    
    #All the ages with no data make the median of the data
    #train_data[train_data[0::,4] == '',4] = np.median(train_data[train_data[0::,4]\
    #                                          != '',4].astype(np.float))
    #All missing embarks just make them embark from most common place
    #train_data[train_data[0::,10] == '',10] = np.round(np.mean(train_data[train_data[0::,10]\
    #                                                   != '',10].astype(np.float)))
    
    train_data = np.delete(train_data,[2,7,9,10],1) #remove the name data, cabin and ticket
    train_data[train_data=='']='0'
    imp.fit_transform(train_data)
    #I need to do the same with the test data now so that the columns are in the same
    #as the training data
    
    
    
    #We finally split the data between train set and validation set
    x_train, x_test, y_train, y_test=train_test_split(
        train_data[0::,1::],train_data[0::,0], test_size=0.2, random_state=0)
    
    #Standardise data
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_std=scaler.transform(x_train)
    x_test_std=scaler.transform(x_test)
    
    
    test_file_object = csv.reader(open('Data/test.csv', 'rb')) #Load in the test csv file
    header = test_file_object.next() #Skip the first line as it is a header
    test_data=[] #Create a variable called 'test_data'
    ids = []
    for row in test_file_object: #Loop through each row in the csv file
        ids.append(row[0])
        test_data.append(row[1:]) #adding each row to the data variable
    test_data = np.array(test_data) #Then convert from a list to an array
    
    #I need to convert all strings to integer classifiers:
    #Male = 1, female = 0:
    test_data[test_data[0::,2]=='male',2] = 1
    test_data[test_data[0::,2]=='female',2] = -1
    #embark c=0, s=1, q=2
    test_data[test_data[0::,9] =='C',9] = -1 #Note this is not ideal: the numeric codes imply an ordering/scale that the categories don't actually have
    test_data[test_data[0::,9] =='S',9] = 0
    test_data[test_data[0::,9] =='Q',9] = 1
    
    #All the ages with no data make the median of the data
    #test_data[test_data[0::,3] == '',3] = np.median(test_data[test_data[0::,3]\
    #                                           != '',3].astype(np.float))
    #All missing embarks just make them embark from most common place
    #test_data[test_data[0::,9] == '',9] = np.round(np.mean(test_data[test_data[0::,9]\
    #                                                   != '',9].astype(np.float)))
    #All the missing prices assume median of their respective class
    #for i in xrange(np.size(test_data[0::,0])):
    #    if test_data[i,7] == '':
    #        test_data[i,7] = np.median(test_data[(test_data[0::,7] != '') &\
    #                                             (test_data[0::,0] == test_data[i,0])\
    #            ,7].astype(np.float))
    
    test_data = np.delete(test_data,[1,6,8,9],1) #remove the name data, cabin and ticket
    test_data[test_data=='']='0'
    #Impute missing values
    imp.fit_transform(test_data)
    
    #Standardize
    scaler_test = preprocessing.StandardScaler().fit(test_data)
    test_data_std=scaler_test.transform(test_data)
    #The data is now ready to go. So lets train then test!
    
    start = time()
    print 'Training estimators'
    estimators = [('linearsvc', LinearSVC()), ('KNeighborsClassifier', KNeighborsClassifier())]
    clf = Pipeline(estimators)
    # specify parameters and distributions to sample from
    param_dist = {"linearsvc__C": sp_randint(1, 1000),
                  "linearsvc__loss": ["l1", "l2"],
                  "linearsvc__dual": [True],
                  "KNeighborsClassifier__n_neighbors": sp_randint(5, 100),
                  "KNeighborsClassifier__weights": ["uniform", "distance"],
                  "KNeighborsClassifier__algorithm": ["ball_tree", "kd_tree", "brute"],
                  "KNeighborsClassifier__leaf_size": sp_randint(3, 100),
                  
                  }
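    # Hedged aside (not in the original): the "step__param" keys above follow
    # the Pipeline routing convention; the full set of legal names can be
    # listed with sorted(clf.get_params().keys())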
    
    # run randomized search
    n_iter_search = 2000
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search,n_jobs=4, verbose=1)
    random_search.fit(x_train_std,y_train)
    
    print 'Reporting'
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
    score=random_search.score(x_test_std,y_test)
    print 'Test score'
    print score
    print 'Predicting'
    output = random_search.predict(test_data_std)
    
    open_file_object = csv.writer(open("pipelinearsvcknn.csv", "wb"))
    open_file_object.writerow(["PassengerId","Survived"])
    open_file_object.writerows(zip(ids, output))
Example #41
def main(date, modelType, iterations):
    """
    Determines the optimal hyperparameters for a given machine learning
    model for a set of training data.

    :param date: Date the training and testing data was collected (YYYY_MMDD)
    :param modelType: (string) type of machine learning model to train
    :param iterations: (int) number of iterations for hyperparameter searching

    :return: (None)
    """
    
    # Make sure that the model is a valid choice
    if (not (modelType in MODELS.keys())) and (modelType != ALL):
        print "Invalid model type:", modelType
        return

    # Allow for training more than one model at a time
    if modelType == ALL:
        modelsToTrain = MODELS.keys()
    else:
        modelsToTrain = [modelType]

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    testX, testY = FileIO.loadTestingData(date)

    trainX = np.nan_to_num(trainX)
    testX = np.nan_to_num(testX)

    for modelType in modelsToTrain:

        # Train the desired ML model
        name, clfType = MODELS[modelType]
        print "Training the", name

        baseClassifier = clfType()
        clf = RandomizedSearchCV(baseClassifier, param_distributions=PARAMETERS[modelType],
                                                 n_iter=iterations,
                                                 n_jobs=4)
        clf.fit(trainX, trainY)

        # Perform some very basic accuracy testing
        trainResult = clf.predict(trainX)
        testResult = clf.predict(testX)

        trainingAccuracy = accuracy_score(trainY, trainResult)
        testingAccuracy = accuracy_score(testY, testResult)
        confusionMatrix = confusion_matrix(testY, testResult)

        print "Training Accuracy:", trainingAccuracy
        print "Testing Accuracy:", testingAccuracy
        print "Confusion Matrix:"
        print confusionMatrix
        print " "
        print "Hyperparameters:"
        for param in PARAMETERS[modelType].keys():
            print param + ':', clf.best_estimator_.get_params()[param]
        print " "

        # Save the model to disk
        FileIO.saveModel(clf.best_estimator_, modelType, date)
Example #42
parameter_space_bal = {
    'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['auto', 1e-3, 1e-4],
    'C': [0.01, .1, 1, 10, 100, 1000], 'class_weight': [None]}

print("Building balanced SVM")
SVM_bal = RandomizedSearchCV(SVC(C=1), parameter_space_bal, cv=10,
        scoring='recall_weighted', iid=True)
print("fitting balanced SVM")
SVM_bal.fit(xbaltrain, ybaltrain)

print("Hyperparameters for balanced SVM found:")
print(SVM_bal.best_params_)

print("getting predictions for balanced SVM")
y_pred_svm_bal = SVM_bal.predict(xtest)

print("\n\n results for SVM")
winfault.clf_scoring(ytest, y_pred_svm_bal, labels)

print("========================================================")
print("------Building models using Imbalanced training data------")
print("========================================================")
parameter_space = {
    'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['auto', 1e-3, 1e-4],
    'C': [0.01, .1, 1, 10, 100, 1000],
    'class_weight': [
        {0: 0.01}, {1: 1}, {1: 2}, {1: 10}, {1: 50}, 'balanced']}

print("Building Imbalanced SVM")
SVM = RandomizedSearchCV(SVC(C=1), parameter_space, cv=10,
                         scoring='recall_weighted', iid=True)
Example #43
    #2
    svc = SVC()
    svc_param_dist = {"C": uniform(),
                         "gamma": uniform(),
                         "kernel": ['linear', 'rbf'],
                         "class_weight": [{1: 1}, {1: 2}, {1: 5}, {1: 10}],
                         "probability": [True]
                         }
    #params = [{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
    #           'kernel': ['linear'], 'class_weight': [{1: 1}, {1: 5}, {1: 2}, {1: 3}, {1: 10}]}]

    #clf2 = GridSearchCV(svc, param_grid=params, scoring='roc_auc', verbose=True, cv=5, n_jobs=-1)
    clf2 = RandomizedSearchCV(svc, param_distributions=svc_param_dist, n_iter=100)

    clf2.fit(X_train_2, y_train_2)
    clf_2_x_val_predictions = clf2.predict(X_test)
    class_rep_2 = classification_report(y_test, clf_2_x_val_predictions)
    print clf2.best_params_
    print class_rep_2

    #3
    gbc = GradientBoostingClassifier()
    forest_param_dist = {"max_depth": [3,4,5,6,7],
                               "max_features": sp_randint(1, 11),
                               "min_samples_split": sp_randint(1, 11),
                               "min_samples_leaf": sp_randint(1, 11),
                               "subsample": uniform(),
                               "learning_rate": uniform(),
                               "n_estimators": sp_randint(1, 351)}

    clf3 = RandomizedSearchCV(gbc, param_distributions=forest_param_dist, n_iter=100)
Example #44
    cv_data = train_data[0:temp:, ::]
    train_data2 = train_data[temp::, ::]
    #
    # forest = RandomForestClassifier(n_estimators= 100, bootstrap = True, min_samples_leaf = 7, min_samples_split = 7,
    #                                  criterion = 'gini', max_features = 3, max_depth= None)
    # forest = forest.fit(train_data2[::,1::], train_data2[::, 0])
    random_search = RandomizedSearchCV(forest,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search)
    random_search.fit(train_data2[::, 1::], train_data2[::, 0])

    print "Predicting..."
    # output_cv = forest.predict(cv_data[::,1::]).astype(int)
    # output_train = forest.predict(train_data2[::,1::]).astype(int)

    output_cv = random_search.predict(cv_data[::, 1::]).astype(int)
    output_train = random_search.predict(train_data2[::, 1::]).astype(int)
    print "Done..."
    if (len(train_data2) != len(output_train)): print "something wrong"
    else:
        temp_cv_acc += len(output_cv[output_cv == cv_data[::, 0]]) / float(
            len(output_cv))
        temp_train_acc += len(
            output_train[train_data2[::, 0] == output_train]) / float(
                len(output_train))

print "RF Training Accuracy:", temp_train_acc / trials,
print "CV Accuracy:", temp_cv_acc / trials

# real test data
test_data = test_df.values
Example #45
K_X = Quasi_linear_kernel(X,X)
clf = svm.SVC(kernel='precomputed')
# y_pred = clf.predict(K_test)
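# Hedged note, since K_test is used below but never built in this excerpt:
# with kernel='precomputed', predict() expects the test-vs-train Gram matrix,
# i.e. something like K_test = Quasi_linear_kernel(X_test, X_train) of shape
# (n_test_samples, n_train_samples).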

# run randomized search
n_iter_search = 40
random_search = RandomizedSearchCV(clf, param_distributions=QL_SVM_param_dist,
                                   n_iter=n_iter_search,cv=skf)
start = time()
random_search.fit(K_train, y_train)
print("Quasi_linear kernel SVM RandomSearch took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
print("Random_search Best estimator is :\n"), random_search.best_estimator_
report(random_search.grid_scores_,n_top=5)
# print the classification_report
y_test, y_pred = y_test, random_search.predict(K_test) 
#Call predict on the estimator with the best found parameters.
print(classification_report(y_test, y_pred))
print()

# run grid search
grid_search = GridSearchCV(clf, param_grid=QL_SVM_param_dist, cv=skf)
start = time()
grid_search.fit(K_X, Y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
print("Grid_search Best estimator is :\n"), grid_search.best_estimator_
report(grid_search.grid_scores_,n_top=10)
# print the classification_report
y_test, y_pred = y_test, grid_search.predict(K_test)
print(classification_report(y_test, y_pred))
Example #46
            sys.stderr.write("\n:>> Model selected: %s\n" % (rs.best_params_))        
        except:
            try:
                num_lines = sum(1 for line in open("svr_%s_%s_H%s_%s_m%s.out" % (corpus, representation, dimensions, op, min_count), "r"))        
            except IOError:
                num_lines = 0
            y_out = {}
            y_out['estimated_output'] = range(0,len(y))
            y_out['best_params'] = "Non converged model..."
            y_out['learned_model'] = "Nonconverged_model_%d" % num_lines
            y_out['performance'] = 0.0 
            with open("svr_%s_%s_H%s_%s_m%s.out" % (corpus, representation, dimensions, op, min_count), "a") as f:
                f.write(str(y_out)+'\n')
            continue

        f_x = rs.predict(X).tolist()
        sys.stderr.write("\n:>> R2: %s\n" % (r2_score(y, f_x)))
        try:
            num_lines = sum(1 for line in open("svr_%s_%s_H%s_%s_m%s.out" % (corpus, representation, dimensions, op, min_count), "r"))        
        except IOError:
            num_lines = 0

        y_out = {}
        if args.t:
            y_out['estimated_output'] = map(detener, f_x)
        else:
            y_out['estimated_output'] =  f_x

        y_out['best_params'] = rs.best_params_
        y_out['learned_model'] = {'file': "/almac/ignacio/data/svr_models/%s_%s_%s_%s_H%s_%s_m%s.model" % (svr_, corpus, num_lines, representation, dimensions, op, min_count) }
        if args.t:
Example #47
param_dist = {
    "pca__n_components": sp_randint(10, 700),
    "rbfSVM__C": scipy.stats.expon(scale=10),
    "rbfSVM__kernel": ["rbf"],
    "rbfSVM__gamma": scipy.stats.expon(scale=0.01)
}
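# Aside (not in the original): param_distributions happily mixes scipy.stats
# distributions (sampled via .rvs, e.g. C drawn from expon with mean 10) with
# plain lists, which are sampled uniformly.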
n_iter_search = 500
random_search = RandomizedSearchCV(pipe,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   cv=cv,
                                   verbose=6,
                                   n_jobs=4)
random_search.fit(X_train, Y_train)
predicted_held_out = random_search.predict(X_test)
mmat = confusion_matrix(predicted_held_out, Y_test)
print mmat
class_map = dict(zip(set(input_kmers_counts["class"]), range(0, 4)))
kappa([class_map[x] for x in Y_test],
      [class_map[x] for x in predicted_held_out])

# We examine how much the number of PCA components varies among the best CV scores
all_scores = random_search.grid_scores_
all_scores.sort(key=lambda x: x.mean_validation_score)
with open("random_search_scores_1000iter_5mers.bdat", "w") as f:
    cPickle.dump(all_scores, f)

# We try the best parameters on independent samples
X_train, X_test, Y_train, Y_test = train_test_split(
    normalized_counts[kmer_colums],
Example #48
forest = RandomForestClassifier(n_estimators=25)
# run randomized search
n_iter_search = 30
random_search = RandomizedSearchCV(forest,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   cv=6)

start = time()
random_search.fit(train_data2[::, 1::], train_data2[::, 0])
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates"
    " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)
train_output = random_search.predict(train_data2[::, 1::])
cv_output = random_search.predict(cv_data[::, 1::])
print "Training set accuracy: %.3f   CV set accuracy: %.3f"\
      %(len(train_data2[train_output == train_data2[::,0]])/float(len(train_data2)),
      (len(cv_data[cv_output == cv_data[::,0]])/float(len(cv_data))))

# Analyzing important features
forest = random_search.best_estimator_
feature_importance = forest.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())

feature_list = df.columns.values

df.drop(feature_list[feature_importance < 10], inplace=True, axis=1)
test_df.drop(feature_list[feature_importance < 10], inplace=True, axis=1)
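# Aside (not in the original): feature_importance was rescaled to a 0-100
# range above, so the "< 10" cut keeps only features with at least 10% of the
# top feature's importance.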
Example #49
                  "max_features": list(range(1,X_train.shape[1]+1)),
                  "min_samples_split": list(range(1, 10)),
                  "min_samples_leaf": list(range(1, 10))}
    random_search = RandomizedSearchCV(clf, param_distributions=param_grid, n_jobs=3, n_iter=10000, verbose=1)
    print("Finding best hyperparameters for randomized tree")
    random_search.fit(X_train,Y)
    #random_search.fit(X_train_train,Y_train)

    # Deal with results
    print("Best parameters are:")
    print(random_search.best_params_)
    #score = random_search.score(X_train_test,Y_test)
    #print("Score = {}".format(score))
    best_clf = random_search.best_estimator_

    Y_test = random_search.predict(X_test)
    result = np.hstack([np.expand_dims(ID_test,axis=1),np.expand_dims(Y_test,axis=1)])

    # Write results to file
    with open("predict.csv","w") as outfile:
        outfile.write("PassengerId,Survived\n")
        for i in range(len(result)):
            outfile.write("{},{}\n".format(result[i,0],result[i,1]))

    fig = plt.figure(figsize=(16,9))
    gs = gridspec.GridSpec(1,1,left=0.05,right=0.98,bottom=0.17,top=0.98)
    ax = fig.add_subplot(gs[0,0],xlabel="Features",ylabel="Importance")
    xpos = np.arange(len(labels))
    width = 0.9
    ax.bar(xpos,best_clf.feature_importances_,width=width)
    ax.set_xticks(xpos+(width/2.))
Example #50
def main(date, modelType, iterations):
    """
    Determines the optimal hyperparameters for a given machine learning
    model for a set of training data.

    :param date: Date the training and testing data was collected (YYYY_MMDD)
    :param modelType: (string) type of machine learning model to train
    :param iterations: (int) number of iterations for hyperparameter searching

    :return: (None)
    """

    # Make sure that the model is a valid choice
    if (not (modelType in MODELS.keys())) and (modelType != ALL):
        print "Invalid model type:", modelType
        return

    # Allow for training more than one model at a time
    if modelType == ALL:
        modelsToTrain = MODELS.keys()
    else:
        modelsToTrain = [modelType]

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    testX, testY = FileIO.loadTestingData(date)

    trainX = np.nan_to_num(trainX)
    testX = np.nan_to_num(testX)

    for modelType in modelsToTrain:

        # Train the desired ML model
        name, clfType = MODELS[modelType]
        print "Training the", name

        baseClassifier = clfType()
        clf = RandomizedSearchCV(baseClassifier,
                                 param_distributions=PARAMETERS[modelType],
                                 n_iter=iterations,
                                 n_jobs=4)
        clf.fit(trainX, trainY)

        # Perform some very basic accuracy testing
        trainResult = clf.predict(trainX)
        testResult = clf.predict(testX)

        trainingAccuracy = accuracy_score(trainY, trainResult)
        testingAccuracy = accuracy_score(testY, testResult)
        confusionMatrix = confusion_matrix(testY, testResult)

        print "Training Accuracy:", trainingAccuracy
        print "Testing Accuracy:", testingAccuracy
        print "Confusion Matrix:"
        print confusionMatrix
        print " "
        print "Hyperparameters:"
        for param in PARAMETERS[modelType].keys():
            print param + ':', clf.best_estimator_.get_params()[param]
        print " "

        # Save the model to disk
        FileIO.saveModel(clf.best_estimator_, modelType, date)
Example #51
def parameter_tuning(Xn, yn, scale=1):

    # FEATURE SELECTION (not implemented here; just report input shapes)
    print Xn.shape
    print yn.shape

    # FEATURE SCALING
    if scale == 1:
        Xn = preprocessing.scale(Xn, with_mean=True)
        print 'NORMALIZING'
    elif scale == 2:
        Xn = preprocessing.scale(Xn, with_mean=False)
        print 'NORMALIZING'

    tuned_parameters = [{
        'kernel': ['rbf'],
        'C': np.logspace(-2, 7, 10),
        'gamma': np.logspace(-4, 2, 7)
    }]

    tuned_parameters2 = {
        'kernel': ['rbf'],
        'C': np.logspace(-2, 7, 10),
        'gamma': np.logspace(-4, 2, 7)
    }

    linear_parameters = [{'kernel': ['linear'], 'C': np.logspace(-3, 4, 8)}]

    linear_parameters2 = {'kernel': ['linear'], 'C': np.logspace(-3, 4, 8)}
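    # aside: the list-of-dict variants feed GridSearchCV (which accepts a list
    # of grids), while the plain-dict variants feed RandomizedSearchCV, whose
    # param_distributions here must be a single mapping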

    cv = cross_validation.StratifiedKFold(yn,
                                          shuffle=True,
                                          n_folds=3,
                                          random_state=42)

    if RBF:
        clf = RandomizedSearchCV(estimator=SVC(C=1, cache_size=1000),
                                 param_distributions=tuned_parameters2,
                                 cv=cv,
                                 scoring='accuracy',
                                 n_iter=30,
                                 verbose=1,
                                 n_jobs=2).fit(Xn, yn)

        print("Best parameters set found on development set:")
        print
        print(clf.best_estimator_)
        print(clf.best_score_)
        print()
        print confusion_matrix(yn, clf.predict(Xn))

    if LINEAR:
        clf = GridSearchCV(estimator=SVC(C=1, cache_size=1000),
                           param_grid=linear_parameters,
                           cv=cv,
                           scoring='accuracy',
                           verbose=1,
                           n_jobs=2).fit(Xn, yn)

        print("Best parameters set found on development set:")
        print
        print(clf.best_estimator_)
        print(clf.best_score_)
        print()
        print confusion_matrix(yn, clf.predict(Xn))
Example #52
    clf = RandomizedSearchCV(RandomForestClassifier(),
            param_distributions=param_grid,
            n_iter=n_iter_search, cv=3, scoring='accuracy',
            n_jobs=-1)

    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print(clf.best_params_)
    print()
    #print("Detailed classification report:")
    #print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
    print(confusion_matrix(y_true, y_pred))

    print(best_score, clf.best_score_)
    if i == 1:
        break
    else:
        best_score = clf.best_score_
        # remove some features
        rfecv = RFECV(estimator=clf.best_estimator_, step=1, cv=2,
                scoring='accuracy')
        rfecv.fit(X_train, y_train)
        print("Optimal number of features : %d" % rfecv.n_features_)
        X_train = rfecv.transform(X_train)
Example #53
def main():
    csv_file_object = csv.reader(open('Data/train.csv',
                                      'rb'))  #Load in the training csv file
    header = csv_file_object.next()  #Skip the first line as it is a header
    train_data = []  #Create a variable called 'train_data'
    for row in csv_file_object:  #Loop through each row in the csv file
        train_data.append(row[1:])  #adding each row to the data variable
    train_data = np.array(train_data)  #Then convert from a list to an array

    #I need to convert all strings to integer classifiers:
    #Male = 1, female = 0:
    train_data[train_data[0::, 3] == 'male', 3] = -1
    train_data[train_data[0::, 3] == 'female', 3] = 1
    #embark c=0, s=1, q=2
    train_data[train_data[0::, 10] == 'C', 10] = -1
    train_data[train_data[0::, 10] == 'S', 10] = 0
    train_data[train_data[0::, 10] == 'Q', 10] = 1
    #Survived
    train_data[train_data[0::, 3] == 1, 0] = 1
    train_data[train_data[0::, 3] == 0, 0] = -1

    #I need to fill in the gaps of the data and make it complete.
    #So where there is no price, I will assume price on median of that class
    #Where there is no age I will give median of all ages

    imp = preprocessing.Imputer(missing_values=0, strategy='median', axis=0)

    #All the ages with no data make the median of the data
    #train_data[train_data[0::,4] == '',4] = np.median(train_data[train_data[0::,4]\
    #                                          != '',4].astype(np.float))
    #All missing embarks just make them embark from most common place
    #train_data[train_data[0::,10] == '',10] = np.round(np.mean(train_data[train_data[0::,10]\
    #                                                   != '',10].astype(np.float)))

    train_data = np.delete(train_data, [2, 7, 9, 10],
                           1)  #remove the name data, cabin and ticket
    train_data[train_data == ''] = '0'
    imp.fit_transform(train_data)
    #I need to do the same with the test data now so that the columns are in the same
    #as the training data

    #We finally split the data between train set and validation set
    x_train, x_test, y_train, y_test = train_test_split(train_data[0::, 1::],
                                                        train_data[0::, 0],
                                                        test_size=0.2,
                                                        random_state=0)

    #Standardise data
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_std = scaler.transform(x_train)
    x_test_std = scaler.transform(x_test)

    test_file_object = csv.reader(open('Data/test.csv',
                                       'rb'))  #Load in the test csv file
    header = test_file_object.next()  #Skip the first line as it is a header
    test_data = []  #Create a variable called 'test_data'
    ids = []
    for row in test_file_object:  #Loop through each row in the csv file
        ids.append(row[0])
        test_data.append(row[1:])  #adding each row to the data variable
    test_data = np.array(test_data)  #Then convert from a list to an array

    #I need to convert all strings to integer classifiers:
    #Male = 1, female = 0:
    test_data[test_data[0::, 2] == 'male', 2] = 1
    test_data[test_data[0::, 2] == 'female', 2] = -1
    #embark c=0, s=1, q=2
    test_data[
        test_data[0::, 9] == 'C',
        9] = -1  #Note this is not ideal: the numeric codes imply an ordering/scale that the categories don't actually have
    test_data[test_data[0::, 9] == 'S', 9] = 0
    test_data[test_data[0::, 9] == 'Q', 9] = 1

    #All the ages with no data make the median of the data
    #test_data[test_data[0::,3] == '',3] = np.median(test_data[test_data[0::,3]\
    #                                           != '',3].astype(np.float))
    #All missing embarks just make them embark from most common place
    #test_data[test_data[0::,9] == '',9] = np.round(np.mean(test_data[test_data[0::,9]\
    #                                                   != '',9].astype(np.float)))
    #All the missing prices assume median of their respective class
    #for i in xrange(np.size(test_data[0::,0])):
    #    if test_data[i,7] == '':
    #        test_data[i,7] = np.median(test_data[(test_data[0::,7] != '') &\
    #                                             (test_data[0::,0] == test_data[i,0])\
    #            ,7].astype(np.float))

    test_data = np.delete(test_data, [1, 6, 8, 9],
                          1)  #remove the name data, cabin and ticket
    test_data[test_data == ''] = '0'
    #Impute missing values
    imp.fit_transform(test_data)

    #Standardize
    scaler_test = preprocessing.StandardScaler().fit(test_data)
    test_data_std = scaler_test.transform(test_data)
    #The data is now ready to go. So lets train then test!

    start = time()
    print 'Training estimators'
    estimators = [('linearsvc', LinearSVC()),
                  ('KNeighborsClassifier', KNeighborsClassifier())]
    clf = Pipeline(estimators)
    # specify parameters and distributions to sample from
    param_dist = {
        "linearsvc__C": sp_randint(1, 1000),
        "linearsvc__loss": ["l1", "l2"],
        "linearsvc__dual": [True],
        "KNeighborsClassifier__n_neighbors": sp_randint(5, 100),
        "KNeighborsClassifier__weights": ["uniform", "distance"],
        "KNeighborsClassifier__algorithm": ["ball_tree", "kd_tree", "brute"],
        "KNeighborsClassifier__leaf_size": sp_randint(3, 100),
    }

    # run randomized search
    n_iter_search = 2000
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       n_jobs=4,
                                       verbose=1)
    random_search.fit(x_train_std, y_train)

    print 'Reporting'
    print(
        "RandomizedSearchCV took %.2f seconds for %d candidates"
        " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
    score = random_search.score(x_test_std, y_test)
    print 'Test score'
    print score
    print 'Predicting'
    output = random_search.predict(test_data_std)

    open_file_object = csv.writer(open("pipelinearsvcknn.csv", "wb"))
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(ids, output))
Example #54
print(rand.best_score_)
print(rand.best_params_)


# ### Making predictions for new data

# define X_new as the ingredient text
X_new = new.ingredients_str


# print the best model found by RandomizedSearchCV
rand.best_estimator_


# RandomizedSearchCV/GridSearchCV automatically refit the best model with the entire dataset, and can be used to make predictions
new_pred_class_rand = rand.predict(X_new)
new_pred_class_rand


# create a submission file (score: 0.75342)
pd.DataFrame({'id':new.id, 'cuisine':new_pred_class_rand}).set_index('id').to_csv('sub3.csv')


# ## Part 5: Adding features to a document-term matrix (using SciPy)
# 
# - So far, we've trained models on either the **document-term matrix** or the **manually created features**, but not both.
# - To train a model on both types of features, we need to **combine them into a single feature matrix**.
# - Because one of the matrices is **sparse** and the other is **dense**, the easiest way to combine them is by using SciPy.

# create a document-term matrix from all of the training data
X_dtm = vect.fit_transform(X)
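# A minimal sketch of the combination step described above, assuming
# `extra_features` is a dense 2-D NumPy array aligned row-for-row with X_dtm
# (the name and the zero placeholder are illustrative, not from the notebook):
import numpy as np
import scipy.sparse as sp
extra_features = np.zeros((X_dtm.shape[0], 2))
X_combined = sp.hstack([X_dtm, sp.csr_matrix(extra_features)], format='csr')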
Example #55
# In[ ]:


print(model_svm.grid_scores_)

# In[ ]:


print(model_svm.best_params_)

# In[ ]:



y_pred = model_svm.predict(X_test)
print(metrics.accuracy_score(y_pred,y_test))

# In[ ]:


confusion_matrix=metrics.confusion_matrix(y_test,y_pred)
confusion_matrix

# In[ ]:


class_report=metrics.classification_report(y_test,y_pred)
class_report

# In[ ]:
Example #56
                                   param_distributions=param_dister,
                                   n_iter=n_iter_search,
                                   n_jobs=2)
start = time()
random_search.fit(X, y)

print("RandomizedSearchCV took %.2f s for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

report(random_search.grid_scores_)

# Load the testing data
test_mat = genfromtxt(TRAINING_INPUT_DIRECTORY + '/testing_matrix.csv',
                      delimiter=',')

test_y = test_mat[:, 0]
test_x = test_mat[:, 1:]

y_true, y_pred = test_y, random_search.predict(test_x)

print("Raw metirc result :")
print(classification_report(y_true, y_pred))
print('Accuracy : ' + str(accuracy_score(y_true, y_pred)) + '\n')

mod_y_pred = list(map(lambda x: x if x == 1 else -1, y_pred))
mod_y_true = list(map(lambda x: x if x == 1 else -1, y_true))

print("More reasonable metirc result : ")
print(classification_report(mod_y_true, mod_y_pred))
print('Accuracy : ' + str(accuracy_score(mod_y_true, mod_y_pred)) + '\n')