def perform_classifier_cross_validation(classifier, dtm_train, targets_train,
                                        dtm_test, targets_test):
    cv = 3
    k_fold = KFold(len(targets_train), n_folds=cv, shuffle=True,
                   random_state=42)
    scoring = 'f1_macro'
    scores = cross_validation.cross_val_score(classifier, dtm_train,
                                              targets_train, cv=k_fold,
                                              scoring=scoring)

    print("Same classifier with cross validation:")
    print("Scores for folds (" + str(cv) + "): " + str(scores))
    print(scoring + ": %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    targets_train_predicted = cross_validation.cross_val_predict(classifier,
                                                                 dtm_train,
                                                                 targets_train,
                                                                 cv=cv)

    print_classifier_metrics(targets_train, targets_train_predicted,
                             "train-with-cv")

    targets_test_predicted = cross_validation.cross_val_predict(classifier,
                                                                dtm_test,
                                                                targets_test,
                                                                cv=cv)

    print_classifier_metrics(targets_test, targets_test_predicted,
                             "test-with-cv")
    
    return classifier
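
The helper print_classifier_metrics used above is not part of this snippet; a minimal stand-in (an assumption, not the original implementation) that prints accuracy plus a per-class report for the given tag could look like this:

from sklearn import metrics

def print_classifier_metrics(targets_true, targets_predicted, tag):
    # Hypothetical stand-in: overall accuracy plus per-class precision/recall/F1
    print("[%s] accuracy: %0.3f"
          % (tag, metrics.accuracy_score(targets_true, targets_predicted)))
    print(metrics.classification_report(targets_true, targets_predicted))
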
def apply_cross_validated_learning(datasetname, X, y, resultsfolder, nfolds=5):

    dataspacename = datasetname + "_nfolds-" + str(nfolds)
    experimentrootpath = IOtools.ensure_dir(os.path.join(resultsfolder, dataspacename))
    scorefilepath = os.path.join(experimentrootpath, metaexperimentation.scorefilename+".csv")
    metaexperimentation.initialize_score_file(scorefilepath)
    
    # SVM
    kernels = ["linear", "rbf", "sigmoid", "poly"]
    Cs = [1, 10, 100, 1000]
    
    for kernel in kernels:
        for c in Cs:
            
            alg = "SVM"
            modelname = "_m-" + alg + "_k-" + kernel + "_C-" + str(c)
            experimentname = "nfolds-" + str(nfolds) + modelname
            
            clf = svm.SVC(kernel=kernel, C=c)
            ypredicted = cross_validation.cross_val_predict(clf, X, y, cv=nfolds)
            #print metrics.accuracy_score(y, ypredicted)
            reportresults(y, ypredicted, experimentname, experimentrootpath, scorefilepath)
    
    
    # Naive Bayes
    NBmodels = [naive_bayes.MultinomialNB(), naive_bayes.GaussianNB()]
    for nbmodel in NBmodels:
        alg = "NB"
        modelname = "_m-" + nbmodel.__class__.__name__
        experimentname = "nfolds-" + str(nfolds) + modelname
        
        ypredicted = cross_validation.cross_val_predict(nbmodel, X, y, cv=nfolds)
        reportresults(y, ypredicted, experimentname, experimentrootpath, scorefilepath)
Example #3
def main():
    
    parser = argparse.ArgumentParser(description='Train an ML model')
    required = parser.add_argument_group('required options')

    required.add_argument('-x', '--trainfile', required=True, help='File containing training data')
    required.add_argument('-y', '--targetfile', required=True, help='File containing target data')
    #required.add_argument('-o', '--modelfile', required=True, help='Output filename for trained model object')
    #required.add_argument('-t', '--targettype', default=int)
    
    args = parser.parse_args()

    #X = np.loadtxt(args.trainfile, skiprows=1)
    X = np.loadtxt(args.trainfile)
    #Y = np.loadtxt(args.targetfile, dtype=args.targettype)
    #Y = np.loadtxt(args.targetfile)   
    Y = np.genfromtxt(args.targetfile,dtype='str')

    assert len(X) == len(Y), "length mismatch between train and target data"

    clf1 = linear_model.LogisticRegression(penalty='l2',C=1e5,solver='newton-cg',tol=0.00001)
    clf1.fit(X, Y)
    predicted1=cross_validation.cross_val_predict(clf1,X,Y,cv=2)
    print("Prediction accuracy of logistic regression : ", metrics.accuracy_score(Y, predicted1))
    #predicted=cross_validation.cross_val_predict(clf1,x,x_tr,cv=2)
    
    clf2 = svm.SVC(C=1e5,kernel='rbf')
    clf2.fit(X, Y)
    predicted2=cross_validation.cross_val_predict(clf2,X,Y,cv=2)
    print("Prediction accuracy of SVM : ", metrics.accuracy_score(Y, predicted2))

    clf3 = naive_bayes.BernoulliNB(alpha=1.9)
    clf3.fit(X, Y)
    predicted3=cross_validation.cross_val_predict(clf3,X,Y,cv=2)
    print("Prediction accuracy of naive bayes : ", metrics.accuracy_score(Y, predicted3))

    clf4 = tree.DecisionTreeClassifier(criterion='entropy')
    clf4.fit(X, Y)
    predicted4=cross_validation.cross_val_predict(clf4,X,Y,cv=2)
    print("Prediction accuracy of decision trees : ", metrics.accuracy_score(Y, predicted4))
        
    #with open(args.modelfile, "wb") as outfile:
    #    pickle.dump(clf1, outfile, pickle.HIGHEST_PROTOCOL)
    
    with open('bin_file_lr',"wb") as outfile1:
         pickle.dump(clf1, outfile1, pickle.HIGHEST_PROTOCOL)

    with open('bin_file_svm',"wb") as outfile2:
         pickle.dump(clf2, outfile2, pickle.HIGHEST_PROTOCOL)

    with open('bin_file_bayes',"wb") as outfile3:
         pickle.dump(clf3, outfile3, pickle.HIGHEST_PROTOCOL)

    with open('bin_file_dtree',"wb") as outfile4:
         pickle.dump(clf4, outfile4, pickle.HIGHEST_PROTOCOL)
def test_cross_val_predict_sparse_prediction():
    # check that cross_val_predict gives same result for sparse and dense input
    X, y = make_multilabel_classification(
        n_classes=2, n_labels=1, allow_unlabeled=False, return_indicator=True, random_state=1
    )
    X_sparse = csr_matrix(X)
    y_sparse = csr_matrix(y)
    classif = OneVsRestClassifier(SVC(kernel="linear"))
    preds = cval.cross_val_predict(classif, X, y, cv=10)
    preds_sparse = cval.cross_val_predict(classif, X_sparse, y_sparse, cv=10)
    preds_sparse = preds_sparse.toarray()
    assert_array_almost_equal(preds_sparse, preds)
def test_cross_val_predict_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((Series, DataFrame))
    except ImportError:
        pass
    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cval.cross_val_predict(clf, X_df, y_ser)
Example #6
def run(params):
    train = loadDataFrame(params,'train')
    if params['test']:
        test = loadDataFrame(params,'test')
        
    train = runPreprocess(train,params)
    clf = getSpecifiedClf(params)
    try:
        dataset,target = splitDatasetTarget(train,params['target'])
    except:
        raise Exception('Target not specified')
    try:
        cross_val = params['cross_validate']
    except:
        cross_val = False
        
    clfName = getNameFromModel(clf)
    if cross_val and clfName != 'XGBClassifier':
        print('Beginning cross validation')
        predicted = cross_validation.cross_val_predict(clf,dataset,target,cv=5,n_jobs=-1)
        accuracyChecker(target,predicted)
        return
        
    if clfName == 'XGBClassifier':
        print('Xgboost CV selected. Beginning to find optimal rounds')
        clf = xgboostCV(clf,dataset,target)
        print('Xgboost Accuracy on 80-20 split (for speed)')
            
    trainX,testX,trainY,testY = splitTrainTest(dataset,target)
    clf.fit(trainX,trainY)
    predicted = clf.predict(testX)
    accuracyChecker(testY,predicted)

        
Example #7
def get_testing_metrics(model, X, y, metrics, as_indexes, n_folds, X_test=None):
    y_pred = cross_val_predict(
        model,
        X,
        y,
        cv=StratifiedKFold(
            y,
            n_folds=n_folds,
            shuffle=True,
            random_state=RANDOM_STATE
        )
    )
    print "y_pred", y_pred
    model.fit(X, y)
    result = get_y_true_y_pred_based_metrics(y, y_pred, metrics)
    if FEATURES in metrics:
        result[FEATURES] = model.get_support(indices=True)
    if OBJECTS in metrics:
        if as_indexes:
            result[OBJECTS] = [get_data_keeper().get_object_name_by_index(index) for (index,) in X]
        else:
            result[OBJECTS] = list(X.index)
    if TEST_PREDICTIONS in metrics:
        result[TEST_PREDICTIONS] = X_test, model.predict(X_test)
    return result
 def cross_validate(self):
     progress_logger.info("Starting cross validation.")
     validate_clf = linear_model.LogisticRegression(class_weight=self.weights)
     predictions = cross_validation.cross_val_predict(validate_clf, self.X, self.Y.ravel(), cv=5)
     fp_count = 0.0
     tp_count = 0.0
     fn_count = 0.0
     tn_count = 0.0
     miscount = 0.0
     for i in range(len(predictions)):
         prediction = predictions[i]
         expected = self.Y[i][0]
         if prediction == 1 and expected == 1:
             tp_count += 1
         elif prediction == 1 and expected == 0:
             fp_count += 1
         elif prediction == 0 and expected == 1:
             fn_count += 1
         elif prediction == 0 and expected == 0:
             tn_count += 1
         else:
             miscount += 1
     if miscount > 0:
         debug_logger.warn("During cross validation, found {} miscounts.".format(miscount))
     total_count = fp_count + tp_count + fn_count + tn_count
     self.validation_accuracy = (tp_count + tn_count) / total_count if total_count != 0 else 0.0
     fp_rate = fp_count / (fp_count + tn_count) if fp_count + tn_count != 0 else 0.0
     fn_rate = fn_count / (fn_count + tp_count) if fn_count + tp_count != 0 else 0.0
     progress_logger.info("Confusion matrix - True positives: {}, False positives: {}, False negatives: {}, True negatives: {}".format(
         tp_count, fp_count, fn_count, tn_count))
     progress_logger.info("Validation Accuracy: {}".format(self.validation_accuracy))
     progress_logger.info("False positive rate: {}".format(fp_rate))
     progress_logger.info("False negative rate: {}".format(fn_rate))
def crossvalidation(x, y):
    """
    Cross validation metric. Also plot confusion matrix and save cls if flags are set to 1.
    :param x: features (valence, arousal)
    :param y: target (emotion)
    :return:
    """
    c_array = np.logspace(0, 3, 4)
    gamma_array = np.logspace(-3, 3, 7)

    # feature scaling
    if feature_scaling:
        std_scale = preprocessing.StandardScaler().fit(x)
        x = std_scale.transform(x)

    for c in c_array:
        for gamma in gamma_array:
            # other kernel options: 'rbf', 'poly'; note that gamma has no effect with the linear kernel
            clf = svm.SVC(kernel='linear', C=c, gamma=gamma)
            scores = cross_validation.cross_val_score(clf, x, y, cv=3)
            print("Accuracy: %0.2f (+/- %0.2f) %f %f" % (scores.mean(), scores.std() * 2, c, gamma))
            pred = cross_validation.cross_val_predict(clf, x, y, cv=3)
            print("Classes accuracy: ", classes_accuracy(y, pred))

    print(np.array(y))
    print(pred)

    #plot last one, not best, CARE!!!
    if plot_confusion_matrix:
        confusion_matrix.prepare_plot(y, pred)

    if save_clf:
        clf.fit(x, y)
        joblib.dump(clf, 'classifiers\\'+configuration.get('clf_name')+'.pkl')
Example #10
def main():
    pickle_folder = '../pickles_no_rms'
    pickle_folders_to_load = [f for f in os.listdir(pickle_folder) if os.path.isdir(join(pickle_folder, f))]
    pickle_folders_to_load = sorted(pickle_folders_to_load)
    pickle_folders_to_load = [p for p in pickle_folders_to_load if 'drums1__' not in p]

    sdr_type = 'background'

    fits = []
    sdrs = []
    for pick in pickle_folders_to_load:
        beat_spec_name = join(pickle_folder, pick, pick + '__beat_spec.pick')
        beat_spec = pickle.load(open(beat_spec_name, 'rb'))

        entropy, log_mean = beat_spectrum_prediction_statistics(beat_spec)
        fit_X = [entropy, log_mean]
        fits.append(fit_X)

        sdrs_name = join(pickle_folder, pick, pick + '__sdrs.pick')
        sdr_vals = pickle.load(open(sdrs_name, 'rb'))
        cur_sdr = sdr_vals[sdr_type][0]
        sdrs.append(cur_sdr)

    fits = np.array(fits)
    sdrs = np.array(sdrs).reshape(-1, 1)
    knn = neighbors.KNeighborsRegressor(5, weights='distance')
    scores = cross_validation.cross_val_predict(knn, fits, sdrs, cv=10, verbose=1)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean() - sdrs.mean(), scores.std() * 2))
Example #11
def classify_cv(data, cats, k):
    clf = svm.SVC(gamma=0.001, C=100.)
    vect = TfidfVectorizer(analyzer = 'word', stop_words = stopwords)
    tfidf_matrix = vect.fit_transform(data)
    predicted = cross_validation.cross_val_predict(clf, tfidf_matrix, cats, cv=k)
    conf_matrix = metrics.confusion_matrix(cats, predicted)
    print (metrics.classification_report(cats, predicted))
def main():
    pickle_folder = "../mir_1k/pickles"
    pickle_folders_to_load = [f for f in os.listdir(pickle_folder) if "__beat_spec.pick" in f]
    pickle_folders_to_load = sorted(pickle_folders_to_load)

    sdr_type = "background"

    fits = []
    sdrs = []
    for pick in pickle_folders_to_load:
        pick = pick.replace("__beat_spec.pick", "")
        beat_spec_path = join(pickle_folder, pick + "__beat_spec.pick")
        beat_spec = pickle.load(open(beat_spec_path, "rb"))

        entropy, log_mean = beat_spectrum_prediction_statistics(beat_spec)
        fit_X = [entropy, log_mean]
        fits.append(fit_X)

        sdrs_name = join(pickle_folder, pick + "__sdrs.pick")
        sdr_vals = pickle.load(open(sdrs_name, "rb"))
        cur_sdr = sdr_vals[sdr_type][0]
        sdrs.append(cur_sdr)

    fits = np.array(fits)
    sdrs = np.array(sdrs).reshape(-1, 1)
    knn = neighbors.KNeighborsRegressor(5, weights="distance")
    scores = cross_validation.cross_val_predict(knn, fits, sdrs, cv=10, verbose=1)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def main():
    start_time = time.clock()

    vectors, labels, anchors = prepare_data()

    print("Gonna go out and classify. Wish me luck.")
    support_vector_machine = grid_search.GridSearchCV(svm.SVC(), PARAM_GRID, cv = GRID_SEARCH_CV)
    classifier = pipeline.make_pipeline(preprocessing.StandardScaler(), support_vector_machine)

    predicted_labels = cross_validation.cross_val_predict(classifier, vectors, labels, cv = CV)

    end_time = time.clock()

    file_string = str(datetime.datetime.today()) + "-" + "-".join( DIRECTORIES + [str(ANCHORS_PER_CLASS) + "anchors", str(ITEMS_PER_CLASS) + "items"] ) + ".txt"
    with open( os.path.join("reports", file_string), "w") as f:
        date = "date: " + str(datetime.datetime.now())
        compressor_filters = pretty_print(lzma_filters)
        anchors_used = "anchors used: " + pretty_print(anchors)
        time_indication = "indication of time spent: " + str(end_time - start_time)
        anchors = "anchors per class: " + str(ANCHORS_PER_CLASS)
        preloaded_anchors = "Used preloaded anchors: " + str(USE_REPRESENTATIVE_ANCHORS)
        grid_search_cv = "grid search cv: " + str(GRID_SEARCH_CV)
        cv = "cv: " + str(CV)
        report = metrics.classification_report(labels, predicted_labels, digits=4)
        print report + "\n"
        print time_indication

        f.writelines("\n".join([date, compressor_filters, anchors_used, time_indication, anchors, preloaded_anchors, grid_search_cv, cv, report]))
Example #14
def run_xval(prefix, clf, data, cv, features=cgo13_features, seed=1):
    X = features(data)
    y = getlabels(data)

    predicted = cross_validation.cross_val_predict(clf, X, y, cv=cv)

    return Metrics(prefix, data, predicted, clf)
Example #15
 def test_knn_score_equal_sklearn_loocv_score(self):
     acc, correct, cmat = \
         score(self.distance, self.label, k=5, metric='distance')
     # scoring only one k value, so take just the first elements:
     acc = acc[0, 0]
     correct = correct[0]
     cmat = cmat[0]
     # This should work too, but is much slower than using precomp. dist.
     #=======================================================================
     # knclassifier = KNeighborsClassifier(n_neighbors=5, algorithm='brute', 
     #                                     metric='cosine')
     #=======================================================================
     knclassifier = KNeighborsClassifier(n_neighbors=5, algorithm='brute', 
                                         metric='precomputed')
     n = self.distance.shape[0] # for LOO-CV
     loo_cv = LeaveOneOut(n)
     predicted_sklearn = cross_val_predict(
         knclassifier, self.distance, self.label, cv=loo_cv)
     acc_sklearn = accuracy_score(self.label, predicted_sklearn)
     if not np.allclose(acc, acc_sklearn):
         return self.assertAlmostEqual(acc, acc_sklearn, places=7)
     else:
         correct_sklearn = predicted_sklearn == self.label
         equal_prediction = np.all(correct == correct_sklearn)
         msg = """Accuracies of hub toolbox k-NN and sklearn-kNN are almost 
                  equal, but the predictions per data point are not."""
         return self.assertTrue(equal_prediction, msg)
def main():

    dataTuples=getDataInFormat()
    print "Length of dataTuples is: ",  len(dataTuples)
    shuffle(dataTuples)
    trainTuples=dataTuples
    del dataTuples
    ids, labels, vectors= getLabelsAndVectors(trainTuples)
    del trainTuples
    followerCountsList = loadFollowerCountsFromFile()
    space=getSpace(vectors)
    reducedSpace=getReducedSpace(vectors, space)
    spaceWithMetaFeatures= augmentSpace(reducedSpace, emotionFeatures)

    print "Total # of features in your space is: ", len(space)
    print "Total # of features in your reducedSpace is: ", len(reducedSpace)
    oneHotVectors=getOneHotVectors(ids, labels, vectors,spaceWithMetaFeatures , followerCountsList)
    trainVectors, trainLabels=getOneHotVectorsAndLabels(oneHotVectors)
    del oneHotVectors
    clf = OneVsRestClassifier(SVC(C=1, kernel = 'linear',gamma=0.1, verbose= False, probability=False))
    clf.fit(trainVectors, trainLabels)
    
    print "\nDone fitting classifier on training data...\n"
    print "\nDone fitting classifier on training data...\n"
    print "="*50, "\n"
    print "Results with 10-fold cross validation:\n"
    print "="*50, "\n"
    predicted = cross_validation.cross_val_predict(clf, trainVectors, trainLabels, cv=10)
    print "*"*20
    print "\t accuracy_score\t", metrics.accuracy_score(trainLabels, predicted)
    print "*"*20
    print "precision_score\t", metrics.precision_score(trainLabels, predicted)
    print "recall_score\t", metrics.recall_score(trainLabels, predicted)
    print "\nclassification_report:\n\n", metrics.classification_report(trainLabels, predicted)
    print "\nconfusion_matrix:\n\n", metrics.confusion_matrix(trainLabels, predicted)
def regression_test(X, Y):
	norm_train_x = preprocessing.MinMaxScaler((-1,1)).fit_transform(X)

	max_layer_size = len(x_cols)**2
	max_layers = [Layer("Sigmoid", units=max_layer_size/4),
				  Layer("Sigmoid", units=max_layer_size/2),
				  # Layer("Sigmoid", units=max_layer_size/2),
				  # Layer("Sigmoid", units=max_layer_size/4),
				  Layer("Linear")]
	nn = Regressor(layers=max_layers,learning_rate=0.08, n_iter=300)

	regressors = [('Random Forest Regressor', RandomForestRegressor(n_estimators=100), False), 
				   ('AdaBoost Regressor', AdaBoostRegressor(), False), 
				   ('SVR', SVR(), False), 
				   ('Neural Net w/ Sigmoid -> Sigmoid -> Linear', nn, True)]

	for name, reg, norm in regressors:
		if norm:
			train_x = norm_train_x
		else:
			train_x = X
		print name

		preds = cross_validation.cross_val_predict(reg, train_x, Y, cv=K)
		print 'R^2:', metrics.r2_score(Y, preds)
Example #18
def checkSkflowAccuracy(dataset,target):
    # baseline: 0.6923 with max_feat=0.5
    classifier = RandomForestClassifier(max_depth=8, n_estimators=500, n_jobs=8, random_state=1, max_features=0.9)
    predicted = cross_validation.cross_val_predict(classifier,dataset,target,cv=5)
    score = metrics.accuracy_score(target,predicted)
    print("Accuracy: " + str(score))
    print(metrics.confusion_matrix(target,predicted,labels=[0,1,2,3,4,5]))
def training(features, targets, feature_description,
             validation_features, model_flag):
    """
    Train the data with XGBoost model and 10-cross fold validation
    method. Output the result in confusion matrix.
    :param model_flag:
    :param validation_features:
    :param features: X, 2-D matrix
    :param targets: Y 1-D target array
    :param feature_description: brief description of the feature
    """
    model_name = model_name_dict[model_flag]
    model = model_dict[model_flag]
    model.fit(features, targets)
    prediction = model.predict(validation_features)

    file_names = np.load('ZL_validation_file_names.npy')
    validation_result = open('validation_result_' + model_name +
                             feature_description, 'w')

    # output validation result with specified format.
    p = re.compile('(validation\.[0-9]+)')
    for i in range(len(prediction)):
        # format: validation_xxxxx type
        print >> validation_result, \
            p.findall(file_names[i])[0].replace('.', '_'), \
            type_array[int(prediction[i])]
    validation_result.close()

    prediction = cross_validation.cross_val_predict(
            model, features, targets, cv=10)

    cm = confusion_matrix(targets, prediction)
    output_confusion_matrix_tex(
            cm, model_name + '_' + feature_description)
def eval_log_reg(the_training_data, the_truth): 
    K_FOLD = 10
    
    # Logistic regression
    lr = linear_model.LogisticRegression()

    # Evaluate
    scores = cross_validation.cross_val_score(lr, the_training_data, the_truth, cv=K_FOLD)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    
    predicted = cross_validation.cross_val_predict(lr, the_training_data, the_truth, cv=K_FOLD)
    print "Confusion matrix:"
    print metrics.confusion_matrix(the_truth, predicted)
    
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        the_training_data, the_truth, test_size=1.0/K_FOLD, random_state=0)
    lr.fit(X_train, y_train)
    labels = X_train.columns
    coefficients = [(labels[i],val) for i,val in enumerate(lr.__dict__['coef_'][0])]
    coefficients.sort(key=lambda x: abs(x[1]), reverse=True)
    print "Most predictive features:"
    for i in range(0,5):
        print "    %s: %0.2f" % (coefficients[i][0], coefficients[i][1])
    
    numExamples = np.shape(X_train)[0]
    print "Training examples: %d" % numExamples
    usedUtterances = [example.split(".csv_")[0] for example in X_train.index]
    numUtterances = len(set(usedUtterances))
    print  "Training utterances: %d" % numUtterances
    
    return [scores.mean(), scores.std() * 2, len(coefficients), numExamples, numUtterances]
 def fit(self,X,y):
     '''
     fit the model
     '''
     if self.use_append == True:
         self.__X = X
         self.__y = y
     elif self.use_append == False:
         self.__y = y
         temp = []
     
     for clf in self.stage_one_clfs:
         y_pred = cross_val_predict(clf[1], X, y, cv=5, n_jobs=1)
         clf[1].fit(X,y)
         y_pred  = np.reshape(y_pred,(len(y_pred),1))
         if self.use_append == True:
             self.__X = np.hstack((self.__X,y_pred))
         elif self.use_append == False:
             temp.append(y_pred)
             
         if self.print_scores == True:
             score = mean_squared_error(self.__y,y_pred)
             print("Score of %s: %0.3f" %(clf[0],score))
             
     if self.use_append == False:
         self.__X = np.array(temp).T[0]
     # fit the second stage models
     for clf in self.stage_two_clfs:
         clf[1].fit(self.__X,self.__y)        
Example #22
def run_svm(x, y):
    s = svm.SVR()
    scores = cross_validation.cross_val_score(s, x, y, cv=10)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    predictions = cross_validation.cross_val_predict(s, x, y, cv=10)

    return predictions
    def transform(self, X):
        # Purpose of skip is to skip the estimator
        if self.skip:
            return X

        # Is the data being transformed the same as the training data
        is_train_data = False
        if isinstance(X, pd.DataFrame) and self.hashed_value == hash(X.values.data.tobytes()):
            is_train_data = True
        if isinstance(X, np.ndarray) and self.hashed_value == hash(X.data.tobytes()):
            is_train_data = True

        # If the dataset is the training data, use CV predictions
        if is_train_data:
            feature = cross_val_predict(clone(self.model), X, self.y)#, cv=self.train_cv)

        # Otherwise, use the model to predict
        else:
            feature = self.model.predict(X)

        # Add feature to dataset
        if isinstance(X, pd.DataFrame):
            X[self.feature_name] = feature
        if isinstance(X, np.ndarray):
            X = np.c_[X, feature]
        return X
def predict_evaluate_models(fn ,ax=None, sel=["Penalties_Conceeded","Tries_Scored"], goal="Referee", verbosity=0):
    class_weight = 'auto'
    X, y, names = data_prepare(fn, sel=sel, goal=goal, verbosity=verbosity-1)
    if verbosity > 2:
        y_shuffled = y.copy()
        np.random.shuffle(y_shuffled)
        print ("All zeros accuracy:",1.0-np.sum(y)/len(y)) 
        print ("y_shuffled f1_csore:",metrics.f1_score(y, y_shuffled))

    n_folds = 10
    cv = cross_validation.StratifiedKFold(y, n_folds=n_folds)
    #cv = cross_validation.LeaveOneOut(n=len(y))
    results = []
    for sclf in ('svm','svmp','svmr','lgCV','gnb','rf','knc'):
        clf = get_clf(sclf,class_weight=class_weight)
        y_pred = cross_validation.cross_val_predict(clf, X, y, cv=cv)
        #print "pred:",y_pred
        res = [
            metrics.accuracy_score(y, y_pred),
            metrics.precision_score(y, y_pred),
            metrics.recall_score(y, y_pred),
            metrics.f1_score(y, y_pred),
            ]
        if verbosity > 0:
            print (sclf,res) 
        results.append( (sclf,res) )

    return results
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--data_dir","-dr",default = "/home/1546/code/keyword_extraction/stanford_parser/no_location_features")
    parser.add_argument('--method','-m',type=int,default=0,choices=range(4),
        help=
        """choose a method from:
                0: linear_svc
                1: logistic regression
                2: naive bayes
                3: decision tree
        """)
    parser.add_argument("--top_size","-ts",type=int,default = 20)
    parser.add_argument("--need_positive","-np",action='store_true')

    args=parser.parse_args()
    X,y,entity_info = load_data_set(args.data_dir)
    clf = get_classifier(args.method)
    predicted = cross_validation.cross_val_predict(clf,X,y,cv=5)
    #accuracy = metrics.accuracy_score(y,predicted)
    #f1 = metrics.f1_score(y,predicted)

    #print "performance:"
    #print "accuracy: %f, f1: %f" %(accuracy,f1)
    
    show_performance_on_entity_types(y,predicted,entity_info)
    print classification_report(y, predicted)
def tune_and_train_rf(X_train, y_train, strat_k_fold=None):
    '''
    Uses oob estimates to find optimal max_depth between None + 0...20
    Refits with best max_depth
    '''
    oob_r2 = []
    cv_list = [None] + range(1, 20)
    for md in cv_list:
        rf = RandomForestRegressor(n_estimators=100, max_depth=md, oob_score=True, random_state=0, n_jobs=-1)
        rf.fit(X_train, y_train)
        oob_r2.append(rf.oob_score_)

    best_max_depth = cv_list[np.argmax(oob_r2)]
    print("best max_depth: %s" % best_max_depth)

    # CV
    rf = RandomForestRegressor(n_estimators=100, max_depth=best_max_depth, oob_score=True, random_state=0, n_jobs=-1)

    cv_results = None
    if strat_k_fold:
        y_predicted_cv = cross_val_predict(rf, X_train, y_train, cv=strat_k_fold, n_jobs=-1)
        cv_r2 = []
        cv_mae = []
        for k_train, k_test in strat_k_fold:
            cv_r2.append(r2_score(y_train[k_test], y_predicted_cv[k_test]))
            cv_mae.append(mean_absolute_error(y_train[k_test], y_predicted_cv[k_test]))
        cv_results = {'y_predicted_cv': y_predicted_cv,
                      'cv_r2': cv_r2,
                      'cv_mae': cv_mae,
                      'oob_r2': oob_r2}

    # refit
    rf.fit(X_train, y_train)
    return rf, cv_results
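
A usage sketch for tune_and_train_rf (hypothetical; X_train and y_train are placeholders): strat_k_fold is expected to be an old-style CV object that can both be iterated as (train, test) index pairs and be passed as cv=, for example:

from sklearn.cross_validation import KFold

folds = KFold(len(y_train), n_folds=5, shuffle=True, random_state=0)
rf_model, cv_results = tune_and_train_rf(X_train, y_train, strat_k_fold=folds)
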
def main():
    dataset = samples.get_dataset()

    X, y, page_labels = build_Xy_from_pages_dataset(dataset)
    clf = create_classifier()

    # this gives the prediction result for every element
    # when it was in the test dataset during cross validation
    cv_iter = cross_validation.LabelKFold(page_labels, n_folds=10)
    predicted = cross_validation.cross_val_predict(clf, X, y, cv=cv_iter)

    cm = metrics.confusion_matrix(y, predicted)
    print('\nConfusion matrix:')
    print(cm, '\n\n')
    print(metrics.classification_report(y, predicted))

    print('Training and peeking at the word weights...')
    X_train, y_train = X[:-20], y[:-20]
    clf = get_trained_classifier(X_train, y_train)
    cv = clf.steps[-2][1]
    svc = clf.steps[-1][1]
    word_weights = zip(svc.coef_[0], cv.vocabulary_)

    print('Top 10 weights for negative cases')
    for weight, word in sorted(word_weights)[:10]:
        print('%0.5f  %s' % (weight, word))

    print('\nTop 10 weights for positive cases')
    for weight, word in sorted(word_weights)[-10:][::-1]:
        print('%0.5f  %s' % (weight, word))

    import pickle
    with open('classifier.pickle', 'wb') as f:
        pickle.dump(clf, f)
def ada_boost_cv(x_train,
                 y_train,
                 cv,
                 max_tree_depth,
                 n_estimators,
                 learning_rate):

    tree_classifier = DecisionTreeClassifier(max_depth=max_tree_depth,
                                             class_weight="balanced")


    ada_boost_classifier = AdaBoostClassifier(base_estimator=tree_classifier,
                                              n_estimators=n_estimators,
                                              learning_rate=learning_rate)

    y_bar = cross_val_predict(estimator=ada_boost_classifier,
                              X=x_train,
                              y=y_train,
                              cv=cv,
                              n_jobs=cv)

    # cross_val_predict does not fit the passed estimator, so fit it on the
    # training data before asking for class probabilities
    ada_boost_classifier.fit(x_train, y_train)
    y_bar_proba = ada_boost_classifier.predict_proba(x_train)
    print(list(zip(y_bar, y_bar_proba)))

    cm = confusion_matrix(y_train,y_bar)

    accuracy_negative = cm[0,0] / np.sum(cm[0,:])
    accuracy_positive = cm[1,1] / np.sum(cm[1,:])

    precision = cm[1,1] / (cm[1,1] + cm[0,1])
    recall = cm[1,1] / (cm[1,1] + cm[1,0])

    f1_score = 2 * precision * recall / (precision + recall)

    return accuracy_positive, accuracy_negative, precision, recall, f1_score
def kfCrossVal(loansData):
    
    # Import required libraries
    from sklearn.cross_validation import cross_val_predict
    from sklearn import linear_model
    import sklearn.metrics as met
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import PolynomialFeatures

    # Create linear regression model using FICO score as the only predictor
    # Interest Rate is the dependent variable
    lr = linear_model.LinearRegression()
    y = loansData.as_matrix(columns=['Interest.Rate'])
    x = loansData[['Loan.Length', 'FICO.Score']].as_matrix()

    # Run the kfold cross validation and store the results as an array
    predicted = cross_val_predict(lr, x, y, cv=10)

    # Try and run as quadratic?
    # POLY2 = smf.ols(formula = 'Y ~ 1 + X + I(X**2)', data=TRAIN_DF).fit()
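    # One possible quadratic variant (a sketch, not part of the original script):
    # wrap the already-imported PolynomialFeatures in a pipeline so the degree-2
    # expansion is refit inside every cross-validation fold.
    # from sklearn.pipeline import make_pipeline
    # quad_model = make_pipeline(PolynomialFeatures(degree=2), linear_model.LinearRegression())
    # predicted_quad = cross_val_predict(quad_model, x, y, cv=10)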

    # Calculate MAE, MSE, and R2
    print("Mean Absolute Error: {}".format(met.mean_absolute_error(y, predicted)))
    print("Mean Squared Error: {}".format(met.mean_squared_error(y, predicted)))
    print("R Squared: {}".format(met.r2_score(y, predicted)))

    # Plot the actual versus predicted values
    fig, ax = plt.subplots()
    ax.scatter(y, predicted)
    ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()
Example #30
def benchmark(clf_class, params, name):
    print("parameters:", params)
    t0 = time()
    clf = clf_class(**params).fit(X_train, y_train)
    print("done in %fs" % (time() - t0))
    t0 = time()
    pred = clf.predict(X_test)
    print("done in %fs" % (time() - t0))
    #execute_prediction(clf)
    print(clf.score(X_test, y_test))
    predicted = cross_validation.cross_val_predict(clf, X,
                                                y, cv=10)
    score = metrics.accuracy_score(y, predicted)
    print(score)

    print("Classification report on test set for classifier:")
    print(clf)
    print()
    print(classification_report(y_test, pred,
                                target_names=target_names))

    cm = confusion_matrix(y_test, pred)
    print("Confusion matrix:")
    print(cm)

    # Show confusion matrix
    #pl.matshow(cm)
    #pl.title('Confusion matrix of the %s classifier' % name)
    #pl.colorbar()

    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    plt.figure()
    plot_confusion_matrix(cm)
Example #31
# plt.figure()
# for color, i, target_name in zip(colors, [0, 1, 2], target_names):
#     plt.scatter(X_r2[Y == i, 0], X_r2[Y == i, 1], alpha=.8, color=color,
#                 label=target_name)
# plt.legend(loc='best', shadow=False, scatterpoints=1)
# plt.title('LDA of Data')

plt.show()

import ipdb
ipdb.set_trace()
h = 0.02  #step size in mesh

clf = LogisticRegression(C=1e5, penalty='l2')
clf.fit(X, Y)
predicted = cross_validation.cross_val_predict(clf, X, Y, cv=5)
print "accuracy score: ", metrics.accuracy_score(Y, predicted)
print "precision score: ", metrics.precision_score(Y,
                                                   predicted,
                                                   average='weighted')
print "recall score: ", metrics.recall_score(Y, predicted, average='weighted')

Y_test = clf.predict_proba(X_test)
# create submission
submission = pd.DataFrame(Y_test,
                          columns=['predict_0', 'predict_1', 'predict_2'])
submission.head()
submission['id'] = testDF.index.values
cols = submission.columns.tolist()
cols = cols[-1:] + cols[:-1]
submission = submission[cols]
kf = cross_validation.KFold(len(y_train),
                            n_folds=5,
                            shuffle=True,
                            random_state=1)

# Perform cross-validation
scores = cross_validation.cross_val_score(cv=kf,
                                          estimator=clf,
                                          X=X_train,
                                          y=y_train,
                                          scoring='accuracy')
print('Scores: ' + str(scores))
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), 2 * scores.std()))

# Gather predictions
predictions = cross_validation.cross_val_predict(cv=kf,
                                                 estimator=clf,
                                                 X=X_train,
                                                 y=y_train)

accuracy_score = metrics.accuracy_score(y_train, predictions)
print('accuracy score: ' + str(accuracy_score))

confusion_matrix = metrics.confusion_matrix(y_train, predictions)

class_names = encoder.classes_.tolist()

#Train the classifier
clf.fit(X=X_train, y=y_train)

model = {'classifier': clf, 'classes': encoder.classes_, 'scaler': X_scaler}

# Save classifier to disk
Example #33
keep_probes = []
for sname in selectors.keys():
    subject_probes = np.array(
        [i for i, x in enumerate(data['subjects']) if x == sname])
    subject_probes = subject_probes[selectors[sname]]
    keep_probes += list(subject_probes)
data['neural_responses'] = data['neural_responses'][:, keep_probes]

# filter out the target stimuli (80 -- fruit)
#for dropstim in [60, 70, 80, 90]:
for dropstim in [80]:
    keepidx = data['image_category'] != dropstim
    data['image_category'] = data['image_category'][keepidx]
    data['neural_responses'] = data['neural_responses'][keepidx]

# uncomment for a permutation test
#data['image_category'] = np.random.permutation(data['image_category'])

# obtain CV predictions
print 'Data size', data['neural_responses'].shape
clf = RandomForestClassifier(n_estimators=3000)
predicted = cross_validation.cross_val_predict(clf,
                                               data['neural_responses'],
                                               data['image_category'],
                                               cv=n_cv)

# display results
print confusion_matrix(data['image_category'], predicted)
print f1_score(data['image_category'], predicted, average='weighted')
if __name__ == "__main__":
    input_data = load_input_data()
    target_data = load_target_data()

    print("Number of data points: ", len(target_data))

    decisionTree = RandomForestClassifier(max_features="log2")

    # Cross validation using K-fold and leave one out
    cv = KFold(len(target_data), n_folds=2, shuffle=True)
    cv2 = LeaveOneOut(len(target_data))  # only needs the number of points

    # Calculating scores for the model
    scores = cross_val_score(decisionTree, input_data, target_data, cv=cv)
    print("SCORES: ", scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Value of the output when it was in the test set
    estimated_results = cross_val_predict(decisionTree,
                                          input_data,
                                          target_data,
                                          cv=cv)
    print("PREDICTED VALUES:", estimated_results)

    # Train the model
    decisionTree.fit(input_data, target_data)
    predicted = decisionTree.predict(input_data)
    expected = target_data
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
#-logistic,Author:ssb--
from sklearn import metrics, cross_validation
from sklearn import datasets
import numpy as np
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report,confusion_matrix
import pandas as pd

import matplotlib.pyplot as plt
df=pd.read_csv('dataSample1.csv')
samples=df.loc[:,['Openness','Conscientousness','Extraversion','Agreeableness','Emotional_Range','Conversation','Openness to Change','Hedonism','Self-enhancement','Self-transcendence']]
target=df.loc[:,'Profession']
cv_folds = cross_validation.StratifiedKFold(target, n_folds=5, shuffle=False, random_state=0)
from sklearn.linear_model import LogisticRegression
Logistic_predict = cross_validation.cross_val_predict(LogisticRegression(), samples, target, cv=cv_folds)
# cross_val_predict already aggregates the out-of-fold predictions,
# so a single report over all folds is enough
report = classification_report(target, Logistic_predict)
print(report)



Example #36
X = data.reshape((len(data), -1))
X = preprocessing.scale(X)

Y = target.flatten()

#X, Y = shuffle(X, Y)

print(X.shape, Y.shape)

clf = RandomForestRegressor(n_estimators=50, criterion='mse')
#clf = LinearRegression()
#clf = SVR()
#clf = BayesianRidge(compute_score=True)

Y_pred = cross_val_predict(clf, X, Y, cv=10)

r, p = pearsonr(Y, Y_pred)
print("Correlation: ", r)
print("Variance explained: ", r**2)
print("P-value: ", p**2)
print("RMSE: ", mean_squared_error(Y, Y_pred)**0.5)

Y_pred = Y_pred[Y != 0]
Y = Y[Y != 0]

print("MAPE: ", mean_absolute_percentage_error(Y, Y_pred))

target_pred = Y_pred.reshape(-1, 16)

# (truncated fragments: the body of the per-column loop over the 16 reshaped
#  targets and the opening of a confusion-matrix plotting helper were lost)
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.15)
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
tabela = PrettyTable(['Model', 'f1', 'Mean Squared Error'])


# In[6]:

# Logistic regression
# Model
logistic = lm.LogisticRegression().fit(X, y)
predicted = cv.cross_val_predict(logistic, X, y, cv=10)
# Cross-validation
scores = cv.cross_val_score(lm.LogisticRegression(), X, y, cv=10,
                            scoring='f1_weighted')
print('Logistic regression')
print(scores.mean())
# Evaluation
cnf_matrix = metrics.confusion_matrix(y, predicted)
cr = metrics.classification_report(y, predicted)
print(cr)
with open('cr.txt', 'w') as text_file:
    text_file.write(cr)
    text_file.write('\n')
mse = metrics.mean_squared_error(y, predicted)
tabela.add_row(['Logistic regression', scores.mean(), mse])
# Normalized confusion matrix
Example #38
# In[62]:

model = lm.fit(df, score)

# In[63]:

accuracyscore = cross_val_score(model, df, score, cv=6)

# In[64]:

accuracyscore

# In[65]:

predictions = cross_val_predict(model, df, score, cv=6)

# In[66]:

plt.scatter(score, predictions)
plt.xlabel("True Value")
plt.ylabel("Predictions")
plt.show()

# In[67]:

R_square = metrics.r2_score(score, predictions)

# In[68]:

R_square
print X_test.shape, y_test.shape


# fit a model
lm = linear_model.LinearRegression()

model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

## The line / model
plt.scatter(y_test, predictions)
plt.xlabel("True Values")
plt.ylabel("Predictions")

print "Score:", model.score(X_test, y_test)     

'''
Now let's try out k-fold cross-validation. Again scikit-learn provides useful functions to do the heavy lifting. The function cross_val_predict returns the predicted values for each data point when it's in the testing slice.
'''

from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics

# Perform 6-fold cross validation
scores = cross_val_score(model, df, y, cv=6)
print "Cross-validated scores:", scores
# Make cross validated predictions
predictions = cross_val_predict(model, df, y, cv=6)
plt.scatter(y, predictions)
accuracy = metrics.r2_score(y, predictions)
print "Cross-Predicted Accuracy:", accuracy
So it is recommended to predict with a model other than regression.
'''
plt.scatter(y_train, model.predict(x_train))
plt.xlabel('true values')
plt.ylabel('predictions')
plt.title('train')

plt.scatter(y_test, pred)
plt.xlabel('true values')
plt.ylabel('predictions')
plt.title('test')

#cross validation
from sklearn.cross_validation import cross_val_score, cross_val_predict
scores = cross_val_score(model, x_train, y_train, cv=6)
pred = cross_val_predict(model, x_train, y_train, cv=6)
plt.scatter(y, pred)

#k fold cross validation
import numpy as np
from sklearn.model_selection import KFold
x = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])
kf = KFold(n_splits=2)
kf.get_n_splits(x)

for train_index, test_index in kf.split(x):
    print('train:', train_index, 'test:', test_index)
    print(y[train_index], y[test_index])

#leave one out
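# A sketch of the leave-one-out counterpart to the KFold demo above,
# reusing the same toy x and y arrays (not part of the original snippet):
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
for train_index, test_index in loo.split(x):
    print('train:', train_index, 'test:', test_index)
    print(y[train_index], y[test_index])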
Example #41
from sklearn.neural_network import MLPClassifier

estimator = MLPClassifier(hidden_layer_sizes=(25),
                          learning_rate='constant',
                          learning_rate_init=0.001,
                          max_iter=200,
                          momentum=0.9,
                          nesterovs_momentum=True,
                          power_t=0.5,
                          random_state=1,
                          shuffle=True,
                          solver='lbfgs',
                          tol=0.0001,
                          validation_fraction=0.1,
                          verbose=False,
                          warm_start=False)

y_pred = cross_val_predict(estimator, X, y, cv=10)
print(classification_report(y, y_pred))
#print metrics.confusion_matrix(y, y_pred)

print("AUC score ", roc_auc_score(y, y_pred))

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=1)

a = 0
if a == 0:
    # Set the parameters by cross-validation
    tuned_parameters = [{
print(train1.columns)

X = train1.values
X_test = test1.values


# In[ ]:

#kernel = 1*RBF(length_scale=1.0)
kernel = 1.0**2 * Matern(length_scale=1.0, length_scale_bounds=(1e-05, 100000.0), nu=0.5)
gp = GaussianProcessRegressor(kernel=kernel, alpha=5e-9, optimizer='fmin_l_bfgs_b', 
                                n_restarts_optimizer=0, normalize_y=False, copy_X_train=True,
                                random_state=2016)
clf = Pipeline([('scaler', StandardScaler()), ('gp', gp)])     
y_log_centered = y_log - y_log.mean()
y_pred = cross_val_predict(clf, X, y_log_centered, cv=5, n_jobs=-1)
y = np.expm1(y_log)
y_pred = np.expm1(y_pred + y_log.mean())
score = rmsle(y,y_pred)
print(score) # 0.1459


# In[ ]:

import matplotlib as mpl
import matplotlib.pyplot as plt
# get_ipython().magic(u'matplotlib inline')
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)

plt.scatter(y_pred, y)
Example #43
                                skip_header=1,
                                converters={
                                    1: day_to_number,
                                    3: number_from_end_string,
                                    4: number_from_end_string
                                })
network_X = network_file[:, (0, 1, 2, 3, 4, 6)]
network_Y = network_file[:, 5]

fixed_set_RMSE = []
average_RMSE = []

for poly_degree in range(1, 8):
    regr = make_pipeline(PolynomialFeatures(poly_degree), LinearRegression())

    predicted = cross_validation.cross_val_predict(regr, network_X, network_Y,
                                                   10, 1, 0, None, 0)
    scores = cross_validation.cross_val_score(regr,
                                              network_X,
                                              network_Y,
                                              cv=10,
                                              scoring='mean_squared_error')

    print '----poly_degree---', poly_degree
    print 'All RMSEs', numpy.sqrt(-scores)
    print 'Mean RMSE', numpy.mean(numpy.sqrt(-scores))
    print 'Best RMSE', numpy.min(numpy.sqrt(-scores))

    fixed_set_RMSE.append(numpy.mean(numpy.sqrt(-scores[0])))
    average_RMSE.append(numpy.mean(numpy.sqrt(-scores)))

    #Residual
Example #44
y_test.reset_index(drop=True, inplace=True)

x_test

#SUBMISSION SETUP

tuned_clf.fit(x_train, y_train)
tuned_clf.score(x_test, y_test)
results = cval.cross_val_score(gr_clf,
                               x_train,
                               y_train,
                               scoring='f1_micro',
                               cv=kf_total,
                               n_jobs=-1)
results
cpred = cval.cross_val_predict(gr_clf, x_test, y_test, cv=kf_total, n_jobs=-1)
cpred
cpred = pd.DataFrame(prediction, columns=['damage_grade'])
cpred.set_index(test_values['building_id'], inplace=True)
cpred.to_csv('prediction.csv')
tuned_clf.fit(x_sample, y_sample)
tuned_clf.score(x_test, y_test)
prediction = tuned_clf.predict(test_values)
prediction = pd.DataFrame(prediction, columns=['damage_grade'])
prediction.set_index(test_id['building_id'], inplace=True)
prediction.to_csv('prediction.csv')
x_sample

#MORE TEST STUFF
x_sample.corr()
x_sample.drop(columns=['secondary_use'], inplace=True)
Example #45
X = pd.read_csv('sample/Classifier_Features.csv')
#select small samples (200 raws) for example
X = X[0:200]
X.fillna(0, inplace=True)
X.drop(['REVIEW_TEXT'], axis=1, inplace=True)
#print X.head()
Y = X.pop('CLASS')  #store label 'CLASS' in Y
numeric_variables = list(X.dtypes[X.dtypes != "object"].index)
X = X[numeric_variables]

cl0 = DecisionTreeClassifier(max_depth=5)
cl1 = RandomForestClassifier(n_estimators=100, criterion="gini", n_jobs=2)

# use sklearn cross_validation package to fit model
predicted_0 = cross_validation.cross_val_predict(cl0, X, Y, cv=10)
predicted_1 = cross_validation.cross_val_predict(cl1, X, Y, cv=10)

print(" ")
print(
    "********************Classification Model Results ************************"
)
print("--------------------------------------------------------------")
print("Decision Tree Accuracy: ", metrics.accuracy_score(Y, predicted_0))
print("Confusion Matrix For Decision Tree Classifier")
print(metrics.confusion_matrix(Y, predicted_0))
print("AUC Score : ", metrics.roc_auc_score(Y, predicted_0))
print("Recall : ", metrics.recall_score(Y, predicted_0))
print("Average Precision Score : ",
      metrics.average_precision_score(Y, predicted_0))
from sklearn.model_selection import cross_val_predict

import numpy as np
predictors = ["Pclass", "Sex", "Age",
              "Fare","NlengthD","NameLength", "FsizeD", "Title","Deck"]

# Initialize our algorithm with the default paramters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)
rf = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, 
                            min_samples_leaf=1)
kf = KFold(titanic.shape[0], n_folds=5, random_state=1)
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=50)

predictions = cross_validation.cross_val_predict(rf, titanic[predictors],titanic["Survived"],cv=kf)
predictions = pd.Series(predictions)
scores = cross_val_score(rf, titanic[predictors], titanic["Survived"],
                                          scoring='f1', cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())


# In[ ]:


predictors = ["Pclass", "Sex", "Age",
              "Fare","NlengthD","NameLength", "FsizeD", "Title","Deck","TicketNumber"]
rf = RandomForestClassifier(random_state=1, n_estimators=50, max_depth=9,min_samples_split=6, min_samples_leaf=4)
rf.fit(titanic[predictors],titanic["Survived"])
kf = KFold(titanic.shape[0], n_folds=5, random_state=1)
Example #47
def search_models(model,
                  param_dists,
                  train_features,
                  train_pred_true,
                  test_features,
                  n_iter=10,
                  scoring='r2',
                  cv=5,
                  n_jobs=-1,
                  pred_column='prediction',
                  file_path='',
                  file_basename='',
                  show_svg=False):
    r"""Search hyper-parameters for best regression model and report.
    
    Args:
        model (sklearn): Regression model from `sklearn`.
        param_dists (dict): Dictionary with keys as string parameter names
            and values as `scipy.stats` distributions [1]_ or lists.
        train_features (pandas.DataFrame): The data to be fit for training.
        train_pred_true (pandas.Series): The target relative to `train_features`
            for regression.
        test_features (pandas.DataFrame): The data to be fit for testing
            as a final evaluation.
        n_iter (int, optional, default=10): Number of parameter settings 
            that are sampled. Trades off runtime vs quality of the solution.
        scoring (str, optional, default='r2'): Scoring function.
            Default is R^2, coefficient of determination.
            See scikit-learn model evaluation documentation [2]_.
        cv (int, optional, default=5): Number of folds for K-fold cross-validation.
        n_jobs (int, optional, default=-1): The number of CPUs to use to do the
            computation. -1 is all CPUs.
        pred_column (str, optional, default='prediction'): Name for output
            prediction column in CSV.
        file_path (str, optional, default=''): Path for generated files.
        file_basename (str, optional, default=''): Base name for generated files.
        show_svg (bool, optional, default=False): Show SVG plot.
    
    Returns:
        None
    
    Raises:
        ValueError
    
    See Also:
        sklearn.grid_search.RandomizedSearchCV
        
    References:
    .. [1] http://docs.scipy.org/doc/scipy/reference/stats.html
    .. [2] http://scikit-learn.org/stable/modules/model_evaluation.html
    
    """
    # TODO: move to demo/main.py as a top-level script.
    # TODO: outliers by Bonferroni correcte p-values
    # TODO: outliers by prediction distribution
    # Check input.
    if not isinstance(train_features, pd.DataFrame):
        raise ValueError("`train_features` must be a `pandas.DataFrame`")
    if not isinstance(train_pred_true, pd.Series):
        raise ValueError("`train_pred_true` must be a `pandas.Series`")
    if not isinstance(test_features, pd.DataFrame):
        raise ValueError("`test_features` must be a `pandas.DataFrame`")
    # Search for best model and report.
    search = sk_gs.RandomizedSearchCV(estimator=model,
                                      param_distributions=param_dists,
                                      n_iter=n_iter,
                                      scoring=scoring,
                                      n_jobs=n_jobs,
                                      cv=cv)
    time_start = time.time()
    search.fit(X=train_features, y=train_pred_true)
    time_stop = time.time()
    print(("Elapsed search time (seconds) = {elapsed:.1f}").format(
        elapsed=time_stop - time_start))
    model_best = search.best_estimator_
    print(("Best params = {params}").format(params=search.best_params_))
    grid_best = max(search.grid_scores_,
                    key=lambda elt: elt.mean_validation_score)
    if not np.isclose(search.best_score_, grid_best.mean_validation_score):
        raise AssertionError(
            "Program error. Max score from `search.grid_scores_` was not found correctly."
        )
    print(("Best score (R^2) = {mean:.4f} +/- {std:.4f}").format(
        mean=grid_best.mean_validation_score,
        std=np.std(grid_best.cv_validation_scores)))
    train_pred_best = sk_cv.cross_val_predict(estimator=model_best,
                                              X=train_features,
                                              y=train_pred_true,
                                              cv=cv,
                                              n_jobs=n_jobs)
    print("Score from best model training predictions (R^2) = {score:.4f}".
          format(score=sk_met.r2_score(y_true=train_pred_true,
                                       y_pred=train_pred_best)))
    train_pred_default = sk_cv.cross_val_predict(estimator=model,
                                                 X=train_features,
                                                 y=train_pred_true,
                                                 cv=cv,
                                                 n_jobs=n_jobs)
    print("Score from default model training predictions (R^2) = {score:.4f}".
          format(score=sk_met.r2_score(y_true=train_pred_true,
                                       y_pred=train_pred_default)))
    if hasattr(model_best, 'feature_importances_'):
        print("Plot feature importances from best model:")
        plot_feature_importances(model=model_best,
                                 train_features=train_features)
    print("Plot actual vs predicted values from best model:")
    plot_actual_vs_predicted(y_true=train_pred_true, y_pred=train_pred_best)
    # Create predictions for `test_features`.
    # Order by index, save as CSV, and graph.
    test_pred_best = model_best.predict(X=test_features)
    file_csv = r'predictions_{name}.csv'.format(name=file_basename)
    path_csv = os.path.join(file_path, file_csv)
    print("Predictions CSV file =\n{path}".format(path=path_csv))
    df_csv = pd.DataFrame(data=test_pred_best,
                          index=test_features.index,
                          columns=[pred_column]).sort_index()
    df_csv.to_csv(path_or_buf=path_csv, header=True, index=True, quoting=None)
    if hasattr(model_best.estimators_[0], 'tree_'):
        file_dot = r'graph_{name}.dot'.format(name=file_basename)
        path_dot = os.path.join(file_path, file_dot)
        print("Graphviz dot and SVG files =\n{path}\n{path}.svg".format(
            path=path_dot))
        sk_tree.export_graphviz(decision_tree=model_best.estimators_[0],
                                out_file=path_dot,
                                feature_names=test_features.columns)
        cmd = ['dot', '-Tsvg', path_dot, '-O']
        # Use pre-Python 3.5 subprocess API for backward compatibility.
        subprocess.check_call(args=cmd)
        if show_svg:
            display(SVG(filename=path_dot + '.svg'))
    return None
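
A usage sketch (an assumption, not from the original project): the toy frames below are placeholders, and param_dists mixes scipy.stats distributions with plain lists as the docstring describes.

import pandas as pd
import scipy.stats as st
from sklearn.ensemble import RandomForestRegressor

train_features = pd.DataFrame({'x1': range(100), 'x2': range(100, 200)})
train_pred_true = pd.Series(range(100), name='target')
test_features = pd.DataFrame({'x1': range(20), 'x2': range(100, 120)})

search_models(model=RandomForestRegressor(random_state=0),
              param_dists={'n_estimators': st.randint(50, 200),
                           'max_depth': [None, 5, 10],
                           'max_features': st.uniform(0.1, 0.9)},
              train_features=train_features,
              train_pred_true=train_pred_true,
              test_features=test_features,
              n_iter=5,
              scoring='r2',
              cv=3,
              file_basename='rf_demo')
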
Example #48
	test_predictions=alg.predict(titanic[predictors].iloc[test,:])
	predictions.append(test_predictions)

predictions=np.concatenate(predictions, axis=0)
# predictions[predictions>0.5]=1
# predictions[predictions<=0.5]=0
# accuracy=sum(predictions[predictions==titanic["Survived"]])/len(predictions)
# print(accuracy)
predictions_new = list(range(len(predictions)))
titanic_survived_list = list(titanic['Survived'])
summ = 0
for i in list(range(len(predictions))):
    if predictions[i] > 0.5:
        predictions_new[i] = 1
    else:
        predictions_new[i] = 0
    if predictions_new[i] == titanic_survived_list[i]:
        summ = summ+1
# print predictions_new

# print titanic['Survived']
accuracy = summ/float(len(predictions))
print(accuracy)
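# A vectorized alternative to the thresholding-and-counting loop above (a sketch;
# it assumes `predictions` holds the concatenated fold probabilities and
# `titanic["Survived"]` the true labels, as in the snippet above):
from sklearn.metrics import accuracy_score

predictions_binary = (predictions > 0.5).astype(int)
print(accuracy_score(titanic["Survived"], predictions_binary))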


from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression  # logistic regression

alg = LogisticRegression(random_state=1)
# cross_val_predict returns the out-of-fold predictions themselves, not scores,
# so compare them with the true labels to get a cross-validated accuracy.
predictions = cross_validation.cross_val_predict(alg, titanic[predictors], titanic["Survived"], cv=3)
print((predictions == titanic["Survived"]).mean())
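# If only the mean cross-validated accuracy is wanted, cross_val_score is the more
# direct helper (a minimal sketch reusing `alg`, `titanic`, and `predictors` from above):
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
print(scores.mean())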
from sklearn import cross_validation as cross  # assumed alias; `cross` was otherwise undefined


def predict(classifier, x, y):
    predictions = cross.cross_val_predict(classifier, x, y, cv=10)
    return predictions
def cross_val_predict_score(model, X, y):
    # cross_val_predict takes the feature matrix and the true targets and
    # returns one out-of-fold prediction per sample
    from sklearn.cross_validation import cross_val_predict
    return cross_val_predict(model, X, y)
Exemple #51
0
import os
import numpy as np
from ranknet import RankNet
from sklearn.cross_validation import cross_val_predict

os.system("rm -rf testlog")
data1 = np.random.rand(1000, 30)
data2 = np.random.rand(1000, 30)
label = [True]*1000

rn = RankNet(hidden_units=[20, 10],
             learning_rate=0.01, verbose=True)
data = rn.pack_data(data1, data2)
err = rn.pretrain(data1)
print("Reconstruction Error", err)
cost = rn.fit(data, logdir="logfine")
print("Cost", cost)
score = rn.get_scores(data1)


if False:
    cvpred = cross_val_predict(rn, data, label, cv=2)
    input1 = np.random.rand(10, 30)
    input2 = np.random.rand(10, 30)
    input_ = rn.pack_data(input1, input2)
    prob = rn.predict_prob(input_)
    pred = rn.predict(input_)
    score = rn.get_scores(input1)
    score = rn.get_scores(input2)
# Feature correlation
Feature_corr = df.corr()  # .corr() is used to find pairwise feature correlations
plt.figure(figsize=(30, 30))
sns.heatmap(Feature_corr, cbar=True, square=True, cmap='coolwarm')

# Random Forest Regressor Model and important features
clf = RandomForestRegressor(n_estimators=500, max_features=25)
clf.fit(X_train, Y_train)
importance = clf.feature_importances_
X1 = df.drop(['Progression'], axis=1)
dfi = pd.DataFrame(importance, index=X1.columns, columns=["Importance"])
dfi = dfi.sort_values(['Importance'], ascending=False)
dfi.plot(kind='bar', color='Purple')

# Cross-validated data Prediction
Predicted_Train = cross_val_predict(clf, X_train, Y_train, cv=5)
fig_Train, ax_Train = plt.subplots()
ax_Train = sns.regplot(x=Y_train,
                       y=Predicted_Train,
                       scatter_kws={
                           "color": "green",
                           's': 60
                       },
                       line_kws={
                           "color": "gold",
                           "lw": 3
                       },
                       marker="o")
plt.ylim(-0.15, 0.05)
plt.xlim(-0.15, 0.05)
ax_Train.set_xlabel('Real Progression Rate')
Exemple #53
0
data_all['Sex'] = data_all['Sex'].map({'male': 1, 'female': 2})

sns.countplot(data_all['Cabin'])
data_all['Cabin'] = data_all['Cabin'].astype('category').cat.codes
data_all.info()
#### build the model
data_all.drop('Survived', 1, inplace=True)
X_train = data_all[0:len(train_data)]
X_test = data_all.iloc[len(train_data):]
# Scikit-learn needs one training dataset and one label dataset (i.e. the answers)
Y_label = train_data.Survived

#cross validation
#decision tree
Y_pred = cross_validation.cross_val_predict(DecisionTreeClassifier(),
                                            X_train,
                                            Y_label,
                                            cv=10)
acc_decision_tree = metrics.accuracy_score(Y_label, Y_pred)
#Random Forest
Y_pred = cross_validation.cross_val_predict(
    RandomForestClassifier(n_estimators=1000), X_train, Y_label, cv=10)
acc_rf = metrics.accuracy_score(Y_label, Y_pred)
#print (metrics.classification_report(Y_label, Y_pred) )
#Logistic Regression
Y_pred = cross_validation.cross_val_predict(LogisticRegression(),
                                            X_train,
                                            Y_label,
                                            cv=10)
acc_LR = metrics.accuracy_score(Y_label, Y_pred)
#SVC
Y_pred = cross_validation.cross_val_predict(SVC(), X_train, Y_label, cv=10)
Exemple #54
0
    result = []
    for i in selected:
        result.append(all_elem[i])
    return result

housing_file = numpy.genfromtxt('../../Datasets/housing_data.csv', delimiter=',', skip_header=1)
housing_X = housing_file[:, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)]
housing_Y = housing_file[:, 13]

fixed_set_RMSE = []
average_RMSE = []

for poly_degree in range(1, 5):
    regr = make_pipeline(PolynomialFeatures(poly_degree), LinearRegression())

    predicted = cross_validation.cross_val_predict(regr, housing_X, housing_Y, cv=10)
    scores = cross_validation.cross_val_score(regr, housing_X, housing_Y,  cv=10, scoring='mean_squared_error')
    
    print '----poly_degree---', poly_degree
    print 'All RMSEs',  numpy.sqrt(-scores)
    print 'Mean RMSE',  numpy.mean(numpy.sqrt(-scores))
    print 'Best RMSE',  numpy.min(numpy.sqrt(-scores))
    
    fixed_set_RMSE.append(numpy.mean(numpy.sqrt(-scores[0])))
    average_RMSE.append(numpy.mean(numpy.sqrt(-scores)))
    
    #Residual
    residual = []
    for i in range(len(housing_X)):
        residual.append(housing_Y[i] - predicted[i])
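    # Equivalent vectorized form (a sketch): `housing_Y` and `predicted` are both
    # NumPy arrays, so the residuals can also be computed without an explicit loop.
    residual_vectorized = housing_Y - predicted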
Exemple #55
0
 print "**************************************"
 #        print "lrate: ", learning_rate
 #        print "alpha: ", alpha
 #        print "n_est: ", n_estimators
 #gb=GradientBoostingRegressor(max_depth=1,learning_rate=0.04,n_estimators=100)
 #gb=GradientBoostingRegressor(loss='lad',max_depth=1,learning_rate=0.05,n_estimators=440)
 #gb=GradientBoostingRegressor(loss='huber',max_depth=1,learning_rate=0.45,n_estimators=200,alpha=0.45)
 #gb=GradientBoostingRegressor(loss='quantile',max_depth=1,learning_rate=0.028,n_estimators=109,alpha=0.36,criterion="friedman_mse")
 #gb=GradientBoostingRegressor(loss='quantile',max_depth=1,learning_rate=0.028,n_estimators=109,alpha=0.36,criterion="friedman_mse",subsample=0.6)
 gb = GradientBoostingRegressor()
 gb.fit(x, y)
 gbsc = gb.score(x, y)
 print "R2 score is: ", gbsc
 gbsc_hpc = gb.score(x_hpc, y_hpc)
 print "R2 score on hpc is: ", gbsc_hpc
 predictions = cross_val_predict(gb, x, y, cv=10)
 np.clip(predictions, 0, 1, out=predictions)
 predictions_h = gb.predict(x_hpc)
 np.clip(predictions_h, 0, 1, out=predictions_h)
 err = np.mean(abs((predictions - y) / y))
 var = np.var(abs((predictions - y) / y))
 print "Cross-Predicted Relative Error: ", err
 print "Cross-Predicted Var of Relative Error: ", var
 np.savetxt('result1.txt', predictions, delimiter='\n', fmt='%.3f')
 err = np.mean(abs((predictions - y)))
 var = np.var(abs((predictions - y)))
 print "Cross-Predicted Abs Error: ", err
 print "Cross-Predicted Var of Abs Error: ", var
 err_h = np.mean(abs((predictions_h - y_hpc) / y_hpc))
 var_h = np.var(abs((predictions_h - y_hpc) / y_hpc))
 print predictions_h
def main():
    #picklef = open(config_file, 'r')
    #config_dict = pickle.load(picklef)

    print "\n========================="
    print "SURROGATE MODEL GENERATOR"
    print "========================="
    print "PARSE AND CLEAN DATA"
    print "========================="
    # load design and target data into a pandas dataframe from the input csv
    dataframe = pd.read_csv(input_data_file)

    # drop rows (samples) with NaNs in them
    dataframe = dataframe.dropna()

    # split the dataframe into design and target dataframes
    design_data = dataframe[features]
    design_labels = design_data.axes

    target_data = dataframe[targets]
    target_labels = target_data.axes

    if DEBUG:
        print "\nFeatures:\n", design_data
        print "\nTargets:\n", target_data

    print "\nParsed data shapes\n design data: ", np.shape(
        design_data), "\n target data: ", np.shape(target_data)
    print " #samples: %d\n #input parameters: %d" % (np.shape(design_data)[0],
                                                     np.shape(design_data)[1])
    print " #output parameters: %d" % np.shape(target_data)[1]

    if DEBUG:
        print "design data:"
        print design_data
        print "target_data:"
        print target_data

    if test_split > 0.0:
        print "\n========================="
        print "SPLIT TRAIN AND TEST DATASETS"
        print "========================="
        # split the data into a training set and a testing set for validation later.
        X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
            design_data, target_data, test_size=test_split)

        print "\nX_train, Y_train:", np.shape(X_train), np.shape(Y_train)
        print "X_test, Y_test:", np.shape(X_test), np.shape(Y_test)
        print "training sample size: %d" % np.shape(X_train)[0]
        print "testing sample size: %d" % np.shape(X_test)[0]
        if DEBUG:
            print "X_train:\n", X_train
            print "Y_train:\n", Y_train
    else:
        X_train = design_data
        Y_train = target_data
        X_test, Y_test = [], []
    # standardize the training data to mean 0 and variance 1
    if normalize is True:
        print "\n========================="
        print "DATA NORMALIZATION AND SCALING"
        print "========================="

        # initialize a StandardScaler object to calculate the means and scaling values of each design
        # parameter (that is, it calculates the means and stdevs over the columns).
        # We then use the scaler object to transform the entire input data set (except for the design ID
        # number) to their normalized values.
        X_train_scaler = preprocessing.MinMaxScaler(
            feature_range=(0, 1)).fit(X_train)
        X_train_scaled = pd.DataFrame(X_train_scaler.transform(X_train),
                                      columns=X_train.axes[1])
        if test_split > 0.0:
            X_test_scaler = preprocessing.MinMaxScaler(
                feature_range=(0, 1)).fit(X_test)
            X_test_scaled = pd.DataFrame(X_test_scaler.transform(X_test),
                                         columns=X_test.axes[1])
        else:
            X_test_scaled = []

        print "\n feature min: ", X_train_scaler.data_min_
        print " feature max: ", X_train_scaler.data_max_
        print " feature range: ", X_train_scaler.data_range_
        print " feature scales: \n", X_train_scaler.scale_

        print "\nScaled training inputs:"
        print " shape: ", np.shape(X_train_scaled)

        if DEBUG:
            print "\n X_train_scaled:\n", X_train_scaled
            print "\nScaled testing inputs:"
            print " shape:", np.shape(X_test_scaled)
            print "\n X_test_scaled:\n", X_test_scaled

        Y_train_scaler = preprocessing.MinMaxScaler(
            feature_range=(0, 1)).fit(Y_train)
        Y_train_scaled = pd.DataFrame(Y_train_scaler.transform(Y_train),
                                      columns=Y_train.axes[1])
        if test_split > 0.0:
            Y_test_scaler = preprocessing.MinMaxScaler(
                feature_range=(0, 1)).fit(Y_test)
            Y_test_scaled = pd.DataFrame(Y_test_scaler.transform(Y_test),
                                         columns=Y_test.axes[1])
        else:
            Y_test_scaled = []

        print "\n output min: ", Y_train_scaler.data_min_
        print " output max: ", Y_train_scaler.data_max_
        print " output range: ", Y_train_scaler.data_range_
        print " output scales: \n", Y_train_scaler.scale_

        print "\nScaled training inputs:"
        print " shape: ", np.shape(Y_train_scaled)

        if DEBUG:
            print "\n Y_train_scaled:\n", Y_train_scaled
            print "\nScaled testing inputs:"
            print " shape:", np.shape(Y_test_scaled)
            print "\n Y_test_scaled:\n", Y_test_scaled
            #print "\nBefore scaling:"
            #print np.shape(X_train)
            #print X_train

        # This is just for visualizing the normalization transformations with histograms
        if DEBUG:
            fig, axes = plt.subplots(np.shape(X_train)[1],
                                     sharex=True,
                                     sharey=True)
            for ax, label in izip(axes, X_train.axes[1]):
                ax.hist(X_train[label], bins=7)
                ax.set_title(label)
            fig.suptitle(
                "Distribution of design parameters before normalization")

            fig, axes = plt.subplots(np.shape(X_train_scaled)[1],
                                     sharex=True,
                                     sharey=True)
            print X_train_scaled.axes
            for ax, label in izip(axes, X_train_scaled.axes[1]):
                ax.hist(X_train_scaled[label], bins=7)
                ax.set_title(label)
            fig.suptitle(
                "Distribution of design parameters after normalization")

            if len(Y_train) != 0 and len(Y_train_scaled) != 0:
                fig, axes = plt.subplots(np.shape(Y_train)[1],
                                         sharex=True,
                                         sharey=True)
                for ax, label in izip(axes, Y_train.axes[1]):
                    ax.hist(Y_train[label], bins=7)
                    ax.set_title(label)
                fig.suptitle(
                    "Distribution of performance parameters before normalization"
                )

                fig, axes = plt.subplots(np.shape(Y_train_scaled)[1],
                                         sharex=True,
                                         sharey=True)
                for ax, label in izip(axes, Y_train_scaled.axes[1]):
                    ax.hist(Y_train_scaled[label], bins=7)
                    ax.set_title(label)
                fig.suptitle(
                    "Distribution of performance parameters after normalization"
                )
            plt.show()
    else:
        X_train_scaled = X_train
        X_test_scaled = X_test
    print "\n========================="
    print "SUPPORT VECTOR REGRESSION"
    print "========================="

    surrogate_models = []  # list to hold the surrogate model objects for each output parameter

    # If gridsearch is True, use scikit-learn's gridsearch to systematically search for optimal
    # hyperparameter values. Else, we use hyperparameter values set by the user to construct and
    # train surrogate models for each performance variable.
    if gridsearch:
        # construct a surrogate model for each output parameter (performance metric)
        print "My God... They're learning..."
        for n, target_parameter in enumerate(Y_train_scaled):
            print "\n------------------------"
            print target_parameter
            print "------------------------"
            if DEBUG: print Y_train_scaled[target_parameter]
            model = generate_optimized_surrogate(
                X_train_scaled,
                Y_train_scaled[target_parameter],
                label=target_parameter,
                C_range=C_range,
                epsilon_range=epsilon_scale,
                grid_iter=optimize_iter,
                scoring=model_scoring)
            surrogate_models.append(model)
    else:
        for n, target_parameter in enumerate(Y_train_scaled):
            print "\n------------------------"
            print target_parameter
            print "------------------------"
            model = SVR(kernel='rbf',
                        C=C_tuple[n],
                        epsilon=epsilon_tuple[n],
                        gamma='auto').fit(X_train_scaled,
                                          Y_train_scaled[target_parameter])
            surrogate_models.append(model)

    print "\nSurrogate models:\n", surrogate_models
    """
    print np.shape(surrogate_model)
    print surrogate_model
    # make predictions over the output surrogate data.
    #prediction_outputs = [model.predict(X_train_scaled) for model in surrogate_model]
    prediction_outputs = surrogate_model[1].predict(X_train_scaled)
    print np.shape(prediction_outputs)
    print prediction_outputs
    """

    # If the sampled data was split into training and testing sets, evaluate the generated models
    # on the testing data. Otherwise, compute cross-validated scores using the training data.

    # First, instantiate a list to hold our scaler (transformation) objects to transform the values
    # predicted by the models to the range of the performance metrics being modeled.
    Y_scalers = []
    for n, model in enumerate(surrogate_models):
        print "\n------------------------"
        print targets[n]
        print "------------------------"

        if test_split > 0.0:
            print "\n========================="
            print "MODEL EVALUATION"
            print "========================="
            predictions = model.predict(X_test_scaled)
            target_values = Y_test[targets[n]]
            # reverse-transform the outputs and predictions back to their original values
            Y_test_scaler = preprocessing.MinMaxScaler().fit(
                Y_test[targets[n]].reshape(-1, 1))
            predictions = Y_test_scaler.inverse_transform(
                predictions.reshape(-1, 1))

            #print Y_test[:,n]
            #print predictions
            #result_array = np.column_stack((Y_test[:,n].reshape(-1,1), predictions))

            print "test values, predicted values"
            print target_values, predictions
            print "model score:", metrics.mean_squared_error(
                target_values, predictions)
            #print "model score: ", model.score(target_values, predictions)
            print "model parameters:"
            parameters = model.get_params()
            print ' C: ', parameters['C']
            print ' epsilon: ', parameters['epsilon']
            #print ' gamma: ', parameters['gamma']

        # If a testing set was not set aside, use Leave-One-Out (LOO) cross-validation
        else:
            scaled_target_values = Y_train_scaled[targets[n]].values
            target_values = Y_train[targets[n]].values

            scores = cross_validation.cross_val_score(
                model,
                X_train_scaled.values,
                scaled_target_values,
                scoring='mean_squared_error',
                cv=len(Y_train_scaled))

            avg_score = np.mean(scores)
            score_std = np.std(scores)
            print "model avg score: %1.5f (+/-%1.5f)" % (-avg_score, score_std)

            predictions = cross_validation.cross_val_predict(
                model,
                X_train_scaled.values,
                scaled_target_values,
                cv=len(Y_train_scaled))

            # Make a scaler and inverse transform the predictions back to their original, unscaled ranges
            Y_test_scaler = preprocessing.MinMaxScaler().fit(target_values)
            predictions = Y_test_scaler.inverse_transform(predictions)
            Y_scalers.append(Y_test_scaler)
            print "Y_scalers[%d]: " % n, Y_scalers[n]

        # plot the predicted vs actual values
        fig, ax = plt.subplots()
        ax.scatter(predictions, target_values, marker='x')
        ax.plot(target_values, target_values, c='b', linestyle='--')
        ax.set_xlabel("Predicted Values")
        ax.set_ylabel("Actual Values")
        ax.set_title("Predicted vs Actual Target Values: %s" % targets[n])

        fig.savefig('%s%s_%s_predicted_vs_actual.png' %
                    (output_directory, data_title, targets[n]))
    """
    if test_split > 0.0:
        print "\n========================="
        print "MODEL EVALUATION"
        print "========================="

        # step through each model and evaluate its performance on the testing data
        for n, model in enumerate(surrogate_models):
            print "\n------------------------"
            print targets[n]
            print "------------------------"
            predictions = model.predict(X_test_scaled)
            target_values = Y_test[targets[n]]
            # reverse-transform the outputs and predictions back to their original values
            Y_test_scaler = preprocessing.MinMaxScaler().fit(Y_test[targets[n]].reshape(-1,1))
            predictions = Y_test_scaler.inverse_transform(predictions.reshape(-1,1))

            #print Y_test[:,n]
            #print predictions
            #result_array = np.column_stack((Y_test[:,n].reshape(-1,1), predictions))

            print "test values, predicted values"
            print target_values, predictions
            print "model score:", metrics.mean_squared_error(target_values, predictions)
            #print "model score: ", model.score(target_values, predictions)
            print "model parameters:"
            parameters = model.get_params()
            print ' C: ', parameters['C']
            print ' epsilon: ', parameters['epsilon']
            #print ' gamma: ', parameters['gamma']

            # plot the predicted vs actual values
            fig, ax = plt.subplots()
            ax.scatter(predictions, target_values, marker = 'x')
            ax.plot(target_values, target_values, c='b', linestyle='--')
            ax.set_xlabel("Predicted Values")
            ax.set_ylabel("Actual Values")
            ax.set_title("Predicted vs Actual Target Values: %s" %targets[n])

            fig.savefig('%s%s_predicted_vs_actual.png' %(output_directory, targets[n]))

    else:
        print "\n========================="
        print "MODEL CROSS-VALIDATION"
        print "========================="

        # Use cross-validation to evaluate the models created above
        for n, model in enumerate(surrogate_models):
            print "\n------------------------"
            print targets[n]
            print "------------------------"

            scaled_target_values = Y_train_scaled[targets[n]].values
            target_values = Y_train[targets[n]].values

            scores = cross_validation.cross_val_score(model, 
                                                      X_train_scaled.values, 
                                                      scaled_target_values,
                                                      scoring = 'mean_squared_error',
                                                      cv = len(Y_train_scaled))

            avg_score = np.mean(scores)
            score_std = np.std(scores)
            print "model avg score: %1.5f (+/-%1.5f)" %(-avg_score, score_std)

            predictions = cross_validation.cross_val_predict(model,
                                                             X_train_scaled.values,
                                                             scaled_target_values,
                                                             cv = len(Y_train_scaled))

            # Make a scaler and inverse transform the predictions back to their original, unscaled ranges
            Y_test_scaler = preprocessing.MinMaxScaler().fit(target_values)
            predictions = Y_test_scaler.inverse_transform(predictions)

            # plot the predicted vs actual values
            fig, ax = plt.subplots()
            ax.scatter(predictions, target_values, marker = 'x')
            ax.plot(target_values, target_values, c='b', linestyle='--')
            ax.set_xlabel("Predicted Values")
            ax.set_ylabel("Actual Values")
            ax.set_title("Predicted vs Actual Target Values: %s" %targets[n])

            fig.savefig('%s%s_predicted_vs_actual.png' %(output_directory, targets[n]))
    """
    if save_models is True:
        model_file = data_title + "_surrogate_models.pkl"
        input_scaler_file = data_title + "_input_scalers.pkl"
        scaler_file = data_title + "_datascalers.pkl"
        models_savefile = output_directory + model_file
        input_scalers_savefile = output_directory + input_scaler_file
        scalers_savefile = output_directory + scaler_file
        #models_savefile = "%s%s_surrogate_models.pkl" %(output_directory, data_name)
        #scalers_savefile = "%s%s_datascalers.pkl" %(output_directory, data_name)

        with open(models_savefile, 'w') as f:
            pickle.dump(surrogate_models, f)

        with open(input_scalers_savefile, 'w') as f:
            pickle.dump(X_train_scaler, f)

        with open(scalers_savefile, 'w') as f:
            pickle.dump(Y_scalers, f)

    return surrogate_models, Y_scalers
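# A minimal sketch (not part of the original pipeline) of how the pickled surrogate
# models and scalers written above could be reloaded and used on new, unscaled design
# points. The paths are assumed to be the files produced when `save_models` is True.
def load_and_predict_surrogates(models_path, input_scaler_path, output_scalers_path, new_designs):
    import pickle
    with open(models_path) as f:
        models = pickle.load(f)
    with open(input_scaler_path) as f:
        x_scaler = pickle.load(f)
    with open(output_scalers_path) as f:
        y_scalers = pickle.load(f)
    # scale the raw design points exactly as the training data was scaled
    new_designs_scaled = x_scaler.transform(new_designs)
    # one prediction vector per target, mapped back to its original units
    return [y_scalers[n].inverse_transform(model.predict(new_designs_scaled))
            for n, model in enumerate(models)]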
def linearRegression(X, Y, datasetName, workFlow=False, workFlowNo=0):
    """
    :param X: data consisting of features (excluding class variable)
    :param Y: column vector consisting of class variable
    :param datasetName: Network / Housing
    :param workFlow: for question 3
    :return: models linear regression and performs 10 fold Cross Validation
             displays statistics of various features and plots graphs for predicted/residual and actual values
    """
    print "LINEAR REGRESSION"
    print "Executing..."
    print

    # can change to model on the entire dataset but by convention splitting the dataset is a better option
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
        X, Y, test_size=0.10, random_state=5)

    lm = linear_model.LinearRegression()  # sklearn LinearRegression model used for cross-validation
    lm.fit(X_train, Y_train)

    scores = cross_validation.cross_val_score(
        lm, X, Y, cv=10,
        scoring='mean_squared_error')  # cross validation 10 folds
    predicted = cross_validation.cross_val_predict(lm, X, Y, cv=10)

    # differentiate between workflow execution and whole dataset execution
    if workFlow:
        # statsmodels OLS is used to build a model on the entire dataset and report per-variable statistics
        est = sm.OLS(Y, X).fit()
        print est.summary()
    else:
        print "WORKFLOW_" + str(workFlowNo)
        print "Estimated intercept coefficient: ", lm.intercept_
        estimatedCoefficients = pd.DataFrame(
            zip(X.columns, lm.coef_),
            columns=['Features', 'EstimatedCoefficients'])
        print estimatedCoefficients

    rmseEstimator = np.mean((scores * -1)**0.5)  # average of the RMSE values

    print
    print "RMSE Values of Estimator : " + str(rmseEstimator)
    print

    # plot graph for Fitted values vs Actual values
    plt.scatter(Y, predicted)
    plt.figure(1)
    plt.xlabel("Actual Median Value")
    plt.ylabel("Predicted Median Value")
    plt.title('Fitted values vs Actual Values')
    plt.plot([Y.min(), Y.max()], [Y.min(), Y.max()], 'k--', lw=4)

    if workFlow:
        plt.savefig(
            "../Graphs/{0}/Question {1}/LR - Fitted vs Actual.png".format(
                datasetName, '2a' if datasetName == "Network" else '4a'))
    else:
        plt.savefig(
            "../Graphs/{0}/Question 3/LR - Fitted vs Actual WorkFlow {1}.png".
            format(datasetName, workFlowNo))

    # plot graph for Residual values vs Fitted Values
    plt.figure(2)
    plt.xlabel('Fitted Values')
    plt.ylabel('Residuals')
    plt.title('Residuals vs Fitted Values plot')
    plt.scatter(predicted, predicted - Y, c='b', s=40, alpha=0.5)
    plt.hlines(y=0, xmin=-10, xmax=50)

    if workFlow:
        plt.savefig(
            "../Graphs/{0}/Question {1}/LR - Residuals vs Fitted.png".format(
                datasetName, '2a' if datasetName == "Network" else '4a'))
    else:
        plt.savefig(
            "../Graphs/{0}/Question 3/LR - Residuals vs Fitted WorkFlow {1}.png"
            .format(datasetName, workFlowNo))

    plt.show()
from sklearn import datasets
from sklearn import linear_model
from sklearn.cross_validation import cross_val_predict
import matplotlib.pyplot as plt
boston = datasets.load_boston()
#print(boston.DESCR)
#print(boston.target)
print(boston.data)
# CRIM (crime rate), ZN (proportion of residential lots over 25,000 sq ft)
# INDUS (proportion of non-retail land), CHAS (borders the Charles River), NOX (air pollution), RM (number of rooms)
# AGE (proportion of owner-occupied homes), DIS (distance to the city centre), RAD (distance to highways)
# TAX (property tax rate), PTRATIO (pupil-teacher ratio), B (proportion of Black residents)
# LSTAT (proportion of lower-income residents), MEDV (median home value, the regression target)
lr = linear_model.LinearRegression()
predict = cross_val_predict(lr, boston.data, boston.target, cv=10)
plt.figure()
plt.scatter(boston.target, predict)
y = boston.target
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
plt.plot()
plt.show()
print(predict)


from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
data = datasets.fetch_olivetti_faces()
#print(data.DESCR)
def randomForestRegression(X, Y, datasetName):
    """
    :param X: data consisting of features (excluding class variable)
    :param Y: column vector consisting of class variable
    :param datasetName: Network / Housing
    :return: models random forest regression with fine-tuning of tree depth and maximum number of trees
    """

    print "RANDOM FOREST REGRESSION"
    print "Executing..."
    print

    # fine tuning of depth of tree
    depth = range(4, 15)
    rmse = []

    for eachDepth in range(
            len(depth)):  # test depth of tree using 10 folds cross validation
        estimator = RandomForestRegressor(n_estimators=20,
                                          max_depth=depth[eachDepth])
        scores = cross_validation.cross_val_score(estimator,
                                                  X,
                                                  Y,
                                                  cv=10,
                                                  scoring='mean_squared_error')
        rmse.append((np.mean(scores) * -1)**(0.5))

    # plot graphs for RMSE vs Depth of Tree
    plt.figure(1)
    plt.plot(depth, rmse)
    plt.title('RMSE vs Depth of Tree')
    plt.xlabel('Depth')
    plt.ylabel('Root Mean Square Error')
    plt.savefig("../Graphs/{0}/Question 2b/RMSE vs Depth of Tree.png".format(
        datasetName))

    bestDepth = depth[rmse.index(
        min(rmse))]  # best depth obtained on basis of RMSE

    # fine tuning of number of maximum tree
    noOfTrees = range(20, 220, 40)
    rmse = []

    for eachTreeSize in range(
            len(noOfTrees)
    ):  # test maximum number of tree using 10 folds cross validation
        estimator = RandomForestRegressor(n_estimators=noOfTrees[eachTreeSize],
                                          max_depth=bestDepth)
        scores = cross_validation.cross_val_score(estimator,
                                                  X,
                                                  Y,
                                                  cv=10,
                                                  scoring='mean_squared_error')
        rmse.append((np.mean(scores) * -1)**(0.5))

    bestTrees = noOfTrees[rmse.index(min(rmse))]

    # plot graphs for RMSE vs Maximum No. of Tree

    plt.figure(2)
    plt.plot(noOfTrees, rmse)
    plt.title('RMSE vs Maximum No of Tree')
    plt.xlabel('Number of Tree')
    plt.ylabel('Root Mean Square Error')
    plt.savefig(
        "../Graphs/{0}/Question 2b/RMSE vs No of Tree.png".format(datasetName))

    rfe = RandomForestRegressor(n_estimators=bestTrees, max_depth=bestDepth)
    predicted = cross_validation.cross_val_predict(rfe, X, Y, cv=10)

    # plot graph for Fitted values vs Actual values
    plt.figure(3)
    plt.scatter(Y, predicted)
    plt.xlabel("Actual Median Value")
    plt.ylabel("Predicted Median Value")
    plt.title('Fitted values vs Actual Values')
    plt.plot([Y.min(), Y.max()], [Y.min(), Y.max()], 'k--', lw=4)
    plt.savefig("../Graphs/{0}/Question {1}/LR - Fitted vs Actual.png".format(
        datasetName, '2b' if datasetName == "Network" else '4a'))

    plt.figure(4)
    plt.xlabel('Fitted Values')
    plt.ylabel('Residuals')
    plt.title('Residuals vs Fitted Values plot')
    plt.scatter(predicted, predicted - Y, c='b', s=40, alpha=0.5)
    plt.hlines(y=0, xmin=-10, xmax=50)

    plt.savefig(
        "../Graphs/{0}/Question {1}/LR - Residuals vs Fitted.png".format(
            datasetName, '2b' if datasetName == "Network" else '4a'))

    # print the best rmse of random forest with maximum no of trees = 140 and depth = 9
    print "RANDOM FOREST REGRESSION"
    print "PARAMETER TUNING"
    print "Max Depth : " + str(bestDepth)
    print "Number of Maximum Tree " + str(bestTrees)
    print "Root Mean Squared Error : " + str(min(rmse))

    plt.show()
Exemple #60
0
boston = datasets.load_boston()
# print the dataset description
print(boston.DESCR)

# import the linear support vector regression (SVR) module
from sklearn.svm import LinearSVR
# import the cross-validation module
from sklearn.cross_validation import cross_val_predict

feature = boston.data
target = boston.target

# build the linear support vector regression model
model = LinearSVR()
# cross-validation: the dataset is split into 10 equal folds
predictions = cross_val_predict(model, feature, target, cv=10)

import matplotlib.pyplot as plt
# draw the scatter plot
plt.scatter(target, predictions)
# draw the 45° reference line
plt.plot([target.min(), target.max()],
         [target.min(), target.max()],
         'r--',
         lw=2)
# set the axis labels
plt.xlabel('true_target')
plt.ylabel('prediction')

plt.show()
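
# Note: the examples above use the legacy ``sklearn.cross_validation`` module, which was
# deprecated in scikit-learn 0.18 and removed in 0.20. On newer releases the same helpers
# are imported from ``sklearn.model_selection``; a minimal equivalent of the snippet above:
from sklearn.model_selection import cross_val_predict as cross_val_predict_new

predictions_new = cross_val_predict_new(LinearSVR(), feature, target, cv=10)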