# for i in range(0,Total_data_number):
    for z in randomized_list:
        temp_feature.append(feature_list_of_all_instances[z])
        temp_class.append(class_list_of_all_instances[z])
    feature_list_of_all_instances = temp_feature
    class_list_of_all_instances = temp_class

    # feature_list_of_all_instances = feature_list_of_all_instances.tolist()
    # class_list_of_all_instances = class_list_of_all_instances.tolist()
    data = []
    for i in range(0, Total_data_number):
        data.append(i)
    kf = cross_validation.KFold(Total_data_number, n_folds=5, shuffle=True)

    # Cs = numpy.logspace(-6, -1, 10)
    # clf = GridSearchCV(estimator='svc', param_grid=dict(C=Cs), n_jobs=-1)

    print("Starting K fold data to SVM ...")
    l = 0
    for iteration, data in enumerate(kf, start=1):  # rebinds data to the (train, test) index pair

        # print(iteration, data[0], data[1])
        train_set_indexes = data[0]
        test_set_indexes = data[1]

        temp_total_dataset = []
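
The snippet above cuts off after `temp_total_dataset = []`. A minimal, self-contained sketch of the same pattern (old-style `cross_validation.KFold` yielding a (train, test) index pair per fold), using toy lists that are not from the original code:

from sklearn import cross_validation

features = [[0], [1], [2], [3], [4], [5]]  # toy stand-ins, not the original data
labels = [0, 1, 0, 1, 0, 1]

kf = cross_validation.KFold(len(features), n_folds=3, shuffle=True)
for iteration, (train_idx, test_idx) in enumerate(kf, start=1):
    train_features = [features[i] for i in train_idx]
    test_features = [features[i] for i in test_idx]
    print("fold %d: %d train / %d test" % (iteration, len(train_idx), len(test_idx)))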
Example #2
    # parser.add_argument("last_sub_idx", help="last sub",
    #                 type=int, default=len(all_subjects))

    args = parser.parse_args()
    start_idx = args.start_sub_idx
    end_idx = args.end_sub_idx
    for experiment_counter, subject in enumerate(all_subjects[start_idx:end_idx]):



        file_name = os.path.join(data_base_dir, subject)
        all_data_per_char, target_per_char, train_mode_per_block, all_data_per_char_as_matrix, target_per_char_as_matrix = create_data_rep_training(
            file_name, -200, 800, downsampe_params=8)


        for rep_per_sub, cross_validation_indexes in enumerate(list(cross_validation.KFold(len(train_mode_per_block) // 10, n_folds=4,
                                                                              random_state=42, shuffle=True))):

            # separate randomly

            batch_size = 20
            select = 1



            train_as_p300 = False
            train_indexes = train_mode_per_block == 1
            validation_indexes = train_mode_per_block == 2
            test_indexes = train_mode_per_block != 1

            if train_as_p300:
                pass  # body truncated in the source snippet

# (the opening `if` branch of the next snippet is missing in the source;
#  only the data-loading `else` body survives)
training = pd.read_csv('../input/train.csv',
                       index_col="item_id",
                       parse_dates=["activation_date"])
traindex = training.index
testing = pd.read_csv('../input/test.csv',
                      index_col="item_id",
                      parse_dates=["activation_date"])
testdex = testing.index

ntrain = training.shape[0]
ntest = testing.shape[0]

kf = cross_validation.KFold(ntrain,
                            n_folds=NFOLDS,
                            shuffle=True,
                            random_state=SEED)

y = training.deal_probability.copy()
training.drop("deal_probability", axis=1, inplace=True)
print('Train shape: {} Rows, {} Columns'.format(*training.shape))
print('Test shape: {} Rows, {} Columns'.format(*testing.shape))

print("Combine Train and Test")
df = pd.concat([training, testing], axis=0)
del training, testing
gc.collect()

categorical = [
    "region", "city", "parent_category_name", "category_name", "user_type",
    "image_top_1", "param_1", "param_2", "param_3"
Example #4
print("Finished feature extraction over {} windows".format(len(X)))
print("Unique labels found: {}".format(set(y)))
sys.stdout.flush()


# %%---------------------------------------------------------------------------
#
#		                Train & Evaluate Classifier
#
# -----------------------------------------------------------------------------

n = len(y)
n_classes = len(class_names)

# TODO: Train your classifier!
cv = cross_validation.KFold(n, n_folds=10, shuffle=False, random_state=None)
tree = DecisionTreeClassifier(criterion="entropy", max_depth=3)
dtavgacc = 0.0
dtavgprecision = 0.0
dtavgrecall = 0.0

for i, (train_indexes, test_indexes) in enumerate(cv):
    X_train = X[train_indexes, :]
    y_train = y[train_indexes]
    X_test = X[test_indexes, :]
    y_test = y[test_indexes]
    tree.fit(X_train, y_train)

    y_pred = tree.predict(X_test)
    conf = confusion_matrix(y_test, y_pred, labels=[0,1,2])
    dtaccuracy1 = tree.score(X_test, y_test)
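
The snippet truncates before the `dtavg*` accumulators declared above are updated; a plausible continuation of the fold loop (assuming `precision_score` and `recall_score` are imported from `sklearn.metrics`, macro-averaged over the three labels and divided by the 10 folds) might look like:

    dtavgacc += dtaccuracy1 / 10.0
    dtavgprecision += precision_score(y_test, y_pred, average='macro') / 10.0
    dtavgrecall += recall_score(y_test, y_pred, average='macro') / 10.0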
Example #5
def trainAndTestNet():
    unsupervisedData, data, labels = createTrainingSet()

    print np.unique(np.argmax(labels, axis=1))

    print "data.shape"
    print data.shape
    print "labels.shape"
    print labels.shape

    # Random data for training and testing
    kf = cross_validation.KFold(n=len(data), n_folds=5)
    for train, test in kf:
        break

    print data
    data = common.scale(data)
    unsupervisedData = None

    activationFunction = activationfunctions.Rectified()
    rbmActivationFunctionVisible = activationfunctions.Identity()
    rbmActivationFunctionHidden = activationfunctions.RectifiedNoisy()

    unsupervisedLearningRate = 0.0001
    supervisedLearningRate = 0.001
    momentumMax = 0.99

    trainData = data[train]
    trainLabels = labels[train]

    # net = db.DBN(4, [1200, 1500, 1000, len(args.emotions)],
    #            binary=False,
    #            activationFunction=activationFunction,
    #            rbmActivationFunctionVisible=rbmActivationFunctionVisible,
    #            rbmActivationFunctionHidden=rbmActivationFunctionHidden,
    #            unsupervisedLearningRate=unsupervisedLearningRate,
    #            supervisedLearningRate=supervisedLearningRate,
    #            momentumMax=momentumMax,
    #            nesterovMomentum=True,
    #            rbmNesterovMomentum=True,
    #            rmsprop=True,
    #            miniBatchSize=20,
    #            hiddenDropout=0.5,
    #            visibleDropout=0.8,
    #            momentumFactorForLearningRateRBM=False,
    #            firstRBMheuristic=False,
    #            rbmVisibleDropout=1.0,
    #            rbmHiddenDropout=1.0,
    #            preTrainEpochs=10,
    #            sparsityConstraintRbm=False,
    #            sparsityRegularizationRbm=0.001,
    #            sparsityTragetRbm=0.01)
    #
    # net.train(trainData, trainLabels, maxEpochs=200,
    #           validation=False,
    #           unsupervisedData=unsupervisedData)
    #
    # probs, predicted = net.classify(data[test])

    net = cnn.CNN(30, 40, len(args.emotions))

    net.train(trainData, trainLabels)

    probs, predicted = net.classify(data[test])

    actualLabels = labels[test]
    correct = 0
    errorCases = []

    for i in xrange(len(test)):
        actual = actualLabels[i]
        print probs[i]
        if predicted[i] == np.argmax(actual):
            correct += 1
        else:
            errorCases.append(i)

    print "correct"
    print correct

    print "percentage correct"
    print correct * 1.0 / len(test)

    confMatrix = confusion_matrix(np.argmax(actualLabels, axis=1), predicted)
    print "confusion matrix"
    print confMatrix

    with open(args.net_file, "wb") as f:
        pickle.dump(net, f)
    return net
Example #6
def predict_news_article():
    '''
    It would seem we are essentially unable to predict which
    articles will get news stories, conditioned on their
    already receiving a press release! This is actually kind of
    interesting, since we *are* able to predict (certainly better
    than chance) which articles will get press release OR
    news articles. This supports the conclusions of Chambers et al:
    it seems the press release selection and process is the crucial
    thing.
    '''
    X, y, vectorizer = get_X_y()
    lr = LogisticRegression(penalty="l2", fit_intercept=True)

    parameters = {"C": [.1, .01, .001]}
    clf0 = GridSearchCV(lr, parameters, scoring='accuracy')
    print "fitting model..."
    clf0.fit(X, y)
    print "done."

    print texify_most_informative_features(vectorizer, clf0,
                                           "predictive features")

    kf = cross_validation.KFold(X.shape[0], shuffle="true", n_folds=5)
    fs, aucs = [], []
    fold = 0
    for train, test in kf:
        clf = GridSearchCV(lr, parameters, scoring='accuracy')
        clf.fit(X[train], y[train])

        probs = clf.predict_proba(X[test])

        #aucs.append(sklearn.metrics.roc_auc_score(y[test], probs))
        cur_auc = sklearn.metrics.roc_auc_score(y[test], probs[:, 1])
        aucs.append(cur_auc)
        preds = clf.predict(X[test])
        fs.append(sklearn.metrics.f1_score(y[test], preds))

        if fold == 0:
            fpr, tpr, thresholds = sklearn.metrics.roc_curve(
                y[test], probs[:, 1])
            pylab.clf()

            fout = "roc"

            pylab.plot(fpr, tpr, label="ROC curve (area = %0.2f)" % cur_auc)
            pylab.plot([0, 1], [0, 1], 'k--')
            pylab.xlim((-0.025, 1.025))
            pylab.ylim((-0.025, 1.025))
            pylab.xlabel("false positive rate")
            pylab.ylabel("true positive rate")
            pylab.title("ROC curve (area = %0.2f)" % cur_auc)
            pylab.tight_layout()
            pylab.savefig(fout)

        fold += 1

    print "average auc: %s" % (sum(aucs) / float(len(aucs)))
    print "average fs: %s" % (sum(fs) / float(len(fs)))
    #print "ABOUT TO RETURN"
    pdb.set_trace()
    return clf0
Example #7
def testPicklingDBN():
    data, labels = readKanade(False, None, equalize=False)

    print "data.shape"
    print data.shape
    print "labels.shape"
    print labels.shape

    # Random data for training and testing
    kf = cross_validation.KFold(n=len(data), n_folds=5)
    for train, test in kf:
        break

    if args.relu:
        activationFunction = Rectified()
        unsupervisedLearningRate = 0.05
        supervisedLearningRate = 0.01
        momentumMax = 0.95
        data = scale(data)
        rbmActivationFunctionVisible = Identity()
        rbmActivationFunctionHidden = RectifiedNoisy()

    else:
        activationFunction = Sigmoid()
        rbmActivationFunctionVisible = Sigmoid()
        rbmActivationFunctionHidden = Sigmoid()

        unsupervisedLearningRate = 0.5
        supervisedLearningRate = 0.1
        momentumMax = 0.9

    trainData = data[train]
    trainLabels = labels[train]

    # TODO: this might require more thought
    net = db.DBN(5, [1200, 1500, 1500, 1500, 7],
                 binary=1 - args.relu,
                 activationFunction=activationFunction,
                 rbmActivationFunctionVisible=rbmActivationFunctionVisible,
                 rbmActivationFunctionHidden=rbmActivationFunctionHidden,
                 unsupervisedLearningRate=unsupervisedLearningRate,
                 supervisedLearningRate=supervisedLearningRate,
                 momentumMax=momentumMax,
                 nesterovMomentum=True,
                 rbmNesterovMomentum=True,
                 rmsprop=True,
                 miniBatchSize=20,
                 hiddenDropout=0.5,
                 visibleDropout=0.8,
                 rbmVisibleDropout=1.0,
                 rbmHiddenDropout=1.0,
                 preTrainEpochs=1)

    net.train(trainData,
              trainLabels,
              maxEpochs=10,
              validation=False,
              unsupervisedData=None,
              trainingIndices=train)

    initialDict = net.__dict__

    with open(args.netFile, "wb") as f:
        pickle.dump(net, f)

    with open(args.netFile, "rb") as f:
        net = pickle.load(f)

    afterDict = net.__dict__

    del initialDict['rbmActivationFunctionHidden']
    del initialDict['rbmActivationFunctionVisible']

    del afterDict['rbmActivationFunctionHidden']
    del afterDict['rbmActivationFunctionVisible']

    for key in initialDict:
        assert key in afterDict
        if isinstance(initialDict[key], (np.ndarray, np.generic)):
            assert np.array_equal(initialDict[key], afterDict[key])
        else:
            assert initialDict[key] == afterDict[key]
Example #8
Y_pred_NB = modelNB.predict(X_test)

printMetrics(Y_pred, Y_pred_NB)  # 99.86

#%%

# USING CROSS VALIDATION: LOGISTIC REGRESSION

#from sklearn.svm import SVC

classifier = LogisticRegression()

#classifier = svm.SVC(kernel = 'rbf', C = 1, gamma = 0.001)

# performing kfold_cross_validation
kfold_cv = cross_validation.KFold(n=len(X_train), n_folds=20)
print(kfold_cv)

#Running the model using scoring metric as Accuracy

kfold_cv_result = cross_validation.cross_val_score(estimator=classifier,
                                                   X=X_train,
                                                   y=Y_train,
                                                   cv=kfold_cv)

#print(kfold_cv_result)

#finding the mean
print(kfold_cv_result.mean())  # 99.659

# MAX ACCURACY OF KFOLD FOR LOGISTIC REGRESSION:
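
The comment above is cut off; the likely intent, assuming `kfold_cv_result` is the per-fold accuracy array computed above, is simply:

print(kfold_cv_result.max())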
Example #9
#V = mat(V).T
# Components to be included as features
#k_pca = 3
#X = X*V[:,0:k_pca]
#N, M = X.shape

# Parameters for neural network classifier
n_hidden_units = 2  # number of hidden units
n_train = 5  # number of networks trained in each k-fold
learning_goal = 10  # stop criterion 1 (train mse to be reached)
max_epochs = 64  # stop criterion 2 (max epochs in training)
show_error_freq = 3  # frequency of training status updates

# K-fold crossvalidation
K = 10  # number of cross-validation folds
CV = cross_validation.KFold(N, K, shuffle=True)

# Variable for classification error
errors = np.zeros(K)
error_hist = np.zeros((max_epochs, K))
bestnet = list()
k = 0
for train_index, test_index in CV:
    print('\nCrossvalidation fold: {0}/{1}'.format(k + 1, K))

    # extract training and test set for current CV fold
    X_train = X[train_index, :]
    y_train = y[train_index]
    X_test = X[test_index, :]
    y_test = y[test_index]
    # X_train = X[train_index]
Example #10
    train_tags_all_subject = []
    test_tags_all_subject = []
    time_noise = 0

    for experiment_counter, subject in enumerate(
            all_subjects[start_idx:end_idx]):
        print "start subject:{}".format(subject)

        file_name = os.path.join(data_base_dir, subject)
        all_data_per_char, target_per_char, train_mode_per_block, all_data_per_char_as_matrix, target_per_char_as_matrix = create_data_rep_training(
            file_name, -200 + time_noise, 800 + time_noise, downsampe_params=8)

        for rep_per_sub, cross_validation_indexes in enumerate(
                list(
                    cross_validation.KFold(len(train_mode_per_block) // 10,
                                           n_folds=4,
                                           random_state=42,
                                           shuffle=True))):
            batch_size = 20
            select = 1
            train_as_p300 = False
            train_indexes = train_mode_per_block == 1
            validation_indexes = train_mode_per_block == 2
            test_indexes = train_mode_per_block != 1

            if train_as_p300:

                data_generator_batch = triplet_data_generator_no_dict(
                    all_data_per_char_as_matrix[train_indexes],
                    target_per_char_as_matrix[train_indexes],
                    batch_size=batch_size,
                    select=select,
Example #11
	np.save("train_X.npy", train_X)
	np.save("train_y.npy", train_y)
	print "Done"

	"""

    print "loading.."
    train_X = np.load("train_X.npy")
    train_y = np.load("train_y.npy")
    print train_X.shape, train_y.shape

    ################## XGBoost ###############
    print "Building models.."
    cv_scores = []
    kf = cross_validation.KFold(train_X.shape[0],
                                n_folds=8,
                                shuffle=True,
                                random_state=2015)
    for dev_index, val_index in kf:
        dev_X, val_X = train_X[dev_index, :], train_X[val_index, :]
        dev_y, val_y = train_y[dev_index], train_y[val_index]

        sc = preprocessing.StandardScaler()
        dev_X = sc.fit_transform(dev_X)
        val_X = sc.transform(val_X)

        runNN(dev_X, dev_y, val_X, test_y=val_y)

        break
Example #12
from sklearn.qda import QDA
from sklearn.svm import LinearSVC, SVC

from create_lagged_series import create_lagged_series

if __name__ == "__main__":
    snpret = create_lagged_series("^GSPC",
                                  datetime.datetime(2001, 1, 10),
                                  datetime.datetime(2005, 12, 31),
                                  lags=5)
    X = snpret[["Lag1", "Lag2"]]
    y = snpret["Direction"]

    kf = cross_validation.KFold(len(snpret),
                                n_folds=10,
                                indices=False,
                                shuffle=True,
                                random_state=42)
    for train_index, test_index in kf:
        X_train = X.ix[X.index[train_index]]
        X_test = X.ix[X.index[test_index]]
        y_train = y.ix[y.index[train_index]]
        y_test = y.ix[y.index[test_index]]
        print("Hit Rates/Confusion Matrices:\n")
        model = SVC(C=1000000.0,
                    cache_size=200,
                    class_weight=None,
                    coef0=0.0,
                    degree=3,
                    gamma=0.0001,
                    kernel='rbf')
        # (remainder of this snippet truncated in the source)

    for score_col in scores:
        predictions_all.ix[:, score_col] = predictions_all.ix[:, score_col].map(logit)

    unique_models = predictions_all.modelname.unique()
    n_obs = essays.meta_data().shape[0]
    predictions_matrix = np.zeros((len(scores), unique_models.shape[0], n_obs))
    for n_score, score in enumerate(scores):
        for n_model, model in enumerate(unique_models):
            predictions_matrix[
                n_score, n_model, :] = predictions_all_list[n_model][score]
    predictions_matrix = predictions_matrix.transpose(2, 0, 1)

    # set up cross validation
    trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0]
    testset = np.where(essays.meta_data().essay_type == "VALIDATION")[0]
    cvsets = cross_validation.KFold(len(trainset), 10, random_state=0)
    cvsets = [(trainset[tr], trainset[te])
              for tr, te in cvsets] + [(trainset, testset)]

    # set up final dataframe
    predictions_df = pd.DataFrame({
        'id': range(essays.meta_data().shape[0]),
        'student_id': essays.meta_data()["student_id"],
        'test_id': essays.meta_data()["test_id"],
        'essay_type': essays.meta_data()["essay_type"],
    })  # (remaining columns truncated in the source)

splits = 5
features = []
for i in range(amount):
    if (i % 10 == 0): print(i, "/", amount)
    #features[i] = extractor.splitColorFeatures(thumbs[i],splits)
    harald = extractor.calculateDarktoBrightRatio(thumbs[i])
    rian = extractor.splitColorFeatures(thumbs[i], splits)
    features.append(numpy.append(harald, rian))

#model = grid_search.GridSearchCV(svm.SVC(),{'kernel' : ['poly'], 'C' : [1, 10, 100, 1000], 'degree' : [4,7,10], 'shrinking' : [True, False]})
#model.fit(features, classes)
#print(model.best_estimator_)
#print('\a')

print("Producing KFold indexes")
kfold = cv.KFold(amount, n_folds=5, shuffle=True)
model = lda.LDA()
#model = svm.SVC(kernel = 'linear')
#model = qda.QDA()
score = cross_validation.cross_val_score(model, features, classes, cv=kfold)
print("scores ", score)
print("mean score ", score.mean())

#model = svm.SVC(kernel = 'linear', probability = True)
model = lda.LDA()
#model = neighbors.KNeighborsClassifier(n_neighbors = 1)
scores = score_calculation.loglossKFold(features, classes, model, 5)
print("logloss scores ", scores)
print("logloss score mean ", numpy.mean(scores), " ", numpy.std(scores))

#predictions = cross_validation.cross_val_predict(model, features, classes, cv = kfold)
def outer_cross():
    from Reading_data import *
    folder_prefix = ['Outer_', 'Inner_']
    # Normalize data
    X = stats.zscore(X)  # z-score normalization
    x_index = [24, 25, 31, 33]
    y_index = 54
    doc_nr = 1000
    y = X[:doc_nr, y_index]  # attribute 54 is used for prediction purpose
    X = X[:doc_nr, x_index]  # these  attribute are used as input
    N, M = X.shape
    print "Max value in all attribute :", np.max(X)
    print "Min value in all attribute ", np.min(X)
    print "X shape : ", X.shape
    # ============ parameter for ann =====================
    n_hidden_units = np.arange(2, 13)  # number of hidden units
    n_train = 2  # number of networks trained in each k-fold
    learning_goal = 100  # stop criterion 1 (train mse to be reached)
    max_epochs = 65  # stop criterion 2 (max epochs in training)
    show_error_freq = 10  # frequency of training status updates
    # =========================================================
    # summary index
    BEST_NEURONS_NR = 0
    NET_TRAIN_ERROR = 1
    BEST_TRAIN_ERROR = 2
    Y_TEST = 3
    Y_TEST_EST = 4
    Y_TRAIN = 5
    Y_TRAIN_EST = 6
    MEAN_TEST_ERR_VS_UNITS = 7
    MEAN_TRAIN_ERR_VS_UNIT = 8
    # ==================================================================

    OUTER_K = 3
    INNER_K = 5
    best_hidden_units = np.zeros(OUTER_K)
    Train_errors = np.zeros(OUTER_K)
    Test_errors = np.zeros(OUTER_K)
    # create two folders for the diagrams
    folder_one = create_new_dir(folder_prefix[0])
    folder_two = create_new_dir(folder_prefix[1])
    #===================================================================
    summary_dict = {
    }  # {k:(best_neurons_nr,net_train_error,best_train_error,y_test,y_test_est,y_train,y_train_est,mean_test_err_vs_unit,mean_train_err_vs_unit)}
    f = 0
    OUTER_CV = cross_validation.KFold(N, OUTER_K, shuffle=True)
    for train_index, test_index in OUTER_CV:
        print('\nOuter Crossvalidation fold: {0}/{1}'.format(f + 1, OUTER_K))

        # extract training and test set for current CV fold
        X_train = X[train_index, :]
        y_train = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]
        best_neurons_nr, mean_test_err_vs_unit, mean_train_err_vs_unit, mean_best_train_err_vs_unit = inner_cross(
            X_train, y_train, INNER_K, n_hidden_units, n_train, learning_goal,
            max_epochs, show_error_freq)

        bestnet, best_train_error, net_train_errors = find_best_network(
            X_train, y_train, n_train, best_neurons_nr, learning_goal,
            max_epochs, show_error_freq)
        y_test_est = bestnet.sim(X_test)
        y_train_est = bestnet.sim(X_train)

        summary_dict[f] = (best_neurons_nr, net_train_errors, best_train_error,
                           y_test, y_test_est, y_train, y_train_est,
                           mean_test_err_vs_unit, mean_train_err_vs_unit)
        #for index in x_index:
        # new_index=x_index.index(index)
        # plot_featue_vs_residual(attributeNames[index],X_train[:,new_index],index,abs(y_test_est-y_test),folder_one)
        f += 1

        # after the work is done then visualize it
        #create folder for saving files

    for key in summary_dict:
        value = summary_dict[key]
        plot_result(key, value[BEST_NEURONS_NR], value[Y_TEST],
                    value[Y_TEST_EST], folder_one)
        plot_error_vs_units(key, value[MEAN_TRAIN_ERR_VS_UNIT],
                            value[MEAN_TEST_ERR_VS_UNITS], n_hidden_units,
                            folder_two)
Example #16
def runGradientBoosting(train_file, train_label_file, test_file,
                        test_label_file):
    ##############################################
    # Load Data
    train_data = csv_io.read_data(train_file)
    train_label = csv_io.read_data(train_label_file)

    train_data = np.array([x[0:] for x in train_data])
    train_label = np.array([x[0] for x in train_label])
    ##############################################
    # Fit regression model
    params = {
        'n_estimators': 50,
        'max_depth': 10,
        'min_samples_split': 2,
        'learning_rate': 0.15
    }
    rf = ensemble.GradientBoostingRegressor(**params)

    # Doing cross validation using 3 folds
    cv = cross_validation.KFold(len(train_data), n_folds=3)
    average_importance = 0
    average_score = 0
    for traincv, testcv in cv:
        print train_data[traincv].shape, train_label[traincv].shape

        rf.fit(train_data[traincv], train_label[traincv])

        average_score += rf.score(train_data[testcv], train_label[testcv])
        #print rf.predict(train_data[testcv])
        print rf.feature_importances_

        mse = mean_squared_error(train_label[testcv],
                                 rf.predict(train_data[testcv]))
        mse_train = mean_squared_error(train_label[traincv],
                                       rf.predict(train_data[traincv]))
        print("MSE: %.4f" % mse)
        print("MSE: %.4f" % mse_train)

    print "average score is" + str(average_score / 10)
    # End doing cross validaiton

    # Predicting using test data
    test_data = csv_io.read_data(test_file)
    test_data = np.array([x[0:] for x in test_data])

    #test_label = csv_io.read_data(test_label_file)
    #test_label = np.array( x[1] for x in test_label )

    max_probs = []
    min_mse = 1
    for i in range(1, 20):
        rf.fit(train_data, train_label)
        predicted_probs = rf.predict(test_data)  # GradientBoostingRegressor has no predict_proba
        mse_train = mean_squared_error(train_label, rf.predict(train_data))
        if min_mse > mse_train:
            min_mse = mse_train
            max_probs = predicted_probs

    print np.array(max_probs).shape
    #print mean_squared_error(test_label, max_probs)
    postprocessing.getBenchmark(max_probs)
Example #17
plt.semilogx(alphas,
             np.array(scores) - np.array(scores_std) / np.sqrt(len(X)), 'b--')
plt.ylabel('CV score')
plt.xlabel('alpha')
plt.axhline(np.max(scores), linestyle='--', color='.5')

##############################################################################
# Bonus: how much can you trust the selection of alpha?

# To answer this question we use the LassoCV object that sets its alpha
# parameter automatically from the data by internal cross-validation (i.e. it
# performs cross-validation on the training data it receives).
# We use external cross-validation to see how much the automatically obtained
# alphas differ across different cross-validation folds.
lasso_cv = linear_model.LassoCV(alphas=alphas)
k_fold = cross_validation.KFold(len(X), 3)

print("Answer to the bonus question:",
      "how much can you trust the selection of alpha?")
print()
print("Alpha parameters maximising the generalization score on different")
print("subsets of the data:")
for k, (train, test) in enumerate(k_fold):
    lasso_cv.fit(X[train], y[train])
    print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".format(
        k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])))
print()
print("Answer: Not very much since we obtained different alphas for different")
print("subsets of the data and moreover, the scores for these alphas differ")
print("quite substantially.")
Example #18
def run_pipe(input_files, input_labels, use_modules, no_proc):
    '''run svr forkflow on data'''

    #--------------Organise inputs
    #calculate matrix
    #feature_matrix = prepare_modality(input_files, input_mask)
    #--------------Execute analysis
    #prepare feature agglomeration
    #mask_handle = nb.load(input_mask)
    connect = sklim.grid_to_graph(*input_files[0].shape,
                                  mask=np.invert(
                                      np.isnan(np.sum(input_files, 0))))
    inshape = input_files.shape

    feature_matrix = input_files.reshape((inshape[0], -1))

    #remove nans
    sum_features = np.sum(feature_matrix, 0)
    feature_matrix = feature_matrix[:, np.invert(np.isnan(sum_features))]

    #cross validation
    loo = sklcv.KFold(len(input_labels), n_folds=len(input_labels))
    print('Starting svr')

    cv_pred = jl.Parallel(n_jobs=no_proc, verbose=1, pre_dispatch=no_proc * 2)(
        jl.delayed(do_model)(feature_matrix[train], input_labels[train],
                             feature_matrix[test], connect, use_modules)
        for train, test in loo)
    cv_pred = np.array(cv_pred)
    corr, p = ss.pearsonr(cv_pred[:, 0], input_labels)

    #creating final model
    print('creating final model')
    if use_modules.find('a') != -1:
        final_agglo = sklcl.WardAgglomeration(connectivity=connect,
                                              n_clusters=int(
                                                  np.median(cv_pred[:, 1])))
        feature_matrix = final_agglo.fit_transform(feature_matrix)
    else:
        final_agglo = 0

    if use_modules.find('b') != -1:
        bool_pos, bool_neg = direction_cutoff(feature_matrix)
        feature_matrix = feature_matrix[:, bool_pos]
    else:
        bool_pos = 0

    if use_modules.find('c') != -1:
        final_scaler = sklpre.StandardScaler()
        feature_matrix = final_scaler.fit_transform(feature_matrix)
    else:
        final_scaler = 0

    if use_modules.find('d') != -1:
        final_univ = sklfs.SelectFpr(alpha=np.median(cv_pred[:, 2]))
        feature_matrix = final_univ.fit_transform(feature_matrix, input_labels)
    else:
        final_univ = 0

    final_model = sklsvm.NuSVR(kernel='linear',
                               C=100,
                               degree=1,
                               nu=np.median(cv_pred[:, 3]))
    final_model.fit(feature_matrix, input_labels)

    return cv_pred, corr, p, final_agglo, final_univ, final_scaler, bool_pos, final_model
Example #19
# Fit a per-column scaler
X_scaler = StandardScaler().fit(X)
# Apply the scaler to X
X_train = X_scaler.transform(X)
y_train = np.array(label_list)

# Convert label strings to numerical encoding
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

# Create classifier
clf = svm.SVC(kernel='linear')

# Set up 5-fold cross-validation
kf = cross_validation.KFold(len(X_train),
                            n_folds=5,
                            shuffle=True,
                            random_state=1)

# Perform cross-validation
scores = cross_validation.cross_val_score(cv=kf,
                                          estimator=clf,
                                          X=X_train,
                                          y=y_train,
                                          scoring='accuracy')
print('Scores: ' + str(scores))
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), 2 * scores.std()))

# Gather predictions
predictions = cross_validation.cross_val_predict(cv=kf,
                                                 estimator=clf,
                                                 X=X_train,
                                                 y=y_train)

    def linear_regression(self, lr_input, sqlContext, cc_output):

        # try:
        logging.info('Performing Regression')
        dfi = pd.read_csv(spark_conf.file_path['data_update_path'] +
                          'Encoded_Classified_1.txt',
                          sep='|',
                          encoding='ISO-8859-1')
        #dfi_test = pd.read_csv(spark_conf.classifier_input['output_file']+'Encoded_Classified_test.txt', sep  = '|', encoding = 'ISO-8859-1')
        dfi_test_new = cc_output
        dfi_test = lr_input
        input_list = list(dfi_test)
        #print input_list
        corr = dfi.corr()
        #sns.heatmap(corr,xticklabels=corr.columns,yticklabels=corr.columns)
        feature_cols = [
            'likes', 'comment_count', 'user_level_num', 'Average', 'Avoid!',
            'Blah!', 'Good Enough', 'Great!', 'Insane!', 'Not rated',
            'Very Bad', 'Well...', 'Big Foodie', 'Connoisseur', 'Foodie',
            'Super Foodie', 'Bad Ambience', 'Bad Food', 'Bad Service',
            'Good Ambience', 'Good Food', 'Good Service', 'Not Worthy',
            'binarized_user_foodie_level', 'binarized_rating_text',
            'binarized_class_name'
        ]

        feature_cols_1 = list(set(input_list).intersection(feature_cols))
        # print feature_cols_1

        X_train = dfi[:-1]
        # print len(X_train)
        X_test = dfi_test[0:]
        # print len(X_test)
        y_train = dfi.confidence[:-1]
        # print len(y_train)
        y_test = dfi_test.confidence[0:]
        #print len(y_test)

        X = X_train[feature_cols_1]
        y = y_train
        Xtest = X_test[feature_cols_1]

        regr = linear_model.Lasso(alpha=0.0000000001,
                                  fit_intercept=True,
                                  normalize=False,
                                  precompute=False,
                                  copy_X=True,
                                  max_iter=1000,
                                  tol=0.0001,
                                  warm_start=False,
                                  positive=False,
                                  random_state=None,
                                  selection='cyclic')
        regr.fit(X, y)

        shuffle = cross_validation.KFold(len(X),
                                         n_folds=10,
                                         shuffle=True,
                                         random_state=0)
        scores = cross_validation.cross_val_score(regr, X, y, cv=shuffle)
        #print("Accuracy: %.3f%% (%.3f%%)") % (scores.mean()*100.0, scores.std()*100.0)

        #print regr.intercept_
        #print (regr.coef_)

        #print mean_squared_error(regr.predict(Xtest), y_test)**0.5
        #print regr.predict(Xtest)
        #print regr.score(X,y)

        se = pd.Series(regr.predict(Xtest))
        dfi_test_new['score'] = se.values
        # dfi_test['xyz'] =  se.values
        print list(dfi_test_new)
        df_s = sqlContext.createDataFrame(dfi_test_new)
        #df_s.show()
        #print df_s.count()
        df_s.rdd.map(lambda x: list(x)).map(lambda y: filter_data(
            y)).saveAsTextFile(spark_conf.hdfs_path['classifier_output'] +
                               '%s.txt' % spark_conf.utc_time()[1])
        #  dfi_test.to_csv(spark_conf.classifier_input['output_file']+'final_Output.txt',sep='|',encoding="ISO-8859-1")
        return 1
Example #21



# Try cross-validation (a 3-fold cross-validation with arbitrarily assigned parameters)
# Known bug: the following code raises an "overflow encountered in exp" warning;
# see the discussion at http://comments.gmane.org/gmane.comp.python.scikit-learn/3730
#
# 3-fold cross-validation means 2/3 train, 1/3 test per fold:
# Ex: >> X = np.array([2, 3, 1, 0, 12, 10, 22, 11, 22, 111, 23, 12])
#     >> kfold = cross_validation.KFold(len(X), n_folds=3)
#     ### ytrain [ 12  10  22  11  22 111  23  12]  ytest [2 3 1 0]
#     ### ytrain [  2   3   1   0  22 111  23  12]  ytest [12 10 22 11]
#     ### ytrain [ 2  3  1  0 12 10 22 11]          ytest [ 22 111  23  12]
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation
kfold = cross_validation.KFold(len(X), n_folds=3)
gbc= GradientBoostingClassifier(n_estimators=100, max_depth=1, learning_rate=1.0,random_state=0)
crossScores=[gbc.fit(X[train], y[train]).score(X[test], y[test]) for train, test in kfold]
print crossScores, 'average crossvalidation score = ' + str(sum(crossScores) / 3)  ## returns 0.55
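
For reference, `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20; an equivalent of the 3-fold loop above under the `model_selection` API (where `KFold` no longer takes `n` and instead splits the data via `.split`) would be:

from sklearn.model_selection import KFold

kfold = KFold(n_splits=3)
crossScores = [gbc.fit(X[train], y[train]).score(X[test], y[test])
               for train, test in kfold.split(X)]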



"""Algorithm evaluation starts here!!!!"""


separateIdx = len(X) * 60 / 100
X_train = X[0:separateIdx]
y_train = y[0:separateIdx]
X_test = X[separateIdx:]
y_test = y[separateIdx:]
def c_validation(X, y, k, function):
    kf = cross_validation.KFold(X.shape[0], n_folds=k)

    totalloss = 0  # Variable that will store the total intances that will be tested
    totalsuccess5 = 0
    totalsuccess10 = 0
    totalpercentageloss = 0
    res = []
    corr = []

    lines = []

    for trainIndex, testIndex in kf:
        trainSet = X[trainIndex]
        testSet = X[testIndex]
        trainLabels = y[trainIndex]
        testLabels = y[testIndex]

        avg = 0
        for i in trainLabels:
            avg += i
        avg = avg / trainLabels.shape[0]

        predictedLabels = function(trainSet, trainLabels, testSet)

        loss = 0
        percentageloss = 0
        success5 = 0
        success10 = 0
        for i in range(testSet.shape[0]):
            if not np.isnan(predictedLabels[i][0]):
                if predictedLabels[i][0] > 1:
                    predictedLabels[i][0] = avg
                loss += abs(predictedLabels[i][0] - testLabels[i]) / (
                    testLabels[i] * testLabels.shape[0])
                percentageloss += abs(predictedLabels[i][0] -
                                      testLabels[i]) / testLabels.shape[0]
                if abs(predictedLabels[i][0] -
                       testLabels[i]) / testLabels[i] < 0.05:
                    success5 += 1 / testLabels.shape[0]
                    success10 += 1 / testLabels.shape[0]
                elif abs(predictedLabels[i][0] -
                         testLabels[i]) / testLabels[i] < 0.1:
                    success10 += 1 / testLabels.shape[0]
            else:
                print(i)
        print('Loss: ', 100 * loss, '%')
        print('Average error: ', 100 * percentageloss, '%')
        print('Success 0.05: ', 100 * success5, '%')
        print('Success 0.1: ', 100 * success10, '%')
        totalloss += loss
        totalpercentageloss += percentageloss
        totalsuccess5 += success5
        totalsuccess10 += success10

        res += list(predictedLabels)
        corr += list(testLabels[:])

    plt.plot(res, linestyle='', marker='.')
    plt.plot(corr, linestyle='', marker='.')
    plt.show()

    print('Total Loss: ', 100 * totalloss / k, '%')
    print('Total Average Error: ', 100 * totalpercentageloss / k, '%')
    print('Total success 0.05: ', 100 * totalsuccess5 / k, '%')
    print('Total success 0.1: ', 100 * totalsuccess10 / k, '%')
    return totalloss / k
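
A hypothetical usage sketch for `c_validation`: the `function` argument is expected to take `(trainSet, trainLabels, testSet)` and return one predicted value per test row as a column. The mean predictor below is illustrative only, not part of the original code.

import numpy as np

def mean_predictor(trainSet, trainLabels, testSet):
    # predict the training-set mean for every test instance
    return np.full((testSet.shape[0], 1), np.mean(trainLabels))

# avg_loss = c_validation(X, y, 5, mean_predictor)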
Example #23
# -*- coding: utf-8 -*-
"""
@author: Yu
"""

import numpy
from sklearn import linear_model, cross_validation

data = numpy.genfromtxt('random_housing_data.csv',
                        delimiter=',',
                        skip_header=1)
alphas = [0.1, 0.01, 0.001]
others = data[:, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)]
medv = data[:, 13]
rmse = []

tenfold = cross_validation.KFold(len(others), 10, True)
ridge = linear_model.RidgeCV(alphas=alphas,
                             fit_intercept=False,
                             normalize=False,
                             scoring=None,
                             cv=tenfold,
                             gcv_mode=None,
                             store_cv_values=False)
ridge.fit(others, medv)

predicted = ridge.predict(others)

rmse.append(numpy.sqrt(((predicted - medv)**2).mean()))

print 'RMSE: \n', rmse
print 'Alpha: \n', ridge.alpha_
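
Note that the RMSE above is computed on the same data the model was fit on; a sketch (not in the original) of an out-of-sample, per-fold RMSE using the same `tenfold` splitter and the alpha selected above. A fresh `Ridge` is used so that the inner cross-validation of `RidgeCV` is not re-run on mismatched fold sizes:

fold_model = linear_model.Ridge(alpha=ridge.alpha_, fit_intercept=False)
fold_rmse = []
for train, test in tenfold:
    fold_model.fit(others[train], medv[train])
    pred = fold_model.predict(others[test])
    fold_rmse.append(numpy.sqrt(((pred - medv[test]) ** 2).mean()))
print 'CV RMSE: \n', numpy.mean(fold_rmse)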
bothClassifier = SVC()

end = time.time()
#########################################

# genderClassifier.fit(train, genders)
# ageClassifier.fit(train, ages)

# with open('image_classifiers.bin', 'wb') as fp:
# 	pickle.dump(genderClassifier, fp)
# 	pickle.dump(ageClassifier, fp)
# fp.close()

# Run 10-Fold Validation
#########################################
cv = cross_validation.KFold(size, n_folds=10)
resultsGender = []
resultsAge = []
resultsBoth = []
i = 1
for traincv, testcv in cv:
    print "Starting iteration ", i
    start = time.time()

    s = time.time()
    # note: the traincv[0]:traincv[-1] slicing assumes contiguous fold indices
    # (true for unshuffled KFold) and silently drops the last training index
    genderClassifier.fit(train[traincv[0]:traincv[-1]],
                         genders[traincv[0]:traincv[-1]])
    ageClassifier.fit(train[traincv[0]:traincv[-1]],
                      ages[traincv[0]:traincv[-1]])
    bothClassifier.fit(train[traincv[0]:traincv[-1]],
                       both[traincv[0]:traincv[-1]])
Example #25
def getHyperParamsAndBestNet():
    unsupervisedData, data, labels = createTrainingSet()

    print np.unique(np.argmax(labels, axis=1))

    print "data.shape"
    print data.shape
    print "labels.shape"
    print labels.shape

    print data
    data = common.scale(data)
    unsupervisedData = None

    activationFunction = activationfunctions.Rectified()
    rbmActivationFunctionVisible = activationfunctions.Identity()
    rbmActivationFunctionHidden = activationfunctions.RectifiedNoisy()

    tried_params = []
    percentages = []
    best_index = 0
    index = 0
    best_correct = 0

    # Random data for training and testing
    kf = cross_validation.KFold(n=len(data), n_folds=10)
    for train, test in kf:
        unsupervisedLearningRate = random.uniform(0.0001, 0.2)
        supervisedLearningRate = random.uniform(0.0001, 0.2)
        momentumMax = random.uniform(0.7, 1)

        tried_params += [{
            'unsupervisedLearningRate': unsupervisedLearningRate,
            'supervisedLearningRate': supervisedLearningRate,
            'momentumMax': momentumMax
        }]

        trainData = data[train]
        trainLabels = labels[train]

        # net = db.DBN(4, [1200, 1500, 1000, len(args.emotions)],
        #            binary=False,
        #            activationFunction=activationFunction,
        #            rbmActivationFunctionVisible=rbmActivationFunctionVisible,
        #            rbmActivationFunctionHidden=rbmActivationFunctionHidden,
        #            unsupervisedLearningRate=unsupervisedLearningRate,
        #            supervisedLearningRate=supervisedLearningRate,
        #            momentumMax=momentumMax,
        #            nesterovMomentum=True,
        #            rbmNesterovMomentum=True,
        #            rmsprop=True,
        #            miniBatchSize=20,
        #            hiddenDropout=0.5,
        #            visibleDropout=0.8,
        #            momentumFactorForLearningRateRBM=False,
        #            firstRBMheuristic=False,
        #            rbmVisibleDropout=1.0,
        #            rbmHiddenDropout=1.0,
        #            preTrainEpochs=10,
        #            sparsityConstraintRbm=False,
        #            sparsityRegularizationRbm=0.001,
        #            sparsityTragetRbm=0.01)
        #
        # net.train(trainData, trainLabels, maxEpochs=200,
        #           validation=False,
        #           unsupervisedData=unsupervisedData)
        #
        # probs, predicted = net.classify(data[test])

        net = cnn.CNN(30, 40, len(args.emotions))

        net.train(trainData, trainLabels)

        probs, predicted = net.classify(data[test])

        actualLabels = labels[test]
        correct = 0

        for i in xrange(len(test)):
            actual = actualLabels[i]
            print probs[i]
            if predicted[i] == np.argmax(actual):
                correct += 1

        percentage_correct = correct * 1.0 / len(test)
        print "percentage correct"
        print percentage_correct

        if percentage_correct > best_correct:
            best_index = index
            best_correct = percentage_correct
            with open(args.net_file, "wb") as f:
                pickle.dump(net, f)

        percentages += [percentage_correct]
        index += 1

    print 'best params'
    print tried_params[best_index]
    print 'precision'
    print best_correct
Example #26
    elif option == '-l':
      label_file = value
    else:
      assert False, "Option %s not available" % option
  if not data_file or not label_file:
    Usage()
  data = np.genfromtxt(data_file, delimiter=',')
  labels = np.genfromtxt(label_file, delimiter='\n')
  #data=data[labels!=2]
  # Normalizing data
  data = preprocessing.scale(data)
  preds_1 = np.zeros(data.shape[0])
  preds_2 = np.zeros(data.shape[0])
  preds_3 = np.zeros(data.shape[0])
  preds_4 = np.zeros(data.shape[0])
  for train_index, test_index in cross_validation.KFold(data.shape[0], n_folds=10):
  #for train_index, test_index in cross_validation.LeaveOneOut(data.shape[0]):
    print 'Running for a split...'
    estimator = LogisticRegression()
    estimator.fit(data[train_index], labels[train_index])
    preds_1[test_index] = estimator.predict(data[test_index])

    estimator = sklearn.dummy.DummyClassifier(strategy='stratified', random_state=0)
    estimator.fit(data[train_index], labels[train_index])
    preds_2[test_index] = estimator.predict(data[test_index])

    estimator = RandomForestClassifier(n_estimators=20, n_jobs=5)
    estimator.fit(data[train_index], labels[train_index])
    preds_3[test_index] = estimator.predict(data[test_index])

#GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
Example #27
def run_stack(SEED):

    model = "DIV Long-Lat KNN5 55 Imp"

    print "Running GB, RF, ET stack."

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess5_40.csv",
                                 skipFirstLine=False,
                                 split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess5_40.csv",
                            skipFirstLine=False,
                            split="\t")
    weights = csv_io.read_data("PreProcessData/Weights.csv",
                               skipFirstLine=False)

    #random.seed(SEED)
    #random.shuffle(trainBase)

    avg = 0
    NumFolds = 5  # 5 is good, but 10 yields a better mean since outliers are less significant (note: predictions are less reliable when using 10).

    predicted_list = []
    bootstrapLists = []

    # use this for quick runs.
    # note RF with 150 crashes on 30 features
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
    # GradientBoostingRegressor(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
    # RandomForestRegressor(n_estimators=100, n_jobs=1),
    #RandomForestRegressor(n_estimators=75, n_jobs=1),
    # clfs = [ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1),
    # SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, tol=0.001, verbose=False)
    # ]
    #knn 5 at 3.45
    #knn 15 at 3.31
    #knn 25 at 3.30
    #knn 40 at 3.31
    # KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
    # KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
    # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
    # LinearRegression at 3.77
    # Ridge at 3.77
    # SGD 4.23
    #Gauss at 13
    # LinearRegression(fit_intercept=True, normalize=False, copy_X=True),
    # Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001),
    # SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, rho=0.84999999999999998, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.10000000000000001, p=None, seed=0, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False),
    # GaussianNB()
    # clfs = [KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
    # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),KNeighborsRegressor(n_neighbors=35, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2)
    # ]

    # GB, 125 est is minimum, score is bad below this, explore higher and other dimensions. ******************
    # clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100, random_state=166),
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=100, random_state=166),
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166),
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=200, random_state=166),
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=200, random_state=166),GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=200, random_state=166)
    # ]

    # about 1 hour run time, and 3.10 score.
    #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166)
    # about 2 hours run time at 3.05
    #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=400, random_state=166)
    # about 2 hours run time at 3.06
    #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=800, random_state=166)
    # about 4 hours run time at 3.06
    #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=800, random_state=166)

    # 6/2000 on 40 features is 2.97
    # I
    clfs = [
        GradientBoostingRegressor(max_features=40,
                                  learn_rate=0.05,
                                  subsample=0.5,
                                  max_depth=6,
                                  n_estimators=3000,
                                  random_state=166,
                                  min_samples_leaf=1)
    ]

    # use this for quick runs.
    # clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50, random_state=166),
    # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125, random_state=551),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80, random_state=441),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80, random_state=331),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80, random_state=221),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120, random_state=91),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120, random_state=81),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120, random_state=71),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160, random_state=61),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160, random_state=51),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160, random_state=41),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200, random_state=31),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200, random_state=21),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200, random_state=10),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200, random_state=19),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240, random_state=18),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240, random_state=17),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240, random_state=16),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280, random_state=15),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280, random_state=14),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280, random_state=13),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320, random_state=12),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320, random_state=11),
    # RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
    # RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy'),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5)]

    # use this for quick runs.  reduced estimators to 50
    # clfs = [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
    # gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True,
    # tol=0.001, verbose=False)
    # ]

    #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
    #ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1)

    # clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
    # GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
    # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
    # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
    # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
    # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
    # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7)]

    # full algorithm stack.
    # clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8),
    # GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
    # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
    # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
    # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
    # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
    # RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
    # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
    # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
    # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
    # RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8)]

    print "Data size: ", len(trainBase), len(test)
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    trainNew = []
    trainTestNew = []
    testNew = []
    trainNewSelect = []
    trainTestNewSelect = []
    testNewSelect = []

    print "Scaling"
    targetPre = [x[0] for x in trainBase]  # first column is the target
    trainPre = [x[1:] for x in trainBase]  # remaining columns are features
    testPre = [x[0:] for x in test]  # test rows carry no target column, keep all
    #print trainPre[0]
    scaler = preprocessing.Scaler().fit(trainPre)  # Scaler was renamed StandardScaler in later scikit-learn releases
    trainScaled = scaler.transform(trainPre)
    testScaled = scaler.transform(testPre)

    #print scaler.mean_
    #print scaler.std_
    print "Begin Training"

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []

        dataset_blend_test_set = np.zeros((len(test), NumFolds))

        foldCount = 0

        #Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
    # `k=` is the older scikit-learn spelling of this argument; later releases
    # of sklearn.cross_validation call it n_folds.
    Folds = cross_validation.KFold(len(trainBase),
                                   k=NumFolds,
                                   indices=True)
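    # Hedged aside (not in the original run): for classification targets, a
    # stratified split such as
    #     Folds = cross_validation.StratifiedKFold(targetPre, k=NumFolds)
    # keeps class proportions even across folds.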
        for train_index, test_index in Folds:

            #trainBaseTemp = [trainBase[i] for i in train_index]
            #target = [x[0] for x in trainBaseTemp]
            #train = [x[1:] for x in trainBaseTemp]

            #testBaseTemp = [trainBase[i] for i in test_index]
            #targetTest = [x[0] for x in testBaseTemp]
            #trainTest = [x[1:] for x in testBaseTemp]

            #test = [x[0:] for x in test]

            target = [targetPre[i] for i in train_index]
            train = [trainScaled[i] for i in train_index]

            targetTest = [targetPre[i] for i in test_index]
            trainTest = [trainScaled[i] for i in test_index]

            print
            print "Iteration: ", foldCount
            print "LEN: ", len(train), len(target)

            clf.fit(train, target)
            # Note: predict() returns hard labels, so the blend stacks class
            # predictions rather than probabilities, despite the variable name.
            prob = clf.predict(trainTest)

            dataset_blend_train[test_index, ExecutionIndex] = prob

            probSum = 0
            weightSum = 0
            # totalOffByHalf = 0
            # totalPositive = 0
            # totalPositiveOffByHalf = 0
            # totalPositivePredictions = 0

            for i in range(0, len(prob)):
                probX = prob[i]

                probSum += weights[test_index[i]][0] * math.fabs(
                    targetTest[i] - probX)
                weightSum += weights[test_index[i]][0]
                #print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX

                # log loss cal
                #probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
                # if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
                # totalOffByHalf = totalOffByHalf + 1

                # if ( int(targetTest[i]) == 1 ):
                # totalPositive = totalPositive + 1
                # if ( int(targetTest[i]) == 1 and probX < 0.5):
                # totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                # if (probX > 0.5):
                # totalPositivePredictions = totalPositivePredictions + 1

            # print
            # print "Stats:"
            # print "Total Off By > 0.5 ", totalOffByHalf
            # print "Total Positive ", totalPositive
            # print "Total Positive Off By Half ", totalPositiveOffByHalf
            # print "Total Positive Predictions ", totalPositivePredictions
            #print -probSum/len(prob)
            print "Score: ", probSum / weightSum

            avg += (probSum / weightSum) / NumFolds

            predicted_probs = clf.predict(testScaled)
            #predicted_list.append([x[1] for x in predicted_probs])
            dataset_blend_test_set[:, foldCount] = predicted_probs  #[0]

            foldCount = foldCount + 1

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        #print "Saving NP"
        #np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
        #np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
        #np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
        #print "Done Saving NP"

        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1)
        csv_io.write_delimited_file_single(
            "../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" +
            str(avg) + "_" + str(clf)[:12] + ".csv",
            dataset_blend_test_set.mean(1))

        csv_io.write_delimited_file_single(
            "../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") +
            "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
            dataset_blend_train[:, ExecutionIndex])

        csv_io.write_delimited_file("../predictions/RunLog.csv", [
            now.strftime("%Y %m %d %H %M %S"),
            str(avg),
            str(clf),
            str(NumFolds), model, "", ""
        ],
                                    filemode="a",
                                    delimiter=",")

        print "------------------------Average: ", avg

        #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

    return dataset_blend_train, dataset_blend_test
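# A minimal sketch of the out-of-fold blending pattern implemented above, with
# the logging and bookkeeping stripped away. All names here (X_train, y_train,
# X_test, clfs) are illustrative placeholders, not the script's own variables.
import numpy as np
from sklearn import cross_validation

def stack_out_of_fold(clfs, X_train, y_train, X_test, n_folds=5):
    blend_train = np.zeros((X_train.shape[0], len(clfs)))
    blend_test = np.zeros((X_test.shape[0], len(clfs)))
    folds = list(cross_validation.KFold(X_train.shape[0], n_folds=n_folds,
                                        shuffle=True, random_state=0))
    for j, clf in enumerate(clfs):
        per_fold = np.zeros((X_test.shape[0], len(folds)))
        for f, (tr, te) in enumerate(folds):
            clf.fit(X_train[tr], y_train[tr])
            blend_train[te, j] = clf.predict(X_train[te])  # out-of-fold predictions
            per_fold[:, f] = clf.predict(X_test)           # this fold's test predictions
        blend_test[:, j] = per_fold.mean(axis=1)           # average over the folds
    return blend_train, blend_test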
Example #28
# Fragment from a larger script: mmwrite comes from scipy.io, and save_dir,
# save_stem, Omega, X, Y_ipsi, Y_contra, etc. are defined earlier in the file.
mmwrite(os.path.join(save_dir, save_stem + '_Omega.mtx'), Omega)
# h5write(os.path.join(save_dir, save_stem + '_W0_ipsi.h5'),
#         np.zeros((Y_ipsi.shape[0], X.shape[0])))
# h5write(os.path.join(save_dir, save_stem + '_W0_contra.h5'),
#         np.zeros((Y_contra.shape[0], X.shape[0])))
if cross_val_matrices:
    from sklearn import cross_validation
    fid = open(cmdfile, 'w')
    n_inj = X.shape[1]
    # Sets up nested outer/inner cross-validation. The inner loop is for
    # model selection (validation), the outer for testing.
    if cross_val == 'LOO':
        outer_sets = cross_validation.LeaveOneOut(n_inj)
    else:
        outer_sets = cross_validation.KFold(n_inj,
                                            n_folds=cross_val,
                                            shuffle=True,
                                            random_state=shuffle_seed)
    for i, (train, test) in enumerate(outer_sets):
        X_train = X[:, train]
        X_test = X[:, test]
        Y_train_ipsi = Y_ipsi[:, train]
        Y_test_ipsi = Y_ipsi[:, test]
        Omega_train = Omega[:, train]
        Omega_test = Omega[:, test]
        Y_train_contra = Y_contra[:, train]
        Y_test_contra = Y_contra[:, test]
        # Set up a per-fold output directory.
        outer_dir = os.path.join(save_dir, 'cval%d' % i)
        try:
            os.mkdir(outer_dir)
        except OSError:
            pass  # directory already exists (the snippet is truncated here in the source)
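# A minimal sketch of the nested layout described in the comment above: the
# outer folds are held out for testing, the inner folds drive model selection.
# Sizes and seeds here are illustrative, not taken from the fragment.
from sklearn import cross_validation

n_inj = 20
outer = cross_validation.KFold(n_inj, n_folds=5, shuffle=True, random_state=42)
for train, test in outer:
    inner = cross_validation.KFold(len(train), n_folds=4, shuffle=True,
                                   random_state=42)
    for fit_idx, val_idx in inner:
        fit, val = train[fit_idx], train[val_idx]
        # ... fit candidate models on `fit`, score them on `val` ...
    # ... refit the selected model on all of `train`, evaluate on `test` ...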
Example #29
# LARS Regression
import pandas
from sklearn import cross_validation
from sklearn.linear_model import Lars

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:, 0:13]
Y = array[:, 13]
num_folds = 10
num_instances = len(X)
seed = 7
# Note: shuffle=True is required for random_state to have any effect here.
kfold = cross_validation.KFold(n=num_instances,
                               n_folds=num_folds,
                               shuffle=True,
                               random_state=seed)
model = Lars()
scoring = 'mean_squared_error'
results = cross_validation.cross_val_score(model,
                                           X,
                                           Y,
                                           cv=kfold,
                                           scoring=scoring)
print(results.mean())
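# Hedged follow-up to the run above: these scikit-learn versions negate the
# 'mean_squared_error' scores so that larger is better, so flip the sign
# before reporting. `results` is the array produced by cross_val_score above.
import numpy as np

rmse_per_fold = np.sqrt(-results)  # negated MSE -> positive RMSE per fold
print("mean RMSE: %.3f (+/- %.3f)" % (rmse_per_fold.mean(), rmse_per_fold.std()))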
Example #30
def calculate(path=None, n_estimators=None, max_depth=None, max_features=None,
              vectorizer_max_features=None, neutral_sentiment=None):

  # Read console parameters if function is not called from python.
  if (path is None):
    path                    = sys.argv[1]
    n_estimators            = int(sys.argv[2])
    max_depth               = int(sys.argv[3])
    max_features            = int(sys.argv[4])
    vectorizer_max_features = int(sys.argv[5])
    # bool(sys.argv[6]) is True for any non-empty string, so parse the flag explicitly.
    neutral_sentiment       = sys.argv[6].lower() in ('1', 'true', 'yes')

  # Get only text reviews and star ratings from entire data set.
  print('Extracting data...')
  reviews, ratings = extract_reviews_and_rating(path, neutral_sentiment)

  # Limit vocabulary size to vectorizer_max_features.
  vectorizer = CountVectorizer(
      analyzer = "word",   
      tokenizer = None,    
      preprocessor = None, 
      stop_words = None,   
      max_features = vectorizer_max_features
  )

  # Initialize a random forest classifier.
  forest = RandomForestClassifier(
      n_estimators = n_estimators, 
      max_depth = max_depth,
      max_features = max_features
  )

  # Create bag of words features.
  train_data_features = vectorizer.fit_transform(reviews)
  train_data_features = train_data_features.toarray()

  # Transform the same reviews with the fitted vocabulary; the rows are
  # split into train and test sets per fold below.
  test_data_features = vectorizer.transform(reviews)
  test_data_features = test_data_features.toarray()

  # Prepare train and test indices for tenfold cross validation.
  kf = cross_validation.KFold(len(reviews), n_folds = 10)
  sum_accuracy = 0

  # Tenfold cross validation loop.
  for train, test in kf:

    # Convert python lists to numpy arrays.
    train = np.array(train)
    test = np.array(test)

    # Train classifier.
    print('Training classifier...')
    forest = forest.fit(train_data_features[train], ratings[train])

    # Use trained classifier to predict sentiment of test data.
    print('Processing test data...')
    result = forest.predict(test_data_features[test])

    # Calculate prediction accuracy.
    accuracy = accuracy_score(ratings[test], result)
    print('Accuracy: ' + str(accuracy) + '\n')
    
    # Sum each score to calculate average.
    sum_accuracy += accuracy


  # Display average accuracy for tenfold cross validation.
  accuracy = sum_accuracy / 10
  print('Average accuracy: ' + str(accuracy) + '\n')
  return accuracy
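# One caveat in the example above: the vectorizer is fitted on all reviews
# before the folds are split, so vocabulary statistics leak from test folds
# into training. A minimal sketch of fold-local fitting, assuming `reviews`
# is a list of strings and `ratings` a NumPy array (names reused from the
# function above purely for illustration).
import numpy as np
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

def cross_validate_fold_local(reviews, ratings, n_folds=10):
    reviews = np.asarray(reviews, dtype=object)
    scores = []
    for train, test in cross_validation.KFold(len(reviews), n_folds=n_folds):
        vectorizer = CountVectorizer(max_features=5000)
        # Fit the vocabulary on the training folds only.
        X_train = vectorizer.fit_transform(reviews[train]).toarray()
        X_test = vectorizer.transform(reviews[test]).toarray()
        forest = RandomForestClassifier(n_estimators=100)
        forest.fit(X_train, ratings[train])
        scores.append(accuracy_score(ratings[test], forest.predict(X_test)))
    return np.mean(scores)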