Example #1
def RBM_SVM(trainfeatures, testfeatures, trainlabels, testlabels):
    # ******************* scikit-learn RBM + SVM *******************
    print "train RBM+SVM model"

    ##    trainfeatures = (trainfeatures - np.min(trainfeatures, 0)) / (np.max(trainfeatures, 0) + 0.0001)  # 0-1 scaling
    min_max_scaler = preprocessing.MinMaxScaler()
    trainfeatures_fs = min_max_scaler.fit_transform(trainfeatures)
    testfeatures_fs = min_max_scaler.transform(testfeatures)

    # SVM parameters
    clf = svm.SVC(C=5.0, kernel='sigmoid', degree=3, gamma=0.5, coef0=10.0,
                  shrinking=True, probability=False, tol=0.001, cache_size=200,
                  class_weight=None, verbose=False, max_iter=-1, random_state=None)

    # RBM parameters
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20

    # Machine learning pipeline
    classifier = Pipeline(steps=[('rbm', rbm), ('svm', clf)])

    # More components tend to give better prediction performance, but larger
    # fitting time
    rbm.n_components = 400
    classifier.fit(trainfeatures_fs, trainlabels)
    results = classifier.predict(testfeatures_fs)

    results = results.ravel()
    testerror = float(len(testlabels)
                      - np.sum(testlabels == results))/float(len(testlabels))
    # print"error rate with SVM  is %.4f" %testerror

    return testerror
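
The hard-coded settings above (n_components=400, the SVC parameters) echo a comment that recurs throughout these examples: hyper-parameters were set by cross-validation with GridSearchCV. A minimal sketch of such a search over the same pipeline, assuming the scaled features and labels from RBM_SVM; the grid values themselves are illustrative:

from sklearn.model_selection import GridSearchCV

param_grid = {'rbm__n_components': [100, 200, 400],  # '<step name>__<parameter>'
              'rbm__learning_rate': [0.01, 0.06],
              'svm__C': [1.0, 5.0, 10.0]}
search = GridSearchCV(classifier, param_grid, cv=3)
search.fit(trainfeatures_fs, trainlabels)
print(search.best_params_)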
Example #2
def train_nn(data, expected_values):
    data, expected_values = preprocess_data(data,
                                            expected_values,
                                            remove_high_rr=False)
    logger.info("Starting feature reduction.")
    X = np.asarray(data[1:], 'float64')
    logger.info("Done with feature reduction.")
    Y = expected_values
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=0)

    logger.info("Starting NeuralNetwork training.")

    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    clf = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100
    logistic.C = 1.0

    clf.fit(X_train, Y_train)

    # Evaluation
    #TODO: Make unified evaluation
    logger.info("Logistic regression using RBM features:\n%s\n" %
                (metrics.classification_report(Y_test, clf.predict(X_test))))

    logger.info("Done with NeuralNetwork training.")
    return lambda x: wrap_threshold_distribtuion(
        np.array(clf.predict(x)).astype(float))
Example #3
def rbm():
    X_train, Y_train, X_test, Y_test = train_test_data(is_feature=False)

    rbm = BernoulliRBM(random_state=0, verbose=True)
    logistic = linear_model.LogisticRegression(solver='newton-cg', tol=1)
    rbm_features_classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    rbm.learning_rate = 0.06
    rbm.n_iter = 10
    # More components tend to give better prediction performance, but larger
    # fitting time
    rbm.n_components = 100
    logistic.C = 50

    X_train = X_train.reshape(X_train.shape[0], -1)
    # Training RBM-Logistic Pipeline
    rbm_features_classifier.fit(X_train, Y_train)

    # # Training the Logistic regression classifier directly on the pixel
    # raw_pixel_classifier = clone(logistic)
    # raw_pixel_classifier.C = 100.
    # raw_pixel_classifier.fit(X_train, Y_train)

    X_test = X_test.reshape(X_test.shape[0], -1)
    Y_pred = rbm_features_classifier.predict(X_test)

    # print("Logistic regression using RBM features:\n%s\n" % (
    #     metrics.classification_report(Y_test, Y_pred)))

    # Y_pred = raw_pixel_classifier.predict(X_test)

    result_analysis(Y_pred, Y_test, 'BernoulliRBM')
Example #4
def Logistic():
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    # RBM parameters obtained after cross-validation
    rbm.learning_rate = 0.01
    rbm.n_iter = 121
    rbm.n_components = 700
    logistic.C= 1.0  
    # Training RBM-Logistic Pipeline
    classifier.fit(data_train,target_train)
    # Training Logistic regression
    logistic_classifier = linear_model.LogisticRegression(C=1.0)
    logistic_classifier.fit(data_train,target_train)    
    print("printing_results")
    print("Logistic regression using RBM features:\n%s\n" % (metrics.classification_report(target_test,classifier.predict(data_test))))
    cm3 = confusion_matrix(target_test,classifier.predict(data_test))
    plt.matshow(cm3)
    plt.title('Confusion Matrix Logistic Regression with RBM Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix3.jpg')
    print("Logistic regression using raw pixel features:\n%s\n" % (metrics.classification_report(target_test,logistic_classifier.predict(data_test))))
    cm4 = confusion_matrix(target_test,logistic_classifier.predict(data_test))
    plt.matshow(cm4)
    plt.title('Confusion Matrix Logistic Regression')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix4.jpg')
#Logistic()
Example #5
def SGD():
    SGD = linear_model.SGDClassifier(loss='hinge',penalty='l2',random_state=42,n_jobs=-1,epsilon=0.001)
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('SGD', SGD)])
    # RBM parameters obtained after cross-validation
    rbm.learning_rate = 0.01
    rbm.n_iter = 15
    rbm.n_components = 50
    SGD.alpha=0.0001
    SGD.C=1 
    # Training SGD
    SGD_classifier = linear_model.SGDClassifier(loss='hinge',penalty='l2',random_state=42,n_jobs=-1,alpha=0.0001, epsilon=0.001)
    SGD_classifier.fit(data_train,target_train)
    # Training RBM-SGD Pipeline    
    classifier.fit(data_train,target_train)
    print("printing_results")
    
    print("SGD using RBM features:\n%s\n" % (metrics.classification_report(target_test,classifier.predict(data_test))))
    cm = confusion_matrix(target_test,classifier.predict(data_test))
    plt.matshow(cm)
    plt.title('Confusion Matrix SVM with SGD with RBM Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix1.jpg')
    print("SGD using raw pixel features:\n%s\n" % (metrics.classification_report(target_test,SGD_classifier.predict(data_test))))
    cm1 = confusion_matrix(target_test,SGD_classifier.predict(data_test))
    plt.matshow(cm1)
    plt.title('Confusion Matrix SVM with SGD Raw Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix2.jpg')
Example #6
def restrictedBoltzmannMachine(trainData, trainLabels, testData):
	logistic = linear_model.LogisticRegression(solver='lbfgs', max_iter=10000, multi_class='multinomial')
	rbm = BernoulliRBM(random_state=0, batch_size = 2000, verbose=True)

	rbm_features_classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

	# #############################################################################
	# Training

	# Hyper-parameters. These were set by cross-validation,
	# using a GridSearchCV. Here we are not performing cross-validation to
	# save time.
	rbm.learning_rate = 0.06
	rbm.n_iter = 20
	# More components tend to give better prediction performance, but larger
	# fitting time
	rbm.n_components = 100
	logistic.C = 6000

	# Training RBM-Logistic Pipeline
	rbm_features_classifier.fit(trainData, trainLabels)
	labels = rbm_features_classifier.predict(testData)

	#labels = list(labels)
	return labels

Example #7
def build_classifier(clf_name):

    clf = None
    parameters = {}

    if clf_name == "svm":
        clf = svm.SVC(kernel='linear', C=10)
        parameters = {}

    elif clf_name == "knn":
        clf = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='brute', leaf_size=30,
                                             metric='cosine', metric_params=None)

    elif clf_name == "rmb":
        logistic = linear_model.LogisticRegression()
        rbm = BernoulliRBM(random_state=0, verbose=True)
        rbm.learning_rate = 0.01
        rbm.n_iter = 20
        rbm.n_components = 100
        logistic.C = 6000
        clf = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
        #parameters = {'clf__C': (1, 10)}

    elif clf_name == "tsne":
        clf = TSNE(n_components=2, init='random', metric='cosine')

    return clf, parameters
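
A brief usage sketch for build_classifier, with hypothetical X_train/y_train/X_test arrays; note that the RBM branch is keyed by the string "rmb" as written above:

clf, parameters = build_classifier("rmb")  # RBM + logistic regression pipeline
clf.fit(X_train, y_train)                  # RBM learns features, logistic regression fits on them
predictions = clf.predict(X_test)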
Example #8
def runRBM(arr, clsfr):#iters, lrn_rate, logistic_c_val, logistic_c_val2, n_comp, filename):
    global file_dir, nEvents, solutionFile
    iters = int(arr[0]*10)
    lrn_rate = arr[1]
    logistic_c_val = arr[2]*1000.0
    logistic_c_val2 = arr[3]*100.0
    n_comp = int(arr[4]*100)
    filename = 'rbm_iter'+str(iters)+'_logc'+str(logistic_c_val)+'_logcc'+str(logistic_c_val2)+'_lrn'+str(lrn_rate)+'_nc'+str(n_comp)# low
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    ###############################################################################
    # Training

    # Hyper-parameters. These were set by cross-validation,
    # using a GridSearchCV. Here we are not performing cross-validation to
    # save time.
    rbm.learning_rate = lrn_rate #0.10#0.06
    rbm.n_iter = iters #20
    # More components tend to give better prediction performance, but larger
    # fitting time
    rbm.n_components = n_comp # 250
    logistic.C = logistic_c_val #6000.0

    # Training RBM-Logistic Pipeline
    classifier.fit(sigtr[train_input].values, sigtr['Label'].values)

    # Training Logistic regression
    logistic_classifier = linear_model.LogisticRegression(C=logistic_c_val2)#100.0
    logistic_classifier.fit(sigtr[train_input].values, sigtr['Label'].values)

    ###############################################################################
    # Evaluation
    if clsfr == 0:
        clsnn_pred=classifier.predict(sigtest[train_input].values)
        solnFile('clsnn_'+filename,clsnn_pred,sigtest['EventId'].values)#,bkgtest)
        ams_score = ams.AMS_metric(solutionFile, file_dir+filename+'.out', nEvents)
        print ams_score
        logfile.write(filename+': ' + str(ams_score)+'\n')
    
    elif clsfr == 1:
        log_cls_pred = logistic_classifier.predict(sigtest[train_input].values)
        solnFile('lognn_'+filename,log_cls_pred,sigtest['EventId'].values)#,bkgtest)
        ams_score = ams.AMS_metric(solutionFile, file_dir+'lognn_'+filename+'.out', nEvents)
        print ams_score
        logfile.write('lognn ' + filename+': ' + str(ams_score)+'\n')
    else:
        logistic_classifier_tx = linear_model.LogisticRegression(C=logistic_c_val2)
        logistic_classifier_tx.fit_transform(sigtr[train_input].values, sigtr['Label'].values)
        log_cls_tx_pred = logistic_classifier_tx.predict(sigtest[train_input].values)
        solnFile('lognntx_'+filename,log_cls_tx_pred,sigtest['EventId'].values)#,bkgtest)
        ams_score = ams.AMS_metric(solutionFile, file_dir+filename+'.out', nEvents)
        print ams_score
        logfile.write('lognntx '+ filename+': ' + str(ams_score)+'\n')

    return -1.0*float(ams_score)
Example #9
def train_rbm(X, n_components=100, n_iter=10):
    X = X.astype(np.float64)
    X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)  # scale to [0..1]
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = n_iter
    rbm.n_components = n_components
    rbm.fit(X)
    return rbm
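
BernoulliRBM models visible units in [0, 1], which is why train_rbm rescales X before fitting. A sketch of the same preprocessing using scikit-learn's MinMaxScaler in place of the manual expression (an alternative spelling, not the author's code):

from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

rbm_pipeline = make_pipeline(MinMaxScaler(),
                             BernoulliRBM(n_components=100, n_iter=10,
                                          learning_rate=0.06, random_state=0))
# rbm_pipeline.fit(X)  # X: any (n_samples, n_features) array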
Example #10
File: auto.py Project: dfdx/cdbn
def rbm_logistic_train_and_predict(train_set_x,train_set_y,test_set_x,test_set_y):
    logistic = linear_model.LogisticRegression(C=6000)
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    classifier.fit(train_set_x,train_set_y)
    PRED = classifier.predict(test_set_x)
    return PRED
Example #12
def brbm_rf(Xtr, ytr, Xte=None, yte=None):
    randomforest = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=100)
    rbm = BernoulliRBM(random_state=0)
    classifier = Pipeline(steps=[('rbm', rbm), ('randomforest', randomforest)])

    rbm.learning_rate = 0.025
    rbm.n_iter = 250
    rbm.n_components = 100

    return simple_classification(classifier, Xtr, ytr, Xte, yte)
def rbm_dbn_train_and_predict(train_set_x,train_set_y,test_set_x,test_set_y):
    dbn = DBN(epochs=200,learn_rates=0.01)
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100
    classifier = Pipeline(steps=[('rbm', rbm), ('dbn', dbn)])
    classifier.fit(train_set_x,train_set_y)
    PRED = classifier.predict(test_set_x)
    return PRED   
def rbm_knn_train_and_predict(train_set_x,train_set_y,test_set_x,test_set_y):
    knn = KNeighborsClassifier(n_neighbors=5)
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100
    classifier = Pipeline(steps=[('rbm', rbm), ('knn', knn)])
    classifier.fit(train_set_x,train_set_y)
    PRED = classifier.predict(test_set_x)
    return PRED
Example #15
def train_model():
    global ocr_map
    count = 1
    a = [
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D',
        'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
        'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
    ]
    for char in a:
        ocr_map[count] = char
        count += 1
    data_frames = []
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    # classifier = Pipeline(steps=[ ('logistic', LinearSVC())])
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    for i in range(1, count):
        l = get_data(i)
        print len(l)
        for data in range(0, 900):
            X_train.append(l[data]['text'])
            Y_train.append(l[data]['label'])
        for data in range(900, len(l)):
            X_test.append(l[data]['text'])
            Y_test.append(l[data]['label'])

    # X_train, Y_train = nudge_dataset(X_train, Y_train)
    # X_test, Y_test = nudge_dataset(X_test, Y_test)
    X_train = (X_train - np.min(X_train, 0)) / (np.max(X_train, 0) + 0.0001)  # 0-1 scaling
    X_test = (X_test - np.min(X_test, 0)) / (np.max(X_test, 0) + 0.0001)  # 0-1 scaling

    print X_train.shape, X_test.shape
    # skf = StratifiedKFold(Y, n_folds=2)
    # joblib.dump(X_train, 'X_train.pkl',compress=3)
    # joblib.dump(Y_train, 'Y_train.pkl',compress=3)
    # joblib.dump(X_test, 'X_test.pkl',compress=3)
    # joblib.dump(Y_test, 'Y_test.pkl',compress=3)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100
    # logistic.C = 6000.0
    classifier.fit(X_train, Y_train)
    f = open("ocr_results.txt", 'w')
    answers = classifier.predict(X_test)
    print confusion_matrix(Y_test, answers)
    score_data = accuracy_score(Y_test, answers)
    print score_data
    f.write(str(score_data))
    f.close()
Example #16
def run_auto():
    X = load_data('gender/male')
    X = X.astype(np.float32) / 256
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 2000
    rbm.fit(X)
    cimgs = [comp.reshape(100, 100) for comp in rbm.components_]
    smartshow(cimgs[:12])
    return rbm
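
smartshow is a project-specific helper; a minimal matplotlib equivalent for eyeballing the first few learned components would look like this (a sketch under that assumption):

import matplotlib.pyplot as plt

fig, axes = plt.subplots(3, 4, figsize=(8, 6))
for ax, img in zip(axes.ravel(), cimgs[:12]):
    ax.imshow(img, cmap=plt.cm.gray_r)  # each component reshaped to 100x100
    ax.set_xticks(())
    ax.set_yticks(())
plt.show()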
Example #17
 def useNeuralNetwork(self):
     #Set up logistic regression unit:
     logistic = linear_model.LogisticRegression()
     #Set up neural net unit; tune its parameters ##TODO: grid search for params
     rbm = BernoulliRBM(random_state=0, verbose=True)
     rbm.learning_rate = 0.06
     rbm.n_iter = 20
     rbm.n_components = 50
     #Make classifier a pipeline
     self.classifier = Pipeline(steps=[('rbm', rbm), ('logistic',
                                                      logistic)])
Example #19
    def getNeuralModel(self,X,Y):

            logistic = linear_model.LogisticRegression()
            rbm = BernoulliRBM(verbose=True)

            classifier = linear_model.LogisticRegression(penalty='l2', tol=.0001)#Pipeline(steps = [('rbm', rbm),('logistic',logistic)])
            rbm.learning_rate = 0.0001
            rbm.n_iter = 1000
            rbm.n_components = 1000

            classifier.fit(X, Y)

            return classifier
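
As written, rbm is configured but never used: the Pipeline is commented out and only the plain LogisticRegression is fitted. A sketch of the pipeline the commented-out code suggests, assuming the Pipeline import used throughout these examples:

classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
classifier.fit(X, Y)  # RBM features feed the logistic regression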
Example #20
def RBM_train(data, target):
    """ Train RBM + SVM """
    train_data, test_data, train_labels, test_labels = train_test_split(
        data, target, test_size=0.33, random_state=42)
    svm_data = svm.SVC(gamma=0.001)
    rbm = BernoulliRBM()
    classifier = Pipeline(steps=[('rbm', rbm), ('svm', svm_data)])
    rbm.learning_rate = 0.06
    rbm.n_iter = 40
    rbm.n_components = 100
    classifier.fit(train_data, train_labels)
    predicted = classifier.predict(test_data)
    get_cost(predicted, test_labels)
Example #21
 def train_with_logistic(self):
     rbm = BernoulliRBM(random_state=0, verbose=False)
     logistic = linear_model.LogisticRegression(C=100)
     classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
     
     rbm.learning_rate = 0.05
     rbm.n_iter = 30
     # More components tend to give better prediction performance, but larger
     # fitting time
     rbm.n_components = 30
     
     classifier.fit(self.X, self.Y)
     self.classifier = classifier
     joblib.dump(classifier,"rbm-logistic.pkl")
Example #22
 def  train_with_svm(self):
     rbm = BernoulliRBM(random_state=0, verbose=False)
     svc = LinearSVC(C=10.0,class_weight='balanced',max_iter=100)
     classifier = Pipeline(steps=[('rbm', rbm), ('svm', svc)])
     
     rbm.learning_rate = 0.05
     rbm.n_iter = 30
     # More components tend to give better prediction performance, but larger
     # fitting time
     rbm.n_components = 100
     
     classifier.fit(self.X, self.Y)
     self.classifier = classifier
     joblib.dump(classifier,"rbm.pkl")
def neural_net():
    digits = datasets.load_digits()
    X = np.asarray(digits.data, 'float32')
    sidelength = int(np.sqrt(X.shape[1]))
    X, Y = nudge_dataset(X, digits.target, dimen=(sidelength, sidelength))
    #Scale the data to be between zero and 1 at all pixels:
    X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) + 0.0001)

    #Split the data set into a training and testing set:
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=0)

    #Models we will use
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    #The classifier
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    ###############################################################################
    # Training

    # Hyper-parameters. These were set by cross-validation,
    # using a GridSearchCV. Here we are not performing cross-validation to
    # save time.
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    # More components tend to give better prediction performance, but larger
    # fitting time
    rbm.n_components = 100
    logistic.C = 6000.0

    # Training RBM-Logistic Pipeline
    classifier.fit(X_train, Y_train)

    # Training Logistic regression
    #logistic_classifier = linear_model.LogisticRegression(C=100.0)
    #logistic_classifier.fit(X_train, Y_train)

    ###############################################################################
    # Evaluation
    print ""
    print("Logistic regression using RBM features:\n%s\n" %
          (metrics.classification_report(Y_test, classifier.predict(X_test))))

    #Predict a few individual cases:
    print classifier.predict(X_test[:5, :]), Y_test[:5]
def neural_net():
    digits = datasets.load_digits()
    X = np.asarray(digits.data, 'float32')
    sidelength = int(np.sqrt(X.shape[1]))
    X,Y = nudge_dataset(X,digits.target,dimen=(sidelength,sidelength))
    #Scale the data to be between zero and 1 at all pixels:
    X = (X - np.min(X,axis=0))/(np.max(X,axis=0)+0.0001)

    #Split the data set into a training and testing set:
    X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)

    #Models we will use
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    #The classifier
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    ###############################################################################
    # Training

    # Hyper-parameters. These were set by cross-validation,
    # using a GridSearchCV. Here we are not performing cross-validation to
    # save time.
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    # More components tend to give better prediction performance, but larger
    # fitting time
    rbm.n_components = 100
    logistic.C = 6000.0

    # Training RBM-Logistic Pipeline
    classifier.fit(X_train, Y_train)

    # Training Logistic regression
    #logistic_classifier = linear_model.LogisticRegression(C=100.0)
    #logistic_classifier.fit(X_train, Y_train)

    ###############################################################################
    # Evaluation
    print ""
    print("Logistic regression using RBM features:\n%s\n" % (
        metrics.classification_report(
            Y_test,
            classifier.predict(X_test))))
    
    #Predict a few individual cases:
    print classifier.predict(X_test[:5,:]),Y_test[:5]
Example #25
    def train(cls) -> str:
        """
        Returns classification results
        """
        X_train, X_test, Y_train, Y_test = RestrictedBoltzmann.load_data()

        logistic = linear_model.LogisticRegression(solver='newton-cg', tol=1)

        rbm = BernoulliRBM(random_state=0, verbose=True)

        rbm_features_classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

        # Hyper-parameters. These were set by cross-validation,
        # using a GridSearchCV. Here we are not performing cross-validation to
        # save time.
        rbm.learning_rate = 0.06
        rbm.n_iter = 10
        # More components tend to give better prediction performance, but larger
        # fitting time
        rbm.n_components = 100
        logistic.C = 6000

        # Training RBM-Logistic Pipeline
        rbm_features_classifier.fit(X_train, Y_train)

        # Training the Logistic regression classifier directly on the pixel
        raw_pixel_classifier = clone(logistic)
        raw_pixel_classifier.C = 100.
        raw_pixel_classifier.fit(X_train, Y_train)

        RestrictedBoltzmann.store_model("rbm_features",
                                        rbm_features_classifier)

        RestrictedBoltzmann.store_model("raw_pixel", raw_pixel_classifier)

        # Evaluation
        Y_pred = rbm_features_classifier.predict(X_test)
        report1 = "Logistic regression using RBM features:\n%s\n" % (
            metrics.classification_report(Y_test, Y_pred))

        Y_pred = raw_pixel_classifier.predict(X_test)
        report2 = "Logistic regression using raw pixel features:\n%s\n" % (
            metrics.classification_report(Y_test, Y_pred))

        return f"{report1} \n\n {report2}"
Example #26
def estimate_n_components():
    X = load_data('gender/male')
    X = X.astype(np.float32) / 256
    n_comp_list = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]
    scores = []
    for n_comps in n_comp_list:
        rbm = BernoulliRBM(random_state=0, verbose=True)
        rbm.learning_rate = 0.06
        rbm.n_iter = 50
        rbm.n_components = n_comps
        rbm.fit(X)
        score = rbm.score_samples(X).mean()
        scores.append(score)
    plt.figure()
    plt.plot(n_comp_list, scores)
    plt.show()
    return n_comp_list, scores
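
rbm.score_samples returns a pseudo-likelihood per sample (higher is better), so the plot above serves as a rough model-selection curve. A usage sketch, assuming numpy is imported as np:

n_comps, scores = estimate_n_components()
best = n_comps[int(np.argmax(scores))]  # component count with the highest mean pseudo-likelihood
print(best)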
Example #28
def SGD():
    SGD = linear_model.SGDClassifier(loss='hinge',
                                     penalty='l2',
                                     random_state=42,
                                     n_jobs=-1,
                                     epsilon=0.001)
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('SGD', SGD)])
    # RBM parameters obtained after cross-validation
    rbm.learning_rate = 0.01
    rbm.n_iter = 15
    rbm.n_components = 50
    SGD.alpha = 0.0001
    SGD.C = 1
    # Training SGD
    SGD_classifier = linear_model.SGDClassifier(loss='hinge',
                                                penalty='l2',
                                                random_state=42,
                                                n_jobs=-1,
                                                alpha=0.0001,
                                                epsilon=0.001)
    SGD_classifier.fit(data_train, target_train)
    # Training RBM-SGD Pipeline
    classifier.fit(data_train, target_train)
    print("printing_results")

    print("SGD using RBM features:\n%s\n" % (metrics.classification_report(
        target_test, classifier.predict(data_test))))
    cm = confusion_matrix(target_test, classifier.predict(data_test))
    plt.matshow(cm)
    plt.title('Confusion Matrix SVM with SGD with RBM Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix1.jpg')
    print("SGD using raw pixel features:\n%s\n" %
          (metrics.classification_report(target_test,
                                         SGD_classifier.predict(data_test))))
    cm1 = confusion_matrix(target_test, SGD_classifier.predict(data_test))
    plt.matshow(cm1)
    plt.title('Confusion Matrix SVM with SGD Raw Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix2.jpg')
Example #29
def Train():
    """
    Train Function
    """
    if os.path.exists('X_train.pkl') == False:
        print("generate data and split to train test set.")
        with open('X.pkl') as Xf:
            X = cPickle.load(Xf)
        with open('Y.pkl') as Yf:
            Y = cPickle.load(Yf)
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

    print("load data from pickled files..")
    with open('X_train.pkl') as x_train_f:
        X_train = cPickle.load(x_train_f)
    with open('X_test.pkl')  as x_test_f:
        X_test  = cPickle.load(x_test_f)
    with open('Y_train.pkl') as y_train_f:
        Y_train = cPickle.load(y_train_f)
    with open('Y_test.pkl')  as y_test_f:
        Y_test  = cPickle.load(y_test_f)
    print("Load Data success!")

    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 300
    rbm.n_components = 1000
    logistic.C = 6000.0
    clf = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    clf.fit(X_train,Y_train)
    #logistic_classifier = linear_model.LogisticRegression(C=100.0)
    #logistic_classifier.fit(X_train, Y_train)
    #print("Logistic regression using raw pixel features:\n%s\n" % (
    #metrics.classification_report(
    #    Y_test,
    #    logistic_classifier.predict(X_test))))
    print("fit complete..")
    print("Logistic regression using RBM features:\n%s\n" % (
    metrics.classification_report(
        Y_test,
        clf.predict(X_test))))
    with open('clf.pkl','a+') as clf_f:
        cPickle.dump(clf,clf_f)
Example #30
def RBM():
    filename = "../data/smaller.dta"
    raw_data = open(filename, 'rt')
    data = np.loadtxt(raw_data, delimiter=" ")
    X = data[:, :3]
    Y = data[:, 3]
    print(X)
    print(Y)
    print("training on RBM")
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100
    rbm.fit(X, Y)
    predictions = rbm.transform(X)
    params = rbm.get_params()
    print("predictions = ", predictions)
    print("rbm = ", rbm)
    print("params = ", params)
Example #31
def ParaTun2(X_dev, Y_dev):
    rbm = BernoulliRBM(random_state=0, verbose=True)
    steps = [('rbm', rbm), ('classifier', OneVsRestClassifier(LinearSVC()))]
    rbm.learning_rate = 0.005
    rbm.n_iter = 200
    rbm.n_components = 100
    #rbm.batch_size = 10
    pipeline = Pipeline(steps)
    params = {'classifier__estimator__C': [10]}
    #scorer = make_scorer(roc_auc_score, average='macro', needs_proba=True)
    predictor = GridSearchCV(pipeline, params, cv=2, n_jobs=1)
    #predictor = GridSearchCV(pipeline, params, n_jobs=1)

    print '2'
    result = predictor.fit(X_dev, Y_dev)
    print result.best_score_
    #print result.cv_results_
    print result.best_params_
    return predictor
    def _get_classification_pipeline(self):
        """Builds and returns the classification Pipeline for this classifier

        :return: A Pipeline with the required classification steps
        """
        rbm = BernoulliRBM()
        rbm.n_components = 100
        rbm.learning_rate = 0.01
        rbm.n_iter = 10

        logistic_regression = linear_model.LogisticRegression()
        logistic_regression.C = 10000

        classification_steps = [
            ("rbm", rbm),
            ("logistic", logistic_regression)
        ]

        return Pipeline(steps=classification_steps)
Example #33
def main():
    X, Y = load_csv_file('train.csv')
    estimators = 1000
    test_size = 0.05
    X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=test_size, random_state=0)
    X_train_real, X_test_real, Y_train_real, Y_test_real = train_test_split(X_train, Y_train, test_size=test_size, random_state=42)
    log.info('Loaded training file')
    X_test, _ = load_csv_file('test.csv', cut_end=False)
    log.info('Loaded test file')

    #Classifier Setup
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    tree_clf = ExtraTreesClassifier(n_estimators=estimators, n_jobs=-1,
                                    random_state=0, max_depth=None)

    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 500
    logistic.C = 6000.0

    pipeline = make_pipeline(tree_clf, rbm, logistic)
    #clf = GridSearchCV(pipeline, param_grid, n_jobs=-1, verbose=1)
    clf = pipeline
    log.info('Fitting Boltzmann with %s' % str([name for name, _ in pipeline.steps]))
    clf.fit(X_train_real, Y_train_real)
    clf_probs = clf.predict_proba(X_test_real)
    score = log_loss(Y_test_real, clf_probs)
    log.info('Log Loss score un-trained = %f' % score)

    # Calibrate Classifier using ground truth in X,Y_valid
    sig_clf = CalibratedClassifierCV(clf, method="isotonic", cv="prefit")
    log.info('Fitting CalibratedClassifierCV')
    sig_clf.fit(X_valid, Y_valid)
    sig_clf_probs = sig_clf.predict_proba(X_test_real)
    sig_score = log_loss(Y_test_real, sig_clf_probs)
    log.info('Log loss score trained = %f' % sig_score)

    # Ok lets predict the test data with our funky new classifier
    sig_submission_probs = sig_clf.predict_proba(X_test)

    write_out_submission(sig_submission_probs, 'submission.csv')
def LogRegWithRBMFeatures(x_train, y_train, x_cv, y_cv):
	"""
	Logistic regression using RBM features
	http://scikit-learn.org/stable/auto_examples/plot_rbm_logistic_classification.html
	"""
	logistic = linear_model.LogisticRegression()
	#rbm = BernoulliRBM(random_state=0, verbose=True)
	rbm = BernoulliRBM()
	classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
	rbm.learning_rate = 0.06
	rbm.n_iter = 20
	rbm.n_components = 5000
	#logistic.C = 6000.0
	
	classifier.fit(x_train, y_train)
	
	#classifier = BernoulliRBM(n_components = 10)
	#classifier.fit(x_train, y_train)
	
	return classifier
Example #35
def RBM(X_train, X_test, y_train, y_test):
    #logistic = LogisticRegression(solver='newton-cg', tol=1)
    nn = MLPClassifier(solver='adam',
                       alpha=1e-5,
                       hidden_layer_sizes=(50, 25, 1),
                       random_state=1)
    rbm = BernoulliRBM(random_state=0, verbose=True)

    rbm_features_classifier = Pipeline(
        #steps=[('rbm', rbm), ('logistic', logistic)])
        steps=[('rbm', rbm), ('nn', nn)])
    rbm.learning_rate = 0.06
    rbm.n_iter = 10

    rbm.n_components = 100
    #logistic.C = 6000

    rbm_features_classifier.fit(X_train, y_train)
    prediction = rbm_features_classifier.predict(X_test)
    print(100 * accuracy_score(y_test, prediction))
    print(confusion_matrix(y_test, prediction))
Example #36
def train_new(path):

    thumbnail = get_thumbnail(Image.open('images/{0}'.format(path)))

    vectors = []
    for pixel_tuple in thumbnail.getdata():
        vec = []
        for val in pixel_tuple:
            vec.append(float(val))
        vectors.append(vec)

    X = np.asarray(vectors, 'float32')
    Y = np.array(X.shape)
    X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)

    rbm = BernoulliRBM(random_state=1, verbose=True)
    rbm.learning_rate = 0.09
    rbm.n_iter = 1
    rbm.n_components = 16
    rbm.batch_size = 2

    return rbm.fit(X).components_
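
The returned components_ array has shape (n_components, n_features), so with n_components = 16 each row is one learned filter over the per-pixel channel vectors. A quick shape check, with a hypothetical image filename:

weights = train_new('example.jpg')  # hypothetical file under images/
print(weights.shape)                # (16, n_features); n_features is the channel count per pixel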
Example #37
def train(image_matrix, images):

    X = np.asarray(image_matrix, 'float32')
    Y = np.array(X.shape)
    X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)

    rbm = BernoulliRBM(random_state=1, verbose=True)
    rbm.learning_rate = 0.09
    rbm.n_iter = 1
    rbm.n_components = 16
    rbm.batch_size = 2

    y_new = np.zeros(X.shape)
    for i in range(len(X)):
        x_new = rbm.fit(X[i])
        y_new[i] = x_new.components_

    global model
    model = {
        'matrix': y_new,
        'images': images
    }
Example #38
def Logistic():
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    # RBM parameters obtained after cross-validation
    rbm.learning_rate = 0.01
    rbm.n_iter = 121
    rbm.n_components = 700
    logistic.C = 1.0
    # Training RBM-Logistic Pipeline
    classifier.fit(data_train, target_train)
    # Training Logistic regression
    logistic_classifier = linear_model.LogisticRegression(C=1.0)
    logistic_classifier.fit(data_train, target_train)
    print("printing_results")
    print("Logistic regression using RBM features:\n%s\n" %
          (metrics.classification_report(target_test,
                                         classifier.predict(data_test))))
    cm3 = confusion_matrix(target_test, classifier.predict(data_test))
    plt.matshow(cm3)
    plt.title('Confusion Matrix Logistic Regression with RBM Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix3.jpg')
    print("Logistic regression using raw pixel features:\n%s\n" %
          (metrics.classification_report(
              target_test, logistic_classifier.predict(data_test))))
    cm4 = confusion_matrix(target_test, logistic_classifier.predict(data_test))
    plt.matshow(cm4)
    plt.title('Confusion Matrix Logistic Regression')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix4.jpg')


#Logistic()
Example #39
def rbm(X,Y):
	# Models we will use
	logistic = linear_model.LogisticRegression()
	rbm = BernoulliRBM(random_state=0, verbose=True)
	X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.2,random_state=0)
	classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
	###############################################################################
	# Training

	# Hyper-parameters. These were set by cross-validation,
	# using a GridSearchCV. Here we are not performing cross-validation to
	# save time.
	rbm.learning_rate = 0.06
	rbm.n_iter = 1000
	# More components tend to give better prediction performance, but larger
	# fitting time
	rbm.n_components = 100
	logistic.C = 6000.0

	# Training RBM-Logistic Pipeline
	classifier.fit(X_train, Y_train)

	# Training Logistic regression
	logistic_classifier = linear_model.LogisticRegression(C=100.0)
	logistic_classifier.fit(X_train, Y_train)
	# Evaluation

	print()
	print("Logistic regression using RBM features:\n%s\n" % (
	    metrics.classification_report(
	        Y_test,
	        classifier.predict(X_test))))

	print("Logistic regression using raw pixel features:\n%s\n" % (
	    metrics.classification_report(
	        Y_test,
	        logistic_classifier.predict(X_test))))
Example #40
    def train_deep_boltzman(self):
        rbm1 = BernoulliRBM(random_state=0, verbose=False)
        logistic = linear_model.LogisticRegression(class_weight='balanced')

        classifier = Pipeline(steps=[('rbm', rbm1), ('logistic', logistic)])
        # More components tend to give better prediction performance, but larger
        # fitting time
        
        params = {
            "rbm__learning_rate": [0.1, 0.03, 0.01],
            "rbm__n_iter": [20, 40, 80],
            "rbm__n_components": [50, 75, 100],
            "logistic__C": [1.0, 10.0, 100.0]}
#         gs = grid_search.GridSearchCV(classifier,params)
#         gs.fit(self.X, self.Y)
        print "grid search done, training pipelined classifier"
        rbm1.n_components = 100
        rbm1.n_iter = 40
        rbm1.learning_rate = 0.01
        logistic.C = 10.0
        classifier.fit(self.X, self.Y)
        self.classifier = classifier
        "classification"
        joblib.dump(classifier,"two-layerRbm-logistic.pkl")
Example #41
def get_price_signal(stock, data):
    # Use a Logistic Regression classifier with BernoulliRBM neural network features.
    # Return 1 for a buy signal on the stock's return, 0 for a sell signal.
    price = data.history(stock, 'price', bar_count=50, frequency='1d')
    price = price.fillna(method='ffill')

    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=False)

    rbm.learning_rate = 0.017
    rbm.n_iter = 30

    rbm.n_components = 150
    logistic.C = 6000.0
    classifier = skp.Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    # Make a list of 1's and 0's, 1 when the price increased from the prior bar
    returns = np.diff(price)
    changes = (np.diff(price) > 0).astype(int)

    lag = 1
    X = (returns)[:-lag].astype(float)  # Add the prior changes
    X_data = X.reshape((len(X), 1))
    Y = changes[lag:]  # Add dependent variable, the final change
    Y_data = Y.reshape((len(Y), 1))

    if len(Y) >= 30:  # There needs to be enough data points to make a good model
        try:
            classifier.fit(X_data, Y_data)  # Generate the model
            prediction = classifier.predict(returns[-lag:].reshape(1, -1))  # Predict (a 2D array is expected)
        except:
            return None
        return prediction[-1]
def classifier(train_num, use_profile=False):

	X,Y = feature_extractor(train_num, use_profile)

	logistic_classifier = linear_model.LogisticRegression(C=100.0, penalty='l1')
	X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
	                                                    test_size=0.2,
	                                                    random_state=0)
	logistic_classifier.fit(X_train, Y_train)



	logistic = linear_model.LogisticRegression()
	rbm = BernoulliRBM(random_state=0, verbose=True)

	classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

	rbm.learning_rate = 0.06
	rbm.n_iter = 20
	# More components tend to give better prediction performance, but larger
	# fitting time
	rbm.n_components = 100
	logistic.C = 6000.0

	# Training RBM-Logistic Pipeline
	# classifier.fit(X_train, Y_train)

	# print("Logistic regression using RBM features:\n%s\n" % (
	#     metrics.classification_report(
	#         Y_test,
	#         classifier.predict(X_test))))

	# param_grid = {'penalty':['l1','l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
	# GridSearch = GridSearchCV(logistic_classifier, param_grid, cv = 10)
	# GridSearch.fit(X_train, Y_train)
	# bestLRclf = GridSearch.best_estimator_

	bestLRclf = logistic_classifier

	print("Logistic regression using raw features:\n%s\n" % (
	    metrics.classification_report(
	        Y_test,
	        bestLRclf.predict(X_test))))

	print bestLRclf.coef_


	# print "logistic_classifier RBM accuracy", metrics.accuracy_score(Y_test, classifier.predict(X_test))
	print "logistic_classifier accuracy", metrics.accuracy_score(Y_test, bestLRclf.predict(X_test))
	print "logistic_regression mean_squared_error", metrics.mean_squared_error(Y_test, bestLRclf.predict(X_test))

	logProb = bestLRclf.predict_log_proba(X_test)
	second_col = logProb[:,1]
	sorted_index = np.argsort(second_col)

	correct_count = 0
	for i in range(1, 427):
		index = sorted_index[-i]
		if Y_test[index] == 1:
			correct_count += 1
	correct_percentage = correct_count / 426.0
	print "correct_percentage", correct_percentage

	return metrics.accuracy_score(Y_test, bestLRclf.predict(X_test))
Example #43
#pre-train networks using Restricted Boltzmann Machine
#first layer
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
bigMatrix = min_max_scaler.fit_transform(bigMatrix)

vectorOfOnes =  np.tile(1.0, (bigMatrix.shape[0], 1))
bigMatrix = np.hstack((vectorOfOnes, bigMatrix))

#Divide train Matrix and Test Matrix (for which I don't have labels)
trainMatrixReduced = bigMatrix[someOtherNumbers, :]
testMatrixReduced = bigMatrix[testIndexes, :]

RBM1 = BernoulliRBM(verbose = True)
RBM1.learning_rate = 0.04
RBM1.n_iter = 20
RBM1.n_components = 700
RBM1.fit(bigMatrix)

ThetaHiddenOne = RBM1.components_.T

bigMatrix = sigmoid(np.dot(bigMatrix, ThetaHiddenOne))

vectorOfOnes =  np.tile(1.0, (bigMatrix.shape[0], 1))
bigMatrix = np.hstack((vectorOfOnes, bigMatrix))

#second layer
RBM2 = BernoulliRBM(verbose = True)
RBM2.learning_rate = 0.03
RBM2.n_iter = 20
RBM2.n_components = 500
RBM2.fit(bigMatrix)
def RBMtest01():
	#Use an RBM for non-linear feature extraction
	#Compared with plain logistic regression, RBM features can improve classification accuracy

	import numpy as np
	import matplotlib.pyplot as plt

	from scipy.ndimage import convolve
	from sklearn import linear_model, datasets, metrics
	from sklearn.cross_validation import train_test_split
	from sklearn.neural_network import BernoulliRBM
	from sklearn.pipeline import Pipeline

	def nudge_dataset(X, Y):
		direction_vectors = [
			[[0, 1, 0],
			 [0, 0, 0],
			 [0, 0, 0]],

			[[0, 0, 0],
			 [1, 0, 0],
			 [0, 0, 0]],

			[[0, 0, 0],
			 [0, 0, 1],
			 [0, 0, 0]],

			[[0, 0, 0],
			 [0, 0, 0],
			 [0, 1, 0]]
		]

		shift = lambda x, w: convolve(x.reshape((8, 8)), mode = 'constant', weights = w).ravel()

		X = np.concatenate([X] + [np.apply_along_axis(shift, 1, X, vector) for vector in direction_vectors])
		Y = np.concatenate([Y for _ in range(5)], axis = 0)

		return X, Y

	digits = datasets.load_digits()
	X = np.asarray(digits.data, 'float32')  #just a dtype conversion, list to array

	X, Y = nudge_dataset(X, digits.target)  #regenerates X, Y at 5x the original size

	#print np.max(X, 0)
	#print np.min(X, 0)
	X = (X - np.min(X, 0)) / (np.max(X, 0) - np.min(X, 0) + 0.0001) # 0-1 scaling, normalizing each dimension separately

	X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)


	print set(Y_train)
	#'''
	#Build the models
	logistic = linear_model.LogisticRegression()
	rbm = BernoulliRBM(random_state = 0, verbose = True)

	#The pipeline here is essentially a chained fit/transform process,
	#and the rbm model's transform output is a latent representation of the data.

	classifier = Pipeline(steps = [('rbm', rbm), ('logistic', logistic)])

	#Training
	#These parameters were selected by cross-validation -- GridSearchCV
	rbm.learning_rate = 0.06
	rbm.n_iter = 20
	rbm.n_components = 100  #train 100 features with the RBM
	logistic.C = 6000


	#rbm.fit(X_train, Y_train)
	rbm.fit(X_train)


	#Judging by the data dimensions, the RBM is first an unsupervised training process:
	#it learns N representative vectors from X_train, then projects the original X_train
	#onto those N vectors to obtain a new N-dimensional feature representation, similar to PCA

	predicted_Y = rbm.transform(X_train)

	print rbm.components_  #rbm.components_ is a 100 * 64 matrix
	print len(rbm.components_)
	print len(rbm.components_[0])

	print predicted_Y
	print len(predicted_Y)
	print len(predicted_Y[0])
	print len(X_train)
	print len(X_train[0])


	# Training RBM-Logistic Pipeline
	#The input here is still X_train with each dimension normalized,
	#and the corresponding Y_train labels are still the digits 0-9
	print "Start Training RBM-Logistic Pipeline"
	classifier.fit(X_train, Y_train)





	# Training Logistic regression,
	logistic_classifier = linear_model.LogisticRegression(C = 100.0)
	logistic_classifier.fit(X_train, Y_train)

	#Evaluation

	print "Logistic regression using RBM features: \n%s\n" %(metrics.classification_report(Y_test, classifier.predict(X_test)))
	print "Logistic regression using raw features: \n%s\n" %(metrics.classification_report(Y_test, logistic_classifier.predict(X_test)))


	#Plotting

	plt.figure(figsize = (4.2, 4))

	for i, comp in enumerate(rbm.components_):
		plt.subplot(10, 10, i + 1)
		#These are still 100 vectors of 64 dimensions; reshape each one to 8x8 for display
		plt.imshow(comp.reshape(8,8), cmap=plt.cm.gray_r)
		plt.xticks(())
		plt.yticks(())

	plt.suptitle('100 components extracted by RBM', fontsize = 16)
	plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.23)

	plt.show()
Example #45
# Loading the digits
X, Y = utils.load_data()
print(X.shape)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2,
                                                    random_state=0)

# Models we will use
rbm_layer_1 = BernoulliRBM(random_state=0, verbose=True)
rbm_layer_2 = BernoulliRBM(random_state=0, verbose=True)
logistic = linear_model.LogisticRegression() # for comparison with RBM + logistic regression
###############################################################################
# Training the first RBM
rbm_layer_1.learning_rate = 0.01
rbm_layer_1.n_iter = 50
rbm_layer_1.n_components = 300
# Training RBM
print("Debut training RBM1")
print(X_train.shape)
t0 = time.clock()
rbm_layer_1.fit(X_train)
print(time.clock() - t0)
# build a training set by sampling the hidden
# variables of the first RBM
n_sample_second_layer_training = 3*int(X.shape[0])
H1_train = np.zeros(shape=(n_sample_second_layer_training, rbm_layer_1.n_components))
comp = 0
while (comp < n_sample_second_layer_training):
	rng = check_random_state(rbm_layer_1.random_state)
	randTemp = rd.randint(0, X.shape[0] - 1)
	H1_train[comp] = rbm_layer_1._sample_hiddens(X[randTemp], rng)
	comp += 1  # advance the counter, otherwise the loop never terminates
# clf = joblib.load(learning_model_path)
# print("Now Loading...")

start_time = time.clock()
print("Now Learning...")
###############################################################################
# Training

# Hyper-parameters. These were set by cross-validation,
# using a GridSearchCV. Here we are not performing cross-validation to
# save time.
rbm.learning_rate = 0.06
rbm.n_iter = 20
# More components tend to give better prediction performance, but larger
# fitting time
rbm.n_components = 100
logistic.C = 6000.0

# Training RBM-Logistic Pipeline
classifier.fit(train_data, train_label)

# Training Logistic regression
logistic_classifier = linear_model.LogisticRegression(C=100.0)
logistic_classifier.fit(train_data, train_label)

end_time = time.clock()
print("Learning Complete \nTime =", end_time - start_time)
# Time = 7276.782202

# Saving data
joblib.dump(rbm, learning_model_path)
data_path = "C:\\Users\\seyit\\Desktop\\NLPLab\\data\\"
features_file = "combined_data\\RBMsentence_training.txt"
labels_file = "polarity_sentences_kaggle\\training.txt"

with open(data_path + labels_file, 'r', encoding="utf8") as f:
    sentences = [x for x in f.readlines()]
    labels = [x[0] for x in sentences]

labels = np.array(labels)
labels = labels.astype(float)
print(labels)
features = np.loadtxt(data_path + features_file, dtype=float)
rbm_data = np.c_[features, labels].astype(float)

RBM = BernoulliRBM(random_state=0, verbose=True)
RBM.n_components = 20
RBM.learning_rate = 0.05
RBM.n_iter = 20

MLP = MLPClassifier(activation='relu',
                    alpha=1e-05,
                    batch_size=10,
                    beta_1=0.9,
                    beta_2=0.999,
                    early_stopping=False,
                    epsilon=1e-08,
                    hidden_layer_sizes=(100, 50),
                    learning_rate='adaptive',
                    learning_rate_init=0.01,
                    max_iter=200,
                    momentum=0.01,
Example #48
print "TRAINING..."

logistic = linear_model.LogisticRegression()
rbm = BernoulliRBM(random_state=0, verbose=True)

classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

# Hyper-parameters. These were set by cross-validation,
# using a GridSearchCV. Here we are not performing cross-validation to
# save time.
rbm.learning_rate = 0.06
rbm.n_iter = 10
# More components tend to give better prediction performance, but larger
# fitting time
rbm.n_components = 50
logistic.C = 6000.0

# Training RBM-Logistic Pipeline
classifier.fit(X_train, Y_train)

# Training Logistic regression
logistic_classifier = linear_model.LogisticRegression(C=100.0)
logistic_classifier.fit(X_train, Y_train)

####################### Testing ############################

print()
print("Logistic regression using RBM features:\n%s\n" %
      (metrics.classification_report(Y_test, classifier.predict(X_test))))
def makePrediction(training_date, days):
    #training_date = datetime.datetime(2014,1,22)
    title = 'scores/accuracy_' + str(days) + '.txt'
    #print title
    text_file = open(title, "w")  # text_file is written to below, so it must be opened here

    snpret = create_lagged_series("NDAQ",
                                  training_date,
                                  datetime.datetime(2015, 5, 26),
                                  lags=5)
    #print snpret
    # Use the prior five days of returns as predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5"]]
    y = snpret["Direction"]

    # The data is split into two parts: before and after 17th Mar 2015.
    start_test = datetime.datetime(2015, 3, 17)
    # Create training and test sets
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]

    # Create prediction DataFrame
    pred = pd.DataFrame(index=y_test.index)
    #print pred
    pred["Actual"] = y_test
    # Create and fit the five models
    print "Hit Rates:"
    models = [("Linear", linear_model.LinearRegression()),
              ("LR", LogisticRegression()),
              ("KNN", neighbors.KNeighborsClassifier(n_neighbors=3)),
              ("SVM", SVC(C=10)),
              ("RF", RandomForestClassifier(n_estimators=4))]
    for m in models:
        fit_model(m[0], m[1], X_train, y_train, X_test, pred)

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    rbm.learning_rate = .06
    rbm.n_iter = 15
    rbm.n_components = 100
    logistic.C = 6000
    classifier.fit(X_train, y_train)
    logistic_classifier = LogisticRegression(C=100.0)
    logistic_classifier.fit(X_train, y_train)
    score = classifier.score(X_train, y_train)
    print score
    text_file.write('Neural Network : ' + str(score) + '\n')

    # 100 Days
    text_file.write('100 Days Prediction Accuracies\n')
    snpret = create_lagged_series("NDAQ",
                                  training_date,
                                  datetime.datetime(2015, 8, 6),
                                  lags=5)
    #print snpret
    # Use the prior five days of returns as predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5"]]
    y = snpret["Direction"]

    # The data is split into two parts: before and after 17th Mar 2015.
    start_test = datetime.datetime(2015, 3, 17)
    # Create training and test sets
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]

    # Create prediction DataFrame
    pred = pd.DataFrame(index=y_test.index)
    #print pred
    pred["Actual"] = y_test
    # Create and fit the five models
    print "Hit Rates:"
    models = [("Linear", linear_model.LinearRegression()),
              ("LR", LogisticRegression()),
              ("KNN", neighbors.KNeighborsClassifier(n_neighbors=3)),
              ("SVM", SVC(C=10)),
              ("RF", RandomForestClassifier(n_estimators=4))]
    for m in models:
        fit_model(m[0], m[1], X_train, y_train, X_test, pred)

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    rbm.learning_rate = .06
    rbm.n_iter = 15
    rbm.n_components = 100
    logistic.C = 6000
    classifier.fit(X_train, y_train)
    logistic_classifier = LogisticRegression(C=100.0)
    logistic_classifier.fit(X_train, y_train)
    score = classifier.score(X_train, y_train)
    print score
    text_file.write('Neural Network : ' + str(score) + '\n')

    # 200 Days
    text_file.write('200 Days Prediction Accuracies\n')
    snpret = create_lagged_series("NDAQ",
                                  training_date,
                                  datetime.datetime(2015, 12, 31),
                                  lags=5)
    #print snpret
    # Use the prior five days of returns as predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5"]]
    y = snpret["Direction"]

    # The data is split into two parts: before and after 17th Mar 2015.
    start_test = datetime.datetime(2015, 3, 17)
    # Create training and test sets
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]

    # Create prediction DataFrame
    pred = pd.DataFrame(index=y_test.index)
    #print pred
    pred["Actual"] = y_test
    # Create and fit the five models
    print "Hit Rates:"
    models = [("Linear", linear_model.LinearRegression()),
              ("LR", LogisticRegression()),
              ("KNN", neighbors.KNeighborsClassifier(n_neighbors=3)),
              ("SVM", SVC(C=10)),
              ("RF", RandomForestClassifier(n_estimators=4))]
    for m in models:
        fit_model(m[0], m[1], X_train, y_train, X_test, pred)

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    rbm.learning_rate = .06
    rbm.n_iter = 15
    rbm.n_components = 100
    logistic.C = 6000
    classifier.fit(X_train, y_train)
    logistic_classifier = LogisticRegression(C=100.0)
    logistic_classifier.fit(X_train, y_train)
    score = classifier.score(X_train, y_train)
    print score
    text_file.write('Neural Network : ' + str(score))
    text_file.close()
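# A hedged sketch of the fit_model helper used above: it is defined elsewhere
# in the original script, so the signature and body below are assumptions
# reconstructed from how it is called, not the original implementation.
def fit_model(name, model, X_train, y_train, X_test, pred):
    # Fit the model, store its test-set predictions in pred[name], and print
    # the hit rate (the fraction of correctly predicted directions)
    model.fit(X_train, y_train)
    pred[name] = model.predict(X_test)
    # LinearRegression returns floats, so compare signs rather than labels
    hit_rate = np.mean(np.sign(pred[name]) == np.sign(pred["Actual"]))
    print("%s: %.3f" % (name, hit_rate))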
# model number 2
bigMatrixTrain = (bigMatrixTrain - np.min(bigMatrixTrain, 0)) / (np.max(bigMatrixTrain, 0) + 0.0001)  # 0-1 scaling
#Divide dataset for cross validation purposes
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    bigMatrixTrain, y, test_size=0.4, random_state=0)  # fix this
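# A hedged aside: sklearn.cross_validation was removed in scikit-learn 0.20,
# so on a current install the equivalent split (same parameters assumed)
# would be:
#
#   from sklearn.model_selection import train_test_split
#   X_train, X_test, y_train, y_test = train_test_split(
#       bigMatrixTrain, y, test_size=0.4, random_state=0)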

# Models we will use
logistic = linear_model.LogisticRegression()
rbm = BernoulliRBM(random_state=0, verbose=True)

classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
rbm.learning_rate = 0.06
rbm.n_iter = 20
# More components tend to give better prediction performance, but larger fitting time
rbm.n_components = 300
logistic.C = 6000.0

# Training RBM-Logistic Pipeline
classifier.fit(X_train, y_train)

print()
print("Logistic regression using RBM features:\n%s\n" % (
    metrics.classification_report(y_test, classifier.predict(X_test))))
print("Logistic regression using RBM features:\n%s\n" % (
    confusion_matrix(y_test, classifier.predict(X_test))))


# model number 3
#Divide dataset for cross validation purposes
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    bigMatrixTrain, y, test_size=0.4, random_state=0)  # arguments assumed to mirror model 2; the original is truncated here
Beispiel #51
0
    # models
    rbm = BernoulliRBM(random_state=0, verbose=True)

    multilabel = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                             random_state=0))

    classifier = Pipeline(steps=[('rbm', rbm), ('multilabel', multilabel)])
       
    ###############################################################################
    # Training
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    # rbm components
    rbm.n_components = 20
      
    # Training Pipeline
    classifier.fit(X_train, Y_train)

    multilabel_classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                                        random_state=0))

    multilabel_classifier.fit(X_train, Y_train)
    ###############################################################################
    # Evaluation

    print()
    print("classification using RBM features:\n%s\n" % (
        metrics.classification_report(
Beispiel #52
0
    # Grid-searching the RBM-Logistic Pipeline
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in classifier.steps])
    print("parameters:")
    pprint(parameters)
    print(gridSearch.fit(X_train, Y_train))
    print("Best score: %0.3f" % gridSearch.best_score_)
    print("Best parameters set:")
    best_parameters = gridSearch.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))


rbm_layer_1.learning_rate = 0.048888888888888891
rbm_layer_1.n_iter = 25
rbm_layer_1.n_components = 100

print("Debut training RBM1")
print(X_train.shape)
t0 = time.perf_counter()  # time.clock() was removed in Python 3.8
rbm_layer_1.fit(X_train)
print(time.perf_counter() - t0)

# build a training set by sampling the hidden
# variables of the first RBM
n_sample_second_layer_training = int(X.shape[0])
H1_train = np.zeros(shape=(n_sample_second_layer_training, rbm_layer_1.n_components))
H1_label_train = np.zeros(shape=(n_sample_second_layer_training, 1))
comp = 0
while comp < n_sample_second_layer_training:
    rng = check_random_state(rbm_layer_1.random_state)
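    # The loop body is truncated in this excerpt. A hedged sketch of how the
    # hidden samples could be drawn (an assumption, not the original code):
    # transform() returns P(h=1|v), which we binarise with the rng. Note that
    # re-seeding rng inside the loop, as above, would draw the same sample on
    # every pass; the check_random_state call more plausibly belongs before
    # the loop.
    #
    #   i = rng.randint(X_train.shape[0])
    #   h_probs = rbm_layer_1.transform(X_train[i:i + 1])
    #   H1_train[comp] = (rng.uniform(size=h_probs.shape) < h_probs).astype(np.float64)
    #   H1_label_train[comp] = Y_train[i]
    #   comp += 1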
n_samples = len(line_images)
data = line_images.reshape((n_samples, -1))
is_rmb = False

# Create a classifier

# classifier = svm.SVC()
# classifier = neural_network.MLPClassifier()
# classifier = RandomForestClassifier()

logistic = linear_model.LogisticRegression()
rbm = BernoulliRBM(random_state=0, verbose=True)
classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
rbm.learning_rate = 0.00000001  # so small the RBM weights barely move from their initialisation
rbm.n_iter = 30
rbm.n_components = 50
logistic.C = 6000.0  # inverse regularization strength: smaller C means stronger regularization
is_rmb = True

# We learn the lines on the first half of the data
classifier.fit(data[:n_samples // 2], line_labels[:n_samples // 2])

# Now predict the line labels on the second half:
expected = line_labels[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

plt.figure()
rbm = BernoulliRBM(random_state=0, verbose=True)

classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

###############################################################################
# Training

# Hyper-parameters. These were set by cross-validation,
# using a GridSearchCV. Here we are not performing cross-validation to
# save time.
rbm.learning_rate = 0.06
rbm.n_iter = 20
# More components tend to give better prediction performance, but larger
# fitting time
# n_components = number of binary hidden units
rbm.n_components = input_dim
logistic.C = 6000.0

# Training RBM-Logistic Pipeline
classifier.fit(X_train, Y_train)

# Training Logistic regression
logistic_classifier = linear_model.LogisticRegression(C=100.0)
logistic_classifier.fit(X_train, Y_train)

###############################################################################
# Evaluation

print()
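# The evaluation prints are cut off in this excerpt; the usual pattern in
# these examples (a sketch, assuming X_test and Y_test come from the same
# split as X_train) would be:
#
#   print("Logistic regression using RBM features:\n%s\n"
#         % metrics.classification_report(Y_test, classifier.predict(X_test)))
#   print("Logistic regression using raw pixel features:\n%s\n"
#         % metrics.classification_report(Y_test, logistic_classifier.predict(X_test)))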

# Evaluation part
from sklearn import datasets, linear_model
import numpy as np
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline

# evaluation metrics
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split  # sklearn.model_selection on newer releases
import matplotlib.pyplot as plt

mnist = datasets.fetch_mldata("MNIST Original")  # fetch_openml("mnist_784") on newer scikit-learn
X = np.asarray(mnist.data, 'float32') / 255.
Y = mnist.target.astype(np.int64)

Xtr, Xts, Ytr, Yts = train_test_split(X, Y, test_size=0.3, random_state=0)



logistic = linear_model.LogisticRegression()
logistic.C = 50.  # TODO: tune
logistic.penalty = 'l2'

rbm = BernoulliRBM(random_state=0, verbose=True)
rbm.n_components = 300  # TODO: tune as well
rbm.learning_rate = 0.05
rbm.n_iter = 30
classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
classifier.fit(Xtr, Ytr)



preds = classifier.predict(Xts)
print classification_report(Yts, preds)
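# A hedged addition, not in the original snippet: the other examples in this
# collection compare against a logistic regression on the raw pixels, so an
# equivalent baseline here (raw_logistic is a name introduced for
# illustration) would be:
raw_logistic = linear_model.LogisticRegression(C=50.)
raw_logistic.fit(Xtr, Ytr)
print(classification_report(Yts, raw_logistic.predict(Xts)))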
Beispiel #56
0
from sklearn import datasets
from sklearn.neural_network import BernoulliRBM
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

base = datasets.load_digits()
previsores = np.asarray(base.data, 'float32')
classe = base.target

normalizador = MinMaxScaler(feature_range=(0,1))
previsores = normalizador.fit_transform(previsores)

previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.2, random_state=0)

rbm = BernoulliRBM(random_state=0)  # Bernoulli RBM to reduce the dimensionality
rbm.n_iter = 25  # number of training iterations
rbm.n_components = 50  # neurons in the hidden layer; the input layer is implicit, it already takes the 64 pixels

naive_rbm = GaussianNB()
classificador_rbm = Pipeline(steps=[('rbm', rbm), ('naive', naive_rbm)])  # reduce with the Bernoulli RBM, then hand off to the Gaussian Naive Bayes
classificador_rbm.fit(previsores_treinamento, classe_treinamento)


# ==== Visualizing the reduced images (the learned RBM components) ====
plt.figure(figsize=(20,20))
for i, comp in enumerate(rbm.components_):
    plt.subplot(10, 10, i + 1)
    plt.imshow(comp.reshape((8,8)), cmap=plt.cm.gray_r)
    plt.xticks(())
    plt.yticks(())
plt.show()
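# A hedged addition (not in the original example): evaluate the RBM + Naive
# Bayes pipeline against a plain GaussianNB on the raw pixels. The names
# previsoes_rbm and naive_simples are introduced here for illustration.
from sklearn.metrics import accuracy_score

previsoes_rbm = classificador_rbm.predict(previsores_teste)
print(accuracy_score(classe_teste, previsoes_rbm))

naive_simples = GaussianNB()
naive_simples.fit(previsores_treinamento, classe_treinamento)
print(accuracy_score(classe_teste, naive_simples.predict(previsores_teste)))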
Beispiel #57
0
import sklearn.svm

# `rbm` and `logistic` are assumed to be defined earlier in the original script
svc = sklearn.svm.SVC()

# classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
classifier = Pipeline(steps=[('rbm', rbm), ('svc', svc)])

###############################################################################
# Training

# Hyper-parameters. These were set by cross-validation,
# using a GridSearchCV. Here we are not performing cross-validation to
# save time.
rbm.learning_rate = 0.06
rbm.n_iter = 20
# More components tend to give better prediction performance, but larger
# fitting time
rbm.n_components = 100
logistic.C = 6000.0  # unused by the SVC pipeline below; left over from the logistic variant

# Training RBM-SVC Pipeline
classifier.fit(X_train, Y_train)

# Training Logistic regression
logistic_classifier = linear_model.LogisticRegression(C=100.0)
logistic_classifier.fit(X_train, Y_train)

###############################################################################
# Evaluation

print()
print("Logistic regression using RBM features:\n%s\n" %
      (metrics.classification_report(Y_test, classifier.predict(X_test))))
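# A hedged, assumed continuation mirroring the other examples: report the
# raw-pixel logistic regression baseline trained above as well.
print("Logistic regression using raw pixel features:\n%s\n" %
      (metrics.classification_report(Y_test, logistic_classifier.predict(X_test))))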
rbm = BernoulliRBM(random_state=0, verbose=True)

classifier = Pipeline(steps=[("rbm", rbm), ("logistic", logistic)])

###############################################################################
# Training

# Hyper-parameters. These were set by cross-validation,
# using a GridSearchCV. Here we are not performing cross-validation to
# save time.
rbm.learning_rate = learning_rate
rbm.n_iter = training_epochs
rbm.batch_size = batch_size
# More components tend to give better prediction performance, but larger
# fitting time
rbm.n_components = n_hidden
logistic.C = 1000.0

# Training RBM-Logistic Pipeline
classifier.fit(np_train_set[:, n_labels:], Y_train)

# Training Logistic regression
logistic_classifier = linear_model.LogisticRegression(C=100.0)
logistic_classifier.fit(X_train, Y_train)

###############################################################################
# Evaluation

print()
print(
    "Logistic regression using RBM features:\n%s\n"