Example #1
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline = Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)
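
# A minimal usage sketch (an assumption, not part of the original example): trainData
# is expected in NLTK format, i.e. a list of (feature-dict, label) pairs.
example_data = [({'contains(good)': True}, 'pos'),
                ({'contains(bad)': True}, 'neg')]
classifier = trainClassifier(example_data)
print(classifier.classify({'contains(good)': True}))
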
Example #2
def evaluate_classifier(featx):

    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    #testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifierName = 'SVM'
    classifier = SklearnClassifier(LinearSVC(), sparse=False).train(trainfeats)

    newsdata = {}
    '''
    news_path = "./xa/"
    out_ = open('result.txt', 'w')

    for root, dirs, files, in os.walk(news_path):
        for name in files:
            if name == ".DS_Store":
                continue
            fp = open(root+'/'+name, 'r')
            #print(name)
            date = ''
            text = []
            gotDate = False
            #print(root+'/'+name)
            for line in fp:
                if gotDate == False:
                    date = line.replace('\n','')
                    gotDate = True
                    if date not in newsdata:
                        newsdata[date] = [0,0]
                else:
                    if len(line.strip()) == 0:
                        gotDate = False
                        continue
                    text.append(line)
                    #print(text)
                    newsfeat = [(featx(f), date) for f in word_split(text)]
                    del text[:]
                    observed = classifier.classify(newsfeat[0][0])
                    if observed == 'neg':
                        newsdata[date][1] += 1
                        #print('------------------------------ '+ 'neg')
                    else:
                        newsdata[date][0] += 1
                        #print('------------------------------ '+ 'pos')
                        #print(root+'/'+name+': '+ 'pos')

                    gotDate = False
            fp.close()
    for date in newsdata:
        #print(date+': '+str(newsdata[date][0])+', '+str(newsdata[date][1]))
        out_.write(date+'\n'+str(newsdata[date][0])+', '+str(newsdata[date][1])+'\n')
    out_.close() 
    '''
    out_ = open('TEST_result.txt', 'w')

    fp = open('test_half_half.txt', 'r')
    #print(name)
    date = ''
    text = []
    gotDate = False
    #print(root+'/'+name)
    for line in fp:
        if gotDate == False:
            date = line.replace('\n', '')
            gotDate = True
            if date not in newsdata:
                newsdata[date] = [0, 0]
        else:
            if len(line.strip()) == 0:
                gotDate = False
                continue
            text.append(line)
            print(text)
            newsfeat = [(featx(f), date) for f in word_split(text)]
            del text[:]
            observed = classifier.classify(newsfeat[0][0])
            if observed == 'neg':
                newsdata[date][1] += 1
                print('------------------------------ ' + 'neg')
            else:
                newsdata[date][0] += 1
                print('------------------------------ ' + 'pos')
                #print(root+'/'+name+': '+ 'pos')

            gotDate = False
    fp.close()
    for date in newsdata:
        #print(date+': '+str(newsdata[date][0])+', '+str(newsdata[date][1]))
        out_.write(date + '\n' + str(newsdata[date][0]) + ', ' +
                   str(newsdata[date][1]) + '\n')
    out_.close()
Example #3
def main(params):
    dp = DataProvider(params)
    auth_to_ix = dp.create_author_idx()

    # Preprocess the training data
    train_docs = []
    targets = []
    model = {}

    # remove numbers
    bad_hombres = list(range(10))
    if params['nostop']:
        bad_hombres = bad_hombres + stopwords.words('english')
    if params['nopunct']:
        bad_hombres = bad_hombres + list(string.punctuation)

    bad_hombres = set(bad_hombres)

    all_words = Counter()

    for i, doc in enumerate(dp.data['docs']):
        no_num = re.sub(r'\d+', '', doc['text'].lower())
        curr_text = [
            w for w in wordpunct_tokenize(no_num) if w not in bad_hombres
        ]
        dp.data['docs'][i]['tokenized'] = curr_text
        if doc['split'] == 'train':
            all_words.update(curr_text)

    short_vocab = {
        w: i
        for i, w in enumerate([
            wrd for wrd in all_words
            if all_words[wrd] > params['vocab_threshold']
        ])
    }

    docCounts_train, target_train = count(dp,
                                          short_vocab,
                                          auth_to_ix,
                                          split='train')
    bow_features_train, idf_train = bow_features(docCounts_train,
                                                 params['tfidf'])

    docCounts_val, target_val = count(dp, short_vocab, auth_to_ix, split='val')
    bow_features_val, _ = bow_features(docCounts_val,
                                       params['tfidf'],
                                       idf=idf_train)

    # Do PCA?
    if params['pca'] > 0:
        pca_model = PCA(n_components=params['pca'])
        bow_features_train = pca_model.fit_transform(bow_features_train)
        print('Explained variance is %.2f' %
              sum(pca_model.explained_variance_ratio_))

        bow_features_val = pca_model.transform(bow_features_val)
        params['pca'] = bow_features_train.shape[-1]

    # Normalize the data
    bow_features_train, mean_tr, std_tr = normalize(bow_features_train)
    bow_features_val, _, _ = normalize(bow_features_val, mean_tr, std_tr)

    if not params['mlp']:
        if params['linearsvm']:
            # LinearSVC already implements one-vs-rest
            svm_model = LinearSVC()  # verbose=1)
            svm_model.fit(bow_features_train, target_train)

        # Time to evaluate now.
        confTr = svm_model.decision_function(bow_features_train)
        confVal = svm_model.decision_function(bow_features_val)
    else:
        params['num_output_layers'] = len(auth_to_ix)
        params['inp_size'] = params['pca']
        model = MLP_classifier(params)
        model.fit(bow_features_train, target_train, bow_features_val,
                  target_val, params['epochs'], params['lr'], params['l2'])
        confTr = model.decision_function(bow_features_train)
        confVal = model.decision_function(bow_features_val)

    mean_rank_train = np.where(
        confTr.argsort(axis=1)[:, ::-1] == target_train[:, None])[1].mean()
    topk_train = (
        np.where(confTr.argsort(axis=1)[:, ::-1] == target_train[:, None])[1]
        <= params['topk']).sum() * 100. / len(target_train)
    train_accuracy = 100. * float(
        (confTr.argmax(axis=1) == target_train).sum()) / len(target_train)

    mean_rank_val = np.where(
        confVal.argsort(axis=1)[:, ::-1] == target_val[:, None])[1].mean()
    topk_val = (
        np.where(confVal.argsort(axis=1)[:, ::-1] == target_val[:, None])[1] <=
        params['topk']).sum() * 100. / len(target_val)
    val_accuracy = 100. * float(
        (confVal.argmax(axis=1) == target_val).sum()) / len(target_val)

    # Do the binary evaluation, similar to Bagnall's approach
    # confTr = confTr - confTr.mean(axis=1)[:,None]
    n_auths = len(auth_to_ix)

    n_train = confTr.shape[0]
    neg_auths_tr = np.random.randint(0, n_auths, n_train)
    adjusted_scores_tr = ((np.argsort(
        confTr[:, np.concatenate([target_train.astype(int), neg_auths_tr])],
        axis=0) == np.concatenate([np.arange(n_train),
                                   np.arange(n_train)])).argmax(axis=0) +
                          1) / float(n_train)
    auc_tr = roc_auc_score(
        np.concatenate([
            np.ones(int(n_train), dtype=int),
            np.zeros(int(n_train), dtype=int)
        ]), adjusted_scores_tr)

    n_val = confVal.shape[0]
    neg_auths_val = np.random.randint(0, n_auths, n_val)
    adjusted_scores_val = ((np.argsort(
        confVal[:, np.concatenate([target_val.astype(int), neg_auths_val])],
        axis=0) == np.concatenate([np.arange(n_val),
                                   np.arange(n_val)])).argmax(axis=0) +
                           1) / float(n_val)
    auc_val = roc_auc_score(
        np.concatenate(
            [np.ones(int(n_val), dtype=int),
             np.zeros(int(n_val), dtype=int)]), adjusted_scores_val)

    print('------------- Training set-------------------')
    print('Accuracy is %.2f, Mean rank is %.2f / %d' %
          (train_accuracy, mean_rank_train, len(auth_to_ix)))
    print('Top-%d Accuracy is %.2f' % (params['topk'], topk_train))
    print('Accuracy per adjusted scores %.3f' % (100. * (
        (adjusted_scores_tr[:n_train] >= 0.5).sum() +
        (adjusted_scores_tr[n_train:] < 0.5).sum()) / (2. * n_train)))
    print('AUC is %.2f' % auc_tr)

    print('------------- Val set-------------------')
    print('Accuracy is %.2f, Mean rank is %.2f / %d' %
          (val_accuracy, mean_rank_val, len(auth_to_ix)))
    print('Top-%d Accuracy is %.2f' % (params['topk'], topk_val))
    print('Accuracy per adjusted scores %.3f' % (100. * (
        (adjusted_scores_val[:n_val] >= 0.5).sum() +
        (adjusted_scores_val[n_val:] < 0.5).sum()) / (2. * n_val)))
    print('AUC is %.2f' % auc_val)

    print('--------------------------------------------------------------------------')
    print('--------------------------------------------------------------------------\n\n')
Example #4
from itertools import chain
X_train = np.array([feature.hog(im) for im in chain(positive_patches, negative_patches)])
y_train = np.zeros(X_train.shape[0])
y_train[:positive_patches.shape[0]] = 1
X_train.shape

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in newer scikit-learn

cross_val_score(GaussianNB(), X_train, y_train)

from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in newer scikit-learn
grid = GridSearchCV(LinearSVC(), {'C': [1.0, 2.0, 4.0, 8.0]})
grid.fit(X_train, y_train)
grid.best_score_
grid.best_params_

model = grid.best_estimator_
model.fit(X_train, y_train)

import skimage
test_image = skimage.data.astronaut()
test_image = skimage.color.rgb2gray(test_image)
test_image = skimage.transform.rescale(test_image, .5)
test_image = test_image[:160, 40:180]

plt.imshow(test_image, cmap='gray')
plt.axis('off')
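
# A hedged continuation sketch (not in the original snippet): slide a window over the
# test image and classify each patch with the trained model. `sliding_window`, the
# step sizes and the patch size are assumptions; `feature.hog` must match the
# descriptor used at training time.
def sliding_window(img, patch_size=positive_patches[0].shape, istep=2, jstep=2):
    Ni, Nj = patch_size
    for i in range(0, img.shape[0] - Ni, istep):
        for j in range(0, img.shape[1] - Nj, jstep):
            yield (i, j), img[i:i + Ni, j:j + Nj]

indices, patches = zip(*sliding_window(test_image))
patches_hog = np.array([feature.hog(patch) for patch in patches])
labels = model.predict(patches_hog)
print(labels.sum(), "candidate face detections")
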
Example #5
from sklearn.svm import LinearSVC

# Initialize and fit the model
model = LinearSVC()
model.fit(X_train, y_train)

# Generate predictions and score them manually
predictions = model.predict(X_test)
print(sum(predictions == y_test.squeeze()) / len(y_test))
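
# The printed fraction above is just the accuracy; the same value is available
# through the built-in helpers (a usage note, assuming X_test/y_test as above).
from sklearn.metrics import accuracy_score

print(model.score(X_test, y_test))
print(accuracy_score(y_test, predictions))
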
Example #6
# initialize the HOG descriptor
hog = HOG(orientations = 18, pixelsPerCell = (10, 10),
	cellsPerBlock = (1, 1), normalize = True)

# loop over the images
for image in digits:
	# deskew the image, center it
	image = dataset.deskew(image, 20)
	image = dataset.center_extent(image, (20, 20))

	# describe the image and update the data matrix
	hist = hog.describe(image)
	data.append(hist)

# train the model
model = LinearSVC(random_state = 42)
model.fit(data, target)

# dump the model to file
f = open(args["model"], "w")
f.write(cPickle.dumps(model))
f.close()
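
# A minimal follow-up sketch (an assumption, not part of the original snippet): load
# the dumped model back and classify a preprocessed digit with the same HOG
# descriptor. `image` here is the last deskewed/centered digit from the loop above.
f = open(args["model"], "r")
model = cPickle.loads(f.read())
f.close()
hist = hog.describe(image)
print(model.predict([hist])[0])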

cv2.putText(canvas,'Please select an image from the plates folder as filename:', (canvas.shape[1] - 600, canvas.shape[0] - 570), cv2.FONT_HERSHEY_SIMPLEX,0.5, (255, 255, 255), 2)
cv2.putText(canvas,'a. Car1Plate.jpg', (canvas.shape[1] - 600, canvas.shape[0] - 550), cv2.FONT_HERSHEY_SIMPLEX,0.5, (255, 255, 255), 2)
cv2.putText(canvas,'b. Car2Plate.jpg', (canvas.shape[1] - 600, canvas.shape[0] - 530), cv2.FONT_HERSHEY_SIMPLEX,0.5, (255, 255, 255), 2)
cv2.putText(canvas,'c. Car3Plate.jpg', (canvas.shape[1] - 600, canvas.shape[0] - 510), cv2.FONT_HERSHEY_SIMPLEX,0.5, (255, 255, 255), 2)
cv2.putText(canvas,'d. Car4Plate.jpg', (canvas.shape[1] - 600, canvas.shape[0] - 490), cv2.FONT_HERSHEY_SIMPLEX,0.5, (255, 255, 255), 2)
cv2.putText(canvas,'e. Car5Plate.jpg', (canvas.shape[1] - 600, canvas.shape[0] - 470), cv2.FONT_HERSHEY_SIMPLEX,0.5, (255, 255, 255), 2)
cv2.putText(canvas,'Enter a letter..', (canvas.shape[1] - 600, canvas.shape[0] - 450), cv2.FONT_HERSHEY_SIMPLEX,0.5, (255, 255, 255), 2)
cv2.imshow('CONSOLE',canvas)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# Define the classifiers
classifiers = [LogisticRegression(), LinearSVC(), SVC(), KNeighborsClassifier()]

# Fit the classifiers
for c in classifiers:
    c.fit(X, y)

# Plot the classifiers
plot_4_classifiers(X, y, classifiers)
plt.show()
"""
Estimators
Base article: Benchmarking functional connectome-based predictive models for resting-state fMRI
"""
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Lasso, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn import linear_model

feature_selection = SelectPercentile(f_classif, percentile=10)
svc_l1 = LinearSVC(penalty='l1', dual=False, random_state=0)
anova_svcl1 = Pipeline([('anova', feature_selection), ('svc', svc_l1)])
svc_l2 = LinearSVC(penalty='l2', random_state=0)
anova_svcl2 = Pipeline([('anova', feature_selection), ('svc', svc_l2)])
gnb = GaussianNB()
randomf = RandomForestClassifier(random_state=0)
logregression_l1 = LogisticRegression(penalty='l1', dual=False, random_state=0)
logregression_l2 = LogisticRegression(penalty='l2', random_state=0)
lasso = Lasso(random_state=0)
knn = KNeighborsClassifier(n_neighbors=1)
ridge = RidgeClassifier()
netn5 = MLPClassifier(solver='lbfgs',
                      alpha=1e-5,
                      hidden_layer_sizes=(5, ),
                      random_state=1)
netn5a = MLPClassifier(solver='adam',
    # Create a color plot with the results
    n_classes = len(np.unique(y))
    contours = ax.contourf(xx,
                           yy,
                           Z,
                           alpha=0.3,
                           levels=np.arange(n_classes + 1) - 0.5,
                           cmap=cmap,
                           clim=(y.min(), y.max()),
                           zorder=1)

    ax.set(xlim=xlim, ylim=ylim)


lsvc = LinearSVC(penalty='l2', loss='hinge', random_state=42, C=2)
lsvc.fit(X_train, y_train)

plot_decision_regions(X_train.values,
                      y_train.values,
                      clf=lsvc,
                      res=0.001,
                      legend=2)
plt.title('Decision Regions')
plt.xlabel('rank')
plt.ylabel('comments')

lsvc.predict(X_test)

print('score: {}'.format(lsvc.score(X_test, y_test)))
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import (cross_val_score, KFold, cross_validate,
                                     train_test_split)
from sklearn.ensemble import StackingClassifier
from sklearn.feature_selection import RFE
from sklearn.svm import SVC, LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

data = load_wine()
y = data.target
X = data.data
stc = StandardScaler()
lenc = LabelEncoder()
columns = data.feature_names
df = pd.DataFrame(data=np.hstack(tup=(X, y.reshape(-1, 1))),
                  columns=np.hstack(tup=(columns, ["Class"])))
X_std = stc.fit_transform(df[columns])
pipesvm = Pipeline([("stc", stc), ("selection", RFE(LinearSVC())),
                    ("svm", SVC(kernel="linear"))])
pipelda = Pipeline([("stc", stc), ("svm", LinearDiscriminantAnalysis())])
estimators = [("LDA", pipelda), ("SVM", pipesvm)]
# Using stacked classifiers is beneficial for multiclass problems, since it can
# greatly improve the class predictions by exploiting the predictive power of the
# individual forecasts for certain classes
stacking_classifier = StackingClassifier(estimators=estimators,
                                         final_estimator=GaussianNB())
print("Stacking stimators")
print(
    cross_val_score(X=df[columns],
                    y=y,
                    estimator=stacking_classifier,
                    cv=KFold(5)))
print("Only SVM")
Example #11
def plot_dataset(X, y, axes):
    plt.plot(X[:, 0][y == 0], X[:, 1][y == 0], "bs")
    plt.plot(X[:, 0][y == 1], X[:, 1][y == 1], "g^")
    plt.axis(axes)
    plt.grid(True, which='both')
    plt.xlabel(r"$x_1$", fontsize=20)
    plt.ylabel(r"$x_2$", fontsize=20, rotation=0)


plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
plt.show()

polynomial_svm_clf = Pipeline([("poly_features", PolynomialFeatures(degree=3)),
                               ("scaler", StandardScaler()),
                               ("svm_clf",
                                LinearSVC(C=10, loss="hinge",
                                          random_state=42))])

polynomial_svm_clf.fit(X, y)


def plot_predictions(clf, axes):
    x0s = np.linspace(axes[0], axes[1], 100)
    x1s = np.linspace(axes[2], axes[3], 100)
    x0, x1 = np.meshgrid(x0s, x1s)
    X = np.c_[x0.ravel(), x1.ravel()]
    y_pred = clf.predict(X).reshape(x0.shape)
    y_decision = clf.decision_function(X).reshape(x0.shape)
    plt.contourf(x0, x1, y_pred, cmap=plt.cm.brg, alpha=0.2)
    plt.contourf(x0, x1, y_decision, cmap=plt.cm.brg, alpha=0.1)
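
# A short usage sketch (an assumption, not in the original fragment): draw the fitted
# polynomial decision boundary on top of the data with the helpers defined above.
plot_predictions(polynomial_svm_clf, [-1.5, 2.5, -1, 1.5])
plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
plt.show()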

        def get_ten_fold_crossvalid_perfermance(self, settings = None):
            fisher_mode = settings['fisher_mode']
            analysis_scr = []
            predicted_score = settings['predicted_score']
            reduce_ratio = settings['reduce_ratio']
            #for seq_no in range(1, self.ddi_obj.total_number_of_sequences+1):
            #subset_size = math.floor(self.ddi_obj.total_number_of_sequences / 10.0)
            kf = KFold(self.ddi_obj.total_number_of_sequences, n_folds = 10, shuffle = True)
            #for subset_no in range(1, 11):
            for ((train_index, test_index),subset_no) in izip(kf,range(1,11)):
            #for train_index, test_index in kf;
                print("Subset:", subset_no)
                print("Train index: ", train_index)
                print("Test index: ", test_index)
                #logger.info('subset number: ' + str(subset_no))
                if settings['SVM']:
                    print "SVM"
                    (train_X_10fold, train_y_10fold),(train_X_reduced, train_y_reduced), (test_X, test_y) = self.ddi_obj.get_ten_fold_crossvalid_one_subset(train_index, test_index, fisher_mode = fisher_mode, reduce_ratio = reduce_ratio)
                    standard_scaler = preprocessing.StandardScaler().fit(train_X_reduced)
                    scaled_train_X = standard_scaler.transform(train_X_reduced)
                    scaled_test_X = standard_scaler.transform(test_X)
                    Linear_SVC = LinearSVC(C=1, penalty="l2")
                    Linear_SVC.fit(scaled_train_X, train_y_reduced)
                    predicted_test_y = Linear_SVC.predict(scaled_test_X)
                    isTest = True; #new
                    analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SVM', isTest) + tuple(performance_score(test_y, predicted_test_y).values())) #new

                    predicted_train_y = Linear_SVC.predict(scaled_train_X)
                    isTest = False; #new
                    analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SVM', isTest) + tuple(performance_score(train_y_reduced, predicted_train_y).values()))

                    
                if settings['SVM_RBF']:
                    print "SVM_RBF"
                    standard_scaler = preprocessing.StandardScaler().fit(train_X_reduced)
                    scaled_train_X = standard_scaler.transform(train_X_reduced)
                    scaled_test_X = standard_scaler.transform(test_X)
                    L1_SVC_RBF_Selector = SVC(C=1, gamma=0.01, kernel='rbf').fit(scaled_train_X, train_y_reduced)

                    predicted_test_y = L1_SVC_RBF_Selector.predict(scaled_test_X)
                    isTest = True; #new
                    analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SVM_RBF', isTest) + tuple(performance_score(test_y, predicted_test_y).values())) #new

                    predicted_train_y = L1_SVC_RBF_Selector.predict(scaled_train_X)
                    isTest = False; #new
                    analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SVM_RBF', isTest) + tuple(performance_score(train_y_reduced, predicted_train_y).values()))
          
                # direct deep learning 
                min_max_scaler = Preprocessing_Scaler_with_mean_point5()
                X_train_pre_validation_minmax = min_max_scaler.fit(train_X_reduced)
                X_train_pre_validation_minmax = min_max_scaler.transform(train_X_reduced)
                x_test_minmax = min_max_scaler.transform(test_X)
                pretraining_X_minmax = min_max_scaler.transform(train_X_10fold)
                x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = train_test_split(X_train_pre_validation_minmax, 
                                                                                                  train_y_reduced
                                                                    , test_size=0.4, random_state=42)
                finetune_lr = settings['finetune_lr']
                batch_size = settings['batch_size']
                pretraining_epochs = cal_epochs(settings['pretraining_interations'], x_train_minmax, batch_size = batch_size)
                #pretrain_lr=0.001
                pretrain_lr = settings['pretrain_lr']
                training_epochs = settings['training_epochs']
                hidden_layers_sizes= settings['hidden_layers_sizes']
                corruption_levels = settings['corruption_levels']
                if settings['SAE_SVM']: 
                    # SAE_SVM
                    print 'SAE followed by SVM'
                    x = X_train_pre_validation_minmax
                    a_MAE_A = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs, pretrain_lr=pretrain_lr, batch_size=batch_size, 
                                            hidden_layers_sizes =hidden_layers_sizes, corruption_levels=corruption_levels)
                    new_x_train_minmax_A =  a_MAE_A.transform(X_train_pre_validation_minmax)
                    new_x_test_minmax_A =  a_MAE_A.transform(x_test_minmax)
                    Linear_SVC = LinearSVC(C=1, penalty="l2")
                    Linear_SVC.fit(new_x_train_minmax_A, train_y_reduced)
                    predicted_test_y = Linear_SVC.predict(new_x_test_minmax_A)
                    isTest = True; #new
                    analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SAE_SVM', isTest) + tuple(performance_score(test_y, predicted_test_y).values())) #new

                    predicted_train_y = Linear_SVC.predict(new_x_train_minmax_A)
                    isTest = False; #new
                    analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SAE_SVM', isTest) + tuple(performance_score(train_y_reduced, predicted_train_y).values()))
                 
                                  
                    
                if settings['DL']:
                    print "direct deep learning"
                    sda = trainSda(x_train_minmax, y_train_minmax,
                                 x_validation_minmax, y_validation_minmax , 
                                 x_test_minmax, test_y,
                                 hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                                 training_epochs = training_epochs, pretraining_epochs = pretraining_epochs, 
                                 pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                     )
                    print 'hidden_layers_sizes:', hidden_layers_sizes
                    print 'corruption_levels:', corruption_levels
                    training_predicted = sda.predict(x_train_minmax)
                    y_train = y_train_minmax
                    isTest = False; #new
                    analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL', isTest) + tuple(performance_score(y_train, training_predicted).values()))

                    test_predicted = sda.predict(x_test_minmax)
                    y_test = test_y
                    isTest = True; #new
                    analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL', isTest) + tuple(performance_score(y_test, test_predicted).values()))

                if settings['DL_U']:
                # deep learning using unlabeled data for pretraining
                    print 'deep learning with unlabel data'
                    
                    pretraining_epochs = cal_epochs(settings['pretraining_interations'], x_train_minmax, batch_size = batch_size)
                    sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
                                 x_validation_minmax, y_validation_minmax , 
                                 x_test_minmax, test_y, 
                                 pretraining_X_minmax = pretraining_X_minmax,
                                 hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                                 training_epochs = training_epochs, pretraining_epochs = pretraining_epochs, 
                                 pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                     )
                    print 'hidden_layers_sizes:', hidden_layers_sizes
                    print 'corruption_levels:', corruption_levels
                    training_predicted = sda_unlabel.predict(x_train_minmax)
                    y_train = y_train_minmax
                    isTest = False; #new
                    analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL_U', isTest) + tuple(performance_score(y_train, training_predicted, predicted_score).values()))

                    test_predicted = sda_unlabel.predict(x_test_minmax)
                    y_test = test_y

                    isTest = True; #new
                    analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL_U', isTest) + tuple(performance_score(y_test, test_predicted, predicted_score).values()))
                if settings['DL_S']:
                    # deep learning using split network
                    print 'deep learning using split network'
                    # get the new representation for A set. first 784-D
                    pretraining_epochs = cal_epochs(settings['pretraining_interations'], x_train_minmax, batch_size = batch_size)
                    
                    x = x_train_minmax[:, :x_train_minmax.shape[1]/2]
                    print "original shape for A", x.shape
                    a_MAE_A = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs, pretrain_lr=pretrain_lr, batch_size=batch_size, 
                                            hidden_layers_sizes =hidden_layers_sizes, corruption_levels=corruption_levels)
                    new_x_train_minmax_A =  a_MAE_A.transform(x_train_minmax[:, :x_train_minmax.shape[1]/2])
                    x = x_train_minmax[:, x_train_minmax.shape[1]/2:]
                    
                    print "original shape for B", x.shape
                    a_MAE_B = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs, pretrain_lr=pretrain_lr, batch_size=batch_size, 
                                            hidden_layers_sizes =hidden_layers_sizes, corruption_levels=corruption_levels)
                    new_x_train_minmax_B =  a_MAE_B.transform(x_train_minmax[:, x_train_minmax.shape[1]/2:])
                    
                    new_x_test_minmax_A = a_MAE_A.transform(x_test_minmax[:, :x_test_minmax.shape[1]/2])
                    new_x_test_minmax_B = a_MAE_B.transform(x_test_minmax[:, x_test_minmax.shape[1]/2:])
                    new_x_validation_minmax_A = a_MAE_A.transform(x_validation_minmax[:, :x_validation_minmax.shape[1]/2])
                    new_x_validation_minmax_B = a_MAE_B.transform(x_validation_minmax[:, x_validation_minmax.shape[1]/2:])
                    new_x_train_minmax_whole = np.hstack((new_x_train_minmax_A, new_x_train_minmax_B))
                    new_x_test_minmax_whole = np.hstack((new_x_test_minmax_A, new_x_test_minmax_B))
                    new_x_validationt_minmax_whole = np.hstack((new_x_validation_minmax_A, new_x_validation_minmax_B))

                    
                    sda_transformed = trainSda(new_x_train_minmax_whole, y_train_minmax,
                         new_x_validationt_minmax_whole, y_validation_minmax , 
                         new_x_test_minmax_whole, y_test,
                         hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                         training_epochs = training_epochs, pretraining_epochs = pretraining_epochs, 
                         pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                         )
                    
                    print 'hidden_layers_sizes:', hidden_layers_sizes
                    print 'corruption_levels:', corruption_levels
                    training_predicted = sda_transformed.predict(new_x_train_minmax_whole)
                    y_train = y_train_minmax
                    
                    isTest = False; #new
                    analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL_S', isTest) + tuple(performance_score(y_train, training_predicted, predicted_score).values()))

                    test_predicted = sda_transformed.predict(new_x_test_minmax_whole)
                    y_test = test_y

                    isTest = True; #new
                    analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL_S', isTest) + tuple(performance_score(y_test, test_predicted, predicted_score).values()))
            
            
            report_name = filename + '_' + '_test10fold_'.join(map(str, hidden_layers_sizes)) + \
                          '_' + str(pretrain_lr) + '_' + str(finetune_lr) + '_' + str(reduce_ratio) + \
                          '_' + str(training_epochs) + '_' + current_date
            saveAsCsv(predicted_score, report_name, performance_score(y_test, test_predicted, predicted_score), analysis_scr)
        def get_LOO_perfermance(self, settings = None):
            fisher_mode = settings['fisher_mode']
            analysis_scr = []
            predicted_score = settings['predicted_score'] 
            reduce_ratio = settings['reduce_ratio'] 
            for seq_no in range(1, self.ddi_obj.total_number_of_sequences+1):
                print seq_no
                logger.info('sequence number: ' + str(seq_no))
                if settings['SVM']:
                    print "SVM"
                    (train_X_LOO, train_y_LOO),(train_X_reduced, train_y_reduced), (test_X, test_y) = self.ddi_obj.get_LOO_training_and_reduced_traing(seq_no,fisher_mode = fisher_mode, reduce_ratio = reduce_ratio)
                    standard_scaler = preprocessing.StandardScaler().fit(train_X_reduced)
                    scaled_train_X = standard_scaler.transform(train_X_reduced)
                    scaled_test_X = standard_scaler.transform(test_X)
                    Linear_SVC = LinearSVC(C=1, penalty="l2")
                    Linear_SVC.fit(scaled_train_X, train_y_reduced)
                    predicted_test_y = Linear_SVC.predict(scaled_test_X)
                    isTest = True; #new
                    analysis_scr.append((self.ddi, seq_no, fisher_mode, 'SVM', isTest) + tuple(performance_score(test_y, predicted_test_y).values())) #new

                    predicted_train_y = Linear_SVC.predict(scaled_train_X)
                    isTest = False; #new
                    analysis_scr.append((self.ddi, seq_no, fisher_mode, 'SVM', isTest) + tuple(performance_score(train_y_reduced, predicted_train_y).values()))
                # Deep learning part
                min_max_scaler = Preprocessing_Scaler_with_mean_point5()
                X_train_pre_validation_minmax = min_max_scaler.fit(train_X_reduced)
                X_train_pre_validation_minmax = min_max_scaler.transform(train_X_reduced)
                x_test_minmax = min_max_scaler.transform(test_X)
                pretraining_X_minmax = min_max_scaler.transform(train_X_LOO)
                x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = train_test_split(X_train_pre_validation_minmax, 
                                                                                                  train_y_reduced
                                                                    , test_size=0.4, random_state=42)
                finetune_lr = settings['finetune_lr']
                batch_size = settings['batch_size']
                pretraining_epochs = cal_epochs(settings['pretraining_interations'], x_train_minmax, batch_size = batch_size)
                #pretrain_lr=0.001
                pretrain_lr = settings['pretrain_lr']
                training_epochs = settings['training_epochs']
                hidden_layers_sizes= settings['hidden_layers_sizes']
                corruption_levels = settings['corruption_levels']
                if settings['DL']:
                    print "direct deep learning"
                    # direct deep learning 
                    sda = trainSda(x_train_minmax, y_train_minmax,
                                 x_validation_minmax, y_validation_minmax , 
                                 x_test_minmax, test_y,
                                 hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                                 training_epochs = training_epochs, pretraining_epochs = pretraining_epochs, 
                                 pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                     )
                    print 'hidden_layers_sizes:', hidden_layers_sizes
                    print 'corruption_levels:', corruption_levels
                    training_predicted = sda.predict(x_train_minmax)
                    y_train = y_train_minmax
                    isTest = False; #new
                    analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL', isTest) + tuple(performance_score(y_train, training_predicted).values()))

                    test_predicted = sda.predict(x_test_minmax)
                    y_test = test_y
                    isTest = True; #new
                    analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL', isTest) + tuple(performance_score(y_test, test_predicted).values()))

                if 0:
                    # deep learning using unlabeled data for pretraining
                    print 'deep learning with unlabel data'
                    pretraining_epochs_for_reduced = cal_epochs(1500, pretraining_X_minmax, batch_size = batch_size)
                    sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
                                 x_validation_minmax, y_validation_minmax , 
                                 x_test_minmax, test_y, 
                                 pretraining_X_minmax = pretraining_X_minmax,
                                 hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                                 training_epochs = training_epochs, pretraining_epochs = pretraining_epochs_for_reduced, 
                                 pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                     )
                    print 'hidden_layers_sizes:', hidden_layers_sizes
                    print 'corruption_levels:', corruption_levels
                    training_predicted = sda_unlabel.predict(x_train_minmax)
                    y_train = y_train_minmax
                    isTest = False; #new
                    analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_U', isTest) + tuple(performance_score(y_train, training_predicted, predicted_score).values()))

                    test_predicted = sda_unlabel.predict(x_test_minmax)
                    y_test = test_y

                    isTest = True; #new
                    analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_U', isTest) + tuple(performance_score(y_test, test_predicted, predicted_score).values()))
                if settings['Split_DL']:
                    # deep learning using split network
                    print 'deep learning using split network'
                    # get the new representation for A set. first 784-D
                    pretraining_epochs = cal_epochs(settings['pretraining_interations'], x_train_minmax, batch_size = batch_size)
                    hidden_layers_sizes= settings['hidden_layers_sizes']
                    corruption_levels = settings['corruption_levels']
                    
                    x = x_train_minmax[:, :x_train_minmax.shape[1]/2]
                    print "original shape for A", x.shape
                    a_MAE_A = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs, pretrain_lr=pretrain_lr, batch_size=batch_size, 
                                            hidden_layers_sizes =hidden_layers_sizes, corruption_levels=corruption_levels)
                    new_x_train_minmax_A =  a_MAE_A.transform(x_train_minmax[:, :x_train_minmax.shape[1]/2])
                    x = x_train_minmax[:, x_train_minmax.shape[1]/2:]
                    
                    print "original shape for B", x.shape
                    a_MAE_B = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs, pretrain_lr=pretrain_lr, batch_size=batch_size, 
                                            hidden_layers_sizes =hidden_layers_sizes, corruption_levels=corruption_levels)
                    new_x_train_minmax_B =  a_MAE_B.transform(x_train_minmax[:, x_train_minmax.shape[1]/2:])
                    
                    new_x_test_minmax_A = a_MAE_A.transform(x_test_minmax[:, :x_test_minmax.shape[1]/2])
                    new_x_test_minmax_B = a_MAE_B.transform(x_test_minmax[:, x_test_minmax.shape[1]/2:])
                    new_x_validation_minmax_A = a_MAE_A.transform(x_validation_minmax[:, :x_validation_minmax.shape[1]/2])
                    new_x_validation_minmax_B = a_MAE_B.transform(x_validation_minmax[:, x_validation_minmax.shape[1]/2:])
                    new_x_train_minmax_whole = np.hstack((new_x_train_minmax_A, new_x_train_minmax_B))
                    new_x_test_minmax_whole = np.hstack((new_x_test_minmax_A, new_x_test_minmax_B))
                    new_x_validationt_minmax_whole = np.hstack((new_x_validation_minmax_A, new_x_validation_minmax_B))

                    finetune_lr = settings['finetune_lr']
                    batch_size = settings['batch_size']
                    pretraining_epochs = cal_epochs(settings['pretraining_interations'], x_train_minmax, batch_size = batch_size)
                    #pretrain_lr=0.001
                    pretrain_lr = settings['pretrain_lr']
                    training_epochs = settings['training_epochs']
                    hidden_layers_sizes= settings['hidden_layers_sizes']
                    corruption_levels = settings['corruption_levels']
                    
                    sda_transformed = trainSda(new_x_train_minmax_whole, y_train_minmax,
                         new_x_validationt_minmax_whole, y_validation_minmax , 
                         new_x_test_minmax_whole, y_test,
                         hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                         training_epochs = training_epochs, pretraining_epochs = pretraining_epochs, 
                         pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                         )
                    
                    print 'hidden_layers_sizes:', hidden_layers_sizes
                    print 'corruption_levels:', corruption_levels
                    training_predicted = sda_transformed.predict(new_x_train_minmax_whole)
                    y_train = y_train_minmax
                    
                    isTest = False; #new
                    analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_S', isTest) + tuple(performance_score(y_train, training_predicted, predicted_score).values()))

                    test_predicted = sda_transformed.predict(new_x_test_minmax_whole)
                    y_test = test_y

                    isTest = True; #new
                    analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_S', isTest) + tuple(performance_score(y_test, test_predicted, predicted_score).values()))
            
            
            
            report_name = filename + '_' + '_'.join(map(str, hidden_layers_sizes)) + \
                          '_' + str(pretrain_lr) + '_' + str(finetune_lr) + '_' + str(reduce_ratio) + \
                          '_' + str(training_epochs) + '_' + current_date
            saveAsCsv(predicted_score, report_name, performance_score(y_test, test_predicted, predicted_score), analysis_scr)
Example #14
def train_models(Xt=None, yt=None, Xv=None, yv=None, params={}):
    """classifier model training and validation

    the innermost training/validation routine

    Four classifiers are implemented:
        - LDA: linear discriminant analysis
        - QDA: quadratic discriminant analysis
        - OVO: one-vs-one
        - OVR: one-vs-rest


    arguments
    ------
    Xt/Xv: np.array
        feature vectors for training (t) and validation (v)
    yt/yv: list/np.array
        labels, corresponding to X, for training (t) and validation (v)

    params: dict
        doStandardize: bool
            standardize training features?
        numPCs: int
            PCA dimension reduction of training features (after standardization)
        quietmode: bool
            suppress sklearn noise during training
        labels: list
            names of the labels, for sorting confusion matrices
        num_lda: int
            number of dimensions for LDA classifier
        LSVCparam: dict
            LinearSVC parameters

    returns
    ------
    mdic: dictionary
        dictionary w/ transformations, trained models and confusion matrices

    """

    LSVCparam = dict(random_state=0, verbose=0, max_iter=3000)

    # parameters
    p = dict(_about='parameters for train_models()',
             doStandardize=True,
             numPCs=0,
             quietmode=True,
             labels=None,
             num_lda=2,
             LSVCparam=LSVCparam)
    p.update(params)

    # assign labels if not already set
    if p['labels'] is None:
        p['labels'] = np.unique(yt).tolist()

    # labeling sanity checks
    lbls_trn = np.unique(yt).tolist()
    lbls_val = np.unique(yv).tolist()

    # training and validation labels (unique) must match
    if set(lbls_trn) != set(lbls_val):
        print('LABELS MISMATCH: train/validate %s' %
              (str([lbls_trn, lbls_val])))
        raise Exception()

    # passed labels also must match
    for label in p['labels']:
        if label not in lbls_trn:
            print('LABEL [%s] not found in trn/val labels' % label, lbls_trn)
            raise Exception()

    #-------------------------------------------------------
    # standardization
    if p['doStandardize']:
        sc = StandardScaler().fit(Xt)
        Xt = sc.transform(Xt)
        Xv = sc.transform(Xv)
    else:
        sc = None

    # PCA
    if p['numPCs'] > 0:
        pca = PCA(n_components=p['numPCs']).fit(Xt)
        Xt = pca.transform(Xt)
        Xv = pca.transform(Xv)
    else:
        pca = None

    # MODEL TRAINING
    lda = LinearDiscriminantAnalysis(n_components=p['num_lda']).fit(Xt, yt)
    qda = QuadraticDiscriminantAnalysis().fit(Xt, yt)
    with warnings.catch_warnings():
        if p['quietmode']:
            warnings.filterwarnings(
                "ignore", category=sklearn.exceptions.ConvergenceWarning)
        ovo = OneVsOneClassifier(LinearSVC(**p['LSVCparam'])).fit(Xt, yt)
        ovr = OneVsRestClassifier(LinearSVC(**p['LSVCparam'])).fit(Xt, yt)

    # confusion matrices / validation
    cnf_lda = confusion_matrix(yv, lda.predict(Xv), labels=p['labels'])
    cnf_qda = confusion_matrix(yv, qda.predict(Xv), labels=p['labels'])
    cnf_ovr = confusion_matrix(yv, ovr.predict(Xv), labels=p['labels'])
    cnf_ovo = confusion_matrix(yv, ovo.predict(Xv), labels=p['labels'])

    cb = mt.ClassifierBundle(
        sc=sc,
        pca=pca,
        training_params=p,
        classifiers=dict(
            LDA=dict(cls=lda, confusion=cnf_lda),
            QDA=dict(cls=qda, confusion=cnf_qda),
            OVO=dict(cls=ovo, confusion=cnf_ovo),
            OVR=dict(cls=ovr, confusion=cnf_ovr),
        ),
    )

    # pack results into a dictionary
    mdic = dict(_about="classifier models made with train_models()",
                cb=cb,
                sc=sc,
                pca=pca,
                num_trn=len(yt),
                num_val=len(yv),
                classifiers=dict(
                    LDA=dict(cls=lda, confusion=cnf_lda),
                    QDA=dict(cls=qda, confusion=cnf_qda),
                    OVO=dict(cls=ovo, confusion=cnf_ovo),
                    OVR=dict(cls=ovr, confusion=cnf_ovr),
                ),
                params=p)
    return mdic
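
# A minimal usage sketch (an assumption, not from the original source): synthetic
# data stands in for real features, and numPCs=10 exercises the optional PCA step
# described in the docstring. It presumes the helpers referenced inside
# train_models() (StandardScaler, PCA, the classifiers, mt.ClassifierBundle, ...)
# are importable in this module.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=300, n_features=20, n_classes=3,
                                     n_informative=5, random_state=0)
Xt_demo, Xv_demo, yt_demo, yv_demo = train_test_split(X_demo, y_demo, random_state=0)
mdic = train_models(Xt=Xt_demo, yt=yt_demo, Xv=Xv_demo, yv=yv_demo,
                    params=dict(doStandardize=True, numPCs=10))
print(mdic['classifiers']['OVR']['confusion'])
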
print("LogisticRegression_classifier:",
      (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)))

#SGD
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier:",
      (nltk.classify.accuracy(SGDClassifier_classifier, testing_set)))

###SVC
##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(training_set)
##print("SVC_classifier:", (nltk.classify.accuracy(SVC_classifier, testing_set)))
##
#LinearSVC_classifier
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier:",
      (nltk.classify.accuracy(LinearSVC_classifier, testing_set)))

#NuSVC_classifier
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier:",
      (nltk.classify.accuracy(NuSVC_classifier, testing_set)))

voted_classifier = VoteClassifier(classifier, MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier,
                                  SGDClassifier_classifier,
                                  LinearSVC_classifier, NuSVC_classifier)
Example #16
 
train_df = train_df.drop(labels=label, inplace=False, axis=0)

x_train, y_train = prepareData(train_df)
x_valid, y_valid = prepareData(valid_df)

print('Done reading train and validation data!')

print('Training NB')
classificador = GaussianNB(priors=None, var_smoothing=1e-9)
classificador.fit(x_train, y_train)

previsoes_nb = classificador.predict(x_valid)

print('Training SVM')
classificador = LinearSVC()
classificador.fit(x_train, y_train)

previsoes_svc = classificador.predict(x_valid)

print('Training RNA')
kernel_initializer = 'normal'
activation = 'relu'
loss = 'binary_crossentropy'
batch_size = 1500
neurons = 1536
dropout = 0.1
learning_rate = 0.001
beta_1 = 0.97
beta_2 = 0.97
decay = 0.05
Example #17
    save = True if args.save else False

    train, test, train_label, test_label = load_data(args.dataset, args.n_classes)

    # t = args.epoch
    # print(t)
    # train_label[train_label != t] = 10
    # train_label[train_label == t] = 1
    # train_label[train_label == 10] = 0
    # test_label[test_label != t] = 10
    # test_label[test_label == t] = 1
    # test_label[test_label == 10] = 0
    print('training data size: ')
    print(train.shape)
    print('testing data size: ')
    print(test.shape)

    dual = True if args.dual else False
    scd = LinearSVC(C=args.c, dual=dual)
    a = time.time()
    scd.fit(train, train_label)
    print('Cost: %.3f seconds'%(time.time() - a))

    print('Best Train Accuracy: ', accuracy_score(y_true=train_label, y_pred=scd.predict(train)))
    print('Balanced Train Accuracy: ', balanced_accuracy_score(y_true=train_label, y_pred=scd.predict(train)))
    print('Best one Accuracy: ', accuracy_score(y_true=test_label, y_pred=scd.predict(test)))
    print('Balanced Accuracy: ', balanced_accuracy_score(y_true=test_label, y_pred=scd.predict(test)))

    if save:
        save_path = 'checkpoints'
        save_checkpoint(scd, save_path, args.target, et, vc)
from mglearn.datasets import load_extended_boston
from sklearn.model_selection import train_test_split
import pandas as pd
import mglearn
from IPython.display import display
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import numpy as np
import matplotlib.pyplot as plt
X, y = mglearn.datasets.make_forge()
fig, axes = plt.subplots(1, 2, figsize=(10, 3))
for model, ax in zip([LinearSVC(), LogisticRegression()], axes):
    clf = model.fit(X, y)
    mglearn.plots.plot_2d_separator(clf,
                                    X,
                                    fill=False,
                                    eps=0.5,
                                    ax=ax,
                                    alpha=.7)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{}".format(clf.__class__.__name__))
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
axes[0].legend(loc=3)
plt.show()
Example #19
import nltk
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

tweets = ["Hello world, today is a good day",
          "Bye, bye, world, I am sleeping",
          "Hello bikey, it is bleh",
          "Good bye popa, window",
          "Maybe now I will say hello",
          "Tomorrow I will do bye",
          "It is a good night for be hello",
          "Perhaps bye will be okay"]

tokTweets = [nltk.word_tokenize(tweet) for tweet in tweets]
stances = ['yes','no','yes','no','yes','no','yes','no']

stringTweets = [str(tweet) for tweet in tokTweets]


X_train, X_test, y_train, y_test = train_test_split(stringTweets, stances, test_size=0.33, random_state=2)

le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)  # reuse the encoder fitted on the training labels

tf = TfidfVectorizer(max_features=5000)
tf.fit(stringTweets)
X_train_tf = tf.transform(X_train)
X_test_tf = tf.transform(X_test)

svm = LinearSVC()
svm.fit(X_train_tf, y_train)
predictions = svm.predict(X_test_tf)

print("SVM Accuracy score: {0}".format(accuracy_score(predictions, y_test)*100))
Example #20
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

np.random.seed(42)
# fetch_mldata("MNIST original") was removed from scikit-learn; fetch_openml replaces it
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist["data"]
y = mnist["target"].astype(np.uint8)

X_train = X[:60000]
y_train = y[:60000]
X_test = X[60000:]
y_test = y[60000:]

rnd_idx = np.random.permutation(60000)
X_train = X_train[rnd_idx]
y_train = y_train[rnd_idx]

# Model
lin_clf = LinearSVC(random_state=42)
#  lin_clf.fit(X_train, y_train) # Base model
#  y_pred = lin_clf.predict(X_train)
#  accuracy_score(y_train, y_pred) .83

# Scale data
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train.astype(np.float32))
X_test_scaled = scaler.transform(X_test.astype(np.float32))

#  lin_clf.fit(X_train_scaled, y_train) # Scaled model
#  y_pred = lin_clf.predict(X_train_scaled)
#  accuracy_score(y_train, y_pred) .92

# Model with RBF kernel function
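# The example ends at this comment; a minimal sketch of the announced RBF-kernel
# model (an assumption, not part of the original code), trained on a subset of the
# scaled data for speed:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svm_clf = SVC(kernel="rbf", gamma="scale", random_state=42)
svm_clf.fit(X_train_scaled[:10000], y_train[:10000])
y_pred = svm_clf.predict(X_train_scaled[:10000])
print(accuracy_score(y_train[:10000], y_pred))
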
Example #21
    print("{:10}{:20}{:10.2f}{}".format(clf_conf.id, features_key, recall, marker))


clfs = [
    ClfConf(id="lr",
            clf=lambda: LogisticRegression(solver='lbfgs', max_iter=4000),
            normalized=False
            ),
    ClfConf(id="lda",
            clf=lambda: Lda(n_components=None, priors=None, shrinkage=None, solver='svd', store_covariance=False,
                            tol=0.0001),
            normalized=False
            ),
    ClfConf(id="svm_lin",
            clf=lambda: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
                                  intercept_scaling=1, loss='squared_hinge', max_iter=1000,
                                  multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
                                  verbose=0),
            normalized=True
            ),
    ClfConf(id="svm",
            clf=lambda: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                            decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
                            max_iter=-1, probability=False, random_state=None, shrinking=True,
                            tol=0.001, verbose=False),
            normalized=True
            ),
    ClfConf(id="knn",
            clf=lambda: KNeighborsClassifier(n_neighbors=3),
            normalized=False
            ),
    ClfConf(id="nm_g",
def classifier_analysis(X, label, methodType):
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import ShuffleSplit
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    #rng = None
    rng = np.random.RandomState(1)

    if methodType == 0:
        # random forest
        from sklearn.ensemble import RandomForestClassifier
        classifier = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2,
                                            min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto',
                                            max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
                                            bootstrap=True, oob_score=False, n_jobs=n_jobs, random_state=rng, verbose=0,
                                            warm_start=False, class_weight=None)

        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__n_estimators': [5, 10, 20],
            'classifier__max_depth': [None, 10, 5, 3],
            'classifier__max_features': ['auto', 10, 5]
        }
    elif methodType == 1:
        # adaboost
        from sklearn.ensemble import AdaBoostClassifier
        classifier = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=rng)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__n_estimators': [5, 10, 20],
            'classifier__learning_rate': [0.8, 0.9, 1.0]
        }
    elif methodType == 2:
        # GBC
        from sklearn.ensemble import GradientBoostingClassifier
        classifier = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0,
                                                criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
                                                min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                                min_impurity_split=None, init=None, random_state=rng, max_features=None,
                                                verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__n_estimators': [50, 100, 150],
            'classifier__max_depth': [None, 10, 5, 3],
            'classifier__learning_rate': [0.8, 0.9, 1.0]
        }
    elif methodType == 3:
        # logtistic regression
        from sklearn.linear_model import LogisticRegression
        classifier = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
                                        intercept_scaling=1, class_weight=None, random_state=rng, solver='saga',
                                        max_iter=100, multi_class='multinomial', verbose=0, warm_start=False, n_jobs=n_jobs)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.9, 1.0, 1.1]
        }
    elif methodType == 4:
        # SVM
        from sklearn.svm import SVC
        classifier = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False,
                         tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1,
                         decision_function_shape='ovr', random_state=rng)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'classifier__C': [0.9, 1.0, 1.1]
        }
    elif methodType == 5:
        # MLP
        from sklearn.neural_network import MLPClassifier
        classifier = MLPClassifier(hidden_layer_sizes=(100, ), activation='relu', solver='adam', alpha=0.0001,
                                   batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5,
                                   max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False,
                                   warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False,
                                   validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__hidden_layer_sizes': [(100, ), (50, ), (20, )],
            'classifier__learning_rate_init': [0.0001, 0.001, 0.01]
        }
    elif methodType == 6:
        # linear SVM
        from sklearn.svm import LinearSVC
        classifier = LinearSVC(penalty='l2', loss='squared_hinge', dual=False, tol=0.0001, C=1.0, multi_class='ovr',
                               fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=rng,
                               max_iter=1000)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.9, 1.0, 1.1]
        }
    elif methodType == 7:
        # Bernoulli Naive Bayes
        from sklearn.naive_bayes import BernoulliNB
        classifier = BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__alpha': [0.90, 0.95, 1.0],
            'classifier__fit_prior': [True, False]
        }
    elif methodType == 8:
        # multinomial Naive Bayes
        from sklearn.naive_bayes import MultinomialNB
        classifier = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
        param_grid = {
            'classifier__alpha': [0.90, 0.95, 1.0],
            'classifier__fit_prior': [True, False]
        }
    else:
        return

    if methodType == 8:
        pipe = Pipeline([
            ('classifier', classifier)
        ])
    else:
        pipe = Pipeline([
            ('scale', StandardScaler()),
            ('filter', FilterSimu()),
            ('classifier', classifier)
        ])


    grid = GridSearchCV(pipe, cv=ShuffleSplit(n_splits=4, test_size=0.25, random_state=rng), n_jobs=1, param_grid=param_grid)
    grid.fit(X, label)
    best_estimator = grid.best_estimator_

    #mean_scores = np.array(grid.cv_results_['mean_test_score'])
    #mean_tscores = np.array(grid.cv_results_['mean_train_score'])
    #print(mean_scores)
    #print(mean_tscores)

    print(grid.best_params_)
    score = grid.best_score_
    #print(grid.cv_results_['params'])

    return best_estimator, grid.predict(X), score
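
# Hedged usage sketch (assumes the module-level n_jobs and the FilterSimu
# transformer referenced above are defined; the data names are illustrative):
#   best_clf, train_pred, cv_score = classifier_analysis(X_features, y_labels, methodType=0)
#   print(cv_score)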
Ejemplo n.º 23
0
data = np.asarray(data_df)
label = np.asarray(label_df).flatten('F')  # flatten to a 1D vector

scaler = joblib.load('scaler.joblib')
# note: calling fit() here overwrites the parameters stored in scaler.joblib;
# drop this call if the previously fitted scaler should be reused as-is
scaler.fit(data)
data = scaler.transform(data)


x_train, x_test, y_train, y_test = train_test_split(data,label, test_size=0.2, random_state = 4)



mlp = MLPClassifier(random_state=4)
rfc = RandomForestClassifier(random_state=4)
svc = LinearSVC()
ovr = OneVsRestClassifier(svc)
models = [ovr, mlp, rfc]
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)  # random_state only takes effect with shuffle=True


y_pred = []
for model in models:
    # fit on the training split only; fitting on the full data and then scoring
    # on x_test (a subset of it) would leak test information into the evaluation
    model.fit(x_train, y_train)
    y = model.predict(x_test)
    y_pred.append(y)
    print(accuracy_score(y_test, y))
Ejemplo n.º 24
0
# ### Perceptron

# In[ ]:

# Perceptron
perceptron = Perceptron()
acc_perceptron = predict_model(X_data, Y_data, perceptron, X_test_kaggle,
                               'submission_Perception.csv')

# ###  Linear SVC

# In[ ]:

# Linear SVC
linear_svc = LinearSVC()
acc_linear_svc = predict_model(X_data, Y_data, linear_svc, X_test_kaggle,
                               'submission_Linear_SVC.csv')

# ### Stochastic Gradient Descent

# In[ ]:

# Stochastic Gradient Descent
sgd = SGDClassifier()
acc_sgd = predict_model(X_data, Y_data, sgd, X_test_kaggle,
                        'submission_stochastic_Gradient_Descent.csv')

# ### Decision Tree

# In[ ]:
Ejemplo n.º 25
0
# created a new column called 'cleaned' in data to store the processed data

X_train, X_test, y_train, y_test = train_test_split(data['cleaned'],
                                                    data.stars,
                                                    test_size=0.2)
# the 'stars' column in the data contains the star rating of that particular review

pipeline = Pipeline([
    ('vect',
     TfidfVectorizer(ngram_range=(1, 2),
                     stop_words="english",
                     sublinear_tf=True)),
    # TfidfVectorizer transforms text into feature vectors that can be used as input to the estimator
    ('chi', SelectKBest(chi2, k=10000)),
    # selecting the best words/features with the chi-square test
    ('clf', LinearSVC(C=1.0, penalty='l1', max_iter=3000, dual=False))
])
# linear support vector classifier with an L1 penalty and up to 3000 iterations

model = pipeline.fit(X_train, y_train)
# training the pipeline on the processed data

vectorizer = model.named_steps['vect']
chi = model.named_steps['chi']
clf = model.named_steps['clf']

feature_names = vectorizer.get_feature_names()  # get_feature_names_out() in scikit-learn >= 1.0
feature_names = [feature_names[i] for i in chi.get_support(indices=True)]
feature_names = np.asarray(feature_names)

target_names = ['1', '2', '3', '4', '5']
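
# A minimal sketch (not part of the original snippet): with feature_names and
# target_names in hand, the most indicative n-grams for each star rating can
# be read off the LinearSVC coefficients.
for i, target in enumerate(target_names):
    top10 = np.argsort(clf.coef_[i])[-10:]
    print('%s: %s' % (target, ' '.join(feature_names[top10])))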
Ejemplo n.º 26
0
##############################################################################
# There is freedom in the choice of ``events`` composing the feature vectors and we encourage the
# reader to explore different combinations. Note, however, that odd photon-numbered events have
# zero probability because ideal GBS only generates and outputs pairs of photons.
#
# Given our points in the feature space and their target labels, we can use
# scikit-learn's Support Vector Machine `LinearSVC <https://scikit-learn.org/stable/modules/generated/sklearn.svm
# .LinearSVC.html>`__ as our model to train:

from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

R_scaled = StandardScaler().fit_transform(R)  # Transform data to zero mean and unit variance

classifier = LinearSVC()
classifier.fit(R_scaled, classes)

##############################################################################
# Here, the term "linear" refers to the *kernel* function used to calculate inner products
# between vectors in the space. We can use a linear SVM because we have already embedded the
# graphs in a feature space based on GBS. We have also rescaled the feature vectors so that they
# have zero mean and unit variance using scikit-learn's ``StandardScaler``, a technique
# `often used <https://scikit-learn.org/stable/modules/preprocessing.html>`__ in machine learning.
#
# We can then visualize the trained SVM by plotting the decision boundary with respect to the
# points:

w = classifier.coef_[0]
i = classifier.intercept_[0]
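
##############################################################################
# A minimal sketch of that plot (not part of the original snippet, and assuming
# two-dimensional feature vectors with matplotlib available): the decision
# boundary is the line w[0]*x + w[1]*y + i = 0.

import matplotlib.pyplot as plt
import numpy as np

xs = np.linspace(R_scaled[:, 0].min(), R_scaled[:, 0].max(), 100)
ys = -(w[0] * xs + i) / w[1]          # solve w . (x, y) + i = 0 for y
plt.scatter(R_scaled[:, 0], R_scaled[:, 1], c=classes)
plt.plot(xs, ys, "k--")               # decision boundary of the linear SVM
plt.show()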
Ejemplo n.º 27
0
                                                  random_state=42)

# Logistic Regression classifier
log_reg = LogisticRegression(random_state=0)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_val)
log_accuracy = accuracy_score(y_val, y_pred)

# Naive Bayes Classifier
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_val)
nb_accuracy = accuracy_score(y_val, y_pred)

# Support Vector Classifier
lsvm = LinearSVC(random_state=0)
lsvm.fit(X_train, y_train)
y_pred = lsvm.predict(X_val)
lsvm_accuracy = accuracy_score(y_val, y_pred)

print(
    "Accuracy benchmark:\nLogistic Regression: {}\nNaive Bayes: {}\nSupport Vector: {}"
    .format(log_accuracy, nb_accuracy, lsvm_accuracy))

plt.figure()
plt.bar([1, 2, 3], [log_accuracy, nb_accuracy, lsvm_accuracy])
plt.show()

# --------------
# path_test : Location of test data
Ejemplo n.º 28
0
# truth-table comment for the training data (header row lost in extraction):
#  0  |   0   0
#  1  |   0   1
print(type(y_data))

#2. Model

model = LinearSVC()  # specify the model to use

#3. Train

model.fit(x_data, y_data)

#4. Evaluate, predict

x_test = [[0, 0], [1, 0], [0, 1], [1, 1]]

y_pred = model.predict(x_test)

# score = model.evaluate(prediction)

acc = accuracy_score([0, 0, 0, 1], y_pred)  # evaluate = score()
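
# print the predictions and the resulting accuracy (added for illustration;
# [0, 0, 0, 1] above is the expected output for the four test inputs)
print('predictions:', y_pred)
print('accuracy   :', acc)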
Ejemplo n.º 29
0
softmax_reg = LogisticRegression(multi_class="multinomial",
                                 solver="lbfgs",
                                 C=10)
softmax_reg.fit(X, y)
print(softmax_reg.predict([[5, 2]]))
print(softmax_reg.predict_proba([[5, 2]]))

# 3.1 linear SVM; feature scale sensitive, so normalization important
#   SVM does not output probabilities
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
X = iris["data"][:, (2, 3)]
y = (iris["target"] == 2).astype(np.float64)  # iris-virginica
svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=1, loss="hinge")),
])
svm_clf.fit(X, y)
print(svm_clf.predict([[5.5, 1.7]]))

#3.2 polynomial kernel SVM
# setting degree uses the kernel trick, so no high-degree features are actually added
from sklearn.svm import SVC
poly_kernel_svm_clf = Pipeline([("scaler", StandardScaler()),
                                ("svm_clf",
                                 SVC(kernel="poly", degree=3, coef0=1, C=5))])
poly_kernel_svm_clf.fit(X, y)

# 3.3 Gaussian RBF kernel: good for small training sets
# gamma acts like a regularization knob: a high gamma gives a more irregular boundary and tends to overfit
rbf_kernel_svm_clf = Pipeline([("scaler", StandardScaler()),
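                               # completion not in the original snippet; kernel
                               # and parameter values below are illustrative
                               ("svm_clf",
                                SVC(kernel="rbf", gamma=5, C=0.001))])
rbf_kernel_svm_clf.fit(X, y)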
Ejemplo n.º 30
0
def run():
    """Run example for Doc-2-Vec method and IMDB dataset."""
    log.info('START')
    data = {
        'test-neg.txt': 'TEST_NEG',
        'test-pos.txt': 'TEST_POS',
        'train-neg.txt': 'TRAIN_NEG',
        'train-pos.txt': 'TRAIN_POS',
        'train-unsup.txt': 'TRAIN_UNS'
    }
    data = {join(IMDB_MERGED_PATH, k): v for k, v in data.items()}  # .iteritems() is Python 2 only
    sentences = Doc2VecGenerator(data)

    vector_size = 400
    models_path = '/datasets/amazon-data/csv/models/doc2vec/'
    if not exists(models_path):
        makedirs(models_path)
        log.info('Directory: {} has been created'.format(models_path))
    f_name = 'imdb-{}.d2v'.format(vector_size)
    f_model = join(models_path, f_name)

    log.info('Model Load or Save')
    if isfile(f_model):
        model = Doc2Vec.load(f_model)
        log.info('Model has been loaded from: {}'.format(f_model))
    else:
        cores = multiprocessing.cpu_count()
        model = Doc2Vec(min_count=1,
                        window=10,
                        size=vector_size,
                        sample=1e-4,
                        negative=5,
                        workers=cores)
        model.build_vocab(sentences.to_array())
        log.info('Epochs')
        for epoch in range(10):
            log.info('EPOCH: #{}'.format(epoch))
            model.train(sentences.sentences_perm())  # newer gensim versions also require total_examples and epochs here

        model.save(f_model)

    log.info('Sentiment')
    train_arrays = numpy.zeros((25000, vector_size))
    train_labels = numpy.zeros(25000)

    for i in range(12500):
        log.debug('TRAIN_{}'.format(i))
        prefix_train_pos = 'TRAIN_POS_' + str(i)
        prefix_train_neg = 'TRAIN_NEG_' + str(i)
        train_arrays[i] = model.docvecs[prefix_train_pos]
        train_arrays[12500 + i] = model.docvecs[prefix_train_neg]
        train_labels[i] = 1
        train_labels[12500 + i] = 0

    test_arrays = numpy.zeros((25000, vector_size))
    test_labels = numpy.zeros(25000)

    for i in range(12500):
        log.debug('TEST_{}'.format(i))
        prefix_test_pos = 'TEST_POS_' + str(i)
        prefix_test_neg = 'TEST_NEG_' + str(i)
        test_arrays[i] = model.docvecs[prefix_test_pos]
        test_arrays[12500 + i] = model.docvecs[prefix_test_neg]
        test_labels[i] = 1
        test_labels[12500 + i] = 0

    log.info('Fitting')

    classifiers = {
        'BernoulliNB': BernoulliNB(),
        'GaussianNB': GaussianNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'RandomForestClassifier': RandomForestClassifier(),
        'LogisticRegression': LogisticRegression(),
        'SVC': SVC(),
        'LinearSVC': LinearSVC()
    }

    results = {}

    for classifier_name, classifier in classifiers.items():
        log.info('Clf: {}'.format(classifier_name))
        classifier.fit(train_arrays, train_labels)
        #
        # LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
        #                    intercept_scaling=1, penalty='l2', random_state=None,
        #                    tol=0.0001)
        result = classifier.score(test_arrays, test_labels)
        log.info('Clf acc: {}'.format(result))
        results[classifier_name] = result

    log.info(results)
    with open(models_path + 'results-{}'.format(f_name), 'wb') as res:
        pickle.dump(results, res)