def train_and_predict_m8(train, test, labels):
    """Stem the text, build TF-IDF features and predict with a ridge classifier.

    The train/test inputs are first passed through ``stemmer_clean`` (Porter
    stemmer), then vectorised with a word-level TF-IDF model using 1- to
    5-grams.  Depending on the module-level ``gridSearch`` flag the ridge
    classifier is either tuned through ``perform_grid_search`` or fitted
    directly with its constructor defaults.

    Args:
        train: training data handed to ``stemmer_clean``.
        test: test data handed to ``stemmer_clean``.
        labels: target values for the training rows.

    Returns:
        The predictions for the test rows.
    """
    # NOTE(review): the flag passed here is ``stemmerEnableM7`` although this
    # is the M8 model -- confirm that sharing the flag is intended.
    stemmed_train, stemmed_test = stemmer_clean(
        train, test, stemmerEnableM7, stemmer_type='porter')

    # Sub-linear TF, smoothed IDF, custom stop-word list.
    vectorizer = TfidfVectorizer(
        min_df=5,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 5),
        smooth_idf=1,
        sublinear_tf=1,
        stop_words=ML_STOP_WORDS,
    )
    # The vocabulary is learned from the training documents only.
    vectorizer.fit(stemmed_train)
    train_matrix = vectorizer.transform(stemmed_train)
    test_matrix = vectorizer.transform(stemmed_test)

    print("Fitting Ridge Classifer...")
    ridge = RidgeClassifier(class_weight='auto', alpha=1, normalize=True)

    if not gridSearch:
        # No tuning requested: use the estimator exactly as configured above.
        ridge.fit(train_matrix, labels)
        return ridge.predict(test_matrix)

    # Candidate values explored by the grid search.
    # NOTE(review): the original comment says the search optimises
    # quadratic weighted kappa; that lives in perform_grid_search -- confirm.
    search_space = {'alpha': [0.1, 0.3, 1, 3, 10], 'normalize': [True, False]}
    tuned = perform_grid_search(ridge, search_space, train_matrix, labels)
    return tuned.predict(test_matrix)
Example #2
0
def retrain_models(username):
    """Retrain and store the body and header classifiers for ``username``.

    Six collections are fetched through ``model_retriever.retrieve_data_db``.
    The body model (TF-IDF + ``LinearSVC``) is trained on ``body_x + train_x``
    and the header model (``DictVectorizer`` + ``RidgeClassifier``) on
    ``head_x + train_x``; in both cases the matching targets are concatenated
    in the same order.  The fitted vectorizers and models are handed to
    ``store_models``.

    Args:
        username: identifier passed to the data retriever and to
            ``store_models``; it is also concatenated into the progress
            message, so it must be a string.
    """
    train_x, train_y, body_x, body_y, head_x, head_y = \
        model_retriever.retrieve_data_db(username)

    # Body model input: one feature string per message.
    # ``+`` assumes the retriever returns lists -- TODO confirm.
    b_train_y = numpy.concatenate([body_y, train_y])
    b_train_x = [extract_body_features(msg) for msg in (body_x + train_x)]

    body_vec = TfidfVectorizer(norm="l2")
    b_train_x = body_vec.fit_transform(b_train_x)

    # Header model input: one feature dict per message.
    h_train_y = numpy.concatenate([head_y, train_y])
    h_train_x = [extract_header_features(msg) for msg in (head_x + train_x)]

    head_vec = DictVectorizer()
    h_train_x = head_vec.fit_transform(h_train_x)

    # NOTE(review): newer scikit-learn releases spell this loss
    # 'squared_hinge' -- confirm against the installed version.
    body_model = LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3)
    head_model = RidgeClassifier(tol=1e-2, solver="lsqr")

    body_model.fit(b_train_x, b_train_y)
    head_model.fit(h_train_x, h_train_y)

    print("Finished training models for "+username+"...")

    store_models(username, body_vec, body_model, head_vec, head_model)
Example #3
0
def run(input_train, input_test, output_name):
    """
    Takes a file path as input, a file path as output, and produces a sorted csv of
    item IDs for Kaggle submission
    -------
    input_train : 'full path of the training file'
    input_test : 'full path of the testing file'
    output_name : 'full path of the output file'

    The item IDs are written one per line under an ``id`` header, ordered from
    the highest to the lowest classifier score.
    """

    data = pd.read_table(input_train)
    test = pd.read_table(input_test)
    testItemIds = test.itemid
    response = data.is_blocked
    dummies = sparse.csc_matrix(pd.get_dummies(data.subcategory))
    pretestdummies = pd.get_dummies(test.subcategory)
    # These two subcategories are dropped from the test dummies so the column
    # count lines up with the training dummies.
    # NOTE(review): presumably they are absent from the training file -- confirm.
    testdummies = sparse.csc_matrix(pretestdummies.drop(['Растения', 'Товары для компьютера'],axis=1))
    words = np.array(data.description,str)
    testwords = np.array(test.description,str)
    del data, test

    # The vocabulary is built from train and test descriptions together.
    vect = text.CountVectorizer(decode_error = u'ignore', strip_accents='unicode', ngram_range=(1,2))
    corpus = np.concatenate((words, testwords))
    vect.fit(corpus)
    counts = vect.transform(words)
    features = sparse.hstack((dummies,counts))

    clf = RidgeClassifier()
    clf.fit(features, response)

    testcounts = vect.transform(testwords)
    testFeatures = sparse.hstack((testdummies,testcounts))
    # RidgeClassifier does not implement predict_proba; the signed distance
    # returned by decision_function gives the ranking score instead.
    predicted_scores = clf.decision_function(testFeatures)

    # The context manager closes the file even if a write fails.
    with open(output_name, 'w') as f:
        f.write("id\n")
        for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
            f.write("%d\n" % (item_id))
Example #4
0
def validate(input_train, rows=True, test=0.25):
    """
    Takes file as input and returns classification report, average precision, and
    AUC for a bigram model. By default, loads all rows of a dataset, trains on .75,
    and tests on .25.
    ----
    input_train : 'full path of the file you are loading'
    rows : True - loads all rows; insert an int for specific number of rows
    test : float proportion of dataset used for testing

    Returns a tuple ``(report, average_precision, auc)``. The report is built
    from the predicted labels; average precision and AUC are computed from the
    continuous ``decision_function`` scores.
    """
    # Identity check: ``rows == True`` is also true for ``rows=1`` and would
    # silently load the whole file instead of a single row.
    nrows = None if rows is True else rows
    data = pd.read_table(input_train, nrows = nrows)
    response = data.is_blocked
    dummies = sparse.csc_matrix(pd.get_dummies(data.subcategory))
    words = np.array(data.description,str)
    del data

    vect = text.CountVectorizer(decode_error = u'ignore',strip_accents='unicode',ngram_range=(1,2))
    counts = vect.fit_transform(words)
    features = sparse.hstack((dummies,counts))
    features_train, features_test, target_train, target_test = train_test_split(features, response, test_size = test)

    clf = RidgeClassifier()
    clf.fit(features_train, target_train)
    prediction = clf.predict(features_test)
    # Ranking metrics need non-thresholded scores; hard labels collapse the
    # ROC / precision-recall curves to a single operating point.
    scores = clf.decision_function(features_test)
    return (classification_report(target_test, prediction),
            average_precision_score(target_test, scores),
            roc_auc_score(target_test, scores))
    def test_default_configuration_classify(self):
        """Default ExtraTreesPreprocessor + ridge classifier accuracy on digits.

        The check is repeated twice on the dense 'digits' dataset and the
        accuracy is compared to the reference value to two decimal places.
        """
        for _ in range(2):
            dataset = get_dataset(dataset='digits', make_sparse=False)
            X_train, Y_train, X_test, Y_test = dataset

            # Instantiate the preprocessor with every default hyperparameter.
            search_space = ExtraTreesPreprocessor.get_hyperparameter_search_space()
            defaults = search_space.get_default_configuration()
            init_kwargs = {name: defaults[name] for name in defaults}
            transformer = ExtraTreesPreprocessor(random_state=1, **init_kwargs)

            transformer.fit(X_train, Y_train)
            train_features = transformer.transform(X_train)
            test_features = transformer.transform(X_test)

            # Score a ridge classifier trained on the transformed features.
            fitted = RidgeClassifier().fit(train_features, Y_train)
            predicted = fitted.predict(test_features)
            score = sklearn.metrics.accuracy_score(predicted, Y_test)
            self.assertAlmostEqual(score, 0.87310261080752882, places=2)
    def test_default_configuration_classify(self):
        """Default KernelPCA + ridge classifier accuracy on digits.

        The check is repeated five times on the dense 'digits' dataset.
        Hyperparameters whose default is ``None`` are not passed to the
        constructor.
        """
        for _ in range(5):
            dataset = get_dataset(dataset='digits', make_sparse=False)
            X_train, Y_train, X_test, Y_test = dataset

            search_space = KernelPCA.get_hyperparameter_search_space()
            defaults = search_space.get_default_configuration()
            init_kwargs = {}
            for name in defaults:
                if defaults[name] is not None:
                    init_kwargs[name] = defaults[name]
            transformer = KernelPCA(random_state=1, **init_kwargs)

            transformer.fit(X_train, Y_train)
            train_features = transformer.transform(X_train)
            test_features = transformer.transform(X_test)

            # Score a ridge classifier trained on the transformed features.
            fitted = RidgeClassifier().fit(train_features, Y_train)
            predicted = fitted.predict(test_features)
            score = sklearn.metrics.accuracy_score(predicted, Y_test)
            self.assertAlmostEqual(score, 0.096539162112932606)
    def test_default_configuration_classify(self):
        """Default TruncatedSVD + ridge classifier accuracy on sparse digits.

        The check is repeated twice on the sparse 'digits' dataset.
        Hyperparameters whose default is ``None`` are not passed to the
        constructor; accuracy is compared to two decimal places.
        """
        for _ in range(2):
            dataset = get_dataset(dataset='digits', make_sparse=True)
            X_train, Y_train, X_test, Y_test = dataset

            search_space = TruncatedSVD.get_hyperparameter_search_space()
            defaults = search_space.get_default_configuration()
            init_kwargs = {}
            for name in defaults:
                if defaults[name] is not None:
                    init_kwargs[name] = defaults[name]
            transformer = TruncatedSVD(random_state=1, **init_kwargs)

            transformer.fit(X_train, Y_train)
            train_features = transformer.transform(X_train)
            test_features = transformer.transform(X_test)

            # Score a ridge classifier trained on the transformed features.
            fitted = RidgeClassifier().fit(train_features, Y_train)
            predicted = fitted.predict(test_features)
            score = sklearn.metrics.accuracy_score(predicted, Y_test)
            self.assertAlmostEqual(score, 0.44201578627808136, places=2)
def get_optimal_blend_weigth(exp_, best_param_,
                             folder, fname, model_fname):
    """Fit a ridge classifier and persist its weights and the model itself.

    The classifier is configured with ``best_param_`` and fitted on the data
    returned by ``exp_.get_test_data()``.  Its intercept and coefficients are
    written as a single-row CSV, and the fitted estimator is pickled into a
    gzip file.  Both files live under ``<data.path>/<folder>/``.

    Args:
        exp_: object exposing ``get_test_data()`` returning ``(X, y)``; ``X``
            must expose ``columns`` (used for the CSV header).
        best_param_: dict of parameters forwarded to ``set_params``.
        folder: sub-directory of the configured data path.
        fname: file name of the CSV holding the linear weights.
        model_fname: file name of the gzip-pickled model.

    Returns:
        Always ``True``.
    """
    features, target = exp_.get_test_data()
    ridge = RidgeClassifier()
    ridge.set_params(**best_param_)
    ridge.fit(features, target)

    # Single-row table: intercept first, then one column per feature.
    header = np.array(['intercept'], dtype='S100')
    column_names = np.append(header, features.columns.values)
    weights = np.append(ridge.intercept_, ridge.coef_).astype(np.float64)
    weight_row = pd.DataFrame(weights.reshape(1, len(weights)),
                              columns=column_names)
    weights_path = os.path.join(Config.get_string('data.path'), folder, fname)
    weight_row.to_csv(weights_path, index=False)

    # Persist the fitted estimator next to the weights.
    model_path = os.path.join(Config.get_string('data.path'), folder,
                              model_fname)
    with gzip.open(model_path, 'wb') as archive:
        cPickle.dump(ridge, archive, cPickle.HIGHEST_PROTOCOL)

    return True
Example #9
0
def Predict():
    """Fit four classifiers on the training corpus and report each one's rate.

    Reads the module-level ``Corpus_train``, ``Y_train``, ``Corpus_test``,
    ``Y_test`` and ``n_test``; every classifier is fitted, used to predict the
    test corpus, and reported through ``print_rate``.  Returns nothing.
    """
    # The format operator must be applied to the string, not to the result of
    # print(): on Python 3 print() returns None and ``None % n_test`` raises.
    print('\nThere are %d new deals' % n_test)

    # Using the KNN classifier
    # KNN does not work well even when k is tuned (7 and 11 were also tried).
    clf_KNN = KNeighborsClassifier(n_neighbors=3)
    clf_KNN.fit(Corpus_train, Y_train)
    Y_pred_KNN = clf_KNN.predict(Corpus_test)
    print_rate(Y_test, Y_pred_KNN, n_test, 'KNNClassifier')

    # Using the SVM classifier
    clf_SVM = svm.SVC()
    clf_SVM.fit(Corpus_train, Y_train)
    Y_pred_SVM = clf_SVM.predict(Corpus_test)
    print_rate(Y_test, Y_pred_SVM, n_test, 'SVMClassifier')

    # Using the Ridge classifier (tol=0.1 was also tried)
    clf_RC = RidgeClassifier(tol=0.01, solver="lsqr")
    clf_RC.fit(Corpus_train, Y_train)
    Y_pred_RC = clf_RC.predict(Corpus_test)
    print_rate(Y_test, Y_pred_RC, n_test, 'RidgeClassifier')

    # Random Forests and Decision Trees are not considered because they work
    # badly on high-dimensional sparse features.

    # Using the Multinomial Naive Bayes classifier
    # This one is expected to do best since it is designed for occurrence-count
    # features.  Smoothing notes from tuning: 0.01 is worse than 0.1, a large
    # rate (0.3) does not help, 0.2 or 0.05 can generate the best outcome.
    clf_MNB = MultinomialNB(alpha=0.1)
    clf_MNB.fit(Corpus_train, Y_train)
    Y_pred_MNB = clf_MNB.predict(Corpus_test)
    print_rate(Y_test, Y_pred_MNB, n_test, 'MultinomialNBClassifier')
def get_classifier(classifier):
  """Build the estimator described by ``classifier`` and apply its params.

  Args:
    classifier: mapping with a ``"name"`` entry selecting the estimator and a
      ``"params"`` entry (a dict) forwarded to ``set_params``.

  Returns:
    The configured estimator instance.

  Raises:
    NameError: if ``classifier["name"]`` is not one of the known names.
  """
  name = classifier["name"]
  if name == 'linear-ridge':
    estimator = RidgeClassifier()
  elif name == 'SVC':
    estimator = SVC()
  elif name == "l2-SVC":
    estimator = L2KernelClassifier()
  elif name == "fredholm":
    estimator = L2FredholmClassifier()
  elif name == "TSVM":
    estimator = SVMLight()
  elif name == "Lap-RLSC":
    estimator = LapRLSC()
  elif name == "fred_kernel_appr":
    estimator = FredholmKernelApprClassifier()
  else:
    raise NameError('Not existing classifier: ' + name + '.')

  # Hyperparameters are applied after construction.
  estimator.set_params(**classifier["params"])
  return estimator
Example #11
0
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import NearestCentroid

# One instance of every classifier compared below.
lr = LogisticRegression()
svc = SVC(kernel="linear")
tree = DecisionTreeClassifier()
mlp = MLPClassifier()
ridge = RidgeClassifier(tol=1e-2, solver='lsqr', alpha=.5)
sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42,
                    max_iter=100, tol=None)
rf = RandomForestClassifier(max_features=9, n_estimators=100)
# ``n_iter`` was replaced by ``max_iter``/``tol`` (the API already used for
# SGDClassifier above); ``tol=None`` keeps the fixed 50 passes over the data.
percep = Perceptron(max_iter=50, tol=None)
pass_agg = PassiveAggressiveClassifier(max_iter=50, tol=None)
near_cent = NearestCentroid()

from sklearn.metrics import classification_report
# Fit each model on the training split and print its test-set report.
for clf in (lr, svc, tree, mlp, ridge, sgd, rf, percep, pass_agg, near_cent):
    clf.fit(X_train, y_train)
    print('=' * 25 + "    " + clf.__class__.__name__ + "    " + "="*30)
    print(clf.__class__.__name__, classification_report(y_test, clf.predict(X_test)))


Example #12
0
def test_ridge_classifier_no_support_multilabel():
    """Fitting RidgeClassifier on a multilabel target must raise ValueError."""
    features, targets = make_multilabel_classification(n_samples=10,
                                                       random_state=0)
    estimator = RidgeClassifier()
    assert_raises(ValueError, estimator.fit, features, targets)
def text_classify_influence_by_ngram_range():
    """
    Measure the effect of ``ngram_range`` on text classification.

    The first 5000 rows of ``./data/train_set.csv`` are vectorised with TF-IDF
    (at most 2000 features) for each n-gram range in turn.  The first two
    thirds of the sample train a ridge classifier and the remaining third is
    used to compute the macro-averaged F1 score.

    Returns:
        list of macro F1 scores, in the order of the n-gram ranges
        ``(1, 1)``, ``(2, 2)``, ``(3, 3)``, ``(1, 3)``.
    """
    train_df = pd.read_csv('./data/train_set.csv', sep='\t', nrows=15000)
    sample = train_df[0:5000]
    n = int(2 * len(sample) / 3)

    # The label split is the same for every n-gram range.
    labels = sample['label'].values
    train_y = labels[:n]
    test_y = labels[n:]

    f1 = []
    for ngram_range in ((1, 1), (2, 2), (3, 3), (1, 3)):
        # The vectorizer is fitted on the whole sample (train and validation
        # rows together), exactly as in each of the original copies.
        tfidf = TfidfVectorizer(ngram_range=ngram_range, max_features=2000)
        train_test = tfidf.fit_transform(sample['text'])
        train_x = train_test[:n]
        test_x = train_test[n:]

        clf = RidgeClassifier(alpha=0.1, solver='sag')
        clf.fit(train_x, train_y)
        val_pred = clf.predict(test_x)
        f1.append(f1_score(test_y, val_pred, average='macro'))

    # Previously the scores were computed and then discarded.
    return f1
# Cross-validated variants: each grid search tunes the regularisation
# parameter C over the same seven values and selects on the F1 score.
svm_cv = GridSearchCV(SVC(C=1., kernel="linear"),
                      param_grid={'C': [.1, .5, 1., 5., 10., 50., 100.]},
                      scoring='f1', n_jobs=1)

logistic_cv = GridSearchCV(LogisticRegression(C=1., penalty="l1"),
                           param_grid={'C': [.1, .5, 1., 5., 10., 50., 100.]},
                           scoring='f1')
logistic_l2_cv = GridSearchCV(LogisticRegression(C=1., penalty="l2"),
                              param_grid={
                                  'C': [.1, .5, 1., 5., 10., 50., 100.]
                              },
                              scoring='f1')

# The ridge classifier has a specific 'CV' object that can set its
# parameters faster than using a GridSearchCV
ridge = RidgeClassifier()
ridge_cv = RidgeClassifierCV()

# A dictionary, to hold all our classifiers
# NOTE(review): svm, logistic, logistic_50 and logistic_l2 are not defined in
# the visible code -- presumably created earlier in the file; confirm.
classifiers = {'SVC': svm,
               'SVC cv': svm_cv,
               'log l1': logistic,
               'log l1 50': logistic_50,
               'log l1 cv': logistic_cv,
               'log l2': logistic_l2,
               'log l2 cv': logistic_l2_cv,
               'ridge': ridge,
               'ridge cv': ridge_cv
               }

#############################################################################
def model_fit(X, y, test_size=0.5, alpha_low=-6, alpha_high=6,
              n_steps=25, cv=4, plot_figures=False):
    """Grid-search the ridge ``alpha`` on a stratified split of ``X``/``y``.

    ``X`` is flattened to two dimensions and min-max scaled to [0, 1] before
    the split.  ``alpha`` candidates are ``n_steps`` log-spaced values between
    ``10**alpha_low`` and ``10**alpha_high``.

    Args:
        X: array-like exposing ``reshape``; one sample per first-axis entry.
        y: targets, also used to stratify the split.
        test_size: share of the samples held out as test set.
        alpha_low: base-10 exponent of the smallest ``alpha``.
        alpha_high: base-10 exponent of the largest ``alpha``.
        n_steps: number of ``alpha`` candidates.
        cv: cross-validation setting forwarded to ``GridSearchCV``.
        plot_figures: plot the train/validation curves when true, otherwise
            print a one-line summary.

    Returns:
        dict with the best estimator (``'model'``), its cross-validation score
        (``'best_score'``), the train/test arrays and the original sample
        indices of both splits.
    """
    # Flatten every sample and bring all features into [0, 1].
    flat = X.reshape((len(X), -1))
    flat = MinMaxScaler(feature_range=(0, 1)).fit_transform(flat)
    sample_ids = list(range(len(flat)))

    # The index list is split alongside the data so callers can map rows back.
    x_train, x_test, y_train, y_test, idx_train, idx_test = train_test_split(
        flat, y, sample_ids, test_size=test_size, random_state=0, stratify=y)

    alphas = np.logspace(alpha_low, alpha_high, num=n_steps)
    search = GridSearchCV(estimator=RidgeClassifier(class_weight='balanced'),
                          param_grid={'alpha': alphas},
                          cv=cv, return_train_score=True,
                          n_jobs=-1, verbose=1)

    # Warnings raised while fitting are silenced; the fit is timed.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        started = time.time()
        search.fit(x_train, y_train)
        comp_time_total = time.time() - started

    if not plot_figures:
        # Provide written performance feedback
        summary = ('Model trained for {:.2f}s total '
                   'and reached an accuracy of: {:.2f}%').format(
                       comp_time_total, search.best_score_ * 100)
        time.sleep(0.25)
        print(summary)
    else:
        # Mean and spread of the scores, expressed in percent.
        train_mean = 100 * search.cv_results_['mean_train_score']
        valid_mean = 100 * search.cv_results_['mean_test_score']
        train_std = 100 * search.cv_results_['std_train_score']
        valid_std = 100 * search.cv_results_['std_test_score']

        plt.figure(figsize=(10, 5))
        plt.semilogx(alphas, train_mean, label='Training Set')
        plt.semilogx(alphas, valid_mean, label='Validation Set')

        # Mark the best validation score and annotate it.
        best_alpha = search.best_params_['alpha']
        best_pct = 100 * search.best_score_
        plt.scatter(best_alpha, best_pct, marker='x', c='red', zorder=10)
        plt.text(best_alpha, best_pct - 7.5, '{:0.2f}%'.format(best_pct),
                 fontdict={'size': 18})

        # Shade one standard deviation around each curve.
        for mean, std in ((train_mean, train_std), (valid_mean, valid_std)):
            plt.fill_between(alphas, mean - std, mean + std, alpha=0.3)
        plt.title('Model Performance')
        plt.ylabel('Classification Accuracy [%]')
        plt.xlabel('Model Parameter [alpha]')

        # Axis limits, legend and layout.
        plt.xlim(10**alpha_low, 10**alpha_high)
        plt.ylim(15, 105)
        plt.legend()
        plt.tight_layout()
        plt.show()

    # Bundle the fitted model with the data it was trained/evaluated on.
    return {'model': search.best_estimator_,
            'best_score': search.best_score_,
            'x_train': x_train,
            'x_test': x_test,
            'y_train': y_train,
            'y_test': y_test,
            'idx_train': idx_train,
            'idx_test': idx_test}
Example #16
0
# Each classifier below is fitted on Corpus_train / Y_train; KNN, SVM and
# Ridge are then used to predict Corpus_test and reported through print_rate.

# Using the KNN classifier
clf_KNN = KNeighborsClassifier(n_neighbors=3) # KNN does not work well even if k has been tuned
#clf_KNN = KNeighborsClassifier(n_neighbors=7)
#clf_KNN = KNeighborsClassifier(n_neighbors=11)
clf_KNN.fit(Corpus_train, Y_train)
Y_pred_KNN = clf_KNN.predict(Corpus_test)
print_rate(Y_test, Y_pred_KNN, n_test, 'KNNClassifier')

# Using the SVM classifier
clf_SVM = svm.SVC()
clf_SVM.fit(Corpus_train, Y_train)
Y_pred_SVM = clf_SVM.predict(Corpus_test)
print_rate(Y_test, Y_pred_SVM, n_test, 'SVMClassifier')

# Using the Ridge classifier
clf_RC = RidgeClassifier(tol=0.01, solver="lsqr")
#clf_RC = RidgeClassifier(tol=0.1, solver="lsqr")
clf_RC.fit(Corpus_train, Y_train)
Y_pred_RC = clf_RC.predict(Corpus_test)
print_rate(Y_test, Y_pred_RC, n_test, 'RidgeClassifier')

# Random Forests and Decision Trees are not considered because they work badly on high-dimensional sparse features


# Using the Multinomial Naive Bayes classifier
# This MNB classifier is expected to do best since it is designed for occurrence-count features
#clf_MNB = MultinomialNB(alpha=0.01) # smoothing parameter = 0.01 is worse than 0.1
clf_MNB = MultinomialNB(alpha=0.1)
#clf_MNB = MultinomialNB(alpha=0.3) # a large smoothing rate does not benefit the model
#clf_MNB = MultinomialNB(alpha=0.2) # or alpha = 0.05 can generate the best outcome
clf_MNB.fit(Corpus_train, Y_train)
# Keep a random sample of 6684 non-lemon rows and stack them under the lemons.
# NOTE(review): presumably 6684 matches the number of lemon rows -- confirm.
non_lemons = non_lemons.ix[random.sample(non_lemons.index, 6684)]
train = lemons.append(non_lemons)

#X = train.drop(['RefId','IsBadBuy','VehYear','Make','Model','Trim','SubModel','WheelTypeID','BYRNO'],axis=1)
#y = pd.Series(train['IsBadBuy']).values

# Target is the IsBadBuy column; identifier-like columns are dropped from the
# feature table.
target = pd.Series(train['IsBadBuy']).values
data = train.drop(['RefId','IsBadBuy','VehYear','Make','Model','Trim','SubModel','WheelTypeID','BYRNO'],axis=1)

# Hold out 30% of the rows as test set.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(data,target, test_size=.3)



# Subset the data so we have a more even data set

# NOTE(review): X and y are not defined in the visible code (their definitions
# above are commented out) -- confirm they exist earlier in the file, or
# switch to data/target or x_train/y_train.
model = RidgeClassifier()
clf = model.fit(X,y)
Ridg_Class = clf.predict(X)
clf.score(X,y)

metrics.confusion_matrix(y, clf.predict(X))
# print as a function call: valid on both Python 2 and Python 3.
print(metrics.classification_report(y, clf.predict(X)))


# GradientBoostingClassifier

from sklearn.ensemble import *
model = GradientBoostingClassifier()

# Train
clf = model.fit(x_train, y_train)
Example #18
0
        print(
            metrics.classification_report(y_test,
                                          pred,
                                          target_names=target_names))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


# Benchmark results, one entry per classifier, in the order they are run.
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(), "Random forest"),
):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

# Linear SVM, once per penalty type.
for penalty in ("l2", "l1"):
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(
        benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))
# Training and testing SGDClassifier
classifier_1 = SGDClassifier()
classifier_1.fit(X_train_vec, y_train_enc)
# Encoded labels are used here as well, consistent with classifier_2 below.
print(cross_val_score(classifier_1, X_test_vec, y_test_enc, cv=3))
y_pred = cross_val_predict(classifier_1, X_train_vec, y_train_enc)
print('f1_score for classifier_1: ', f1_score(y_train_enc, y_pred))
print('precision for classifier_1: ', precision_score(y_train_enc, y_pred))
print('recall for classifier_1: ', recall_score(y_train_enc, y_pred))
# A precision/recall curve needs continuous decision scores; hard class
# predictions only produce a degenerate curve with one usable threshold.
y_scores = cross_val_predict(classifier_1, X_train_vec, y_train_enc,
                             method='decision_function')
precisions, recalls, threshholds = precision_recall_curve(y_train_enc, y_scores)
plt.plot(threshholds, precisions[:-1], 'b--', label='precision')
plt.plot(threshholds, recalls[:-1], 'g-', label='recall')
plt.ylim([0, 1])
plt.legend()
plt.show()

# Training and testing RidgeClassifier
classifier_2 = RidgeClassifier()
classifier_2.fit(X_train_vec, y_train_enc)
print(cross_val_score(classifier_2, X_test_vec, y_test_enc, cv=3))
y_pred = cross_val_predict(classifier_2, X_train_vec, y_train_enc)
print('f1_score for classifier_2: ', f1_score(y_train_enc, y_pred))
print('precision for classifier_2: ', precision_score(y_train_enc, y_pred))
print('recall for classifier_2: ', recall_score(y_train_enc, y_pred))
# Same as above: the curve is drawn from decision scores, not class labels.
y_scores = cross_val_predict(classifier_2, X_train_vec, y_train_enc,
                             method='decision_function')
precisions, recalls, threshholds = precision_recall_curve(y_train_enc, y_scores)
plt.plot(threshholds, precisions[:-1], 'b--', label='precision')
plt.plot(threshholds, recalls[:-1], 'g-', label='recall')
plt.ylim([0, 1])
plt.legend()
plt.show()

# Training and testing RandomForestClassifier
classifier_3 = RandomForestClassifier()
class EmotionsClassifier():
    """Four-class mood classifier built on valence/arousal annotations.

    Two models are available: a bag-of-words ``RidgeClassifier``
    (``make_classifier`` / ``run_classifier``) and a Keras
    embedding + Conv1D + LSTM network (``make_neural_net`` /
    ``run_neural_network``).  Both are trained on the ``processed_ru`` column
    of ``emo_bank_ru.csv``; the class id is derived from the ``Val.W`` and
    ``Aro.W`` columns.
    """

    # Human-readable label of each class id assigned in __apply_class2mood.
    _CLASS_NAMES = {
        0: 'excited,delighted,aroused,astonished',
        1: 'calm,relaxed,content, friendly',
        2: 'angry annoyed, frustrated, disguted',
        3: 'depressed, bored, sad, gloomy'
    }

    def __init__(self):
        # Length every token sequence is padded/truncated to for the network.
        self.__max_length = 128

    def __apply_class2mood(self):
        """Add the ``multiclass`` column from the binary valence/arousal pair."""
        class2mood = {
            (1, 1): 0,  #'excited,delighted,aroused,astonished',
            (1, 0): 1,  #'calm,relaxed,content, friendly',
            (0, 1): 2,  #'angry annoyed, frustrated, disguted',
            (0, 0): 3  #'depressed, bored, sad, gloomy'
        }
        res = []
        for idx, row in self.__corpus.iterrows():
            val = row['val_class']
            aro = row['aro_class']
            res.append(class2mood[(val, aro)])
        self.__corpus['multiclass'] = res

    def __load_corpus(self, filepath):
        """Read the CSV at ``filepath``, drop rows with missing values and label it.

        Valence and arousal are binarised with a threshold of 3.0
        (strictly greater means 1).
        """
        self.__corpus = pd.read_csv(filepath).dropna(axis=0)
        self.__corpus['val_class'] = self.__corpus['Val.W'].apply(
            lambda x: 1 if x > 3.0 else 0)
        self.__corpus['aro_class'] = self.__corpus['Aro.W'].apply(
            lambda x: 1 if x > 3.0 else 0)
        self.__apply_class2mood()

    def __vectorize(self, vect_type):
        """Fit the vectorizer selected by ``vect_type`` on the corpus text."""
        vd = {
            vect_types.TF_IDF: TfidfVectorizer(ngram_range=(1, 3), min_df=3),
            vect_types.COUNT: CountVectorizer(ngram_range=(1, 3), min_df=10)
        }
        self.__vect = vd[vect_type].fit(self.__corpus.processed_ru)

    def __transform_data(self, transform_type):
        """Vectorise the corpus and return ``(features, one-hot labels)``.

        The features and the integer labels are also stored on the instance
        for ``__fit_classifier``.
        """
        # NOTE(review): a vectorizer is only fitted for transform_types.FREQ;
        # any other value relies on one fitted by an earlier call -- confirm
        # whether a TF-IDF branch is missing here.
        if transform_type is transform_types.FREQ:
            self.__vectorize(vect_types.COUNT)
        self.__feats = self.__vect.transform(self.__corpus.processed_ru)
        self.__labels = self.__corpus.multiclass
        return self.__feats, to_categorical(self.__corpus.multiclass)

    def __eval_model(self, y_train, y_test, y_train_pred, y_test_pred):
        """Print classification reports for the train and the test split."""
        # Labels listed in class-id order, as classification_report expects.
        class_names = [self._CLASS_NAMES[k] for k in sorted(self._CLASS_NAMES)]
        print('train scores\n')
        print(
            classification_report(y_train,
                                  y_train_pred,
                                  target_names=class_names))
        print('test scores\n')
        print(
            classification_report(y_test,
                                  y_test_pred,
                                  target_names=class_names))

    def __fit_classifier(self):
        """Fit the ridge classifier on an 80/20 split and print its scores."""
        X_train, X_test, y_train, y_test = train_test_split(self.__feats,
                                                            self.__labels,
                                                            test_size=0.2)
        self.__model = RidgeClassifier(alpha=100, class_weight='balanced').fit(
            X_train, y_train)
        y_train_pred = self.__model.predict(X_train)
        y_test_pred = self.__model.predict(X_test)
        self.__eval_model(y_train, y_test, y_train_pred, y_test_pred)

    def make_classifier(self):
        """Load the corpus, build count features and train the ridge model."""
        self.__load_corpus('emo_bank_ru.csv')
        self.__transform_data(transform_types.FREQ)
        self.__fit_classifier()

    def __make_sequences(self, max_length):
        """Tokenise the corpus into padded integer sequences.

        Stores the fitted tokenizer and the vocabulary size on the instance
        and returns ``(padded sequences, one-hot labels)``.
        """
        t = Tokenizer()
        t.fit_on_texts(self.__corpus.processed_ru.tolist())
        # +1: size passed to the embedding layer as its input dimension.
        self.__vocab_size = len(t.word_index) + 1
        encoded_docs = t.texts_to_sequences(self.__corpus.processed_ru)
        feats = sequence.pad_sequences(encoded_docs, maxlen=max_length)
        self.__tokenizer = t
        return feats, to_categorical(self.__corpus.multiclass)

    def __make_single_sequence(self, text, max_length):
        """Encode one text with the stored tokenizer and pad it."""
        encoded_doc = self.__tokenizer.texts_to_sequences([text])
        feats = sequence.pad_sequences(encoded_doc, maxlen=max_length)
        return feats

    def __create_net(self, max_length):
        """Build and compile the embedding -> Conv1D -> LSTM -> softmax net."""
        model = Sequential()
        model.add(
            Embedding(self.__vocab_size,
                      100,
                      input_length=max_length,
                      embeddings_regularizer=regularizers.l2(1e-5)))
        model.add(Dropout(0.7))
        model.add(
            Conv1D(filters=50,
                   kernel_size=3,
                   padding='same',
                   activation='sigmoid'))
        model.add(MaxPooling1D(pool_size=10))
        model.add(Dropout(0.4))
        model.add(LSTM(25, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(4, activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['categorical_accuracy', self.__top_2_acc])
        self.__nnet = model

    def __prepare_net_data(self, preprocess_option, max_length):
        """Build the network inputs/targets for the requested preprocessing."""
        # Lazy dispatch: only the selected pipeline runs.  Calling both while
        # building the table tokenised AND vectorised the corpus every time.
        preprocess = {
            preprocess_options.SEQ:
                lambda: self.__make_sequences(max_length),
            preprocess_options.BOW:
                lambda: self.__transform_data(transform_types.FREQ)
        }
        self.__netin, self.__netout = preprocess[preprocess_option]()

    def __top_2_acc(self, y_true, y_pred):
        """Top-2 categorical accuracy, used as an extra Keras metric."""
        return top_k_categorical_accuracy(y_true, y_pred, k=2)

    def __train_net(self):
        """Train the network on a 90/10 split with balanced class weights.

        The best weights (lowest validation loss) are checkpointed to
        ``checkpoint.hdf5``.
        """
        X_train, X_test, y_train, y_test = train_test_split(self.__netin,
                                                            self.__netout,
                                                            test_size=0.1)
        classes = [0, 1, 2, 3]
        # Keyword arguments work across scikit-learn versions.
        weights = compute_class_weight(
            class_weight='balanced',
            classes=np.array(classes),
            y=self.__corpus.multiclass.apply(int).tolist())
        # Keras expects a {class index: weight} mapping, not a bare array.
        class_weight = {cls: float(w) for cls, w in zip(classes, weights)}
        checkpointer = ModelCheckpoint(filepath='checkpoint.hdf5',
                                       verbose=1,
                                       save_best_only=True,
                                       monitor='val_loss')
        self.__nnet.fit(X_train,
                        y_train,
                        epochs=100,
                        batch_size=64,
                        validation_data=(X_test, y_test),
                        callbacks=[checkpointer],
                        class_weight=class_weight)

    def make_neural_net(self):
        """Load the corpus, build sequences, create and train the network."""
        self.__load_corpus('emo_bank_ru.csv')
        self.__prepare_net_data(preprocess_options.SEQ, self.__max_length)
        self.__create_net(self.__max_length)
        self.__train_net()

    def __clean_text(self, text):
        """Strip Latin letters, digits 1-9 and punctuation, then lemmatise.

        Tokens of two characters or fewer are dropped; the lemmas are joined
        with single spaces.
        """
        # A MorphAnalyzer is created on every call.
        morph = pymorphy2.MorphAnalyzer()
        text = re.sub(r'[1-9a-zA-Z\^\*\/\$\@\_\"\\n\)\(\.\,\:\;\!\[\]]', ' ',
                      text)
        tokens = [
            morph.parse(w)[0].normal_form for w in
            gensim.utils.simple_preprocess(text, deacc=True, min_len=1)
            if len(w) > 2
        ]
        return ' '.join(tokens)

    def __transform_cleaned_text(self, cleaned_text):
        """Vectorise already-cleaned texts with the fitted vectorizer."""
        return self.__vect.transform(cleaned_text)

    def run_classifier(self, text):
        """Classify ``text`` with the ridge model; print and return the label."""
        class_names = self._CLASS_NAMES
        cleaned_text = self.__clean_text(text)
        feats = self.__transform_cleaned_text([cleaned_text])
        pred_class = self.__model.predict(feats)
        print('text: %s\nclassified as %s' %
              (text, class_names[pred_class[0]]))
        return class_names[pred_class[0]]

    def run_neural_network(self, text):
        """Classify ``text`` with the network; print and return the label."""
        class_names = self._CLASS_NAMES
        cleaned_text = self.__clean_text(text)
        feats = self.__make_single_sequence(cleaned_text, self.__max_length)
        pred_class = self.__nnet.predict(feats)
        print('text: %s\nclassified as %s' %
              (text, class_names[np.argmax(pred_class[0])]))
        return class_names[np.argmax(pred_class[0])]

    def save_neural_net(self, filename):
        """Save architecture (.json), weights (.h5) and tokenizer (.pkl)."""
        net_json = self.__nnet.to_json()
        with open(filename + '.json', "w") as json_file:
            json_file.write(net_json)
        self.__nnet.save_weights(filename + '.h5')
        with open(filename + '_tokenizer.pkl', 'wb') as output:
            pickle.dump(self.__tokenizer, output, pickle.HIGHEST_PROTOCOL)

    def load_neural_net(self, filename):
        """Restore the network and tokenizer written by ``save_neural_net``.

        The tokenizer is unpickled, so only load files from a trusted source.
        """
        # The context manager closes the file even if reading fails.
        with open(filename + '.json', 'r') as json_file:
            loaded_model_json = json_file.read()
        self.__nnet = model_from_json(loaded_model_json)
        self.__nnet.load_weights(filename + ".h5")
        with open(filename + '_tokenizer.pkl', 'rb') as inp:
            self.__tokenizer = pickle.load(inp)

    def load_checkpoint(self):
        """Load the weights stored in ``checkpoint.hdf5`` into the network."""
        self.__nnet.load_weights('checkpoint.hdf5')
Example #21
0
# Written by John Tindel                                                                    #
# Further information on the models presented here can be found in modelDocumentation.ipynb #
#############################################################################################

from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

# Pre-configured, unfitted estimator instances. All model hyper-parameters
# used by this module are set in this section.
# NOTE(review): several estimators below pass `n_iter`; newer scikit-learn
# releases appear to use `max_iter` instead -- confirm against the installed
# version before upgrading.
ridge = RidgeClassifier(alpha=2, solver="sag")
logit = LogisticRegression(solver="sag")
perceptron = Perceptron(n_iter=50)
passiveAggressive = PassiveAggressiveClassifier(n_iter=20, loss='hinge')
knn = KNeighborsClassifier(n_neighbors=5)
nearestCentroid = NearestCentroid()
# Linear SVMs with squared hinge loss, L1- and L2-regularised respectively.
L1SVC = LinearSVC(loss='squared_hinge', penalty='l1', dual=False)
L2SVC = LinearSVC(loss='squared_hinge', penalty='l2', dual=False)
# NOTE(review): the penalty is spelled in upper case here ('L1'/'L2') but in
# lower case for LinearSVC above -- verify SGDClassifier accepts this spelling.
L1SGD = SGDClassifier(alpha=.0001, n_iter=10, penalty='L1')
L2SGD = SGDClassifier(alpha=.0001, n_iter=10, penalty='L2')
elasticNet = SGDClassifier(alpha=.0001, n_iter=175, penalty="elasticnet")
# Naive Bayes variants.
MNB = MultinomialNB(alpha=.01)
BNB = BernoulliNB(binarize=.01, alpha=.01)
pipeline = Pipeline([
    ('feature_selection',
     SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))),
Example #22
0
    clf.fit(X_train, y_train)

    print(f"""       Results
~~~~~~~~~~~~~~~~~~~~~
Train Score: {clf.score(X_train, y_train):.2f}
---
Test Score: {clf.score(X_test, y_test):.2f}
Best Parameters:
{clf.best_params_}
""")

    return clf


# Shared random_state passed to every estimator below that is given one.
seed = 6

# Short display name -> unfitted estimator instance.
# Estimators constructed without random_state=seed (LogReg, KNN, Gaussian,
# Multinomial, LDA) use their library defaults.
models = {
    'LogReg': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'DT': DecisionTreeClassifier(random_state=seed),
    'Gaussian': GaussianNB(),
    'Multinomial': MultinomialNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'LinearSVC': LinearSVC(max_iter=1250, random_state=seed),
    'SGD': SGDClassifier(random_state=seed),
    'ADA': AdaBoostClassifier(random_state=seed),
    'Bagging': BaggingClassifier(random_state=seed),
    'Ridge': RidgeClassifier(random_state=seed),
    'RF': RandomForestClassifier(random_state=seed),
    'GradientBoost': GradientBoostingClassifier(random_state=seed)
}
Example #23
0
from env_stocktrading import StockTradingEnv

# Cross-validation splitter configured with four splits.
tscv = TimeSeriesSplit(n_splits=4)

# Short name -> unfitted classifier instance (insertion order preserved).
classifiers = {
    "LR": LogisticRegression(solver='liblinear'),
    "LDA": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "AdaBoost": AdaBoostClassifier(),
    "Bagging": BaggingClassifier(),
    "ETE": ExtraTreesClassifier(),
    "GB": GradientBoostingClassifier(),
    "RF": RandomForestClassifier(),
    "RidgeC": RidgeClassifier(),
    "SGD": SGDClassifier(),
    "BNB": BernoulliNB(),
    "GNB": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "MLP": MLPClassifier(),
    "NuSVC": NuSVC(probability=True, kernel='rbf', nu=0.01),
    "SVC": SVC(C=0.025, probability=True),
    "DTC": DecisionTreeClassifier(),
    "ETC": ExtraTreeClassifier(),
    "XGB": XGBClassifier(),
}

# Hyper-parameter grids keyed by the same short names as `classifiers`.
# Each parameter must be connected to the named pipeline step with a double
# underscore, e.g. "classifier__C".
parameters = {
    "LR": {
        "classifier__C": [0.1, 0.5, 1, 5, 10, 50, 80, 100],
    },
}
def classify(granularity=10):
    """Train a RidgeClassifier on clustered text data and report distance errors.

    Training documents are loaded from
    ``<GEOTEXT_HOME>/processed_data/<granularity>_clustered/`` and test
    documents from ``<GEOTEXT_HOME>/processed_data/test``.  Documents are
    TF-IDF vectorised, a ``RidgeClassifier`` is fitted, and every test
    document is assigned a class.  For each test document the true
    coordinates are looked up in ``userLocation`` (keyed by the file's base
    name, value ``"lat,lon"``) and compared with the predicted class's median
    and mean coordinates.  Progress, timings and the average/median distances
    are printed, and a scatter/bar plot of distance against confidence is
    saved to ``<GEOTEXT_HOME>/confidence.png``.

    Args:
        granularity: selects the ``<granularity>_clustered`` training directory.

    Returns:
        None.
    """
    trainDir = path.join(GEOTEXT_HOME, 'processed_data/' + str(granularity).strip() + '_clustered/')
    testDir = path.join(GEOTEXT_HOME, 'processed_data/test')
    data_train = load_files(trainDir, encoding=encoding)
    data_test = load_files(testDir, encoding=encoding)

    categories = data_train.target_names

    def size_mb(docs):
        # Total encoded size of the documents, in megabytes (1e6 bytes).
        return sum(len(s.encode(encoding)) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()

    # Class labels for the training and test sets.
    y_train = data_train.target
    y_test = data_test.target

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    vectorizer = TfidfVectorizer(use_idf=True, norm='l2', binary=False, sublinear_tf=True, min_df=2, max_df=1.0, ngram_range=(1, 1), stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # Optional chi-squared feature selection; disabled by default.
    chi = False
    if chi:
        k = 500000
        # Report the number of features actually requested (was hard-coded to 0).
        print("Extracting %d best features by a chi-squared test" % k)
        t0 = time()
        ch2 = SelectKBest(chi2, k=k)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)

        print("done in %fs" % (time() - t0))
        print()

    feature_names = np.asarray(vectorizer.get_feature_names())
    # clf = LinearSVC(loss='l2', penalty='l2', dual=True, tol=1e-3)
    clf = RidgeClassifier(tol=1e-2, solver="auto")
    print('_' * 80)
    print("Training: ")
    print(clf)

    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    scores = clf.decision_function(X_test)
    # Parenthesised so the statements parse on Python 2 and Python 3 alike.
    print(scores.shape)
    print(pred.shape)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    # score = metrics.f1_score(y_test, pred)
    # print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print("top 10 keywords per class:")
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print("%s: %s" % (category, " ".join(feature_names[top10])))

    sumMeanDistance = 0
    sumMedianDistance = 0
    distances = []
    confidences = []
    randomConfidences = []

    for i, predicted in enumerate(pred):
        user = path.basename(data_test.filenames[i])
        location = userLocation[user].split(',')
        lat = float(location[0])
        lon = float(location[1])
        prediction = categories[predicted]
        # Confidence: margin of the chosen class's score over the row average.
        confidence = scores[i][predicted] - mean(scores[i])
        randomConfidence = scores[i][random.randint(0, len(categories) - 1)]
        confidences.append(confidence)
        randomConfidences.append(randomConfidence)
        medianlat = classLatMedian[prediction]
        medianlon = classLonMedian[prediction]
        meanlat = classLatMean[prediction]
        meanlon = classLonMean[prediction]
        # Computed once and reused for both the list and the running sum.
        medianDistance = distance(lat, lon, medianlat, medianlon)
        distances.append(medianDistance)
        sumMedianDistance = sumMedianDistance + medianDistance
        sumMeanDistance = sumMeanDistance + distance(lat, lon, meanlat, meanlon)
    averageMeanDistance = sumMeanDistance / float(len(pred))
    averageMedianDistance = sumMedianDistance / float(len(pred))
    print("Average mean distance is " + str(averageMeanDistance))
    print("Average median distance is " + str(averageMedianDistance))
    print("Median distance is " + str(median(distances)))
    fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)

    plt.xlim(0, 4000)
    plt.ylim(0, 2)
    ax1.scatter(distances, confidences)
    ax2.bar(distances, confidences)
    plt.savefig(path.join(GEOTEXT_HOME, 'confidence.png'))
Example #25
0
    def get_estimator(self):
        """Build the model named by ``kwargs['estimator']``.

        The estimator name defaults to ``self.ESTIMATOR``.  Branches that have
        a tuning grid also set ``self.model_params`` to that grid as a side
        effect; the "GaussianNB", "Dummy" and "Sequential" branches leave
        ``self.model_params`` untouched.  An unrecognised name falls back to
        ``LogisticRegression`` with a small ``C`` grid.

        Any ``kwargs['estimator_params']`` are applied through ``set_params``
        for every estimator except "Sequential".

        Returns:
            The constructed model (not fitted here).
        """
        estimator = self.kwargs.get("estimator", self.ESTIMATOR)
        # self.mlflow_log_param("model", estimator)
        # added both regressions for predicting scores and classifier for match outcomes
        # elif estimator == 'Linear':
        #     model = LinearRegression()
        # elif estimator == 'RandomForestRegressor':
        #     model = RandomForestRegressor()
        # elif estimator == 'Lasso':
        #     model = Lasso()
        # elif estimator == "Ridge":
        #     model = Ridge()
        # elif estimator == "GBM":
        #     model = GradientBoostingRegressor()
        # elif estimator == "KNNRegressor":
        #     model = KNeighborsRegressor()
        if estimator == 'GaussianNB':  # No proba parameter needed
            model = GaussianNB()
        # elif estimator == 'LDA':
        #     self.model_params = {'solver': ['lsqr','eigen'],  #note svd does not run with shrinkage and models using it will be tuned separately
        #                           'n_components': [1.0,2.0,3.0,4.0,5.0]}
        #     model = LinearDiscriminantAnalysis()
        # elif estimator == "xgboost":
        #     model = XGBRegressor()
        # classification models
        # This must be `elif`: as a separate `if`, the chain's final `else`
        # replaced the GaussianNB model above with LogisticRegression.
        elif estimator == 'Logistic':  # No proba parameter needed
            self.model_params = {'C': np.arange(0.001, 1000)}
            #model = LogisticRegression(C=20.000999999999998)
            model = LogisticRegression()
        # elif estimator == 'LDA':
        #     model = LinearDiscriminantAnalysis()
        elif estimator == 'RandomForestClassifier':  # No proba parameter needed
            self.model_params = {
                'bootstrap': [True, False],
                'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                'max_features': ['auto', 'sqrt'],
                'min_samples_leaf': [1, 2, 4],
                'min_samples_split': [2, 5, 10],
                'n_estimators':
                [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
            }
            #model = RandomForestClassifier(n_estimators=1800, n_jobs=-1,max_depth=100,min_samples_split=5,bootstrap=False)
            model = RandomForestClassifier()
        elif estimator == "RidgeClassifier":  # No predict_proba
            self.model_params = {"alpha": np.arange(0.001, 1000)}
            model = RidgeClassifier(alpha=106.00099999999999)
            # model = RidgeClassifier()
            # model = GridSearchCV(estimator=grid, param_grid=dict(alpha=alphas))
        elif estimator == "KNNClassifier":  # No Proba parameter needed
            self.model_params = {
                "leaf_size": range(1, 1000),
                "n_neighbors": range(1, 1000),
                "p": [1.0, 2.0]
            }
            #model = KNeighborsClassifier(leaf_size=336,n_neighbors=913,p=2.0) #positive results
            model = KNeighborsClassifier()
            # model = GridSearchCV(knn, hyperparameters, cv=10)
        elif estimator == "XGBClassifier":  # Proba: Returns array with the probability of each data example being of a given class.
            self.model_params = {
                'max_depth': range(2, 20, 2),
                'n_estimators': range(60, 220, 40),
                'learning_rate': [0.3, 0.1, 0.01, 0.05],
                'min_child_weight': [1.0, 3.0, 5.0],
                'gamma': [1.0, 3.0, 5.0]
            }
            #model = XGBClassifier(max_depth=14,n_estimators=60,learning_rate=0.1,min_child_weight=1.0,gamma=5.0) #positive results
            # model = XGBClassifier(max_depth=18,n_estimators=60,learning_rate=0.05,min_child_weight=5,gamma=3.0) #positive results
            model = XGBClassifier()
            # model = GridSearchCV(XGB, param_grid=params_1, cv=5)
        elif estimator == "Dummy":
            model = DummyClassifier(strategy='uniform', random_state=15)
        elif estimator == "SVC":
            self.model_params = {
                'C': [0.1, 1, 10, 100, 1000],
                'gamma': [0.01, 0.001],
                'kernel': ['rbf', 'poly', 'sigmoid']
            }
            # model = SVC(kernel='sigmoid', C=80,gamma=0.001,probability=True)
            model = SVC(probability=True)

        elif estimator == "Sequential":
            model = Sequential()
            model.add(Flatten())
            model.add(BatchNormalization())
            model.add(Dense(32, activation='relu'))
            model.add(Dense(32, activation='relu'))
            model.add(Dense(16, activation='relu'))
            model.add(
                Dense(8,
                      kernel_regularizer=regularizers.l2(0.003),
                      activation='relu',
                      input_shape=(10000, )))
            model.add(
                Dense(8,
                      kernel_regularizer=regularizers.l2(0.003),
                      activation='relu'))
            model.add(Dense(1, activation='sigmoid'))
            # model.add(SimpleRNN(1, input_shape=[None, 1], activation='tanh'))
            model.compile(loss='binary_crossentropy',
                          optimizer='Adam',
                          metrics=['accuracy'])

        else:
            # Unknown estimator name: fall back to logistic regression.
            self.model_params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
            model = LogisticRegression()

        estimator_params = self.kwargs.get("estimator_params", {})
        # The "Sequential" branch is excluded from set_params().
        if estimator != "Sequential":
            model.set_params(**estimator_params)
        return model
Example #26
0
                    fit_intercept=True,
                    intercept_scaling=1,
                    max_iter=100,
                    multi_class='ovr',
                    n_jobs=1,
                    penalty='l2',
                    random_state=None,
                    solver='newton-cg',
                    tol=0.0001,
                    verbose=0,
                    warm_start=False),
 RidgeClassifier(alpha=1.0,
                 class_weight=None,
                 copy_X=True,
                 fit_intercept=True,
                 max_iter=None,
                 normalize=False,
                 random_state=None,
                 solver='auto',
                 tol=0.001),
 RandomForestClassifier(bootstrap=True,
                        class_weight=None,
                        criterion='gini',
                        max_depth=15,
                        max_features='sqrt',
                        max_leaf_nodes=None,
                        min_impurity_decrease=0.0,
                        min_impurity_split=None,
                        min_samples_leaf=1,
                        min_samples_split=10,
                        min_weight_fraction_leaf=0.0,
Example #27
0
import pandas as pd
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate, GridSearchCV

from preprocessing import *
from constants import *
from utils import BaselineClassifierTitanic, TitanicNNClassifier

# Display name -> unfitted model instance to evaluate.
# NOTE(review): the "Linear Regression" label maps to LogisticRegression and
# "Ridge Regression" to RidgeClassifier -- labels look like misnomers; confirm
# before renaming, since the keys are also used as DataFrame index values below.
models = {
    "Baseline": BaselineClassifierTitanic(),
    "Linear Regression": LogisticRegression(),
    "Ridge Regression": RidgeClassifier(),
    "K Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Neural Network": MLPClassifier(),
    "Custom ANN": TitanicNNClassifier()
}

# Result tables: one row per model name, columns named by the AVERAGE and
# PCT_STANDARD_DEVIATION constants.
train_accuracies = pd.DataFrame(index=models.keys(),
                                columns=[AVERAGE, PCT_STANDARD_DEVIATION])
test_accuracies = pd.DataFrame(index=models.keys(),
                               columns=[AVERAGE, PCT_STANDARD_DEVIATION])

for model_name, model in models.items():
Example #28
0
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    """Return an estimator for ``model_name`` with default parameters applied.

    Stock parameters for the model are looked up, reduced for hyper-parameter
    search where applicable, then overridden by ``training_params``.  The
    resulting parameters are applied to a fresh estimator via ``set_params``.

    Args:
        model_name: key identifying the estimator (e.g. ``'RidgeClassifier'``).
            XGBoost, LightGBM and CatBoost names are only available when the
            corresponding ``*_installed`` flag is set; ``DeepLearning*`` names
            trigger a lazy import of Keras.
        training_params: optional dict of parameters that take precedence over
            the stock defaults.
        is_hp_search: when true, ``DeepLearning*`` models use 50 epochs and
            ``LGBM*`` models use 500 estimators (unless ``training_params``
            overrides them).

    Returns:
        The estimator returned by ``set_params`` on the selected model.

    Raises:
        KeyError: if ``model_name`` is not a recognised or available model.
    """
    global keras_imported

    # For Keras
    epochs = 1000
    # if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
    #     print('Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy')
    #     epochs = 100

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'ExtraTreesClassifier': {
            'n_jobs': -1
        },
        'AdaBoostClassifier': {},
        'SGDClassifier': {
            'n_jobs': -1
        },
        'Perceptron': {
            'n_jobs': -1
        },
        'LinearSVC': {
            'dual': False
        },
        'LinearRegression': {
            'n_jobs': -2
        },
        'RandomForestRegressor': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'LinearSVR': {
            'dual': False,
            'loss': 'squared_epsilon_insensitive'
        },
        'ExtraTreesRegressor': {
            'n_jobs': -1
        },
        'MiniBatchKMeans': {
            'n_clusters': 8
        },
        'GradientBoostingRegressor': {
            'presort': False,
            'learning_rate': 0.1,
            'warm_start': True
        },
        'GradientBoostingClassifier': {
            'presort': False,
            'learning_rate': 0.1,
            'warm_start': True
        },
        'SGDRegressor': {
            'shuffle': False
        },
        'PassiveAggressiveRegressor': {
            'shuffle': False
        },
        'AdaBoostRegressor': {},
        'LGBMRegressor': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'LGBMClassifier': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'DeepLearningRegressor': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'DeepLearningClassifier': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    # if os.environ.get('is_test_suite', 0) == 'True':
    #     all_model_params

    # Models without stock parameters start from an empty dict.
    model_params = all_model_params.get(model_name, {})

    # Cheaper settings during hyper-parameter search; applied before the
    # user's overrides so that explicit training_params still win.
    if is_hp_search:
        if model_name[:12] == 'DeepLearning':
            model_params['epochs'] = 50
        if model_name[:4] == 'LGBM':
            model_params['n_estimators'] = 500

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print(
            'After overwriting our defaults with your values, here are the final params that will be used to initialize the model:'
        )
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    # Prefer explicit max_iter/tol; fall back to the bare constructors when
    # the installed estimators reject those keyword arguments (TypeError).
    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
            max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(
            max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
        )
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor(
            calc_feature_importance=True)
        model_map['CatBoostClassifier'] = CatBoostClassifier(
            calc_feature_importance=True)

    if model_name[:12] == 'DeepLearning':
        if not keras_imported:
            # Suppress some level of logs if TF is installed (but allow it to not be installed, and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except Exception:
                # Best effort only; unlike a bare `except:` this lets
                # KeyboardInterrupt and SystemExit propagate.
                pass

            # Keras is imported lazily and published as module globals so
            # later calls (and the build functions) can use the names.
            global maxnorm
            global Dense, Dropout
            global LeakyReLU, PReLU, ThresholdedReLU, ELU
            global Sequential
            global keras_load_model
            global regularizers, optimizers
            global Activation
            global KerasRegressor, KerasClassifier

            from keras.constraints import maxnorm
            from keras.layers import Activation, Dense, Dropout
            from keras.layers.advanced_activations import LeakyReLU, PReLU, ThresholdedReLU, ELU
            from keras.models import Sequential
            from keras.models import load_model as keras_load_model
            from keras import regularizers, optimizers
            from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
            keras_imported = True

        model_map['DeepLearningClassifier'] = KerasClassifier(
            build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(
            build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError:
        print(
            'It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize'
        )
        # Bare raise re-raises the original KeyError with its traceback.
        raise

    # The test suite is limited to a single job.
    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1
    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
Example #29
0
def Eval(XTrain, YTrain, XTest, YTest, clf, return_predicted_labels=False):
	"""
		Fit a classifier on the training data and score it on the test data.

		Inputs:
			XTrain - N by D matrix of training data vectors
			YTrain - N by 1 matrix of training class labels
			XTest - M by D matrix of testing data vectors
			YTest - M by 1 matrix of testing class labels
			clf - the classifier
				either an estimator instance with the methods
				.fit and .predict, or a string naming one
				(matched case-insensitively by substring):
				"ridge", "perceptron", "passive aggressive" /
				"passive-aggressive", "linsvm" / "linearsvm" /
				"linearsvc", "svm" / "svc", "sgd"
			return_predicted_labels - if True, the predicted labels
				are appended to the returned tuple
		Outputs:
			A tuple containing (in the following order):
				Accuracy
				Overall Precision (micro-averaged)
				Overall Recall (micro-averaged)
				Overall F1 score (micro-averaged)
				Avg. Precision per class
				Avg. Recall per class
				Avg. F1 Score per class
				Precision per class
				Recall per class
				F1 Score per class
				(if return_predicted_labels)
					predicted class labels for each row in XTest
		Raises:
			ValueError - if clf is a string that names no known classifier
	"""

	if isinstance(clf, str):
		clf_name = clf.lower()
		if 'ridge' in clf_name:
			clf = RidgeClassifier(tol=1e-2, solver="lsqr")
		elif "perceptron" in clf_name:
			clf = Perceptron(n_iter=50)
		elif "passive aggressive" in clf_name or 'passive-aggressive' in clf_name:
			clf = PassiveAggressiveClassifier(n_iter=50)
		# The linear SVM names must be checked before the generic 'svm'/'svc'
		# test, which would otherwise match them too.
		elif 'linsvm' in clf_name or 'linearsvm' in clf_name or 'linearsvc' in clf_name:
			clf = LinearSVC()
		elif 'svm' in clf_name or 'svc' in clf_name:
			clf = SVC()
		elif 'sgd' in clf_name:
			clf = SGDClassifier()
		else:
			# Previously the string fell through and failed on clf.fit with
			# an unhelpful AttributeError.
			raise ValueError("Unrecognized classifier name: %r" % (clf,))

	clf.fit(XTrain, YTrain)
	YPred = clf.predict(XTest)

	accuracy = sklearn.metrics.accuracy_score(YTest, YPred)
	(overall_precision, overall_recall, overall_f1, support) = sklearn.metrics.precision_recall_fscore_support(YTest, YPred, average='micro')
	(precision_per_class, recall_per_class, f1_per_class, support_per_class) = sklearn.metrics.precision_recall_fscore_support(YTest, YPred)
	avg_precision_per_class = np.mean(precision_per_class)
	avg_recall_per_class = np.mean(recall_per_class)
	avg_f1_per_class = np.mean(f1_per_class)

	results = (accuracy, overall_precision, overall_recall, overall_f1, avg_precision_per_class, avg_recall_per_class, avg_f1_per_class, precision_per_class, recall_per_class, f1_per_class)
	if return_predicted_labels:
		return results + (YPred,)
	return results
Example #30
0
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


# Benchmark each (estimator, display name) pair in order and collect the
# values returned by benchmark().
results = []
for clf, name in (
        (LogisticRegression(penalty='l2', C=1.0, max_iter=100,
                            solver='newton-cg'),
         "Logistic Regression"),
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest"),
):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(
if __name__ == "__main__":
    # Pickle data is binary: open in 'rb' and let the context manager close
    # the handle (it was previously opened in text mode and never closed).
    # NOTE: unpickling can execute arbitrary code; the corpus file must come
    # from a trusted source.
    with open(features_evaluation.SELECTED_FEATURES_CORPUS_CHI2, 'rb') as filehandler:
        corpus = pickle.load(filehandler)
    dataset = Dataset(corpus=corpus)

    X = dataset.get_train_x()
    y = dataset.get_train_y()
    # Densify once instead of once per classifier in the loop below.
    X_dense = X.toarray()

    scores_dict = defaultdict(list)

    clf1 = LogisticRegression(C=0.05, random_state=1, class_weight='balanced')
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = svm.SVC(C=0.35, class_weight='balanced')
    clf4 = RidgeClassifier(alpha=2.5)
    clf5 = AdaBoostClassifier(n_estimators=150)
    # Majority-vote ensemble over the five classifiers above.
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('svm', clf3),
                                        ('rc', clf4), ('ab', clf5)],
                            voting='hard')

    # 5-fold cross-validated macro F1 for every classifier and the ensemble.
    for clf, label in zip([clf1, clf2, clf3, clf4, clf5, eclf],
                          ['Logistic Regression', 'Random Forest', 'SVM',
                           'Ridge Classifier', 'Ada boost', 'Ensemble']):
        scores = cross_val_score(clf, X_dense, y, cv=5, scoring='f1_macro')
        scores_dict[label].append(scores.mean())
        print("f1_macro: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

    # Refit on the resampled training data.
    X, y = dataset.get_resampled_train_X_y(kind='regular')
    X_dense = X.toarray()
    clf1.fit(X_dense, y)
    clf2.fit(X_dense, y)
Example #32
0
    ]

# Second list of unfitted classifier instances, kept in evaluation order.
classifiersv2 = [
    # Neighbours, SVMs, trees and ensembles.
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    KNeighborsClassifier(n_neighbors=10),
    # Linear models.
    PassiveAggressiveClassifier(n_iter=50),
    Perceptron(n_iter=50),
    RidgeClassifier(tol=1e-2, solver='lsqr'),
    # Naive Bayes variants.
    MultinomialNB(alpha=.001),
    BernoulliNB(alpha=.001),
    GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                               max_depth=1, loss='deviance'),
    # SGD with L1 and L2 penalties.
    SGDClassifier(alpha=.001, n_iter=50, penalty='l1'),
    SGDClassifier(alpha=.001, n_iter=50, penalty='l2'),
    NearestCentroid(),
]
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import RidgeClassifier
from sklearn.dummy import DummyClassifier

# Short code -> unfitted classifier instance.  Commented-out entries are
# candidates that are currently disabled.
classifiers = {
    'knn': KNeighborsClassifier(),
    # 'lsvm': SVC(kernel="linear"),
    # 'rbfsvm': SVC(gamma=2),
    # 'gp': GaussianProcessClassifier(),
    'dt': DecisionTreeClassifier(),
    'rf': RandomForestClassifier(),  # default worse than suggested values
    'mlp': MLPClassifier(),  # default worse than suggested values
    'adb': AdaBoostClassifier(),
    'nb': GaussianNB(),
    # 'qda': QuadraticDiscriminantAnalysis(),
    'ridge': RidgeClassifier(),
    '-dumbase-': DummyClassifier(strategy="most_frequent"),
}

# Column titles for the results table and the matching short codes,
# position by position.
statnames = [
    'Classifiers',
    'Avg. Test-acc',
    'Avg. Train-acc',
    'Std. Test-acc',
    'Std. Train-acc',
    'Avg. Test-time',
    'Avg. Train-time',
]
statcodes = ['clfn', 'mtsts', 'mtrns', 'vtsts', 'vtrns', 'predt', 'trint']

# Sanity checks: at least one repetition and at least two CV folds.
if not REPEAT > 1:
    REPEAT = 1
if not CVFOLDS >= 2:
    CVFOLDS = 2
# scikit-learn documentation recommends using StratifiedKFold for classification
# problems to preserve class balance across folds. however, in this case, 
# we use KFold and RepeatedKFold because 
#  number of items in a class <= CVFOLDS (works only with 2 folds for entire dataset)
Example #34
0
    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred, target_names=target_names))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split("(")[0]
    return clf_descr, score, train_time, test_time


# Benchmark a fixed set of classifiers, then LinearSVC under each penalty,
# collecting whatever benchmark() returns for each run.
results = []
for clf, name in [
    (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
    (Perceptron(max_iter=50), "Perceptron"),
    (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
    (KNeighborsClassifier(n_neighbors=10), "kNN"),
    (RandomForestClassifier(), "Random forest"),
]:
    print("=" * 80)
    print(name)
    outcome = benchmark(clf)
    results.append(outcome)

for penalty in ("l2", "l1"):
    print("=" * 80)
    print("{} penalty".format(penalty.upper()))
    # Train Liblinear model
    liblinear_model = LinearSVC(penalty=penalty, dual=False, tol=1e-3)
    results.append(benchmark(liblinear_model))
Example #35
0
    if has_bias:
        assert '<BIAS>' in neg or '<BIAS>' in pos

    assert res == explain_weights(reg)


# One test case per linear classifier configuration.
@pytest.mark.parametrize('clf', [
    LogisticRegression(random_state=42),
    LogisticRegression(
        random_state=42, multi_class='multinomial', solver='lbfgs'),
    LogisticRegression(random_state=42, fit_intercept=False),
    LogisticRegressionCV(random_state=42),
    RidgeClassifier(random_state=42),
    RidgeClassifierCV(),
    SGDClassifier(random_state=42),
    SGDClassifier(random_state=42, loss='log'),
    PassiveAggressiveClassifier(random_state=42),
    Perceptron(random_state=42),
    LinearSVC(random_state=42),
    OneVsRestClassifier(SGDClassifier(random_state=42)),
])
def test_explain_linear(newsgroups_train, clf):
    """Run the shared linear-classifier weight-explanation check on ``clf``."""
    assert_explained_weights_linear_classifier(newsgroups_train, clf)


@pytest.mark.parametrize(['clf'], [
    [OneVsRestClassifier(SGDClassifier(random_state=42))],
    [OneVsRestClassifier(LogisticRegression(random_state=42))],
Example #36
0
             Pipeline([("Scaler", StandardScaler()),
                       ("DecisionTrees", DecisionTreeClassifier())])))

# Each candidate is a (display name, pipeline) pair; every pipeline
# standardizes the features before handing them to the estimator.
clfs.append(("RandomForestClassifier",
             Pipeline([("Scaler", StandardScaler()),
                       ("RandomForest", RandomForestClassifier())])))

clfs.append(("GradientBoostingClassifier",
             Pipeline([("Scaler", StandardScaler()),
                       ("GradientBoosting",
                        GradientBoostingClassifier(max_features=15,
                                                   n_estimators=150))])))

clfs.append(("RidgeClassifier",
             Pipeline([("Scaler", StandardScaler()),
                       ("RidgeClassifier", RidgeClassifier())])))

# The entry is labelled "BaggingRidge", so bag ridge classifiers explicitly:
# a bare BaggingClassifier() falls back to its default tree base estimator.
clfs.append(("BaggingRidgeClassifier",
             Pipeline([("Scaler", StandardScaler()),
                       ("BaggingClassifier",
                        BaggingClassifier(RidgeClassifier()))])))

clfs.append(("ExtraTreesClassifier",
             Pipeline([("Scaler", StandardScaler()),
                       ("ExtraTrees", ExtraTreesClassifier())])))

# Other scorers that were tried here:
# 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'
scoring = 'accuracy'
n_folds = 7

results, names = [], []
Example #37
0
def test_class_weights():
    """Check how ``class_weight`` moves RidgeClassifier's decision boundary."""
    probe = [[0.2, -1.0]]
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0],
                  [1.0, 0.0]])
    y = [1, 1, 1, -1, -1]

    # Unweighted baseline: the probe point is assigned to class 1.
    unweighted = RidgeClassifier(class_weight=None)
    unweighted.fit(X, y)
    assert_array_equal(unweighted.predict(probe), np.array([1]))

    # Give class 1 a very small weight: the hyperplane should rotate
    # clock-wise and the prediction for the probe point should flip.
    downweighted = RidgeClassifier(class_weight={1: 0.001})
    downweighted.fit(X, y)
    assert_array_equal(downweighted.predict(probe), np.array([-1]))

    # class_weight='balanced' must cope with negative labels.
    balanced = RidgeClassifier(class_weight='balanced')
    balanced.fit(X, y)
    assert_array_equal(balanced.predict(probe), np.array([1]))

    # With equally many samples per label, 'balanced' and None must give
    # the same fitted model.
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0]])
    y = [1, 1, -1, -1]
    plain = RidgeClassifier(class_weight=None)
    plain.fit(X, y)
    even = RidgeClassifier(class_weight='balanced')
    even.fit(X, y)
    assert len(even.classes_) == 2
    assert_array_almost_equal(plain.coef_, even.coef_)
    assert_array_almost_equal(plain.intercept_, even.intercept_)
def get_ridge_plot(best_param_, experiment_,
                   param_keys_, param_vals_,
                   png_folder,
                   png_fname,
                   score_threshold=0.8):
    """Save a three-panel diagnostic figure for a tuned RidgeClassifier.

    Panels: (left) coefficient path over the alpha grid, (top right)
    histogram of ``decision_function`` scores on the test data, (bottom
    right) normalized cumulative histogram of those scores.

    Parameters
    ----------
    best_param_ : dict
        Keyword arguments for ``RidgeClassifier.set_params``; must contain
        ``'alpha'`` and ``'fit_intercept'``.  The dict is not modified.
    experiment_ : object
        Must provide ``get_train_data()``, ``get_test_data()``,
        ``get_data_col_name()`` and ``get_proba(clf, X)``.
    param_keys_, param_vals_ : sequences
        Zipped into the search grid; the grid must contain ``'model_type'``
        and ``'alpha'`` (the alphas to trace).
    png_folder, png_fname : str
        Joined onto ``Config.get_string('data.path')`` to form the output path.
    score_threshold : float
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    bool
        Always ``True``.
    """
    parameters = dict(zip(param_keys_, param_vals_))
    del parameters['model_type']
    alphas = list(parameters['alpha'])
    best_alpha = best_param_['alpha']
    fit_intercept = best_param_['fit_intercept']

    X_train, y_train = experiment_.get_train_data()
    n_columns = len(X_train.columns.values) + 1  # slot 0 holds the intercept

    # Model with the selected hyper-parameters.  It is kept separate from the
    # path classifier so the score plots describe the best model rather than
    # whichever alpha happens to be last in the grid.
    best_clf = RidgeClassifier()
    best_clf.set_params(**best_param_)
    best_clf.fit(X_train, y_train)

    # 1. regularization path: refit once per alpha on a private copy of the
    # parameters so the caller's dict is left untouched.
    path_params = dict(best_param_)
    path_clf = RidgeClassifier()
    coefs = np.zeros((len(alphas), n_columns))
    train_scores = []
    for i, alpha in enumerate(alphas):
        path_params['alpha'] = alpha
        path_clf.set_params(**path_params)
        path_clf.fit(X_train, y_train)

        if fit_intercept:
            coefs[i, :] = np.append(path_clf.intercept_, path_clf.coef_)
        else:
            # No intercept is fitted: slot 0 stays 0 and the remaining slots
            # take the coefficients (not the intercept).
            coefs[i, 1:] = np.ravel(path_clf.coef_)
        # NOTE(review): the result is not used below; the call is kept in
        # case get_proba has side effects on the experiment -- confirm.
        train_scores.append(experiment_.get_proba(path_clf, X_train))
    del X_train, y_train

    # 2. plot layout: path on the left, score histogram and CDF on the right
    gs = GridSpec(2, 2)
    ax1 = plt.subplot(gs[:, 0])
    ax2 = plt.subplot(gs[0, 1])
    ax3 = plt.subplot(gs[1, 1])

    # 2.1 one line per coefficient across the alpha grid
    labels = ['intercept'] + list(experiment_.get_data_col_name())
    alpha_axis = np.asarray(alphas)
    for ncol in range(coefs.shape[1]):
        ax1.plot(alpha_axis, coefs[:, ncol], label=labels[ncol])
    ax1.legend(loc='best')
    ax1.set_xscale('log')
    ax1.set_title("Regularization Path:%1.3e" % (best_alpha))
    ax1.set_xlabel("alpha", fontsize=10)

    # 2.2 histogram of the best model's decision scores on the test data
    X_test, _ = experiment_.get_test_data()
    scores = best_clf.decision_function(X_test)
    sns.distplot(scores, kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF : Decision_Function")

    # 2.3 cumulative histogram; ``density`` replaces the removed ``normed``
    num_bins = 100
    try:
        counts, bin_edges = np.histogram(scores, bins=num_bins, density=True)
    except Exception:
        # Best-effort fallback to numpy's default binning.
        counts, bin_edges = np.histogram(scores, density=True)

    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Decision_Function:Confidence_Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), png_folder, png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()

    return True
# Cross-validation configuration and result accumulators.
num_folds = 5
seed = 2
scoring = 'accuracy'
models = []
names = []
cv_scores = []
test_accuracy = []
precisions = []
recalls = []

# Candidate models as (short name, estimator) pairs.
models.append(
    ('LR', LogisticRegression(multi_class='multinomial', solver='newton-cg')))
models.append(('SVC', LinearSVC(multi_class='crammer_singer')))
models.append(('KNN', KNeighborsClassifier()))
models.append(('Ridge', RidgeClassifier()))
models.append(('RF', RandomForestClassifier()))

# Crossvalidate all the models and also calculate the test accuracies and other metrics for each model

# random_state only takes effect when shuffle=True; current scikit-learn
# raises a ValueError if a seed is passed without shuffling.  With a fixed
# integer seed the folds are the same for every model, so one splitter is
# shared across the loop.
kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

for name, model in models:
    names.append(name)
    cv_results = cross_val_score(model,
                                 X_train_scaled,
                                 y_train,
                                 cv=kfold,
                                 scoring=scoring)
    cv_score_mean = round(cv_results.mean(), 3)
    cv_scores.append(cv_score_mean)
#!/usr/bin/env python
"""
Ridge regression for Avito
"""
__author__ = "deniederhut"
__license__ = "GPL"
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

data = pd.read_table('/Users/dillonniederhut/Desktop/avito_train.tsv',nrows=100000)
#replace with file path to your training data

# One-hot encode the subcategory column and hold out 25% of rows for testing.
features = pd.get_dummies(data.subcategory)
features_train, features_test, target_train, target_test =\
    train_test_split(features, data.is_blocked, test_size = 0.25)

ridge = RidgeClassifier()
ridge.fit(features_train, target_train)

# predict() already returns class labels, so no rounding is needed.
prediction = ridge.predict(features_test)
# Ranking metrics need continuous scores; feeding them hard labels collapses
# the precision-recall / ROC curve to a single operating point.
scores = ridge.decision_function(features_test)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(classification_report(target_test, prediction))
print(average_precision_score(target_test, scores))
print(roc_auc_score(target_test, scores))
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [
                feature_names[i] for i in ch2.get_support(indices=True)
            ]
    return X_train, feature_names, ch2, vectorizer


# else:
# print(simple_classify(MultinomialNB(),test_x,test_y,train_x,k.target))
# print(simple_classify(RandomForestClassifier(),test_x,test_y,train_x,k.target))
# print(simple_classify(RidgeClassifier(),test_x,test_y,train_x,k.target))
# print(simple_classify(KNeighborsClassifier(),test_x,test_y,train_x,k.target))
# print(simple_classify(Perceptron(),test_x,test_y,train_x,k.target))
# print(simple_classify(PassiveAggressiveClassifier(),test_x,test_y,train_x,k.target))
# New code for PAN: read the train and test corpora, extract features for
# each, then print simple_classify's result for every candidate classifier.
k, y_train, a = read_pan.read_pan(pan_train)
k_t, test_y, a_t = read_pan.read_pan(pan_test)
train_x, f_names, chi, transformer = feature_extraction2(givenlabel, k)
# NOTE(review): the test features come from a second, independent
# feature_extraction2 call rather than from `transformer`/`chi` -- confirm
# that both calls produce the same feature space.
test_x, _, _, _ = feature_extraction2(givenlabel, k_t)

# Previous variant:
# train_x,f_names,chi,transformer=feature_extraction(givenlabel,k)
for make_classifier in (MultinomialNB,
                        RandomForestClassifier,
                        RidgeClassifier,
                        KNeighborsClassifier,
                        Perceptron,
                        PassiveAggressiveClassifier):
    print(simple_classify(make_classifier(), test_x, test_y, train_x, k))
# Notation:
# N: number of training examples; K: number of level-0 models
# X: feature matrix; y: result array; z_k: prediction array of the k-th model
#

# 10-fold cross validation over the n_samples examples
fold_num = 10
kf = KFold(n_samples, k=fold_num, indices=True)

# neighbourhood size for kNN
n_neighb = 19

# Level-0 models (brute-force implementation)
clf_mNB = MultinomialNB(alpha=0.01)
clf_kNN = KNeighborsClassifier(n_neighbors=n_neighb)
clf_ridge = RidgeClassifier(tol=0.1)
clf_lSVC = LinearSVC(loss='l2', penalty='l2', C=0.5, dual=False, tol=0.001)
clf_SVC = SVC(C=32, gamma=0.0625)
# clf_SGD = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

# One empty int32 array per model, to hold its prediction results z_kn
z_mNB = np.empty(0, dtype=np.int32)
z_kNN = np.empty(0, dtype=np.int32)
z_ridge = np.empty(0, dtype=np.int32)
z_lSVC = np.empty(0, dtype=np.int32)
z_SVC = np.empty(0, dtype=np.int32)


###############################################################################
# Stacking
# 
        print(
            metrics.classification_report(y_test,
                                          pred,
                                          target_names=target_names))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


# Benchmark each candidate classifier, printing a banner and its display
# name first, and collect what benchmark() returns.
results = []
for clf, name in ((RidgeClassifier(tol=1e-2, solver="lsqr"),
                   "Ridge Classifier"),
                  # `n_iter` was replaced by `max_iter` in scikit-learn and
                  # later removed, so the iteration cap is passed as max_iter.
                  (Perceptron(max_iter=50), "Perceptron"),
                  (PassiveAggressiveClassifier(max_iter=50),
                   "Passive-Aggressive"),
                  (KNeighborsClassifier(n_neighbors=10), "kNN"),
                  (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model.  'squared_hinge' is the current spelling of the
    # removed loss alias 'l2'.
    results.append(
        benchmark(LinearSVC(loss='squared_hinge', penalty=penalty,
                            dual=False, tol=1e-3)))
Example #44
0
    data = [ i for i in csv.reader(file(train_file, 'rb')) ]
    data = data[1:] # remove header
    random.shuffle(data)

    X = np.array([ i[1:] for i in data ]).astype(float)
    Y = np.array([ i[0] for i in data ]).astype(int)

    train_cutoff = len(data) * 3/4

    X_train = X[:train_cutoff]
    Y_train = Y[:train_cutoff]
    X_test = X[train_cutoff:]
    Y_test = Y[train_cutoff:]

    classifier = RidgeClassifier(normalize = True, alpha = 1)
    classifier = classifier.fit(X_train, Y_train)
    
    print 'Training error : %s' % (classifier.fit(X_train, Y_train).score(X_train, Y_train))

    Y_predict = classifier.predict(X_test)

    equal = 0
    for i in xrange(len(Y_predict)):
        if Y_predict[i] == Y_test[i]:
            equal += 1

    print 'Accuracy = %s' % (float(equal)/len(Y_predict))


# Notation:
# N: number of training examples; K: number of level-0 models
# X: feature matrix; y: result array; z_k: prediction array of the k-th model
#

# Cross-validation splitter: 10 folds over n_samples examples
fold_num = 10
kf = KFold(n_samples, k=fold_num, indices=True)

# Number of neighbours used by the kNN model
n_neighb = 19

# Level-0 classifiers (brute-force implementation)
clf_mNB = MultinomialNB(alpha=0.01)
clf_kNN = KNeighborsClassifier(n_neighbors=n_neighb)
clf_ridge = RidgeClassifier(tol=0.1)
clf_lSVC = LinearSVC(loss='l2', penalty='l2', C=0.5, dual=False, tol=0.001)
clf_SVC = SVC(C=32, gamma=0.0625)
# clf_SGD = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

# Empty int32 arrays that will receive each model's predictions z_kn
z_mNB = np.empty(0, dtype=np.int32)
z_kNN = np.empty(0, dtype=np.int32)
z_ridge = np.empty(0, dtype=np.int32)
z_lSVC = np.empty(0, dtype=np.int32)
z_SVC = np.empty(0, dtype=np.int32)


###############################################################################
# Stacking
# 
Example #46
0
"EMC strives to keep your personal information accurate. We have implemented technology, management processes and policies to maintain data integrity. We will provide you with access to your information when reasonable, or in accordance with relevant laws, including making reasonable effort to provide you with online access and the opportunity to change your information. To protect your privacy and security, we will take steps to verify your identity before granting access or making changes to your personal information. To access and/or correct information, you can do so online or notify us via the appropriate method below depending on which site is at issue",
"Your information to our service providers. We use service providers who help us to provide you with our services. We give relevant persons working for some of these providers access to your information, but only to the extent necessary for them to perform their services for us. We also implement reasonable contractual and technical protections to ensure the confidentiality of your personal information and data is maintained, used only for the provision of their services to us, and handled in accordance with this privacy policy. Examples of service providers include payment processors, email service providers, and web traffic analytics tools",
"Some Microsoft sites allow you to choose to share your personal information with select Microsoft partners so that they can contact you about their products, services or offers. Other sites, such as MSN instead may give you a separate choice as to whether you wish to receive communications from Microsoft about a partner's particular offering (without transferring your personal information to the third party). See the Communication Preferences section below for more information.",
]

# Vectorize the new documents with the already-fitted vectorizer.
X_new = vectorizer.transform(docs_new)


# Train classifiers
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Training Classifiers...")
t0 = time()

clf_nb = MultinomialNB()
# 'squared_hinge' is the current spelling of the removed loss alias 'l2'.
clf_lsvc = LinearSVC(loss='squared_hinge', penalty='l2', C=1000, dual=False,
                     tol=1e-3)
clf_svc = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True)
clf_rdg = RidgeClassifier(tol=1e-1)
# `n_iter` was replaced by `max_iter` in scikit-learn and later removed.
# NOTE(review): clf_sgd is not fitted in the visible part of this script.
clf_sgd = SGDClassifier(alpha=.0001, max_iter=50, penalty="l2")

# Logistic regression requires OneVsRestClassifier which hides
# its methods such as decision_function
# It will require extra implementation efforts to use it as a candidate
# for multilabel classification
# clf_lgr = OneVsRestClassifier(LogisticRegression(C=1000,penalty='l1'))
# kNN does not have decision function due to its nature
# clf_knn = KNeighborsClassifier(n_neighbors=13)

# train
clf_nb.fit(X, y)
clf_lsvc.fit(X, y)
clf_rdg.fit(X, y)
clf_svc.fit(X, y)
Example #47
0
###Training with libraries

categories = None
remove = ()

X_train = cityName

print('Creating the vectorizer and chosing a transform (from raw text to feature)')
vect = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
#vect=CountVectorizer(min_n=1,max_n=2,max_features=1000);

X_train = vect.fit_transform(X_train)


# Two independent ridge classifiers over the same TF-IDF features: one
# predicts the city code, the other the country code.
cityClass = RidgeClassifier(tol=1e-7)
countryClass = RidgeClassifier(tol=1e-7)

print('Creating a classifier for cities')
cityClass.fit(X_train, cityCode)
print('Creating a classifier for countries')
countryClass.fit(X_train, countryCode)

print('testing the performance')

testCityNames = vect.transform(cityNameTest)

# Each prediction comes from the classifier trained on the matching target;
# the two classifiers were previously swapped here.
predictionsCity = cityClass.predict(testCityNames)
predictionsCountry = countryClass.predict(testCityNames)

with open('predictions.csv','w') as csvfile:
Example #48
0
 def __init__(self, num_arms=3):
     """Set up a ridge-classifier-backed learner with ``num_arms`` arms.

     No training data or labels are held yet.  ``dont_fit`` starts out
     True (presumably until enough data has been collected -- TODO confirm).
     """
     self.K = num_arms
     self.clf = RidgeClassifier()
     self.training_data = self.training_labels = None
     self.dont_fit = True
Example #49
0
vocabulary = np.array([t for t, i in sorted(vectorizer.vocabulary.iteritems(), key=itemgetter(1))])

# ch2 = SelectKBest(chi2, k=200)
# X_train = ch2.fit_transform(X_train, y_train)
# X_test = ch2.transform(X_test)
# print "X_train: n_samples: %d, n_features: %d" % X_train.shape
# print "X_test : n_samples: %d, n_features: %d" % X_test.shape
# print

X_train = X_train.toarray()
X_test = X_test.toarray()

# clf = BernoulliNB(alpha=.1)
# clf = MultinomialNB(alpha=.01)
# clf = KNeighborsClassifier(n_neighbors=3)
clf = RidgeClassifier(tol=1e-1)
# clf = RandomForestClassifier(n_estimators=20, max_depth=None, min_split=3, random_state=42)
# clf = SGDClassifier(alpha=.01, n_iter=50, penalty="l2")
# clf = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)


clf.fit(X_train, y_train)
pred = clf.predict(X_test)

print "y    : ", y_test
print "pred : ", pred
print

# # print out top words for each category
# for i, category in enumerate(categories):
#             top = np.argsort(clf.coef_[i, :])[-20:]
from sklearn import preprocessing

# Replace the raw values of train['label1'] with integer codes.
lbl = preprocessing.LabelEncoder()
train['label1'] = lbl.fit_transform(train['label1'].values)
label = train['label1']
num_class = train['label1'].max() + 1

#======================= Model training: 5-fold cross-validation =======================
n_folds = 5
# Per-class score matrices: held-out scores for the training rows, and
# scores for the test rows summed over the folds.
stack_train = np.zeros((train.shape[0], num_class))
stack_test = np.zeros((test.shape[0], num_class))
folds = StratifiedKFold(label, n_folds=n_folds, random_state=42)
for i, (tr, va) in enumerate(folds):
    print('stack:%d/%d' % ((i + 1), n_folds))

    # Fit on this fold's training rows, then score the held-out rows and
    # the full test set.
    ridge = RidgeClassifier(random_state=42)
    ridge.fit(trn_term_doc[tr], label[tr])
    score_va = ridge._predict_proba_lr(trn_term_doc[va])
    score_te = ridge._predict_proba_lr(test_term_doc)

    stack_train[va] += score_va
    stack_test += score_te

# Accuracy of the held-out predictions (arg-max class per training row).
held_out_labels = np.argmax(stack_train, axis=1)
print(
    "model acc_score:",
    metrics.accuracy_score(label,
                           held_out_labels,
                           normalize=True,
                           sample_weight=None))

## Get the first and second labels: simply take the two with the highest probability:
    # N: number for training examples; K: number of models in level 0
    # X: feature matrix; y: result array; z_k: prediction result array for k's model
    # 

    # Setup 10 fold cross validation
    fold_num = 10
    kf = KFold(n_samples, k=fold_num, indices=True)

    # set number of neighbors for kNN
    n_neighb = 19

    # Brute-force implementation
    clf_bNB     = BernoulliNB(alpha=.01)
    clf_mNB     = MultinomialNB(alpha=.01)
    clf_kNN     = KNeighborsClassifier(n_neighbors=n_neighb)
    clf_ridge   = RidgeClassifier(tol=1e-1)
    clf_lSVC    = LinearSVC(loss='l2', penalty='l2', C=0.5, dual=False, tol=1e-3)
    clf_SVC     = SVC(C=32, gamma=0.0625, probability=True)
    # clf_SGD     = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

    ###############################################################################
    # Stacking
    # 
    # initialize empty y and z

    n_categories = len(set(y))
    z = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=float)
    # z = np.zeros( (n_samples, n_categories) , dtype=float)

    # Test for 10 rounds using the results from 10 fold cross validations
    for i, (train_index, test_index) in enumerate(kf):
Example #52
0
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    return ll

# add two columns for hour and weekday
def dayhour(timestr):
    """Return ``[weekday, hour]`` as floats for a ``YYMMDDHH`` timestamp.

    ``timestr`` may be a string or an integer; it is converted with ``str``
    before parsing.  The weekday follows ``datetime.weekday()`` (Monday is 0).

    Raises ``ValueError`` if the value does not match ``%y%m%d%H``.
    """
    # Parse the argument itself; the previous code read an unrelated global
    # name `x` instead of the `timestr` parameter.
    d = datetime.strptime(str(timestr), "%y%m%d%H")
    return [float(d.weekday()), float(d.hour)]

from scipy import sparse

fh = FeatureHasher(n_features = 2**20, input_type="string")

# Train classifier
clf = RidgeClassifier()
train = pd.read_csv("train/subtrain.csv", chunksize = 100000, iterator = True)
all_classes = np.array([0, 1])

# RidgeClassifier.fit() retrains from scratch on every call, so fitting once
# per chunk would keep only the last chunk.  Hash each chunk, stack the sparse
# blocks and fit a single time on all rows instead.
hashed_chunks = []
label_chunks = []
for chunk in train:
    label_chunks.append(chunk["click"])
    chunk = chunk[cols]
    # Reuse the chunk's own index: later chunks do not start at row 0, and a
    # default 0..n-1 index would make join() fill wd/hr with NaN.
    wd_hr = pd.DataFrame([dayhour(x) for x in chunk.hour],
                         columns=["wd", "hr"], index=chunk.index)
    chunk = chunk.join(wd_hr)
    chunk.drop(["hour"], axis=1, inplace = True)
    hashed_chunks.append(fh.transform(np.asarray(chunk.astype(str))))

Xcat = sparse.vstack(hashed_chunks, format="csr")
y_train = pd.concat(label_chunks)
clf.fit(Xcat, y_train)

# Create a submission file
usecols = cols + ["id"]
X_test = pd.read_csv("test/mtest.csv", usecols=usecols)
X_test = X_test.join(pd.DataFrame([dayhour(x) for x in X_test.hour],
                                  columns=["wd", "hr"], index=X_test.index))
X_test.drop(["hour"], axis=1, inplace = True)
Example #53
0
File: ridge.py Project: mb16/Kaggle
def main():
    """Fit a RidgeClassifier on the first rows of train.csv and score a hold-out.

    Rows 1..2999 of ``../Data/train.csv`` are used for fitting and rows from
    3001 onward as the hold-out set (column 0 is the target, the next
    ``endCol - startCol`` columns are the features).  Predictions on the
    hold-out are clipped to [0.3, 0.7], a log-loss is accumulated and printed
    row by row, and predictions for ``../Data/test.csv`` are written to
    ``../Submissions/knn.csv``.  Finally the function waits for Enter.
    """
    # Prompt function that works on both Python 2 and Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    startCol = 0
    endCol = 50  # max = 1775

    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train][1:3000]
    targetTest = [x[0] for x in train][3001:]
    trainTest = [x[startCol+1:endCol+1] for x in train][3001:]
    test = csv_io.read_data("../Data/test.csv")
    test = [x[startCol:endCol] for x in test]

    train = [x[startCol+1:endCol+1] for x in train][1:3000]

    # Opened in append mode as before (this creates the file if missing);
    # try/finally guarantees the handle is closed even if fitting fails.
    fo = open("knn_stats.txt", "a+")
    try:
        # `normalize` has been removed from RidgeClassifier; the old call
        # passed its default (False), so dropping it changes nothing.
        rf = RidgeClassifier(alpha=0.01, fit_intercept=True, copy_X=True, tol=0.001)

        rf.fit(train, target)
        prob = rf.predict(trainTest) # changed from test

        result = 100
        probSum = 0
        for i in range(0, len(prob)):
            # Clip to [0.3, 0.7] so neither log() below sees 0.
            probX = min(max(prob[i], 0.3), 0.7)
            # Debug output refers to the hold-out target of row i; indexing
            # the training targets here showed unrelated values and could
            # run past the end of that list.
            print("%s %s %s %s" % (i, probSum, probX, targetTest[i]))
            print("%s %s" % (targetTest[i]*log(probX),
                             (1-targetTest[i])*log(1-probX)))
            probSum += targetTest[i]*log(probX)+(1-targetTest[i])*log(1-probX)

            # running mean log-loss (divided by the full hold-out size)
            print(-probSum/len(prob))

        if ( -probSum/len(prob) < result ):
            result = -probSum/len(prob)
            predicted_probs = rf.predict(test)  # was test
            predicted_probs = ["%f" % x for x in predicted_probs]
            csv_io.write_delimited_file("../Submissions/knn.csv", predicted_probs)
            print("Generated Data!!")
    finally:
        fo.close()

    var = read_line("Enter to terminate.")
# Display names, index-aligned with `classifiers` below.
names = [
    "Logistic Regression", "Linear SVC",
    "LinearSVC with L1-based feature selection", "Multinomial NB",
    "Bernoulli NB", "Ridge Classifier", "AdaBoost", "Perceptron",
    "Passive-Aggresive", "Nearest Centroid"
]
classifiers = [
    LogisticRegression(),
    LinearSVC(),
    Pipeline([('feature_selection',
               SelectFromModel(LinearSVC(penalty="l1", dual=False))),
              ('classification', LinearSVC(penalty="l2"))]),
    MultinomialNB(),
    BernoulliNB(),
    RidgeClassifier(),
    AdaBoostClassifier(),
    Perceptron(),
    PassiveAggressiveClassifier(),
    NearestCentroid()
]
# Materialize the pairs: on Python 3 zip() is a one-shot iterator, and this
# object is reused as a default argument, so a bare zip would be empty from
# the second use on.
zipped_clf = list(zip(names, classifiers))

tvec = TfidfVectorizer()


def classifier_comparator(vectorizer=tvec,
                          n_features=10000,
                          stop_words=None,
                          ngram_range=(1, 1),
                          classifier=zipped_clf):
# N: number for training examples; K: number of models in level 0
# X: feature matrix; y: result array; z_k: prediction result array for k's model
# 

# Setup 10 fold cross validation
fold_num = 10
kf = KFold(n_samples, k=fold_num, indices=True)

# set number of neighbors for kNN
n_neighb = 13

# Brute-force implementation
clf_bNB     = BernoulliNB(alpha=.01)
clf_mNB     = MultinomialNB(alpha=.01)
clf_kNN     = KNeighborsClassifier(n_neighbors=n_neighb)
clf_ridge   = RidgeClassifier(tol=1e-1)
clf_SGD     = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")
clf_lSVC    = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
clf_SVC     = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True)


###############################################################################
# Stacking
# 
# initialize empty y and z

print 'X_den shape: ', X_den.shape
print 'y shape:     ', y.shape

n_categories = len(set(y))
z = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=float)
Example #56
0
def _crossValidate(originalClassifier, X, y, foldIndexes, foldFormat) :
	"""Run one classifier over pre-computed folds and return the per-fold test scores.

	For every (train_index, test_index) pair the features are standardized
	with a scaler fitted on the training part only, a fresh deep copy of
	originalClassifier is fitted, and one line is printed using foldFormat
	(a %-format taking fold number, training score, test score).
	"""
	performance = []
	for fold, (train_index, test_index) in enumerate(foldIndexes) :

		X_train, X_test = X[train_index], X[test_index]
		y_train, y_test = y[train_index], y[test_index]

		# let's normalize by feature
		scaler = StandardScaler()
		X_train = scaler.fit_transform(X_train)
		X_test = scaler.transform(X_test)

		# fit a copy so the template classifier is never modified
		classifier = copy.deepcopy(originalClassifier)
		classifier.fit(X_train, y_train)
		scoreTraining = classifier.score(X_train, y_train)
		scoreTest = classifier.score(X_test, y_test)

		print(foldFormat % (fold, scoreTraining, scoreTest))
		performance.append( scoreTest )

	return performance


def main() :
	"""Compare classifiers on the full TCGA dataset and on a reduced feature set.

	The dataset and the selected-feature list are loaded through
	genericFunctions, samples are normalized, and every classifier in the
	list is cross-validated on the same stratified folds, first on all
	features and then on the selected ones.  Per-fold scores and the mean and
	standard deviation of the test scores are printed.
	"""

	# a few hard-coded variables, change here if you want to modify random seed or number of folds in the cross-validation
	nFolds = 10
	randomSeed = 42

	# here the feature file is selected
	featureFile = "../results/feature-importance-efs.csv"
	#featureFile = "../results/feature-importance-elastic-net.csv"
	#featureFile = "../results/feature-importance-recursive-feature-elimination-svc.csv"
	#featureFile = "../results/feature-importance-univariate.csv"

	# load dataset
	X, y, featureNames = genericFunctions.loadTCGADataset()
	print("Training dataset (original):", X.shape)

	# load selected features
	selectedFeatures = genericFunctions.loadFeatures(featureFile)

	# create reduced dataset
	print("Reading feature file \"" + featureFile + "\"...")
	featureIndexes = [ i for i in range(0, len(featureNames)) if featureNames[i] in selectedFeatures ]
	X_reduced = X[:,featureIndexes]
	print("Training dataset (reduced):", X_reduced.shape)

	print("Normalizing by samples...")
	normalizeBySample = True
	if normalizeBySample :
		from sklearn.preprocessing import normalize
		X = normalize(X)
		X_reduced = normalize(X_reduced)

	# FINALLY, WE CAN CLASSIFY AWAY!
	classifierList = [

			#[RandomForestClassifier(), "RandomForestClassifier()"],
			[BaggingClassifier(n_estimators=300), "BaggingClassifier(n_estimators=300)"],
			[GradientBoostingClassifier(n_estimators=300), "GradientBoostingClassifier(n_estimators=300)"],
			[RandomForestClassifier(n_estimators=300), "RandomForestClassifier(n_estimators=300)"],
			[LogisticRegression(), "LogisticRegression"], # coef_
			[PassiveAggressiveClassifier(), "PassiveAggressiveClassifier"], # coef_
			[RidgeClassifier(), "RidgeClassifier"], # coef_
			[SGDClassifier(), "SGDClassifier"], # coef_
			[SVC(kernel='linear'), "SVC(linear)"], # coef_, but only if the kernel is linear...the default is 'rbf', which is NOT linear

			]

	# 10-fold cross-validation; the folds are computed once so that every
	# classifier and both datasets are evaluated on identical splits
	from sklearn.model_selection import StratifiedKFold
	skf = StratifiedKFold(n_splits = nFolds, shuffle=True, random_state=randomSeed)
	foldIndexes = [ (training, test) for training, test in skf.split(X, y) ]

	for originalClassifier, classifierName in classifierList :

		# all folds on the original dataset
		print("\nClassifier " + classifierName + " on original dataset...")
		classifierPerformance = _crossValidate(originalClassifier, X, y, foldIndexes,
				"\tFold #%d: training: %.4f, test: %.4f")

		# all folds again, this time on the reduced dataset
		print("Classifier " + classifierName + " on reduced dataset...")
		classifierPerformanceReduced = _crossValidate(originalClassifier, X_reduced, y, foldIndexes,
				"\tFold %d: training: %.4f, test: %.4f")

		print("Classifier %s, performance on original dataset: %.4f (+/- %.4f)" % (classifierName, np.mean(classifierPerformance), np.std(classifierPerformance)))
		print("Classifier %s, performance on reduced dataset: %.4f (+/- %.4f)" % (classifierName, np.mean(classifierPerformanceReduced), np.std(classifierPerformanceReduced)))

	return
        (LDA(), "Linear Discriminant Analysis"),
        (LinearSVC(), "SVM")
        ):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf, name))   

# Attach classifier to the original json file

# Load the document-term matrix for all tweets.  The context manager closes
# the file even if unpickling fails.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this project.
with open('./python_files/twitter_dtm.pkl', 'rb') as fp:
    dtm = pkl.load(fp)

# Predict the labels using the Ridge classifier
clf = RidgeClassifier(alpha=1.,tol=1e-2, solver="lsqr")
clf.fit(X_train, y_train)
predicted_labels = clf.predict(dtm)

# json file holding all tweets (one per line)
file_name = '../R Project/Data/obamacare.json'
line_reader = open(file_name,'r') # r means for reading

# new json file for all tweets + their predicted labels
new_file_name = '../R Project/Data/obamacare_labeled.json'
line_writer = open(new_file_name,'w') # w means for writing

# index of the tweet currently being labelled
twit_i = 0
for line in line_reader:
    label = predicted_labels[twit_i]
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))

    if True:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


# Benchmark each candidate classifier, printing a banner and its display
# name first, and collect what benchmark() returns.
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        # `n_iter` was replaced by `max_iter` in scikit-learn and later
        # removed, so the iteration cap is passed as max_iter.
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model.  'squared_hinge' is the current spelling of the
    # removed loss alias 'l2'.
    results.append(benchmark(LinearSVC(loss='squared_hinge', penalty=penalty,
                                            dual=False, tol=1e-3)))