def train_and_predict_m8(train, test, labels):
    """Fit a ridge classifier on stemmed TF-IDF n-grams and predict the test set.

    Both inputs are passed through ``stemmer_clean`` with the Porter stemmer
    option, then vectorised with a word-level TF-IDF model that is fitted on
    the training documents only.  When the module-level ``gridSearch`` flag is
    truthy the classifier is tuned through ``perform_grid_search``; otherwise
    the classifier is fitted as configured.

    Args:
        train: Training data, in the form expected by ``stemmer_clean``.
        test: Test data, in the form expected by ``stemmer_clean``.
        labels: Targets aligned with the training documents.

    Returns:
        Whatever ``predict`` yields for the vectorised test documents.
    """
    # Basic concatenation + stemming.
    train_docs, test_docs = stemmer_clean(
        train, test, stemmerEnableM7, stemmer_type='porter')

    # TF-IDF transform with sub-linear TF and stop-word removal.
    vectorizer = TfidfVectorizer(
        min_df=5,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 5),
        smooth_idf=1,
        sublinear_tf=1,
        stop_words=ML_STOP_WORDS,
    )
    vectorizer.fit(train_docs)
    train_matrix = vectorizer.transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)

    print ("Fitting Ridge Classifer...")
    ridge = RidgeClassifier(class_weight='auto', alpha=1, normalize=True)

    # Candidate values explored when the grid search is enabled.
    search_space = {
        'alpha': [0.1, 0.3, 1, 3, 10],
        'normalize': [True, False],
    }

    if not gridSearch:
        ridge.fit(train_matrix, labels)
        return ridge.predict(test_matrix)

    # NOTE(review): the original comment says the search optimises
    # quadratic_weighted_kappa; that lives in perform_grid_search - confirm.
    tuned = perform_grid_search(ridge, search_space, train_matrix, labels)
    return tuned.predict(test_matrix)
def retrain_models(username):
    """Rebuild and store the body and header models for one user.

    Data comes from ``model_retriever.retrieve_data_db``.  The body model is
    a ``LinearSVC`` over TF-IDF features of ``extract_body_features`` output;
    the header model is a ``RidgeClassifier`` over ``DictVectorizer`` features
    of ``extract_header_features`` output.  The shared ``train_*`` samples are
    appended to both training sets.  The fitted vectorizers and models are
    handed to ``store_models``.

    Args:
        username: Identifier used to fetch the data and to store the models;
            concatenated into the progress message, so it must be a string.
    """
    train_x, train_y, body_x, body_y, head_x, head_y = (
        model_retriever.retrieve_data_db(username))

    # Body features: body-only samples followed by the shared samples.
    body_targets = numpy.concatenate([body_y, train_y])
    body_docs = [extract_body_features(msg) for msg in (body_x + train_x)]
    body_vec = TfidfVectorizer(norm="l2")
    body_matrix = body_vec.fit_transform(body_docs)

    # Header features: header-only samples followed by the shared samples.
    head_targets = numpy.concatenate([head_y, train_y])
    head_records = [extract_header_features(msg) for msg in (head_x + train_x)]
    head_vec = DictVectorizer()
    head_matrix = head_vec.fit_transform(head_records)

    body_model = LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3)
    head_model = RidgeClassifier(tol=1e-2, solver="lsqr")
    body_model.fit(body_matrix, body_targets)
    head_model.fit(head_matrix, head_targets)

    print("Finished training models for "+username+"...")
    store_models(username, body_vec, body_model, head_vec, head_model)
def run(input_train, input_test, output_name):
    """Train on one file, score another, and write item IDs ranked by score.

    A ridge classifier is fitted on one-hot ``subcategory`` columns stacked
    with unigram/bigram counts of ``description``.  The output file holds an
    ``id`` header followed by the test item IDs, highest score first, ready
    for Kaggle submission.

    Parameters
    ----------
    input_train : str
        Full path of the training file (read with ``pd.read_table``).
    input_test : str
        Full path of the testing file (read with ``pd.read_table``).
    output_name : str
        Full path of the output file.
    """
    data = pd.read_table(input_train)
    test = pd.read_table(input_test)
    testItemIds = test.itemid
    response = data.is_blocked
    dummies = sparse.csc_matrix(pd.get_dummies(data.subcategory))
    pretestdummies = pd.get_dummies(test.subcategory)
    # Two subcategories are removed from the test one-hot columns; presumably
    # they never occur in the training file, so dropping them keeps the
    # column layouts aligned - confirm against the data.
    testdummies = sparse.csc_matrix(
        pretestdummies.drop(['Растения', 'Товары для компьютера'], axis=1))
    words = np.array(data.description, str)
    testwords = np.array(test.description, str)
    del data, test

    # The vocabulary is built from train and test text together.
    vect = text.CountVectorizer(decode_error=u'ignore',
                                strip_accents='unicode',
                                ngram_range=(1, 2))
    corpus = np.concatenate((words, testwords))
    vect.fit(corpus)

    counts = vect.transform(words)
    features = sparse.hstack((dummies, counts))
    clf = RidgeClassifier()
    clf.fit(features, response)

    testcounts = vect.transform(testwords)
    testFeatures = sparse.hstack((testdummies, testcounts))

    # RidgeClassifier does not implement predict_proba; decision_function
    # gives the confidence scores needed for ranking.  With two classes the
    # result is already one score per sample; with more, keep the column the
    # original code selected (index 1).
    predicted_scores = np.asarray(clf.decision_function(testFeatures))
    if predicted_scores.ndim > 1:
        predicted_scores = predicted_scores[:, 1]

    # The context manager closes the file even if a write fails.
    with open(output_name, 'w') as f:
        f.write("id\n")
        ranked = sorted(zip(predicted_scores, testItemIds), reverse=True)
        for pred_score, item_id in ranked:
            f.write("%d\n" % (item_id))
def validate(input_train, rows=True, test=0.25):
    """Hold-out evaluation of the bigram ridge model on a single file.

    By default loads every row of the dataset, trains on 75% of it and
    evaluates on the remaining 25%.

    Parameters
    ----------
    input_train : str
        Full path of the file to load (read with ``pd.read_table``).
    rows : True or int
        ``True`` loads all rows; an int loads only that many rows.
    test : float
        Proportion of the dataset held out for testing.

    Returns
    -------
    tuple
        ``(classification_report, average_precision, roc_auc)``.
    """
    # Identity check: ``1 == True`` in Python, so an equality test would
    # silently load the whole file when a single row is requested.
    if rows is True:
        data = pd.read_table(input_train)
    else:
        data = pd.read_table(input_train, nrows=rows)

    response = data.is_blocked
    dummies = sparse.csc_matrix(pd.get_dummies(data.subcategory))
    words = np.array(data.description, str)
    del data

    vect = text.CountVectorizer(decode_error=u'ignore',
                                strip_accents='unicode',
                                ngram_range=(1, 2))
    counts = vect.fit_transform(words)
    features = sparse.hstack((dummies, counts))

    features_train, features_test, target_train, target_test = train_test_split(
        features, response, test_size=test)

    clf = RidgeClassifier()
    clf.fit(features_train, target_train)
    prediction = clf.predict(features_test)
    # Average precision and ROC AUC are ranking metrics: they need continuous
    # scores, not thresholded labels, to describe the whole curve.
    scores = clf.decision_function(features_test)

    return (classification_report(target_test, prediction),
            average_precision_score(target_test, scores),
            roc_auc_score(target_test, scores))
def test_default_configuration_classify(self):
    """Ridge accuracy on digits after default ExtraTreesPreprocessor transform.

    The whole pipeline is run twice and each run must match the stored
    reference accuracy to two decimal places.
    """
    for _ in range(2):
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=False)
        search_space = ExtraTreesPreprocessor.get_hyperparameter_search_space()
        defaults = search_space.get_default_configuration()

        # Forward every default hyper-parameter to the constructor.
        hyperparameters = {}
        for hp_name in defaults:
            hyperparameters[hp_name] = defaults[hp_name]
        preprocessor = ExtraTreesPreprocessor(random_state=1, **hyperparameters)
        preprocessor.fit(X_train, Y_train)
        train_features = preprocessor.transform(X_train)
        test_features = preprocessor.transform(X_test)

        # fit a classifier on top
        ridge = RidgeClassifier()
        fitted = ridge.fit(train_features, Y_train)
        predicted = fitted.predict(test_features)
        accuracy = sklearn.metrics.accuracy_score(predicted, Y_test)
        self.assertAlmostEqual(accuracy, 0.87310261080752882, places=2)
def test_default_configuration_classify(self):
    """Ridge accuracy on digits after default KernelPCA transform.

    The whole pipeline is run five times and each run must reproduce the
    stored reference accuracy (default ``assertAlmostEqual`` precision).
    """
    for _ in range(5):
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=False)
        search_space = KernelPCA.get_hyperparameter_search_space()
        defaults = search_space.get_default_configuration()

        # Only hyper-parameters that actually have a value are forwarded.
        hyperparameters = {}
        for hp_name in defaults:
            if defaults[hp_name] is not None:
                hyperparameters[hp_name] = defaults[hp_name]
        preprocessor = KernelPCA(random_state=1, **hyperparameters)
        preprocessor.fit(X_train, Y_train)
        train_features = preprocessor.transform(X_train)
        test_features = preprocessor.transform(X_test)

        # fit a classifier on top
        ridge = RidgeClassifier()
        fitted = ridge.fit(train_features, Y_train)
        predicted = fitted.predict(test_features)
        accuracy = sklearn.metrics.accuracy_score(predicted, Y_test)
        self.assertAlmostEqual(accuracy, 0.096539162112932606)
def test_default_configuration_classify(self):
    """Ridge accuracy on sparse digits after default TruncatedSVD transform.

    The whole pipeline is run twice and each run must match the stored
    reference accuracy to two decimal places.
    """
    for _ in range(2):
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        search_space = TruncatedSVD.get_hyperparameter_search_space()
        defaults = search_space.get_default_configuration()

        # Only hyper-parameters that actually have a value are forwarded.
        hyperparameters = {}
        for hp_name in defaults:
            if defaults[hp_name] is not None:
                hyperparameters[hp_name] = defaults[hp_name]
        preprocessor = TruncatedSVD(random_state=1, **hyperparameters)
        preprocessor.fit(X_train, Y_train)
        train_features = preprocessor.transform(X_train)
        test_features = preprocessor.transform(X_test)

        # fit a classifier on top
        ridge = RidgeClassifier()
        fitted = ridge.fit(train_features, Y_train)
        predicted = fitted.predict(test_features)
        accuracy = sklearn.metrics.accuracy_score(predicted, Y_test)
        self.assertAlmostEqual(accuracy, 0.44201578627808136, places=2)
def get_optimal_blend_weigth(exp_, best_param_, folder, fname, model_fname):
    """Fit a ridge blender and persist its weights and the fitted model.

    The classifier is configured with ``best_param_`` and fitted on the data
    returned by ``exp_.get_test_data()``.  Its intercept and coefficients are
    written as a one-row CSV, and the fitted model is pickled into a gzip
    file.  Both files go under ``<data.path>/<folder>/``.

    Args:
        exp_: Object exposing ``get_test_data()`` -> ``(X, y)``; ``X`` must
            expose ``columns.values`` (used as CSV header names).
        best_param_: Keyword parameters forwarded to ``set_params``.
        folder: Sub-directory of the configured data path.
        fname: File name of the CSV holding the linear weights.
        model_fname: File name of the gzip-compressed pickle.

    Returns:
        ``True`` once both files have been written.
    """
    blender = RidgeClassifier()
    X_test, y_test = exp_.get_test_data()
    blender.set_params(**best_param_)
    blender.fit(X_test, y_test)

    # CSV dump: one row holding the intercept followed by the coefficients.
    header = np.append(np.array(['intercept'], dtype='S100'),
                       X_test.columns.values)
    weights = np.append(blender.intercept_, blender.coef_).astype(np.float64)
    weight_row = weights.reshape(1, len(weights))
    optimal_linear_weight = pd.DataFrame(weight_row, columns=header)
    weight_path = os.path.join(Config.get_string('data.path'), folder, fname)
    optimal_linear_weight.to_csv(weight_path, index=False)

    # Pickle dump of the fitted ridge model.
    pickle_path = os.path.join(Config.get_string('data.path'), folder,
                               model_fname)
    with gzip.open(pickle_path, 'wb') as gf:
        cPickle.dump(blender, gf, cPickle.HIGHEST_PROTOCOL)

    return True
def Predict():
    """Fit four classifiers on the training corpus and report test results.

    Reads the module-level ``Corpus_train``, ``Y_train``, ``Corpus_test``,
    ``Y_test`` and ``n_test``.  Each classifier's predictions are handed to
    ``print_rate`` together with a display name.
    """
    # Format first, then print: applying ``%`` to the result of print(...)
    # raises TypeError on Python 3 because print() returns None.
    print('\nThere are %d new deals' % n_test)

    # Using the KNN classifier.
    # KNN does not work even if k has been tuned (7 and 11 were also tried).
    clf_KNN = KNeighborsClassifier(n_neighbors=3)
    clf_KNN.fit(Corpus_train, Y_train)
    Y_pred_KNN = clf_KNN.predict(Corpus_test)
    print_rate(Y_test, Y_pred_KNN, n_test, 'KNNClassifier')

    # Using the SVM classifier.
    clf_SVM = svm.SVC()
    clf_SVM.fit(Corpus_train, Y_train)
    Y_pred_SVM = clf_SVM.predict(Corpus_test)
    print_rate(Y_test, Y_pred_SVM, n_test, 'SVMClassifier')

    # Using the Ridge classifier (tol=0.1 was also tried).
    clf_RC = RidgeClassifier(tol=0.01, solver="lsqr")
    clf_RC.fit(Corpus_train, Y_train)
    Y_pred_RC = clf_RC.predict(Corpus_test)
    print_rate(Y_test, Y_pred_RC, n_test, 'RidgeClassifier')

    # Random Forests and Decision Trees are not considered because they work
    # badly for highly sparse, high-dimensional features.

    # Using the Multinomial Naive Bayes classifier, which is designed for
    # occurrence-count features and is therefore expected to do best.
    # Author's tuning notes: alpha=0.01 is worse than 0.1; a large smoothing
    # rate (0.3) does not help; 0.2 or 0.05 may give the best outcome.
    clf_MNB = MultinomialNB(alpha=0.1)
    clf_MNB.fit(Corpus_train, Y_train)
    Y_pred_MNB = clf_MNB.predict(Corpus_test)
    print_rate(Y_test, Y_pred_MNB, n_test, 'MultinomialNBClassifier')
def get_classifier(classifier):
    """Instantiate and configure the classifier described by ``classifier``.

    Args:
        classifier: Mapping with a ``"name"`` entry selecting the classifier
            type and a ``"params"`` entry holding keyword arguments for
            ``set_params``.

    Returns:
        The new classifier instance after ``set_params(**params)``.

    Raises:
        NameError: If ``classifier["name"]`` is not a known classifier name.
    """
    requested = classifier["name"]

    # Factories are wrapped in lambdas so that a class is only looked up when
    # its entry is actually selected.
    factories = {
        'linear-ridge': lambda: RidgeClassifier(),
        'SVC': lambda: SVC(),
        "l2-SVC": lambda: L2KernelClassifier(),
        "fredholm": lambda: L2FredholmClassifier(),
        "TSVM": lambda: SVMLight(),
        "Lap-RLSC": lambda: LapRLSC(),
        "fred_kernel_appr": lambda: FredholmKernelApprClassifier(),
    }

    factory = factories.get(requested)
    if factory is None:
        raise NameError('Not existing classifier: ' + requested + '.')

    instance = factory()
    instance.set_params(**classifier["params"])
    return instance
# Compare a set of scikit-learn classifiers on one train/test split.
# Expects X_train, y_train, X_test and y_test to be defined earlier.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import classification_report

lr = LogisticRegression()
svc = SVC(kernel="linear")
tree = DecisionTreeClassifier()
mlp = MLPClassifier()
ridge = RidgeClassifier(tol=1e-2, solver='lsqr', alpha=.5)
sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42,
                    max_iter=100, tol=None)
rf = RandomForestClassifier(max_features=9, n_estimators=100)
# ``n_iter`` was replaced by ``max_iter``/``tol``, which the SGDClassifier
# above already uses.  ``max_iter=50, tol=None`` runs exactly 50 epochs,
# which is what ``n_iter=50`` meant.
percep = Perceptron(max_iter=50, tol=None)
pass_agg = PassiveAggressiveClassifier(max_iter=50, tol=None)
near_cent = NearestCentroid()

# Fit each model and print its per-class report under a banner line.
for clf in (lr, svc, tree, mlp, ridge, sgd, rf, percep, pass_agg, near_cent):
    clf.fit(X_train, y_train)
    print('=' * 25 + " " + clf.__class__.__name__ + " " + "="*30)
    print(clf.__class__.__name__,
          classification_report(y_test, clf.predict(X_test)))
def test_ridge_classifier_no_support_multilabel():
    """Fitting RidgeClassifier on multilabel targets must raise ValueError."""
    features, targets = make_multilabel_classification(n_samples=10,
                                                       random_state=0)
    classifier = RidgeClassifier()
    assert_raises(ValueError, classifier.fit, features, targets)
def text_classify_influence_by_ngram_range():
    """Measure the effect of ``ngram_range`` on text classification.

    Reads the first 15000 rows of ``./data/train_set.csv`` and keeps the
    first 5000 as the working sample.  For each n-gram range a TF-IDF model
    (2000 features) is fitted on the whole sample, the first two thirds of
    the rows train a ``RidgeClassifier`` and the last third is used to
    compute a macro-averaged F1 score.

    Returns:
        list of float: Macro-F1 scores for the n-gram ranges ``(1, 1)``,
        ``(2, 2)``, ``(3, 3)`` and ``(1, 3)``, in that order.  (The original
        collected these scores but never exposed them.)
    """
    train_df = pd.read_csv('./data/train_set.csv', sep='\t', nrows=15000)
    sample = train_df[0:5000]
    n = int(2 * len(sample) / 3)

    # The label split does not depend on the vectorizer, so do it once.
    labels = sample['label'].values
    train_y = labels[:n]
    test_y = labels[n:]

    f1 = []
    # One pass per n-gram range replaces four copy-pasted stanzas.
    for ngram_range in ((1, 1), (2, 2), (3, 3), (1, 3)):
        # NOTE(review): the vectorizer is fitted on training and evaluation
        # rows together, as in the original.
        tfidf = TfidfVectorizer(ngram_range=ngram_range, max_features=2000)
        train_test = tfidf.fit_transform(sample['text'])
        train_x = train_test[:n]
        test_x = train_test[n:]

        clf = RidgeClassifier(alpha=0.1, solver='sag')
        clf.fit(train_x, train_y)
        val_pred = clf.predict(test_x)
        f1.append(f1_score(test_y, val_pred, average='macro'))

    return f1
# Cross-validated variants: each grid search tunes C for the F1 score over
# the same list of candidate values.
svm_cv = GridSearchCV(
    estimator=SVC(C=1., kernel="linear"),
    param_grid={'C': [.1, .5, 1., 5., 10., 50., 100.]},
    scoring='f1',
    n_jobs=1,
)
logistic_cv = GridSearchCV(
    estimator=LogisticRegression(C=1., penalty="l1"),
    param_grid={'C': [.1, .5, 1., 5., 10., 50., 100.]},
    scoring='f1',
)
logistic_l2_cv = GridSearchCV(
    estimator=LogisticRegression(C=1., penalty="l2"),
    param_grid={'C': [.1, .5, 1., 5., 10., 50., 100.]},
    scoring='f1',
)

# The ridge classifier has a dedicated 'CV' object that can set its
# parameters faster than a GridSearchCV.
ridge = RidgeClassifier()
ridge_cv = RidgeClassifierCV()

# Registry of every classifier under comparison, keyed by display name.
classifiers = dict([
    ('SVC', svm),
    ('SVC cv', svm_cv),
    ('log l1', logistic),
    ('log l1 50', logistic_50),
    ('log l1 cv', logistic_cv),
    ('log l2', logistic_l2),
    ('log l2 cv', logistic_l2_cv),
    ('ridge', ridge),
    ('ridge cv', ridge_cv),
])

#############################################################################
def model_fit(X, y, test_size=0.5, alpha_low=-6, alpha_high=6, n_steps=25,
              cv=4, plot_figures=False):
    """Tune a class-balanced ridge classifier's ``alpha`` by grid search.

    Samples are flattened to one row each, min-max scaled to [0, 1] and
    split into stratified train/test parts (``random_state=0``).  A
    ``GridSearchCV`` over log-spaced ``alpha`` values is then fitted on the
    training part with all warnings silenced.

    Args:
        X: Array of samples; reshaped to ``(len(X), -1)``.
        y: Targets, also used for stratification.
        test_size: Share of samples held out from the grid search.
        alpha_low: Base-10 exponent of the smallest ``alpha``.
        alpha_high: Base-10 exponent of the largest ``alpha``.
        n_steps: Number of ``alpha`` values on the log-spaced grid.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        plot_figures: If true, plot train/validation score curves; otherwise
            print a one-line summary.

    Returns:
        dict: ``model`` (best estimator), ``best_score``, the four data
        splits (``x_train``, ``x_test``, ``y_train``, ``y_test``) and the
        original row indexes of each split (``idx_train``, ``idx_test``).
    """
    # Prepare datasets: flatten, then rescale every feature into [0, 1].
    flattened = X.reshape((len(X), -1))
    scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(flattened)
    row_ids = list(range(len(scaled)))

    # Stratified split; row ids travel along so callers can map back to X.
    (x_train, x_test,
     y_train, y_test,
     idx_train, idx_test) = train_test_split(
        scaled, y, row_ids,
        test_size=test_size, random_state=0, stratify=y)

    # Model creation.
    alphas = np.logspace(alpha_low, alpha_high, num=n_steps)
    search = GridSearchCV(estimator=RidgeClassifier(class_weight='balanced'),
                          param_grid={'alpha': alphas},
                          cv=cv,
                          return_train_score=True,
                          n_jobs=-1,
                          verbose=1)

    # Fit the model to the data, timing the whole search.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        started = time.time()
        fitted = search.fit(x_train, y_train)
        comp_time_total = time.time() - started

    if not plot_figures:
        # Provide written performance feedback.
        best_score_test = search.best_score_ * 100
        feedback_txt = ''.join([
            'Model trained for {:.2f}s total '.format(comp_time_total),
            'and reached an accuracy of: {:.2f}%'.format(best_score_test),
        ])
        time.sleep(0.25)
        print(feedback_txt)
    else:
        # Mean and spread of the scores, expressed in percent.
        cv_results = search.cv_results_
        train_scores = 100 * cv_results['mean_train_score']
        valid_scores = 100 * cv_results['mean_test_score']
        std_tr = 100 * cv_results['std_train_score']
        std_va = 100 * cv_results['std_test_score']

        plt.figure(figsize=(10, 5))
        plt.semilogx(alphas, train_scores, label='Training Set')
        plt.semilogx(alphas, valid_scores, label='Validation Set')

        # Mark and label the best score.
        best_alpha = search.best_params_['alpha']
        best_pct = 100 * search.best_score_
        plt.scatter(best_alpha, best_pct, marker='x', c='red', zorder=10)
        plt.text(best_alpha, best_pct - 7.5, '{:0.2f}%'.format(best_pct),
                 fontdict={'size': 18})

        # Shaded bands show one standard deviation around each curve.
        plt.fill_between(alphas, train_scores-std_tr, train_scores+std_tr,
                         alpha=0.3)
        plt.fill_between(alphas, valid_scores-std_va, valid_scores+std_va,
                         alpha=0.3)
        plt.title('Model Performance')
        plt.ylabel('Classification Accuracy [%]')
        plt.xlabel('Model Parameter [alpha]')

        # Axis limits, legend and layout.
        plt.xlim(10**alpha_low, 10**alpha_high)
        plt.ylim(15, 105)
        plt.legend()
        plt.tight_layout()
        plt.show()

    # Store everything in model.
    return {
        'model': fitted.best_estimator_,
        'best_score': fitted.best_score_,
        'x_train': x_train,
        'x_test': x_test,
        'y_train': y_train,
        'y_test': y_test,
        'idx_train': idx_train,
        'idx_test': idx_test,
    }
# k-nearest neighbours.  Tuning k (7 and 11 were also tried) did not make
# KNN work on this data.
clf_KNN = KNeighborsClassifier(n_neighbors=3).fit(Corpus_train, Y_train)
Y_pred_KNN = clf_KNN.predict(Corpus_test)
print_rate(Y_test, Y_pred_KNN, n_test, 'KNNClassifier')

# Support vector machine with scikit-learn's default settings.
clf_SVM = svm.SVC().fit(Corpus_train, Y_train)
Y_pred_SVM = clf_SVM.predict(Corpus_test)
print_rate(Y_test, Y_pred_SVM, n_test, 'SVMClassifier')

# Ridge classifier (tol=0.1 was also tried).
clf_RC = RidgeClassifier(tol=0.01, solver="lsqr").fit(Corpus_train, Y_train)
Y_pred_RC = clf_RC.predict(Corpus_test)
print_rate(Y_test, Y_pred_RC, n_test, 'RidgeClassifier')

# Random Forests and Decision Trees are left out because they work badly on
# highly sparse, high-dimensional features.

# Multinomial Naive Bayes: expected to do best since it is designed for
# occurrence-count features.  Author's tuning notes: alpha=0.01 is worse
# than 0.1; a large smoothing rate (0.3) does not help; 0.2 or 0.05 may
# give the best outcome.
clf_MNB = MultinomialNB(alpha=0.1).fit(Corpus_train, Y_train)
# Subset the data so we have a more even data set: keep a random sample of
# 6684 non-lemons and combine it with the lemons.
non_lemons = non_lemons.ix[random.sample(non_lemons.index, 6684)]
train = lemons.append(non_lemons)

# Target column, and the feature frame with identifier-like columns removed.
target = pd.Series(train['IsBadBuy']).values
data = train.drop(['RefId', 'IsBadBuy', 'VehYear', 'Make', 'Model', 'Trim',
                   'SubModel', 'WheelTypeID', 'BYRNO'], axis=1)
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    data, target, test_size=.3)

# The ridge section below refers to X and y, whose definitions had been
# commented out when the frames were renamed to ``data``/``target``.
# Restore them as aliases so the section runs.
X, y = data, target

# RidgeClassifier.
# NOTE(review): this model is fitted and evaluated on the same full data
# set, not on the train/test split used by the gradient-boosting model.
model = RidgeClassifier()
clf = model.fit(X, y)
Ridg_Class = clf.predict(X)
clf.score(X, y)
metrics.confusion_matrix(y, clf.predict(X))
# Parenthesised so the line is valid on both Python 2 and Python 3.
print(metrics.classification_report(y, clf.predict(X)))

# GradientBoostingClassifier
from sklearn.ensemble import *
model = GradientBoostingClassifier()
# Train
clf = model.fit(x_train, y_train)
print( metrics.classification_report(y_test, pred, target_names=target_names)) if opts.print_cm: print("confusion matrix:") print(metrics.confusion_matrix(y_test, pred)) print() clf_descr = str(clf).split('(')[0] return clf_descr, score, train_time, test_time results = [] for clf, name in ((RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"), (Perceptron(max_iter=50), "Perceptron"), (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"), (KNeighborsClassifier(n_neighbors=10), "kNN"), (RandomForestClassifier(), "Random forest")): print('=' * 80) print(name) results.append(benchmark(clf)) for penalty in ["l2", "l1"]: print('=' * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))
# Training and testing SGDClassifier
classifier_1 = SGDClassifier()
classifier_1.fit(X_train_vec, y_train_enc)
# Use the encoded test labels, as the RidgeClassifier stanza below does.
print(cross_val_score(classifier_1, X_test_vec, y_test_enc, cv=3))
y_pred = cross_val_predict(classifier_1, X_train_vec, y_train_enc)
print('f1_score for classifier_1: ', f1_score(y_train_enc, y_pred))
print('precision for classifier_1: ', precision_score(y_train_enc, y_pred))
print('recall for classifier_1: ', recall_score(y_train_enc, y_pred))
# A precision/recall curve needs continuous decision scores; hard class
# predictions collapse it to a single operating point.
y_scores = cross_val_predict(classifier_1, X_train_vec, y_train_enc,
                             method='decision_function')
precisions, recalls, threshholds = precision_recall_curve(y_train_enc,
                                                          y_scores)
plt.plot(threshholds, precisions[:-1], 'b--', label='precision')
plt.plot(threshholds, recalls[:-1], 'g-', label='recall')
plt.ylim([0, 1])
plt.legend()
plt.show()

# Training and testing RidgeClassifier
classifier_2 = RidgeClassifier()
classifier_2.fit(X_train_vec, y_train_enc)
print(cross_val_score(classifier_2, X_test_vec, y_test_enc, cv=3))
y_pred = cross_val_predict(classifier_2, X_train_vec, y_train_enc)
print('f1_score for classifier_2: ', f1_score(y_train_enc, y_pred))
print('precision for classifier_2: ', precision_score(y_train_enc, y_pred))
print('recall for classifier_2: ', recall_score(y_train_enc, y_pred))
# Same reasoning as above: score the curve on decision values.
y_scores = cross_val_predict(classifier_2, X_train_vec, y_train_enc,
                             method='decision_function')
precisions, recalls, threshholds = precision_recall_curve(y_train_enc,
                                                          y_scores)
plt.plot(threshholds, precisions[:-1], 'b--', label='precision')
plt.plot(threshholds, recalls[:-1], 'g-', label='recall')
plt.ylim([0, 1])
plt.legend()
plt.show()

# Training and testing RandomForestClassifier
classifier_3 = RandomForestClassifier()
class EmotionsClassifier():
    """Four-way mood classifier built on a valence/arousal corpus.

    Every corpus row is labelled with one of four mood classes derived from
    its valence (``Val.W``) and arousal (``Aro.W``) scores.  Two models can
    be trained on the ``processed_ru`` text column:

    * ``make_classifier`` - ridge classifier on bag-of-n-grams counts;
    * ``make_neural_net`` - Keras embedding + Conv1D + LSTM network on
      padded token sequences.

    ``run_classifier`` / ``run_neural_network`` label a single raw text.
    """

    # Human-readable description of each mood class, keyed by class id.
    _CLASS_NAMES = {
        0: 'excited,delighted,aroused,astonished',
        1: 'calm,relaxed,content, friendly',
        2: 'angry annoyed, frustrated, disguted',
        3: 'depressed, bored, sad, gloomy',
    }

    def __init__(self):
        # Length every token sequence is padded/truncated to for the network.
        self.__max_length = 128
        # Morphological analyser, created on first use (see __clean_text).
        self.__morph = None

    def __apply_class2mood(self):
        """Add a ``multiclass`` column combining the two binary classes."""
        class2mood = {
            (1, 1): 0,  # 'excited,delighted,aroused,astonished'
            (1, 0): 1,  # 'calm,relaxed,content, friendly'
            (0, 1): 2,  # 'angry annoyed, frustrated, disguted'
            (0, 0): 3,  # 'depressed, bored, sad, gloomy'
        }
        self.__corpus['multiclass'] = [
            class2mood[(val, aro)]
            for val, aro in zip(self.__corpus['val_class'],
                                self.__corpus['aro_class'])
        ]

    def __load_corpus(self, filepath):
        """Read the corpus CSV, drop incomplete rows and derive the labels.

        Valence and arousal are binarised at 3.0 (strictly greater -> 1).
        """
        self.__corpus = pd.read_csv(filepath).dropna(axis=0)
        self.__corpus['val_class'] = self.__corpus['Val.W'].apply(
            lambda x: 1 if x > 3.0 else 0)
        self.__corpus['aro_class'] = self.__corpus['Aro.W'].apply(
            lambda x: 1 if x > 3.0 else 0)
        self.__apply_class2mood()

    def __vectorize(self, vect_type):
        """Fit the requested vectorizer on the corpus text."""
        vd = {
            vect_types.TF_IDF: TfidfVectorizer(ngram_range=(1, 3), min_df=3),
            vect_types.COUNT: CountVectorizer(ngram_range=(1, 3), min_df=10)
        }
        self.__vect = vd[vect_type].fit(self.__corpus.processed_ru)

    def __transform_data(self, transform_type):
        """Vectorize the corpus; return ``(features, one-hot labels)``.

        NOTE(review): a vectorizer is only fitted for
        ``transform_types.FREQ``; any other value relies on ``self.__vect``
        having been fitted by an earlier call.
        """
        if transform_type is transform_types.FREQ:
            self.__vectorize(vect_types.COUNT)
        self.__feats = self.__vect.transform(self.__corpus.processed_ru)
        self.__labels = self.__corpus.multiclass
        return self.__feats, to_categorical(self.__corpus.multiclass)

    def __eval_model(self, y_train, y_test, y_train_pred, y_test_pred):
        """Print classification reports for the train and test splits."""
        class_names = [self._CLASS_NAMES[key]
                       for key in sorted(self._CLASS_NAMES)]
        print('train scores\n')
        print(
            classification_report(y_train,
                                  y_train_pred,
                                  target_names=class_names))
        print('test scores\n')
        print(
            classification_report(y_test,
                                  y_test_pred,
                                  target_names=class_names))

    def __fit_classifier(self):
        """Fit the ridge model on an 80/20 split and print its scores."""
        X_train, X_test, y_train, y_test = train_test_split(self.__feats,
                                                            self.__labels,
                                                            test_size=0.2)
        self.__model = RidgeClassifier(alpha=100,
                                       class_weight='balanced').fit(
                                           X_train, y_train)
        y_train_pred = self.__model.predict(X_train)
        y_test_pred = self.__model.predict(X_test)
        self.__eval_model(y_train, y_test, y_train_pred, y_test_pred)

    def make_classifier(self):
        """Load the corpus and train the bag-of-n-grams ridge classifier."""
        self.__load_corpus('emo_bank_ru.csv')
        self.__transform_data(transform_types.FREQ)
        self.__fit_classifier()

    def __make_sequences(self, max_length):
        """Tokenize the corpus; return ``(padded sequences, one-hot labels)``.

        Also stores the fitted tokenizer and the vocabulary size.
        """
        t = Tokenizer()
        t.fit_on_texts(self.__corpus.processed_ru.tolist())
        # +1 because Keras word indexes start at 1 (0 is the padding value).
        self.__vocab_size = len(t.word_index) + 1
        encoded_docs = t.texts_to_sequences(self.__corpus.processed_ru)
        feats = sequence.pad_sequences(encoded_docs, maxlen=max_length)
        self.__tokenizer = t
        return feats, to_categorical(self.__corpus.multiclass)

    def __make_single_sequence(self, text, max_length):
        """Encode one text with the stored tokenizer and pad it."""
        encoded_doc = self.__tokenizer.texts_to_sequences([text])
        feats = sequence.pad_sequences(encoded_doc, maxlen=max_length)
        return feats

    def __create_net(self, max_length):
        """Build and compile the embedding -> Conv1D -> LSTM network."""
        model = Sequential()
        model.add(
            Embedding(self.__vocab_size,
                      100,
                      input_length=max_length,
                      embeddings_regularizer=regularizers.l2(1e-5)))
        model.add(Dropout(0.7))
        model.add(
            Conv1D(filters=50,
                   kernel_size=3,
                   padding='same',
                   activation='sigmoid'))
        model.add(MaxPooling1D(pool_size=10))
        model.add(Dropout(0.4))
        model.add(LSTM(25, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(4, activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['categorical_accuracy', self.__top_2_acc])
        self.__nnet = model

    def __prepare_net_data(self, preprocess_option, max_length):
        """Build the network inputs/targets for the chosen preprocessing.

        Each entry is a callable so that only the selected pipeline runs;
        a dict of already-computed results would tokenize *and* vectorize
        the corpus on every call, whichever option was requested.
        """
        preprocess = {
            preprocess_options.SEQ:
                lambda: self.__make_sequences(max_length),
            preprocess_options.BOW:
                lambda: self.__transform_data(transform_types.FREQ),
        }
        self.__netin, self.__netout = preprocess[preprocess_option]()

    def __top_2_acc(self, y_true, y_pred):
        """Keras metric: the true class is among the two highest scores."""
        return top_k_categorical_accuracy(y_true, y_pred, k=2)

    def __train_net(self):
        """Train the network on a 90/10 split, checkpointing the best epoch."""
        X_train, X_test, y_train, y_test = train_test_split(self.__netin,
                                                            self.__netout,
                                                            test_size=0.1)
        # Keyword arguments work on both old and new scikit-learn (newer
        # releases reject positional classes/y).  Keras documents
        # class_weight as a {class index: weight} mapping, so the returned
        # array is converted to a dict.
        classes = np.array([0, 1, 2, 3])
        weights = compute_class_weight(
            class_weight='balanced',
            classes=classes,
            y=self.__corpus.multiclass.apply(int).tolist())
        class_weight = dict(zip(classes.tolist(), weights))
        checkpointer = ModelCheckpoint(filepath='checkpoint.hdf5',
                                       verbose=1,
                                       save_best_only=True,
                                       monitor='val_loss')
        self.__nnet.fit(X_train,
                        y_train,
                        epochs=100,
                        batch_size=64,
                        # Keras documents validation_data as a tuple.
                        validation_data=(X_test, y_test),
                        callbacks=[checkpointer],
                        class_weight=class_weight)

    def make_neural_net(self):
        """Load the corpus, then build and train the sequence network."""
        self.__load_corpus('emo_bank_ru.csv')
        self.__prepare_net_data(preprocess_options.SEQ, self.__max_length)
        self.__create_net(self.__max_length)
        self.__train_net()

    def __clean_text(self, text):
        """Strip unwanted characters from ``text`` and lemmatize its tokens.

        Returns the normal forms of all tokens longer than two characters,
        joined by single spaces.
        """
        # Build the analyser once and reuse it: constructing it on every
        # call is needlessly expensive.
        if self.__morph is None:
            self.__morph = pymorphy2.MorphAnalyzer()
        morph = self.__morph
        text = re.sub(r'[1-9a-zA-Z\^\*\/\$\@\_\"\\n\)\(\.\,\:\;\!\[\]]', ' ',
                      text)
        tokens = [
            morph.parse(w)[0].normal_form
            for w in gensim.utils.simple_preprocess(
                text, deacc=True, min_len=1) if len(w) > 2
        ]
        return ' '.join(tokens)

    def __transform_cleaned_text(self, cleaned_text):
        """Vectorize already-cleaned texts with the fitted vectorizer."""
        return self.__vect.transform(cleaned_text)

    def run_classifier(self, text):
        """Classify ``text`` with the ridge model.

        Prints the result and returns the mood description.  Requires
        ``make_classifier`` to have been run first.
        """
        cleaned_text = self.__clean_text(text)
        feats = self.__transform_cleaned_text([cleaned_text])
        pred_class = self.__model.predict(feats)
        mood = self._CLASS_NAMES[pred_class[0]]
        print('text: %s\nclassified as %s' % (text, mood))
        return mood

    def run_neural_network(self, text):
        """Classify ``text`` with the neural network.

        Prints the result and returns the mood description.  Requires a
        trained or loaded network and tokenizer.
        """
        cleaned_text = self.__clean_text(text)
        feats = self.__make_single_sequence(cleaned_text, self.__max_length)
        pred_class = self.__nnet.predict(feats)
        mood = self._CLASS_NAMES[np.argmax(pred_class[0])]
        print('text: %s\nclassified as %s' % (text, mood))
        return mood

    def save_neural_net(self, filename):
        """Save the network and tokenizer under the ``filename`` prefix.

        Writes ``<filename>.json`` (architecture), ``<filename>.h5``
        (weights) and ``<filename>_tokenizer.pkl`` (tokenizer).
        """
        net_json = self.__nnet.to_json()
        with open(filename + '.json', "w") as json_file:
            json_file.write(net_json)
        self.__nnet.save_weights(filename + '.h5')
        with open(filename + '_tokenizer.pkl', 'wb') as output:
            pickle.dump(self.__tokenizer, output, pickle.HIGHEST_PROTOCOL)

    def load_neural_net(self, filename):
        """Restore the network and tokenizer written by ``save_neural_net``.

        NOTE(review): the tokenizer is restored with ``pickle.load``; only
        load files from a trusted source.
        """
        # Context manager so the handle is closed even if reading fails.
        with open(filename + '.json', 'r') as json_file:
            loaded_model_json = json_file.read()
        self.__nnet = model_from_json(loaded_model_json)
        self.__nnet.load_weights(filename + ".h5")
        with open(filename + '_tokenizer.pkl', 'rb') as inp:
            self.__tokenizer = pickle.load(inp)

    def load_checkpoint(self):
        """Load the weights stored in ``checkpoint.hdf5`` into the network."""
        self.__nnet.load_weights('checkpoint.hdf5')
# Written by John Tindel # # Further information on the models presented here can be found in modelDocumentation.ipynb # ############################################################################################# from sklearn.linear_model import RidgeClassifier from sklearn.pipeline import Pipeline from sklearn.svm import LinearSVC from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier from sklearn.naive_bayes import BernoulliNB, MultinomialNB from sklearn.neighbors import KNeighborsClassifier, NearestCentroid from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier from sklearn.linear_model import LogisticRegression from sklearn.feature_selection import SelectFromModel #The model parameters are stored in this area ridge = RidgeClassifier(alpha=2, solver="sag") logit = LogisticRegression(solver="sag") perceptron = Perceptron(n_iter=50) passiveAggressive = PassiveAggressiveClassifier(n_iter=20, loss='hinge') knn = KNeighborsClassifier(n_neighbors=5) nearestCentroid = NearestCentroid() L1SVC = LinearSVC(loss='squared_hinge', penalty='l1', dual=False) L2SVC = LinearSVC(loss='squared_hinge', penalty='l2', dual=False) L1SGD = SGDClassifier(alpha=.0001, n_iter=10, penalty='L1') L2SGD = SGDClassifier(alpha=.0001, n_iter=10, penalty='L2') elasticNet = SGDClassifier(alpha=.0001, n_iter=175, penalty="elasticnet") MNB = MultinomialNB(alpha=.01) BNB = BernoulliNB(binarize=.01, alpha=.01) pipeline = Pipeline([ ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))),
clf.fit(X_train, y_train) print(f""" Results ~~~~~~~~~~~~~~~~~~~~~ Train Score: {clf.score(X_train, y_train):.2f} --- Test Score: {clf.score(X_test, y_test):.2f} Best Parameters: {clf.best_params_} """) return clf seed = 6 models = { 'LogReg': LogisticRegression(), 'KNN': KNeighborsClassifier(), 'DT': DecisionTreeClassifier(random_state=seed), 'Gaussian': GaussianNB(), 'Multinomial': MultinomialNB(), 'LDA': LinearDiscriminantAnalysis(), 'LinearSVC': LinearSVC(max_iter=1250, random_state=seed), 'SGD': SGDClassifier(random_state=seed), 'ADA': AdaBoostClassifier(random_state=seed), 'Bagging': BaggingClassifier(random_state=seed), 'Ridge': RidgeClassifier(random_state=seed), 'RF': RandomForestClassifier(random_state=seed), 'GradientBoost': GradientBoostingClassifier(random_state=seed) }
from env_stocktrading import StockTradingEnv

# Four-split time-series cross-validator.
tscv = TimeSeriesSplit(n_splits=4)

# Define classifiers and parameters.
# Registry of candidate models, keyed by a short display name.
classifiers = {
    "LR": LogisticRegression(solver='liblinear'),
    "LDA": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "AdaBoost": AdaBoostClassifier(),
    "Bagging": BaggingClassifier(),
    "ETE": ExtraTreesClassifier(),
    "GB": GradientBoostingClassifier(),
    "RF": RandomForestClassifier(),
    "RidgeC": RidgeClassifier(),
    "SGD": SGDClassifier(),
    "BNB": BernoulliNB(),
    "GNB": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "MLP": MLPClassifier(),
    "NuSVC": NuSVC(probability=True, kernel='rbf', nu=0.01),
    "SVC": SVC(C=0.025, probability=True),
    "DTC": DecisionTreeClassifier(),
    "ETC": ExtraTreeClassifier(),
    "XGB": XGBClassifier(),
}

# Hyper-parameter grids, keyed like ``classifiers``.
# Must connect each parameter to the named step in your pipeline with a
# double underscore __.
parameters = {
    "LR": {
        "classifier__C": [0.1, 0.5, 1, 5, 10, 50, 80, 100],
    },
}
def classify(granularity=10):
    """Train a ridge classifier on clustered GEOTEXT data and report errors.

    Loads the training documents from
    ``processed_data/<granularity>_clustered/`` and the test documents from
    ``processed_data/test`` (both under ``GEOTEXT_HOME``), vectorises them
    with TF-IDF, fits a ``RidgeClassifier`` and, for every test document,
    measures the distance between the user's location (``userLocation``) and
    the mean / median location of the predicted class.  Summary distances are
    printed and a distance-vs-confidence figure is written to
    ``confidence.png`` under ``GEOTEXT_HOME``.

    Parameters
    ----------
    granularity : int, optional
        Used only to build the name of the training directory.

    Returns
    -------
    None
    """
    trainDir = path.join(GEOTEXT_HOME, 'processed_data/' + str(granularity).strip() + '_clustered/')
    testDir = path.join(GEOTEXT_HOME, 'processed_data/test')

    data_train = load_files(trainDir, encoding=encoding)
    data_test = load_files(testDir, encoding=encoding)
    categories = data_train.target_names

    def size_mb(docs):
        """Return the encoded size of ``docs`` in megabytes."""
        return sum(len(s.encode(encoding)) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()

    # split a training set and a test set
    y_train = data_train.target
    y_test = data_test.target

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    vectorizer = TfidfVectorizer(use_idf=True, norm='l2', binary=False,
                                 sublinear_tf=True, min_df=2, max_df=1.0,
                                 ngram_range=(1, 1), stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    feature_names = np.asarray(vectorizer.get_feature_names())

    # Optional chi-squared feature selection (disabled by default).
    chi = False
    if chi:
        k = 500000
        # Report the real k (the message used to be formatted with a literal 0).
        print("Extracting %d best features by a chi-squared test" % k)
        t0 = time()
        ch2 = SelectKBest(chi2, k=k)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        # Keep the names aligned with the reduced columns, otherwise the
        # "top 10 keywords" lookup below would index the wrong vocabulary.
        feature_names = feature_names[ch2.get_support(indices=True)]
        print("done in %fs" % (time() - t0))
        print()

    # clf = LinearSVC(loss='l2', penalty='l2', dual=True, tol=1e-3)
    clf = RidgeClassifier(tol=1e-2, solver="auto")
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    scores = clf.decision_function(X_test)
    test_time = time() - t0
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(scores.shape)
    print(pred.shape)
    print("test time: %0.3fs" % test_time)

    # score = metrics.f1_score(y_test, pred)
    # print("f1-score: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print("top 10 keywords per class:")
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print("%s: %s" % (category, " ".join(feature_names[top10])))

    sumMeanDistance = 0
    sumMedianDistance = 0
    distances = []
    confidences = []
    randomConfidences = []
    for i in range(len(pred)):
        # The test file name is the key into userLocation ("lat,lon" string).
        user = path.basename(data_test.filenames[i])
        location = userLocation[user].split(',')
        lat = float(location[0])
        lon = float(location[1])

        prediction = categories[pred[i]]
        # Confidence: margin of the predicted class over the mean class score.
        confidence = scores[i][pred[i]] - mean(scores[i])
        randomConfidence = scores[i][random.randint(0, len(categories) - 1)]
        confidences.append(confidence)
        randomConfidences.append(randomConfidence)

        medianDistance = distance(lat, lon,
                                  classLatMedian[prediction],
                                  classLonMedian[prediction])
        meanDistance = distance(lat, lon,
                                classLatMean[prediction],
                                classLonMean[prediction])
        distances.append(medianDistance)
        sumMedianDistance = sumMedianDistance + medianDistance
        sumMeanDistance = sumMeanDistance + meanDistance

    averageMeanDistance = sumMeanDistance / float(len(pred))
    averageMedianDistance = sumMedianDistance / float(len(pred))
    print("Average mean distance is " + str(averageMeanDistance))
    print("Average median distance is " + str(averageMedianDistance))
    print("Median distance is " + str(median(distances)))

    fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
    plt.xlim(0, 4000)
    plt.ylim(0, 2)
    ax1.scatter(distances, confidences)
    ax2.bar(distances, confidences)
    plt.savefig(path.join(GEOTEXT_HOME, 'confidence.png'))
def get_estimator(self):
    """Instantiate the model named by the ``estimator`` kwarg.

    The name is read from ``self.kwargs["estimator"]`` and falls back to
    ``self.ESTIMATOR``.  Most branches also store a hyper-parameter search
    space in ``self.model_params``; the ``GaussianNB``, ``Dummy`` and
    ``Sequential`` branches do not touch it.  Any unrecognised name falls
    back to a ``LogisticRegression`` with a small ``C`` grid.

    For every model except the Keras ``Sequential`` network,
    ``self.kwargs["estimator_params"]`` (default ``{}``) is applied through
    ``set_params`` before the model is returned.

    Returns
    -------
    The configured, unfitted model instance.
    """
    estimator = self.kwargs.get("estimator", self.ESTIMATOR)
    # self.mlflow_log_param("model", estimator)

    # NOTE: regressor options (Linear, RandomForestRegressor, Lasso, Ridge,
    # GBM, KNNRegressor, xgboost) and LDA used to exist here as commented-out
    # branches and are currently disabled.

    # classification models
    if estimator == 'GaussianNB':
        # No proba parameter needed.
        # This branch is part of the same if/elif chain as the others:
        # previously it was a separate `if`, so the chain's final `else`
        # replaced the GaussianNB model with LogisticRegression.
        model = GaussianNB()
    elif estimator == 'Logistic':
        # No proba parameter needed
        # np.arange with the default step of 1: 0.001, 1.001, 2.001, ...
        self.model_params = {'C': np.arange(0.001, 1000)}
        # model = LogisticRegression(C=20.000999999999998)
        model = LogisticRegression()
    elif estimator == 'RandomForestClassifier':
        # No proba parameter needed
        self.model_params = {
            'bootstrap': [True, False],
            'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
            'max_features': ['auto', 'sqrt'],
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
        }
        # model = RandomForestClassifier(n_estimators=1800, n_jobs=-1, max_depth=100, min_samples_split=5, bootstrap=False)
        model = RandomForestClassifier()
    elif estimator == "RidgeClassifier":
        # No predict_proba
        self.model_params = {"alpha": np.arange(0.001, 1000)}
        model = RidgeClassifier(alpha=106.00099999999999)
        # model = RidgeClassifier()
        # model = GridSearchCV(estimator=grid, param_grid=dict(alpha=alphas))
    elif estimator == "KNNClassifier":
        # No Proba parameter needed
        self.model_params = {
            "leaf_size": range(1, 1000),
            "n_neighbors": range(1, 1000),
            "p": [1.0, 2.0]
        }
        # model = KNeighborsClassifier(leaf_size=336, n_neighbors=913, p=2.0)  # positive results
        model = KNeighborsClassifier()
        # model = GridSearchCV(knn, hyperparameters, cv=10)
    elif estimator == "XGBClassifier":
        # Proba: Returns array with the probability of each data example
        # being of a given class.
        self.model_params = {
            'max_depth': range(2, 20, 2),
            'n_estimators': range(60, 220, 40),
            'learning_rate': [0.3, 0.1, 0.01, 0.05],
            'min_child_weight': [1.0, 3.0, 5.0],
            'gamma': [1.0, 3.0, 5.0]
        }
        # model = XGBClassifier(max_depth=14, n_estimators=60, learning_rate=0.1, min_child_weight=1.0, gamma=5.0)  # positive results
        # model = XGBClassifier(max_depth=18, n_estimators=60, learning_rate=0.05, min_child_weight=5, gamma=3.0)  # positive results
        model = XGBClassifier()
        # model = GridSearchCV(XGB, param_grid=params_1, cv=5)
    elif estimator == "Dummy":
        model = DummyClassifier(strategy='uniform', random_state=15)
    elif estimator == "SVC":
        self.model_params = {
            'C': [0.1, 1, 10, 100, 1000],
            'gamma': [0.01, 0.001],
            'kernel': ['rbf', 'poly', 'sigmoid']
        }
        # model = SVC(kernel='sigmoid', C=80, gamma=0.001, probability=True)
        model = SVC(probability=True)
    elif estimator == "Sequential":
        model = Sequential()
        model.add(Flatten())
        model.add(BatchNormalization())
        model.add(Dense(32, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(
            Dense(8,
                  kernel_regularizer=regularizers.l2(0.003),
                  activation='relu',
                  input_shape=(10000, )))
        model.add(
            Dense(8,
                  kernel_regularizer=regularizers.l2(0.003),
                  activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        # model.add(SimpleRNN(1, input_shape=[None, 1], activation='tanh'))
        model.compile(loss='binary_crossentropy',
                      optimizer='Adam',
                      metrics=['accuracy'])
    else:
        # Unknown name: default to logistic regression.
        self.model_params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
        model = LogisticRegression()

    estimator_params = self.kwargs.get("estimator_params", {})
    # set_params is only called for the non-Keras models.
    if estimator != "Sequential":
        model.set_params(**estimator_params)
    return model
fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='newton-cg', tol=0.0001, verbose=0, warm_start=False), RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=None, solver='auto', tol=0.001), RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=15, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=10, min_weight_fraction_leaf=0.0,
import pandas as pd from sklearn.linear_model import LogisticRegression, RidgeClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.neural_network import MLPClassifier from sklearn.model_selection import cross_validate, GridSearchCV from preprocessing import * from constants import * from utils import BaselineClassifierTitanic, TitanicNNClassifier models = { "Baseline": BaselineClassifierTitanic(), "Linear Regression": LogisticRegression(), "Ridge Regression": RidgeClassifier(), "K Neighbors": KNeighborsClassifier(), "Decision Tree": DecisionTreeClassifier(), "Random Forest": RandomForestClassifier(), "Gradient Boosting": GradientBoostingClassifier(), "Neural Network": MLPClassifier(), "Custom ANN": TitanicNNClassifier() } train_accuracies = pd.DataFrame(index=models.keys(), columns=[AVERAGE, PCT_STANDARD_DEVIATION]) test_accuracies = pd.DataFrame(index=models.keys(), columns=[AVERAGE, PCT_STANDARD_DEVIATION]) for model_name, model in models.items():
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    """Return an unfitted model instance for ``model_name`` with params set.

    Stock parameters for the model are looked up in an internal table,
    adjusted when ``is_hp_search`` is set (fewer epochs / estimators for the
    DeepLearning and LGBM families), and then overwritten by any
    ``training_params`` supplied by the caller.  The final parameters are
    applied through ``set_params``.

    Parameters
    ----------
    model_name : str
        Key into the model map (e.g. ``'RidgeClassifier'``).
    training_params : dict or None
        Caller-supplied parameters; they take precedence over the defaults.
    is_hp_search : bool
        When equal to ``True``, cheaper defaults are used for the
        DeepLearning* and LGBM* models.

    Returns
    -------
    The model instance returned by ``set_params``.

    Raises
    ------
    KeyError
        If ``model_name`` is not available (unknown name, or the backing
        library is not installed).
    """
    global keras_imported

    # For Keras
    epochs = 1000
    # if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
    #     print('Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy')
    #     epochs = 100

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'ExtraTreesClassifier': {
            'n_jobs': -1
        },
        'AdaBoostClassifier': {},
        'SGDClassifier': {
            'n_jobs': -1
        },
        'Perceptron': {
            'n_jobs': -1
        },
        'LinearSVC': {
            'dual': False
        },
        'LinearRegression': {
            'n_jobs': -2
        },
        'RandomForestRegressor': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'LinearSVR': {
            'dual': False,
            'loss': 'squared_epsilon_insensitive'
        },
        'ExtraTreesRegressor': {
            'n_jobs': -1
        },
        'MiniBatchKMeans': {
            'n_clusters': 8
        },
        'GradientBoostingRegressor': {
            'presort': False,
            'learning_rate': 0.1,
            'warm_start': True
        },
        'GradientBoostingClassifier': {
            'presort': False,
            'learning_rate': 0.1,
            'warm_start': True
        },
        'SGDRegressor': {
            'shuffle': False
        },
        'PassiveAggressiveRegressor': {
            'shuffle': False
        },
        'AdaBoostRegressor': {},
        'LGBMRegressor': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'LGBMClassifier': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'DeepLearningRegressor': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'DeepLearningClassifier': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    # if os.environ.get('is_test_suite', 0) == 'True':
    #     all_model_params

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if is_hp_search == True:
        if model_name[:12] == 'DeepLearning':
            model_params['epochs'] = 50
        if model_name[:4] == 'LGBM':
            model_params['n_estimators'] = 500

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if
        # the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print(
            'After overwriting our defaults with your values, here are the final params that will be used to initialize the model:'
        )
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    # The max_iter/tol keywords raise TypeError when the installed estimators
    # do not accept them; fall back to the default constructors in that case.
    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
            max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(
            max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
        )
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor(
            calc_feature_importance=True)
        model_map['CatBoostClassifier'] = CatBoostClassifier(
            calc_feature_importance=True)

    if model_name[:12] == 'DeepLearning':
        if keras_imported == False:
            # Suppress some level of logs if TF is installed (but allow it to
            # not be installed, and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except Exception:
                # Deliberately best-effort; narrowed from a bare `except:` so
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                pass

            # Keras is imported lazily and published as module globals so the
            # import cost is only paid when a DeepLearning model is requested.
            global maxnorm
            global Dense, Dropout
            global LeakyReLU, PReLU, ThresholdedReLU, ELU
            global Sequential
            global keras_load_model
            global regularizers, optimizers
            global Activation
            global KerasRegressor, KerasClassifier

            from keras.constraints import maxnorm
            from keras.layers import Activation, Dense, Dropout
            from keras.layers.advanced_activations import LeakyReLU, PReLU, ThresholdedReLU, ELU
            from keras.models import Sequential
            from keras.models import load_model as keras_load_model
            from keras import regularizers, optimizers
            from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
            keras_imported = True

        model_map['DeepLearningClassifier'] = KerasClassifier(
            build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(
            build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError:
        print(
            'It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize'
        )
        # Bare raise re-raises the same KeyError with its original traceback.
        raise

    # In the test suite, force single-process execution.
    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1

    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
def Eval(XTrain, YTrain, XTest, YTest, clf, return_predicted_labels=False):
    """Fit ``clf`` on the training data and score it on the test data.

    Inputs:
        XTrain - N by D matrix of training data vectors
        YTrain - N by 1 matrix of training class labels
        XTest - M by D matrix of testing data vectors
        YTest - M by 1 matrix of testing class labels
        clf - either a classifier instance exposing ``.fit`` and
              ``.predict``, or a string naming one (matched
              case-insensitively by substring): "ridge", "perceptron",
              "passive aggressive" / "passive-aggressive",
              "linsvm" / "linearsvm" / "linearsvc", "svm" / "svc", "sgd"
        return_predicted_labels - when true, the predicted labels are
              appended to the returned tuple
    Outputs:
        A tuple containing (in the following order):
            Accuracy
            Overall Precision (micro-averaged)
            Overall Recall (micro-averaged)
            Overall F1 score (micro-averaged)
            Avg. Precision per class
            Avg. Recall per class
            Avg. F1 Score per class
            Precision per class
            Recall per class
            F1 Score per class
            (if return_predicted_labels) predicted class labels for each row in XTest
    Raises:
        ValueError - if ``clf`` is a string that names no known classifier
    """
    if isinstance(clf, str):
        name = clf.lower()
        # Order matters: the "linear SVM" spellings must be tested before the
        # generic "svm"/"svc" substrings.
        if 'ridge' in name:
            clf = RidgeClassifier(tol=1e-2, solver="lsqr")
        elif "perceptron" in name:
            clf = Perceptron(n_iter=50)
        elif "passive aggressive" in name or 'passive-aggressive' in name:
            clf = PassiveAggressiveClassifier(n_iter=50)
        elif 'linsvm' in name or 'linearsvm' in name or 'linearsvc' in name:
            clf = LinearSVC()
        elif 'svm' in name or 'svc' in name:
            clf = SVC()
        elif 'sgd' in name:
            clf = SGDClassifier()
        else:
            # Previously the string fell through and failed later with an
            # opaque "'str' object has no attribute 'fit'".
            raise ValueError("Unknown classifier name: %r" % (clf,))

    clf.fit(XTrain, YTrain)
    YPred = clf.predict(XTest)

    accuracy = sklearn.metrics.accuracy_score(YTest, YPred)
    (overall_precision, overall_recall, overall_f1, support) = \
        sklearn.metrics.precision_recall_fscore_support(YTest, YPred, average='micro')
    (precision_per_class, recall_per_class, f1_per_class, support_per_class) = \
        sklearn.metrics.precision_recall_fscore_support(YTest, YPred)

    avg_precision_per_class = np.mean(precision_per_class)
    avg_recall_per_class = np.mean(recall_per_class)
    avg_f1_per_class = np.mean(f1_per_class)

    metrics_tuple = (accuracy, overall_precision, overall_recall, overall_f1,
                     avg_precision_per_class, avg_recall_per_class,
                     avg_f1_per_class, precision_per_class, recall_per_class,
                     f1_per_class)
    if return_predicted_labels:
        return metrics_tuple + (YPred,)
    return metrics_tuple
if opts.print_cm: print("confusion matrix:") print(metrics.confusion_matrix(y_test, pred)) print() clf_descr = str(clf).split('(')[0] return clf_descr, score, train_time, test_time results = [] for clf, name in ((LogisticRegression(penalty='l2', C=1.0, max_iter=100, solver='newton-cg'), "Logistic Regression"), (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"), (Perceptron(n_iter=50), "Perceptron"), (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"), (KNeighborsClassifier(n_neighbors=10), "kNN"), (RandomForestClassifier(n_estimators=100), "Random forest")): print('=' * 80) print(name) results.append(benchmark(clf)) for penalty in ["l2", "l1"]: print('=' * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model results.append(
if __name__ == "__main__":
    # Pickle data is binary: open in 'rb' and let the context manager close
    # the handle (it used to be opened in text mode and never closed).
    with open(features_evaluation.SELECTED_FEATURES_CORPUS_CHI2, 'rb') as filehandler:
        corpus = pickle.load(filehandler)

    dataset = Dataset(corpus=corpus)
    X = dataset.get_train_x()
    y = dataset.get_train_y()
    scores_dict = defaultdict(list)

    clf1 = LogisticRegression(C=0.05, random_state=1, class_weight='balanced')
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = svm.SVC(C=0.35, class_weight='balanced')
    clf4 = RidgeClassifier(alpha=2.5)
    clf5 = AdaBoostClassifier(n_estimators=150)
    # Hard (majority) vote over the five base classifiers.
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                        ('svm', clf3), ('rc', clf4),
                                        ('ab', clf5)],
                            voting='hard')

    # Densify once instead of once per classifier.
    X_dense = X.toarray()
    for clf, label in zip([clf1, clf2, clf3, clf4, clf5, eclf],
                          ['Logistic Regression', 'Random Forest', 'SVM',
                           'Ridge Classifier', 'Ada boost', 'Ensemble']):
        scores = cross_val_score(clf, X_dense, y, cv=5, scoring='f1_macro')
        scores_dict[label].append(scores.mean())
        print("f1_macro: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

    # Refit on the resampled training set.
    X, y = dataset.get_resampled_train_X_y(kind='regular')
    X_dense = X.toarray()
    clf1.fit(X_dense, y)
    clf2.fit(X_dense, y)
] classifiersv2 = [ KNeighborsClassifier(3), SVC(kernel="linear", C=0.025), SVC(gamma=2, C=1), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), AdaBoostClassifier(), GaussianNB(), LDA(), KNeighborsClassifier(n_neighbors=10), PassiveAggressiveClassifier(n_iter = 50), Perceptron(n_iter =50), RidgeClassifier(tol=1e-2,solver = 'lsqr'), MultinomialNB(alpha=.001), BernoulliNB(alpha=.001), GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,max_depth=1, loss='deviance'), SGDClassifier(alpha=.001, n_iter=50, penalty='l1'), SGDClassifier(alpha=.001, n_iter=50, penalty='l2'), NearestCentroid() ]
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import RidgeClassifier
from sklearn.dummy import DummyClassifier

# Classifiers under comparison, keyed by a short code.  Disabled entries are
# kept as comments.
classifiers = {
    'knn': KNeighborsClassifier(),
    #'lsvm': SVC(kernel="linear"),
    #'rbfsvm': SVC(gamma=2),
    #'gp': GaussianProcessClassifier(),
    'dt': DecisionTreeClassifier(),
    'rf': RandomForestClassifier(),  # default worse than suggested values
    'mlp': MLPClassifier(),  # default worse than suggested values
    'adb': AdaBoostClassifier(),
    'nb': GaussianNB(),
    #'qda': QuadraticDiscriminantAnalysis(),
    'ridge': RidgeClassifier(),
    '-dumbase-': DummyClassifier(strategy="most_frequent"),
}

# Human-readable column titles and their matching short codes (same order).
statnames = [
    'Classifiers',
    'Avg. Test-acc',
    'Avg. Train-acc',
    'Std. Test-acc',
    'Std. Train-acc',
    'Avg. Test-time',
    'Avg. Train-time',
]
statcodes = ['clfn', 'mtsts', 'mtrns', 'vtsts', 'vtrns', 'predt', 'trint']

# sanity-check: at least one repetition
if not (REPEAT > 1):
    REPEAT = 1
# sanity-check: at least two folds
if not (CVFOLDS >= 2):
    CVFOLDS = 2

# scikit-learn documentation recommends using StratifiedKFold for classification
# problems to preserve class balance across folds. however, in this case,
# we use KFold and RepeatedKFold because
# number of items in a class <= CVFOLDS (works only with 2 folds for entire dataset)
if opts.print_report: print("classification report:") print(metrics.classification_report(y_test, pred, target_names=target_names)) if opts.print_cm: print("confusion matrix:") print(metrics.confusion_matrix(y_test, pred)) print() clf_descr = str(clf).split("(")[0] return clf_descr, score, train_time, test_time results = [] for clf, name in ( (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"), (Perceptron(max_iter=50), "Perceptron"), (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"), (KNeighborsClassifier(n_neighbors=10), "kNN"), (RandomForestClassifier(), "Random forest"), ): print("=" * 80) print(name) results.append(benchmark(clf)) for penalty in ["l2", "l1"]: print("=" * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))
if has_bias: assert '<BIAS>' in neg or '<BIAS>' in pos assert res == explain_weights(reg) @pytest.mark.parametrize(['clf'], [ [LogisticRegression(random_state=42)], [ LogisticRegression( random_state=42, multi_class='multinomial', solver='lbfgs') ], [LogisticRegression(random_state=42, fit_intercept=False)], [LogisticRegressionCV(random_state=42)], [RidgeClassifier(random_state=42)], [RidgeClassifierCV()], [SGDClassifier(random_state=42)], [SGDClassifier(random_state=42, loss='log')], [PassiveAggressiveClassifier(random_state=42)], [Perceptron(random_state=42)], [LinearSVC(random_state=42)], [OneVsRestClassifier(SGDClassifier(random_state=42))], ]) def test_explain_linear(newsgroups_train, clf): assert_explained_weights_linear_classifier(newsgroups_train, clf) @pytest.mark.parametrize(['clf'], [ [OneVsRestClassifier(SGDClassifier(random_state=42))], [OneVsRestClassifier(LogisticRegression(random_state=42))],
Pipeline([("Scaler", StandardScaler()), ("DecisionTrees", DecisionTreeClassifier())]))) clfs.append(("RandomForestClassifier", Pipeline([("Scaler", StandardScaler()), ("RandomForest", RandomForestClassifier())]))) clfs.append(("GradientBoostingClassifier", Pipeline([("Scaler", StandardScaler()), ("GradientBoosting", GradientBoostingClassifier(max_features=15, n_estimators=150))]))) clfs.append(("RidgeClassifier", Pipeline([("Scaler", StandardScaler()), ("RidgeClassifier", RidgeClassifier())]))) clfs.append(("BaggingRidgeClassifier", Pipeline([("Scaler", StandardScaler()), ("BaggingClassifier", BaggingClassifier())]))) clfs.append(("ExtraTreesClassifier", Pipeline([("Scaler", StandardScaler()), ("ExtraTrees", ExtraTreesClassifier())]))) #'neg_mean_absolute_error', 'neg_mean_squared_error','r2' scoring = 'accuracy' n_folds = 7 results, names = [], []
def test_class_weights():
    """Check that ``class_weight`` changes RidgeClassifier's decisions."""
    features = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                         [1.0, 1.0], [1.0, 0.0]])
    labels = [1, 1, 1, -1, -1]
    probe = [[0.2, -1.0]]

    # Baseline: without class weights the probe point is assigned class 1.
    unweighted = RidgeClassifier(class_weight=None)
    unweighted.fit(features, labels)
    assert_array_equal(unweighted.predict(probe), np.array([1]))

    # A tiny weight on class 1 should rotate the hyperplane clock-wise, so the
    # prediction for the probe point shifts to class -1.
    downweighted = RidgeClassifier(class_weight={1: 0.001})
    downweighted.fit(features, labels)
    assert_array_equal(downweighted.predict(probe), np.array([-1]))

    # class_weight='balanced' must cope with negative labels.
    balanced = RidgeClassifier(class_weight='balanced')
    balanced.fit(features, labels)
    assert_array_equal(balanced.predict(probe), np.array([1]))

    # With the same number of samples for every label, 'balanced' and None
    # should produce the same model.
    features = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0]])
    labels = [1, 1, -1, -1]
    plain = RidgeClassifier(class_weight=None)
    plain.fit(features, labels)
    auto_balanced = RidgeClassifier(class_weight='balanced')
    auto_balanced.fit(features, labels)
    assert len(auto_balanced.classes_) == 2
    assert_array_almost_equal(plain.coef_, auto_balanced.coef_)
    assert_array_almost_equal(plain.intercept_, auto_balanced.intercept_)
def get_ridge_plot(best_param_, experiment_, param_keys_, param_vals_, png_folder, png_fname, score_threshold=0.8):
    """Plot the ridge regularization path and decision-score distributions.

    The figure has three panels: the coefficient path over every candidate
    ``alpha`` (left), a histogram of the test-set decision function (top
    right) and its empirical CDF (bottom right).  It is saved to
    ``<data.path>/<png_folder>/<png_fname>``.

    Parameters
    ----------
    best_param_ : dict
        Selected RidgeClassifier parameters; must contain ``'alpha'`` and
        ``'fit_intercept'``.  It is no longer modified by this function.
    experiment_ : object
        Provides ``get_train_data()``, ``get_test_data()``,
        ``get_proba(clf, X)`` and ``get_data_col_name()``.
    param_keys_, param_vals_ : sequences
        Zipped into the search grid; the grid must contain ``'model_type'``
        (discarded) and ``'alpha'`` (the candidate values to sweep).
    png_folder, png_fname : str
        Output location relative to the configured ``data.path``.
    score_threshold : float, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    bool
        Always ``True``.
    """
    parameters = dict(zip(param_keys_, param_vals_))
    del parameters['model_type']
    alphas = list(parameters['alpha'])

    X_train, y_train = experiment_.get_train_data()
    n_features = len(X_train.columns.values)
    best_alpha = best_param_['alpha']

    # Model with the selected parameters.  It is kept separate from the
    # path model below: previously one estimator was reused, so the test
    # scores plotted later came from the *last* alpha of the sweep.
    best_clf = RidgeClassifier()
    best_clf.set_params(**best_param_)
    best_clf.fit(X_train, y_train)

    result = {'alphas': [],
              'coefs': np.zeros((len(alphas), n_features + 1)),
              'scores': [],
              'score': None}

    # Work on a copy so the caller's dict keeps its original alpha.
    path_params = dict(best_param_)
    path_clf = RidgeClassifier()
    for i, alpha in enumerate(alphas):
        result['alphas'].append(alpha)
        path_params['alpha'] = alpha
        path_clf.set_params(**path_params)
        path_clf.fit(X_train, y_train)

        # regularization path: column 0 is the intercept, 1.. the features
        if path_params['fit_intercept']:
            row = np.append(path_clf.intercept_, path_clf.coef_)
        else:
            # No intercept fitted: leave slot 0 at zero and store the
            # coefficients (the old code stored intercept_ here by mistake).
            row = np.zeros(n_features + 1, dtype=np.float32)
            row[1:] = np.ravel(path_clf.coef_)
        result['coefs'][i, :] = row
        result['scores'].append(experiment_.get_proba(path_clf, X_train))
    del X_train, y_train

    # 3. plot
    gs = GridSpec(2, 2)
    ax1 = plt.subplot(gs[:, 0])
    ax2 = plt.subplot(gs[0, 1])
    ax3 = plt.subplot(gs[1, 1])

    # 3.1 feature importance
    labels = np.append(np.array(['intercept'], dtype='S100'),
                       experiment_.get_data_col_name())
    alpha_axis = np.array(result['alphas'])
    nrows, ncols = result['coefs'].shape
    for ncol in range(ncols):
        ax1.plot(alpha_axis, result['coefs'][:, ncol], label=labels[ncol])
    ax1.legend(loc='best')
    ax1.set_xscale('log')
    ax1.set_title("Regularization Path:%1.3e" % (best_alpha))
    ax1.set_xlabel("alpha", fontsize=10)

    # 3.2 PDF
    X_test, y_test = experiment_.get_test_data()
    result['score'] = best_clf.decision_function(X_test)
    sns.distplot(result['score'], kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF : Decision_Function")

    # 3.3 CDF
    num_bins = 100
    # `density=True` replaces the `normed=True` keyword, which newer NumPy
    # releases no longer accept.
    try:
        counts, bin_edges = np.histogram(result['score'], bins=num_bins, density=True)
    except Exception:
        # Best-effort fallback to NumPy's default binning.
        counts, bin_edges = np.histogram(result['score'], density=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Decision_Function:Confidence_Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), png_folder, png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()
    return True
num_folds = 5 seed = 2 scoring = 'accuracy' models = [] names = [] cv_scores = [] test_accuracy = [] precisions = [] recalls = [] models.append( ('LR', LogisticRegression(multi_class='multinomial', solver='newton-cg'))) models.append(('SVC', LinearSVC(multi_class='crammer_singer'))) models.append(('KNN', KNeighborsClassifier())) models.append(('Ridge', RidgeClassifier())) models.append(('RF', RandomForestClassifier())) # Crossvalidate all the models and also calculate the test accuracies and other metrics for each model for name, model in models: names.append(name) kfold = StratifiedKFold(n_splits=num_folds, random_state=seed) cv_results = cross_val_score(model, X_train_scaled, y_train, cv=kfold, scoring=scoring) cv_score_mean = round(cv_results.mean(), 3) cv_scores.append(cv_score_mean)
#!/usr/bin/env python
"""
Ridge classifier baseline for Avito

Fits a RidgeClassifier on one-hot encoded subcategories and prints a
classification report, the average precision and the ROC AUC on a held-out
quarter of the data.
"""

__author__ = "deniederhut"
__license__ = "GPL"

import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

data = pd.read_table('/Users/dillonniederhut/Desktop/avito_train.tsv', nrows=100000)  # replace with file path to your training data
features = pd.get_dummies(data.subcategory)
features_train, features_test, target_train, target_test = \
    train_test_split(features, data.is_blocked, test_size=0.25)

ridge = RidgeClassifier()
ridge.fit(features_train, target_train)

# predict() already returns class labels, so no rounding is needed.
prediction = ridge.predict(features_test)
# Ranking metrics need continuous scores; hard labels collapse the
# precision-recall / ROC curve to a single operating point.
scores = ridge.decision_function(features_test)

# Single-argument call form prints the same on Python 2 and Python 3.
print(classification_report(target_test, prediction))
print(average_precision_score(target_test, scores))
print(roc_auc_score(target_test, scores))
X_train = ch2.fit_transform(X_train, y_train) X_test = ch2.transform(X_test) if feature_names: # keep selected feature names feature_names = [ feature_names[i] for i in ch2.get_support(indices=True) ] return X_train, feature_names, ch2, vectorizer # else: # print(simple_classify(MultinomialNB(),test_x,test_y,train_x,k.target)) # print(simple_classify(RandomForestClassifier(),test_x,test_y,train_x,k.target)) # print(simple_classify(RidgeClassifier(),test_x,test_y,train_x,k.target)) # print(simple_classify(KNeighborsClassifier(),test_x,test_y,train_x,k.target)) # print(simple_classify(Perceptron(),test_x,test_y,train_x,k.target)) # print(simple_classify(PassiveAggressiveClassifier(),test_x,test_y,train_x,k.target)) #New Code For PAN k, y_train, a = read_pan.read_pan(pan_train) k_t, test_y, a_t = read_pan.read_pan(pan_test) train_x, f_names, chi, transformer = feature_extraction2(givenlabel, k) test_x, _, _, _ = feature_extraction2(givenlabel, k_t) # train_x,f_names,chi,transformer=feature_extraction(givenlabel,k) print(simple_classify(MultinomialNB(), test_x, test_y, train_x, k)) print(simple_classify(RandomForestClassifier(), test_x, test_y, train_x, k)) print(simple_classify(RidgeClassifier(), test_x, test_y, train_x, k)) print(simple_classify(KNeighborsClassifier(), test_x, test_y, train_x, k)) print(simple_classify(Perceptron(), test_x, test_y, train_x, k)) print( simple_classify(PassiveAggressiveClassifier(), test_x, test_y, train_x, k))
# Notation:
# N: number of training examples; K: number of models in level 0
# X: feature matrix; y: result array; z_k: prediction result array for the k-th model
#
# Set up 10-fold cross validation
fold_num = 10
kf = KFold(n_samples, k=fold_num, indices=True)

# number of neighbors used by kNN
n_neighb = 19

# Level-0 models (brute-force implementation)
clf_mNB = MultinomialNB(alpha=.01)
clf_kNN = KNeighborsClassifier(n_neighbors=n_neighb)
clf_ridge = RidgeClassifier(tol=1e-1)
clf_lSVC = LinearSVC(loss='l2', penalty='l2', C=0.5, dual=False, tol=1e-3)
clf_SVC = SVC(C=32, gamma=0.0625)
# clf_SGD = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

# One empty int32 array per model to collect its predictions z_k
# (each name gets its own, independent array).
z_mNB, z_kNN, z_ridge, z_lSVC, z_SVC = (
    np.array([], dtype=np.int32) for _ in range(5)
)

###############################################################################
# Stacking
#
# (tail of the benchmark helper whose header lies above this chunk; the first
# print is assumed to sit inside a conditional opened there — TODO confirm)
        print(
            metrics.classification_report(y_test, pred,
                                          target_names=target_names))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    # Keep only the class name: everything before the first '(' of the repr.
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


# Collect one benchmark() result per classifier configuration.
results = []
for clf, name in ((RidgeClassifier(tol=1e-2, solver="lsqr"),
                   "Ridge Classifier"), (Perceptron(n_iter=50), "Perceptron"),
                  (PassiveAggressiveClassifier(n_iter=50),
                   "Passive-Aggressive"), (KNeighborsClassifier(n_neighbors=10),
                                           "kNN"),
                  (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

# Benchmark LinearSVC once per penalty type.
for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(
        benchmark(LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)))
# Hold-out evaluation of a RidgeClassifier on a CSV file: the first column is
# read as the integer label, the remaining columns as float features.
# The file targets Python 2 (hence the 'rb' mode for the csv module); the
# print calls below use the single-argument parenthesised form, which prints
# the same text on Python 2 and Python 3.
with open(train_file, 'rb') as train_handle:  # closed even if parsing fails
    data = list(csv.reader(train_handle))
data = data[1:]  # remove header
random.shuffle(data)

X = np.array([row[1:] for row in data]).astype(float)
Y = np.array([row[0] for row in data]).astype(int)

# First three quarters train, last quarter test.  Floor division keeps the
# cutoff an integer so it is always usable as a slice bound.
train_cutoff = len(data) * 3 // 4

X_train = X[:train_cutoff]
Y_train = Y[:train_cutoff]
X_test = X[train_cutoff:]
Y_test = Y[train_cutoff:]

classifier = RidgeClassifier(normalize=True, alpha=1)
classifier = classifier.fit(X_train, Y_train)

# Score the model that was just fitted instead of fitting it a second time.
# NOTE(review): score() is the mean accuracy, so the "Training error" label is
# arguably misleading; the message is left unchanged for log compatibility.
print('Training error : %s' % (classifier.score(X_train, Y_train)))

Y_predict = classifier.predict(X_test)

# Count matching predictions with one vectorised comparison.
equal = int(np.sum(Y_predict == Y_test))

print('Accuracy = %s' % (float(equal) / len(Y_predict)))
"EMC strives to keep your personal information accurate. We have implemented technology, management processes and policies to maintain data integrity. We will provide you with access to your information when reasonable, or in accordance with relevant laws, including making reasonable effort to provide you with online access and the opportunity to change your information. To protect your privacy and security, we will take steps to verify your identity before granting access or making changes to your personal information. To access and/or correct information, you can do so online or notify us via the appropriate method below depending on which site is at issue", "Your information to our service providers. We use service providers who help us to provide you with our services. We give relevant persons working for some of these providers access to your information, but only to the extent necessary for them to perform their services for us. We also implement reasonable contractual and technical protections to ensure the confidentiality of your personal information and data is maintained, used only for the provision of their services to us, and handled in accordance with this privacy policy. Examples of service providers include payment processors, email service providers, and web traffic analytics tools", "Some Microsoft sites allow you to choose to share your personal information with select Microsoft partners so that they can contact you about their products, services or offers. Other sites, such as MSN instead may give you a separate choice as to whether you wish to receive communications from Microsoft about a partner's particular offering (without transferring your personal information to the third party). See the Communication Preferences section below for more information.", ] X_new = vectorizer.transform(docs_new) # Train classifiers print "Training Classifiers..." 
# Record the start time before building and fitting the classifiers.
t0 = time()

# Candidate classifiers with hard-coded hyper-parameters.
clf_nb = MultinomialNB()
clf_lsvc = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
clf_svc = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True)
clf_rdg = RidgeClassifier(tol=1e-1)
# NOTE(review): clf_sgd is constructed here but not fitted in the lines below
# — confirm whether it is trained further down.
clf_sgd = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

# Logistic regression requires OneVsRestClassifier which hides
# its methods such as decision_function
# It will require extra implementation efforts to use it as a candidate
# for multilabel classification
# clf_lgr = OneVsRestClassifier(LogisticRegression(C=1000,penalty='l1'))

# kNN does not have decision function due to its nature
# clf_knn = KNeighborsClassifier(n_neighbors=13)

# train
# Every fitted model sees the same feature matrix X and labels y.
clf_nb.fit(X, y)
clf_lsvc.fit(X, y)
clf_rdg.fit(X, y)
clf_svc.fit(X, y)
###Training with libraries categories = None remove = () X_train = cityName; print('Creating the vectorizer and chosing a transform (from raw text to feature)') vect= TfidfVectorizer(sublinear_tf=True, max_df=0.5) #vect=CountVectorizer(min_n=1,max_n=2,max_features=1000); X_train = vect.fit_transform(X_train) cityClass = RidgeClassifier(tol=1e-7) countryClass = RidgeClassifier(tol=1e-7) print('Creating a classifier for cities') cityClass.fit(X_train,cityCode) print('Creating a classifier for countries') countryClass.fit(X_train,countryCode) print('testing the performance'); testCityNames = vect.transform(cityNameTest); predictionsCity = countryClass.predict(testCityNames); predictionsCountry = cityClass.predict(testCityNames); with open('predictions.csv','w') as csvfile:
def __init__(self, num_arms=3):
    """Create the object with an unfitted ridge classifier and no training data.

    Args:
        num_arms: Value stored as ``self.K`` (defaults to 3).
    """
    # Fresh, unfitted classifier; the flag starts out True.
    self.clf = RidgeClassifier()
    self.dont_fit = True
    # No samples or labels are held at construction time.
    self.training_data = self.training_labels = None
    self.K = num_arms
vocabulary = np.array([t for t, i in sorted(vectorizer.vocabulary.iteritems(), key=itemgetter(1))]) # ch2 = SelectKBest(chi2, k=200) # X_train = ch2.fit_transform(X_train, y_train) # X_test = ch2.transform(X_test) # print "X_train: n_samples: %d, n_features: %d" % X_train.shape # print "X_test : n_samples: %d, n_features: %d" % X_test.shape # print X_train = X_train.toarray() X_test = X_test.toarray() # clf = BernoulliNB(alpha=.1) # clf = MultinomialNB(alpha=.01) # clf = KNeighborsClassifier(n_neighbors=3) clf = RidgeClassifier(tol=1e-1) # clf = RandomForestClassifier(n_estimators=20, max_depth=None, min_split=3, random_state=42) # clf = SGDClassifier(alpha=.01, n_iter=50, penalty="l2") # clf = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3) clf.fit(X_train, y_train) pred = clf.predict(X_test) print "y : ", y_test print "pred : ", pred print # # print out top words for each category # for i, category in enumerate(categories): # top = np.argsort(clf.coef_[i, :])[-20:]
from sklearn import preprocessing

# Replace the raw values of train['label1'] with their LabelEncoder codes.
lbl = preprocessing.LabelEncoder()
lbl.fit(train['label1'].values)
train['label1'] = lbl.transform(train['label1'].values)
label = train['label1']
# Class count is taken as the largest encoded label plus one.
num_class = train['label1'].max() + 1

#======================= Model training: 5-fold cross-validation =========================================
n_folds = 5
# Score accumulators: one row per sample, one column per class.
stack_train = np.zeros((train.shape[0], num_class))
stack_test = np.zeros((test.shape[0], num_class))
# NOTE(review): StratifiedKFold(label, n_folds=...) presumably targets an old
# scikit-learn release — confirm the pinned version.
for i, (tr, va) in enumerate(
        StratifiedKFold(label, n_folds=n_folds, random_state=42)):
    print('stack:%d/%d' % ((i + 1), n_folds))
    ridge = RidgeClassifier(random_state=42)
    ridge.fit(trn_term_doc[tr], label[tr])
    # NOTE(review): _predict_proba_lr is an underscore-prefixed (non-public)
    # method of the estimator.
    score_va = ridge._predict_proba_lr(trn_term_doc[va])
    score_te = ridge._predict_proba_lr(test_term_doc)
    # Rows of the validation fold receive this fold's scores.
    stack_train[va] += score_va
    # Test scores are added up over the folds; no division by n_folds is
    # visible here, so stack_test holds a sum rather than a mean.
    stack_test += score_te

# Accuracy of the out-of-fold predictions (arg-max over the class columns).
print(
    "model acc_score:",
    metrics.accuracy_score(label,
                           np.argmax(stack_train, axis=1),
                           normalize=True,
                           sample_weight=None))

## Get the first and second labels: simply take the two with the highest probability:
# N: number for training examples; K: number of models in level 0 # X: feature matrix; y: result array; z_k: prediction result array for k's model # # Setup 10 fold cross validation fold_num = 10 kf = KFold(n_samples, k=fold_num, indices=True) # set number of neighbors for kNN n_neighb = 19 # Brute-force implementation clf_bNB = BernoulliNB(alpha=.01) clf_mNB = MultinomialNB(alpha=.01) clf_kNN = KNeighborsClassifier(n_neighbors=n_neighb) clf_ridge = RidgeClassifier(tol=1e-1) clf_lSVC = LinearSVC(loss='l2', penalty='l2', C=0.5, dual=False, tol=1e-3) clf_SVC = SVC(C=32, gamma=0.0625, probability=True) # clf_SGD = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2") ############################################################################### # Stacking # # initialize empty y and z n_categories = len(set(y)) z = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=float) # z = np.zeros( (n_samples, n_categories) , dtype=float) # Test for 10 rounds using the results from 10 fold cross validations for i, (train_index, test_index) in enumerate(kf):
epsilon = 1e-15 pred = sp.maximum(epsilon, pred) pred = sp.minimum(1-epsilon, pred) ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred))) ll = ll * -1.0/len(act) return ll # add two columns for hour and weekday def dayhour(timestr): d = datetime.strptime(str(x), "%y%m%d%H") return [float(d.weekday()), float(d.hour)] fh = FeatureHasher(n_features = 2**20, input_type="string") # Train classifier clf = RidgeClassifier() train = pd.read_csv("train/subtrain.csv", chunksize = 100000, iterator = True) all_classes = np.array([0, 1]) for chunk in train: y_train = chunk["click"] chunk = chunk[cols] chunk = chunk.join(pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"])) chunk.drop(["hour"], axis=1, inplace = True) Xcat = fh.transform(np.asarray(chunk.astype(str))) clf.fit(Xcat, y_train) # Create a submission file usecols = cols + ["id"] X_test = pd.read_csv("test/mtest.csv", usecols=usecols) X_test = X_test.join(pd.DataFrame([dayhour(x) for x in X_test.hour], columns=["wd", "hr"])) X_test.drop(["hour"], axis=1, inplace = True)
def main(): startCol = 0 endCol = 50 # max = 1775 train = csv_io.read_data("../Data/train.csv") target = [x[0] for x in train][1:3000] targetTest = [x[0] for x in train][3001:] trainTest = [x[startCol+1:endCol+1] for x in train][3001:] test = csv_io.read_data("../Data/test.csv") test = [x[startCol:endCol] for x in test] train = [x[startCol+1:endCol+1] for x in train][1:3000] fo = open("knn_stats.txt", "a+") rf = RidgeClassifier(alpha=0.01, fit_intercept=True, normalize=False, copy_X=True, tol=0.001) rf.fit(train, target) prob = rf.predict(trainTest) # changed from test result = 100 probSum = 0 for i in range(0, len(prob)): probX = prob[i] # [1] if ( probX > 0.7): probX = 0.7; if ( probX < 0.3): probX = 0.3; print i, probSum, probX, target[i] print target[i]*log(probX), (1-target[i])*log(1-probX) probSum += targetTest[i]*log(probX)+(1-targetTest[i])*log(1-probX) #print probSum #print len(prob) #print "C: ", 10**C, " gamma: " ,2**g print -probSum/len(prob) if ( -probSum/len(prob) < result ): result = -probSum/len(prob) predicted_probs = rf.predict(test) # was test predicted_probs = ["%f" % x for x in predicted_probs] csv_io.write_delimited_file("../Submissions/knn.csv", predicted_probs) print "Generated Data!!" #fo.write(str(5) + str(5)+ str(5)); fo.close() #csv_io.write_delimited_file("../Submissions/rf_benchmark_test2.csv", predicted_probs) #predicted_probs = rf.predict_proba(train) # changed from test #predicted_probs = ["%f" % x[1] for x in predicted_probs] #predicted_probs = rf.predict(train) # changed from test #predicted_probs = ["%f" % x for x in predicted_probs] #csv_io.write_delimited_file("../Submissions/rf_benchmark_train2.csv", predicted_probs) var = raw_input("Enter to terminate.")
names = [ "Logistic Regression", "Linear SVC", "LinearSVC with L1-based feature selection", "Multinomial NB", "Bernoulli NB", "Ridge Classifier", "AdaBoost", "Perceptron", "Passive-Aggresive", "Nearest Centroid" ] classifiers = [ LogisticRegression(), LinearSVC(), Pipeline([('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))), ('classification', LinearSVC(penalty="l2"))]), MultinomialNB(), BernoulliNB(), RidgeClassifier(), AdaBoostClassifier(), Perceptron(), PassiveAggressiveClassifier(), NearestCentroid() ] zipped_clf = zip(names, classifiers) tvec = TfidfVectorizer() def classifier_comparator(vectorizer=tvec, n_features=10000, stop_words=None, ngram_range=(1, 1), classifier=zipped_clf):
# N: number for training examples; K: number of models in level 0 # X: feature matrix; y: result array; z_k: prediction result array for k's model # # Setup 10 fold cross validation fold_num = 10 kf = KFold(n_samples, k=fold_num, indices=True) # set number of neighbors for kNN n_neighb = 13 # Brute-force implementation clf_bNB = BernoulliNB(alpha=.01) clf_mNB = MultinomialNB(alpha=.01) clf_kNN = KNeighborsClassifier(n_neighbors=n_neighb) clf_ridge = RidgeClassifier(tol=1e-1) clf_SGD = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2") clf_lSVC = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3) clf_SVC = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True) ############################################################################### # Stacking # # initialize empty y and z print 'X_den shape: ', X_den.shape print 'y shape: ', y.shape n_categories = len(set(y)) z = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=float)
def main() :
    """Compare classifiers by cross-validation on the full and the reduced dataset.

    Loads the dataset through ``genericFunctions``, keeps the columns whose
    names appear in the selected feature file to build a reduced copy,
    normalizes both copies by sample, and then scores every classifier in the
    list on the same stratified folds for both copies, printing per-fold and
    aggregate accuracy.
    """

    # a few hard-coded variables, change here if you want to modify random seed or number of folds in the cross-validation
    nFolds = 10
    randomSeed = 42

    # here the feature file is selected
    featureFile = "../results/feature-importance-efs.csv"
    #featureFile = "../results/feature-importance-elastic-net.csv"
    #featureFile = "../results/feature-importance-recursive-feature-elimination-svc.csv"
    #featureFile = "../results/feature-importance-univariate.csv"

    def scoreOverFolds(template, dataset, labels, folds, reportFormat) :
        """Fit a fresh copy of ``template`` on every fold; return the test scores."""
        testScores = []
        for foldNumber, (trainRows, testRows) in enumerate(folds) :
            # standardize by feature, with statistics taken from the training rows only
            featureScaler = StandardScaler()
            trainingData = featureScaler.fit_transform(dataset[trainRows])
            testData = featureScaler.transform(dataset[testRows])

            # deep copy, so the classifier in the list is never fitted itself
            candidate = copy.deepcopy(template)
            candidate.fit(trainingData, labels[trainRows])

            trainingAccuracy = candidate.score(trainingData, labels[trainRows])
            testAccuracy = candidate.score(testData, labels[testRows])
            print(reportFormat % (foldNumber, trainingAccuracy, testAccuracy))
            testScores.append(testAccuracy)
        return testScores

    # load dataset
    X, y, featureNames = genericFunctions.loadTCGADataset()
    print("Training dataset (original):", X.shape)

    # load selected features
    selectedFeatures = genericFunctions.loadFeatures(featureFile)

    # create reduced dataset: keep the columns whose name is among the selected features
    print("Reading feature file \"" + featureFile + "\"...")
    featureIndexes = []
    for position in range(0, len(featureNames)) :
        if featureNames[position] in selectedFeatures :
            featureIndexes.append(position)
    X_reduced = X[:,featureIndexes]
    print("Training dataset (reduced):", X_reduced.shape)

    print("Normalizing by samples...")
    normalizeBySample = True
    if normalizeBySample :
        from sklearn.preprocessing import normalize
        X = normalize(X)
        X_reduced = normalize(X_reduced)

    # FINALLY, WE CAN CLASSIFY AWAY!
    classifierList = [

            #[RandomForestClassifier(), "RandomForestClassifier()"],
            [BaggingClassifier(n_estimators=300), "BaggingClassifier(n_estimators=300)"],
            [GradientBoostingClassifier(n_estimators=300), "GradientBoostingClassifier(n_estimators=300)"],
            [RandomForestClassifier(n_estimators=300), "RandomForestClassifier(n_estimators=300)"],
            [LogisticRegression(), "LogisticRegression"], # coef_
            [PassiveAggressiveClassifier(), "PassiveAggressiveClassifier"], # coef_
            [RidgeClassifier(), "RidgeClassifier"], # coef_
            [SGDClassifier(), "SGDClassifier"], # coef_
            [SVC(kernel='linear'), "SVC(linear)"], # coef_, but only if the kernel is linear...the default is 'rbf', which is NOT linear
            ]

    # 10-fold cross-validation; the folds are computed once and shared by both datasets
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits = nFolds, shuffle=True, random_state=randomSeed)
    foldIndexes = list(skf.split(X, y))

    for originalClassifier, classifierName in classifierList :

        # all folds on the original dataset
        print("\nClassifier " + classifierName + " on original dataset...")
        classifierPerformance = scoreOverFolds(
            originalClassifier, X, y, foldIndexes,
            "\tFold #%d: training: %.4f, test: %.4f")

        # all folds again, this time on the reduced dataset
        print("Classifier " + classifierName + " on reduced dataset...")
        classifierPerformanceReduced = scoreOverFolds(
            originalClassifier, X_reduced, y, foldIndexes,
            "\tFold %d: training: %.4f, test: %.4f")

        print("Classifier %s, performance on original dataset: %.4f (+/- %.4f)" % (classifierName, np.mean(classifierPerformance), np.std(classifierPerformance)))
        print("Classifier %s, performance on reduced dataset: %.4f (+/- %.4f)" % (classifierName, np.mean(classifierPerformanceReduced), np.std(classifierPerformanceReduced)))

    return
# (continuation of a loop header opened above this chunk — presumably
# `for clf, name in (` given the names used in the body; TODO confirm)
        (LDA(), "Linear Discriminant Analysis"),
        (LinearSVC(), "SVM")
        ):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf, name))

# Attach classifier to the original json file

# loading dtm file for all twitts
# NOTE(review): pickle.load executes arbitrary code from the file; make sure
# this .pkl only ever comes from a trusted source.
fp = open('./python_files/twitter_dtm.pkl', 'rb')
dtm = pkl.load(fp)
fp.close()

# Predict the labels using Ridges classifier
clf = RidgeClassifier(alpha=1.,tol=1e-2, solver="lsqr")
clf.fit(X_train, y_train)
predicted_labels = clf.predict(dtm)

# loading json file for all twitts
file_name = '../R Project/Data/obamacare.json'
line_reader = open(file_name,'r') # r means for reading

# building a new json file for all twitts + new predicted labels
new_file_name = '../R Project/Data/obamacare_labeled.json'
# NOTE(review): neither handle is closed in the visible lines — confirm both
# are closed after the loop.
line_writer = open(new_file_name,'w') # w means for writing

# adding the predicted label to each entry of json file
# twit_i indexes predicted_labels for the line currently being read.
twit_i = 0
for line in line_reader:
    label = predicted_labels[twit_i]
# (tail of the benchmark helper whose header lies above this chunk; the first
# two prints are assumed to sit inside a conditional opened there — TODO confirm)
        print("classification report:")
        print(metrics.classification_report(y_test, pred, target_names=categories))

    # The confusion matrix is printed unconditionally (the guard is a literal True).
    if True:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    # Keep only the class name: everything before the first '(' of the repr.
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


# Collect one benchmark() result per classifier configuration.
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

# Benchmark LinearSVC once per penalty type.
for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)))