def train_and_predict_m8(train, test, labels):
    """Stem the raw text, build word n-gram TF-IDF features and predict the
    test labels with a RidgeClassifier (optionally grid-searched).

    Parameters
    ----------
    train, test : raw text collections (consumed by ``stemmer_clean``).
    labels : training targets aligned with ``train``.

    Returns
    -------
    Predicted labels for ``test``.
    """
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM7, stemmer_type='porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df=5, max_features=None, strip_accents='unicode',
                          analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 5),
                          smooth_idf=1, sublinear_tf=1, stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    print("Fitting Ridge Classifer...")
    # BUG FIX: class_weight='auto' was deprecated and then removed from
    # scikit-learn (0.19+); 'balanced' is the drop-in replacement with the
    # same reweighting semantics.
    # NOTE(review): `normalize` was also removed in scikit-learn 1.2 — if the
    # project targets a modern sklearn this needs a StandardScaler instead;
    # confirm the pinned version before changing it.
    clf = RidgeClassifier(class_weight='balanced', alpha=1, normalize=True)

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'alpha': [0.1, 0.3, 1, 3, 10], 'normalize': [True, False]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch):
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
def linear_readout(Xtrain, Ytrain, Xtest, Ytest):
    '''
    Readout (accuracy) evaluation.
    To assess the uniqueness of the projected patterns a ridge classifier
    is fitted on the flattened training batches and scored on the test set.
    Input:
        - Xtrain, ... (torch.Tensor): dataset. Xtrain/Xtest hold the projected
          values; the Y label tensors are used unchanged.
    Output:
        - accuracy_score (float): classification accuracy on the test data.
    '''
    from sklearn.linear_model import RidgeClassifier
    from sklearn.metrics import accuracy_score

    def _flatten(batched_x, batched_y):
        # Collapse (num_batches, batch_size, ...) into a flat sample axis.
        n_samples = batched_x.shape[0] * batched_x.shape[1]
        x = batched_x.cpu().numpy().reshape(n_samples, -1)
        y = batched_y.cpu().numpy().reshape(n_samples, -1).ravel()
        return x, y

    x_tr, y_tr = _flatten(Xtrain, Ytrain)
    x_te, y_te = _flatten(Xtest, Ytest)

    model = RidgeClassifier()
    model.fit(x_tr, y_tr)
    return accuracy_score(y_te, model.predict(x_te))
def __train_lr(
    self,
    x_train_fused,
    y_train_fused,
    x_train_unfused,
    y_train_unfused,
    num_rows_having_paraphrases,
):
    """Fit ridge classifiers on the fused and unfused feature sets.

    Returns (cv_scores, cv_accuracy, fused_model, unfused_model); cross
    validation is skipped (scores=[], accuracy=-1) when no paraphrase rows
    are available, since 2-fold CV would fail.
    """
    model_unfused = RidgeClassifier(alpha=1.0)
    model_unfused.fit(x_train_unfused, y_train_unfused)

    model_fused = RidgeClassifier(alpha=1.0)
    model_fused.fit(x_train_fused, y_train_fused)

    # Guard clause: no paraphrases means CV folds cannot be stratified sanely.
    if num_rows_having_paraphrases < 1:
        logging.warning(
            "Classifier data had no questions with paraphrases. This makes cross validation checks fail, so they will be skipped"
        )
        return [], -1, model_fused, model_unfused

    cv_scores = cross_val_score(model_fused, x_train_fused, y_train_fused, cv=2)
    cv_predicted = cross_val_predict(model_fused, x_train_fused, y_train_fused, cv=2)
    cv_accuracy = metrics.accuracy_score(y_train_fused, cv_predicted)
    return cv_scores, cv_accuracy, model_fused, model_unfused
def evaluate_random_binning(X, y, X_test, y_test, M, task):
    """Build random binning features and report the test error for a
    classification or regression task.

    Returns the test error (percentage) or the string 'error!' for an
    unknown task name.
    """
    # construct random binning features
    start_time = time.time()
    rb = RandomBinning(X.shape[1], M)
    # BUG FIX: the original divided the (features, extras) tuple returned by
    # get_features() by sqrt(M) *before* unpacking it, which raises a
    # TypeError.  Unpack first, then scale the feature matrix.
    Z, _ = rb.get_features(X)
    Z = Z / np.sqrt(M)
    Z_test, _ = rb.get_features(X_test, expand=False)
    Z_test = Z_test / np.sqrt(M)

    if (task == 'classification'):
        clf = RidgeClassifier(alpha=0.0001, solver='lsqr')
        clf.fit(Z, y)
        y_pred = clf.predict(Z_test)
        # Error derived from the sign agreement of +/-1 labels.
        error_test = (0.5 - np.dot(np.sign(y_test), y_pred) / len(y_test) / 2) * 100
        print("--- %s seconds ---" % (time.time() - start_time))
        print("C = %d; error_test = %.2f" % (np.shape(Z)[1], error_test) + '%')
    elif (task == 'regression'):
        clf = Ridge(alpha=0.01, solver='lsqr', random_state=42)
        clf.fit(Z, y)
        y_pred = clf.predict(Z_test)
        # Relative L2 error in percent.
        error_test = np.linalg.norm((y_test - y_pred)) / np.linalg.norm(y_test) * 100
        print("--- %s seconds ---" % (time.time() - start_time))
        print("C = %d; error_test = %.2f" % (np.shape(Z)[1], error_test) + '%')
    else:
        error_test = 'error!'
        print('No such a task, please check the task name!')
    return error_test
def Parameter_regularization(train):
    '''
    Effect of the regularization strength (alpha) on the model.
    Fits a RidgeClassifier on 2/3 of the sample for ten alpha values and
    plots the macro F1 score on the remaining 1/3.
    :param train: dataframe with 'text' and 'label' columns
    :return: None (shows a matplotlib plot and prints the scores)
    '''
    sample = train
    split = int(2 * len(sample) / 3)

    tfidf = TfidfVectorizer(ngram_range=(2, 3), max_features=2500)
    matrix = tfidf.fit_transform(sample['text'].values.astype("U"))

    train_x, test_x = matrix[:split], matrix[split:]
    train_y = sample['label'].values[:split]
    test_y = sample['label'].values[split:]

    alphas = [0.15 * (i + 1) for i in range(10)]
    f1 = []
    for alpha in alphas:
        clf = RidgeClassifier(alpha=alpha, solver='sag')
        clf.fit(train_x, train_y)
        val_pred = clf.predict(test_x)
        f1.append(f1_score(test_y, val_pred, average='macro'))

    plt.plot(alphas, f1)
    plt.xlabel('alpha')
    plt.ylabel('f1_score')
    print(f1)
    plt.show()
def explain_ls(self, nsample=200):
    """Fit a local ridge-classifier surrogate around the instance.

    Draws ``nsample`` points uniformly from a sphere of half the radius
    around the midpoint, labels each with the black-box classifier's
    positive-class probability (thresholded at 0.5), and trains a
    RidgeClassifier on the 2-D coordinates; stores it in ``self.explainer``.
    """
    self.Z = []
    self.Z0 = []
    for _ in range(nsample):
        offset = self.rand_sphere(0, self.radius / 2)
        z = [self.mittelpunkt[0] + offset[0], self.mittelpunkt[1] + offset[1]]
        # Map the local 2-D point back into the original feature space.
        z0 = self.retransform(z)
        probe = copy.copy(self.instance)
        probe[self.attr[0]] = z0[0]
        probe[self.attr[1]] = z0[1]
        # Positive-class probability of the perturbed instance.
        p_pos = np.array(self.clf.predict_proba(np.array(probe).reshape(1, -1))[0])[1]
        self.Z.append([z[0], z[1], int(p_pos >= 0.5)])
        self.Z0.append([z0[0], z0[1]])

    surrogate = RidgeClassifier(alpha=1.0)
    samples = np.array(self.Z)
    surrogate.fit(samples[:, 0:2], samples[:, 2])
    self.explainer = surrogate
def ridgeClass(features, response):
    """Train a RidgeClassifier on a 60/40 split and print its test accuracy."""
    from sklearn.linear_model import RidgeClassifier

    split = train_test_split(features, response, test_size=0.4)  # , random_state=4)
    X_train, X_test, y_train, y_test = split

    model = RidgeClassifier()
    model.fit(X_train, y_train)
    print(model.score(X_test, y_test))
class SupervisedBandit:
    """Bandit that reduces action selection to supervised classification.

    Accumulates (features, correct_action) pairs and, once at least two
    distinct action labels exist, fits a RidgeClassifier to predict actions.
    """

    def __init__(self, num_arms=3):
        self.K = num_arms
        self.training_data = None      # torch tensor of feature rows, or None
        self.training_labels = None    # numpy array of action labels
        self.clf = RidgeClassifier()
        self.dont_fit = True           # ridge needs >= 2 classes before fitting

    def take_action(self, features):
        """Return an action tensor: random before any data, the first seen
        label while only one class exists, else the classifier prediction."""
        if self.training_data is None:
            return torch.tensor(np.random.choice(self.K))
        elif not self.dont_fit:  # don't fit until have enough unique classes
            return torch.tensor(self.clf.predict(features))
        else:
            return torch.tensor(self.training_labels[0])

    def add_data(self, features, correct_action):
        """Append one observation and refit once two classes are available."""
        if self.training_data is None:
            self.training_data = features
            self.training_labels = np.array([correct_action])
        else:
            self.training_data = torch.cat((self.training_data, features))
            # BUG FIX: np.concatenate cannot join a 0-d scalar onto an array;
            # wrap the new label in a list (mirrors the np.array([...]) above).
            self.training_labels = np.concatenate(
                (self.training_labels, [correct_action]))
        if len(np.unique(self.training_labels)) > 1:
            # solver needs at least 2 unique classes to fit
            self.dont_fit = False
            self.clf.fit(self.training_data, self.training_labels)
def text_classify_influence_by_add_regularization():
    """
    Explore how the regularization strength (alpha) affects text
    classification: sweep ten alpha values and plot the macro F1 score.
    """
    train_df = pd.read_csv('./data/train_set.csv', sep='\t', nrows=15000)
    sample = train_df[0:5000]
    split = int(2 * len(sample) / 3)

    tfidf = TfidfVectorizer(ngram_range=(2, 3), max_features=2500)
    matrix = tfidf.fit_transform(sample['text'])

    train_x, test_x = matrix[:split], matrix[split:]
    train_y = sample['label'].values[:split]
    test_y = sample['label'].values[split:]

    alphas = [0.15 * (i + 1) for i in range(10)]
    f1 = []
    for alpha in alphas:
        clf = RidgeClassifier(alpha=alpha, solver='sag')
        clf.fit(train_x, train_y)
        val_pred = clf.predict(test_x)
        f1.append(f1_score(test_y, val_pred, average='macro'))

    plt.plot(alphas, f1)
    plt.xlabel('alpha')
    plt.ylabel('f1_score')
    plt.show()
def mlPreddict(label):
    """Predict the class of ``label`` (a name string) with an SVM trained on
    count-vectorized names from classification.csv.

    Returns the predicted class as a string (also printed).
    """
    df = pd.read_csv('classification.csv')
    df_names = df
    Xfeatures = df_names['Names']
    cv = CountVectorizer()
    x = cv.fit_transform(Xfeatures)
    y = df_names.Classes

    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.33,
                                                        random_state=12)

    # CLEANUP: the original also trained KNN, Ridge, NaiveBayes and a decision
    # tree and computed their test scores, but every result was discarded and
    # only the SVM was used for the prediction — that dead work is removed.
    svm_c = svm.SVC()
    svm_c.fit(x_train, y_train)

    vect = cv.transform([label]).toarray()
    prediction = svm_c.predict(vect)
    predict = ''.join(map(str, prediction))
    print(predict)
    return predict
def impute_nan(df, ds, dF):
    # Impute missing categorical labels in two well-log series using ridge
    # classifiers driven by the gamma-ray log (df.GR):
    #   ds -> stratigraphy labels, dF -> formation labels.
    # Returns (Sg, Fm): the label-encoded (integer) stratigraphy and formation
    # series with NaNs filled by classifier predictions.
    # NOTE(review): if ds has no NaNs, Sg is only (re)defined inside the second
    # branch; if dF also has no NaNs, Fm is never bound and the return raises
    # NameError — confirm callers only invoke this when NaNs are present.
    if ds.isnull().any()==True:
        # Encode stratigraphy strings to integers, then restore NaNs where the
        # original series was null so they can be targeted for prediction.
        labeler_st = LabelEncoder()
        rc_st = RidgeClassifier(tol=1e-2, solver="sag")
        Sg = Series(labeler_st.fit_transform(ds.astype(str)), index=ds.index)
        Sg = Sg.where(ds.notnull(), ds, axis=0)
        # Train on rows with known labels (GR as the single feature)...
        x_notna = df.GR[Sg.notnull()].to_numpy().reshape(-1, 1)
        y_notna = Sg[Sg.notnull()].to_numpy().astype('int').ravel()
        # ...and predict the missing ones.
        x_nan = df.GR[Sg.isnull()].to_numpy().reshape(-1, 1)
        rc_st.fit(x_notna,y_notna)
        Sg[Sg.isnull()]=rc_st.predict(x_nan)
        Sg=Series(Sg, index=ds.index).astype(int)
        # Decode back to the original string labels for the ds series.
        ds=Series(labeler_st.inverse_transform(Sg.values.ravel()), index=ds.index)
        #print('\nStratigraphy:', np.unique(ds))
    if dF.isnull().any()==True:
        # Same scheme for formations, but the classifier uses two features:
        # GR plus the (possibly just-imputed) encoded stratigraphy.
        rc_fm = RidgeClassifier(tol=1e-2, solver="sag")
        labeler_fm = LabelEncoder()
        Fm = Series(labeler_fm.fit_transform(dF.astype(str)), index=dF.index)
        # Re-encode stratigraphy here so it reflects any imputation above.
        labeler_st = LabelEncoder()
        Sg=Series(labeler_st.fit_transform(ds.astype(str)), index=ds.index)
        Fm=Fm.where(dF.notnull(), dF, axis=0)
        x_notna = np.concatenate((df.GR[Fm.notnull()].to_numpy().reshape(-1, 1),
                                  Sg[Fm.notnull()].to_numpy().reshape(-1, 1)), axis=1)
        y_notna = Fm[Fm.notnull()].to_numpy().astype('int').ravel()
        x_nan = np.concatenate((df.GR[Fm.isnull()].to_numpy().reshape(-1, 1),
                                Sg[Fm.isnull()].to_numpy().reshape(-1, 1)), axis=1)
        rc_fm.fit(x_notna,y_notna)
        Fm[Fm.isnull()]=rc_fm.predict(x_nan)
        Fm=Series(Fm, index=dF.index).astype(int)
        dF=Series(labeler_fm.inverse_transform(Fm.values.ravel()), index=dF.index)
        #print('\nFormation:', np.unique(dF))
    return Sg, Fm
def run(input_train, input_test, output_name):
    """
    Takes a file path as input, a file path as output, and produces a sorted
    csv of item IDs for Kaggle submission
    -------
    input_train : 'full path of the training file'
    input_test : 'full path of the testing file'
    output_name : 'full path of the output file'
    """
    data = pd.read_table(input_train)
    test = pd.read_table(input_test)
    testItemIds = test.itemid
    response = data.is_blocked
    dummies = sparse.csc_matrix(pd.get_dummies(data.subcategory))
    pretestdummies = pd.get_dummies(test.subcategory)
    # Drop categories absent from the training data so the columns line up.
    testdummies = sparse.csc_matrix(pretestdummies.drop(['Растения', 'Товары для компьютера'], axis=1))
    words = np.array(data.description, str)
    testwords = np.array(test.description, str)
    del data, test

    vect = text.CountVectorizer(decode_error=u'ignore', strip_accents='unicode', ngram_range=(1, 2))
    corpus = np.concatenate((words, testwords))
    vect.fit(corpus)
    counts = vect.transform(words)
    features = sparse.hstack((dummies, counts))

    clf = RidgeClassifier()
    clf.fit(features, response)

    testcounts = vect.transform(testwords)
    testFeatures = sparse.hstack((testdummies, testcounts))
    # BUG FIX: RidgeClassifier has no predict_proba; use decision_function,
    # whose (higher = more confident positive) scores give the same ranking
    # that the sort below relies on.
    predicted_scores = clf.decision_function(testFeatures)

    # Write item IDs ordered by descending confidence.
    with open(output_name, 'w') as f:
        f.write("id\n")
        for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse=True):
            f.write("%d\n" % (item_id))
def test_class_weights():
    """RidgeClassifier honours explicit, None and 'balanced' class weights."""
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                  [1.0, 1.0], [1.0, 0.0]])
    y = [1, 1, 1, -1, -1]

    clf = RidgeClassifier(class_weight=None)
    clf.fit(X, y)
    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))

    # A tiny weight on class 1 rotates the hyperplane clock-wise and the
    # prediction at this point flips.
    clf = RidgeClassifier(class_weight={1: 0.001})
    clf.fit(X, y)
    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1]))

    # class_weight='balanced' must cope with negative labels.
    clf = RidgeClassifier(class_weight='balanced')
    clf.fit(X, y)
    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))

    # With equal label counts, 'balanced' and None must produce the same fit.
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0]])
    y = [1, 1, -1, -1]
    plain = RidgeClassifier(class_weight=None)
    plain.fit(X, y)
    balanced = RidgeClassifier(class_weight='balanced')
    balanced.fit(X, y)
    assert len(balanced.classes_) == 2
    assert_array_almost_equal(plain.coef_, balanced.coef_)
    assert_array_almost_equal(plain.intercept_, balanced.intercept_)
def validate(input_train, rows=True, test=0.25):
    """
    Takes file as input and returns classification report, average precision,
    and AUC for a bigram model. By default, loads all rows of a dataset,
    trains on .75, and tests on .25.
    ----
    input_train : 'full path of the file you are loading'
    rows : True - loads all rows; insert an int for specific number of rows
    test : float proportion of dataset used for testing
    """
    if rows == True:
        data = pd.read_table(input_train)
    else:
        data = pd.read_table(input_train, nrows=rows)

    response = data.is_blocked
    dummies = sparse.csc_matrix(pd.get_dummies(data.subcategory))
    words = np.array(data.description, str)
    del data

    # Bigram bag-of-words combined with sub-category indicator columns.
    vect = text.CountVectorizer(decode_error=u'ignore', strip_accents='unicode', ngram_range=(1, 2))
    counts = vect.fit_transform(words)
    features = sparse.hstack((dummies, counts))

    split = train_test_split(features, response, test_size=test)
    features_train, features_test, target_train, target_test = split

    model = RidgeClassifier()
    model.fit(features_train, target_train)
    prediction = model.predict(features_test)

    return (classification_report(target_test, prediction),
            average_precision_score(target_test, prediction),
            roc_auc_score(target_test, prediction))
def country_based_model(df, input_df, model_evaluator):
    """Predict gender from the last three characters of each name.

    One-hot encodes the final three characters, trains a RidgeClassifier on
    the first 39469 rows, records its training predictions in
    ``model_evaluator['MODEL_PREDICTION']`` and its predictions for the last
    1000 rows in ``df['COUNTRY_MODEL']``.  Returns (df, model_evaluator).
    """
    # Character-position features from the name suffix.
    for pos in ('-3', '-2', '-1'):
        input_df[pos] = input_df['name'].str[int(pos)]
    suffix_cols = ['-3', '-2', '-1']

    one_hot = [pd.get_dummies(input_df[col]) for col in suffix_cols]
    input_df = pd.concat([one_hot[0], one_hot[1], one_hot[2], input_df], axis=1)

    cY = input_df['gender'].head(39469)
    input_df = input_df.drop(columns=['name', 'gender', '-3', '-2', '-1'])
    cX = input_df.head(39469)

    cX_train, cX_test, cy_train, cy_test = train_test_split(
        cX, cY, test_size=0.2, random_state=42)

    model = RidgeClassifier(fit_intercept=False, solver='lsqr')
    model.fit(cX_train, cy_train)

    model_evaluator['MODEL_PREDICTION'] = model.predict(input_df.head(39469))
    df['COUNTRY_MODEL'] = model.predict(input_df.tail(1000))
    return df, model_evaluator
def retrain_models(username):
    """Retrain and store a user's body and header spam models.

    Pulls stored plus new training messages from the DB, refits a TF-IDF +
    LinearSVC model on message bodies and a DictVectorizer + RidgeClassifier
    on headers, then persists both via ``store_models``.
    """
    train_x, train_y, body_x, body_y, head_x, head_y = model_retriever.retrieve_data_db(username)

    # Body features: TF-IDF over extracted body text.
    b_train_x = []
    b_train_y = numpy.concatenate([body_y, train_y])
    for msg in (body_x + train_x):
        b_train_x.append(extract_body_features(msg))
    body_vec = TfidfVectorizer(norm="l2")
    b_train_x = body_vec.fit_transform(b_train_x)

    # Header features: dict-vectorized header attributes.
    h_train_x = []
    h_train_y = numpy.concatenate([head_y, train_y])
    for msg in (head_x + train_x):
        h_train_x.append(extract_header_features(msg))
    head_vec = DictVectorizer()
    h_train_x = head_vec.fit_transform(h_train_x)

    # BUG FIX: LinearSVC's loss='l2' alias was deprecated and then removed
    # from scikit-learn; 'squared_hinge' is the identical loss under its
    # current name.
    body_model = LinearSVC(loss='squared_hinge', penalty="l2", dual=False, tol=1e-3)
    head_model = RidgeClassifier(tol=1e-2, solver="lsqr")

    body_model.fit(b_train_x, b_train_y)
    head_model.fit(h_train_x, h_train_y)

    print("Finished training models for "+username+"...")
    store_models(username, body_vec, body_model, head_vec, head_model)
def text_classify_influence_by_max_features():
    """
    Explore how TF-IDF ``max_features`` affects text classification:
    sweep four vocabulary sizes and plot the macro F1 score.
    """
    train_df = pd.read_csv('./data/train_set.csv', sep='\t', nrows=15000)
    sample = train_df[0:5000]
    n = int(2 * len(sample) / 3)
    f1 = []
    features = [1000, 2000, 3000, 4000]
    for i in range(4):
        tfidf = TfidfVectorizer(ngram_range=(2, 3), max_features=features[i])
        train_test = tfidf.fit_transform(sample['text'])
        train_x = train_test[:n]
        train_y = sample['label'].values[:n]
        test_x = train_test[n:]
        test_y = sample['label'].values[n:]
        # BUG FIX: the original used alpha=0.1*(i+1), so regularization
        # strength varied together with max_features and the comparison was
        # confounded.  Hold alpha fixed so only max_features changes.
        clf = RidgeClassifier(alpha=0.1, solver='sag')
        clf.fit(train_x, train_y)
        val_pred = clf.predict(test_x)
        f1.append(f1_score(test_y, val_pred, average='macro'))
    plt.plot(features, f1)
    plt.xlabel('max_features')
    plt.ylabel('f1_score')
    plt.show()
def rigid(X_train, X_test, y_train):
    """Fit a balanced RidgeClassifier on the training set and return its
    predictions for ``X_test``."""
    from sklearn.linear_model import RidgeClassifier

    model = RidgeClassifier(alpha=4, class_weight='balanced')
    model.fit(X_train, y_train)
    return model.predict(X_test)
def fit_ridge(l2_reg, train_random_features, y_train, test_random_features, y_test):
    """Fit a RidgeClassifier with the given L2 penalty on random features.

    Prints and returns (train_accuracy, test_accuracy).
    """
    model = RidgeClassifier(alpha=l2_reg)
    model.fit(train_random_features, y_train.ravel())

    train_accuracy = model.score(train_random_features, y_train)
    test_accuracy = model.score(test_random_features, y_test)

    print("Train accuracy:", train_accuracy)
    print("Test accuracy:", test_accuracy)
    return train_accuracy, test_accuracy
def Eval(XTrain, YTrain, XTest, YTest, clf, return_predicted_labels=False):
    """
    Inputs:
        XTrain - N by D matrix of training data vectors
        YTrain - N by 1 matrix of training class labels
        XTest - M by D matrix of testin data vectors
        YTrain - M by 1 matrix of testing class labels
        clf - either a string naming a classifier ("ridge", "perceptron",
            "passive aggressive", "linsvm", "svm", "sgd") or an sklearn
            classifier instance with .fit and .predict
    Outputs:
        A tuple containing (in the following order):
            Accuracy
            Overall Precision, Recall, F1 score
            Avg. Precision / Recall / F1 per class
            Precision / Recall / F1 per class
            (if return_predicted_labels) predicted class labels for each row in XTest
    """
    if type(clf) == str:
        # BUG FIX: the n_iter constructor argument was removed from
        # scikit-learn's Perceptron / PassiveAggressiveClassifier; the
        # current parameter is max_iter.
        if 'ridge' in clf.lower():
            clf = RidgeClassifier(tol=1e-2, solver="lsqr")
        elif "perceptron" in clf.lower():
            clf = Perceptron(max_iter=50)
        elif "passive aggressive" in clf.lower() or 'passive-aggressive' in clf.lower():
            clf = PassiveAggressiveClassifier(max_iter=50)
        elif 'linsvm' in clf.lower() or 'linearsvm' in clf.lower() or 'linearsvc' in clf.lower():
            clf = LinearSVC()
        elif 'svm' in clf.lower() or 'svc' in clf.lower():
            clf = SVC()
        elif 'sgd' in clf.lower():
            clf = SGDClassifier()

    clf.fit(XTrain, YTrain)
    YPred = clf.predict(XTest)

    accuracy = sklearn.metrics.accuracy_score(YTest, YPred)
    # Micro-averaged overall scores...
    (overall_precision, overall_recall, overall_f1, support) = \
        sklearn.metrics.precision_recall_fscore_support(YTest, YPred, average='micro')
    # ...and unaveraged per-class scores.
    (precision_per_class, recall_per_class, f1_per_class, support_per_class) = \
        sklearn.metrics.precision_recall_fscore_support(YTest, YPred)
    avg_precision_per_class = np.mean(precision_per_class)
    avg_recall_per_class = np.mean(recall_per_class)
    avg_f1_per_class = np.mean(f1_per_class)

    del clf
    if return_predicted_labels:
        return (accuracy, overall_precision, overall_recall, overall_f1,
                avg_precision_per_class, avg_recall_per_class, avg_f1_per_class,
                precision_per_class, recall_per_class, f1_per_class, YPred)
    else:
        return (accuracy, overall_precision, overall_recall, overall_f1,
                avg_precision_per_class, avg_recall_per_class, avg_f1_per_class,
                precision_per_class, recall_per_class, f1_per_class)
def RFF_Form1_Classification(WN, b, TrainData, ValData, TestData,
                             train_label, val_label, test_label, option=1):
    """Evaluate pseudo random Fourier features (Form 1) with two classifiers.

    For each feature count ``k`` in the interval (currently just the full
    width D of WN) it builds RFF-transformed data and records test accuracy
    for (a) a cross-validated minibatch SGD classifier and (b) a ridge
    classifier.  Returns (PRFSGDAccuracy, PRFRidgeAccuracy) as the nonzero
    entries of the per-k accuracy arrays.

    NOTE(review): the incoming ``b`` is immediately overwritten with zeros —
    presumably intentional for Form 1; confirm with callers.
    """
    import numpy as np
    import time
    import copy
    import sys
    # BUG FIX: time.clock() was removed in Python 3.8; perf_counter is the
    # documented replacement for wall-clock performance timing.
    from time import perf_counter

    D = np.shape(WN)[1]
    b = np.zeros((D, 1))
    bn = copy.copy(b)
    PRFSGDAccuracy = np.zeros((D, 1))
    PRFRidgeAccuracy = np.zeros((D, 1))
    #interval = np.arange(0,D,10)
    interval = [D]
    for k in interval:
        if (k == 0):
            k = k + 1
        W = WN[:, range(k)]
        b = bn[range(k)]
        RFTrainData = FeaturemapTransformation_Form1(W, TrainData)
        RFTestData = FeaturemapTransformation_Form1(W, TestData)

        ## Psuedo RF SGD
        from minibatchSGDCV import minibatchRFSGDCV
        from minibatchSGD import RFminibatchSGD
        st_time = perf_counter()
        ncv = 3
        RFcvparam, RFbestbatchparam, RFmeanscore = minibatchRFSGDCV(
            ValData, val_label, ncv, W, b, option=1, RFoption=1)
        end_time = perf_counter()
        print('PRFSGD Cross Validation Completed')
        print('Time required for PRFSGD CV is =', end_time - st_time)
        PRFclf = RFminibatchSGD(TrainData, train_label, W, b, option=1,
                                batchsize=RFcvparam['batchsize'],
                                alpha=RFcvparam['alpha'],
                                eta0=RFcvparam['eta0'], RFoption=1)
        PRFSGDClassifiedlabel = PRFclf.predict(RFTestData)
        #SCDconfMat=confusion_matrix(test_label,SGDClassifiedlabel)
        PRFSGDAccuracy[k-1] = sum(test_label == PRFSGDClassifiedlabel) / (float(len(test_label)))

        ## Ridge regression Psuedo RF
        from sklearn.linear_model import RidgeClassifier
        from sklearn.metrics import confusion_matrix
        clf = RidgeClassifier(alpha=0.1)
        clf.fit(RFTrainData, train_label)
        RFRidgeClassifiedlabel = clf.predict(RFTestData)
        RFRidgeConfMat = confusion_matrix(test_label, RFRidgeClassifiedlabel)
        PRFRidgeAccuracy[k-1] = sum(test_label == RFRidgeClassifiedlabel) / (float(len(test_label)))

    # Keep only the entries that were actually filled in.
    ind = PRFSGDAccuracy > 0
    PRFSGDAccuracy = PRFSGDAccuracy[ind]
    PRFRidgeAccuracy = PRFRidgeAccuracy[ind]
    return PRFSGDAccuracy, PRFRidgeAccuracy
def do_rc(X_test, X_train, Y_train):
    """Fit a RidgeClassifier on the training data and return predictions
    for ``X_test``, logging progress to stdout."""
    clf = RidgeClassifier()
    # BUG FIX: the original used Python 2 print statements, which are a
    # syntax error under Python 3; converted to print() calls.
    print("starts fitting")
    print(clf.fit(X_train, Y_train))
    print("finished fitting, starts predictions")
    Y_pred = clf.predict(X_test)
    print("finished predictions")
    return Y_pred
def train_ridge(self, x, y, alpha=0.0001):
    """
    Trains a Ridge classifier on the sampled data and classifier predictions
    considering only the chosen_attributes for now, for simplicity
    """
    # TODO: Automate Parameters
    surrogate = RidgeClassifier(alpha=alpha)
    surrogate.fit(x, y)
    self.surrogate = surrogate
class RidgeModel(ccobra.CCobraModel):
    """CCOBRA model for the 'moral' / 'single-choice' domain backed by a
    scikit-learn RidgeClassifier (alpha=7)."""

    def __init__(self, name='Ridge', k=1):
        super(RidgeModel, self).__init__(name, ["moral"], ["single-choice"])
        self.clf = RidgeClassifier(alpha=7)
        # Number of shuffle-and-refit passes in train_network.
        self.n_epochs = 1

    def pre_train(self, dataset):
        # Flatten the nested per-subject / per-sequence dataset into feature
        # vectors (via create_input) and numeric targets (via output_mppng).
        x = []
        y = []
        for subj_train_data in dataset:
            for seq_train_data in subj_train_data:
                seq_train_data['task'] = seq_train_data['item'].task
                inp = create_input(seq_train_data)
                target = float(output_mppng[seq_train_data['response'][0][0]])
                x.append(inp)
                y.append(target)
        x = np.array(x)
        y = np.array(y)
        self.train_x = x
        self.train_y = y
        self.train_network(self.train_x, self.train_y, self.n_epochs, verbose=True)

    def train_network(self, train_x, train_y, n_epochs, verbose=False):
        # NOTE(review): the n_epochs argument is ignored; the loop uses
        # self.n_epochs.  With n_epochs > 1 each pass refits from scratch on a
        # reshuffled copy, so only the last fit matters.
        print('Starting training...')
        for epoch in range(self.n_epochs):
            # Shuffle the training data
            perm_idxs = np.random.permutation(np.arange(len(train_x)))
            train_x = train_x[perm_idxs]
            train_y = train_y[perm_idxs]
            self.clf.fit(train_x, train_y)
        print('Mean accuracy:')
        print(self.clf.score(train_x, train_y))

    def predict(self, item, **kwargs):
        # Build a single feature row for the item and map the numeric class
        # back to a response via output_mppngREV.
        input = {'task': item.task}
        input['aux'] = kwargs
        x = np.array(create_input(input)).reshape(1, -1)
        output = self.clf.predict(x)
        self.prediction = output_mppngREV[output[0]]
        return self.prediction
def classifier(df, vectorizer):
    """Fit a RidgeClassifier on the vectorized text and score its
    predictions on the same (training) data."""
    features = vectorizer.transform(df['text'])
    labels = df['label'].values

    model = RidgeClassifier()
    logging.info('training ... ')
    model.fit(features, labels)

    logging.info('predicting ... ')
    predictions = model.predict(features)
    score(labels, predictions)
def fit():
    """A function that trains the classifiers and ensembles them"""
    # Labels come from the VHSE sheet; each algorithm has its own feature sheet.
    VHSE = pd.read_excel("sequences.xlsx", index_col=0, sheet_name="VHSE")
    Y = VHSE["label1"].copy()
    X_svc = pd.read_excel("esterase_binary.xlsx", index_col=0, sheet_name="ch2_20")
    X_knn = pd.read_excel("esterase_binary.xlsx", index_col=0, sheet_name="random_30")

    # Drop NaN columns plus the "go" column from any table containing NaNs.
    for frame in (X_svc, X_knn):
        if frame.isnull().values.any():
            frame.dropna(axis=1, inplace=True)
            frame.drop(["go"], axis=1, inplace=True)

    # named_tuples
    models = namedtuple("models", ["svc", "ridge", "knn"])
    test = namedtuple(
        "test_samples",
        ["x_svc", "x_knn", "y_svc", "y_knn", "x_test_svc", "x_test_knn"])
    train = namedtuple(
        "train_samples",
        ["svc_x", "knn_x", "svc_y", "knn_y", "x_train_svc", "x_train_knn"])

    # split and train
    transformed_x_svc, test_x_svc, Y_train_svc, Y_test_svc, X_train_svc, X_test_svc = split_transform(
        X_svc, Y)
    transformed_x_knn, test_x_knn, Y_train_knn, Y_test_knn, X_train_knn, X_test_knn = split_transform(
        X_knn, Y)

    # The three base algorithms; SVC and ridge share features, KNN has its own.
    svc = SVC(C=0.31, kernel="rbf", gamma=0.91)
    knn = KNN(n_neighbors=7, p=5, metric="minkowski", n_jobs=-1)
    ridge = RIDGE(alpha=8, random_state=0)

    svc.fit(transformed_x_svc, Y_train_svc)
    ridge.fit(transformed_x_svc, Y_train_svc)
    knn.fit(transformed_x_knn, Y_train_knn)

    # Bundle everything into the named tuples for the caller.
    fitted_models = models(svc, ridge, knn)
    test_sample = test(test_x_svc, test_x_knn, Y_test_svc, Y_test_knn,
                       X_test_svc, X_test_knn)
    train_sample = train(transformed_x_svc, transformed_x_knn, Y_train_svc,
                         Y_train_knn, X_train_svc, X_train_knn)
    return fitted_models, test_sample, train_sample
def RidgeReg(file1, file2):
    """Train a RidgeClassifier on file1 and evaluate on file2.

    Returns (y_true, y_score, y_pred): true labels, decision-function
    scores and hard predictions for the second file.
    """
    train_features, train_labels = file2matrix(file1)
    model = RidgeClassifier()
    model.fit(train_features, train_labels)

    eval_features, eval_labels = file2matrix(file2)
    y_true = eval_labels
    y_score = model.decision_function(eval_features)
    y_pred = model.predict(eval_features)
    return y_true, y_score, y_pred
def scikit_ridgeregression(self, dataset, labels):
    """Fit a RidgeClassifier and print the error rate on the test set.

    Assumes +/-1 labels: a prediction is wrong exactly when
    prediction * true_label == -1.
    """
    from sklearn.linear_model import RidgeClassifier

    model = RidgeClassifier(fit_intercept=False, max_iter=100, random_state=0)
    model.fit(dataset, labels)

    testset, truelabels = self.load_dataset(self.testdata)
    predictions = model.predict(testset)
    products = predictions * truelabels
    err_rate = float(np.sum(products == -1)) / products.shape[0]
    print("Scikit Learn RR Test Error Rate: {:.2f}".format(float(err_rate)))
def main():
    """
    RidgeRegression classifier.
    Trains on DCT-prepared data and prints the balanced accuracy.
    """
    dct = True
    X_train, X_test, y_train, y_test = prepare_datasets(dct)

    clf = RidgeClassifier(alpha=1.0)
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_test)
    balanced = balanced_accuracy_score(y_test, predictions)
    print(f"Balanced accuracy score: {balanced:g}")
def scikit_ridgec_test(size):
    """Benchmark helper: fit and score a RidgeClassifier on a synthetic
    classification problem with ``size`` features."""
    X, y = datasets.make_classification(n_samples=1000,
                                        n_features=size,
                                        n_informative=2,
                                        n_redundant=2)
    split = train_test_split(X, y, test_size=0.25, random_state=42)
    X_train, X_test, y_train, y_test = split

    clf = RidgeClassifier(random_state=42)
    clf.fit(X_train, y_train)
    clf.score(X_test, y_test)
def confusion_matrix_for_problem(X, y, model=None):
    """Return the confusion matrix of ``model`` (default: RidgeClassifier)
    on a stratified hold-out split of (X, y)."""
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, random_state=0, stratify=y)
    model = model if model is not None else RidgeClassifier()
    model.fit(train_X, train_y)
    return confusion_matrix(test_y, model.predict(test_X))
def train_explainer(self):
    """
    Trains a Ridge classifier on the sampled data
    considering only the chosen_attributes for now, for simplicity
    """
    features = self.sample_set[:, self.chosen_attributes]
    targets = self.predictions
    # TODO: Automate Parameters
    explainer = RidgeClassifier(alpha=0.1)
    explainer.fit(features, targets)
    self.explainer = explainer
    return explainer
def ridge_regression():
    """Bag-of-words + RidgeClassifier baseline: train on the first 10000
    rows, validate on the rest, print the macro F1 score."""
    train_df = pd.read_csv(r'C:\Users\Rookie\Desktop\nlp\train_set.csv', sep='\t')

    vectorizer = CountVectorizer(max_features=3000)
    matrix = vectorizer.fit_transform(train_df['text'])
    labels = train_df['label'].values

    model = RidgeClassifier()
    model.fit(matrix[:10000], labels[:10000])

    val_pred = model.predict(matrix[10000:])
    print(f1_score(labels[10000:], val_pred, average='macro'))
def test_raise_without_labels():
    """predict() must raise ValueError when no labels were supplied for
    string-labelled estimators."""
    X = np.random.randn(100, 4)
    y = np.random.randint(0, 10, 100).astype(str)

    fitted = []
    for est in (RidgeClassifier(), LogisticRegression()):
        est.fit(X, y)
        fitted.append(est)

    pvc = PrefittedVotingClassifier(estimators=fitted)
    with pytest.raises(ValueError):
        pvc.predict(X)
def test_string_classification():
    """With string labels supplied, predictions come back as numpy strings."""
    X = np.random.randn(100, 4)
    y = np.random.randint(0, 10, 100)

    fitted = []
    for est in (RidgeClassifier(), LogisticRegression()):
        est.fit(X, y)
        fitted.append(est)

    pvc = PrefittedVotingClassifier(estimators=fitted,
                                    labels=np.arange(0, 10).astype(str))
    pred = pvc.predict(X)
    assert pred.dtype.type is np.str_
def get_optimal_blend_weigth(exp_, best_param_, folder, fname, model_fname):
    """Fit a RidgeClassifier with the tuned parameters on the experiment's
    test split, dump the optimal linear blend weights to CSV and the fitted
    model to a gzipped pickle.  Returns True."""
    clf = RidgeClassifier()
    X_test, y_test = exp_.get_test_data()
    clf.set_params(**best_param_)
    clf.fit(X_test, y_test)

    # dump2csv optimal linear weight: intercept first, then one column per
    # blended feature.
    names = np.append(np.array(['intercept'], dtype='S100'), X_test.columns.values)
    coefs = np.append(clf.intercept_, clf.coef_).astype(np.float64)
    weight_frame = pd.DataFrame(coefs.reshape(1, len(coefs)), columns=names)
    weight_frame.to_csv(os.path.join(Config.get_string('data.path'), folder, fname),
                        index=False)

    # dump2cpkle for ridge model
    model_fname = os.path.join(Config.get_string('data.path'), folder, model_fname)
    with gzip.open(model_fname, 'wb') as gf:
        cPickle.dump(clf, gf, cPickle.HIGHEST_PROTOCOL)
    return True
def Predict():
    """Compare KNN, SVM, Ridge and Multinomial NB classifiers on the
    module-level train/test corpora, printing a hit rate for each."""
    # BUG FIX: the original wrote print('...%d...') % n_test, which applies %
    # to print's return value (None) and raises TypeError on Python 3.
    print('\nThere are %d new deals' % n_test)

    # Using the KNN classifier
    clf_KNN = KNeighborsClassifier(n_neighbors=3)
    # KNN doesnot work even if k has been tuned
    #clf_KNN = KNeighborsClassifier(n_neighbors=7)
    #clf_KNN = KNeighborsClassifier(n_neighbors=11)
    clf_KNN.fit(Corpus_train, Y_train)
    Y_pred_KNN = clf_KNN.predict(Corpus_test)
    print_rate(Y_test, Y_pred_KNN, n_test, 'KNNClassifier')

    # Using the SVM classifier
    clf_SVM = svm.SVC()
    clf_SVM.fit(Corpus_train, Y_train)
    Y_pred_SVM = clf_SVM.predict(Corpus_test)
    print_rate(Y_test, Y_pred_SVM, n_test, 'SVMClassifier')

    # Using the Ridge classifier
    clf_RC = RidgeClassifier(tol=0.01, solver="lsqr")
    #clf_RC = RidgeClassifier(tol=0.1, solver="lsqr")
    clf_RC.fit(Corpus_train, Y_train)
    Y_pred_RC = clf_RC.predict(Corpus_test)
    print_rate(Y_test, Y_pred_RC, n_test, 'RidgeClassifier')

    # won't consider Random Forests or Decision Trees beacause they work bad
    # for high sparse dimensions

    # Using the Multinomial Naive Bayes classifier
    # I expect that this MNB classifier will do the best since it is designed
    # for occurrence counts features
    #clf_MNB = MultinomialNB(alpha=0.01) #smoothing parameter = 0.01 is worse than 0.1
    clf_MNB = MultinomialNB(alpha=0.1)
    #clf_MNB = MultinomialNB(alpha=0.3) #a big smoothing rate doesnot benefit the model
    #clf_MNB = MultinomialNB(alpha=0.2) #or alpha = 0.05 can generate the best outcome
    clf_MNB.fit(Corpus_train, Y_train)
    Y_pred_MNB = clf_MNB.predict(Corpus_test)
    print_rate(Y_test, Y_pred_MNB, n_test, 'MultinomialNBClassifier')
def test_default_configuration_classify(self):
    """Default ExtraTreesPreprocessor config: a ridge classifier trained on
    the transformed digits data must reach the known accuracy."""
    for _ in range(2):
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=False)
        configuration_space = ExtraTreesPreprocessor.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        params = {hp_name: default[hp_name] for hp_name in default}
        preprocessor = ExtraTreesPreprocessor(random_state=1, **params)
        preprocessor.fit(X_train, Y_train)
        X_train_trans = preprocessor.transform(X_train)
        X_test_trans = preprocessor.transform(X_test)

        # fit a classifier on top
        predictor = RidgeClassifier().fit(X_train_trans, Y_train)
        predictions = predictor.predict(X_test_trans)
        accuracy = sklearn.metrics.accuracy_score(predictions, Y_test)
        self.assertAlmostEqual(accuracy, 0.87310261080752882, places=2)
def test_default_configuration_classify(self):
    """Default KernelPCA config: a ridge classifier trained on the
    transformed digits data must reach the known accuracy."""
    for _ in range(5):
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=False)
        configuration_space = KernelPCA.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        # Skip hyperparameters whose default is None.
        params = {hp_name: default[hp_name] for hp_name in default
                  if default[hp_name] is not None}
        preprocessor = KernelPCA(random_state=1, **params)
        preprocessor.fit(X_train, Y_train)
        X_train_trans = preprocessor.transform(X_train)
        X_test_trans = preprocessor.transform(X_test)

        # fit a classifier on top
        predictor = RidgeClassifier().fit(X_train_trans, Y_train)
        predictions = predictor.predict(X_test_trans)
        accuracy = sklearn.metrics.accuracy_score(predictions, Y_test)
        self.assertAlmostEqual(accuracy, 0.096539162112932606)
def test_default_configuration_classify(self):
    """Default TruncatedSVD config reaches the reference accuracy on sparse digits."""
    for _ in range(2):
        X_tr, y_tr, X_te, y_te = get_dataset(dataset='digits',
                                             make_sparse=True)
        space = TruncatedSVD.get_hyperparameter_search_space()
        default = space.get_default_configuration()
        # Drop unset (None) hyperparameters so the constructor keeps its own defaults.
        params = {name: default[name]
                  for name in default
                  if default[name] is not None}
        reducer = TruncatedSVD(random_state=1, **params)
        reducer.fit(X_tr, y_tr)
        train_feats = reducer.transform(X_tr)
        test_feats = reducer.transform(X_te)
        # Fit a simple linear classifier on top of the reduced features.
        ridge = RidgeClassifier().fit(train_feats, y_tr)
        preds = ridge.predict(test_feats)
        acc = sklearn.metrics.accuracy_score(preds, y_te)
        self.assertAlmostEqual(acc, 0.44201578627808136, places=2)
z = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=float) # z = np.zeros( (n_samples, n_categories) , dtype=float) # Test for 10 rounds using the results from 10 fold cross validations for i, (train_index, test_index) in enumerate(kf): print "run %d" % (i+1) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] X_den_train, X_den_test = X_den[train_index], X_den[test_index] # feed models clf_mNB.fit(X_train, y_train) clf_ridge.fit(X_train, y_train) clf_SGD.fit(X_train, y_train) clf_lSVC.fit(X_train, y_train) clf_SVC.fit(X_train, y_train) # get prediction for this fold run prob_mNB = clf_mNB.predict_proba(X_test) prob_ridge = clf_ridge.decision_function(X_test) prob_SGD = clf_SGD.decision_function(X_test) prob_lSVC = clf_lSVC.decision_function(X_test) prob_SVC = clf_SVC.predict_proba(X_test) # add prob functions into the z 2d-array z_temp = (prob_mNB + prob_ridge + prob_SGD + prob_lSVC + prob_SVC) z = np.append(z, z_temp, axis=0)
# Script fragment: build a hard-voting ensemble from five base classifiers,
# report 5-fold macro-F1 for each model, then refit everything on a resampled
# training set. Depends on names defined outside this chunk: clf1..clf5,
# X, y, scores_dict, dataset.
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('svm', clf3), ('rc', clf4), ('ab', clf5)], voting='hard')
for clf, label in zip([clf1, clf2, clf3, clf4, clf5, eclf], ['Logistic Regression', 'Random Forest', 'SVM', 'Ridge Classifier', 'Ada boost', 'Ensemble']):
    # X is sparse; .toarray() densifies it for estimators that need dense input.
    scores = cross_val_score(clf, X.toarray(), y, cv=5, scoring='f1_macro')
    scores_dict[label].append(scores.mean())
    print("f1_macro: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
# Refit every model on the resampled (class-balanced) training data.
X, y = dataset.get_resampled_train_X_y(kind='regular')
clf1.fit(X.toarray(), y)
clf2.fit(X.toarray(), y)
clf3.fit(X.toarray(), y)
clf4.fit(X.toarray(), y)
clf5.fit(X.toarray(), y)
eclf.fit(X.toarray(), y)
# Held-out evaluation kept for reference (disabled).
# X_test = dataset.get_test_x()
# y_test = dataset.get_test_y()
# for clf, label in zip([clf1, clf2, clf3, clf4, clf5, eclf],
#                       ['Logistic Regression', 'Random Forest',
#                        'SVM', 'Ridge Classifier', 'Ada boost', 'Ensemble']):
#     score = f1_score(y_test, clf.predict(X_test.toarray()))
#     print("f1_macro_test: %0.2f [%s]" % (score, label))
#     print(precision_score(y_test, clf.predict(X_test.toarray())))
#     print(recall_score(y_test, clf.predict(X_test.toarray())))
#     predicted = clf.predict(X_test.toarray())
data = [ i for i in csv.reader(file(train_file, 'rb')) ] data = data[1:] # remove header random.shuffle(data) X = np.array([ i[1:] for i in data ]).astype(float) Y = np.array([ i[0] for i in data ]).astype(int) train_cutoff = len(data) * 3/4 X_train = X[:train_cutoff] Y_train = Y[:train_cutoff] X_test = X[train_cutoff:] Y_test = Y[train_cutoff:] classifier = RidgeClassifier(normalize = True, alpha = 1) classifier = classifier.fit(X_train, Y_train) print 'Training error : %s' % (classifier.fit(X_train, Y_train).score(X_train, Y_train)) Y_predict = classifier.predict(X_test) equal = 0 for i in xrange(len(Y_predict)): if Y_predict[i] == Y_test[i]: equal += 1 print 'Accuracy = %s' % (float(equal)/len(Y_predict))
def get_ridge_plot(best_param_, experiment_, param_keys_, param_vals_, png_folder, png_fname, score_threshold=0.8):
    """Sweep RidgeClassifier over the candidate alphas and save a 3-panel PNG:
    regularization path, decision-function PDF, and its CDF.

    Parameters
    ----------
    best_param_ : dict of the best hyperparameters (mutated in place: its
        'alpha' entry is overwritten during the sweep).
    experiment_ : project object providing get_train_data / get_test_data /
        get_proba / get_data_col_name.
    param_keys_, param_vals_ : parallel sequences forming the search grid;
        must include 'alpha' and 'model_type'.
    png_folder, png_fname : output location under Config data.path.
    score_threshold : unused in this function — TODO confirm it can be removed.

    Returns True after writing the figure.
    """
    parameters = dict(zip(param_keys_, param_vals_))
    del parameters['model_type']
    clf = RidgeClassifier()
    X_train, y_train = experiment_.get_train_data()
    # Fit once with the best params so best_alpha reflects the chosen model.
    clf.set_params(**best_param_)
    clf.fit(X_train, y_train)
    best_alpha = best_param_['alpha']
    # coefs: one row per alpha; +1 column for the intercept in slot 0.
    result = {'alphas':[], 'coefs':np.zeros( (len(parameters['alpha']), len(X_train.columns.values) + 1) ), 'scores':[], 'score':None}
    for i, alpha in enumerate(parameters.get('alpha',None)):
        result['alphas'].append(alpha)
        # Replace alpha for this sweep step (del + reassign keeps dict keys tidy).
        del best_param_['alpha']
        best_param_['alpha'] = alpha
        clf.set_params(**best_param_)
        clf.fit(X_train, y_train)
        # regularization path
        tmp = np.array([0 for j in xrange(len(X_train.columns.values) + 1)], dtype=np.float32)
        if best_param_['fit_intercept']:
            tmp = np.append(clf.intercept_, clf.coef_)
        else:
            # NOTE(review): this assigns the intercept into the coefficient
            # slots; it looks like clf.coef_ was intended here — confirm.
            tmp[1:] = clf.intercept_
        result['coefs'][i,:] = tmp
        result['scores'].append(experiment_.get_proba(clf, X_train))
    del X_train, y_train
    # 2. map coefficient indices back to feature names (0 = intercept).
    tmp_len = len(experiment_.get_data_col_name())
    index2feature = dict(zip(np.arange(1, tmp_len + 1), experiment_.get_data_col_name()))
    if best_param_['fit_intercept']:
        index2feature[0] = 'intercept'
    # 3. plot
    gs = GridSpec(2,2)
    ax1 = plt.subplot(gs[:,0])
    ax2 = plt.subplot(gs[0,1])
    ax3 = plt.subplot(gs[1,1])
    # 3.1 feature importance: one line per coefficient across the alpha sweep.
    labels = np.append(np.array(['intercept'], dtype='S100'), experiment_.get_data_col_name())
    nrows, ncols = result['coefs'].shape
    for ncol in xrange(ncols):
        ax1.plot(np.array(result['alphas']), result['coefs'][:,ncol], label = labels[ncol])
    ax1.legend(loc='best')
    ax1.set_xscale('log')
    ax1.set_title("Regularization Path:%1.3e" % (best_alpha))
    ax1.set_xlabel("alpha", fontsize=10)
    # 3.2 PDF of the decision function on the test set.
    # NOTE: clf here is the model from the LAST alpha of the sweep, not
    # necessarily best_alpha — confirm this is intended.
    X_test, y_test = experiment_.get_test_data()
    result['score'] = clf.decision_function(X_test)
    sns.distplot(result['score'], kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF : Decision_Function")
    # 3.3 CDF (falls back to default bins if 100 bins fails).
    num_bins = 100
    try:
        counts, bin_edges = np.histogram(result['score'], bins=num_bins, normed=True)
    except:
        counts, bin_edges = np.histogram(result['score'], normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Decision_Function:Confidence_Score", fontsize=10)
    png_fname = os.path.join(Config.get_string('data.path'), png_folder, png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()
    return True
# Script fragment: combine the per-field test matrices, then fit several
# classifiers and dump their predictions as TSV files.
# Depends on names defined outside this chunk: X_test_summary, X_test_title,
# X_test_author, t0, X_train, y_train, testID.
X_test = X_test_summary+X_test_title+X_test_author
duration = time() - t0
print("n_samples: %d, n_features: %d" % X_test.shape)
print("Done in %fs" % (duration))

def writeToDisk(predn,clfname):
    """Write (record_id, topic) predictions to ./<clfname>.txt as TSV.

    predn   : iterable of predicted topics, aligned with the global testID.
    clfname : classifier name; used for both the file name and the log line.
    """
    # IMPROVED: use a context manager so the handle is closed even if a
    # write raises (the original open()/close() pair leaked on error).
    path = "./"+clfname+".txt"
    with open(path,'w') as out:
        out.write("{}\t{}\n".format("record_id", "topic"))
        for rec_id, topic in zip(testID, predn):
            out.write("{}\t{}\n".format(rec_id, topic))
    print(clfname," output written to disk.")

clf1=RidgeClassifier(tol=1e-2, solver="lsqr")  #Ridge Classifier
clf1.fit(X_train, y_train)
pred = clf1.predict(X_test)
writeToDisk(pred,"RidgeClassifier")

clf2=MultinomialNB(alpha=.01)  #Naive Bayes classifier
clf2.fit(X_train, y_train)
pred = clf2.predict(X_test)
writeToDisk(pred,"MultinomialNB")

clf3=BernoulliNB(alpha=.01)  #Naive Bayes(Bernoulli) classifier
clf3.fit(X_train, y_train)
pred = clf3.predict(X_test)
writeToDisk(pred,"BernoulliNB")

# NOTE: the predict/write steps for clf4 are outside this chunk.
clf4=KNeighborsClassifier(n_neighbors=10)  #KNeighbors Classifier
clf4.fit(X_train, y_train)
#clf_KNN = KNeighborsClassifier(n_neighbors=7) #clf_KNN = KNeighborsClassifier(n_neighbors=11) clf_KNN.fit(Corpus_train, Y_train) Y_pred_KNN = clf_KNN.predict(Corpus_test) print_rate(Y_test, Y_pred_KNN, n_test, 'KNNClassifier') # Using the SVM classifier clf_SVM = svm.SVC() clf_SVM.fit(Corpus_train, Y_train) Y_pred_SVM = clf_SVM.predict(Corpus_test) print_rate(Y_test, Y_pred_SVM, n_test, 'SVMClassifier') # Using the Ridge classifier clf_RC = RidgeClassifier(tol=0.01, solver="lsqr") #clf_RC = RidgeClassifier(tol=0.1, solver="lsqr") clf_RC.fit(Corpus_train, Y_train) Y_pred_RC = clf_RC.predict(Corpus_test) print_rate(Y_test, Y_pred_RC, n_test, 'RidgeClassifier') # won't consider Random Forests or Decision Trees beacause they work bad for high sparse dimensions # Using the Multinomial Naive Bayes classifier # I expect that this MNB classifier will do the best since it is designed for occurrence counts features #clf_MNB = MultinomialNB(alpha=0.01) #smoothing parameter = 0.01 is worse than 0.1 clf_MNB = MultinomialNB(alpha=0.1) #clf_MNB = MultinomialNB(alpha=0.3) #a big smoothing rate doesnot benefit the model #clf_MNB = MultinomialNB(alpha=0.2) #or alpha = 0.05 can generate the best outcome clf_MNB.fit(Corpus_train, Y_train) Y_pred_MNB = clf_MNB.predict(Corpus_test) print_rate(Y_test, Y_pred_MNB, n_test, 'MultinomialNBClassifier')
def main():
    """Python 2 Kaggle-style script: fit a RidgeClassifier on the first rows
    of train.csv, compute a clipped log-loss on a held-out tail, and write
    predictions for test.csv to ../Submissions/knn.csv.
    """
    # Feature column window (column 0 of train rows is the label).
    startCol = 0
    endCol = 50  # max = 1775
    train = csv_io.read_data("../Data/train.csv")
    # Rows 1..2999 train; rows 3001.. are the holdout (row 3000 is skipped —
    # presumably unintentional; verify).
    target = [x[0] for x in train][1:3000]
    targetTest = [x[0] for x in train][3001:]
    trainTest = [x[startCol+1:endCol+1] for x in train][3001:]
    test = csv_io.read_data("../Data/test.csv")
    test = [x[startCol:endCol] for x in test]
    train = [x[startCol+1:endCol+1] for x in train][1:3000]
    fo = open("knn_stats.txt", "a+")
    rf = RidgeClassifier(alpha=0.01, fit_intercept=True, normalize=False, copy_X=True, tol=0.001)
    rf.fit(train, target)
    prob = rf.predict(trainTest)  # changed from test
    result = 100
    probSum = 0
    for i in range(0, len(prob)):
        probX = prob[i]  # [1]
        # Clip to [0.3, 0.7] so log() stays finite and bounded.
        if ( probX > 0.7):
            probX = 0.7;
        if ( probX < 0.3):
            probX = 0.3;
        # NOTE(review): the debug prints use target[i] (training labels) but
        # the sum uses targetTest[i] (holdout labels) — confirm which is meant.
        print i, probSum, probX, target[i]
        print target[i]*log(probX), (1-target[i])*log(1-probX)
        probSum += targetTest[i]*log(probX)+(1-targetTest[i])*log(1-probX)
    #print probSum
    #print len(prob)
    #print "C: ", 10**C, " gamma: " ,2**g
    # Negative mean log-likelihood (log loss) on the holdout rows.
    print -probSum/len(prob)
    if ( -probSum/len(prob) < result ):
        result = -probSum/len(prob)
    predicted_probs = rf.predict(test)  # was test
    predicted_probs = ["%f" % x for x in predicted_probs]
    csv_io.write_delimited_file("../Submissions/knn.csv", predicted_probs)
    print "Generated Data!!"
    #fo.write(str(5) + str(5)+ str(5));
    fo.close()
    #csv_io.write_delimited_file("../Submissions/rf_benchmark_test2.csv", predicted_probs)
    #predicted_probs = rf.predict_proba(train)  # changed from test
    #predicted_probs = ["%f" % x[1] for x in predicted_probs]
    #predicted_probs = rf.predict(train)  # changed from test
    #predicted_probs = ["%f" % x for x in predicted_probs]
    #csv_io.write_delimited_file("../Submissions/rf_benchmark_train2.csv", predicted_probs)
    # Keep the console window open until the user presses Enter.
    var = raw_input("Enter to terminate.")
# initialize empty y and z # Test for 10 rounds using the results from 10 fold cross validations for i, (train_index, test_index) in enumerate(kf): print "run %d" % (i+1) X_train_train, X_train_test = X_train[train_index], X_train[test_index] y_train_train, y_train_test = y_train[train_index], y_train[test_index] # X_den_train, X_den_test = X_den[train_index], X_den[test_index] # feed models clf_mNB.fit(X_train_train, y_train_train) clf_kNN.fit(X_train_train, y_train_train) clf_ridge.fit(X_train_train, y_train_train) clf_lSVC.fit(X_train_train, y_train_train) clf_SVC.fit(X_train_train, y_train_train) # get prediction for this fold run pred_mNB = clf_mNB.predict(X_train_test) pred_kNN = clf_kNN.predict(X_train_test) pred_ridge = clf_ridge.predict(X_train_test) pred_lSVC = clf_lSVC.predict(X_train_test) pred_SVC = clf_SVC.predict(X_train_test) # update z array for each model z_mNB = np.append(z_mNB , pred_mNB , axis=None) z_kNN = np.append(z_kNN , pred_kNN , axis=None) z_ridge = np.append(z_ridge , pred_ridge, axis=None) z_lSVC = np.append(z_lSVC , pred_lSVC , axis=None)
#!/usr/bin/env python """ Ridge regression for Avito """ __author__ = "deniederhut" __license__ = "GPL" import numpy as np import pandas as pd from sklearn.linear_model import RidgeClassifier from sklearn.metrics import classification_report from sklearn.cross_validation import train_test_split from sklearn.metrics import roc_auc_score from sklearn.metrics import average_precision_score data = pd.read_table('/Users/dillonniederhut/Desktop/avito_train.tsv',nrows=100000) #replace with file path to your training data features = pd.get_dummies(data.subcategory) features_train, features_test, target_train, target_test =\ train_test_split(features, data.is_blocked, test_size = 0.25) ridge = RidgeClassifier() ridge.fit(features_train, target_train) prediction = np.round(ridge.predict(features_test)) print classification_report(target_test, prediction) print average_precision_score(target_test, prediction) print roc_auc_score(target_test, prediction)
train = lemons.append(non_lemons) #X = train.drop(['RefId','IsBadBuy','VehYear','Make','Model','Trim','SubModel','WheelTypeID','BYRNO'],axis=1) #y = pd.Series(train['IsBadBuy']).values target = pd.Series(train['IsBadBuy']).values data = train.drop(['RefId','IsBadBuy','VehYear','Make','Model','Trim','SubModel','WheelTypeID','BYRNO'],axis=1) x_train, x_test, y_train, y_test = cross_validation.train_test_split(data,target, test_size=.3) # Subset the data so we have a more even data set model = RidgeClassifier() clf = model.fit(X,y) Ridg_Class = clf.predict(X) clf.score(X,y) metrics.confusion_matrix(y, clf.predict(X)) print metrics.classification_report(y, clf.predict(X)) # GradientBoostingClassifier from sklearn.ensemble import * model = GradientBoostingClassifier() # Train clf = model.fit(x_train, y_train)
# Script fragment (Python 2 prints): construct SVC/Ridge/SGD candidates for
# multilabel classification and fit all candidates on (X, y).
# Depends on names defined outside this chunk: clf_nb, clf_lsvc, X, y, t0.
clf_svc = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True)
clf_rdg = RidgeClassifier(tol=1e-1)
clf_sgd = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")
# Logistic regression requires OneVsRestClassifier which hides
# its methods such as decision_function
# It will require extra implementation efforts to use it as a candidate
# for multilabel classification
# clf_lgr = OneVsRestClassifier(LogisticRegression(C=1000,penalty='l1'))
# kNN does not have decision function due to its nature
# clf_knn = KNeighborsClassifier(n_neighbors=13)
# train
clf_nb.fit(X, y)
clf_lsvc.fit(X, y)
clf_rdg.fit(X, y)
clf_svc.fit(X, y)
clf_sgd.fit(X, y)
print "Train time: %0.3fs" % (time() - t0)
print
# # predict by simply apply the classifier
# # this will not use the multi-label threshold
# predicted = clf_rdg.predict(X_new)
# for doc, category in zip(docs_new, predicted):
#     print '%r => %s' % (doc, data_train.target_names[int(category)])
#     print
    # NOTE: these two lines are the tail of a dayhour(x) helper whose `def`
    # header is outside this chunk — it parses a yymmddhh timestamp into
    # [weekday, hour] floats.
    d = datetime.strptime(str(x), "%y%m%d%H")
    return [float(d.weekday()), float(d.hour)]

# Hash all (stringified) categorical columns into a fixed 2^20-dim space.
fh = FeatureHasher(n_features = 2**20, input_type="string")

# Train classifier
clf = RidgeClassifier()
train = pd.read_csv("train/subtrain.csv", chunksize = 100000, iterator = True)
all_classes = np.array([0, 1])
for chunk in train:
    y_train = chunk["click"]
    chunk = chunk[cols]
    # Derive weekday/hour features from the raw hour column, then drop it.
    chunk = chunk.join(pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"]))
    chunk.drop(["hour"], axis=1, inplace = True)
    Xcat = fh.transform(np.asarray(chunk.astype(str)))
    # NOTE(review): fit() retrains from scratch each chunk, so only the LAST
    # chunk determines the model; all_classes is never used — partial_fit
    # (unavailable on RidgeClassifier) was presumably intended. Confirm.
    clf.fit(Xcat, y_train)

# Create a submission file
usecols = cols + ["id"]
X_test = pd.read_csv("test/mtest.csv", usecols=usecols)
X_test = X_test.join(pd.DataFrame([dayhour(x) for x in X_test.hour], columns=["wd", "hr"]))
X_test.drop(["hour"], axis=1, inplace = True)
X_enc_test = fh.transform(np.asarray(X_test.astype(str)))
y_act = pd.read_csv("test/mtest.csv", usecols=['click'])
# NOTE(review): log_loss on hard predict() labels (not probabilities)
# produces a degenerate score — confirm whether decision_function/proba
# was intended.
y_pred = clf.predict(X_enc_test)
with open('logloss.txt','a') as f:
    f.write('\n'+str(log_loss(y_act, y_pred)))
remove = () X_train = cityName; print('Creating the vectorizer and chosing a transform (from raw text to feature)') vect= TfidfVectorizer(sublinear_tf=True, max_df=0.5) #vect=CountVectorizer(min_n=1,max_n=2,max_features=1000); X_train = vect.fit_transform(X_train) cityClass = RidgeClassifier(tol=1e-7) countryClass = RidgeClassifier(tol=1e-7) print('Creating a classifier for cities') cityClass.fit(X_train,cityCode) print('Creating a classifier for countries') countryClass.fit(X_train,countryCode) print('testing the performance'); testCityNames = vect.transform(cityNameTest); predictionsCity = countryClass.predict(testCityNames); predictionsCountry = cityClass.predict(testCityNames); with open('predictions.csv','w') as csvfile: writer = csv.writer(csvfile) #for ind in range(0,len(predictionsCountry)): # writer.writerow([str(predictionsCountry[ind]),str(predictionsCity[ind])]) for predCountry,predCity in zip(predictionsCountry,predictionsCity):
    # NOTE: head truncated — these lines close a `for clf, name in (...)`
    # benchmark loop whose opening is outside this chunk.
    (LinearSVC(), "SVM")
):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf, name))

# Attach classifier to the original json file
# loading dtm file for all twitts (document-term matrix pickled earlier)
fp = open('./python_files/twitter_dtm.pkl', 'rb')
dtm = pkl.load(fp)
fp.close()

# Predict the labels using Ridges classifier
clf = RidgeClassifier(alpha=1.,tol=1e-2, solver="lsqr")
clf.fit(X_train, y_train)
predicted_labels = clf.predict(dtm)

# loading json file for all twitts
file_name = '../R Project/Data/obamacare.json'
line_reader = open(file_name,'r')  # r means for reading

# building a new json file for all twitts + new predicted labels
new_file_name = '../R Project/Data/obamacare_labeled.json'
line_writer = open(new_file_name,'w')  # w means for writing

# adding the predicted label to each entry of json file
# (tail truncated: the body of this loop continues outside this chunk;
# line_reader/line_writer are presumably closed there — verify.)
twit_i = 0
for line in line_reader:
    label = predicted_labels[twit_i]
    if label==0:
def classify(granularity=10):
    """Geolocation text classification (mixed py2/py3 print styles).

    Loads clustered training documents and a test set, TF-IDF vectorizes
    them, fits a RidgeClassifier over location clusters, then reports
    mean/median great-circle error of the predicted cluster centers and
    saves a distance-vs-confidence scatter plot.

    granularity : cluster count/size selector used to pick the training dir.
    Uses many module globals: GEOTEXT_HOME, encoding, userLocation,
    classLatMedian/classLonMedian/classLatMean/classLonMean, distance().
    """
    trainDir = path.join(GEOTEXT_HOME, 'processed_data/' + str(granularity).strip() + '_clustered/')
    testDir = path.join(GEOTEXT_HOME, 'processed_data/test')
    data_train = load_files(trainDir, encoding=encoding)
    target = data_train.target
    data_test = load_files(testDir, encoding=encoding)
    categories = data_train.target_names

    def size_mb(docs):
        # Size of the corpus in megabytes (encoded byte length).
        return sum(len(s.encode(encoding)) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)
    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()
    # split a training set and a test set
    y_train = data_train.target
    y_test = data_test.target
    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    vectorizer = TfidfVectorizer(use_idf=True, norm='l2', binary=False, sublinear_tf=True, min_df=2, max_df=1.0, ngram_range=(1, 1), stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()
    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()
    # Optional chi-squared feature selection (disabled by the flag below).
    chi = False
    if chi:
        k = 500000
        print("Extracting %d best features by a chi-squared test" % 0)
        t0 = time()
        ch2 = SelectKBest(chi2, k=k)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print("done in %fs" % (time() - t0))
        print()
    feature_names = np.asarray(vectorizer.get_feature_names())
    # clf = LinearSVC(loss='l2', penalty='l2', dual=True, tol=1e-3)
    clf = RidgeClassifier(tol=1e-2, solver="auto")
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    t0 = time()
    pred = clf.predict(X_test)
    # Per-class margins; used below as a confidence proxy.
    scores = clf.decision_function(X_test)
    print scores.shape
    print pred.shape
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)
    # score = metrics.f1_score(y_test, pred)
    # print("f1-score: %0.3f" % score)
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print("top 10 keywords per class:")
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print("%s: %s" % (category, " ".join(feature_names[top10])))
    # Geographic evaluation: distance from each user's true lat/lon to the
    # predicted cluster's median/mean center.
    sumMeanDistance = 0
    sumMedianDistance = 0
    distances = []
    confidences = []
    randomConfidences = []
    for i in range(0, len(pred)):
        # Test filenames are user ids; userLocation maps them to "lat,lon".
        user = path.basename(data_test.filenames[i])
        location = userLocation[user].split(',')
        lat = float(location[0])
        lon = float(location[1])
        prediction = categories[pred[i]]
        # Confidence = margin of the chosen class above the mean margin.
        confidence = scores[i][pred[i]] - mean(scores[i])
        randomConfidence = scores[i][random.randint(0, len(categories) - 1)]
        confidences.append(confidence)
        randomConfidences.append(randomConfidence)
        medianlat = classLatMedian[prediction]
        medianlon = classLonMedian[prediction]
        meanlat = classLatMean[prediction]
        meanlon = classLonMean[prediction]
        distances.append(distance(lat, lon, medianlat, medianlon))
        sumMedianDistance = sumMedianDistance + distance(lat, lon, medianlat, medianlon)
        sumMeanDistance = sumMeanDistance + distance(lat, lon, meanlat, meanlon)
    averageMeanDistance = sumMeanDistance / float(len(pred))
    averageMedianDistance = sumMedianDistance / float(len(pred))
    print "Average mean distance is " + str(averageMeanDistance)
    print "Average median distance is " + str(averageMedianDistance)
    print "Median distance is " + str(median(distances))
    # Distance vs confidence plots, saved under GEOTEXT_HOME.
    fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
    plt.xlim(0, 4000)
    plt.ylim(0, 2)
    ax1.scatter(distances, confidences)
    ax2.bar(distances, confidences)
    plt.savefig(path.join(GEOTEXT_HOME, 'confidence.png'))