def linear_ridge(M, labels, seed, split=0.8):
    """
    linear ridge algorithm for input M and output labels

    Inputs:
        M : matrix m*n where each row is a different example and the
            columns are composed of the features
        labels : vector m*1 where each row is the corresponding class of
            the row of M
        seed : random seed to do the split between test/validation/training
        split : number between 0 and 1. Split between training and testing
            set. Default : 0.8

    Outputs:
        roc_auc_train : AUC score on the train set
        roc_auc_val : AUC score on the validation set
        roc_auc_test : AUC score on the test set
    """
    # The original read an undefined name (`M_str`); the matrix to
    # preprocess is the `M` argument.
    M_float = preprocessing_dataset.preprocessing_nan_normalization(M)

    # Forward the caller's `split` instead of a hard-coded 0.8.
    (M_train_val, M_val, M_test,
     labels_train_val, labels_val, labels_test) = (
        preprocessing_dataset.split_train_val_test(
            M_float, seed, labels, nb_val=3, split=split))

    X_train = M_train_val
    X_test = M_test
    # Flatten the training labels to a 1-D vector for the classifier.
    Y_train = np.reshape(labels_train_val, (labels_train_val.shape[0], ))

    # Create our imputer to replace missing values with the column mean;
    # it is fitted on the training rows only.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp = imp.fit(X_train)

    # Impute our data, then train
    X_train_imp = imp.transform(X_train)
    clf = RidgeClassifier()
    clf = clf.fit(X_train_imp, Y_train)

    # Impute the held-out sets with the statistics learnt on the train set
    X_test_imp = imp.transform(X_test)
    X_val_imp = imp.transform(M_val)

    # Compute the AUC from the signed decision scores of each subset
    pred_train = clf.decision_function(X_train_imp)
    pred = clf.decision_function(X_test_imp)
    pred_val = clf.decision_function(X_val_imp)

    fpr, tpr, _ = roc_curve(Y_train, pred_train)
    roc_auc_train = auc(fpr, tpr)
    fpr, tpr, _ = roc_curve(labels_val, pred_val)
    roc_auc_val = auc(fpr, tpr)
    fpr, tpr, _ = roc_curve(labels_test, pred)
    roc_auc_test = auc(fpr, tpr)

    print(
        'linear ridge: train set: %0.5f, validation: %0.5f, test set: %0.5f'
        % (roc_auc_train, roc_auc_val, roc_auc_test))
    return roc_auc_train, roc_auc_val, roc_auc_test
def RidgeReg(file1, file2):
    """Train a ridge classifier on ``file1`` and score the rows of ``file2``.

    Both files are read with ``file2matrix``, which yields a
    ``(features, labels)`` pair.

    Returns:
        A tuple ``(y_true, y_score, y_pred)``: the labels read from
        ``file2``, the classifier's decision scores for its features, and
        the predicted labels.
    """
    train_features, train_labels = file2matrix(file1)
    model = RidgeClassifier()
    model.fit(train_features, train_labels)

    eval_features, y_true = file2matrix(file2)
    return (
        y_true,
        model.decision_function(eval_features),
        model.predict(eval_features),
    )
class RidgeC(BaseClassifier):
    """Ridge classifier plugged into the ``BaseClassifier`` framework.

    The constructor forwards every argument unchanged to
    ``BaseClassifier.__init__`` and then installs a ``RidgeClassifier``
    as ``self.clf``.
    """

    def __init__(self, TRAINVALTEST_DENSE_X, TRAINVALTEST_DENSE_X_NAMES,
                 TRAINVAL_SPARSE_X, TRAINVAL_SPARSE_X_NAMES,
                 TEST_SPARSE_X, TEST_SPARSE_X_NAMES,
                 UF_VW, ADF, TRAINVAL, UF_CSV, TRAINVAL_MERGE,
                 TEST_MERGE, TEST, name='ridge', USE_TINY=False,
                 RANDOMSTATE=2018):
        super(RidgeC, self).__init__(
            TRAINVALTEST_DENSE_X, TRAINVALTEST_DENSE_X_NAMES,
            TRAINVAL_SPARSE_X, TRAINVAL_SPARSE_X_NAMES,
            TEST_SPARSE_X, TEST_SPARSE_X_NAMES,
            UF_VW, ADF, TRAINVAL, UF_CSV, TRAINVAL_MERGE,
            TEST_MERGE, TEST, name, USE_TINY, RANDOMSTATE)
        # Author's notes: in Ridge, only the 'sag' solver can currently fit
        # the intercept when X is sparse; not normalizing worked better.
        # `normalize=False` is no longer passed explicitly: it was the
        # default, and the keyword was removed in scikit-learn 1.2, where
        # passing it raises TypeError.
        self.clf = RidgeClassifier(tol=1e-2, solver="sag")

    def trainWithEva(self, trainval_x):
        """Fit on 90% of ``trainval_x`` and evaluate on the remaining 10%.

        Labels come from ``self.trainval['label']`` and the split is seeded
        with ``self.randomstate`` (both presumably set by the base class —
        TODO confirm).

        Returns:
            The ROC AUC of the decision scores on the held-out 10%.
        """
        train_x, valid_x, train_y, valid_y = train_test_split(
            trainval_x, self.trainval['label'],
            test_size=0.1, random_state=self.randomstate)
        self.clf.fit(train_x, train_y)
        pred = self.clf.decision_function(valid_x)
        score = metrics.roc_auc_score(valid_y, pred)
        # NOTE: the message says "accuracy" but the value is ROC AUC; the
        # text is left untouched in case logs are parsed downstream.
        print("%s on valid set accuracy: %0.5f" % (self.name, score))
        return score

    def predict(self, test_x=None, model_path=None):
        """Score the test set and return it as a DataFrame.

        Args:
            test_x: test feature matrix; when ``None`` it is taken from the
                second value returned by ``self.feature_engineering()``.
            model_path: when given, ``self.load_model(model_path)`` is
                called before scoring.

        Returns:
            The CSV at ``self.ds.TEST`` loaded as a DataFrame, with an added
            ``score`` column holding the decision scores rounded to six
            decimals.
        """
        if model_path is not None:
            self.load_model(model_path)
        if test_x is None:
            _, test_x = self.feature_engineering()
        pre = pd.read_csv(self.ds.TEST)
        pre['score'] = self.clf.decision_function(test_x)
        pre['score'] = pre['score'].apply(lambda x: float('%.6f' % x))
        return pre
class RidgeClassifierImpl:
    """Thin adapter that delegates to an instance of the estimator ``Op``.

    The keyword arguments given to the constructor are stored and passed
    verbatim to ``Op``.
    """

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is only forwarded when not ``None``.

        Returns ``self`` so calls can be chained.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        model = self._wrapped_model
        return model.predict(X)

    def decision_function(self, X):
        """Return the wrapped model's decision scores for ``X``."""
        model = self._wrapped_model
        return model.decision_function(X)
def set_forward(self, support_images, support_labels, query_images):
    """
    Overwrites method set_forward in AbstractMetaLearner.

    Features are extracted chunk by chunk and moved to the CPU, optionally
    passed through ``self.transportation_module``, and then a
    ``RidgeClassifier(alpha=0.1)`` is fitted on the support features.

    Returns:
        The ridge decision scores for the query features, as a tensor
        passed through ``set_device``.
    """
    # The number of chunks is derived from the support set size only.
    n_chunks = len(support_images) // 32 + 1

    support_parts = []
    query_parts = []
    # NOTE(review): zip stops at the shorter sequence, so if the query
    # tensor splits into fewer chunks than the support tensor the trailing
    # support chunks are dropped (and vice versa) — confirm with callers.
    chunk_pairs = zip(support_images.chunk(n_chunks),
                      query_images.chunk(n_chunks))
    for support_batch, query_batch in chunk_pairs:
        support_feats, query_feats = self.extract_features(
            set_device(support_batch), set_device(query_batch))
        support_parts.append(support_feats.detach().cpu())
        query_parts.append(query_feats.detach().cpu())

    z_support = torch.cat(support_parts, dim=0)
    del support_parts
    z_query = torch.cat(query_parts, dim=0)
    del query_parts

    # If a transportation method in the feature space has been defined, use it
    if self.transportation_module:
        transported_support, transported_query = self.transportation_module(
            set_device(z_support), set_device(z_query))
        z_support = transported_support.cpu()
        z_query = transported_query.cpu()

    support_array = z_support.numpy()
    query_array = z_query.numpy()
    label_array = support_labels.cpu().numpy()

    ridge = RidgeClassifier(alpha=0.1)
    ridge.fit(support_array, label_array)

    return set_device(torch.tensor(ridge.decision_function(query_array)))
class Ridge:
    """Wrapper exposing a ``predict_proba``-style API on a RidgeClassifier.

    The wrapped classifier is built without an intercept
    (``fit_intercept=False``).
    """

    def __init__(self, alpha, class_weight, random_state):
        self.ridge = RidgeClassifier(alpha,
                                     class_weight=class_weight,
                                     fit_intercept=False,
                                     random_state=random_state)

    def __repr__(self):
        return 'Ridge'

    def fit(self, X_train, y_train):
        """Fit the wrapped classifier and return ``self``."""
        self.ridge.fit(X_train, y_train)
        return self

    def predict_proba(self, Z):
        """Return one ``[1 - s, s]`` row per decision score ``s``.

        The values are raw decision scores, not calibrated probabilities.
        """
        scores = self.ridge.decision_function(Z)
        return np.array([[1 - score, score] for score in scores])
(predictions == y_test).astype(int)) / predictions.shape[0] else: temp_classifier = classifier x_train, x_test, y_train, y_test = train_test_split(X_train_all, Y_train_all, test_size=0.1) temp_classifier.fit(x_train, y_train) dev_accuracy = temp_classifier.score(x_test, y_test) predictions = temp_classifier.predict(x_test) # if ridge_option: # print(temp_classifier.decision_function(x_test)) try: y_proba[i] = classifier.predict_proba(x_test) except: scores = classifier.decision_function(x_test) y_proba[i] = scores / (1 + scores) y_mistake[i] = np.mean( y_proba[i][y_proba[i].argmax(axis=1) != y_test].max(axis=1)) testArray = np.array([ np.mean(y_proba[i][y_test == j][y_proba[i][y_test == j].argmax( axis=1) != j].max(axis=1)) for j in range(4) ]) y_mistake_perClass[i] = testArray confusion_matrices[i] = confusion_matrix(y_test, predictions) print('Fold N°', str(i)) print('SCORE : ', dev_accuracy) if not (ensemble_option):
def main():
    """Compare a plain ridge classifier with CoIRLS on toy blob data.

    Draws a source and a target blob data set, plots them, trains a
    ``RidgeClassifier`` on the source data and reports its accuracy on the
    target data, then does the same with ``CoIRLS`` given a one-hot domain
    covariate. The decision-score distributions of both classifiers are
    plotted with ``distplot_1d``.
    """
    np.random.seed(29118)

    # Generate toy data
    n_samples = 200
    xs, ys = make_blobs(n_samples, centers=[[0, 0], [0, 2]],
                        cluster_std=[0.3, 0.35])
    xt, yt = make_blobs(n_samples, centers=[[2, -2], [2, 0.2]],
                        cluster_std=[0.35, 0.4])

    # visualize toy data: one colour per domain, one marker per class
    domains = (
        (xs, ys, "c", "source"),
        (xt, yt, "m", "Target"),
    )
    classes = (
        (1, "o", " positive"),
        (0, "x", " negative"),
    )
    plt.figure(figsize=(8, 5))
    for x_dom, y_dom, color, domain_name in domains:
        for class_value, marker, suffix in classes:
            idx = np.where(y_dom == class_value)
            plt.scatter(
                x_dom[idx, 0],
                x_dom[idx, 1],
                c=color,
                marker=marker,
                alpha=0.4,
                label=domain_name + suffix,
            )
    plt.legend()
    plt.title("Source domain and target domain blobs data",
              fontsize=14, fontweight="bold")
    plt.show()

    title_kwargs = {"fontsize": 14, "fontweight": "bold"}
    hist_kwargs = {"kde": True, "alpha": 0.7}
    plt_labels = ["Source", "Target"]

    def show_scores(source_scores, target_scores, title):
        # Plot both score distributions on one figure and display it.
        distplot_1d(
            [source_scores, target_scores],
            labels=plt_labels,
            xlabel="Decision Scores",
            title=title,
            title_kwargs=title_kwargs,
            hist_kwargs=hist_kwargs,
        ).show()

    # baseline: train on the source domain only
    clf = RidgeClassifier(alpha=1.0)
    clf.fit(xs, ys)
    yt_pred = clf.predict(xt)
    print("Accuracy on target domain: {:.2f}".format(
        accuracy_score(yt, yt_pred)))

    # visualize decision scores of non-adaptation classifier
    show_scores(clf.decision_function(xs), clf.decision_function(xt),
                "Ridge classifier decision score distribution")

    # domain adaptation
    clf_ = CoIRLS(lambda_=1)
    # encoding one-hot domain covariate matrix (1 for the first n_samples
    # rows, 0 for the rest)
    covariates = np.zeros(n_samples * 2)
    covariates[:n_samples] = 1
    enc = OneHotEncoder(handle_unknown="ignore")
    covariates_mat = enc.fit_transform(covariates.reshape(-1, 1)).toarray()

    x = np.concatenate((xs, xt))
    clf_.fit(x, ys, covariates_mat)
    yt_pred_ = clf_.predict(xt)
    print("Accuracy on target domain: {:.2f}".format(
        accuracy_score(yt, yt_pred_)))

    ys_score_ = clf_.decision_function(xs).detach().numpy().reshape(-1)
    yt_score_ = clf_.decision_function(xt).detach().numpy().reshape(-1)
    show_scores(ys_score_, yt_score_,
                "Domain adaptation classifier decision score distribution")
def get_ridge_plot(best_param_, experiment_, param_keys_, param_vals_,
                   png_folder, png_fname, score_threshold=0.8):
    """Save a figure with the ridge regularization path and score PDF/CDF.

    Args:
        best_param_: dict of RidgeClassifier parameters; must contain
            'alpha' and 'fit_intercept'. It is no longer modified.
        experiment_: object providing get_train_data(), get_test_data(),
            get_proba() and get_data_col_name().
        param_keys_, param_vals_: parallel sequences describing the search
            grid; the entry for 'alpha' lists the alphas of the path.
        png_folder, png_fname: joined to Config 'data.path' to build the
            output file name.
        score_threshold: unused; kept for interface compatibility.

    Returns:
        True once the figure has been written.
    """
    parameters = dict(zip(param_keys_, param_vals_))
    parameters.pop('model_type', None)
    alphas = list(parameters['alpha'])

    X_train, y_train = experiment_.get_train_data()
    n_coefs = len(X_train.columns.values) + 1  # intercept + one per column
    fit_intercept = best_param_['fit_intercept']
    best_alpha = best_param_['alpha']

    # Model fitted with the selected parameters. It is kept apart from the
    # path models so the PDF/CDF below describe the best model, not
    # whichever alpha happened to be fitted last.
    best_clf = RidgeClassifier()
    best_clf.set_params(**best_param_)
    best_clf.fit(X_train, y_train)

    result = {'alphas': [],
              'coefs': np.zeros((len(alphas), n_coefs)),
              'scores': [],
              'score': None}

    # Work on a copy so the caller's dict keeps its best alpha.
    path_params = dict(best_param_)
    path_clf = RidgeClassifier()
    for i, alpha in enumerate(alphas):
        result['alphas'].append(alpha)
        path_params['alpha'] = alpha
        path_clf.set_params(**path_params)
        path_clf.fit(X_train, y_train)

        # regularization path: column 0 is the intercept, then the weights
        row = np.zeros(n_coefs, dtype=np.float32)
        if fit_intercept:
            row = np.append(path_clf.intercept_, path_clf.coef_)
        else:
            # No intercept is fitted: leave column 0 at zero and store the
            # feature weights (the original stored intercept_ here, which
            # made the whole path zero).
            row[1:] = np.ravel(path_clf.coef_)
        result['coefs'][i, :] = row
        result['scores'].append(experiment_.get_proba(path_clf, X_train))
    del X_train, y_train

    # 3. plot
    gs = GridSpec(2, 2)
    ax1 = plt.subplot(gs[:, 0])
    ax2 = plt.subplot(gs[0, 1])
    ax3 = plt.subplot(gs[1, 1])

    # 3.1 feature importance
    labels = np.append(np.array(['intercept'], dtype='S100'),
                       experiment_.get_data_col_name())
    alpha_axis = np.array(result['alphas'])
    for ncol in range(result['coefs'].shape[1]):
        ax1.plot(alpha_axis, result['coefs'][:, ncol], label=labels[ncol])
    ax1.legend(loc='best')
    ax1.set_xscale('log')
    ax1.set_title("Regularization Path:%1.3e" % (best_alpha))
    ax1.set_xlabel("alpha", fontsize=10)

    # 3.2 PDF
    X_test, _ = experiment_.get_test_data()
    result['score'] = best_clf.decision_function(X_test)
    sns.distplot(result['score'], kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF : Decision_Function")

    # 3.3 CDF
    # `density=True` replaces `normed=True`, removed in NumPy 1.24.
    num_bins = 100
    try:
        counts, bin_edges = np.histogram(result['score'], bins=num_bins,
                                         density=True)
    except (ValueError, TypeError):
        # Fall back to NumPy's default binning, as the original did.
        counts, bin_edges = np.histogram(result['score'], density=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Decision_Function:Confidence_Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), png_folder,
                             png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()
    return True
def get_ridge_plot(best_param_, experiment_, param_keys_, param_vals_,
                   png_folder, png_fname, score_threshold=0.8):
    """Save a figure with the ridge regularization path and score PDF/CDF.

    Args:
        best_param_: dict of RidgeClassifier parameters; must contain
            'alpha' and 'fit_intercept'. It is no longer modified.
        experiment_: object providing get_train_data(), get_test_data(),
            get_proba() and get_data_col_name().
        param_keys_, param_vals_: parallel sequences describing the search
            grid; the entry for 'alpha' lists the alphas of the path.
        png_folder, png_fname: joined to Config 'data.path' to build the
            output file name.
        score_threshold: unused; kept for interface compatibility.

    Returns:
        True once the figure has been written.
    """
    parameters = dict(zip(param_keys_, param_vals_))
    parameters.pop('model_type', None)
    alphas = list(parameters['alpha'])

    X_train, y_train = experiment_.get_train_data()
    n_coefs = len(X_train.columns.values) + 1  # intercept + one per column
    fit_intercept = best_param_['fit_intercept']
    best_alpha = best_param_['alpha']

    # Model fitted with the selected parameters. It is kept apart from the
    # path models so the PDF/CDF below describe the best model, not
    # whichever alpha happened to be fitted last.
    best_clf = RidgeClassifier()
    best_clf.set_params(**best_param_)
    best_clf.fit(X_train, y_train)

    result = {'alphas': [],
              'coefs': np.zeros((len(alphas), n_coefs)),
              'scores': [],
              'score': None}

    # Work on a copy so the caller's dict keeps its best alpha.
    path_params = dict(best_param_)
    path_clf = RidgeClassifier()
    for i, alpha in enumerate(alphas):
        result['alphas'].append(alpha)
        path_params['alpha'] = alpha
        path_clf.set_params(**path_params)
        path_clf.fit(X_train, y_train)

        # regularization path: column 0 is the intercept, then the weights
        row = np.zeros(n_coefs, dtype=np.float32)
        if fit_intercept:
            row = np.append(path_clf.intercept_, path_clf.coef_)
        else:
            # No intercept is fitted: leave column 0 at zero and store the
            # feature weights (the original stored intercept_ here, which
            # made the whole path zero).
            row[1:] = np.ravel(path_clf.coef_)
        result['coefs'][i, :] = row
        result['scores'].append(experiment_.get_proba(path_clf, X_train))
    del X_train, y_train

    # 3. plot
    gs = GridSpec(2, 2)
    ax1 = plt.subplot(gs[:, 0])
    ax2 = plt.subplot(gs[0, 1])
    ax3 = plt.subplot(gs[1, 1])

    # 3.1 feature importance
    labels = np.append(np.array(['intercept'], dtype='S100'),
                       experiment_.get_data_col_name())
    alpha_axis = np.array(result['alphas'])
    for ncol in range(result['coefs'].shape[1]):
        ax1.plot(alpha_axis, result['coefs'][:, ncol], label=labels[ncol])
    ax1.legend(loc='best')
    ax1.set_xscale('log')
    ax1.set_title("Regularization Path:%1.3e" % (best_alpha))
    ax1.set_xlabel("alpha", fontsize=10)

    # 3.2 PDF
    X_test, _ = experiment_.get_test_data()
    result['score'] = best_clf.decision_function(X_test)
    sns.distplot(result['score'], kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF : Decision_Function")

    # 3.3 CDF
    # `density=True` replaces `normed=True`, removed in NumPy 1.24.
    num_bins = 100
    try:
        counts, bin_edges = np.histogram(result['score'], bins=num_bins,
                                         density=True)
    except (ValueError, TypeError):
        # Fall back to NumPy's default binning, as the original did.
        counts, bin_edges = np.histogram(result['score'], density=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Decision_Function:Confidence_Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), png_folder,
                             png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()
    return True
print # # predict by simply apply the classifier # # this will not use the multi-label threshold # predicted = clf_rdg.predict(X_new) # for doc, category in zip(docs_new, predicted): # print '%r => %s' % (doc, data_train.target_names[int(category)]) # print #################################### # Multi-label prediction using Ridge # decision_function print clf_rdg pred_decision = clf_rdg.decision_function(X_new) print pred_decision print # filtering using threshold pred_decision_filtered = label_filtering(pred_decision, 0.1) print pred_decision_filtered print # predict and print for doc, labels in zip(docs_new, pred_decision_filtered): print doc for label in labels: # label[0]: score; label[1]: # print data_train.target_names[label[1]], label[0] print
learning_rate=0.09,objective="multi:softmax").fit(x_train, y_train) prediction_gbm = gbm.predict(x_test) gmbscore = accuracy_score(y_test, prediction_gbm) interval=time.time()-start_time #eta=0.3 max_depth=25 obj=mult num_class=20 #Ensemblistes svc_clf8 = LinearSVC(C=0.8) svc_clf8.fit(np.log(x_train+1), y_train) decision_svc=svc_clf8.decision_function(x_test) prediction_svc8=svc_clf8.predict(x_test) svc_score8 = accuracy_score(y_test, prediction_svc8) Ridge_clf = RidgeClassifier(alpha=1) Ridge_clf.fit(x_train, y_train) decision_ridge=Ridge_clf.decision_function(x_test) prediction_ridge=Ridge_clf.predict(x_test) Ridge_clf_score = accuracy_score(y_test, prediction_ridge) PAC_clf = PassiveAggressiveClassifier(C=0.1) PAC_clf.fit(x_train, y_train) decision_pac=PAC_clf.decision_function(x_test) prediction_PAC=PAC_clf.predict(x_test) PAC_clf_score = accuracy_score(y_test, prediction_PAC) from sklearn.linear_model import RandomizedLogisticRegression RandomizedLogisticRegression_clf = RandomizedLogisticRegression(C=5,n_jobs=-1) RandomizedLogisticRegression_clf.fit(x_train, y_train) prediction_RandomizedLogisticRegression=RandomizedLogisticRegression_clf.predict(x_test) RandomizedLogisticRegression_clf_score = accuracy_score(y_test, prediction_RandomizedLogisticRegression)
class IntentRidgeClassifier:
    """Intent classifier built on scikit-learn's ``RidgeClassifier``.

    Sentences are vectorised with ``data_processor.sentence_2_vec`` using a
    word-to-index vocabulary, and labels are encoded through a
    label-to-index dictionary.
    """

    def __init__(self):
        self.model = None
        self.x_train = None
        self.y_train = None
        self.language = None
        self.word2index = None
        self.dict_labels = None

    def set_word2index(self, word2index: dict):
        """Install a prebuilt vocabulary (word -> index)."""
        self.word2index = word2index

    def set_dict_labels(self, dict_labels: dict):
        """Install a prebuilt label dictionary (label -> index)."""
        self.dict_labels = dict_labels

    def set_data_train(self, x_raw, y_raw, language):
        """Store the raw training data and preprocess it.

        Nothing is stored when any argument is ``None``; a message is
        printed instead.
        """
        if x_raw is None or y_raw is None or language is None:
            print('Data train is None')
            return
        self.x_train = x_raw
        self.y_train = y_raw
        self.language = language
        self.process_data()

    def process_data(self):
        """Tokenize, build the vocabulary if missing, and vectorise the data.

        ``self.dict_labels`` is always rebuilt from ``self.y_train``, even
        when one was installed through ``set_dict_labels``.
        """
        # tokenize
        if self.language == constant.LANG_JP:
            self.x_train = [
                data_processor.japanese_segment(sentence)
                for sentence in self.x_train
            ]
        # build vocab
        if self.word2index is None:
            unique_words = list({
                word
                for sentence in self.x_train
                for word in sentence.split(' ')
            })
            self.word2index = {
                word: index for index, word in enumerate(unique_words)
            }
        # Convert data to vector
        self.x_train = [
            data_processor.sentence_2_vec(sentence, self.word2index,
                                          len(self.word2index) + 1)
            for sentence in self.x_train
        ]
        self.dict_labels = data_processor.make_dict_labels(self.y_train)
        self.y_train = data_processor.label2vec(self.y_train,
                                                self.dict_labels)

    def build_model(self):
        """Create the (unfitted) ridge classifier."""
        self.model = RidgeClassifier(alpha=0.5,
                                     class_weight=None,
                                     copy_X=True,
                                     fit_intercept=True,
                                     solver='svd',
                                     tol=1)

    def train_model(self):
        """Fit the model on the preprocessed training data."""
        self.model.fit(self.x_train, self.y_train)

    def predict(self, x_raw):
        """Predict the intent of one sentence.

        Returns:
            ``None`` when the sentence vector sums to zero, otherwise a
            dict ``{'intent': label, 'prob': probability}``.
        """
        # NOTE(review): training data is segmented for LANG_JP in
        # process_data but x_raw is not segmented here — confirm that
        # callers pass an already segmented sentence.
        x_to_predict = data_processor.sentence_2_vec(
            x_raw, self.word2index, len(self.word2index) + 1)
        if sum(x_to_predict) == 0:
            return None
        # The factor 5 sharpens the softmax computed from the scores.
        scores = self.model.decision_function([x_to_predict])[0] * 5
        index_to_label = {
            index: label for label, index in self.dict_labels.items()
        }
        return self._pick_intent(scores, index_to_label)

    @staticmethod
    def _pick_intent(scores, index_to_label):
        """Turn decision scores into the most probable intent.

        The original code returned label index 0 whenever fewer than three
        labels existed, with a softmax over a single score (always 1.0) as
        probability. Here a single binary score ``s`` is expanded to the
        two-class scores ``[-s, s]`` so both labels can be predicted.
        Assumes label index 1 is the class scored positively by the model
        — TODO confirm against ``data_processor.label2vec``.
        """
        if len(index_to_label) < 2:
            # Single known label: nothing to choose between.
            return {'intent': index_to_label[0], 'prob': 1.0}
        scores = np.atleast_1d(np.asarray(scores, dtype=float))
        if scores.size == 1:
            scores = np.array([-scores[0], scores[0]])
        # Subtract the maximum before exponentiating to avoid overflow.
        exp_scores = np.exp(scores - scores.max())
        probs = exp_scores / exp_scores.sum()
        best = int(np.argmax(probs))
        return {'intent': index_to_label[best], 'prob': probs[best]}

    def save_word2index(self, file_path):
        """Write the vocabulary to ``file_path``."""
        io_utils.save_dict_to_file(self.word2index, file_path)

    def save_model(self, model_path):
        """Pickle the model to ``model_path``; print a message if unset."""
        if self.model is not None:
            with open(model_path, 'wb') as fw:
                pickle.dump(self.model, fw)
        else:
            print('Model is None.')

    def load_model(self, model_path):
        """Load a pickled model; on any error set ``self.model`` to None.

        SECURITY: unpickling executes arbitrary code, so ``model_path``
        must only ever point to a trusted file.
        """
        try:
            with open(model_path, 'rb') as fr:
                self.model = pickle.load(fr)
        except Exception:
            self.model = None
            print('Error when load model \n', traceback.format_exc())

    def save_dict_labels(self, file_path):
        """Write the label dictionary to ``file_path``."""
        io_utils.save_dict_to_file(self.dict_labels, file_path)
def train_fold(train_ind, test_ind, val_ind, graph_feat, graph_feat2,
               features, y, y_data, idx, lr, params, subject_IDs,
               pathToSave, i):
    """
    Train and evaluate one fold with a ridge baseline and the GCN model.

        train_ind       : indices of the training samples
        test_ind        : indices of the test samples
        val_ind         : indices of the validation samples
        graph_feat      : population graph computed from phenotypic
                          measures num_subjects x num_subjects
        graph_feat2     : second population graph, combined with the
                          feature affinity in the same way
        features        : feature vectors num_subjects x num_features
        y               : ground truth labels (num_subjects x 1)
        y_data          : ground truth labels - different representation
                          (num_subjects x 2)
        idx, lr, i      : forwarded unchanged to Train.run_training
        params          : dictionnary of GCNs parameters
        subject_IDs     : list of subject IDs (unused here)
        pathToSave      : output prefix; results go to pathToSave + 'excel/'

    returns:

        test_acc    : number of correctly classified test samples (GCNs)
        test_auc    : area under curve over the test samples using GCNs
        lin_acc     : number of correctly classified test samples using
                      the linear classifier
        lin_auc     : area under curve over the test samples using the
                      linear classifier
        fold_size   : number of test samples, wrapped as [[n]] (shape kept
                      from the original implementation)

    Also relies on the module-level names `trial` and `prediction`.
    """
    tf.reset_default_graph()
    tf.app.flags._global_parser = argparse.ArgumentParser()

    print(len(train_ind))
    n_test = len(test_ind)

    # selection of a subset of data if running experiments with a subset of
    # the training set
    # NOTE(review): lower-case `reader` here versus `Reader` on the next
    # statement — confirm both names are really in scope.
    labeled_ind = reader.site_percentage(train_ind, 1.0)

    # feature selection/dimensionality reduction step
    x_data = Reader.feature_selection(features, y, labeled_ind,
                                      params['num_features'])

    # Calculate all pairwise distances
    distv = distance.pdist(x_data, metric='correlation')
    # Convert to a square symmetric distance matrix
    dist = distance.squareform(distv)
    sigma = np.mean(dist)
    # Get affinity from similarity matrix
    sparse_graph = np.exp(- dist ** 2 / (2 * sigma ** 2))
    num_nodes = 662  # hard-coded value written to the 'folds' field below
    final_graph = graph_feat * sparse_graph  # Gender
    final_graph2 = graph_feat2 * sparse_graph  # Age

    # Linear classifier
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())
    # Compute the accuracy
    lin_acc = clf.score(x_data[test_ind, :], y[test_ind].ravel())
    # Compute the AUC (labels are shifted down by one for the metric)
    pred = clf.decision_function(x_data[test_ind, :])
    lin_auc = sklearn.metrics.roc_auc_score(y[test_ind] - 1, pred)

    print("Linear Accuracy: " + str(lin_acc))

    # Classification with GCNs
    test_acc, test_auc, weights = Train.run_training(
        final_graph, final_graph2, sparse.coo_matrix(x_data).tolil(),
        y_data, train_ind, val_ind, test_ind, idx, lr, params, pathToSave, i)

    weights_0 = weights[0]
    weights_1 = weights[1]

    # Per-fold summaries; accuracies are still fractions at this point.
    scores_lin_ = np.sum([lin_acc])
    scores_auc_lin_ = np.mean([lin_auc])
    scores_acc_ = np.sum([test_acc])
    scores_auc_ = np.mean([test_auc])

    if not os.path.exists(pathToSave + 'excel/'):
        os.makedirs(pathToSave + 'excel/')
    pathToSave2 = pathToSave + 'excel/'

    result_name = 'ABIDE_classification.mat'
    sio.savemat(pathToSave2 + str(trial) + result_name,
                {'lin': scores_lin_, 'lin_auc': scores_auc_lin_,
                 'acc': scores_acc_, 'auc': scores_auc_,
                 'folds': num_nodes,
                 'weights_0': weights_0, 'weights_1': weights_1})
    df = pd.DataFrame({'scores_acc': [scores_acc_],
                       'scores_auc': [scores_auc_],
                       'scores_lin': [scores_lin_],
                       'scores_auc_lin': [scores_auc_lin_],
                       'weights_0': weights_0, 'weights_1': weights_1})
    prediction.append(df)

    # Write the dataframe to an Excel file named after the first test
    # index. The context manager closes the writer (ExcelWriter.save() was
    # removed in pandas 2.0) even if to_excel fails.
    with pd.ExcelWriter(pathToSave2 + str(test_ind[0]) + '.xlsx',
                        engine='xlsxwriter') as writer_n:
        df.to_excel(writer_n, sheet_name='Sheet1')

    # return number of correctly classified samples instead of percentage.
    # The original applied this scaling to test_acc twice, returning
    # count * fold size.
    test_acc = int(round(test_acc * n_test))
    lin_acc = int(round(lin_acc * n_test))

    # The original wrapped the fold size in a list twice; the resulting
    # [[n]] shape is kept so existing callers see the same structure.
    fold_size = [[n_test]]

    return test_acc, test_auc, lin_acc, lin_auc, fold_size
def train_fold(train_ind, test_ind, val_ind, graph_feat, features, y, y_data,
               params, subject_IDs, pathToSave, i, subject_labels, idx):
    """
    Train and evaluate one fold with a ridge baseline and the GCN model.

        train_ind       : indices of the training samples
        test_ind        : indices of the test samples
        val_ind         : indices of the validation samples
        graph_feat      : population graph computed from phenotypic
                          measures num_subjects x num_subjects
        features        : feature vectors num_subjects x num_features
        y               : ground truth labels (num_subjects x 1)
        y_data          : ground truth labels - different representation
        params          : dictionnary of GCNs parameters
        subject_IDs     : list of subject IDs (unused here)
        pathToSave      : output prefix; results go to pathToSave + 'excel/'
        i               : forwarded unchanged to Train.run_training
        subject_labels  : unused here
        idx             : passed to Reader.get_affinity

    returns:

        test_acc    : number of correctly classified test samples (GCNs)
        test_auc    : area under curve over the test samples using GCNs
        lin_acc     : number of correctly classified test samples using
                      the linear classifier
        lin_auc     : area under curve over the test samples using the
                      linear classifier
        fold_size   : number of test samples, wrapped as [[n]] (shape kept
                      from the original implementation)
        n_test      : number of test samples as a plain int

    Also relies on the module-level names `FLAGS`, `trial` and `prediction`.
    """
    print(len(train_ind))
    tf.reset_default_graph()
    tf.app.flags._global_parser = argparse.ArgumentParser()

    num_nodes = np.size(graph_feat, 0)
    n_test = len(test_ind)

    # Round the features to 4 decimals in one vectorised call. The original
    # did this twice with nested loops whose index `i` overwrote the `i`
    # argument, so Train.run_training received the last row index instead.
    x_rounded = np.round(features.astype(float), 4)
    # Copy with NaNs replaced by 0; used for the graph and both classifiers.
    x_data = np.where(np.isnan(x_rounded), 0, x_rounded)

    distv = distance.pdist(x_data, metric='correlation')
    dist = distance.squareform(distv)
    sigma = np.mean(dist)
    # Get affinity from similarity matrix
    sparse_graph = np.exp(- dist ** 2 / (2 * sigma ** 2))
    graph = Reader.get_affinity(sparse_graph, idx)

    # As before, the CSV dump holds the rounded features *before* NaN
    # replacement.
    np.savetxt("x_data.csv", x_rounded, delimiter=',')

    # Linear classifier
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())
    # Compute the accuracy
    lin_acc = clf.score(x_data[test_ind, :], y[test_ind].ravel())
    # Compute the AUC against labels binarized over classes 0, 1, 2
    pred = clf.decision_function(x_data[test_ind, :])
    y_one_hot = label_binarize(y[test_ind], classes=np.arange(3))
    lin_auc = sklearn.metrics.roc_auc_score(y_one_hot, pred)

    # Classification with GCNs
    test_acc, test_auc, weights, confusion = Train.run_training(
        graph, sparse.coo_matrix(x_data).tolil(), y_data, train_ind, val_ind,
        test_ind, params, pathToSave, i)

    # Per-fold summaries. The GCN accuracy is stored as a sample count,
    # rounded instead of truncated so float error cannot drop a sample and
    # the stored value matches the returned one.
    scores_lin_ = np.sum([lin_acc])
    scores_auc_lin_ = np.mean([lin_auc])
    scores_acc_ = int(round(np.sum([test_acc]) * n_test))
    scores_auc_ = np.mean([test_auc])

    if not os.path.exists(pathToSave + 'excel/'):
        os.makedirs(pathToSave + 'excel/')
    pathToSave2 = pathToSave + 'excel/'

    result_name = 'ABIDE_classification.mat'
    mat_content = {'lin': scores_lin_, 'lin_auc': scores_auc_lin_,
                   'acc': scores_acc_, 'auc': scores_auc_,
                   'folds': num_nodes}
    frame_content = {'scores_acc': [scores_acc_],
                     'scores_auc': [scores_auc_],
                     'scores_lin': [scores_lin_],
                     'scores_auc_lin': [scores_auc_lin_]}
    if FLAGS.model == 'gcn_cheby':
        # Only this model type has its first three weight entries saved.
        layer_weights = {'weights_0': weights[0],
                         'weights_1': weights[1],
                         'weights_2': weights[2]}
        mat_content.update(layer_weights)
        frame_content.update(layer_weights)
    frame_content['confusion_matrix'] = [confusion]

    sio.savemat(pathToSave2 + str(trial) + result_name, mat_content)
    df = pd.DataFrame(frame_content)
    prediction.append(df)

    # Write the dataframe to an Excel file named after the first test
    # index. The context manager closes the writer (ExcelWriter.save() was
    # removed in pandas 2.0) even if to_excel fails.
    with pd.ExcelWriter(pathToSave2 + str(test_ind[0]) + '.xlsx',
                        engine='xlsxwriter') as writer_n:
        df.to_excel(writer_n, sheet_name='Sheet1')

    # return number of correctly classified samples instead of percentage
    lin_acc = int(round(lin_acc * n_test))
    test_acc = int(round(test_acc * n_test))

    # The original wrapped the fold size in a list twice; the resulting
    # [[n]] shape is kept so existing callers see the same structure.
    fold_size = [[n_test]]

    return test_acc, test_auc, lin_acc, lin_auc, fold_size, n_test
# Predict labels for the training and validation feature sets.
pred_train_label = model.predict(feature_train_)
pred_val_label = model.predict(feature_validation_)

# Model validation; tune hyperparameters according to the validation results
acc_train = metrics.accuracy_score(label_train, pred_train_label)
f1score_train = metrics.f1_score(label_train, pred_train_label)
acc_validation = metrics.accuracy_score(label_validation, pred_val_label)
f1score_validation = metrics.f1_score(label_validation, pred_val_label)
print(
    f"acc_train = {acc_train:.3f}; f1score_train = {f1score_train}\nacc_validation = {acc_validation:.8f}; f1score_validaton = {f1score_validation}"
)

#%% ============================Final test============================
# Preferably use an external test set
pred_test_label = model.predict(feature_test_)
# Despite the name, these are decision-function scores, not probabilities.
pred_test_prob = model.decision_function(feature_test_)
acc_test = metrics.accuracy_score(label_test, pred_test_label)
f1score_test = metrics.f1_score(label_test, pred_test_label)
print(f"acc_test = {acc_test:.8f}; f1score_test = {f1score_test}\n")

#%% ============================Result visualization============================
# Get the weights
wei = model.coef_
# Standardize the coefficients (zero mean, unit standard deviation).
wei = (wei - wei.mean()) / wei.std()
# Map the weights back through the feature selector and then the PCA.
wei = selector.inverse_transform(wei)
wei = pca.inverse_transform(wei)
# Scatter the first weight row into an array shaped like `mask`.
weight = np.zeros(mask.shape)
weight[mask] = wei[0]
# Add the transpose; presumably `mask` selects one triangle of a symmetric
# matrix — TODO confirm.
weight = weight + weight.T
# Show only the top 0.2% of the weights
def classify(granularity=10):
    """Train a ridge text classifier on GeoText data and report location errors.

    Training documents are loaded from
    ``processed_data/<granularity>_clustered/`` and test documents from
    ``processed_data/test`` (both under ``GEOTEXT_HOME``).  Documents are
    vectorized with TF-IDF, classified with ``RidgeClassifier`` and every
    predicted class is mapped to its mean/median coordinates through the
    module-level ``classLat*`` / ``classLon*`` tables.  The result is compared
    with the user's "lat,lon" entry in ``userLocation``.

    Summary statistics are printed and a distance-vs-confidence figure is
    written to ``confidence.png`` inside ``GEOTEXT_HOME``.

    All output goes through ``print(...)`` calls with a single argument so the
    function parses and prints the same on Python 2 and Python 3.

    Args:
        granularity: value used to build the name of the training directory.

    Returns:
        None.
    """
    trainDir = path.join(
        GEOTEXT_HOME,
        'processed_data/' + str(granularity).strip() + '_clustered/')
    testDir = path.join(GEOTEXT_HOME, 'processed_data/test')

    data_train = load_files(trainDir, encoding=encoding)
    data_test = load_files(testDir, encoding=encoding)
    categories = data_train.target_names

    def size_mb(docs):
        # Size of the encoded documents in megabytes.
        return sum(len(s.encode(encoding)) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" %
          (len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" %
          (len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    # print("") gives a blank line on both Python 2 and 3; a bare print()
    # would output "()" under Python 2.
    print("")

    y_train = data_train.target

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    vectorizer = TfidfVectorizer(use_idf=True, norm='l2', binary=False,
                                 sublinear_tf=True, min_df=2, max_df=1.0,
                                 ngram_range=(1, 1), stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print("")

    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print("")

    # Optional chi-squared feature selection; disabled by default.
    chi = False
    if chi:
        k = 500000
        # Report the actual number of selected features (was hard-coded to 0).
        print("Extracting %d best features by a chi-squared test" % k)
        t0 = time()
        ch2 = SelectKBest(chi2, k=k)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print("done in %fs" % (time() - t0))
        print("")

    feature_names = np.asarray(vectorizer.get_feature_names())

    clf = RidgeClassifier(tol=1e-2, solver="auto")
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    scores = clf.decision_function(X_test)
    print(scores.shape)
    print(pred.shape)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print("top 10 keywords per class:")
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print("%s: %s" % (category, " ".join(feature_names[top10])))

    sumMeanDistance = 0
    sumMedianDistance = 0
    distances = []
    confidences = []
    randomConfidences = []
    for i, predicted in enumerate(pred):
        # Test file names are the user ids used as keys of userLocation.
        user = path.basename(data_test.filenames[i])
        location = userLocation[user].split(',')
        lat = float(location[0])
        lon = float(location[1])
        prediction = categories[predicted]

        # Confidence = margin of the winning class over the mean class score.
        confidences.append(scores[i][predicted] - mean(scores[i]))
        # Score of a random class, collected as a baseline (not plotted below).
        randomConfidences.append(
            scores[i][random.randint(0, len(categories) - 1)])

        # Compute the median-based error once and reuse it.
        medianDistance = distance(lat, lon, classLatMedian[prediction],
                                  classLonMedian[prediction])
        distances.append(medianDistance)
        sumMedianDistance = sumMedianDistance + medianDistance
        sumMeanDistance = sumMeanDistance + distance(
            lat, lon, classLatMean[prediction], classLonMean[prediction])

    averageMeanDistance = sumMeanDistance / float(len(pred))
    averageMedianDistance = sumMedianDistance / float(len(pred))
    print("Average mean distance is " + str(averageMeanDistance))
    print("Average median distance is " + str(averageMedianDistance))
    print("Median distance is " + str(median(distances)))

    fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
    plt.xlim(0, 4000)
    plt.ylim(0, 2)
    ax1.scatter(distances, confidences)
    ax2.bar(distances, confidences)
    plt.savefig(path.join(GEOTEXT_HOME, 'confidence.png'))
# Fit both base classifiers on the same training data.
naive_bayes.fit(X, y)
ridge = RidgeClassifier(random_state=rng)
ridge.fit(X, y)

#%% Testing;
# Create some random inputs;
num_test_docs = 100
X_test = rng.randint(max_occurrence_of_ngram, size=(num_test_docs, num_features))

# Per-class outputs of each model, plus each model's own predicted labels.
nb_scores = naive_bayes.predict_proba(X_test)
print(naive_bayes.predict(X_test))
ridge_scores = ridge.decision_function(X_test)
print(ridge.predict(X_test))
# Ensemble prediction: softmax each model's scores, add them, take the argmax.
# NOTE(review): nb_scores already come from predict_proba, so softmax is
# applied on top of probabilities here -- confirm this is intended.
print(np.argmax(softmax(nb_scores) + softmax(ridge_scores), axis=1))

#%% Testing, using hand-made functions;
# Same predictions recomputed from the fitted parameters with the
# hand-written helpers, for comparison with the outputs printed above.
nb_res_2 = naive_bayes_predict(X_test, naive_bayes.feature_log_prob_, naive_bayes.class_log_prior_)
print(np.argmax(nb_res_2, axis=1))
ridge_pred_2 = ridge_pred(X_test, ridge.coef_, ridge.intercept_)
print(np.argmax(ridge_pred_2, axis=1))
print(np.argmax(softmax(nb_res_2) + softmax(ridge_pred_2), axis=1))
class RBFNetworkClassifier(BaseEstimator, ClassifierMixin):
    """Classifier built as a network of radial (Gaussian) basis functions.

    Internally a linear RidgeClassifier is used to fit the weights of the
    final model."""

    def __init__(self, k=7, alpha=1.0, batch_size=100, random_state=None):
        """Build a classifier with the required parameters:
            - k: number of centers to choose.
            - alpha: value of the regularization constant.
            - batch_size: batch size for the unsupervised clustering.
            - random_state: random seed."""
        self.k = k
        self.alpha = alpha
        self.batch_size = batch_size
        self.random_state = random_state
        # Filled in by fit().
        self.centers = None
        self.r = None

    def _choose_centers(self, X):
        """Pick the k centers of the data using k-means."""
        # Only override init_size when three batches would not cover k centers.
        if 3 * self.batch_size <= self.k:
            init_size = 3 * self.k
        else:
            init_size = None

        clustering = MiniBatchKMeans(n_clusters=self.k,
                                     batch_size=self.batch_size,
                                     init_size=init_size,
                                     random_state=self.random_state)
        clustering.fit(X)
        self.centers = clustering.cluster_centers_

    def _choose_radius(self, X):
        """Pick the radius used by the radial transformation."""
        # "Diameter" of the data: largest pairwise euclidean distance.
        pairwise = euclidean_distances(X, X)
        diameter = np.max(pairwise)
        exponent = 1 / self.n_features_in_
        self.r = diameter / (self.k**exponent)

    def _transform_rbf(self, X):
        """Transform the data with the RBF kernel w.r.t. the centers."""
        gamma = 1 / (2 * self.r**2)
        return rbf_kernel(X, self.centers, gamma)

    def fit(self, X, y):
        """Train the model."""
        # Remember the classes and the number of features seen in training.
        self.classes_ = unique_labels(y)
        self.n_features_in_ = X.shape[1]

        # k centers from k-means, then the radius of the RBF kernel.
        self._choose_centers(X)
        self._choose_radius(X)

        # Underlying linear model, trained on the RBF-transformed data.
        self.model = RidgeClassifier(alpha=self.alpha,
                                     random_state=self.random_state)
        self.model.fit(self._transform_rbf(X), y)

        # Expose the fitted coefficients.
        self.intercept_ = self.model.intercept_
        self.coef_ = self.model.coef_
        return self

    def score(self, X, y=None):
        """Score of the linear model on the RBF-transformed data."""
        return self.model.score(self._transform_rbf(X), y)

    def predict(self, X):
        """Predictions of the linear model on the RBF-transformed data."""
        return self.model.predict(self._transform_rbf(X))

    def decision_function(self, X):
        """Decision function of the linear model on the RBF-transformed data."""
        return self.model.decision_function(self._transform_rbf(X))
def train_fold(train_ind, test_ind, val_ind, graph_feat, features, y, y_data,
               params, subject_IDs):
    """Run one cross-validation fold with a ridge baseline and the GCN model.

        train_ind   : indices of the training samples
        test_ind    : indices of the test samples
        val_ind     : indices of the validation samples
        graph_feat  : population graph computed from phenotypic measures
                      (num_subjects x num_subjects)
        features    : feature vectors (num_subjects x num_features)
        y           : ground truth labels (num_subjects x 1)
        y_data      : ground truth labels - different representation
                      (num_subjects x 2)
        params      : dictionary of GCN parameters; 'num_training' and
                      'num_features' are read here
        subject_IDs : list of subject IDs

    returns:

        test_acc    : number of correctly classified test samples (GCN)
        test_auc    : area under curve on the test samples (GCN)
        lin_acc     : number of correctly classified test samples (ridge)
        lin_auc     : area under curve on the test samples (ridge)
        fold_size   : number of test samples
    """
    n_test = len(test_ind)
    print(len(train_ind))

    # Optionally restrict the labelled set to a subset of the training data.
    labeled_ind = Reader.site_percentage(train_ind, params['num_training'],
                                         subject_IDs)

    # Feature selection / dimensionality reduction.
    x_data = Reader.feature_selection(features, y, labeled_ind,
                                      params['num_features'])

    # Square matrix of pairwise correlation distances between subjects.
    dist = distance.squareform(distance.pdist(x_data, metric='correlation'))
    sigma = np.mean(dist)

    # Gaussian affinity from the distances, masked by the population graph.
    sparse_graph = np.exp(-dist**2 / (2 * sigma**2))
    final_graph = graph_feat * sparse_graph

    # Linear baseline: accuracy and AUC on the test samples.
    x_test = x_data[test_ind, :]
    y_test = y[test_ind]
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())
    lin_acc = clf.score(x_test, y_test.ravel())
    pred = clf.decision_function(x_test)
    lin_auc = sklearn.metrics.roc_auc_score(y_test - 1, pred)
    print("Linear Accuracy: " + str(lin_acc))

    # Classification with GCNs.
    gcn_features = sparse.coo_matrix(x_data).tolil()
    test_acc, test_auc = Train.run_training(final_graph, gcn_features, y_data,
                                            train_ind, val_ind, test_ind,
                                            params)
    print(test_acc)

    # Return counts of correctly classified samples instead of percentages.
    test_acc = int(round(test_acc * n_test))
    lin_acc = int(round(lin_acc * n_test))
    return test_acc, test_auc, lin_acc, lin_auc, n_test
def classify(granularity=10):
    """Train a ridge text classifier on GeoText data and report location errors.

    Training documents are loaded from
    ``processed_data/<granularity>_clustered/`` and test documents from
    ``processed_data/test`` (both under ``GEOTEXT_HOME``).  Documents are
    vectorized with TF-IDF, classified with ``RidgeClassifier`` and every
    predicted class is mapped to its mean/median coordinates through the
    module-level ``classLat*`` / ``classLon*`` tables.  The result is compared
    with the user's "lat,lon" entry in ``userLocation``.

    Summary statistics are printed and a distance-vs-confidence figure is
    written to ``confidence.png`` inside ``GEOTEXT_HOME``.

    All output goes through ``print(...)`` calls with a single argument so the
    function parses and prints the same on Python 2 and Python 3.

    Args:
        granularity: value used to build the name of the training directory.

    Returns:
        None.
    """
    trainDir = path.join(
        GEOTEXT_HOME,
        'processed_data/' + str(granularity).strip() + '_clustered/')
    testDir = path.join(GEOTEXT_HOME, 'processed_data/test')

    data_train = load_files(trainDir, encoding=encoding)
    data_test = load_files(testDir, encoding=encoding)
    categories = data_train.target_names

    def size_mb(docs):
        # Size of the encoded documents in megabytes.
        return sum(len(s.encode(encoding)) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" %
          (len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" %
          (len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    # print("") gives a blank line on both Python 2 and 3; a bare print()
    # would output "()" under Python 2.
    print("")

    y_train = data_train.target

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    vectorizer = TfidfVectorizer(use_idf=True, norm='l2', binary=False,
                                 sublinear_tf=True, min_df=2, max_df=1.0,
                                 ngram_range=(1, 1), stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print("")

    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print("")

    # Optional chi-squared feature selection; disabled by default.
    chi = False
    if chi:
        k = 500000
        # Report the actual number of selected features (was hard-coded to 0).
        print("Extracting %d best features by a chi-squared test" % k)
        t0 = time()
        ch2 = SelectKBest(chi2, k=k)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print("done in %fs" % (time() - t0))
        print("")

    feature_names = np.asarray(vectorizer.get_feature_names())

    clf = RidgeClassifier(tol=1e-2, solver="auto")
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    scores = clf.decision_function(X_test)
    print(scores.shape)
    print(pred.shape)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print("top 10 keywords per class:")
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print("%s: %s" % (category, " ".join(feature_names[top10])))

    sumMeanDistance = 0
    sumMedianDistance = 0
    distances = []
    confidences = []
    randomConfidences = []
    for i, predicted in enumerate(pred):
        # Test file names are the user ids used as keys of userLocation.
        user = path.basename(data_test.filenames[i])
        location = userLocation[user].split(',')
        lat = float(location[0])
        lon = float(location[1])
        prediction = categories[predicted]

        # Confidence = margin of the winning class over the mean class score.
        confidences.append(scores[i][predicted] - mean(scores[i]))
        # Score of a random class, collected as a baseline (not plotted below).
        randomConfidences.append(
            scores[i][random.randint(0, len(categories) - 1)])

        # Compute the median-based error once and reuse it.
        medianDistance = distance(lat, lon, classLatMedian[prediction],
                                  classLonMedian[prediction])
        distances.append(medianDistance)
        sumMedianDistance = sumMedianDistance + medianDistance
        sumMeanDistance = sumMeanDistance + distance(
            lat, lon, classLatMean[prediction], classLonMean[prediction])

    averageMeanDistance = sumMeanDistance / float(len(pred))
    averageMedianDistance = sumMedianDistance / float(len(pred))
    print("Average mean distance is " + str(averageMeanDistance))
    print("Average median distance is " + str(averageMedianDistance))
    print("Median distance is " + str(median(distances)))

    fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
    plt.xlim(0, 4000)
    plt.ylim(0, 2)
    ax1.scatter(distances, confidences)
    ax2.bar(distances, confidences)
    plt.savefig(path.join(GEOTEXT_HOME, 'confidence.png'))
# Fit the vectorizer on the training texts; the bare expression below is a
# notebook cell output (displays the resulting matrix).
X_train = vectorizer.fit_transform(train.values)
X_train

# Let's explain how our model recognizes toxic comments

# In[ ]:

classifier = RidgeClassifier(solver='sag')
y = ys['toxic'].values
kf = KFold(n_splits=5, shuffle=True, random_state=239)
# Only the first fold is evaluated: the loop breaks after one iteration, and
# the classifier fitted on that fold is the one explained below.
for train_index, test_index in kf.split(X_train):
    classifier = RidgeClassifier(solver='sag')
    classifier.fit(X_train[train_index], y[train_index])
    # Raw decision scores are enough to compute the ROC AUC.
    predict = classifier.decision_function(X_train[test_index])
    cv_score = roc_auc_score(y[test_index], predict)
    print(cv_score)
    break

# In[ ]:

# Global explanation: per-feature weights of the fitted classifier.
eli5.show_weights(classifier, vec=vectorizer)

# In[ ]:

train[COMMENT].values[6]

# In[ ]:

# Local explanation for a single document.
eli5.show_prediction(classifier, doc=train.values[6], vec=vectorizer)
class Level1Model(object):
    """Level-1 ridge model on one-hot encoded features.

    The model reads the train (and optionally test) CSV files, bins a few
    continuous features, one-hot encodes low-cardinality columns into a sparse
    matrix, and produces out-of-fold (OOF) decision scores with a
    ``RidgeClassifier``.  When ``submit`` is true it also produces submission
    scores and writes both sets of predictions to CSV files.

    With ``submit=False`` only the training data is read and only the OOF
    score is computed; no submission-side matrix is touched.
    """

    # Features used for training.  The trailing numbers are the original
    # author's importance / shadow-importance notes.
    train_features = [
        "ps_car_13",  # : 1571.65 / shadow 609.23
        "ps_reg_03",  # : 1408.42 / shadow 511.15
        "ps_ind_05_cat",  # : 1387.87 / shadow 84.72
        "ps_ind_03",  # : 1219.47 / shadow 230.55
        "ps_ind_15",  # : 922.18 / shadow 242.00
        "ps_reg_02",  # : 920.65 / shadow 267.50
        "ps_car_14",  # : 798.48 / shadow 549.58
        "ps_car_12",  # : 731.93 / shadow 293.62
        "ps_car_01_cat",  # : 698.07 / shadow 178.72
        "ps_car_07_cat",  # : 694.53 / shadow 36.35
        "ps_ind_17_bin",  # : 620.77 / shadow 23.15
        "ps_car_03_cat",  # : 611.73 / shadow 50.67
        "ps_reg_01",  # : 598.60 / shadow 178.57
        "ps_car_15",  # : 593.35 / shadow 226.43
        "ps_ind_01",  # : 547.32 / shadow 154.58
        "ps_ind_16_bin",  # : 475.37 / shadow 34.17
        "ps_ind_07_bin",  # : 435.28 / shadow 28.92
        "ps_car_06_cat",  # : 398.02 / shadow 212.43
        "ps_car_04_cat",  # : 376.87 / shadow 76.98
        "ps_ind_06_bin",  # : 370.97 / shadow 36.13
        "ps_car_09_cat",  # : 214.12 / shadow 81.38
        "ps_car_02_cat",  # : 203.03 / shadow 26.67
        "ps_ind_02_cat",  # : 189.47 / shadow 65.68
        "ps_car_11",  # : 173.28 / shadow 76.45
        "ps_car_05_cat",  # : 172.75 / shadow 62.92
        "ps_calc_09",  # : 169.13 / shadow 129.72
        "ps_calc_05",  # : 148.83 / shadow 120.68
        "ps_ind_08_bin",  # : 140.73 / shadow 27.63
        "ps_car_08_cat",  # : 120.87 / shadow 28.82
        "ps_ind_09_bin",  # : 113.92 / shadow 27.05
        "ps_ind_04_cat",  # : 107.27 / shadow 37.43
        "ps_ind_18_bin",  # : 77.42 / shadow 25.97
        "ps_ind_12_bin",  # : 39.67 / shadow 15.52
        "ps_ind_14",  # : 37.37 / shadow 16.65
    ]

    def __init__(self,
                 strat=True,
                 splits=5,
                 random_state=15,
                 submit=False,
                 mean_sub=False,
                 metric=None):
        # type: (bool, int, int, bool, bool, Callable) -> None
        """Configure the cross-validation scheme and build the model.

        Args:
            strat: use ``StratifiedKFold`` when true, plain ``KFold`` otherwise.
            splits: number of folds.
            random_state: seed of the fold splitter.
            submit: also read the test set and write prediction files.
            mean_sub: average per-fold submission scores instead of refitting
                on the full training set.
            metric: callable ``metric(y_true, scores)``; required by
                ``predict_oof_and_submission``.
        """
        self.curr_date = datetime.datetime.now()
        self._submit = submit
        self._id = ""
        self.trn = None
        self.target = None
        self.sub = None
        self.model = None
        self.metric = metric
        self.mean_submission = mean_sub
        self.trn_csr = None
        self.sub_csr = None
        # Submission-side diagnostics, filled by prepare_data() when submitting.
        self.sub_csr_not_enough = None
        self.sub_csr_occurences = None
        if strat:
            self._folds = StratifiedKFold(n_splits=splits,
                                          shuffle=True,
                                          random_state=random_state)
        else:
            self._folds = KFold(n_splits=splits,
                                shuffle=True,
                                random_state=random_state)
        self.set_model()

    def set_model(self):
        """Instantiate the underlying ridge classifier."""
        # `normalize=False` used to be passed here; it was the default and the
        # parameter no longer exists in recent scikit-learn releases.
        self.model = RidgeClassifier(
            alpha=3000,  # Was 1000
            max_iter=1000,
            class_weight="balanced",  # {0: 1, 1: 2},
            random_state=1,
            solver="sag",
            tol=1e-3,
            copy_X=False,
        )

    @property
    def do_submission(self):
        """Whether submission predictions are requested."""
        return self._submit

    @property
    def id(self):
        """Identifier used in output file names and the OOF column name."""
        return self._get_id()

    # NOTE: the decorator has no enforcing effect because the class does not
    # use an ABC metaclass; it is kept so the method's attributes are unchanged.
    @abc.abstractmethod
    def _get_id(self):
        """Set and return the model identifier."""
        self._id = "ridge_dummies"
        if self._id == "":
            raise ValueError("Id is not set for class " + str(type(self)))
        return self._id

    def read_data(self):
        """Load the training CSV (and the test CSV when submitting)."""
        self.trn = pd.read_csv("../../input/train.csv", index_col=0)
        self.target = self.trn["target"]
        del self.trn["target"]
        if self.do_submission:
            self.sub = pd.read_csv("../../input/test.csv", index_col=0)

    @staticmethod
    def _stack(current, block):
        """Append `block` column-wise to `current`, or start with `block`."""
        if current is None:
            return block
        return csr_hstack((current, block))

    def prepare_data(self):
        """Bin, one-hot encode and stack the features into sparse matrices.

        Fills ``self.trn_csr`` and, when submitting, ``self.sub_csr`` together
        with the submission-side occurrence diagnostics.
        """
        # Work on copies so the column assignments below do not write into a
        # slice of the frame returned by read_csv.
        self.trn = self.trn[self.train_features].copy()
        if self.do_submission:
            self.sub = self.sub[self.train_features].copy()

        # Bin the continuous features into 20 equal-width buckets.  When a
        # submission set is present the buckets are computed on train and test
        # together; otherwise on the training data alone.
        n_trn = len(self.trn)
        for f in ["ps_reg_03", "ps_car_12", "ps_car_13", "ps_car_14"]:
            if self.do_submission:
                full_f = pd.concat([self.trn[f], self.sub[f]], axis=0)
            else:
                full_f = self.trn[f]
            full_cut = np.array(pd.cut(full_f, 20, labels=False))
            self.trn[f] = full_cut[:n_trn]
            if self.do_submission:
                self.sub[f] = full_cut[n_trn:]
            del full_f
            del full_cut

        # One-hot encode columns with 3..109 distinct values; remember the
        # binary and the high-cardinality ones.
        high_card_f = []
        binary_f = []
        for f in self.trn.columns:
            card = len(np.unique(self.trn[f]))
            if 2 < card < 110:
                print("Encoding %s" % f)
                one = OneHotEncoder(handle_unknown='ignore')
                # -1 marks a missing value; recode it before encoding.
                self.trn_csr = self._stack(
                    self.trn_csr,
                    one.fit_transform(self.trn[[f]].replace(-1, 99999)))
                if self.do_submission:
                    self.sub_csr = self._stack(
                        self.sub_csr,
                        one.transform(self.sub[[f]].replace(-1, 99999)))
            elif card <= 2:
                binary_f.append(f)
            else:
                high_card_f.append(f)

        # Add binary data
        print("Add binary feats : ", binary_f)
        self.trn_csr = csr_hstack((self.trn_csr, self.trn[binary_f]))
        if self.do_submission:
            self.sub_csr = csr_hstack((self.sub_csr, self.sub[binary_f]))

        # High-cardinality features are only reported: they would need
        # scaling first and are currently NOT added to the matrices.
        print("Add high card feats : ", high_card_f)

        print("Transform to csr")
        self.trn_csr = self.trn_csr.tocsr()
        print("CSR shape = ", self.trn_csr.shape)
        print(self.trn_csr.sum(axis=0) < 100)

        if self.do_submission:
            self.sub_csr = self.sub_csr.tocsr()
            # Column totals of the submission matrix and the columns whose
            # total is at most 100.
            self.sub_csr_occurences = np.array(self.sub_csr.sum(axis=0))[0, :]
            self.sub_csr_not_enough = self.sub_csr_occurences <= 100
            print(self.sub_csr_occurences.shape)
            print(self.sub_csr_not_enough)

    def predict_oof_and_submission(self):
        """Compute OOF scores, and submission scores/files when requested.

        Raises:
            ValueError: if the model or the metric is not set (checked before
                any data is read), or if the target, the training data or the
                requested submission data is missing after loading.
        """
        # Fail fast, before reading any file.
        if self.model is None:
            raise ValueError("Model is not set for class " + str(type(self)))
        if self.metric is None:
            raise ValueError("Metric is not set for class " + str(type(self)))

        self.read_data()
        self.prepare_data()

        if self.target is None:
            raise ValueError("Target is not set for class " + str(type(self)))
        if self.trn is None:
            raise ValueError("Training data is not set for class " +
                             str(type(self)))
        if (self.sub is None) and self.do_submission:
            raise ValueError("Submission data is not set for class " +
                             str(type(self)))

        # Prepare predictors
        oof_preds = np.zeros(len(self.trn))
        sub_preds = None
        if self.do_submission:
            sub_preds = np.zeros(len(self.sub))

        # Go through folds
        start = time.time()
        for i_fold, (trn_idx, val_idx) in enumerate(
                self._folds.split(self.target, self.target)):
            # Fit model
            self.model.fit(self.trn_csr[trn_idx], self.target.values[trn_idx])

            if self.do_submission:
                # Coefficients of the columns with few submission occurrences.
                print(self.model.coef_[0, self.sub_csr_not_enough])
                print(self.sub_csr_occurences[self.sub_csr_not_enough])

            # Predict OOF
            oof_preds[val_idx] = self.model.decision_function(
                self.trn_csr[val_idx])

            # Predict SUB if mean is requested
            if (self.sub is not None) and self.mean_submission:
                sub_preds += self.model.decision_function(
                    self.sub_csr) / self._folds.n_splits

            # Print results of current fold
            print("Fold %2d score : %.6f in [%5.1f]" %
                  (i_fold + 1,
                   self.metric(self.target.values[val_idx],
                               oof_preds[val_idx]),
                   (time.time() - start) / 60))

        # display OOF result
        oof_score = self.metric(self.target, oof_preds)
        print("Full OOF score : %.6f" % oof_score)

        # Check if we need to fit the model on the full dataset
        if (self.sub is not None) and not self.mean_submission:
            # Fit model
            self.model.fit(self.trn_csr, self.target)
            # Compute prediction for submission
            sub_preds = self.model.decision_function(self.sub_csr)

            # Make sure coefs are not crazy: list train/test occurrence counts
            # next to the absolute coefficient, largest coefficient first.
            coefs = np.abs(np.array(self.model.coef_)[0, :])
            sub_occ = np.array(self.sub_csr.sum(axis=0))[0, :]
            trn_occ = np.array(self.trn_csr.sum(axis=0))[0, :]
            for s in np.argsort(coefs)[::-1]:
                print("%6d %6d %.5f" % (trn_occ[s], sub_occ[s], coefs[s]))

        if self.do_submission:
            filename = "../output_preds/" + self.id + "_"
            filename += str(int(1e6 * oof_score)) + "_"
            filename += self.curr_date.strftime("%Y_%m_%d_%Hh%M")

            # Save OOF predictions for stacking (sigmoid of the scores)
            self.trn[self.id] = 1 / (1 + np.exp(-oof_preds))
            self.trn[[self.id]].to_csv(filename + "_oof.csv",
                                       float_format="%.9f")

            # Save submission prediction for stacking or submission
            self.sub["target"] = 1 / (1 + np.exp(-sub_preds))
            self.sub[["target"]].to_csv(filename + "_sub.csv",
                                        float_format="%.9f")
# NOTE(review): `train_index` / `test_index` are not defined in this fragment;
# presumably everything up to the `np.append` call runs once per fold inside a
# cross-validation loop whose header is not visible here -- confirm the
# nesting against the full file.
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
X_den_train, X_den_test = X_den[train_index], X_den[test_index]

# feed models
clf_mNB.fit(X_train, y_train)
clf_ridge.fit(X_train, y_train)
clf_SGD.fit(X_train, y_train)
clf_lSVC.fit(X_train, y_train)
clf_SVC.fit(X_train, y_train)

# get prediction for this fold run
# NOTE(review): predict_proba and decision_function outputs are summed below
# although they are presumably on different scales (probabilities vs. raw
# scores) -- confirm this is intended.
prob_mNB = clf_mNB.predict_proba(X_test)
prob_ridge = clf_ridge.decision_function(X_test)
prob_SGD = clf_SGD.decision_function(X_test)
prob_lSVC = clf_lSVC.decision_function(X_test)
prob_SVC = clf_SVC.predict_proba(X_test)

# add prob functions into the z 2d-array
z_temp = (prob_mNB + prob_ridge + prob_SGD + prob_lSVC + prob_SVC)
z = np.append(z, z_temp, axis=0)

# remove the first sub-1d-array of z, due to the creation with 0s
z = np.delete(z, 0, 0)
# the result of z is a 2d array with shape of (n_samples, n_categories)
# the elements are the sum of the per-classifier scores on each
# (sample, category) pair
print z
print 'z shape: ', z.shape
# NOTE(review): `train_index` / `test_index` are not defined in this fragment;
# presumably everything up to the `np.append` call runs once per fold inside a
# cross-validation loop over the training set whose header is not visible
# here -- confirm the nesting against the full file.
X_train_train, X_train_test = X_train[train_index], X_train[test_index]
y_train_train, y_train_test = y_train[train_index], y_train[test_index]
# X_den_train, X_den_test = X_den[train_index], X_den[test_index]

# feed models
clf_mNB.fit(X_train_train, y_train_train)
# clf_kNN.fit(X_train_train, y_train_train)
clf_ridge.fit(X_train_train, y_train_train)
clf_lSVC.fit(X_train_train, y_train_train)
clf_SVC.fit(X_train_train, y_train_train)

# get prediction for this fold run
# NOTE(review): predict_proba and decision_function outputs are summed below
# although they are presumably on different scales (probabilities vs. raw
# scores) -- confirm this is intended.
prob_mNB = clf_mNB.predict_proba(X_train_test)
# prob_kNN = clf_kNN.predict_proba(X_train_test)
prob_ridge = clf_ridge.decision_function(X_train_test)
prob_lSVC = clf_lSVC.decision_function(X_train_test)
prob_SVC = clf_SVC.predict_proba(X_train_test)

# update z array for each model
# z_temp = prob_lSVC
# z_temp = (prob_ridge + prob_lSVC)
z_temp = (prob_mNB + prob_ridge + prob_lSVC + prob_SVC)
z = np.append(z, z_temp, axis=0)

# remove the first sub-1d-array of z, due to the creation with 0s
z = np.delete(z, 0, 0)
# the result of z is a 2d array with shape of (n_samples, n_categories)
# the elements are the sum of the per-classifier scores on each
# (sample, category) pair

# Possible preprocessing on z