def svm_cross_validate_category(X, y, category, C, penalty, sample_weights):
    """Two-fold cross-validation with folds drawn via generate_cv_indices(category).

    Trains one SGDRegressor per fold, evaluates each on the opposite fold, and
    returns the per-category AUC averaged over the two folds.

    NOTE(review): `loss`, `epsilon`, and `N` are read from module scope, not
    passed in — confirm they are defined before this is called.
    """
    clf_svm_1 = SGDRegressor(loss=loss, penalty=penalty, epsilon=epsilon, alpha=C, shuffle=True)
    clf_svm_2 = SGDRegressor(loss=loss, penalty=penalty, epsilon=epsilon, alpha=C, shuffle=True)
    # cv_indices is a length-2N boolean vector: first N entries mark the train
    # fold, last N mark the test fold (presumably — verify against the helper).
    cv_indices = generate_cv_indices(category)
    train_ids = cv_indices[0:N]
    test_ids = cv_indices[N:2 * N]
    # Fit one model per fold.
    clf_svm_1.fit(X[train_ids, :], y[train_ids], sample_weight=sample_weights[train_ids])
    clf_svm_2.fit(X[test_ids, :], y[test_ids], sample_weight=sample_weights[test_ids])
    # R^2-style scores on the held-out fold of each model.
    # NOTE(review): mean_score is computed but never used or returned.
    score = np.zeros(2)
    score[0] = clf_svm_1.score(X[test_ids, :], y[test_ids])
    score[1] = clf_svm_2.score(X[train_ids, :], y[train_ids])
    mean_score = np.mean(score)
    # Decision values on each model's held-out fold.
    y_1 = clf_svm_1.decision_function(X[test_ids, :])
    y_2 = clf_svm_2.decision_function(X[train_ids, :])
    # One AUC per (fold, category) pair.
    u, indices = np.unique(category, return_inverse=True)
    auc = np.zeros((2, len(u)))
    for i in range(0, len(u)):
        # Boolean mask of the samples belonging to category u[i].
        i_inds = indices == i
        if (np.sum(test_ids & i_inds) != 0):
            # y[test_ids & i_inds]: true labels of test-fold samples in category i.
            # y_1[i_inds[test_ids]]: the matching subset of the test-fold
            # predictions (i_inds restricted to the test fold, applied to y_1).
            fpr, tpr, thresholds = metrics.roc_curve(y[test_ids & i_inds], y_1[i_inds[test_ids]], pos_label=1)
            auc[0, i] = metrics.auc(fpr, tpr)
        if (np.sum(train_ids & i_inds) != 0):
            # Mirror of the above for the second model, evaluated on the train fold.
            fpr, tpr, thresholds = metrics.roc_curve(y[train_ids & i_inds], y_2[i_inds[train_ids]], pos_label=1)
            auc[1, i] = metrics.auc(fpr, tpr)
    # Average the two folds per category -> one AUC per category.
    mean_auc = np.mean(auc, axis=0)
    print("Finished running category cross-validation")
    return mean_auc
def svm_cross_validate(X, y, category, C, penalty, sample_weights):
    """Two-fold cross-validation with unbalanced category-based folds.

    Trains one SGDRegressor per fold, evaluates each on the opposite fold,
    and returns the AUC averaged over the two folds.

    NOTE(review): `loss`, `epsilon`, and `N` are read from module scope, not
    passed in — confirm they are defined before this is called.
    """
    clf_svm_1 = SGDRegressor(loss=loss, penalty=penalty, epsilon=epsilon, alpha=C, shuffle=True)
    clf_svm_2 = SGDRegressor(loss=loss, penalty=penalty, epsilon=epsilon, alpha=C, shuffle=True)
    # cv_indices is a length-2N boolean vector: first N entries mark the train
    # fold, last N mark the test fold (presumably — verify against the helper).
    cv_indices = generate_cv_indices_unbalanced(category)
    train_ids = cv_indices[0:N]
    test_ids = cv_indices[N:2 * N]
    # Fit one model per fold.
    clf_svm_1.fit(X[train_ids, :], y[train_ids], sample_weight=sample_weights[train_ids])
    clf_svm_2.fit(X[test_ids, :], y[test_ids], sample_weight=sample_weights[test_ids])
    # Decision values on each model's held-out fold.
    y_1 = clf_svm_1.decision_function(X[test_ids, :])
    y_2 = clf_svm_2.decision_function(X[train_ids, :])
    # One AUC per fold, each model scored on the fold it was NOT trained on.
    auc = np.zeros(2)
    fpr, tpr, thresholds = metrics.roc_curve(y[test_ids], y_1, pos_label=1)
    auc[0] = metrics.auc(fpr, tpr)
    fpr, tpr, thresholds = metrics.roc_curve(y[train_ids], y_2, pos_label=1)
    auc[1] = metrics.auc(fpr, tpr)
    mean_auc = np.mean(auc, axis=0)
    print("Finished running standard cross validation")
    return mean_auc
class LinearRegressor(object):
    """Linear regressor over graph inputs.

    Graphs are encoded into a sparse feature matrix via a hashed encoding
    function, binarized (presence/absence of each feature), and fed to an
    elastic-net SGDRegressor.
    """

    def __init__(self, decompose_func=None, preprocessor=None, nbits=15, seed=1):
        """Build the encoder and the underlying SGDRegressor.

        decompose_func -- graph decomposition used by the encoder
        preprocessor   -- optional preprocessing step(s) for the encoder
        nbits          -- log2 of the hashed feature-space size
        seed           -- RNG seed forwarded to the encoder
        """
        self.decompose_func = decompose_func
        self.nbits = nbits
        feature_size, bitmask = set_feature_size(nbits=nbits)
        self.feature_size = feature_size
        self.bitmask = bitmask
        self.encoding_func = make_encoder(decompose_func,
                                          preprocessors=preprocessor,
                                          bitmask=self.bitmask,
                                          seed=seed)
        self.classifier = SGDRegressor(penalty='elasticnet')

    def _vectorize(self, graphs):
        """Vectorize graphs and binarize counts to 0/1 (shared by fit/predict)."""
        data_mtx = vectorize_graphs(graphs,
                                    encoding_func=self.encoding_func,
                                    feature_size=self.feature_size)
        # binarize: any positive count becomes 1
        data_mtx.data = np.where(data_mtx.data > 0, 1, 0)
        return data_mtx

    def fit(self, graphs, targets):
        """Fit the regressor on the binarized graph features; returns self."""
        self.classifier.fit(self._vectorize(graphs), targets)
        return self

    def decision_function(self, graphs):
        """Return the raw decision values for the given graphs."""
        return self.classifier.decision_function(self._vectorize(graphs))

    def predict(self, graphs):
        """Return the regression predictions for the given graphs."""
        return self.classifier.predict(self._vectorize(graphs))
def decision_function(self, X, *args, **kw):
    """Convert X to CSR sparse format, then delegate to SGDRegressor.decision_function."""
    sparse_X = sp.csr_matrix(X)
    return SGDRegressor.decision_function(self, sparse_X, *args, **kw)
# Export the fitted model's coefficients to CSV, plot train predictions,
# write a submission file, then score the held-out test set.
# NOTE(review): relies on module-scope `clf_svm`, `header`, `X`, `y`, `ids`,
# `loc_test`, and `extract_data` — confirm they are defined upstream.
coef_svm = clf_svm.coef_.ravel()
# Columns named label/id/category (or blank) carry no coefficient; `count`
# tracks how many were skipped so `coef_svm[e - count]` stays aligned.
count = 0
coeff_lines = []
with open("coeffs_full_huber_05172014_l2.csv", "w+") as out_coeffs:
    for e, x in enumerate(header):
        if x in ("label", "id", "category", ""):
            print(e, x, "nan")
            count += 1
            continue
        print(e, x, coef_svm[e - count])
        coeff_lines.append(x + "," + str(coef_svm[e - count]) + "\n")
    # Single join + write instead of quadratic string concatenation.
    out_coeffs.write("".join(coeff_lines))
# Visual sanity check: true targets vs. training-set decision values.
y_train = clf_svm.decision_function(X)
plt.scatter(y, y_train)
plt.show()
# Submission rows: (id, prediction), one per line.
submission = np.array([ids, y_train])
submission = np.transpose(submission)
np.savetxt("temp.csv", submission, fmt="%d,%1.6f")
# Score the held-out test set with the same scaling as training.
X_test, y_test, category_test, header_test, id_test = extract_data(loc_test)
X_test = StandardScaler().fit_transform(X_test)
y_test = clf_svm.decision_function(X_test)
#High frequency words def truncate(s,k): return (s[0:k]) # <codecell> truncated_Row = truncate(SmapSortedIndex_Dec,5000) # <codecell> #Stochastic Gradient Descent #X_Test = X[:,0:1000] #Y_Test = Y[0:1000] clf = SGDRegressor(alpha=0.0001, eta0=0.01, fit_intercept=True, learning_rate='invscaling', loss='squared_loss', n_iter=20, p=0.1, penalty='l2', power_t=0.25, rho=0.85, seed=0, shuffle=True, verbose=0, warm_start=False) clf.fit(X.transpose(), Y) # <codecell> clf.decision_function(X.transpose()) # <codecell>