# Example #1 (Beispiel #1) — scraped snippet, score: 0
def svm_cross_validate_category(X, y, category, C, penalty, sample_weights):
    """Two-fold cross-validation with per-category ROC AUC reporting.

    Fits two SGD regressors on complementary halves of the data (as
    selected by ``generate_cv_indices``) and evaluates each model on
    its held-out half, computing an AUC separately for every unique
    value in ``category``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,) -- labels; AUC uses ``pos_label=1``
    category : array-like, shape (n_samples,) -- group id per sample
    C : float -- regularization strength, passed as SGD ``alpha``
    penalty : str -- SGD penalty ('l2', 'l1', 'elasticnet')
    sample_weights : array-like, shape (n_samples,)

    Returns
    -------
    numpy.ndarray, shape (n_categories,)
        Per-category AUC averaged over the two folds; categories absent
        from a fold contribute 0 for that fold.

    Notes
    -----
    Relies on module-level globals ``loss``, ``epsilon`` and ``N``
    (the sample count) -- TODO(review): confirm these are defined.
    """
    clf_svm_1 = SGDRegressor(loss=loss,
                             penalty=penalty,
                             epsilon=epsilon,
                             alpha=C,
                             shuffle=True)
    clf_svm_2 = SGDRegressor(loss=loss,
                             penalty=penalty,
                             epsilon=epsilon,
                             alpha=C,
                             shuffle=True)

    # Boolean masks of length N each: first half selects fold 1,
    # second half selects fold 2.
    cv_indices = generate_cv_indices(category)
    train_ids = cv_indices[0:N]
    test_ids = cv_indices[N:2 * N]

    clf_svm_1.fit(X[train_ids, :],
                  y[train_ids],
                  sample_weight=sample_weights[train_ids])
    clf_svm_2.fit(X[test_ids, :],
                  y[test_ids],
                  sample_weight=sample_weights[test_ids])

    # NOTE: the original also computed clf.score() into an unused
    # ``mean_score`` variable; that dead computation has been removed.

    # Decision values of each model on its held-out fold.
    y_1 = clf_svm_1.decision_function(X[test_ids, :])
    y_2 = clf_svm_2.decision_function(X[train_ids, :])

    u, indices = np.unique(category, return_inverse=True)
    auc = np.zeros((2, len(u)))
    for i in range(len(u)):
        i_inds = indices == i

        # AUC of model 1 on held-out samples of this category; skipped
        # (leaving 0) when the category is absent from the fold.
        if np.sum(test_ids & i_inds) != 0:
            fpr, tpr, thresholds = metrics.roc_curve(y[test_ids & i_inds],
                                                     y_1[i_inds[test_ids]],
                                                     pos_label=1)
            auc[0, i] = metrics.auc(fpr, tpr)

        # Symmetric evaluation of model 2 on the other fold.
        if np.sum(train_ids & i_inds) != 0:
            fpr, tpr, thresholds = metrics.roc_curve(y[train_ids & i_inds],
                                                     y_2[i_inds[train_ids]],
                                                     pos_label=1)
            auc[1, i] = metrics.auc(fpr, tpr)

    # BUG FIX: previously recomputed inside the loop on every iteration
    # (and would raise NameError if the category set were empty);
    # compute once after the loop.
    mean_auc = np.mean(auc, axis=0)
    print("Finished running category cross-validation")
    return mean_auc
# Example #2 (Beispiel #2) — scraped snippet, score: 0
def svm_cross_validate(X, y, category, C, penalty, sample_weights):
    """Two-fold cross-validation reporting overall ROC AUC.

    Splits the data into two halves via
    ``generate_cv_indices_unbalanced``, fits one SGD regressor per
    half, and evaluates each model on the opposite half with ROC AUC
    (``pos_label=1``).

    Parameters mirror ``svm_cross_validate_category``: ``C`` is passed
    as the SGD ``alpha`` regularization strength and ``penalty`` as the
    SGD penalty name.

    Returns
    -------
    float
        AUC averaged over the two folds.

    Notes
    -----
    Relies on module-level globals ``loss``, ``epsilon`` and ``N``
    (the sample count) -- TODO(review): confirm these are defined.
    """
    clf_svm_1 = SGDRegressor(loss=loss,
                             penalty=penalty,
                             epsilon=epsilon,
                             alpha=C,
                             shuffle=True)
    clf_svm_2 = SGDRegressor(loss=loss,
                             penalty=penalty,
                             epsilon=epsilon,
                             alpha=C,
                             shuffle=True)

    # Boolean masks of length N each: first half selects fold 1,
    # second half selects fold 2.  (A block of commented-out manual
    # index construction was removed here as dead code.)
    cv_indices = generate_cv_indices_unbalanced(category)
    train_ids = cv_indices[0:N]
    test_ids = cv_indices[N:2 * N]

    clf_svm_1.fit(X[train_ids, :],
                  y[train_ids],
                  sample_weight=sample_weights[train_ids])
    clf_svm_2.fit(X[test_ids, :],
                  y[test_ids],
                  sample_weight=sample_weights[test_ids])

    # NOTE: the original also computed clf.score() into an unused
    # ``mean_score`` variable; that dead computation has been removed.

    # Decision values of each model on its held-out fold.
    y_1 = clf_svm_1.decision_function(X[test_ids, :])
    y_2 = clf_svm_2.decision_function(X[train_ids, :])

    auc = np.zeros(2)
    fpr, tpr, thresholds = metrics.roc_curve(y[test_ids], y_1, pos_label=1)
    auc[0] = metrics.auc(fpr, tpr)

    fpr, tpr, thresholds = metrics.roc_curve(y[train_ids], y_2, pos_label=1)
    auc[1] = metrics.auc(fpr, tpr)

    mean_auc = np.mean(auc, axis=0)
    print("Finished running standard cross validation")
    return mean_auc
# Example #3 (Beispiel #3) — scraped snippet, score: 0
class LinearRegressor(object):
    """Linear model over hashed graph features.

    Graphs are vectorized with an encoding function into a sparse
    matrix of ``feature_size`` columns, binarized to 0/1 presence
    values, and fed to an elastic-net ``SGDRegressor``.
    """

    def __init__(self,
                 decompose_func=None,
                 preprocessor=None,
                 nbits=15,
                 seed=1):
        # nbits controls the hashed feature-space size; set_feature_size
        # derives the matrix width and the hashing bitmask from it.
        self.decompose_func = decompose_func
        self.nbits = nbits
        feature_size, bitmask = set_feature_size(nbits=nbits)
        self.feature_size = feature_size
        self.bitmask = bitmask
        self.encoding_func = make_encoder(decompose_func,
                                          preprocessors=preprocessor,
                                          bitmask=self.bitmask,
                                          seed=seed)
        self.classifier = SGDRegressor(penalty='elasticnet')

    def _to_binary_matrix(self, graphs):
        """Vectorize graphs and binarize counts to 0/1 presence values.

        Shared by fit/decision_function/predict; previously this
        vectorize+binarize sequence was duplicated in all three.
        """
        data_mtx = vectorize_graphs(graphs,
                                    encoding_func=self.encoding_func,
                                    feature_size=self.feature_size)
        data_mtx.data = np.where(data_mtx.data > 0, 1, 0)
        return data_mtx

    def fit(self, graphs, targets):
        """Fit the underlying regressor on graphs/targets; return self."""
        self.classifier.fit(self._to_binary_matrix(graphs), targets)
        return self

    def decision_function(self, graphs):
        """Return raw regression scores for each graph."""
        return self.classifier.decision_function(self._to_binary_matrix(graphs))

    def predict(self, graphs):
        """Return predicted target values for each graph."""
        return self.classifier.predict(self._to_binary_matrix(graphs))
# Example #4 (Beispiel #4) — scraped snippet, score: 0
 def decision_function(self, X, *args, **kw):
     """Coerce X to CSR sparse format, then delegate to the base class."""
     sparse_X = sp.csr_matrix(X)
     return SGDRegressor.decision_function(self, sparse_X, *args, **kw)
# Example #5 (Beispiel #5) — scraped snippet, score: 0
# Dump per-feature SVM coefficients to CSV, skipping non-feature columns.
coef_svm = clf_svm.coef_.ravel()

# Columns named label/id/category (or blank) carry no coefficient; the
# running skip count realigns header positions with coef_svm indices.
skipped = 0
out_lines = []
# FIX: use a context manager so the file is closed even on error, and
# collect lines in a list (joined once) instead of quadratic str +=.
with open("coeffs_full_huber_05172014_l2.csv", "w+") as out_coeffs:
    for e, x in enumerate(header):
        if x == "label" or x == "id" or x == "category" or x == "":
            print(e, x, "nan")
            skipped += 1
            continue
        print(e, x, coef_svm[e - skipped])
        out_lines.append(x + "," + str(coef_svm[e - skipped]) + "\n")
    out_coeffs.write("".join(out_lines))

# In-sample predictions vs. targets, for a quick visual sanity check.
y_train = clf_svm.decision_function(X)

plt.scatter(y, y_train)
plt.show()

# Write (id, prediction) pairs: ids formatted as ints, scores to 6 d.p.
submission = np.array([ids, y_train])
submission = np.transpose(submission)

np.savetxt("temp.csv", submission, fmt="%d,%1.6f")

# Score the held-out test set with the fitted model.
# NOTE(review): fitting a fresh StandardScaler on the test set leaks the
# test distribution; consider reusing the scaler fit on training data.
X_test, y_test, category_test, header_test, id_test = extract_data(loc_test)
X_test = StandardScaler().fit_transform(X_test)

y_test = clf_svm.decision_function(X_test)
# Example #6 (Beispiel #6) — scraped snippet, score: 0
 def decision_function(self, X, *args, **kw):
     """Delegate to SGDRegressor.decision_function after converting X to CSR."""
     return SGDRegressor.decision_function(self, sp.csr_matrix(X), *args, **kw)
# High-frequency words
def truncate(s, k):
    """Return the length-k prefix of sequence s (shorter if len(s) < k)."""
    return s[:k]

# <codecell>

# Keep only the top 5000 entries -- presumably word indices sorted by
# descending frequency; TODO(review): confirm what SmapSortedIndex_Dec holds.
truncated_Row = truncate(SmapSortedIndex_Dec,5000)

# <codecell>

#Stochastic Gradient Descent

#X_Test = X[:,0:1000]
#Y_Test = Y[0:1000]

# NOTE(review): these keyword arguments (n_iter, p, rho, seed,
# loss='squared_loss') belong to a pre-0.19 scikit-learn API; modern
# releases use max_iter / epsilon / l1_ratio / random_state and
# loss='squared_error'.  This cell will only run against old sklearn.
clf = SGDRegressor(alpha=0.0001, eta0=0.01, fit_intercept=True,
       learning_rate='invscaling', loss='squared_loss', n_iter=20, p=0.1,
       penalty='l2', power_t=0.25, rho=0.85, seed=0, shuffle=True,
       verbose=0, warm_start=False)

# X appears to be stored features-by-samples, hence the transpose so that
# rows are samples -- TODO confirm X's orientation against its producer.
clf.fit(X.transpose(), Y)

# <codecell>

# In-sample decision values; result is displayed by the notebook cell,
# not assigned.
clf.decision_function(X.transpose())

# <codecell>