def train_and_predict_m8 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM7, stemmer_type = 'porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 5, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 5), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X =  tfv.transform(trainData) 
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    print ("Fitting Ridge Classifer...")
    clf = RidgeClassifier(class_weight = 'auto', alpha = 1, normalize = True)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'alpha' : [0.1, 0.3, 1, 3, 10], 'normalize' : [True, False]}
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
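# The helper perform_grid_search is not shown in this snippet; a minimal sketch,
# assuming a plain GridSearchCV with a quadratic weighted kappa scorer (as the
# comment above suggests). Names and settings here are illustrative, not the
# original implementation.
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import GridSearchCV

def perform_grid_search(clf, param_grid, X, labels):
    # cohen_kappa_score with quadratic weights is scikit-learn's QWK
    qwk_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
    search = GridSearchCV(clf, param_grid, scoring=qwk_scorer, cv=5)
    search.fit(X, labels)
    print("Best params:", search.best_params_)
    return search.best_estimator_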
def evaluate_random_binning(X, y, X_test, y_test, M, task):
    # construct random binning features
    start_time = time.time()
    rb = RandomBinning(X.shape[1], M)
    # get_features returns a tuple; unpack first, then scale by 1/sqrt(M)
    Z, _ = rb.get_features(X)
    Z = Z / np.sqrt(M)
    Z_test, _ = rb.get_features(X_test, expand=False)
    Z_test = Z_test / np.sqrt(M)
    if task == 'classification':
        clf = RidgeClassifier(alpha=0.0001, solver='lsqr')
        clf.fit(Z, y)
        y_pred = clf.predict(Z_test)
        error_test = (
            0.5 - np.dot(np.sign(y_test), y_pred) / len(y_test) / 2) * 100
        print("--- %s seconds ---" % (time.time() - start_time))
        print("C = %d; error_test = %.2f" % (np.shape(Z)[1], error_test) +
              '%')
    elif task == 'regression':
        clf = Ridge(alpha=0.01, solver='lsqr', random_state=42)
        clf.fit(Z, y)
        y_pred = clf.predict(Z_test)
        error_test = np.linalg.norm(
            (y_test - y_pred)) / np.linalg.norm(y_test) * 100
        print("--- %s seconds ---" % (time.time() - start_time))
        print("C = %d; error_test = %.2f" % (np.shape(Z)[1], error_test) +
              '%')
    else:
        error_test = 'error!'
        print('No such task, please check the task name!')
    return error_test
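# RandomBinning is defined elsewhere in that project; below is a minimal sketch of
# random binning features (Rahimi & Recht, 2007) matching the interface assumed
# above (get_features returns a (Z, aux) tuple; expand=False freezes the bin
# vocabulary for test data; the caller scales by 1/sqrt(M)). This is an
# illustrative reconstruction, not the original class.
import numpy as np

class RandomBinningSketch:
    def __init__(self, d, M, seed=0):
        rng = np.random.default_rng(seed)
        # grid pitch delta ~ Gamma(shape=2, scale=1), shift u ~ Uniform[0, delta)
        self.delta = rng.gamma(2.0, 1.0, size=(M, d))
        self.u = rng.uniform(size=(M, d)) * self.delta
        self.M = M
        self.bins = [dict() for _ in range(M)]  # bin key -> feature column

    def get_features(self, X, expand=True):
        n = X.shape[0]
        blocks = []
        for m in range(self.M):
            idx = np.floor((X - self.u[m]) / self.delta[m]).astype(int)
            table = self.bins[m]
            cols = np.full(n, -1)
            for i, key in enumerate(map(tuple, idx)):
                if key in table:
                    cols[i] = table[key]
                elif expand:  # only grow the bin vocabulary on training data
                    table[key] = len(table)
                    cols[i] = table[key]
            Z_m = np.zeros((n, max(len(table), 1)))
            seen = cols >= 0
            Z_m[np.flatnonzero(seen), cols[seen]] = 1.0  # one-hot bin indicator
            blocks.append(Z_m)
        return np.hstack(blocks), self.bins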
def impute_nan(df, ds, dF):
    Sg, Fm = ds, dF  # fall back to the inputs if there is nothing to impute
    if ds.isnull().any():
        labeler_st = LabelEncoder()
        rc_st = RidgeClassifier(tol=1e-2, solver="sag")
        Sg = Series(labeler_st.fit_transform(ds.astype(str)), index=ds.index)
        Sg = Sg.where(ds.notnull(), ds, axis=0)
        x_notna = df.GR[Sg.notnull()].to_numpy().reshape(-1, 1)
        y_notna = Sg[Sg.notnull()].to_numpy().astype('int').ravel()
        x_nan = df.GR[Sg.isnull()].to_numpy().reshape(-1, 1)
        rc_st.fit(x_notna,y_notna)
        Sg[Sg.isnull()]=rc_st.predict(x_nan)
        Sg=Series(Sg, index=ds.index).astype(int)
        ds=Series(labeler_st.inverse_transform(Sg.values.ravel()), index=ds.index)
        #print('\nStratigraphy:', np.unique(ds))
    if dF.isnull().any():
        rc_fm = RidgeClassifier(tol=1e-2, solver="sag")
        labeler_fm = LabelEncoder()
        Fm = Series(labeler_fm.fit_transform(dF.astype(str)), index=dF.index)
        labeler_st = LabelEncoder()
        Sg=Series(labeler_st.fit_transform(ds.astype(str)), index=ds.index)
        Fm=Fm.where(dF.notnull(), dF, axis=0)
        x_notna = np.concatenate((df.GR[Fm.notnull()].to_numpy().reshape(-1, 1), 
                                  Sg[Fm.notnull()].to_numpy().reshape(-1, 1)), 
                                 axis=1)
        y_notna = Fm[Fm.notnull()].to_numpy().astype('int').ravel()
        x_nan = np.concatenate((df.GR[Fm.isnull()].to_numpy().reshape(-1, 1), 
                                Sg[Fm.isnull()].to_numpy().reshape(-1, 1)), axis=1)
        rc_fm.fit(x_notna,y_notna)
        Fm[Fm.isnull()]=rc_fm.predict(x_nan)
        Fm=Series(Fm, index=dF.index).astype(int)
        dF=Series(labeler_fm.inverse_transform(Fm.values.ravel()), index=dF.index)
        #print('\nFormation:', np.unique(dF))
    return Sg, Fm
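# A toy invocation sketch (hypothetical gamma-ray values and labels, purely to
# exercise impute_nan; assumes the pandas/sklearn imports used above):
import numpy as np
import pandas as pd
from pandas import Series

df_toy = pd.DataFrame({'GR': [20.0, 35.0, 80.0, 95.0, 50.0]})
strat = Series(['A', 'A', np.nan, 'B', 'B'])    # stratigraphy with one gap
form = Series(['X', np.nan, 'Y', 'Y', np.nan])  # formation with two gaps
Sg_imp, Fm_imp = impute_nan(df_toy, strat, form)  # label-encoded, gaps filled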
Example 4
def test_class_weights():
    # Test class weights.
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0], [1.0,
                                                                     0.0]])
    y = [1, 1, 1, -1, -1]

    reg = RidgeClassifier(class_weight=None)
    reg.fit(X, y)
    assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([1]))

    # we give a small weight to class 1
    reg = RidgeClassifier(class_weight={1: 0.001})
    reg.fit(X, y)

    # now the hyperplane should rotate clockwise and
    # the prediction on this point should shift
    assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([-1]))

    # check if class_weight = 'balanced' can handle negative labels.
    reg = RidgeClassifier(class_weight='balanced')
    reg.fit(X, y)
    assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([1]))

    # class_weight = 'balanced' and class_weight = None should return
    # the same values when y has an equal number of each label
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0]])
    y = [1, 1, -1, -1]
    reg = RidgeClassifier(class_weight=None)
    reg.fit(X, y)
    rega = RidgeClassifier(class_weight='balanced')
    rega.fit(X, y)
    assert len(rega.classes_) == 2
    assert_array_almost_equal(reg.coef_, rega.coef_)
    assert_array_almost_equal(reg.intercept_, rega.intercept_)
def generate_submission(models):
    X = pd.concat([
        pd.read_csv(inp)[full_labels] for inp in [
            "../models/{}/train_meta_probs.csv".format(model)
            for model in models
        ]
    ],
                  axis=1)
    X_test = pd.concat([
        pd.read_csv(inp)[full_labels] for inp in
        ["../models/{}/test_meta_probs.csv".format(model) for model in models]
    ],
                       axis=1)
    col_names = [
        "{}_{}".format(i, j)
        for i in ["model_{}".format(k) for k in range(len(models))]
        for j in full_labels
    ]
    X.columns, X_test.columns = col_names, col_names
    folds = get_folds()

    print("===Ridge===")
    ridge_cv = RidgeClassifierCV(alphas=[
        0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 100
    ],
                                 cv=folds).fit(X, y)
    print("best alpha value is: {}".format(ridge_cv.alpha_))
    ridge_model = RidgeClassifier(alpha=ridge_cv.alpha_).fit(X, y)
    print(accuracy_score(y, ridge_model.predict(X)))
    test_df['label'] = pd.Series(
        ridge_model.predict(X_test)).map(full_num_label_mapping)
    test_df['label'] = test_df['label'].map(lambda x: "unknown"
                                            if x not in labels else x)
    test_df.to_csv("ridge_on_{}_models.csv".format(len(models)), index=False)
Example 6
def country_based_model(df, input_df, model_evaluator):
    input_df['-3'] = input_df['name'].str[-3]
    input_df['-2'] = input_df['name'].str[-2]
    input_df['-1'] = input_df['name'].str[-1]

    columns = ['-3', '-2', '-1']
    vectorized_name = [pd.get_dummies(input_df[i]) for i in columns]
    input_df = pd.concat(
        [vectorized_name[0], vectorized_name[1], vectorized_name[2], input_df],
        axis=1)

    cY = input_df['gender'].head(39469)
    input_df = input_df.drop(columns=['name', 'gender', '-3', '-2', '-1'])

    cX = input_df.head(39469)
    cX_train, cX_test, cy_train, cy_test = train_test_split(cX,
                                                            cY,
                                                            test_size=0.2,
                                                            random_state=42)

    model = RidgeClassifier(fit_intercept=False, solver='lsqr')
    model.fit(cX_train, cy_train)

    training_predictions = model.predict(input_df.head(39469))
    model_evaluator['MODEL_PREDICTION'] = training_predictions
    country_model_predictions = model.predict(input_df.tail(1000))
    df['COUNTRY_MODEL'] = country_model_predictions

    return df, model_evaluator
Example 7
def Ngram_Range(train):
    '''
    ngram_range sets the lower and upper bounds on the n-gram token length; given Chinese word-usage habits, ngram_range can be chosen within (1, 4)
    :param train:
    :return:
    '''
    sample = train
    n = int(2 * len(sample) / 3)
    f1 = []
    tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features=2000)
    train_test = tfidf.fit_transform(sample['text'].values.astype('U'))
    train_x = train_test[:n]
    train_y = sample['label'].values[:n]
    test_x = train_test[n:]
    test_y = sample['label'].values[n:]

    clf = RidgeClassifier(alpha=0.1 * (1 + 1), solver='sag')
    clf.fit(train_x, train_y)
    val_pred = clf.predict(test_x)
    f1.append(f1_score(test_y, val_pred, average='macro'))

    tfidf = TfidfVectorizer(ngram_range=(2, 2), max_features=2000)
    train_test = tfidf.fit_transform(sample['text'].values.astype("U"))
    train_x = train_test[:n]
    train_y = sample['label'].values[:n]
    test_x = train_test[n:]
    test_y = sample['label'].values[n:]
    clf = RidgeClassifier(alpha=0.1 * (2 + 1), solver='sag')
    clf.fit(train_x, train_y)
    val_pred = clf.predict(test_x)
    f1.append(f1_score(test_y, val_pred, average='macro'))
    print(f1)

    tfidf = TfidfVectorizer(ngram_range=(3, 3), max_features=2000)
    train_test = tfidf.fit_transform(sample['text'].values.astype('U'))
    train_x = train_test[:n]
    train_y = sample['label'].values[:n]
    test_x = train_test[n:]
    test_y = sample['label'].values[n:]
    clf = RidgeClassifier(alpha=0.1 * (3 + 1), solver='sag')
    clf.fit(train_x, train_y)
    val_pred = clf.predict(test_x)
    f1.append(f1_score(test_y, val_pred, average='macro'))
    print(f1)

    tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=2000)
    train_test = tfidf.fit_transform(sample['text'].values.astype("U"))
    train_x = train_test[:n]
    train_y = sample['label'].values[:n]
    test_x = train_test[n:]
    test_y = sample['label'].values[n:]
    clf = RidgeClassifier(alpha=0.1 * (4 + 1), solver='sag')
    clf.fit(train_x, train_y)
    val_pred = clf.predict(test_x)
    f1.append(f1_score(test_y, val_pred, average='macro'))
    print(f1)
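# The four blocks above differ only in ngram_range and alpha; an equivalent
# compact sweep (same settings, illustrative refactor):
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score

def ngram_sweep(sample, n, ranges=((1, 1), (2, 2), (3, 3), (1, 3))):
    f1 = []
    for i, ng in enumerate(ranges):
        tfidf = TfidfVectorizer(ngram_range=ng, max_features=2000)
        train_test = tfidf.fit_transform(sample['text'].values.astype('U'))
        clf = RidgeClassifier(alpha=0.1 * (i + 2), solver='sag')
        clf.fit(train_test[:n], sample['label'].values[:n])
        val_pred = clf.predict(train_test[n:])
        f1.append(f1_score(sample['label'].values[n:], val_pred, average='macro'))
    return f1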
def text_classify_influence_by_ngram_range():
    """
    Effect of ngram_range on text classification
    """
    train_df = pd.read_csv('./data/train_set.csv', sep='\t', nrows=15000)
    sample = train_df[0:5000]
    n = int(2 * len(sample) / 3)
    f1 = []
    tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features=2000)
    train_test = tfidf.fit_transform(sample['text'])
    train_x = train_test[:n]
    train_y = sample['label'].values[:n]
    test_x = train_test[n:]
    test_y = sample['label'].values[n:]
    clf = RidgeClassifier(alpha=0.1, solver='sag')
    clf.fit(train_x, train_y)
    val_pred = clf.predict(test_x)
    f1.append(f1_score(test_y, val_pred, average='macro'))

    tfidf = TfidfVectorizer(ngram_range=(2, 2), max_features=2000)
    train_test = tfidf.fit_transform(sample['text'])
    train_x = train_test[:n]
    train_y = sample['label'].values[:n]
    test_x = train_test[n:]
    test_y = sample['label'].values[n:]
    clf = RidgeClassifier(alpha=0.1, solver='sag')
    clf.fit(train_x, train_y)
    val_pred = clf.predict(test_x)
    f1.append(f1_score(test_y, val_pred, average='macro'))

    tfidf = TfidfVectorizer(ngram_range=(3, 3), max_features=2000)
    train_test = tfidf.fit_transform(sample['text'])
    train_x = train_test[:n]
    train_y = sample['label'].values[:n]
    test_x = train_test[n:]
    test_y = sample['label'].values[n:]
    clf = RidgeClassifier(alpha=0.1, solver='sag')
    clf.fit(train_x, train_y)
    val_pred = clf.predict(test_x)
    f1.append(f1_score(test_y, val_pred, average='macro'))

    tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=2000)
    train_test = tfidf.fit_transform(sample['text'])
    train_x = train_test[:n]
    train_y = sample['label'].values[:n]
    test_x = train_test[n:]
    test_y = sample['label'].values[n:]
    clf = RidgeClassifier(alpha=0.1, solver='sag')
    clf.fit(train_x, train_y)
    val_pred = clf.predict(test_x)
    f1.append(f1_score(test_y, val_pred, average='macro'))
Example 9
def main():
    """ Main function. """
    # parse args
    parser = argparse.ArgumentParser()
    parser.add_argument('data',
                        help='The path to the data. (hw2_lssvm_all.dat)')
    args = parser.parse_args()

    data_path = args.data

    # get data
    x_train, y_train, x_test, y_test = get_data(data_path)

    # run linear ridge and print errors
    np.random.seed(0)
    e_in_list = []
    e_out_list = []
    for lmbd in LMBD_LIST:
        # bagging on bootstrapping
        y_train_pred = 0
        y_pred = 0
        for _ in range(BAGGING_NUM):
            clf = RidgeClassifier(lmbd)
            boot_idcs = np.random.randint(0, x_train.shape[0], BOOT_NUM)
            clf.fit(x_train[boot_idcs], y_train[boot_idcs])
            y_train_pred += clf.predict(x_train)
            y_pred += clf.predict(x_test)
        y_train_pred = y_train_pred / BAGGING_NUM > 0.5
        y_pred = y_pred / BAGGING_NUM > 0.5

        e_in = (y_train_pred != y_train).mean()
        e_in_list.append(e_in)
        e_out = (y_pred != y_test).mean()
        e_out_list.append(e_out)

    # print errors
    print('E_in:', e_in_list)
    min_e_in = min(e_in_list)
    print('min lambda:', [
        LMBD_LIST[idx]
        for idx, e_in in enumerate(e_in_list) if e_in == min_e_in
    ])
    print('corresponding E_in:', min_e_in)
    print('')
    print('E_out:', e_out_list)
    min_e_out = min(e_out_list)
    print('min lambda:', [
        LMBD_LIST[idx]
        for idx, e_out in enumerate(e_out_list) if e_out == min_e_out
    ])
    print('corresponding E_out:', min_e_out)
Example 10
def training_rdg(X_train_rdg, X_test_rdg, y_train_rdg, y_test_rdg, nfolds , preproc):

    params = {'alpha': [0.001, 0.01, 0.1, 1]}
    param_comb = 2
    rdg = RidgeClassifier()
    skf = StratifiedKFold(n_splits=nfolds, shuffle = True, random_state = 1001)
    random_search = RandomizedSearchCV(rdg, param_distributions=params, n_iter=param_comb, 
                                     n_jobs=-1, cv=skf.split(X_train_rdg,y_train_rdg), verbose=3, random_state=1001 )
        
    random_search.fit(X_train_rdg, y_train_rdg)
    best_param = random_search.best_params_
    
    print(best_param)
    svc = RidgeClassifier(alpha=best_param["alpha"]).fit(X_train_rdg, y_train_rdg)
    y_pred = svc.predict(X_test_rdg)
    
    res = pd.DataFrame(columns = ['Preprocessing', 'Model', 'Precision', 'Recall', 'F1-score', 'Accuracy'])

    # y_true must come first in the sklearn metric calls
    f1 = f1_score(y_test_rdg, y_pred, average = 'weighted')
    pres = precision_score(y_test_rdg, y_pred, average = 'weighted')
    rec = recall_score(y_test_rdg, y_pred, average = 'weighted')
    acc = accuracy_score(y_test_rdg, y_pred)
    
    res = res.append({'Preprocessing': preproc, 'Model': 'Ridge', 'Precision': pres, 
                     'Recall': rec, 'F1-score': f1, 'Accuracy': acc}, ignore_index = True)
    
    return res
Example 11
def validate(input_train, rows=True, test=0.25):
    """
    Takes file as input and returns classification report, average precision, and
    AUC for a bigram model. By default, loads all rows of a dataset, trains on .75,
    and tests on .25. 
    ----
    input_train : 'full path of the file you are loading'
    rows : True - loads all rows; insert an int for specific number of rows
    test : float proportion of dataset used for testing
    """
    if rows == True:
        data = pd.read_table(input_train)
    else:
        data = pd.read_table(input_train, nrows = rows)
    response = data.is_blocked
    dummies = sparse.csc_matrix(pd.get_dummies(data.subcategory))
    words = np.array(data.description,str)
    del data
    vect = text.CountVectorizer(decode_error = u'ignore',strip_accents='unicode',ngram_range=(1,2))
    counts = vect.fit_transform(words)
    features = sparse.hstack((dummies,counts))
    features_train, features_test, target_train, target_test = train_test_split(features, response, test_size = test)
    clf = RidgeClassifier()
    clf.fit(features_train, target_train)
    prediction = clf.predict(features_test)
    return classification_report(target_test, prediction), average_precision_score(target_test, prediction), roc_auc_score(target_test, prediction)
Example 12
class SupervisedBandit:
    def __init__(self, num_arms=3):
        self.K = num_arms
        self.training_data = None
        self.training_labels = None
        self.clf = RidgeClassifier()
        self.dont_fit = True

    def take_action(self, features):
        if self.training_data is None:
            return torch.tensor(np.random.choice(self.K))
        elif not self.dont_fit:  # don't fit until have enough unique classes
            return torch.tensor(self.clf.predict(features))
        else:
            return torch.tensor(self.training_labels[0])

    def add_data(self, features, correct_action):
        if self.training_data is None:
            self.training_data = features
            self.training_labels = np.array([correct_action])
        else:
            self.training_data = torch.cat((self.training_data, features))
            self.training_labels = np.append(self.training_labels,
                                             correct_action)

        if len(np.unique(self.training_labels)) > 1:
            # solver needs at least 2 unique classes to fit
            self.dont_fit = False
            self.clf.fit(self.training_data, self.training_labels)
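# A hypothetical interaction loop (random features and oracle actions, purely to
# exercise the API; a real caller would supply contextual features and feedback):
import numpy as np
import torch

bandit = SupervisedBandit(num_arms=3)
for t in range(20):
    feats = torch.randn(1, 5)
    action = bandit.take_action(feats)
    correct = np.random.randint(3)  # stand-in for the revealed correct action
    bandit.add_data(feats, correct)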
Example 13
def Parameter_regularization(train):
    '''
    Effect of the regularization parameter alpha on the model
    :param train:
    :return:
    '''
    sample = train
    n = int(2 * len(sample) / 3)
    tfidf = TfidfVectorizer(ngram_range=(2, 3), max_features=2500)
    train_test = tfidf.fit_transform(sample['text'].values.astype("U"))
    train_x = train_test[:n]
    train_y = sample['label'].values[:n]

    test_x = train_test[n:]
    test_y = sample['label'].values[n:]
    f1 = []
    for i in range(10):
        clf = RidgeClassifier(alpha=0.15 * (i + 1), solver='sag')
        clf.fit(train_x, train_y)
        val_pred = clf.predict(test_x)
        f1.append(f1_score(test_y, val_pred, average='macro'))

    plt.plot([0.15 * (i + 1) for i in range(10)], f1)
    plt.xlabel('alpha')
    plt.ylabel('f1_score')
    print(f1)
    plt.show()
Example 14
def text_classify_influence_by_add_regularization():
    """
    Investigate the effect of regularization on text classification
    """
    train_df = pd.read_csv('./data/train_set.csv', sep='\t', nrows=15000)
    sample = train_df[0:5000]
    n = int(2 * len(sample) / 3)
    tfidf = TfidfVectorizer(ngram_range=(2, 3), max_features=2500)
    train_test = tfidf.fit_transform(sample['text'])
    train_x = train_test[:n]
    train_y = sample['label'].values[:n]
    test_x = train_test[n:]
    test_y = sample['label'].values[n:]

    f1 = []
    for i in range(10):
        clf = RidgeClassifier(alpha=0.15 * (i + 1), solver='sag')
        clf.fit(train_x, train_y)
        val_pred = clf.predict(test_x)
        f1.append(f1_score(test_y, val_pred, average='macro'))

    plt.plot([0.15 * (i + 1) for i in range(10)], f1)
    plt.xlabel('alpha')
    plt.ylabel('f1_score')
    plt.show()
def linear_readout(Xtrain, Ytrain, Xtest, Ytest):
    '''
    Readout (accuracy) evaluation, to assess the uniqueness of the projected patterns.
    A ridge classifier is used.
    Input:
        - Xtrain, ... (torch.Tensor): dataset. Note that Xtrain and Xtest are the
                                      projected values. Y labels do not need to change.
    Output:
        - accuracy_score (float): accuracy of the classification of the given data.
    '''

    from sklearn.linear_model import RidgeClassifier
    from sklearn.metrics import accuracy_score

    num_batches_train = Xtrain.shape[0]
    batch_size_train = Xtrain.shape[1]
    train_set_length = num_batches_train * batch_size_train
    Xtrain = Xtrain.cpu().numpy().reshape(train_set_length, -1)
    Ytrain = Ytrain.cpu().numpy().reshape(train_set_length, -1).ravel()

    num_batches_test = Xtest.shape[0]
    batch_size_test = Xtest.shape[1]
    test_set_length = num_batches_test * batch_size_test
    Xtest = Xtest.cpu().numpy().reshape(test_set_length, -1)
    Ytest = Ytest.cpu().numpy().reshape(test_set_length, -1).ravel()

    classifier = RidgeClassifier()
    classifier.fit(Xtrain, Ytrain)
    predicted_labels = classifier.predict(Xtest)

    return accuracy_score(Ytest, predicted_labels)
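# A hypothetical smoke test for linear_readout with random tensors, assuming
# inputs shaped (num_batches, batch_size, feature_dim):
import torch

Xtr = torch.randn(4, 8, 16)        # 4 batches of 8 projected patterns
Ytr = torch.randint(0, 2, (4, 8))  # binary labels
Xte = torch.randn(2, 8, 16)
Yte = torch.randint(0, 2, (2, 8))
print(linear_readout(Xtr, Ytr, Xte, Yte))  # accuracy in [0, 1]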
Example 16
def text_classify_influence_by_max_features():
    """
    Effect of max_features on text classification
    """
    train_df = pd.read_csv('./data/train_set.csv', sep='\t', nrows=15000)
    sample = train_df[0:5000]
    n = int(2 * len(sample) / 3)
    f1 = []
    features = [1000, 2000, 3000, 4000]
    for i in range(4):
        tfidf = TfidfVectorizer(ngram_range=(2, 3), max_features=features[i])
        train_test = tfidf.fit_transform(sample['text'])
        train_x = train_test[:n]
        train_y = sample['label'].values[:n]
        test_x = train_test[n:]
        test_y = sample['label'].values[n:]
        clf = RidgeClassifier(alpha=0.1 * (i + 1), solver='sag')
        clf.fit(train_x, train_y)
        val_pred = clf.predict(test_x)
        f1.append(f1_score(test_y, val_pred, average='macro'))

    plt.plot(features, f1)
    plt.xlabel('max_features')
    plt.ylabel('f1_score')
    plt.show()
Example 17
def fun():
    digits = datasets.load_digits()
    x_train_digits, x_test_digits, y_train_digits, y_test_digits = train_test_split(
        digits.data, digits.target, test_size=.3)

    wine = datasets.load_wine()
    x_train_wine, x_test_wine, y_train_wine, y_test_wine = train_test_split(
        wine.data, wine.target, test_size=.3)

    olivetti_faces = datasets.fetch_olivetti_faces()
    x_train_olivetti_faces, x_test_olivetti_faces, y_train_olivetti_faces, y_test_olivetti_faces = train_test_split(
        olivetti_faces.data, olivetti_faces.target, test_size=.3)

    classifier_digits = RidgeClassifier()
    classifier_wine = RidgeClassifier()
    classifier_olivetti_faces = RidgeClassifier()

    classifier_digits.fit(x_train_digits, y_train_digits)
    classifier_wine.fit(x_train_wine, y_train_wine)
    classifier_olivetti_faces.fit(x_train_olivetti_faces,
                                  y_train_olivetti_faces)

    targets_predicted_digits = classifier_digits.predict(x_test_digits)
    targets_predicted_wine = classifier_wine.predict(x_test_wine)
    targets_predicted_olivetti_faces = classifier_olivetti_faces.predict(
        x_test_olivetti_faces)

    # count matches by position (the originals used label values as indices)
    j = 0
    for i in range(len(y_test_digits)):
        if y_test_digits[i] == targets_predicted_digits[i]:
            j += 1
    digits_acc = "Accuracy of Ridge Linear Digits:        " + str(
        100 * (j / len(y_test_digits)))
    j = 0
    for i in range(len(y_test_wine)):
        if y_test_wine[i] == targets_predicted_wine[i]:
            j += 1
    wine_acc = "Accuracy of Ridge Linear Wine:          " + str(
        100 * (j / len(y_test_wine)))
    j = 0
    for i in range(len(y_test_olivetti_faces)):
        if y_test_olivetti_faces[i] == targets_predicted_olivetti_faces[i]:
            j += 1
    olivetti_faces_acc = "Accuracy of Ridge Linear Olivetti Faces: " + str(
        100 * (j / len(y_test_olivetti_faces)))

    return (digits_acc, wine_acc, olivetti_faces_acc)
Example 18
def rigid(X_train, X_test, y_train):
    # Fitting RidgeClassifier to the Training set
    from sklearn.linear_model import RidgeClassifier
    classifier = RidgeClassifier(alpha=4, class_weight='balanced')
    classifier.fit(X_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    return y_pred
Example 19
def Eval(XTrain, YTrain, XTest, YTest, clf, return_predicted_labels=False):
	"""
		Inputs:
			XTrain - N by D matrix of training data vectors
			YTrain - N by 1 matrix of training class labels
			XTest - M by D matrix of testing data vectors
			YTest - M by 1 matrix of testing class labels
			clf - the classifier, either a name string
				(e.g. "ridge", "perceptron", "svm")
				or an sklearn classifier instance
					with the methods .fit and .predict
		Outputs:
			A tuple containing (in the following order):
				Accuracy
				Overall Precision
				Overall Recall
				Overall F1 score
				Avg. Precision per class
				Avg. Recall per class
				Avg. F1 score per class
				Precision per class
				Recall per class
				F1 Score per class
				(if return_predicted_labels)
					predicted class labels for each row in XTest
	"""

	if type(clf) == str:
		if 'ridge' in clf.lower():
			clf = RidgeClassifier(tol=1e-2, solver="lsqr")
		elif "perceptron" in clf.lower():
			clf = Perceptron(n_iter=50)
		elif "passive aggressive" in clf.lower() or 'passive-aggressive' in clf.lower():
			clf = PassiveAggressiveClassifier(n_iter=50)
		elif 'linsvm' in clf.lower() or 'linearsvm' in clf.lower() or 'linearsvc' in clf.lower():
			clf = LinearSVC()
		elif 'svm' in clf.lower() or 'svc' in clf.lower():
			clf = SVC()
		elif 'sgd' in clf.lower():
			clf = SGDClassifier()
   
	clf.fit(XTrain, YTrain)
	YPred = clf.predict(XTest)


	accuracy = sklearn.metrics.accuracy_score(YTest, YPred)
	(overall_precision, overall_recall, overall_f1, support) = sklearn.metrics.precision_recall_fscore_support(YTest, YPred, average='micro')
	(precision_per_class, recall_per_class, f1_per_class, support_per_class) = sklearn.metrics.precision_recall_fscore_support(YTest, YPred)
	avg_precision_per_class = np.mean(precision_per_class)
	avg_recall_per_class = np.mean(recall_per_class)
	avg_f1_per_class = np.mean(f1_per_class)

	del clf

	if return_predicted_labels:
		return (accuracy, overall_precision, overall_recall, overall_f1, avg_precision_per_class, avg_recall_per_class, avg_f1_per_class, precision_per_class, recall_per_class, f1_per_class, YPred)
	else:
		return (accuracy, overall_precision, overall_recall, overall_f1, avg_precision_per_class, avg_recall_per_class, avg_f1_per_class, precision_per_class, recall_per_class, f1_per_class)
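# A hypothetical call using the string dispatch above (synthetic data; assumes
# the module-level sklearn imports this snippet relies on):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, n_classes=3, n_informative=5,
                           random_state=0)
XTr, XTe, YTr, YTe = train_test_split(X, y, test_size=0.3, random_state=0)
accuracy, *rest = Eval(XTr, YTr, XTe, YTe, 'ridge')
print("accuracy: %.3f" % accuracy)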
Example 20
def RFF_Form1_Classification(WN, b,TrainData, ValData, TestData, train_label, val_label, test_label, option=1):
    import numpy as np 
    import time
    import copy
    import sys
    from time import perf_counter  # time.clock was removed in Python 3.8
    
    D = np.shape(WN)[1]
    b=np.zeros((D,1))
    bn=copy.copy(b)
    PRFSGDAccuracy=np.zeros((D,1))    
    PRFRidgeAccuracy=np.zeros((D,1))
    #interval = np.arange(0,D,10)
    interval = [D]
    
    for k in interval:
        if (k==0):
            k=k+1
        W=WN[:,range(k)]
        b=bn[range(k)]
        RFTrainData= FeaturemapTransformation_Form1(W,TrainData)           
        RFTestData= FeaturemapTransformation_Form1(W,TestData)
        
        ## Pseudo RF SGD
        from minibatchSGDCV import minibatchRFSGDCV
        from minibatchSGD import RFminibatchSGD
        st_time = perf_counter()
        ncv=3
        RFcvparam, RFbestbatchparam, RFmeanscore = minibatchRFSGDCV(ValData,val_label,ncv,W,b,
                                                                    option=1, RFoption=1)
        end_time = perf_counter()
        print('PRFSGD Cross Validation Completed')
        print('Time required for PRFSGD CV is =', end_time-st_time)
        PRFclf = RFminibatchSGD(TrainData,train_label,W,b,option=1,batchsize=RFcvparam['batchsize'], 
                                           alpha=RFcvparam['alpha'], eta0=RFcvparam['eta0'], RFoption=1)
        PRFSGDClassifiedlabel=PRFclf.predict(RFTestData)
        #SCDconfMat=confusion_matrix(test_label,SGDClassifiedlabel)
        PRFSGDAccuracy[k-1]=sum(test_label==PRFSGDClassifiedlabel)/(float(len(test_label)))
#        print('RFSGD Completed')    
#        print("The classification accuracy with PsudeoRFSGD =",PRFSGDAccuracy[k-1])      
         
        
        ## Ridge regression Pseudo RF
        from sklearn.linear_model import RidgeClassifier
        from sklearn.metrics import confusion_matrix
        clf = RidgeClassifier(alpha=0.1)
        clf.fit(RFTrainData, train_label)
        RFRidgeClassifiedlabel = clf.predict(RFTestData)
        RFRidgeConfMat=confusion_matrix(test_label,RFRidgeClassifiedlabel)
        PRFRidgeAccuracy[k-1]=sum(test_label==RFRidgeClassifiedlabel)/(float(len(test_label)))
#        print("The classification accuracy with PsudeoRFRidge =",PRFRidgeAccuracy[k-1])
#        print("The feature expansion",k, "is over")
#        print('+++++++++++++++++++++++++++++++++++++++')
        
    ind = PRFSGDAccuracy>0
    PRFSGDAccuracy = PRFSGDAccuracy[ind]
    PRFRidgeAccuracy = PRFRidgeAccuracy[ind]    
    return PRFSGDAccuracy, PRFRidgeAccuracy
def rg_predict(train_data, valid_data, test_data, task_name="A"):
    train_features = train_data["feature"]
    train_labels = train_data["label"]
    valid_features = valid_data["feature"]
    valid_labels = valid_data["label"]
    test_features = test_data["feature"]
    rg = RidgeClassifier(max_iter=100)
    rg.fit(train_features, train_labels)
    pred_valid_labels = rg.predict(valid_features)
    if task_name == "A":
        f1_valid = f1_score(valid_labels, pred_valid_labels, pos_label=1)
    else:
        f1_valid = f1_score(valid_labels,
                            pred_valid_labels,
                            average="macro")
    print("F1 on valid : %f" % f1_valid)
    return rg.predict(train_features), rg.predict(
        valid_features), rg.predict(test_features), f1_valid
Example 22
class RidgeModel(ccobra.CCobraModel):
    def __init__(self, name='Ridge', k=1):
        super(RidgeModel, self).__init__(name, ["moral"], ["single-choice"])

        self.clf = RidgeClassifier(alpha=7)

        self.n_epochs = 1

    def pre_train(self, dataset):

        x = []
        y = []

        for subj_train_data in dataset:
            for seq_train_data in subj_train_data:

                seq_train_data['task'] = seq_train_data['item'].task
                inp = create_input(seq_train_data)

                target = float(output_mppng[seq_train_data['response'][0][0]])

                x.append(inp)

                y.append(target)
        x = np.array(x)
        y = np.array(y)

        self.train_x = x
        self.train_y = y

        self.train_network(self.train_x,
                           self.train_y,
                           self.n_epochs,
                           verbose=True)

    def train_network(self, train_x, train_y, n_epochs, verbose=False):
        print('Starting training...')
        for epoch in range(self.n_epochs):

            # Shuffle the training data
            perm_idxs = np.random.permutation(np.arange(len(train_x)))
            train_x = train_x[perm_idxs]
            train_y = train_y[perm_idxs]

            self.clf.fit(train_x, train_y)

            print('Mean accuracy:')
            print(self.clf.score(train_x, train_y))

    def predict(self, item, **kwargs):
        input = {'task': item.task}
        input['aux'] = kwargs
        x = np.array(create_input(input)).reshape(1, -1)
        output = self.clf.predict(x)

        self.prediction = output_mppngREV[output[0]]
        return self.prediction
Example 23
def do_rc(X_test, X_train, Y_train):
    # fit a ridge classifier (L2-regularized least squares) and predict
    clf = RidgeClassifier()
    print("starts fitting")
    print(clf.fit(X_train, Y_train))
    print("finished fitting, starts predictions")
    Y_pred = clf.predict(X_test)
    print("finished predictions")
    return Y_pred
Example 24
def ridge_classifier(self):
    self.log.writeToLog('Running Ridge Classifier Model...')
    X_train, X_test, y_train, y_test = self.train_test_split()
    rc = RidgeClassifier()
    trained_model = rc.fit(X_train, y_train)
    self.save_pickle(trained_model)
    y_pred = rc.predict(X_test)
    self.model_auc_roc(y_test, y_pred, "Ridge Classifier Model")
    self.model_evaluation(y_test, y_pred, "Ridge Classifier Model")
Example 25
def classifier(df, vectorizer):
    train_text = vectorizer.transform(df['text'])
    train_y = df['label'].values
    model = RidgeClassifier()
    logging.info('training ... ')
    model.fit(train_text, train_y)
    logging.info('predicting ... ')
    pred_y = model.predict(train_text)
    score(train_y, pred_y)
def _ridgeclassifier(*,
                     train,
                     test,
                     x_predict=None,
                     metrics,
                     alpha=1.0,
                     fit_intercept=True,
                     normalize=False,
                     copy_X=True,
                     max_iter=None,
                     tol=0.001,
                     class_weight=None,
                     solver='auto',
                     random_state=None):
    """For for info visit : 
        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html#sklearn.linear_model.RidgeClassifier
    """

    model = RidgeClassifier(alpha=alpha,
                            fit_intercept=fit_intercept,
                            normalize=normalize,
                            copy_X=copy_X,
                            max_iter=max_iter,
                            tol=tol,
                            class_weight=class_weight,
                            solver=solver,
                            random_state=random_state)
    model.fit(train[0], train[1])
    model_name = 'RidgeClassifier'
    y_hat = model.predict(test[0])

    if metrics == 'f1_score':
        accuracy = f1_score(test[1], y_hat)
    if metrics == 'jaccard_score':
        accuracy = jaccard_score(test[1], y_hat)
    if metrics == 'accuracy_score':
        accuracy = accuracy_score(test[1], y_hat)

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
Example 27
def RidgeReg(file1, file2):
    feature1, label1 = file2matrix(file1)
    clf = RidgeClassifier()
    clf.fit(feature1, label1)

    feature2, label2 = file2matrix(file2)
    y_true = label2
    y_score = clf.decision_function(feature2)
    y_pred = clf.predict(feature2)
    return y_true, y_score, y_pred
Example 28
def scikit_ridgeregression(self, dataset, labels):
    from sklearn.linear_model import RidgeClassifier
    lr = RidgeClassifier(fit_intercept=False, max_iter=100, random_state=0)
    lr.fit(dataset, labels)
    testset, truelabels = self.load_dataset(self.testdata)
    prob = lr.predict(testset)
    ans = prob * truelabels
    err_rate = float(np.sum(ans == -1)) / ans.shape[0]
    print("Scikit Learn RR Test Error Rate: {:.2f}".format(
        float(err_rate)))
def train_and_test(challenge):
    idealist = []
    if challenge == "all":
        for file in listdir(variables.linclasstrainingsdatapath):
            if isfile(join(variables.linclasstrainingsdatapath, file)):
                filename = file.split(".")[0]
                idealist += list(importDataHelper.readcsvdata(join(variables.linclasstrainingsdatapath, file)))
    else:
        idealist = list(importDataHelper.readcsvdata(variables.linclasstrainingsdatapath + challenge + ".csv"))
    featurelist = {}
    for row in idealist:
        for key in row.keys():
            featurelist[key] = featurelist.get(key, [])
            featurelist[key] += [int(x) for x in row[key].replace('[', '').replace(']', '').split(',')]
    testdata = pd.DataFrame(featurelist)
    X = testdata.drop('Spam', axis=1)
    y = testdata['Spam']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    clf = RidgeClassifier()
    y_score = clf.fit(X_train, y_train).decision_function(X_test)
    testres = clf.predict(X_test)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in [0, 1]:
        fpr[i], tpr[i], _ = roc_curve(y_test, y_score)
        roc_auc[i] = auc(fpr[i], tpr[i])

    plt.figure()
    lw = 2
    plt.plot(fpr[1], tpr[1], color="darkorange", lw=lw, label="ROC (AUC = %0.2f)" % roc_auc[1])
    plt.plot([0, 1], [0, 1], color="cornflowerblue", lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(challenge)
    plt.legend(loc="lower right")
    plt.savefig(variables.plotspath + "ROC_linClass_" + challenge + ".png")
    plt.show()
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    plt.show()
    confusion_matrix = ConfusionMatrix(y_test, testres)
    confusion_matrix.plot(normalized=True)
    plt.title(challenge)
    plt.savefig(variables.plotspath + "CM_linClass_" + challenge + ".png")
    plt.show()
    print(clf.coef_)
    print(classification_report(y_test, testres))
    print(confusion_matrix.stats())
Example 30
def main():
    """
    RidgeRegression classifier.
    """
    dct = True
    X_train, X_test, y_train, y_test = prepare_datasets(dct)
    model = RidgeClassifier(alpha=1.0)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    bal_acc = balanced_accuracy_score(y_test, pred)
    print(f"Balanced accuracy score: {bal_acc:g}")
Example 31
def confusion_matrix_for_problem(X, y, model=None):
    train_X, test_X, train_y, test_y = train_test_split(X,
                                                        y,
                                                        random_state=0,
                                                        stratify=y)

    if model is None:
        model = RidgeClassifier()
    model.fit(train_X, train_y)
    prediction_y = model.predict(test_X)

    return confusion_matrix(test_y, prediction_y)
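# A quick sanity check on a toy dataset (sketch; assumes the imports this
# snippet already relies on, e.g. confusion_matrix and train_test_split):
from sklearn.datasets import load_iris

X_iris, y_iris = load_iris(return_X_y=True)
print(confusion_matrix_for_problem(X_iris, y_iris))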
def Ridge_Classifier(train_X, test_X, train_y, test_y):
    clf = RidgeClassifier().fit(train_X, train_y)
    reversefactor = dict(zip(range(4), definitions))
    predicted_y = clf.predict(test_X)
    predicted_y = np.vectorize(reversefactor.get)(predicted_y)
    correct_y = np.vectorize(reversefactor.get)(test_y)
    cm = confusion_matrix(correct_y, predicted_y)
    print('')
    print("THIS IS THE RESULT FOR RIDGE CLASSIFIER")
    print(cm)
    acc = accuracy_score(correct_y, predicted_y)
    print("Accuracy of training data is " + str(acc))
    return acc
Example 33
def ridge_regression():
    train_df = pd.read_csv(r'C:\Users\Rookie\Desktop\nlp\train_set.csv',
                           sep='\t')

    vectorizer = CountVectorizer(max_features=3000)
    train_test = vectorizer.fit_transform(train_df['text'])

    clf = RidgeClassifier()
    clf.fit(train_test[:10000], train_df['label'].values[:10000])

    val_pred = clf.predict(train_test[10000:])
    print(f1_score(train_df['label'].values[10000:], val_pred,
                   average='macro'))
Example 34
def Predict():
    print('\nThere are %d new deals' % n_test)

    # Using the KNN classifier
    clf_KNN = KNeighborsClassifier(n_neighbors=3) # KNN does not work even if k has been tuned
    #clf_KNN = KNeighborsClassifier(n_neighbors=7)
    #clf_KNN = KNeighborsClassifier(n_neighbors=11)
    clf_KNN.fit(Corpus_train, Y_train)
    Y_pred_KNN = clf_KNN.predict(Corpus_test)
    print_rate(Y_test, Y_pred_KNN, n_test, 'KNNClassifier')
    
    # Using the SVM classifier
    clf_SVM = svm.SVC()
    clf_SVM.fit(Corpus_train, Y_train)
    Y_pred_SVM = clf_SVM.predict(Corpus_test)
    print_rate(Y_test, Y_pred_SVM, n_test, 'SVMClassifier')
    
    # Using the Ridge classifier
    clf_RC = RidgeClassifier(tol=0.01, solver="lsqr")
    #clf_RC = RidgeClassifier(tol=0.1, solver="lsqr")
    clf_RC.fit(Corpus_train, Y_train)
    Y_pred_RC = clf_RC.predict(Corpus_test)
    print_rate(Y_test, Y_pred_RC, n_test, 'RidgeClassifier')
    
    # won't consider Random Forests or Decision Trees because they perform poorly on high-dimensional sparse data
    
    
    # Using the Multinomial Naive Bayes classifier
    # I expect that this MNB classifier will do the best since it is designed for occurrence counts features
    #clf_MNB = MultinomialNB(alpha=0.01) #smoothing parameter = 0.01 is worse than 0.1
    clf_MNB = MultinomialNB(alpha=0.1)
    #clf_MNB = MultinomialNB(alpha=0.3) #a big smoothing rate does not benefit the model
    #clf_MNB = MultinomialNB(alpha=0.2) #or alpha = 0.05 can generate the best outcome
    clf_MNB.fit(Corpus_train, Y_train)
    Y_pred_MNB = clf_MNB.predict(Corpus_test)
    print_rate(Y_test, Y_pred_MNB, n_test, 'MultinomialNBClassifier')
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_den_train, X_den_test = X_den[train_index], X_den[test_index]

        # feed models
        clf_mNB.fit(X_train, y_train)
        # clf_kNN.fit(X_train, y_train)
        clf_ridge.fit(X_train, y_train)
        clf_lSVC.fit(X_train, y_train)
        clf_SVC.fit(X_den_train, y_train)

        # get prediction for this fold run
        pred_mNB    = clf_mNB.predict(X_test)
        # pred_kNN    = clf_kNN.predict(X_test)
        pred_ridge  = clf_ridge.predict(X_test)
        pred_lSVC   = clf_lSVC.predict(X_test)
        pred_SVC    = clf_SVC.predict(X_den_test)

        # update z array for each model
        z_mNB   = np.append(z_mNB    , pred_mNB  , axis=None)
        # z_kNN   = np.append(z_kNN    , pred_kNN  , axis=None)
        z_ridge = np.append(z_ridge  , pred_ridge, axis=None)
        z_lSVC  = np.append(z_lSVC   , pred_lSVC , axis=None)
        z_SVC   = np.append(z_SVC    , pred_SVC  , axis=None)


    # putting z's from each model into one 2d matrix
    # this is the (feature) input, similar as X, for level 1
    # In level 1, y is still y.
    # z = np.array([z_bNB, z_mNB, z_kNN, z_ridge, z_SGD, z_lSVC, z_SVC, z_tree, z_logis], dtype=np.int32)
Example 36
# Train classifier
clf = RidgeClassifier()
train = pd.read_csv("train/subtrain.csv", chunksize = 100000, iterator = True)
all_classes = np.array([0, 1])
for chunk in train:
    y_train = chunk["click"]
    chunk = chunk[cols]
    chunk = chunk.join(pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"]))
    chunk.drop(["hour"], axis=1, inplace = True)
    Xcat = fh.transform(np.asarray(chunk.astype(str)))
    clf.fit(Xcat, y_train)
    
# Create a submission file
usecols = cols + ["id"]
X_test = pd.read_csv("test/mtest.csv", usecols=usecols)
X_test = X_test.join(pd.DataFrame([dayhour(x) for x in X_test.hour], columns=["wd", "hr"]))
X_test.drop(["hour"], axis=1, inplace = True)

X_enc_test = fh.transform(np.asarray(X_test.astype(str)))

y_act = pd.read_csv("test/mtest.csv", usecols=['click'])
y_pred = clf.predict(X_enc_test)

with open('logloss.txt','a') as f:
    f.write('\n'+str(log_loss(y_act, y_pred)))

with open("submission/submission_ridgecv.csv", "w") as f:
    f.write("id,click\n")
    for idx, xid in enumerate(X_test.id):
        f.write(str(xid) + "," + "{0:.10f}".format(y_pred[idx]) + "\n")
f.close()
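# Note: RidgeClassifier.fit retrains from scratch, so the chunk loop above
# effectively keeps only the last chunk. A hypothetical incremental variant
# would use SGDClassifier.partial_fit (illustrative sketch reusing the same
# fh/cols/dayhour preprocessing; loss="log_loss" assumes scikit-learn >= 1.1):
from sklearn.linear_model import SGDClassifier

clf_inc = SGDClassifier(loss="log_loss", alpha=1e-4)
train_inc = pd.read_csv("train/subtrain.csv", chunksize=100000, iterator=True)
for chunk in train_inc:
    y_chunk = chunk["click"]
    chunk = chunk[cols]
    chunk = chunk.join(pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"]))
    chunk.drop(["hour"], axis=1, inplace=True)
    X_chunk = fh.transform(np.asarray(chunk.astype(str)))
    clf_inc.partial_fit(X_chunk, y_chunk, classes=all_classes)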
Example 37
    X_train_train, X_train_test = X_train[train_index], X_train[test_index]
    y_train_train, y_train_test = y_train[train_index], y_train[test_index]

    # X_den_train, X_den_test = X_den[train_index], X_den[test_index]

    # feed models
    clf_mNB.fit(X_train_train, y_train_train)
    clf_kNN.fit(X_train_train, y_train_train)
    clf_ridge.fit(X_train_train, y_train_train)
    clf_lSVC.fit(X_train_train, y_train_train)
    clf_SVC.fit(X_train_train, y_train_train)

    # get prediction for this fold run
    pred_mNB    = clf_mNB.predict(X_train_test)
    pred_kNN    = clf_kNN.predict(X_train_test)
    pred_ridge  = clf_ridge.predict(X_train_test)
    pred_lSVC   = clf_lSVC.predict(X_train_test)
    pred_SVC    = clf_SVC.predict(X_train_test)

    # update z array for each model
    z_mNB   = np.append(z_mNB    , pred_mNB  , axis=None)
    z_kNN   = np.append(z_kNN    , pred_kNN  , axis=None)
    z_ridge = np.append(z_ridge  , pred_ridge, axis=None)
    z_lSVC  = np.append(z_lSVC   , pred_lSVC , axis=None)
    z_SVC   = np.append(z_SVC    , pred_SVC  , axis=None)


# putting z's from each model into one 2d matrix
# this is the (feature) input, similar as X, for level 1
# In level 1, y is still y.
# z = np.array([z_bNB, z_mNB, z_kNN, z_ridge, z_SGD, z_lSVC, z_SVC, z_tree, z_logis], dtype=np.int32)
Example 38
#clf_KNN = KNeighborsClassifier(n_neighbors=11)
clf_KNN.fit(Corpus_train, Y_train)
Y_pred_KNN = clf_KNN.predict(Corpus_test)
print_rate(Y_test, Y_pred_KNN, n_test, 'KNNClassifier')

# Using the SVM classifier
clf_SVM = svm.SVC()
clf_SVM.fit(Corpus_train, Y_train)
Y_pred_SVM = clf_SVM.predict(Corpus_test)
print_rate(Y_test, Y_pred_SVM, n_test, 'SVMClassifier')

# Using the Ridge classifier
clf_RC = RidgeClassifier(tol=0.01, solver="lsqr")
#clf_RC = RidgeClassifier(tol=0.1, solver="lsqr")
clf_RC.fit(Corpus_train, Y_train)
Y_pred_RC = clf_RC.predict(Corpus_test)
print_rate(Y_test, Y_pred_RC, n_test, 'RidgeClassifier')

# won't consider Random Forests or Decision Trees because they perform poorly on high-dimensional sparse data


# Using the Multinomial Naive Bayes classifier
# I expect that this MNB classifier will do the best since it is designed for occurrence counts features
#clf_MNB = MultinomialNB(alpha=0.01) #smoothing parameter = 0.01 is worse than 0.1
clf_MNB = MultinomialNB(alpha=0.1)
#clf_MNB = MultinomialNB(alpha=0.3) #a big smoothing rate does not benefit the model
#clf_MNB = MultinomialNB(alpha=0.2) #or alpha = 0.05 can generate the best outcome
clf_MNB.fit(Corpus_train, Y_train)
Y_pred_MNB = clf_MNB.predict(Corpus_test)
print_rate(Y_test, Y_pred_MNB, n_test, 'MultinomialNBClassifier')
#score = metrics.f1_score(Y_test, Y_pred_MNB)
Example 39
File: ridge.py Project: mb16/Kaggle
def main():

    startCol = 0
    endCol = 50  # max = 1775

    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train][1:3000]
    targetTest = [x[0] for x in train][3001:]
    trainTest = [x[startCol+1:endCol+1] for x in train][3001:]
    test = csv_io.read_data("../Data/test.csv")
    test = [x[startCol:endCol] for x in test]
	
    train = [x[startCol+1:endCol+1] for x in train][1:3000]	
	
    fo = open("knn_stats.txt", "a+")

    rf = RidgeClassifier(alpha=0.01, fit_intercept=True, normalize=False, copy_X=True, tol=0.001) 
	
    rf.fit(train, target)
    prob = rf.predict(trainTest) # changed from test


    result = 100
    probSum = 0
    for i in range(0, len(prob)):
        probX = prob[i] # [1]
        if probX > 0.7:
            probX = 0.7
        if probX < 0.3:
            probX = 0.3
        print(i, probSum, probX, target[i])
        print(target[i]*log(probX), (1-target[i])*log(1-probX))
        probSum += targetTest[i]*log(probX)+(1-targetTest[i])*log(1-probX)

        #print(probSum)
        #print(len(prob))
        #print("C: ", 10**C, " gamma: ", 2**g)
        print(-probSum/len(prob))
	

	
    if -probSum/len(prob) < result:
        result = -probSum/len(prob)
        predicted_probs = rf.predict(test)  # was test
        predicted_probs = ["%f" % x for x in predicted_probs]
        csv_io.write_delimited_file("../Submissions/knn.csv", predicted_probs)
        print("Generated Data!!")
		
    #fo.write(str(5) + str(5)+ str(5));
		
    fo.close()
		
    #csv_io.write_delimited_file("../Submissions/rf_benchmark_test2.csv", predicted_probs)

    #predicted_probs = rf.predict_proba(train) # changed from test
 
    #predicted_probs = ["%f" % x[1] for x in predicted_probs]
    #predicted_probs = rf.predict(train) # changed from test
    #predicted_probs = ["%f" % x for x in predicted_probs]	
	
    #csv_io.write_delimited_file("../Submissions/rf_benchmark_train2.csv", predicted_probs)
	
	
    var = input("Enter to terminate.")
Example 40
def classify(granularity=10):
    trainDir = path.join(GEOTEXT_HOME, 'processed_data/' + str(granularity).strip() + '_clustered/')
    testDir = path.join(GEOTEXT_HOME, 'processed_data/test')
    data_train = load_files(trainDir, encoding=encoding)
    target = data_train.target
    data_test = load_files(testDir, encoding=encoding)

    categories = data_train.target_names
    
    def size_mb(docs):
        return sum(len(s.encode(encoding)) for s in docs) / 1e6
    
    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)
    
    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()
    
    # split a training set and a test set
    y_train = data_train.target
    y_test = data_test.target
    
    
    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    vectorizer = TfidfVectorizer(use_idf=True, norm='l2', binary=False, sublinear_tf=True, min_df=2, max_df=1.0, ngram_range=(1, 1), stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()
    
    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()
    chi = False
    if chi:
        k = 500000
        print("Extracting %d best features by a chi-squared test" % 0)
        t0 = time()
        ch2 = SelectKBest(chi2, k=k)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        
        print("done in %fs" % (time() - t0))
        print()
        
    feature_names = np.asarray(vectorizer.get_feature_names())
    # clf = LinearSVC(loss='l2', penalty='l2', dual=True, tol=1e-3)
    clf = RidgeClassifier(tol=1e-2, solver="auto")
    print('_' * 80)
    print("Training: ")
    print(clf)
    
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    scores = clf.decision_function(X_test)
    print(scores.shape)
    print(pred.shape)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    # score = metrics.f1_score(y_test, pred)
    # print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print("top 10 keywords per class:")
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print("%s: %s" % (category, " ".join(feature_names[top10])))

    
    sumMeanDistance = 0
    sumMedianDistance = 0
    distances = []
    confidences = []
    randomConfidences = []
    
    for i in range(0, len(pred)):
        user = path.basename(data_test.filenames[i])
        location = userLocation[user].split(',')
        lat = float(location[0])
        lon = float(location[1])
        prediction = categories[pred[i]]
        confidence = scores[i][pred[i]] - mean(scores[i])
        randomConfidence = scores[i][random.randint(0, len(categories) - 1)]
        confidences.append(confidence)
        randomConfidences.append(randomConfidence)
        medianlat = classLatMedian[prediction]  
        medianlon = classLonMedian[prediction]  
        meanlat = classLatMean[prediction] 
        meanlon = classLonMean[prediction]      
        distances.append(distance(lat, lon, medianlat, medianlon))
        sumMedianDistance = sumMedianDistance + distance(lat, lon, medianlat, medianlon)
        sumMeanDistance = sumMeanDistance + distance(lat, lon, meanlat, meanlon)
    averageMeanDistance = sumMeanDistance / float(len(pred))
    averageMedianDistance = sumMedianDistance / float(len(pred))
    print "Average mean distance is " + str(averageMeanDistance)
    print "Average median distance is " + str(averageMedianDistance)
    print "Median distance is " + str(median(distances))
    fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
    
    plt.xlim(0, 4000)
    plt.ylim(0, 2)
    ax1.scatter(distances, confidences)
    ax2.bar(distances, confidences)
    plt.savefig(path.join(GEOTEXT_HOME, 'confidence.png'))
Example 41
    data = [ i for i in csv.reader(open(train_file, 'r')) ]
    data = data[1:] # remove header
    random.shuffle(data)

    X = np.array([ i[1:] for i in data ]).astype(float)
    Y = np.array([ i[0] for i in data ]).astype(int)

    train_cutoff = len(data) * 3/4

    X_train = X[:train_cutoff]
    Y_train = Y[:train_cutoff]
    X_test = X[train_cutoff:]
    Y_test = Y[train_cutoff:]

    classifier = RidgeClassifier(normalize = True, alpha = 1)
    classifier = classifier.fit(X_train, Y_train)
    
    print('Training error : %s' % classifier.score(X_train, Y_train))

    Y_predict = classifier.predict(X_test)

    equal = 0
    for i in range(len(Y_predict)):
        if Y_predict[i] == Y_test[i]:
            equal += 1

    print('Accuracy = %s' % (float(equal)/len(Y_predict)))


duration = time() - t0
print("n_samples: %d, n_features: %d" % X_test.shape)
print("Done in %fs" % (duration))

def writeToDisk(predn,clfname):
    target="./"+clfname+".txt"
    target=open(target,'w')
    target.write("{}\t{}\n".format("record_id", "topic"))
    for x in zip(testID, predn):
        target.write("{}\t{}\n".format(x[0], x[1]))
    target.close()
    print(clfname," output written to disk.")

clf1=RidgeClassifier(tol=1e-2, solver="lsqr")   #Ridge Classifier
clf1.fit(X_train, y_train)
pred = clf1.predict(X_test)
writeToDisk(pred,"RidgeClassifier")

clf2=MultinomialNB(alpha=.01)                   #Naive Bayes classifier
clf2.fit(X_train, y_train)
pred = clf2.predict(X_test)
writeToDisk(pred,"MultinomialNB")

clf3=BernoulliNB(alpha=.01)                     #Naive Bayes(Bernoulli) classifier
clf3.fit(X_train, y_train)
pred = clf3.predict(X_test)
writeToDisk(pred,"BernoulliNB")

clf4=KNeighborsClassifier(n_neighbors=10)       #KNeighbors Classifier
clf4.fit(X_train, y_train)
pred = clf4.predict(X_test)
Example 43
# print

X_train = X_train.toarray()
X_test = X_test.toarray()

# clf = BernoulliNB(alpha=.1)
# clf = MultinomialNB(alpha=.01)
# clf = KNeighborsClassifier(n_neighbors=3)
clf = RidgeClassifier(tol=1e-1)
# clf = RandomForestClassifier(n_estimators=20, max_depth=None, min_split=3, random_state=42)
# clf = SGDClassifier(alpha=.01, n_iter=50, penalty="l2")
# clf = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)


clf.fit(X_train, y_train)
pred = clf.predict(X_test)

print "y    : ", y_test
print "pred : ", pred
print

# # print out top words for each category
# for i, category in enumerate(categories):
#             top = np.argsort(clf.coef_[i, :])[-20:]
#             print "%s: %s" % (category, " ".join(vocabulary[top]))
#             print
# print
# print


pre_score = metrics.precision_score(y_test, pred)
Example 44
X_train = cityName

print('Creating the vectorizer and chosing a transform (from raw text to feature)')
vect= TfidfVectorizer(sublinear_tf=True, max_df=0.5)
#vect=CountVectorizer(min_n=1,max_n=2,max_features=1000);

X_train = vect.fit_transform(X_train)


cityClass = RidgeClassifier(tol=1e-7)
countryClass = RidgeClassifier(tol=1e-7)

print('Creating a classifier for cities')
cityClass.fit(X_train,cityCode)
print('Creating a classifier for countries')
countryClass.fit(X_train,countryCode)

print('testing the performance')

testCityNames = vect.transform(cityNameTest)

predictionsCity = cityClass.predict(testCityNames)
predictionsCountry = countryClass.predict(testCityNames)

with open('predictions.csv','w') as csvfile:
        writer = csv.writer(csvfile)
        #for ind in range(0,len(predictionsCountry)):
        #        writer.writerow([str(predictionsCountry[ind]),str(predictionsCity[ind])])
        for predCountry,predCity in zip(predictionsCountry,predictionsCity):
                writer.writerow([predCountry,predCity])
    # pre_all = 0.0
    # rec_all = 0.0
    f1_all = []
    f5_all = []
    acc_all = []
    pre_all = []
    rec_all = []

    # level 1 evaluation
    for train_index, test_index in kf1:

        z_train, z_test = z[train_index], z[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(z_train, y_train)
        pred = clf.predict(z_test)

        # metrics
        acc_score = metrics.accuracy_score(y_test, pred)
        pre_score = metrics.precision_score(y_test, pred)
        rec_score = metrics.recall_score(y_test, pred)

        acc_all.append(acc_score)
        pre_all.append(pre_score)
        rec_all.append(rec_score)

    # put the lists into numpy array for calculating the results
    acc_all_array  = np.asarray(acc_all)
    pre_all_array  = np.asarray(pre_all)
    rec_all_array  = np.asarray(rec_all)
Example 46
#!/usr/bin/env python
"""
Ridge regression for Avito
"""
__author__ = "deniederhut"
__license__ = "GPL"
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

data = pd.read_table('/Users/dillonniederhut/Desktop/avito_train.tsv',nrows=100000)
#replace with file path to your training data

features = pd.get_dummies(data.subcategory)
features_train, features_test, target_train, target_test =\
    train_test_split(features, data.is_blocked, test_size = 0.25)

ridge = RidgeClassifier()
ridge.fit(features_train, target_train)
prediction = np.round(ridge.predict(features_test))
print(classification_report(target_test, prediction))
print(average_precision_score(target_test, prediction))
print(roc_auc_score(target_test, prediction))
Example 47
# the result of z is a 2d array with shape of (n_samples, n_categories)
# the elements are the sum of probabilities of classifiers on each (sample,category) pair
# Possible preprocessing on z
# test_z = normalize(test_z, norm="l2")
# z = scale(z)



###############################################################################
# Test classifier on test dataset

# clf = DecisionTreeClassifier(max_depth=14, min_split=5)
# clf = MultinomialNB(alpha=.01)
# clf = KNeighborsClassifier(n_neighbors=19)
clf = RidgeClassifier(tol=1e-1)
# clf = LinearSVC(loss='l2', penalty='l2', C=0.5, dual=False, tol=1e-3)
# clf = SVC(C=32, gamma=0.0625)
# print clf

clf.fit(z, y_train)
pred = clf.predict(test_z)

pre_score = metrics.precision_score(y_test, pred)
rec_score = metrics.recall_score(y_test, pred)

# print "average f1-score:   %0.5f" % ((2*pre_score*rec_score)/(pre_score+rec_score))
# print "average f5-score:   %0.5f" % ((1.25*pre_score*rec_score)/(0.25*pre_score+rec_score))
print "average f1-score:   %0.2f" % (100*((2*pre_score*rec_score)/(pre_score+rec_score)))
print "average f5-score:   %0.2f" % (100*((1.25*pre_score*rec_score)/(0.25*pre_score+rec_score)))
print "average precision:  %0.5f" % pre_score
print "averege recall:     %0.5f" % rec_score
        ):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf, name))   

# Attach classifier to the original json file

# loading dtm file for all twitts
fp = open('./python_files/twitter_dtm.pkl', 'rb')
dtm = pkl.load(fp)
fp.close()

# Predict the labels using Ridges classifier
clf = RidgeClassifier(alpha=1.,tol=1e-2, solver="lsqr")
clf.fit(X_train, y_train)
predicted_labels = clf.predict(dtm)

# loading json file for all twitts
file_name = '../R Project/Data/obamacare.json'
line_reader = open(file_name,'r') # r means for reading

# building a new json file for all twitts + new predicted labels
new_file_name = '../R Project/Data/obamacare_labeled.json'
line_writer = open(new_file_name,'w') # w means for writing

# adding the predicted label to each entry of json file
twit_i = 0
for line in line_reader:
    label = predicted_labels[twit_i]
    if label==0:
        ideology = 'C'
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    X_den_train, X_den_test = X_den[train_index], X_den[test_index]

    # feed models
    clf_mNB.fit(X_train, y_train)
    clf_kNN.fit(X_train, y_train)
    clf_ridge.fit(X_train, y_train)
    clf_lSVC.fit(X_train, y_train)
    clf_SVC.fit(X_train, y_train)

    # get prediction for this fold run
    pred_mNB    = clf_mNB.predict(X_test)
    pred_kNN    = clf_kNN.predict(X_test)
    pred_ridge  = clf_ridge.predict(X_test)
    pred_lSVC   = clf_lSVC.predict(X_test)
    pred_SVC    = clf_SVC.predict(X_test)

    # update z array for each model
    z_mNB   = np.append(z_mNB    , pred_mNB  , axis=None)
    z_kNN   = np.append(z_kNN    , pred_kNN  , axis=None)
    z_ridge = np.append(z_ridge  , pred_ridge, axis=None)
    z_lSVC  = np.append(z_lSVC   , pred_lSVC , axis=None)
    z_SVC   = np.append(z_SVC    , pred_SVC  , axis=None)


# putting z's from each model into one 2d matrix
# this is the (feature) input, similar as X, for level 1
# In level 1, y is still y.
# z = np.array([z_bNB, z_mNB, z_kNN, z_ridge, z_SGD, z_lSVC, z_SVC, z_tree, z_logis], dtype=np.int32)