Code Example #1
def train_and_predict_m8(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM7, stemmer_type='porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df=5, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 5), smooth_idf=True, sublinear_tf=True, stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    print ("Fitting Ridge Classifer...")
    clf = RidgeClassifier(class_weight = 'auto', alpha = 1, normalize = True)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'alpha' : [0.1, 0.3, 1, 3, 10], 'normalize' : [True, False]}
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
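
The perform_grid_search helper is not shown above; a minimal sketch of what it could look like, assuming GridSearchCV with a quadratic-weighted-kappa scorer built from scikit-learn's cohen_kappa_score (the comment above names that metric):

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import cohen_kappa_score, make_scorer

def perform_grid_search(clf, param_grid, X, labels):
    # rank parameter candidates by quadratic weighted kappa
    kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
    search = GridSearchCV(clf, param_grid, scoring=kappa_scorer, cv=5)
    search.fit(X, labels)
    print("Best parameters:", search.best_params_)
    return search.best_estimator_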
Code Example #2
    def linear_readout(Xtrain, Ytrain, Xtest, Ytest):
        '''
        Readout (accuracy) evaluation. To assess the uniqueness of the projected patterns.
        A ridge classifier is used.
        Input:
            - Xtrain, ... (torch.Tensor): dataset. Note that Xtrain and Xtest are the
                                          projected values; the Y labels do not need to change.
        Output:
            - accuracy_score (float): accuracy of the classification of the given data.
        '''

        from sklearn.linear_model import RidgeClassifier
        from sklearn.metrics import accuracy_score

        num_batches_train = Xtrain.shape[0]
        batch_size_train = Xtrain.shape[1]
        train_set_length = num_batches_train * batch_size_train
        Xtrain = Xtrain.cpu().numpy().reshape(train_set_length, -1)
        Ytrain = Ytrain.cpu().numpy().reshape(train_set_length, -1).ravel()

        num_batches_test = Xtest.shape[0]
        batch_size_test = Xtest.shape[1]
        test_set_length = num_batches_test * batch_size_test
        Xtest = Xtest.cpu().numpy().reshape(test_set_length, -1)
        Ytest = Ytest.cpu().numpy().reshape(test_set_length, -1).ravel()

        classifier = RidgeClassifier()
        classifier.fit(Xtrain, Ytrain)
        predicted_labels = classifier.predict(Xtest)

        return accuracy_score(Ytest, predicted_labels)
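
A usage sketch with random tensors (shapes are illustrative assumptions, and the function is assumed callable directly; batched inputs are flattened inside):

import torch

Xtrain = torch.randn(10, 32, 64)        # 10 batches of 32 projected patterns
Ytrain = torch.randint(0, 3, (10, 32))  # matching integer labels
Xtest = torch.randn(4, 32, 64)
Ytest = torch.randint(0, 3, (4, 32))
print("readout accuracy:", linear_readout(Xtrain, Ytrain, Xtest, Ytest))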
Code Example #3
 def __train_lr(
     self,
     x_train_fused,
     y_train_fused,
     x_train_unfused,
     y_train_unfused,
     num_rows_having_paraphrases,
 ):
     logistic_model_unfused = RidgeClassifier(alpha=1.0)
     logistic_model_unfused.fit(x_train_unfused, y_train_unfused)
     logistic_model_fused = RidgeClassifier(alpha=1.0)
     logistic_model_fused.fit(x_train_fused, y_train_fused)
     if num_rows_having_paraphrases < 1:
         logging.warning(
             "Classifier data had no questions with paraphrases. This makes cross validation checks fail, so they will be skipped"
         )
         return [], -1, logistic_model_fused, logistic_model_unfused
     scores = cross_val_score(logistic_model_fused,
                              x_train_fused,
                              y_train_fused,
                              cv=2)
     predicted = cross_val_predict(logistic_model_fused,
                                   x_train_fused,
                                   y_train_fused,
                                   cv=2)
     accuracy = metrics.accuracy_score(y_train_fused, predicted)
     return scores, accuracy, logistic_model_fused, logistic_model_unfused
Code Example #4
 def evaluate_random_binning(X, y, X_test, y_test, M, task):
     # construct random binning features
     start_time = time.time()
     rb = RandomBinning(X.shape[1], M)
     Z, _ = rb.get_features(X)  # unpack first; dividing the returned tuple raises TypeError
     Z = Z / np.sqrt(M)
     Z_test, _ = rb.get_features(X_test, expand=False)
     Z_test = Z_test / np.sqrt(M)
     if (task == 'classification'):
         clf = RidgeClassifier(alpha=0.0001, solver='lsqr')
         clf.fit(Z, y)
         y_pred = clf.predict(Z_test)
         error_test = (
             0.5 - np.dot(np.sign(y_test), y_pred) / len(y_test) / 2) * 100
         print("--- %s seconds ---" % (time.time() - start_time))
         print("C = %d; error_test = %.2f" % (np.shape(Z)[1], error_test) +
               '%')
     elif (task == 'regression'):
         clf = Ridge(alpha=0.01, solver='lsqr', random_state=42)
         clf.fit(Z, y)
         y_pred = clf.predict(Z_test)
         error_test = np.linalg.norm(
             (y_test - y_pred)) / np.linalg.norm(y_test) * 100
         print("--- %s seconds ---" % (time.time() - start_time))
         print("C = %d; error_test = %.2f" % (np.shape(Z)[1], error_test) +
               '%')
     else:
         error_test = 'error!'
         print('No such task; please check the task name!')
     return error_test
Code Example #5
File: train.py Project: Shajiu/Text-Classification
def Parameter_regularization(train):
    '''
    Effect of the regularization parameter on the model.
    :param train:
    :return:
    '''
    sample = train
    n = int(2 * len(sample) / 3)
    tfidf = TfidfVectorizer(ngram_range=(2, 3), max_features=2500)
    train_test = tfidf.fit_transform(sample['text'].values.astype("U"))
    train_x = train_test[:n]
    train_y = sample['label'].values[:n]

    test_x = train_test[n:]
    test_y = sample['label'].values[n:]
    f1 = []
    for i in range(10):
        clf = RidgeClassifier(alpha=0.15 * (i + 1), solver='sag')
        clf.fit(train_x, train_y)
        val_pred = clf.predict(test_x)
        f1.append(f1_score(test_y, val_pred, average='macro'))

    plt.plot([0.15 * (i + 1) for i in range(10)], f1)
    plt.xlabel('alpha')
    plt.ylabel('f1_score')
    print(f1)
    plt.show()
Code Example #6
    def explain_ls(self, nsample=200):
        self.Z = []
        self.Z0 = []
        for i in range(0, nsample):
            rand = self.rand_sphere(0, self.radius / 2)
            z = [self.mittelpunkt[0] + rand[0], self.mittelpunkt[1] + rand[1]]
            self.Z.append([])
            self.Z[i].append(z[0])
            self.Z[i].append(z[1])

            z0 = self.retransform(z)
            dummy = copy.copy(self.instance)
            dummy[self.attr[0]] = z0[0]
            dummy[self.attr[1]] = z0[1]
            self.Z0.append([])
            self.Z0[i].append(z0[0])
            self.Z0[i].append(z0[1])

            coord3 = np.array(
                self.clf.predict_proba(np.array(dummy).reshape(1, -1))[0])[1]
            self.Z[i].append(int(coord3 >= 0.5))

        clf = RidgeClassifier(alpha=1.0)
        X = np.array(self.Z)[:, 0:2]
        y = np.array(self.Z)[:, 2]
        clf.fit(X, y)
        self.explainer = clf
Code Example #7
def ridgeClass(features, response):
    from sklearn.linear_model import RidgeClassifier
    X_train, X_test, y_train, y_test = train_test_split(
        features, response, test_size=0.4)  # , random_state=4)
    ridgeC = RidgeClassifier()
    ridgeC.fit(X_train, y_train)
    print(ridgeC.score(X_test, y_test))
Code Example #8
class SupervisedBandit:
    def __init__(self, num_arms=3):
        self.K = num_arms
        self.training_data = None
        self.training_labels = None
        self.clf = RidgeClassifier()
        self.dont_fit = True

    def take_action(self, features):
        if self.training_data is None:
            return torch.tensor(np.random.choice(self.K))
        elif not self.dont_fit:  # don't fit until have enough unique classes
            return torch.tensor(self.clf.predict(features))
        else:
            return torch.tensor(self.training_labels[0])

    def add_data(self, features, correct_action):
        if self.training_data is None:
            self.training_data = features
            self.training_labels = np.array([correct_action])
        else:
            self.training_data = torch.cat((self.training_data, features))
            self.training_labels = np.concatenate(
                (self.training_labels, [correct_action]))

        if len(np.unique(self.training_labels)) > 1:
            # solver needs at least 2 unique classes to fit
            self.dont_fit = False
            self.clf.fit(self.training_data, self.training_labels)
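
A usage sketch with a made-up oracle (the environment, feature shape, and feedback are illustrative assumptions):

import torch

bandit = SupervisedBandit(num_arms=3)
for _ in range(20):
    features = torch.randn(1, 5)
    action = bandit.take_action(features)            # random until 2+ classes seen
    correct_action = int(torch.randint(0, 3, (1,)))  # oracle feedback
    bandit.add_data(features, correct_action)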
Code Example #9
def text_classify_influence_by_add_regularization():
    """
    探究正则化对文本分类的影响
    """
    train_df = pd.read_csv('./data/train_set.csv', sep='\t', nrows=15000)
    sample = train_df[0:5000]
    n = int(2 * len(sample) / 3)
    tfidf = TfidfVectorizer(ngram_range=(2, 3), max_features=2500)
    train_test = tfidf.fit_transform(sample['text'])
    train_x = train_test[:n]
    train_y = sample['label'].values[:n]
    test_x = train_test[n:]
    test_y = sample['label'].values[n:]

    f1 = []
    for i in range(10):
        clf = RidgeClassifier(alpha=0.15 * (i + 1), solver='sag')
        clf.fit(train_x, train_y)
        val_pred = clf.predict(test_x)
        f1.append(f1_score(test_y, val_pred, average='macro'))

    plt.plot([0.15 * (i + 1) for i in range(10)], f1)
    plt.xlabel('alpha')
    plt.ylabel('f1_score')
    plt.show()
Code Example #10
File: ml_model.py Project: AbdulMunim97/AI-project
def mlPreddict(label):
    df = pd.read_csv('classification.csv')
    df_names = df
    Xfeatures = df_names['Names']
    cv = CountVectorizer()
    x = cv.fit_transform(Xfeatures)
    y = df_names.Classes
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=12)

    knn = KNeighborsClassifier(n_neighbors=158)
    r_c = RidgeClassifier()
    m_nb = MultinomialNB()
    dt_c = DecisionTreeClassifier()
    svm_c = svm.SVC()

    knn.fit(x_train, y_train)
    r_c.fit(x_train, y_train)
    m_nb.fit(x_train, y_train)
    dt_c.fit(x_train, y_train)
    svm_c.fit(x_train, y_train)

    knn.score(x_test, y_test)
    r_c.score(x_test, y_test)
    m_nb.score(x_test, y_test)
    dt_c.score(x_test, y_test)
    svm_c.score(x_test, y_test)

    sample_name = [label]
    vect = cv.transform(sample_name).toarray()

    prediction = svm_c.predict(vect)
    predict = ''.join(map(str, prediction))
    print(predict)

    return predict
Code Example #11
def impute_nan(df, ds, dF):
    if ds.isnull().any():
        labeler_st = LabelEncoder()
        rc_st = RidgeClassifier(tol=1e-2, solver="sag")
        Sg = Series(labeler_st.fit_transform(ds.astype(str)), index=ds.index)
        Sg = Sg.where(ds.notnull(), ds, axis=0)
        x_notna = df.GR[Sg.notnull()].to_numpy().reshape(-1, 1)
        y_notna = Sg[Sg.notnull()].to_numpy().astype('int').ravel()
        x_nan = df.GR[Sg.isnull()].to_numpy().reshape(-1, 1)
        rc_st.fit(x_notna,y_notna)
        Sg[Sg.isnull()]=rc_st.predict(x_nan)
        Sg=Series(Sg, index=ds.index).astype(int)
        ds=Series(labeler_st.inverse_transform(Sg.values.ravel()), index=ds.index)
        #print('\nStratigraphy:', np.unique(ds))
    if dF.isnull().any():
        rc_fm = RidgeClassifier(tol=1e-2, solver="sag")
        labeler_fm = LabelEncoder()
        Fm = Series(labeler_fm.fit_transform(dF.astype(str)), index=dF.index)
        labeler_st = LabelEncoder()
        Sg=Series(labeler_st.fit_transform(ds.astype(str)), index=ds.index)
        Fm=Fm.where(dF.notnull(), dF, axis=0)
        x_notna = np.concatenate((df.GR[Fm.notnull()].to_numpy().reshape(-1, 1), 
                                  Sg[Fm.notnull()].to_numpy().reshape(-1, 1)), 
                                 axis=1)
        y_notna = Fm[Fm.notnull()].to_numpy().astype('int').ravel()
        x_nan = np.concatenate((df.GR[Fm.isnull()].to_numpy().reshape(-1, 1), 
                                Sg[Fm.isnull()].to_numpy().reshape(-1, 1)), axis=1)
        rc_fm.fit(x_notna,y_notna)
        Fm[Fm.isnull()]=rc_fm.predict(x_nan)
        Fm=Series(Fm, index=dF.index).astype(int)
        dF=Series(labeler_fm.inverse_transform(Fm.values.ravel()), index=dF.index)
        #print('\nFormation:', np.unique(dF))
    return Sg, Fm
Code Example #12
def run(input_train, input_test, output_name):
    """
    Takes a file path as input, a file path as output, and produces a sorted csv of
    item IDs for Kaggle submission
    -------
    input_train : 'full path of the training file'
    input_test : 'full path of the testing file'
    output_name : 'full path of the output file'
    """

    data = pd.read_table(input_train)
    test = pd.read_table(input_test)
    testItemIds = test.itemid
    response = data.is_blocked
    dummies = sparse.csc_matrix(pd.get_dummies(data.subcategory))
    pretestdummies = pd.get_dummies(test.subcategory)
    testdummies = sparse.csc_matrix(pretestdummies.drop(['Растения', 'Товары для компьютера'],axis=1))
    words = np.array(data.description,str)
    testwords = np.array(test.description,str)
    del data, test
    vect = text.CountVectorizer(decode_error = u'ignore', strip_accents='unicode', ngram_range=(1,2))
    corpus = np.concatenate((words, testwords))
    vect.fit(corpus)
    counts = vect.transform(words)
    features = sparse.hstack((dummies,counts))
    clf = RidgeClassifier()
    clf.fit(features, response)
    testcounts = vect.transform(testwords)
    testFeatures = sparse.hstack((testdummies,testcounts))
    predicted_scores = clf.decision_function(testFeatures)  # RidgeClassifier has no predict_proba; rank by margin
    f = open(output_name,'w')
    f.write("id\n") 
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
        f.write("%d\n" % (item_id))
    f.close()
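
Since RidgeClassifier lacks predict_proba (hence the decision_function fix above), the ranking relies on raw margins; if probability-like values are needed, a rough sketch is to squash the margins, with the caveat that this is not a calibrated probability:

from scipy.special import expit  # logistic squashing of the margin

pseudo_proba = expit(clf.decision_function(testFeatures))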
Code Example #13
def test_class_weights():
    # Test class weights.
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0], [1.0,
                                                                     0.0]])
    y = [1, 1, 1, -1, -1]

    reg = RidgeClassifier(class_weight=None)
    reg.fit(X, y)
    assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([1]))

    # we give a small weights to class 1
    reg = RidgeClassifier(class_weight={1: 0.001})
    reg.fit(X, y)

    # now the hyperplane should rotate clock-wise and
    # the prediction on this point should shift
    assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([-1]))

    # check if class_weight = 'balanced' can handle negative labels.
    reg = RidgeClassifier(class_weight='balanced')
    reg.fit(X, y)
    assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([1]))

    # class_weight = 'balanced', and class_weight = None should return
    # same values when y has equal number of all labels
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0]])
    y = [1, 1, -1, -1]
    reg = RidgeClassifier(class_weight=None)
    reg.fit(X, y)
    rega = RidgeClassifier(class_weight='balanced')
    rega.fit(X, y)
    assert len(rega.classes_) == 2
    assert_array_almost_equal(reg.coef_, rega.coef_)
    assert_array_almost_equal(reg.intercept_, rega.intercept_)
Code Example #14
def validate(input_train, rows=True, test=0.25):
    """
    Takes file as input and returns classification report, average precision, and
    AUC for a bigram model. By default, loads all rows of a dataset, trains on .75,
    and tests on .25. 
    ----
    input_train : 'full path of the file you are loading'
    rows : True - loads all rows; insert an int for specific number of rows
    test : float proportion of dataset used for testing
    """
    if rows is True:
        data = pd.read_table(input_train)
    else:
        data = pd.read_table(input_train, nrows = rows)
    response = data.is_blocked
    dummies = sparse.csc_matrix(pd.get_dummies(data.subcategory))
    words = np.array(data.description,str)
    del data
    vect = text.CountVectorizer(decode_error = u'ignore',strip_accents='unicode',ngram_range=(1,2))
    counts = vect.fit_transform(words)
    features = sparse.hstack((dummies,counts))
    features_train, features_test, target_train, target_test = train_test_split(features, response, test_size = test)
    clf = RidgeClassifier()
    clf.fit(features_train, target_train)
    prediction = clf.predict(features_test)
    return classification_report(target_test, prediction), average_precision_score(target_test, prediction), roc_auc_score(target_test, prediction)
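
Hedged variant: ROC AUC and average precision are better computed from continuous scores than from hard 0/1 predictions, and RidgeClassifier exposes decision_function for exactly that:

scores = clf.decision_function(features_test)
print(roc_auc_score(target_test, scores))
print(average_precision_score(target_test, scores))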
Code Example #15
def country_based_model(df, input_df, model_evaluator):
    input_df['-3'] = input_df['name'].str[-3]
    input_df['-2'] = input_df['name'].str[-2]
    input_df['-1'] = input_df['name'].str[-1]

    columns = ['-3', '-2', '-1']
    vectorized_name = [pd.get_dummies(input_df[i]) for i in columns]
    input_df = pd.concat(
        [vectorized_name[0], vectorized_name[1], vectorized_name[2], input_df],
        axis=1)

    cY = input_df['gender'].head(39469)
    input_df = input_df.drop(columns=['name', 'gender', '-3', '-2', '-1'])

    cX = input_df.head(39469)
    cX_train, cX_test, cy_train, cy_test = train_test_split(cX,
                                                            cY,
                                                            test_size=0.2,
                                                            random_state=42)

    model = RidgeClassifier(fit_intercept=False, solver='lsqr')
    model.fit(cX_train, cy_train)

    training_predictions = model.predict(input_df.head(39469))
    model_evaluator['MODEL_PREDICTION'] = training_predictions
    country_model_predictions = model.predict(input_df.tail(1000))
    df['COUNTRY_MODEL'] = country_model_predictions

    return df, model_evaluator
Code Example #16
File: offline_updater.py Project: dylanrhodes/sigma
def retrain_models(username):
	train_x, train_y, body_x, body_y, head_x, head_y = model_retriever.retrieve_data_db(username)

	b_train_x = []
	b_train_y = numpy.concatenate([body_y, train_y])

	for msg in (body_x + train_x):
		b_train_x.append(extract_body_features(msg))

	body_vec = TfidfVectorizer(norm="l2")
	b_train_x = body_vec.fit_transform(b_train_x)

	h_train_x = []
	h_train_y = numpy.concatenate([head_y, train_y])

	for msg in (head_x + train_x):
		h_train_x.append(extract_header_features(msg))

	head_vec = DictVectorizer()
	h_train_x = head_vec.fit_transform(h_train_x)

	body_model = LinearSVC(loss='squared_hinge', penalty="l2", dual=False, tol=1e-3)
	head_model = RidgeClassifier(tol=1e-2, solver="lsqr")

	body_model.fit(b_train_x, b_train_y)
	head_model.fit(h_train_x, h_train_y)

        print("Finished training models for "+username+"...")

	store_models(username, body_vec, body_model, head_vec, head_model)
Code Example #17
def text_classify_influence_by_max_features():
    """
    max_features对文本分类的影响
    """
    train_df = pd.read_csv('./data/train_set.csv', sep='\t', nrows=15000)
    sample = train_df[0:5000]
    n = int(2 * len(sample) / 3)
    f1 = []
    features = [1000, 2000, 3000, 4000]
    for i in range(4):
        tfidf = TfidfVectorizer(ngram_range=(2, 3), max_features=features[i])
        train_test = tfidf.fit_transform(sample['text'])
        train_x = train_test[:n]
        train_y = sample['label'].values[:n]
        test_x = train_test[n:]
        test_y = sample['label'].values[n:]
        clf = RidgeClassifier(alpha=0.1 * (i + 1), solver='sag')
        clf.fit(train_x, train_y)
        val_pred = clf.predict(test_x)
        f1.append(f1_score(test_y, val_pred, average='macro'))

    plt.plot(features, f1)
    plt.xlabel('max_features')
    plt.ylabel('f1_score')
    plt.show()
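
The alpha and max_features sweeps in these snippets can also be run jointly; a sketch using a Pipeline with GridSearchCV (macro-F1 as the criterion; the sample frame and fold count are assumptions):

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(2, 3))),
                 ('clf', RidgeClassifier(solver='sag'))])
grid = {'tfidf__max_features': [1000, 2000, 3000, 4000],
        'clf__alpha': [0.15 * (i + 1) for i in range(10)]}
search = GridSearchCV(pipe, grid, scoring='f1_macro', cv=3)
search.fit(sample['text'], sample['label'].values)
print(search.best_params_, search.best_score_)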
Code Example #18
def rigid(X_train, X_test, y_train):
    # Fitting RidgeClassifier to the Training set
    from sklearn.linear_model import RidgeClassifier
    classifier = RidgeClassifier(alpha=4, class_weight='balanced')
    classifier.fit(X_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    return y_pred
Code Example #19
def fit_ridge(l2_reg, train_random_features, y_train, test_random_features, y_test):
    clf = RidgeClassifier(alpha=l2_reg)
    clf.fit(train_random_features, y_train.ravel())
    train_accuracy = clf.score(train_random_features, y_train)
    test_accuracy = clf.score(test_random_features, y_test)
    print("Train accuracy:", train_accuracy)
    print("Test accuracy:", test_accuracy)
    return train_accuracy, test_accuracy
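
A usage sketch feeding fit_ridge with random Fourier features from scikit-learn's RBFSampler (the dataset, gamma, and feature count are illustrative assumptions):

from sklearn.datasets import load_digits
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
rff = RBFSampler(gamma=0.01, n_components=500, random_state=0)
fit_ridge(1.0, rff.fit_transform(X_tr), y_tr, rff.transform(X_te), y_te)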
Code Example #20
def Eval(XTrain, YTrain, XTest, YTest, clf, return_predicted_labels=False):
	"""
		Inputs:
			XTrain - N by D matrix of training data vectors
			YTrain - N by 1 matrix of training class labels
			XTest - M by D matrix of testing data vectors
			YTest - M by 1 matrix of testing class labels
			clf - the classifier:
				either a string naming one ("ridge", "perceptron",
				"passive aggressive", "linsvm", "svm", or "sgd")
				or a sklearn classifier instance
					with the methods .fit and .predict
		Outputs:
			A tuple containing (in the following order):
				Accuracy
				Overall Precision
				Overall Recall
				Overall F1 score
				Avg. Precision per class
				Avg. Recall per class
				F1 Score
				Precision per class
				Recall per class
				F1 Score per class
				(if return_predicted_labels)
					predicted class labels for each row in XTest
	"""

	if type(clf) == str:
		if 'ridge' in clf.lower():
			clf = RidgeClassifier(tol=1e-2, solver="lsqr")
		elif "perceptron" in clf.lower():
			clf = Perceptron(max_iter=50)
		elif "passive aggressive" in clf.lower() or 'passive-aggressive' in clf.lower():
			clf = PassiveAggressiveClassifier(max_iter=50)
		elif 'linsvm' in clf.lower() or 'linearsvm' in clf.lower() or 'linearsvc' in clf.lower():
			clf = LinearSVC()
		elif 'svm' in clf.lower() or 'svc' in clf.lower():
			clf = SVC()
		elif 'sgd' in clf.lower():
			clf = SGDClassifier()
   
	clf.fit(XTrain, YTrain)
	YPred = clf.predict(XTest)


	accuracy = sklearn.metrics.accuracy_score(YTest, YPred)
	(overall_precision, overall_recall, overall_f1, support) = sklearn.metrics.precision_recall_fscore_support(YTest, YPred, average='micro')
	(precision_per_class, recall_per_class, f1_per_class, support_per_class) = sklearn.metrics.precision_recall_fscore_support(YTest, YPred)
	avg_precision_per_class = np.mean(precision_per_class)
	avg_recall_per_class = np.mean(recall_per_class)
	avg_f1_per_class = np.mean(f1_per_class)

	del clf

	if return_predicted_labels:
		return (accuracy, overall_precision, overall_recall, overall_f1, avg_precision_per_class, avg_recall_per_class, avg_f1_per_class, precision_per_class, recall_per_class, f1_per_class, YPred)
	else:
		return (accuracy, overall_precision, overall_recall, overall_f1, avg_precision_per_class, avg_recall_per_class, avg_f1_per_class, precision_per_class, recall_per_class, f1_per_class)
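
A usage sketch of the string dispatch (dataset and split are illustrative; the snippet assumes numpy and sklearn.metrics are imported at module level):

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
XTrain, XTest, YTrain, YTest = train_test_split(X, y, random_state=0)
results = Eval(XTrain, YTrain, XTest, YTest, "ridge")
print("accuracy: %.3f" % results[0])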
Code Example #21
def RFF_Form1_Classification(WN, b,TrainData, ValData, TestData, train_label, val_label, test_label, option=1):
    import numpy as np 
    import time
    import copy
    import sys
    from time import perf_counter as clock  # time.clock was removed in Python 3.8
    
    D = np.shape(WN)[1]
    b=np.zeros((D,1))
    bn=copy.copy(b)
    PRFSGDAccuracy=np.zeros((D,1))    
    PRFRidgeAccuracy=np.zeros((D,1))
    #interval = np.arange(0,D,10)
    interval = [D]
    
    for k in interval:
        if (k==0):
            k=k+1
        W=WN[:,range(k)]
        b=bn[range(k)]
        RFTrainData= FeaturemapTransformation_Form1(W,TrainData)           
        RFTestData= FeaturemapTransformation_Form1(W,TestData)
        
        ## Pseudo RF SGD
        from minibatchSGDCV import minibatchRFSGDCV
        from minibatchSGD import RFminibatchSGD
        st_time = clock()
        ncv=3
        RFcvparam, RFbestbatchparam, RFmeanscore = minibatchRFSGDCV(ValData,val_label,ncv,W,b,
                                                                    option=1, RFoption=1)
        end_time = clock()
        print('PRFSGD Cross Validation Completed')
        print('Time required for PRFSGD CV is =', end_time-st_time)
        PRFclf = RFminibatchSGD(TrainData,train_label,W,b,option=1,batchsize=RFcvparam['batchsize'], 
                                           alpha=RFcvparam['alpha'], eta0=RFcvparam['eta0'], RFoption=1)
        PRFSGDClassifiedlabel=PRFclf.predict(RFTestData)
        #SCDconfMat=confusion_matrix(test_label,SGDClassifiedlabel)
        PRFSGDAccuracy[k-1]=sum(test_label==PRFSGDClassifiedlabel)/(float(len(test_label)))
#        print('RFSGD Completed')    
#        print("The classification accuracy with PsudeoRFSGD =",PRFSGDAccuracy[k-1])      
         
        
        ## Ridge regression Pseudo RF
        from sklearn.linear_model import RidgeClassifier
        from sklearn.metrics import confusion_matrix
        clf = RidgeClassifier(alpha=0.1)
        clf.fit(RFTrainData, train_label)
        RFRidgeClassifiedlabel = clf.predict(RFTestData)
        RFRidgeConfMat=confusion_matrix(test_label,RFRidgeClassifiedlabel)
        PRFRidgeAccuracy[k-1]=sum(test_label==RFRidgeClassifiedlabel)/(float(len(test_label)))
#        print("The classification accuracy with PsudeoRFRidge =",PRFRidgeAccuracy[k-1])
#        print("The feature expansion",k, "is over")
#        print('+++++++++++++++++++++++++++++++++++++++')
        
    ind = PRFSGDAccuracy>0
    PRFSGDAccuracy = PRFSGDAccuracy[ind]
    PRFRidgeAccuracy = PRFRidgeAccuracy[ind]    
    return PRFSGDAccuracy, PRFRidgeAccuracy
Code Example #22
def do_rc(X_test, X_train, Y_train):
    # fit an L2-regularized ridge classifier
    clf = RidgeClassifier()
    print("starts fitting")
    clf.fit(X_train, Y_train)
    print("finished fitting, starts predictions")
    Y_pred = clf.predict(X_test)
    print("finished predictions")
    return Y_pred
Code Example #23
 def train_ridge(self, x, y, alpha=0.0001):
     """
     Trains a Ridge classifier on the sampled data and classifier predictions considering only
     the chosen_attributes for now, for simplicity
     """
     # TODO: Automate Parameters
     linear_clf = RidgeClassifier(alpha=alpha)
     linear_clf.fit(x, y)
     self.surrogate = linear_clf
Code Example #24
class RidgeModel(ccobra.CCobraModel):
    def __init__(self, name='Ridge', k=1):
        super(RidgeModel, self).__init__(name, ["moral"], ["single-choice"])

        self.clf = RidgeClassifier(alpha=7)

        self.n_epochs = 1

    def pre_train(self, dataset):

        x = []
        y = []

        for subj_train_data in dataset:
            for seq_train_data in subj_train_data:

                seq_train_data['task'] = seq_train_data['item'].task
                inp = create_input(seq_train_data)

                target = float(output_mppng[seq_train_data['response'][0][0]])

                x.append(inp)

                y.append(target)
        x = np.array(x)
        y = np.array(y)

        self.train_x = x
        self.train_y = y

        self.train_network(self.train_x,
                           self.train_y,
                           self.n_epochs,
                           verbose=True)

    def train_network(self, train_x, train_y, n_epochs, verbose=False):
        print('Starting training...')
        for epoch in range(self.n_epochs):

            # Shuffle the training data
            perm_idxs = np.random.permutation(np.arange(len(train_x)))
            train_x = train_x[perm_idxs]
            train_y = train_y[perm_idxs]

            self.clf.fit(train_x, train_y)

            print('Mean accuracy:')
            print(self.clf.score(train_x, train_y))

    def predict(self, item, **kwargs):
        input = {'task': item.task}
        input['aux'] = kwargs
        x = np.array(create_input(input)).reshape(1, -1)
        output = self.clf.predict(x)

        self.prediction = output_mppngREV[output[0]]
        return self.prediction
Code Example #25
File: task3.py Project: MiniBee/HongNLP
def classifier(df, vectorizer):
    train_text = vectorizer.transform(df['text'])
    train_y = df['label'].values
    model = RidgeClassifier()
    logging.info('training ... ')
    model.fit(train_text, train_y)
    logging.info('predicting ... ')
    pred_y = model.predict(train_text)
    score(train_y, pred_y)
Code Example #26
def fit():
    """A function that trains the classifiers and ensembles them"""
    # the features and the labels
    VHSE = pd.read_excel("sequences.xlsx", index_col=0, sheet_name="VHSE")
    Y = VHSE["label1"].copy()
    X_svc = pd.read_excel("esterase_binary.xlsx",
                          index_col=0,
                          sheet_name="ch2_20")
    X_knn = pd.read_excel("esterase_binary.xlsx",
                          index_col=0,
                          sheet_name="random_30")

    if X_svc.isnull().values.any():
        X_svc.dropna(axis=1, inplace=True)
        X_svc.drop(["go"], axis=1, inplace=True)

    if X_knn.isnull().values.any():
        X_knn.dropna(axis=1, inplace=True)
        X_knn.drop(["go"], axis=1, inplace=True)

    # named_tuples
    models = namedtuple("models", ["svc", "ridge", "knn"])
    test = namedtuple(
        "test_samples",
        ["x_svc", "x_knn", "y_svc", "y_knn", "x_test_svc", "x_test_knn"])
    train = namedtuple(
        "train_samples",
        ["svc_x", "knn_x", "svc_y", "knn_y", "x_train_svc", "x_train_knn"])

    # split and train
    transformed_x_svc, test_x_svc, Y_train_svc, Y_test_svc, X_train_svc, X_test_svc = split_transform(
        X_svc, Y)
    transformed_x_knn, test_x_knn, Y_train_knn, Y_test_knn, X_train_knn, X_test_knn = split_transform(
        X_knn, Y)

    # the 3 algorithms
    svc = SVC(C=0.31, kernel="rbf", gamma=0.91)
    knn = KNN(n_neighbors=7, p=5, metric="minkowski", n_jobs=-1)
    ridge = RIDGE(alpha=8, random_state=0)

    # fit the 3 algorithms
    svc.fit(transformed_x_svc, Y_train_svc)
    ridge.fit(transformed_x_svc, Y_train_svc)
    knn.fit(transformed_x_knn, Y_train_knn)

    # save in namedtuples
    fitted_models = models(*[svc, ridge, knn])
    test_sample = test(*[
        test_x_svc, test_x_knn, Y_test_svc, Y_test_knn, X_test_svc, X_test_knn
    ])
    train_sample = train(*[
        transformed_x_svc, transformed_x_knn, Y_train_svc, Y_train_knn,
        X_train_svc, X_train_knn
    ])

    return fitted_models, test_sample, train_sample
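
The docstring mentions ensembling, which fit() itself does not perform; a hedged sketch of hard majority voting over the three fitted models (a hypothetical helper, not part of the original code; RidgeClassifier has no predict_proba, so soft voting is not directly available):

import numpy as np
from scipy import stats

def ensemble_predict(fitted_models, x_svc, x_knn):
    # stack per-model label predictions and take the per-sample mode
    preds = np.vstack([fitted_models.svc.predict(x_svc),
                       fitted_models.ridge.predict(x_svc),
                       fitted_models.knn.predict(x_knn)])
    return stats.mode(preds, axis=0, keepdims=False).mode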
Code Example #27
File: linearModel.py Project: noosc/exp-code
def RidgeReg(file1, file2):
    feature1, label1 = file2matrix(file1)
    clf = RidgeClassifier()
    clf.fit(feature1, label1)

    feature2, label2 = file2matrix(file2)
    y_true = label2
    y_score = clf.decision_function(feature2)
    y_pred = clf.predict(feature2)
    return y_true, y_score, y_pred
Code Example #28
 def scikit_ridgeregression(self, dataset, labels):
     from sklearn.linear_model import RidgeClassifier
     lr = RidgeClassifier(fit_intercept=False, max_iter=100, random_state=0)
     lr.fit(dataset, labels)
     testset, truelabels = self.load_dataset(self.testdata)
     prob = lr.predict(testset)
     ans = prob * truelabels
     err_rate = float(np.sum(ans == -1)) / ans.shape[0]
     print("Scikit Learn RR Test Error Rate: {:.2f}".format(
         float(err_rate)))
Code Example #29
def main():
    """
    RidgeRegression classifier.
    """
    dct = True
    X_train, X_test, y_train, y_test = prepare_datasets(dct)
    model = RidgeClassifier(alpha=1.0)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    bal_acc = balanced_accuracy_score(y_test, pred)
    print(f"Balanced accuracy score: {bal_acc:g}")
Code Example #30
def scikit_ridgec_test(size):
    X, y = datasets.make_classification(n_samples=1000,
                                        n_features=size,
                                        n_informative=2,
                                        n_redundant=2)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42)
    model = RidgeClassifier(random_state=42)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)
Code Example #31
def confusion_matrix_for_problem(X, y, model=None):
    train_X, test_X, train_y, test_y = train_test_split(X,
                                                        y,
                                                        random_state=0,
                                                        stratify=y)

    if model is None:
        model = RidgeClassifier()
    model.fit(train_X, train_y)
    prediction_y = model.predict(test_X)

    return confusion_matrix(test_y, prediction_y)
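
A usage sketch on a built-in dataset (illustrative; train_test_split and confusion_matrix are assumed imported as in the snippet):

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
print(confusion_matrix_for_problem(X, y))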
Code Example #32
 def train_explainer(self):
     """
     Trains a Ridge classifier on the sampled data considering only
     the chosen_attributes for now, for simplicity
     """
     X = self.sample_set[:, self.chosen_attributes]
     y = self.predictions
     # TODO: Automate Parameters
     clf = RidgeClassifier(alpha=0.1)
     clf.fit(X, y)
     self.explainer = clf
     return self.explainer
Code Example #33
File: first.py Project: Rookie1019/data_share
def ridge_regression():
    train_df = pd.read_csv(r'C:\Users\Rookie\Desktop\nlp\train_set.csv',
                           sep='\t')

    vectorizer = CountVectorizer(max_features=3000)
    train_test = vectorizer.fit_transform(train_df['text'])

    clf = RidgeClassifier()
    clf.fit(train_test[:10000], train_df['label'].values[:10000])

    val_pred = clf.predict(train_test[10000:])
    print(f1_score(train_df['label'].values[10000:], val_pred,
                   average='macro'))
Code Example #34
def test_raise_without_labels():
    y = np.random.randint(0, 10, 100).astype(str)
    X = np.random.randn(100, 4)
    ridge_classifier = RidgeClassifier()
    ridge_classifier.fit(X, y)
    logistic_regression = LogisticRegression()
    logistic_regression.fit(X, y)
    pvc = PrefittedVotingClassifier(estimators = [
        ridge_classifier,
        logistic_regression
    ])
    with pytest.raises(ValueError):
        pvc.predict(X)
Code Example #35
def test_string_classification():
    y = np.random.randint(0, 10, 100)
    X = np.random.randn(100, 4)
    ridge_classifier = RidgeClassifier()
    ridge_classifier.fit(X, y)
    logistic_regression = LogisticRegression()
    logistic_regression.fit(X, y)
    pvc = PrefittedVotingClassifier(estimators = [
        ridge_classifier,
        logistic_regression
    ], labels = np.arange(0, 10).astype(str))
    pred = pvc.predict(X)
    assert pred.dtype.type is np.str_
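
PrefittedVotingClassifier is project-specific and not shown; a minimal hard-voting sketch consistent with the two tests above (an assumption about the real class, which may well differ):

import numpy as np

class PrefittedVotingClassifier:
    def __init__(self, estimators, labels=None):
        self.estimators = estimators  # already fitted
        self.labels = labels          # optional output label names

    def predict(self, X):
        preds = np.asarray([est.predict(X) for est in self.estimators])
        if not np.issubdtype(preds.dtype, np.number):
            # estimators fitted on string labels cannot be voted on directly
            raise ValueError("non-numeric predictions require numeric classes")
        votes = np.apply_along_axis(
            lambda col: np.bincount(col.astype(int)).argmax(), 0, preds)
        return votes if self.labels is None else np.asarray(self.labels)[votes]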
Code Example #36
def get_optimal_blend_weigth(exp_, best_param_,
                             folder, fname, model_fname):
    clf = RidgeClassifier()
    X_test, y_test = exp_.get_test_data()
    clf.set_params(**best_param_)
    clf.fit(X_test, y_test)

    # dump2csv optimal linear weight
    names = np.append(np.array(['intercept'], dtype='S100'), X_test.columns.values)
    coefs = np.append(clf.intercept_, clf.coef_).astype(np.float64)
    optimal_linear_weight = pd.DataFrame(coefs.reshape(1,len(coefs)), columns=names)
    optimal_linear_weight.to_csv(os.path.join(Config.get_string('data.path'),
                                              folder,
                                              fname), index=False)

    # dump2cpkle for ridge model
    model_fname = os.path.join(Config.get_string('data.path'), folder, model_fname)
    with gzip.open(model_fname, 'wb') as gf:
        cPickle.dump(clf, gf, cPickle.HIGHEST_PROTOCOL)
    
    return True
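
A counterpart sketch for loading the dumped model back (assuming the same gzip + pickle convention; cPickle is Python 2's pickle):

import gzip
import pickle

with gzip.open(model_fname, 'rb') as gf:
    clf = pickle.load(gf)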
Code Example #37
File: test_3.py Project: albingrace/QianWan
def Predict():
    print('\nThere are %d new deals' % n_test)

    # Using the KNN classifier
    clf_KNN = KNeighborsClassifier(n_neighbors=3) # KNN does not work even if k has been tuned
    #clf_KNN = KNeighborsClassifier(n_neighbors=7)
    #clf_KNN = KNeighborsClassifier(n_neighbors=11)
    clf_KNN.fit(Corpus_train, Y_train)
    Y_pred_KNN = clf_KNN.predict(Corpus_test)
    print_rate(Y_test, Y_pred_KNN, n_test, 'KNNClassifier')
    
    # Using the SVM classifier
    clf_SVM = svm.SVC()
    clf_SVM.fit(Corpus_train, Y_train)
    Y_pred_SVM = clf_SVM.predict(Corpus_test)
    print_rate(Y_test, Y_pred_SVM, n_test, 'SVMClassifier')
    
    # Using the Ridge classifier
    clf_RC = RidgeClassifier(tol=0.01, solver="lsqr")
    #clf_RC = RidgeClassifier(tol=0.1, solver="lsqr")
    clf_RC.fit(Corpus_train, Y_train)
    Y_pred_RC = clf_RC.predict(Corpus_test)
    print_rate(Y_test, Y_pred_RC, n_test, 'RidgeClassifier')
    
    # won't consider Random Forests or Decision Trees because they work poorly in high sparse dimensions
    
    
    # Using the Multinomial Naive Bayes classifier
    # I expect that this MNB classifier will do the best since it is designed for occurrence counts features
    #clf_MNB = MultinomialNB(alpha=0.01) #smoothing parameter = 0.01 is worse than 0.1
    clf_MNB = MultinomialNB(alpha=0.1)
    #clf_MNB = MultinomialNB(alpha=0.3) #a big smoothing rate does not benefit the model
    #clf_MNB = MultinomialNB(alpha=0.2) #or alpha = 0.05 can generate the best outcome
    clf_MNB.fit(Corpus_train, Y_train)
    Y_pred_MNB = clf_MNB.predict(Corpus_test)
    print_rate(Y_test, Y_pred_MNB, n_test, 'MultinomialNBClassifier')
Code Example #38
    def test_default_configuration_classify(self):
        for i in range(2):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=False)
            configuration_space = ExtraTreesPreprocessor.get_hyperparameter_search_space()
            default = configuration_space.get_default_configuration()
            preprocessor = ExtraTreesPreprocessor(random_state=1,
                                                  **{hp_name: default[hp_name]
                                                     for hp_name in default})
            preprocessor.fit(X_train, Y_train)
            X_train_trans = preprocessor.transform(X_train)
            X_test_trans = preprocessor.transform(X_test)

            # fit a classifier on top
            classifier = RidgeClassifier()
            predictor = classifier.fit(X_train_trans, Y_train)
            predictions = predictor.predict(X_test_trans)
            accuracy = sklearn.metrics.accuracy_score(predictions, Y_test)
            self.assertAlmostEqual(accuracy, 0.87310261080752882, places=2)
Code Example #39
    def test_default_configuration_classify(self):
        for i in range(5):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=False)
            configuration_space = KernelPCA.get_hyperparameter_search_space()
            default = configuration_space.get_default_configuration()
            preprocessor = KernelPCA(random_state=1,
                                     **{hp_name: default[hp_name] for hp_name in
                                        default if default[hp_name] is not None})
            preprocessor.fit(X_train, Y_train)
            X_train_trans = preprocessor.transform(X_train)
            X_test_trans = preprocessor.transform(X_test)

            # fit a classifier on top
            classifier = RidgeClassifier()
            predictor = classifier.fit(X_train_trans, Y_train)
            predictions = predictor.predict(X_test_trans)
            accuracy = sklearn.metrics.accuracy_score(predictions, Y_test)
            self.assertAlmostEqual(accuracy, 0.096539162112932606)
Code Example #40
    def test_default_configuration_classify(self):
        for i in range(2):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=True)
            configuration_space = TruncatedSVD.get_hyperparameter_search_space()
            default = configuration_space.get_default_configuration()
            preprocessor = TruncatedSVD(random_state=1,
                                        **{hp_name: default[hp_name]
                                           for hp_name in default
                                           if default[hp_name] is not None})
            preprocessor.fit(X_train, Y_train)
            X_train_trans = preprocessor.transform(X_train)
            X_test_trans = preprocessor.transform(X_test)

            # fit a classifier on top
            classifier = RidgeClassifier()
            predictor = classifier.fit(X_train_trans, Y_train)
            predictions = predictor.predict(X_test_trans)
            accuracy = sklearn.metrics.accuracy_score(predictions, Y_test)
            self.assertAlmostEqual(accuracy, 0.44201578627808136, places=2)
Code Example #41
z = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=float)
# z = np.zeros( (n_samples, n_categories) , dtype=float)

# Test for 10 rounds using the results from 10 fold cross validations
for i, (train_index, test_index) in enumerate(kf):

    print "run %d" % (i+1)

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    X_den_train, X_den_test = X_den[train_index], X_den[test_index]

    # feed models
    clf_mNB.fit(X_train, y_train)
    clf_ridge.fit(X_train, y_train)
    clf_SGD.fit(X_train, y_train)
    clf_lSVC.fit(X_train, y_train)
    clf_SVC.fit(X_train, y_train)

    # get prediction for this fold run
    prob_mNB    = clf_mNB.predict_proba(X_test)
    prob_ridge  = clf_ridge.decision_function(X_test)
    prob_SGD    = clf_SGD.decision_function(X_test)
    prob_lSVC   = clf_lSVC.decision_function(X_test)
    prob_SVC    = clf_SVC.predict_proba(X_test)

    # add prob functions into the z 2d-array
    z_temp = (prob_mNB + prob_ridge + prob_SGD + prob_lSVC + prob_SVC)
    z = np.append(z, z_temp, axis=0)
Code Example #42
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('svm', clf3),
                                        ('rc', clf4), ('ab', clf5)],
                            voting='hard')

    for clf, label in zip([clf1, clf2, clf3, clf4, clf5, eclf],
                          ['Logistic Regression', 'Random Forest', 'SVM',
                           'Ridge Classifier', 'Ada boost', 'Ensemble']):
        scores = cross_val_score(clf, X.toarray(), y, cv=5, scoring='f1_macro')
        scores_dict[label].append(scores.mean())
        print("f1_macro: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

    X, y = dataset.get_resampled_train_X_y(kind='regular')
    clf1.fit(X.toarray(), y)
    clf2.fit(X.toarray(), y)
    clf3.fit(X.toarray(), y)
    clf4.fit(X.toarray(), y)
    clf5.fit(X.toarray(), y)
    eclf.fit(X.toarray(), y)

    # X_test = dataset.get_test_x()
    # y_test = dataset.get_test_y()

    # for clf, label in zip([clf1, clf2, clf3, clf4, clf5, eclf],
    #                       ['Logistic Regression', 'Random Forest',
    #                        'SVM', 'Ridge Classifier', 'Ada boost', 'Ensemble']):
    #     score = f1_score(y_test, clf.predict(X_test.toarray()))
    #     print("f1_macro_test: %0.2f [%s]" % (score, label))
    #     print(precision_score(y_test, clf.predict(X_test.toarray())))
    #     print(recall_score(y_test, clf.predict(X_test.toarray())))

    # predicted = clf.predict(X_test.toarray())
Code Example #43
File: ridge.py Project: log0/digit_recognizer
    data = [i for i in csv.reader(open(train_file, 'r'))]
    data = data[1:]  # remove header
    random.shuffle(data)

    X = np.array([ i[1:] for i in data ]).astype(float)
    Y = np.array([ i[0] for i in data ]).astype(int)

    train_cutoff = len(data) * 3 // 4  # integer index for slicing

    X_train = X[:train_cutoff]
    Y_train = Y[:train_cutoff]
    X_test = X[train_cutoff:]
    Y_test = Y[train_cutoff:]

    classifier = RidgeClassifier(normalize=True, alpha=1)
    classifier = classifier.fit(X_train, Y_train)

    print('Training accuracy : %s' % classifier.score(X_train, Y_train))

    Y_predict = classifier.predict(X_test)

    equal = 0
    for i in range(len(Y_predict)):
        if Y_predict[i] == Y_test[i]:
            equal += 1

    print('Accuracy = %s' % (float(equal) / len(Y_predict)))


Code Example #44
def get_ridge_plot(best_param_, experiment_, 
                   param_keys_, param_vals_,
                   png_folder,
                   png_fname,
                   score_threshold=0.8):

    parameters = dict(zip(param_keys_, param_vals_))
    del parameters['model_type']

    clf = RidgeClassifier()
    X_train, y_train = experiment_.get_train_data()
    clf.set_params(**best_param_)
    clf.fit(X_train, y_train)    
    best_alpha = best_param_['alpha']
    result = {'alphas':[],
              'coefs':np.zeros( (len(parameters['alpha']), len(X_train.columns.values) + 1) ),
              'scores':[],
              'score':None}


    for i, alpha in enumerate(parameters.get('alpha',None)):
        result['alphas'].append(alpha)
        del best_param_['alpha']
        best_param_['alpha'] = alpha
        clf.set_params(**best_param_)
        clf.fit(X_train, y_train)

        # regularization path
        tmp = np.array([0 for j in range(len(X_train.columns.values) + 1)], dtype=np.float32)
        if best_param_['fit_intercept']:
            tmp = np.append(clf.intercept_, clf.coef_)
        else:
            tmp[1:] = clf.coef_  # no intercept fitted; leave tmp[0] at zero
        result['coefs'][i,:] = tmp
        result['scores'].append(experiment_.get_proba(clf, X_train))
    del X_train, y_train

    # 2. 
    tmp_len = len(experiment_.get_data_col_name())
    index2feature = dict(zip(np.arange(1, tmp_len + 1), 
                             experiment_.get_data_col_name()))
    if best_param_['fit_intercept']:
        index2feature[0] = 'intercept'

    # 3. plot
    gs = GridSpec(2,2)
    ax1 = plt.subplot(gs[:,0])
    ax2 = plt.subplot(gs[0,1])
    ax3 = plt.subplot(gs[1,1])


    # 3.1 feature importance
    labels = np.append(np.array(['intercept'], dtype='S100'), experiment_.get_data_col_name())
    nrows, ncols = result['coefs'].shape
    for ncol in range(ncols):
        ax1.plot(np.array(result['alphas']), result['coefs'][:,ncol], label = labels[ncol])
    ax1.legend(loc='best')
    ax1.set_xscale('log')
    ax1.set_title("Regularization Path:%1.3e" % (best_alpha))
    ax1.set_xlabel("alpha", fontsize=10)

    # 3.2 PDF
    X_test, y_test = experiment_.get_test_data()
    result['score'] = clf.decision_function(X_test)
    sns.distplot(result['score'], kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF : Decision_Function")


    # 3.3 CDF
    num_bins = 100
    try:
        counts, bin_edges = np.histogram(result['score'], bins=num_bins, density=True)
    except Exception:
        counts, bin_edges = np.histogram(result['score'], density=True)

    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Decision_Function:Confidence_Score", fontsize=10)


    png_fname = os.path.join(Config.get_string('data.path'), png_folder, png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()

    return True
Code Example #45
X_test = X_test_summary + X_test_title + X_test_author
duration = time() - t0
print("n_samples: %d, n_features: %d" % X_test.shape)
print("Done in %fs" % (duration))

def writeToDisk(predn,clfname):
    target="./"+clfname+".txt"
    target=open(target,'w')
    target.write("{}\t{}\n".format("record_id", "topic"))
    for x in zip(testID, predn):
        target.write("{}\t{}\n".format(x[0], x[1]))
    target.close()
    print(clfname," output written to disk.")

clf1=RidgeClassifier(tol=1e-2, solver="lsqr")   #Ridge Classifier
clf1.fit(X_train, y_train)
pred = clf1.predict(X_test)
writeToDisk(pred,"RidgeClassifier")

clf2=MultinomialNB(alpha=.01)                   #Naive Bayes classifier
clf2.fit(X_train, y_train)
pred = clf2.predict(X_test)
writeToDisk(pred,"MultinomialNB")

clf3=BernoulliNB(alpha=.01)                     #Naive Bayes(Bernoulli) classifier
clf3.fit(X_train, y_train)
pred = clf3.predict(X_test)
writeToDisk(pred,"BernoulliNB")

clf4=KNeighborsClassifier(n_neighbors=10)       #KNeighbors Classifier
clf4.fit(X_train, y_train)
Code Example #46
File: task3_final.py Project: albingrace/QianWan
#clf_KNN = KNeighborsClassifier(n_neighbors=7)
#clf_KNN = KNeighborsClassifier(n_neighbors=11)
clf_KNN.fit(Corpus_train, Y_train)
Y_pred_KNN = clf_KNN.predict(Corpus_test)
print_rate(Y_test, Y_pred_KNN, n_test, 'KNNClassifier')

# Using the SVM classifier
clf_SVM = svm.SVC()
clf_SVM.fit(Corpus_train, Y_train)
Y_pred_SVM = clf_SVM.predict(Corpus_test)
print_rate(Y_test, Y_pred_SVM, n_test, 'SVMClassifier')

# Using the Ridge classifier
clf_RC = RidgeClassifier(tol=0.01, solver="lsqr")
#clf_RC = RidgeClassifier(tol=0.1, solver="lsqr")
clf_RC.fit(Corpus_train, Y_train)
Y_pred_RC = clf_RC.predict(Corpus_test)
print_rate(Y_test, Y_pred_RC, n_test, 'RidgeClassifier')

# won't consider Random Forests or Decision Trees because they work poorly in high sparse dimensions


# Using the Multinomial Naive Bayes classifier
# I expect that this MNB classifier will do the best since it is designed for occurrence counts features
#clf_MNB = MultinomialNB(alpha=0.01) #smoothing parameter = 0.01 is worse than 0.1
clf_MNB = MultinomialNB(alpha=0.1)
#clf_MNB = MultinomialNB(alpha=0.3) #a big smoothing rate does not benefit the model
#clf_MNB = MultinomialNB(alpha=0.2) #or alpha = 0.05 can generate the best outcome
clf_MNB.fit(Corpus_train, Y_train)
Y_pred_MNB = clf_MNB.predict(Corpus_test)
print_rate(Y_test, Y_pred_MNB, n_test, 'MultinomialNBClassifier')
Code Example #47
File: ridge.py Project: mb16/Kaggle
def main():

    startCol = 0
    endCol = 50  # max = 1775

    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train][1:3000]
    targetTest = [x[0] for x in train][3001:]
    trainTest = [x[startCol+1:endCol+1] for x in train][3001:]
    test = csv_io.read_data("../Data/test.csv")
    test = [x[startCol:endCol] for x in test]
	
    train = [x[startCol+1:endCol+1] for x in train][1:3000]	
	
    fo = open("knn_stats.txt", "a+")

    rf = RidgeClassifier(alpha=0.01, fit_intercept=True, normalize=False, copy_X=True, tol=0.001) 
	
    rf.fit(train, target)
    prob = rf.predict(trainTest) # changed from test


    result = 100
    probSum = 0
    for i in range(0, len(prob)):
        probX = prob[i]  # [1]
        if probX > 0.7:
            probX = 0.7
        if probX < 0.3:
            probX = 0.3
        print(i, probSum, probX, target[i])
        print(target[i] * log(probX), (1 - target[i]) * log(1 - probX))
        probSum += targetTest[i] * log(probX) + (1 - targetTest[i]) * log(1 - probX)
	
        #print probSum	
        #print len(prob)	
        #print "C: ", 10**C, " gamma: " ,2**g
        print(-probSum / len(prob))
	

	
    if ( -probSum/len(prob) < result ):
        result = -probSum/len(prob)
        predicted_probs = rf.predict(test)  # was test
        predicted_probs = ["%f" % x for x in predicted_probs]
        csv_io.write_delimited_file("../Submissions/knn.csv", predicted_probs)
        print "Generated Data!!"
		
    #fo.write(str(5) + str(5)+ str(5));
		
    fo.close()
		
    #csv_io.write_delimited_file("../Submissions/rf_benchmark_test2.csv", predicted_probs)

    #predicted_probs = rf.predict_proba(train) # changed from test
 
    #predicted_probs = ["%f" % x[1] for x in predicted_probs]
    #predicted_probs = rf.predict(train) # changed from test
    #predicted_probs = ["%f" % x for x in predicted_probs]	
	
    #csv_io.write_delimited_file("../Submissions/rf_benchmark_train2.csv", predicted_probs)
	
	
    var = input("Enter to terminate.")
Code Example #48
# initialize empty y and z

# Test for 10 rounds using the results from 10 fold cross validations
for i, (train_index, test_index) in enumerate(kf):

    print "run %d" % (i+1)

    X_train_train, X_train_test = X_train[train_index], X_train[test_index]
    y_train_train, y_train_test = y_train[train_index], y_train[test_index]

    # X_den_train, X_den_test = X_den[train_index], X_den[test_index]

    # feed models
    clf_mNB.fit(X_train_train, y_train_train)
    clf_kNN.fit(X_train_train, y_train_train)
    clf_ridge.fit(X_train_train, y_train_train)
    clf_lSVC.fit(X_train_train, y_train_train)
    clf_SVC.fit(X_train_train, y_train_train)

    # get prediction for this fold run
    pred_mNB    = clf_mNB.predict(X_train_test)
    pred_kNN    = clf_kNN.predict(X_train_test)
    pred_ridge  = clf_ridge.predict(X_train_test)
    pred_lSVC   = clf_lSVC.predict(X_train_test)
    pred_SVC    = clf_SVC.predict(X_train_test)

    # update z array for each model
    z_mNB   = np.append(z_mNB    , pred_mNB  , axis=None)
    z_kNN   = np.append(z_kNN    , pred_kNN  , axis=None)
    z_ridge = np.append(z_ridge  , pred_ridge, axis=None)
    z_lSVC  = np.append(z_lSVC   , pred_lSVC , axis=None)
Code Example #49
#!/usr/bin/env python
"""
Ridge regression for Avito
"""
__author__ = "deniederhut"
__license__ = "GPL"
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

data = pd.read_table('/Users/dillonniederhut/Desktop/avito_train.tsv',nrows=100000)
#replace with file path to your training data

features = pd.get_dummies(data.subcategory)
features_train, features_test, target_train, target_test =\
    train_test_split(features, data.is_blocked, test_size = 0.25)

ridge = RidgeClassifier()
ridge.fit(features_train, target_train)
prediction = np.round(ridge.predict(features_test))
print(classification_report(target_test, prediction))
print(average_precision_score(target_test, prediction))
print(roc_auc_score(target_test, prediction))
Code Example #50
train = lemons.append(non_lemons)

#X = train.drop(['RefId','IsBadBuy','VehYear','Make','Model','Trim','SubModel','WheelTypeID','BYRNO'],axis=1)
#y = pd.Series(train['IsBadBuy']).values

target = pd.Series(train['IsBadBuy']).values
data = train.drop(['RefId','IsBadBuy','VehYear','Make','Model','Trim','SubModel','WheelTypeID','BYRNO'],axis=1)

x_train, x_test, y_train, y_test = cross_validation.train_test_split(data,target, test_size=.3)



# Subset the data so we have a more even data set

model = RidgeClassifier()
clf = model.fit(data, target)  # fit on the full frame (the commented-out X, y above)
Ridg_Class = clf.predict(data)
clf.score(data, target)

metrics.confusion_matrix(target, clf.predict(data))
print(metrics.classification_report(target, clf.predict(data)))


# GradientBoostingClassifier

from sklearn.ensemble import *
model = GradientBoostingClassifier()

# Train
clf = model.fit(x_train, y_train)
Code Example #51
File: 05_multilabel.py Project: YuanhaoSun/PPLearn
clf_svc = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True)
clf_rdg = RidgeClassifier(tol=1e-1)
clf_sgd = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

# Logistic regression requires OneVsRestClassifier which hides
# its methods such as decision_function
# It will require extra implementation efforts to use it as a candidate
# for multilabel classification
# clf_lgr = OneVsRestClassifier(LogisticRegression(C=1000,penalty='l1'))
# kNN does not have decision function due to its nature
# clf_knn = KNeighborsClassifier(n_neighbors=13)

# train
clf_nb.fit(X, y)
clf_lsvc.fit(X, y)
clf_rdg.fit(X, y)
clf_svc.fit(X, y)
clf_sgd.fit(X, y)

print "Train time: %0.3fs" % (time() - t0)
print


# # predict by simply apply the classifier
# # this will not use the multi-label threshold
# predicted = clf_rdg.predict(X_new)
# for doc, category in zip(docs_new, predicted):
#     print '%r => %s' % (doc, data_train.target_names[int(category)])
#     print

Code Example #52
File: ridgecv.py Project: evamy/avazu-ctr
def dayhour(x):
    d = datetime.strptime(str(x), "%y%m%d%H")
    return [float(d.weekday()), float(d.hour)]

fh = FeatureHasher(n_features = 2**20, input_type="string")

# Train classifier
clf = RidgeClassifier()
train = pd.read_csv("train/subtrain.csv", chunksize = 100000, iterator = True)
all_classes = np.array([0, 1])
for chunk in train:
    y_train = chunk["click"]
    chunk = chunk[cols]
    chunk = chunk.join(pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"]))
    chunk.drop(["hour"], axis=1, inplace = True)
    Xcat = fh.transform(np.asarray(chunk.astype(str)))
    clf.fit(Xcat, y_train)  # fit() retrains from scratch; only the last chunk ends up in the model
    
# Create a submission file
usecols = cols + ["id"]
X_test = pd.read_csv("test/mtest.csv", usecols=usecols)
X_test = X_test.join(pd.DataFrame([dayhour(x) for x in X_test.hour], columns=["wd", "hr"]))
X_test.drop(["hour"], axis=1, inplace = True)

X_enc_test = fh.transform(np.asarray(X_test.astype(str)))

y_act = pd.read_csv("test/mtest.csv", usecols=['click'])
y_pred = clf.predict(X_enc_test)

with open('logloss.txt','a') as f:
    f.write('\n'+str(log_loss(y_act, y_pred)))
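
Because RidgeClassifier.fit retrains from scratch, the chunked loop above effectively keeps only the model from the last chunk; a sketch of genuinely incremental training with SGDClassifier.partial_fit (same per-chunk preprocessing assumed; the "log_loss" loss also provides predict_proba, which suits the log-loss evaluation better than hard predictions):

from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss="log_loss", random_state=0)
for chunk in pd.read_csv("train/subtrain.csv", chunksize=100000, iterator=True):
    y_train = chunk["click"]
    chunk = chunk[cols]
    chunk = chunk.join(pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"]))
    chunk.drop(["hour"], axis=1, inplace=True)
    Xcat = fh.transform(np.asarray(chunk.astype(str)))
    clf.partial_fit(Xcat, y_train, classes=all_classes)
y_prob = clf.predict_proba(X_enc_test)[:, 1]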
Code Example #53
File: simpleTestingv04.py Project: ilciavo/ml3
remove = ()

X_train = cityName;

print('Creating the vectorizer and chosing a transform (from raw text to feature)')
vect= TfidfVectorizer(sublinear_tf=True, max_df=0.5)
#vect=CountVectorizer(min_n=1,max_n=2,max_features=1000);

X_train = vect.fit_transform(X_train)


cityClass = RidgeClassifier(tol=1e-7)
countryClass = RidgeClassifier(tol=1e-7)

print('Creating a classifier for cities')
cityClass.fit(X_train,cityCode)
print('Creating a classifier for countries')
countryClass.fit(X_train,countryCode)

print('testing the performance');

testCityNames = vect.transform(cityNameTest);

predictionsCity = cityClass.predict(testCityNames)
predictionsCountry = countryClass.predict(testCityNames)

with open('predictions.csv','w') as csvfile:
        writer = csv.writer(csvfile)
        #for ind in range(0,len(predictionsCountry)):
        #        writer.writerow([str(predictionsCountry[ind]),str(predictionsCity[ind])])
        for predCountry, predCity in zip(predictionsCountry, predictionsCity):
                writer.writerow([str(predCountry), str(predCity)])
Code Example #54
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (LinearSVC(), "SVM")
        ):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf, name))   

# Attach classifier to the original json file

# loading dtm file for all twitts
fp = open('./python_files/twitter_dtm.pkl', 'rb')
dtm = pkl.load(fp)
fp.close()

# Predict the labels using Ridges classifier
clf = RidgeClassifier(alpha=1.,tol=1e-2, solver="lsqr")
clf.fit(X_train, y_train)
predicted_labels = clf.predict(dtm)

# loading json file for all twitts
file_name = '../R Project/Data/obamacare.json'
line_reader = open(file_name,'r') # r means for reading

# building a new json file for all twitts + new predicted labels
new_file_name = '../R Project/Data/obamacare_labeled.json'
line_writer = open(new_file_name,'w') # w means for writing

# adding the predicted label to each entry of json file
twit_i = 0
for line in line_reader:
    label = predicted_labels[twit_i]
    if label==0:
Code Example #55
def classify(granularity=10):
    trainDir = path.join(GEOTEXT_HOME, 'processed_data/' + str(granularity).strip() + '_clustered/')
    testDir = path.join(GEOTEXT_HOME, 'processed_data/test')
    data_train = load_files(trainDir, encoding=encoding)
    target = data_train.target
    data_test = load_files(testDir, encoding=encoding)

    categories = data_train.target_names
    
    def size_mb(docs):
        return sum(len(s.encode(encoding)) for s in docs) / 1e6
    
    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)
    
    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()
    
    # split a training set and a test set
    y_train = data_train.target
    y_test = data_test.target
    
    
    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    vectorizer = TfidfVectorizer(use_idf=True, norm='l2', binary=False, sublinear_tf=True, min_df=2, max_df=1.0, ngram_range=(1, 1), stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()
    
    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()
    chi = False
    if chi:
        k = 500000
        print("Extracting %d best features by a chi-squared test" % 0)
        t0 = time()
        ch2 = SelectKBest(chi2, k=k)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        
        print("done in %fs" % (time() - t0))
        print()
        
    feature_names = np.asarray(vectorizer.get_feature_names())
    # clf = LinearSVC(loss='l2', penalty='l2', dual=True, tol=1e-3)
    clf = RidgeClassifier(tol=1e-2, solver="auto")
    print('_' * 80)
    print("Training: ")
    print(clf)
    
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    scores = clf.decision_function(X_test)
    print(scores.shape)
    print(pred.shape)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    # score = metrics.f1_score(y_test, pred)
    # print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print("top 10 keywords per class:")
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print("%s: %s" % (category, " ".join(feature_names[top10])))

    
    sumMeanDistance = 0
    sumMedianDistance = 0
    distances = []
    confidences = []
    randomConfidences = []
    
    for i in range(0, len(pred)):
        user = path.basename(data_test.filenames[i])
        location = userLocation[user].split(',')
        lat = float(location[0])
        lon = float(location[1])
        prediction = categories[pred[i]]
        confidence = scores[i][pred[i]] - mean(scores[i])
        randomConfidence = scores[i][random.randint(0, len(categories) - 1)]
        confidences.append(confidence)
        randomConfidences.append(randomConfidence)
        medianlat = classLatMedian[prediction]  
        medianlon = classLonMedian[prediction]  
        meanlat = classLatMean[prediction] 
        meanlon = classLonMean[prediction]      
        distances.append(distance(lat, lon, medianlat, medianlon))
        sumMedianDistance = sumMedianDistance + distance(lat, lon, medianlat, medianlon)
        sumMeanDistance = sumMeanDistance + distance(lat, lon, meanlat, meanlon)
    averageMeanDistance = sumMeanDistance / float(len(pred))
    averageMedianDistance = sumMedianDistance / float(len(pred))
    print "Average mean distance is " + str(averageMeanDistance)
    print "Average median distance is " + str(averageMedianDistance)
    print "Median distance is " + str(median(distances))
    fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
    
    plt.xlim(0, 4000)
    plt.ylim(0, 2)
    ax1.scatter(distances, confidences)
    ax2.bar(distances, confidences)
    plt.savefig(path.join(GEOTEXT_HOME, 'confidence.png'))