def run(input_train, input_test, output_name):
    """
    Train a ridge classifier on the training file and write the test item IDs,
    ranked by predicted score, as a csv for Kaggle submission.

    Both input files are read with ``pandas.read_table``. The training file
    must provide the columns ``is_blocked``, ``subcategory`` and
    ``description``; the test file must provide ``itemid``, ``subcategory``
    and ``description``.

    The feature matrix is the one-hot encoded subcategory stacked next to
    unigram/bigram counts of the description. The vocabulary is fitted on the
    train and test descriptions together.
    -------
    input_train : 'full path of the training file'
    input_test : 'full path of the testing file'
    output_name : 'full path of the output file'

    The output file holds an ``id`` header line followed by one item ID per
    line, highest score first.
    """

    data = pd.read_table(input_train)
    test = pd.read_table(input_test)
    testItemIds = test.itemid
    response = data.is_blocked
    dummies = sparse.csc_matrix(pd.get_dummies(data.subcategory))
    pretestdummies = pd.get_dummies(test.subcategory)
    # NOTE(review): these two subcategories are dropped by name, presumably
    # because they occur only in the test file and would otherwise make the
    # test matrix wider than the training matrix -- confirm against the data.
    # `drop` raises KeyError if either column is missing.
    testdummies = sparse.csc_matrix(
        pretestdummies.drop(['Растения', 'Товары для компьютера'], axis=1))
    words = np.array(data.description, str)
    testwords = np.array(test.description, str)
    del data, test  # release the raw frames before building the n-gram matrices

    vect = text.CountVectorizer(decode_error=u'ignore',
                                strip_accents='unicode',
                                ngram_range=(1, 2))
    vect.fit(np.concatenate((words, testwords)))

    features = sparse.hstack((dummies, vect.transform(words)))
    clf = RidgeClassifier()
    clf.fit(features, response)

    testFeatures = sparse.hstack((testdummies, vect.transform(testwords)))
    # RidgeClassifier does not implement predict_proba; the signed distance to
    # the decision boundary is all that is needed to rank the items.
    predicted_scores = clf.decision_function(testFeatures)
    if predicted_scores.ndim > 1:
        # More than two classes: keep the column of the second class, which is
        # what the former `predict_proba(...).T[1]` selected.
        predicted_scores = predicted_scores[:, 1]

    ranked = sorted(zip(predicted_scores, testItemIds), reverse=True)
    # The context manager closes the file even if a write fails.
    with open(output_name, 'w') as f:
        f.write("id\n")
        for _score, item_id in ranked:
            f.write("%d\n" % (item_id))
# Example #2
0
            dev_accuracy = sum(
                (predictions == y_test).astype(int)) / predictions.shape[0]
        else:

            temp_classifier = classifier
            x_train, x_test, y_train, y_test = train_test_split(X_train_all,
                                                                Y_train_all,
                                                                test_size=0.1)
            temp_classifier.fit(x_train, y_train)
            dev_accuracy = temp_classifier.score(x_test, y_test)
            predictions = temp_classifier.predict(x_test)
            # if ridge_option:
            # 	print(temp_classifier.decision_function(x_test))
            try:
                y_proba[i] = classifier.predict_proba(x_test)
            except:
                scores = classifier.decision_function(x_test)
                y_proba[i] = scores / (1 + scores)
            y_mistake[i] = np.mean(
                y_proba[i][y_proba[i].argmax(axis=1) != y_test].max(axis=1))
            testArray = np.array([
                np.mean(y_proba[i][y_test == j][y_proba[i][y_test == j].argmax(
                    axis=1) != j].max(axis=1)) for j in range(4)
            ])
            y_mistake_perClass[i] = testArray

        confusion_matrices[i] = confusion_matrix(y_test, predictions)
        print('Fold N°', str(i))
        print('SCORE : ', dev_accuracy)
    test = df_prob_age.iloc[3000000:]

    age_model_lst = []
    age_score_lst = []
    age_oof = np.zeros((3000000, 10), dtype='float32')
    for count, (train_idx, valid_idx) in enumerate(folds):
        X_train = train_val.iloc[train_idx].values
        X_val = train_val.iloc[valid_idx].values
        y_train = y_age[train_idx]
        y_val = y_age[valid_idx]
        print(f'Training Fold {count}...')
        model = RidgeClassifier(alpha=0.5)
        #     model = LogisticRegression(n_jobs=30)
        model.fit(X_train, y_train)
        try:
            val_pred_prob = model.predict_proba(X_val)
            age_oof[valid_idx] = val_pred_prob
            val_pred = np.argmax(val_pred_prob,
                                 axis=1)  # model.predict(X_val) #
        except:
            print(model.coef_)
            val_pred = model.predict(X_val)
        acc = accuracy_score(y_val, val_pred)
        print(f"Fold-{count}: Acc =", acc)
        age_score_lst.append(acc)
        age_model_lst.append(model)
    print(np.mean(age_score_lst))

    test = df_prob_age.iloc[3000000:]
    age_pred = pd.DataFrame()
    age_pred['user_id'] = testdata['user_id'].copy()
# Example #4
0
# Cross-validated ROC AUC of `model` on (X_values, y_train), using the `kfold`
# splitter.
score = cross_val_score(model,
                        X_values,
                        y_train,
                        cv=kfold,
                        n_jobs=1,
                        scoring='roc_auc',
                        verbose=0)

print('score {:.4}'.format(score.mean()))
# Presumably recorded results for different column-prefix selections:
#score 0.7853 roc auc: 0.783 col_name.startswith('number') or col_name.startswith('dt') or col_name.startswith('onehot')
# score 0.781 roc auc: 0.7787 if col_name.startswith('number')
# score 0.7832  roc auc: 0.7809  col_name.startswith('number') or col_name.startswith('onehot')
# score 0.7854 roc auc: 0.7831 col_name.startswith('number') or col_name.startswith('dt')

# ROC AUC of the predictions for X_test against df_test_d['target'].
# Only `_predict_proba_lr` is called: a preceding `predict_proba` assignment
# would be overwritten immediately, doubles the inference work, and fails on
# estimators that do not implement `predict_proba`.
# NOTE(review): `_predict_proba_lr` is an underscore-prefixed (private)
# estimator method -- confirm it is available on the model in use.
result = df_test_d[['target']].copy()
result['prediction'] = model._predict_proba_lr(X_test)[:, 1]

metric = roc_auc_score(result['target'], result['prediction'])
print('roc auc: {:.4}'.format(metric))
# Training
# Same measurement on X_values against df_X_d['target'].
result = df_X_d[['target']].copy()
result['prediction'] = model._predict_proba_lr(X_values)[:, 1]

metric = roc_auc_score(result['target'], result['prediction'])
print('roc auc: {:.4}'.format(metric))
#
# roc auc: 0.7132
# roc auc: 0.9765
def ridge(X_train, y_train, X_test, params):
    """Fit a RidgeClassifier and return class-1 probability-like scores.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test : features to score.
    params : sequence whose first element is the regularisation strength
        ``alpha`` (converted with ``float``).

    Returns
    -------
    1-D array with one value in [0, 1] per row of ``X_test``.

    RidgeClassifier does not implement ``predict_proba``, so the scores are
    built from ``decision_function``: the decision values go through the
    logistic function and, when there is one column per class, each row is
    normalised to sum to one before the second column is taken. These values
    are useful for ranking but are not calibrated probabilities.
    """
    # Local import keeps the function usable regardless of the file's imports;
    # scipy is already a dependency of scikit-learn.
    from scipy.special import expit

    model = RidgeClassifier(alpha=float(params[0])).fit(X_train, y_train)
    probs = expit(model.decision_function(X_test))
    if probs.ndim == 1:
        # Binary problem: a single column that already refers to the second
        # (positive) class.
        return probs
    probs /= probs.sum(axis=1, keepdims=True)
    return probs[:, 1]