Example #1
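# Context assumed for this snippet: numpy as np, the project helpers (load_csv_data,
# preprocess_data, divide_data, lambda_cv, ridge_regression, predict_labels,
# compute_accuracy, create_csv_submission) and the DATA_TRAIN_PATH, DATA_TEST_PATH
# and OUTPUT_PATH constants are imported/defined elsewhere in the file.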
def main():
    y_train, tX_train, ids = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

    np.random.seed(2019)

    # Preprocess train and test together so the same shifts are used when creating log or root features
    tX_stacked = np.vstack((tX_train, tX_test))
    prep_param = {
        "bias": True,
        "fill": True,
        "standardize": False,
        "degree": 8,
        "log": True,
        "root": True
    }
    tX_stacked_prep, *_ = preprocess_data(tX_stacked, None, prep_param)
    tX_train_prep, tX_test_prep = np.split(tX_stacked_prep, [len(tX_train)])

    # Split data according to PRI_jet_num value
    tX_tr_splitted, indices_tr = divide_data(tX_train_prep)
    tX_te_splitted, indices_te = divide_data(tX_test_prep)
    n_models = len(indices_tr)

    y_tr_splitted = []
    for i in range(n_models):
        y_tr_splitted.append(y_train[indices_tr[i]])

    # Train
    weights = []
    for i in range(n_models):
        lambda_ = lambda_cv(tX_tr_splitted[i], y_tr_splitted[i])
        print(f"Class {i}, lambda: {lambda_}")
        weights.append(
            ridge_regression(y_tr_splitted[i], tX_tr_splitted[i], lambda_)[0])

    # Predict
    y_pr_tr = np.zeros(tX_train.shape[0])
    y_pr_te = np.zeros(tX_test.shape[0])
    for i in range(n_models):
        y_pr_tr[indices_tr[i]] = predict_labels(weights[i], tX_tr_splitted[i])
        y_pr_te[indices_te[i]] = predict_labels(weights[i], tX_te_splitted[i])

    acc_tr = compute_accuracy(y_train, y_pr_tr)
    print(f"Total accuracy train: {acc_tr}")
    _, counts = np.unique(y_pr_te, return_counts=True)
    print(
        f"Distribution on test data class -1: {counts[0]}, class +1: {counts[1]}"
    )

    create_csv_submission(ids_test, y_pr_te, OUTPUT_PATH)
Example #2
#!/usr/bin/env python3.5
# -*- coding: utf-8 -*-

"""
The run.py file produces our final submission into a "submission.csv"
in the data folder
"""

from als import ALS
from sgd import SGD
from helpers import create_csv_submission, load_data

if __name__ == '__main__':
    # Load the dataset
    print("Loading dataset")
    path_dataset = "data/data_train.csv"
    ratings = load_data(path_dataset)

    # Create the submission file with the best prediction
    # prediction, test_rmse = ALS(ratings, None, 3, 0.2, 0.9)
    prediction, test_rmse = SGD(ratings, None, 0.04, 9, 0.1, 0.016)
    create_csv_submission(prediction)
    print("Submission created at data/submission.csv")
Example #3
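# Context assumed for this partial snippet: it is the body of a loop over data subsets
# (indexed by i), so y_train_subset, X_train_subset, X_test_subset, ids_test_subset,
# gammas, lambdas, CA_baseline, exp_measure_tr, exp_measure_te, ids and y_pred are
# defined by the enclosing code, along with numpy as np and the project helpers
# (gamma_lambda_selection_cv, get_model, predict_labels, map_minus_1_1, sort_arr,
# create_csv_submission).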
    # Build the model
    initial_w = np.random.randn(D)
    optimal_gamma, optimal_lambda_, measure_tr, measure_te = \
        gamma_lambda_selection_cv(y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas[i], lambdas[i],
                                  seed = seed, batch_size = batch_size, metric = metric, model = model)
    print('CA_bs:', CA_baseline)
    print('Iter:', i, ' Best gamma:', optimal_gamma, ' Best lambda:',
          optimal_lambda_, '\n')

    # Update the expected training error
    exp_measure_tr += measure_tr * X_train_subset.shape[0] / X_train.shape[0]
    exp_measure_te += measure_te * X_test_subset.shape[0] / X_test.shape[0]

    # Build the model with the best hyperparameters
    w = get_model(model, y_train_subset, X_train_subset, initial_w, max_iters,
                  optimal_gamma, optimal_lambda_, batch_size)

    # Get predictions
    y_pred_test = np.array(map_minus_1_1(predict_labels(w, X_test_subset)))

    # Insert the ids and predictions to the ids and y_pred arrays
    ids = np.concatenate((ids, ids_test_subset))
    y_pred = np.concatenate((y_pred, y_pred_test))

# Sort the ids and y_pred arrays
ids, y_pred = sort_arr(ids, y_pred)
# Create the submission CSV file
create_csv_submission(ids, y_pred, sumbission_fname)

print("Expected training accuracy / loss:", exp_measure_tr)
print("Expected test accuracy / loss:", exp_measure_te)
Example #4
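# Context assumed for this partial snippet: it runs inside a PySpark session where `sc` is the
# SparkContext, `model` is a trained ALS model, and `test2`, `pred_test` and `RMSE_train` come
# from the preceding training/evaluation code; numpy as np, scipy.sparse as sp and the project
# helpers (row_col_spark, load_data, create_csv_submission) are imported elsewhere.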
true_test = test2.map(lambda x: ((x[0], x[1]), x[2]))
pred_test = pred_test.map(lambda x: ((x[0], x[1]), x[2]))

true_pred = true_test.join(pred_test)

MSE_test = true_pred.map(lambda r: (r[1][0] - r[1][1])**2).mean()
RMSE_test = np.sqrt(MSE_test)

print("Train rmse : ", RMSE_train)
print("Test rmse : ", RMSE_test)

# Generate the submission
testdata = sc.textFile("sample_submission.csv")
testheader = testdata.first()  # extract the header row
testdata = testdata.filter(lambda row: row != testheader)
testdata = testdata.map(lambda l: l.split(','))
# `r` is assumed to be defined earlier in the script; row_col_spark presumably parses the "rX_cY" id into (row, col)
testdata = testdata.map(lambda l: row_col_spark(l[0], r))

predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
pred = predictions.collect()

matrix_pred = sp.dok_matrix((10000, 1000), dtype=np.float32)

for row in pred:
    matrix_pred[row[0][0], row[0][1]] = row[1]

path_dataset2 = "sample_submission.csv"
sub_ex = load_data(path_dataset2)

create_csv_submission(list(zip(*sub_ex.nonzero())), matrix_pred, 'submission.csv')
Example #5
# Convert collection of text documents to a matrix of token counts
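# Context assumed for this partial snippet: `test` holds the preprocessed test documents,
# `tokenize` is the project's tokenizer function, and CountVectorizer (sklearn), pickle and
# the `helpers` module are imported earlier in the script.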
vocabulary_to_load = pickle.load(open('models/vocabulary.p', 'rb'))
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    ngram_range=(1, 3),
    max_df=0.9261187281287935,
    min_df=4,
    vocabulary=vocabulary_to_load
)
# Mark the preloaded vocabulary as fixed so transform() can be used without fitting
vectorizer._validate_vocabulary()
test_data_features = vectorizer.transform(test)

# Transform count matrix to a normalized tf-idf representation
tfidf_transformer = pickle.load(open('models/corpus_data_tfidf_fitted.p', 'rb'))
test_data_features_tfidf = tfidf_transformer.transform(test_data_features)


# PREDICT LABELS
print("Predicting the labels...")
predicted_labels = clf_logreg.predict(test_data_features_tfidf)
predictions_list = [int(label) for label in predicted_labels]


# CREATE SUBMISSION FILE
print("Creating submission file in results/ folder")
helpers.create_csv_submission(ids, predictions_list, 'results/submission.csv')

print("Submission file successfully created!")
Example #6
import pickle
from helpers import samples_csv_submission, create_csv_submission

with open('model.pickle', 'rb') as f:
    item_features, user_features, bias_item, bias_user = pickle.load(f)

SUBMISSION_SAMPLES_PATH = "./Data/sample_submission.csv"
samples_submission = samples_csv_submission(SUBMISSION_SAMPLES_PATH)

create_csv_submission(samples_submission, item_features, user_features,
                      bias_item, bias_user, 'submission_run.csv')
Example #7
import preprocessing, pipeline, helpers

print('Loading the dataset...')
tweets, size_pos, size_neg = preprocessing.load_train_tweets(
    "./data/train_pos_full.txt", "./data/train_neg_full.txt")

# Create the training labels from the positive/negative tweet counts
pred = preprocessing.predictions(size_pos, size_neg)

# Load the test tweets
x_test, ids_test = preprocessing.load_test_tweets('./data/test_data.txt')

print('Training classifier...')
# Best pipeline: TF-IDF vectorizer with n-gram range (1, 4) and LinearSVC, which yielded the best results on crowdai.org
clf = pipeline.pipeline_model(1, 4)
clf.fit(tweets, pred)

# create the submission csv file
helpers.create_csv_submission(ids_test, clf.predict(x_test), "submission.csv")
Example #8
def get_prediction(neural_net,
                   global_vectors,
                   full_corpus,
                   total_training_tweets,
                   nr_pos_tweets,
                   kaggle_name,
                   epochs,
                   patience,
                   split=0.8):
    """ Creates a csv file with kaggle predictions and returns the predictions.
    Input:
        neural_net: Name of a neural net model
        global_vectors: global vectors created out the gensim-.txt files.
        total_training_tweets: (int) Number of tweets that are training tweets. Assums that the first poriton of the corpus is
        training tweets, the second part is the unseen test set.
        nr_pos_tweets: (int) number of traning tweets that are positiv
        kaggle_name: Name for csv file, must end in '.csv'.
   
    Output:
        pred_ones: the predicions (1 or -1)
        a .csv file with name 'kaggle_name'
    """
    num_of_dim = global_vectors.syn0.shape[1]  # .syn0 is the word-vector matrix (named .vectors in newer gensim)

    # Separate the training data from the test data
    train_corpus = full_corpus[:total_training_tweets]
    predict_corpus = full_corpus[total_training_tweets:]

    # Build a vector of all the words in a tweet
    train_document_vecs = np.concatenate([
        GM.buildDocumentVector(doc, num_of_dim, global_vectors)
        for doc in train_corpus
    ])
    train_document_vecs = sk.preprocessing.scale(train_document_vecs)

    labels = HL.create_labels(nr_pos_tweets, nr_pos_tweets, kaggle=False)

    train_document_vecs, labels = HL.shuffle_data(train_document_vecs, labels)
    train_x, val_x, train_y, val_y = HL.split_data(train_document_vecs, labels,
                                                   split)

    test_document_vecs = np.concatenate([
        GM.buildDocumentVector(doc, num_of_dim, global_vectors)
        for doc in predict_corpus
    ])
    test_document_vecs = sk.preprocessing.scale(test_document_vecs)

    model = neural_net(num_of_dim)

    # Define the callbacks used during the fitting process
    early_stopping = early_stopping_callback(patience_=patience, verbose_=1)
    model_checkpoint = model_checkpoint_callback(
        "neural_model_prediction.hdf5", verbose_=1)

    history = model.fit(train_x,
                        train_y,
                        epochs=epochs,
                        batch_size=1024,
                        verbose=1,
                        callbacks=[early_stopping, model_checkpoint],
                        validation_data=(val_x, val_y))

    # Loading the best model found during training
    model = load_model('neural_model_prediction.hdf5')

    prediction = model.predict(test_document_vecs)

    prediction = [1 if i > 0.5 else -1 for i in prediction]

    # Create the Kaggle submission
    ids = list(range(1, 10000 + 1))
    HL.create_csv_submission(ids, prediction, kaggle_name)

    return prediction
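# A minimal usage sketch (all values below are hypothetical placeholders; the real network
# builder, GloVe vectors and corpus come from the project's preprocessing and model code):
#
#     prediction = get_prediction(neural_net=build_feedforward_net,
#                                 global_vectors=glove_vectors,
#                                 full_corpus=corpus,
#                                 total_training_tweets=len(corpus) - 10000,
#                                 nr_pos_tweets=1250000,
#                                 kaggle_name='submission.csv',
#                                 epochs=10,
#                                 patience=2)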
Example #9
# External libraries
import csv
import pickle
import time
import keras as K

# internal imports
import helpers as HL

# Loading pre-processed document vectors for test-set.
test_document_vecs = pickle.load(open("final_document_vectors.pkl", "rb"))

# Load the neural net model
model = K.models.load_model('final_model_for_kaggle.hdf5')

# Predict on the test set with the neural net model
prediction = model.predict(test_document_vecs)

# Convert the results to Kaggle format (-1, 1)
prediction = [1 if i > 0.5 else -1 for i in prediction]

# CREATE SUBMISSION
ids = list(range(1, 10000 + 1))
HL.create_csv_submission(ids, prediction, 'powerpuffz_kagglescore.csv')

print("Prediction created - powerpuffz_kagglescore.csv")
Example #10
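# Context assumed for this partial snippet: the line below is the tail of a call that splits
# the training/test data, labels and ids by the PRI_jet_num feature, producing the
# *_jet_num_splits lists used below; numpy as np and the project helpers
# (preprocessing_pipeline, reg_logistic_regression, predict_labels, compute_accuracy,
# create_csv_submission) are imported elsewhere.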
                                         PRI_JET_NUM_INDEX)

    # We achieved our best results using Regularized Logistic Regression,
    # so we only load those previously computed optimal parameters to generate the submission
    logistic_best_params = np.load("results/logistic_best_params.npy", allow_pickle=True)
    logistic_best_models = []

    for (lambda_, deg, gamma), train_classes_split, train_data_split in \
            zip(logistic_best_params, train_classes_jet_num_splits, train_data_jet_num_splits):
        data_split, columns_to_remove, mean, std = preprocessing_pipeline(train_data_split, degree=int(deg),
                                                                          cross_term=True, norm_first=False)
        initial_w = np.zeros((data_split.shape[1],))
        w, loss = reg_logistic_regression(train_classes_split, data_split, lambda_, initial_w, 500, gamma, 1)
        print(f'Loss: {loss:.3f} Accuracy : {compute_accuracy(predict_labels(w, data_split), train_classes_split)}')
        logistic_best_models.append((w, loss, columns_to_remove, mean, std))

    # Calculate the predictions for each of the 4 subsets using the weights and then combine them
    results = None
    for (w, _, col_to_rm, mean, std), (_, deg, _), test_classes_split, test_data_split, test_ids_split in \
            zip(logistic_best_models, logistic_best_params,
                test_classes_jet_num_splits, test_data_jet_num_splits, test_ids_jet_num_splits):
        test_data_split, _, _, _ = preprocessing_pipeline(test_data_split, degree=int(deg),
                                                          columns_to_remove=col_to_rm,
                                                          cross_term=True, norm_first=False, mean=mean, std=std)
        pred = predict_labels(w, test_data_split)
        out = np.stack((test_ids_split, pred), axis=-1)
        results = out if results is None else np.vstack((results, out))

    # Create the submission
    create_csv_submission(results[:, 0], results[:, 1], 'results/logistic_submission.csv')