def main(): # Load the dataset data_set = data.load_pickled_data() train_data = data_set['train'] test_data = data_set['test'] log('loaded dataset!') traindocs = [doc.content for doc in train_data if int(doc.rating) != 0] trainlabels = [ int(doc.rating) for doc in train_data if int(doc.rating) != 0 ] # Split the dataset if TEST_SIZE > 0: log('split dataset...') docs_train, docs_val, label_train, label_val = train_test_split( traindocs, trainlabels, test_size=TEST_SIZE, random_state=0) else: docs_train = traindocs label_train = trainlabels # Use prebuild model if not USE_BUILD_MODEL: log('make iterator...') it = LabeledLineSentence(docs_train, label_train) log('start training NN') d2v = train_model(it) else: d2v = gensim.models.Doc2Vec.load('Models/doc2vec_val.model') # Predict val_predictions = predict_val_set(d2v, docs_val) # Print the mae print 'MAE on validation set: ' + str( mean_absolute_error(label_val, val_predictions))
def main():
    """Entry point: the main flow of the program.

    - Data is read and preprocessed.
    - A soft-voting ensemble (multinomial NB + two logistic regressions)
      is trained on the training features.
    - Optionally, the validation set is predicted (see the commented
      section below).
    - The test set is predicted and predictions are written to a file.
    """
    log('Preprocessing data...')
    preprocessor = Preprocessor(a_value=BEST_A, epsilon=BEST_EPSILON,
                                use_cached_features=USE_CACHED_FEATURES)
    X_train, X_val, y_train, y_val, X_test = preprocessor.load_and_preprocess()

    log('Training ensemble...')
    estimators = [
        ('multinomial', MultinomialNB(alpha=0.01)),
        ('logistic_sag_balanced',
         LogisticRegression(solver='sag', n_jobs=NUM_THREADS, C=5,
                            tol=0.01, class_weight='balanced')),
        ('logistic_lbfgs_balanced',
         LogisticRegression(solver='lbfgs', n_jobs=NUM_THREADS, C=5,
                            tol=0.01, class_weight='balanced')),
    ]
    ensemble = VotingClassifier(estimators=estimators, voting='soft',
                                weights=[1, 1, 1]).fit(X_train, y_train)

    # Uncomment when using a test_size > 0 in preprocessor.py
    # log('Predicting validation set...')
    # predictions_val = ensemble.predict(X_val)
    # if USE_CACHED_FEATURES:
    #     reviews = preprocessor.val_reviews
    # else:
    #     reviews = preprocessor.load_val_reviews()
    # predictions_val = fix_zero_predictions(predictions_val, reviews)
    # log('Validation error = %s' % str(mean_absolute_error(predictions_val, y_val)))
    # log(classification_report(predictions_val, y_val))
    # plot_confusion_matrix(confusion_matrix(y_val, predictions_val), classes=[1, 2, 3, 4, 5],
    #                       title='Normalized confusion matrix: validation set', filename='Plots/val_cnf_matrix.pdf')

    log('Predicting test set...')
    test_reviews = data.load_pickled_data()['test']
    test_content = [review.content for review in test_reviews]
    # Reviews predicted as 0 are patched using the raw review text.
    predictions_test = fix_zero_predictions(ensemble.predict(X_test),
                                            test_content)

    pred_file_name = utils.generate_unqiue_file_name(PREDICTIONS_BASENAME,
                                                     'csv')
    log('Dumping predictions to %s...' % pred_file_name)
    write_predictions_to_csv(predictions_test, pred_file_name)
    log("That's all folks!")
def main():
    """Load the pickled dataset and render exploratory statistics."""
    dataset = data.load_pickled_data()
    train_reviews = dataset['train']
    test_reviews = dataset['test']
    histogram_ratings(train_reviews)
    # Price histogram is computed over the full corpus (train + test).
    histogram_prices(train_reviews + test_reviews)
    list_authors(train_reviews, test_reviews)
def main():
    """Run the non-ASCII inspection on the training data, then stop.

    NOTE(review): in the original, four analysis calls were placed AFTER
    the exit() call and therefore never executed (dead code). They are
    preserved below as disabled comments so they can be re-enabled by
    moving them above exit(); runtime behavior is unchanged.
    """
    data_set = data.load_pickled_data()
    train_data = data_set['train']
    test_data = data_set['test']
    non_ascii(train_data)
    exit()
    # Unreachable in the original (placed after exit()):
    # dataset_info(train_data, test_data)
    # list_authors(train_data, test_data)
    # histogram_ratings(train_data)
    # histogram_amount_of_reviews_per_hotel(train_data)
def load(self):
    """Load the pickled dataset from disk.

    parameters:
    :return list<Review> dataset['train']: A list of all train (and validation) reviews.
    :return list<Review> dataset['test']: A list of all test reviews.
    """
    log('Loading test and train data...')
    pickled = data.load_pickled_data()
    return pickled['train'], pickled['test']
def learn(parameter=0.5, classification_type='lsvc', generate_submission=False):
    """The actual learning algorithm.

    There's support for an optional parameter and a choice between some
    classification methods.

    :param parameter: the adjustable parameter for the classification
    :param classification_type: which classifier the pipeline should use
    :param generate_submission: boolean, generate a submission or not
    :return: cross validation score guess
    """
    data_set = data.load_pickled_data()
    train_data_set = data_set['train']
    pipeline = get_pipeline(parameter, classification_type)

    # k-fold cross validation, with k='KFOLD_SPLITS'
    folds = KFold(n_splits=KFOLD_SPLITS, shuffle=True)

    total_mae = 0
    # Run at most 'KFOLD_ITERATIONS' of the available folds.
    for fold_number, (train_idx, test_idx) in enumerate(
            folds.split(train_data_set), start=1):
        fold_train = operator.itemgetter(*train_idx)(train_data_set)
        fold_test = operator.itemgetter(*test_idx)(train_data_set)
        pipeline.fit(fold_train, get_target(fold_train))
        prediction = pipeline.predict(fold_test)
        total_mae += cost_mae(prediction, get_target(fold_test))
        if fold_number == KFOLD_ITERATIONS:
            break

    # Mean of the fold scores (assumes KFOLD_ITERATIONS folds were run,
    # i.e. KFOLD_ITERATIONS <= KFOLD_SPLITS — presumably guaranteed by
    # the constants' definitions).
    mean_score_guess = total_mae / KFOLD_ITERATIONS

    # create a csv file to submit on Kaggle
    if generate_submission:
        test_data_set = data_set['test']
        # Refit on the full training set before predicting the test set.
        pipeline.fit(train_data_set, get_target(train_data_set))
        predicted_ratings = pipeline.predict(test_data_set)
        dump_predictions(predicted_ratings, mean_score_guess)

    # return the score calculated by cross validation
    return mean_score_guess
def main(): if not os.path.exists(DEFAULT_PICKLE_PATH): print 'Creating pickle file...' data.create_pickled_data(overwrite_old=True) if not USE_CACHED_FEATURES: log('Loading test and train data...') dataset = data.load_pickled_data() log('Extracting features and target...') X_train, X_val, y_train, y_val, X_test = transform_data(dataset['train'], dataset['test']) print 'train feature shape: %d' % X_train.shape[1] print 'val feature shape: %d' % X_val.shape[1] dump_all(X_train, X_val, y_train, y_val, X_test) else: X_train, X_val, y_train, y_val, X_test = load_all() # Using TruncatedSVD #tsvd = TruncatedSVD(n_components=5000) #X_train = tsvd.fit_transform(X_train) #X_val = tsvd.fit_transform(X_val) #X_test = tsvd.fit_transform(X_test) log('Training model...') model = LogisticRegression(solver='sag', n_jobs=NUM_THEADS, C=5, tol=0.01) # Using RFE #model = RFE(model, n_features_to_select=80000, step=10000, verbose=1) model = model.fit(X_train, y_train) log('Predicting train and validation set...') predictions_train = model.predict(X_train) log('Train error = %s' % str(mean_absolute_error(predictions_train, y_train))) predictions_val = model.predict(X_val) log('Validation error = %s' % str(mean_absolute_error(predictions_val, y_val))) log('Predicting test set...') predictions_test = model.predict(X_test) pred_file_name = utils.generate_unqiue_file_name(PREDICTIONS_BASENAME, 'csv') log('Dumping predictions to %s...' % pred_file_name) write_predictions_to_csv(predictions_test, pred_file_name) log('That\'s all folks!')
def load_and_preprocess(self): """ Load data from pickle files and generate feature matrices, or load the cached ones. parameters: :return csr-matrix X_train: The preprocessed tf-idf feature matrix of the training samples. :return csr-matrix X_val: The preprocessed tf-idf feature matrix of the validation samples. :return numpy array y_train: The ratings corresponding to the samples in the training feature matrix. :return numpy array y_val: The ratings corresponding to the samples in the validation feature matrix. :return csr-matrix X_test: The preprocessed tf-idf feature matrix of the test samples. """ # Check for pickled data if not os.path.exists(DEFAULT_PICKLE_PATH): print 'Creating pickle file...' data.create_pickled_data(overwrite_old=True) # Check for cached features if self.use_cached_features: try: data_tuple = self.load_all() feature_count = data_tuple[0].shape[1] log('Loaded the cached preprocessed data. ({} features)'. format(feature_count)) self.val_reviews = self.load_val_reviews() return data_tuple except IOError: self.use_cached_features = False log('Could not load cached preprocessed data! Doing the preprocessing now...' ) # Preprocess data if not self.use_cached_features: log('Loading test and train data...') dataset = data.load_pickled_data() log('Extracting features and reducing the dimensionality...') X_train, X_val, y_train, y_val, X_test = self.transform_data( dataset['train'], dataset['test']) # Save preprocessed data for later self.dump_all(X_train, X_val, y_train, y_val, X_test) self.dump_val_reviews(self.val_reviews) return X_train, X_val, y_train, y_val, X_test
def main():
    """Doc2vec + logistic-regression experiment.

    Flow:
      - load the pickled dataset and drop zero-rated reviews
      - optionally split off a validation set (when TEST_SIZE > 0)
      - train a new doc2vec model or load a pretrained one
      - classify the validation vectors with logistic regression and
        log the MAE and a classification report
    """
    # Load the data
    data_set = data.load_pickled_data()
    train_data = data_set['train']
    test_data = data_set['test']
    log('loaded dataset!')
    traindocs = [doc.content for doc in train_data if int(doc.rating) != 0]
    trainlabels = [
        int(doc.rating) for doc in train_data if int(doc.rating) != 0
    ]

    # Split the data
    if TEST_SIZE > 0:
        log('split dataset...')
        docs_train, docs_val, label_train, label_val = train_test_split(
            traindocs, trainlabels, test_size=TEST_SIZE, random_state=0)
    else:
        docs_train = traindocs
        label_train = trainlabels

    # Use prebuild model
    if not USE_BUILD_MODEL:
        log('make iterator...')
        it = LabeledLineSentence(docs_train, label_train)
        log('start training NN')
        d2v = train_model(it)
    else:
        log('load pretrained model')
        d2v = gensim.models.Doc2Vec.load('Models/doc2vec0.2.model')

    train_features = fittransform_feature_matrix(d2v)

    # BUG FIX: validation previously ran unconditionally and raised a
    # NameError when TEST_SIZE == 0, because docs_val and label_val are
    # only bound inside the split branch above.
    if TEST_SIZE > 0:
        log('start computing vectors for test data...')
        val_features = transform_feature_matrix(d2v, docs_val)

        # Actual classification
        logistic = LogisticRegression(solver='sag', n_jobs=4, C=1, tol=0.1)
        logistic.fit(train_features, label_train)
        predictions = logistic.predict(val_features)
        log('Validation error = %s' % str(mean_absolute_error(predictions, label_val)))
        log(classification_report(predictions, label_val))
def main():
    """Rebuild the pickle file, then load both dataset splits."""
    data.create_pickled_data(overwrite_old=True)
    dataset = data.load_pickled_data()
    train_set, test_set = dataset['train'], dataset['test']
When working with sparse matrices, no deep copies are made. imput arguments: M: the matrix from which to remove rows idx_to_drop: the indices of the rows to remove output arguments: M[mask]: csr matrix, with only the remaining rows """ if not isinstance(M, scipy.sparse.csr_matrix): raise ValueError("works only for CSR format -- use .tocsr() first") indices = list(idx_to_drop) mask = np.ones(M.shape[0], dtype=bool) mask[idx_to_drop] = False return M[mask] if __name__ == '__main__': log('Loading test and train data...') dataset = data.load_pickled_data() train_data = dataset['train'] # Split training data in 4 folds (shuffle folds, because of random hotel split) # Preprocess and dump counter = 1 for x in range(4): log('Extracting features and reducing the dimensionality for fold ' + str(counter)) pp = Preprocessor(a_value=11, epsilon=0.1, reduction_level=0.025) X_train, X_val, y_train, y_val = pp.transform_data(train_data) dump_all(X_train, X_val, y_train, y_val, counter) dump_val_reviews(pp.val_reviews, counter) counter += 1