def make_submission(with_model):
    """ Generates a submission for the leaderboard """
    train_files = resources.train_data_files()
    train_observations = filename_to_id(train_files)
    train_label_dict = resources.train_data_labels()
    train_labels = np.array(
        [train_label_dict[ob_id] for ob_id in train_observations])

    test_files = resources.test_data_files()
    test_observations = filename_to_id(test_files)

    preprocess_data = preprocess(with_model, train_files, test_files)

    trained_model = train(with_model, train_files, train_observations,
                          train_labels, *preprocess_data)

    Y = predict(with_model, trained_model, test_files, test_observations,
                *preprocess_data)

    # Write the output:
    output_file = "{}/submission_{}.txt".format(resolve('..'),
                                                strftime("%Y-%m-%d_%H:%M:%S"))

    with open(output_file, 'w') as f:
        for (observation, result) in zip(test_observations, Y):
            f.write("{} {}\n".format(observation, result))

    info(">> Wrote submission output to {}".format(output_file))
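
# A minimal usage sketch for the submission writer above. read_submission() is a
# hypothetical helper (not part of the original module) that parses a file in the
# "observation prediction" format produced by make_submission(), e.g. for a quick
# sanity check before uploading to the leaderboard.
def read_submission(path):
    """ Returns a list of (observation_id, predicted_label) string pairs. """
    with open(path, 'r') as f:
        # Each non-blank line holds an observation ID and its predicted label,
        # separated by a single space (mirroring the write loop above).
        return [tuple(line.split()) for line in f if line.strip()]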

def _create_CoreNLP_trainxml():
    """ Generates CoreNLP output for the training data (run once) """
    input_files = resources.train_data_files()
    command.run_corenlp(resolve('~', 'corenlp')
                       ,input_files
                       ,resolve('..', 'data', 'CoreNLP', 'train_data')
                       ,annotators=USE_ANNOTATORS)
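
# The CoreNLP output for the test data is presumably generated the same way;
# the sketch below mirrors _create_CoreNLP_trainxml(), but the 'test_data'
# output directory name is an assumption, not confirmed by the original module.
def _create_CoreNLP_testxml():
    """ Generates CoreNLP output for the test data (run once) -- sketch """
    input_files = resources.test_data_files()
    command.run_corenlp(resolve('~', 'corenlp')
                       ,input_files
                       ,resolve('..', 'data', 'CoreNLP', 'test_data')  # assumed path
                       ,annotators=USE_ANNOTATORS)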

def all_sentences(for_data):
    """
    Returns a dict of all sentence data derived from CoreNLP. The key is the
    truncated filename (observation-ID), and the value is the output sentence
    data generated by parse_xml() for that particular file.

    @returns: {str: <sentence-data>}
    """
    assert(for_data in ('train', 'test'))

    if for_data == 'train':
        data_cache_file = CORENLP_TRAIN_DATA_CACHE
    else:
        data_cache_file = CORENLP_TEST_DATA_CACHE

    # If there's cached data, load it:
    if exists(data_cache_file):
        debug('> Loading cached CoreNLP data from {}'.format(data_cache_file))
        with open(data_cache_file, 'r') as f:
            return pickle.load(f)

    # Otherwise, generate the output from parse_xml()
    debug('> CoreNLP data {} not found; caching...'.format(data_cache_file))

    if for_data == 'train':
        filenames = resources.train_data_files('CoreNLP')
    else:
        filenames = resources.test_data_files('CoreNLP')

    #if include_test:
    #    filenames += resources.test_data_files('CoreNLP')

    # parse_xml(filename)[1] means to only keep the actual sentence data,
    # not the file name/observation identifier. Also lops off the ".xml" part
    # from the CoreNLP output filename, preserving the original filename.
    data = {splitext(filename_to_id(filename))[0]: parse_xml(filename)[1]
            for filename in filenames}

    with open(data_cache_file, 'w') as f:
        pickle.dump(data, f)

    debug('> CoreNLP data cached to {}'.format(data_cache_file))

    return data
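
# Usage sketch for all_sentences(): iterate the cached CoreNLP sentence data by
# observation ID. The structure of each per-observation value depends on
# parse_xml(), so this helper deliberately assumes nothing about it beyond the
# dict shape documented above. The helper name is illustrative only.
def corenlp_observation_ids(for_data='train'):
    """ Returns the sorted observation IDs that have CoreNLP sentence data. """
    sentences = all_sentences(for_data)
    return sorted(sentences.keys())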

def read_file(filetype, index=None):
    """
    Utility function to read the contents of one or all of the files that
    constitute the training or test data set.

    @param str filetype One of 'train' or 'test'
    @param int|None index If given, only the file at the given index in the
                          file list will be read; otherwise all files will
                          be read
    @returns str|[str]
    """
    filelist = []
    if filetype == 'train':
        filelist = resources.train_data_files()
    elif filetype == 'test':
        filelist = resources.test_data_files()
    else:
        raise ValueError('filetype must be "train" or "test"; got ' + filetype)

    return map(utils.files.read_file, filelist) \
        if index is None else utils.files.read_file(filelist[index])
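
# Usage sketch for read_file(): with an index it returns a single file's
# contents as a string; without one it returns the contents of every file
# (a list under Python 2's map()). The index 0 below is an arbitrary example.
def _example_read_file():
    first_train_doc = read_file('train', index=0)  # one document's raw text
    all_test_docs = list(read_file('test'))        # every test document's raw text
    return (first_train_doc, all_test_docs)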

def test(model_name, test_size=0.1, suppress_output=False, show_results=False,
         *args, **kwargs):
    """
    Runs a full test cycle for the given model

    Options:
      test_size:       Fraction of the observations to hold out for testing
      suppress_output: If True, no output will be produced
      show_results:    If True, individual prediction results will be printed.
                       suppress_output must also be False for this option to
                       work

    @returns (float:accuracy, int:correct_count, int:incorrect_count,
              set:incorrect_observations)
    """
    observations = resources.train_data_files('text')
    labels = resources.train_data_labels()

    # Fed into sklearn cross_validation
    Y = np.array([labels[ob_id] for ob_id in filename_to_id(observations)])

    # Divide the observation data into two sets for training and testing:
    #(train_files, test_files, hold_out_fold) = nfold_xval(observations, n=n_folds)
    (train_files, test_files, train_labels, true_labels) = \
        cross_validation.train_test_split(observations, Y, test_size=test_size)

    assert (len(train_files) == len(train_labels) and
            len(test_files) == len(true_labels))

    info("> test size: {}, |train| (kept): {}, |test| (held out): {}"
         .format(test_size, len(train_files), len(test_files)))

    # Get any preprocessing data and pass it to train() and predict() later:
    data = preprocess(model_name, train_files, test_files)

    # Generate training features:
    trained_model = train(model_name
                         ,train_files
                         ,filename_to_id(train_files)
                         ,train_labels
                         ,*data)

    # Same as training: the observation ID is just the basename of the input
    test_observation_ids = filename_to_id(test_files)

    # Use the trained model to make predictions:
    predicted_labels = predict(model_name
                              ,trained_model
                              ,test_files
                              ,test_observation_ids
                              ,*data)

    accuracy = metrics.accuracy_score(true_labels, predicted_labels)
    cm = metrics.confusion_matrix(true_labels, predicted_labels)
    f1_score = metrics.f1_score(true_labels, predicted_labels)

    correct = cm[0][0] + cm[1][1]
    incorrect = cm[1][0] + cm[0][1]
    incorrect_observations = set()

    if not suppress_output:
        line = '*' * 80
        print
        print line
        print "Accuracy: {}%".format(accuracy * 100.0)
        print "F1-Score: {}".format(f1_score)
        print "Confusion matrix:\n", cm
        print "Incorrectly labelled as 1: {}; incorrectly labelled as -1: {}".format(
            cm[1][0], cm[0][1])
        if show_results:
            print "Incorrect:"
        for i in range(len(test_observation_ids)):
            if true_labels[i] != predicted_labels[i]:
                incorrect_observations.add(test_observation_ids[i])
                if show_results:
                    print "TRUE: {}, PREDICTED: {}, LEAD: {}".format(
                        true_labels[i], predicted_labels[i],
                        test_observation_ids[i])
        print
        print line
        print

    return (accuracy, correct, incorrect, incorrect_observations)
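
# Sketch: averaging accuracy over several random train/test splits by calling
# test() repeatedly. Both the default repeat count and the idea of averaging
# are illustrative assumptions, not part of the original evaluation code.
def repeated_test(model_name, repeats=5, test_size=0.1):
    """ Returns the mean accuracy over `repeats` independent test() cycles. """
    accuracies = []
    for _ in range(repeats):
        (accuracy, _, _, _) = test(model_name, test_size=test_size,
                                   suppress_output=True)
        accuracies.append(accuracy)
    return sum(accuracies) / float(len(accuracies))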