def generate_all_mappings():
    pool = get_process_pool()
    # in debug mode, only process the last wiki batch (109); otherwise batches 1-109
    start_index_inclusive = 1 if not args.debug else 109
    stop_index_exclusive = 110
    partial_mappings = pool.map(
        generate_batch_mappings, range(start_index_inclusive, stop_index_exclusive))
    pool.close()

    print('Merging {} partial results...'.format(len(partial_mappings)))
    # merge the per-batch partial dicts into a single dict
    accumulated_mappings = {}
    for partial_result in partial_mappings:
        accumulated_mappings.update(partial_result)

    mapping = pd.DataFrame.from_dict(
        accumulated_mappings, orient='index', columns=['batch_id', 'line'])
    print(mapping.head())
    write_pickle(GENERATED_WIKI_PAGE_MAPPINGS_PATH, mapping)
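# The sketch below is not part of the original script; it only illustrates how the
# pickled mapping could be consumed afterwards. It assumes the DataFrame index holds
# the wiki page id and that read_pickle is importable here; 'lookup_page_location' is
# a hypothetical helper name.
def lookup_page_location(page_id):
    mapping = read_pickle(GENERATED_WIKI_PAGE_MAPPINGS_PATH)
    row = mapping.loc[page_id]
    # batch_id identifies the wiki batch file, line the position of the page inside it
    return int(row['batch_id']), int(row['line'])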
if args.dataset.endswith('all') or (args.file and args.file.endswith('all')):
    print('Batch processing data...')
    # split the claims into 20 chunks and write one preprocessed pickle per chunk
    claims_split = np.array_split(claims_and_retrieved_docs, 20)
    for i, batch in enumerate(claims_split):
        partial_results = pool.map(preprocess_claim_with_doc, batch.iterrows())
        print('Merging partial results...')
        preprocessed = list(chain.from_iterable(partial_results))
        training_data = pd.DataFrame.from_records(
            preprocessed, columns=PREPROCESSED_DATA_COLUMNS_V1)
        output_path = GENERATED_LR_PREPROCESSED_TRAINING_DATA if args.dataset.startswith('train') \
            else GENERATED_LR_PREPROCESSED_DEV_DATA
        output_path += str(i)
        write_pickle(output_path, training_data)
else:
    # just a small subset of the data, processed in one go
    partial_results = pool.map(preprocess_claim_with_doc, claims_and_retrieved_docs.iterrows())
    print('Merging partial results...')
    preprocessed = list(chain.from_iterable(partial_results))
    training_data = pd.DataFrame.from_records(
        preprocessed, columns=PREPROCESSED_DATA_COLUMNS_V1)
    output_path = GENERATED_LR_PREPROCESSED_TRAINING_DATA if args.dataset.startswith('train') \
        else GENERATED_LR_PREPROCESSED_DEV_DATA
    if args.file:
        output_path += os.path.basename(args.file)
    write_pickle(output_path, training_data)
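# Sketch (not part of the original script): the batch branch above writes one pickle per
# split, with the chunk index appended to the base path. Assuming read_pickle is
# available here, the splits can be merged back into a single DataFrame;
# 'load_batched_preprocessed_data' is a hypothetical helper name.
def load_batched_preprocessed_data(base_path, num_batches=20):
    batches = [read_pickle(base_path + str(i)) for i in range(num_batches)]
    return pd.concat(batches, ignore_index=True)

# e.g. merged = load_batched_preprocessed_data(GENERATED_LR_PREPROCESSED_TRAINING_DATA)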
            line_text, preprocess_doc_title(page_id))

    combined_evidence = ' '.join(evidence_sentences)
    avg_sentence_position = np.mean(evidence_sentence_positions)
    input = transform_NN_input(claim, combined_evidence, num_evidence_docs_for_claim,
                               num_references, num_evidence_items,
                               num_coordination_terms_evidence_claim,
                               num_coordination_terms_titles_claim,
                               avg_sentence_position, num_evidence_words)
    preprocessed_pairs.append((claim_id, input, output))

    return preprocessed_pairs


if __name__ == '__main__':
    training_data = get_all_claims(args.dataset)
    if args.debug:
        training_data = training_data.head(n=3)

    pool = get_process_pool(args.cores)
    partial_results = pool.map(preprocess_claim, training_data.iterrows())
    print('Merging partial results...')
    preprocessed = list(chain.from_iterable(partial_results))

    preprocessed_df = pd.DataFrame.from_records(
        preprocessed, columns=PREPROCESSED_DATA_COLUMNS_V2)
    output_path = GENERATED_NN_PREPROCESSED_DATA.format(args.dataset, 'v4')
    write_pickle(output_path, preprocessed_df)
                loss.item()))
            loss_history.append(loss.item())

    print('Done with training...')

    # test result
    # note: equivalent logic to util.evaluation.get_accuracy(), but optimised for pytorch
    with torch.no_grad():
        correct = 0
        total = 0
        for inputs, labels in dev_loader:
            inputs = Variable(inputs).to(args.device)
            labels = Variable(labels).to(args.device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        print('Accuracy on dev subset: {:.5f}.'.format(correct / total))

    # save results
    write_pickle(GENERATED_NEURAL_NETWORK_MODEL.format(args.preprocessed_format), model)
    write_pickle(
        GENERATED_NEURAL_NETWORK_LOSS_HISTORY.format(args.preprocessed_format),
        loss_history)

    # plot loss history
    plot_loss_values(num_iterations * args.num_epochs, args.learning_rate, loss_history,
                     loss_history_frequency)
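# Sketch (not part of the original script): because the loss history is pickled on its
# own, it can be re-plotted later without retraining. This assumes read_pickle is
# importable here and reuses the plot_loss_values signature seen above;
# 'replot_loss_history' is a hypothetical helper, and the iteration count, learning rate
# and sampling frequency must match the original training run.
def replot_loss_history(preprocessed_format, total_iterations, learning_rate, frequency):
    loss_history = read_pickle(
        GENERATED_NEURAL_NETWORK_LOSS_HISTORY.format(preprocessed_format))
    plot_loss_values(total_iterations, learning_rate, loss_history, frequency)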
import argparse

import pandas as pd

from _4_B_fit_LR_model import fit_and_get_model, LOSS_HISTORY_FREQUENCY
from dataaccess.files_constants import GENERATED_LOGISTIC_REGRESSION_MODEL, \
    GENERATED_LOGISTIC_REGRESSION_LOSS_HISTORY, GENERATED_LR_PREPROCESSED_TRAINING_DATA
from dataaccess.files_io import read_pickle, write_pickle
from util.LR_NN_preprocessing import extract_input_and_expected
from util.plots import plot_loss_values

parser = argparse.ArgumentParser()
parser.add_argument('--debug', help='use less data and less learning iterations', action='store_true')
parser.add_argument('--num_iterations', type=int, default=100000)
parser.add_argument('--learning_rate', type=float, default=0.1)
args = parser.parse_args()


if __name__ == '__main__':
    training_data: pd.DataFrame = read_pickle(GENERATED_LR_PREPROCESSED_TRAINING_DATA)
    train_input, train_expected = extract_input_and_expected(training_data)

    model, loss_values = fit_and_get_model(train_input, train_expected,
                                           args.num_iterations, args.learning_rate)

    write_pickle(GENERATED_LOGISTIC_REGRESSION_LOSS_HISTORY, loss_values)  # for plotting
    write_pickle(GENERATED_LOGISTIC_REGRESSION_MODEL, model)

    plot_loss_values(args.num_iterations, args.learning_rate, loss_values, LOSS_HISTORY_FREQUENCY)