Example no. 1
def generate_all_mappings():
    """Generate all wiki page mappings in parallel and persist them as a DataFrame."""
    pool = get_process_pool()

    # in debug mode, only the last batch (index 109) is processed
    start_index_inclusive = 1 if not args.debug else 109
    stop_index_exclusive = 110
    partial_mappings = pool.map(
        generate_batch_mappings,
        range(start_index_inclusive, stop_index_exclusive))
    pool.close()

    print('Merging {} partial results...'.format(len(partial_mappings)))
    accumulated_mappings = {}

    for partial_result in partial_mappings:
        accumulated_mappings.update(partial_result)

    mapping = pd.DataFrame.from_dict(accumulated_mappings,
                                     orient='index',
                                     columns=['batch_id', 'line'])
    print(mapping.head())
    write_pickle(GENERATED_WIKI_PAGE_MAPPINGS_PATH, mapping)
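get_process_pool, write_pickle and read_pickle are project helpers that are not shown in these excerpts (Example no. 5 below imports read_pickle and write_pickle from dataaccess.files_io). A minimal sketch of what they could look like, assuming they simply wrap multiprocessing.Pool and pickle; the real implementations may differ:

import multiprocessing
import pickle


def get_process_pool(cores=None):
    # hypothetical helper: default to all available cores when none are requested
    return multiprocessing.Pool(processes=cores or multiprocessing.cpu_count())


def write_pickle(path, obj):
    # hypothetical helper: serialise an arbitrary object to disk
    with open(path, 'wb') as file:
        pickle.dump(obj, file)


def read_pickle(path):
    # hypothetical helper: inverse of write_pickle
    with open(path, 'rb') as file:
        return pickle.load(file)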
Example no. 2
    if args.dataset.endswith('all') or (args.file
                                        and args.file.endswith('all')):
        print('batch processing data')
        claims_split = np.array_split(claims_and_retrieved_docs, 20)
        for i, batch in enumerate(claims_split):
            partial_results = pool.map(preprocess_claim_with_doc,
                                       batch.iterrows())
            print('Merging partial results...')
            preprocessed = list(chain.from_iterable(partial_results))

            training_data = pd.DataFrame.from_records(
                preprocessed, columns=PREPROCESSED_DATA_COLUMNS_V1)
            output_path = GENERATED_LR_PREPROCESSED_TRAINING_DATA if args.dataset.startswith('train') \
                else GENERATED_LR_PREPROCESSED_DEV_DATA
            output_path += str(i)
            write_pickle(output_path, training_data)

    # only a small subset of the data: process it in one go, without batching
    else:
        partial_results = pool.map(preprocess_claim_with_doc,
                                   claims_and_retrieved_docs.iterrows())
        print('Merging partial results...')
        preprocessed = list(chain.from_iterable(partial_results))

        training_data = pd.DataFrame.from_records(
            preprocessed, columns=PREPROCESSED_DATA_COLUMNS_V1)
        output_path = GENERATED_LR_PREPROCESSED_TRAINING_DATA if args.dataset.startswith('train') \
            else GENERATED_LR_PREPROCESSED_DEV_DATA
        if args.file:
            output_path += os.path.basename(args.file)
        write_pickle(output_path, training_data)
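The batching pattern above (split the DataFrame with np.array_split, map each row over a process pool, then flatten the per-row record lists with chain.from_iterable) can be exercised in isolation. A self-contained sketch with dummy data and a stand-in preprocess_claim_with_doc, both hypothetical, just to illustrate the control flow:

from itertools import chain
from multiprocessing import Pool

import numpy as np
import pandas as pd


def preprocess_claim_with_doc(index_and_row):
    # stand-in for the real preprocessing: return a list of records per row
    index, row = index_and_row
    return [(index, row['claim'].lower())]


if __name__ == '__main__':
    claims = pd.DataFrame({'claim': ['Claim A', 'Claim B', 'Claim C', 'Claim D']})
    with Pool() as pool:
        for i, batch in enumerate(np.array_split(claims, 2)):
            partial_results = pool.map(preprocess_claim_with_doc, batch.iterrows())
            merged = list(chain.from_iterable(partial_results))
            print('batch', i, merged)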
Example no. 3
                line_text, preprocess_doc_title(page_id))

    combined_evidence = ' '.join(evidence_sentences)
    avg_sentence_position = np.mean(evidence_sentence_positions)

    # `nn_input` avoids shadowing the built-in input()
    nn_input = transform_NN_input(claim, combined_evidence,
                                  num_evidence_docs_for_claim, num_references,
                                  num_evidence_items,
                                  num_coordination_terms_evidence_claim,
                                  num_coordination_terms_titles_claim,
                                  avg_sentence_position, num_evidence_words)
    preprocessed_pairs.append((claim_id, nn_input, output))

    return preprocessed_pairs


if __name__ == '__main__':
    training_data = get_all_claims(args.dataset)
    if args.debug:
        training_data = training_data.head(n=3)

    pool = get_process_pool(args.cores)
    partial_results = pool.map(preprocess_claim, training_data.iterrows())
    pool.close()
    print('Merging partial results...')
    preprocessed = list(chain.from_iterable(partial_results))

    preprocessed_df = pd.DataFrame.from_records(
        preprocessed, columns=PREPROCESSED_DATA_COLUMNS_V2)
    output_path = GENERATED_NN_PREPROCESSED_DATA.format(args.dataset, 'v4')
    write_pickle(output_path, preprocessed_df)
Example no. 4
                loss.item()))
            loss_history.append(loss.item())

print('Done with training...')

# test result
# note: equivalent logic to util.evaluation.get_accuracy(), but optimised for pytorch
model.eval()  # switch to evaluation mode (disables dropout/batch norm updates)
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in dev_loader:
        # Variable is deprecated since PyTorch 0.4; tensors can be moved to the device directly
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Accuracy on dev subset: {:.5f}.'.format(correct / total))

# save results
write_pickle(GENERATED_NEURAL_NETWORK_MODEL.format(args.preprocessed_format),
             model)
write_pickle(
    GENERATED_NEURAL_NETWORK_LOSS_HISTORY.format(args.preprocessed_format),
    loss_history)

# plot loss history
plot_loss_values(num_iterations * args.num_epochs, args.learning_rate,
                 loss_history, loss_history_frequency)
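plot_loss_values is imported from util.plots and not shown here. A minimal sketch of how such a helper might render the recorded loss history with matplotlib; the signature is inferred from the call sites above, so treat the body as an assumption:

import matplotlib.pyplot as plt


def plot_loss_values(num_iterations, learning_rate, loss_values, frequency):
    # hypothetical helper: one loss value was recorded every `frequency` iterations
    iterations = range(0, frequency * len(loss_values), frequency)
    plt.plot(iterations, loss_values)
    plt.xlabel('iteration')
    plt.ylabel('loss')
    plt.title('Loss history (lr={}, {} iterations)'.format(learning_rate, num_iterations))
    plt.show()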
Example no. 5
import argparse

import pandas as pd

from _4_B_fit_LR_model import fit_and_get_model, LOSS_HISTORY_FREQUENCY
from dataaccess.files_constants import GENERATED_LOGISTIC_REGRESSION_MODEL, \
    GENERATED_LOGISTIC_REGRESSION_LOSS_HISTORY, GENERATED_LR_PREPROCESSED_TRAINING_DATA
from dataaccess.files_io import read_pickle, write_pickle
from util.LR_NN_preprocessing import extract_input_and_expected
from util.plots import plot_loss_values

parser = argparse.ArgumentParser()
parser.add_argument('--debug', help='use less data and fewer learning iterations', action='store_true')
parser.add_argument('--num_iterations', type=int, default=100000)
parser.add_argument('--learning_rate', type=float, default=0.1)
args = parser.parse_args()


if __name__ == '__main__':
    training_data: pd.DataFrame = read_pickle(GENERATED_LR_PREPROCESSED_TRAINING_DATA)
    train_input, train_expected = extract_input_and_expected(training_data)

    model, loss_values = fit_and_get_model(train_input, train_expected, args.num_iterations, args.learning_rate)
    write_pickle(GENERATED_LOGISTIC_REGRESSION_LOSS_HISTORY, loss_values)  # for plotting
    write_pickle(GENERATED_LOGISTIC_REGRESSION_MODEL, model)

    plot_loss_values(args.num_iterations, args.learning_rate, loss_values, LOSS_HISTORY_FREQUENCY)
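fit_and_get_model and LOSS_HISTORY_FREQUENCY come from _4_B_fit_LR_model, which is not part of this excerpt. A rough sketch of a compatible implementation, assuming extract_input_and_expected yields array-like features and binary labels, using a single-layer PyTorch model trained with plain SGD; the actual module may be structured differently:

import torch

LOSS_HISTORY_FREQUENCY = 1000  # assumed value: record the loss every N iterations


def fit_and_get_model(train_input, train_expected, num_iterations, learning_rate):
    inputs = torch.tensor(train_input, dtype=torch.float32)
    targets = torch.tensor(train_expected, dtype=torch.float32).reshape(-1, 1)

    # logistic regression as a single linear layer followed by a sigmoid
    model = torch.nn.Sequential(torch.nn.Linear(inputs.shape[1], 1), torch.nn.Sigmoid())
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    loss_values = []
    for iteration in range(num_iterations):
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
        if iteration % LOSS_HISTORY_FREQUENCY == 0:
            loss_values.append(loss.item())

    return model, loss_values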