Example 1
import json
import logging
import pickle

def main(args):

    # loading datasets from jsonl files
    with open(args.input_data_path) as f:
        valid = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in valid]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    """
    embedding = Embedding("./glove.6B.300d.txt", words=words)
    with open('./embedding.pkl', 'wb') as f:
        pickle.dump(embedding, f)
    """
    with open('./embedding.pkl', 'rb') as file:
        embedding = pickle.load(file)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, valid), 'data.pkl',
                           tokenizer.pad_token_id)
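Example 1 (and the variants that follow) relies on project-specific helpers such as Tokenizer and Embedding that the snippets never define. The stubs below are only a minimal sketch of the interface the calls above assume (collect_words, set_vocab, pad_token_id, embedding.vocab); the method names come from the snippet itself, while the bodies are illustrative guesses rather than the project's actual implementation. process_samples and create_seq2seq_dataset are likewise project helpers and are left out here.

# Illustrative stubs only; the real classes live elsewhere in the project.
class Tokenizer:
    def __init__(self, lower=True):
        self.lower = lower
        self.vocab = []
        self.pad_token_id = 0

    def collect_words(self, documents):
        # Split every document into tokens and return the unique words.
        words = set()
        for doc in documents:
            text = doc.lower() if self.lower else doc
            words.update(text.split())
        return sorted(words)

    def set_vocab(self, vocab):
        # Adopt the embedding's vocabulary so token ids line up with vectors.
        self.vocab = list(vocab)
        self.pad_token_id = (
            self.vocab.index('<pad>') if '<pad>' in self.vocab else 0
        )


class Embedding:
    def __init__(self, embedding_path, words=None):
        # Keep only the vectors of words that actually occur in the corpus.
        keep = set(words) if words is not None else None
        self.vocab = []
        self.vectors = []
        with open(embedding_path, encoding='utf-8') as f:
            for line in f:
                token, *values = line.rstrip().split(' ')
                if keep is None or token in keep:
                    self.vocab.append(token)
                    self.vectors.append([float(v) for v in values])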
Example 2
def main(args):
    # with open(args.output_dir / 'config.json') as f:
    #     config = json.load(f)

    # loading datasets from jsonl files
    # with open(config['train']) as f:
    #     train = [json.loads(line) for line in f]
    with open(args.valid_data_path) as f:
        valid = [json.loads(line) for line in f]
    # with open(config['test']) as f:
    #     test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in valid]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    with open('embedding2.pkl', 'rb') as f:
        embedding = pickle.load(f)

    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, valid),
                           'valid_seq2seq.pkl', tokenizer.pad_token_id)
Example 3
def main(args):
    
    with open(args.test_input) as f:
        test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in test]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    with open(args.embedding_file, 'rb') as f:
        embedding = pickle.load(f)

    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, test),
        args.test_output,
        tokenizer.pad_token_id
    )
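Example 3 is the only variant that takes every path from the command line. A minimal argument parser that would satisfy it might look like the sketch below; the argument names simply mirror the attributes read in main (test_input, embedding_file, test_output) and are an assumption, not the project's documented CLI.

import argparse
import logging


def parse_args():
    # Argument names mirror the attributes used by main(); they are assumed.
    parser = argparse.ArgumentParser(
        description='Build the pickled test dataset for the seq2seq model.')
    parser.add_argument('test_input', help='test set in jsonl format')
    parser.add_argument('embedding_file', help='pickled Embedding object')
    parser.add_argument('test_output', help='output path for the dataset pickle')
    return parser.parse_args()


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main(parse_args())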
Example 4
def main(args):
    with open(args.output_dir / 'config.json') as f:
        config = json.load(f)

    # loading datasets from jsonl files
    with open(config['train']) as f:
        train = [json.loads(line) for line in f]
    with open(config['valid']) as f:
        valid = [json.loads(line) for line in f]
    with open(config['test']) as f:
        test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = (
        [sample['text'] for sample in train]
        + [sample['summary'] for sample in train]
        + [sample['text'] for sample in valid]
        + [sample['text'] for sample in test]
    )

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=config['lower_case'])
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    embedding = Embedding(config['embedding'], words=words)
    with open(args.output_dir / 'embedding.pkl', 'wb') as f:
        pickle.dump(embedding, f)

    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating train dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, train),
        args.output_dir / 'train.pkl', config,
        tokenizer.pad_token_id
    )
    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, valid),
        args.output_dir / 'valid.pkl', config,
        tokenizer.pad_token_id
    )
    logging.info('Creating test dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, test),
        args.output_dir / 'test.pkl', config,
        tokenizer.pad_token_id
    )
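Example 4 reads all of its settings from config.json inside args.output_dir, and the / joins mean args.output_dir has to be a pathlib.Path. Judging from the keys accessed above (train, valid, test, lower_case, embedding), a compatible config could be produced as sketched below; the file paths are placeholders, not values from the original project.

import json
from pathlib import Path

# Hypothetical config matching the keys read in Example 4; paths are placeholders.
config = {
    'train': 'data/train.jsonl',
    'valid': 'data/valid.jsonl',
    'test': 'data/test.jsonl',
    'lower_case': True,
    'embedding': 'glove.6B.300d.txt',
}

output_dir = Path('model')
output_dir.mkdir(parents=True, exist_ok=True)
with open(output_dir / 'config.json', 'w') as f:
    json.dump(config, f, indent=2)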