def main(args):
    """Build a seq2seq dataset pickle from a JSONL input file.

    Reads samples from ``args.input_data_path`` (one JSON object per line),
    tokenizes their ``text`` fields using the vocabulary of a pre-built
    pickled embedding, and writes the processed dataset to ``data.pkl``.
    """
    # Loading dataset from a jsonl file: one JSON object per line.
    # (Fixed: the loop variable no longer shadows the resulting list.)
    with open(args.input_data_path) as f:
        valid = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in valid]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    # NOTE(review): `words` is only needed when building a fresh Embedding
    # from GloVe vectors; here the embedding is loaded from a pickle, so the
    # result is unused. The call is kept in case collect_words mutates the
    # tokenizer — confirm before removing.
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    # The embedding was built offline (from ./glove.6B.300d.txt) and pickled;
    # the dead construction code that was left here in a string literal has
    # been removed.
    with open('./embedding.pkl', 'rb') as file:
        embedding = pickle.load(file)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, valid),
        'data.pkl',
        tokenizer.pad_token_id
    )
def main(args):
    """Build the valid seq2seq dataset using a pre-built pickled embedding.

    Reads samples from ``args.valid_data_path`` (one JSON object per line),
    tokenizes their ``text`` fields with the vocabulary of the pickled
    embedding ``embedding2.pkl``, and writes the processed dataset to
    ``valid_seq2seq.pkl``.
    """
    # Loading dataset from a jsonl file. The commented-out config/train/test
    # handling that cluttered the original has been removed; this variant
    # only processes the validation split.
    # (Fixed: the loop variable no longer shadows the resulting list.)
    with open(args.valid_data_path) as f:
        valid = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in valid]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    # NOTE(review): `words` is unused because the vocabulary comes from the
    # pickled embedding below. The call is kept in case collect_words
    # mutates the tokenizer — confirm before removing.
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    with open('embedding2.pkl', 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, valid),
        'valid_seq2seq.pkl',
        tokenizer.pad_token_id
    )
def main(args):
    """Create the test seq2seq dataset.

    Reads JSONL samples from ``args.test_input``, sets the tokenizer
    vocabulary from the pickled embedding at ``args.embedding_file``, and
    writes the processed dataset to ``args.test_output``.
    """
    with open(args.test_input) as f:
        samples = [json.loads(row) for row in f]

    logging.info('Collecting documents...')
    docs = [entry['text'] for entry in samples]

    logging.info('Collecting words in documents...')
    tok = Tokenizer(lower=True)
    collected = tok.collect_words(docs)

    logging.info('Loading embedding...')
    with open(args.embedding_file, 'rb') as fp:
        emb = pickle.load(fp)
    tok.set_vocab(emb.vocab)

    logging.info('Creating test dataset...')
    processed = process_samples(tok, samples)
    create_seq2seq_dataset(processed, args.test_output, tok.pad_token_id)
def main(args):
    """Preprocess train/valid/test JSONL corpora into seq2seq pickles.

    Reads paths and options from ``config.json`` under ``args.output_dir``,
    builds an Embedding over every observed word, pickles it, then writes
    one processed dataset pickle per split into ``args.output_dir``.
    """
    with open(args.output_dir / 'config.json') as f:
        config = json.load(f)

    # Each split lives in its own jsonl file: one JSON object per line.
    def _read_jsonl(path):
        with open(path) as fp:
            return [json.loads(row) for row in fp]

    train = _read_jsonl(config['train'])
    valid = _read_jsonl(config['valid'])
    test = _read_jsonl(config['test'])

    logging.info('Collecting documents...')
    # Train contributes both text and summary; valid/test contribute text only.
    documents = (
        [s['text'] for s in train]
        + [s['summary'] for s in train]
        + [s['text'] for s in valid]
        + [s['text'] for s in test]
    )

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=config['lower_case'])
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    embedding = Embedding(config['embedding'], words=words)
    with open(args.output_dir / 'embedding.pkl', 'wb') as f:
        pickle.dump(embedding, f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating train dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, train),
        args.output_dir / 'train.pkl',
        config,
        tokenizer.pad_token_id,
    )

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, valid),
        args.output_dir / 'valid.pkl',
        config,
        tokenizer.pad_token_id,
    )

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, test),
        args.output_dir / 'test.pkl',
        config,
        tokenizer.pad_token_id,
    )