def pdtb_prepare(args):
    """Load the PDTB dataset, build word and tag vocabularies, and save
    all artifacts under ``PathConfig.experiment_data_dir``.

    Args:
        args: parsed command-line arguments; only ``args.task`` is read —
            a task name starting with ``'fine'`` selects level-2
            (fine-grained) senses, otherwise level 1 is used.

    Side effects:
        Writes ``word_vocab.obj``, ``tag_vocab.obj`` and ``dataset.obj``
        into ``PathConfig.experiment_data_dir`` (created if missing) and
        prints progress/statistics to stdout.
    """
    print('Loading dataset...')

    def _section_paths(section_nums):
        # PDTB section directories use zero-padded two-digit names, e.g. '02'.
        return [os.path.join(PathConfig.json_data_dir, '{:02}'.format(num))
                for num in section_nums]

    train_sections = _section_paths(PathConfig.train_sections)
    dev_sections = _section_paths(PathConfig.dev_sections)
    test_sections = _section_paths(PathConfig.test_sections)
    dataset = PDTBDataSet(train_sections, dev_sections, test_sections,
                          level=2 if args.task.startswith('fine') else 1)
    print('Size of train: {}, dev: {}, test: {}'.format(
        len(dataset.train_set), len(dataset.dev_set), len(dataset.test_set)))

    print('Creating word vocab...')
    # makedirs(..., exist_ok=True) replaces the exists()/mkdir() pair: it is
    # race-free and also creates any missing parent directories.
    os.makedirs(PathConfig.experiment_data_dir, exist_ok=True)
    # NOTE(review): 'mannual_add' is the (misspelled) keyword of the Vocab
    # API elsewhere in the project — it must stay spelled this way.
    word_vocab = Vocab(
        mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    for word in dataset.get_all_words():
        word_vocab.add(word)
    word_vocab.load_pretrained_emb(PathConfig.embedding_path)
    print('Size of word vocab: {}'.format(word_vocab.size()))
    torch.save(word_vocab,
               os.path.join(PathConfig.experiment_data_dir, 'word_vocab.obj'))

    tag_vocab = Vocab()
    for tag in dataset.get_all_tags():
        tag_vocab.add(tag)
    print('Size of tag vocab: {}'.format(tag_vocab.size()))
    # Tags have no pretrained embeddings, so they are randomly initialized.
    tag_vocab.init_embed(ModelConfig.tag_embed_dim)
    torch.save(tag_vocab,
               os.path.join(PathConfig.experiment_data_dir, 'tag_vocab.obj'))

    print('Formatting the dataset to torch variables...')
    dataset.format_instances_to_torch_var(word_vocab, tag_vocab)
    torch.save(dataset,
               os.path.join(PathConfig.experiment_data_dir, 'dataset.obj'))
def prepare_data():
    """Build the train/dev/test PDTB datasets and a shared vocabulary,
    then save them under ``paths.experiment_data_dir``.

    Reads the module-level ``args`` (``tree_type``, ``level``) and
    ``paths`` configuration objects.

    Side effects:
        Writes ``train.data``, ``dev.data`` and ``test.data`` into
        ``paths.experiment_data_dir`` (created if missing) and the vocab
        to ``paths.vocab_path``; prints warnings/statistics to stdout.
    """
    def _section_paths(section_nums):
        # PDTB section directories use zero-padded two-digit names, e.g. '02'.
        return [os.path.join(paths.json_data_dir, '{:02}'.format(num))
                for num in section_nums]

    # Load the three splits. Only the training split is restricted to a
    # single label per instance; dev/test keep multiple labels.
    train_dataset = PDTBDataSet(_section_paths(paths.train_sections),
                                tree_type=args.tree_type, level=args.level,
                                multiple_labels=False)
    dev_dataset = PDTBDataSet(_section_paths(paths.dev_sections),
                              tree_type=args.tree_type, level=args.level,
                              multiple_labels=True)
    test_dataset = PDTBDataSet(_section_paths(paths.test_sections),
                               tree_type=args.tree_type, level=args.level,
                               multiple_labels=True)
    # Warn (but deliberately do not abort) when the splits disagree on the
    # label inventory — this mirrors the original best-effort behavior.
    if not (train_dataset.consistent_with(dev_dataset)
            and dev_dataset.consistent_with(test_dataset)):
        print('Dataset labels are not consistent.')
        print('Train: {}'.format(sorted(train_dataset.label_map.keys())))
        print('Dev: {}'.format(sorted(dev_dataset.label_map.keys())))
        print('Test: {}'.format(sorted(test_dataset.label_map.keys())))
    print('Size of train set: {}, dev set: {}, test set: {}'.format(
        len(train_dataset), len(dev_dataset), len(test_dataset)))

    # Ensure the output directory exists before saving (race-free, and
    # creates missing parents as well).
    os.makedirs(paths.experiment_data_dir, exist_ok=True)
    torch.save(train_dataset,
               os.path.join(paths.experiment_data_dir, 'train.data'))
    torch.save(dev_dataset,
               os.path.join(paths.experiment_data_dir, 'dev.data'))
    torch.save(test_dataset,
               os.path.join(paths.experiment_data_dir, 'test.data'))

    # Build the vocabulary over all three splits so dev/test words that
    # have pretrained embeddings are not mapped to UNK.
    # NOTE(review): 'mannual_add' is the (misspelled) keyword of the Vocab
    # API elsewhere in the project — it must stay spelled this way.
    vocab = Vocab(
        mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    for word in itertools.chain(train_dataset.get_all_words(),
                                dev_dataset.get_all_words(),
                                test_dataset.get_all_words()):
        vocab.add(word)
    # Load and initialize the embeddings for the collected vocabulary.
    vocab.load_pretrained_emb(paths.embedding_path)
    print('Size of PDTB vocabulary: {}'.format(vocab.size()))
    torch.save(vocab, paths.vocab_path)