Example #1
# os and torch are required imports; PathConfig, ModelConfig, PDTBDataSet, Vocab and the
# special-token constants (PAD_WORD, UNK_WORD, ...) are assumed to be provided by the
# surrounding project.
import os

import torch


def pdtb_prepare(args):
    print('Loading dataset...')
    # map the configured PDTB section numbers to their JSON directories for each split
    train_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num))
                      for section_num in PathConfig.train_sections]
    dev_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num))
                    for section_num in PathConfig.dev_sections]
    test_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num))
                     for section_num in PathConfig.test_sections]
    # level-2 (fine-grained) senses for tasks named 'fine*', level-1 (coarse) senses otherwise
    dataset = PDTBDataSet(train_sections, dev_sections, test_sections, level=2 if args.task.startswith('fine') else 1)
    print('Size of train: {}, dev: {}, test: {}'.format(len(dataset.train_set), len(dataset.dev_set),
                                                        len(dataset.test_set)))
    print('Creating word vocab...')
    # make sure the output directory exists before saving the vocab and dataset objects
    if not os.path.exists(PathConfig.experiment_data_dir):
        os.mkdir(PathConfig.experiment_data_dir)
    # word vocabulary, seeded with the special tokens the model expects
    word_vocab = Vocab(mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    for word in dataset.get_all_words():
        word_vocab.add(word)
    word_vocab.load_pretrained_emb(PathConfig.embedding_path)
    print('Size of word vocab: {}'.format(word_vocab.size()))
    torch.save(word_vocab, os.path.join(PathConfig.experiment_data_dir, 'word_vocab.obj'))
    # the tag vocabulary gets freshly initialised embeddings of size ModelConfig.tag_embed_dim
    # rather than pretrained ones
    tag_vocab = Vocab()
    for tag in dataset.get_all_tags():
        tag_vocab.add(tag)
    print('Size of tag vocab: {}'.format(tag_vocab.size()))
    tag_vocab.init_embed(ModelConfig.tag_embed_dim)
    torch.save(tag_vocab, os.path.join(PathConfig.experiment_data_dir, 'tag_vocab.obj'))
    print('Formatting the dataset to torch variables...')
    dataset.format_instances_to_torch_var(word_vocab, tag_vocab)
    torch.save(dataset, os.path.join(PathConfig.experiment_data_dir, 'dataset.obj'))
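A minimal sketch of how pdtb_prepare might be driven from the command line, assuming an argparse-style entry point; the --task flag name and its choices are assumptions, not confirmed by the source:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # a task name starting with 'fine' selects level-2 (fine-grained) senses in
    # pdtb_prepare; anything else falls back to level-1 (coarse) senses
    parser.add_argument('--task', default='fine_grained',
                        choices=['fine_grained', 'coarse'])
    args = parser.parse_args()
    pdtb_prepare(args)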
Example #2
# os and torch are required imports; paths, args, PDTBDataSet, Vocab and the special-token
# constants are assumed to be module-level objects defined elsewhere in the project
# (note that args is read as a global here, not passed as a parameter).
import os

import torch


def prepare_data():
    # load the dataset: one PDTBDataSet per split, built from the configured sections
    train_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.train_sections
    ]
    dev_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.dev_sections
    ]
    test_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.test_sections
    ]
    # only the dev and test splits keep multiple gold labels per instance
    train_dataset = PDTBDataSet(train_sections,
                                tree_type=args.tree_type,
                                level=args.level,
                                multiple_labels=False)
    dev_dataset = PDTBDataSet(dev_sections,
                              tree_type=args.tree_type,
                              level=args.level,
                              multiple_labels=True)
    test_dataset = PDTBDataSet(test_sections,
                               tree_type=args.tree_type,
                               level=args.level,
                               multiple_labels=True)
    if not (train_dataset.consistent_with(dev_dataset)
            and dev_dataset.consistent_with(test_dataset)):
        print('Dataset labels are not consistent.')
        print('Train: {}'.format(sorted(train_dataset.label_map.keys())))
        print('Dev: {}'.format(sorted(dev_dataset.label_map.keys())))
        print('Test: {}'.format(sorted(test_dataset.label_map.keys())))
    print('Size of train set: {}, dev set: {}, test set: {}'.format(
        len(train_dataset), len(dev_dataset), len(test_dataset)))
    # save the dataset
    torch.save(train_dataset, os.path.join(paths.experiment_data_dir, 'train.data'))
    torch.save(dev_dataset, os.path.join(paths.experiment_data_dir, 'dev.data'))
    torch.save(test_dataset, os.path.join(paths.experiment_data_dir, 'test.data'))
    # build the vocab
    vocab = Vocab(
        mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    all_words = (train_dataset.get_all_words() + dev_dataset.get_all_words()
                 + test_dataset.get_all_words())
    # all_words = train_dataset.get_all_words()
    for word in all_words:
        vocab.add(word)
    # load and initialize the embeddings
    vocab.load_pretrained_emb(paths.embedding_path)
    print('Size of PDTB vocabulary: {}'.format(vocab.size()))
    # save the vocab
    torch.save(vocab, paths.vocab_path)
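A small companion sketch: since prepare_data() persists each split and the vocabulary with torch.save, a later training script would typically reload them with torch.load; the variable names below are illustrative only, and paths is the same project config object used above:

import os

import torch

train_dataset = torch.load(os.path.join(paths.experiment_data_dir, 'train.data'))
dev_dataset = torch.load(os.path.join(paths.experiment_data_dir, 'dev.data'))
test_dataset = torch.load(os.path.join(paths.experiment_data_dir, 'test.data'))
vocab = torch.load(paths.vocab_path)
print('Reloaded {} train / {} dev / {} test instances, vocab size {}'.format(
    len(train_dataset), len(dev_dataset), len(test_dataset), vocab.size()))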