Ejemplo n.º 1
0
    def load(self):
        """Load the vocabulary, the CoNLL datasets and the optional embeddings.

        Everything is configured through ``self.o``:

        * ``vocab_path`` is read line by line and only the first
          ``num_lex + 2`` lines are kept.
        * With ``use_pair`` set, each line is whitespace-split into a word
          followed by zero or more tags; ``self.pos_dict`` then maps
          ``word_vocab[word]`` to the list of ``BLLIP_POS[tag]`` values for
          every line that has at least one tag. Otherwise each stripped line
          is used as a word as-is.
        * ``train_ds``, ``dev_ds``, ``test_ds`` (and ``pretrained_ds`` when
          configured) are wrapped in ``ConllDataset``; batches are built for
          the dev and test sets only.
        * ``self.o.max_len`` and ``self.o.num_tag`` are written back to the
          options object, and ``self.o.num_lex`` is overwritten when
          ``use_pair`` is set.
        * ``word_emb``, ``out_pos_emb`` and ``pos_emb`` are loaded with
          ``np.load`` when their paths are set, else left as ``None``.
        """
        # Read the vocabulary inside a context manager so the file handle is
        # closed deterministically instead of being left to the garbage
        # collector.
        with open(self.o.vocab_path) as vocab_file:
            vocab_lines = [line.strip() for line in vocab_file]
        # NOTE(review): the "+ 2" presumably accounts for the <UNK> and <PAD>
        # entries -- confirm against the vocab file layout.
        vocab_lines = vocab_lines[:self.o.num_lex + 2]

        if self.o.use_pair:
            wordpos_vocab_list = [line.split() for line in vocab_lines]
            word_vocab_list = [wp[0] for wp in wordpos_vocab_list]
            word_vocab = Vocab.from_list(word_vocab_list,
                                         unk='<UNK>',
                                         pad='<PAD>')
            # Only lines that carry at least one tag after the word get an
            # entry in the mapping.
            self.pos_dict = {
                word_vocab[wp[0]]: list(map(BLLIP_POS.__getitem__, wp[1:]))
                for wp in wordpos_vocab_list if len(wp) > 1
            }
        else:
            word_vocab_list = vocab_lines
            word_vocab = Vocab.from_list(word_vocab_list,
                                         unk='<UNK>',
                                         pad='<PAD>')

        self.train_ds = ConllDataset(self.o.train_ds,
                                     pos_vocab=BLLIP_POS,
                                     word_vocab=word_vocab)

        self.dev_ds = ConllDataset(self.o.dev_ds,
                                   pos_vocab=BLLIP_POS,
                                   word_vocab=word_vocab)
        self.test_ds = ConllDataset(self.o.test_ds,
                                    pos_vocab=BLLIP_POS,
                                    word_vocab=word_vocab)

        if self.o.pretrained_ds:
            self.pretrained_ds = ConllDataset(self.o.pretrained_ds,
                                              pos_vocab=BLLIP_POS,
                                              word_vocab=word_vocab)
        else:
            self.pretrained_ds = None

        self.dev_ds.build_batchs(self.o.batch_size)
        self.test_ds.build_batchs(self.o.batch_size)

        if self.o.use_pair:
            # From here on num_lex is the total number of tag entries stored
            # in pos_dict, not the number of vocabulary lines.
            self.o.num_lex = sum(len(p) for p in self.pos_dict.values())
        self.o.max_len = 10
        self.o.num_tag = len(BLLIP_POS) + self.o.num_lex

        if self.o.emb_path:
            # NOTE(review): with use_pair set, this slice uses the num_lex
            # value reassigned above -- confirm that this is intended.
            self.word_emb = np.load(self.o.emb_path)[:self.o.num_lex + 2]
        else:
            self.word_emb = None
        self.out_pos_emb = np.load(
            self.o.out_pos_emb_path) if self.o.out_pos_emb_path else None
        self.pos_emb = np.load(
            self.o.pos_emb_path) if self.o.pos_emb_path else None
Ejemplo n.º 2
0
    def load(self):
        """Load the POS vocabulary, the CoNLL datasets and optional embeddings.

        Reads one vocabulary entry per line from ``self.o.vocab_path``
        (surrounding whitespace stripped), builds a ``Vocab`` from it and
        uses it as ``pos_vocab`` for the train, dev and test ``ConllDataset``
        objects, plus ``pretrained_ds`` when ``self.o.pretrained_ds`` is set
        (otherwise ``self.pretrained_ds`` is ``None``). Batches are built for
        the dev and test sets only. ``self.word_emb`` is loaded with
        ``np.load`` from ``self.o.emb_path`` when set, else ``None``.
        """
        # Use a context manager so the vocabulary file handle is closed
        # deterministically rather than leaked until garbage collection.
        with open(self.o.vocab_path) as vocab_file:
            pos_vocab_list = [line.strip() for line in vocab_file]
        pos_vocab = Vocab.from_list(pos_vocab_list)

        self.train_ds = ConllDataset(self.o.train_ds, pos_vocab=pos_vocab)
        self.dev_ds = ConllDataset(self.o.dev_ds, pos_vocab=pos_vocab)
        self.test_ds = ConllDataset(self.o.test_ds, pos_vocab=pos_vocab)

        if self.o.pretrained_ds:
            self.pretrained_ds = ConllDataset(self.o.pretrained_ds, pos_vocab=pos_vocab)
        else:
            self.pretrained_ds = None

        self.dev_ds.build_batchs(self.o.batch_size)
        self.test_ds.build_batchs(self.o.batch_size)

        if self.o.emb_path:
            self.word_emb = np.load(self.o.emb_path)
        else:
            self.word_emb = None