Beispiel #1
0
    def build_vocab(self):
        """Build word, character and label vocabularies with inverse maps.

        Collects every token word, every character and every sentence label
        found in the train, validation and test datasets, adds the padding
        entries ``PAD_TOKEN`` and ``PAD_CHAR``, and stores the resulting
        lookup tables on the instance as ``word2idx``/``idx2word``,
        ``char2idx``/``idx2char`` and ``label2idx``/``idx2label``.
        """
        all_words = set()
        all_chars = set()
        all_labels = set()

        # Every split contributes to the vocabularies in exactly the same
        # way, so a single loop replaces three copies of the same body.
        datasets = (
            self.train_data_obj,
            self.valid_data_obj,
            self.test_data_obj,
        )
        for dataset in datasets:
            for sentence in dataset:
                all_words.update({token.word for token in sentence.tokens})
                all_chars.update({
                    char.char
                    for token in sentence.tokens
                    for char in token.chars
                })
                all_labels.add(sentence.label)

        all_words.add(PAD_TOKEN)
        all_chars.add(PAD_CHAR)

        # NOTE(review): the lists below come from sets, whose iteration order
        # is arbitrary; whether indices are reproducible across runs depends
        # on how get_vocab_dict orders its input -- confirm.
        word2idx = get_vocab_dict(list(all_words))
        idx2word = {idx: word for word, idx in word2idx.items()}

        char2idx = get_vocab_dict(list(all_chars))
        idx2char = {idx: char for char, idx in char2idx.items()}

        label2idx = get_vocab_dict(list(all_labels))
        idx2label = {idx: label for label, idx in label2idx.items()}

        self.word2idx = word2idx
        self.idx2word = idx2word

        self.char2idx = char2idx
        self.idx2char = idx2char

        self.label2idx = label2idx
        self.idx2label = idx2label
Beispiel #2
0
    def build_vocab(self):
        """Create index lookups for words, POS tags and dependency labels.

        Gathers the distinct ``word``, ``pos`` and ``dep`` values of every
        token in ``self.train_data``, adds the corresponding fields of
        ``ROOT_TOKEN``, ``NULL_TOKEN`` and ``UNK_TOKEN``, and stores the
        forward and inverse mappings on the instance. The dependency mapping
        is additionally bound to the module-level global ``dep2idx``.
        """
        # Declared up front: the ``global`` statement has to precede the
        # first use of the name inside the function.
        global dep2idx

        vocab_words, vocab_pos, vocab_dep = set(), set(), set()

        for sent in self.train_data:
            vocab_words |= {tok.word for tok in sent.tokens}
            vocab_pos |= {tok.pos for tok in sent.tokens}
            vocab_dep |= {tok.dep for tok in sent.tokens}

        # The reserved tokens always belong to each vocabulary.
        for reserved in (ROOT_TOKEN, NULL_TOKEN, UNK_TOKEN):
            vocab_words.add(reserved.word)
            vocab_pos.add(reserved.pos)
            vocab_dep.add(reserved.dep)

        word_to_idx = get_vocab_dict(list(vocab_words))
        idx_to_word = {i: w for w, i in word_to_idx.items()}

        pos_to_idx = get_vocab_dict(list(vocab_pos))
        idx_to_pos = {i: p for p, i in pos_to_idx.items()}

        # Rebinds the module-level ``dep2idx`` (see ``global`` above).
        dep2idx = get_vocab_dict(list(vocab_dep))
        idx_to_dep = {i: d for d, i in dep2idx.items()}

        self.word2idx, self.idx2word = word_to_idx, idx_to_word
        self.pos2idx, self.idx2pos = pos_to_idx, idx_to_pos
        self.dep2idx, self.idx2dep = dep2idx, idx_to_dep