Esempio n. 1
0
    def make_vocab_label(self,
                         sents,
                         vocab_label_init=None):
        """Build a label vocabulary from the BIO labels found in ``sents``.

        Args:
            sents: Sized collection of sentence objects. Each one must expose
                ``has_prds`` and ``prd_bio_labels`` (an iterable of label
                sequences); sentences whose ``has_prds`` is falsy are skipped.
            vocab_label_init: Optional existing vocabulary. When truthy it is
                deep-copied and extended, so the caller's object is left
                untouched. Otherwise a fresh ``Vocab`` seeded with ``'O'``
                is used.

        Returns:
            The vocabulary, or ``None`` when ``sents`` is empty.
        """
        if len(sents) == 0:
            return None

        if vocab_label_init:
            vocab_label = deepcopy(vocab_label_init)
        else:
            vocab_label = Vocab()
            vocab_label.add_word('O')

        # Count every label of every proposition in one pass.
        label_freq = Counter(
            label
            for sent in sents
            if sent.has_prds
            for prop in sent.prd_bio_labels
            for label in prop
        )

        # Register labels from most to least frequent.
        for label, _ in label_freq.most_common():
            vocab_label.add_word(label)

        return vocab_label
Esempio n. 2
0
    def make_vocab_label(self,
                         sents,
                         vocab_label_init=None):
        """Build a label vocabulary from BIO labels with the prefix removed.

        Args:
            sents: Sized collection of sentence objects exposing
                ``prd_bio_labels`` (an iterable of label sequences).
            vocab_label_init: Optional existing vocabulary. When truthy it is
                deep-copied and extended. Otherwise a fresh ``Vocab`` is
                seeded with six core labels whose spelling depends on
                ``self.argv.data_type`` (``A0``..``A5`` for ``'conll05'``,
                ``ARG0``..``ARG5`` for anything else).

        Returns:
            The vocabulary, or ``None`` when ``sents`` is empty.
        """
        if len(sents) == 0:
            return None

        if vocab_label_init:
            vocab_label = deepcopy(vocab_label_init)
        else:
            vocab_label = Vocab()
            seed_labels = (
                ["A0", "A1", "A2", "A3", "A4", "A5"]
                if self.argv.data_type == 'conll05'
                else ["ARG0", "ARG1", "ARG2", "ARG3", "ARG4", "ARG5"]
            )
            for seed in seed_labels:
                vocab_label.add_word(seed)

        tag_freq = Counter(
            tag
            for sent in sents
            for props in sent.prd_bio_labels
            for tag in props
        )

        # Visit tags from most to least frequent.
        for tag, _ in tag_freq.most_common():
            # Skip tags ending in '-V' and single-character tags such as 'O'.
            if tag.endswith('-V') or len(tag) <= 1:
                continue
            # Drop the first two characters (the 'B-' / 'I-' style prefix).
            vocab_label.add_word(tag[2:])

        return vocab_label
Esempio n. 3
0
# Percentage of the unique corpus words that have no GloVe embedding.
missing_ratio = round((1.0 * missing_words / len(word_counts)) * 100, 4)

print('Number of words missing from GloVe:', missing_words)
print('Percent of words that are missing from vocabulary: {}%'.format(
    missing_ratio))

# Limit the vocab that we will use to words that appear >= threshold or are in GloVe
vocab = Vocab()

# Minimum count a word needs to be kept when it has no GloVe embedding
threshold = 10

for word, count in word_counts.items():
    if count >= threshold or word in glove_embeddings:
        vocab.add_word(word)

# Special tokens that will be added to our vocab
codes = ["<UNK>", "<EOS>", "<GO>", "<PAD>"]

# Add codes to vocab
for code in codes:
    vocab.add_word(code)

# Vocabulary size as a percentage of the unique corpus words. The ratio is
# scaled to a percentage before rounding (same form as missing_ratio); the
# previous expression added 4 to the ratio itself, which inflated the reported
# figure by 400 percentage points.
# NOTE(review): len(vocab) presumably includes the special tokens added
# above -- confirm against Vocab.__len__.
usage_ratio = round((1.0 * len(vocab) / len(word_counts)) * 100, 4)

print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab))
print("Percent of words we will use: {}%".format(usage_ratio))

# save vocabulary
Esempio n. 4
0
 def make_vocab_word(word_list):
     """Return a new ``Vocab`` holding ``UNK`` followed by every word given.

     ``UNK`` is registered before any entry of ``word_list``; the words are
     then added in iteration order.
     """
     vocab = Vocab()
     vocab.add_word(UNK)
     for token in word_list:
         vocab.add_word(token)
     return vocab
Esempio n. 5
0
def make_vocab_from_ids(key_value_format):
    """Return a new ``Vocab`` containing the key of every (key, value) pair.

    ``key_value_format`` must yield two-item pairs; only the first item of
    each pair is added, in iteration order. The second item is ignored.
    """
    result = Vocab()
    for entry_key, _ in key_value_format:
        result.add_word(entry_key)
    return result