def build_vocab(file_path, threshold):
    """Build a simple vocabulary wrapper from dependency-parsed sentences.

    Args:
        file_path: Path to a JSON file whose entries are sentence dicts, each
            with a 'depends' list of dependency edges (governorGloss/governor,
            dependentGloss/dependent, dep). -- assumes Stanford-style
            dependency output; TODO confirm against the data producer.
        threshold: Minimum token frequency; words seen fewer times are
            discarded from the vocabulary.

    Returns:
        A Vocabulary containing the special tokens <ROOT>, <EOB>, <UNK>,
        <PAD> followed by every word meeting the frequency threshold.
    """
    with open(file_path, 'r') as f:
        dataset = json.load(f)

    counter = Counter()
    for i, sen in enumerate(dataset):
        # Rebuild the dependency graph for this sentence, then tokenize its
        # linearized (string) form to accumulate word frequencies.
        g = Graph()
        for dep in sen['depends']:
            gov_node = g.add_node(dep['governorGloss'], dep['governor'], "")
            dep_node = g.add_node(dep['dependentGloss'], dep['dependent'], "")
            g.add_edge(gov_node, dep_node, dep['dep'])
        caption = str(g)  # idiomatic form of g.__str__()
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        # Progress report every 1000 sentences.
        if (i + 1) % 1000 == 0:
            print("{} # {}".format(i + 1, caption))
            print("[{}/{}] Tokenized the captions.".format(i + 1, len(dataset)))

    # If the word frequency is less than 'threshold', the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add the special tokens:
    # <ROOT>: tree root, <EOB>: end of branch, <UNK>: unknown, <PAD>: padding.
    vocab = Vocabulary()
    vocab.add_word('<ROOT>')
    vocab.add_word('<EOB>')
    vocab.add_word('<UNK>')
    vocab.add_word('<PAD>')

    # Add the frequent words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    return vocab