def build_vocab(cls, json, tokenized_captions, threshold):
    """Build a Vocabulary from pre-tokenized COCO captions.

    Args:
        json: Path to the COCO annotation JSON file.
        tokenized_captions: Mapping from annotation id to that caption's
            token list (tokenization was done up front by the caller).
        threshold: Minimum word frequency; rarer words are discarded.

    Returns:
        A Vocabulary containing every word whose count >= threshold.
    """
    print("Building vocabulary")
    coco = COCO(json)
    counter = Counter()
    # Count token frequencies across all annotations. Captions are looked
    # up by annotation id in the precomputed token mapping.
    for ann_id in coco.anns.keys():
        counter.update(tokenized_captions[ann_id])

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Creates a vocab wrapper.
    # NOTE(review): unlike the sibling build_vocab below, no special tokens
    # (<pad>/<start>/<end>/<unk>) are added here — confirm this is intended,
    # since adding them now would shift all existing word indices.
    vocab = Vocabulary()

    # Adds the words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    print("Total vocabulary size: %d" % len(vocab))
    return vocab
def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper from COCO captions.

    Args:
        json: Path to the COCO annotation JSON file.
        threshold: Minimum word frequency; rarer words are discarded.

    Returns:
        A Vocabulary seeded with <pad>/<start>/<end>/<unk> followed by
        every caption word whose frequency is >= threshold.
    """
    coco = COCO(json)
    counter = Counter()
    ids = coco.anns.keys()
    total = len(ids)  # hoisted: invariant across the loop
    for i, ann_id in enumerate(ids):
        caption = str(coco.anns[ann_id]['caption'])
        # Lowercase before tokenizing so counts are case-insensitive.
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)
        if i % 1000 == 0:
            print("[%d/%d] Tokenized the captions." % (i, total))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Creates a vocab wrapper and add some special tokens. Order matters:
    # the special tokens take the first four indices.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Adds the words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    return vocab