def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper from COCO caption annotations.

    Args:
        json: path to the COCO annotation file (currently unused — see NOTE).
        threshold: minimum word frequency; rarer words are discarded.

    Returns:
        A ``Vocabulary`` containing the special tokens ``<pad>``, ``<start>``,
        ``<end>``, ``<unk>`` followed by every caption word whose count is at
        least ``threshold``.
    """
    # NOTE(review): `coco` and `ids` are referenced but never defined in this
    # function, and the `json` parameter is never used. Presumably something
    # like `coco = COCO(json); ids = coco.anns.keys()` was intended — confirm
    # against the rest of the project before relying on this function.
    counter = Counter()
    for i, ann_id in enumerate(ids):
        caption = str(coco.anns[ann_id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)
        # Periodic progress report for large annotation sets.
        if (i + 1) % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(i + 1, len(ids)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the frequent words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    return vocab
def build_vocabulary(self, threshold):
    """Build a Vocabulary from image descriptions and story captions.

    Side effects: widens ``self.input_maxlen`` / ``self.output_maxlen`` to
    fit the longest tokenized sequence plus two slots (room for the
    ``<start>``/``<end>`` tokens), and pickles the result to
    ``./Dataset/vocabulary.pkl``.

    Args:
        threshold: minimum word frequency; rarer words are discarded.

    Returns:
        The populated ``Vocabulary``.
    """
    counter = Counter()
    # Hoisted out of the loops: the tokenizer is loop-invariant.
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

    # Count words in the per-image descriptions, tracking the longest one.
    for image_id in self.image_desc:
        caption = self.image_desc[image_id]
        tokens = tokenizer.tokenize(caption.lower())
        if len(tokens) > self.input_maxlen - 2:
            self.input_maxlen = len(tokens) + 2
        counter.update(tokens)

    # Count words in story sequences; each story contributes an input side
    # (image descriptions referenced by seq[1]) and an output side (seq[2]).
    for story_id in self.story_data:
        temp_in = 0
        temp_out = 0
        for seq in self.story_data[story_id]:
            caption = seq[2]
            tokens = tokenizer.tokenize(caption.lower())
            counter.update(tokens)
            temp_out = temp_out + len(tokens)
            caption_in = self.image_desc[seq[1]]
            tokens = tokenizer.tokenize(caption_in.lower())
            temp_in = temp_in + len(tokens)
            counter.update(tokens)
        if temp_out > self.output_maxlen - 2:
            self.output_maxlen = temp_out + 2
        if temp_in > self.input_maxlen - 2:
            # BUG FIX: the original assigned `temp_out + 2` here, clobbering
            # the input length bound with the *output* length.
            self.input_maxlen = temp_in + 2

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocabulary = Vocabulary()
    vocabulary.add_word('<pad>')
    vocabulary.add_word('<start>')
    vocabulary.add_word('<end>')
    vocabulary.add_word('<unk>')

    # Add the frequent words to the vocabulary.
    for word in words:
        vocabulary.add_word(word)

    # Persist so later runs can load the vocabulary instead of rebuilding it.
    with open("./Dataset/vocabulary.pkl", "wb") as f:
        pickle.dump(vocabulary, f)
    return vocabulary
def make_data_set_and_vocab(trainpath=None, vectorpath=None, threshhold=0):
    """Create a MyDataset and its Vocabulary from a training corpus.

    Args:
        trainpath: path to the training text, one example per line.
        vectorpath: optional path of a pre-built vocabulary to load first.
        threshhold: words whose count is <= this value are excluded.

    Returns:
        A ``(data_set, vocab)`` tuple; the vocabulary is also saved as 'vocab'.
    """
    vocab = Vocabulary()
    if vectorpath is not None:
        vocab.load(vectorpath)

    # Tally word frequencies over the whole corpus.
    counter = collections.Counter()
    with open(trainpath, 'r') as f:
        for line in f:
            counter.update(make_wakati(line.strip()))

    # most_common() yields counts in descending order, so once a count drops
    # to the threshold every remaining word is also too rare — stop there.
    for word, freq in counter.most_common():
        if freq <= threshhold:
            break
        if word not in vocab:
            vocab.add_word(word)
    vocab.save('vocab')

    # Build the dataset on top of the finished vocabulary.
    data_set = MyDataset(trainpath=trainpath, vocab=vocab)
    return data_set, vocab
def build_vocabulary(self, threshold):
    """Build a Vocabulary from the image descriptions.

    Side effect: widens ``self.max_length`` to fit the longest tokenized
    caption plus two slots (room for the ``<start>``/``<end>`` tokens).

    Args:
        threshold: minimum word frequency; rarer words are discarded.

    Returns:
        The populated ``Vocabulary``.
    """
    counter = Counter()
    for image_id in self.image_desc:
        caption = self.image_desc[image_id]
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        if len(tokens) > self.max_length - 2:
            self.max_length = len(tokens) + 2
        counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocabulary = Vocabulary()
    vocabulary.add_word('<pad>')
    vocabulary.add_word('<start>')
    vocabulary.add_word('<end>')
    vocabulary.add_word('<unk>')

    # Add the frequent words to the vocabulary.
    for word in words:
        vocabulary.add_word(word)
    return vocabulary
def build_vocabulary(self, threshold):
    """Build a Vocabulary from image descriptions plus COCO annotations.

    Side effects: widens ``self.max_length`` to fit the longest tokenized
    caption plus two slots (room for the ``<start>``/``<end>`` tokens), and
    pickles the result to ``./Dataset/vocabulary.pkl``.

    Args:
        threshold: minimum word frequency; rarer words are discarded.

    Returns:
        The populated ``Vocabulary``.

    TODO(review): a disabled code path loaded a cached vocabulary from
    ./Dataset/vocabulary.pkl instead of rebuilding; re-enable it if the
    rebuild cost matters.
    """
    counter = Counter()

    # Count words in the per-image descriptions, tracking the longest one.
    for image_id in self.image_desc:
        caption = self.image_desc[image_id]
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        if len(tokens) > self.max_length - 2:
            self.max_length = len(tokens) + 2
        counter.update(tokens)

    # Also count words from the COCO caption annotations.
    for annot in self.coco_desc:
        caption = annot['caption']
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        if len(tokens) > self.max_length - 2:
            self.max_length = len(tokens) + 2
        counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocabulary = Vocabulary()
    vocabulary.add_word('<pad>')
    vocabulary.add_word('<start>')
    vocabulary.add_word('<end>')
    vocabulary.add_word('<unk>')

    # Add the frequent words to the vocabulary.
    for word in words:
        vocabulary.add_word(word)

    # Persist so later runs can load the vocabulary instead of rebuilding it.
    with open("./Dataset/vocabulary.pkl", "wb") as f:
        pickle.dump(vocabulary, f)
    return vocabulary