def from_dataframe(cls, review_df, cutoff=25):
    """Instantiate the vectorizer from the dataset dataframe.

    Args:
        review_df (pandas.DataFrame): the dataset dataframe
        cutoff (int): frequency threshold for including a word in the vocabulary
    Returns:
        an instance of the ReviewVectorizer
    """
    review_vocab = Vocabulary(add_unk=True)
    rating_vocab = Vocabulary(add_unk=False)

    # Add ratings
    for rating in sorted(set(review_df.rating)):
        rating_vocab.add_token(rating)

    # Add top words if count > provided count
    word_counts = Counter()
    for review in review_df.review:
        for word in review.split(" "):
            if word not in string.punctuation:
                word_counts[word] += 1

    for word, count in word_counts.items():
        if count > cutoff:
            review_vocab.add_token(word)

    return cls(review_vocab, rating_vocab)

def build_vocabs():
    train, dev, test = load_boknilev()
    samples = [
        s for r in train + dev + test
        for s in boknilev_record_to_hcpd_samples(r)
    ]

    gold_pos_vocab = Vocabulary('GOLD_POS')
    gold_pos_vocab.add_words(
        set([hc.next_pos for s in samples for hc in s.x.head_cands]))
    gold_pos_vocab.add_word(None)

    words_vocab = Vocabulary('WORDS')
    words_vocab.add_words(
        set([hc.word for s in samples for hc in s.x.head_cands]))
    words_vocab.add_words(set([s.x.pp.word for s in samples]))
    words_vocab.add_words(set([s.x.child.word for s in samples]))
    words_vocab.add_word(None)

    words_to_lemmas = {}
    words_to_lemmas.update({s.x.child.word: s.x.child.lemma for s in samples})
    words_to_lemmas.update(
        {hc.word: hc.lemma for s in samples for hc in s.x.head_cands})

    return [gold_pos_vocab, words_vocab, words_to_lemmas]

def __init__(self, mode=None):
    super(RegexModel, self).__init__()
    self.mode = mode

    if mode is None or mode == "replacements":
        self.replacements = self._load_replacements()

    if mode is None or mode == "vocabulary":
        self.wikipedia_voc = Vocabulary("wikipedia_lower")
        self.ingredients_voc = Vocabulary(
            "ingredients_fr_tokens") | Vocabulary("ingredients_fr")

def __init__(self, cfg):
    """
    :param cfg: config
    """
    vocab_in_path = f'{cfg.rsc_src}/vocab.in'
    self.vocab_in = Vocabulary(vocab_in_path, cfg.cutoff, SPECIAL_CHARS)
    vocab_out_path = f'{cfg.rsc_src}/vocab.out'
    self.vocab_out = Vocabulary(vocab_out_path, 0, None)
    restore_dic_path = f'{cfg.rsc_src}/restore.dic'
    self.restore_dic = self._load_restore_dic(restore_dic_path)

def __init__(self, data_file, character_level=None, phoneme_level=None,
             vocabulary=None, transform=None):
    self.data_file = data_file
    self.data = joblib.load(open(self.data_file, 'rb'))
    self.character_level = character_level
    self.phoneme_level = phoneme_level
    self.transcription_processor = lambda words: words

    if self.character_level:
        characters = [chr(c) for c in range(ord('a'), ord('z') + 1)]
        characters += [' ']
        character_vocab = Vocabulary()
        for character in characters:
            character_vocab.add_word(character)
        self.vocabulary = character_vocab
        self.transcription_processor = self._character_level_transcription_processor
    elif self.phoneme_level:
        cmu_phones = list(map(lambda x: x[0], cmudict.phones()))
        cmu_phones += [' ']
        phones_vocab = Vocabulary(custom_unk_word=' ')
        for phone in cmu_phones:
            phones_vocab.add_word(phone)
        self.vocabulary = phones_vocab
        self.phones_dict = cmudict.dict()
        self.transcription_processor = self._phone_level_transcription_processor
    elif vocabulary is None:
        data_file_dir = os.path.dirname(self.data_file)
        data_file_prefix = os.path.splitext(self.data_file)[0]
        pickle_file_name = f'{data_file_prefix}_SpeechDataset.pickle'
        pickle_file_path = os.path.join(data_file_dir, pickle_file_name)
        if not os.path.isfile(pickle_file_path):
            dataset_info = self.build_vocabulary_from_dataset(self.data)
            pickle.dump(dataset_info, open(pickle_file_path, 'wb'))
        else:
            dataset_info = pickle.load(open(pickle_file_path, 'rb'))
        self.vocabulary = dataset_info['vocabulary']
    else:
        self.vocabulary = vocabulary

    self.transform = transform
    self.max_transcription_length = max([
        len(transcription)
        for transcription in self.data['transcription_tokens']
    ])
    self.max_input_length = max([
        spectrogram.shape[1]
        for spectrogram in self.data['audio_spectrograms']
    ])

def read_instances_from_file(files, max_len=400, keep_case=False):
    '''Collect instances and construct vocab'''
    vocab = Vocabulary()
    lb_vocab = Vocabulary(need_default=False)

    sets = []
    for file in files:
        sents, labels = [], []
        trimmed_sent = 0
        with open(file) as f:
            lines = f.readlines()
            for l in lines:
                l = l.strip().split('\t')
                if len(l) < 2:
                    continue
                label = l[0]
                sent = l[1]
                if not keep_case:
                    sent = sent.lower()
                word_lst = sent.split()
                if len(word_lst) > max_len:
                    word_lst = word_lst[:max_len]
                    trimmed_sent += 1
                if word_lst:
                    sents.append(word_lst)
                    labels.append(label)
                    vocab.add_word_lst(word_lst)
                    lb_vocab.add_word(label)

        assert len(sents) == len(labels)
        sets.append({'sents': sents, 'labels': labels})
        logger.info('Get {} instances from file {}'.format(len(sents), file))
        if trimmed_sent:
            logger.info(
                '{} sentences are trimmed. Max sentence length: {}.'.format(
                    trimmed_sent, max_len))

    logger.info('Building vocabulary...')
    vocab.add_word_lst(['<cls>'] * 6)
    vocab.build_vocab()
    lb_vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}. # Class: {}.'.format(
        len(vocab), len(lb_vocab)))
    logger.info('<pad>: {}'.format(vocab.to_index('<pad>')))
    logger.info('<unk>: {}'.format(vocab.to_index('<unk>')))
    logger.info('<cls>: {}'.format(vocab.to_index('<cls>')))
    return sets, vocab, lb_vocab

def __init__(self, coco_ann_file, train, vocabThreshold=None, transforms=None):
    super().__init__()
    self.train = train
    self.ann_file = coco_ann_file
    self.tokenizer = RegexpTokenizer(r'\w+')
    self.transform = transforms
    if train:
        self.caption_len, self._coco = getcaption_len(self.ann_file,
                                                      self.tokenizer,
                                                      train=True)
        self.vocab = Vocabulary(True, self._coco, vocabThreshold)
        self.ids = list(self._coco.anns.keys())
    else:
        self._coco = COCO(coco_ann_file)
        self.ids = list(self._coco.anns.keys())
        self.vocab = Vocabulary(train=False)

def create_index(args):
    reader = DocumentStreamReader(args[2:])
    # NOTE: both compressor options currently construct a Simple9-backed vocabulary.
    if args[1] == 'varbyte':
        vocabulary = Vocabulary(Simple9)
    elif args[1] == 'simple9':
        vocabulary = Vocabulary(Simple9)
    else:
        raise AssertionError('Expected varbyte|simple9 as a compressor')

    for doc in reader:
        for word in extract_words(doc.text):
            vocabulary.append(word, doc.url)

    dump(args[0], vocabulary)

def to_vocab(data, frequency_cutoff=None, size_cutoff=None):
    if not utils.xor(frequency_cutoff, size_cutoff):
        raise Exception("one or the other cutoffs please")

    counter = Counter(word for sent in data for word in sent)

    if frequency_cutoff is not None:
        print("Using a frequency of {} to reduce vocabulary size.".format(
            frequency_cutoff))
        words = [
            word for word, count in counter.most_common()
            if count > frequency_cutoff
        ]
        print("Vocabulary size reduced. {} -> {}".format(
            len(counter), len(words)))
    elif size_cutoff is not None:
        print("Using a cutoff of {} to reduce vocabulary size.".format(
            size_cutoff))
        words = [word for word, count in counter.most_common(size_cutoff)]
        print("Vocabulary size reduced. {} -> {}".format(
            len(counter), len(words)))
    else:
        raise Exception("should never happen...")

    vocab = Vocabulary(use_mask=True)
    vocab.add_many(['<START>', '<END>'])
    vocab.add_many(words)
    return vocab

def __init__(self, transform, mode, batch_size, vocab_threshold, vocab_file,
             glove_file, start_word, end_word, unk_word, annotations_file,
             vocab_from_file, img_folder):
    self.transform = transform
    self.mode = mode
    self.batch_size = batch_size
    self.vocab = Vocabulary(vocab_threshold, vocab_file, glove_file,
                            start_word, end_word, unk_word, annotations_file,
                            vocab_from_file, dataset='coco')
    self.img_folder = img_folder
    self.sel_length = None
    if self.mode == 'train' or self.mode == 'val':
        self.coco = COCO(annotations_file)
        self.ids = list(self.coco.anns.keys())
        print('Obtaining caption lengths...')
        all_tokens = [
            nltk.tokenize.word_tokenize(
                str(self.coco.anns[self.ids[index]]['caption']).lower())
            for index in tqdm(np.arange(len(self.ids)))
        ]
        self.caption_lengths = [len(token) for token in all_tokens]

def __init__(self, transform, mode, batch_size, vocab_threshold, vocab_file,
             start_word, end_word, unk_word, annotations_file, vocab_from_file,
             img_folder):
    self.transform = transform
    self.mode = mode
    self.batch_size = batch_size
    self.vocab = Vocabulary(vocab_threshold, vocab_file, start_word, end_word,
                            unk_word, annotations_file, vocab_from_file)
    self.img_folder = img_folder
    if self.mode == 'train':
        self.coco = COCO(annotations_file)
        self.ids = list(self.coco.anns.keys())
        print('Obtaining caption lengths...')
        all_tokens = [
            nltk.tokenize.word_tokenize(
                str(self.coco.anns[self.ids[index]]['caption']).lower())
            for index in tqdm(np.arange(len(self.ids)))
        ]
        self.caption_lengths = [len(token) for token in all_tokens]
    else:
        self.coco = COCO(annotations_file)
        self.ids = list(self.coco.anns.keys())
        print('Obtaining caption lengths...')
        all_tokens = [
            nltk.tokenize.word_tokenize(
                str(self.coco.anns[self.ids[index]]['caption']).lower())
            for index in tqdm(np.arange(len(self.ids)))
        ]
        self.caption_lengths = [len(token) for token in all_tokens]
        test_info = json.loads(open(annotations_file).read())
        self.paths = [item['file_name'] for item in test_info['images']]

def main():
    os.makedirs(args.wiki_preprocess, exist_ok=True)
    vocab = Vocabulary(os.path.join(args.wiki_preprocess, 'entity_vocab.txt'))

    paths = list(glob.glob(os.path.join(args.wiki_dump, '*.xml-*')))
    paths = sorted(
        paths, key=lambda p: int(os.path.basename(p).split('-')[4][11:-4]))
    params = [(path, vocab) for path in paths]

    inlinks = dict()
    total_pages = 0
    with mp.Pool(processes=args.cpu) as pool, \
            tqdm(total=len(paths), dynamic_ncols=True) as pbar:
        for i, res in enumerate(pool.imap_unordered(process_stream, params)):
            part_inlinks, page_counter = res
            # update
            for e, links in part_inlinks.items():
                if e not in inlinks:
                    inlinks[e] = set()
                inlinks[e].update(links)
            # dump
            if i % 10 == 0:
                dump(inlinks)
            # log
            total_pages += page_counter
            pbar.write(f'pages: {total_pages}, '
                       f'vocab size: {len(vocab)}, '
                       f'len(inlinks): {len(inlinks)}')
            pbar.update()
            del part_inlinks
    dump(inlinks)

def from_file(cls, text_file: str, num_prev_chars: int,
              vocab: Vocabulary = None):
    examples = []
    counter: Counter = Counter()
    with open(text_file, "r") as reviews:
        for line in reviews:
            string = line.strip("\n")
            counter.update(string)
            # prepend BOS (num_prev_chars times) and EOS to each line
            chars = ([SSTLanguageModelingDataset.BOS] * num_prev_chars +
                     list(string) + [SSTLanguageModelingDataset.EOS])
            examples.extend(examples_from_characters(chars, num_prev_chars))
    if not vocab:
        vocab = Vocabulary(
            counter,
            special_tokens=(
                Vocabulary.UNK,
                SSTLanguageModelingDataset.BOS,
                SSTLanguageModelingDataset.EOS,
            ),
        )
    return cls(examples, vocab)

def main():
    """Quick tests."""
    a = Attribute('hour', ['0,...,23'])
    a2 = Attribute('minute', ['0,...,59'])
    r_ahead = Relation('R1(h1,m1,h2,m2) <=> h1 > h2 or (h1 = h2 and m1 > m2)',
                       ['hour', 'minute', 'hour', 'minute'], 1)
    r_behind = Relation('R2(h1,m1,h2,m2) <=> h1 < h2 or (h1 = h2 and m1 < m2)',
                        ['hour', 'minute', 'hour', 'minute'], 2)
    r_pm = Relation('R3(h1) <=> h1 > 12', ['hour'], 3)
    r_am = Relation('R4(h1) <=> h1 < 12', ['hour'], 4)
    attribute_structure = AttributeStructure(a, a2, r_ahead, r_behind, r_pm,
                                             r_am)

    ahead_rs = RelationSymbol('Ahead', 4)
    behind_rs = RelationSymbol('Behind', 4)
    pm_rs = RelationSymbol('PM', 1)
    vocabulary = Vocabulary(['C1', 'C2'], [ahead_rs, behind_rs, pm_rs],
                            ['V1', 'V2'])

    profiles = [
        [ahead_rs, ('hour', 1), ('minute', 1), ('hour', 2), ('minute', 2)],
        [behind_rs, ('hour', 1), ('minute', 1), ('hour', 2), ('minute', 2)],
        [pm_rs, ('hour', 1)]
    ]

    mapping = {ahead_rs: 1, behind_rs: 2, pm_rs: 3}

    ai = AttributeInterpretation(vocabulary, attribute_structure, mapping,
                                 profiles)
    print(ai == ai)

def vocabularyBuilding(config):
    LOG.log('Building Vocabulary')
    setting = {
        "cased": False,
        "rmDigit": True,
        "sortBy": "output",
        "minFreq": 5,
        "dim": 100,
        "initPath": "others/glove.6B.100d.txt",
        "inputCorpus": [config.train_prefix + ".Ndocument",
                        config.valid_prefix + ".Ndocument"],
        "outputCorpus": [config.train_prefix + ".Nsummary",
                         config.valid_prefix + ".Nsummary"]
    }
    Vocab = Vocabulary(setting)
    saveToPKL('settings/vocab/newData.Vocab', Vocab)

    f = open('newData.i2w', 'w', encoding='utf-8')
    for item in Vocab.i2w:
        if (item == '<pad>' or item == '<unk>' or item == '<bos>'
                or item == '<eos>' or item == '<mask>'):
            print(item, 'NAN', file=f)
        else:
            print(item, Vocab.typeFreq[item], file=f)

    setting["full_size"] = Vocab.full_size
    setting["input_size"] = Vocab.n_in
    setting["output_size"] = Vocab.n_out
    setting["savePath"] = "settings/vocab/newData.json"
    saveToJson(setting["savePath"], setting)
    return setting

def build_vocab(caption, threshold):
    """Build a simple vocabulary wrapper."""
    counter = Counter()
    n = len(caption.keys())
    for i, key in enumerate(caption.keys()):
        for sentence in caption[key]:
            tokens = nltk.tokenize.word_tokenize(sentence)
            counter.update(tokens)

        if i % 1000 == 0:
            print("[%d/%d] Tokenized the captions." % (i, n))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab

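# A minimal, self-contained sketch (an assumption for illustration, not taken
# from any of the snippets here) of the kind of Vocabulary wrapper that the
# build_vocab helpers assume: word2idx/idx2word dicts, add_word(), and a
# __call__ that falls back to the '<unk>' index for out-of-vocabulary tokens.
class Vocabulary:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        # Assign the next free index to unseen words; duplicates are ignored.
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        # Unknown tokens map to the '<unk>' index.
        if word not in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)
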
def __init__(self, transform, mode, batch_size, vocab_threshold, vocab_file,
             start_word, end_word, unk_word, annotations_file, vocab_from_file,
             img_folder):
    # Preprocessing transform
    self.transform = transform
    # Train, valid, or test
    self.mode = mode
    self.batch_size = batch_size
    # Dictionaries of stoi and itos for words
    self.vocab = Vocabulary(vocab_threshold, vocab_file, start_word, end_word,
                            unk_word, annotations_file, vocab_from_file)
    # Where the images are
    self.img_folder = img_folder

    # Train on all captions
    if self.mode == "train":
        self.coco = COCO(annotations_file)
        self.ids = list(self.coco.anns.keys())
        print("Obtaining caption lengths...")
        all_tokens = [
            nltk.tokenize.word_tokenize(
                str(self.coco.anns[self.ids[index]]['caption']).lower())
            for index in np.arange(len(self.ids))
        ]
        self.caption_lengths = [len(token) for token in all_tokens]
    # Caption all images
    else:
        self.coco = COCO(annotations_file)
        self.img_ids = (list(self.coco.imgToAnns.keys())
                        if self.mode == "valid" else self.coco.getImgIds())

def build_vocab(cleaned_captions):
    """
    Parses training set token file captions and builds a Vocabulary object

    Args:
        cleaned_captions (str list): cleaned list of human captions to build vocab with

    Returns:
        vocab (Vocabulary): Vocabulary object
    """
    # QUESTION 1.1
    # TODO collect words
    word_count = {}
    for caption in cleaned_captions:
        for word in caption.split():
            if word not in word_count:
                word_count[word] = 0
            word_count[word] += 1

    # create a vocab instance
    vocab = Vocabulary()

    # add the token words
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # TODO add the rest of the words from the cleaned captions here
    # vocab.add_word('word')
    for word, n in word_count.items():
        if n > 3:
            vocab.add_word(word)

    return vocab

def test(args):
    vocab = Vocabulary()
    vocab.load_vocab(os.path.join(args['data_dir'], 'vocabulary.json'))
    args['voca_size'] = vocab.get_vocab_size()

    test_data = get_dataloader(
        os.path.join(args['data_dir'], 'encoded_test_dialogue_pair.json'),
        os.path.join(args['data_dir'], 'vocabulary.json'), 1)

    test_sent_pair_list = []
    model = Seq2Seq(args).eval()
    if torch.cuda.is_available():
        model = model.cuda()
    path = Checkpoint.get_latest_checkpoint(args['exp_dir'])
    model.load_state_dict(torch.load(os.path.join(path, 'model.pt')))

    for batch_idx, (sour, sour_len, targ, targ_len) in enumerate(test_data):
        if torch.cuda.is_available():
            sour = sour.cuda()
            targ = targ.cuda()
        enco_hidd_state = model.encoder.encoder_forward(sour, sour_len)
        out_prob = model.decoder.decoder_forward(targ, targ_len,
                                                 enco_hidd_state, 0)
        sent_list = [(out_prob.topk(1)[1].view(-1).tolist(), 0)]
        test_sent_pair_list += process_sent_list(vocab, sour, targ, sent_list)
        # logger.info('batch_idx:{} \nsent:{}'.format(batch_idx, test_sent_pair_list))

    save_test_sent(args['exp_data'], 'generated_test_sent.txt',
                   test_sent_pair_list)

def build_vocabulary(train_recipes):
    """
    Build the vocabulary. In our case, words that appear fewer than 5 times
    are replaced with <unk>.
    """
    counter = Counter()
    for recipe in train_recipes:
        context = recipe["context"]
        for step in context:
            token = step["token"]
            counter.update(token)

    min_count = 5
    word_counts = [x for x in counter.items() if x[1] >= min_count]
    word_counts.sort(key=lambda x: x[1], reverse=True)
    print("# Words in Vocabulary : ", len(word_counts))

    reverse_vocab = [x[0] for x in word_counts]
    unk_id = len(reverse_vocab)
    sos_id = len(reverse_vocab) + 1
    eos_id = len(reverse_vocab) + 2
    pad_id = len(reverse_vocab) + 3
    vocab_dict = dict([(x, y) for (y, x) in enumerate(reverse_vocab)])
    vocab = Vocabulary(vocab_dict, unk_id, sos_id, eos_id, pad_id)

    return vocab

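# Illustration only (toy values, not from the source): the id layout that
# build_vocabulary above produces. In-vocabulary words get ids 0..N-1 in
# frequency order, and <unk>, <sos>, <eos>, <pad> take the next four ids.
reverse_vocab = ['the', 'add', 'salt']  # pretend these are frequency-sorted words
vocab_dict = {word: idx for idx, word in enumerate(reverse_vocab)}
unk_id, sos_id, eos_id, pad_id = (len(reverse_vocab) + k for k in range(4))
print(vocab_dict)                      # {'the': 0, 'add': 1, 'salt': 2}
print(unk_id, sos_id, eos_id, pad_id)  # 3 4 5 6
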
def __init__(self, transform, mode, batch_size, vocab_threshold, vocab_file,
             start_word, end_word, unk_word, annotations_file, vocab_from_file,
             img_folder):
    self.transform = transform
    self.mode = mode
    self.batch_size = batch_size
    self.vocab = Vocabulary(vocab_threshold, vocab_file, start_word, end_word,
                            unk_word, annotations_file, vocab_from_file)
    self.img_folder = img_folder

    test_info = json.loads(open(annotations_file).read())
    self.ids, self.paths, self.cap_dict = [], [], {}
    for item in test_info['images']:
        self.ids.append(item['id'])
        self.cap_dict[item['id']] = {'file_name': item['file_name'],
                                     'captions': []}
        self.paths.append(item['file_name'])

    for item in test_info['annotations']:
        # if item['image_id'] in self.dict:
        tokens = nltk.tokenize.word_tokenize(str(item['caption']).lower())
        caption = []
        caption.extend([self.vocab(token) for token in tokens])
        caption = torch.Tensor(caption).long()
        self.cap_dict[item['image_id']]['captions'].append(caption)

def from_corpus(cls, corpus, vocab_size):
    vocab = Vocabulary()
    for token in corpus:
        vocab.add(token)

    vocab_subset = vocab.get_topk_subset(vocab_size)
    vocab_subset.shuffle()
    return cls(vocab_subset)

def __init__(self, transform, mode, batch_size, threshold, sample_size, file,
             start_seq, end_seq, unk_word, annotations_file, load_vocab,
             image_dir):
    self.transform = transform
    self.mode = mode
    self.batch_size = batch_size
    self.vocab = Vocabulary(threshold, file, start_seq, end_seq, unk_word,
                            annotations_file, load_vocab)
    self.image_dir = image_dir
    self.sample_size = sample_size

    if mode in ['train', 'val']:
        self.coco = COCO(annotations_file)
        self.ids = list(self.coco.anns.keys())
        print("IDS", len(self.ids))
        self.ids = self.ids[:self.sample_size]
        tokens = list()
        for idx in tqdm(np.arange(len(self.ids))):
            caption = str(self.coco.anns[self.ids[idx]]['caption']).lower()
            tokens.append(nltk.tokenize.word_tokenize(caption))
        self.caption_lengths = [len(token) for token in tokens]
    else:
        test_anns = json.load(open(annotations_file))
        self.paths = [item['file_name'] for item in test_anns['images']]

def __init__(self, transform, mode, batch_size, vocab_threshold, vocab_file,
             start_word, end_word, unk_word, annotations_file, vocab_from_file,
             img_folder):
    # transform - defined transformation (e.g. Rescale, ToTensor, RandomCrop, etc.)
    self.transform = transform
    self.mode = mode
    self.batch_size = batch_size
    self.vocab = Vocabulary(vocab_threshold, vocab_file, start_word, end_word,
                            unk_word, annotations_file, vocab_from_file)
    self.img_folder = img_folder

    # if training or validation
    if self.mode == 'train' or self.mode == 'valid':
        # JSON file where the annotations are stored
        self.coco = COCO(annotations_file)
        # each annotation contains multiple attributes, such as the task
        # (e.g. segmentation), image_id, bounding box, etc.
        # to load an image (for instance, its URL) we use
        # self.coco.loadImgs(image_id) with the image id
        self.ids = list(self.coco.anns.keys())
        print('Obtaining caption lengths...')
        # all_tokens - a big list of lists; each is the list of tokens for a specific caption
        all_tokens = [
            nltk.tokenize.word_tokenize(
                str(self.coco.anns[self.ids[index]]['caption']).lower())
            for index in tqdm(np.arange(len(self.ids)))
        ]
        # list of token lengths (number of words for each caption)
        self.caption_lengths = [len(token) for token in all_tokens]
    else:
        # if we are in testing mode
        test_info = json.loads(open(annotations_file).read())
        self.paths = [item['file_name'] for item in test_info['images']]

def vocabulary_demo():
    # We used up a few lines in the previous example, so we set up
    # our data generator again.
    corpus = smart_reader(train_e_path)

    # Let's create a vocabulary given our (tokenized) corpus
    vocabulary = Vocabulary(corpus=corpus)
    print("Original vocabulary size: {}".format(len(vocabulary)))

    # Now we only keep the highest-frequency words
    vocabulary_size = 1000
    vocabulary.trim(vocabulary_size)
    print("Trimmed vocabulary size: {}".format(len(vocabulary)))

    # Now we can get word indexes using vocabulary.get_token_id():
    for t in ["<PAD>", "<UNK>", "the"]:
        print("The index of \"{}\" is: {}".format(t, vocabulary.get_token_id(t)))

    # And the inverse too, using vocabulary.get_token():
    for i in range(10):
        print("The token with index {} is: {}".format(i, vocabulary.get_token(i)))

    # Now let's try to get a word ID for a word not in the vocabulary
    # we should get 1 (so, <UNK>)
    for t in ["!@!_not_in_vocab_!@!"]:
        print("The index of \"{}\" is: {}".format(t, vocabulary.get_token_id(t)))

def __init__(self, params, mode, batch_size, vocab_from_file, img_folder,
             annotations_file):
    self.params = params
    self.mode = mode
    self.batch_size = batch_size
    self.vocab = Vocabulary(params, vocab_from_file, annotations_file)
    self.img_folder = img_folder
    if self.mode == 'train':
        self.coco = COCO(annotations_file)
        self.ids = list(self.coco.anns.keys())
        print('Obtaining caption lengths...')
        # Here, nltk.tokenize.word_tokenize(str(sentence).lower()) takes a string
        # type sentence, changes all letters into lower case and returns a list
        # containing all words in that sentence.
        # Use np.arange because it runs faster than range
        all_tokens = [
            nltk.tokenize.word_tokenize(
                str(self.coco.anns[self.ids[index]]['caption']).lower())
            for index in tqdm(np.arange(len(self.ids)))
        ]
        self.caption_lengths = [len(token) for token in all_tokens]
    else:
        test_info = json.loads(open(annotations_file).read())
        # paths stores names of all images in the test set
        # (e.g. COCO_test2014_000000264794.jpg)
        self.paths = [item['file_name'] for item in test_info['images']]

def __init__(self, transform, mode, batch_size, vocab_threshold, vocab_file,
             glove_file, start_word, end_word, unk_word, annotations_file,
             vocab_from_file, img_folder):
    self.transform = transform
    self.mode = mode
    self.batch_size = batch_size
    self.vocab = Vocabulary(vocab_threshold, vocab_file, glove_file,
                            start_word, end_word, unk_word, annotations_file,
                            vocab_from_file, dataset='insta')
    self.img_folder = img_folder
    self.sel_length = None
    if self.mode == 'train' or self.mode == 'val':
        import time
        start = time.time()
        print('Start reading...')
        self.insta = pickle.load(open(annotations_file, 'rb'))
        print('Done: ', time.time() - start)
        self.ids = list(self.insta.keys())
        print('Obtaining caption lengths...')
        all_tokens = [
            regex_tokenizer.tokenize(str(self.insta[index]['caption']).lower())
            for index in tqdm(self.ids)
        ]
        self.caption_lengths = [len(token) for token in all_tokens]

def build_vocabulary_from_dataset(self, data):
    vocabulary = Vocabulary(custom_unk_word=' ')
    for transcription in data['transcription_tokens']:
        for word in transcription:
            vocabulary.add_word(word)

    dataset_info = {'vocabulary': vocabulary}
    return dataset_info

def build_vocab(annotation_path, threshold):
    """Build a simple vocabulary wrapper."""
    df_annotation = pd.read_csv(annotation_path, keep_default_na=False)
    counter = Counter()
    for _, each_annotation in df_annotation.iterrows():
        attribute_tags = each_annotation['attribute_tags']
        tokens = list(re.split('[,]', attribute_tags))
        if len(tokens) > 0:
            tokens = [
                token.strip() for token in tokens if not has_numbers(token)
            ]
            counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]
    # print(words)

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab

def main(args):
    # Create vocabulary. Assumes that the `vocab.pkl` file already exists.
    vocabulary = Vocabulary(vocab_from_file=True)

    # Create transformations
    transformations = transforms.Compose([
        transforms.Resize(224),
        transforms.ToTensor()
    ])

    # Create the model instance.
    model = CaptioningModel(
        embed_size=256,
        hidden_size=256,
        vocab_size=len(vocabulary),
        num_layers=2,
        use_pretrained_encoder=False
    )

    # Load state dicts for encoder and decoder
    model.encoder.load_state_dict(
        torch.load('./models/encoder.pkl')['encoder_state_dict'])
    model.decoder.load_state_dict(
        torch.load('./models/decoder.pkl')['decoder_state_dics'])

    # Set model to eval state
    model.eval()

    image = Image.open(args.image_path)
    tensor = transformations(image).unsqueeze(0)
    result = model.sample(tensor)
    text = result_to_text(result, vocabulary)
    print(text)