def build_vocabs():
    train, dev, test = load_boknilev()
    samples = [
        s for r in train + dev + test
        for s in boknilev_record_to_hcpd_samples(r)
    ]

    gold_pos_vocab = Vocabulary('GOLD_POS')
    gold_pos_vocab.add_words(
        set([hc.next_pos for s in samples for hc in s.x.head_cands]))
    gold_pos_vocab.add_word(None)

    words_vocab = Vocabulary('WORDS')
    words_vocab.add_words(
        set([hc.word for s in samples for hc in s.x.head_cands]))
    words_vocab.add_words(set([s.x.pp.word for s in samples]))
    words_vocab.add_words(set([s.x.child.word for s in samples]))
    words_vocab.add_word(None)

    words_to_lemmas = {}
    words_to_lemmas.update({s.x.child.word: s.x.child.lemma for s in samples})
    words_to_lemmas.update(
        {hc.word: hc.lemma for s in samples for hc in s.x.head_cands})

    return [gold_pos_vocab, words_vocab, words_to_lemmas]
def build_vocabulary_from_dataset(self, data):
    vocabulary = Vocabulary(custom_unk_word=' ')
    for transcription in data['transcription_tokens']:
        for word in transcription:
            vocabulary.add_word(word)
    dataset_info = {'vocabulary': vocabulary}
    return dataset_info
def __init__(self, data_file, character_level=None, phoneme_level=None,
             vocabulary=None, transform=None):
    self.data_file = data_file
    self.data = joblib.load(open(self.data_file, 'rb'))
    self.character_level = character_level
    self.phoneme_level = phoneme_level
    self.transcription_processor = lambda words: words

    if self.character_level:
        characters = [chr(c) for c in range(ord('a'), ord('z') + 1)]
        characters += [' ']
        character_vocab = Vocabulary()
        for character in characters:
            character_vocab.add_word(character)
        self.vocabulary = character_vocab
        self.transcription_processor = self._character_level_transcription_processor
    elif self.phoneme_level:
        cmu_phones = list(map(lambda x: x[0], cmudict.phones()))
        cmu_phones += [' ']
        phones_vocab = Vocabulary(custom_unk_word=' ')
        for phone in cmu_phones:
            phones_vocab.add_word(phone)
        self.vocabulary = phones_vocab
        self.phones_dict = cmudict.dict()
        self.transcription_processor = self._phone_level_transcription_processor
    elif vocabulary is None:
        data_file_dir = os.path.dirname(self.data_file)
        data_file_prefix = os.path.splitext(self.data_file)[0]
        pickle_file_name = f'{data_file_prefix}_SpeechDataset.pickle'
        pickle_file_path = os.path.join(data_file_dir, pickle_file_name)
        if not os.path.isfile(pickle_file_path):
            dataset_info = self.build_vocabulary_from_dataset(self.data)
            pickle.dump(dataset_info, open(pickle_file_path, 'wb'))
        else:
            dataset_info = pickle.load(open(pickle_file_path, 'rb'))
        self.vocabulary = dataset_info['vocabulary']
    else:
        self.vocabulary = vocabulary

    self.transform = transform
    self.max_transcription_length = max([
        len(transcription)
        for transcription in self.data['transcription_tokens']
    ])
    self.max_input_length = max([
        spectrogram.shape[1]
        for spectrogram in self.data['audio_spectrograms']
    ])
def read_instances_from_file(files, max_len=400, keep_case=False):
    ''' Collect instances and construct vocab '''
    vocab = Vocabulary()
    lb_vocab = Vocabulary(need_default=False)

    sets = []
    for file in files:
        sents, labels = [], []
        trimmed_sent = 0
        with open(file) as f:
            lines = f.readlines()
        for l in lines:
            l = l.strip().split('\t')
            if len(l) < 2:
                continue
            label = l[0]
            sent = l[1]
            if not keep_case:
                sent = sent.lower()
            word_lst = sent.split()
            if len(word_lst) > max_len:
                word_lst = word_lst[:max_len]
                trimmed_sent += 1
            if word_lst:
                sents.append(word_lst)
                labels.append(label)
                vocab.add_word_lst(word_lst)
                lb_vocab.add_word(label)
        assert len(sents) == len(labels)
        sets.append({'sents': sents, 'labels': labels})
        logger.info('Get {} instances from file {}'.format(len(sents), file))
        if trimmed_sent:
            logger.info(
                '{} sentences are trimmed. Max sentence length: {}.'.format(
                    trimmed_sent, max_len))

    logger.info('Building vocabulary...')
    vocab.add_word_lst(['<cls>'] * 6)
    vocab.build_vocab()
    lb_vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}. # Class: {}.'.format(
        len(vocab), len(lb_vocab)))
    logger.info('<pad>: {}'.format(vocab.to_index('<pad>')))
    logger.info('<unk>: {}'.format(vocab.to_index('<unk>')))
    logger.info('<cls>: {}'.format(vocab.to_index('<cls>')))
    return sets, vocab, lb_vocab
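# Illustrative usage sketch (not part of the original source): read_instances_from_file
# above expects each file to be tab-separated with the label in the first column and
# the sentence in the second. The file names below are hypothetical.
def _demo_read_instances():
    sets, vocab, lb_vocab = read_instances_from_file(
        ['train.tsv', 'dev.tsv'], max_len=400, keep_case=False)
    train_set = sets[0]
    print(len(train_set['sents']), len(vocab), len(lb_vocab))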
def build_vocab(cleaned_captions):
    """
    Parses training set token file captions and builds a Vocabulary object

    Args:
        cleaned_captions (str list): cleaned list of human captions to build vocab with

    Returns:
        vocab (Vocabulary): Vocabulary object
    """
    # QUESTION 1.1
    # collect word counts over all cleaned captions
    word_count = {}
    for caption in cleaned_captions:
        for word in caption.split():
            if word not in word_count:
                word_count[word] = 0
            word_count[word] += 1

    # create a vocab instance
    vocab = Vocabulary()

    # add the special tokens
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # add the rest of the words from the cleaned captions:
    # only words occurring more than 3 times are kept
    for word, n in word_count.items():
        if n > 3:
            vocab.add_word(word)

    return vocab
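# Illustrative usage sketch (not part of the original source) for the build_vocab
# above: the vocabulary ends up containing the four special tokens plus every word
# that occurs more than 3 times across the cleaned captions.
def _demo_build_caption_vocab():
    toy_captions = ['a dog runs in the park'] * 4 + ['a cat sleeps'] * 2
    vocab = build_vocab(toy_captions)
    # 'dog' occurs 4 times (> 3) and is kept; 'cat' occurs only twice and is not.
    return vocab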
def build_vocab(annotation_path, threshold):
    """Build a simple vocabulary wrapper."""
    df_annotation = pd.read_csv(annotation_path, keep_default_na=False)
    counter = Counter()
    for _, each_annotation in df_annotation.iterrows():
        attribute_tags = each_annotation['attribute_tags']
        tokens = list(re.split('[,]', attribute_tags))
        if len(tokens) > 0:
            tokens = [
                token.strip() for token in tokens if not has_numbers(token)
            ]
            counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]
    # print(words)

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
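# Illustrative usage sketch (not part of the original source): the annotation CSV is
# assumed to have an 'attribute_tags' column of comma-separated tags, matching what
# the build_vocab above reads; the file name and threshold here are made up.
def _demo_build_attribute_vocab(tmp_csv='demo_annotations.csv'):
    pd.DataFrame({'attribute_tags': ['red, striped, cotton', 'red, cotton']}).to_csv(
        tmp_csv, index=False)
    # With threshold=2, only 'red' and 'cotton' (each seen twice) are added.
    return build_vocab(tmp_csv, threshold=2)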
def build_vocab(caption, threshold):
    """Build a simple vocabulary wrapper."""
    counter = Counter()
    n = len(caption.keys())
    for i, key in enumerate(caption.keys()):
        for sentence in caption[key]:
            tokens = nltk.tokenize.word_tokenize(sentence)
            counter.update(tokens)
        if i % 1000 == 0:
            print("[%d/%d] Tokenized the captions." % (i, n))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Creates a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Adds the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
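# Illustrative usage sketch (not part of the original source): caption is assumed to
# map an id to a list of sentences, as the build_vocab above iterates over; tokenizing
# with nltk requires the 'punkt' resource to be downloaded.
def _demo_build_dict_caption_vocab():
    caption = {'vid1': ['a man is cooking', 'a person cooks'],
               'vid2': ['a man plays guitar']}
    # With threshold=2, 'a' (3 occurrences) and 'man' (2) are kept.
    return build_vocab(caption, threshold=2)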
def process_dataset(self, fileobj):
    vocabulary = Vocabulary()
    sequence_delimiters = [0]
    while True:
        line = fileobj.readline()
        if line is None or len(line) < 1:
            break
        sequence_delimiters.append(fileobj.tell())
        words = line.strip().split(',')
        for word in words:
            vocabulary.add_word(word)
    dataset_info = {
        'sequence_delimiters': sequence_delimiters,
        'vocabulary': vocabulary
    }
    return dataset_info
class EmbeddingLoader(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, n_views, emb_directory, label_directory):
        self.n_views = n_views
        # Creates list of paths with all videos
        self._read_embedding_dir(emb_directory)
        # Creates list of paths with all videos
        self._read_label_dir(label_directory)
        # The negative example has to be from outside the buffer window. Taken
        # from both sides of the frame.
        self.sequence_index = 0
        self.vocab = Vocabulary()
        words = ['blue', 'orange', 'green', 'blue', 'red', 'yellow']
        for i, word in enumerate(words):
            self.vocab.add_word(word)

    def __len__(self):
        return len(self.emb_paths)

    def __getitem__(self, idx):
        emb = read_npy_file(self.emb_paths[idx])
        seq_idx = self.emb_paths[idx].split('/')[-1].split('_')[0]
        label = read_caption(
            os.path.join(self._label_directory, seq_idx + '_parsed.txt'))
        label = nltk.tokenize.word_tokenize(str(label).lower())
        active_label = self.vocab(label[-4])
        passive_label = self.vocab(label[-2])
        # Convert caption (string) to word ids.
        target1 = torch.LongTensor([active_label])
        target2 = torch.LongTensor([passive_label])
        emb = np.mean(emb, axis=0)
        emb = torch.FloatTensor(emb)
        return emb, target1, target2

    def _read_embedding_dir(self, emb_directory):
        self._emb_directory = emb_directory
        filenames = ls_npy(emb_directory)
        self.emb_paths = [
            os.path.join(self._emb_directory, f) for f in filenames
        ]
        self.sequence_count = int(len(self.emb_paths) / self.n_views)

    def _read_label_dir(self, label_directory):
        self._label_directory = label_directory
        filenames = ls_txt(label_directory)
        self.label_paths = [
            os.path.join(self._label_directory, f) for f in filenames
        ]
def glove(self):
    embeddings = []
    vocab = Vocabulary()
    with open(self.file_name, encoding='UTF-8') as f:
        for line in f:
            values = line.split()
            vocab.add_word(values[0])
            embeddings.append(np.asarray(values[1:], dtype='float32'))
    if "<UNK>" not in vocab.word2idx:
        vocab.add_word("<UNK>")
        embeddings.append(
            np.random.uniform(low=-0.05, high=0.05, size=self.embedding_dim))
    return vocab, len(embeddings), Embeddings(
        vocab_size=len(embeddings),
        embedding_dim=self.embedding_dim,
        embeddings=np.array(embeddings, dtype='float32'),
        trainable=False)
def build_vocab_question(imgs, params):
    # build vocabulary for question and answers.
    count_thr = params['word_count_threshold']

    # count up the number of words
    counts = {}
    for img in imgs:
        for w in img['processed_tokens']:
            counts[w] = counts.get(w, 0) + 1
    cw = sorted([(count, w) for w, count in counts.items()], reverse=True)
    print('top words and their counts:')
    print('\n'.join(map(str, cw[:20])))

    # print some stats
    total_words = sum(counts.values())
    print('total words:', total_words)
    bad_words = [w for w, n in counts.items() if n <= count_thr]
    words = [w for w, n in counts.items() if n > count_thr]
    bad_count = sum(counts[w] for w in bad_words)
    print('number of bad words: %d/%d = %.2f%%' %
          (len(bad_words), len(counts), len(bad_words) * 100.0 / len(counts)))
    print('number of words in vocab would be %d' % (len(words), ))
    print('number of UNKs: %d/%d = %.2f%%' %
          (bad_count, total_words, bad_count * 100.0 / total_words))

    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')
    for i, word in enumerate(words):
        vocab.add_word(word)

    for img in imgs:
        txt = img['processed_tokens']
        question = [
            w if counts.get(w, 0) > count_thr else '<unk>' for w in txt
        ]
        img['final_question'] = question

    return imgs, vocab
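# Illustrative usage sketch (not part of the original source): imgs is assumed to be
# a list of dicts with a 'processed_tokens' field (the tokenized question), as consumed
# by build_vocab_question above; the threshold value is made up.
def _demo_build_question_vocab():
    imgs = [
        {'processed_tokens': ['what', 'color', 'is', 'the', 'cat']},
        {'processed_tokens': ['what', 'is', 'the', 'dog', 'doing']},
    ]
    imgs, vocab = build_vocab_question(imgs, {'word_count_threshold': 1})
    # Each img now also carries 'final_question', with rare words replaced by '<unk>'.
    return imgs, vocab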
def build_vocab(json_path: str, threshold: int) -> Vocabulary:
    coco_cls = COCO(json_path)
    countt_cls = Counter()
    ids = coco_cls.anns.keys()
    for i, idt in enumerate(ids):
        caption = str(coco_cls.anns[idt]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        countt_cls.update(tokens)
        if (i + 1) % 1000 == 0:
            print('%d/%d tokenize the captions' % (i + 1, len(ids)))

    words = [word for word, cnt in countt_cls.items() if cnt >= threshold]

    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')
    for word in words:
        vocab.add_word(word)
    return vocab
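# Illustrative usage sketch (not part of the original source): the JSON path below is
# a hypothetical MS-COCO captions annotation file; running the build_vocab above this
# way needs pycocotools' COCO class and the NLTK 'punkt' tokenizer data.
def _demo_build_coco_vocab():
    vocab = build_vocab('annotations/captions_train2014.json', threshold=4)
    return vocab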
def read_instances_from_file(files, max_len, keep_case):
    ''' Collect instances and construct vocab '''
    vocab = Vocabulary()
    pos_vocab = Vocabulary(need_default=False)
    ner_vocab = Vocabulary(need_default=False)
    srl_vocab = Vocabulary(need_default=False)
    chunk_vocab = Vocabulary(need_default=False)

    sets = []
    for file in files:
        sents = []
        pos_labels, ner_labels, srl_labels, chunk_labels = [], [], [], []
        trimmed_sent = 0
        with open(file) as f:
            lines = f.readlines()
        sent = []
        pos_label, ner_label, srl_label, chunk_label = [], [], [], []
        for l in lines:
            l = l.strip()
            if l == '':
                if len(sent) > 0:
                    if len(sent) > max_len:
                        trimmed_sent += 1
                        pos_labels.append(pos_label[:max_len])
                        ner_labels.append(ner_label[:max_len])
                        srl_labels.append(srl_label[:max_len])
                        chunk_labels.append(chunk_label[:max_len])
                        sents.append(sent[:max_len])
                    else:
                        pos_labels.append(pos_label)
                        ner_labels.append(ner_label)
                        srl_labels.append(srl_label)
                        chunk_labels.append(chunk_label)
                        sents.append(sent)
                sent = []
                pos_label, ner_label, srl_label, chunk_label = [], [], [], []
            else:
                l = l.split()
                word = l[0]
                if not keep_case:
                    word = word.lower()
                sent.append(word)
                pos_label.append(l[2])
                ner_label.append(l[3])
                srl_label.append(l[4])
                chunk_label.append(l[5])
                vocab.add_word(word)
                pos_vocab.add_word(l[2])
                ner_vocab.add_word(l[3])
                srl_vocab.add_word(l[4])
                chunk_vocab.add_word(l[5])
        sets.append({
            'sents': sents,
            'pos_labels': pos_labels,
            'ner_labels': ner_labels,
            'srl_labels': srl_labels,
            'chunk_labels': chunk_labels
        })
        logger.info('Get {} instances from file {}'.format(len(sents), file))
        if trimmed_sent:
            logger.warning(
                '{} sentences are trimmed. Max sentence length: {}.'.format(
                    trimmed_sent, max_len))

    logger.info('Building vocabulary...')
    vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}'.format(len(vocab)))
    pos_vocab.build_vocab()
    ner_vocab.build_vocab()
    srl_vocab.build_vocab()
    chunk_vocab.build_vocab()
    logger.info('# class in POS Tagging: {}'.format(len(pos_vocab)))
    logger.info('# class in NER Tagging: {}'.format(len(ner_vocab)))
    logger.info('# class in SRL Tagging: {}'.format(len(srl_vocab)))
    logger.info('# class in Chunking: {}'.format(len(chunk_vocab)))
    return sets, vocab, [pos_vocab, ner_vocab, srl_vocab, chunk_vocab]
class MultiViewTripletLabelDataset(Dataset):

    def __init__(self, n_views, video_directory, label_directory, image_size,
                 sample_size=500):
        self.frame_size = image_size
        self.n_views = n_views
        self._read_video_dir(video_directory)
        self.vocab = Vocabulary()
        words = ['blue', 'orange', 'green', 'red', 'yellow']
        for i, word in enumerate(words):
            self.vocab.add_word(word)
        self._read_label_dir(label_directory)
        self._count_frames()
        self.sample_size = sample_size
        self.valid_sequence_indices = self._get_valid_sequence_indices(label_directory)
        self.sequence_index = 0
        self.negative_frame_margin = 30
        assert len(self.label_paths) == int(len(self.video_paths) / self.n_views)

    def __len__(self):
        return len(self.valid_sequence_indices)

    def __getitem__(self, idx):
        # build image triplet item
        self.sequence_index = int(idx)
        triplets = torch.Tensor(self.sample_size, 3, 3, *self.frame_size)
        label = read_caption(self.label_paths[self.sequence_index])
        label = nltk.tokenize.word_tokenize(str(label).lower())
        # print("index: {}, label: {}".format(self.valid_sequence_indices[idx], label))
        for i in range(self.sample_size):
            snaps = self.get_videos(self.sequence_index * self.n_views)
            anchor_frame, positive_frame, negative_frame = self.sample_triplet(snaps)
            triplets[i, 0, :, :, :] = anchor_frame
            triplets[i, 1, :, :, :] = positive_frame
            triplets[i, 2, :, :, :] = negative_frame
        try:
            active_label = self.vocab(label[-4])
            passive_label = self.vocab(label[-2])
        except:
            print("Unknown label: ", label)
            print("sequence: ", self.sequence_index)
        seq_idx = torch.LongTensor([self.sequence_index] * self.sample_size)
        # Convert caption (string) to word ids.
        # Needs padded targets of same size as inputs
        target = torch.LongTensor([[active_label, passive_label]] * self.sample_size)
        return triplets, target, seq_idx

    def _read_video_dir(self, video_directory):
        self._video_directory = video_directory
        filenames = ls(video_directory)
        self.video_paths = [os.path.join(self._video_directory, f) for f in filenames]
        self.sequence_count = int(len(self.video_paths) / self.n_views)

    def _count_frames(self):
        frame_lengths = np.array([len(imageio.read(p)) for p in self.video_paths])
        self.frame_lengths = frame_lengths - OFFSET
        self.cumulative_lengths = np.zeros(len(self.frame_lengths), dtype=np.int32)
        prev = 0
        for i, frames in enumerate(self.frame_lengths):
            prev = self.cumulative_lengths[i - 1]
            self.cumulative_lengths[i] = prev + frames

    def _read_label_dir(self, label_directory):
        self._label_directory = label_directory
        filenames = ls_txt(label_directory)
        self.label_paths = [os.path.join(self._label_directory, f) for f in filenames]

    def _get_valid_sequence_indices(self, label_directory):
        valid_sequence_indices = []
        curr_seq_idx = 0
        filenames = ls_txt(label_directory)
        for filename in filenames:
            label = read_caption(os.path.join(label_directory, filename))
            label = nltk.tokenize.word_tokenize(str(label).lower())
            if label[-4] is None or label[-2] is None:
                curr_seq_idx += 1
                continue
            else:
                valid_sequence_indices.append(int(filename.split('_')[0]))
                curr_seq_idx += 1
        return valid_sequence_indices

    @functools.lru_cache(maxsize=1)
    def get_videos(self, index):
        views = []
        for i in range(self.n_views):
            views.append(read_video(self.video_paths[index + i], self.frame_size))
        return views

    def sample_triplet(self, snaps):
        loaded_sample = False
        while not loaded_sample:
            try:
                anchor_index = self.sample_anchor_frame_index()
                positive_index = anchor_index
                negative_index = self.sample_negative_frame_index(anchor_index)
                loaded_sample = True
            except:
                print("Error loading video - sequence index: ", self.sequence_index)
                print("video lengths: ", [len(snaps[i]) for i in range(0, len(snaps))])
                print("Maybe margin too high")
        # randomly sample anchor view and positive view
        view_set = set(range(self.n_views))
        anchor_view = np.random.choice(np.array(list(view_set)))
        view_set.remove(anchor_view)
        positive_view = np.random.choice(np.array(list(view_set)))
        negative_view = anchor_view  # negative example comes from same view INQUIRE TODO
        anchor_frame = snaps[anchor_view][anchor_index]
        positive_frame = snaps[positive_view][positive_index]
        negative_frame = snaps[negative_view][negative_index]
        return (torch.Tensor(anchor_frame), torch.Tensor(positive_frame),
                torch.Tensor(negative_frame))

    def build_set(self):
        triplets = torch.Tensor(self.sample_size, 3, 3, *self.frame_size)
        for i in range(0, self.sample_size):
            snaps = self.get_videos(self.sequence_index * self.n_views)
            anchor_frame, positive_frame, negative_frame = self.sample_triplet(snaps)
            triplets[i, 0, :, :, :] = anchor_frame
            triplets[i, 1, :, :, :] = positive_frame
            triplets[i, 2, :, :, :] = negative_frame
        self.sequence_index = (self.sequence_index + 1) % self.sequence_count
        # Second argument is labels. Not used.
        return TensorDataset(triplets, torch.zeros(triplets.size()[0]))

    def sample_anchor_frame_index(self):
        arange = np.arange(0, self.frame_lengths[self.sequence_index * self.n_views])
        return np.random.choice(arange)

    # def sample_positive_frame_index(self, anchor_index):
    #     upper_bound = min(self.frame_lengths[self.sequence_index * self.n_views + 1], anchor_index)
    #     return upper_bound  # in case video has less frames than anchor video

    def negative_frame_indices(self, anchor_index):
        video_length = self.frame_lengths[self.sequence_index * self.n_views]
        lower_bound = 0
        upper_bound = max(0, anchor_index - self.negative_frame_margin)
        range1 = np.arange(lower_bound, upper_bound)
        lower_bound = min(anchor_index + self.negative_frame_margin, video_length)
        upper_bound = video_length
        range2 = np.arange(lower_bound, upper_bound)
        return np.concatenate([range1, range2])

    def sample_negative_frame_index(self, anchor_index):
        return np.random.choice(self.negative_frame_indices(anchor_index))
def build_vocabs():
    tasks = [
        '.'.join([id, syn]) for id in ['autoid', 'goldid']
        for syn in ['autosyn', 'goldsyn']
    ]
    stypes = ['train', 'dev', 'test']
    loader = StreusleLoader()
    STREUSLE_BASE = os.environ.get(
        'STREUSLE_BASE'
    ) or '/cs/usr/aviramstern/nlp/datasets/streusle_v4/release'
    all_files = [
        STREUSLE_BASE + '/' + stype + '/streusle.ud_' + stype + '.' + task + '.json'
        for task in tasks for stype in stypes
    ]
    records = sum([loader.load(f, input_format='json') for f in all_files], [])
    samples = [streusle_record_to_lstm_model_sample(r) for r in records]

    pp_vocab = Vocabulary('PREPS')
    pp_vocab.add_words(
        set([
            x.token for s in samples for x, y in zip(s.xs, s.ys)
            if any([y.supersense_role, y.supersense_func])
        ]))

    ner_vocab = Vocabulary('NERS')
    ner_vocab.add_words(
        set([x.ner for s in samples for x, y in zip(s.xs, s.ys)]))
    ner_vocab.add_word(None)

    lemmas_vocab = Vocabulary('LEMMAS')
    lemmas_vocab.add_words(
        set([x.lemma for s in samples for x, y in zip(s.xs, s.ys)]))

    ud_dep_vocab = Vocabulary('UD_DEPS')
    ud_dep_vocab.add_words(
        set([x.ud_dep for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_dep_vocab.add_word(None)

    ud_xpos_vocab = Vocabulary('UD_XPOS')
    ud_xpos_vocab.add_words(
        set([x.ud_xpos for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_xpos_vocab.add_word(None)

    token_vocab = Vocabulary('TOKENS')
    token_vocab.add_words(
        set([x.token for s in samples for x, y in zip(s.xs, s.ys)]))

    govobj_config_vocab = Vocabulary('GOVOBJ_CONFIGS')
    govobj_config_vocab.add_words(
        set([x.govobj_config for s in samples for x, y in zip(s.xs, s.ys)]))

    pss_vocab = Vocabulary('PSS')
    pss_vocab.add_words(supersense_repo.PREPOSITION_SUPERSENSES_SET)
    pss_vocab.add_word(None)

    # Note: pss_vocab is rebound here, so the LEXCAT vocabulary (not the PSS one
    # built just above) is the one that appears in the returned list.
    pss_vocab = Vocabulary('LEXCAT')
    pss_vocab.add_words(
        set([x.lexcat for s in samples for x, y in zip(s.xs, s.ys)]))

    return [
        pp_vocab, ner_vocab, lemmas_vocab, ud_dep_vocab, ud_xpos_vocab,
        token_vocab, pss_vocab, govobj_config_vocab
    ]