Code example #1
def build_vocabs():
    train, dev, test = load_boknilev()
    samples = [
        s for r in train + dev + test
        for s in boknilev_record_to_hcpd_samples(r)
    ]

    gold_pos_vocab = Vocabulary('GOLD_POS')
    gold_pos_vocab.add_words(
        set([hc.next_pos for s in samples for hc in s.x.head_cands]))
    gold_pos_vocab.add_word(None)

    words_vocab = Vocabulary('WORDS')
    words_vocab.add_words(
        set([hc.word for s in samples for hc in s.x.head_cands]))
    words_vocab.add_words(set([s.x.pp.word for s in samples]))
    words_vocab.add_words(set([s.x.child.word for s in samples]))
    words_vocab.add_word(None)

    words_to_lemmas = {}
    words_to_lemmas.update({s.x.child.word: s.x.child.lemma for s in samples})
    words_to_lemmas.update(
        {hc.word: hc.lemma
         for s in samples for hc in s.x.head_cands})

    return [gold_pos_vocab, words_vocab, words_to_lemmas]
Code example #2
    def build_vocabulary_from_dataset(self, data):
        vocabulary = Vocabulary(custom_unk_word=' ')
        for transcription in data['transcription_tokens']:
            for word in transcription:
                vocabulary.add_word(word)
        dataset_info = {'vocabulary': vocabulary}
        return dataset_info
Code example #3
    def __init__(self,
                 data_file,
                 character_level=None,
                 phoneme_level=None,
                 vocabulary=None,
                 transform=None):
        self.data_file = data_file
        self.data = joblib.load(open(self.data_file, 'rb'))
        self.character_level = character_level
        self.phoneme_level = phoneme_level
        self.transcription_processor = lambda words: words

        if self.character_level:
            characters = [chr(c) for c in range(ord('a'), ord('z') + 1)]
            characters += [' ']
            character_vocab = Vocabulary()
            for character in characters:
                character_vocab.add_word(character)
            self.vocabulary = character_vocab
            self.transcription_processor = self._character_level_transcription_processor
        elif self.phoneme_level:
            cmu_phones = list(map(lambda x: x[0], cmudict.phones()))
            cmu_phones += [' ']
            phones_vocab = Vocabulary(custom_unk_word=' ')
            for phone in cmu_phones:
                phones_vocab.add_word(phone)
            self.vocabulary = phones_vocab
            self.phones_dict = cmudict.dict()
            self.transcription_processor = self._phone_level_transcription_processor
        elif vocabulary is None:
            # Cache the built vocabulary next to the data file; use only the base
            # name so the directory is not duplicated in the joined path.
            data_file_dir = os.path.dirname(self.data_file)
            data_file_prefix = os.path.splitext(
                os.path.basename(self.data_file))[0]
            pickle_file_name = f'{data_file_prefix}_SpeechDataset.pickle'
            pickle_file_path = os.path.join(data_file_dir, pickle_file_name)
            if not os.path.isfile(pickle_file_path):
                dataset_info = self.build_vocabulary_from_dataset(self.data)
                pickle.dump(dataset_info, open(pickle_file_path, 'wb'))
            else:
                dataset_info = pickle.load(open(pickle_file_path, 'rb'))
            self.vocabulary = dataset_info['vocabulary']
        else:
            self.vocabulary = vocabulary
        self.transform = transform
        self.max_transcription_length = max([
            len(transcription)
            for transcription in self.data['transcription_tokens']
        ])
        self.max_input_length = max([
            spectrogram.shape[1]
            for spectrogram in self.data['audio_spectrograms']
        ])
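
The two transcription processors referenced above (`_character_level_transcription_processor` and `_phone_level_transcription_processor`) are defined elsewhere in the class and are not part of the snippet. As a rough illustration only, a hypothetical character-level processor might flatten each word into its letters with a space token between words, matching the a-z plus ' ' vocabulary built above:

    # Hypothetical sketch: the real method is not shown in the snippet above.
    # Assumes `words` is a list of lowercase tokens, e.g. ['hello', 'world'],
    # and that the model consumes one character (or ' ') per time step.
    def _character_level_transcription_processor(self, words):
        characters = []
        for i, word in enumerate(words):
            characters.extend(word)        # split the word into single characters
            if i < len(words) - 1:
                characters.append(' ')     # word boundary, present in the vocabulary
        return characters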
Code example #4
def read_instances_from_file(files, max_len=400, keep_case=False):
    ''' Collect instances and construct vocab '''

    vocab = Vocabulary()
    lb_vocab = Vocabulary(need_default=False)
    sets = []

    for file in files:
        sents, labels = [], []
        trimmed_sent = 0
        with open(file) as f:
            lines = f.readlines()
            for l in lines:
                l = l.strip().split('\t')
                if len(l) < 2:
                    continue
                label = l[0]
                sent = l[1]
                if not keep_case:
                    sent = sent.lower()
                word_lst = sent.split()
                if len(word_lst) > max_len:
                    word_lst = word_lst[:max_len]
                    trimmed_sent += 1
                if word_lst:
                    sents.append(word_lst)
                    labels.append(label)
                    vocab.add_word_lst(word_lst)
                    lb_vocab.add_word(label)

        assert len(sents) == len(labels)

        sets.append({'sents': sents, 'labels': labels})

        logger.info('Get {} instances from file {}'.format(len(sents), file))
        if trimmed_sent:
            logger.info(
                '{} sentences are trimmed. Max sentence length: {}.'.format(
                    trimmed_sent, max_len))

    logger.info('Building vocabulary...')
    # register '<cls>' with an inflated count so it is kept when the vocabulary is built
    vocab.add_word_lst(['<cls>'] * 6)
    vocab.build_vocab()
    lb_vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}. # Class: {}.'.format(
        len(vocab), len(lb_vocab)))
    logger.info('<pad>: {}'.format(vocab.to_index('<pad>')))
    logger.info('<unk>: {}'.format(vocab.to_index('<unk>')))
    logger.info('<cls>: {}'.format(vocab.to_index('<cls>')))

    return sets, vocab, lb_vocab
Code example #5
def build_vocab(cleaned_captions):
    """ 
    Parses training set token file captions and builds a Vocabulary object
    Args:
        cleaned_captions (str list): cleaned list of human captions to build vocab with

    Returns:
        vocab (Vocabulary): Vocabulary object
    """

    # count how often each word appears across the cleaned captions
    word_count = {}
    for caption in cleaned_captions:
        for word in caption.split():
            if word not in word_count:
                word_count[word] = 0
            word_count[word] += 1

    # create a vocab instance
    vocab = Vocabulary()

    # add the token words
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # add every caption word that occurs more than three times
    for word, n in word_count.items():
        if n > 3:
            vocab.add_word(word)

    return vocab
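
A vocabulary built this way is typically used afterwards to turn a caption into a list of token indices, with rare or unseen words mapping to '<unk>'. The Vocabulary class itself is not part of the snippet, so the sketch below uses a minimal stand-in with the add_word/__call__ interface that build_vocab relies on:

# Minimal stand-in for the Vocabulary wrapper assumed by build_vocab above:
# add_word() assigns the next free index, and calling the object maps a word
# to its index, falling back to '<unk>' for unknown words.
class Vocabulary:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}

    def add_word(self, word):
        if word not in self.word2idx:
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word

    def __call__(self, word):
        return self.word2idx.get(word, self.word2idx['<unk>'])

    def __len__(self):
        return len(self.word2idx)


vocab = build_vocab(['a man rides a horse', 'a man rides a bike'] * 4)
caption = 'a man rides a camel'
ids = [vocab('<start>')] + [vocab(w) for w in caption.split()] + [vocab('<end>')]
# 'camel' never reaches the count threshold, so it maps to the '<unk>' index.
print(ids)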
Code example #6
File: 01-build_vocab.py  Project: JoshuaZe/img2tags
def build_vocab(annotation_path, threshold):
    """Build a simple vocabulary wrapper."""
    df_annotation = pd.read_csv(annotation_path, keep_default_na=False)
    counter = Counter()
    for _, each_annotation in df_annotation.iterrows():
        attribute_tags = each_annotation['attribute_tags']
        # split the comma-separated tag string, dropping empty and numeric tokens
        tokens = [
            token.strip() for token in attribute_tags.split(',')
            if token.strip() and not has_numbers(token)
        ]
        counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]
    # print(words)

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
Code example #7
def build_vocab(caption, threshold):
    """Build a simple vocabulary wrapper."""
    counter = Counter()
    n = len(caption.keys())
    for i, key in enumerate(caption.keys()):
        for sentence in caption[key]:
            tokens = nltk.tokenize.word_tokenize(sentence)
            counter.update(tokens)

        if i % 1000 == 0:
            print("[%d/%d] Tokenized the captions." % (i, n))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Creates a vocab wrapper and adds some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Adds the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
Code example #8
    def process_dataset(self, fileobj):
        vocabulary = Vocabulary()
        sequence_delimiters = [0]
        while True:
            line = fileobj.readline()
            if not line:
                break
            sequence_delimiters.append(fileobj.tell())
            words = line.strip().split(',')
            for word in words:
                vocabulary.add_word(word)
        dataset_info = {
            'sequence_delimiters': sequence_delimiters,
            'vocabulary': vocabulary
        }
        return dataset_info
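
The offsets collected with `fileobj.tell()` mark where each sequence starts, which presumably lets the loader later `seek()` straight to a given sequence without re-reading the whole file. The snippet does not show that second half; below is a small self-contained sketch of the access pattern (the Vocabulary part is left out):

import io

# Toy comma-separated dataset; in the real loader this is a file object on disk.
fileobj = io.StringIO('red,green,blue\ncat,dog\none,two,three,four\n')

# Record the offset at which each sequence starts, as in process_dataset above.
sequence_delimiters = [0]
while True:
    line = fileobj.readline()
    if not line:
        break
    sequence_delimiters.append(fileobj.tell())

# Random access: jump back to the start of the second sequence and re-read it.
fileobj.seek(sequence_delimiters[1])
print(fileobj.readline().strip().split(','))   # ['cat', 'dog']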
Code example #9
File: dataloader.py  Project: msieb1/LTCN
class EmbeddingLoader(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, n_views, emb_directory, label_directory):
        self.n_views = n_views
        self._read_embedding_dir(emb_directory)  # Creates list of paths to all embedding files
        self._read_label_dir(label_directory)  # Creates list of paths to all label files

        # The negative example has to be from outside the buffer window. Taken from both sides of
        # the frame.
        self.sequence_index = 0
        self.vocab = Vocabulary()
        words = ['blue', 'orange', 'green', 'red', 'yellow']
        for word in words:
            self.vocab.add_word(word)

    def __len__(self):
        return len(self.emb_paths)

    def __getitem__(self, idx):
        emb = read_npy_file(self.emb_paths[idx])
        seq_idx = self.emb_paths[idx].split('/')[-1].split('_')[0]
        label = read_caption(os.path.join(self._label_directory, seq_idx + '_parsed.txt'))
        label = nltk.tokenize.word_tokenize(str(label).lower())
        active_label = self.vocab(label[-4])
        passive_label = self.vocab(label[-2])
        # Convert caption (string) to word ids.

        target1 = torch.LongTensor([active_label])
        target2 = torch.LongTensor([passive_label])

        emb = np.mean(emb, axis=0)
        emb = torch.FloatTensor(emb)
        return emb, target1, target2

    def _read_embedding_dir(self, emb_directory):
        self._emb_directory = emb_directory
        filenames = ls_npy(emb_directory)
        self.emb_paths = [os.path.join(self._emb_directory, f) for f in filenames]
        self.sequence_count = int(len(self.emb_paths) / self.n_views)

    def _read_label_dir(self, label_directory):
        self._label_directory = label_directory
        filenames = ls_txt(label_directory)
        self.label_paths = [os.path.join(self._label_directory, f) for f in filenames]
Code example #10
    def glove(self):
        embeddings = []
        vocab = Vocabulary()

        with open(self.file_name, encoding='UTF-8') as f:

            for line in f:
                values = line.split()
                vocab.add_word(values[0])
                embeddings.append(np.asarray(values[1:], dtype='float32'))

            if "<UNK>" not in vocab.word2idx:
                vocab.add_word("<UNK>")
                embeddings.append(
                    np.random.uniform(low=-0.05,
                                      high=0.05,
                                      size=self.embedding_dim))

            return vocab, len(embeddings), Embeddings(
                vocab_size=len(embeddings),
                embedding_dim=self.embedding_dim,
                embeddings=np.array(embeddings, dtype='float32'),
                trainable=False)
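
The loader keeps `vocab` and `embeddings` as parallel structures: row i of the matrix is the vector of the word whose index is i, and any out-of-vocabulary word falls back to the randomly initialised '<UNK>' row. A minimal sketch of that lookup, with a plain dict standing in for `vocab.word2idx` (the real Vocabulary class is not shown in the snippet):

import numpy as np

embedding_dim = 4

# Stand-ins for vocab.word2idx and the stacked embedding rows built above.
word2idx = {'the': 0, 'cat': 1, '<UNK>': 2}
embeddings = np.random.uniform(
    -0.05, 0.05, size=(len(word2idx), embedding_dim)).astype('float32')

def vector_for(word):
    # Unknown words map to the '<UNK>' row, mirroring the fallback above.
    return embeddings[word2idx.get(word, word2idx['<UNK>'])]

print(vector_for('cat').shape)   # (4,)
print(vector_for('dog'))         # the '<UNK>' vector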
Code example #11
def build_vocab_question(imgs, params):
    # build vocabulary for question and answers.

    count_thr = params['word_count_threshold']

    # count up the number of words
    counts = {}
    for img in imgs:
        for w in img['processed_tokens']:
            counts[w] = counts.get(w, 0) + 1
    cw = sorted([(count, w) for w, count in counts.items()], reverse=True)
    print('top words and their counts:')
    print('\n'.join(map(str, cw[:20])))

    # print some stats
    total_words = sum(counts.values())
    print('total words:', total_words)
    bad_words = [w for w, n in counts.items() if n <= count_thr]
    words = [w for w, n in counts.items() if n > count_thr]
    bad_count = sum(counts[w] for w in bad_words)
    print('number of bad words: %d/%d = %.2f%%' %
          (len(bad_words), len(counts), len(bad_words) * 100.0 / len(counts)))
    print('number of words in vocab would be %d' % (len(words), ))
    print('number of UNKs: %d/%d = %.2f%%' %
          (bad_count, total_words, bad_count * 100.0 / total_words))

    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    for i, word in enumerate(words):
        vocab.add_word(word)

    for img in imgs:
        txt = img['processed_tokens']
        question = [
            w if counts.get(w, 0) > count_thr else '<unk>' for w in txt
        ]
        img['final_question'] = question

    return imgs, vocab
Code example #12
def build_vocab(json_path: str, threshold: int) -> Vocabulary:
    coco_cls = COCO(json_path)
    countt_cls = Counter()
    ids = coco_cls.anns.keys()
    for i, idt in enumerate(ids):
        caption = str(coco_cls.anns[idt]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        countt_cls.update(tokens)
        if (i + 1) % 1000 == 0:
            print('%d/%d tokenize the captions' % (i + 1, len(ids)))
    words = [word for word, cnt in countt_cls.items() if cnt >= threshold]
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')
    for word in words:
        vocab.add_word(word)
    return vocab
Code example #13
def read_instances_from_file(files, max_len, keep_case):
    ''' Collect instances and construct vocab '''

    vocab = Vocabulary()
    pos_vocab = Vocabulary(need_default=False)
    ner_vocab = Vocabulary(need_default=False)
    srl_vocab = Vocabulary(need_default=False)
    chunk_vocab = Vocabulary(need_default=False)
    sets = []

    for file in files:
        sents = []
        pos_labels, ner_labels, srl_labels, chunk_labels = [], [], [], []
        trimmed_sent = 0
        with open(file) as f:
            lines = f.readlines()
            sent = []
            pos_label, ner_label, srl_label, chunk_label = [], [], [], []
            for l in lines:
                l = l.strip()
                if l == '':
                    if len(sent) > 0:
                        if len(sent) > max_len:
                            trimmed_sent += 1
                            pos_labels.append(pos_label[:max_len])
                            ner_labels.append(ner_label[:max_len])
                            srl_labels.append(srl_label[:max_len])
                            chunk_labels.append(chunk_label[:max_len])
                            sents.append(sent[:max_len])
                        else:
                            pos_labels.append(pos_label)
                            ner_labels.append(ner_label)
                            srl_labels.append(srl_label)
                            chunk_labels.append(chunk_label)
                            sents.append(sent)
                        sent = []
                        pos_label, ner_label, srl_label, chunk_label = [], [], [], []
                else:
                    l = l.split()
                    word = l[0]

                    if not keep_case:
                        word = word.lower()

                    sent.append(word)
                    pos_label.append(l[2])
                    ner_label.append(l[3])
                    srl_label.append(l[4])
                    chunk_label.append(l[5])

                    vocab.add_word(word)
                    pos_vocab.add_word(l[2])
                    ner_vocab.add_word(l[3])
                    srl_vocab.add_word(l[4])
                    chunk_vocab.add_word(l[5])

        sets.append({
            'sents': sents,
            'pos_labels': pos_labels,
            'ner_labels': ner_labels,
            'srl_labels': srl_labels,
            'chunk_labels': chunk_labels
        })

        logger.info('Get {} instances from file {}'.format(len(sents), file))
        if trimmed_sent:
            logger.warning(
                '{} sentences are trimmed. Max sentence length: {}.'.format(
                    trimmed_sent, max_len))

    logger.info('Building vocabulary...')
    vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}'.format(len(vocab)))

    pos_vocab.build_vocab()
    ner_vocab.build_vocab()
    srl_vocab.build_vocab()
    chunk_vocab.build_vocab()
    logger.info('# class in POS Tagging: {}'.format(len(pos_vocab)))
    logger.info('# class in NER Tagging: {}'.format(len(ner_vocab)))
    logger.info('# class in SRL Tagging: {}'.format(len(srl_vocab)))
    logger.info('# class in Chunking: {}'.format(len(chunk_vocab)))

    return sets, vocab, [pos_vocab, ner_vocab, srl_vocab, chunk_vocab]
Code example #14
File: dataloader.py  Project: msieb1/LTCN
class MultiViewTripletLabelDataset(Dataset):

     
    def __init__(self, n_views, video_directory, label_directory, image_size, sample_size=500):
        self.frame_size = image_size
        self.n_views = n_views
        self._read_video_dir(video_directory)
        self.vocab = Vocabulary()
        words = ['blue', 'orange', 'green', 'red', 'yellow']
        for word in words:
            self.vocab.add_word(word)
        self._read_label_dir(label_directory)
        self._count_frames()
        self.sample_size = sample_size
        self.valid_sequence_indices = self._get_valid_sequence_indices(label_directory)
        self.sequence_index = 0
        self.negative_frame_margin = 30
        assert len(self.label_paths) == int(len(self.video_paths) / self.n_views)

    def __len__(self):
        return len(self.valid_sequence_indices)

    def __getitem__(self, idx):
        # build image triplet item
        self.sequence_index = int(idx)
        triplets = torch.Tensor(self.sample_size, 3, 3, *self.frame_size)
        label = read_caption(self.label_paths[self.sequence_index])
        label = nltk.tokenize.word_tokenize(str(label).lower())
        # print("index: {}, label: {}".format(self.valid_sequence_indices[idx], label))
        for i in range(self.sample_size):
            snaps = self.get_videos(self.sequence_index * self.n_views)
            anchor_frame, positive_frame, negative_frame = self.sample_triplet(snaps)
            triplets[i, 0, :, :, :] = anchor_frame
            triplets[i, 1, :, :, :] = positive_frame
            triplets[i, 2, :, :, :] = negative_frame    

        try:
            active_label = self.vocab(label[-4])
            passive_label = self.vocab(label[-2])
        except Exception:
            print("Unknown label: ", label)
            print("sequence: ", self.sequence_index)
            # Without valid labels the targets below cannot be built, so re-raise
            # instead of failing later with a NameError.
            raise
        seq_idx = torch.LongTensor([self.sequence_index] * self.sample_size)
        # Convert caption (string) to word ids.
        target = torch.LongTensor([[active_label, passive_label]] * self.sample_size) # Needs padded targets of same size as inputs
        return triplets, target, seq_idx

    def _read_video_dir(self, video_directory):
        self._video_directory = video_directory
        filenames = ls(video_directory)
        self.video_paths = [os.path.join(self._video_directory, f) for f in filenames]
        self.sequence_count = int(len(self.video_paths) / self.n_views)

    def _count_frames(self):
        frame_lengths = np.array([len(imageio.read(p)) for p in self.video_paths])
        self.frame_lengths = frame_lengths - OFFSET
        self.cumulative_lengths = np.zeros(len(self.frame_lengths), dtype=np.int32)
        prev = 0
        for i, frames in enumerate(self.frame_lengths):
            prev = self.cumulative_lengths[i-1]
            self.cumulative_lengths[i] = prev + frames

    def _read_label_dir(self, label_directory):
        self._label_directory = label_directory
        filenames = ls_txt(label_directory)
        self.label_paths = [os.path.join(self._label_directory, f) for f in filenames]   

    def _get_valid_sequence_indices(self, label_directory):
        valid_sequence_indices = []
        curr_seq_idx = 0
        filenames = ls_txt(label_directory)

        for filename in filenames:
            label = read_caption(os.path.join(label_directory, filename))
            label = nltk.tokenize.word_tokenize(str(label).lower())
            if label[-4] is None or label[-2] is None:
                curr_seq_idx += 1
                continue
            else:
                valid_sequence_indices.append(int(filename.split('_')[0]))
                curr_seq_idx += 1

        return valid_sequence_indices

    @functools.lru_cache(maxsize=1)
    def get_videos(self, index):
        views = []
        for i in range(self.n_views):
            views.append(read_video(self.video_paths[index + i], self.frame_size))
        return views

    def sample_triplet(self, snaps):
        loaded_sample = False
        while not loaded_sample:

            try:
                anchor_index = self.sample_anchor_frame_index()
                positive_index = anchor_index
                negative_index = self.sample_negative_frame_index(anchor_index)
                loaded_sample = True
            except:
                print("Error loading video - sequence index: ", self.sequence_index)
                print("video lengths: ", [len(snaps[i]) for i in range(0, len(snaps))])
                print("Maybe margin too high")
        # random sample anchor view,and positive view
        view_set = set(range(self.n_views))
        anchor_view = np.random.choice(np.array(list(view_set)))
        view_set.remove(anchor_view)
        positive_view = np.random.choice(np.array(list(view_set)))
        negative_view = anchor_view # negative example comes from same view INQUIRE TODO

        anchor_frame = snaps[anchor_view][anchor_index]
        positive_frame = snaps[positive_view][positive_index]
        negative_frame = snaps[negative_view][negative_index]
        return (torch.Tensor(anchor_frame), torch.Tensor(positive_frame),
            torch.Tensor(negative_frame))

    def build_set(self):
        triplets = torch.Tensor(self.sample_size, 3, 3, *self.frame_size)
        for i in range(0, self.sample_size):
            snaps = self.get_videos(self.sequence_index * self.n_views)
            anchor_frame, positive_frame, negative_frame = self.sample_triplet(snaps)
            triplets[i, 0, :, :, :] = anchor_frame
            triplets[i, 1, :, :, :] = positive_frame
            triplets[i, 2, :, :, :] = negative_frame
        self.sequence_index = (self.sequence_index + 1) % self.sequence_count
        # Second argument is labels. Not used.
        return TensorDataset(triplets, torch.zeros(triplets.size()[0]))

    def sample_anchor_frame_index(self):
        arange = np.arange(0, self.frame_lengths[self.sequence_index * self.n_views])
        return np.random.choice(arange)

    # def sample_positive_frame_index(self, anchor_index):
    #     upper_bound = min(self.frame_lengths[self.sequence_index * self.n_views + 1], anchor_index)
    #     return upper_bound # in case video has less frames than anchor video

    def negative_frame_indices(self, anchor_index):
        video_length = self.frame_lengths[self.sequence_index * self.n_views]
        lower_bound = 0
        upper_bound = max(0, anchor_index - self.negative_frame_margin)
        range1 = np.arange(lower_bound, upper_bound)
        lower_bound = min(anchor_index + self.negative_frame_margin, video_length)
        upper_bound = video_length
        range2 = np.arange(lower_bound, upper_bound)
        return np.concatenate([range1, range2])

    def sample_negative_frame_index(self, anchor_index):
        return np.random.choice(self.negative_frame_indices(anchor_index))
Code example #15
def build_vocabs():
    tasks = [
        '.'.join([id, syn]) for id in ['autoid', 'goldid']
        for syn in ['autosyn', 'goldsyn']
    ]
    stypes = ['train', 'dev', 'test']

    loader = StreusleLoader()
    STREUSLE_BASE = os.environ.get(
        'STREUSLE_BASE'
    ) or '/cs/usr/aviramstern/nlp/datasets/streusle_v4/release'
    all_files = [
        STREUSLE_BASE + '/' + stype + '/streusle.ud_' + stype + '.' + task +
        '.json' for task in tasks for stype in stypes
    ]
    records = sum([loader.load(f, input_format='json') for f in all_files], [])
    samples = [streusle_record_to_lstm_model_sample(r) for r in records]

    pp_vocab = Vocabulary('PREPS')
    pp_vocab.add_words(
        set([
            x.token for s in samples for x, y in zip(s.xs, s.ys)
            if any([y.supersense_role, y.supersense_func])
        ]))

    ner_vocab = Vocabulary('NERS')
    ner_vocab.add_words(
        set([x.ner for s in samples for x, y in zip(s.xs, s.ys)]))
    ner_vocab.add_word(None)

    lemmas_vocab = Vocabulary('LEMMAS')
    lemmas_vocab.add_words(
        set([x.lemma for s in samples for x, y in zip(s.xs, s.ys)]))

    ud_dep_vocab = Vocabulary('UD_DEPS')
    ud_dep_vocab.add_words(
        set([x.ud_dep for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_dep_vocab.add_word(None)

    ud_xpos_vocab = Vocabulary('UD_XPOS')
    ud_xpos_vocab.add_words(
        set([x.ud_xpos for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_xpos_vocab.add_word(None)

    token_vocab = Vocabulary('TOKENS')
    token_vocab.add_words(
        set([x.token for s in samples for x, y in zip(s.xs, s.ys)]))

    govobj_config_vocab = Vocabulary('GOVOBJ_CONFIGS')
    govobj_config_vocab.add_words(
        set([x.govobj_config for s in samples for x, y in zip(s.xs, s.ys)]))

    pss_vocab = Vocabulary('PSS')
    pss_vocab.add_words(supersense_repo.PREPOSITION_SUPERSENSES_SET)
    pss_vocab.add_word(None)

    lexcat_vocab = Vocabulary('LEXCAT')
    lexcat_vocab.add_words(
        set([x.lexcat for s in samples for x, y in zip(s.xs, s.ys)]))

    return [
        pp_vocab, ner_vocab, lemmas_vocab, ud_dep_vocab, ud_xpos_vocab,
        token_vocab, pss_vocab, lexcat_vocab, govobj_config_vocab
    ]