def _create_and_save_vocab(self, word_sequences, vocab_threshold, model_dir, vocab_filename):
    """Create a Vocabulary instance from a list of word sequences, and save it to disk.

    Args:
        word_sequences: List of word sequences (sentence(s)) to use as basis for the vocabulary.

        vocab_threshold: Minimum number of times any word must appear within word_sequences
            in order to be included in the vocabulary.

        model_dir: Directory to save the vocabulary file to.

        vocab_filename: File name of the vocabulary file.

    Returns:
        The compiled Vocabulary instance.
    """
    vocabulary = Vocabulary()
    # Tokenize each sequence on whitespace and accumulate its words into the vocabulary.
    for word_seq in word_sequences:
        vocabulary.add_words(word_seq.split())
    # Finalize the vocabulary, applying the minimum-occurrence threshold.
    vocabulary.compile(vocab_threshold)

    vocab_filepath = path.join(model_dir, vocab_filename)
    vocabulary.save(vocab_filepath)
    return vocabulary
def _create_and_save_vocab(self, word_sequences, vocab_threshold, model_dir, vocab_filename,
                           embeddings_dir, normalize_imported_vocab, vocab_import_mode):
    """Create a Vocabulary instance from a list of word sequences, and save it to disk.

    Args:
        word_sequences: List of word sequences (sentence(s)) to use as basis for the vocabulary.

        vocab_threshold: Minimum number of times any word must appear within word_sequences
            in order to be included in the vocabulary.

        model_dir: Directory to save the vocabulary file to.

        vocab_filename: File name of the vocabulary file.

        embeddings_dir: Optional directory to import external vocabulary & embeddings.
            If provided, the external vocabulary will be imported and processed according
            to the vocab_import_mode. If None, only the generated vocabulary will be used.

        normalize_imported_vocab: See VocabularyImporter.import_vocabulary

        vocab_import_mode: If embeddings_dir is specified, this flag indicates if the dataset
            vocabulary should be generated and used in combination with the external vocabulary
            according to the rules of VocabularyImportMode.

    Returns:
        A tuple (vocabulary, vocabulary_import_stats). vocabulary_import_stats is None when
        no external vocabulary was imported.
    """
    vocabulary = None
    # Generate a vocabulary from the dataset unless the external vocabulary is to be
    # used exclusively (VocabularyImportMode.External with an embeddings_dir).
    if embeddings_dir is None or vocab_import_mode != VocabularyImportMode.External:
        vocabulary = Vocabulary()
        for word_seq in word_sequences:
            vocabulary.add_words(word_seq.split())
        vocabulary.compile(vocab_threshold)

    vocabulary_import_stats = None
    if embeddings_dir is not None:
        # Import the external vocabulary & embeddings, optionally combining it with
        # the dataset vocabulary generated above per vocab_import_mode.
        vocabulary_importer = vocabulary_importer_factory.get_vocabulary_importer(
            embeddings_dir)
        vocabulary, vocabulary_import_stats = vocabulary_importer.import_vocabulary(
            embeddings_dir, normalize_imported_vocab, vocab_import_mode, vocabulary)

    vocab_filepath = path.join(model_dir, vocab_filename)
    vocabulary.save(vocab_filepath)
    return vocabulary, vocabulary_import_stats
def _create_and_save_vocab(self, word_sequences, vocab_threshold, model_dir, vocab_filename,
                           embeddings_dir, normalize_imported_vocab, vocab_import_mode):
    """Create a Vocabulary instance from a list of word sequences, and save it to disk.

    Args:
        word_sequences: List of word sequences (sentence(s)) to use as basis for the vocabulary.

        vocab_threshold: Minimum number of times any word must appear within word_sequences
            in order to be included in the vocabulary.

        model_dir: Directory to save the vocabulary file to.

        vocab_filename: File name of the vocabulary file.

        embeddings_dir: Optional directory to import external vocabulary & embeddings.
            If None, only the generated vocabulary will be used.

        normalize_imported_vocab: See VocabularyImporter.import_vocabulary

        vocab_import_mode: If embeddings_dir is specified, indicates how the dataset
            vocabulary is combined with the external vocabulary (VocabularyImportMode).

    Returns:
        A tuple (vocabulary, vocabulary_import_stats). vocabulary_import_stats is None when
        no external vocabulary was imported.
    """
    vocabulary = None
    # Build a dataset vocabulary unless the external vocabulary is used exclusively.
    if embeddings_dir is None or vocab_import_mode != VocabularyImportMode.External:
        vocabulary = Vocabulary()
        for word_seq in word_sequences:
            vocabulary.add_words(word_seq.split())
        vocabulary.compile(vocab_threshold)

    vocabulary_import_stats = None
    if embeddings_dir is not None:
        # Import the external vocabulary & embeddings, optionally merged with the
        # dataset vocabulary according to vocab_import_mode.
        vocabulary_importer = vocabulary_importer_factory.get_vocabulary_importer(embeddings_dir)
        vocabulary, vocabulary_import_stats = vocabulary_importer.import_vocabulary(
            embeddings_dir, normalize_imported_vocab, vocab_import_mode, vocabulary)

    vocab_filepath = path.join(model_dir, vocab_filename)
    vocabulary.save(vocab_filepath)
    return vocabulary, vocabulary_import_stats
def import_vocabulary(self, vocabulary_dir, normalize=True,
                      import_mode=VocabularyImportMode.External, dataset_vocab=None):
    """Import an external vocabulary and its word embeddings from disk.

    Args:
        vocabulary_dir: Directory containing the external vocabulary & embedding files.

        normalize: If True, normalize the casing of the external vocabulary and average
            the embeddings of any tokens that become duplicates as a result.

        import_mode: VocabularyImportMode value controlling how the external vocabulary
            is combined with dataset_vocab (External, ExternalIntersectDataset,
            ExternalUnionDataset, or Dataset).

        dataset_vocab: Vocabulary generated from the dataset. Required for any
            import_mode other than External.

    Returns:
        A tuple (vocabulary, import_stats): the imported Vocabulary (with its external
        embeddings matrix) and a VocabularyImportStats with the sizes of the
        vocabularies involved.

    Raises:
        ValueError: If dataset_vocab is missing when required, or if the resulting
            vocabulary is empty.
    """
    if dataset_vocab is None and import_mode != VocabularyImportMode.External:
        raise ValueError("dataset_vocab must be provided if import_mode is not 'External'.")

    import_stats = VocabularyImportStats()

    # Read the external vocabulary tokens and embeddings.
    tokens_with_embeddings = self._read_vocabulary_and_embeddings(vocabulary_dir)

    # Normalize casing of the external vocabulary and average embeddings for any
    # resulting duplicate tokens.
    if normalize:
        tokens_with_embeddings = self._normalize_tokens_with_embeddings(tokens_with_embeddings)
    import_stats.external_vocabulary_size = len(tokens_with_embeddings)

    # Apply dataset filters if applicable.
    if dataset_vocab is not None:
        import_stats.dataset_vocabulary_size = dataset_vocab.size()

        if import_mode in (VocabularyImportMode.ExternalIntersectDataset,
                           VocabularyImportMode.Dataset):
            # Drop tokens that exist in the external vocabulary but not in the dataset.
            # Iterate over a snapshot of the keys since we delete while iterating.
            for token in list(tokens_with_embeddings.keys()):
                if not dataset_vocab.word_exists(token):
                    del tokens_with_embeddings[token]
            import_stats.intersection_size = len(tokens_with_embeddings)

        if import_mode in (VocabularyImportMode.ExternalUnionDataset,
                           VocabularyImportMode.Dataset):
            # Add any tokens that exist in the dataset but not in the external vocabulary.
            # These added tokens get word vectors sampled from the gaussian distributions
            # of their components: each component's mean and standard deviation are taken
            # from that component in the external embedding matrix.
            # NOTE(review): if the intersection above emptied tokens_with_embeddings,
            # embeddings_matrix is empty and shape[1] raises IndexError — confirm whether
            # that can occur with real data before hardening.
            embeddings_matrix = np.array(list(tokens_with_embeddings.values()), dtype=np.float32)
            emb_size = embeddings_matrix.shape[1]
            emb_mean = np.mean(embeddings_matrix, axis=0)
            emb_stdev = np.std(embeddings_matrix, axis=0)
            for i in range(dataset_vocab.size()):
                dataset_token = dataset_vocab.int2word(i, capitalize_i=False)
                if dataset_token not in tokens_with_embeddings:
                    tokens_with_embeddings[dataset_token] = np.random.normal(
                        emb_mean, emb_stdev, emb_size)

    if not tokens_with_embeddings:
        raise ValueError(
            "Imported vocabulary size is 0. Try a different VocabularyImportMode (currently {0})"
            .format(VocabularyImportMode(import_mode).name))

    tokens, embeddings_matrix = zip(*tokens_with_embeddings.items())
    embeddings_matrix = np.array(embeddings_matrix, dtype=np.float32)

    # Create the vocabulary instance, registering each token at its embedding row index.
    vocabulary = Vocabulary(external_embeddings=embeddings_matrix)
    for index, token in enumerate(tokens):
        vocabulary.load_word(token, index)
    vocabulary.compile(loading=True)

    return vocabulary, import_stats