Code example #1
    def _create_and_save_vocab(self, word_sequences, vocab_threshold,
                               model_dir, vocab_filename):
        """Create a Vocabulary instance from a list of word sequences, and save it to disk.

        Args:
            word_sequences: List of word sequences (sentences) to use as the basis for the vocabulary.

            vocab_threshold: Minimum number of times any word must appear within word_sequences
                in order to be included in the vocabulary.

            model_dir: Directory to save the vocabulary file to.

            vocab_filename: File name of the vocabulary file.
        """
        vocabulary = Vocabulary()
        for word_seq in word_sequences:
            vocabulary.add_words(word_seq.split())
        vocabulary.compile(vocab_threshold)

        vocab_filepath = path.join(model_dir, vocab_filename)
        vocabulary.save(vocab_filepath)
        return vocabulary
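A note on the Vocabulary class these examples rely on: the snippets only show it being called, never defined. Below is a minimal sketch of the interface they assume, stitched together from the calls visible in code examples #1-#4 (add_words, compile, save, load_word, int2word, size, word_exists, external_embeddings). The method bodies here are illustrative assumptions, not the real implementation.

    from collections import Counter

    class Vocabulary:
        """Minimal sketch; signatures inferred from the examples,
        bodies are assumptions."""

        def __init__(self, external_embeddings=None):
            self.external_embeddings = external_embeddings
            self._counts = Counter()
            self._word2int = {}

        def add_words(self, words):
            # Tally raw word occurrences (called once per sentence)
            self._counts.update(words)

        def load_word(self, word, word_int):
            # Register an already-finalized word at a fixed integer id
            self._word2int[word] = word_int

        def compile(self, vocab_threshold=1, loading=False):
            # Finalize: when building from a dataset, keep only words that
            # occur at least vocab_threshold times.
            if not loading:
                self._word2int = {
                    w: i for i, (w, c) in enumerate(self._counts.most_common())
                    if c >= vocab_threshold
                }

        def size(self):
            return len(self._word2int)

        def word_exists(self, word):
            return word in self._word2int

        def int2word(self, word_int, capitalize_i=True):
            # Reverse lookup (capitalize_i semantics assumed: upper-case the pronoun "i")
            inv = {i: w for w, i in self._word2int.items()}
            word = inv[word_int]
            return "I" if (capitalize_i and word == "i") else word

        def save(self, filepath):
            with open(filepath, "w", encoding="utf-8") as f:
                for word, i in sorted(self._word2int.items(), key=lambda kv: kv[1]):
                    f.write("{}\t{}\n".format(word, i))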
Code example #2
    def _create_and_save_vocab(self, word_sequences, vocab_threshold,
                               model_dir, vocab_filename, embeddings_dir,
                               normalize_imported_vocab, vocab_import_mode):
        """Create a Vocabulary instance from a list of word sequences, and save it to disk.

        Args:
            word_sequences: List of word sequences (sentences) to use as the basis for the vocabulary.

            vocab_threshold: Minimum number of times any word must appear within word_sequences
                in order to be included in the vocabulary.

            model_dir: Directory to save the vocabulary file to.

            vocab_filename: File name of the vocabulary file.

            embeddings_dir: Optional directory from which to import an external vocabulary and embeddings.
                If provided, the external vocabulary is imported and processed according to vocab_import_mode.
                If None, only the generated vocabulary is used.

            normalize_imported_vocab: See VocabularyImporter.import_vocabulary.

            vocab_import_mode: If embeddings_dir is specified, indicates whether the dataset vocabulary
                should be generated and combined with the external vocabulary according to the rules
                of VocabularyImportMode.
        """
        vocabulary = None
        if embeddings_dir is None or vocab_import_mode != VocabularyImportMode.External:
            vocabulary = Vocabulary()
            for word_seq in word_sequences:
                vocabulary.add_words(word_seq.split())
            vocabulary.compile(vocab_threshold)

        vocabulary_import_stats = None
        if embeddings_dir is not None:
            vocabulary_importer = vocabulary_importer_factory.get_vocabulary_importer(
                embeddings_dir)
            vocabulary, vocabulary_import_stats = vocabulary_importer.import_vocabulary(
                embeddings_dir, normalize_imported_vocab, vocab_import_mode,
                vocabulary)

        vocab_filepath = path.join(model_dir, vocab_filename)
        vocabulary.save(vocab_filepath)
        return vocabulary, vocabulary_import_stats
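Each of these methods branches on VocabularyImportMode, which is never shown in the snippets. A plausible reconstruction from the four members the code references follows; the member values and comments are assumptions, with the per-mode behavior read off code example #4.

    from enum import Enum

    class VocabularyImportMode(Enum):
        # Use the external vocabulary as-is; no dataset vocabulary is generated.
        External = 0
        # Keep only the external tokens that also occur in the dataset.
        ExternalIntersectDataset = 1
        # All external tokens plus any dataset tokens missing from them.
        ExternalUnionDataset = 2
        # Dataset tokens only: example #4 applies both the intersect filter and
        # the union additions for this mode, yielding exactly the dataset words.
        Dataset = 3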
Code example #3
    def _create_and_save_vocab(self, word_sequences, vocab_threshold,
                               model_dir, vocab_filename, embeddings_dir,
                               normalize_imported_vocab, vocab_import_mode):

        # Build the vocabulary from the dataset unless an external vocabulary is used verbatim
        vocabulary = None
        if embeddings_dir is None or vocab_import_mode != VocabularyImportMode.External:
            vocabulary = Vocabulary()
            for word_seq in word_sequences:
                vocabulary.add_words(word_seq.split())
            vocabulary.compile(vocab_threshold)
        
        # Import the external vocabulary, merging it with the dataset vocabulary as directed
        vocabulary_import_stats = None
        if embeddings_dir is not None:
            vocabulary_importer = vocabulary_importer_factory.get_vocabulary_importer(embeddings_dir)
            vocabulary, vocabulary_import_stats = vocabulary_importer.import_vocabulary(embeddings_dir, 
                                                                                        normalize_imported_vocab, 
                                                                                        vocab_import_mode, 
                                                                                        vocabulary)

        vocab_filepath = path.join(model_dir, vocab_filename)
        vocabulary.save(vocab_filepath)
        return vocabulary, vocabulary_import_stats
Code example #4
    def import_vocabulary(self,
                          vocabulary_dir,
                          normalize=True,
                          import_mode=VocabularyImportMode.External,
                          dataset_vocab=None):

        if dataset_vocab is None and import_mode != VocabularyImportMode.External:
            raise ValueError(
                "dataset_vocab must be provided if import_mode is not 'External'."
            )

        import_stats = VocabularyImportStats()

        # Read the external vocabulary tokens and embeddings
        tokens_with_embeddings = self._read_vocabulary_and_embeddings(
            vocabulary_dir)

        # If the normalize flag is true, normalize the casing of the external vocabulary
        # and average the embeddings of any resulting duplicate tokens
        if normalize:
            tokens_with_embeddings = self._normalize_tokens_with_embeddings(
                tokens_with_embeddings)

        import_stats.external_vocabulary_size = len(tokens_with_embeddings)

        # Apply dataset filters if applicable
        if dataset_vocab is not None:
            import_stats.dataset_vocabulary_size = dataset_vocab.size()

            if import_mode in (VocabularyImportMode.ExternalIntersectDataset,
                               VocabularyImportMode.Dataset):
                # Drop any tokens that exist in the external vocabulary but not in the dataset
                for token in list(tokens_with_embeddings.keys()):
                    if not dataset_vocab.word_exists(token):
                        del tokens_with_embeddings[token]
                import_stats.intersection_size = len(tokens_with_embeddings)

            if import_mode in (VocabularyImportMode.ExternalUnionDataset,
                               VocabularyImportMode.Dataset):
                # Add any tokens that exist in the dataset but not in the external vocabulary.
                # Each added token gets a word vector sampled component-wise from a gaussian
                # distribution whose per-component mean and standard deviation are taken from
                # the external embedding matrix.
                embeddings_matrix = np.array(
                    list(tokens_with_embeddings.values()), dtype=np.float32)
                emb_size = embeddings_matrix.shape[1]
                emb_mean = np.mean(embeddings_matrix, axis=0)
                emb_stdev = np.std(embeddings_matrix, axis=0)
                for i in range(dataset_vocab.size()):
                    dataset_token = dataset_vocab.int2word(i, capitalize_i=False)
                    if dataset_token not in tokens_with_embeddings:
                        tokens_with_embeddings[dataset_token] = np.random.normal(
                            emb_mean, emb_stdev, emb_size)

        if len(tokens_with_embeddings) == 0:
            raise ValueError(
                "Imported vocabulary size is 0. Try a different VocabularyImportMode (currently {0})"
                .format(VocabularyImportMode(import_mode).name))

        tokens, embeddings_matrix = zip(*tokens_with_embeddings.items())
        embeddings_matrix = np.array(embeddings_matrix, dtype=np.float32)

        # Create the vocabulary instance and load the imported tokens into it
        vocabulary = Vocabulary(external_embeddings=embeddings_matrix)
        for i, token in enumerate(tokens):
            vocabulary.load_word(token, i)
        vocabulary.compile(loading=True)
        return vocabulary, import_stats
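The most interesting step in code example #4 is how dataset-only tokens get embeddings: each missing token receives a vector drawn component-wise from a normal distribution whose per-component mean and standard deviation match the external embedding matrix, so sampled vectors are statistically similar to the real ones. A self-contained illustration of just that step (the toy tokens and dimensions are made up):

    import numpy as np

    # Toy external embeddings: 4 known tokens with 3-dimensional vectors
    tokens_with_embeddings = {
        "the": np.array([0.1, 0.2, 0.3], dtype=np.float32),
        "cat": np.array([0.4, 0.1, 0.0], dtype=np.float32),
        "sat": np.array([0.2, 0.3, 0.5], dtype=np.float32),
        "mat": np.array([0.0, 0.4, 0.2], dtype=np.float32),
    }

    embeddings_matrix = np.array(list(tokens_with_embeddings.values()), dtype=np.float32)
    emb_size = embeddings_matrix.shape[1]           # 3
    emb_mean = np.mean(embeddings_matrix, axis=0)   # per-component mean
    emb_stdev = np.std(embeddings_matrix, axis=0)   # per-component standard deviation

    # "zebra" is in the dataset but not in the external vocabulary, so it gets
    # a sampled vector; "cat" already has one and is left untouched.
    for dataset_token in ["cat", "zebra"]:
        if dataset_token not in tokens_with_embeddings:
            tokens_with_embeddings[dataset_token] = np.random.normal(
                emb_mean, emb_stdev, emb_size)

    print(tokens_with_embeddings["zebra"])  # e.g. [0.23 0.31 0.18] (random)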