Example no. 1
0
    def load(self, directory):
        """Load the word2vec model, dictionary, and term-similarity matrix
        from *directory*.

        Populates ``self.ft`` (from ``w2v.model``), ``self.dictionary``
        (from ``dict.model``), and ``self.matrix`` (from ``stsm.model``).
        """
        def _artifact(filename):
            # Resolve a serialized model file inside the target directory.
            return os.path.join(directory, filename)

        self.ft = Word2Vec.load(_artifact("w2v.model"))
        self.dictionary = Dictionary.load(_artifact("dict.model"))
        self.matrix = SparseTermSimilarityMatrix.load(_artifact("stsm.model"))
Example no. 2
0
    def _preprocess_dataset(
        self, level: str
    ) -> Tuple[List[Document], List[Document], np.ndarray, List[Tuple[int,
                                                                      float]]]:
        """Split the dataset, build a similarity model for ``self.method``,
        and score every test document against every train document.

        Parameters
        ----------
        level : str
            ``'validation'`` — split the training documents 80/20 into
            train/test; ``'test'`` — use the full train set against the
            held-out test set.

        Returns
        -------
        tuple
            ``(train_documents, test_documents, similarities, test_corpus)``
            where ``similarities`` has shape
            ``(len(test_documents), len(train_documents))``.
            NOTE(review): for the 'scm' branch ``test_corpus`` is the
            tfidf-transformed corpus, which may not literally match the
            ``List[Tuple[int, float]]`` annotation — confirm against callers.

        Raises
        ------
        ValueError
            If ``level`` is unrecognized, ``self.method`` is unsupported, or
            the similarity matrix has an unexpected shape.
        """
        LOGGER.info('Preprocessing {} ({})'.format(self.dataset, level))

        # Choose the train/test document split according to evaluation level.
        if level == 'validation':
            # 80/20 split carved out of the training documents only.
            pivot = int(round(len(self.dataset.train_documents) * 0.8))
            train_documents = self.dataset.train_documents[:pivot]
            test_documents = self.dataset.train_documents[pivot:]
        elif level == 'test':
            train_documents = self.dataset.train_documents
            test_documents = self.dataset.test_documents
        else:
            message = 'Expected validation or test level, but got {}'
            raise ValueError(message.format(level))

        # On-disk cache shared by all methods; per-method filenames below.
        cache_path = self.model.cache_dir / 'text_classification'
        cache_path.mkdir(exist_ok=True)
        method_parameters = TEXT_CLASSIFICATION_METHOD_PARAMETERS[self.method]
        if self.method == 'scm':
            # Soft Cosine Measure: needs a dictionary, a tfidf weighting, and
            # a sparse term-similarity matrix built from the word embeddings.
            train_corpus = [document.words for document in train_documents]
            dictionary = Dictionary(train_corpus, prune_at=None)
            tfidf = TfidfModel(dictionary=dictionary, smartirs='nfn')
            termsim_index = WordEmbeddingSimilarityIndex(
                self.model.vectors, **method_parameters['similarity_index'])
            # Cache key includes dataset, split, method, and level so distinct
            # configurations never collide.
            cache_path = cache_path / '{}-{}-{}-{}'.format(
                self.dataset.name, self.dataset.split_idx, self.method, level)
            try:
                # EAFP: reuse a previously built matrix when present (mmap'd
                # read-only to avoid loading it fully into memory).
                similarity_matrix = SparseTermSimilarityMatrix.load(
                    str(cache_path), mmap='r')
            except IOError:
                # Cache miss: build the (expensive) matrix and persist it.
                similarity_matrix = SparseTermSimilarityMatrix(
                    termsim_index, dictionary, tfidf,
                    **method_parameters['similarity_matrix'])
                similarity_matrix.save(str(cache_path))
            train_corpus = [
                dictionary.doc2bow(document) for document in train_corpus
            ]
            train_corpus = tfidf[train_corpus]
            similarity_model = SoftCosineSimilarity(train_corpus,
                                                    similarity_matrix)
            test_corpus = (document.words for document in test_documents)
            test_corpus = [
                dictionary.doc2bow(document) for document in test_corpus
            ]
            test_corpus = tfidf[test_corpus]
        elif self.method == 'wmd':
            # Word Mover's Distance with a shelf-backed distance cache.
            train_corpus = [document.words for document in train_documents]
            # NOTE(review): unlike the 'scm' branch, this cache key omits
            # split_idx and level — presumably WMD distances are
            # split-independent; confirm.
            cache_path = cache_path / '{}-{}'.format(self.dataset.name,
                                                     self.method)
            cache_path = cache_path.with_suffix('.shelf')
            similarity_model = ParallelCachingWmdSimilarity(
                train_corpus, self.model.vectors, cache_path)
            test_corpus = [document.words for document in test_documents]
        else:
            message = 'Preprocessing for method {} not yet implemented'.format(
                self.method)
            raise ValueError(message)

        # Score the whole test corpus in one shot; numeric warnings (e.g.
        # divide-by-zero inside the similarity computation) are suppressed.
        with np.errstate(all='ignore'):
            similarities = similarity_model[test_corpus]
        # Sanity check: one row per test document, one column per train doc.
        expected_shape = (len(test_documents), len(train_documents))
        if similarities.shape != expected_shape:
            message = 'Expected similarities with shape {}, but received shape {}'
            raise ValueError(message.format(expected_shape,
                                            similarities.shape))

        return (train_documents, test_documents, similarities, test_corpus)
Example no. 3
0
                                         smartirs='dtb',
                                         slope=0.2)
                topic_tfidf.save(topic_tfidf_filename)

            try:
                document_tfidf = TfidfModel.load(document_tfidf_filename)
            except IOError:
                document_tfidf = TfidfModel(dictionary=dictionary,
                                            smartirs='dtb',
                                            slope=0.2)
                document_tfidf.save(document_tfidf_filename)

            termsim_matrix_parameters['tfidf'] = document_tfidf

            try:
                similarity_matrix = SparseTermSimilarityMatrix.load(
                    scm_filename)
            except IOError:
                try:
                    model = FastText.load(fasttext_filename, mmap='r')
                except IOError:
                    model = FastText(paragraphs, **fasttext_parameters)
                    model.save(fasttext_filename)
                annoy_indexer = AnnoyIndexer(model, num_trees=1)
                termsim_index_parameters = {
                    **termsim_index_parameters,
                    **{
                        'kwargs': {
                            'indexer': annoy_indexer
                        }
                    }
                }