Esempio n. 1
0
    def _preprocess_dataset(
        self, level: str
    ) -> Tuple[List[Document], List[Document], np.ndarray, List[Tuple[int,
                                                                      float]]]:
        """Split the dataset, build a similarity model, and score test docs.

        For ``level == 'validation'`` the train split is divided 80/20 into
        pseudo train/test sets; for ``level == 'test'`` the real splits are
        used.  The similarity model (soft cosine for ``'scm'``, word mover's
        distance for ``'wmd'``) scores every test document against every
        train document.

        Returns a tuple of (train documents, test documents, the
        ``(len(test), len(train))`` similarity matrix, and the preprocessed
        test corpus).  Raises ``ValueError`` for an unknown level or an
        unimplemented method, or when the similarity matrix has an
        unexpected shape.
        """
        LOGGER.info('Preprocessing {} ({})'.format(self.dataset, level))

        # Guard clause: only the two evaluation levels are supported.
        if level not in ('validation', 'test'):
            message = 'Expected validation or test level, but got {}'
            raise ValueError(message.format(level))

        if level == 'validation':
            # Hold out the last 20% of the train split as pseudo test data.
            split_point = int(round(len(self.dataset.train_documents) * 0.8))
            train_documents = self.dataset.train_documents[:split_point]
            test_documents = self.dataset.train_documents[split_point:]
        else:
            train_documents = self.dataset.train_documents
            test_documents = self.dataset.test_documents

        cache_dir = self.model.cache_dir / 'text_classification'
        cache_dir.mkdir(exist_ok=True)
        parameters = TEXT_CLASSIFICATION_METHOD_PARAMETERS[self.method]

        if self.method == 'scm':
            train_words = [document.words for document in train_documents]
            dictionary = Dictionary(train_words, prune_at=None)
            tfidf = TfidfModel(dictionary=dictionary, smartirs='nfn')
            term_index = WordEmbeddingSimilarityIndex(
                self.model.vectors, **parameters['similarity_index'])
            # The sparse term-similarity matrix is expensive to build, so it
            # is cached per dataset / split / method / level.
            matrix_path = cache_dir / '{}-{}-{}-{}'.format(
                self.dataset.name, self.dataset.split_idx, self.method, level)
            try:
                term_matrix = SparseTermSimilarityMatrix.load(
                    str(matrix_path), mmap='r')
            except IOError:
                term_matrix = SparseTermSimilarityMatrix(
                    term_index, dictionary, tfidf,
                    **parameters['similarity_matrix'])
                term_matrix.save(str(matrix_path))
            train_corpus = tfidf[[
                dictionary.doc2bow(words) for words in train_words
            ]]
            similarity_model = SoftCosineSimilarity(train_corpus, term_matrix)
            test_corpus = tfidf[[
                dictionary.doc2bow(document.words)
                for document in test_documents
            ]]
        elif self.method == 'wmd':
            train_words = [document.words for document in train_documents]
            # WMD distances are memoized in a shelf keyed by dataset/method.
            shelf_path = (cache_dir / '{}-{}'.format(
                self.dataset.name, self.method)).with_suffix('.shelf')
            similarity_model = ParallelCachingWmdSimilarity(
                train_words, self.model.vectors, shelf_path)
            test_corpus = [document.words for document in test_documents]
        else:
            message = 'Preprocessing for method {} not yet implemented'.format(
                self.method)
            raise ValueError(message)

        # Numeric warnings (e.g. division by zero on empty vectors) are
        # expected during bulk scoring and deliberately silenced.
        with np.errstate(all='ignore'):
            similarities = similarity_model[test_corpus]
        expected_shape = (len(test_documents), len(train_documents))
        if similarities.shape != expected_shape:
            message = 'Expected similarities with shape {}, but received shape {}'
            raise ValueError(message.format(expected_shape,
                                            similarities.shape))

        return (train_documents, test_documents, similarities, test_corpus)
Esempio n. 2
0
                    model = FastText(paragraphs, **fasttext_parameters)
                    model.save(fasttext_filename)
                annoy_indexer = AnnoyIndexer(model, num_trees=1)
                termsim_index_parameters = {
                    **termsim_index_parameters,
                    **{
                        'kwargs': {
                            'indexer': annoy_indexer
                        }
                    }
                }
                termsim_index = WordEmbeddingSimilarityIndex(
                    model.wv, **termsim_index_parameters)
                similarity_matrix = SparseTermSimilarityMatrix(
                    termsim_index, dictionary, **termsim_matrix_parameters)
                similarity_matrix.save(scm_filename)
                del model, termsim_index

            def topic_transformer(topic):
                """Map a tokenized topic to its TF-IDF bag-of-words vector.

                Closure over the enclosing scope's ``topic_tfidf`` model and
                shared ``dictionary``.
                """
                return topic_tfidf[dictionary.doc2bow(topic)]

            def document_transformer(document):
                """Map a tokenized document to its TF-IDF bag-of-words vector.

                Closure over the enclosing scope's ``document_tfidf`` model
                and shared ``dictionary``.
                """
                return document_tfidf[dictionary.doc2bow(document)]

            topic_corpus, document_corpus = read_corpora(
                {
                    'topic_corpus_filename': topic_corpus_filename,
                    'topic_corpus_num_documents': topic_corpus_num_documents,
                    'topic_ids': topic_ids,
                    'topic_transformer': topic_transformer,
                    'document_corpus_filename': document_corpus_filename,