Example #1
0
 def testEmptyPhrasifiedSentencesIterator(self):
     """Check that a stacked bigram -> trigram phrasified stream survives repeated iteration."""
     bigram_model = FrozenPhrases(Phrases(self.sentences))
     trigram_model = FrozenPhrases(Phrases(bigram_model[self.sentences]))
     phrasified = trigram_model[bigram_model[self.sentences]]

     # Consume the same object twice: the second pass must yield the same,
     # non-empty content as the first.
     first_pass = list(phrasified)
     second_pass = list(phrasified)
     self.assertEqual(first_pass, second_pass)
     self.assertNotEqual(second_pass, [])
Example #2
0
    def setUp(self):
        """Build the two FrozenPhrases fixtures used by the tests.

        ``self.bigram`` is frozen from a Phrases model trained with ``min_count=1``
        and ``threshold=1``; ``self.bigram_default`` leaves both at the Phrases
        defaults. Both are trained on ``self.sentences`` with ``self.connector_words``.
        """
        low_threshold_model = Phrases(
            self.sentences, min_count=1, threshold=1, connector_words=self.connector_words,
        )
        self.bigram = FrozenPhrases(low_threshold_model)

        default_model = Phrases(self.sentences, connector_words=self.connector_words)
        self.bigram_default = FrozenPhrases(default_model)
Example #3
0
 def testSaveLoad(self):
     """Round-trip a FrozenPhrases model through save/load and check it still phrasifies."""
     tokens = ['graph', 'minors', 'survey', 'human', 'interface', 'system']
     expected = ['graph_minors', 'survey', 'human_interface', 'system']
     with temporary_file("test.pkl") as fpath:
         trained = Phrases(self.sentences, min_count=1, threshold=1)
         FrozenPhrases(trained).save(fpath)
         restored = FrozenPhrases.load(fpath)
         self.assertEqual(restored[tokens], expected)
Example #4
0
    def testSaveLoadCustomScorer(self):
        """Round-trip a FrozenPhrases model built with a custom scorer; the scorer must be kept."""
        with temporary_file("test.pkl") as fpath:
            trained = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
            frozen = FrozenPhrases(trained)
            frozen.save(fpath)

            restored = FrozenPhrases.load(fpath)
            self.assertEqual(restored.scoring, dumb_scorer)
Example #5
0
 def test_save_load_with_connector_words(self):
     """Round-trip a FrozenPhrases model and check that ``connector_words`` is preserved."""
     expected_connectors = frozenset({'of'})
     with temporary_file("test.pkl") as fpath:
         trained = Phrases(
             self.sentences, min_count=1, threshold=1, connector_words=expected_connectors,
         )
         FrozenPhrases(trained).save(fpath)
         restored = FrozenPhrases.load(fpath)
         self.assertEqual(restored.connector_words, expected_connectors)
Example #6
0
    def testCompatibilty(self):
        """Load the stored ``*-3.6.0.model`` files and check both still detect the same phrase.

        NOTE(review): the file names suggest these were saved by gensim 3.6.0 -- confirm.
        """
        tokens = ['trees', 'graph', 'minors']
        expected = ['trees', 'graph_minors']

        legacy_phrases = Phrases.load(datapath("phrases-3.6.0.model"))
        legacy_phraser = FrozenPhrases.load(datapath("phraser-3.6.0.model"))

        # Same expectation for the trainable model and its frozen counterpart.
        for legacy_model in (legacy_phrases, legacy_phraser):
            self.assertEqual(legacy_model[tokens], expected)
Example #7
0
 def testSaveLoadNoCommonTerms(self):
     """Backward compatibility: a Phrases pickle predating ``connector_words`` must load and be usable."""
     legacy = Phrases.load(datapath("phrases-no-common-terms.pkl"))
     self.assertEqual(legacy.connector_words, frozenset())

     # Regression guard, cf #1751: neither freezing the loaded model nor
     # applying the frozen model to a sentence may raise.
     frozen = FrozenPhrases(legacy)
     frozen[["human", "interface", "survey"]]
Example #8
0
    def partial_fit(self, X):
        """Train model over a potentially incomplete set of sentences.

        This method can be used in two ways:
            1. On an unfitted model, in which case the model is initialized and trained on `X`.
            2. On an already fitted model, in which case the `X` sentences are **added** to the vocabulary.

        In both cases `X` is counted exactly once, and the frozen phraser used by
        `transform` is rebuilt from the updated model.

        Parameters
        ----------
        X : iterable of list of str
            Sequence of sentences to be used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
            The trained model.

        """
        if self.gensim_model is None:
            # Passing `sentences=X` to the constructor already trains on X, so X must
            # not be fed to `add_vocab` again: that would count every sentence twice
            # (or count nothing the second time if X is a one-shot iterator).
            self.gensim_model = models.Phrases(
                sentences=X,
                min_count=self.min_count,
                threshold=self.threshold,
                max_vocab_size=self.max_vocab_size,
                delimiter=self.delimiter,
                progress_per=self.progress_per,
                scoring=self.scoring,
                connector_words=self.connector_words,
            )
        else:
            # Already fitted: extend the existing vocabulary with the new sentences.
            self.gensim_model.add_vocab(X)

        # Re-freeze so the phraser reflects the vocabulary that was just updated.
        self.phraser = FrozenPhrases(self.gensim_model)
        return self
Example #9
0
    def transform(self, docs):
        """Transform the input documents into phrase tokens.

        Words in the sentence will be joined by `self.delimiter`.

        Parameters
        ----------
        docs : {sequence of list of str, list of str}
            Documents to be transformed. A single document (a flat list of str) is
            also accepted and is treated as a one-document batch. Must support
            `len()` and indexing.

        Returns
        -------
        list of list of str
            Phrase representation for each of the input documents; an empty list
            when `docs` is empty.

        Raises
        ------
        NotFittedError
            If the model has not been fitted yet.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        if self.phraser is None:
            # Build the frozen phraser lazily if it has not been created yet.
            self.phraser = FrozenPhrases(self.gensim_model)

        # Nothing to transform. Checked with len() rather than truthiness so that
        # array-like inputs are not truth-tested, and so that the `docs[0]` probe
        # below cannot raise IndexError on an empty batch.
        if len(docs) == 0:
            return []

        # A flat list of tokens is a single document: wrap it into a batch of one.
        if isinstance(docs[0], str):
            docs = [docs]

        return [self.phraser[doc] for doc in docs]
Example #10
0
    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Any previously fitted model is replaced, and the frozen phraser is rebuilt
        from the new one.

        Parameters
        ----------
        X : iterable of list of str
            Sequence of sentences to be used for training the model.
        y : object, optional
            Not used by this method.

        Returns
        -------
        :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
            The trained model.

        """
        # Forward this transformer's hyper-parameters to the underlying Phrases model.
        phrases_params = {
            'min_count': self.min_count,
            'threshold': self.threshold,
            'max_vocab_size': self.max_vocab_size,
            'delimiter': self.delimiter,
            'progress_per': self.progress_per,
            'scoring': self.scoring,
            'connector_words': self.connector_words,
        }
        self.gensim_model = models.Phrases(sentences=X, **phrases_params)
        self.phraser = FrozenPhrases(self.gensim_model)
        return self
Example #11
0
 def testSaveLoadNoCommonTerms(self):
     """Backward compatibility: a FrozenPhrases pickle predating ``connector_words`` must still load."""
     legacy_path = datapath("phraser-no-common-terms.pkl")
     legacy = FrozenPhrases.load(legacy_path)
     # The loaded model is expected to expose an empty set of connector words.
     self.assertEqual(legacy.connector_words, frozenset())
Example #12
0
 def testSaveLoadNoScoring(self):
     """Load a FrozenPhrases pickle that was saved without a scoring parameter.

     Guards backward compatibility with old versions of FrozenPhrases.
     """
     legacy_path = datapath("phraser-no-scoring.pkl")
     legacy = FrozenPhrases.load(legacy_path)
     # Scoring is not exercised here; only check that it is the expected scorer.
     self.assertEqual(legacy.scoring, original_scorer)
Example #13
0
 def test_save_load_string_scoring(self):
     """Load a FrozenPhrases pickle whose scoring parameter was stored as a string.

     Guards backward compatibility with the previous version of FrozenPhrases.
     """
     legacy_path = datapath("phraser-scoring-str.pkl")
     legacy = FrozenPhrases.load(legacy_path)
     # Scoring is not exercised here; only check that it is the expected scorer.
     self.assertEqual(legacy.scoring, original_scorer)