def testEmptyPhrasifiedSentencesIterator(self):
    """Check that a stacked bigram/trigram phrased corpus can be iterated more than once."""
    bigram_model = FrozenPhrases(Phrases(self.sentences))
    trigram_model = FrozenPhrases(Phrases(bigram_model[self.sentences]))

    trigrams = trigram_model[bigram_model[self.sentences]]

    # Consume the same object twice: both passes must agree and be non-empty.
    first_pass = list(trigrams)
    second_pass = list(trigrams)
    self.assertEqual(first_pass, second_pass)
    self.assertNotEqual(second_pass, [])
def setUp(self):
    """Build the FrozenPhrases fixtures shared by the tests.

    ``self.bigram`` wraps a Phrases model created with ``min_count=1`` and
    ``threshold=1``; ``self.bigram_default`` wraps one created with the
    default settings. Both use ``self.connector_words``.
    """
    connectors = self.connector_words

    self.bigram = FrozenPhrases(
        Phrases(self.sentences, min_count=1, threshold=1, connector_words=connectors)
    )
    self.bigram_default = FrozenPhrases(
        Phrases(self.sentences, connector_words=connectors)
    )
def testSaveLoad(self):
    """Round-trip a FrozenPhrases object through save/load and check it still phrases text."""
    tokens = ['graph', 'minors', 'survey', 'human', 'interface', 'system']
    expected = ['graph_minors', 'survey', 'human_interface', 'system']

    with temporary_file("test.pkl") as fpath:
        trained = Phrases(self.sentences, min_count=1, threshold=1)
        FrozenPhrases(trained).save(fpath)

        restored = FrozenPhrases.load(fpath)
        self.assertEqual(restored[tokens], expected)
def testSaveLoadCustomScorer(self):
    """Round-trip a FrozenPhrases object built with a custom scorer and check the scorer survives."""
    with temporary_file("test.pkl") as fpath:
        trained = Phrases(
            self.sentences,
            min_count=1,
            threshold=0.001,
            scoring=dumb_scorer,
        )
        FrozenPhrases(trained).save(fpath)

        restored = FrozenPhrases.load(fpath)
        self.assertEqual(restored.scoring, dumb_scorer)
def test_save_load_with_connector_words(self):
    """Round-trip a FrozenPhrases object and check its connector words are preserved."""
    connector_words = frozenset({'of'})

    with temporary_file("test.pkl") as fpath:
        trained = Phrases(
            self.sentences,
            min_count=1,
            threshold=1,
            connector_words=connector_words,
        )
        FrozenPhrases(trained).save(fpath)

        restored = FrozenPhrases.load(fpath)
        self.assertEqual(restored.connector_words, connector_words)
def testCompatibilty(self):
    """Load the stored 3.6.0 Phrases and FrozenPhrases models and check they still detect phrases."""
    tokens = ['trees', 'graph', 'minors']
    expected = ['trees', 'graph_minors']

    phrases = Phrases.load(datapath("phrases-3.6.0.model"))
    phraser = FrozenPhrases.load(datapath("phraser-3.6.0.model"))

    self.assertEqual(phrases[tokens], expected)
    self.assertEqual(phraser[tokens], expected)
def testSaveLoadNoCommonTerms(self):
    """Ensure backwards compatibility with old versions of Phrases, before connector_words."""
    legacy_model = Phrases.load(datapath("phrases-no-common-terms.pkl"))
    self.assertEqual(legacy_model.connector_words, frozenset())

    # A phraser can still be built from the legacy model, cf #1751.
    # Neither the construction nor the lookup below should raise.
    frozen = FrozenPhrases(legacy_model)
    frozen[["human", "interface", "survey"]]
def partial_fit(self, X):
    """Train model over a potentially incomplete set of sentences.

    This method can be used in two ways:

    1. On an unfitted model, in which case the model is initialized and trained on `X`.
    2. On an already fitted model, in which case the `X` sentences are **added** to the vocabulary.

    Parameters
    ----------
    X : iterable of list of str
        Sequence of sentences to be used for training the model.

    Returns
    -------
    :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
        The trained model.

    """
    if self.gensim_model is None:
        # Passing `sentences=X` already trains the new model on `X`, so `X` must not
        # be fed to `add_vocab` again: that would count the first batch twice (and
        # would see nothing at all if `X` is a one-shot iterator).
        self.gensim_model = models.Phrases(
            sentences=X,
            min_count=self.min_count,
            threshold=self.threshold,
            max_vocab_size=self.max_vocab_size,
            delimiter=self.delimiter,
            progress_per=self.progress_per,
            scoring=self.scoring,
            connector_words=self.connector_words,
        )
    else:
        self.gensim_model.add_vocab(X)

    # Re-freeze after every update so `transform` sees the new counts.
    self.phraser = FrozenPhrases(self.gensim_model)
    return self
def transform(self, docs):
    """Transform the input documents into phrase tokens.

    Words in the sentence will be joined by `self.delimiter`.

    Parameters
    ----------
    docs : {sequence of list of str, list of str}
        Documents to be transformed. A single document (a flat list of str)
        is also accepted and treated as a one-document batch. The input must
        support `len()` and indexing.

    Returns
    -------
    list of list of str
        Phrase representation for each of the input documents; an empty list
        when `docs` is empty.

    Raises
    ------
    NotFittedError
        If the model has not been fitted yet.

    """
    if self.gensim_model is None:
        raise NotFittedError(
            "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
        )

    if self.phraser is None:
        self.phraser = FrozenPhrases(self.gensim_model)

    # Nothing to transform; also avoids an IndexError from `docs[0]` below.
    if len(docs) == 0:
        return []

    # A flat list of tokens is a single document: wrap it into a batch of one.
    if isinstance(docs[0], str):
        docs = [docs]
    return [self.phraser[doc] for doc in docs]
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Any previously stored model is replaced by a new one built from `X`.

    Parameters
    ----------
    X : iterable of list of str
        Sequence of sentences to be used for training the model.
    y : object, optional
        Not used by this method.

    Returns
    -------
    :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
        The trained model.

    """
    phrases_kwargs = {
        "sentences": X,
        "min_count": self.min_count,
        "threshold": self.threshold,
        "max_vocab_size": self.max_vocab_size,
        "delimiter": self.delimiter,
        "progress_per": self.progress_per,
        "scoring": self.scoring,
        "connector_words": self.connector_words,
    }
    self.gensim_model = models.Phrases(**phrases_kwargs)
    self.phraser = FrozenPhrases(self.gensim_model)
    return self
def testSaveLoadNoCommonTerms(self):
    """Ensure backwards compatibility with old versions of FrozenPhrases, before connector_words."""
    legacy_path = datapath("phraser-no-common-terms.pkl")
    legacy_phraser = FrozenPhrases.load(legacy_path)
    self.assertEqual(legacy_phraser.connector_words, frozenset())
def testSaveLoadNoScoring(self):
    """Load a FrozenPhrases object that was saved without a scoring parameter.

    This should ensure backwards compatibility with old versions of FrozenPhrases.
    """
    legacy_path = datapath("phraser-no-scoring.pkl")
    legacy_phraser = FrozenPhrases.load(legacy_path)
    # The scorer itself is not exercised here; only check it is the expected one.
    self.assertEqual(legacy_phraser.scoring, original_scorer)
def test_save_load_string_scoring(self):
    """Load a FrozenPhrases object that was saved with a string scoring parameter.

    This should ensure backwards compatibility with the previous version of FrozenPhrases.
    """
    legacy_path = datapath("phraser-scoring-str.pkl")
    legacy_phraser = FrozenPhrases.load(legacy_path)
    # The scorer itself is not exercised here; only check it is the expected one.
    self.assertEqual(legacy_phraser.scoring, original_scorer)