Esempio n. 1
0
    def testScoringDefault(self):
        """Check default (Mikolov word2vec paper) scores when connector words are used.

        The expected values are recomputed from the trained model's own
        vocabulary counts, then compared (rounded to 3 decimals) with the
        scores reported by ``find_phrases``.
        """
        model = Phrases(
            self.sentences,
            min_count=1,
            threshold=1,
            connector_words=self.connector_words,
        )
        tokens = ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']
        found = model.find_phrases([tokens])
        observed = {round(value, 3) for value in found.values()}

        # Raw counts, converted to float so the divisions below are true divisions.
        floor = float(model.min_count)
        vocab_size = float(len(model.vocab))
        n_graph = float(model.vocab["graph"])
        n_data = float(model.vocab["data"])
        n_data_and_graph = float(model.vocab["data_and_graph"])
        n_human = float(model.vocab["human"])
        n_interface = float(model.vocab["interface"])
        n_human_interface = float(model.vocab["human_interface"])

        # (bigram_count - min_count) / count_a / count_b * len_vocab
        score_data_and_graph = (n_data_and_graph - floor) / n_data / n_graph * vocab_size
        score_human_interface = (n_human_interface - floor) / n_human / n_interface * vocab_size

        expected = {
            round(score_data_and_graph, 3),
            round(score_human_interface, 3),
        }
        assert observed == expected
Esempio n. 2
0
    def testMultipleBigramsSingleEntry(self):
        """Verify that one input sentence can yield several bigrams."""
        model = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
        sentence = ['graph', 'minors', 'survey', 'human', 'interface']

        found = model.find_phrases([sentence])
        detected = set(found.keys())

        expected = {'graph minors', 'human interface'}
        assert detected == expected
Esempio n. 3
0
    def testExportPhrases(self):
        """Verify the bigrams found when the training sentences are fed back in."""
        model = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')

        found = model.find_phrases(self.sentences)
        detected = set(found.keys())

        expected = {'response time', 'graph minors', 'human interface'}
        assert detected == expected
Esempio n. 4
0
    def testExportPhrases(self):
        """Verify the phrases found in the training sentences with connector words."""
        model = Phrases(
            self.sentences,
            min_count=1,
            threshold=1,
            connector_words=self.connector_words,
            delimiter=' ',
        )

        found = model.find_phrases(self.sentences)
        detected = set(found.keys())

        expected = {
            'human interface',
            'graph of trees',
            'data and graph',
            'lack of interest',
        }
        assert detected == expected
Esempio n. 5
0
    def testCustomScorer(self):
        """Verify that a user-supplied scoring callable is honoured."""
        model = Phrases(
            self.sentences,
            min_count=1,
            threshold=.001,
            scoring=dumb_scorer,
        )
        sentence = ['graph', 'minors', 'survey', 'human', 'interface', 'system']
        scores = list(model.find_phrases([sentence]).values())

        # Every reported score must be exactly 1.
        for value in scores:
            assert value == 1
        # Expected phrases: 'graph minors', 'survey human' and 'interface system'.
        assert len(scores) == 3
Esempio n. 6
0
    def testScoringNpmi(self):
        """Verify scores produced with ``scoring='npmi'``."""
        model = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi')
        sentence = ['graph', 'minors', 'survey', 'human', 'interface']

        found = model.find_phrases([sentence])
        observed = {round(value, 3) for value in found.values()}

        expected = {
            0.882,  # graph minors
            0.714,  # human interface
        }
        assert observed == expected
Esempio n. 7
0
    def testScoringDefault(self):
        """Verify scores from the default scorer (Mikolov word2vec paper)."""
        model = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
        sentence = ['graph', 'minors', 'survey', 'human', 'interface']

        found = model.find_phrases([sentence])
        observed = {round(value, 3) for value in found.values()}

        expected = {
            5.167,  # graph minors
            3.444,  # human interface
        }
        assert observed == expected
Esempio n. 8
0
    def testCustomScorer(self):
        """Test using a custom scoring function together with connector words.

        Trains ``Phrases`` with ``dumb_scorer`` as the scoring callable and
        checks both the number of phrases found in a sample sentence and the
        score attached to each of them.
        """
        bigram = Phrases(
            self.sentences,
            min_count=1,
            threshold=.001,
            scoring=dumb_scorer,
            connector_words=self.connector_words,
        )
        test_sentences = [[
            'data', 'and', 'graph', 'survey', 'for', 'human', 'interface'
        ]]
        seen_scores = list(bigram.find_phrases(test_sentences).values())

        # Truthiness alone would accept any non-zero score; the intent is that
        # every score is exactly 1, so compare against 1 explicitly.
        assert all(score == 1 for score in seen_scores)  # all scores 1
        assert len(seen_scores) == 2  # 'data and graph' 'survey for human'
Esempio n. 9
0
    def testMultipleBigramsSingleEntry(self):
        """Verify that one sentence yields several phrases with connector words."""
        model = Phrases(
            self.sentences,
            min_count=1,
            threshold=1,
            connector_words=self.connector_words,
            delimiter=' ',
        )
        sentence = ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']

        found = model.find_phrases([sentence])
        detected = set(found.keys())

        expected = {'data and graph', 'human interface'}
        assert detected == expected
Esempio n. 10
0
    def testScoringNpmi(self):
        """Verify ``scoring='npmi'`` scores when connector words are configured."""
        model = Phrases(
            self.sentences,
            min_count=1,
            threshold=.5,
            scoring='npmi',
            connector_words=self.connector_words,
        )
        sentence = ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']

        found = model.find_phrases([sentence])
        observed = {round(value, 3) for value in found.values()}

        expected = {
            0.74,  # data and graph
            0.894,  # human interface
        }
        assert observed == expected