def testScoringDefault(self):
    """Check the default scorer (from the Mikolov word2vec paper) with connector words.

    The expected scores are recomputed from the trained model's own vocabulary
    counts as ``(pair_count - min_count) / count_a / count_b * vocab_size``
    and compared after rounding to three decimals.
    """
    model = Phrases(
        self.sentences,
        min_count=1,
        threshold=1,
        connector_words=self.connector_words,
    )
    corpus = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
    found = model.find_phrases(corpus)
    actual = {round(value, 3) for value in found.values()}

    # Raw counts taken from the model, converted to float for the division below.
    floor = float(model.min_count)
    vocab_size = float(len(model.vocab))
    n_graph = float(model.vocab["graph"])
    n_data = float(model.vocab["data"])
    n_data_and_graph = float(model.vocab["data_and_graph"])
    n_human = float(model.vocab["human"])
    n_interface = float(model.vocab["interface"])
    n_human_interface = float(model.vocab["human_interface"])

    # score for data and graph
    data_and_graph_score = round(
        (n_data_and_graph - floor) / n_data / n_graph * vocab_size, 3)
    # score for human interface
    human_interface_score = round(
        (n_human_interface - floor) / n_human / n_interface * vocab_size, 3)

    assert actual == {data_and_graph_score, human_interface_score}
def testMultipleBigramsSingleEntry(self):
    """A single input sentence can yield more than one bigram."""
    model = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
    sentence = ['graph', 'minors', 'survey', 'human', 'interface']
    found = model.find_phrases([sentence])
    detected = set(found.keys())
    assert detected == {'graph minors', 'human interface'}
def testExportPhrases(self):
    """Bigrams found in the training sentences match the expected set."""
    expected = {'response time', 'graph minors', 'human interface'}
    model = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
    found = model.find_phrases(self.sentences)
    detected = set(found.keys())
    assert detected == expected
def testExportPhrases(self):
    """Phrases found in the training sentences match the expected set when connector words are used."""
    expected = {
        'human interface',
        'graph of trees',
        'data and graph',
        'lack of interest',
    }
    model = Phrases(
        self.sentences,
        min_count=1,
        threshold=1,
        connector_words=self.connector_words,
        delimiter=' ',
    )
    found = model.find_phrases(self.sentences)
    detected = set(found.keys())
    assert detected == expected
def testCustomScorer(self):
    """A user-supplied scoring callable is used to score candidate phrases."""
    model = Phrases(
        self.sentences,
        min_count=1,
        threshold=0.001,
        scoring=dumb_scorer,
    )
    corpus = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    scores = list(model.find_phrases(corpus).values())

    assert all(value == 1 for value in scores)
    # 'graph minors' and 'survey human' and 'interface system'
    assert len(scores) == 3
def testScoringNpmi(self):
    """Scores produced with ``scoring='npmi'`` match the expected rounded values."""
    model = Phrases(self.sentences, min_count=1, threshold=0.5, scoring='npmi')
    corpus = [['graph', 'minors', 'survey', 'human', 'interface']]
    found = model.find_phrases(corpus)
    actual = {round(value, 3) for value in found.values()}

    expected = {
        0.882,  # score for graph minors
        0.714,  # score for human interface
    }
    assert actual == expected
def testScoringDefault(self):
    """Scores from the default scorer (Mikolov word2vec paper) match the expected rounded values."""
    model = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
    corpus = [['graph', 'minors', 'survey', 'human', 'interface']]
    found = model.find_phrases(corpus)
    actual = {round(value, 3) for value in found.values()}

    expected = {
        5.167,  # score for graph minors
        3.444,  # score for human interface
    }
    assert actual == expected
def testCustomScorer(self):
    """Test using a custom scoring function together with connector words.

    Every phrase found must carry a score of exactly 1, and exactly two
    phrases are expected for the test sentence.
    """
    bigram = Phrases(
        self.sentences,
        min_count=1,
        threshold=.001,
        scoring=dumb_scorer,
        connector_words=self.connector_words,
    )
    test_sentences = [[
        'data', 'and', 'graph', 'survey', 'for', 'human', 'interface'
    ]]
    seen_scores = list(bigram.find_phrases(test_sentences).values())

    # Compare against 1 explicitly: a bare ``all(seen_scores)`` would accept
    # any non-zero score and so would not verify the scorer's output.
    assert all(score == 1 for score in seen_scores)  # all scores 1
    assert len(seen_scores) == 2  # 'data and graph' 'survey for human'
def testMultipleBigramsSingleEntry(self):
    """A single input sentence can yield more than one phrase when connector words are used."""
    model = Phrases(
        self.sentences,
        min_count=1,
        threshold=1,
        connector_words=self.connector_words,
        delimiter=' ',
    )
    sentence = ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']
    found = model.find_phrases([sentence])
    detected = set(found.keys())
    assert detected == {'data and graph', 'human interface'}
def testScoringNpmi(self):
    """Scores produced with ``scoring='npmi'`` and connector words match the expected rounded values."""
    model = Phrases(
        self.sentences,
        min_count=1,
        threshold=0.5,
        scoring='npmi',
        connector_words=self.connector_words,
    )
    corpus = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
    found = model.find_phrases(corpus)
    actual = {round(value, 3) for value in found.values()}

    expected = {
        0.74,  # score for data and graph
        0.894,  # score for human interface
    }
    assert actual == expected