def get_tfidf_terms_text(text, index_utils, n, total_N=595031):
    """Extract the top-n tf-idf terms from a text, with WaPo as background corpus.

    Args:
        text: Raw text to analyze.
        index_utils: Index reader utility exposing
            ``get_term_counts(term, analyzer=None) -> (df, cf)``.
        n: Maximum number of top-scoring terms to return.
        total_N: Document count of the background corpus
            (default: WaPo collection size).

    Returns:
        Dict mapping term -> term frequency in ``text``, ordered by
        descending tf-idf score, restricted to the ``n`` best terms whose
        tf-idf score is at least 3.5.
    """
    # Tokenize/stem the text with the default Lucene analyzer and count
    # term frequencies.
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer())
    analyzed_text = analyzer.analyze(text)
    unique, counts = np.unique(analyzed_text, return_counts=True)
    tf = dict(zip(unique, counts))

    # Keep only purely alphabetic terms of length >= 2 (dots are ignored
    # when measuring length, so abbreviations like "u.s" survive).
    w_pattern = re.compile("[a-z]+")
    filtered_terms = [
        term for term in tf
        if len(w_pattern.findall(term)) == 1
        and len(term.replace('.', '')) >= 2
        and w_pattern.search(term)[0] == term
    ]

    # Document frequency of each candidate term in the background index.
    df = {term: index_utils.get_term_counts(term, analyzer=None)[0]
          for term in filtered_terms}

    # Compute tf-idf exactly once per term (the original computed it twice:
    # once in the filter and once for the value); keep scores >= 3.5.
    terms_tfidf = {}
    for term in filtered_terms:
        score = tfidf(tf[term], df[term], total_N)
        if score >= 3.5:
            terms_tfidf[term] = score

    # Return the n best terms by tf-idf, mapped to their tf counts.
    # NOTE: the loop variable no longer shadows the tfidf() function.
    return {term: tf[term]
            for term, _score in sorted(terms_tfidf.items(),
                                       key=itemgetter(1), reverse=True)[:n]}
def test_analysis(self):
    """Verify tokenization under the supported stemmer/stopword configs."""
    # Each case: (analyzer kwargs, expected tokens).
    # An empty kwargs dict exercises the default, which is the Porter stemmer.
    cases = [
        ({}, ['citi', 'buse', 'run', 'time']),
        ({'stemmer': 'porter'}, ['citi', 'buse', 'run', 'time']),
        ({'stemmer': 'krovetz'}, ['city', 'bus', 'running', 'time']),
        ({'stemming': False}, ['city', 'buses', 'running', 'time']),
        ({'stemming': False, 'stopwords': False},
         ['city', 'buses', 'are', 'running', 'on', 'time']),
        ({'stemming': True, 'stopwords': False},
         ['citi', 'buse', 'ar', 'run', 'on', 'time']),
    ]
    for kwargs, expected in cases:
        analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(**kwargs))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, expected)
def test_invalid_analysis(self):
    """An unrecognized analyzer configuration must raise ValueError."""
    with self.assertRaises(ValueError):
        bad_analyzer = analysis.get_lucene_analyzer('blah')
        analysis.Analyzer(bad_analyzer)
def test_invalid_analyzer_wrapper(self):
    """Wrapping something that is not a JAnalyzer must raise TypeError."""
    not_a_janalyzer = 'str'
    with self.assertRaises(TypeError):
        analysis.Analyzer(not_a_janalyzer)