Example 1
import re
from operator import itemgetter

import numpy as np
from pyserini import analysis

# `tfidf` is a project-local scoring helper (a sketch follows this example).


def get_tfidf_terms_text(text, index_utils, n, total_N=595031):
    """Extract tf-idf terms from a text, with WAPO as the background corpus."""
    # Analyze the text and collect per-term frequencies (tf) in a dict.
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer())
    analyzed_text = analyzer.analyze(text)
    unique, counts = np.unique(analyzed_text, return_counts=True)
    tf = dict(zip(unique, counts))

    # Keep only purely alphabetic terms of length >= 2 (drops numbers and punctuation).
    w_pattern = re.compile("[a-z]+")
    filtered_tf = {term: count for term, count in tf.items()
                   if len(w_pattern.findall(term)) == 1 and
                   len(term.replace('.', '')) >= 2 and
                   w_pattern.search(term)[0] == term}

    # Look up the document frequency (df) of each term in the index.
    df = {term: index_utils.get_term_counts(term, analyzer=None)[0]
          for term in filtered_tf}

    # Calculate tf-idf for each term; keep those scoring at least 3.5.
    scores = {term: tfidf(count, df[term], total_N)
              for term, count in filtered_tf.items()}
    terms_tfidf = {term: score for term, score in scores.items() if score >= 3.5}

    # Sort terms by tf-idf score and keep the top n, mapped to their raw tf counts.
    tfidf_terms_sorted = {term: tf[term] for term, _ in sorted(
        terms_tfidf.items(), key=itemgetter(1), reverse=True)[:n]}

    return tfidf_terms_sorted
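
The `tfidf` helper and the index reader are defined outside this excerpt. Below is a minimal sketch, assuming a standard tf·idf formulation (raw term frequency times log-scaled inverse document frequency, with +1 smoothing on df) and a pyserini IndexReader built over a WAPO index; the formula, the smoothing, and the index path are illustrative assumptions, not taken from the original project.

import math

from pyserini.index import IndexReader


def tfidf(tf, df, total_N):
    # Hypothetical scorer: term frequency times log-scaled inverse document
    # frequency; +1 keeps terms absent from the background index finite.
    return tf * math.log(total_N / (df + 1))


index_utils = IndexReader('indexes/lucene-index.wapo')  # illustrative path
top_terms = get_tfidf_terms_text('City buses are running on time in Washington.',
                                 index_utils, n=10)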
Example 2
    def test_analysis(self):
        # Default is Porter stemmer
        analyzer = analysis.Analyzer(analysis.get_lucene_analyzer())
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

        # Specify Porter stemmer explicitly
        analyzer = analysis.Analyzer(
            analysis.get_lucene_analyzer(stemmer='porter'))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

        # Specify Krovetz stemmer explicitly
        analyzer = analysis.Analyzer(
            analysis.get_lucene_analyzer(stemmer='krovetz'))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['city', 'bus', 'running', 'time'])

        # No stemming
        analyzer = analysis.Analyzer(
            analysis.get_lucene_analyzer(stemming=False))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['city', 'buses', 'running', 'time'])

        # No stopword filter, no stemming
        analyzer = analysis.Analyzer(
            analysis.get_lucene_analyzer(stemming=False, stopwords=False))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens,
                         ['city', 'buses', 'are', 'running', 'on', 'time'])

        # No stopword filter, with stemming
        analyzer = analysis.Analyzer(
            analysis.get_lucene_analyzer(stemming=True, stopwords=False))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['citi', 'buse', 'ar', 'run', 'on', 'time'])
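
The same configurations can be compared outside a test harness. A minimal standalone sketch, reusing only the calls exercised in the test above:

from pyserini import analysis

sentence = 'City buses are running on time.'

# Tokenize the sentence under each analyzer configuration from the test.
for kwargs in ({},                                        # default: Porter stemmer
               {'stemmer': 'krovetz'},                    # Krovetz stemmer
               {'stemming': False},                       # no stemming
               {'stemming': False, 'stopwords': False}):  # no stemming, keep stopwords
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(**kwargs))
    print(kwargs, analyzer.analyze(sentence))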
Example 3
    def test_invalid_analysis(self):
        # Invalid configuration; make sure we get an exception.
        with self.assertRaises(ValueError):
            analysis.Analyzer(analysis.get_lucene_analyzer('blah'))
Example 4
    def test_invalid_analyzer_wrapper(self):
        # Invalid JAnalyzer; make sure we get an exception.
        with self.assertRaises(TypeError):
            analysis.Analyzer('str')
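
In application code, the two failure modes exercised by these tests can be guarded with ordinary exception handling. A minimal sketch, reusing only the calls shown above:

from pyserini import analysis

# Unknown analyzer configuration -> ValueError from get_lucene_analyzer.
try:
    analysis.Analyzer(analysis.get_lucene_analyzer('blah'))
except ValueError as err:
    print('bad analyzer configuration:', err)

# Wrapping something that is not a JAnalyzer -> TypeError from Analyzer.
try:
    analysis.Analyzer('str')
except TypeError as err:
    print('not a JAnalyzer:', err)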