Example 1
 def test_spacy_word_count_en(self):
     """Per-text word-count distribution using the spaCy tokenizer (English)."""
     analyzer = WordCountAnalyzer(texts,
                                  strategy=TokenizerStrategy.SPACY,
                                  language='english')
     counts = analyzer.extract_dataset_metric().to_dict()['count']
     expected = {8: 2, 10: 1, 11: 1, 14: 1}
     self.assertEqual(expected, counts)
Example 2
 def test_nist_word_count(self):
     """Per-text word-count distribution using the NLTK NIST tokenizer."""
     analyzer = WordCountAnalyzer(texts, strategy=TokenizerStrategy.NLTK_NIST)
     counts = analyzer.extract_dataset_metric().to_dict()['count']
     expected = {8: 1, 9: 1, 10: 1, 12: 1, 16: 1}
     self.assertEqual(expected, counts)
Example 3
 def test_nltk_regex_word_count(self):
     """Per-text word-count distribution using the NLTK WordPunct tokenizer."""
     analyzer = WordCountAnalyzer(texts,
                                  strategy=TokenizerStrategy.WORD_PUNKT)
     counts = analyzer.extract_dataset_metric().to_dict()['count']
     expected = {8: 1, 10: 1, 11: 1, 13: 1, 16: 1}
     self.assertEqual(expected, counts)
Example 4
 def __init__(self,
              word_count_analyzer=None,
              sentence_count_analyzer=None,
              language='english'):
     """Wire up word- and sentence-count analyzers, building empty-corpus
     defaults for the given language when none are supplied.

     NOTE: falsy (not just None) analyzers are replaced by the defaults,
     matching the original ``or`` semantics.
     """
     if word_count_analyzer:
         self.wca = word_count_analyzer
     else:
         self.wca = WordCountAnalyzer([], language=language)
     if sentence_count_analyzer:
         self.sca = sentence_count_analyzer
     else:
         self.sca = SentenceCountAnalyzer([], language=language)
Example 5
 def test_nltk_base_word_count_batch(self):
     """Batch (per-text) word counts with the base NLTK tokenizer, English."""
     analyzer = WordCountAnalyzer(texts,
                                  strategy=TokenizerStrategy.NLTK_BASE,
                                  language='english')
     self.assertEqual([8, 9, 10, 16, 10], analyzer.extract_batch_metrics())
Example 6
 def test_gensim_word_count(self):
     """Per-text word-count distribution using the gensim tokenizer."""
     analyzer = WordCountAnalyzer(texts, strategy=TokenizerStrategy.GENSIM)
     counts = analyzer.extract_dataset_metric().to_dict()['count']
     self.assertEqual({7: 2, 9: 2, 10: 1}, counts)
Example 7
 def test_nltk_base_word_count_de(self):
     """Base NLTK tokenizer with German language model.

     'U.S.?' is treated differently with german nltk tokenize.
     """
     analyzer = WordCountAnalyzer(texts,
                                  strategy=TokenizerStrategy.NLTK_BASE,
                                  language='german')
     counts = analyzer.extract_dataset_metric().to_dict()['count']
     self.assertEqual({8: 1, 10: 3, 16: 1}, counts)
Example 8
 def test_nltk_base_word_count_en(self):
     """Base NLTK tokenizer with English language model."""
     analyzer = WordCountAnalyzer(texts,
                                  strategy=TokenizerStrategy.NLTK_BASE,
                                  language='english')
     counts = analyzer.extract_dataset_metric().to_dict()['count']
     self.assertEqual({8: 1, 9: 1, 10: 2, 16: 1}, counts)
Example 9
 def test_split_word_count(self):
     """Per-text word-count distribution using plain Python str.split."""
     analyzer = WordCountAnalyzer(texts, strategy=TokenizerStrategy.PYTHON)
     counts = analyzer.extract_dataset_metric().to_dict()['count']
     self.assertEqual({5: 1, 7: 1, 8: 2, 9: 1}, counts)
Example 10
 def test_regex_word_count(self):
     """Per-text word-count distribution using the regex tokenizer."""
     analyzer = WordCountAnalyzer(texts, strategy=TokenizerStrategy.REGEX)
     counts = analyzer.extract_dataset_metric().to_dict()['count']
     self.assertEqual({7: 2, 9: 1, 10: 2}, counts)