Ejemplo n.º 1
0
 def test_frequency_cutoff(self):
     '''A frequency cut-off of 30 is expected to leave two keywords.'''
     result = extract_keywords(
         self.analysis,
         self.reference,
         1000,
         10000,
         freq_cut_off=30,
     )
     self.assertEqual(len(result), 2)
Ejemplo n.º 2
0
 def test_limit_rows(self):
     '''With limit_rows=2 the result is expected to contain two rows.'''
     limited = extract_keywords(
         self.analysis,
         self.reference,
         1000,
         10000,
         limit_rows=2,
     )
     self.assertEqual(len(limited), 2)
Ejemplo n.º 3
0
 def test_p_value(self):
     '''An explicit p_value of 0.0001 is expected to yield four keywords.'''
     significant = extract_keywords(
         self.analysis,
         self.reference,
         1000,
         10000,
         p_value=0.0001,
     )
     # The fixture only contains 4 keywords at this threshold.
     self.assertEqual(len(significant), 4)
Ejemplo n.º 4
0
 def test_wordlist_merging(self):
     '''Check the merged analysis/reference table built with default options.'''
     merged = extract_keywords(self.analysis, self.reference, 1000, 10000)

     # No p_value is passed; the default is 0.0001.
     self.assertEqual(len(merged), 4)

     types = merged.Type.tolist()
     self.assertIn('five', types)
     self.assertNotIn('SIX', types)

     self.assertEqual(merged.Count_analysis[0], 540)
     # 'five' does not occur in the reference wordlist, so its count is 0.
     self.assertEqual(merged.Count_ref[3], 0)
Ejemplo n.º 5
0
 def test_round_values(self):
     '''With round_values=False the LL score must not have two decimals.'''
     unrounded = extract_keywords(
         self.analysis,
         self.reference,
         1000,
         10000,
         round_values=False,
     )
     # The LL of the first row is expected to be about 1195.463979.
     ll_score = unrounded.loc[0, 'LL']
     fractional_digits = str(ll_score).split('.')[1]
     self.assertNotEqual(len(fractional_digits), 2)
Ejemplo n.º 6
0
    def test_exclude_underused(self):
        '''
        With exclude_underused=False the three rows below, all flagged '-'
        in the Use column, are expected in the result.

        Expected output (the table is wrapped into two parts):

          Type  Count_analysis  Total_analysis  Count_ref  Total_ref
        0    one             540            1000        540        100
        2   four             431            1000        431        100
        1  three              29            1000         29        100

           Expected_count_analysis  Expected_count_ref       LL Use           p
        0                   981.82               98.18  1195.46   -  p < 0.0001
        2                   783.64               78.36   954.16   -  p < 0.0001
        1                    52.73                5.27    64.20   -  p < 0.0001
        '''
        with_underused = extract_keywords(
            self.analysis,
            self.reference,
            1000,
            100,
            p_value=0.05,
            exclude_underused=False,
        )
        self.assertEqual(len(with_underused), 3)
Ejemplo n.º 7
0
def build_keyword_list(
    cluster_length,
    subset_analysis,
    subcorpora_analysis,
    subset_reference,
    subcorpora_reference,
    p_value,
    limit_rows=3000,
):
    '''
    Build the wordlists for both corpora and extract keywords from them.

    This helper exists so that keyword results can be cached. The keywords
    are returned as records because dataframes cannot be cached.

    Returns a tuple of three items: the keyword records, the total reported
    by the analysis wordlist and the total reported by the reference
    wordlist.
    '''

    def load(subset, subcorpora):
        # Build one wordlist and hand back its total together with the
        # wordlist itself.
        index_name = construct_index_name(subset, cluster_length)
        builder = Cheshire3WordList()
        builder.build_wordlist(index_name, subcorpora)
        total = builder.total
        return total, builder.wordlist

    # The analysis side is loaded first, then the reference side.
    total_analysis, analysis_words = load(subset_analysis,
                                          subcorpora_analysis)
    total_reference, reference_words = load(subset_reference,
                                            subcorpora_reference)

    # The totals handed to extract_keywords are the summed Count columns,
    # not the `total` attributes collected above; those are only returned.
    keywords = extract_keywords(analysis_words,
                                reference_words,
                                analysis_words.Count.sum(),
                                reference_words.Count.sum(),
                                limit_rows=limit_rows,
                                p_value=p_value)

    records = keywords.to_records()
    return records, total_analysis, total_reference