Esempio n. 1
0
 def test_wordFrequency(self):
     """Check wordFrequency counts on mixed-, lower- and upper-case input.

     The assertions expect 'The' to be counted 3 times in the mixed-case
     sentence, and the fully lower-/upper-cased sentence to yield 5
     occurrences of 'the' / 'THE' respectively.
     """
     sentence = 'The the The the The quick Quick quick Quick quick'

     mixed_counts = linguistic_tools.wordFrequency(sentence)
     lower_counts = linguistic_tools.wordFrequency(sentence.lower())
     upper_counts = linguistic_tools.wordFrequency(sentence.upper())

     self.assertEqual(3, mixed_counts.get('The'))
     self.assertEqual(5, lower_counts.get('the'))
     self.assertEqual(5, upper_counts.get('THE'))
Esempio n. 2
0
 def test_tfIDF(self):
     """Check documentFrequency over three case variants of one sentence.

     Despite the test name, the function exercised here is
     linguistic_tools.documentFrequency, fed with the wordFrequency
     results of the mixed-case, lower-cased and upper-cased sentence.
     """
     sentence = 'The the The the The quick Quick quick Quick quick'
     variants = (sentence, sentence.lower(), sentence.upper())

     # One word-frequency dict per "document", in the order above.
     dict_list = [linguistic_tools.wordFrequency(text) for text in variants]
     doc_freq = linguistic_tools.documentFrequency(dict_list)

     for expected, token in ((7, 'the'), (5, 'QUICK'), (2, 'Quick')):
         self.assertEqual(expected, doc_freq[token])
Esempio n. 3
0
 def test_countWordsinWordlist(self):
     """Check countAllWordsInWordList for the single-entry list ['the'].

     The word-frequency dict is built from a mixed-case sentence; the
     assertion expects a total of 2 for the lower-case entry 'the'.
     """
     sentence = 'The the The the The quick Quick quick Quick quick'
     frequencies = linguistic_tools.wordFrequency(sentence)

     total = linguistic_tools.countAllWordsInWordList(['the'], frequencies)

     self.assertEqual(2, total)
Esempio n. 4
0
 def test_countConstructedWordlist(self):
     """Check constructWordListFrequency for the single-entry list ['the'].

     The assertions expect the result to report 2 for 'the' and to have
     no (or a zero) entry for the capitalised 'The', which is not in the
     word list.
     """
     sentence = 'The the The the The quick Quick quick Quick quick'
     frequencies = linguistic_tools.wordFrequency(sentence)
     targets = ['the']

     listed_counts = linguistic_tools.constructWordListFrequency(
         targets, frequencies)

     # A default of 0 lets a missing key and an explicit zero both pass.
     self.assertEqual(2, listed_counts.get('the', 0))
     self.assertEqual(0, listed_counts.get('The', 0))
Esempio n. 5
0
 def test_cumWordCount(self):
     """Check that the values from cumulativePercentWords sum to 1.

     The word-frequency dict of a mixed-case sentence is passed to
     linguistic_tools.cumulativePercentWords and all returned values are
     added up.
     """
     word_frequency = linguistic_tools.wordFrequency(
         'The the The the The quick Quick quick Quick quick')
     d_word_freq = linguistic_tools.cumulativePercentWords(word_frequency)
     total_percent = sum(d_word_freq.values())
     # Presumably the values are float shares of the total word count
     # (verify against cumulativePercentWords). Sums of such fractions are
     # not guaranteed to equal 1 exactly in binary floating point, so
     # compare with a tolerance instead of strict equality.
     self.assertAlmostEqual(1, total_percent)
Esempio n. 6
0
def mainthread(file):
    """Build the word-frequency table for one EDGAR filing, if it is wanted.

    The leading lines of ``file`` (``readlines`` with a size hint of 10000)
    are handed to ``headerSearch.searchEdgarHeader``. The 'CENTRAL INDEX KEY'
    entry of the resulting header dict is replaced by the first element
    returned from ``cleaning_tools.splitEDGARHeader``. Only when that key,
    converted to ``int``, is contained in the module-level ``int_list`` is
    the whole file read, parsed with ``purge_tables=True``, upper-cased and
    counted.

    Args:
        file: Path of the filing; opened in text mode with decoding errors
            ignored.

    Returns:
        A ``(word_frequencies, header_dict)`` tuple for a filing whose key is
        in ``int_list``, otherwise ``False``.
    """
    with open(file, 'r', errors='ignore') as handle:
        head_lines = handle.readlines(10000)
        header_dict = headerSearch.searchEdgarHeader(textSnippet=head_lines)

        # Use the first CIK listed in the EDGAR file.
        header_dict['CENTRAL INDEX KEY'] = cleaning_tools.splitEDGARHeader(
            header_dict, 'CENTRAL INDEX KEY')[0]

        cik = int(header_dict.get('CENTRAL INDEX KEY', -99))
        if cik not in int_list:
            return False

        # The header scan consumed part of the stream; rewind for a full read.
        handle.seek(0)
        document = linguistic_tools.parse_document(
            handle.read(), purge_tables=True)
        word_counts = linguistic_tools.wordFrequency(document.upper())
        return (word_counts, header_dict)