def test_wordFrequency(self):
    """wordFrequency counts tokens case-sensitively.

    The mixed-case sample splits into 'The'/'the' and 'Quick'/'quick';
    case-folding the input first merges each pair into a single count.
    """
    sample = 'The the The the The quick Quick quick Quick quick'
    freq_mixed = linguistic_tools.wordFrequency(sample)
    freq_lower = linguistic_tools.wordFrequency(sample.lower())
    freq_upper = linguistic_tools.wordFrequency(sample.upper())
    self.assertEqual(3, freq_mixed.get('The'))
    self.assertEqual(5, freq_lower.get('the'))
    self.assertEqual(5, freq_upper.get('THE'))
def test_tfIDF(self):
    """documentFrequency aggregates case-sensitive counts over documents.

    Builds three frequency dicts (mixed, lower, upper case) from the same
    sentence and checks the aggregated counts for each case variant.
    """
    sample = 'The the The the The quick Quick quick Quick quick'
    freq_dicts = [
        linguistic_tools.wordFrequency(sample),
        linguistic_tools.wordFrequency(sample.lower()),
        linguistic_tools.wordFrequency(sample.upper()),
    ]
    doc_freq = linguistic_tools.documentFrequency(freq_dicts)
    self.assertEqual(7, doc_freq['the'])
    self.assertEqual(5, doc_freq['QUICK'])
    self.assertEqual(2, doc_freq['Quick'])
def test_countWordsinWordlist(self):
    """countAllWordsInWordList totals case-sensitive hits for listed words."""
    freq = linguistic_tools.wordFrequency(
        'The the The the The quick Quick quick Quick quick')
    total = linguistic_tools.countAllWordsInWordList(['the'], freq)
    # Only the two lower-case 'the' tokens match.
    self.assertEqual(2, total)
def test_countConstructedWordlist(self):
    """constructWordListFrequency keeps only exact-case wordlist entries."""
    freq = linguistic_tools.wordFrequency(
        'The the The the The quick Quick quick Quick quick')
    restricted = linguistic_tools.constructWordListFrequency(['the'], freq)
    # 'the' (lower case) is retained; 'The' is excluded from the result.
    self.assertEqual(2, restricted.get('the', 0))
    self.assertEqual(0, restricted.get('The', 0))
def test_cumWordCount(self):
    """Values from cumulativePercentWords should sum to 1.

    Uses assertAlmostEqual rather than assertEqual: the values are
    floating-point fractions, so an exact equality check against 1 is
    fragile against rounding error in the summation.
    """
    word_frequency = linguistic_tools.wordFrequency(
        'The the The the The quick Quick quick Quick quick')
    d_word_freq = linguistic_tools.cumulativePercentWords(word_frequency)
    # sum() replaces the manual accumulation loop.
    total_percent = sum(d_word_freq.values())
    self.assertAlmostEqual(1, total_percent)
def mainthread(file):
    """Scan one EDGAR filing; return (word_frequencies, header_dict) or False.

    Reads only the start of *file* to locate the EDGAR header, keeps the
    first CIK listed, and — if that CIK appears in the module-level
    ``int_list`` — re-reads and parses the whole document, returning its
    word-frequency dict together with the header. Returns False when the
    CIK is not in ``int_list`` or cannot be parsed as an integer.

    Fixes vs. original: the bare ``int(...)`` conversion could raise
    ValueError/TypeError on a malformed header; that is now handled by
    returning False. Commented-out dead code removed.
    """
    with open(file, 'r', errors='ignore') as f:
        # Only the first ~10 KB is needed to find the header block.
        text = f.readlines(10000)
        header_dict = headerSearch.searchEdgarHeader(textSnippet=text)
        # A filing may list several CIKs; keep the first one in the file.
        cik_header = cleaning_tools.splitEDGARHeader(header_dict,
                                                     'CENTRAL INDEX KEY')
        header_dict['CENTRAL INDEX KEY'] = cik_header[0]
        try:
            cik = int(header_dict.get('CENTRAL INDEX KEY', -99))
        except (TypeError, ValueError):
            # Malformed / non-numeric CIK: treat as "not in our sample"
            # instead of crashing the worker.
            return False
        if cik not in int_list:
            return False
        # Matched filing: rewind and process the whole document.
        f.seek(0)
        full_text = f.read()
        cleaned_text = linguistic_tools.parse_document(full_text,
                                                       purge_tables=True)
        cleaned_text = cleaned_text.upper()
        f_words = linguistic_tools.wordFrequency(cleaned_text)
        return (f_words, header_dict)