import os
import unittest

# WordCount and the FAILURE message constant are assumed to come from the module under test.
from word_count import WordCount

FAILURE = "Test failed"


class WordCountTest(unittest.TestCase):
    '''Generate test input files and use them for invoking the tests.'''

    def setUp(self):
        self.wc = WordCount()
        self.word_dict_counts = {
            "this": 1, "is": 2, "a": 2, "test": 1, "file": 2, "contents": 1,
            "are": 1, "written": 1, "to": 1, "used": 1, "it": 1, "for": 2,
            "the": 1, "purpose": 1, "testing": 2}
        self.word_dict_counts_total = {
            "this": 5, "is": 10, "a": 10, "test": 5, "file": 10, "contents": 5,
            "are": 5, "written": 5, "to": 5, "used": 5, "it": 5, "for": 10,
            "the": 5, "purpose": 5, "testing": 10}
        self.file_list = []
        fp2 = open("test_path.txt", "w")
        for i in range(1, 6):
            file_name = str(i) + ".txt"
            fp = open(file_name, "w")
            fp.write("This is a test file. Contents are written to the file for testing!\n")
            fp.write("It is used for a testing purpose!\n")
            fp2.write(os.path.abspath(file_name))
            fp2.write("\n")
            fp.close()
            self.file_list.append(os.path.abspath(file_name))
        fp2.close()

    def test_count_total_words(self):
        countedWords = self.wc.process_input_file("test_path.txt")
        # print(countedWords)
        self.assertEqual(len(countedWords), len(self.word_dict_counts_total), FAILURE)

    def test_count_words_single_file_2(self):
        countedWords = self.wc.count_words("2.txt")
        self.assertEqual(len(countedWords), 15, FAILURE)

    def test_search_word_total_count(self):
        countedWords = self.wc.process_input_file("test_path.txt")
        table = self.wc.search_word("file")
        self.assertEqual(countedWords['file']['total_count'],
                         self.word_dict_counts_total["file"], FAILURE)

    def test_search_word_total_count_individual_file(self):
        countedWords = self.wc.count_words("2.txt")
        self.assertEqual(countedWords['file']['total_count'],
                         self.word_dict_counts["file"], FAILURE)

    def test_invalid_search_word(self):
        countedWords = self.wc.process_input_file("test_path.txt")
        table = self.wc.search_word("invalid_word")
        self.assertFalse("invalid_word" in countedWords)

    def test_list_words(self):
        countedWords = self.wc.process_input_file("test_path.txt")
        for k, v in self.word_dict_counts_total.items():
            self.assertEqual(countedWords[k]['total_count'], v, FAILURE)

    def test_search_words_count_across_files(self):
        countedWords = self.wc.process_input_file("test_path.txt")
        word_info = countedWords["for"]
        for i in range(len(self.file_list)):
            self.assertEqual(word_info[self.file_list[i]],
                             self.word_dict_counts["for"], FAILURE)

    def tearDown(self):
        for i in range(len(self.file_list)):
            os.remove(self.file_list[i])
        os.remove("test_path.txt")
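# A minimal sketch of the WordCount API the tests above exercise. The method
# names (count_words, process_input_file, search_word) and the returned
# structure are taken from the assertions; everything else is an assumption,
# and the real implementation may well differ.
import os
import string


class WordCount:
    def __init__(self):
        # word -> {'total_count': n, <absolute file path>: count in that file, ...}
        self.counts = {}

    def count_words(self, file_path):
        # Count the words of a single file, lower-casing tokens and stripping
        # surrounding punctuation so that "File." and "file" are the same word.
        abs_path = os.path.abspath(file_path)
        with open(file_path) as fp:
            for line in fp:
                for token in line.split():
                    word = token.strip(string.punctuation).lower()
                    if not word:
                        continue
                    info = self.counts.setdefault(word, {'total_count': 0})
                    info['total_count'] += 1
                    info[abs_path] = info.get(abs_path, 0) + 1
        return self.counts

    def process_input_file(self, path_file):
        # path_file is expected to list one input file path per line.
        with open(path_file) as fp:
            for line in fp:
                file_path = line.strip()
                if file_path:
                    self.count_words(file_path)
        return self.counts

    def search_word(self, word):
        # Per-file breakdown for a word, or an empty dict if it never occurred.
        return self.counts.get(word, {})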
def test_word_occurrence8(self):
    self.assertDictEqual({'hello': 1, 'world': 1},
                         WordCount.words('hello\nworld'),
                         msg='should not count multilines')
def test_word_occurrence9(self):
    self.assertDictEqual({'hello': 1, 'world': 1},
                         WordCount.words('hello\tworld'),
                         msg='should not count tabs')
def test_word_occurrence0(self):
    self.assertDictEqual({'hello': 1, 'world': 1},
                         WordCount.words('hello  world'),
                         msg='should count multiple spaces as one')
def test_word_occurrence2(self):
    self.assertDictEqual({'one': 1, 'of': 1, 'each': 1},
                         WordCount.words("one of each"),
                         msg='should count one of each')
def test_word_occurrence5(self):
    # Numeric tokens come back as strings, since they are split out of a string.
    self.assertDictEqual({'testing': 2, '1': 1, '2': 1},
                         WordCount.words('testing 1 2 testing'),
                         msg='should include numbers')
def test_word_occurrence6(self):
    self.assertDictEqual({'go': 1, 'Go': 1, 'GO': 1},
                         WordCount.words('go Go GO'),
                         msg='should respect case')
import time


def main():
    # NOTE: Change this to the name of the .txt file you are using within /data/.
    file_path = 'data/book.txt'
    start = time.time()
    print("Starting up.")
    k = int(input("Enter a value for k:\n"))

    # Parse and sort data using collections.
    data = WordCount(file_path)
    data.sort_and_pop_word_dict(k)

    # Parse and sort data using a dictionary and a max heap.
    # data = WordCountDict('data/book.txt')
    # heap = MaxHeap()
    # heap.insert_dict(data.get_word_dict())
    # heap.pop_top_k_words(k)

    print(time.time() - start)
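# Hypothetical illustration of the top-k step that main() drives: given a
# word -> count dict, return the k most frequent words. It uses heapq.nlargest
# instead of the project's MaxHeap / sort_and_pop_word_dict machinery, purely
# to show the idea; the function name and behaviour here are assumptions.
import heapq


def top_k_words(word_dict, k):
    # nlargest keeps a bounded heap internally, so this runs in O(n log k).
    return heapq.nlargest(k, word_dict.items(), key=lambda item: item[1])


# e.g. top_k_words({'the': 12, 'cat': 3, 'sat': 5}, 2) -> [('the', 12), ('sat', 5)]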
def test_word_occurrence7(self):
    self.assertDictEqual({"¡Hola!": 1, "¿Qué": 1, "tal?": 1, "Привет!": 1},
                         WordCount.words('¡Hola! ¿Qué tal? Привет!'),
                         msg='should count international characters properly')
def generate_counts():
    res = request.get_json()
    urls = res['urls']
    print(urls)
    for url in urls:
        if cache.get(url) is None:
            cache.set(url, '')
            job = q.enqueue_call(func=WordCount().run, args=(url,))
    return ('Accepted', 202)
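# Hedged sketch of the wiring generate_counts() appears to rely on: a Flask
# request context, a Redis-backed cache, and an RQ work queue. The names
# (app, cache, q) and the '/counts' route are assumptions, not the project's
# actual configuration.
from flask import Flask, request
from redis import Redis
from rq import Queue

app = Flask(__name__)
cache = Redis()                    # provides the cache.get / cache.set used above
q = Queue(connection=Redis())      # provides q.enqueue_call(func=..., args=...) used above

app.add_url_rule('/counts', view_func=generate_counts, methods=['POST'])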
def test_word_occurrence3(self):
    self.assertDictEqual({'one': 1, 'fish': 4, 'two': 1, 'red': 1, 'blue': 1},
                         WordCount.words("one fish two fish red fish blue fish"),
                         msg='should count multiple occurrences')
def test_word_occurrence4(self):
    self.assertDictEqual({'car': 1, ':': 2, 'carpet': 1, 'as': 1, 'java': 1,
                          'javascript!!&@$%^&': 1},
                         WordCount.words('car : carpet as java : javascript!!&@$%^&'),
                         msg='should include punctuation')
def generate_dict(reviewList):
    dictionary = wc.count_words(wc, reviewList)
    dictionary = wc.sort_freq_dict(wc, dictionary)
    return dictionary
# coding=utf-8
from word_count import WordCount
from lookup import lookup

FLAG_LAST_WORD = 1
FLAG_MIDDLE_WORD = 2

wc = WordCount()


def word_topk(k, flag):
    if flag == FLAG_LAST_WORD:
        return wc.last_word_topk(k)
    if flag == FLAG_MIDDLE_WORD:
        return wc.middle_word_topk(k)
    return []


def word_freq(word, flag):
    """Return the commonness coefficient and level for a word."""
    if flag == FLAG_LAST_WORD:
        ret = wc.last_word_freq(word)
    else:
        ret = wc.middle_word_freq(word)
    freq, rank, total = ret
    # Quartile buckets: 1/4 -- 2/4 -- 3/4 -- 1
    if rank <= total / 4:
        comment = u'大众名'  # i.e. a very common (popular) name
def test_word_occurrence1(self):
    self.assertDictEqual({'word': 1},
                         WordCount.words('word'),
                         msg='should count one word')
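# A minimal sketch of the words() helper that the test_word_occurrence* cases
# above assume: a static method that splits on any run of whitespace and keeps
# case and punctuation. The real implementation may differ.
from collections import Counter


class WordCount:
    @staticmethod
    def words(text):
        # str.split() with no separator splits on spaces, tabs and newlines
        # alike, and collapses runs of whitespace, which matches the tests.
        return dict(Counter(text.split()))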
import math
import sys


class TermWeights:
    def __init__(self):
        self.files = FileUtils().get_files_from_path(sys.argv[1])
        self.file_count = len(self.files)
        self.k1 = 1.2
        self.b = 0.75
        self.max_score = 0
        self.min_score = 100
        self.avg_doc_length = 0
        self.idf = {}
        self.word_count_calculator = WordCount()
        self.bm_25_scores = {}

    # Calculates the IDF value for each word
    def calculate_idf(self):
        for word, count in self.word_count_calculator.document_count.items():
            self.idf[word] = math.log10(
                (self.file_count - count + 0.5) / (count + 0.5))
        self.avg_doc_length = sum(
            self.word_count_calculator.document_length.values()) / self.file_count

    # Normalizes the BM25 scores to values between 0 and 1
    def normalize_score(self, score):
        if (self.max_score - self.min_score) == 0:
            return 0
        else:
            return (score - self.min_score) / (self.max_score - self.min_score)

    # Primary method that calculates the BM25 score using the word counts.
    # Also keeps track of the max and min BM25 scores to be used for normalization.
    def calculate(self):
        self.word_count_calculator.calculate()
        self.calculate_idf()
        for file_name, word_count_dict in self.word_count_calculator.word_counts.items():
            bm_25 = {}
            for word, count in word_count_dict.items():
                try:
                    if self.word_count_calculator.global_count[word] != 1 and len(word) != 1:
                        count = int(count)
                        numerator = (count * (self.k1 + 1)) * float(self.idf[word])
                        denominator = count + self.k1 * (
                            1 - self.b + self.b
                            * (int(self.word_count_calculator.document_length[file_name])
                               / self.avg_doc_length))
                        bm_25_score = numerator / denominator
                        bm_25[word] = bm_25_score
                        if bm_25_score > self.max_score:
                            self.max_score = bm_25_score
                        if bm_25_score < self.min_score:
                            self.min_score = bm_25_score
                except KeyError:
                    continue
            self.bm_25_scores[file_name] = bm_25
        self.normalize()

    def normalize(self):
        for file_name, bm_25 in self.bm_25_scores.items():
            for word, score in bm_25.items():
                bm_25[word] = self.normalize_score(score)
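# Hedged, standalone illustration of the per-term BM25 score that
# TermWeights.calculate() builds above; the values in the example call are
# made up, not taken from any real document collection.
import math


def bm25_term_score(tf, idf, doc_length, avg_doc_length, k1=1.2, b=0.75):
    # Mirrors the numerator/denominator expressions in calculate():
    # score = tf*(k1+1)*idf / (tf + k1*(1 - b + b*doc_length/avg_doc_length))
    numerator = tf * (k1 + 1) * idf
    denominator = tf + k1 * (1 - b + b * (doc_length / avg_doc_length))
    return numerator / denominator


def idf_score(file_count, doc_count):
    # Same IDF form as calculate_idf(): log10((N - df + 0.5) / (df + 0.5))
    return math.log10((file_count - doc_count + 0.5) / (doc_count + 0.5))


# Example: a term with tf=3 and idf=1.2 in a 120-word document, with an average
# document length of 100 words, scores about 1.81 before normalization:
# bm25_term_score(3, 1.2, 120, 100) ≈ 1.81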