def test_tabs(self):
    """Tab characters separate words exactly like spaces do."""
    lyrics = ('rah rah ah ah ah\troma roma ma\tga ga oh la la\t'
              'want your bad romance')
    expected = {'rah': 2, 'ah': 3, 'roma': 2, 'ma': 1, 'ga': 2,
                'oh': 1, 'la': 2, 'want': 1, 'your': 1, 'bad': 1,
                'romance': 1}
    self.assertEqual(count_words(lyrics), expected)
def test_word_count_passage(self):
    """A multi-sentence passage yields the expected total word count."""
    passage = ("The number of orderings of the 52 cards in a deck of cards "
               "is so great that if every one of the almost 7 billion people alive "
               "today dealt one ordering of the cards per second, it would take "
               "2.5 * 10**40 times the age of the universe to order the cards in every "
               "possible way.")
    self.assertEqual(word_count.count_words(passage), 56)
def callback(ch, method, properties, body):
    # Message-queue consumer callback (Python 2 syntax throughout).
    # `body` is a JSON blob describing an uploaded document; the handler
    # fetches the document text from S3, counts its words, and persists
    # the counts back to the upload-metadata store.
    # NOTE(review): assumes `body` JSON always contains 's3_key',
    # 'document_slug' and 'time_uploaded' -- confirm against the producer.
    print " [x] Received %r" % (body)
    document_meta = json.loads(body)

    # Time the whole fetch-count-store cycle for the log line below.
    start = datetime.datetime.now()

    print ' Fetching from S3'
    s3_key = document_meta['s3_key']
    s3_path = get_s3_path(s3_key)
    response = urllib2.urlopen(s3_path)
    text_blob = response.read()

    word_count = count_words(text_blob)

    # Store the serialized counts keyed by the document's slug + upload time.
    update_file_upload_meta(
        document_slug = document_meta['document_slug'],
        time_uploaded = document_meta['time_uploaded'],
        word_counts = json.dumps(word_count),
    )

    duration = datetime.datetime.now() - start
    print ' Done in {0}!'.format(duration)
def test_alternating_word_separators_not_detected_as_a_word(self):
    """Runs of commas, newlines and quotes never produce empty words."""
    tally = count_words(",\n,one,\n ,two \n 'three'")
    self.assertEqual(tally, {"one": 1, "two": 1, "three": 1})
def test_include_numbers(self):
    """Standalone digits count as words in their own right."""
    expected = {"testing": 2, "1": 1, "2": 1}
    self.assertEqual(count_words("testing, 1, 2 testing"), expected)
def test_with_apostrophes(self):
    """Contractions such as don't are kept as single words."""
    tally = count_words("First: don't laugh. Then: don't cry.")
    expected = {"first": 1, "don't": 2, "laugh": 1, "then": 1, "cry": 1}
    self.assertEqual(tally, expected)
def test_multiple_occurrences_of_a_word(self):
    """Repeated words accumulate their counts."""
    tally = count_words("one fish two fish red fish blue fish")
    expected = {"one": 1, "fish": 4, "two": 1, "red": 1, "blue": 1}
    self.assertEqual(tally, expected)
def test_handles_expanded_lists(self):
    """Comma-plus-newline separated items are split into words."""
    expected = {"one": 1, "two": 1, "three": 1}
    self.assertEqual(count_words("one,\ntwo,\nthree"), expected)
def test_word_count_one_word(self):
    """A single word yields a count of one."""
    self.assertEqual(word_count.count_words('I'), 1)
def test_non_alphanumeric(self):
    """Punctuation and underscores both act as word separators."""
    tally = count_words("hey,my_spacebar_is_broken")
    expected = {"hey": 1, "my": 1, "spacebar": 1, "is": 1, "broken": 1}
    self.assertEqual(tally, expected)
def test_multiple_occurrences_of_a_word(self):
    """Each repetition of a word bumps its tally."""
    expected = {'one': 1, 'fish': 4, 'two': 1, 'red': 1, 'blue': 1}
    tally = count_words('one fish two fish red fish blue fish')
    self.assertEqual(tally, expected)
def test_handles_cramped_lists(self):
    """Comma-separated items with no spaces still split into words."""
    expected = {'one': 1, 'two': 1, 'three': 1}
    self.assertEqual(count_words('one,two,three'), expected)
# Report the total number of words found via count_words for the
# write_result sample file.
from word_count import count_words

file_name = r'standard library/test_files/write_result.txt'
words_count = count_words(file_name)
print('The total words count: ' + str(words_count))
def test_count_one_of_each_word(self):
    """Three distinct words each count exactly once."""
    tally = count_words('one of each')
    self.assertEqual(tally, {'one': 1, 'of': 1, 'each': 1})
def test_multiple_spaces_not_detected_as_a_word(self):
    """Extra whitespace never yields an empty-string word."""
    expected = {'multiple': 1, 'whitespaces': 1}
    self.assertEqual(count_words(' multiple whitespaces'), expected)
def test_count_one_word(self):
    """A lone word maps to a count of one."""
    tally = count_words('word')
    self.assertEqual(tally, {'word': 1})
def test_normalize_case(self):
    """Counting is case-insensitive: case variants fold together."""
    expected = {'go': 3, 'stop': 2}
    self.assertEqual(count_words('go Go GO Stop stop'), expected)
def test_multiple_spaces_not_detected_as_a_word(self):
    """Runs of whitespace contribute no entries of their own."""
    tally = count_words(" multiple whitespaces")
    self.assertEqual(tally, {"multiple": 1, "whitespaces": 1})
def test_handles_expanded_lists(self):
    """Items separated by comma and newline are counted individually."""
    tally = count_words('one,\ntwo,\nthree')
    self.assertEqual(tally, {'one': 1, 'two': 1, 'three': 1})
def _calculate_progress(self, dir_filter = script_analytics.DEFAULT_FILTER):
    # Walk every script file matched by dir_filter, tally translation
    # progress (file / character / word counts, both total and unique),
    # and render the statistics into the results label.  Runs on the GUI
    # thread and pumps the Qt event loop so the progress bar stays live.
    #
    # NOTE(review): written for Python 2 (see the filter()/len() usage
    # below) -- confirm the target interpreter before porting.

    # Re-entrancy guard: ignore the request if a scan is already running.
    if self._running:
        return
    self._running = True
    self._canceled = False

    self.ui.lblResults.setText("<center><b>Results</b></center>")

    start_time = time.time()

    # Placeholder maximum; replaced with the real total reported by
    # get_data() on the first progress update below.
    self.ui.progressBar.setMaximum(72000)
    self.ui.progressBar.setValue(0)

    # For our dupe database, we need the relative location of our files, not absolute.
    dir_start = len(common.editor_config.data01_dir) + 1

    # Aggregate counters: "unique" figures count each duplicate group once,
    # while the plain figures weight each group by its member count.
    total_files = 0
    unique_files = 0
    translated_files = 0
    translated_unique = 0

    total_chars = 0
    unique_chars = 0
    translated_chars = 0
    translated_unique_chars = 0

    translated_words = 0
    translated_unique_words = 0

    # Byte counting is currently disabled (see commented-out code below),
    # so these stay zero and the byte percentages default to 100%.
    total_bytes = 0
    unique_bytes = 0
    translated_bytes = 0
    translated_unique_bytes = 0

    groups_seen = set()
    files_seen = set()

    untranslated_lines = []

    next_update = UPDATE_INTERVAL

    for i, total, filename, data in script_analytics.SA.get_data(dir_filter):

        # A cancel request resets the UI and aborts the scan early.
        if self._canceled:
            self._running = False
            self._canceled = False
            self.ui.progressBar.setValue(0)
            self.ui.lblTimeElapsed.setText("00:00")
            return

        # Refresh the progress bar and elapsed clock every UPDATE_INTERVAL
        # items, pumping the event loop so the window stays responsive.
        if i >= next_update:
            self.ui.progressBar.setValue(i)
            self.ui.progressBar.setMaximum(total)
            self.ui.lblTimeElapsed.setText("%02d:%02d" % (divmod(time.time() - start_time, 60)))
            QtGui.QApplication.processEvents()
            next_update = i + UPDATE_INTERVAL

        if data == None:
            continue

        db_name = filename
        real_name = os.path.join(common.editor_config.data01_dir, filename)

        # Skip files already accounted for as members of a duplicate group.
        if db_name in files_seen:
            continue

        dupe_group = dupe_db.db.group_from_file(db_name)

        # Add the whole group to the translated files, but only one
        # to the unique translated. If there is no group, it's size 1.
        group_size = 1

        if not dupe_group == None:
            if dupe_group in groups_seen:
                continue
            else:
                groups_seen.add(dupe_group)

            # Only group members matching the active filter count toward
            # the group size.
            # NOTE(review): relies on Python 2 filter() returning a list;
            # under Python 3 the len() below would fail on the iterator.
            group_files = dupe_db.db.files_in_group(dupe_group)
            group_files = filter(dir_filter.search, group_files)
            group_size = len(group_files)

            files_seen.update(group_files)

        total_files += group_size
        unique_files += 1

        #file = script_for_counting(data)
        file = data

        # How many characters is the untranslated, non-tagged text?
        num_chars = len(file.notags[common.editor_config.lang_orig])
        #num_bytes = len(bytearray(file.notags[common.editor_config.lang_orig], encoding = "SJIS", errors = "replace"))

        total_chars += num_chars * group_size
        unique_chars += num_chars

        #total_bytes += num_bytes * group_size
        #unique_bytes += num_bytes

        # A file counts as translated when it has translation text, or when
        # there was nothing to translate in the first place.
        if not file.notags[common.editor_config.lang_trans] == "" or num_chars == 0:
            translated_files += group_size
            translated_unique += 1

            translated_chars += num_chars * group_size
            translated_unique_chars += num_chars

            words = count_words(file.notags[common.editor_config.lang_trans])
            translated_words += words * group_size
            translated_unique_words += words

            #translated_bytes += num_bytes * group_size
            #translated_unique_bytes += num_bytes

        #elif file.notags[common.editor_config.lang_trans] == "":
            #untranslated_lines.append(db_name)

    # progress.close()
    self.ui.progressBar.setValue(total)
    #print "Took %s seconds." % (time.time() - start_time)

    # Guard every percentage against a zero denominator.
    files_percent = 100.0 if total_files == 0 else float(translated_files) / total_files * 100
    unique_files_percent = 100.0 if unique_files == 0 else float(translated_unique) / unique_files * 100
    chars_percent = 100.0 if total_chars == 0 else float(translated_chars) / total_chars * 100
    unique_chars_percent = 100.0 if unique_chars == 0 else float(translated_unique_chars) / unique_chars * 100
    bytes_percent = 100.0 if total_bytes == 0 else float(translated_bytes) / total_bytes * 100
    unique_bytes_percent = 100.0 if unique_bytes == 0 else float(translated_unique_bytes) / unique_bytes * 100

    # Render the final report as rich text in the results label.
    self.ui.lblResults.setText(
        "<center><b>Results</b></center><br/>" +
        ("<b>Files</b>: %d / %d (%0.2f%%)<br/>" % (translated_files, total_files, files_percent)) +
        ("<b>Unique Files</b>: %d / %d (%0.2f%%)<br/>" % (translated_unique, unique_files, unique_files_percent)) +
        "<br/>" +
        ("<b>Japanese Characters</b>: %d / %d (%0.2f%%)<br/>" % (translated_chars, total_chars, chars_percent)) +
        ("<b>Unique Characters</b>: %d / %d (%0.2f%%)<br/>" % (translated_unique_chars, unique_chars, unique_chars_percent)) +
        "<br/>" +
        ("<b>English Words</b>: %d<br/>" % (translated_words)) +
        ("<b>Unique Words</b>: %d<br/>" % (translated_unique_words)) +
        "<br/>" +
        "<b>NOTE</b>: Unique X is lazy for \"X in all unique files.\""
    )

    self._running = False
    self._canceled = False
def test_ignore_punctuation(self):
    """Punctuation and symbol noise are stripped from the tally."""
    expected = {'car': 1, 'carpet': 1, 'as': 1, 'java': 1, 'javascript': 1}
    tally = count_words('car : carpet as java : javascript!!&@$%^&')
    self.assertEqual(tally, expected)
def test_include_numbers(self):
    """Digits are counted like any other word."""
    tally = count_words('testing 1 2 testing')
    self.assertEqual(tally, {'testing': 2, '1': 1, '2': 1})
def test_include_numbers(self):
    """Numeric tokens appear in the result alongside words."""
    expected = {'testing': 2, '1': 1, '2': 1}
    self.assertEqual(count_words('testing 1 2 testing'), expected)
def test_multiple_apostrophes_ignored(self):
    """Doubled quoting apostrophes around a word are stripped."""
    tally = count_words("''hey''")
    self.assertEqual(tally, {"hey": 1})
def test_normalize_case(self):
    """Upper- and mixed-case spellings fold into one lowercase key."""
    tally = count_words('go Go GO Stop stop')
    self.assertEqual(tally, {'go': 3, 'stop': 2})
def test_handles_cramped_lists(self):
    """Words jammed together with commas still split apart."""
    tally = count_words("one,two,three")
    self.assertEqual(tally, {"one": 1, "two": 1, "three": 1})
def test_with_apostrophes(self):
    """Contractions keep their apostrophe and count as one word."""
    expected = {'first': 1, "don't": 2, 'laugh': 1, 'then': 1, 'cry': 1}
    tally = count_words("First: don't laugh. Then: don't cry.")
    self.assertEqual(tally, expected)
def test_ignore_punctuation(self):
    """Colons and trailing symbol noise are ignored entirely."""
    tally = count_words("car: carpet as java: javascript!!&@$%^&")
    expected = {"car": 1, "carpet": 1, "as": 1, "java": 1, "javascript": 1}
    self.assertEqual(tally, expected)
def test_with_quotations(self):
    """Quoting apostrophes are stripped but contractions survive."""
    expected = {'joe': 1, "can't": 1, 'tell': 1, 'between': 1,
                'large': 2, 'and': 1}
    tally = count_words("Joe can't tell between 'large' and large.")
    self.assertEqual(tally, expected)
def test_normalize_case(self):
    """Case differences collapse into a single lowercase entry."""
    expected = {"go": 3, "stop": 2}
    self.assertEqual(count_words("go Go GO Stop stop"), expected)
def test_multiple_spaces_not_detected_as_a_word(self):
    """Surplus whitespace contributes no entries of its own."""
    tally = count_words(' multiple whitespaces')
    self.assertEqual(tally, {'multiple': 1, 'whitespaces': 1})
def test_with_quotations(self):
    """Single-quoted words are counted the same as bare ones."""
    tally = count_words("Joe can't tell between 'large' and large.")
    expected = {"joe": 1, "can't": 1, "tell": 1, "between": 1,
                "large": 2, "and": 1}
    self.assertEqual(tally, expected)
def test_alternating_word_separators_not_detected_as_a_word(self):
    """Mixed separators in sequence yield no spurious words."""
    expected = {'one': 1, 'two': 1, 'three': 1}
    self.assertEqual(count_words(",\n,one,\n ,two \n 'three'"), expected)
def test_count_one_word(self):
    """A single-word input gives a single-entry tally."""
    tally = count_words("word")
    self.assertEqual(tally, {"word": 1})
def test_handles_cramped_lists(self):
    """Commas without surrounding spaces act as separators."""
    expected = {'one': 1, 'two': 1, 'three': 1}
    self.assertEqual(count_words('one,two,three'), expected)
def test_count_one_of_each_word(self):
    """Each distinct word appears once with a count of one."""
    tally = count_words("one of each")
    self.assertEqual(tally, {"one": 1, "of": 1, "each": 1})
def test_count_one_of_each_word(self):
    """Three different words each tally to one."""
    expected = {'one': 1, 'of': 1, 'each': 1}
    self.assertEqual(count_words('one of each'), expected)
def test_word_count_no_words(self):
    """An empty string contains zero words."""
    self.assertEqual(word_count.count_words(''), 0)
def test_non_alphanumeric(self):
    """Underscores and punctuation both separate words."""
    expected = {'hey': 1, 'my': 1, 'spacebar': 1, 'is': 1, 'broken': 1}
    tally = count_words('hey,my_spacebar_is_broken.')
    self.assertEqual(tally, expected)
def test_word_count_one_sentence(self):
    """A short sentence reports its total number of words."""
    self.assertEqual(word_count.count_words('I love my dog.'), 4)
def test_handles_expanded_lists(self):
    """Comma-and-newline separated list items are individual words."""
    expected = {'one': 1, 'two': 1, 'three': 1}
    self.assertEqual(count_words('one,\ntwo,\nthree'), expected)
def test_count_one_word(self):
    """The simplest input: one word, count of one."""
    expected = {'word': 1}
    self.assertEqual(count_words('word'), expected)
# Run both word counters over each sample book, including a lookup for
# the specific word 'grass'.
from word_count import count_words, count_specific_word

filenames = ('alice.txt', 'siddhartha.txt', 'sherlock.txt', 'treasure.txt')
for filename in filenames:
    count_words(filename)
    count_specific_word(filename, 'grass')