def testRightNumberOfWords(self):
    """Run word_count on ``dane.txt`` and check the text left in ``count_answer.txt``.

    The test calls ``word_count("dane.txt")``, then reads
    ``count_answer.txt`` from the current working directory and compares its
    whole content with the expected sentence.
    (presumably word_count writes that file -- verify against word_count)

    Raises:
        AssertionError: if ``count_answer.txt`` cannot be opened, or if its
            content differs from the expected sentence.
    """
    word_count("dane.txt")

    # One attempt only: retrying open() in an endless loop can never succeed
    # once the file is missing, it just hangs the test run.
    try:
        with open("count_answer.txt", 'r') as answer_file:
            text_w = answer_file.read()
    except IOError:
        raise AssertionError("No such file in this localisation")

    assert text_w == "Text from file dane.txt contain 23 words", \
        "word_count failed, wrong answer in the file"
def extract_old(text):
    """Pull the first cluster of unusually long lines out of ``text``.

    ``text`` is split on newlines.  A line qualifies as "long prose" when its
    length exceeds the mean line length by more than 1.2 standard deviations,
    ``titleCase`` returns ``False`` for it, and its ``periodCase`` entry is
    above 0.005.  From the first qualifying line onwards every line is
    collected into the current block; on the second consecutive
    non-qualifying line (and every one after it) the current block is
    closed.  Closed blocks with fewer than two lines are discarded.

    Args:
        text: the raw text to scan, as a single string.

    Returns:
        The lines of the chosen block concatenated into one string (no
        separator is re-inserted between them), or ``False`` when no block
        was found.  The first block is chosen unless it has fewer than three
        lines and the second block is longer.
    """
    lines = re.split('\n', text)

    # assumes `empty` is numpy.empty (float array) so the mean below is a
    # true division even under Python 2 -- TODO confirm
    lengths = empty(len(lines))
    for idx, line in enumerate(lines):
        lengths[idx] = len(line)
    average = sum(lengths) / len(lines)

    # NOTE(review): the results of amax/word_count/grouper/find_date/
    # find_comments are never read in this function.  The calls are kept, in
    # their original order, only because any side effects they might have are
    # not visible from here -- confirm and delete.
    amax(lengths)
    deviation = std(lengths)
    word_count(lines)
    grouper(lengths)
    find_date(lines)
    find_comments(lines)

    # Looking for beginning of story based on cluster of long lines
    period_ratio = periodCase(lines)
    threshold = average + (deviation * 1.2)

    started = False      # becomes True at the first qualifying line, never reset
    misses = 0           # consecutive non-qualifying lines since the last hit
    current = []         # lines of the block being collected
    blocks = []          # closed blocks, each a list of lines

    for idx, line in enumerate(lines):
        # `== False` is kept on purpose: `not titleCase(...)` would give a
        # different answer if titleCase ever returns None.
        if lengths[idx] > threshold \
                and titleCase(line) == False \
                and period_ratio[idx] > 0.005:  # noqa: E712
            started = True
            misses = 0
        elif started:
            misses += 1
            if misses > 1:
                # append, not +=: extending would flatten all blocks into one
                # list of lines, making blocks[0] a single line of text.
                if len(current) >= 2:
                    blocks.append(current)
                current = []
        if started:
            current.append(line)

    # A cluster that is still open when the input ends is a block too.
    if len(current) >= 2:
        blocks.append(current)

    if not blocks:
        return False

    # Maybe use second block instead of first
    chosen = blocks[0]
    if len(blocks) > 1 and len(chosen) < 3 and len(blocks[1]) > len(chosen):
        chosen = blocks[1]

    return ''.join(chosen)
def add_ingredient(self, ingredient):
    """Record one occurrence of ``ingredient`` in ``self.ingredient_list``.

    An ingredient that is already a key has ``increment()`` called on its
    stored entry; an unseen ingredient gets a new entry built with
    ``word_count(ingredient, 1)``.

    Returns:
        Always ``0``.
    """
    known = self.ingredient_list

    # First sighting: create the entry and finish.
    if ingredient not in known:
        known[ingredient] = word_count(ingredient, 1)
        return 0

    known[ingredient].increment()
    return 0
# Regular expressions that block a line from being added to the body when
# they match that line.  Compiled once at import instead of on every call.
# TODO: this list should move to the database eventually.
_FORBIDDEN_PHRASES = (
    re.compile(r'(?i)AP Photo'),
    re.compile(r'(?i)©'),
    re.compile(r'(?i)@\w+\.(com|net|org)'),
    re.compile(r'Photo by'),
    re.compile(r'(?i)All rights reserved'),
    re.compile(r'(?i)Copyright\s+20[0-9]{2}'),
    re.compile(r'(?i)follow us on twitter'),
    re.compile(r'Don\'t let it get away!'),
    re.compile(r'Your own personalized stock watchlist!'),
    re.compile(r'Keep track of the stocks that matter to you.'),
    re.compile(
        r'yourself with the Fool\'s FREE and easy new watchlist service today.'
    ),
    re.compile(r'Please login or register'),
    re.compile(r'Customize features on the site to'),
    re.compile(r'Fine Print: The following comments'),
    re.compile(r'(?i)please login or register.'),
    re.compile(r'(?i)Each day, we\'ll email you'),
    re.compile(
        r'(?i)The content of this field is kept private and will not be shown publicly\.'
    ),
    re.compile(r'^Daily Newsletter!$'),
    # The previous pattern had a stray line-break character between
    # "Yahoo! " and "Buzz", so it could not match the plain phrase.  \s+
    # accepts ordinary whitespace as well as that character.
    re.compile(r'Yahoo!\s+Buzz'),
    re.compile(r'(?i)is now on twitter!'),
    re.compile(r'(?i)follow me @'),
    re.compile(
        r'(?i)Use of the Website signifies your agreement to the Terms of Use and Privacy Policy'
    ),
    re.compile(
        r'(?i)The following are trademarks or service marks of Major League Baseball entities'
    ),
)


def get_body(start, stop, lines, dates):
    """Return the lines at indexes ``start`` .. ``stop - 1`` that pass the body filters.

    A line is kept when all of the following hold:

    * its entry in ``word_count(lines)`` is greater than 1,
    * its stripped text matches none of ``_FORBIDDEN_PHRASES``,
    * ``titleCase`` returns ``False`` for it, and
    * its entry in ``periodCase(lines)`` is greater than 0.001.

    Args:
        start: index of the first line to examine.
        stop: index one past the last line to examine.
        lines: the full list of text lines; the per-line statistics are
            computed over the whole list, not only the examined range.
        dates: accepted for interface compatibility; not read here.

    Returns:
        A list of the accepted lines, unmodified and in their original order.
    """
    w_count = word_count(lines)
    period_ratio = periodCase(lines)
    blocks = []

    for i in range(start, stop):
        line = lines[i]

        # NOTE(review): only the disabled debug log below reads this value.
        # The call is kept because any side effects of triple_ratio are not
        # visible from here -- confirm and drop.
        three_letter_ratio = triple_ratio(line)
        #log.plog("B: "+str(i)+": w_count: "+str(w_count[i])+" p_r: "+str(period_ratio[i])+" :: " +" 3_r: "+str(three_letter_ratio)+" :: " +lines[i])

        # Strip once per line rather than once per pattern.
        stripped = line.strip()
        forbidden = any(
            regex.search(stripped) is not None for regex in _FORBIDDEN_PHRASES
        )

        # `== False` is kept on purpose: `not titleCase(...)` would give a
        # different answer if titleCase ever returns None.
        if w_count[i] > 1 \
                and not forbidden \
                and titleCase(line) == False \
                and period_ratio[i] > .001:  # noqa: E712
            blocks.append(line)

    return blocks
# Regular expressions that block a line from being added to the body when
# they match that line.  Compiled once at import instead of on every call.
# TODO: this list should move to the database eventually.
_FORBIDDEN_PHRASES = (
    re.compile(r'(?i)AP Photo'),
    re.compile(r'(?i)©'),
    re.compile(r'(?i)@\w+\.(com|net|org)'),
    re.compile(r'Photo by'),
    re.compile(r'(?i)All rights reserved'),
    re.compile(r'(?i)Copyright\s+20[0-9]{2}'),
    re.compile(r'(?i)follow us on twitter'),
    re.compile(r'Don\'t let it get away!'),
    re.compile(r'Your own personalized stock watchlist!'),
    re.compile(r'Keep track of the stocks that matter to you.'),
    re.compile(r'yourself with the Fool\'s FREE and easy new watchlist service today.'),
    re.compile(r'Please login or register'),
    re.compile(r'Customize features on the site to'),
    re.compile(r'Fine Print: The following comments'),
    re.compile(r'(?i)please login or register.'),
    re.compile(r'(?i)Each day, we\'ll email you'),
    re.compile(r'(?i)The content of this field is kept private and will not be shown publicly\.'),
    re.compile(r'^Daily Newsletter!$'),
    # The previous pattern had a stray line-break character between
    # "Yahoo! " and "Buzz", so it could not match the plain phrase.  \s+
    # accepts ordinary whitespace as well as that character.
    re.compile(r'Yahoo!\s+Buzz'),
    re.compile(r'(?i)is now on twitter!'),
    re.compile(r'(?i)follow me @'),
    re.compile(r'(?i)Use of the Website signifies your agreement to the Terms of Use and Privacy Policy'),
    re.compile(r'(?i)The following are trademarks or service marks of Major League Baseball entities'),
)


def get_body(start, stop, lines, dates):
    """Filter the lines at indexes ``start`` .. ``stop - 1`` down to body text.

    A line survives only if every one of these is true:

    * ``word_count(lines)[i]`` is greater than 1,
    * none of ``_FORBIDDEN_PHRASES`` matches the stripped line,
    * ``titleCase(line)`` returns ``False``, and
    * ``periodCase(lines)[i]`` is greater than 0.001.

    Args:
        start: index of the first line to examine.
        stop: index one past the last line to examine.
        lines: the full list of text lines; word counts and period ratios
            are computed over the whole list, not only the examined range.
        dates: accepted for interface compatibility; not read here.

    Returns:
        A list of the surviving lines, unmodified and in their original
        order.
    """
    w_count = word_count(lines)
    period_ratio = periodCase(lines)
    blocks = []

    for i in range(start, stop):
        line = lines[i]

        # NOTE(review): only the disabled debug log below reads this value.
        # The call is kept because any side effects of triple_ratio are not
        # visible from here -- confirm and drop.
        three_letter_ratio = triple_ratio(line)
        #log.plog("B: "+str(i)+": w_count: "+str(w_count[i])+" p_r: "+str(period_ratio[i])+" :: " +" 3_r: "+str(three_letter_ratio)+" :: " +lines[i])

        # Strip once per line rather than once per pattern.
        stripped = line.strip()
        forbidden = any(
            regex.search(stripped) is not None for regex in _FORBIDDEN_PHRASES
        )

        # `== False` is kept on purpose: `not titleCase(...)` would give a
        # different answer if titleCase ever returns None.
        if w_count[i] > 1 \
                and not forbidden \
                and titleCase(line) == False \
                and period_ratio[i] > .001:  # noqa: E712
            blocks.append(line)

    return blocks
def testCurrencyword_count(self):
    """Check that ``word_count("dane.txt")`` returns a non-negative value.

    Raises:
        AssertionError: if the result is ``None`` or is below zero.
    """
    words = word_count("dane.txt")

    # Checked separately so a failure says which condition broke; the None
    # check also has to pass before the comparison is attempted.
    assert words is not None, "word_count failed, no result returned"
    assert words >= 0, "word_count failed, wrong number of words"
def test_word_count():
    """Compare ``word_count()`` with a number typed in by whoever runs the test.

    The expected value is read from standard input and converted with
    ``int``; the test passes when ``word_count()`` equals it.
    """
    # NOTE(review): the expectation comes from stdin, so this test cannot run
    # unattended -- consider replacing the prompt with a fixed value.
    typed = input('What is the expected_result: ')
    expected_result = int(typed)

    actual = word_count()
    assert actual == expected_result
def test_word_count(self):
    """Compare ``word_count()`` with a number typed in by whoever runs the test.

    The expected value is read from standard input and converted with
    ``int``; ``assertEqual`` then checks ``word_count()`` against it.
    """
    # NOTE(review): the expectation comes from stdin, so this test cannot run
    # unattended -- consider replacing the prompt with a fixed value.
    typed = input('What is the expected_result: ')
    expected_result = int(typed)

    actual = word_count()
    self.assertEqual(actual, expected_result)