Example #1
0
 def testRightNumberOfWords(self):
     """Run ``word_count`` on ``dane.txt`` and check the answer file it leaves.

     After calling ``word_count("dane.txt")`` the test reads
     ``count_answer.txt`` and compares its full content with the expected
     sentence.

     Raises:
         IOError: if ``count_answer.txt`` cannot be opened. The error is
             reported and re-raised instead of being retried, because
             nothing changes between attempts and a retry loop would
             never terminate.
         AssertionError: if the file content differs from the expected text.
     """
     word_count("dane.txt")
     try:
         answer_file = open("count_answer.txt", 'r')
     except IOError:
         print("No such file in this localisation")
         raise
     # The context manager closes the handle even if read() fails.
     with answer_file:
         text_w = answer_file.read()
     assert text_w == "Text from file dane.txt contain 23 words",  "word_count failed, wrong answer in the file"
Example #2
0
def extract_old(text):
    """Extract a story string from ``text`` using a line-length heuristic.

    ``text`` is split on newlines. A line counts as a match when its length
    is greater than ``average + 1.2 * deviation`` of all line lengths,
    ``titleCase(line)`` equals ``False`` and its ``periodCase`` entry is
    greater than 0.005. From the first match onward every line is buffered;
    once more than one non-matching line has followed the latest match, a
    buffer holding at least two lines is added to ``blocks`` and the buffer
    is reset (smaller buffers are simply dropped).

    Returns:
        ``False`` if ``blocks`` ended up empty, otherwise the string built
        from ``blocks[0]`` (after the optional swap with ``blocks[1]``).
    """
    lines = re.split('\n', text)

    # Per-line character counts.
    # NOTE(review): empty/amax/std are presumably NumPy functions imported
    # elsewhere in the file - confirm.
    lengths = empty(len(lines))
    for i in range(len(lines)):
        lengths[i] = len(lines[i])

    total = sum(lengths)
    average = total / len(lines)
    longest = amax(lengths)  # not read again in this function
    deviation = std(lengths)


    # The next results (w_count, groups, dates, comments) are assigned but
    # not read again in this function; deviants is only appended to.
    w_count = word_count(lines)
    groups = grouper(lengths)
    deviants = []
    dates = find_date(lines)
    comments = find_comments(lines)

    start = False      # becomes True at the first match and is never reset
    end_count = 0      # non-matching lines seen since the latest match
    large_block = []   # lines buffered since the last flush
    blocks = []
    period_ratio = []  # overwritten by periodCase(lines) just below

    # Looking for beginning of story based on cluster of long lines
    period_ratio = periodCase(lines)
    for i in range(len(lines)):
        if lengths[i] > average + (deviation * 1.2) \
           and titleCase(lines[i]) == False \
           and period_ratio[i] > 0.005:
            start = True
            end_count = 0
        elif start == True:
            end_count += 1

        # More than one non-matching line in a row: flush or drop the buffer.
        if end_count > 1:
            if len(large_block) < 2:
                large_block = []
            else:
                # NOTE(review): += extends blocks with the individual lines,
                # so blocks holds lines rather than lists of lines - confirm
                # whether blocks.append(large_block) was intended.
                blocks += large_block
                large_block = []

        # The current line is buffered after the flush check, so it starts
        # the next buffer when a flush just happened.
        if start == True:
            large_block.append(lines[i])
            deviants.append(i)

    # NOTE(review): whatever is still in large_block when the loop ends is
    # not added to blocks.
    if len(blocks) == 0:
        return False
    # Maybe use second block instead of first
    # (taken when the first entry has fewer than 3 items and the second has more)
    if len(blocks) > 1 and \
       len(blocks[0]) < 3 and \
       len(blocks[1]) > len(blocks[0]): 
        blocks[0] = blocks[1]

    # Concatenate the items of blocks[0]. Given the += above, blocks[0] is a
    # single element of lines, so this walks that element item by item.
    story = ''
    for line in blocks[0]:
        story += line

    return story
Example #3
0
def extract_old(text):
    """Extract a story string from ``text`` using a line-length heuristic.

    ``text`` is split on newlines. A line counts as a match when its length
    is greater than ``average + 1.2 * deviation`` of all line lengths,
    ``titleCase(line)`` equals ``False`` and its ``periodCase`` entry is
    greater than 0.005. From the first match onward every line is buffered;
    once more than one non-matching line has followed the latest match, a
    buffer holding at least two lines is added to ``blocks`` and the buffer
    is reset (smaller buffers are simply dropped).

    Returns:
        ``False`` if ``blocks`` ended up empty, otherwise the string built
        from ``blocks[0]`` (after the optional swap with ``blocks[1]``).
    """
    lines = re.split('\n', text)

    # Per-line character counts.
    # NOTE(review): empty/amax/std are presumably NumPy functions imported
    # elsewhere in the file - confirm.
    lengths = empty(len(lines))
    for i in range(len(lines)):
        lengths[i] = len(lines[i])

    total = sum(lengths)
    average = total / len(lines)
    longest = amax(lengths)  # not read again in this function
    deviation = std(lengths)

    # The next results (w_count, groups, dates, comments) are assigned but
    # not read again in this function; deviants is only appended to.
    w_count = word_count(lines)
    groups = grouper(lengths)
    deviants = []
    dates = find_date(lines)
    comments = find_comments(lines)

    start = False      # becomes True at the first match and is never reset
    end_count = 0      # non-matching lines seen since the latest match
    large_block = []   # lines buffered since the last flush
    blocks = []
    period_ratio = []  # overwritten by periodCase(lines) just below

    # Looking for beginning of story based on cluster of long lines
    period_ratio = periodCase(lines)
    for i in range(len(lines)):
        if lengths[i] > average + (deviation * 1.2) \
           and titleCase(lines[i]) == False \
           and period_ratio[i] > 0.005:
            start = True
            end_count = 0
        elif start == True:
            end_count += 1

        # More than one non-matching line in a row: flush or drop the buffer.
        if end_count > 1:
            if len(large_block) < 2:
                large_block = []
            else:
                # NOTE(review): += extends blocks with the individual lines,
                # so blocks holds lines rather than lists of lines - confirm
                # whether blocks.append(large_block) was intended.
                blocks += large_block
                large_block = []

        # The current line is buffered after the flush check, so it starts
        # the next buffer when a flush just happened.
        if start == True:
            large_block.append(lines[i])
            deviants.append(i)

    # NOTE(review): whatever is still in large_block when the loop ends is
    # not added to blocks.
    if len(blocks) == 0:
        return False
    # Maybe use second block instead of first
    # (taken when the first entry has fewer than 3 items and the second has more)
    if len(blocks) > 1 and \
       len(blocks[0]) < 3 and \
       len(blocks[1]) > len(blocks[0]):
        blocks[0] = blocks[1]

    # Concatenate the items of blocks[0]. Given the += above, blocks[0] is a
    # single element of lines, so this walks that element item by item.
    story = ''
    for line in blocks[0]:
        story += line

    return story
Example #4
0
	def add_ingredient(self, ingredient):
		"""Record one occurrence of ``ingredient`` in ``self.ingredient_list``.

		An ingredient that is already a key has ``increment()`` called on its
		stored entry; a new one is stored as ``word_count(ingredient, 1)``.

		Always returns 0.
		"""
		known = self.ingredient_list
		if ingredient not in known:
			# First sighting: create the entry.
			known[ingredient] = word_count(ingredient, 1)
			return 0

		known[ingredient].increment()
		return 0
Example #5
0
def get_body(start, stop, lines, dates):
    """Return the lines at indices ``start`` .. ``stop - 1`` that pass the body filters.

    A line is kept when its ``word_count`` entry is greater than 1, none of
    the forbidden phrases matches the stripped line, ``titleCase(line)``
    equals ``False`` and its ``periodCase`` entry is greater than .001.

    ``dates`` is accepted but not used by this function.

    Returns:
        The kept lines as a list, in their original order.
    """
    words_per_line = word_count(lines)
    periods_per_line = periodCase(lines)

    # This list should move to the database eventually.
    # A line matching any of these regular expressions is blocked from
    # being added to the body.
    blocked_patterns = [
        re.compile(source)
        for source in (
            r'(?i)AP Photo',
            r'(?i)&copy;',
            r'(?i)@\w+\.(com|net|org)',
            r'Photo by',
            r'(?i)All rights reserved',
            r'(?i)Copyright\s+20[0-9]{2}',
            r'(?i)follow us on twitter',
            r'Don\'t let it get away!',
            r'Your own personalized stock watchlist!',
            r'Keep track of the stocks that matter to you.',
            r'yourself with the Fool\'s FREE and easy new watchlist service today.',
            r'Please login or register',
            r'Customize features on the site to',
            r'Fine Print: The following comments',
            r'(?i)please login or register.',
            r'(?i)Each day, we\'ll email you',
            r'(?i)The content of this field is kept private and will not be shown publicly\.',
            r'^Daily Newsletter!$',
            r'Yahoo! Buzz',
            r'(?i)is now on twitter!',
            r'(?i)follow me @',
            r'(?i)Use of the Website signifies your agreement to the Terms of Use and Privacy Policy',
            r'(?i)The following are trademarks or service marks of Major League Baseball entities',
        )
    ]

    kept = []
    for index in range(start, stop):
        line = lines[index]
        # The ratio is still computed per line although its value is not
        # consulted by the filter below.
        triple_ratio(line)

        stripped = line.strip()
        is_forbidden = any(
            pattern.search(stripped) is not None for pattern in blocked_patterns
        )

        if (
            words_per_line[index] > 1
            and not is_forbidden
            and titleCase(line) == False
            and periods_per_line[index] > .001
        ):
            kept.append(line)
    return kept
Example #6
0
def get_body(start, stop, lines, dates):
    """Filter ``lines[i]`` for ``i`` in ``range(start, stop)`` down to body lines.

    A line survives when all of the following hold, checked in this order:
    its ``word_count`` entry is greater than 1, no forbidden phrase matches
    the stripped line, ``titleCase(line)`` equals ``False``, and its
    ``periodCase`` entry is greater than .001.

    ``dates`` is part of the signature but is not read here.

    Returns:
        A list with the surviving lines in input order.
    """
    counts = word_count(lines)
    ratios = periodCase(lines)

    # This list should move to the database eventually.
    # These regular expressions block a line from being added to the body
    # if they match that line.
    phrase_sources = (
        r'(?i)AP Photo',
        r'(?i)&copy;',
        r'(?i)@\w+\.(com|net|org)',
        r'Photo by',
        r'(?i)All rights reserved',
        r'(?i)Copyright\s+20[0-9]{2}',
        r'(?i)follow us on twitter',
        r'Don\'t let it get away!',
        r'Your own personalized stock watchlist!',
        r'Keep track of the stocks that matter to you.',
        r'yourself with the Fool\'s FREE and easy new watchlist service today.',
        r'Please login or register',
        r'Customize features on the site to',
        r'Fine Print: The following comments',
        r'(?i)please login or register.',
        r'(?i)Each day, we\'ll email you',
        r'(?i)The content of this field is kept private and will not be shown publicly\.',
        r'^Daily Newsletter!$',
        r'Yahoo! Buzz',
        r'(?i)is now on twitter!',
        r'(?i)follow me @',
        r'(?i)Use of the Website signifies your agreement to the Terms of Use and Privacy Policy',
        r'(?i)The following are trademarks or service marks of Major League Baseball entities',
    )
    compiled_phrases = [re.compile(source) for source in phrase_sources]

    def hits_forbidden_phrase(candidate):
        # True as soon as one pattern is found in the stripped line.
        bare = candidate.strip()
        for phrase in compiled_phrases:
            if phrase.search(bare) is not None:
                return True
        return False

    body = []
    for position in range(start, stop):
        current = lines[position]
        # Evaluated for every line; the result is not used by the filter.
        triple_ratio(current)
        blocked = hits_forbidden_phrase(current)

        if not (counts[position] > 1):
            continue
        if blocked:
            continue
        if titleCase(current) == False and ratios[position] > .001:
            body.append(current)
    return body
Example #7
0
 def testCurrencyword_count(self):
     """Check that ``word_count("dane.txt")`` returns a non-negative value.

     Raises:
         AssertionError: if the result is ``None`` or below zero.
     """
     words = word_count("dane.txt")
     # Identity check for None; the message names words because that is the
     # quantity being verified here.
     assert words is not None and words >= 0, \
         "word_count failed, wrong number of words"
Example #8
0
def test_word_count():
    """Compare ``word_count()`` with an expected value typed in by the user.

    The expected value is read through ``input()`` and converted to ``int``
    before ``word_count()`` is called, so the test needs something on
    standard input.
    """
    prompt = 'What is the expected_result: '
    expected_result = int(input(prompt))
    actual_result = word_count()
    assert actual_result == expected_result
Example #9
0
 def test_word_count(self):
     """Assert that ``word_count()`` equals a value entered on standard input.

     The expected value is read through ``input()`` and converted to ``int``
     first; ``word_count()`` is called afterwards and the two are compared
     with ``assertEqual``.
     """
     prompt = 'What is the expected_result: '
     expected_result = int(input(prompt))
     actual_result = word_count()
     self.assertEqual(actual_result, expected_result)