Python word_count Beispiele

Beispiel #1

0

Datei anzeigen

Datei: TestCount.py Projekt: klaudiar/Lab_01

 def testRightNumberOfWords(self):
     """Check the contents of count_answer.txt after running word_count.

     Calls ``word_count("dane.txt")`` and then compares the whole content of
     ``count_answer.txt`` with the expected sentence (presumably that file is
     produced by ``word_count`` -- confirm against its implementation).

     Raises:
         AssertionError: if ``count_answer.txt`` cannot be read or its
             content differs from the expected text.
     """
     word_count("dane.txt")
     # Fail once instead of retrying: a missing file will not appear by
     # itself, so looping on IOError would hang the test run forever.
     try:
         # The context manager closes the file even if read() raises.
         with open("count_answer.txt", 'r') as answer_file:
             text_w = answer_file.read()
     except IOError:
         raise AssertionError("No such file in this localisation")
     assert text_w == "Text from file dane.txt contain 23 words",  "word_count failed, wrong answer in the file"

Beispiel #2

0

Datei anzeigen

Datei: body_extractor_old.py Projekt: ctwiz/sourcereader

def extract_old(text):
    """Extract the first block of unusually long lines from *text*.

    The text is split on newlines. A line "starts" (or continues) a story
    block when all of the following hold:

    * its length exceeds ``average + 1.2 * deviation`` of all line lengths,
    * ``titleCase(line) == False``,
    * its entry in ``periodCase(lines)`` is greater than ``0.005``.

    Once a block has started, every following line is collected; after more
    than one consecutive non-matching line the collected lines are stored as
    a block if there are at least two of them, otherwise they are discarded.

    Args:
        text: The raw text to scan (a string containing ``\\n`` separators).

    Returns:
        The lines of the selected block concatenated without separators, or
        ``False`` when no block was found.
    """
    lines = re.split('\n', text)

    # `empty`, `amax` and `std` are presumably the numpy functions of the
    # same name -- TODO confirm against the file's imports.
    lengths = empty(len(lines))
    for i, line in enumerate(lines):
        lengths[i] = len(line)

    total = sum(lengths)
    average = total / len(lines)
    longest = amax(lengths)
    deviation = std(lengths)

    # NOTE(review): the results below are never read in this function. The
    # calls are kept because it is not visible here whether the helpers have
    # side effects.
    w_count = word_count(lines)
    groups = grouper(lengths)
    deviants = []
    dates = find_date(lines)
    comments = find_comments(lines)

    start = False
    end_count = 0
    large_block = []
    blocks = []

    # Looking for beginning of story based on cluster of long lines
    period_ratio = periodCase(lines)
    threshold = average + (deviation * 1.2)
    for i, line in enumerate(lines):
        if lengths[i] > threshold \
           and titleCase(line) == False \
           and period_ratio[i] > 0.005:
            start = True
            end_count = 0
        elif start:
            end_count += 1

        if end_count > 1:
            # Keep only blocks of at least two lines. Each block is stored
            # as ONE element: the previous `blocks += large_block` flattened
            # the block into separate line strings, so `blocks[0]` below was
            # a single line and its `len()` a character count.
            if len(large_block) >= 2:
                blocks.append(large_block)
            large_block = []

        if start:
            large_block.append(line)
            deviants.append(i)

    if not blocks:
        return False
    # Maybe use second block instead of first
    if len(blocks) > 1 and \
       len(blocks[0]) < 3 and \
       len(blocks[1]) > len(blocks[0]):
        blocks[0] = blocks[1]

    # Lines were split on '\n', so they are joined without a separator,
    # matching the original concatenation.
    return ''.join(blocks[0])

Beispiel #3

0

Datei anzeigen

def extract_old(text):
    """Extract the first block of unusually long lines from *text*.

    The text is split on newlines. A line "starts" (or continues) a story
    block when all of the following hold:

    * its length exceeds ``average + 1.2 * deviation`` of all line lengths,
    * ``titleCase(line) == False``,
    * its entry in ``periodCase(lines)`` is greater than ``0.005``.

    Once a block has started, every following line is collected; after more
    than one consecutive non-matching line the collected lines are stored as
    a block if there are at least two of them, otherwise they are discarded.

    Args:
        text: The raw text to scan (a string containing ``\\n`` separators).

    Returns:
        The lines of the selected block concatenated without separators, or
        ``False`` when no block was found.
    """
    lines = re.split('\n', text)

    # `empty`, `amax` and `std` are presumably the numpy functions of the
    # same name -- TODO confirm against the file's imports.
    lengths = empty(len(lines))
    for i, line in enumerate(lines):
        lengths[i] = len(line)

    total = sum(lengths)
    average = total / len(lines)
    longest = amax(lengths)
    deviation = std(lengths)

    # NOTE(review): the results below are never read in this function. The
    # calls are kept because it is not visible here whether the helpers have
    # side effects.
    w_count = word_count(lines)
    groups = grouper(lengths)
    deviants = []
    dates = find_date(lines)
    comments = find_comments(lines)

    start = False
    end_count = 0
    large_block = []
    blocks = []

    # Looking for beginning of story based on cluster of long lines
    period_ratio = periodCase(lines)
    threshold = average + (deviation * 1.2)
    for i, line in enumerate(lines):
        if lengths[i] > threshold \
           and titleCase(line) == False \
           and period_ratio[i] > 0.005:
            start = True
            end_count = 0
        elif start:
            end_count += 1

        if end_count > 1:
            # Keep only blocks of at least two lines. Each block is stored
            # as ONE element: the previous `blocks += large_block` flattened
            # the block into separate line strings, so `blocks[0]` below was
            # a single line and its `len()` a character count.
            if len(large_block) >= 2:
                blocks.append(large_block)
            large_block = []

        if start:
            large_block.append(line)
            deviants.append(i)

    if not blocks:
        return False
    # Maybe use second block instead of first
    if len(blocks) > 1 and \
       len(blocks[0]) < 3 and \
       len(blocks[1]) > len(blocks[0]):
        blocks[0] = blocks[1]

    # Lines were split on '\n', so they are joined without a separator,
    # matching the original concatenation.
    return ''.join(blocks[0])

Beispiel #4

0

Datei anzeigen

Datei: ingredients.py Projekt: justiceamoh/whatscooking

	def add_ingredient(self, ingredient):
		"""Register one occurrence of *ingredient* in ``self.ingredient_list``.

		An ingredient that is already a key has ``increment()`` called on its
		stored entry; otherwise a new entry ``word_count(ingredient, 1)`` is
		stored under that key.

		Returns:
			Always ``0``.
		"""
		if ingredient not in self.ingredient_list:
			# First sighting: create the entry via word_count(ingredient, 1).
			self.ingredient_list[ingredient] = word_count(ingredient, 1)
			return 0

		self.ingredient_list[ingredient].increment()
		return 0

Beispiel #5

0

Datei anzeigen

Datei: get_body.py Projekt: dpgailey/bawdy

def get_body(start, stop, lines, dates):
    """Return the lines with index in ``range(start, stop)`` that look like body text.

    A line is kept when all of the following hold:

    * its entry in ``word_count(lines)`` is greater than 1,
    * its stripped text matches none of the forbidden patterns below,
    * ``titleCase(line) == False``,
    * its entry in ``periodCase(lines)`` is greater than ``.001``.

    Args:
        start: First line index to inspect (inclusive).
        stop: Last line index to inspect (exclusive).
        lines: Sequence of text lines.
        dates: Not used by this function.

    Returns:
        A list of the accepted lines, in their original order.
    """
    w_count = word_count(lines)
    period_ratio = periodCase(lines)

    # This list should move to the database eventually. Any line matched by
    # one of these regular expressions is kept out of the body.
    forbidden_sources = (
        r'(?i)AP Photo',
        r'(?i)&copy;',
        r'(?i)@\w+\.(com|net|org)',
        r'Photo by',
        r'(?i)All rights reserved',
        r'(?i)Copyright\s+20[0-9]{2}',
        r'(?i)follow us on twitter',
        r'Don\'t let it get away!',
        r'Your own personalized stock watchlist!',
        r'Keep track of the stocks that matter to you.',
        r'yourself with the Fool\'s FREE and easy new watchlist service today.',
        r'Please login or register',
        r'Customize features on the site to',
        r'Fine Print: The following comments',
        r'(?i)please login or register.',
        r'(?i)Each day, we\'ll email you',
        r'(?i)The content of this field is kept private and will not be shown publicly\.',
        r'^Daily Newsletter!$',
        r'Yahoo! Buzz',
        r'(?i)is now on twitter!',
        r'(?i)follow me @',
        r'(?i)Use of the Website signifies your agreement to the Terms of Use and Privacy Policy',
        r'(?i)The following are trademarks or service marks of Major League Baseball entities',
    )
    blockers = [re.compile(source) for source in forbidden_sources]

    kept = []
    for idx in range(start, stop):
        current = lines[idx]
        # Result is unused here; the call is retained from the original.
        three_letter_ratio = triple_ratio(current)

        stripped = current.strip()
        forbidden = any(rx.search(stripped) is not None for rx in blockers)

        # Same evaluation order as the original condition chain:
        # word count first, then the forbidden flag.
        if not (w_count[idx] > 1) or forbidden:
            continue
        if titleCase(current) == False and period_ratio[idx] > .001:
            kept.append(current)
    return kept

Beispiel #6

0

Datei anzeigen

Datei: get_body.py Projekt: ctwiz/sourcereader

def get_body(start, stop, lines, dates):
    """Filter ``lines[i]`` for ``i`` in ``range(start, stop)`` down to body lines.

    A line is appended to the result only if its ``word_count(lines)`` entry
    exceeds 1, no forbidden pattern matches its stripped text,
    ``titleCase(line) == False`` and its ``periodCase(lines)`` entry exceeds
    ``.001``.

    Args:
        start: Index of the first line to examine (inclusive).
        stop: Index at which to stop (exclusive).
        lines: Sequence of text lines.
        dates: Accepted but not used in this function.

    Returns:
        List of the lines that passed every check, in input order.
    """
    w_count = word_count(lines)
    period_ratio = periodCase(lines)
    body_lines = []

    # This list should move to the database eventually.
    # These regular expressions block a line from being added to the body
    # if they match that line.
    compile_ = re.compile
    forbiddenPhrases = [
        compile_(r'(?i)AP Photo'),
        compile_(r'(?i)&copy;'),
        compile_(r'(?i)@\w+\.(com|net|org)'),
        compile_(r'Photo by'),
        compile_(r'(?i)All rights reserved'),
        compile_(r'(?i)Copyright\s+20[0-9]{2}'),
        compile_(r'(?i)follow us on twitter'),
        compile_(r'Don\'t let it get away!'),
        compile_(r'Your own personalized stock watchlist!'),
        compile_(r'Keep track of the stocks that matter to you.'),
        compile_(r'yourself with the Fool\'s FREE and easy new watchlist service today.'),
        compile_(r'Please login or register'),
        compile_(r'Customize features on the site to'),
        compile_(r'Fine Print: The following comments'),
        compile_(r'(?i)please login or register.'),
        compile_(r'(?i)Each day, we\'ll email you'),
        compile_(r'(?i)The content of this field is kept private and will not be shown publicly\.'),
        compile_(r'^Daily Newsletter!$'),
        compile_(r'Yahoo! Buzz'),
        compile_(r'(?i)is now on twitter!'),
        compile_(r'(?i)follow me @'),
        compile_(r'(?i)Use of the Website signifies your agreement to the Terms of Use and Privacy Policy'),
        compile_(r'(?i)The following are trademarks or service marks of Major League Baseball entities'),
    ]

    for position in range(start, stop):
        candidate = lines[position]
        # Value is not used below; the call is retained from the original.
        three_letter_ratio = triple_ratio(candidate)

        trimmed = candidate.strip()
        # for/else: `forbidden` stays False only if no pattern matched.
        for pattern in forbiddenPhrases:
            if pattern.search(trimmed) is not None:
                forbidden = True
                break
        else:
            forbidden = False

        if (w_count[position] > 1
                and not forbidden
                and titleCase(candidate) == False
                and period_ratio[position] > .001):
            body_lines.append(candidate)
    return body_lines

Beispiel #7

0

Datei anzeigen

Datei: TestCount.py Projekt: klaudiar/Lab_01

 def testCurrencyword_count(self):
     """Check that word_count("dane.txt") returns a non-negative value.

     Raises:
         AssertionError: if the result is ``None`` or negative.
     """
     words = word_count("dane.txt")
     # Identity comparison is the idiomatic None check (PEP 8); the
     # short-circuit keeps `words >= 0` from being evaluated on None.
     assert words is not None and words >= 0, "word_count failed, wrong number of lines"

Beispiel #8

0

Datei anzeigen

def test_word_count():
    """Compare word_count() with an expected integer typed in by the user.

    Prompts on standard input for the expected result, converts it with
    ``int`` and asserts that ``word_count()`` returns an equal value.
    """
    typed_answer = input('What is the expected_result: ')
    expected_result = int(typed_answer)
    actual_result = word_count()
    assert actual_result == expected_result

Beispiel #9

0

Datei anzeigen

 def test_word_count(self):
     """Compare word_count() with an expected integer typed in by the user.

     Prompts on standard input for the expected result, converts it with
     ``int`` and checks equality through ``self.assertEqual``.
     """
     typed_answer = input('What is the expected_result: ')
     expected_result = int(typed_answer)
     actual_result = word_count()
     self.assertEqual(actual_result, expected_result)