Example #1
0
def keywords(str, top=10, nouns=True, singularize=True, filters=None):
    
    """ Guesses keywords in a piece of text.
    
    Strips delimiters from the text and counts word occurrences.
    By default, uses WordNet to filter out words,
    and furthermore ignores connectives and tags.
    By default, attempts to singularize nouns.
    
    The return value is a list (length defined by top)
    of (count, word) tuples.
    
    For example:
    from urllib import urlopen
    html = urlopen("http://news.bbc.co.uk/").read()
    meta = ["news", "health", "uk", "version", "weather", "video", "sport", "return", "read", "help"]
    print keywords(html, filters=meta)
    >>> [(6, 'funeral'), (5, 'beirut'), (3, 'war'), (3, 'service'), (3, 'radio'), (3, 'mull'), (3, 'lebanon'), (3, 'islamist'), (3, 'function'), (3, 'female')]
    
    """
    
    # Use None as the default and create a fresh list per call:
    # a mutable default argument ([]) would be shared across calls.
    if filters is None:
        filters = []
    
    if nouns:
        # Attempt to load the WordNet library.
        # When this fails, don't filter for nouns.
        # Catch only ImportError so that e.g. KeyboardInterrupt
        # is not silently swallowed (the original used a bare except).
        try:
            import wordnet
        except ImportError:
            nouns = False
    
    # NOTE: the parameter name "str" shadows the builtin; it is kept
    # for backward compatibility with keyword-argument callers.
    words = strip_tags(str)
    words = words.replace("\n", " ")
    words = words.split(" ")

    count = {}
    for word in words:
        
        word = word.lower()
        
        # Remove surrounding special characters.
        # A single strip() already removes *every* matching leading and
        # trailing character (so "this.." and "::this" are both handled);
        # the original's "repeat 10 times" loop was redundant.
        word = word.strip("(){}[]'\"\r\n\t,.?!;:-*/ ")
        
        # Determine nouns using WordNet.
        # Attempt a lame singularization:
        # if a word is not a noun
        # and it is longer than three characters,
        # and it ends in an s,
        # and the same word without the s IS a noun,
        # then this word is probably a plural.
        noun = False
        if nouns:
            if wordnet.is_noun(word):
                noun = True
            elif singularize \
            and len(word) > 3 \
            and word.endswith("s") \
            and wordnet.is_noun(word[:-1]):
                noun = True
                word = word[:-1]
        
        # Filter for connectives
        # and (by default) keep only nouns.
        if len(word) > 1 \
        and word not in filters \
        and not is_connective(word) \
        and (not nouns or noun):
            # dict.get avoids the double lookup of "in count.keys()".
            count[word] = count.get(word, 0) + 1
    
    # Rank by descending count.  Avoid shadowing the builtin sorted();
    # sort(reverse=True) is equivalent to sort()+reverse() here because
    # the (count, word) tuples are unique.
    ranked = [(n, word) for word, n in count.items()]
    ranked.sort(reverse=True)
    
    return ranked[:top]
Example #2
0
def sentence_keywords(str, top=10, nouns=True, singularize=True, filters=None):
    """ Guesses keywords in a piece of text.
    
    Strips delimiters from the text and counts word occurrences.
    By default, uses WordNet to filter out words,
    and furthermore ignores connectives, numbers and tags.
    By default, attempts to singularize nouns.
    
    The return value is a list (length defined by top)
    of (count, word) tuples.
    
    For example:
    from urllib import urlopen
    html = urlopen("http://news.bbc.co.uk/").read()
    meta = ["news", "health", "uk", "version", "weather", "video", "sport", "return", "read", "help"]
    print sentence_keywords(html, filters=meta)
    >>> [(6, 'funeral'), (5, 'beirut'), (3, 'war'), (3, 'service'), (3, 'radio'), (3, 'mull'), (3, 'lebanon'), (3, 'islamist'), (3, 'function'), (3, 'female')]
    
    """

    # Use None as the default and create a fresh list per call:
    # a mutable default argument ([]) would be shared across calls.
    if filters is None:
        filters = []

    # NOTE: the parameter name "str" shadows the builtin; it is kept
    # for backward compatibility with keyword-argument callers.
    words = tags.strip_tags(str)
    words = words.replace("\n", " ")
    words = words.split(" ")

    count = {}
    for word in words:

        word = word.lower()

        # Remove surrounding special characters.
        # A single strip() already removes *every* matching leading and
        # trailing character (so "this.." and "::this" are both handled);
        # the original's "repeat 10 times" loop was redundant.
        word = word.strip("(){}[]'\"\r\n\t,.?!;:-*/ ")

        # Determine nouns using WordNet.
        # Attempt singularization first; keep the word as-is when the
        # singular form is not a noun but the word itself is.
        noun = False
        if nouns:
            # Compute the singular form once instead of twice.
            root = singular.singular(word) if singularize and len(word) > 3 else None
            if root is not None and wordnet.is_noun(root):
                noun = True
                word = root
            elif wordnet.is_noun(word):
                noun = True

        # Filter for connectives, numbers, tags
        # and (by default) keep only nouns.
        if len(word) > 1 \
        and word not in filters \
        and not is_connective(word) \
        and not numeral.is_number(word) \
        and not tags.is_tag(word) \
        and (not nouns or noun):
            # dict.get avoids the double lookup of "in count.keys()".
            count[word] = count.get(word, 0) + 1

    # Rank by descending count.  Avoid shadowing the builtin sorted();
    # sort(reverse=True) is equivalent to sort()+reverse() here because
    # the (count, word) tuples are unique.
    ranked = [(n, word) for word, n in count.items()]
    ranked.sort(reverse=True)

    return ranked[:top]
Example #3
0
def is_noun(word):
    """Return whether WordNet classifies *word* as a noun.

    Thin delegation to the project's wordnet wrapper module.
    """
    verdict = wordnet.is_noun(word)
    return verdict