Example no. 1
import nltk
from part_of_speech import get_part_of_speech

def lemmatizer(report):
    # lemmatize each token in `report` using its most likely part of speech
    wordnet_lemmatizer = nltk.WordNetLemmatizer()
    lemmatized = [
        wordnet_lemmatizer.lemmatize(token, get_part_of_speech(token))
        for token in report
    ]
    return lemmatized
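# A quick usage sketch (hypothetical input, not from the original example):
# `report` is assumed to be a list of word tokens, e.g. from word_tokenize.
from nltk.tokenize import word_tokenize

sample_report = word_tokenize("The cats were running faster than the dogs")
print(lemmatizer(sample_report))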
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def count_words(text):
    # strip punctuation and lowercase the raw text
    cleaned = re.sub(r'\W+', ' ', text).lower()
    tokenized = word_tokenize(cleaned)

    # drop common English stop words
    stop_words = stopwords.words('english')
    filtered = [word for word in tokenized if word not in stop_words]

    # lemmatize each remaining token with its most likely part of speech
    normalizer = WordNetLemmatizer()
    normalized = [
        normalizer.lemmatize(token, get_part_of_speech(token))
        for token in filtered
    ]

    # bag of words: map each lemma to its frequency
    bag_of_looking_glass_words = Counter(normalized)
    return bag_of_looking_glass_words
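Every snippet here imports `get_part_of_speech` from a local `part_of_speech` module that is never shown. A minimal sketch of what such a helper might look like, assuming it simply picks the WordNet POS tag the word occurs under most often (this is an assumption, not the original module):

from collections import Counter
from nltk.corpus import wordnet

def get_part_of_speech(word):
    # gather every WordNet synset the word appears in
    synsets = wordnet.synsets(word)

    # count how often the word occurs as noun, verb, adjective, adverb
    pos_counts = Counter()
    pos_counts["n"] = len([s for s in synsets if s.pos() == "n"])
    pos_counts["v"] = len([s for s in synsets if s.pos() == "v"])
    pos_counts["a"] = len([s for s in synsets if s.pos() == "a"])
    pos_counts["r"] = len([s for s in synsets if s.pos() == "r"])

    # return the most frequent tag; falls back to "n" for unknown words
    return pos_counts.most_common(1)[0][0]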
Example no. 3
# regex for removing punctuation
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# grabbing a part-of-speech function:
from part_of_speech import get_part_of_speech

text = "So many squids are jumping out of suitcases these days that you can barely go anywhere without seeing one burst forth from a tightly packed valise. I went to the dentist the other day, and sure enough I saw an angry one jump out of my dentist's bag within minutes of arriving. She hardly even noticed."

cleaned = re.sub(r'\W+', ' ', text)
tokenized = word_tokenize(cleaned)

stemmer = PorterStemmer()
stemmed = [stemmer.stem(token) for token in tokenized]

lemmatizer = WordNetLemmatizer()
lemmatized = [
    lemmatizer.lemmatize(token, get_part_of_speech(token))
    for token in tokenized
]

print("Stemmed text:")
print(stemmed)
print("\nLemmatized text:")
print(lemmatized)

#####
print('\n##################################################\n')
#####
"""
It may be helpful to know how the words relate to each other and the underlying syntax (grammar). Parsing is a stage of NLP concerned with segmenting text based on syntax.

Part-of-speech tagging (POS tagging) identifies parts of speech (verbs, nouns, adjectives, etc.). NLTK can do it faster (and maybe more accurately) than your grammar teacher!
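The note above mentions POS tagging, but none of the snippets here actually call a tagger. A minimal sketch using NLTK's built-in `pos_tag` (added for illustration; it assumes the `averaged_perceptron_tagger` data has been downloaded):

from nltk import pos_tag
from nltk.tokenize import word_tokenize

sentence = "So many squids are jumping out of suitcases these days."
tokens = word_tokenize(sentence)

# each token is paired with a Penn Treebank tag (nouns as NN/NNS, verbs as VB*, etc.)
print(pos_tag(tokens))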
Example no. 5
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from part_of_speech import get_part_of_speech

lemmatizer = WordNetLemmatizer()

populated_island = 'Indonesia was founded in 1945. It contains the most populated island in the world, Java, with over 140 million people.'

tokenized_string = word_tokenize(populated_island)

lemmatized_pos = [lemmatizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized_string]

try:
  print(f'The lemmatized words are: {lemmatized_pos}')
except NameError:
  print('Expected a variable called `lemmatized_pos`')
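# For contrast (a small sketch, not in the original): without a POS argument,
# WordNetLemmatizer treats every token as a noun, so verbs such as "founded"
# come back unchanged.
lemmatized_no_pos = [lemmatizer.lemmatize(token) for token in tokenized_string]
print(f'Lemmatized without POS tags: {lemmatized_no_pos}')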
# importing regex and nltk
import re, nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# importing Counter to get word counts for bag of words
from collections import Counter

# read in the text
with open("iliad.txt", encoding='utf-8') as corpus_file:
  text = corpus_file.read().lower()

# importing part-of-speech function for lemmatization
from part_of_speech import get_part_of_speech

cleaned = re.sub(r'\W+', ' ', text).lower()
tokenized = word_tokenize(cleaned)

stop_words = stopwords.words('english')
filtered = [word for word in tokenized if word not in stop_words]

normalizer = WordNetLemmatizer()
normalized = [normalizer.lemmatize(token, get_part_of_speech(token)) for token in filtered]

# Define bag_of_looking_glass_words & print:
bag_of_looking_glass_words = Counter(normalized)
print(bag_of_looking_glass_words)
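# Follow-up sketch (not in the original script): Counter.most_common() is a
# convenient way to inspect the most frequent lemmas in the bag of words.
print(bag_of_looking_glass_words.most_common(10))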