import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def readAndCleanDoc(doc):
    # 1. Open document, read text into *single* string
    with open(doc, 'r') as file:
        myfile = file.read()
    # 2. Tokenize string using nltk.tokenize.word_tokenize
    nltk.download('punkt')
    doc_tokens = nltk.tokenize.word_tokenize(myfile)
    # 3. Filter out punctuation from list of words (use remove_punc)
    filtrd_doc = remove_punc(doc_tokens)
    # 4. Make the words lower case
    # 5. Filter out stopwords (steps 4 and 5 are combined in one comprehension)
    nltk.download('stopwords')
    stop = stopwords.words('english')
    doc_tokens_clean = [word.lower() for word in filtrd_doc if word.lower() not in stop]
    # 6. Stem words
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in doc_tokens_clean]
    return words

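Every variant here delegates step 3 to a remove_punc helper supplied by the assignment, whose definition is not shown. A minimal sketch of what such a helper might look like, assuming it simply drops tokens made up entirely of punctuation characters (the exact semantics are an assumption):

import string

def remove_punc(tokens):
    # Hypothetical helper: keep a token only if at least one of its
    # characters is not punctuation, so "don't" survives but "--" is dropped.
    return [t for t in tokens if not all(ch in string.punctuation for ch in t)]
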
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def readAndCleanDoc(doc):
    #1. Open document, read text into *single* string
    with open(doc, "r") as file:
        text = file.read()
    #2. Tokenize string using nltk.tokenize.word_tokenize
    text = word_tokenize(text)
    #3. Filter out punctuation from list of words (use remove_punc)
    text = remove_punc(text)
    #4. Make the words lower case
    for i, word in enumerate(text):
        text[i] = word.lower()
    #5. Filter out stopwords
    stop_set = stopwords.words('english')
    text_nostop = [n for n in text if n not in stop_set]
    #6. Lemmatize words
    lmtzr = WordNetLemmatizer()
    words = [lmtzr.lemmatize(x) for x in text_nostop]
    return words

import nltk

def readAndCleanDoc(doc):
    #1. Open document, read text into *single* string
    with open(doc, 'r') as f:
        words = f.read()
    #2. Tokenize string using nltk.tokenize.word_tokenize
    words = nltk.tokenize.word_tokenize(words)
    #3. Filter out punctuation from list of words (use remove_punc)
    words = remove_punc(words)
    #4. Make the words lower case
    words = [content.lower() for content in words]
    #5. Filter out stopwords
    stop_words = nltk.corpus.stopwords.words('english')
    words = [content for content in words if content not in stop_words]
    #6. Stem words
    stemmer = nltk.stem.PorterStemmer()
    words = [stemmer.stem(content) for content in words]
    return words

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def readAndCleanDoc(doc):
    #1. Open document, read text into *single* string
    with open(doc, 'r') as file:
        words = file.read().replace('\n', ' ')
    #2. Tokenize string using nltk.tokenize.word_tokenize
    words = nltk.tokenize.word_tokenize(words)
    #3. Filter out punctuation from list of words (use remove_punc)
    words = remove_punc(words)
    #4. Make the words lower case
    words = [i.lower() for i in words]
    #5. Filter out stopwords
    stop_words = stopwords.words('english')
    words = [w for w in words if w not in stop_words]
    #6. Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(i) for i in words]
    return words

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def readAndCleanDoc(doc):
    #1. Open document, read text into *single* string
    with open(doc, 'r') as f:
        data = f.read()
    data = data.lower()
    #2. Tokenize string using nltk.tokenize.word_tokenize
    word_tokens = word_tokenize(data, language='english')
    #3. Filter out punctuation from list of words (use remove_punc)
    word_tokens = remove_punc(word_tokens)
    #4. Make the words lower case (already done on the whole string above)
    #5. Filter out stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in word_tokens if w not in stop_words]
    #6. Stem words
    ps = PorterStemmer()
    for index, w in enumerate(words):
        words[index] = ps.stem(w)
    return words

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def readAndCleanDoc(doc):
    #1. Open document, read text into *single* string
    with open(doc, "r") as f:
        text = f.read()
    #2. Tokenize string using nltk.tokenize.word_tokenize
    #nltk.download('punkt')  # one-time setup if the tokenizer data is missing
    word_token = word_tokenize(text)
    #3. Filter out punctuation from list of words (use remove_punc)
    word_filtered = remove_punc(word_token)
    #4. Make the words lower case
    word_lower = [word.lower() for word in word_filtered]
    #5. Filter out stopwords
    #nltk.download('stopwords')  # one-time setup if the stopword list is missing
    stop_words = set(stopwords.words('english'))
    doc_clean = [word for word in word_lower if word not in stop_words]
    #6. Stem words
    ps = PorterStemmer()
    words = [ps.stem(w) for w in doc_clean]
    return words

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def readAndCleanDoc(doc):
    #1. Open document, read text into *single* string
    with open(doc, 'r') as myfile:
        tmp = myfile.read()
    #2. Tokenize string using nltk.tokenize.word_tokenize
    tokens = word_tokenize(tmp)
    #3. Filter out punctuation from list of words (use remove_punc)
    tokens = remove_punc(tokens)
    #4. Make the words lower case
    tokens = [word.lower() for word in tokens]
    #5. Filter out stopwords
    stop = stopwords.words('english')
    tokens_clean = [x for x in tokens if x not in stop]
    #6. Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in tokens_clean]
    return words
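
Any of the variants can be exercised the same way. A quick usage sketch, assuming the required NLTK data has been fetched and a plain-text file exists; the filename 'sample.txt' is a hypothetical input chosen for illustration:

import nltk

# One-time downloads of the NLTK data the cleaners rely on.
nltk.download('punkt')      # tokenizer models for word_tokenize
nltk.download('stopwords')  # English stopword list
nltk.download('wordnet')    # needed only by the lemmatizer-based variants

tokens = readAndCleanDoc('sample.txt')  # 'sample.txt' is an assumed example file
print(tokens[:10])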