def lemma(self, text):
    # Lemmatize every token of every document (text is a list of token lists).
    lemmatizer = nltk.WordNetLemmatizer()
    text = [[lemmatizer.lemmatize(w) for w in doc] for doc in text]
    return text
# Join all strings into one string
rawText = " ".join(lst)
# Tokenize the raw text into words
tokens = nltk.word_tokenize(rawText)
text = nltk.Text(tokens)
# Remove extra chars and remove stop words.
stopWords = prepareStopWords()
text_content = [word for word in text if word not in stopWords]
# Remove any entries where the length is zero.
text_content = [s for s in text_content if len(s) != 0]
# Get the lemma of each word to reduce the number of similar words
WNL = nltk.WordNetLemmatizer()
text_content = [WNL.lemmatize(t) for t in text_content]

# =============================================================================
# UNIGRAM
# =============================================================================
unigram_strg = " ".join(text_content)

# Setting word cloud params and plotting the word cloud.
WC_height = 500
WC_width = 1000
WC_max_words = 100
unigram_wordcloud = WordCloud(max_font_size=50, max_words=100,
def raw_preprocess(textfile): #order of functions: #read in x #tokenise x #normalize x #perform tests on most common words x #contractions (needs to come before removal of punctuation and negation) x #remove stopwords x #negation (needs to come before removal of punctuation) x #lemmatize x #remove punctuation x #remove other words x #tag x 4: default, RE, unigram, bigram x domain = open(textfile) #read in data from text file raw_domain = domain.read() tokens = nltk.word_tokenize(raw_domain) #tokenise words = [w.lower() for w in tokens] #normalize wnl = nltk.WordNetLemmatizer() #find the lexical diversity of the debate def lex_diversity(text): return len(set(text)) / len(text) * 100 #find the most common collocations within the debate def collocations(text): finder = BigramCollocationFinder.from_words(tokens) return sorted( finder.above_score(bigram_measures.raw_freq, 3.0 / len(tuple(nltk.bigrams(tokens))))) #process and replace contracted words with full phrases def contractions(text): #list of contractions contractions = [['don', 't', 'do', 'not'], ['can', 't', 'can', 'not'], ['isn', 't', 'is', 'not'], ['aren', 't', 'are', 'not'], ['wasn', 't', 'was', 'not'], ['weren', 't', 'were', 'not'], ['hasn', 't', 'has', 'not'], ['haven', 't', 'have', 'not'], ['hadn', 't', 'had', 'not'], ['won', 't', 'will', 'not'], ['wouldn', 't', 'would', 'not'], ['doesn', 't', 'does', 'not'], ['didn', 't', 'did', 'not'], ['couldn', 't', 'could', 'not'], ['shouldn', 't', 'should', 'not'], ['mightn', 't', 'might', 'not'], ['mustn', 't', 'must', 'not'], ['would', 've', 'would', 'have'], ['could', 've', 'could', 'have'], ['should', 've', 'should', 'have'], ['I', 'm', 'I', 'am'], ['I', 'll', 'I', 'will'], ['I', 'd', 'I', 'would'], ['I', 've', 'I', 'have'], ['I', 'd', 'I', 'had'], ['you', 're', 'you', 'are'], ['you', 'll', 'you', 'will'], ['you', 'd', 'you', 'would'], ['you', 've', 'you', 'have'], ['you', 'd', 'you', 'had'], ['he', 's', 'he', 'am'], ['he', 'll', 'he', 'will'], ['he', 'd', 'he', 'would'], ['he', 's', 'he', 'has'], ['he', 'd', 'he', 'had'], ['she', 's', 'she', 'am'], ['she', 'll', 'she', 'will'], ['she', 'd', 'she', 'would'], ['she', 's', 'she', 'has'], ['she', 'd', 'she', 'had'], ['it', 's', 'it', 'is'], ['it', 'll', 'it', 'will'], ['it', 'd', 'it', 'would'], ['it', 's', 'it', 'has'], ['it', 'd', 'it', 'had'], ['we', 're', 'we', 'are'], ['we', 'll', 'we', 'will'], ['we', 'd', 'we', 'would'], ['we', 've', 'we', 'have'], ['we', 'd', 'we', 'had'], ['they', 're', 'they', 'are'], ['they', 'll', 'they', 'will'], ['they', 'd', 'they', 'would'], ['they', 've', 'they', 'have'], ['they', 'd', 'they', 'had'], ['that', 's', 'that', 'is'], ['that', 'll', 'that', 'will'], ['that', 'd', 'that', 'would'], ['that', 's', 'that', 'has'], ['that', 'd', 'that', 'had'], ['who', 's', 'who', 'is'], ['who', 'll', 'who', 'will'], ['who', 'd', 'who', 'would'], ['who', 's', 'who', 'has'], ['who', 'd', 'who', 'had'], ['what', 's', 'what', 'is'], ['what', 're', 'what', 'are'], ['what', 'll', 'what', 'will'], ['what', 'd', 'what', 'would'], ['what', 's', 'what', 'has'], ['what', 'd', 'what', 'had'], ['where', 's', 'where', 'is'], ['where', 're', 'where', 'are'], ['where', 'll', 'where', 'will'], ['where', 'd', 'where', 'would'], ['where', 's', 'where', 'has'], ['where', 'd', 'where', 'had'], ['when', 's', 'when', 'is'], ['when', 're', 'when', 'are'], ['when', 'll', 'when', 'will'], ['when', 'd', 'when', 'would'], ['when', 's', 'when', 'has'], ['when', 'd', 'when', 'had'], ['why', 's', 'why', 'is'], ['why', 're', 'why', 'are'], 
['why', 'll', 'why', 'will'], ['why', 'd', 'why', 'would'], ['why', 's', 'why', 'has'], ['why', 'd', 'why', 'had'], ['how', 's', 'how', 'is'], ['how', 're', 'how', 'are'], ['how', 'll', 'how', 'will'], ['how', 'd', 'how', 'would'], ['how', 's', 'how', 'has'], ['how', 'd', 'how', 'had']] location = 0 for word in text: if word == '’': for (before, after, newbefore, newafter) in contractions: if text[location - 1] == before: if text[location + 1] == after: text[location - 1] = newbefore text[location + 1] = newafter location += 1 return text #append NOT_ to each word that follows 'not' within a sentence or clause def negation(text): # apply negation for w in range(0, len(text)): #for word in the text if (text[w] == 'not'): #if that word is not - add more?? n = 1 not_ = w while True: #while still in sentence/word clause and not at end of text text[not_ + n] = 'NOT_' + text[not_ + n] #add 'NOT' to each word n += 1 if ((not_ + n) >= len(text)): break if text[not_ + n] in ('NOT_.', '.', 'NOT_?', '?', 'NOT_!', '!', 'NOT_,', ',', 'NOT_:', ':', 'NOT_;', ';'): #until end of sentence break return text #common, potentially skewing, words in text words_to_remove = ['baroness', 'lord', 'lords', 'noble'] words = contractions( words) #apply above function: deal with contractions and apostrophes words = [ word for word in words if word not in stopwords.words('english') or word in ('not') ] #remove stopwords words = negation(words) #deal with negations in the text words = [wnl.lemmatize(w) for w in words] #lemmatize words = [ word for word in words if word.isalpha() or word.startswith('NOT_') ] #remove punctuation #print("distinct words: ", distinct_words(words)) #print("lexical diversity: ",lex_diversity(words),'%') #print("collocations: ",collocations(words)) words = [word for word in words if word not in words_to_remove] words = [ word.replace('NOT_' + r'^-?[0-9]+$', 'NOT_') for word in words ] #remove punctuation/special characters aside from the underscore in NOT_ words = [word.replace('NOT_' + r'(\.|\,|!\?)', 'NOT_') for word in words] words = [word for word in words if word not in ('NOT_')] #print(words) #TAG REMAINING WORDS #find more frequent (therefore default) tagger bts = brown.tagged_sents(categories='news', tagset='universal') tags = [ t for (w, t) in brown.tagged_words(categories='news', tagset='universal') ] fd = nltk.FreqDist(tags) fd.most_common(1) #most common is NOUN #regular expressions tagger patterns = [ (r'.+ing$', 'VERB'), # gerunds (r'.+ed$', 'VERB'), # past tense of verbs (r'.+es$', 'VERB'), # present tense (r'.+ould$', 'VERB'), # modal verb (r'.+\'s$', 'NOUN'), # possessive (r'.+s$', 'NOUN'), # plural nouns (r'^-?[0-9]+$', 'NUM'), # cardinal numbers (r'.+ly$', 'ADV'), # adverbs (r'(^the$|^a$|^wh)', 'DET'), # determiner (r'^[A-Z]', 'NOUN'), # proper names (r'(^he$|^she$|^they$|^him$|^her$|^his$|^hers$|^theirs$)', 'PRO'), #personal pronouns (r'\b(at|in|of)(?:\w+\s){0,3}([A-Z]\w+)', 'PPO'), # prepositions (r'(^can$|^may$|^must$|^should$|^would$|^could$)', 'MOD'), #modals (r'(\.|\,|!\?)', '.') ] #unigram size = int(len(bts) * 0.9) train = bts[:size] test = bts[size:] unigram_tagger = nltk.UnigramTagger(train) unigram_tagger.evaluate(test) #bringing in context: n-gram except issue with sparse data problem #bigram bg_tag = nltk.BigramTagger(train) bg_tag.evaluate(test) bg_tagged_sents = [bg_tag.tag(s) for s in words] bad_tags = [s for s in bg_tagged_sents if None in [tag for (w, tag) in s]] bad_tags[0] #combine different taggers to get the most accurate t0 = nltk.DefaultTagger('NOUN') 
t1 = nltk.RegexpTagger(patterns, backoff=t0) t2 = nltk.UnigramTagger(train, backoff=t1) t3 = nltk.BigramTagger(train, backoff=t2) t3.evaluate(test) tagged_words = [t3.tag(words)] #count up the NOUN/VERB/ADV/ADJ in the text #c = tuple(i for i in tagged_words[0]) #d = Counter(elem[1] for elem in c) #print(d) #print(tagged_words) return tagged_words
def lemmatize(tokens):
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token, pos=get_wordnet_pos(token)) for token in tokens]
    return tokens
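The get_wordnet_pos helper called above is not defined in this section (other snippets below use variants with different signatures). A minimal sketch matching the single-argument call, assuming it maps a Penn Treebank tag prefix onto the WordNet POS constants and that the averaged_perceptron_tagger data is available, might look like this:

import nltk
from nltk.corpus import wordnet

def get_wordnet_pos(token):
    # POS-tag the lone token and map the tag's first letter (J/V/N/R)
    # onto the WordNet constants accepted by WordNetLemmatizer.lemmatize.
    tag = nltk.pos_tag([token])[0][1][0].upper()
    tag_map = {'J': wordnet.ADJ, 'V': wordnet.VERB,
               'N': wordnet.NOUN, 'R': wordnet.ADV}
    return tag_map.get(tag, wordnet.NOUN)  # fall back to noun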
import nltk

wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

print('\ndir(wn):')
print(dir(wn))
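To see how the two objects created above differ in practice, a quick illustrative comparison (assuming the WordNet data has been downloaded) could be:

# Lemmatization returns dictionary forms; stemming merely strips suffixes.
for word in ['meanness', 'geese', 'running']:
    print(word, '->', wn.lemmatize(word), '|', ps.stem(word))
# e.g. 'geese' becomes 'goose' with the lemmatizer but 'gees' with the stemmer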
def data_predict(text): print('---------------------'+text+'-------------------') fullCorpus = pd.read_csv('data/final.tsv', sep='|', header=None) fullCorpus.columns = ['label', 'body_text'] fullCorpus.head() print("input data has {} rows columns".format(len(fullCorpus), len(fullCorpus.columns))) print("Out of {} rows , {} are happy , {} are sad , {} are angry , {} are calm".format(len(fullCorpus), len(fullCorpus[fullCorpus['label'] == 'happy']), len(fullCorpus[fullCorpus['label'] == 'sad']), len(fullCorpus[fullCorpus['label'] == 'angry']), len(fullCorpus[fullCorpus['label'] == 'calm']))) print("Number of null in label: {}".format(fullCorpus['label'].isnull().sum())) print("Number of null in label: {}".format(fullCorpus['body_text'].isnull().sum())) def convert_to_lowercase(text): input_str = text input_str = input_str.lower() return input_str fullCorpus['body_text_lower'] = fullCorpus['body_text'].apply(lambda x: convert_to_lowercase(x)) # print seperate file for lowercase output print('----------------------------Print in the punctuation.txt-----------------------------------------') file = open('pre-process_output/lowercase.txt', 'w',encoding="utf-8") lowercased = fullCorpus['body_text_lower'] for index, val in lowercased.iteritems(): line = str(index) + '\t' + str(val) file.write(line + '\n') file.close() def remove_numbers(text): result = re.sub(r'\d+', '', text) return result fullCorpus['body_text_no_numbers'] = fullCorpus['body_text_lower'].apply(lambda x: remove_numbers(x)) # print('finished') # print seperate file for without number output print('----------------------------Print in the no_number.txt-----------------------------------------') file = open('pre-process_output/no_numbers.txt', 'w',encoding="utf-8") no_numbers = fullCorpus['body_text_no_numbers'] for index, val in no_numbers.iteritems(): line = str(index) + '\t' + str(val) file.write(line + '\n') file.close() # remove punctuation string.punctuation def remove_punct(text): text_nopunct = "".join([char for char in text if char not in string.punctuation]) return text_nopunct fullCorpus['body_text_no_punctuation'] = fullCorpus['body_text_no_numbers'].apply(lambda x: remove_punct(x)) fullCorpus.head(25) # print seperate file for remove punctuation output print('----------------------------Print in the punctuation.txt-----------------------------------------') file = open('pre-process_output/no_punctuation.txt', 'w',encoding="utf-8") punctuationed = fullCorpus['body_text_no_punctuation'] for index, val in punctuationed.iteritems(): line = str(index)+'\t'+str(val) file.write(line+'\n') file.close() # tokenization def tokenize(text): tokens = re.split('\W+', text) return tokens fullCorpus['body_text_tokenized'] = fullCorpus['body_text_no_punctuation'].apply(lambda x: tokenize(x.lower())) fullCorpus.head() # print seperate file for tokenize output print('----------------------------Print in the tokenize.txt-----------------------------------------') file = open('pre-process_output/tokenize.txt', 'w',encoding="utf-8") tokenized = fullCorpus['body_text_tokenized'] for index, val in tokenized.iteritems(): line = str(index)+'\t'+str(val) file.write(line+'\n') file.close() # stop word stopword = nltk.corpus.stopwords.words('english') def remove_stopwords(tokenized_list): # for word in tokenized_list: # if word not in stopword: # print(word) text = [word for word in tokenized_list if word not in stopword] return text fullCorpus['body_text_nostop'] = fullCorpus['body_text_tokenized'].apply(lambda x: remove_stopwords(x)) 
fullCorpus.head() # print seperate file for remove stop word output print('----------------------------Print in the nonstop.txt-----------------------------------------') file = open('pre-process_output/nostop.txt', 'w',encoding="utf-8") nostoped = fullCorpus['body_text_nostop'] for index, val in nostoped.iteritems(): line = str(index)+'\t'+str(val) file.write(line+'\n') file.close() #lemmatizing wn = nltk.WordNetLemmatizer() def clean_text(tokeized_text): text = [wn.lemmatize(word) for word in tokeized_text] return text def join_text(sentence): return ' '.join(sentence) def data_lematization(): fullCorpus['body_text_lemmatized'] = fullCorpus['body_text_nostop'].apply(lambda x: clean_text(x)) fullCorpus['body_text_lemmatized'] = fullCorpus['body_text_lemmatized'].apply(lambda x: join_text(x)) data_lematization() # print seperate file for lematization output print('----------------------------Print in the cleaned.txt-----------------------------------------') file = open('pre-process_output/lemmatized.txt', 'w',encoding="utf-8") cleaned=fullCorpus['body_text_lemmatized'] for index, val in cleaned.iteritems(): line = str(index)+'\t'+val file.write(line+'\n') file.close() # separate training and testing data trainData = fullCorpus['body_text_lemmatized'][:1500] testData = fullCorpus['body_text_lemmatized'][1500:] train_labels = fullCorpus['label'][:1500] test_labels = fullCorpus['label'][1500:] # print train data and test data print("Train data count:\n", train_labels.value_counts()) print("Test data count:\n", test_labels.value_counts()) #extract Unigrams unigram_vectorized = CountVectorizer(stop_words="english",analyzer="word",ngram_range=(1,1), max_df=1.0,min_df=1,max_features=None) count_unigram = unigram_vectorized.fit(trainData) unigrams = unigram_vectorized.transform(trainData) #extract Bigrams bigram_vectorized = CountVectorizer(stop_words="english",analyzer="word",ngram_range=(2,2), max_df=1.0,min_df=1,max_features=None) count_bigram = bigram_vectorized.fit(trainData) bigrams = bigram_vectorized.transform(trainData) #extract Trigrams trigram_vectorized = CountVectorizer(stop_words="english",analyzer="word",ngram_range=(3,3), max_df=1.0,min_df=1,max_features=None) count_trigram = trigram_vectorized.fit(trainData) trigrams = trigram_vectorized.transform(trainData) # unigram,bigram and trigram as together full_vectorized = CountVectorizer(stop_words="english",analyzer="word",ngram_range=(1,3), max_df=1.0,min_df=1,max_features=None) count_full = full_vectorized.fit(trainData) full = full_vectorized.transform(trainData) # unigram frequency unigram_fre = TfidfTransformer().fit(unigrams) transformer_unigrams = unigram_fre.transform(unigrams) #Bigram frequency bigram_fre = TfidfTransformer().fit(bigrams) transformer_bigrams = bigram_fre.transform(bigrams) #Trigram frequency trigram_fre = TfidfTransformer().fit(trigrams) transformer_trigrams = trigram_fre.transform(trigrams) #full set frequency full_fre = TfidfTransformer().fit(full) transformer_full = full_fre.transform(full) # define model for n-grams u_model = MultinomialNB().fit(transformer_unigrams,train_labels) b_model = MultinomialNB().fit(transformer_bigrams,train_labels) t_model = MultinomialNB().fit(transformer_trigrams,train_labels) f_model = MultinomialNB().fit(transformer_full,train_labels) # sample for testing comment test_sample = [text] #Unigram probability counting uni_test_CountVectorized = unigram_vectorized.transform(test_sample) uni_test_fre = unigram_fre.transform(uni_test_CountVectorized) unimodel_test_CountVectorized = 
unigram_vectorized.transform(testData) unimodel_test_fre = unigram_fre.transform(unimodel_test_CountVectorized) #Bigram probability counting bi_test_CountVectorized = bigram_vectorized.transform(test_sample) bi_test_fre = bigram_fre.transform(bi_test_CountVectorized) bimodel_test_CountVectorized = bigram_vectorized.transform(testData) bimodel_test_fre = bigram_fre.transform(bimodel_test_CountVectorized) #Trigram probability counting tri_test_CountVectorized = trigram_vectorized.transform(test_sample) tri_test_fre = trigram_fre.transform(tri_test_CountVectorized) trimodel_test_CountVectorized = trigram_vectorized.transform(testData) trimodel_test_fre = trigram_fre.transform(trimodel_test_CountVectorized) #Full set probability counting full_test_CountVectorized = full_vectorized.transform(test_sample) full_test_fre = full_fre.transform(full_test_CountVectorized) full_test_CountVectorized = full_vectorized.transform(testData) full_test_fre = full_fre.transform(full_test_CountVectorized) # output final results print("\n Unigram Model result:") unigram_result = u_model.predict(uni_test_fre) print(unigram_result) unigrammodel_result = u_model.predict(unimodel_test_fre) print (classification_report(test_labels, unigrammodel_result)) print("\n Bigram Model result:") bigram_result = b_model.predict(bi_test_fre) print(bigram_result) bigrammodel_result = b_model.predict(bimodel_test_fre) print (classification_report(test_labels, bigrammodel_result)) print("\n Trigram Model result:") trigram_result = t_model.predict(tri_test_fre) print(trigram_result) trigrammodel_result = t_model.predict(trimodel_test_fre) print (classification_report(test_labels, trigrammodel_result)) print("\n Full Model result:") full_set_result = f_model.predict(full_test_fre) print(full_set_result) full_model_result = f_model.predict(full_test_fre) print (classification_report(test_labels, full_model_result))
import re
import string
import typing  # for TypeAliases/pseudotypes such as Match
from typing import *

import nltk
from toolz import pipe

_LEMMATISER = nltk.WordNetLemmatizer()


def functional_cleaner(ticket_description: str) -> List[str]:
    """
    Takes a ticket description as an argument and passes it through a
    functional pipeline that sanitises, tokenises and lemmatises it,
    returning the result as a list of strings.
    """
    s_list = re.split(r'\s+', ticket_description.lower())  # the input list (*TO LOWER CASE!*)
    return text_cleaning_pipeline(s_list)


def text_cleaning_pipeline(s_list: List[str]) -> List[str]:
    """
    The actual pipeline through which the list of unigrams is passed.
    """
    return pipe(
        s_list,  # i.e. the argument being passed through the pipeline
        transform_relevant_URLs_into_tags,
        remove_twitter_contacts,
        remove_email_addresses,
        remove_URLs,
        remove_non_printable_hex,
        remove_stopwords,
bad_symbols = re.compile('[^0-9a-z #+_]')  # pattern used to remove special characters

# Stopwords to remove; you can identify your own here
stopword = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
    'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
    'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was',
    'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',
    'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
    'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
    'about', 'between', 'to', 'from', 'then', 'here', 'there', 'when',
    'where', 'so', 'than'
]

ps = nltk.PorterStemmer()      # load word stemmer
wn = nltk.WordNetLemmatizer()  # load word lemmatizer


# Function to conduct all text cleaning
def clean(text):
    text = text.lower()                             # lowercase text
    text = re.sub('[0-9]+', '', text)               # drop digits
    text = BeautifulSoup(text, "html.parser").text  # HTML decoding
    text = replace_with_space.sub(' ', text)        # replace REPLACE_BY_SPACE_RE symbols with a space
    text = bad_symbols.sub('', text)                # delete symbols matched by BAD_SYMBOLS_RE
    return text


def contract(text):
    text = [wn.lemmatize(word) for word in text]
def lemmatize(text):
    wnl = nltk.WordNetLemmatizer()
    return [(wnl.lemmatize(w), t) for (w, t) in text]
def cleanThisTextLemmatized(rawText, removeProperNouns=False): print("Tokenize") tokenized = nltk.word_tokenize(rawText) tokenized = [w.lower() for w in tokenized if w.isalpha()] #list of words. #-punctuation is in its own element #-breaks up "It's" -> ["It", "'s'] #-does not stem plurals print("Remove stop words") from nltk.corpus import stopwords swEn = stopwords.words('english') #includes "don" "s". all lowercase tokenized_noSW = [w for w in tokenized if w not in swEn] print("Part-of-speech tagging (wait...)") pos_tagged = nltk.pos_tag(tokenized_noSW) #default tags from https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html #tag_types = set(t for (w,t) in pos_tagged) pos_tagged_simple = [(w, get_wordnet_pos(w, t)) for (w, t) in pos_tagged] #mistagged = [(w,t) for (w,t) in pos_tagged if len(t)<2] #this is often empty, but some common words like "young" are not tagged correctly as as "IN" etc. #make all nouns #pos_tagged_simple2 = [(w, t if t != '' else wn.NOUN) for (w, t) in pos_tagged_simple] #mistagged_freq = nltk.FreqDist((w,t) for (w, t) in pos_tagged_simple if t == '') #print(mistagged_freq.most_common()) # #brown corpus categories: # # [u'mystery', u'belles_lettres', u'humor', u'government', u'fiction', u'reviews', u'religion', u'romance', u'science_fiction', u'adventure', u'editorial', u'hobbies', u'lore', u'news', u'learned'] # brown_tagged = nltk.corpus.brown.tagged_words(categories='fiction', tagset='universal') # print([(w,t) for (w,t) in brown_tagged]) #nText = nltk.Text(rawText) print("Lemmatize") wnl = nltk.WordNetLemmatizer() #WordNet's lemmatizer lemmatized = [ wnl.lemmatize(w, t) for (w, t) in pos_tagged_simple if t != '' ] #OLD: a lot of errors when the word could be used as an adjective #assert('.' not in tokenized) #wnl = nltk.WordNetLemmatizer() #WordNet's lemmatizer #lemmatized = [wnl.lemmatize(t.lower()) for t in tokenized] #wishlist: use POS tagger to improve this #leaves some noise: remembered (POS issue?), unicode versions of words if removeProperNouns: print("Remove proper nouns") text_vocab = set(w.lower() for w in lemmatized if w.isalpha()) english_vocab = set(w.lower() for w in nltk.corpus.words.words()) unusual = text_vocab.difference(english_vocab) #not completely OK. misses girls, constitutes removed_proper = [w for w in lemmatized if w not in unusual] return removed_proper else: return lemmatized
def cleanText(text, acronymFile, topic_id): #Convert text into lowercase text = text.lower() text = ' '+text; # insert space in the begining, because text can start with acronyms, and acronym search assumes whitespace symbol in front of acronyms text = re.sub('[\s]',' ',text) # replace all \s=[ \t\n\r\f\v] symbols with whitespace ' ' #Load dictionary of acronyms and abbrs if not os.path.isfile(acronymFile): pass #print('***Acronym/abbreviation file does not exist! Acronyms and abbrs will not be expanded! (In normalize.py)***') else: temp_reader = open(acronymFile,'r') abbr_list = json.load(temp_reader) temp_reader.close() #Expand abbreviations and acronyms for abbr in abbr_list: if abbr in text: text = text.replace(abbr,abbr_list[abbr]) #protect not identified (not in our list) abbrs and acronynms, by replacing periods with underscore, A.B.C.D.E.F .=> A_B_C_D_E_F_ text = re.sub('_',' ',text) # remove underscores before Abbr/Acronym protection #text = re.sub(' ii ',' 2 ',text) #text = re.sub(' iii ',' 3 ',text) #text = re.sub(' iv ',' 4 ',text) #text = re.sub(' vi ',' 6 ',text) #text = re.sub(' vii ',' 7 ',text) #text = re.sub(' viii ',' 8 ',text) #text = re.sub(' ix ',' 9 ',text) text = re.sub(' [a-z]\. ',' ',text) # Remove single characters with period, Ex: 'M. Jordan' =>' Jordan' text = re.sub(' [a-z]\.\'s ',' ',text) # Ex: ' T.'s '=>' ' text = re.sub('([a-z])\.([a-z])','\g<1>_\g<2>',text) # Ex: A.B. => A_B., A.B.C. => A_B.C., A.B.C.D. => A_B.C_D., A.B.C.D.E. => A_B.C_D.E., text = re.sub('_([a-z])\.','_\g<1>_',text) # Ex: (after preprocessing of previous line) A_B. => A_B_, A_B.C. => A_B_C., A_B.C_D. => A_B_C_D_, A_B.C_D.E. => A_B_C_D_E, will cover all even number of letter abbs text = re.sub('_([a-z])\.','_\g<1>_',text) # Ex: (after preprocessing of two previous lines) A_B_C. => A_B_C_, A_B_C_D_E. => A_B_C_D_E_, will cover all odd number of letters abbs #Remove phrases enclosed in the brackers, eg: (word1 word2), <word3 word4> and etc. Non-greedy approach text = re.sub('(\[.*?\]|\(.*?\)|\{.*?\}|<.*?>)','',text) text = re.sub(' +',' ',text) #remove multiple spaces #Seperate floating numbers by 'point', eg: 1.4 => 1 point 4, .5 => 0 point 5 #text = re.sub('(\d)\.(\d)','\g<1> point \g<2>',text) #text = re.sub('( +)\.(\d)',' 0 point \g<2>',text) #Protect decimal points text = re.sub('(\d)\.(\d)','\g<1>_\g<2>',text) #Join thousands, eg: 1,000,000 => 1000000 text = re.sub('(\d),(\d)','\g<1>\g<2>',text) #Put space between decimal and non decimal, eg: 50%=>5 %, 4am=>4 am, 100$=>100 $, $1=>1 $, #5=># 5, 0s => 0 s $ #text = re.sub('(\d)(\D)','\g<1> \g<2>',text) #text = re.sub('(\$)(\d+)','\g<2> \g<1>',text) #text = re.sub('([^\s\d]+)(\d)','\g<1> \g<2>',text) #text = re.sub(' +',' ',text) #remove multiple spaces #Replace symbols, eg: 2 $=>2 dollars,1 $=>1 dollars, 5 %=>5 percents, G&G =>G and G, etc. 
#text = re.sub('\$','dollars', text) #text = re.sub('','euros', text) #text = re.sub('','pounds', text) #text = re.sub('','yuans', text) #text = re.sub('%','percents', text) #text = re.sub(' +',' ',text) #remove multiple spaces text = re.sub(' & ','and', text) #TDT2 uses '&' symbol for ampersand text = re.sub(' & ',' and ', text) #TDT2 uses '&' symbol for ampersand text = re.sub(' +',' ',text) #remove multiple spaces #Math equations, eg: 4 + 5 = 9, 4 plus 5 equals 9, 1/1 => 1 over 1 #text = re.sub('(\d) *\+ *(\d)','\g<1> plus \g<2>', text) #text = re.sub('(\d) *- *(\d)','\g<1> minus \g<2>', text) #text = re.sub('(\d) *\* *(\d)','\g<1> product \g<2>', text) #text = re.sub('(\d) */ *(\d)','\g<1> over \g<2>', text) #text = re.sub('(\d) *= *(\d)','\g<1> equals \g<2>', text) #text = re.sub('(\d) *> *(\d)','\g<1> more than \g<2>', text) #text = re.sub('(\d) *< *(\d)','\g<1> less than \g<2>', text) #text = re.sub('(\d) >= (\d)','\g<1> more or equal to \g<2>', text) #text = re.sub('(\d) <= (\d)','\g<1> less or equal to \g<2>', text) #text = re.sub(' +',' ',text) #remove multiple spaces #Fill some words #text = re.sub("( |^)\'cause( |$)",' because ',text) #text = re.sub("\'ll( |$)",' will ', text) #text = re.sub("( |^)can\'t( |$)",' can not ', text) #text = re.sub('( |^)cannot( |$)',' can not ', text) #text = re.sub("\'re( |$)",' are ', text) #text = re.sub("( |^)he\'s( |$)",' he is ', text) #text = re.sub("( |^)she\'s( |$)",' she is ', text) #text = re.sub("it\'s",'it is', text) #text = re.sub("( |^)i\'m( |$)",' i am ', text) #text = re.sub("( |^)isn\'t( |$)",' is not ', text) #text = re.sub("( |^)aren\'t( |$)",' are not ', text) #text = re.sub("( |^)doesn\'t( |$)",' does not ', text) #text = re.sub("( |^)don\'t( |$)",' do not ', text) #text = re.sub("( |^)didn\'t( |$)",' did not ', text) #text = re.sub("( |^)wasn\'t( |$)",' was not ', text) #text = re.sub("( |^)won\'t( |$)",' will not ', text) #text = re.sub("( |^)shouldn\'t( |$)",' should not ', text) #text = re.sub("( |^)haven\'t( |$)",' have not ', text) #text = re.sub("( |^)hasn\'t( |$)",' has not ', text) #text = re.sub("( |^)couldn\'t( |$)",' could not ', text) #text = re.sub("( |^)wouldn\'t( |$)",' would not ', text) # #text = re.sub("i\'d",'i would', text) # #text = re.sub("i\'d",'i had', text) #Replaces other symbols with whitespace, symbols untouched: NONE #text = re.sub('(\++|-+|,+|\*+|:+|/+|>+|<+|=+|\^+|%+|\$+|#+|@+|\|+|~+|`+|\"+|`+|\'s|\(+|\)+|\[+|\]+|{+|}+|<+|>+|&lr|&ur|&qc|&qr|\'+)',' ',text) #text = re.sub('(\++|-+|,+|\*+|:+|/+|>+|<+|=+|\^+|%+|\$+|#+|@+|\|+|~+|`+|\"+|`+|\(+|\)+|\[+|\]+|{+|}+|<+|>+|&lr|&ur|&qc|&qr| \' |\'\'+)',' ',text) text = re.sub('(,+|/+|>+|<+|\^+|#+|@+|\|+|~+|`+|\"+|`+|\(+|\)+|\[+|\]+|{+|}+|&lr|&ur|&qc|&qr| \' |\'\'+|-+|\++)',' ',text) text = re.sub(' +',' ',text) #replace multiple spaces with single space text = re.sub('(^\'+ | \'+ | \'+ +\'+ | \'+$)',' ',text) # remove quotes text = re.sub(' +',' ',text) #replace multiple spaces with single space #Remove multiple periods, eg: '...'=> '' text = re.sub('\.\.+','.',text) text = re.sub(' +',' ',text) #replace multiple spaces with single space #num to words conversion #for word in text.split(): # if str(word.encode('utf-8')).isdigit(): # text = text.replace(word, num2words(float(word)),1) #text = re.sub('\d+',' ',text) # remove all the numbers #Replace the rest symbols with the period, in order to split sentences by period later. 
text = re.sub('(!+|\?+|;+|:+|\.+)','.', text) text = re.sub(' +',' ',text) #remove multiple spaces #Remove leading and trailing whitespaces text.strip(); # Stemming and lemmatization if 0: porter = nltk.PorterStemmer() text = [porter.stem(t) for t in text.split()] wnl = nltk.WordNetLemmatizer() text = [wnl.lemmatize(t) for t in text] text = ' '.join(text) #Convert to one line per sentence manner text = text.replace('.','\n') text = re.sub(' +',' ',text) #remove multiple spaces text = text.splitlines() for line in text: if line.strip() != '': line_tmp = re.sub(' +',' ',line) line_tmp = re.sub('_','.',line) # put back periods for abbs/acros and floatin point numbers text[text.index(line)] = topic_id+' '+line_tmp.strip() else: text[text.index(line)] = '' text = [line for line in text if line.strip() != ''] text = '\n'.join(text) return text.strip()
for i in stopwords.words('english'):
    sw_dict[i] = 1


# In[19]:


# In[20]:


import re
from nltk.stem import WordNetLemmatizer, PorterStemmer  # PorterStemmer is needed for ps below
from stemming.porter2 import stem

lemi_words = []
stem_words = []
lmtzr = nltk.WordNetLemmatizer().lemmatize
ps = PorterStemmer()


def do_lemitise(word):
    old_word = word
    word = lmtzr(word)
    if word != old_word:
        lemi_words.append((old_word, word))
    return word


def do_stem(word):
    old_word = word
    word = stem(word)
def lemmatizer():
    wnl = nltk.WordNetLemmatizer()
    return wnl
def main(): if len(sys.argv) < 2: return usage() print("[.] Checking dependenices") try: check_dependencies() except ValueError as e: print(" [-] Checking failed:") print(" |", e) return path_to_script = os.path.dirname(os.path.realpath(__file__)) path_to_nltk = os.path.join(path_to_script, "nltk_data") nltk.data.path = [path_to_nltk] + nltk.data.path path_to_book = os.path.abspath(sys.argv[1]) if os.path.exists(sys.argv[1]) == False: print("[-] Wrong path to book: {}".format(path_to_book)) sys.exit() book_name = os.path.basename(path_to_book) book_name_without_ex = os.path.splitext(book_name)[0] result_dir_name = "{}-WordWised".format(book_name_without_ex) result_dir_path = os.path.join(os.path.dirname(path_to_book), result_dir_name) new_book_path = os.path.join(result_dir_path, book_name) if os.path.exists(result_dir_path): shutil.rmtree(result_dir_path) if not os.path.exists(result_dir_path): os.makedirs(result_dir_path) print("[.] Getting ASIN") try: book_asin = get_book_asin(path_to_book) except WiseException as e: print(" [-] Can't get ASIN:") for item in e.desc: print(" |", item) return if book_asin == None: print(" [-] The original book doesn't have ASIN:") print(" [.] Converting mobi 2 mobi to generate ASIN") try: cmd_str = "{} {} {}".format('ebook-convert', path_to_book, new_book_path) out = subprocess.check_output(cmd_str, shell=True) except Exception as e: print(" [-] Failed to convert mobi 2 mobi:") print(" |", e) return path_to_book = new_book_path book_asin = get_book_asin(path_to_book) else: shutil.copyfile(path_to_book, new_book_path) print("[.] Getting rawml content of the book") try: book_content = get_rawml_content(path_to_book) except WiseException as e: print(" [-] Can't get rawml content:") print(" |", e) return sdr_dir_name = "{}.sdr".format(book_name_without_ex) sdr_dir_path = os.path.join(result_dir_path, sdr_dir_name) if not os.path.exists(sdr_dir_path): os.makedirs(sdr_dir_path) LangLayerDb = LanguageLayerDB(sdr_dir_path, book_asin) print("[.] Collecting words") parser = RawmlRarser(book_content) words = parser.parse() count = len(words) if count == 0: print("[.] There are no suitable words in the book") return else: print("[.] Count of words: {}".format(count)) lookup = {} senses_path = get_resource_path("lemmas.csv") with open(senses_path, 'rb') as f: f = f.read().decode('utf-8') for line in f.splitlines(): l = line.strip() if l[0] == '"': continue word, sense_id = l.split(',') lookup[word] = sense_id lemmatizer = nltk.WordNetLemmatizer() prfx = "[.] Processing words: " print_progress(0, count, prefix=prfx, suffix='') LangLayerDb.start_transaction() if DEBUG == True: f = open('log.txt', 'a') for i, gloss in enumerate(words): word_offset = gloss[0] word = gloss[1] word = word.lower() pos_tag = nltk.pos_tag([word])[0][1] pos_tag_wordnet = get_wordnet_pos(pos_tag) word = lemmatizer.lemmatize(word, pos=pos_tag_wordnet) if word in lookup: sense_id = lookup[word] if DEBUG == True: f.write("{} - {} - {}\n".format(word_offset, word, sense_id)) LangLayerDb.add_gloss(word_offset, sense_id) print_progress(i + 1, count, prefix=prfx, suffix='') if DEBUG == True: f.close() LangLayerDb.end_transaction() LangLayerDb.close_db() print("[.] Success!") print( "Now copy this folder: \"{}\" to your Kindle".format(result_dir_path))
def non_punkt_normalize(txt):
    # WordNetLemmatizer takes no constructor argument; instantiate it and
    # call lemmatize on the token.
    return nltk.WordNetLemmatizer().lemmatize(txt)
def tokenize_corpus(path, train=True): # used to stem words later on/get roots of words porter = nltk.PorterStemmer() # also lancaster stemmer # used to get lemmas (complete words themselves) of words wnl = nltk.WordNetLemmatizer() # list of stopwords, can print to view stopWords = stopwords.words("english") classes = [] samples = [] docs = [] if train == True: words = {} f = open(path, 'r') lines = f.readlines() for line in lines: # separating serial, review and label classes.append(line.rsplit()[-1]) samples.append(line.rsplit()[0]) raw = line.decode('latin1') raw = ' '.join(raw.rsplit()[1:-1]) # remove noisy characters; tokenize - specified at the start raw = re.sub('[%s]' % ''.join(chars), ' ', raw) tokens = word_tokenize(raw) # make lower case - !! consider all capitals as more positive or negative? # !! what if we didn't do this? tokens = [w.lower() for w in tokens] # removing stopwords # using python stopwords scripts - !! add manually to stopwords? # !! what if we didn't do this? tokens = [w for w in tokens if w not in stopWords] # first lemmatize then stem, significance? # !! what if we didn't do this? - lemmatize tokens = [wnl.lemmatize(t) for t in tokens] # !! what if we didn't do this? - stem tokens = [porter.stem(t) for t in tokens] # --------------------------------------------------------- # !! add bigram collocations here # tokens = bigram_tokenize_corpus(tokens, BigramAssocMeasures.chi_sq, 200) bigram_measures = nltk.collocations.BigramAssocMeasures() # !! experiment with window size for accuracy finder = BigramCollocationFinder.from_words(tokens, window_size = 3) # !! experiment with frequency count for accuracy finder.apply_freq_filter(2) # !! understand pmi filter # bigrams = finder.nbest(bigram_measures.pmi, 10) #for k,v in finder.ngram_fd.items(): #print(k,v) for k in finder.ngram_fd.items(0): print(k) # --------------------------------------------------------- if train == True: # add to word count frequency for t in tokens: try: words[t] = words[t]+1 except: words[t] = 1 docs.append(tokens) # docs: consist of a long list of tokens as they # appear in the text ie may be repeated # words: dictionary - has the frequency of the vocabulary if train == True: return(docs, classes, samples, words) else: return(docs, classes, samples)
def lemmatization():
    # Relies on a module-level `tokens` list; return the lemmas instead of
    # building the list and discarding it.
    wnl = nltk.WordNetLemmatizer()
    return [wnl.lemmatize(t) for t in tokens]
def fullTokenize(self, text, removalPattern=False, gensim=False): ''' A method to process texts by tokenising, stemming and retrieving noun phrases. The method a. removes useless stuff (stopwords and the like) b. stems words (removing plurals and verbal tenses) c. detects noun-phrases and joins them with hypens --------------------------- Mandatory Arguments: i. text : a string to be processed KeyWord Arguments: i. removalPattern : a regex pattern for sentences to be discarded --------------------------- Returns: finalText : a tokenized and stemmed list of the input text ''' ## split the text into sentences if removalPattern: sentences = [ p for p in nltk.sent_tokenize(text) if ( not\ ('ociety' in str(p) and ('©' in str(p) or 'ublished' in p) and len(p.split()) < 10) and \ removalPattern.search( p + ' ') == None) ] # NLTK default sentence segmenter else: sentences = [ p for p in nltk.sent_tokenize(text) if ( not\ ('ociety' in str(p) and ('©' in str(p) or 'ublished' in p) and len(p.split()) < 10) ) ] # NLTK default sentence segmenter sentences = [nltk.word_tokenize(sent) for sent in sentences] # NLTK word tokenizer sentences = [nltk.pos_tag(sent) for sent in sentences] # NLTK POS tagger # lemmatizing stuff sentences = [[(nltk.WordNetLemmatizer().lemmatize( w, self.Lemmatizer[t[0].lower()]), t) for w, t in sent] for sent in sentences] ## retain the nounphrases as a single-hypened word finalText = [] for sentence in sentences: ## tag the sentence with the chunker result = self.chunker.tagger.tag(sentence) result = filter(lambda _: _[0][1] != 'DT', result) ## merge the nounphrases phrase = [] previous = 'O' for w0, t in result: if t == 'I-NP' and previous != 'O': phrase[-1] += '-%s' % w0[0] else: phrase.append(w0[0]) previous = t ## remove stopstuff filterStop = filter( lambda x: '-' in x and (len(set(x.split('-')) & set(stopwords.words('english'))) < len(x.split('-')) / 2) or not ('-' in x), phrase) phrase = list(filterStop) finalText.extend(phrase) return finalText
def extract_filled_templates2(sentence, stem): impact_word = "" cause_subject = "" cause_object = "" Predictor = "" imp_sub_sentence = "" impact_subjects = [] cause_object_word = "" lmtzr = nltk.WordNetLemmatizer() word_tokens = word_tokenize(sentence) # lemmas = [lmtzr.lemmatize(token, 'v') for token in word_tokens] # lemma_set = set(lemmas) # for item in lemma_set: # print("stem is ",stem,"item is ",item) # if stem in item: # matching_template = item matching_template = stem word_index = word_tokens.index(matching_template) # divide the sentence before the template verb and after the template verb by considering 'by' to be active or passive voice determiner sub_sentence, obj_sentence = get_subject_object(word_index, word_tokens) # print(sub_sentence) # print(obj_sentence) # include grammar for extracting head level noun phrases in the template grammar = r""" NP: {<DT|PP\$>?<JJ>*<NN>} # chunk determiner/possessive, adjectives and noun {<NNP>+} # chunk sequences of proper nouns {<NN>+} # chunk consecutive nouns {<NNS>+} {<NNPS>+} """ cp = nltk.RegexpParser( grammar) # Define Parser for extracting noun phrases sub_tagged_sent = nltk.pos_tag(sub_sentence.split()) sub_tokens = word_tokenize(sub_sentence) sub_parsed_sent = cp.parse(sub_tagged_sent) predictors = set([ get_noun_phrases(npstr, sentence) for npstr in extract_np(sub_parsed_sent) ]) # predictors = [get_noun_phrases(entry,sub_sentence) for entry in sub_tagged_sent] parsetree = get_parse_tree(sentence) for triple in parsetree: # if 'VB' in triple[0][1] and triple[1] in ['nsubj']: if stem in triple[0][0]: predictor_word = triple[2][0] # print("Predictor word is ",predictor_word) break # for entry in predictors: # if predictor_word in entry: # Predictor = entry Predictor = get_noun_phrases(predictor_word, sub_sentence) # parsetree_sub = get_parse_tree(obj_sentence) impact_word = "" for triple in parsetree_sub: # print("triple in sub sentence is ",triple) if 'VB' in triple[0][1] and triple[1] == 'nsubj': if (triple[0][0] in ['was', 'is']): continue impact_word = triple[0][0] # print("impact_word found is ************",impact_word) word_tokens = word_tokenize(obj_sentence) # lemmas = [lmtzr.lemmatize(token, 'v') for token in word_tokens] word_index_imp = word_tokens.index(impact_word) imp_sub_sentence, imp_obj_sentence = get_subject_object( word_index_imp, word_tokens) if triple[0][0] == impact_word and triple[1] == 'nsubj': cause_subject = get_noun_phrases(triple[2][0], imp_sub_sentence) elif triple[0][0] == impact_word and triple[1] == 'dobj': cause_object = get_full_word(triple[2][0], parsetree_sub) break # elif impact_word != "" and triple[2][1] == 'NN': # cause_object = get_full_word(triple[2][0],parsetree_sub) # break # if cause_subject == "": # obj_tagged_sent = nltk.pos_tag(imp_sub_sentence.split()) # obj_parsed_sent = cp.parse(obj_tagged_sent) # impact_subjects = set([get_noun_phrases(npstr,imp_sub_sentence) for npstr in extract_np(obj_parsed_sent)]) if impact_word == "" or cause_object == "": doc_sub = nlp(obj_sentence) for triple in parsetree_sub: if triple[1] in ['pobj', 'dobj']: cause_object_word = triple[2][0] # print("cause_object word is **",cause_object_word) break for entry in doc_sub.noun_chunks: if cause_object_word in entry.text: cause_object = get_full_word(entry.text, parsetree_sub) else: impact_subjects.append(entry.text) print("Predictor:", predictors) # print("Predictor:", Predictor) # print("impact_subjects",impact_subjects) print("cause_subject:", cause_subject) print("Impact:", impact_word) 
print("cause_object:", cause_object)
data = fp.read()

# To tokenize input text into sentences:
# print('\n-----\n'.join(tokenizer.tokenize(data)))  # splits text into sentences

# Tokenize the text into words
tokens = nltk.wordpunct_tokenize(data)
text = nltk.Text(tokens)
words = [w.lower() for w in text]
print(data)

# To print the tokens:
# for a in data:
#     print(a)

for synset in wn.synsets(tokens[30]):
    for lemma in synset.lemmas():
        print(lemma.name())

stemmer = PorterStemmer()
print(stemmer.stem(tokens[38]))

lemmatizer = nltk.WordNetLemmatizer()
print(lemmatizer.lemmatize('Breathing', pos='v'))

print(pos_tag(words))
print(nltk.ngrams('hey all li lee', 3))
# nltk.download('all')
print(nltk.ne_chunk(pos_tag(nltk.wordpunct_tokenize("heya ola hows it"))))
def do_lemmitizing(filtered_sentence):
    lemma = nltk.WordNetLemmatizer()
    lemmas = []
    for i in filtered_sentence:
        lemmas.append(lemma.lemmatize(i))
    return lemmas
# We can use a tokenizer instead of split()
description = nltk.word_tokenize(description)
# With split(), a word like "shouldn't" stays in one piece; word_tokenize() splits it into "should" and "n't"

# %%
# Remove stopwords
description = [
    word for word in description
    if not word in set(stopwords.words("english"))
]

# %%
# Lemmatization: loved => love, gitmeyecegim => git
import nltk as nlp
lemma = nlp.WordNetLemmatizer()
description = [lemma.lemmatize(word) for word in description]
description = " ".join(description)

# %%
description_list = []
for description in data.description:
    description = re.sub("[^a-zA-Z]", " ", description)
    description = description.lower()  # convert uppercase to lowercase
    description = nltk.word_tokenize(description)
    # description = [word for word in description if not word in set(stopwords.words("english"))]
    lemma = nlp.WordNetLemmatizer()
    description = [lemma.lemmatize(word) for word in description]
    description = " ".join(description)
    description_list.append(description)
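The tokenization difference noted in the comment above can be checked directly (a small illustrative example, assuming the punkt data is installed):

print("shouldn't go".split())              # ["shouldn't", 'go']
print(nltk.word_tokenize("shouldn't go"))  # ['should', "n't", 'go']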
import nltk
from nltk.corpus import stopwords

wordnet = nltk.WordNetLemmatizer()
stoplist = stopwords.words('english')


def normalize_token(token):
    """
    Convert token to lowercase and lemmatize it with WordNet.

    Parameters
    ----------
    token : str

    Returns
    -------
    token : str
    """
    return wordnet.lemmatize(token.lower())


def filter_token(token):
    """
    Evaluate whether or not to retain ``token``.

    Parameters
    ----------
    token : str

    Returns
def __init__(self):
    NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
    self.normalizer = NltkNormalizer()
    self.lem = nltk.WordNetLemmatizer()
    self.tagger = nltk.PerceptronTagger()
    self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
from __future__ import division

import re

import nltk
import numpy
from numpy import savetxt, loadtxt
from numpy.ma import sort
from scipy import spatial

from opinion_mining.AMC_preprocess import domain_preprocess

f = open(r'E:\python_workplace\Opinion Mining (LML)\Data\English_stopwords.txt',
         encoding='utf-8')
stopwords = set(line.strip() for line in f.readlines())  # read in the stopword list
lemitaion = nltk.WordNetLemmatizer()
f.close()

ignorechars = ''',:'.;!()#'''


def pre_proc(C):
    # Note: str.replace() only removes the exact ignorechars sequence,
    # not each character individually.
    C = [w.replace(ignorechars, "") for w in C]
    C = [lemitaion.lemmatize(w) for w in C if w not in stopwords and len(w) >= 3]
    C = [lemitaion.lemmatize(w, pos='v') for w in C if w not in stopwords and len(w) >= 3]
    C = [w for w in C if w not in stopwords and len(w) >= 3]
    return C
import nltk

stopwords = nltk.corpus.stopwords.words('english')


def stopwords_remove(text):
    no_stopwords = [word for word in text if word not in stopwords]
    return no_stopwords


# ### Sub 4: Lemmatizing Words

# In[5]:


lemwords = nltk.WordNetLemmatizer()


def lemmatizer(text):
    lemmatized = [lemwords.lemmatize(word) for word in text]
    return lemmatized


# ### Cleaning Data (with previous four functions)

# In[6]:


def clean_data(text):
    return lemmatizer(stopwords_remove(tokenizer(punctuation_remove(text))))
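A quick check of the two helpers defined above (the tokenizer and punctuation_remove steps used by clean_data are defined elsewhere, so this only exercises the parts shown here):

tokens = ['the', 'cats', 'are', 'running']
print(lemmatizer(stopwords_remove(tokens)))
# ['cat', 'running'] -- 'running' is unchanged because no POS tag is passed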
# Data Preparation and Preprocessing
print("Loading Data Sources")

# Load data source
data_path = os.getcwd() + '\\lsa\\links.json'
print("Data source : " + data_path)
data = pd.read_json(data_path)
data.head()
temp_data = pd.read_json(data_path)
question_data = temp_data['MESSAGE']

# Create stopword set
newstopwords = set(stopwords.words('english'))

# Define WordNet lemmatizer
WNlemma = nltk.WordNetLemmatizer()


# Create preprocessing function
def pre_process(text):
    tokens = nltk.word_tokenize(text)
    tokens = [WNlemma.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in string.punctuation]
    tokens = [word for word in tokens if word.lower() not in newstopwords]
    # bigr = nltk.bigrams(tokens[:10])
    # trigr = nltk.trigrams(tokens[:10])
    return tokens


# Greeting function
# GREETING_INPUTS = ("hello", "hi", "greetings", "hello i need help", "good day", "hey", "i need help", "greetings")
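For example, assuming the NLTK punkt and stopwords data are available, pre_process reduces a question to its content tokens:

print(pre_process("How do I reset my password?"))
# ['reset', 'password'] -- stopwords and punctuation removed, tokens lemmatized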
def get_keywords_phrases(text): try: text = encode_ignore(text) lemmatizer = nltk.WordNetLemmatizer() # stemmer = nltk.stem.porter.PorterStemmer() # Based on... Extract key phrases with NLTK ... https://gist.github.com/alexbowe/879414 # This gist is part of a blog post (http://alexbowe.com/au-naturale/) # in which the paper is cited: # S. N. Kim, T. Baldwin, and M.-Y. Kan. Evaluating n-gram based evaluation metrics for automatic # keyphrase extraction. Technical report, University of Melbourne, Melbourne 2010. grammar = r""" NBAR: {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns NP: {<NBAR>} {<NBAR><IN><NBAR>} # Above, connected with in/of/etc... """ chunker = nltk.RegexpParser(grammar) # POS tagging postoks = get_pos_tags(text) this_tree = chunker.parse(postoks) from nltk.corpus import stopwords stopwords = stopwords.words('english') def leaves(tree): """Finds NP (nounphrase) leaf nodes of a chunk tree.""" # for subtree in tree.subtrees(filter = lambda t: t.node=='NP'): for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'): yield subtree.leaves() def normalise(word): """Normalises words to lowercase and stems and lemmatizes it.""" word = word.lower() # word = stemmer.stem_word(word) word = lemmatizer.lemmatize(word) return word def acceptable_word(word): """Checks conditions for acceptable word: length, stopword.""" accepted = bool(2 <= len(word) <= 40 and word.lower() not in stopwords) return accepted def get_terms(tree): """a generator for the normalized, acceptable, leaf terms""" for leaf in leaves(tree): this_term = [ normalise(w) for w, t in leaf if acceptable_word(w) ] yield this_term terms = get_terms(this_tree) phrases = [] terms_freq_dict = {} for termList in terms: phrase = " ".join([str(term) for term in termList]) phrases.append(phrase) if phrase not in terms_freq_dict: terms_freq_dict[phrase] = 1 else: terms_freq_dict[phrase] += 1 sorted_tfd = sorted(list(terms_freq_dict.items()), key=operator.itemgetter(1), reverse=True) return sorted_tfd except Exception as e: error_msg = "processing error: ", str(e) return {'keyWordsPhrases': error_msg}
import os
import pickle
from functools import lru_cache

import nltk
from nltk.stem.porter import PorterStemmer

_max_ppdb_score = 10.0
_min_ppdb_score = -_max_ppdb_score
_wnl = nltk.WordNetLemmatizer()


def normalize_word(w):
    return _wnl.lemmatize(w).lower()


def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]


@lru_cache(maxsize=1)
def get_ppdb_data():
    with open('ppdb.pickle', 'rb') as f:
        return pickle.load(f)


_stemmer = PorterStemmer()
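A small illustrative call to get_tokenized_lemmas (assuming the punkt and wordnet corpora are downloaded):

print(get_tokenized_lemmas("The geese were flying south."))
# ['the', 'goose', 'were', 'flying', 'south', '.']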
# Don't store credentials within the repository
if os.path.exists('credentials.py'):
    import credentials
    alpow.sftp = credentials.sftp
    # p(alpow.sftp); p('sftp online:', ftpls()[0], '\n\n\n')
else:
    alpow.useFTP = False
    sendimages2ftp = 0
# }{

np.random.seed(1983)
nltk.download('stopwords')
nltk.download('wordnet')
alpow.stop_words = nltk.corpus.stopwords.words('english')
alpow.lemma = nltk.WordNetLemmatizer()
alpow.token = ToktokTokenizer()


def load(fn='allVars', onlyIfNotSet=1):
    fns = fn.split(',')
    for fn in fns:
        fn = fn.strip(', \n')
        ok = 1
        if (len(fn) == 0):
            continue
        if (onlyIfNotSet):
            if fn in globals().keys():
                # Override empty lists, dicts, dataframes and items
                if type(globals()[fn]) == type:
                    continue