def process(word_list):
    """Return the Lancaster stem of each word in ``word_list``, in input order."""
    stem = LancasterStemmer().stem
    return [stem(token) for token in word_list]
def Stem(s):
    """Lancaster-stem each space-separated word of ``s`` and lower-case the result.

    Anything that is not a non-empty ``str`` (including ``None``) yields ``""``.
    """
    if not isinstance(s, str) or not s:
        return ""
    stemmer = LancasterStemmer()
    stemmed = " ".join(stemmer.stem(piece) for piece in s.split(" "))
    return stemmed.lower()
def stem_words(self, words):
    """Stem every space-separated word of the string ``words``.

    Returns a single string in which each stem is preceded by one space, so
    the result always begins with a space.
    """
    stem = LancasterStemmer().stem
    return "".join(" " + stem(token) for token in words.split(" "))
def stem_words(words):
    """Return a list holding the Lancaster stem of each token in ``words``."""
    stemmer = LancasterStemmer()
    return [stemmer.stem(token) for token in words]
def main():
    """Fetch the sample text and print several NLTK analyses of it.

    Printed in order: the tokens, the Porter / Lancaster / Snowball stems, a
    set difference of the stem sets, the POS tags, the trigrams and the
    named-entity chunk tree.
    """
    save_data_from_webpage()
    text = get_data_from_file()

    # Tokenize once; every later step works on this token list.
    tt = word_tokenize(text)
    pprint(tt)

    # Stem the tokens with each of the three stemmers.
    psteam = PorterStemmer()
    psteam_list = [psteam.stem(word) for word in tt]
    pprint(psteam_list)

    lsteam = LancasterStemmer()
    lsteam_list = [lsteam.stem(word) for word in tt]
    pprint(lsteam_list)

    # SnowballStemmer has no default language; it must be named explicitly.
    ssteam = SnowballStemmer('english')
    ssteam_list = [ssteam.stem(word) for word in tt]
    pprint(ssteam_list)

    porter_stems = set(psteam_list)
    lancaster_stems = set(lsteam_list)
    snowball_stems = set(ssteam_list)

    # Snowball stems that are not among the Lancaster-only stems.
    pprint(snowball_stems.difference(lancaster_stems.difference(porter_stems)))

    # POS tagging expects a token list; a raw string would be tagged one
    # character at a time.
    pos_list = pos_tag(tt)
    pprint(pos_list)

    # Lemmatized tokens (computed but not printed).
    lemmatizer = WordNetLemmatizer()
    lem = [lemmatizer.lemmatize(word) for word in tt]

    # trigrams() returns a generator over the token list; materialize it
    # to display the results.
    trig = trigrams(tt)
    print(list(trig))

    # ne_chunk finds non-overlapping named-entity groups in POS-tagged tokens.
    NamedEntity = ne_chunk(pos_tag(wordpunct_tokenize(text)))
    print(NamedEntity)
def _normalize(self, item):
    """Tokenize, filter and stem the text of one ``(key, value)`` record.

    ``value[0]`` is tokenized and lower-cased; English stop words and
    non-alphanumeric tokens are dropped; the rest are Lancaster-stemmed and
    stripped of trailing ``'s'`` characters.  ``value[1]`` is passed through.

    Returns:
        ``(key, (stems, value[1]))``.
    """
    key, value = item
    ls = LancasterStemmer()
    # Load the stop-word list once: stopwords.words() re-reads the corpus on
    # every call, which the original did for every single token.
    stop_words = set(stopwords.words('english'))
    lowered = (word.lower() for word in word_tokenize(value[0]))
    text = [
        ls.stem(word).rstrip('s')
        for word in lowered
        if word not in stop_words and word.isalnum()
    ]
    return (key, (text, value[1]))
def __stem_document(document_name: pathlib.Path) -> list:
    """Read a UTF-8 text file and return its lines with every word stemmed.

    Each line is stripped, split on single spaces, Lancaster-stemmed word by
    word and re-joined with single spaces.
    """
    stem = LancasterStemmer().stem
    with document_name.open('r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()
    return [
        ' '.join(stem(token) for token in raw.strip().split(' '))
        for raw in raw_lines
    ]
def get_stems(tokens):
    """Stem the words of ``tokens`` except those with a protected tag.

    Items are read as ``word[0]`` (text) and ``word[1]`` (tag) -- presumably
    (word, POS-tag) pairs grouped per sentence; verify against the caller.
    Words tagged DT, PRP, PRP$, NN, NNP or NNPS are kept unchanged, all others
    are Lancaster-stemmed.  The flat result is handed to ``get_lemma``.
    """
    keep_as_is = ('DT', 'PRP', 'PRP$', 'NN', 'NNP', 'NNPS')
    stem = LancasterStemmer().stem
    stemmed_tokens = [
        word[0] if word[1] in keep_as_is else stem(word[0])
        for token in tokens
        for word in token
    ]
    return get_lemma(stemmed_tokens)
def getStemsFromURL(page_url):
    '''
    Given the link of a webpage (string), returns a list of all the words'
    stems in the webpage text.

    Tokens are lower-cased; English stop words and tokens that are not purely
    alphabetic are dropped before Lancaster stemming.
    '''
    with urlopen(page_url) as infile:
        soup = BeautifulSoup(infile, features="lxml")
    ls = LancasterStemmer()
    # Build the stop-word set once instead of reloading the corpus per token.
    stop_words = set(stopwords.words("english"))
    lowered = (w.lower() for w in word_tokenize(soup.text))
    return [ls.stem(w) for w in lowered if w not in stop_words and w.isalpha()]
def checkstemmers(path="C://cygwin//home//nelson auner//Pontikes//FinalData.OctNewKeepAndAnonymous/capsavem/my_cape/outtoget.cap.txt"):
    """Compare Porter and Lancaster stems for every word of a parsed file.

    Args:
        path: File handed to ``customparse``; defaults to the location that
            used to be hard-coded, so existing no-argument calls still work.

    Returns:
        A list of ``[word, porter_stem, lancaster_stem]`` triples, one per
        space-separated word of the parsed text.  Spaces are removed from
        both stems.
    """
    from nltk import LancasterStemmer, PorterStemmer

    wordz = customparse(path).split(" ")
    lancaster = LancasterStemmer()
    porter = PorterStemmer()
    return [
        [t, porter.stem(t).replace(" ", ""), lancaster.stem(t).replace(" ", "")]
        for t in wordz
    ]
def clean_tweets(self, text):
    """Normalize tweet text into a list of stemmed, stop-word-free tokens.

    URLs are removed, every run of non-alphanumeric characters becomes a
    space, the text is lower-cased and digits are dropped.  Each remaining
    token is Lancaster-stemmed; a token is discarded when either the token
    or its stem appears in ``newspaper3k/SmartStoplist.txt``.

    Every line of ``text`` contributes to the result (previously only the
    last line survived, and the stemmer was applied to the whole line rather
    than to individual words).
    """
    st = LancasterStemmer()
    with open('newspaper3k/SmartStoplist.txt', 'r') as f:
        stopwords = {line.strip() for line in f}

    # remove URLs
    text = re.sub(r'http\S+', '', text)

    result = []
    for line in text.split("\n"):
        cleaned = re.sub(r"[^a-zA-Z0-9]+", ' ', line).lower()
        cleaned = ''.join(ch for ch in cleaned if not ch.isdigit())
        for word in cleaned.split():
            if word in stopwords:
                continue
            stem = st.stem(word)
            # Keep the original guarantee that no returned item is a stop word.
            if stem not in stopwords:
                result.append(stem)
    return result
def checkstemmers(path="C://cygwin//home//nelson auner//Pontikes//FinalData.OctNewKeepAndAnonymous/capsavem/my_cape/outtoget.cap.txt"):
    """Compare Porter and Lancaster stems for every word of a parsed file.

    Args:
        path: File handed to ``customparse``; defaults to the location that
            used to be hard-coded, so existing no-argument calls still work.

    Returns:
        A list of ``[word, porter_stem, lancaster_stem]`` triples, one per
        space-separated word of the parsed text.  Spaces are removed from
        both stems.
    """
    from nltk import LancasterStemmer, PorterStemmer

    wordz = customparse(path).split(" ")
    lancaster = LancasterStemmer()
    porter = PorterStemmer()
    return [
        [t, porter.stem(t).replace(" ", ""), lancaster.stem(t).replace(" ", "")]
        for t in wordz
    ]
def getMostUsedWordsTxt(file, wordnum):
    '''
    Given a text file name (string) and the number of most used words we want
    to find (int), returns a list of the wordnum most common elements and
    their counts from the most common to the least:
    [('1st_most_common_word', count1), ('2nd_most_common_word', count2), ...,
    ('wordnumth_most_common_word', countwordnum)]

    Words are whitespace-separated, lower-cased, filtered (English stop words
    and non-alphabetic tokens removed) and Lancaster-stemmed before counting.
    '''
    with open(file, "r") as f:
        words = f.read().split()
    ls = LancasterStemmer()
    # Build the stop-word set once instead of reloading the corpus per word.
    stop_words = set(stopwords.words("english"))
    lowered = (w.lower() for w in words)
    freqs = Counter(
        ls.stem(w) for w in lowered if w not in stop_words and w.isalpha()
    )
    return freqs.most_common(wordnum)
def tokenize(self, description):
    """Turn a free-text description into a list of Lancaster stems.

    The text is lower-cased and split on whitespace, English stop words are
    removed, each remaining word is WordNet-lemmatized and the lemma is then
    Lancaster-stemmed.

    Returns:
        The list of stems.  For a missing value (``pd.isnull``) the
        historical result ``([], [])`` is returned instead.
    """
    # Don't process NaN or null values.
    # NOTE(review): this branch returns a 2-tuple while the normal path
    # returns a single list -- confirm which shape callers rely on.
    if pd.isnull(description):
        filtered = []
        return filtered, filtered

    # Build the stop-word set once instead of reloading the corpus per word.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    stemmer = LancasterStemmer()
    return [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in description.lower().split()
        if word not in stop_words
    ]
def get_words_from_string(string):
    """Return the Lancaster stems of all alphabetic words in ``string``.

    The text is lower-cased first; a word is any maximal run of ASCII letters.
    """
    stemmer = LancasterStemmer()
    lowered = string.lower()
    return [stemmer.stem(match) for match in re.findall(r'[A-Za-z]+', lowered)]
# Tweet loading and cleaning: every corpus line is "category|tweet".
wrong = 0
with open('corpus.txt', 'r') as f:
    tweets = []
    for line in f.readlines():
        cols = line.replace('\n', '').replace('\ufeff', '').replace('\t', '').split('|')
        if len(cols) != 2:
            # Malformed line: count it and echo it for inspection.
            wrong += 1
            print(line, end='')
            continue
        cat, tweet = cols
        # Strip URLs, then hashtags/mentions, then collapse whitespace.
        without_urls = regex_url.sub('', tweet)
        without_tags = regex_ht_mn.sub('', without_urls)
        tweet_regex = regex_spaces.sub(' ', without_tags).lower()
        # Removal of caps and accents.
        # NOTE(review): tweet_raw is never used below; the tokens are built
        # from tweet_regex -- confirm which one was intended.
        tweet_raw = unidecode.unidecode(tweet_regex).lower()
        tokens = [
            remove_repeated_chars(stemmer.stem(t))
            for t in tweet_tokenizer.tokenize(tweet_regex)
            if t not in stopwords and not regex_nonword.match(t)
        ]
        tweets.append((tokens, cat))
print('Wrong: {0}'.format(wrong))
word_features = get_word_features(get_words_in_tweets(tweets))
# Tweets grouped by category label.
tweets_cat = {
    label: [t for t in tweets if t[1] == label]
    for label in ('P', 'N', 'NEU')
}
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk import wordpunct_tokenize, pos_tag, ne_chunk
from nltk.stem import WordNetLemmatizer

# One sample word per stemmer.
pStemmer = PorterStemmer()
lStemmer = LancasterStemmer()
sStemmer = SnowballStemmer('english')
for stemmer, sample in ((pStemmer, "Playing"), (lStemmer, "Dancing"), (sStemmer, "Killing")):
    print(stemmer.stem(sample))

# The same words (plus an irregular plural) through the lemmatizer.
lemmetizer = WordNetLemmatizer()
for sample in ("Playing", "Dancing", "Killing", "geese"):
    print(lemmetizer.lemmatize(sample))

# Tokenize -> POS-tag -> named-entity chunk, printing each stage.
sentence = "Mark and John are working at google"
tokens = wordpunct_tokenize(sentence)
tagged = pos_tag(tokens)
print(tokens, '\n')
print(tagged, '\n')
print(ne_chunk(tagged))
doc = docx.Document(arquivo)

# 02 - List of paragraph texts
texto_full = [paragrafo.text for paragrafo in doc.paragraphs]

# Selection of the 2nd and 3rd paragraphs
# NOTE(review): the slice [2:4] takes list positions 2 and 3 -- confirm these
# are the paragraphs meant.
p_2e3 = texto_full[2:4]

# Tokenize the selected paragraphs as one text
tokens = word_tokenize(' '.join(p_2e3))

## RSLP
rslp = RSLPStemmer()
stemms_rslp = [rslp.stem(token) for token in tokens]

## Porter
poter = PorterStemmer()
stemms_poter = [poter.stem(token) for token in tokens]

## Lancaster
lancaster = LancasterStemmer()
stemms_lanc = [lancaster.stem(token) for token in tokens]
提取文本数据的词干 三种词干提取算法,Lancaster词干提取器比其他两个词干提取器更严格 严格程度而言:Porter最轻松,Lancaster最严格。 Lancaster速度快但是会减少单词的很大部分,通常会选择Snowball词干提取器 ''' from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer words = [ 'table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the', 'beaches', 'grounded', 'dreamt', 'envision' ] stemmers = ['PORTER', 'LANCASTER', 'SNOWBALL'] stemmer_porter = PorterStemmer() stemmer_lancaster = LancasterStemmer() stemmer_snowball = SnowballStemmer('english') formatted_row = '{:>16}' * (len(stemmers) + 1) print(formatted_row.format('WORD', *stemmers)) for word in words: stemmed_words = [ stemmer_porter.stem(word), stemmer_lancaster.stem(word), stemmer_snowball.stem(word) ] print(formatted_row.format(word, *stemmed_words))
from nltk import PorterStemmer, LancasterStemmer, word_tokenize

# Sample text, assembled from adjacent string literals.
line = (
    "My name is Venkatram Veerareddy, technical architect.\n I am having 20 years of experience in "
    " Software industry working \nfrom applications to products by using \n"
    " C, C++, Java, Javascript and databases "
    " like Oracle, MS SQL Server, Postgres, MySQL and OrientDB."
)
tokens = word_tokenize(line)

# Porter stems of every token.
porter = PorterStemmer()
pStems = list(map(porter.stem, tokens))
print(pStems)

print("************************************************")

# Lancaster stems of the same tokens.
lancaster = LancasterStemmer()
lStems = list(map(lancaster.stem, tokens))
print(lStems)
'page': TITLE, 'format': "json" } R = S.get(url=URL, params=PARAMS) DATA = R.json() # get the text wiki_page_text = DATA["parse"]["text"]["*"] h = html2text.HTML2Text() h.ignore_links = True page_text = h.handle(wiki_page_text) # create a new stemmer ls = LancasterStemmer() # tokenize text words = nltk.word_tokenize(page_text) words = [w.lower() for w in words] # eliminate stop words and stem the rest of the words words = [ls.stem(w) for w in words if w not in stopwords.words("english") and w.isalnum()] freqs = Counter(words) print("The 10 most frequently used stems in the ''Data science'' Wikipedia page are:") for word, count in freqs.most_common(10): print(word, count)
def _thin_rows(MAT, LABS, max_features, TOTAL_CLASSES):
    """Randomly drop rows so that roughly ``max_features`` rows remain.

    Unclassed rows (label 0) are dropped first.  Only when the classed rows
    alone still exceed ``max_features`` are classed rows dropped as well,
    each class giving up a share proportional to its size (rounded down, so
    a few more than ``max_features`` rows can remain).
    """
    TOTAL = MAT.shape[0]
    max_features = max(max_features, 0)
    if TOTAL <= max_features:
        return MAT, LABS

    flat = LABS.ravel()
    by_class = [np.flatnonzero(flat == c) for c in range(TOTAL_CLASSES)]
    classed_total = TOTAL - by_class[0].size

    removals = [0] * TOTAL_CLASSES
    if classed_total >= max_features:
        # Every unclassed row goes, plus a weighted share of each class.
        removals[0] = by_class[0].size
        excess = classed_total - max_features
        if classed_total > 0:
            for c in range(1, TOTAL_CLASSES):
                removals[c] = int(excess * by_class[c].size // classed_total)
    else:
        # Dropping unclassed rows alone is enough.
        removals[0] = TOTAL - max_features

    keep = np.ones(TOTAL, dtype=bool)
    for c, how_many in enumerate(removals):
        if how_many:
            keep[np.random.choice(by_class[c], size=how_many, replace=False)] = False
    return MAT[keep], LABS[keep]


def MakeFeaturesFromText(DIR, FNAME, SENT_CLASS, max_features):
    '''
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    EXPERIMENTAL: Still trying to figure out if it produces good Data sets.
    Use at your own risk
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    Grabs a text file and creates a sparse binary data set for training and
    makes class labels based off of the class identifiers in SENT_CLASS.

    ARG: DIR
        TYPE: string
        DESC: Directory where text file is saved. End with a path separator.

    ARG: FNAME
        TYPE: string
        DESC: File name

    ARG: SENT_CLASS
        TYPE: List (of lists)
        DESC: A list of lists with each internal list representing a class id

    ARG: max_features
        TYPE: int
        DESC: Approximate upper bound on the number of rows returned

    RETURNS: (MAT, LABS)
        MAT is a 2-D int32 array with one row per kept line of the file and
        one column per vocabulary key; LABS is a 1-D int32 array with one
        label per row, 0 meaning "no class", k meaning SENT_CLASS[k - 1].
    '''
    print('STARTING DATA GENERATION')
    from nltk import download as nldl
    from nltk import LancasterStemmer
    import nltk.corpus as corp
    import string

    # used for referencing later
    NUM_CLASSES = len(SENT_CLASS)
    TOTAL_CLASSES = NUM_CLASSES + 1

    # read file and begin cleaning
    with open(DIR + FNAME, 'r') as TextFile:
        RAW = TextFile.readlines()
    # removes the newline, makes it all lower case, and splits it into words
    # by spaces (rstrip keeps the last character of a final line that has no
    # trailing newline)
    CLEAN = [sent.rstrip('\n').lower().split(' ') for sent in RAW]

    print('LOADING STOPWORDS')
    # the stop words come from the nltk database; on a first run they have
    # to be downloaded, and the resource id for that is 'stopwords'
    try:
        STOP_LIST = corp.stopwords.words('english')
    except LookupError:
        nldl('stopwords')
        STOP_LIST = corp.stopwords.words('english')
    # stop words are common words like "the", "an", "a" etc.; punctuation,
    # digits, whitespace characters and the empty token (produced by
    # consecutive spaces) are treated the same way
    STOP_WORDS = set(STOP_LIST)
    STOP_WORDS.update(string.punctuation, string.digits, string.whitespace)
    STOP_WORDS.add('')

    # removes stopwords and trivially short lines
    CLEANER = [[word for word in sent if word not in STOP_WORDS]
               for sent in CLEAN if len(sent) > 3]

    print('SORTING KEYS')
    # starts compiling the key list from the id words in SENT_CLASS
    KEYS = []
    for IDS in SENT_CLASS:
        KEYS += IDS

    print('HEAVY DUTY STUFF')
    # finds all unique words (sorted) in one pass
    if CLEANER:
        KEYS = np.unique(KEYS + [word for sent in CLEANER for word in sent]).tolist()

    print('COUNTING NCOs. . ', end=' ')
    # TOO_MANY collects every word whose running count reached the running
    # maximum at the moment it was seen.
    # NOTE(review): this flags far more than "the most frequent words";
    # kept as in the original heuristic -- confirm it is intended.
    KEY_COUNT = {}
    MAX = -1
    TOO_MANY = set()
    for sent in CLEANER:
        for word in sent:
            KEY_COUNT[word] = KEY_COUNT.get(word, 0) + 1
            if KEY_COUNT[word] >= MAX:
                MAX = KEY_COUNT[word]
                TOO_MANY.add(word)
    print('.', end=' ')
    TOO_FEW = {key for key, seen in KEY_COUNT.items() if seen <= 1}
    TOTAL_KEYS = len(KEY_COUNT)
    print('.', end=' ')
    # class id words are never discarded
    PROTECTED = {s for SCLASS in SENT_CLASS for s in SCLASS}
    TOO_MANY -= PROTECTED
    TOO_FEW -= PROTECTED
    print('.', end=' ')
    UNDESIRED = TOO_FEW | TOO_MANY
    print('.', end=' ')
    KEYS = [key for key in KEYS if key not in UNDESIRED]
    print('TERMINATED THE UNDESIREABLES')
    print('REMOVED ', len(TOO_MANY) + len(TOO_FEW), 'OF ', TOTAL_KEYS, 'UNDESIREABLES')

    print('STARTING STEMMING')
    # removes suffixes and prefixes from words leaving the root word only;
    # stems are cached in a hash so the stemmer runs once per key
    STEMMER = LancasterStemmer()
    STEM_DICT = {K: STEMMER.stem(K) for K in KEYS}
    print('STEMMED DICTIONARY CREATED')
    # stem the class labels... they are in the hash because they were the
    # first thing added to the key list
    STEM_LABS = [[STEM_DICT[ID] for ID in CLASS] for CLASS in SENT_CLASS]

    # one column per key; a stem uses the column of its first occurrence, so
    # keys sharing a stem leave the later columns unused (as before)
    DICT = list(STEM_DICT.values())
    COLUMN = {}
    for position, stem in enumerate(DICT):
        COLUMN.setdefault(stem, position)
    # column indexes of the stems present in each sentence / class
    INT_DATA = [np.array([COLUMN[STEM_DICT[word]] for word in sent if word in STEM_DICT],
                         dtype=np.int32, order='c')
                for sent in CLEANER]
    INT_LABS = [np.array([COLUMN[ID] for ID in CLASS], dtype=np.int32, order='c')
                for CLASS in STEM_LABS]

    print('CREATING DATA SET')
    print('I BET THIS TAKES THE LONGSEST')
    # meat and potatoes: the matrix is allocated once instead of being
    # re-stacked for every sentence
    LABS = np.zeros((len(CLEANER), 1), dtype=np.int32, order='c')
    MAT = np.zeros((len(CLEANER), len(DICT)), dtype=np.int32, order='c')
    for i, TO_ONES in enumerate(INT_DATA):
        MAT[i, TO_ONES] = 1
        # making labels: classes are tried in random order and the first one
        # with an id word in the sentence wins
        for j in np.random.permutation(NUM_CLASSES):
            if np.any(MAT[i, INT_LABS[j]] == 1):
                LABS[i] = j + 1
                break
    print('I WAS RIGHT')

    # drop rows until about max_features remain, unclassed rows first
    print('THINNING THE HERD A BIT MORE')
    MAT, LABS = _thin_rows(MAT, LABS, max_features, TOTAL_CLASSES)
    print('DONE!... where\'d you go???')
    return MAT, np.ravel(LABS, order='c')
# English sample text.
texto_e = (
    'My name is Maximus Decimus Meridius, commander of the armies of '
    'the north, General of the Felix legions and loyal servant to the '
    'true emperor, Marcus Aurelius. Father to a murdered son, husband '
    'to a murdered wife. And I will have my vengeance, in this life or '
    'the next (Gladiator, the movie).'
)
# Portuguese version of the same text.
texto_p = (
    'Meu nome é Maximus Decimus Meridius, comandante dos exércitos do '
    'norte, general das legiões de Félix e servo leal ao verdadeiro '
    'imperador, Marcus Aurelius. Pai de um filho assassinado, marido '
    'de uma esposa assassinada. E eu terei minha vingança, nesta vida '
    'ou na próxima (Gladiador, o filme).'
)

# Three candidate inputs are assigned in turn; only the last one is used.
tokens = word_tokenize(texto_e)
tokens = word_tokenize(texto_p)
tokens = ['amor', 'amora', 'amoroso']

porter = PorterStemmer()
stems_porter = list(map(porter.stem, tokens))

lancaster = LancasterStemmer()
stems_lancaster = list(map(lancaster.stem, tokens))

rslp = RSLPStemmer()
stems_rslp = list(map(rslp.stem, tokens))

# Side-by-side table: token and its stem under each algorithm.
row = '{:12s} {:12s} {:12s} {}'
print(row.format('Tokens', 'Porter', 'Lancaster', 'RSLP'))
for t, p, l, r in zip(tokens, stems_porter, stems_lancaster, stems_rslp):
    print(row.format(t, p, l, r))
articleData = json.load(file) # save content of the json file tokenTitle = word_tokenize(articleData['title']) # add word from the tokenized data to create a list of all words for that article for word in tokenTitle: # convert all words to lower case to avoid duplicates word = word.lower() # remove the symbol stopwords for char in word: if char in symbolStopwords: word = word.replace(char, "") # check if the word contains a number or is a stopword if not any(char.isdigit() for char in word): if word not in stopwords: # stem words to avoid duplication by pluralization word = stem.stem(word) # if word isn't already in the dict add it if word not in wordCount: wordCount[word] = 1 else: # else increase the value of that key in the dict wordCount[word] += 1 except ValueError: print("JsonDecodeError for file " + articleTitle) with open("C:/Users/caire/Desktop/OutputData/ClassifyArticlesContentandTitle/OutputTitleArticles/" + s + ".txt", 'a', encoding='utf-8') as newFile: newFile.write(str(dict(Counter(wordCount).most_common(10))) + "\n") wordCount.clear() print(s + "'s title words counted for each article and added to file")
# Stop-word removal
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Lower-case every word and keep, per sentence, those that are not stop words.
all_tokens = [
    [word for word in (token.lower() for token in sentence) if word not in stopwords]
    for sentence in word_tokens
]

# Stemming & lemmatizing
from nltk import LancasterStemmer
stemmer = LancasterStemmer()
stemmer.stem('happiest')

from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
lemma.lemmatize('happiest', 'a')

# BOW (based on occurrence counts, so context is not interpreted)
import numpy as np
from scipy import sparse

# Values and their (row, column) positions for a COO sparse matrix.
data = np.array([3, 1, 2])
row_pos = np.array([0, 0, 1])
col_pos = np.array([0, 2, 1])

sparse_coo = sparse.coo_matrix((data, (row_pos, col_pos)))
sparse_coo.toarray()

# Example 1
from nltk import PorterStemmer, LancasterStemmer
import nltk

# The Brown "romance" words are loaded first, then replaced by one sample word.
tokens = nltk.corpus.brown.words(categories=['romance'])
porter = PorterStemmer()
tokens = ['lying']
sample = tokens[0]

# Same word through both stemmers, Porter first.
print(porter.stem(sample))
lancast = LancasterStemmer()
print(lancast.stem(sample))
book_fileid = 'romance/marm05.txt'

# Load all paragraphs of the book, already tokenized
book_paras = machado.paras(book_fileid)

# Position 17 is the first paragraph of the first chapter, therefore:
book_tokens = book_paras[17][0] + book_paras[18][0]

# Portuguese stop words plus every punctuation character
book_stopwords = stopwords.words('portuguese')
book_stopwords.extend(string.punctuation)

# Lower-case the tokens and drop the stop words
lowered_tokens = (t.lower() for t in book_tokens)
book_tokens = [t for t in lowered_tokens if t not in book_stopwords]

#
# Run the stemmers
#
porter = PorterStemmer()
stems_porter = list(map(porter.stem, book_tokens))

lancaster = LancasterStemmer()
stems_lancaster = list(map(lancaster.stem, book_tokens))

rslp = RSLPStemmer()
stems_rslp = list(map(rslp.stem, book_tokens))

# Side-by-side table: token and its stem under each algorithm
row = '{:18s} {:18s} {:18s} {}'
print(row.format('Tokens', 'Porter', 'Lancaster', 'RSLP'))
for t, p, l, r in zip(book_tokens, stems_porter, stems_lancaster, stems_rslp):
    print(row.format(t, p, l, r))
# Stemming: the same four verb forms through each stemmer in turn.
from nltk import PorterStemmer
pst = PorterStemmer()
pst.stem("having")
pst.stem("sudeep")

words_stem = ["give", "giving", "given", "gave"]
for form in words_stem:
    print(f"{form} :{pst.stem(form)}")

from nltk import LancasterStemmer
lnst = LancasterStemmer()
for form in words_stem:
    print(f"{form} :{lnst.stem(form)}")

from nltk import SnowballStemmer
snl = SnowballStemmer("english")
for form in words_stem:
    print(f"{form} :{snl.stem(form)}")

# Lemmatizing: the same forms through the WordNet lemmatizer.
from nltk import WordNetLemmatizer
wordnet = WordNetLemmatizer()
for form in words_stem:
    print(f"{form} :{wordnet.lemmatize(form)}")
# In[30]:

# Copy GloVe weights for the words that appear in index2word.  A word with no
# GloVe entry is retried after lemmatizing, then Porter stemming, then
# Lancaster stemming; each step works on the previous step's result.
count = 0
fallbacks = (wnl.lemmatize, porter.stem, lancaster.stem)
for i in range(1, VOCAB_SIZE):
    w = index2word[i]
    g = glove_index_dict.get(w)
    for normalize in fallbacks:
        if g is not None:
            break
        w = normalize(w)
        g = glove_index_dict.get(w)
    if g is not None:
        embedding[i, :] = glove_embedding_weights[g, :]
        count += 1

coverage = count / float(VOCAB_SIZE) * 100
print(
    '{num_tokens}-{per:.2f}% tokens in vocab found in glove and copied to embedding.'
    .format(num_tokens=count, per=coverage))

# # Build Dataset

# In[32]:

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
'The interviewee attributed their correct guess to how famous the first phrase is.' ) print( 'The first four content words were readily recognizable to anyone who has read the book.' ) print('No function words were needed to identify the source.') print('\n') print('\n') print("_" * 70) print('QUESTION 3: Stemming and Lemmatization: \n') porter = PorterStemmer() lancaster = LancasterStemmer() snowball = SnowballStemmer('english') porter_stemming = [porter.stem(w) for w in filtered_words] lancaster_stemming = [lancaster.stem(w) for w in filtered_words] snowball_stemming = [snowball.stem(w) for w in filtered_words] #with wrapping format = '%s' pieces = [format % (word) for word in porter_stemming] output = ', '.join(pieces) wrapped_porter = fill(output) print('The Normalized, Filtered Text Stemmed with PorterStemmer is: \n') print(wrapped_porter) #with wrapping format = '%s' pieces = [format % (word) for word in lancaster_stemming] output = ', '.join(pieces)
#!/usr/bin/python
"""
This script takes tf-idf results and filters just those that are included
in the review's feature list.

Usage: script <tfidf_file> <features_file>
Both files are comma-separated; the third column is read from each.
"""
import sys

from nltk import LancasterStemmer

tfidf_fname = sys.argv[1]
features_fname = sys.argv[2]

stemmer = LancasterStemmer()

# Stems of every word in the third column of the features file.  A set is
# used because membership is tested once per tf-idf row.
stemmed_features = set()
with open(features_fname) as features_file:
    for line in features_file:
        cols = line.split(',')
        feature = cols[2]
        stemmed_features.update(stemmer.stem(w) for w in feature.split())

# Echo each tf-idf row whose third column is one of those stems.
with open(tfidf_fname) as tfidf_file:
    for line in tfidf_file:
        cols = line.split(',')
        word = cols[2]
        if word.strip() in stemmed_features:
            print(line.strip())
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time : 2020/7/11 17:50
# @Author : 代登辉
# @Email : [email protected]
# @File : stemmers.py
# @Software : PyCharm
# @Description: Stemming
from nltk import PorterStemmer, LancasterStemmer, word_tokenize

raw = (
    "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and "
    "loyal servant to the true emperor, Marcus Aurelius. Father to a murdered son, husband to a murdered wife. And "
    "I will have my vengeance, in this life or the next. "
)

# Split the text into word tokens
tokens = word_tokenize(raw)

# Porter strips comparatively few suffixes (s, es, e, ed, al)
porter = PorterStemmer()
pStems = list(map(porter.stem, tokens))
print(pStems)

# Lancaster is more thorough: it removes letter case as well as suffixes
lancaster = LancasterStemmer()
lStems = list(map(lancaster.stem, tokens))
print(lStems)
import nltk
from nltk.corpus import stopwords
from nltk import LancasterStemmer
from nltk import PorterStemmer

textcontent = "this is an input"

# Tokenize into alphanumeric runs, de-duplicate, then lower-case.
pattern = r'[A-Za-z0-9]+'
tokenizedwords = nltk.regexp_tokenize(textcontent, pattern)
tokenizedwords = [word.lower() for word in set(tokenizedwords)]

# Drop English stop words.
stop_words = set(stopwords.words('english'))
filteredwords = [w for w in tokenizedwords if w not in stop_words]

# Porter stems, Lancaster stems and WordNet lemmas of the remaining words.
porter = nltk.PorterStemmer()
porterstemmedwords = list(map(porter.stem, filteredwords))

lancaster = LancasterStemmer()
lancasterstemmedwords = list(map(lancaster.stem, filteredwords))

wnl = nltk.WordNetLemmatizer()
lemmatizedwords = list(map(wnl.lemmatize, filteredwords))

print(porterstemmedwords, lancasterstemmedwords, lemmatizedwords)