def process(word_list):
    lancaster = LancasterStemmer()
    new_list = []
    for word in word_list:
        w = lancaster.stem(word)
        new_list.append(w)
    return new_list
def Stem(s):
    if s is not None and isinstance(s, str) and len(s) > 0:
        stemmer = LancasterStemmer()
        s = " ".join([stemmer.stem(z) for z in s.split(" ")])
        s = s.lower()
        return s
    else:
        return ""
def stem_words(self, words):
    """Stem each whitespace-separated word in a string and return the stems joined into one string."""
    stemmer = LancasterStemmer()
    stems = ""
    for word in words.split(" "):
        stem = stemmer.stem(word)
        stems = stems + " " + stem
    return stems
def stem_words(words):
    """Stem words in a list of tokenized words."""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
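# A minimal usage sketch for the list-based stem_words above, assuming NLTK's
# word_tokenize and LancasterStemmer are available; the sample sentence is made up.
from nltk import LancasterStemmer, word_tokenize

tokens = word_tokenize("The wolves were running through the grounded beaches")
print(stem_words(tokens))  # one Lancaster stem per token, in the same order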
def main():
    save_data_from_webpage()
    text = get_data_from_file()
    # creates a list of the tokenized words
    tt = word_tokenize(text)
    pprint(tt)
    # creates a new list of the stemmed words using each of the stemmers
    psteam = PorterStemmer()
    psteam_list = []
    for word in tt:
        psteam_list.append(psteam.stem(word))
    pprint(psteam_list)
    lsteam = LancasterStemmer()
    lsteam_list = []
    for word in tt:
        lsteam_list.append(lsteam.stem(word))
    pprint(lsteam_list)
    ssteam = SnowballStemmer('english')  # SnowballStemmer requires a language argument
    ssteam_list = []
    for word in tt:
        ssteam_list.append(ssteam.stem(word))
    pprint(ssteam_list)
    p = set(psteam_list)
    l = set(lsteam_list)
    s = set(ssteam_list)
    # displays the differing stems
    pprint(s.difference(l.difference(p)))
    # POS tagging (pos_tag expects a list of tokens, not a raw string)
    pos_list = pos_tag(tt)
    pprint(pos_list)
    # creates a new list for the lemmatized words
    lemmatizer = WordNetLemmatizer()
    lem = []
    for word in tt:
        lem.append(lemmatizer.lemmatize(word))
    # pprint(lem)
    # returns a generator of trigrams using the tokenized list tt
    trig = trigrams(tt)
    # displays the results
    print(list(trig))
    # ne_chunk finds non-overlapping groups
    # pos_tag identifies how each token is used in speech
    NamedEntity = ne_chunk(pos_tag(wordpunct_tokenize(text)))
    print(NamedEntity)
def _normalize(self, item):
    key, value = item
    ls = LancasterStemmer()
    text = word_tokenize(value[0])
    text = [word.lower() for word in text]
    text = [
        ls.stem(word).rstrip('s') for word in text
        if word not in stopwords.words('english') and word.isalnum()
    ]
    return (key, (text, value[1]))
def __stem_document(document_name: pathlib.Path) -> list:
    stemmer = LancasterStemmer()
    with document_name.open('r', encoding='utf-8') as document:
        lines = document.readlines()
    result = []
    for line in lines:
        line = line.strip()
        words = [token for token in line.split(' ')]
        words = [stemmer.stem(word) for word in words]
        sentence = ' '.join(words)
        result.append(sentence)
    return result
def get_stems(tokens):
    stemmer = LancasterStemmer()
    stemmed_tokens = []
    for token in tokens:
        for word in token:
            if word[1] in ('DT', 'PRP', 'PRP$', 'NN', 'NNP', 'NNPS'):
                temp_tokens = word[0]
            else:
                temp_tokens = stemmer.stem(word[0])
            stemmed_tokens.append(temp_tokens)
    return get_lemma(stemmed_tokens)
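# Usage note for get_stems above: it expects a list of POS-tagged sentences,
# i.e. lists of (word, tag) tuples such as nltk.pos_tag produces, and it keeps
# determiners, pronouns and nouns unstemmed. The get_lemma helper it calls is
# project code not shown here, so only the input construction is sketched:
from nltk import pos_tag, word_tokenize

tagged_sentences = [pos_tag(word_tokenize("The dogs were barking loudly"))]
# stems = get_stems(tagged_sentences)  # needs the project's get_lemma helper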
def getStemsFromURL(page_url):
    '''
    Given the link of a webpage (string), returns a list of all the
    words' stems in the webpage text
    '''
    with urlopen(page_url) as infile:
        soup = BeautifulSoup(infile, features="lxml")
    ls = LancasterStemmer()
    words = word_tokenize(soup.text)
    words = [w.lower() for w in words]
    words = [ls.stem(w) for w in words
             if w not in stopwords.words("english") and w.isalpha()]
    return words
def checkstemmers():
    raw = customparse(
        "C://cygwin//home//nelson auner//Pontikes//FinalData.OctNewKeepAndAnonymous/capsavem/my_cape/outtoget.cap.txt"
    )
    wordz = raw.split(" ")
    O = ["sweating", "tripping", "gunning", "going"]
    HH = [i[0:-1] for i in O]
    dic = enchant.Dict("en_US")
    from nltk import LancasterStemmer, PorterStemmer
    lancaster = LancasterStemmer()
    porter = PorterStemmer()
    resporter = [porter.stem(t).replace(" ", "") for t in wordz]
    reslan = [lancaster.stem(t).replace(" ", "") for t in wordz]
    resall = [[wordz[i], resporter[i], reslan[i]] for i in range(len(wordz))]
    filtres = [resall[i] for i in range(len(resall))
               if not (resall[i][0] == resall[i][2] == resall[i][1])]
    return resall
def _create_stemmer(stemmer_type):
    """Initialize a stemmer"""
    return {
        'Porter': PorterStemmer(),
        'Snowball': SnowballStemmer('english'),
        'Lancaster': LancasterStemmer(),
    }[stemmer_type]
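# A small usage sketch for the factory above, assuming the three NLTK stemmer
# classes are imported as in the other snippets in this section:
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer

stemmer = _create_stemmer('Lancaster')
print(stemmer.stem('running'))
print(_create_stemmer('Snowball').stem('running'))
# Note: the lookup dict instantiates all three stemmers on every call; mapping
# the type name to a constructor instead would avoid the extra instantiations.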
def words_stemmer(words, type="PorterStemmer", lang="english", encoding="utf8"):
    supported_stemmers = ["PorterStemmer", "LancasterStemmer", "SnowballStemmer"]
    if type is False or type not in supported_stemmers:
        return words
    else:
        stem_words = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
        elif type == "LancasterStemmer":
            stemmer = LancasterStemmer()
        else:
            stemmer = SnowballStemmer(lang)
        for word in words:
            # stems are kept as text so that the final " ".join works under
            # Python 3; the encoding argument is retained for API compatibility
            stem_words.append(stemmer.stem(word))
        return " ".join(stem_words)
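# Hypothetical call of words_stemmer above (the token list is made up); assumes
# the NLTK stemmers are imported as elsewhere in this section.
tokens = ["wolves", "playing", "grounded", "dreamt"]
print(words_stemmer(tokens, type="LancasterStemmer"))
print(words_stemmer(tokens, type="SnowballStemmer", lang="english"))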
def clean_tweets(self, text):
    st = LancasterStemmer()
    # st = PorterStemmer()
    with open('newspaper3k/SmartStoplist.txt', 'r') as f:
        stopwords = [line.strip() for line in f]
    # remove URLs
    text = re.sub(r'http\S+', '', text)
    tweet_tmp = text.split("\n")
    for k in tweet_tmp:
        tweet_tmp = re.sub(r"[^a-zA-Z0-9]+", ' ', k).lower()
        tweet_tmp = st.stem(tweet_tmp)
        tweet_tmp = ''.join([i for i in tweet_tmp if not i.isdigit()])
        tweet_tmp = tweet_tmp.split()
    result = [word for word in tweet_tmp if word not in stopwords]
    return result
def getMostUsedWordsTxt(file, wordnum):
    '''
    Given a text file name (string) and the number of most used words we
    want to find (int), returns a list of the wordnum most common elements
    and their counts, from the most common to the least:
    [('1st_most_common_word', count1), ('2nd_most_common_word', count2), ...,
     ('wordnumth_most_common_word', countwordnum)]
    '''
    with open(file, "r") as f:
        words = f.read()
    words = words.split()
    ls = LancasterStemmer()
    words = [w.lower() for w in words]
    words = [ls.stem(w) for w in words
             if w not in stopwords.words("english") and w.isalpha()]
    freqs = Counter(words)
    return freqs.most_common(wordnum)
def tokenize(self, description):
    filtered = []
    # don't process NaN or null values
    if pd.isnull(description):
        return filtered, filtered
    else:
        terms = description.lower().split()
        # terms = word_tokenize(description.lower().decode('utf-8'))
        filtered_stopwords = [word for word in terms
                              if word not in stopwords.words('english')]

        # # Stemming: Snowball
        # stemmer = SnowballStemmer('english')
        # for stem in filtered_stopwords:
        #     filtered.append(stemmer.stem(stem.decode('utf-8')))

        # # Stemming: Porter
        # stemmer = PorterStemmer()
        # for stem in filtered_stopwords:
        #     filtered.append(stemmer.stem(stem.decode('utf-8')))

        # Lemmatizer: WordNet
        lemmatizer = WordNetLemmatizer()
        for lemmatized in filtered_stopwords:
            filtered.append(lemmatizer.lemmatize(lemmatized))

        filtered_final = []
        # Stemming: Lancaster
        stemmer = LancasterStemmer()
        for stem in filtered:
            # filtered_final.append(stemmer.stem(stem.decode('utf-8')))
            filtered_final.append(stemmer.stem(stem))

        # # Lemmatizer: TextBlob
        # for lemmatized in filtered_stopwords:
        #     w = Word(lemmatized.decode('utf-8'))
        #     filtered.append(w.lemmatize)

        return filtered_final
def get_words_from_string(string):
    string = string.lower()
    word_pattern = r'[A-Za-z]+'
    # link_pattern = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})"
    # email_pattern = r"\S+@\S+"
    # ip_pattern = r"\b(?:\d{1,3}\.){3}\d{1,3}\b"
    result = []
    # for x in re.findall(link_pattern, string):
    #     try:
    #         url = "{0.scheme}://{0.netloc}/".format(urlsplit(x))
    #     except:
    #         url = x
    #     result.append(url)
    # string = re.sub(link_pattern, "", string)
    # result.extend(re.findall(email_pattern, string))
    # string = re.sub(email_pattern, "", string)
    # result.extend(re.findall(ip_pattern, string))
    # string = re.sub(ip_pattern, "", string)
    # stemmer = PorterStemmer()
    stemmer = LancasterStemmer()
    result.extend(
        [stemmer.stem(word) for word in re.findall(word_pattern, string)])
    # result.extend(re.findall(word_pattern, string))
    return result
from nltk import WordNetLemmatizer, PorterStemmer, LancasterStemmer

# In[28]:

# Generate random embedding with the same scale as GloVe
np.random.seed(SEED)
shape = (VOCAB_SIZE, EMBEDDING_SIZE)
scale = glove_embedding_weights.std() * np.sqrt(12) / 2
embedding = np.random.uniform(low=-scale, high=scale, size=shape)

# In[29]:

wnl = WordNetLemmatizer()
porter = PorterStemmer()
lancaster = LancasterStemmer()

# In[30]:

# Copy from GloVe the weights of words that appear in index2word
count = 0
for i in range(1, VOCAB_SIZE):
    w = index2word[i]
    g = glove_index_dict.get(w)
    if g is None:
        w = wnl.lemmatize(w)
        g = glove_index_dict.get(w)
    if g is None:
        w = porter.stem(w)
        g = glove_index_dict.get(w)
    if g is None:
import re

import unidecode
from nltk import LancasterStemmer
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity, stopwords
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.tokenize import TweetTokenizer

# NLTK stuff
tweet_tokenizer = TweetTokenizer()
stopwords = sorted(stopwords.words('spanish') + ['rt'])
stemmer = LancasterStemmer()

# Regex stuff
regex_url = re.compile(
    r'((http[s]?|ftp):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?'
)
regex_ht_mn = re.compile(r'(#|@)[\w]*')
regex_spaces = re.compile(r'[ ]+')
regex_nonword = re.compile(r'[\W]+')
regex_repeated_ch = re.compile(r'(\w*)(\w)\2(\w*)')
regex_ch = r'\1\2\3'


# Feature stuff
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Python for AHDA.

Part 5, Example 8.
"""

# Stem words
from nltk import LancasterStemmer
from nltk import PorterStemmer

print('LancasterStemmer')
print(LancasterStemmer().stem('lying'))
print(LancasterStemmer().stem('lie'))
print()
print('PorterStemmer')
print(PorterStemmer().stem('lying'))
print(PorterStemmer().stem('lie'))
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer

pStemmer = PorterStemmer()
lStemmer = LancasterStemmer()
sStemmer = SnowballStemmer('english')
print(pStemmer.stem("Playing"))
print(lStemmer.stem("Dancing"))
print(sStemmer.stem("Killing"))

from nltk.stem import WordNetLemmatizer

lemmetizer = WordNetLemmatizer()
print(lemmetizer.lemmatize("Playing"))
print(lemmetizer.lemmatize("Dancing"))
print(lemmetizer.lemmatize("Killing"))
print(lemmetizer.lemmatize("geese"))

from nltk import wordpunct_tokenize, pos_tag, ne_chunk

sentence = "Mark and John are working at google"
print(wordpunct_tokenize(sentence), '\n')
print(pos_tag(wordpunct_tokenize(sentence)), '\n')
print(ne_chunk(pos_tag(wordpunct_tokenize(sentence))))
arquivo = 'C:\\Users\\Usuario\\Dropbox\\Pos\\Pós DataScience\\4 - Análise de textos com R e Python\\Dados\\Noticia_2.docx'
doc = docx.Document(arquivo)

# 02 - List of paragraphs
texto_full = []
for paragrafo in doc.paragraphs:
    texto_full.append(paragrafo.text)

# Select the 2nd and 3rd paragraphs
p_2e3 = texto_full[2:4]

# Tokenize the list of paragraphs
tokens = word_tokenize(' '.join(p_2e3))

## RSLP
rslp = RSLPStemmer()
stemms_rslp = []
for i in tokens:
    stemms_rslp.append(rslp.stem(i))

## Porter
poter = PorterStemmer()
stemms_poter = []
for i in tokens:
    stemms_poter.append(poter.stem(i))

## Lancaster
lancaster = LancasterStemmer()
stemms_lanc = []
for i in tokens:
    stemms_lanc.append(lancaster.stem(i))
def stem(array, word):
    stemmed = LancasterStemmer().stem(word)
    array.remove(word)
    array.append(stemmed)
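# Usage sketch for the in-place helper above (example tokens are made up,
# and LancasterStemmer is assumed to be imported). Note that it removes the
# first occurrence of `word` and appends the stem, so token order changes.
tokens = ["running", "quickly", "home"]
stem(tokens, "running")
print(tokens)  # the Lancaster stem of "running" now sits at the end of the list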
from nltk import PorterStemmer, LancasterStemmer, word_tokenize

line = "My name is Venkatram Veerareddy, technical architect.\n I am having 20 years of experience in "\
       " Software industry working \nfrom applications to products by using \n" \
       " C, C++, Java, Javascript and databases "\
       " like Oracle, MS SQL Server, Postgres, MySQL and OrientDB."

tokens = word_tokenize(line)

porter = PorterStemmer()
pStems = [porter.stem(t) for t in tokens]
print(pStems)

print("************************************************")

lancaster = LancasterStemmer()
lStems = [lancaster.stem(t) for t in tokens]
print(lStems)
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time       : 2020/7/11 17:50
# @Author     : 代登辉
# @Email      : [email protected]
# @File       : stemmers.py
# @Software   : PyCharm
# @Description: Stemming

from nltk import PorterStemmer, LancasterStemmer, word_tokenize

raw = "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and " \
      "loyal servant to the true emperor, Marcus Aurelius. Father to a murdered son, husband to a murdered wife. And " \
      "I will have my vengeance, in this life or the next. "

tokens = word_tokenize(raw)  # split into word tokens

porter = PorterStemmer()  # strips relatively few suffixes
pStems = [porter.stem(t) for t in tokens]  # suffixes such as s, es, e, ed, al
print(pStems)

lancaster = LancasterStemmer()  # more aggressive
lStems = [lancaster.stem(t) for t in tokens]  # lowercases words and strips suffixes
print(lStems)
    'page': TITLE,
    'format': "json"
}

R = S.get(url=URL, params=PARAMS)
DATA = R.json()

# get the text
wiki_page_text = DATA["parse"]["text"]["*"]
h = html2text.HTML2Text()
h.ignore_links = True
page_text = h.handle(wiki_page_text)

# create a new stemmer
ls = LancasterStemmer()

# tokenize text
words = nltk.word_tokenize(page_text)
words = [w.lower() for w in words]

# eliminate stop words and stem the rest of the words
words = [ls.stem(w) for w in words
         if w not in stopwords.words("english") and w.isalnum()]

freqs = Counter(words)
print("The 10 most frequently used stems in the ''Data science'' Wikipedia page are:")
for word, count in freqs.most_common(10):
    print(word, count)
import json
import os
from collections import Counter

from nltk import LancasterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# define variables
articles_path = []
articleTitles = []
articleData = []
wordCount = {}
tokenTitle = []
summaryAllArticles = {}
stem = LancasterStemmer()

# a subset of all sources for the articles in the NELA2017 dataset
sources = ["AP", "BBC", "PBS", "Salon", "Slate", "The New York Times", "BuzzFeed", "Drudge Report", "Faking News",
           "RedState", "The Gateway Pundit", "The Huffington Post"]

# second subset of sources, used to determine whether the results so far depend on the current sources being used
# sources = ["CNN", "MotherJones", "NPR", "PBS", "The Hill", "Vox", "Addicting Info", "New York Daily News", "Prntly",
#            "The D.C. Clothesline", "The Duran", "Yahoo News"]

# set of commonly used words such as "the", "a", "in" etc.
englishStopWords = set(stopwords.words('english'))
symbolStopwords = (
    {":", ";", "'", '"', '”', '“', ",", ".", "-", "_", "?", "$", "&", '...', '.', '�', '!', "''", "``", "%", "@",
     "--", ")", "(", "[", "]", "[]", "[ ]", "’", "|", "‘", " ", "'s", 'mr', 'mrs', 'one', 'two', 'said', 'hi', 'say',
     "n't",
def MakeFeaturesFromText(DIR, FNAME, SENT_CLASS, max_features):
    '''
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    EXPERIMENTAL: Still trying to figure out if it produces good data sets.
    Use at your own risk.
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    Grabs a text file and creates a sparse binary data set for training and
    makes class labels based off of the class identifiers in SENT_CLASS.

    ARG: DIR         TYPE: string           DESC: Directory where the text file is saved. End with a trailing backslash.
    ARG: FNAME       TYPE: string           DESC: File name.
    ARG: SENT_CLASS  TYPE: list (of lists)  DESC: A list of lists, each internal list representing a class id.
    '''
    print 'STARTING DATA GENERATION'
    from nltk import download as nldl
    from nltk import LancasterStemmer
    import nltk.corpus as corp
    import string

    # used for referencing later
    NUM_CLASSES = len(SENT_CLASS)
    TOTAL_CLASSES = NUM_CLASSES + 1

    # read file and begin cleaning
    TextFile = open(DIR + FNAME, 'r')
    RAW = TextFile.readlines()
    # removes newlines, makes it all lower case, and splits it into separable words by spaces
    CLEAN = [sent[:-1].lower().split(' ') for sent in RAW]

    print 'LOADING STOPWORDS'
    # the try/except is needed because the stop words come from the nltk database;
    # the except branch is just for the first run
    try:
        STOP_WORDS = corp.stopwords.words('english')
    except LookupError:
        nldl('words')
        STOP_WORDS = corp.stopwords.words('english')
    finally:
        # stop words are common words like "the", "an", "a" etc.
        STOP_WORDS += ' '.join(p for p in string.punctuation).split(' ') + ' '.join(d for d in string.digits).split(' ') + ' '.join(w for w in string.whitespace).split(' ')

    # removes stopwords and trivially short features
    CLEANER = [[word for word in sent if word not in STOP_WORDS] for sent in CLEAN if len(sent) > 3]

    KEYS = []
    print 'SORTING KEYS'
    # starts compiling the key list from the id words in SENT_CLASS
    for IDS in SENT_CLASS:
        KEYS += IDS

    print 'HEAVY DUTY STUFF'
    # finds all unique words; keeps combining and filtering duplicates
    for sent in CLEANER:
        TEMP = KEYS + sent
        KEYS = np.unique(TEMP).tolist()

    print 'COUNTING NCOs. . ',
    MAX_LIST = []
    MAX = -1
    KEY_COUNT = {}
    for sent in CLEANER:
        for word in sent:
            if not KEY_COUNT.has_key(word):
                KEY_COUNT[word] = 0
            KEY_COUNT[word] += 1
            if KEY_COUNT[word] >= MAX:
                MAX = KEY_COUNT[word]
                if word in MAX_LIST:
                    MAX_LIST.remove(word)
                MAX_LIST.insert(0, word)
    print '.',

    TOO_FEW = [key for key in list(KEY_COUNT) if KEY_COUNT[key] <= 1]
    TOTAL_KEYS = len(list(KEY_COUNT))
    REMOVE_TOP = np.floor(TOTAL_KEYS / 10.)
    print '.',
    for SCLASS in SENT_CLASS:
        for s in SCLASS:
            if s in MAX_LIST:
                MAX_LIST.remove(s)
            if s in TOO_FEW:
                TOO_FEW.remove(s)
    print '.',
    for tf in TOO_FEW:
        if tf in KEYS:
            KEYS.remove(tf)
    print '.',
    TOO_MANY = MAX_LIST
    for tm in TOO_MANY:
        if tm in KEYS:
            KEYS.remove(tm)
    print 'TERMINATED THE UNDESIREABLES'
    print 'REMOVED ', len(TOO_MANY) + len(TOO_FEW), 'OF ', TOTAL_KEYS, 'UNDESIREABLES'

    print 'STARTING STEMMING'
    # removes suffixes and prefixes from words, leaving the root word only
    STEMMER = LancasterStemmer()
    # hash of stemmed words because it's tremendously faster to do it this
    # way versus calling LancasterStemmer every time
    STEM_DICT = {K: STEMMER.stem(K) for K in KEYS}
    print 'STEMMED DICTIONARY CREATED'

    # stem the class labels... and we know they're in the hash because that
    # was the first thing we added to the key list
    STEM_LABS = [[STEM_DICT[ID] for ID in CLASS] for CLASS in SENT_CLASS]
    # now the dictionary is generated and the vectorization has begun
    DICT = STEM_DICT.values()
    # using the hash table of stemmed words to look up the stemmed root
    STEM_DATA = [[STEM_DICT[word] for word in sent if word in list(STEM_DICT)] for sent in CLEANER]
    # indexes of root words in the dictionary, so that you only have to do a few lookups
    # when generating the data set of binary vectors
    INT_DATA = [np.array([DICT.index(word) for word in sent], dtype=np.int32, order='c') for sent in STEM_DATA]
    # same as above
    INT_LABS = [[DICT.index(ID) for ID in CLASS] for CLASS in STEM_LABS]

    print 'CREATING DATA SET'
    print 'I BET THIS TAKES THE LONGEST'
    # meat and potatoes
    LABS = np.zeros((len(CLEANER), 1), dtype=np.int32, order='c')
    MAT_SHAPE = (1, len(DICT))
    MAT = np.zeros(MAT_SHAPE, dtype=np.int32, order='c')
    # priming for the for loop below so that we can stack each new feature at
    # the bottom of our data set.
    M_IND = np.array(INT_DATA[0], dtype=np.int32, order='c')
    MAT[0, M_IND] = 1
    print 'STILL Go',
    CLIST = range(0, NUM_CLASSES)
    for d in range(NUM_CLASSES):
        print 'i',
        i = CLIST[np.random.randint(0, len(CLIST))]
        IND = np.atleast_2d(INT_LABS[i])
        if np.any(MAT[0, IND] == 1) and LABS[0] == 0:
            LABS[0] = i + 1
        else:
            CLIST.remove(i)

    # makes the whole data set
    for i in range(1, np.size(INT_DATA, 0)):
        if np.mod(i, np.floor(np.size(INT_DATA, 0) * .2)) == 0:
            print 'n',
        NEXT_ARRAY = np.zeros(MAT_SHAPE, dtype=np.int32, order='c')
        TO_ONES = np.array(INT_DATA[i], dtype=np.int32, order='c')
        NEXT_ARRAY[0, TO_ONES] = 1
        # making labels
        CLIST = range(0, NUM_CLASSES)
        for d in range(NUM_CLASSES):
            j = CLIST[np.random.randint(0, len(CLIST))]
            IND = np.atleast_2d(INT_LABS[j])
            if np.any(NEXT_ARRAY[0, IND] == 1) and LABS[i] == 0:
                LABS[i] = j + 1
            CLIST.remove(j)
        MAT = np.vstack((MAT, NEXT_ARRAY))
    print 'g!'
    print 'I WAS RIGHT'

    # to reduce non-classed features the algorithm below tries to reduce the number of
    # non-classed features, but if max_features is fewer than MAT with all non-classed
    # features removed then steps have to be taken to remove classed features
    TOTAL = np.size(MAT, 0)
    if max_features > TOTAL:
        max_features = TOTAL
    print 'THINNING THE HERD A BIT MORE'
    REMOVALS = range(0, TOTAL_CLASSES)
    if TOTAL > max_features:
        NO_CLASS_CAND = np.argwhere(LABS == 0)
        HAS_CLASS_CAND = np.argwhere(LABS != 0)
        # gets the weighted number of features of each class to remove
        if (TOTAL - np.size(NO_CLASS_CAND)) >= max_features:
            HAS_CLASS_REMOVE = TOTAL - np.size(NO_CLASS_CAND) - max_features
            CLASS_FEATURES = np.array([np.sum(HAS_CLASS_CAND == CLASS) for CLASS in range(1, TOTAL_CLASSES)]) * 1.
            SHARED_REMOVE = np.floor(HAS_CLASS_REMOVE * (CLASS_FEATURES / np.sum(CLASS_FEATURES)))
            SHARED_REMOVE = SHARED_REMOVE.tolist()
            NO_CLASS_REMOVE = TOTAL - np.sum(SHARED_REMOVE) - max_features
        # no classed features removed
        else:
            NO_CLASS_REMOVE = TOTAL - np.size(NO_CLASS_CAND)
            REMOVALS[0] = NO_CLASS_REMOVE
            SHARED_REMOVE = range(0, NUM_CLASSES) * 0
        # removes the number of features determined above
        for c in range(0, TOTAL_CLASSES):
            if REMOVALS[c] != 0:
                for i in range(0, REMOVALS[c]):
                    CANDIDATES = np.argwhere(LABS == c)
                    DRAW = np.random.randint(0, np.size(CANDIDATES))
                    np.delete(LABS, DRAW, 0)
                    np.delete(MAT, DRAW, 0)
    print 'DONE!... where\'d you go???'
    return MAT, np.ravel(LABS, order='c')
import re
import logging

from nltk import WordNetLemmatizer, LancasterStemmer

from django.core.urlresolvers import reverse

logger = logging.getLogger(__name__)

wordnet_lemmatizer = WordNetLemmatizer()
lancaster_stemmer = LancasterStemmer()


def extract_keywords(title):
    original_keywords = [keyword.lower() for keyword in re.split(r'\W+', title)]
    try:
        lemmatized_keywords = map(wordnet_lemmatizer.lemmatize, original_keywords)
    except LookupError:
        logging.error('Please install corpora/wordnet dictionary')
        return []
    stemmed_keywords = map(lancaster_stemmer.stem, original_keywords)
    return list(set(original_keywords + lemmatized_keywords + stemmed_keywords))


def reverse_tastypie_url(resource_name, pk=None):
    """
    Returns tastypie url
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import math  # for more advanced mathematical operations

app = Flask(__name__)

# ----------DOCUMENT DATABASE CONFIGURATION----------
db = mysql.connector.connect(host="localhost", user="******", passwd="", database="stki")
cursor = db.cursor()

# variable holding the stopword list
stop_words = set(stopwords.words('english'))
lancaster = LancasterStemmer()  # Lancaster/Paice-Husk stemmer
eliminasi = [
    '.', '?', '!', ' ', ',', ':', ';', '(', ')', '\'', '"', '%', '&', '*', '-',
    '_', '+', '=', '{', '}', '[', ']', '\\', '|', '"', '<', '>', '/', '0', '1',
    '2', '3', '4', '5', '6', '7', '8', '9', '�'
]


def preProcessDoc(docs):
    docs_token = word_tokenize(docs)
    arr = []
    for i in range(len(docs_token)):
        docs_token[i] = docs_token[i].lower()
        if docs_token[i] not in stop_words:
            skip = 0
            for j in range(len(docs_token[i])):
'''
Extracting stems from text data.
Three stemming algorithms are compared; the Lancaster stemmer is stricter than the other two.
In terms of strictness, Porter is the most lenient and Lancaster the strictest.
Lancaster is fast but trims away a large part of each word, so the Snowball stemmer is usually preferred.
'''
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer

words = [
    'table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the', 'beaches',
    'grounded', 'dreamt', 'envision'
]
stemmers = ['PORTER', 'LANCASTER', 'SNOWBALL']

stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = SnowballStemmer('english')

formatted_row = '{:>16}' * (len(stemmers) + 1)
print(formatted_row.format('WORD', *stemmers))
for word in words:
    stemmed_words = [
        stemmer_porter.stem(word),
        stemmer_lancaster.stem(word),
        stemmer_snowball.stem(word)
    ]
    print(formatted_row.format(word, *stemmed_words))
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import ngrams

pStemmer = PorterStemmer()
lStemmer = LancasterStemmer()
sStemmer = SnowballStemmer('english')
lemmetizer = WordNetLemmatizer()


def stem_each_word(tokens, lancaster_file, porter_file, snowball_file,
                   lemmetizer_file, trigrams_file):
    lancaster_file_out = open(lancaster_file, "a+")
    porter_file_out = open(porter_file, "a+")
    snowball_file_out = open(snowball_file, "a+")
    lemmetizer_file_out = open(lemmetizer_file, "a+")
    trigrams_file_out = open(trigrams_file, "a+")
    for token in tokens:
        porter_file_out.write(str(pStemmer.stem(token)) + "\t")
        lancaster_file_out.write(str(lStemmer.stem(token)) + "\t")
        snowball_file_out.write(str(sStemmer.stem(token)) + "\t")
        lemmetizer_file_out.write(str(lemmetizer.lemmatize(token)) + "\t")
    trigrams_file_out.write(str(list(ngrams(tokens, 3))))
    porter_file_out.write("\n")
    lancaster_file_out.write("\n")
    snowball_file_out.write("\n")
    lemmetizer_file_out.write("\n")
    trigrams_file_out.write("\n")
#!/usr/bin/python
"""
This script takes tf-idf results and filters just those that are
included in the review's feature list
"""
import sys

from nltk import LancasterStemmer

tfidf_fname = sys.argv[1]
features_fname = sys.argv[2]

tfidf_file = open(tfidf_fname)
features_file = open(features_fname)

stemmer = LancasterStemmer()

stemmed_features = []
for line in features_file:
    cols = line.split(',')
    feature = cols[2]
    stemmed_words = [stemmer.stem(w) for w in feature.split()]
    stemmed_features += stemmed_words

# print stemmed_features

for line in tfidf_file:
    cols = line.split(',')
    word = cols[2]
    if word.strip() in stemmed_features:
        print line.strip()