def __init__(self, datareader, stopwords, norm, work, split, skip_words,
             date, porter, porter2, lanca, lanca2):
    """
    :param datareader: a Datareader object
    :param stopwords: a list of stopwords
    """
    self.norm = norm
    self.work = work
    self.split = split
    self.skip_words = skip_words
    self.date = date
    self.porter = porter
    self.porter2 = porter2
    self.lanca = lanca
    self.lanca2 = lanca2
    self.ps = stem.PorterStemmer()
    self.ls = stem.LancasterStemmer()

    train_playlists_df = datareader.get_df_train_playlists()
    test_playlists_df = datareader.get_df_test_playlists()
    concat_df = pd.concat([train_playlists_df, test_playlists_df])
    concat_df = concat_df.sort_values(['pid'], ascending=True)

    self.stopwords = stopwords
    # DataFrame.as_matrix() was removed in pandas 1.0; use .values instead.
    self.titles = concat_df['name'].values
    self.tokens_dict = dict()
    self.__set_params()
    self.words = list(self.tokens_dict.keys())
def sosei():
    # "sosei" (素性) is Japanese for "feature": build bag-of-words count
    # vectors from sentiment.txt, whose lines start with a label.
    stemmer2 = stem.LancasterStemmer()
    ids = defaultdict(lambda: len(ids))
    # First pass: assign an id to every stemmed word in the corpus.
    # (The original iterated over the characters of the popped label.)
    for line in open("sentiment.txt"):
        words = line.split()
        words.pop(0)  # drop the label
        for word in words:
            ids[stemmer2.stem(word)]  # defaultdict assigns the next id
    stop = []
    for line in open("stop.txt", "r"):
        stop.append(line.strip())
    sosei_list = []
    label_list = []
    # Second pass: one count vector per line, skipping stopwords.
    for line in open("sentiment.txt", "r"):
        words = line.split()
        label = words.pop(0)  # the original rebound `line` to the label here
        kept = [w for w in words if not stop_check(w, stop)]
        vec = [0] * len(ids)
        for word in kept:
            vec[ids[stemmer2.stem(word)]] += 1
        sosei_list.append(vec)
        label_list.append(label)
    return label_list, sosei_list
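A minimal sketch of how sosei() might be consumed downstream; the scikit-learn classifier is purely illustrative, and it assumes the sentiment.txt / stop.txt files and the external stop_check helper from the snippet above.

# Illustrative only: train a simple classifier on the extracted features.
from sklearn.linear_model import LogisticRegression

labels, features = sosei()
clf = LogisticRegression(max_iter=1000)  # illustrative model choice
clf.fit(features, labels)
print(clf.score(features, labels))  # training accuracy, as a quick sanity check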
def stem_token(word):
    # Select the normalizer named by the global `stemmer_name`, then
    # apply it to every whitespace-separated token of `word`.
    if stemmer_name == "Porter-Stemmer":
        normalize = stem.PorterStemmer().stem
    elif stemmer_name == "Lancaster-Stemmer":
        normalize = stem.LancasterStemmer().stem
    elif stemmer_name == "WordNet-Lemmatizer":
        normalize = stem.WordNetLemmatizer().lemmatize
    else:
        return word  # unknown stemmer name: leave the word unchanged
    return " ".join(normalize(s) for s in word.split())
def demo():
    """A demonstration of the Lancaster stemmer on samples described in
    Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
    """
    from nltk import stem
    stemmer = stem.LancasterStemmer()
    print("%-20s%-20s" % ("Original Word", "Stemmed Word"))
    print("*" * 40)
    for word in (
            'maximum',     # Remove "-um" when word is intact
            'presumably',  # Don't remove "-um" when word is not intact
            'multiply',    # No action taken if word ends with "-ply"
            'provision',   # Replace "-sion" with "-j" to trigger "j" set of rules
            'owed',        # Word starting with vowel must contain at least 2 letters
            'ear',         # ditto.
            'saying',      # Words starting with consonant must contain at least 3
            'crying',      # letters and one of those letters must be a vowel
            'string',      # ditto.
            'meant',       # ditto.
            'cement'):     # ditto.
        stemmed_word = stemmer.stem(word)
        print("%-20s%-20s" % (word, stemmed_word))
def getFeature(word_list):
    stemmer = stem.LancasterStemmer()
    # stemmer2 = stem.PorterStemmer()
    feature = defaultdict(lambda: 0)
    for word in word_list:
        if not isStopWords(word):
            word_stem = stemmer.stem(word)
            feature[word_stem] += 1
    return dict(feature)
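To see what getFeature produces, a hedged standalone sketch; isStopWords is the caller's own helper, so it is stubbed out here.

from collections import defaultdict
from nltk import stem

def isStopWords(word):  # stub for the external helper assumed above
    return word in {"the", "a", "is"}

print(getFeature(["running", "runs", "the", "cat"]))
# Lancaster maps "running" and "runs" to the same stem,
# so the expected result is {'run': 2, 'cat': 1}.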
def preprocessor_words(words):
    stopwords_set = set(stopwords.words('english'))
    stemmer = stem.LancasterStemmer()
    words_preprocessed = []
    for word in words:
        if word in stopwords_set:
            continue
        # Note: LancasterStemmer stems rather than lemmatizes, so the
        # original name `lemmatized` was misleading.
        stemmed = stemmer.stem(word)
        words_preprocessed.append(stemmed)
    return words_preprocessed
def preprocessor(text):
    # The original parameter was named `input`, shadowing the builtin.
    stopwords_set = set(stopwords.words('english'))
    stemmer = stem.LancasterStemmer()
    preprocessed_list = []
    for word in text.lower().split():
        if word in stopwords_set:
            continue
        preprocessed_list.append(stemmer.stem(word))
    return ' '.join(preprocessed_list)
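A quick, hedged example of calling preprocessor; exact stems depend on the NLTK version, and the stopwords corpus must be downloaded first.

import nltk
nltk.download('stopwords')  # one-time setup

print(preprocessor("The cats are running quickly"))
# Lancaster typically yields something like "cat run quick".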
def stem_token(token):
    root_word = token
    if stemmer_name == "Porter-Stemmer":
        stemmer = stem.PorterStemmer()
        token = stemmer.stem(token)
    elif stemmer_name == "Lancaster-Stemmer":
        stemmer = stem.LancasterStemmer()
        token = stemmer.stem(token)
    elif stemmer_name == "WordNet-Lemmatizer":
        stemmer = WordNetLemmatizer()
        token = stemmer.lemmatize(token)
    # Remember which surface form produced this stem.
    stem_to_root[token] = root_word
    return token
def text_to_word_list(self, text):
    '''Pre-process text and convert it to a list of words.'''
    text = str(text)
    text = text.lower()
    # Clean the text: expand contractions, space out punctuation,
    # and normalize a few domain-specific tokens.
    text = sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = sub(r"what's", "what is ", text)
    text = sub(r"\'s", " ", text)
    text = sub(r"\'ve", " have ", text)
    text = sub(r"can't", "cannot ", text)
    text = sub(r"n't", " not ", text)
    text = sub(r"i'm", "i am ", text)
    text = sub(r"\'re", " are ", text)
    text = sub(r"\'d", " would ", text)
    text = sub(r"\'ll", " will ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\/", " ", text)
    text = sub(r"\^", " ^ ", text)
    text = sub(r"\+", " + ", text)
    text = sub(r"\-", " - ", text)
    text = sub(r"\=", " = ", text)
    text = sub(r"'", " ", text)
    text = sub(r"(\d+)(k)", r"\g<1>000", text)
    text = sub(r":", " : ", text)
    text = sub(r" e g ", " eg ", text)
    text = sub(r" b g ", " bg ", text)
    text = sub(r" u s ", " american ", text)
    text = sub(r"\0s", "0", text)
    text = sub(r" 9 11 ", "911", text)
    text = sub(r"e - mail", "email", text)
    text = sub(r"j k", "jk", text)
    text = sub(r"\s{2,}", " ", text)

    text = word_tokenize(text)
    # Normalize each token with the configured stemmer/lemmatizer.
    normalized_sentence = []
    lancaster = stem.LancasterStemmer()
    lemmatizer = WordNetLemmatizer()
    for word in text:
        if self.normalizer == 'lancaster':
            normalized_sentence.append(lancaster.stem(word))
        elif self.normalizer == 'wordnet':
            normalized_sentence.append(lemmatizer.lemmatize(word))
        else:
            normalized_sentence.append(word)
    return normalized_sentence
def stem_token(stemmer_name, stopword):
    # Normalize every entry of `stopword` in place with the chosen stemmer.
    if stemmer_name == "Porter-Stemmer":
        normalize = stem.PorterStemmer().stem
    elif stemmer_name == "Lancaster-Stemmer":
        normalize = stem.LancasterStemmer().stem
    elif stemmer_name == "WordNet-Lemmatizer":
        normalize = WordNetLemmatizer().lemmatize
    else:
        return stopword  # unknown stemmer name: leave the list unchanged
    for count in range(len(stopword)):
        stopword[count] = normalize(stopword[count])
    return stopword
def stemText(self, text, intensity):
    """Apply stemming to a string according to :intensity."""
    # Select an nltk stemmer. The original compared with `is`, which tests
    # identity and only worked by accident of string interning; use == instead.
    if intensity == 'light':
        s = stem.PorterStemmer()
    elif intensity == 'medium':
        s = stem.snowball.EnglishStemmer()
    elif intensity == 'heavy':
        s = stem.LancasterStemmer()
    else:
        raise ValueError(
            "'{0}' is not a correct intensity parameter. Must be light, medium or heavy."
            .format(intensity))
    bow = text.split(" ")  # this creates a bag of words
    result = []
    for word in bow:
        result.append(s.stem(word))
    return ' '.join(result)
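The three intensities give visibly different stems. The owning class is not shown in the snippet, so this hedged example assumes an instance named cleaner:

# Assumes `cleaner` is an instance of the (unshown) class defining stemText.
for level in ('light', 'medium', 'heavy'):
    print(level, cleaner.stemText("the children were running", level))

Lancaster ('heavy') generally produces the shortest, most aggressive stems of the three.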
def __init__(self, datareader, stopwords=[]):
    """
    :param datareader: a Datareader object
    :param stopwords: a list of stopwords
    """
    self.stopwords = stopwords
    self.ps = stem.PorterStemmer()
    self.ls = stem.LancasterStemmer()

    train_playlists_df = datareader.get_df_train_playlists()
    test_playlists_df = datareader.get_df_test_playlists()
    concat_df = pd.concat([train_playlists_df, test_playlists_df])

    if datareader.offline():
        concat_df = concat_df.sort_values(['pid'], ascending=True)

    self.playlists = concat_df['pid'].values
    self.titles = concat_df['name'].values
    self.tokens_dict = dict()
    self.__set_params()
    self.words = list(self.tokens_dict.keys())
def preprocessor_data(data, ids, test=0):
    stopwords_set = set(stopwords.words('english'))
    stemmer = stem.LancasterStemmer()
    data_in_preprocessed = []
    labels = []
    for line in data:
        words_preprocessed = []
        # Strings are immutable: the original called line.lower() and
        # discarded the result, so the lowercasing never took effect.
        line = line.lower()
        parts = line.split()
        label, words = parts[0], parts[1:]
        labels.append(int(label))
        for word in words:
            if word in stopwords_set:
                continue
            stemmed = stemmer.stem(word)
            if test == 0:
                ids[stemmed]  # touch the defaultdict to register an id
            words_preprocessed.append(stemmed)
        data_in_preprocessed.append(words_preprocessed)
    return data_in_preprocessed, labels
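A hedged example of driving preprocessor_data; ids is expected to be an auto-numbering defaultdict, matching how the function touches it above, and input lines are assumed to start with an integer label:

from collections import defaultdict

ids = defaultdict(lambda: len(ids))
data = ["1 The movie was great", "0 It was boring"]
docs, labels = preprocessor_data(data, ids)
print(docs, labels, dict(ids))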
import psycopg2
from psycopg2 import sql
from nltk import stem

def postgre_retrieve_sentences(word, language):
    conn_string = ("dbname='postgres' user='******' "
                   "host='postgre-psd.postgres.database.azure.com' "
                   "password='******' port='5432'")
    con = psycopg2.connect(conn_string)
    curs = con.cursor()
    # English entries are stored by their Lancaster stem, so stem the query word.
    if language == "english":
        ls = stem.LancasterStemmer()
        word = ls.stem(word)
    if language in ("english", "russian", "turkish"):
        # The original interpolated `word` into an f-string, which is open
        # to SQL injection; bind it as a parameter instead. The table name
        # (the language) is an identifier, so build it with psycopg2.sql.
        query = sql.SQL("select sentence from {} where word = %s").format(
            sql.Identifier(language))
        curs.execute(query, (word,))
        return curs.fetchall()
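An illustrative call, runnable only against a reachable database with real credentials in conn_string; "eyes" echoes the word in the original commented-out test line:

rows = postgre_retrieve_sentences("eyes", "english")
for (sentence,) in rows:
    print(sentence)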
def stem_text_contents(self):
    # Map the configured stemmer name to a normalizing function.
    if stemmer_name == "Porter-Stemmer":
        normalize = stem.PorterStemmer().stem
    elif stemmer_name == "Lancaster-Stemmer":
        normalize = stem.LancasterStemmer().stem
    elif stemmer_name == "WordNet-Lemmatizer":
        # The original called stemmer.stem() in this branch, but
        # WordNetLemmatizer has no stem() method; lemmatize() is correct.
        normalize = WordNetLemmatizer().lemmatize
    else:
        self.stem_text = self.text
        return
    for counter in range(len(self.text)):
        text_tokens = self.text[counter].split()
        stem_text = ""
        for t in text_tokens:
            normalized = normalize(t)
            stem_text = stem_text + normalized + " "
            stem_to_root[normalized] = t  # remember the surface form
        self.stem_text.append(stem_text.strip(" "))
'''
usage
$ python knock50.py | python knock51.py | python knock52.py
'''
from nltk import stem
import sys

# Create the stemmers once instead of once per input line.
porter_stm = stem.PorterStemmer()
lancas_stm = stem.LancasterStemmer()

for line in sys.stdin:
    l = line.strip('\n')
    # "gokan" (語幹) is Japanese for "word stem".
    p_gokan = porter_stm.stem(l.lower())
    l_gokan = lancas_stm.stem(l.lower())
    print("{0}\t{1}\t{2}".format(l, p_gokan, l_gokan))
def test_stemming(word):
    print('WordNetLemmatizer:', stem.WordNetLemmatizer().lemmatize(word))
    print('LancasterStemmer:', stem.LancasterStemmer().stem(word))
    print('PorterStemmer:', stem.PorterStemmer().stem(word))
    print('RegexpStemmer:', stem.RegexpStemmer('ing$|s$|e$', min=4).stem(word))
    print('SnowballStemmer:', stem.SnowballStemmer('english').stem(word))
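Calling it on a sample word makes the differences visible; exact outputs depend on the NLTK version, so treat these as indicative:

test_stemming('cooking')
# Indicative output:
#   WordNetLemmatizer: cooking   (default POS is noun, so the form is kept)
#   LancasterStemmer: cook
#   PorterStemmer: cook
#   RegexpStemmer: cook          ('ing$' is stripped)
#   SnowballStemmer: cook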
def __init__(self):
    self.stopwords = stopwords.words('english')
    self._lancaster = stem.LancasterStemmer()
    self._porter = stem.PorterStemmer()
    self._lemmatizer = stem.WordNetLemmatizer()
import re
from collections import defaultdict
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import stem

stemmer = stem.LancasterStemmer()

# ----------- MINING PHRASES FROM TRAINING DATA --------------

def getSynonyms(word):
    # Collect the first lemma name of every WordNet synset for `word`.
    syns = wn.synsets(word)
    synonyms = [item.lemmas()[0].name() for item in syns]
    return list(set(synonyms))

def topFreq(wordList):
    # Return the word with the highest training-corpus frequency, or None.
    freqs = {}
    for word in wordList:
        freqs[word] = train_dic[word]
    for k in sorted(freqs, key=lambda k: freqs[k], reverse=True):
        if freqs[k] > 0:
            return k
    return None

train_phrases = open('../data/training_text', 'r')
test_phrases = open('../data/test_text', 'r')
train_phrases.readline()  # skip the header line
test_phrases.readline()

train_dic = defaultdict(int)
test_dic = defaultdict(int)
def stemming(words):
    ls = stem.LancasterStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(ls.stem(word))
    return stemmed_words
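A quick check of the helper; 'maximum' stemming to 'maxim' matches the Lancaster behavior documented in the demo() snippet earlier:

print(stemming(['maximum', 'running']))
# Typically: ['maxim', 'run']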