def tokenise(self, stem: bool = False) -> List[str]:
    words = word_tokenize(self.content)
    if stem:
        stemmer = RegexpStemmer('ing$|s$|ed$|y$|er$|[^aeiou]{1}y$|e$', min=3)
        words = [stemmer.stem(word) for word in words]
    return words
def f(s):
    if s is not None:
        # convert to lowercase and normalise quote characters
        line = s.lower().replace('"', ']').replace('\'', ' ')
        tokenized_words = word_tokenize(line)  # tokenizing

        regexFile = "regex.txt"
        Snowballstemmer = SnowballStemmer("english")
        RegexStemmer = []  # stemmer for regular expressions (reassigned below)
        with open(regexFile, 'r') as regFile:
            while True:
                line = regFile.readline()
                print(line)
                if not line:
                    break
                # strip the trailing newline so it does not end up in the pattern;
                # note that only the last pattern in regex.txt is kept
                RegexStemmer = RegexpStemmer(line.strip(), min=2)

        # data = [tokenized_words - nouse_words]
        data = filter(lambda x: x not in stopwords, tokenized_words)
        lmtzr = WordNetLemmatizer()
        list_of_words = []
        for item in data:
            if len(item) > 2:  # words with length <= 2 are removed
                # rlemma = lmtzr.lemmatize(item)  # lemmatizing
                x = RegexStemmer.stem(item)  # stemming
                # x = Snowballstemmer.stem(regx)
                if len(x) > 2:
                    list_of_words.append(x)  # adding item to list_of_words
        t = ' '.join(str(item) for item in list_of_words)
        return t
def my_stem(word):
    st = RegexpStemmer('ness$|ity$|ment', min=4)
    if word.endswith('acy'):
        stem = word[:-2]
        stem += 'te'
    elif word.endswith('cy'):
        stem = word[:-2]
        stem += 't'
    elif word.endswith('ility'):
        stem = word[:-5]
        stem += 'le'
        if stem not in model.vocab:
            stem = word[:-3]
    # elif word.endswith('ality'):
    #     stem = word[:-5]
    #     if stem not in model.vocab:
    #         stem = word[:-3]
    elif word.endswith('ce'):
        stem = word[:-2]
        stem += 't'
    else:
        stem = st.stem(word)
    if stem.endswith('i'):
        stem = stem[:-1] + 'y'
    return stem
def stemming(word):
    # Use stemmers for removing morphological affixes from words.
    # Portst = PorterStemmer()
    # Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed')
    new = Regst.stem(word)
    return new
def cleanText(raw_text):
    text = raw_text
    # replace non-alpha characters
    text = re.sub(r'[^a-z\s]+', '', text, flags=re.IGNORECASE)
    # replace multiple spaces with a single one
    text = re.sub(r'(\s+)', ' ', text)
    # convert string to lower case
    text = text.lower()
    # regex tokenizer that removes punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    # initial tokenization
    tokenized_text = tokenizer.tokenize(text)
    # stemmer to remove plurals
    stemmer = RegexpStemmer('s$|ies$')
    # stop words (English stop word list, inlined)
    stop_words = set([
        'whom', 'that', 'those', "needn't", 'where', 'has', 'same', 'had', 'we', 'my',
        'hers', 'does', 'they', 'the', 'only', "doesn't", 'be', 'mightn', 'her', 'wasn',
        'being', 'am', 'but', 'themselves', 'during', "don't", 'into', 'its', 'isn', 'of',
        'won', 'few', 'as', 'own', 'more', "shouldn't", 'myself', "mightn't", 'after', 'below',
        "didn't", "you've", 'wouldn', 'any', 'his', 'in', 'hasn', "weren't", 'him', 'she',
        'will', "won't", 'it', 'y', 'he', 'now', 'such', 'haven', 'most', 'who',
        'an', 'shan', 'at', "she's", 'were', 'weren', 'do', 'did', 've', 'all',
        'between', 'above', "you're", 'no', "you'll", 'which', 'i', 'been', 'doesn', "hasn't",
        'each', 'some', 'don', "aren't", 'should', 'mustn', 'our', "wouldn't", 'their', 'your',
        'yours', 'doing', 'why', "hadn't", 'down', 'so', 'for', 'while', 'this', "shan't",
        'there', 'needn', 'up', 'shouldn', 'by', "mustn't", 'have', 'yourself', "you'd", 'd',
        "haven't", 'about', 'ain', 'or', 'ourselves', 'when', "couldn't", 'is', 'with', "that'll",
        'these', 'further', "should've", 'if', 'than', 'just', "wasn't", 'other', "isn't", 'you',
        'then', 'how', 'too', 'until', 'very', 'are', 'to', 'itself', 'aren', 't',
        'a', 'before', 'm', 'can', 'out', 'and', 'under', 'here', 'o', 'on',
        'theirs', 'ma', 'couldn', 'having', 'himself', 'against', 'again', 'll', 'nor', 'hadn',
        'ours', 'through', 'both', 'because', 'what', 's', 'them', 'not', 'off', 'me',
        "it's", 'once', 'over', 'didn', 'was', 're', 'from', 'yourselves', 'herself'])
    clean_text = []
    for word in tokenized_text:
        if word not in stop_words:
            # make plurals singular
            token = stemmer.stem(word)
            clean_text.append(token)
    return clean_text
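# A quick standalone check of the plural-stripping rule used above; a minimal sketch,
# and the example words are our own rather than part of the original function.
from nltk.stem import RegexpStemmer

plural_stemmer = RegexpStemmer('s$|ies$')
print(plural_stemmer.stem('cats'))     # 'cat'
print(plural_stemmer.stem('words'))    # 'word'
print(plural_stemmer.stem('berries'))  # 'berr' -- 'ies' is stripped whole, not rewritten to 'y'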
def stem_words(text):
    words = word_tokenize(text)
    # regex covering common suffixes
    st = RegexpStemmer(
        'ing$|s$|able$|ible$|ful$|less$|ive$|acy$|al$|ance$|ence$|dom$|er$|or$|'
        'ism$|ist$|ity$|ty$|ment$|ship$|sion$|tion$|ate$|en$|ify$|fy$|ize$|ise$',
        min=4)
    stemmed = []
    for word in words:
        stemmed.append(st.stem(word))
    return ' '.join(stemmed)
def __init__(self, language):
    self.language = language
    if self.language == "eng":
        self.model = WordNetLemmatizer()
    elif self.language == "nso":
        self.model = RegexpStemmer('ng$', min=4)
    else:
        self.model = None
def stemming(word):
    # Use stemmers for removing morphological affixes from words.
    Portst = PorterStemmer()
    Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed')
    new = Portst.stem(word)
    if new == word:
        new = Landst.stem(word)
    if new == word:
        new = Regst.stem(word)
    return new
def __init__(self, db):
    super().__init__(db)
    self.nltk_data_path = os.path.join(os.getcwd(), 'nltk_data')
    # Remove contraction affixes from a word: it's -> it, we'll -> we
    stemmer_pattern = r"’s$|n’t$|’ll$|’re$|’ve$|’d$|’m$|'s$"
    stemmer_pattern += r"|n't$|'ll$|'re$|'ve$|'d$|'m$"
    self.stemmer = RegexpStemmer(stemmer_pattern)
    # Part-of-speech tagger
    self.tagger = nltk.tag.pos_tag
    self.wordnetlemmatize = WordNetLemmatizer()
    self._stop_words = None
    self._junk_symbols = None
    self._proper_nouns = None
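# A quick standalone check of the contraction-stripping idea above; a minimal sketch
# using a simplified ASCII-only variant of the pattern, with demo words of our own.
from nltk.stem import RegexpStemmer

_demo = RegexpStemmer(r"n't$|'ll$|'re$|'ve$|'d$|'m$|'s$")
print(_demo.stem("it's"))    # it
print(_demo.stem("we'll"))   # we
print(_demo.stem("don't"))   # do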
def analyze(text, stop, stem, wstem):
    # Set utilities
    if stop:
        stopeng = set(stopwords.words('english'))
    if wstem:
        stemmer = RegexpStemmer('ing$|s$|e$', min=4)
    if stem:
        stemmer = PorterStemmer()
    tok = RegexpTokenizer(r'\w+')
    # Remove weird characters
    text = stripSpecial(text)
    # Tokenize and lowercase
    text = tok.tokenize(text.lower())
    # Remove stopwords if flagged
    if stop:
        text = [w for w in text if w not in stopeng]
    # Stem if flagged
    if stem or wstem:
        text = [stemmer.stem(w) for w in text]
    return ' '.join(text)
class word_lemmatiser:
    def __init__(self, language):
        self.language = language
        if self.language == "eng":
            self.model = WordNetLemmatizer()
        elif self.language == "nso":
            self.model = RegexpStemmer('ng$', min=4)
        else:
            self.model = None

    def lemma(self, x):
        if self.language == "eng":
            return self.model.lemmatize(x[0])
        elif self.language == "nso":
            return self.model.stem(x[0].lower())
        elif self.language == "zul":
            return x[2]
        else:
            return x[0]

    def identity(self, word):
        return word
def word_refiner(*args):
    Portst = PorterStemmer()
    Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed|ly|lly')
    args = [i for i in args if isinstance(i, unicode)]
    for w in map(str, args):
        if w in dic1:
            yield w
        else:
            st1 = Portst.stem(w)
            if st1 in dic1:
                yield st1
            else:
                st2 = Landst.stem(w)
                if st2 in dic1:
                    yield st2
                else:
                    st3 = Regst.stem(w)
                    if st3 in dic1:
                        yield st3
                    else:
                        yield w
def normalize(sentences, stem_type):
    G = nx.DiGraph()
    # Create stemmer object of the type specified by stem_type
    stemmers = {
        '-p': PorterStemmer(),
        '-l': LancasterStemmer(),
        '-s': SnowballStemmer('english'),
        '-w': WordNetLemmatizer(),
        '-r': RegexpStemmer('ing$|s$|e$|able$', min=4)
    }
    try:
        stemmer = stemmers[stem_type]
    except KeyError:
        print('\nInvalid stemmer type passed as argument.\n')
        return

    # Define collections to reference during normalization
    punc = set(string.punctuation)
    stop = stopwords.words('english')

    # Iterate over sentences, normalizing and creating
    # vertices for our graph as we go
    i = 0
    for s in sentences:
        if len(s) > 1:
            l = (s.lower()).split(' ')
            # eliminate stop words
            norm = [w for w in l if w not in stop]
            # apply stemming to each word
            if stem_type == '-w':
                norm = [stemmer.lemmatize(w) for w in norm]
            else:
                norm = [stemmer.stem(w) for w in norm]
            # remove punctuation from each word
            temp = []
            for w in norm:
                w = ''.join([l for l in w if l not in punc])
                temp += w
                temp += ' '
            norm = ''.join(temp)
            G.add_node(i, iden=i, raw=s, nrm=norm)
            i += 1
    return G
def stemming(lines, algorithm=3):
    # selecting the algorithm to use
    # total 57370 features on 12/11/2015
    if algorithm == 0:
        # results with this algorithm: 56700 features
        stemmer = PorterStemmer()
    elif algorithm == 1:
        # results with this algorithm: 57731 features
        stemmer = LancasterStemmer()
    elif algorithm == 2:
        # results with this algorithm: 58007 features
        stemmer = RegexpStemmer('ing$|s$|e$|able$', min=4)
    elif algorithm == 3:
        # results with this algorithm: 56282 features (stopwords removed after stemming)
        # 55230 if stopwords are removed first with method == 2
        stemmer = SnowballStemmer("english")
    elif algorithm == 4:
        # results with this algorithm: 56795 features
        wnl = WordNetLemmatizer()
    else:
        raise ValueError('Algorithm value should be in [0-4]')

    stemmed_lines = []
    # run through all lines
    for each_line in lines:
        a_line_stemmed = ''
        # tokenize each line
        tokens = each_line.split()
        # run through all tokens
        for each_token in tokens:
            # stem each token and join the tokens back together
            if algorithm == 4:
                a_line_stemmed = a_line_stemmed + ' ' + wnl.lemmatize(each_token)
            else:
                a_line_stemmed = a_line_stemmed + ' ' + stemmer.stem(each_token)
        # recreate the list all over
        stemmed_lines.append(a_line_stemmed)
    return stemmed_lines
def example3(word='Amevive'):
    '''Stemming algorithm examples.'''

    # nltk.stem.lancaster module -- recommended stemmer 1
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    print st.stem(word)

    # nltk.stem.porter module -- recommended stemmer 2
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    print stemmer.stem(word)

    # nltk.stem.regexp module -- regular-expression-based stemmer
    from nltk.stem import RegexpStemmer
    st = RegexpStemmer('ing$|s$|e$', min=4)
    print st.stem(word)

    # nltk.stem.snowball module -- multi-language stemmer
    from nltk.stem import SnowballStemmer
    stemmer = SnowballStemmer('english')  # Choose a language
    print stemmer.stem(word)
def stemming(tokens, Type='ps', rgxRule='ing$|s$|ed$', MIN=4):
    '''
    Code adapted from "Text Analytics with Python" by Dipanjan Sarkar.
    Stems the tokens to obtain their roots.

    Stemmers:
        - PorterStemmer
        - LancasterStemmer
        - SnowballStemmer  # can stem other languages
        - RegexpStemmer    # user-defined rules
    '''
    stemmers = {
        'ps': PorterStemmer(),
        'ls': LancasterStemmer(),
        'sn': SnowballStemmer("english"),
        'rg': RegexpStemmer(rgxRule, MIN)
    }
    stemmer = stemmers[Type]
    stemmed_list = []
    for i in tokens:
        stemmed_list = stemmed_list + [stemmer.stem(i)]
    return stemmed_list
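# A minimal usage sketch for the function above, assuming the stemmer classes are
# imported as in the original module; the token list here is hypothetical.
# 'rg' selects the RegexpStemmer branch with a user-defined suffix rule.
tokens = ['jumping', 'jumps', 'jumped', 'lying']
print(stemming(tokens, Type='rg', rgxRule='ing$|s$|ed$', MIN=4))
# expected: ['jump', 'jump', 'jump', 'ly']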
def remove_english(text, cooking_list):
    stemmer = RegexpStemmer("ed$|'s$")
    stemmer1 = RegexpStemmer("d$")
    text = treebank_tokenizer.tokenize(text)
    lemmatized_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    lemmatized_text = [w for w in lemmatized_text if w not in cooking_list]
    lemmatized_stemmed_text = []
    for w in lemmatized_text:
        w = stemmer.stem(w)
        w = stemmer1.stem(w)
        lemmatized_stemmed_text.append(w)
    tokenized_Italian_text = [w for w in lemmatized_stemmed_text if w not in words.words()]
    Italian_text = ' '.join(tokenized_Italian_text)
    # removing all the numbers and special characters
    Italian_text = re.sub(r'[^a-zA-ZÀ-ÿ.\s]', '', Italian_text)
    return Italian_text
#!/usr/bin/env python
# coding: utf-8

# # Task-6
# ## A. TYPES OF STEMMERS

# ### I. REGEX STEMMER

# In[1]:

import nltk
from nltk.stem import RegexpStemmer

stemmerregexp = RegexpStemmer('ing')
stemmerregexp.stem('running')


# ### II. SNOWBALL STEMMER

# In[7]:

import nltk
from nltk.stem import SnowballStemmer

SnowballStemmer.languages
frstemmer = SnowballStemmer('french')
frstemmer.stem('manges')


# ### III. LANCASTER STEMMER
# <nbformat>2</nbformat>

# <markdowncell>
# <h2>Stemming Words</h2>
# <p>Stemming is the process of removing <em>affixes</em> from a word to obtain its root, or <em>stem</em>.
# For example, the stem of <strong>growing</strong> is <strong>grow</strong>.</p>
# <p>NLTK includes 4 stemming algorithms, 3 of which are demonstrated below. The fourth, <em>Snowball</em>,
# is for non-English languages and is not covered here but is in the text.</p>

# <codecell>
from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer

porter = PorterStemmer()
lancaster = LancasterStemmer()
reg = RegexpStemmer('ing')

g = 'growing'
print 'Porter yields: ', porter.stem(g)
print 'Lancaster yields: ', lancaster.stem(g)
print 'Regexp yields: ', reg.stem(g)

# <markdowncell>
# <p>The output for a given word can differ between stemmers:</p>

# <codecell>
g = 'cookery'
print 'Porter yields: ', porter.stem(g)
print 'Lancaster yields: ', lancaster.stem(g)
print 'Regexp yields: ', reg.stem(g)
def OutputRelations(abstractFileName, seta, negSet, neutralSet, negationSet, posSet, fullNames, threshold):
    # added threshold to the input format
    # recent change: no longer using a filename for the abstract; instead, the abstract string is passed in
    import nltk
    import copy
    import re
    from nltk.stem.lancaster import LancasterStemmer
    from nltk.stem import RegexpStemmer

    # sentencedb indexes the sentences by a unique identifier (int)
    sentencedb = dict()
    fullnamestore = dict()
    a = readf(fullNames)
    for i in a:
        i = i.split(";")
        if len(i) > 1:
            # storing the full names, using the short symbols as dict keys
            fullnamestore[i[0]] = i[1]
        else:
            fullnamestore[i[0]] = "none"

    def isGene(x, t, sentence):
        # checks if gene 'x' in a list of tokens 't' is really a gene or a variable with the same name
        if len(t) > 1 and len(x) > 2:
            if t.index(x) == 0:
                if t[t.index(x) + 1] in [">", "<", "=", "score"]:
                    return False
            elif t.index(x) == len(t) - 1:
                if t[t.index(x) - 1] in [">", "<", "=", "score"]:
                    return False
            elif (t[t.index(x) + 1] in [">", "<", "=", "score"]) or (t[t.index(x) - 1] in [">", "<", "=", "score"]):
                return False
            elif (t[t.index(x) + 1], t[t.index(x) - 1]) == (")", "("):
                if x in fullnamestore:
                    if fullnamestore[x] != "none":
                        fullLength = len(fullnamestore[x])  # length of the full name
                        if t.index(x) > len(fullnamestore[x]) + 2:
                            if sentence[(t.index(x) - 1 - fullLength):(t.index(x) - 1)] == fullnamestore[x]:
                                return True
                            else:
                                return False
                        else:
                            return True
            return True
        else:
            return False

    def countgenes(s, geneset):
        # counts the number of unique genes in a sentence "s"
        ss = nltk.word_tokenize(s)
        numgenes = 0
        existingGenes = []
        for i in ss:
            if i in geneset and isGene(i, ss, s) and i not in existingGenes:
                numgenes += 1
                existingGenes.append(i)
        return numgenes

    def countWords(gene1, gene2, token):
        # counts the words between gene1 and gene2
        count = 0
        for i in xrange(token.index(gene1) + 1, token.index(gene2) - 1):
            count += 1
        return count

    storage = dict()
    b = []
    for x in abstractFileName.split("\n\n"):
        x = x.replace("\n", " ")
        b.append(x)
    parsedB = []
    for line in b:
        if len(line) > 0:
            parsedB.append(line)
    b = parsedB

    # split into sentences at ". " followed by a capital letter
    sentencelist = re.split(r"\. (?=[A-Z])", b[-2])
    sentencelistcopy = copy.deepcopy(sentencelist)
    l = len(sentencelist)
    for i in xrange(l):
        if countgenes(sentencelistcopy[i], seta) < 2:
            sentencelist.remove(sentencelistcopy[i])
    storage[b[-1].split()[1]] = sentencelist

    num_genes = 0
    bw = 0
    gene_names = seta
    st = RegexpStemmer('ing$|s$|e$|ed$|es$', min=4)

    def findsuf(string, x):
        # returns the last x characters of string
        a = ""
        for i in xrange(x):
            a += string[len(string) - 1 - (x - i - 1)]
        return a

    finalOutput = []
    for id in storage:
        countsentences = 0
        for sentence in storage[id]:
            rlist = [0, 0, 0]
            tokens = nltk.word_tokenize(sentence)
            tokenscopy = copy.deepcopy(tokens)
            tagged = nltk.pos_tag(tokens)
            for x in tagged:
                if x[1] in ['VBP', 'VBN', 'VBZ', 'VBG', 'VB']:
                    tokenscopy[tagged.index(x)] = st.stem(x[0])
            store = 0
            genes = []
            relation = 2
            currentlist = []
            direction = 0
            for x in tokens:
                if x in gene_names and x not in currentlist and isGene(x, tokens, sentence):
                    genes.append(x)
                    num_genes += 1
                    currentlist.append(x)
            in1 = tokens.index(genes[0])
            in2 = tokens.index(genes[1])
            indexx = 0
            neg = 1
            if countWords(genes[0], genes[1], tokenscopy) <= threshold:
                for i in xrange(in1 + 1, in2):
                    if tokenscopy[i] in posSet:
                        relation = 1
                    elif tokenscopy[i] in negSet:
                        relation = -1
                    # elif tokenscopy[i] in neutralSet:
                    #     relation = 0
                    if tokenscopy[i] in negSet or tokenscopy[i] in posSet:
                        for y in xrange(in1 + 1, tokenscopy.index(tokenscopy[i])):
                            if tokenscopy[y] == "not":
                                relation = 0  # 2 means neutral
                        if findsuf(tokens[i], 2) == "ed":
                            direction = 1
                        else:
                            direction = 0
            if direction == 0:
                rlist = [genes[0], genes[1], relation]
            elif direction == 1:
                rlist = [genes[1], genes[0], relation]
            # if relation != "none":
            if True:
                # the above condition is so that it does not output sentences for which no relation
                # has been found. This makes analysis easier. Must change this during the final program.
                sentencedb[countsentences] = sentence
                # use this to have the sentences represented by a number
                # change id to pmid
                finalOutput.append([id, sentence, rlist[0], rlist[1], rlist[2]])
                # use this to have the actual sentences in the output
                # finalOutput.append([id, countsentences, rlist])
            countsentences += 1
    return finalOutput
import numpy as np
import pandas as pd
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import RegexpStemmer
from nltk.stem.snowball import SnowballStemmer
from tabulate import tabulate
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

stop = stopwords.words("english")
st = RegexpStemmer('ing$', min=8)
stemmer = SnowballStemmer("english")
encoding = "utf-8"


class LoadData():
    def __init__(self):
        pass

    def load_data_file(self, file):
        self.data = pd.read_csv(file)

    def test_train_data(self):
        msk = np.random.rand(len(self.data)) < 1.0
        train = self.data[msk]
        test = self.data[~msk]
print(ls.stem(w))

stem_word_list = [ls.stem(w) for w in words_list]
print(stem_word_list.count('jump'))
print(stem_word_list)
print(ls.stem("lying"))
print(ls.stem("strange"))

"""
There are several other stemmers, including RegexpStemmer, where you can build your own
stemmer based on user-defined rules, and SnowballStemmer, which supports stemming in 13
different languages besides English.
"""

# Regex-based stemmer
from nltk.stem import RegexpStemmer

rs = RegexpStemmer("ing$|s$|ed$", min=4)
for w in words_list:
    print(rs.stem(w))
print(rs.stem("lying"))
print(rs.stem("strange"))

# Snowball stemmer
from nltk.stem import SnowballStemmer

ss = SnowballStemmer("german")
print("supported languages are :", SnowballStemmer.languages)
german_cars = "autobahnen"
#!/usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import csv

from nltk.stem import RegexpStemmer

if __name__ == '__main__':
    patterns = 'i$|t$'
    regexp_stemmer = RegexpStemmer(patterns, 3)
    result_list = list()
    for word in ['Péter', 'szereti', 'Enikőt', 'és', 'Marit']:
        stem = regexp_stemmer.stem(word)
        result_list.append([word, stem])
    with open('output/regexp.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'stem'])
        for i in result_list:
            writer.writerow(i)
    print('See the result in output/regexp.csv')
# In[7]:

import nltk
from nltk.stem import LancasterStemmer

stemmerlanc = LancasterStemmer()
stemmerlanc.stem('darling')  # doesn't work well here either


# In[8]:

from nltk.stem import RegexpStemmer

regexpStemmer = RegexpStemmer('ing')
regexpStemmer.stem('dancing')  # strips 'ing' but yields 'danc', not 'dance'


# In[10]:

import nltk
from nltk.stem import SnowballStemmer

SnowballStemmer.languages
frenchstemmer = SnowballStemmer('french')
frenchstemmer.stem('manges')


# In[11]:
from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer, SnowballStemmer, WordNetLemmatizer
from features.process_text.patterns import get_stemming_pattern
from nltk import pos_tag
from nltk.corpus import wordnet
import re
from features.process_text.tokenize import is_tokenized, merge_tokens, word_tokenize

_stemming_porter = PorterStemmer().stem
_stemming_lancaster = LancasterStemmer().stem
_stemming_regex = RegexpStemmer(get_stemming_pattern()).stem
_stemming_snowball = SnowballStemmer('english').stem

_STEMMING_DICT = {
    'porter': _stemming_porter,
    'lancaster': _stemming_lancaster,
    'regex': _stemming_regex,
    'snowball': _stemming_snowball
}


def convert_word_stem(string, stemming_id='porter'):
    """Converts words to their word stems."""
    test = string.split()
    stemming = _STEMMING_DICT.get(stemming_id)
    return " ".join([stemming(word_token) for word_token in test])


# correcting repeated characters
import nltk
from nltk.stem import RegexpStemmer

st1 = RegexpStemmer('ing')
print("Learning - ", st1.stem('Learning'))  # 'Learn'
print("Singing - ", st1.stem('Singing'))    # 'S' -- every 'ing' match is removed
print()

st2 = RegexpStemmer('na')
print("Banana - ", st2.stem('Banana'))      # 'Ba'
print ps.stem('strange')

# lancaster stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()
print ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped')
print ls.stem('lying')
print ls.stem('strange')

# regex stemmer
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|s$|ed$', min=4)
print rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped')
print rs.stem('lying')
print rs.stem('strange')

# snowball stemmer
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("german")
print 'Supported Languages:', SnowballStemmer.languages

# autobahnen -> cars
Created on Fri Apr 8 11:03:16 2016

@author: shen
"""

import time
start_time = time.time()

import numpy as np
import pandas as pd

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
from nltk.stem import RegexpStemmer
st = RegexpStemmer('s$', min=4)

import re, math
from collections import Counter
from sklearn.ensemble import RandomForestRegressor
from sklearn import pipeline, grid_search
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, make_scorer
import random
random.seed(22)

strNum = {
import nltk
from nltk.stem import RegexpStemmer

stemmerregexp = RegexpStemmer('ing')
print(stemmerregexp.stem('working'))
print(stemmerregexp.stem('happiness'))
print(stemmerregexp.stem('pairing'))
ls = LancasterStemmer()
print(ls.stem("jumping"))
print(ls.stem("jumps"))
print(ls.stem("jumper"))
print(ls.stem("strange"))
print(ls.stem("stranger"))
print(ls.stem("lying"))

# REGEXP STEMMER
# Uses regular expressions to identify morphological affixes in words; any part of the
# string matching the pattern is removed.
# Note that this stemmer is case sensitive (it won't strip capitalized affixes);
# see the short demo after this block.
rs = RegexpStemmer(r"ing$|s$|ed$", min=4)
print(rs.stem("jumping"))
print(rs.stem("colored"))
print(rs.stem("lying"))

# SNOWBALL STEMMER
# Stems words in over a dozen languages. http://snowballstem.org
ss = SnowballStemmer(language="german")
print("Supported languages: {}".format(SnowballStemmer.languages))
print(ss.stem("autobahnen"))
print(ss.stem("endlich"))
print(ss.stem("unglaublich"))
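# A small sketch illustrating the case-sensitivity note above: the lowercase pattern
# 'ing$|s$|ed$' leaves upper-case affixes untouched, so uppercase input should be
# lowercased first. The example words here are our own, not from the original snippet.
from nltk.stem import RegexpStemmer

rs_demo = RegexpStemmer(r"ing$|s$|ed$", min=4)
print(rs_demo.stem("JUMPING"))          # 'JUMPING' -- the pattern does not match uppercase 'ING'
print(rs_demo.stem("JUMPING".lower()))  # 'jump'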
def regexStemmer(self, term):
    v_sufixos = ['ando', 'endo', 's', 'é']
    expr = 's$|es$'
    stemmer = RegexpStemmer(expr)
    return stemmer.stem(term)
from nltk.tokenize import WhitespaceTokenizer
wh_tokenizer = WhitespaceTokenizer()
wh_tokenizer.tokenize(sentence5)

# 5. WordPunct Tokenizer
from nltk.tokenize import WordPunctTokenizer
wp_tokenizer = WordPunctTokenizer()
wp_tokenizer.tokenize(sentence5)

# Regexp Stemmer
sentence6 = "I love playing Cricket. Cricket players practice hard."
from nltk.stem import RegexpStemmer
regex_stemmer = RegexpStemmer('ing$')
' '.join([regex_stemmer.stem(wd) for wd in sentence6.split()])

# Porter Stemmer
sentence7 = "Before eating, it would be nice to sanitize your hands with a sanitizer"
from nltk.stem.porter import PorterStemmer
ps_stemmer = PorterStemmer()
' '.join([ps_stemmer.stem(wd) for wd in sentence7.split()])

# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
print([pst.stem(w) for w in words])

result = []
for w in words:
    result.append(pst.stem(w))

# Lancaster Stemmer
words = ["sending", "cooking", "files", "lives", "crying", "dying"]
from nltk.stem import LancasterStemmer
lst = LancasterStemmer()  # create the stemmer object
print([lst.stem(w) for w in words])

# Regular-expression stemmer (RegexpStemmer)
words = ["sending", "cooking", "files", "lives", "crying", "dying"]
from nltk.stem import RegexpStemmer
lst = RegexpStemmer('ing')
print([lst.stem(w) for w in words])

# Spanish stemming (SnowballStemmer)
words2 = ['enviar', 'cocina', 'moscas', 'vidas', 'ilorar', 'morir']
from nltk.stem.snowball import SnowballStemmer
sbst = SnowballStemmer('spanish')
print([sbst.stem(w) for w in words2])

# Lemmatization (WordNetLemmatizer)
word3 = ['coocking', 'believes']
from nltk.stem.wordnet import WordNetLemmatizer
wl = WordNetLemmatizer()
print([wl.lemmatize(w) for w in word3])
print([wl.lemmatize(w, pos='v') for w in word3])
def get_prescription(text):
    pstem = PorterStemmer()
    with open("symptoms.txt") as f:
        symptoms = f.readlines()
    finalsyns = []
    for word in symptoms:
        syns = wordnet.synsets(word.strip())
        syns = [s.lemma_names() for s in syns]
        merged = list(itertools.chain(*syns))
        if len(merged) == 0:
            finalsyns = finalsyns + [pstem.stem(word.strip())]
        else:
            finalsyns = finalsyns + merged
    finalsyns = [f.replace('\n', '') for f in finalsyns]
    finalsyns = list(dict.fromkeys(finalsyns))

    pstem = PorterStemmer()
    rstem = RegexpStemmer(r'\(s\)')  # strips a literal "(s)", e.g. "tablet(s)" -> "tablet"

    def words_in_string(word_list, a_string):
        return set(word_list).intersection(a_string)

    with open("Amount.txt") as f:
        Amount = f.readlines()
    Amount = [rstem.stem(x.strip()).split(' - ') for x in Amount]
    Amount = list(chain(*Amount))

    prescription_dataset = tuple(open("dataset2.txt", 'r'))

    with open("Frequency.txt") as f:
        frequency = f.readlines()
    frequency = [rstem.stem(x.strip()).split(' - ') for x in frequency]
    frequency = list(chain(*frequency))

    schedule = {}
    with open("schedule.txt") as f:
        for line in f:
            s = line.split(':')
            schedule[s[0].strip().lower()] = s[1].strip()

    data = {}
    prescription = text.lower()
    prescription_tokenized = [word.replace(".", "").replace("(", "").replace(")", "")
                              for word in prescription.split()]
    prescription_tokenized_final = [pstem.stem(word) for word in prescription_tokenized]
    print(prescription)
    data.update({'prescription': prescription})

    amount = ""
    for word in Amount:
        if pstem.stem(word) in prescription_tokenized_final or rstem.stem(word) in prescription_tokenized:
            index = prescription_tokenized_final.index(pstem.stem(word))
            amount = prescription_tokenized[index - 1] + " " + prescription_tokenized[index]
    if amount == "":
        print("Amount not mentioned!")
    else:
        print("Amount : " + amount)

    freq = ""
    timing = ""
    if "every" in prescription or "each" in prescription:
        if "every" in prescription:
            ei = prescription_tokenized.index("every")
            re = "every"
        elif "each" in prescription:
            ei = prescription_tokenized.index("each")
            re = "each"
        st = ["minutes", "minute", "hours", "hour", "meal", "day", "days", "morning", "evening", "afternoon"]
        for i in range(0, 10):
            s = st[i]
            if s in prescription_tokenized:
                ti = prescription_tokenized.index(s)
                if ti - ei == 2:
                    freq = re + " " + prescription_tokenized[ei + 1] + " " + s
                elif ti - ei == 1 and i >= 7:
                    freq = re + " " + s
        if schedule.get(freq.strip()) is not None:
            timing = schedule.get(freq.strip())

    for word in frequency:
        if word in prescription and word.strip() != "":
            freq = freq + " " + word
            if schedule.get(word.strip()) is not None:
                timing = schedule.get(word.strip())
    if freq == "":
        print("No Frequency mentioned!")

    symptoms = ""
    for s in finalsyns:
        if s in prescription:
            symptoms += " " + s

    _check = ["", None]
    if freq in _check:
        return {'error': "No prescription found!"}
    data.update({"Amount": amount, "Symptoms": symptoms, "Frequency": freq, "Timings": timing})
    return data
more_stop_en = set(get_stop_words("english"))
more_stop_es = set(get_stop_words("spanish"))
stop_es = set(stopwords.words("spanish"))
stop_en = set(stopwords.words("english"))

adj = Adjectives()
tdm = textmining.TermDocumentMatrix()

room = 0
# for line in var:
for line in sys.stdin:
    room += 1
    line = line.replace("á", "a").replace("é", "e").replace("í", "i").replace("ó", "o").replace("ú", "u")
    line = re.sub("[0-9,#,!,¡,&,.]", "", line)
    line = re.sub("[^a-zA-Z]", " ", line)
    words = line.lower().split()
    st = RegexpStemmer("ing$|s$|able$|thing$|ful$", min=4)
    words = [st.stem(w) for w in words]
    words = [w for w in words if not w in stop_en and not w in stop_es]
    words = [w for w in words if not w in more_stop_en and not w in more_stop_es]
    words = [w for w in words if len(w) > 2]
    tdm.add_doc(" ".join(words))
    good_count = [words.count(ad) for ad in adj.good if ad in words]
    good_count = len(good_count)
    bad_count = [words.count(ad) for ad in adj.bad if ad in words]
    bad_count = len(bad_count)
    print "Comentario " + str(room) + ": Sentiment Score: " + str(good_count - bad_count) + "\n"
lstemmer = LancasterStemmer()
lstemmer.stem('dancing')

"""
$$ LancasterStemmer - Most aggressive. LancasterStemmer is mostly used when the data or
text is very large, but accuracy may drop because of its aggressive nature.
"""

######## RegexpStemmer ###########
rstemmer = RegexpStemmer('ing')  # removes every occurrence of the pattern from a given word
rstemmer.stem('cooking')
rstemmer.stem('dancing')
rstemmer.stem('king')  # note: 'king' becomes just 'k', since 'ing' matches here too;
                       # be careful with unanchored patterns (see the sketch below).

"""
That's the end of the stemming concept. If you have any questions or suggestions
regarding it, feel free to contact me via [email protected]
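# A minimal sketch (not from the original snippet) showing two ways to avoid
# over-stemming short words such as 'king': anchor the pattern to the end of the
# word with '$', and set a minimum word length via the 'min' argument.
from nltk.stem import RegexpStemmer

safe_stemmer = RegexpStemmer('ing$', min=5)
print(safe_stemmer.stem('king'))     # 'king'    -- shorter than min=5, left untouched
print(safe_stemmer.stem('cooking'))  # 'cook'    -- 'ing' stripped only at the end
print(safe_stemmer.stem('kingdom'))  # 'kingdom' -- 'ing' is not word-final, so nothing matches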