import nltk
from nltk import WordNetLemmatizer as wnl

def stem_tokens(tokens):
    # Drop English stopwords, strip surrounding punctuation, and lemmatize
    # the remaining tokens (despite the name, this lemmatizes rather than stems).
    stop = nltk.corpus.stopwords.words('english')
    punctuation = u",.;:'()"
    return [wnl().lemmatize(item.strip(punctuation))
            for item in tokens if item not in stop]
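# A minimal usage sketch, not from the original source; it assumes the NLTK
# 'punkt', 'stopwords', and 'wordnet' resources have been downloaded
# (e.g. via nltk.download). The sample sentence is illustrative only.
tokens = nltk.word_tokenize("The cats were sitting on the mats")
print(stem_tokens(tokens))  # expected output along the lines of ['The', 'cat', 'sitting', 'mat']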
def __init__(self, data_dir, df=10**6):
    # Locations of the on-disk stores under data_dir.
    self.terms_store_loc = "%s/terms_store.db" % data_dir
    self.tweet_index_loc = "%s/tweet_index.db" % data_dir
    self.tweet_store_loc = "%s/tweet_vector.db" % data_dir
    # Term/ID lookup tables and frequency counts, populated later.
    self.term_id_map = None
    self.id_term_map = None
    self.id_term_freq = None
    self.total_freq = 0
    self.id_index_map = None
    # Reuse a single lemmatizer instance rather than constructing one per call.
    self.lmtz = wnl()
    self.df = df
def lemmaExtractor(self, term):
    # Return the lemma only when it differs from the original term;
    # otherwise return an empty string.
    lemma = wnl().lemmatize(term)
    if lemma == term:
        return ""
    return lemma
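# A standalone sketch of the same check (the enclosing class is not shown in
# the original, so this re-implements the method as a free function purely
# for illustration):
from nltk import WordNetLemmatizer as wnl

def lemma_if_changed(term):
    lemma = wnl().lemmatize(term)
    return lemma if lemma != term else ""

print(lemma_if_changed('cats'))  # 'cat' -- lemma differs from the input
print(lemma_if_changed('dog'))   # ''    -- lemma equals the input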
def __init__(self):
    # Cache a single WordNetLemmatizer instance on the object.
    self.wnl = wnl()
from nltk import WordNetLemmatizer as wnl
import nltk

# Lemmatize
lemm = wnl().lemmatize('cats')
print(lemm)

# Finding named entities
sentence = "This is normal sentence by Neil Shah"

# Create tokens
tokens = nltk.word_tokenize(sentence)
# Find the POS tags of those tokens
tokens = nltk.pos_tag(tokens)
temp = nltk.ne_chunk(tokens, binary=True)

# Function that extracts named entities from the chunk tree.
# Note: NLTK 3 uses Tree.label() (formerly Tree.node).
def NE_ext(temp):
    entity_names = []
    if hasattr(temp, 'label') and temp.label():
        if temp.label() == 'NE':
            entity_names.append(' '.join(child[0] for child in temp))
        else:
            for child in temp:
                entity_names.extend(NE_ext(child))
    return entity_names

# Return only the named entities from the chunk tree
NamedEntity = NE_ext(temp)
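# Expected behaviour when run as-is (an assumption based on the standard NLTK
# models; it requires the 'punkt', 'wordnet', 'averaged_perceptron_tagger',
# 'maxent_ne_chunker', and 'words' resources to be downloaded): the
# lemmatizer prints 'cat', and with binary=True the chunker should label
# "Neil Shah" as a single NE, so NamedEntity should come out as ['Neil Shah'].
print(NamedEntity)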