Example #1
import nltk
from nltk import WordNetLemmatizer as wnl

def stem_tokens(tokens):
    # Drop English stopwords, strip surrounding punctuation from each
    # token, and lemmatize what remains.
    stop = nltk.corpus.stopwords.words('english')
    punctuation = ",.;:'()"
    lemmatizer = wnl()
    return [
        lemmatizer.lemmatize(item.strip(punctuation)) for item in tokens
        if item not in stop
    ]
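A quick usage sketch; the input tokens are illustrative, and the stopwords and wordnet corpora must be downloaded once beforehand:

# Illustrative input; run nltk.download('stopwords') and
# nltk.download('wordnet') first.
tokens = ['the', 'cats', 'sat', 'on', 'the', 'mats', '.']
print(stem_tokens(tokens))
# Expected: ['cat', 'sat', 'mat', ''] -- the '.' token strips down to an empty string.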
Example #2
def __init__(self, data_dir, df=10**6):
    # Paths to the on-disk term, tweet-index, and tweet-vector stores.
    self.terms_store_loc = "%s/terms_store.db" % data_dir
    self.tweet_index_loc = "%s/tweet_index.db" % data_dir
    self.tweet_store_loc = "%s/tweet_vector.db" % data_dir
    # Term/id lookup tables and frequency counters, filled in later.
    self.term_id_map = None
    self.id_term_map = None
    self.id_term_freq = None
    self.total_freq = 0
    self.id_index_map = None
    # Shared WordNetLemmatizer instance and the df parameter.
    self.lmtz = wnl()
    self.df = df
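Only __init__ appears in this snippet; a minimal self-contained sketch of how it might be used, assuming a hypothetical enclosing class named TweetStore:

from nltk import WordNetLemmatizer as wnl

class TweetStore:
    # Hypothetical class name; the body reuses part of the __init__ above.
    def __init__(self, data_dir, df=10**6):
        self.terms_store_loc = "%s/terms_store.db" % data_dir
        self.lmtz = wnl()
        self.df = df

store = TweetStore("/tmp/data")
print(store.terms_store_loc)  # /tmp/data/terms_store.db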
Example #3
def lemmaExtractor(self, term):
    # Return the lemma only when it differs from the original term;
    # otherwise return an empty string.
    lemma = wnl().lemmatize(term)
    if lemma == term:
        return ""
    return lemma
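The same check is easy to exercise standalone; a small sketch (the function name lemma_or_empty is hypothetical, with the body copied from the method above):

from nltk import WordNetLemmatizer as wnl

def lemma_or_empty(term):
    lemma = wnl().lemmatize(term)
    return lemma if lemma != term else ""

print(lemma_or_empty('cats'))  # 'cat' -- lemma differs from the input
print(lemma_or_empty('cat'))   # ''    -- already in base form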
Example #4
def __init__(self):
    # Cache a single WordNetLemmatizer instance on the object.
    self.wnl = wnl()
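Caching one lemmatizer instance, as Examples #2 and #4 do, is a common pattern: the WordNet data itself is loaded lazily on first use, so reusing the same object avoids repeated wrapper construction inside tight loops.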
Example #5
from nltk import WordNetLemmatizer as wnl
import nltk

# Lemmatize a single word.
lemm = wnl().lemmatize('cats')
print(lemm)  # cat
# Find named entities in a sentence.
sentence = "This is normal sentence by Neil Shah"

# Create tokens.
tokens = nltk.word_tokenize(sentence)

# POS-tag the tokens, then chunk them into a binary NE tree.
tagged = nltk.pos_tag(tokens)
temp = nltk.ne_chunk(tagged, binary=True)

# Recursively collect named-entity strings from the chunk tree.
def NE_ext(temp):
    entity_names = []
    # .label() replaces the .node attribute removed in NLTK 3.
    if isinstance(temp, nltk.Tree):
        if temp.label() == 'NE':
            entity_names.append(' '.join(child[0] for child in temp))
        else:
            for child in temp:
                entity_names.extend(NE_ext(child))
    return entity_names


# Extract only the named entities from the chunk tree.
NamedEntity = NE_ext(temp)
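With the standard NLTK models installed (punkt, averaged_perceptron_tagger, maxent_ne_chunker, words), the binary chunker should mark the two NNP tokens as one NE, so the expected result is:

print(NamedEntity)  # expected: ['Neil Shah']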