def extract_entities2(text):
    entities = []
    # Optional backoff-tagger training (disabled):
    # t0 = nltk.DefaultTagger('NN')
    # t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    # t2 = nltk.BigramTagger(train_sents, backoff=t1)
    # t2.evaluate(test_sents)
    for sentence in sent_tokenize(text):
        print(sentence)
        tags = pos_tag(nltk.word_tokenize(sentence))
        tags = tagear(tags)
        chunks = ne_chunk(tags)
        # chunks.draw()
        for chunk in chunks:
            if hasattr(chunk, 'label'):
                print(chunk)
        entities.extend([chunk for chunk in chunks if hasattr(chunk, 'label')])
    return entities
def test_nltkNERParsing(self):
    testString = 'Natural Sciences and Engineering Research Council of Canada'
    unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
    getGPEs = []
    for treeBranch in chunked:
        if hasattr(treeBranch, 'label') and treeBranch.label() == 'GPE':
            getGPEs.append(str(treeBranch))
    self.assertEqual(1, len(getGPEs))

    testString = 'Milwaukee Foundation'
    unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
    # returns (S (PERSON Milwaukee/NNP) (ORGANIZATION Foundation/NNP))

    testString = 'New England Board of Higher Education'
    unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
    # returns (S (GPE New/NNP) (ORGANIZATION England/NNP Board/NNP) of/IN (PERSON Higher/NNP Education/NNP))

    testString = 'New England Board of Higher Education'
    unigrams = TokenizeOnWhitespacePunctuation(testString).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
def nameEntityExtract(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    print(sentences[0])
    print("the length of sentences is: " + str(len(sentences)))
    sent = sentences[0]
    print(nltk.ne_chunk(sent, binary=True))
def English_NER(sentence):
    # With binary=True, named entities are tagged only as NE
    print('Named entities tagged only as NE:')
    print(nltk.ne_chunk(sentence, binary=True))
    # Without binary, named entities get type labels such as PERSON, ORGANIZATION, GPE
    print('Named entities with type labels such as PERSON, ORGANIZATION, GPE:')
    print(nltk.ne_chunk(sentence))
def extractNE(sentence, withClass):
    # Extract words from the sentence (word_tokenize keeps stopwords and punctuation)
    words = nltk.word_tokenize(sentence)
    if withClass:
        tree = nltk.ne_chunk(nltk.pos_tag(words), binary=False)
        return extractNEwithClass(tree)
    else:
        tree = nltk.ne_chunk(nltk.pos_tag(words), binary=True)
        return extractNEwithoutClass(tree)
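# The helpers extractNEwithClass / extractNEwithoutClass are not shown in this snippet.
# A minimal sketch of what they might look like, assuming the tree returned by
# nltk.ne_chunk() above (these names and bodies are our assumption, not the original code):
def extractNEwithClass(tree):
    # Return (entity text, label) pairs for every named-entity subtree.
    return [(' '.join(tok for tok, tag in subtree.leaves()), subtree.label())
            for subtree in tree if hasattr(subtree, 'label')]

def extractNEwithoutClass(tree):
    # With binary=True every entity subtree is labelled 'NE'; keep only the text.
    return [' '.join(tok for tok, tag in subtree.leaves())
            for subtree in tree if hasattr(subtree, 'label')]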
def main():
    sent = nltk.corpus.treebank.tagged_sents()[22]
    print("sent (nltk):", sent)
    # print(nltk.ne_chunk(sent, binary=True))
    # print(nltk.ne_chunk(sent))
    sent = ie_preprocess("""Injured personnel consisting of six Schlum employees were immediately transported to nearby hospitals and most of them (were) discharged after having received treatment""")
    print(sent)
    print(nltk.ne_chunk(sent[0]))
def process_contents():
    try:
        for i in tokenized[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged)               # White, House as separate entities
            namedEnt = nltk.ne_chunk(tagged, binary=True)  # White House as a single NE
            namedEnt.draw()
    except Exception as e:
        print(str(e))
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt1 = nltk.ne_chunk(tagged)               # all named entities with category
            namedEnt2 = nltk.ne_chunk(tagged, binary=True)  # named entities without category
            namedEnt2.draw()
    except Exception as e:
        print(str(e))
def entity_names(self, tuple_list=None):
    if tuple_list is None:
        tuple_list = self.updated_element
    # Recognize the named entities contained in the tagged tokens
    tree = nltk.ne_chunk(tuple_list, binary=False)
    # Replace each recognized entity with its entity label. Entity nodes are
    # nltk.tree.Tree instances; plain (word, tag) tuples are kept unchanged.
    for el in tree:
        if isinstance(el, nltk.tree.Tree):
            # The chunk is an entity, so store its label tagged as a proper noun
            self.entity_named.append((el.label(), 'NNP'))
        else:
            self.entity_named.append(el)
    # print('Named entities', self.entity_named)
    # Update the element
    self.updated_element = self.entity_named
    return self.updated_element
def get_xmen_text(soup):
    # en_stopwords = set(nltk.corpus.stopwords.words('english'))
    raw = nltk.clean_html(str(soup))
    raw_trunc = raw[:raw.rfind('References')]
    sents = nltk.sent_tokenize(raw_trunc)
    words = [nltk.word_tokenize(sent) for sent in sents]
    poss = [nltk.pos_tag(word) for word in words]
    poss_filter = [filter_insignificant(pos, tag_suffixes=['DT']) for pos in poss]
    print(poss_filter)
    nes = [nltk.ne_chunk(pos, binary=True) for pos in poss_filter]

    def sub_leaves(tree, node):
        return [t.leaves() for t in tree.subtrees(lambda s: s.label() == node)]

    people = [sub_leaves(ne, 'NE') for ne in nes]
    people = [item for sublist in people for subsublist in sublist
              for subsubsublist in subsublist for item in subsubsublist
              if item not in ('NNP', 'NN', 'NNPS', 'JJ')]
    people = merge_people(people)
    fd = nltk.FreqDist(person for person in people if person != 'Magneto')
    fd.plot(50)
def ne_tag(sentences):
    tagged = raw_trigram_tag(sentences, tagger_file="tagger.pkl")[1]
    fin = []
    for tagged_sent in tagged:
        # print(tagged_sent)
        fin.append(nltk.ne_chunk(tagged_sent))
    return fin
def extract_ent():
    data_dir = "/Users/Brishti/Documents/Internships/scripts/"
    inputfile = open(data_dir + 'output3.txt', 'r')
    # outputfile = open(data_dir + 'entity.txt', 'w')
    for line in inputfile:
        # skip blank lines
        if re.match(r"^\s*$", line):
            continue
        line = line.split("|")
        for sent in nltk.sent_tokenize(line[2]):
            print("______")
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                print(chunk)
                # if hasattr(chunk, 'label') and chunk.label() == "PERSON":
                #     print(chunk.leaves())
                #     outputfile.write(line[0] + '|' + ' '.join(c[0] for c in chunk.leaves()) + '\n')
    inputfile.close()
def processor(data):
    try:
        tokenized = nltk.word_tokenize(data)
        tagged = nltk.pos_tag(tokenized)
        namedEnt = nltk.ne_chunk(tagged, binary=True)
        entities = re.findall(r'NE\s(.*?)/', str(namedEnt))
        descriptives = re.findall(r'\(\'(\w*)\',\s\'JJ\w?\'', str(tagged))
        if len(entities) > 1:
            pass
        elif len(entities) == 0:
            pass
        elif str(entities) == '_blank':
            pass
        else:
            print('Named: ', entities[0])
            print('Description: ')
            for eachDesc in descriptives:
                print(eachDesc)
                currentTime = time.time()
                dateStamp = datetime.datetime.fromtimestamp(currentTime).strftime('%Y-%m-%d %H:%M:%S')
                namedEntity = entities[0]
                relatedWord = eachDesc
                c.execute("INSERT INTO knowledgeBase (unix, dateStamp, namedEntity, relatedWord) VALUES (?,?,?,?)",
                          (currentTime, dateStamp, namedEntity, relatedWord))
                conn.commit()
    except Exception as e:
        print('failed in the first try of processor')
        print(str(e))
def ne_chunk(self, tweet_id, tweet_text):
    sent_ner = []
    ner_tweets = {}
    sents = nltk.sent_tokenize(tweet_text)
    for text in sents:
        text = nltk.word_tokenize(text)
        text = nltk.pos_tag(text)
        text = nltk.ne_chunk(text)
        ner_list = self.getNodes(text, [])
        if ner_list:
            sent_ner.extend(ner_list)
            # ner_list = self.get_entity_value(text)
            # sent_ner.extend(ner_list)
            for each_ner in ner_list:
                if each_ner[1] in ner_tweets and tweet_id not in ner_tweets[each_ner[1]]:
                    ner_tweets[each_ner[1]].append(tweet_id)
                else:
                    ner_tweets[each_ner[1]] = [tweet_id]
    return sent_ner, ner_tweets
def recuperarEntidadesEn(texto):
    ObjTag = Tokenizar()
    ObjDes = Desambiguar()
    Lista = []
    Lista2 = []
    for sentence in sent_tokenize(texto):
        tags = ObjTag.tagear(sentence)
        # tags = tagear(traducir(word_tokenize(sentence)))
        print(tags)
        parsed = ne_chunk(tags)
        print(parsed)
        for chunk in parsed:
            if hasattr(chunk, 'label'):
                # first (word, tag) leaf of the entity
                Lista2.append(chunk.leaves()[0])
                # full entity text
                Lista.append(' '.join(c[0] for c in chunk.leaves()))
        print(Lista2)
        print(ObjDes.DesambiguarTexto(Lista2, sentence))
        Lista2 = []
    return Lista
def process_content():
    for i in custom_tokenized[5:]:
        words = word_tokenize(i)
        tagged = nltk.pos_tag(words)
        namedEnt = nltk.ne_chunk(tagged)
        print(namedEnt)
def processor(data):
    namedEntArray = []
    try:
        tokenized = nltk.word_tokenize(data)
        tagged = nltk.pos_tag(tokenized)
        namedEnt = nltk.ne_chunk(tagged, binary=True)
        entities = re.findall(r'NE\s(.*?)/', str(namedEnt))
        # words tagged JJ, e.g. from pairs like ('great', 'JJ'), are treated as descriptions
        descriptives = re.findall(r'\(\'(\w*)\',\s\'JJ\w?\'', str(tagged))
        if len(entities) > 1:
            pass
        elif len(entities) == 0:
            pass
        else:
            print('_________________________')
            print('Named:', entities[0])
            print('Descriptions:')
            for eachDesc in descriptives:
                print(eachDesc)
    except Exception as e:
        print('failed in the main try of processor')
        print(str(e))
        time.sleep(555)
def parse_questions():
    print("Parsing Questions...")
    parsed_questions = {}
    with open(DIR + '/questions.txt', 'r') as f:
        data = f.read()
    questions = re.split(r'[\s]*</top>[\s]*', data)
    if len(questions[-1].strip()) == 0:
        questions.pop()
    qc = QuestionClassifier.QuestionClassifier()
    for question in questions:
        question_number = int(re.search(r"<num>[\s]*Number:[\s]*([0-9]+)", question).group(1))
        question = re.search(r"<desc>[\s]*Description:[\s]*([a-zA-Z0-9\-\?\'\. ]+)", question).group(1)
        question_words = nltk.word_tokenize(question)
        question_pos = nltk.pos_tag(question_words)
        question_nes = nltk.ne_chunk(question_pos)
        question_tree = Chunker.chunker.parse(question_pos)
        question_classification = qc.classify(question)
        qwords, nouns, nes = [], [], []
        for part in question_nes:
            try:
                nes.append((part.label(), part.leaves()[0][0]))
            except AttributeError:
                if part[1] == 'WP' or part[1] == 'WRB':
                    qwords.append(part[0])
                elif part[1] == 'NN' or part[1] == 'NNP':
                    nouns.append(part[0])
        # print(qwords, nouns, nes)
        # print(question_pos)
        parsed_questions[question_number] = {
            "question": question,
            "pos": question_pos,
            "ne": question_nes,
            "parse_tree": question_tree,
            "question_classification": question_classification,
            "question_words": qwords,
            "nouns": nouns,
            "ne_words": nes
        }
    with open(DIR + '/parsed_questions.txt', 'wb') as f:
        pickle.dump(parsed_questions, f)
def get_entities(self, sentences):
    """
    Return a dictionary with the results of the named-entity recognition analysis.

    Args:
        sentences: the list of sentences.

    Returns:
        dictionary mapping entity labels to sorted lists of entity strings.
    """
    entities = dict([])
    # Tokenization
    tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]
    # Part-of-speech tagging
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
    # Chunking
    chunked_nes = [nltk.ne_chunk(c) for c in pos_tagged_tokens]
    for tree in chunked_nes:
        for s in tree.subtrees(lambda t: t.height() == 2):
            if s.label() != 'S':
                entity = ' '.join(i[0] for i in s.leaves())
                if s.label() in entities.keys():
                    if entity not in entities[s.label()]:
                        entities[s.label()].append(entity)
                        entities[s.label()].sort()
                else:
                    entities[s.label()] = [entity]
    return entities
def get_NERs(path_to_seg):
    NER_dict = {}       # map entities to counts (i.e., # of occurrences in this seg)
    NERs_to_types = {}  # map the NERs to the kinds of things they are
    seg_text = open(path_to_seg).read()
    # strip *all* tags
    seg_text = strip_tags(seg_text, get_tags_in_text(seg_text))
    # tokenize, then POS-tag the text
    pos_tagged_seg = nltk.pos_tag(nltk.word_tokenize(seg_text))
    # and now the NER
    NERd_seg = nltk.ne_chunk(pos_tagged_seg)
    # kind of hacky, but this is how I'm parsing the induced tree structure
    for subtree in NERd_seg:
        if isinstance(subtree, nltk.tree.Tree):
            # then this is an NER; ignoring the *type* of NER for now -- entities
            # with the same name *ought* to be of the same type
            entity = subtree[0][0]        # the first token of the entity
            entity_type = subtree.label()
            # if we've already encountered it, just bump the count
            if entity in NER_dict:
                NER_dict[entity] += 1
            else:
                NER_dict[entity] = 1
            NERs_to_types[entity] = entity_type  # assume the type is always correct
    return NER_dict, NERs_to_types
def processor(data):
    try:
        tokenized = nltk.word_tokenize(data)
        tagged = nltk.pos_tag(tokenized)
        namedEnt = nltk.ne_chunk(tagged, binary=True)
        # print(namedEnt)
        entities = re.findall(r'NE\s(.*?)/', str(namedEnt))
        descriptives_adj = re.findall(r'\(\'(\w*)\',\s\'JJ\w?\'', str(tagged))
        # if len(entities) > 1:
        #     pass
        # elif len(entities) == 0:
        #     pass
        # else:
        print('Sentence with POS-tagging : ')
        print(str(tagged))
        print('-----------------------------------------------')
        print('Named Entity of the Sentence : ', entities)
        print('Descriptions : ')
        for desc in descriptives_adj:
            print(desc)
    except Exception as e:
        print('Failed in the first loop of processor')
        print(str(e))
def extract_normal_ne(self, text):
    result = []
    for sent in sent_tokenize(text) if text else []:
        for chunk in ne_chunk(pos_tag(word_tokenize(sent))):
            if hasattr(chunk, "label"):
                result.append(" ".join([c[0] for c in chunk.leaves()]))
    return result
def extract_named_entities(request):
    """
    Uses NLTK to extract named entities from a given text.
    """
    named_entities = []
    if request.GET:
        if 'text' not in request.GET:
            return HttpResponse('Please enter the text to analyze')
    else:
        return HttpResponse('Please enter the text to analyze')
    try:
        text = request.GET["text"]
        tokenized = nltk.word_tokenize(text)
        tagged = nltk.pos_tag(tokenized)
        result = nltk.ne_chunk(tagged)
        if len(result.productions()) > 1:
            for ne in result.productions()[1:]:
                name = ne.rhs()[0][0]
                pos_tag = ne.rhs()[0][1]
                inferred_type = ne.lhs().symbol()
                named_entities.append(
                    {"name": name, "pos_tag": pos_tag, "guessed_type": inferred_type})
    except:
        return HttpResponse('Failed to extract named entities from text "%s": %s'
                            % (text, str(sys.exc_info()[1])))
    return HttpResponse(json.dumps(named_entities))
def extract_concepts(text):
    """
    Uses the NLTK natural language processing library to extract the essential
    terms that appear in a text.
    """
    try:
        ignored_words = corpus.stopwords.words('english')
        ignored_words.append("n't")
        appeared = {}
        concepts = []
        tokenized = nltk.word_tokenize(text)
        tagged = nltk.pos_tag(tokenized)
        named_entities = nltk.ne_chunk(tagged)
        for ne in named_entities.leaves():
            # if ne[1] in ('NNS', 'NNP', 'NN'):
            if (len(ne[0]) > 2 and ne[0].lower() not in ignored_words
                    and not (ne[0].startswith("http") or ne[0].startswith("//"))):
                name = ne[0]
                if name in appeared:
                    continue
                concepts.append(name)
                appeared[name] = True
    except:
        print("extract concepts failed:", sys.exc_info())
    return concepts
def _getAnswer(self, text, extract_node):
    try:
        answer_list = []
        # Remove extra spaces and special characters from the text
        text = re.sub(r'\W+\d+\s+.,\'"&', '', text)
        # Start the extraction process: sentence tokenization, then word
        # tokenization and POS tagging, then chunk the text into pieces
        # that may contain the answer
        for sent in nltk.sent_tokenize(text):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label'):
                    # Check whether the chunk carries the needed node label
                    if chunk.label() == extract_node:
                        performer = ' '.join(c[0] for c in chunk.leaves())
                        answer_list.append(performer)
                        # Add to the central result set
                        result.append(performer)
        return answer_list
    except:
        print(" ERROR: Couldn't perform named entity recognition on this text")
def word_tokenize(sent):
    nonlocal time_pos
    nonlocal time_chunk
    # replace typographic marks with simple marks
    sent = sent.replace('…', '...')
    sent = sent.replace('”', "''")
    sent = sent.replace('“', ',,')
    sent = sent.replace('‚', ',')
    sent = sent.replace('’', "'")
    words = nltk.word_tokenize(sent)
    # strip punctuation from words
    words = [word.strip(string.punctuation) for word in words]
    words = [word for word in words if len(word) > 0]
    if not analyse_pos:
        return words
    else:
        start = time.time()
        tagged = tagger.tag(words)
        time_pos += (time.time() - start)
        if preserve_entities:
            start = time.time()
            chunks = nltk.ne_chunk(tagged, binary=ner_binary)
            time_chunk += (time.time() - start)
            word_list = []
            ne_concat(chunks, word_list)
            return word_list
        else:
            return [nltk.tuple2str(t) for t in tagged]
def get_entities(self, document):
    """
    Extract entities from a single document using nltk.ne_chunk.
    This method is called multiple times by the transform method.

    :param document: a list of lists of tuples
    :return entities: a list of space-joined entity strings
    """
    entities = []
    for paragraph in document:
        for sentence in paragraph:
            # the classifier chunks the sentence and adds category labels, e.g. PERSON
            trees = ne_chunk(sentence)
            # select only trees with the kinds of entities we want
            for tree in trees:
                if hasattr(tree, 'label'):
                    if tree.label() in self.labels:
                        # entities is a list; each entry is one entity found in the document
                        entities.append(
                            ' '.join([child[0].lower() for child in tree]))
    return entities
def question_processing(ques):
    global corpus, name, list_query
    list_query = []
    # corpus = []
    speak(random.choice(choices) + ' ' + name, False)
    # Step 1: Generate all tokens
    tokens = nltk.word_tokenize(ques)
    # Step 2: Part-of-speech tagging of the question
    pos_tags = nltk.pos_tag(tokens)
    # Step 3: Named entity recognition of the POS tags
    pos_tree = nltk.ne_chunk(pos_tags)
    # Filter all query words
    for i in pos_tags:
        if i[1] in ('NNP', 'NN', 'JJ', 'JJS', 'NNS', 'VBZ', 'RBS'):
            list_query.append(i[0])
    # print(list_query)
    collection_name = []
    # Get the matching list of collections (DBs) where the answer could be
    for i in list_query:
        if dict_collections.get(i.lower()):
            collection_name.append(dict_collections[i.lower()])
    # print(collection_name)
    # Aggregate all the documents from the list of collections
    db.cursor = db.questions.find()
    corpus = []
    for i in db.cursor:
        for t in collection_name:
            if t in i:
                corpus.append(i[t])
def processLanguage():
    try:
        opener = urllib.request.build_opener()   # urllib2.build_opener() in the original Python 2 code
        # opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        url = "http://disqus.com/embed/comments/?disqus_version=82d70f54&base=default&f=cnn&t_i=%2F2013%2F12%2F01%2Fpolitics%2Fobamacare-website%2Findex.html&t_u=http%3A%2F%2Fwww.cnn.com%2F2013%2F12%2F01%2Fpolitics%2Fobamacare-website%2Findex.html&t_e=Administration%3A%20Obamacare%20website%20working%20smoothly&t_d=Administration%3A%20Obamacare%20website%20working%20smoothly&t_t=Administration%3A%20Obamacare%20website%20working%20smoothly&t_c=207582&s_o=default#2"
        urlContent = opener.open(url).read()
        soup = BeautifulSoup(urlContent)
        title = soup.title.text
        body = soup.findAll('p')
        for item in body:
            sentence = item.text.encode('ascii', 'ignore').decode('ascii')
            tokenized = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(tokenized)
            namedEntity = nltk.ne_chunk(tagged, binary=True)
            compiler = re.compile(r"[(]['][a-zA-Z]+[']")
            for chunks in namedEntity:
                if compiler.match(str(chunks[0])):
                    chunk = str(chunks[0])
                    front = chunk[2:]
                    word = re.search(r"[a-zA-Z]+[']", front)
                    print(word.group(0)[:-1])
            # namedEntity.draw()
    except Exception as e:
        print(str(e))
def named_entities(text, types=None):
    """Return named entities found in a text.

    Adapted from emh's code (http://stackoverflow.com/users/2673189/emh)

    Parameters
    ----------
    text: str
        UTF-8 string
    types: list of strings
        Currently the list can include only "PERSON" and "ORGANIZATION"

    Returns
    -------
    dict
        Dictionary with one entry for each type of entity; each entry is a list
        of strings with the found entities.
    """
    if not types:
        types = ["PERSON", "ORGANIZATION"]
    named_entities = {"PERSON": [], "ORGANIZATION": []}
    tokens = nltk.tokenize.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    sentt = nltk.ne_chunk(pos, binary=False)
    for type_ in types:
        for subtree in sentt.subtrees(filter=lambda t: t.label() == type_):
            entity = ""
            for leaf in subtree.leaves():
                entity = entity + " " + leaf[0]
            named_entities[type_].append(entity.strip())
    return named_entities
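# A quick usage sketch for named_entities above (the sample sentence is made up and the
# exact entities depend on the NLTK chunker, so the printed result is only indicative):
example = named_entities("Tim Cook met engineers from Google in Paris.")
print(example)   # roughly {'PERSON': ['Tim Cook'], 'ORGANIZATION': ['Google']}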
def extract_entity(s): return ne_chunk(pos_tag(word_tokenize(s)))
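# A minimal usage sketch for extract_entity above (the sample sentence is made up);
# it flattens the returned tree into (entity text, label) pairs:
tree = extract_entity("Tim Cook visited Berlin in May.")
pairs = [(' '.join(tok for tok, _ in subtree.leaves()), subtree.label())
         for subtree in tree if hasattr(subtree, 'label')]
print(pairs)   # e.g. [('Tim Cook', 'PERSON'), ('Berlin', 'GPE')]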
def preprocessing(tokenize_text):
    pos_tag_text = pos_tag(tokenize_text)
    chunk_text = ne_chunk(pos_tag_text, binary=True)
    return chunk_text
print('\nperforming POS:')
text = nltk.word_tokenize(file_content)
print(nltk.pos_tag(text))  # performing POS tagging
# POS ends

# lemmatization starts
print('\nperforming lemmatization:')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
for w in wtokens:
    print(lemmatizer.lemmatize(w))  # performing lemmatization
# lemmatization ends

# trigram starts
print('\nperforming Trigram:')
from nltk import ngrams
n = 3  # n defines the order of the ngrams
trigrams = ngrams(file_content.split(), n)  # splitting with respect to n
for grams in trigrams:
    print(grams)
# trigram ends

# Named Entity Recognizer starts
print('\nPerforming NER:')
from nltk import word_tokenize, pos_tag, ne_chunk
print(ne_chunk(pos_tag(word_tokenize(file_content))))
# NER ends
def nltk_tagger(token_text):
    tagged_words = nltk.pos_tag(token_text)
    ne_tagged = nltk.ne_chunk(tagged_words)
    return ne_tagged
def write(filename, predictor):
    sentence = read_sentence(filename)
    for s in sentence:
        sentence_list, label_list = process_sentence(s)
        sen = mergeWords(sentence_list)
        # assign POS tags
        pos_list = []
        # truple = tree2conlltags(ne_chunk(pos_tag(word_tokenize(sen))))
        truple = tree2conlltags(ne_chunk(pos_tag(sentence_list)))
        # each tuple contains word, POS tag, NER label
        for item in truple:
            pos_list.append(item[1])
        # get word lemmas and stems
        wordnet_lemmatizer = WordNetLemmatizer()
        lemma_list = []
        for word in sentence_list:
            lemma_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        stem_list = []
        lancaster = LancasterStemmer()
        for word in sentence_list:
            stem_list.append(lancaster.stem(word))
        # assign constituency parent POS
        pos_parent_list, right_sublings_list, chunk_position, left_sublings_list = parse_consituency_tree(
            sentence_list, predictor)
        # append an empty row as a sentence separator
        sentence_list.append(" ")
        label_list.append(" ")
        pos_list.append(" ")
        pos_parent_list.append(" ")
        right_sublings_list.append(" ")
        chunk_position.append(" ")
        lemma_list.append(" ")
        stem_list.append(" ")
        left_sublings_list.append(" ")
        data = {}
        data["word"] = sentence_list
        data["label"] = label_list
        data["pos"] = pos_list
        data["chunk"] = pos_list
        data["pos_parent"] = pos_parent_list
        data["right_sublings_list"] = right_sublings_list
        data["chunk_position"] = chunk_position
        data["lemma_list"] = lemma_list
        data["stem_list"] = stem_list
        data["left_sublings_list"] = left_sublings_list
        df = pd.DataFrame(data)
        to_file = filename.split(".tsv")[0]
        to_file1 = to_file + "_feature_v1" + ".tsv"
        df.to_csv(to_file1, sep='\t', index=False, header=False, encoding="utf8", mode='a')
lemma = WordNetLemmatizer()
stem = PorterStemmer()
stem_wrds = []
lemma_wrds = []
for token in tokens:
    stem_wrds.extend([stem.stem(token)])
    lemma_wrds.extend([lemma.lemmatize(token)])
print(pos_tag(tokens))
print(stem_wrds)
print(lemma_wrds)

sent = "John Works in FVDS and stays in Chennai"
tokens = word_tokenize(sent)
chunked = ne_chunk(pos_tag(tokens))
for elt in chunked:
    if isinstance(elt, Tree):
        print(elt)

###############################################################################
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
from sklearn.metrics import confusion_matrix

inp = pd.read_excel(r"\Movie review.xlsx", encoding='utf-8')
X = inp.SNTC_TXT
y = inp.REVIEW
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
def extract_entities(self, doc):
    sentence_list = []
    for sent in sent_tokenize(doc):
        sentence_list.append(
            [chunk for chunk in ne_chunk(pos_tag(word_tokenize(sent)))])
    return sentence_list
import nltk
from nltk import ne_chunk
from nltk.tokenize import word_tokenize

NE_sent = "The Indian Politicians shouts in the Parliament House"

# In[32]:
NE_tokens = word_tokenize(NE_sent)
NE_tags = nltk.pos_tag(NE_tokens)

# In[33]:
NE_NER = ne_chunk(NE_tags)
print(NE_NER)

# # Chunking
# Picking up individual pieces of information and grouping them into bigger pieces

# In[34]:
new = "The cat sat on a mat and ate the rat"
new_Tokens = nltk.pos_tag(word_tokenize(new))
new_Tokens

# In[35]:
def parts_of_speech_flow(self, doc):
    sentences = sent_tokenize(doc)
    tokenized = [word_tokenize(sentence) for sentence in sentences]
    pos_tags = [pos_tag(sentence) for sentence in tokenized]
    # pos_tags is a list of tagged sentences, so use the sentence-level chunker
    return ne_chunk_sents(pos_tags, binary=True)
from nltk import pos_tag

sentence = word_tokenize("I always lie down to tell a lie.")
tags = pos_tag(sentence)
# print(tags)

import nltk
my_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
parser = nltk.ChartParser(my_grammar)
sentence = word_tokenize("I shot an elephant in my pajamas")
for tree in parser.parse(sentence):
    print(tree)
    # tree.draw()

from nltk import pos_tag, ne_chunk
chunk_list = ne_chunk(
    pos_tag(word_tokenize("Antonio joined Udacity Inc. in California.")))
print(chunk_list)
def extract_named_entities(text):
    entity_names = []
    entities = ne_chunk(pos_tag(word_tokenize(text)), binary=True)
    for tree in entities:
        entity_names.extend(extract_entity_names(tree))
    return entity_names
# read/create the text data
sent = "John is studying at Stanford University in California"

# Extract the entities
# Using NLTK
# import libraries
import nltk
from nltk import ne_chunk
from nltk import word_tokenize

# NER
ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=False)

# Using spaCy
import spacy
nlp = spacy.load('en')  # with spaCy 3.x load the model by its full name, e.g. 'en_core_web_sm'

# Read/create a sentence
doc = nlp(u'Apple is ready to launch new phone worth $10000 in New york time square ')
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
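# For a side-by-side comparison with the spaCy loop above, the NLTK tree can be flattened
# into (text, label) pairs; a minimal sketch (the helper name is our own, not from the original):
def nltk_entity_pairs(sentence):
    tree = ne_chunk(nltk.pos_tag(word_tokenize(sentence)), binary=False)
    return [(' '.join(tok for tok, _ in st.leaves()), st.label())
            for st in tree if hasattr(st, 'label')]

# e.g. [('John', 'PERSON'), ('Stanford University', 'ORGANIZATION'), ('California', 'GPE')]
print(nltk_entity_pairs(sent))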
sentences = re.findall(r'(.*?)[\.|\?|!+]', exampleReview)
for sent in sentences:
    print('******************************************************')
    print('******************************************************')
    print('The Sentence : ', sent)
    print('-----------------------------------------------')
    processor(sent)

'''
sentences = re.findall(r'(.*?)[\.|\?|!+]', exampleReview)
for sent in sentences:
    tokenized = nltk.word_tokenize(sent)
    tagged = nltk.pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    entities = re.findall(r'NE\s(.*?)/', str(namedEnt))
    descriptives_noun = re.findall(r'\(\'(\w*)\',\s\'NN\w?\'', str(tagged))
    descriptives_verbs = re.findall(r'\(\'(\w*)\',\s\'VB\w?\'', str(tagged))
    descriptives_adj = re.findall(r'\(\'(\w*)\',\s\'JJ\w?\'', str(tagged))
    descriptives_adverb = re.findall(r'\(\'(\w*)\',\s\'RB\w?\'', str(tagged))
    print('---------------------------------------------------------------------------------------------------------------------')
    print('*** The Sentence : ***')
    print(sent)
    print('*** POS-tagged Sentence : ***')
    print(str(tagged))
    print('*** Named Entity : ***')
    for entity in entities:
        print(entity)
    print('*** Nouns : ***')
def namedEntities(self, ex):
    ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(self.input_text)))
    return ne_tree
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')
cst = PunktSentenceTokenizer(train_text)
tknd = cst.tokenize(sample_text)

try:
    for i in tknd[5:]:
        word = nltk.word_tokenize(i)
        # print(word)
        tgd = nltk.pos_tag(word)
        nER = nltk.ne_chunk(tgd)
        print(nER)
except Exception as e:
    print(str(e))
sentences = nltk.sent_tokenize(article)
len(sentences)

# Tokenize each sentence into words: token_sentences
token_sentences = [nltk.word_tokenize(sent) for sent in sentences]
# this is actually broken up into lists of lists
len(token_sentences)
token_sentences[0]

# Tag each tokenized sentence into parts of speech: pos_sentences
pos_sentences = [nltk.pos_tag(sent) for sent in token_sentences]
print(nltk.ne_chunk(pos_sentences[2], binary=True))
len(pos_sentences)

# Create the named entity chunks: chunked_sentences
# (materialized as a list so it can be iterated more than once)
chunked_sentences = list(nltk.ne_chunk_sents(pos_sentences, binary=True))

# Test for subtrees of the chunked sentences with 'NE' tags
for sent in chunked_sentences:
    for chunk in sent:
        if hasattr(chunk, "label") and chunk.label() == "NE":
            print(chunk)

test = []
for sent in chunked_sentences:
    for chunk in sent:
def answer_processing(s_tuple, q_type, q_keywords, dependency):
    # http://nbviewer.jupyter.org/github/gmonce/nltk_parsing/blob/master/1.%20NLTK%20Syntax%20Trees.ipynb
    sentences = s_tuple
    print(len(sentences))
    answers = []
    grammar_passed_answers = []
    grammar_failed_answers = []
    # account for cases in which there are fewer than 5 answers
    num_answers_needed = 5 - len(sentences)
    if num_answers_needed > 0:
        for i in range(0, num_answers_needed):
            sentences.append(('100', 'nil'))
    for i in range(0, min(10, len(sentences))):
        doc_num = sentences[i][0]
        sentence = sentences[i][1]
        if q_type == WHEN_TYPE:
            sentence_after_tagging = timex.tag(sentence)
            when_answers = re.findall('<TIMEX2>(.*?)</TIMEX2>', sentence_after_tagging)
            # in case the answer comes out empty, output 'nil'
            when_answer = when_answers[0] if len(when_answers) != 0 else 'nil'
            answers.append((doc_num, when_answer))
        else:
            words = nltk.word_tokenize(sentence)
            pos_tag = nltk.pos_tag(words)
            ner_tree = nltk.ne_chunk(pos_tag)
            global subtree
            tmp = []
            for subtree in ner_tree.subtrees():
                if subtree.label() in NER_TAG[q_type]:
                    word = ' '.join(map(lambda x: x[0][0], subtree.pos()))
                    iskwin = map(lambda x: x in word, q_keywords)
                    if not any(iskwin):
                        answer = ' '.join(map(lambda x: x[0][0], subtree.pos()))
                        if answer not in map(lambda x: x[1], answers):
                            tmp.append(answer)
            # Optional grammar check (disabled):
            # if len(tmp) > 0 and dependency != '' and q_type == WHO_TYPE:
            #     try:
            #         p, f = grammar_stuff(tmp, sentence, dependency, doc_num)
            #         grammar_passed_answers += p
            #         grammar_failed_answers += f
            #     except:
            #         for answer in tmp:
            #             grammar_failed_answers.append((doc_num, answer))
            for answer in tmp:
                if answer not in map(lambda x: x[1], grammar_failed_answers):
                    grammar_failed_answers.append((doc_num, answer))
            # print("SENTENCE : ", sentence, "ANSWER : ", tmp)
    answers += grammar_passed_answers + grammar_failed_answers
    print('ANSWERS!!!!!')
    print(answers)
    return answers
    date_clean.append(date.today() + datetime.timedelta(days=d_plus * 30))
else:
    date_clean.append(parser.parse(d[0]))
print(date_clean)

# ## NLTK NER
# Chunking?

# In[99]:
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
from nltk import ne_chunk, pos_tag
chunked = ne_chunk(loc_tag)
print(chunked)

# ## Text Categorizer
# ## Custom Components
# ## Logic Engine to parse NE

# In[100]:
# If no second location, ask for start location
# If no second date, assume one way
taggedToken = pos_tag(token1)
print(taggedToken[:20])

# # English named-entity recognition

# In[33]:
nltk.download('words')
nltk.download('maxent_ne_chunker')

# In[34]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# In[37]:
# tokenization
token1 = word_tokenize("Barack Obama likes fried chicken very much")
print('token1', token1)
taggedToken = pos_tag(token1)
print('pos_tag', taggedToken)

from nltk import ne_chunk
neToken = ne_chunk(taggedToken)
print(neToken)

# In[ ]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nltk import word_tokenize, pos_tag, ne_chunk

sentence = "Mark and John are working at Google."
print(ne_chunk(pos_tag(word_tokenize(sentence))))
for token in tex:
    print(nltk.pos_tag([token]))

############################################################################################################
#### Named entity recognition
############################################################################################################
# Named entity recognition is the process of detecting named entities such as person names,
# location names, company names, quantities and monetary values.
text = "Google’s CEO Sundar Pichai introduced the new Pixel at Minnesota Roi Centre Event"

# import the chunk library from nltk
from nltk import ne_chunk
nltk.download('maxent_ne_chunker')
nltk.download('words')

# tokenize and POS-tag before chunking
token = word_tokenize(text)
tags = nltk.pos_tag(token)
chunk = ne_chunk(tags)
print(chunk)

############################################################################################################
#### Chunking
############################################################################################################
# Chunking means picking up individual pieces of information and grouping them into bigger pieces.
# In the context of NLP and text mining, chunking means grouping words or tokens into chunks.
text = "We saw the yellow dog"
token = word_tokenize(text)
tags = nltk.pos_tag(token)
reg = "NP: {<DT>?<JJ>*<NN>}"
a = nltk.RegexpParser(reg)
result = a.parse(tags)
print(result)
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

nltk.pos_tag("Machine Learning is great".split())

from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
print(porter_stemmer.stem('wolves'))

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('wolves'))

s = "Albert Einstein was born on March 14,1879"
tags = nltk.pos_tag(s.split())
print(tags)
nltk.ne_chunk(tags).draw()
print(nltk.ne_chunk(tags))
text1 = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
text2 = "Please advise on the options the deceased clients wife has in relation to this pension" \
        " She wishes to exercise ARF option if available "
text = "Hi I was trying to register online but I was n t recognised " \
       " My France number is 4824461 " \
       "Looking to register on Pension Planet Robert Manning" \
       " but Irish Ronnie Gardner website ca n t find my details " \
       "Richard Wade "
text = 'How can I pay my car renewal'

tokenized_text = word_tokenize(text)
ner_st = st.tag(tokenized_text)
print(ner_st)
pos_st = post.tag(tokenized_text)
print(pos_st)
exit()

pos_nltk = nltk.pos_tag(tokenized_text)
print(pos_nltk)
blob = TextBlob(text)
print(blob.tags)

print("tree stanford\n")
print("type of chunk", type(ne_chunk(pos_st)))
print("type of tree", len(tree2conlltags(ne_chunk(pos_st))))
print("tree nltk\n")
print(tree2conlltags(ne_chunk(pos_nltk)))
print("tree blob\n")
print(ne_chunk(pos_nltk))
print(tree2conlltags(ne_chunk(blob.tags)))
exit()
def entities(text): return ne_chunk(pos_tag(word_tokenize(text)))
        if isinstance(items, nltk.tree.Tree):
            # word = str(items[0]) + ' - ' + str(items.label())
            word = str(items[0])
        elif isinstance(items, str):   # `unicode` in the original Python 2 code
            word = str(items)
        if word.find('.') != -1:
            end_of_sentence = True
        sentence = sentence + word + " "
        if end_of_sentence:
            sentence = sentence[:len(sentence) - 1]
            sentence_list.append(sentence)
            text = nltk.word_tokenize(sentence)
            pos_tagged_sentence = nltk.pos_tag(text)
            ne_chunked_sentence = nltk.ne_chunk(pos_tagged_sentence)
            for words in ne_chunked_sentence:
                word = None
                pos_tag = None
                ner = None
                it = it + 1
                print(it)
                if isinstance(words, nltk.tree.Tree):
                    word = words[0][0]
                    pos_tag = words[0][1]
                    ner = words.label()
                    # print(words.label(), words[0][0], words[0][1])
                else:
                    word = words[0]
for thing in dummyDoc.ents:
    print(thing, end=" ")
    print(thing.label_, end=" ")
    print(thing.label, end=" ")
    print("\n")

dummyComment = "The latest example deals with the cost of the United States embassy in Jerusalem. The President publicly announced the cost in March: “We’re going to have it built very quickly and inexpensively,” he said. “They put an order in front of my desk last week for $1 billion . . . We’re actually doing it for about $250,000, so check that out.” The actual cost will be almost 100 times higher, as CNN reports. A contract summary file for the embassy from the Office of Acquisitions of the Department of State (available on usaspending.gov) puts the figure at $21.2 million."
print("DUMMY COMMENT: ", dummyComment)
dummyComment = ' '.join(
    [word for word in dummyComment.split() if word not in stop])
sentences = nltk.sent_tokenize(dummyComment)
# tokenize sentences into words --> sentences becomes a 2D list [[blah, blah, blah], [blah, blah, blah]]
sentences = [nltk.word_tokenize(sent) for sent in sentences]
# tag each word in a sentence with a part-of-speech label --> sentences becomes a 2D list of tuples
# --> [[(blah, yuh), (blah, yuh), (blah, yuh)], [(blah, yuh), (blah, yuh), (blah, yuh)]]
sentences = [nltk.pos_tag(sent) for sent in sentences]
print(sentences, "\n")

for tagged_sentence in sentences:
    for chunk in nltk.ne_chunk(tagged_sentence):
        # print(chunk)
        if isinstance(chunk, nltk.tree.Tree):
            print("CHUNK: ", type(chunk))
    if isinstance(document, str):
        document = document
    else:
        raise ValueError('Document is not string!')
    document = document.strip()
    sentences = nltk.sent_tokenize(document)
    sentences = [sentence.strip() for sentence in sentences]
    return sentences

# tokenize sentences
sentences = parse_document(text)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
# tag sentences and use nltk's Named Entity Chunker
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
ne_chunked_sents = [nltk.ne_chunk(tagged) for tagged in tagged_sentences]

# extract all named entities
named_entities = []
for ne_tagged_sentence in ne_chunked_sents:
    for tagged_tree in ne_tagged_sentence:
        # extract only chunks having NE labels
        if hasattr(tagged_tree, 'label'):
            entity_name = ' '.join(c[0] for c in tagged_tree.leaves())  # get NE name
            entity_type = tagged_tree.label()  # get NE category
            named_entities.append((entity_name, entity_type))

# get unique named entities
named_entities = list(set(named_entities))

# store named entities in a data frame
entity_frame = pd.DataFrame(named_entities,
corpus.question.dtype
corpus = pd.DataFrame(corpus)

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

# Output: a list of tuples containing the individual words in the
# sentence and their associated part of speech
sent = preprocess(str(corpus["question"]))
print(sent)

# Now we implement noun-phrase chunking to identify named entities using a
# regular expression consisting of rules that indicate how sentences should be chunked
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

# With the function nltk.ne_chunk(), we can recognize named entities using a classifier
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(str(corpus))))
print(ne_tree)
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
# print(cs)

# In[59]:
from nltk.chunk import conlltags2tree, tree2conlltags
nltk.download('maxent_ne_chunker')
nltk.download('words')
from pprint import pprint

iob_tagged = tree2conlltags(cs)
# print(iob_tagged)
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(text)))
# print(ne_tree)

# In[60]:
# Using spaCy for entity recognition
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

nlp = en_core_web_sm.load()
doc = nlp(text)
# print([(X.text, X.label_) for X in doc.ents])
# print([(X, X.ent_iob_, X.ent_type_) for X in doc])
filepaths.sort()
corpus = [open(f, 'r').read() for f in filepaths]
corpus = np.array(corpus)

dump = ''
entities = []
organizations = []
for j in range(corpus.shape[0]):
    dump += corpus[j]

tokenized = nltk.word_tokenize(dump)
tagged = nltk.pos_tag(tokenized)

## Generating a list of all entities
namedEnt = nltk.ne_chunk(tagged)
for i in namedEnt:
    if type(i) == Tree:
        for subtree in i.subtrees():
            name = ''
            for leaf in subtree.leaves():
                leaf_parts = list(leaf[0])
                for part in leaf_parts:
                    name += part
                name += ' '
            if subtree.label() == 'PERSON' and len(subtree) > 1:
                if name not in entities:
                    entities.append(name)