def replace_person_mail_names(filename, outputfile=None, replacetoken='***'):
    """Mask person names and the local part of e-mail addresses in a file.

    The text is tokenized, POS-tagged and NE-chunked with NLTK.  Every chunk
    labelled ``PERSON`` and every token that directly precedes an ``@``
    token is replaced by ``replacetoken``.

    Args:
        filename: Path of the text file to read.
        outputfile: Path to write the result to.  When falsy, ``filename``
            itself is overwritten.
        replacetoken: Replacement text (default ``'***'``).

    Returns:
        None.  The masked text is written to disk.
    """
    with open(filename, 'r') as f:
        rawtext = f.read()
    # word_tokenize keeps "Mrs.Truc" as one token and splits an address into
    # [local part, "@", domain]; wordpunct_tokenize would split on every dot.
    tokens = word_tokenize(rawtext)
    chunked_tokens = ne_chunk(nltk.pos_tag(tokens))
    if not outputfile:
        outputfile = filename
    with open(outputfile, 'w') as f:
        # Output lags one token behind so that the token preceding an "@"
        # can still be swapped for the replacement token.
        prec = ''
        for tok in chunked_tokens:
            if isinstance(tok, Tree):
                if tok.label() == 'PERSON':
                    curr = replacetoken
                else:
                    curr = ' '.join(x[0] for x in tok)
            else:
                curr = tok[0]
                if curr == '@':
                    # Keep whatever separator the masked token carried.
                    lead = ' ' if prec.startswith(' ') else ''
                    prec = lead + replacetoken
            f.write(prec)
            if curr in PUNCTUATION or "'" in curr:
                prec = curr
            else:
                prec = ' ' + curr
        # Flush the pending token *with* its separator; writing the bare
        # token here would glue it to the previous word.
        f.write(prec)
def _process_simpleHash(self, simpleHash): # Extract entities from keys resulting from SimpleExtractor process_* entityHash = {} for data in simpleHash: occs = simpleHash[data]['occurences'] proxLoc = simpleHash[data]['proxLoc'] # Tokenize sentences for sent in tokenize_sentences(data): # Tokenize words tokens = tokenize_words(sent) # Tag words with Parts of Speech tagged = pos_tag(tokens) # Identify named entities entities = ne_chunk(tagged) for ent in entities: if isinstance(ent, NLTKParseTree): # Is it a wanted type? if ent.node in self.types: # Should we keep the PoS tag? if self.keepPos: txts = ['/'.join(token) for token in ent.leaves()] else: txts = [token[0] for token in ent.leaves()] txt = ' '.join(txts) new = {txt: {'text': txt, 'occurences': occs, 'proxLoc': proxLoc[:]}} entityHash = self._mergeHash(entityHash, new) return entityHash
def getPersonOrPlaceAnswers(self, mainObj):
    """Pick a named entity of the requested type as the answer.

    Walks at most four of the paragraphs referenced by ``mainObj.sim``.  For
    each paragraph the sentences returned by ``getSentQueryCorrelation`` are
    NE-chunked, and the first entity labelled ``mainObj.question_type``
    whose stemmed first word is not a key of ``mainObj.qv`` becomes
    ``self.answer``.

    Returns:
        ``self.answer`` (unchanged if it was already set or nothing matched).
    """
    stemmer = PorterStemmer()  # built once instead of once per candidate
    corrIndex = 0
    itr = 0
    while self.answer == "" and corrIndex < len(mainObj.sim) and itr < 4:
        currentPara = mainObj.paras[mainObj.sim[corrIndex][0]]
        simCoeff = self.getSentQueryCorrelation(mainObj.question,
                                                mainObj.qv, currentPara)
        sentences = sent_tokenize(currentPara)
        answers = []
        for iSimilarSent in simCoeff:
            sent = sentences[iSimilarSent[0]]
            chunked = ne_chunk(pos_tag(word_tokenize(sent)))
            # One entry per label; a later chunk with the same label
            # overwrites an earlier one within the sentence.
            temp = {}
            for chunk in chunked:
                if isinstance(chunk, Tree):
                    temp[chunk.label()] = [c[0] for c in chunk]
            answers.append(temp)
        for entity in answers:
            if mainObj.question_type in entity:
                tAnswer = entity[mainObj.question_type][0]
                if stemmer.stem(tAnswer.lower()) not in mainObj.qv.keys():
                    self.answer = tAnswer
                    break
        corrIndex += 1
        itr += 1
    return self.answer
def extract_ner(text: str):
    """Return ``[label, entity_text]`` pairs for each named-entity chunk in *text*."""
    tagged_tokens = pos_tag(word_tokenize(text))
    return [
        [node.label(), ' '.join(item[0] for item in node)]
        for node in ne_chunk(tagged_tokens)
        if hasattr(node, 'label')
    ]
def replace_person_names_version2(filename, outputfile=None, replacetoken='***'):
    """Rewrite a text file with every ``PERSON`` chunk replaced by a token.

    Args:
        filename: Path of the text file to read.
        outputfile: Destination path; when falsy, ``filename`` is overwritten.
        replacetoken: Text written in place of each person name.

    Every token is written with a leading space, except tokens that are in
    ``PUNCTUATION`` or contain an apostrophe.
    """
    with open(filename, 'r') as src:
        rawtext = src.read()
    tagged = nltk.pos_tag(word_tokenize(rawtext))
    tree = ne_chunk(tagged)
    target = outputfile if outputfile else filename
    with open(target, 'w') as dst:
        for node in tree:
            if isinstance(node, Tree):
                if node.label() == 'PERSON':
                    piece = ' ' + replacetoken
                else:
                    piece = ' ' + ' '.join(leaf[0] for leaf in node)
            else:
                word = node[0]
                glued = word in PUNCTUATION or "'" in word
                piece = word if glued else ' ' + word
            dst.write(piece)
def frequency():
    """Prompt for text, then print its POS tags and word frequencies.

    Keeps prompting until ``validate_skin_input`` accepts the text.  After
    the reports are printed, the user may ask for the named-entity parse
    tree to be drawn.
    """
    valid_input = False
    while not valid_input:
        tips = input("Share your beauty tips: ")
        valid_input = validate_skin_input(tips)
    words_tokenize = word_tokenize(tips)
    words_tagged = pos_tag(words_tokenize, tagset="universal")

    print("\nPart of speech words")
    for word, tag in words_tagged:
        print(f"{word}: {tag}")

    print("\nFrequency Distributions")
    freq_dist = FreqDist(words_tokenize)
    for fd, count in freq_dist.most_common():
        print(f"{fd}: {count}")

    user_input = ""
    # Compare by value: ``is`` tests object identity, which is not
    # guaranteed for strings returned by input().
    while user_input not in ("Y", "N"):
        user_input = input(
            "Do you want to show parse tree? [Y|N, case sensitive]: ")
    if user_input == "Y":
        ne_chunk(words_tagged).draw()
def tag_with_NLTK(text_and_id):
    """Return ``({(entity_text, label), ...}, id)`` for a ``(text, id)`` pair."""
    text, doc_id = text_and_id
    tree = ne_chunk(pos_tag(word_tokenize(text)))
    found = set()
    for node in tree:
        if hasattr(node, 'label'):
            found.add((' '.join(leaf[0] for leaf in node), node.label()))
    return (found, doc_id)
def extract_entities(words):
    """Return the lower-cased text of every named-entity chunk in *words*.

    Args:
        words: A list of word tokens (already tokenized).

    Returns:
        A list with one space-joined, lower-cased string per entity chunk,
        in order of appearance.
    """
    entities = []
    for chunk in ne_chunk(pos_tag(words)):
        # Entity chunks are subtrees, plain tokens are (word, tag) tuples.
        # ``leaves`` exists on trees in both NLTK 2 and 3, whereas the
        # ``node`` attribute was replaced by ``label()`` in NLTK 3.
        if hasattr(chunk, 'leaves'):
            performer = ' '.join(c[0] for c in chunk.leaves())
            entities.append(performer.lower())
    return entities
def postags(self, doc):
    """POS-tag, NE-chunk and filter the title, desc and text fields of *doc*.

    For each field ``f`` in ``'title'``, ``'desc'``, ``'text'`` this mutates
    *doc* in place:

    * ``doc.postags[f]`` is extended with the tags of every sentence in
      ``doc.tokens[f]``;
    * ``doc.entities[f]`` becomes the de-duplicated texts of the chunks
      found by ``ne_chunk(..., binary=True)`` (order is not defined, the
      values pass through a ``set``);
    * ``doc.topmod[f]`` becomes the tagged tokens whose tag is in
      ``self.topmod_list``;
    * both ``doc.topmod[f]`` and ``doc.postags[f]`` then get words swapped
      through ``self.replace_list`` and are filtered against the English
      stop words, ``self.punct`` and ``self.remove_list``.

    Returns:
        None.
    """
    # Invariant for the whole call: build the exclusion set once instead of
    # concatenating three lists for every single token.
    excluded_words = set(
        stopwords.words('english') + self.punct + self.remove_list[0])
    excluded_tags = self.remove_list[1]

    def _replace(pairs):
        # Swap words that have an entry in self.replace_list.
        return [(self.replace_list[t[0]], t[1])
                if t[0] in self.replace_list.keys() else (t[0], t[1])
                for t in pairs]

    def _filter(pairs):
        # Drop excluded words (case-insensitively) and excluded tags.
        return [t for t in pairs
                if lower(t[0]) not in excluded_words
                and t[1] not in excluded_tags]

    for f in ['title', 'desc', 'text']:
        for sentence in doc.tokens[f]:
            doc.postags[f].extend(pos_tag(sentence))
        chunks = [c for c in ne_chunk(doc.postags[f], binary=True)
                  if hasattr(c, '_label')]
        doc.entities[f] = list(
            set(' '.join(l[0] for l in e.leaves()) for e in chunks))
        candidates = [t for t in doc.postags[f] if t[1] in self.topmod_list]
        doc.topmod[f] = _filter(_replace(candidates))
        doc.postags[f] = _filter(_replace(doc.postags[f]))
def nltk_entity_groups(text):
    """Return the text of every NE chunk NLTK finds in *text*, in tree order."""
    tagged = pos_tag(word_tokenize(text))
    groups = []
    for subtree in ne_chunk(tagged).subtrees():
        # The root sentence node is labelled 'S'; everything else is a chunk.
        if subtree.label() == 'S':
            continue
        groups.append(' '.join(leaf[0] for leaf in subtree.leaves()))
    return groups
def chunk(self, tags):
    """Return the individual words of all named entities found in *tags*.

    Named-entity recognition uses the ``ne_chunk`` chunker that ships with
    NLTK.
    """
    phrases = [
        ' '.join(leaf[0] for leaf in node)
        for node in ne_chunk(tags)
        if hasattr(node, 'label')
    ]
    # Join then split so multi-word entities are flattened into single
    # words; this yields [''] when no entity was found.
    return ' '.join(phrases).split(' ')
def mywarWritten2(*args):
    """Draw the named-entity tree of the selected sentence.

    The text is read from ``myvar`` and the sentence index from ``myvar2``;
    the positional arguments are accepted and ignored.
    """
    sentences = tokenize.sent_tokenize(myvar.get())
    index = int(myvar2.get())
    words = tokenize.word_tokenize(sentences[index])
    chunk.ne_chunk(tag.pos_tag(words)).draw()
def find_named_entity(sentence, word_tokenizer):
    """Compare NE chunks from NLTK's default tokenizer and a custom one.

    Returns:
        ``[default_count, custom_count, default_entities, custom_entities]``
        where each entity is rendered as ``"LABEL word word ..."``.
    """
    # Normalise U+2019 (right single quote) to U+0027 (apostrophe).
    sentence = sentence.replace('\u2019', '\u0027')

    def _labelled_entities(tokens):
        return [
            node.label() + ' ' + ' '.join(c[0] for c in node.leaves())
            for node in ne_chunk(pos_tag(tokens))
            if hasattr(node, 'label')
        ]

    default_entity_word = _labelled_entities(word_tokenize(sentence))
    custom_entity_word = _labelled_entities(word_tokenizer.tokenize(sentence))
    return [len(default_entity_word), len(custom_entity_word),
            default_entity_word, custom_entity_word]
def postags(self, doc):
    """POS-tag, NE-chunk and filter the title, desc and text fields of *doc*.

    For each field ``f`` in ``'title'``, ``'desc'``, ``'text'`` this mutates
    *doc* in place:

    * ``doc.postags[f]`` is extended with the tags of every sentence in
      ``doc.tokens[f]``;
    * ``doc.entities[f]`` becomes the de-duplicated texts of the chunks
      found by ``ne_chunk(..., binary=True)`` (order is not defined, the
      values pass through a ``set``);
    * ``doc.topmod[f]`` becomes the tagged tokens whose tag is in
      ``self.topmod_list``;
    * both ``doc.topmod[f]`` and ``doc.postags[f]`` then get words swapped
      through ``self.replace_list`` and are filtered against the English
      stop words, ``self.punct`` and ``self.remove_list``.

    Returns:
        None.
    """
    # Invariant for the whole call: build the exclusion set once instead of
    # concatenating three lists for every single token.
    excluded_words = set(
        stopwords.words('english') + self.punct + self.remove_list[0])
    excluded_tags = self.remove_list[1]

    def _replace(pairs):
        # Swap words that have an entry in self.replace_list.
        return [(self.replace_list[t[0]], t[1])
                if t[0] in self.replace_list.keys() else (t[0], t[1])
                for t in pairs]

    def _filter(pairs):
        # Drop excluded words (case-insensitively) and excluded tags.
        return [t for t in pairs
                if lower(t[0]) not in excluded_words
                and t[1] not in excluded_tags]

    for f in ['title', 'desc', 'text']:
        for sentence in doc.tokens[f]:
            doc.postags[f].extend(pos_tag(sentence))
        chunks = [c for c in ne_chunk(doc.postags[f], binary=True)
                  if hasattr(c, '_label')]
        doc.entities[f] = list(
            set(' '.join(l[0] for l in e.leaves()) for e in chunks))
        candidates = [t for t in doc.postags[f] if t[1] in self.topmod_list]
        doc.topmod[f] = _filter(_replace(candidates))
        doc.postags[f] = _filter(_replace(doc.postags[f]))
def menu_2():
    """Print POS tags and word frequencies of ``user_sentences``.

    Afterwards asks whether to draw the named-entity tree and repeats the
    question until the answer is exactly ``Y`` or ``N``.
    """
    tokenized = word_tokenize(user_sentences)
    pt = pos_tag(tokenized)
    fd = FreqDist(tokenized).most_common()

    print('Part of Speech')
    for pair in pt:
        print(pair[0], '-', pair[1])

    print('Frequency')
    for entry in fd:
        print(entry[0], '-', entry[1])

    show = input('Want draw show tree Y/N : ')
    while show not in ('Y', 'N'):
        show = input('Want draw show tree Y/N : ')
    if show == 'Y':
        ne_chunk(pt).draw()
def make_syntax_trees(sentences):
    """Return one NE-chunked tree per sentence, printing a progress index."""
    trees = []
    for position, sentence in enumerate(sentences):
        print(position)  # shows how far the program has got
        tagged = tag.pos_tag(tokenize.word_tokenize(sentence))
        trees.append(chunk.ne_chunk(tagged))
    return trees
def get_names(sentence):
    """Return named-entity words plus words ending in ``s'``.

    Args:
        sentence: A list of word tokens.

    Returns:
        The words of every NE chunk, followed by every token of *sentence*
        that ends with ``s'``.
    """
    assert isinstance(sentence, list), "Sentence must be tokenized first"
    names = []
    for node in ne_chunk(nltk.tag.pos_tag(sentence)):
        if isinstance(node, Tree):
            names.extend(leaf[0] for leaf in node.leaves())
    possessives = [token for token in sentence if token.endswith("s'")]
    return names + possessives
def nltk_entity_groups(text):
    """Return all contiguous NER-tagged chunks found by NLTK.

    ``pos_tag`` attaches a part-of-speech tag to each word; ``ne_chunk``
    then returns a nested ``nltk.tree.Tree`` that has to be traversed to
    reach the named entities.  See https://www.nltk.org/book/ch07.html
    """
    tree = ne_chunk(pos_tag(word_tokenize(text)))
    # Every subtree except the root sentence node ('S') is an entity chunk.
    entity_subtrees = filter(lambda t: t.label() != 'S', tree.subtrees())
    return [' '.join(l[0] for l in t.leaves()) for t in entity_subtrees]
def get_nltk_vectors(self, texts: List[str]):
    """Build per-token NLTK-derived feature tensors and run ``self.nltk_nn``.

    For every text this gathers TextBlob sentiment, two polarity scorers,
    word embeddings, POS and NE tag embeddings, RAKE keyword counts, a token
    mask and a has-digit flag.  Everything is concatenated along dimension 2
    and passed through ``self.nltk_nn``.

    NOTE(review): all features are padded/expanded to ``self.n_tokens_in``
    tokens; presumably the result is (batch, n_tokens_in, features) --
    confirm against ``stack_and_pad_tensors``.
    """
    # https://gist.github.com/japerk/1909413
    from textblob import TextBlob
    sid = self.nltk_sid
    vsid = self.vader_sid
    pdict = self.pdict
    n_tokens_in = self.n_tokens_in
    rake = self.rake_nltk
    nltk_texts = [fasttext.tokenize(text) for text in texts]
    # Text-level sentiment, repeated for every token position.
    textblob_sentiments = [[sentiment.polarity, sentiment.subjectivity] for sentiment in [TextBlob(text).sentiment for text in texts]]
    textblob_sentiments = torch.tensor(textblob_sentiments).unsqueeze(1).expand(len(texts), n_tokens_in, 2)
    textblob_sentiments = textblob_sentiments.to(get_device())
    # Ones for each real token, then embedded by is_mask_em.
    mask = stack_and_pad_tensors(list(map(lambda x: torch.ones(len(x), dtype=int), nltk_texts)), n_tokens_in)
    mask = mask.to(get_device())
    mask = self.is_mask_em(mask)
    has_digit = stack_and_pad_tensors(
        list(map(lambda x: torch.tensor([has_digits(str(t)) for t in x]), nltk_texts)), n_tokens_in)
    has_digit = has_digit.to(get_device())
    has_digit = self.has_digit_em(has_digit)
    m = self.text_model
    # NOTE(review): m[t] is looked up without a membership check, so this
    # presumably relies on the model handling unseen tokens -- confirm.
    nltk_emb = stack_and_pad_tensors([torch.tensor([m[t] for t in sent]) for sent in nltk_texts], n_tokens_in)  # if t in m else np.zeros(m.vector_size)
    nltk_emb = nltk_emb.to(get_device())
    # Text-level polarity scores from both scorers, repeated per token.
    sid_vec = torch.tensor([list(sid.polarity_scores(t).values()) for t in texts])
    sid_vec = sid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, sid_vec.size(1))
    sid_vec = sid_vec.to(get_device())
    vsid_vec = torch.tensor([list(vsid.polarity_scores(t).values()) for t in texts])
    vsid_vec = vsid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, vsid_vec.size(1))
    vsid_vec = vsid_vec.to(get_device())
    # (token, pos_tag, iob_ne_tag) triples for each text.
    conlltags = [[ptags for ptags in nltk.tree2conlltags(ne_chunk(pos_tag(x)))] for x in nltk_texts]
    pos = stack_and_pad_tensors(
        list(map(lambda x: torch.tensor([pdict[tag.lower()] for token, tag, ne in x]), conlltags)), n_tokens_in)
    pos = pos.to(get_device())
    pos_emb = self.tag_em(pos)
    # The text after the last "-" of the NE tag is used, so B-/I- prefixes
    # map to the same pdict entry.
    ner = stack_and_pad_tensors(
        list(map(lambda x: torch.tensor([pdict[ne.lower().split("-")[-1]] for token, tag, ne in x]), conlltags)), n_tokens_in)
    ner = ner.to(get_device())
    ner_emb = self.tag_em(ner)
    phrases = [get_rake_nltk_phrases(rake, t) for t in texts]
    key_wc_rake_nltk = [get_rake_nltk_wc(tokens, phr) for tokens, phr in zip(nltk_texts, phrases)]
    key_wc_rake_nltk = stack_and_pad_tensors(key_wc_rake_nltk, self.n_tokens_in)
    key_wc_rake_nltk = key_wc_rake_nltk.to(get_device())
    nltk_rake_vectors = self.key_wc_rake_nltk(key_wc_rake_nltk)
    # Concatenate every feature along dimension 2.
    result = torch.cat([vsid_vec, nltk_emb, textblob_sentiments, pos_emb, ner_emb, nltk_rake_vectors, sid_vec, mask, has_digit], 2)
    result = result.to(get_device())
    result = self.nltk_nn(result)
    return result
def get_location_entities(content):
    """Return the unique ``GPE`` entity strings in *content*, in first-seen order."""
    locations = []
    for tagged_sentence in prep_named_entities(content):
        found = sub_leaves(ne_chunk(tagged_sentence), "GPE")
        if not found:
            continue
        for entity in found:
            locations.append(" ".join([part[0] for part in entity]))
    # OrderedDict.fromkeys de-duplicates while keeping first-seen order.
    return list(OrderedDict.fromkeys(locations))
def get_person_entities(content):
    """Return the unique ``PERSON`` entity strings in *content*, in first-seen order."""
    per_sentence = []
    for tagged_sentence in prep_named_entities(content):
        found = sub_leaves(ne_chunk(tagged_sentence), "PERSON")
        if found:
            per_sentence.append(found)
    people = [
        " ".join([part[0] for part in entity])
        for found in per_sentence
        for entity in found
    ]
    # OrderedDict.fromkeys de-duplicates while keeping first-seen order.
    return list(OrderedDict.fromkeys(people))
def test_interactive(self):
    """Print CoNLL tags for the sentences of the first documents.

    Stops after the document at index 10 has been processed.
    """
    cursor = self.source.find()
    cursor.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    for ind, doc in enumerate(clean_html.doc_iter(cursor)):
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        for sent in pos.tokenize_words(sentences):
            tree = ne_chunk(tagger.tag(sent))
            print(tree2conlltags(tree))
        if ind == 10:
            break
def get_all_named_entities(content):
    """Return the unique named-entity strings in *content*, in first-seen order.

    ``binary=True`` makes ``ne_chunk`` label every entity ``NE`` instead of
    classifying it as person, location, etc.
    """
    seen = OrderedDict()
    for tagged_sentence in prep_named_entities(content):
        found = sub_leaves(ne_chunk(tagged_sentence, binary=True), "NE")
        if not found:
            continue
        for entity in found:
            # Re-assigning an existing key keeps its original position.
            seen[" ".join([part[0] for part in entity])] = None
    return list(seen)
def print_names(filename):
    """Print the POS-tagged tokens and the NE-chunked items of a text file.

    Args:
        filename: Path of the text file to analyse.
    """
    with open(filename, 'r') as handle:
        rawtext = handle.read()
    tokens_with_pos = nltk.pos_tag(word_tokenize(rawtext))
    print("---------------------")
    print("Named entities")
    print()
    # All top-level items are kept, trees and plain (word, tag) tuples alike.
    named_entities = list(ne_chunk(tokens_with_pos))
    print(tokens_with_pos)
    print(named_entities)
def test_interactive(self):
    """Print CoNLL chunks, named entities and nouns for the first documents.

    Stops after the document whose index is 10 has been processed.
    """
    documents = self.source.find_clean(batch_size=1000)
    tagger = ngrams.make_backoff_tagger()
    print()
    for ind, doc in documents:
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        for sent in pos.tokenize_words(sentences):
            chunks = tree2conlltags(ne_chunk(tagger.tag(sent)))
            print("CHUNKS" + str(chunks))
            print("NE" + str(cnll.get_ne(chunks)))
            print("NOUNS" + str(cnll.get_nouns(chunks)))
        if ind == 10:
            break
def nltk_method(str):
    """Run the NLTK demo pipeline on *str* and print its named-entity tree.

    One-time setup of the NLTK data:
        nltk.download()
        nltk.download('averaged_perceptron_tagger')
        nltk.download('maxent_ne_chunker')
        nltk.download('words')
    """
    # The decoded text and the IOB tags are computed but not used further.
    unicode(str, 'utf-8')
    noun_phrase_parser = nltk.RegexpParser('NP: {<DT>?<JJ>*<NN>}')
    tree2conlltags(noun_phrase_parser.parse(preprocess(str)))
    print(ne_chunk(pos_tag(word_tokenize(str))))
def get_named_entities(sentence):
    """Count the named entities of a sentence.

    Returns:
        A ``defaultdict(int)`` mapping each entity's space-joined words to
        the number of times it occurs.  The word ``"FTP"`` is dropped from
        every entity.
    """
    tagged = pos_tag(wordpunct_tokenize(sentence))
    chunk_tree = ne_chunk(tagged)
    counts = defaultdict(int)
    for subtree in dropFirst(chunk_tree.subtrees()):
        # The entity type (e.g. PERSON, ORGANIZATION) could be added to the
        # key here.
        kept = [token for (token, _tag) in subtree if token != "FTP"]
        if kept:
            counts[" ".join(kept)] += 1
    return counts
def data(fname):
    """Yield the rows of a CSV file enriched with token and tag features.

    Each row must have a ``"Comment"`` column.  The following keys are
    added to the row before it is yielded:

    * ``words_basic``: ``\\w+`` regex matches of the comment;
    * ``words_nltk``: ``word_tokenize`` tokens of the comment;
    * ``pos_tag``: POS tags of all tokens, sentence by sentence;
    * ``ne_tags``: labels of the NE chunks found in each sentence.

    Args:
        fname: Path of the CSV file.
    """
    # The context manager closes the file when the generator is exhausted
    # or closed; the original never closed it.
    with open(fname) as handle:
        for idx, row in enumerate(csv.DictReader(handle)):
            if idx % 100 == 0:
                print("Row %d" % idx)  # progress; valid in Python 2 and 3
            row["words_basic"] = re.findall(r'\w+', row["Comment"])
            row["words_nltk"] = word_tokenize(row["Comment"])
            row["pos_tag"] = []
            row["ne_tags"] = []
            for sent in sent_tokenize(row["Comment"]):
                words = tagger.tag(word_tokenize(sent))
                row["pos_tag"].extend([wd[1] for wd in words])
                for x in ne_chunk(words):
                    if x.__class__.__name__ == "Tree":
                        # NLTK 3 replaced ``node`` with ``label()``.
                        label = x.label() if hasattr(x, "label") else x.node
                        row["ne_tags"].append(label)
            yield row
def clean_dict(doc, tagger=nltk.pos_tag):
    """Add general NLP annotations derived from ``doc["cleansed_text"]``.

    This is the NLP front end for reddit corpus parsing; the other
    processing functions wrap it.  The ``_id`` key is removed if present,
    and these fields are (re)written on *doc*:

    ::

        {
            conlltags      : [[(word, pos, BIO)]],
            nouns          : [word],
            named_entities : [[word, pos, BIO]],
        }

    :param doc: dictionary of reddit corpus; modified in place.
    :type doc: dict
    :param tagger: A POS tagger: either a callable taking a token list
        (the default, ``nltk.pos_tag``) or an object with a ``tag`` method.
    :returns: dict -- the same *doc* object.
    """
    if "_id" in doc:
        del doc["_id"]
    # Honour the ``tagger`` argument, which the previous implementation
    # accepted but ignored in favour of nltk.pos_tag.
    tag_sentence = getattr(tagger, "tag", tagger)
    sentences = pos.tokenize_sents(doc["cleansed_text"])
    tags = pos.tokenize_words(sentences) or []
    doc["conlltags"] = []
    doc["nouns"] = []
    doc["named_entities"] = []
    for sent in tags:
        tagged_sent = tag_sentence(sent) or []
        d = ne_chunk(tagged_sent) or []
        chunks = tree2conlltags(d)
        doc["conlltags"].append(chunks)
        doc["nouns"].extend(cnll.get_nouns(chunks))
        doc["named_entities"].extend(cnll.get_ne(chunks))
    return doc
def ne_removal(text):
    '''
    Tokenize a text, tag parts of speech and named entities, then drop the
    named entities from the tokenized text.

    INPUT:
    text = string with text

    OUTPUT:
    tokens_no_ne = list of (token, part of speech) tuples that are not part
    of a named entity
    '''
    tagged = pos_tag(word_tokenize(text))
    tokens_no_ne = []
    for node in ne_chunk(tagged):
        # Named entities come back as Tree nodes; everything else is kept.
        if type(node) is not nltk.Tree:
            tokens_no_ne.append(node)
    return tokens_no_ne
def extract_names(self):
    """Return the unique words that belong to a named entity.

    Every sentence in ``self.sentences`` is tokenized, POS-tagged and
    NE-chunked; the words of all chunks are collected, printed and returned.

    Returns:
        A list of unique entity words (order not defined).
    """
    names = set()
    for sentence in self.sentences:
        tags = nltk.pos_tag(word_tokenize(sentence))
        for chunk in ne_chunk(tags):
            if isinstance(chunk, Tree):
                # Keep only the word of each (word, tag) leaf.  Collecting
                # the tags too forced a cleanup that raised ValueError when
                # a tag was absent and left other tags in the result.
                names.update(word for word, _tag in chunk.leaves())
    unique_names = list(names)
    print("unique names:  %s" % (unique_names,))
    return unique_names
def get_names_entities_list(tokens):
    """(nltk.tokens) -> list

    Return the unique person names found among the tokens."""
    tree = ne_chunk(nltk.pos_tag(tokens))
    padded_names = []
    for node in tree:
        if isinstance(node, Tree) and node.label() == 'PERSON':
            # Each word is followed by a space; the trailing one is
            # stripped after de-duplication.
            padded_names.append("".join(word + " " for word, postag in node.leaves()))
    return [name.strip() for name in list(set(padded_names))]
def NER(entry):
    """Collect entity words from *entry* into module-level lists.

    The NE tree is rendered with ``str()`` and parsed line by line.  Lines
    containing ``NNP`` are routed into ``person``, ``gpe``, ``org`` or
    ``loc`` according to the label text on the line; lines containing
    ``CD`` go into ``mon``.  Each list is de-duplicated after every append.
    Calls ``show()`` at the end and returns 0.

    NOTE(review): this depends on the exact text layout of ``str(tree)``
    (fixed split positions 3 and 2) -- confirm against the NLTK version
    in use.
    """
    global nouns, person, gpe, org, loc, mon
    global postags
    word_tokens = nltk.word_tokenize(entry)
    postags = nltk.pos_tag(word_tokens)
    nouns = ne_chunk(postags)
    for t in str(nouns).split("\n"):
        if "NNP" in t:
            if "PERSON" in t:
                # Text before the first "/" is split on spaces and the
                # fourth piece is taken as the entity word.
                a = t.split("/")
                b = a[0].split(" ")
                person.append(b[3])
                person = set(person)
                person = list(person)
            elif "GPE" in t:
                a = t.split("/")
                b = a[0].split(" ")
                gpe.append(b[3])
                gpe = set(gpe)
                gpe = list(gpe)
            elif "ORGANIZATION" in t:
                a = t.split("/")
                b = a[0].split(" ")
                org.append(b[3])
                org = set(org)
                org = list(org)
            elif "LOC" in t:
                a = t.split("/")
                b = a[0].split(" ")
                loc.append(b[3])
                loc = set(loc)
                loc = list(loc)
        elif "CD" in t:
            # The third space-separated piece is taken and its tag removed.
            a = t.split(" ")
            b = a[2].split("/")
            mon.append(b[0])
            mon = set(mon)
            mon = list(mon)
    show()
    return 0
def get_ne(self, post_taged_sents, sent_tokenized_sents, chunker, mark):
    """Collect the named entities of every sentence in a text.

    Parameters:
        post_taged_sents (list) - POS-tagged sentences
        sent_tokenized_sents (list) - the matching raw sentences
        chunker - chunker whose ``parse`` is used when ``mark`` is 'T'
        mark (string) - 'Q' means question (NLTK's binary ``ne_chunk`` is
            used); 'T' means text (``chunker`` is used)

    Returns:
        name_entity_sents (list) - one list per sentence holding the
        ``X&Y`` style abbreviations found in the raw sentence followed by
        the entities extracted from the chunk tree.

    Raises:
        ValueError - if ``mark`` is neither 'Q' nor 'T' and there is at
        least one sentence to process.
    """
    name_entity_sents = []
    for idx, pos_tag_sent in enumerate(post_taged_sents):
        # Abbreviations such as "A&B"; findall already returns [] when
        # nothing matches, so one call is enough.
        ne_abbred = re.findall(r'[A-Z]&[A-Z]', sent_tokenized_sents[idx])
        if mark == 'Q':
            chunked = chunk.ne_chunk(pos_tag_sent, binary=True)
        elif mark == 'T':
            chunked = chunker.parse(pos_tag_sent)
        else:
            # Previously this fell through with the chunk variable unbound.
            raise ValueError("mark must be 'Q' or 'T', got %r" % (mark,))
        ne_chunks = ne_abbred + self.get_name_entity(
            self.get_name_entity_list(chunked))
        name_entity_sents.append(ne_chunks)
    return name_entity_sents
def preprocess(sent):
    """Tokenize, POS-tag and NE-chunk a raw sentence; return the chunk tree."""
    return ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))
(genre, word) for genre in brown.categories() for word in brown.words(categories=genre)) cfd.tabulate(conditions=genres, samples=modals) # can could may might must will # news 93 86 66 38 50 389 # religion 82 59 78 12 54 71 # hobbies 268 58 131 22 83 264 #science_fiction 16 49 4 12 8 16 # romance 74 193 11 51 45 43 # humor 16 30 8 8 9 13 ## Example Taken from Natural Language Processing with Python ## (Steve Bird, Ewan Klein, and Edward Loper 2009) ## Available at http://nltk.org/book/ from nltk import ngrams, wordpunct_tokenize from ntlk.tag import pos_tag from nltk.stem.porter import PorterStemmer from nltk.chunk import ne_chunk s = "The Democrat admitted he's eying the mayor's race in a lengthy New York Times Magazine article that detailed his life with wife Huma Abedin, a close aide to former Secretary of State Hillary Clinton, and their attempts to stay out of the limelight over the past two years-until now." tokens = wordpunct_tokenize(s) st = PorterStemmer() stemmed = [st.stem(w) for w in tokens] pos = pos_tag(tokens) tree = ne_chunk(pos) tree.draw()
def ne_chucking():
    """NE-chunk the output of ``part_of_speech_tagging()``.

    Returns:
        The chunk tree.  It used to be computed and then discarded, which
        made the call useless to callers.
    """
    tree = chunk.ne_chunk(part_of_speech_tagging())
    return tree
from nltk.chunk import ne_chunk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Raw string: same value as before, without relying on unrecognised escape
# sequences such as "\S" being passed through.
in_filepath = r'E:\Subash\AI\data_in.txt'

# NE-chunk each line of the input file, print the tree and draw it.
with open(in_filepath, 'r') as fldetails:
    for lines in fldetails:
        tokenizing = word_tokenize(lines)
        postagging = pos_tag(tokenizing)
        chunk_Sent = ne_chunk(postagging)
        print(chunk_Sent)
        chunk_Sent.draw()
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.chunk import ne_chunk
import wikipedia

e = ""
topic = "Python (programming language)"
# topic = 'python'
try:
    # Fetch a four-sentence summary of the topic, then tokenize, POS-tag
    # and NE-chunk it (binary=True: entities are not classified by type).
    entity = str(wikipedia.summary(topic, sentences = 4).encode('utf-8'))
    tokens = word_tokenize(entity)
    gmrTags = pos_tag(tokens)
    gmrChunks = ne_chunk(gmrTags, binary = True)
    print("Topic summary {}".format(topic))
    # NOTE(review): the next line is a Python 2 print statement, while the
    # surrounding calls use the function form -- confirm the target version.
    print entity
    print("= = = =")
    print("Topic has these noun phrases in 4 sentence summary: ")
    gmrNouns = []
    gmrPrev = None
    gmrPhrase = []
    # Consecutive tokens with the same NN* tag are appended to the current
    # phrase.
    for (token, pos) in gmrTags:
        if pos.startswith('NN'):
            if pos == gmrPrev:
                gmrPhrase.append(token)
# Tokenize the third part of the sentence into words and show them.
words_in_the_part_of_the_sentence = tokenize.word_tokenize(parts_of_the_sentence[2])
print("words_in_the_part_of_the_sentence")
print(words_in_the_part_of_the_sentence)

from nltk import tag

# Attach a part-of-speech tag to every word.
tagged_part_of_sentence_with_corresponding_syntactical_value = tag.pos_tag(words_in_the_part_of_the_sentence)
print("tagged_part_of_sentence_with_corresponding_syntactical_value")
print(tagged_part_of_sentence_with_corresponding_syntactical_value)

from nltk import chunk
from nltk.tree import Tree

# Named-entity chunk tree of the tagged words.
tree = chunk.ne_chunk(tagged_part_of_sentence_with_corresponding_syntactical_value)
print("tree")
print(tree)
#tree.draw()

# A flat tree labelled "sentence" whose children are the sentences of the
# initial text.
the_tree = Tree("sentence", tokenize.sent_tokenize(initial_sentence))
#the_tree.draw()
print(the_tree)

# POS-tagged words of every sentence of the initial text.
list_of_sentences = []
for i in tokenize.sent_tokenize(initial_sentence):
    list_of_sentences.append(tag.pos_tag(tokenize.word_tokenize(i)))
def named_entities(tagged_body):
    """Return the NE-chunked tree for an already POS-tagged token list."""
    return ne_chunk(tagged_body)
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3
from pprint import pprint
import os
from nltk import chunk, tokenize

d = """"Hello, Martha. It isn't cocktail-time yet, is it?" The girl at the table spoke without raising her head, almost without moving her lips, as though she were afraid that the slightest breath would disturb the flaky stuff in front of her."""


def tagLoader():
    """Load the cached POS tagger from ``~/.cwethan/tagger.pkl``.

    Returns:
        The unpickled tagger object.
    """
    data_dir = os.path.expanduser("~/.cwethan")
    tagger_cache_file = os.path.join(data_dir, "tagger.pkl")
    # NOTE: unpickling executes code from the file; only load a cache file
    # you created yourself.
    with open(tagger_cache_file, "rb") as fh:
        return pickle.load(fh)


tagger = tagLoader()
sent = tokenize.word_tokenize(d)
tags = tagger.tag(sent)
print(chunk.ne_chunk(tags))
if (tag == 'NNP'):
    # Proper nouns are collected as noun candidates.
    value = "%s" % word
    nlist.append(value)
if (tag == 'NNS'):
    # Plural nouns are collected as well.
    value = "%s" % word
    nlist.append(value)
print(nlist)

# Named entities
nen = ne_chunk(rtag)
print(nen)

# Write all the collected nouns to a file, one per line.
f= open("noun_all.csv", "w")
f.write("\n".join(nlist))
f.close()

# Extract capitalized words.
# NOTE(review): the pattern is not a raw string, so "\w" relies on Python
# passing the unknown escape through -- consider r'[A-Z]\w+'.
capword = RegexpTokenizer('[A-Z]\w+')
# Lemmatization
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
wordnet_lemmatizer.lemmatize('dogs')

# Sentence and word tokenization
from nltk.tokenize import sent_tokenize
sent_tokenize("Hello SF Python. This is NLTK. Its a good library.")
from nltk.tokenize import word_tokenize
word_tokenize('This is NLTK.')
from nltk.tokenize import wordpunct_tokenize
wordpunct_tokenize("What's up?")

# Part-of-speech tagging
words = word_tokenize("And now for something completely different")
from nltk.tag import pos_tag
pos_tag(words)

# Named-entity chunking
from nltk.chunk import ne_chunk
ne_chunk(pos_tag(word_tokenize('My name is Jacob Perkins.')))
ne_chunk(pos_tag(word_tokenize('San Francisco is foggy.')))


def bag_of_words(words):
    """Return a feature dict mapping every word in *words* to ``True``."""
    return {word: True for word in words}


# Classification with a pickled classifier
feats = bag_of_words(word_tokenize("great movie"))
import nltk.data
classifier = nltk.data.load('classifiers/movie_reviews_NaiveBayes.pickle')
classifier.classify(feats)
def nltk_entity_groups(text):
    """Return all contiguous NER tagged chunks by NLTK."""

    def _chunk_text(subtree):
        return ' '.join([leaf[0] for leaf in subtree.leaves()])

    tokens = word_tokenize(text)
    parse_tree = ne_chunk(pos_tag(tokens))
    # The root node is labelled 'S'; every other subtree is an entity chunk.
    return [_chunk_text(st) for st in parse_tree.subtrees() if st.label() != 'S']
tokens = word_tokenize(file_y)
for value in tokens:
    all_toks_class.append(value)
# Keep sentences of 10-99 characters, encoded to ASCII with replacement.
good_sent = [w.encode('ascii', 'replace') for w in all_sent if len(w) >= 10 and len(w) < 100]
# Keep alphabetic, non-stopword tokens of 4-124 characters.
good_toks = [w for w in all_toks_class if not w.lower() in stopset and not w.isdigit() and w.isalpha() and len(w) >= 4 and len(w) < 125]
fdist1 = FreqDist(good_toks)
most = fdist1.most_common(100)
# The 100 most common tokens, without their counts.
list_values = list()
for word in most:
    list_values.append(word[0])
st = NERTagger('./stanford-ner/english.all.3class.distsim.crf.ser.gz','./stanford-ner/stanford-ner.jar')
tagged_words = st.tag(list_values)
tag_words = list()
# NOTE(review): pos_tag receives a single string here, so it presumably
# tags each character; tag_words is also not used below -- confirm intent.
for word in list_values:
    tag_words = tag_words + tag.pos_tag(word)
print "CHUNK WORDS:"
tree = chunk.ne_chunk(tagged_words)
# NOTE(review): tree.draw is referenced without being called, so this
# prints the bound method rather than drawing the tree.
print tree.draw
print "STANDFORD WORDS:"
# Print every word whose tag is not 'O'.
for word in tagged_words:
    if (word[1] != 'O'):
        print word
from nltk.chunk.util import tree2conlltags
import rdt.nlp.conll_get as cnll

if __name__ == "__main__":
    # Annotate every document of the test source with CoNLL tags, nouns
    # and named entities, and store the result in the annotated source.
    source = rdtcorp.Source(conf_key="source_test")
    annotated = rdtcorp.Source(conf_key="annotated_test")
    docs = source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    buf = []
    for doc in clean.doc_iter(docs):
        del (doc["_id"])
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        doc["conlltags"] = []
        doc["nouns"] = []
        doc["named_entities"] = []
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            doc["conlltags"].append(chunks)
            doc["nouns"].extend(cnll.get_nouns(chunks))
            doc["named_entities"].extend(cnll.get_ne(chunks))
        buf.append(doc)
        # Flush in batches of 1000.  The previous test `if ind % 1000:` was
        # true for every index that is NOT a multiple of 1000, so nearly
        # every document was inserted on its own.
        if len(buf) >= 1000:
            annotated.insert(buf)
            buf = []
    if buf:
        annotated.insert(buf)
def test_ne_chunk(sent):
    """Print the NE-chunked tree of an already POS-tagged sentence."""
    from nltk.chunk import ne_chunk
    # Function-call form prints the same text on Python 2 and is required
    # on Python 3.
    print(ne_chunk(sent))
#This is based on
import sys
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag
from nltk import chunk

file_name = sys.argv[1]
# Read the whole file at once; the context manager closes it even if the
# read fails, and avoids concatenating the text line by line.
with open(file_name, 'r', encoding='utf8') as file:
    text = file.read()

# POS-tag every sentence of the text.
sentence_lst = sent_tokenize(text)
tagged_sentences = []
for sentence in sentence_lst:
    tokens = word_tokenize(sentence)
    tagged = pos_tag(tokens)
    tagged_sentences.append(tagged)

# Print the named-entity tree of every tagged sentence.
for tagged_sentence in tagged_sentences:
    tree = chunk.ne_chunk(tagged_sentence)
    print(tree)
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import conlltags2tree, tree2conlltags, ne_chunk
import spacy
from pprint import pprint

# Two sample texts; the second assignment replaces the first.
ex = 'Golden Diner is a little spot in Two Bridges that serves modern diner classics. It’s worth planning lunch here a week in advance.'
ex = 'Troubled burger group Byron will launch a new brand concept and menu on November 21. The 53-strong restaurant chain, which launched 12 years ago to much fanfare, narrowly escaped collapse amid the casual dining crunch, but has come out fighting with a bold new strategy. As much as £15m will be invested in the brand […]'


def preprocess(sent):
    """Tokenize *sent* and return its list of POS-tagged tokens."""
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent


sent = preprocess(ex)
# print(sent)

# Noun-phrase grammar: optional determiner, any adjectives, then a noun.
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
# Show the noun-phrase chunks as (word, pos, IOB) triples.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

# Named-entity tree of the same text.
ne_tree = ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)
file = open(file_path, "w+", encoding="utf-8") # Document level for doc_idx, text in enumerate(cleantexts): #if ( (doc_idx + 1) % 1) == 0: # print("Processing document %d out of %d" % (doc_idx+1,n_docs)) doc_key = doc_ids[doc_idx] # NER if tagger == "NLTK": # set_entities_tag = tag_with_NLTK(text) set_entities_tag = { (' '.join(c[0] for c in chunk), chunk.label()) for chunk in ne_chunk(pos_tag(word_tokenize(text))) if hasattr(chunk, 'label') } #elif tagger == "Spacy": # set_entities_tag = {(ent.text.strip(), ent.label_) for ent in spacy_model(text).ents if # ent.label_ not in skip} # elif tagger == "Stanford": # set_entities_tag = tag_with_stanford(text) # print(set_entities_tag) # Go through each entity found for word, label in set_entities_tag: # Skip entity if have following constraints if len(word) < 1 or check_skip_constraints(
# load tagger from storage
# Pickles are binary data, so the file is opened in 'rb' mode; the context
# manager closes it after loading.
# NOTE: unpickling executes code from the file; only load a tagger file you
# created yourself.
with open('tagger.pickle', 'rb') as f:
    tagger = pickle.load(f)

# If your tagger pickle file is located in a NLTK data directory, you could also
# use nltk.data.load('tagger.pickle')

#--------------------------------------------------------------------------------
# Named Entity Chunking
#--------------------------------------------------------------------------------

# Need to start experimenting with chunkers.
# In packt book, NER is on page 133.

# Once you have parsed parts of speech from your sentences, then you can look
# for names.  Must pass the ne_chunk method a list of tagged words, though,
# not just a list of tokens.
from nltk.chunk import ne_chunk
ne_chunk(tagged_words)

# todo: figure out how to deal with the output of chunker for named-entity
# recognition.

# todo: figure out how to detect, filter said verbs - need to go through
# articles, find all said verbs, make a custom classifier that can tag them
# differently.

# todo: need a way to take said verbs and proper names and see if they are
# proximal in a given sentence - if not within 4 or 5 words, might not be
# attribution.