def __init__(self, query_string):
    """Analyse ``query_string`` and keep every intermediate result.

    Stores the raw string, its word-tokenized sentences, the POS-tagged
    sentences, the binary and multiclass NE-chunked sentences, and the
    timex-tagged text grounded against the current GMT time.
    """
    self.query_string = query_string

    self.tokenized_sentences = [
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(query_string)
    ]
    self.tagged_sentences = [
        nltk.pos_tag(tokens) for tokens in self.tokenized_sentences
    ]

    # Same tagged input, chunked once per labelling scheme.
    tagged = self.tagged_sentences
    self.binary_chunked_sentences = nltk.batch_ne_chunk(tagged, binary=True)
    self.multiclass_chunked_sentences = nltk.batch_ne_chunk(tagged, binary=False)

    timex_tagged = timex.tag(query_string)
    self.temporal_sentences = timex.ground(timex_tagged, mx.DateTime.gmt())
def __init__(self, query_string):
    """Build the NLTK / timex views of ``query_string``.

    Attributes set: ``query_string``, ``tokenized_sentences``,
    ``tagged_sentences``, ``binary_chunked_sentences``,
    ``multiclass_chunked_sentences`` and ``temporal_sentences``.
    """
    self.query_string = query_string

    # Tokenize, then tag, one sentence at a time.
    self.tokenized_sentences = []
    for sentence in nltk.sent_tokenize(query_string):
        self.tokenized_sentences.append(nltk.word_tokenize(sentence))
    self.tagged_sentences = []
    for tokens in self.tokenized_sentences:
        self.tagged_sentences.append(nltk.pos_tag(tokens))

    # Named-entity chunking with binary and with multiclass labels.
    self.binary_chunked_sentences = nltk.batch_ne_chunk(
        self.tagged_sentences, binary=True)
    self.multiclass_chunked_sentences = nltk.batch_ne_chunk(
        self.tagged_sentences, binary=False)

    # Temporal expressions, grounded against the current GMT time.
    temporal_markup = timex.tag(query_string)
    reference_time = mx.DateTime.gmt()
    self.temporal_sentences = timex.ground(temporal_markup, reference_time)
def obtenerNEs(lista):
    """Collect the named entities of the positively classified tweets.

    ``lista`` yields ``(tweet, listaPalabras, clasificacion,
    diferenciaProbabilidad)`` tuples. Every classification is printed; only
    tweets classified ``'positive'`` are analysed.

    Returns a list of ``(tweet, listaPalabras, listaNEs)`` tuples, one for
    each positive tweet in which at least one named entity was found.
    """
    resultado = []
    for tweet, listaPalabras, clasificacion, _diferencia in lista:
        print(clasificacion)
        # Only the positive tweets are evaluated.
        if clasificacion != 'positive':
            continue

        # Split on whitespace instead of tokenizing so that user mentions can
        # be filtered out: word_tokenize separates the "@" from the name.
        oraciones = []
        for oracion in nltk.tokenize.sent_tokenize(tweet):
            palabras = quitarExcedenteSimple(oracion.split())
            oraciones.append(' '.join(palabras))

        tokens = [nltk.tokenize.word_tokenize(oracion) for oracion in oraciones]
        etiquetados = [nltk.pos_tag(lista_tokens) for lista_tokens in tokens]

        listaNEs = []
        for arbol in nltk.batch_ne_chunk(etiquetados, binary=True):
            traverse(arbol, listaNEs, False)
        if listaNEs:
            resultado.append((tweet, listaPalabras, listaNEs))

    web.debug('Tweets con NEs:' + str(len(resultado)))
    return resultado
def extractchunk(tweettuple):
    """Named-entity chunk every tweet and index the result by tweet id.

    ``tweettuple`` is an iterable of ``(id, text)`` pairs. Each text is
    stripped of HTML with ``nltk.clean_html``, split into sentences, and each
    sentence is word-tokenized; the tokens of one tweet are concatenated so
    that every tweet yields exactly one chunk tree.

    Returns a dict mapping ``str(id)`` to that tweet's chunk tree.
    """
    ids = []
    tagged_tweets = []
    # Single pass, so a one-shot iterable (e.g. a generator) also works.
    for tweet_id, text in tweettuple:
        ids.append(str(tweet_id))
        sentences = nltk.tokenize.sent_tokenize(nltk.clean_html(str(text)))
        # Tokenize sentence by sentence. Tokenizing str(sentences) would feed
        # the list's own brackets and quote characters to the tagger.
        tokens = [
            token
            for sentence in sentences
            for token in nltk.tokenize.word_tokenize(sentence)
        ]
        tagged_tweets.append(nltk.pos_tag(tokens))

    ne_chunks = nltk.batch_ne_chunk(tagged_tweets)
    return dict(zip(ids, ne_chunks))
def extract_entities(sample):
    """Return the named entities of ``sample`` with their occurrence counts.

    The result is a list of ``(entity, count)`` pairs sorted by descending
    count. A progress message is printed first.
    """
    print('extracting entities')

    tagged = [
        nltk.pos_tag(tokens)
        for tokens in [nltk.word_tokenize(s) for s in nltk.sent_tokenize(sample)]
    ]

    found = []
    for tree in nltk.batch_ne_chunk(tagged, binary=True):
        found.extend(extract_entity_names(tree))

    # entity -> number of occurrences
    tally = {}
    for entity in found:
        tally[entity] = tally.get(entity, 0) + 1

    by_count = operator.itemgetter(1)
    return sorted(tally.iteritems(), key=by_count, reverse=True)
def process_entities(sentence):
    """Return the title-cased noun chunks found in ``sentence``.

    The sentence is word-tokenized and POS-tagged. Every maximal run of
    consecutive tokens sharing the same noun tag (a tag starting with
    ``NN``) is joined with spaces into one chunk. Chunks are de-duplicated
    together with their tag, and the text of each chunk for which
    ``str.istitle()`` is true is returned as a list.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    # Chunking approach adapted from "Mining the Social Web":
    # https://github.com/ptwobrussell/Mining-the-Social-Web/blob/master/python_code/blogs_and_nlp__extract_entities.py
    chunks = []
    current_tokens = []
    current_pos = None
    for token, pos in tagged_tokens:
        if current_tokens and pos == current_pos:
            current_tokens.append(token)
            continue
        if current_tokens:
            # Close the open chunk under its own tag, not the tag of
            # whichever noun happens to start the next chunk.
            chunks.append((' '.join(current_tokens), current_pos))
            current_tokens = []
            current_pos = None
        if pos.startswith('NN'):
            current_tokens = [token]
            current_pos = pos
    if current_tokens:
        # Flush the chunk still open when the input ends; otherwise a
        # sentence's last (or only) noun run is silently dropped.
        chunks.append((' '.join(current_tokens), current_pos))

    # Index the chunks, accounting for frequency; a chunk may occur
    # several times.
    counts = {}
    for chunk in chunks:
        counts[chunk] = counts.get(chunk, 0) + 1

    return [entity for (entity, _pos) in counts if entity.istitle()]
def _nltk_ner(self, text, searched_entity, question):
    """Return the entities in ``text`` whose NE label is ``searched_entity``.

    For ``'OTHER'`` and ``'NUMBER'`` the result is extended with the output
    of ``_other_recognition`` / ``_number_recognition`` respectively.
    """
    # Entity classification
    tagged_sentences = [
        nltk.pos_tag(tokens)
        for tokens in [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
    ]
    chunked = nltk.batch_ne_chunk(tagged_sentences)

    # Entity extraction: subtrees are entities, plain tuples are ordinary words.
    entities = []
    all_entities = []
    for sentence_tree in chunked:
        for node in sentence_tree:
            if not isinstance(node, Tree):
                continue
            entity = " ".join([word for (word, pos) in node.leaves()])
            if node.node == searched_entity:
                entities.append(entity)
            all_entities.append(entity)

    if searched_entity == 'OTHER':
        entities += self._other_recognition(tagged_sentences, all_entities, question)
    if searched_entity == 'NUMBER':
        entities += self._number_recognition(text, tagged_sentences, all_entities)
    return entities
def nlp_extract_tags(text, lang=None):
    """
    Extract the unique named entities of ``text`` and return them via ``jsonp``.

    The payload is ``{"status": "ok", "result": {"tags": [...]}}``.
    ``lang`` is accepted but not used.
    """
    def names_in(node):
        # Leaves and nodes without a truthy ``node`` label contribute nothing.
        if not (hasattr(node, "node") and node.node):
            return []
        if node.node == "NE":
            return [" ".join([child[0] for child in node])]
        collected = []
        for child in node:
            collected.extend(names_in(child))
        return collected

    tagged = [
        nltk.pos_tag(tokens)
        for tokens in [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
    ]

    entity_names = []
    for tree in nltk.batch_ne_chunk(tagged, binary=True):
        entity_names.extend(names_in(tree))

    tags = list(set(entity_names))
    return jsonp({"status": "ok", "result": {"tags": tags}})
def get_named_entities(self, text):
    """Return the eight most frequent named entities of ``text``, lower-cased.

    Entities are the subtrees whose printed form mentions PERSON,
    ORGANIZATION, LOCATION, GPE or FACILITY. A person whose last word
    matches an already-seen name is counted under that earlier name
    (e.g. "Ms. Clinton" is merged into "Hillary Clinton"). Names listed in
    ``stop_names`` are ignored.
    """
    tokenized = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]
    tagged = [nltk.pos_tag(sent) for sent in tokenized]  # takes 3ish seconds
    chunked = nltk.batch_ne_chunk(tagged, binary=False)  # takes 2ish seconds

    counts = {}
    stop_names = ['Mr.']

    for sentence_tree in chunked:
        for node in sentence_tree:
            # Plain (word, tag) leaves have no .leaves(); skip them even when
            # the word itself happens to spell one of the labels below.
            if not hasattr(node, 'leaves'):
                continue
            rendered = str(node)
            if not re.search('PERSON|ORGANIZATION|LOCATION|GPE|FACILITY', rendered):
                continue

            # Put the leaves together, e.g. "White" + " " + "House".
            name = ' '.join(leaf[0] for leaf in node.leaves())
            if name in stop_names:
                continue
            if name in counts:
                counts[name] += 1
                continue

            # Escape the last word: names such as "Inc." or "C++" contain
            # regex metacharacters that would mis-match or fail to compile.
            last_word = re.escape(name.split(' ')[-1])
            pattern = re.compile(r'^' + last_word + r'|\s' + last_word + r'$')
            earlier = [known for known in counts if pattern.search(known)]
            if earlier and re.search('PERSON', rendered) is not None:
                counts[earlier[0]] += 1
            else:
                counts[name] = 1

    # Sort by count and keep the first 8.
    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    return [name.lower() for name, _count in ranked[:8]]
def extract_entities(shorttext_rows, site):
    """Extract entities from every short text row and pickle the result.

    Each row supplies its id at index 0 and its text at index 1. The mapping
    ``{short text id: (noun entities, named entities)}`` is written to the
    cache path for ``site``; nothing is returned.
    """
    # nltk entity classes
    entity_types = __get_nltk_entity_types__()

    # { short text id -> (noun entities, named entities) }
    entities_by_id = {}
    for row in shorttext_rows:
        row_id, row_text = row[0], row[1]

        tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(row_text)]
        tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

        # The helper appends into both lists of this pair.
        collected = ([], [])
        for tree in nltk.batch_ne_chunk(tagged):
            __extract_valid_entities__(tree, collected, entity_types)
        entities_by_id[row_id] = collected

    # Cache extracted entities
    cache_path = __get_nltk_entities_cache_path__(site)
    pkl_util.write_pickle(__output_str__, entities_by_id, cache_path)
def extract_chunked_sentences( raw ):
    """
    Sentence-split, word-tokenize, POS-tag and binary NE-chunk ``raw``.

    Returns the value of ``nltk.batch_ne_chunk(..., binary=True)``.
    """
    tagged = []
    for sentence in nltk.sent_tokenize(raw):
        tagged.append(nltk.word_tokenize(sentence))
    tagged = [nltk.pos_tag(tokens) for tokens in tagged]
    return nltk.batch_ne_chunk(tagged, binary=True)
def extractNamedEntities(sentences):
    """Return the distinct named entities found in ``sentences``.

    ``sentences`` is an iterable of already-split sentence strings. The
    result is a list without duplicates.
    """
    tokenized = [nltk.word_tokenize(s) for s in sentences]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

    found = []
    for tree in nltk.batch_ne_chunk(tagged, binary=True):
        found.extend(extractNamedEntitiesFromChunkSentence(tree))

    unique = set(found)
    return list(unique)
def prepareSentence(sample):
    """Return the binary NE-chunked sentences of ``sample``."""
    tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(sample)]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]
    return nltk.batch_ne_chunk(tagged, binary=True)
def extractchunk(tweettuple):
    """Named-entity chunk the text of every tweet.

    ``tweettuple`` is an iterable of 3-tuples; the first element is the id,
    the second is passed through unchanged, and the third is the text. Each
    text is stripped of HTML with ``nltk.clean_html``, split into sentences
    and word-tokenized sentence by sentence; the tokens of one tweet are
    concatenated so that every tweet yields exactly one chunk tree.

    Returns ``zip(ids, second_elements, chunk_trees)`` with ids as strings.
    """
    ids = []
    numbers = []
    tagged_tweets = []
    # Single pass, so a one-shot iterable (e.g. a generator) also works.
    for tweet_id, number, text in tweettuple:
        ids.append(str(tweet_id))
        numbers.append(number)
        sentences = nltk.tokenize.sent_tokenize(nltk.clean_html(str(text)))
        # Tokenize sentence by sentence. Tokenizing str(sentences) would feed
        # the list's own brackets and quote characters to the tagger.
        tokens = [
            token
            for sentence in sentences
            for token in nltk.tokenize.word_tokenize(sentence)
        ]
        tagged_tweets.append(nltk.pos_tag(tokens))

    # Run the standard NLTK chunker over all tweets at once.
    ne_chunks = nltk.batch_ne_chunk(tagged_tweets)
    return zip(ids, numbers, ne_chunks)
def get_entities(sentences):
    """Return ``entitify(tree)`` for each NE-chunked sentence.

    ``sentences`` must already be split into sentence strings; they are
    tokenized, POS-tagged and binary NE-chunked here.
    """
    tokenized = [nltk.word_tokenize(s) for s in sentences]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]
    chunked = nltk.batch_ne_chunk(tagged, binary=True)
    return [entitify(tree) for tree in chunked]
def get_entities3(text):
    """Return the named entities of ``text`` after ``filter_entities``."""
    tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

    names = []
    for tree in nltk.batch_ne_chunk(tagged, binary=True):
        names.extend(extract_entity_names(tree))
    return filter_entities(names)
def gen_ners(self,sample):
    """
    Return the distinct named entities of ``sample`` as a list.
    """
    tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(sample)]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

    collected = []
    for tree in nltk.batch_ne_chunk(tagged, binary=True):
        collected.extend(self._extract_entity_names(tree))

    # Going through a set removes the duplicates.
    return list(set(collected))
def get_entities(text):
    '''
    Extract the named entities of ``text``.

    Returns a list of entity names, duplicates included.
    '''
    tagged = []
    for sentence in nltk.sent_tokenize(text):
        tagged.append(nltk.word_tokenize(sentence))
    tagged = [nltk.pos_tag(tokens) for tokens in tagged]

    names = []
    for tree in nltk.batch_ne_chunk(tagged, binary=True):
        names.extend(extract_entity_names(tree))
    return names
def char_recognition(self, char_number = 20):
    """Store the ``char_number`` most common entity names in ``self.entities``.

    ``self.tokenized_sentences`` is POS-tagged and NE-chunked with
    multiclass labels. When ``nltk.__version__`` starts with ``'3'`` the
    ``ne_chunk_sents`` / ``extract_entity_names3`` pair is used, otherwise
    ``batch_ne_chunk`` / ``extract_entity_names``.
    """
    tagged = nltk.pos_tag_sents(self.tokenized_sentences)
    self.entities = []

    if nltk.__version__[0] == '3':
        trees = nltk.ne_chunk_sents(tagged, binary=False)
        names = [name for tree in trees for name in extract_entity_names3(tree)]
    else:
        trees = nltk.batch_ne_chunk(tagged, binary=False)
        names = [name for tree in trees for name in extract_entity_names(tree)]

    for name, _frequency in Counter(names).most_common(char_number):
        self.entities.append(name)
def analyzeText(self,question,answers):
    """Tokenize, POS-tag and NE-chunk the answers.

    ``answers`` is joined into a single string (no separator) first. Returns
    a ``TextAnalyticsObj`` built from the question, the joined answers, the
    sentences, the tokens, the POS tags and the binary NE chunks.
    """
    joined = ''.join(answers)

    sentences = sent_tokenize(joined)                       # sentence tokenizer
    tokens = [word_tokenize(s) for s in sentences]          # word tokenizer
    postags = [pos_tag(words) for words in tokens]          # POS tagger
    chunks = batch_ne_chunk(postags,binary=True)            # chunking

    return TextAnalyticsObj(question,joined,sentences,tokens,postags,chunks)
def get_named_entities(paragraph, ent_type):
    """Return the entities of ``paragraph`` as one keyword string.

    Entities are obtained from ``extract_entity_names(tree, ent_type)`` for
    every chunked sentence. Each entity is preceded by a single space, so a
    non-empty result starts with a space; the result is ``''`` when nothing
    is found.
    """
    tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(paragraph)]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

    names = []
    for tree in nltk.batch_ne_chunk(tagged):
        names.extend(extract_entity_names(tree, ent_type))

    return ''.join(' ' + name for name in names)
def chunkSentences(text):
    """
    Split ``text`` into sentences, POS-tag them and return their binary
    named-entity chunks.

    ``nltk.batch_ne_chunk`` is used when ``nltk.__version__`` starts with
    "2.", ``nltk.ne_chunk_sents`` otherwise.
    """
    tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

    if nltk.__version__.startswith("2."):
        return nltk.batch_ne_chunk(tagged, binary=True)
    return nltk.ne_chunk_sents(tagged, binary=True)
def analyzeText(self, question, answers):
    """Run tokenization, POS tagging and NE chunking over the answers.

    The answers are concatenated without a separator before analysis. The
    question, the concatenated text and all intermediate results are wrapped
    in a ``TextAnalyticsObj``, which is returned.
    """
    text = ''.join(answers)

    # sentence tokenizer -> word tokenizer -> POS tagger -> chunker
    sentences = sent_tokenize(text)
    tokens = []
    for sentence in sentences:
        tokens.append(word_tokenize(sentence))
    postags = []
    for words in tokens:
        postags.append(pos_tag(words))
    chunks = batch_ne_chunk(postags, binary=True)

    return TextAnalyticsObj(question, text, sentences, tokens, postags, chunks)
def chunkSentences(text):
    """
    Return the binary named-entity chunks of every sentence in ``text``.

    Picks ``nltk.batch_ne_chunk`` when ``nltk.__version__`` starts with
    "2." and ``nltk.ne_chunk_sents`` otherwise.

    Used for reference: https://gist.github.com/onyxfish/322906
    """
    taggedSentences = []
    for sentence in nltk.sent_tokenize(text):
        taggedSentences.append(nltk.word_tokenize(sentence))
    taggedSentences = [nltk.pos_tag(words) for words in taggedSentences]

    runsOnNltk2 = nltk.__version__[0:2] == "2."
    if not runsOnNltk2:
        return nltk.ne_chunk_sents(taggedSentences, binary=True)
    return nltk.batch_ne_chunk(taggedSentences, binary=True)
def extract_entities(text):
    """Chunk the named entities of a text body.

    string, text -> text body to parse.

    The text is split into sentences, each sentence into words, and every
    word is part-of-speech tagged. The tagged sentences are then passed to
    NLTK ``batch_ne_chunk`` (binary labels), which handles several sentences
    per call.

    Returns the result of that call.
    """
    tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]
    return nltk.batch_ne_chunk(tagged, binary=True)
def get_entities(text):
    '''
    Return the names of the named entities in the supplied text.

    The list keeps duplicates, in the order the chunked sentences yield them.
    '''
    tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]
    chunked = nltk.batch_ne_chunk(tagged, binary=True)

    entity_names = []
    for tree in chunked:
        entity_names += list(extract_entity_names(tree))
    return entity_names
def char_recognition(self, char_number=20):
    """Fill ``self.entities`` with the most frequent entity names.

    At most ``char_number`` names are kept, most common first. The chunker
    and the extraction helper are chosen from the first character of
    ``nltk.__version__``.
    """
    tagged = nltk.pos_tag_sents(self.tokenized_sentences)
    self.entities = []
    found = []

    uses_nltk3 = nltk.__version__[0] == '3'
    if uses_nltk3:
        for tree in nltk.ne_chunk_sents(tagged, binary=False):
            found.extend(extract_entity_names3(tree))
    else:
        for tree in nltk.batch_ne_chunk(tagged, binary=False):
            found.extend(extract_entity_names(tree))

    ranking = Counter(found).most_common(char_number)
    for entry in ranking:
        self.entities.append(entry[0])
def getSentChunks(d, n):
    """NE-chunk every document in ``d`` and collect its named entities.

    ``n`` holds one name per document; it is only used for the progress
    message printed for each document. Returns ``(chunks, nament)``:
    the chunked sentences per document and the flat list of everything
    ``extract_en`` returned.
    """
    chunks, nament = [], []
    for position, doc in enumerate(d):
        tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(doc)]
        tagged = [nltk.pos_tag(tokens) for tokens in tokenized]
        chunked_doc = nltk.batch_ne_chunk(tagged)
        chunks.append(chunked_doc)

        print("calling extract_en on " + n[position])
        for sentence_chunk in chunked_doc:
            nament.extend(extract_en(sentence_chunk))
    return (chunks, nament)
def get_entities(text):
    """Return the entities named in ``text``.

    When ``parse_case_title`` recognises the text, the defendant and the
    plaintiff (in that order) are returned. Otherwise the text is NE-chunked
    and the extracted names are passed through ``filter_entities``.
    """
    case = parse_case_title(text)
    if case is not None:
        return [case['defendant'], case['plaintiff']]

    tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

    names = []
    for tree in nltk.batch_ne_chunk(tagged, binary=True):
        names.extend(extract_entity_names(tree))
    return filter_entities(names)
def chunkSentences(text):
    """
    Tokenize and POS-tag ``text`` sentence by sentence, then return the
    binary named-entity chunks.

    The chunking function depends on ``nltk.__version__``: releases whose
    version starts with "2." use ``batch_ne_chunk``, all others use
    ``ne_chunk_sents``.

    Used for reference: https://gist.github.com/onyxfish/322906
    """
    sentences = nltk.sent_tokenize(text)
    tokenized = [nltk.word_tokenize(s) for s in sentences]
    tagged = [nltk.pos_tag(words) for words in tokenized]

    if nltk.__version__[0:2] != "2.":
        chunked = nltk.ne_chunk_sents(tagged, binary=True)
    else:
        chunked = nltk.batch_ne_chunk(tagged, binary=True)
    return chunked
def analyzeText(self, answers):
    """Feed the NE chunks of every non-empty answer to ``extractEntities``.

    Each answer is processed on its own: sentence split, word tokenized,
    POS-tagged and binary NE-chunked. Nothing is returned.
    """
    for answer in answers:
        if answer == "":
            continue
        tokens = [word_tokenize(s) for s in sent_tokenize(answer)]
        postags = [pos_tag(words) for words in tokens]
        for tree in batch_ne_chunk(postags, binary=True):
            self.extractEntities(tree)
def extract_named_entities(text):
    """Return a JSON string describing the named entities of ``text``.

    The JSON object maps each entity name to ``{"type": ..., "count": ...}``.
    Newlines are replaced by spaces and HTML is stripped before analysis.
    """
    # The English stop-word list is loaded here but not consulted below.
    stop = stopwords.words('english')

    # Pre-process.
    flattened = text.replace('\n', ' ')
    cleaned = nltk.clean_html(flattened.strip())

    tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(cleaned)]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

    tally = {}
    for tree in nltk.batch_ne_chunk(tagged, binary=False):
        found = functions.extract_entity_names(tree)
        if not found:
            continue
        # Increment the occurrence count or add a new entry.
        name = found['ne']
        if name in tally:
            tally[name]['count'] += 1
        else:
            tally[name] = {'type': found['type'], 'count': 1}

    return json.dumps(tally)
def named_entities(text):
    """Return the set of named entities in ``text``.

    A progress message and the resulting set are printed.
    """
    print('Extracting named entities...')

    tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

    names = []
    for tree in nltk.batch_ne_chunk(tagged, binary=True):
        names.extend(extract_entity_names(tree))

    results = set(names)
    print(results)
    return results
def nltk_extraction(test):
    """Return every entity name found in ``test``, duplicates included."""
    tagged = []
    for sentence in nltk.sent_tokenize(test):
        tagged.append(nltk.word_tokenize(sentence))
    tagged = [nltk.pos_tag(tokens) for tokens in tagged]

    names = []
    for tree in nltk.batch_ne_chunk(tagged, binary=True):
        names.extend(extract_entity_names(tree))
    return names
def parse_description(description):
    """Return ``(entities, actions)`` extracted from ``description``.

    The text is split into sentences and each sentence is further split at
    newlines before tokenizing, tagging and binary NE chunking. Both lists
    accumulate what ``extract_entity_names`` returns for every chunk tree.
    """
    pieces = [
        piece
        for sentence in nltk.tokenize.sent_tokenize(description)
        for piece in sentence.split("\n")
    ]
    tokenized = [nltk.word_tokenize(piece) for piece in pieces]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

    entities, actions = [], []
    for tree in nltk.batch_ne_chunk(tagged, binary=True):
        tree_entities, tree_actions = extract_entity_names(tree)
        entities += tree_entities
        actions += tree_actions
    return entities, actions
def ExtractNER(txt):
    """Return the named entities of ``txt`` as a list of strings.

    An entity is the space-joined first element of every child of a subtree
    labelled ``'NE'``. Duplicates are kept.
    """
    def walk(node, found):
        # Only nodes with a truthy ``node`` label are inspected.
        if not hasattr(node, 'node') or not node.node:
            return
        if node.node == 'NE':
            found.append(' '.join([child[0] for child in node]))
            return
        for child in node:
            walk(child, found)

    tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize( txt )]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

    entity_names = []
    for tree in nltk.batch_ne_chunk(tagged, binary=True):
        walk(tree, entity_names)
    return entity_names
def execute(self, context):
    """Extract terms from the tagged sentences stored in ``context``.

    Every direct child of every chunk tree that is not itself a ``Tree`` is
    passed to ``__process_term``; ``Tree`` children (named entities) are
    currently ignored. The collected terms are stored in the context under
    ``ProcessingContext.EXTRACTED_TERMS``.

    Raises ``ProcessingException`` when fewer than ``self._min_term_count``
    terms were collected.
    """
    tagged = context[ProcessingContext.TAGGED_SENTENCES]
    trees = nltk.batch_ne_chunk(tagged, binary=False)

    terms = {}
    # Chaining the trees gives direct access to their children.
    for child in chain(*trees):
        if not isinstance(child, Tree):
            self.__process_term(terms, child)

    if len(terms) < self._min_term_count:
        raise ProcessingException('Insufficient terms')
    context[ProcessingContext.EXTRACTED_TERMS] = terms
def nlp_extract_tags(text, lang=None):
    '''Return, via ``jsonp``, the unique named entities found in ``text``.

    The payload is ``{'status': 'ok', 'result': {'tags': [...]}}``.
    ``lang`` is accepted but not used.
    '''
    def gather(node, found):
        # Leaves and unlabelled nodes are skipped.
        if not hasattr(node, 'node') or not node.node:
            return found
        if node.node == 'NE':
            found.append(' '.join([child[0] for child in node]))
        else:
            for child in node:
                gather(child, found)
        return found

    tokenized = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

    entity_names = []
    for tree in nltk.batch_ne_chunk(tagged, binary=True):
        gather(tree, entity_names)

    # Keep each entity name once.
    result = {'tags': list(set(entity_names))}
    return jsonp({'status': 'ok', 'result': result})
def NERCount(sample):
    """Return the named-entity count of ``sample``.

    ``sample`` is split into sentences, each sentence is word-tokenized,
    POS-tagged and binary NE-chunked, and the values returned by
    ``extract_entity_names`` for every chunk tree are added up.
    """
    # Split into sentences first. Word-tokenizing the whole sample here would
    # make every single word its own "sentence", so the tagger and chunker
    # would see each word without any context.
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(tokens) for tokens in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)

    return sum(extract_entity_names(tree) for tree in chunked_sentences)
def nlp_extract_tags(text, lang=None):
    '''Extract the distinct named entities of ``text``.

    Returns ``jsonp({'status': 'ok', 'result': {'tags': [...]}})``.
    ``lang`` is accepted but not used.
    '''
    def entity_names_of(node):
        label = node.node if hasattr(node, 'node') else None
        if not label:
            return []
        if label == 'NE':
            return [' '.join([child[0] for child in node])]
        nested = []
        for child in node:
            nested.extend(entity_names_of(child))
        return nested

    sentences = nltk.sent_tokenize(text)
    tagged = [nltk.pos_tag(words)
              for words in [nltk.word_tokenize(s) for s in sentences]]
    chunked = nltk.batch_ne_chunk(tagged, binary=True)

    names = []
    for tree in chunked:
        names.extend(entity_names_of(tree))

    unique_names = list(set(names))
    return jsonp({'status': 'ok', 'result': {'tags': unique_names}})
def chunk(text, binary=True):
    """Return the named-entity chunks of ``text``.

    Sentences are word-tokenized and POS-tagged lazily (``imap``) and handed
    to ``batch_ne_chunk`` with the given ``binary`` flag.
    """
    sentences = sent_tokenize(text)
    tokenized = imap(word_tokenize, sentences)
    tagged = imap(pos_tag, tokenized)
    return batch_ne_chunk(tagged, binary=binary)
def cfst(text_raw, text_stemmer, text_parser_a):
    """Return hyphen-joined, lower-cased noun/verb phrase fragments of a text.

    text_raw      -- the text to analyse.
    text_stemmer  -- object whose ``stem(word)`` method is applied to every token.
    text_parser_a -- object whose ``parse(sentence)`` method is applied to every
                     NE-chunked sentence.

    Returns a list of strings, one per phrase whose printed form starts with
    ``(NP`` or ``(VP`` and that produced at least one word.
    """
    # TOKENIZATION: split into sentences
    text_sentences = nltk.sent_tokenize(text_raw)
    # TOKENIZATION: split into words
    text_words = [nltk.word_tokenize(sent) for sent in text_sentences]
    # TOKENIZATION: tag the words' part of speech
    text_posed = [nltk.pos_tag(sent) for sent in text_words]
    # TOKENIZATION: stem every word and keep only the sentences that contain
    # both a noun (NN*) and a verb (VB*)
    text_cleaned = []
    for sent in text_posed:
        noun_found = False
        verb_found = False
        new_sent = []
        for word in sent:
            if word[1].startswith('NN'):
                noun_found = True
            if word[1].startswith('VB'):
                verb_found = True
            # the stem replaces the token, the POS tag is kept
            new_sent.append((text_stemmer.stem(word[0]), word[1]))
        if noun_found and verb_found:
            text_cleaned.append(new_sent)
            # reverse sentence order while we are here
            # (put the sentences in chronological order)
            # (gets weird when sentences are split wrong)
            #text_cleaned.insert(0, new_sent)
    # only keep three sentences
    # (these are most likely to be from the user)
    #text_cleaned = text_cleaned[:3]
    # COMPLEX-WORD HANDLING
    # TODO: Add something here (or earlier)
    # Look for most common tri-grams and bi-grams?
    # BASIC-GROUP HANDLING: chunk the words at named entities
    text_chunked = nltk.batch_ne_chunk( text_cleaned) #[nltk.chunk.ne_chunk(sent) for sent in text_cleaned]
    # COMPLEX-PHRASE HANDLING: chunk the words
    text_chunked = [text_parser_a.parse(sent) for sent in text_chunked]
    #for sent in text_chunked:
    #    print sent
    #    print
    # unwind the tree only keeping interesting parts
    text_done = []
    for sent in text_chunked:
        for x in xrange(0, len(sent)):
            # the printed form of a child starts with its label
            tmp_string = str(sent[x])
            if tmp_string.startswith('(NP') or tmp_string.startswith('(VP'):
                text_frag = []
                for y in xrange(0, len(sent[x])):
                    word = sent[x][y]
                    if len(word) == 2:
                        # NOTE(review): presumably tells a two-child subtree
                        # (word[0] is a (token, tag) pair, so word[0][0] is a
                        # whole token) from a plain (token, tag) pair (word[0][0]
                        # is a single character) -- confirm; one-character
                        # tokens inside a subtree take the else branch.
                        if len(word[0][0]) > 1:
                            for sub_word in word:
                                if len(sub_word[0]) > 0:
                                    #print "a", str(sub_word[0]).lower()
                                    text_frag.append(str(sub_word[0]).lower())
                        else:
                            if len(word[0]) > 0:
                                #print "b", str(word[0]).lower()
                                text_frag.append(str(word[0]).lower())
                    elif len(word) > 2:
                        # more than two elements: take the first item of each
                        for sub_word in word:
                            if len(sub_word[0]) > 0:
                                #print "c", len(sub_word), str(sub_word[0]).lower()
                                text_frag.append(str(sub_word[0]).lower())
                # phrases that produced no words are dropped
                if (len(text_frag) > 0):
                    text_done.append('-'.join(text_frag))
    return text_done
#!/usr/bin/python import nltk import cPickle, string, numpy, getopt, sys, random, time, re, pprint import wikirandom from nltk.corpus import conll2000 corpus = [] corpus.append(nltk.pos_tag(nltk.word_tokenize("The Stones Roses and New Order both kicked off reunion tours last year, and Wu-Tang Clan are at work on a new album to mark their twentieth anniversary."))) corpus.append(nltk.pos_tag(nltk.word_tokenize("Rumors about this year's Coachella headliners had been swirling for weeks, with the Rolling Stones and Daft Punk among the acts speculated to be performing at the festival."))) print nltk.batch_ne_chunk(corpus) def getArticles(nr = 5): """ Downloads and analyzes a bunch of random Wikipedia articles """ (docs, names) = wikirandom.get_random_wikipedia_articles(nr) return (docs, names) def getSentChunks(d, n): chunks = [] nament = [] i = 0 for doc in d: sentences = nltk.sent_tokenize(doc) tokenized = [nltk.word_tokenize(s) for s in sentences] pos_taged = [nltk.pos_tag(ptag) for ptag in tokenized] chunksent = nltk.batch_ne_chunk(pos_taged)
if hasattr(tree, 'node') and tree.node: if tree.node == 'NE': entity_names.append(' '.join([child[0] for child in tree])) else: for child in tree: entity_names.extend(find_entities(child)) return entity_names df = read_data() tagged_sentences = df.corpus.apply(parts_of_speech) del df chunked_sentences = tagged_sentences.apply( lambda x: nltk.batch_ne_chunk(x, binary=True) ) entity_names = [] print 'Extracting entities...' print 'Grab a coffee. This takes about 10 minutes.' for cs in chunked_sentences: entities = sorted(list(set([word for tree in cs for word in find_entities(tree)]))) for entity in entities: if entity not in entity_names: entity_names.append(entity) print 'Writing entities to reference folder' with open('./ref/entities.txt', 'w') as f: f.write('\n'.join(word for word in sorted(entity_names)))
import wikirandom from nltk.corpus import conll2000 corpus = [] corpus.append( nltk.pos_tag( nltk.word_tokenize( "The Stones Roses and New Order both kicked off reunion tours last year, and Wu-Tang Clan are at work on a new album to mark their twentieth anniversary." ))) corpus.append( nltk.pos_tag( nltk.word_tokenize( "Rumors about this year's Coachella headliners had been swirling for weeks, with the Rolling Stones and Daft Punk among the acts speculated to be performing at the festival." ))) print nltk.batch_ne_chunk(corpus) def getArticles(nr=5): """ Downloads and analyzes a bunch of random Wikipedia articles """ (docs, names) = wikirandom.get_random_wikipedia_articles(nr) return (docs, names) def getSentChunks(d, n): chunks = [] nament = [] i = 0 for doc in d: sentences = nltk.sent_tokenize(doc)
for word in traverse(tree)]))) for e in entities: if e not in named_entities: named_entities.append(e) return named_entities ## takes text and associate w. POS tags (so can filter on basis of tags) tokens = nltk.word_tokenize(text) sentences = nltk.sent_tokenize(text) words = (nltk.word_tokenize(sentence) for sentence in sentences) tags = [nltk.pos_tag(word) for word in words] named_entity_chunks = nltk.batch_ne_chunk(tags) find_entities(named_entity_chunks) # http://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity/12128777 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.metrics.pairwise import linear_kernel sentences[:10] tfidf = TfidfVectorizer(ngram_range=(1,3), token_pattern=r'\b\w+\b', min_df=1).fit_transform(cleaned_text) # need to use character n-grams cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten() related_docs_indices = cosine_similarities.argsort()[:-20:-1] related_docs_indices
def chunk(patterns): for sent in chunks: tree = NPChunker.parse(sent) for subtree in tree.subtrees(): if subtree.node == 'NP': print subtree nplist.append(subtree) chunk(patterns) print len(nplist) ne_chunks = nltk.batch_ne_chunk(nplist) print ne_chunks for i in range(len(ne_chunks)): tree = ne_chunks[i] print 'PERSON' print sub_leaves(tree, 'PERSON') for i in range(len(ne_chunks)): tree = ne_chunks[i] print 'ORGANIZATION' print sub_leaves(tree, 'ORGANIZATION') for i in range(len(ne_chunks)): tree = ne_chunks[i]
words = [] #break into sentences sentences = nltk.tokenize.sent_tokenize(raw_quote) #now break sentences into tokens tokens = [nltk.word_tokenize(s) for s in sentences] #A bit of POS tagging pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens] #Chunk extraction time ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens) # Flatten the list since we're not using sentence structure # and sentences are guaranteed to be separated by a special # POS tuple such as ('.', '.') pos_tagged_tokens = [token for sent in pos_tagged_tokens for token in sent] print type(pos_tagged_tokens) pos_tagged_tokens words = [w.lower() for sentence in sentences for w in nltk.word_tokenize(sentence)] #Basic lexical diversity measures length_words = len(words) print length_words
import nltk import json import fileinput sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') text = '' for line in fileinput.input(): text += line sentences = nltk.sent_tokenize(text) sentences = [nltk.word_tokenize(sent) for sent in sentences] sentences = [nltk.pos_tag(sent) for sent in sentences] chunked_sentences = nltk.batch_ne_chunk(sentences, binary=True) def extract_entity_names(t): entity_names = [] if hasattr(t, 'node') and t.node: if t.node == 'NE': entity_names.append(' '.join([child[0] for child in t])) else: for child in t: entity_names.extend(extract_entity_names(child)) return entity_names entity_names = [] for tree in chunked_sentences:
def parts_of_speech(corpus):
    """Return the binary named-entity chunks of every sentence in ``corpus``."""
    tagged = [
        nltk.pos_tag(words)
        for words in [nltk.word_tokenize(s) for s in nltk.sent_tokenize(corpus)]
    ]
    return nltk.batch_ne_chunk(tagged, binary=True)
#! /usr/bin/env python

# Import required modules
import sys
import string
import nltk

# Read data from stdin, one line at a time
for line in sys.stdin:
    # Lines read from stdin keep their trailing newline, so len(line) is
    # never 0 for a blank line. Test the stripped text instead; otherwise a
    # blank line yields no sentences and the indexing below raises IndexError.
    if not line.strip():
        continue
    sentences = nltk.tokenize.sent_tokenize(line)

    # Extract words from sentences
    words = [nltk.tokenize.word_tokenize(s) for s in sentences]

    # Extract Part of Speech from words
    pos_words = [nltk.pos_tag(t) for t in words]

    # Chunk the extracted Part of Speech tags
    named_entities = nltk.batch_ne_chunk(pos_words)

    # Write the chunk of the line's first sentence to stdout
    # NOTE(review): chunks of any further sentences on the same line are not
    # written -- confirm this is intended.
    print(named_entities[0])