def extract_relations(s, chunk_type='np'):
    '''
    Given a sentence, extract the list of relations.
    chunk_type: which chunk type to extract relations over, 'np' | 'ne'
    '''
    s = clean_sent(s)
    tokens = word_tokenize(s)
    # add an NP layer to work around the NLTK relation-extraction bug
    pos_sent = pos_tag(tokens)
    pos_sent = change_pos(pos_sent)
    cp = nltk.RegexpParser(NP_PATTERN)
    np_sent = cp.parse(pos_sent)
    # pprint(np_sent)
    nps = TextBlob(s).noun_phrases
    np_chunk = _build_tree_from_nps(tokens, nps)
    # pprint(nps)
    if chunk_type == 'np':
        pairs = relextract.tree2semi_rel(np_sent)
        # pprint(len(pairs))
    elif chunk_type == 'ne':
        pairs = relextract.tree2semi_rel(nltk.ne_chunk(pos_sent))
    rel_dicts = pair2rel(pairs)
    # pprint(rel_dicts)
    # stop()
    return rel_dicts
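# A minimal, self-contained sketch of the NP-chunking pipeline that extract_relations relies on.
# NP_PATTERN, clean_sent, change_pos, _build_tree_from_nps and pair2rel are helpers from the
# original codebase and are not shown here; the grammar below is an assumed stand-in, while
# tree2semi_rel / semi_rel2reldict are the real NLTK calls.
import nltk
from nltk import word_tokenize, pos_tag
from nltk.sem import relextract

ASSUMED_NP_PATTERN = 'NP: {<DT>?<JJ>*<NN.*>+}'  # hypothetical grammar, not the original NP_PATTERN

def np_relations_sketch(sentence):
    tagged = pos_tag(word_tokenize(sentence))
    np_tree = nltk.RegexpParser(ASSUMED_NP_PATTERN).parse(tagged)
    pairs = relextract.tree2semi_rel(np_tree)       # [(preceding words, NP subtree), ...]
    return relextract.semi_rel2reldict(pairs)       # empty unless the sentence has 3+ NP chunks

# Example (output depends on the taggers installed):
# for r in np_relations_sketch("The cat sat on the mat near the old door in the house."):
#     print(r['subjtext'], '->', r['filler'], '->', r['objtext'])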
def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10):
    """
    Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.

    The parameters ``subjclass`` and ``objclass`` can be used to restrict the
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').

    :param subjclass: the class of the subject Named Entity.
    :type subjclass: str
    :param objclass: the class of the object Named Entity.
    :type objclass: str
    :param doc: input document
    :type doc: ieer document or a list of chunk trees
    :param corpus: name of the corpus to take as input; possible values are
        'ieer', 'conll2002' and 'ace'
    :type corpus: str
    :param pattern: a regular expression for filtering the fillers of retrieved triples.
    :type pattern: SRE_Pattern
    :param window: filters out fillers which exceed this threshold
    :type window: int
    :return: see ``semi_rel2reldict``
    :rtype: list(defaultdict)
    """
    if subjclass and subjclass not in NE_CLASSES[corpus]:
        if _expand(subjclass) in NE_CLASSES[corpus]:
            subjclass = _expand(subjclass)
        else:
            raise ValueError(
                "your value for the subject type has not been recognized: %s" % subjclass)
    if objclass and objclass not in NE_CLASSES[corpus]:
        if _expand(objclass) in NE_CLASSES[corpus]:
            objclass = _expand(objclass)
        else:
            raise ValueError(
                "your value for the object type has not been recognized: %s" % objclass)

    if corpus == 'ace' or corpus == 'conll2002':
        pairs = tree2semi_rel(doc)
    elif corpus == 'ieer':
        pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
    else:
        raise ValueError("corpus type not recognized")

    reldicts = semi_rel2reldict(pairs)

    relfilter = lambda x: (x['subjclass'] == subjclass and
                           len(x['filler'].split()) <= window and
                           pattern.match(x['filler']) and
                           x['objclass'] == objclass)

    return list(filter(relfilter, reldicts))
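# A hedged usage sketch for extract_rels over the IEER corpus, following the example that
# appears later in this collection; it assumes the NLTK 'ieer' corpus has been downloaded
# (nltk.download('ieer')).
import re
from nltk.corpus import ieer
from nltk.sem import relextract

IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
for fileid in ieer.fileids():
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
            print(relextract.rtuple(rel))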
def detectNERViaTrainedAlgo(sentence):
    dataDirectoryLocation = "data_nlp_ner"
    reader = readLoadTrainData(dataDirectoryLocation)
    loadedData = list(reader)
    training_samples = loadedData[:int(len(loadedData) * 0.8)]
    test_samples = loadedData[int(len(loadedData) * 0.8):]
    print("# Number of training samples = %s" % len(training_samples))
    print("# Number of test samples = %s" % len(test_samples))
    # Training on more samples would improve the results, but it needs more time.
    classifier = NERClassifier(training_samples[:100])
    classifieddataTree, classifieddataList = classifier.parseTheData(
        pos_tag(word_tokenize(sentence)))
    listOfNamedEntities = []
    properNoun = "NNP"
    noun = "NN"
    properNounPlural = "NNPS"
    nounPlural = "NNS"
    for x in classifieddataList:
        if properNoun in x:
            listOfNamedEntities.append((x, properNoun))
        if noun in x:
            listOfNamedEntities.append((x, noun))
        if properNounPlural in x:
            listOfNamedEntities.append((x, properNounPlural))
        if nounPlural in x:
            listOfNamedEntities.append((x, nounPlural))
    print("*****---------- NER detected by Learned Annotator")
    print(listOfNamedEntities)
    print("**** ---below is the output from code to extract relations between entities by library--****")
    relationResult = relextract.tree2semi_rel(classifieddataTree)
    for s, tree in relationResult:
        print(str(s) + " has something to do with: " + str(tree))
def reasoning(dList):
    reasonList = []
    tokenizer = TweetTokenizer()
    for tweet in dList:
        print tweet
        # tokenize
        words = tokenizer.tokenize(tweet)
        # get POS tags
        pos_tokens = pos_tag(words)
        # get named entities
        tree = ne_chunk(pos_tokens, binary=False)
        # find relations
        pairs = relextract.tree2semi_rel(tree)
        # get interesting named entities
        reason = []
        for s, tree in pairs:
            reasonStr = ("%s") % tree
            reasonStr = reasonStr.split(" ")
            label = reasonStr[0].replace("(", "").strip()
            content = ""
            for wordTag in reasonStr[1:]:
                sp = wordTag.split("/")
                word = sp[0].replace("(", "")
                print word
                # content.append(word)
                content += (word + " ")
            # reason: [(label, content)]
            reason.append({"label": label, "content": content})
        # reasonList: [reason]
        if len(reason) > 0:
            reasonList.append({"reason": reason})
        print str(len(reasonList)) + "/" + str(len(dList))
    return reasonList
def main():
    pattern = 'NP: {<DT>?<JJ>*<NN>}'
    # IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    with open("wiki-abstracts-only.txt", "r") as fin:
        for line in fin:
            sent = nltk.word_tokenize(line)
            # augment with POS tags
            sent = nltk.pos_tag(sent)
            cp = nltk.RegexpParser(pattern)
            cs = cp.parse(sent)
            # print(cs)
            ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(line)))
            # print(ne_tree)
            # for rel in nltk.sem.extract_rels('ORG', 'LOC', line, corpus='ieer', pattern=IN):
            #     print(nltk.sem.rtuple(rel))
            pairs = relextract.tree2semi_rel(ne_tree)
            reldicts = relextract.semi_rel2reldict(pairs)
            # print(len(reldicts))
            for r in reldicts:
                # print('[' + r['subjtext'] + '] : [' + r['filler'] + '] : [' + r['objtext'] + ']')
                # remove POS tags
                sub = r['subjtext'].replace('/NNPS', '').replace('/NNP', '').replace('/JJ', '')
                obj = r['objtext'].replace('/NNPS', '').replace('/NNP', '')
                vb = r['filler'].replace('/NNS', '').replace('/NNP', '').replace('/NN', '').replace('/CC', '') \
                    .replace('/PRP$', '').replace('/DT', '').replace('/CD', '').replace('/JJ', '').replace('/PRP', '') \
                    .replace('/WP', '').replace('/IN', '').replace('/VBD', '').replace('/VBN', '')
                print('[' + sub + '] : [' + vb + '] : [' + obj + ']')
def extract_entities(text):
    tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(nltk.sent_tokenize(text)[24])))
    pairs = relextract.tree2semi_rel(tree)
    for sent, tree in pairs:
        print('("...%s", %s)' % (" ".join(word for word, tag in sent[-5:]), tree))
    reldicts = relextract.semi_rel2reldict(pairs)
    for r in reldicts:
        print '=' * 30
        print(r['subjclass'], ':', r['subjtext'])
        print(r['filler'])
        print(r['objclass'], ':', r['objtext'])
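# For reference, each reldict returned by semi_rel2reldict is a defaultdict whose keys, in
# current NLTK versions, include 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler',
# 'untagged_filler', 'objclass', 'objtext', 'objsym' and 'rcon' (check nltk.sem.relextract
# for your install). A small hedged sketch that prints them for a single sentence; nothing
# is printed unless ne_chunk finds at least three entity chunks.
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.sem import relextract

def dump_reldicts(sentence):
    tree = ne_chunk(pos_tag(word_tokenize(sentence)))
    for r in relextract.semi_rel2reldict(relextract.tree2semi_rel(tree)):
        for k, v in sorted(r.items()):
            print('%s => %s' % (k, v))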
def relationExtraction(self, subjectClass, objectClass, pattern):
    window = 5
    relations = []
    relfilter = lambda x: (x['subjclass'] == subjectClass and
                           len(x['filler'].split()) <= window and
                           pattern.match(x['filler']) and
                           x['objclass'] == objectClass)
    for sent in self.tagged_sentences:
        chunked = nltk.ne_chunk(sent)
        reldicts = self.semi_rel2reldict(tree2semi_rel(chunked))
        rels = list(filter(relfilter, reldicts))
        for rel in rels:
            relations.append(nltk.sem.relextract.rtuple(rel))
    return relations
def relationExtraction(chunk_tree):
    IN = re.compile(r'.*\b in \b.*')
    # chunk_tree.headline = ['S']  # the headline is empty, because the article is split into
    #                              # sentences and each sentence is its own tree
    # chunk_tree.text = chunk_tree  # API change: `text` is read as the input content
    relations = []
    # conllStr = chunk.tree2conlltags(chunk_tree)
    # chunk_tree = chunk.conlltags2tree(conllStr)
    # tree2semi_rel yields pairs of two parts: a word sequence without a named entity,
    # followed by the subtree that contains one.
    pairs = relextract.tree2semi_rel(chunk_tree)
    fix_pairs = []
    for word_list, tree in pairs:
        fix_pairs.append(tree)
    reldicts = relextract.semi_rel2reldict(pairs)
    # print the relation table
    for reldict in reldicts:
        print('\n')
        for k, v in sorted(reldict.items()):
            print(k, '=>', v)  # doctest: +ELLIPSIS
    # org_place_relations = relextract.extract_rels(
    #     'ORG', 'GPE', chunk_tree, corpus='conll2002', pattern=IN)
    # per_place_relations = relextract.extract_rels(
    #     'PER', 'GPE', chunk_tree, corpus='conll2002', pattern=IN)
    # condition = False
    # if fix_pairs.__contains__('PERSON') and fix_pairs.__contains__('ORGANIZATION'):
    #     condition = True
    # has_per = False
    # has_org = False
    # for tree in fix_pairs:
    #     if getattr(tree, 'label') is 'PERSON':
    #         has_per = True
    #     if getattr(tree, 'label') is 'ORGANIZATION':
    #         has_org = False
    # for relation in nltk.sem.extract_rels('PERSON', 'ORGANIZATION', chunk_tree, corpus='ace', pattern=PR):
    #     print(relation)
    #     relations.append(nltk.sem.rtuple(relation))
    return reldicts
def extractSampleRels(sample):
    # with open('toyset', 'r') as f:
    #     sample = f.read().decode('utf-8')
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    entitiesMap = []
    for i, sent in enumerate(tagged_sentences):
        sent = nltk.ne_chunk(sent)  # ne_chunk expects one tagged sentence
        pairs = relextract.tree2semi_rel(sent)
        reldicts = relextract.semi_rel2reldict(pairs)
        for r in reldicts:
            entitiesMap.append((r['subjtext'], r['filler'], r['objtext']))
    return entitiesMap
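# Hedged usage sketch for extractSampleRels: the sample text is made up, and whether any
# triple comes back depends on nltk.ne_chunk finding at least three entity chunks in a
# single sentence (NLTK's semi_rel2reldict only emits output while more than two chunk
# pairs remain). Assumes nltk and relextract are imported as in the function above.
if __name__ == '__main__':
    sample = ("Barack Obama met Angela Merkel in Berlin before flying to Paris. "
              "Google opened an office in London near the Thames.")
    for subj, filler, obj in extractSampleRels(sample):
        print('[%s] -- [%s] --> [%s]' % (subj, filler, obj))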
def such_as_np(s, np_sent):
    '''
    Given an NP-chunked sentence, try to extract the concepts of a "such as" pattern.
    X -- the set of general concepts (left of "such as")
    Y -- the set of example concepts (right of "such as")
    '''
    X = set()
    Y = set()
    if re.findall(r'\bsuch\b\s\bas\b', s):
        # extract the "such as" pattern
        # logging.info(s)
        semi_pairs = relextract.tree2semi_rel(np_sent)
        reldicts = relextract.semi_rel2reldict(semi_pairs)
        # find the first "such as"
        logging.info(np_sent)
        # pprint(semi_pairs)
        # pprint(reldicts)
        # logging.info(len(reldicts))
        if len(reldicts) > 0:
            try:
                while 'such as' not in reldicts[0]['untagged_filler']:
                    reldicts.pop(0)
                X.add(reldicts[0]['subjsym'])
                Y.add(reldicts[0]['objsym'])
                reldicts.pop(0)
                # collect the sub-concepts
                for reldict in reldicts:
                    if reldict['untagged_filler'] not in [',', 'and', 'or']:
                        Y.add(reldict['subjsym'])
                        break
                    Y.add(reldict['subjsym'])
                    Y.add(reldict['objsym'])
            except Exception as e:
                logging.error(e)
                logging.error(reldicts)
                logging.error('Original sentence: ' + s)
                stop()
    return (X, Y)
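# such_as_np expects an NP-chunked tree built elsewhere in its codebase; the grammar below
# is an assumed stand-in for that NP_PATTERN, shown only to make the function callable end
# to end (logging and stop() still come from the original module).
import re
import logging
import nltk
from nltk import word_tokenize, pos_tag
from nltk.sem import relextract

ASSUMED_NP = 'NP: {<DT>?<JJ>*<NN.*>+}'  # hypothetical grammar, not the original NP_PATTERN

def such_as_demo(sentence):
    np_tree = nltk.RegexpParser(ASSUMED_NP).parse(pos_tag(word_tokenize(sentence)))
    return such_as_np(sentence, np_tree)

# e.g. X, Y = such_as_demo("He likes fruits such as apples, oranges and pears.")
# The exact contents of X and Y depend on the tagger and chunker output.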
def createDoc(text):
    # Create a DOCUMENT by combining all the chunked sentences.
    chunkedSents = list()
    for sent in nltk.sent_tokenize(text):
        chunkedSents += [chunk for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))]
    docTree = nltk.Tree('DOCUMENT', chunkedSents)
    pairs = relextract.tree2semi_rel(docTree)
    for sent, tree in pairs:
        print '("...%s", %s)' % (' '.join([word for word, tag in sent][-5:]), tree)  # To print
    reldicts = relextract.semi_rel2reldict(pairs)
    return reldicts
    for r in reldicts:
        print '=' * 30
        print(r['subjclass'], ':', r['subjtext'])
        print(r['filler'])
        print(r['objclass'], ':', r['objtext'])
def getGrammarRelations(named_entities, tree):
    pairs = relextract.tree2semi_rel(tree)
    reldicts = rel2dict(pairs)

    rules = [
        # TYPE1, TYPE2, [keywords], ONTOLOGY, ISMIRRORED
        ('ORGANIZATION', 'PERSON', ['work'], 'employer', False),
        ('PERSON', 'PERSON', ['sister'], 'sibling', True),
    ]

    porter = PorterStemmer()
    relations = {}
    for rels in reldicts:
        dbotype1 = [ner[1] for ner in named_entities if ner[0] == rels[0]][0]
        dbotype2 = [ner[1] for ner in named_entities if ner[0] == rels[2]][0]
        space = [porter.stem(word[0]) for word in rels[1]]
        proposals = [(rule[2], rule[3], rule[4]) for rule in rules
                     if rule[0] == dbotype1 and rule[1] == dbotype2]
        for (words, newtype, mirror) in proposals:
            isIn = bool(list(set(space) & set(words)))
            if isIn:
                if rels[0] not in relations:
                    relations[rels[0]] = []
                relations[rels[0]].append(
                    (rels[2], ["http://dbpedia.org/ontology/" + newtype]))
                if mirror:
                    if rels[2] not in relations:
                        relations[rels[2]] = []
                    relations[rels[2]].append(
                        (rels[0], ["http://dbpedia.org/ontology/" + newtype]))
    return relations
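# rel2dict is not part of NLTK and is not defined in this collection. The sketch below is
# an assumed stand-in that turns tree2semi_rel pairs into the (subject, tagged filler
# tokens, object) triples getGrammarRelations appears to expect; the real helper may differ.
from nltk.sem import relextract

def rel2dict_sketch(pairs):
    def untag(tagged_text):
        # "New/NNP York/NNP" -> "New York"
        return ' '.join(tok.split('/')[0] for tok in tagged_text.split())
    triples = []
    for r in relextract.semi_rel2reldict(pairs):
        filler_tokens = [tuple(tok.split('/')) for tok in r['filler'].split()]
        triples.append((untag(r['subjtext']), filler_tokens, untag(r['objtext'])))
    return triples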
def createDoc(text):
    # Create a DOCUMENT by combining all the chunked sentences.
    chunkedSents = list()
    for sent in nltk.sent_tokenize(text):
        chunkedSents += [chunk for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))]
    docTree = nltk.Tree('DOCUMENT', chunkedSents)
    pairs = relextract.tree2semi_rel(docTree)
    for sent, tree in pairs:
        print '("...%s", %s)' % (' '.join([word for word, tag in sent][-5:]), tree)  # To print
    reldicts = relextract.semi_rel2reldict(pairs)
    return reldicts
    for r in reldicts:
        print '=' * 30
        print(r['subjclass'], ':', r['subjtext'])
        print(r['filler'])
        print(r['objclass'], ':', r['objtext'])

"""
def improve(reldicts):
    for dicts in reldicts:
        print len(nltk.sent_tokenize(dicts['filler']))

improve(reldicts)
"""

# print pairs[0]
# print pairs[1]
# print pairs[2]
# for sent, tree in pairs[0]:
#     print sent, tree
#     print('("...%s", %s)' % (" ".join(sent[0][-5:]), tree))
# tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(nltk.sent_tokenize(text)[24])))
# l = [chunk for chunk in tree]
# print "%s" % docTree
def main():
    pattern = 'NP: {<DT>?<JJ>*<NN>}'
    with open("wiki-abstracts-only.txt", "r") as fin:
        for line in fin:
            sent = nltk.word_tokenize(line)
            # augment with POS tags
            sent = nltk.pos_tag(sent)
            cp = nltk.RegexpParser(pattern)
            cs = cp.parse(sent)
            ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(line)))
            pairs = relextract.tree2semi_rel(ne_tree)
            reldicts = relextract.semi_rel2reldict(pairs)
            for r in reldicts:
                # remove POS tags
                sub = r['subjtext'].replace('/NNPS', '').replace('/NNP', '').replace('/JJ', '')
                obj = r['objtext'].replace('/NNPS', '').replace('/NNP', '')
                vb = r['filler'].replace('/NNS', '').replace('/NNP', '').replace('/NN', '').replace('/CC', '') \
                    .replace('/PRP$', '').replace('/DT', '').replace('/CD', '').replace('/JJ', '').replace('/PRP', '') \
                    .replace('/WP', '').replace('/IN', '').replace('/VBD', '').replace('/VBN', '')
                # print result
                print('[' + sub + '] : [' + vb + '] : [' + obj + ']')
s = "" for word in line: s = s + " " + word print s + "\n" for word in line: meanings = net.synsets(word) if len(meanings) > 0: print meanings[0].definition() elif num == "3": docs = ieer.parsed_docs('APW_19980424') tree = docs[1].text from nltk.sem import relextract pairs = relextract.tree2semi_rel(tree) for s, tree in pairs[18:22]: print('("...%s", %s)' % (" ".join(s[-5:]), tree)) reldicts = relextract.semi_rel2reldict(pairs) for k, v in sorted(reldicts[0].items()): print(k, '=>', v) # The function relextract() allows us to filter the reldicts # according to the classes of the subject and object named entities. # In addition, we can specify that the filler text has to match a given regular expression, # as illustrated in the next example. Here, we are looking for pairs of entities in the IN # relation, where IN has signature <ORG, LOC>. IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)') for fileid in ieer.fileids(): print fileid
def detectDisplayNamedEntities(sentence):
    # If we used the built-in POS tagger the result would be better.
    # I have trained my own POS tagger as well, but its accuracy is around 80 to 90%; I can use that too.
    tokens = nltk.word_tokenize(sentence)
    resultList2 = list(nltk.pos_tag(tokens))
    print(resultList2)
    # grammar = "NP: {<DT>?<JJ>*<NN>}"
    # For the desired result we can update the grammar.
    grammar2 = r"""
      NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
          {<NNP>+}                # chunk sequences of proper nouns
          {<NNS>+}                # chunk sequences of plural nouns
          {<NNPS>+}               # chunk sequences of plural proper nouns
          {<LS>+}                 # chunk sequences of list item markers
    """
    cp = nltk.RegexpParser(grammar2)
    nounPhraseTree = cp.parse(resultList2)
    print(nounPhraseTree)
    print("**** -below is the output from code to extract relations between entities by library--****")
    relationResult = relextract.tree2semi_rel(nounPhraseTree)
    for s, tree in relationResult:
        print(str(s) + " has something to do with: " + str(tree))
    # Uncomment the line below when you want to see the tree structure of the tags as well.
    # nounPhraseTree.draw()

    nounList = []
    for node in nounPhraseTree:
        if isinstance(node, nltk.tree.Tree):
            if node.label() == 'NP':
                NP = node.leaves()
                print(NP)
                for x in NP:
                    if x[1] == 'NN' or x[1] == 'NNP' or x[1] == 'NNPS' or x[1] == 'NNS':
                        nounList.append(x[0])
    print("*****-----------------------------------------------------------*****")
    print("list of all nouns detected in the text is as below:")
    print(nounList)

    dictionary = {}
    dictionary['countries'] = []
    # with open('countries.txt') as openfileobject:
    #     for line in openfileobject:
    #         dictionary['countries'].append(line.rstrip())
    # openfileobject.closed
    fileHandler = open('countries.txt')
    allCountries = fileHandler.read()
    fileHandler.close()
    dictionary['countries'] = allCountries.split("\n")

    fileHandler = open('months.txt')
    allCountries = fileHandler.read()
    fileHandler.close()
    dictionary['months'] = allCountries.split("\n")

    fileHandler = open('days.txt')
    allCountries = fileHandler.read()
    fileHandler.close()
    dictionary['days'] = allCountries.split("\n")
    # In the same way we can use different dictionaries to tag details for our detected nouns.
    # print(dictionary['countries'][1])

    finalNamedEntityWithEntityTags = []
    for n in nounList:  # here n is one noun from the list of nouns
        if n in dictionary['countries']:
            finalNamedEntityWithEntityTags.append((n, 'name of Country'))
        if n in dictionary['months']:
            finalNamedEntityWithEntityTags.append((n, 'name of Month'))
        if n in dictionary['days']:
            finalNamedEntityWithEntityTags.append((n, 'Day of the week'))
    for resultLine in finalNamedEntityWithEntityTags:
        print(resultLine)

    finalNERWithDetail = []
    dictionary = PyDictionary()
    for n in nounList:
        # this will help the user to understand the detected named entities
        try:
            # if the dictionary has no synonym then it is a name
            finalNERWithDetail.append((n, dictionary.synonym(n)))
        except:
            finalNERWithDetail.append((n, "it is a name of something or a person"))
    print("=> Detected named entities with synonym detail that helps to understand them:")
    for resultLine in finalNERWithDetail:
        print(resultLine)
def main():
    print "user input(1) or semcor(2)?"
    num = raw_input()
    if num == "1":
        # user input
        print "enter word"
        word = raw_input()
        for meaning in net.synsets(word):
            # print "Sense: " + re.findall("'.*'", str(meaning))[0]
            print "Sense: " + str(meaning)
            print meaning.definition() + "\n"
            hypernyms = meaning.hypernyms()
            if len(hypernyms) > 0:
                print "\nHypernyms:"
                for meaning2 in hypernyms:
                    print re.findall("'.*'", str(meaning2))[0]
            hyponyms = meaning.hyponyms()
            if len(hyponyms) > 0:
                print "\nHyponyms:"
                for meaning2 in hyponyms:
                    print re.findall("'.*'", str(meaning2))[0]
            # print "\nHypernym Tree:"
            # print (gethypernymtree(meaning))
            print "\n"
            # dog = wn.synset('dog.n.01')
            # hypo = lambda s: s.hyponyms()
            # hyper = lambda s: s.hypernyms()
            # list(dog.closure(hyper, depth=1)) == dog.hypernyms()  # True
    elif num == "2":
        # semcor
        print "semcor"
        for line in semcor.sents()[0:100]:
            s = ""
            for word in line:
                s = s + " " + word
            print s + "\n"
            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print meanings[0].definition()
    elif num == "3":
        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text
        from nltk.sem import relextract
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))
        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)

        # The function extract_rels() allows us to filter the reldicts
        # according to the classes of the subject and object named entities.
        # In addition, we can specify that the filler text has to match a given regular expression,
        # as illustrated in the next example. Here, we are looking for pairs of entities in the IN
        # relation, where IN has signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print fileid
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

        roles = "(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"
        ROLES = re.compile(roles, re.VERBOSE)
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
def createDoc(text):
    # Create a DOCUMENT by combining all the chunked sentences.
    chunkedSents = list()
    for sent in nltk.sent_tokenize(text):
        chunkedSents += [chunk for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))]
    docTree = nltk.Tree('DOCUMENT', chunkedSents)
    pairs = relextract.tree2semi_rel(docTree)
    for sent, tree in pairs:
        print '("...%s", %s)' % (' '.join([word for word, tag in sent][-5:]), tree)  # To print
    reldicts = relextract.semi_rel2reldict(pairs)
    for r in reldicts:
        print '=' * 30
        print(r['subjclass'], ':', r['subjtext'])
        print(r['filler'])
        print(r['objclass'], ':', r['objtext'])

    # Match pattern in filler
    roles = """
    (.*(
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*
    """  # "X, of (the) Y"
    ROLES = re.compile(roles, re.VERBOSE)
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')

    pattern = ROLES
    subjclass = 'PERSON'        # 'ORGANIZATION'
    objclass = 'ORGANIZATION'   # 'GPE'
    window = 10
    relfilter = lambda x: (x['subjclass'] == subjclass and
                           len(x['filler'].split()) <= window and
                           pattern.match(x['filler']) and
                           x['objclass'] == objclass)
    for rel in list(filter(relfilter, reldicts)):
        print(relextract.rtuple(rel))

    def improve(reldicts):
        for dicts in reldicts:
            print len(nltk.sent_tokenize(dicts['filler']))

    improve(reldicts)

# print pairs[0]
# print pairs[1]
# print pairs[2]
# for sent, tree in pairs[0]:
#     print sent, tree
#     print('("...%s", %s)' % (" ".join(sent[0][-5:]), tree))
# tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(nltk.sent_tokenize(text)[24])))
# l = [chunk for chunk in tree]
# print "%s" % docTree