def main(): pattern = 'NP: {<DT>?<JJ>*<NN>}' #IN = re.compile(r'.*\bin\b(?!\b.+ing)') with open("wiki-abstracts-only.txt", "r") as fin: for line in fin: sent = nltk.word_tokenize(line) #augment with POS tags sent = nltk.pos_tag(sent) cp = nltk.RegexpParser(pattern) cs = cp.parse(sent) #print(cs) ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(line))) #print(ne_tree) #for rel in nltk.sem.extract_rels('ORG', 'LOC', line, corpus='ieer', pattern = IN): # print(nltk.sem.rtuple(rel)) pairs = relextract.tree2semi_rel(ne_tree) reldicts = relextract.semi_rel2reldict(pairs) #print(len(reldicts)) for r in reldicts: #print('[' + r['subjtext'] + '] : [' + r['filler'] + '] : [' + r['objtext'] + ']') # remove POS tags sub = r['subjtext'].replace('/NNPS', '').replace('/NNP', '').replace('/JJ', '') obj = r['objtext'].replace('/NNPS', '').replace('/NNP', '') vb = r['filler'].replace('/NNS','').replace('/NNP','').replace('/NN','').replace('/CC','').\ replace('/PRP$','').replace('/DT','').replace('/CD','').replace('/JJ','').replace('/PRP','').\ replace('/WP','').replace('/IN',"").replace('/VBD','').replace('/VBN','') print('[' + sub + '] : [' + vb + '] : [' + obj + ']')
def extract_entities(text):
    """NE-chunk the 25th sentence of *text* and print its semi-relation
    pairs and relation dicts.

    NOTE(review): hard-codes sentence index 24; raises IndexError for
    shorter texts — TODO confirm intended.
    """
    tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(nltk.sent_tokenize(text)[24])))
    pairs = relextract.tree2semi_rel(tree)
    for sent, subtree in pairs:
        # Show the trailing context tokens before each NE subtree.
        print('("...%s", %s)' % (" ".join(sent[0][-5:]), subtree))
    reldicts = relextract.semi_rel2reldict(pairs)
    for r in reldicts:
        # was: `print '='*30` — a Python 2 print statement that is a
        # SyntaxError under Python 3, mixed with print() calls below.
        print('=' * 30)
        print(r['subjclass'], ':', r['subjtext'])
        print(r['filler'])
        print(r['objclass'], ':', r['objtext'])
def relationExtraction(chunk_tree):
    """Extract relation dicts from an NE-chunked tree.

    Prints each reldict for inspection and returns the full list.

    (Removed: unused ``IN`` regex, unused ``relations`` / ``fix_pairs``
    locals, and large blocks of commented-out experiments; original
    Chinese comments translated.)
    """
    # Split the tree into alternating (non-entity words, entity subtree)
    # pairs, then turn them into relation dictionaries.
    pairs = relextract.tree2semi_rel(chunk_tree)
    reldicts = relextract.semi_rel2reldict(pairs)
    # Print the relation table.
    for reldict in reldicts:
        print('\n')
        for k, v in sorted(reldict.items()):
            print(k, '=>', v)
    return reldicts
def extractSampleRels(sample):
    """Return (subjtext, filler, objtext) triples for every relation found
    in *sample*.

    Each sentence is tokenized, POS-tagged, NE-chunked, and run through
    nltk.sem.relextract; triples are collected in document order.
    """
    triples = []
    for sentence in nltk.sent_tokenize(sample):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        chunked = nltk.ne_chunk(tagged)  # ne_chunk expects one tagged sentence
        for rel in relextract.semi_rel2reldict(relextract.tree2semi_rel(chunked)):
            triples.append((rel['subjtext'], rel['filler'], rel['objtext']))
    return triples
def such_as_np(s, np_sent):
    '''
    Given a np chunked sentences, try to extract the concepts
    X--set
    y--set
    '''
    # X collects the hypernym(s) (the concept before "such as"); Y collects
    # the hyponyms (the listed examples after it).
    X = set()
    Y = set()
    # Only process sentences that literally contain the phrase "such as".
    if re.findall(r'\bsuch\b\s\bas\b', s):
        # extract the such as pattern
        # logging.info(s)
        semi_pairs = relextract.tree2semi_rel(np_sent)
        reldicts = relextract.semi_rel2reldict(semi_pairs)
        # find the first such as
        logging.info(np_sent)
        # pprint(semi_pairs)
        # pprint(reldicts)
        # logging.info(len(reldicts))
        if len(reldicts) > 0:
            try:
                # Drop leading reldicts until the filler contains "such as";
                # IndexError from pop(0) on exhaustion is caught below.
                while 'such as' not in reldicts[0]['untagged_filler']:
                    reldicts.pop(0)
                # First match: subject is the broad concept, object the
                # first example.
                X.add(reldicts[0]['subjsym'])
                Y.add(reldicts[0]['objsym'])
                reldicts.pop(0)
                # find the sub concept
                # Consume the ", and/or"-joined continuation of the list;
                # stop at the first filler that is not a connector (its
                # subject is still part of the enumeration).
                for reldict in reldicts:
                    if reldict['untagged_filler'] not in [',', 'and', 'or']:
                        Y.add(reldict['subjsym'])
                        break
                    Y.add(reldict['subjsym'])
                    Y.add(reldict['objsym'])
            except Exception as e:
                logging.error(e)
                logging.error(reldicts)
                logging.error('Original sentence: '+s)
                # NOTE(review): stop() is not defined in this file —
                # presumably a debugging breakpoint helper; verify it exists
                # at runtime or this error path raises NameError.
                stop()
    return (X, Y)
def createDoc(text):
    """Combine all NE-chunked sentences of *text* into one 'DOCUMENT' tree
    and return its relation dicts.

    Also prints each semi-relation pair (last five context words plus the
    entity subtree) as it goes.
    """
    chunkedSents = list()
    for sent in nltk.sent_tokenize(text):
        chunkedSents += [chunk for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))]
    docTree = nltk.Tree('DOCUMENT', chunkedSents)
    pairs = relextract.tree2semi_rel(docTree)
    for sent, tree in pairs:
        # was a Python 2 `print ... % ...` statement (SyntaxError on py3)
        print('("...%s", %s)' % (' '.join([word for word, tag in sent][-5:]), tree))
    reldicts = relextract.semi_rel2reldict(pairs)
    # The original had a reldict print loop *after* this return — it was
    # unreachable dead code and has been removed.
    return reldicts
def createDoc(text):
    """Combine all NE-chunked sentences of *text* into one 'DOCUMENT' tree
    and return its relation dicts.

    NOTE(review): duplicate of the identical definition above; kept so the
    file's behavior (last definition wins) is unchanged.
    """
    chunkedSents = list()
    for sent in nltk.sent_tokenize(text):
        chunkedSents += [chunk for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))]
    docTree = nltk.Tree('DOCUMENT', chunkedSents)
    pairs = relextract.tree2semi_rel(docTree)
    for sent, tree in pairs:
        # was a Python 2 `print ... % ...` statement (SyntaxError on py3)
        print('("...%s", %s)' % (' '.join([word for word, tag in sent][-5:]), tree))
    reldicts = relextract.semi_rel2reldict(pairs)
    # Unreachable reldict print loop after this return removed (dead code).
    return reldicts

# NOTE(review): the experiments that followed this duplicate definition — an
# `improve(reldicts)` helper wrapped in a triple-quoted string, assorted
# debug prints, and a dangling `"""` that risked an unterminated string
# literal — were dead text; preserved below as comments only.
# def improve(reldicts):
#     for dicts in reldicts:
#         print(len(nltk.sent_tokenize(dicts['filler'])))
# improve(reldicts)
# print pairs[0]
# print pairs[1]
# print pairs[2]
# for sent,tree in pairs[0]:
#     print sent,tree
# print('("...%s", %s)' % (" ".join(sent[0][-5:]),tree))
# tree=nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize( nltk.sent_tokenize(text)[24] )))
# l=[chunk for chunk in tree]
# print "%s"%docTree
def main():
    """Print (subject, filler, object) relation triples for each line of
    ``wiki-abstracts-only.txt``, with POS tags stripped from each field.
    """
    pattern = 'NP: {<DT>?<JJ>*<NN>}'

    def strip_tags(text, tags):
        # Remove '/TAG' suffixes in order; longer tags (NNPS) must precede
        # their prefixes (NNP), matching the original chained replaces.
        for tag in tags:
            text = text.replace('/' + tag, '')
        return text

    # Hoisted out of the loop: the chunk grammar never changes.
    cp = nltk.RegexpParser(pattern)
    with open("wiki-abstracts-only.txt", "r") as fin:
        for line in fin:
            # Tokenize and POS-tag once, reuse for both parses.
            sent = nltk.pos_tag(nltk.word_tokenize(line))
            cs = cp.parse(sent)  # NP-chunk tree (kept for inspection)
            # was: nltk.ne_chunk(pos_tag(word_tokenize(line))) — unqualified
            # names (NameError unless separately imported); use nltk.* calls.
            ne_tree = nltk.ne_chunk(sent)
            pairs = relextract.tree2semi_rel(ne_tree)
            reldicts = relextract.semi_rel2reldict(pairs)
            for r in reldicts:
                sub = strip_tags(r['subjtext'], ['NNPS', 'NNP', 'JJ'])
                obj = strip_tags(r['objtext'], ['NNPS', 'NNP'])
                vb = strip_tags(r['filler'],
                                ['NNS', 'NNP', 'NN', 'CC', 'PRP$', 'DT', 'CD',
                                 'JJ', 'PRP', 'WP', 'IN', 'VBD', 'VBN'])
                # print result
                print('[' + sub + '] : [' + vb + '] : [' + obj + ']')
# NOTE(review): this span is an orphaned fragment duplicating part of the
# body of the menu-driven main() defined later in the file. Its leading
# loop belongs to an `elif num == "2"` branch whose header is not in this
# span, and the final extract_rels(...) call is truncated mid-argument-list,
# so the fragment is not independently parseable. Tokens left untouched.
        for word in line:
            meanings = net.synsets(word)
            if len(meanings) > 0:
                # Python 2 print statement (fragment predates py3 port)
                print meanings[0].definition()
    elif num == "3":
        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text
        from nltk.sem import relextract
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))
        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)
        # The function relextract() allows us to filter the reldicts
        # according to the classes of the subject and object named entities.
        # In addition, we can specify that the filler text has to match a given regular expression,
        # as illustrated in the next example. Here, we are looking for pairs of entities in the IN
        # relation, where IN has signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print fileid
            for doc in ieer.parsed_docs(fileid):
                # NOTE(review): call truncated in the source at this point.
                for rel in relextract.extract_rels('ORG', 'LOC', doc,
def main():
    """Interactive demo menu.

    Option "1": look up WordNet senses, hypernyms, and hyponyms for a word
    typed by the user. Option "2": print the first 100 SemCor sentences
    with a gloss for each word. Option "3": relation extraction over the
    IEER corpus (ORG-in-LOC, then PER-role-ORG patterns).

    All Python 2 `print` statements and `raw_input()` calls converted to
    Python 3 `print()` / `input()`.
    """
    print("user input(1) or semcor(2)?")
    num = input()
    if num == "1":
        # input
        print("enter word")
        word = input()
        for meaning in net.synsets(word):
            print("Sense: " + str(meaning))
            print(meaning.definition() + "\n")
            hypernyms = meaning.hypernyms()
            if len(hypernyms) > 0:
                print("\nHypernyms:")
                for meaning2 in hypernyms:
                    print(re.findall("'.*'", str(meaning2))[0])
            hyponyms = meaning.hyponyms()
            if len(hyponyms) > 0:
                print("\nHyponyms:")
                for meaning2 in hyponyms:
                    print(re.findall("'.*'", str(meaning2))[0])
            print("\n")
    elif num == "2":
        # semcor
        print("semcor")
        for line in semcor.sents()[0:100]:
            # Rebuild the sentence with a leading space per word, exactly as
            # the original string concatenation did.
            s = ""
            for word in line:
                s = s + " " + word
            print(s + "\n")
            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print(meanings[0].definition())
    elif num == "3":
        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text
        from nltk.sem import relextract
        pairs = relextract.tree2semi_rel(tree)
        for s, subtree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), subtree))
        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)
        # extract_rels filters the reldicts by subject/object NE class and a
        # filler regex: here, pairs of entities in the IN relation with
        # signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print(fileid)
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('ORG', 'LOC', doc,
                                                   corpus='ieer', pattern=IN):
                    print(relextract.rtuple(rel))
        # "X, role of/in/for (the) Y" — raw string now, so \s is a real
        # regex escape rather than an invalid string escape.
        roles = r"(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"
        ROLES = re.compile(roles, re.VERBOSE)
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('PER', 'ORG', doc,
                                                   corpus='ieer', pattern=ROLES):
                    print(relextract.rtuple(rel))
def createDoc(text):
    """Combine all NE-chunked sentences of *text* into one 'DOCUMENT' tree,
    print its relations, then print only PERSON->ORGANIZATION relations
    whose filler matches a role pattern ("X, <role> of the Y").

    All Python 2 print statements converted to print() calls.
    """
    chunkedSents = list()
    for sent in nltk.sent_tokenize(text):
        chunkedSents += [chunk for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))]
    docTree = nltk.Tree('DOCUMENT', chunkedSents)
    pairs = relextract.tree2semi_rel(docTree)
    for sent, tree in pairs:
        print('("...%s", %s)' % (' '.join([word for word, tag in sent][-5:]), tree))
    reldicts = relextract.semi_rel2reldict(pairs)
    for r in reldicts:
        print('=' * 30)
        print(r['subjclass'], ':', r['subjtext'])
        print(r['filler'])
        print(r['objclass'], ':', r['objtext'])
    # Match pattern in filler: "X, of (the) Y"
    roles = """
    (.*(
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*
    """
    ROLES = re.compile(roles, re.VERBOSE)
    # NOTE(review): the original also compiled an unused IN pattern here
    # (r'.*\bin\b(?!\b.+ing\b)'); removed as dead code.
    pattern = ROLES
    subjclass = 'PERSON'       # was: alternative 'ORGANIZATION'
    objclass = 'ORGANIZATION'  # was: alternative 'GPE'
    window = 10                # max filler length, in tokens
    relfilter = lambda x: (x['subjclass'] == subjclass and
                           len(x['filler'].split()) <= window and
                           pattern.match(x['filler']) and
                           x['objclass'] == objclass)
    for rel in list(filter(relfilter, reldicts)):
        print(relextract.rtuple(rel))


def improve(reldicts):
    """Print, for each reldict, how many sentences its filler spans."""
    for dicts in reldicts:
        print(len(nltk.sent_tokenize(dicts['filler'])))

# NOTE(review): the original called `improve(reldicts)` here at module
# level, but `reldicts` is undefined at module scope — that call raised
# NameError on import and has been disabled. The trailing debug prints and
# the dangling `"""` (unterminated string) are preserved as comments only.
# improve(reldicts)
# print pairs[0]
# print pairs[1]
# print pairs[2]
# for sent,tree in pairs[0]:
#     print sent,tree
# print('("...%s", %s)' % (" ".join(sent[0][-5:]),tree))
# tree=nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize( nltk.sent_tokenize(text)[24] )))
# l=[chunk for chunk in tree]
# print "%s"%docTree