Example #1
import re

import nltk
from nltk import pos_tag, word_tokenize
from nltk.sem import relextract


def main():
    pattern = 'NP: {<DT>?<JJ>*<NN>}'
    #IN = re.compile(r'.*\bin\b(?!\b.+ing)')

    with open("wiki-abstracts-only.txt", "r") as fin:
        for line in fin:
            sent = nltk.word_tokenize(line)
            #augment with POS tags
            sent = nltk.pos_tag(sent)
            cp = nltk.RegexpParser(pattern)
            cs = cp.parse(sent)
            #print(cs)
            ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(line)))
            #print(ne_tree)
            #for rel in nltk.sem.extract_rels('ORG', 'LOC', line, corpus='ieer', pattern = IN):
            #    print(nltk.sem.rtuple(rel))
            pairs = relextract.tree2semi_rel(ne_tree)
            reldicts = relextract.semi_rel2reldict(pairs)
            #print(len(reldicts))
            for r in reldicts:
                #print('[' + r['subjtext'] + '] : [' + r['filler'] + '] : [' + r['objtext'] + ']')
                # remove POS tags
                sub = r['subjtext']
                for tag in ('/NNPS', '/NNP', '/JJ'):
                    sub = sub.replace(tag, '')
                obj = r['objtext'].replace('/NNPS', '').replace('/NNP', '')
                vb = r['filler']
                for tag in ('/NNS', '/NNP', '/NN', '/CC', '/PRP$', '/DT', '/CD',
                            '/JJ', '/PRP', '/WP', '/IN', '/VBD', '/VBN'):
                    vb = vb.replace(tag, '')
                print('[' + sub + '] : [' + vb + '] : [' + obj + ']')
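Snippets like this one assume the standard NLTK tokenizer, tagger and chunker models are already installed; a minimal one-time setup sketch (resource ids as shipped with recent NLTK releases):

import nltk

# One-time downloads used by word_tokenize, pos_tag and ne_chunk.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')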
Example #2
import nltk
from nltk.sem import relextract


def extract_entities(text):
	# NE-chunk a single sentence (here the 25th) of the input text
	tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(nltk.sent_tokenize(text)[24])))
	pairs = relextract.tree2semi_rel(tree)
	for sent, tree in pairs:
		# show the last five words preceding each named entity
		print('("...%s", %s)' % (' '.join(word for word, tag in sent[-5:]), tree))

	reldicts = relextract.semi_rel2reldict(pairs)

	for r in reldicts:
		print('=' * 30)
		print(r['subjclass'], ':', r['subjtext'])
		print(r['filler'])
		print(r['objclass'], ':', r['objtext'])
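To see what tree2semi_rel() actually returns without chunking a whole document, a hand-built chunk tree is enough; the sentence and entities below are invented for illustration:

from nltk import Tree
from nltk.sem import relextract

# "Mark Smith works for Acme in Boston" with hand-made NE chunks
sent = Tree('S', [
    Tree('PERSON', [('Mark', 'NNP'), ('Smith', 'NNP')]),
    ('works', 'VBZ'), ('for', 'IN'),
    Tree('ORGANIZATION', [('Acme', 'NNP')]),
    ('in', 'IN'),
    Tree('GPE', [('Boston', 'NNP')]),
])
# Each pair is (words preceding the entity, entity subtree).
for words, entity in relextract.tree2semi_rel(sent):
    print(words, '->', entity.label())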
Example #3
import re

import nltk
from nltk.sem import relextract


def relationExtraction(chunk_tree):
    IN = re.compile(r'.*\bin\b.*')
    # chunk_tree.headline = ['S']  # no headline here: the article was split
    #                              # into sentences, each one its own tree
    # chunk_tree.text = chunk_tree  # API change: .text is read as the input content
    relations = []
    # conllStr = chunk.tree2conlltags(chunk_tree)
    # chunk_tree = chunk.conlltags2tree(conllStr)

    # tree2semi_rel yields pairs: a word list with no named entity,
    # followed by a subtree that contains one
    pairs = relextract.tree2semi_rel(chunk_tree)
    fix_pairs = []

    for word_list, tree in pairs:
        fix_pairs.append(tree)
    reldicts = relextract.semi_rel2reldict(pairs)

    # print each relation dict
    for reldict in reldicts:
        print('\n')
        for k, v in sorted(reldict.items()):
            print(k, '=>', v)

    # org_place_relations = relextract.extract_rels(
    #     'ORG', 'GPE', chunk_tree, corpus='conll2002', pattern=IN)

    # per_place_relations = relextract.extract_rels(
    #     'PER', 'GPE', chunk_tree, corpus='conll2002', pattern=IN)

    # condition = False
    # if fix_pairs.__contains__('PERSON') and fix_pairs.__contains__('ORGANIZATION'):
    #     condition = True

    # has_per = False
    # has_org = False
    # for tree in fix_pairs:
    #     if tree.label() == 'PERSON':
    #         has_per = True
    #     if tree.label() == 'ORGANIZATION':
    #         has_org = True

    # for relation in nltk.sem.extract_rels('PERSON', 'ORGANIZATION', chunk_tree, corpus='ace', pattern=PR):
    #     print(relation)
    #     relations.append(nltk.sem.rtuple(relation))

    return reldicts
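For reference, each dict returned by semi_rel2reldict() carries ten string-valued keys (lcon, subjclass, subjtext, subjsym, filler, untagged_filler, objclass, objtext, objsym, rcon). A quick way to inspect them on a toy tree; note that in current NLTK the function only emits a reldict once the tree contains at least three chunked entities:

from nltk import Tree
from nltk.sem import relextract

sent = Tree('S', [
    Tree('PERSON', [('Mark', 'NNP'), ('Smith', 'NNP')]),
    ('works', 'VBZ'), ('for', 'IN'),
    Tree('ORGANIZATION', [('Acme', 'NNP')]),
    ('in', 'IN'),
    Tree('GPE', [('Boston', 'NNP')]),
])
for reldict in relextract.semi_rel2reldict(relextract.tree2semi_rel(sent)):
    for k, v in sorted(reldict.items()):
        print(k, '=>', v)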
Example #4
import nltk
from nltk.sem import relextract


def extractSampleRels(sample):
	#with open('toyset', 'r') as f:
	#    sample = f.read().decode('utf-8')

	sentences = nltk.sent_tokenize(sample)
	tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
	tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]

	entitiesMap = []

	for i, sent in enumerate(tagged_sentences):
		sent = nltk.ne_chunk(sent) # ne_chunk method expects one tagged sentence
		pairs = relextract.tree2semi_rel(sent)
		reldicts = relextract.semi_rel2reldict(pairs)
		for r in reldicts:
			entitiesMap.append((r['subjtext'],r['filler'],r['objtext']))

	return entitiesMap
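A quick driver for the function above; the sample text is invented, and sentences with fewer than three recognized entities contribute nothing (see the note after Example #3):

if __name__ == '__main__':
	sample = 'Mark Smith works for Acme Corp. in Boston. Acme Corp. was founded by Jane Doe in Chicago.'
	for subj, filler, obj in extractSampleRels(sample):
		print(subj, '|', filler, '|', obj)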
Example #5
import logging
import re

from nltk.sem import relextract


def such_as_np(s, np_sent):
    '''
    Given an NP-chunked sentence, try to extract concepts from an
    "X such as Y1, Y2, ..." pattern.

    Returns (X, Y): X is the set of general concepts, Y the set of
    sub-concepts listed after "such as".
    '''
    X = set()
    Y = set()
    if re.findall(r'\bsuch\b\s\bas\b', s):
        # extract the "such as" pattern
        # logging.info(s)

        semi_pairs = relextract.tree2semi_rel(np_sent)
        reldicts = relextract.semi_rel2reldict(semi_pairs)
        # find the first such as
        logging.info(np_sent)
        # pprint(semi_pairs)
        # pprint(reldicts)
        # logging.info(len(reldicts))
        if len(reldicts) > 0:
            try:
                while 'such as' not in reldicts[0]['untagged_filler']:
                    reldicts.pop(0)

                X.add(reldicts[0]['subjsym'])
                Y.add(reldicts[0]['objsym'])

                reldicts.pop(0)

                # find the sub concept
                for reldict in reldicts:
                    if reldict['untagged_filler'] not in [',', 'and', 'or']:
                        Y.add(reldict['subjsym'])
                        break
                    Y.add(reldict['subjsym'])
                    Y.add(reldict['objsym'])
            except Exception as e:
                logging.error(e)
                logging.error(reldicts)
                logging.error('Original sentence: '+s)
        # stop()  # debugging pause; helper not defined in this snippet
    return (X, Y)
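A minimal driver, assuming an NP grammar like the one used elsewhere on this page (extended to plural nouns); the sentence is invented:

import nltk

s = 'He studies insects such as ants, bees and wasps.'
tagged = nltk.pos_tag(nltk.word_tokenize(s))
np_sent = nltk.RegexpParser('NP: {<DT>?<JJ>*<NN.*>+}').parse(tagged)
X, Y = such_as_np(s, np_sent)
print('X =', X, 'Y =', Y)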
Example #6
import nltk
from nltk.sem import relextract


def createDoc(text):  # Create a DOCUMENT tree by combining all the chunked sentences.
	chunkedSents = list()
	for sent in nltk.sent_tokenize(text):
		chunkedSents += [chunk for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))]
	docTree = nltk.Tree('DOCUMENT', chunkedSents)
	pairs = relextract.tree2semi_rel(docTree)

	for sent, tree in pairs:
		# show the last five words preceding each named entity
		print('("...%s", %s)' % (' '.join([word for word, tag in sent][-5:]), tree))

	reldicts = relextract.semi_rel2reldict(pairs)

	for r in reldicts:
		print('=' * 30)
		print(r['subjclass'], ':', r['subjtext'])
		print(r['filler'])
		print(r['objclass'], ':', r['objtext'])

	return reldicts
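Because every sentence's chunks are glued into one flat DOCUMENT tree, semi_rel2reldict() can pair an entity at the end of one sentence with an entity at the start of the next, so fillers may span sentence boundaries. Quick usage with invented text:

reldicts = createDoc('Jane Doe chairs Acme Corp. Acme Corp. is based in Boston.')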
Example #7
import re

from nltk.corpus import ieer, semcor
from nltk.corpus import wordnet as net
from nltk.sem import relextract


def main():
    print("user input(1), semcor(2) or ieer(3)?")

    num = input()

    if num == "1":
        # look up a word typed by the user
        print("enter word")
        word = input()
        for meaning in net.synsets(word):
            #print("Sense: " + re.findall("'.*'", str(meaning))[0])
            print("Sense: " + str(meaning))
            print(meaning.definition() + "\n")
            hypernyms = meaning.hypernyms()
            if len(hypernyms) > 0:
                print("\nHypernyms:")
                for meaning2 in hypernyms:
                    print(re.findall("'.*'", str(meaning2))[0])

            hyponyms = meaning.hyponyms()
            if len(hyponyms) > 0:
                print("\nHyponyms:")
                for meaning2 in hyponyms:
                    print(re.findall("'.*'", str(meaning2))[0])

    #		print("\nHypernym Tree:")
    #		print(gethypernymtree(meaning))
            print("\n")

    #		dog = wn.synset('dog.n.01')
    #		hypo = lambda s: s.hyponyms()
    #		hyper = lambda s: s.hypernyms()
    #		list(dog.closure(hyper, depth=1)) == dog.hypernyms()  # True

    elif (num == "2"):
        #semcor
        print "semcor"

        for line in semcor.sents()[0:100]:
            s = ""
            for word in line:
                s = s + " " + word
            print s + "\n"

            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print meanings[0].definition()
    elif num == "3":

        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text

        from nltk.sem import relextract
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))

        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)

    #	The function extract_rels() allows us to filter the reldicts
    #	according to the classes of the subject and object named entities.
    #	In addition, we can specify that the filler text has to match a given
    #	regular expression, as illustrated in the next example. Here, we are
    #	looking for pairs of entities in the IN relation, where IN has
    #	signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print(fileid)
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('ORG',
                                                   'LOC',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=IN):
                    print(relextract.rtuple(rel))

        roles = "(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"

        ROLES = re.compile(roles, re.VERBOSE)
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('PER',
                                                   'ORG',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=ROLES):
                    print(relextract.rtuple(rel))
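The menu options assume the corresponding corpora are installed; a setup-and-run sketch using the standard NLTK resource ids:

import nltk

nltk.download('wordnet')   # option 1
nltk.download('semcor')    # option 2
nltk.download('ieer')      # option 3

if __name__ == '__main__':
    main()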
Example #8
import re

import nltk
from nltk.sem import relextract


def createDoc(text):  # Create a DOCUMENT tree by combining all the chunked sentences.
	chunkedSents = list()
	for sent in nltk.sent_tokenize(text):
		chunkedSents += [chunk for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))]
	docTree = nltk.Tree('DOCUMENT', chunkedSents)
	pairs = relextract.tree2semi_rel(docTree)

	for sent, tree in pairs:
		# show the last five words preceding each named entity
		print('("...%s", %s)' % (' '.join([word for word, tag in sent][-5:]), tree))

	reldicts = relextract.semi_rel2reldict(pairs)

	for r in reldicts:
		print('=' * 30)
		print(r['subjclass'], ':', r['subjtext'])
		print(r['filler'])
		print(r['objclass'], ':', r['objtext'])

	# Match pattern in filler
	roles = """
			(.*(
			analyst|
			chair(wo)?man|
			commissioner|
			counsel|
			director|
			economist|
			editor|
			executive|
			foreman|
			governor|
			head|
			lawyer|
			leader|
			librarian).*)|
			manager|
			partner|
			president|
			producer|
			professor|
			researcher|
			spokes(wo)?man|
			writer|
			,\sof\sthe?\s* 
			"""# "X, of (the) Y"
	ROLES = re.compile(roles, re.VERBOSE)
	IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
	pattern = ROLES
	subjclass = 'PERSON'        # or 'ORGANIZATION'
	objclass = 'ORGANIZATION'   # or 'GPE'
	window = 10
	relfilter = lambda x: (x['subjclass'] == subjclass and
	                       len(x['filler'].split()) <= window and
	                       pattern.match(x['filler']) and
	                       x['objclass'] == objclass)

	for rel in filter(relfilter, reldicts):
		print(relextract.rtuple(rel))

	def improve(reldicts):
		# how many sentences does each filler span?
		for dicts in reldicts:
			print(len(nltk.sent_tokenize(dicts['filler'])))
	improve(reldicts)
