Example #1
def extract_relations(s, chunk_type = 'np'):
    '''
    Given a sentence, extract the list of relations.

    chunk_type: which kind of chunks to use, 'np' (noun phrases) | 'ne' (named entities)
    '''
    s = clean_sent(s)
    tokens = word_tokenize(s)
    # add an NP chunk to work around the NLTK relation-extraction bug
    pos_sent = pos_tag(tokens)
    pos_sent = change_pos(pos_sent)

    cp = nltk.RegexpParser(NP_PATTERN)
    np_sent = cp.parse(pos_sent)
    # pprint(np_sent)

    nps = TextBlob(s).noun_phrases
    np_chunk = _build_tree_from_nps(tokens, nps)
    # pprint(nps)

    if chunk_type=='np':
        pairs = relextract.tree2semi_rel(np_sent)
        # pprint(len(pairs))
    elif chunk_type == 'ne':
        pairs = relextract.tree2semi_rel(nltk.ne_chunk(pos_sent))

    rel_dicts = pair2rel(pairs)
    # pprint(rel_dicts)
    # stop()
    return rel_dicts
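Example #1 depends on several project-specific helpers (clean_sent, change_pos, pair2rel, NP_PATTERN, _build_tree_from_nps). A minimal, self-contained sketch of the same tree2semi_rel step, with an assumed NP grammar and example sentence:

import nltk
from nltk import word_tokenize, pos_tag
from nltk.sem import relextract

# assumed NP grammar and sentence, for illustration only
NP_PATTERN = 'NP: {<DT>?<JJ>*<NN.*>+}'
sentence = "The merger created a large company in Boston last year ."

np_sent = nltk.RegexpParser(NP_PATTERN).parse(pos_tag(word_tokenize(sentence)))

# tree2semi_rel yields (filler_tokens, chunk_subtree) pairs
for filler, chunk in relextract.tree2semi_rel(np_sent):
    print(filler, '->', chunk)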
Example #2
def extract_rels(subjclass,
                 objclass,
                 doc,
                 corpus='ace',
                 pattern=None,
                 window=10):
    """
    Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.
    The parameters ``subjclass`` and ``objclass`` can be used to restrict the
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').
    :param subjclass: the class of the subject Named Entity.
    :type subjclass: str
    :param objclass: the class of the object Named Entity.
    :type objclass: str
    :param doc: input document
    :type doc: ieer document or a list of chunk trees
    :param corpus: name of the corpus to take as input; possible values are
        'ace', 'conll2002' and 'ieer'
    :type corpus: str
    :param pattern: a regular expression for filtering the fillers of
        retrieved triples.
    :type pattern: SRE_Pattern
    :param window: filters out fillers which exceed this threshold
    :type window: int
    :return: see ``semi_rel2reldict``
    :rtype: list(defaultdict)
    """

    if subjclass and subjclass not in NE_CLASSES[corpus]:
        if _expand(subjclass) in NE_CLASSES[corpus]:
            subjclass = _expand(subjclass)
        else:
            raise ValueError(
                "your value for the subject type has not been recognized: %s" %
                subjclass)
    if objclass and objclass not in NE_CLASSES[corpus]:
        if _expand(objclass) in NE_CLASSES[corpus]:
            objclass = _expand(objclass)
        else:
            raise ValueError(
                "your value for the object type has not been recognized: %s" %
                objclass)

    if corpus == 'ace' or corpus == 'conll2002':
        pairs = tree2semi_rel(doc)
    elif corpus == 'ieer':
        pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
    else:
        raise ValueError("corpus type not recognized")

    reldicts = semi_rel2reldict(pairs)

    relfilter = lambda x: (x['subjclass'] == subjclass and
                           len(x['filler'].split()) <= window and
                           pattern.match(x['filler']) and
                           x['objclass'] == objclass)

    return list(filter(relfilter, reldicts))
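A short usage sketch for extract_rels over the IEER corpus, assuming the 'ieer' corpus data has been downloaded; it mirrors the call that appears again in Example #17 below:

import re
import nltk
from nltk.corpus import ieer
from nltk.sem import relextract

# nltk.download('ieer')  # required once, to fetch the corpus data

# fillers such as "... based in ...", excluding gerunds like "working in"
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')

for fileid in ieer.fileids():
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('ORG', 'LOC', doc,
                                           corpus='ieer', pattern=IN):
            print(relextract.rtuple(rel))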
Example #3
def detectNERViaTrainedAlgo(sentence):
    dataDirectoryLocation = "data_nlp_ner"
    reader = readLoadTrainData(dataDirectoryLocation)
    loadedData = list(reader)
    training_samples = loadedData[:int(len(loadedData) * 0.8)]
    test_samples = loadedData[int(len(loadedData) * 0.8):]
    print("#Numebr of training samples are = %s" % len(training_samples))
    print("#Numebr of test samples are = %s" % len(test_samples))
    # I can incread number of traning sample this will imrove result but need more time
    classifier = NERClassifier(training_samples[:100])
    classifieddataTree, classifieddataList = classifier.parseTheData(
        pos_tag(word_tokenize(sentence)))
    listOfNamedEntities = []
    properNoun = "NNP"
    noun = "NN"
    properNounPlural = "NNPS"
    nounPlural = "NNS"
    for x in classifieddataList:
        if properNoun in x:
            listOfNamedEntities.append((x, properNoun))
        if noun in x:
            listOfNamedEntities.append((x, noun))
        if properNounPlural in x:
            listOfNamedEntities.append((x, properNounPlural))
        if nounPlural in x:
            listOfNamedEntities.append((x, nounPlural))
    print("*****---------- NER detected by Learned Annotator")
    print(listOfNamedEntities)

    print(
        "**** ---below is the output from code to extract realtion between entities by library--****"
    )
    relationResult = relextract.tree2semi_rel(classifieddataTree)
    for s, tree in relationResult:
        print(s + " has something to do with:  " + tree)
Example #4
def reasoning(dList):
	reasonList = []
	tokenizer = TweetTokenizer()
	for tweet in dList:
		print tweet
		# tokenize
		words = tokenizer.tokenize(tweet)
		# get POS tag
		pos_tokens = pos_tag(words)
		# get name entities
		tree = ne_chunk(pos_tokens, binary = False)
		# find relations
		pairs = relextract.tree2semi_rel(tree)
		# get interesting name entities
		reason = []
		for s, tree in pairs:
			reasonStr = ("%s") % tree
			reasonStr = reasonStr.split(" ")
			label = reasonStr[0].replace("(","").strip()
			content = ""
			for wordTag in reasonStr[1:]:
				sp = wordTag.split("/")
				word = sp[0].replace("(","")
				print word
				# content.append(word)
				content += (word + " ")
			# reason: [(label, content)]
			reason.append({"label": label, "content": content})
		# reasonList [reason]
		if len(reason) > 0:
			reasonList.append({"reason": reason})
		print str(len(reasonList)) + "/" + str(len(dList))
	return reasonList
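The loop above recovers labels and words by splitting the string form of each chunk on "/"; nltk.Tree already exposes label() and leaves(). A rough equivalent written against that API (a sketch, not the original code):

from nltk import ne_chunk, pos_tag
from nltk.sem import relextract
from nltk.tokenize import TweetTokenizer

def reasoning_via_tree_api(dList):
    # same output shape as reasoning(): [{"reason": [{"label": ..., "content": ...}, ...]}, ...]
    tokenizer = TweetTokenizer()
    reasonList = []
    for tweet in dList:
        tree = ne_chunk(pos_tag(tokenizer.tokenize(tweet)), binary=False)
        reason = [{"label": chunk.label(),
                   "content": " ".join(word for word, tag in chunk.leaves())}
                  for _, chunk in relextract.tree2semi_rel(tree)]
        if reason:
            reasonList.append({"reason": reason})
    return reasonList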
Example #5
def main():
    pattern = 'NP: {<DT>?<JJ>*<NN>}'
    #IN = re.compile(r'.*\bin\b(?!\b.+ing)')

    with open("wiki-abstracts-only.txt", "r") as fin:
        for line in fin:
            sent = nltk.word_tokenize(line)
            #augment with POS tags
            sent = nltk.pos_tag(sent)
            cp = nltk.RegexpParser(pattern)
            cs = cp.parse(sent)
            #print(cs)
            ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(line)))
            #print(ne_tree)
            #for rel in nltk.sem.extract_rels('ORG', 'LOC', line, corpus='ieer', pattern = IN):
            #    print(nltk.sem.rtuple(rel))
            pairs = relextract.tree2semi_rel(ne_tree)
            reldicts = relextract.semi_rel2reldict(pairs)
            #print(len(reldicts))
            for r in reldicts:
                #print('[' + r['subjtext'] + '] : [' + r['filler'] + '] : [' + r['objtext'] + ']')
                # remove POS tags
                sub = r['subjtext'].replace('/NNPS',
                                            '').replace('/NNP',
                                                        '').replace('/JJ', '')
                obj = r['objtext'].replace('/NNPS', '').replace('/NNP', '')
                vb = r['filler'].replace('/NNS','').replace('/NNP','').replace('/NN','').replace('/CC','').\
                replace('/PRP$','').replace('/DT','').replace('/CD','').replace('/JJ','').replace('/PRP','').\
                replace('/WP','').replace('/IN',"").replace('/VBD','').replace('/VBN','')
                print('[' + sub + '] : [' + vb + '] : [' + obj + ']')
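The chained .replace() calls above only strip a fixed set of tags; a single regular expression removes any '/TAG' suffix. A small helper sketch (the tag pattern, uppercase letters plus '$' as in PRP$, is an assumption):

import re

def strip_pos_tags(tagged_text):
    # 'Barack/NNP Obama/NNP moved/VBD to/TO Washington/NNP' -> 'Barack Obama moved to Washington'
    return re.sub(r'/[A-Z$]+', '', tagged_text)

# e.g.
# sub = strip_pos_tags(r['subjtext'])
# vb  = strip_pos_tags(r['filler'])
# obj = strip_pos_tags(r['objtext'])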
Example #6
def extract_entities(text):
	tree=nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize( nltk.sent_tokenize(text)[24] )))
	pairs=relextract.tree2semi_rel(tree)
	for sent,tree in pairs:	
		 print('("...%s", %s)' % (" ".join(sent[0][-5:]),tree))

	reldicts = relextract.semi_rel2reldict(pairs)

	for r in reldicts:
		print('=' * 30)
		print(r['subjclass'], ':', r['subjtext'])
		print(r['filler'])
		print(r['objclass'], ':', r['objtext'])
Example #7
def relationExtraction(self, subjectClass, objectClass, pattern):
    window = 5
    relations = []
    relfilter = lambda x: (x['subjclass'] == subjectClass and
                           len(x['filler'].split()) <= window and
                           pattern.match(x['filler']) and
                           x['objclass'] == objectClass)
    for sent in self.tagged_sentences:
        chunked = nltk.ne_chunk(sent)
        reldicts = self.semi_rel2reldict(tree2semi_rel(chunked))
        rels = list(filter(relfilter, reldicts))
        for rel in rels:
            relations.append(nltk.sem.relextract.rtuple(rel))
    return relations
Example #8
def relationExtraction(chunk_tree):
    IN = re.compile(r'.*\b in \b.*')
    # chunk_tree.headline = ['S']  # nothing at the head of the grammar, because the article is split into sentences and each sentence forms its own tree
    # chunk_tree.text = chunk_tree  # API change: .text is recognized as the content to read in
    relations = []
    # conllStr = chunk.tree2conlltags(chunk_tree)
    # chunk_tree = chunk.conlltags2tree(conllStr)

    # get pairs of two different parts: a word group without a named entity, and one that has one
    pairs = relextract.tree2semi_rel(chunk_tree)
    fix_pairs = []

    for word_list, tree in pairs:
        fix_pairs.append(tree)
    reldicts = relextract.semi_rel2reldict(pairs)

    # print the relation table
    for reldict in reldicts:
        print('\n')
        for k, v in sorted(reldict.items()):
            print(k, '=>', v)  # doctest: +ELLIPSIS

    # org_place_relations = relextract.extract_rels(
    #     'ORG', 'GPE', chunk_tree, corpus='conll2002', pattern=IN)

    # per_place_relations = relextract.extract_rels(
    #     'PER', 'GPE', chunk_tree, corpus='conll2002', pattern=IN)

    # condition = False
    # if fix_pairs.__contains__('PERSON') and fix_pairs.__contains__('ORGANIZATION'):
    #     condition = True

    # has_per = False
    # has_org = False
    # for tree in fix_pairs:
    #     if getattr(tree,'label') is 'PERSON':
    #         has_per = True
    #     if getattr(tree, 'label') is 'ORGANIZATION':
    #         has_org = False

    # for relation in nltk.sem.extract_rels('PERSON', 'ORGANIZATION', chunk_tree, corpus='ace', pattern=PR):
    #     print(relation)
    #     relations.append(nltk.sem.rtuple(relation))

    return reldicts
Example #9
def extractSampleRels(sample):
	#with open('toyset', 'r') as f:
	#    sample = f.read().decode('utf-8')

	sentences = nltk.sent_tokenize(sample)
	tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
	tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]

	entitiesMap = []

	for i, sent in enumerate(tagged_sentences):
		sent = nltk.ne_chunk(sent) # ne_chunk method expects one tagged sentence
		pairs = relextract.tree2semi_rel(sent)
		reldicts = relextract.semi_rel2reldict(pairs)
		for r in reldicts:
			entitiesMap.append((r['subjtext'],r['filler'],r['objtext']))

	return entitiesMap	
Example #10
def such_as_np(s, np_sent):
    '''
    Given an NP-chunked sentence, try to extract the concepts.

    X -- set of concepts appearing before 'such as'
    Y -- set of concepts listed after 'such as'
    '''
    X = set()
    Y = set()
    if re.findall(r'\bsuch\b\s\bas\b', s):
        # extract the such as pattern
        # logging.info(s)

        semi_pairs = relextract.tree2semi_rel(np_sent)
        reldicts = relextract.semi_rel2reldict(semi_pairs)
        # find the first such as
        logging.info(np_sent)
        # pprint(semi_pairs)
        # pprint(reldicts)
        # logging.info(len(reldicts))
        if len(reldicts) > 0:
            try:
                while 'such as' not in reldicts[0]['untagged_filler']:
                    reldicts.pop(0)


                X.add(reldicts[0]['subjsym'])
                Y.add(reldicts[0]['objsym'])

                reldicts.pop(0)

                # find the sub concept
                for reldict in reldicts:
                    if reldict['untagged_filler'] not in [',', 'and', 'or']:
                        Y.add(reldict['subjsym'])
                        break
                    Y.add(reldict['subjsym'])
                    Y.add(reldict['objsym'])
            except Exception as e:
                logging.error(e)
                logging.error(reldicts)
                logging.error('Original sentence: '+s)
        stop()
    return (X, Y)
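such_as_np() relies on the 'subjsym', 'untagged_filler' and 'objsym' fields produced by semi_rel2reldict. A quick, self-contained look at those fields on a toy 'such as' sentence (the NP grammar and the sentence are illustrative assumptions, and note that semi_rel2reldict only emits a reldict when there are more than two chunk pairs):

import nltk
from nltk import word_tokenize, pos_tag
from nltk.sem import relextract

s = "We study animals such as dogs , cats and horses ."
# exact POS tags depend on the tagger, so the chunking below is illustrative
np_sent = nltk.RegexpParser('NP: {<NN.*>+}').parse(pos_tag(word_tokenize(s)))

for d in relextract.semi_rel2reldict(relextract.tree2semi_rel(np_sent)):
    print(d['subjsym'], '|', d['untagged_filler'], '|', d['objsym'])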
Example #11
def createDoc(text):#To create a DOCUMENT by combining all the chunked sentences.
	chunkedSents=list()
	for sent in nltk.sent_tokenize(text):
		chunkedSents+=[chunk for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))]
	docTree=nltk.Tree('DOCUMENT',chunkedSents)
	pairs=relextract.tree2semi_rel(docTree)

	for sent,tree in pairs:
		print '("...%s", %s)' % (' '.join([word for word,tag in sent][-5:]),tree) # To print
	
	reldicts = relextract.semi_rel2reldict(pairs)
	
	return reldicts

	for r in reldicts:
		print('=' * 30)
		print(r['subjclass'], ':', r['subjtext'])
		print(r['filler'])
		print(r['objclass'], ':', r['objtext'])
Example #12
def getGrammarRelations(named_entities, tree):
    pairs = relextract.tree2semi_rel(tree)
    reldicts = rel2dict(pairs)

    rules = [
        # TYPE1, TYPE2, [keywords], ONTOLOGY, ISMIRRORED
        ('ORGANIZATION', 'PERSON', ['work'], 'employer', False),
        ('PERSON', 'PERSON', ['sister'], 'sibling', True),
    ]

    porter = PorterStemmer()

    relations = {}

    for rels in reldicts:
        dbotype1 = [ner[1] for ner in named_entities if ner[0] == rels[0]][0]
        dbotype2 = [ner[1] for ner in named_entities if ner[0] == rels[2]][0]
        space = [porter.stem(word[0]) for word in rels[1]]

        proposals = [(rule[2], rule[3], rule[4]) for rule in rules
                     if rule[0] == dbotype1 and rule[1] == dbotype2]

        for (words, newtype, mirror) in proposals:
            isIn = bool(list(set(space) & set(words)))

            if isIn:
                if rels[0] not in relations:
                    relations[rels[0]] = []

                relations[rels[0]].append(
                    (rels[2], ["http://dbpedia.org/ontology/" + newtype]))

                if mirror:
                    if rels[2] not in relations:
                        relations[rels[2]] = []

                    relations[rels[2]].append(
                        (rels[0], ["http://dbpedia.org/ontology/" + newtype]))

    return relations
Example #13
def createDoc(text):#To create a DOCUMENT by combining all the chunked sentences.
	chunkedSents=list()
	for sent in nltk.sent_tokenize(text):
		chunkedSents+=[chunk for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))]
	docTree=nltk.Tree('DOCUMENT',chunkedSents)
	pairs=relextract.tree2semi_rel(docTree)

	for sent,tree in pairs:
		print '("...%s", %s)' % (' '.join([word for word,tag in sent][-5:]),tree) # To print
	
	reldicts = relextract.semi_rel2reldict(pairs)
	
	return reldicts

	for r in reldicts:
		print('=' * 30)
		print(r['subjclass'], ':', r['subjtext'])
		print(r['filler'])
		print(r['objclass'], ':', r['objtext'])

	

	"""def improve(reldicts):
		for dicts in reldicts:
			print len(nltk.sent_tokenize(dicts['filler']))
	improve(reldicts)
	"""
	#print pairs[0]
	#print pairs[1]
	#print pairs[2]
	#for sent,tree in pairs[0]:
	#	print sent,tree 
		#print('("...%s", %s)' % (" ".join(sent[0][-5:]),tree))
	#tree=nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize( nltk.sent_tokenize(text)[24] )))
	#l=[chunk for chunk in tree]
	#print "%s"%docTree
	"""	
Example #14
def main():
    pattern = 'NP: {<DT>?<JJ>*<NN>}'

    with open("wiki-abstracts-only.txt", "r") as fin:
        for line in fin:
            sent = nltk.word_tokenize(line)
            #augment with POS tags
            sent = nltk.pos_tag(sent)
            cp = nltk.RegexpParser(pattern)
            cs = cp.parse(sent)
            ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(line)))
            pairs = relextract.tree2semi_rel(ne_tree)
            reldicts = relextract.semi_rel2reldict(pairs)
            for r in reldicts:
                # remove POS tags
                sub = r['subjtext'].replace('/NNPS',
                                            '').replace('/NNP',
                                                        '').replace('/JJ', '')
                obj = r['objtext'].replace('/NNPS', '').replace('/NNP', '')
                vb = r['filler'].replace('/NNS','').replace('/NNP','').replace('/NN','').replace('/CC','').\
                replace('/PRP$','').replace('/DT','').replace('/CD','').replace('/JJ','').replace('/PRP','').\
                replace('/WP','').replace('/IN',"").replace('/VBD','').replace('/VBN','')
                # print result
                print('[' + sub + '] : [' + vb + '] : [' + obj + ']')
Example #15
        s = ""
        for word in line:
            s = s + " " + word
        print s + "\n"

        for word in line:
            meanings = net.synsets(word)
            if len(meanings) > 0:
                print meanings[0].definition()
elif num == "3":

    docs = ieer.parsed_docs('APW_19980424')
    tree = docs[1].text

    from nltk.sem import relextract
    pairs = relextract.tree2semi_rel(tree)
    for s, tree in pairs[18:22]:
        print('("...%s", %s)' % (" ".join(s[-5:]), tree))

    reldicts = relextract.semi_rel2reldict(pairs)
    for k, v in sorted(reldicts[0].items()):
        print(k, '=>', v)

#	The function extract_rels() allows us to filter the reldicts
#	according to the classes of the subject and object named entities.
#	In addition, we can specify that the filler text has to match a given regular expression,
#	 as illustrated in the next example. Here, we are looking for pairs of entities in the IN
#	relation, where IN has signature <ORG, LOC>.
    IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
    for fileid in ieer.fileids():
        print fileid
Example #16
def detectDisplayNamedEntities(sentence):
    ############## if we use the built-in POS tagger the result would be better ######
    # I have trained my own POS tagger as well, but its accuracy is around 80 to 90%; I could use that as well
    tokens = nltk.word_tokenize(sentence)
    resultList2 = list(nltk.pos_tag(tokens))
    print(resultList2)
    #grammar = "NP: {<DT>?<JJ>*<NN>}"  # for the desired result we can update the grammar
    grammar2 = r"""
      NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
          {<NNP>+}                # chunk sequences of proper nouns
          {<NNS>+}                # chunk sequences of Noun plural
          {<NNPS>+}                # chunk sequences of Proper noun, plural
          {<LS>+}                # chunk sequences of List item marker
    """

    cp = nltk.RegexpParser(grammar2)
    nounPhraseTree = cp.parse(resultList2)
    print(nounPhraseTree)

    print(
        "**** -below is the output from code to extract realtion between entities by library--****"
    )
    relationResult = relextract.tree2semi_rel(nounPhraseTree)
    for s, tree in relationResult:
        print(str(s) + " has something to do with:  " + str(tree))

    # uncomment the line below when you want to see the tree structure of the tags as well
    #nounPhraseTree.draw()

    nounList = []
    for node in nounPhraseTree:
        if isinstance(node, nltk.tree.Tree):
            if node.label() == 'NP':
                NP = node.leaves()
                print(NP)
                for x in NP:
                    if x[1] == 'NN' or x[1] == 'NNP' or x[1] == 'NNPS' or x[
                            1] == 'NNS':
                        nounList.append(x[0])

    print(
        "*****-----------------------------------------------------------*****"
    )
    print("list of all nouns detected in the text is result as below:")
    print(nounList)
    dictionary = {}
    dictionary['countries'] = []

    #    with open('countries.txt') as openfileobject:
    #        for line in openfileobject:
    #            dictionary['countries'].append(line.rstrip())
    #        openfileobject.closed

    fileHandler = open('countries.txt')
    allCountries = fileHandler.read()
    fileHandler.close()
    dictionary['countries'] = allCountries.split("\n")

    fileHandler = open('months.txt')
    allMonths = fileHandler.read()
    fileHandler.close()
    dictionary['months'] = allMonths.split("\n")

    fileHandler = open('days.txt')
    allDays = fileHandler.read()
    fileHandler.close()
    dictionary['days'] = allDays.split("\n")
    ### in the same way we can use different dictionaries to tag details for our detected nouns
    #print(dictionary['coutries'][1])
    finalNamedEntityWithEntityTags = []

    for n in nounList:  # here by n I mean one noun from the list of nouns
        if n in dictionary['countries']:
            finalNamedEntityWithEntityTags.append((n, 'name of Country'))
        if n in dictionary['months']:
            finalNamedEntityWithEntityTags.append((n, 'name of Month'))
        if n in dictionary['days']:
            finalNamedEntityWithEntityTags.append((n, 'Day of the week'))

    for resultLine in finalNamedEntityWithEntityTags:
        print(resultLine)

    finalNERWithDetail = []
    dictionary = PyDictionary()
    for n in nounList:
        # this will help the user understand the detected NER
        try:  # if the dictionary has no synonym then it is a name
            finalNERWithDetail.append((n, dictionary.synonym(n)))
        except:
            finalNERWithDetail.append(
                (n, "it is a name of something or a person"))

    print(
        "=> Detected NER with synonym detail that help to understand these NER: "
    )
    for resultLine in finalNERWithDetail:
        print(resultLine)
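The three near-identical file-loading blocks above could be collapsed into one helper; a sketch (file names taken from the original, one entry per line assumed):

def load_word_list(path):
    # read one entry per line, dropping trailing newlines and blank lines
    with open(path) as fh:
        return [line.strip() for line in fh if line.strip()]

dictionary = {
    'countries': load_word_list('countries.txt'),
    'months': load_word_list('months.txt'),
    'days': load_word_list('days.txt'),
}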
Example #17
def main():
    print "user input(1) or semcor(2)?"

    num = raw_input()

    if num == "1":
        #input
        print "enter word"
        word = raw_input()
        for meaning in (net.synsets(word)):
            #print "Sense: " + re.findall("'.*'", str(meaning))[0]
            print "Sense: " + str(meaning)
            print meaning.definition() + "\n"
            hypernyms = (meaning.hypernyms())
            if len(hypernyms) > 0:
                print "\nHypernyms:"
                for meaning2 in hypernyms:
                    print re.findall("'.*'", str(meaning2))[0]

            hyponyms = (meaning.hyponyms())
            if len(hyponyms) > 0:
                print "\nHyponyms:"
                for meaning2 in hyponyms:
                    print re.findall("'.*'", str(meaning2))[0]

    #		print "\nHypernym Tree:"
    #		print (gethypernymtree(meaning))
            print "\n"

    #		dog = wn.synset('dog.n.01')
    #		hypo = lambda s: s.hyponyms()
    #	 	hyper = lambda s: s.hypernyms()
    #list(dog.closure(s.hypernyms(), depth=1)) == dog.hypernyms()
    #True
    #>>> list(dog.closure(hyper, depth=1)) == dog.hypernyms()

    elif (num == "2"):
        #semcor
        print "semcor"

        for line in semcor.sents()[0:100]:
            s = ""
            for word in line:
                s = s + " " + word
            print s + "\n"

            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print meanings[0].definition()
    elif num == "3":

        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text

        from nltk.sem import relextract
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))

        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)

    #	The function extract_rels() allows us to filter the reldicts
    #	according to the classes of the subject and object named entities.
    #	In addition, we can specify that the filler text has to match a given regular expression,
    #	 as illustrated in the next example. Here, we are looking for pairs of entities in the IN
    #	relation, where IN has signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print fileid
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('ORG',
                                                   'LOC',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=IN):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

        roles = "(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"

        ROLES = re.compile(roles, re.VERBOSE)
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('PER',
                                                   'ORG',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=ROLES):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
Example #18
def createDoc(text):#To create a DOCUMENT by combining all the chunked sentences.
	chunkedSents=list()
	for sent in nltk.sent_tokenize(text):
		chunkedSents+=[chunk for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))]
	docTree=nltk.Tree('DOCUMENT',chunkedSents)
	pairs=relextract.tree2semi_rel(docTree)

	for sent,tree in pairs:
		print '("...%s", %s)' % (' '.join([word for word,tag in sent][-5:]),tree) # To print
	
	reldicts = relextract.semi_rel2reldict(pairs)

	for r in reldicts:
		print('=' * 30)
		print(r['subjclass'], ':', r['subjtext'])
		print(r['filler'])
		print(r['objclass'], ':', r['objtext'])

	# Match pattern in filler
	roles = """
			(.*(
			analyst|
			chair(wo)?man|
			commissioner|
			counsel|
			director|
			economist|
			editor|
			executive|
			foreman|
			governor|
			head|
			lawyer|
			leader|
			librarian).*)|
			manager|
			partner|
			president|
			producer|
			professor|
			researcher|
			spokes(wo)?man|
			writer|
			,\sof\sthe?\s* 
			"""# "X, of (the) Y"
	ROLES = re.compile(roles, re.VERBOSE)
	IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
	pattern=ROLES
	subjclass='PERSON'#'ORGANIZATION'
	objclass='ORGANIZATION'#'GPE'
	window=10
	relfilter = lambda x: (x['subjclass'] == subjclass and
                           len(x['filler'].split()) <= window and
                           pattern.match(x['filler']) and
                           x['objclass'] == objclass)

	for rel in list(filter(relfilter, reldicts)):
		print(relextract.rtuple(rel))
	
	def improve(reldicts):
		for dicts in reldicts:
			print(len(nltk.sent_tokenize(dicts['filler'])))
	improve(reldicts)

	#print pairs[0]
	#print pairs[1]
	#print pairs[2]
	#for sent,tree in pairs[0]:
	#	print sent,tree 
		#print('("...%s", %s)' % (" ".join(sent[0][-5:]),tree))
	#tree=nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize( nltk.sent_tokenize(text)[24] )))
	#l=[chunk for chunk in tree]
	#print "%s"%docTree
	"""