def annotate(text, properties_step):
    '''
    Use NLPCore to annotate text and extract relations.
    :param text: list of sentences (list of str)
    :param properties_step: 1 or 2, the first step or the second step
    :return: annotated document
    '''
    properties_1 = {
        "annotators": "tokenize,ssplit,pos,lemma,ner",
        "ner.useSUTime": "0"
    }
    properties_2 = {
        # Second pipeline; leave out parse,relation for the first
        "annotators": "tokenize,ssplit,pos,lemma,ner,parse,relation",
        # Must be present for the second pipeline!
        "parse.model": "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
        "ner.useSUTime": "0"
    }
    nlpcore_path = os.path.abspath("stanford-corenlp-full-2017-06-09")
    # nlpcore_path = "/Users/vibrioh/local_projects/stanford-corenlp-full-2017-06-09"
    client = NLPCoreClient(nlpcore_path)
    if properties_step == 1:
        pipeline = properties_1
    else:
        pipeline = properties_2
    doc = client.annotate(text, pipeline)
    return doc
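# Hedged usage sketch (not from the original source): one way the two-step
# annotate() helper above might be driven. Assumes `os` and NLPCoreClient are
# imported and the CoreNLP directory sits next to this script; the attribute
# names (doc.sentences, token.ner, rel.probabilities) follow the snippets below.
example_sentences = ["Bill Gates works at Microsoft."]
first_pass = annotate(example_sentences, properties_step=1)      # NER-only pipeline
ner_tags = [t.ner for s in first_pass.sentences for t in s.tokens]
if "PERSON" in ner_tags and "ORGANIZATION" in ner_tags:
    second_pass = annotate(example_sentences, properties_step=2)  # adds parse,relation
    for s in second_pass.sentences:
        for rel in s.relations:
            print(rel.probabilities)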
def tag_relations(phrases):
    client = NLPCoreClient(STANFORD_CORENLP_PATH)
    properties = {
        "annotators": "",
        "parse.model": "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
        "ner.useSUTime": "0"
    }
    # annotate with the second pipeline
    properties["annotators"] = "tokenize,ssplit,pos,lemma,ner,parse,relation"
    doc = client.annotate(text=phrases, properties=properties)

    # Iterate through all relations, evaluate and record them
    relations = []
    for sentence in doc.sentences:
        relations.extend(record_relations(sentence))
    return relations
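# record_relations() is referenced above but not shown in this snippet. A minimal
# sketch of what such a helper might look like (an assumption, not the original
# implementation): collect each relation's entity pair and its most probable label.
def record_relations_sketch(sentence):
    results = []
    for rel in sentence.relations:
        if len(rel.entities) != 2:
            continue
        best_label = max(rel.probabilities, key=lambda k: float(rel.probabilities[k]))
        results.append((rel.entities[0].value, rel.entities[0].type,
                        rel.entities[1].value, rel.entities[1].type,
                        best_label, float(rel.probabilities[best_label])))
    return results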
def find_query_term_occurrences(text):
    """Annotate text with the Stanford CoreNLP."""
    client = NLPCoreClient(STANFORD_CORENLP_PATH)
    properties = {
        "annotators": "",
        "parse.model": "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
        "ner.useSUTime": "0"
    }
    # annotate with the first pipeline
    properties["annotators"] = "tokenize,ssplit,pos,lemma,ner"
    doc = client.annotate(text=text, properties=properties)

    # find sentences with matching tokens from the query
    eligiblePhrases = []
    for sentence in doc.sentences:
        s = eval_sentence(sentence)
        if s is not False:
            eligiblePhrases.append(s)
    return eligiblePhrases
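# eval_sentence() is not shown in this snippet. A hedged sketch of one way it
# could behave (an assumed implementation, not the original): keep a sentence
# only if it carries named entities at all, returning its reconstructed text,
# and False otherwise, matching the "is not False" check above.
def eval_sentence_sketch(sentence):
    if not any(token.ner != 'O' for token in sentence.tokens):
        return False
    return ' '.join(token.word for token in sentence.tokens)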
class NLPParser(object):
    ## the constructor
    # @param lib the root path of the Stanford NLP tools, type: str
    def __init__(self, lib):
        self.client = NLPCoreClient(lib)
        # some default parameters to feed the "annotator"
        self.annotators = "tokenize,ssplit,pos,lemma,ner,parse,relation"
        self.parsemodel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
        self.useSUTime = "0"
        # used for filtering sentences that contain certain named entities like 'PERSON',
        # e.g. if the relation is 'Work_In' then there must be at least one entity of
        # 'PERSON' and one of 'ORGANIZATION' in the sentence
        self.entity_filters = {
            "Live_In": lambda x: x['PERSON'] > 0 and x['LOCATION'] > 0,
            "Located_In": lambda x: x['LOCATION'] > 1,
            "OrgBased_In": lambda x: x['LOCATION'] > 0 and x['ORGANIZATION'] > 0,
            "Work_For": lambda x: x['PERSON'] > 0 and x['ORGANIZATION'] > 0
        }
        if DEBUG:
            # cache the parsed results under a local directory for faster debugging
            self.cache_dir = "parsed"
            if self.cache_dir and not os.path.exists(self.cache_dir):
                os.makedirs(self.cache_dir)
                os.makedirs(self.cache_dir + '/1')
                os.makedirs(self.cache_dir + '/2')
        # END - if DEBUG

    ## calling the annotator
    # @param lines texts to process, type: list(str)
    # @param annotators specific annotators, type: str
    # @param chunk size of chunk to process in one batch, type: int
    #
    # @ret list of data.Sentence instances, each representing a parsed sentence
    def annotate(self, lines, annotators=None, chunk=1e6):
        if not annotators:
            annotators = self.annotators
        properties = {
            "annotators": annotators,
            "parse.model": self.parsemodel,
            "ner.useSUTime": self.useSUTime
        }
        # split the processing into chunks to avoid memory overflow
        # or generating gigantic temp files (like the input.txt.html);
        # it might not be necessary, so the option is left here
        sentences = []
        i = 0
        chunk = int(chunk)
        while i < len(lines):
            j = min(len(lines), i + chunk)
            if i >= j:
                break
            # calling the annotator, Python-wrapped Java code
            document = self.client.annotate(text=lines[i:j], properties=properties)
            sentences.extend(document.sentences)
            i = j
        return sentences

    ## first round of parsing, screening sentences with relevant named entities
    # @param key key to the document, for caching, type: str
    # @param lines texts to process, type: list(str)
    # @param relation relation of interest (e.g. Work_In), type: str
    #
    # @ret list of sentences in pure text
    def __first_round(self, key, lines, relation):
        if DEBUG:
            # use the hash value of url + relation string to name the cache file
            fname = "{}/1/{}.txt".format(self.cache_dir, abs(hash(key + relation)))
            if os.path.exists(fname):
                with open(fname, 'r') as f:
                    return [l.rstrip('\n') for l in f]
        # END - if DEBUG
        res = []
        if not relation in self.entity_filters:
            return res
        entity_filter = self.entity_filters[relation]
        # calling a selection of annotators (no parsing or relation)
        sentences = self.annotate(lines, "tokenize,ssplit,pos,lemma,ner")
        for sentence in sentences:
            if len(sentence.tokens) >= 50:
                ##
                # sometimes the scraper returns very long sentences that are
                # computationally expensive but usually not as productive in
                # generating relation tuples; the sentence length limit of 50
                # words comes after several experiments on the trade-off of
                # performance and correctness
                continue
            # count the number of named entities and filter sentences accordingly
            entity_counts = defaultdict(int)
            for token in sentence.tokens:
                entity_counts[token.ner] += 1
            # filter by named entity counts
            if entity_filter(entity_counts):
                line = u' '.join([token.word for token in sentence.tokens
                                  ]).encode('ascii', 'ignore').replace('|', '')
                res.append(line)
        if DEBUG:
            with open(fname, 'w') as f:
                for line in res:
                    f.write(line + '\n')
        # END - if DEBUG
        return res

    ## second round of parsing, get the relations
    # @param key key to the document, for caching, type: str
    # @param lines texts to process, type: list(str)
    # @param relation relation of interest (e.g. Work_In), type: str
    #
    # @ret list of relation descriptions (entity#1 value, entity#2 value,
    #      entity#1 type, entity#2 type, confidence, sentence text), type: tuple
    def __second_round(self, key, lines, relation):
        if DEBUG:
            # use the hash value of url + relation to name the cache file
            fname = "{}/2/{}.txt".format(self.cache_dir, abs(hash(key + relation)))
            if os.path.exists(fname):
                with open(fname, 'r') as f:
                    return [tuple(l.rstrip('\n').split('|')) for l in f]
        # END - if DEBUG
        # calling the full set of annotators (default)
        sentences = self.annotate(lines)
        res = []
        for sentence in sentences:
            raw = ""  # raw text of the sentence
            for rel in sentence.relations:
                # each relation pair in the sentence contains a relation type,
                # its probability (confidence), and a pair of entities with value and type;
                # skip the relation pair if the confidence of the relation we are looking
                # for is not the highest among all relations
                probabilities = rel.probabilities
                if float(probabilities.get(relation, -1)) < max(
                        map(float, probabilities.values())):
                    continue
                e = rel.entities
                if len(e) == 2:
                    if not raw:
                        # construct the raw text of the sentence now by
                        # joining the "word"s of its tokens
                        raw = u' '.join([t.word for t in sentence.tokens
                                         ]).encode('ascii', 'ignore')
                    # append the relation description as a tuple to the results
                    res.append((e[0].value.rstrip(), e[1].value.rstrip(),
                                e[0].type.rstrip(), e[1].type.rstrip(),
                                probabilities[relation], raw))
        if DEBUG:
            with open(fname, 'w') as f:
                for line in res:
                    f.write("|".join(map(str, line)) + '\n')
        # END - if DEBUG
        return res

    ## extract relation tuples from a search document
    # @param doc the search document with scraped text, type: SearchDocument
    # @param relation relation of interest (e.g. Work_In), type: str
    #
    # @ret list of relation tuples, type: list(RelationTuple)
    def extract_relation(self, doc, relation):
        key = doc.key
        # first round, screen sentences
        lines = self.__first_round(key, doc.text.split('\n'), relation)
        # second round, get relations
        relations = self.__second_round(key, lines, relation)
        # combine relation tuples with the same entities
        res = {}
        for t in relations:
            (v0, v1, t0, t1, prob, sentence) = t
            rt = RelationTuple(v0, v1, t0, t1, prob, sentence, relation)
            key = hash(rt)
            if not key in res or res[key] < rt:
                # new tuple or better than the existing one, add to results
                res[key] = rt
        return res.values()
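# Hedged usage sketch (not from the original source). NLPParser.extract_relation
# expects a document object exposing .key and .text (the project's SearchDocument),
# a RelationTuple class, and a module-level DEBUG flag, all defined elsewhere;
# everything below is an assumption for illustration only.
class _DemoDocument(object):
    def __init__(self, key, text):
        self.key = key
        self.text = text

parser = NLPParser(os.path.abspath("stanford-corenlp-full-2017-06-09"))
demo_doc = _DemoDocument("http://example.com", "Bill Gates works at Microsoft.\n")
for rt in parser.extract_relation(demo_doc, "Work_For"):
    print(rt)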
class InformationExtractionEngine:
    def __init__(self):
        self.relation_map = {
            1: "Live_In",
            2: "Located_In",
            3: "OrgBased_In",
            4: "Work_For"
        }
        self.entity_map = {
            1: ['PEOPLE', 'LOCATION'],
            2: ['LOCATION', 'LOCATION'],
            3: ['ORGANIZATION', 'LOCATION'],
            4: ['PEOPLE', 'ORGANIZATION']
        }
        self.queries = set()
        # (word1, word2) -> (entity1, entity2, relation, prob)
        self.X = {}
        # Parameters from input
        self.SEARCH_JSON_API_KEY = ""
        self.SEARCH_ENGINE_ID = ""
        self.RELATION = 0
        self.THRESHOLD = 0
        self.QUERY = ""
        self.k = 0
        # Retrieved set
        self.retrieved_url = set()
        # Client
        self.client = NLPCoreClient(
            os.path.abspath("stanford-corenlp-full-2017-06-09"))

    def find_entities(self, sentences):
        """
        Find the named entities in a list of sentences, serving as the first pipeline.
        :return: list of entities
        """
        entities = []
        properties = {
            "annotators": "tokenize,ssplit,pos,lemma,ner",
            "parse.model": "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
            "ner.useSUTime": "0"
        }
        doc = self.client.annotate(text=sentences, properties=properties)
        for sentence in doc.sentences:
            for token in sentence.tokens:
                if token.ner != 'O':
                    entities.append(token.ner)
        return entities

    def extract_relation_from_page(self, sentences):
        """
        Extract relations from a sentence list (retrieved from a web page); if a relation
        satisfies the conditions (threshold and RELATION type), update the relation set.
        :return: void
        """
        properties = {
            "annotators": "tokenize,ssplit,pos,lemma,ner,parse,relation",
            "parse.model": "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
            "ner.useSUTime": "0"
        }
        doc = self.client.annotate(text=sentences, properties=properties)
        # print doc.tree_as_string()
        for sentence in doc.sentences:
            for relation in sentence.relations:
                try:
                    probs = relation.probabilities
                    # Find the max relation and the corresponding max prob
                    max_relation = max(probs.iterkeys(), key=(lambda key: probs[key]))
                    max_prob = probs[max_relation]
                    # If the max prob is over the threshold and the max relation equals
                    # the current relation, update the relation set
                    if max_prob > self.THRESHOLD and max_relation == self.relation_map[self.RELATION]:
                        key = (relation.entities[0].value, relation.entities[1].value)
                        # check the entity types of the pair
                        if float(max_prob) >= float(self.THRESHOLD) \
                                and self.entity_map[self.RELATION][0] == relation.entities[0].type \
                                and self.entity_map[self.RELATION][1] == relation.entities[1].type:
                            if key not in self.X or float(self.X[key][3]) < float(max_prob):
                                self.X[key] = (relation.entities[0].type,
                                               relation.entities[1].type,
                                               max_relation, max_prob)
                                print "Sentence:",
                                for token in sentence.tokens:
                                    print " " + token.word,
                                print ""
                                print "Relation Type: {0:10}| Confidence: {1:.3f} | EntityType1: {2:15} | EntityValue1: {3:15} | EntityType2: {4:15} | EntityValue2: {5:15}".format(
                                    max_relation, float(max_prob),
                                    relation.entities[0].type,
                                    relation.entities[0].value,
                                    relation.entities[1].type,
                                    relation.entities[1].value)
                except:
                    pass

    def extract_relation(self):
        """
        This function executes each iteration:
        1. Google Search and get a list of sentences for each URL
        2. First pipeline: find entities and identify the page
        3. If the page passes, second pipeline: extract relations and update the relation set
        :return: void
        """
        web_pages = self.google_search()
        # print "Web Page Retrieved"
        for page in web_pages:
            entities = self.find_entities(page)
            # print entities
            if self.identity_page(entities):
                # print "Page Identified"
                self.extract_relation_from_page(page)
                # print "Page Processed"

    def identity_page(self, entities):
        """
        Check whether the entities from one retrieved web page contain the entities
        required for its relation.
        :return: boolean
        """
        if self.RELATION == 1:  # Live_In
            if 'PERSON' in entities and 'LOCATION' in entities:
                return True
            else:
                return False
        elif self.RELATION == 2:  # Located_In
            location_count = 0
            for entity in entities:
                if entity == 'LOCATION':
                    location_count += 1
            if location_count < 2:
                return False
            else:
                return True
        elif self.RELATION == 3:  # OrgBased_In
            if 'ORGANIZATION' in entities and 'LOCATION' in entities:
                return True
            else:
                return False
        elif self.RELATION == 4:  # Work_For
            if 'ORGANIZATION' in entities and 'PERSON' in entities:
                return True
            else:
                return False

    def google_search(self):
        """
        Return the top-10 results of a Google search using QUERY.
        :return: list
        """
        results = []
        # Google search
        url = "https://www.googleapis.com/customsearch/v1?key=" + self.SEARCH_JSON_API_KEY \
              + "&cx=" + self.SEARCH_ENGINE_ID + "&q=" + self.QUERY
        response = requests.get(url)
        search_results = json.loads(response.text)['items']
        # Retrieve each URL and extract the plain text
        for item in search_results:
            item_url = item['link']
            if item_url not in self.retrieved_url:
                try:
                    text = self.extract_text_from_page(item_url)
                    results.append(text)
                except:
                    pass
                self.retrieved_url.add(item_url)
        return results

    def extract_text_from_page(self, url):
        """
        Extract plain text from the web page pointed to by url.
        :return: list of sentences (str)
        """
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        # Kill all script and style elements
        for script in soup(["script", "style"]):
            script.extract()
        # Get text
        text = soup.get_text()
        # Break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        # text = [chunk.decode('unicode_escape').encode('ascii','ignore') + "." for chunk in chunks if chunk]
        text = [chunk.encode('utf-8') + "." for chunk in chunks if chunk]
        sentences = []
        for t in text:
            sentences.extend(re.split(r' *[\.\?!][\'"\)\]]* *', t))
        results = [sentence + "." for sentence in sentences]
        return results

    def read_parameters(self):
        """
        Read parameters from the command line and assign them to global vars.
        :return: void
        """
        inputs = sys.argv
        if len(inputs) < 7:
            self.usage()
            sys.exit(1)
        self.SEARCH_JSON_API_KEY = inputs[1]
        self.SEARCH_ENGINE_ID = inputs[2]
        self.RELATION = int(inputs[3])
        self.THRESHOLD = float(inputs[4])
        self.QUERY = inputs[5]
        self.k = int(inputs[6])
        # Print to console
        print("Search Key: " + self.SEARCH_JSON_API_KEY)
        print("Search Engine ID: " + self.SEARCH_ENGINE_ID)
        print("RELATION: " + self.relation_map[self.RELATION])
        print("THRESHOLD: " + str(self.THRESHOLD))
        print("QUERY: " + self.QUERY)
        print("# of Tuples: " + str(self.k))

    def usage(self):
        sys.stderr.write(
            """Usage: python main.py <SEARCH_JSON_API_KEY> <SEARCH_ENGINE_ID> <RELATION> <THRESHOLD> <QUERY> <k>.\n"""
        )

    def run(self):
        self.read_parameters()
        ind = 1
        while len(self.X) < self.k:
            print("Iteration " + str(ind) + ": query - " + self.QUERY)
            self.extract_relation()
            sorted_X = sorted(self.X.items(), key=lambda (k, v): v[3], reverse=True)
            print("=====Relations=====")
            count = 0
            for t in sorted_X:
                count += 1
                if count > self.k:
                    break
                print "Relation Type: {0:10}| Confidence: {1:.3f} | Entity #1: {2:15}| Entity #2: {3:15}".format(
                    self.relation_map[self.RELATION], float(t[1][3]), t[0][0], t[0][1])
            for t in sorted_X:
                temp_query = t[0][0] + " " + t[0][1]
                if not temp_query in self.queries:
                    self.queries.add(temp_query)
                    break
            # Check whether we have entered an infinite loop
            if self.QUERY == temp_query:
                sys.stderr.write(
                    """Cannot find enough relation instances, program ends. Please try another set of parameters.\n"""
                )
                sys.exit(1)
            self.QUERY = temp_query
            ind += 1
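# Hedged entry-point sketch (not part of the original snippet): a minimal driver
# for InformationExtractionEngine, assuming requests, json, re, sys, os,
# BeautifulSoup, and NLPCoreClient are imported at module level as the methods
# above require.
if __name__ == "__main__":
    engine = InformationExtractionEngine()
    engine.run()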
from NLPCore import NLPCoreClient

text = ["Bill Gates works at Microsoft.", "Sergei works at Google."]

# path to corenlp
client = NLPCoreClient('/path/to/stanford-corenlp-full-2017-06-09')
properties = {
    "annotators": "tokenize,ssplit,pos,lemma,ner,parse,relation",
    "parse.model": "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
    "ner.useSUTime": "0"
}
doc = client.annotate(text=text, properties=properties)
print(doc.sentences[0].relations[0])
print(doc.tree_as_string())
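# Hedged sketch (an addition, not from the original example): walking the
# annotated document returned above, using only attributes that appear in the
# surrounding snippets (sentence.tokens, token.word, token.ner, relation.entities,
# relation.probabilities; the probability values are handled as strings there).
for sentence in doc.sentences:
    words = [token.word for token in sentence.tokens]
    ner_tags = [token.ner for token in sentence.tokens]
    for rel in sentence.relations:
        for entity in rel.entities:
            print(entity.type, entity.value)
        print(rel.probabilities)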
def makeQuery(apiKey, engineID, relation, threshold, query, k):
    if int(relation) == 1:
        relationName = "Live_In"
    elif int(relation) == 2:
        relationName = "Located_In"
    elif int(relation) == 3:
        relationName = "OrgBased_In"
    else:
        relationName = "Work_For"
    print("Parameters:")
    print("Client Key = " + apiKey)
    print("Engine Key = " + engineID)
    print("Relation = " + relationName)
    print("Threshold = " + str(threshold))
    print("Query = " + query)
    print("# of tuples = " + str(k))
    iterationNum = 1
    goodTuples = 0
    extractedRelations = 0
    totalExtractedRelations = 0
    tuples = set()
    queries = set()
    while goodTuples < int(k):
        totalExtractedRelations = goodTuples
        service = build("customsearch", "v1", developerKey=apiKey)
        res = service.cse().list(
            q=query,
            cx=engineID,
        ).execute()
        print("=========== Iteration: " + str(iterationNum) + " - Query: " + query + " ===========")
        for i in range(10):
            extractedRelations = 0
            solution = res[u'items'][i][u'link'].encode('ascii', 'ignore')
            print("Processing: " + solution)
            try:
                r = urllib.urlopen(solution).read()
            except Exception as e:
                print("Program could not extract text content from this web site; moving to the next one...")
                continue
            soup = BeautifulSoup(r)
            texts = soup.find_all(['h1', 'h2', 'h3', 'p'])
            result = []
            for text in texts:
                result.append(text.text.encode('ascii', 'ignore'))
            client = NLPCoreClient('stanford-corenlp-full-2017-06-09')
            properties = {
                # First pipeline; leave out parse,relation
                "annotators": "tokenize,ssplit,pos,lemma,ner",
                # Must be present for the second pipeline!
                "parse.model": "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
                "ner.useSUTime": "0"
            }
            properties2 = {
                # Second pipeline; parse,relation included
                "annotators": "tokenize,ssplit,pos,lemma,ner,parse,relation",
                # Must be present for the second pipeline!
                "parse.model": "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
                "ner.useSUTime": "0"
            }
            doc = client.annotate(text=result, properties=properties)
            goodSentences = []
            for sen in doc.sentences:
                tok1 = False
                tok2 = False
                if int(relation) == 1:
                    for re in sen.tokens:
                        if re.ner == "PERSON":
                            tok1 = True
                        elif re.ner == "LOCATION":
                            tok2 = True
                elif int(relation) == 2:
                    for re in sen.tokens:
                        if re.ner == "LOCATION" and tok1 == False:
                            tok1 = True
                        elif tok1 == True and re.ner == "LOCATION":
                            tok2 = True
                elif int(relation) == 3:
                    for re in sen.tokens:
                        if re.ner == "ORGANIZATION":
                            tok1 = True
                        elif re.ner == "LOCATION":
                            tok2 = True
                else:
                    for re in sen.tokens:
                        if re.ner == "PERSON":
                            tok1 = True
                        elif re.ner == "ORGANIZATION":
                            tok2 = True
                if tok1 == True and tok2 == True:
                    goodSentences.append(sen)
            finalSentences = []
            for sentence in goodSentences:
                newsentence = ""
                for x in sentence.tokens:
                    newsentence += " " + x.word.encode('ascii', 'ignore')
                finalSentences.append(newsentence)
            doc2 = client.annotate(text=finalSentences, properties=properties2)
            list1 = []
            list2 = []
            for s1 in doc2.sentences:
                list1.append(s1)
            for s3 in list1:
                counterr = 0
                for s4 in s3.relations:
                    if counterr == 2:
                        break
                    counterr += 1
                    if relationValid(s4, relation, relationName):
                        print("=============== EXTRACTED RELATION ===============")
                        extractedRelations += 1
                        newsentence1 = ""
                        for x1 in s3.tokens:
                            newsentence1 += " " + x1.word
                        print("Sentence: " + newsentence1)
                        confidence = s4.probabilities[relationName]
                        enTy1 = s4.entities[0].type
                        enVa1 = s4.entities[0].value
                        enTy2 = s4.entities[1].type
                        enVa2 = s4.entities[1].value
                        print("RelationType: " + relationName + " | Confidence= " + confidence +
                              " | EntityType1= " + enTy1 + " |")
                        print("EntityValue1= " + enVa1 + " | EntityType2= " + enTy2 +
                              " | EntityValue2= " + enVa2 + " |")
                        print("============== END OF RELATION DESC ==============")
                        if (float(confidence) >= float(threshold) and
                                float(confidence) >= float(s4.probabilities["Live_In"]) and
                                float(confidence) >= float(s4.probabilities["OrgBased_In"]) and
                                float(confidence) >= float(s4.probabilities["Located_In"]) and
                                float(confidence) >= float(s4.probabilities["Work_For"])):
                            tuples.add((relationName, round(float(confidence), 3),
                                        enTy1, enVa1, enTy2, enVa2))
            totalExtractedRelations += extractedRelations
            print("Relations extracted from this website: " + str(extractedRelations) +
                  " (Overall: " + str(totalExtractedRelations) + ")")
        print("Pruning relations below threshold...")
        goodTuples = len(tuples)
        print("Number of tuples after pruning: " + str(goodTuples))
        print("================== ALL RELATIONS =================")
        myTuples = list(tuples)
        myTuples.sort(key=operator.itemgetter(1), reverse=True)
        count = 0
        queries.add(query)
        for tup in myTuples:
            if (count == 0 and enVa2 + " " + enVa1 not in queries):
                query = enVa2 + " " + enVa1
                count = 1
            print("RelationType: " + tup[0] + " | Confidence: " + str(tup[1]) +
                  " | Entity #1= " + tup[3] + " (" + tup[2] + ") | Entity #2: " +
                  tup[5] + " (" + tup[4] + ")")
        if (count == 0):
            print("All possible queries have already been used! Breaking Program")
            goodTuples = 1000
        iterationNum += 1
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from NLPCore import NLPCoreClient
from collections import defaultdict
import urllib.request as ur
import tika
import json
import sys
import re

SearchAPI = "https://www.googleapis.com/customsearch/v1"
NLPPackagePath = sys.argv[7]
client = NLPCoreClient(NLPPackagePath)

## Parameters for selection
DEBUG = False
USE_TIKA = True  # Default using BeautifulSoup, can choose TIKA

checkList = {"Work_For": "ORGANIZATION PEOPLE",
             "OrgBased_In": "LOCATION ORGANIZATION",
             "Live_In": "LOCATION PEOPLE",
             "Located_In": "LOCATION LOCATION"}


def googleQuery(CSEKey, JsonAPIKey, query):
    payload = {'cx': CSEKey, 'key': JsonAPIKey, 'q': query}
    r = requests.get(SearchAPI, params=payload)
    return json.loads(r.text)


def print_formatter(source):
    print("=============== EXTRACTED RELATION ===============")
    print("Sentence:", source['text'])
    builder = []