def pipeline1(text, r, t):
    # Relies on module-level names defined elsewhere in this project:
    # patterns, toRelation, relation, pipeline2, to_text, and the
    # CoreNLPClient import.
    extractedRelations = []
    with CoreNLPClient(
            annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner'],
            timeout=450000,
            memory='4G',
            endpoint="http://localhost:9000",
            threads=7) as client:
        print("\tAnnotating the webpage using [tokenize, ssplit, pos, lemma, ner] annotators ...")
        ann = client.annotate(text)
        sentenceNumber = len(ann.sentence)
        namedEntity = patterns[toRelation[r]]
        print("\tExtracted %d sentences. Processing each sentence one by one to check for presence of right pair of named entity types; if so, will run the second pipeline ..." % sentenceNumber)
        # Collect every sentence that contains both targeted named entity
        # types; only those sentences are passed to the second (KBP) pipeline.
        processedSentence = []
        for i, sentence in enumerate(ann.sentence):
            # Check whether both named entity types required by the query
            # appear in this sentence.
            firstEntity = False
            secondEntity = False
            for token in sentence.token:
                if toRelation[r] == relation[2]:
                    # For this relation the second slot accepts several
                    # entity types, so test membership in a list.
                    if token.ner == namedEntity[0]:
                        firstEntity = True
                    if token.ner in namedEntity[1]:
                        secondEntity = True
                else:
                    if token.ner == namedEntity[0]:
                        firstEntity = True
                    if token.ner == namedEntity[1]:
                        secondEntity = True
            # If both targeted named entities appear, keep the sentence.
            if firstEntity and secondEntity:
                processedSentence.append([i, to_text(sentence)])
        # Extract relations from the collected sentences via pipeline2.
        extractedRelations += pipeline2(processedSentence, t)
        print("Extracted kbp annotations for %d out of total %d sentences" %
              (len(processedSentence), sentenceNumber))
    return extractedRelations
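# `pipeline2` is defined elsewhere in this module. The sketch below is a
# hypothetical stand-in (hence the `_sketch` suffix), assuming the second
# pipeline runs the full [depparse, coref, kbp] annotator stack over each
# candidate sentence and keeps triples whose confidence exceeds `t`; it is
# not the original implementation.
def pipeline2_sketch(processedSentence, t):
    extracted = []
    with CoreNLPClient(
            annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner',
                        'depparse', 'coref', 'kbp'],
            timeout=450000,
            memory='4G',
            endpoint="http://localhost:9001") as client:
        for _, sentenceText in processedSentence:
            ann = client.annotate(sentenceText)
            for sentence in ann.sentence:
                for triple in sentence.kbpTriple:
                    # Keep only triples above the confidence threshold.
                    if triple.confidence > t:
                        extracted.append((triple.subject, triple.relation,
                                          triple.object, triple.confidence))
    return extracted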
def checkKBPConfidence(ann_kbp, r, counterExtractedTuples):
    # Uses module-level `t` (confidence threshold) and `extractedTuples`
    # (dict mapping "subject,object" -> best confidence seen so far).
    for sentence in ann_kbp.sentence:
        for kbp_triple in sentence.kbpTriple:
            if kbp_triple.relation == r:
                print("\t=== Extracted Relation ===")
                print("\tSentence: ", to_text(sentence))
                print(f"\tConfidence: {kbp_triple.confidence}; Subject: {kbp_triple.subject}; Object: {kbp_triple.object}")
                if kbp_triple.confidence > t:
                    key = str(kbp_triple.subject) + "," + str(kbp_triple.object)
                    if key in extractedTuples:
                        # Known tuple: update its confidence only upward.
                        if kbp_triple.confidence > extractedTuples[key]:
                            extractedTuples[key] = kbp_triple.confidence
                            print("The same relation is already present but with a lower confidence. Just updating the confidence value.")
                        else:
                            print("The same relation is already present with higher (or equal) confidence. Ignoring this.")
                    else:
                        # Brand new relation.
                        extractedTuples[key] = kbp_triple.confidence
                        print("\tAdding to set of extracted relations")
                        counterExtractedTuples += 1
                else:
                    print("\tConfidence is lower than threshold confidence. Ignoring this.")
                print("\t==========")
    return counterExtractedTuples
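# A hypothetical call site for checkKBPConfidence (commented out so the
# module stays import-safe). `annotators_kbp`, `extractedTuples`, and `t`
# are the module-level names used above and in `main` below; the sentence
# is illustrative, and the relation name is the one this project uses for
# the PERSON/CITY relation:
#
#   extractedTuples = {}
#   t = 0.7
#   with CoreNLPClient(timeout=30000, memory='4G', be_quiet=True) as pipeline:
#       ann_kbp = pipeline.annotate("Bill Gates lives in Seattle.",
#                                   annotators=annotators_kbp)
#       count = checkKBPConfidence(ann_kbp, "per:cities_of_residence", 0)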
def test_update():
    with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client:
        ann = client.annotate(TEXT)
        ann = client.update(ann)
        assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1]
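# The tests in this file rely on a shared TEXT constant and (further below)
# a `corenlp_client` pytest fixture, both defined elsewhere. A minimal
# sketch of both, assuming TEXT is a single sentence ending in a newline,
# which is why the assertions compare against TEXT[:-1]:
import corenlp
import pytest

TEXT = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP.\n"

@pytest.fixture
def corenlp_client():
    # Yield a live client so each test reuses one server session.
    with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client:
        yield client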
from collections import defaultdict

from tika import parser


def process_urls(res, relationdict, r, t):
    # CoreNLPClient and to_text come from the CoreNLP client library used
    # elsewhere in this project.
    # Expected named entity types for each relation id.
    name_dict = {
        '1': ["ORGANIZATION", "PERSON"],
        '2': ["ORGANIZATION", "PERSON"],
        '3': ["PERSON", "CITY"],
        '4': ["ORGANIZATION", "PERSON"]
    }
    X = defaultdict(float)  # (subject, relation, object) -> best confidence
    count = 1
    with CoreNLPClient(
            annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner'],
            timeout=30000,
            memory='4G',
            endpoint="http://localhost:9000") as pipeline_ner:
        with CoreNLPClient(
                annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner',
                            'depparse', 'coref', 'kbp'],
                timeout=30000,
                memory='4G',
                endpoint="http://localhost:9001") as pipeline_kbp:
            for results in res['items']:
                url = results['link']
                title = results['title']
                print("URL (", count, "/ 10):", url)
                print("\tFetching text from url ...")
                count += 1
                # Fetch and parse the page contents with tika.
                try:
                    parsed = parser.from_file(url)
                except Exception:
                    print("Unable to fetch URL. Continuing.")
                    continue
                content = parsed["content"]
                if not content:
                    print("\tNo text content found. Continuing.")
                    continue
                strip_content = ' '.join(content.split())
                # Keep only the first 20000 characters.
                if len(strip_content) >= 20000:
                    strip_content = strip_content[:20000]
                print("\tWebpage length (num characters):", len(strip_content))
                print("\tAnnotating the webpage using [tokenize, ssplit, pos, lemma, ner] annotators ...")
                ann_ner = pipeline_ner.annotate(strip_content)
                countkbp = 0
                print("\tExtracted", len(ann_ner.sentence),
                      "sentences. Processing each sentence one by one to check for presence of right pair of named entity types; if so, will run the second pipeline ...")
                for sentence in ann_ner.sentence:
                    # Check whether both required entity types occur in this sentence.
                    match = [False] * len(name_dict[r])
                    for token in sentence.token:
                        for i in range(len(name_dict[r])):
                            if token.ner == name_dict[r][i]:
                                match[i] = True
                    if all(match):
                        # Run the heavier KBP pipeline on this sentence only.
                        try:
                            ann = pipeline_kbp.annotate(to_text(sentence))
                        except Exception:
                            continue
                        countsentences = 0
                        countkbp += 1
                        for i in ann.sentence:
                            # Periodic progress report.
                            if (countsentences % 5 == 0) and (countsentences != 0):
                                print("\tProcessed", countsentences + 1, "/",
                                      len(ann_ner.sentence), "sentences")
                            elif countsentences == len(ann.sentence):
                                print("\tProcessed", len(ann_ner.sentence), "/",
                                      len(ann_ner.sentence), "sentences")
                            for kbp_triple in i.kbpTriple:
                                if kbp_triple.relation == relationdict[r]:
                                    print("\t\t=== Extracted Relation ===")
                                    print("\t\tSentence:", to_text(i))
                                    print("\t\tConfidence:", kbp_triple.confidence,
                                          "; Subject:", kbp_triple.subject,
                                          "; Object:", kbp_triple.object, ";")
                                    if kbp_triple.confidence > float(t):
                                        triple = (kbp_triple.subject,
                                                  kbp_triple.relation,
                                                  kbp_triple.object)
                                        if X[triple] < kbp_triple.confidence:
                                            # Keep the highest confidence seen
                                            # for this triple.
                                            X[triple] = kbp_triple.confidence
                                            print("\t\tAdding to set of extracted relations")
                                        else:
                                            print("\t\tDuplicate with lower confidence than existing record. Ignoring this.")
                                    else:
                                        print("\t\tConfidence is lower than threshold confidence. Ignoring this.")
                                    print("\t\t==========")
                            countsentences += 1
                print("\tExtracted kbp annotations for ", countkbp,
                      " out of total ", len(ann_ner.sentence), " sentences")
    return X
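# A hypothetical driver for process_urls (commented out; `api_key` and
# `engine_id` are placeholder Google Custom Search credentials, and the
# query is illustrative). relationdict maps relation ids to CoreNLP KBP
# relation names; only "per:cities_of_residence" for id '3' is confirmed
# by the code in this project:
#
#   from googleapiclient.discovery import build
#   service = build("customsearch", "v1", developerKey=api_key)
#   res = service.cse().list(q="bill gates seattle", cx=engine_id).execute()
#   relationdict = {'3': "per:cities_of_residence"}
#   X = process_urls(res, relationdict, '3', 0.7)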
def test_update(corenlp_client):
    ann = corenlp_client.annotate(TEXT)
    ann = corenlp_client.update(ann)
    assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1]
def test_context_manager():
    with corenlp.CoreNLPClient(annotators="tokenize,ssplit") as context_client:
        ann = context_client.annotate(TEXT)
        assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1]
import requests
from bs4 import BeautifulSoup
from googleapiclient.discovery import build


def main(api_key, engine_id, r, t, q, k):
    # Relies on module-level state and helpers defined elsewhere: relations,
    # visitedTuples, visitedURLs, extractedTuples, sentenceNers,
    # nersRelationtype1/2, annotators_ner/kbp, tag_visible, to_text,
    # CoreNLPClient, checkKBPConfidence, sortByConfidence, finalResultsPrint.
    queryIteration = 0
    pageNumberVisited = 0
    r = relations[r]
    counterExtractedTuples = 0
    visitedTuples.add(q)
    # Initial print of parameters.
    print("\nParameters:")
    print("Client key = ", api_key)
    print("Engine key = ", engine_id)
    print("Relation = ", r)
    print("Threshold = ", t)
    print("Query = ", q)
    print("# of Tuples = ", k)
    print("Loading necessary libraries; This should take a minute or so ...")
    while (len(extractedTuples) < k) and (queryIteration < 9):
        print("=========== Iteration: %s - Query: %s ===========" %
              (queryIteration, q))
        # Google Custom Search API (returns the top 10 pages).
        service = build("customsearch", "v1", developerKey=api_key)
        res = service.cse().list(q=q, cx=engine_id).execute()
        # Process each of the 10 result pages.
        for page in res['items']:
            # Print the URL and mark it as visited.
            url = page['formattedUrl']
            if url in visitedURLs:
                print("Already seen URL ... skipping")
                continue
            visitedURLs.add(url)
            pageNumberVisited += 1
            print(f"URL ({pageNumberVisited} / {(queryIteration + 1) * 10}): {url}")
            # Fetch up to 20000 characters of visible text from the page.
            print("Fetching text from url ...")
            try:
                rawPage = requests.get(url)
            except Exception:
                print("Unable to fetch URL. Continuing.")
                continue
            contents = BeautifulSoup(rawPage.text, 'html.parser')
            pageText = contents.findAll(text=True)
            rawText = filter(tag_visible, pageText)
            rawText = u" ".join(s.strip() for s in rawText)
            if len(rawText) > 20000:
                print("Truncating webpage text from size (num characters) %s to 20000 ..."
                      % len(rawText))
                rawText = rawText[:20000]
            print("Webpage length (num characters):", len(rawText))
            # Annotate the page text.
            print("Annotating the webpage using [tokenize, ssplit, pos, lemma, ner] annotators ...")
            counterLastIterationExtractedTuples = counterExtractedTuples
            # TA-provided code for KBP triple extraction.
            try:
                with CoreNLPClient(timeout=30000, memory='4G',
                                   be_quiet=True) as pipeline:
                    ann_ner = pipeline.annotate(rawText, annotators=annotators_ner)
                    # Collect the NER tags relevant to the target relation,
                    # sentence by sentence.
                    for sentence in ann_ner.sentence:
                        sentenceText = to_text(sentence)
                        for token in sentence.token:
                            if token.ner not in sentenceNers:
                                if r == "per:cities_of_residence":
                                    if token.ner in nersRelationtype2:
                                        sentenceNers.add(token.ner)
                                else:
                                    if token.ner in nersRelationtype1:
                                        sentenceNers.add(token.ner)
                        if r == "per:cities_of_residence":
                            # Needs a PERSON entity plus at least one of the
                            # location entity types.
                            if (nersRelationtype2[0] in sentenceNers) and (
                                    (nersRelationtype2[1] in sentenceNers)
                                    or (nersRelationtype2[2] in sentenceNers)
                                    or (nersRelationtype2[3] in sentenceNers)):
                                # KBP annotation for more detailed analysis.
                                ann_kbp = pipeline.annotate(
                                    sentenceText, annotators=annotators_kbp)
                                counterExtractedTuples = checkKBPConfidence(
                                    ann_kbp, r, counterExtractedTuples)
                        else:
                            if (nersRelationtype1[0] in sentenceNers
                                    and nersRelationtype1[1] in sentenceNers):
                                # KBP annotation for more detailed analysis.
                                ann_kbp = pipeline.annotate(
                                    sentenceText, annotators=annotators_kbp)
                                counterExtractedTuples = checkKBPConfidence(
                                    ann_kbp, r, counterExtractedTuples)
                        sentenceNers.clear()
                # End of webpage.
                print(f"Relations extracted from this website {counterExtractedTuples - counterLastIterationExtractedTuples} (Overall: {counterExtractedTuples})")
            except Exception:
                print("Timeout Stanford NLP Server --- Continuing")
        # Next iteration: seed the query with the highest-confidence tuple
        # that has not been used as a query yet.
        queryIteration += 1
        sortedTuples = sortByConfidence(extractedTuples)
        for i in sortedTuples:
            newQuery = ' '.join(i[0].lower().split(','))
            if newQuery not in visitedTuples:
                q = newQuery
                visitedTuples.add(q)
                break
    # End results.
    finalResultsPrint(sortedTuples)
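# `sortByConfidence` and `finalResultsPrint` are defined elsewhere in this
# module. A minimal sketch of sortByConfidence (hence the `_sketch` suffix),
# assuming extractedTuples maps "subject,object" keys to confidence floats
# (as built by checkKBPConfidence above) and callers iterate the result as
# (key, confidence) pairs in decreasing confidence order:
def sortByConfidence_sketch(extractedTuples):
    return sorted(extractedTuples.items(), key=lambda kv: kv[1], reverse=True)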