def run(self):
    """Run the evaluation experiment.

    Parses CLI args, opens the Trident KB next to the input file, loads the
    pickled vectors and dataset, then scores the configured test/valid set
    and logs the results.
    """
    self.args = self.parser2.parse_args()
    self.logger = Logger(self.args.resultsdir, self.args.resultsfilename, 'INFO')
    self.logger.startExperiment(sys.argv)
    self.log = self.logger.getLog()
    # The Trident KB is expected in a "vlog" directory next to the input file.
    dbpath = os.path.dirname(self.args.input) + "/vlog"
    self.db = trident.Db(dbpath)
    #set up params postprocessing
    paramspostp = ParamsPostProcessing(self.args.post_threshold,
                                       self.args.post_nvars,
                                       self.args.post_minsize,
                                       self.args.post_minmatch,
                                       self.args.post_defaultincr,
                                       self.args.post2_enable,
                                       self.args.post2_increment,
                                       self.args.post2_topk)
    # Load the vectors
    with open(self.args.vectors, 'rb') as finv:
        vectors = pickle.load(finv)
    # Load the input
    with open(self.args.input, 'rb') as fin:
        data = pickle.load(fin)
    self.logger.setInput(data)
    self.dictr = data['relations']
    self.dicte = data['entities']
    nameSetToTest = self.args.testValid
    ev = vectors['ev']
    N = len(data['entities'])
    # All known triples (train + test + valid) are used for filtered ranking.
    true_triples = data['train_subs'] + data['test_subs'] + data['valid_subs']
    with open(self.args.fileTestValid, 'rb') as fileTV:
        setToTest = pickle.load(fileTV)
    testMode = self.args.testmode
    while True:
        tester = self.evaluator(self.log, setToTest, true_triples, N,
                                ev.getNExtVars(), ev, data['relations'],
                                data['entities'], paramspostp, self.neval)
        # Do the test
        pos_v, fpos_v = tester.positions(vectors['model'], self.args.exvars, ev, self.db)
        fmrr_valid = self.ranking_scores(pos_v, fpos_v, 0, nameSetToTest)
        if testMode:
            # NOTE(review): nothing inside the loop changes the parameters, so
            # with testmode enabled this re-runs the identical evaluation
            # forever — confirm whether parameter mutation was left unfinished.
            print(fmrr_valid)
            # Change the parameters
        else:
            break
    self.logger.stopExperiment()
def __init__(self, KBPATH: str):
    """Set up the Elasticsearch client, the Trident KB, and a lookup cache.

    KBPATH is the on-disk location of the Trident database.
    """
    es_hosts = [{"host": "localhost", "port": 9200}]
    self.es = AsyncElasticsearch(es_hosts, timeout=30)
    self.db = trident.Db(KBPATH)
    self.loop = asyncio.get_event_loop()
    # In-memory memoisation of earlier lookups.
    self.cache: Dict[str, str] = {}
def open(self, configuration: str, create=False):
    """Open the Trident store located at *configuration*.

    Falls back to ``self.configuration`` when no path is given. Raises
    Exception if the directory does not exist; returns VALID_STORE on success.
    """
    path = configuration or self.configuration
    if not os.path.isdir(path):
        raise Exception(f"Trident db path {path} does not exist!")
    # Import lazily so the module loads even when trident is absent.
    import trident
    self._db = trident.Db(path)
    log.debug(f"Using Trident DB with {len(self)} triples")
    return rdflib.store.VALID_STORE
def run(self):
    """Parse command-line arguments, set up logging and the Trident KB, then train."""
    # parse command-line arguments
    self.args = self.parser.parse_args()
    if self.args.mode != 'rank':
        raise ValueError('Unknown experiment mode (%s)' % self.args.mode)
    self.callback = self.ranking_callback
    self.logger = Logger(self.args.resultsdir, self.args.resultsfilename, self.args.loglevel)
    self.logger.startExperiment(sys.argv)
    self.log = self.logger.getLog()
    self.ev = None
    # Existential-variable handling is only switched on when requested.
    if self.args.extvars:
        self.evActive = True
    self.existing_model = None
    # The Trident KB is expected in a "vlog" directory next to the input file.
    dbpath = os.path.dirname(self.args.fin) + "/vlog"
    self.db = trident.Db(dbpath)
    self.minSize = self.args.thre_min
    #self.maxSize = self.args.thre_max
    self.train()
    self.logger.stopExperiment()
# Optional command-line arguments: argv[3] = WARC archive path,
# argv[4] = annotations path.
# FIX: the original condition was `len(sys.argv) == 4`, so when five
# arguments were supplied warc_path was never assigned and the
# unconditional read_warc(warc_path) below raised NameError.
if len(sys.argv) >= 4:
    warc_path = sys.argv[3]
    #cleantexts, doc_ids = read_warc(warc_path)
if len(sys.argv) == 5:
    annotations_path = sys.argv[4]
#else:
cleantexts, doc_ids = read_warc(warc_path)  #Assuming path = './data/sample.warc.gz'
n_docs = len(cleantexts)

#sparql_domain = "localhost:1234"
sparql_path = '/home/jurbani/data/motherkb-trident'
db = trident.Db(sparql_path)

# Hyperparameters, how many results of (elasticsearch and trident) to compare (for discard and popularity)
k = 5
q = 10
r = 10
min_score = 2
min_ratio_value = 0.82
small_value = 0.5
title_multiplier = 3
label_multiplier = 2
matching_label_value = 3
non_match_value = 2
# Per-rank increment used when blending ranked candidate lists.
score_step_value = 1 / r

# Tagger
def __init__(self):
    """Open a handle to the Trident knowledge base at the configured path."""
    kb_location = TRIDENT_PATH
    self.db = trident.Db(kb_location)
def queries_and_get_best_id(set_info_and_id):
    """Pick the best Freebase candidate ID for each mention via SPARQL scoring.

    set_info_and_id is a (set_info, id) pair where set_info is an iterable of
    (word, label, set_response, elastic_scores, matching_ratio) tuples.
    For each mention the top-r candidate IDs are scored on title similarity,
    label count, equivalent web pages, official website, and NER-type match;
    the highest-scoring ID is kept. Returns (final_set, id) where final_set
    holds (word, best_id) pairs.
    """
    set_info, id = set_info_and_id
    full_output = ""
    #---------------------- Queries-----------------------------------------------------------------------------------------
    # General
    sparql_query_title = "select distinct ?obj where { \
<http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/key/wikipedia.en_title> ?obj . \
} "  # % (freebase_id)
    sparql_query_n_webpage = "select distinct ?obj where { \
<http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/ns/common.topic.topic_equivalent_webpage> ?obj . \
} "  # % (freebase_id)
    # Is social media active (Not consistent for persons) #Avg 2 or 1
    sparql_query_media_presence = "select distinct * where { \
<http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/ns/common.topic.social_media_presence> ?obj .\
} "  # % (freebase_id)
    # Not yahoo and disney , Avg 1
    # Is social media active (Not consisten for persons?)
    sparql_query_twitter = "select distinct * where { \
<http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/key/authority.twitter> ?obj .\
} "  # % (freebase_id)
    # Avg 1 or 2
    # Is social media active (Not consisten for persons?) #New york times of Flash player bijv
    sparql_query_website = "select distinct * where { \
<http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/ns/common.topic.official_website> ?obj .\
} "  # % (freebase_id)
    # ------------Location-----------------------------------------------------------------------------------------------------
    sparql_query_location = "select distinct * where {\
{ <http://rdf.freebase.com/ns%s> ?rel <http://rdf.freebase.com/ns/location.location> . }\
UNION \
{ <http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/ns/base.biblioness.bibs_location.loc_type> ?type . \
} \
} "  # % (freebase_id,freebase_id)
    # ----------Person------------------------------------------------------------------------------------------------------
    sparql_query_person = "select distinct * where { \
<http://rdf.freebase.com/ns%s> ?rel <http://rdf.freebase.com/ns/people.person> .\
} "  # % (freebase_id)
    # Scenario schrijver Tony Gilroy is a not a celebrity and not a notable name either.
    sparql_query_person_nndb = "select distinct * where { \
<http://rdf.freebase.com/ns%s> ?rel <http://rdf.freebase.com/ns/user.narphorium.people.nndb_person> .\
} "  # % (freebase_id)
    # Jeremy Renner is notable name but does not have this type celebrity
    sparql_query_person_celeb = "select distinct * where { \
<http://rdf.freebase.com/ns%s> ?rel <http://rdf.freebase.com/ns/base.popstra.celebrity> .\
} "  # % (freebase_id)
    # Tony Gilroy (and Matt damon) has these properties, politician not:
    sparql_query_entertainment = "select distinct * where {\
{ <http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/key/source.entertainmentweekly.person> ?name . }\
UNION \
{ <http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/key/source.filmstarts.personen> ?name2 . } \
} "  # % (freebase_id,freebase_id)
    # --------Organization------------------------------------------------------------------------------------------------------
    # Should be the first one if company
    sparql_query_company = "select distinct * where { \
<http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/key/authority.crunchbase.company> ?obj . \
} "  # % (freebase_id)
    #Skipped other ones
    #--------Other---------------------------------------------------------------------------------------------------------------
    # Also Flash player etc.
    sparql_query_inanimate = "select distinct * where { \
<http://rdf.freebase.com/ns%s> ?rel <http://rdf.freebase.com/ns/base.type_ontology.inanimate> .\
} "  # % (freebase_id)
    # ------------------------------------------------------------------------------------------------------------------------
    # Preferred order of type checks per NER label (first match wins).
    query_label_order = {
        "GPE": ["GPE", "ORGANIZATION"],
        "GSP": ["GPE", "ORGANIZATION"],
        "LOCATION": ["GPE", "ORGANIZATION"],
        "PERSON": ["PERSON", "GPE"],
        "ORGANIZATION": ["ORGANIZATION", "GPE"],
        "FACILITY": ["ORGANIZATION", "GPE"],
    }
    # Scoring hyperparameters (duplicated here so the worker is self-contained).
    r = 10
    small_value = 0.5
    title_multiplier = 3
    label_multiplier = 2
    matching_label_value = 3
    non_match_value = 2
    score_step_value = 1 / r
    sparql_path = '/home/jurbani/data/motherkb-trident'
    db = trident.Db(sparql_path)
    final_set = set()
    for word_info in set_info:
        word, label, set_response, elastic_scores, matching_ratio = word_info
        elastic_id_labels = list(set_response)
        total_scores = {}
        label_order = query_label_order[label]
        # NOTE(review): the inner `for i, label in ...` loop below shadows this
        # mention-level `label`; harmless here since it is re-assigned on the
        # next iteration, but worth renaming.
        for freebase_id in elastic_id_labels[:r]:
            current_score_of_id = 0
            # Do Sparql query
            # Freebase IDs arrive as e.g. "/m/xxxx"; convert to "/m.xxxx" form.
            modified_id = freebase_id[:2] + "." + freebase_id[3:]
            # General
            query = sparql_query_title % (modified_id)
            #output_query = sparql(sparql_domain, query)
            output_query = json.loads(db.sparql(query))
            results = int(output_query["stats"].get("nresults", -1))
            if results == -1:  #Incorrect ID otherwise
                total_scores[freebase_id] = 0
                continue
            #print(word,freebase_id)
            #print("results title ", results)
            if results > 0:
                #print("heb title gevonden\n")
                title = output_query["results"]["bindings"][0]["obj"]["value"]  # Assumming only 1 title on index 0
                # Strip quotes and turn '$'/'_' into spaces before comparing.
                title = title.translate({
                    ord('"'): None,
                    ord('$'): " ",
                    ord('_'): " "
                })
                lowered_title = title.lower()
                lowered_word = word.lower()
                max_ratio = SequenceMatcher(None, lowered_word, lowered_title).ratio()
                # Extra check for aliases or per word
                #splitted_title = lowered_title.split()
                #n_words = len(splitted_title)
                #if n_words == len(word):
                #    initials = ""
                #    for w in splitted_title:
                #        letter = w[0]
                #        initials += letter
                #    ratio = SequenceMatcher(None, lowered_word, initials).ratio()
                #    if ratio > max_ratio:
                #        max_ratio = ratio
                #elif n_words > 1:
                #    for w in splitted_title:
                #        ratio = SequenceMatcher(None, w, lowered_title).ratio()
                #        if ratio > max_ratio:
                #            max_ratio = ratio
                #tuple_info_ratio = (freebase_id, max_ratio)
                #sorted_tuples_title_ratio.append(tuple_info_ratio)
                #total_scores[freebase_id] = max_ratio * 30
                current_score_of_id += max_ratio * title_multiplier
            # # Add n_labels to list, where score is given for ranking on that is given after the for loop
            n_labels = len(set_response[freebase_id])
            #tuple_info = (freebase_id, n_labels)
            #sorted_tuples_n_labels.append(tuple_info)
            #total_scores[freebase_id] += min(int(n_labels / 5),20) / 10
            #total_scores[freebase_id] = n_labels
            current_score_of_id += (min(int(n_labels / 5), 20) / 20) * label_multiplier
            # Number of equivalent web pages as a (capped) popularity proxy.
            query = sparql_query_n_webpage % (modified_id)
            #output_query = sparql(sparql_domain, query)
            output_query = json.loads(db.sparql(query))
            n_results = int(output_query["stats"]["nresults"])
            #print("Results webpage ",n_results)
            #tuple_info_n_results = (freebase_id, n_results)
            #sorted_tuples_n_pages.append(tuple_info_n_results)
            current_score_of_id += min(int(n_results / 5), 20) / 20
            #query = sparql_query_media_presence % (modified_id)
            #output_query = sparql(sparql_domain, query)
            #output_query = json.loads(db.sparql(query))
            #n_results = int(output_query["stats"]["nresults"])
            #print("results media ",n_results)
            #current_score_of_id += (n_results > 0) * small_value
            #query = sparql_query_twitter % (modified_id)
            #output_query = sparql(sparql_domain, query)
            #output_query = json.loads(db.sparql(query))
            #n_results = int(output_query["stats"]["nresults"])
            #print("results twitter ", n_results)
            #current_score_of_id += (n_results > 0) * small_value
            # Small bonus when the entity has an official website.
            query = sparql_query_website % (modified_id)
            #output_query = sparql(sparql_domain, query)
            output_query = json.loads(db.sparql(query))
            n_results = int(output_query["stats"]["nresults"])
            #print("results website ", n_results)
            current_score_of_id += (n_results > 0) * small_value
            # Type check: walk the preferred label order, stop at first match.
            match = False
            for i, label in enumerate(label_order):
                addition = matching_label_value - i  #value 3 if correct, 2 if second, 1 if third or lower
                if label == "GPE":
                    query = sparql_query_location % (modified_id, modified_id)
                    #output_query = sparql(sparql_domain, query)
                    output_query = json.loads(db.sparql(query))
                    n_results = int(output_query["stats"]["nresults"])
                    #print("results location ",n_results)
                    if n_results:
                        current_score_of_id += addition
                        match = True
                elif label == "PERSON":
                    query = sparql_query_person % (modified_id)
                    #output_query = sparql(sparql_domain, query)
                    output_query = json.loads(db.sparql(query))
                    n_results = int(output_query["stats"]["nresults"])
                    #print("results person ",n_results)
                    if n_results:
                        current_score_of_id += addition
                        match = True
                    #query = sparql_query_person_nndb % (modified_id)
                    #output_query = sparql(sparql_domain, query)
                    #output_query = json.loads(db.sparql(query))
                    #n_results = int(output_query["stats"]["nresults"])
                    #print("results nndb ", n_results)
                    #if n_results:
                    #    current_score_of_id += small_value #addition
                    #    match = True
                    #query = sparql_query_person_celeb % (modified_id)
                    #output_query = sparql(sparql_domain, query)
                    #output_query = json.loads(db.sparql(query))
                    #n_results = int(output_query["stats"]["nresults"])
                    #print("results celeb ", n_results)
                    #if n_results:
                    #    current_score_of_id += small_value #addition
                    #    match = True
                    #query = sparql_query_entertainment % (modified_id, modified_id)
                    #output_query = sparql(sparql_domain, query)
                    #output_query = json.loads(db.sparql(query))
                    #n_results = int(output_query["stats"]["nresults"])
                    #print("results enter ", n_results)
                    #if n_results:
                    #    current_score_of_id += small_value #addition
                    #    match = True
                elif label == "ORGANIZATION":
                    query = sparql_query_company % (modified_id)
                    #output_query = sparql(sparql_domain, query)
                    output_query = json.loads(db.sparql(query))
                    n_results = int(output_query["stats"]["nresults"])
                    #print("results org ", n_results)
                    if n_results:
                        current_score_of_id += addition
                        match = True
                elif label == "OTHER":
                    query = sparql_query_inanimate % (modified_id)
                    #output_query = sparql(sparql_domain, query)
                    output_query = json.loads(db.sparql(query))
                    n_results = int(output_query["stats"]["nresults"])
                    #print("Other results ",n_results)
                    if n_results:
                        current_score_of_id += 1
                if match:
                    break
            total_scores[freebase_id] = current_score_of_id
        #elastic_scores.sort(key=itemgetter(1))
        #matching_ratio.sort(key=itemgetter(1))
        #for i, tuple_value in enumerate(elastic_scores):
        #    freebase_id = tuple_value[0]
        #    if freebase_id in total_scores:
        #        score = ((i + 1) * score_step_value)
        #        total_scores[freebase_id] += score
        #for i, tuple_value in enumerate(matching_ratio):
        #    freebase_id = tuple_value[0]
        #    if freebase_id in total_scores:
        #        score = ((i + 1) * score_step_value)
        #        total_scores[freebase_id] += score
        # Keep the candidate with the highest aggregate score.
        best_id_key = max(total_scores.items(), key=itemgetter(1))[0]
        final_set.add((word, best_id_key))
        #line = doc_key + '\t' + word + '\t' + best_id_key + "\n"
        #full_output += line
    return final_set, id
def db(self):
    """Lazily open and memoise the Trident KB handle.

    Creates the trident.Db from self.kb_path on first access, then returns
    the cached handle. (The original duplicated `return self._kb` inside the
    init branch; a single return after the lazy init is equivalent and clearer.)
    """
    if self._kb is None:
        self._kb = trident.Db(self.kb_path)
    return self._kb
import trident
import json

KBPATH = "assets/wikidata-20200203-truthy-uri-tridentdb"

# Retrieve first 10 entities of type (P31) city (Q515)
query = ("PREFIX wde: <http://www.wikidata.org/entity/> "
         "PREFIX wdp: <http://www.wikidata.org/prop/direct/> "
         "PREFIX wdpn: <http://www.wikidata.org/prop/direct-normalized/> "
         "select ?s where { ?s wdp:P31 wde:Q515 . } LIMIT 10")

# Open the KB and run the query; trident returns the result set as JSON text.
db = trident.Db(KBPATH)
results = db.sparql(query)
json_results = json.loads(results)

print("*** VARIABLES ***")
variables = json_results["head"]["vars"]
print(variables)

print("\n*** BINDINGS ***")
results = json_results["results"]
for binding in results["bindings"]:
    # One "var: value " pair per projected variable, space-separated.
    print("".join(f"{var}: {binding[var]['value']} " for var in variables))

print("\n*** STATISTICS ***")
def get_pop(url):
    """Print a popularity count for the hard-coded Wikidata entity Q11268.

    Opens the Trident KB, looks up the internal ID of
    <http://www.wikidata.org/entity/Q11268>, and prints db.count_o(id)
    (presumably the number of triples with that entity as object — confirm
    against the trident API).

    NOTE(review): the `url` parameter is never used; the entity URI is
    hard-coded. TODO confirm whether `url` was meant to be looked up instead.
    """
    db = trident.Db("/app/assignment/assets/wikidata-20200203-truthy-uri-tridentdb")
    id = db.lookup_id("<http://www.wikidata.org/entity/Q11268>")
    print(db.count_o(id))