def gen_train_set(gt, query_file, train_set):
    """Builds the LTR training set for entity linking and writes it as JSON.

    Ground-truth annotations are added as positive instances (target 1);
    every other candidate instance of each query in the query file is added
    as a negative instance (target 0).

    :param gt: iterable of (qid, query, en_id, mention) tuples
    :param query_file: path to a JSON file mapping query IDs to queries
    :param train_set: destination handed to Instances.to_json
    """
    entity = Entity()
    elastic = ElasticCache(ELASTIC_INDICES[0])
    fcache = FeatureCache()
    inss = Instances()
    positive_annots = set()

    # Adds groundtruth instances (positive instances)
    PLOGGER.info("Adding groundtruth instances (positive instances) ....")
    for item in sorted(gt):  # qid, query, en_id, mention
        ltr = LTR(Query(item[1], item[0]), entity, elastic, fcache)
        # NOTE(review): the double-underscore call is subject to private name
        # mangling, so it presumably only resolves while this function lives
        # inside the LTR class body -- confirm before moving it.
        ins = ltr.__gen_raw_ins(item[2], item[3])
        ins.features = ltr.get_features(ins)
        ins.target = 1
        inss.add_instance(ins)
        # Remembered so the same (qid, en_id) pair is not re-added as negative.
        positive_annots.add((item[0], item[2]))

    # Adds all other instances
    PLOGGER.info("Adding all other instances (negative instances) ...")
    # Context manager guarantees the query file handle is closed.
    with open(query_file, "r") as q_file:
        queries = json.load(q_file)
    for qid, q in sorted(queries.items()):
        PLOGGER.info("Query [" + qid + "]")
        ltr = LTR(Query(q, qid), entity, elastic, fcache)
        q_inss = ltr.get_candidate_inss()
        for ins in q_inss.get_all():
            if (qid, ins.get_property("en_id")) in positive_annots:
                continue
            ins.target = 0
            inss.add_instance(ins)
    inss.to_json(train_set)
def batch_linking(self):
    """Scores queries in a batch and outputs results.

    The work done depends on the configured "step":

    - "linking": links every query of the query file and dumps the results
      as JSON to the output file.
    - "ranking": ranks entities for every query and writes all ranked
      instances to the output file.
    - "disambiguation": loads instances from the configured "test_set",
      disambiguates them per query ID and writes them with to_elq_eval.

    For any other step nothing is written; the output path is logged in
    every case.
    """
    results = {}
    step = self.__config["step"]

    if step == "linking":
        with open(self.__query_file) as q_file:
            queries = json.load(q_file)
        for qid in sorted(queries):
            results[qid] = self.link(queries[qid], qid)
        # Closing the handle explicitly guarantees the dump is flushed.
        with open(self.__output_file, "w") as out_file:
            json.dump(results, out_file, indent=4, sort_keys=True)

    # only ranking step
    elif step == "ranking":
        with open(self.__query_file) as q_file:
            queries = json.load(q_file)
        for qid in sorted(queries):
            linker = self.__get_linker(Query(queries[qid], qid))
            results[qid] = linker.rank_ens()
        # Flatten the per-query instances without repeated list concatenation.
        all_inss = [ins
                    for q_inss in results.values()
                    for ins in q_inss.get_all()]
        ranked_inss = Instances(all_inss)
        ranked_inss.to_json(self.__output_file)

    # only disambiguation step
    elif step == "disambiguation":
        inss = Instances.from_json(self.__config["test_set"])
        inss_by_query = inss.group_by_property("qid")
        for qid, q_inss in sorted(inss_by_query.items()):
            # The linker is built with an empty query; the instances to
            # disambiguate are passed in directly.
            linker = self.__get_linker("")
            results[qid] = linker.disambiguate(Instances(q_inss))
        to_elq_eval(results, self.__output_file)

    PLOGGER.info("Output file: " + self.__output_file)
def tem(self):
    """Returns 1 if the title of the entity equals the mention, else 0.

    The title is taken from the first value of the TITLE field of the entity
    document (empty string when the field is missing) and passed through
    Query before the comparison.
    """
    # __load_en is expected to populate self.__en_doc -- confirm against its
    # definition.
    self.__load_en()
    title = Query(self.__en_doc.get(TITLE, [""])[0]).query
    return 1 if self.__mention == title else 0
def tcm(self):
    """Returns 1 if the title of the entity contains the mention, else 0.

    The title is taken from the first value of the TITLE field of the entity
    document (empty string when the field is missing) and passed through
    Query before the containment check.
    """
    # __load_en is expected to populate self.__en_doc -- confirm against its
    # definition.
    self.__load_en()
    title = Query(self.__en_doc.get(TITLE, [""])[0]).query
    return 1 if self.__mention in title else 0
def mct(self):
    """Returns 1 if the mention contains the title of the entity, else 0.

    The title is taken from the first value of the TITLE field of the entity
    document (empty string when the field is missing) and passed through
    Query before the containment check.
    """
    # __load_en is expected to populate self.__en_doc -- confirm against its
    # definition.
    self.__load_en()
    title = Query(self.__en_doc.get(TITLE, [""])[0]).query
    return 1 if title in self.__mention else 0
def load_yerd(gt_file):
    """Reads the Y-ERD collection and returns its annotations as a set.

    The file is read as tab-separated values without quoting; its first row
    supplies the column names, of which "qid", "query", "entity" and
    "mention" are used. Rows whose "entity" column is empty are skipped.
    Query and mention are passed through Query before being stored.

    :param gt_file: Path to the Y-ERD collection
    :return: set of tuples {(qid, query, en_id, mention), ...}
    """
    PLOGGER.info("Loading the ground truth ...")
    annotations = set()
    with open(gt_file, "r") as tsv:
        rows = csv.DictReader(tsv, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in rows:
            if row["entity"] != "":
                query = Query(row["query"]).query
                mention = Query(row["mention"]).query
                annotations.add((row["qid"], query, row["entity"], mention))
    return annotations
def link(self, query):
    """Performs entity linking for the query.

    :param query: query string
    :return: dict with the raw query ("query"), the processed query
        ("processed_query") and the linker output ("results")
    """
    q = Query(query)
    annotations = self.__get_linker(q).link()
    return {
        "query": q.raw_query,
        "processed_query": q.query,
        "results": annotations,
    }
def link(self, query, qid=""):
    """Performs entity linking (or only ranking) for the query.

    :param query: query string
    :param qid: query ID, used for logging and passed on to Query
    :return: the output of the linker's rank_ens() when the configured step
        is "ranking"; otherwise a dict with the raw query ("query"), the
        processed query ("processed_query") and the linker output
        ("results")
    """
    PLOGGER.info("Linking query " + qid + " [" + query + "] ")
    q = Query(query, qid)
    linker = self.__get_linker(q)

    # Ranking-only mode returns the ranked entities as they are.
    if self.__config["step"] == "ranking":
        return linker.rank_ens()

    annotations = linker.link()
    return {
        "query": q.raw_query,
        "processed_query": q.query,
        "results": annotations,
    }
def main(args):
    """Links the query given as first argument with Cmns and prints the result.

    :param args: sequence whose first element is the query
    """
    en = Entity()
    q = Query(args[0])
    linker = Cmns(q, en, cmns_th=0.1)
    annotations = linker.link()
    print(annotations)