def gen_report(infile_path, outfile_path, linker_name):
    """
    Generate a tab-separated report that can be used by dexter.

    The input is read line by line; every non-blank line is parsed as a JSON
    document from which the "docId" and "text" keys are read. When
    ``linker_name`` is "spotlight", the text is passed to ``get_entities`` and
    each non-empty row produced by ``format_data`` is written to the output
    file as one tab-separated line prefixed with the document id. For any
    other linker name the documents are still parsed but nothing is written.

    :param infile_path: str: input gold standard
    :param outfile_path: str: output of tsv predictions
    :param linker_name: str: name of entity linker
    """
    # Context managers guarantee both files are closed (and the output is
    # flushed) even when parsing or entity retrieval raises mid-run.
    with codecs.open(infile_path, "r", encoding="utf8") as infile:
        with codecs.open(outfile_path, "w", encoding="utf8") as outfile:
            logger.info("Starting Entity Linking benchmark")
            for doc in infile:
                # A blank line is not a JSON document; skip it instead of
                # letting json.loads abort the whole report.
                if not doc.strip():
                    continue
                doc_data = json.loads(doc)
                if linker_name != "spotlight":
                    continue

                doc_id = doc_data["docId"]
                entities = get_entities(doc_data["text"])
                n_entities = len(entities["Resources"]) if "Resources" in entities else 0
                logger.info("Retrieved %d entities for document %s", n_entities, doc_id)

                for data_row in format_data(entities):
                    if not data_row:
                        continue
                    # The document id becomes the first column of the row.
                    data_row.insert(0, doc_id)
                    outfile.write(u"\t".join(data_row) + u"\n")
# NOTE(review): this file contains a second, later definition of main();
# Python binds the later one, so confirm which copy is meant to stay.
def main(args):
    """
    Run the entity linking benchmark described by ``args``.

    Every line of the gold standard file is parsed with ``parse_gs_line``.
    For the "spotlight" linker the document text is sent to ``get_entities``
    and each non-empty row returned by ``format_data`` is written to the
    output file as a tab-separated line prefixed with the document id.

    :param args: dict: argument mapping; the keys read are
        "<entity-linker>", "<base-endpoint>", "<gs-file-path>" and
        "<output-file>"
    """
    ent_linker_name = args["<entity-linker>"].lower()
    base_endpoint = args["<base-endpoint>"].lower()

    if ent_linker_name not in SUPPORTED_LINKERS:
        # NOTE(review): die() presumably terminates the process - confirm.
        die(ent_linker_name + " is not a supported entity linking system. Exiting.")

    infile = None
    outfile = None
    # The outer try/finally closes whatever was opened, including the case
    # where the input opened but the output did not, or the loop raises.
    try:
        try:
            infile = codecs.open(args["<gs-file-path>"], "r", encoding="utf8")
            outfile = codecs.open(args["<output-file>"], "w", encoding="utf8")
        except Exception as ex:
            logger.exception("An exception occured, %s", ex)
            die("Could not read from gold standard file or not write to output file")

        # Only reached with a missing handle if die() returned; do nothing then.
        if infile is not None and outfile is not None:
            logger.info("Starting Entity Linking benchmark")
            for doc in infile:
                doc_data = parse_gs_line(doc)
                if ent_linker_name != "spotlight":
                    continue

                doc_id = doc_data["docId"]
                doc_text = doc_data["text"]
                logger.info(
                    "Processing entities for document %s . First chars are %s",
                    doc_id, doc_text[:10])
                entities = get_entities(base_endpoint, doc_text)
                logger.info("Retrieved %d entitines", len(entities))

                for data_row in format_data(entities):
                    if not data_row:
                        continue
                    # The document id becomes the first column, so index 5
                    # below refers to the row after this insertion.
                    data_row.insert(0, doc_id)
                    logger.info("Retrieved entity : %s", data_row[5])
                    outfile.write(u"\t".join(data_row) + u"\n")
    finally:
        for handle in (infile, outfile):
            if handle is not None:
                handle.close()
# NOTE(review): main() is also defined earlier in this file; this later
# definition is the one Python binds. Confirm the duplicate is intentional.
def main(args):
    """
    Run the entity linking benchmark described by ``args``.

    Every line of the gold standard file is parsed with ``parse_gs_line``.
    For the "spotlight" linker the document text is sent to ``get_entities``
    and each non-empty row returned by ``format_data`` is written to the
    output file as a tab-separated line prefixed with the document id.

    :param args: dict: argument mapping; the keys read are
        "<entity-linker>", "<base-endpoint>", "<gs-file-path>" and
        "<output-file>"
    """
    ent_linker_name = args["<entity-linker>"].lower()
    base_endpoint = args["<base-endpoint>"].lower()

    if ent_linker_name not in SUPPORTED_LINKERS:
        # NOTE(review): die() presumably terminates the process - confirm.
        die(ent_linker_name + " is not a supported entity linking system. Exiting.")

    infile = None
    outfile = None
    # The outer try/finally closes whatever was opened, including the case
    # where the input opened but the output did not, or the loop raises.
    try:
        try:
            infile = codecs.open(args["<gs-file-path>"], "r", encoding="utf8")
            outfile = codecs.open(args["<output-file>"], "w", encoding="utf8")
        except Exception as ex:
            logger.exception("An exception occured, %s", ex)
            die("Could not read from gold standard file or not write to output file")

        # Only reached with a missing handle if die() returned; do nothing then.
        if infile is not None and outfile is not None:
            logger.info("Starting Entity Linking benchmark")
            for doc in infile:
                doc_data = parse_gs_line(doc)
                if ent_linker_name != "spotlight":
                    continue

                doc_id = doc_data["docId"]
                doc_text = doc_data["text"]
                logger.info(
                    "Processing entities for document %s . First chars are %s",
                    doc_id, doc_text[:10])
                entities = get_entities(base_endpoint, doc_text)
                logger.info("Retrieved %d entitines", len(entities))

                for data_row in format_data(entities):
                    if not data_row:
                        continue
                    # The document id becomes the first column, so index 5
                    # below refers to the row after this insertion.
                    data_row.insert(0, doc_id)
                    logger.info("Retrieved entity : %s", data_row[5])
                    outfile.write(u"\t".join(data_row) + u"\n")
    finally:
        for handle in (infile, outfile):
            if handle is not None:
                handle.close()