Esempio n. 1
0
def gen_report(infile_path, outfile_path, linker_name):
    """
    Generate a report that can be used by dexter.

    Every line of the input file is parsed as a JSON document. When
    ``linker_name`` is ``"spotlight"``, the document's ``text`` is passed to
    ``get_entities`` and each non-empty row produced by ``format_data`` is
    written to the output file as one tab-separated line, prefixed with the
    document's ``docId``. For any other linker name the lines are still
    parsed but nothing is written.

    Both files are closed when the function returns or raises.

    :param infile_path: str: input gold standard
    :param outfile_path: str: output of tsv predictions
    :param linker_name: str: name of entity linker
    """
    # Context managers guarantee both handles are released (and the output
    # flushed) even if parsing or entity retrieval raises midway.
    with codecs.open(infile_path, "r", encoding="utf8") as infile, \
            codecs.open(outfile_path, "w", encoding="utf8") as outfile:
        logger.info("Starting Entity Linking benchmark")
        for doc in infile:
            doc_data = json.loads(doc)
            if linker_name != "spotlight":
                continue
            doc_id = doc_data["docId"]
            entities = get_entities(doc_data["text"])
            n_entities = len(entities["Resources"]) if "Resources" in entities else 0
            logger.info("Retrieved %d entities for document %s",
                        n_entities, doc_id)
            for data_row in format_data(entities):
                # Empty rows carry no prediction and are skipped.
                if not data_row:
                    continue
                data_row.insert(0, doc_id)
                outfile.write(u"\t".join(data_row) + u"\n")
Esempio n. 2
0
def main(args):
    """
    Run the entity linking benchmark described by ``args``.

    Reads the gold standard file line by line, and for the "spotlight"
    linker writes one tab-separated line per non-empty row returned by
    ``format_data``, prefixed with the document's ``docId``.

    :param args: mapping with the string-valued keys ``<entity-linker>``,
        ``<base-endpoint>``, ``<gs-file-path>`` and ``<output-file>``
    """
    ent_linker_name = args["<entity-linker>"].lower()
    # NOTE(review): lower-casing the endpoint also lower-cases any path or
    # query component of the URL -- confirm this is intended.
    base_endpoint = args["<base-endpoint>"].lower()
    infile = None
    outfile = None
    if ent_linker_name not in SUPPORTED_LINKERS:
        die(ent_linker_name + " is not a supported entity linking system. Exiting.")

    try:
        infile = codecs.open(args["<gs-file-path>"], "r", encoding="utf8")
        outfile = codecs.open(args["<output-file>"], "w", encoding="utf8")
    except Exception as ex:
        logger.exception("An exception occured, %s", ex)
        # If only the output file failed to open, the input handle is
        # already live and must be released before bailing out.
        if infile:
            infile.close()
        die("Could not read from gold standard file or not write to output file")
    # Guard kept in case die() returns instead of terminating -- presumably
    # it exits the process; verify against its definition.
    if infile and outfile:
        # The context managers close both files even when processing a
        # document raises, so the output is flushed and no handle leaks.
        with infile, outfile:
            logger.info("Starting Entity Linking benchmark")
            for doc in infile:
                doc_data = parse_gs_line(doc)
                if ent_linker_name != "spotlight":
                    continue
                doc_id = doc_data["docId"]
                doc_text = doc_data["text"]
                logger.info("Processing entities for document %s . First chars are %s", doc_id, doc_text[:10])
                entities = get_entities(base_endpoint, doc_text)
                logger.info("Retrieved %d entitines", len(entities))
                for data_row in format_data(entities):
                    if not data_row:
                        continue
                    data_row.insert(0, doc_id)
                    # After the docId is prepended, the logged field is at
                    # index 5 of the row.
                    logger.info("Retrieved entity : %s", data_row[5])
                    outfile.write(u"\t".join(data_row) + u"\n")
Esempio n. 3
0
def main(args):
    """
    Run the entity linking benchmark described by ``args``.

    Each line of the gold standard file is parsed with ``parse_gs_line``.
    For the "spotlight" linker the document text is sent to
    ``get_entities`` and every non-empty row from ``format_data`` is written
    to the output file as a tab-separated line prefixed with the ``docId``.

    :param args: mapping with the string-valued keys ``<entity-linker>``,
        ``<base-endpoint>``, ``<gs-file-path>`` and ``<output-file>``
    """
    ent_linker_name = args["<entity-linker>"].lower()
    # NOTE(review): lower-casing the endpoint also lower-cases any path or
    # query component of the URL -- confirm this is intended.
    base_endpoint = args["<base-endpoint>"].lower()
    infile = None
    outfile = None
    if ent_linker_name not in SUPPORTED_LINKERS:
        die(ent_linker_name +
            " is not a supported entity linking system. Exiting.")

    try:
        infile = codecs.open(args["<gs-file-path>"], "r", encoding="utf8")
        outfile = codecs.open(args["<output-file>"], "w", encoding="utf8")
    except Exception as ex:
        logger.exception("An exception occured, %s", ex)
        # The input may have opened successfully before the output failed;
        # release it so the handle is not leaked.
        if infile:
            infile.close()
        die("Could not read from gold standard file or not write to output file"
            )
    # Guard kept in case die() returns instead of terminating -- presumably
    # it exits the process; verify against its definition.
    if infile and outfile:
        # Closing via the context managers covers the error path too: an
        # exception while processing a document no longer leaks the handles.
        with infile, outfile:
            logger.info("Starting Entity Linking benchmark")
            for doc in infile:
                doc_data = parse_gs_line(doc)
                if ent_linker_name != "spotlight":
                    continue
                doc_id = doc_data["docId"]
                doc_text = doc_data["text"]
                logger.info(
                    "Processing entities for document %s . First chars are %s",
                    doc_id, doc_text[:10])
                entities = get_entities(base_endpoint, doc_text)
                logger.info("Retrieved %d entitines", len(entities))
                for data_row in format_data(entities):
                    if not data_row:
                        continue
                    data_row.insert(0, doc_id)
                    # After the docId is prepended, the logged field is at
                    # index 5 of the row.
                    logger.info("Retrieved entity : %s", data_row[5])
                    outfile.write(u"\t".join(data_row) + u"\n")