Example #1
import logging
from collections import defaultdict
from operator import itemgetter

from tqdm import tqdm

# Project-level helpers used below (PretrainedModel, chunks, create_instances,
# get_confidence, format_extractions, Mock_token) are assumed to be defined in
# or imported from the surrounding module.


def run_oie(lines, batch_size=1, debug=False):
    """
    Run the OIE model and process the output.
    """
    if debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # Initialize the OIE predictor
    #model = open_information_extraction_stanovsky_2018()
    model = PretrainedModel('model_final.tar.gz',
                            'open-information-extraction')
    model = model.predictor()  # type: ignore

    # process sentences
    logging.info("Processing sentences")
    oie_lines = []
    for chunk in tqdm(chunks(lines, batch_size)):
        oie_inputs = []
        for sent in chunk:
            oie_inputs.extend(create_instances(model, sent))
        if not oie_inputs:
            # No predicates found in this chunk of sentences
            continue

        # Run OIE on the batched instances
        sent_preds = model.predict_batch_json(oie_inputs)

        # Group predictions by their source sentence
        predictions_by_sent = defaultdict(list)
        for outputs in sent_preds:
            sent_tokens = outputs["words"]
            tags = outputs["tags"]
            sent_str = " ".join(sent_tokens)
            assert len(sent_tokens) == len(tags)
            predictions_by_sent[sent_str].append(
                (outputs["tags"], outputs["class_probabilities"]))

        # Create extractions by sentence
        for sent_str, predictions_for_sent in predictions_by_sent.items():
            raw_tags = list(map(itemgetter(0), predictions_for_sent))
            class_probs = list(map(itemgetter(1), predictions_for_sent))

            # Compute confidence per extraction
            confs = [get_confidence(model, tag_per_token, class_prob)
                     for tag_per_token, class_prob in zip(raw_tags, class_probs)]

            extractions, tags = format_extractions(
                [Mock_token(tok) for tok in sent_str.split(" ")], raw_tags)

            oie_lines.extend([extraction + f"\t{conf}"
                              for extraction, conf in zip(extractions, confs)])
    logging.info("DONE")
    return oie_lines
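
The function above relies on a chunks helper to batch sentences before calling predict_batch_json, but that helper is not shown. Below is a minimal sketch of what it could look like, assuming it simply yields consecutive fixed-size slices of the input list; this implementation is an assumption, not taken from the original source. With batch_size=1 it reduces to one sentence per prediction call; larger values trade memory for fewer predictor invocations.

def chunks(lst, n):
    # Assumed behavior: yield successive n-sized slices of lst.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]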
Example #2
            sent_str = " ".join(sent_tokens)
            assert len(sent_tokens) == len(tags)
            predictions_by_sent[sent_str].append(
                (outputs["tags"], outputs["class_probabilities"]))

        # Create extractions by sentence
        for sent_str, predictions_for_sent in predictions_by_sent.items():
            raw_tags = list(map(itemgetter(0), predictions_for_sent))
            class_probs = list(map(itemgetter(1), predictions_for_sent))

            # Compute confidence per extraction
            confs = [
                get_confidence(model, tag_per_token, class_prob)
                for tag_per_token, class_prob in zip(raw_tags, class_probs)
            ]

            extractions, tags = format_extractions(
                [Mock_token(tok) for tok in sent_str.split(" ")], raw_tags)
            oie_lines.extend([
                extraction + f"\t{conf}"
                for extraction, conf in zip(extractions, confs)
            ])
    t2 = time.perf_counter()
    print("E2E time: ", t2 - t1)

    # Write to file
    logging.info(f"Writing output to {out_fn}")
    with open(out_fn, "w", encoding="utf8") as fout:
        fout.write("\n".join(oie_lines))
    logging.info("DONE")
Example #3
            # Group predictions by their source sentence
            predictions_by_sent = defaultdict(list)
            for outputs in sent_preds:
                sent_tokens = outputs["words"]
                tags = outputs["tags"]
                sent_str = " ".join(sent_tokens)
                assert len(sent_tokens) == len(tags)
                predictions_by_sent[sent_str].append(
                    (outputs["tags"], outputs["class_probabilities"]))

            # Create extractions by sentence
            for sent_str, predictions_for_sent in predictions_by_sent.items():
                raw_tags = list(map(itemgetter(0), predictions_for_sent))
                class_probs = list(map(itemgetter(1), predictions_for_sent))

                # Compute confidence per extraction
                confs = [get_confidence(model, tag_per_token, class_prob)
                         for tag_per_token, class_prob in zip(raw_tags, class_probs)]

                extractions, tags = format_extractions(
                    [Mock_token(tok) for tok in sent_str.split(" ")], raw_tags)
                oie_lines.extend([extraction + f"\t{conf}"
                                  for extraction, conf in zip(extractions, confs)])
        loops += 1
        print("CURRENT LOOP: ", loops)
        t2 = time.time()
        print("E2E time: ", t2 - t1)

    # Write to file
    logging.info(f"Writing output to {out_fn}")
    with open(out_fn, "w", encoding="utf8") as fout:
        fout.write("\n".join(oie_lines))
    logging.info("DONE")