Code example #1
def main(
    wd_json,
    wp_xml,
    output_dir,
    model,
    max_per_alias=10,
    min_freq=20,
    min_pair=5,
    entity_vector_length=64,
    loc_prior_prob=None,
    loc_entity_defs=None,
    loc_entity_alias=None,
    loc_entity_desc=None,
    descr_from_wp=False,
    limit_prior=None,
    limit_train=None,
    limit_wd=None,
    lang="en",
):
    entity_defs_path = loc_entity_defs if loc_entity_defs else output_dir / ENTITY_DEFS_PATH
    entity_alias_path = loc_entity_alias if loc_entity_alias else output_dir / ENTITY_ALIAS_PATH
    entity_descr_path = loc_entity_desc if loc_entity_desc else output_dir / ENTITY_DESCR_PATH
    entity_freq_path = output_dir / ENTITY_FREQ_PATH
    prior_prob_path = loc_prior_prob if loc_prior_prob else output_dir / PRIOR_PROB_PATH
    training_entities_path = output_dir / TRAINING_DATA_FILE
    kb_path = output_dir / KB_FILE

    logger.info("Creating KB with Wikipedia and WikiData")

    # STEP 0: set up IO
    if not output_dir.exists():
        output_dir.mkdir(parents=True)

    # STEP 1: Load the NLP object
    logger.info("STEP 1: Loading NLP model {}".format(model))
    nlp = spacy.load(model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages.")

    # STEP 2: create prior probabilities from WP
    if not prior_prob_path.exists():
        # It takes about 2h to process 1000M lines of Wikipedia XML dump
        logger.info("STEP 2: Writing prior probabilities to {}".format(
            prior_prob_path))
        if limit_prior is not None:
            logger.warning(
                "Warning: reading only {} lines of Wikipedia dump".format(
                    limit_prior))
        wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit_prior)
    else:
        logger.info("STEP 2: Reading prior probabilities from {}".format(
            prior_prob_path))

    # STEP 3: calculate entity frequencies
    if not entity_freq_path.exists():
        logger.info(
            "STEP 3: Calculating and writing entity frequencies to {}".format(
                entity_freq_path))
        io.write_entity_to_count(prior_prob_path, entity_freq_path)
    else:
        logger.info("STEP 3: Reading entity frequencies from {}".format(
            entity_freq_path))

    # STEP 4: reading definitions and (possibly) descriptions from WikiData or from file
    if (not entity_defs_path.exists()) or (not descr_from_wp
                                           and not entity_descr_path.exists()):
        # It takes about 10h to process 55M lines of Wikidata JSON dump
        logger.info(
            "STEP 4: Parsing and writing Wikidata entity definitions to {}".
            format(entity_defs_path))
        if limit_wd is not None:
            logger.warning(
                "Warning: reading only {} lines of Wikidata dump".format(
                    limit_wd))
        title_to_id, id_to_descr, id_to_alias = wd.read_wikidata_entities_json(
            wd_json,
            limit_wd,
            to_print=False,
            lang=lang,
            parse_descr=(not descr_from_wp),
        )
        io.write_title_to_id(entity_defs_path, title_to_id)

        logger.info("STEP 4b: Writing Wikidata entity aliases to {}".format(
            entity_alias_path))
        io.write_id_to_alias(entity_alias_path, id_to_alias)

        if not descr_from_wp:
            logger.info(
                "STEP 4c: Writing Wikidata entity descriptions to {}".format(
                    entity_descr_path))
            io.write_id_to_descr(entity_descr_path, id_to_descr)
    else:
        logger.info("STEP 4: Reading entity definitions from {}".format(
            entity_defs_path))
        logger.info("STEP 4b: Reading entity aliases from {}".format(
            entity_alias_path))
        if not descr_from_wp:
            logger.info("STEP 4c: Reading entity descriptions from {}".format(
                entity_descr_path))

    # STEP 5: Getting gold entities from Wikipedia
    if (not training_entities_path.exists()) or (
            descr_from_wp and not entity_descr_path.exists()):
        logger.info(
            "STEP 5: Parsing and writing Wikipedia gold entities to {}".format(
                training_entities_path))
        if limit_train is not None:
            logger.warning(
                "Warning: reading only {} lines of Wikipedia dump".format(
                    limit_train))
        wp.create_training_and_desc(wp_xml, entity_defs_path,
                                    entity_descr_path, training_entities_path,
                                    descr_from_wp, limit_train)
        if descr_from_wp:
            logger.info(
                "STEP 5b: Parsing and writing Wikipedia descriptions to {}".
                format(entity_descr_path))
    else:
        logger.info("STEP 5: Reading gold entities from {}".format(
            training_entities_path))
        if descr_from_wp:
            logger.info("STEP 5b: Reading entity descriptions from {}".format(
                entity_descr_path))

    # STEP 6: creating the actual KB
    # It takes ca. 30 minutes to pretrain the entity embeddings
    if not kb_path.exists():
        logger.info("STEP 6: Creating the KB at {}".format(kb_path))
        kb = kb_creator.create_kb(
            nlp=nlp,
            max_entities_per_alias=max_per_alias,
            min_entity_freq=min_freq,
            min_occ=min_pair,
            entity_def_path=entity_defs_path,
            entity_descr_path=entity_descr_path,
            entity_alias_path=entity_alias_path,
            entity_freq_path=entity_freq_path,
            prior_prob_path=prior_prob_path,
            entity_vector_length=entity_vector_length,
        )
        kb.dump(kb_path)
        logger.info("kb entities: {}".format(kb.get_size_entities()))
        logger.info("kb aliases: {}".format(kb.get_size_aliases()))
        nlp.to_disk(output_dir / KB_MODEL_DIR)
    else:
        logger.info("STEP 6: KB already exists at {}".format(kb_path))

    logger.info("Done!")
Code example #2
def run_pipeline():
    # set the appropriate booleans to define which parts of the pipeline should be (re)run
    print("START", datetime.datetime.now())
    print()
    nlp_1 = spacy.load('en_core_web_lg')
    nlp_2 = None
    kb_2 = None

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False

    # read KB back in from file
    to_read_kb = True
    to_test_kb = False

    # create training dataset
    create_wp_training = False

    # train the EL pipe
    train_pipe = True
    measure_performance = True

    # test the EL pipe on a simple example
    to_test_pipeline = True

    # write the NLP object, read back in and test again
    to_write_nlp = True
    to_read_nlp = True
    test_from_file = False

    # STEP 1 : create prior probabilities from WP (run only once)
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        wp.read_wikipedia_prior_probs(wikipedia_input=ENWIKI_DUMP,
                                      prior_prob_output=PRIOR_PROB)
        print()

    # STEP 2 : deduce entity frequencies from WP (run only once)
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        wp.write_entity_counts(prior_prob_input=PRIOR_PROB,
                               count_output=ENTITY_COUNTS,
                               to_print=False)
        print()

    # STEP 3 : create KB and write to file (run only once)
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        kb_1 = kb_creator.create_kb(nlp_1,
                                    max_entities_per_alias=MAX_CANDIDATES,
                                    min_entity_freq=MIN_ENTITY_FREQ,
                                    min_occ=MIN_PAIR_OCC,
                                    entity_def_output=ENTITY_DEFS,
                                    entity_descr_output=ENTITY_DESCR,
                                    count_input=ENTITY_COUNTS,
                                    prior_prob_input=PRIOR_PROB,
                                    wikidata_input=WIKIDATA_JSON)
        print("kb entities:", kb_1.get_size_entities())
        print("kb aliases:", kb_1.get_size_aliases())
        print()

        print("STEP 3b: write KB and NLP", datetime.datetime.now())
        kb_1.dump(KB_FILE)
        nlp_1.to_disk(NLP_1_DIR)
        print()

    # STEP 4 : read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        nlp_2 = spacy.load(NLP_1_DIR)
        kb_2 = KnowledgeBase(vocab=nlp_2.vocab,
                             entity_vector_length=DESC_WIDTH)
        kb_2.load_bulk(KB_FILE)
        print("kb entities:", kb_2.get_size_entities())
        print("kb aliases:", kb_2.get_size_aliases())
        print()

        # test KB
        if to_test_kb:
            check_kb(kb_2)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(wikipedia_input=ENWIKI_DUMP,
                                             entity_def_input=ENTITY_DEFS,
                                             training_output=TRAINING_DIR)

    # STEP 6: create and train the entity linking pipe
    if train_pipe:
        print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
        type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)}
        print(" -analysing", len(type_to_int), "different entity types")
        el_pipe = nlp_2.create_pipe(name='entity_linker',
                                    config={
                                        "context_width": CONTEXT_WIDTH,
                                        "pretrained_vectors":
                                        nlp_2.vocab.vectors.name,
                                        "type_to_int": type_to_int
                                    })
        el_pipe.set_kb(kb_2)
        nlp_2.add_pipe(el_pipe, last=True)

        other_pipes = [
            pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"
        ]
        with nlp_2.disable_pipes(*other_pipes):  # only train Entity Linking
            optimizer = nlp_2.begin_training()
            optimizer.learn_rate = LEARN_RATE
            optimizer.L2 = L2

        # define the size (nr of entities) of training and dev set
        train_limit = 5000
        dev_limit = 5000

        train_data = training_set_creator.read_training(
            nlp=nlp_2, training_dir=TRAINING_DIR, dev=False, limit=train_limit)

        print("Training on", len(train_data), "articles")
        print()

        dev_data = training_set_creator.read_training(
            nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit)

        print("Dev testing on", len(dev_data), "articles")
        print()

        if not train_data:
            print("Did not find any training data")
        else:
            for itn in range(EPOCHS):
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data,
                                    size=compounding(4.0, 128.0, 1.001))
                batchnr = 0

                with nlp_2.disable_pipes(*other_pipes):
                    for batch in batches:
                        try:
                            docs, golds = zip(*batch)
                            nlp_2.update(
                                docs,
                                golds,
                                sgd=optimizer,
                                drop=DROPOUT,
                                losses=losses,
                            )
                            batchnr += 1
                        except Exception as e:
                            print("Error updating batch:", e)

                if batchnr > 0:
                    el_pipe.cfg["context_weight"] = 1
                    el_pipe.cfg["prior_weight"] = 1
                    dev_acc_context, dev_acc_context_dict = _measure_accuracy(
                        dev_data, el_pipe)
                    losses['entity_linker'] = losses['entity_linker'] / batchnr
                    print("Epoch, train loss", itn,
                          round(losses['entity_linker'], 2), " / dev acc avg",
                          round(dev_acc_context, 3))

        # STEP 7: measure the performance of our trained pipe on an independent dev set
        if dev_data and measure_performance:
            print()
            print("STEP 7: performance measurement of Entity Linking pipe",
                  datetime.datetime.now())
            print()

            counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(
                dev_data, kb_2)
            print("dev counts:", sorted(counts.items(), key=lambda x: x[0]))
            print("dev acc oracle:", round(acc_o, 3),
                  [(x, round(y, 3)) for x, y in acc_o_label.items()])
            print("dev acc random:", round(acc_r, 3),
                  [(x, round(y, 3)) for x, y in acc_r_label.items()])
            print("dev acc prior:", round(acc_p, 3),
                  [(x, round(y, 3)) for x, y in acc_p_label.items()])

            # using only context
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 0
            dev_acc_context, dev_acc_context_dict = _measure_accuracy(
                dev_data, el_pipe)
            print("dev acc context avg:", round(dev_acc_context, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()])

            # measuring combined accuracy (prior + context)
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 1
            dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(
                dev_data, el_pipe, error_analysis=False)
            print("dev acc combo avg:", round(dev_acc_combo, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])

        # STEP 8: apply the EL pipe on a toy example
        if to_test_pipeline:
            print()
            print("STEP 8: applying Entity Linking to toy example",
                  datetime.datetime.now())
            print()
            run_el_toy_example(nlp=nlp_2)

        # STEP 9: write the NLP pipeline (including entity linker) to file
        if to_write_nlp:
            print()
            print("STEP 9: testing NLP IO", datetime.datetime.now())
            print()
            print("writing to", NLP_2_DIR)
            nlp_2.to_disk(NLP_2_DIR)
            print()

    # verify that the IO has gone correctly
    if to_read_nlp:
        print("reading from", NLP_2_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)

        print("running toy example with NLP 3")
        run_el_toy_example(nlp=nlp_3)

    # testing performance with an NLP model from file
    if test_from_file:
        nlp_2 = spacy.load(NLP_1_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)
        el_pipe = nlp_3.get_pipe("entity_linker")

        dev_limit = 5000
        dev_data = training_set_creator.read_training(
            nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit)

        print("Dev testing from file on", len(dev_data), "articles")
        print()

        dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(
            dev_data, el_pipe=el_pipe, error_analysis=False)
        print("dev acc combo avg:", round(dev_acc_combo, 3),
              [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])

    print()
    print("STOP", datetime.datetime.now())
Code example #3
def main(
    wd_json,
    wp_xml,
    output_dir,
    model,
    max_per_alias=10,
    min_freq=20,
    min_pair=5,
    entity_vector_length=64,
    loc_prior_prob=None,
    loc_entity_defs=None,
    loc_entity_desc=None,
    descriptions_from_wikipedia=False,
    limit=None,
    lang="en",
):

    entity_defs_path = loc_entity_defs if loc_entity_defs else output_dir / ENTITY_DEFS_PATH
    entity_descr_path = loc_entity_desc if loc_entity_desc else output_dir / ENTITY_DESCR_PATH
    entity_freq_path = output_dir / ENTITY_FREQ_PATH
    prior_prob_path = loc_prior_prob if loc_prior_prob else output_dir / PRIOR_PROB_PATH
    training_entities_path = output_dir / TRAINING_DATA_FILE
    kb_path = output_dir / KB_FILE

    logger.info("Creating KB with Wikipedia and WikiData")

    if limit is not None:
        logger.warning(
            "Warning: reading only {} lines of Wikipedia/Wikidata dumps.".
            format(limit))

    # STEP 0: set up IO
    if not output_dir.exists():
        output_dir.mkdir(parents=True)

    # STEP 1: create the NLP object
    logger.info("STEP 1: Loading model {}".format(model))
    nlp = spacy.load(model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages.")

    # STEP 2: create prior probabilities from WP
    if not prior_prob_path.exists():
        # It takes about 2h to process 1000M lines of Wikipedia XML dump
        logger.info("STEP 2: writing prior probabilities to {}".format(
            prior_prob_path))
        wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit)
    logger.info(
        "STEP 2: reading prior probabilities from {}".format(prior_prob_path))

    # STEP 3: deduce entity frequencies from WP (takes only a few minutes)
    logger.info("STEP 3: calculating entity frequencies")
    wp.write_entity_counts(prior_prob_path, entity_freq_path, to_print=False)

    # STEP 4: reading definitions and (possibly) descriptions from WikiData or from file
    message = " and descriptions" if not descriptions_from_wikipedia else ""
    if (not entity_defs_path.exists()) or (not descriptions_from_wikipedia
                                           and not entity_descr_path.exists()):
        # It takes about 10h to process 55M lines of Wikidata JSON dump
        logger.info("STEP 4: parsing wikidata for entity definitions" +
                    message)
        title_to_id, id_to_descr = wd.read_wikidata_entities_json(
            wd_json,
            limit,
            to_print=False,
            lang=lang,
            parse_descriptions=(not descriptions_from_wikipedia),
        )
        wd.write_entity_files(entity_defs_path, title_to_id)
        if not descriptions_from_wikipedia:
            wd.write_entity_description_files(entity_descr_path, id_to_descr)
    logger.info("STEP 4: read entity definitions" + message)

    # STEP 5: Getting gold entities from wikipedia
    message = " and descriptions" if descriptions_from_wikipedia else ""
    if (not training_entities_path.exists()) or (
            descriptions_from_wikipedia and not entity_descr_path.exists()):
        logger.info("STEP 5: parsing wikipedia for gold entities" + message)
        training_set_creator.create_training_examples_and_descriptions(
            wp_xml,
            entity_defs_path,
            entity_descr_path,
            training_entities_path,
            parse_descriptions=descriptions_from_wikipedia,
            limit=limit,
        )
    logger.info("STEP 5: read gold entities" + message)

    # STEP 6: creating the actual KB
    # It takes ca. 30 minutes to pretrain the entity embeddings
    logger.info("STEP 6: creating the KB at {}".format(kb_path))
    kb = kb_creator.create_kb(
        nlp=nlp,
        max_entities_per_alias=max_per_alias,
        min_entity_freq=min_freq,
        min_occ=min_pair,
        entity_def_input=entity_defs_path,
        entity_descr_path=entity_descr_path,
        count_input=entity_freq_path,
        prior_prob_input=prior_prob_path,
        entity_vector_length=entity_vector_length,
    )

    kb.dump(kb_path)
    nlp.to_disk(output_dir / KB_MODEL_DIR)

    logger.info("Done!")
Code example #4
def main(
    wd_json,
    wp_xml,
    output_dir,
    model,
    max_per_alias=10,
    min_freq=20,
    min_pair=5,
    entity_vector_length=64,
    loc_prior_prob=None,
    loc_entity_defs=None,
    loc_entity_desc=None,
    limit=None,
):
    print(now(), "Creating KB with Wikipedia and WikiData")
    print()

    if limit is not None:
        print("Warning: reading only", limit, "lines of Wikipedia/Wikidata dumps.")

    # STEP 0: set up IO
    if not output_dir.exists():
        output_dir.mkdir()

    # STEP 1: create the NLP object
    print(now(), "STEP 1: loaded model", model)
    nlp = spacy.load(model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pre-trained word vectors, "
            "cf. https://spacy.io/usage/models#languages."
        )

    # STEP 2: create prior probabilities from WP
    print()
    if loc_prior_prob:
        print(now(), "STEP 2: reading prior probabilities from", loc_prior_prob)
    else:
        # It takes about 2h to process 1000M lines of Wikipedia XML dump
        loc_prior_prob = output_dir / "prior_prob.csv"
        print(now(), "STEP 2: writing prior probabilities at", loc_prior_prob)
        wp.read_prior_probs(wp_xml, loc_prior_prob, limit=limit)

    # STEP 3: deduce entity frequencies from WP (takes only a few minutes)
    print()
    print(now(), "STEP 3: calculating entity frequencies")
    loc_entity_freq = output_dir / "entity_freq.csv"
    wp.write_entity_counts(loc_prior_prob, loc_entity_freq, to_print=False)

    loc_kb = output_dir / "kb"

    # STEP 4: reading entity descriptions and definitions from WikiData or from file
    print()
    if loc_entity_defs and loc_entity_desc:
        read_raw = False
        print(now(), "STEP 4a: reading entity definitions from", loc_entity_defs)
        print(now(), "STEP 4b: reading entity descriptions from", loc_entity_desc)
    else:
        # It takes about 10h to process 55M lines of Wikidata JSON dump
        read_raw = True
        loc_entity_defs = output_dir / "entity_defs.csv"
        loc_entity_desc = output_dir / "entity_descriptions.csv"
        print(now(), "STEP 4: parsing wikidata for entity definitions and descriptions")

    # STEP 5: creating the actual KB
    # It takes ca. 30 minutes to pretrain the entity embeddings
    print()
    print(now(), "STEP 5: creating the KB at", loc_kb)
    kb = kb_creator.create_kb(
        nlp=nlp,
        max_entities_per_alias=max_per_alias,
        min_entity_freq=min_freq,
        min_occ=min_pair,
        entity_def_output=loc_entity_defs,
        entity_descr_output=loc_entity_desc,
        count_input=loc_entity_freq,
        prior_prob_input=loc_prior_prob,
        wikidata_input=wd_json,
        entity_vector_length=entity_vector_length,
        limit=limit,
        read_raw_data=read_raw,
    )
    if read_raw:
        print(" - wrote entity definitions to", loc_entity_defs)
        print(" - wrote writing entity descriptions to", loc_entity_desc)

    kb.dump(loc_kb)
    nlp.to_disk(output_dir / "nlp")

    print()
    print(now(), "Done!")