Example #1
def load_data(text_path=None, mention_file=None, supplement=None,
              include_unresolved=False, lowercase=False, wiki_entity_file=None):
    assert text_path is not None, "xlwiki data requires a raw text path!"
    print("Loading", text_path)
    wiki_map = loadWikiVocab(wiki_entity_file)
    # `supplement` selects the xlwiki genre; fall back to 2 when it is missing or invalid.
    if supplement not in [0, 1, 2]:
        supplement = 2
    docs = []
    doc_iter = XlwikiDataLoader(text_path, genre=supplement,
                                lowercase=lowercase, wiki_map=wiki_map)
    for doc in doc_iter.documents():
        docs.append(doc)
    return docs
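
A minimal usage sketch follows; the paths and the genre value are placeholders I am assuming for illustration, not part of the original example, and load_data, loadWikiVocab and XlwikiDataLoader must all come from the same project module.

# Hypothetical call; replace the paths with real xlwiki data locations.
docs = load_data(text_path="/data/xlwiki/en",
                 supplement=1,                     # genre selector: 0, 1 or 2
                 lowercase=True,
                 wiki_entity_file="/data/wiki/entity_vocab.txt")
print("Loaded", len(docs), "documents")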
Example #2
def load_data(text_path=None,
              mention_file=None,
              supplement=None,
              include_unresolved=False,
              lowercase=False,
              wiki_entity_file=None):
    assert text_path is not None and mention_file is not None, \
        "wned data requires a raw text path and a mention file!"
    print("Loading {0}, {1}".format(text_path, mention_file))
    wiki_map = loadWikiVocab(wiki_entity_file)
    docs = []
    doc_iter = WnedDataLoader(text_path,
                              mention_file,
                              include_unresolved=include_unresolved,
                              lowercase=lowercase,
                              wiki_map=wiki_map)
    for doc in doc_iter.documents():
        docs.append(doc)
    return docs
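
A similar hedged usage sketch for the wned variant; the paths below are placeholders of my own, shown only to illustrate that both the raw text path and the mention file are required.

# Hypothetical call with illustrative paths.
docs = load_data(text_path="/data/wned/wikipedia/RawText",
                 mention_file="/data/wned/wikipedia/mentions.tsv",
                 include_unresolved=False,
                 lowercase=True,
                 wiki_entity_file="/data/wiki/entity_vocab.txt")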
Example #3
def load_data_and_embeddings(FLAGS, logger, candidate_manager):

    # conll: train, dev and eval come from a single file, selected by genre (FLAGS.test_genre)

    # xlwiki: no dev split; only a text path for training or eval, selected by genre (FLAGS.genre)

    # kbp15/16: may contain multiple training and eval [path, file] pairs
    # kbp10: exactly one training and eval [path, file] pair

    # wned: only one eval [path, file] pair; must be used with cross-validation
    dataset_types = set()
    raw_training_data = None
    if not FLAGS.eval_only_mode:
        raw_training_data = []
        unwrapped_data_tuples = unwrapDataset(FLAGS.training_data)
        for data_tuple in unwrapped_data_tuples:
            # data_tuple[0] is the dataset type (conll, xlwiki, kbp, wned, ...)
            dataset_types.add(data_tuple[0])
            raw_training_data.extend(
                extractRawData(data_tuple[0], data_tuple[2], data_tuple[3],
                               data_tuple[1], FLAGS))

    raw_eval_sets = []
    unwrapped_data_tuples = unwrapDataset(FLAGS.eval_data)
    for data_tuple in unwrapped_data_tuples:
        dataset_types.add(data_tuple[0])
        raw_eval_sets.append(
            extractRawData(data_tuple[0], data_tuple[2], data_tuple[3],
                           data_tuple[1], FLAGS))

    # Map each mention's gold id through the wiki redirect vocabulary so that
    # redirected ids are replaced by their canonical entity ids.
    redirect_vocab = None
    if FLAGS.wiki_redirect_vocab is not None:
        gold_id_set = set()
        if raw_training_data is not None:
            gold_id_set.update([
                m.gold_ent_id() for doc in raw_training_data
                for m in doc.mentions if m.gold_ent_id() is not None
            ])
        for eval_data in raw_eval_sets:
            gold_id_set.update([
                m.gold_ent_id() for doc in eval_data for m in doc.mentions
                if m.gold_ent_id() is not None
            ])
        redirect_vocab = loadRedirectVocab(FLAGS.wiki_redirect_vocab,
                                           id_vocab=gold_id_set)
        if raw_training_data is not None:
            for doc in raw_training_data:
                for mention in doc.mentions:
                    if mention.gold_ent_id() in redirect_vocab:
                        mention._gold_ent_id = redirect_vocab[mention.gold_ent_id()]
        for eval_data in raw_eval_sets:
            for doc in eval_data:
                for mention in doc.mentions:
                    if mention.gold_ent_id() in redirect_vocab:
                        mention._gold_ent_id = redirect_vocab[mention.gold_ent_id()]

    # Prepare the word and mention vocabulary.
    word_vocab, mention_vocab = BuildVocabulary(raw_training_data,
                                                raw_eval_sets,
                                                FLAGS.word_embedding_file,
                                                logger=logger)

    wiki2id_vocab, id2wiki_vocab = loadWikiVocab(FLAGS.wiki_entity_vocab)

    # Candidate file types: FLAGS.candidates_file is a comma-separated list of
    # colon-separated entries whose first field names the candidate file type.
    candidate_types = []
    files = re.split(r',', FLAGS.candidates_file)
    for f in files:
        tmp_items = re.split(r':', f)
        candidate_types.append(tmp_items[0])

    candidate_handler = candidate_manager(FLAGS.candidates_file,
                                          vocab=mention_vocab,
                                          lowercase=FLAGS.lowercase,
                                          id2label=id2wiki_vocab,
                                          label2id=wiki2id_vocab,
                                          support_fuzzy=FLAGS.support_fuzzy,
                                          redirect_vocab=redirect_vocab,
                                          topn=FLAGS.topn_candidate)
    candidate_handler.loadCandidates()
    if FLAGS.save_candidates_path is not None:
        fuzzy_str = 'fuzzy' if FLAGS.support_fuzzy else 'nofuzzy'
        candidate_handler.saveCandidatesToFile(
            os.path.join(
                FLAGS.save_candidates_path, '-'.join(dataset_types) +
                '-'.join(candidate_types) + '_candidate_' + fuzzy_str))

    logger.Log(
        "Unk mention types rate: {:2.6f}% ({}/{}), average candidates: {:2.2f} ({}/{}) from {}!"
        .format((len(mention_vocab) - len(candidate_handler._mention_dict)) *
                100 / float(len(mention_vocab)),
                len(mention_vocab) - len(candidate_handler._mention_dict),
                len(mention_vocab), candidate_handler._candidates_total /
                float(len(candidate_handler._mention_dict)),
                candidate_handler._candidates_total,
                len(candidate_handler._mention_dict), FLAGS.candidates_file))

    entity_vocab, sense_vocab = BuildEntityVocabulary(
        candidate_handler._entity_set,
        FLAGS.entity_embedding_file,
        FLAGS.sense_embedding_file,
        logger=logger)

    # Load pretrained embeddings.
    logger.Log("Loading vocabulary with " + str(len(word_vocab)) +
               " words from " + FLAGS.word_embedding_file)
    word_embeddings = LoadEmbeddingsFromBinary(word_vocab, FLAGS.embedding_dim,
                                               FLAGS.word_embedding_file)

    logger.Log("Loading vocabulary with " + str(len(entity_vocab)) +
               " entities from " + FLAGS.entity_embedding_file)
    entity_embeddings = LoadEmbeddingsFromBinary(entity_vocab,
                                                 FLAGS.embedding_dim,
                                                 FLAGS.entity_embedding_file)

    sense_embeddings = None
    mu_embeddings = None
    if sense_vocab is not None:
        sense_embeddings, mu_embeddings = LoadEmbeddingsFromBinary(
            sense_vocab,
            FLAGS.embedding_dim,
            FLAGS.sense_embedding_file,
            isSense=True)
        logger.Log("Loading vocabulary with " + str(len(sense_vocab)) +
                   " senses from " + FLAGS.sense_embedding_file)

    initial_embeddings = (word_embeddings, entity_embeddings, sense_embeddings,
                          mu_embeddings)
    vocabulary = (word_vocab, entity_vocab, sense_vocab, id2wiki_vocab)
    stop_words = loadStopWords(
        FLAGS.stop_word_file) if FLAGS.stop_word_file is not None else {}

    feature_manager = get_feature_manager(
        initial_embeddings,
        FLAGS.embedding_dim,
        lowercase=FLAGS.lowercase,
        str_sim=FLAGS.str_sim,
        prior=FLAGS.prior,
        hasAtt=FLAGS.att,
        local_context_window=FLAGS.local_context_window,
        global_context_window=FLAGS.global_context_window)

    # Trim the datasets, convert token sequences to integer sequences, crop and
    # pad, then construct the data iterators.
    logger.Log("Preprocessing data.")
    eval_sets = []
    for i, raw_eval_data in enumerate(raw_eval_sets):
        logger.Log("Processing {} raw eval data ...".format(i))
        AddCandidatesToDocs(raw_eval_sets[i],
                            candidate_handler,
                            topn=FLAGS.topn_candidate,
                            vocab=entity_vocab,
                            logger=logger,
                            include_unresolved=FLAGS.include_unresolved)
        eval_data = PreprocessDataset(
            raw_eval_sets[i],
            vocabulary,
            initial_embeddings,
            FLAGS.max_tokens,
            FLAGS.max_candidates_per_document,
            feature_manager,
            stop_words=stop_words,
            logger=logger,
            include_unresolved=FLAGS.include_unresolved,
            allow_cropping=FLAGS.allow_cropping)
        eval_sets.append(eval_data)
    training_data_iter = None
    training_data_length = 0
    if raw_training_data is not None:
        logger.Log("Processing raw training data ...")
        AddCandidatesToDocs(raw_training_data,
                            candidate_handler,
                            topn=FLAGS.topn_candidate,
                            vocab=entity_vocab,
                            logger=logger,
                            include_unresolved=FLAGS.include_unresolved)
        training_data = PreprocessDataset(
            raw_training_data,
            vocabulary,
            initial_embeddings,
            FLAGS.max_tokens,
            FLAGS.max_candidates_per_document,
            feature_manager,
            stop_words=stop_words,
            logger=logger,
            include_unresolved=FLAGS.include_unresolved,
            allow_cropping=FLAGS.allow_cropping)
        training_data_length = training_data.shape[0]
        training_data_iter = MakeTrainingIterator(training_data,
                                                  FLAGS.batch_size,
                                                  FLAGS.smart_batching)
    logger.Log("Processing raw eval data ...")
    eval_iterators = []
    for eval_data in eval_sets:
        eval_it = MakeEvalIterator(eval_data, FLAGS.batch_size)
        eval_iterators.append(eval_it)
    return vocabulary, initial_embeddings, training_data_iter, eval_iterators, training_data_length, feature_manager.base_feature_dim
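
A hedged sketch of how the return values might be consumed; FLAGS, logger and candidate_manager are whatever the surrounding project already provides, the vocabulary unpacking mirrors the tuple built above, and training_data_iter is None in eval-only mode as the code shows.

# Hypothetical consumption of the return values.
vocabulary, initial_embeddings, train_iter, eval_iters, train_len, base_feature_dim = \
    load_data_and_embeddings(FLAGS, logger, candidate_manager)
word_vocab, entity_vocab, sense_vocab, id2wiki_vocab = vocabulary
if train_iter is not None:      # None when FLAGS.eval_only_mode is set
    logger.Log("Training examples: {}".format(train_len))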