Example #1
def accuracy(dataset, batch_size, train):
    epoch_correct = np.zeros(len(qids))
    epoch_nll = 0.0
    epoch_total = np.zeros(len(qids))
    # only run the training op when train=True; otherwise run a no-op
    op = train_op if train else noop
    all_labels = []
    all_preds = []
    for i in get_progress_bar("train" if train else "dev",
                              item="batches")(range(0, len(dataset),
                                                    batch_size)):
        batch_labels = [label for _, label in dataset[i:i + batch_size]]
        csum, corr, num_examples, preds, _ = session.run(
            [cost_sum, correct, size, out_activated, op],
            feed_dict={
                indices: [[inv_vocab.get(w, 0) for w in window]
                          for window, _ in dataset[i:i + batch_size]],
                labels: batch_labels,
                # disable dropout outside of training
                keep_prob_pholder: keep_prob if train else 1.0
            })
        epoch_correct += corr
        epoch_nll += csum
        epoch_total += num_examples
        all_labels.extend(batch_labels)
        all_preds.append(preds)
    return (epoch_nll, epoch_correct, epoch_total, np.vstack(all_preds),
            np.vstack(all_labels))
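A toy, self-contained sketch of the batching pattern in accuracy() (hypothetical data; the real function additionally feeds a TensorFlow session and module-level globals such as qids, session, and train_op): slice the dataset in steps of batch_size and accumulate per-batch statistics into epoch totals.

import numpy as np

# hypothetical (window, label) pairs standing in for the real dataset
dataset = [(["w%d" % i, "w%d" % (i + 1)], i % 2) for i in range(10)]
batch_size = 4
epoch_total = 0
epoch_correct = 0
for i in range(0, len(dataset), batch_size):
    batch = dataset[i:i + batch_size]
    batch_labels = np.array([label for _, label in batch])
    epoch_total += len(batch)
    epoch_correct += int((batch_labels == 1).sum())  # pretend the model always predicts 1
print(epoch_correct, "/", epoch_total)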
Example #2
def disambiguate_batch(test_tags, train_tags, oracles):
    total_report = {}
    ambiguous_tags = []
    for tags in get_progress_bar("disambiguating", item="articles")(test_tags):
        report, remainder = disambiguate(tags, oracles)
        ambiguous_tags.extend(remainder)
        for key, value in report.items():
            if key not in total_report:
                total_report[key] = value
            else:
                total_report[key] += value
    return total_report, ambiguous_tags
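The report-merging loop above is equivalent to summing dictionaries of counts; a minimal sketch with collections.Counter, assuming (as the += suggests) that the report values are numeric:

from collections import Counter

total_report = Counter()
for report in [{"resolved": 3, "skipped": 1}, {"resolved": 2}]:  # hypothetical reports
    total_report.update(report)  # Counter.update sums values for repeated keys
print(dict(total_report))  # {'resolved': 5, 'skipped': 1}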
Example #3
def main():
    args = parse_args()
    trie = marisa_trie.RecordTrie('i').load(args.wikipedia2wikidata_trie)
    print('loaded trie')

    num_lines = count_lines(args.category_links)
    num_ids = count_lines(args.wikidata_ids)
    missing = []
    num_missing = 0
    num_broken = 0
    all_category_links = [[] for i in range(num_ids)]
    with open(args.category_links, 'rt') as fin:
        fin_pbar = get_progress_bar('reading category_links',
                                    max_value=num_lines)(fin)
        for line in fin_pbar:
            try:
                origin, dest = line.rstrip('\n').split('\t')
            except ValueError:
                num_broken += 1
                continue
            if len(dest) == 0:
                num_broken += 1
                continue
            origin = args.prefix + '/' + origin
            prefixed_dest = args.prefix + '/' + dest
            origin_index = trie.get(origin, None)
            dest_index = trie.get(prefixed_dest, None)

            if dest_index is None:
                prefixed_dest = args.prefix + '/' + dest[0].upper() + dest[1:]
                dest_index = trie.get(prefixed_dest, None)

            if origin_index is None or dest_index is None:
                missing.append((origin, prefixed_dest))
                num_missing += 1
            else:
                all_category_links[origin_index[0][0]].append(dest_index[0][0])

    print("%d/%d category links could not be found in wikidata" %
          (num_missing, num_lines))
    print("%d/%d category links were malformed" % (num_broken, num_lines))
    print("Missing links sample:")
    for origin, dest in missing[:10]:
        print("%r -> %r" % (origin, dest))
    save_record_with_offset(
        join(args.out, "wikidata_%s_category_links" % (args.prefix, )),
        all_category_links)
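A minimal, hypothetical illustration of the RecordTrie lookups used above: RecordTrie('i') stores one 32-bit integer per key, so trie.get(key, None) returns either None or a list of 1-tuples, which is why hits are unpacked as origin_index[0][0]:

import marisa_trie

# tiny stand-in for the wikititle -> wikidata-index trie
titles = [("enwiki/Paris", (0,)), ("enwiki/France", (1,))]
trie = marisa_trie.RecordTrie("i", titles)
print(trie.get("enwiki/Paris", None))     # [(0,)]
print(trie.get("enwiki/Atlantis", None))  # None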
Example #4
def get_cached_satisfy(collection, aucs, ids, mmap=False):
    path = join(SCRIPT_DIR, "cached_satisfy.npy")
    if not exists(path):
        cached_satisfy = np.zeros((len(aucs), len(ids)), dtype=bool)
        for row, (qid, relation_name) in get_progress_bar(
                "satisfy", item="types")(enumerate(sorted(aucs.keys()))):
            cached_satisfy[row, :] = collection.satisfy(
                [relation_name], [collection.name2index[qid]])[ids]
            collection._satisfy_cache.clear()
        np.save(path, cached_satisfy)
        if mmap:
            del cached_satisfy
            cached_satisfy = np.load(path, mmap_mode="r")
    else:
        if mmap:
            cached_satisfy = np.load(path, mmap_mode="r")
        else:
            cached_satisfy = np.load(path)
    return cached_satisfy
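A small sketch (hypothetical file name and shape) of the cache-then-memory-map pattern above: np.save writes the dense boolean matrix once, and np.load(..., mmap_mode="r") reopens it as a read-only memory map so later runs do not need to hold it in RAM:

import numpy as np

cache_path = "demo_cached_satisfy.npy"  # hypothetical path
cached = np.zeros((4, 8), dtype=bool)
np.save(cache_path, cached)
cached = np.load(cache_path, mmap_mode="r")  # read-only, paged from disk
print(cached.shape, cached.dtype)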
Example #5
def main():
    args = parse_args()
    approx_max_quantity = 24642416
    pbar = get_progress_bar('compress wikidata',
                            max_value=approx_max_quantity,
                            item='entities')
    pbar.start()
    seen = 0
    with open(args.out, "wb") as fout:
        for doc in open_wikidata_file(args.wikidata, 1000):
            seen += 1
            if 'descriptions' in doc:
                del doc['descriptions']
            if 'labels' in doc:
                del doc['labels']
            if 'aliases' in doc:
                del doc['aliases']
            for claims in doc['claims'].values():
                for claim in claims:
                    if 'id' in claim:
                        del claim['id']
                    if 'rank' in claim:
                        del claim['rank']
                    if 'references' in claim:
                        for ref in claim['references']:
                            if 'hash' in ref:
                                del ref['hash']
                    if 'qualifiers' in claim:
                        for qualifier in claim['qualifiers'].values():
                            if 'hash' in qualifier:
                                del qualifier['hash']
            fout.write(msgpack.packb(doc))
            if seen % 1000 == 0:
                if seen < approx_max_quantity:
                    pbar.update(seen)
    pbar.finish()
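A hedged round-trip sketch for the output format written above (hypothetical documents): each stripped document is appended to the file as one msgpack blob, so the result can later be streamed back with msgpack.Unpacker without loading it whole:

import msgpack

docs = [{"id": "Q1", "claims": {}}, {"id": "Q2", "claims": {}}]
with open("demo.msgpack", "wb") as fout:
    for doc in docs:
        fout.write(msgpack.packb(doc))
with open("demo.msgpack", "rb") as fin:
    for doc in msgpack.Unpacker(fin, raw=False):
        print(doc["id"])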
Example #6
def fix_and_parse_tags(config, collection, size):
    trie_index2indices = OffsetArray.load(join(config.language_path,
                                               "trie_index2indices"),
                                          compress=True)
    trie_index2indices_counts = OffsetArray(
        np.load(join(config.language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)
    if exists(
            join(config.language_path,
                 "trie_index2indices_transition_values.npy")):
        trie_index2indices_transitions = OffsetArray(
            np.load(
                join(config.language_path,
                     "trie_index2indices_transition_values.npy")),
            np.load(
                join(config.language_path,
                     "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None

    anchor_trie = marisa_trie.Trie().load(
        join(config.language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    prefix = get_prefix(config)
    redirections = load_redirections(config.redirections)
    docs = load_wikipedia_docs(config.wiki, size)

    while True:
        try:
            collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
        except ValueError as e:
            print("issue reading blacklist, please fix.")
            print(str(e))
            enter_or_quit()
            continue
        break

    print("Load first_names")
    with open(join(PROJECT_DIR, "data", "first_names.txt"), "rt") as fin:
        first_names = set(fin.read().splitlines())

    all_tags = []
    for doc in get_progress_bar('fixing links', item='article')(docs):
        tags = obtain_tags(
            doc,
            wiki_trie=wiki_trie,
            anchor_trie=anchor_trie,
            trie_index2indices=trie_index2indices,
            trie_index2indices_counts=trie_index2indices_counts,
            trie_index2indices_transitions=trie_index2indices_transitions,
            redirections=redirections,
            prefix=prefix,
            first_names=first_names,
            collection=collection,
            fix_destination=fix_destination,
            min_count=config.min_count,
            min_percent=config.min_percent)
        if any(x is not None for _, x in tags):
            all_tags.append(tags)
    collection.reset_cache()
    return all_tags
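The blacklist loading above retries until the file parses cleanly; a stripped-down sketch of the same pattern, with plain json.load and input() as hypothetical stand-ins for collection.load_blacklist and enter_or_quit:

import json

while True:
    try:
        with open("blacklist.json", "rt") as fin:  # hypothetical path
            blacklist = json.load(fin)
    except ValueError as e:  # json.JSONDecodeError subclasses ValueError
        print("issue reading blacklist, please fix.")
        print(str(e))
        input("press enter to retry...")
        continue
    break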
Example #7
def get_wikidata_mapping(name2id_path,
                         wikidata_ids_path,
                         jsons,
                         relation_names,
                         verbose=False):
    approx_max_quantity = 24642416
    if verbose:
        pbar = None
        from IPython.display import clear_output
    else:
        pbar = get_progress_bar("collect wikilinks",
                                max_value=approx_max_quantity)
        pbar.start()
        clear_output = None
    wikidata_ids = []
    entity_types = []
    subclass = []
    seen = 0

    relations = {
        name: (open(outfile, "wt"), is_temporal)
        for name, outfile, is_temporal in relation_names
    }
    fout_name2id = None if true_exists(name2id_path) else open(
        name2id_path, "wt")
    fout_wikidata_ids = None if true_exists(wikidata_ids_path) else open(
        wikidata_ids_path, "wt")
    try:
        t_then = time.time()
        seen_last = 0
        speed = None
        index = 0
        for doc in jsons:
            seen += 1
            if seen % 2000 == 0:
                if verbose:
                    t_now = time.time()
                    new_speed = (seen - seen_last) / (t_now - t_then)
                    if speed is None:
                        speed = new_speed
                    else:
                        speed = 0.9 * speed + 0.1 * new_speed
                    clear_output(wait=True)
                    print("%.3f%% done (%d seen, %.3f docs/s, ETA: %ds)" %
                          (100.0 * seen / approx_max_quantity, seen, speed,
                           int((approx_max_quantity - seen) / speed)),
                          flush=True)
                    seen_last = seen
                    t_then = t_now
                else:
                    if seen < approx_max_quantity:
                        pbar.update(seen)
            if fout_name2id is not None:
                if "sitelinks" in doc:
                    for key, value in doc["sitelinks"].items():
                        if key.endswith("wiki"):
                            fout_name2id.write(key + "/" + value["title"] +
                                               "\t" + str(index) + "\n")
            index += 1
            if fout_wikidata_ids is not None:
                fout_wikidata_ids.write(doc["id"] + "\n")
            for name, (outfile, is_temporal) in relations.items():
                if is_temporal:
                    outfile.write("\t".join(
                        get_claim_time(doc["claims"].get(name, []))) + "\n")
                else:
                    outfile.write("\t".join(
                        get_related_entities(doc["claims"].get(name, []))) +
                                  "\n")
        if pbar is not None:
            pbar.finish()
    finally:
        for name, (outfile, _) in relations.items():
            outfile.close()
        if fout_name2id is not None:
            fout_name2id.close()
        if fout_wikidata_ids is not None:
            fout_wikidata_ids.close()
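The verbose branch above estimates throughput with an exponential moving average before printing an ETA; a self-contained sketch of just that calculation, driven by a synthetic loop instead of the wikidata dump:

import time

approx_total = 10000
speed = None
seen_last, t_then = 0, time.time()
for seen in range(1, approx_total + 1):
    if seen % 2000 == 0:
        time.sleep(0.01)  # stand-in for the real per-batch work
        t_now = time.time()
        new_speed = (seen - seen_last) / max(t_now - t_then, 1e-9)
        # blend 90% of the previous estimate with 10% of the new measurement
        speed = new_speed if speed is None else 0.9 * speed + 0.1 * new_speed
        eta = (approx_total - seen) / speed
        seen_last, t_then = seen, t_now
print("%.1f docs/s, ETA %.2fs" % (speed, eta))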
Example #8
def main():
    args = parse_args()
    makedirs(args.wikidata, exist_ok=True)

    wikidata_names2prop_names = property_names(
        join(PROJECT_DIR, "data", "wikidata", 'wikidata_property_names.json'))
    wikidata_names2temporal_prop_names = temporal_property_names(
        join(PROJECT_DIR, "data", "wikidata",
             'wikidata_time_property_names.json'))
    # fields to make easily accessible:
    important_properties = [
        wikidata_properties.INSTANCE_OF,
        wikidata_properties.SUBCLASS_OF,
        wikidata_properties.PART_OF,
        wikidata_properties.OCCUPATION,
        wikidata_properties.FIELD_OF_WORK,
        wikidata_properties.FIELD_OF_THIS_OCCUPATION,
        wikidata_properties.MEDICAL_SPECIALITY,
        wikidata_properties.GENRE,
        wikidata_properties.SEX_OR_GENDER,
        wikidata_properties.COUNTRY_OF_CITIZENSHIP,
        wikidata_properties.COUNTRY,
        wikidata_properties.CONTINENT,
        wikidata_properties.LOCATED_IN_THE_ADMINISTRATIVE_TERRITORIAL_ENTITY,
        wikidata_properties.SPORT,
        wikidata_properties.STUDIES,
        wikidata_properties.SERIES,
        wikidata_properties.USE,
        wikidata_properties.LOCATION,
        wikidata_properties.FACE_OF,
        wikidata_properties.IS_A_LIST_OF,
        wikidata_properties.COUNTRY_OF_ORIGIN,
        wikidata_properties.PRODUCT_OR_MATERIAL_PRODUCED,
        wikidata_properties.INDUSTRY,
        wikidata_properties.PARENT_TAXON,
        wikidata_properties.APPLIES_TO_TERRITORIAL_JURISDICTION,
        wikidata_properties.POSITION_HELD,
        wikidata_properties.CATEGORYS_MAIN_TOPIC,
        # temporal properties
        wikidata_properties.PUBLICATION_DATE,
        wikidata_properties.DATE_OF_BIRTH,
        wikidata_properties.DATE_OF_DEATH,
        wikidata_properties.INCEPTION,
        wikidata_properties.DISSOLVED_OR_ABOLISHED,
        wikidata_properties.POINT_IN_TIME,
        wikidata_properties.START_TIME,
        wikidata_properties.END_TIME
    ]
    prop_names2wikidata_names = {
        value: key
        for key, value in wikidata_names2prop_names.items()
    }
    wikidata_important_properties = [
        prop_names2wikidata_names[prop] for prop in important_properties
    ]
    wikidata_important_properties_fnames = [
        (name, join(args.wikidata, "wikidata_%s.txt" % (name, )), name
         in wikidata_names2temporal_prop_names)
        for name in wikidata_important_properties
    ]

    missing_wikidata_important_properties_fnames = [
        (name, outfile, is_temporal)
        for name, outfile, is_temporal in wikidata_important_properties_fnames
        if not true_exists(outfile)
    ]

    wikidata_ids_path = join(args.wikidata, WIKIDATA_IDS_NAME)
    wikititle2wikidata_path = join(args.wikidata, WIKITILE_2_WIKIDATA_TSV_NAME)

    work_to_be_done = (not true_exists(wikidata_ids_path)
                       or not true_exists(wikititle2wikidata_path) or
                       len(missing_wikidata_important_properties_fnames) > 0)

    if work_to_be_done:
        get_wikidata_mapping(
            wikititle2wikidata_path, wikidata_ids_path,
            open_wikidata_file(args.wikidata_dump, args.batch_size),
            missing_wikidata_important_properties_fnames)

    numpy_wikidata_important_properties_fnames = [
        (name, outfile, is_temporal)
        for name, outfile, is_temporal in wikidata_important_properties_fnames
        if not values_exist(join(args.wikidata, "wikidata_%s" % (name, )))
    ]

    # obtain a mapping from id -> number
    if len(numpy_wikidata_important_properties_fnames) > 0:
        _, id2index = load_wikidata_ids(args.wikidata)
        # make relations numerical:
        for relname, outfile, is_temporal in numpy_wikidata_important_properties_fnames:
            with open(outfile, "rt") as fin:
                lines = fin.read().splitlines()
            fin_pbar = get_progress_bar("loading relation %r" %
                                        (relname, ))(lines)
            if is_temporal:
                value = np.zeros(len(lines) * 2 + 1, dtype=np.int32)
                position = 1
                seen = 0
                for idx, line in enumerate(fin_pbar):
                    for wikidata_id in line.split('\t'):
                        if len(wikidata_id) > 0:
                            value[position] = idx
                            value[position + 1] = parse_year(wikidata_id)
                            position += 2
                            seen += 1
                            break
                value[0] = len(lines)
                value = value[:position]
                np.save(
                    join(args.wikidata,
                         "wikidata_%s_values.sparse.npy" % (relname, )), value)
            else:
                relation = [line2indices(id2index, line) for line in fin_pbar]
                save_record_with_offset(
                    join(args.wikidata, "wikidata_%s" % (relname, )), relation)
        del id2index

    # convert the mapping from wikinames to integer values:
    trie_save_path = join(args.wikidata, WIKITILE_2_WIKIDATA_TRIE_NAME)
    if not true_exists(trie_save_path):
        print("loading wikipedia name -> wikidata")
        name2id = pandas.read_csv(wikititle2wikidata_path,
                                  sep="\t",
                                  encoding='utf-8')
        print("loaded")
        trie = marisa_trie.RecordTrie(
            'i',
            get_progress_bar("convert to trie", max_value=name2id.shape[0])(
                (key, (value, )) for _, key, value in name2id.itertuples()))
        trie.save(trie_save_path)

    build_fixed_point(args.wikidata, "enwiki")
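The final conversion above streams (key, (value,)) pairs out of a pandas frame into a RecordTrie; a miniature version with a hypothetical two-row table in place of the real TSV:

import pandas
import marisa_trie

name2id = pandas.DataFrame({"article": ["enwiki/Paris", "enwiki/France"],
                            "wikidata_index": [0, 1]})
# itertuples() yields (row_label, article, wikidata_index), matching the unpacking above
trie = marisa_trie.RecordTrie(
    "i", ((key, (value,)) for _, key, value in name2id.itertuples()))
print(trie.get("enwiki/Paris"))  # [(0,)]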
Example #9
def fix(collection,
        offsets,
        values,
        counts,
        anchor_length,
        num_category_link=8,
        keep_min=5):
    relations_that_can_extend = [
        {"steps": [wprop.INSTANCE_OF]},
        {"steps": [wprop.INSTANCE_OF, (wprop.SUBCLASS_OF, 2)]},
        {"steps": [wprop.INSTANCE_OF, wprop.FACET_OF]},
        {"steps": [(wprop.SUBCLASS_OF, 3)]},
        {"steps": [wprop.OCCUPATION], "promote": True},
        {"steps": [wprop.POSITION_HELD], "promote": True},
        {"steps": [wprop.PART_OF, wprop.INSTANCE_OF]},
        {"steps": [wprop.SERIES, wprop.INSTANCE_OF]},
        {"steps": [wprop.SERIES, wprop.LOCATION]},
        {"steps": [wprop.LOCATED_IN_THE_ADMINISTRATIVE_TERRITORIAL_ENTITY]},
        {"steps": [wprop.COUNTRY]},
        {"steps": [wprop.CATEGORY_LINK, wprop.CATEGORYS_MAIN_TOPIC]},
        {"steps": [(wprop.CATEGORY_LINK, num_category_link), wprop.FIXED_POINTS]},
        {"steps": [wprop.CATEGORY_LINK, wprop.FIXED_POINTS, wprop.IS_A_LIST_OF]},
        {"steps": [wprop.IS_A_LIST_OF, (wprop.SUBCLASS_OF, 2)]},
    ]
    relation_data = get_relation_data(collection, relations_that_can_extend)
    new_values = values
    # get rid of History of BLAH where link also points to BLAH:

    is_history = IS_HISTORY[new_values]
    is_people_mask = IS_PEOPLE[new_values]
    is_list = IS_LIST_ARTICLE[new_values]
    new_values = related_promote_highest(new_values,
                                         offsets,
                                         counts,
                                         condition=is_history,
                                         alternative=is_people_mask,
                                         keep_min=keep_min)
    unchanged = values == new_values
    is_not_history_or_list = logical_and(logical_not(is_history),
                                         logical_not(is_list))
    new_values = related_promote_highest(new_values,
                                         offsets,
                                         counts,
                                         condition=logical_and(
                                             is_history, unchanged),
                                         alternative=is_not_history_or_list,
                                         keep_min=keep_min)

    is_sport_or_thoroughfare = logical_or(IS_EVENT_SPORT,
                                          IS_THOROUGHFARE)[new_values]

    # drop links whose anchor text is shorter than 2 characters:
    new_values[anchor_length < 2] = -1
    # get rid of shorthand for sports:
    new_values[logical_and(is_sport_or_thoroughfare, anchor_length <= 2)] = -1
    # remove lists of episodes:
    is_episode_list = IS_EPISODE_LIST[new_values]
    new_values[is_episode_list] = -1

    # get rid of "car" -> "Renault Megane", when "car" -> "Car",
    # and "Renault Megane" is instance of "Car":
    is_not_people = logical_not(IS_PEOPLE)[new_values]
    new_values = extend_relations(relation_data,
                                  new_values,
                                  offsets,
                                  counts,
                                  alternative=is_not_people,
                                  pbar=get_progress_bar("extend_relations",
                                                        max_value=len(offsets),
                                                        item="links"),
                                  keep_min=keep_min)
    unchanged = values == new_values
    # remove all unmodified values that have no instance-of, subclass-of,
    # part-of, or category-link edges:
    new_values[logical_ands([
        logical_ands([
            collection.relation(wprop.INSTANCE_OF).edges() == 0,
            collection.relation(wprop.SUBCLASS_OF).edges() == 0,
            collection.relation(wprop.PART_OF).edges() == 0,
            collection.relation(wprop.CATEGORY_LINK).edges() == 0
        ])[new_values], unchanged
    ])] = -1

    is_kinship = IS_KINSHIP[new_values]
    is_human = IS_CHARACTER_HUMAN[new_values]
    new_values = related_promote_highest(new_values,
                                         offsets,
                                         counts,
                                         condition=is_human,
                                         alternative=is_kinship,
                                         keep_min=keep_min)

    # replace elements by a country when a better alternative
    # is present and the count is below 100:
    should_replace_by_country = logical_ands([
        logical_not(
            logical_ors([
                IS_POLITICAL_ORGANIZATION, IS_CARDINAL_DIRECTION,
                IS_LANGUAGE_ALPHABET, IS_COUNTRY, IS_PEOPLE_GROUP, IS_BREED,
                IS_BATTLE, IS_SOCIETY, IS_POSITION, IS_POLITICAL_PARTY,
                IS_SPORTS_TEAM, IS_CHARACTER_HUMAN, IS_LANDFORM, IS_ACTIVITY
            ]))[new_values], counts < 100
    ])

    # promote the highest-ranked alternative in this order:
    is_country_or_cardinal = [
        IS_CARDINAL_DIRECTION, IS_COUNTRY, IS_POLITICAL_ORGANIZATION
    ]
    for i, alternative in enumerate(is_country_or_cardinal):
        unchanged = values == new_values
        should_replace_by_country = logical_and(should_replace_by_country,
                                                unchanged)
        new_values = related_promote_highest(
            new_values,
            offsets,
            counts,
            condition=should_replace_by_country,
            alternative=alternative[new_values],
            keep_min=keep_min)

    new_offsets, new_values, new_counts, location_shift = reduce_values(
        offsets, new_values, counts)

    return (new_offsets, new_values, new_counts), location_shift
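A toy illustration (hypothetical masks and ids) of the indexing trick used throughout fix(): the IS_* arrays are boolean lookup tables indexed by entity id, so IS_EPISODE_LIST[new_values] marks which candidate links resolve to an episode list, and those positions are overwritten with -1:

import numpy as np

IS_EPISODE_LIST = np.array([False, True, False, False])  # hypothetical per-entity flags
new_values = np.array([0, 1, 2, 1, 3])                   # hypothetical link targets
new_values[IS_EPISODE_LIST[new_values]] = -1
print(new_values)  # [ 0 -1  2 -1  3]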