def accuracy(dataset, batch_size, train):
    # Runs one pass over `dataset` in minibatches, accumulating the summed
    # negative log-likelihood and per-objective correct/total counts.
    # `qids`, `session`, `train_op`, `noop`, `cost_sum`, `correct`, `size`,
    # `out_activated`, `indices`, `labels`, `keep_prob_pholder`, `inv_vocab`,
    # and `keep_prob` come from the enclosing scope.
    epoch_correct = np.zeros(len(qids))
    epoch_nll = 0.0
    epoch_total = np.zeros(len(qids))
    op = train_op if train else noop
    all_labels = []
    all_preds = []
    for i in get_progress_bar("train" if train else "dev", item="batches")(
            range(0, len(dataset), batch_size)):
        batch_labels = [label for _, label in dataset[i:i + batch_size]]
        csum, corr, num_examples, preds, _ = session.run(
            [cost_sum, correct, size, out_activated, op],
            feed_dict={
                indices: [[inv_vocab.get(w, 0) for w in window]
                          for window, _ in dataset[i:i + batch_size]],
                labels: batch_labels,
                keep_prob_pholder: keep_prob if train else 1.0
            })
        epoch_correct += corr
        epoch_nll += csum
        epoch_total += num_examples
        all_labels.extend(batch_labels)
        all_preds.append(preds)
    return (epoch_nll, epoch_correct, epoch_total,
            np.vstack(all_preds), np.vstack(all_labels))

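# A minimal usage sketch (not part of the original code) of how `accuracy`
# might be driven once per epoch. `train_dataset`, `dev_dataset`, and
# `batch_size` are hypothetical caller-supplied names; the division yields
# per-objective accuracies, since `epoch_correct` and `epoch_total` are
# accumulated per objective.
def run_epoch(train_dataset, dev_dataset, batch_size):
    train_nll, train_correct, train_total, _, _ = accuracy(
        train_dataset, batch_size, train=True)
    dev_nll, dev_correct, dev_total, dev_preds, dev_labels = accuracy(
        dev_dataset, batch_size, train=False)
    print("train nll: %.3f" % (train_nll,))
    print("dev accuracy per objective:",
          dev_correct / np.maximum(dev_total, 1))
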
def disambiguate_batch(test_tags, train_tags, oracles):
    # `train_tags` is currently unused.
    total_report = {}
    ambiguous_tags = []
    for tags in get_progress_bar("disambiguating", item="articles")(test_tags):
        report, remainder = disambiguate(tags, oracles)
        ambiguous_tags.extend(remainder)
        # merge the per-article report into a single running total:
        for key, value in report.items():
            if key not in total_report:
                total_report[key] = value
            else:
                total_report[key] += value
    return total_report, ambiguous_tags

def main():
    args = parse_args()
    trie = marisa_trie.RecordTrie('i').load(args.wikipedia2wikidata_trie)
    print('loaded trie')
    num_lines = count_lines(args.category_links)
    num_ids = count_lines(args.wikidata_ids)
    missing = []
    num_missing = 0
    num_broken = 0
    all_category_links = [[] for _ in range(num_ids)]
    with open(args.category_links, 'rt') as fin:
        fin_pbar = get_progress_bar('reading category_links',
                                    max_value=num_lines)(fin)
        for line in fin_pbar:
            try:
                origin, dest = line.rstrip('\n').split('\t')
            except ValueError:
                # line did not contain exactly one tab separator
                num_broken += 1
                continue
            if len(dest) == 0:
                num_broken += 1
                continue
            origin = args.prefix + '/' + origin
            prefixed_dest = args.prefix + '/' + dest
            origin_index = trie.get(origin, None)
            dest_index = trie.get(prefixed_dest, None)
            if dest_index is None:
                # retry with the first character uppercased:
                prefixed_dest = args.prefix + '/' + dest[0].upper() + dest[1:]
                dest_index = trie.get(prefixed_dest, None)
            if origin_index is None or dest_index is None:
                missing.append((origin, prefixed_dest))
                num_missing += 1
            else:
                all_category_links[origin_index[0][0]].append(dest_index[0][0])
    print("%d/%d category links could not be found in wikidata" %
          (num_missing, num_lines))
    print("%d/%d category links were malformed" % (num_broken, num_lines))
    print("Missing links sample:")
    for origin, dest in missing[:10]:
        print("%r -> %r" % (origin, dest))
    save_record_with_offset(
        join(args.out, "wikidata_%s_category_links" % (args.prefix,)),
        all_category_links)

def get_cached_satisfy(collection, aucs, ids, mmap=False):
    # Cache (and reload) the boolean matrix of which `ids` satisfy each
    # (qid, relation_name) key of `aucs`.
    path = join(SCRIPT_DIR, "cached_satisfy.npy")
    if not exists(path):
        # np.bool_ used here since the np.bool alias was removed in newer NumPy:
        cached_satisfy = np.zeros((len(aucs), len(ids)), dtype=np.bool_)
        for row, (qid, relation_name) in get_progress_bar(
                "satisfy", item="types")(enumerate(sorted(aucs.keys()))):
            cached_satisfy[row, :] = collection.satisfy(
                [relation_name], [collection.name2index[qid]])[ids]
            collection._satisfy_cache.clear()
        np.save(path, cached_satisfy)
        if mmap:
            del cached_satisfy
            cached_satisfy = np.load(path, mmap_mode="r")
    else:
        if mmap:
            cached_satisfy = np.load(path, mmap_mode="r")
        else:
            cached_satisfy = np.load(path)
    return cached_satisfy

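# Hypothetical usage sketch (names and call pattern are assumptions, not from
# the original file): rows of the cached matrix follow sorted(aucs.keys()),
# where each key is a (qid, relation_name) pair, and columns follow `ids`
# (assumed to be a NumPy array so boolean indexing works).
def type_members(collection, aucs, ids, qid, relation_name):
    cached_satisfy = get_cached_satisfy(collection, aucs, ids, mmap=True)
    row = sorted(aucs.keys()).index((qid, relation_name))
    return ids[cached_satisfy[row]]
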
def main():
    args = parse_args()
    approx_max_quantity = 24642416
    pbar = get_progress_bar('compress wikidata',
                            max_value=approx_max_quantity,
                            item='entities')
    pbar.start()
    seen = 0
    with open(args.out, "wb") as fout:
        for doc in open_wikidata_file(args.wikidata, 1000):
            seen += 1
            # drop bulky fields that are not needed downstream:
            if 'descriptions' in doc:
                del doc['descriptions']
            if 'labels' in doc:
                del doc['labels']
            if 'aliases' in doc:
                del doc['aliases']
            for claims in doc['claims'].values():
                for claim in claims:
                    if 'id' in claim:
                        del claim['id']
                    if 'rank' in claim:
                        del claim['rank']
                    if 'references' in claim:
                        for ref in claim['references']:
                            if 'hash' in ref:
                                del ref['hash']
                    if 'qualifiers' in claim:
                        # 'qualifiers' maps property ids to lists of snaks,
                        # so iterate the snaks to strip their hashes:
                        for qualifier_list in claim['qualifiers'].values():
                            for qualifier in qualifier_list:
                                if 'hash' in qualifier:
                                    del qualifier['hash']
            fout.write(msgpack.packb(doc))
            if seen % 1000 == 0 and seen < approx_max_quantity:
                pbar.update(seen)
    pbar.finish()

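# A minimal reading sketch (not part of the original script): the output file
# is a plain concatenation of msgpack-packed documents, so it can be streamed
# back with msgpack.Unpacker. `compressed_path` is a hypothetical argument name.
import msgpack

def iter_compressed_wikidata(compressed_path):
    with open(compressed_path, "rb") as fin:
        for doc in msgpack.Unpacker(fin, raw=False):
            yield doc
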
def fix_and_parse_tags(config, collection, size):
    trie_index2indices = OffsetArray.load(
        join(config.language_path, "trie_index2indices"),
        compress=True)
    trie_index2indices_counts = OffsetArray(
        np.load(join(config.language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)
    if exists(join(config.language_path,
                   "trie_index2indices_transition_values.npy")):
        trie_index2indices_transitions = OffsetArray(
            np.load(join(config.language_path,
                         "trie_index2indices_transition_values.npy")),
            np.load(join(config.language_path,
                         "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None
    anchor_trie = marisa_trie.Trie().load(
        join(config.language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    prefix = get_prefix(config)
    redirections = load_redirections(config.redirections)
    docs = load_wikipedia_docs(config.wiki, size)
    while True:
        try:
            collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
        except (ValueError,) as e:
            print("issue reading blacklist, please fix.")
            print(str(e))
            enter_or_quit()
            continue
        break
    print("Load first_names")
    with open(join(PROJECT_DIR, "data", "first_names.txt"), "rt") as fin:
        first_names = set(fin.read().splitlines())
    all_tags = []
    for doc in get_progress_bar('fixing links', item='article')(docs):
        tags = obtain_tags(
            doc,
            wiki_trie=wiki_trie,
            anchor_trie=anchor_trie,
            trie_index2indices=trie_index2indices,
            trie_index2indices_counts=trie_index2indices_counts,
            trie_index2indices_transitions=trie_index2indices_transitions,
            redirections=redirections,
            prefix=prefix,
            first_names=first_names,
            collection=collection,
            fix_destination=fix_destination,
            min_count=config.min_count,
            min_percent=config.min_percent)
        if any(x is not None for _, x in tags):
            all_tags.append(tags)
    collection.reset_cache()
    return all_tags

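# A small illustrative sketch (not from the original code): each element of
# `all_tags` is a per-article list of pairs whose second element is the
# resolved tag or None, so resolved links can be counted directly.
def count_resolved_links(all_tags):
    return sum(1 for tags in all_tags for _, tag in tags if tag is not None)
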
def get_wikidata_mapping(name2id_path, wikidata_ids_path, jsons,
                         relation_names, verbose=False):
    approx_max_quantity = 24642416
    if verbose:
        pbar = None
        from IPython.display import clear_output
    else:
        pbar = get_progress_bar("collect wikilinks",
                                max_value=approx_max_quantity)
        pbar.start()
        clear_output = None
    wikidata_ids = []
    entity_types = []
    subclass = []
    seen = 0
    # one output file per requested relation, flagged as temporal or not:
    relations = {
        name: (open(outfile, "wt"), is_temporal)
        for name, outfile, is_temporal in relation_names
    }
    fout_name2id = None if true_exists(name2id_path) else open(
        name2id_path, "wt")
    fout_wikidata_ids = None if true_exists(wikidata_ids_path) else open(
        wikidata_ids_path, "wt")
    try:
        t_then = time.time()
        seen_last = 0
        speed = None
        index = 0
        for doc in jsons:
            seen += 1
            if seen % 2000 == 0:
                if verbose:
                    # exponentially-smoothed throughput estimate for the ETA:
                    t_now = time.time()
                    new_speed = (seen - seen_last) / (t_now - t_then)
                    if speed is None:
                        speed = new_speed
                    else:
                        speed = 0.9 * speed + 0.1 * new_speed
                    clear_output(wait=True)
                    print("%.3f%% done (%d seen, %.3f docs/s, ETA: %ds)" % (
                        100.0 * seen / approx_max_quantity,
                        seen,
                        speed,
                        int((approx_max_quantity - seen) / speed)),
                        flush=True)
                    seen_last = seen
                    t_then = t_now
                else:
                    if seen < approx_max_quantity:
                        pbar.update(seen)
            if fout_name2id is not None:
                if "sitelinks" in doc:
                    for key, value in doc["sitelinks"].items():
                        if key.endswith("wiki"):
                            fout_name2id.write(key + "/" + value["title"] +
                                               "\t" + str(index) + "\n")
            # `index` tracks the document ordinal so that every sitelink of a
            # document points at the same row of the wikidata ids file:
            index += 1
            if fout_wikidata_ids is not None:
                fout_wikidata_ids.write(doc["id"] + "\n")
            for name, (outfile, is_temporal) in relations.items():
                if is_temporal:
                    outfile.write("\t".join(
                        get_claim_time(doc["claims"].get(name, []))) + "\n")
                else:
                    outfile.write("\t".join(
                        get_related_entities(doc["claims"].get(name, []))) + "\n")
        if pbar is not None:
            pbar.finish()
    finally:
        for name, (outfile, _) in relations.items():
            outfile.close()
        if fout_name2id is not None:
            fout_name2id.close()
        if fout_wikidata_ids is not None:
            fout_wikidata_ids.close()

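# Hypothetical call sketch (file names here are placeholders, not from the
# original code): `relation_names` is a list of (property_id, output_path,
# is_temporal) triples, the shape consumed by the dict comprehension above.
# P31 ("instance of") is a regular relation, P569 ("date of birth") a temporal one.
def example_get_wikidata_mapping():
    get_wikidata_mapping(
        "wikititle2wikidata.tsv",
        "wikidata_ids.txt",
        open_wikidata_file("wikidata-latest-all.json.bz2", 1000),
        [("P31", "wikidata_P31.txt", False),
         ("P569", "wikidata_P569.txt", True)])
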
def main():
    args = parse_args()
    makedirs(args.wikidata, exist_ok=True)
    wikidata_names2prop_names = property_names(
        join(PROJECT_DIR, "data", "wikidata", 'wikidata_property_names.json'))
    wikidata_names2temporal_prop_names = temporal_property_names(
        join(PROJECT_DIR, "data", "wikidata",
             'wikidata_time_property_names.json'))
    # fields to make easily accessible:
    important_properties = [
        wikidata_properties.INSTANCE_OF,
        wikidata_properties.SUBCLASS_OF,
        wikidata_properties.PART_OF,
        wikidata_properties.OCCUPATION,
        wikidata_properties.FIELD_OF_WORK,
        wikidata_properties.FIELD_OF_THIS_OCCUPATION,
        wikidata_properties.MEDICAL_SPECIALITY,
        wikidata_properties.GENRE,
        wikidata_properties.SEX_OR_GENDER,
        wikidata_properties.COUNTRY_OF_CITIZENSHIP,
        wikidata_properties.COUNTRY,
        wikidata_properties.CONTINENT,
        wikidata_properties.LOCATED_IN_THE_ADMINISTRATIVE_TERRITORIAL_ENTITY,
        wikidata_properties.SPORT,
        wikidata_properties.STUDIES,
        wikidata_properties.SERIES,
        wikidata_properties.USE,
        wikidata_properties.LOCATION,
        wikidata_properties.FACE_OF,
        wikidata_properties.IS_A_LIST_OF,
        wikidata_properties.COUNTRY_OF_ORIGIN,
        wikidata_properties.PRODUCT_OR_MATERIAL_PRODUCED,
        wikidata_properties.INDUSTRY,
        wikidata_properties.PARENT_TAXON,
        wikidata_properties.APPLIES_TO_TERRITORIAL_JURISDICTION,
        wikidata_properties.POSITION_HELD,
        wikidata_properties.CATEGORYS_MAIN_TOPIC,
        # temporal properties
        wikidata_properties.PUBLICATION_DATE,
        wikidata_properties.DATE_OF_BIRTH,
        wikidata_properties.DATE_OF_DEATH,
        wikidata_properties.INCEPTION,
        wikidata_properties.DISSOLVED_OR_ABOLISHED,
        wikidata_properties.POINT_IN_TIME,
        wikidata_properties.START_TIME,
        wikidata_properties.END_TIME
    ]
    prop_names2wikidata_names = {
        value: key for key, value in wikidata_names2prop_names.items()
    }
    wikidata_important_properties = [
        prop_names2wikidata_names[prop] for prop in important_properties
    ]
    wikidata_important_properties_fnames = [
        (name, join(args.wikidata, "wikidata_%s.txt" % (name,)),
         name in wikidata_names2temporal_prop_names)
        for name in wikidata_important_properties
    ]
    missing_wikidata_important_properties_fnames = [
        (name, outfile, is_temporal)
        for name, outfile, is_temporal in wikidata_important_properties_fnames
        if not true_exists(outfile)
    ]
    wikidata_ids_path = join(args.wikidata, WIKIDATA_IDS_NAME)
    wikititle2wikidata_path = join(args.wikidata, WIKITILE_2_WIKIDATA_TSV_NAME)
    work_to_be_done = (not true_exists(wikidata_ids_path) or
                       not true_exists(wikititle2wikidata_path) or
                       len(missing_wikidata_important_properties_fnames) > 0)
    if work_to_be_done:
        get_wikidata_mapping(
            wikititle2wikidata_path,
            wikidata_ids_path,
            open_wikidata_file(args.wikidata_dump, args.batch_size),
            missing_wikidata_important_properties_fnames)
    numpy_wikidata_important_properties_fnames = [
        (name, outfile, is_temporal)
        for name, outfile, is_temporal in wikidata_important_properties_fnames
        if not values_exist(join(args.wikidata, "wikidata_%s" % (name,)))
    ]
    # obtain a mapping from id -> number
    if len(numpy_wikidata_important_properties_fnames) > 0:
        _, id2index = load_wikidata_ids(args.wikidata)
        # make relations numerical:
        for relname, outfile, is_temporal in numpy_wikidata_important_properties_fnames:
            with open(outfile, "rt") as fin:
                lines = fin.read().splitlines()
            fin_pbar = get_progress_bar("loading relation %r" % (relname,))(lines)
            if is_temporal:
                value = np.zeros(len(lines) * 2 + 1, dtype=np.int32)
                position = 1
                seen = 0
                for idx, line in enumerate(fin_pbar):
                    for wikidata_id in line.split('\t'):
                        if len(wikidata_id) > 0:
                            value[position] = idx
                            value[position + 1] = parse_year(wikidata_id)
                            position += 2
                            seen += 1
                            break
                value[0] = len(lines)
                value = value[:position]
                np.save(
                    join(args.wikidata,
                         "wikidata_%s_values.sparse.npy" % (relname,)),
                    value)
            else:
                relation = [line2indices(id2index, line) for line in fin_pbar]
                save_record_with_offset(
                    join(args.wikidata, "wikidata_%s" % (relname,)),
                    relation)
        del id2index
    # convert the mapping from wikinames to integer values:
    trie_save_path = join(args.wikidata, WIKITILE_2_WIKIDATA_TRIE_NAME)
    if not true_exists(trie_save_path):
        print("loading wikipedia name -> wikidata")
        name2id = pandas.read_csv(wikititle2wikidata_path,
                                  sep="\t",
                                  encoding='utf-8')
        print("loaded")
        trie = marisa_trie.RecordTrie(
            'i',
            get_progress_bar("convert to trie", max_value=name2id.shape[0])(
                (key, (value,)) for _, key, value in name2id.itertuples()))
        trie.save(trie_save_path)
    build_fixed_point(args.wikidata, "enwiki")

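# A minimal decoding sketch (not part of the original pipeline) for the sparse
# temporal array saved above: value[0] holds the number of source lines and the
# remainder holds (line_index, year) pairs for lines with a parseable date.
# `path` is a hypothetical argument; entries left at 0 had no date.
def load_sparse_temporal_years(path):
    value = np.load(path)
    years = np.zeros(int(value[0]), dtype=np.int32)
    pairs = value[1:].reshape(-1, 2)
    years[pairs[:, 0]] = pairs[:, 1]
    return years
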
def fix(collection, offsets, values, counts, anchor_length,
        num_category_link=8, keep_min=5):
    relations_that_can_extend = [
        {"steps": [wprop.INSTANCE_OF]},
        {"steps": [wprop.INSTANCE_OF, (wprop.SUBCLASS_OF, 2)]},
        {"steps": [wprop.INSTANCE_OF, wprop.FACET_OF]},
        {"steps": [(wprop.SUBCLASS_OF, 3)]},
        {"steps": [wprop.OCCUPATION], "promote": True},
        {"steps": [wprop.POSITION_HELD], "promote": True},
        {"steps": [wprop.PART_OF, wprop.INSTANCE_OF]},
        {"steps": [wprop.SERIES, wprop.INSTANCE_OF]},
        {"steps": [wprop.SERIES, wprop.LOCATION]},
        {"steps": [wprop.LOCATED_IN_THE_ADMINISTRATIVE_TERRITORIAL_ENTITY]},
        {"steps": [wprop.COUNTRY]},
        {"steps": [wprop.CATEGORY_LINK, wprop.CATEGORYS_MAIN_TOPIC]},
        {"steps": [(wprop.CATEGORY_LINK, num_category_link), wprop.FIXED_POINTS]},
        {"steps": [wprop.CATEGORY_LINK, wprop.FIXED_POINTS, wprop.IS_A_LIST_OF]},
        {"steps": [wprop.IS_A_LIST_OF, (wprop.SUBCLASS_OF, 2)]}
    ]
    relation_data = get_relation_data(collection, relations_that_can_extend)
    new_values = values
    # get rid of "History of X" where the link also points to "X":
    is_history = IS_HISTORY[new_values]
    is_people_mask = IS_PEOPLE[new_values]
    is_list = IS_LIST_ARTICLE[new_values]
    new_values = related_promote_highest(new_values,
                                         offsets,
                                         counts,
                                         condition=is_history,
                                         alternative=is_people_mask,
                                         keep_min=keep_min)
    unchanged = values == new_values
    is_not_history_or_list = logical_and(logical_not(is_history),
                                         logical_not(is_list))
    new_values = related_promote_highest(new_values,
                                         offsets,
                                         counts,
                                         condition=logical_and(is_history, unchanged),
                                         alternative=is_not_history_or_list,
                                         keep_min=keep_min)
    is_sport_or_thoroughfare = logical_or(IS_EVENT_SPORT,
                                          IS_THOROUGHFARE)[new_values]
    # delete these references:
    new_values[anchor_length < 2] = -1
    # get rid of shorthand for sports:
    new_values[logical_and(is_sport_or_thoroughfare, anchor_length <= 2)] = -1
    # remove lists of episodes:
    is_episode_list = IS_EPISODE_LIST[new_values]
    new_values[is_episode_list] = -1
    # get rid of "car" -> "Renault Megane" when "car" -> "Car" exists,
    # and "Renault Megane" is an instance of "Car":
    is_not_people = logical_not(IS_PEOPLE)[new_values]
    new_values = extend_relations(relation_data,
                                  new_values,
                                  offsets,
                                  counts,
                                  alternative=is_not_people,
                                  pbar=get_progress_bar("extend_relations",
                                                        max_value=len(offsets),
                                                        item="links"),
                                  keep_min=keep_min)
    unchanged = values == new_values
    # remove all non-modified values that are not instances of anything,
    # nor subclasses of anything:
    new_values[logical_ands([
        logical_ands([
            collection.relation(wprop.INSTANCE_OF).edges() == 0,
            collection.relation(wprop.SUBCLASS_OF).edges() == 0,
            collection.relation(wprop.PART_OF).edges() == 0,
            collection.relation(wprop.CATEGORY_LINK).edges() == 0
        ])[new_values],
        unchanged
    ])] = -1
    is_kinship = IS_KINSHIP[new_values]
    is_human = IS_CHARACTER_HUMAN[new_values]
    new_values = related_promote_highest(new_values,
                                         offsets,
                                         counts,
                                         condition=is_human,
                                         alternative=is_kinship,
                                         keep_min=keep_min)
    # replace elements by a country if a better alternative is present
    # and the count is less than 100:
    should_replace_by_country = logical_ands([
        logical_not(
            logical_ors([
                IS_POLITICAL_ORGANIZATION, IS_CARDINAL_DIRECTION,
                IS_LANGUAGE_ALPHABET, IS_COUNTRY, IS_PEOPLE_GROUP,
                IS_BREED, IS_BATTLE, IS_SOCIETY, IS_POSITION,
                IS_POLITICAL_PARTY, IS_SPORTS_TEAM, IS_CHARACTER_HUMAN,
                IS_LANDFORM, IS_ACTIVITY
            ]))[new_values],
        counts < 100
    ])
    # turn this into a promote-highest in this order:
    is_country_or_cardinal = [
        IS_CARDINAL_DIRECTION, IS_COUNTRY, IS_POLITICAL_ORGANIZATION
    ]
    for i, alternative in enumerate(is_country_or_cardinal):
        unchanged = values == new_values
        should_replace_by_country = logical_and(should_replace_by_country,
                                                unchanged)
        new_values = related_promote_highest(
            new_values,
            offsets,
            counts,
            condition=should_replace_by_country,
            alternative=alternative[new_values],
            keep_min=keep_min)
    new_offsets, new_values, new_counts, location_shift = reduce_values(
        offsets, new_values, counts)
    return (new_offsets, new_values, new_counts), location_shift