# NOTE: helpers such as join, true_exists, count_lines, save_record_with_offset,
# fixed_point_name_alternates, and the WIKI*_NAME constants are imported from
# the surrounding project's utility modules (not shown in this excerpt).
def build_fixed_point(out, prefix):
    """Map each '<prefix>/Category:X' page onto the article '<prefix>/X'
    (or an accepted alternate spelling of it), storing one wikidata index
    per category."""
    wiki_fixed_point_save = join(
        out, "wikidata_%s_fixed_points_values.npy" % (prefix,))
    if not true_exists(wiki_fixed_point_save):
        print("building %s fixed point property." % (prefix,))
        trie = marisa_trie.RecordTrie('i').load(
            join(out, WIKITILE_2_WIKIDATA_TRIE_NAME))
        num_items = count_lines(join(out, WIKIDATA_IDS_NAME))
        fixed_point_relation = {}
        category_prefix = "%s/Category:" % (prefix,)
        article_prefix = "%s/" % (prefix,)
        # walk every category page and look for an article with the same title:
        relevant_items = trie.iteritems(category_prefix)
        for name, category_idx in relevant_items:
            article_name = article_prefix + name[len(category_prefix):]
            for fixed_point_name_alternate in fixed_point_name_alternates(article_name):
                matches = trie.get(fixed_point_name_alternate, None)
                if matches is not None and len(matches) > 0:
                    fixed_point_relation[category_idx] = [matches[0][0]]
                    break
        print("Found %d fixed point relations for %s" % (
            len(fixed_point_relation), prefix))
        save_record_with_offset(
            join(out, "wikidata_%s_fixed_points" % (prefix,)),
            fixed_point_relation, num_items)
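# `fixed_point_name_alternates` is a project helper not shown in this excerpt.
# As a rough, hypothetical sketch of the behavior build_fixed_point relies on
# (an assumption, not the project's actual implementation), it would return
# the article title plus simple casing variants to retry against the trie:
def fixed_point_name_alternates_sketch(article_name):
    prefix, _, title = article_name.rpartition('/')
    variants = [
        article_name,
        "%s/%s%s" % (prefix, title[:1].upper(), title[1:]),
        "%s/%s%s" % (prefix, title[:1].lower(), title[1:]),
    ]
    # drop duplicates while keeping order
    return list(dict.fromkeys(variants))
# e.g. fixed_point_name_alternates_sketch("enwiki/physics")
#   -> ["enwiki/physics", "enwiki/Physics"]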
# (entry point of a separate script in the same pipeline)
def main():
    args = parse_args()
    trie = marisa_trie.RecordTrie('i').load(args.wikipedia2wikidata_trie)
    print('loaded trie')
    num_lines = count_lines(args.category_links)
    num_ids = count_lines(args.wikidata_ids)
    missing = []
    num_missing = 0
    num_broken = 0
    all_category_links = [[] for _ in range(num_ids)]
    with open(args.category_links, 'rt') as fin:
        fin_pbar = get_progress_bar('reading category_links',
                                    max_value=num_lines)(fin)
        for line in fin_pbar:
            try:
                origin, dest = line.rstrip('\n').split('\t')
            except ValueError:
                num_broken += 1
                continue
            if len(dest) == 0:
                num_broken += 1
                continue
            origin = args.prefix + '/' + origin
            prefixed_dest = args.prefix + '/' + dest
            origin_index = trie.get(origin, None)
            dest_index = trie.get(prefixed_dest, None)
            if dest_index is None:
                # retry with the first letter capitalized, since wikipedia
                # titles are stored with an uppercase initial:
                prefixed_dest = args.prefix + '/' + dest[0].upper() + dest[1:]
                dest_index = trie.get(prefixed_dest, None)
            if origin_index is None or dest_index is None:
                missing.append((origin, prefixed_dest))
                num_missing += 1
            else:
                all_category_links[origin_index[0][0]].append(dest_index[0][0])
    print("%d/%d category links could not be found in wikidata" %
          (num_missing, num_lines))
    print("%d/%d category links were malformed" % (num_broken, num_lines))
    print("Missing links sample:")
    for origin, dest in missing[:10]:
        print("%r -> %r" % (origin, dest))
    save_record_with_offset(
        join(args.out, "wikidata_%s_category_links" % (args.prefix,)),
        all_category_links)
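# The category_links input above is expected to be a two-column TSV
# ("origin<TAB>dest" per line). Below is a minimal, self-contained
# illustration of the RecordTrie lookup pattern used in main(), with
# hypothetical titles; RecordTrie maps each key to a list of packed
# records, hence the matches[0][0] unpacking:
import marisa_trie

def _demo_trie_lookup():
    pairs = [("enwiki/Physics", (0,)), ("enwiki/Category:Physics", (1,))]
    trie = marisa_trie.RecordTrie('i', pairs)
    matches = trie.get("enwiki/Physics", None)
    assert matches == [(0,)]
    # lookups are case-sensitive, which is why main() retries with an
    # uppercased first letter:
    assert trie.get("enwiki/physics", None) is None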
# (entry point of a separate script in the same pipeline; property_names,
# temporal_property_names, get_wikidata_mapping, open_wikidata_file,
# values_exist, load_wikidata_ids, parse_year, and line2indices are project
# helpers not shown in this excerpt)
def main():
    args = parse_args()
    makedirs(args.wikidata, exist_ok=True)
    wikidata_names2prop_names = property_names(
        join(PROJECT_DIR, "data", "wikidata", 'wikidata_property_names.json'))
    wikidata_names2temporal_prop_names = temporal_property_names(
        join(PROJECT_DIR, "data", "wikidata", 'wikidata_time_property_names.json'))
    # fields to make easily accessible:
    important_properties = [
        wikidata_properties.INSTANCE_OF,
        wikidata_properties.SUBCLASS_OF,
        wikidata_properties.PART_OF,
        wikidata_properties.OCCUPATION,
        wikidata_properties.FIELD_OF_WORK,
        wikidata_properties.FIELD_OF_THIS_OCCUPATION,
        wikidata_properties.MEDICAL_SPECIALITY,
        wikidata_properties.GENRE,
        wikidata_properties.SEX_OR_GENDER,
        wikidata_properties.COUNTRY_OF_CITIZENSHIP,
        wikidata_properties.COUNTRY,
        wikidata_properties.CONTINENT,
        wikidata_properties.LOCATED_IN_THE_ADMINISTRATIVE_TERRITORIAL_ENTITY,
        wikidata_properties.SPORT,
        wikidata_properties.STUDIES,
        wikidata_properties.SERIES,
        wikidata_properties.USE,
        wikidata_properties.LOCATION,
        wikidata_properties.FACE_OF,
        wikidata_properties.IS_A_LIST_OF,
        wikidata_properties.COUNTRY_OF_ORIGIN,
        wikidata_properties.PRODUCT_OR_MATERIAL_PRODUCED,
        wikidata_properties.INDUSTRY,
        wikidata_properties.PARENT_TAXON,
        wikidata_properties.APPLIES_TO_TERRITORIAL_JURISDICTION,
        wikidata_properties.POSITION_HELD,
        wikidata_properties.CATEGORYS_MAIN_TOPIC,
        # temporal properties
        wikidata_properties.PUBLICATION_DATE,
        wikidata_properties.DATE_OF_BIRTH,
        wikidata_properties.DATE_OF_DEATH,
        wikidata_properties.INCEPTION,
        wikidata_properties.DISSOLVED_OR_ABOLISHED,
        wikidata_properties.POINT_IN_TIME,
        wikidata_properties.START_TIME,
        wikidata_properties.END_TIME
    ]
    prop_names2wikidata_names = {
        value: key for key, value in wikidata_names2prop_names.items()
    }
    wikidata_important_properties = [
        prop_names2wikidata_names[prop] for prop in important_properties
    ]
    wikidata_important_properties_fnames = [
        (name, join(args.wikidata, "wikidata_%s.txt" % (name,)),
         name in wikidata_names2temporal_prop_names)
        for name in wikidata_important_properties
    ]
    missing_wikidata_important_properties_fnames = [
        (name, outfile, is_temporal)
        for name, outfile, is_temporal in wikidata_important_properties_fnames
        if not true_exists(outfile)
    ]
    wikidata_ids_path = join(args.wikidata, WIKIDATA_IDS_NAME)
    wikititle2wikidata_path = join(args.wikidata, WIKITILE_2_WIKIDATA_TSV_NAME)
    work_to_be_done = (not true_exists(wikidata_ids_path) or
                       not true_exists(wikititle2wikidata_path) or
                       len(missing_wikidata_important_properties_fnames) > 0)
    if work_to_be_done:
        get_wikidata_mapping(
            wikititle2wikidata_path,
            wikidata_ids_path,
            open_wikidata_file(args.wikidata_dump, args.batch_size),
            missing_wikidata_important_properties_fnames)
    numpy_wikidata_important_properties_fnames = [
        (name, outfile, is_temporal)
        for name, outfile, is_temporal in wikidata_important_properties_fnames
        if not values_exist(join(args.wikidata, "wikidata_%s" % (name,)))
    ]
    # obtain a mapping from id -> number
    if len(numpy_wikidata_important_properties_fnames) > 0:
        _, id2index = load_wikidata_ids(args.wikidata)
        # make relations numerical:
        for relname, outfile, is_temporal in numpy_wikidata_important_properties_fnames:
            with open(outfile, "rt") as fin:
                lines = fin.read().splitlines()
            fin_pbar = get_progress_bar("loading relation %r" % (relname,))(lines)
            if is_temporal:
                value = np.zeros(len(lines) * 2 + 1, dtype=np.int32)
                position = 1
                seen = 0
                for idx, line in enumerate(fin_pbar):
                    # take the first non-empty id on this line:
                    for wikidata_id in line.split('\t'):
                        if len(wikidata_id) > 0:
                            value[position] = idx
                            value[position + 1] = parse_year(wikidata_id)
                            position += 2
                            seen += 1
                            break
                value[0] = len(lines)
                value = value[:position]
                np.save(
                    join(args.wikidata,
                         "wikidata_%s_values.sparse.npy" % (relname,)),
                    value)
            else:
                relation = [line2indices(id2index, line) for line in fin_pbar]
                save_record_with_offset(
                    join(args.wikidata, "wikidata_%s" % (relname,)),
                    relation)
        del id2index
    # convert the mapping from wikinames to integer values:
    trie_save_path = join(args.wikidata, WIKITILE_2_WIKIDATA_TRIE_NAME)
    if not true_exists(trie_save_path):
        print("loading wikipedia name -> wikidata")
        name2id = pandas.read_csv(wikititle2wikidata_path, sep="\t",
                                  encoding='utf-8')
        print("loaded")
        trie = marisa_trie.RecordTrie(
            'i',
            get_progress_bar("convert to trie", max_value=name2id.shape[0])(
                (key, (value,)) for _, key, value in name2id.itertuples()))
        trie.save(trie_save_path)
    build_fixed_point(args.wikidata, "enwiki")
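# The temporal arrays saved above use a flat layout: value[0] stores the total
# number of input lines, followed by (line_index, year) pairs for every line
# that had a non-empty id. A sketch of a reader for that layout (the path
# argument is assumed to be one of the "wikidata_%s_values.sparse.npy" files
# written above):
import numpy as np

def load_sparse_years(path):
    value = np.load(path)
    num_lines = int(value[0])
    # pairs start at index 1: even offsets hold line indices, odd offsets years
    years = {int(value[i]): int(value[i + 1])
             for i in range(1, len(value), 2)}
    return num_lines, years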