Code example #1
def build_fixed_point(out, prefix):
    """Map every "<prefix>/Category:X" page in the title trie to a matching
    article page (found via fixed_point_name_alternates) and save the
    resulting relation with save_record_with_offset."""
    wiki_fixed_point_save = join(
        out, "wikidata_%s_fixed_points_values.npy" % (prefix, ))
    if not true_exists(wiki_fixed_point_save):
        print("building %s fixed point property." % (prefix, ))
        trie = marisa_trie.RecordTrie('i').load(
            join(out, WIKITILE_2_WIKIDATA_TRIE_NAME))
        num_items = count_lines(join(out, WIKIDATA_IDS_NAME))
        fixed_point_relation = {}

        category_prefix = "%s/Category:" % (prefix, )
        article_prefix = "%s/" % (prefix, )
        wikititle2wikidata_path = join(out, WIKITILE_2_WIKIDATA_TSV_NAME)
        relevant_items = trie.iteritems(category_prefix)

        for name, category_idx in relevant_items:
            article_name = article_prefix + name[len(category_prefix):]
            for fixed_point_name_alternate in fixed_point_name_alternates(
                    article_name):
                matches = trie.get(fixed_point_name_alternate, None)
                if matches is not None and len(matches) > 0:
                    fixed_point_relation[category_idx] = [matches[0][0]]
                    break
        print("Found %d fixed point relations for %s" % (
            len(fixed_point_relation),
            prefix,
        ))
        save_record_with_offset(
            join(out, "wikidata_%s_fixed_points" % (prefix, )),
            fixed_point_relation, num_items)
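
A minimal sketch (not from the repository, assuming only the marisa_trie package) of the RecordTrie behaviour that build_fixed_point relies on: records are fixed-format tuples, iteritems() iterates keys by prefix, and get() returns a list of record tuples or the supplied default.

import marisa_trie

# keys map to single-int records ('i'); values are supplied as tuples
trie = marisa_trie.RecordTrie('i', [
    ("enwiki/Category:Physicists", (12,)),
    ("enwiki/Physicist", (34,)),
])
# prefix iteration, as used above to enumerate all category pages
for name, (idx,) in trie.iteritems("enwiki/Category:"):
    print(name, idx)            # enwiki/Category:Physicists 12
# get() returns a list of record tuples, or the default when the key is absent
matches = trie.get("enwiki/Physicist", None)
if matches is not None and len(matches) > 0:
    print(matches[0][0])        # 34
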
Code example #2
def main():
    args = parse_args()
    trie = marisa_trie.RecordTrie('i').load(args.wikipedia2wikidata_trie)
    print('loaded trie')

    num_lines = count_lines(args.category_links)
    num_ids = count_lines(args.wikidata_ids)
    missing = []
    num_missing = 0
    num_broken = 0
    all_category_links = [[] for _ in range(num_ids)]
    with open(args.category_links, 'rt') as fin:
        fin_pbar = get_progress_bar('reading category_links',
                                    max_value=num_lines)(fin)
        for line in fin_pbar:
            try:
                origin, dest = line.rstrip('\n').split('\t')
            except ValueError:
                # the line did not contain exactly two tab-separated fields
                num_broken += 1
                continue
            if len(dest) == 0:
                num_broken += 1
                continue
            origin = args.prefix + '/' + origin
            prefixed_dest = args.prefix + '/' + dest
            origin_index = trie.get(origin, None)
            dest_index = trie.get(prefixed_dest, None)

            if dest_index is None:
                # retry with the first letter uppercased, since MediaWiki
                # capitalizes the first character of page titles
                prefixed_dest = args.prefix + '/' + dest[0].upper() + dest[1:]
                dest_index = trie.get(prefixed_dest, None)

            if origin_index is None or dest_index is None:
                missing.append((origin, prefixed_dest))
                num_missing += 1
            else:
                all_category_links[origin_index[0][0]].append(dest_index[0][0])

    print("%d/%d category links could not be found in wikidata" %
          (num_missing, num_lines))
    print("%d/%d category links were malformed" % (num_broken, num_lines))
    print("Missing links sample:")
    for origin, dest in missing[:10]:
        print("%r -> %r" % (origin, dest))
    save_record_with_offset(
        join(args.out, "wikidata_%s_category_links" % (args.prefix, )),
        all_category_links)
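
A hypothetical helper (not part of the repository) that isolates the title lookup used above, including the uppercase-first-letter fallback, so the behaviour can be tested on its own:

def wikidata_index_for_title(trie, prefix, title):
    """Return the wikidata index for a prefixed title, or None if not found."""
    matches = trie.get(prefix + '/' + title, None)
    if matches is None and len(title) > 0:
        # MediaWiki stores page titles with an uppercase first letter,
        # so retry the capitalized form before giving up
        matches = trie.get(prefix + '/' + title[0].upper() + title[1:], None)
    return matches[0][0] if matches is not None else None
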
Code example #3
def main():
    args = parse_args()
    makedirs(args.wikidata, exist_ok=True)

    wikidata_names2prop_names = property_names(
        join(PROJECT_DIR, "data", "wikidata", 'wikidata_property_names.json'))
    wikidata_names2temporal_prop_names = temporal_property_names(
        join(PROJECT_DIR, "data", "wikidata",
             'wikidata_time_property_names.json'))
    # fields to make easily accessible:
    important_properties = [
        wikidata_properties.INSTANCE_OF,
        wikidata_properties.SUBCLASS_OF,
        wikidata_properties.PART_OF,
        wikidata_properties.OCCUPATION,
        wikidata_properties.FIELD_OF_WORK,
        wikidata_properties.FIELD_OF_THIS_OCCUPATION,
        wikidata_properties.MEDICAL_SPECIALITY,
        wikidata_properties.GENRE,
        wikidata_properties.SEX_OR_GENDER,
        wikidata_properties.COUNTRY_OF_CITIZENSHIP,
        wikidata_properties.COUNTRY,
        wikidata_properties.CONTINENT,
        wikidata_properties.LOCATED_IN_THE_ADMINISTRATIVE_TERRITORIAL_ENTITY,
        wikidata_properties.SPORT,
        wikidata_properties.STUDIES,
        wikidata_properties.SERIES,
        wikidata_properties.USE,
        wikidata_properties.LOCATION,
        wikidata_properties.FACE_OF,
        wikidata_properties.IS_A_LIST_OF,
        wikidata_properties.COUNTRY_OF_ORIGIN,
        wikidata_properties.PRODUCT_OR_MATERIAL_PRODUCED,
        wikidata_properties.INDUSTRY,
        wikidata_properties.PARENT_TAXON,
        wikidata_properties.APPLIES_TO_TERRITORIAL_JURISDICTION,
        wikidata_properties.POSITION_HELD,
        wikidata_properties.CATEGORYS_MAIN_TOPIC,
        # temporal properties
        wikidata_properties.PUBLICATION_DATE,
        wikidata_properties.DATE_OF_BIRTH,
        wikidata_properties.DATE_OF_DEATH,
        wikidata_properties.INCEPTION,
        wikidata_properties.DISSOLVED_OR_ABOLISHED,
        wikidata_properties.POINT_IN_TIME,
        wikidata_properties.START_TIME,
        wikidata_properties.END_TIME
    ]
    prop_names2wikidata_names = {
        value: key
        for key, value in wikidata_names2prop_names.items()
    }
    wikidata_important_properties = [
        prop_names2wikidata_names[prop] for prop in important_properties
    ]
    wikidata_important_properties_fnames = [
        (name,
         join(args.wikidata, "wikidata_%s.txt" % (name, )),
         name in wikidata_names2temporal_prop_names)
        for name in wikidata_important_properties
    ]

    missing_wikidata_important_properties_fnames = [
        (name, outfile, is_temporal)
        for name, outfile, is_temporal in wikidata_important_properties_fnames
        if not true_exists(outfile)
    ]

    wikidata_ids_path = join(args.wikidata, WIKIDATA_IDS_NAME)
    wikititle2wikidata_path = join(args.wikidata, WIKITILE_2_WIKIDATA_TSV_NAME)

    work_to_be_done = (not true_exists(wikidata_ids_path)
                       or not true_exists(wikititle2wikidata_path)
                       or len(missing_wikidata_important_properties_fnames) > 0)

    if work_to_be_done:
        get_wikidata_mapping(
            wikititle2wikidata_path, wikidata_ids_path,
            open_wikidata_file(args.wikidata_dump, args.batch_size),
            missing_wikidata_important_properties_fnames)

    numpy_wikidata_important_properties_fnames = [
        (name, outfile, is_temporal)
        for name, outfile, is_temporal in wikidata_important_properties_fnames
        if not values_exist(join(args.wikidata, "wikidata_%s" % (name, )))
    ]

    # obtain a mapping from id -> number
    if len(numpy_wikidata_important_properties_fnames) > 0:
        _, id2index = load_wikidata_ids(args.wikidata)
        # make relations numerical:
        for relname, outfile, is_temporal in numpy_wikidata_important_properties_fnames:
            with open(outfile, "rt") as fin:
                lines = fin.read().splitlines()
            fin_pbar = get_progress_bar("loading relation %r" %
                                        (relname, ))(lines)
            if is_temporal:
                # sparse layout: value[0] holds the number of input rows,
                # followed by (row index, parsed year) pairs for the rows
                # that had a value
                value = np.zeros(len(lines) * 2 + 1, dtype=np.int32)
                position = 1
                seen = 0
                for idx, line in enumerate(fin_pbar):
                    for wikidata_id in line.split('\t'):
                        if len(wikidata_id) > 0:
                            value[position] = idx
                            value[position + 1] = parse_year(wikidata_id)
                            position += 2
                            seen += 1
                            break
                value[0] = len(lines)
                value = value[:position]
                np.save(
                    join(args.wikidata,
                         "wikidata_%s_values.sparse.npy" % (relname, )), value)
            else:
                relation = [line2indices(id2index, line) for line in fin_pbar]
                save_record_with_offset(
                    join(args.wikidata, "wikidata_%s" % (relname, )), relation)
        del id2index

    # convert the mapping from wikinames to integer values:
    trie_save_path = join(args.wikidata, WIKITILE_2_WIKIDATA_TRIE_NAME)
    if not true_exists(trie_save_path):
        print("loading wikipedia name -> wikidata")
        name2id = pandas.read_csv(wikititle2wikidata_path,
                                  sep="\t",
                                  encoding='utf-8')
        print("loaded")
        trie = marisa_trie.RecordTrie(
            'i',
            get_progress_bar("convert to trie", max_value=name2id.shape[0])(
                (key, (value, )) for _, key, value in name2id.itertuples()))
        trie.save(trie_save_path)

    build_fixed_point(args.wikidata, "enwiki")
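
A minimal sketch (an assumption about downstream use, not code from the repository) of how the "*_values.sparse.npy" array written by the temporal branch above could be read back: value[0] holds the number of source rows, followed by (row index, parsed year) pairs for the rows that had a value.

import numpy as np

def load_sparse_years(path):
    value = np.load(path)
    num_rows = int(value[0])
    pairs = value[1:].reshape(-1, 2)    # columns: (row index, parsed year)
    years = np.zeros(num_rows, dtype=np.int32)
    years[pairs[:, 0]] = pairs[:, 1]    # rows without a value stay 0
    return years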