Esempio n. 1
0
async def get_fact(query, args, tokenizer, trex_set, common_vocab, f_out):
    """
    Collect more facts for the TREx-train set from LPAQA
    """
    line = query.strip().split('\t')
    sub_url, sub, obj_url, obj = line
    sub_id = get_id_from_url(sub_url)
    obj_id = get_id_from_url(obj_url)

    # First, make sure fact is not in TREx test set
    if (sub_id, obj_id) in trex_set:
        return

    # Make sure object is a single token
    if len(tokenizer.tokenize(obj)) != 1:
        return

    # Make sure object is in common vocab subset
    if obj not in common_vocab:
        return

    # Make sure subject is prominent (has a Wikipedia page)
    try:
        q_dict = get_entity_dict_from_api(sub_id)
        q = WikidataItem(q_dict)
        if not q.get_sitelinks():
            return
    except ValueError:
        return

    # Some entities don't have labels so the subject label is the URI
    if sub_id == sub:
        return

    # print('Writing fact: {} - {}', sub, obj)
    f_out.write(
        json.dumps({
            'sub_uri': sub_id,
            'obj_uri': obj_id,
            'sub_label': sub,
            'obj_label': obj
        }) + '\n')

    # Increment global count
    await increment_count()
Esempio n. 2
0
def get_place_from_wikidata(entity_id):
    parents = set()
    entity = WikidataItem(get_entity_dict_from_api(entity_id))
    claims_groups = entity.get_truthy_claim_groups()
    place = Place()
    place.set_gramps_id(entity_id)

    name = PlaceName()
    name.set_language('sv')
    name.set_value(entity.get_label('sv'))
    place.set_name(name=name)

    place.set_title(entity.get_label('sv'))
    for lang in ['sv', 'en', 'de', 'fi', 'no', 'nn', 'da', 'se']:
        wiki_name = entity.get_label(lang)
        if len(wiki_name):
            place_name = PlaceName()
            place_name.set_language(lang)
            place_name.set_value(wiki_name)
            place.add_alternative_name(name=place_name)
            for alias in entity.get_aliases(lang):
                alt_name = PlaceName()
                alt_name.set_language(lang)
                alt_name.set_value(alias)
                place.add_alternative_name(name=alt_name)

        for link in entity.get_sitelinks(lang).values():
            wikipedia_url = Url()
            wikipedia_url.set_path(link['url'])
            wikipedia_url.set_type('Wikipedia entry')
            wikipedia_url.set_description('Wikipedia %s:%s' %
                                          (link["title"], link["site"]))
            place.add_url(wikipedia_url)

    # Instance of -> PlaceType
    if PROPERTY_INSTANCE_OF in claims_groups:
        for claim in claims_groups[PROPERTY_INSTANCE_OF]:
            instance_of = claim.mainsnak.datavalue.value['id']
            if ITEM_PARISH == instance_of:
                place.set_type(PlaceType.PARISH)
            elif ITEM_SOCKEN == instance_of:
                place.set_type(PlaceType.PARISH)
            elif ITEM_ISLAND == instance_of:
                place.set_type(PlaceType.UNKNOWN)  # No islands in Gramps
            elif ITEM_MUNICIPALITY_OF_SWEDEN == instance_of:
                place.set_type(PlaceType.MUNICIPALITY)
            elif ITEM_MUNICIPALITY == instance_of:
                place.set_type(PlaceType.MUNICIPALITY)
            elif ITEM_COUNTRY == instance_of:
                place.set_type(PlaceType.COUNTRY)
            elif ITEM_SOVEREIGN_STATE == instance_of:
                place.set_type(PlaceType.COUNTRY)
            elif ITEM_STATE_OF_US == instance_of:
                place.set_type(PlaceType.STATE)
            elif ITEM_FEDERAL_STATE == instance_of:
                place.set_type(PlaceType.STATE)
            elif ITEM_COUNTY == instance_of:
                place.set_type(PlaceType.COUNTY)
            elif ITEM_COUNTY_OF_SWEDEN == instance_of:
                place.set_type(PlaceType.COUNTY)
            elif ITEM_FORMER_COUNTY_OF_SWEDEN == instance_of:
                place.set_type(PlaceType.COUNTY)
            elif ITEM_PROVINCE_OF_SWEDEN == instance_of:
                place.set_type(PlaceType.PROVINCE)
            elif ITEM_PROVINCE == instance_of:
                place.set_type(PlaceType.PROVINCE)
            elif ITEM_ADM_REGION == instance_of:
                place.set_type(PlaceType.REGION)
            elif ITEM_NEIGHBORHOOD == instance_of:
                place.set_type(PlaceType.NEIGHBORHOOD)
            elif ITEM_DISTRICT == instance_of:
                place.set_type(PlaceType.DISTRICT)
            elif ITEM_BOROUGH == instance_of:
                place.set_type(PlaceType.BOROUGH)
            elif ITEM_TOWN == instance_of:
                place.set_type(PlaceType.TOWN)
            elif ITEM_LARGE_VILLAGE == instance_of:
                place.set_type(PlaceType.VILLAGE)
            elif ITEM_VILLAGE == instance_of:
                place.set_type(PlaceType.VILLAGE)
            elif ITEM_URBAN_AREA_IN_SWEDEN == instance_of:
                place.set_type(PlaceType.VILLAGE)
            elif ITEM_HAMLET == instance_of:
                place.set_type(PlaceType.HAMLET)
            elif ITEM_FARM == instance_of:
                place.set_type(PlaceType.FARM)
            elif ITEM_BUILDING == instance_of:
                place.set_type(PlaceType.BUILDING)

    if PROPERTY_COORDINATE_LOCATION in claims_groups:
        for claim in claims_groups[PROPERTY_COORDINATE_LOCATION]:
            datavalue = claim.mainsnak.datavalue
            place.set_latitude(str(datavalue.value['latitude']))
            place.set_longitude(str(datavalue.value['longitude']))

    extract_located_in(claims_groups, PROPERTY_LOCATED_IN_PRESENT, parents)
    extract_located_in(claims_groups, PROPERTY_LOCATED_IN_ADM, parents)
    extract_located_in(claims_groups, PROPERTY_LOCATED, parents)

    return place, parents