コード例 #1
0
def scrape_person(context, doc, url):
    """Build an 'eu-whoiswho' entity from a person page and emit it.

    :param context: crawler context; only ``context.emit(data=...)`` is used.
    :param doc: parsed page supporting ``find``/``findtext`` (presumably an
        lxml HTML element, since ``text_content()`` is called on its nodes).
    :param url: page URL, stored on the entity as-is.
    """
    breadcrumb = doc.find(
        './/span[@itemtype="http://data-vocabulary.org/Breadcrumb"]')

    # Collect the text of every breadcrumb child that is not blank.
    levels = []
    for node in breadcrumb:
        text = node.text_content()
        if text and text.strip():
            levels.append(text)
    # The first level is the institution and the last one is the person's
    # own name; neither belongs to the organisational hierarchy.
    levels = levels[1:-1]

    person_name = doc.find('.//h3[@itemprop="name"]').text_content()
    job_title = doc.findtext('.//td[@itemprop="jobTitle"]')

    entity = Entity.create('eu-whoiswho', make_id(person_name, job_title))
    entity.name = person_name
    entity.url = url
    entity.function = job_title

    address = entity.create_address()
    address.street = doc.findtext('.//span[@itemprop="streetAddress"]')
    address.postal_code = doc.findtext('.//span[@itemprop="postalCode"]')
    # The locality is stored in the free-text field of the address.
    address.text = doc.findtext('.//span[@itemprop="addressLocality"]')
    # The "telephone" itemprop is currently not extracted.

    # The second remaining hierarchy level (if any) is used as the program.
    if len(levels) > 1:
        entity.program = levels[1]

    context.emit(data=entity.to_dict())
コード例 #2
0
def handle_organisation(context, data):
    """Turn one organisation row into a 'kg-fiu-national' entity and emit it.

    :param context: crawler context; only ``context.emit(data=...)`` is used.
    :param data: iterable of cell values, matched positionally against the
        column names listed below.
    """
    columns = ["No",
               "Name",
               "Reason for inclusion",
               "Category of entity",
               "Date of inclusion"]
    record = dict(zip(columns, data))

    raw_name = record["Name"]
    reason = record["Reason for inclusion"]

    entity = Entity.create("kg-fiu-national", make_id(raw_name, reason))
    entity.type = entity.TYPE_ENTITY

    # A comma-separated name cell holds the primary name followed by aliases.
    names = raw_name.split(",") if "," in raw_name else [raw_name]
    primary, aliases = names[0], names[1:]
    entity.name = primary
    for alias in aliases:
        entity.create_alias(alias)

    entity.program = record["Category of entity"]
    entity.summary = reason
    entity.listed_at = record["Date of inclusion"]

    context.emit(data=entity.to_dict())
コード例 #3
0
def handle_individual(context, data):
    """Turn one individual's row into a 'kg-fiu-national' entity and emit it.

    :param context: crawler context; only ``context.emit(data=...)`` is used.
    :param data: iterable of cell values, matched positionally against the
        column names listed below.
    """
    header = ["No", "Last Name", "Name", "Middle Name", "Date of birth",
              "Place of birth", "Reason for inclusion",
              "Category of entity", "Date of inclusion"]
    data = dict(zip(header, data))

    entity_id = make_id(data["Last Name"],
                        data["Middle Name"],
                        data["Name"],
                        data["Reason for inclusion"])
    entity = Entity.create("kg-fiu-national", entity_id)
    entity.type = entity.TYPE_INDIVIDUAL
    entity.last_name = data["Last Name"]
    entity.first_name = data["Name"]
    entity.second_name = data["Middle Name"]

    birth_date = entity.create_birth_date()
    birth_date.date = data["Date of birth"]
    # The place of birth must go on a birth-place record; creating a second
    # birth-date record here would drop the place from the entity's birth
    # places and leave a stray, date-less birth date behind.
    birth_place = entity.create_birth_place()
    birth_place.place = data["Place of birth"]

    entity.program = data["Category of entity"]
    entity.summary = data["Reason for inclusion"]
    entity.listed_at = data["Date of inclusion"]

    context.emit(data=entity.to_dict())
コード例 #4
0
ファイル: crawler.py プロジェクト: brrttwrks/datacommons
def load_features(context, data, seen, extent):
    """Yield the attributes of every not-yet-seen feature inside ``extent``.

    The layer's ``query`` endpoint is requested for the given extent. When
    the response reports ``exceededTransferLimit``, the extent is split with
    ``split_envelope`` and each part is queried recursively.

    :param context: crawler context; only ``context.log`` is used here.
    :param data: mapping with ``token``, ``rest_url`` and ``id`` keys, plus an
        optional ``name``.
    :param seen: mutable set of feature ids already yielded; updated in place.
    :param extent: JSON-serialisable geometry sent as the ``geometry``
        query parameter.
    """
    params = QUERY.copy()
    token = data["token"]
    if token is not None:
        params["token"] = token
    params["geometry"] = json.dumps(extent)
    url = "%s/%s/query" % (data["rest_url"], data["id"])

    # TODO: context.http returns 403 errors here for an unknown reason.
    # Resetting the session, enabling stealth mode (randomised User-Agent)
    # and dropping all request headers did not help, so plain `requests`
    # is used instead because it works.
    payload = requests.get(url, params=params).json()

    for feature in payload.get("features", []):
        attrs = feature.get("attributes")
        feature_id = make_id(
            data.get("name"),
            attrs.get("guidPart"),
            attrs.get("OBJECTID_1"),
            attrs.get("OBJECTID_12"),
            attrs.get("OBJECTID"),
            attrs.get("ESRI_OID"),
        )
        if feature_id is None:
            # Only logged; the feature is still de-duplicated and yielded.
            context.log.info("Missing ID: %r", attrs.keys())
        if feature_id in seen:
            continue
        seen.add(feature_id)
        attrs["FeatureId"] = feature_id
        yield attrs

    if not payload.get("exceededTransferLimit"):
        return
    # The server truncated the result: query each sub-extent separately.
    for child in split_envelope(extent):
        for attrs in load_features(context, data, seen, child):
            yield attrs
コード例 #5
0
def parse_entry(context, entry, url, updated_at):
    """Convert one list entry element into a 'us-ofac' entity and emit it.

    Entries whose ``sdnType`` maps to ``None`` in ``ENTITY_TYPES`` are
    skipped, as are identifiers whose ``idType`` maps to ``None`` in
    ``ID_TYPES``.

    :param context: crawler context; only ``context.emit(data=...)`` is used.
    :param entry: XML element for a single entry.
    :param url: source URL, combined with the entry ``uid`` to form the id.
    :param updated_at: value stored as the entity's ``updated_at``.
    """
    uid = entry.findtext('uid')
    entity_type = ENTITY_TYPES[entry.findtext('./sdnType')]
    if entity_type is None:
        return

    entity = Entity.create('us-ofac', make_id(url, uid))
    entity.type = entity_type
    entity.updated_at = updated_at
    entity.program = '; '.join(
        program.text for program in entry.findall('./programList/program'))
    entity.summary = entry.findtext('./remarks')
    entity.function = entry.findtext('./title')
    entity.first_name = entry.findtext('./firstName')
    entity.last_name = entry.findtext('./lastName')

    # Aliases
    for aka in entry.findall('./akaList/aka'):
        alias = entity.create_alias()
        alias.first_name = aka.findtext('./firstName')
        alias.last_name = aka.findtext('./lastName')
        alias.type = aka.findtext('./type')
        alias.quality = ALIAS_QUALITY[aka.findtext('./category')]

    # Identifiers; unmapped id types fall back to TYPE_OTHER.
    for ident in entry.findall('./idList/id'):
        id_label = ident.findtext('./idType')
        id_type = ID_TYPES.get(id_label, Identifier.TYPE_OTHER)
        if id_type is None:
            continue
        identifier = entity.create_identifier()
        identifier.type = id_type
        identifier.number = ident.findtext('./idNumber')
        identifier.country = ident.findtext('./idCountry')
        identifier.description = id_label

    # Addresses
    for addr in entry.findall('./addressList/address'):
        address = entity.create_address()
        address.street = addr.findtext('./address1')
        address.street_2 = addr.findtext('./address2')
        address.city = addr.findtext('./city')
        address.country = addr.findtext('./country')

    # Places of birth; only the main entry is rated as strong.
    for item in entry.findall('./placeOfBirthList/placeOfBirthItem'):
        birth_place = entity.create_birth_place()
        birth_place.place = item.findtext('./placeOfBirth')
        if item.findtext('./mainEntry') == 'true':
            birth_place.quality = BirthPlace.QUALITY_STRONG
        else:
            birth_place.quality = BirthPlace.QUALITY_WEAK

    # Dates of birth; only the main entry is rated as strong.
    for item in entry.findall('./dateOfBirthList/dateOfBirthItem'):
        birth_date = entity.create_birth_date()
        parsed = parse_date(item.findtext('./dateOfBirth'))
        birth_date.date = stringify(parsed)
        if item.findtext('./mainEntry') == 'true':
            birth_date.quality = BirthDate.QUALITY_STRONG
        else:
            birth_date.quality = BirthDate.QUALITY_WEAK

    context.emit(data=entity.to_dict())
コード例 #6
0
def scrape_case(context, data):
    """Fetch one notice page and emit it as an 'interpol-red-notices' entity.

    Pages without a name, or named 'Identity unknown', are skipped.

    :param context: crawler context; ``context.http.get`` fetches the page
        and ``context.emit(data=...)`` publishes the entity.
    :param data: mapping whose ``url`` key is the notice page to fetch.
    """
    url = data.get('url')
    res = context.http.get(url)
    doc = res.html
    name = element_text(doc.find('.//div[@class="nom_fugitif_wanted"]'))
    if name is None or name == 'Identity unknown':
        return
    uid = make_id(url)
    entity = Entity.create('interpol-red-notices', uid)
    entity.url = url
    entity.type = entity.TYPE_INDIVIDUAL
    entity.name = name
    entity.program = element_text(doc.find('.//span[@class="nom_fugitif_wanted_small"]'))  # noqa

    # "LAST, First" names also get a natural-order "First LAST" alias.
    if ', ' in name:
        last, first = name.split(', ', 1)
        alias = entity.create_alias()
        alias.name = ' '.join((first, last))

    # Each detail row is a (label, value) pair of table cells. The slugified
    # label is kept in its own variable so it does not shadow the person's
    # name above.
    for row in doc.findall('.//div[@class="bloc_detail"]//tr'):
        title, value = row.findall('./td')
        field = slugify(element_text(title), sep='_')
        value = element_text(value)
        if value is None:
            continue
        if field == 'charges':
            entity.summary = value
        elif field == 'present_family_name':
            entity.last_name = value
        elif field == 'forename':
            entity.first_name = value
        elif field == 'nationality':
            for country in value.split(', '):
                nationality = entity.create_nationality()
                nationality.country = country
        elif field == 'sex':
            entity.gender = SEXES[value]
        elif field == 'date_of_birth':
            birth_date = entity.create_birth_date()
            # Keep only the text before any parenthesised suffix.
            birth_date.date = value.split('(')[0]
        elif field == 'place_of_birth':
            birth_place = entity.create_birth_place()
            # A birth place carries its value in `place`, not `date`.
            birth_place.place = value

    context.emit(data=entity.to_dict())
コード例 #7
0
def parse_row(context, data):
    """Convert one table row into a 'us-bis-denied' entity and emit it.

    :param context: crawler context; only ``context.emit(data=...)`` is used.
    :param data: mapping whose ``row`` key holds a mapping of column name
        to value for a single row.
    """
    row = data.get('row')
    effective_date = row.get('Effective_Date')
    name = row.get('Name')

    entity = Entity.create('us-bis-denied', make_id(effective_date, name))
    entity.type = Entity.TYPE_ENTITY
    entity.name = name
    entity.updated_at = effective_date
    entity.program = row.get('FR_Citation')
    entity.summary = row.get('Action')

    # Address attribute -> source column, assigned in this order.
    address_columns = (
        ('street', 'Street_Address'),
        ('postal_code', 'Postal_Code'),
        ('region', 'State'),
        ('city', 'City'),
        ('country', 'Country'),
    )
    address = entity.create_address()
    for attribute, column in address_columns:
        setattr(address, attribute, row.get(column))

    context.emit(data=entity.to_dict())
コード例 #8
0
def parse_entry(context, entry):
    """Fetch one president's profile page and emit it as an entity.

    The ``href`` of ``entry`` is appended to the site root to fetch the
    profile. The entity id is derived from the raw ``href``, while
    ``entity.url`` stores the absolute URL that was fetched so the emitted
    link is usable on its own.

    :param context: crawler context; ``context.http.get`` fetches the page
        and ``context.emit(data=...)`` publishes the entity.
    :param entry: element-like object whose ``href`` points at the profile.
    """
    url = entry.get('href')
    page_url = 'https://www.worldpresidentsdb.com/' + url
    res = context.http.get(page_url)
    doc = res.html
    content = doc.find('.//main/div')

    uid = make_id(url)

    entity = Entity.create('worldpresidentsdb', uid)
    entity.type = Entity.TYPE_INDIVIDUAL
    entity.function = 'President'
    entity.url = page_url
    # partition() copes with single-word names, where unpacking the result
    # of split(' ', 1) would raise ValueError; such names get no last name.
    first_name, _, last_name = content.find('h1').text.partition(' ')
    entity.first_name = first_name
    entity.last_name = last_name or None

    # Labels whose value is the text following the first child element.
    text_labels = ('Date of Birth:', 'Birth Place:',
                   'Political Party:', 'Other Political Titles:')

    for element in content.findall('.//p'):
        label_element = element.find('.//b')
        if label_element is None:
            continue
        label = label_element.text

        if label == 'Country:':
            nationality = entity.create_nationality()
            nationality.country = element.find('a').text
            continue
        if label not in text_labels:
            continue

        value = element[0].tail.strip()
        if label == 'Date of Birth:':
            # The page gives month-day-year; store it as year-month-day.
            month, day, year = value.split('-', 2)
            birth_date = entity.create_birth_date()
            birth_date.date = year + '-' + month + '-' + day
            birth_date.quality = 'strong'
        elif label == 'Birth Place:':
            birth_place = entity.create_birth_place()
            birth_place.place = value
        elif label == 'Political Party:':
            entity.program = value
        elif label == 'Other Political Titles:':
            entity.summary = value

    context.emit(data=entity.to_dict())
コード例 #9
0
def parse_entity(context, url, country, component, row, updated_at):
    """Emit a 'us-cia-world-leaders' entity for one office holder row.

    Rows lacking either a title or a name are ignored.

    :param context: crawler context; only ``context.emit(data=...)`` is used.
    :param url: source page URL, stored on the entity.
    :param country: country label; used for the id, the program and the
        nationality.
    :param component: not used by this function.
    :param row: element containing the ``title`` and ``cos_name`` spans.
    :param updated_at: value stored as the entity's ``updated_at``.
    """
    role = element_text(row.find('.//span[@class="title"]'))
    if role is None:
        return
    person = element_text(row.find('.//span[@class="cos_name"]'))
    if person is None:
        return

    entity = Entity.create('us-cia-world-leaders',
                           make_id(country, person, role))
    entity.name = person
    entity.type = entity.TYPE_INDIVIDUAL
    entity.function = role
    entity.program = country
    entity.url = url
    entity.updated_at = updated_at

    nationality = entity.create_nationality()
    nationality.country = country

    context.emit(data=entity.to_dict())
コード例 #10
0
ファイル: crawler.py プロジェクト: ANCIR/flexicadastre
def load_features(context, data, seen, extent):
    """Yield the attributes of every not-yet-seen feature inside ``extent``.

    The layer's ``query`` endpoint is requested through ``context.http``.
    When the response reports ``exceededTransferLimit``, the extent is split
    with ``split_envelope`` and each part is queried recursively.

    :param context: crawler context providing ``http`` and ``log``.
    :param data: mapping with ``token``, ``rest_url`` and ``id`` keys, plus an
        optional ``name``.
    :param seen: mutable set of feature ids already yielded; updated in place.
    :param extent: JSON-serialisable geometry sent as the ``geometry``
        query parameter.
    """
    params = QUERY.copy()
    token = data['token']
    if token is not None:
        params['token'] = token
    params['geometry'] = json.dumps(extent)
    url = '%s/%s/query' % (data['rest_url'], data['id'])

    response = context.http.get(url, params=params)
    payload = response.json

    for feature in payload.get('features', []):
        attrs = feature.get('attributes')
        feature_id = make_id(data.get('name'), attrs.get('guidPart'),
                             attrs.get('OBJECTID_1'),
                             attrs.get('OBJECTID_12'),
                             attrs.get('OBJECTID'), attrs.get('ESRI_OID'))
        if feature_id is None:
            # Only logged; the feature is still de-duplicated and yielded.
            context.log.info("Missing ID: %r", attrs.keys())
        if feature_id in seen:
            continue
        seen.add(feature_id)
        attrs['FeatureId'] = feature_id
        yield attrs

    if not payload.get('exceededTransferLimit'):
        return
    # The server truncated the result: query each sub-extent separately.
    for child in split_envelope(extent):
        for attrs in load_features(context, data, seen, child):
            yield attrs