Example #1
0
def parse_individual(record, ind):
    """Populate ``record`` from one individual's element and emit it.

    Name parts, aliases, addresses, identity documents and birth details
    found under ``ind`` are written into ``record`` (modified in place); the
    finished record is then passed to ``source.emit``.

    :param record: dict to fill in. ``record['identities']`` must already be
        a list, because identity documents are appended to it.
    :param ind: element offering ``findtext``/``findall`` for the individual.
    """
    last_name = ind.findtext('.//THIRD_NAME')
    second_name = ind.findtext('.//SECOND_NAME')
    if last_name is None:
        # No third name given: the second name is used as the last name.
        last_name = second_name
        second_name = None
    record.update({
        'first_name': ind.findtext('.//FIRST_NAME'),
        'second_name': second_name,
        'last_name': last_name
    })

    for alias in ind.findall('./INDIVIDUAL_ALIAS'):
        parse_alias(alias, record)

    for addr in ind.findall('./INDIVIDUAL_ADDRESS'):
        parse_address(addr, record)

    for ident in ind.findall('./INDIVIDUAL_DOCUMENT'):
        country = normalize_country(ident.findtext('./COUNTRY_OF_ISSUE'))
        number = ident.findtext('./NUMBER')
        if number is None and country is None:
            # Neither a number nor a country: nothing worth recording.
            continue
        record['identities'].append({
            'type': ident.findtext('./TYPE_OF_DOCUMENT'),
            'number': number,
            'country': country,
        })

    for dob in ind.findall('./INDIVIDUAL_DATE_OF_BIRTH'):
        date = dob.findtext('./DATE')
        if date is None:
            continue
        if ':' in date:
            # A colon signals a time component; cut the value at its last '-'.
            # NOTE(review): presumably this drops a trailing "-HH:MM" offset.
            date = date.rsplit('-', 1)[0]
        approx = dob.findtext('./TYPE_OF_DATE') == 'Approximately'
        if approx and 'date_of_birth' in record:
            # An approximate date never replaces one that is already stored.
            continue
        record['date_of_birth'] = date

    for pob in ind.findall('./INDIVIDUAL_PLACE_OF_BIRTH'):
        country = pob.findtext('./COUNTRY')
        record['country_of_birth'] = normalize_country(country)

        place = pob.findtext('./CITY')
        region = pob.findtext('./STATE_PROVINCE')
        # The branches must be exclusive: with independent ``if`` statements
        # the combined "city (region)" value was always overwritten again.
        if place and region:
            record['place_of_birth'] = '%s (%s)' % (place, region)
        elif place:
            record['place_of_birth'] = place
        elif region:
            record['place_of_birth'] = region

    source.emit(record)
Example #2
0
def parse_case(case):
    """Build a record from one scraped case and emit it via ``source.emit``.

    :param case: mapping with the scraped fields (``url``, ``name``,
        ``last_updated``, ``reason``, ``date_of_birth`` ...). ``url``,
        ``name`` and ``last_updated`` are used unconditionally; the date of
        birth and sex are optional.
    """
    url = case.get('url')
    # The name arrives as "FAMILY, Forename"; reverse it before combining.
    name = combine_name(*reversed(case.get('name').split(', ')))
    updated = dateutil_parse(case.get('last_updated'))
    record = {
        'uid': make_id('interpol', url.split('/')[-1]),
        'source_url': url,
        'name': name,
        'summary': case.get('reason'),
        'updated_at': updated.date().isoformat(),
        'place_of_birth': case.get('place_of_birth'),
        # ``or ''`` also covers a key that is present but set to None.
        'gender': (case.get('sex') or '').lower(),
        'first_name': case.get('forename'),
        'last_name': case.get('present_family_name'),
        'nationality': normalize_country(case.get('nationality')),
        'identities': [],
        'addresses': [],
        'other_names': []
    }
    record.update(SOURCE)

    # Only the first space-separated token of the field is the date itself.
    birth = (case.get('date_of_birth') or '').split(' ')[0]
    if birth:
        # Try a bare year first, then a full day/month/year date.
        for fmt in ('%Y', '%d/%m/%Y'):
            try:
                parsed = datetime.strptime(birth, fmt)
            except ValueError:
                continue
            record['date_of_birth'] = parsed.date().isoformat()
            break
        else:
            log.warning('Cannot parse date of birth: %r', birth)

    source.emit(record)
Example #3
0
def parse_common(node, type_):
    """Build the part of a record shared by all entry types.

    :param node: element offering ``findtext`` for one list entry. It must
        contain ``UN_LIST_TYPE`` and ``REFERENCE_NUMBER`` text, because both
        are stripped unconditionally.
    :param type_: value stored under the record's ``'type'`` key.
    :returns: the new record dict, with empty ``other_names``, ``addresses``
        and ``identities`` lists for the caller to fill.
    """
    program_ref = '%s (%s)' % (node.findtext('./UN_LIST_TYPE').strip(),
                               node.findtext('./REFERENCE_NUMBER').strip())
    record = {
        'uid': make_id('un', 'sc', node.findtext('./DATAID')),
        'type': type_,
        'program': program_ref,
        'summary': node.findtext('./COMMENTS1'),
        'name': combine_name(node.findtext('./FIRST_NAME'),
                             node.findtext('./SECOND_NAME'),
                             node.findtext('./THIRD_NAME')),
        'function': node.findtext('./DESIGNATION/VALUE'),
        'updated_at': node.findtext('./LISTED_ON'),
        'nationality': normalize_country(node.findtext('./NATIONALITY/VALUE')),
        'other_names': [],
        'addresses': [],
        'identities': []
    }
    record.update(BASE)

    # When present, the original-script name replaces the combined name.
    orig = node.findtext('./NAME_ORIGINAL_SCRIPT')
    if orig is not None:
        record['name'] = orig

    # A last-updated value takes precedence over the listing date.
    last_updated = node.findtext('./LAST_DAY_UPDATED/VALUE')
    if last_updated is not None:
        record['updated_at'] = last_updated

    # Both date elements may be missing; testing ``':' in None`` would raise
    # TypeError, so only trim when a value is actually there.
    updated_at = record['updated_at']
    if updated_at is not None and ':' in updated_at:
        # A colon signals a time component; cut the value at its last '-'.
        record['updated_at'] = updated_at.rsplit('-', 1)[0]

    return record
Example #4
0
def parse_row(row):
    """Build a record from one row of the list and emit it.

    The record starts as a copy of ``SOURCE``; the row's columns are then
    mapped onto record fields, including a single address entry.

    :param row: mapping of column name to value for one row.
    """
    entity = SOURCE.copy()
    entity['uid'] = make_id('us', 'bis', row.get('Effective_Date'),
                            row.get('Name'))
    entity['name'] = row.get('Name')
    entity['program'] = row.get('FR_Citation')
    entity['summary'] = row.get('Action')
    entity['updated_at'] = row.get('Last Update')
    entity['nationality'] = normalize_country(row.get('Country'))

    # One address per row, assembled from the plain address columns.
    location = {}
    address_columns = (
        ('address1', 'Street_Address'),
        ('postal_code', 'Postal_Code'),
        ('region', 'State'),
        ('city', 'City'),
    )
    for field, column in address_columns:
        location[field] = row.get(column)
    location['country'] = normalize_country(row.get('Country'))
    entity['addresses'] = [location]

    source.emit(entity)
Example #5
0
def wbdeb_parse(html_file):
    """Parse the debarment tables of an HTML file and emit a record per row.

    Only tables whose ``summary`` attribute contains "List of Debarred" are
    read, and only rows with exactly six ``td`` cells are used. Rows for
    which ``clean_name`` yields no name are logged and skipped.

    :param html_file: passed straight to ``html.parse``.
    """
    doc = html.parse(html_file)
    for table in doc.findall('//table'):
        if 'List of Debarred' not in table.get('summary', ''):
            continue
        rows = table.findall('.//tr')
        # Progress is reported through the logger instead of a Python 2
        # ``print`` statement, which is a syntax error on Python 3.
        log.info('%s: %d rows', table.get('summary'), len(rows))
        for row in rows:
            tds = row.findall('./td')
            if len(tds) != 6:
                continue
            values = [clean_value(td) for td in tds]

            # The uid is the first ten hex digits of a SHA-1 over all cells.
            digest = sha1()
            for value in values:
                digest.update(value.encode('utf-8'))
            uid = digest.hexdigest()[:10]

            names = clean_name(values[0])
            if not names:
                log.warning("No name: %r", values)
                continue

            country = normalize_country(values[2])
            record = {
                'uid': make_id('wb', 'debarred', uid),
                'name': names[0],
                'nationality': country,
                'program': values[5],
                'addresses': [{
                    'text': values[1],
                    'country': country
                }],
                # Every name after the first is kept as an alias.
                'other_names': [{'other_name': name} for name in names[1:]],
                'updated_at': dateutil_parse(values[3]).date().isoformat()
            }
            record.update(SOURCE)
            source.emit(record)
Example #6
0
def parse_address(addr, record):
    """Append the address described by ``addr`` to ``record['addresses']``.

    Address elements in which every extracted field is ``None`` are skipped.

    :param addr: element offering ``findtext`` for one address.
    :param record: dict whose ``'addresses'`` list is appended to.
    """
    data = {
        'text': addr.findtext('./NOTE'),
        'address1': addr.findtext('./STREET'),
        'city': addr.findtext('./CITY'),
        'region': addr.findtext('./STATE_PROVINCE'),
        'country': normalize_country(addr.findtext('./COUNTRY')),
    }
    # The previous guard compared a set with the integer 1, which is never
    # true, so empty addresses were always appended.
    if all(value is None for value in data.values()):
        return
    record['addresses'].append(data)
Example #7
0
def parse_politician(source, country, legislature, data):
    """Emit an 'individual' entity for one politician through ``source``.

    :param source: object whose ``emit`` method receives the entity.
    :param country: mapping providing ``code`` and ``name``.
    :param legislature: mapping providing ``lastmod`` and ``name``.
    :param data: mapping providing ``id``, ``name`` and ``source_url``.
    """
    country_code = normalize_country(country.get('code'))
    # Only the part of the id after the last dash goes into the uid.
    short_id = data.get('id').split('-')[-1]

    entity = {}
    entity['uid'] = make_id('evpo', short_id)
    entity['name'] = data.get('name')
    entity['type'] = 'individual'
    entity['addresses'] = [{'country': country_code}]
    entity['updated_at'] = parse_ts(legislature.get('lastmod'))
    entity['source_url'] = data.get('source_url')
    entity['source'] = '%s (%s)' % (legislature['name'], country['name'])

    # Publisher fields are applied last, so they win over the keys above.
    entity.update(PUBLISHER)
    source.emit(entity)
Example #8
0
def worldleaders_parse(json_file):
    """Read a JSON dump of leaders and emit one 'individual' entity each.

    Entries without a name are logged and skipped.

    :param json_file: path of the JSON file; its top-level object must have
        a ``leaders`` list.
    """
    with open(json_file, 'r') as fh:
        data = json.load(fh)

    for leader in data.get('leaders'):
        name = leader.get('name')
        # Truthiness covers both an empty and a missing name; calling
        # ``len()`` on a missing (None) name would raise TypeError.
        if not name:
            log.warning('No name on entity: %(title)s (%(country)s)', leader)
            continue
        country = normalize_country(leader.get('country'))
        summary = leader.get('title')
        if leader.get('component'):
            summary = '%s (%s)' % (summary, leader.get('component'))
        entity = {
            'uid': make_id('us', 'cia', 'worldleaders', country, name),
            'name': name,
            'type': 'individual',
            'summary': summary,
            'addresses': [{'country': country}],
            'updated_at': parse_date(leader.get('last_update')),
            'source_url': leader.get('url')
        }
        entity.update(PUBLISHER)
        source.emit(entity)
Example #9
0
def parse_entry(entry):
    """Convert one list entry into a record and emit it via ``source.emit``.

    The record gets fixed publisher/source fields plus the entry's program,
    comments, date, names, identity documents, addresses and birth details.

    :param entry: element offering ``findtext``/``findall`` for one entry.
    """
    uid = entry.findtext('number-entry')
    record = {
        'uid': make_id('ua', 'sdfm', uid),
        'type': 'individual',
        'publisher': 'Ukraine SDFM',
        'publisher_url': 'http://www.sdfm.gov.ua/',
        'source': 'Blacklist',
        'source_id': 'UA-SDFM-SANC',
        'source_url': 'http://www.sdfm.gov.ua/articles.php?cat_id=87&lang=en',
        'program': entry.findtext('./program-entry'),
        'summary': entry.findtext('./comments')
    }
    date_entry = entry.findtext('./date-entry')
    if date_entry:
        date_entry = datetime.strptime(date_entry, '%Y%m%d')
        record['updated_at'] = date_entry.date().isoformat()

    # Any type other than '1' is treated as an entity.
    is_entity = entry.findtext('./type-entry') != '1'
    if is_entity:
        record['type'] = 'entity'

    quality_labels = {'1': 'strong', '2': 'weak'}
    record['other_names'] = []
    for aka in entry.findall('./aka-list'):
        data = get_names(aka)
        aka_type = aka.findtext('./type-aka')
        if aka_type == 'N':
            # Type 'N' carries the primary name; its parts go on the record.
            record['name'] = data.pop('other_name', None)
            record.update(data)
        else:
            quality = aka.findtext('./quality-aka')
            data['type'] = aka_type
            # Unknown quality codes are stored unchanged.
            data['quality'] = quality_labels.get(quality, quality)
            record['other_names'].append(data)

    record['identities'] = []
    for doc in entry.findall('./document-list'):
        data = {
            'text': doc.findtext('./document-reg'),
            'number': doc.findtext('./document-id'),
            'country': normalize_country(doc.findtext('./document-country'))
        }
        record['identities'].append(data)

    for doc in entry.findall('./id-number-list'):
        if doc.text is None:
            # An empty element has no text to strip; skip it.
            continue
        record['identities'].append({'text': doc.text.strip()})

    record['addresses'] = []
    for address in entry.findall('./address-list'):
        data = {
            'text': address.findtext('./address')
        }
        record['addresses'].append(data)

    # FIXME: handle multiple (currently the last place of birth wins)
    for pob in entry.findall('./place-of-birth-list'):
        if pob.text is None:
            continue
        if 'place_of_birth' in record:
            log.debug('Multiple places of birth: %s', record.get('name'))
        record['place_of_birth'] = pob.text.strip()

    # FIXME: handle multiple (currently the last date of birth wins)
    for dob in entry.findall('./date-of-birth-list'):
        # Check the date key here; the previous code tested
        # 'place_of_birth' by mistake.
        if 'date_of_birth' in record:
            log.debug('Multiple dates of birth: %s', record.get('name'))
        record['date_of_birth'] = parse_date(dob.text)

    source.emit(record)
Example #10
0
def parse_entry(source, record, entry):
    """Fill ``record`` from one list entry and emit it through ``source``.

    :param source: object whose ``emit`` method receives the record.
    :param record: dict to populate (modified in place).
    :param entry: element offering ``findtext``/``findall`` for one entry.
    """
    first_name = entry.findtext('./firstName')
    last_name = entry.findtext('./lastName')
    # Everything that is not explicitly an individual counts as an entity.
    is_entity = entry.findtext('./sdnType') != 'Individual'

    core = {
        'uid': make_id('us', 'ofac', entry.findtext('uid')),
        'type': 'entity' if is_entity else 'individual',
        'program': entry.findtext('./programList/program'),
        'summary': entry.findtext('./remarks'),
        'first_name': first_name,
        'last_name': last_name,
        'name': combine_name(first_name, last_name)
    }
    record.update(core)
    if is_entity:
        # Entities carry no last name.
        record.pop('last_name', None)

    aliases = []
    record['other_names'] = aliases
    for aka in entry.findall('./akaList/aka'):
        aka_first = aka.findtext('./firstName')
        aka_last = aka.findtext('./lastName')
        alias = {
            'type': aka.findtext('./type'),
            'quality': aka.findtext('./category'),
            'first_name': aka_first,
            'last_name': aka_last,
            'other_name': combine_name(aka_first, aka_last)
        }
        if is_entity:
            del alias['last_name']
        aliases.append(alias)

    record['identities'] = [
        {
            'type': ident.findtext('./idType'),
            'number': ident.findtext('./idNumber'),
            'country': normalize_country(ident.findtext('./idCountry'))
        }
        for ident in entry.findall('./idList/id')
    ]

    record['addresses'] = [
        {
            'address1': address.findtext('./address1'),
            'address2': address.findtext('./address2'),
            'city': address.findtext('./city'),
            'country': normalize_country(address.findtext('./country'))
        }
        for address in entry.findall('./addressList/address')
    ]

    # Only items flagged as the main entry are copied onto the record.
    for item in entry.findall('./placeOfBirthList/placeOfBirthItem'):
        if item.findtext('./mainEntry') != 'true':
            continue
        record['place_of_birth'] = item.findtext('./placeOfBirth')

    for item in entry.findall('./dateOfBirthList/dateOfBirthItem'):
        if item.findtext('./mainEntry') != 'true':
            continue
        record['date_of_birth'] = parse_date(item.findtext('./dateOfBirth'))

    source.emit(record)
Example #11
0
def parse_entry(group, rows):
    """Merge all rows of one group into a single record and emit it.

    The first row supplies the record's name; every later row is stored as
    an alias under ``other_names``. Scalar fields are overwritten by each
    row, so the last row's values remain. Consumed columns are popped from
    the rows, i.e. ``rows`` is modified.

    :param group: group identifier used to build the uid.
    :param rows: iterable of dict rows belonging to the group.
    """
    name_columns = ("Title", "Name 1", "Name 2", "Name 3", "Name 4", "Name 5", "Name 6")
    address_columns = ("Address 1", "Address 2", "Address 3", "Address 4", "Address 5", "Address 6")

    record = SOURCE.copy()
    header = {"uid": make_id("gb", "hmt", group), "identities": [], "addresses": [], "other_names": []}
    record.update(header)

    for row in rows:
        scalars = {
            "type": row.pop("Group Type").lower(),
            "date_of_birth": parse_date(row.pop("DOB")),
            "place_of_birth": row.pop("Town of Birth"),
            "country_of_birth": normalize_country(row.pop("Country of Birth")),
            "nationality": normalize_country(row.get("Nationality")),
            "program": row.pop("Regime"),
            "summary": row.pop("Other Information"),
            "updated_at": parse_date(row.pop("Last Updated")),
            "function": row.pop("Position"),
        }
        record.update(scalars)

        # Read the individual name parts before the columns are popped below.
        name_parts = {
            "first_name": row.get("Name 1"),
            "second_name": row.get("Name 2"),
            "middle_name": row.get("Name 3"),
            "last_name": row.get("Name 6"),
        }
        full_name = combine_name(*[row.pop(column) for column in name_columns])

        if "name" in record:
            # A name is already set, so this row describes an alias.
            name_parts["other_name"] = full_name
            name_parts["type"] = row.pop("Alias Type")
            record["other_names"].append(name_parts)
        else:
            record["name"] = full_name
            record.update(name_parts)

        address_text = combine_name(*[row.pop(column) for column in address_columns])
        if len(address_text) > 0:
            postal_code = row.pop("Post/Zip Code")
            record["addresses"].append({"text": address_text, "postal_code": postal_code})

        if row.get("Passport Details"):
            passport_number = row.pop("Passport Details")
            passport_country = normalize_country(row.get("Nationality"))
            record["identities"].append(
                {"type": "Passport", "number": passport_number, "country": passport_country}
            )

        if row.get("NI Number"):
            ni_number = row.pop("NI Number")
            ni_country = normalize_country(row.get("Country"))
            record["identities"].append({"type": "NI", "number": ni_number, "country": ni_country})

    source.emit(record)
Example #12
0
def parse_entry(record, entry):
    """Fill ``record`` from one list entry and emit it via ``source.emit``.

    :param record: dict to populate (modified in place).
    :param entry: element offering ``get``/``findtext``/``findall``.
    """
    gender_labels = {'M': 'male', 'F': 'female'}

    def keep_longest(key, value):
        # Store ``value`` only when it is non-empty and longer than the
        # value already held under ``key``.
        if value and len(value) > len(record.get(key, '')):
            record[key] = value

    # Any type other than 'P' is treated as an entity.
    is_entity = entry.get('Type') != 'P'
    core = {
        'uid': make_id('eu', 'eeas', entry.get('Id')),
        'type': 'entity' if is_entity else 'individual',
        'updated_at': entry.get('reg_date'),
        'source_url': entry.get('pdf_link'),
        'program': entry.get('programme'),
        'summary': entry.get('remark')
    }
    record.update(core)

    aliases = []
    record['other_names'] = aliases
    for name_el in entry.findall('./NAME'):
        parts = {
            'first_name': name_el.findtext('./FIRSTNAME'),
            'last_name': name_el.findtext('./LASTNAME'),
            'middle_name': name_el.findtext('./MIDDLENAME'),
            'other_name': name_el.findtext('./WHOLENAME')
        }

        keep_longest('function', name_el.findtext('./FUNCTION'))

        label = gender_labels.get(name_el.findtext('./GENDER'))
        if label is not None:
            parts['gender'] = label

        if 'name' in record:
            aliases.append(parts)
        else:
            # The first NAME element provides the record's primary name.
            record['name'] = parts.pop('other_name')
            record.update(parts)

    record['identities'] = [
        {
            'type': 'Passport',
            'number': passport.findtext('./NUMBER'),
            'country': normalize_country(passport.findtext('./COUNTRY'))
        }
        for passport in entry.findall('./PASSPORT')
    ]

    record['addresses'] = [
        {
            'address1': address.findtext('./STREET'),
            'address2': address.findtext('./NUMBER'),
            'city': address.findtext('./CITY'),
            'postal_code': address.findtext('./ZIPCODE'),
            'country': normalize_country(address.findtext('./COUNTRY'))
        }
        for address in entry.findall('./ADDRESS')
    ]

    # With several BIRTH elements, the longest value of each field is kept.
    for birth in entry.findall('./BIRTH'):
        keep_longest('place_of_birth', birth.findtext('./PLACE'))
        keep_longest('date_of_birth', parse_date(birth.findtext('./DATE')))
        keep_longest('country_of_birth',
                     normalize_country(birth.findtext('./COUNTRY')))

    source.emit(record)