def parse(context, data):
    """Scrape a government-composition country page and emit a Person
    entity for each officeholder listed.

    Cleanup: removed commented-out dead code (the old ``component`` /
    ``lastUpdateDate`` handling) that no longer matched the markup being
    parsed.
    """
    emitter = EntityEmitter(context)
    url = data.get('url')
    country = normalize_country(data.get('country'))
    with context.http.rehash(data) as res:
        doc = res.html
        output = doc.find('.//div[@id="countryOutput"]')
        if output is None:
            # No roster section on this page; nothing to emit.
            return
        for row in output.findall('.//li'):
            function = element_text(row.find('.//span[@class="title"]'))
            if function is None:
                continue
            name = element_text(row.find('.//span[@class="cos_name"]'))
            if name is None:
                continue
            person = emitter.make('Person')
            # Deterministic ID: same (country, name, function) triple
            # always maps to the same entity.
            person.make_id(country, name, function)
            person.add('name', name)
            person.add('country', country)
            person.add('position', function)
            person.add('sourceUrl', url)
            emitter.emit(person)
    emitter.finalize()
def parse(context, data):
    """Parse a debarment XLS workbook: normalise each cell value, group
    rows by their cleaned reference number, and pass each group to
    parse_reference() for entity emission.

    Bug fix: the original assigned ``row[header] = cell.value``
    unconditionally after the ctype-specific conversions, clobbering the
    converted number/date/None values. The raw value is now only the
    fallback for cell types that need no conversion.
    """
    emitter = EntityEmitter(context)
    references = defaultdict(list)
    with context.http.rehash(data) as res:
        xls = xlrd.open_workbook(res.file_path)
        ws = xls.sheet_by_index(0)
        # First row holds the column titles; slugify them into dict keys.
        headers = [slugify(h, sep='_') for h in ws.row_values(0)]
        for r in range(1, ws.nrows):
            row = dict(zip(headers, ws.row(r)))
            for header, cell in row.items():
                if cell.ctype == 2:
                    # XL_CELL_NUMBER: xlrd stores numbers as floats.
                    row[header] = str(int(cell.value))
                elif cell.ctype == 3:
                    # XL_CELL_DATE: convert via the workbook's datemode.
                    date = xldate_as_datetime(cell.value, xls.datemode)
                    row[header] = date.isoformat()
                elif cell.ctype == 0:
                    # XL_CELL_EMPTY
                    row[header] = None
                else:
                    row[header] = cell.value
            reference = clean_reference(row.get('reference'))
            references[reference].append(row)
    for ref, rows in references.items():
        parse_reference(emitter, ref, rows)
    emitter.finalize()
def parse_notice(context, data):
    """Parse one Interpol red-notice JSON document and emit a Person.

    Fix: the display name is now joined from only the non-empty name
    parts — the API returns null forenames/surnames (hence the ``or ''``
    guards), and the old ``first + ' ' + last`` produced names with a
    stray leading or trailing space in that case.
    """
    with context.http.rehash(data) as res:
        res = res.json
        first_name = res['forename'] or ''
        last_name = res['name'] or ''
        dob = res['date_of_birth']
        nationalities = res['nationalities']
        place_of_birth = res['place_of_birth']
        # One (charge, issuing country) pair per arrest warrant.
        warrants = [
            (warrant['charge'], warrant['issuing_country_id'])
            for warrant in res['arrest_warrants']
        ]
        gender = SEXES.get(res['sex_id'])
        emitter = EntityEmitter(context)
        entity = emitter.make('Person')
        entity.make_id(first_name, last_name, res['entity_id'])
        # Skip empty components so a missing forename/surname does not
        # leave a dangling space in the name.
        entity.add('name', ' '.join(p for p in (first_name, last_name) if p))
        entity.add('firstName', first_name)
        entity.add('lastName', last_name)
        entity.add('nationality', nationalities)
        for charge, country in warrants:
            entity.add('program', country)
            entity.add('summary', charge)
        entity.add('gender', gender)
        entity.add('birthPlace', place_of_birth)
        entity.add('birthDate', parse_date(dob))
        entity.add('sourceUrl', res['_links']['self']['href'])
        entity.add('keywords', 'REDNOTICE')
        entity.add('keywords', 'CRIME')
        emitter.emit(entity)
        emitter.finalize()
def parse(context, data):
    """Emit a LegalEntity plus its Sanction for one World Bank debarment
    record taken from the JSON/row payload in *data*.

    Fix: guard against clean_name() returning no names — the sibling
    HTML table scraper already skips such rows with a warning, while
    this one would crash on ``names[0]``.
    """
    emitter = EntityEmitter(context)
    entity = emitter.make('LegalEntity')
    name = data.get('SUPP_NAME')
    ent_id = data.get('SUPP_ID')
    reason = data.get('DEBAR_REASON')
    country = data.get('COUNTRY_NAME')
    city = data.get('SUPP_CITY')
    address = data.get('SUPP_ADDR')
    start_date = data.get('DEBAR_FROM_DATE')
    end_date = data.get('DEBAR_TO_DATE')
    entity.make_id(name, ent_id, country)
    names = clean_name(name)
    if not names:
        # Consistent with the HTML scraper: skip unusable rows rather
        # than raise IndexError below.
        context.log.warning("No name: %r", data)
        return
    entity.add('name', names[0])
    entity.add('address', address)
    entity.add('address', city)
    entity.add('country', normalize_country(country))
    # Any additional cleaned names become aliases.
    for alias in names[1:]:
        entity.add('alias', alias)
    sanction = emitter.make('Sanction')
    sanction.make_id('Sanction', entity.id)
    sanction.add('authority', 'World Bank Debarrment')
    sanction.add('program', reason)
    sanction.add('startDate', clean_date(start_date))
    sanction.add('endDate', clean_date(end_date))
    sanction.add('sourceUrl', SOURCE)
    emitter.emit(entity)
    emitter.emit(sanction)
    emitter.finalize()
def parse(context, data):
    """Stream a tab-separated source file and hand each row to
    parse_row() for entity emission."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        with open(res.file_path, 'r') as fh:
            reader = csv.DictReader(fh, delimiter='\t')
            for record in reader:
                parse_row(emitter, record)
    emitter.finalize()
def parse(context, data):
    """Walk the sanctions XML and emit all individuals, then all
    organisational entities."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        tree = res.xml
        for individual in tree.findall('.//INDIVIDUAL'):
            parse_individual(emitter, individual)
        for organisation in tree.findall('.//ENTITY'):
            parse_entity(emitter, organisation)
    emitter.finalize()
def parse(context, data):
    """Dispatch extracted table rows by width: five columns describe an
    organisation, nine columns an individual; other widths are ignored."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        for row in extract_rows(res.xml):
            width = len(row)
            if width == 5:
                parse_organisation(emitter, row)
            elif width == 9:
                parse_individual(emitter, row)
    emitter.finalize()
def parse(context, data):
    """Process the advanced-format sanctions XML in document order:
    distinct parties, then sanctions entries, then the profile
    relationships linking them."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        doc = res.xml
        # (tag, handler) pairs, processed in this exact order.
        stages = (
            ('DistinctParty', parse_party),
            ('SanctionsEntry', parse_entry),
            ('ProfileRelationship', parse_relation),
        )
        for tag, handler in stages:
            for node in doc.findall(qpath(tag)):
                handler(emitter, doc, node)
    emitter.finalize()
def parse(context, data):
    """Group CSV rows by their 'Group ID' column and emit one entry per
    group of rows."""
    emitter = EntityEmitter(context)
    grouped = defaultdict(list)
    with context.http.rehash(data) as res:
        with open(res.file_path, 'r', encoding='iso-8859-1') as fh:
            # The file carries a junk line ahead of the real header row.
            next(fh)
            for record in csv.DictReader(fh):
                group_id = record.pop('Group ID')
                if group_id is not None:
                    grouped[group_id].append(record)
    for group_id, records in grouped.items():
        parse_entry(emitter, group_id, records)
    emitter.finalize()
def seco_parse(context, data):
    """Parse the SECO sanctions XML: build lookup tables of programme
    names and addresses keyed by ssid, then emit every target."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        tree = res.xml
        updated_at = tree.getroot().get('date')
        # ssid of the sanctions set -> English programme name.
        programs = {
            prog.find('./sanctions-set').get('ssid'):
                prog.findtext('./program-name[@lang="eng"]')
            for prog in tree.findall('.//sanctions-program')
        }
        # ssid of the place -> parsed address.
        places = {
            place.get('ssid'): parse_address(place)
            for place in tree.findall('.//place')
        }
        for target in tree.findall('./target'):
            parse_entry(emitter, target, programs, places, updated_at)
    emitter.finalize()
def parse(context, data):
    """Parse a per-country politician JSON payload: emit persons and
    organizations first (recording their ID mappings), then the
    memberships that join the two."""
    emitter = EntityEmitter(context)
    country = data.get('country', {}).get('code')
    with context.http.rehash(data) as res:
        payload = res.json
        # Source ID -> emitted entity ID, filled in by the sub-parsers.
        person_ids = {}
        for person in payload.get('persons', []):
            ep_id, ftm_id = parse_person(emitter, person, country)
            person_ids[ep_id] = ftm_id
        org_ids = {}
        for organization in payload.get('organizations', []):
            ep_id, ftm_id = parse_organization(emitter, organization, country)
            org_ids[ep_id] = ftm_id
        for membership in payload.get('memberships', []):
            parse_membership(emitter, membership, person_ids, org_ids)
    emitter.finalize()
def parse(context, data):
    """Parse one wanted-person HTML page and emit a Person entity.

    Fix: the gender lookup now uses ``SEXES.get(value)`` — matching the
    red-notice JSON parser — so an unexpected sex label no longer raises
    KeyError and aborts the crawl. Also renamed the loop variable that
    shadowed the person's ``name``.
    """
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as result:
        doc = result.html
        name = element_text(doc.find('.//div[@class="nom_fugitif_wanted"]'))
        if name is None or name == 'Identity unknown':
            # Anonymous notices carry no usable identity.
            return
        entity = emitter.make('Person')
        entity.make_id(data.get('url'))
        entity.add('name', name)
        entity.add('sourceUrl', data.get('url'))
        wanted = element_text(
            doc.find('.//span[@class="nom_fugitif_wanted_small"]'))
        entity.add('program', wanted)
        entity.add('keywords', 'REDNOTICE')
        entity.add('keywords', 'CRIME')
        if ', ' in name:
            # Headline is "Last, First"; also index the natural order.
            last, first = name.split(', ', 1)
            entity.add('alias', jointext(first, last))
        for row in doc.findall('.//div[@class="bloc_detail"]//tr'):
            title, value = row.findall('./td')
            field = slugify(element_text(title), sep='_')
            value = element_text(value)
            if value is None:
                continue
            if field == 'charges':
                entity.add('summary', value)
            elif field == 'present_family_name':
                entity.add('lastName', value)
            elif field == 'forename':
                entity.add('firstName', value)
            elif field == 'nationality':
                for country in value.split(', '):
                    entity.add('nationality', country)
            elif field == 'sex':
                # .get(): tolerate labels missing from the SEXES map.
                entity.add('gender', SEXES.get(value))
            elif field == 'date_of_birth':
                # Strip the trailing "(age ...)" annotation.
                entity.add('birthDate', value.split('(')[0])
            elif field == 'place_of_birth':
                entity.add('birthPlace', value)
        emitter.emit(entity)
    emitter.finalize()
def parse(context, data):
    """Crawl the alphabetical member index, following pagination links
    until no unvisited page remains, and emit every listed member."""
    emitter = EntityEmitter(context)
    visited = set()
    for letter in string.ascii_uppercase:
        page_url = URL % letter
        while page_url is not None:
            context.log.info("URL: %s", page_url)
            doc = context.http.get(page_url).html
            for member in doc.findall('.//ul[@class="member-results"]/li'):
                parse_entry(emitter, member)
            visited.add(page_url)
            # Pick any pagination link we have not fetched yet; when
            # none remains, page_url stays None and the loop ends.
            page_url = None
            for anchor in doc.findall('.//div[@id="pagination"]//a'):
                candidate = urljoin(URL, anchor.get('href'))
                if candidate not in visited:
                    page_url = candidate
    emitter.finalize()
def parse(context, data):
    """Scrape the 'List of Debarred' HTML table and emit a LegalEntity
    plus its Sanction for each six-column data row."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        for table in res.html.findall('.//table'):
            if 'List of Debarred' not in table.get('summary', ''):
                continue
            for row in table.findall('.//tr'):
                cells = row.findall('./td')
                if len(cells) != 6:
                    # Header/separator rows do not have six data cells.
                    continue
                values = [clean_value(cell) for cell in cells]
                entity = emitter.make('LegalEntity')
                entity.make_id(*values)
                names = clean_name(values[0])
                if not len(names):
                    context.log.warning("No name: %r", values)
                    continue
                entity.add('name', names[0])
                entity.add('address', values[1])
                entity.add('country', normalize_country(values[2]))
                for alias in names[1:]:
                    entity.add('alias', alias)
                sanction = emitter.make('Sanction')
                sanction.make_id('Sanction', entity.id)
                sanction.add('authority', 'World Bank Debarrment')
                sanction.add('program', values[5])
                sanction.add('startDate', clean_date(values[3]))
                sanction.add('endDate', clean_date(values[4]))
                sanction.add('sourceUrl', data.get('url'))
                emitter.emit(entity)
                emitter.emit(sanction)
    emitter.finalize()
def parse(context, data):
    """Emit one entry per account record found in the XML feed."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        # NOTE(review): 'acount-list' looks misspelled, but it is
        # presumably the literal tag name in the upstream feed —
        # confirm against a sample document before changing.
        for record in res.xml.findall('.//acount-list'):
            parse_entry(emitter, record)
    emitter.finalize()
def eeas_parse(context, data):
    """Emit one entry per ENTITY element in the EEAS XML feed."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        for record in res.xml.findall('.//ENTITY'):
            parse_entry(emitter, record)
    emitter.finalize()
def parse(context, data):
    """Emit one entity per <mep> node in the members XML feed."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        for mep in res.xml.findall('.//mep'):
            parse_node(emitter, mep)
    emitter.finalize()
def parse(context, data):
    """Emit one entry per <record> node in the XML feed."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as result:
        for record in result.xml.findall('.//record'):
            parse_entry(emitter, record)
    emitter.finalize()