def parse_country(country, guess=True):
    """Determine a two-letter country code based on an input.

    The input may be a country code, a country name, etc.
    """
    if guess:
        country = countrynames.to_code(country)
    if country is not None:
        country = country.lower()
        if is_country_code(country):
            return country
def clean_text(self, country, fuzzy=False, **kwargs):
    """Determine a two-letter country code based on an input.

    The input may be a country code, a country name, etc.
    """
    code = country.lower().strip()
    if code in self.codes:
        return code
    country = countrynames.to_code(country, fuzzy=fuzzy)
    if country is not None:
        return country.lower()
def test_GB():
    assert to_code("Scotland") == "GB-SCT"
    assert to_code("Wales") == "GB-WLS"
    assert to_code("Northern Ireland") == "GB-NIR"
    assert to_code("Northern Ireland", fuzzy=True) == "GB-NIR"
    assert to_code(
        "United Kingdom of Great Britain and Northern Ireland") == "GB"
    assert to_code("United Kingdom of Great Britain and Northern Ireland",
                   fuzzy=True) == "GB"
def clean_text(
    self,
    text: str,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    """Determine a two-letter country code based on an input.

    The input may be a country code, a country name, etc.
    """
    code = countrynames.to_code(text, fuzzy=fuzzy)
    if code is not None:
        lower = code.lower()
        if lower in self.codes:
            return lower
    return None
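The three helpers above share one normalisation pattern: resolve the input to an ISO 3166-1 alpha-2 code with countrynames.to_code(), lower-case it, and only accept it if it is a known code. A minimal standalone sketch of that pattern, assuming a hypothetical KNOWN_CODES set in place of the self.codes registry used by the methods above:

from typing import Optional

import countrynames

# Hypothetical stand-in for the self.codes registry used by the methods above.
KNOWN_CODES = {"de", "gb", "ru", "fk"}


def to_country_code(text: str, fuzzy: bool = False) -> Optional[str]:
    """Resolve a country name or code to a lower-cased two-letter code."""
    code = countrynames.to_code(text, fuzzy=fuzzy)
    if code is not None:
        lower = code.lower()
        if lower in KNOWN_CODES:
            return lower
    return None


print(to_country_code("Germany"))                              # "de"
print(to_country_code("Rossiyskaya Federatsiya", fuzzy=True))  # "ru"
print(to_country_code("Nothing"))                              # None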
def emit_entity(self, data):
    uid = data.get('uid') or data.get('uid_canonical')
    if uid is None:
        raise ValueError("No UID for entity: %r" % data)
    if data.get('type') not in TYPES:
        raise ValueError("Invalid entity type: %r" % data)
    try:
        data['weight'] = int(data.get('weight', 0))
    except Exception:
        raise ValueError("Invalid weight: %r" % data)
    if 'country' in data:
        data['country'] = countrynames.to_code(data['country'])
    name = data.get('name')
    if name is not None:
        name = unicode(name).strip()
        if not len(name):
            name = None
        data['name'] = name
    for k, v in list(data.items()):
        if v is None:
            data.pop(k)
    # TODO: partial dates
    aliases = data.pop('aliases', [])
    self.entities.upsert(data, ['origin', 'uid'])
    for alias in aliases:
        self.emit_alias({
            'name': alias,
            'origin': data.get('origin'),
            'uid': data.get('uid'),
            'uid_canonical': data.get('uid_canonical'),
        })
    return data
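In emit_entity above, the country field is overwritten with whatever countrynames.to_code() returns, and any keys whose value ends up as None are stripped before the upsert. A minimal sketch of just that behaviour outside the class; the data dict is invented for illustration:

import countrynames

# Invented example record; only the country handling from emit_entity is shown.
data = {'name': 'Example Ltd', 'country': 'Germany', 'weight': None}

if 'country' in data:
    # to_code() returns the ISO 3166-1 alpha-2 code, or None if nothing matches.
    data['country'] = countrynames.to_code(data['country'])

# Drop keys whose value is None, as the loop in emit_entity does.
for key in [k for k, v in data.items() if v is None]:
    data.pop(key)

print(data)  # {'name': 'Example Ltd', 'country': 'DE'}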
def test_non_standard_codes():
    assert to_code("European Union") == "EU"
    assert to_code_3("European Union") == "EUU"
    assert to_code("Kosovo") == "XK"
    assert to_code_3("Kosovo") == "XKX"
def test_fuzzy_matching():
    assert to_code('Rossiyskaya Federatsiya', fuzzy=True) == "RU"
    assert to_code("Falklands Islands", fuzzy=True) == "FK"
    assert to_code("TGermany", fuzzy=True) == "DE"
def test_unicode():
    assert to_code(u'Российская Федерация') == "RU"
def parse_officer(line):
    results = dict()
    # number of the company to which this officer is appointed.
    # The majority of company numbers are 8-digit numeric; however, some
    # consist of a prefix of 2 alphanumeric characters
    # followed by 6 digits.
    results['appointed_to_company_number'] = line[0:8]
    # nature of the record:
    # 1 is person (as in officer, could be a legal or natural person)
    # 2 is company (as in companies to which the officer is appointed)
    results['record_type'] = line[8]
    # source document of the appointment date.
    results['appointment_date_origin_code'] = line[9:10]
    # role of the appointed officer.
    results['officer_role_code'] = line[10:12]
    # personal number: as of 2009 PNRs are composed of 12 digits. The first
    # 8 uniquely identify the person. A person is composed of a name and a
    # usual residential address (URA) which is *not* public. If a director
    # with many appointments changes URA or name for an appointment, then
    # the last 4 digits of the PNR are incremented from 0000 to 0001.
    results['pnr'] = line[12:24]
    # indicator for the record being a company. An officer can be either a
    # natural (Homo sapiens != Y) or legal (corporation == Y) person.
    results['is_company'] = line[24] == 'Y'
    # filler, can throw away.
    results['filler_a'] = line[25:32]
    # appointment dates. If a date is provided for officer_role_code 11,
    # 12, or 13 this refers to the date that the form was registered.
    # Resigned appointments are not normally included in a snapshot so this
    # field will usually be blank. date format: CCYYMMDD (C for century, Y
    # for year, M for month, D for day).
    results['start_date_text'] = line[32:40]
    results['end_date_text'] = line[40:48]
    # postal code.
    results['service_address_post_code'] = line[48:56]
    # dob. partial_dob field will contain either all spaces, or a partial
    # dob followed by 2 space chars 'CCYYMM  '. If full_dob is provided
    # then partial_dob will also be provided, but partial_dob may be
    # provided w/out full_dob.
    results['partial_dob'] = line[56:64]
    # full_dob could be thrown away but we keep it for completeness.
    # tested on 1000 records, always blank (spaces).
    results['full_dob'] = line[64:72]
    # holds the length of the variable data bit (incl. "<" chars), used for
    # validation, do not insert in database.
    results['unwanted_variable_data_length'] = line[72:76]
    # variable_data: contains officer's name, service address, occupation,
    # and nationality, formatted as below:
    # TITLE                      |-> 'title'
    # <FORENAMES                 |-> 'name'
    # <SURNAME                   |-> 'surname'
    # <HONOURS                   |-> 'honours'
    # <CARE OF                   |-> 'service_address_care_of'
    # <PO BOX                    |-> 'service_address_po_box'
    # <ADDRESS LINE 1            |-> 'service_address_line_1'
    # <ADDRESS LINE 2            |-> 'service_address_line_2'
    # <POST TOWN                 |-> 'service_address_post_town'
    # <COUNTY                    |-> 'service_address_county'
    # <COUNTRY                   |-> 'service_address_country'
    # <OCCUPATION                |-> 'occupation'
    # <NATIONALITY               |-> 'nationality'
    # <USUAL RESIDENTIAL COUNTRY |-> 'ura_country'
    # <                          |-> 'filler_b'
    # Each variable data field will contain 14 "<" delimiters. Consecutive
    # "<" delimiters indicate that the particular element of the variable
    # data is not present.
    variable_data = line[76:].rstrip(' \n')
    vardata = variable_data.split('<')
    vardata_components = (
        'title',
        'name',
        'surname',
        'honours',
        'service_address_care_of',
        'service_address_po_box',
        'service_address_line_1',
        'service_address_line_2',
        'service_address_post_town',
        'service_address_county',
        'service_address_country',
        'occupation',
        'nationality',
        'ura_country',
        'filler_b',
    )
    # after the last '<' there's just a run of white space until the end of
    # the line, can throw away.
    for component, datapoint in zip(vardata_components, vardata):
        results[component] = datapoint
    results["ura_country_norm"] = to_code(
        results.get("ura_country", None), fuzzy=True)
    results["nationality_norm"] = to_code(
        results.get("nationality", None), fuzzy=True)
    results["service_address_country_norm"] = to_code(
        results.get("service_address_country", None), fuzzy=True)
    results["name_fp"] = generate(
        results.get("name", "") + " " + results.get("surname", ""))
    return results
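To make the '<'-delimited layout above concrete, here is a small illustration of how the variable-data portion splits into named components. The sample string is invented for illustration; empty fields between consecutive '<' delimiters come out as empty strings:

# Invented sample of the variable-data portion of an officer record.
sample = ("MR<JOHN<SMITH<<<<1 EXAMPLE STREET<<LONDON<<ENGLAND"
          "<DIRECTOR<BRITISH<UNITED KINGDOM<")

components = (
    'title', 'name', 'surname', 'honours', 'service_address_care_of',
    'service_address_po_box', 'service_address_line_1',
    'service_address_line_2', 'service_address_post_town',
    'service_address_county', 'service_address_country', 'occupation',
    'nationality', 'ura_country', 'filler_b',
)

parsed = dict(zip(components, sample.split('<')))
print(parsed['surname'])      # "SMITH"
print(parsed['ura_country'])  # "UNITED KINGDOM"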
def country(self, name):
    self.country_name = name
    self.country_code = countrynames.to_code(name)
def normalize_country(name):
    return countrynames.to_code(name)
def main(f, failed_out, new_out, log_out):
    logging.basicConfig(filename=log_out, level=logging.INFO)
    logging.getLogger().addHandler(logging.StreamHandler())
    new_docs = []
    failed_docs = []
    try:
        with open(f) as fd:
            docs = list(yaml.load_all(fd))
            N = len(docs)
            for K, d in enumerate(docs):
                url = d.get('url')
                affs = has_affiliation(d)
                success_rate = 100.0 - float(len(failed_docs)) / N * 100
                logging.info(
                    "{c.Fore.CYAN}[{s:.1f}%]: {K}/{N}. {t}>>"
                    "{c.Fore.YELLOW}Trying {0}{c.Style.RESET_ALL}".format(
                        url, s=success_rate, K=K, N=N, t=time.ctime(),
                        c=colorama))
                try:
                    if not affs:
                        raise Exception('No affiliations')
                    countries = list(
                        filter(bool, [
                            countrynames.to_code(aff, fuzzy=True)
                            for aff in affs
                        ]))
                    centers = list(
                        filter(bool, [get_centers(aff) for aff in affs]))
                    if not centers:
                        raise Exception(
                            'No countries ({aff!s})'.format(aff=affs))
                except Exception as e:
                    logging.info("{c.Fore.RED}\tFailed ({e})"
                                 "{c.Style.RESET_ALL}".format(e=e, c=colorama))
                    failed_docs.append(d)
                else:
                    logging.info("{c.Fore.GREEN}\tsuccess "
                                 "{codes!s}\n"
                                 "\t\t{affs!s}"
                                 "{c.Style.RESET_ALL}".format(
                                     codes=countries, affs=affs, c=colorama))
                    d['countries'] = countries
                    d['centers'] = centers
                    new_docs.append(d)
    except Exception as e:
        logging.error(e)
    finally:
        with open(failed_out, 'w+') as fd:
            logging.info('writing ' + failed_out)
            yaml.dump_all(list(failed_docs), fd, allow_unicode=True,
                          default_flow_style=False)
        with open(new_out, 'w+') as fd:
            logging.info('writing ' + new_out)
            yaml.dump_all(list(new_docs), fd, allow_unicode=True,
                          default_flow_style=False)
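The countries list above relies on to_code() returning None for affiliations it cannot resolve, so filter(bool, ...) silently drops the misses. A small standalone illustration; the matched names are taken from the tests in this collection, and the last entry is an invented string assumed not to match any country name:

import countrynames

affs = ["Rossiyskaya Federatsiya", "Falklands Islands",
        "Department of Mathematics"]

codes = [countrynames.to_code(aff, fuzzy=True) for aff in affs]
countries = list(filter(bool, codes))
print(codes)      # ['RU', 'FK', None]
print(countries)  # ['RU', 'FK']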
def process_file(filepath):
    with open(filepath) as f:
        for ix, line in enumerate(f):
            print(f"Inserting line {ix} of file {filepath}")
            # check line is not an empty string.
            if line.strip():
                jsonline = json.loads(line)
                line_type = determine_line(jsonline)
                if line_type == "psc":
                    (current_psc, current_address, current_identification,
                     current_control) = unpack_psc_line(jsonline)
                    # normalise some fields and insert
                    current_psc["name_fp"] = generate(current_psc["name"])
                    current_psc["country_of_residence_norm"] = to_code(
                        current_psc.get("country_of_residence", None),
                        fuzzy=True)
                    current_psc["nationality_norm"] = to_code(
                        current_psc.get("nationality", None), fuzzy=True)
                    psc_id = psc_table.insert(current_psc)
                    if current_address:
                        # normalise country field and insert
                        current_address["country_norm"] = to_code(
                            current_address.get("country", None), fuzzy=True)
                        address_table.insert({
                            **current_address,
                            **{"psc_serial_id": psc_id}
                        })
                    if current_identification:
                        current_identification[
                            "country_registered_norm"] = to_code(
                                current_identification.get(
                                    "country_registered", None), fuzzy=True)
                        identification_table.insert({
                            **current_identification,
                            **{"psc_serial_id": psc_id}
                        })
                    if current_control:
                        # stack the array of control types like this:
                        # company_number | nature_of_control
                        # -----------------------------------
                        # OC123456       | sometypeofcontrol_1
                        # OC123456       | sometypeofcontrol_2
                        # OC123456       | sometypeofcontrol_3
                        root = current_control["company_number"]
                        for nature in current_control["natures_of_control"]:
                            stacked_control_data = {
                                "company_number": root,
                                "psc_serial_id": psc_id,
                                "nature_of_control": nature
                            }
                            control_table.insert(stacked_control_data)
                # the exempted psc json is different. Needs its own processing.
                elif line_type == "exemptions":
                    # lazily create the dictionaries to be inserted as
                    # records into the table.
                    exemptions_generator = unpack_exemptions_line(
                        json.loads(line))
                    for exemption_dict in exemptions_generator:
                        exemptions_table.insert(exemption_dict)
                elif line_type == "summary_line":
                    # example of a summary_line below:
                    # {"data":
                    #   {
                    #     "kind": "totals#persons-of-significant-control-snapshot",
                    #     "persons_of_significant_control_count": 7131880,
                    #     "statements_count": 564130,
                    #     "exemptions_count": 92,
                    #     "generated_at": "2020-03-25T03:39:38Z"}
                    # }
                    summary_data = jsonline.pop("data")
                    summary_table.insert(summary_data)
            # if line is empty, go to the next one.
            else:
                continue
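The nature_of_control stacking above turns one PSC record with a list of control natures into one row per nature. A minimal sketch of that transformation with invented data and no database involved:

# Invented control record, shaped like the one unpacked above.
current_control = {
    "company_number": "OC123456",
    "natures_of_control": [
        "ownership-of-shares-25-to-50-percent",
        "voting-rights-25-to-50-percent",
    ],
}
psc_id = 1  # hypothetical serial id returned by the insert above

rows = [
    {
        "company_number": current_control["company_number"],
        "psc_serial_id": psc_id,
        "nature_of_control": nature,
    }
    for nature in current_control["natures_of_control"]
]
for row in rows:
    print(row)
# {'company_number': 'OC123456', 'psc_serial_id': 1, 'nature_of_control': 'ownership-of-shares-25-to-50-percent'}
# {'company_number': 'OC123456', 'psc_serial_id': 1, 'nature_of_control': 'voting-rights-25-to-50-percent'}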
def normalize_value(self, value, prop, record):
    return [countrynames.to_code(value)]
def clean(self, value, prop, record):
    value = super(CountryProperty, self).clean(value, prop, record)
    return countrynames.to_code(value) or value
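Unlike the earlier clean_text variants, this clean() falls back to the raw value when no code is found, rather than returning None. A minimal standalone sketch of that fallback, using inputs taken from the tests in this collection; clean_country is a hypothetical helper name:

import countrynames

def clean_country(value):
    # to_code() returns None for unrecognised input, so keep the raw value then.
    return countrynames.to_code(value) or value

print(clean_country("Germany"))  # "DE"
print(clean_country("Nothing"))  # "Nothing" (no match, value passed through)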
def test_to_code():
    assert to_code("Germany") == "DE"
    assert to_code("UK") == "GB"
    assert to_code("Nothing") == None
def test_fuzzy_matching():
    assert to_code("Rossiyskaya Federacia", fuzzy=True) == "RU"
    assert to_code("Falklands Islands", fuzzy=True) == "FK"
    assert to_code("TGermany", fuzzy=True) == "DE"
    assert to_code_3("State of Palestine", fuzzy=True) == "PSE"
def test_to_code():
    assert to_code("Germany") == "DE"
    assert to_code("UK") == "GB"
    assert to_code("North Macedonia") == "MK"
    assert to_code("Nothing") is None
# coding: utf-8
import countrynames

tests = [
    'Germany',
    'DE',
    'UK',
    u'Российская Федерация',
    'Rossiyskaya Federatsiya',
    'Tgermany',
    None,
]

# Compare strict and fuzzy matching for each test input.
for test in tests:
    print([test,
           countrynames.to_code(test, fuzzy=False),
           countrynames.to_code(test, fuzzy=True)])