def _get_officers(self, data, officer):
    """Collect the officers linked from ``data`` into ``self.officers``.

    Each officer dict returned by ``getlink(data, 'officers')`` is patched in
    place so that it can be wrapped by ``CompaniesHouseOfficer``, flagged with
    whether it matches ``officer``, and its ``.data`` appended to
    ``self.officers``.

    :param data: record whose 'officers' link is followed.
    :param officer: record passed to ``self.match_significant_to_self`` to
        decide whether a listed officer is the person being looked up.
    """
    officer_dicts = getlink(data, 'officers')['items']

    for off in officer_dicts:
        # Names arrive as 'LAST, First'. Strip each part so the rebuilt title
        # has no stray leading space, and do not repeat the name when there is
        # no comma at all (previously 'ACME LTD' became 'Acme Ltd Acme Ltd').
        parts = [part.strip() for part in off['name'].split(',')]
        if len(parts) > 1:
            name = '%s %s' % (parts[-1].title(), parts[0].title())
        else:
            name = parts[0].title()

        # setting the appointments to [] ensures we don't recurse all through
        # the companies house db
        off['appointments'] = []
        off['title'] = name

        # Tolerate a missing address and skip empty/None fields rather than
        # failing inside str.join.
        address = off.get('address') or {}
        off['address_snippet'] = ' '.join(
            value for value in address.values() if value)
        off['matches'] = {}

        x = CompaniesHouseOfficer(off)
        x.isOfficer = bool(
            self.match_significant_to_self(off,
                                           officer,
                                           fuzzy_threshold=65,
                                           count_threshold=1))
        self.officers.append(x.data)
def get_company_officers(self, company):
    """Return the officer items linked from ``company``.

    :param company: company record (dict); its 'links' mapping is inspected.
    :returns: the 'items' list fetched through the 'officers' link, or an
        empty list when the record has no such link (or no 'links' at all).
    """
    # `in` replaces dict.has_key(), which no longer exists in Python 3.
    links = company.get('links') or {}
    if 'officers' in links:
        return getlink(company, 'officers')['items']
    return []
def get_company_persons(self, company):
    """Return the persons-with-significant-control items linked from ``company``.

    :param company: company record (dict); its 'links' mapping is inspected.
    :returns: the 'items' list fetched through the
        'persons_with_significant_control' link, or an empty list when the
        record has no such link (or no 'links' at all).
    """
    # `in` replaces dict.has_key(), which no longer exists in Python 3.
    links = company.get('links') or {}
    if 'persons_with_significant_control' in links:
        return getlink(company, 'persons_with_significant_control')['items']
    return []
def _get_company(self, data, get_officers, get_filing, get_persons):
    """Build the company wrapper for ``data`` and copy its summary onto self.

    The record behind the 'company' link of ``data`` is wrapped in a
    ``CompaniesHouseCompany`` (together with ``self._officer`` and the three
    ``get_*`` arguments, passed through unchanged). The wrapper's name, status
    and number are then mirrored onto this object and its ``data`` is stored
    as ``self.company``.
    """
    linked_record = getlink(data, 'company')
    wrapper = CompaniesHouseCompany(linked_record, self._officer,
                                    get_officers, get_filing, get_persons)

    # mirror the summary fields in the same order they were originally read
    for field in ('company_name', 'company_status', 'company_number'):
        setattr(self, field, getattr(wrapper, field))

    self.company = wrapper.data
def _get_persons(self, data, officer):
    """Collect persons with significant control from ``data``.

    Every item behind the 'persons_with_significant_control' link is wrapped
    in a ``CompaniesHousePerson``. When ``self.match_significant_to_self``
    reports a match against ``officer`` the wrapper is flagged with
    ``isOfficer = True`` (the flag is left untouched otherwise). The wrapper's
    ``data`` is appended to ``self.persons`` in either case.
    """
    response = getlink(data, 'persons_with_significant_control')

    for entry in response['items']:
        wrapped = CompaniesHousePerson(entry)
        is_same_person = self.match_significant_to_self(entry, officer)
        if is_same_person:
            wrapped.isOfficer = True
        self.persons.append(wrapped.data)
def __init__(self, item_id, category_id, raw_string, pretty, registered,
             amount, company, link):
    """
    OtherShareholdingsItem

    Initialises the base ``Item``, marks the item as wealth, and resolves the
    company record plus its persons-with-significant-control and officers via
    ``getlink``. When the resolved company equals ``{'items': []}`` it is
    replaced by a placeholder record named after ``pretty``.
    """
    Item.__init__(self, item_id, category_id, raw_string, pretty, registered,
                  amount)

    self.link = link
    self.isWealth = True

    # resolve the company first; the two follow-up lookups are made against it
    self.company = getlink(company, 'self')
    self.persons = getlink(self.company,
                           'persons_with_significant_control')['items']
    self.officers = getlink(self.company, 'officers')['items']

    # the empty-result shape gets swapped for a minimal stand-in record
    empty_result = {'items': []}
    if self.company == empty_result:
        self.company = dict(company_name=pretty,
                            company_number='N/A',
                            company_status='N/A')
def identify_company(self, keywords, month, year, first, middle, last,
                     display):
    """Populate ``self.matched_companies`` from the search results in ``self.data``.

    ``self.data`` holds search results, not full company records. A result is
    kept when ``filter_by_name_string`` returns something other than ``[]``
    for at least one form of the name; the full record is then fetched
    through the result's 'self' link.

    Name forms tried, in order: ``display``, "first last" and, only when
    ``middle`` is non-empty, "first middle last". (Previously the three-part
    form was always queried, producing a malformed "first  last" string with a
    double space when there was no middle name.)

    :param keywords: accepted for interface compatibility; not used here.
    :param month: accepted for interface compatibility; not used here.
    :param year: accepted for interface compatibility; not used here.
    :param first: first name.
    :param middle: middle name, or '' when there is none.
    :param last: last name.
    :param display: display name.
    """
    self.matched_companies = []

    names = [display, '%s %s' % (first, last)]
    if middle != '':
        names.append('%s %s %s' % (first, middle, last))

    for record in self.data:
        # a record counts once no matter how many name forms hit, so stop at
        # the first form that matches
        if any(filter_by_name_string(record, name) != [] for name in names):
            self.matched_companies.append(getlink(record, 'self'))
# honorifics ignored when comparing people's names
_HONORIFICS = frozenset(
    ['mr', 'mrs', 'ms', 'miss', 'sir', 'lady', 'dr', 'rt', 'hon'])

# officer names come in as 'LAST, First'; this finds the 'LAST, ' chunk
_SURNAME_FIRST_RE = re.compile('[A-Z]+, ')


def _strip_honorifics(name):
    """Return ``name`` with honorific words dropped (expects lower-case input)."""
    kept = [word for word in name.split(' ') if word not in _HONORIFICS]
    return ' '.join(kept).strip()


def _officer_name_lower(raw_name):
    """Return a lower-cased officer name, turning 'LAST, First' into 'first last'."""
    found = _SURNAME_FIRST_RE.search(raw_name)
    if not found:
        return raw_name.lower()
    surname_chunk = found.group()
    forenames = raw_name.split(surname_chunk)[-1]
    surname = surname_chunk.split(',')[0]
    return ('%s %s' % (forenames, surname)).lower()


def check_match(i, company_search_string, month, year, first, middle, last,
                display):
    """Decide whether search result ``i`` matches the company or the person.

    The checks run in this order and the first success wins:

    1. the result title equals the search string (case-insensitive);
    2. the number of highlighted title matches equals the number of words in
       the search string (with 'ltd' / 'limited' removed); for search strings
       of more than five words one missing word is tolerated;
    3. one of the company's previous names equals the search string
       (case-insensitive);
    4. a person with significant control, honorifics removed, has exactly the
       display name;
    5. an officer matches: exact display name, or every word of the display
       name appears in the officer's name, or both ``first`` and ``last``
       appear in it.

    Steps 3-5 fetch the full company record and its linked lists through
    ``getlink``. All name comparisons are case-insensitive.

    :param i: search result dict; needs 'title', may carry
        ``i['matches']['title']`` (a flat list read as start/end pairs).
    :param company_search_string: company name that was searched for.
    :param month: accepted for interface compatibility; date of birth is not
        currently verified.
    :param year: accepted for interface compatibility; see ``month``.
    :param first: person's first name.
    :param middle: person's middle name; not needed because the first + last
        test in step 5 already covers first + middle + last.
    :param last: person's last name.
    :param display: person's display name.
    :returns: True when any check succeeds, otherwise False.
    """
    title = i['title']
    search_lower = company_search_string.lower()

    # remove ltd and limited from search string, companies house don't match
    # against it
    search_clean = search_lower.replace('ltd', '').replace('limited',
                                                           '').strip()

    # 1. exact title match. Both sides are lower-cased; previously only the
    # title was, so a search string containing capitals could never match.
    if title.lower() == search_lower:
        return True

    # 2. every searched word was highlighted in the title
    matches = i.get('matches') or {}
    if 'title' in matches:
        # the list is consumed two entries at a time, one pair per matched word
        matched_words = len(matches['title']) // 2
        searched_words = len(search_clean.split(' '))
        if searched_words == matched_words:
            return True
        # if there are more than 5 search words, account for one missing
        if searched_words > 5 and searched_words - 1 == matched_words:
            return True

    # 3. no title match: fetch the company record and check previous names,
    # maybe the company has changed name
    company = getlink(i, 'self')
    for previous in company.get('previous_company_names') or []:
        if previous['name'].lower() == search_lower:
            return True

    # 4. persons with significant control, compared against the person's
    # display name (NOT the company search string)
    display_lower = display.lower()
    persons = getlink(company, 'persons_with_significant_control')['items']
    for person in persons:
        person_name = (person.get('name') or '').lower()
        if not person_name:
            # nothing to compare against on this entry
            continue
        if _strip_honorifics(person_name) == display_lower:
            return True

    # 5. officers
    first_lower = first.lower()
    last_lower = last.lower()
    display_words = display_lower.split(' ')
    officers = getlink(company, 'officers')['items']
    for officer in officers:
        officer_string = _strip_honorifics(
            _officer_name_lower(officer['name']))
        if officer_string == display_lower:
            return True

        officer_words = officer_string.split(' ')
        # if all the display names are in the officer name, match that
        if all(word in officer_words for word in display_words):
            return True
        if first_lower in officer_words and last_lower in officer_words:
            return True

    return False
def lookup(self):
    """Resolve ``self.donor`` to a Companies House record or a known entity.

    Sets ``self.company``, ``self.persons``, ``self.officers``, ``self.link``
    and ``self.appointments`` (and ``self.type`` on the non-company-number
    paths), and prints a ``MISSING`` line when nothing was found.

    Resolution order:

    1. hard-coded corrections to ``self.donor`` / ``self.status``;
    2. a company number from the ``urls`` patch table, or one quoted in the
       status as ``registration <number>``;
    3. with a company number: fetch the company, its persons with significant
       control and its officers directly;
    4. without one, by status: individuals are resolved through the ``people``
       patch table of appointment links; unions, charities, clubs, trusts and
       similar are mapped through their alias tables; anything else falls back
       to a company search fuzzy-matched on name and address.

    NOTE(review): the original formatting of this function was lost; the
    nesting of ``found = True`` under the alias-table checks was reconstructed
    and should be confirmed against version control.
    """
    ch_base = 'https://beta.companieshouse.gov.uk'

    def placeholder_company():
        # a fresh dict each time so callers never share one mutable record
        return {
            'company_name': self.donor,
            'company_number': 'N/A',
            'company_status': 'Active'
        }

    self.donor = str(self.donor)
    company_number = None
    people_links = []
    found = False

    # ugly hack corrections
    if self.donor in ['Tresco Estate', 'James Hay', 'Think BDW Ltd']:
        self.status = 'company'

    if self.status == 'company, no 10120655':
        company_number = 10120655

    if 'Armed Forces Parliamentary Trust' == self.donor:
        self.status = 'other'

    if u'Buck’s Club 1919' in self.donor:
        self.donor = "Buck's Club 1919"
        self.status = 'members'

    if u'Pratt’s Club' in self.donor:
        self.donor = "Pratt's Club"
        self.status = 'members'

    if 'carlton club' in self.donor.lower():
        self.donor = 'Carlton Club'
        self.status = 'members'

    if 'National Liberal Club' in self.donor:
        self.donor = 'National Liberal Club'
        self.status = 'members'

    if 'The Public Interest Foundation (UK charity)' == self.donor:
        self.status = 'charity'

    # apply patches
    if self.donor in urls:
        company_number = urls[self.donor].split('/')[-1]

    if self.donor in people:
        people_links = people[self.donor]

    if not company_number:
        # use the supplied company number from the register of interests
        company_number_search = re.search('registration [0-9|a-z|A-Z]+',
                                          self.status)
        if company_number_search:
            company_number = company_number_search.group().split(
                'registration ')[-1]

            # Purely numeric numbers are padded to 8 digits. Numbers that mix
            # digits and letters (e.g. '12345R') are left as supplied; they
            # used to crash in int().
            if company_number.isdigit():
                company_number = '%08d' % int(company_number)

    self.company = placeholder_company()
    self.persons = []
    self.officers = []
    self.link = None
    self.appointments = []

    if company_number:
        # we have a company number, no need to search for it
        self.company = getlink(
            {'links': {
                'self': '/company/%s' % str(company_number)
            }}, 'self')
        self.persons = getlink(
            self.company, 'persons_with_significant_control')['items']
        self.officers = getlink(self.company, 'officers')['items']

        if 'errors' not in self.company:
            self.link = ch_base + self.company['links']['self']
            found = True
        else:
            self.company = placeholder_company()
            self.link = ''
    else:
        status_lower = self.status.lower()

        if 'individual' in status_lower or 'private' in status_lower:
            # for individuals, we store the appointments, then the company,
            # officers etc as children of the appointment
            for pl in people_links:
                bit = pl.split(ch_base)[-1]
                appointments = getlink({'links': {
                    'self': '%s' % bit
                }}, 'self')
                for i in appointments['items']:
                    if i not in self.appointments:
                        self.appointments.append(i)

                # just take the last one
                self.link = pl
                found = True

            for app in self.appointments:
                # add the company, officers and persons record to the
                # appointment record
                app['company'] = getlink(app, 'company')
                app['officers'] = getlink(app['company'], 'officers')['items']
                app['persons_with_significant_control'] = getlink(
                    app['company'],
                    'persons_with_significant_control')['items']
        else:
            # everything below here should generate a company / entity.
            # (status keyword, resulting type, alias table); first hit wins,
            # so the order of the rows matters.
            entity_rules = (
                ('trade', 'union', trade_union),
                ('charity', 'charity', charities),
                ('unincorporated', 'club', clubs),
                ('members', 'club', clubs),
                ('friendly', 'club', clubs),
                ('other', 'other', others),
                ('trust', 'other', others),
                ('provident', 'company', others),
            )
            rule = next(
                (r for r in entity_rules if r[0] in status_lower), None)

            if rule is not None:
                _keyword, entity_type, aliases = rule
                self.type = entity_type
                if self.donor in aliases:
                    self.donor = aliases[self.donor]
                    found = True
            elif 'visit' in self.status:
                # TODO
                self.type = 'visit'
            else:
                # we don't have a company number, so do a company search
                if 'llp' in status_lower or 'limited' in status_lower:
                    self.type = 'company'
                else:
                    self.type = 'other'

                # these are the remaining things to search - can only do a
                # company search really
                companies = CompaniesHouseCompanySearch([self.donor])
                for i in companies.data:
                    # we need the name and address to fuzzy match
                    name_ratio = fuzz.token_set_ratio(i['title'].lower(),
                                                      self.donor)
                    if name_ratio <= 90:
                        continue

                    snippet = i['address_snippet']
                    if not snippet:
                        continue

                    addr_ratio = fuzz.token_set_ratio(snippet.lower(),
                                                      self.address)
                    # if the address matches enough
                    if addr_ratio > 90:
                        self.link = ch_base + i['links']['self']
                        self.company = getlink(i, 'self')
                        self.persons = getlink(
                            self.company,
                            'persons_with_significant_control')['items']
                        self.officers = getlink(self.company,
                                                'officers')['items']
                        found = True
                        break

    if not found:
        # a single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function
        print('\tMISSING %s: %s' % (self.status.upper(), self.donor))
def _get_filing_history(self, data):
    """Append the company's filing history entries to ``self.filing``.

    Each item behind the 'filing_history' link of ``data`` is wrapped in a
    ``CompaniesHouseFiling`` and the wrapper's ``data`` is stored.
    """
    history = getlink(data, 'filing_history')
    self.filing.extend(
        CompaniesHouseFiling(entry).data for entry in history['items'])
def _get_appointments(self, record):
    """Return whatever ``getlink`` yields for the 'self' link of ``record``.

    Used to fetch the appointments of a found officer.
    """
    appointments = getlink(record, 'self')
    return appointments
def do_logic(self):
    """
    Look for companies that match the name and for officers that match the
    name, and add a ``ShareholdingsItem`` to ``self.items`` for every company
    found either way.

    Company search: matched companies are only added when the search also
    produced at least one matched officer or matched person. Officer search:
    the company behind every appointment of every matched user is added.
    Records that contain an 'errors' key are skipped.

    NOTE(review): ``item_id`` is computed once, so every item added by a
    single call shares the same id. Kept as-is; confirm whether ids are meant
    to be unique per item.
    """
    next_id = len(self.items) + 1
    item_id = '%04d' % next_id

    # both searches are driven by the same identifying details
    search_args = dict(keywords=KEYWORDS,
                       month=self.month,
                       year=self.year,
                       first=self.first,
                       middle=self.middle,
                       last=self.last,
                       display=self.display)

    def add_company(company):
        # skip error responses; `in` replaces dict.has_key(), which no longer
        # exists in Python 3
        if 'errors' in company:
            return
        raw_string = ' '.join(self.names)
        pretty = self.display.title()
        registered = ''
        amount = 0
        url = base_url + company['links']['self']
        self.items.append(
            ShareholdingsItem(item_id, self.category_id, raw_string, pretty,
                              registered, amount, company, url))

    companies = CompaniesHouseCompanySearch(self.names)
    companies.get_data(**search_args)

    if companies.matched_officers or companies.matched_persons:
        for i in companies.matched_companies:
            add_company(getlink(i, 'self'))

    users = CompaniesHouseUserSearch(self.names)
    users.identify(**search_args)

    for i in users.matched:
        for app in i['appointments']:
            add_company(getlink(app, 'company'))