def home(request):
    """Render the index page listing every scraper report with a status icon.

    Downloads the representative sets from the Represent API and maps each
    set's name to its data URL. Each report without an exception is then
    tagged with an icon:

    - ``'noop'`` if a set with the scraper's name exists and its data URL
      starts with ``https://scrapers.herokuapp.com/represent/``;
    - ``'replace'`` if a set with the scraper's name exists with any other
      data URL;
    - ``'add'`` if no set has the scraper's name.

    Reports whose module raises ``ImportError`` are deleted.
    """
    # Register the scrapers directory only once: appending unconditionally on
    # every request makes sys.path grow for the lifetime of the process.
    scrapers_path = os.path.abspath('scrapers')
    if scrapers_path not in sys.path:
        sys.path.append(scrapers_path)

    # NOTE(review): this request has no timeout, so a stalled upstream blocks
    # the view indefinitely.
    response = requests.get('https://represent.opennorth.ca/representative-sets/?limit=0', verify=settings.SSL_VERIFY)
    data = json.loads(response.text)

    names = {}
    for obj in data['objects']:
        # The `ca` scraper has "Parliament of Canada" as the root organization.
        if obj['name'] == 'House of Commons':
            names['Parliament of Canada'] = obj['data_url']
        else:
            names[obj['name']] = obj['data_url']

    reports = Report.objects.order_by('module').all()
    for report in reports:
        try:
            # Resolved even for failing reports, so that reports for modules
            # that no longer import are always cleaned up.
            name = module_name_to_metadata(report.module)['name']
            if not report.exception:
                if name in names:
                    if names[name].startswith('https://scrapers.herokuapp.com/represent/'):
                        report.icon = 'noop'
                    else:
                        report.icon = 'replace'
                else:
                    report.icon = 'add'
        except ImportError:
            report.delete()  # delete reports for old modules

    return render(request, 'index.html', {
        'exceptions': Report.objects.exclude(exception='').count(),
        'reports': reports,
    })
def handle(self, *args, **options):
    """Print each scraper module with its report status and 2011 population.

    The first positional argument, if given, is an integer population
    threshold; any further positional arguments are module names. Without
    module names, every entry of the ``scrapers`` directory is considered.
    A module is printed when no threshold is set or when its population is
    below the threshold.
    """
    sys.path.append(os.path.abspath('scrapers'))

    remaining = list(args)
    threshold = int(remaining.pop(0)) if remaining else None
    module_names = remaining or os.listdir('scrapers')

    urls = [
        # Provinces and territories
        'https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=101&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-101.CSV',
        # Census subdivisions
        'https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=701&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-701.CSV',
        # Census divisions
        'https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=301&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-301.CSV',
    ]

    # Maps the first CSV column to the integer in the fifth column.
    populations = {}
    for url in urls:
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        reader = csv.reader(StringIO(response.text))
        next(reader)  # title
        next(reader)  # headers
        for record in reader:
            # The first empty row ends the data section.
            if not record:
                break
            populations[record[0]] = int(record[4] or 0)

    ignored = ('.git', '_cache', '_data', '__pycache__', 'disabled')
    for module_name in module_names:
        if not os.path.isdir(os.path.join('scrapers', module_name)) or module_name in ignored:
            continue

        division_id = module_name_to_metadata(module_name)['division_id']

        try:
            report = Report.objects.get(module=module_name)
        except Report.DoesNotExist:
            status = 'unknown'
        else:
            status = 'error' if report.exception else 'success'

        # Fall back to the last component of the division ID when the
        # division has no `sgc` attribute value.
        sgc = Division.get(division_id).attrs['sgc']
        if not sgc:
            sgc = division_id.rsplit('/', 1)[-1].split(':', 1)[-1]
        if sgc == 'ca':
            sgc = '01'

        population = populations.get(sgc, 0)
        if threshold and population >= threshold:
            continue
        print('%-32s %-7s %8d' % (module_name, status, population))
def handle(self, *args, **options):
    """Print each scraper module with its report status and 2016 population.

    Reads ``options['threshold']`` and ``options['module']``. Without module
    names, every entry of the ``scrapers`` directory is considered; only
    entries containing an ``__init__.py`` are processed. A module is printed
    when no threshold is set or when its population is below the threshold.
    """
    sys.path.append(os.path.abspath('scrapers'))

    threshold = options['threshold']
    module_names = options['module'] or os.listdir('scrapers')

    # @see http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/index-eng.cfm
    urls = [
        # Provinces and territories
        'http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=101&OFT=FULLCSV',
        # Census subdivisions
        'http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=701&OFT=FULLCSV',
        # Census divisions
        'http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=301&OFT=FULLCSV',
    ]

    # Maps the "Geographic code" column to the "Population, 2016" column.
    populations = {}
    for url in urls:
        response = requests.get(url, verify=settings.SSL_VERIFY)
        response.encoding = 'iso-8859-1'
        for record in csv.DictReader(StringIO(response.text)):
            if not record:
                break
            populations[record['Geographic code']] = int(record['Population, 2016'] or 0)

    for module_name in module_names:
        if not os.path.isfile(os.path.join('scrapers', module_name, '__init__.py')):
            continue

        division_id = module_name_to_metadata(module_name)['division_id']

        try:
            report = Report.objects.get(module=module_name)
        except Report.DoesNotExist:
            status = 'unknown'
        else:
            status = 'error' if report.exception else 'success'

        # Fall back to the text after the last colon of the division ID when
        # the division has no `sgc` attribute value.
        sgc = Division.get(division_id).attrs['sgc']
        if not sgc:
            sgc = division_id.rsplit(':', 1)[1]

        population = populations.get(sgc, 0)
        if threshold and population >= threshold:
            continue
        print('{:<32} {:<7} {:8}'.format(module_name, status, population))
def home(request):
    """Render the index page listing every scraper report with a status icon.

    Downloads the representative sets from the Represent API and maps each
    set's name to its data URL ("House of Commons" is stored under
    "Parliament of Canada"). Each report without an exception is tagged with
    an icon:

    - ``'noop'`` if a set with the scraper's name exists and its data URL
      starts with ``https://scrapers.herokuapp.com/represent/``;
    - ``'replace'`` if a set with the scraper's name exists with any other
      data URL;
    - ``'add'`` if no set has the scraper's name.

    A report without an exception whose module raises ``ImportError`` is
    deleted.
    """
    # Register the scrapers directory only once: appending unconditionally on
    # every request makes sys.path grow for the lifetime of the process.
    scrapers_path = os.path.abspath('scrapers')
    if scrapers_path not in sys.path:
        sys.path.append(scrapers_path)

    # NOTE(review): this request has no timeout, so a stalled upstream blocks
    # the view indefinitely.
    data = json.loads(
        requests.get(
            'https://represent.opennorth.ca/representative-sets/?limit=0').text
    )

    names = {}
    for obj in data['objects']:
        if obj['name'] == 'House of Commons':
            names['Parliament of Canada'] = obj['data_url']
        else:
            names[obj['name']] = obj['data_url']

    reports = Report.objects.order_by('module').all()
    for report in reports:
        if not report.exception:
            try:
                name = module_name_to_metadata(report.module)['name']
                if name in names:
                    if names[name].startswith(
                            'https://scrapers.herokuapp.com/represent/'):
                        report.icon = 'noop'
                    else:
                        report.icon = 'replace'
                else:
                    report.icon = 'add'
            except ImportError:
                report.delete()  # delete reports for old modules

    return render_to_response(
        'index.html',
        RequestContext(
            request, {
                'exceptions': Report.objects.exclude(exception='').count(),
                'reports': reports,
            }))
def represent(request, module_name):
    """Return a scraper's memberships as a JSON list of representatives.

    :param request: the incoming HTTP request (not otherwise read here)
    :param module_name: the scraper module; a name ending in ``_candidates``
        selects memberships with the ``candidate`` role, any other name
        excludes the ``member`` and ``candidate`` roles
    :returns: an ``HttpResponse`` with content type ``application/json``
        whose body is a list of dicts using the Represent API field names
    """
    # Register the scrapers directory only once: appending unconditionally on
    # every request makes sys.path grow for the lifetime of the process.
    scrapers_path = os.path.abspath('scrapers')
    if scrapers_path not in sys.path:
        sys.path.append(scrapers_path)

    metadata = module_name_to_metadata(module_name)

    representatives = []

    # Exclude party memberships.
    queryset = Membership.objects.filter(organization__jurisdiction_id=metadata['jurisdiction_id'])

    # QuerySet methods return a new QuerySet instead of modifying in place, so
    # the result must be assigned for the role filter to take effect.
    if module_name.endswith('_candidates'):
        # Include only candidates.
        queryset = queryset.filter(role='candidate')
    else:
        # Exclude candidates and party memberships.
        queryset = queryset.exclude(role__in=('member', 'candidate'))

    for membership in queryset.prefetch_related('contact_details', 'person', 'person__links', 'person__sources', 'post'):
        person = membership.person

        # Not sure why this is necessary.
        if not isinstance(membership.extras, dict):
            membership.extras = json.loads(membership.extras)
        if not isinstance(person.extras, dict):
            person.extras = json.loads(person.extras)

        try:
            party_name = Membership.objects.select_related('organization').get(organization__classification='party', role='member', person=person).organization.name
        except Membership.DoesNotExist:
            party_name = None

        if person.gender == 'male':
            gender = 'M'
        elif person.gender == 'female':
            gender = 'F'
        else:
            gender = None

        # Candidates only. Popped before `get_extra(person)` is called below;
        # presumably so that it is not repeated in `extra` - TODO confirm.
        incumbent = person.extras.pop('incumbent', None)

        # @see https://represent.opennorth.ca/api/#fields
        representative = {
            'name': person.name,
            'elected_office': membership.role,
            'party_name': party_name,
            'email': next((contact_detail.value for contact_detail in membership.contact_details.all() if contact_detail.type == 'email'), None),
            'photo_url': person.image or None,
            'personal_url': get_personal_url(person),
            'gender': gender,
            'offices': json.dumps(get_offices(membership)),
            'extra': json.dumps(get_extra(person)),
        }

        sources = list(person.sources.all())

        # The first URL ought to be the most generic source.
        representative['source_url'] = sources[0].url

        if len(sources) > 1:
            # The last URL ought to be the most specific source.
            representative['url'] = sources[-1].url

        if incumbent:
            representative['incumbent'] = True

        match = re.search(r'^(\S+) (Ward \d+)$', membership.post.label)

        # If the person is part of Peel Regional Council.
        if match:
            parent = Division.objects.get(subtype1='csd', subtype2='', name=match.group(1))
            division = Division.objects.get(subtype1='csd', subid1=parent.subid1, name=match.group(2))
            boundary_set_slug = '{}-wards'.format(parent.name.lower())
            representative['district_name'] = membership.post.label
            representative['boundary_url'] = '/boundaries/{}/ward-{}/'.format(boundary_set_slug, division.subid2)
            representatives.append(representative)

        # If the person is associated to multiple boundaries.
        elif re.search(r'^Wards\b', membership.post.label):
            # Emit one copy of the representative per ward number in the label.
            for district_id in re.findall(r'\d+', membership.post.label):
                representative = representative.copy()
                representative['district_id'] = district_id
                representative['district_name'] = 'Ward {}'.format(district_id)
                representatives.append(representative)

        else:
            division_id = metadata['division_id']
            # Raw strings keep `\d` and `\Z` as regex escapes rather than
            # (invalid) string escapes.
            if re.search(r'^ocd-division/country:ca/csd:(\d{7})\Z', division_id):
                geographic_code = division_id[-7:]
            elif re.search(r'^ocd-division/country:ca/cd:(\d{4})\Z', division_id):
                geographic_code = division_id[-4:]
            else:
                geographic_code = None

            post_label = remove_suffix_re.sub('', membership.post.label)

            # If the post label is numeric.
            if re.search(r'^\d+\Z', post_label):
                representative['district_id'] = post_label
            # If the person has a boundary URL.
            elif 'boundary_url' in membership.extras:
                representative['district_name'] = post_label
                representative['boundary_url'] = membership.extras['boundary_url']
            # If the post label is a Census geographic name.
            elif post_label == metadata['division_name'] and geographic_code:
                representative['district_name'] = post_label
                if len(geographic_code) == 7:
                    representative['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(geographic_code)
                elif len(geographic_code) == 4:
                    representative['boundary_url'] = '/boundaries/census-divisions/{}/'.format(geographic_code)
            else:
                representative['district_name'] = post_label
                district_id = re.search(r'^(?:District|Division|Ward) (\d+)\Z', post_label)
                if district_id:
                    representative['district_id'] = district_id.group(1)

            representatives.append(representative)

    return HttpResponse(json.dumps(representatives), content_type='application/json')
def handle(self, *args, **options):
    """Upload one CSV per successful report, plus a combined CSV, to S3.

    Reports with an empty exception whose module name does not end in
    ``_candidates`` or ``_municipalities`` are exported to
    ``csv/<slug>.csv``. All rows, prefixed with the organization name, are
    also written to ``csv/complete.csv``. Files are encoded as windows-1252.
    Reports whose processing raises ``ImportError`` are deleted.
    """
    def save(key, body):
        # Store `body` under `key` in the bucket and make it public.
        s3_key = Key(bucket)
        s3_key.key = key
        s3_key.set_contents_from_string(body)
        s3_key.set_acl('public-read')

    sys.path.append(os.path.abspath('scrapers'))

    bucket = S3Connection().get_bucket('represent.opennorth.ca')

    # Organization names whose file slug is not simply the slugified name.
    names = {
        'Parliament of Canada': 'house-of-commons',
        'Legislative Assembly of Alberta': 'alberta-legislature',
        'Legislative Assembly of British Columbia': 'bc-legislature',
        'Legislative Assembly of Manitoba': 'manitoba-legislature',
        'Legislative Assembly of New Brunswick': 'new-brunswick-legislature',
        'Newfoundland and Labrador House of Assembly': 'newfoundland-labrador-legislature',
        'Nova Scotia House of Assembly': 'nova-scotia-legislature',
        'Legislative Assembly of Ontario': 'ontario-legislature',
        'Legislative Assembly of Prince Edward Island': 'pei-legislature',
        'Assemblée nationale du Québec': 'quebec-assemblee-nationale',
        'Legislative Assembly of Saskatchewan': 'saskatchewan-legislature',
    }

    default_headers = [
        'District name',
        'Primary role',
        'Name',  # not in CSV schema
        'First name',
        'Last name',
        'Gender',
        'Party name',
        'Email',
        'Photo URL',
        'Source URL',
        'Website',
        'Facebook',
        'Instagram',
        'Twitter',
        'LinkedIn',
        'YouTube',
    ]
    office_headers = [
        'Office type',  # not in CSV schema
        'Address',  # not in CSV schema
        'Phone',
        'Fax',
    ]

    # Last two labels of a link's host -> the social column it fills.
    social_domains = {
        'facebook.com': 'facebook',
        'fb.com': 'facebook',
        'instagram.com': 'instagram',
        'linkedin.com': 'linkedin',
        'twitter.com': 'twitter',
        'youtube.com': 'youtube',
    }
    gender_codes = {'male': 'M', 'female': 'F'}
    office_keys = ('type', 'postal', 'tel', 'fax')

    all_rows = []
    max_offices_count = 0

    reports = Report.objects.filter(exception='')
    reports = reports.exclude(module__endswith='_candidates')
    reports = reports.exclude(module__endswith='_municipalities')
    reports = reports.order_by('module')

    for report in reports:
        try:
            metadata = module_name_to_metadata(report.module)

            rows = []
            offices_count = 0

            # Exclude party memberships.
            queryset = Membership.objects.filter(
                organization__jurisdiction_id=metadata['jurisdiction_id']
            ).exclude(role__in=('member', 'candidate'))

            for membership in queryset.prefetch_related(
                    'contact_details', 'person', 'person__links', 'person__sources'):
                person = membership.person

                try:
                    party_membership = Membership.objects.get(
                        organization__classification='party',
                        role='member',
                        person=person)
                    party_name = party_membership.organization.name
                except Membership.DoesNotExist:
                    party_name = None

                # A later link for the same network overrides an earlier one.
                social = dict.fromkeys(
                    ('facebook', 'instagram', 'linkedin', 'twitter', 'youtube'))
                for link in person.links.all():
                    host_labels = urlsplit(link.url).netloc.split('.')
                    network = social_domains.get('.'.join(host_labels[-2:]))
                    if network is not None:
                        social[network] = link.url

                gender = gender_codes.get(person.gender)

                # Everything before the last space is the first name.
                if ' ' in person.name:
                    first_name, last_name = person.name.rsplit(' ', 1)
                else:
                    first_name, last_name = None, person.name

                # @see https://represent.opennorth.ca/api/#fields
                sources = list(person.sources.all())
                district_name = remove_suffix_re.sub('', membership.post.label)
                email = next(
                    (detail.value
                     for detail in membership.contact_details.all()
                     if detail.type == 'email'),
                    None)
                source_url = sources[-1].url if len(sources) > 1 else None

                row = [
                    district_name,  # District name
                    membership.role,  # Elected office
                    person.name,  # Name
                    first_name,  # First name
                    last_name,  # Last name
                    gender,  # Gender
                    party_name,  # Party name
                    email,  # Email
                    person.image,  # Photo URL
                    source_url,  # Source URL
                    get_personal_url(person),  # Website
                    social['facebook'],  # Facebook
                    social['instagram'],  # Instagram
                    social['twitter'],  # Twitter
                    social['linkedin'],  # LinkedIn
                    social['youtube'],  # YouTube
                ]

                offices = get_offices(membership)
                offices_count = max(offices_count, len(offices))
                for office in offices:
                    row.extend([office.get(field) for field in office_keys])

                # If the person is associated to multiple boundaries.
                if re.search(r'\AWards\b', membership.post.label):
                    for district_id in re.findall(r'\d+', membership.post.label):
                        ward_row = list(row)
                        ward_row[0] = 'Ward %s' % district_id
                        rows.append(ward_row)
                else:
                    rows.append(row)

            rows.sort()

            # One group of office columns per office of the busiest member.
            headers = default_headers + office_headers * offices_count

            name = metadata['name']
            slug = names[name] if name in names else slugify(name)

            buffer = StringIO()
            writer = csv.writer(buffer)
            writer.writerow(headers)
            writer.writerows(rows)
            save('csv/%s.csv' % slug, codecs.encode(buffer.getvalue(), 'windows-1252'))

            max_offices_count = max(max_offices_count, offices_count)

            for row in rows:
                row.insert(0, name)
                all_rows.append(row)
        except ImportError:
            report.delete()  # delete reports for old modules

    headers = ['Organization'] + default_headers + office_headers * max_offices_count

    buffer = StringIO()
    writer = csv.writer(buffer)
    writer.writerow(headers)
    writer.writerows(all_rows)
    save('csv/complete.csv', codecs.encode(buffer.getvalue(), 'windows-1252'))
def handle(self, *args, **options):
    """Run consistency checks on the scraped data and report anomalies.

    When ``options['module']`` is set, only jurisdictions for that module's
    division are checked; otherwise every jurisdiction whose classification
    is not ``'executive'`` is checked. Findings are logged with ``log`` or
    handed to ``self.report_value`` / ``self.report_count`` (defined
    elsewhere).
    """
    sys.path.append(os.path.abspath('scrapers'))

    # Organization names subtracted from the "no posts" / "no memberships"
    # findings below.
    empty_organizations = {'Parliament of Canada', 'Senate'}

    if options['module']:
        division_id = module_name_to_metadata(
            options['module'])['division_id']
        jurisdictions = Jurisdiction.objects.filter(
            division_id=division_id)
    else:
        # Exclude candidate scrapers.
        jurisdictions = Jurisdiction.objects.exclude(
            classification='executive')

    # Every other queryset is derived from the selected jurisdictions.
    organizations = Organization.objects.filter(
        jurisdiction__in=jurisdictions)
    posts = Post.objects.filter(organization__in=organizations)
    people = Person.objects.filter(
        memberships__organization__in=organizations)
    memberships = Membership.objects.filter(person_id__in=people)
    contact_details = MembershipContactDetail.objects.filter(
        membership__in=memberships)

    # A person has multiple memberships.
    # Maps a jurisdiction ID to the threshold used for its repetition checks.
    jurisdiction_with_repetition = {
        'ocd-jurisdiction/country:ca/cd:3521/legislature': 4,  # Peel, due to Brampton
        'ocd-jurisdiction/country:ca/csd:3521010/legislature': 4,  # Brampton
    }

    post_memberships_count = posts.values('id').annotate(
        count=Count('memberships'))

    # Validate the number of organizations per jurisdiction.
    results = jurisdictions.values('id').annotate(
        count=Count('organizations')).exclude(count=1)
    # The Parliament of Canada has three organizations.
    if len(results) > 1 or results and results[0] != {
            'count': 3,
            'id': 'ocd-jurisdiction/country:ca/legislature'
    }:
        log.error('{} jurisdictions do not have one organization'.format(
            len(results)))
        for result in results:
            log.info('{} {}'.format(result['count'], result['id']))

    # Validate the presence of posts and memberships on organizations.
    results = set(
        organizations.values('id').exclude(
            classification__in=('committee', 'party')).annotate(
                count=Count('posts')).filter(count=0).values_list(
                    'name', flat=True)) - empty_organizations
    self.report_value(
        'non-committee, non-party organizations have no posts', results)
    results = set(
        organizations.values('id').exclude(
            classification='committee').annotate(
                count=Count('memberships')).filter(count=0).values_list(
                    'name', flat=True)) - empty_organizations
    self.report_value('non-committee organizations have no memberships',
                      results)

    # Validate the number of memberships per post.
    results = Counter(
        post_memberships_count.filter(count=0).values_list(
            'organization__name', flat=True))
    self.report_count(
        'organizations have posts with no memberships (seats may be vacant)',
        results)
    results = Counter(
        post_memberships_count.filter(count__gt=1).values_list(
            'organization__name', flat=True))
    self.report_count('organizations have posts with many memberships',
                      results)

    # Validate the presence of posts on memberships.
    results = Counter(
        memberships.filter(post_id=None).exclude(
            organization__classification='party').values_list(
                'organization__name', flat=True))
    self.report_count(
        'non-party organizations have memberships with no posts', results)

    # Validate that people have at most one post-membership.
    results = people.values('id').exclude(
        memberships__organization__classification='party').exclude(
            memberships__organization__jurisdiction_id__in=
            jurisdiction_with_repetition.keys()).annotate(
                count=Count('memberships')).exclude(count=1).values_list(
                    'name', flat=True)
    self.report_value('people have many non-party memberships', results)
    # Jurisdictions with expected repetition are checked against their own
    # threshold instead of exactly one membership.
    for jurisdiction_id, threshold in jurisdiction_with_repetition.items():
        results = people.values('id').exclude(
            memberships__organization__classification='party').filter(
                memberships__organization__jurisdiction_id=jurisdiction_id
            ).annotate(count=Count('memberships')).exclude(
                count__lte=threshold).values_list('name', flat=True)
        self.report_value(
            'people have many non-party memberships in {}'.format(
                jurisdiction_id), results)

    # Validate that people have at most one party-membership.
    results = people.values('id').filter(
        memberships__organization__classification='party').annotate(
            count=Count('memberships')).exclude(count=1).values_list(
                'name', flat=True)
    self.report_value('people have many party memberships', results)

    # Validate the uniqueness of names and images.
    people_without_repetition = people.exclude(
        memberships__organization__jurisdiction_id__in=
        jurisdiction_with_repetition.keys())
    results = self.repeated(
        people_without_repetition.values_list('name', flat=True))
    self.report_count('names are repeated across people', results)
    results = self.repeated(
        people_without_repetition.exclude(image='').values_list('image',
                                                                flat=True))
    self.report_count('images are repeated across people', results)
    for jurisdiction_id, threshold in jurisdiction_with_repetition.items():
        people_with_repetition = people.filter(
            memberships__organization__jurisdiction_id=jurisdiction_id)
        results = self.repeated(people_with_repetition.values_list(
            'name', flat=True), threshold=threshold)
        self.report_count(
            'names are repeated across people in {}'.format(
                jurisdiction_id), results)
        results = self.repeated(people_with_repetition.exclude(
            image='').values_list('image', flat=True), threshold=threshold)
        self.report_count(
            'images are repeated across people in {}'.format(
                jurisdiction_id), results)

    # Validate the uniqueness of link URLs.
    results = self.repeated(
        people.exclude(links__url=None).values_list('links__url',
                                                    flat=True))
    self.report_count('link URLs are repeated across people', results)

    # Validate the uniqueness of email contact detail values.
    results = self.repeated(
        contact_details.filter(type='email').exclude(
            membership__organization__jurisdiction_id__in=
            jurisdiction_with_repetition.keys()).values_list('value',
                                                             flat=True))
    self.report_count(
        'emails are repeated across membership contact details', results)
    for jurisdiction_id, threshold in jurisdiction_with_repetition.items():
        results = self.repeated(
            contact_details.filter(type='email').filter(
                membership__organization__jurisdiction_id=jurisdiction_id).
            values_list('value', flat=True),
            threshold=threshold)
        self.report_count(
            'emails are repeated across membership contact details in {}'.
            format(jurisdiction_id), results)

    # Validate presence of email contact detail.
    # Jurisdictions in this list are skipped entirely by the email check.
    jurisdiction_with_no_email = [
        # Javascript-encoded email
        'ocd-jurisdiction/country:ca/csd:1217030/legislature',  # Cape Breton
        # Webform email
        'ocd-jurisdiction/country:ca/csd:2423027/legislature',  # Québec
        'ocd-jurisdiction/country:ca/csd:2464008/legislature',  # Terrebonne
        'ocd-jurisdiction/country:ca/csd:3524009/legislature',  # Milton
        'ocd-jurisdiction/country:ca/csd:3530016/legislature',  # Waterloo
        'ocd-jurisdiction/country:ca/csd:3530027/legislature',  # Wellesley
        'ocd-jurisdiction/country:ca/csd:3530035/legislature',  # Woolwich
        'ocd-jurisdiction/country:ca/csd:4706027/legislature',  # Regina
        'ocd-jurisdiction/country:ca/csd:4711066/legislature',  # Saskatoon
        'ocd-jurisdiction/country:ca/csd:4806016/legislature',  # Calgary
        'ocd-jurisdiction/country:ca/csd:5909052/legislature',  # Abbotsford
    ]
    # Jurisdictions in this set are allowed exactly one membership without
    # an email (see the condition at the end of this method).
    leaders_with_no_email = {
        'ocd-jurisdiction/country:ca/cd:3521/legislature',  # Peel
        'ocd-jurisdiction/country:ca/csd:2437067/legislature',  # Trois-Rivières
        'ocd-jurisdiction/country:ca/csd:2456083/legislature',  # Saint-Jean-sur-Richelieu
        'ocd-jurisdiction/country:ca/csd:2494068/legislature',  # Saguenay
        'ocd-jurisdiction/country:ca/csd:3520005/legislature',  # Toronto
        'ocd-jurisdiction/country:ca/csd:3521024/legislature',  # Caledon
        'ocd-jurisdiction/country:ca/csd:3530013/legislature',  # Kitchener
        'ocd-jurisdiction/country:ca/csd:4811061/legislature',  # Edmonton
        'ocd-jurisdiction/country:ca/csd:4816037/legislature',  # Wood Buffalo
        'ocd-jurisdiction/country:ca/csd:5909052/legislature',  # Abbotsford
        'ocd-jurisdiction/country:ca/csd:5915004/legislature',  # Surrey
    }
    jurisdiction_ids = jurisdictions.exclude(
        id__in=jurisdiction_with_no_email).values_list('id', flat=True)
    for jurisdiction_id in jurisdiction_ids:
        for organization in organizations.filter(
                jurisdiction_id=jurisdiction_id):
            # It's ridiculous that Django can't do a LEFT OUTER JOIN with a WHERE clause.
            # Counts the memberships with zero email contact details (each
            # `not 0` is True, which sums as 1); one query per membership.
            memberships_with_no_email = sum(
                not membership.contact_details.filter(
                    type='email').count()
                for membership in organization.memberships.all())
            # `and` binds tighter than `or`: log when more than one
            # membership lacks an email, or when any does and the
            # jurisdiction is not in leaders_with_no_email.
            if memberships_with_no_email > 1 or memberships_with_no_email and jurisdiction_id not in leaders_with_no_email:
                log.error('{:2} memberships have no email in {}'.format(
                    memberships_with_no_email, organization.name))
def handle(self, *args, **options):
    """Run consistency checks on the scraped data and report anomalies.

    When a positional argument is given, the first one is a module name and
    only data for that module's division is checked; otherwise the default
    managers (all rows) are used. Findings are logged with ``log`` or handed
    to ``self.report_value`` / ``self.report_count`` (defined elsewhere).
    """
    sys.path.append(os.path.abspath('scrapers'))

    # Organization names subtracted from the "no posts" / "no memberships"
    # findings below.
    empty_organizations = {'Parliament of Canada', 'Senate'}

    if args:
        division_id = module_name_to_metadata(args[0])['division_id']
        jurisdictions = Jurisdiction.objects.filter(division_id=division_id)
        organizations = Organization.objects.filter(jurisdiction__in=jurisdictions)
        posts = Post.objects.filter(organization__in=organizations)
        people = Person.objects.filter(memberships__organization__in=organizations)
        memberships = Membership.objects.filter(person__id__in=people)
        contact_details = MembershipContactDetail.objects.filter(membership__in=memberships)
    else:
        # Unfiltered: the model managers stand in for the querysets.
        jurisdictions = Jurisdiction.objects
        organizations = Organization.objects
        posts = Post.objects
        people = Person.objects
        memberships = Membership.objects
        contact_details = MembershipContactDetail.objects

    post_memberships_count = posts.values('id').annotate(count=Count('memberships'))

    # Validate the number of organizations per jurisdiction.
    results = jurisdictions.values('id').annotate(count=Count('organizations')).exclude(count=1)
    # The Parliament of Canada has three organizations.
    if len(results) > 1 or results and results[0] != {'count': 3, 'id': 'ocd-jurisdiction/country:ca/legislature'}:
        log.error('%d jurisdictions do not have one organization' % len(results))
        for result in results:
            log.info('%d %s' % (result['count'], result['id']))

    # Validate the presence of posts and memberships on organizations.
    results = set(organizations.values('id').exclude(classification='party').annotate(count=Count('posts')).filter(count=0).values_list('name', flat=True)) - empty_organizations
    self.report_value('non-party organizations have no posts', results)
    results = set(organizations.values('id').annotate(count=Count('memberships')).filter(count=0).values_list('name', flat=True)) - empty_organizations
    self.report_value('organizations have no memberships', results)

    # Validate the number of memberships per post.
    results = Counter(post_memberships_count.filter(count=0).values_list('organization__name', flat=True))
    self.report_count('organizations have posts with no memberships (seats may be vacant)', results)
    results = Counter(post_memberships_count.filter(count__gt=1).values_list('organization__name', flat=True))
    self.report_count('organizations have posts with many memberships', results)

    # Validate the presence of posts on memberships.
    results = Counter(memberships.filter(post_id=None).exclude(organization__classification='party').values_list('organization__name', flat=True))
    self.report_count('non-party organizations have memberships with no posts', results)

    # Validate that people have at most one post-membership and one party-membership.
    results = people.values('id').exclude(memberships__organization__classification='party').annotate(count=Count('memberships')).exclude(count=1).values_list('name', flat=True)
    self.report_value('people have many non-party memberships', results)
    results = people.values('id').filter(memberships__organization__classification='party').annotate(count=Count('memberships')).exclude(count=1).values_list('name', flat=True)
    self.report_value('people have many party memberships', results)

    # Validate the uniqueness of names, images and link URLs.
    results = self.repeated(people.exclude(image='').values_list('image', flat=True))
    self.report_count('people have the same image', results)
    results = self.repeated(people.values_list('name', flat=True))
    self.report_count('people have the same name', results)
    results = self.repeated(people.exclude(links__url=None).values_list('links__url', flat=True))
    self.report_count('people have the same link URL', results)

    # Validate the uniqueness of email contact detail values.
    results = self.repeated(contact_details.filter(type='email').values_list('value', flat=True))
    self.report_count('membership contact details with the same email', results)

    # Validate presence of email contact detail.
    # Jurisdictions in this list are skipped entirely by the email check.
    jurisdiction_with_no_email = [
        # Javascript-encoded email
        'ocd-jurisdiction/country:ca/csd:1217030/legislature',  # Cape Breton
        # Webform email
        'ocd-jurisdiction/country:ca/csd:1310032/legislature',  # Fredericton
        'ocd-jurisdiction/country:ca/csd:2423027/legislature',  # Québec
        'ocd-jurisdiction/country:ca/csd:2464008/legislature',  # Terrebonne
        'ocd-jurisdiction/country:ca/csd:2466097/legislature',  # Pointe-Claire
        'ocd-jurisdiction/country:ca/csd:3530016/legislature',  # Waterloo
        'ocd-jurisdiction/country:ca/csd:3530035/legislature',  # Woolwich
        'ocd-jurisdiction/country:ca/csd:4706027/legislature',  # Regina
        'ocd-jurisdiction/country:ca/csd:4806016/legislature',  # Calgary
    ]
    # Jurisdictions in this list are allowed exactly one membership without
    # an email (see the condition at the end of this method).
    leaders_with_no_email = [
        'ocd-jurisdiction/country:ca/cd:3521/legislature',  # Peel
        'ocd-jurisdiction/country:ca/csd:2437067/legislature',  # Trois-Rivières
        'ocd-jurisdiction/country:ca/csd:2456083/legislature',  # Saint-Jean-sur-Richelieu
        'ocd-jurisdiction/country:ca/csd:2494068/legislature',  # Saguenay
        'ocd-jurisdiction/country:ca/csd:3520005/legislature',  # Toronto
        'ocd-jurisdiction/country:ca/csd:3521024/legislature',  # Caledon
        'ocd-jurisdiction/country:ca/csd:3530013/legislature',  # Kitchener
        'ocd-jurisdiction/country:ca/csd:4711066/legislature',  # Saskatoon
        'ocd-jurisdiction/country:ca/csd:4811061/legislature',  # Edmonton
        'ocd-jurisdiction/country:ca/csd:4816037/legislature',  # Wood Buffalo
        'ocd-jurisdiction/country:ca/csd:5909052/legislature',  # Abbotsford
        'ocd-jurisdiction/country:ca/csd:5915004/legislature',  # Surrey
    ]
    jurisdiction_ids = jurisdictions.exclude(id__in=jurisdiction_with_no_email).values_list('id', flat=True)
    for jurisdiction_id in jurisdiction_ids:
        for organization in organizations.filter(jurisdiction_id=jurisdiction_id):
            # It's ridiculous that Django can't do a LEFT OUTER JOIN with a WHERE clause.
            # Counts the memberships with zero email contact details (each
            # `not 0` is True, which sums as 1); one query per membership.
            memberships_with_no_email = sum(not membership.contact_details.filter(type='email').count() for membership in organization.memberships.all())
            # `and` binds tighter than `or`: log when more than one
            # membership lacks an email, or when any does and the
            # jurisdiction is not in leaders_with_no_email.
            if memberships_with_no_email > 1 or memberships_with_no_email and jurisdiction_id not in leaders_with_no_email:
                log.error('%2d memberships have no email in %s' % (memberships_with_no_email, organization.name))
def process(report, *, candidates=False):
    """Write one report's memberships to a CSV and return its rows.

    :param report: the report whose ``module`` identifies the scraper
    :param candidates: if true, export memberships with the ``candidate``
        role to ``csv/candidates/``; otherwise exclude the ``member`` and
        ``candidate`` roles and export to ``csv/representatives/``
    :returns: ``[rows, offices_count]``, where each row has the organization
        name prepended and ``offices_count`` is the largest number of offices
        on any membership. If ``ImportError`` is raised, the report is
        deleted and whatever was accumulated so far is returned.
    """
    # Last two labels of a link's host -> the social column it fills.
    social_domains = {
        'facebook.com': 'facebook',
        'fb.com': 'facebook',
        'instagram.com': 'instagram',
        'linkedin.com': 'linkedin',
        'twitter.com': 'twitter',
        'youtube.com': 'youtube',
    }
    gender_codes = {'male': 'M', 'female': 'F'}
    office_keys = ('type', 'postal', 'tel', 'fax')

    rows = []
    offices_count = 0

    try:
        metadata = module_name_to_metadata(report.module)

        # Exclude party memberships.
        queryset = Membership.objects.filter(
            organization__jurisdiction_id=metadata['jurisdiction_id'])
        if candidates:
            queryset = queryset.filter(role='candidate')
        else:
            queryset = queryset.exclude(role__in=('member', 'candidate'))

        for membership in queryset.prefetch_related(
                'contact_details', 'person', 'person__links', 'person__sources'):
            person = membership.person

            try:
                party_membership = Membership.objects.get(
                    organization__classification='party',
                    role='member',
                    person=person)
                party_name = party_membership.organization.name
            except Membership.DoesNotExist:
                party_name = None

            # A later link for the same network overrides an earlier one.
            social = dict.fromkeys(
                ('facebook', 'instagram', 'linkedin', 'twitter', 'youtube'))
            for link in person.links.all():
                host_labels = urlsplit(link.url).netloc.split('.')
                network = social_domains.get('.'.join(host_labels[-2:]))
                if network is not None:
                    social[network] = link.url

            gender = gender_codes.get(person.gender)

            # Everything before the last space is the first name.
            if ' ' in person.name:
                first_name, last_name = person.name.rsplit(' ', 1)
            else:
                first_name, last_name = None, person.name

            # @see https://represent.opennorth.ca/api/#fields
            sources = list(person.sources.all())
            district_name = remove_suffix_re.sub('', membership.post.label)
            email = next(
                (detail.value
                 for detail in membership.contact_details.all()
                 if detail.type == 'email'),
                None)
            source_url = sources[-1].url if len(sources) > 1 else None

            row = [
                district_name,  # District name
                membership.role,  # Elected office
                person.name,  # Name
                first_name,  # First name
                last_name,  # Last name
                gender,  # Gender
                party_name,  # Party name
                email,  # Email
                person.image,  # Photo URL
                source_url,  # Source URL
                get_personal_url(person),  # Website
                social['facebook'],  # Facebook
                social['instagram'],  # Instagram
                social['twitter'],  # Twitter
                social['linkedin'],  # LinkedIn
                social['youtube'],  # YouTube
            ]

            offices = get_offices(membership)
            offices_count = max(offices_count, len(offices))
            for office in offices:
                row.extend([office.get(field) for field in office_keys])

            # If the person is associated to multiple boundaries.
            if re.search(r'\AWards\b', membership.post.label):
                for district_id in re.findall(r'\d+', membership.post.label):
                    ward_row = list(row)
                    ward_row[0] = 'Ward {}'.format(district_id)
                    rows.append(ward_row)
            else:
                rows.append(row)

        rows.sort()

        # One group of office columns per office of the busiest member.
        headers = self.default_headers[:]
        for _ in range(offices_count):
            headers += self.office_headers

        name = metadata['name']
        slug = self.names[name] if name in self.names else slugify(name)

        buffer = StringIO()
        writer = csv.writer(buffer)
        writer.writerow(headers)
        writer.writerows(rows)

        directory = 'candidates' if candidates else 'representatives'
        save('csv/{}/{}.csv'.format(directory, slug), buffer)

        # Prefix the organization name after the per-scraper file is written.
        for row in rows:
            row.insert(0, name)
    except ImportError:
        report.delete()  # delete reports for old modules

    return [rows, offices_count]
def represent(request, module_name):
    """
    Serve the memberships of a scraper module's jurisdiction as a JSON array of
    representatives.

    :param request: the HTTP request (not read by this view)
    :param module_name: name of the scraper module; if it ends in
        ``_candidates``, only memberships with the role 'candidate' are
        returned, otherwise memberships with the roles 'member' and
        'candidate' are left out
    :returns: an ``HttpResponse`` with the content type ``application/json``
    """
    # This view runs on every request: only add the directory once, so that
    # sys.path doesn't grow with each call.
    scrapers_path = os.path.abspath('scrapers')
    if scrapers_path not in sys.path:
        sys.path.append(scrapers_path)

    metadata = module_name_to_metadata(module_name)

    representatives = []

    queryset = Membership.objects.filter(organization__jurisdiction_id=metadata['jurisdiction_id'])
    # filter() and exclude() return a new QuerySet and leave the original
    # untouched, so the result must be assigned.
    if module_name.endswith('_candidates'):
        queryset = queryset.filter(role='candidate')
    else:
        # Exclude party memberships.
        queryset = queryset.exclude(role__in=('member', 'candidate'))

    for membership in queryset.prefetch_related('contact_details', 'person', 'person__links', 'person__sources'):
        person = membership.person

        try:
            party_name = Membership.objects.get(organization__classification='party', role='member', person=person).organization.name
        except Membership.DoesNotExist:
            party_name = None

        if person.gender == 'male':
            gender = 'M'
        elif person.gender == 'female':
            gender = 'F'
        else:
            gender = None

        # Removed from the extras before get_extra() is called below; it is
        # reported as its own field instead.
        incumbent = person.extras.pop('incumbent', None)

        # @see https://represent.opennorth.ca/api/#fields
        representative = {
            'name': person.name,
            'elected_office': membership.role,
            'party_name': party_name,
            'email': next((contact_detail.value for contact_detail in membership.contact_details.all() if contact_detail.type == 'email'), None),
            'photo_url': person.image or None,
            'personal_url': get_personal_url(person),
            'gender': gender,
            'offices': json.dumps(get_offices(membership)),
            'extra': json.dumps(get_extra(person)),
        }

        # @see https://github.com/opennorth/represent-canada/issues/81
        sources = list(person.sources.all())
        if len(sources[0].url) <= 200:
            representative['source_url'] = sources[0].url

        if len(sources) > 1:
            representative['url'] = sources[-1].url

        if incumbent:
            representative['incumbent'] = True

        match = re.search(r'^(\S+) (Ward \d+)$', membership.post.label)

        # If the person is part of Peel Regional Council.
        if match:
            parent = Division.objects.get(subtype1='csd', subtype2='', name=match.group(1))
            division = Division.objects.get(subtype1='csd', subid1=parent.subid1, name=match.group(2))
            boundary_set_slug = next((k for k, v in settings.IMAGO_BOUNDARY_MAPPINGS.items() if v['prefix'].startswith(parent.id)), None)
            representative['district_name'] = membership.post.label
            representative['boundary_url'] = '/boundaries/{}/ward-{}/'.format(boundary_set_slug, division.subid2)
            representatives.append(representative)

        # If the person is associated to multiple boundaries.
        elif re.search(r'^Wards\b', membership.post.label):
            # One copy of the representative per ward number in the label.
            for district_id in re.findall(r'\d+', membership.post.label):
                representative = representative.copy()
                representative['district_id'] = district_id
                representative['district_name'] = 'Ward %s' % district_id
                representatives.append(representative)

        else:
            division_id = metadata['division_id']
            # Raw strings, so that \d and \Z reach the regex engine unchanged.
            if re.search(r'^ocd-division/country:ca/csd:(\d{7})\Z', division_id):
                geographic_code = division_id[-7:]
            elif re.search(r'^ocd-division/country:ca/cd:(\d{4})\Z', division_id):
                geographic_code = division_id[-4:]
            else:
                geographic_code = None

            post_label = remove_suffix_re.sub('', membership.post.label)

            # If the post label is numeric.
            if re.search(r'^\d+\Z', post_label):
                representative['district_id'] = post_label

            # If the person has a boundary URL.
            elif membership.extras.get('boundary_url'):
                representative['district_name'] = post_label
                representative['boundary_url'] = membership.extras['boundary_url']

            # If the post label is a census subdivision.
            elif post_label == metadata['division_name'] and geographic_code:
                representative['district_name'] = post_label
                if len(geographic_code) == 7:
                    representative['boundary_url'] = '/boundaries/census-subdivisions/%s/' % geographic_code
                elif len(geographic_code) == 4:
                    representative['boundary_url'] = '/boundaries/census-divisions/%s/' % geographic_code

            else:
                representative['district_name'] = post_label
                district_id = re.search(r'^(?:District|Division|Ward) (\d+)\Z', post_label)
                if district_id:
                    representative['district_id'] = district_id.group(1)

            representatives.append(representative)

    return HttpResponse(json.dumps(representatives), content_type='application/json')
def represent(request, module_name):
    """
    Serve the memberships of a scraper module's jurisdiction as a JSON array of
    representatives.

    :param request: the HTTP request (not read by this view)
    :param module_name: name of the scraper module; if it ends in
        ``_candidates``, only memberships with the role 'candidate' are
        returned, otherwise memberships with the roles 'member' and
        'candidate' are left out
    :returns: an ``HttpResponse`` with the content type ``application/json``
    """
    # This view runs on every request: only add the directory once, so that
    # sys.path doesn't grow with each call.
    scrapers_path = os.path.abspath('scrapers')
    if scrapers_path not in sys.path:
        sys.path.append(scrapers_path)

    metadata = module_name_to_metadata(module_name)

    representatives = []

    queryset = Membership.objects.filter(
        organization__jurisdiction_id=metadata['jurisdiction_id'])
    # filter() and exclude() return a new QuerySet and leave the original
    # untouched, so the result must be assigned.
    if module_name.endswith('_candidates'):
        queryset = queryset.filter(role='candidate')
    else:
        # Exclude party memberships.
        queryset = queryset.exclude(role__in=('member', 'candidate'))

    for membership in queryset.prefetch_related('contact_details', 'person',
                                                'person__links',
                                                'person__sources'):
        person = membership.person

        try:
            party_name = Membership.objects.get(
                organization__classification='party',
                role='member',
                person=person).organization.name
        except Membership.DoesNotExist:
            party_name = None

        if person.gender == 'male':
            gender = 'M'
        elif person.gender == 'female':
            gender = 'F'
        else:
            gender = None

        # Removed from the extras before get_extra() is called below; it is
        # reported as its own field instead.
        incumbent = person.extras.pop('incumbent', None)

        # @see https://represent.opennorth.ca/api/#fields
        representative = {
            'name': person.name,
            'elected_office': membership.role,
            'party_name': party_name,
            'email': next((contact_detail.value
                           for contact_detail in membership.contact_details.all()
                           if contact_detail.type == 'email'), None),
            'photo_url': person.image or None,
            'personal_url': get_personal_url(person),
            'gender': gender,
            'offices': json.dumps(get_offices(membership)),
            'extra': json.dumps(get_extra(person)),
        }

        # @see https://github.com/opennorth/represent-canada/issues/81
        sources = list(person.sources.all())
        if len(sources[0].url) <= 200:
            representative['source_url'] = sources[0].url

        if len(sources) > 1:
            representative['url'] = sources[-1].url

        if incumbent:
            representative['incumbent'] = True

        match = re.search(r'^(\S+) (Ward \d+)$', membership.post.label)

        # If the person is part of Peel Regional Council.
        if match:
            parent = Division.objects.get(subtype1='csd',
                                          subtype2='',
                                          name=match.group(1))
            division = Division.objects.get(subtype1='csd',
                                            subid1=parent.subid1,
                                            name=match.group(2))
            boundary_set_slug = next(
                (k for k, v in settings.IMAGO_BOUNDARY_MAPPINGS.items()
                 if v['prefix'].startswith(parent.id)), None)
            representative['district_name'] = membership.post.label
            representative['boundary_url'] = '/boundaries/{}/ward-{}/'.format(
                boundary_set_slug, division.subid2)
            representatives.append(representative)

        # If the person is associated to multiple boundaries.
        elif re.search(r'^Wards\b', membership.post.label):
            # One copy of the representative per ward number in the label.
            for district_id in re.findall(r'\d+', membership.post.label):
                representative = representative.copy()
                representative['district_id'] = district_id
                representative['district_name'] = 'Ward %s' % district_id
                representatives.append(representative)

        else:
            division_id = metadata['division_id']
            # Raw strings, so that \d and \Z reach the regex engine unchanged.
            if re.search(r'^ocd-division/country:ca/csd:(\d{7})\Z', division_id):
                geographic_code = division_id[-7:]
            elif re.search(r'^ocd-division/country:ca/cd:(\d{4})\Z', division_id):
                geographic_code = division_id[-4:]
            else:
                geographic_code = None

            post_label = remove_suffix_re.sub('', membership.post.label)

            # If the post label is numeric.
            if re.search(r'^\d+\Z', post_label):
                representative['district_id'] = post_label

            # If the person has a boundary URL.
            elif membership.extras.get('boundary_url'):
                representative['district_name'] = post_label
                representative['boundary_url'] = membership.extras['boundary_url']

            # If the post label is a census subdivision.
            elif post_label == metadata['division_name'] and geographic_code:
                representative['district_name'] = post_label
                if len(geographic_code) == 7:
                    representative['boundary_url'] = (
                        '/boundaries/census-subdivisions/%s/' % geographic_code)
                elif len(geographic_code) == 4:
                    representative['boundary_url'] = (
                        '/boundaries/census-divisions/%s/' % geographic_code)

            else:
                representative['district_name'] = post_label
                district_id = re.search(r'^(?:District|Division|Ward) (\d+)\Z',
                                        post_label)
                if district_id:
                    representative['district_id'] = district_id.group(1)

            representatives.append(representative)

    return HttpResponse(json.dumps(representatives),
                        content_type='application/json')
def handle(self, *args, **options):
    """
    Run the validation checks over the imported data and report what fails.

    If ``options['module']`` is set, only jurisdictions of that module's
    division are checked; otherwise every jurisdiction except those classified
    as 'executive'. Findings go to ``log`` and to the ``report_value`` /
    ``report_count`` methods of ``self``; nothing is returned.
    """
    sys.path.append(os.path.abspath('scrapers'))

    empty_organizations = {'Parliament of Canada', 'Senate'}

    module_name = options['module']
    if module_name:
        metadata = module_name_to_metadata(module_name)
        jurisdictions = Jurisdiction.objects.filter(division_id=metadata['division_id'])
    else:
        # Exclude candidate scrapers.
        jurisdictions = Jurisdiction.objects.exclude(classification='executive')

    organizations = Organization.objects.filter(jurisdiction__in=jurisdictions)
    posts = Post.objects.filter(organization__in=organizations)
    people = Person.objects.filter(memberships__organization__in=organizations)
    memberships = Membership.objects.filter(person_id__in=people)
    contact_details = MembershipContactDetail.objects.filter(membership__in=memberships)

    # A person has multiple memberships. The value is the number of
    # repetitions tolerated in that jurisdiction.
    jurisdiction_with_repetition = {
        'ocd-jurisdiction/country:ca/cd:3521/legislature': 4,  # Peel, due to Brampton
        'ocd-jurisdiction/country:ca/csd:3521010/legislature': 4,  # Brampton
    }
    repeating_jurisdiction_ids = jurisdiction_with_repetition.keys()

    def organization_names(queryset):
        # Tally the rows of the queryset by the name of their organization.
        return Counter(queryset.values_list('organization__name', flat=True))

    post_memberships_count = posts.values('id').annotate(count=Count('memberships'))

    # Validate the number of organizations per jurisdiction.
    results = jurisdictions.values('id').annotate(count=Count('organizations')).exclude(count=1)
    # The Parliament of Canada has three organizations.
    parliament = {'count': 3, 'id': 'ocd-jurisdiction/country:ca/legislature'}
    if len(results) > 1 or (results and results[0] != parliament):
        log.error('{} jurisdictions do not have one organization'.format(len(results)))
        for result in results:
            log.info('{} {}'.format(result['count'], result['id']))

    # Validate the presence of posts and memberships on organizations.
    names = (
        organizations.values('id')
        .exclude(classification__in=('committee', 'party'))
        .annotate(count=Count('posts'))
        .filter(count=0)
        .values_list('name', flat=True)
    )
    self.report_value('non-committee, non-party organizations have no posts', set(names).difference(empty_organizations))

    names = (
        organizations.values('id')
        .exclude(classification='committee')
        .annotate(count=Count('memberships'))
        .filter(count=0)
        .values_list('name', flat=True)
    )
    self.report_value('non-committee organizations have no memberships', set(names).difference(empty_organizations))

    # Validate the number of memberships per post.
    self.report_count(
        'organizations have posts with no memberships (seats may be vacant)',
        organization_names(post_memberships_count.filter(count=0)))
    self.report_count(
        'organizations have posts with many memberships',
        organization_names(post_memberships_count.filter(count__gt=1)))

    # Validate the presence of posts on memberships.
    self.report_count(
        'non-party organizations have memberships with no posts',
        organization_names(memberships.filter(post_id=None).exclude(organization__classification='party')))

    # Validate that people have at most one post-membership.
    non_party_people = people.values('id').exclude(memberships__organization__classification='party')
    names = (
        non_party_people
        .exclude(memberships__organization__jurisdiction_id__in=repeating_jurisdiction_ids)
        .annotate(count=Count('memberships'))
        .exclude(count=1)
        .values_list('name', flat=True)
    )
    self.report_value('people have many non-party memberships', names)

    for jurisdiction_id, threshold in jurisdiction_with_repetition.items():
        names = (
            non_party_people
            .filter(memberships__organization__jurisdiction_id=jurisdiction_id)
            .annotate(count=Count('memberships'))
            .exclude(count__lte=threshold)
            .values_list('name', flat=True)
        )
        self.report_value('people have many non-party memberships in {}'.format(jurisdiction_id), names)

    # Validate that people have at most one party-membership.
    names = (
        people.values('id')
        .filter(memberships__organization__classification='party')
        .annotate(count=Count('memberships'))
        .exclude(count=1)
        .values_list('name', flat=True)
    )
    self.report_value('people have many party memberships', names)

    # Validate the uniqueness of names and images.
    people_without_repetition = people.exclude(memberships__organization__jurisdiction_id__in=repeating_jurisdiction_ids)
    self.report_count(
        'names are repeated across people',
        self.repeated(people_without_repetition.values_list('name', flat=True)))
    self.report_count(
        'images are repeated across people',
        self.repeated(people_without_repetition.exclude(image='').values_list('image', flat=True)))

    for jurisdiction_id, threshold in jurisdiction_with_repetition.items():
        people_with_repetition = people.filter(memberships__organization__jurisdiction_id=jurisdiction_id)
        repeated_names = self.repeated(people_with_repetition.values_list('name', flat=True), threshold=threshold)
        self.report_count('names are repeated across people in {}'.format(jurisdiction_id), repeated_names)
        repeated_images = self.repeated(people_with_repetition.exclude(image='').values_list('image', flat=True), threshold=threshold)
        self.report_count('images are repeated across people in {}'.format(jurisdiction_id), repeated_images)

    # Validate the uniqueness of link URLs.
    self.report_count(
        'link URLs are repeated across people',
        self.repeated(people.exclude(links__url=None).values_list('links__url', flat=True)))

    # Validate the uniqueness of email contact detail values.
    emails = contact_details.filter(type='email')
    self.report_count(
        'emails are repeated across membership contact details',
        self.repeated(emails.exclude(membership__organization__jurisdiction_id__in=repeating_jurisdiction_ids).values_list('value', flat=True)))

    for jurisdiction_id, threshold in jurisdiction_with_repetition.items():
        repeated_emails = self.repeated(
            emails.filter(membership__organization__jurisdiction_id=jurisdiction_id).values_list('value', flat=True),
            threshold=threshold)
        self.report_count('emails are repeated across membership contact details in {}'.format(jurisdiction_id), repeated_emails)

    # Validate presence of email contact detail.
    jurisdiction_with_no_email = [
        # Javascript-encoded email
        'ocd-jurisdiction/country:ca/csd:1217030/legislature',  # Cape Breton
        # Webform email
        'ocd-jurisdiction/country:ca/csd:2423027/legislature',  # Québec
        'ocd-jurisdiction/country:ca/csd:2464008/legislature',  # Terrebonne
        'ocd-jurisdiction/country:ca/csd:3524009/legislature',  # Milton
        'ocd-jurisdiction/country:ca/csd:3530016/legislature',  # Waterloo
        'ocd-jurisdiction/country:ca/csd:3530027/legislature',  # Wellesley
        'ocd-jurisdiction/country:ca/csd:3530035/legislature',  # Woolwich
        'ocd-jurisdiction/country:ca/csd:4706027/legislature',  # Regina
        'ocd-jurisdiction/country:ca/csd:4711066/legislature',  # Saskatoon
        'ocd-jurisdiction/country:ca/csd:4806016/legislature',  # Calgary
        'ocd-jurisdiction/country:ca/csd:5909052/legislature',  # Abbotsford
    ]
    # In these jurisdictions, a single membership without an email is tolerated.
    leaders_with_no_email = {
        'ocd-jurisdiction/country:ca/cd:3521/legislature',  # Peel
        'ocd-jurisdiction/country:ca/csd:2437067/legislature',  # Trois-Rivières
        'ocd-jurisdiction/country:ca/csd:2456083/legislature',  # Saint-Jean-sur-Richelieu
        'ocd-jurisdiction/country:ca/csd:2494068/legislature',  # Saguenay
        'ocd-jurisdiction/country:ca/csd:3520005/legislature',  # Toronto
        'ocd-jurisdiction/country:ca/csd:3521024/legislature',  # Caledon
        'ocd-jurisdiction/country:ca/csd:3530013/legislature',  # Kitchener
        'ocd-jurisdiction/country:ca/csd:4811061/legislature',  # Edmonton
        'ocd-jurisdiction/country:ca/csd:4816037/legislature',  # Wood Buffalo
        'ocd-jurisdiction/country:ca/csd:5909052/legislature',  # Abbotsford
        'ocd-jurisdiction/country:ca/csd:5915004/legislature',  # Surrey
    }

    jurisdiction_ids = jurisdictions.exclude(id__in=jurisdiction_with_no_email).values_list('id', flat=True)
    for jurisdiction_id in jurisdiction_ids:
        for organization in organizations.filter(jurisdiction_id=jurisdiction_id):
            # It's ridiculous that Django can't do a LEFT OUTER JOIN with a WHERE clause.
            memberships_with_no_email = sum(
                1 for membership in organization.memberships.all()
                if not membership.contact_details.filter(type='email').count()
            )
            if memberships_with_no_email > 1 or (memberships_with_no_email == 1 and jurisdiction_id not in leaders_with_no_email):
                log.error('{:2} memberships have no email in {}'.format(memberships_with_no_email, organization.name))
def handle(self, *args, **options):
    """
    Write one CSV per report and one combined CSV to the
    'represent.opennorth.ca' S3 bucket.

    Reports with an exception, and reports whose module name ends in
    '_candidates' or '_municipalities', are skipped. A report is deleted if an
    ``ImportError`` is raised while it is processed. Each CSV is encoded as
    windows-1252 and uploaded with the 'public-read' ACL.
    """
    def upload(key, body):
        # Store the body under the key in the bucket and make it publicly readable.
        s3_key = Key(bucket)
        s3_key.key = key
        s3_key.set_contents_from_string(body)
        s3_key.set_acl('public-read')

    def publish_csv(key, headers, rows):
        # Serialize the headers and rows as CSV and upload the result.
        buffer = StringIO()
        writer = csv.writer(buffer)
        writer.writerow(headers)
        writer.writerows(rows)
        upload(key, codecs.encode(buffer.getvalue(), 'windows-1252'))

    sys.path.append(os.path.abspath('scrapers'))

    bucket = S3Connection().get_bucket('represent.opennorth.ca')

    # Organization names whose CSV file name isn't the slugified name.
    names = {
        'Parliament of Canada': 'house-of-commons',
        'Legislative Assembly of Alberta': 'alberta-legislature',
        'Legislative Assembly of British Columbia': 'bc-legislature',
        'Legislative Assembly of Manitoba': 'manitoba-legislature',
        'Legislative Assembly of New Brunswick': 'new-brunswick-legislature',
        'Newfoundland and Labrador House of Assembly': 'newfoundland-labrador-legislature',
        'Nova Scotia House of Assembly': 'nova-scotia-legislature',
        'Legislative Assembly of Ontario': 'ontario-legislature',
        'Legislative Assembly of Prince Edward Island': 'pei-legislature',
        'Assemblée nationale du Québec': 'quebec-assemblee-nationale',
        'Legislative Assembly of Saskatchewan': 'saskatchewan-legislature',
    }

    default_headers = [
        'District name',
        'Primary role',
        'Name',  # not in CSV schema
        'First name',
        'Last name',
        'Gender',
        'Party name',
        'Email',
        'Photo URL',
        'Source URL',
        'Website',
        'Facebook',
        'Instagram',
        'Twitter',
        'LinkedIn',
        'YouTube',
    ]
    office_headers = [
        'Office type',  # not in CSV schema
        'Address',  # not in CSV schema
        'Phone',
        'Fax',
    ]

    # Maps the last two labels of a link's host name to the CSV column it fills.
    social_columns = {
        'facebook.com': 'facebook',
        'fb.com': 'facebook',
        'instagram.com': 'instagram',
        'linkedin.com': 'linkedin',
        'twitter.com': 'twitter',
        'youtube.com': 'youtube',
    }
    gender_codes = {'male': 'M', 'female': 'F'}

    all_rows = []
    max_offices_count = 0

    reports = Report.objects.filter(exception='')
    reports = reports.exclude(module__endswith='_candidates')
    reports = reports.exclude(module__endswith='_municipalities')
    for report in reports.order_by('module'):
        try:
            metadata = module_name_to_metadata(report.module)

            rows = []
            offices_count = 0

            # Exclude party memberships.
            memberships = Membership.objects.filter(
                organization__jurisdiction_id=metadata['jurisdiction_id'],
            ).exclude(
                role__in=('member', 'candidate'),
            ).prefetch_related('contact_details', 'person', 'person__links', 'person__sources')

            for membership in memberships:
                person = membership.person

                try:
                    party_membership = Membership.objects.get(
                        organization__classification='party', role='member', person=person)
                except Membership.DoesNotExist:
                    party_name = None
                else:
                    party_name = party_membership.organization.name

                # If several links share a domain, the last one wins.
                social = {}
                for link in person.links.all():
                    host_labels = urlsplit(link.url).netloc.split('.')
                    column = social_columns.get('.'.join(host_labels[-2:]))
                    if column:
                        social[column] = link.url

                gender = gender_codes.get(person.gender)

                # Everything after the last space is treated as the last name.
                name_parts = person.name.rsplit(' ', 1)
                if len(name_parts) == 2:
                    first_name, last_name = name_parts
                else:
                    first_name, last_name = None, person.name

                sources = list(person.sources.all())

                district_name = remove_suffix_re.sub('', membership.post.label)

                email = None
                for contact_detail in membership.contact_details.all():
                    if contact_detail.type == 'email':
                        email = contact_detail.value
                        break

                source_url = sources[-1].url if len(sources) > 1 else None

                # @see https://represent.opennorth.ca/api/#fields
                row = [
                    district_name,  # District name
                    membership.role,  # Elected office
                    person.name,  # Name
                    first_name,  # First name
                    last_name,  # Last name
                    gender,  # Gender
                    party_name,  # Party name
                    email,  # Email
                    person.image,  # Photo URL
                    source_url,  # Source URL
                    get_personal_url(person),  # Website
                    social.get('facebook'),  # Facebook
                    social.get('instagram'),  # Instagram
                    social.get('twitter'),  # Twitter
                    social.get('linkedin'),  # LinkedIn
                    social.get('youtube'),  # YouTube
                ]

                offices = get_offices(membership)
                offices_count = max(offices_count, len(offices))
                for office in offices:
                    row.extend(office.get(field) for field in ('type', 'postal', 'tel', 'fax'))

                # If the person is associated to multiple boundaries.
                label = membership.post.label
                if re.search(r'\AWards\b', label):
                    for district_id in re.findall(r'\d+', label):
                        rows.append(['Ward %s' % district_id] + row[1:])
                else:
                    rows.append(row)

            rows.sort()

            # One group of office columns per office of the membership with the most offices.
            headers = default_headers + office_headers * offices_count

            name = metadata['name']
            slug = names[name] if name in names else slugify(name)

            publish_csv('csv/%s.csv' % slug, headers, rows)

            max_offices_count = max(max_offices_count, offices_count)

            # The organization name is only a column of the combined CSV.
            for row in rows:
                row.insert(0, name)
            all_rows.extend(rows)
        except ImportError:
            report.delete()  # delete reports for old modules

    headers = ['Organization'] + default_headers + office_headers * max_offices_count
    publish_csv('csv/complete.csv', headers, all_rows)
def handle(self, *args, **options):
    """
    Run the validation checks over the imported data and report what fails.

    If a positional argument is given, it is used as a module name and only
    the jurisdictions of that module's division are checked; otherwise all
    objects are checked. Findings go to ``log`` and to the ``report_value`` /
    ``report_count`` methods of ``self``; nothing is returned.
    """
    sys.path.append(os.path.abspath('scrapers'))

    empty_organizations = {'Parliament of Canada', 'Senate'}

    if args:
        metadata = module_name_to_metadata(args[0])
        jurisdictions = Jurisdiction.objects.filter(division_id=metadata['division_id'])
        organizations = Organization.objects.filter(jurisdiction__in=jurisdictions)
        posts = Post.objects.filter(organization__in=organizations)
        people = Person.objects.filter(memberships__organization__in=organizations)
        memberships = Membership.objects.filter(person__id__in=people)
        contact_details = MembershipContactDetail.objects.filter(membership__in=memberships)
    else:
        # No restriction: work from the model managers directly.
        jurisdictions = Jurisdiction.objects
        organizations = Organization.objects
        posts = Post.objects
        people = Person.objects
        memberships = Membership.objects
        contact_details = MembershipContactDetail.objects

    def organization_names(queryset):
        # Tally the rows of the queryset by the name of their organization.
        return Counter(queryset.values_list('organization__name', flat=True))

    post_memberships_count = posts.values('id').annotate(count=Count('memberships'))

    # Validate the number of organizations per jurisdiction.
    results = jurisdictions.values('id').annotate(count=Count('organizations')).exclude(count=1)
    # The Parliament of Canada has three organizations.
    parliament = {'count': 3, 'id': 'ocd-jurisdiction/country:ca/legislature'}
    if len(results) > 1 or (results and results[0] != parliament):
        log.error('%d jurisdictions do not have one organization' % len(results))
        for result in results:
            log.info('%d %s' % (result['count'], result['id']))

    # Validate the presence of posts and memberships on organizations.
    names = (
        organizations.values('id')
        .exclude(classification='party')
        .annotate(count=Count('posts'))
        .filter(count=0)
        .values_list('name', flat=True)
    )
    self.report_value('non-party organizations have no posts', set(names).difference(empty_organizations))

    names = (
        organizations.values('id')
        .annotate(count=Count('memberships'))
        .filter(count=0)
        .values_list('name', flat=True)
    )
    self.report_value('organizations have no memberships', set(names).difference(empty_organizations))

    # Validate the number of memberships per post.
    self.report_count(
        'organizations have posts with no memberships (seats may be vacant)',
        organization_names(post_memberships_count.filter(count=0)))
    self.report_count(
        'organizations have posts with many memberships',
        organization_names(post_memberships_count.filter(count__gt=1)))

    # Validate the presence of posts on memberships.
    self.report_count(
        'non-party organizations have memberships with no posts',
        organization_names(memberships.filter(post_id=None).exclude(organization__classification='party')))

    # Validate that people have at most one post-membership and one party-membership.
    names = (
        people.values('id')
        .exclude(memberships__organization__classification='party')
        .annotate(count=Count('memberships'))
        .exclude(count=1)
        .values_list('name', flat=True)
    )
    self.report_value('people have many non-party memberships', names)

    names = (
        people.values('id')
        .filter(memberships__organization__classification='party')
        .annotate(count=Count('memberships'))
        .exclude(count=1)
        .values_list('name', flat=True)
    )
    self.report_value('people have many party memberships', names)

    # Validate the uniqueness of names, images and link URLs.
    self.report_count(
        'people have the same image',
        self.repeated(people.exclude(image='').values_list('image', flat=True)))
    self.report_count(
        'people have the same name',
        self.repeated(people.values_list('name', flat=True)))
    self.report_count(
        'people have the same link URL',
        self.repeated(people.exclude(links__url=None).values_list('links__url', flat=True)))

    # Validate the uniqueness of email contact detail values.
    self.report_count(
        'membership contact details with the same email',
        self.repeated(contact_details.filter(type='email').values_list('value', flat=True)))

    # Validate presence of email contact detail.
    jurisdiction_with_no_email = [
        # Javascript-encoded email
        'ocd-jurisdiction/country:ca/csd:1217030/legislature',  # Cape Breton
        # Webform email
        'ocd-jurisdiction/country:ca/csd:1310032/legislature',  # Fredericton
        'ocd-jurisdiction/country:ca/csd:2423027/legislature',  # Québec
        'ocd-jurisdiction/country:ca/csd:2464008/legislature',  # Terrebonne
        'ocd-jurisdiction/country:ca/csd:2466097/legislature',  # Pointe-Claire
        'ocd-jurisdiction/country:ca/csd:3530016/legislature',  # Waterloo
        'ocd-jurisdiction/country:ca/csd:3530035/legislature',  # Woolwich
        'ocd-jurisdiction/country:ca/csd:4706027/legislature',  # Regina
        'ocd-jurisdiction/country:ca/csd:4806016/legislature',  # Calgary
    ]
    # In these jurisdictions, a single membership without an email is tolerated.
    leaders_with_no_email = [
        'ocd-jurisdiction/country:ca/cd:3521/legislature',  # Peel
        'ocd-jurisdiction/country:ca/csd:2437067/legislature',  # Trois-Rivières
        'ocd-jurisdiction/country:ca/csd:2456083/legislature',  # Saint-Jean-sur-Richelieu
        'ocd-jurisdiction/country:ca/csd:2494068/legislature',  # Saguenay
        'ocd-jurisdiction/country:ca/csd:3520005/legislature',  # Toronto
        'ocd-jurisdiction/country:ca/csd:3521024/legislature',  # Caledon
        'ocd-jurisdiction/country:ca/csd:3530013/legislature',  # Kitchener
        'ocd-jurisdiction/country:ca/csd:4711066/legislature',  # Saskatoon
        'ocd-jurisdiction/country:ca/csd:4811061/legislature',  # Edmonton
        'ocd-jurisdiction/country:ca/csd:4816037/legislature',  # Wood Buffalo
        'ocd-jurisdiction/country:ca/csd:5909052/legislature',  # Abbotsford
        'ocd-jurisdiction/country:ca/csd:5915004/legislature',  # Surrey
    ]

    jurisdiction_ids = jurisdictions.exclude(id__in=jurisdiction_with_no_email).values_list('id', flat=True)
    for jurisdiction_id in jurisdiction_ids:
        for organization in organizations.filter(jurisdiction_id=jurisdiction_id):
            # It's ridiculous that Django can't do a LEFT OUTER JOIN with a WHERE clause.
            memberships_with_no_email = sum(
                1 for membership in organization.memberships.all()
                if not membership.contact_details.filter(type='email').count()
            )
            if memberships_with_no_email > 1 or (memberships_with_no_email == 1 and jurisdiction_id not in leaders_with_no_email):
                log.error('%2d memberships have no email in %s' % (memberships_with_no_email, organization.name))
def represent(request, module_name):
    """
    Serve the memberships of a scraper module's jurisdiction as a JSON array of
    representatives.

    :param request: the HTTP request (not read by this view)
    :param module_name: name of the scraper module; if it ends in
        ``_candidates``, only memberships with the role 'candidate' are
        returned, otherwise memberships with the roles 'member' and
        'candidate' are left out
    :returns: an ``HttpResponse`` with the content type ``application/json``
    """
    # This view runs on every request: only add the directory once, so that
    # sys.path doesn't grow with each call.
    scrapers_path = os.path.abspath('scrapers')
    if scrapers_path not in sys.path:
        sys.path.append(scrapers_path)

    metadata = module_name_to_metadata(module_name)

    representatives = []

    queryset = Membership.objects.filter(
        organization__jurisdiction_id=metadata['jurisdiction_id'])
    # filter() and exclude() return a new QuerySet and leave the original
    # untouched, so the result must be assigned.
    if module_name.endswith('_candidates'):
        # Include only candidates.
        queryset = queryset.filter(role='candidate')
    else:
        # Exclude candidates and party memberships.
        queryset = queryset.exclude(role__in=('member', 'candidate'))

    for membership in queryset.prefetch_related('contact_details', 'person',
                                                'person__links',
                                                'person__sources', 'post'):
        person = membership.person

        # Not sure why this is necessary.
        if not isinstance(membership.extras, dict):
            membership.extras = json.loads(membership.extras)
        if not isinstance(person.extras, dict):
            person.extras = json.loads(person.extras)

        try:
            party_name = Membership.objects.select_related('organization').get(
                organization__classification='party',
                role='member',
                person=person).organization.name
        except Membership.DoesNotExist:
            party_name = None

        if person.gender == 'male':
            gender = 'M'
        elif person.gender == 'female':
            gender = 'F'
        else:
            gender = None

        # Candidates only. Removed from the extras before get_extra() is
        # called below; it is reported as its own field instead.
        incumbent = person.extras.pop('incumbent', None)

        # @see https://represent.opennorth.ca/api/#fields
        representative = {
            'name': person.name,
            'elected_office': membership.role,
            'party_name': party_name,
            'email': next((contact_detail.value
                           for contact_detail in membership.contact_details.all()
                           if contact_detail.type == 'email'), None),
            'photo_url': person.image or None,
            'personal_url': get_personal_url(person),
            'gender': gender,
            'offices': json.dumps(get_offices(membership)),
            'extra': json.dumps(get_extra(person)),
        }

        sources = list(person.sources.all())

        # The first URL ought to be the most generic source.
        representative['source_url'] = sources[0].url

        if len(sources) > 1:
            # The last URL ought to be the most specific source.
            representative['url'] = sources[-1].url

        if incumbent:
            representative['incumbent'] = True

        match = re.search(r'^(\S+) (Ward \d+)$', membership.post.label)

        # If the person is part of Peel Regional Council.
        if match:
            parent = Division.objects.get(subtype1='csd',
                                          subtype2='',
                                          name=match.group(1))
            division = Division.objects.get(subtype1='csd',
                                            subid1=parent.subid1,
                                            name=match.group(2))
            boundary_set_slug = '{}-wards'.format(parent.name.lower())
            representative['district_name'] = membership.post.label
            representative['boundary_url'] = '/boundaries/{}/ward-{}/'.format(
                boundary_set_slug, division.subid2)
            representatives.append(representative)

        # If the person is associated to multiple boundaries.
        elif re.search(r'^Wards\b', membership.post.label):
            # One copy of the representative per ward number in the label.
            for district_id in re.findall(r'\d+', membership.post.label):
                representative = representative.copy()
                representative['district_id'] = district_id
                representative['district_name'] = 'Ward {}'.format(district_id)
                representatives.append(representative)

        else:
            division_id = metadata['division_id']
            # Raw strings, so that \d and \Z reach the regex engine unchanged.
            if re.search(r'^ocd-division/country:ca/csd:(\d{7})\Z', division_id):
                geographic_code = division_id[-7:]
            elif re.search(r'^ocd-division/country:ca/cd:(\d{4})\Z', division_id):
                geographic_code = division_id[-4:]
            else:
                geographic_code = None

            post_label = remove_suffix_re.sub('', membership.post.label)

            # If the post label is numeric.
            if re.search(r'^\d+\Z', post_label):
                representative['district_id'] = post_label

            # If the person has a boundary URL.
            elif 'boundary_url' in membership.extras:
                representative['district_name'] = post_label
                representative['boundary_url'] = membership.extras['boundary_url']

            # If the post label is a Census geographic name.
            elif post_label == metadata['division_name'] and geographic_code:
                representative['district_name'] = post_label
                if len(geographic_code) == 7:
                    representative['boundary_url'] = (
                        '/boundaries/census-subdivisions/{}/'.format(geographic_code))
                elif len(geographic_code) == 4:
                    representative['boundary_url'] = (
                        '/boundaries/census-divisions/{}/'.format(geographic_code))

            else:
                representative['district_name'] = post_label
                district_id = re.search(r'^(?:District|Division|Ward) (\d+)\Z',
                                        post_label)
                if district_id:
                    representative['district_id'] = district_id.group(1)

            representatives.append(representative)

    return HttpResponse(json.dumps(representatives),
                        content_type='application/json')