Beispiel #1
0
def home(request):
    """Render the index page listing every scraper report with a status icon.

    The representative sets published by Represent are fetched and indexed by
    name. Each report without an exception then gets an ``icon`` attribute:

    * ``'noop'`` - a set with the same name exists and its data URL starts
      with ``https://scrapers.herokuapp.com/represent/``.
    * ``'replace'`` - a set with the same name exists with another data URL.
    * ``'add'`` - no set with that name exists.

    Reports whose module raises ``ImportError`` during the metadata lookup are
    deleted from the database.
    """
    # Make the scraper modules in the local `scrapers` directory importable.
    sys.path.append(os.path.abspath('scrapers'))

    response = requests.get('https://represent.opennorth.ca/representative-sets/?limit=0', verify=settings.SSL_VERIFY)
    payload = json.loads(response.text)

    data_urls = {}
    for representative_set in payload['objects']:
        label = representative_set['name']
        # The `ca` scraper has "Parliament of Canada" as the root organization.
        if label == 'House of Commons':
            label = 'Parliament of Canada'
        data_urls[label] = representative_set['data_url']

    reports = Report.objects.order_by('module').all()
    for report in reports:
        try:
            name = module_name_to_metadata(report.module)['name']
        except ImportError:
            report.delete()  # delete reports for old modules
            continue

        # Failed reports are listed without an icon.
        if report.exception:
            continue

        if name not in data_urls:
            report.icon = 'add'
        elif data_urls[name].startswith('https://scrapers.herokuapp.com/represent/'):
            report.icon = 'noop'
        else:
            report.icon = 'replace'

    context = {
        'exceptions': Report.objects.exclude(exception='').count(),
        'reports': reports,
    }
    return render(request, 'index.html', context)
Beispiel #2
0
    def handle(self, *args, **options):
        """Print each scraper module with its report status and population.

        Positional arguments: the first, if given, is an integer population
        threshold; any further arguments are module names. With no module
        names, every entry of the `scrapers` directory is considered. When a
        threshold is given, only modules whose population is below it are
        printed.
        """
        sys.path.append(os.path.abspath('scrapers'))

        remaining = list(args)
        threshold = int(remaining.pop(0)) if remaining else None
        module_names = remaining or os.listdir('scrapers')

        urls = [
            # Provinces and territories
            'https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=101&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-101.CSV',
            # Census subdivisions
            'https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=701&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-701.CSV',
            # Census divisions
            'https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=301&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-301.CSV',
        ]

        # Maps the first CSV column to the integer in the fifth column
        # (presumably geographic code -> 2011 population; confirm against the files).
        populations = {}
        for url in urls:
            response = requests.get(url)
            response.encoding = 'ISO-8859-1'
            reader = csv.reader(StringIO(response.text))
            next(reader)  # title
            next(reader)  # headers
            for row in reader:
                # The first empty row ends the data table.
                if not row:
                    break
                populations[row[0]] = int(row[4] or 0)

        for module_name in module_names:
            if not os.path.isdir(os.path.join('scrapers', module_name)) or module_name in ('.git', '_cache', '_data', '__pycache__', 'disabled'):
                continue

            division_id = module_name_to_metadata(module_name)['division_id']

            try:
                report = Report.objects.get(module=module_name)
            except Report.DoesNotExist:
                status = 'unknown'
            else:
                status = 'error' if report.exception else 'success'

            # Fall back to the identifier at the end of the division ID when
            # the division has no `sgc` attribute value.
            sgc = Division.get(division_id).attrs['sgc']
            if not sgc:
                last_segment = division_id.rsplit('/', 1)[-1]
                sgc = last_segment.split(':', 1)[-1]
            if sgc == 'ca':
                sgc = '01'

            population = populations.get(sgc, 0)
            if not threshold or population < threshold:
                print('%-32s %-7s %8d' % (module_name, status, population))
Beispiel #3
0
    def handle(self, *args, **options):
        """Print each scraper module with its report status and 2016 population.

        Reads ``options['threshold']`` and ``options['module']``. With no
        module names, every entry of the `scrapers` directory is considered;
        only entries containing an ``__init__.py`` are reported. When a
        threshold is set, only modules whose population is below it are printed.
        """
        sys.path.append(os.path.abspath('scrapers'))

        threshold = options['threshold']
        module_names = options['module'] or os.listdir('scrapers')

        # @see http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/index-eng.cfm
        urls = [
            # Provinces and territories
            'http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=101&OFT=FULLCSV',
            # Census subdivisions
            'http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=701&OFT=FULLCSV',
            # Census divisions
            'http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=301&OFT=FULLCSV',
        ]

        # Geographic code -> population, merged across the three files.
        populations = {}
        for url in urls:
            response = requests.get(url, verify=settings.SSL_VERIFY)
            response.encoding = 'iso-8859-1'
            for row in csv.DictReader(StringIO(response.text)):
                if not row:
                    break
                populations[row['Geographic code']] = int(row['Population, 2016'] or 0)

        for module_name in module_names:
            # Only directories that are Python packages are scraper modules.
            if not os.path.isfile(os.path.join('scrapers', module_name, '__init__.py')):
                continue

            division_id = module_name_to_metadata(module_name)['division_id']

            try:
                report = Report.objects.get(module=module_name)
            except Report.DoesNotExist:
                status = 'unknown'
            else:
                status = 'error' if report.exception else 'success'

            # Fall back to the text after the last colon of the division ID
            # when the division has no `sgc` attribute value.
            sgc = Division.get(division_id).attrs['sgc']
            if not sgc:
                sgc = division_id.rsplit(':', 1)[1]

            population = populations.get(sgc, 0)
            if not threshold or population < threshold:
                print('{:<32} {:<7} {:8}'.format(module_name, status, population))
Beispiel #4
0
def home(request):
    """Render the index page listing every scraper report with a status icon.

    Each report without an exception is compared, by organization name, with
    the representative sets returned by Represent and gets an ``icon`` of
    ``'noop'`` (same name, data URL under
    ``https://scrapers.herokuapp.com/represent/``), ``'replace'`` (same name,
    other data URL) or ``'add'`` (no set with that name). Reports with an
    exception are left untouched. An error-free report whose module raises
    ``ImportError`` during the metadata lookup is deleted.
    """
    # Make the scraper modules in the local `scrapers` directory importable.
    sys.path.append(os.path.abspath('scrapers'))

    response = requests.get(
        'https://represent.opennorth.ca/representative-sets/?limit=0')
    representative_sets = json.loads(response.text)['objects']

    # Represent's "House of Commons" set is indexed as "Parliament of Canada".
    data_urls = {}
    for item in representative_sets:
        if item['name'] == 'House of Commons':
            key = 'Parliament of Canada'
        else:
            key = item['name']
        data_urls[key] = item['data_url']

    reports = Report.objects.order_by('module').all()
    for report in reports:
        if report.exception:
            continue

        try:
            name = module_name_to_metadata(report.module)['name']
        except ImportError:
            report.delete()  # delete reports for old modules
            continue

        if name not in data_urls:
            report.icon = 'add'
        elif data_urls[name].startswith(
                'https://scrapers.herokuapp.com/represent/'):
            report.icon = 'noop'
        else:
            report.icon = 'replace'

    context = RequestContext(request, {
        'exceptions': Report.objects.exclude(exception='').count(),
        'reports': reports,
    })
    return render_to_response('index.html', context)
Beispiel #5
0
def represent(request, module_name):
    """Return a module's scraped memberships as a JSON list of representatives.

    Memberships are selected from the organizations in the module's
    jurisdiction. For a module whose name ends in ``_candidates`` only
    memberships with the ``candidate`` role are included; for any other module
    memberships with the ``member`` or ``candidate`` role are excluded.

    Each membership yields one dictionary (or one per ward, when the post
    label starts with "Wards") with the person's name, role, party, email,
    photo, links, offices, sources and district/boundary information.

    :param request: the incoming HTTP request (unused)
    :param module_name: the name of the scraper module
    :returns: an ``HttpResponse`` with an ``application/json`` body
    :raises IndexError: if a person has no sources
    """
    sys.path.append(os.path.abspath('scrapers'))

    metadata = module_name_to_metadata(module_name)

    representatives = []

    # Restrict to memberships of organizations in the module's jurisdiction.
    queryset = Membership.objects.filter(organization__jurisdiction_id=metadata['jurisdiction_id'])

    # `filter()` and `exclude()` return new querysets rather than modifying
    # the receiver, so the result must be reassigned to take effect.
    if module_name.endswith('_candidates'):
        # Include only candidates.
        queryset = queryset.filter(role='candidate')
    else:
        # Exclude candidates and party memberships.
        queryset = queryset.exclude(role__in=('member', 'candidate'))

    for membership in queryset.prefetch_related('contact_details', 'person', 'person__links', 'person__sources', 'post'):
        person = membership.person

        # Not sure why this is necessary.
        if not isinstance(membership.extras, dict):
            membership.extras = json.loads(membership.extras)
        if not isinstance(person.extras, dict):
            person.extras = json.loads(person.extras)

        try:
            party_name = Membership.objects.select_related('organization').get(organization__classification='party', role='member', person=person).organization.name
        except Membership.DoesNotExist:
            party_name = None

        if person.gender == 'male':
            gender = 'M'
        elif person.gender == 'female':
            gender = 'F'
        else:
            gender = None

        # Candidates only. Popped before `get_extra(person)` is called below,
        # so `person.extras` no longer holds the key at that point.
        incumbent = person.extras.pop('incumbent', None)

        # @see https://represent.opennorth.ca/api/#fields
        representative = {
            'name': person.name,
            'elected_office': membership.role,
            'party_name': party_name,
            'email': next((contact_detail.value for contact_detail in membership.contact_details.all() if contact_detail.type == 'email'), None),
            'photo_url': person.image or None,
            'personal_url': get_personal_url(person),
            'gender': gender,
            'offices': json.dumps(get_offices(membership)),
            'extra': json.dumps(get_extra(person)),
        }

        sources = list(person.sources.all())

        # The first URL ought to be the most generic source.
        # NOTE(review): assumes every person has at least one source.
        representative['source_url'] = sources[0].url

        if len(sources) > 1:
            # The last URL ought to be the most specific source.
            representative['url'] = sources[-1].url

        if incumbent:
            representative['incumbent'] = True

        match = re.search(r'^(\S+) (Ward \d+)$', membership.post.label)

        # If the person is part of Peel Regional Council.
        if match:
            parent = Division.objects.get(subtype1='csd', subtype2='', name=match.group(1))
            division = Division.objects.get(subtype1='csd', subid1=parent.subid1, name=match.group(2))
            boundary_set_slug = '{}-wards'.format(parent.name.lower())
            representative['district_name'] = membership.post.label
            representative['boundary_url'] = '/boundaries/{}/ward-{}/'.format(boundary_set_slug, division.subid2)
            representatives.append(representative)

        # If the person is associated to multiple boundaries.
        elif re.search(r'^Wards\b', membership.post.label):
            for district_id in re.findall(r'\d+', membership.post.label):
                # Copy so that each ward gets its own dictionary.
                representative = representative.copy()
                representative['district_id'] = district_id
                representative['district_name'] = 'Ward {}'.format(district_id)
                representatives.append(representative)

        else:
            division_id = metadata['division_id']

            # Raw strings keep `\d` and `\Z` as regex escapes instead of
            # invalid string escape sequences.
            if re.search(r'^ocd-division/country:ca/csd:(\d{7})\Z', division_id):
                geographic_code = division_id[-7:]
            elif re.search(r'^ocd-division/country:ca/cd:(\d{4})\Z', division_id):
                geographic_code = division_id[-4:]
            else:
                geographic_code = None

            post_label = remove_suffix_re.sub('', membership.post.label)

            # If the post label is numeric.
            if re.search(r'^\d+\Z', post_label):
                representative['district_id'] = post_label

            # If the person has a boundary URL.
            elif 'boundary_url' in membership.extras:
                representative['district_name'] = post_label
                representative['boundary_url'] = membership.extras['boundary_url']

            # If the post label is a Census geographic name.
            elif post_label == metadata['division_name'] and geographic_code:
                representative['district_name'] = post_label
                if len(geographic_code) == 7:
                    representative['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(geographic_code)
                elif len(geographic_code) == 4:
                    representative['boundary_url'] = '/boundaries/census-divisions/{}/'.format(geographic_code)

            else:
                representative['district_name'] = post_label
                district_id = re.search(r'^(?:District|Division|Ward) (\d+)\Z', post_label)
                if district_id:
                    representative['district_id'] = district_id.group(1)

            representatives.append(representative)

    return HttpResponse(json.dumps(representatives), content_type='application/json')
Beispiel #6
0
    def handle(self, *args, **options):
        """Export scraped representatives as CSV files to an S3 bucket.

        For every report without an exception (skipping modules ending in
        ``_candidates`` or ``_municipalities``) one ``csv/<slug>.csv`` file is
        uploaded, followed by a ``csv/complete.csv`` combining all rows with a
        leading "Organization" column. Files are encoded as windows-1252.
        Reports whose processing raises ``ImportError`` are deleted.
        """
        def save(key, body):
            # Upload `body` under `key` and make the object publicly readable.
            s3_key = Key(bucket)
            s3_key.key = key
            s3_key.set_contents_from_string(body)
            s3_key.set_acl('public-read')

        sys.path.append(os.path.abspath('scrapers'))

        bucket = S3Connection().get_bucket('represent.opennorth.ca')

        # Organization names whose file slug is not simply the slugified name.
        slugs = {
            'Parliament of Canada': 'house-of-commons',
            'Legislative Assembly of Alberta': 'alberta-legislature',
            'Legislative Assembly of British Columbia': 'bc-legislature',
            'Legislative Assembly of Manitoba': 'manitoba-legislature',
            'Legislative Assembly of New Brunswick': 'new-brunswick-legislature',
            'Newfoundland and Labrador House of Assembly': 'newfoundland-labrador-legislature',
            'Nova Scotia House of Assembly': 'nova-scotia-legislature',
            'Legislative Assembly of Ontario': 'ontario-legislature',
            'Legislative Assembly of Prince Edward Island': 'pei-legislature',
            'Assemblée nationale du Québec': 'quebec-assemblee-nationale',
            'Legislative Assembly of Saskatchewan': 'saskatchewan-legislature',
        }

        default_headers = [
            'District name',
            'Primary role',
            'Name',  # not in CSV schema
            'First name',
            'Last name',
            'Gender',
            'Party name',
            'Email',
            'Photo URL',
            'Source URL',
            'Website',
            'Facebook',
            'Instagram',
            'Twitter',
            'LinkedIn',
            'YouTube',
        ]
        office_headers = [
            'Office type',  # not in CSV schema
            'Address',  # not in CSV schema
            'Phone',
            'Fax',
        ]

        # Registered domain (last two labels of the host) -> social network.
        social_networks = {
            'facebook.com': 'facebook',
            'fb.com': 'facebook',
            'instagram.com': 'instagram',
            'linkedin.com': 'linkedin',
            'twitter.com': 'twitter',
            'youtube.com': 'youtube',
        }
        genders = {'male': 'M', 'female': 'F'}

        all_rows = []
        max_offices_count = 0

        reports = Report.objects.filter(exception='')
        reports = reports.exclude(module__endswith='_candidates')
        reports = reports.exclude(module__endswith='_municipalities')
        reports = reports.order_by('module')

        for report in reports:
            try:
                metadata = module_name_to_metadata(report.module)

                rows = []
                offices_count = 0

                # Exclude party memberships.
                memberships = Membership.objects.filter(organization__jurisdiction_id=metadata['jurisdiction_id'])
                memberships = memberships.exclude(role__in=('member', 'candidate'))
                memberships = memberships.prefetch_related('contact_details', 'person', 'person__links', 'person__sources')

                for membership in memberships:
                    person = membership.person

                    try:
                        party_membership = Membership.objects.get(organization__classification='party', role='member', person=person)
                    except Membership.DoesNotExist:
                        party_name = None
                    else:
                        party_name = party_membership.organization.name

                    # The last link seen for a network wins.
                    links = dict.fromkeys(('facebook', 'instagram', 'linkedin', 'twitter', 'youtube'))
                    for link in person.links.all():
                        host_labels = urlsplit(link.url).netloc.split('.')
                        network = social_networks.get('.'.join(host_labels[-2:]))
                        if network is not None:
                            links[network] = link.url

                    gender = genders.get(person.gender)

                    # Everything before the last space is the first name.
                    first_name, separator, last_name = person.name.rpartition(' ')
                    if not separator:
                        first_name = None

                    # @see https://represent.opennorth.ca/api/#fields
                    sources = list(person.sources.all())
                    district_name = remove_suffix_re.sub('', membership.post.label)

                    email = None
                    for contact_detail in membership.contact_details.all():
                        if contact_detail.type == 'email':
                            email = contact_detail.value
                            break

                    source_url = sources[-1].url if len(sources) > 1 else None

                    row = [
                        district_name,  # District name
                        membership.role,  # Elected office
                        person.name,  # Name
                        first_name,  # First name
                        last_name,  # Last name
                        gender,  # Gender
                        party_name,  # Party name
                        email,  # Email
                        person.image,  # Photo URL
                        source_url,  # Source URL
                        get_personal_url(person),  # Website
                        links['facebook'],  # Facebook
                        links['instagram'],  # Instagram
                        links['twitter'],  # Twitter
                        links['linkedin'],  # LinkedIn
                        links['youtube'],  # YouTube
                    ]

                    offices = get_offices(membership)
                    offices_count = max(offices_count, len(offices))

                    # Four trailing columns per office, matching `office_headers`.
                    for office in offices:
                        row.extend(office.get(key) for key in ('type', 'postal', 'tel', 'fax'))

                    # If the person is associated to multiple boundaries.
                    if re.search(r'\AWards\b', membership.post.label):
                        for district_id in re.findall(r'\d+', membership.post.label):
                            ward_row = row[:]
                            ward_row[0] = 'Ward %s' % district_id
                            rows.append(ward_row)
                    else:
                        rows.append(row)

                rows.sort()

                headers = default_headers + office_headers * offices_count

                name = metadata['name']
                slug = slugs[name] if name in slugs else slugify(name)

                buffer = StringIO()
                writer = csv.writer(buffer)
                writer.writerow(headers)
                writer.writerows(rows)
                save('csv/%s.csv' % slug,
                     codecs.encode(buffer.getvalue(), 'windows-1252'))

                max_offices_count = max(max_offices_count, offices_count)

                # The complete file carries the organization name up front.
                for row in rows:
                    row.insert(0, name)
                    all_rows.append(row)
            except ImportError:
                report.delete()  # delete reports for old modules

        headers = ['Organization'] + default_headers + office_headers * max_offices_count

        buffer = StringIO()
        writer = csv.writer(buffer)
        writer.writerow(headers)
        writer.writerows(all_rows)
        save('csv/complete.csv', codecs.encode(buffer.getvalue(), 'windows-1252'))
Beispiel #7
0
    def handle(self, *args, **options):
        """Run consistency checks on the scraped data and report anomalies.

        When ``options['module']`` is set, only jurisdictions of that module's
        division are checked; otherwise every jurisdiction whose
        classification is not ``executive`` is checked.

        The checks cover: organizations per jurisdiction, posts and
        memberships per organization, memberships per post, posts on
        memberships, memberships per person, repeated names, images, link
        URLs and emails, and memberships lacking an email. Findings are passed
        to ``self.report_value`` / ``self.report_count`` or logged via ``log``.
        """
        sys.path.append(os.path.abspath('scrapers'))

        # Organization names subtracted from the "no posts" and
        # "no memberships" results below.
        empty_organizations = {'Parliament of Canada', 'Senate'}

        if options['module']:
            division_id = module_name_to_metadata(
                options['module'])['division_id']
            jurisdictions = Jurisdiction.objects.filter(
                division_id=division_id)
        else:
            # Exclude candidate scrapers.
            # NOTE(review): presumably candidate scrapers use the 'executive'
            # classification - confirm.
            jurisdictions = Jurisdiction.objects.exclude(
                classification='executive')

        # Narrow every other model to the selected jurisdictions.
        organizations = Organization.objects.filter(
            jurisdiction__in=jurisdictions)
        posts = Post.objects.filter(organization__in=organizations)
        people = Person.objects.filter(
            memberships__organization__in=organizations)
        memberships = Membership.objects.filter(person_id__in=people)
        contact_details = MembershipContactDetail.objects.filter(
            membership__in=memberships)

        # A person has multiple memberships.
        # Maps a jurisdiction ID to the threshold used for its own
        # membership-count and repetition checks below.
        jurisdiction_with_repetition = {
            'ocd-jurisdiction/country:ca/cd:3521/legislature':
            4,  # Peel, due to Brampton
            'ocd-jurisdiction/country:ca/csd:3521010/legislature':
            4,  # Brampton
        }

        post_memberships_count = posts.values('id').annotate(
            count=Count('memberships'))

        # Validate the number of organizations per jurisdiction.
        results = jurisdictions.values('id').annotate(
            count=Count('organizations')).exclude(count=1)
        # The Parliament of Canada has three organizations.
        # `and` binds tighter than `or`: no error is logged when the results
        # are empty or consist solely of that expected entry.
        if len(results) > 1 or results and results[0] != {
                'count': 3,
                'id': 'ocd-jurisdiction/country:ca/legislature'
        }:
            log.error('{} jurisdictions do not have one organization'.format(
                len(results)))
            for result in results:
                log.info('{} {}'.format(result['count'], result['id']))

        # Validate the presence of posts and memberships on organizations.
        results = set(
            organizations.values('id').exclude(
                classification__in=('committee', 'party')).annotate(
                    count=Count('posts')).filter(count=0).values_list(
                        'name', flat=True)) - empty_organizations
        self.report_value(
            'non-committee, non-party organizations have no posts', results)
        results = set(
            organizations.values('id').exclude(
                classification='committee').annotate(
                    count=Count('memberships')).filter(count=0).values_list(
                        'name', flat=True)) - empty_organizations
        self.report_value('non-committee organizations have no memberships',
                          results)

        # Validate the number of memberships per post.
        results = Counter(
            post_memberships_count.filter(count=0).values_list(
                'organization__name', flat=True))
        self.report_count(
            'organizations have posts with no memberships (seats may be vacant)',
            results)
        results = Counter(
            post_memberships_count.filter(count__gt=1).values_list(
                'organization__name', flat=True))
        self.report_count('organizations have posts with many memberships',
                          results)

        # Validate the presence of posts on memberships.
        results = Counter(
            memberships.filter(post_id=None).exclude(
                organization__classification='party').values_list(
                    'organization__name', flat=True))
        self.report_count(
            'non-party organizations have memberships with no posts', results)

        # Validate that people have at most one post-membership.
        results = people.values('id').exclude(
            memberships__organization__classification='party').exclude(
                memberships__organization__jurisdiction_id__in=
                jurisdiction_with_repetition.keys()).annotate(
                    count=Count('memberships')).exclude(count=1).values_list(
                        'name', flat=True)
        self.report_value('people have many non-party memberships', results)
        # Jurisdictions with known repetition are checked against their own
        # threshold instead of 1.
        for jurisdiction_id, threshold in jurisdiction_with_repetition.items():
            results = people.values('id').exclude(
                memberships__organization__classification='party').filter(
                    memberships__organization__jurisdiction_id=jurisdiction_id
                ).annotate(count=Count('memberships')).exclude(
                    count__lte=threshold).values_list('name', flat=True)
            self.report_value(
                'people have many non-party memberships in {}'.format(
                    jurisdiction_id), results)

        # Validate that people have at most one party-membership.
        results = people.values('id').filter(
            memberships__organization__classification='party').annotate(
                count=Count('memberships')).exclude(count=1).values_list(
                    'name', flat=True)
        self.report_value('people have many party memberships', results)

        # Validate the uniqueness of names and images.
        people_without_repetition = people.exclude(
            memberships__organization__jurisdiction_id__in=
            jurisdiction_with_repetition.keys())
        results = self.repeated(
            people_without_repetition.values_list('name', flat=True))
        self.report_count('names are repeated across people', results)
        results = self.repeated(
            people_without_repetition.exclude(image='').values_list('image',
                                                                    flat=True))
        self.report_count('images are repeated across people', results)
        for jurisdiction_id, threshold in jurisdiction_with_repetition.items():
            people_with_repetition = people.filter(
                memberships__organization__jurisdiction_id=jurisdiction_id)
            results = self.repeated(people_with_repetition.values_list(
                'name', flat=True),
                                    threshold=threshold)
            self.report_count(
                'names are repeated across people in {}'.format(
                    jurisdiction_id), results)
            results = self.repeated(people_with_repetition.exclude(
                image='').values_list('image', flat=True),
                                    threshold=threshold)
            self.report_count(
                'images are repeated across people in {}'.format(
                    jurisdiction_id), results)

        # Validate the uniqueness of link URLs.
        results = self.repeated(
            people.exclude(links__url=None).values_list('links__url',
                                                        flat=True))
        self.report_count('link URLs are repeated across people', results)

        # Validate the uniqueness of email contact detail values.
        results = self.repeated(
            contact_details.filter(type='email').exclude(
                membership__organization__jurisdiction_id__in=
                jurisdiction_with_repetition.keys()).values_list('value',
                                                                 flat=True))
        self.report_count(
            'emails are repeated across membership contact details', results)
        for jurisdiction_id, threshold in jurisdiction_with_repetition.items():
            results = self.repeated(
                contact_details.filter(type='email').filter(
                    membership__organization__jurisdiction_id=jurisdiction_id).
                values_list('value', flat=True),
                threshold=threshold)
            self.report_count(
                'emails are repeated across membership contact details in {}'.
                format(jurisdiction_id), results)

        # Validate presence of email contact detail.
        # Jurisdictions in this list are skipped by the email check entirely.
        jurisdiction_with_no_email = [
            # Javascript-encoded email
            'ocd-jurisdiction/country:ca/csd:1217030/legislature',  # Cape Breton
            # Webform email
            'ocd-jurisdiction/country:ca/csd:2423027/legislature',  # Québec
            'ocd-jurisdiction/country:ca/csd:2464008/legislature',  # Terrebonne
            'ocd-jurisdiction/country:ca/csd:3524009/legislature',  # Milton
            'ocd-jurisdiction/country:ca/csd:3530016/legislature',  # Waterloo
            'ocd-jurisdiction/country:ca/csd:3530027/legislature',  # Wellesley
            'ocd-jurisdiction/country:ca/csd:3530035/legislature',  # Woolwich
            'ocd-jurisdiction/country:ca/csd:4706027/legislature',  # Regina
            'ocd-jurisdiction/country:ca/csd:4711066/legislature',  # Saskatoon
            'ocd-jurisdiction/country:ca/csd:4806016/legislature',  # Calgary
            'ocd-jurisdiction/country:ca/csd:5909052/legislature',  # Abbotsford
        ]
        # Jurisdictions in this set are allowed exactly one membership
        # without an email (see the condition at the end of the loop).
        leaders_with_no_email = {
            'ocd-jurisdiction/country:ca/cd:3521/legislature',  # Peel
            'ocd-jurisdiction/country:ca/csd:2437067/legislature',  # Trois-Rivières
            'ocd-jurisdiction/country:ca/csd:2456083/legislature',  # Saint-Jean-sur-Richelieu
            'ocd-jurisdiction/country:ca/csd:2494068/legislature',  # Saguenay
            'ocd-jurisdiction/country:ca/csd:3520005/legislature',  # Toronto
            'ocd-jurisdiction/country:ca/csd:3521024/legislature',  # Caledon
            'ocd-jurisdiction/country:ca/csd:3530013/legislature',  # Kitchener
            'ocd-jurisdiction/country:ca/csd:4811061/legislature',  # Edmonton
            'ocd-jurisdiction/country:ca/csd:4816037/legislature',  # Wood Buffalo
            'ocd-jurisdiction/country:ca/csd:5909052/legislature',  # Abbotsford
            'ocd-jurisdiction/country:ca/csd:5915004/legislature',  # Surrey
        }
        jurisdiction_ids = jurisdictions.exclude(
            id__in=jurisdiction_with_no_email).values_list('id', flat=True)
        for jurisdiction_id in jurisdiction_ids:
            for organization in organizations.filter(
                    jurisdiction_id=jurisdiction_id):
                # It's ridiculous that Django can't do a LEFT OUTER JOIN with a WHERE clause.
                # Counts memberships without an email, one count query per
                # membership.
                memberships_with_no_email = sum(
                    not membership.contact_details.filter(
                        type='email').count()
                    for membership in organization.memberships.all())
                # Error when more than one membership lacks an email, or when
                # any does and the jurisdiction is not in `leaders_with_no_email`.
                if memberships_with_no_email > 1 or memberships_with_no_email and jurisdiction_id not in leaders_with_no_email:
                    log.error('{:2} memberships have no email in {}'.format(
                        memberships_with_no_email, organization.name))
Beispiel #8
0
    def handle(self, *args, **options):
        """Run the data validations and report what they find.

        If positional arguments are given, the first one is resolved with
        ``module_name_to_metadata`` and every check is limited to records
        reachable from jurisdictions with that module's division ID.
        Otherwise the checks run over all records. Findings are passed to
        ``self.report_value`` / ``self.report_count`` or written to ``log``.
        """
        sys.path.append(os.path.abspath('scrapers'))

        # Names removed from the "no posts" and "no memberships" results.
        empty_organizations = {'Parliament of Canada', 'Senate'}

        if not args:
            jurisdictions = Jurisdiction.objects
            organizations = Organization.objects
            posts = Post.objects
            people = Person.objects
            memberships = Membership.objects
            contact_details = MembershipContactDetail.objects
        else:
            division_id = module_name_to_metadata(args[0])['division_id']
            jurisdictions = Jurisdiction.objects.filter(division_id=division_id)
            organizations = Organization.objects.filter(jurisdiction__in=jurisdictions)
            posts = Post.objects.filter(organization__in=organizations)
            people = Person.objects.filter(memberships__organization__in=organizations)
            memberships = Membership.objects.filter(person__id__in=people)
            contact_details = MembershipContactDetail.objects.filter(membership__in=memberships)

        # Validate the number of organizations per jurisdiction.
        organization_counts = (
            jurisdictions.values('id')
            .annotate(count=Count('organizations'))
            .exclude(count=1)
        )
        # The Parliament of Canada has three organizations.
        parliament = {'count': 3, 'id': 'ocd-jurisdiction/country:ca/legislature'}
        if len(organization_counts) > 1 or (organization_counts and organization_counts[0] != parliament):
            log.error('%d jurisdictions do not have one organization' % len(organization_counts))
            for entry in organization_counts:
                log.info('%d %s' % (entry['count'], entry['id']))

        # Validate the presence of posts and memberships on organizations.
        names_without_posts = (
            organizations.values('id')
            .exclude(classification='party')
            .annotate(count=Count('posts'))
            .filter(count=0)
            .values_list('name', flat=True)
        )
        self.report_value('non-party organizations have no posts', set(names_without_posts) - empty_organizations)
        names_without_memberships = (
            organizations.values('id')
            .annotate(count=Count('memberships'))
            .filter(count=0)
            .values_list('name', flat=True)
        )
        self.report_value('organizations have no memberships', set(names_without_memberships) - empty_organizations)

        # Validate the number of memberships per post.
        post_memberships_count = posts.values('id').annotate(count=Count('memberships'))
        vacant_posts = post_memberships_count.filter(count=0).values_list('organization__name', flat=True)
        self.report_count('organizations have posts with no memberships (seats may be vacant)', Counter(vacant_posts))
        shared_posts = post_memberships_count.filter(count__gt=1).values_list('organization__name', flat=True)
        self.report_count('organizations have posts with many memberships', Counter(shared_posts))

        # Validate the presence of posts on memberships.
        postless_memberships = (
            memberships.filter(post_id=None)
            .exclude(organization__classification='party')
            .values_list('organization__name', flat=True)
        )
        self.report_count('non-party organizations have memberships with no posts', Counter(postless_memberships))

        # Validate that people have at most one post-membership and one party-membership.
        non_party_names = (
            people.values('id')
            .exclude(memberships__organization__classification='party')
            .annotate(count=Count('memberships'))
            .exclude(count=1)
            .values_list('name', flat=True)
        )
        self.report_value('people have many non-party memberships', non_party_names)
        party_names = (
            people.values('id')
            .filter(memberships__organization__classification='party')
            .annotate(count=Count('memberships'))
            .exclude(count=1)
            .values_list('name', flat=True)
        )
        self.report_value('people have many party memberships', party_names)

        # Validate the uniqueness of names, images and link URLs.
        repeated_images = self.repeated(people.exclude(image='').values_list('image', flat=True))
        self.report_count('people have the same image', repeated_images)
        repeated_names = self.repeated(people.values_list('name', flat=True))
        self.report_count('people have the same name', repeated_names)
        repeated_links = self.repeated(people.exclude(links__url=None).values_list('links__url', flat=True))
        self.report_count('people have the same link URL', repeated_links)

        # Validate the uniqueness of email contact detail values.
        repeated_emails = self.repeated(contact_details.filter(type='email').values_list('value', flat=True))
        self.report_count('membership contact details with the same email', repeated_emails)

        # Validate presence of email contact detail.
        # Jurisdictions skipped entirely by the email check.
        jurisdiction_with_no_email = [
            # Javascript-encoded email
            'ocd-jurisdiction/country:ca/csd:1217030/legislature',  # Cape Breton
            # Webform email
            'ocd-jurisdiction/country:ca/csd:1310032/legislature',  # Fredericton
            'ocd-jurisdiction/country:ca/csd:2423027/legislature',  # Québec
            'ocd-jurisdiction/country:ca/csd:2464008/legislature',  # Terrebonne
            'ocd-jurisdiction/country:ca/csd:2466097/legislature',  # Pointe-Claire
            'ocd-jurisdiction/country:ca/csd:3530016/legislature',  # Waterloo
            'ocd-jurisdiction/country:ca/csd:3530035/legislature',  # Woolwich
            'ocd-jurisdiction/country:ca/csd:4706027/legislature',  # Regina
            'ocd-jurisdiction/country:ca/csd:4806016/legislature',  # Calgary
        ]
        # Jurisdictions in which a single membership without email is tolerated.
        leaders_with_no_email = [
            'ocd-jurisdiction/country:ca/cd:3521/legislature',  # Peel
            'ocd-jurisdiction/country:ca/csd:2437067/legislature',  # Trois-Rivières
            'ocd-jurisdiction/country:ca/csd:2456083/legislature',  # Saint-Jean-sur-Richelieu
            'ocd-jurisdiction/country:ca/csd:2494068/legislature',  # Saguenay
            'ocd-jurisdiction/country:ca/csd:3520005/legislature',  # Toronto
            'ocd-jurisdiction/country:ca/csd:3521024/legislature',  # Caledon
            'ocd-jurisdiction/country:ca/csd:3530013/legislature',  # Kitchener
            'ocd-jurisdiction/country:ca/csd:4711066/legislature',  # Saskatoon
            'ocd-jurisdiction/country:ca/csd:4811061/legislature',  # Edmonton
            'ocd-jurisdiction/country:ca/csd:4816037/legislature',  # Wood Buffalo
            'ocd-jurisdiction/country:ca/csd:5909052/legislature',  # Abbotsford
            'ocd-jurisdiction/country:ca/csd:5915004/legislature',  # Surrey
        ]
        checked_ids = jurisdictions.exclude(id__in=jurisdiction_with_no_email).values_list('id', flat=True)
        for jurisdiction_id in checked_ids:
            # One missing email is accepted where the leader is known to have none.
            tolerated = 1 if jurisdiction_id in leaders_with_no_email else 0
            for organization in organizations.filter(jurisdiction_id=jurisdiction_id):
                # It's ridiculous that Django can't do a LEFT OUTER JOIN with a WHERE clause.
                memberships_with_no_email = 0
                for membership in organization.memberships.all():
                    if not membership.contact_details.filter(type='email').count():
                        memberships_with_no_email += 1
                if memberships_with_no_email > tolerated:
                    log.error('%2d memberships have no email in %s' % (memberships_with_no_email, organization.name))
Beispiel #9
0
        def process(report, *, candidates=False):
            """Build the CSV rows for the jurisdiction behind ``report`` and save them.

            Args:
                report: Object whose ``module`` attribute is resolved with
                    ``module_name_to_metadata``. It is deleted if an
                    ``ImportError`` is raised while processing.
                candidates: Keyword-only. When true, only memberships with the
                    ``'candidate'`` role are exported and the CSV key is
                    ``csv/candidates/<slug>.csv``. Otherwise memberships with
                    the ``'member'`` or ``'candidate'`` role are skipped and
                    the key is ``csv/representatives/<slug>.csv``.

            Returns:
                A two-item list ``[rows, offices_count]``. ``rows`` holds the
                sorted rows, each with the metadata's ``name`` inserted as its
                first cell after the CSV was handed to ``save``;
                ``offices_count`` is the largest number of offices returned by
                ``get_offices`` for a single membership. If ``ImportError`` was
                raised, whatever was accumulated up to that point is returned.
            """
            # Last two labels of a link's host name -> social network column.
            social_networks = {
                'facebook.com': 'facebook',
                'fb.com': 'facebook',
                'instagram.com': 'instagram',
                'linkedin.com': 'linkedin',
                'twitter.com': 'twitter',
                'youtube.com': 'youtube',
            }
            gender_codes = {'male': 'M', 'female': 'F'}

            rows = []
            offices_count = 0

            try:
                metadata = module_name_to_metadata(report.module)

                # Exclude party memberships.
                queryset = Membership.objects.filter(
                    organization__jurisdiction_id=metadata['jurisdiction_id'])
                if candidates:
                    queryset = queryset.filter(role='candidate')
                else:
                    queryset = queryset.exclude(role__in=('member',
                                                          'candidate'))

                for membership in queryset.prefetch_related(
                        'contact_details', 'person', 'person__links',
                        'person__sources'):
                    person = membership.person

                    try:
                        party_name = Membership.objects.get(
                            organization__classification='party',
                            role='member',
                            person=person).organization.name
                    except Membership.DoesNotExist:
                        party_name = None

                    # If several links share a network, the last one wins.
                    social = dict.fromkeys(
                        ('facebook', 'instagram', 'linkedin', 'twitter',
                         'youtube'))
                    for link in person.links.all():
                        domain = '.'.join(
                            urlsplit(link.url).netloc.split('.')[-2:])
                        network = social_networks.get(domain)
                        if network:
                            social[network] = link.url

                    gender = gender_codes.get(person.gender)

                    # Everything before the last space is the first name.
                    if ' ' in person.name:
                        first_name, last_name = person.name.rsplit(' ', 1)
                    else:
                        first_name, last_name = None, person.name

                    post_label = membership.post.label
                    email = next(
                        (contact_detail.value for contact_detail in
                         membership.contact_details.all()
                         if contact_detail.type == 'email'), None)

                    # @see https://represent.opennorth.ca/api/#fields
                    sources = list(person.sources.all())
                    row = [
                        remove_suffix_re.sub('', post_label),  # District name
                        membership.role,  # Elected office
                        person.name,  # Name
                        first_name,  # First name
                        last_name,  # Last name
                        gender,  # Gender
                        party_name,  # Party name
                        email,  # Email
                        person.image,  # Photo URL
                        sources[-1].url
                        if len(sources) > 1 else None,  # Source URL
                        get_personal_url(person),  # Website
                        social['facebook'],  # Facebook
                        social['instagram'],  # Instagram
                        social['twitter'],  # Twitter
                        social['linkedin'],  # LinkedIn
                        social['youtube'],  # YouTube
                    ]

                    offices = get_offices(membership)
                    offices_count = max(offices_count, len(offices))

                    for office in offices:
                        for key in ('type', 'postal', 'tel', 'fax'):
                            row.append(office.get(key))

                    # If the person is associated to multiple boundaries.
                    if re.search(r'\AWards\b', post_label):
                        for district_id in re.findall(r'\d+', post_label):
                            ward_row = row[:]
                            ward_row[0] = 'Ward {}'.format(district_id)
                            rows.append(ward_row)
                    else:
                        rows.append(row)

                # Rows mix None with strings, and comparing the two raises
                # TypeError. Pairing each cell with an "is None" flag keeps
                # the order of comparable cells and puts None last.
                rows.sort(key=lambda cells: [(cell is None, cell)
                                             for cell in cells])

                # One group of office columns per office of the membership
                # that has the most offices.
                headers = self.default_headers[:]
                for _ in range(offices_count):
                    headers += self.office_headers

                name = metadata['name']
                slug = self.names[name] if name in self.names else slugify(name)

                buffer = StringIO()
                body = csv.writer(buffer)
                body.writerow(headers)
                body.writerows(rows)
                key = 'csv/{}/{}.csv'.format(
                    'candidates' if candidates else 'representatives', slug)
                save(key, buffer)

                # Added only after saving, so the saved CSV has no name column.
                for row in rows:
                    row.insert(0, name)
            except ImportError:
                report.delete()  # delete reports for old modules

            return [rows, offices_count]
Beispiel #10
0
def represent(request, module_name):
    """Return the people of a scraper module as a JSON array of representatives.

    Args:
        request: The incoming request (not read by this view).
        module_name: Scraper module name, resolved with
            ``module_name_to_metadata``. A name ending in ``_candidates``
            selects memberships with the ``'candidate'`` role; any other name
            selects memberships whose role is neither ``'member'`` nor
            ``'candidate'``.

    Returns:
        An ``HttpResponse`` with content type ``application/json``. Its body
        is a JSON array with one object per selected membership, or one object
        per ward number when the post label starts with "Wards".
    """
    sys.path.append(os.path.abspath('scrapers'))

    metadata = module_name_to_metadata(module_name)

    representatives = []
    gender_codes = {'male': 'M', 'female': 'F'}

    # Exclude party memberships. QuerySet.filter() and .exclude() return new
    # querysets, so the result has to be reassigned for the restriction to apply.
    queryset = Membership.objects.filter(organization__jurisdiction_id=metadata['jurisdiction_id'])

    if module_name.endswith('_candidates'):
        queryset = queryset.filter(role='candidate')
    else:
        queryset = queryset.exclude(role__in=('member', 'candidate'))

    for membership in queryset.prefetch_related('contact_details', 'person', 'person__links', 'person__sources'):
        person = membership.person

        try:
            party_name = Membership.objects.get(organization__classification='party', role='member', person=person).organization.name
        except Membership.DoesNotExist:
            party_name = None

        gender = gender_codes.get(person.gender)

        # Popped before get_extra(person) runs below; presumably so the flag
        # is not repeated inside 'extra' - confirm against get_extra.
        incumbent = person.extras.pop('incumbent', None)

        # @see https://represent.opennorth.ca/api/#fields
        representative = {
            'name': person.name,
            'elected_office': membership.role,
            'party_name': party_name,
            'email': next((contact_detail.value for contact_detail in membership.contact_details.all() if contact_detail.type == 'email'), None),
            'photo_url': person.image or None,
            'personal_url': get_personal_url(person),
            'gender': gender,
            'offices': json.dumps(get_offices(membership)),
            'extra': json.dumps(get_extra(person)),
        }

        # @see https://github.com/opennorth/represent-canada/issues/81
        sources = list(person.sources.all())
        # A person without sources gets neither 'source_url' nor 'url'.
        if sources and len(sources[0].url) <= 200:
            representative['source_url'] = sources[0].url

        if len(sources) > 1:
            representative['url'] = sources[-1].url

        if incumbent:
            representative['incumbent'] = True

        post_label = membership.post.label
        match = re.search(r'^(\S+) (Ward \d+)$', post_label)

        # If the person is part of Peel Regional Council.
        if match:
            parent = Division.objects.get(subtype1='csd', subtype2='', name=match.group(1))
            division = Division.objects.get(subtype1='csd', subid1=parent.subid1, name=match.group(2))
            boundary_set_slug = next((k for k, v in settings.IMAGO_BOUNDARY_MAPPINGS.items() if v['prefix'].startswith(parent.id)), None)
            representative['district_name'] = post_label
            representative['boundary_url'] = '/boundaries/{}/ward-{}/'.format(boundary_set_slug, division.subid2)
            representatives.append(representative)
        # If the person is associated to multiple boundaries.
        elif re.search(r'^Wards\b', post_label):
            for district_id in re.findall(r'\d+', post_label):
                # Copy so that each ward gets its own dictionary.
                representative = representative.copy()
                representative['district_id'] = district_id
                representative['district_name'] = 'Ward %s' % district_id
                representatives.append(representative)
        else:
            division_id = metadata['division_id']
            # The trailing digits of a census (sub)division ID are its code.
            if re.search(r'^ocd-division/country:ca/csd:(\d{7})\Z', division_id):
                geographic_code = division_id[-7:]
            elif re.search(r'^ocd-division/country:ca/cd:(\d{4})\Z', division_id):
                geographic_code = division_id[-4:]
            else:
                geographic_code = None
            post_label = remove_suffix_re.sub('', post_label)
            # If the post label is numeric.
            if re.search(r'^\d+\Z', post_label):
                representative['district_id'] = post_label
            # If the person has a boundary URL.
            elif membership.extras.get('boundary_url'):
                representative['district_name'] = post_label
                representative['boundary_url'] = membership.extras['boundary_url']
            # If the post label is a census subdivision.
            elif post_label == metadata['division_name'] and geographic_code:
                representative['district_name'] = post_label
                if len(geographic_code) == 7:
                    representative['boundary_url'] = '/boundaries/census-subdivisions/%s/' % geographic_code
                elif len(geographic_code) == 4:
                    representative['boundary_url'] = '/boundaries/census-divisions/%s/' % geographic_code
            else:
                representative['district_name'] = post_label
                district_id = re.search(r'^(?:District|Division|Ward) (\d+)\Z', post_label)
                if district_id:
                    representative['district_id'] = district_id.group(1)

            representatives.append(representative)

    return HttpResponse(json.dumps(representatives), content_type='application/json')
Beispiel #11
0
def represent(request, module_name):
    """Return the people of a scraper module as a JSON array of representatives.

    Args:
        request: The incoming request (not read by this view).
        module_name: Scraper module name, resolved with
            ``module_name_to_metadata``. A name ending in ``_candidates``
            selects memberships with the ``'candidate'`` role; any other name
            selects memberships whose role is neither ``'member'`` nor
            ``'candidate'``.

    Returns:
        An ``HttpResponse`` with content type ``application/json``. Its body
        is a JSON array with one object per selected membership, or one object
        per ward number when the post label starts with "Wards".
    """
    sys.path.append(os.path.abspath('scrapers'))

    metadata = module_name_to_metadata(module_name)

    representatives = []
    gender_codes = {'male': 'M', 'female': 'F'}

    # Exclude party memberships. QuerySet.filter() and .exclude() return new
    # querysets, so the result has to be reassigned for the restriction to
    # apply.
    queryset = Membership.objects.filter(
        organization__jurisdiction_id=metadata['jurisdiction_id'])

    if module_name.endswith('_candidates'):
        queryset = queryset.filter(role='candidate')
    else:
        queryset = queryset.exclude(role__in=('member', 'candidate'))

    for membership in queryset.prefetch_related('contact_details', 'person',
                                                'person__links',
                                                'person__sources'):
        person = membership.person

        try:
            party_name = Membership.objects.get(
                organization__classification='party',
                role='member',
                person=person).organization.name
        except Membership.DoesNotExist:
            party_name = None

        gender = gender_codes.get(person.gender)

        # Popped before get_extra(person) runs below; presumably so the flag
        # is not repeated inside 'extra' - confirm against get_extra.
        incumbent = person.extras.pop('incumbent', None)

        email = next((contact_detail.value
                      for contact_detail in membership.contact_details.all()
                      if contact_detail.type == 'email'), None)

        # @see https://represent.opennorth.ca/api/#fields
        representative = {
            'name': person.name,
            'elected_office': membership.role,
            'party_name': party_name,
            'email': email,
            'photo_url': person.image or None,
            'personal_url': get_personal_url(person),
            'gender': gender,
            'offices': json.dumps(get_offices(membership)),
            'extra': json.dumps(get_extra(person)),
        }

        # @see https://github.com/opennorth/represent-canada/issues/81
        sources = list(person.sources.all())
        # A person without sources gets neither 'source_url' nor 'url'.
        if sources and len(sources[0].url) <= 200:
            representative['source_url'] = sources[0].url

        if len(sources) > 1:
            representative['url'] = sources[-1].url

        if incumbent:
            representative['incumbent'] = True

        post_label = membership.post.label
        match = re.search(r'^(\S+) (Ward \d+)$', post_label)

        # If the person is part of Peel Regional Council.
        if match:
            parent = Division.objects.get(subtype1='csd',
                                          subtype2='',
                                          name=match.group(1))
            division = Division.objects.get(subtype1='csd',
                                            subid1=parent.subid1,
                                            name=match.group(2))
            boundary_set_slug = next(
                (k for k, v in settings.IMAGO_BOUNDARY_MAPPINGS.items()
                 if v['prefix'].startswith(parent.id)), None)
            representative['district_name'] = post_label
            representative['boundary_url'] = '/boundaries/{}/ward-{}/'.format(
                boundary_set_slug, division.subid2)
            representatives.append(representative)
        # If the person is associated to multiple boundaries.
        elif re.search(r'^Wards\b', post_label):
            for district_id in re.findall(r'\d+', post_label):
                # Copy so that each ward gets its own dictionary.
                representative = representative.copy()
                representative['district_id'] = district_id
                representative['district_name'] = 'Ward %s' % district_id
                representatives.append(representative)
        else:
            division_id = metadata['division_id']
            # The trailing digits of a census (sub)division ID are its code.
            if re.search(r'^ocd-division/country:ca/csd:(\d{7})\Z',
                         division_id):
                geographic_code = division_id[-7:]
            elif re.search(r'^ocd-division/country:ca/cd:(\d{4})\Z',
                           division_id):
                geographic_code = division_id[-4:]
            else:
                geographic_code = None
            post_label = remove_suffix_re.sub('', post_label)
            # If the post label is numeric.
            if re.search(r'^\d+\Z', post_label):
                representative['district_id'] = post_label
            # If the person has a boundary URL.
            elif membership.extras.get('boundary_url'):
                representative['district_name'] = post_label
                representative['boundary_url'] = membership.extras[
                    'boundary_url']
            # If the post label is a census subdivision.
            elif post_label == metadata['division_name'] and geographic_code:
                representative['district_name'] = post_label
                if len(geographic_code) == 7:
                    representative[
                        'boundary_url'] = '/boundaries/census-subdivisions/%s/' % geographic_code
                elif len(geographic_code) == 4:
                    representative[
                        'boundary_url'] = '/boundaries/census-divisions/%s/' % geographic_code
            else:
                representative['district_name'] = post_label
                district_id = re.search(r'^(?:District|Division|Ward) (\d+)\Z',
                                        post_label)
                if district_id:
                    representative['district_id'] = district_id.group(1)

            representatives.append(representative)

    return HttpResponse(json.dumps(representatives),
                        content_type='application/json')
Beispiel #12
0
    def handle(self, *args, **options):
        """Run the data validations and report what they find.

        When ``options['module']`` is truthy it is resolved with
        ``module_name_to_metadata`` and the checks are limited to jurisdictions
        with that module's division ID. Otherwise every jurisdiction whose
        classification is not ``'executive'`` is checked. Findings are passed
        to ``self.report_value`` / ``self.report_count`` or written to ``log``.
        """
        sys.path.append(os.path.abspath('scrapers'))

        # Names removed from the "no posts" and "no memberships" results.
        empty_organizations = {'Parliament of Canada', 'Senate'}

        module_name = options['module']
        if module_name:
            division_id = module_name_to_metadata(module_name)['division_id']
            jurisdictions = Jurisdiction.objects.filter(division_id=division_id)
        else:
            # Exclude candidate scrapers.
            jurisdictions = Jurisdiction.objects.exclude(classification='executive')

        organizations = Organization.objects.filter(jurisdiction__in=jurisdictions)
        posts = Post.objects.filter(organization__in=organizations)
        people = Person.objects.filter(memberships__organization__in=organizations)
        memberships = Membership.objects.filter(person_id__in=people)
        contact_details = MembershipContactDetail.objects.filter(membership__in=memberships)

        # A person has multiple memberships. The value is the number of
        # repetitions tolerated in that jurisdiction.
        jurisdiction_with_repetition = {
            'ocd-jurisdiction/country:ca/cd:3521/legislature': 4,  # Peel, due to Brampton
            'ocd-jurisdiction/country:ca/csd:3521010/legislature': 4,  # Brampton
        }
        repetition_ids = jurisdiction_with_repetition.keys()

        # Validate the number of organizations per jurisdiction.
        organization_counts = (
            jurisdictions.values('id')
            .annotate(count=Count('organizations'))
            .exclude(count=1)
        )
        # The Parliament of Canada has three organizations.
        parliament = {'count': 3, 'id': 'ocd-jurisdiction/country:ca/legislature'}
        if len(organization_counts) > 1 or (organization_counts and organization_counts[0] != parliament):
            log.error('{} jurisdictions do not have one organization'.format(len(organization_counts)))
            for entry in organization_counts:
                log.info('{} {}'.format(entry['count'], entry['id']))

        # Validate the presence of posts and memberships on organizations.
        names_without_posts = (
            organizations.values('id')
            .exclude(classification__in=('committee', 'party'))
            .annotate(count=Count('posts'))
            .filter(count=0)
            .values_list('name', flat=True)
        )
        self.report_value('non-committee, non-party organizations have no posts', set(names_without_posts) - empty_organizations)
        names_without_memberships = (
            organizations.values('id')
            .exclude(classification='committee')
            .annotate(count=Count('memberships'))
            .filter(count=0)
            .values_list('name', flat=True)
        )
        self.report_value('non-committee organizations have no memberships', set(names_without_memberships) - empty_organizations)

        # Validate the number of memberships per post.
        post_memberships_count = posts.values('id').annotate(count=Count('memberships'))
        vacant_posts = post_memberships_count.filter(count=0).values_list('organization__name', flat=True)
        self.report_count('organizations have posts with no memberships (seats may be vacant)', Counter(vacant_posts))
        shared_posts = post_memberships_count.filter(count__gt=1).values_list('organization__name', flat=True)
        self.report_count('organizations have posts with many memberships', Counter(shared_posts))

        # Validate the presence of posts on memberships.
        postless_memberships = (
            memberships.filter(post_id=None)
            .exclude(organization__classification='party')
            .values_list('organization__name', flat=True)
        )
        self.report_count('non-party organizations have memberships with no posts', Counter(postless_memberships))

        # Validate that people have at most one post-membership.
        non_party_people = people.values('id').exclude(memberships__organization__classification='party')
        names = (
            non_party_people
            .exclude(memberships__organization__jurisdiction_id__in=repetition_ids)
            .annotate(count=Count('memberships'))
            .exclude(count=1)
            .values_list('name', flat=True)
        )
        self.report_value('people have many non-party memberships', names)
        for jurisdiction_id, threshold in jurisdiction_with_repetition.items():
            names = (
                non_party_people
                .filter(memberships__organization__jurisdiction_id=jurisdiction_id)
                .annotate(count=Count('memberships'))
                .exclude(count__lte=threshold)
                .values_list('name', flat=True)
            )
            self.report_value('people have many non-party memberships in {}'.format(jurisdiction_id), names)

        # Validate that people have at most one party-membership.
        names = (
            people.values('id')
            .filter(memberships__organization__classification='party')
            .annotate(count=Count('memberships'))
            .exclude(count=1)
            .values_list('name', flat=True)
        )
        self.report_value('people have many party memberships', names)

        # Validate the uniqueness of names and images.
        people_without_repetition = people.exclude(memberships__organization__jurisdiction_id__in=repetition_ids)
        repeated_names = self.repeated(people_without_repetition.values_list('name', flat=True))
        self.report_count('names are repeated across people', repeated_names)
        repeated_images = self.repeated(people_without_repetition.exclude(image='').values_list('image', flat=True))
        self.report_count('images are repeated across people', repeated_images)
        for jurisdiction_id, threshold in jurisdiction_with_repetition.items():
            people_with_repetition = people.filter(memberships__organization__jurisdiction_id=jurisdiction_id)
            repeated_names = self.repeated(people_with_repetition.values_list('name', flat=True), threshold=threshold)
            self.report_count('names are repeated across people in {}'.format(jurisdiction_id), repeated_names)
            repeated_images = self.repeated(people_with_repetition.exclude(image='').values_list('image', flat=True), threshold=threshold)
            self.report_count('images are repeated across people in {}'.format(jurisdiction_id), repeated_images)

        # Validate the uniqueness of link URLs.
        repeated_links = self.repeated(people.exclude(links__url=None).values_list('links__url', flat=True))
        self.report_count('link URLs are repeated across people', repeated_links)

        # Validate the uniqueness of email contact detail values.
        repeated_emails = self.repeated(
            contact_details.filter(type='email')
            .exclude(membership__organization__jurisdiction_id__in=repetition_ids)
            .values_list('value', flat=True)
        )
        self.report_count('emails are repeated across membership contact details', repeated_emails)
        for jurisdiction_id, threshold in jurisdiction_with_repetition.items():
            repeated_emails = self.repeated(
                contact_details.filter(type='email')
                .filter(membership__organization__jurisdiction_id=jurisdiction_id)
                .values_list('value', flat=True),
                threshold=threshold,
            )
            self.report_count('emails are repeated across membership contact details in {}'.format(jurisdiction_id), repeated_emails)

        # Validate presence of email contact detail.
        # Jurisdictions skipped entirely by the email check.
        jurisdiction_with_no_email = [
            # Javascript-encoded email
            'ocd-jurisdiction/country:ca/csd:1217030/legislature',  # Cape Breton
            # Webform email
            'ocd-jurisdiction/country:ca/csd:2423027/legislature',  # Québec
            'ocd-jurisdiction/country:ca/csd:2464008/legislature',  # Terrebonne
            'ocd-jurisdiction/country:ca/csd:3524009/legislature',  # Milton
            'ocd-jurisdiction/country:ca/csd:3530016/legislature',  # Waterloo
            'ocd-jurisdiction/country:ca/csd:3530027/legislature',  # Wellesley
            'ocd-jurisdiction/country:ca/csd:3530035/legislature',  # Woolwich
            'ocd-jurisdiction/country:ca/csd:4706027/legislature',  # Regina
            'ocd-jurisdiction/country:ca/csd:4711066/legislature',  # Saskatoon
            'ocd-jurisdiction/country:ca/csd:4806016/legislature',  # Calgary
            'ocd-jurisdiction/country:ca/csd:5909052/legislature',  # Abbotsford
        ]
        # Jurisdictions in which a single membership without email is tolerated.
        leaders_with_no_email = {
            'ocd-jurisdiction/country:ca/cd:3521/legislature',  # Peel
            'ocd-jurisdiction/country:ca/csd:2437067/legislature',  # Trois-Rivières
            'ocd-jurisdiction/country:ca/csd:2456083/legislature',  # Saint-Jean-sur-Richelieu
            'ocd-jurisdiction/country:ca/csd:2494068/legislature',  # Saguenay
            'ocd-jurisdiction/country:ca/csd:3520005/legislature',  # Toronto
            'ocd-jurisdiction/country:ca/csd:3521024/legislature',  # Caledon
            'ocd-jurisdiction/country:ca/csd:3530013/legislature',  # Kitchener
            'ocd-jurisdiction/country:ca/csd:4811061/legislature',  # Edmonton
            'ocd-jurisdiction/country:ca/csd:4816037/legislature',  # Wood Buffalo
            'ocd-jurisdiction/country:ca/csd:5909052/legislature',  # Abbotsford
            'ocd-jurisdiction/country:ca/csd:5915004/legislature',  # Surrey
        }
        checked_ids = jurisdictions.exclude(id__in=jurisdiction_with_no_email).values_list('id', flat=True)
        for jurisdiction_id in checked_ids:
            # One missing email is accepted where the leader is known to have none.
            tolerated = 1 if jurisdiction_id in leaders_with_no_email else 0
            for organization in organizations.filter(jurisdiction_id=jurisdiction_id):
                # It's ridiculous that Django can't do a LEFT OUTER JOIN with a WHERE clause.
                memberships_with_no_email = 0
                for membership in organization.memberships.all():
                    if not membership.contact_details.filter(type='email').count():
                        memberships_with_no_email += 1
                if memberships_with_no_email > tolerated:
                    log.error('{:2} memberships have no email in {}'.format(memberships_with_no_email, organization.name))
Beispiel #13
0
    def handle(self, *args, **options):
        def save(key, body):
            """Write ``body`` to ``key`` in the enclosing ``bucket`` with the ``public-read`` ACL."""
            s3_key = Key(bucket)
            s3_key.key = key
            s3_key.set_contents_from_string(body)
            s3_key.set_acl('public-read')

        sys.path.append(os.path.abspath('scrapers'))

        bucket = S3Connection().get_bucket('represent.opennorth.ca')

        names = {
            'Parliament of Canada': 'house-of-commons',
            'Legislative Assembly of Alberta': 'alberta-legislature',
            'Legislative Assembly of British Columbia': 'bc-legislature',
            'Legislative Assembly of Manitoba': 'manitoba-legislature',
            'Legislative Assembly of New Brunswick': 'new-brunswick-legislature',
            'Newfoundland and Labrador House of Assembly': 'newfoundland-labrador-legislature',
            'Nova Scotia House of Assembly': 'nova-scotia-legislature',
            'Legislative Assembly of Ontario': 'ontario-legislature',
            'Legislative Assembly of Prince Edward Island': 'pei-legislature',
            'Assemblée nationale du Québec': 'quebec-assemblee-nationale',
            'Legislative Assembly of Saskatchewan': 'saskatchewan-legislature',
        }

        default_headers = [
            'District name',
            'Primary role',
            'Name',  # not in CSV schema
            'First name',
            'Last name',
            'Gender',
            'Party name',
            'Email',
            'Photo URL',
            'Source URL',
            'Website',
            'Facebook',
            'Instagram',
            'Twitter',
            'LinkedIn',
            'YouTube',
        ]
        office_headers = [
            'Office type',  # not in CSV schema
            'Address',  # not in CSV schema
            'Phone',
            'Fax',
        ]

        all_rows = []
        max_offices_count = 0

        reports = Report.objects.filter(exception='').exclude(module__endswith='_candidates').exclude(module__endswith='_municipalities').order_by('module')
        for report in reports:
            try:
                metadata = module_name_to_metadata(report.module)

                rows = []
                offices_count = 0

                # Exclude party memberships.
                queryset = Membership.objects.filter(organization__jurisdiction_id=metadata['jurisdiction_id']).exclude(role__in=('member', 'candidate'))
                for membership in queryset.prefetch_related('contact_details', 'person', 'person__links', 'person__sources'):
                    person = membership.person

                    try:
                        party_name = Membership.objects.get(organization__classification='party', role='member', person=person).organization.name
                    except Membership.DoesNotExist:
                        party_name = None

                    facebook = None
                    instagram = None
                    linkedin = None
                    twitter = None
                    youtube = None
                    for link in person.links.all():
                        domain = '.'.join(urlsplit(link.url).netloc.split('.')[-2:])
                        if domain in ('facebook.com', 'fb.com'):
                            facebook = link.url
                        elif domain == 'instagram.com':
                            instagram = link.url
                        elif domain == 'linkedin.com':
                            linkedin = link.url
                        elif domain == 'twitter.com':
                            twitter = link.url
                        elif domain == 'youtube.com':
                            youtube = link.url

                    if person.gender == 'male':
                        gender = 'M'
                    elif person.gender == 'female':
                        gender = 'F'
                    else:
                        gender = None

                    if ' ' in person.name:
                        first_name, last_name = person.name.rsplit(' ', 1)
                    else:
                        first_name, last_name = None, person.name

                    # @see https://represent.opennorth.ca/api/#fields
                    sources = list(person.sources.all())
                    row = [
                        remove_suffix_re.sub('', membership.post.label),  # District name
                        membership.role,  # Elected office
                        person.name,  # Name
                        first_name,  # First name
                        last_name,  # Last name
                        gender,  # Gender
                        party_name,  # Party name
                        next((contact_detail.value for contact_detail in membership.contact_details.all() if contact_detail.type == 'email'), None),  # Email
                        person.image,  # Photo URL
                        sources[-1].url if len(sources) > 1 else None,  # Source URL
                        get_personal_url(person),  # Website
                        facebook,  # Facebook
                        instagram,  # Instagram
                        twitter,  # Twitter
                        linkedin,  # LinkedIn
                        youtube,  # YouTube
                    ]

                    offices = get_offices(membership)
                    if len(offices) > offices_count:
                        offices_count = len(offices)

                    for office in offices:
                        for key in ('type', 'postal', 'tel', 'fax'):
                            row.append(office.get(key))

                    # If the person is associated to multiple boundaries.
                    if re.search(r'\AWards\b', membership.post.label):
                        for district_id in re.findall(r'\d+', membership.post.label):
                            row = row[:]
                            row[0] = 'Ward %s' % district_id
                            rows.append(row)
                    else:
                        rows.append(row)

                rows.sort()

                headers = default_headers[:]
                for _ in range(offices_count):
                    headers += office_headers

                name = metadata['name']
                if name in names:
                    slug = names[name]
                else:
                    slug = slugify(name)

                io = StringIO()
                body = csv.writer(io)
                body.writerow(headers)
                body.writerows(rows)
                save('csv/%s.csv' % slug, codecs.encode(io.getvalue(), 'windows-1252'))

                if offices_count > max_offices_count:
                    max_offices_count = offices_count

                for row in rows:
                    row.insert(0, name)
                    all_rows.append(row)
            except ImportError:
                report.delete()  # delete reports for old modules

        headers = ['Organization'] + default_headers
        for _ in range(max_offices_count):
            headers += office_headers

        io = StringIO()
        body = csv.writer(io)
        body.writerow(headers)
        body.writerows(all_rows)
        save('csv/complete.csv', codecs.encode(io.getvalue(), 'windows-1252'))
Beispiel #14
0
    def handle(self, *args, **options):
        """Run consistency checks over the scraped data and report findings.

        When a positional argument is given, it is treated as a scraper
        module name and the checks are scoped to that module's division;
        otherwise the checks run over all records. Findings go through
        ``log`` and the command's ``report_value`` / ``report_count`` helpers.
        """
        sys.path.append(os.path.abspath('scrapers'))

        # Organizations that are allowed to have no posts or memberships.
        empty_organizations = {'Parliament of Canada', 'Senate'}

        if not args:
            jurisdictions = Jurisdiction.objects
            organizations = Organization.objects
            posts = Post.objects
            people = Person.objects
            memberships = Membership.objects
            contact_details = MembershipContactDetail.objects
        else:
            division_id = module_name_to_metadata(args[0])['division_id']
            jurisdictions = Jurisdiction.objects.filter(division_id=division_id)
            organizations = Organization.objects.filter(jurisdiction__in=jurisdictions)
            posts = Post.objects.filter(organization__in=organizations)
            people = Person.objects.filter(memberships__organization__in=organizations)
            memberships = Membership.objects.filter(person__id__in=people)
            contact_details = MembershipContactDetail.objects.filter(membership__in=memberships)

        post_memberships_count = posts.values('id').annotate(count=Count('memberships'))

        # Validate the number of organizations per jurisdiction.
        # The Parliament of Canada has three organizations.
        parliament = {
            'count': 3,
            'id': 'ocd-jurisdiction/country:ca/legislature',
        }
        offenders = jurisdictions.values('id').annotate(count=Count('organizations')).exclude(count=1)
        if len(offenders) > 1 or offenders and offenders[0] != parliament:
            log.error('%d jurisdictions do not have one organization' % len(offenders))
            for offender in offenders:
                log.info('%d %s' % (offender['count'], offender['id']))

        # Validate the presence of posts and memberships on organizations.
        postless = organizations.values('id').exclude(classification='party').annotate(count=Count('posts')).filter(count=0)
        postless_names = set(postless.values_list('name', flat=True))
        self.report_value('non-party organizations have no posts', postless_names - empty_organizations)

        memberless = organizations.values('id').annotate(count=Count('memberships')).filter(count=0)
        memberless_names = set(memberless.values_list('name', flat=True))
        self.report_value('organizations have no memberships', memberless_names - empty_organizations)

        # Validate the number of memberships per post.
        vacant_posts = post_memberships_count.filter(count=0)
        self.report_count(
            'organizations have posts with no memberships (seats may be vacant)',
            Counter(vacant_posts.values_list('organization__name', flat=True)))

        crowded_posts = post_memberships_count.filter(count__gt=1)
        self.report_count(
            'organizations have posts with many memberships',
            Counter(crowded_posts.values_list('organization__name', flat=True)))

        # Validate the presence of posts on memberships.
        unposted = memberships.filter(post_id=None).exclude(organization__classification='party')
        self.report_count(
            'non-party organizations have memberships with no posts',
            Counter(unposted.values_list('organization__name', flat=True)))

        # Validate that people have at most one post-membership and one party-membership.
        def names_without_exactly_one_membership(scoped_people):
            # Names of people whose membership count differs from one.
            counted = scoped_people.annotate(count=Count('memberships'))
            return counted.exclude(count=1).values_list('name', flat=True)

        non_party_people = people.values('id').exclude(memberships__organization__classification='party')
        self.report_value(
            'people have many non-party memberships',
            names_without_exactly_one_membership(non_party_people))

        party_people = people.values('id').filter(memberships__organization__classification='party')
        self.report_value(
            'people have many party memberships',
            names_without_exactly_one_membership(party_people))

        # Validate the uniqueness of names, images and link URLs.
        images = people.exclude(image='').values_list('image', flat=True)
        self.report_count('people have the same image', self.repeated(images))

        person_names = people.values_list('name', flat=True)
        self.report_count('people have the same name', self.repeated(person_names))

        link_urls = people.exclude(links__url=None).values_list('links__url', flat=True)
        self.report_count('people have the same link URL', self.repeated(link_urls))

        # Validate the uniqueness of email contact detail values.
        emails = contact_details.filter(type='email').values_list('value', flat=True)
        self.report_count('membership contact details with the same email', self.repeated(emails))

        # Validate presence of email contact detail.
        # Jurisdictions skipped entirely by the email check.
        jurisdiction_with_no_email = [
            # Javascript-encoded email
            'ocd-jurisdiction/country:ca/csd:1217030/legislature',  # Cape Breton
            # Webform email
            'ocd-jurisdiction/country:ca/csd:1310032/legislature',  # Fredericton
            'ocd-jurisdiction/country:ca/csd:2423027/legislature',  # Québec
            'ocd-jurisdiction/country:ca/csd:2464008/legislature',  # Terrebonne
            'ocd-jurisdiction/country:ca/csd:2466097/legislature',  # Pointe-Claire
            'ocd-jurisdiction/country:ca/csd:3530016/legislature',  # Waterloo
            'ocd-jurisdiction/country:ca/csd:3530035/legislature',  # Woolwich
            'ocd-jurisdiction/country:ca/csd:4706027/legislature',  # Regina
            'ocd-jurisdiction/country:ca/csd:4806016/legislature',  # Calgary
        ]
        # Jurisdictions where a single membership without email is tolerated.
        leaders_with_no_email = [
            'ocd-jurisdiction/country:ca/cd:3521/legislature',  # Peel
            'ocd-jurisdiction/country:ca/csd:2437067/legislature',  # Trois-Rivières
            'ocd-jurisdiction/country:ca/csd:2456083/legislature',  # Saint-Jean-sur-Richelieu
            'ocd-jurisdiction/country:ca/csd:2494068/legislature',  # Saguenay
            'ocd-jurisdiction/country:ca/csd:3520005/legislature',  # Toronto
            'ocd-jurisdiction/country:ca/csd:3521024/legislature',  # Caledon
            'ocd-jurisdiction/country:ca/csd:3530013/legislature',  # Kitchener
            'ocd-jurisdiction/country:ca/csd:4711066/legislature',  # Saskatoon
            'ocd-jurisdiction/country:ca/csd:4811061/legislature',  # Edmonton
            'ocd-jurisdiction/country:ca/csd:4816037/legislature',  # Wood Buffalo
            'ocd-jurisdiction/country:ca/csd:5909052/legislature',  # Abbotsford
            'ocd-jurisdiction/country:ca/csd:5915004/legislature',  # Surrey
        ]

        checked_jurisdictions = jurisdictions.exclude(id__in=jurisdiction_with_no_email)
        for jurisdiction_id in checked_jurisdictions.values_list('id', flat=True):
            for organization in organizations.filter(jurisdiction_id=jurisdiction_id):
                # It's ridiculous that Django can't do a LEFT OUTER JOIN with a WHERE clause.
                missing = sum(
                    1 for membership in organization.memberships.all()
                    if not membership.contact_details.filter(type='email').count())

                if not missing:
                    continue
                # A lone missing email is tolerated in the listed jurisdictions.
                if missing == 1 and jurisdiction_id in leaders_with_no_email:
                    continue
                log.error('%2d memberships have no email in %s' % (missing, organization.name))
Beispiel #15
0
def represent(request, module_name):
    """Return a scraper module's memberships as a JSON list of representatives.

    When ``module_name`` ends with ``_candidates`` only memberships with the
    role ``candidate`` are included; otherwise memberships with the role
    ``member`` (party memberships) or ``candidate`` are excluded. Each
    remaining membership yields one dict, or one dict per ward number when
    its post label starts with ``Wards``.

    :param request: the incoming request (not read by this view)
    :param module_name: scraper module name, resolved through
        ``module_name_to_metadata``
    :returns: an ``HttpResponse`` with the JSON-encoded list and the content
        type ``application/json``
    """
    sys.path.append(os.path.abspath('scrapers'))

    metadata = module_name_to_metadata(module_name)

    representatives = []

    queryset = Membership.objects.filter(
        organization__jurisdiction_id=metadata['jurisdiction_id'])

    # QuerySet methods return a new queryset instead of narrowing the
    # receiver, so the result must be assigned back for the restriction to
    # take effect.
    if module_name.endswith('_candidates'):
        # Include only candidates.
        queryset = queryset.filter(role='candidate')
    else:
        # Exclude candidates and party memberships.
        queryset = queryset.exclude(role__in=('member', 'candidate'))

    for membership in queryset.prefetch_related('contact_details', 'person',
                                                'person__links',
                                                'person__sources', 'post'):
        person = membership.person

        # Not sure why this is necessary.
        if not isinstance(membership.extras, dict):
            membership.extras = json.loads(membership.extras)
        if not isinstance(person.extras, dict):
            person.extras = json.loads(person.extras)

        try:
            party_name = Membership.objects.select_related('organization').get(
                organization__classification='party',
                role='member',
                person=person).organization.name
        except Membership.DoesNotExist:
            party_name = None

        if person.gender == 'male':
            gender = 'M'
        elif person.gender == 'female':
            gender = 'F'
        else:
            gender = None

        # Candidates only. Popped before get_extra(person) is called below,
        # so the key is removed from person.extras by then.
        incumbent = person.extras.pop('incumbent', None)

        # @see https://represent.opennorth.ca/api/#fields
        representative = {
            'name':
            person.name,
            'elected_office':
            membership.role,
            'party_name':
            party_name,
            'email':
            next((contact_detail.value
                  for contact_detail in membership.contact_details.all()
                  if contact_detail.type == 'email'), None),
            'photo_url':
            person.image or None,
            'personal_url':
            get_personal_url(person),
            'gender':
            gender,
            'offices':
            json.dumps(get_offices(membership)),
            'extra':
            json.dumps(get_extra(person)),
        }

        sources = list(person.sources.all())

        # The first URL ought to be the most generic source.
        representative['source_url'] = sources[0].url

        if len(sources) > 1:
            # The last URL ought to be the most specific source.
            representative['url'] = sources[-1].url

        if incumbent:
            representative['incumbent'] = True

        match = re.search(r'^(\S+) (Ward \d+)$', membership.post.label)

        # If the person is part of Peel Regional Council.
        if match:
            parent = Division.objects.get(subtype1='csd',
                                          subtype2='',
                                          name=match.group(1))
            division = Division.objects.get(subtype1='csd',
                                            subid1=parent.subid1,
                                            name=match.group(2))
            boundary_set_slug = '{}-wards'.format(parent.name.lower())
            representative['district_name'] = membership.post.label
            representative['boundary_url'] = '/boundaries/{}/ward-{}/'.format(
                boundary_set_slug, division.subid2)
            representatives.append(representative)

        # If the person is associated to multiple boundaries.
        elif re.search(r'^Wards\b', membership.post.label):
            for district_id in re.findall(r'\d+', membership.post.label):
                # Copy so that each ward gets its own dict.
                representative = representative.copy()
                representative['district_id'] = district_id
                representative['district_name'] = 'Ward {}'.format(district_id)
                representatives.append(representative)

        else:
            division_id = metadata['division_id']

            # Extract the 7-digit (census subdivision) or 4-digit (census
            # division) code that ends the division ID, if any.
            code_match = (
                re.search(r'^ocd-division/country:ca/csd:(\d{7})\Z',
                          division_id)
                or re.search(r'^ocd-division/country:ca/cd:(\d{4})\Z',
                             division_id))
            geographic_code = code_match.group(1) if code_match else None

            post_label = remove_suffix_re.sub('', membership.post.label)

            # If the post label is numeric.
            if re.search(r'^\d+\Z', post_label):
                representative['district_id'] = post_label

            # If the person has a boundary URL.
            elif 'boundary_url' in membership.extras:
                representative['district_name'] = post_label
                representative['boundary_url'] = membership.extras[
                    'boundary_url']

            # If the post label is a Census geographic name.
            elif post_label == metadata['division_name'] and geographic_code:
                representative['district_name'] = post_label
                if len(geographic_code) == 7:
                    representative[
                        'boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(
                            geographic_code)
                elif len(geographic_code) == 4:
                    representative[
                        'boundary_url'] = '/boundaries/census-divisions/{}/'.format(
                            geographic_code)

            else:
                representative['district_name'] = post_label
                district_id = re.search(r'^(?:District|Division|Ward) (\d+)\Z',
                                        post_label)
                if district_id:
                    representative['district_id'] = district_id.group(1)

            representatives.append(representative)

    return HttpResponse(json.dumps(representatives),
                        content_type='application/json')