def test_any_constituency_csv(self):
    """Check the ballot CSV export: HTTP status, row count and first row."""
    self.create_memberships(self.ballot, self.parties)
    ballot_path = self.ballot.get_absolute_url().rstrip("/")
    response = self.app.get("{}.csv".format(ballot_path))
    self.assertEqual(response.status_code, 200)

    row_dicts = list(BufferDictReader(response.content))
    self.assertEqual(9, len(row_dicts))

    # The first CSV row is compared with the membership that sorts first
    # by person primary key.
    membership = self.ballot.membership_set.order_by("person__pk").first()

    # Columns that are expected to be present but empty.
    blank_columns = (
        "blog_url",
        "birth_date",
        "elected",
        "email",
        "facebook_page_url",
        "facebook_personal_url",
        "favourite_biscuits",
        "gender",
        "gss_code",
        "homepage_url",
        "honorific_prefix",
        "honorific_suffix",
        "image_copyright",
        "image_uploading_user",
        "image_uploading_user_notes",
        "image_url",
        "instagram_url",
        "linkedin_url",
        "mapit_url",
        "old_person_ids",
        "parlparse_id",
        "party_list_position",
        "party_ppc_page_url",
        "proxy_image_url_template",
        "theyworkforyou_url",
        "twitter_username",
        "twitter_user_id",
        "wikipedia_url",
        "wikidata_id",
        "youtube_profile",
    )
    expected_row = dict.fromkeys(blank_columns, "")
    # Columns that carry a value derived from the ballot or the membership.
    expected_row.update(
        {
            "cancelled_poll": "False",
            "election": self.ballot.election.slug,
            "election_current": "True",
            "election_date": self.ballot.election.election_date.isoformat(),
            "id": str(membership.person.pk),
            "name": membership.person.name,
            "party_ec_id": membership.party.ec_id,
            "party_id": membership.party.legacy_slug,
            "party_lists_in_use": "False",
            "party_name": membership.party.name,
            "post_id": self.ballot.post.slug,
            "post_label": self.ballot.post.label,
        }
    )

    # Show the complete diff if the dictionaries differ.
    self.maxDiff = None
    self.assertDictEqual(dict(row_dicts[0]), expected_row)
def test_any_constituency_csv(self):
    """Check the Dulwich post CSV export: status, row count and second row."""
    post_path = self.dulwich_post_pee.get_absolute_url().rstrip("/")
    response = self.app.get("{}.csv".format(post_path))
    self.assertEqual(response.status_code, 200)

    row_dicts = list(BufferDictReader(response.content))
    self.assertEqual(2, len(row_dicts))

    # Columns that are expected to be present but empty.
    blank_columns = (
        "birth_date",
        "elected",
        "email",
        "facebook_page_url",
        "facebook_personal_url",
        "favourite_biscuits",
        "gender",
        "gss_code",
        "homepage_url",
        "honorific_prefix",
        "honorific_suffix",
        "image_copyright",
        "image_uploading_user",
        "image_uploading_user_notes",
        "image_url",
        "linkedin_url",
        "mapit_url",
        "old_person_ids",
        "parlparse_id",
        "party_list_position",
        "party_ppc_page_url",
        "proxy_image_url_template",
        "theyworkforyou_url",
        "twitter_username",
        "twitter_user_id",
        "wikipedia_url",
        "wikidata_url",
    )
    expected_row = dict.fromkeys(blank_columns, "")
    # Columns that carry a value for this candidate.
    expected_row.update(
        {
            "cancelled_poll": "False",
            "election": "parl.2015-05-07",
            "election_current": "True",
            "election_date": text_type(date_in_near_future),
            "id": "2009",
            "name": "Tessa Jowell",
            "party_ec_id": "PP53",
            "party_id": "party:53",
            "party_lists_in_use": "False",
            "party_name": "Labour Party",
            "post_id": "65808",
            "post_label": "Dulwich and West Norwood",
        }
    )

    # The assertion targets the second of the two rows.
    self.assertDictEqual(dict(row_dicts[1]), expected_row)
def handle(self, *args, **options):
    """Download the CSV at the given URL and process each usable row.

    Exactly one positional argument (the CSV URL) is expected; unpacking
    raises ``ValueError`` otherwise.  Every value in a row is passed through
    ``strip``; rows whose 'Election ID' or 'GSS Code' is then empty are
    skipped, and the remaining rows are handed to ``self.process_line``.
    """
    (csv_url,) = args
    response = requests.get(csv_url)
    # Force the decoding used for ``response.text``.
    response.encoding = 'utf-8'

    for raw_row in BufferDictReader(response.text):
        cleaned_line = {key: strip(value) for key, value in raw_row.items()}
        # Both identifying columns must be non-empty for the row to be used.
        if cleaned_line['Election ID'] and cleaned_line['GSS Code']:
            self.process_line(cleaned_line)
def test_any_constituency_csv(self):
    """Check that the Dulwich and West Norwood CSV holds one expected row."""
    csv_path = '/election/2015/post/65808/dulwich-and-west-norwood.csv'
    response = self.app.get(csv_path)

    row_dicts = list(BufferDictReader(response.content))
    self.assertEqual(1, len(row_dicts))

    # Columns that are expected to be present but empty.
    blank_columns = (
        'birth_date',
        'elected',
        'email',
        'facebook_page_url',
        'facebook_personal_url',
        'favourite_biscuits',
        'gender',
        'gss_code',
        'homepage_url',
        'honorific_prefix',
        'honorific_suffix',
        'image_copyright',
        'image_uploading_user',
        'image_uploading_user_notes',
        'image_url',
        'linkedin_url',
        'mapit_url',
        'old_person_ids',
        'parlparse_id',
        'party_list_position',
        'party_ppc_page_url',
        'proxy_image_url_template',
        'theyworkforyou_url',
        'twitter_username',
        'twitter_user_id',
        'wikipedia_url',
    )
    expected_row = dict.fromkeys(blank_columns, '')
    # Columns that carry a value for this candidate.
    expected_row.update({
        'election': '2015',
        'election_current': 'True',
        'election_date': text_type(date_in_near_future),
        'id': '2009',
        'name': 'Tessa Jowell',
        'party_ec_id': 'PP53',
        'party_id': 'party:53',
        'party_lists_in_use': 'False',
        'party_name': 'Labour Party',
        'post_id': '65808',
        'post_label': 'Dulwich and West Norwood',
    })

    self.assertEqual(row_dicts[0], expected_row)
def handle(self, *args, **options):
    """Import Statement of Persons Nominated documents listed in a CSV.

    The single positional argument is the URL of a CSV file.  For each row
    the post is looked up by label (falling back to a lookup via the area
    name), the linked document is downloaded, and an ``OfficialDocument``
    of type ``NOMINATION_PAPER`` is created for it.

    Options read:
        election: optional election slug; when set it is used for every
            row instead of the row's 'Election' column.
        delete_existing: when true, existing nomination papers for a post
            are deleted before importing; otherwise such posts are skipped.

    Raises:
        ValueError: if ``args`` does not hold exactly one item.
        CommandError: if the election slug or a row's election name matches
            no election, if the 'Election' column is needed but absent, or
            if the post found is not part of the row's election.
    """
    csv_url, = args

    override_election = None
    override_election_slug = options['election']
    if override_election_slug:
        try:
            override_election = Election.objects.get(
                slug=override_election_slug)
        except Election.DoesNotExist:
            msg = 'No election with slug {0} found'
            raise CommandError(msg.format(override_election_slug))

    # Cache of election name -> Election so each name is queried only once.
    election_name_to_election = {}

    mime_type_magic = magic.Magic(mime=True)
    storage = FileSystemStorage()

    r = requests.get(csv_url)
    r.encoding = 'utf-8'
    reader = BufferDictReader(r.text)
    for row in reader:
        post_or_area_header = get_column_header(
            POST_OR_AREA_COLUMN_HEADERS_TO_TRY, row)
        name = row[post_or_area_header]
        if not name:
            continue
        name = name.strip()

        # If there was no election specified, try to find it from
        # the 'Election' column (which has the election name):
        if override_election_slug:
            election = override_election
        else:
            if 'Election' not in row:
                raise CommandError(
                    "There is no election name in the 'Election' column, so you must supply an election slug with --election"
                )
            election_name = row['Election']
            election = election_name_to_election.get(election_name)
            if election is None:
                # Report an unknown election name the same way as an
                # unknown slug, rather than with a raw DoesNotExist.
                try:
                    election = Election.objects.get(name=election_name)
                except Election.DoesNotExist:
                    msg = 'No election with name {0} found'
                    raise CommandError(msg.format(election_name))
                election_name_to_election[election_name] = election

        try:
            post = Post.objects.get(
                label=name,
                extra__elections=election,
            )
        except Post.DoesNotExist:
            msg = "Failed to find the post {0}, guessing it might be the area name instead"
            print(msg.format(name))
            # If the post name isn't there, try getting it from
            # the area:
            try:
                area = Area.objects.get(name=name)
            except Area.DoesNotExist:
                print("Failed to find area for {0}".format(name))
                continue
            try:
                post = Post.objects.get(area=area)
            except Post.DoesNotExist:
                print("Failed to find post for {0}".format(name))
                continue

        # Check that the post is actually valid for this election:
        if election not in post.extra.elections.all():
            msg = "The post {post} wasn't in the election {election}"
            raise CommandError(
                msg.format(post=post.label, election=election.name))

        document_url_column = get_column_header(
            PDF_COLUMN_HEADERS_TO_TRY, row)
        document_url = row[document_url_column]
        if not document_url:
            print("No URL for {0}".format(name))
            continue

        existing_documents = OfficialDocument.objects.filter(
            document_type=OfficialDocument.NOMINATION_PAPER,
            post_id=post,
        )
        if existing_documents.exists():
            if options['delete_existing']:
                print("Removing existing documents")
                existing_documents.delete()
            else:
                print("Skipping {0} since it already had documents".format(
                    name))
                continue

        try:
            downloaded_filename = download_file_cached(document_url)
        except requests.exceptions.ConnectionError:
            print("Connection failed for {0}".format(name))
            print("The URL was:", document_url)
            continue
        except requests.exceptions.MissingSchema:
            # This is probably someone putting notes in the URL
            # column, so ignore:
            print("Probably not a document URL for {0}: {1}".format(
                name, document_url))
            continue

        mime_type = mime_type_magic.from_file(downloaded_filename)
        if mime_type not in allowed_mime_types:
            print("Ignoring unknown MIME type {0} for {1}".format(
                mime_type,
                name,
            ))
            continue

        # guess_extension() returns None for types it does not know; without
        # this guard the stored filename would end in the literal "None".
        extension = mimetypes.guess_extension(mime_type)
        if not extension:
            print("No file extension known for MIME type {0}, skipping {1}".format(
                mime_type,
                name,
            ))
            continue

        filename = "official_documents/{post_id}/statement-of-persons-nominated{extension}".format(
            post_id=post.extra.slug,
            extension=extension,
        )
        with open(downloaded_filename, 'rb') as f:
            storage_filename = storage.save(filename, f)

        OfficialDocument.objects.create(
            document_type=OfficialDocument.NOMINATION_PAPER,
            uploaded_file=storage_filename,
            election=election,
            post=post,
            source_url=document_url)
        message = "Successfully added the Statement of Persons Nominated for {0}"
        print(message.format(name))
def handle(self, *args, **options):
    """Import Statement of Persons Nominated documents listed in a CSV.

    The CSV is fetched from ``options["url"]``.  Each row names a ballot
    (``ballot_paper_id``) and a document URL (``Link to PDF``).  The document
    is downloaded, stored, recorded as an ``OfficialDocument`` of type
    ``NOMINATION_PAPER`` and queued for table extraction.

    When the downloaded file is not of an allowed MIME type, the URL is
    fetched as a page and, if it contains exactly one link ending in
    ``.pdf``, that link is tried instead.

    Options read:
        url: URL of the CSV file.
        delete_existing: when true, existing nomination papers for a ballot
            are deleted before importing; otherwise such ballots are skipped.

    Raises:
        ValueError: if no file extension can be determined for an accepted
            document.
    """

    def as_text(value):
        # Normalise a from_file() result to text so that membership tests
        # against allowed_mime_types behave the same at every call site.
        # NOTE(review): whether python-magic returns bytes or text appears
        # to depend on the installed version - confirm.
        if isinstance(value, bytes):
            return value.decode("utf8")
        return value

    csv_url = options["url"]

    mime_type_magic = magic.Magic(mime=True)
    storage = DefaultStorage()

    r = requests.get(csv_url)
    r.encoding = "utf-8"
    reader = BufferDictReader(r.text)
    for row in reader:
        pee = PostExtraElection.objects.get(
            ballot_paper_id=row["ballot_paper_id"])
        document_url = row["Link to PDF"]
        if not document_url:
            # Rows without a document link are skipped silently.
            continue

        existing_documents = OfficialDocument.objects.filter(
            document_type=OfficialDocument.NOMINATION_PAPER,
            post_election=pee,
        )
        if existing_documents.exists():
            if options["delete_existing"]:
                print("Removing existing documents")
                existing_documents.delete()
            else:
                # Ballots that already have documents are skipped silently.
                continue

        try:
            downloaded_filename = download_file_cached(document_url)
        except requests.exceptions.ConnectionError:
            print("Connection failed for {}".format(
                row["ballot_paper_id"]))
            print("The URL was:", document_url)
            continue
        except requests.exceptions.MissingSchema:
            # This is probably someone putting notes in the URL
            # column, so ignore:
            print("Probably not a document URL for {}: {}".format(
                row["ballot_paper_id"], document_url))
            continue

        mime_type = as_text(mime_type_magic.from_file(downloaded_filename))
        extension = mimetypes.guess_extension(mime_type)

        if mime_type not in allowed_mime_types:
            recovered = False
            # Attempt to get a PDF link from the URL
            ignore_urls = ["drive.google.com"]
            if not any(x in document_url for x in ignore_urls):
                # Best-effort recovery: any failure is printed and the row
                # is then skipped below.
                try:
                    # NOTE(review): verify=False disables TLS certificate
                    # verification for this request - confirm this is
                    # intended.
                    req = requests.get(
                        document_url, headers=headers, verify=False)
                    if req.status_code == 200:
                        re_sre = r'(http[^"\']+\.pdf)'
                        # Search the decoded body: a text pattern cannot
                        # be applied to the bytes in req.content on
                        # Python 3.
                        matches = re.findall(re_sre, req.text)
                        if len(matches) == 1:
                            document_url = matches[0]
                            downloaded_filename = download_file_cached(
                                document_url)
                            mime_type = as_text(
                                mime_type_magic.from_file(
                                    downloaded_filename))
                            extension = mimetypes.guess_extension(
                                mime_type)
                            if mime_type not in allowed_mime_types:
                                raise ValueError(
                                    "Recovery failed to get a PDF for {}".format(
                                        pee.ballot_paper_id))
                            else:
                                recovered = True
                except Exception as e:
                    print(e)
            else:
                print("Ignoring unknown MIME type {} for {}".format(
                    mime_type, pee.ballot_paper_id))
            if not recovered:
                continue

        # Validate the extension before it is baked into the filename.
        if not extension:
            raise ValueError("unknown extension")
        filename = "official_documents/{ballot_paper_id}/statement-of-persons-nominated{extension}".format(
            ballot_paper_id=pee.ballot_paper_id, extension=extension)

        with open(downloaded_filename, "rb") as f:
            storage_filename = storage.save(filename, f)

        OfficialDocument.objects.create(
            document_type=OfficialDocument.NOMINATION_PAPER,
            uploaded_file=storage_filename,
            post_election=pee,
            source_url=document_url,
        )
        message = (
            "Successfully added the Statement of Persons Nominated for {0}"
        )
        print(message.format(pee.ballot_paper_id))
        extract_and_parse_tables_for_ballot.delay(pee.ballot_paper_id)