def scrape_pdf(url, filename):
    try:
        print("\ndownloading " + url)
        raw_pdf_name = url.split("/")[-1]
        import_date = datetime.now().replace(microsecond=0)
        pdf_name = "{}-{:02d}-{:02d}-{}".format(import_date.year,
                                                import_date.month,
                                                import_date.day, raw_pdf_name)
        pdf_helpers.get_pdf_from_admin_ch(url, pdf_name)

        print("\nextracting metadata...")
        creation_date = extract_creation_date(pdf_name)
        archive_pdf_name = "{}-{:02d}-{:02d}-{}".format(
            creation_date.year, creation_date.month, creation_date.day,
            raw_pdf_name)
        archive_filename = "{}-{:02d}-{:02d}-{}".format(
            creation_date.year, creation_date.month, creation_date.day,
            filename)
        print("\nPDF creation date: {:02d}.{:02d}.{}\n".format(
            creation_date.day, creation_date.month, creation_date.year))

        print("removing first page of PDF...")
        call([
            "qpdf", "--pages", pdf_name, "2-z", "--", pdf_name,
            "zb_file-stripped.pdf"
        ])

        print("parsing PDF...")
        call([
            "java",
            "-Djava.util.logging.config.file=web_scrapers/logging.properties",
            "-jar",
            get_script_path() + "/tabula-0.9.2-jar-with-dependencies.jar",
            "zb_file-stripped.pdf", "--pages", "all", "-o", "zb_data.csv"
        ])

        print("cleaning up parsed data...")
        guests = cleanup_file("zb_data.csv")

        print("writing " + filename + "...")
        write_to_json(guests, archive_pdf_name, filename, url, creation_date,
                      import_date)

        print("archiving...")
        copyfile(pdf_name,
                 get_script_path() + "/archive/{}".format(archive_pdf_name))
        copyfile(filename,
                 get_script_path() + "/archive/{}".format(archive_filename))
    finally:
        print("cleaning up...")
        os.rename(pdf_name,
                  get_script_path() + "/backup/{}".format(pdf_name))
        backup_filename = "{}-{:02d}-{:02d}-{}".format(import_date.year,
                                                       import_date.month,
                                                       import_date.day,
                                                       filename)
        copyfile(filename,
                 get_script_path() + "/backup/{}".format(backup_filename))
        os.remove("zb_file-stripped.pdf")
        os.remove("zb_data.csv")
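# extract_creation_date is a helper defined elsewhere in this repo. Below is a
# minimal sketch of what it could look like, assuming PyPDF2 is available and
# the PDF carries a standard "/CreationDate" info entry of the form
# "D:YYYYMMDDhhmmss..."; the real helper may read the metadata differently.
from datetime import datetime

from PyPDF2 import PdfFileReader


def extract_creation_date_sketch(pdf_name):
    # Read the document info dictionary and parse the leading timestamp.
    with open(pdf_name, "rb") as f:
        raw = PdfFileReader(f).getDocumentInfo()["/CreationDate"]
    # e.g. "D:20161109143022+01'00'" -> parse the "20161109143022" part.
    return datetime.strptime(raw[2:16], "%Y%m%d%H%M%S")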
def scrape():
    parser = ArgumentParser(description='Scrape Parlamentarische Gruppen PDF')
    parser.add_argument("local_pdf",
                        metavar="file",
                        nargs='?',
                        help="local PDF file to use",
                        default=None)
    args = parser.parse_args()
    local_pdf = args.local_pdf

    url = "https://www.parlament.ch/centers/documents/de/parlamentarische-gruppen.pdf"
    filename = "parlamentarische-gruppen.json"
    script_path = os.path.dirname(os.path.realpath(__file__))
    try:
        import_date = datetime.now().replace(microsecond=0)
        raw_pdf_name = url.split("/")[-1]
        pdf_name = "{}-{:02d}-{:02d}-{}".format(import_date.year,
                                                import_date.month,
                                                import_date.day, raw_pdf_name)
        if local_pdf is None:
            print("\ndownloading " + url)
            pdf_helpers.get_pdf_from_admin_ch(url, pdf_name)
        else:
            print("\ncopy local PDF " + local_pdf)
            copyfile(local_pdf, pdf_name)

        print("\nextracting metadata...")
        creation_date = pdf_helpers.extract_creation_date(pdf_name)
        archive_pdf_name = "{}-{:02d}-{:02d}-{}".format(
            creation_date.year, creation_date.month, creation_date.day,
            raw_pdf_name)
        archive_filename = "{}-{:02d}-{:02d}-{}".format(
            creation_date.year, creation_date.month, creation_date.day,
            filename)
        print("\nPDF creation date: {:02d}.{:02d}.{}\n".format(
            creation_date.day, creation_date.month, creation_date.year))

        print("parsing PDF...")
        tabula_path = script_path + "/tabula-0.9.2-jar-with-dependencies.jar"
        # Use a context manager so the devnull handle is closed again.
        with open(os.devnull, 'w') as FNULL:
            call([
                "java", "-jar", tabula_path, pdf_name, "--pages", "all",
                "-o", "pg_data.csv"
            ], stderr=FNULL)

        print("cleaning up parsed data...")
        groups = cleanup_file("pg_data.csv")
        groups = normalize_namen(groups)

        print("writing " + filename + "...")
        write_to_json(groups, archive_pdf_name, filename, url, creation_date,
                      import_date)

        if local_pdf is None:
            print("archiving...")
            copyfile(pdf_name,
                     script_path + "/archive/{}".format(archive_pdf_name))
            copyfile(filename,
                     script_path + "/archive/{}".format(archive_filename))
    finally:
        print("cleaning up...")
        os.rename(pdf_name, script_path + "/backup/{}".format(pdf_name))
        backup_filename = "{}-{:02d}-{:02d}-{}".format(import_date.year,
                                                       import_date.month,
                                                       import_date.day,
                                                       filename)
        copyfile(filename,
                 script_path + "/backup/{}".format(backup_filename))
        os.remove("pg_data.csv")
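# pdf_helpers.get_pdf_from_admin_ch is a local helper module of this repo. A
# rough sketch of the download step it performs, assuming a plain HTTP GET
# with requests suffices (the real helper may set headers or handle the
# admin.ch/parlament.ch servers' quirks differently):
import requests


def get_pdf_sketch(url, pdf_name):
    # Stream the PDF to disk so large files are never held fully in memory.
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()
    with open(pdf_name, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)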
def scrape_pdf(url, local_pdf, filename):
    script_path = get_script_path()
    stripped_file_name = None
    try:
        raw_pdf_name = url.split("/")[-1]
        import_date = datetime.now().replace(microsecond=0)
        pdf_name = "{}-{:02d}-{:02d}-{}".format(import_date.year,
                                                import_date.month,
                                                import_date.day, raw_pdf_name)
        if local_pdf is None:
            print("\ndownloading " + url)
            pdf_helpers.get_pdf_from_admin_ch(url, pdf_name)
        else:
            print("\ncopy local PDF " + local_pdf)
            copyfile(local_pdf, pdf_name)

        print("\nextracting metadata...")
        creation_date = pdf_helpers.extract_creation_date(pdf_name)
        archive_pdf_name = "{}-{:02d}-{:02d}-{}".format(
            creation_date.year, creation_date.month, creation_date.day,
            raw_pdf_name)
        archive_filename = "{}-{:02d}-{:02d}-{}".format(
            creation_date.year, creation_date.month, creation_date.day,
            filename)
        print("\nPDF creation date: {:02d}.{:02d}.{}\n".format(
            creation_date.day, creation_date.month, creation_date.year))

        print("removing first page of PDF...")
        stripped_file_name = "zb_file-stripped.pdf"
        call([
            "qpdf", "--pages", pdf_name, "2-z", "--", pdf_name,
            stripped_file_name
        ])

        print("parsing PDF...")
        tabula_path = script_path + "/tabula-1.0.4-jar-with-dependencies.jar"
        cmd = [
            "java",
            "-Djava.util.logging.config.file=web_scrapers/logging.properties",
            "-jar", tabula_path, stripped_file_name, "-o", "zb_data.csv",
            "--pages", "all", "-l", "-i"
        ]
        print(" ".join(cmd))
        call(cmd, stderr=None)

        print("cleaning up parsed data...")
        guests = read_guests("zb_data.csv")

        print("writing " + filename + "...")
        write_to_json(guests, archive_pdf_name, filename, url, creation_date,
                      import_date)

        print("archiving...")
        copyfile(pdf_name,
                 script_path + "/archive/{}".format(archive_pdf_name))
        copyfile(filename,
                 script_path + "/archive/{}".format(archive_filename))
    finally:
        print("cleaning up...")
        os.rename(pdf_name, script_path + "/backup/{}".format(pdf_name))
        backup_filename = "{}-{:02d}-{:02d}-{}".format(import_date.year,
                                                       import_date.month,
                                                       import_date.day,
                                                       filename)
        copyfile(filename,
                 script_path + "/backup/{}".format(backup_filename))
        if stripped_file_name and os.path.isfile(stripped_file_name):
            os.remove(stripped_file_name)
        if os.path.isfile("zb_data.csv"):
            os.remove("zb_data.csv")
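# The "YYYY-MM-DD-name" prefix is built with the same format string four times
# in each function above. A small hypothetical helper (not part of the repo)
# that could replace those repeated format calls:
def date_prefixed(date, name):
    # e.g. date_prefixed(datetime(2017, 3, 1), "zb.pdf") -> "2017-03-01-zb.pdf"
    return "{}-{:02d}-{:02d}-{}".format(date.year, date.month, date.day, name)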