# Run scraper.process_html on each downloaded report file
def scrap_files(dict_files):
    for id_file, file in dict_files.iteritems():
        if "no data written" not in scraper.process_html(file, id_file):
            scraper.log("file " + file + " with id " + str(id_file) + " created.")
        else:
            scraper.log("Failed to create file " + file + " with id " + str(id_file) + ".")

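# Illustration only, never called by the scraper: scrap_files() expects the same
# kind of dict that get_html_files() builds further down, mapping a report id to
# the HTML file written for it. The id and file name below are made up.
def _example_scrap_files():
    scrap_files({10703: "html_2016_01_12.html"})
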
# Find the id of the most recent report from the official report page
def get_latest_id():
    url = "http://www.scottish.parliament.uk/parliamentarybusiness/official-report.aspx"
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    for line in response:
        # Former case; let's just keep it in case they change again
        if "Official Report, Meeting of the Parliament," in line:
            # Keep only the URL
            f_url = re.match(r'^.*"(.*)".*$', line).group(1)
            # Get the true URL because they thought it was a good idea to change
            # the URL to a shortened one 2 days before the deadline
            response = urllib2.urlopen(f_url)
            f_url = response.url
            # Remove the base URL to keep only the id
            id_report = f_url.replace("http://www.scottish.parliament.uk/parliamentarybusiness/28862.aspx?r=", "")
            # Remove everything after '#'; some URLs carry a fragment
            id_report, sep, tail = id_report.partition('#')
            scraper.log("Latest report id to scrape is " + id_report + ". Starting to retrieve HTML files now.")
            return id_report
    scraper.log("Latest report id not found. Could not retrieve the reports.")
    return 0

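# Illustration only, never called by the scraper: how get_latest_id() extracts the
# report id once the shortened link has been resolved. The HTML line and id below
# are made-up examples, and the network step (following the shortened URL) is skipped.
def _example_extract_report_id():
    line = '<a href="http://www.scottish.parliament.uk/parliamentarybusiness/28862.aspx?r=10703#ScotParlOR">Official Report</a>'
    # Keep only the quoted URL
    f_url = re.match(r'^.*"(.*)".*$', line).group(1)
    # Strip the base URL, then drop anything after '#'
    id_report = f_url.replace("http://www.scottish.parliament.uk/parliamentarybusiness/28862.aspx?r=", "")
    id_report, sep, tail = id_report.partition('#')
    return id_report  # "10703"
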
# Download the report pages and save the parliamentary reports as HTML files
def get_html_files():
    write = False
    report_date = "2000_01_01"
    dict_files = dict()
    # Load the already processed URLs
    processed_urls = import_processed_urls(f_processed_urls)
    # The id of the first report for the current parliament session is roughly 5300
    for i in range(5300, int(get_latest_id()) + 1):
        if str(i) not in processed_urls:
            url = base_url + "?r=" + str(i)
            # Test the page
            try:
                response = urllib2.urlopen(url)
                page = response.read()
            except urllib2.HTTPError, e:
                # Add it to the list of already processed URLs
                processed_urls[i] = base_url + "?r=" + str(i)
                scraper.log(str(i) + " is not a valid URL.")
                continue
            else:
                # Check whether it is a parliamentary report, i.e. what we want,
                # and grab the date at the same time
                for line in page.split("\n"):
                    if ">Meeting of the Parliament" in line:
                        report_date = scraper.trim_html(line).replace("Meeting of the Parliament ", "")
                        report_date = scraper.datetime.datetime.strptime(report_date, '%d %B %Y').strftime('%Y_%m_%d')
                        write = True
                processed_urls[i] = base_url + "?r=" + str(i)
                # If it is not a parliamentary report
                if not write:
                    scraper.log(str(i) + " is not a parliamentary report. No data written.")
                    continue
                else:
                    scraper.log(str(i) + " is a parliamentary report.")
                    # Write the file
                    file_output = "html_" + report_date + ".html"
                    dict_files[i] = file_output
                    with open(file_output, "w+") as f:
                        f.write(page)
                    write = False
        else:
            scraper.log(str(i) + " already scraped.")

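# Illustration only, never called by the scraper: the date conversion used in
# get_html_files(), turning the human-readable date from the page title into the
# YYYY_MM_DD form used in the output file name. The date below is a made-up
# example, and plain datetime is used here instead of scraper.datetime to keep
# the sketch self-contained.
def _example_report_date():
    import datetime
    return datetime.datetime.strptime("12 January 2016", '%d %B %Y').strftime('%Y_%m_%d')  # "2016_01_12"
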
# Import the URLs already processed from the json file
def import_processed_urls(url_file):
    try:
        with open(url_file, "r") as f:
            try:
                json_data = json.load(f)
            except ValueError, e:
                dict_url = dict()
                scraper.log("Can't load json.")
                return dict_url
            else:
                return json_data
    except IOError, e:
        scraper.log("Can't open file. Creating it...")
        dict_url = dict()
        return dict_url


# Store the dictionary of processed URLs as json
def export_processed_urls(url_file, dict_url):
    with open(url_file, "w+") as f:
        try:
            json.dump(dict_url, f)
        except ValueError, e:
            scraper.log("Can't dump json.")
            return -1
        else:
            return 0
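

# Illustration only, never called by the scraper: the processed-URLs file is a
# plain JSON object mapping a report id to its URL, written and read back by the
# two functions above. The file name below is a made-up example; the scraper
# itself passes f_processed_urls.
def _example_processed_urls_roundtrip():
    dict_url = {"5300": "http://www.scottish.parliament.uk/parliamentarybusiness/28862.aspx?r=5300"}
    export_processed_urls("processed_urls_example.json", dict_url)
    return import_processed_urls("processed_urls_example.json")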