Example #1
# Store the dictionary of processed URLs as JSON
def export_processed_urls(url_file, dict_url):
    with open(url_file, "w+") as f:
        try:
            json.dump(dict_url, f)
        except ValueError:
            scraper.log("Can't dump json.")
            return -1
        else:
            return 0
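A minimal round-trip sketch (not part of the scraped project): it assumes json has been imported and the project's scraper module is available, and the filename "processed_urls.json" plus the sample id are made up for illustration. import_processed_urls, used to read the file back, appears in Example #4 below.

# Hypothetical usage only: persist the processed-URL dict, then reload it.
urls = {"5300": "http://www.scottish.parliament.uk/parliamentarybusiness/28862.aspx?r=5300"}
if export_processed_urls("processed_urls.json", urls) == 0:
    urls = import_processed_urls("processed_urls.json")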
Example #2
def scrap_files(dict_files):
    for id_file, file in dict_files.iteritems():
        if "no data written" not in scraper.process_html(file, id_file):
            scraper.log("file " + file + " with id " + str(id_file) +
                        " created.")
        else:
            scraper.log("Failed to create file " + file + " with id " +
                        str(id_file) + ".")


# Store the dictionary of processed URLs as JSON
def export_processed_urls(url_file, dict_url):
    with open(url_file, "w+") as f:
        try:
            json.dump(dict_url, f)
        except ValueError:
            scraper.log("Can't dump json.")
            return -1
        else:
            return 0
Example #4
# Import the already-processed URLs from the JSON file
def import_processed_urls(url_file):
    try:
        with open(url_file, "r") as f:
            try:
                json_data = json.load(f)
            except ValueError:
                dict_url = dict()
                scraper.log("Can't load json.")
                return dict_url
            else:
                return json_data
    except IOError:
        scraper.log("Can't open file. Creating it...")
        dict_url = dict()
        return dict_url


def get_latest_id():
    url = "http://www.scottish.parliament.uk/parliamentarybusiness/official-report.aspx"
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    for line in response:
        # Former case; let's just keep it in case they change again
        if "Official Report, Meeting of the Parliament," in line:
            # Keep only the url
            f_url = re.match(r'^.*"(.*)".*$', line).group(1)
            # Resolve the shortened URL to the real one (the site switched to shortened URLs two days before the deadline)
            response = urllib2.urlopen(f_url)
            f_url = response.url
            # Remove the base url to get only the id
            id_report = f_url.replace("http://www.scottish.parliament.uk/parliamentarybusiness/28862.aspx?r=", "")
            # Remove everything after #
            # There's some data like that sometimes
            id_report, sep, tail = id_report.partition('#')
            scraper.log("Latest id report to scrape is " + id_report + ". Starting retrieving hfml files now.")
            return id_report
    scraper.log("Latest report id not found. Could not retrieve the reports.")
    return 0
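To make the id extraction in get_latest_id concrete, here is a small, self-contained illustration on an invented anchor line (the id 10702 and the '#' fragment are made up, and the redirect-resolution step via urllib2 is skipped): the regex keeps the quoted href, replace() strips the fixed base URL, and partition('#') drops any fragment.

import re

# Hypothetical input line, for illustration only
sample = '<a href="http://www.scottish.parliament.uk/parliamentarybusiness/28862.aspx?r=10702#ScotParlOR">Official Report, Meeting of the Parliament, 1 January 2020</a>'
f_url = re.match(r'^.*"(.*)".*$', sample).group(1)
id_report = f_url.replace("http://www.scottish.parliament.uk/parliamentarybusiness/28862.aspx?r=", "")
id_report, sep, tail = id_report.partition('#')
# id_report is now "10702"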
Example #7
def get_html_files():
    write = False
    report_date = "2000_01_01"
    dict_files = dict()
    # Load the already processed URLs
    processed_urls = import_processed_urls(f_processed_urls)
    # id of first report for the current parliament session is roughly 5300
    for i in range(5300, int(get_latest_id()) + 1):
        if str(i) not in processed_urls:
            url = base_url + "?r=" + str(i)
            # Test the page
            try:
                response = urllib2.urlopen(url)
                page = response.read()
            except urllib2.HTTPError:
                # Add it to the list of already processed URLs
                processed_urls[i] = base_url + "?r=" + str(i)
                scraper.log(str(i) + " is not a valid URL.")
                continue
            else:
                # Check whether it's a parliamentary report, i.e. what we want
                # Get the date too
                for line in page.split("\n"):
                    if ">Meeting of the Parliament" in line:
                        report_date = scraper.trim_html(line).replace(
                            "Meeting of the Parliament ", "")
                        report_date = scraper.datetime.datetime.strptime(
                            report_date, '%d %B %Y').strftime('%Y_%m_%d')
                        write = True
                processed_urls[i] = base_url + "?r=" + str(i)
                # If not a parliamentary report
                if not write:
                    scraper.log(
                        str(i) +
                        " is not a parliamentary report. No data written.")
                    continue
                else:
                    scraper.log(str(i) + " is a parliamentary report.")
                    # Write the file
                    file_output = "html_" + report_date + ".html"
                    dict_files[i] = file_output
                    with open(file_output, "w+") as f:
                        f.write(page)
                        write = False
        else:
            scraper.log(str(i) + " already scraped.")
    # Presumably the caller needs the id -> filename mapping (see scrap_files)
    return dict_files
Example #10

# Import the already-processed URLs from the JSON file
def import_processed_urls(url_file):
    try:
        with open(url_file, "r") as f:
            try:
                json_data = json.load(f)
            except ValueError:
                dict_url = dict()
                scraper.log("Can't load json.")
                return dict_url
            else:
                return json_data
    except IOError:
        scraper.log("Can't open file. Creating it...")
        dict_url = dict()
        return dict_url


# Store the dictionary of processed URLs as JSON
def export_processed_urls(url_file, dict_url):
    with open(url_file, "w+") as f:
        try:
            json.dump(dict_url, f)
        except ValueError:
            scraper.log("Can't dump json.")
            return -1
        else:
            return 0
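

# Process each downloaded HTML file with scraper.process_html and log the result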
def scrap_files(dict_files):
    for id_file, file in dict_files.iteritems():
        if "no data written" not in scraper.process_html(file, id_file):
            scraper.log("file " + file + " with id " + str(id_file) + " created.")
        else:
            scraper.log("Failed to create file " + file + " with id " + str(id_file) + ".")
