Example #1
                 "Associated Document Series": ""}, "statistics")
dt.create_index(["Title", "Old URL"], "statistics", unique=True)

for link in doc.xpath("//div[@class='wrapper']/ul/li/a"):
    series_title, series_url = link.text, urlparse.urljoin(URL, link.attrib["href"])
    print series_title

    series_req = requests.get(series_url)
    series_doc = lxml.html.fromstring(series_req.text)

    # Rows coloured #004093 are the table's header rows; keep everything else.
    for table_line in series_doc.xpath("//tr[not(@bgcolor) or @bgcolor!='#004093']"):
        file_pub_date = table_line.xpath("./td[3]")[0].text  # third cell: publication date

        for file_node in table_line.xpath("./td[2]//a"):  # second cell: document links
            file_title = etree.tostring(file_node, method="text", encoding="utf8")
            file_link = file_node.attrib["href"]
            if not file_link.startswith("http"):
                file_link = urlparse.urljoin(URL, file_link)  # make relative links absolute

            file_data = {"Old URL": series_url,
                         "Title": file_title,
                         "Body": file_title,
                         "Publication date": datetool.parsedate(file_pub_date),
                         "Attachment": file_link,
                         "Attachment title": file_title,
                         "Associated organisations": "Scotland Office",
                         "Associated Document Series": series_title}
            dt.upsert(file_data, "statistics")

dumptruck_to_csv(dt, "statistics", "/home/http/scotland/stats.csv")
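
dumptruck_to_csv is not part of DumpTruck itself; it is a helper from the
elided portion of the script. A minimal sketch of what it might look like,
assuming it simply writes every row of a table to a CSV file (the body
below is a guess, not the original helper):

import csv

def dumptruck_to_csv(dt, table, path):
    # Hypothetical reconstruction of the elided helper.
    rows = dt.dump(table)  # DumpTruck's dump() returns all rows as dicts
    if not rows:
        return
    with open(path, "wb") as f:  # binary mode for the csv module on Python 2
        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        for row in rows:
            writer.writerow(dict(
                (k, v.encode("utf8") if isinstance(v, unicode) else v)
                for k, v in row.items()))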
Example #2
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16677&mon=jul", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16676&mon=aug", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16678&mon=sep", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16679&mon=oct", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16680&mon=nov", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/10804.146.html", "Archive releases"], # 2005
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/10805.145.html", "Archive releases"], # 2006
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/10806.144.html", "Archive releases"], # 2007
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/10807.143.html", "Archive releases"], # 2008
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/13342.html", "Archive releases"], # 2009
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/13661.html", "Archive releases"], # 2010
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/15263.html", "Archive releases"], # 2011
]

dt = DumpTruck(dbname="scotland.db")
dt.create_table({"Title": "",
                 "Publication date": "",
                 "Old URL": "",
                 "Summary": "",
                 "Attachments": "",
                 "Type": "",
                 "Associated organisations": ""}, "publications")
dt.create_index(["Title", "Old URL"], "publications", unique=True)

for url, page_type in URLS:
    for publication in scrape_list_page(url):
        publication['Type'] = page_type
        dt.upsert(publication, "publications")

dumptruck_to_csv(dt, "publications", "/home/http/scotland/publications.csv")
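
scrape_list_page is shared by Examples #2 and #3 but its definition is
elided. From how its output is consumed, it yields one dict per listed
document, with an "Attachments" field holding a JSON-encoded list of
links. A rough sketch under those assumptions (the selector and most
field values are guesses, not the original code):

import json
import urlparse

import lxml.html
import requests

def scrape_list_page(url):
    # Hypothetical reconstruction; only the returned keys are inferred
    # from the callers in Examples #2 and #3.
    req = requests.get(url)
    doc = lxml.html.fromstring(req.text)
    for link in doc.xpath("//div[@class='wrapper']//a"):  # guessed selector
        yield {"Title": link.text,
               "Publication date": "",
               "Old URL": url,
               "Summary": "",
               "Associated organisations": "Scotland Office",
               "Attachments": json.dumps(
                   [{"link": urlparse.urljoin(url, link.attrib["href"])}])}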
Example #3
import json

import lxml.html
import requests
from dumptruck import DumpTruck
from lxml import etree

# URLS, htmlize, scrape_list_page and dumptruck_to_csv are defined earlier
# in the full script and are elided from this excerpt.

def scrape_main_article(url):
    req = requests.get(url)
    doc = lxml.html.fromstring(req.text)

    # The article body lives in the page's 'wrapper' div; strip the page
    # heading and any bold-text metadata paragraphs before converting it.
    div = doc.xpath("//*[@class='wrapper']")[0]
    div.remove(div.find("h1"))
    for para in div.findall("p"):
        if para.find("strong") is not None:
            div.remove(para)
    return htmlize(etree.tostring(div))

dt = DumpTruck(dbname="scotland.db")
dt.create_table({"Title": "",
                 "Publication date": "",
                 "Old URL": "",
                 "Summary": "",
                 "Body": "",
                 "Associated organisations": ""}, "news")
dt.create_index(["Title", "Old URL"], "news", unique=True)

for url in URLS:
    for news_item in scrape_list_page(url):
        # The first "attachment" on a news listing is the article page itself.
        attachments = json.loads(news_item.pop("Attachments"))
        link = attachments[0]["link"]
        news_item["Old URL"] = link
        news_item["Body"] = scrape_main_article(link)
        dt.upsert(news_item, "news")

dumptruck_to_csv(dt, "news", "/home/http/scotland/news.csv")
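
In all three examples the unique index on ("Title", "Old URL") is what
makes dt.upsert safe to re-run: DumpTruck's upsert replaces a row that
collides on a unique index instead of inserting a duplicate. A small
self-contained demonstration (the table name and values are made up):

from dumptruck import DumpTruck

dt = DumpTruck(dbname=":memory:")
dt.create_table({"Title": "", "Old URL": "", "Body": ""}, "demo")
dt.create_index(["Title", "Old URL"], "demo", unique=True)

dt.upsert({"Title": "A", "Old URL": "http://example.com/a", "Body": "first"}, "demo")
dt.upsert({"Title": "A", "Old URL": "http://example.com/a", "Body": "second"}, "demo")

print dt.dump("demo")  # one row; "Body" is now "second"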