# Example 1
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16677&mon=jul", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16676&mon=aug", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16678&mon=sep", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16679&mon=oct", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16680&mon=nov", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/10804.146.html", "Archive releases"], # 2005
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/10805.145.html", "Archive releases"], # 2006
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/10806.144.html", "Archive releases"], # 2007
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/10807.143.html", "Archive releases"], # 2008
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/13342.html", "Archive releases"], # 2009
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/13661.html", "Archive releases"], # 2010
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/15263.html", "Archive releases"], # 2011
]

# Build the "publications" table and fill it from every listing page.
dt = DumpTruck(dbname="scotland.db")

# Empty-string defaults give every column TEXT affinity.
publication_schema = {
    "Title": "",
    "Publication date": "",
    "Old URL": "",
    "Summary": "",
    "Attachments": "",
    "Type": "",
    "Associated organisations": "",
}
dt.create_table(publication_schema, "publications")

# A publication is identified by its title plus source URL, so repeated
# runs update existing rows in place instead of duplicating them.
dt.create_index(["Title", "Old URL"], "publications", unique=True)

for list_url, publication_type in URLS:
    for record in scrape_list_page(list_url):
        record["Type"] = publication_type
        dt.upsert(record, "publications")

dumptruck_to_csv(dt, "publications", "/home/http/scotland/publications.csv")
# Example 2
def scrape_main_article(url):
    """Fetch *url* and return its article body as HTML.

    The main content lives in the first element with class="wrapper".
    The <h1> heading and any <p> containing a <strong> element
    (metadata such as dates/bylines on these pages — TODO confirm)
    are stripped before the remaining markup is passed to htmlize().
    """
    req = requests.get(url)
    doc = lxml.html.fromstring(req.text)

    div = doc.xpath("//*[@class='wrapper']")[0]

    # Some pages may lack an <h1>; find() then returns None and
    # div.remove(None) would raise TypeError, so guard it.
    heading = div.find("h1")
    if heading is not None:
        div.remove(heading)

    # findall() returns a snapshot list, so removing during the loop is safe.
    for para in div.findall("p"):
        if para.find("strong") is not None:
            div.remove(para)
    return htmlize(etree.tostring(div))

# Build the "news" table and populate it from every listing page.
dt = DumpTruck(dbname="scotland.db")

# Empty-string defaults give every column TEXT affinity.
news_schema = {
    "Title": "",
    "Publication date": "",
    "Old URL": "",
    "Summary": "",
    "Body": "",
    "Associated organisations": "",
}
dt.create_table(news_schema, "news")

# Title + source URL uniquely identify a story, so re-runs upsert in place.
dt.create_index(["Title", "Old URL"], "news", unique=True)

for list_url in URLS:
    for story in scrape_list_page(list_url):
        # The list page stores the article link as the first attachment;
        # promote it to the story's URL and fetch the full body from it.
        article_link = json.loads(story.pop("Attachments"))[0]["link"]
        story["Old URL"] = article_link
        story["Body"] = scrape_main_article(article_link)
        dt.upsert(story, "news")

dumptruck_to_csv(dt, "news", "/home/http/scotland/news.csv")