["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16677&mon=jul", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16676&mon=aug", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16678&mon=sep", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16679&mon=oct", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16680&mon=nov", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/10804.146.html", "Archive releases"], # 2005 ["http://www.scotlandoffice.gov.uk/scotlandoffice/10805.145.html", "Archive releases"], # 2006 ["http://www.scotlandoffice.gov.uk/scotlandoffice/10806.144.html", "Archive releases"], # 2007 ["http://www.scotlandoffice.gov.uk/scotlandoffice/10807.143.html", "Archive releases"], # 2008 ["http://www.scotlandoffice.gov.uk/scotlandoffice/13342.html", "Archive releases"], # 2009 ["http://www.scotlandoffice.gov.uk/scotlandoffice/13661.html", "Archive releases"], # 2010 ["http://www.scotlandoffice.gov.uk/scotlandoffice/15263.html", "Archive releases"], # 2011 ] dt = DumpTruck(dbname="scotland.db") dt.create_table({"Title": "", "Publication date": "", "Old URL": "", "Summary": "", "Attachments": "", "Type": "", "Associated organisations": ""}, "publications") dt.create_index(["Title", "Old URL"], "publications", unique=True) for url, page_type in URLS: for publication in scrape_list_page(url): publication['Type'] = page_type dt.upsert(publication, "publications") dumptruck_to_csv(dt, "publications", "/home/http/scotland/publications.csv")
def scrape_main_article(url):
    """Fetch a news page and return its body as cleaned-up HTML."""
    req = requests.get(url)
    doc = lxml.html.fromstring(req.text)
    div = doc.xpath("//*[@class='wrapper']")[0]
    # Drop the page heading and the bold lead-in paragraphs, keeping only
    # the article body.
    div.remove(div.find("h1"))
    for para in div.findall("p"):
        if para.find("strong") is not None:
            div.remove(para)
    return htmlize(etree.tostring(div))


dt = DumpTruck(dbname="scotland.db")
dt.create_table({
    "Title": "",
    "Publication date": "",
    "Old URL": "",
    "Summary": "",
    "Body": "",
    "Associated organisations": "",
}, "news")
dt.create_index(["Title", "Old URL"], "news", unique=True)

# Each list entry's first attachment is the link to the full article; fetch
# that page and store the scraped body alongside the listing metadata.
for url in URLS:
    for news_item in scrape_list_page(url):
        attachments = json.loads(news_item.pop("Attachments"))
        link = attachments[0]["link"]
        news_item["Old URL"] = link
        news_item["Body"] = scrape_main_article(link)
        dt.upsert(news_item, "news")

dumptruck_to_csv(dt, "news", "/home/http/scotland/news.csv")
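
# `scrape_list_page` is likewise assumed to be defined earlier in the script.
# For orientation only, a hypothetical sketch of its contract: yield one dict
# per listed item, with the keys the two tables above expect, and with
# "Attachments" as a JSON-encoded list of {"link": ..., "title": ...} dicts
# (which is why the news loop above can do json.loads(...)[0]["link"]).
# The selectors and the organisation value below are illustrative guesses,
# not the real Scotland Office markup.
def scrape_list_page(url):
    doc = lxml.html.fromstring(requests.get(url).text)
    for item in doc.xpath("//div[@class='item']"):  # hypothetical selector
        links = item.xpath(".//a")
        yield {
            "Title": item.findtext("h2", default="").strip(),
            "Publication date": item.findtext("span", default=""),
            "Old URL": url,
            "Summary": item.findtext("p", default=""),
            "Attachments": json.dumps(
                [{"link": a.get("href"), "title": a.text_content()} for a in links]
            ),
            "Associated organisations": "Scotland Office",
        }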