def scrape_list_page(url): print "Scraping %s" % url req = requests.get(url) doc = lxml.html.fromstring(req.text) items = [] for item in doc.xpath("//*[@class='newspanel clearDiv']"): full_title = etree.tostring(item.xpath("./h2")[0], method="text", encoding='utf8') date, title = [x.strip() for x in full_title.split("|")] attachments = [] raw_links = item.xpath(".//a/@href") for raw_link in raw_links: attachment = urlparse.urljoin(url, raw_link) attachment_title = item.xpath(".//a/text()")[0] attachments.append({"link": attachment, "title": attachment_title}) try: summary = item.xpath("./p")[0].text except IndexError: summary = "" items.append({"Title": title, "Publication date": datetool.parsedate(date), "Old URL": url, "Summary": summary, "Attachments": json.dumps(attachments), "Associated organisations": "Scotland Office"}) return items
# NOTE(review): this chunk begins mid-statement -- the fragment below is the
# tail of a truncated table-creation call (presumably
# dt.create_table({... "Associated Document Series": ""}, "statistics"))
# whose opening is missing from this view.  Code tokens left untouched.
"Associated Document Series": ""}, "statistics")
# Enforce one record per (Title, Old URL) pair so re-runs upsert, not duplicate.
dt.create_index(["Title", "Old URL"], "statistics", unique=True)
# Each link in the wrapper list points at one document-series page.
for link in doc.xpath("//div[@class='wrapper']/ul/li/a"):
    series_title, series_url = link.text, urlparse.urljoin(URL, link.attrib["href"])
    print series_title
    series_req = requests.get(series_url)
    series_doc = lxml.html.fromstring(series_req.text)
    # Rows carrying bgcolor #004093 are skipped -- presumably header rows;
    # verify against the live page markup.
    for table_line in series_doc.xpath("//tr[not(@bgcolor) or @bgcolor!='#004093']"):
        # td[3] holds the publication date; td[2] holds the file link(s).
        file_pub_date = table_line.xpath("./td[3]")[0].text
        for file_node in table_line.xpath("./td[2]//a"):
            # tostring(method="text") flattens any markup inside the anchor.
            file_title = etree.tostring(file_node, method="text", encoding="utf8")
            file_link = file_node.attrib["href"]
            # Resolve relative hrefs against the site base URL.
            if not file_link.startswith("http"):
                file_link = urlparse.urljoin(URL, file_link)
            file_data = {"Old URL": series_url,
                         "Title": file_title,
                         "Body": file_title,
                         "Publication date": datetool.parsedate(file_pub_date),
                         "Attachment": file_link,
                         "Attachment title": file_title,
                         "Associated organisations": "Scotland Office",
                         "Associated Document Series": series_title}
            # Upsert keyed on the unique (Title, Old URL) index created above.
            dt.upsert(file_data, "statistics")
# NOTE(review): the source is collapsed onto one line, so the nesting of this
# final call is reconstructed -- it looks like a one-time export after the
# series loop completes; confirm against the original file's indentation.
dumptruck_to_csv(dt, "statistics", "/home/http/scotland/stats.csv")