Example #1
import json
import urlparse

import requests
import lxml.html
from lxml import etree

import datetool  # local date-parsing helper (not shown in the post)


def scrape_list_page(url):
    # Fetch one news listing page and parse it into an lxml document.
    print "Scraping %s" % url
    req = requests.get(url)
    doc = lxml.html.fromstring(req.text)

    items = []
    for item in doc.xpath("//*[@class='newspanel clearDiv']"):
        # The <h2> of each news item holds "date | title".
        full_title = etree.tostring(item.xpath("./h2")[0], method="text", encoding='utf8')
        date, title = [x.strip() for x in full_title.split("|")]

        # Every link inside the item is an attachment; keep each link
        # together with the text of its own anchor.
        attachments = []
        for link_node in item.xpath(".//a"):
            attachment = urlparse.urljoin(url, link_node.attrib["href"])
            attachments.append({"link": attachment,
                                "title": link_node.text_content()})

        # Not every item has a summary paragraph.
        try:
            summary = item.xpath("./p")[0].text
        except IndexError:
            summary = ""

        items.append({"Title": title,
                      "Publication date": datetool.parsedate(date),
                      "Old URL": url,
                      "Summary": summary,
                      "Attachments": json.dumps(attachments),
                      "Associated organisations": "Scotland Office"})
        
    return items
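
Both examples lean on datetool.parsedate(), a small date-parsing helper that isn't shown here. A minimal sketch of what it could look like, assuming it only needs to turn the site's human-readable dates into ISO 8601 strings and that python-dateutil is available (the names and behaviour below are guesses, not the author's actual module):

# Hypothetical stand-in for the datetool module used above; the real
# helper is not shown in the post.
from dateutil import parser

def parsedate(raw_date):
    # e.g. "12 March 2013" -> "2013-03-12"; dayfirst=True assumes
    # day-month-year ordering, as UK sites normally use.
    return parser.parse(raw_date.strip(), dayfirst=True).date().isoformat()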
Example #2
                 "Associated Document Series": ""}, "statistics")
dt.create_index(["Title", "Old URL"], "statistics", unique=True)

# Each link on the index page is one document series; fetch the series
# page and scrape every file it lists.
for link in doc.xpath("//div[@class='wrapper']/ul/li/a"):
    series_title, series_url = link.text, urlparse.urljoin(URL, link.attrib["href"])
    print series_title

    series_req = requests.get(series_url)
    series_doc = lxml.html.fromstring(series_req.text)

    # Take every table row except the #004093-coloured heading row; the
    # third cell of each remaining row holds the publication date.
    for table_line in series_doc.xpath("//tr[not(@bgcolor) or @bgcolor!='#004093']"):
        file_pub_date = table_line.xpath("./td[3]")[0].text

        # The second cell can link to more than one file; make every
        # link absolute before storing it.
        for file_node in table_line.xpath("./td[2]//a"):
            file_title = etree.tostring(file_node, method="text", encoding="utf8")
            file_link = file_node.attrib["href"]
            if not file_link.startswith("http"):
                file_link = urlparse.urljoin(URL, file_link)

            file_data = {"Old URL": series_url,
                         "Title": file_title,
                         "Body": file_title,
                         "Publication date": datetool.parsedate(file_pub_date),
                         "Attachment": file_link,
                         "Attachment title": file_title,
                         "Associated organisations": "Scotland Office",
                         "Associated Document Series": series_title}
            # The unique (Title, Old URL) index makes this an update when
            # the row already exists and an insert otherwise.
            dt.upsert(file_data, "statistics")

dumptruck_to_csv(dt, "statistics", "/home/http/scotland/stats.csv")
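
The final call hands everything to dumptruck_to_csv(), another local helper that isn't shown. A rough sketch under the assumption that it simply selects every row from the named table (dumptruck's execute() hands back each row as a dict) and writes it out with the csv module:

# Hypothetical stand-in for the dumptruck_to_csv helper; the real one is
# not shown in the post.  Assumes every row in the table shares the same
# columns, as they do here.
import csv

def dumptruck_to_csv(dt, table, path):
    rows = dt.execute("SELECT * FROM %s" % table)
    if not rows:
        return
    with open(path, "wb") as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)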