Beispiel #1
0
 def parse_depute_page_leg13(self, response):
     """Scrape a deputy profile page and yield a single ``DeputyItem``.

     Every field is read from the response through CSS selectors, except
     ``uuid`` (last path segment of the URL, up to its first dot) and
     ``url`` (the response URL itself). The image ``src`` is resolved
     against the response URL.
     """
     page = LxmlSelector(response)

     # Identifier = last path segment of the URL, cut at the first ".".
     url_path = urlsplit(response.url)[2]
     last_segment = url_path.split("/")[-1]
     deputy_id = last_segment.split(".")[0]

     headline = page.css(".deputy-headline-title").text().extract()[0]
     full_name = clean_name(headline)

     picture_src = page.css(".deputy-profile-picture")[0].attrib("src").extract()
     picture_url = urljoin(response.url, picture_src)

     # NOTE(review): "healine" looks like a typo, but it presumably mirrors
     # the class name used in the scraped markup -- confirm before changing.
     constituency = page.css(".deputy-healine-sub-title").text().extract()[0]
     party_name = page.css(".political-party").text().extract()[0]

     yield DeputyItem(
         uuid=deputy_id,
         name=full_name,
         image=picture_url,
         url=response.url,
         jurisdiction=constituency,
         party=party_name,
     )
    def parse_vote_page(self, response):
        """Fill in the date and votes of a vote item, then yield the next step.

        The partially built item is read from ``response.meta["item"]``. Its
        ``date`` and ``votes`` keys are set here. If the item carries a truthy
        ``file_href``, a ``Request`` to that URL is yielded (callback
        ``parse_info_page``, with the item passed along in ``meta``);
        otherwise the item itself is yielded.

        Raises:
            ValueError: if no date can be located on the page in either of
                the two supported notations, or if ``strptime`` rejects the
                text that was matched.
        """
        lxs = LxmlSelector(response)
        item = response.meta["item"]
        # Remove inline formatting tags before running the text queries below
        # (assumes ``etree`` is lxml.etree, whose strip_tags keeps the text).
        etree.strip_tags(lxs.xmlNode, "b", "font", "i", "sup")
        # NOTE(review): the return value is not used in this method; the call
        # is kept in case meta_as_dict has side effects -- confirm and remove.
        self.meta_as_dict(lxs)

        # First attempt: a numeric dd/mm/yyyy date.
        numeric_dates = lxs.xpath("//text()").re(r"[DUdu\s:]+(\d+/\d+/\d+)")
        if numeric_dates:
            item["date"] = datetime.strptime(numeric_dates[0], "%d/%m/%Y").isoformat()
        else:
            # Second attempt: a spelled-out date such as "du 1er <month> <year>".
            page_text = "".join(lxs.xpath("//text()").extract())
            page_text = page_text.replace(u"\u00A0", " ")
            # NOTE(review): encoding to UTF-8 before matching a str pattern is
            # a Python 2 idiom; confirm the target interpreter before touching.
            page_text = page_text.encode("utf-8")
            long_date = re.search(r"du[:\s]+(\d+)[er]*\s+(.+?)\s+(\d+)", page_text)
            if long_date is None:
                # The original used a bare ``raise`` here, which has no active
                # exception to re-raise; fail with an explicit, descriptive error.
                raise ValueError(
                    "Could not find a vote date on page %s" % response.url
                )
            # %B matches the month name of the current locale, so the process
            # locale must agree with the language of the page.
            item["date"] = datetime.strptime(
                " ".join(long_date.groups()), "%d %B %Y"
            ).isoformat()

        # The page comes in two layouts; the presence of group-name paragraphs
        # under #analyse selects the first one.
        if lxs.css("#analyse p.nomgroupe"):
            item["votes"] = self.parse_vote_first_layout(lxs, response)
        else:
            item["votes"] = self.parse_vote_second_layout(lxs)

        if item.get("file_href"):
            yield Request(
                url=item["file_href"],
                callback=self.parse_info_page,
                meta={
                    "item": item,
                },
            )
        else:
            yield item
Beispiel #3
0
 def parse_deputes_page(self, response):
     """Yield one ``Request`` per ``.dep2`` node found on the page.

     Each request targets the node's ``href`` resolved against the response
     URL. Its callback is the deputy-page parser registered for the value
     of ``response.meta["leg"]`` (11, 12 or 13).
     """
     page = LxmlSelector(response)
     # Maps response.meta["leg"] to the matching deputy-page parser.
     callbacks_by_leg = {
         11: self.parse_depute_page_leg11,
         12: self.parse_depute_page_leg12,
         13: self.parse_depute_page_leg13,
     }
     for link in page.css(".dep2"):
         href = link.attrib("href").extract()
         target_url = urljoin(response.url, href)
         # Looked up per link, so an unknown "leg" only fails once a link
         # has actually been found.
         handler = callbacks_by_leg[response.meta["leg"]]
         yield Request(url=target_url, callback=handler)
Beispiel #4
0
 def parse_depute_page_leg12(self, response):
     """Scrape a deputy page and yield a single ``DeputyItem``.

     The identifier is the last path segment of the response URL, cut at
     its first dot; it is also used to build the photo URL. The
     jurisdiction is read from the cell following the one whose text
     contains "Circonscription " and is reformatted from ``"X (Y)"`` to
     ``"X (Y circonscription)"``.
     """
     page = LxmlSelector(response)
     # Strip inline formatting tags before the text queries below.
     etree.strip_tags(page.xmlNode, "u", "b", "font", "i", "sup")

     url_path = urlsplit(response.url)[2]
     file_name = url_path.split("/")[-1]
     deputy_id = file_name.split(".")[0]

     matching_cells = page.xpath(
         "//td[contains(text(), 'Circonscription ')]/following-sibling::td[1]/text()"
     ).extract()
     raw_jurisdiction = matching_cells[0].encode("utf-8")
     # Split "X (Y)" into its two parts; a non-matching cell fails here.
     parts = re.search(r"(.*?) \((.*)\)", raw_jurisdiction)
     formatted_jurisdiction = "%s (%s circonscription)" % parts.groups()

     title_text = page.css(".titre").text().extract()[0]
     photo_url = "http://www.assemblee-nationale.fr/12/tribun/photos/%s.jpg" % deputy_id

     yield DeputyItem(
         uuid=deputy_id,
         name=clean_name(title_text),
         image=photo_url,
         url=response.url,
         jurisdiction=formatted_jurisdiction,
     )