def parse_scrutin_page(self, response):
    lxs = LxmlSelector(response)
    # Each scrutin row: td[1] links to the vote detail page (href containing
    # "/jo"), td[2] holds the title and, optionally, a link to the dossier.
    for scrutin in lxs.xpath("//a[contains(@href, '/jo')]/../../../.."):
        title = scrutin.xpath("td[2]//text()")[0].extract()
        # Collapse whitespace and keep everything before the last " - " separator.
        title = " - ".join(re.sub(r"\s+", " ", title).strip().split(" - ")[:-1])
        vote_href = urljoin(response.url, scrutin.xpath("td[1]//a/@href")[0].extract())
        if scrutin.xpath("td[2]//a/@href"):
            file_href = urljoin(response.url, scrutin.xpath("td[2]//a/@href")[0].extract())
        else:
            file_href = None
        # The URL path looks like /<leg>/scrutins/jo<num>.asp.
        vote_path = urlsplit(vote_href)[2].split("/")
        leg = vote_path[1]
        num = vote_path[-1].split(".")[0].replace("jo", "")
        # vote_href = "http://www.assemblee-nationale.fr/12/scrutins/jo0848.asp"
        yield Request(
            url=vote_href,
            callback=self.parse_vote_page,
            meta={
                "item": ScrutinyItem(
                    uuid="%s-%s" % (leg, num),
                    title=title,
                    file_href=file_href,
                    leg=leg,
                    num=num,
                    url=vote_href,
                ),
            },
        )
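# ScrutinyItem is not defined in this section. The sketch below is an
# assumption: a standard scrapy Item whose fields are inferred from how the
# item is populated here and in parse_vote_page/parse_info_page.
from scrapy.item import Item, Field

class ScrutinyItem(Item):
    uuid = Field()        # "<leg>-<num>", e.g. "12-0848"
    title = Field()       # row title, minus the trailing " - " segment
    file_href = Field()   # absolute URL of the dossier page, or None
    leg = Field()         # legislature number taken from the URL path
    num = Field()         # scrutin number, with the "jo" prefix stripped
    url = Field()         # vote detail page URL
    date = Field()        # ISO 8601 string, set in parse_vote_page
    votes = Field()       # per-group breakdown, set in parse_vote_page
    info = Field()        # set in parse_info_page
    amendments = Field()  # set in parse_info_page
    summary = Field()     # set in parse_info_page
    law = Field()         # LawItem, set in parse_info_page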
def parse_vote_page(self, response):
    lxs = LxmlSelector(response)
    item = response.meta["item"]
    etree.strip_tags(lxs.xmlNode, "b", "font", "i", "sup")
    meta = self.meta_as_dict(lxs)
    # First try the numeric date form ("du 12/06/2003").
    date_txt = lxs.xpath("//text()").re(r"[DUdu\s:]+(\d+/\d+/\d+)")
    if date_txt:
        item["date"] = datetime.strptime(date_txt[0], "%d/%m/%Y").isoformat()
    else:
        # Fall back to the spelled-out form ("du 1er janvier 2002").
        page_text = "".join(lxs.xpath("//text()").extract())
        page_text = page_text.replace(u"\u00A0", " ")
        page_text = page_text.encode("utf-8")
        date_txt = re.search(r"du[:\s]+(\d+)[er]*\s+(.+?)\s+(\d+)", page_text)
        if date_txt:
            date_txt = " ".join(date_txt.groups())
            # %B matches month names in the current locale, which must be
            # French for these pages.
            item["date"] = datetime.strptime(date_txt, "%d %B %Y").isoformat()
        else:
            raise ValueError("no date found on %s" % response.url)
    if lxs.css("#analyse p.nomgroupe"):
        item["votes"] = self.parse_vote_first_layout(lxs, response)
    else:
        # 2nd layout!
        item["votes"] = self.parse_vote_second_layout(lxs)
    if item.get("file_href"):
        yield Request(
            url=item["file_href"],
            callback=self.parse_info_page,
            meta={
                "item": item,
            },
        )
    else:
        yield item
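# The %B fallback above only works when the process locale is French, which
# is easy to get wrong in deployment. A locale-independent alternative, as a
# sketch (not what the spider does; FRENCH_MONTHS and parse_french_date are
# hypothetical helpers):
from datetime import datetime

FRENCH_MONTHS = {
    u"janvier": 1, u"février": 2, u"mars": 3, u"avril": 4, u"mai": 5,
    u"juin": 6, u"juillet": 7, u"août": 8, u"septembre": 9, u"octobre": 10,
    u"novembre": 11, u"décembre": 12,
}

def parse_french_date(day, month, year):
    # parse_french_date(u"1", u"janvier", u"2002") -> "2002-01-01T00:00:00"
    return datetime(int(year), FRENCH_MONTHS[month.lower()], int(day)).isoformat()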
def parse_info_page(self, response):
    def get_text_formatted(node):
        from lxml.html import fromstring

        # Preserve line structure: turn <br/> into newlines before flattening.
        etree.strip_tags(node.xmlNode, "a")
        txt = node.extract()
        txt = txt.replace("<br/>", "\n")
        txt = txt.replace(u"\u00A0", " ")
        txt = fromstring(txt).text_content()
        txt = re.sub(r"\n[ \t]+", "\n", txt)
        return txt.strip()

    def get_text(node, regexp=None, invert=False):
        # NB: defined here but not used below.
        etree.strip_tags(node.xmlNode, "a")
        txt = ""
        for line in node.xpath(".//text()").extract():
            line = line.replace(u"\u00A0", " ")
            line = line.strip()
            if not line:
                continue
            match = True
            if regexp:
                match = bool(regexp.search(line))
            if (match and not invert) or (not match and invert):
                # A line starting with an uppercase letter opens a new sentence.
                if line[0] != line[0].lower():
                    txt += ". "
                txt += " %s " % line
        txt = re.sub(r"(\s\.+\s)+", ".", txt)
        txt = re.sub(r"\s+", " ", txt)
        txt = re.sub(r"\.+", ".", txt)
        txt = re.sub(r"^\. ", "", txt)
        return txt.strip()

    lxs = LxmlSelector(response)
    item = response.meta["item"]
    meta = self.meta_as_dict(lxs)
    etree.strip_tags(lxs.xmlNode, "b", "font", "i")
    # The dossier page marks its sections with named anchors (PDT, PAC,
    # ECRCM), mapped here to info, amendments and summary.
    info_node = lxs.xpath("//a[@name = 'PDT']/ancestor::td[1]")
    if info_node:
        item["info"] = get_text_formatted(info_node[0])
    amendments_node = lxs.xpath("//a[@name = 'PAC']/ancestor::td[1]")
    if amendments_node:
        item["amendments"] = get_text_formatted(amendments_node[0])
    summary_node = lxs.xpath("//a[@name = 'ECRCM']/ancestor::td[1]")
    if summary_node:
        item["summary"] = get_text_formatted(summary_node[0])
    file_href = meta.get("URL_DOSSIER") or None
    if file_href:
        file_href = urljoin(response.url, file_href)
    item["law"] = LawItem(
        title=meta.get("LOI_PROMULGUEE", ""),
        href=meta.get("LIEN_LOI_PROMULGUEE", ""),
        file_href=file_href,
    )
    yield item
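# Neither meta_as_dict nor LawItem is defined in this section. Judging from
# the keys read above (URL_DOSSIER, LOI_PROMULGUEE, LIEN_LOI_PROMULGUEE),
# meta_as_dict presumably maps each <meta name="..." content="..."> tag of
# the page to its content; a sketch, assuming the selector API behaves like
# the calls used elsewhere in this spider:
def meta_as_dict_sketch(lxs):
    meta = {}
    for node in lxs.xpath("//meta[@name and @content]"):
        meta[node.xpath("@name")[0].extract()] = node.xpath("@content")[0].extract()
    return meta

# LawItem, with fields inferred from the constructor call above:
from scrapy.item import Item, Field

class LawItem(Item):
    title = Field()      # LOI_PROMULGUEE meta tag
    href = Field()       # LIEN_LOI_PROMULGUEE meta tag
    file_href = Field()  # URL_DOSSIER made absolute, or None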
def parse_depute_page_leg13(self, response):
    lxs = LxmlSelector(response)
    yield DeputyItem(
        uuid=urlsplit(response.url)[2].split("/")[-1].split(".")[0],
        name=clean_name(lxs.css(".deputy-headline-title").text().extract()[0]),
        image=urljoin(response.url, lxs.css(".deputy-profile-picture")[0].attrib("src").extract()),
        url=response.url,
        # "healine" (sic): the selector has to match the site's own spelling.
        jurisdiction=lxs.css(".deputy-healine-sub-title").text().extract()[0],
        party=lxs.css(".political-party").text().extract()[0],
    )
def parse_deputes_page(self, response):
    lxs = LxmlSelector(response)
    leg_parsers = {
        11: self.parse_depute_page_leg11,
        12: self.parse_depute_page_leg12,
        13: self.parse_depute_page_leg13,
    }
    for depute_node in lxs.css(".dep2"):
        yield Request(
            url=urljoin(response.url, depute_node.attrib("href").extract()),
            callback=leg_parsers[response.meta["leg"]],
        )
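# parse_deputes_page expects response.meta["leg"] to have been set when the
# listing page was scheduled. A sketch of such a request; the listing URL
# below is a placeholder, not the real one:
from scrapy.http import Request

def schedule_deputes_listing_sketch(self, leg):
    url = "http://www.assemblee-nationale.fr/%d/" % leg  # placeholder path
    return Request(url=url, callback=self.parse_deputes_page, meta={"leg": leg})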
def parse_depute_page_leg12(self, response):
    lxs = LxmlSelector(response)
    etree.strip_tags(lxs.xmlNode, "u", "b", "font", "i", "sup")
    uuid = urlsplit(response.url)[2].split("/")[-1].split(".")[0]
    jurisdiction_line = (
        lxs.xpath("//td[contains(text(), 'Circonscription ')]/following-sibling::td[1]/text()")
        .extract()[0]
        .encode("utf-8")
    )
    jurisdiction = "%s (%s circonscription)" % re.search(r"(.*?) \((.*)\)", jurisdiction_line).groups()
    yield DeputyItem(
        uuid=uuid,
        name=clean_name(lxs.css(".titre").text().extract()[0]),
        image="http://www.assemblee-nationale.fr/12/tribun/photos/%s.jpg" % uuid,
        url=response.url,
        jurisdiction=jurisdiction,
    )
def parse_depute_page_leg11(self, response):
    lxs = LxmlSelector(response)
    etree.strip_tags(lxs.xmlNode, "b", "font", "i", "sup")
    uuid = urlsplit(response.url)[2].split("/")[-1].split(".")[0]
    jurisdiction_line = "".join(
        lxs.xpath("//*[contains(text(), 'Circonscription ')]//text()").extract()
    ).encode("utf-8")
    if jurisdiction_line:
        jurisdiction = (
            "%s (%s circonscription)"
            % re.search(r"Circonscription d'élection : (.*?) \((.*)\)", jurisdiction_line).groups()
        )
    else:
        jurisdiction = None
    yield DeputyItem(
        uuid=uuid,
        name=clean_name(lxs.xpath("//a[@name='P-1_0']/..//text()")[0].extract()),
        image="http://www.assemblee-nationale.fr/11/tribun/photos/%s.jpg" % uuid,
        url=response.url,
        jurisdiction=jurisdiction,
    )
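# DeputyItem and clean_name are defined elsewhere. Sketches inferred from
# usage; the cleaning rules in clean_name_sketch are assumptions:
import re
from scrapy.item import Item, Field

class DeputyItem(Item):
    uuid = Field()          # identifier taken from the page URL
    name = Field()
    image = Field()         # portrait URL
    url = Field()
    jurisdiction = Field()  # "<place> (<n> circonscription)", or None
    party = Field()         # only extracted for legislature 13

def clean_name_sketch(name):
    # Collapse whitespace, then strip a leading honorific, if any.
    name = re.sub(r"\s+", " ", name).strip()
    return re.sub(r"^(M\.|Mme|Mlle)\s+", "", name)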