def parse_scrutin_page(self, response):
    lxs = LxmlSelector(response)
    # Each scrutin row: td[1] links to the vote detail page (href containing
    # "/jo"), td[2] holds the title and, optionally, a link to the dossier.
    for scrutin in lxs.xpath("//a[contains(@href, '/jo')]/../../../.."):
        title = scrutin.xpath("td[2]//text()")[0].extract()
        # Collapse whitespace and keep everything before the last " - " separator.
        title = " - ".join(re.sub(r"\s+", " ", title).strip().split(" - ")[:-1])
        vote_href = urljoin(response.url, scrutin.xpath("td[1]//a/@href")[0].extract())
        if scrutin.xpath("td[2]//a/@href"):
            file_href = urljoin(response.url, scrutin.xpath("td[2]//a/@href")[0].extract())
        else:
            file_href = None
        # The URL path looks like /<leg>/scrutins/jo<num>.asp.
        vote_path = urlsplit(vote_href)[2].split("/")
        leg = vote_path[1]
        num = vote_path[-1].split(".")[0].replace("jo", "")
        # vote_href = "http://www.assemblee-nationale.fr/12/scrutins/jo0848.asp"
        yield Request(
            url=vote_href,
            callback=self.parse_vote_page,
            meta={
                "item": ScrutinyItem(
                    uuid="%s-%s" % (leg, num),
                    title=title,
                    file_href=file_href,
                    leg=leg,
                    num=num,
                    url=vote_href,
                ),
            },
        )
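# ScrutinyItem is not defined in this section. The sketch below is an
# assumption: a standard scrapy Item whose fields are inferred from how the
# item is populated here and in parse_vote_page/parse_info_page.
from scrapy.item import Item, Field

class ScrutinyItem(Item):
    uuid = Field()        # "<leg>-<num>", e.g. "12-0848"
    title = Field()       # row title, minus the trailing " - " segment
    file_href = Field()   # absolute URL of the dossier page, or None
    leg = Field()         # legislature number taken from the URL path
    num = Field()         # scrutin number, with the "jo" prefix stripped
    url = Field()         # vote detail page URL
    date = Field()        # ISO 8601 string, set in parse_vote_page
    votes = Field()       # per-group breakdown, set in parse_vote_page
    info = Field()        # set in parse_info_page
    amendments = Field()  # set in parse_info_page
    summary = Field()     # set in parse_info_page
    law = Field()         # LawItem, set in parse_info_page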
def parse_vote_page(self, response):
    lxs = LxmlSelector(response)
    item = response.meta["item"]
    etree.strip_tags(lxs.xmlNode, "b", "font", "i", "sup")
    meta = self.meta_as_dict(lxs)
    # First try the numeric date form ("du 12/06/2003").
    date_txt = lxs.xpath("//text()").re(r"[DUdu\s:]+(\d+/\d+/\d+)")
    if date_txt:
        item["date"] = datetime.strptime(date_txt[0], "%d/%m/%Y").isoformat()
    else:
        # Fall back to the spelled-out form ("du 1er janvier 2002").
        page_text = "".join(lxs.xpath("//text()").extract())
        page_text = page_text.replace(u"\u00A0", " ")
        page_text = page_text.encode("utf-8")
        date_txt = re.search(r"du[:\s]+(\d+)[er]*\s+(.+?)\s+(\d+)", page_text)
        if date_txt:
            date_txt = " ".join(date_txt.groups())
            # %B matches month names in the current locale, which must be
            # French for these pages.
            item["date"] = datetime.strptime(date_txt, "%d %B %Y").isoformat()
        else:
            raise ValueError("no date found on %s" % response.url)
    if lxs.css("#analyse p.nomgroupe"):
        item["votes"] = self.parse_vote_first_layout(lxs, response)
    else:
        # 2nd layout!
        item["votes"] = self.parse_vote_second_layout(lxs)
    if item.get("file_href"):
        yield Request(
            url=item["file_href"],
            callback=self.parse_info_page,
            meta={
                "item": item,
            },
        )
    else:
        yield item
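# The %B fallback above only works when the process locale is French, which
# is easy to get wrong in deployment. A locale-independent alternative, as a
# sketch (not what the spider does; FRENCH_MONTHS and parse_french_date are
# hypothetical helpers):
from datetime import datetime

FRENCH_MONTHS = {
    u"janvier": 1, u"février": 2, u"mars": 3, u"avril": 4, u"mai": 5,
    u"juin": 6, u"juillet": 7, u"août": 8, u"septembre": 9, u"octobre": 10,
    u"novembre": 11, u"décembre": 12,
}

def parse_french_date(day, month, year):
    # parse_french_date(u"1", u"janvier", u"2002") -> "2002-01-01T00:00:00"
    return datetime(int(year), FRENCH_MONTHS[month.lower()], int(day)).isoformat()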
def parse_info_page(self, response):
    def get_text_formatted(node):
        from lxml.html import fromstring

        # Preserve line structure: turn <br/> into newlines before flattening.
        etree.strip_tags(node.xmlNode, "a")
        txt = node.extract()
        txt = txt.replace("<br/>", "\n")
        txt = txt.replace(u"\u00A0", " ")
        txt = fromstring(txt).text_content()
        txt = re.sub(r"\n[ \t]+", "\n", txt)
        return txt.strip()

    def get_text(node, regexp=None, invert=False):
        # NB: defined here but not used below.
        etree.strip_tags(node.xmlNode, "a")
        txt = ""
        for line in node.xpath(".//text()").extract():
            line = line.replace(u"\u00A0", " ")
            line = line.strip()
            if not line:
                continue
            match = True
            if regexp:
                match = bool(regexp.search(line))
            if (match and not invert) or (not match and invert):
                # A line starting with an uppercase letter opens a new sentence.
                if line[0] != line[0].lower():
                    txt += ". "
                txt += " %s " % line
        txt = re.sub(r"(\s\.+\s)+", ".", txt)
        txt = re.sub(r"\s+", " ", txt)
        txt = re.sub(r"\.+", ".", txt)
        txt = re.sub(r"^\. ", "", txt)
        return txt.strip()

    lxs = LxmlSelector(response)
    item = response.meta["item"]
    meta = self.meta_as_dict(lxs)
    etree.strip_tags(lxs.xmlNode, "b", "font", "i")
    # The dossier page marks its sections with named anchors (PDT, PAC,
    # ECRCM), mapped here to info, amendments and summary.
    info_node = lxs.xpath("//a[@name = 'PDT']/ancestor::td[1]")
    if info_node:
        item["info"] = get_text_formatted(info_node[0])
    amendments_node = lxs.xpath("//a[@name = 'PAC']/ancestor::td[1]")
    if amendments_node:
        item["amendments"] = get_text_formatted(amendments_node[0])
    summary_node = lxs.xpath("//a[@name = 'ECRCM']/ancestor::td[1]")
    if summary_node:
        item["summary"] = get_text_formatted(summary_node[0])
    file_href = meta.get("URL_DOSSIER") or None
    if file_href:
        file_href = urljoin(response.url, file_href)
    item["law"] = LawItem(
        title=meta.get("LOI_PROMULGUEE", ""),
        href=meta.get("LIEN_LOI_PROMULGUEE", ""),
        file_href=file_href,
    )
    yield item
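# Neither meta_as_dict nor LawItem is defined in this section. Judging from
# the keys read above (URL_DOSSIER, LOI_PROMULGUEE, LIEN_LOI_PROMULGUEE),
# meta_as_dict presumably maps each <meta name="..." content="..."> tag of
# the page to its content; a sketch, assuming the selector API behaves like
# the calls used elsewhere in this spider:
def meta_as_dict_sketch(lxs):
    meta = {}
    for node in lxs.xpath("//meta[@name and @content]"):
        meta[node.xpath("@name")[0].extract()] = node.xpath("@content")[0].extract()
    return meta

# LawItem, with fields inferred from the constructor call above:
from scrapy.item import Item, Field

class LawItem(Item):
    title = Field()      # LOI_PROMULGUEE meta tag
    href = Field()       # LIEN_LOI_PROMULGUEE meta tag
    file_href = Field()  # URL_DOSSIER made absolute, or None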
def parse_depute_page_leg13(self, response):
    lxs = LxmlSelector(response)
    yield DeputyItem(
        uuid=urlsplit(response.url)[2].split("/")[-1].split(".")[0],
        name=clean_name(lxs.css(".deputy-headline-title").text().extract()[0]),
        image=urljoin(response.url, lxs.css(".deputy-profile-picture")[0].attrib("src").extract()),
        url=response.url,
        # "healine" (sic): the selector has to match the site's own spelling.
        jurisdiction=lxs.css(".deputy-healine-sub-title").text().extract()[0],
        party=lxs.css(".political-party").text().extract()[0],
    )
def parse_deputes_page(self, response):
    lxs = LxmlSelector(response)
    leg_parsers = {
        11: self.parse_depute_page_leg11,
        12: self.parse_depute_page_leg12,
        13: self.parse_depute_page_leg13,
    }
    for depute_node in lxs.css(".dep2"):
        yield Request(
            url=urljoin(response.url, depute_node.attrib("href").extract()),
            callback=leg_parsers[response.meta["leg"]],
        )
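# parse_deputes_page expects response.meta["leg"] to have been set when the
# listing page was scheduled. A sketch of such a request; the listing URL
# below is a placeholder, not the real one:
from scrapy.http import Request

def schedule_deputes_listing_sketch(self, leg):
    url = "http://www.assemblee-nationale.fr/%d/" % leg  # placeholder path
    return Request(url=url, callback=self.parse_deputes_page, meta={"leg": leg})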
def parse_depute_page_leg12(self, response):
    lxs = LxmlSelector(response)
    etree.strip_tags(lxs.xmlNode, "u", "b", "font", "i", "sup")
    uuid = urlsplit(response.url)[2].split("/")[-1].split(".")[0]
    jurisdiction_line = (
        lxs.xpath("//td[contains(text(), 'Circonscription ')]/following-sibling::td[1]/text()")
        .extract()[0]
        .encode("utf-8")
    )
    jurisdiction = "%s (%s circonscription)" % re.search(r"(.*?) \((.*)\)", jurisdiction_line).groups()
    yield DeputyItem(
        uuid=uuid,
        name=clean_name(lxs.css(".titre").text().extract()[0]),
        image="http://www.assemblee-nationale.fr/12/tribun/photos/%s.jpg" % uuid,
        url=response.url,
        jurisdiction=jurisdiction,
    )
def parse_depute_page_leg11(self, response):
    lxs = LxmlSelector(response)
    etree.strip_tags(lxs.xmlNode, "b", "font", "i", "sup")
    uuid = urlsplit(response.url)[2].split("/")[-1].split(".")[0]
    jurisdiction_line = "".join(
        lxs.xpath("//*[contains(text(), 'Circonscription ')]//text()").extract()
    ).encode("utf-8")
    if jurisdiction_line:
        jurisdiction = (
            "%s (%s circonscription)"
            % re.search(r"Circonscription d'élection : (.*?) \((.*)\)", jurisdiction_line).groups()
        )
    else:
        jurisdiction = None
    yield DeputyItem(
        uuid=uuid,
        name=clean_name(lxs.xpath("//a[@name='P-1_0']/..//text()")[0].extract()),
        image="http://www.assemblee-nationale.fr/11/tribun/photos/%s.jpg" % uuid,
        url=response.url,
        jurisdiction=jurisdiction,
    )
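# DeputyItem and clean_name are defined elsewhere. Sketches inferred from
# usage; the cleaning rules in clean_name_sketch are assumptions:
import re
from scrapy.item import Item, Field

class DeputyItem(Item):
    uuid = Field()          # identifier taken from the page URL
    name = Field()
    image = Field()         # portrait URL
    url = Field()
    jurisdiction = Field()  # "<place> (<n> circonscription)", or None
    party = Field()         # only extracted for legislature 13

def clean_name_sketch(name):
    # Collapse whitespace, then strip a leading honorific, if any.
    name = re.sub(r"\s+", " ", name).strip()
    return re.sub(r"^(M\.|Mme|Mlle)\s+", "", name)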