Example No. 1
    def parse(self, response):
        gazettes = response.css("a")
        for gazette in gazettes:
            gazette_info = gazette.css("::text").re(
                r"Edi..o\s*n.\s*(\d+) de (\d{2}) de (.*?) de (\d{4})")
            if not gazette_info:
                self.logger.warning(
                    f"Unable to identify gazette info for {response.url}.")
                continue

            edition_number, day, month, year = gazette_info
            gazette_date = parse(f"{day} de {month} de {year}",
                                 languages=["pt"]).date()
            if gazette_date < self.start_date:
                continue

            gazette_url = response.urljoin(gazette.css("::attr(href)").get())

            link_text = gazette.css("::text").get().lower()
            is_extra_edition = bool(
                re.search(r"suplemento|complemento|especial", link_text))

            yield Gazette(
                date=gazette_date,
                edition_number=edition_number,
                file_urls=[gazette_url],
                is_extra_edition=is_extra_edition,
                power="executive",
            )
Example No. 2
    def parse(self, response):
        follow_next_page = True

        gazettes = response.css("h4.card-title")
        for gazette in gazettes:
            gazette_url = gazette.xpath(".//following-sibling::a/@href").get()
            edition_number = gazette.css("a::text").re_first(
                r"Edição (\d+\/\d+)")

            raw_gazette_date = gazette.css("a::text").re_first(
                r"(\d{2}\/\d{2}\/\d{4})")
            if not raw_gazette_date:
                continue

            gazette_date = datetime.datetime.strptime(raw_gazette_date,
                                                      "%d/%m/%Y").date()

            yield Gazette(
                date=gazette_date,
                edition_number=edition_number,
                file_urls=[gazette_url],
                power="executive_legislative",
            )

            if gazette_date < self.start_date:
                follow_next_page = False
                break

        next_page_url = response.css("a.next::attr(href)").get()
        if follow_next_page and next_page_url:
            yield scrapy.Request(next_page_url)
Example No. 3
    def parse(self, response, page=1):
        gazettes = response.css(".list-group-item")
        last_page_number_css = ".pagination > li:nth-last-child(-n+2) > a > span::text"
        last_page_number = int(response.css(last_page_number_css).get())
        follow_next_page = True

        for gazette in gazettes:
            gazette_date_raw = gazette.css(
                "div > div > span::text").get().strip()
            gazette_date = parse(gazette_date_raw, languages=["pt"]).date()

            gazette_title_raw = gazette.css(
                "h4 > div > div > strong::text").get()

            edition_number = gazette_title_raw.strip()

            if gazette_date < self.start_date or page == last_page_number:
                follow_next_page = False

            partial_url = gazette.css("a::attr(href)").get()
            url = f"https://paudosferros.rn.gov.br/{partial_url}"

            yield Gazette(
                date=gazette_date,
                file_urls=[url],
                edition_number=edition_number,
                power="executive_legislative",
            )

        if follow_next_page:
            next_page = page + 1
            yield Request(
                f"{self.start_urls[0]}&pagina={next_page}",
                cb_kwargs={"page": next_page},
            )
Example No. 4
    def parse(self, response):
        follow_next_page = True
        gazettes = response.css(".listing tbody tr")
        for gazette in gazettes:
            gazette_date_raw = gazette.xpath("./td[1]//text()").re_first(
                r"\d{2}\/\d{2}\/\d{4}")
            gazette_date = datetime.datetime.strptime(gazette_date_raw,
                                                      "%d/%m/%Y").date()

            if gazette_date < self.start_date:
                follow_next_page = False
                break

            title = "".join(gazette.xpath("./td[2]//text()").getall()).strip()
            edition_number = self._extract_edition_number(title, gazette_date)
            is_extra_edition = re.search(r"eex|ext", title.lower()) is not None
            gazette_url = gazette.css("a::attr(href)").get()

            yield Gazette(
                date=gazette_date,
                edition_number=edition_number,
                is_extra_edition=is_extra_edition,
                file_urls=[gazette_url],
                power="executive",
            )

        if follow_next_page:
            next_page_url = response.css(".next a::attr(href)").get()
            # Guard against the last page, where no "next" link is available
            if next_page_url:
                yield scrapy.Request(next_page_url)
Example No. 5
    def parse(self, response):
        gazettes = response.css(".diario-resultado-pesquisa tbody tr")
        for gazette in gazettes:
            gazette_date = gazette.xpath("./td[2]/text()").get()
            gazette_url = response.urljoin(gazette.css("a::attr(href)").get())

            is_extra_edition = bool(
                gazette.xpath(".//*[contains(., 'Suplemento')]"))
            if is_extra_edition:
                # Extra editions don't have a date in their own row, so we need to get
                # it from the main edition of that day (see the sketch after this example)
                gazette_date = self._get_date_from_parent_edition(
                    response, gazette)

            item = Gazette(
                date=dateparser.parse(gazette_date, languages=["pt"]).date(),
                is_extra_edition=is_extra_edition,
                power="executive_legislative",
            )
            yield scrapy.Request(
                gazette_url,
                method="HEAD",
                callback=self.parse_pdf_url,
                cb_kwargs={"item": item},
            )

        next_pages_urls = response.css(".pagination a::attr(href)").getall()
        for next_page_url in next_pages_urls:
            yield scrapy.Request(response.urljoin(next_page_url))
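The _get_date_from_parent_edition helper is not part of this excerpt. As a purely hypothetical sketch of the idea described in the comment above, assuming supplements share the results table with their main edition, it could look roughly like this:

    def _get_date_from_parent_edition(self, response, gazette):
        # Hypothetical sketch, not the spider's actual helper: take the date
        # from the nearest preceding row whose second cell carries text, on the
        # assumption that supplements follow their main edition in the table.
        parent_row = gazette.xpath(
            "./preceding-sibling::tr[normalize-space(td[2])][1]")
        return parent_row.xpath("./td[2]/text()").get(default="").strip() or None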
Example No. 6
    def parse(self, response):
        """Parses gazettes page and requests next page.

        Normal gazettes are published on a weekly basis, so the date taken into
        account for this type of gazette is the last one in the publication period
        (i.e. "29/08/2020" from "23/08/2020 to 29/08/2020"); see the standalone
        sketch after this example.

        Special gazettes are daily, but the same logic applies and works correctly.
        """
        gazettes = response.css(".table-semanarios table tbody tr")
        for gazette in gazettes:
            url = gazette.css("td:last-child a::attr(href)").get()
            gazette_date = (gazette.css("td:nth-last-child(2)::text").re(
                r"[0-9]{2}/[0-9]{2}/[0-9]{4}").pop())
            gazette_date = datetime.datetime.strptime(gazette_date,
                                                      "%d/%m/%Y").date()
            is_extra = "Especial" in gazette.css("td:first-child").get()

            yield Gazette(
                date=gazette_date,
                file_urls=[url],
                is_extra_edition=is_extra,
                power="executive_legislative",
            )

        for url in response.css(".pagination a.next::attr(href)").getall():
            yield response.follow(url)
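To make the period-to-date rule from the docstring concrete, here is a minimal standalone sketch with illustrative data (not taken from the site) showing how the last dd/mm/yyyy match in a period string yields the end of the publication period, mirroring the .re(...).pop() call above:

import datetime
import re

# Illustrative period text with the same shape as the listing column.
period_text = "23/08/2020 a 29/08/2020"

# The last dd/mm/yyyy match is the final day of the publication period.
last_date = re.findall(r"[0-9]{2}/[0-9]{2}/[0-9]{4}", period_text)[-1]
gazette_date = datetime.datetime.strptime(last_date, "%d/%m/%Y").date()
# gazette_date == datetime.date(2020, 8, 29)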
Example No. 7
    def parse_year(self, response):
        # The page with the list of gazettes is simply a table with links
        links = response.css("a")
        items = []
        for link in links:
            url = link.css("::attr(href)").extract_first()
            if url[-4:] != ".pdf":
                continue

            url = response.urljoin(url)
            # Apparently, Goiânia doesn't publish separate gazettes for the executive and legislative branches
            power = "executive_legislature"
            link_text = link.css("::text").extract_first()
            if link_text is None:
                continue

            date = re.match(r".*(\d{2} .* de \d{4})", link_text)[1]
            # Extra editions are marked either with 'suplemento' or 'comunicado'
            is_extra_edition = ("suplemento" in link_text.lower()
                                or "comunicado" in link_text.lower())
            date = parse(date.split("-")[0], languages=["pt"]).date()
            items.append(
                Gazette(
                    date=date,
                    file_urls=[url],
                    is_extra_edition=is_extra_edition,
                    power=power,
                ))
        return items
Example No. 8
    def parse(self, response):
        gazette_table = response.css(".style166")
        gazettes_links = gazette_table.xpath("a//@href").extract()
        dates = gazette_table.css("a::text").extract()

        for url, date in zip(gazettes_links, dates):
            edition = self._extract_edition(url)
            power = self._extract_power(url)
            power_id = self.powers[power]

            gazette = Gazette(
                date=parse(date, languages=["pt"]).date(),
                is_extra_edition=False,
                power=power,
            )

            gazette_details_page = f"abrir.asp?edi={edition}&p={power_id}"
            gazette_url = response.urljoin(gazette_details_page)
            yield Request(gazette_url,
                          callback=self.parse_document_url,
                          meta={"item": gazette})

        current_page_selector = "#pages ul li.current::text"
        current_page = response.css(current_page_selector).extract_first()
        next_page = int(current_page) + 1
        next_page_url = response.urljoin(f"/?p={next_page}")

        if next_page > self.last_page:
            self.last_page = next_page
            yield Request(next_page_url)
Example No. 9
    def parse(self, response):
        """
        @url http://apps.fortaleza.ce.gov.br/diariooficial/
        @returns requests 1
        @scrapes date file_urls is_extra_edition territory_id power scraped_at
        """

        for element in response.css(self.GAZETTE_ELEMENT_CSS):
            url = response.urljoin(
                element.css("a::attr(href)").extract_first())
            date = dateparser.parse(element.css(
                self.DATE_CSS).extract_first(""),
                                    languages=["pt"]).date()
            # Extra editions are marked with an "s" in the description. Example: Diário Oficial Nº 15923s
            extra_edition = element.css(
                self.EXTRA_CSS).extract_first("").endswith("s")

            yield Gazette(
                date=date,
                file_urls=[url],
                is_extra_edition=extra_edition,
                territory_id=self.TERRITORY_ID,
                power="executive",
                scraped_at=datetime.utcnow(),
            )

        for page_number in response.css(self.NEXT_PAGE_CSS).re(r"#(\d+)"):
            next_url = w3lib.url.add_or_replace_parameter(
                response.url, "current", page_number)
            yield Request(next_url)
Example No. 10
 def parse_page(self, response):
     for idx, row in enumerate(response.css(".grid_Row")):
         pdf_date = row.css("td:nth-child(2) span ::text").extract_first()
         gazette_id = row.css(
             "td:nth-child(3) a ::attr(data-teste)").extract_first()
         parsed_date = parse(f"{pdf_date}", languages=["pt"]).date()
         if gazette_id == "0":
             starting_offset = 3
             formdata = {
                 "__LASTFOCUS": "",
                 "__EVENTTARGET":
                 f"ctl00$cphMasterPrincipal$gdvGrid2$ctl{idx + starting_offset:02d}$lnkVisualizar",
                 "__EVENTARGUMENT": "",
                 "__ASYNCPOST": "true",
             }
             yield scrapy.FormRequest.from_response(
                 response,
                 formdata=formdata,
                 callback=self.parse_regular_edition,
                 meta={"parsed_date": parsed_date},
             )
         else:
             yield Gazette(
                 date=parsed_date,
                 file_urls=[
                     f"http://legisladocexterno.curitiba.pr.gov.br/DiarioSuplementoConsultaExterna_Download.aspx?id={gazette_id}"
                 ],
                 is_extra_edition=True,
                 territory_id=self.TERRITORY_ID,
                 power="executive_legislature",
                 scraped_at=datetime.utcnow(),
             )
Example No. 11
    def parse(self, response):
        for element in response.css(self.GAZETTE_ELEMENT_CSS):
            url = element.css("a::attr(href)").extract_first()
            date = dateparser.parse(
                element.xpath(self.DATE_XPATH).extract_first(), languages=["pt"]
            ).date()

            yield Gazette(
                date=date,
                file_urls=[url],
                is_extra_edition=False,
                territory_id=self.TERRITORY_ID,
                power="executive",
                scraped_at=datetime.utcnow(),
            )

        current_page = w3lib.url.url_query_parameter(response.url, "pg")

        if (
            not response.css(self.LAST_PAGE_CSS)
            .extract_first()
            .endswith("pg=" + current_page)
        ):
            next_url = w3lib.url.add_or_replace_parameter(
                response.url, "pg", str(int(current_page) + 1)
            )
            yield Request(next_url)
Example No. 12
 def parse(self, response):
     """
     @url http://www.cascavel.pr.gov.br/servicos/orgao_oficial.php
     @returns items 1
     @scrapes date file_urls is_extra_edition territory_id power scraped_at
     """
     for row in response.xpath("//table//tr[position()>1]"):
         date = row.xpath(".//td[2]//font//text()").extract_first()
         date = parse(date, languages=["pt"]).date()
         for link in row.xpath(".//td[3]//a"):
             link_text = link.xpath(".//text()").extract_first()
             power = "executive" if "Executivo" in link_text else "legislature"
             url = response.urljoin(link.xpath("./@href").extract_first(""))
             yield Gazette(
                 date=date,
                 file_urls=[url],
                 is_extra_edition=False,
                 territory_id=self.TERRITORY_ID,
                 power=power,
                 scraped_at=dt.datetime.utcnow(),
             )
     next_page_xpath = '//a[@title="Próxima página"]/@href'
     next_page_url = response.xpath(next_page_xpath).extract_first()
     if next_page_url:
         yield response.follow(next_page_url)
Example No. 13
    def get_gazette(self, document, is_after_transition):
        """
        Extract the information from the document and return a Gazette item
        """
        title = document.css("::text").get()

        edition_number = re.search(r"\d+", title).group(0)
        is_extra_edition = bool(re.search(r"EXTRA", title, re.IGNORECASE))

        date_text = re.search(r"(\d{1,2}\w+\d{4})|(\d{1,2}.\d{1,2}.\d{4})",
                              title).group(0)
        date = dateparser.parse(date_text, languages=["pt"]).date()

        if is_after_transition:
            file_url = self.get_file_url(title, date)
        else:
            file_url = document.css("::attr(href)").get()

        return Gazette(
            date=date,
            edition_number=edition_number,
            file_urls=[file_url],
            power="executive_legislative",
            is_extra_edition=is_extra_edition,
        )
Example No. 14
 def build_gazzete(self, date, url, power, is_extra_edition=False):
     return Gazette(
         date=date,
         file_urls=[url],
         is_extra_edition=is_extra_edition,
         power=power,
     )
Example No. 15
    def parse(self, response):
        """Parses gazettes page and requests next page.

        Normal gazettes are published on a weekly basis, so the date taken into
        account for this type of gazette is the last one in the publication period
        (i.e. "29/08/2020" from "23/08/2020 to 29/08/2020").

        Special gazettes are daily, but the same logic applies and works correctly.
        """

        for element in response.css(self.GAZETTE_ROW_CSS):
            url = element.css(self.GAZETTE_URL_CSS).extract_first()
            date = element.css(self.DATE_CSS).re(self.DATE_REGEX).pop()
            date = dateparser.parse(date, languages=["pt"]).date()
            is_extra = "Especial" in element.css(
                self.EXTRA_EDITION_CSS).extract_first()

            yield Gazette(
                date=date,
                file_urls=[url],
                is_extra_edition=is_extra,
                power="executive_legislative",
            )

        for url in response.css(self.NEXT_PAGE_CSS).extract():
            yield response.follow(url)
Example No. 16
    def parse(self, response):
        gazettes = response.css("#ContentPlaceHolder1_gvResultado tbody tr")
        for gazette in gazettes:
            gazette_raw_date = gazette.xpath(".//td[2]/text()").get()
            gazette_date = datetime.datetime.strptime(gazette_raw_date,
                                                      "%d/%m/%Y").date()

            edition = gazette.xpath(".//td[1]/text()")
            edition_number = edition.re_first(r"\d+")
            is_extra_edition = "suplemento" in edition.get().lower()

            gazette_item = Gazette(
                date=gazette_date,
                edition_number=edition_number,
                is_extra_edition=is_extra_edition,
                power="executive",
            )

            download_url = response.urljoin(
                gazette.xpath(".//td[6]/a/@href").get())
            yield scrapy.Request(
                download_url,
                method="HEAD",
                callback=self.parse_gazette_download_url,
                cb_kwargs={"item": gazette_item},
            )
Example No. 17
    def parse(self, response):
        texts = response.xpath(
            "//div[1]/div/div/div[1]/div/article/div[1]/ul/li").getall()
        texts = [
            self._clean_edition_text(edition_title) for edition_title in texts
        ]

        gazette_urls = response.xpath(
            "//div[1]/div/div/div[1]/div/article/div[1]/ul/li/a[1]/@href"
        ).getall()
        for gazette_url, text in zip(gazette_urls, texts):
            # the year may have 3 or 4 digits because of typos on the source page
            # (see the hypothetical sketch after this example)
            date = re.match(r"[0-9]{2}/[0-9]{2}/\s?[0-9]{3,4}", text).group()
            if len(date) < 10:
                date = self._handle_date_typos(
                    date, response.meta.get("current_year"))
            gazette_date = dateparser.parse(date, languages=["pt"]).date()
            file_urls = [gazette_url]
            is_extra_edition = any([
                word in text for word in [
                    "EXTRAORDINÁRIA",
                    "ESPECIAL",
                    "GABARITO",
                    "EXTRAORDINÁRIO",
                ]
            ])
            yield Gazette(
                date=gazette_date,
                file_urls=file_urls,
                is_extra_edition=is_extra_edition,
                power="executive",
            )
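The _handle_date_typos helper is not shown in this excerpt. A purely hypothetical sketch of one way such a helper could repair a short or space-padded year, assuming current_year carries the year of the page being scraped:

    def _handle_date_typos(self, date, current_year):
        # Hypothetical sketch, not the spider's actual helper: normalize stray
        # whitespace and, when the year lost a digit (e.g. "021"), fall back to
        # the year of the page currently being scraped.
        day, month, year = [part.strip() for part in date.split("/")]
        if len(year) < 4 and current_year:
            year = str(current_year)
        return f"{day}/{month}/{year}"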
Example No. 18
 def parse(self, response):
     """
     @url http://www.pontagrossa.pr.gov.br/diario-oficial/
     @returns requests 1
     """
     links = response.css(".view-content .field a")
     smallest_year = min(
         (p["date"].year for p in self.pdf_infos(links, self.starting_year)),
         default=0,
     )
     if smallest_year >= self.starting_year:
         next_page_url = response.urljoin(
             response.css(".pager-next a::attr(href)").extract_first()
         )
         yield scrapy.Request(next_page_url)
         for pdf_info in self.pdf_infos(links, self.starting_year):
             gazette_date = pdf_info["date"].strftime("%Y-%m-%d")
             yield Gazette(
                 date=gazette_date,
                 file_urls=[pdf_info["url"]],
                 is_extra_edition=pdf_info["is_extra_edition"],
                 territory_id=self.TERRITORY_ID,
                 power="executive_legislature",
                 scraped_at=datetime.utcnow(),
             )
Example No. 19
    def parse(self, response):
        iframe = response.css("iframe")
        if not iframe:
            # If the iframe is not present on the page, the response doesn't
            # contain a valid gazette
            return

        parts_script = response.xpath("//script[contains(., 'pdfjs-frame')]")
        if parts_script:
            file_urls = parts_script.re(r"\(\'src\', \'(.*)\'\);")
        else:
            query_src = urllib.parse.urlparse(
                iframe.css("::attr(src)").get()).query
            file_urls = urllib.parse.parse_qs(query_src).get("file", [])

        gazette_year = response.css(
            "#diario-select-year option[selected]::attr(value)").get()
        gazette_month = response.css(
            "#diario-select-month option[selected]::attr(value)").get()
        gazette_day = response.css(
            "#diario-select-day option[selected]::attr(value)").get()
        gazette_date = datetime.date(int(gazette_year), int(gazette_month),
                                     int(gazette_day))

        yield Gazette(
            date=gazette_date,
            file_urls=file_urls,
            is_extra_edition=False,
            power="executive_legislative",
        )
Example No. 20
 def parse(self, response):
     """
     @url http://www.guarulhos.sp.gov.br/diario-oficial/index.php?mes=1&ano=2018
     @returns items 17 17
     @scrapes date file_urls is_extra_edition municipality_id power scraped_at
     """
     diarios = response.xpath('//div[contains(@id, "diario")]')
     items = []
     for diario in diarios:
         date = diario.xpath('.//h3/text()').extract_first()
         date = parse(date[-10:], languages=['pt']).date()
         is_extra_edition = False
         links = diario.xpath('.//a[contains(@href, ".pdf")]').xpath(
             '@href')
         url = [response.urljoin(link) for link in links.extract()]
         power = 'executive'
         items.append(
             Gazette(
                 date=date,
                 file_urls=url,
                 is_extra_edition=is_extra_edition,
                 municipality_id=self.MUNICIPALITY_ID,
                 power=power,
                 scraped_at=dt.datetime.utcnow(),
             ))
     return items
Example No. 21
    def parse_gazette(self, response):
        """Parses list of documents to request each one for the date."""
        json_response = response.json()

        if not json_response:
            self.logger.warning(f"Document not found in {response.url}")
            return

        json_dir = json_response["dir"]

        date = re.search(self.DATE_REGEX, json_dir).group()
        date = dateparser.parse(date, settings={"DATE_ORDER": "DMY"}).date()
        is_extra_edition = self.EXTRA_EDITION_TEXT in json_dir
        path = json_dir.replace("/", "|")

        json_data = json_response["data"]
        file_urls = [
            self.PDF_URL.format(path,
                                url.split("/")[-1]) for url in json_data
        ]

        yield Gazette(
            date=date,
            file_urls=file_urls,
            is_extra_edition=is_extra_edition,
            power="executive_legislative",
        )
Example No. 22
 def parse_month_page(self, response):
     """
     @url http://www.campinas.sp.gov.br/diario-oficial/index.php?mes=1&ano=2018
     @returns items 23 23
     @scrapes date file_urls is_extra_edition municipality_id power scraped_at
     """
     items = []
     month_year = response.css(
         ".tabelaDiario:first-child tr th:nth-child(2)::text"
     ).extract_first()  # "janeiro 2018"
     links = response.css(".tabelaDiario:first-child tr td a")
     for link in links:
         url = link.css('::attr(href)').extract_first().replace('../', '')
         day = link.css('::text').extract_first()
         date = parse(f'{day} {month_year}', languages=['pt']).date()
         url = f'{self.sp_campinas_url}{url}'
         is_extra_edition = False
         power = 'executive_legislature'
         items.append(
             Gazette(
                 date=date,
                 file_urls=[url],
                 is_extra_edition=is_extra_edition,
                 municipality_id=self.MUNICIPALITY_ID,
                 power=power,
                 scraped_at=dt.datetime.utcnow(),
             ))
     return items
Example No. 23
 def create_gazette(self, date, url, is_extra_edition):
     return Gazette(
         date=date,
         file_urls=[url],
         is_extra_edition=is_extra_edition,
         power="executive",
     )
Example No. 24
    def parse_gazette(self, response):
        """
        @url https://gravatai.atende.net/?pg=diariooficial&pagina=1
        @returns items 1
        @scrapes date file_urls is_extra_edition territory_id power scraped_at
        """

        for element in response.css(".nova_listagem > .linha"):
            info = element.css(".info")

            is_extra_edition = (info.css(".tipo::text").extract_first()
                                in self.extra_editions_options)

            date = parse(info.css(".data::text").extract_first(),
                         languages=["pt"]).date()

            code = element.css(
                ".opcoes > button::attr(data-codigo)").extract_first()
            url = ("https://gravatai.atende.net/atende.php?rot=54002&aca=737"
                   f"&processo=download&codigo={code}")

            yield Gazette(
                date=date,
                file_urls=[url],
                is_extra_edition=is_extra_edition,
                territory_id=self.TERRITORY_ID,
                power="executive",
                scraped_at=datetime.utcnow(),
            )
Example No. 25
    def parse_month_page(self, response):
        """
        @url http://www2.portoalegre.rs.gov.br/dopa/default.php?p_secao=1431
        @returns items 58 58
        @scrapes date file_urls is_extra_edition territory_id power scraped_at
        """
        links = response.css('#conteudo a')
        items = []
        for link in links:
            url = link.css('::attr(href)').extract_first()
            if url[-4:] != '.pdf':
                continue

            url = response.urljoin(url)
            power = 'executive' if 'executivo' in url.lower() else 'legislature'
            date = link.css('::text').extract_first()
            is_extra_edition = 'extra' in date.lower()
            date = parse(date.split('-')[0], languages=['pt']).date()
            items.append(
                Gazette(
                    date=date,
                    file_urls=[url],
                    is_extra_edition=is_extra_edition,
                    territory_id=self.TERRITORY_ID,
                    power=power,
                    scraped_at=dt.datetime.utcnow(),
                ))
        return items
Example No. 26
    def parse_items(self, response):
        body = response.body

        if self.is_body_empty(body):
            return

        definition, rows = self.parse_definitions_and_rows(body)

        for row in rows:
            item = dict(zip(definition, row))

            date_values = item["DTPUBLICACAO"]
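            # months in DTPUBLICACAO appear to be zero-based, hence the "+ 1" below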
            item_date = date(date_values[0], date_values[1] + 1,
                             date_values[2])

            url = "https://www.valadares.mg.gov.br/abrir_arquivo.aspx?cdLocal=12&arquivo={}{}".format(
                item["NMARQUIVO"], item["NMEXTENSAOARQUIVO"])
            yield Gazette(
                date=item_date,
                file_urls=[url],
                is_extra_edition=False,
                territory_id=self.TERRITORY_ID,
                power="executive",
                scraped_at=datetime.utcnow(),
            )

        self.current_page += 1
        yield self.make_request(self.current_page)
Example No. 27
 def parse_month_page(self, response):
     """
     @url http://www.campinas.sp.gov.br/diario-oficial/index.php?mes=1&ano=2018
     @returns items 23 23
     @scrapes date file_urls is_extra_edition territory_id power scraped_at
     """
     items = []
     month_year = response.css(
         ".tabelaDiario:first-child tr th:nth-child(2)::text"
     ).extract_first()  # "janeiro 2018"
     links = response.css(".tabelaDiario:first-child tr td a")
     for link in links:
         url = link.css("::attr(href)").extract_first().replace("../", "")
         day = link.css("::text").extract_first()
         date = parse(f"{day} {month_year}", languages=["pt"]).date()
         url = f"{self.sp_campinas_url}{url}"
         is_extra_edition = False
         power = "executive_legislature"
         items.append(
             Gazette(
                 date=date,
                 file_urls=[url],
                 is_extra_edition=is_extra_edition,
                 territory_id=self.TERRITORY_ID,
                 power=power,
                 scraped_at=dt.datetime.utcnow(),
             ))
     return items
Example No. 28
    def parse_page(self, response):
        """Parses list of gazettes.

        Extra editions can have their items built right away; regular editions need
        an extra request (a hypothetical sketch of that follow-up callback appears
        after this example).
        """
        for idx, row in enumerate(response.css(".grid_Row")):
            pdf_date = row.css("td:nth-child(2) span ::text").extract_first()
            gazette_id = row.css(
                "td:nth-child(3) a ::attr(data-teste)").extract_first()
            parsed_date = parse(f"{pdf_date}", languages=["pt"]).date()
            eventtarget = row.css("td:nth-child(3) a ::attr(href)").re_first(
                "'(.*lnkVisualizar)'")
            if gazette_id == "0":
                yield scrapy.FormRequest.from_response(
                    response,
                    formdata={"__EVENTTARGET": eventtarget},
                    callback=self.parse_regular_edition,
                    meta={"parsed_date": parsed_date},
                )
            else:
                yield Gazette(
                    date=parsed_date,
                    file_urls=[
                        f"https://legisladocexterno.curitiba.pr.gov.br/DiarioSuplementoConsultaExterna_Download.aspx?Id={gazette_id}"
                    ],
                    is_extra_edition=True,
                    power="executive_legislative",
                )
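The parse_regular_edition callback is not part of this excerpt. A rough, hypothetical sketch of what such a follow-up callback could do, reusing the date passed via meta and assuming the postback response exposes the document location (the iframe selector here is an assumption, not the spider's actual one):

    def parse_regular_edition(self, response):
        # Hypothetical sketch, not the spider's actual callback: the date travels
        # in response.meta, and the document location is assumed to be exposed by
        # the postback response (falling back to the response URL itself).
        file_url = response.css("iframe::attr(src)").get() or response.url
        yield Gazette(
            date=response.meta["parsed_date"],
            file_urls=[response.urljoin(file_url)],
            is_extra_edition=False,
            power="executive_legislative",
        )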
Example No. 29
    def parse(self, response):
        lines = response.xpath('//table[contains(@class, "adminlist")]/tr')

        urls = [
            response.urljoin(relative_url)
            for relative_url in lines.xpath("td[1]/a/@href").extract()
        ]
        is_extra_edition = [
            "Extra" in text
            for text in lines.xpath("td[1]/a/text()").extract()
        ]
        dates = [
            parse(date, languages=["pt"]).date()
            for date in lines.xpath("td[2]/text()").extract()
        ]

        for url, is_extra, date in zip(urls, is_extra_edition, dates):
            yield Gazette(
                date=date,
                file_urls=[url],
                is_extra_edition=is_extra,
                power="executive_legislative",
            )

        for page in range(2, len(response.css(".button.othersOptPage")) + 1):
            yield FormRequest(response.url,
                              formdata={"hpage": str(page)},
                              callback=self.parse)
Example No. 30
    def parse(self, response):
        """
        @url https://sistemas.canoas.rs.gov.br/domc/api/public/diary-by-day?day=08/06/2018  # noqa
        @returns items 3 3
        @scrapes date file_urls is_extra_edition territory_id power scraped_at
        """
        data = json.loads(response.text)
        items = []

        # "editions" is empty when there were no gazettes in the date
        for edition in data.get("editions", []):
            file_url = f"{self.BASE_URL}/edition-file/{edition['id']}"
            is_extra_edition = edition["type"] == "C"

            items.append(
                Gazette(
                    date=data["day"],
                    file_urls=[file_url],
                    is_extra_edition=is_extra_edition,
                    territory_id=self.TERRITORY_ID,
                    power="executive",
                    scraped_at=datetime.utcnow(),
                )
            )
        return items