def parse_page(self, response):
        """Yield one ``CityHallBidItem`` for every bid found in ``response``.

        Modalities, descriptions, bid histories and session dates are read from
        separate parts of the page and combined positionally. The public agency
        and the month/year are taken from the ``cat`` and ``dt`` query
        parameters of ``response.url``.

        Raises:
            ValueError: if the page lists bids but ``response.url`` does not
                have the expected ``licitacoes_pm.asp?cat=...&dt=...`` shape.
        """
        raw_modalities = response.xpath("//tr/td[1]/table/tr/td/text()").extract()
        raw_descriptions = response.xpath(
            "//table/tr[2]/td/table/tr[6]/td/table/tr/td[2]/table[1]"
        )
        raw_bids_history = response.xpath(
            "//table/tr[2]/td/table/tr[6]/td/table/tr/td[2]/table[2]"
        )
        raw_date = response.xpath("//tr/td[3]/table/tr/td/text()").extract()
        descriptions = self._parse_descriptions(raw_descriptions)
        bids_history = self._parse_bids_history(raw_bids_history)
        modalities = self._parse_modalities(raw_modalities)
        dates = self._parse_date(raw_date)

        # zip() stops at the shortest input, so only bids for which all four
        # parts were parsed are emitted.
        bids = list(zip(modalities, descriptions, bids_history, dates))
        if not bids:
            # Nothing to emit; the URL is deliberately not validated here so
            # pages without bids keep yielding nothing instead of failing.
            return

        # The URL is the same for every bid on the page, so parse it once.
        # ``[?&]`` is used because ``|`` inside a character class is a literal
        # pipe, not an alternation.
        url_pattern = re.compile(r"licitacoes_pm\.asp[?&]cat=(\w+)&dt=(\d+-\d+)")
        match = url_pattern.search(response.url)
        if match is None:
            raise ValueError(
                "Could not extract 'cat' and 'dt' from bids URL: {!r}".format(
                    response.url
                )
            )
        public_agency = match.group(1).upper()
        month, year = match.group(2).split("-")
        month, year = int(month), int(year)

        for modality_and_code, (description, document_url), history, session in bids:
            item = CityHallBidItem(
                crawled_at=datetime_utcnow_aware(),
                crawled_from=response.url,
                public_agency=public_agency,
                month=month,
                year=year,
                description=description,
                history=history,
                codes=modality_and_code["codes"],
                modality=modality_and_code["modality"],
                session_at=from_str_to_datetime(session),
            )
            if document_url:
                item["files"] = [response.urljoin(document_url)]
            yield item
    def _parse_bids_history(self, raw_bids_history):
        all_bids_history = []
        for raw_bid_history in raw_bids_history:
            bids_history = []
            for row in raw_bid_history.xpath(".//tr"):
                date = row.xpath(".//td[2]/text()").get().strip()
                date = from_str_to_datetime(date)
                event = row.xpath(".//td[3]/div/text()").get()
                url = row.xpath(".//td[4]/div/a//@href").get()

                if event and date:
                    url = url if url else ""
                    bids_history.append(
                        {"published_at": date, "event": event.capitalize(), "url": url}
                    )
            all_bids_history.append(bids_history)

        return all_bids_history
Example #3
0
def test_possible_datetime(datetime_str, expected_obj):
    """``from_str_to_datetime`` called without explicit formats yields ``expected_obj``."""
    parsed = from_str_to_datetime(datetime_str)
    assert parsed == expected_obj
Example #4
0
def test_possible_date_formats(datetime_str, expected_obj):
    """``from_str_to_datetime`` given four- and two-digit-year formats yields ``expected_obj``."""
    accepted_formats = [
        "%d/%m/%Y",
        "%d/%m/%y",
    ]
    parsed = from_str_to_datetime(datetime_str, accepted_formats)

    assert parsed == expected_obj
Example #5
0
def test_dates_older_than_city_creation(datetime_str, expected_obj):
    """Compare ``from_str_to_datetime`` output with ``expected_obj`` for the given date string.

    The parser is called with both the four-digit-year and the two-digit-year
    day/month/year formats.
    """
    accepted_formats = [
        "%d/%m/%Y",
        "%d/%m/%y",
    ]
    parsed = from_str_to_datetime(datetime_str, accepted_formats)

    assert parsed == expected_obj