Example #1
    def parse_list(self, response):
        data = json.loads(response.text)
        release_type = response.request.meta['release_type']

        if 'links' in data and isinstance(data['links'], dict) and 'next' in data['links'] and not self.sample:
            yield self.build_request(
                data['links']['next'],
                formatter=parameters('event', 'startRow'),
                meta={'release_type': release_type},
                callback=self.parse_list
            )

        for release in data['releases']:
            if release_type == 'planning':
                uuid = release['tender']['plannedProcurementUUID']
                yield self.build_request(
                    'https://tenders.nsw.gov.au/?event=public.api.planning.view&PlannedProcurementUUID=' + uuid,
                    formatter=parameters('event', 'PlannedProcurementUUID')
                )
            elif release_type == 'tender':
                uuid = release['tender']['RFTUUID']
                yield self.build_request(
                    'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=' + uuid,
                    formatter=parameters('event', 'RFTUUID')
                )
            elif release_type == 'contract':
                for award in release['awards']:
                    uuid = award['CNUUID']
                    yield self.build_request(
                        'https://tenders.nsw.gov.au/?event=public.api.contract.view&CNUUID=' + uuid,
                        formatter=parameters('event', 'CNUUID')
                    )
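
In these spiders, the formatter passed to build_request only determines the file name under which the response is stored; parameters() names the file after query-string values. A minimal sketch of that idea, assuming the values are simply joined with hyphens (the function below is illustrative, not the actual kingfisher_scrapy.util.parameters):

from urllib.parse import parse_qs, urlsplit


def parameters(*keys):
    # Name a file after the given query-string values (illustrative sketch).
    def formatter(url):
        query = parse_qs(urlsplit(url).query)
        return '-'.join(query[key][0] for key in keys)
    return formatter


url = 'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=abc-123'
print(parameters('event', 'RFTUUID')(url))  # public.api.tender.view-abc-123
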
Example #2
class Uganda(IndexSpider):
    """
    Domain
      Government Procurement Portal (GPP) of Public Procurement and Disposal of Public Assets Authority (PPDA)
    API documentation
      https://docs.google.com/spreadsheets/d/10tVioy-VOQa1FwWoRl5e1pMbGpiymA0iycNcoDFkvks/edit#gid=365266172
    """
    name = 'uganda_releases'
    data_type = 'release_package'
    total_pages_pointer = '/data/last_page'
    yield_list_results = False
    formatter = staticmethod(parameters('page'))
    base_url = 'https://gpp.ppda.go.ug/adminapi/public/api/pdes'

    download_delay = 0.9

    def start_requests(self):
        yield scrapy.Request('https://gpp.ppda.go.ug/adminapi/public/api/pdes',
                             meta={'file_name': 'page-1.json'},
                             callback=self.parse_list,
                             cb_kwargs={'callback': self.parse_data})

    @handle_http_error
    def parse_data(self, response):
        pattern = 'https://gpp.ppda.go.ug/adminapi/public/api/open-data/v1/releases/{}?fy={}&pde={}'

        data = json.loads(response.text)
        for pdes in data['data']['data']:
            for plans in pdes['procurement_plans']:
                for tag in ('planning', 'tender', 'award', 'contract'):
                    yield self.build_request(
                        pattern.format(tag, plans['financial_year'],
                                       plans['pde_id']),
                        formatter=join(components(-1), parameters('fy',
                                                                  'pde')))
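
IndexSpider reads the value at total_pages_pointer from the first list response and queues one request per remaining page. A rough sketch of that mechanism, assuming the pointer is resolved as a plain JSON Pointer and that pages are numbered from 1 (resolve_pointer below is an illustrative helper, not part of the spider):

import json


def resolve_pointer(data, pointer):
    # Walk a simple JSON Pointer such as '/data/last_page' (illustrative; no escaping support).
    for token in pointer.lstrip('/').split('/'):
        data = data[token]
    return data


first_page = json.loads('{"data": {"last_page": 3}}')
last_page = resolve_pointer(first_page, '/data/last_page')
urls = [f'https://gpp.ppda.go.ug/adminapi/public/api/pdes?page={page}'
        for page in range(2, last_page + 1)]
print(urls)  # page 1 was already fetched by start_requests; pages 2 and 3 remain
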
Example #3
 def start_requests(self):
     pattern = 'https://tenders.nsw.gov.au/?event=public.api.{}.search&ResultsPerPage=1000'
     for release_type in ('planning', 'tender', 'contract'):
         yield self.build_request(pattern.format(release_type),
                                  formatter=parameters('event'),
                                  meta={'release_type': release_type},
                                  callback=self.parse_list)
Example #4
class MexicoQuienEsQuien(IndexSpider):
    """
    Domain
      QuiénEsQuién.Wiki
    API documentation
      https://quienesquienapi.readthedocs.io/es/latest/
    Swagger API documentation
      https://api.quienesquien.wiki/v2/docs/
    """
    name = 'mexico_quien_es_quien'
    download_delay = 0.9
    count_pointer = '/data/0/collections/contracts/count'
    limit = 1000
    base_url = 'https://api.quienesquien.wiki/v2/contracts'
    formatter = staticmethod(parameters('offset'))
    data_type = 'record_package_list'

    def start_requests(self):
        yield scrapy.Request('https://api.quienesquien.wiki/v2/sources',
                             meta={'file_name': 'list.json'},
                             callback=self.parse_list)

    @handle_http_error
    def parse(self, response):
        data = json.loads(response.text)
        yield self.build_file_from_response(response,
                                            data=json.dumps(
                                                data['data']).encode(),
                                            data_type=self.data_type)
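
With count_pointer and limit set, IndexSpider paginates by offset rather than page number: it reads the total record count from the first response and requests one window of limit records at a time. The arithmetic, sketched with illustrative values:

# Offset pagination: one request per window of `limit` records (illustrative values).
count = 2500   # as read from /data/0/collections/contracts/count
limit = 1000

urls = [f'https://api.quienesquien.wiki/v2/contracts?limit={limit}&offset={offset}'
        for offset in range(0, count, limit)]
print(urls)  # offsets 0, 1000 and 2000
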
Example #5
class MexicoAdministracionPublicaFederal(IndexSpider):
    """
    Domain
      Administración Pública Federal (APF)
    Bulk download documentation
      https://datos.gob.mx/busca/dataset/concentrado-de-contrataciones-abiertas-de-la-apf
    """
    name = 'mexico_administracion_publica_federal'

    # BaseSpider
    root_path = 'results.item'

    # SimpleSpider
    data_type = 'record_package'

    # IndexSpider
    count_pointer = '/pagination/total'
    limit = '/pagination/pageSize'
    use_page = True
    formatter = staticmethod(parameters('page'))

    def start_requests(self):
        url = 'https://api.datos.gob.mx/v1/contratacionesabiertas'
        yield scrapy.Request(url,
                             meta={'file_name': 'page-1.json'},
                             callback=self.parse_list)
Example #6
class KenyaMakueni(IndexSpider):
    """
    Domain
      Makueni County
    Swagger API documentation
      https://opencontracting.makueni.go.ke/swagger-ui.html#/ocds-controller
    """
    name = 'kenya_makueni'
    data_type = 'release_package_list'
    limit = 10
    additional_params = {'pageSize': limit}
    yield_list_results = False
    param_page = 'pageNumber'
    formatter = staticmethod(parameters('pageNumber'))

    base_url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={limit}&pageNumber={page}'

    def start_requests(self):
        yield scrapy.Request(
            'https://opencontracting.makueni.go.ke/api/ocds/release/count',
            meta={'file_name': 'count.json'},
            callback=self.parse_list
        )

    def range_generator(self, data, response):
        return range(ceil(int(response.text) / self.limit))

    def url_builder(self, value, data, response):
        return self.pages_url_builder(value, data, response)
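
Here the count endpoint returns a bare number rather than JSON, so range_generator parses response.text directly and divides by the page size. The same arithmetic with an illustrative count:

from math import ceil

response_text = '1234'  # the count endpoint returns a plain number
limit = 10
pages = range(ceil(int(response_text) / limit))
print(len(pages), list(pages)[:3])  # 124 [0, 1, 2]
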
Example #7
 def parse_redirect(self, response):
     if response.status == 301:
         url = response.headers['Location'].decode('utf-8').replace(
             'open?', 'uc?export=download&')
         yield self.build_request(url, formatter=parameters('id'))
     else:
         yield self.build_file_error_from_response(response)
Example #8
    def parse(self, response):
        data = json.loads(response.text)
        pattern = 'https://datos.hacienda.gov.py:443/odmh-api-v1/rest/api/v1/ocds/release-package/{}'

        # If it is the first URL, we need to iterate over all the pages to get all the process IDs to query.
        if response.request.meta['first']:
            total = data['meta']['totalPages']
            for page in range(2, total + 1):
                yield self.build_request(self.base_list_url.format(page),
                                         formatter=parameters('page'),
                                         meta={
                                             'meta': True,
                                             'first': False,
                                         },
                                         dont_filter=True)

        # If it is a meta request, it is one of the pages that list the process IDs to query.
        if response.request.meta['meta']:
            # Now that we have the IDs, we iterate over them without duplicates and make the
            # final requests, this time for the release packages.
            for row in data['results']:
                if row['idLlamado'] and row['idLlamado'] not in self.release_ids:
                    self.release_ids.append(row['idLlamado'])
                    yield self.build_request(pattern.format(row['idLlamado']),
                                             formatter=components(-1),
                                             meta={
                                                 'meta': False,
                                                 'first': False,
                                             },
                                             dont_filter=True)
        else:
            yield self.build_file_from_response(response,
                                                data_type=self.data_type)
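
The spider deduplicates process IDs with a list, so each membership test is linear in the number of IDs seen so far. A set gives the same behaviour with constant-time lookups; a sketch of that variant, assuming release_ids is only used for deduplication:

# Deduplicating process IDs with a set instead of a list (sketch).
release_ids = set()
rows = [{'idLlamado': 1}, {'idLlamado': 1}, {'idLlamado': 2}, {'idLlamado': None}]

for row in rows:
    id_llamado = row['idLlamado']
    if id_llamado and id_llamado not in release_ids:
        release_ids.add(id_llamado)
        print(f'would request the release package for {id_llamado}')  # 1, then 2
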
Example #9
class DominicanRepublicPortal(LinksSpider):
    """
    Domain
      Dirección General de Contrataciones Públicas (DGCP)
    Spider arguments
      from_date
        Download only data from this date onward (YYYY-MM-DD format).
        If ``until_date`` is provided, defaults to '2018-01-01'.
      until_date
        Download only data until this date (YYYY-MM-DD format).
        If ``from_date`` is provided, defaults to today.
    API documentation
      http://148.101.176.123:48080/ocdsdr/docs
    """
    name = 'dominican_republic_api'

    # BaseSpider
    default_from_date = '2018-01-01'

    # SimpleSpider
    data_type = 'release_package'

    # LinksSpider
    next_page_formatter = staticmethod(parameters('page'))

    def start_requests(self):
        url = 'http://148.101.176.123:48080/ocdsdr/api/v1/releases'
        if self.from_date and self.until_date:
            url = f"{url}/byDatesBetween/{self.from_date.strftime('%Y-%m-%d')}/{self.until_date.strftime('%Y-%m-%d')}"
        yield scrapy.Request(url, meta={'file_name': 'page-1.json'})
Example #10
class Colombia(LinksSpider):
    """
    Domain
      Colombia Compra Eficiente (CCE)
    Spider arguments
      from_date
        Download only data from this date onward (YYYY-MM-DD format).
        If ``until_date`` is provided, defaults to '2011-01-01'.
      until_date
        Download only data until this date (YYYY-MM-DD format).
        If ``from_date`` is provided, defaults to today.
      start_page
        The page number from which to start crawling.
    API documentation
      https://www.colombiacompra.gov.co/transparencia/api
    Swagger API documentation
      https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/
    """
    name = 'colombia'
    next_page_formatter = staticmethod(parameters('_id'))
    default_from_date = '2011-01-01'
    data_type = 'release_package'

    def start_requests(self):
        base_url = 'https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/rest/releases'
        if self.from_date and self.until_date:
            from_date = self.from_date.strftime(self.date_format)
            until_date = self.until_date.strftime(self.date_format)
            base_url += f'/dates/{from_date}/{until_date}'

        base_url += '?page={}'

        start_page = 1
        if hasattr(self, 'start_page'):
            start_page = int(self.start_page)
        yield self.build_request(base_url.format(start_page), formatter=parameters('page'))

    def retry(self, response, reason):
        url = response.request.url
        self.logger.info(reason.format(url=url, status=response.status))
        time.sleep(120 * 60)
        yield scrapy.Request(url, dont_filter=True, meta=response.request.meta)

    def parse(self, response):
        # Every day at a certain hour, Colombia runs a process that drops the database and makes the services
        # unavailable for about 120 minutes. Since Colombia has a lot of data, the spider takes more than one day
        # to scrape it all, so it will eventually hit this downtime. When the problem occurs (a 503 status or
        # invalid JSON), we wait 120 minutes and then continue.
        try:
            if self.is_http_success(response):
                yield self.build_file_from_response(response, data_type=self.data_type)
                yield self.next_link(response)
            elif response.status == 503:
                yield from self.retry(response, 'Sleeping due to HTTP error {status} from {url}')
            else:
                yield self.build_file_error_from_response(response)
        except JSONDecodeError:
            yield from self.retry(response, 'Sleeping due to JSONDecodeError from {url}')
Example #11
    def parse_list(self, response):
        yield from self.parse(response)

        if not self.sample:
            data = json.loads(response.text)
            total = data['maxPage']
            for page in range(2, total + 1):
                url = replace_parameter(response.request.url, 'page', page)
                yield self.build_request(url, formatter=parameters('page'))
Example #12
class PortugalBase(LinksSpider):
    # BaseSpider
    default_from_date = '2010-01-01'

    # LinksSpider
    next_page_formatter = staticmethod(parameters('offset'))

    # We will wait 1, 2, 4, 8, 16 minutes (31 minutes total).
    max_retries = 5
    initial_wait_time = 60

    def start_requests(self):
        url = self.url
        if self.from_date and self.until_date:
            url = f'{url}&contractStartDate={self.from_date}&contractEndDate={self.until_date}'

        yield scrapy.Request(url, meta={'file_name': 'offset-1.json'})

    # https://github.com/scrapy/scrapy/blob/master/scrapy/downloadermiddlewares/retry.py
    def parse(self, response):
        retries = response.request.meta.get('retries', 0) + 1
        wait_time = response.request.meta.get('wait_time',
                                              self.initial_wait_time // 2) * 2

        # Every ~36,000 requests, the API returns HTTP errors. After a few minutes, it starts working again.
        # The number of failed attempts in the log messages includes the original request.
        # https://github.com/open-contracting/kingfisher-collect/issues/545#issuecomment-762768460
        if self.is_http_success(response):
            yield from super().parse(response)
        elif retries <= self.max_retries:
            request = response.request.copy()
            request.meta['retries'] = retries
            request.meta['wait_time'] = wait_time
            request.dont_filter = True

            self.logger.debug(
                'Retrying %(request)s in %(wait_time)ds (failed %(failures)d times): HTTP %(status)d',
                {
                    'request': response.request,
                    'failures': retries,
                    'status': response.status,
                    'wait_time': wait_time
                },
                extra={'spider': self})

            yield request
        else:
            self.logger.error(
                'Gave up retrying %(request)s (failed %(failures)d times): HTTP %(status)d',
                {
                    'request': response.request,
                    'failures': retries,
                    'status': response.status
                },
                extra={'spider': self})

            yield self.build_file_error_from_response(response)
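
The doubling in parse reproduces the waits announced in the class comment: starting from initial_wait_time // 2 and doubling on each attempt yields 1, 2, 4, 8 and 16 minutes, or 31 minutes in total. A quick check of that arithmetic:

initial_wait_time = 60  # seconds
max_retries = 5

wait_time = initial_wait_time // 2
waits = []
for _ in range(max_retries):
    wait_time *= 2  # the same doubling as in parse()
    waits.append(wait_time)

print([w // 60 for w in waits], sum(waits) // 60)  # [1, 2, 4, 8, 16] 31
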
Example #13
 def start_requests(self):
     if self.sample:
         url = self.url.format(step=self.step, page=0)
         yield self.build_request(url, formatter=parameters('pageNumber'))
     else:
         yield scrapy.Request(
             'https://opencontracting.makueni.go.ke/api/ocds/release/count',
             meta={'file_name': 'count.json'},
             callback=self.parse_count)
Example #14
 def request_range(self, start_date, end_date, search_h):
     return self.build_request(
         self.base_page_url.format(start_date, end_date),
         formatter=parameters('releasedate__gte', 'releasedate__lte'),
         meta={
             'release_date': start_date,
             'search_h': search_h,
         },
         headers={'Accept': '*/*', 'Content-Type': 'application/json'}
     )
Example #15
    def parse_list(self, response):
        yield from self.parse(response)

        if not self.sample:
            data = json.loads(response.text)
            offset = data['meta']['pagination']['limit']
            total = data['meta']['count']
            for offset in range(offset, total, self.step):
                url = replace_parameter(response.request.url, 'offset', offset)
                yield self.build_request(url, formatter=parameters('offset'))
Example #16
    def parse_list(self, response):
        pattern = 'https://api.quienesquien.wiki/v2/contracts?limit={limit}&offset={offset}'
        limit = 1000

        count = json.loads(response.text)['data'][0]['collections']['contracts']['count']
        for offset in range(ceil(count / limit)):
            url = pattern.format(limit=limit, offset=offset * limit)
            yield self.build_request(url, formatter=parameters('offset'))
            if self.sample:
                break
Example #17
    def start_requests(self):
        url = f'{self.base_url}/search/processes?tipo_fecha=fecha_release&' \
              f'fecha_desde={self.from_date.strftime(self.date_format)}-04:00&' \
              f'fecha_hasta={self.until_date.strftime(self.date_format)}-04:00'

        yield self.build_request(
            url,
            formatter=parameters('fecha_desde'),
            # Send duplicate requests when the token expires and when continuing from the saved last_request.
            dont_filter=True,
            callback=self.parse_pages)
Example #18
    def parse_list(self, response):
        base_url = 'http://public.eprocurement.systems/ocds/tenders/'
        data = json.loads(response.text)
        # The last page returns an empty JSON object.
        if not data:
            return
        for item in data['data']:
            yield self.build_request(base_url + item['ocid'], formatter=components(-1))

        url = replace_parameters(response.request.url, offset=data['offset'])
        yield self.build_request(url, formatter=parameters('offset'), callback=self.parse_list)
Example #19
    def parse_data(self, response):
        pattern = 'https://gpp.ppda.go.ug/adminapi/public/api/open-data/v1/releases/{}?fy={}&pde={}'

        data = response.json()
        for pdes in data['data']['data']:
            for plans in pdes['procurement_plans']:
                for tag in ('planning', 'tender', 'award', 'contract'):
                    yield self.build_request(
                        pattern.format(tag, plans['financial_year'], plans['pde_id']),
                        formatter=join(components(-1), parameters('fy', 'pde'))
                    )
Example #20
    def parse_list(self, response):
        data = response.json()
        for item in data['data']:
            for resource in item['resources']:
                description = resource['description']
                if description and 'ocds' in description.lower():
                    yield self.build_request(resource['url'], formatter=components(-2))

        next_page = data.get('next_page')
        if next_page:
            yield self.build_request(next_page, formatter=parameters('page'), callback=self.parse_list)
Example #21
    def parse_list(self, response):
        yield from self.parse(response)

        if not self.sample:
            data = json.loads(response.text)
            page = data['pagination']['page']
            total = data['pagination']['total']
            limit = data['pagination']['pageSize']
            for page in range(page + 1, ceil(total / limit)):
                url = replace_parameter(response.request.url, 'page', page)
                yield self.build_request(url, formatter=parameters('page'))
Example #22
class Kyrgyzstan(LinksSpider):
    """
    Domain
      Ministry of Finance
    """
    name = 'kyrgyzstan'
    data_type = 'release_package'
    next_page_formatter = staticmethod(parameters('offset'))

    def start_requests(self):
        yield scrapy.Request('http://ocds.zakupki.gov.kg/api/tendering',
                             meta={'file_name': 'offset-0.json'})
Example #23
    def parse_list(self, response):
        data = response.json()
        # The last page returns an empty JSON object.
        if not data:
            return

        for item in data['data']:
            url = replace_parameters(response.request.url, offset=None) + item['ocid']
            yield self.build_request(url, formatter=components(-2))

        url = replace_parameters(response.request.url, offset=data['offset'])
        yield self.build_request(url, formatter=join(components(-1), parameters('offset')), callback=self.parse_list)
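
join(...) combines several formatters into one file name, so this example names files after both the last path component and the offset parameter. A compact sketch of how that combination might behave, using the same hyphen-joining assumption as the parameters sketch after Example #1 (all three helpers below are illustrative, not the actual kingfisher_scrapy.util implementations):

from urllib.parse import parse_qs, urlsplit


def components(index):
    # Name a file after a path component, counted from the end (illustrative sketch).
    return lambda url: urlsplit(url).path.rstrip('/').split('/')[index]


def parameters(*keys):
    # Name a file after query-string values, as in the sketch after Example #1.
    return lambda url: '-'.join(parse_qs(urlsplit(url).query)[key][0] for key in keys)


def join(*formatters):
    # Combine several formatters into one hyphen-separated file name (illustrative sketch).
    return lambda url: '-'.join(formatter(url) for formatter in formatters)


url = 'http://public.eprocurement.systems/ocds/tenders/?offset=100'
print(join(components(-1), parameters('offset'))(url))  # tenders-100
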
Example #24
    def start_requests(self):
        base_url = 'https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/rest/releases'
        if self.from_date and self.until_date:
            from_date = self.from_date.strftime(self.date_format)
            until_date = self.until_date.strftime(self.date_format)
            base_url += f'/dates/{from_date}/{until_date}'

        base_url += '?page={}'

        start_page = 1
        if hasattr(self, 'start_page'):
            start_page = int(self.start_page)
        yield self.build_request(base_url.format(start_page), formatter=parameters('page'))
Example #25
 def start_requests(self):
     # Paraguay Hacienda has a service that returns all the IDs we need to get the release packages,
     # so we first iterate over this paginated list.
     yield self.build_request(
         self.base_list_url.format(1),
         formatter=parameters('page'),
         meta={
             'meta': True,
             'first': True,
         },
         # Send duplicate requests when the token expires and when continuing from the saved last_request.
         dont_filter=True,
     )
Example #26
    def parse_list(self, response):
        pattern = 'https://gpp.ppda.go.ug/adminapi/public/api/pdes?page={}'

        if self.sample:
            total = 1
        else:
            data = json.loads(response.text)
            total = data['data']['last_page']

        for page in range(2, total + 1):
            yield self.build_request(pattern.format(page),
                                     formatter=parameters('page'),
                                     callback=self.parse_data)
Example #27
class PortugalBase(LinksSpider):
    default_from_date = '2010-01-01'
    next_page_formatter = staticmethod(parameters('offset'))
    # The API returns a 429 error after a certain number of requests
    download_delay = 1
    # The API sometimes returns a 503 error
    custom_settings = {'RETRY_TIMES': 10}

    def start_requests(self):
        url = self.url
        if self.from_date and self.until_date:
            url = f'{url}&contractStartDate={self.from_date}&contractEndDate={self.until_date}'
        yield scrapy.Request(url, meta={'file_name': 'offset-1.json'})
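
This older variant leans on Scrapy's built-in RetryMiddleware instead of hand-rolled backoff: download_delay spaces requests out to avoid the 429s, and RETRY_TIMES raises the number of automatic retries for the intermittent 503s. A minimal, illustrative sketch of the same configuration on a plain Scrapy spider (the spider name and URL are placeholders, not the actual spider):

import scrapy


class RateLimitedSpider(scrapy.Spider):
    name = 'rate_limited_example'  # hypothetical name
    download_delay = 1  # seconds between requests, to stay under the rate limit
    custom_settings = {
        'RETRY_TIMES': 10,               # retries handled by Scrapy's RetryMiddleware
        'RETRY_HTTP_CODES': [429, 503],  # which statuses to retry (replaces Scrapy's default list)
    }

    def start_requests(self):
        yield scrapy.Request('https://example.com/api?offset=1')

    def parse(self, response):
        self.logger.info('Fetched %s', response.url)
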
Example #28
class Armenia(LinksSpider):
    """
    Spider arguments
      sample
        Download only the first release package in the dataset.
    """
    name = 'armenia'
    data_type = 'release_package'
    next_pointer = '/next_page/uri'
    next_page_formatter = staticmethod(parameters('offset'))

    def start_requests(self):
        url = 'https://armeps.am/ocds/release'
        yield scrapy.Request(url, meta={'file_name': 'offset-0.json'})
Example #29
 def parse_list(self, response):
     data = response.json()
     for item in data['data']:
         url = item['uri']
         if url:
             yield self.build_request(url, self.get_formatter())

     next_page_url = data.get('next_page_url')
     if next_page_url:
         yield self.build_request(next_page_url,
                                  formatter=join(self.get_formatter(), parameters('page')),
                                  callback=self.parse_list)
Example #30
class GeorgiaReleases(LinksSpider):
    """
    Domain
      State Procurement Agency (SPA)
    Swagger API documentation
      https://odapi.spa.ge/api/swagger.ui
    """
    name = 'georgia_releases'
    data_type = 'release_package'
    next_page_formatter = staticmethod(parameters('page'))

    def start_requests(self):
        url = 'https://odapi.spa.ge/api/releases.json'
        yield scrapy.Request(url, meta={'file_name': 'page-1.json'})