Example 1
    def gather_all_download_urls(self):
        url = 'http://datos.gob.mx/busca/api/3/action/'
        url += 'package_search?q=organization:inai&rows=500'
        r = util.get_url_request(url)
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        data = r.json()
        out = []
        for result in data['result']['results']:
            for resource in result['resources']:
                if resource['format'] == 'JSON':
                    # Send a HEAD request and turn the redirect target into a
                    # direct-download URL ("open?" -> "uc?export=download&").
                    temp = resource['url'].split("//")[1]
                    conn = http.client.HTTPConnection(temp.split("/")[0])
                    name = temp.split("/")[1]
                    conn.request('HEAD', "/" + name)
                    response = conn.getresponse()
                    url = response.getheader('Location').replace(
                        "open?", "uc?export=download&")
                    out.append({
                        'url': url,
                        'filename': '{}.json'.format(name),
                        'data_type': 'release_package_list',
                        'encoding': 'utf-8-sig',  # ignore BOM
                    })
                    if self.sample:
                        return out
        return out
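
These examples all call a shared util.get_url_request helper and then check the second element of the result for errors rather than catching exceptions. The helper itself is never shown in these snippets; a minimal sketch of what it might look like, assuming it wraps the requests library and returns a (response, errors) pair:

    import requests

    def get_url_request(url, stream=False, headers=None, verify_ssl=True):
        # Hypothetical reconstruction: callers expect (response, errors),
        # where errors is an empty list on success.
        try:
            response = requests.get(url, stream=stream, headers=headers,
                                    verify=verify_ssl, timeout=60)
            response.raise_for_status()
            return response, []
        except requests.RequestException as e:
            return None, ['Request exception: %s' % e]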
Example 2
    def gather_all_download_urls(self):
        if self.sample:
            return [{
                'url': 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=1',
                'filename': 'page1.json',
                'data_type': 'release_package_list_in_results',
                'encoding': "ISO-8859-1"
            }]

        url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=1'
        r = util.get_url_request(url)
        if r[1]:
            raise Exception(r[1])
        r = r[0]

        data = r.json()
        total = data['maxPage']
        out = []
        for page in range(1, total+1):
            out.append({
                'url': 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=%d' % page,
                'filename': 'page%d.json' % page,
                'data_type': 'release_package_list_in_results',
                'encoding': "ISO-8859-1"
            })
        return out
Example 3
    def gather_all_download_urls(self):
        if self.sample:
            return [{
                'url': 'https://api.colombiacompra.gov.co/releases/?page=1',
                'filename': 'sample.json',
                'data_type': 'release_package',
            }]

        r = util.get_url_request(
            'https://api.colombiacompra.gov.co/releases/?page=1')
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        data = r.json()
        total = data['links']['count']
        page = 1
        out = []
        # The limit is not passed to the API via the URL, but the API
        # currently returns 1000 results per page, so we hard-code it here.
        limit = 1000
        while ((page - 1) * limit) < total:
            out.append({
                'url': 'https://api.colombiacompra.gov.co/releases/?page=%d' % page,
                'filename': 'page%d.json' % page,
                'data_type': 'release_package',
            })
            page += 1
        return out
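
The while-loop is just ceiling division over the result count: with limit = 1000, a total of 2,500 releases produces pages 1, 2 and 3, since (3 - 1) * 1000 < 2500 but (4 - 1) * 1000 is not. An equivalent sketch using math.ceil:

    import math

    def page_numbers(total, limit=1000):
        # One page per `limit` results, rounding up so a partial
        # final page is still fetched.
        return range(1, math.ceil(total / limit) + 1)

    assert list(page_numbers(2500)) == [1, 2, 3]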
Example 4
    def save_url(self, filename, data, file_path):
        if data['data_type'] == 'meta':

            response, errors = util.get_url_request(data['url'])
            if errors:
                return self.SaveUrlResult(errors=errors)

            data = json.loads(response.text)

            additional = []

            if "ListadoOCDS" in data.keys():
                for data_item in data["ListadoOCDS"]:
                    if not self.sample or len(additional) < 10:
                        additional.append({
                            'url': data_item['URLTender'],
                            'filename': 'data-%s.json' % data_item['Codigo'],
                            'data_type': 'release_package',
                            'priority': 1,
                        })

            return self.SaveUrlResult(additional_files=additional)

        else:
            return super(ChileCompraSource, self).save_url(file_name=filename,
                                                           data=data,
                                                           file_path=file_path)
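
The SaveUrlResult type is never defined in these snippets; judging from the fields used across the examples (errors, warnings, additional_files), it could be as small as a namedtuple with optional fields, sketched hypothetically here:

    from collections import namedtuple

    # Hypothetical sketch: additional_files queues follow-up downloads,
    # errors and warnings report problems back to the runner.
    SaveUrlResult = namedtuple(
        'SaveUrlResult',
        ['additional_files', 'errors', 'warnings'],
        defaults=(None, None, None))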
Example 5
    def gather_all_download_urls(self):

        if self.sample:
            return [{
                'url': 'https://birms.bandung.go.id/beta/api/contracts/year/2017?page=1',
                'filename': 'sample.json',
                'data_type': 'meta',
                'priority': 10
            }]

        out = []

        for year in range(2016, 2019):
            url = 'https://birms.bandung.go.id/beta/api/contracts/year/{}'.format(year)
            response, errors = util.get_url_request(url, verify_ssl=False)
            if errors:
                raise Exception(errors)
            data = response.json()

            last_page = data['last_page']
            for page in range(1, last_page+1):
                out.append({
                    'url': 'https://birms.bandung.go.id/beta/api/contracts/year/{}?page={}'.format(year, page),
                    'filename': 'year{}page{}.json'.format(year, page),
                    'data_type': 'meta',
                    'priority': 10
                })

        return out
Example 6
    def save_url(self, filename, data, file_path):
        if data['data_type'] == 'meta':

            response, errors = util.get_url_request(data['url'])
            if errors:
                return self.SaveUrlResult(errors=errors)

            doc = lxml.html.fromstring(response.text)

            additional = []

            for item in doc.xpath('//li'):
                url_bit = item.xpath('a')[0].get('href')
                if url_bit != 'index.html':
                    url = '%s/%s' % (data['url'], url_bit)
                    if not self.sample or len(additional) < 3:
                        additional.append({
                            'url': url,
                            'filename': 'packages-%s.json' % hashlib.md5(url.encode('utf-8')).hexdigest(),
                            'data_type': 'release_package',
                        })

            return self.SaveUrlResult(additional_files=additional)

        else:
            return super(UkraineSource, self).save_url(file_name=filename,
                                                       data=data,
                                                       file_path=file_path)
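
This listing page is plain HTML rather than JSON, so the scraper pulls every <li><a href=...> out with XPath and skips the index.html self-link. A self-contained illustration of that extraction over made-up markup:

    import lxml.html

    # Toy directory listing standing in for the real index page.
    html = ('<ul><li><a href="index.html">index</a></li>'
            '<li><a href="release-2019.json">2019</a></li></ul>')
    doc = lxml.html.fromstring(html)
    hrefs = [item.xpath('a')[0].get('href') for item in doc.xpath('//li')]
    assert hrefs == ['index.html', 'release-2019.json']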
Example 7
    def gather_all_download_urls(self):
        url = 'https://api.datos.gob.mx/v1/contratacionesabiertas?page=%d'
        if self.sample:
            return [{
                'url': url % 1,
                'filename': 'sample.json',
                'data_type': 'record_package_list_in_results',
            }]

        # Only the pagination metadata of this response is used below.
        r = util.get_url_request(url % 2)
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        data = r.json()
        total = data['pagination']['total']
        page = 1
        out = []
        limit = data['pagination']['pageSize']
        while ((page - 1) * limit) < total:
            out.append({
                'url': url % page,
                'filename': 'page%d.json' % page,
                'data_type': 'record_package_list_in_results',
            })
            page += 1
        return out
Example 8
    def save_url(self, filename, data, file_path):
        if data['data_type'] == 'meta':

            response, errors = util.get_url_request(data['url'], verify_ssl=False)
            if errors:
                return self.SaveUrlResult(errors=errors)

            data = json.loads(response.text)

            additional = []

            if "data" in data.keys():

                # Sometimes it's a dict, sometimes it's a list.
                if isinstance(data['data'], dict):
                    data['data'] = data['data'].values()

                for data_item in data["data"]:
                    if not self.sample or len(additional) < 10:
                        additional.append({
                            'url': data_item['uri'],
                            'filename': '{}.json'.format(data_item['ocid']),
                            'data_type': 'release',
                            'priority': 1,
                        })

            return self.SaveUrlResult(additional_files=additional)

        else:
            save_content_response = util.save_content(data['url'], file_path, verify_ssl=False)
            return self.SaveUrlResult(errors=save_content_response.errors, warnings=save_content_response.warnings)
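
The isinstance check handles an API quirk: data is sometimes a JSON object keyed by ID and sometimes an array. Calling .values() on the dict case yields a uniform iterable either way, as this small illustration with a made-up payload shows:

    # Both shapes iterate identically after normalisation.
    payload = {'data': {'ocds-1': {'uri': 'https://example.com/1'}}}
    if isinstance(payload['data'], dict):
        payload['data'] = payload['data'].values()
    for item in payload['data']:
        print(item['uri'])  # https://example.com/1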
Example 9
    def gather_all_download_urls(self):
        r = util.get_url_request(
            'https://datos.gob.mx/busca/api/3/action/package_search?q=organization:gacm&rows=500'
        )
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        data = r.json()
        urls = []
        for result in data['result']['results']:
            for resource in result['resources']:
                if not self.sample or len(urls) < 10:
                    if resource['format'] == 'JSON' and \
                            resource['url'] != "http://datos.gob.mx/adela/api/v1/organizations/gacm/documents":
                        urls.append({
                            'url': resource['url'],
                            'filename': 'file-%s.json' % hashlib.md5(
                                resource['url'].encode('utf-8')).hexdigest(),
                            'data_type': 'release_package_list'
                            if resource['name'] == "CONCENTRADO ARCHIVO JSON"
                            else 'release_package',
                        })
        return urls
Example 10
    def gather_all_download_urls(self):

        if self.sample:
            return [{
                'url': 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=1000&offset=0',
                'filename': 'offset0.json',
                'data_type': 'release_package',
            }]

        url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=1'
        r = util.get_url_request(url)
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        data = r.json()
        total = data['meta']['count']
        offset = 0
        out = []
        limit = 10000
        while offset < total:
            out.append({
                'url': 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d&offset=%d' % (limit, offset),
                'filename': 'offset%d.json' % offset,
                'data_type': 'release_package',
            })
            offset += limit
        return out
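
Unlike the page-numbered APIs in the earlier examples, this one paginates by offset: the loop emits offsets 0, limit, 2 * limit, and so on until the total is covered, so a total of 25,000 releases with limit = 10000 yields offsets 0, 10000 and 20000.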
Example 11
    def gather_all_download_urls(self):
        # The original fetcher also requests 'award' and 'contract', but those
        # tags appear to return HTTP 500 at the moment.
        tags = ['planning', 'tender']
        out = []

        for tag in tags:
            if self.sample:
                out.append({
                    'url': 'http://gpp.ppda.go.ug/api/v1/releases?tag=%s&page=1' % tag,
                    'filename': 'tag%spage1.json' % tag,
                    'data_type': 'release_package',
                })
            else:
                r = util.get_url_request('http://gpp.ppda.go.ug/api/v1/releases?tag=%s&page=1' % tag)
                if r[1]:
                    raise Exception(r[1])
                r = r[0]
                data = r.json()
                last_page = data['pagination']['last_page']
                for page in range(1, last_page+1):
                    out.append({
                        'url': 'http://gpp.ppda.go.ug/api/v1/releases?tag=%s&page=%d' % (tag, page),
                        'filename': 'tag-%s-page-%d.json' % (tag, page),
                        'data_type': 'release_package',
                    })

        return out
Example 12
    def save_url(self, filename, data, file_path):

        record_url = 'https://apis.mercadopublico.cl/OCDS/data/record/%s'
        if data['data_type'] == 'meta':

            response, errors = util.get_url_request(data['url'])
            if errors:
                return self.SaveUrlResult(errors=errors)

            data = json.loads(response.text)

            additional = []

            if "ListadoOCDS" in data.keys():
                for data_item in data["ListadoOCDS"]:
                    if not self.sample or len(additional) < 10:
                        for stage in ['URLTender', 'URLAward']:
                            if stage in data_item:
                                name = stage.replace('URL', '')
                                additional.append({
                                    'url': data_item[stage],
                                    'filename': 'data-%s-%s.json' % (data_item['Codigo'], name),
                                    'data_type': 'release_package',
                                    'priority': 1,
                                })
                        additional.append({
                            'url': record_url % data_item['Codigo'].replace('ocds-70d2nz-', ''),
                            'filename': 'data-%s-record.json' % data_item['Codigo'],
                            'data_type': 'record_package',
                            'priority': 1,
                        })

            return self.SaveUrlResult(additional_files=additional)

        else:
            return super(ChileCompraSource, self).save_url(file_name=filename,
                                                           data=data,
                                                           file_path=file_path)
Example 13
    def gather_all_download_urls(self):

        r = util.get_url_request('https://ocds.ageops.net/api/ocds/records')
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        out = []
        for data in r.json():
            if not self.sample or len(out) < 10:
                out.append({
                    'url': data,
                    'filename': hashlib.md5(data.encode('utf-8')).hexdigest(),
                    'data_type': 'record',
                })
        return out
Example 14
    def save_content(self, url, filepath, headers=None):
        request, errors = get_url_request(url, stream=True, headers=headers)
        if any('Request exception (Code %s): %s' % (401, 'Invalid or expired token') in s for s in errors):
            # The token has expired: clear it, fetch a fresh one and retry.
            self.access_token = None
            errors = self.save_content(url, filepath, headers={"Authorization": self.getAccessToken()})
        if not request:
            return errors

        try:
            with open(filepath, 'wb') as f:
                for chunk in request.iter_content(1024 * 1024):  # 1 MiB chunks
                    f.write(chunk)
            return []
        except Exception as e:
            return [str(e)]
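
For comparison, a minimal standalone version of the same streaming write, assuming plain requests with no token handling:

    import requests

    def download_file(url, filepath):
        # Stream the body so large files never sit fully in memory.
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)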
Example 15
    def save_url(self, filename, data, file_path):

        response, errors = util.get_url_request(data['url'])
        if errors:
            return self.SaveUrlResult(errors=errors)

        zipfile = ZipFile(BytesIO(response.content))
        read_file_name = zipfile.namelist()[0]

        try:
            with open(file_path, 'wb') as f:
                f.write(zipfile.read(read_file_name))
        except Exception as e:
            return self.SaveUrlResult(errors=[str(e)])

        return self.SaveUrlResult()
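
Only the first member of the archive is extracted (namelist()[0]), so these dumps are presumably single-file zips. A self-contained round trip showing the same BytesIO + ZipFile pattern:

    from io import BytesIO
    from zipfile import ZipFile

    # Build a one-member zip in memory, then read it back the way
    # save_url does with the downloaded response body.
    buf = BytesIO()
    with ZipFile(buf, 'w') as z:
        z.writestr('releases.json', '{"releases": []}')
    archive = ZipFile(BytesIO(buf.getvalue()))
    first = archive.namelist()[0]
    assert archive.read(first) == b'{"releases": []}'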
Example 16
    def gather_all_download_urls(self):
        r = util.get_url_request(
            'http://www.contratosabiertos.cdmx.gob.mx/api/contratos/todos')
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        datas = r.json()
        out = []
        for data in datas:
            if not self.sample or len(out) < 10:
                out.append({
                    'url': data['uri'],
                    'filename': 'id%s.json' % data['id'],
                    'data_type': 'release_package',
                })
        return out
Example 17
    def fetchRecordPackageIDs(self, year):
        '''
        Download the CSV file for a particular year, and
        extract the list of record package IDs.
        '''
        url = 'https://www.contrataciones.gov.py/'
        url += 'images/opendata/planificaciones/%s.csv' % year
        r = util.get_url_request(url)
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        decoded_content = r.content.decode('utf-8')
        cr = csv.reader(decoded_content.splitlines(), delimiter=',')
        id_list = []
        for row in cr:
            id_list.append(row[2])
        return id_list[1:]  # drop the CSV header row
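
The id_list[1:] slice drops the CSV header row after the fact; an equivalent pattern skips it up front with next(). A hypothetical sketch of the same third-column extraction:

    import csv

    def third_column(csv_text):
        # Skip the header row, then collect column index 2 of each record.
        cr = csv.reader(csv_text.splitlines(), delimiter=',')
        next(cr, None)
        return [row[2] for row in cr]

    assert third_column('a,b,id\nx,y,123') == ['123']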
Example 18
    def gather_all_download_urls(self):
        r = util.get_url_request(
            'https://contratacionesabiertas.jalisco.gob.mx/OCApi/2017/contracts'
        )
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        datas = r.json()
        out = []
        for data in datas:
            if not self.sample or len(out) < 10:
                out.append({
                    'url': data['URIContract'],
                    'filename': 'id%s.json' % data['ocid'],
                    'data_type': 'record_package',
                })
        return out
Example 19
    def gather_all_download_urls(self):

        if self.sample:
            return [{
                'url': 'https://ocds.ageops.net/api/ocds/releases/2018-09-23',
                'filename': '2018-09-23.json',
                'data_type': 'meta',
                'priority': 10
            }]

        r = util.get_url_request('https://ocds.ageops.net/api/ocds/releases/dates')
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        out = []
        for data in r.json():
            out.append({
                'url': data,
                'filename': data[-10:] + '.json',
                'data_type': 'meta',
                'priority': 10
            })
        return out
Example 20
    def gather_all_download_urls(self):
        r = util.get_url_request('http://ocds.prozorro.openprocurement.io/')
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        doc = lxml.html.fromstring(r.text)

        # Walk the listing, remembering the most recent entry; if a specific
        # date was requested, return that dump as soon as it appears.
        last_url = None
        for item in doc.xpath('//li'):
            url = item.xpath('a')[0].get('href')
            last_url = {
                'url': 'http://ocds.prozorro.openprocurement.io/%s' % url,
                'filename': 'meta-%s.json' % url,
                'data_type': 'meta',
            }
            if self.argument_date and url == 'merged_with_extensions_' + self.argument_date:
                return [last_url]

        if self.argument_date:
            raise Exception("You requested the Ukraine data dated " +
                            self.argument_date + " but we couldn't find that!")
        else:
            return [last_url]
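
When no date is requested, the scraper falls back to the final <li> in the listing, so last_url ends up pointing at the newest dump, on the assumption that the index page lists its files oldest first.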