def gather_all_download_urls(self):
    """Resolve INAI JSON resources on datos.gob.mx to direct download URLs.

    Each JSON resource URL redirects; a HEAD request is issued to read the
    Location header, which is rewritten from the Google Drive 'open?' viewer
    form to the 'uc?export=download&' direct-download form.
    In sample mode, returns after the first resolved URL.
    """
    url = 'http://datos.gob.mx/busca/api/3/action/'
    url += 'package_search?q=organization:inai&rows=500'
    r = util.get_url_request(url)
    if r[1]:
        raise Exception(r[1])
    r = r[0]
    data = r.json()
    out = []
    for result in data['result']['results']:
        for resource in result['resources']:
            if resource['format'] == 'JSON':
                temp = resource['url'].split("//")[1]
                conn = http.client.HTTPConnection(temp.split("/")[0])
                name = temp.split("/")[1]
                try:
                    conn.request('HEAD', "/" + name)
                    response = conn.getresponse()
                    url = response.getheader('Location').replace(
                        "open?", "uc?export=download&")
                finally:
                    # FIX: the original leaked one HTTP connection per resource.
                    conn.close()
                out.append({
                    'url': url,
                    'filename': '{}.json'.format(name),
                    'data_type': 'release_package_list',
                    'encoding': 'utf-8-sig',  # ignore BOM
                })
                if self.sample:
                    return out
    return out
def gather_all_download_urls(self):
    """Enumerate every page of the UK Contracts Finder OCDS search feed.

    In sample mode only the first page is listed; otherwise page 1 is
    fetched to learn 'maxPage' and one entry per page is produced.
    """
    page_url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=%d'

    def page_entry(page):
        # Every page shares the same shape; only the page number varies.
        return {
            'url': page_url % page,
            'filename': 'page%d.json' % page,
            'data_type': 'release_package_list_in_results',
            'encoding': "ISO-8859-1",
        }

    if self.sample:
        return [page_entry(1)]
    response, error = util.get_url_request(page_url % 1)
    if error:
        raise Exception(error)
    max_page = response.json()['maxPage']
    return [page_entry(p) for p in range(1, max_page + 1)]
def gather_all_download_urls(self):
    """Build the list of Colombia Compra release-package page URLs."""
    if self.sample:
        return [{
            'url': 'https://api.colombiacompra.gov.co/releases/?page=1',
            'filename': 'sample.json',
            'data_type': 'release_package',
        }]
    response, error = util.get_url_request(
        'https://api.colombiacompra.gov.co/releases/?page=1')
    if error:
        raise Exception(error)
    total = response.json()['links']['count']
    # this limit is not passed to the API via the URL - but the API is currently returning 1000
    # results per page, so we hard code it
    limit = 1000
    page_count = (total + limit - 1) // limit  # ceiling division
    return [
        {
            'url': 'https://api.colombiacompra.gov.co/releases/?page=%d' % page,
            'filename': 'page%d.json' % page,
            'data_type': 'release_package',
        }
        for page in range(1, page_count + 1)
    ]
def save_url(self, filename, data, file_path):
    """For 'meta' entries, expand the tender listing into per-tender jobs.

    Non-meta entries are delegated to the parent class implementation.
    """
    if data['data_type'] != 'meta':
        return super(ChileCompraSource, self).save_url(file_name=filename, data=data, file_path=file_path)
    response, errors = util.get_url_request(data['url'])
    if errors:
        return self.SaveUrlResult(errors=errors)
    listing = json.loads(response.text)
    additional = []
    for item in listing.get("ListadoOCDS", []):
        # In sample mode only the first 10 tenders are queued.
        if self.sample and len(additional) >= 10:
            continue
        additional.append({
            'url': item['URLTender'],
            'filename': 'data-%s.json' % item['Codigo'],
            'data_type': 'release_package',
            'priority': 1,
        })
    return self.SaveUrlResult(additional_files=additional)
def gather_all_download_urls(self):
    """Collect paged contract-listing URLs for Bandung, years 2016-2018."""
    if self.sample:
        return [{
            'url': 'https://birms.bandung.go.id/beta/api/contracts/year/2017?page=1',
            'filename': 'sample.json',
            'data_type': 'meta',
            'priority': 10
        }]
    jobs = []
    for year in range(2016, 2019):
        year_url = 'https://birms.bandung.go.id/beta/api/contracts/year/{}'.format(year)
        # NOTE(review): verify_ssl=False presumably works around a failing
        # TLS certificate on this host - confirm before changing.
        response, errors = util.get_url_request(year_url, verify_ssl=False)
        if errors:
            raise Exception(errors)
        final_page = response.json()['last_page']
        jobs.extend(
            {
                'url': 'https://birms.bandung.go.id/beta/api/contracts/year/{}?page={}'.format(year, page),
                'filename': 'year{}page{}.json'.format(year, page),
                'data_type': 'meta',
                'priority': 10
            }
            for page in range(1, final_page + 1)
        )
    return jobs
def save_url(self, filename, data, file_path):
    """Expand a 'meta' directory page into per-package download entries.

    Non-meta entries are delegated to the parent class implementation.
    """
    if data['data_type'] != 'meta':
        return super(UkraineSource, self).save_url(file_name=filename, data=data, file_path=file_path)
    response, errors = util.get_url_request(data['url'])
    if errors:
        return self.SaveUrlResult(errors=errors)
    page = lxml.html.fromstring(response.text)
    additional = []
    for list_item in page.xpath('//li'):
        href = list_item.xpath('a')[0].get('href')
        if href == 'index.html':
            continue  # skip the listing's self-link
        full_url = '%s/%s' % (data['url'], href)
        # In sample mode only the first 3 packages are queued.
        if self.sample and len(additional) >= 3:
            continue
        additional.append({
            'url': full_url,
            'filename': 'packages-%s.json' % hashlib.md5(full_url.encode('utf-8')).hexdigest(),
            'data_type': 'release_package',
        })
    return self.SaveUrlResult(additional_files=additional)
def gather_all_download_urls(self):
    """Page through Mexico's contrataciones abiertas API, one URL per page."""
    url = 'https://api.datos.gob.mx/v1/contratacionesabiertas?page=%d'
    if self.sample:
        return [{
            'url': url % 1,
            'filename': 'sample.json',
            'data_type': 'record_package_list_in_results',
        }]
    # NOTE(review): pagination metadata is read from page 2 - presumably any
    # page reports the same totals; confirm against the API.
    response, error = util.get_url_request(url % 2)
    if error:
        raise Exception(error)
    pagination = response.json()['pagination']
    total = pagination['total']
    limit = pagination['pageSize']
    page_count = (total + limit - 1) // limit  # ceiling division
    return [
        {
            'url': url % page,
            'filename': 'page%d.json' % page,
            'data_type': 'record_package_list_in_results',
        }
        for page in range(1, page_count + 1)
    ]
def save_url(self, filename, data, file_path):
    """Expand 'meta' listings into per-release downloads; save others directly."""
    if data['data_type'] != 'meta':
        save_content_response = util.save_content(data['url'], file_path, verify_ssl=False)
        return self.SaveUrlResult(errors=save_content_response.errors,
                                  warnings=save_content_response.warnings)
    response, errors = util.get_url_request(data['url'], verify_ssl=False)
    if errors:
        return self.SaveUrlResult(errors=errors)
    listing = json.loads(response.text)
    additional = []
    if "data" in listing:
        records = listing['data']
        # Sometimes it's a dict, sometimes it's a list.
        if isinstance(records, dict):
            records = records.values()
        for record in records:
            # In sample mode only the first 10 releases are queued.
            if self.sample and len(additional) >= 10:
                continue
            additional.append({
                'url': record['uri'],
                'filename': '{}.json'.format(record['ocid']),
                'data_type': 'release',
                'priority': 1,
            })
    return self.SaveUrlResult(additional_files=additional)
def gather_all_download_urls(self):
    """Pick JSON resources for the GACM organisation on datos.gob.mx.

    One known non-data endpoint is excluded; the concentrator resource gets
    the 'release_package_list' type, everything else 'release_package'.
    """
    response, error = util.get_url_request(
        'https://datos.gob.mx/busca/api/3/action/package_search?q=organization:gacm&rows=500'
    )
    if error:
        raise Exception(error)
    excluded = "http://datos.gob.mx/adela/api/v1/organizations/gacm/documents"
    urls = []
    for package in response.json()['result']['results']:
        for resource in package['resources']:
            # In sample mode only the first 10 resources are listed.
            if self.sample and len(urls) >= 10:
                continue
            if resource['format'] != 'JSON' or resource['url'] == excluded:
                continue
            urls.append({
                'url': resource['url'],
                'filename': 'file-%s.json' % hashlib.md5(
                    resource['url'].encode('utf-8')).hexdigest(),
                'data_type': 'release_package_list'
                if resource['name'] == "CONCENTRADO ARCHIVO JSON" else 'release_package',
            })
    return urls
def gather_all_download_urls(self):
    """Walk the Montreal contracts API in fixed-size offset windows."""
    if self.sample:
        return [{
            'url': 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=1000&offset=0',
            'filename': 'offset0.json',
            'data_type': 'release_package',
        }]
    # A limit=1 probe is enough to read the total record count.
    response, error = util.get_url_request(
        'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=1')
    if error:
        raise Exception(error)
    total = response.json()['meta']['count']
    window = 10000
    return [
        {
            'url': 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d&offset=%d' % (window, offset),
            'filename': 'offset%d.json' % offset,
            'data_type': 'release_package',
        }
        for offset in range(0, total, window)
    ]
def gather_all_download_urls(self):
    """Enumerate Uganda GPP release pages for each supported tag."""
    # , 'award', 'contract' <-- original fetcher also has these but these return 500?
    tags = ['planning', 'tender']
    out = []
    for tag in tags:
        if self.sample:
            out.append({
                'url': 'http://gpp.ppda.go.ug/api/v1/releases?tag=%s&page=1' % tag,
                'filename': 'tag%spage1.json' % tag,
                'data_type': 'release_package',
            })
            continue
        response, error = util.get_url_request(
            'http://gpp.ppda.go.ug/api/v1/releases?tag=%s&page=1' % tag)
        if error:
            raise Exception(error)
        final_page = response.json()['pagination']['last_page']
        out.extend(
            {
                'url': 'http://gpp.ppda.go.ug/api/v1/releases?tag=%s&page=%d' % (tag, page),
                'filename': 'tag-%s-page-%d.json' % (tag, page),
                'data_type': 'release_package',
            }
            for page in range(1, final_page + 1)
        )
    return out
def save_url(self, filename, data, file_path):
    """Expand a ChileCompra 'meta' listing into tender/award/record downloads.

    Non-meta entries are delegated to the parent class implementation.
    """
    record_url = 'https://apis.mercadopublico.cl/OCDS/data/record/%s'
    if data['data_type'] != 'meta':
        return super(ChileCompraSource, self).save_url(file_name=filename, data=data, file_path=file_path)
    response, errors = util.get_url_request(data['url'])
    if errors:
        return self.SaveUrlResult(errors=errors)
    listing = json.loads(response.text)
    additional = []
    for item in listing.get("ListadoOCDS", []):
        # In sample mode stop queueing new items once 10 have accumulated.
        if self.sample and len(additional) >= 10:
            continue
        codigo = item['Codigo']
        # One release package per stage present on the item.
        for stage in ['URLTender', 'URLAward']:
            if stage not in item:
                continue
            additional.append({
                'url': item[stage],
                'filename': 'data-%s-%s.json' % (codigo, stage.replace('URL', '')),
                'data_type': 'release_package',
                'priority': 1,
            })
        # Plus the record package, addressed by the code without its prefix.
        additional.append({
            'url': record_url % codigo.replace('ocds-70d2nz-', ''),
            'filename': 'data-%s-record.json' % codigo,
            'data_type': 'record_package',
            'priority': 1,
        })
    return self.SaveUrlResult(additional_files=additional)
def gather_all_download_urls(self):
    """Return one download entry per record URL listed by the AGEOPS API."""
    response, error = util.get_url_request('https://ocds.ageops.net/api/ocds/records')
    if error:
        raise Exception(error)
    entries = []
    for record_url in response.json():
        # In sample mode only the first 10 records are listed.
        if self.sample and len(entries) >= 10:
            continue
        entries.append({
            'url': record_url,
            'filename': hashlib.md5(record_url.encode('utf-8')).hexdigest(),
            'data_type': 'record',
        })
    return entries
def save_content(self, url, filepath, headers=None):
    """Stream *url* to *filepath*, retrying once with a fresh token on 401.

    Returns a list of error strings ([] on success).
    """
    request, errors = get_url_request(url, stream=True, headers=headers)
    if any('Request exception (Code %s): %s' % (401, 'Invalid or expired token') in s for s in errors):
        # Token expired: discard it, fetch a fresh one and retry exactly once.
        # FIX: return the retry result directly - the original fell through
        # and could overwrite the retried file with the stale 401 response.
        self.access_token = None
        return self.save_content(url, filepath, headers={"Authorization": self.getAccessToken()})
    if not request:
        return errors
    try:
        with open(filepath, 'wb') as f:
            # FIX: original used 1024 ^ 2 (XOR == 1026 bytes); 1 MiB intended.
            for chunk in request.iter_content(1024 * 1024):
                f.write(chunk)
        return []
    except Exception as e:
        return [str(e)]
def save_url(self, filename, data, file_path):
    """Download a ZIP from data['url'] and write its first member to file_path.

    Returns a SaveUrlResult; failures are reported as errors, not raised.
    """
    response, errors = util.get_url_request(data['url'])
    if errors:
        return self.SaveUrlResult(errors=errors)
    # FIX: use a context manager so the archive is always closed, and avoid
    # shadowing the stdlib `zipfile` module name. Archive errors (e.g. a
    # corrupt download) are now reported via SaveUrlResult like write errors.
    try:
        with ZipFile(BytesIO(response.content)) as archive:
            first_member = archive.namelist()[0]
            with open(file_path, 'wb') as f:
                f.write(archive.read(first_member))
    except Exception as e:
        return self.SaveUrlResult(errors=[str(e)])
    return self.SaveUrlResult()
def gather_all_download_urls(self):
    """List every contract release package published by Mexico City (CDMX)."""
    response, error = util.get_url_request(
        'http://www.contratosabiertos.cdmx.gob.mx/api/contratos/todos')
    if error:
        raise Exception(error)
    entries = []
    for contract in response.json():
        # In sample mode only the first 10 contracts are listed.
        if self.sample and len(entries) >= 10:
            continue
        entries.append({
            'url': contract['uri'],
            'filename': 'id%s.json' % contract['id'],
            'data_type': 'release_package',
        })
    return entries
def fetchRecordPackageIDs(self, year):
    """Download the planning CSV for *year* and return its record package IDs.

    The IDs are taken from the third column; the header row is skipped.
    """
    url = ('https://www.contrataciones.gov.py/'
           'images/opendata/planificaciones/%s.csv' % year)
    response, error = util.get_url_request(url)
    if error:
        raise Exception(error)
    rows = csv.reader(response.content.decode('utf-8').splitlines(), delimiter=',')
    next(rows, None)  # drop the header row
    return [row[2] for row in rows]
def gather_all_download_urls(self):
    """Enumerate Jalisco's 2017 contracts as record-package downloads."""
    response, error = util.get_url_request(
        'https://contratacionesabiertas.jalisco.gob.mx/OCApi/2017/contracts'
    )
    if error:
        raise Exception(error)
    entries = []
    for contract in response.json():
        # In sample mode only the first 10 contracts are listed.
        if self.sample and len(entries) >= 10:
            continue
        entries.append({
            'url': contract['URIContract'],
            'filename': 'id%s.json' % contract['ocid'],
            'data_type': 'record_package',
        })
    return entries
def gather_all_download_urls(self):
    """Return one 'meta' entry per publication date listed by AGEOPS."""
    if self.sample:
        return [{
            'url': 'https://ocds.ageops.net/api/ocds/releases/2018-09-23',
            'filename': '2018-09-23.json',
            'data_type': 'meta',
            'priority': 10
        }]
    response, error = util.get_url_request('https://ocds.ageops.net/api/ocds/releases/dates')
    if error:
        raise Exception(error)
    # Each URL ends in a YYYY-MM-DD date; use its last 10 chars as the stem.
    return [
        {
            'url': date_url,
            'filename': date_url[-10:] + '.json',
            'data_type': 'meta',
            'priority': 10
        }
        for date_url in response.json()
    ]
def gather_all_download_urls(self):
    """Select the Prozorro dump to fetch: a specific dated one, or the newest.

    The directory listing is scanned in order. If self.argument_date is set,
    the matching 'merged_with_extensions_<date>' entry is required and an
    exception is raised when it is absent; otherwise the last listed entry
    is used.
    """
    response, error = util.get_url_request('http://ocds.prozorro.openprocurement.io/')
    if error:
        raise Exception(error)
    listing = lxml.html.fromstring(response.text)
    newest = None
    for item in listing.xpath('//li'):
        href = item.xpath('a')[0].get('href')
        newest = {
            'url': 'http://ocds.prozorro.openprocurement.io/%s' % href,
            'filename': 'meta-%s.json' % href,
            'data_type': 'meta',
        }
        if self.argument_date and href == 'merged_with_extensions_' + self.argument_date:
            return [newest]
    if self.argument_date:
        raise Exception("You requested the Ukraine data dated " + self.argument_date + " but we couldn't find that!")
    return [newest]