def make_json(self, export_type='datajson', owner_org=None):
    # Error handler for creating error log
    stream = StringIO.StringIO()
    eh = logging.StreamHandler(stream)
    eh.setLevel(logging.WARN)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    eh.setFormatter(formatter)
    logger.addHandler(eh)

    data = ''
    output = []
    errors_json = []
    Package2Pod.seen_identifiers = set()

    try:
        # Build the data.json file.
        if owner_org:
            if 'datajson' == export_type:
                # We didn't check ownership for this type of export, so never load private datasets here.
                packages = DataJsonController._get_ckan_datasets(org=owner_org)
                if not packages:
                    packages = self.get_packages(owner_org=owner_org, with_private=False)
            else:
                packages = self.get_packages(owner_org=owner_org, with_private=True)
        else:
            # TODO: load data by pages
            # packages = p.toolkit.get_action("current_package_list_with_resources")(
            #     None, {'limit': 50, 'page': 300})
            packages = DataJsonController._get_ckan_datasets()
            # packages = p.toolkit.get_action("current_package_list_with_resources")(None, {})

        json_export_map = get_export_map_json('export.map.json')

        if json_export_map:
            for pkg in packages:
                if json_export_map.get('debug'):
                    output.append(pkg)
                # logger.error('package: %s', json.dumps(pkg))
                # logger.debug("processing %s" % (pkg.get('title')))
                extras = dict([(x['key'], x['value']) for x in pkg.get('extras', {})])

                # unredacted = all non-draft datasets (public + private)
                # redacted = public-only, non-draft datasets
                if export_type in ['unredacted', 'redacted']:
                    if 'Draft' == extras.get('publishing_status'):
                        # publisher = detect_publisher(extras)
                        # logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted (%s)\n",
                        #             pkg.get('id'), pkg.get('title'), publisher,
                        #             'publishing_status: Draft')
                        # self._errors_json.append(OrderedDict([
                        #     ('id', pkg.get('id')),
                        #     ('name', pkg.get('name')),
                        #     ('title', pkg.get('title')),
                        #     ('errors', [(
                        #         'publishing_status: Draft',
                        #         [
                        #             'publishing_status: Draft'
                        #         ]
                        #     )])
                        # ]))
                        continue
                    # if 'redacted' == export_type and re.match(r'[Nn]on-public', extras.get('public_access_level')):
                    #     continue
                # draft = all draft-only datasets
                elif 'draft' == export_type:
                    if extras.get('publishing_status') != 'Draft':
                        continue

                redaction_enabled = ('redacted' == export_type)
                datajson_entry = Package2Pod.convert_package(pkg, json_export_map, redaction_enabled)

                errors = None
                if 'errors' in datajson_entry.keys():
                    errors_json.append(datajson_entry)
                    errors = datajson_entry.get('errors')
                    datajson_entry = None

                if datajson_entry and \
                        (not json_export_map.get('validation_enabled') or self.is_valid(datajson_entry)):
                    # logger.debug("writing to json: %s" % (pkg.get('title')))
                    output.append(datajson_entry)
                else:
                    publisher = detect_publisher(extras)
                    if errors:
                        logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted, reason below:\n\t%s\n",
                                    pkg.get('id'), pkg.get('title'), publisher, errors)
                    else:
                        logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted, reason above.\n",
                                    pkg.get('id'), pkg.get('title'), publisher)

            data = Package2Pod.wrap_json_catalog(output, json_export_map)
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        filename = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("%s : %s : %s : %s", exc_type, filename, exc_tb.tb_lineno, unicode(e))

    # Get the error log
    eh.flush()
    error = stream.getvalue()
    eh.close()
    logger.removeHandler(eh)
    stream.close()

    # Skip compression if we export the whole /data.json catalog
    if 'datajson' == export_type:
        return data

    return self.write_zip(data, error, errors_json, zip_name=export_type)
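# --- Hedged sketch (not part of the controller above) -----------------------
# The export-type filtering inside make_json is easiest to read as a single
# predicate. This is a minimal, self-contained restatement of that logic; the
# helper name `should_export` is hypothetical.

def should_export(export_type, extras):
    """Return True if a dataset with these extras belongs in this export.

    unredacted / redacted exports drop drafts; the draft export keeps only
    drafts; every other export type (e.g. 'datajson') keeps everything.
    """
    is_draft = extras.get('publishing_status') == 'Draft'
    if export_type in ('unredacted', 'redacted'):
        return not is_draft
    if export_type == 'draft':
        return is_draft
    return True

# Examples: should_export('redacted', {'publishing_status': 'Draft'}) -> False
#           should_export('draft', {'publishing_status': 'Draft'})    -> True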
def make_json(self, export_type='datajson', owner_org=None):
    # Error handler for creating error log
    stream = StringIO.StringIO()
    eh = logging.StreamHandler(stream)
    eh.setLevel(logging.WARN)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    eh.setFormatter(formatter)
    logger.addHandler(eh)

    data = ''
    output = []
    errors_json = []
    Package2Pod.seen_identifiers = set()

    try:
        # Build the data.json file.
        if owner_org:
            if 'datajson' == export_type:
                # We didn't check ownership for this type of export, so never load private datasets here.
                packages = DataJsonController._get_ckan_datasets(org=owner_org)
                if not packages:
                    packages = self.get_packages(owner_org=owner_org, with_private=False)
            else:
                packages = self.get_packages(owner_org=owner_org, with_private=True)
        else:
            # TODO: load data by pages
            # packages = p.toolkit.get_action("current_package_list_with_resources")(
            #     None, {'limit': 50, 'page': 300})
            packages = DataJsonController._get_ckan_datasets()
            # packages = p.toolkit.get_action("current_package_list_with_resources")(None, {})

        import re
        for i in range(0, len(packages)):
            j = 0
            for extra in packages[i]['extras']:
                if extra.get('key') == 'language':
                    print 'Key: {}, Value: {}'.format(extra.get('key'), extra.get('value'))
                    if not isinstance(extra.get('value'), (unicode, str)):
                        # We can only operate on values that are unicode or str instances.
                        logger.warn('Could not render the "language" field.')
                    else:
                        language = []
                        try:
                            # Try to convert the language value straight into a list.
                            language = json.loads(extra['value'])
                        except ValueError:
                            # Direct conversion failed: clean up and retry.
                            # (Bug fix: the original `if "{" or "}" in ...` was always truthy.)
                            if '{' in extra['value'] or '}' in extra['value']:
                                lang = extra['value'].replace('{', '').replace('}', '').split(',')
                            else:
                                lang = extra['value']
                                if ',' in lang:
                                    lang = lang.split(',')
                                else:
                                    lang = [lang]
                            # (Bug fix: `json.loads(lang)` raised TypeError; `lang` is already a list here.)
                            language = lang
                        packages[i]['extras'][j]['value'] = language
                j += 1

            try:
                for index, resource in enumerate(packages[i]['resources']):
                    try:
                        fixed_attrDesc = json.loads(resource['attributesDescription'])
                        packages[i]['resources'][index]['attributesDescription'] = fixed_attrDesc
                    except ValueError:
                        logger.error('Failed to render \'attributesDescription\'.')
            except KeyError:
                pass

            # Read ckan.site_url to build each resource's accessURL.
            ckan_site_url = config.get('ckan.site_url')
            try:
                for index, resource in enumerate(packages[i]['resources']):
                    resource = packages[i]['resources'][index]
                    if not resource.get('accessURL', None):
                        accessURL = os.path.join(ckan_site_url, 'dataset', packages[i]['id'],
                                                 'resource', resource['id'])
                        resource.update({'accessURL': accessURL})
            except KeyError:
                pass

            ckan_host = ''
            try:
                ckan_host = re.match(
                    r'(?:http)s?:\/\/([\w][^\/=\s]+)\/?|(^w{3}[\.\w][^\/\=\s]{2,})\/?',
                    packages[i]['resources'][0]['url']).group(0)
            except Exception:
                pass

            themes = self.safely_map(dict.get, packages[i]['groups'], 'name')
            packages[i]['groups'] = themes

            try:
                packages[i]['author'] = {
                    'name': packages[i]['author'],
                    'mbox': packages[i]['author_email']
                }
            except KeyError:
                pass

            tags = self.safely_map(dict.get, packages[i]['tags'], 'display_name')
            packages[i]['tags'] = tags

            # packages[i] = json.loads(packages[i][0]['extras']['language'])
            try:
                if len(packages[i]['url']) < 1:
                    packages[i]['url'] = '{host}/dataset/{dataset_id}'.format(
                        host=ckan_host[:-1], dataset_id=packages[i]['name'])
                    logger.info('landingPage generated for dataset_id: %s.', packages[i]['name'])
            except TypeError:
                prepare_url = 'unknown'
                try:
                    prepare_url = packages[i]['resources'][0]['url']
                    prepare_url = prepare_url.split('resource')[0]
                    logger.info('landingPage generated for dataset_id: %s, data type: "harvest".',
                                packages[i]['name'])
                except IndexError:
                    logger.error('autogen "landingPage" failed.')
                packages[i].update({'url': prepare_url})

        json_export_map = get_export_map_json('export.map.json')

        if json_export_map:
            for pkg in packages:
                if json_export_map.get('debug'):
                    output.append(pkg)
                extras = dict([(x['key'], x['value']) for x in pkg.get('extras', {})])

                if export_type in ['unredacted', 'redacted']:
                    if 'Draft' == extras.get('publishing_status'):
                        continue
                elif 'draft' == export_type:
                    if extras.get('publishing_status') != 'Draft':
                        continue

                redaction_enabled = ('redacted' == export_type)
                datajson_entry = Package2Pod.convert_package(pkg, json_export_map, redaction_enabled)

                errors = None
                if 'errors' in datajson_entry.keys():
                    errors_json.append(datajson_entry)
                    errors = datajson_entry.get('errors')
                    datajson_entry = None

                if datajson_entry and \
                        (not json_export_map.get('validation_enabled') or self.is_valid(datajson_entry)):
                    # logger.debug("writing to json: %s" % (pkg.get('title')))
                    output.append(datajson_entry)
                else:
                    publisher = detect_publisher(extras)
                    if errors:
                        logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted, reason below:\n\t%s\n",
                                    pkg.get('id'), pkg.get('title'), publisher, errors)
                    else:
                        logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted, reason above.\n",
                                    pkg.get('id'), pkg.get('title'), publisher)

            try:
                # Clean fields that are not required.
                for d in output:
                    del d['@type']
            except Exception:
                pass

            data = Package2Pod.wrap_json_catalog(output, json_export_map)
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        filename = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("%s : %s : %s : %s", exc_type, filename, exc_tb.tb_lineno, unicode(e))

    # Get the error log
    eh.flush()
    error = stream.getvalue()
    eh.close()
    logger.removeHandler(eh)
    stream.close()

    # Skip compression if we export the whole /data.json catalog
    if 'datajson' == export_type:
        return data

    return self.write_zip(data, error, errors_json, zip_name=export_type)
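# --- Hedged sketch (not part of the controller above) -----------------------
# The try/except dance that normalizes the 'language' extra above is worth
# isolating. This is a self-contained sketch of the same idea; the helper
# name `normalize_language` is hypothetical.

import json

def normalize_language(value):
    """Coerce a language extra into a list of codes.

    Accepts JSON ('["es", "en"]'), Postgres-array style ('{es,en}'),
    comma-separated ('es,en') and bare ('es') values.
    """
    try:
        parsed = json.loads(value)
        return parsed if isinstance(parsed, list) else [parsed]
    except ValueError:
        cleaned = value.replace('{', '').replace('}', '')
        return [part.strip() for part in cleaned.split(',') if part.strip()]

# Examples: normalize_language('{es,en}') -> ['es', 'en']
#           normalize_language('"es"')    -> ['es']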
def make_json(self, export_type='datajson', owner_org=None, with_private=False):
    logger.info('\n{}\n'.format(owner_org))

    # Error handler for creating error log
    stream = StringIO.StringIO()
    eh = logging.StreamHandler(stream)
    eh.setLevel(logging.WARN)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    eh.setFormatter(formatter)
    logger.addHandler(eh)

    data = ''
    output = []
    first_line = True
    errors_json = []
    Package2Pod.seen_identifiers = set()

    try:
        n = 500
        page = 1
        dataset_list = []
        q = '+capacity:public' if not with_private else '*:*'
        fq = 'dataset_type:dataset'
        if owner_org:
            fq += ' AND organization:' + owner_org

        while True:
            search_data_dict = {
                'q': q,
                'fq': fq,
                'sort': 'metadata_modified desc',
                'rows': n,
                'start': n * (page - 1),
            }
            query = p.toolkit.get_action('package_search')({}, search_data_dict)
            packages = query['results']
            """
            if owner_org:
                if 'datajson' == export_type:
                    if not packages:
                        packages = self.get_packages(owner_org=owner_org, with_private=False)
            """
            if len(query['results']):
                json_export_map = get_export_map_json('export.map.json')
                if json_export_map:
                    for pkg in packages:
                        if json_export_map.get('debug'):
                            output.append(pkg)
                        extras = dict([(x['key'], x['value']) for x in pkg.get('extras', {})])

                        # unredacted = all non-draft datasets (public + private)
                        # redacted = public-only, non-draft datasets
                        if export_type in ['unredacted', 'redacted']:
                            if 'Draft' == extras.get('publishing_status'):
                                continue
                        # draft = all draft-only datasets
                        elif 'draft' == export_type:
                            if extras.get('publishing_status') != 'Draft':
                                continue

                        redaction_enabled = ('redacted' == export_type)
                        datajson_entry = Package2Pod.convert_package(
                            pkg, json_export_map, DataJsonPlugin.site_url, redaction_enabled)

                        errors = None
                        if 'errors' in datajson_entry.keys():
                            errors_json.append(datajson_entry)
                            errors = datajson_entry.get('errors')
                            datajson_entry = None

                        if datajson_entry and \
                                (not json_export_map.get('validation_enabled') or self.is_valid(datajson_entry)):
                            output.append(datajson_entry)
                        else:
                            publisher = detect_publisher(extras)
                            if errors:
                                logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted, reason below:\n\t%s\n",
                                            pkg.get('id'), pkg.get('title'),
                                            pkg.get('organization').get('title'), errors)
                            else:
                                logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted, reason above.\n",
                                            pkg.get('id'), pkg.get('title'),
                                            pkg.get('organization').get('title'))
                if 'datajson' == export_type:
                    page += 1
                else:
                    break
            else:
                break

        data = Package2Pod.wrap_json_catalog(output, json_export_map)
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        filename = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("%s : %s : %s : %s", exc_type, filename, exc_tb.tb_lineno, unicode(e))

    # Get the error log
    eh.flush()
    error = stream.getvalue()
    eh.close()
    logger.removeHandler(eh)
    stream.close()

    return data
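# --- Hedged sketch (not part of the controller above) -----------------------
# The while-loop above pages through package_search with rows/start. The same
# pattern, stripped of CKAN specifics, looks like this; `search_fn` stands in
# for p.toolkit.get_action('package_search') and is hypothetical.

def iter_all_results(search_fn, query, rows=500):
    """Yield every result of a paged search, one page at a time."""
    page = 1
    while True:
        batch = search_fn(dict(query, rows=rows, start=rows * (page - 1)))
        results = batch.get('results', [])
        if not results:
            break
        for result in results:
            yield result
        page += 1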
def make_json(self, export_type='datajson', owner_org=None):
    # Error handler for creating error log
    stream = StringIO.StringIO()
    eh = logging.StreamHandler(stream)
    eh.setLevel(logging.WARN)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    eh.setFormatter(formatter)
    logger.addHandler(eh)

    data = ''
    output = []
    errors_json = []
    Package2Pod.seen_identifiers = set()

    try:
        # Build the data.json file.
        if owner_org:
            if 'datajson' == export_type:
                # We didn't check ownership for this type of export, so never load private datasets here.
                packages = DataJsonController._get_ckan_datasets(org=owner_org)
                if not packages:
                    packages = self.get_packages(owner_org=owner_org, with_private=False)
            else:
                packages = self.get_packages(owner_org=owner_org, with_private=True)
        else:
            # TODO: load data by pages
            # packages = p.toolkit.get_action("current_package_list_with_resources")(
            #     None, {'limit': 50, 'page': 300})
            packages = DataJsonController._get_ckan_datasets()
            # packages = p.toolkit.get_action("current_package_list_with_resources")(None, {})

        import re
        for i in range(0, len(packages)):
            # Reset per package. (Bug fix: the original initialized these once,
            # so every dataset accumulated the tags and themes of all previous ones.)
            tags = []
            themes = []
            j = 0
            for extra in packages[i]['extras']:
                if extra['key'] == 'language':
                    if ("{" in extra['value'] and "}" in extra['value']) or len(extra['value']) == 3:
                        extra['value'] = "[\"{0}\"]".format(
                            extra['value'].replace('{', '').replace('}', ''))
                    packages[i]['extras'][j]['value'] = json.loads(extra['value'])
                elif extra['key'] == 'globalGroups':
                    packages[i]['extras'][j]['value'] = json.loads(extra['value'])
                j += 1

            try:
                for j in range(0, len(packages[i]['resources'])):
                    fixed_attrDesc = json.loads(packages[i]['resources'][j]['attributesDescription'])
                    packages[i]['resources'][j]['attributesDescription'] = fixed_attrDesc
            except KeyError:
                pass

            try:
                for j in range(0, len(packages[i]['resources'])):
                    accessURL = packages[i]['resources'][j]['url']
                    accessURL = accessURL.split('download')[0].replace('/resource/', '/archivo/')
                    packages[i]['resources'][j].update({'accessURL': accessURL[:-1]})
            except KeyError:
                pass

            ckan_host = ''
            try:
                ckan_host = re.match(
                    r'(?:http)s?:\/\/([\w][^\/=\s]+)\/?|(^w{3}[\.\w][^\/\=\s]{2,})\/?',
                    packages[i]['resources'][0]['url']).group(0)
            except Exception:
                pass

            try:
                for theme in packages[i]['groups']:
                    themes.append(theme['title'])
            except KeyError:
                pass

            try:
                packages[i]['author'] = {
                    'name': packages[i]['author'],
                    'mbox': packages[i]['author_email']
                }
            except KeyError:
                pass

            try:
                for tag in packages[i]['tags']:
                    tags.append(tag['display_name'])
            except KeyError:
                pass

            # packages[i] = json.loads(packages[i][0]['extras']['language'])
            packages[i]['groups'] = themes
            packages[i]['tags'] = tags

            try:
                if len(packages[i]['url']) < 1:
                    packages[i]['url'] = '{host}/dataset/{dataset_id}'.format(
                        host=ckan_host[:-1], dataset_id=packages[i]['name'])
                    logger.info('landingPage generated for dataset_id: %s.', packages[i]['name'])
            except TypeError:
                prepare_url = 'unknown'
                try:
                    prepare_url = packages[i]['resources'][0]['url']
                    prepare_url = prepare_url.split('resource')[0]
                    logger.info('landingPage generated for dataset_id: %s, data type: "harvest".',
                                packages[i]['name'])
                except IndexError:
                    logger.error('autogen "landingPage" failed.')
                packages[i].update({'url': prepare_url})

        json_export_map = get_export_map_json('export.map.json')

        if json_export_map:
            for pkg in packages:
                if json_export_map.get('debug'):
                    output.append(pkg)
                extras = dict([(x['key'], x['value']) for x in pkg.get('extras', {})])

                if export_type in ['unredacted', 'redacted']:
                    if 'Draft' == extras.get('publishing_status'):
                        continue
                elif 'draft' == export_type:
                    if extras.get('publishing_status') != 'Draft':
                        continue

                redaction_enabled = ('redacted' == export_type)
                datajson_entry = Package2Pod.convert_package(pkg, json_export_map, redaction_enabled)

                errors = None
                if 'errors' in datajson_entry.keys():
                    errors_json.append(datajson_entry)
                    errors = datajson_entry.get('errors')
                    datajson_entry = None

                if datajson_entry and \
                        (not json_export_map.get('validation_enabled') or self.is_valid(datajson_entry)):
                    # logger.debug("writing to json: %s" % (pkg.get('title')))
                    output.append(datajson_entry)
                else:
                    publisher = detect_publisher(extras)
                    if errors:
                        logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted, reason below:\n\t%s\n",
                                    pkg.get('id'), pkg.get('title'), publisher, errors)
                    else:
                        logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted, reason above.\n",
                                    pkg.get('id'), pkg.get('title'), publisher)

            try:
                # Clean fields that are not required.
                for d in output:
                    del d['@type']
            except Exception:
                pass

            data = Package2Pod.wrap_json_catalog(output, json_export_map)
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        filename = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("%s : %s : %s : %s", exc_type, filename, exc_tb.tb_lineno, unicode(e))

    # Get the error log
    eh.flush()
    error = stream.getvalue()
    eh.close()
    logger.removeHandler(eh)
    stream.close()

    # Skip compression if we export the whole /data.json catalog
    if 'datajson' == export_type:
        return data

    return self.write_zip(data, error, errors_json, zip_name=export_type)
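# --- Hedged sketch (not part of the controller above) -----------------------
# The accessURL rewrite above turns a CKAN resource download URL into the
# portal's /archivo/ view by string surgery. A self-contained restatement;
# the helper name `to_access_url` is hypothetical.

def to_access_url(resource_url):
    """Map .../resource/<id>/download/<file> to .../archivo/<id>."""
    base = resource_url.split('download')[0].replace('/resource/', '/archivo/')
    return base.rstrip('/')

# Example:
#   to_access_url('http://datos.example/dataset/d1/resource/r1/download/f.csv')
#   -> 'http://datos.example/dataset/d1/archivo/r1'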