def validator(self):
    """Validate that a URL points to a well-formed data.json file.

    Expects a POSTed ``url`` field. Loads and parses the remote document,
    collecting any problems into ``c.errors``, then renders the result page.
    On a GET (or an empty/missing url) it simply renders the form.
    """
    if request.method == "POST" and request.POST.get("url", "").strip():
        import json
        import urllib
        from datajsonvalidator import do_validation

        c.source_url = request.POST["url"]
        c.errors = []

        body = None
        try:
            body = json.load(urllib.urlopen(c.source_url))
        except IOError as err:
            # Network / file-access failure while fetching the URL.
            c.errors.append(
                ("Error Loading File",
                 ["The address could not be loaded: " + unicode(err)]))
        except ValueError as err:
            # json.load raises ValueError on malformed JSON.
            c.errors.append(
                ("Invalid JSON",
                 ["The file does not meet basic JSON syntax requirements: "
                  + unicode(err) + ". Try using JSONLint.com."]))
        except Exception as err:
            c.errors.append(
                ("Internal Error",
                 ["Something bad happened while trying to load and parse the file: "
                  + unicode(err)]))

        if body:
            try:
                do_validation(body, c.errors)
            except Exception as err:
                c.errors.append(
                    ("Internal Error",
                     ["Something bad happened: " + unicode(err)]))
            if not c.errors:
                c.errors.append(("No Errors", ["Great job!"]))

    return render('datajsonvalidator.html')
def validate(pkg, dataset_dict):
    """Normalize and validate one exported dataset dict against the data.json schema.

    :param pkg: the source CKAN package dict (used only for error reporting).
    :param dataset_dict: the candidate Pod export for that package.
    :returns: the (normalized) ``dataset_dict`` when validation passes, or an
        OrderedDict describing the validation errors when it fails.
    :raises: re-raises any unexpected exception after logging its location.
    """
    import os
    import sys
    global currentPackageOrg
    try:
        # When saved from UI DataQuality value is stored as "on" instead of True.
        # Normalize the several string spellings to real booleans.
        dataset_dict = OrderedDict(dataset_dict)
        data_quality = dataset_dict.get('dataQuality')
        if data_quality in ("on", "true", "True"):
            dataset_dict['dataQuality'] = True
        elif data_quality in ("false", "False"):
            dataset_dict['dataQuality'] = False

        errors = []
        try:
            from datajsonvalidator import do_validation
            do_validation([dict(dataset_dict)], errors, Package2Pod.seen_identifiers)
        except Exception as e:
            errors.append(("Internal Error", ["Something bad happened: " + unicode(e)]))

        if errors:
            for error in errors:
                log.warn(error)
            # currentPackageOrg is a module global that may never have been bound.
            try:
                currentPackageOrg
            except NameError:
                currentPackageOrg = 'unknown'
            return OrderedDict([
                ('id', pkg.get('id')),
                ('name', Package2Pod.filter(pkg.get('name'))),
                ('title', Package2Pod.filter(pkg.get('title'))),
                ('organization', Package2Pod.filter(currentPackageOrg)),
                ('errors', errors),
            ])
        return dataset_dict
    except Exception:
        # Log the exception type plus the file/line it came from, then re-raise.
        exc_type, exc_obj, exc_tb = sys.exc_info()
        filename = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        log.error("%s : %s : %s", exc_type, filename, exc_tb.tb_lineno)
        # Bare ``raise`` preserves the original traceback; the previous
        # ``raise e`` restarted the traceback at this line (Python 2 semantics).
        raise
def make_datajson_export_entry(package, seen_identifiers):
    """Build one Project Open Data (data.json) entry from a CKAN package dict.

    Returns an OrderedDict of dcat:Dataset fields on success, or an
    OrderedDict describing the errors when a required field is missing or
    validation fails. ``seen_identifiers`` is forwarded to do_validation
    (presumably to detect duplicate identifiers across the export — confirm
    against datajsonvalidator).
    """
    global currentPackageOrg
    currentPackageOrg = None
    # extras is a list of dicts [{},{}, {}]. For each dict, extract the key, value entries into a new dict
    extras = dict([(x['key'], x['value']) for x in package['extras']])

    # Resolve a parent dataset reference to the parent's unique_id, when one exists.
    parent_dataset_id = extras.get('parent_dataset')
    if parent_dataset_id:
        parent = model.Package.get(parent_dataset_id)
        parent_uid = parent.extras.col.target['unique_id'].value
        if parent_uid:
            parent_dataset_id = parent_uid

    # if resource format is CSV then convert it to text/csv
    # Resource format has to be in 'csv' format for automatic datastore push.
    # (json and pdf are likewise normalized to their MIME types; mutates the
    # resource dicts in place)
    for r in package["resources"]:
        if r["format"].lower() == "csv":
            r["format"] = "text/csv"
        if r["format"].lower() == "json":
            r["format"] = "application/json"
        if r["format"].lower() == "pdf":
            r["format"] = "application/pdf"

    try:
        # Assemble the POD field list; direct package["..."] indexing below can
        # raise KeyError, which is treated as "missing required field".
        retlist = [
            ("@type", "dcat:Dataset"),  # optional

            ("title", JsonExportBuilder.strip_if_string(package["title"])),  # required

            # ("accessLevel", 'public'),  # required
            ("accessLevel", JsonExportBuilder.strip_if_string(extras.get('public_access_level'))),  # required

            # ("accrualPeriodicity", "R/P1Y"),  # optional
            # ('accrualPeriodicity', 'accrual_periodicity'),
            ('accrualPeriodicity', JsonExportBuilder.get_accrual_periodicity(extras.get('accrual_periodicity'))),
            # optional

            ("conformsTo", JsonExportBuilder.strip_if_string(extras.get('conforms_to'))),  # optional

            # ('contactPoint', OrderedDict([
            #     ("@type", "vcard:Contact"),
            #     ("fn", "Jane Doe"),
            #     ("hasEmail", "mailto:[email protected]")
            # ])),  # required
            ('contactPoint', JsonExportBuilder.get_contact_point(extras)),  # required

            ("dataQuality", JsonExportBuilder.strip_if_string(extras.get('data_quality'))),  # required-if-applicable

            ("describedBy", JsonExportBuilder.strip_if_string(extras.get('data_dictionary'))),  # optional

            ("describedByType", JsonExportBuilder.strip_if_string(extras.get('data_dictionary_type'))),  # optional

            ("description", JsonExportBuilder.strip_if_string(package["notes"])),  # required
            # ("description", 'asdfasdf'),  # required

            ("identifier", JsonExportBuilder.strip_if_string(extras.get('unique_id'))),  # required
            # ("identifier", 'asdfasdfasdf'),  # required

            ("isPartOf", parent_dataset_id),  # optional

            ("issued", JsonExportBuilder.strip_if_string(extras.get('release_date'))),  # optional

            # ("keyword", ['a', 'b']),  # required
            ("keyword", [t["display_name"] for t in package["tags"]]),  # required

            ("landingPage", JsonExportBuilder.strip_if_string(extras.get('homepage_url'))),  # optional

            ("license", JsonExportBuilder.strip_if_string(extras.get("license_new"))),  # required-if-applicable

            # falls back to the package's metadata_modified when no explicit
            # 'modified' extra is present
            ("modified",
             JsonExportBuilder.strip_if_string(extras.get("modified", package.get("metadata_modified")))),  # required

            ("primaryITInvestmentUII",
             JsonExportBuilder.strip_if_string(extras.get('primary_it_investment_uii'))),  # optional

            # ('publisher', OrderedDict([
            #     ("@type", "org:Organization"),
            #     ("name", "Widget Services")
            # ])),  # required
            # ("publisher", get_publisher_tree(extras)),  # required
            ("publisher", JsonExportBuilder.get_publisher_tree_wrong_order(extras)),  # required

            ("rights", JsonExportBuilder.strip_if_string(extras.get('access_level_comment'))),  # required

            ("spatial", JsonExportBuilder.strip_if_string(package.get("spatial"))),  # required-if-applicable

            ('systemOfRecords', JsonExportBuilder.strip_if_string(extras.get('system_of_records'))),  # optional

            ("temporal", JsonExportBuilder.strip_if_string(extras.get('temporal'))),  # required-if-applicable

            ("distribution", JsonExportBuilder.generate_distribution(package)),  # required-if-applicable

            # ("distribution",
            #  #TODO distribution should hide any key/value pairs where value is "" or None (e.g. format)
            #  [
            #      OrderedDict([
            #          ("downloadURL", r["url"]),
            #          ("mediaType", r["formatReadable"]),
            #      ])
            #      for r in package["resources"]
            #  ])
        ]

        # Fields whose extras value is a comma/list-style multiple entry.
        # NOTE(review): bureauCode and programCode (both required by the POD
        # schema) are commented out in this variant — confirm that is intended.
        for pair in [
            # ('bureauCode', 'bureau_code'),  # required
            ('language', 'language'),  # optional
            # ('programCode', 'program_code'),  # required
            ('references', 'related_documents'),  # optional
            ('theme', 'category'),  # optional
        ]:
            JsonExportBuilder.split_multiple_entries(retlist, extras, pair)

    except KeyError as e:
        # A required package["..."] key was absent; report instead of raising.
        log.warn("Missing Required Field for package with id=[%s], title=['%s'], organization=['%s']: '%s'" % (
            package.get('id'), package.get('title'), currentPackageOrg, e))

        errors = ['Missing Required Field', ["%s" % e]]
        errors_dict = OrderedDict([
            ('id', package.get('id')),
            ('name', package.get('name')),
            ('title', package.get('title')),
            ('organization', currentPackageOrg),
            ('errors', errors),
        ])

        return errors_dict

    # Remove entries where value is None, "", or empty list []
    striped_retlist = [(x, y) for x, y in retlist if y is not None and y != "" and y != []]

    # When saved from UI DataQuality value is stored as "on" instead of True.
    # Check if value is "on" and replace it with True.
    striped_retlist_dict = OrderedDict(striped_retlist)
    if striped_retlist_dict.get('dataQuality') == "on" \
            or striped_retlist_dict.get('dataQuality') == "true" \
            or striped_retlist_dict.get('dataQuality') == "True":
        striped_retlist_dict['dataQuality'] = True
    elif striped_retlist_dict.get('dataQuality') == "false" \
            or striped_retlist_dict.get('dataQuality') == "False":
        striped_retlist_dict['dataQuality'] = False

    # NOTE(review): do_validation is used here without a local import (unlike
    # the sibling variants) — confirm a module-level import exists.
    errors = []
    try:
        do_validation([dict(striped_retlist_dict)], errors, seen_identifiers)
    except Exception as e:
        errors.append(("Internal Error", ["Something bad happened: " + unicode(e)]))
    if len(errors) > 0:
        for error in errors:
            log.warn(error)

        errors_dict = OrderedDict([
            ('id', package.get('id')),
            ('name', package.get('name')),
            ('title', package.get('title')),
            ('organization', currentPackageOrg),
            ('errors', errors),
        ])

        return errors_dict

    return striped_retlist_dict
def make_datajson_export_entry(package):
    """Build one Project Open Data (data.json) entry from a CKAN package dict.

    Returns an OrderedDict of dcat:Dataset fields on success, or an
    OrderedDict describing the errors when a required field is missing or
    validation fails.

    NOTE(review): this shares its name with the two-argument
    ``make_datajson_export_entry(package, seen_identifiers)`` variant — if both
    live in the same module, the later definition shadows the earlier one;
    confirm they belong to different modules/classes.
    """
    global currentPackageOrg
    currentPackageOrg = None
    # extras is a list of dicts [{},{}, {}]. For each dict, extract the key, value entries into a new dict
    extras = dict([(x['key'], x['value']) for x in package['extras']])

    # Resolve a parent dataset reference to the parent's unique_id, when one exists.
    parent_dataset_id = extras.get('parent_dataset')
    if parent_dataset_id:
        parent = model.Package.get(parent_dataset_id)
        parent_uid = parent.extras.col.target['unique_id'].value
        if parent_uid:
            parent_dataset_id = parent_uid

    # if resource format is CSV then convert it to text/csv
    # Resource format has to be in 'csv' format for automatic datastore push.
    # (json and pdf are likewise normalized to their MIME types; mutates the
    # resource dicts in place)
    for r in package["resources"]:
        if r["format"].lower() == "csv":
            r["format"] = "text/csv"
        if r["format"].lower() == "json":
            r["format"] = "application/json"
        if r["format"].lower() == "pdf":
            r["format"] = "application/pdf"

    try:
        # Assemble the POD field list; direct package["..."] indexing below can
        # raise KeyError, which is treated as "missing required field".
        retlist = [
            ("@type", "dcat:Dataset"),  # optional

            ("title", JsonExportBuilder.strip_if_string(package["title"])),  # required

            # ("accessLevel", 'public'),  # required
            ("accessLevel", JsonExportBuilder.strip_if_string(extras.get('public_access_level'))),  # required

            # ("accrualPeriodicity", "R/P1Y"),  # optional
            # ('accrualPeriodicity', 'accrual_periodicity'),
            ('accrualPeriodicity', JsonExportBuilder.get_accrual_periodicity(extras.get('accrual_periodicity'))),
            # optional

            ("conformsTo", JsonExportBuilder.strip_if_string(extras.get('conforms_to'))),  # optional

            # ('contactPoint', OrderedDict([
            #     ("@type", "vcard:Contact"),
            #     ("fn", "Jane Doe"),
            #     ("hasEmail", "mailto:[email protected]")
            # ])),  # required
            ('contactPoint', JsonExportBuilder.get_contact_point(extras)),  # required

            ("dataQuality", JsonExportBuilder.strip_if_string(extras.get('data_quality'))),  # required-if-applicable

            ("describedBy", JsonExportBuilder.strip_if_string(extras.get('data_dictionary'))),  # optional

            ("describedByType", JsonExportBuilder.strip_if_string(extras.get('data_dictionary_type'))),  # optional

            ("description", JsonExportBuilder.strip_if_string(package["notes"])),  # required
            # ("description", 'asdfasdf'),  # required

            ("identifier", JsonExportBuilder.strip_if_string(extras.get('unique_id'))),  # required
            # ("identifier", 'asdfasdfasdf'),  # required

            ("isPartOf", parent_dataset_id),  # optional

            ("issued", JsonExportBuilder.strip_if_string(extras.get('release_date'))),  # optional

            # ("keyword", ['a', 'b']),  # required
            ("keyword", [t["display_name"] for t in package["tags"]]),  # required

            ("landingPage", JsonExportBuilder.strip_if_string(extras.get('homepage_url'))),  # optional

            ("license", JsonExportBuilder.strip_if_string(extras.get("license_new"))),  # required-if-applicable

            # falls back to the package's metadata_modified when no explicit
            # 'modified' extra is present
            ("modified",
             JsonExportBuilder.strip_if_string(extras.get("modified", package.get("metadata_modified")))),  # required

            ("primaryITInvestmentUII",
             JsonExportBuilder.strip_if_string(extras.get('primary_it_investment_uii'))),  # optional

            # ('publisher', OrderedDict([
            #     ("@type", "org:Organization"),
            #     ("name", "Widget Services")
            # ])),  # required
            # ("publisher", get_publisher_tree(extras)),  # required
            ("publisher", JsonExportBuilder.get_publisher_tree_wrong_order(extras)),  # required

            ("rights", JsonExportBuilder.strip_if_string(extras.get('access_level_comment'))),  # required

            ("spatial", JsonExportBuilder.strip_if_string(package.get("spatial"))),  # required-if-applicable

            ('systemOfRecords', JsonExportBuilder.strip_if_string(extras.get('system_of_records'))),  # optional

            ("temporal", JsonExportBuilder.strip_if_string(extras.get('temporal'))),  # required-if-applicable

            ("distribution", JsonExportBuilder.generate_distribution(package)),  # required-if-applicable

            # ("distribution",
            #  #TODO distribution should hide any key/value pairs where value is "" or None (e.g. format)
            #  [
            #      OrderedDict([
            #          ("downloadURL", r["url"]),
            #          ("mediaType", r["formatReadable"]),
            #      ])
            #      for r in package["resources"]
            #  ])
        ]

        # Fields whose extras value is a comma/list-style multiple entry.
        for pair in [
            ('bureauCode', 'bureau_code'),  # required
            ('language', 'language'),  # optional
            ('programCode', 'program_code'),  # required
            ('references', 'related_documents'),  # optional
            ('theme', 'category'),  # optional
        ]:
            JsonExportBuilder.split_multiple_entries(retlist, extras, pair)

    except KeyError as e:
        # A required package["..."] key was absent; report instead of raising.
        log.warn("Missing Required Field for package with id=[%s], title=['%s'], organization=['%s']: '%s'" % (
            package.get('id'), package.get('title'), currentPackageOrg, e))

        errors = ['Missing Required Field', ["%s" % e]]
        errors_dict = OrderedDict([
            ('id', package.get('id')),
            ('name', package.get('name')),
            ('title', package.get('title')),
            ('organization', currentPackageOrg),
            ('errors', errors),
        ])

        return errors_dict

    # Remove entries where value is None, "", or empty list []
    striped_retlist = [(x, y) for x, y in retlist if y is not None and y != "" and y != []]

    # When saved from UI DataQuality value is stored as "on" instead of True.
    # Check if value is "on" and replace it with True.
    striped_retlist_dict = OrderedDict(striped_retlist)
    if striped_retlist_dict.get('dataQuality') == "on" \
            or striped_retlist_dict.get('dataQuality') == "true" \
            or striped_retlist_dict.get('dataQuality') == "True":
        striped_retlist_dict['dataQuality'] = True
    elif striped_retlist_dict.get('dataQuality') == "false" \
            or striped_retlist_dict.get('dataQuality') == "False":
        striped_retlist_dict['dataQuality'] = False

    from datajsonvalidator import do_validation

    errors = []
    try:
        do_validation([dict(striped_retlist_dict)], errors)
    except Exception as e:
        errors.append(("Internal Error", ["Something bad happened: " + unicode(e)]))
    if len(errors) > 0:
        for error in errors:
            log.warn(error)

        errors_dict = OrderedDict([
            ('id', package.get('id')),
            ('name', package.get('name')),
            ('title', package.get('title')),
            ('organization', currentPackageOrg),
            ('errors', errors),
        ])

        return errors_dict

    return striped_retlist_dict
def make_datajson_entry(package):
    """Build one data.json entry from a CKAN package dict.

    Returns an OrderedDict of dcat:Dataset fields on success, or None when a
    required field is missing or validation reports errors (unlike the
    export-entry variants, which return an errors dict).
    """
    # extras is a list of dicts [{},{}, {}]. For each dict, extract the key, value entries into a new dict
    extras = dict([(x['key'], x['value']) for x in package['extras']])

    # Resolve a parent dataset reference to the parent's unique_id, when one exists.
    parent_dataset_id = extras.get('parent_dataset')
    if parent_dataset_id:
        parent = model.Package.get(parent_dataset_id)
        parent_uid = parent.extras.col.target['unique_id'].value
        if parent_uid:
            parent_dataset_id = parent_uid

    # if resource format is CSV then convert it to text/csv
    # Resource format has to be in 'csv' format for automatic datastore push.
    # (json and pdf are likewise normalized to their MIME types; mutates the
    # resource dicts in place)
    for r in package["resources"]:
        if r["format"].lower() == "csv":
            r["format"] = "text/csv"
        if r["format"].lower() == "json":
            r["format"] = "application/json"
        if r["format"].lower() == "pdf":
            r["format"] = "application/pdf"

    try:
        # Assemble the POD field list; direct package["..."] indexing below can
        # raise KeyError, which is treated as "invalid field".
        retlist = [
            ("@type", "dcat:Dataset"),  # optional

            ("title", strip_if_string(package["title"])),  # required

            # ("accessLevel", 'public'),  # required
            ("accessLevel", strip_if_string(extras.get('public_access_level'))),  # required

            # ("accrualPeriodicity", "R/P1Y"),  # optional
            # ('accrualPeriodicity', 'accrual_periodicity'),
            ('accrualPeriodicity', get_accrual_periodicity(extras.get('accrual_periodicity'))),  # optional

            ("conformsTo", strip_if_string(extras.get('conforms_to'))),  # optional

            # ('contactPoint', OrderedDict([
            #     ("@type", "vcard:Contact"),
            #     ("fn", "Jane Doe"),
            #     ("hasEmail", "mailto:[email protected]")
            # ])),  # required
            ('contactPoint', get_contact_point(extras, package)),  # required

            ("dataQuality", strip_if_string(extras.get('data_quality'))),  # required-if-applicable

            ("describedBy", strip_if_string(extras.get('data_dictionary'))),  # optional

            ("describedByType", strip_if_string(extras.get('data_dictionary_type'))),  # optional

            ("description", strip_if_string(package["notes"])),  # required
            # ("description", 'asdfasdf'),  # required

            ("identifier", strip_if_string(extras.get('unique_id'))),  # required
            # ("identifier", 'asdfasdfasdf'),  # required

            ("isPartOf", parent_dataset_id),  # optional

            ("issued", strip_if_string(extras.get('release_date'))),  # optional

            # ("keyword", ['a', 'b']),  # required
            ("keyword", [t["display_name"] for t in package["tags"]]),  # required

            ("landingPage", strip_if_string(extras.get('homepage_url'))),  # optional

            ("license", strip_if_string(extras.get("license_new"))),  # required-if-applicable

            # NOTE(review): unlike the export variants, no fallback to the
            # package's metadata_modified here — confirm that is intended.
            ("modified", strip_if_string(extras.get("modified"))),  # required

            ("primaryITInvestmentUII", strip_if_string(extras.get('primary_it_investment_uii'))),  # optional

            # ('publisher', OrderedDict([
            #     ("@type", "org:Organization"),
            #     ("name", "Widget Services")
            # ])),  # required
            ("publisher", get_publisher_tree(extras)),  # required

            ("rights", strip_if_string(extras.get('access_level_comment'))),  # required

            ("spatial", strip_if_string(package.get("spatial"))),  # required-if-applicable

            ('systemOfRecords', strip_if_string(extras.get('system_of_records'))),  # optional

            ("temporal", strip_if_string(extras.get('temporal'))),  # required-if-applicable

            ("distribution", generate_distribution(package)),  # required-if-applicable

            # ("distribution",
            #  #TODO distribution should hide any key/value pairs where value is "" or None (e.g. format)
            #  [
            #      OrderedDict([
            #          ("downloadURL", r["url"]),
            #          ("mediaType", r["formatReadable"]),
            #      ])
            #      for r in package["resources"]
            #  ])
        ]

        # Fields whose extras value is a comma/list-style multiple entry.
        for pair in [
            ('bureauCode', 'bureau_code'),  # required
            ('language', 'language'),  # optional
            ('programCode', 'program_code'),  # required
            ('references', 'related_documents'),  # optional
            ('theme', 'category'),  # optional
        ]:
            split_multiple_entries(retlist, extras, pair)

    except KeyError as e:
        # A required package["..."] key was absent; log and bail out with None.
        log.warn("Invalid field detected for package with id=[%s], title=['%s']: '%s'",
                 package.get('id'), package.get('title'), e)
        return

    # # TODO this is a lazy hack to make sure we don't have redundant fields when the free form key/value pairs are added
    # extras_to_filter_out = ['publisher', 'contact_name', 'contact_email', 'unique_id', 'public_access_level',
    #                         'data_dictionary', 'bureau_code', 'program_code', 'access_level_comment', 'license_title',
    #                         'spatial', 'temporal', 'release_date', 'accrual_periodicity', 'language', 'granularity',
    #                         'data_quality', 'size', 'homepage_url', 'rss_feed', 'category', 'related_documents',
    #                         'system_of_records', 'system_of_records_none_related_to_this_dataset', 'tags',
    #                         'extrasRollup', 'format', 'accessURL', 'notes', 'publisher_1', 'publisher_2', 'publisher_3',
    #                         'publisher_4', 'publisher_5']
    #
    # # Append any free extras (key/value pairs) that aren't part of common core but have been associated with the dataset
    # # TODO really hackey, short on time, had to hardcode a lot of the names to remove. there's much better ways, maybe
    # # generate a list of keys to ignore by calling a specific function to get the extras
    # retlist_keys = [x for x, y in retlist]
    # extras_keys = set(extras.keys()) - set(extras_to_filter_out)
    #
    # for key in extras_keys:
    #     convertedKey = underscore_to_camelcase(key)
    #     if convertedKey not in retlist_keys:
    #         retlist.append((convertedKey, extras[key]))

    # Remove entries where value is None, "", or empty list []
    striped_retlist = [(x, y) for x, y in retlist if y is not None and y != "" and y != []]
    striped_retlist_keys = [x for x, y in striped_retlist]

    # If a required metadata field was removed, return empty string
    # for required_field in ["accessLevel", "bureauCode", "contactPoint", "description", "identifier", "keyword",
    #                        "modified", "programCode", "publisher", "title"]:
    #     if required_field not in striped_retlist_keys:
    #         log.warn("Missing required field detected for package with id=[%s], title=['%s']: '%s'",
    #                  package.get('id'), package.get('title'), required_field)
    #         return

    # When saved from UI DataQuality value is stored as "on" instead of True.
    # Check if value is "on" and replace it with True.
    striped_retlist_dict = OrderedDict(striped_retlist)
    if striped_retlist_dict.get('dataQuality') == "on" \
            or striped_retlist_dict.get('dataQuality') == "true" \
            or striped_retlist_dict.get('dataQuality') == "True":
        striped_retlist_dict['dataQuality'] = True
    elif striped_retlist_dict.get('dataQuality') == "false" \
            or striped_retlist_dict.get('dataQuality') == "False":
        striped_retlist_dict['dataQuality'] = False

    from datajsonvalidator import do_validation

    errors = []
    try:
        do_validation([dict(striped_retlist_dict)], errors)
    except Exception as e:
        errors.append(("Internal Error", ["Something bad happened: " + unicode(e)]))
    if len(errors) > 0:
        for error in errors:
            log.warn(error)
        return

    return striped_retlist_dict