def test_validate_tables_invalid(log):
    report = validate(
        [
            {'source': 'data/valid.csv',
             'schema': {'fields': [{'name': 'id'}, {'name': 'name'}]}},
            {'source': 'data/invalid.csv'},
        ],
        preset='nested',
        infer_schema=True,
    )
    assert log(report) == [
        (2, None, 3, 'blank-header'),
        (2, None, 3, 'non-matching-header'),
        (2, None, 4, 'duplicate-header'),
        (2, None, 4, 'non-matching-header'),
        (2, 2, 3, 'missing-value'),
        (2, 2, 4, 'missing-value'),
        (2, 3, None, 'duplicate-row'),
        (2, 4, None, 'blank-row'),
        (2, 5, 5, 'extra-value'),
    ]
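# Many tests in this collection pass their report through a `log` fixture
# that is defined elsewhere. A minimal reconstruction, consistent with the
# (table-number, row-number, column-number, code) tuples asserted throughout,
# could look like this; the helper's exact original shape is an assumption:
def log(report):
    result = []
    for table_number, table in enumerate(report['tables'], start=1):
        for error in table['errors']:
            result.append((table_number,
                           error['row-number'],
                           error['column-number'],
                           error['code']))
    return result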
def post(self, request):
    # Reject the request with a 400 error if the data or JSON file is
    # missing, or if an unknown key is used in the request.
    error_limit_form = sys.maxsize  # sys.maxint was removed in Python 3
    order_fields_form = True
    for key in request.data:
        if key not in self.allowed_requests:
            return HttpResponseBadRequest(
                "400 Error: {key_name} not allowed as parameter.\n".format(
                    key_name=key))
    if 'jsonfile' not in request.data:
        return HttpResponseBadRequest("400 Error: Payload Incorrect\n")
    elif 'docfile' not in request.data:
        return HttpResponseBadRequest("400 Error: Payload Incorrect\n")
    elif not request.data['docfile'].name.endswith('.csv'):
        return HttpResponseBadRequest(
            "400 Error: Data was not of CSV type\n")
    elif not request.data['jsonfile'].name.endswith('.json'):
        return HttpResponseBadRequest(
            "400 Error: Schema was not of JSON type\n")
    jsonfile = request.data['jsonfile']
    csvfile = request.data['docfile']
    if 'error_limit' in request.data:
        if request.data['error_limit'] == 'True':
            error_limit_form = 1
        elif request.data['error_limit'] == 'False':
            error_limit_form = sys.maxsize
    if 'order_fields' in request.data:
        if request.data['order_fields'] == 'True':
            order_fields_form = True
        elif request.data['order_fields'] == 'False':
            order_fields_form = False
    # Validate the file if no errors have been raised so far.
    path = default_storage.save('tmp/csvfile.csv',
                                ContentFile(csvfile.read()))
    jsonpath = default_storage.save('tmp/jsonfile.json',
                                    ContentFile(jsonfile.read()))
    with open(jsonpath, 'r') as jsonreader:
        jsonstring = jsonreader.read()
    try:
        json.loads(jsonstring)
    except ValueError:
        return HttpResponseBadRequest(
            "400 Error: JSON File was not valid\n")
    validator = validate(path,
                         error_limit=error_limit_form,
                         row_limit=sys.maxsize,
                         schema=jsonpath,
                         order_fields=order_fields_form)
    os.remove(jsonpath)
    os.remove(path)
    return Response(json.dumps(validator))
def main():
    base_url = 'https://raw.githubusercontent.com/UNStats/SDG/master/DataPackages/Goal/{}/datapackage.json'
    for goal in range(1, 18):
        dp_url = base_url.format(goal)
        dp = Package(dp_url)
        print('# Validating SDG {}'.format(goal))
        if dp.valid:
            print('\tData Package descriptor valid')
        else:
            print('\tData Package descriptor invalid')
            print('\t\tErrors:')
            print('\n'.join(['\t\t\t' + str(error) for error in dp.errors]))
        report = validate(dp_url, row_limit=10000)
        if report['valid']:
            print('\tData valid')
        else:
            print('\tData invalid')
            print('\t\tErrors (first 20):')
            for table in report['tables']:
                if not table['valid']:
                    print('\t\t\t{}:'.format(table['source'].split('/')[-1]))
                    for cnt, error in enumerate(table['errors'], start=1):
                        print('\t\t\t\t' + error['message'])
                        if cnt >= 20:  # stop after 20 errors, as promised
                            break
def upload_file():
    if request.method == 'POST':
        uploaded_files = request.files.getlist('file')
        label = dt.datetime.now().strftime("%a, %d. %B %Y %H:%M")
        for file in uploaded_files:
            if file and allowed_file(file.filename):
                filename = secure_filename(file.filename)
                file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
                report = validate(
                    os.path.join(app.config['UPLOAD_FOLDER'], filename),
                    schema="C:/Users/NXie/PycharmProjects/flask_google_uploads/diagnostic/schema.json",
                    order_fields=True)
                if report['valid']:
                    add_file(filename, label)
                    gs_add(filename)
                    return jsonify({'success': 'well done'})
                else:
                    os.remove(
                        os.path.join(app.config['UPLOAD_FOLDER'], filename))
                    return jsonify(
                        {'fail': report['tables'][0]['errors'][0]['message']})
            else:
                return jsonify({
                    'fail': 'Your file extension is {}'.format(
                        file.filename.rsplit('.', 1)[1].lower())
                })
    else:
        return render_template('index.html')
def test_validate_datapackage_with_schema_structure_only_issue_348(log):
    DESCRIPTOR = {
        'resources': [{
            'name': 'people',
            'data': [
                ['id', 'name', 'surname'],
                ['p1', 'Tom', 'Hanks'],
                ['p2', 'Meryl', 'Streep'],
            ],
            'schema': {
                'fields': [
                    {'name': 'id', 'type': 'string'},
                    {'name': 'name', 'type': 'string'},
                    {'name': 'surname', 'type': 'string'},
                    {'name': 'dob', 'type': 'date'},
                ]
            },
        }]
    }
    report = validate(DESCRIPTOR, checks=['structure'])
    assert report['valid']
def test_check_minimum_constraint(log):
    source = [
        ['row', 'score'],
        [2, 1],
        [3, 2],
        [4, 3],
        [5, 4],
        [6],
    ]
    schema = {'fields': [
        {'name': 'row', 'type': 'integer'},
        {'name': 'score', 'type': 'integer', 'constraints': {'minimum': 2}},
    ]}
    report = validate(source, schema=schema, checks=[
        'minimum-constraint',
    ])
    assert log(report) == [
        (1, 2, 2, 'minimum-constraint'),
    ]
def test_validate_order_fields_issue_313(log):
    source = 'data/order_fields_313.xlsx'
    schema = {
        'fields': [
            {'name': 'Column_1', 'type': 'string'},
            {'name': 'Column_2', 'type': 'string',
             'constraints': {'required': True}},
            {'name': 'Column_3', 'type': 'string'},
            {'name': 'Column_4', 'type': 'string'},
            {'name': 'Column_5', 'type': 'string'},
        ]
    }
    # For now, the "non-matching-header" check is required to order the fields
    checks = ['non-matching-header', 'required-constraint']
    report = validate(source, schema=schema, checks=checks, order_fields=True)
    assert report['valid']
def validate(paths, json, **options):
    # Remove blank values
    options = {key: value
               for key, value in options.items()
               if value is not None}
    if not options['checks']:
        del options['checks']
    if not options['skip_checks']:
        del options['skip_checks']
    options['infer_fields'] = options['infer_schema']
    quiet = options.pop('quiet')
    output = options.pop('output')
    if options.get('preset') == 'datapackage':
        sources = paths[0]
    else:
        sources = [{'source': path} for path in paths]
    schema = options.pop('schema', None)
    if schema:
        for source in sources:
            source['schema'] = schema
    report = goodtables.validate(sources, **options)
    if not quiet:
        _print_report(report, output=output, json=json)
    exit(not report['valid'])
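# For reference, a minimal sketch of the nested-preset payload the CLI
# wrapper above builds from `paths`; the inline sources here are made up
# for illustration:
import goodtables

sources = [
    {'source': [['id'], ['1']]},
    {'source': [['id'], ['2']]},
]
report = goodtables.validate(sources, preset='nested')
print(report['valid'], report['error-count'])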
def validate(self, model, checks=None):
    """Use a defined schema to validate the given table."""
    records = self.data.to_dict("records")
    self.evaluate_report(
        validate(records,
                 headers=list(records[0]),  # keys of the first record
                 preset='table',
                 schema=self.schema,
                 order_fields=True,
                 custom_checks=checks or []))
def test_check_custom_constraint_incorrect_constraint(log):
    source = [
        ['row', 'name'],
        [2, 'Alex'],
    ]
    report = validate(source, checks=[
        {'custom-constraint': {'constraint': 'vars()'}},
        {'custom-constraint': {'constraint': 'import(os)'}},
        {'custom-constraint': {'constraint': 'non_existent > 0'}},
    ])
    assert log(report) == [
        (1, 2, None, 'custom-constraint'),
        (1, 2, None, 'custom-constraint'),
        (1, 2, None, 'custom-constraint'),
    ]
def validate(self, source, content_type):
    if content_type == 'application/json':
        data = utils.to_tabular(source)
    elif content_type == 'text/csv':
        data = utils.reorder_csv(source)
    else:
        raise UnsupportedContentTypeException(content_type,
                                              type(self).__name__)
    try:
        data['source'].decode()
        byteslike = True
    except (TypeError, KeyError, AttributeError):
        byteslike = False
    if byteslike:
        validate_params = data.copy()
        validate_params['schema'] = self.validator
        validate_params['source'] = io.BytesIO(data['source'])
    else:
        validate_params = {
            'source': data,
            'schema': self.validator,
            'headers': 1,
        }
    result = goodtables.validate(**validate_params)
    return self.formatted(data, result)
def test_validate_datapackage_dialect_header_false(log):
    descriptor = {
        'resources': [{
            'name': 'name',
            'data': [
                ['John', '22'],
                ['Alex', '33'],
                ['Paul', '44'],
            ],
            'schema': {
                'fields': [
                    {'name': 'name'},
                    {'name': 'age', 'type': 'integer'},
                ]
            },
            'dialect': {
                'header': False,
            },
        }]
    }
    report = validate(descriptor)
    assert log(report) == []
def _validate_resource(res, dp_res):
    evaluated_rows, rows = itertools.tee(res)
    evaluated_rows = list(_get_row_value(r) for r in evaluated_rows)
    validate_options = {
        'schema': dp_res['schema'],
        'order_fields': True,
        'preset': 'table',
    }
    validate_options.update(goodtables_options)
    report = goodtables.validate(evaluated_rows, **validate_options)
    report_file_path = '{}/{}.json'.format(reports_path, dp_res['name'])
    if write_report:
        os.makedirs(reports_path, exist_ok=True)
        with io.open(report_file_path, 'w') as f:
            f.write(json.dumps(report, indent=4))
    if report['error-count'] > 0 and fail_on_error:
        msg = 'Datapackage resource \'{}\' failed'.format(dp_res['name'])
        msg += ' Goodtables validation.'
        if write_report:
            msg += ' See report for details: {}'.format(report_file_path)
        raise RuntimeError(msg)
    yield from rows
def test_foreign_key_internal_resource_violation(log):
    descriptor = deepcopy(FK_DESCRIPTOR)
    del descriptor['resources'][1]['data'][4]
    report = validate(descriptor, checks=['foreign-key'])
    assert log(report) == [
        (1, 5, 1, 'foreign-key'),
    ]
def test_composite_primary_key_not_unique_issue_215(log):
    descriptor = {
        'resources': [{
            'name': 'name',
            'data': [
                ['id1', 'id2'],
                ['a', '1'],
                ['a', '1'],
            ],
            'schema': {
                'fields': [
                    {'name': 'id1'},
                    {'name': 'id2'},
                ],
                'primaryKey': ['id1', 'id2'],
            },
        }],
    }
    report = validate(descriptor, skip_checks=['duplicate-row'])
    assert log(report) == [
        (1, 3, 1, 'unique-constraint'),
    ]
def test_foreign_key_self_referenced_resource_violation(log):
    descriptor = deepcopy(FK_DESCRIPTOR)
    del descriptor['resources'][0]['data'][4]
    report = validate(descriptor, checks=['foreign-key'])
    assert log(report) == [
        (1, 4, 3, 'foreign-key'),
    ]
def validate_save_pkg(pkg_descriptor, pkg_dir):
    """
    Validate a data package descriptor and save it to a json file.

    Args:
        pkg_descriptor (dict): the tabular data package descriptor.
        pkg_dir (path-like): directory the datapackage.json is written to.

    Returns:
        report: the goodtables validation report.
    """
    # Use that descriptor to instantiate a Package object
    data_pkg = datapackage.Package(pkg_descriptor)

    # Validate the data package descriptor before we do anything else
    if not data_pkg.valid:
        logger.error(f"""
            Invalid tabular data package: {data_pkg.descriptor["name"]}
            Errors: {data_pkg.errors}""")

    # pkg_json is the datapackage.json that we ultimately output:
    pkg_json = os.path.join(pkg_dir, "datapackage.json")
    data_pkg.save(pkg_json)

    logger.info('Validating the data package...')
    # Validate the data within the package using goodtables:
    report = goodtables.validate(pkg_json, row_limit=1000)
    if not report['valid']:
        logger.error("Data package validation failed.")
    else:
        logger.info('Congrats! You made a valid data package!')
    return report
def test_check_minimum_length_constraint(log):
    source = [
        ['row', 'word'],
        [2, 'a'],
        [3, 'ab'],
        [4, 'abc'],
        [5, 'abcd'],
        [6],
    ]
    schema = {'fields': [
        {'name': 'row', 'type': 'integer'},
        {'name': 'word', 'type': 'string',
         'constraints': {'minLength': 2}},
    ]}
    report = validate(source, schema=schema, checks=[
        'minimum-length-constraint',
    ])
    assert log(report) == [
        (1, 2, 2, 'minimum-length-constraint'),
    ]
def test_validate_datapackage_with_schema_issue_348(log):
    DESCRIPTOR = {
        'resources': [{
            'name': 'people',
            'data': [
                ['id', 'name', 'surname'],
                ['p1', 'Tom', 'Hanks'],
                ['p2', 'Meryl', 'Streep'],
            ],
            'schema': {
                'fields': [
                    {'name': 'id', 'type': 'string'},
                    {'name': 'name', 'type': 'string'},
                    {'name': 'surname', 'type': 'string'},
                    {'name': 'dob', 'type': 'date'},
                ]
            },
        }]
    }
    report = validate(DESCRIPTOR, checks=['structure', 'schema'])
    assert log(report) == [
        (1, None, 4, 'missing-header'),
    ]
def test_check_deviated_value(log):
    source = [
        ['temperature'],
        [1],
        [-2],
        [7],
        [0],
        [1],
        [2],
        [5],
        [-4],
        [100],
        [8],
        [3],
    ]
    report = validate(source, checks=[
        {'deviated-value': {'column': 'temperature',
                            'average': 'median',
                            'interval': 3}},
    ])
    assert log(report) == [
        (1, 10, 1, 'deviated-value'),
    ]
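# Why only row 10 (value 100) is flagged above: with 'median' as the average
# and interval=3, the deviated-value check accepts values inside the band
# average ± interval * standard deviation. A back-of-the-envelope version
# (assuming a population standard deviation) gives a band of roughly
# [-83, 87], which every value but 100 falls inside:
import statistics

values = [1, -2, 7, 0, 1, 2, 5, -4, 100, 8, 3]
median = statistics.median(values)   # 2
stdev = statistics.pstdev(values)    # ~28.4
low, high = median - 3 * stdev, median + 3 * stdev
assert all(low <= v <= high for v in values if v != 100)
assert not (low <= 100 <= high)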
def test_check_file_integrity_invalid(log):
    source = deepcopy(DESCRIPTOR)
    source['resources'][0]['hash'] = 'not-supported-hash'
    report = validate(source)
    assert report['warnings'] == [
        'Resource "resource1" does not use the SHA256 hash. The check will be skipped',
    ]
def _validate_table(source, _format=u'csv', schema=None, **options):
    log.debug(u'Validating source: {}'.format(source))
    report = validate(source, format=_format, schema=schema, **options)
    return report
def generate_metadata(pkg_settings, tables, pkg_dir, uuid_pkgs=uuid.uuid4()):
    # pkg_json is the datapackage.json that we ultimately output:
    pkg_json = os.path.join(pkg_dir, "datapackage.json")

    # Create a tabular data resource for each of the tables.
    resources = []
    for t in tables:
        resources.append(get_tabular_data_resource_2(t, pkg_dir=pkg_dir))

    data_sources = pudl.helpers.data_sources_from_tables_pkg(tables)
    sources = []
    for src in data_sources:
        if src in pudl.constants.data_sources:
            sources.append({"title": src, "path": pc.base_data_urls[src]})

    contributors = set()
    for src in data_sources:
        for c in pudl.constants.contributors_by_source[src]:
            contributors.add(c)

    pkg_descriptor = {
        "name": pkg_settings["name"],
        "profile": "tabular-data-package",
        "title": pkg_settings["title"],
        "id": uuid_pkgs,
        "description": pkg_settings["description"],
        # "keywords": pkg_settings["keywords"],
        "homepage": "https://catalyst.coop/pudl/",
        "created": (datetime.datetime.utcnow()
                    .replace(microsecond=0).isoformat() + 'Z'),
        "contributors": [pudl.constants.contributors[c] for c in contributors],
        "sources": sources,
        "licenses": [pudl.constants.licenses["cc-by-4.0"]],
        "resources": resources,
    }

    # Use that descriptor to instantiate a Package object
    data_pkg = datapackage.Package(pkg_descriptor)

    # Validate the data package descriptor before we save it
    if not data_pkg.valid:
        logger.warning(f"""
            Invalid tabular data package: {data_pkg.descriptor["name"]}
            Errors: {data_pkg.errors}""")

    data_pkg.save(pkg_json)

    # Validate the data within the package using goodtables:
    report = goodtables.validate(pkg_json, row_limit=1000)
    if not report['valid']:
        logger.warning("Data package data validation failed.")

    return data_pkg, report
def check_user_file_good_table(id_import, full_path, given_encoding,
                               row_limit=100000000):
    errors = []
    report = validate(full_path,
                      skip_checks=["duplicate-row"],
                      row_limit=row_limit)
    detected_encoding = report["tables"][0]["encoding"]
    if given_encoding.lower() != detected_encoding:
        set_user_error(
            id_import=id_import,
            step="UPLOAD",
            error_code="ENCODING_ERROR",
        )
        errors.append({"error": "ENCODING_ERROR"})
        return {"errors": errors}
    if report["valid"] is False:
        # map the remaining goodtables errors to user errors:
        for error in report["tables"][0]["errors"]:
            set_user_error(
                id_import=id_import,
                step="UPLOAD",
                error_code=ERROR_MAPPING.get(error["code"], "UNKNOWN_ERROR"),
                comment="Original error: " + error["message"],
            )
            errors.append(error)
    # if the file contains no rows:
    if report["tables"][0]["row-count"] == 0:
        gn_error = get_error_from_code("EMPTY_FILE")
        set_user_error(
            id_import=id_import,
            step="UPLOAD",
            id_error=gn_error.id_error,
        )
        errors.append("no data")
    # get column names, file format and row count from the report:
    column_names = report["tables"][0]["headers"]
    file_format = report["tables"][0]["format"]
    row_count = report["tables"][0]["row-count"]
    logger.debug("column_names = %s", column_names)
    logger.debug("row_count = %s", row_count)
    return {
        "column_names": column_names,
        "file_format": file_format,
        "row_count": row_count,
        "errors": errors,
    }
def test_validate_no_headers():
    report = validate('data/invalid_no_headers.csv', headers=None)
    assert report['tables'][0]['row-count'] == 3
    # headers are None, so the header checks report blank headers
    assert report['tables'][0]['error-count'] == 3
    assert report['tables'][0]['errors'][0]['code'] == 'blank-header'
    assert report['tables'][0]['errors'][1]['code'] == 'blank-header'
    assert report['tables'][0]['errors'][2]['code'] == 'extra-value'
def test_foreign_key_external_resource_errors(log):
    descriptor = 'data/datapackages_linked_errors/cities/datapackage.json'
    report = validate(descriptor,
                      checks=['structure', 'schema', 'foreign-key'])
    assert log(report) == [
        (1, 4, 1, 'foreign-key'),  # self-referenced
        (1, 4, 3, 'foreign-key'),  # external
    ]
def pathPicker(self):
    for path in self.paths_files:
        report = validate(path)
        if report['tables'][0]['format'] == 'csv':
            columns = report['tables'][0]['headers']
            if len(columns) == 6:
                self.path_list.append(path)
    return self.path_list
def test_validate_table_invalid_row_limit(log):
    report = validate('data/invalid.csv', row_limit=2, infer_schema=True)
    assert log(report) == [
        (1, None, 3, 'blank-header'),
        (1, None, 4, 'duplicate-header'),
        (1, 2, 3, 'missing-value'),
        (1, 2, 4, 'missing-value'),
    ]
def test_validate_wide_table_with_order_fields_issue_277(log):
    report = validate('data/issue277.csv',
                      schema='data/issue277.json',
                      order_fields=True)
    assert log(report) == [
        (1, 49, 50, 'required-constraint'),
        (1, 68, 50, 'required-constraint'),
        (1, 69, 50, 'required-constraint'),
    ]
def test_check_deviated_value_incorrect_average(log):
    source = [
        ['row', 'name'],
        [2, 'Alex'],
    ]
    with pytest.raises(exceptions.GoodtablesException):
        validate(source, checks=[
            {'deviated-value': {'column': 3,
                                'average': 'incorrect-average'}},
        ])
def test_validate_warnings_table_and_error_limit():
    source = 'data/datapackages/invalid/datapackage.json'
    report = validate(source,
                      preset='datapackage',
                      table_limit=1,
                      error_limit=1)
    assert len(report['warnings']) == 2
    assert 'table(s) limit' in report['warnings'][0]
    assert 'error(s) limit' in report['warnings'][1]
def test_check_deviated_value_not_enough_data(log):
    source = [
        ['temperature'],
        [1],
    ]
    report = validate(source, checks=[
        {'deviated-value': {'column': 'temperature'}},
    ])
    assert log(report) == []
def test_goodtables():
    source = 'https://raw.githubusercontent.com/frictionlessdata/goodtables-py/master/data/datapackages/invalid/%s'
    report = validate(source % 'datapackage.json')
    report = remove_keys(report, keys=['time'])
    assert report == {
        'valid': False,
        'table-count': 2,
        'warnings': [],
        'error-count': 2,
        'preset': 'datapackage',
        'tables': [
            {
                'headers': ['id', 'name', 'description', 'amount'],
                'datapackage': source % 'datapackage.json',
                'errors': [{
                    'column-number': None,
                    'row-number': 3,
                    'row': [],
                    'code': 'blank-row',
                    'message': 'Row 3 is completely blank',
                }],
                'schema': 'table-schema',
                'row-count': 4,
                'valid': False,
                'error-count': 1,
                'scheme': None,
                'encoding': None,
                'format': 'inline',
                'source': source % 'data.csv',
            },
            {
                'headers': ['parent', 'comment'],
                'datapackage': source % 'datapackage.json',
                'errors': [{
                    'column-number': None,
                    'row-number': 4,
                    'row': [],
                    'code': 'blank-row',
                    'message': 'Row 4 is completely blank',
                }],
                'schema': 'table-schema',
                'row-count': 5,
                'valid': False,
                'error-count': 1,
                'scheme': None,
                'encoding': None,
                'format': 'inline',
                'source': source % 'data2.csv',
            },
        ],
    }
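# A small helper sketch for walking the report structure asserted above:
# each table in report['tables'] carries its own 'errors' list, and every
# error has 'row-number', 'column-number' and 'message' keys. The function
# name is hypothetical:
def summarize_report(report):
    lines = []
    for table in report['tables']:
        for error in table['errors']:
            lines.append('{}: row {}, column {}: {}'.format(
                table['source'],
                error['row-number'],
                error['column-number'],
                error['message']))
    return lines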
def test_check_deviated_value_not_a_number(log):
    source = [
        ['row', 'name'],
        [2, 'Alex'],
    ]
    report = validate(source, checks=[
        {'deviated-value': {'column': 'name'}},
    ])
    assert log(report) == [
        (1, 2, 2, 'deviated-value'),
    ]
def test_validate_infer_fields_issue_225():
    source = [
        ['name1', 'name2'],
        ['123', ''],
        ['456', ''],
        ['789', ''],
    ]
    schema = {'fields': [{'name': 'name1'}]}
    report = validate(source, schema=schema, infer_fields=True)
    assert report['valid']
def test_validate_invalid_table_schema(log):
    source = [
        ['name', 'age'],
        ['Alex', '33'],
    ]
    schema = {'fields': [
        {'name': 'name'},
        {'name': 'age', 'type': 'bad'},
    ]}
    report = validate(source, schema=schema)
    assert log(report) == [
        (1, None, None, 'schema-error'),
    ]
def test_check_sequential_value_non_existent_column(log):
    source = [
        ['row', 'name'],
        [2, 'Alex'],
    ]
    report = validate(source, checks=[
        {'sequential-value': {'column': 3}},
        {'sequential-value': {'column': 'non-existent'}},
    ])
    assert log(report) == [
        (1, 2, None, 'sequential-value'),
        (1, 2, None, 'sequential-value'),
    ]
def test_validate_nested_checks(log):
    source = [
        ['field'],
        ['value', 'value'],
        [''],
    ]
    report = validate([
        {'source': source, 'checks': ['extra-value']},
        {'source': source, 'checks': ['blank-row']},
    ])
    assert log(report) == [
        (1, 2, 2, 'extra-value'),
        (2, 3, None, 'blank-row'),
    ]
def test_check_custom_constraint(log):
    source = [
        ['row', 'salary', 'bonus'],
        [2, 1000, 200],
        [3, 2500, 500],
        [4, 1300, 500],
        [5, 5000, 1000],
        [6],
    ]
    report = validate(source, checks=[
        {'custom-constraint': {'constraint': 'salary > bonus * 4'}},
    ])
    assert log(report) == [
        (1, 4, None, 'custom-constraint'),
        (1, 6, None, 'custom-constraint'),
    ]
def test_check_sequential_value(log):
    source = [
        ['row', 'index2', 'index3'],
        [2, 1, 1],
        [3, 2, 3],
        [4, 3, 5],
        [5, 5, 6],
        [6],
    ]
    report = validate(source, checks=[
        {'sequential-value': {'column': 2}},
        {'sequential-value': {'column': 'index3'}},
    ])
    assert log(report) == [
        (1, 3, 3, 'sequential-value'),
        (1, 4, 3, 'sequential-value'),
        (1, 5, 2, 'sequential-value'),
        (1, 6, 2, 'sequential-value'),
        (1, 6, 3, 'sequential-value'),
    ]
def test_check_maximum_constraint(log):
    source = [
        ['row', 'score'],
        [2, 1],
        [3, 2],
        [4, 3],
        [5, 4],
        [6],
    ]
    schema = {'fields': [
        {'name': 'row', 'type': 'integer'},
        {'name': 'score', 'type': 'integer', 'constraints': {'maximum': 2}},
    ]}
    report = validate(source, schema=schema, checks=[
        'maximum-constraint',
    ])
    assert log(report) == [
        (1, 4, 2, 'maximum-constraint'),
        (1, 5, 2, 'maximum-constraint'),
    ]
def test_composite_primary_key_unique_issue_215(log):
    descriptor = {
        'resources': [{
            'name': 'name',
            'data': [
                ['id1', 'id2'],
                ['a', '1'],
                ['a', '2'],
            ],
            'schema': {
                'fields': [
                    {'name': 'id1'},
                    {'name': 'id2'},
                ],
                'primaryKey': ['id1', 'id2'],
            },
        }],
    }
    report = validate(descriptor)
    assert log(report) == []
def validate(paths, json, **options):
    # Remove blank values
    options = {key: value
               for key, value in options.items()
               if value is not None}
    if not options['checks']:
        del options['checks']
    if not options['skip_checks']:
        del options['skip_checks']
    options['infer_fields'] = options['infer_schema']
    quiet = options.pop('quiet')
    output = options.pop('output')
    sources = [{'source': path} for path in paths]
    schema = options.pop('schema', None)
    if schema:
        for source in sources:
            source['schema'] = schema
    report = goodtables.validate(sources, **options)
    if not quiet:
        _print_report(report, output=output, json=json)
    exit(not report['valid'])
from util import root
from goodtables import validate
from goodtables.cli import _print_report

report = validate(root / "datapackage.json", table_limit=20, row_limit=20000)
_print_report(report)
def test_validate_infer_datapackage_path(log):
    report = validate('data/datapackages/invalid/datapackage.json')
    assert report['error-count'] == 2
def test_validate_infer_datapackage_dict(log):
    with open('data/datapackages/invalid/datapackage.json') as file:
        report = validate(json.load(file))
    assert report['error-count'] == 2
def test_validate_infer_nested(log):
    report = validate([{'source': 'data/invalid.csv'}])
    assert report['error-count'] == 7
def test_validate_report_schema_infer_schema():
    report = validate('data/valid.csv', infer_schema=True)
    assert report['tables'][0]['schema'] == 'table-schema'
print("\nData summary:\n") print("Emissions sum w/o EU28: {:d} GgCO₂-equiv.".format(int( export.Emissions.sum() - export.Emissions.loc['EUU'].sum()))) print("Percentage sum: {}".format( export.Percentage.sum() - export.loc['EUU'].Percentage)) print("Count signatures: {}".format(export.Signature.count())) print("Count ratified: {}".format( export["Ratification"].count())) ratified = export["Ratification"].notnull() percentage_sum = (export[ratified].Percentage.sum() - export.loc["EUU"].Percentage) print("Sum of percentages with ratification w/o EU: {}".format( percentage_sum)) def to_int(x): if pd.isnull(x): return "" else: return str(int(x)) export.Emissions = export.Emissions.apply(to_int) export.Year = export.Year.apply(to_int) export.to_csv(outfile, encoding="UTF-8") report = validate(root / "datapackage.json") if report["error-count"] > 0: _print_report(report)
def test_validate_infer_table(log):
    report = validate('data/invalid.csv')
    assert report['error-count'] == 7
def test_scenarios_return_valid_reports(name, scenario, report_schema):
    del scenario['report']
    report = validate(**scenario)
    jsonschema.validate(report, report_schema)
def test_validate_report_scheme_format_encoding():
    report = validate('data/valid.csv')
    assert report['tables'][0]['scheme'] == 'file'
    assert report['tables'][0]['format'] == 'csv'
    assert report['tables'][0]['encoding'] == 'utf-8'
def test_scenarios(log, name, scenario):
    expect = list(map(lambda item: tuple(item), scenario.pop('report')))
    actual = log(validate(**scenario))
    assert actual == expect
def test_validate_report_preset():
    # renamed: this test checked the preset but shared its name with the
    # scheme/format/encoding test above, so one of the two was shadowed
    report = validate('data/valid.csv')
    assert report['preset'] == 'table'
def test_validate_report_schema():
    report = validate('data/valid.csv')
    assert report['tables'][0].get('schema') is None
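# Taken together, the tests above document the core API surface. A minimal,
# self-contained usage sketch (inline data, explicit schema, report fields
# read back afterwards); the sample rows are made up for illustration:
from goodtables import validate

SOURCE = [
    ['id', 'name'],
    ['1', 'english'],
    ['2', 'spanish'],
]
SCHEMA = {'fields': [
    {'name': 'id', 'type': 'integer'},
    {'name': 'name', 'type': 'string'},
]}

report = validate(SOURCE, schema=SCHEMA)
assert report['valid']
assert report['error-count'] == 0
assert report['tables'][0]['format'] == 'inline'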