Example #1
def test_validate_tables_invalid(log):
    report = validate(
        [
            {
                'source': 'data/valid.csv',
                'schema': {
                    'fields': [{
                        'name': 'id'
                    }, {
                        'name': 'name'
                    }]
                },
            },
            {
                'source': 'data/invalid.csv'
            },
        ],
        preset='nested',
        infer_schema=True,
    )
    assert log(report) == [
        (2, None, 3, 'blank-header'),
        (2, None, 3, 'non-matching-header'),
        (2, None, 4, 'duplicate-header'),
        (2, None, 4, 'non-matching-header'),
        (2, 2, 3, 'missing-value'),
        (2, 2, 4, 'missing-value'),
        (2, 3, None, 'duplicate-row'),
        (2, 4, None, 'blank-row'),
        (2, 5, 5, 'extra-value'),
    ]
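Most of the tests on this page pass a log fixture that flattens a goodtables report into (table-number, row-number, column-number, code) tuples, one tuple per reported error. The real fixture lives in the goodtables test suite; a minimal sketch of the behavior these assertions assume:

def log(report):
    # Sketch of the test fixture: flatten a goodtables report into
    # (table, row, column, code) tuples, numbering tables from 1.
    entries = []
    for table_number, table in enumerate(report['tables'], start=1):
        for error in table['errors']:
            entries.append((table_number,
                            error.get('row-number'),
                            error.get('column-number'),
                            error['code']))
    return entries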
Example #2
    def post(self, request):
        # If the data or JSON file is missing, return a 400 error
        # If a key is not an allowed request parameter, return a 400 error
        error_limit_form = sys.maxsize
        order_fields_form = True
        for key in request.data:
            if key not in self.allowed_requests:
                return HttpResponseBadRequest(
                    "400 Error: {key_name} not allowed as parameter.\n".format(
                        key_name=key),
                    status=400)
        if 'jsonfile' not in request.data:
            return HttpResponseBadRequest("400 Error: Payload Incorrect\n",
                                          status=400)
        elif 'docfile' not in request.data:
            return HttpResponseBadRequest("400 Error: Payload Incorrect\n",
                                          status=400)
        elif not request.data['docfile'].name.endswith('.csv'):
            return HttpResponseBadRequest(
                "400 Error: Data was not of CSV type\n", status=400)
        elif not request.data['jsonfile'].name.endswith('.json'):
            return HttpResponseBadRequest(
                "400 Error: Schema was not of JSON type\n", status=400)
        jsonfile = request.data['jsonfile']
        csvfile = request.data['docfile']
        if 'error_limit' in request.data:
            if request.data['error_limit'] == 'True':
                error_limit_form = 1
            elif request.data['error_limit'] == 'False':
                error_limit_form = sys.maxsize
        if 'order_fields' in request.data:
            if request.data['order_fields'] == 'True':
                order_fields_form = True
            elif request.data['order_fields'] == 'False':
                order_fields_form = False

        #validate file if no errors have been given
        path = default_storage.save('tmp/csvfile.csv',
                                    ContentFile(csvfile.read()))
        jsonpath = default_storage.save('tmp/jsonfile.json',
                                        ContentFile(jsonfile.read()))
        jsonreader = open(jsonpath, 'r')
        jsonstring = jsonreader.read()
        try:
            json_object = json.loads(jsonstring)
            print(json_object)
        except ValueError:
            return HttpResponseBadRequest(
                "400 Error: JSON File was not valid\n", status=400)

        jsonreader.close()
        validator = validate(path,
                             error_limit=error_limit_form,
                             row_limit=sys.maxsize,
                             schema=jsonpath,
                             order_fields=order_fields_form)
        os.remove(jsonpath)
        os.remove(path)
        return Response(json.dumps(validator))
Example #3
def main():

    base_url = 'https://raw.githubusercontent.com/UNStats/SDG/master/DataPackages/Goal/{}/datapackage.json'

    for goal in range(1, 18):
        dp_url = base_url.format(goal)
        dp = Package(dp_url)

        print('# Validating SDG {}'.format(goal))
        if dp.valid:
            print('\tData Package descriptor valid')
        else:
            print('\tData Package descriptor invalid')
            print('\t\tErrors:')
            print('\n'.join(['\t\t\t' + str(error) for error in dp.errors]))

        report = validate(dp_url, row_limit=10000)
        if report['valid']:
            print('\tData valid')
        else:
            print('\tData invalid')
            print('\t\tErrors (first 20):')
            for table in report['tables']:
                if not table['valid']:
                    print('\t\t\t{}:'.format(table['source'].split('/')[-1]))
                    cnt = 0
                    for error in table['errors']:
                        cnt += 1
                        print('\t\t\t\t' + error['message'])
                        if cnt >= 20:
                            break
Example #4
def upload_file():
    if request.method == 'POST':
        uploaded_files = request.files.getlist('file')
        label = dt.datetime.now().strftime("%a, %d. %B %Y %H:%M")
        for file in uploaded_files:
            if file and allowed_file(file.filename):
                filename = secure_filename(file.filename)
                file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
                report = validate(
                    os.path.join(app.config['UPLOAD_FOLDER'], filename),
                    schema="C:/Users/NXie/PycharmProjects/flask_google_uploads/diagnostic/schema.json",
                    order_fields=True)
                if report['valid']:
                    add_file(filename, label)
                    gs_add(filename)
                    return jsonify({'success': 'well done'})
                else:
                    os.remove(
                        os.path.join(app.config['UPLOAD_FOLDER'], filename))
                    return jsonify(
                        {'fail': report['tables'][0]['errors'][0]['message']})
            else:
                return jsonify({
                    'fail':
                    'Your file extension is {}'.format(
                        file.filename.rsplit('.', 1)[1].lower())
                })
    else:
        return render_template('index.html')
Example #5
def test_validate_datapackage_with_schema_structure_only_issue_348(log):
    DESCRIPTOR = {
        'resources': [{
            'name': 'people',
            'data': [
                ['id', 'name', 'surname'],
                ['p1', 'Tom', 'Hanks'],
                ['p2', 'Meryl', 'Streep'],
            ],
            'schema': {
                'fields': [
                    {
                        'name': 'id',
                        'type': 'string'
                    },
                    {
                        'name': 'name',
                        'type': 'string'
                    },
                    {
                        'name': 'surname',
                        'type': 'string'
                    },
                    {
                        'name': 'dob',
                        'type': 'date'
                    },
                ]
            },
        }]
    }
    report = validate(DESCRIPTOR, checks=['structure'])
    assert report['valid']
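With checks=['structure'] only structural checks run, so the 'dob' field that never appears in the data goes unreported; compare Example #19, where adding 'schema' to the checks surfaces a missing-header error for the same descriptor.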
Example #6
def test_check_minimum_constraint(log):
    source = [
        ['row', 'score'],
        [2, 1],
        [3, 2],
        [4, 3],
        [5, 4],
        [6],
    ]
    schema = {
        'fields': [{
            'name': 'row',
            'type': 'integer'
        }, {
            'name': 'score',
            'type': 'integer',
            'constraints': {
                'minimum': 2
            }
        }]
    }
    report = validate(source, schema=schema, checks=[
        'minimum-constraint',
    ])
    assert log(report) == [
        (1, 2, 2, 'minimum-constraint'),
    ]
Example #7
def test_validate_order_fields_issue_313(log):
    source = 'data/order_fields_313.xlsx'
    schema = {
        'fields': [
            {
                'name': 'Column_1',
                'type': 'string',
            },
            {
                'name': 'Column_2',
                'type': 'string',
                'constraints': {
                    'required': True
                }
            },
            {
                'name': 'Column_3',
                'type': 'string'
            },
            {
                'name': 'Column_4',
                'type': 'string'
            },
            {
                'name': 'Column_5',
                'type': 'string'
            },
        ]
    }
    # For now, the "non-matching-header" check is required to order the fields
    checks = ['non-matching-header', 'required-constraint']
    report = validate(source, schema=schema, checks=checks, order_fields=True)
    assert report['valid']
Example #8
def validate(paths, json, **options):
    # Remove blank values
    options = {
        key: value
        for key, value in options.items() if value is not None
    }
    if not options['checks']:
        del options['checks']
    if not options['skip_checks']:
        del options['skip_checks']

    options['infer_fields'] = options['infer_schema']
    quiet = options.pop('quiet')
    output = options.pop('output')

    if options.get('preset') == 'datapackage':
        sources = paths[0]
    else:
        sources = [{'source': path} for path in paths]
        schema = options.pop('schema', None)
        if schema:
            for source in sources:
                source['schema'] = schema

    report = goodtables.validate(sources, **options)

    if not quiet:
        _print_report(report, output=output, json=json)

    exit(not report['valid'])
Example #9
    def validate(self, model, checks=None):
        """Use a defined schema to validate the given table."""
        records = self.data.to_dict("records")
        self.evaluate_report(
            validate(records, headers=list(records[0]),
                     preset='table', schema=self.schema,
                     order_fields=True, custom_checks=checks or []))
Example #10
def test_check_custom_constraint_incorrect_constraint(log):
    source = [
        ['row', 'name'],
        [2, 'Alex'],
    ]
    report = validate(source,
                      checks=[
                          {
                              'custom-constraint': {
                                  'constraint': 'vars()'
                              }
                          },
                          {
                              'custom-constraint': {
                                  'constraint': 'import(os)'
                              }
                          },
                          {
                              'custom-constraint': {
                                  'constraint': 'non_existent > 0'
                              }
                          },
                      ])
    assert log(report) == [
        (1, 2, None, 'custom-constraint'),
        (1, 2, None, 'custom-constraint'),
        (1, 2, None, 'custom-constraint'),
    ]
Example #11
    def validate(self, source, content_type):

        if content_type == 'application/json':
            data = utils.to_tabular(source)
        elif content_type == 'text/csv':
            data = utils.reorder_csv(source)
        else:
            raise UnsupportedContentTypeException(content_type,
                                                  type(self).__name__)

        try:
            data['source'].decode()
            byteslike = True
        except (TypeError, KeyError, AttributeError):
            byteslike = False

        if byteslike:
            validate_params = data.copy()
            validate_params['schema'] = self.validator
            validate_params['source'] = io.BytesIO(data['source'])
        else:
            validate_params = {
                'source': data,
                'schema': self.validator,
                "headers": 1
            }

        result = goodtables.validate(**validate_params)
        return self.formatted(data, result)
Example #12
def test_validate_datapackage_dialect_header_false(log):
    descriptor = {
        'resources': [{
            'name': 'name',
            'data': [
                ['John', '22'],
                ['Alex', '33'],
                ['Paul', '44'],
            ],
            'schema': {
                'fields': [
                    {
                        'name': 'name'
                    },
                    {
                        'name': 'age',
                        'type': 'integer'
                    },
                ]
            },
            'dialect': {
                'header': False,
            }
        }]
    }
    report = validate(descriptor)
    assert log(report) == []
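Because the dialect sets header to False, every row is treated as data and the schema's field names stand in for the headers, so nothing is flagged.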
Example #13
    def _validate_resource(res, dp_res):
        # Duplicate the iterator: one copy is materialized for goodtables,
        # the other is yielded downstream untouched.
        evaluated_rows, rows = itertools.tee(res)
        evaluated_rows = list(_get_row_value(r) for r in evaluated_rows)
        validate_options = {
            'schema': dp_res['schema'],
            'order_fields': True,
            'preset': 'table'
        }
        validate_options.update(goodtables_options)
        report = goodtables.validate(evaluated_rows, **validate_options)

        report_file_path = '{}/{}.json'.format(reports_path, dp_res['name'])
        if write_report:
            os.makedirs(reports_path, exist_ok=True)
            with io.open(report_file_path, 'w') as f:
                f.write(json.dumps(report, indent=4))

        if report['error-count'] > 0 and fail_on_error:
            msg = 'Datapackage resource \'{}\' failed'.format(dp_res['name'])
            msg += ' Goodtables validation.'
            if write_report:
                msg += ' See report for details: {}'.format(report_file_path)
            raise RuntimeError(msg)

        yield from rows
Example #14
def test_foreign_key_internal_resource_violation(log):
    descriptor = deepcopy(FK_DESCRIPTOR)
    del descriptor['resources'][1]['data'][4]
    report = validate(descriptor, checks=['foreign-key'])
    assert log(report) == [
        (1, 5, 1, 'foreign-key'),
    ]
Example #15
def test_composite_primary_key_not_unique_issue_215(log):
    descriptor = {
        'resources': [{
            'name': 'name',
            'data': [
                ['id1', 'id2'],
                ['a', '1'],
                ['a', '1'],
            ],
            'schema': {
                'fields': [
                    {
                        'name': 'id1'
                    },
                    {
                        'name': 'id2'
                    },
                ],
                'primaryKey': ['id1', 'id2']
            }
        }],
    }
    report = validate(descriptor, skip_checks=['duplicate-row'])
    assert log(report) == [
        (1, 3, 1, 'unique-constraint'),
    ]
Example #16
def test_foreign_key_self_referenced_resource_violation(log):
    descriptor = deepcopy(FK_DESCRIPTOR)
    del descriptor['resources'][0]['data'][4]
    report = validate(descriptor, checks=['foreign-key'])
    assert log(report) == [
        (1, 4, 3, 'foreign-key'),
    ]
Example #17
def validate_save_pkg(pkg_descriptor, pkg_dir):
    """
    Validate a data package descriptor and save it to a json file.

    Args:
        pkg_descriptor (dict): the tabular data package descriptor.
        pkg_dir (path-like): directory in which datapackage.json is saved.

    Returns:
        report: the goodtables validation report.

    """
    # Use that descriptor to instantiate a Package object
    data_pkg = datapackage.Package(pkg_descriptor)

    # Validate the data package descriptor before going any further
    if not data_pkg.valid:
        logger.error(f"""
            Invalid tabular data package: {data_pkg.descriptor["name"]}
            Errors: {data_pkg.errors}""")

    # pkg_json is the datapackage.json that we ultimately output:
    pkg_json = os.path.join(pkg_dir, "datapackage.json")
    data_pkg.save(pkg_json)
    logger.info('Validating the data package...')
    # Validate the data within the package using goodtables:
    report = goodtables.validate(pkg_json, row_limit=1000)
    if not report['valid']:
        logger.error("Data package validation failed.")
    else:
        logger.info('Congrats! You made a valid data package!')
    return report
Example #18
def test_check_minimum_length_constraint(log):
    source = [
        ['row', 'word'],
        [2, 'a'],
        [3, 'ab'],
        [4, 'abc'],
        [5, 'abcd'],
        [6],
    ]
    schema = {
        'fields': [{
            'name': 'row',
            'type': 'integer'
        }, {
            'name': 'word',
            'type': 'string',
            'constraints': {
                'minLength': 2
            }
        }]
    }
    report = validate(source,
                      schema=schema,
                      checks=[
                          'minimum-length-constraint',
                      ])
    assert log(report) == [
        (1, 2, 2, 'minimum-length-constraint'),
    ]
Example #19
def test_validate_datapackage_with_schema_issue_348(log):
    DESCRIPTOR = {
        'resources': [{
            'name': 'people',
            'data': [['id', 'name', 'surname'], ['p1', 'Tom', 'Hanks'],
                     ['p2', 'Meryl', 'Streep']],
            'schema': {
                'fields': [{
                    'name': 'id',
                    'type': 'string'
                }, {
                    'name': 'name',
                    'type': 'string'
                }, {
                    'name': 'surname',
                    'type': 'string'
                }, {
                    'name': 'dob',
                    'type': 'date'
                }]
            }
        }]
    }
    report = validate(DESCRIPTOR, checks=['structure', 'schema'])
    assert log(report) == [
        (1, None, 4, 'missing-header'),
    ]
Example #20
def test_check_deviated_value(log):
    source = [
        ['temperature'],
        [1],
        [-2],
        [7],
        [0],
        [1],
        [2],
        [5],
        [-4],
        [100],
        [8],
        [3],
    ]
    report = validate(source,
                      checks=[
                          {
                              'deviated-value': {
                                  'column': 'temperature',
                                  'average': 'median',
                                  'interval': 3
                              }
                          },
                      ])
    assert log(report) == [
        (1, 10, 1, 'deviated-value'),
    ]
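The deviated-value check flags cells that fall outside average ± interval × standard deviation. Assuming goodtables uses the population standard deviation, the arithmetic behind the single expected error looks roughly like this:

import statistics

# Assumed reconstruction of the deviated-value bounds for this source
values = [1, -2, 7, 0, 1, 2, 5, -4, 100, 8, 3]
average = statistics.median(values)    # 2, per 'average': 'median'
deviation = statistics.pstdev(values)  # ~28.4 (population stdev assumed)
low, high = average - 3 * deviation, average + 3 * deviation  # ~(-83, 87)
print([v for v in values if not low <= v <= high])  # [100] -> row 10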
Example #21
def test_check_file_integrity_invalid(log):
    source = deepcopy(DESCRIPTOR)
    source['resources'][0]['hash'] = 'not-supported-hash'
    report = validate(source)
    assert report['warnings'] == [
        'Resource "resource1" does not use the SHA256 hash. The check will be skipped',
    ]
Example #22
def _validate_table(source, _format=u'csv', schema=None, **options):

    log.debug(u'Validating source: {}'.format(source))

    report = validate(source, format=_format, schema=schema, **options)

    return report
Example #23
def generate_metadata(pkg_settings, tables, pkg_dir, uuid_pkgs=None):
    # Generate the package UUID at call time, not at import time
    if uuid_pkgs is None:
        uuid_pkgs = str(uuid.uuid4())
    # pkg_json is the datapackage.json that we ultimately output:
    pkg_json = os.path.join(pkg_dir, "datapackage.json")
    # Create a tabular data resource for each of the tables.
    resources = []
    for t in tables:
        resources.append(get_tabular_data_resource_2(t, pkg_dir=pkg_dir))

    data_sources = pudl.helpers.data_sources_from_tables_pkg(tables)
    sources = []
    for src in data_sources:
        if src in pudl.constants.data_sources:
            sources.append({"title": src, "path": pc.base_data_urls[src]})

    contributors = set()
    for src in data_sources:
        for c in pudl.constants.contributors_by_source[src]:
            contributors.add(c)

    pkg_descriptor = {
        "name": pkg_settings["name"],
        "profile": "tabular-data-package",
        "title": pkg_settings["title"],
        "id": uuid_pkgs,
        "description": pkg_settings["description"],
        # "keywords": pkg_settings["keywords"],
        "homepage": "https://catalyst.coop/pudl/",
        "created": (datetime.datetime.utcnow()
                    .replace(microsecond=0).isoformat() + 'Z'),
        "contributors": [pudl.constants.contributors[c] for c in contributors],
        "sources": sources,
        "licenses": [pudl.constants.licenses["cc-by-4.0"]],
        "resources": resources,
    }

    # Use that descriptor to instantiate a Package object
    data_pkg = datapackage.Package(pkg_descriptor)

    # Validate the data package descriptor before going any further
    if not data_pkg.valid:
        logger.warning(f"""
            Invalid tabular data package: {data_pkg.descriptor["name"]}
            Errors: {data_pkg.errors}""")

    data_pkg.save(pkg_json)
    # Validate the data within the package using goodtables:
    report = goodtables.validate(pkg_json, row_limit=1000)
    if not report['valid']:
        logger.warning("Data package data validation failed.")

    return data_pkg, report
Example #24
def check_user_file_good_table(id_import,
                               full_path,
                               given_encoding,
                               row_limit=100000000):
    try:
        errors = []
        report = validate(full_path,
                          skip_checks=["duplicate-row"],
                          row_limit=row_limit)
        detected_encoding = report["tables"][0]["encoding"]
        if given_encoding.lower() != detected_encoding:
            set_user_error(
                id_import=id_import,
                step="UPLOAD",
                error_code="ENCODING_ERROR",
            )
            errors.append({"error": "ENCODING_ERROR"})
            return {"errors": errors}
        if report["valid"] is False:
            for error in report["tables"][0]["errors"]:
                # map the remaining goodtables errors:
                set_user_error(
                    id_import=id_import,
                    step="UPLOAD",
                    error_code=ERROR_MAPPING.get(error["code"],
                                                 "UNKNOWN_ERROR"),
                    comment="Erreur d'origine :" + error["message"],
                )
                errors.append(error)

        # if the file has no rows:
        if report["tables"][0]["row-count"] == 0:
            gn_error = get_error_from_code("EMPTY_FILE")
            set_user_error(
                id_import=id_import,
                step="UPLOAD",
                id_error=gn_error.id_error,
            )
            errors.append("no data")

        # get column names:
        column_names = report["tables"][0]["headers"]
        # get file format:
        file_format = report["tables"][0]["format"]
        # get row number:
        row_count = report["tables"][0]["row-count"]

        logger.debug("column_names = %s", column_names)
        logger.debug("row_count = %s", row_count)

        return {
            "column_names": column_names,
            "file_format": file_format,
            "row_count": row_count,
            "errors": errors,
        }

    except Exception:
        raise
Example #25
def test_validate_no_headers():
    report = validate('data/invalid_no_headers.csv', headers=None)
    assert report['tables'][0]['row-count'] == 3
    # header checks still run since headers are None
    assert report['tables'][0]['error-count'] == 3
    assert report['tables'][0]['errors'][0]['code'] == 'blank-header'
    assert report['tables'][0]['errors'][1]['code'] == 'blank-header'
    assert report['tables'][0]['errors'][2]['code'] == 'extra-value'
Example #26
def test_foreign_key_external_resource_errors(log):
    descriptor = 'data/datapackages_linked_errors/cities/datapackage.json'
    report = validate(descriptor,
                      checks=['structure', 'schema', 'foreign-key'])
    assert log(report) == [
        (1, 4, 1, 'foreign-key'),  # self-referenced
        (1, 4, 3, 'foreign-key'),  # external
    ]
Example #27
	def pathPicker(self):
		for path in self.paths_files:
			report = validate(path)
			if report['tables'][0]['format'] == 'csv':
				columns = report['tables'][0]['headers']
				if len(columns) == 6:
					self.path_list.append(path)
		return self.path_list
Example #28
def test_validate_table_invalid_row_limit(log):
    report = validate('data/invalid.csv', row_limit=2, infer_schema=True)
    assert log(report) == [
        (1, None, 3, 'blank-header'),
        (1, None, 4, 'duplicate-header'),
        (1, 2, 3, 'missing-value'),
        (1, 2, 4, 'missing-value'),
    ]
Example #29
def test_validate_wide_table_with_order_fields_issue_277(log):
    report = validate('data/issue277.csv',
                      schema='data/issue277.json',
                      order_fields=True)
    assert log(report) == [
        (1, 49, 50, 'required-constraint'),
        (1, 68, 50, 'required-constraint'),
        (1, 69, 50, 'required-constraint'),
    ]
Example #30
def test_check_deviated_value_incorrect_average(log):
    source = [
        ['row', 'name'],
        [2, 'Alex'],
    ]
    with pytest.raises(exceptions.GoodtablesException):
        report = validate(source, checks=[
            {'deviated-value': {'column': 3, 'average': 'incorrect-average'}},
        ])
Example #31
def test_validate_warnings_table_and_error_limit():
    source = 'data/datapackages/invalid/datapackage.json'
    report = validate(source,
                      preset='datapackage',
                      table_limit=1,
                      error_limit=1)
    assert len(report['warnings']) == 2
    assert 'table(s) limit' in report['warnings'][0]
    assert 'error(s) limit' in report['warnings'][1]
Example #32
def test_check_deviated_value_not_enough_data(log):
    source = [
        ['temperature'],
        [1],
    ]
    report = validate(source, checks=[
        {'deviated-value': {'column': 'temperature'}},
    ])
    assert log(report) == []
Example #33
def test_goodtables():
    source = 'https://raw.githubusercontent.com/frictionlessdata/goodtables-py/master/data/datapackages/invalid/%s'
    report = validate(source % 'datapackage.json')
    report = remove_keys(report, keys=['time'])
    assert report == {
        'valid': False,
        'table-count': 2,
        'warnings': [],
        'error-count': 2,
        'preset': 'datapackage',
        'tables': [
            {
                'headers': ['id', 'name', 'description', 'amount'],
                'datapackage': source % 'datapackage.json',
                'errors': [
                    {
                        'column-number': None,
                        'row-number': 3,
                        'row': [],
                        'code': 'blank-row',
                        'message': 'Row 3 is completely blank'
                    }
                ],
                'schema': 'table-schema',
                'row-count': 4,
                'valid': False,
                'error-count': 1,
                'scheme': None,
                'encoding': None,
                'format': 'inline',
                'source': source % 'data.csv',
            },
            {
                'headers': ['parent', 'comment'],
                'datapackage': source % 'datapackage.json',
                'errors': [
                    {
                        'column-number': None,
                        'row-number': 4,
                        'row': [],
                        'code': 'blank-row',
                        'message': 'Row 4 is completely blank'
                    }
                ],
                'schema': 'table-schema',
                'row-count': 5,
                'valid': False,
                'error-count': 1,
                'scheme': None,
                'encoding': None,
                'format': 'inline',
                'source': source % 'data2.csv',
            }
        ]
    }
Example #34
def test_check_deviated_value_not_a_number(log):
    source = [
        ['row', 'name'],
        [2, 'Alex'],
    ]
    report = validate(source, checks=[
        {'deviated-value': {'column': 'name'}},
    ])
    assert log(report) == [
        (1, 2, 2, 'deviated-value'),
    ]
Example #35
def test_validate_infer_fields_issue_225():
    source = [
        ['name1', 'name2'],
        ['123', ''],
        ['456', ''],
        ['789', ''],
    ]
    schema = {
        'fields': [{'name': 'name1'}]
    }
    report = validate(source, schema=schema, infer_fields=True)
    assert report['valid']
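Here infer_fields=True asks goodtables to infer a definition for the name2 column that the schema omits, so the report comes back valid despite the incomplete schema.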
Example #36
def test_validate_invalid_table_schema(log):
    source = [
        ['name', 'age'],
        ['Alex', '33'],
    ]
    schema = {'fields': [
        {'name': 'name'},
        {'name': 'age', 'type': 'bad'},
    ]}
    report = validate(source, schema=schema)
    assert log(report) == [
        (1, None, None, 'schema-error'),
    ]
Example #37
def test_check_sequential_value_non_existent_column(log):
    source = [
        ['row', 'name'],
        [2, 'Alex'],
    ]
    report = validate(source, checks=[
        {'sequential-value': {'column': 3}},
        {'sequential-value': {'column': 'non-existent'}},
    ])
    assert log(report) == [
        (1, 2, None, 'sequential-value'),
        (1, 2, None, 'sequential-value'),
    ]
Example #38
def test_validate_nested_checks(log):
    source = [
        ['field'],
        ['value', 'value'],
        [''],
    ]
    report = validate([
        {'source': source, 'checks': ['extra-value']},
        {'source': source, 'checks': ['blank-row']}
    ])
    assert log(report) == [
        (1, 2, 2, 'extra-value'),
        (2, 3, None, 'blank-row'),
    ]
Example #39
def test_check_custom_constraint_incorrect_constraint(log):
    source = [
        ['row', 'name'],
        [2, 'Alex'],
    ]
    report = validate(source, checks=[
        {'custom-constraint': {'constraint': 'vars()'}},
        {'custom-constraint': {'constraint': 'import(os)'}},
        {'custom-constraint': {'constraint': 'non_existent > 0'}},
    ])
    assert log(report) == [
        (1, 2, None, 'custom-constraint'),
        (1, 2, None, 'custom-constraint'),
        (1, 2, None, 'custom-constraint'),
    ]
Example #40
def test_check_custom_constraint(log):
    source = [
        ['row', 'salary', 'bonus'],
        [2, 1000, 200],
        [3, 2500, 500],
        [4, 1300, 500],
        [5, 5000, 1000],
        [6],
    ]
    report = validate(source, checks=[
        {'custom-constraint': {'constraint': 'salary > bonus * 4'}},
    ])
    assert log(report) == [
        (1, 4, None, 'custom-constraint'),
        (1, 6, None, 'custom-constraint'),
    ]
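The custom-constraint check evaluates the given expression against each row's values: row 4 fails because 1300 > 500 * 4 is false, and row 6 fails because the values the expression needs are missing.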
Example #41
def test_check_minimum_length_constraint(log):
    source = [
        ['row', 'word'],
        [2, 'a'],
        [3, 'ab'],
        [4, 'abc'],
        [5, 'abcd'],
        [6],
    ]
    schema = {'fields': [
        {'name': 'row', 'type': 'integer'},
        {'name': 'word', 'type': 'string', 'constraints': {'minLength': 2}}
    ]}
    report = validate(source, schema=schema, checks=[
        'minimum-length-constraint',
    ])
    assert log(report) == [
        (1, 2, 2, 'minimum-length-constraint'),
    ]
Example #42
def test_check_sequential_value(log):
    source = [
        ['row', 'index2', 'index3'],
        [2, 1, 1],
        [3, 2, 3],
        [4, 3, 5],
        [5, 5, 6],
        [6],
    ]
    report = validate(source, checks=[
        {'sequential-value': {'column': 2}},
        {'sequential-value': {'column': 'index3'}},
    ])
    assert log(report) == [
        (1, 3, 3, 'sequential-value'),
        (1, 4, 3, 'sequential-value'),
        (1, 5, 2, 'sequential-value'),
        (1, 6, 2, 'sequential-value'),
        (1, 6, 3, 'sequential-value'),
    ]
Example #43
def test_check_maximum_constraint(log):
    source = [
        ['row', 'score'],
        [2, 1],
        [3, 2],
        [4, 3],
        [5, 4],
        [6],
    ]
    schema = {'fields': [
        {'name': 'row', 'type': 'integer'},
        {'name': 'score', 'type': 'integer', 'constraints': {'maximum': 2}}
    ]}
    report = validate(source, schema=schema, checks=[
        'maximum-constraint',
    ])
    assert log(report) == [
        (1, 4, 2, 'maximum-constraint'),
        (1, 5, 2, 'maximum-constraint'),
    ]
Example #44
def test_check_deviated_value(log):
    source = [
        ['temperature'],
        [1],
        [-2],
        [7],
        [0],
        [1],
        [2],
        [5],
        [-4],
        [100],
        [8],
        [3],
    ]
    report = validate(source, checks=[
        {'deviated-value': {'column': 'temperature', 'average': 'median', 'interval': 3}},
    ])
    assert log(report) == [
        (1, 10, 1, 'deviated-value'),
    ]
Example #45
def test_composite_primary_key_unique_issue_215(log):
    descriptor = {
        'resources': [
            {
                'name': 'name',
                'data': [
                    ['id1', 'id2'],
                    ['a', '1'],
                    ['a', '2'],
                ],
                'schema': {
                    'fields': [
                        {'name': 'id1'},
                        {'name': 'id2'},
                    ],
                    'primaryKey': ['id1', 'id2']
                }
            }
        ],
    }
    report = validate(descriptor)
    assert log(report) == []
Example #46
def test_validate_datapackage_dialect_header_false(log):
    descriptor = {
        'resources': [
            {
                'name': 'name',
                'data': [
                    ['John', '22'],
                    ['Alex', '33'],
                    ['Paul', '44'],
                ],
                'schema': {
                    'fields': [
                        {'name': 'name'},
                        {'name': 'age', 'type': 'integer'},
                    ]
                },
                'dialect': {
                    'header': False,
                }
            }
        ]
    }
    report = validate(descriptor)
    assert log(report) == []
Example #47
def validate(paths, json, **options):
    # Remove blank values
    options = {key: value for key, value in options.items() if value is not None}
    if not options['checks']:
        del options['checks']
    if not options['skip_checks']:
        del options['skip_checks']

    options['infer_fields'] = options['infer_schema']
    quiet = options.pop('quiet')
    output = options.pop('output')

    sources = [{'source': path} for path in paths]
    schema = options.pop('schema', None)
    if schema:
        for source in sources:
            source['schema'] = schema

    report = goodtables.validate(sources, **options)

    if not quiet:
        _print_report(report, output=output, json=json)

    exit(not report['valid'])
Example #48
def test_composite_primary_key_not_unique_issue_215(log):
    descriptor = {
        'resources': [
            {
                'name': 'name',
                'data': [
                    ['id1', 'id2'],
                    ['a', '1'],
                    ['a', '1'],
                ],
                'schema': {
                    'fields': [
                        {'name': 'id1'},
                        {'name': 'id2'},
                    ],
                    'primaryKey': ['id1', 'id2']
                }
            }
        ],
    }
    report = validate(descriptor, skip_checks=['duplicate-row'])
    assert log(report) == [
        (1, 3, 1, 'unique-constraint'),
    ]
Example #49
from util import root

from goodtables import validate
from goodtables.cli import _print_report

report = validate(
    root / "datapackage.json",
    table_limit=20,
    row_limit=20000
)
_print_report(report)
Example #50
def test_validate_infer_datapackage_path(log):
    report = validate('data/datapackages/invalid/datapackage.json')
    assert report['error-count'] == 2
Example #51
def test_validate_infer_datapackage_dict(log):
    with open('data/datapackages/invalid/datapackage.json') as file:
        report = validate(json.load(file))
        assert report['error-count'] == 2
Example #52
def test_validate_infer_nested(log):
    report = validate([{'source': 'data/invalid.csv'}])
    assert report['error-count'] == 7
Example #53
def test_validate_report_schema_infer_schema():
    report = validate('data/valid.csv', infer_schema=True)
    assert report['tables'][0]['schema'] == 'table-schema'
print("\nData summary:\n")
print("Emissions sum w/o EU28: {:d} GgCO₂-equiv.".format(int(
    export.Emissions.sum() - export.Emissions.loc['EUU'].sum())))
print("Percentage sum: {}".format(
    export.Percentage.sum() - export.loc['EUU'].Percentage))
print("Count signatures: {}".format(export.Signature.count()))
print("Count ratified: {}".format(
    export["Ratification"].count()))
ratified = export["Ratification"].notnull()
percentage_sum = (export[ratified].Percentage.sum() -
                  export.loc["EUU"].Percentage)
print("Sum of percentages with ratification w/o EU: {}".format(
    percentage_sum))


def to_int(x):
    if pd.isnull(x):
        return ""
    else:
        return str(int(x))


export.Emissions = export.Emissions.apply(to_int)
export.Year = export.Year.apply(to_int)
export.to_csv(outfile, encoding="UTF-8")

report = validate(root / "datapackage.json")
if report["error-count"] > 0:
    _print_report(report)
Example #55
def test_validate_infer_table(log):
    report = validate('data/invalid.csv')
    assert report['error-count'] == 7
Example #56
def test_scenarios_return_valid_reports(name, scenario, report_schema):
    del scenario['report']
    report = validate(**scenario)

    jsonschema.validate(report, report_schema)
Example #57
def test_validate_report_scheme_format_encoding():
    report = validate('data/valid.csv')
    assert report['tables'][0]['scheme'] == 'file'
    assert report['tables'][0]['format'] == 'csv'
    assert report['tables'][0]['encoding'] == 'utf-8'
Example #58
def test_scenarios(log, name, scenario):
    expect = list(map(lambda item: tuple(item), scenario.pop('report')))
    actual = log(validate(**scenario))
    assert actual == expect
Example #59
def test_validate_report_scheme_format_encoding():
    report = validate('data/valid.csv')
    assert report['preset'] == 'table'
Example #60
def test_validate_report_schema():
    report = validate('data/valid.csv')
    assert report['tables'][0].get('schema') is None