Code Example #1
    def setUp(self):
        self.validator = DataFileValidator()
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

        self.invalid_file_yaml = os.path.join(self.base_dir,
                                              'test_data/invalid_file.yaml')

        self.valid_file_yaml = os.path.join(self.base_dir,
                                            'test_data/valid_file.yaml')

        self.valid_file_json = os.path.join(self.base_dir,
                                            'test_data/valid_file.json')

        self.invalid_file_json = os.path.join(self.base_dir,
                                              'test_data/invalid_file.json')

        self.valid_file_error_percent_yaml = os.path.join(
            self.base_dir, 'test_data/valid_data_with_error.yaml')

        self.invalid_syntax_data_file = os.path.join(
            self.base_dir, 'test_data/invalid_data_file.yaml')

        self.invalid_parser_file = os.path.join(
            self.base_dir, 'test_data/invalid_parser_file.yaml')

        self.valid_custom_file = os.path.join(
            self.base_dir, 'test_data/valid_file_custom.yaml')
Code Example #2
    def test_load_data_with_custom_data_type(self):
        self.validator = DataFileValidator()
        custom_schema_path = os.path.join(self.base_dir,
                                          'test_data/custom_data_schema.json')
        self.validator.load_custom_schema('different', custom_schema_path)

        self.assertTrue('different' in self.validator.custom_data_schemas)

        self.assertTrue(
            self.validator.validate(file_path=self.valid_custom_file))
Code Example #3
def test_invalid_schema_version():
    """
    Tests the DataFileValidator creation with an invalid schema version
    """
    with pytest.raises(ValueError) as excinfo:
        validator = DataFileValidator(schema_version='0.9999.99')

    assert "Invalid schema version 0.9999.99" == str(excinfo.value)
Code Example #4
    def test_load_data_with_custom_data_type(self):
        self.validator = DataFileValidator()
        custom_schema_path = os.path.join(self.base_dir, 'test_data/custom_data_schema.json')
        self.validator.load_custom_schema('different', custom_schema_path)

        self.assertTrue('different' in self.validator.custom_data_schemas)

        self.assertTrue(self.validator.validate(file_path=self.valid_custom_file))
Code Example #5
def test_invalid_schema_file():
    # Fudge the schema versions constant so we can check that the schema-file check works
    VALID_SCHEMA_VERSIONS.append('0.9999.9999')
    try:
        with pytest.raises(ValueError) as excinfo:
            validator = DataFileValidator(schema_version='0.9999.9999')

        assert "Invalid schema file" in str(excinfo.value)
    finally:
        VALID_SCHEMA_VERSIONS.pop()
Code Example #6
File: validators.py  Project: islahudinees/hepdata
def get_data_validator(old_schema):
    """
    Returns a DataFileValidator object (with remotely-defined schemas loaded).

    :param old_schema: whether the schema version for the data file is 0.1.0
    :return: DataFileValidator object
    """

    global CACHED_DATA_VALIDATOR

    # Use for YAML files migrated from old HepData site
    if old_schema:
        data_validator = DataFileValidator(schema_version='0.1.0')

    elif CACHED_DATA_VALIDATOR:
        data_validator = CACHED_DATA_VALIDATOR

    else:
        data_validator = DataFileValidator()
        load_remote_schemas(data_validator)
        CACHED_DATA_VALIDATOR = data_validator

    return data_validator
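
A minimal usage sketch of the caching behaviour above. It assumes the
module-level CACHED_DATA_VALIDATOR starts out as None; the assertions
illustrate the intended semantics rather than a documented API guarantee:

first = get_data_validator(old_schema=False)   # builds, loads remote schemas, caches
second = get_data_validator(old_schema=False)  # returns the cached instance
assert first is second

legacy = get_data_validator(old_schema=True)   # a fresh 0.1.0 validator, never cached
assert legacy is not first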
Code Example #7
    def setUp(self):
        self.validator = DataFileValidator()
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

        self.invalid_file_yaml = os.path.join(
            self.base_dir,
            'test_data/invalid_file.yaml'
        )

        self.valid_file_yaml = os.path.join(
            self.base_dir,
            'test_data/valid_file.yaml'
        )

        self.valid_file_json = os.path.join(
            self.base_dir,
            'test_data/valid_file.json'
        )

        self.invalid_file_json = os.path.join(
            self.base_dir,
            'test_data/invalid_file.json')

        self.valid_file_error_percent_yaml = os.path.join(
            self.base_dir,
            'test_data/valid_data_with_error.yaml'
        )

        self.invalid_syntax_data_file = os.path.join(
            self.base_dir,
            'test_data/invalid_data_file.yaml'
        )

        self.valid_custom_file = os.path.join(
            self.base_dir,
            'test_data/valid_file_custom.yaml')
Code Example #8
def validator_v0():
    return DataFileValidator(schema_version='0.1.0')
Code Example #9
    def parse(self, data_in, *args, **kwargs):
        """
        :param data_in: path to submission.yaml
        :param args:
        :param kwargs:
        :raise ValueError:
        """
        if not os.path.exists(data_in):
            raise ValueError("File / Directory does not exist: %s" % data_in)

        if os.path.isdir(data_in):
            submission_filepath = os.path.join(data_in, 'submission.yaml')
            if not os.path.exists(submission_filepath):
                submission_filepath = os.path.join(data_in, 'submission.yml')
                if not os.path.exists(submission_filepath):
                    raise ValueError("No submission file in %s" % data_in)
            data_in = submission_filepath

        # first validate submission file:
        with open(data_in, 'r') as submission_file:
            try:
                submission_data = list(
                    yaml.load_all(submission_file, Loader=yaml.CSafeLoader))
            except Exception:  # pragma: no cover
                submission_data = list(
                    yaml.load_all(submission_file))  # pragma: no cover

            if len(submission_data) == 0:
                raise RuntimeError("Submission file (%s) is empty" % data_in)

            submission_file_validator = SubmissionFileValidator()
            if not submission_file_validator.validate(file_path=data_in,
                                                      data=submission_data):
                raise RuntimeError(
                    "Submission file (%s) did not pass validation: %s" %
                    (data_in, self._pretty_print_errors(
                        submission_file_validator.get_messages())))

        tables = []

        # validator for table data
        data_file_validator = DataFileValidator()

        for i in range(1, len(submission_data)):
            table_filepath = os.path.join(os.path.dirname(data_in),
                                          submission_data[i]['data_file'])
            if not os.path.exists(table_filepath):
                raise ValueError(
                    "table file: %s does not exist" % table_filepath)

            with open(table_filepath, 'r') as table_file:

                try:
                    # We try to load using the CLoader for speed improvements.
                    table_data = yaml.load(table_file, Loader=yaml.CSafeLoader)
                except Exception:  # pragma: no cover
                    table_data = yaml.load(table_file)  # pragma: no cover

                if not data_file_validator.validate(data=table_data,
                                                    file_path=table_filepath):
                    raise RuntimeError(
                        "Data file (%s) did not pass validation: %s" %
                        (table_filepath, self._pretty_print_errors(
                            data_file_validator.get_messages())))

                table = Table(index=i, metadata=submission_data[i],
                              data=table_data)
                tables.append(table)

        return ParsedData(submission_data[0], tables)
Code Example #10
    def parse(self, data_in, *args, **kwargs):
        """
        :param data_in: path to submission.yaml
        :param args:
        :param kwargs:
        :raise ValueError:
        """
        if not os.path.exists(data_in):
            raise ValueError("File / Directory does not exist: %s" % data_in)

        if os.path.isdir(data_in):
            submission_filepath = os.path.join(data_in, 'submission.yaml')
            if not os.path.exists(submission_filepath):
                submission_filepath = os.path.join(data_in, 'submission.yml')
                if not os.path.exists(submission_filepath):
                    raise ValueError("No submission file in %s" % data_in)
            data_in = submission_filepath

        # first validate submission file:
        with open(data_in, 'r') as submission_file:
            submission_data = list(
                yaml.load_all(submission_file, Loader=Loader))

            if len(submission_data) == 0:
                raise RuntimeError("Submission file (%s) is empty" % data_in)

            submission_file_validator = SubmissionFileValidator(
                schema_version=self.validator_schema_version)
            if not submission_file_validator.validate(file_path=data_in,
                                                      data=submission_data):
                raise RuntimeError(
                    "Submission file (%s) did not pass validation: %s" %
                    (data_in,
                     self._pretty_print_errors(
                         submission_file_validator.get_messages())))

        metadata = {}
        tables = []

        # validator for table data
        data_file_validator = DataFileValidator(
            schema_version=self.validator_schema_version)

        index = 0
        for i in range(0, len(submission_data)):
            if not submission_data[i]:  # empty YAML document
                continue
            if 'data_file' not in submission_data[i]:
                metadata = submission_data[i]  # information about whole submission
                continue
            table_filepath = os.path.join(os.path.dirname(data_in),
                                          submission_data[i]['data_file'])
            if not os.path.exists(table_filepath):
                raise ValueError("table file: %s does not exist" %
                                 table_filepath)

            with open(table_filepath, 'r') as table_file:

                table_data = yaml.load(table_file, Loader=Loader)

                if not data_file_validator.validate(data=table_data,
                                                    file_path=table_filepath):
                    raise RuntimeError(
                        "Data file (%s) did not pass validation: %s" %
                        (table_filepath,
                         self._pretty_print_errors(
                             data_file_validator.get_messages())))

                index = index + 1
                table = Table(index=index,
                              metadata=submission_data[i],
                              data=table_data)
                tables.append(table)

        return ParsedData(metadata, tables)
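
For context, a hedged driver for the parse() method above. The enclosing
parser class is not shown in this snippet, so the name YamlParser and its
no-argument constructor are assumptions for illustration only:

# Hypothetical usage of the parse() method defined above.
parser = YamlParser()
try:
    parsed = parser.parse('path/to/submission_directory')
except (ValueError, RuntimeError) as err:
    # ValueError covers missing files; RuntimeError covers validation
    # failures, matching the exceptions raised in the method body.
    print('Parsing failed: %s' % err)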
Code Example #11
from hepdata_validator.data_file_validator import DataFileValidator
import argparse

parser = argparse.ArgumentParser(description='Validate yaml files.')
parser.add_argument('-filename', dest='filename', type=str, help='file to check')

args = parser.parse_args()

data_file_validator = DataFileValidator()

# the validate method takes a string representing the file path.
data_file_validator.validate(file_path=args.filename)

# if there are any error messages, they are retrievable through this call
data_file_validator.get_messages()

# the error messages for the checked file can be printed
data_file_validator.print_errors(args.filename)
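
A small follow-up sketch showing how the script above could report a proper
exit status; it only assumes that validate() returns a boolean, which the
test examples elsewhere in this section rely on:

import sys

# Exit non-zero on failure so the script can be used in CI pipelines
# or shell conditionals.
if not data_file_validator.validate(file_path=args.filename):
    data_file_validator.print_errors(args.filename)
    sys.exit(1)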
Code Example #12
def process_submission_directory(basepath,
                                 submission_file_path,
                                 recid,
                                 update=False,
                                 *args,
                                 **kwargs):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.
    :param basepath:
    :param submission_file_path:
    :param recid:
    :param update:
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:
        submission_file = open(submission_file_path, 'r')

        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        data_file_validator = DataFileValidator()

        if is_valid_submission_file:
            try:
                submission_processed = yaml.load_all(submission_file,
                                                     Loader=yaml.CSafeLoader)
            except Exception:
                submission_processed = yaml.safe_load_all(submission_file)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                # No existing submission to copy fields from, so create (and
                # keep a reference to) a fresh first version.
                hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=None,
                    coordinator=kwargs.get('user_id') if 'user_id'
                    in kwargs else int(current_user.get_id()),
                    version=1)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=hepsubmission.inspire_id,
                    coordinator=hepsubmission.coordinator,
                    version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission, update)

            for yaml_document in submission_processed:
                if 'record_ids' in yaml_document or 'comment' in yaml_document or 'modifications' in yaml_document:
                    # comments are only present in the general submission
                    # information document.
                    process_general_submission_info(basepath, yaml_document,
                                                    recid)
                else:
                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)

                    added_file_names.append(yaml_document["name"])

                    if existing_datasubmission_query.count() == 0:
                        datasubmission = DataSubmission(
                            publication_recid=recid,
                            name=encode_string(yaml_document["name"]),
                            description=encode_string(
                                yaml_document["description"]),
                            version=hepsubmission.version)

                    else:
                        datasubmission = existing_datasubmission_query.one()
                        datasubmission.description = encode_string(
                            yaml_document["description"])

                    db.session.add(datasubmission)

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    if data_file_validator.validate(file_path=main_file_path):
                        process_data_file(recid, hepsubmission.version,
                                          basepath, yaml_document,
                                          datasubmission, main_file_path)
                    else:
                        errors = process_validation_errors_for_display(
                            data_file_validator.get_messages())

                        data_file_validator.clear_messages()

            cleanup_submission(recid, hepsubmission.version, added_file_names)

            db.session.commit()

            if len(errors) == 0:
                package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(recid, hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)
        else:
            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())

            submission_file_validator.clear_messages()
            data_file_validator.clear_messages()
    else:
        # return an error
        errors = {
            "submission.yaml": [{
                "level": "error",
                "message": "No submission.yaml file found in submission."
            }]
        }
        return errors

    # we return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found in one file.
    return errors
Code Example #13
def process_submission_directory(basepath,
                                 submission_file_path,
                                 recid,
                                 update=False,
                                 *args,
                                 **kwargs):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.

    :param basepath:
    :param submission_file_path:
    :param recid:
    :param update:
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:

        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        if is_valid_submission_file:

            submission_file = open(submission_file_path, 'r')
            submission_processed = yaml.load_all(submission_file,
                                                 Loader=Loader)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                # No existing submission to copy fields from, so create (and
                # keep a reference to) a fresh first version.
                hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=None,
                    coordinator=kwargs.get('user_id') if 'user_id'
                    in kwargs else int(current_user.get_id()),
                    version=1)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=hepsubmission.inspire_id,
                    coordinator=hepsubmission.coordinator,
                    version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission, update)

            no_general_submission_info = True

            data_file_validator = DataFileValidator()

            # Delete all data records associated with this submission.
            # Fixes problems with ordering where the table names are changed between uploads.
            # See https://github.com/HEPData/hepdata/issues/112
            # Side effect that reviews will be deleted between uploads.
            cleanup_submission(recid, hepsubmission.version, added_file_names)

            for yaml_document_index, yaml_document in enumerate(
                    submission_processed):
                if not yaml_document:
                    continue

                # Check for presence of local files given as additional_resources.
                if 'additional_resources' in yaml_document:
                    for resource in yaml_document['additional_resources']:
                        location = os.path.join(basepath, resource['location'])
                        if not resource['location'].startswith(
                            ('http', '/resource/')):
                            if not os.path.isfile(location):
                                errors[resource['location']] = [{
                                    "level": "error",
                                    "message": "Missing 'additional_resources' file from uploaded archive."
                                }]
                            elif '/' in resource['location']:
                                errors[resource['location']] = [{
                                    "level": "error",
                                    "message": "Location of 'additional_resources' file should not contain '/'."
                                }]

                if not yaml_document_index and 'name' not in yaml_document:

                    no_general_submission_info = False
                    process_general_submission_info(basepath, yaml_document,
                                                    recid)

                elif not all(k in yaml_document
                             for k in ('name', 'description', 'keywords',
                                       'data_file')):

                    errors["submission.yaml"] = [{
                        "level":
                        "error",
                        "message":
                        "YAML document with index {} ".format(
                            yaml_document_index) +
                        "missing one or more required keys (name, description, keywords, data_file)."
                    }]

                else:

                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)

                    added_file_names.append(yaml_document["name"])

                    try:
                        if existing_datasubmission_query.count() == 0:
                            datasubmission = DataSubmission(
                                publication_recid=recid,
                                name=encode_string(yaml_document["name"]),
                                description=encode_string(
                                    yaml_document["description"]),
                                version=hepsubmission.version)
                        else:
                            datasubmission = existing_datasubmission_query.one()
                            datasubmission.description = encode_string(
                                yaml_document["description"])
                        db.session.add(datasubmission)
                    except SQLAlchemyError as sqlex:
                        errors[yaml_document["data_file"]] = [{
                            "level":
                            "error",
                            "message":
                            str(sqlex)
                        }]
                        db.session.rollback()
                        continue

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    data, ex = _eos_fix_read_data(main_file_path)

                    if not data or ex is not None:

                        errors[yaml_document["data_file"]] = \
                            [{"level": "error", "message": "There was a problem parsing the file.\n" + str(ex)}]

                    elif '/' in yaml_document["data_file"]:

                        errors[yaml_document["data_file"]] = \
                            [{"level": "error", "message": "Name of data_file should not contain '/'.\n"}]

                    else:

                        if data_file_validator.validate(
                                file_path=main_file_path, data=data):
                            try:
                                process_data_file(recid, hepsubmission.version,
                                                  basepath, yaml_document,
                                                  datasubmission,
                                                  main_file_path)
                            except SQLAlchemyError as sqlex:
                                errors[yaml_document["data_file"]] = [{
                                    "level":
                                    "error",
                                    "message":
                                    "There was a problem processing the file.\n"
                                    + str(sqlex)
                                }]
                                db.session.rollback()
                        else:
                            errors = process_validation_errors_for_display(
                                data_file_validator.get_messages())
                            data_file_validator.clear_messages()

                        if yaml_document["data_file"] not in errors:
                            # Check that the length of the 'values' list is consistent
                            # for each of the independent_variables and dependent_variables.
                            indep_count = [
                                len(indep['values'])
                                for indep in data['independent_variables']
                            ]
                            dep_count = [
                                len(dep['values'])
                                for dep in data['dependent_variables']
                            ]
                            if len(set(indep_count + dep_count)) > 1:
                                # more than one unique 'values' length
                                errors.setdefault(
                                    yaml_document["data_file"], []
                                ).append({
                                    "level": "error",
                                    "message": "Inconsistent length of 'values' "
                                               "list:\nindependent_variables{}, "
                                               "dependent_variables{}".format(
                                                   indep_count, dep_count)
                                })

            submission_file.close()

            if no_general_submission_info:
                hepsubmission.last_updated = datetime.now()
                db.session.add(hepsubmission)
                db.session.commit()

            # The line below is commented out since it does not preserve the order of tables.
            # Delete all tables above instead: side effect of deleting reviews between uploads.
            #cleanup_submission(recid, hepsubmission.version, added_file_names)

            db.session.commit()

            if len(errors) == 0:
                errors = package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(
                    publication_recid=recid, version=hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)

            else:  # delete all tables if errors
                cleanup_submission(recid, hepsubmission.version, {})

        else:

            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())
            submission_file_validator.clear_messages()

    else:
        # return an error
        errors = {
            "submission.yaml": [{
                "level": "error",
                "message": "No submission.yaml file found in submission."
            }]
        }
        return errors

    # we return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found in one file.
    return errors
Code Example #14
                print('%s should not contain "/".' % doc['data_file'])
                continue

            # Extract data file from YAML document.
            data_file_path = (directory + '/' + doc['data_file']
                              if directory else doc['data_file'])

            # Just try to load YAML data file without validating schema.
            # Script will terminate with an exception if there is a problem.
            with open(data_file_path, 'r') as data_file:
                contents = yaml.load(data_file, Loader=Loader)

            # Validate the YAML data file if validator imported.
            if not validator_imported:
                print('%s is valid YAML.' % data_file_path)
            else:
                data_file_validator = DataFileValidator()
                is_valid_data_file = data_file_validator.validate(
                    file_path=data_file_path, data=contents)
                if not is_valid_data_file:
                    print('%s is invalid HEPData YAML.' % data_file_path)
                    data_file_validator.print_errors(data_file_path)
                else:
                    # Check that the length of the 'values' list is consistent for
                    # each of the independent_variables and dependent_variables.
                    indep_count = [
                        len(indep['values'])
                        for indep in contents['independent_variables']
                    ]
                    dep_count = [
                        len(dep['values'])
                        for dep in contents['dependent_variables']
                    ]
Code Example #15
class DataValidationTest(unittest.TestCase):
    validator = None

    def setUp(self):
        self.validator = DataFileValidator()
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

        self.invalid_file_yaml = os.path.join(
            self.base_dir,
            'test_data/invalid_file.yaml'
        )

        self.valid_file_yaml = os.path.join(
            self.base_dir,
            'test_data/valid_file.yaml'
        )

        self.valid_file_json = os.path.join(
            self.base_dir,
            'test_data/valid_file.json'
        )

        self.invalid_file_json = os.path.join(
            self.base_dir,
            'test_data/invalid_file.json')

        self.valid_file_error_percent_yaml = os.path.join(
            self.base_dir,
            'test_data/valid_data_with_error.yaml'
        )

        self.invalid_syntax_data_file = os.path.join(
            self.base_dir,
            'test_data/invalid_data_file.yaml'
        )

        self.valid_custom_file = os.path.join(
            self.base_dir,
            'test_data/valid_file_custom.yaml')

    def test_valid_yaml_file(self):
        print('___DATA_VALIDATION: Testing valid yaml submission___')
        is_valid = self.validator.validate(file_path=self.valid_file_yaml)
        self.validator.print_errors(self.valid_file_yaml)
        self.assertEqual(is_valid, True)

    def test_invalid_yaml_file(self):
        print('___DATA_VALIDATION: Testing invalid yaml submission___')
        self.assertEqual(self.validator.validate(file_path=self.invalid_file_yaml),
                         False)

        self.validator.print_errors(self.invalid_file_yaml)

    def test_valid_file_with_percent_errors(self):
        print('___DATA_VALIDATION: Testing valid yaml percent error ___')
        self.assertEqual(self.validator.validate(file_path=self.valid_file_error_percent_yaml),
                         False)
        self.validator.print_errors(self.valid_file_error_percent_yaml)

    def test_valid_json_file(self):
        print('___DATA_VALIDATION: Testing valid json submission___')
        is_valid = self.validator.validate(file_path=self.valid_file_json)
        self.validator.print_errors(self.valid_file_json)
        self.assertEqual(is_valid, True)

        self.validator.print_errors(self.valid_file_json)

    def test_invalid_json_file(self):
        print('___DATA_VALIDATION: Testing invalid json submission___')
        self.assertEqual(self.validator.validate(file_path=self.invalid_file_json),
                         False)
        self.validator.print_errors(self.invalid_file_json)

    def test_load_data_with_custom_data_type(self):
        self.validator = DataFileValidator()
        custom_schema_path = os.path.join(self.base_dir, 'test_data/custom_data_schema.json')
        self.validator.load_custom_schema('different', custom_schema_path)

        self.assertTrue('different' in self.validator.custom_data_schemas)

        self.assertTrue(self.validator.validate(file_path=self.valid_custom_file))

    def test_load_invalid_custom_schema(self):
        self.validator.custom_data_schemas = {}
        print('Loading invalid schema')
        try:
            self.validator.load_custom_schema('different')
            self.fail("UnsupportedDataSchemaException was not raised")
        except UnsupportedDataSchemaException as udse:
            self.assertTrue(udse.message == "There is no schema defined for the 'different' data type.")
            self.assertTrue(udse.message == udse.__unicode__())

    def test_load_invalid_data_file(self):

        print('Loading invalid data file')

        self.assertFalse(self.validator.validate(file_path=self.invalid_syntax_data_file))

        self.assertTrue(self.validator.has_errors(self.invalid_syntax_data_file))
        self.assertTrue(len(self.validator.get_messages(self.invalid_syntax_data_file)) == 1)
        self.validator.print_errors(self.invalid_syntax_data_file)
        for message in self.validator.get_messages(self.invalid_syntax_data_file):
            self.assertTrue(message.message.index("There was a problem parsing the file.") == 0)
Code Example #16
def validator_v1():
    return DataFileValidator(schema_version='1.0.0')
Code Example #17
class DataValidationTest(unittest.TestCase):
    validator = None

    def setUp(self):
        self.validator = DataFileValidator()
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

        self.invalid_file_yaml = os.path.join(self.base_dir,
                                              'test_data/invalid_file.yaml')

        self.valid_file_yaml = os.path.join(self.base_dir,
                                            'test_data/valid_file.yaml')

        self.valid_file_json = os.path.join(self.base_dir,
                                            'test_data/valid_file.json')

        self.invalid_file_json = os.path.join(self.base_dir,
                                              'test_data/invalid_file.json')

        self.valid_file_error_percent_yaml = os.path.join(
            self.base_dir, 'test_data/valid_data_with_error.yaml')

        self.invalid_syntax_data_file = os.path.join(
            self.base_dir, 'test_data/invalid_data_file.yaml')

        self.invalid_parser_file = os.path.join(
            self.base_dir, 'test_data/invalid_parser_file.yaml')

        self.valid_custom_file = os.path.join(
            self.base_dir, 'test_data/valid_file_custom.yaml')

    def test_no_file_path_supplied(self):
        with self.assertRaises(LookupError):
            self.validator.validate(file_path=None)

    def test_valid_yaml_file(self):
        print('___DATA_VALIDATION: Testing valid yaml submission___')
        is_valid = self.validator.validate(file_path=self.valid_file_yaml)
        self.validator.print_errors(self.valid_file_yaml)
        self.assertEqual(is_valid, True)

    def test_invalid_yaml_file(self):
        print('___DATA_VALIDATION: Testing invalid yaml submission___')
        self.assertEqual(
            self.validator.validate(file_path=self.invalid_file_yaml), False)

        self.validator.print_errors(self.invalid_file_yaml)

    def test_valid_file_with_percent_errors(self):
        print('___DATA_VALIDATION: Testing valid yaml percent error ___')
        self.assertEqual(
            self.validator.validate(
                file_path=self.valid_file_error_percent_yaml), False)
        self.validator.print_errors(self.valid_file_error_percent_yaml)

    def test_valid_json_file(self):
        print('___DATA_VALIDATION: Testing valid json submission___')
        is_valid = self.validator.validate(file_path=self.valid_file_json)
        self.validator.print_errors(self.valid_file_json)
        self.assertEqual(is_valid, True)

        self.validator.print_errors(self.valid_file_json)

    def test_invalid_json_file(self):
        print('___DATA_VALIDATION: Testing invalid json submission___')
        self.assertEqual(
            self.validator.validate(file_path=self.invalid_file_json), False)
        self.validator.print_errors(self.invalid_file_json)

    def test_load_data_with_custom_data_type(self):
        self.validator = DataFileValidator()
        custom_schema_path = os.path.join(self.base_dir,
                                          'test_data/custom_data_schema.json')
        self.validator.load_custom_schema('different', custom_schema_path)

        self.assertTrue('different' in self.validator.custom_data_schemas)

        self.assertTrue(
            self.validator.validate(file_path=self.valid_custom_file))

    def test_load_invalid_custom_schema(self):
        self.validator.custom_data_schemas = {}
        print('Loading invalid schema')
        try:
            self.validator.load_custom_schema('different')
            self.fail("UnsupportedDataSchemaException was not raised")
        except UnsupportedDataSchemaException as udse:
            self.assertTrue(
                udse.message ==
                "There is no schema defined for the 'different' data type.")
            self.assertTrue(udse.message == udse.__unicode__())

    def test_load_invalid_data_file(self):

        print('Loading invalid data file')

        self.assertFalse(
            self.validator.validate(file_path=self.invalid_syntax_data_file))

        self.assertTrue(
            self.validator.has_errors(self.invalid_syntax_data_file))
        self.assertTrue(
            len(self.validator.get_messages(self.invalid_syntax_data_file)) ==
            1)
        self.validator.print_errors(self.invalid_syntax_data_file)
        for message in self.validator.get_messages(
                self.invalid_syntax_data_file):
            self.assertTrue(
                message.message.index("There was a problem parsing the file.")
                == 0)

    def test_invalid_parser_yaml_file(self):
        print('___DATA_VALIDATION: Testing invalid parser yaml submission___')
        self.assertEqual(
            self.validator.validate(file_path=self.invalid_parser_file), False)

        self.validator.print_errors(self.invalid_parser_file)

    def test_ioerror_yaml_file(self):
        print('___DATA_VALIDATION: Testing ioerror yaml submission___')
        self.assertEqual(
            self.validator.validate(file_path=self.valid_file_yaml[:-1]),
            False)

        self.validator.print_errors(self.valid_file_yaml[:-1])
Code Example #18
File: submission.py  Project: HEPData/hepdata3
def process_submission_directory(basepath, submission_file_path, recid, update=False, *args, **kwargs):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.
    :param basepath:
    :param submission_file_path:
    :param recid:
    :param update:
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:
        submission_file = open(submission_file_path, 'r')

        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        data_file_validator = DataFileValidator()

        if is_valid_submission_file:
            try:
                submission_processed = yaml.load_all(submission_file, Loader=yaml.CSafeLoader)
            except Exception:
                submission_processed = yaml.safe_load_all(submission_file)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                # No existing submission to copy fields from, so create (and
                # keep a reference to) a fresh first version.
                hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=None,
                    coordinator=kwargs.get('user_id') if 'user_id' in kwargs else int(current_user.get_id()),
                    version=1)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(publication_recid=recid,
                                                   overall_status='todo',
                                                   inspire_id=hepsubmission.inspire_id,
                                                   coordinator=hepsubmission.coordinator,
                                                   version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission)

            for yaml_document in submission_processed:
                if 'record_ids' in yaml_document or 'comment' in yaml_document or 'modifications' in yaml_document:
                    # comments are only present in the general submission
                    # information document.
                    process_general_submission_info(basepath, yaml_document, recid)
                else:
                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)

                    added_file_names.append(yaml_document["name"])

                    if existing_datasubmission_query.count() == 0:
                        datasubmission = DataSubmission(
                            publication_recid=recid,
                            name=encode_string(yaml_document["name"]),
                            description=encode_string(
                                yaml_document["description"]),
                            version=hepsubmission.version)

                    else:
                        datasubmission = existing_datasubmission_query.one()
                        datasubmission.description = encode_string(
                            yaml_document["description"])

                    db.session.add(datasubmission)

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    if data_file_validator.validate(file_path=main_file_path):
                        process_data_file(recid, hepsubmission.version, basepath, yaml_document,
                                          datasubmission, main_file_path)
                    else:
                        errors = process_validation_errors_for_display(
                            data_file_validator.get_messages())

                        data_file_validator.clear_messages()

            cleanup_submission(recid, hepsubmission.version,
                               added_file_names)

            db.session.commit()

            if len(errors) == 0:
                package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(recid, hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)
        else:
            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())

            submission_file_validator.clear_messages()
            data_file_validator.clear_messages()
    else:
        # return an error
        errors = {"submission.yaml": [
            {"level": "error",
             "message": "No submission.yaml file found in submission."}
        ]}
        return errors

    # we return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found in one file.
    return errors