def setUp(self):
    self.validator = DataFileValidator()
    self.base_dir = os.path.dirname(os.path.realpath(__file__))
    self.invalid_file_yaml = os.path.join(
        self.base_dir, 'test_data/invalid_file.yaml')
    self.valid_file_yaml = os.path.join(
        self.base_dir, 'test_data/valid_file.yaml')
    self.valid_file_json = os.path.join(
        self.base_dir, 'test_data/valid_file.json')
    self.invalid_file_json = os.path.join(
        self.base_dir, 'test_data/invalid_file.json')
    self.valid_file_error_percent_yaml = os.path.join(
        self.base_dir, 'test_data/valid_data_with_error.yaml')
    self.invalid_syntax_data_file = os.path.join(
        self.base_dir, 'test_data/invalid_data_file.yaml')
    self.invalid_parser_file = os.path.join(
        self.base_dir, 'test_data/invalid_parser_file.yaml')
    self.valid_custom_file = os.path.join(
        self.base_dir, 'test_data/valid_file_custom.yaml')
def test_load_data_with_custom_data_type(self):
    self.validator = DataFileValidator()
    custom_schema_path = os.path.join(
        self.base_dir, 'test_data/custom_data_schema.json')
    self.validator.load_custom_schema('different', custom_schema_path)
    self.assertTrue('different' in self.validator.custom_data_schemas)
    self.assertTrue(
        self.validator.validate(file_path=self.valid_custom_file))
def test_invalid_schema_version():
    """
    Tests the DataFileValidator creation with an invalid schema version
    """
    with pytest.raises(ValueError) as excinfo:
        validator = DataFileValidator(schema_version='0.9999.99')
    assert "Invalid schema version 0.9999.99" == str(excinfo.value)
def test_invalid_schema_file():
    # Fudge the schema versions constant so we can check the file check works
    VALID_SCHEMA_VERSIONS.append('0.9999.9999')
    try:
        with pytest.raises(ValueError) as excinfo:
            validator = DataFileValidator(schema_version='0.9999.9999')
        assert "Invalid schema file" in str(excinfo.value)
    finally:
        VALID_SCHEMA_VERSIONS.pop()
def get_data_validator(old_schema):
    """
    Returns a DataFileValidator object (with remotely defined schemas loaded).

    :param old_schema: whether the schema version for the data file is 0.1.0
    :return: DataFileValidator object
    """
    global CACHED_DATA_VALIDATOR

    # Use for YAML files migrated from the old HepData site
    if old_schema:
        data_validator = DataFileValidator(schema_version='0.1.0')
    elif CACHED_DATA_VALIDATOR:
        data_validator = CACHED_DATA_VALIDATOR
    else:
        data_validator = DataFileValidator()
        load_remote_schemas(data_validator)
        CACHED_DATA_VALIDATOR = data_validator

    return data_validator
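A minimal usage sketch for the helper above, assuming it runs in the same module (the `global` statement makes the cache module-level), that CACHED_DATA_VALIDATOR starts as None, and that load_remote_schemas can reach its schema host:

CACHED_DATA_VALIDATOR = None  # assumed initial state of the module-level cache

validator = get_data_validator(old_schema=False)          # built once, then cached
assert get_data_validator(old_schema=False) is validator  # cached instance reused
legacy = get_data_validator(old_schema=True)              # 0.1.0 validator, never cached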
def setUp(self):
    self.validator = DataFileValidator()
    self.base_dir = os.path.dirname(os.path.realpath(__file__))
    self.invalid_file_yaml = os.path.join(
        self.base_dir, 'test_data/invalid_file.yaml')
    self.valid_file_yaml = os.path.join(
        self.base_dir, 'test_data/valid_file.yaml')
    self.valid_file_json = os.path.join(
        self.base_dir, 'test_data/valid_file.json')
    self.invalid_file_json = os.path.join(
        self.base_dir, 'test_data/invalid_file.json')
    self.valid_file_error_percent_yaml = os.path.join(
        self.base_dir, 'test_data/valid_data_with_error.yaml')
    self.invalid_syntax_data_file = os.path.join(
        self.base_dir, 'test_data/invalid_data_file.yaml')
    self.valid_custom_file = os.path.join(
        self.base_dir, 'test_data/valid_file_custom.yaml')
def validator_v0():
    return DataFileValidator(schema_version='0.1.0')
def parse(self, data_in, *args, **kwargs):
    """
    :param data_in: path to submission.yaml
    :param args:
    :param kwargs:
    :raise ValueError:
    """
    if not os.path.exists(data_in):
        raise ValueError("File / Directory does not exist: %s" % data_in)

    if os.path.isdir(data_in):
        submission_filepath = os.path.join(data_in, 'submission.yaml')
        if not os.path.exists(submission_filepath):
            submission_filepath = os.path.join(data_in, 'submission.yml')
            if not os.path.exists(submission_filepath):
                raise ValueError("No submission file in %s" % data_in)
        data_in = submission_filepath

    # first validate submission file:
    with open(data_in, 'r') as submission_file:
        try:
            # Try the faster C-based YAML loader first.
            submission_data = list(
                yaml.load_all(submission_file, Loader=yaml.CSafeLoader))
        except:  # pragma: no cover
            submission_data = list(
                yaml.load_all(submission_file))  # pragma: no cover

        if len(submission_data) == 0:
            raise RuntimeError("Submission file (%s) is empty" % data_in)

        submission_file_validator = SubmissionFileValidator()
        if not submission_file_validator.validate(file_path=data_in,
                                                  data=submission_data):
            raise RuntimeError(
                "Submission file (%s) did not pass validation: %s" %
                (data_in, self._pretty_print_errors(
                    submission_file_validator.get_messages())))

        tables = []
        # validator for table data
        data_file_validator = DataFileValidator()

        for i in range(1, len(submission_data)):
            table_filepath = os.path.join(os.path.dirname(data_in),
                                          submission_data[i]['data_file'])
            # Check the table file exists before trying to open it.
            if not os.path.exists(table_filepath):
                raise ValueError(
                    "table file: %s does not exist" % table_filepath)
            with open(table_filepath, 'r') as table_file:
                try:
                    # We try to load using the CLoader for speed improvements.
                    table_data = yaml.load(table_file,
                                           Loader=yaml.CSafeLoader)
                except:  # pragma: no cover
                    table_data = yaml.load(table_file)  # pragma: no cover

                if not data_file_validator.validate(
                        data=table_data, file_path=table_filepath):
                    raise RuntimeError(
                        "Data file (%s) did not pass validation: %s" %
                        (table_filepath, self._pretty_print_errors(
                            data_file_validator.get_messages())))

                table = Table(index=i, metadata=submission_data[i],
                              data=table_data)
                tables.append(table)

    return ParsedData(submission_data[0], tables)
def parse(self, data_in, *args, **kwargs):
    """
    :param data_in: path to submission.yaml
    :param args:
    :param kwargs:
    :raise ValueError:
    """
    if not os.path.exists(data_in):
        raise ValueError("File / Directory does not exist: %s" % data_in)

    if os.path.isdir(data_in):
        submission_filepath = os.path.join(data_in, 'submission.yaml')
        if not os.path.exists(submission_filepath):
            submission_filepath = os.path.join(data_in, 'submission.yml')
            if not os.path.exists(submission_filepath):
                raise ValueError("No submission file in %s" % data_in)
        data_in = submission_filepath

    # first validate submission file:
    with open(data_in, 'r') as submission_file:
        submission_data = list(yaml.load_all(submission_file, Loader=Loader))

        if len(submission_data) == 0:
            raise RuntimeError("Submission file (%s) is empty" % data_in)

        submission_file_validator = SubmissionFileValidator(
            schema_version=self.validator_schema_version)
        if not submission_file_validator.validate(file_path=data_in,
                                                  data=submission_data):
            raise RuntimeError(
                "Submission file (%s) did not pass validation: %s" %
                (data_in, self._pretty_print_errors(
                    submission_file_validator.get_messages())))

        metadata = {}
        tables = []
        # validator for table data
        data_file_validator = DataFileValidator(
            schema_version=self.validator_schema_version)

        index = 0
        for i in range(0, len(submission_data)):
            if not submission_data[i]:  # empty YAML document
                continue
            if 'data_file' not in submission_data[i]:
                # information about whole submission
                metadata = submission_data[i]
                continue

            table_filepath = os.path.join(os.path.dirname(data_in),
                                          submission_data[i]['data_file'])
            # Check the table file exists before trying to open it.
            if not os.path.exists(table_filepath):
                raise ValueError(
                    "table file: %s does not exist" % table_filepath)
            with open(table_filepath, 'r') as table_file:
                table_data = yaml.load(table_file, Loader=Loader)

                if not data_file_validator.validate(
                        data=table_data, file_path=table_filepath):
                    raise RuntimeError(
                        "Data file (%s) did not pass validation: %s" %
                        (table_filepath, self._pretty_print_errors(
                            data_file_validator.get_messages())))

                index += 1
                table = Table(index=index, metadata=submission_data[i],
                              data=table_data)
                tables.append(table)

    return ParsedData(metadata, tables)
import argparse

from hepdata_validator.data_file_validator import DataFileValidator

parser = argparse.ArgumentParser(description='Validate yaml files.')
parser.add_argument('-filename', dest='filename', type=str,
                    help='file to check')
args = parser.parse_args()

data_file_validator = DataFileValidator()

# the validate method takes a string representing the file path.
data_file_validator.validate(file_path=args.filename)

# if there are any error messages, they are retrievable through this call.
data_file_validator.get_messages()

# the error messages can be printed; they are keyed by the path that
# was validated, so pass the same path given to validate.
data_file_validator.print_errors(args.filename)
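If the script should also signal failure through its exit status, a minimal hedged extension of the snippet above (the exit-code convention is an assumption, not part of the original) is:

import sys

if not data_file_validator.validate(file_path=args.filename):
    # print_errors reports the messages recorded for the validated path.
    data_file_validator.print_errors(args.filename)
    sys.exit(1)
print('%s is valid.' % args.filename)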
def process_submission_directory(basepath, submission_file_path, recid,
                                 update=False, *args, **kwargs):
    """
    Goes through an entire submission directory and processes the files
    within to create DataSubmissions with the files and related material
    attached as DataResources.

    :param basepath:
    :param submission_file_path:
    :param recid:
    :param update:
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:
        submission_file = open(submission_file_path, 'r')
        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)
        data_file_validator = DataFileValidator()

        if is_valid_submission_file:
            try:
                # Try the faster C-based YAML loader first.
                submission_processed = yaml.load_all(
                    submission_file, Loader=yaml.CSafeLoader)
            except:
                submission_processed = yaml.safe_load_all(submission_file)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                HEPSubmission(publication_recid=recid,
                              overall_status='todo',
                              inspire_id=hepsubmission.inspire_id,
                              coordinator=kwargs.get('user_id')
                              if 'user_id' in kwargs
                              else int(current_user.get_id()),
                              version=hepsubmission.version + 1)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=hepsubmission.inspire_id,
                    coordinator=hepsubmission.coordinator,
                    version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission, update)

            for yaml_document in submission_processed:
                if 'record_ids' in yaml_document \
                        or 'comment' in yaml_document \
                        or 'modifications' in yaml_document:
                    # comments are only present in the general submission
                    # information document.
                    process_general_submission_info(basepath, yaml_document,
                                                    recid)
                else:
                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)
                    added_file_names.append(yaml_document["name"])

                    if existing_datasubmission_query.count() == 0:
                        datasubmission = DataSubmission(
                            publication_recid=recid,
                            name=encode_string(yaml_document["name"]),
                            description=encode_string(
                                yaml_document["description"]),
                            version=hepsubmission.version)
                    else:
                        datasubmission = existing_datasubmission_query.one()
                        datasubmission.description = encode_string(
                            yaml_document["description"])

                    db.session.add(datasubmission)

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    if data_file_validator.validate(file_path=main_file_path):
                        process_data_file(recid, hepsubmission.version,
                                          basepath, yaml_document,
                                          datasubmission, main_file_path)
                    else:
                        errors = process_validation_errors_for_display(
                            data_file_validator.get_messages())
                        data_file_validator.clear_messages()

            cleanup_submission(recid, hepsubmission.version, added_file_names)

            db.session.commit()

            if len(errors) == 0:
                package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(recid,
                                                  hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)
        else:
            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())
            submission_file_validator.clear_messages()
            data_file_validator.clear_messages()
    else:
        # return an error
        errors = {
            "submission.yaml": [{
                "level": "error",
                "message": "No submission.yaml file found in submission."
            }]
        }

    # we return all the errors collectively. This makes more sense than
    # returning errors as soon as problems are found on one file.
    return errors
def process_submission_directory(basepath, submission_file_path, recid,
                                 update=False, *args, **kwargs):
    """
    Goes through an entire submission directory and processes the files
    within to create DataSubmissions with the files and related material
    attached as DataResources.

    :param basepath:
    :param submission_file_path:
    :param recid:
    :param update:
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:
        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        if is_valid_submission_file:
            submission_file = open(submission_file_path, 'r')
            submission_processed = yaml.load_all(submission_file,
                                                 Loader=Loader)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                HEPSubmission(publication_recid=recid,
                              overall_status='todo',
                              inspire_id=hepsubmission.inspire_id,
                              coordinator=kwargs.get('user_id')
                              if 'user_id' in kwargs
                              else int(current_user.get_id()),
                              version=hepsubmission.version + 1)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=hepsubmission.inspire_id,
                    coordinator=hepsubmission.coordinator,
                    version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission, update)

            no_general_submission_info = True

            data_file_validator = DataFileValidator()

            # Delete all data records associated with this submission.
            # Fixes problems with ordering where the table names are changed
            # between uploads.
            # See https://github.com/HEPData/hepdata/issues/112
            # Side effect that reviews will be deleted between uploads.
            cleanup_submission(recid, hepsubmission.version, added_file_names)

            for yaml_document_index, yaml_document in enumerate(
                    submission_processed):
                if not yaml_document:
                    continue

                # Check for presence of local files given as
                # additional_resources.
                if 'additional_resources' in yaml_document:
                    for resource in yaml_document['additional_resources']:
                        location = os.path.join(basepath,
                                                resource['location'])
                        if not resource['location'].startswith(
                                ('http', '/resource/')):
                            if not os.path.isfile(location):
                                errors[resource['location']] = [{
                                    "level": "error",
                                    "message": "Missing 'additional_resources'"
                                               " file from uploaded archive."
                                }]
                            elif '/' in resource['location']:
                                errors[resource['location']] = [{
                                    "level": "error",
                                    "message": "Location of "
                                               "'additional_resources' file "
                                               "should not contain '/'."
                                }]

                if not yaml_document_index and 'name' not in yaml_document:
                    no_general_submission_info = False
                    process_general_submission_info(basepath, yaml_document,
                                                    recid)
                elif not all(k in yaml_document for k in
                             ('name', 'description', 'keywords',
                              'data_file')):
                    errors["submission.yaml"] = [{
                        "level": "error",
                        "message": "YAML document with index {} missing one "
                                   "or more required keys (name, description,"
                                   " keywords, data_file).".format(
                                       yaml_document_index)
                    }]
                else:
                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)
                    added_file_names.append(yaml_document["name"])

                    try:
                        if existing_datasubmission_query.count() == 0:
                            datasubmission = DataSubmission(
                                publication_recid=recid,
                                name=encode_string(yaml_document["name"]),
                                description=encode_string(
                                    yaml_document["description"]),
                                version=hepsubmission.version)
                        else:
                            datasubmission = \
                                existing_datasubmission_query.one()
                            datasubmission.description = encode_string(
                                yaml_document["description"])
                        db.session.add(datasubmission)
                    except SQLAlchemyError as sqlex:
                        errors[yaml_document["data_file"]] = [{
                            "level": "error",
                            "message": str(sqlex)
                        }]
                        db.session.rollback()
                        continue

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])
                    data, ex = _eos_fix_read_data(main_file_path)

                    if not data or ex is not None:
                        errors[yaml_document["data_file"]] = [{
                            "level": "error",
                            "message": "There was a problem parsing the "
                                       "file.\n" + str(ex)
                        }]
                    elif '/' in yaml_document["data_file"]:
                        errors[yaml_document["data_file"]] = [{
                            "level": "error",
                            "message": "Name of data_file should not "
                                       "contain '/'.\n"
                        }]
                    else:
                        if data_file_validator.validate(
                                file_path=main_file_path, data=data):
                            try:
                                process_data_file(recid,
                                                  hepsubmission.version,
                                                  basepath, yaml_document,
                                                  datasubmission,
                                                  main_file_path)
                            except SQLAlchemyError as sqlex:
                                errors[yaml_document["data_file"]] = [{
                                    "level": "error",
                                    "message": "There was a problem "
                                               "processing the file.\n" +
                                               str(sqlex)
                                }]
                                db.session.rollback()
                        else:
                            errors = process_validation_errors_for_display(
                                data_file_validator.get_messages())
                            data_file_validator.clear_messages()

                        if yaml_document["data_file"] not in errors:
                            # Check that the length of the 'values' list is
                            # consistent for each of the
                            # independent_variables and dependent_variables.
                            indep_count = [
                                len(indep['values'])
                                for indep in data['independent_variables']
                            ]
                            dep_count = [
                                len(dep['values'])
                                for dep in data['dependent_variables']
                            ]
                            # if more than one unique count
                            if len(set(indep_count + dep_count)) > 1:
                                errors.setdefault(
                                    yaml_document["data_file"], []
                                ).append({
                                    "level": "error",
                                    "message":
                                        "Inconsistent length of 'values' "
                                        "list:\nindependent_variables{}, "
                                        "dependent_variables{}".format(
                                            str(indep_count),
                                            str(dep_count))
                                })

            submission_file.close()

            if no_general_submission_info:
                hepsubmission.last_updated = datetime.now()
                db.session.add(hepsubmission)
                db.session.commit()

            # The line below is commented out since it does not preserve the
            # order of tables. Delete all tables above instead: side effect
            # of deleting reviews between uploads.
            # cleanup_submission(recid, hepsubmission.version,
            #                    added_file_names)

            db.session.commit()

            if len(errors) == 0:
                errors = package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(
                    publication_recid=recid, version=hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)
            else:
                # delete all tables if errors
                cleanup_submission(recid, hepsubmission.version, {})
        else:
            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())
            submission_file_validator.clear_messages()
    else:
        # return an error
        errors = {
            "submission.yaml": [{
                "level": "error",
                "message": "No submission.yaml file found in submission."
            }]
        }

    # we return all the errors collectively. This makes more sense than
    # returning errors as soon as problems are found on one file.
    return errors
print('%s should not contain "/".' % doc['data_file'])
continue

# Extract data file from YAML document.
data_file_path = directory + '/' + doc['data_file'] if directory \
    else doc['data_file']

# Just try to load YAML data file without validating schema.
# Script will terminate with an exception if there is a problem.
contents = yaml.load(open(data_file_path, 'r'), Loader=Loader)

# Validate the YAML data file if validator imported.
if not validator_imported:
    print('%s is valid YAML.' % data_file_path)
else:
    data_file_validator = DataFileValidator()
    is_valid_data_file = data_file_validator.validate(
        file_path=data_file_path, data=contents)
    if not is_valid_data_file:
        print('%s is invalid HEPData YAML.' % data_file_path)
        data_file_validator.print_errors(data_file_path)
    else:
        # Check that the length of the 'values' list is consistent for
        # each of the independent_variables and dependent_variables.
        indep_count = [
            len(indep['values'])
            for indep in contents['independent_variables']
        ]
        dep_count = [
            len(dep['values'])
            for dep in contents['dependent_variables']
        ]
class DataValidationTest(unittest.TestCase):
    validator = None

    def setUp(self):
        self.validator = DataFileValidator()
        self.base_dir = os.path.dirname(os.path.realpath(__file__))
        self.invalid_file_yaml = os.path.join(
            self.base_dir, 'test_data/invalid_file.yaml')
        self.valid_file_yaml = os.path.join(
            self.base_dir, 'test_data/valid_file.yaml')
        self.valid_file_json = os.path.join(
            self.base_dir, 'test_data/valid_file.json')
        self.invalid_file_json = os.path.join(
            self.base_dir, 'test_data/invalid_file.json')
        self.valid_file_error_percent_yaml = os.path.join(
            self.base_dir, 'test_data/valid_data_with_error.yaml')
        self.invalid_syntax_data_file = os.path.join(
            self.base_dir, 'test_data/invalid_data_file.yaml')
        self.valid_custom_file = os.path.join(
            self.base_dir, 'test_data/valid_file_custom.yaml')

    def test_valid_yaml_file(self):
        print('___DATA_VALIDATION: Testing valid yaml submission___')
        is_valid = self.validator.validate(file_path=self.valid_file_yaml)
        self.validator.print_errors(self.valid_file_yaml)
        self.assertEqual(is_valid, True)

    def test_invalid_yaml_file(self):
        print('___DATA_VALIDATION: Testing invalid yaml submission___')
        self.assertEqual(
            self.validator.validate(file_path=self.invalid_file_yaml), False)
        self.validator.print_errors(self.invalid_file_yaml)

    def test_valid_file_with_percent_errors(self):
        print('___DATA_VALIDATION: Testing valid yaml percent error ___')
        self.assertEqual(
            self.validator.validate(
                file_path=self.valid_file_error_percent_yaml), False)
        self.validator.print_errors(self.valid_file_error_percent_yaml)

    def test_valid_json_file(self):
        print('___DATA_VALIDATION: Testing valid json submission___')
        is_valid = self.validator.validate(file_path=self.valid_file_json)
        self.validator.print_errors(self.valid_file_json)
        self.assertEqual(is_valid, True)

    def test_invalid_json_file(self):
        print('___DATA_VALIDATION: Testing invalid json submission___')
        self.assertEqual(
            self.validator.validate(file_path=self.invalid_file_json), False)
        self.validator.print_errors(self.invalid_file_json)

    def test_load_data_with_custom_data_type(self):
        self.validator = DataFileValidator()
        custom_schema_path = os.path.join(
            self.base_dir, 'test_data/custom_data_schema.json')
        self.validator.load_custom_schema('different', custom_schema_path)
        self.assertTrue('different' in self.validator.custom_data_schemas)
        self.assertTrue(
            self.validator.validate(file_path=self.valid_custom_file))

    def test_load_invalid_custom_schema(self):
        self.validator.custom_data_schemas = {}
        print('Loading invalid schema')
        try:
            self.validator.load_custom_schema('different')
        except UnsupportedDataSchemaException as udse:
            self.assertTrue(
                udse.message ==
                "There is no schema defined for the 'different' data type.")
            self.assertTrue(udse.message == udse.__unicode__())

    def test_load_invalid_data_file(self):
        print('Loading invalid data file')
        self.assertFalse(
            self.validator.validate(file_path=self.invalid_syntax_data_file))
        self.assertTrue(
            self.validator.has_errors(self.invalid_syntax_data_file))
        self.assertTrue(
            len(self.validator.get_messages(
                self.invalid_syntax_data_file)) == 1)
        self.validator.print_errors(self.invalid_syntax_data_file)
        for message in self.validator.get_messages(
                self.invalid_syntax_data_file):
            self.assertTrue(message.message.index(
                "There was a problem parsing the file.") == 0)
def validator_v1():
    return DataFileValidator(schema_version='1.0.0')
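A hedged sketch of how these version-pinned constructors might back a parametrised test; the test name and data file path are hypothetical, and only validate and print_errors (shown in the other snippets) are assumed:

import pytest

@pytest.mark.parametrize('make_validator', [validator_v0, validator_v1])
def test_valid_file_for_each_schema_version(make_validator):
    # Hypothetical data file path; any file valid under both schema
    # versions would do here.
    validator = make_validator()
    is_valid = validator.validate(file_path='test_data/valid_file.yaml')
    if not is_valid:
        validator.print_errors('test_data/valid_file.yaml')
    assert is_valid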
class DataValidationTest(unittest.TestCase):
    validator = None

    def setUp(self):
        self.validator = DataFileValidator()
        self.base_dir = os.path.dirname(os.path.realpath(__file__))
        self.invalid_file_yaml = os.path.join(
            self.base_dir, 'test_data/invalid_file.yaml')
        self.valid_file_yaml = os.path.join(
            self.base_dir, 'test_data/valid_file.yaml')
        self.valid_file_json = os.path.join(
            self.base_dir, 'test_data/valid_file.json')
        self.invalid_file_json = os.path.join(
            self.base_dir, 'test_data/invalid_file.json')
        self.valid_file_error_percent_yaml = os.path.join(
            self.base_dir, 'test_data/valid_data_with_error.yaml')
        self.invalid_syntax_data_file = os.path.join(
            self.base_dir, 'test_data/invalid_data_file.yaml')
        self.invalid_parser_file = os.path.join(
            self.base_dir, 'test_data/invalid_parser_file.yaml')
        self.valid_custom_file = os.path.join(
            self.base_dir, 'test_data/valid_file_custom.yaml')

    def test_no_file_path_supplied(self):
        try:
            self.validator.validate(file_path=None)
        except LookupError as le:
            assert le

    def test_valid_yaml_file(self):
        print('___DATA_VALIDATION: Testing valid yaml submission___')
        is_valid = self.validator.validate(file_path=self.valid_file_yaml)
        self.validator.print_errors(self.valid_file_yaml)
        self.assertEqual(is_valid, True)

    def test_invalid_yaml_file(self):
        print('___DATA_VALIDATION: Testing invalid yaml submission___')
        self.assertEqual(
            self.validator.validate(file_path=self.invalid_file_yaml), False)
        self.validator.print_errors(self.invalid_file_yaml)

    def test_valid_file_with_percent_errors(self):
        print('___DATA_VALIDATION: Testing valid yaml percent error ___')
        self.assertEqual(
            self.validator.validate(
                file_path=self.valid_file_error_percent_yaml), False)
        self.validator.print_errors(self.valid_file_error_percent_yaml)

    def test_valid_json_file(self):
        print('___DATA_VALIDATION: Testing valid json submission___')
        is_valid = self.validator.validate(file_path=self.valid_file_json)
        self.validator.print_errors(self.valid_file_json)
        self.assertEqual(is_valid, True)

    def test_invalid_json_file(self):
        print('___DATA_VALIDATION: Testing invalid json submission___')
        self.assertEqual(
            self.validator.validate(file_path=self.invalid_file_json), False)
        self.validator.print_errors(self.invalid_file_json)

    def test_load_data_with_custom_data_type(self):
        self.validator = DataFileValidator()
        custom_schema_path = os.path.join(
            self.base_dir, 'test_data/custom_data_schema.json')
        self.validator.load_custom_schema('different', custom_schema_path)
        self.assertTrue('different' in self.validator.custom_data_schemas)
        self.assertTrue(
            self.validator.validate(file_path=self.valid_custom_file))

    def test_load_invalid_custom_schema(self):
        self.validator.custom_data_schemas = {}
        print('Loading invalid schema')
        try:
            self.validator.load_custom_schema('different')
        except UnsupportedDataSchemaException as udse:
            self.assertTrue(
                udse.message ==
                "There is no schema defined for the 'different' data type.")
            self.assertTrue(udse.message == udse.__unicode__())

    def test_load_invalid_data_file(self):
        print('Loading invalid data file')
        self.assertFalse(
            self.validator.validate(file_path=self.invalid_syntax_data_file))
        self.assertTrue(
            self.validator.has_errors(self.invalid_syntax_data_file))
        self.assertTrue(
            len(self.validator.get_messages(
                self.invalid_syntax_data_file)) == 1)
        self.validator.print_errors(self.invalid_syntax_data_file)
        for message in self.validator.get_messages(
                self.invalid_syntax_data_file):
            self.assertTrue(message.message.index(
                "There was a problem parsing the file.") == 0)

    def test_invalid_parser_yaml_file(self):
        print('___DATA_VALIDATION: Testing invalid parser yaml submission___')
        self.assertEqual(
            self.validator.validate(file_path=self.invalid_parser_file),
            False)
        self.validator.print_errors(self.invalid_parser_file)

    def test_ioerror_yaml_file(self):
        print('___DATA_VALIDATION: Testing ioerror yaml submission___')
        self.assertEqual(
            self.validator.validate(file_path=self.valid_file_yaml[:-1]),
            False)
        self.validator.print_errors(self.valid_file_yaml[:-1])
def process_submission_directory(basepath, submission_file_path, recid,
                                 update=False, *args, **kwargs):
    """
    Goes through an entire submission directory and processes the files
    within to create DataSubmissions with the files and related material
    attached as DataResources.

    :param basepath:
    :param submission_file_path:
    :param recid:
    :param update:
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:
        submission_file = open(submission_file_path, 'r')
        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)
        data_file_validator = DataFileValidator()

        if is_valid_submission_file:
            try:
                # Try the faster C-based YAML loader first.
                submission_processed = yaml.load_all(
                    submission_file, Loader=yaml.CSafeLoader)
            except:
                submission_processed = yaml.safe_load_all(submission_file)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                HEPSubmission(publication_recid=recid,
                              overall_status='todo',
                              inspire_id=hepsubmission.inspire_id,
                              coordinator=kwargs.get('user_id')
                              if 'user_id' in kwargs
                              else int(current_user.get_id()),
                              version=hepsubmission.version + 1)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=hepsubmission.inspire_id,
                    coordinator=hepsubmission.coordinator,
                    version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission)

            for yaml_document in submission_processed:
                if 'record_ids' in yaml_document \
                        or 'comment' in yaml_document \
                        or 'modifications' in yaml_document:
                    # comments are only present in the general submission
                    # information document.
                    process_general_submission_info(basepath, yaml_document,
                                                    recid)
                else:
                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)
                    added_file_names.append(yaml_document["name"])

                    if existing_datasubmission_query.count() == 0:
                        datasubmission = DataSubmission(
                            publication_recid=recid,
                            name=encode_string(yaml_document["name"]),
                            description=encode_string(
                                yaml_document["description"]),
                            version=hepsubmission.version)
                    else:
                        datasubmission = existing_datasubmission_query.one()
                        datasubmission.description = encode_string(
                            yaml_document["description"])

                    db.session.add(datasubmission)

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    if data_file_validator.validate(file_path=main_file_path):
                        process_data_file(recid, hepsubmission.version,
                                          basepath, yaml_document,
                                          datasubmission, main_file_path)
                    else:
                        errors = process_validation_errors_for_display(
                            data_file_validator.get_messages())
                        data_file_validator.clear_messages()

            cleanup_submission(recid, hepsubmission.version, added_file_names)

            db.session.commit()

            if len(errors) == 0:
                package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(recid,
                                                  hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)
        else:
            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())
            submission_file_validator.clear_messages()
            data_file_validator.clear_messages()
    else:
        # return an error
        errors = {
            "submission.yaml": [{
                "level": "error",
                "message": "No submission.yaml file found in submission."
            }]
        }

    # we return all the errors collectively. This makes more sense than
    # returning errors as soon as problems are found on one file.
    return errors