def test_cleanup_index_all(app, load_default_data, identifiers, mocker):
    index = app.config.get('ELASTICSEARCH_INDEX')
    m = mocker.patch('fixes.cleanup_index.cleanup_index_batch')

    # Should be no calls made at first as there is only one version
    # of all submissions
    cleanup_index_all(index=index, synchronous=True)
    m.assert_not_called()
    m.reset_mock()

    # Create a new version for ins1283842
    new_submission = HEPSubmission(publication_recid=1,
                                   inspire_id=identifiers[0]["inspire_id"],
                                   version=2,
                                   overall_status='finished')
    db.session.add(new_submission)
    db.session.commit()
    # New id should be 3
    assert new_submission.id == 3

    # Cleanup should now clean up id 1
    cleanup_index_all(index=index, synchronous=True)
    m.assert_called_once_with([1], index)
    m.reset_mock()

    # Create more new versions
    new_submission1 = HEPSubmission(publication_recid=1,
                                    inspire_id=identifiers[0]["inspire_id"],
                                    version=3,
                                    overall_status='finished')
    db.session.add(new_submission1)
    new_submission2 = HEPSubmission(publication_recid=1,
                                    inspire_id=identifiers[0]["inspire_id"],
                                    version=4,
                                    overall_status='todo')
    db.session.add(new_submission2)
    new_submission3 = HEPSubmission(publication_recid=16,
                                    inspire_id=identifiers[1]["inspire_id"],
                                    version=2,
                                    overall_status='finished')
    db.session.add(new_submission3)
    db.session.commit()
    assert new_submission1.id == 4
    assert new_submission2.id == 5
    assert new_submission3.id == 6

    # Cleanup should now clean up ids 1, 2 and 3
    # (i.e. versions lower than the highest finished version)
    cleanup_index_all(index=index, synchronous=True)
    m.assert_called_once_with([1, 2, 3], index)
    m.reset_mock()

    # Check batch size works
    cleanup_index_all(index=index, batch=2, synchronous=True)
    m.assert_has_calls([call([1, 2], index), call([3], index)])
    m.reset_mock()
    cleanup_index_all(index=index, batch=1, synchronous=True)
    m.assert_has_calls([call([1], index), call([2], index), call([3], index)])
def test_status_reset_error(app, mocker, caplog):
    """
    Test that an error is logged if something goes wrong in the status reset.

    :return:
    """
    caplog.set_level(logging.ERROR)
    base_dir = os.path.dirname(os.path.realpath(__file__))

    hepsubmission = HEPSubmission(publication_recid=12345,
                                  overall_status='processing',
                                  version=1)
    db.session.add(hepsubmission)
    db.session.commit()

    assert hepsubmission.overall_status == 'processing'

    mocker.patch('hepdata.modules.records.api.process_zip_archive',
                 side_effect=Exception("Something went wrong"))
    mocker.patch('hepdata.modules.records.api.cleanup_submission',
                 side_effect=Exception("Could not clean up the submission"))

    zip_file = os.path.join(base_dir, 'test_data/TestHEPSubmission.zip')
    process_saved_file(zip_file, 12345, 1, '', 'todo')

    # After the initial failure, overall_status could not be reset
    # because cleanup_submission also raised an exception
    assert hepsubmission.overall_status == 'processing'
    assert len(caplog.records) == 1
    assert caplog.records[0].levelname == "ERROR"
    assert caplog.records[0].msg == \
        "Exception while cleaning up: Could not clean up the submission"
def test_submission_too_big(app, mocker):
    """
    Test that an error is returned when the submission archive exceeds
    the maximum size allowed for conversion to other formats.

    :return:
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))

    hepsubmission = HEPSubmission(publication_recid=12345,
                                  overall_status='todo',
                                  version=1)
    db.session.add(hepsubmission)
    db.session.commit()

    # Patch the app config to reduce the max upload size
    mocker.patch.dict('flask.current_app.config',
                      {'CONVERT_MAX_SIZE': 1000})

    test_directory = os.path.join(base_dir, 'test_data/test_submission')
    errors = process_submission_directory(
        test_directory,
        os.path.join(test_directory, 'submission.yaml'),
        12345)

    assert 'Archive' in errors
    assert len(errors['Archive']) == 1
    assert errors['Archive'][0]['level'] == 'error'
    assert errors['Archive'][0]['message'].startswith(
        "Archive is too big for conversion to other formats.")
def test_old_submission_yaml(app, admin_idx):
    """
    Test we can validate against the old submission schema
    (for use when importing).

    :return:
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))

    hepsubmission = HEPSubmission(publication_recid=12345,
                                  overall_status='todo',
                                  version=1)
    db.session.add(hepsubmission)
    db.session.commit()

    directory = os.path.join(base_dir, 'test_data/test_v0_submission')

    # This should fail against the current schema
    errors = process_submission_directory(
        directory, os.path.join(directory, 'submission.yaml'), 12345)
    assert 'submission.yaml' in errors
    assert len(errors['submission.yaml']) == 1
    assert errors['submission.yaml'][0]['level'] == 'error'
    assert errors['submission.yaml'][0]['message'].decode().startswith(
        "Invalid value (in GeV) for cmenergies: 1.383-1.481")

    # Use the old schema - should now work
    errors = process_submission_directory(
        directory, os.path.join(directory, 'submission.yaml'), 12345,
        old_submission_schema=True)
    assert errors == {}
def create_new_version(recid, user, notify_uploader=True, uploader_message=None):
    hepsubmission = get_latest_hepsubmission(publication_recid=recid)

    if hepsubmission.overall_status == 'finished':
        # Reopen the submission to allow for revisions,
        # by creating a new HEPSubmission object.
        _rev_hepsubmission = HEPSubmission(publication_recid=recid,
                                           overall_status='todo',
                                           inspire_id=hepsubmission.inspire_id,
                                           coordinator=hepsubmission.coordinator,
                                           version=hepsubmission.version + 1)
        db.session.add(_rev_hepsubmission)
        db.session.commit()

        if notify_uploader:
            uploaders = SubmissionParticipant.query.filter_by(
                role='uploader',
                publication_recid=recid,
                status='primary')
            record_information = get_record_by_id(recid)
            for uploader in uploaders:
                send_cookie_email(uploader,
                                  record_information,
                                  message=uploader_message,
                                  version=_rev_hepsubmission.version)

        return jsonify({'success': True,
                        'version': _rev_hepsubmission.version})
    else:
        return jsonify({
            "message": f"Rec id {recid} is not finished so cannot create a new version"
        }), 400
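# A minimal sketch of calling create_new_version, assuming an application
# context, a finished submission for record 1, and a hypothetical `user`
# object for the acting coordinator. On success the JSON response carries
# the new version number; an unfinished submission yields a 400 response.
response = create_new_version(1, user, notify_uploader=True,
                              uploader_message="Please upload revised tables.")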
def process_payload(recid, file, redirect_url, synchronous=False):
    """Process an uploaded file.

    :param recid: int
        The id of the record to update
    :param file: file
        The file to process
    :param redirect_url: string
        Redirect URL to record, for use if the upload fails or in
        synchronous mode
    :param synchronous: bool
        Whether to process asynchronously via celery (default)
        or immediately (only recommended for tests)
    :return: JSONResponse either containing 'url' (for success cases)
        or 'message' (for error cases, which will give a 400 error).
    """
    if file and allowed_file(file.filename):
        file_path = save_zip_file(file, recid)

        hepsubmission = get_latest_hepsubmission(publication_recid=recid)

        if hepsubmission.overall_status == 'finished':
            # If it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions,
            # by creating a new HEPSubmission object.
            _rev_hepsubmission = HEPSubmission(
                publication_recid=recid,
                overall_status='todo',
                inspire_id=hepsubmission.inspire_id,
                coordinator=hepsubmission.coordinator,
                version=hepsubmission.version + 1)
            db.session.add(_rev_hepsubmission)
            hepsubmission = _rev_hepsubmission

        previous_status = hepsubmission.overall_status
        hepsubmission.overall_status = \
            'sandbox_processing' if previous_status == 'sandbox' else 'processing'
        db.session.add(hepsubmission)
        db.session.commit()

        if synchronous:
            process_saved_file(file_path, recid, current_user.get_id(),
                               redirect_url, previous_status)
        else:
            process_saved_file.delay(file_path, recid, current_user.get_id(),
                                     redirect_url, previous_status)
            flash('File saved. You will receive an email when the file '
                  'has been processed.', 'info')

        return jsonify({'url': redirect_url.format(recid)})
    else:
        return jsonify({
            "message": "You must upload a .zip, .tar, .tar.gz or .tgz file"
                       " (or a .oldhepdata or single .yaml or .yaml.gz file)."
        }), 400
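# Hedged sketch of wiring process_payload into a Flask upload view. The
# `blueprint` object, the route path and the 'hep_archive' form field name
# are assumptions for illustration, not the actual route definitions;
# Flask's `request` proxy is assumed to be imported.
@blueprint.route('/record/<int:recid>/consume', methods=['POST'])
def consume_data_payload(recid):
    file = request.files['hep_archive']
    return process_payload(recid, file, '/record/{}')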
def _create_new_versions(version, expected_range):
    # Create new HEPSubmission and DataSubmissions for ins1283842
    new_hep_submission = HEPSubmission(
        publication_recid=1,
        inspire_id=identifiers[0]["inspire_id"],
        version=version,
        overall_status='finished')
    db.session.add(new_hep_submission)
    db.session.commit()

    new_data_submissions = []
    for _ in range(5):
        new_data_submission = DataSubmission(publication_recid=1,
                                             associated_recid=1,
                                             version=version)
        db.session.add(new_data_submission)
        new_data_submissions.append(new_data_submission)
    db.session.commit()
    assert [x.id for x in new_data_submissions] == expected_range
def get_or_create_hepsubmission(recid, coordinator=1, status="todo"):
    """
    Gets or creates a new HEPSubmission record.

    :param recid: the publication record id
    :param coordinator: the user id of the user who owns this record
    :param status: e.g. todo, finished.
    :return: the HEPSubmission object, newly created if it did not exist
    """
    hepsubmission = HEPSubmission.query.filter_by(
        publication_recid=recid).first()

    if hepsubmission is None:
        hepsubmission = HEPSubmission(publication_recid=recid,
                                      coordinator=coordinator,
                                      overall_status=status)
        db.session.add(hepsubmission)
        db.session.commit()

    return hepsubmission
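# Example usage of get_or_create_hepsubmission: repeated calls with the same
# recid return the same row rather than creating duplicates (assuming an
# application context with the database initialised).
first = get_or_create_hepsubmission(12345, coordinator=1)
again = get_or_create_hepsubmission(12345)
assert first.id == again.id
assert first.overall_status == 'todo'  # the default status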
def test_find_submission_data_file_path(app):
    data_dir = app.config['CFG_DATADIR']
    expected_file_name = 'HEPData-987654321-v2-yaml.zip'
    old_file_path = os.path.join(data_dir, '987654321', expected_file_name)
    new_file_path = os.path.join(data_dir, '8a', '987654321',
                                 expected_file_name)

    if os.path.exists(new_file_path):
        os.remove(new_file_path)

    # No new format file found, so should return the old directory
    submission = HEPSubmission(publication_recid=987654321, version=2)
    assert find_submission_data_file_path(submission) == old_file_path

    # Create an empty file in the new location
    os.makedirs(os.path.join(data_dir, '8a', '987654321'), exist_ok=True)
    open(new_file_path, 'w').close()
    assert find_submission_data_file_path(submission) == new_file_path
def test_duplicate_table_names(app):
    """
    Test that an error is returned for a submission.yaml file
    with duplicate table names.
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))

    hepsubmission = HEPSubmission(publication_recid=12345,
                                  overall_status='todo',
                                  version=1)
    db.session.add(hepsubmission)
    db.session.commit()

    directory = os.path.join(base_dir, 'test_data/test_duplicate_table_names')
    errors = process_submission_directory(
        directory, os.path.join(directory, 'submission.yaml'), 12345)

    assert 'submission.yaml' in errors
    assert len(errors['submission.yaml']) == 2
    for error in errors['submission.yaml']:
        assert error['level'] == 'error'
        assert error['message'].startswith("Duplicate table with name")
def test_status_reset(app, mocker):
    """
    Test that the status is reset if something unexpected goes wrong.

    :return:
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))
    hepsubmission = HEPSubmission(publication_recid=12345,
                                  overall_status='processing',
                                  version=1)
    db.session.add(hepsubmission)
    db.session.commit()

    assert hepsubmission.overall_status == 'processing'

    mocker.patch('hepdata.modules.records.api.process_zip_archive',
                 side_effect=Exception("Something went wrong"))

    zip_file = os.path.join(base_dir, 'test_data/TestHEPSubmission.zip')
    process_saved_file(zip_file, 12345, 1, '', 'todo')

    # After initial failure, overall_status should be reset to 'todo'
    assert hepsubmission.overall_status == 'todo'
def test_invalid_data_yaml(app, admin_idx):
    """
    Test that a parse error is reported when a data YAML file is invalid.

    :return:
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))

    hepsubmission = HEPSubmission(publication_recid=12345,
                                  overall_status='todo',
                                  version=1)
    db.session.add(hepsubmission)
    db.session.commit()

    directory = os.path.join(base_dir, 'test_data/test_invalid_data_file')
    errors = process_submission_directory(
        directory, os.path.join(directory, 'submission.yaml'), 12345)

    assert 'data1.yaml' in errors
    assert len(errors['data1.yaml']) == 1
    assert errors['data1.yaml'][0]['level'] == 'error'
    assert errors['data1.yaml'][0]['message'].startswith(
        "There was a problem parsing the file")
def process_submission_directory(basepath, submission_file_path, recid,
                                 update=False, *args, **kwargs):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.

    :param basepath:
    :param submission_file_path:
    :param recid:
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:
        submission_file = open(submission_file_path, 'r')

        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        data_file_validator = DataFileValidator()

        if is_valid_submission_file:
            try:
                submission_processed = yaml.load_all(submission_file,
                                                     Loader=yaml.CSafeLoader)
            except AttributeError:
                # CSafeLoader is unavailable when libyaml is not installed
                submission_processed = yaml.safe_load_all(submission_file)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    coordinator=kwargs.get('user_id') if 'user_id' in kwargs
                    else int(current_user.get_id()),
                    version=1)
                db.session.add(hepsubmission)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=hepsubmission.inspire_id,
                    coordinator=hepsubmission.coordinator,
                    version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission, update)

            for yaml_document in submission_processed:
                if 'record_ids' in yaml_document or 'comment' in yaml_document \
                        or 'modifications' in yaml_document:
                    # comments are only present in the general submission
                    # information document.
                    process_general_submission_info(basepath, yaml_document,
                                                    recid)
                else:
                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)
                    added_file_names.append(yaml_document["name"])

                    if existing_datasubmission_query.count() == 0:
                        datasubmission = DataSubmission(
                            publication_recid=recid,
                            name=encode_string(yaml_document["name"]),
                            description=encode_string(
                                yaml_document["description"]),
                            version=hepsubmission.version)
                    else:
                        datasubmission = existing_datasubmission_query.one()
                        datasubmission.description = encode_string(
                            yaml_document["description"])

                    db.session.add(datasubmission)

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    if data_file_validator.validate(file_path=main_file_path):
                        process_data_file(recid, hepsubmission.version,
                                          basepath, yaml_document,
                                          datasubmission, main_file_path)
                    else:
                        errors.update(process_validation_errors_for_display(
                            data_file_validator.get_messages()))
                        data_file_validator.clear_messages()

            submission_file.close()

            cleanup_submission(recid, hepsubmission.version, added_file_names)

            db.session.commit()

            if not errors:
                package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(recid, hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)
        else:
            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())
            submission_file_validator.clear_messages()
            data_file_validator.clear_messages()
    else:
        # return an error
        errors = {
            "submission.yaml": [{
                "level": "error",
                "message": "No submission.yaml file found in submission."
            }]
        }
        return errors

    # we return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found on one file.
    return errors
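# Sketch of driving process_submission_directory over an unpacked archive and
# reporting any problems; the directory path here is illustrative. The
# returned `errors` dict maps file names to lists of {'level', 'message'}
# entries, so an empty dict means the submission was processed cleanly.
errors = process_submission_directory(
    '/tmp/unpacked_submission',
    os.path.join('/tmp/unpacked_submission', 'submission.yaml'),
    12345)
for file_name, messages in errors.items():
    for message in messages:
        print("{} [{}]: {}".format(file_name, message['level'],
                                   message['message']))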
def process_submission_directory(basepath, submission_file_path, recid,
                                 update=False, *args, **kwargs):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.

    :param basepath:
    :param submission_file_path:
    :param recid:
    :param update:
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:
        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        if is_valid_submission_file:
            submission_file = open(submission_file_path, 'r')
            # `Loader` is expected to be imported at module level,
            # e.g. CSafeLoader where libyaml is available
            submission_processed = yaml.load_all(submission_file,
                                                 Loader=Loader)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    coordinator=kwargs.get('user_id') if 'user_id' in kwargs
                    else int(current_user.get_id()),
                    version=1)
                db.session.add(hepsubmission)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=hepsubmission.inspire_id,
                    coordinator=hepsubmission.coordinator,
                    version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission, update)

            no_general_submission_info = True

            data_file_validator = DataFileValidator()

            # Delete all data records associated with this submission.
            # Fixes problems with ordering where the table names are changed
            # between uploads.
            # See https://github.com/HEPData/hepdata/issues/112
            # Side effect that reviews will be deleted between uploads.
            cleanup_submission(recid, hepsubmission.version, added_file_names)

            for yaml_document_index, yaml_document in enumerate(
                    submission_processed):
                if not yaml_document:
                    continue

                # Check for presence of local files given as additional_resources.
                if 'additional_resources' in yaml_document:
                    for resource in yaml_document['additional_resources']:
                        location = os.path.join(basepath, resource['location'])
                        if not resource['location'].startswith(
                                ('http', '/resource/')):
                            if not os.path.isfile(location):
                                errors[resource['location']] = [{
                                    "level": "error",
                                    "message": "Missing 'additional_resources' "
                                               "file from uploaded archive."
                                }]
                            elif '/' in resource['location']:
                                errors[resource['location']] = [{
                                    "level": "error",
                                    "message": "Location of 'additional_resources' "
                                               "file should not contain '/'."
                                }]

                if not yaml_document_index and 'name' not in yaml_document:
                    no_general_submission_info = False
                    process_general_submission_info(basepath, yaml_document,
                                                    recid)
                elif not all(k in yaml_document for k in
                             ('name', 'description', 'keywords', 'data_file')):
                    errors["submission.yaml"] = [{
                        "level": "error",
                        "message": "YAML document with index {} missing one or "
                                   "more required keys (name, description, "
                                   "keywords, data_file).".format(
                                       yaml_document_index)
                    }]
                else:
                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)
                    added_file_names.append(yaml_document["name"])
                    try:
                        if existing_datasubmission_query.count() == 0:
                            datasubmission = DataSubmission(
                                publication_recid=recid,
                                name=encode_string(yaml_document["name"]),
                                description=encode_string(
                                    yaml_document["description"]),
                                version=hepsubmission.version)
                        else:
                            datasubmission = existing_datasubmission_query.one()
                            datasubmission.description = encode_string(
                                yaml_document["description"])
                        db.session.add(datasubmission)
                    except SQLAlchemyError as sqlex:
                        errors[yaml_document["data_file"]] = [{
                            "level": "error",
                            "message": str(sqlex)
                        }]
                        db.session.rollback()
                        continue

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    data, ex = _eos_fix_read_data(main_file_path)

                    if not data or ex is not None:
                        errors[yaml_document["data_file"]] = [{
                            "level": "error",
                            "message": "There was a problem parsing the file.\n"
                                       + str(ex)
                        }]
                    elif '/' in yaml_document["data_file"]:
                        errors[yaml_document["data_file"]] = [{
                            "level": "error",
                            "message": "Name of data_file should not contain '/'.\n"
                        }]
                    else:
                        if data_file_validator.validate(
                                file_path=main_file_path, data=data):
                            try:
                                process_data_file(recid, hepsubmission.version,
                                                  basepath, yaml_document,
                                                  datasubmission, main_file_path)
                            except SQLAlchemyError as sqlex:
                                errors[yaml_document["data_file"]] = [{
                                    "level": "error",
                                    "message": "There was a problem processing "
                                               "the file.\n" + str(sqlex)
                                }]
                                db.session.rollback()
                        else:
                            errors.update(process_validation_errors_for_display(
                                data_file_validator.get_messages()))
                            data_file_validator.clear_messages()

                        if yaml_document["data_file"] not in errors:
                            # Check that the length of the 'values' list is
                            # consistent for each of the independent_variables
                            # and dependent_variables.
                            indep_count = [len(indep['values']) for indep
                                           in data['independent_variables']]
                            dep_count = [len(dep['values']) for dep
                                         in data['dependent_variables']]
                            if len(set(indep_count + dep_count)) > 1:
                                # if more than one unique count
                                errors.setdefault(
                                    yaml_document["data_file"], []).append({
                                        "level": "error",
                                        "message":
                                            "Inconsistent length of 'values' list:\n"
                                            "independent_variables{}, "
                                            "dependent_variables{}".format(
                                                str(indep_count), str(dep_count))
                                    })

            submission_file.close()

            if no_general_submission_info:
                hepsubmission.last_updated = datetime.now()
                db.session.add(hepsubmission)
                db.session.commit()

            # The line below is commented out since it does not preserve the
            # order of tables. Delete all tables above instead: side effect of
            # deleting reviews between uploads.
            # cleanup_submission(recid, hepsubmission.version, added_file_names)

            db.session.commit()

            if not errors:
                errors = package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(publication_recid=recid,
                                                  version=hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)
            else:  # delete all tables if errors
                cleanup_submission(recid, hepsubmission.version, {})
        else:
            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())
            submission_file_validator.clear_messages()
    else:
        # return an error
        errors = {
            "submission.yaml": [{
                "level": "error",
                "message": "No submission.yaml file found in submission."
            }]
        }
        return errors

    # we return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found on one file.
    return errors
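# Illustration of the 'values' length consistency check above: every
# independent_variables and dependent_variables entry must have the same
# number of rows. This table passes (counts are [2] and [2, 2]); dropping a
# row from one dependent variable would trigger the "Inconsistent length of
# 'values' list" error.
data = {
    'independent_variables': [
        {'header': {'name': 'SQRT(S)', 'units': 'GEV'},
         'values': [{'value': 7000}, {'value': 8000}]}
    ],
    'dependent_variables': [
        {'header': {'name': 'SIG', 'units': 'PB'},
         'values': [{'value': 1.0}, {'value': 2.0}]},
        {'header': {'name': 'SIG2', 'units': 'PB'},
         'values': [{'value': 3.0}, {'value': 4.0}]}
    ]
}
indep_count = [len(indep['values']) for indep in data['independent_variables']]
dep_count = [len(dep['values']) for dep in data['dependent_variables']]
assert len(set(indep_count + dep_count)) == 1  # a consistent table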