def parse_additional_resources(basepath, recid, yaml_document):
    """
    Parses out the additional resource section for a full submission.

    :param basepath: the path the submission has been loaded to
    :param recid: the record id of the publication
    :param yaml_document: parsed YAML submission document containing an
        'additional_resources' section
    :return: list of DataResource objects
    """
    resources = []
    for reference in yaml_document['additional_resources']:
        resource_location = reference['location']

        file_type = infer_file_type(reference["location"])
        contains_pattern, pattern = contains_accepted_url(reference['location'])
        if ('http' in resource_location.lower()
                and 'hepdata' not in resource_location) or contains_pattern:
            if pattern:
                file_type = pattern
            else:
                file_type = 'html'

            # in case URLs do not have http added.
            if 'http' not in resource_location.lower():
                resource_location = "http://" + resource_location
        elif 'http' not in resource_location.lower() and 'www' not in resource_location.lower():
            if resource_location.startswith('/resource'):
                # This is an old file migrated from hepdata.cedar.ac.uk. We
                # should only get here if using mock_import_old_record, in
                # which case the resources should already be in the 'resources'
                # directory
                parent_dir = os.path.dirname(basepath)
                resource_location = os.path.join(
                    parent_dir,
                    'resources',
                    os.path.basename(resource_location)
                )
                if not os.path.exists(resource_location):
                    raise ValueError("No such path %s" % resource_location)
            else:
                # This is a file local to the submission; the previous
                # try/except that immediately re-raised was a no-op and has
                # been removed.
                resource_location = os.path.join(basepath, resource_location)

        if resource_location:
            new_reference = DataResource(
                file_location=resource_location, file_type=file_type,
                file_description=reference['description'])

            if "license" in reference:
                resource_license = get_license(reference["license"])
                new_reference.file_license = resource_license.id

            resources.append(new_reference)

    return resources
def parse_additional_resources(basepath, recid, version, yaml_document):
    """
    Parses out the additional resource section for a full submission.

    :param basepath: the path the submission has been loaded to
    :param recid: the record id of the publication
    :param version: version of the submission being processed
    :param yaml_document: parsed YAML submission document containing an
        'additional_resources' section
    :return: list of DataResource objects
    """
    resources = []
    for reference in yaml_document['additional_resources']:
        resource_location = reference['location']

        file_type = infer_file_type(reference["location"])
        contains_pattern, pattern = contains_accepted_url(reference['location'])
        if ('http' in resource_location and 'hepdata' not in resource_location) or contains_pattern:
            if pattern:
                file_type = pattern
            else:
                file_type = 'html'

            # in case URLs do not have http added.
            if 'http' not in resource_location:
                resource_location = "http://" + resource_location
        elif ('http' not in resource_location and 'www' not in resource_location
                and 'resource' not in resource_location):
            # This is a file local to the submission; the previous try/except
            # that immediately re-raised was a no-op and has been removed.
            resource_location = os.path.join(basepath, resource_location)
        else:
            try:
                resource_location = download_resource_file(recid, resource_location)
                # Use the module logger rather than print for operational output.
                log.info('Downloaded resource location is {0}'.format(resource_location))
            except URLError:
                log.error("Unable to download {0}. The resource is unavailable.".format(resource_location))
                resource_location = None

        if resource_location:
            new_reference = DataResource(
                file_location=resource_location, file_type=file_type,
                file_description=reference['description'])

            if "license" in reference:
                # Renamed from ``dict`` to avoid shadowing the builtin.
                license_data = get_prefilled_dictionary(
                    ["name", "url", "description"], reference["license"])

                resource_license = get_or_create(
                    db.session, License,
                    name=license_data['name'],
                    url=license_data['url'],
                    description=license_data['description'])
                new_reference.file_license = resource_license.id

            resources.append(new_reference)

    return resources
def process_data_file(recid, version, basepath, data_obj, datasubmission, main_file_path):
    """
    Takes a data file and any supplementary files and persists their
    metadata to the database whilst recording their upload path.

    :param recid: the record id
    :param version: version of the resource to be stored
    :param basepath: the path the submission has been loaded to
    :param data_obj: Object representation of loaded YAML file
    :param datasubmission: the DataSubmission object representing this file in the DB
    :param main_file_path: the data file path
    :return:
    """
    main_data_file = DataResource(file_location=main_file_path, file_type="data")

    if "data_license" in data_obj:
        # Renamed from ``dict``/``license`` to avoid shadowing builtins.
        license_data = get_prefilled_dictionary(
            ["name", "url", "description"], data_obj["data_license"])

        data_license = get_or_create(
            db.session, License,
            name=license_data['name'],
            url=license_data['url'],
            description=license_data['description'])

        main_data_file.file_license = data_license.id

    db.session.add(main_data_file)
    # Commit here so main_data_file has an id to reference in the data
    # submission table.
    db.session.commit()

    datasubmission.data_file = main_data_file.id

    if "location" in data_obj:
        datasubmission.location_in_publication = data_obj["location"]

    cleanup_data_keywords(datasubmission)

    if "keywords" in data_obj:
        for keyword in data_obj["keywords"]:
            keyword_name = keyword['name']
            for value in keyword['values']:
                # Use a distinct name rather than rebinding the loop
                # variable ``keyword``.
                keyword_object = Keyword(name=keyword_name, value=value)
                datasubmission.keywords.append(keyword_object)

    cleanup_data_resources(datasubmission)

    if "additional_resources" in data_obj:
        resources = parse_additional_resources(basepath, recid, version, data_obj)
        for resource in resources:
            datasubmission.resources.append(resource)

    db.session.commit()
def process_data_file(recid, version, basepath, data_obj, datasubmission, main_file_path):
    """
    Takes a data file and any supplementary files and persists their
    metadata to the database whilst recording their upload path.

    :param recid: the record id
    :param version: version of the resource to be stored
    :param basepath: the path the submission has been loaded to
    :param data_obj: Object representation of loaded YAML file
    :param datasubmission: the DataSubmission object representing this file in the DB
    :param main_file_path: the data file path
    :return:
    """
    main_data_file = DataResource(
        file_location=main_file_path, file_type="data")

    if "data_license" in data_obj:
        # Renamed from ``dict``/``license`` to avoid shadowing builtins.
        license_data = get_prefilled_dictionary(
            ["name", "url", "description"], data_obj["data_license"])

        data_license = get_or_create(
            db.session, License,
            name=license_data['name'],
            url=license_data['url'],
            description=license_data['description'])

        main_data_file.file_license = data_license.id

    db.session.add(main_data_file)
    # Commit here so main_data_file has an id to reference in the data
    # submission table.
    db.session.commit()

    datasubmission.data_file = main_data_file.id

    if "location" in data_obj:
        datasubmission.location_in_publication = data_obj["location"]

    if "keywords" in data_obj:
        for keyword in data_obj["keywords"]:
            keyword_name = keyword['name']
            for value in keyword['values']:
                # Use a distinct name rather than rebinding the loop
                # variable ``keyword``.
                keyword_object = Keyword(name=keyword_name, value=value)
                datasubmission.keywords.append(keyword_object)

    cleanup_data_resources(datasubmission)

    if "additional_resources" in data_obj:
        resources = parse_additional_resources(basepath, recid, version, data_obj)
        for resource in resources:
            datasubmission.resources.append(resource)

    db.session.commit()
def update_analyses():
    """
    Queries each configured analyses endpoint and attaches any new analysis
    resources to the matching finished HEPData submissions, reindexing each
    record that gained resources.
    """
    endpoints = current_app.config["ANALYSES_ENDPOINTS"]
    for analysis_endpoint in endpoints:

        if "endpoint_url" in endpoints[analysis_endpoint]:

            log.info("Updating analyses from {0}...".format(analysis_endpoint))

            response = requests.get(
                endpoints[analysis_endpoint]["endpoint_url"])

            if response:
                analyses = response.json()

                for record in analyses:
                    submission = get_latest_hepsubmission(
                        inspire_id=record, overall_status='finished')

                    if submission:
                        num_new_resources = 0

                        for analysis in analyses[record]:
                            _resource_url = endpoints[analysis_endpoint][
                                "url_template"].format(analysis)
                            if not is_resource_added_to_submission(
                                    submission.publication_recid,
                                    submission.version,
                                    _resource_url):
                                # Use the module logger rather than print for
                                # operational output.
                                log.info(
                                    'Adding {} analysis to ins{} with URL {}'.
                                    format(analysis_endpoint, record, _resource_url))
                                new_resource = DataResource(
                                    file_location=_resource_url,
                                    file_type=analysis_endpoint)
                                submission.resources.append(new_resource)
                                num_new_resources += 1

                        if num_new_resources:
                            try:
                                db.session.add(submission)
                                db.session.commit()
                                # Reindex so the new resources show up in search.
                                index_record_ids(
                                    [submission.publication_recid])
                            except Exception as e:
                                db.session.rollback()
                                log.error(e)
                    else:
                        log.debug(
                            "An analysis is available in {0} but with no equivalent in HEPData (ins{1})."
                            .format(analysis_endpoint, record))
        else:
            log.debug(
                "No endpoint url configured for {0}".format(analysis_endpoint))
def test_receive_before_flush_errors(app, mocker, caplog):
    """Errors inside receive_before_flush are logged rather than raised.

    Unpersisted DataResource/DataReview instances are injected via mocked
    queries, so the flush-time deletes cannot succeed and must be reported
    as ERROR log records.
    """
    caplog.set_level(logging.ERROR)

    record_id = "12345"
    submission = DataSubmission(publication_recid=record_id)
    db.session.add(submission)
    db.session.commit()

    # Queries for DataResource return an object that was never persisted.
    resource_filter = mocker.Mock(first=lambda: DataResource())
    resource_query = mocker.Mock(filter_by=lambda id: resource_filter)
    mocker.patch('hepdata.modules.submission.models.DataResource',
                 mocker.Mock(query=resource_query))

    # Same trick for DataReview.
    review_filter = mocker.Mock(all=lambda: [DataReview()])
    review_query = mocker.Mock(filter_by=lambda data_recid: review_filter)
    mocker.patch('hepdata.modules.submission.models.DataReview',
                 mocker.Mock(query=review_query))

    db.session.delete(submission)
    db.session.commit()

    # Last error logs are what we're looking for
    assert len(caplog.records) == 2

    resource_error, review_error = caplog.records

    assert resource_error.levelname == "ERROR"
    assert resource_error.msg.startswith(
        "Unable to delete data resource with id None whilst deleting data submission id 1. Error was: Instance '<DataResource at "
    )
    assert resource_error.msg.endswith(" is not persisted")

    assert review_error.levelname == "ERROR"
    assert review_error.msg.startswith(
        "Unable to delete review with id None whilst deleting data submission id 1. Error was: Instance '<DataReview at "
    )
    assert review_error.msg.endswith(" is not persisted")
def add_resource(type, identifier, version):
    """
    Adds a data resource to either the submission or individual data files.

    :param type: 'submission' to attach to a HEPSubmission, or 'data' to
        attach to an individual DataSubmission (name kept for URL-rule
        compatibility despite shadowing the builtin)
    :param identifier: publication recid ('submission') or DataSubmission
        id ('data')
    :param version: submission version to attach the resource to
    :return: redirect to the record page on success, otherwise an error page
    """
    submission = None
    inspire_id = None
    recid = None

    if type == 'submission':
        submission = HEPSubmission.query.filter_by(publication_recid=identifier,
                                                   version=version).one()
        if submission:
            inspire_id = submission.inspire_id
            recid = submission.publication_recid
    elif type == 'data':
        submission = DataSubmission.query.filter_by(id=identifier).one()
        if submission:
            inspire_id = submission.publication_inspire_id
            recid = submission.publication_recid

    if not user_allowed_to_perform_action(recid):
        abort(403)

    analysis_type = request.form.get('analysisType', None)
    analysis_other = request.form.get('analysisOther', None)
    analysis_url = request.form.get('analysisURL', None)
    analysis_description = request.form.get('analysisDescription', None)

    if analysis_type == 'other':
        analysis_type = analysis_other

    if analysis_type and analysis_url:
        if submission:
            new_resource = DataResource(file_location=analysis_url,
                                        file_type=analysis_type,
                                        file_description=str(analysis_description))

            submission.resources.append(new_resource)

            try:
                db.session.add(submission)
                db.session.commit()

                try:
                    index_record_ids([recid])
                except Exception:
                    # Best-effort reindex: the resource is already committed.
                    # Narrowed from a bare except, which would also swallow
                    # SystemExit/KeyboardInterrupt.
                    log.error('Failed to reindex {0}'.format(recid))

                if inspire_id:
                    return redirect('/record/ins{0}'.format(inspire_id))
                else:
                    return redirect('/record/{0}'.format(recid))
            except Exception as e:
                db.session.rollback()
                raise e

    return render_template('hepdata_records/error_page.html',
                           recid=None,
                           header_message='Error adding resource.',
                           message='Unable to add resource. Please try again.',
                           errors={})
def test_data_submission_cascades(app):
    """Deleting a DataSubmission cascades to its reviews, messages,
    resources and the files on disk (except files shared with other
    DataResource rows)."""
    record_id = "12345"
    submission = DataSubmission(publication_recid=record_id)
    db.session.add(submission)
    db.session.commit()

    # Attach a data review carrying a message.
    review_message = Message(user=1, message="Test review message")
    review = DataReview(publication_recid=record_id,
                        data_recid=submission.id,
                        messages=[review_message])
    db.session.add(review)
    db.session.commit()

    stored_reviews = DataReview.query.filter_by(publication_recid=record_id).all()
    assert len(stored_reviews) == 1
    assert stored_reviews[0] == review

    assert len(Message.query.all()) == 1

    # Create data resources backed by real files on disk.
    files_dir = os.path.join(app.config['CFG_DATADIR'], 'models_test')
    os.makedirs(files_dir, exist_ok=True)

    created_resources = []
    for index in range(3):
        file_path = os.path.join(files_dir, f'file{index}.txt')
        with open(file_path, 'w'):
            pass
        resource = DataResource(file_location=file_path, file_type="data")
        db.session.add(resource)
        db.session.commit()
        created_resources.append(resource)

    # Create an extra DataResource linking to file0.txt but not linked to
    # the submission (because this situation has arisen in prod).
    orphan_resource = DataResource(
        file_location=os.path.join(files_dir, 'file0.txt'),
        file_type="data")
    db.session.add(orphan_resource)
    db.session.commit()

    assert len(os.listdir(files_dir)) == 3

    submission.data_file = created_resources[0].id
    submission.resources = created_resources[1:]
    db.session.add(submission)
    db.session.commit()

    # Sanity-check the resources are queryable before the delete.
    found = DataResource.query.filter(
        DataResource.id.in_([r.id for r in created_resources])).all()
    assert len(found) == 3

    # Deleting the submission should cascade everywhere.
    db.session.delete(submission)
    db.session.commit()

    # Review gone.
    assert DataReview.query.filter_by(publication_recid=record_id).all() == []
    # Message gone.
    assert Message.query.all() == []

    # All linked resources gone.
    remaining = DataResource.query.filter(
        DataResource.id.in_([r.id for r in created_resources])).all()
    assert len(remaining) == 0

    # file0 survives on disk because the orphan DataResource still
    # references it.
    assert os.listdir(files_dir) == ['file0.txt']

    # Tidy up
    shutil.rmtree(files_dir)