def process_zip_archive(file, id):
    filename = secure_filename(file.filename)
    time_stamp = str(int(round(time.time())))
    file_save_directory = os.path.join(
        current_app.config['CFG_DATADIR'], str(id), time_stamp)

    if not os.path.exists(file_save_directory):
        os.makedirs(file_save_directory)

    if not filename.endswith('.oldhepdata'):
        file_path = os.path.join(file_save_directory, filename)
        file.save(file_path)

        submission_path = os.path.join(
            file_save_directory, remove_file_extension(filename))

        if filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            error, last_updated = split_files(file_path, submission_path)
            if error:
                return {
                    "Single YAML file splitter": [{
                        "level": "error",
                        "message": str(error)
                    }]
                }
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            extract(filename, file_path, submission_path)

        submission_found = find_file_in_directory(
            submission_path, lambda x: x == "submission.yaml")
    else:
        # .oldhepdata files go into their own subdirectory for conversion
        file_path = os.path.join(file_save_directory, 'oldhepdata')
        if not os.path.exists(file_path):
            os.makedirs(file_path)

        print('Saving file to {}'.format(os.path.join(file_path, filename)))
        file.save(os.path.join(file_path, filename))

        submission_path = os.path.join(file_save_directory, 'oldhepdata')
        submission_found = False

    if submission_found:
        basepath, submission_file_path = submission_found
    else:
        result = check_and_convert_from_oldhepdata(
            submission_path, id, time_stamp)

        # Check for errors
        if isinstance(result, dict):
            return result
        else:
            basepath, submission_file_path = result

    return process_submission_directory(basepath, submission_file_path, id)
def process_zip_archive(file, id):
    filename = secure_filename(file.filename)
    time_stamp = str(int(round(time.time())))
    file_save_directory = os.path.join(
        current_app.config['CFG_DATADIR'], str(id), time_stamp)

    if not os.path.exists(file_save_directory):
        os.makedirs(file_save_directory)

    if '.oldhepdata' not in filename:
        file_path = os.path.join(file_save_directory, filename)
        file.save(file_path)

        submission_path = os.path.join(
            file_save_directory, remove_file_extension(filename))

        if 'yaml' in filename:
            # we split the singular yaml file and create a submission directory
            split_files(file_path, submission_path)
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            extract(filename, file_path, submission_path)

        submission_found = find_file_in_directory(
            submission_path, lambda x: x == "submission.yaml")
    else:
        file_path = os.path.join(file_save_directory, 'oldhepdata')
        if not os.path.exists(file_path):
            os.makedirs(file_path)

        # drop a .txt extension appended to the .oldhepdata file
        # (e.g. file.oldhepdata.txt -> file.oldhepdata)
        if filename.endswith('.txt'):
            filename = filename.replace(".txt", "")

        print('Saving file to {}'.format(os.path.join(file_path, filename)))
        file.save(os.path.join(file_path, filename))

        submission_path = os.path.join(file_save_directory, 'oldhepdata')
        submission_found = False

    if submission_found:
        basepath, submission_file_path = submission_found
    else:
        result = check_and_convert_from_oldhepdata(
            submission_path, id, time_stamp)

        # Check for errors
        if isinstance(result, dict):
            return result
        else:
            basepath, submission_file_path = result

    return process_submission_directory(basepath, submission_file_path, id)
def process_zip_archive(file, id):
    filename = secure_filename(file.filename)
    time_stamp = str(int(round(time.time())))
    file_save_directory = os.path.join(
        current_app.config['CFG_DATADIR'], str(id), time_stamp)

    if not os.path.exists(file_save_directory):
        os.makedirs(file_save_directory)

    if not filename.endswith('.oldhepdata'):
        file_path = os.path.join(file_save_directory, filename)
        print('Saving file to {}'.format(file_path))
        file.save(file_path)

        submission_path = os.path.join(
            file_save_directory, remove_file_extension(filename))
        submission_temp_path = tempfile.mkdtemp(
            dir=current_app.config["CFG_TMPDIR"])

        if filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            error, last_updated = split_files(file_path, submission_temp_path)
            if error:
                return {
                    "Single YAML file splitter": [{
                        "level": "error",
                        "message": str(error)
                    }]
                }
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            if not extract(file_path, submission_temp_path):
                return {
                    "Archive file extractor": [{
                        "level": "error",
                        "message": "{} is not a valid zip or tar archive file.".format(
                            file_path)
                    }]
                }

        if not os.path.exists(submission_path):
            os.makedirs(submission_path)

        # Move files from submission_temp_path to submission_path
        # (try to avoid problems with EOS disk).
        if current_app.config.get('PRODUCTION_MODE', False):
            # production instance at CERN
            copy_command = ['xrdcp', '-N', '-f']
            copy_submission_path = submission_path.replace(
                current_app.config['CFG_DATADIR'],
                current_app.config['EOS_DATADIR'])
        else:
            # local instance
            copy_command = ['cp']
            copy_submission_path = submission_path

        print('Copying with: {} -r {} {}'.format(
            ' '.join(copy_command), submission_temp_path + '/.',
            copy_submission_path))
        subprocess.check_output(
            copy_command + ['-r', submission_temp_path + '/.',
                            copy_submission_path])
        rmtree(submission_temp_path,
               ignore_errors=True)  # can uncomment when this is definitely working

        submission_found = find_file_in_directory(
            submission_path, lambda x: x == "submission.yaml")
    else:
        file_path = os.path.join(file_save_directory, 'oldhepdata')
        if not os.path.exists(file_path):
            os.makedirs(file_path)

        print('Saving file to {}'.format(os.path.join(file_path, filename)))
        file.save(os.path.join(file_path, filename))

        submission_found = False

    if submission_found:
        basepath, submission_file_path = submission_found
    else:
        result = check_and_convert_from_oldhepdata(file_path, id, time_stamp)

        # Check for errors
        if isinstance(result, dict):
            return result
        else:
            basepath, submission_file_path = result

    return process_submission_directory(basepath, submission_file_path, id)
def process_zip_archive(file_path, id, old_submission_schema=False,
                        old_data_schema=False):
    (file_save_directory, filename) = os.path.split(file_path)

    if not filename.endswith('.oldhepdata'):
        file_save_directory = os.path.dirname(file_path)
        submission_path = os.path.join(
            file_save_directory, remove_file_extension(filename))
        submission_temp_path = tempfile.mkdtemp(
            dir=current_app.config["CFG_TMPDIR"])

        if filename.endswith('.yaml.gz'):
            print('Extracting: {} to {}'.format(file_path, file_path[:-3]))
            if not extract(file_path, file_path[:-3]):
                return {
                    "Archive file extractor": [{
                        "level": "error",
                        "message": "{} is not a valid .gz file.".format(file_path)
                    }]
                }
            # recurse on the decompressed .yaml file
            return process_zip_archive(
                file_path[:-3], id,
                old_submission_schema=old_submission_schema,
                old_data_schema=False)
        elif filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            error, last_updated = split_files(file_path, submission_temp_path)
            if error:
                return {
                    "Single YAML file splitter": [{
                        "level": "error",
                        "message": str(error)
                    }]
                }
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            try:
                unzipped_path = extract(file_path, submission_temp_path)
            except Exception:
                unzipped_path = None

            if not unzipped_path:
                return {
                    "Archive file extractor": [{
                        "level": "error",
                        "message": "{} is not a valid zip or tar archive file.".format(
                            file_path)
                    }]
                }

        copy_errors = move_files(submission_temp_path, submission_path)
        if copy_errors:
            return copy_errors

        submission_found = find_file_in_directory(
            submission_path, lambda x: x == "submission.yaml")
        if not submission_found:
            return {
                "Archive file extractor": [{
                    "level": "error",
                    "message": "No submission.yaml file has been found in the archive."
                }]
            }

        basepath, submission_file_path = submission_found
    else:
        file_dir = os.path.dirname(file_save_directory)
        time_stamp = os.path.split(file_dir)[1]
        result = check_and_convert_from_oldhepdata(
            os.path.dirname(file_save_directory), id, time_stamp)

        # Check for errors
        if isinstance(result, dict):
            return result
        else:
            basepath, submission_file_path = result
            old_data_schema = True

    return process_submission_directory(
        basepath, submission_file_path, id,
        old_data_schema=old_data_schema,
        old_submission_schema=old_submission_schema)
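# --- Hypothetical usage sketch; not part of the original module. ---
# A minimal example of how this latest variant of process_zip_archive might
# be called from an upload handler. The path and record id are made-up
# assumptions, and an active Flask application context (with CFG_TMPDIR
# configured) is assumed. The returned dict maps a check name to a list of
# {'level': ..., 'message': ...} entries, matching the structures built above.
def _example_process_upload():
    errors = process_zip_archive('/data/12345/1700000000/submission.zip', 12345)
    if errors:
        for check, messages in errors.items():
            for msg in messages:
                print('{}: [{}] {}'.format(check, msg['level'], msg['message']))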
def _import_record(inspire_id, update_existing=False,
                   base_url='https://hepdata.net', send_email=False):
    publication_information, status = get_inspire_record_information(inspire_id)
    if status != "success":
        log.error("Failed to retrieve publication information for " + inspire_id)
        return False

    current_submission = get_latest_hepsubmission(inspire_id=inspire_id)
    if not current_submission:
        log.info("The record with id {0} does not exist in the database, "
                 "so we're loading it.".format(inspire_id))
        publication_information["inspire_id"] = inspire_id
        record_information = create_record(publication_information)
        recid = record_information['recid']
    else:
        log.info("The record with inspire id {0} already exists.".format(inspire_id))
        if update_existing:
            log.info("Updating instead")
            recid = current_submission.publication_recid
        else:
            log.info("Not updating as update_existing is False")
            return False

    try:
        download_path = _download_file(base_url, inspire_id)
        filename = os.path.basename(download_path)

        time_stamp = str(int(round(time.time())))
        file_save_directory = get_data_path_for_record(str(recid), time_stamp)
        if not os.path.exists(file_save_directory):
            os.makedirs(file_save_directory)

        file_path = os.path.join(file_save_directory, filename)
        log.info("Moving file to %s" % file_path)
        shutil.copy(download_path, file_path)

        # Create submission
        admin_user_id = 1
        hepsubmission = get_or_create_hepsubmission(recid, admin_user_id)
        db.session.add(hepsubmission)
        db.session.commit()

        # Then process the payload as for any other record
        errors = process_zip_archive(file_path, recid)
        if errors:
            log.info("Errors processing archive. Re-trying with old schema.")
            # Try again with old schema.
            # Need to clean up first to avoid errors:
            # first delete tables
            cleanup_submission(recid, 1, [])
            # next remove remaining files
            file_save_directory = os.path.dirname(file_path)
            submission_path = os.path.join(
                file_save_directory, remove_file_extension(filename))
            shutil.rmtree(submission_path)

            errors = process_zip_archive(file_path, recid,
                                         old_submission_schema=True,
                                         old_data_schema=True)
            if errors:
                log.error("Could not process zip archive: ")
                for file, file_errors in errors.items():
                    log.error(" %s:" % file)
                    for error in file_errors:
                        log.error(" %s" % error['message'])
                raise ValueError("Could not validate record.")

        # Delete any previous upload folders
        cleanup_old_files(hepsubmission)

        log.info("Finalising record %s" % recid)
        result_json = do_finalise(recid, force_finalise=True,
                                  update=(current_submission is not None),
                                  convert=False, send_email=send_email)
        result = json.loads(result_json)

        if result and result['success']:
            log.info("Imported record %s with %s submissions"
                     % (recid, result['data_count']))
            return True
        else:
            raise ValueError("Failed to finalise record.")
    except Exception as e:
        # Unload record
        unload_submission(recid)
        log.error(e)
        return False
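# --- Hypothetical usage sketch; not part of the original module. ---
# Shows how the importer might be driven, e.g. from a CLI task, to mirror a
# single record from hepdata.net. The INSPIRE id below is illustrative only;
# `log` is the module-level logger used above.
def _example_import():
    if _import_record('1234567', update_existing=True, send_email=False):
        log.info("Import of INSPIRE record 1234567 succeeded.")
    else:
        log.error("Import of INSPIRE record 1234567 failed; see errors above.")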