def test_file_extractor(app):
    """
    Extract each sample archive (zip, tar, tar.gz) into the temporary
    directory and check that the target directory exists and that a lookup
    for ``'yaml'`` inside it returns a result.

    :param app: application object providing ``app_context()`` and ``config``
    """
    # (archive file name, name of the directory to extract into)
    archives = (
        ('1396331.zip', '1396331'),
        ('1396331.tar', '1396331-tar'),
        ('1396331.tar.gz', '1396331-targz'),
    )

    with app.app_context():
        here = os.path.dirname(os.path.realpath(__file__))
        test_data_directory = os.path.join(here, 'test_data')

        for archive_name, extract_as in archives:
            target_dir = os.path.join(app.config['CFG_TMPDIR'], extract_as)
            archive_path = os.path.join(test_data_directory, archive_name)

            extract(archive_name, archive_path, target_dir)
            assert os.path.exists(target_dir)

            yaml_file = get_file_in_directory(target_dir, 'yaml')
            assert yaml_file is not None
def process_zip_archive(file, id):
    """
    Save an uploaded submission file under a time-stamped directory and pass
    the resulting submission directory on for processing.

    Three kinds of upload are distinguished by file name:

    * ``*.oldhepdata`` - saved into an ``oldhepdata`` sub-directory and then
      converted with ``check_and_convert_from_oldhepdata``;
    * ``*.yaml`` - a single YAML file, split into a submission directory
      with ``split_files``;
    * anything else - treated as an archive and unpacked with ``extract``.

    :param file: uploaded file object exposing ``filename`` and ``save(path)``
    :param id: record identifier, used as a path component and passed on
    :return: an error dictionary if splitting or conversion reports a
        problem, otherwise the result of ``process_submission_directory``
    """
    filename = secure_filename(file.filename)
    time_stamp = str(int(round(time.time())))
    file_save_directory = os.path.join(current_app.config['CFG_DATADIR'],
                                       str(id), time_stamp)

    if not os.path.exists(file_save_directory):
        os.makedirs(file_save_directory)

    if not filename.endswith('.oldhepdata'):
        file_path = os.path.join(file_save_directory, filename)
        file.save(file_path)

        submission_path = os.path.join(file_save_directory,
                                       remove_file_extension(filename))
        if filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            error, _ = split_files(file_path, submission_path)
            if error:
                return {
                    "Single YAML file splitter": [{
                        "level": "error",
                        "message": str(error)
                    }]
                }
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            extract(filename, file_path, submission_path)

        submission_found = find_file_in_directory(
            submission_path, lambda x: x == "submission.yaml")
    else:
        # Old-format uploads live in their own sub-directory, which is also
        # the path handed to the converter below.
        submission_path = os.path.join(file_save_directory, 'oldhepdata')
        if not os.path.exists(submission_path):
            os.makedirs(submission_path)

        saved_file = os.path.join(submission_path, filename)
        print('Saving file to {}'.format(saved_file))
        file.save(saved_file)

        submission_found = False

    if submission_found:
        basepath, submission_file_path = submission_found
    else:
        # No submission.yaml present: fall back to the old-format converter.
        result = check_and_convert_from_oldhepdata(submission_path, id, time_stamp)

        # A dictionary (of any dict subclass) carries error messages.
        if isinstance(result, dict):
            return result
        basepath, submission_file_path = result

    return process_submission_directory(basepath, submission_file_path, id)
def process_zip_archive(file, id):
    """
    Save an uploaded submission file under a time-stamped directory and pass
    the resulting submission directory on for processing.

    * File names containing ``.oldhepdata`` are saved into an ``oldhepdata``
      sub-directory (with a trailing ``.txt`` removed from the saved name)
      and converted with ``check_and_convert_from_oldhepdata``.
    * File names ending in ``.yaml`` are split into a submission directory
      with ``split_files``.
    * Anything else is treated as an archive and unpacked with ``extract``.

    :param file: uploaded file object exposing ``filename`` and ``save(path)``
    :param id: record identifier, used as a path component and passed on
    :return: the dictionary returned by the old-format converter if it
        returns one, otherwise the result of ``process_submission_directory``
    """
    filename = secure_filename(file.filename)
    time_stamp = str(int(round(time.time())))
    file_save_directory = os.path.join(current_app.config['CFG_DATADIR'],
                                       str(id), time_stamp)

    if not os.path.exists(file_save_directory):
        os.makedirs(file_save_directory)

    # Kept as a substring test so that names such as "x.oldhepdata.txt"
    # continue to be routed to the old-format branch.
    if '.oldhepdata' not in filename:
        file_path = os.path.join(file_save_directory, filename)
        file.save(file_path)

        submission_path = os.path.join(file_save_directory,
                                       remove_file_extension(filename))
        # Test the extension, not a substring: an archive whose name merely
        # contains "yaml" (e.g. "hepdata_yaml.zip") must still be extracted.
        if filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            split_files(file_path, submission_path)
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            extract(filename, file_path, submission_path)

        submission_found = find_file_in_directory(
            submission_path, lambda x: x == "submission.yaml")
    else:
        submission_path = os.path.join(file_save_directory, 'oldhepdata')
        if not os.path.exists(submission_path):
            os.makedirs(submission_path)

        if filename.endswith('.txt'):
            # Strip only the trailing extension; str.replace would also
            # remove any ".txt" occurring earlier in the name.
            filename = filename[:-len('.txt')]

        saved_file = os.path.join(submission_path, filename)
        print('Saving file to {}'.format(saved_file))
        file.save(saved_file)

        submission_found = False

    if submission_found:
        basepath, submission_file_path = submission_found
    else:
        # No submission.yaml present: fall back to the old-format converter.
        result = check_and_convert_from_oldhepdata(submission_path, id, time_stamp)

        # A dictionary (of any dict subclass) carries error messages.
        if isinstance(result, dict):
            return result
        basepath, submission_file_path = result

    return process_submission_directory(basepath, submission_file_path, id)
def download_datatable(data_resource, file_format, *args, **kwargs):
    """
    Send a single data table to the client in the requested format.

    YAML is served directly from ``data_resource.file_location``; any other
    format is produced through ``convert`` unless the output path already
    exists.

    :param data_resource: object whose ``file_location`` points at the table file
    :param file_format: requested output format
    :param args: unused
    :param kwargs: must contain ``submission_id``; may contain ``table_name``
    :return: the ``send_file`` response
    """
    record_path, table_name = os.path.split(data_resource.file_location)

    # Download name: HEPData-<submission_id>[-<table name without spaces>]
    name_parts = ['HEPData-{0}'.format(kwargs.pop('submission_id'))]
    if 'table_name' in kwargs:
        name_parts.append(kwargs.pop('table_name').replace(' ', ''))
    filename = '-'.join(name_parts)

    output_path = os.path.join(current_app.config['CFG_TMPDIR'], filename)
    converter_output = output_path + '-dir'

    if file_format == 'yaml':
        return send_file(data_resource.file_location, as_attachment=True)

    options = dict(
        input_format='yaml',
        output_format=file_format,
        table=table_name,
        filename=table_name.split('.')[0],
    )

    # Conversion is skipped (and treated as successful) when the output
    # path is already present.
    successful = os.path.exists(output_path) or convert(
        CFG_CONVERTER_URL,
        record_path,
        output=converter_output,
        options=options,
        extract=False,
    )

    if not successful:
        # Error occurred, the output is a HTML file
        return send_file(converter_output, as_attachment=True,
                         attachment_filename=filename + '.html')

    extracted_path = extract(filename + ".tar.gz", converter_output,
                             output_path + "." + file_format)
    file_to_send = get_file_in_directory(extracted_path, file_format)
    return send_file(file_to_send, as_attachment=True,
                     attachment_filename=filename + '.' + file_format)
def download_datatable(datasubmission, file_format, *args, **kwargs):
    """
    Download a particular data table given a ``datasubmission``.

    ``json`` requests are redirected to the record data URL, ``yaml`` is
    served from the stored file, and every other supported format is
    produced by calling ``convert``.

    :param datasubmission: data submission whose table is requested
    :param file_format: requested output format
    :param args: unused
    :param kwargs: must contain ``submission_id``; may contain ``table_name``
        and (for ``yoda``) ``rivet_analysis_name``
    :return: display_error or send_file depending on success of conversion
    """
    if file_format == 'json':
        return redirect('/record/data/{0}/{1}/{2}'.format(
            datasubmission.publication_recid, datasubmission.id,
            datasubmission.version))
    elif file_format not in CFG_SUPPORTED_FORMATS:
        return display_error(
            title="The " + file_format + " output format is not supported",
            description="This output format is not supported. " +
                        "Currently supported formats: " + str(CFG_SUPPORTED_FORMATS),
        )

    dataresource = DataResource.query.filter_by(
        id=datasubmission.data_file).one()

    record_path, table_name = os.path.split(dataresource.file_location)

    filename = 'HEPData-{0}-v{1}'.format(kwargs.pop('submission_id'),
                                         datasubmission.version)
    if 'table_name' in kwargs:
        # Make the table name usable as part of a file name.
        filename += '-' + kwargs.pop('table_name').replace(' ', '_').replace(
            '/', '_').replace('$', '').replace('\\', '')

    output_path = os.path.join(current_app.config['CFG_TMPDIR'], filename)

    if file_format == 'yaml':
        return send_file(dataresource.file_location, as_attachment=True,
                         attachment_filename=filename + '.yaml')

    options = {
        'input_format': 'yaml',
        'output_format': file_format,
        'table': table_name,
        'filename': table_name.split('.')[0],
        'validator_schema_version': '0.1.0',
    }

    hepsubmission = HEPSubmission.query.filter_by(
        publication_recid=datasubmission.publication_recid,
        version=datasubmission.version).first()

    if datasubmission.doi and hepsubmission.overall_status != 'sandbox':
        # Pass the DOI with its last '/'-separated component removed.
        options['hepdata_doi'] = datasubmission.doi.rsplit(
            '/', 1)[0].encode('ascii')

    if file_format == 'yoda':
        rivet_analysis_name = kwargs.pop('rivet_analysis_name', '')
        if rivet_analysis_name:
            options['rivet_analysis_name'] = rivet_analysis_name
        elif datasubmission.publication_inspire_id:
            record = get_record_contents(datasubmission.publication_recid)
            if record:
                # Check if this record has a Rivet analysis, then extract the
                # Rivet analysis name from the URL.
                if 'analyses' in record:
                    for analysis in record['analyses']:
                        if analysis['type'] == 'rivet':
                            options['rivet_analysis_name'] = analysis[
                                'analysis'].split('/')[-1]
                # Otherwise guess the Rivet analysis name using the
                # collaboration name, the creation year of the INSPIRE
                # record, and the INSPIRE ID.
                if 'rivet_analysis_name' not in options:
                    try:
                        year = parse(record['creation_date']).year
                    except Exception:
                        # Missing or unparsable creation date: fall back to
                        # the publication year. Deliberately not a bare
                        # ``except`` so that KeyboardInterrupt/SystemExit
                        # still propagate.
                        year = record['year']
                    options['rivet_analysis_name'] = '{0}_{1}_I{2}'.format(
                        ''.join(record['collaborations']).upper(), year,
                        datasubmission.publication_inspire_id)

    successful = convert(
        CFG_CONVERTER_URL,
        record_path,
        output=output_path + '-dir',
        options=options,
        extract=False,
    )

    if successful:
        new_path = output_path + "." + file_format
        new_path = extract(output_path + '-dir', new_path)
        # The converter output is no longer needed once it has been extracted.
        os.remove(output_path + '-dir')
        file_to_send = get_file_in_directory(new_path, file_format)
    else:
        # Error occurred, the output is a HTML file
        file_to_send = output_path + '-dir'
        file_format = 'html'

    return send_file(file_to_send, as_attachment=True,
                     attachment_filename=filename + '.' + file_format)
def download_datatable(datasubmission, file_format, *args, **kwargs):
    """
    Download a particular data table given a ``datasubmission``.

    ``json`` requests are redirected to the record data URL, ``yaml`` and
    ``original`` are served from the stored file, and every other supported
    format is produced by calling ``convert``.

    :param datasubmission: data submission whose table is requested
    :param file_format: requested output format
    :param args: unused
    :param kwargs: must contain ``submission_id``; may contain ``table_name``
        and (for ``yoda``) ``rivet_analysis_name``
    :return: display_error or send_file depending on success of conversion
    """
    if file_format == 'json':
        json_url = '/record/data/{0}/{1}/{2}'.format(
            datasubmission.publication_recid,
            datasubmission.id,
            datasubmission.version,
        )
        return redirect(json_url)

    if file_format not in CFG_SUPPORTED_FORMATS:
        return display_error(
            title="The " + file_format + " output format is not supported",
            description="This output format is not supported. " +
                        "Currently supported formats: " + str(CFG_SUPPORTED_FORMATS),
        )

    dataresource = DataResource.query.filter_by(id=datasubmission.data_file).one()
    record_path, table_name = os.path.split(dataresource.file_location)

    filename = 'HEPData-{0}-v{1}'.format(kwargs.pop('submission_id'),
                                         datasubmission.version)
    if 'table_name' in kwargs:
        # Make the table name usable as part of a file name.
        cleaned_name = kwargs.pop('table_name')
        cleaned_name = cleaned_name.replace(' ', '_').replace('/', '_')
        cleaned_name = cleaned_name.replace('$', '').replace('\\', '')
        filename += '-' + cleaned_name

    output_path = os.path.join(current_app.config['CFG_TMPDIR'], filename)
    converter_output = output_path + '-dir'

    if file_format in ('yaml', 'original'):
        return send_file(
            dataresource.file_location,
            as_attachment=True,
            attachment_filename=filename + '.yaml'
        )

    options = dict(
        input_format='yaml',
        output_format=file_format,
        table=table_name,
        filename=table_name.split('.')[0],
        validator_schema_version='0.1.0',
    )

    hepsubmission = HEPSubmission.query.filter_by(
        publication_recid=datasubmission.publication_recid,
        version=datasubmission.version,
    ).first()

    if datasubmission.doi and not hepsubmission.overall_status.startswith('sandbox'):
        # Pass the DOI with its last '/'-separated component removed.
        options['hepdata_doi'] = datasubmission.doi.rsplit('/', 1)[0]

    if file_format == 'yoda':
        # An explicitly supplied name wins; otherwise ask for a guess.
        rivet_analysis_name = (kwargs.pop('rivet_analysis_name', '')
                               or guess_rivet_analysis_name(hepsubmission))
        if rivet_analysis_name:
            options['rivet_analysis_name'] = rivet_analysis_name

    try:
        successful = convert(
            CFG_CONVERTER_URL,
            record_path,
            output=converter_output,
            options=options,
            extract=False,
            timeout=CFG_CONVERTER_TIMEOUT,
        )
    except Error as error:  # hepdata_converter_ws_client.Error
        return display_error(title='Report concerns to [email protected]',
                             description=str(error))

    if not successful:
        # Error occurred, the output is a HTML file
        return send_file(converter_output, as_attachment=True,
                         attachment_filename=filename + '.html')

    extracted_path = extract(converter_output, output_path + "." + file_format)
    # The converter output is no longer needed once it has been extracted.
    os.remove(converter_output)
    file_to_send = get_file_in_directory(extracted_path, file_format)

    return send_file(file_to_send, as_attachment=True,
                     attachment_filename=filename + '.' + file_format)
def process_zip_archive(file, id):
    """
    Save an uploaded submission file under a time-stamped directory, unpack
    it via a temporary staging directory, and pass the resulting submission
    directory on for processing.

    * ``*.oldhepdata`` - saved into an ``oldhepdata`` sub-directory and
      converted with ``check_and_convert_from_oldhepdata``;
    * ``*.yaml`` - a single YAML file, split with ``split_files``;
    * anything else - treated as an archive and unpacked with ``extract``.

    For the last two cases the files are first written to a temporary
    directory under ``CFG_TMPDIR`` and then copied to the submission
    directory (``xrdcp`` in production mode, ``cp`` otherwise). The
    temporary directory is always removed afterwards, including when an
    error is returned or the copy command fails.

    :param file: uploaded file object exposing ``filename`` and ``save(path)``
    :param id: record identifier, used as a path component and passed on
    :return: an error dictionary if splitting, extraction or conversion
        reports a problem, otherwise the result of
        ``process_submission_directory``
    :raises subprocess.CalledProcessError: if the copy command exits non-zero
    """
    filename = secure_filename(file.filename)
    time_stamp = str(int(round(time.time())))
    file_save_directory = os.path.join(current_app.config['CFG_DATADIR'],
                                       str(id), time_stamp)

    if not os.path.exists(file_save_directory):
        os.makedirs(file_save_directory)

    if not filename.endswith('.oldhepdata'):
        file_path = os.path.join(file_save_directory, filename)
        print('Saving file to {}'.format(file_path))
        file.save(file_path)

        submission_path = os.path.join(file_save_directory,
                                       remove_file_extension(filename))
        submission_temp_path = tempfile.mkdtemp(
            dir=current_app.config["CFG_TMPDIR"])

        try:
            if filename.endswith('.yaml'):
                # we split the singular yaml file and create a submission directory
                error, _ = split_files(file_path, submission_temp_path)
                if error:
                    return {
                        "Single YAML file splitter": [{
                            "level": "error",
                            "message": str(error)
                        }]
                    }
            else:
                # we are dealing with a zip, tar, etc. so we extract the contents
                if not extract(file_path, submission_temp_path):
                    return {
                        "Archive file extractor": [{
                            "level": "error",
                            "message": "{} is not a valid zip or tar archive file.".format(
                                file_path)
                        }]
                    }

            if not os.path.exists(submission_path):
                os.makedirs(submission_path)

            # Move files from submission_temp_path to submission_path
            # (try to avoid problems with EOS disk).
            if current_app.config.get('PRODUCTION_MODE', False):
                # production instance at CERN
                copy_command = ['xrdcp', '-N', '-f']
                copy_submission_path = submission_path.replace(
                    current_app.config['CFG_DATADIR'],
                    current_app.config['EOS_DATADIR'])
            else:
                # local instance
                copy_command = ['cp']
                copy_submission_path = submission_path

            print('Copying with: {} -r {} {}'.format(' '.join(copy_command),
                                                     submission_temp_path + '/.',
                                                     copy_submission_path))
            subprocess.check_output(
                copy_command + ['-r', submission_temp_path + '/.', copy_submission_path])
        finally:
            # The staging directory is only needed until its contents have
            # been copied; remove it on every exit path so failed uploads
            # do not accumulate under CFG_TMPDIR.
            rmtree(submission_temp_path, ignore_errors=True)

        submission_found = find_file_in_directory(
            submission_path, lambda x: x == "submission.yaml")
    else:
        file_path = os.path.join(file_save_directory, 'oldhepdata')
        if not os.path.exists(file_path):
            os.makedirs(file_path)

        saved_file = os.path.join(file_path, filename)
        print('Saving file to {}'.format(saved_file))
        file.save(saved_file)

        submission_found = False

    if submission_found:
        basepath, submission_file_path = submission_found
    else:
        # No submission.yaml present: fall back to the old-format converter.
        result = check_and_convert_from_oldhepdata(file_path, id, time_stamp)

        # A dictionary (of any dict subclass) carries error messages.
        if isinstance(result, dict):
            return result
        basepath, submission_file_path = result

    return process_submission_directory(basepath, submission_file_path, id)
def process_zip_archive(file_path, id, old_submission_schema=False, old_data_schema=False):
    """
    Unpack an already-saved submission file and pass the resulting
    submission directory on for processing.

    * ``*.oldhepdata`` - converted with ``check_and_convert_from_oldhepdata``;
      ``old_data_schema`` is then forced to ``True``.
    * ``*.yaml.gz`` - decompressed next to the original file, after which
      this function is called again on the resulting ``.yaml`` file.
    * ``*.yaml`` - a single YAML file, split with ``split_files``.
    * anything else - treated as an archive and unpacked with ``extract``.

    For the last two cases the files are first written to a temporary
    directory under ``CFG_TMPDIR`` and then handed to ``move_files``.

    :param file_path: path of the saved upload
    :param id: record identifier, passed on to the processing helpers
    :param old_submission_schema: passed through to ``process_submission_directory``
    :param old_data_schema: passed through to ``process_submission_directory``
    :return: an error dictionary if any step reports a problem, otherwise
        the result of ``process_submission_directory``
    """
    import shutil  # local import: only used to discard the staging directory on failure

    file_save_directory, filename = os.path.split(file_path)

    if not filename.endswith('.oldhepdata'):
        if filename.endswith('.yaml.gz'):
            # Handled before the staging directory is created: the recursive
            # call makes its own, so creating one here would only leak it.
            print('Extracting: {} to {}'.format(file_path, file_path[:-3]))
            if not extract(file_path, file_path[:-3]):
                return {
                    "Archive file extractor": [{
                        "level": "error",
                        "message": "{} is not a valid .gz file.".format(file_path)
                    }]
                }
            # NOTE(review): old_data_schema is reset to False here instead of
            # being forwarded - kept as in the original; confirm this is intended.
            return process_zip_archive(file_path[:-3], id,
                                       old_submission_schema=old_submission_schema,
                                       old_data_schema=False)

        submission_path = os.path.join(file_save_directory,
                                       remove_file_extension(filename))
        submission_temp_path = tempfile.mkdtemp(dir=current_app.config["CFG_TMPDIR"])

        if filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            error, _ = split_files(file_path, submission_temp_path)
            if error:
                shutil.rmtree(submission_temp_path, ignore_errors=True)
                return {
                    "Single YAML file splitter": [{
                        "level": "error",
                        "message": str(error)
                    }]
                }
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            try:
                unzipped_path = extract(file_path, submission_temp_path)
            except Exception:
                # Any extraction failure is reported to the caller as an
                # invalid archive below.
                unzipped_path = None

            if not unzipped_path:
                shutil.rmtree(submission_temp_path, ignore_errors=True)
                return {
                    "Archive file extractor": [{
                        "level": "error",
                        "message": "{} is not a valid zip or tar archive file.".format(file_path)
                    }]
                }

        copy_errors = move_files(submission_temp_path, submission_path)
        if copy_errors:
            return copy_errors

        submission_found = find_file_in_directory(submission_path,
                                                  lambda x: x == "submission.yaml")
        if not submission_found:
            return {
                "Archive file extractor": [{
                    "level": "error",
                    "message": "No submission.yaml file has been found in the archive."
                }]
            }

        basepath, submission_file_path = submission_found
    else:
        # The converter is given the parent of the directory holding the
        # file; the last component of that parent is used as the time stamp.
        file_dir = os.path.dirname(file_save_directory)
        time_stamp = os.path.split(file_dir)[1]
        result = check_and_convert_from_oldhepdata(file_dir, id, time_stamp)

        # A dictionary (of any dict subclass) carries error messages.
        if isinstance(result, dict):
            return result
        basepath, submission_file_path = result
        old_data_schema = True

    return process_submission_directory(basepath, submission_file_path, id,
                                        old_data_schema=old_data_schema,
                                        old_submission_schema=old_submission_schema)
def download_datatable(datasubmission, file_format, *args, **kwargs):
    """
    Download a particular data table given a ``datasubmission``.

    ``json`` requests are redirected to the record data URL, ``yaml`` is
    served from the stored file, and every other supported format is
    produced by calling ``convert`` unless the output path already exists.

    :param datasubmission: data submission whose table is requested
    :param file_format: requested output format
    :param args: unused
    :param kwargs: must contain ``submission_id``; may contain ``table_name``
    :return: display_error or send_file depending on success of conversion
    """
    if file_format == 'json':
        json_url = '/record/data/{0}/{1}/{2}'.format(
            datasubmission.publication_recid,
            datasubmission.id,
            datasubmission.version,
        )
        return redirect(json_url)

    if file_format not in CFG_SUPPORTED_FORMATS:
        return display_error(
            title="The " + file_format + " output format is not supported",
            description="This output format is not supported. " +
                        "Currently supported formats: " + str(CFG_SUPPORTED_FORMATS),
        )

    dataresource = DataResource.query.filter_by(
        id=datasubmission.data_file).one()
    record_path, table_name = os.path.split(dataresource.file_location)

    # Download name: HEPData-<submission_id>-v<version>[-<table name without spaces>]
    name_parts = ['HEPData-{0}-v{1}'.format(kwargs.pop('submission_id'),
                                            datasubmission.version)]
    if 'table_name' in kwargs:
        name_parts.append(kwargs.pop('table_name').replace(' ', ''))
    filename = '-'.join(name_parts)

    output_path = os.path.join(current_app.config['CFG_TMPDIR'], filename)
    converter_output = output_path + '-dir'

    if file_format == 'yaml':
        return send_file(dataresource.file_location, as_attachment=True,
                         attachment_filename=filename + '.yaml')

    options = dict(
        input_format='yaml',
        output_format=file_format,
        table=table_name,
        filename=table_name.split('.')[0],
    )

    if datasubmission.doi:
        # Pass the DOI with its last '/'-separated component removed.
        options['hepdata_doi'] = datasubmission.doi.rsplit(
            '/', 1)[0].encode('ascii')

    if datasubmission.publication_inspire_id and file_format == 'yoda':
        record = get_record_contents(datasubmission.publication_recid)
        if record:
            # Analysis name built from collaboration(s), year and INSPIRE ID.
            collaboration = ''.join(record['collaborations']).upper()
            options['rivet_analysis_name'] = '{0}_{1}_I{2}'.format(
                collaboration, record['year'],
                datasubmission.publication_inspire_id)

    # Conversion is skipped (and treated as successful) when the output
    # path is already present.
    successful = os.path.exists(output_path) or convert(
        CFG_CONVERTER_URL,
        record_path,
        output=converter_output,
        options=options,
        extract=False,
    )

    if not successful:
        # Error occurred, the output is a HTML file
        return send_file(converter_output, as_attachment=True,
                         attachment_filename=filename + '.html')

    extracted_path = extract(filename + ".tar.gz", converter_output,
                             output_path + "." + file_format)
    file_to_send = get_file_in_directory(extracted_path, file_format)
    return send_file(file_to_send, as_attachment=True,
                     attachment_filename=filename + '.' + file_format)