Python extract Examples, hepdata.utils.file_extractor.extract Python Examples

Example #1

0

Show file

def test_file_extractor(app):
    """
    Run ``extract`` on the zip, tar and tar.gz sample archives.

    For each archive the test asserts that the extraction directory exists
    afterwards and that ``get_file_in_directory`` finds a ``yaml`` entry
    inside it.

    :param app: application object providing ``app_context()`` and
        ``config['CFG_TMPDIR']``.
    """
    # Each entry names an archive in test_data and the directory (below
    # CFG_TMPDIR) it is extracted into.
    archives = [
        {'file': '1396331.zip', 'extract_as': '1396331'},
        {'file': '1396331.tar', 'extract_as': '1396331-tar'},
        {'file': '1396331.tar.gz', 'extract_as': '1396331-targz'},
    ]

    with app.app_context():
        here = os.path.dirname(os.path.realpath(__file__))
        test_data_directory = os.path.join(here, 'test_data')

        for archive in archives:
            archive_name = archive['file']
            target_dir = os.path.join(app.config['CFG_TMPDIR'],
                                      archive['extract_as'])
            archive_path = os.path.join(test_data_directory, archive_name)

            extract(archive_name, archive_path, target_dir)
            assert os.path.exists(target_dir)

            yaml_file = get_file_in_directory(target_dir, 'yaml')
            assert yaml_file is not None

Example #2

0

Show file

def process_zip_archive(file, id):
    """
    Save an uploaded submission file and pass its contents on for processing.

    The upload is written below ``CFG_DATADIR/<id>/<timestamp>``.  A file
    whose name ends in ``.yaml`` is split with ``split_files``, a file whose
    name ends in ``.oldhepdata`` is saved into an ``oldhepdata``
    sub-directory, and anything else is handed to ``extract``.

    :param file: uploaded file object providing ``filename`` and ``save()``.
    :param id: identifier used for the storage directory and forwarded to
        the conversion and processing helpers.
    :return: an error dict when ``split_files`` reports an error or when
        ``check_and_convert_from_oldhepdata`` returns a dict; otherwise the
        result of ``process_submission_directory``.
    """
    filename = secure_filename(file.filename)
    time_stamp = str(int(round(time.time())))
    file_save_directory = os.path.join(current_app.config['CFG_DATADIR'],
                                       str(id), time_stamp)

    if not os.path.exists(file_save_directory):
        os.makedirs(file_save_directory)

    if not filename.endswith('.oldhepdata'):
        file_path = os.path.join(file_save_directory, filename)
        file.save(file_path)

        submission_path = os.path.join(file_save_directory,
                                       remove_file_extension(filename))
        if filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            # (the second element of the returned pair is not needed here)
            error, _ = split_files(file_path, submission_path)
            if error:
                return {
                    "Single YAML file splitter": [{
                        "level": "error",
                        "message": str(error)
                    }]
                }

        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            extract(filename, file_path, submission_path)

        submission_found = find_file_in_directory(
            submission_path, lambda x: x == "submission.yaml")
    else:
        # The oldhepdata directory is both where the upload is stored and
        # the path given to the converter below.
        submission_path = os.path.join(file_save_directory, 'oldhepdata')
        if not os.path.exists(submission_path):
            os.makedirs(submission_path)

        save_target = os.path.join(submission_path, filename)
        print('Saving file to {}'.format(save_target))
        file.save(save_target)

        submission_found = False

    if submission_found:
        basepath, submission_file_path = submission_found
    else:
        result = check_and_convert_from_oldhepdata(submission_path, id,
                                                   time_stamp)

        # A dict result is an error report; isinstance also covers dict
        # subclasses, which an exact type comparison would try to unpack.
        if isinstance(result, dict):
            return result
        basepath, submission_file_path = result

    return process_submission_directory(basepath, submission_file_path, id)

Example #3

0

Show file

File: api.py Project: HEPData/hepdata3

def process_zip_archive(file, id):
    """
    Save an uploaded submission file and pass its contents on for processing.

    The upload is written below ``CFG_DATADIR/<id>/<timestamp>``.  A name
    containing ``.oldhepdata`` is stored in an ``oldhepdata`` sub-directory
    (with a trailing ``.txt`` removed from the stored name).  Otherwise a
    name ending in ``.yaml`` is split with ``split_files`` and any other
    file is handed to ``extract``.

    :param file: uploaded file object providing ``filename`` and ``save()``.
    :param id: identifier used for the storage directory and forwarded to
        the conversion and processing helpers.
    :return: the dict returned by ``check_and_convert_from_oldhepdata`` when
        it returns one; otherwise the result of
        ``process_submission_directory``.
    """
    filename = secure_filename(file.filename)
    time_stamp = str(int(round(time.time())))
    file_save_directory = os.path.join(current_app.config['CFG_DATADIR'], str(id), time_stamp)

    if not os.path.exists(file_save_directory):
        os.makedirs(file_save_directory)

    if '.oldhepdata' not in filename:
        file_path = os.path.join(file_save_directory, filename)
        file.save(file_path)

        submission_path = os.path.join(file_save_directory, remove_file_extension(filename))
        # Test the extension rather than a substring, so that an archive
        # whose name merely contains "yaml" (e.g. "yaml_tables.zip") is
        # extracted instead of being fed to the YAML splitter.
        if filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            split_files(file_path, submission_path)
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            extract(filename, file_path, submission_path)

        submission_found = find_file_in_directory(submission_path,
                                                  lambda x: x == "submission.yaml")
    else:
        # The oldhepdata directory is both where the upload is stored and
        # the path given to the converter below.
        submission_path = os.path.join(file_save_directory, 'oldhepdata')
        if not os.path.exists(submission_path):
            os.makedirs(submission_path)

        if filename.endswith('.txt'):
            # Drop only the trailing suffix; str.replace would also remove
            # any ".txt" occurring earlier in the name.
            filename = filename[:-len('.txt')]

        save_target = os.path.join(submission_path, filename)
        print('Saving file to {}'.format(save_target))
        file.save(save_target)

        submission_found = False

    if submission_found:
        basepath, submission_file_path = submission_found
    else:
        result = check_and_convert_from_oldhepdata(submission_path, id,
                                                   time_stamp)

        # A dict result is an error report; isinstance also covers dict
        # subclasses, which an exact type comparison would try to unpack.
        if isinstance(result, dict):
            return result
        basepath, submission_file_path = result

    return process_submission_directory(basepath, submission_file_path, id)

Example #4

0

Show file

File: views.py Project: HEPData/hepdata3

def download_datatable(data_resource, file_format, *args, **kwargs):
    """
    Send a data table to the client in the requested format.

    ``yaml`` requests are answered with the resource file itself.  Any other
    format goes through ``convert``; on success the converted output is
    unpacked with ``extract`` and the matching file is sent, otherwise the
    converter output is sent with an ``html`` extension.

    :param data_resource: object whose ``file_location`` is the table file.
    :param file_format: requested output format.
    :param args: unused.
    :param kwargs: must contain ``submission_id``; may contain ``table_name``.
    :return: the ``send_file`` response.
    """
    record_path, table_name = os.path.split(data_resource.file_location)

    name_parts = ['HEPData-{0}'.format(kwargs.pop('submission_id'))]
    if 'table_name' in kwargs:
        name_parts.append(kwargs.pop('table_name').replace(' ', ''))
    filename = '-'.join(name_parts)

    output_path = os.path.join(current_app.config['CFG_TMPDIR'], filename)
    converter_output = output_path + '-dir'

    if file_format == 'yaml':
        return send_file(
            data_resource.file_location,
            as_attachment=True,
        )

    options = {
        'input_format': 'yaml',
        'output_format': file_format,
        'table': table_name,
        'filename': table_name.split('.')[0],
    }

    # NOTE(review): the existence check is on output_path, while convert()
    # writes to output_path + '-dir' and extract() targets
    # output_path + '.' + file_format -- confirm this is the intended path.
    successful = True
    if not os.path.exists(output_path):
        successful = convert(
            CFG_CONVERTER_URL,
            record_path,
            output=converter_output,
            options=options,
            extract=False,
        )

    if not successful:
        # Error occurred, the output is a HTML file
        file_to_send = converter_output
        file_format = 'html'
    else:
        extract_target = output_path + "." + file_format
        extracted_path = extract(filename + ".tar.gz", converter_output,
                                 extract_target)
        file_to_send = get_file_in_directory(extracted_path, file_format)

    download_name = filename + '.' + file_format
    return send_file(file_to_send, as_attachment=True,
                     attachment_filename=download_name)

Example #5

0

Show file

def download_datatable(datasubmission, file_format, *args, **kwargs):
    """
    Download a particular data table given a ``datasubmission``.

    ``json`` requests are redirected to the record data URL and ``yaml``
    requests are answered with the stored resource file.  Every other
    supported format is produced by ``convert``; on success the converter
    output is unpacked with ``extract`` and then deleted.

    :param datasubmission: data submission whose table is requested.
    :param file_format: requested output format.
    :param args: unused.
    :param kwargs: must contain ``submission_id``; may contain ``table_name``
        and ``rivet_analysis_name``.
    :return: display_error or send_file depending on success of conversion
    """

    if file_format == 'json':
        return redirect('/record/data/{0}/{1}/{2}'.format(
            datasubmission.publication_recid, datasubmission.id,
            datasubmission.version))
    elif file_format not in CFG_SUPPORTED_FORMATS:
        return display_error(
            title="The " + file_format + " output format is not supported",
            description="This output format is not supported. " +
            "Currently supported formats: " + str(CFG_SUPPORTED_FORMATS),
        )

    dataresource = DataResource.query.filter_by(
        id=datasubmission.data_file).one()

    record_path, table_name = os.path.split(dataresource.file_location)

    filename = 'HEPData-{0}-v{1}'.format(kwargs.pop('submission_id'),
                                         datasubmission.version)
    if 'table_name' in kwargs:
        # Spaces and slashes become underscores; '$' and backslashes are
        # dropped from the download name.
        filename += '-' + kwargs.pop('table_name').replace(' ', '_').replace(
            '/', '_').replace('$', '').replace('\\', '')

    output_path = os.path.join(current_app.config['CFG_TMPDIR'], filename)

    if file_format == 'yaml':
        return send_file(dataresource.file_location,
                         as_attachment=True,
                         attachment_filename=filename + '.yaml')

    options = {
        'input_format': 'yaml',
        'output_format': file_format,
        'table': table_name,
        'filename': table_name.split('.')[0],
        'validator_schema_version': '0.1.0',
    }

    hepsubmission = HEPSubmission.query.filter_by(
        publication_recid=datasubmission.publication_recid,
        version=datasubmission.version).first()

    if datasubmission.doi and hepsubmission.overall_status != 'sandbox':
        # Everything before the last '/' of the table DOI is passed on.
        options['hepdata_doi'] = datasubmission.doi.rsplit(
            '/', 1)[0].encode('ascii')

    if file_format == 'yoda':
        rivet_analysis_name = kwargs.pop('rivet_analysis_name', '')
        if rivet_analysis_name:
            options['rivet_analysis_name'] = rivet_analysis_name
        elif datasubmission.publication_inspire_id:
            record = get_record_contents(datasubmission.publication_recid)
            if record:
                # Check if this record has a Rivet analysis, then extract the Rivet analysis name from the URL.
                if 'analyses' in record:
                    for analysis in record['analyses']:
                        if analysis['type'] == 'rivet':
                            options['rivet_analysis_name'] = analysis[
                                'analysis'].split('/')[-1]
                # Otherwise guess the Rivet analysis name using the collaboration name,
                # the creation year of the INSPIRE record, and the INSPIRE ID.
                if 'rivet_analysis_name' not in options:
                    try:
                        year = parse(record['creation_date']).year
                    except Exception:
                        # Missing or unparsable creation date: fall back to
                        # the publication year.  Catching Exception rather
                        # than using a bare except lets KeyboardInterrupt
                        # and SystemExit propagate.
                        year = record['year']  # publication year
                    options['rivet_analysis_name'] = '{0}_{1}_I{2}'.format(
                        ''.join(record['collaborations']).upper(), year,
                        datasubmission.publication_inspire_id)

    successful = convert(
        CFG_CONVERTER_URL,
        record_path,
        output=output_path + '-dir',
        options=options,
        extract=False,
    )

    if successful:
        new_path = output_path + "." + file_format
        new_path = extract(output_path + '-dir', new_path)
        # The converter output is no longer needed once it is unpacked.
        os.remove(output_path + '-dir')
        file_to_send = get_file_in_directory(new_path, file_format)
    else:
        # Error occurred, the output is a HTML file
        file_to_send = output_path + '-dir'
        file_format = 'html'

    return send_file(file_to_send,
                     as_attachment=True,
                     attachment_filename=filename + '.' + file_format)

Example #6

0

Show file

def download_datatable(datasubmission, file_format, *args, **kwargs):
    """
    Serve one data table of ``datasubmission`` in ``file_format``.

    ``json`` is answered with a redirect, ``yaml`` and ``original`` with the
    stored resource file, and every other supported format with the result
    of running ``convert`` on the record directory.

    :param datasubmission: data submission whose table is requested.
    :param file_format: requested output format.
    :param args: unused.
    :param kwargs: must contain ``submission_id``; may contain ``table_name``
        and ``rivet_analysis_name``.
    :return: display_error or send_file depending on success of conversion
    """
    if file_format == 'json':
        json_url = '/record/data/{0}/{1}/{2}'.format(
            datasubmission.publication_recid,
            datasubmission.id,
            datasubmission.version)
        return redirect(json_url)

    if file_format not in CFG_SUPPORTED_FORMATS:
        return display_error(
            title="The " + file_format + " output format is not supported",
            description="This output format is not supported. " +
                        "Currently supported formats: " + str(CFG_SUPPORTED_FORMATS),
        )

    dataresource = DataResource.query.filter_by(id=datasubmission.data_file).one()
    record_path, table_name = os.path.split(dataresource.file_location)

    filename = 'HEPData-{0}-v{1}'.format(kwargs.pop('submission_id'), datasubmission.version)
    if 'table_name' in kwargs:
        table_label = kwargs.pop('table_name')
        # Applied in order: spaces and slashes become underscores, '$' and
        # backslashes are dropped.
        for old, new in ((' ', '_'), ('/', '_'), ('$', ''), ('\\', '')):
            table_label = table_label.replace(old, new)
        filename += '-' + table_label

    output_path = os.path.join(current_app.config['CFG_TMPDIR'], filename)
    converter_output = output_path + '-dir'

    if file_format in ('yaml', 'original'):
        return send_file(
            dataresource.file_location,
            as_attachment=True,
            attachment_filename=filename + '.yaml'
        )

    options = {
        'input_format': 'yaml',
        'output_format': file_format,
        'table': table_name,
        'filename': table_name.split('.')[0],
        'validator_schema_version': '0.1.0',
    }

    hepsubmission = HEPSubmission.query.filter_by(
        publication_recid=datasubmission.publication_recid,
        version=datasubmission.version).first()

    if datasubmission.doi and not hepsubmission.overall_status.startswith('sandbox'):
        options['hepdata_doi'] = datasubmission.doi.rsplit('/', 1)[0]

    if file_format == 'yoda':
        # An explicitly requested name wins; otherwise fall back to a guess.
        rivet_analysis_name = (kwargs.pop('rivet_analysis_name', '')
                               or guess_rivet_analysis_name(hepsubmission))
        if rivet_analysis_name:
            options['rivet_analysis_name'] = rivet_analysis_name

    try:
        successful = convert(
            CFG_CONVERTER_URL,
            record_path,
            output=converter_output,
            options=options,
            extract=False,
            timeout=CFG_CONVERTER_TIMEOUT,
        )
    except Error as error:  # hepdata_converter_ws_client.Error
        return display_error(title='Report concerns to [email protected]', description=str(error))

    if not successful:
        # Error occurred, the output is a HTML file
        file_to_send = converter_output
        file_format = 'html'
    else:
        extracted_path = extract(converter_output, output_path + "." + file_format)
        os.remove(converter_output)
        file_to_send = get_file_in_directory(extracted_path, file_format)

    download_name = filename + '.' + file_format
    return send_file(file_to_send, as_attachment=True,
                     attachment_filename=download_name)

Example #7

0

Show file

def process_zip_archive(file, id):
    """
    Save an uploaded submission file and pass its contents on for processing.

    The upload is written below ``CFG_DATADIR/<id>/<timestamp>``.  Unless
    the name ends in ``.oldhepdata``, the contents are first split (for
    ``.yaml``) or extracted (anything else) into a temporary directory below
    ``CFG_TMPDIR`` and then copied into the submission directory with an
    external copy command.  The temporary directory is always removed
    afterwards, including when splitting, extraction or copying fails.

    :param file: uploaded file object providing ``filename`` and ``save()``.
    :param id: identifier used for the storage directory and forwarded to
        the conversion and processing helpers.
    :return: an error dict when splitting or extraction fails or when
        ``check_and_convert_from_oldhepdata`` returns a dict; otherwise the
        result of ``process_submission_directory``.
    :raises subprocess.CalledProcessError: if the copy command exits with a
        non-zero status.
    """
    filename = secure_filename(file.filename)
    time_stamp = str(int(round(time.time())))
    file_save_directory = os.path.join(current_app.config['CFG_DATADIR'],
                                       str(id), time_stamp)

    if not os.path.exists(file_save_directory):
        os.makedirs(file_save_directory)

    if not filename.endswith('.oldhepdata'):
        file_path = os.path.join(file_save_directory, filename)
        print('Saving file to {}'.format(file_path))
        file.save(file_path)

        submission_path = os.path.join(file_save_directory,
                                       remove_file_extension(filename))
        submission_temp_path = tempfile.mkdtemp(
            dir=current_app.config["CFG_TMPDIR"])

        try:
            if filename.endswith('.yaml'):
                # we split the singular yaml file and create a submission directory
                # (the second element of the returned pair is not needed here)
                error, _ = split_files(file_path, submission_temp_path)
                if error:
                    return {
                        "Single YAML file splitter": [{
                            "level": "error",
                            "message": str(error)
                        }]
                    }

            else:
                # we are dealing with a zip, tar, etc. so we extract the contents
                if not extract(file_path, submission_temp_path):
                    return {
                        "Archive file extractor": [{
                            "level":
                            "error",
                            "message":
                            "{} is not a valid zip or tar archive file.".format(
                                file_path)
                        }]
                    }

            if not os.path.exists(submission_path):
                os.makedirs(submission_path)

            # Move files from submission_temp_path to submission_path (try to avoid problems with EOS disk).
            if current_app.config.get('PRODUCTION_MODE',
                                      False):  # production instance at CERN
                copy_command = ['xrdcp', '-N', '-f']
                copy_submission_path = submission_path.replace(
                    current_app.config['CFG_DATADIR'],
                    current_app.config['EOS_DATADIR'])
            else:  # local instance
                copy_command = ['cp']
                copy_submission_path = submission_path

            # The trailing '/.' copies the directory contents rather than
            # the directory itself.
            copy_source = submission_temp_path + '/.'
            print('Copying with: {} -r {} {}'.format(' '.join(copy_command),
                                                     copy_source,
                                                     copy_submission_path))
            subprocess.check_output(
                copy_command + ['-r', copy_source, copy_submission_path])
        finally:
            # Runs on success, on the early error returns above and when the
            # copy raises, so the staging directory is never left behind.
            rmtree(submission_temp_path, ignore_errors=True)

        submission_found = find_file_in_directory(
            submission_path, lambda x: x == "submission.yaml")

    else:
        file_path = os.path.join(file_save_directory, 'oldhepdata')
        if not os.path.exists(file_path):
            os.makedirs(file_path)

        save_target = os.path.join(file_path, filename)
        print('Saving file to {}'.format(save_target))
        file.save(save_target)

        submission_found = False

    if submission_found:
        basepath, submission_file_path = submission_found
    else:
        result = check_and_convert_from_oldhepdata(file_path, id, time_stamp)

        # A dict result is an error report; isinstance also covers dict
        # subclasses, which an exact type comparison would try to unpack.
        if isinstance(result, dict):
            return result
        basepath, submission_file_path = result

    return process_submission_directory(basepath, submission_file_path, id)

Example #8

0

Show file

def process_zip_archive(file_path, id, old_submission_schema=False,
                        old_data_schema=False):
    """
    Unpack an already-saved submission file and process its contents.

    ``.yaml.gz`` files are decompressed next to the original and the
    function calls itself on the result.  ``.yaml`` files are split with
    ``split_files`` and other archives are unpacked with ``extract``; both
    write into a temporary directory below ``CFG_TMPDIR`` whose contents are
    then handed to ``move_files``.  ``.oldhepdata`` files are converted with
    ``check_and_convert_from_oldhepdata``.

    :param file_path: path of the saved upload.
    :param id: identifier forwarded to the conversion and processing helpers.
    :param old_submission_schema: forwarded to
        ``process_submission_directory``.
    :param old_data_schema: forwarded to ``process_submission_directory``;
        forced to ``True`` after a successful ``.oldhepdata`` conversion.
    :return: an error dict on failure, otherwise the result of
        ``process_submission_directory``.
    """
    # Function-scope import: only needed to clean up the staging directory.
    import shutil

    (file_save_directory, filename) = os.path.split(file_path)

    if not filename.endswith('.oldhepdata'):
        if filename.endswith('.yaml.gz'):
            # Handled before the staging directory is created, because this
            # branch never uses one.
            gunzipped_path = file_path[:-3]  # strip the trailing ".gz"
            print('Extracting: {} to {}'.format(file_path, gunzipped_path))
            if not extract(file_path, gunzipped_path):
                return {
                    "Archive file extractor": [{
                        "level": "error", "message": "{} is not a valid .gz file.".format(file_path)
                    }]
                }
            # NOTE(review): the caller's old_data_schema is not forwarded
            # here; False is always passed -- confirm this is intended.
            return process_zip_archive(gunzipped_path, id,
                                       old_submission_schema=old_submission_schema,
                                       old_data_schema=False)

        submission_path = os.path.join(file_save_directory, remove_file_extension(filename))
        submission_temp_path = tempfile.mkdtemp(dir=current_app.config["CFG_TMPDIR"])

        if filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            # (the second element of the returned pair is not needed here)
            error, _ = split_files(file_path, submission_temp_path)
            if error:
                # Do not leave the staging directory behind on failure.
                shutil.rmtree(submission_temp_path, ignore_errors=True)
                return {
                    "Single YAML file splitter": [{
                        "level": "error",
                        "message": str(error)
                    }]
                }
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            try:
                unzipped_path = extract(file_path, submission_temp_path)
            except Exception:
                # Any extraction failure is reported as an invalid archive.
                unzipped_path = None

            if not unzipped_path:
                # Do not leave the staging directory behind on failure.
                shutil.rmtree(submission_temp_path, ignore_errors=True)
                return {
                    "Archive file extractor": [{
                        "level": "error", "message": "{} is not a valid zip or tar archive file.".format(file_path)
                    }]
                }

        copy_errors = move_files(submission_temp_path, submission_path)
        if copy_errors:
            return copy_errors

        submission_found = find_file_in_directory(submission_path, lambda x: x == "submission.yaml")

        if not submission_found:
            return {
                "Archive file extractor": [{
                    "level": "error", "message": "No submission.yaml file has been found in the archive."
                }]
            }

        basepath, submission_file_path = submission_found

    else:
        # The converter is given the parent of the directory holding the
        # file; that parent's name is passed on as the time stamp.
        file_dir = os.path.dirname(file_save_directory)
        time_stamp = os.path.split(file_dir)[1]
        result = check_and_convert_from_oldhepdata(file_dir, id, time_stamp)

        # A dict result is an error report; isinstance also covers dict
        # subclasses, which an exact type comparison would try to unpack.
        if isinstance(result, dict):
            return result
        basepath, submission_file_path = result
        old_data_schema = True

    return process_submission_directory(basepath, submission_file_path, id,
                                        old_data_schema=old_data_schema,
                                        old_submission_schema=old_submission_schema)

Example #9

0

Show file

def download_datatable(datasubmission, file_format, *args, **kwargs):
    """
    Serve one data table of ``datasubmission`` in ``file_format``.

    ``json`` is answered with a redirect and ``yaml`` with the stored
    resource file.  Other supported formats go through ``convert``; on
    success the converted output is unpacked with ``extract`` and the
    matching file is sent, otherwise the converter output is sent with an
    ``html`` extension.

    :param datasubmission: data submission whose table is requested.
    :param file_format: requested output format.
    :param args: unused.
    :param kwargs: must contain ``submission_id``; may contain ``table_name``.
    :return: a redirect, ``display_error`` or ``send_file`` response.
    """
    if file_format == 'json':
        json_url = '/record/data/{0}/{1}/{2}'.format(
            datasubmission.publication_recid, datasubmission.id,
            datasubmission.version)
        return redirect(json_url)

    if file_format not in CFG_SUPPORTED_FORMATS:
        return display_error(
            title="The " + file_format + " output format is not supported",
            description="This output format is not supported. " +
            "Currently supported formats: " + str(CFG_SUPPORTED_FORMATS),
        )

    dataresource = DataResource.query.filter_by(
        id=datasubmission.data_file).one()
    record_path, table_name = os.path.split(dataresource.file_location)

    name_parts = ['HEPData-{0}-v{1}'.format(kwargs.pop('submission_id'),
                                            datasubmission.version)]
    if 'table_name' in kwargs:
        name_parts.append(kwargs.pop('table_name').replace(' ', ''))
    filename = '-'.join(name_parts)

    output_path = os.path.join(current_app.config['CFG_TMPDIR'], filename)
    converter_output = output_path + '-dir'

    if file_format == 'yaml':
        return send_file(dataresource.file_location,
                         as_attachment=True,
                         attachment_filename=filename + '.yaml')

    options = {
        'input_format': 'yaml',
        'output_format': file_format,
        'table': table_name,
        'filename': table_name.split('.')[0],
    }

    doi = datasubmission.doi
    if doi:
        # Everything before the last '/' of the table DOI is passed on.
        options['hepdata_doi'] = doi.rsplit('/', 1)[0].encode('ascii')

    if datasubmission.publication_inspire_id:
        if file_format == 'yoda':
            record = get_record_contents(datasubmission.publication_recid)
            if record:
                collaboration = ''.join(record['collaborations']).upper()
                options['rivet_analysis_name'] = '{0}_{1}_I{2}'.format(
                    collaboration, record['year'],
                    datasubmission.publication_inspire_id)

    # NOTE(review): the existence check is on output_path, while convert()
    # writes to output_path + '-dir' and extract() targets
    # output_path + '.' + file_format -- confirm this is the intended path.
    successful = True
    if not os.path.exists(output_path):
        successful = convert(
            CFG_CONVERTER_URL,
            record_path,
            output=converter_output,
            options=options,
            extract=False,
        )

    if not successful:
        # Error occurred, the output is a HTML file
        file_to_send = converter_output
        file_format = 'html'
    else:
        extract_target = output_path + "." + file_format
        extracted_path = extract(filename + ".tar.gz", converter_output,
                                 extract_target)
        file_to_send = get_file_in_directory(extracted_path, file_format)

    download_name = filename + '.' + file_format
    return send_file(file_to_send,
                     as_attachment=True,
                     attachment_filename=download_name)