Code example #1 (score: 0)
File: cli.py — Project: islahudinees/hepdata
def get_data_path(record_id=None, inspire_id=None):
    """Gets the file path where data files for the given record are stored."""
    if record_id:
        # Verify that a submission actually exists for this record id.
        if get_latest_hepsubmission(publication_recid=record_id) is None:
            click.echo("No record with id %s" % record_id)
            return
    elif inspire_id:
        submission = get_latest_hepsubmission(inspire_id=inspire_id)
        if submission is None:
            click.echo("No record with inspire id %s" % inspire_id)
            return
        # Resolve the inspire id to its publication record id.
        record_id = submission.publication_recid
        click.echo("Inspire ID %s maps to record id %s" %
                   (inspire_id, record_id))
    else:
        # Neither identifier supplied: nothing to look up.
        click.echo("Please provide either record-id or inspire-id.")
        return

    click.echo("Files for record %s are at:\t\t %s" %
               (record_id, data_files.get_data_path_for_record(record_id)))
    click.echo("Converted files for record %s are at:\t %s" %
               (record_id, data_files.get_converted_directory_path(record_id)))
Code example #2 (score: 0)
def package_submission(basepath, recid, hep_submission_obj):
    """
    Zips up a submission directory. This is in advance of its download
    for example by users.

    :param basepath: path of directory containing all submission files
    :param recid: the publication record ID
    :param hep_submission_obj: the HEPSubmission object representing
           the overall position
    """
    record_path = get_data_path_for_record(str(recid))
    if not os.path.exists(record_path):
        os.makedirs(record_path)

    # Unfinalised submissions carry version 0; archive them as version 1.
    version = 1 if hep_submission_obj.version == 0 else hep_submission_obj.version

    zip_name = current_app.config['SUBMISSION_FILE_NAME_PATTERN'] \
        .format(recid, version)
    zip_location = os.path.join(record_path, zip_name)
    # Replace any stale archive from a previous packaging run.
    if os.path.exists(zip_location):
        os.remove(zip_location)

    try:
        # make_archive appends the '.zip' extension, so strip it first.
        archive_base = os.path.splitext(zip_location)[0]
        shutil.make_archive(archive_base, 'zip', basepath)
        return {}
    except Exception as e:
        # Report the failure keyed by the intended archive location.
        return {zip_location: [{"level": "error", "message": str(e)}]}
Code example #3 (score: 0)
def check_and_convert_from_oldhepdata(input_directory, id, timestamp):
    """
    Check if the input directory contains a .oldhepdata file
    and convert it to YAML if one is found.

    :param input_directory: directory to search for a .oldhepdata file
    :param id: record id used to build the converted output path
    :param timestamp: upload timestamp (sub-directory of the record's path)
    :return: result of ``find_file_in_directory`` for the converted
             submission.yaml on success, or a dict of errors keyed by
             ``"Converter"`` on failure
    """
    converted_path = get_data_path_for_record(str(id), timestamp, 'yaml')

    oldhepdata_found = find_file_in_directory(
        input_directory,
        lambda x: x.endswith('.oldhepdata'),
    )
    if not oldhepdata_found:
        return {
            "Converter": [{
                "level": "error",
                "message":
                "No file with .oldhepdata extension has been found."
            }]
        }

    # Convert into a temporary directory first; only move the files into
    # place once the conversion has succeeded.
    converted_temp_dir = tempfile.mkdtemp(dir=current_app.config["CFG_TMPDIR"])
    converted_temp_path = os.path.join(converted_temp_dir, 'yaml')

    errormsg = None
    try:
        successful = convert_oldhepdata_to_yaml(oldhepdata_found[1],
                                                converted_temp_path)
        if not successful:
            # On failure the converter writes an HTML error page: parse the
            # error message from its title, removing the part of the string
            # after the final "//". Use a context manager so the file handle
            # is closed (the original code leaked it).
            with open(converted_temp_path) as error_file:
                soup = BeautifulSoup(error_file, "lxml")
            errormsg = soup.title.string.rsplit("//", 1)[0]

    except Error as error:  # hepdata_converter_ws_client.Error
        successful = False
        errormsg = str(error)

    if not successful:
        # Clean up the temporary conversion directory before reporting.
        shutil.rmtree(converted_temp_dir, ignore_errors=True)

        return {
            "Converter": [{
                "level": "error",
                "message":
                "The conversion from oldhepdata "
                "to the YAML format has not succeeded. "
                "Error message from converter follows:<br/><br/>" + errormsg
            }]
        }

    copy_errors = move_files(converted_temp_path, converted_path)
    if copy_errors:
        return copy_errors

    return find_file_in_directory(converted_path,
                                  lambda x: x == "submission.yaml")
Code example #4 (score: 0)
def save_zip_file(file, id):
    """
    Save an uploaded file under the record's timestamped data directory.

    :param file: uploaded file object (werkzeug ``FileStorage``-like,
           providing ``filename`` and ``save``)
    :param id: record id used to build the save path
    :return: full path of the saved file
    """
    # Sanitise the client-supplied filename before using it in a path.
    filename = secure_filename(file.filename)
    time_stamp = str(int(round(time.time())))
    file_save_directory = get_data_path_for_record(str(id), time_stamp)

    # .oldhepdata files go into their own sub-directory so the converter
    # can locate them later.
    if filename.endswith('.oldhepdata'):
        file_save_directory = os.path.join(file_save_directory, 'oldhepdata')

    # exist_ok avoids the race between the existence check and creation
    # that the original check-then-makedirs pattern had.
    os.makedirs(file_save_directory, exist_ok=True)
    file_path = os.path.join(file_save_directory, filename)

    print('Saving file to {}'.format(file_path))
    file.save(file_path)
    return file_path
Code example #5 (score: 0)
def create_original_with_resources(submission, data_filepath, output_path):
    """Copy or create 'original' zip file, i.e. yaml files with resources. If
    resources were imported from hepdata.cedar.ac.uk we create a new zip
    in a format that could be re-uploaded as a submission.

    :param type submission: HEPSubmission object
    :param type data_filepath: Path to original file
    :param type output_path: Path to output file (in converted dir)
    :return: None
    """
    resource_location = os.path.join(
        get_data_path_for_record(str(submission.publication_recid)),
        'resources'
    )
    if os.path.isdir(resource_location):
        # There is a resources directory from when this record was imported
        # from the old hepdata site. We need to create a new zip with the
        # contents of data_filepath and resources
        with tempfile.TemporaryDirectory(dir=current_app.config['CFG_TMPDIR']) as tmpdir:
            # Copy resources directory into 'contents' dir in temp directory.
            # copytree requires that contents_path does not yet exist.
            contents_path = os.path.join(tmpdir, 'contents')
            shutil.copytree(resource_location, contents_path)

            # Unzip data_filepath into contents path
            shutil.unpack_archive(data_filepath, contents_path)

            # Need to go through the submission file and update the paths so
            # that all resources are at the top level. This should allow the
            # zip to be re-uploaded or imported
            submission_found = find_file_in_directory(
                contents_path,
                lambda x: x == "submission.yaml"
            )
            if submission_found:
                # fileinput with inplace=True redirects stdout into the file
                # being edited, so each print() below rewrites one line.
                with fileinput.FileInput(submission_found[1], inplace=True) as file:
                    # Strip '/resource/.../': keep the indent + 'location: '
                    # prefix (group 1) and the bare filename (group 2).
                    p = re.compile(r'(\s+location: )\/resource\/.*\/([^\/]+)')
                    for line in file:
                        print(p.sub(r'\g<1>\g<2>', line), end='')

            # Zip up contents dir into a new file. make_archive appends the
            # format's extension itself, so pass the path without one.
            base, ext = os.path.splitext(output_path)
            zip_type = 'zip' if ext == '.zip' else 'gztar'
            print("Creating archive at %s" % output_path)
            shutil.make_archive(base, zip_type, contents_path)

    else:
        # No imported resources: the original upload is already in a
        # re-uploadable format, so just copy it to the output path.
        shutil.copy2(data_filepath, output_path)
Code example #6 (score: 0)
def test_create_submission(app, admin_idx):
    """
    Test the whole submission pipeline in loading a file, ensuring the HEPSubmission object is created,
    all the files have been added, and the record has been indexed.
    :return:
    """
    with app.app_context():

        admin_idx.recreate_index()

        # Test that creating the submission from a payload works.

        record = {
            'inspire_id': '19999999',
            'title': 'HEPData Testing 1',
            'reviewer': {
                'name': 'Testy McTester',
                'email': '*****@*****.**'
            },
            'uploader': {
                'name': 'Testy McTester',
                'email': '*****@*****.**'
            },
            'message': 'This is ready',
            'user_id': 1
        }

        hepdata_submission = process_submission_payload(**record)

        # A fresh submission starts at version 1 with status 'todo'.
        assert (hepdata_submission.version == 1)
        assert (hepdata_submission.overall_status == 'todo')

        # Test that uploading the submission files works.
        base_dir = os.path.dirname(os.path.realpath(__file__))

        # Copy the fixture submission into the record's timestamped data dir.
        test_directory = os.path.join(base_dir, 'test_data/test_submission')
        time_stamp = str(int(round(time.time())))
        directory = get_data_path_for_record(
            hepdata_submission.publication_recid, time_stamp)
        shutil.copytree(test_directory, directory)
        assert (os.path.exists(directory))

        process_submission_directory(
            directory, os.path.join(directory, 'submission.yaml'),
            hepdata_submission.publication_recid)

        admin_idx_results = admin_idx.search(
            term=hepdata_submission.publication_recid, fields=['recid'])
        assert (admin_idx_results is not None)

        # Expected counts match the contents of test_data/test_submission.
        data_submissions = DataSubmission.query.filter_by(
            publication_recid=hepdata_submission.publication_recid).count()
        assert (data_submissions == 8)
        assert (len(hepdata_submission.resources) == 4)
        assert (len(hepdata_submission.participants) == 4)

        do_finalise(hepdata_submission.publication_recid,
                    force_finalise=True,
                    convert=False)

        assert (record_exists(inspire_id=record['inspire_id']))

        # Test record is in index...
        index_records = get_records_matching_field('inspire_id',
                                                   record['inspire_id'],
                                                   doc_type='publication')
        assert (len(index_records['hits']['hits']) == 1)

        publication_record = get_record_contents(
            hepdata_submission.publication_recid)

        assert (publication_record is not None)

        # Check the submission renders into a display context correctly.
        ctx = format_submission(hepdata_submission.publication_recid,
                                publication_record, hepdata_submission.version,
                                1, hepdata_submission)

        assert (ctx is not None)

        assert (ctx['version'] == 1)
        assert (ctx['recid'] == hepdata_submission.publication_recid)

        # Remove the submission and test that everything is removed.

        unload_submission(hepdata_submission.publication_recid)

        assert (not record_exists(inspire_id=record['inspire_id']))

        data_submissions = DataSubmission.query.filter_by(
            publication_recid=hepdata_submission.publication_recid).count()

        assert (data_submissions == 0)

        # Allow time for the search index to catch up with the deletion.
        sleep(2)

        admin_idx_results = admin_idx.search(
            term=hepdata_submission.publication_recid, fields=['recid'])
        assert (len(admin_idx_results) == 0)

        # Check file dir has been deleted
        assert (not os.path.exists(directory))
Code example #7 (score: 0)
File: records_test.py — Project: islahudinees/hepdata
def test_upload_valid_file(app):
    # Test uploading and processing a file for a record, for both a
    # standard ('todo') submission and a sandbox submission. Standard
    # submissions gain a new version on re-upload once finished; sandbox
    # submissions are overwritten in place.
    with app.app_context():
        base_dir = os.path.dirname(os.path.realpath(__file__))

        for i, status in enumerate(["todo", "sandbox"]):
            user = User.query.first()
            login_user(user)

            # Distinct recid per iteration so the two runs don't collide.
            recid = f'12345{i}'
            get_or_create_hepsubmission(recid, 1, status=status)

            hepdata_submission = HEPSubmission.query.filter_by(
                publication_recid=recid).first()
            assert (hepdata_submission is not None)
            assert (hepdata_submission.data_abstract is None)
            assert (hepdata_submission.created <
                    hepdata_submission.last_updated)
            assert (hepdata_submission.version == 1)
            assert (hepdata_submission.overall_status == status)

            # Upload the fixture zip synchronously so processing completes
            # before the assertions below run.
            with open(
                    os.path.join(base_dir, 'test_data/TestHEPSubmission.zip'),
                    "rb") as stream:
                test_file = FileStorage(stream=stream,
                                        filename="TestHEPSubmission.zip")
                response = process_payload(recid,
                                           test_file,
                                           '/test_redirect_url',
                                           synchronous=True)

            assert (response.json == {'url': '/test_redirect_url'})

            # Check the submission has been updated
            hepdata_submission = HEPSubmission.query.filter_by(
                publication_recid=recid).first()
            assert (hepdata_submission.data_abstract.startswith(
                'CERN-LHC.  Measurements of the cross section  for ZZ production'
            ))
            assert (hepdata_submission.created <
                    hepdata_submission.last_updated)
            assert (hepdata_submission.version == 1)
            assert (hepdata_submission.overall_status == status)

            # Set the status to finished and try again, to check versioning
            if status == "todo":
                hepdata_submission.overall_status = 'finished'
                db.session.add(hepdata_submission)

            # Sleep before uploading new version to avoid dir name conflict
            # (upload directories are named by whole-second timestamps).
            sleep(1)

            # Refresh user
            user = User.query.first()
            login_user(user)

            # Upload a new version
            with open(
                    os.path.join(base_dir, 'test_data/TestHEPSubmission.zip'),
                    "rb") as stream:
                test_file = FileStorage(stream=stream,
                                        filename="TestHEPSubmission.zip")
                process_payload(recid,
                                test_file,
                                '/test_redirect_url',
                                synchronous=True)

            # Check the submission has been updated (overridden for a sandbox;
            # new version for normal submission)
            expected_versions = 2 if status == "todo" else 1
            hepdata_submissions = HEPSubmission.query.filter_by(
                publication_recid=recid).order_by(
                    HEPSubmission.last_updated).all()
            assert (len(hepdata_submissions) == expected_versions)
            assert (hepdata_submissions[0].version == 1)

            if status == "todo":
                assert (hepdata_submissions[0].overall_status == 'finished')

            assert (hepdata_submissions[-1].data_abstract.startswith(
                'CERN-LHC.  Measurements of the cross section  for ZZ production'
            ))
            assert (hepdata_submissions[-1].version == expected_versions)
            assert (hepdata_submissions[-1].overall_status == status)

            # Check that there are the expected number of subdirectories and
            # zip files under the record's main path
            # For status = 'todo' (standard submission) there will be 1 file
            # and 1 dir for each of 2 versions; for the sandbox submission
            # there will just be 1 file and 1 dir.
            directory = get_data_path_for_record(
                hepdata_submission.publication_recid)
            assert (os.path.exists(directory))
            filepaths = os.listdir(directory)
            assert (len(filepaths) == 2 * expected_versions)

            dir_count = 0
            file_count = 0
            for path in filepaths:
                if os.path.isdir(os.path.join(directory, path)):
                    # Upload directories are named by a 10-digit timestamp.
                    dir_count += 1
                    assert (re.match(r"\d{10}", path) is not None)
                else:
                    # Packaged archives follow the HEPData naming pattern.
                    file_count += 1
                    assert (re.match(r"HEPData-%s-v[12]-yaml.zip" % recid,
                                     path) is not None)

            assert (dir_count == expected_versions)
            assert (file_count == expected_versions)

            if status == "todo":
                # Delete the v2 submission and check db and v2 files have been removed
                unload_submission(hepdata_submission.publication_recid,
                                  version=2)

                hepdata_submissions = HEPSubmission.query.filter_by(
                    publication_recid=recid).order_by(
                        HEPSubmission.last_updated).all()
                assert (len(hepdata_submissions) == 1)
                assert (hepdata_submissions[0].version == 1)
                assert (hepdata_submissions[0].overall_status == 'finished')

                # Only v1's upload dir and archive should remain on disk.
                filepaths = os.listdir(directory)
                assert (len(filepaths) == 2)
                assert (f"HEPData-12345{i}-v1-yaml.zip" in filepaths)

            # Delete the submission and check everything has been removed
            unload_submission(hepdata_submission.publication_recid, version=1)

            hepdata_submissions = HEPSubmission.query.filter_by(
                publication_recid=recid).order_by(
                    HEPSubmission.last_updated).all()
            assert (len(hepdata_submissions) == 0)

            assert (not os.path.exists(directory))
Code example #8 (score: 0)
def test_get_data_path_for_record(app):
    """Check record data paths, with and without extra path components."""
    data_dir = app.config['CFG_DATADIR']
    expected_base = data_dir + '/96/ins12345'
    assert get_data_path_for_record('ins12345') == expected_base
    assert (get_data_path_for_record('ins12345', 'mysubdir', 'file.xyz')
            == expected_base + '/mysubdir/file.xyz')
Code example #9 (score: 0)
def mock_import_old_record(inspire_id=mock_inspire_ids[1], send_email=False):
    """Creates a submission but mimics the old migrated paths. (See hepdata
    master branch at ccd691b for old migrator module.)
    """
    # Only a fixed set of inspire ids have zipped test data available.
    if inspire_id not in mock_inspire_ids:
        raise ValueError('Invalid inspire id %s. Accepted values are: %s' %
                         (inspire_id, ', '.join(mock_inspire_ids)))

    # Fetch publication metadata and create the corresponding record.
    publication_information, status = get_inspire_record_information(
        inspire_id)
    publication_information["inspire_id"] = inspire_id

    if status != "success":
        log.error("Failed to retrieve publication information for " +
                  inspire_id)
        return False
    record_information = create_record(publication_information)

    # Unzip the canned test data into the record's data directory.
    data_path = get_data_path_for_record(record_information['recid'])
    zip_name = 'ins%s.zip' % inspire_id
    base_dir = os.path.dirname(os.path.realpath(__file__))
    zip_path = os.path.join(base_dir, 'old_hepdata_zips', zip_name)

    if not os.path.isfile(zip_path):
        log.error('Invalid path %s' % zip_path)
        return False

    log.info('Unzipping %s to %s' % (zip_path, data_path))
    shutil.unpack_archive(zip_path, data_path)
    # The outer zip contains a nested submission zip; extract it into a
    # timestamped sub-directory, mirroring an upload.
    time_stamp = str(int(round(time.time())))
    yaml_path = os.path.join(data_path, time_stamp)
    sub_zip_path = os.path.join(data_path, zip_name)
    shutil.unpack_archive(sub_zip_path, yaml_path)

    # Create the submission as the admin user and consume the payload.
    admin_user_id = 1
    get_or_create_hepsubmission(record_information["recid"], admin_user_id)

    errors = process_submission_directory(
        yaml_path,
        os.path.join(yaml_path, "submission.yaml"),
        record_information["recid"],
        old_submission_schema=True,
        old_data_schema=True)

    if errors:
        log.error(
            "Submission failed for {0}.".format(record_information["recid"]),
            errors, record_information["recid"])
        return False

    do_finalise(record_information['recid'],
                publication_record=record_information,
                force_finalise=True,
                convert=False,
                send_email=send_email)
Code example #10 (score: 0)
File: api.py — Project: islahudinees/hepdata
def _import_record(inspire_id,
                   update_existing=False,
                   base_url='https://hepdata.net',
                   send_email=False):
    """
    Import a record from another HEPData instance by inspire id: download
    its submission archive, process it (retrying with the old schema if
    needed) and finalise it. On any failure the record is unloaded again.

    :param inspire_id: inspire id of the record to import
    :param update_existing: if True, re-import over an existing submission
    :param base_url: HEPData instance to download the archive from
    :param send_email: passed through to do_finalise
    :return: True on success, False otherwise
    """
    publication_information, status = get_inspire_record_information(
        inspire_id)
    if status != "success":
        log.error("Failed to retrieve publication information for " +
                  inspire_id)
        return False

    current_submission = get_latest_hepsubmission(inspire_id=inspire_id)

    if not current_submission:
        # No local record yet: create one from the inspire metadata.
        log.info(
            "The record with id {0} does not exist in the database, so we're loading it."
            .format(inspire_id))
        publication_information["inspire_id"] = inspire_id
        record_information = create_record(publication_information)
        recid = record_information['recid']
    else:
        log.info("The record with inspire id {0} already exists.".format(
            inspire_id))
        if update_existing:
            log.info("Updating instead")
            recid = current_submission.publication_recid
        else:
            log.info("Not updating as update_existing is False")
            return False

    try:
        download_path = _download_file(base_url, inspire_id)

        filename = os.path.basename(download_path)

        # Copy the downloaded archive into a timestamped upload directory,
        # mirroring the layout of a normal user upload.
        time_stamp = str(int(round(time.time())))
        file_save_directory = get_data_path_for_record(str(recid), time_stamp)
        if not os.path.exists(file_save_directory):
            os.makedirs(file_save_directory)

        file_path = os.path.join(file_save_directory, filename)
        log.info("Moving file to %s" % file_path)
        shutil.copy(download_path, file_path)

        # Create submission
        admin_user_id = 1
        hepsubmission = get_or_create_hepsubmission(recid, admin_user_id)
        db.session.add(hepsubmission)
        db.session.commit()

        # Then process the payload as for any other record
        errors = process_zip_archive(file_path, recid)
        if errors:
            log.info("Errors processing archive. Re-trying with old schema.")
            # Try again with old schema
            # Need to clean up first to avoid errors
            # First delete tables
            cleanup_submission(recid, 1, [])
            # Next remove remaining files
            file_save_directory = os.path.dirname(file_path)
            submission_path = os.path.join(file_save_directory,
                                           remove_file_extension(filename))
            shutil.rmtree(submission_path)

            errors = process_zip_archive(file_path,
                                         recid,
                                         old_submission_schema=True,
                                         old_data_schema=True)

            if errors:
                # Both schemas failed: log each file's errors and bail out
                # via the except-handler below to unload the record.
                log.error("Could not process zip archive: ")
                for file, file_errors in errors.items():
                    log.error("    %s:" % file)
                    for error in file_errors:
                        log.error("        %s" % error['message'])

                raise ValueError("Could not validate record.")

        # Delete any previous upload folders
        cleanup_old_files(hepsubmission)

        log.info("Finalising record %s" % recid)

        result_json = do_finalise(recid,
                                  force_finalise=True,
                                  update=(current_submission is not None),
                                  convert=False,
                                  send_email=send_email)
        result = json.loads(result_json)

        if result and result['success']:
            log.info("Imported record %s with %s submissions" %
                     (recid, result['data_count']))
            return True
        else:
            raise ValueError("Failed to finalise record.")
    except Exception as e:
        # Unload record so a failed import leaves no partial state behind.
        unload_submission(recid)
        log.error(e)
        return False