Example #1
def test_cleanup_index_all(app, load_default_data, identifiers, mocker):
    index = app.config.get('ELASTICSEARCH_INDEX')

    m = mocker.patch('fixes.cleanup_index.cleanup_index_batch')

    # No calls should be made at first, as there is only one version of each submission
    cleanup_index_all(index=index, synchronous=True)
    m.assert_not_called()
    m.reset_mock()

    # Create a new version for ins1283842
    new_submission = HEPSubmission(publication_recid=1,
                                   inspire_id=identifiers[0]["inspire_id"],
                                   version=2,
                                   overall_status='finished')
    db.session.add(new_submission)
    db.session.commit()
    # New id should be 3
    assert (new_submission.id == 3)

    # Cleanup should now clean up id 1
    cleanup_index_all(index=index, synchronous=True)
    m.assert_called_once_with([1], index)
    m.reset_mock()

    # Create more new versions
    new_submission1 = HEPSubmission(publication_recid=1,
                                    inspire_id=identifiers[0]["inspire_id"],
                                    version=3,
                                    overall_status='finished')
    db.session.add(new_submission1)
    new_submission2 = HEPSubmission(publication_recid=1,
                                    inspire_id=identifiers[0]["inspire_id"],
                                    version=4,
                                    overall_status='todo')
    db.session.add(new_submission2)
    new_submission3 = HEPSubmission(publication_recid=16,
                                    inspire_id=identifiers[1]["inspire_id"],
                                    version=2,
                                    overall_status='finished')
    db.session.add(new_submission3)
    db.session.commit()
    assert (new_submission1.id == 4)
    assert (new_submission2.id == 5)
    assert (new_submission3.id == 6)

    # Cleanup should now clean up ids 1, 2 and 3 (i.e. versions lower than the highest finished version)
    cleanup_index_all(index=index, synchronous=True)
    m.assert_called_once_with([1, 2, 3], index)
    m.reset_mock()

    # Check batch size works
    cleanup_index_all(index=index, batch=2, synchronous=True)
    m.assert_has_calls([call([1, 2], index), call([3], index)])
    m.reset_mock()

    cleanup_index_all(index=index, batch=1, synchronous=True)
    m.assert_has_calls([call([1], index), call([2], index), call([3], index)])
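
The batch splitting exercised by the last two calls amounts to chunking the stale ids into consecutive slices. A minimal sketch of that behaviour (the helper name is hypothetical; the real logic lives in fixes.cleanup_index):

def _chunks(ids, batch):
    # Yield consecutive slices of at most `batch` ids.
    for start in range(0, len(ids), batch):
        yield ids[start:start + batch]

assert list(_chunks([1, 2, 3], 2)) == [[1, 2], [3]]
assert list(_chunks([1, 2, 3], 1)) == [[1], [2], [3]]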
Example #2
def test_status_reset_error(app, mocker, caplog):
    """
    Test that an error is logged if something goes wrong in the status reset
    :return:
    """
    caplog.set_level(logging.ERROR)
    base_dir = os.path.dirname(os.path.realpath(__file__))
    hepsubmission = HEPSubmission(publication_recid=12345,
                                  overall_status='processing',
                                  version=1)
    db.session.add(hepsubmission)
    db.session.commit()

    assert (hepsubmission.overall_status == 'processing')

    mocker.patch('hepdata.modules.records.api.process_zip_archive',
                 side_effect=Exception("Something went wrong"))
    mocker.patch('hepdata.modules.records.api.cleanup_submission',
                 side_effect=Exception("Could not clean up the submission"))

    zip_file = os.path.join(base_dir, 'test_data/TestHEPSubmission.zip')
    process_saved_file(zip_file, 12345, 1, '', 'todo')

    # After the initial failure, overall_status could not be reset
    assert (hepsubmission.overall_status == 'processing')
    assert (len(caplog.records) == 1)

    assert (caplog.records[0].levelname == "ERROR")
    assert (caplog.records[0].msg ==
            "Exception while cleaning up: Could not clean up the submission")
Example #3
def test_submission_too_big(app, mocker):
    """
    Test the right thing happens when the submission data is too big
    :return:
    """

    base_dir = os.path.dirname(os.path.realpath(__file__))

    hepsubmission = HEPSubmission(publication_recid=12345,
                                  overall_status='todo',
                                  version=1)
    db.session.add(hepsubmission)
    db.session.commit()

    # Patch the app config to reduce the max upload size
    mocker.patch.dict('flask.current_app.config', {'CONVERT_MAX_SIZE': 1000})

    test_directory = os.path.join(base_dir, 'test_data/test_submission')
    errors = process_submission_directory(
        test_directory, os.path.join(test_directory, 'submission.yaml'), 12345)

    assert ('Archive' in errors)
    assert (len(errors['Archive']) == 1)
    assert (errors['Archive'][0]['level'] == 'error')
    assert (errors['Archive'][0]['message'].startswith(
        "Archive is too big for conversion to other formats."))
Example #4
def test_old_submission_yaml(app, admin_idx):
    """
    Test we can validate against the old submission schema (for use when importing)
    :return:
    """

    base_dir = os.path.dirname(os.path.realpath(__file__))

    hepsubmission = HEPSubmission(publication_recid=12345,
                                  overall_status='todo',
                                  version=1)
    db.session.add(hepsubmission)
    db.session.commit()

    directory = os.path.join(base_dir, 'test_data/test_v0_submission')

    # This should fail against current schema
    errors = process_submission_directory(
        directory, os.path.join(directory, 'submission.yaml'), 12345)
    assert ('submission.yaml' in errors)
    assert (len(errors['submission.yaml']) == 1)
    assert (errors['submission.yaml'][0]['level'] == 'error')
    assert (errors['submission.yaml'][0]['message'].decode().startswith(
        "Invalid value (in GeV) for cmenergies: 1.383-1.481"))

    # Use old schema - should now work
    errors = process_submission_directory(directory,
                                          os.path.join(directory,
                                                       'submission.yaml'),
                                          12345,
                                          old_submission_schema=True)
    assert (errors == {})
Example #5
def create_new_version(recid, user, notify_uploader=True, uploader_message=None):
    hepsubmission = get_latest_hepsubmission(publication_recid=recid)

    if hepsubmission.overall_status == 'finished':
        # Reopen the submission to allow for revisions,
        # by creating a new HEPSubmission object.
        _rev_hepsubmission = HEPSubmission(publication_recid=recid,
                                           overall_status='todo',
                                           inspire_id=hepsubmission.inspire_id,
                                           coordinator=hepsubmission.coordinator,
                                           version=hepsubmission.version + 1)
        db.session.add(_rev_hepsubmission)
        db.session.commit()

        if notify_uploader:
            uploaders = SubmissionParticipant.query.filter_by(
                role='uploader', publication_recid=recid, status='primary'
                )
            record_information = get_record_by_id(recid)
            for uploader in uploaders:
                send_cookie_email(uploader,
                                  record_information,
                                  message=uploader_message,
                                  version=_rev_hepsubmission.version)

        return jsonify({'success': True, 'version': _rev_hepsubmission.version})
    else:
        return jsonify({"message": f"Rec id {recid} is not finished so cannot create a new version"}), 400
Example #6
def process_payload(recid, file, redirect_url, synchronous=False):
    """Process an uploaded file

    :param recid: int
        The id of the record to update
    :param file: file
        The file to process
    :param redirect_url: string
        Redirect URL to record, for use if the upload fails or in synchronous mode
    :param synchronous: bool
        Whether to process the file immediately (True; only recommended for
        tests) rather than asynchronously via celery (False, the default)
    :return: JSONResponse either containing 'url' (for success cases) or
             'message' (for error cases, which will give a 400 error).
    """

    if file and (allowed_file(file.filename)):
        file_path = save_zip_file(file, recid)
        hepsubmission = get_latest_hepsubmission(publication_recid=recid)

        if hepsubmission.overall_status == 'finished':
            # If it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions,
            # by creating a new HEPSubmission object.
            _rev_hepsubmission = HEPSubmission(
                publication_recid=recid,
                overall_status='todo',
                inspire_id=hepsubmission.inspire_id,
                coordinator=hepsubmission.coordinator,
                version=hepsubmission.version + 1)
            db.session.add(_rev_hepsubmission)
            hepsubmission = _rev_hepsubmission

        previous_status = hepsubmission.overall_status
        hepsubmission.overall_status = 'sandbox_processing' if previous_status == 'sandbox' else 'processing'
        db.session.add(hepsubmission)
        db.session.commit()

        if synchronous:
            process_saved_file(file_path, recid, current_user.get_id(),
                               redirect_url, previous_status)
        else:
            process_saved_file.delay(file_path, recid, current_user.get_id(),
                                     redirect_url, previous_status)
            flash(
                'File saved. You will receive an email when the file has been processed.',
                'info')

        return jsonify({'url': redirect_url.format(recid)})
    else:
        return jsonify({
            "message":
            "You must upload a .zip, .tar, .tar.gz or .tgz file" +
            " (or a .oldhepdata or single .yaml or .yaml.gz file)."
        }), 400
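
Note that redirect_url is treated as a format string (redirect_url.format(recid) above). A hypothetical caller, with an assumed blueprint and form field name:

from flask import Blueprint, request

bp = Blueprint('uploads', __name__)  # hypothetical blueprint

@bp.route('/record/<int:recid>/upload', methods=['POST'])
def upload(recid):
    file = request.files.get('hep_archive')  # hypothetical form field name
    return process_payload(recid, file, '/record/{}')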
Example #7
def _create_new_versions(version, expected_range):
    # Create new HEPSubmission and DataSubmissions for ins1283842
    new_hep_submission = HEPSubmission(
        publication_recid=1,
        inspire_id=identifiers[0]["inspire_id"],
        version=version,
        overall_status='finished')
    db.session.add(new_hep_submission)
    db.session.commit()
    new_data_submissions = []
    for i in range(5):
        new_data_submission = DataSubmission(publication_recid=1,
                                             associated_recid=1,
                                             version=version)
        db.session.add(new_data_submission)
        new_data_submissions.append(new_data_submission)
    db.session.commit()
    assert [x.id for x in new_data_submissions] == expected_range
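
A hypothetical invocation: assuming five DataSubmission rows already exist, the next auto-incremented ids would be 6 through 10, so the call would be:

_create_new_versions(version=2, expected_range=list(range(6, 11)))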
Example #8
def get_or_create_hepsubmission(recid, coordinator=1, status="todo"):
    """
    Gets of creates a new HEPSubmission record
    :param recid: the publication record id
    :param coordinator: the user id of the user who owns this record
    :param status: e.g. todo, finished.
    :return: the newly created HEPSubmission object
    """
    hepsubmission = HEPSubmission.query.filter_by(publication_recid=recid).first()

    if hepsubmission is None:
        hepsubmission = HEPSubmission(publication_recid=recid,
                                      coordinator=coordinator,
                                      overall_status=status)

        db.session.add(hepsubmission)
        db.session.commit()

    return hepsubmission
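
Because the helper returns the existing row when one is found, repeated calls with the same recid are idempotent:

first = get_or_create_hepsubmission(12345, coordinator=1)
second = get_or_create_hepsubmission(12345)
assert first.id == second.id  # the second call found the existing row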
Example #9
def test_find_submission_data_file_path(app):
    data_dir = app.config['CFG_DATADIR']
    expected_file_name = 'HEPData-987654321-v2-yaml.zip'
    old_file_path = data_dir + '/987654321/' + expected_file_name
    new_file_path = data_dir + '/8a/987654321/' + expected_file_name
    # The new-style path includes a two-character bucket directory (8a)

    if os.path.exists(new_file_path):
        os.remove(new_file_path)

    # No new-format file exists yet, so the old path should be returned
    submission = HEPSubmission(publication_recid=987654321, version=2)
    assert (find_submission_data_file_path(submission) == old_file_path)

    # Create an empty file at the new-style path
    os.makedirs(data_dir + '/8a/987654321/', exist_ok=True)
    with open(new_file_path, 'w'):
        pass

    assert (find_submission_data_file_path(submission) == new_file_path)
Example #10
def test_duplicate_table_names(app):
    """
    Test that an error is returned for a submission.yaml file with duplicate table names.
    """

    base_dir = os.path.dirname(os.path.realpath(__file__))

    hepsubmission = HEPSubmission(publication_recid=12345,
                                  overall_status='todo',
                                  version=1)
    db.session.add(hepsubmission)
    db.session.commit()

    directory = os.path.join(base_dir, 'test_data/test_duplicate_table_names')
    errors = process_submission_directory(
        directory, os.path.join(directory, 'submission.yaml'), 12345)

    assert ('submission.yaml' in errors)
    assert (len(errors['submission.yaml']) == 2)
    for error in errors['submission.yaml']:
        assert (error['level'] == 'error')
        assert (error['message'].startswith("Duplicate table with name"))
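
Detecting the duplicates asserted above is a straightforward scan over the declared table names. A minimal sketch of the general technique (not necessarily the project's exact implementation):

from collections import Counter

names = ['Table 1', 'Table 1', 'Table 2']
duplicates = [name for name, count in Counter(names).items() if count > 1]
assert duplicates == ['Table 1']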
Example #11
def test_status_reset(app, mocker):
    """
    Test that the status is reset if something unexpected goes wrong
    :return:
    """

    base_dir = os.path.dirname(os.path.realpath(__file__))
    hepsubmission = HEPSubmission(publication_recid=12345,
                                  overall_status='processing',
                                  version=1)
    db.session.add(hepsubmission)
    db.session.commit()

    assert (hepsubmission.overall_status == 'processing')

    mocker.patch('hepdata.modules.records.api.process_zip_archive',
                 side_effect=Exception("Something went wrong"))

    zip_file = os.path.join(base_dir, 'test_data/TestHEPSubmission.zip')
    process_saved_file(zip_file, 12345, 1, '', 'todo')

    # After initial failure, overall_status should be reset to 'todo'
    assert (hepsubmission.overall_status == 'todo')
Example #12
def test_invalid_data_yaml(app, admin_idx):
    """
    Test the right thing happens when a data yaml file is invalid
    :return:
    """

    base_dir = os.path.dirname(os.path.realpath(__file__))

    hepsubmission = HEPSubmission(publication_recid=12345,
                                  overall_status='todo',
                                  version=1)
    db.session.add(hepsubmission)
    db.session.commit()

    directory = os.path.join(base_dir, 'test_data/test_invalid_data_file')
    errors = process_submission_directory(
        directory, os.path.join(directory, 'submission.yaml'), 12345)

    assert ('data1.yaml' in errors)
    assert (len(errors['data1.yaml']) == 1)
    assert (errors['data1.yaml'][0]['level'] == 'error')
    assert (errors['data1.yaml'][0]['message'].startswith(
        "There was a problem parsing the file"))
Example #13
def process_submission_directory(basepath,
                                 submission_file_path,
                                 recid,
                                 update=False,
                                 *args,
                                 **kwargs):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.
    :param basepath:
    :param submission_file_path:
    :param recid:
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:
        submission_file = open(submission_file_path, 'r')

        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        data_file_validator = DataFileValidator()

        if is_valid_submission_file:
            try:
                submission_processed = yaml.load_all(submission_file,
                                                     Loader=yaml.CSafeLoader)
            except AttributeError:
                # yaml.CSafeLoader is unavailable when PyYAML is built
                # without libyaml; fall back to the pure-Python safe loader.
                submission_processed = yaml.safe_load_all(submission_file)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                # No existing submission: create a fresh one at version 1.
                hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    coordinator=kwargs.get('user_id',
                                           int(current_user.get_id())),
                    version=1)
                db.session.add(hepsubmission)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=hepsubmission.inspire_id,
                    coordinator=hepsubmission.coordinator,
                    version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission, update)

            for yaml_document in submission_processed:
                if 'record_ids' in yaml_document or 'comment' in yaml_document or 'modifications' in yaml_document:
                    # comments are only present in the general submission
                    # information document.
                    process_general_submission_info(basepath, yaml_document,
                                                    recid)
                else:
                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)

                    added_file_names.append(yaml_document["name"])

                    if existing_datasubmission_query.count() == 0:
                        datasubmission = DataSubmission(
                            publication_recid=recid,
                            name=encode_string(yaml_document["name"]),
                            description=encode_string(
                                yaml_document["description"]),
                            version=hepsubmission.version)

                    else:
                        datasubmission = existing_datasubmission_query.one()
                        datasubmission.description = encode_string(
                            yaml_document["description"])

                    db.session.add(datasubmission)

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    if data_file_validator.validate(file_path=main_file_path):
                        process_data_file(recid, hepsubmission.version,
                                          basepath, yaml_document,
                                          datasubmission, main_file_path)
                    else:
                        errors = process_validation_errors_for_display(
                            data_file_validator.get_messages())

                        data_file_validator.clear_messages()

            cleanup_submission(recid, hepsubmission.version, added_file_names)

            db.session.commit()

            if not errors:
                package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(recid, hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)
        else:
            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())

            submission_file_validator.clear_messages()
            data_file_validator.clear_messages()
    else:
        # return an error
        errors = {
            "submission.yaml": [{
                "level":
                "error",
                "message":
                "No submission.yaml file found in submission."
            }]
        }
        return errors

    # We return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found in one file.
    return errors
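
The return value maps file names to lists of {'level', 'message'} entries, as the tests earlier on this page assert. A short sketch of how a caller might report them (variable names assumed):

errors = process_submission_directory(basepath, submission_yaml_path, recid)
for filename, messages in errors.items():
    for message in messages:
        print(f"{filename}: [{message['level']}] {message['message']}")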
Example #14
def process_submission_directory(basepath,
                                 submission_file_path,
                                 recid,
                                 update=False,
                                 *args,
                                 **kwargs):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.

    :param basepath:
    :param submission_file_path:
    :param recid:
    :param update:
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:

        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        if is_valid_submission_file:

            submission_file = open(submission_file_path, 'r')
            submission_processed = yaml.load_all(submission_file,
                                                 Loader=Loader)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                # No existing submission: create a fresh one at version 1.
                hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    coordinator=kwargs.get('user_id',
                                           int(current_user.get_id())),
                    version=1)
                db.session.add(hepsubmission)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=hepsubmission.inspire_id,
                    coordinator=hepsubmission.coordinator,
                    version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission, update)

            no_general_submission_info = True

            data_file_validator = DataFileValidator()

            # Delete all data records associated with this submission.
            # Fixes problems with ordering where the table names are changed between uploads.
            # See https://github.com/HEPData/hepdata/issues/112
            # Side effect that reviews will be deleted between uploads.
            cleanup_submission(recid, hepsubmission.version, added_file_names)

            for yaml_document_index, yaml_document in enumerate(
                    submission_processed):
                if not yaml_document:
                    continue

                # Check for presence of local files given as additional_resources.
                if 'additional_resources' in yaml_document:
                    for resource in yaml_document['additional_resources']:
                        location = os.path.join(basepath, resource['location'])
                        if not resource['location'].startswith(
                            ('http', '/resource/')):
                            if not os.path.isfile(location):
                                errors[resource['location']] = [{
                                    "level":
                                    "error",
                                    "message":
                                    "Missing 'additional_resources' file from uploaded archive."
                                }]
                            elif '/' in resource['location']:
                                errors[resource['location']] = [{
                                    "level":
                                    "error",
                                    "message":
                                    "Location of 'additional_resources' file should not contain '/'."
                                }]

                if not yaml_document_index and 'name' not in yaml_document:

                    no_general_submission_info = False
                    process_general_submission_info(basepath, yaml_document,
                                                    recid)

                elif not all(k in yaml_document
                             for k in ('name', 'description', 'keywords',
                                       'data_file')):

                    errors["submission.yaml"] = [{
                        "level":
                        "error",
                        "message":
                        "YAML document with index {} ".format(
                            yaml_document_index) +
                        "missing one or more required keys (name, description, keywords, data_file)."
                    }]

                else:

                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)

                    added_file_names.append(yaml_document["name"])

                    try:
                        if existing_datasubmission_query.count() == 0:
                            datasubmission = DataSubmission(
                                publication_recid=recid,
                                name=encode_string(yaml_document["name"]),
                                description=encode_string(
                                    yaml_document["description"]),
                                version=hepsubmission.version)
                        else:
                            datasubmission = existing_datasubmission_query.one()
                            datasubmission.description = encode_string(
                                yaml_document["description"])
                        db.session.add(datasubmission)
                    except SQLAlchemyError as sqlex:
                        errors[yaml_document["data_file"]] = [{
                            "level":
                            "error",
                            "message":
                            str(sqlex)
                        }]
                        db.session.rollback()
                        continue

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    data, ex = _eos_fix_read_data(main_file_path)

                    if not data or ex is not None:

                        errors[yaml_document["data_file"]] = \
                            [{"level": "error", "message": "There was a problem parsing the file.\n" + str(ex)}]

                    elif '/' in yaml_document["data_file"]:

                        errors[yaml_document["data_file"]] = \
                            [{"level": "error", "message": "Name of data_file should not contain '/'.\n"}]

                    else:

                        if data_file_validator.validate(
                                file_path=main_file_path, data=data):
                            try:
                                process_data_file(recid, hepsubmission.version,
                                                  basepath, yaml_document,
                                                  datasubmission,
                                                  main_file_path)
                            except SQLAlchemyError as sqlex:
                                errors[yaml_document["data_file"]] = [{
                                    "level":
                                    "error",
                                    "message":
                                    "There was a problem processing the file.\n"
                                    + str(sqlex)
                                }]
                                db.session.rollback()
                        else:
                            errors = process_validation_errors_for_display(
                                data_file_validator.get_messages())
                            data_file_validator.clear_messages()

                        if yaml_document["data_file"] not in errors:
                            # Check that the length of the 'values' list is consistent
                            # for each of the independent_variables and dependent_variables.
                            indep_count = [
                                len(indep['values'])
                                for indep in data['independent_variables']
                            ]
                            dep_count = [
                                len(dep['values'])
                                for dep in data['dependent_variables']
                            ]
                            if len(set(indep_count + dep_count)
                                   ) > 1:  # if more than one unique count
                                errors.setdefault(
                                    yaml_document["data_file"], []
                                ).append({
                                    "level":
                                    "error",
                                    "message":
                                    "Inconsistent length of 'values' list:\n" +
                                    "independent_variables{}, dependent_variables{}"
                                    .format(str(indep_count), str(dep_count))
                                })

            submission_file.close()

            if no_general_submission_info:
                hepsubmission.last_updated = datetime.now()
                db.session.add(hepsubmission)
                db.session.commit()

            # The line below is commented out since it does not preserve the order of tables.
            # Delete all tables above instead: side effect of deleting reviews between uploads.
            #cleanup_submission(recid, hepsubmission.version, added_file_names)

            db.session.commit()

            if not errors:
                errors = package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(
                    publication_recid=recid, version=hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)

            else:  # delete all tables if errors
                cleanup_submission(recid, hepsubmission.version, {})

        else:

            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())
            submission_file_validator.clear_messages()

    else:
        # return an error
        errors = {
            "submission.yaml": [{
                "level":
                "error",
                "message":
                "No submission.yaml file found in submission."
            }]
        }
        return errors

    # We return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found in one file.
    return errors