Example 1
def delete_submission(recid):
    """
    Submissions can only be removed if they are not finalised,
    meaning they should never be in the index.
    Only delete the latest version of a submission.
    Delete indexed information only if version = 1.

    :param recid:
    :return:
    """
    if has_role(current_user, 'admin') or has_role(current_user, 'coordinator') \
        or check_is_sandbox_record(recid):

        submission = get_latest_hepsubmission(publication_recid=recid)
        unload_submission(recid, submission.version)

        if submission.version == 1:
            admin_idx = AdminIndexer()
            admin_idx.find_and_delete('recid', recid)

        return json.dumps({"success": True,
                           "recid": recid,
                           "errors": [
                               "Record successfully removed!"]})
    else:
        return json.dumps(
            {"success": False, "recid": recid,
             "errors": [
                 "You do not have permission to delete this submission. "
                 "Only coordinators can do that."]})
Example 2
def process_submission_payload(*args, **kwargs):
    """
    Processes the submission payload.

    :param inspire_id:
    :param title:
    :param reviewer:
    :param uploader:
    :param send_upload_email:
    :return:
    """
    if kwargs.get('inspire_id'):
        content, status = get_inspire_record_information(kwargs.get('inspire_id'))
        content["inspire_id"] = kwargs.get('inspire_id')
    elif kwargs.get('title'):
        content = {'title': kwargs.get('title')}
    else:
        raise ValueError("A title or inspire_id must be provided.")

    record_information = create_record(content)
    submitter_id = kwargs.get('submitter_id')
    if submitter_id is None:
        submitter_id = kwargs.get('user_id') if 'user_id' in kwargs else int(current_user.get_id())

    hepsubmission = get_or_create_hepsubmission(record_information["recid"], submitter_id)

    if kwargs.get('inspire_id'):
        hepsubmission.inspire_id = kwargs.get('inspire_id')
        db.session.add(hepsubmission)

    reviewer_details = kwargs.get('reviewer')

    reviewer = create_participant_record(
        reviewer_details.get('name'),
        reviewer_details.get('email'), 'reviewer', 'primary',
        record_information['recid'])
    hepsubmission.participants.append(reviewer)

    uploader_details = kwargs.get('uploader')
    uploader = create_participant_record(uploader_details.get('name'), uploader_details.get('email'),
                                         'uploader', 'primary',
                                         record_information['recid'])
    hepsubmission.participants.append(uploader)

    db.session.commit()

    if kwargs.get('send_upload_email', True):
        # Now Send Email only to the uploader first. The reviewer will be asked to
        # review only when an upload has been performed.
        message = kwargs.get('message', None)
        send_cookie_email(uploader, record_information, message)

    admin_idx = AdminIndexer()
    admin_idx.index_submission(hepsubmission)

    return hepsubmission
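
A hypothetical invocation of the helper above, purely for context; the values are placeholders, and an active Flask request context (for current_user) plus a database session are assumed:

# Minimal usage sketch with made-up values (not a real record).
hepsubmission = process_submission_payload(
    inspire_id='1234567',
    reviewer={'name': 'R. Reviewer', 'email': 'reviewer@example.com'},
    uploader={'name': 'U. Uploader', 'email': 'uploader@example.com'},
    send_upload_email=True)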
Example 3
def reindex():
    if has_role(current_user, 'admin'):
        reindex_all(recreate=True)
        push_data_keywords()
        admin_idx = AdminIndexer()
        admin_idx.reindex(recreate=True)
        return jsonify({"success": True})
    else:
        return jsonify({"success": False,
                        'message': "You don't have sufficient privileges to "
                                   "perform this action."})
Example 4
def find_duplicates_and_remove():
    """Will go through the application to find any duplicates then remove them."""
    inspire_ids = get_all_ids_in_current_system(prepend_id_with="")

    duplicates = []
    for inspire_id in inspire_ids:
        matches = get_records_matching_field('inspire_id', inspire_id,
                                             doc_type=CFG_PUB_TYPE)
        if len(matches['hits']['hits']) > 1:
            duplicates.append(matches['hits']['hits'][0]['_source']['recid'])
    print('There are {} duplicates. Going to remove.'.format(len(duplicates)))
    do_unload(duplicates)

    # reindex submissions for dashboard view
    admin_indexer = AdminIndexer()
    admin_indexer.reindex(recreate=True)
Example 5
def find_duplicates_and_remove(base_url):
    """Will go through the application to find any duplicates then remove them."""
    inspire_ids = importer_api.get_inspire_ids(base_url=base_url)
    if inspire_ids is not False:
        duplicates = []
        for inspire_id in inspire_ids:
            matches = get_records_matching_field('inspire_id',
                                                 inspire_id,
                                                 doc_type=CFG_PUB_TYPE)
            if len(matches['hits']['hits']) > 1:
                duplicates.append(
                    matches['hits']['hits'][0]['_source']['recid'])
        print('There are {} duplicates. Going to remove.'.format(
            len(duplicates)))
        do_unload(duplicates)

        # reindex submissions for dashboard view
        admin_indexer = AdminIndexer()
        admin_indexer.reindex(recreate=True)
Example 6
def find_duplicates_and_remove():
    """
    Will go through the application to find any duplicates then remove them.
    :return:
    """
    inspire_ids = get_all_ids_in_current_system(prepend_id_with="")

    duplicates = []
    for inspire_id in inspire_ids:
        matches = get_records_matching_field('inspire_id', inspire_id,
                                             doc_type=CFG_PUB_TYPE)
        if len(matches['hits']['hits']) > 1:
            duplicates.append(matches['hits']['hits'][0]['_source']['recid'])
    print('There are {} duplicates. Going to remove.'.format(len(duplicates)))
    do_unload(duplicates)

    # reindex submissions for dashboard view
    admin_indexer = AdminIndexer()
    admin_indexer.reindex(recreate=True)
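
Examples 4-6 share the same detection step: query the publication index once per INSPIRE id and treat more than one hit as a duplicate. A minimal sketch of that predicate, assuming only the Elasticsearch-style response shape seen above:

def is_duplicate(matches):
    # matches follows the shape {'hits': {'hits': [{'_source': {...}}, ...]}}
    # used in Examples 4-6; more than one hit for an inspire_id is a duplicate.
    return len(matches['hits']['hits']) > 1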
Example 7
def delete_submission(recid):
    """
    Submissions can only be removed if they are not finalised,
    meaning they should never be in the index.
    :param recid:
    :return:
    """
    if has_role(current_user, 'admin') or has_role(current_user, 'coordinator') \
        or check_is_sandbox_record(recid):
        unload_submission(recid)

        admin_idx = AdminIndexer()
        admin_idx.reindex(recreate=True)
        return json.dumps({"success": True,
                           "recid": recid,
                           "errors": [
                               "Record successfully removed!"]})
    else:
        return json.dumps(
            {"success": False, "recid": recid,
             "errors": [
                 "You do not have permission to delete this submission. "
                 "Only coordinators can do that."]})
Example 8
def admin_idx(app):
    with app.app_context():
        admin_idx = AdminIndexer()
        return admin_idx
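
Example 8 reads like a pytest fixture. A sketch of how such a fixture might be consumed in a test, assuming app is a standard application fixture and that get_summary behaves as in Examples 14 and 16:

# Hypothetical test; fixture and method names are taken from the examples.
def test_admin_index_summary(admin_idx):
    summary = admin_idx.get_summary()
    assert summary is not None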
Example 9
def reindex():
    """Reindexes HEPSubmissions and adds to the submission index"""

    admin_idx = AdminIndexer()
    admin_idx.reindex(recreate=True)
Example 10
def do_finalise(recid, publication_record=None, force_finalise=False,
                commit_message=None, send_tweet=False, update=False):
    """
        Creates record SIP for each data record with a link to the associated
        publication
        :param synchronous: if true then workflow execution and creation is
        waited on, then everything is indexed in one go.
        If False, object creation is asynchronous, however reindexing is not
        performed. This is only really useful for the full migration of
        content.
    """
    hep_submission = HEPSubmission.query.filter_by(
        publication_recid=recid, overall_status="todo").first()

    print('Finalising record {}'.format(recid))

    generated_record_ids = []
    if hep_submission \
        and (force_finalise or hep_submission.coordinator == int(current_user.get_id())):

        submissions = DataSubmission.query.filter_by(
            publication_recid=recid,
            version=hep_submission.version).all()

        version = hep_submission.version

        existing_submissions = {}
        if hep_submission.version > 1 or update:
            # we need to determine which are the existing record ids.
            existing_data_records = get_records_matching_field(
                'related_publication', recid, doc_type=CFG_DATA_TYPE)

            for record in existing_data_records["hits"]["hits"]:

                if "recid" in record["_source"]:
                    existing_submissions[record["_source"]["title"]] = \
                        record["_source"]["recid"]
                    delete_item_from_index(record["_id"],
                                           doc_type=CFG_DATA_TYPE, parent=record["_source"]["related_publication"])

        current_time = "{:%Y-%m-%d %H:%M:%S}".format(datetime.now())

        for submission in submissions:
            finalise_datasubmission(current_time, existing_submissions,
                                    generated_record_ids,
                                    publication_record, recid, submission,
                                    version)

        try:
            record = get_record_by_id(recid)
            # If we have a commit message, then we have a record update.
            # We will store the commit message and also update the
            # last_updated flag for the record.
            record['hepdata_doi'] = hep_submission.doi

            if commit_message:
                # On a revision, the last updated date will
                # be the current date.
                hep_submission.last_updated = datetime.now()

                commit_record = RecordVersionCommitMessage(
                    recid=recid,
                    version=version,
                    message=str(commit_message))

                db.session.add(commit_record)

            record['last_updated'] = datetime.strftime(
                hep_submission.last_updated, '%Y-%m-%d %H:%M:%S')
            record['version'] = version

            record.commit()

            hep_submission.inspire_id = record['inspire_id']
            hep_submission.overall_status = "finished"
            db.session.add(hep_submission)

            db.session.commit()

            create_celery_app(current_app)

            # only mint DOIs if not testing.
            if not current_app.config.get('TESTING', False) and not current_app.config.get('NO_DOI_MINTING', False):
                for submission in submissions:
                    generate_doi_for_data_submission.delay(submission.id, submission.version)

                generate_doi_for_submission.delay(recid, version)

            # Reindex everything.
            index_record_ids([recid] + generated_record_ids)
            push_data_keywords(pub_ids=[recid])

            admin_indexer = AdminIndexer()
            admin_indexer.index_submission(hep_submission)

            send_finalised_email(hep_submission)

            for file_format in ['csv', 'yoda', 'root']:
                convert_and_store.delay(hep_submission.inspire_id, file_format, force=True)

            if send_tweet:
                tweet(record.get('title'), record.get('collaborations'),
                      "http://www.hepdata.net/record/ins{0}".format(record.get('inspire_id')))

            return json.dumps({"success": True, "recid": recid,
                               "data_count": len(submissions),
                               "generated_records": generated_record_ids})

        except NoResultFound:
            print('No record found to update. Which is super strange.')

    else:
        return json.dumps(
            {"success": False, "recid": recid,
             "errors": ["You do not have permission to finalise this "
                        "submission. Only coordinators can do that."]})
Example 11
def process_submission_directory(basepath,
                                 submission_file_path,
                                 recid,
                                 update=False,
                                 *args,
                                 **kwargs):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.
    :param basepath:
    :param submission_file_path:
    :param recid:
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:
        submission_file = open(submission_file_path, 'r')

        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        data_file_validator = DataFileValidator()

        if is_valid_submission_file:
            try:
                submission_processed = yaml.load_all(submission_file,
                                                     Loader=yaml.CSafeLoader)
            except AttributeError:
                # fall back to the pure-Python loader if libyaml is unavailable
                submission_processed = yaml.safe_load_all(submission_file)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                # no submission exists yet for this record, so start at version 1
                hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    coordinator=kwargs.get('user_id') if 'user_id' in kwargs
                    else int(current_user.get_id()),
                    version=1)
                db.session.add(hepsubmission)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=hepsubmission.inspire_id,
                    coordinator=hepsubmission.coordinator,
                    version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission, update)

            for yaml_document in submission_processed:
                if 'record_ids' in yaml_document or 'comment' in yaml_document or 'modifications' in yaml_document:
                    # comments are only present in the general submission
                    # information document.
                    process_general_submission_info(basepath, yaml_document,
                                                    recid)
                else:
                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)

                    added_file_names.append(yaml_document["name"])

                    if existing_datasubmission_query.count() == 0:
                        datasubmission = DataSubmission(
                            publication_recid=recid,
                            name=encode_string(yaml_document["name"]),
                            description=encode_string(
                                yaml_document["description"]),
                            version=hepsubmission.version)

                    else:
                        datasubmission = existing_datasubmission_query.one()
                        datasubmission.description = encode_string(
                            yaml_document["description"])

                    db.session.add(datasubmission)

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    if data_file_validator.validate(file_path=main_file_path):
                        process_data_file(recid, hepsubmission.version,
                                          basepath, yaml_document,
                                          datasubmission, main_file_path)
                    else:
                        # merge rather than overwrite errors already collected
                        errors.update(process_validation_errors_for_display(
                            data_file_validator.get_messages()))

                        data_file_validator.clear_messages()

            cleanup_submission(recid, hepsubmission.version, added_file_names)

            db.session.commit()

            if len(errors) == 0:
                package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(recid, hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)
        else:
            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())

            submission_file_validator.clear_messages()
            data_file_validator.clear_messages()
    else:
        # return an error
        errors = {
            "submission.yaml": [{
                "level": "error",
                "message": "No submission.yaml file found in submission."
            }]
        }
        return errors

    # we return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found in one file.
    return errors
Example 12
def process_submission_directory(basepath, submission_file_path, recid,
                                 update=False, old_data_schema=False,
                                 old_submission_schema=False):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.

    :param basepath:
    :param submission_file_path:
    :param recid:
    :param update:
    :param old_data_schema: whether to use old (v0) data schema
    :param old_submission_schema: whether to use old (v0) submission schema
        (should only be used when importing old records)
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is None:
        # return an error
        errors = {"submission.yaml": [
            {"level": "error", "message": "No submission.yaml file found in submission."}
        ]}
        return errors

    try:
        with open(submission_file_path, 'r') as submission_file:
            submission_processed = list(yaml.load_all(submission_file, Loader=Loader))
    except Exception as ex:
        errors = {"submission.yaml": [
            {"level": "error", "message": "There was a problem parsing the file.\n" + str(ex)}
        ]}
        return errors

    submission_file_validator = get_submission_validator(old_submission_schema)
    is_valid_submission_file = submission_file_validator.validate(
        file_path=submission_file_path,
        data=submission_processed,
    )

    if is_valid_submission_file:

        # process file, extracting contents, and linking
        # the data record with the parent publication
        hepsubmission = get_latest_hepsubmission(publication_recid=recid)

        # On a new upload, we reset the flag to notify reviewers
        hepsubmission.reviewers_notified = False

        reserve_doi_for_hepsubmission(hepsubmission, update)

        no_general_submission_info = True

        data_file_validator = get_data_validator(old_data_schema)

        # Delete all data records associated with this submission.
        # Fixes problems with ordering where the table names are changed between uploads.
        # See https://github.com/HEPData/hepdata/issues/112
        # Side effect that reviews will be deleted between uploads.
        cleanup_submission(recid, hepsubmission.version, added_file_names)

        for yaml_document_index, yaml_document in enumerate(submission_processed):
            if not yaml_document:
                continue

            # Check for presence of local files given as additional_resources.
            if 'additional_resources' in yaml_document:
                for resource in yaml_document['additional_resources']:
                    location = os.path.join(basepath, resource['location'])
                    if not resource['location'].startswith(('http', '/resource/')):
                        if not os.path.isfile(location):
                            errors[resource['location']] = [{"level": "error", "message":
                                "Missing 'additional_resources' file from uploaded archive."}]
                        elif '/' in resource['location']:
                            errors[resource['location']] = [{"level": "error", "message":
                                "Location of 'additional_resources' file should not contain '/'."}]

            if not yaml_document_index and 'name' not in yaml_document:

                no_general_submission_info = False
                process_general_submission_info(basepath, yaml_document, recid)

            elif not all(k in yaml_document for k in ('name', 'description', 'keywords', 'data_file')):

                errors["submission.yaml"] = [{"level": "error", "message": "YAML document with index {} ".format(
                    yaml_document_index) + "missing one or more required keys (name, description, keywords, data_file)."}]

            else:

                existing_datasubmission_query = DataSubmission.query \
                    .filter_by(name=yaml_document["name"],
                               publication_recid=recid,
                               version=hepsubmission.version)

                added_file_names.append(yaml_document["name"])

                try:
                    if existing_datasubmission_query.count() == 0:
                        datasubmission = DataSubmission(
                            publication_recid=recid,
                            name=yaml_document["name"],
                            description=yaml_document["description"],
                            version=hepsubmission.version)
                        db.session.add(datasubmission)
                    else:
                        error = {"level": "error",
                                 "message": "Duplicate table with name '{}'.".format(yaml_document["name"])}
                        errors.setdefault('submission.yaml', []).append(error)
                        continue

                except SQLAlchemyError as sqlex:
                    errors[yaml_document["data_file"]] = [{"level": "error", "message": str(sqlex)}]
                    db.session.rollback()
                    continue

                main_file_path = os.path.join(basepath, yaml_document["data_file"])

                data, ex = _read_data_file(main_file_path)

                if not data or ex is not None:

                    errors[yaml_document["data_file"]] = \
                        [{"level": "error", "message": "There was a problem parsing the file.\n" + str(ex)}]

                elif '/' in yaml_document["data_file"]:

                    errors[yaml_document["data_file"]] = \
                        [{"level": "error", "message": "Name of data_file should not contain '/'.\n"}]

                else:
                    schema_type = yaml_document.get('data_schema')  # Optional
                    if data_file_validator.validate(file_path=main_file_path, file_type=schema_type, data=data):
                        try:
                            process_data_file(recid, hepsubmission.version, basepath, yaml_document,
                                              datasubmission, main_file_path)
                        except SQLAlchemyError as sqlex:
                            errors[yaml_document["data_file"]] = [{"level": "error", "message":
                                "There was a problem processing the file.\n" + str(sqlex)}]
                            db.session.rollback()
                    else:
                        errors.update(process_validation_errors_for_display(data_file_validator.get_messages()))
                        data_file_validator.clear_messages()

        if no_general_submission_info:
            hepsubmission.last_updated = datetime.utcnow()
            db.session.add(hepsubmission)
            db.session.commit()

        # The line below is commented out since it does not preserve the order of tables.
        # Delete all tables above instead: side effect of deleting reviews between uploads.
        #cleanup_submission(recid, hepsubmission.version, added_file_names)

        db.session.commit()

        if len(errors) == 0:
            errors = package_submission(basepath, recid, hepsubmission)

            # Check the size of the upload to ensure it can be converted
            data_filepath = find_submission_data_file_path(hepsubmission)
            with prepare_data_folder(data_filepath, 'yaml') as filepaths:
                input_directory, input_file = filepaths
                # Create options that look like a worst-case (biggest)
                # conversions (using yoda-like options as they include rivet
                # analysis
                dummy_inspire_id = hepsubmission.inspire_id or '0000000'
                options = {
                    'input_format': 'yaml',
                    'output_format': 'yoda',
                    'filename': f'HEPData-ins{dummy_inspire_id}-v{hepsubmission.version}-yoda',
                    'validator_schema_version': '0.1.0',
                    'hepdata_doi': f'10.17182/hepdata.{recid}.v{hepsubmission.version}',
                    'rivet_analysis_name': f'ATLAS_2020_I{dummy_inspire_id}'
                }
                data_size = get_data_size(input_directory, options)
                if data_size > current_app.config['CONVERT_MAX_SIZE']:
                    errors["Archive"] = [{
                        "level": "error",
                        "message": "Archive is too big for conversion to other formats. (%s bytes would be sent to converter; maximum size is %s.)"
                                   % (data_size, current_app.config['CONVERT_MAX_SIZE'])
                    }]

            if len(errors) == 0:
                reserve_dois_for_data_submissions(publication_recid=recid, version=hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)

    else:

        errors = process_validation_errors_for_display(submission_file_validator.get_messages())
        submission_file_validator.clear_messages()

    # we return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found in one file.
    return errors
Example 13
def process_submission_directory(basepath,
                                 submission_file_path,
                                 recid,
                                 update=False,
                                 *args,
                                 **kwargs):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.

    :param basepath:
    :param submission_file_path:
    :param recid:
    :param update:
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:

        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        if is_valid_submission_file:

            submission_file = open(submission_file_path, 'r')
            submission_processed = yaml.load_all(submission_file,
                                                 Loader=Loader)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                # no submission exists yet for this record, so start at version 1
                hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    coordinator=kwargs.get('user_id') if 'user_id' in kwargs
                    else int(current_user.get_id()),
                    version=1)
                db.session.add(hepsubmission)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=hepsubmission.inspire_id,
                    coordinator=hepsubmission.coordinator,
                    version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission, update)

            no_general_submission_info = True

            data_file_validator = DataFileValidator()

            # Delete all data records associated with this submission.
            # Fixes problems with ordering where the table names are changed between uploads.
            # See https://github.com/HEPData/hepdata/issues/112
            # Side effect that reviews will be deleted between uploads.
            cleanup_submission(recid, hepsubmission.version, added_file_names)

            for yaml_document_index, yaml_document in enumerate(
                    submission_processed):
                if not yaml_document:
                    continue

                # Check for presence of local files given as additional_resources.
                if 'additional_resources' in yaml_document:
                    for resource in yaml_document['additional_resources']:
                        location = os.path.join(basepath, resource['location'])
                        if not resource['location'].startswith(
                            ('http', '/resource/')):
                            if not os.path.isfile(location):
                                errors[resource['location']] = [{
                                    "level": "error",
                                    "message": "Missing 'additional_resources' file from uploaded archive."
                                }]
                            elif '/' in resource['location']:
                                errors[resource['location']] = [{
                                    "level": "error",
                                    "message": "Location of 'additional_resources' file should not contain '/'."
                                }]

                if not yaml_document_index and 'name' not in yaml_document:

                    no_general_submission_info = False
                    process_general_submission_info(basepath, yaml_document,
                                                    recid)

                elif not all(k in yaml_document
                             for k in ('name', 'description', 'keywords',
                                       'data_file')):

                    errors["submission.yaml"] = [{
                        "level": "error",
                        "message": "YAML document with index {} ".format(yaml_document_index) +
                                   "missing one or more required keys (name, description, keywords, data_file)."
                    }]

                else:

                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)

                    added_file_names.append(yaml_document["name"])

                    try:
                        if existing_datasubmission_query.count() == 0:
                            datasubmission = DataSubmission(
                                publication_recid=recid,
                                name=encode_string(yaml_document["name"]),
                                description=encode_string(
                                    yaml_document["description"]),
                                version=hepsubmission.version)
                        else:
                            datasubmission = existing_datasubmission_query.one()
                            datasubmission.description = encode_string(
                                yaml_document["description"])
                        db.session.add(datasubmission)
                    except SQLAlchemyError as sqlex:
                        errors[yaml_document["data_file"]] = [{
                            "level": "error",
                            "message": str(sqlex)
                        }]
                        db.session.rollback()
                        continue

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    data, ex = _eos_fix_read_data(main_file_path)

                    if not data or ex is not None:

                        errors[yaml_document["data_file"]] = \
                            [{"level": "error", "message": "There was a problem parsing the file.\n" + str(ex)}]

                    elif '/' in yaml_document["data_file"]:

                        errors[yaml_document["data_file"]] = \
                            [{"level": "error", "message": "Name of data_file should not contain '/'.\n"}]

                    else:

                        if data_file_validator.validate(
                                file_path=main_file_path, data=data):
                            try:
                                process_data_file(recid, hepsubmission.version,
                                                  basepath, yaml_document,
                                                  datasubmission,
                                                  main_file_path)
                            except SQLAlchemyError as sqlex:
                                errors[yaml_document["data_file"]] = [{
                                    "level": "error",
                                    "message": "There was a problem processing the file.\n" + str(sqlex)
                                }]
                                db.session.rollback()
                        else:
                            # merge rather than overwrite errors already collected
                            errors.update(process_validation_errors_for_display(
                                data_file_validator.get_messages()))
                            data_file_validator.clear_messages()

                        if yaml_document["data_file"] not in errors:
                            # Check that the length of the 'values' list is consistent
                            # for each of the independent_variables and dependent_variables.
                            indep_count = [
                                len(indep['values'])
                                for indep in data['independent_variables']
                            ]
                            dep_count = [
                                len(dep['values'])
                                for dep in data['dependent_variables']
                            ]
                            if len(set(indep_count + dep_count)) > 1:
                                # more than one unique count means the lengths disagree
                                errors.setdefault(
                                    yaml_document["data_file"], []
                                ).append({
                                    "level": "error",
                                    "message": "Inconsistent length of 'values' list:\n" +
                                               "independent_variables{}, dependent_variables{}"
                                               .format(str(indep_count), str(dep_count))
                                })

            submission_file.close()

            if no_general_submission_info:
                hepsubmission.last_updated = datetime.now()
                db.session.add(hepsubmission)
                db.session.commit()

            # The line below is commented out since it does not preserve the order of tables.
            # Delete all tables above instead: side effect of deleting reviews between uploads.
            #cleanup_submission(recid, hepsubmission.version, added_file_names)

            db.session.commit()

            if len(errors) == 0:
                errors = package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(
                    publication_recid=recid, version=hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)

            else:  # delete all tables if errors
                cleanup_submission(recid, hepsubmission.version, {})

        else:

            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())
            submission_file_validator.clear_messages()

    else:
        # return an error
        errors = {
            "submission.yaml": [{
                "level": "error",
                "message": "No submission.yaml file found in submission."
            }]
        }
        return errors

    # we return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found in one file.
    return errors
Example 14
def submissions_list():
    admin_idx = AdminIndexer()
    summary = admin_idx.get_summary()
    return jsonify(summary)
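
Taken together, these examples exercise only a small AdminIndexer surface. A stub capturing just the calls seen here (method names come from the examples; bodies are illustrative, not the real implementation):

class AdminIndexerStub:
    """Illustrative stand-in for AdminIndexer, limited to calls used above."""

    def reindex(self, recreate=False):
        pass  # Examples 3-7, 9, 18: rebuild the submission index

    def index_submission(self, hepsubmission):
        pass  # Examples 2, 10-13, 15: (re)index a single submission

    def find_and_delete(self, term, fields=None):
        # Example 17 calls find_and_delete(term=..., fields=['recid']);
        # Example 1 passes positional arguments, suggesting an older signature.
        pass

    def get_summary(self):
        return {}  # Examples 14 and 16: summary data for the dashboard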
Example 15
def do_finalise(recid,
                publication_record=None,
                force_finalise=False,
                commit_message=None,
                send_tweet=False,
                update=False,
                convert=True):
    """
        Creates record SIP for each data record with a link to the associated
        publication
        :param synchronous: if true then workflow execution and creation is
        waited on, then everything is indexed in one go.
        If False, object creation is asynchronous, however reindexing is not
        performed. This is only really useful for the full migration of
        content.
    """
    print('Finalising record {}'.format(recid))

    hep_submission = HEPSubmission.query.filter_by(
        publication_recid=recid, overall_status="todo").first()

    generated_record_ids = []
    if hep_submission \
        and (force_finalise or hep_submission.coordinator == int(current_user.get_id())):

        submissions = DataSubmission.query.filter_by(
            publication_recid=recid, version=hep_submission.version).all()

        version = hep_submission.version

        existing_submissions = {}
        if hep_submission.version > 1 or update:
            # we need to determine which are the existing record ids.
            existing_data_records = get_records_matching_field(
                'related_publication', recid, doc_type=CFG_DATA_TYPE)

            for record in existing_data_records["hits"]["hits"]:

                if "recid" in record["_source"]:
                    existing_submissions[record["_source"]["title"]] = \
                        record["_source"]["recid"]
                    delete_item_from_index(
                        record["_id"],
                        doc_type=CFG_DATA_TYPE,
                        parent=record["_source"]["related_publication"])

        current_time = "{:%Y-%m-%d %H:%M:%S}".format(datetime.now())

        for submission in submissions:
            finalise_datasubmission(current_time, existing_submissions,
                                    generated_record_ids, publication_record,
                                    recid, submission, version)

        try:
            record = get_record_by_id(recid)
            # If we have a commit message, then we have a record update.
            # We will store the commit message and also update the
            # last_updated flag for the record.
            record['hepdata_doi'] = hep_submission.doi

            if commit_message:
                # On a revision, the last updated date will
                # be the current date.
                hep_submission.last_updated = datetime.now()

                commit_record = RecordVersionCommitMessage(
                    recid=recid, version=version, message=str(commit_message))

                db.session.add(commit_record)

            record['last_updated'] = datetime.strftime(
                hep_submission.last_updated, '%Y-%m-%d %H:%M:%S')
            record['version'] = version

            record.commit()

            hep_submission.inspire_id = record['inspire_id']
            hep_submission.overall_status = "finished"
            db.session.add(hep_submission)

            db.session.commit()

            create_celery_app(current_app)

            # only mint DOIs if not testing.
            if not current_app.config.get(
                    'TESTING', False) and not current_app.config.get(
                        'NO_DOI_MINTING', False):
                for submission in submissions:
                    generate_doi_for_data_submission.delay(
                        submission.id, submission.version)
                log.info("Generating DOIs for ins{0}".format(
                    hep_submission.inspire_id))
                generate_doi_for_submission.delay(recid, version)

            # Reindex everything.
            index_record_ids([recid] + generated_record_ids)
            push_data_keywords(pub_ids=[recid])

            try:
                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hep_submission)
            except ConnectionTimeout as ct:
                log.error('Unable to add ins{0} to admin index.\n{1}'.format(
                    hep_submission.inspire_id, ct))

            send_finalised_email(hep_submission)

            if convert:
                for file_format in ['yaml', 'csv', 'yoda', 'root']:
                    convert_and_store.delay(hep_submission.inspire_id,
                                            file_format,
                                            force=True)

            if send_tweet:
                tweet(
                    record.get('title'), record.get('collaborations'),
                    "http://www.hepdata.net/record/ins{0}".format(
                        record.get('inspire_id')), version)

            return json.dumps({
                "success": True,
                "recid": recid,
                "data_count": len(submissions),
                "generated_records": generated_record_ids
            })

        except NoResultFound:
            print('No record found to update. Which is super strange.')

    else:
        return json.dumps({
            "success": False,
            "recid": recid,
            "errors": [
                "You do not have permission to finalise this "
                "submission. Only coordinators can do that."
            ]
        })
Example 16
def submissions_list():
    admin_idx = AdminIndexer()
    summary = admin_idx.get_summary()
    return jsonify(summary)
Example 17
def remove_submission(record_id):
    """
    Removes the database entries related to a record.
    :param record_id:
    :return: True if Successful, False if the record does not exist.
    """

    hepdata_submissions = HEPSubmission.query.filter_by(
        publication_recid=record_id).all()

    try:
        try:
            for hepdata_submission in hepdata_submissions:
                db.session.delete(hepdata_submission)
        except NoResultFound as nrf:
            print(nrf.args)

        admin_idx = AdminIndexer()
        admin_idx.find_and_delete(term=record_id, fields=['recid'])

        submissions = DataSubmission.query.filter_by(
            publication_recid=record_id).all()

        reviews = DataReview.query.filter_by(publication_recid=record_id).all()

        for review in reviews:
            db.session.delete(review)

        for submission in submissions:

            resource = DataResource.query.filter_by(
                id=submission.data_file).first()

            db.session.delete(submission)

            if resource:
                db.session.delete(resource)

        try:
            SubmissionParticipant.query.filter_by(
                publication_recid=record_id).delete()
        except Exception:
            print("Unable to find a submission participant for {0}".format(
                record_id))

        try:
            record = get_record_by_id(record_id)
            data_records = get_records_matching_field('related_publication',
                                                      record_id,
                                                      doc_type=CFG_DATA_TYPE)

            if 'hits' in data_records:
                for data_record in data_records['hits']['hits']:
                    data_record_obj = get_record_by_id(
                        data_record['_source']['recid'])
                    if data_record_obj:
                        data_record_obj.delete()
            if record:
                record.delete()

        except PIDDoesNotExistError:
            print(
                'No record entry exists for {0}. Proceeding to delete other files.'
                .format(record_id))

        db.session.commit()
        db.session.flush()
        return True

    except Exception:
        db.session.rollback()
        raise
Example 18
def reindex():
    """Reindexes HEPSubmissions and adds to the submission index."""
    admin_idx = AdminIndexer()
    admin_idx.reindex(recreate=True)
Example 19
def process_submission_directory(basepath, submission_file_path, recid, update=False, *args, **kwargs):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.
    :param basepath:
    :param submission_file_path:
    :param recid:
    :return:
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:
        submission_file = open(submission_file_path, 'r')

        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        data_file_validator = DataFileValidator()

        if is_valid_submission_file:
            try:
                submission_processed = yaml.load_all(submission_file, Loader=yaml.CSafeLoader)
            except AttributeError:
                # fall back to the pure-Python loader if libyaml is unavailable
                submission_processed = yaml.safe_load_all(submission_file)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                # no submission exists yet for this record, so start at version 1
                hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    coordinator=kwargs.get('user_id') if 'user_id' in kwargs else int(current_user.get_id()),
                    version=1)
                db.session.add(hepsubmission)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(publication_recid=recid,
                                                   overall_status='todo',
                                                   inspire_id=hepsubmission.inspire_id,
                                                   coordinator=hepsubmission.coordinator,
                                                   version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission)

            for yaml_document in submission_processed:
                if 'record_ids' in yaml_document or 'comment' in yaml_document or 'modifications' in yaml_document:
                    # comments are only present in the general submission
                    # information document.
                    process_general_submission_info(basepath, yaml_document, recid)
                else:
                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)

                    added_file_names.append(yaml_document["name"])

                    if existing_datasubmission_query.count() == 0:
                        datasubmission = DataSubmission(
                            publication_recid=recid,
                            name=encode_string(yaml_document["name"]),
                            description=encode_string(
                                yaml_document["description"]),
                            version=hepsubmission.version)

                    else:
                        datasubmission = existing_datasubmission_query.one()
                        datasubmission.description = encode_string(
                            yaml_document["description"])

                    db.session.add(datasubmission)

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    if data_file_validator.validate(file_path=main_file_path):
                        process_data_file(recid, hepsubmission.version, basepath, yaml_document,
                                          datasubmission, main_file_path)
                    else:
                        # merge rather than overwrite errors already collected
                        errors.update(process_validation_errors_for_display(
                            data_file_validator.get_messages()))

                        data_file_validator.clear_messages()

            cleanup_submission(recid, hepsubmission.version,
                               added_file_names)

            db.session.commit()

            if len(errors) == 0:
                package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(recid, hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)
        else:
            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())

            submission_file_validator.clear_messages()
            data_file_validator.clear_messages()
    else:
        # return an error
        errors = {"submission.yaml": [
            {"level": "error",
             "message": "No submission.yaml file found in submission."}
        ]}
        return errors

    # we return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found in one file.
    return errors