Example #1
def process_submission_directory(basepath,
                                 submission_file_path,
                                 recid,
                                 update=False,
                                 *args,
                                 **kwargs):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.

    :param basepath: path to the unpacked submission directory
    :param submission_file_path: path to the submission.yaml file
    :param recid: publication record id of the parent publication
    :param update: whether this upload updates an existing submission
    :return: dict mapping file names to lists of error messages (empty on success)
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:

        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        if is_valid_submission_file:

            submission_file = open(submission_file_path, 'r')
            submission_processed = yaml.load_all(submission_file,
                                                 Loader=Loader)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                # No previous submission exists for this record, so create
                # and register a new one at version 1.
                hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    coordinator=kwargs.get('user_id') if 'user_id'
                    in kwargs else int(current_user.get_id()),
                    version=1)
                db.session.add(hepsubmission)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=hepsubmission.inspire_id,
                    coordinator=hepsubmission.coordinator,
                    version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission, update)

            no_general_submission_info = True

            data_file_validator = DataFileValidator()

            # Delete all data records associated with this submission.
            # Fixes problems with ordering where the table names are changed between uploads.
            # See https://github.com/HEPData/hepdata/issues/112
            # Side effect that reviews will be deleted between uploads.
            cleanup_submission(recid, hepsubmission.version, added_file_names)

            for yaml_document_index, yaml_document in enumerate(
                    submission_processed):
                if not yaml_document:
                    continue

                # Check for presence of local files given as additional_resources.
                if 'additional_resources' in yaml_document:
                    for resource in yaml_document['additional_resources']:
                        location = os.path.join(basepath, resource['location'])
                        if not resource['location'].startswith(
                            ('http', '/resource/')):
                            if not os.path.isfile(location):
                                errors[resource['location']] = [{
                                    "level":
                                    "error",
                                    "message":
                                    "Missing 'additional_resources' file from uploaded archive."
                                }]
                            elif '/' in resource['location']:
                                errors[resource['location']] = [{
                                    "level":
                                    "error",
                                    "message":
                                    "Location of 'additional_resources' file should not contain '/'."
                                }]

                if not yaml_document_index and 'name' not in yaml_document:

                    no_general_submission_info = False
                    process_general_submission_info(basepath, yaml_document,
                                                    recid)

                elif not all(k in yaml_document
                             for k in ('name', 'description', 'keywords',
                                       'data_file')):

                    errors["submission.yaml"] = [{
                        "level":
                        "error",
                        "message":
                        "YAML document with index {} ".format(
                            yaml_document_index) +
                        "missing one or more required keys (name, description, keywords, data_file)."
                    }]

                else:

                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)

                    added_file_names.append(yaml_document["name"])

                    try:
                        if existing_datasubmission_query.count() == 0:
                            datasubmission = DataSubmission(
                                publication_recid=recid,
                                name=encode_string(yaml_document["name"]),
                                description=encode_string(
                                    yaml_document["description"]),
                                version=hepsubmission.version)
                        else:
                            datasubmission = existing_datasubmission_query.one(
                            )
                            datasubmission.description = encode_string(
                                yaml_document["description"])
                        db.session.add(datasubmission)
                    except SQLAlchemyError as sqlex:
                        errors[yaml_document["data_file"]] = [{
                            "level":
                            "error",
                            "message":
                            str(sqlex)
                        }]
                        db.session.rollback()
                        continue

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    data, ex = _eos_fix_read_data(main_file_path)

                    if not data or ex is not None:

                        errors[yaml_document["data_file"]] = \
                            [{"level": "error", "message": "There was a problem parsing the file.\n" + str(ex)}]

                    elif '/' in yaml_document["data_file"]:

                        errors[yaml_document["data_file"]] = \
                            [{"level": "error", "message": "Name of data_file should not contain '/'.\n"}]

                    else:

                        if data_file_validator.validate(
                                file_path=main_file_path, data=data):
                            try:
                                process_data_file(recid, hepsubmission.version,
                                                  basepath, yaml_document,
                                                  datasubmission,
                                                  main_file_path)
                            except SQLAlchemyError as sqlex:
                                errors[yaml_document["data_file"]] = [{
                                    "level":
                                    "error",
                                    "message":
                                    "There was a problem processing the file.\n"
                                    + str(sqlex)
                                }]
                                db.session.rollback()
                        else:
                            errors = process_validation_errors_for_display(
                                data_file_validator.get_messages())
                            data_file_validator.clear_messages()

                        if yaml_document["data_file"] not in errors:
                            # Check that the length of the 'values' list is consistent
                            # for each of the independent_variables and dependent_variables.
                            indep_count = [
                                len(indep['values'])
                                for indep in data['independent_variables']
                            ]
                            dep_count = [
                                len(dep['values'])
                                for dep in data['dependent_variables']
                            ]
                            if len(set(indep_count + dep_count)
                                   ) > 1:  # if more than one unique count
                                errors.setdefault(
                                    yaml_document["data_file"], []
                                ).append({
                                    "level":
                                    "error",
                                    "message":
                                    "Inconsistent length of 'values' list:\n" +
                                    "independent_variables{}, dependent_variables{}"
                                    .format(str(indep_count), str(dep_count))
                                })

            submission_file.close()

            if no_general_submission_info:
                hepsubmission.last_updated = datetime.now()
                db.session.add(hepsubmission)
                db.session.commit()

            # The line below is commented out since it does not preserve the order of tables.
            # Delete all tables above instead: side effect of deleting reviews between uploads.
            #cleanup_submission(recid, hepsubmission.version, added_file_names)

            db.session.commit()

            if not errors:
                errors = package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(
                    publication_recid=recid, version=hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)

            else:  # delete all tables if errors
                cleanup_submission(recid, hepsubmission.version, {})

        else:

            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())
            submission_file_validator.clear_messages()

    else:
        # return an error
        errors = {
            "submission.yaml": [{
                "level":
                "error",
                "message":
                "No submission.yaml file found in submission."
            }]
        }
        return errors

    # we return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found in one file.
    return errors
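
A minimal invocation sketch (not part of the HEPData source): the directory path and record id below are placeholders, and a Flask application context is assumed because the function touches current_user and db.session. The error-handling loop relies only on the dictionary shape visible in the code above.

# Hypothetical usage sketch; 'basepath' and the recid 12345 are placeholders.
import os

basepath = '/tmp/submission_upload'          # unpacked submission directory
submission_yaml = os.path.join(basepath, 'submission.yaml')

errors = process_submission_directory(basepath, submission_yaml, recid=12345)

# The function returns a dict mapping file names to lists of
# {"level": ..., "message": ...} entries; an empty dict means success.
for file_name, messages in errors.items():
    for message in messages:
        print("{}: [{}] {}".format(file_name, message['level'], message['message']))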
Example #2
def process_submission_directory(basepath,
                                 submission_file_path,
                                 recid,
                                 update=False,
                                 *args,
                                 **kwargs):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.
    :param basepath: path to the unpacked submission directory
    :param submission_file_path: path to the submission.yaml file
    :param recid: publication record id of the parent publication
    :param update: whether this upload updates an existing submission
    :return: dict mapping file names to lists of error messages (empty on success)
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:
        submission_file = open(submission_file_path, 'r')

        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        data_file_validator = DataFileValidator()

        if is_valid_submission_file:
            try:
                submission_processed = yaml.load_all(submission_file,
                                                     Loader=yaml.CSafeLoader)
            except AttributeError:
                # yaml.CSafeLoader is missing when PyYAML is built without
                # the LibYAML C extension, so fall back to the pure-Python
                # safe loader.
                submission_processed = yaml.safe_load_all(submission_file)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                # No previous submission exists for this record, so create
                # and register a new one at version 1.
                hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    coordinator=kwargs.get('user_id') if 'user_id'
                    in kwargs else int(current_user.get_id()),
                    version=1)
                db.session.add(hepsubmission)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(
                    publication_recid=recid,
                    overall_status='todo',
                    inspire_id=hepsubmission.inspire_id,
                    coordinator=hepsubmission.coordinator,
                    version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission, update)

            for yaml_document in submission_processed:
                if 'record_ids' in yaml_document or 'comment' in yaml_document or 'modifications' in yaml_document:
                    # comments are only present in the general submission
                    # information document.
                    process_general_submission_info(basepath, yaml_document,
                                                    recid)
                else:
                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)

                    added_file_names.append(yaml_document["name"])

                    if existing_datasubmission_query.count() == 0:
                        datasubmission = DataSubmission(
                            publication_recid=recid,
                            name=encode_string(yaml_document["name"]),
                            description=encode_string(
                                yaml_document["description"]),
                            version=hepsubmission.version)

                    else:
                        datasubmission = existing_datasubmission_query.one()
                        datasubmission.description = encode_string(
                            yaml_document["description"])

                    db.session.add(datasubmission)

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    if data_file_validator.validate(file_path=main_file_path):
                        process_data_file(recid, hepsubmission.version,
                                          basepath, yaml_document,
                                          datasubmission, main_file_path)
                    else:
                        errors = process_validation_errors_for_display(
                            data_file_validator.get_messages())

                        data_file_validator.clear_messages()

            submission_file.close()

            cleanup_submission(recid, hepsubmission.version, added_file_names)

            db.session.commit()

            if not errors:
                package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(recid, hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)
        else:
            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())

            submission_file_validator.clear_messages()
            data_file_validator.clear_messages()
    else:
        # return an error
        errors = {
            "submission.yaml": [{
                "level":
                "error",
                "message":
                "No submission.yaml file found in submission."
            }]
        }
        return errors

    # we return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found in one file.
    return errors
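
Examples 2 and 3 guard the faster LibYAML-backed loader with a try/except and fall back to PyYAML's pure-Python safe loader. A self-contained sketch of that loading pattern, using an illustrative two-document YAML string rather than a real submission.yaml:

import yaml

def load_yaml_documents(stream):
    """Parse a multi-document YAML stream, preferring the C-accelerated loader."""
    try:
        # yaml.CSafeLoader only exists when PyYAML is built against LibYAML.
        return yaml.load_all(stream, Loader=yaml.CSafeLoader)
    except AttributeError:
        # Fall back to the pure-Python safe loader.
        return yaml.safe_load_all(stream)

# Placeholder content standing in for a multi-document submission file.
for document in load_yaml_documents("name: Table 1\n---\nname: Table 2\n"):
    print(document['name'])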
Example #3
def process_submission_directory(basepath, submission_file_path, recid, update=False, *args, **kwargs):
    """
    Goes through an entire submission directory and processes the
    files within to create DataSubmissions
    with the files and related material attached as DataResources.
    :param basepath: path to the unpacked submission directory
    :param submission_file_path: path to the submission.yaml file
    :param recid: publication record id of the parent publication
    :param update: whether this upload updates an existing submission
    :return: dict mapping file names to lists of error messages (empty on success)
    """
    added_file_names = []
    errors = {}

    if submission_file_path is not None:
        submission_file = open(submission_file_path, 'r')

        submission_file_validator = SubmissionFileValidator()
        is_valid_submission_file = submission_file_validator.validate(
            file_path=submission_file_path)

        data_file_validator = DataFileValidator()

        if is_valid_submission_file:
            try:
                submission_processed = yaml.load_all(submission_file, Loader=yaml.CSafeLoader)
            except AttributeError:
                # yaml.CSafeLoader is missing when PyYAML is built without
                # the LibYAML C extension, so fall back to the pure-Python
                # safe loader.
                submission_processed = yaml.safe_load_all(submission_file)

            # process file, extracting contents, and linking
            # the data record with the parent publication
            hepsubmission = get_latest_hepsubmission(publication_recid=recid)
            if hepsubmission is None:
                # No previous submission exists for this record, so create
                # and register a new one at version 1.
                hepsubmission = HEPSubmission(publication_recid=recid,
                                              overall_status='todo',
                                              coordinator=kwargs.get('user_id') if 'user_id' in kwargs else int(current_user.get_id()),
                                              version=1)
                db.session.add(hepsubmission)

            # On a new upload, we reset the flag to notify reviewers
            hepsubmission.reviewers_notified = False

            # if it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions.
            if hepsubmission.overall_status == 'finished' and not update:
                # we create a new HEPSubmission object
                _rev_hepsubmission = HEPSubmission(publication_recid=recid,
                                                   overall_status='todo',
                                                   inspire_id=hepsubmission.inspire_id,
                                                   coordinator=hepsubmission.coordinator,
                                                   version=hepsubmission.version + 1)
                db.session.add(_rev_hepsubmission)
                hepsubmission = _rev_hepsubmission

            reserve_doi_for_hepsubmission(hepsubmission)

            for yaml_document in submission_processed:
                if 'record_ids' in yaml_document or 'comment' in yaml_document or 'modifications' in yaml_document:
                    # comments are only present in the general submission
                    # information document.
                    process_general_submission_info(basepath, yaml_document, recid)
                else:
                    existing_datasubmission_query = DataSubmission.query \
                        .filter_by(name=encode_string(yaml_document["name"]),
                                   publication_recid=recid,
                                   version=hepsubmission.version)

                    added_file_names.append(yaml_document["name"])

                    if existing_datasubmission_query.count() == 0:
                        datasubmission = DataSubmission(
                            publication_recid=recid,
                            name=encode_string(yaml_document["name"]),
                            description=encode_string(
                                yaml_document["description"]),
                            version=hepsubmission.version)

                    else:
                        datasubmission = existing_datasubmission_query.one()
                        datasubmission.description = encode_string(
                            yaml_document["description"])

                    db.session.add(datasubmission)

                    main_file_path = os.path.join(basepath,
                                                  yaml_document["data_file"])

                    if data_file_validator.validate(file_path=main_file_path):
                        process_data_file(recid, hepsubmission.version, basepath, yaml_document,
                                          datasubmission, main_file_path)
                    else:
                        errors = process_validation_errors_for_display(
                            data_file_validator.get_messages())

                        data_file_validator.clear_messages()

            submission_file.close()

            cleanup_submission(recid, hepsubmission.version,
                               added_file_names)

            db.session.commit()

            if not errors:
                package_submission(basepath, recid, hepsubmission)
                reserve_dois_for_data_submissions(recid, hepsubmission.version)

                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hepsubmission)
        else:
            errors = process_validation_errors_for_display(
                submission_file_validator.get_messages())

            submission_file_validator.clear_messages()
            data_file_validator.clear_messages()
    else:
        # return an error
        errors = {"submission.yaml": [
            {"level": "error",
             "message": "No submission.yaml file found in submission."}
        ]}
        return errors

    # we return all the errors collectively.
    # This makes more sense than returning errors as
    # soon as problems are found in one file.
    return errors
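
All three examples use the same query-then-create-or-update pattern for DataSubmission rows. Below is a generic SQLAlchemy sketch of that pattern; the Item model is hypothetical and only stands in for DataSubmission, which belongs to HEPData's own model layer.

# Hypothetical, self-contained illustration of the create-or-update pattern
# used above; 'Item' is a stand-in model, not part of HEPData.
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Item(Base):
    __tablename__ = 'items'
    id = Column(Integer, primary_key=True)
    name = Column(String, unique=True)
    description = Column(String)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    query = session.query(Item).filter_by(name='Table 1')
    if query.count() == 0:
        # No existing row: create a new one.
        item = Item(name='Table 1', description='first upload')
    else:
        # Row already exists: update its description in place.
        item = query.one()
        item.description = 'updated description'
    session.add(item)
    session.commit()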