Example #1
    def update_file(inspire_id, recid, only_record_information=False, send_tweet=False):
        self = Migrator()

        output_location = self.prepare_files_for_submission(inspire_id, force_retrieval=True)
        if output_location:
            updated_record_information = self.retrieve_publication_information(inspire_id)
            record_information = update_record(recid, updated_record_information)

            if not only_record_information:
                try:
                    recid = self.load_submission(
                        record_information, output_location, os.path.join(output_location, "submission.yaml"),
                        update=True)

                    if recid is not None:
                        do_finalise(recid, publication_record=record_information,
                                    force_finalise=True, send_tweet=send_tweet, update=True)

                except FailedSubmission as fe:
                    log.error(fe.message)
                    fe.print_errors()
                    remove_submission(fe.record_id)
            else:
                index_record_ids([record_information['recid']])

        else:
            log.error('Failed to load {0}'.format(inspire_id))
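A usage sketch for this variant; the INSPIRE ID and record ID below are illustrative placeholders, not real identifiers:

# Refresh only the publication metadata and reindex the record.
update_file('1234567', 98765, only_record_information=True)

# Reload the full submission from the retrieved files and re-finalise it.
update_file('1234567', 98765, send_tweet=False)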
Example #2
def update_analyses():
    """Update analysis resources for finished submissions from the endpoints
    configured in ANALYSES_ENDPOINTS, attaching any that are not yet present."""
    endpoints = current_app.config["ANALYSES_ENDPOINTS"]
    for analysis_endpoint in endpoints:

        if "endpoint_url" in endpoints[analysis_endpoint]:

            log.info("Updating analyses from {0}...".format(analysis_endpoint))

            response = requests.get(
                endpoints[analysis_endpoint]["endpoint_url"])

            if response:

                analyses = response.json()

                for record in analyses:
                    submission = get_latest_hepsubmission(
                        inspire_id=record, overall_status='finished')

                    if submission:
                        num_new_resources = 0

                        for analysis in analyses[record]:
                            _resource_url = endpoints[analysis_endpoint][
                                "url_template"].format(analysis)
                            if not is_resource_added_to_submission(
                                    submission.publication_recid,
                                    submission.version, _resource_url):
                                print(
                                    'Adding {} analysis to ins{} with URL {}'.
                                    format(analysis_endpoint, record,
                                           _resource_url))
                                new_resource = DataResource(
                                    file_location=_resource_url,
                                    file_type=analysis_endpoint)

                                submission.resources.append(new_resource)
                                num_new_resources += 1

                        if num_new_resources:

                            try:
                                db.session.add(submission)
                                db.session.commit()
                                index_record_ids(
                                    [submission.publication_recid])
                            except Exception as e:
                                db.session.rollback()
                                log.error(e)

                    else:
                        log.debug(
                            "An analysis is available in {0} but with no equivalent in HEPData (ins{1})."
                            .format(analysis_endpoint, record))
        else:
            log.debug(
                "No endpoint url configured for {0}".format(analysis_endpoint))
Example #3
File: api.py Project: ruphy/hepdata
    def update_file(inspire_id,
                    recid,
                    force=False,
                    only_record_information=False,
                    send_tweet=False,
                    convert=False):
        self = Migrator()

        output_location, oldsite_last_updated = self.prepare_files_for_submission(
            inspire_id, force_retrieval=True)
        if output_location:
            updated_record_information = self.retrieve_publication_information(
                inspire_id)
            record_information = update_record(recid,
                                               updated_record_information)

            hep_submission = HEPSubmission.query.filter_by(
                publication_recid=recid).first()
            version_count = HEPSubmission.query.filter_by(
                publication_recid=recid).count()
            print('Old site last updated {}'.format(str(oldsite_last_updated)))
            print('New site last updated {}'.format(
                str(hep_submission.last_updated)))
            print('Coordinator ID is {}, version count is {}'.format(
                hep_submission.coordinator, version_count))
            allow_update = hep_submission.last_updated < oldsite_last_updated and \
                           hep_submission.coordinator == 1 and version_count == 1

            if not only_record_information and (allow_update or force):
                try:
                    recid = self.load_submission(record_information,
                                                 output_location,
                                                 os.path.join(
                                                     output_location,
                                                     "submission.yaml"),
                                                 update=True)
                    print('Loaded record {}'.format(recid))

                    if recid is not None:
                        do_finalise(recid,
                                    publication_record=record_information,
                                    force_finalise=True,
                                    send_tweet=send_tweet,
                                    update=True,
                                    convert=convert)

                except FailedSubmission as fe:
                    log.error(fe.message)
                    fe.print_errors()
                    remove_submission(fe.record_id)
            elif not only_record_information:
                print('Not updating record {}'.format(recid))
            else:
                index_record_ids([record_information["recid"]])

        else:
            log.error("Failed to load {0}".format(inspire_id))
Example #4
def create_missing_datasubmission_records():
    """Finalise finished data submissions that are missing an associated record id
    and inspire id, registering their DOIs and reindexing the affected publications."""
    # Get submissions with missing IDs
    missing_submissions = DataSubmission.query \
        .join(HEPSubmission, HEPSubmission.publication_recid == DataSubmission.publication_recid) \
        .filter(
            DataSubmission.associated_recid == None,
            DataSubmission.publication_inspire_id == None,
            DataSubmission.version == HEPSubmission.version,
            HEPSubmission.overall_status == 'finished')
    missing_submissions = missing_submissions.all()

    if not missing_submissions:
        print("No datasubmissions found with missing record or inspire ids.")
        return

    # Organise missing submissions by publication
    submissions_by_publication = {}
    for submission in missing_submissions:
        if submission.publication_recid in submissions_by_publication:
            submissions_by_publication[submission.publication_recid].append(
                submission)
        else:
            submissions_by_publication[submission.publication_recid] = [
                submission
            ]

    # Loop through each publication
    for publication_recid, submissions in submissions_by_publication.items():
        publication_record = get_record_by_id(publication_recid)
        current_time = "{:%Y-%m-%d %H:%M:%S}".format(datetime.utcnow())
        generated_record_ids = []
        for submission in submissions:
            # Finalise each data submission that does not have a record
            finalise_datasubmission(current_time, {}, generated_record_ids,
                                    publication_record, publication_recid,
                                    submission, submission.version)

            # Register the datasubmission's DOI
            if not current_app.config.get('TESTING', False):
                generate_doi_for_table.delay(submission.doi)
                print(f"Generated DOI {submission.doi}")
            else:
                print(f"Would generate DOI {submission.doi}")

        # finalise_datasubmission does not commit, so commit once for each publication
        db.session.commit()

        # Reindex the publication and its updated datasubmissions
        index_record_ids([publication_recid] + generated_record_ids)
        push_data_keywords(pub_ids=[publication_recid])
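The routine takes no arguments and needs an application context for its database session and configuration. A sketch of running it as a one-off maintenance task; the create_app import path is an assumption:

from hepdata.factory import create_app  # assumed factory location

app = create_app()
with app.app_context():
    # With TESTING=True the DOIs are only printed, not registered.
    create_missing_datasubmission_records()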
Example #5
def add_resource(type, identifier, version):
    """
    Adds a data resource to either the submission or individual data files.
    :param type:
    :param identifier:
    :param version:
    :return:
    """

    submission = None
    inspire_id = None
    recid = None

    if type == "submission":
        submission = HEPSubmission.query.filter_by(publication_recid=identifier, version=version).one()
        if submission:
            inspire_id = submission.inspire_id
            recid = submission.publication_recid

    elif type == "data":
        submission = DataSubmission.query.filter_by(id=identifier).one()
        if submission:
            inspire_id = submission.publication_inspire_id
            recid = submission.publication_recid

    if not user_allowed_to_perform_action(recid):
        abort(403)

    analysis_type = request.form.get("analysisType", None)
    analysis_other = request.form.get("analysisOther", None)
    analysis_url = request.form.get("analysisURL", None)
    analysis_description = request.form.get("analysisDescription", None)

    if analysis_type == "other":
        analysis_type = analysis_other

    if analysis_type and analysis_url:

        if submission:
            new_resource = DataResource(
                file_location=analysis_url, file_type=analysis_type, file_description=str(analysis_description)
            )

            submission.resources.append(new_resource)

            try:
                db.session.add(submission)
                db.session.commit()

                try:
                    index_record_ids([recid])
                except Exception as e:
                    log.error("Failed to reindex {0}: {1}".format(recid, e))

                if inspire_id:
                    return redirect("/record/ins{0}".format(inspire_id))
                else:
                    return redirect("/record/{0}".format(recid))
            except Exception as e:
                db.session.rollback()
                raise e

    return render_template(
        "hepdata_records/error_page.html",
        recid=None,
        header_message="Error adding resource.",
        message="Unable to add resource. Please try again.",
        errors={},
    )
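The view reads four form fields: analysisType, analysisOther, analysisURL and analysisDescription. A sketch of exercising it with the Flask test client; the URL rule is an assumption, since the route registration is not part of this example:

with app.test_client() as client:
    response = client.post(
        '/record/resource/submission/98765/1',  # hypothetical route and identifiers
        data={
            'analysisType': 'other',
            'analysisOther': 'MadAnalysis 5',   # used because analysisType is 'other'
            'analysisURL': 'https://example.org/recast/98765',
            'analysisDescription': 'Recast of the fiducial measurement',
        })
    # On success the view redirects to /record/ins<inspire_id> or /record/<recid>.
    assert response.status_code in (200, 302)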
Example #6
    def update_file(inspire_id,
                    recid,
                    force=False,
                    only_record_information=False,
                    send_email=False,
                    send_tweet=False,
                    convert=False):
        self = Migrator()

        output_location, oldsite_last_updated = self.prepare_files_for_submission(
            inspire_id, force_retrieval=True)
        if output_location:
            updated_record_information, status = self.retrieve_publication_information(
                inspire_id)
            if status == 'success':
                record_information = update_record(recid,
                                                   updated_record_information)
            else:
                log.error("Failed to retrieve publication information for {0}".
                          format(inspire_id))
                return

            hep_submission = HEPSubmission.query.filter_by(
                publication_recid=recid).first()
            version_count = HEPSubmission.query.filter_by(
                publication_recid=recid).count()
            print('Old site last updated {}'.format(str(oldsite_last_updated)))
            print('New site last updated {}'.format(
                str(hep_submission.last_updated)))
            print('Coordinator ID is {}, version count is {}'.format(
                hep_submission.coordinator, version_count))
            allow_update = (hep_submission.last_updated < oldsite_last_updated or force) and \
                           hep_submission.coordinator == 1 and version_count == 1

            if not only_record_information and allow_update:
                try:
                    recid = self.load_submission(record_information,
                                                 output_location,
                                                 os.path.join(
                                                     output_location,
                                                     "submission.yaml"),
                                                 update=True)
                    print('Loaded record {}'.format(recid))

                    if recid is not None:
                        do_finalise(recid,
                                    publication_record=record_information,
                                    force_finalise=True,
                                    send_tweet=send_tweet,
                                    update=True,
                                    convert=convert)

                except FailedSubmission as fe:
                    log.error(fe.message)
                    fe.print_errors()
                    remove_submission(fe.record_id)
            elif not only_record_information:
                print('Not updating record {}'.format(recid))
            else:
                index_record_ids([record_information["recid"]])
                _cleaned_id = inspire_id.replace("ins", "")
                generate_dois_for_submission.delay(
                    inspire_id=_cleaned_id
                )  # update metadata stored in DataCite
                if send_email:
                    notify_publication_update(
                        hep_submission,
                        record_information)  # send email to all participants

        else:
            log.error("Failed to load {0}".format(inspire_id))
Example #7
def add_resource(type, identifier, version):
    """
    Adds a data resource to either the submission or individual data files.

    :param type:
    :param identifier:
    :param version:
    :return:
    """

    submission = None
    inspire_id = None
    recid = None

    if type == 'submission':
        submission = HEPSubmission.query.filter_by(publication_recid=identifier, version=version).one()
        if submission:
            inspire_id = submission.inspire_id
            recid = submission.publication_recid

    elif type == 'data':
        submission = DataSubmission.query.filter_by(id=identifier).one()
        if submission:
            inspire_id = submission.publication_inspire_id
            recid = submission.publication_recid

    if not user_allowed_to_perform_action(recid):
        abort(403)

    analysis_type = request.form.get('analysisType', None)
    analysis_other = request.form.get('analysisOther', None)
    analysis_url = request.form.get('analysisURL', None)
    analysis_description = request.form.get('analysisDescription', None)

    if analysis_type == 'other':
        analysis_type = analysis_other

    if analysis_type and analysis_url:

        if submission:
            new_resource = DataResource(file_location=analysis_url, file_type=analysis_type,
                                        file_description=str(analysis_description))

            submission.resources.append(new_resource)

            try:
                db.session.add(submission)
                db.session.commit()

                try:
                    index_record_ids([recid])
                except Exception as e:
                    log.error('Failed to reindex {0}: {1}'.format(recid, e))

                if inspire_id:
                    return redirect('/record/ins{0}'.format(inspire_id))
                else:
                    return redirect('/record/{0}'.format(recid))
            except Exception as e:
                db.session.rollback()
                raise e

    return render_template('hepdata_records/error_page.html', recid=None,
                           header_message='Error adding resource.',
                           message='Unable to add resource. Please try again.',
                           errors={})
Example #8
def update_record_info(inspire_id, send_email=False):
    """Update publication information from INSPIRE for a specific record."""

    if inspire_id is None:
        log.error("Inspire ID is None")
        return 'Inspire ID is None'

    inspire_id = inspire_id.replace("ins", "")

    hep_submission = get_latest_hepsubmission(inspire_id=inspire_id)
    if hep_submission is None:
        log.warning("Failed to retrieve HEPData submission for Inspire ID {0}".format(inspire_id))
        return 'No HEPData submission'

    publication_recid = hep_submission.publication_recid

    log.info("Updating recid {} with information from Inspire record {}".format(publication_recid, inspire_id))

    updated_inspire_record_information, status = get_inspire_record_information(inspire_id)

    if status == 'success':

        # Also need to update publication information for data records.
        data_submissions = DataSubmission.query.filter_by(
            publication_recid=publication_recid, version=hep_submission.version
        ).order_by(DataSubmission.id.asc())
        record_ids = [publication_recid]  # list of record IDs
        for data_submission in data_submissions:
            record_ids.append(data_submission.associated_recid)

        same_information = {}
        for index, recid in enumerate(record_ids):

            if index == 0:
                updated_record_information = updated_inspire_record_information
            else:
                # Only update selected keys for data records.
                updated_record_information = {
                    key: updated_inspire_record_information[key] for key in (
                        'authors', 'creation_date', 'journal_info', 'collaborations'
                    )
                }

            record_information = get_record_by_id(recid)
            same_information[recid] = True
            for key, value in updated_record_information.items():
                if key not in record_information or record_information[key] != value:
                    log.debug('For recid {}, key {} has new value {}'.format(recid, key, value))
                    same_information[recid] = False
                    update_record(recid, updated_record_information)
                    break
            log.info('For recid {}, information needs to be updated: {}'.format(recid, str(not(same_information[recid]))))

        if all(same for same in same_information.values()):
            return 'No update needed'

    else:
        log.warning("Failed to retrieve publication information for Inspire record {0}".format(inspire_id))
        return 'Invalid Inspire ID'

    if hep_submission.overall_status == 'finished':
        index_record_ids(record_ids)  # index for Elasticsearch
        push_data_keywords(pub_ids=[publication_recid])
        if not TESTING:
            generate_dois_for_submission.delay(inspire_id=inspire_id)  # update metadata stored in DataCite
        if send_email:
            record_information = get_record_by_id(publication_recid)
            notify_publication_update(hep_submission, record_information)   # send email to all participants

    return 'Success'
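update_record_info reports its outcome as a short status string rather than raising, so a caller can branch on the result. A minimal sketch with an illustrative INSPIRE ID:

status = update_record_info('ins1234567', send_email=False)
if status == 'Success':
    log.info('Publication information refreshed from INSPIRE')
elif status == 'No update needed':
    log.info('Record already up to date')
else:
    # One of 'Inspire ID is None', 'No HEPData submission' or 'Invalid Inspire ID'.
    log.warning('Update skipped: {0}'.format(status))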
Example #9
def do_finalise(recid,
                publication_record=None,
                force_finalise=False,
                commit_message=None,
                send_tweet=False,
                update=False,
                convert=True):
    """
        Creates record SIP for each data record with a link to the associated
        publication
        :param synchronous: if true then workflow execution and creation is
        waited on, then everything is indexed in one go.
        If False, object creation is asynchronous, however reindexing is not
        performed. This is only really useful for the full migration of
        content.
    """
    print('Finalising record {}'.format(recid))

    hep_submission = HEPSubmission.query.filter_by(
        publication_recid=recid, overall_status="todo").first()

    generated_record_ids = []
    if hep_submission \
        and (force_finalise or hep_submission.coordinator == int(current_user.get_id())):

        submissions = DataSubmission.query.filter_by(
            publication_recid=recid, version=hep_submission.version).all()

        version = hep_submission.version

        existing_submissions = {}
        if hep_submission.version > 1 or update:
            # we need to determine which are the existing record ids.
            existing_data_records = get_records_matching_field(
                'related_publication', recid, doc_type=CFG_DATA_TYPE)

            for record in existing_data_records["hits"]["hits"]:

                if "recid" in record["_source"]:
                    existing_submissions[record["_source"]["title"]] = \
                        record["_source"]["recid"]
                    delete_item_from_index(
                        record["_id"],
                        doc_type=CFG_DATA_TYPE,
                        parent=record["_source"]["related_publication"])

        current_time = "{:%Y-%m-%d %H:%M:%S}".format(datetime.now())

        for submission in submissions:
            finalise_datasubmission(current_time, existing_submissions,
                                    generated_record_ids, publication_record,
                                    recid, submission, version)

        try:
            record = get_record_by_id(recid)
            # If we have a commit message, then we have a record update.
            # We will store the commit message and also update the
            # last_updated flag for the record.
            record['hepdata_doi'] = hep_submission.doi

            if commit_message:
                # On a revision, the last updated date will
                # be the current date.
                hep_submission.last_updated = datetime.now()

                commit_record = RecordVersionCommitMessage(
                    recid=recid, version=version, message=str(commit_message))

                db.session.add(commit_record)

            record['last_updated'] = datetime.strftime(
                hep_submission.last_updated, '%Y-%m-%d %H:%M:%S')
            record['version'] = version

            record.commit()

            hep_submission.inspire_id = record['inspire_id']
            hep_submission.overall_status = "finished"
            db.session.add(hep_submission)

            db.session.commit()

            create_celery_app(current_app)

            # only mint DOIs if not testing.
            if not current_app.config.get(
                    'TESTING', False) and not current_app.config.get(
                        'NO_DOI_MINTING', False):
                for submission in submissions:
                    generate_doi_for_data_submission.delay(
                        submission.id, submission.version)
                log.info("Generating DOIs for ins{0}".format(
                    hep_submission.inspire_id))
                generate_doi_for_submission.delay(recid, version)

            # Reindex everything.
            index_record_ids([recid] + generated_record_ids)
            push_data_keywords(pub_ids=[recid])

            try:
                admin_indexer = AdminIndexer()
                admin_indexer.index_submission(hep_submission)
            except ConnectionTimeout as ct:
                log.error('Unable to add ins{0} to admin index.\n{1}'.format(
                    hep_submission.inspire_id, ct))

            send_finalised_email(hep_submission)

            if convert:
                for file_format in ['yaml', 'csv', 'yoda', 'root']:
                    convert_and_store.delay(hep_submission.inspire_id,
                                            file_format,
                                            force=True)

            if send_tweet:
                tweet(
                    record.get('title'), record.get('collaborations'),
                    "http://www.hepdata.net/record/ins{0}".format(
                        record.get('inspire_id')), version)

            return json.dumps({
                "success": True,
                "recid": recid,
                "data_count": len(submissions),
                "generated_records": generated_record_ids
            })

        except NoResultFound:
            print('No record found to update. Which is super strange.')

    else:
        return json.dumps({
            "success": False,
            "recid": recid,
            "errors": [
                "You do not have permission to finalise this "
                "submission. Only coordinators can do that."
            ]
        })
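do_finalise returns a JSON string in both the success and the permission-denied branches (and None if no record was found), so the result has to be decoded before use. A sketch:

import json

raw = do_finalise(recid, force_finalise=True, convert=False)
if raw:  # None means no matching record was found
    result = json.loads(raw)
    if result['success']:
        print('Finalised {} with {} data tables'.format(
            result['recid'], result['data_count']))
    else:
        print('; '.join(result['errors']))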
Example #10
def do_finalise(recid, publication_record=None, force_finalise=False,
                commit_message=None, send_tweet=False, update=False):
    """
        Creates record SIP for each data record with a link to the associated
        publication
        :param synchronous: if true then workflow execution and creation is
        waited on, then everything is indexed in one go.
        If False, object creation is asynchronous, however reindexing is not
        performed. This is only really useful for the full migration of
        content.
    """
    hep_submission = HEPSubmission.query.filter_by(
        publication_recid=recid, overall_status="todo").first()

    print('Finalising record {}'.format(recid))

    generated_record_ids = []
    if hep_submission \
        and (force_finalise or hep_submission.coordinator == int(current_user.get_id())):

        submissions = DataSubmission.query.filter_by(
            publication_recid=recid,
            version=hep_submission.version).all()

        version = hep_submission.version

        existing_submissions = {}
        if hep_submission.version > 1 or update:
            # we need to determine which are the existing record ids.
            existing_data_records = get_records_matching_field(
                'related_publication', recid, doc_type=CFG_DATA_TYPE)

            for record in existing_data_records["hits"]["hits"]:

                if "recid" in record["_source"]:
                    existing_submissions[record["_source"]["title"]] = \
                        record["_source"]["recid"]
                    delete_item_from_index(record["_id"],
                                           doc_type=CFG_DATA_TYPE, parent=record["_source"]["related_publication"])

        current_time = "{:%Y-%m-%d %H:%M:%S}".format(datetime.now())

        for submission in submissions:
            finalise_datasubmission(current_time, existing_submissions,
                                    generated_record_ids,
                                    publication_record, recid, submission,
                                    version)

        try:
            record = get_record_by_id(recid)
            # If we have a commit message, then we have a record update.
            # We will store the commit message and also update the
            # last_updated flag for the record.
            record['hepdata_doi'] = hep_submission.doi

            if commit_message:
                # On a revision, the last updated date will
                # be the current date.
                hep_submission.last_updated = datetime.now()

                commit_record = RecordVersionCommitMessage(
                    recid=recid,
                    version=version,
                    message=str(commit_message))

                db.session.add(commit_record)

            record['last_updated'] = datetime.strftime(
                hep_submission.last_updated, '%Y-%m-%d %H:%M:%S')
            record['version'] = version

            record.commit()

            hep_submission.inspire_id = record['inspire_id']
            hep_submission.overall_status = "finished"
            db.session.add(hep_submission)

            db.session.commit()

            create_celery_app(current_app)

            # only mint DOIs if not testing.
            if not current_app.config.get('TESTING', False) and not current_app.config.get('NO_DOI_MINTING', False):
                for submission in submissions:
                    generate_doi_for_data_submission.delay(submission.id, submission.version)

                generate_doi_for_submission.delay(recid, version)

            # Reindex everything.
            index_record_ids([recid] + generated_record_ids)
            push_data_keywords(pub_ids=[recid])

            admin_indexer = AdminIndexer()
            admin_indexer.index_submission(hep_submission)

            send_finalised_email(hep_submission)

            for file_format in ['csv', 'yoda', 'root']:
                convert_and_store.delay(hep_submission.inspire_id, file_format, force=True)

            if send_tweet:
                tweet(record.get('title'), record.get('collaborations'),
                      "http://www.hepdata.net/record/ins{0}".format(record.get('inspire_id')))

            return json.dumps({"success": True, "recid": recid,
                               "data_count": len(submissions),
                               "generated_records": generated_record_ids})

        except NoResultFound:
            print('No record found to update. Which is super strange.')

    else:
        return json.dumps(
            {"success": False, "recid": recid,
             "errors": ["You do not have permission to finalise this "
                        "submission. Only coordinators can do that."]})