Ejemplo n.º 1
0
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    """Create a record for every item of ``chunk`` and bulk-index the results.

    The record-modification and citation receivers are disconnected while the
    chunk is processed and reconnected in the ``finally`` clause, which also
    closes the database session.

    :param chunk: iterable of raw records; each item is handed to
        ``create_record``.
    :param broken_output: optional path of a file to which every record that
        raised during processing is appended.
    :param dry_run: forwarded unchanged to ``create_record``.
    """
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert
    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for record in chunk:
            # Reset per record so the error log below never reports values
            # left over from the previous iteration.
            recid = json = None
            try:
                recid, json = create_record(record,
                                            force=True, dry_run=dry_run)
                index = get_record_index(json) or \
                    cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                before_record_index.send(recid, json=json, index=index)
                json.update({'_index': index, '_type': 'record', '_id': recid, 'citation_count': 0})
                records_to_index.append(json)
            except Exception as err:
                # A failing record must not abort the whole chunk: log it,
                # optionally dump it, and move on to the next one.
                logger.error("ERROR with record {} and json {}".format(recid, json))
                logger.exception(err)
                if broken_output:
                    # Context manager so the handle is flushed and closed for
                    # every failed record instead of being leaked.
                    with open(broken_output, "a") as broken_output_fd:
                        print(record, file=broken_output_fd)

        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        # NOTE(review): the bulk call is issued even when ``dry_run`` is set;
        # confirm that this is intended.
        es_bulk(es, records_to_index, request_timeout=60)
    finally:
        # Restore the receivers disconnected above, whatever happened.
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()
Ejemplo n.º 2
0
def signals():
    """Fixtures to connect signals.

    Connects one counting listener to each of the eight record signals
    (before/after insert, update, delete and revert), yields the dictionary
    of call counts and disconnects every listener afterwards.

    :return: generator yielding a dict that maps a signal name to the number
        of times that signal has fired; a name is absent until its signal
        fires for the first time.
    """
    called = {}

    def _listener(signal_name, sender, *args, **kwargs):
        # Only the number of emissions is recorded; the payload is ignored.
        called[signal_name] = called.get(signal_name, 0) + 1

    watched = (
        ('after_record_delete', after_record_delete),
        ('after_record_insert', after_record_insert),
        ('after_record_revert', after_record_revert),
        ('after_record_update', after_record_update),
        ('before_record_delete', before_record_delete),
        ('before_record_insert', before_record_insert),
        ('before_record_revert', before_record_revert),
        ('before_record_update', before_record_update),
    )

    # One listener per signal, each bound to its signal's name. The list
    # keeps a reference to every listener for the lifetime of the fixture
    # (NOTE(review): presumably these are blinker signals, which reference
    # receivers weakly by default -- confirm).
    connected = [(signal, partial(_listener, name)) for name, signal in watched]

    for signal, listener in connected:
        signal.connect(listener)

    try:
        yield called
    finally:
        # Always detach, so listeners do not leak into later tests even if
        # the generator is closed abnormally.
        for signal, listener in connected:
            signal.disconnect(listener)
Ejemplo n.º 3
0
def create_author(profile):
    """Create a new author profile based on a given signature.

    The method receives a dictionary representing an author.
    Based on the values, it builds a dictionary for a new record, creates it
    with ``InspireRecord.create``, mints a recid for it and commits the
    result. The ``append_new_record_to_queue`` receiver is disconnected from
    ``after_record_insert`` while this happens and is always reconnected
    afterwards, even when the creation fails.

    :param profile:
        A signature representing an author to be created as a profile.

        Example:
            profile = {u'affiliations': [{u'value': u'Yerevan Phys. Inst.'}],
                       u'alternative_name': None,
                       u'curated_relation': False,
                       u'email': None,
                       u'full_name': u'Chatrchyan, Serguei',
                       u'inspire_id': None,
                       u'orcid': None,
                       u'profile': u'',
                       u'recid': None,
                       u'role': None,
                       u'uuid': u'd63537a8-1df4-4436-b5ed-224da5b5028c'}

    :return:
        A recid, where the new profile can be accessed.

        Example:
            "1234"
    """
    # Template of an initial record.
    record = {'collections': [{'primary': 'HEPNAMES'}],
              'name': {'value': profile.get('full_name')},
              '$schema': _get_author_schema()}

    # The author's email address.
    # Unfortunately the method will not correlate a given e-mail address
    # with an affiliation.
    # Signatures may carry ``email: None`` (see the example above); such a
    # value is skipped instead of being stored as a position without data.
    email = profile.get('email')
    if email is not None:
        record['positions'] = [{'email': email}]

    # The author can be a member of more than one affiliation.
    if 'affiliations' in profile:
        positions = record.setdefault('positions', [])

        # A ``None`` value is treated like an empty list of affiliations.
        for affiliation in profile.get('affiliations') or []:
            institution = {'name': affiliation.get('value')}
            recid = affiliation.get('recid')

            # The institution recid is attached only when it is known.
            if recid:
                institution['recid'] = recid

            positions.append({'institution': institution})

    # FIXME: The method should also collect the useful data
    #        from the publication, like category field, subject,
    #        etc.

    # Disconnect the signal on insert of a new record.
    after_record_insert.disconnect(append_new_record_to_queue)

    try:
        # Create a new author profile.
        record = InspireRecord.create(record, id_=None)

        # Create Inspire recid.
        record_pid = inspire_recid_minter(record.id, record)

        # Extend the new record with Inspire recid and self key.
        record['control_number'] = record_pid.pid_value
        record['self'] = inspire_dojson_utils.get_record_ref(
            record_pid.pid_value, 'authors')

        # Apply the changes.
        record.commit()
        db.session.commit()
    finally:
        # Reconnect the disconnected signal, also when the creation above
        # raised, so the receiver is never left detached.
        after_record_insert.connect(append_new_record_to_queue)

    # Report.
    logger.info("Created profile: %s", record_pid.pid_value)

    # Return the recid of new profile to which signatures will point to.
    return record_pid.pid_value
Ejemplo n.º 4
0
def create_author(profile):
    """Create a new author profile based on a given signature.

    The method receives a dictionary representing an author.
    Based on the values, it builds a dictionary for a new record, creates it
    with ``Record.create``, mints a recid for it, commits the result and
    indexes it through ``RecordIndexer``. The ``append_new_record_to_queue``
    receiver is disconnected from ``after_record_insert`` while this happens
    and is always reconnected afterwards, even when a step fails.

    :param profile:
        A signature representing an author to be created as a profile.

        Example:
            profile = {u'affiliations': [{u'value': u'Yerevan Phys. Inst.'}],
                       u'alternative_name': None,
                       u'curated_relation': False,
                       u'email': None,
                       u'full_name': u'Chatrchyan, Serguei',
                       u'inspire_id': None,
                       u'orcid': None,
                       u'profile': u'',
                       u'recid': None,
                       u'role': None,
                       u'uuid': u'd63537a8-1df4-4436-b5ed-224da5b5028c'}

    :return:
        A recid, where the new profile can be accessed.

        Example:
            "1234"
    """
    # Template of an initial record.
    record = {'collections': [{'primary': 'HEPNAMES'}],
              'name': {'value': profile.get('full_name')},
              '$schema': _get_author_schema()}

    # The author's email address.
    # Unfortunately the method will not correlate a given e-mail address
    # with an affiliation.
    # Signatures may carry ``email: None`` (see the example above); such a
    # value is skipped instead of being stored as a position without data.
    email = profile.get('email')
    if email is not None:
        record['positions'] = [{'email': email}]

    # The author can be a member of more than one affiliation.
    if 'affiliations' in profile:
        positions = record.setdefault('positions', [])

        # A ``None`` value is treated like an empty list of affiliations.
        for affiliation in profile.get('affiliations') or []:
            institution = {'name': affiliation.get('value')}
            recid = affiliation.get('recid')

            # The institution recid is attached only when it is known.
            if recid:
                institution['recid'] = recid

            positions.append({'institution': institution})

    # FIXME: The method should also collect the useful data
    #        from the publication, like category field, subject,
    #        etc.

    # Disconnect the signal on insert of a new record.
    after_record_insert.disconnect(append_new_record_to_queue)

    try:
        # Create a new author profile.
        record = Record.create(record, id_=None)

        # Create Inspire recid.
        record_pid = inspire_recid_minter(record.id, record)

        # Extend the new record with Inspire recid and self key.
        record['control_number'] = record_pid.pid_value
        record['self'] = inspire_dojson_utils.get_record_ref(
            record_pid.pid_value, 'authors')

        # Apply the changes.
        record.commit()
        db.session.commit()

        # Add the record to Elasticsearch.
        indexer = RecordIndexer()
        indexer.index_by_id(record_pid.object_uuid)
    finally:
        # Reconnect the disconnected signal, also when one of the steps
        # above raised, so the receiver is never left detached.
        after_record_insert.connect(append_new_record_to_queue)

    # Report.
    logger.info("Created profile: %s", record_pid.pid_value)

    # Return the recid of new profile to which signatures will point to.
    return record_pid.pid_value
Ejemplo n.º 5
0
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    """Migrate one chunk of raw records and index the results in bulk.

    Every raw record is parsed with ``marc_create_record`` and turned into a
    record by ``create_record`` inside a nested database transaction. Unless
    ``dry_run`` is set, the outcome of each record is stored on an
    ``InspireProdRecords`` row and the resulting JSON is queued for a single
    Elasticsearch bulk request issued after the session commit.

    The record-modification and citation receivers are disconnected for the
    duration of the call; the ``finally`` clause reconnects them and closes
    the database session.

    :param chunk: iterable of raw records.
    :param broken_output: accepted but not used by this implementation.
    :param dry_run: when true, no ``InspireProdRecords`` row is prepared,
        nothing is queued for indexing and the bulk request is skipped; the
        session commit is still issued.
    """
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert

    # (signal, receiver) pairs silenced while the chunk is processed.
    silenced = (
        (models_committed, record_modification),
        (after_record_insert, catch_citations_insert),
        (before_record_index, add_citation_count_on_insert_or_update),
        (before_record_index, catch_citations_update),
    )
    for signal, receiver in silenced:
        signal.disconnect(receiver)

    records_to_index = []
    try:
        for raw_record in chunk:
            record_json = None
            marc_record = marc_create_record(raw_record,
                                             keep_singletons=False)
            recid = int(marc_record['001'])
            if not dry_run:
                prod_row = InspireProdRecords(recid=recid)
                prod_row.marcxml = raw_record
            try:
                with db.session.begin_nested():
                    errors, recid, record_json = create_record(
                        recid, marc_record, force=True,
                        dry_run=dry_run, validation=True
                    )
                    # A dry run stops right after the record was built.
                    if not dry_run:
                        prod_row.valid = not errors
                        prod_row.errors = errors
                        index = get_record_index(record_json)
                        if not index:
                            index = cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                        before_record_index.send(
                            recid, json=record_json, index=index)
                        # Metadata keys for the bulk action; the citation
                        # count is written as 0 here.
                        bulk_fields = {
                            '_index': index,
                            '_type': 'record',
                            '_id': recid,
                            'citation_count': 0,
                        }
                        record_json.update(bulk_fields)
                        records_to_index.append(record_json)
                        prod_row.successful = True
                        db.session.merge(prod_row)
            except Exception as err:
                # A failing record is logged and flagged, then the loop
                # carries on with the next one.
                logger.error("ERROR with record {} and json {}".format(recid, record_json))
                logger.exception(err)
                if not dry_run:
                    prod_row.successful = False
                    db.session.merge(prod_row)
        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        if not dry_run:
            es_bulk(es, records_to_index, request_timeout=60)
    finally:
        # Restore the receivers in the order they were disconnected.
        for signal, receiver in silenced:
            signal.connect(receiver)
        db.session.close()