Example #1
def open_rt_ticket(e, debug_log=False, queue='Test'):
    """Take an exception e and, if allowed by the configuration,
    open a ticket for that exception.

    Arguments:
    e -- the exception to be reported
    """
    global ticket_hashes
    ticket_hash = e.hash()
    subject = e.get_message_subject() + ' ' + ticket_hash
    body = e.get_message_body()
    if debug_log:
        debug = "\n Debugging information: \n" + e.__repr__() + '\n' + \
            '\n'.join([
                str(key) + " " +
                str(value) for key, value in vars(e).iteritems()])
    else:
        debug = ''
    if rt_ticket_report:
        if ticket_hash not in ticket_hashes:
            ticket_id = BIBCATALOG_SYSTEM.ticket_submit(uid=None,
                                                        subject=subject,
                                                        recordid=e.recid,
                                                        text=body + debug,
                                                        queue=queue,
                                                        priority="",
                                                        owner="",
                                                        requestor="")
            HooverStats.tickets_raised += 1
            ticket_data = BIBCATALOG_SYSTEM.ticket_get_info(None, ticket_id)
            ticket_hashes[ticket_hash] = ticket_data, ticket_id, True
        else:
            ticket_hashes[ticket_hash] = ticket_hashes[ticket_hash][:2] + \
                (True,)
            # If the ticket already exists, check its status.  In case it is
            # marked as somehow solved -- i.e. resolved, deleted or rejected --
            # reopen it.
            if ticket_hashes[ticket_hash][0]['status'] in [
                    'resolved', 'deleted', 'rejected'
            ]:
                BIBCATALOG_SYSTEM.ticket_set_attribute(
                    None, ticket_hashes[ticket_hash][1], 'status', 'open')
                HooverStats.tickets_raised += 1
    else:
        write_message('sub: ' + subject + '\nbody:\n' + body + '\ndbg:\n' +
                      debug,
                      verbose=9)
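
For reference, a minimal sketch of the exception interface that open_rt_ticket relies on: a hash() used to deduplicate tickets, get_message_subject() and get_message_body() for the ticket text, and a recid attribute. The class and values below are illustrative stand-ins, not part of the hoover module:

import hashlib

# Illustrative stand-in for the hoover exception interface assumed above.
class ExampleHooverException(Exception):

    def __init__(self, message, recid):
        Exception.__init__(self, message)
        self.recid = recid

    def hash(self):
        # Any stable digest of the problem works; md5 of message + recid here.
        return hashlib.md5('%s-%s' % (self.args[0], self.recid)).hexdigest()

    def get_message_subject(self):
        # '[Hoover]' keeps the subject findable by hoover's ticket_search().
        return '[Hoover] ' + self.args[0]

    def get_message_body(self):
        return 'Problem with record %s: %s' % (self.recid, self.args[0])

# Assumes the module globals rt_ticket_report and ticket_hashes are set up.
try:
    raise ExampleHooverException('duplicate identifier', 123456)
except ExampleHooverException as exc:
    open_rt_ticket(exc, debug_log=True, queue='Test')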
Example #2
def hoover(authors=None,
           check_db_consistency=False,
           dry_run=False,
           packet_size=1000,
           dry_hepnames_run=False,
           open_tickets=False,
           queue='Test'):
    """The actions that hoover performs are the following:
    1. Find out which identifiers belong to the authors (pids) in the database.
    2. Find all the signatures that carry the same identifier as an author and
       pull them to that author.
    3. Connect the profile of the author with its hepnames collection entry.
    (optional) Check that the database is in a consistent state.

    Keyword arguments:
    authors -- an iterable of authors to be hoovered
    check_db_consistency -- perform checks for the consistency of the database
    dry_run -- do not alter the database tables
    packet_size -- squeeze the marcxml together so that there are fewer
                   bibupload processes for bibsched to run
    dry_hepnames_run -- do not alter the hepnames collection
    queue -- the name of the queue to be used in the rt system for the tickets
    """
    global rt_ticket_report
    rt_ticket_report = open_tickets
    write_message("Packet size {0}".format(packet_size), verbose=1)
    write_message("Initializing hoover", verbose=1)
    write_message("Selecting records with identifiers...", verbose=1)
    recs = get_records_with_tag('100__i')
    task_sleep_now_if_required(can_stop_too=True)
    recs += get_records_with_tag('100__j')
    task_sleep_now_if_required(can_stop_too=True)
    recs += get_records_with_tag('700__i')
    task_sleep_now_if_required(can_stop_too=True)
    recs += get_records_with_tag('700__j')
    task_sleep_now_if_required(can_stop_too=True)
    write_message("Found {0} records".format(len(set(recs))), verbose=2)
    recs = set(recs) & set(
        run_sql("select DISTINCT(bibrec) from aidPERSONIDPAPERS"))
    write_message("   out of which {0} are in BibAuthorID".format(len(recs)),
                  verbose=2)
    task_sleep_now_if_required(can_stop_too=True)

    records_with_id = set(rec[0] for rec in recs)

    destroy_partial_marc_caches()
    populate_partial_marc_caches(records_with_id, create_inverted_dicts=True)
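    # Rebuild the in-memory MARC caches for just these records, so that the id
    # and signature getters below work from cached data.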

    if rt_ticket_report:
        global ticket_hashes
        write_message("Ticketing system rt is used", verbose=9)
        write_message("Building hash cache for tickets for queue %s" % queue,
                      verbose=9)
        ticket_ids = BIBCATALOG_SYSTEM.ticket_search(None,
                                                     subject='[Hoover]',
                                                     queue=queue)
        write_message("Found %s existing tickets" % len(ticket_ids), verbose=9)
        for ticket_id in ticket_ids:
            task_sleep_now_if_required(can_stop_too=True)
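            # open_rt_ticket() appends the exception hash as the last token of
            # the subject; cache it as "not yet seen" (False) for this run.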
            try:
                ticket_data = BIBCATALOG_SYSTEM.ticket_get_info(
                    None, ticket_id)
                ticket_hash = ticket_data['subject'].split()[-1]
                ticket_hashes[ticket_hash] = ticket_data, ticket_id, False
            except IndexError:
                write_message(
                    "Problem in subject of ticket {0}".format(ticket_id),
                    verbose=5)
        write_message("Found {0} tickets".format(len(ticket_hashes)),
                      verbose=2)

    task_sleep_now_if_required(can_stop_too=True)
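    # For each identifier type, fdict_id_getters bundles:
    #   'reliable'/'unreliable' -- getters that try to find an id for a pid,
    #   'signatures_getter'     -- returns the signatures carrying a given id,
    #   'connection'            -- builds the hepnames connection entry,
    #   'data_dicts'            -- pid<->id mappings filled during the run.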
    fdict_id_getters = {
        "INSPIREID": {
            'reliable': [
                get_inspire_id_of_author, get_inspireID_from_hepnames,
                lambda pid: get_inspireID_from_claimed_papers(
                    pid, intersection_set=records_with_id, queue=queue)
            ],
            'unreliable': [
                lambda pid: get_inspireID_from_unclaimed_papers(
                    pid, intersection_set=records_with_id, queue=queue)
            ],
            'signatures_getter':
            get_signatures_with_inspireID,
            'connection':
            dict_entry_for_hepnames_connector,
            'data_dicts': {
                'pid_mapping': defaultdict(set),
                'id_mapping': defaultdict(set)
            }
        },
        "ORCID": {
            'reliable': [  # get_orcid_id_of_author,
                # get_inspireID_from_hepnames,
                # lambda pid: get_inspireID_from_claimed_papers(pid,
                # intersection_set=records_with_id)]
            ],
            'unreliable': [
                # get_inspireID_from_hepnames,
                # lambda pid: get_inspireID_from_claimed_papers(pid,
                # intersection_set=records_with_id)]
            ],
            'signatures_getter': lambda x: list(),
            'connection': lambda pid, _id: None,
            'data_dicts': {
                'pid_mapping': defaultdict(set),
                'id_mapping': defaultdict(set)
            }
        }
    }

    if not authors:
        authors = get_existing_authors()

    write_message("Running on {0}".format(len(authors)), verbose=2)

    unclaimed_authors = defaultdict(set)
    hep_connector = HepnamesConnector(packet_size=packet_size,
                                      dry_hepnames_run=dry_hepnames_run)

    for index, pid in enumerate(authors):
        task_sleep_now_if_required(can_stop_too=True)
        write_message("Searching for reliable ids of person {0}".format(pid),
                      verbose=2)
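        # Try each identifier type's reliable getters; the first non-empty
        # result wins unless a database consistency check is requested.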
        for identifier_type, functions in fdict_id_getters.iteritems():
            write_message("    Type: {0}".format(identifier_type, ), verbose=9)

            try:
                G = (func(pid) for func in functions['reliable'])
                if check_db_consistency:
                    results = [result for result in G if result]
                    try:
                        # All reliable getters must agree on a single id.
                        consistent_db = len(set(results)) <= 1
                        res = results[0]
                    except IndexError:
                        res = None
                    else:
                        if not consistent_db:
                            res = None
                            raise InconsistentIdentifiersException(
                                'Inconsistent database', pid, identifier_type,
                                set(results))
                else:
                    res = next((result for result in G if result), None)
            except MultipleIdsOnSingleAuthorException as e:
                open_rt_ticket(e, queue=queue)
            except BrokenHepNamesRecordException:
                continue
            except InconsistentIdentifiersException as e:
                open_rt_ticket(e, queue=queue)
            except MultipleHepnamesRecordsWithSameIdException as e:
                open_rt_ticket(e, queue=queue)
            else:
                if res:
                    HooverStats.new_ids_found += 1
                    write_message("   Found reliable id {0}".format(res, ),
                                  verbose=9)
                    fdict_id_getters[identifier_type]['data_dicts'][
                        'pid_mapping'][pid].add(res)
                    fdict_id_getters[identifier_type]['data_dicts'][
                        'id_mapping'][res].add(pid)
                else:
                    write_message("   No reliable id found", verbose=9)
                    unclaimed_authors[identifier_type].add(pid)

    write_message("Vacuuming reliable ids...", verbose=2)

    for identifier_type, data in fdict_id_getters.iteritems():
        task_sleep_now_if_required(can_stop_too=True)
        hep_connector.produce_connection_entry = fdict_id_getters[
            identifier_type]['connection']
        for pid, identifiers in data['data_dicts']['pid_mapping'].iteritems():
            write_message(
                "   Person {0} has reliable identifier(s) {1} ".format(
                    str(pid), str(identifiers)),
                verbose=9)
            try:
                if len(identifiers) == 1:
                    identifier = list(identifiers)[0]
                    write_message(
                        "        Considering {0}".format(identifier),
                        verbose=9)

                    if len(data['data_dicts']['id_mapping'][identifier]) == 1:
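                        # The id maps back to exactly this one author, so its
                        # signatures can safely be vacuumed onto this pid.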
                        if not dry_run:
                            rowenta = Vacuumer(pid)
                            signatures = data['signatures_getter'](identifier)
                            write_message(
                                "        Vacuuming {0} signatures! ".format(
                                    str(len(signatures))),
                                verbose=4)
                            for sig in signatures:
                                try:
                                    rowenta.vacuum_signature(sig)
                                except DuplicateClaimedPaperException as e:
                                    open_rt_ticket(e, queue=queue)
                                except DuplicateUnclaimedPaperException as e:
                                    unclaimed_authors[identifier_type].add(
                                        e.pid)
                            write_message(
                                "        Adding inspireid {0} to pid {1}".
                                format(identifier, pid),
                                verbose=3)
                            add_external_id_to_author(pid, identifier_type,
                                                      identifier)
                            hep_connector.add_connection(pid, identifier)

                    else:
                        raise MultipleAuthorsWithSameIdException(
                            "More than one author with the same identifier",
                            data['data_dicts']['id_mapping'][identifier],
                            identifier)
                else:
                    raise MultipleIdsOnSingleAuthorException(
                        "More than one identifier on a single author", pid,
                        identifier_type, identifiers)

            except MultipleAuthorsWithSameIdException as e:
                open_rt_ticket(e, queue=queue)
            except MultipleIdsOnSingleAuthorException as e:
                open_rt_ticket(e, queue=queue)
            except MultipleHepnamesRecordsWithSameIdException as e:
                open_rt_ticket(e, queue=queue)
            write_message("   Done with {0}".format(pid, ), verbose=3)

    write_message("Vacuuming unreliable ids...", verbose=2)

    for identifier_type, functions in fdict_id_getters.iteritems():
        task_sleep_now_if_required(can_stop_too=True)
        hep_connector.produce_connection_entry = fdict_id_getters[
            identifier_type]['connection']
        for index, pid in enumerate(unclaimed_authors[identifier_type]):
            write_message(
                "Searching for unreliable ids of person {0}".format(pid),
                verbose=9)
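            # Fall back to the unreliable getters; an id that is already
            # assigned to another person is skipped further down.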
            try:
                G = (func(pid) for func in functions['unreliable'])
                res = next((result for result in G if result), None)
                if res is None:
                    continue
            except MultipleIdsOnSingleAuthorException:
                continue
            except BrokenHepNamesRecordException:
                continue
            except MultipleHepnamesRecordsWithSameIdException as e:
                open_rt_ticket(e, queue=queue)

            HooverStats.new_ids_found += 1
            write_message(
                "   Person {0} has unreliable identifier {1} ".format(
                    str(pid), str(res)),
                verbose=9)

            if res in fdict_id_getters[identifier_type]['data_dicts'][
                    'id_mapping']:
                write_message(
                    "        Id {0} is already assigned to another person, "
                    "skipping person {1}".format(str(res), pid))
                continue

            if not dry_run:
                rowenta = Vacuumer(pid)
                signatures = functions['signatures_getter'](res)
                for sig in signatures:
                    try:
                        rowenta.vacuum_signature(sig)
                    except DuplicateClaimedPaperException as e:
                        open_rt_ticket(e, queue=queue)
                    except DuplicateUnclaimedPaperException as e:
                        pass

                write_message("     Adding inspireid {0} to pid {1}".format(
                    res, pid),
                              verbose=3)
                add_external_id_to_author(pid, identifier_type, res)
                hep_connector.add_connection(pid, res)
            write_message("   Done with {0}".format(pid), verbose=3)
    hep_connector.execute_connection()
    for ticket in ticket_hashes.itervalues():
        # Tickets whose hash was not seen during this run refer to problems
        # that no longer exist, so mark them as resolved.
        if not ticket[2]:
            BIBCATALOG_SYSTEM.ticket_set_attribute(None, ticket[1], 'status',
                                                   'resolved')

    HooverStats.report_results()
    write_message("Terminating hoover", verbose=1)