Example #1
def get_recids_for_rules(rules):
    """
    Generates the final list of record IDs to load.

    @param rules: dict of rules {rule_name: rule_dict}
    @type rules: dict of rules

    @return: dict {rule_name: intbitset of record IDs}
    """
    override_record_ids = task_get_option("record_ids")
    recids = {}
    for rule_name, rule in rules.iteritems():
        if "filter_pattern" in rule or "filter_collection" in rule:
            query = rule.get("filter_pattern", '')
            if "filter_collection" in rule:
                collections = rule["filter_collection"].split()
            else:
                collections = None
            write_message("Performing given search query: '%s'" % query)
            if collections:
                result = perform_request_search(p=query,
                                                of='intbitset',
                                                wl=rule.get('filter_limit', 0),
                                                f=rule.get(
                                                    'filter_field', None),
                                                c=collections)
            else:
                result = search_pattern(
                    p=query,
                    wl=rule.get('filter_limit', 0),
                    f=rule.get('filter_field', None),
                )
        else:
            result = intbitset(trailing_bits=True)

        if override_record_ids is not None:
            result.intersection_update(override_record_ids)
        else:
            last_run = get_rule_lastrun(rule_name)
            modified_recids = get_modified_records_since(last_run)
            if not "consider_deleted_records" in rule:
                modified_recids -= search_unit_in_bibxxx(p='DELETED',
                                                         f='980__%',
                                                         type='e')
                if CFG_CERN_SITE:
                    modified_recids -= search_unit_in_bibxxx(p='DUMMY',
                                                             f='980__%',
                                                             type='e')
            result.intersection_update(modified_recids)
        recids[rule_name] = result

    return recids
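
A minimal usage sketch for the function above, assuming the bibcheck-style task context it runs in; the rule names and filter values below are invented, and only the keys (filter_pattern, filter_collection, filter_limit, consider_deleted_records) are taken from the code:

rules = {
    # Rule restricted by a search pattern and two collections.
    "doi-syntax": {
        "filter_pattern": "doi:10*",
        "filter_collection": "Articles Preprints",
        "filter_limit": 0,
    },
    # Rule with no filter: starts from every record
    # (intbitset(trailing_bits=True)) and is then intersected with the
    # records modified since the rule last ran.
    "full-sweep": {
        "consider_deleted_records": True,
    },
}

# Inside a running task this yields something like
# {"doi-syntax": intbitset([...]), "full-sweep": intbitset([...])}.
recids_per_rule = get_recids_for_rules(rules)
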
def oai_get_recid_list(set_spec="", fromdate="", untildate=""):
    """
    Returns the list of recids for the OAI set 'set_spec', modified from 'fromdate' until 'untildate'.
    """
    ret = intbitset()
    if not set_spec:
        ret |= search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD, type='e')
        if CFG_OAI_DELETED_POLICY != 'no':
            ret |= search_unit_in_bibxxx(p='*',
                                         f=CFG_OAI_PREVIOUS_SET_FIELD,
                                         type='e')
    else:
        ret |= search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e')
        ret |= search_unit_in_bibxxx(p='%s:*' % set_spec,
                                     f=CFG_OAI_SET_FIELD,
                                     type='e')
        if CFG_OAI_DELETED_POLICY != 'no':
            ret |= search_unit_in_bibxxx(p=set_spec,
                                         f=CFG_OAI_PREVIOUS_SET_FIELD,
                                         type='e')
            ret |= search_unit_in_bibxxx(p='%s:*' % set_spec,
                                         f=CFG_OAI_PREVIOUS_SET_FIELD,
                                         type='e')
    if CFG_OAI_DELETED_POLICY == 'no':
        ret -= search_unit_in_bibxxx(p='DELETED', f='980__%', type='e')
        if CFG_CERN_SITE:
            ret -= search_unit_in_bibxxx(p='DUMMY', f='980__%', type='e')
    return filter_out_based_on_date_range(ret, fromdate, untildate)
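
A short usage sketch, assuming the Invenio environment the example comes from (CFG_OAI_SET_FIELD and the deleted-records policy configured); the set spec and the date strings are invented, and the date format is an assumption:

# All records that are, or used to be, in the hypothetical 'cern:theses'
# set and were modified during 2013; the result is an intbitset of recids.
recids = oai_get_recid_list(set_spec="cern:theses",
                            fromdate="2013-01-01 00:00:00",
                            untildate="2013-12-31 23:59:59")
write_message("%d matching records" % len(recids))
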
Example #3
def get_recids_for_rules(rules):
    """
    Generates the final list of record IDs to load.

    @param rules: dict of rules {rule_name: rule_dict}
    @type rules: dict of rules

    @return: dict {rule_name: intbitset of record IDs}
    """
    override_record_ids = task_get_option("record_ids")
    recids = {}
    for rule_name, rule in rules.iteritems():
        if "filter_pattern" in rule:
            query = rule["filter_pattern"]
            if "filter_collection" in rule:
                collections = rule["filter_collection"].split()
            else:
                collections = None
            write_message("Performing given search query: '%s'" % query)
            if collections:
                result = perform_request_search(
                    p=query,
                    of='intbitset',
                    wl=rule.get('filter_limit', 0),
                    f=rule.get('filter_field', None),
                    c=collections
                )
            else:
                result = search_pattern(
                    p=query,
                    wl=rule.get('filter_limit', 0),
                    f=rule.get('filter_field', None),
                )
        else:
            result = intbitset(trailing_bits=True)

        if override_record_ids is not None:
            result.intersection_update(override_record_ids)
        else:
            last_run = get_rule_lastrun(rule_name)
            modified_recids = get_modified_records_since(last_run)
            if not "consider_deleted_records" in rule:
                modified_recids -= search_unit_in_bibxxx(p='DELETED', f='980__%', type='e')
                if CFG_CERN_SITE:
                    modified_recids -= search_unit_in_bibxxx(p='DUMMY', f='980__%', type='e')
            result.intersection_update(modified_recids)
        recids[rule_name] = result

    return recids
def oai_get_recid_list(set_spec="", fromdate="", untildate=""):
    """
    Returns the list of recids for the OAI set 'set_spec', modified from 'fromdate' until 'untildate'.
    """
    ret = intbitset()
    if not set_spec:
        ret |= search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD, type='e')
        if CFG_OAI_DELETED_POLICY != 'no':
            ret |= search_unit_in_bibxxx(p='*', f=CFG_OAI_PREVIOUS_SET_FIELD, type='e')
    else:
        ret |= search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e')
        ret |= search_unit_in_bibxxx(p='%s:*' % set_spec, f=CFG_OAI_SET_FIELD, type='e')
        if CFG_OAI_DELETED_POLICY != 'no':
            ret |= search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_PREVIOUS_SET_FIELD, type='e')
            ret |= search_unit_in_bibxxx(p='%s:*' % set_spec, f=CFG_OAI_PREVIOUS_SET_FIELD, type='e')
    if CFG_OAI_DELETED_POLICY == 'no':
        ret -= search_unit_in_bibxxx(p='DELETED', f='980__%', type='e')
        if CFG_CERN_SITE:
            ret -= search_unit_in_bibxxx(p='DUMMY', f='980__%', type='e')
    return filter_out_based_on_date_range(ret, fromdate, untildate, set_spec)
Example #5
def get_inspireID_from_hepnames(pid):
    """return inspireID of a pid by searching the hepnames

    Arguments:
    pid -- the pid of the author to search in the hepnames dataset
    """
    author_canonical_name = get_canonical_name_of_author(pid)
    hepnames_recids = get_all_recids_in_hepnames()
    try:
        recid = set(
            search_unit_in_bibxxx(p=author_canonical_name[0][0],
                                  f='035__',
                                  type='='))
        recid = list(recid & hepnames_recids)

        if len(recid) > 1:
            raise MultipleHepnamesRecordsWithSameIdException(
                "More than one hepnames record found with the same inspire id",
                recid, 'INSPIREID')

        hepname_record = get_record(recid[0])
        fields_dict = [dict(x[0]) for x in hepname_record['035']]
        inspire_ids = []
        for d in fields_dict:
            if '9' in d and d['9'] == 'INSPIRE':
                try:
                    inspire_ids.append(d['a'])
                except KeyError:
                    raise BrokenHepNamesRecordException(
                        "There is no inspire id present, althought there is a MARC tag.",
                        recid[0], 'INSPIREID')
        if len(inspire_ids) > 1:
            raise BrokenHepNamesRecordException(
                "Multiple inspire ids found in the record.", recid[0],
                'INSPIREID')
        else:
            return inspire_ids[0]
    except IndexError:
        return None
    except KeyError:
        return None
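
To make the 035 handling above easier to follow, here is a self-contained sketch of the same parsing on a hand-built record in the legacy BibRecord layout (each field is a tuple whose first element is the list of (code, value) subfield pairs); the identifiers are invented:

# A fake hepnames record fragment: two 035 fields, one INSPIRE, one ORCID.
hepname_record = {
    '035': [
        ([('9', 'INSPIRE'), ('a', 'INSPIRE-00123456')], ' ', ' ', '', 3),
        ([('9', 'ORCID'), ('a', '0000-0002-1825-0097')], ' ', ' ', '', 4),
    ],
}

fields_dict = [dict(field[0]) for field in hepname_record['035']]
inspire_ids = [d['a'] for d in fields_dict if d.get('9') == 'INSPIRE']
# inspire_ids == ['INSPIRE-00123456']
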
Example #6
def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    if run_sql("SELECT id FROM schTASK WHERE proc='bibupload:oairepository' AND status='WAITING'"):
        write_message("Previous requests of oairepository still being elaborated. Let's skip this execution.")
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot), verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD, type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid), verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD, type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" % (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e')
        write_message("%s recids should be in %s. Currently %s are in %s" % (len(should_recids), set_spec, len(current_recids), set_spec), verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" % (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" % (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" % (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" % len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)), verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                  prefix='oairepository_' + \
                                  time.strftime("%Y%m%d_%H%M%S_",
                                                time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid, verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" % (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" % (oai_id_entry, recid), verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(record_get_field_values(record, tag=CFG_OAI_SET_FIELD[:3], ind1=CFG_OAI_SET_FIELD[3], ind2=CFG_OAI_SET_FIELD[4], code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" % (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(record_get_field_values(record, tag=CFG_OAI_PREVIOUS_SET_FIELD[:3], ind1=CFG_OAI_PREVIOUS_SET_FIELD[3], ind2=CFG_OAI_PREVIOUS_SET_FIELD[4], code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message("Record %s currently doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(current_previous_oai_sets)), verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set for _set, _recids in recids_for_set.iteritems()
             if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" % (recid, ", ".join(updated_oai_sets)), verbose=3)

        updated_previous_oai_sets = set(_set for _set in (current_previous_oai_sets - updated_oai_sets) |
             (current_oai_sets - updated_oai_sets))
        write_message("Record %s now doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(updated_previous_oai_sets)), verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change. Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" % recid, verbose=3)
            continue # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n', '-Noairepository', '-P', '-1')
                else:
                    task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-Noairepository', '-P', '-1')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                        prefix='oairepository_' + \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                        time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository', '-c', filename)
    else:
        os.remove(filename)

    return True
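
The bookkeeping above is plain set arithmetic on intbitset objects; here is a self-contained sketch (intbitset is the standalone `intbitset` package used by Invenio, and the recids are invented):

from intbitset import intbitset

should_recids = intbitset([1, 2, 3, 5])      # what the set definition says
current_recids = intbitset([2, 3, 4])        # what is currently exported
to_add = should_recids - current_recids      # intbitset([1, 5])
to_remove = current_recids - should_recids   # intbitset([4])
affected_recids = to_add | to_remove         # intbitset([1, 4, 5])
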
Example #7
def repository_size():
    """Read repository size"""
    return len(search_unit_in_bibxxx(p="*", f=CFG_OAI_SET_FIELD, type="e"))
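
A trivial usage sketch, under the same assumption of a configured Invenio instance:

# Number of records carrying at least one OAI setSpec value, i.e. the
# size of the OAI-PMH repository.
write_message("OAI repository size: %d records" % repository_size())
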
Example #8
def print_repository_status(local_write_message=write_message,
                            verbose=0):
    """
    Prints the repository status to the standard output.

    Parameters:

      local_write_message - *function* the function used to write the output

            verbose - *int* the verbosity of the output
                       - 0: print repository size
                       - 1: print quick status of each set (numbers
                         can be wrong if the repository is in some
                         inconsistent state, i.e. a record is in an
                         OAI setSpec but has no OAI ID)
                       - 2: print detailed status of repository, with
                         number of records that need to be
                         synchronized according to the sets
                         definitions. Precise, but ~slow...
    """
    repository_size_s = "%d" % repository_size()
    repository_recids_after_update = intbitset()

    local_write_message(CFG_SITE_NAME)
    local_write_message(" OAI Repository Status")

    set_spec_max_length = 19 # How many max char do we display for
    set_name_max_length = 20 # setName and setSpec?

    if verbose == 0:
        # Just print repository size
        local_write_message("  Total(**)" + " " * 29 +
                      " " * (9 - len(repository_size_s)) + repository_size_s)
        return
    elif verbose == 1:
        # We display less information: show longer set names and specs
        set_spec_max_length = 30
        set_name_max_length = 30

    local_write_message("=" * 80)
    header = "  setSpec" + " " * (set_spec_max_length - 7) + \
             "  setName" + " " * (set_name_max_length - 5) + " Volume"
    if verbose > 1:
        header += " " * 5 + "After update(*):"
    local_write_message(header)

    if verbose > 1:
        local_write_message(" " * 57 + "Additions  Deletions")

    local_write_message("-" * 80)

    for set_spec in all_set_specs():

        if verbose <= 1:
            # Get the records that are in this set. This is an
            # incomplete check, as it can happen that some records are
            # in this set (according to the metadata) but have no OAI
            # ID (so they are not exported). This can happen if the
            # repository has some records coming from external
            # sources, or if it has never been synchronized with this
            # tool.
            current_recids = get_recids_for_set_spec(set_spec)
            nb_current_recids = len(current_recids)
        else:
            # Get the records that are *currently* exported for this
            # setSpec
            current_recids = search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e')
            nb_current_recids = len(current_recids)
            # Get the records that *should* be in this set according to
            # the admin defined settings, and compute how many should be
            # added or removed
            should_recids = get_recids_for_set_spec(set_spec)
            repository_recids_after_update |= should_recids

            nb_add_recids = len(should_recids - current_recids)
            nb_remove_recids = len(current_recids - should_recids)
            nb_should_recids = len(should_recids)


        # Adapt setName and setSpec strings lengths
        set_spec_str = set_spec
        if len(set_spec_str) > set_spec_max_length :
            set_spec_str = "%s.." % set_spec_str[:set_spec_max_length]
        set_name_str = get_set_name_for_set_spec(set_spec)
        if len(set_name_str) > set_name_max_length :
            set_name_str = "%s.." % set_name_str[:set_name_max_length]

        row = "  " + set_spec_str + \
               " " * ((set_spec_max_length + 2) - len(set_spec_str)) + set_name_str + \
               " " * ((set_name_max_length + 2) - len(set_name_str)) + \
               " " * (7 - len(str(nb_current_recids))) + str(nb_current_recids)
        if verbose > 1:
            row += \
                " " * max(9 - len(str(nb_add_recids)), 0) + '+' + str(nb_add_recids) + \
                " " * max(7 - len(str(nb_remove_recids)), 0) + '-' + str(nb_remove_recids) + " = " +\
                " " * max(7 - len(str(nb_should_recids)), 0) + str(nb_should_recids)
        local_write_message(row)

    local_write_message("=" * 80)
    footer = "  Total(**)" + " " * (set_spec_max_length + set_name_max_length - 7) + \
             " " * (9 - len(repository_size_s)) + repository_size_s
    if verbose > 1:
        footer += ' ' * (28 - len(str(len(repository_recids_after_update)))) + str(len(repository_recids_after_update))
    local_write_message(footer)

    if verbose > 1:
        local_write_message('  *The "after update" columns show the repository after you run this tool.')
    else:
        local_write_message(' *"Volume" is indicative if repository is out of sync. Use --detailed-report.')
    local_write_message('**The "total" is not the sum of the above numbers, but the union of the records.')
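
A usage sketch showing the local_write_message hook; it assumes the same Invenio context as above, and the list-collecting writer is just an illustration (any callable taking one string works):

# Collect the detailed report into a list instead of sending each line
# through write_message.
report_lines = []
print_repository_status(local_write_message=report_lines.append, verbose=2)
print "\n".join(report_lines)
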
def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*',
                                              f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*',
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)),
                  verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
        write_message(
            "%s recids should be in %s. Currently %s are in %s" %
            (len(should_recids), set_spec, len(current_recids), set_spec),
            verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" %
                      (len(to_add), set_spec),
                      verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec),
                      verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec),
                      verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                  prefix='oairepository_' + \
                                  time.strftime("%Y%m%d_%H%M%S_",
                                                time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)),
                      verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message(
            "Record %s currently doesn't belong anymore to these oai_sets: %s"
            % (recid, ", ".join(current_previous_oai_sets)),
            verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set
                               for _set, _recids in recids_for_set.iteritems()
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)),
                      verbose=3)

        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets)
            | (current_oai_sets - updated_oai_sets))
        write_message(
            "Record %s now doesn't belong anymore to these oai_sets: %s" %
            (recid, ", ".join(updated_previous_oai_sets)),
            verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change. Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" %
                          recid,
                          verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" %
                      recid,
                      verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record,
                         tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename, '-n')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                        prefix='oairepository_' + \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                        time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if not no_upload:
        task_sleep_now_if_required(can_stop_too=True)
        if tot > 0:
            task_low_level_submission('bibupload', 'oairepository', '-c',
                                      filename, '-n')
        else:
            os.remove(filename)

    return True
def repository_size():
    """Read repository size"""
    return len(search_unit_in_bibxxx(p="*", f=CFG_OAI_SET_FIELD, type="e"))
def print_repository_status(local_write_message=write_message, verbose=0):
    """
    Prints the repository status to the standard output.

    Parameters:

      local_write_message - *function* the function used to write the output

            verbose - *int* the verbosity of the output
                       - 0: print repository size
                       - 1: print quick status of each set (numbers
                         can be wrong if the repository is in some
                         inconsistent state, i.e. a record is in an
                         OAI setSpec but has no OAI ID)
                       - 2: print detailed status of repository, with
                         number of records that need to be
                         synchronized according to the sets
                         definitions. Precise, but ~slow...
    """
    repository_size_s = "%d" % repository_size()
    repository_recids_after_update = intbitset()

    local_write_message(CFG_SITE_NAME)
    local_write_message(" OAI Repository Status")

    set_spec_max_length = 19  # How many max char do we display for
    set_name_max_length = 20  # setName and setSpec?

    if verbose == 0:
        # Just print repository size
        local_write_message("  Total(**)" + " " * 29 + " " *
                            (9 - len(repository_size_s)) + repository_size_s)
        return
    elif verbose == 1:
        # We display less information: show longer set names and specs
        set_spec_max_length = 30
        set_name_max_length = 30

    local_write_message("=" * 80)
    header = "  setSpec" + " " * (set_spec_max_length - 7) + \
             "  setName" + " " * (set_name_max_length - 5) + " Volume"
    if verbose > 1:
        header += " " * 5 + "After update(*):"
    local_write_message(header)

    if verbose > 1:
        local_write_message(" " * 57 + "Additions  Deletions")

    local_write_message("-" * 80)

    for set_spec in all_set_specs():

        if verbose <= 1:
            # Get the records that are in this set. This is an
            # incomplete check, as it can happen that some records are
            # in this set (according to the metadata) but have no OAI
            # ID (so they are not exported). This can happen if the
            # repository has some records coming from external
            # sources, or if it has never been synchronized with this
            # tool.
            current_recids = get_recids_for_set_spec(set_spec)
            nb_current_recids = len(current_recids)
        else:
            # Get the records that are *currently* exported for this
            # setSpec
            current_recids = search_unit_in_bibxxx(p=set_spec,
                                                   f=CFG_OAI_SET_FIELD,
                                                   type='e')
            nb_current_recids = len(current_recids)
            # Get the records that *should* be in this set according to
            # the admin defined settings, and compute how many should be
            # added or removed
            should_recids = get_recids_for_set_spec(set_spec)
            repository_recids_after_update |= should_recids

            nb_add_recids = len(should_recids - current_recids)
            nb_remove_recids = len(current_recids - should_recids)
            nb_should_recids = len(should_recids)

        # Adapt setName and setSpec strings lengths
        set_spec_str = set_spec
        if len(set_spec_str) > set_spec_max_length:
            set_spec_str = "%s.." % set_spec_str[:set_spec_max_length]
        set_name_str = get_set_name_for_set_spec(set_spec)
        if len(set_name_str) > set_name_max_length:
            set_name_str = "%s.." % set_name_str[:set_name_max_length]

        row = "  " + set_spec_str + \
               " " * ((set_spec_max_length + 2) - len(set_spec_str)) + set_name_str + \
               " " * ((set_name_max_length + 2) - len(set_name_str)) + \
               " " * (7 - len(str(nb_current_recids))) + str(nb_current_recids)
        if verbose > 1:
            row += \
                " " * max(9 - len(str(nb_add_recids)), 0) + '+' + str(nb_add_recids) + \
                " " * max(7 - len(str(nb_remove_recids)), 0) + '-' + str(nb_remove_recids) + " = " +\
                " " * max(7 - len(str(nb_should_recids)), 0) + str(nb_should_recids)
        local_write_message(row)

    local_write_message("=" * 80)
    footer = "  Total(**)" + " " * (set_spec_max_length + set_name_max_length - 7) + \
             " " * (9 - len(repository_size_s)) + repository_size_s
    if verbose > 1:
        footer += ' ' * (28 -
                         len(str(len(repository_recids_after_update)))) + str(
                             len(repository_recids_after_update))
    local_write_message(footer)

    if verbose > 1:
        local_write_message(
            '  *The "after update" columns show the repository after you run this tool.'
        )
    else:
        local_write_message(
            ' *"Volume" is indicative if repository is out of sync. Use --detailed-report.'
        )
    local_write_message(
        '**The "total" is not the sum of the above numbers, but the union of the records.'
    )