def _detect_collections_from_marcxml_file(recs):
    """
    Extract all possible recIDs from MARCXML file and guess collections
    for these recIDs.

    :param recs: iterable of (record, dummy, dummy) triples; only the
        record structure is inspected.
    :return: the guessed collection names (keys of a dict used as a set,
        so the order is unspecified).
    """
    from invenio.legacy.bibrecord import record_get_field_values
    from invenio.legacy.search_engine import guess_collection_of_a_record
    from invenio.legacy.bibupload.engine import find_record_from_sysno, \
                                                find_records_from_extoaiid, \
                                                find_record_from_oaiid
    # Dict used as a set of collection names.
    dbcollids = {}
    # These tag codes are 6-char strings: tag (3 chars) + ind1 + ind2 + code.
    sysno_tag = CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG
    oaiid_tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG
    oai_tag = CFG_OAI_ID_FIELD
    for rec, dummy1, dummy2 in recs:
        if rec:
            # 1) Direct record IDs from controlfield 001.
            for tag001 in record_get_field_values(rec, '001'):
                collection = guess_collection_of_a_record(int(tag001))
                dbcollids[collection] = 1
            # 2) Records matched via their external SYSNO.
            for tag_sysno in record_get_field_values(rec, tag=sysno_tag[:3],
                                                     ind1=sysno_tag[3],
                                                     ind2=sysno_tag[4],
                                                     code=sysno_tag[5]):
                record = find_record_from_sysno(tag_sysno)
                if record:
                    collection = guess_collection_of_a_record(int(record))
                    dbcollids[collection] = 1
            # 3) Records matched via their external OAI ID.
            for tag_oaiid in record_get_field_values(rec, tag=oaiid_tag[:3],
                                                     ind1=oaiid_tag[3],
                                                     ind2=oaiid_tag[4],
                                                     code=oaiid_tag[5]):
                try:
                    records = find_records_from_extoaiid(tag_oaiid)
                except Error:
                    # NOTE(review): 'Error' must come from a module-level
                    # import (presumably the legacy DB layer) -- confirm.
                    records = []
                if records:
                    record = records.pop()
                    collection = guess_collection_of_a_record(int(record))
                    dbcollids[collection] = 1
            # 4) Records matched via their own OAI ID.
            for tag_oai in record_get_field_values(rec, tag=oai_tag[0:3],
                                                   ind1=oai_tag[3],
                                                   ind2=oai_tag[4],
                                                   code=oai_tag[5]):
                record = find_record_from_oaiid(tag_oai)
                if record:
                    collection = guess_collection_of_a_record(int(record))
                    dbcollids[collection] = 1
    return dbcollids.keys()
def test_compare_field_values_with_bibrecord_values(self):
    """bibfield - same value as in bibrecord"""
    from invenio.legacy.bibrecord import record_get_field_values
    from invenio.legacy.search_engine import get_record as search_engine_get_record
    # Fetch record #1 through both access layers and compare the title.
    bibfield_record = get_record(1)
    marc_record = search_engine_get_record(1)
    bibrecord_value = record_get_field_values(marc_record,
                                              '245', ' ', ' ', 'a')[0]
    self.assertEqual(bibrecord_value, bibfield_record['title.title'])
def _get_minimal_arxiv_id(record, tag_code):
    """
    Return the OAI arXiv id found in the given record, skipping prefixes.

    I.e. oai:arxiv.org:1234.1234 becomes 1234.1234 and
    oai:arxiv.org:hep-ex/2134123 becomes hep-ex/2134123. Used for searching.
    Returns None when no matching value is present.
    """
    for candidate in record_get_field_values(record, **split_tag_code(tag_code)):
        if 'arXiv' in candidate:
            # Keep only the part after the last colon.
            return candidate.split(':')[-1]
def retrieve_field_values(curdir, field_name, separator=None,
                          system_number_file='SN', tag=None):
    """
    Retrieve values either from the current submission directory, when a
    form has been just submitted, or from an existing record (e.g. during
    MBI action).

    @param curdir: is the current submission directory.
    @type curdir: string
    @param field_name: is the form field name that might exists on disk.
    @type field_name: string
    @param separator: is an optional separator. If it exists, it will be used
        to retrieve multiple values contained in the field.
    @type separator: string
    @param system_number_file: is the name of the file on disk in curdir, that
        is supposed to contain the record id.
    @type system_number_file: string
    @param tag: is the full MARC tag (tag+ind1+ind2+code) that should contain
        values. If not specified, only values in curdir will be retrieved.
    @type tag: 6-chars
    @return: the field value(s).
    @rtype: list of strings.
    @note: if field_name exists in curdir it will take precedence over
        retrieving the values from the record.
    """
    field_file = os.path.join(curdir, field_name)
    if os.path.exists(field_file):
        # Fix: use a context manager so the file handle is not leaked
        # (the original called open().read() without ever closing).
        with open(field_file) as fobj:
            field_value = fobj.read()
        if separator is not None:
            return [value.strip() for value in field_value.split(separator)
                    if value.strip()]
        else:
            return [field_value.strip()]
    elif tag is not None:
        system_number_file = os.path.join(curdir, system_number_file)
        if os.path.exists(system_number_file):
            # Same leak fix for the record-id file.
            with open(system_number_file) as fobj:
                recid = int(fobj.read().strip())
            record = get_record(recid)
            if separator:
                return record_get_field_values(record, tag[:3], tag[3],
                                               tag[4], tag[5])
            else:
                return [record_get_field_value(record, tag[:3], tag[3],
                                               tag[4], tag[5])]
    return []
def get_record_collections(recid=0, recstruct=None):
    """
    Return all collections of a record (MARC field 980__a).

    @param recid: record id to get collections from; used only when
        recstruct is not supplied
    @type recid: int
    @param recstruct: optional pre-fetched record structure
    @return: list of collections
    @rtype: list
    """
    if not recstruct:
        recstruct = get_record(recid)
    # record_get_field_values already returns a list; the original wrapped
    # it in a redundant identity comprehension. list() keeps the copy
    # semantics so callers may safely mutate the result.
    return list(record_get_field_values(recstruct, tag="980",
                                        ind1=" ", ind2=" ", code="a"))
def get_sysno_from_record(record, options):
    """Function to get the system number for a record.
    In the case of a pure text MARC record being created, the sysno
    will be retrieved from 001 (i.e. the 'recid' will be returned).
    In the case of an Aleph MARC record being created, the sysno will
    be retrieved from 970__a IF this field exists. If not, None will
    be returned.
    @param record: the internal representation of the record (created
    by bibrecord) from which the sysno is to be retrieved.
    @param options: various options about the record to be created, as
    obtained from the command line.
    @return: a string containing a 9-digit SYSNO, -OR- None in certain
    cases for an Aleph MARC record.
    """
    if options["text-marc"] != 0:
        recid_values = record_get_field_values(rec=record, tag="001")
        # Exactly one 001 value is required; zero or several is illegal.
        if len(recid_values) != 1:
            return None
        # Zero-pad the recid to 9 digits.
        return recid_values[0].rjust(9, "0")
    sys_values = record_get_field_values(rec=record, tag="970", code="a")
    if len(sys_values) > 1:
        ## multiple SYS is illegal - return a list of them all,
        ## let other functions decide what to do
        return sys_values
    if not sys_values:
        ## no SYS
        return None
    ## get SYS (first 9 characters only)
    return sys_values[0][0:9]
def get_sysno_from_record(record, options):
    """Function to get the system number for a record.
    In the case of a pure text MARC record being created, the sysno
    will be retrieved from 001 (i.e. the 'recid' will be returned).
    In the case of an Aleph MARC record being created, the sysno will
    be retrieved from 970__a IF this field exists. If not, None will
    be returned.
    @param record: the internal representation of the record (created
    by bibrecord) from which the sysno is to be retrieved.
    @param options: various options about the record to be created, as
    obtained from the command line.
    @return: a string containing a 9-digit SYSNO, -OR- None in certain
    cases for an Aleph MARC record.
    """
    if options["text-marc"] == 0:
        ## Aleph MARC mode: take the SYS from 970__a.
        candidates = record_get_field_values(rec=record, tag="970", code="a")
        if len(candidates) > 1:
            ## multiple SYS is illegal - return a list of them all,
            ## let other functions decide what to do
            return candidates
        elif len(candidates) == 0:
            ## no SYS
            sysno = None
        else:
            ## get SYS, truncated to 9 characters
            sysno = candidates[0][0:9]
    else:
        ## Text MARC mode: take the recid from 001.
        candidates = record_get_field_values(rec=record, tag="001")
        if len(candidates) != 1:
            ## zero or multiple values for recid is illegal!
            sysno = None
        else:
            sysno = candidates[0]
            ## zero-pad to 9 digits:
            while len(sysno) < 9:
                sysno = "0" + sysno
    return sysno
def get_record_collections(recid=0, recstruct=None):
    """
    Return all collections of a record (MARC field 980__a).

    @param recid: record id to get collections from
    @type: string
    @return: list of collections
    @rtype: list
    """
    # Fall back to loading the record when no structure was supplied.
    rec = recstruct if recstruct else get_record(recid)
    return [coll
            for coll in record_get_field_values(rec, tag="980",
                                                ind1=" ", ind2=" ",
                                                code="a")]
def record_is_conference(record):
    """
    Determine if the record is a new conference based on the value
    present on field 980.

    @param record: record to be checked
    @type record: bibrecord object
    @return: True if record is a conference, False otherwise
    @rtype: boolean
    """
    # Collection values live in 980__a; membership decides the answer.
    collection_values = record_get_field_values(record, "980", " ", " ", "a")
    return "CONFERENCES" in collection_values
def user_can_edit_record_collection(req, recid):
    """Check if user has authorization to modify a collection
    the recid belongs to.

    :param req: the request object carrying the user session
    :param recid: id of the record whose collections are checked
    :return: True as soon as the user is authorized for one of the
        record's (normalized) collections, False otherwise
    """

    def remove_volatile(field_value):
        """ Remove volatile keyword from field value """
        if field_value.startswith(VOLATILE_PREFIX):
            field_value = field_value[len(VOLATILE_PREFIX) :]
        return field_value

    # Get the collections the record belongs to
    record_collections = get_all_collections_of_a_record(recid)
    user_info = collect_user_info(req)
    uid = user_info["uid"]
    # In case we are creating a new record: the record only exists in the
    # editor cache, so pick up its 980__a values from there as well.
    if cache_exists(recid, uid):
        record = get_cache_contents(recid, uid)[2]
        values = record_get_field_values(record, "980", code="a")
        record_collections.extend([remove_volatile(v) for v in values])
    normalized_collections = []
    for collection in record_collections:
        # Get the normalized collection name present in the action table
        res = run_sql(
            """SELECT value FROM "accARGUMENT" WHERE keyword='collection' AND value=%s;""",
            (collection,),
        )
        if res:
            normalized_collections.append(res[0][0])
    if not normalized_collections:
        # Check if user has access to all collections (empty collection name)
        auth_code, dummy_message = acc_authorize_action(req, "runbibedit", collection="")
        if auth_code == 0:
            return True
    else:
        # Authorization for any single collection is sufficient.
        for collection in normalized_collections:
            auth_code, dummy_message = acc_authorize_action(req, "runbibedit", collection=collection)
            if auth_code == 0:
                return True
    return False
def user_can_edit_record_collection(req, recid):
    """
    Check whether the user behind ``req`` is authorized to modify at
    least one of the collections the record ``recid`` belongs to.
    """

    def _strip_volatile(value):
        """Drop the volatile prefix from a field value, if present."""
        if value.startswith(VOLATILE_PREFIX):
            return value[len(VOLATILE_PREFIX):]
        return value

    # Collections the record already belongs to.
    collections = get_all_collections_of_a_record(recid)
    uid = collect_user_info(req)["uid"]
    # A record being created lives only in the editor cache: pick up its
    # 980__a values from there as well.
    if cache_exists(recid, uid):
        cached_record = get_cache_contents(recid, uid)[2]
        for value in record_get_field_values(cached_record, '980', code="a"):
            collections.append(_strip_volatile(value))
    normalized = []
    for name in collections:
        # Keep only the collection names known to the action table.
        rows = run_sql(
            """SELECT value FROM "accARGUMENT" WHERE keyword='collection' AND value=%s;""",
            (name, ))
        if rows:
            normalized.append(rows[0][0])
    if not normalized:
        # No known collection: check access against the empty collection.
        auth_code, dummy = acc_authorize_action(req, 'runbibedit',
                                                collection='')
        return auth_code == 0
    for name in normalized:
        auth_code, dummy = acc_authorize_action(req, 'runbibedit',
                                                collection=name)
        if auth_code == 0:
            return True
    return False
def retrieve_field_values(curdir, field_name, separator=None,
                          system_number_file='SN', tag=None):
    """
    This is a handy function to retrieve values either from the current
    submission directory, when a form has been just submitted, or from
    an existing record (e.g. during MBI action).

    @param curdir: is the current submission directory.
    @type curdir: string
    @param field_name: is the form field name that might exists on disk.
    @type field_name: string
    @param separator: is an optional separator. If it exists, it will be used
        to retrieve multiple values contained in the field.
    @type separator: string
    @param system_number_file: is the name of the file on disk in curdir, that
        is supposed to contain the record id.
    @type system_number_file: string
    @param tag: is the full MARC tag (tag+ind1+ind2+code) that should contain
        values. If not specified, only values in curdir will be retrieved.
    @type tag: 6-chars
    @return: the field value(s).
    @rtype: list of strings.
    @note: if field_name exists in curdir it will take precedence over
        retrieving the values from the record.
    """
    field_file = os.path.join(curdir, field_name)
    if os.path.exists(field_file):
        # Fix: close the file deterministically instead of leaking the
        # handle returned by the original bare open().read().
        with open(field_file) as handle:
            field_value = handle.read()
        if separator is not None:
            return [value.strip()
                    for value in field_value.split(separator)
                    if value.strip()]
        else:
            return [field_value.strip()]
    elif tag is not None:
        system_number_file = os.path.join(curdir, system_number_file)
        if os.path.exists(system_number_file):
            # Fix: same leak for the sysno file.
            with open(system_number_file) as handle:
                recid = int(handle.read().strip())
            record = get_record(recid)
            if separator:
                return record_get_field_values(record, tag[:3], tag[3],
                                               tag[4], tag[5])
            else:
                return [record_get_field_value(record, tag[:3], tag[3],
                                               tag[4], tag[5])]
    return []
def _detect_980_values_from_marcxml_file(recs):
    """
    Read MARCXML file and return list of 980 $a values found in that
    file.  Useful for checking rights.
    """
    from invenio.legacy.bibrecord import record_get_field_values
    # Resolve the configured 'collection' tag code (tag+ind1+ind2+code).
    collection_tag = run_sql("SELECT value FROM tag, field_tag, field \
                              WHERE tag.id=field_tag.id_tag AND \
                              field_tag.id_field=field.id AND \
                              field.code='collection'")
    collection_tag = collection_tag[0][0]
    # Dict used as a set of the distinct 980 $a values seen.
    seen_values = {}
    for record, _dummy1, _dummy2 in recs:
        if record:
            values = record_get_field_values(record,
                                             tag=collection_tag[0:3],
                                             ind1=collection_tag[3],
                                             ind2=collection_tag[4],
                                             code=collection_tag[5])
            for value in values:
                seen_values[value] = 1
    return seen_values.keys()
def fields(self, tag, escape=0, repeatable_subfields_p=False):
    """
    Returns the list of values corresponding to "tag".

    If tag has an undefined subcode (such as 999C5), the function
    returns a list of dictionaries, whose keys are the subcodes and
    the values are the values of tag.subcode. If the tag has a
    subcode, simply returns list of values corresponding to tag.

    Eg. for given MARC::

        999C5 $a value_1a $b value_1b
        999C5 $b value_2b
        999C5 $b value_3b $b value_3b_bis

        >>> bfo.fields('999C5b')
        >>> ['value_1b', 'value_2b', 'value_3b', 'value_3b_bis']
        >>> bfo.fields('999C5')
        >>> [{'a':'value_1a', 'b':'value_1b'},
             {'b':'value_2b'},
             {'b':'value_3b'}]

    By default the function returns only one value for each subfield
    (that is it considers that repeatable subfields are not allowed).
    It is why in the above example 'value3b_bis' is not shown for
    bfo.fields('999C5').  (Note that it is not defined which of
    value_3b or value_3b_bis is returned).  This is to simplify the
    use of the function, as most of the time subfields are not
    repeatable (in that way we get a string instead of a list).
    You can allow repeatable subfields by setting
    'repeatable_subfields_p' parameter to True. In this mode, the
    above example would return:

        >>> bfo.fields('999C5b', repeatable_subfields_p=True)
        >>> ['value_1b', 'value_2b', 'value_3b']
        >>> bfo.fields('999C5', repeatable_subfields_p=True)
        >>> [{'a':['value_1a'], 'b':['value_1b']},
             {'b':['value_2b']},
             {'b':['value_3b', 'value3b_bis']}]

    NOTICE THAT THE RETURNED STRUCTURE IS DIFFERENT.  Also note that
    whatever the value of 'repeatable_subfields_p' is,
    bfo.fields('999C5b') always show all fields, even repeatable ones.
    This is because the parameter has no impact on the returned
    structure (it is always a list).

    'escape' parameter allows to escape special characters of the
    fields. The value of escape can be:

        0. No escaping
        1. Escape all HTML characters
        2. Remove unsafe HTML tags (Eg. keep <br />)
        3. Mix of mode 1 and 2. If value of field starts with
           <!-- HTML -->, then use mode 2. Else use mode 1.
        4. Remove all HTML tags
        5. Same as 2, with more tags allowed (like <img>)
        6. Same as 3, with more tags allowed (like <img>)
        7. Mix of mode 0 and mode 1. If field_value starts
           with <!--HTML-->, then use mode 0. Else use mode 1.
        8. Same as mode 1, but also escape double-quotes
        9. Same as mode 4, but also escape double-quotes

    :param tag: the marc code of a field
    :param escape: 1 if returned values should be escaped. Else 0.
    @repeatable_subfields_p if True, returns the list of subfields in the
                            dictionary
    @return: values of field tag in record
    """
    if self.get_record() is None:
        # Case where BibRecord could not parse object
        return []

    p_tag = parse_tag(tag)
    if p_tag[3] != "":
        # Subcode has been defined. Simply returns list of values
        values = record_get_field_values(self.get_record(),
                                         p_tag[0],
                                         p_tag[1],
                                         p_tag[2],
                                         p_tag[3])
        if escape == 0:
            return values
        else:
            return [escape_field(value, escape) for value in values]
    else:
        # Subcode is undefined. Returns list of dicts.
        # However it might be the case of a control field.
        instances = record_get_field_instances(self.get_record(),
                                               p_tag[0],
                                               p_tag[1],
                                               p_tag[2])
        if repeatable_subfields_p:
            # Collect every occurrence of each subcode into a list.
            list_of_instances = []
            for instance in instances:
                instance_dict = {}
                for subfield in instance[0]:
                    if subfield[0] not in instance_dict:
                        instance_dict[subfield[0]] = []
                    if escape == 0:
                        instance_dict[subfield[0]].append(subfield[1])
                    else:
                        instance_dict[subfield[0]].append(
                            escape_field(subfield[1], escape))
                list_of_instances.append(instance_dict)
            return list_of_instances
        else:
            # dict() keeps only one value per subcode (last one wins).
            if escape == 0:
                return [dict(instance[0]) for instance in instances]
            else:
                return [dict([(subfield[0], escape_field(subfield[1], escape))
                              for subfield in instance[0]])
                        for instance in instances]
def generate_ticket(ticket, record):
    """
    Generates a ticket to be created, filling subject, body and queue values
    of the passed BibCatalogTicket object. The enriched object is returned.

    @param ticket: a ticket object as created by BibCatalogTicket() containing
                   the subject, body and queue to create a ticket in.
    @type ticket: record object of BibCatalogTicket.
    @param record: a recstruct object as created by bibrecord.create_record()
    @type record: record object of BibRecord.

    @return: the modified ticket object to create.
    @rtype: BibCatalogTicket
    """
    # Resolve MARC tag codes from their configured logical names, with
    # fallbacks when the preferred name is not configured.
    title_code = load_tag_code_from_name("title")
    abstract_code = load_tag_code_from_name("abstract")
    try:
        date_code = load_tag_code_from_name("date")
    except BibCatalogTagNotFound:
        date_code = load_tag_code_from_name("year")
    category_code = load_tag_code_from_name("subject")
    try:
        notes_code = load_tag_code_from_name("note")
    except BibCatalogTagNotFound:
        notes_code = load_tag_code_from_name("comment")
    first_author_code = load_tag_code_from_name("first author name")
    additional_author_code = load_tag_code_from_name("additional author name")
    try:
        external_id_code = load_tag_code_from_name("ext system ID")
    except BibCatalogTagNotFound:
        external_id_code = load_tag_code_from_name("primary report number")

    # List of extra info to print in the ticket.
    extra_info = []
    recid = record_id_from_record(record)
    arxiv_id = _get_minimal_arxiv_id(record, external_id_code)
    if arxiv_id:
        # We have an arxiv id - we can add special info:
        extra_info.append("ABSTRACT: http://arxiv.org/abs/%s" % (arxiv_id, ))
        extra_info.append("PDF: http://arxiv.org/pdf/%s" % (arxiv_id, ))
        # Keep only values whose provenance ($2 / $9) is arXiv.
        categories = record_get_value_with_provenence(
            record=record,
            provenence_code="2",
            provenence_value="arXiv",
            **split_tag_code(category_code))
        comments = record_get_value_with_provenence(
            record=record,
            provenence_code="9",
            provenence_value="arXiv",
            **split_tag_code(notes_code))
        external_ids = arxiv_id
        subject = "ARXIV:" + arxiv_id
    else:
        # Not an arxiv record - Lets get generic info
        categories = record_get_value_with_provenence(
            record=record,
            provenence_code="2",
            provenence_value="SzGeCERN",
            **split_tag_code(category_code))
        comments = record_get_field_values(rec=record,
                                           **split_tag_code(notes_code))
        external_id_list = record_get_field_values(
            rec=record, **split_tag_code(external_id_code))
        external_ids = ", ".join(external_id_list)
        subject = "Record #%s %s" % (recid, external_ids)

    # First author followed by additional authors; only the first 10 are
    # shown in the ticket body.
    authors = record_get_field_values(record, **split_tag_code(first_author_code)) + \
        record_get_field_values(record, **split_tag_code(additional_author_code))
    text = """
%(submitdate)s
External IDs: %(external_ids)s
Title: %(title)s
Authors: %(authors)s
Categories: %(categories)s
Comments: %(comments)s
%(abstract)s
%(extra_info)s
Edit the record now: %(editurl)s
""" \
    % {
        'external_ids': external_ids,
        'submitdate': record_get_field_value(record, **split_tag_code(date_code)),
        'extra_info': "\n".join(extra_info),
        'title': record_get_field_value(record, **split_tag_code(title_code)),
        'comments': "; ".join(comments),
        'categories': " ".join(categories),
        'authors': " / ".join(authors[:10]),
        'abstract': record_get_field_value(record, **split_tag_code(abstract_code)),
        'editurl': "%s/record/edit/%s" % (CFG_SITE_URL, recid),
    }
    # To avoid errors with string formatting later, we are escaping %'s
    ticket.subject = subject
    ticket.body = text.replace('%', '%%')
    # NOTE(review): queue is hard-coded to "Test" -- confirm this is the
    # intended production queue name.
    ticket.queue = "Test"
    return ticket
def oairepositoryupdater_task():
    """Main business logic code of oai_archive.

    Recomputes the OAI set membership of every affected record and
    queues the resulting MARCXML for bibupload, rotating the output
    file every CFG_OAI_REPOSITORY_MARCXML_SIZE records.
    """
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        # Report-only mode: print the status and stop.
        print_repository_status(verbose=report)
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD,
                                               type='e')
    # Start from all currently-exported recids; every recid that should
    # stay in some set is subtracted below.
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD, type='e')
        write_message("%s recids should be in %s. Currently %s are in %s" %
                      (len(should_recids), set_spec, len(current_recids),
                       set_spec), verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" %
                      (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)
    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                             prefix='oairepository_' + \
                             time.strftime("%Y%m%d_%H%M%S_", time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")
    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(record_get_field_values(record,
                               tag=CFG_OAI_SET_FIELD[:3],
                               ind1=CFG_OAI_SET_FIELD[3],
                               ind2=CFG_OAI_SET_FIELD[4],
                               code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(record_get_field_values(record,
                                        tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                        ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                        ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                        code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message("Record %s currently doesn't belong anymore to these "
                      "oai_sets: %s" %
                      (recid, ", ".join(current_previous_oai_sets)), verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set for _set, _recids in
                               iteritems(recids_for_set)
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)), verbose=3)

        updated_previous_oai_sets = set(_set for _set in
                                        (current_previous_oai_sets -
                                         updated_oai_sets) |
                                        (current_oai_sets -
                                         updated_oai_sets))
        write_message("Record %s now doesn't belong anymore to these "
                      "oai_sets: %s" %
                      (recid, ", ".join(updated_previous_oai_sets)), verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change . Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" %
                          recid, verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" %
                      recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001",
                         controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            # Batch full: flush this file, queue it for upload, and
            # rotate to a fresh tmp file for the next batch.
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename, '-n')
                else:
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename)
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                     prefix='oairepository_' + \
                                     time.strftime("%Y%m%d_%H%M%S_",
                                                   time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename)
    else:
        # Last batch was empty: drop the unused tmp file.
        os.remove(filename)

    return True
def create_marc_record(record, sysno, options):
    """Create a text-marc, or aleph-marc record from the contents
    of "record", and return it as a string.

    @param record: Internal representation of an XML MARC record,
    created by bibrecord.
    @param sysno: the system number to be used for the record
    @param options: the options about the MARC record to be created, as
    passed from command line
    @return: string (MARC record, either text-marc or ALEPH marc
    format, depending upon "options".
    """
    out = ""  ## String containing record to be printed
    display_001 = 0  ## Flag used in ALEPH MARC mode to determine whether
                     ## or not to print the "001" field

    ## Get a dictionary containing the names of fields to change for
    ## the output record:
    if options["aleph-marc"] == 1:
        fieldname_changes = get_fieldname_changes()
    else:
        fieldname_changes = {}

    if options["aleph-marc"] == 1:
        ## Perform some ALEPH-MARC specific tasks:
        ## Assume that we will NOT display "001":
        display_001 = 0
        ## Add ALEPH record headers to the output record:
        if 1 not in (options["correct-mode"], options["append-mode"]):
            ## This is not an ALEPH "correct" or "append" record. The
            ## record must therefore have FMT and LDR fields. E.g.:
            ## 123456789 FMT   L BK
            ## 123456789 LDR   L ^^^^^nam^^22^^^^^^a^4500
            out += """%(sys)s%(fmt)s
%(sys)s%(ldr)s\n""" % {'sys': sysno,
                       'fmt': get_aleph_FMT(),
                       'ldr': get_aleph_LDR()}

        if options["delete-mode"] == 1:
            ## This is an ALEPH 'delete' record. Add the DEL field
            ## then return the 'completed' record (in delete mode,
            ## the record only needs the leaders, and a 'DEL' field, e.g.:
            ## 123456789 FMT   L BK
            ## 123456789 LDR   L ^^^^^nam^^22^^^^^^a^4500
            ## 123456789 DEL   L $$aY
            out += """%(sys)s%(del)s\n""" % {'sys': sysno,
                                             'del': get_aleph_DEL()}
            return out
        elif 1 in (options["insert-mode"], options["replace-mode"]):
            ## Either an ALEPH 'insert' or 'replace' record is being
            ## created.  It needs to have 008 and OWN fields. E.g.:
            ## 123456789 008   L ^^^^^^s^^^^^^^^^^^^^^^^r^^^^^000^0^eng^d
            ## 123456789 OWN   L $$aPUBLIC
            out += """%(sys)s%(008)s\n""" % {'sys': sysno,
                                             '008': get_aleph_008()}
            ## The "OWN" field should only be printed at this level if the
            ## MARC XML did not have an OWN (963__a) field:
            if "PUBLIC" not in \
               record_get_field_values(record, "963", code="a"):
                ## Add OWN field:
                out += """%(sys)s%(own)s\n""" % {'sys': sysno,
                                                 'own': get_aleph_OWN()}
            if options["replace-mode"] == 1:
                ## In 'replace' mode, the record should have a 001 field:
                display_001 = 1
        ## Remove fields unwanted in ALEPH MARC:
        for deltag in get_fields_dropped_in_aleph():
            try:
                del record[deltag]
            except KeyError:
                ## tag doesn't exist in record:
                pass

    ## now add 001, since it is a special field:
    if options["text-marc"] == 1:
        try:
            ## get the 001 line(s):
            lines_001 = create_field_lines(fieldname="001", \
                                           field=record["001"][0], \
                                           sysno=sysno, \
                                           alephmarc=options["aleph-marc"])
            ## print the 001 line(s):
            out += print_field(field_lines=lines_001, \
                               alephmarc=options["aleph-marc"])
        except KeyError:
            ## no 001 field
            pass
    elif options["aleph-marc"] == 1:
        ## If desirable, build the "001" line:
        if display_001 == 1:
            try:
                ## make the 001 line(s):
                line_leader = """%(sys)s """ % {'sys': sysno}
                line_leader += """%(fieldname)s L """ % {'fieldname': "001"}
                lines_001 = [[["", line_leader], ["", sysno]]]
                ## print the 001 line(s):
                out += print_field(field_lines=lines_001, \
                                   alephmarc=options["aleph-marc"])
            except KeyError:
                ## no 001 field
                pass
        ## Now, if running in "insert" or "replace" mode, add "003":
        ## 003 is a mandatory field in an ALEPH record. It contains the
        ## identifier for the organization that has generated the SYS (001)
        ## for the record. As such, it is necessary to drop any existing 003
        ## from the record, then add our own 003.
        ## First, drop the "003" field from the record:
        try:
            del record["003"]
        except KeyError:
            ## There was no 003
            pass
        ## Now add a correct 003 (if desirable):
        if 1 in (options["insert-mode"], options["replace-mode"]):
            out += """%(sys)s%(own)s\n""" % {'sys': sysno,
                                             'own': get_aleph_003()}

    ## delete 001 from the list of fields to output (if it exists):
    try:
        del record["001"]
    except KeyError:
        ## There was no 001
        pass

    ## Get the fields of this record, and order them correctly (using the
    ## same order as that of the original MARC XML file):
    fields = []
    tags = record.keys()
    tags.sort()
    for tag in tags:
        for field in record[tag]:
            fields.append((tag, field))

    ## Finally, loop through all fields and display them in the record:
    for field in fields:
        ## Should the field-name be changed?
        try:
            fieldname = fieldname_changes[str(field[0])]
        except KeyError:
            ## Don't change this fieldname:
            fieldname = field[0]
        ## get the subfields, etc, for this field:
        fielddata = field[1]
        ## Create the MARC lines for this field:
        field_lines = create_field_lines(fieldname, \
                                         fielddata, \
                                         sysno, \
                                         options["aleph-marc"])
        ## Now create the formatted MARC lines:
        out += print_field(field_lines, options["aleph-marc"])

    ## Return the formatted MARC record:
    return out
def oairepositoryupdater_task():
    """Main business logic code of oai_archive.

    Recomputes which records should belong to each OAI set, compares with
    the sets currently recorded in the metadata, and writes a MARCXML
    update file (uploaded via a 'bibupload' task unless --no_upload) for
    every record whose OAI id or set membership must change.

    @return: True on normal completion (also when there is nothing to do,
        in report-only mode, or when a previous oairepository upload is
        still pending).
    """
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        # Report-only invocation: print repository status and stop.
        print_repository_status(verbose=report)
        return True

    # Avoid piling up: skip this run if a previous oairepository upload
    # is still waiting in the scheduler queue.
    if run_sql(
        "SELECT id FROM schTASK WHERE proc='bibupload:oairepository' AND status='WAITING'"
    ):
        write_message(
            "Previous requests of oairepository still being elaborated. Let's skip this execution."
        )
        return True

    # Snapshot the set definitions at start (logged for debugging only).
    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD, type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid), verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD, type='e')
    # Start from "everything currently exported"; records claimed by some
    # set below are subtracted, leaving the ones to de-export.
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            # An empty set spec means the global set.
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e')
        write_message("%s recids should be in %s. Currently %s are in %s" %
                      (len(should_recids), set_spec, len(current_recids), set_spec),
                      verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" % (len(to_add), set_spec),
                      verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                             prefix='oairepository_' + \
                             time.strftime("%Y%m%d_%H%M%S_", time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")
    tot = 0

    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))
        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or not.
        # CFG_OAI_ID_FIELD is a 6-character tag spec indexed as
        # tag (0:3) / ind1 (3) / ind2 (4) / subfield code (5).
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message(
            "Record %s currently doesn't belong anymore to these oai_sets: %s" %
            (recid, ", ".join(current_previous_oai_sets)), verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set
                               for _set, _recids in iteritems(recids_for_set)
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)), verbose=3)

        # Sets the record left: previously-left sets plus sets it just
        # dropped out of, minus anything it still belongs to.
        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets) |
            (current_oai_sets - updated_oai_sets))
        write_message(
            "Record %s now doesn't belong anymore to these oai_sets: %s" %
            (recid, ", ".join(updated_previous_oai_sets)), verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change. Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" %
                          recid, verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" %
                      recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        # Build a minimal correcting record: 001 + the combined OAI field.
        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            # Chunk full: close and submit this file, then start a new one.
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename, '-n',
                                              '-Noairepository', '-P', '-1')
                else:
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename,
                                              '-Noairepository', '-P', '-1')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                     prefix='oairepository_' + \
                                     time.strftime("%Y%m%d_%H%M%S_", time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    # Close (and maybe submit) the last, possibly partial, chunk.
    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename)
    else:
        # Last chunk was empty: drop the unused tmp file.
        os.remove(filename)

    return True
def create_marc_record(record, sysno, options): """Create a text-marc, or aleph-marc record from the contents of "record", and return it as a string. @param record: Internal representation of an XML MARC record, created by bibrecord. @param sysno: the system number to be used for the record @param options: the options about the MARC record to be created, as passed from command line @return: string (MARC record, either text-marc or ALEPH marc format, depending upon "options". """ out = "" ## String containing record to be printed display_001 = 0 ## Flag used in ALEPH MARC mode to determine whether ## or not to print the "001" field ## Get a dictionary containing the names of fields to change for ## the output record: if options["aleph-marc"] == 1: fieldname_changes = get_fieldname_changes() else: fieldname_changes = {} if options["aleph-marc"] == 1: ## Perform some ALEPH-MARC specific tasks: ## Assume that we will NOT display "001": display_001 = 0 ## Add ALEPH record headers to the output record: if 1 not in (options["correct-mode"], options["append-mode"]): ## This is not an ALEPH "correct" or "append" record. The ## record must therefore have FMT and LDR fields. E.g.: ## 123456789 FMT L BK ## 123456789 LDR L ^^^^^nam^^22^^^^^^a^4500 out += """%(sys)s%(fmt)s %(sys)s%(ldr)s\n""" % { 'sys' : sysno, 'fmt' : get_aleph_FMT(), 'ldr' : get_aleph_LDR() } if options["delete-mode"] == 1: ## This is an ALEPH 'delete' record. Add the DEL field ## then return the 'completed' record (in delete mode, ## the record only needs the leaders, and a 'DEL' field, e.g.: ## 123456789 FMT L BK ## 123456789 LDR L ^^^^^nam^^22^^^^^^a^4500 ## 123456789 DEL L $$aY out += """%(sys)s%(del)s\n""" % { 'sys' : sysno, 'del' : get_aleph_DEL() } return out elif 1 in (options["insert-mode"], options["replace-mode"]): ## Either an ALEPH 'insert' or 'replace' record is being created. ## It needs to have 008 and OWN fields. 
E.g.: ## 123456789 008 L ^^^^^^s^^^^^^^^^^^^^^^^r^^^^^000^0^eng^d ## 123456789 OWN L $$aPUBLIC out += """%(sys)s%(008)s\n""" % { 'sys' : sysno, '008' : get_aleph_008() } ## The "OWN" field should only be printed at this level if the ## MARC XML did not have an OWN (963__a) field: if "PUBLIC" not in \ record_get_field_values(record, "963", code="a"): ## Add OWN field: out += """%(sys)s%(own)s\n""" % { 'sys' : sysno, 'own' : get_aleph_OWN() } if options["replace-mode"] == 1: ## In 'replace' mode, the record should have a 001 field: display_001 = 1 ## Remove fields unwanted in ALEPH MARC: for deltag in get_fields_dropped_in_aleph(): try: del record[deltag] except KeyError: ## tag doesn't exist in record: pass ## now add 001, since it is a special field: if options["text-marc"] == 1: try: ## get the 001 line(s): lines_001 = create_field_lines(fieldname="001", \ field=record["001"][0], \ sysno=sysno, \ alephmarc=options["aleph-marc"]) ## print the 001 line(s): out += print_field(field_lines=lines_001, \ alephmarc=options["aleph-marc"]) except KeyError: ## no 001 field pass elif options["aleph-marc"] == 1: ## If desirable, build the "001" line: if display_001 == 1: try: ## make the 001 line(s): line_leader = """%(sys)s """ % { 'sys' : sysno } line_leader += """%(fieldname)s L """ % { 'fieldname' : "001" } lines_001 = [[["", line_leader], ["", sysno]]] ## print the 001 line(s): out += print_field(field_lines=lines_001, \ alephmarc=options["aleph-marc"]) except KeyError: ## no 001 field pass ## Now, if running in "insert" or "replace" mode, add "003": ## 003 is a mandatory field in an ALEPH record. It contains the ## identifier for the organization that has generated the SYS (001) ## for the record. As such, it is necessary to drop any existing 003 ## from the record, then add our own 003. 
## First, drop the "003" field from the record: try: del record["003"] except KeyError: ## There was no 003 pass ## Now add a correct 003 (if desirable): if 1 in (options["insert-mode"], options["replace-mode"]): out += """%(sys)s%(own)s\n""" % { 'sys' : sysno, 'own' : get_aleph_003() } ## delete 001 from the list of fields to output (if it exists): try: del record["001"] except KeyError: ## There was no 001 pass ## Get the fields of this record, and order them correctly (using the same ## order as that of the original MARC XML file): fields = [] tags = record.keys() tags.sort() for tag in tags: for field in record[tag]: fields.append((tag, field)) ## Finally, loop through all fields and display them in the record: for field in fields: ## Should the field-name be changed? try: fieldname = fieldname_changes[str(field[0])] except KeyError: ## Don't change this fieldname: fieldname = field[0] ## get the subfields, etc, for this field: fielddata = field[1] ## Create the MARC lines for this field: field_lines = create_field_lines(fieldname, \ fielddata, \ sysno, \ options["aleph-marc"]) ## Now create the formatted MARC lines: out += print_field(field_lines, options["aleph-marc"]) ## Return the formatted MARC record: return out
def validate_matches(bibmatch_recid, record, server, result_recids, \
                     collections="", verbose=0, ascii_mode=False):
    """
    Perform record validation on a set of matches. This function will
    try to find any search-result that "really" is a correct match, based
    on various methods defined in a given rule-set. See more about rule-sets
    in validate_match() function documentation.

    This function will return a tuple containing a list of all record IDs
    satisfying the count of field matching needed for exact matches and
    a similar list for fuzzy matches that has less fields matching then the
    threshold. Records that are not matching at all are simply left out of
    the lists.

    @param bibmatch_recid: Current record number. Used for logging.
    @type bibmatch_recid: int

    @param record: bibrec structure of original record
    @type record: dict

    @param server: InvenioConnector object to matched record source repository
    @type server: InvenioConnector object

    @param result_recids: the list of record ids from search result.
    @type result_recids: list

    @param collections: list of collections to search, if specified
    @type collections: list

    @param verbose: be loud
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: 2-tuple of (exactly matched record IDs, fuzzily matched
        record IDs)
    @rtype: tuple

    @raise BibMatchValidationError: on a bad rule-set configuration, or
        when no MARCXML could be retrieved for the candidate records.
    """
    matches_found = []
    fuzzy_matches_found = []

    # Generate final rule-set by analyzing the record
    final_ruleset = get_validation_ruleset(record)
    if not final_ruleset:
        # FIX: the adjacent string literals previously concatenated
        # without a separating space ("rule-set.Please ...").
        raise BibMatchValidationError("Bad configuration rule-set. " \
                                      "Please check that CFG_BIBMATCH_MATCH_VALIDATION_RULESETS" \
                                      " is formed correctly.")

    if verbose > 8:
        sys.stderr.write("\nStart record validation:\n\nFinal validation ruleset used:\n")
        pp = pprint.PrettyPrinter(stream=sys.stderr, indent=2)
        pp.pprint(final_ruleset)
    CFG_BIBMATCH_LOGGER.info("Final validation ruleset used: %s" % (final_ruleset,))

    # Fetch all records in MARCXML and convert to BibRec
    found_record_list = []
    query = " OR ".join(["001:%d" % (recid,) for recid in result_recids])

    if collections:
        search_params = dict(p=query, of="xm", c=collections)
    else:
        search_params = dict(p=query, of="xm")
    CFG_BIBMATCH_LOGGER.info("Fetching records to match: %s" % (str(search_params),))
    result_marcxml = server.search_with_retry(**search_params)
    # Check if record was found
    if result_marcxml:
        found_record_list = [r[0] for r in create_records(result_marcxml)]
    # Check if BibRecord generation was successful
    if not found_record_list:
        # Error fetching records. Unable to validate. Abort.
        raise BibMatchValidationError("Error retrieving MARCXML for possible matches from %s. Aborting." \
                                      % (server.server_url,))
    if len(found_record_list) < len(result_recids):
        # Error fetching all records. Will still continue.
        sys.stderr.write("\nError retrieving all MARCXML for possible matched records from %s.\n" \
                         % (server.server_url,))

    # Validate records one-by-one, adding any matches to the list of
    # matching record IDs
    current_index = 1
    for matched_record in found_record_list:
        # NOTE(review): assumes every fetched record carries a 001
        # controlfield; an empty list here would raise IndexError.
        recid = record_get_field_values(matched_record, tag="001")[0]
        if verbose > 8:
            sys.stderr.write("\n Validating matched record #%d (%s):\n" % \
                             (current_index, recid))
        CFG_BIBMATCH_LOGGER.info("Matching of record %d: Comparing to matched record %s" % \
                                 (bibmatch_recid, recid))
        match_ratio = validate_match(record, matched_record, final_ruleset, \
                                     verbose, ascii_mode)
        if match_ratio == 1.0:
            # All matches were a success, this is an exact match
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Exact match found -> %s" % (bibmatch_recid, recid))
            matches_found.append(recid)
        elif match_ratio >= CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT:
            # This means that some matches failed, but some succeeded as
            # well. That's fuzzy...
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Fuzzy match found -> %s" % \
                                     (bibmatch_recid, recid))
            fuzzy_matches_found.append(recid)
        else:
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Not a match" % (bibmatch_recid,))
        current_index += 1

    # Return lists of exact and fuzzy matching record IDs
    return matches_found, fuzzy_matches_found
def validate_match(org_record, matched_record, ruleset, verbose=0, ascii_mode=False):
    """
    This function will try to match the original record with matched record.
    This comparison uses various methods defined in configuration and/or
    determined from the source record.

    These methods can be derived from each rule-set defined, which contains a
    mapping of a certain pattern to a list of rules defining the
    "match-strategy".

    For example:

    ('260__', [{ 'tags' : '260__c',
                 'threshold' : 0.8,
                 'compare_mode' : 'lazy',
                 'match_mode' : 'date',
                 'result_mode' : 'normal' }])

    Quick run-down of possible values:
      Compare mode:
        'strict'    : all (sub-)fields are compared, and all must match. Order is significant.
        'normal'    : all (sub-)fields are compared, and all must match. Order is ignored.
        'lazy'      : all (sub-)fields are compared with each other and at least one must match
        'ignored'   : the tag is ignored in the match. Used to disable previously defined rules.

      Match mode:
        'title'     : uses a method specialized for comparing titles, e.g. looking for subtitles
        'author'    : uses a special authorname comparison. Will take initials into account.
        'identifier': special matching for identifiers, stripping away punctuation
        'date'      : matches dates by extracting and comparing the year
        'normal'    : normal string comparison.

      Result mode:
        'normal'    : a failed match will cause the validation to continue on other rules (if any)
                      a successful match will cause the validation to continue on other rules (if any)
        'final'     : a failed match will cause the validation to immediately exit as a failure.
                      a successful match will cause validation to immediately exit as a success.
        'joker'     : a failed match will cause the validation to continue on other rules (if any).
                      a successful match will cause validation to immediately exit as a success.

    Fields are considered matching when all its subfields or values match.
    ALL matching strategy must return successfully for a match to be
    validated (except for 'joker' mode).

    @param org_record: bibrec structure of original record
    @type org_record: dict

    @param matched_record: bibrec structure of matched record
    @type matched_record: dict

    @param ruleset: the default rule-set {tag: strategy,..} used when
        validating (iterated here as 5-tuples of
        (field_tags, threshold, compare_mode, match_mode, result_mode))
    @type ruleset: dict

    @param verbose: be loud
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: Number of matches succeeded divided by number of comparisons
        done. At least two successful matches must be done unless a joker
        or final match is found
    @rtype: float
    """
    total_number_of_matches = 0
    total_number_of_comparisons = 0
    for field_tags, threshold, compare_mode, match_mode, result_mode in ruleset:
        field_tag_list = field_tags.split(',')
        if verbose > 8:
            sys.stderr.write("\nValidating tags: %s in parsing mode '%s' and comparison mode '%s' as '%s' result with threshold %0.2f\n" \
                             % (field_tag_list, compare_mode, match_mode, \
                                result_mode, threshold))
        current_matching_status = False

        ## 1. COMPARE MODE
        # Fetch defined fields from both records
        original_record_values = []
        matched_record_values = []
        for field_tag in field_tag_list:
            tag_structure = validate_tag(field_tag)
            if tag_structure != None:
                tag, ind1, ind2, code = tag_structure
                # Fetch all field instances to match
                original_values = record_get_field_values(org_record, tag, ind1, ind2, code)
                original_record_values.extend([value for value in original_values if value])
                matched_values = record_get_field_values(matched_record, tag, ind1, ind2, code)
                matched_record_values.extend([value for value in matched_values if value])

        if (len(original_record_values) == 0 or len(matched_record_values) == 0):
            # Both records do not have values, ignore.
            if verbose > 8:
                sys.stderr.write("\nBoth records do not have this field. Continue.\n")
            continue

        if result_mode != 'joker':
            # Since joker is a special beast (should have no impact on failure),
            # We first check if it is the current mode before incrementing number
            # of matching comparisons / attempts
            total_number_of_comparisons += 1

        if ascii_mode:
            original_record_values = translate_to_ascii(original_record_values)
            matched_record_values = translate_to_ascii(matched_record_values)

        ignore_order = True
        matches_needed = 0
        # How many field-value matches are needed for successful validation of this record
        if compare_mode == 'lazy':
            # 'lazy' : all fields are matched with each other, if any match = success
            matches_needed = 1
        elif compare_mode == 'normal':
            # 'normal' : all fields are compared, and all must match.
            # Order is ignored. The number of matches needed is equal
            # to the value count of original record
            matches_needed = len(original_record_values)
        elif compare_mode == 'strict':
            # 'strict' : all fields are compared, and all must match. Order matters.
            if len(original_record_values) != len(matched_record_values):
                # Not the same number of fields, not a valid match
                # Unless this is a joker, we return indicating failure
                if result_mode != 'joker':
                    return 0.0
                continue
            matches_needed = len(original_record_values)
            ignore_order = False
        if verbose > 8:
            sys.stderr.write("Total matches needed: %d -> " % (matches_needed,))

        ## 2. MATCH MODE
        # Pick the comparison function for this rule's match_mode.
        comparison_function = None
        if match_mode == 'title':
            # Special title mode
            comparison_function = compare_fieldvalues_title
        elif match_mode == 'author':
            # Special author mode
            comparison_function = compare_fieldvalues_authorname
        elif match_mode == 'identifier':
            # Special identifier mode
            comparison_function = compare_fieldvalues_identifier
        elif match_mode == 'date':
            # Special date mode
            comparison_function = compare_fieldvalues_date
        else:
            # Normal mode
            comparison_function = compare_fieldvalues_normal

        # Get list of comparisons to perform containing extracted values
        field_comparisons = get_paired_comparisons(original_record_values, \
                                                   matched_record_values, \
                                                   ignore_order)

        if verbose > 8:
            sys.stderr.write("Field comparison values:\n%s\n" % (field_comparisons,))

        # Run comparisons according to match_mode
        current_matching_status, matches = comparison_function(field_comparisons, \
                                                              threshold, \
                                                              matches_needed)
        CFG_BIBMATCH_LOGGER.info("-- Comparing fields %s with %s = %d matches of %d" % \
                                 (str(original_record_values), \
                                  str(matched_record_values), \
                                  matches, matches_needed))

        ## 3. RESULT MODE
        if current_matching_status:
            if verbose > 8:
                sys.stderr.write("Fields matched successfully.\n")
            if result_mode in ['final', 'joker']:
                # Matching success. Return 1.0 indicating an exact match
                # when the rule is final or joker.
                return 1.0
            total_number_of_matches += 1
        else:
            # Matching failed. Not a valid match
            if result_mode == 'final':
                # Final does not allow failure
                return 0.0
            elif result_mode == 'joker':
                if verbose > 8:
                    sys.stderr.write("Fields not matching. (Joker)\n")
            else:
                if verbose > 8:
                    sys.stderr.write("Fields not matching. \n")

    # Require a minimum number of successful matches (and at least one
    # counted comparison) before reporting a ratio.
    if total_number_of_matches < CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS \
        or total_number_of_comparisons == 0:
        return 0.0
    return total_number_of_matches / float(total_number_of_comparisons)
def generate_ticket(ticket, record):
    """
    Generates a ticket to be created, filling subject, body and queue values
    of the passed BibCatalogTicket object. The enriched object is returned.

    @param ticket: a ticket object as created by BibCatalogTicket() containing
                   the subject, body and queue to create a ticket in.
    @type ticket: record object of BibCatalogTicket.

    @param record: a recstruct object as created by bibrecord.create_record()
    @type record: record object of BibRecord.

    @return: the modified ticket object to create.
    @rtype: BibCatalogTicket
    """
    # Resolve the MARC tag codes for the fields we want to show, with
    # fallbacks for installations using alternative tag names.
    title_code = load_tag_code_from_name("title")
    abstract_code = load_tag_code_from_name("abstract")
    try:
        date_code = load_tag_code_from_name("date")
    except BibCatalogTagNotFound:
        date_code = load_tag_code_from_name("year")
    category_code = load_tag_code_from_name("subject")
    try:
        notes_code = load_tag_code_from_name("note")
    except BibCatalogTagNotFound:
        notes_code = load_tag_code_from_name("comment")
    first_author_code = load_tag_code_from_name("first author name")
    additional_author_code = load_tag_code_from_name("additional author name")
    try:
        external_id_code = load_tag_code_from_name("ext system ID")
    except BibCatalogTagNotFound:
        external_id_code = load_tag_code_from_name("primary report number")

    # List of extra info to print in the ticket.
    extra_info = []
    recid = record_id_from_record(record)
    arxiv_id = _get_minimal_arxiv_id(record, external_id_code)
    if arxiv_id:
        # We have an arxiv id - we can add special info:
        extra_info.append("ABSTRACT: http://arxiv.org/abs/%s" % (arxiv_id,))
        extra_info.append("PDF: http://arxiv.org/pdf/%s" % (arxiv_id,))
        # Restrict categories/comments to values whose provenance
        # subfield marks them as coming from arXiv.
        categories = record_get_value_with_provenence(record=record,
                                                      provenence_code="2",
                                                      provenence_value="arXiv",
                                                      **split_tag_code(category_code))
        comments = record_get_value_with_provenence(record=record,
                                                    provenence_code="9",
                                                    provenence_value="arXiv",
                                                    **split_tag_code(notes_code))
        external_ids = arxiv_id
        subject = "ARXIV:" + arxiv_id
    else:
        # Not an arxiv record - Lets get generic info
        categories = record_get_value_with_provenence(record=record,
                                                      provenence_code="2",
                                                      provenence_value="SzGeCERN",
                                                      **split_tag_code(category_code))
        comments = record_get_field_values(rec=record,
                                           **split_tag_code(notes_code))
        external_id_list = record_get_field_values(rec=record,
                                                   **split_tag_code(external_id_code))
        external_ids = ", ".join(external_id_list)
        subject = "Record #%s %s" % (recid, external_ids)

    # First author followed by any additional authors; only the first
    # ten are shown in the ticket body.
    authors = record_get_field_values(record, **split_tag_code(first_author_code)) + \
              record_get_field_values(record, **split_tag_code(additional_author_code))

    # NOTE(review): exact blank-line layout of this template could not be
    # recovered from the original formatting — confirm against upstream.
    text = """
%(submitdate)s
External IDs: %(external_ids)s
Title: %(title)s
Authors: %(authors)s
Categories: %(categories)s
Comments: %(comments)s
%(abstract)s
%(extra_info)s
Edit the record now: %(editurl)s
""" \
        % {
            'external_ids': external_ids,
            'submitdate': record_get_field_value(record, **split_tag_code(date_code)),
            'extra_info': "\n".join(extra_info),
            'title': record_get_field_value(record, **split_tag_code(title_code)),
            'comments': "; ".join(comments),
            'categories': " ".join(categories),
            'authors': " / ".join(authors[:10]),
            'abstract': record_get_field_value(record, **split_tag_code(abstract_code)),
            'editurl': "%s/record/edit/%s" % (CFG_SITE_URL, recid),
        }
    # To avoid errors with string formatting later, we are escaping %'s
    # NOTE(review): only the body is escaped — the subject is not; confirm
    # whether the subject can ever contain '%'.
    ticket.subject = subject
    ticket.body = text.replace('%', '%%')
    # NOTE(review): queue is hard-coded to "Test" — looks like a leftover;
    # verify this is intended for production.
    ticket.queue = "Test"
    return ticket