def replace_references(recid, uid=None, txt=None, url=None): """Replace references for a record The record itself is not updated, the marc xml of the document with updated references is returned Parameters: * recid: the id of the record * txt: references in text mode * inspire: format of ther references """ # Parse references if txt is not None: references_xml = extract_references_from_string_xml(txt, is_only_references=True) elif url is not None: references_xml = extract_references_from_url_xml(url) else: references_xml = extract_references_from_record_xml(recid) references = create_record(references_xml) dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = get_cache_contents(recid, uid) out_xml = None references_to_add = record_get_field_instances(references[0], tag="999", ind1="C", ind2="5") refextract_status = record_get_field_instances(references[0], tag="999", ind1="C", ind2="6") if references_to_add: # Replace 999 fields record_delete_fields(record, "999") record_add_fields(record, "999", references_to_add) record_add_fields(record, "999", refextract_status) # Update record references out_xml = record_xml_output(record) return out_xml
def crossref_normalize_name(record):
    """
    Normalize author names (often given with initials) to the proper,
    unified format, using the bibauthor name utilities.

    @return: changed record
    """
    # pattern matching two initials separated by a single whitespace
    initials_pattern = '([A-Z]\\.)\\s([A-Z]\\.)'

    def _normalized(raw_name):
        # Two substitution passes are required because re.sub does not
        # re-match overlapping occurrences (e.g. "J. R. R.").
        name = create_normalized_name(split_name_parts(raw_name))
        for _ in range(2):
            name = re.sub(initials_pattern, r'\g<1>\g<2>', name)
        return name

    # first the main author (tag 100), then the additional authors (tag 700)
    for tag in ('100', '700'):
        for field in record_get_field_instances(record, tag):
            record_modify_subfield(rec=record,
                                   tag=tag,
                                   subfield_code='a',
                                   value=_normalized(field[0][0][1]),
                                   subfield_position=0,
                                   field_position_global=field[4])
def replace_references(recid):
    """Replace references for a record.

    The record itself is not updated; the MARC XML of the document with
    updated references is returned.

    Parameters:
    * recid: the id of the record
    """
    # Extract and parse the references of the stored record
    refs_record = create_record(extract_references_from_record_xml(recid))[0]
    if not refs_record:
        # Nothing was extracted
        return None
    # Record marc xml
    record = get_record(recid)
    extracted_fields = record_get_field_instances(refs_record,
                                                  tag='999',
                                                  ind1='%',
                                                  ind2='%')
    # Replace 999 fields with the freshly extracted ones
    record_delete_fields(record, '999')
    record_add_fields(record, '999', extracted_fields)
    # Update record references
    return record_xml_output(record)
def translate_fieldvalues_from_latex(record, tag, code='', encoding='utf-8'):
    """
    Given a record and field tag, this function will modify the record by
    translating the subfield values of found fields from LaTeX to chosen
    encoding for all the subfields with given code (or all if no code is
    given).

    :param record: record to modify, in BibRec style structure
    :type record: dict
    :param tag: tag of fields to modify
    :type tag: string
    :param code: restrict the translation to a given subfield code
    :type code: string
    :param encoding: character encoding for the new value. Defaults to UTF-8.
    :type encoding: string
    """
    field_list = record_get_field_instances(record, tag)
    for field in field_list:
        subfields = field[0]
        subfield_index = 0
        for subfield_code, subfield_value in subfields:
            if code == '' or subfield_code == code:
                newvalue = translate_latex2unicode(subfield_value).encode(
                    encoding)
                record_modify_subfield(record, tag, subfield_code, newvalue,
                                       subfield_index,
                                       field_position_global=field[4])
            # The index must advance for every subfield, matched or not,
            # so it keeps tracking the position within the field.
            subfield_index += 1
def format_element(bfo, limit, separator=' ; ', extension='[...]',
                   print_links="yes"):
    """
    Prints the list of editors of a record.

    @param limit: the maximum number of editors to display
    @param separator: the separator between editors.
    @param extension: a text printed if more editors than 'limit' exist
    @param print_links: if yes, print the editors as HTML link to their
        publications
    """
    from urllib import quote
    from invenio.config import CFG_SITE_URL
    from invenio.legacy import bibrecord
    # Editors are 100 fields whose $e subfield equals "ed."
    authors = bibrecord.record_get_field_instances(bfo.get_record(), '100')
    editors = [bibrecord.field_get_subfield_values(author, 'a')[0]
               for author in authors
               if len(bibrecord.field_get_subfield_values(author, "e")) > 0
               and bibrecord.field_get_subfield_values(author, "e")[0] == "ed."]
    if print_links.lower() == "yes":
        # Wrap each editor name in a link to an author search
        editors = ['<a href="' + CFG_SITE_URL + '/search?f=author&p=' +
                   quote(editor) +
                   '&ln=' + bfo.lang +
                   '">' + editor + '</a>' for editor in editors]
    if limit.isdigit() and len(editors) > int(limit):
        return separator.join(editors[:int(limit)]) + extension
    elif len(editors) > 0:
        return separator.join(editors)
    # NOTE: implicitly returns None when the record has no editors
def translate_fieldvalues_from_latex(record, tag, code='', encoding='utf-8'):
    """
    Given a record and field tag, this function will modify the record by
    translating the subfield values of found fields from LaTeX to chosen
    encoding for all the subfields with given code (or all if no code is
    given).

    :param record: record to modify, in BibRec style structure
    :type record: dict
    :param tag: tag of fields to modify
    :type tag: string
    :param code: restrict the translation to a given subfield code
    :type code: string
    :param encoding: character encoding for the new value. Defaults to UTF-8.
    :type encoding: string
    """
    field_list = record_get_field_instances(record, tag)
    for field in field_list:
        subfields = field[0]
        subfield_index = 0
        for subfield_code, subfield_value in subfields:
            if code == '' or subfield_code == code:
                newvalue = translate_latex2unicode(
                    subfield_value
                ).encode(encoding)
                record_modify_subfield(record, tag, subfield_code, newvalue,
                                       subfield_index,
                                       field_position_global=field[4])
            # Advance for every subfield so the index keeps tracking the
            # subfield position within the field.
            subfield_index += 1
def record_get_value_with_provenence(record, provenence_value, provenence_code,
                                     tag, ind1=" ", ind2=" ", code=""):
    """
    Retrieve the values of subfield `code` from every field (tag/ind1/ind2)
    that also carries the given provenance code/value pair.

    For example, to extract all subject categories (65017 $a) whose
    provenance ($9) is "arXiv":

        65017 $ahep-ph$9arXiv
        65017 $ahep-th$9arXiv
        65017 $aMath$9INSPIRE

    this function would return ["hep-ph", "hep-th"].

    Returns a list of subfield values.
    """
    final_values = []
    for field in record_get_field_instances(record, tag, ind1, ind2):
        subfields = field[0]
        # Only keep fields carrying the requested provenance pair
        provenence_hit = any(sf_code == provenence_code and
                             sf_value == provenence_value
                             for sf_code, sf_value in subfields)
        if not provenence_hit:
            continue
        # Collect every value of the requested subfield code
        final_values.extend(sf_value for sf_code, sf_value in subfields
                            if sf_code == code)
    return final_values
def format_element(bfo, limit, separator=' ; ', extension='[...]',
                   print_links="yes"):
    """
    Prints the list of editors of a record.

    @param limit: the maximum number of editors to display
    @param separator: the separator between editors.
    @param extension: a text printed if more editors than 'limit' exist
    @param print_links: if yes, print the editors as HTML link to their
        publications
    """
    from urllib import quote
    from invenio.config import CFG_BASE_URL
    from invenio.legacy import bibrecord
    # Editors are 100 fields whose $e subfield equals "ed."
    authors = bibrecord.record_get_field_instances(bfo.get_record(), '100')
    editors = [bibrecord.field_get_subfield_values(author, 'a')[0]
               for author in authors
               if len(bibrecord.field_get_subfield_values(author, "e")) > 0
               and bibrecord.field_get_subfield_values(author, "e")[0] == "ed."]
    if print_links.lower() == "yes":
        # Wrap each editor name in a link to an author search
        editors = ['<a href="' + CFG_BASE_URL + '/search?f=author&p=' +
                   quote(editor) +
                   '&ln=' + bfo.lang +
                   '">' + editor + '</a>' for editor in editors]
    if limit.isdigit() and len(editors) > int(limit):
        return separator.join(editors[:int(limit)]) + extension
    elif len(editors) > 0:
        return separator.join(editors)
    # NOTE: implicitly returns None when the record has no editors
def _create_ticket(recid, bibcatalog_system, queue):
    """Submit a BibCatalog ticket asking curators to check refs of `recid`.

    On an INSPIRE site several guards apply before submitting: arXiv
    papers, records not in CORE, records older than ~4 months, and
    astro-ph papers are all skipped (no ticket is created).
    """
    subject = "Refs for #%s" % recid
    if CFG_INSPIRE_SITE:
        # Add report number in the subject
        report_number = ""
        record = get_bibrecord(recid)
        in_core = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, 'a'):
                if collection == 'CORE':
                    in_core = True
                if collection == 'arXiv':
                    # Do not create tickets for arxiv papers
                    # Tickets for arxiv papers are created in bibcatelog
                    write_message("arXiv paper", verbose=1)
                    return
        # Only create tickets for HEP
        if not in_core:
            write_message("not in hep", verbose=1)
            return
        # Do not create tickets for old records
        creation_date = run_sql(
            """SELECT creation_date FROM bibrec WHERE id = %s""",
            [recid])[0][0]
        if creation_date < datetime.now() - timedelta(days=30 * 4):
            return
        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'c'):
                if category.startswith('astro-ph'):
                    write_message("astro-ph", verbose=1)
                    # We do not curate astro-ph
                    return
            # Append only the first report number of this field to the subject
            for report_number in field_get_subfield_values(report_tag, 'a'):
                subject += " " + report_number
                break
    text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL, recid)
    bibcatalog_system.ticket_submit(subject=subject,
                                    queue=queue,
                                    text=text,
                                    recordid=recid)
def _create_ticket(recid, bibcatalog_system, queue):
    """Submit a BibCatalog ticket asking curators to check refs of `recid`.

    On an INSPIRE site several guards apply before submitting: arXiv
    papers, records not in CORE, records older than ~4 months, and
    astro-ph papers are all skipped (no ticket is created).
    """
    subject = "Refs for #%s" % recid
    if CFG_INSPIRE_SITE:
        # Add report number in the subject
        report_number = ""
        record = get_bibrecord(recid)
        in_core = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, 'a'):
                if collection == 'CORE':
                    in_core = True
                if collection == 'arXiv':
                    # Do not create tickets for arxiv papers
                    # Tickets for arxiv papers are created in bibcatelog
                    write_message("arXiv paper", verbose=1)
                    return
        # Only create tickets for HEP
        if not in_core:
            write_message("not in hep", verbose=1)
            return
        # Do not create tickets for old records
        creation_date = run_sql("""SELECT creation_date FROM bibrec
                                   WHERE id = %s""", [recid])[0][0]
        if creation_date < datetime.now() - timedelta(days=30*4):
            return
        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'c'):
                if category.startswith('astro-ph'):
                    write_message("astro-ph", verbose=1)
                    # We do not curate astro-ph
                    return
            # Append only the first report number of this field to the subject
            for report_number in field_get_subfield_values(report_tag, 'a'):
                subject += " " + report_number
                break
    text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL, recid)
    bibcatalog_system.ticket_submit(subject=subject,
                                    queue=queue,
                                    text=text,
                                    recordid=recid)
def check_arxiv(recid):
    """Return True when record `recid` has an arXiv report number (037 $a)."""
    rec = get_record(recid)
    return any(value.startswith('arXiv')
               for field_037 in record_get_field_instances(rec, "037")
               for value in field_get_subfield_values(field_037, 'a'))
def record_in_collection(record, collection):
    """
    Returns True/False if given record is in a given collection (980__a).
    """
    # Comparison is case-insensitive on both sides
    wanted = collection.lower()
    for field in record_get_field_instances(record, "980"):
        if any(value.lower() == wanted
               for value in field_get_subfield_values(field, 'a')):
            return True
    return False
def rule_change_conf_num(header, record):
    """Rewrite known-bad conference numbers in 773 $w with corrected values.

    @param header: unused here; part of the common rule signature
    @param record: record (BibRec structure), modified in place
    @return: the (modified) record
    """
    # map of wrong conference number -> corrected value
    substitutes = {
        "C78-09-18xxx": "C78-09-18.2",
    }
    for field in record_get_field_instances(record, '773'):
        for idx, (code, value) in enumerate(field[0]):
            # Membership test directly on the dict — `in substitutes.keys()`
            # needlessly built a list for every subfield.
            if code == 'w' and value in substitutes:
                # subfields are tuples, so replace the whole pair
                field[0][idx] = ('w', substitutes[value])
    return record
def replace_references(recid, uid=None, txt=None, url=None): """Replace references for a record The record itself is not updated, the marc xml of the document with updated references is returned Parameters: * recid: the id of the record * txt: references in text mode * inspire: format of ther references """ # Parse references if txt is not None: references_xml = extract_references_from_string_xml( txt, is_only_references=True) elif url is not None: references_xml = extract_references_from_url_xml(url) else: references_xml = extract_references_from_record_xml(recid) references = create_record(references_xml) dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = get_cache_contents( recid, uid) out_xml = None references_to_add = record_get_field_instances(references[0], tag='999', ind1='C', ind2='5') refextract_status = record_get_field_instances(references[0], tag='999', ind1='C', ind2='6') if references_to_add: # Replace 999 fields record_delete_fields(record, '999') record_add_fields(record, '999', references_to_add) record_add_fields(record, '999', refextract_status) # Update record references out_xml = record_xml_output(record) return out_xml
def record_find_matching_fields(key, rec, tag="", ind1=" ", ind2=" ",
                                exact_match=False):
    """
    Look for any field values containing — or, when `exact_match` is set,
    equal to — the given keyword string.  The found fields are returned
    as a list of field instances per tag.  The fields to search can be
    narrowed down to tag/indicator level.

    @param key: keyword to search for
    @type key: string
    @param rec: a record structure as returned by bibrecord.create_record()
    @type rec: dict
    @param tag: a 3 characters long string
    @type tag: string
    @param ind1: a 1 character long string
    @type ind1: string
    @param ind2: a 1 character long string
    @type ind2: string
    @return: a list of found fields in a tuple per tag: (tag, field_instances)
        where field_instances is a list of
        (Subfields, ind1, ind2, value, field_position_global)
        and subfields is list of (code, value)
    @rtype: list
    """
    if not tag:
        # No tag given: search across every tag of the record
        all_field_instances = rec.items()
    else:
        all_field_instances = [
            (tag, record_get_field_instances(rec, tag, ind1, ind2))
        ]
    matching_field_instances = []
    for current_tag, field_instances in all_field_instances:
        found_fields = []
        for field_instance in field_instances:
            # Get values to match: controlfield_value + subfield values
            values_to_match = [field_instance[3]] + \
                              [val for dummy_code, val in field_instance[0]]
            if exact_match:
                # BUGFIX: previously, when exact_match was requested but no
                # exact hit existed, control fell into the substring search
                # below and substring matches were wrongly accepted.
                if key in values_to_match:
                    found_fields.append(field_instance)
            else:
                for value in values_to_match:
                    if value.find(key) > -1:
                        found_fields.append(field_instance)
                        break
        if len(found_fields) > 0:
            matching_field_instances.append((current_tag, found_fields))
    return matching_field_instances
def _get_record_linking_fields(recid_b, recid_a, tag, ind1, ind2):
    """
    Return the fields (defined by tag, ind1, ind2) of record `recid_b`
    that do NOT contain a $w link to record `recid_a`.
    """
    rec = create_record(format_record(recid_b, "xm"))[0]
    link_subfield = ('w', str(recid_a))
    return [field
            for field in record_get_field_instances(rec, tag=tag,
                                                    ind1=ind1, ind2=ind2)
            if link_subfield not in field[0]]
def rule_create_fft(header, record):
    """For every 856 4_ field carrying a $u URL, add a matching FFT field."""
    for field in record_get_field_instances(record, '856', ind1='4'):
        # First $u subfield, if any
        url = next((value
                    for code, value in field_get_subfield_instances(field)
                    if code == 'u'), None)
        if url:
            record_add_field(record, 'FFT',
                             subfields=[('a', url),
                                        ('t', 'INSPIRE-PUBLIC'),
                                        ('d', 'Fulltext')])
    return record
def process_record(self, record):
    """@see: BaseFieldCommand.process_record"""
    # if the tag is empty, we don't make any changes
    # (idiom fix: `is None` instead of `== None` for the identity test)
    if self._tag == "" or self._tag is None:
        return

    matching_field_instances = \
        bibrecord.record_get_field_instances(record, self._tag,
                                             self._ind1, self._ind2)
    for current_field in matching_field_instances:
        # element 4 of a field instance is its global field position
        self._apply_subfield_commands_to_field(record, current_field[4])
def create_ticket(recid, bibcatalog_system, queue=CFG_REFEXTRACT_TICKET_QUEUE):
    """Submit a BibCatalog ticket asking curators to check refs of `recid`.

    Only records in the HEP collection are ticketed; astro-ph papers are
    skipped.  Nothing happens unless both `bibcatalog_system` and `queue`
    are provided.
    """
    write_message('bibcatalog_system %s' % bibcatalog_system, verbose=1)
    write_message('queue %s' % queue, verbose=1)
    if bibcatalog_system and queue:
        subject = "Refs for #%s" % recid
        # Add report number in the subject
        report_number = ""
        record = get_bibrecord(recid)
        in_hep = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, 'a'):
                if collection == 'HEP':
                    in_hep = True
        # Only create tickets for HEP
        if not in_hep:
            write_message("not in hep", verbose=1)
            return
        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'c'):
                if category.startswith('astro-ph'):
                    write_message("astro-ph", verbose=1)
                    # We do not curate astro-ph
                    return
            # Append only the first report number of this field to the subject
            for report_number in field_get_subfield_values(report_tag, 'a'):
                subject += " " + report_number
                break
        text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL,
                                                        recid)
        bibcatalog_system.ticket_submit(subject=subject,
                                        queue=queue,
                                        text=text,
                                        recordid=recid)
def record_find_matching_fields(key, rec, tag="", ind1=" ", ind2=" ",
                                exact_match=False):
    """
    Look for any field values containing — or, when `exact_match` is set,
    equal to — the given keyword string.  The found fields are returned
    as a list of field instances per tag.  The fields to search can be
    narrowed down to tag/indicator level.

    @param key: keyword to search for
    @type key: string
    @param rec: a record structure as returned by bibrecord.create_record()
    @type rec: dict
    @param tag: a 3 characters long string
    @type tag: string
    @param ind1: a 1 character long string
    @type ind1: string
    @param ind2: a 1 character long string
    @type ind2: string
    @return: a list of found fields in a tuple per tag: (tag, field_instances)
        where field_instances is a list of
        (Subfields, ind1, ind2, value, field_position_global)
        and subfields is list of (code, value)
    @rtype: list
    """
    if not tag:
        # No tag given: search across every tag of the record
        all_field_instances = rec.items()
    else:
        all_field_instances = [(tag, record_get_field_instances(rec, tag,
                                                                ind1, ind2))]
    matching_field_instances = []
    for current_tag, field_instances in all_field_instances:
        found_fields = []
        for field_instance in field_instances:
            # Get values to match: controlfield_value + subfield values
            values_to_match = [field_instance[3]] + \
                              [val for dummy_code, val in field_instance[0]]
            if exact_match:
                # BUGFIX: previously, when exact_match was requested but no
                # exact hit existed, control fell into the substring search
                # below and substring matches were wrongly accepted.
                if key in values_to_match:
                    found_fields.append(field_instance)
            else:
                for value in values_to_match:
                    if value.find(key) > -1:
                        found_fields.append(field_instance)
                        break
        if len(found_fields) > 0:
            matching_field_instances.append((current_tag, found_fields))
    return matching_field_instances
def crossref_translate_title(record): """ Convert the record's title to the Inspire specific abbreviation of the title (using JOURNALS knowledge base) @return: changed record """ # probably there is only one 773 field # but just in case let's treat it as a list for field in record_get_field_instances(record, '773'): title = field[0][0][1] new_title = get_kbr_values("JOURNALS", title, searchtype='e') if new_title: # returned value is a list, and we need only the first value new_title = new_title[0][0] position = field[4] record_modify_subfield(rec=record, tag='773', subfield_code='p', \ value=new_title, subfield_position=0, field_position_global=position)
def crossref_translate_title(record):
    """
    Replace the record's journal title (773 $p) with the INSPIRE-specific
    abbreviation found in the JOURNALS knowledge base.

    @return: changed record
    """
    # Normally there is a single 773 field, but iterate defensively
    for field in record_get_field_instances(record, '773'):
        current_title = field[0][0][1]
        kb_hits = get_kbr_values("JOURNALS", current_title, searchtype='e')
        if not kb_hits:
            continue
        # The knowledge base returns a list; only the first hit is used
        record_modify_subfield(rec=record,
                               tag='773',
                               subfield_code='p',
                               value=kb_hits[0][0],
                               subfield_position=0,
                               field_position_global=field[4])
def references_nb_counts():
    """Get number of references for the record `recid`.

    The recid is taken from the current request's view args.  Returns
    None when no recid is present, otherwise the number of reference
    fields (0 when the reference tag is unknown or too short).
    """
    recid = request.view_args.get("recid")
    if recid is None:
        return

    from invenio.legacy.bibrecord import record_get_field_instances
    from invenio.modules.search.models import Field
    from invenio.modules.records.api import get_record

    # BUGFIX: `reftag` must be bound even on CERN sites, where the lookup
    # below is skipped — previously this raised NameError in that case.
    reftag = ""
    if not CFG_CERN_SITE:
        reftags = list(Field.get_field_tags("reference"))
        if reftags:
            reftag = reftags[0]

    tmprec = get_record(recid)
    if reftag and len(reftag) > 4:
        # reftag is e.g. "999C5": tag + ind1 + ind2
        return len(record_get_field_instances(tmprec, reftag[0:3],
                                              reftag[3], reftag[4]))
    return 0
def get_record_provenance(recid):
    """
    Return the provenance XML representation of a record, suitable
    to be put in the about tag.
    """
    record = get_record(recid)
    # OAI provenance lives in the field configured by
    # CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG (tag + ind1 + ind2 + subfield code)
    provenances = record_get_field_instances(
        record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3],
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3],
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4])
    out = ""
    for provenance in provenances:
        # Collect the known provenance subfields; anything missing stays ""
        base_url = identifier = datestamp = metadata_namespace = origin_description = harvest_date = altered = ""
        for (code, value) in provenance[0]:
            if code == CFG_OAI_PROVENANCE_BASEURL_SUBFIELD:
                base_url = value
            elif code == CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]:
                identifier = value
            elif code == CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD:
                datestamp = value
            elif code == CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD:
                metadata_namespace = value
            elif code == CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD:
                origin_description = value
            elif code == CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD:
                harvest_date = value
            elif code == CFG_OAI_PROVENANCE_ALTERED_SUBFIELD:
                altered = value
        if base_url:
            # Emit one <provenance> block per field that has a base URL
            out += """<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">"""
            out += X.originDescription(
                harvestDate=harvest_date, altered=altered)(
                    X.baseURL()(base_url),
                    X.identifier()(identifier),
                    X.datestamp()(datestamp),
                    X.metadataNamespace()(metadata_namespace),
                    origin_description and X.originDescription(origin_description) or ''  ## This is already XML
                )
            out += """</provenance>"""
    return out
def get_record_provenance(recid):
    """
    Return the provenance XML representation of a record, suitable
    to be put in the about tag.
    """
    record = get_record(recid)
    # OAI provenance lives in the field configured by
    # CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG (tag + ind1 + ind2 + subfield code)
    provenances = record_get_field_instances(
        record,
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3],
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3],
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4],
    )
    out = ""
    for provenance in provenances:
        # Collect the known provenance subfields; anything missing stays ""
        base_url = identifier = datestamp = metadata_namespace = origin_description = harvest_date = altered = ""
        for (code, value) in provenance[0]:
            if code == CFG_OAI_PROVENANCE_BASEURL_SUBFIELD:
                base_url = value
            elif code == CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]:
                identifier = value
            elif code == CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD:
                datestamp = value
            elif code == CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD:
                metadata_namespace = value
            elif code == CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD:
                origin_description = value
            elif code == CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD:
                harvest_date = value
            elif code == CFG_OAI_PROVENANCE_ALTERED_SUBFIELD:
                altered = value
        if base_url:
            # Emit one <provenance> block per field that has a base URL
            out += """<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">"""
            out += X.originDescription(harvestDate=harvest_date, altered=altered)(
                X.baseURL()(base_url),
                X.identifier()(identifier),
                X.datestamp()(datestamp),
                X.metadataNamespace()(metadata_namespace),
                origin_description and X.originDescription(origin_description) or "",  ## This is already XML
            )
            out += """</provenance>"""
    return out
def references_nb_counts():
    """Get number of references for the record `recid`.

    The recid is taken from the current request's view args.  Returns
    None when no recid is present, otherwise the number of reference
    fields (0 when the reference tag is unknown or too short).
    """
    recid = request.view_args.get('recid')
    if recid is None:
        return

    from invenio.legacy.bibrecord import record_get_field_instances
    from invenio.modules.search.models import Field
    from invenio.modules.records.api import get_record

    # BUGFIX: `reftag` must be bound even on CERN sites, where the lookup
    # below is skipped — previously this raised NameError in that case.
    reftag = ""
    if not CFG_CERN_SITE:
        reftags = list(Field.get_field_tags("reference"))
        if reftags:
            reftag = reftags[0]

    tmprec = get_record(recid)
    if reftag and len(reftag) > 4:
        # reftag is e.g. "999C5": tag + ind1 + ind2
        return len(
            record_get_field_instances(tmprec, reftag[0:3],
                                       reftag[3], reftag[4]))
    return 0
def tarballs_by_recids(recids, sdir):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids.

    @param: recids (string): the record id or ids, e.g. "1", "1-5", "1,3-5"
    @param: sdir (string): where the tarballs should live

    @return: tarballs ([string, string, ...]): locations of tarballs
    """

    def _expand(token):
        # "low-high" expands to the (high-exclusive) range, matching the
        # pre-existing behaviour of the comma-separated branch; a plain
        # id becomes a one-element list.
        if '-' in token:
            low, high = token.split('-')
            return list(range(int(low), int(high)))
        return [int(token)]

    # BUGFIX: the branch without a comma referenced the undefined name
    # `recid` (NameError) and, for a single id, assigned a bare int to
    # `list_of_ids`, which is iterated below (TypeError).  Splitting on
    # ',' handles both cases uniformly: "x".split(',') == ["x"].
    list_of_ids = []
    for token in recids.split(','):
        list_of_ids.extend(_expand(token))

    arXiv_ids = []
    for recid in list_of_ids:
        rec = get_record(recid)
        for afieldinstance in record_get_field_instances(rec, tag='037'):
            # NOTE(review): raises IndexError when a 037 field has no $9
            # subfield — behaviour preserved from the original; confirm
            # whether a guard is wanted.
            if 'arXiv' == field_get_subfield_values(afieldinstance, '9')[0]:
                arXiv_id = field_get_subfield_values(afieldinstance, 'a')[0]
                arXiv_ids.append(arXiv_id)
    return tarballs_by_arXiv_id(arXiv_ids, sdir)
def tokenize(self, recID):
    """Return the phrases of record `recID` selected by the indexing rules.

    Each rule in self.rules is (tag_to_index, necessary_tag,
    necessary_value); a field contributes its sub_tag value when the
    necessary tag/value condition holds (an empty necessary_tag or
    necessary_value means "no constraint").  Returns [] on KeyError.
    """
    phrases = []
    try:
        rec = get_record(recID)
        for rule in self.rules:
            tag_to_index, necessary_tag, necessary_value = rule
            # tag_to_index is e.g. "65017a": tag + indicators + subfield
            core_tag = tag_to_index[0:3]
            ind = tag_to_index[3:5]
            sub_tag = tag_to_index[5]
            fields = [dict(instance[0]) for instance in
                      record_get_field_instances(rec, core_tag,
                                                 ind[0], ind[1])]
            for field in fields:
                # idiom fix: `in` instead of the deprecated dict.has_key()
                tag_condition = (necessary_tag and necessary_tag in field
                                 or necessary_tag == '')
                value_condition = (necessary_value and
                                   field.get(necessary_tag, '') == necessary_value
                                   or necessary_value == '')
                if tag_condition and sub_tag in field and value_condition:
                    phrases.append(field[sub_tag])
        # (an unreachable duplicate `return phrases` after the except
        # clause was removed)
        return phrases
    except KeyError:
        return []
def find_modified_tags(self, common_tags, record1, record2):
    """
    For each tag common to Record1 and Record2, checks for modifications
    at field-level, indicator-level and subfield-level.
    Returns a dictionary of tags and corresponding fields from Record1
    that have been found to have modified.
    """
    result = {}
    for tag in common_tags:
        # retrieve tag instances of record1 and record2
        rec1_tag_val = record_get_field_instances(record1, tag, '%', '%')
        rec2_tag_val = record_get_field_instances(record2, tag, '%', '%')
        if rec1_tag_val:
            rec1_ind = self.group_tag_values_by_indicator(rec1_tag_val)
        if rec2_tag_val:
            rec2_ind = self.group_tag_values_by_indicator(rec2_tag_val)

        # NOTE: At this point rec1_ind and rec2_ind will be dictionary
        # Key ==> (ind1, ind2) tuple
        # Val ==> list of data_tuple => [dt1,dt2]
        # dt(n) => ([sfl],ind1,ind2,ctrlfield,fn)

        # Generating 3 different dictionaries
        # common/added/deleted ind pairs in record1 based on record2
        (com_ind, add_ind, del_ind) = self.compare_tags_by_ind(rec1_ind,
                                                               rec2_ind)

        if add_ind:
            for ind_pair in add_ind:
                for data_tuple in add_ind[ind_pair]:
                    subfield_list = data_tuple[0]
                    record_add_field(result, tag, ind_pair[0], ind_pair[1],
                                     '', subfields=subfield_list)

        # Indicators that are deleted from record1 w.r.t record2 will be
        # added with special code
        if del_ind:
            for ind_pair in del_ind:
                record_add_field(result, tag, ind_pair[0], ind_pair[1], '',
                                 [(CFG_BIBUPLOAD_DELETE_CODE,
                                   CFG_BIBUPLOAD_DELETE_VALUE)])

        # Common modified fields. Identifying changes at subfield level
        if com_ind:
            for ind_pair in com_ind:
                # NOTE: sf_rec1 and sf_rec2 are list of list of subfields
                # A simple list comparison is sufficient in this scenario
                # Any change in the order of fields or changes in subfields
                # will cause the entire list of data_tuple for that ind_pair
                # to be copied from record1(upload) to result.
                if tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS:
                    # controlfields: compare the raw controlfield values
                    cf_rec1 = [data_tuple[3]
                               for data_tuple in rec1_ind[ind_pair]]
                    cf_rec2 = [data_tuple[3]
                               for data_tuple in rec2_ind[ind_pair]]
                    if cf_rec1 != cf_rec2:
                        for data_tuple in com_ind[ind_pair]:
                            record_add_field(result, tag,
                                             controlfield_value=data_tuple[3])
                else:
                    # datafields: compare the subfield lists
                    sf_rec1 = [data_tuple[0]
                               for data_tuple in rec1_ind[ind_pair]]
                    sf_rec2 = [data_tuple[0]
                               for data_tuple in rec2_ind[ind_pair]]
                    if sf_rec1 != sf_rec2:
                        # change at subfield level/ re-ordered fields
                        for data_tuple in com_ind[ind_pair]:
                            # com_ind will have data_tuples of
                            # record1(upload) and not record2
                            subfield_list = data_tuple[0]
                            record_add_field(result, tag, ind_pair[0],
                                             ind_pair[1], '',
                                             subfields=subfield_list)
    return result
def merge_field_group(rec1, rec2, fnum, ind1="", ind2="", merge_conflicting_fields=False): """Merges non-conflicting fields from 'rec2' to 'rec1' for a specific tag. the second record. @param rec1: First record (a record dictionary structure) @param rec2: Second record (a record dictionary structure) @param fnum: a 3 characters long string indicating field tag number @param ind1: a 1 character long string @param ind2: a 1 character long string @param merge_conflicting_fields: whether to merge conflicting fields or not """ ### Check if merging goes for all indicators and set a boolean merging_all_indicators = not ind1 and not ind2 ### check if there is no field in rec2 to be merged in rec1 if not record_has_field(rec2, fnum): return ### get fields of rec2 if merging_all_indicators: fields2 = record_get_field_instances(rec2, fnum, "%", "%") else: fields2 = record_get_field_instances(rec2, fnum, ind1, ind2) if len(fields2) == 0: return ### check if field in rec1 doesn't even exist if not record_has_field(rec1, fnum): record_add_fields(rec1, fnum, fields2) return ### compare the fields, get diffs for given indicators alldiffs = record_field_diff(rec1[fnum], rec2[fnum], fnum, match_subfields, ind1, ind2) ### check if fields are the same if alldiffs is None: return # nothing to merge ### find the diffing for the fields of the given indicators alldiffs = alldiffs[1] # keep only the list of diffs by indicators (without the 'c') if merging_all_indicators: # combine the diffs for each indicator to one list diff = _combine_diffs(alldiffs) else: # diffing for one indicator for diff in alldiffs: # look for indicator pair in diff result if diff[0] == (ind1, ind2): break else: raise Exception, "Indicators not in diff result." 
diff = diff[1] # keep only the list of diffs (without the indicator tuple) ### proceed to merging fields in a new field list fields1, fields2 = rec1[fnum], rec2[fnum] new_fields = [] if merge_conflicting_fields == False: # merge non-conflicting fields for m in diff: # for every match of fields in the diff if m[0] is not None: # if rec1 has a field in the diff, keep it new_fields.append(deepcopy(fields1[m[0]])) else: # else take the field from rec2 new_fields.append(deepcopy(fields2[m[1]])) else: # merge all fields for m in diff: # for every match of fields in the diff if m[1] is not None: # if rec2 has a field, add it new_fields.append(deepcopy(fields2[m[1]])) if m[0] is not None and fields1[m[0]][0] != fields2[m[1]][0]: # if the fields are not the same then add the field of rec1 new_fields.append(deepcopy(fields1[m[0]])) else: new_fields.append(deepcopy(fields1[m[0]])) ### delete existing fields record_delete_field(rec1, fnum, ind1, ind2) ## find where the new_fields should be inserted in rec1 (insert_index) if merging_all_indicators: insert_index = 0 else: insert_index = None ind_pair = (ind1, ind2) first_last_dict = _first_and_last_index_for_each_indicator(rec1.get(fnum, [])) # find the indicator pair which is just before the one which will be inserted indicators = first_last_dict.keys() indicators.sort() ind_pair_before = None for pair in indicators: if pair > ind_pair: break else: ind_pair_before = pair if ind_pair_before is None: # if no smaller indicator pair exists insert_index = 0 # insertion will take place at the beginning else: # else insert after the last field index of the previous indicator pair insert_index = first_last_dict[ind_pair_before][1] + 1 ### add the new (merged) fields in correct 'in_field_index' position record_add_fields(rec1, fnum, new_fields, insert_index) return
def merge_field_group(rec1, rec2, fnum, ind1='', ind2='', merge_conflicting_fields=False):
    """Merges non-conflicting fields from 'rec2' to 'rec1' for a specific tag.

    @param rec1: First record (a record dictionary structure)
    @param rec2: Second record (a record dictionary structure)
    @param fnum: a 3 characters long string indicating field tag number
    @param ind1: a 1 character long string
    @param ind2: a 1 character long string
    @param merge_conflicting_fields: whether to merge conflicting fields or not
    """
    ### Check if merging goes for all indicators and set a boolean
    merging_all_indicators = not ind1 and not ind2

    ### check if there is no field in rec2 to be merged in rec1
    if not record_has_field(rec2, fnum):
        return

    ### get fields of rec2
    if merging_all_indicators:
        fields2 = record_get_field_instances(rec2, fnum, '%', '%')
    else:
        fields2 = record_get_field_instances(rec2, fnum, ind1, ind2)
    if not fields2:  # idiomatic emptiness check (was: len(fields2) == 0)
        return

    ### check if field in rec1 doesn't even exist
    if not record_has_field(rec1, fnum):
        record_add_fields(rec1, fnum, fields2)
        return

    ### compare the fields, get diffs for given indicators
    alldiffs = record_field_diff(rec1[fnum], rec2[fnum], fnum,
                                 match_subfields, ind1, ind2)
    ### check if fields are the same
    if alldiffs is None:
        return  # nothing to merge

    ### find the diffing for the fields of the given indicators
    alldiffs = alldiffs[1]  # keep only the list of diffs by indicators (without the 'c')
    if merging_all_indicators:
        # combine the diffs for each indicator to one list
        diff = _combine_diffs(alldiffs)
    else:
        # diffing for one indicator
        for diff in alldiffs:  # look for indicator pair in diff result
            if diff[0] == (ind1, ind2):
                break
        else:
            # call form works on both Py2 and Py3 (was: raise Exception, "...")
            raise Exception("Indicators not in diff result.")
        diff = diff[1]  # keep only the list of diffs (without the indicator tuple)

    ### proceed to merging fields in a new field list
    fields1, fields2 = rec1[fnum], rec2[fnum]
    new_fields = []
    if not merge_conflicting_fields:  # merge non-conflicting fields
        for m in diff:  # for every match of fields in the diff
            if m[0] is not None:  # if rec1 has a field in the diff, keep it
                new_fields.append(deepcopy(fields1[m[0]]))
            else:  # else take the field from rec2
                new_fields.append(deepcopy(fields2[m[1]]))
    else:  # merge all fields
        for m in diff:  # for every match of fields in the diff
            if m[1] is not None:  # if rec2 has a field, add it
                new_fields.append(deepcopy(fields2[m[1]]))
                if m[0] is not None and fields1[m[0]][0] != fields2[m[1]][0]:
                    # if the fields are not the same then add the field of rec1
                    new_fields.append(deepcopy(fields1[m[0]]))
            else:
                new_fields.append(deepcopy(fields1[m[0]]))

    ### delete existing fields
    record_delete_field(rec1, fnum, ind1, ind2)

    ## find where the new_fields should be inserted in rec1 (insert_index)
    if merging_all_indicators:
        insert_index = 0
    else:
        ind_pair = (ind1, ind2)
        first_last_dict = _first_and_last_index_for_each_indicator(
            rec1.get(fnum, []))
        # find the indicator pair which is just before the one which will be
        # inserted (sorted() replaces the Py2-only keys()+sort() idiom)
        ind_pair_before = None
        for pair in sorted(first_last_dict.keys()):
            if pair > ind_pair:
                break
            else:
                ind_pair_before = pair
        if ind_pair_before is None:
            # if no smaller indicator pair exists, insert at the beginning
            insert_index = 0
        else:
            # else insert after the last field index of the previous indicator pair
            insert_index = first_last_dict[ind_pair_before][1] + 1

    ### add the new (merged) fields in correct 'in_field_index' position
    record_add_fields(rec1, fnum, new_fields, insert_index)
    return
def tarballs_by_recids(recids, sdir, docname=None, doctype=None, docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    def _expand(token):
        # Turn one "recid" or "low-high" token into a list of integer ids.
        # NOTE(review): the high bound of "low-high" is excluded (plain
        # range() semantics) -- confirm this is the intended behaviour.
        if '-' in token:
            low, high = token.split('-')
            return range(int(low), int(high))
        return [int(token)]

    if ',' in recids:
        list_of_ids = []
        for token in recids.split(','):
            list_of_ids.extend(_expand(token))
    else:
        list_of_ids = list(_expand(recids))

    arXiv_ids = []
    local_files = []
    use_bibdocs = bool(doctype or docname or docformat)
    for recid in list_of_ids:
        rec = get_record(recid)
        if use_bibdocs:
            # Select the latest attached files matching the given filters.
            docs = BibRecDocs(recid).list_latest_files()
            if doctype:
                docs = [d for d in docs if d.get_type() == doctype]
            if docname:
                docs = [d for d in docs if d.get_name() == docname]
            if docformat:
                docs = [d for d in docs if d.get_format() == docformat]
            local_files.extend([(d.get_path(), recid) for d in docs])
        else:
            # Default path: harvest arXiv identifiers from the 037 fields.
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                sources = field_get_subfield_values(afieldinstance, '9')
                if sources and sources[0] == 'arXiv':
                    arXiv_ids.append(
                        field_get_subfield_values(afieldinstance, 'a')[0])

    if use_bibdocs:
        return local_files
    return tarballs_by_arXiv_id(arXiv_ids, sdir)
def task_run_core(): """Perform a search to find records without a texkey. generates a new one and uploads the changes in chunks """ recids = perform_request_search( p='-035:spirestex -035:inspiretex', cc='HEP') write_message("Found %s records to assign texkeys" % len(recids)) processed_recids = [] xml_to_process = [] for count, recid in enumerate(recids): write_message("processing recid %s" % recid) # Check that the record does not have already a texkey has_texkey = False recstruct = get_record(recid) for instance in record_get_field_instances(recstruct, tag="035", ind1="", ind2=""): try: provenance = field_get_subfield_values(instance, "9")[0] except IndexError: provenance = "" try: value = field_get_subfield_values(instance, "z")[0] except IndexError: try: value = field_get_subfield_values(instance, "a")[0] except IndexError: value = "" provenances = ["SPIRESTeX", "INSPIRETeX"] if provenance in provenances and value: has_texkey = True write_message( "INFO: Record %s has already texkey %s" % (recid, value)) if not has_texkey: TexKeySeq = TexkeySeq() new_texkey = "" try: new_texkey = TexKeySeq.next_value(recid) except TexkeyNoAuthorError: write_message(( "WARNING: Record %s has no first author or " "collaboration") % recid) continue except TexkeyNoYearError: write_message("WARNING: Record %s has no year" % recid) continue write_message("Created texkey %s for record %d" % (new_texkey, recid)) xml = create_xml(recid, new_texkey) processed_recids.append(recid) xml_to_process.append(xml) task_update_progress("Done %d out of %d." % (count, len(recids))) task_sleep_now_if_required() # sequence ID to be used in all subsequent tasks sequence_id = str(random.randrange(1, 4294967296)) if xml_to_process: process_chunk(xml_to_process, sequence_id) # Finally, index all the records processed # FIXME: Waiting for sequence id to be fixed # if processed_recids: # submit_bibindex_task(processed_recids, sequence_id) return True
def find_modified_tags(self, common_tags, record1, record2):
    """For each tag common to record1 and record2, check for modifications
    at field-level, indicator-level and subfield-level.

    Returns a dictionary (record structure) of tags with the corresponding
    fields from record1 that have been found to be modified.
    """
    result = {}
    for tag in common_tags:
        # retrieve tag instances of record1 and record2
        rec1_tag_val = record_get_field_instances(record1, tag, '%', '%')
        rec2_tag_val = record_get_field_instances(record2, tag, '%', '%')
        # Group instances by (ind1, ind2). NOTE(review): rec1_ind/rec2_ind
        # are only (re)bound when instances exist -- presumably common_tags
        # guarantees the tag is present in both records (TODO confirm),
        # otherwise a stale or unbound name would be used below.
        if rec1_tag_val:
            rec1_ind = self.group_tag_values_by_indicator(rec1_tag_val)
        if rec2_tag_val:
            rec2_ind = self.group_tag_values_by_indicator(rec2_tag_val)

        # NOTE: At this point rec1_ind and rec2_ind will be dictionaries:
        # Key ==> (ind1, ind2) tuple
        # Val ==> list of data_tuple => [dt1, dt2]
        # dt(n) => ([sfl], ind1, ind2, ctrlfield, fn)

        # Generating 3 different dictionaries:
        # common/added/deleted ind pairs in record1 based on record2
        (com_ind, add_ind, del_ind) = self.compare_tags_by_ind(rec1_ind, rec2_ind)

        # Indicator pairs present only in record1 are copied to the result.
        if add_ind:
            for ind_pair in add_ind:
                for data_tuple in add_ind[ind_pair]:
                    subfield_list = data_tuple[0]
                    record_add_field(result, tag, ind_pair[0], ind_pair[1], '',
                                     subfields=subfield_list)

        # Indicators that are deleted from record1 w.r.t record2 will be
        # added with a special delete code/value marker.
        if del_ind:
            for ind_pair in del_ind:
                record_add_field(result, tag, ind_pair[0], ind_pair[1], '',
                                 [(CFG_BIBUPLOAD_DELETE_CODE, CFG_BIBUPLOAD_DELETE_VALUE)])

        # Common modified fields: identify changes at subfield level.
        if com_ind:
            for ind_pair in com_ind:
                # NOTE: sf_rec1 and sf_rec2 are lists of lists of subfields.
                # A simple list comparison is sufficient in this scenario:
                # any change in the order of fields or in the subfields
                # causes the entire list of data_tuples for that ind_pair
                # to be copied from record1 (upload) to result.
                if tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS:
                    # Control fields: compare raw control-field values.
                    cf_rec1 = [data_tuple[3] for data_tuple in rec1_ind[ind_pair]]
                    cf_rec2 = [data_tuple[3] for data_tuple in rec2_ind[ind_pair]]
                    if cf_rec1 != cf_rec2:
                        for data_tuple in com_ind[ind_pair]:
                            record_add_field(result, tag,
                                             controlfield_value=data_tuple[3])
                else:
                    # Data fields: compare the subfield lists.
                    sf_rec1 = [data_tuple[0] for data_tuple in rec1_ind[ind_pair]]
                    sf_rec2 = [data_tuple[0] for data_tuple in rec2_ind[ind_pair]]
                    if sf_rec1 != sf_rec2:
                        # change at subfield level / re-ordered fields
                        for data_tuple in com_ind[ind_pair]:
                            # com_ind holds the data_tuples of record1
                            # (upload), not those of record2
                            subfield_list = data_tuple[0]
                            record_add_field(result, tag, ind_pair[0],
                                             ind_pair[1], '',
                                             subfields=subfield_list)
    return result
def task_run_core(): """ Performs a search to find records without a texkey, generates a new one and uploads the changes in chunks """ recids = perform_request_search(p='-035:spirestex -035:inspiretex', cc='HEP') write_message("Found %s records to assign texkeys" % len(recids)) processed_recids = [] xml_to_process = [] for count, recid in enumerate(recids): write_message("processing recid %s" % recid) # Check that the record does not have already a texkey has_texkey = False recstruct = get_record(recid) for instance in record_get_field_instances(recstruct, tag="035", ind1="", ind2=""): try: provenance = field_get_subfield_values(instance, "9")[0] except IndexError: provenance = "" try: value = field_get_subfield_values(instance, "z")[0] except IndexError: try: value = field_get_subfield_values(instance, "a")[0] except IndexError: value = "" provenances = ["SPIRESTeX", "INSPIRETeX"] if provenance in provenances and value: has_texkey = True write_message("INFO: Record %s has already texkey %s" % (recid, value)) if not has_texkey: TexKeySeq = TexkeySeq() new_texkey = "" try: new_texkey = TexKeySeq.next_value(recid) except TexkeyNoAuthorError: write_message( "WARNING: Record %s has no first author or collaboration" % recid) continue except TexkeyNoYearError: write_message("WARNING: Record %s has no year" % recid) continue write_message("Created texkey %s for record %d" % (new_texkey, recid)) xml = create_xml(recid, new_texkey) processed_recids.append(recid) xml_to_process.append(xml) task_update_progress("Done %d out of %d." % (count, len(recids))) task_sleep_now_if_required() # sequence ID to be used in all subsequent tasks sequence_id = str(random.randrange(1, 4294967296)) if xml_to_process: process_chunk(xml_to_process, sequence_id) # Finally, index all the records processed #FIXME: Waiting for sequence id to be fixed # if processed_recids: # submit_bibindex_task(processed_recids, sequence_id) return True
def tarballs_by_recids(recids, sdir, docname=None, doctype=None, docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    # Expand the recids string ("1", "1-5", "1,3-5,8", ...) into int ids.
    # NOTE(review): "low-high" uses range(), so the high bound is excluded
    # -- confirm whether this is intentional.
    tokens = recids.split(',') if ',' in recids else [recids]
    list_of_ids = []
    for token in tokens:
        if '-' in token:
            low, high = token.split('-')
            list_of_ids.extend(range(int(low), int(high)))
        else:
            list_of_ids.append(int(token))

    arXiv_ids = []
    local_files = []
    filtering = doctype or docname or docformat
    for recid in list_of_ids:
        rec = get_record(recid)
        if not filtering:
            # Collect arXiv identifiers from the record's 037 fields.
            for inst in record_get_field_instances(rec, tag='037'):
                source = field_get_subfield_values(inst, '9')
                if len(source) > 0 and source[0] == 'arXiv':
                    arXiv_ids.append(field_get_subfield_values(inst, 'a')[0])
        else:
            # Keep only the latest attached files matching every filter.
            candidates = BibRecDocs(recid).list_latest_files()
            if doctype:
                candidates = [f for f in candidates
                              if f.get_type() == doctype]
            if docname:
                candidates = [f for f in candidates
                              if f.get_name() == docname]
            if docformat:
                candidates = [f for f in candidates
                              if f.get_format() == docformat]
            local_files.extend([(f.get_path(), recid) for f in candidates])

    if filtering:
        return local_files
    return tarballs_by_arXiv_id(arXiv_ids, sdir)
def fields(self, tag, escape=0, repeatable_subfields_p=False):
    """
    Return the list of values corresponding to "tag".

    If tag has an undefined subcode (such as 999C5), return a list of
    dictionaries whose keys are the subcodes and whose values are the
    values of tag.subcode. If the tag has a subcode, simply return the
    list of values corresponding to tag.

    Eg. for given MARC::
        999C5 $a value_1a $b value_1b
        999C5 $b value_2b
        999C5 $b value_3b $b value_3b_bis

        >>> bfo.fields('999C5b')
        >>> ['value_1b', 'value_2b', 'value_3b', 'value_3b_bis']
        >>> bfo.fields('999C5')
        >>> [{'a':'value_1a', 'b':'value_1b'}, {'b':'value_2b'},
             {'b':'value_3b'}]

    By default only one value is kept per subfield code (repeatable
    subfields are collapsed; which occurrence survives is undefined).
    Set 'repeatable_subfields_p' to True to keep every occurrence, in
    which case each dict maps a subfield code to a LIST of values:

        >>> bfo.fields('999C5b', repeatable_subfields_p=True)
        >>> ['value_1b', 'value_2b', 'value_3b']
        >>> bfo.fields('999C5', repeatable_subfields_p=True)
        >>> [{'a':['value_1a'], 'b':['value_1b']}, {'b':['value_2b']},
             {'b':['value_3b', 'value3b_bis']}]

    NOTICE THAT THE RETURNED STRUCTURE IS DIFFERENT. Also note that
    whatever the value of 'repeatable_subfields_p' is, bfo.fields('999C5b')
    always shows all fields, even repeatable ones, since the returned
    structure is always a plain list.

    The 'escape' parameter selects how special characters are escaped:
        0. No escaping
        1. Escape all HTML characters
        2. Remove unsafe HTML tags (Eg. keep <br />)
        3. Mix of mode 1 and 2. If value of field starts with
           <!-- HTML -->, then use mode 2. Else use mode 1.
        4. Remove all HTML tags
        5. Same as 2, with more tags allowed (like <img>)
        6. Same as 3, with more tags allowed (like <img>)
        7. Mix of mode 0 and mode 1. If field_value starts with
           <!--HTML-->, then use mode 0. Else use mode 1.
        8. Same as mode 1, but also escape double-quotes
        9. Same as mode 4, but also escape double-quotes

    :param tag: the marc code of a field
    :param escape: 1 if returned values should be escaped. Else 0.
    @repeatable_subfields_p if True, returns the list of subfields in the
                            dictionary
    @return: values of field tag in record
    """
    record = self.get_record()
    if record is None:
        # BibRecord could not parse the object: nothing to return.
        return []

    tag_num, ind1, ind2, subcode = parse_tag(tag)[:4]

    if subcode != "":
        # A subfield code was given: simply return the flat list of values.
        raw_values = record_get_field_values(record, tag_num, ind1, ind2,
                                             subcode)
        if escape == 0:
            return raw_values
        return [escape_field(value, escape) for value in raw_values]

    # No subfield code: return one dict per field instance.
    # (This also covers the control-field case.)
    instances = record_get_field_instances(record, tag_num, ind1, ind2)

    if not repeatable_subfields_p:
        # Collapse repeated subfield codes: last occurrence wins via dict().
        if escape == 0:
            return [dict(instance[0]) for instance in instances]
        return [dict((code, escape_field(value, escape))
                     for code, value in instance[0])
                for instance in instances]

    # Repeatable mode: collect every occurrence of each subfield code.
    list_of_instances = []
    for instance in instances:
        per_code = {}
        for code, value in instance[0]:
            bucket = per_code.setdefault(code, [])
            bucket.append(value if escape == 0
                          else escape_field(value, escape))
        list_of_instances.append(per_code)
    return list_of_instances