def check_record(record, texkey_field="035__a", extra_subfields=()):
    """
    Add a tex key to a record, checking that it doesn't have one already.
    """
    tag = texkey_field[:3]
    ind1, ind2, subfield = texkey_field[3:]
    provenances = list(record.iterfield(texkey_field[:5] + "9"))
    if len(provenances) and provenances[0][1] in ("SPIRESTeX", "INSPIRETeX"):
        for _, val in record.iterfield(texkey_field[:5] + "z"):
            if val:
                return  # Record already has a texkey
    if len(list(record.iterfield(texkey_field))) == 0:
        try:
            texkey = TexkeySeq().next_value(bibrecord=record)
        except TexkeyNoAuthorError:
            record.warn("No first author or collaboration")
            return
        subfields_to_add = [(subfield, texkey)] + map(tuple, extra_subfields)
        record_add_field(record, tag=tag, ind1=ind1, ind2=ind2,
                         subfields=subfields_to_add)
        record.set_amended("Added Tex key '%s' to field %s"
                           % (texkey, texkey_field))
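# A minimal, self-contained sketch (not part of the plugin above) of how a
# 6-character field spec such as "035__a" decomposes: 3-character tag, two
# indicators, then the subfield code. The value is made up for illustration.
texkey_field = "035__a"
tag = texkey_field[:3]                    # '035'
ind1, ind2, subfield = texkey_field[3:]   # '_', '_', 'a'
print("%s %s %s %s" % (tag, ind1, ind2, subfield))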
def merge_record_with_template(rec, template_name, is_hp_record=False):
    """ Extend the record rec with the contents of the template and return it"""
    template = get_record_template(template_name)
    if not template:
        return
    template_bibrec = create_record(template)[0]
    # if the record is a holding pen record make all subfields volatile
    if is_hp_record:
        record_make_all_subfields_volatile(template_bibrec)
    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec, field_tag,
                                 field_instance[1], field_instance[2],
                                 subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(
                    template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(
                                field_instance, code,
                                field_get_subfield_values(
                                    template_field_instance, code)[0])
    record_order_subfields(rec)
    return rec
def bst_openaire_altmetric():
    """ Fetch Altmetric identifiers for records with a DOI and upload them. """
    recids = search_pattern(p="0->Z", f="0247_a")
    a = Altmetric()
    for recid in recids:
        try:
            # Check if we already have an Altmetric id
            sysno_inst = get_fieldvalues(recid, "035__9")
            if 'Altmetric' in sysno_inst:
                continue
            doi_val = get_fieldvalues(recid, "0247_a")[0]
            json_res = a.doi(doi_val)
            rec = {}
            record_add_field(rec, "001", controlfield_value=str(recid))
            if json_res:
                record_add_field(rec, '035', subfields=[
                    ('a', str(json_res['altmetric_id'])),
                    ('9', 'Altmetric')])
                bibupload(rec, opt_mode='correct')
        except AltmetricHTTPException as e:
            register_exception(prefix='Altmetric error (status code %s): %s'
                               % (e.status_code, str(e)),
                               alert_admin=False)
def add_epic_pid(rec, recid, checksum):
    """ Adds EPIC PID to the record. If registration fails, can also fail
    the request if CFG_FAIL_ON_MISSING_PID is set to True """
    CFG_SITE_SECURE_URL = current_app.config.get("CFG_SITE_SECURE_URL")
    location = CFG_SITE_SECURE_URL + '/record/' + str(recid)
    try:
        pid = createHandle(location, checksum)
        record_add_field(rec, '024', ind1='7',
                         subfields=[('2', 'PID'), ('a', pid)])
    except HTTPException as e:
        # If CFG_FAIL_ON_MISSING_PID is not found in invenio-local.conf,
        # default is to assume False
        try:
            from config import CFG_FAIL_ON_MISSING_PID
            fail = bool(CFG_FAIL_ON_MISSING_PID)
        except ImportError:
            fail = False
        current_app.logger.error(
            "Unable to obtain PID from EPIC server {0} {1}: {2}".format(
                e.code, e.name, e))
        if fail:
            raise e
def generate_columns_longer(ds):
    """ A much longer implementation of the column generation. """
    from invenio.legacy.bibrecord import record_add_field
    rec = {}
    # (number, header, title)
    columns = [[num, "", ""] for num in xrange(ds.num_columns)]
    cur_col = 0
    for hd in ds.column_headers:
        for i in xrange(hd["colspan"]):
            columns[cur_col][1] = hd["content"].strip()
            cur_col += 1
    cur_col = 0
    for ct in ds.column_titles:
        for i in xrange(ct["colspan"]):
            columns[cur_col][2] = ct["content"].strip()
            cur_col += 1
    for col in columns:
        subfields = [("n", str(col[0]))]
        if col[2] != "":
            subfields.append(("t", col[2]))
        if col[1] != "":
            subfields.append(("d", col[1]))
        record_add_field(rec, "910", subfields=subfields)
    return rec
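# Stand-alone sketch of the colspan expansion used above: a header with
# colspan=2 fills two consecutive column slots. Plain-Python data only, no
# Invenio dependency; the sample contents are made up.
columns = [[num, "", ""] for num in range(3)]  # (number, header, title)
column_headers = [{"colspan": 2, "content": " A "},
                  {"colspan": 1, "content": "B"}]
cur_col = 0
for hd in column_headers:
    for i in range(hd["colspan"]):
        columns[cur_col][1] = hd["content"].strip()
        cur_col += 1
print(columns)  # [[0, 'A', ''], [1, 'A', ''], [2, 'B', '']]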
def check_records(records, doi_field="0247_a",
                  extra_subfields=(("2", "DOI"), ("9", "bibcheck"))):
    """
    Find the DOI for the records using crossref and add it to the specified
    field.

    This plugin won't ask for the DOI if it's already set.
    """
    records_to_check = {}
    for record in records:
        has_doi = False
        for position, value in record.iterfield("0247_2"):
            if value.lower() == "doi":
                has_doi = True
                break
        if not has_doi:
            records_to_check[record.record_id] = record
    dois = get_doi_for_records(records_to_check.values())
    for record_id, doi in dois.iteritems():
        record = records_to_check[record_id]
        dup_doi_recid = find_record_from_doi(doi)
        if dup_doi_recid:
            record.warn("DOI %s to be added to record %s already exists "
                        "in record/s %s" % (doi, record_id, dup_doi_recid))
            continue
        subfields = [(doi_field[5], doi.encode("utf-8"))] + \
            map(tuple, extra_subfields)
        record_add_field(record, tag=doi_field[:3], ind1=doi_field[3],
                         ind2=doi_field[4], subfields=subfields)
        record.set_amended("Added DOI in field %s" % doi_field)
def generate_final_patch(self, patch_dict, recid):
    """
    Generates patch by merging modified patch and added patch.

    Returns the final merged patch containing modified and added fields.
    """
    def _add_to_record(record, patch):
        for tag in patch:
            for data_tuple in patch[tag]:
                record_add_field(record, tag, data_tuple[1], data_tuple[2],
                                 '', subfields=data_tuple[0])
        return record

    final_patch = {}
    #tag_list = []
    # merge processed and added fields into one patch
    if 'MOD' in patch_dict:
        # tag_list = tag_list + patch_dict['MOD'].items()
        final_patch = _add_to_record(final_patch, patch_dict['MOD'])
    if 'ADD' in patch_dict:
        #tag_list = tag_list + patch_dict['ADD'].items()
        final_patch = _add_to_record(final_patch, patch_dict['ADD'])
    if 'DEL' in patch_dict:
        #tag_list = tag_list + patch_dict['DEL'].items()
        final_patch = _add_to_record(final_patch, patch_dict['DEL'])
    record_add_field(final_patch, '001', ' ', ' ', recid)
    return final_patch
def add_file_info(rec, form, email, sub_id, recid):
    """ Adds the path to the file and access rights to the record. """
    if 'open_access' in form:
        fft_status = 'firerole: allow any\n'
    else:
        fft_status = 'firerole: allow email "{0}"\ndeny all'.format(email)
    for metadata in get_depositing_files_metadata(sub_id):
        path = metadata['file']
        record_add_field(rec, 'FFT',
                         subfields=[('a', path),
                                    ('n', metadata['name']),  # name of the file
                                    #('t', 'Type'),  # TODO
                                    # unfortunately s is used for a timestamp,
                                    # not file size
                                    #('s', 'timestamp'),
                                    #('w', str(metadata['size'])),  # size
                                    # should be derived automatically, but
                                    # storing it into 'document_moreinfo' field
                                    ('r', fft_status)])
        # seems to be impossible to add file size data, thought this would work
        CFG_SITE_SECURE_URL = current_app.config.get("CFG_SITE_SECURE_URL")
        url = u"{0}/record/{1}/files/{2}".format(CFG_SITE_SECURE_URL, recid,
                                                 metadata['name'])
        record_add_field(rec, '856', ind1='4',
                         subfields=[('u', url),
                                    ('s', str(os.path.getsize(path))),
                                    ('y', metadata['name'])])
def add_doi_to_record(recid, doi):
    rec = {}
    record_add_field(rec, '001', controlfield_value=str(recid))
    pid_fields = [('a', doi), ('2', 'DOI')]
    record_add_field(rec, tag='024', ind1='7', subfields=pid_fields)
    from invenio.legacy.bibupload.utils import bibupload_record
    bibupload_record(record=rec, file_prefix='doi', mode='-c',
                     opts=[], alias="doi")
    return rec
def _add_to_record(record, patch):
    for tag in patch:
        for data_tuple in patch[tag]:
            record_add_field(record, tag, data_tuple[1], data_tuple[2], '',
                             subfields=data_tuple[0])
    return record
def rule_add_recid(header, record):
    # if not BIBMATCH_MATCHED in header:
    #     return record
    if '001' in record.keys():
        recid = str(record['001'][0][3])
        _print("Record already has recid %s" % (recid,))
        return record
    recids = REGEX_BIBMATCH_RESULTS.findall(header)
    if len(recids) == 1:
        record_add_field(record, '001', controlfield_value=recids[0])
    return record
def rule_create_fft(header, record):
    for field in record_get_field_instances(record, '856', ind1='4'):
        url = None
        for code, value in field_get_subfield_instances(field):
            if code == 'u':
                url = value
                break
        if url:
            subs = [('a', url),
                    ('t', 'INSPIRE-PUBLIC'),
                    ('d', 'Fulltext')]
            record_add_field(record, 'FFT', subfields=subs)
    return record
def _modify_record(self, recid, test_func, replace_func, include_func,
                   append_colls=[], replace_colls=[]):
    """Generate a record as a MARCXML file.

    @param test_func: Function to test if a collection id should be changed
    @param replace_func: Function to replace the collection id.
    @param include_func: Function to test if collection should be included
    """
    from invenio.legacy.search_engine import get_record
    rec = get_record(recid)
    newcolls = []
    dirty = False
    try:
        colls = rec['980']
        if replace_colls:
            for c in replace_colls:
                newcolls.append([('a', c)])
                dirty = True
        else:
            for c in colls:
                try:
                    # We are only interested in subfield 'a'
                    code, val = c[0][0]
                    if test_func(code, val):
                        c[0][0] = replace_func(code, val)
                        dirty = True
                    if include_func(code, val):
                        newcolls.append(c[0])
                    else:
                        dirty = True
                except IndexError:
                    pass
        for c in append_colls:
            newcolls.append([('a', c)])
            dirty = True
    except KeyError:
        return False
    if not dirty:
        return False
    rec = {}
    record_add_field(rec, '001', controlfield_value=str(recid))
    for subfields in newcolls:
        record_add_field(rec, '980', subfields=subfields)
    return rec
def add_domain_fields(rec, form, meta):
    """
    Adds domain-specific fields. These are just added as name/value pairs
    to field 690.
    """
    for fs in meta.fieldsets:
        if fs.name != 'Generic':  # TODO: this is brittle; get from somewhere
            for k in (fs.optional_fields + fs.basic_fields):
                if k in form and form[k]:
                    fields = form.getlist(k)
                    for f in fields:
                        if f and not f.isspace():
                            record_add_field(rec, '690',
                                             subfields=[('a', k), ('b', f)])
def openaire_altmetric_update(recids, upload=True):
    """ Retrieve Altmetric information for a record. """
    logger.debug("Checking Altmetric for recids %s" % recids)
    a = Altmetric()
    records = []
    for recid in recids:
        logger.debug("Checking Altmetric for recid %s" % recid)
        try:
            # Check if we already have an Altmetric id
            sysno_inst = get_fieldvalues(recid, "035__9")
            if 'Altmetric' in sysno_inst:
                continue
            doi_val = get_fieldvalues(recid, "0247_a")[0]
            logger.debug("Found DOI %s" % doi_val)
            json_res = a.doi(doi_val)
            logger.debug("Altmetric response: %s" % json_res)
            rec = {}
            record_add_field(rec, "001", controlfield_value=str(recid))
            if json_res:
                record_add_field(rec, '035', subfields=[
                    ('a', str(json_res['altmetric_id'])),
                    ('9', 'Altmetric')])
                records.append(rec)
        except AltmetricHTTPException as e:
            logger.warning(
                'Altmetric error for recid %s with DOI %s (status code %s):'
                ' %s' % (recid, doi_val, e.status_code, str(e)))
            register_exception(
                prefix='Altmetric error (status code %s): %s'
                % (e.status_code, str(e)),
                alert_admin=False)
        except IndexError:
            logger.debug("No DOI found")
    if upload and records:
        if len(records) == 1:
            bibupload(record=records[0], file_prefix="altmetric")
        else:
            bibupload(collection=records, file_prefix="altmetric")
    return records
def modify_record_timestamp(revision_xml, last_revision_ts):
    """ Modify tag 005 to add the revision passed as parameter.

    @param revision_xml: marcxml representation of the record to modify
    @type revision_xml: string
    @param last_revision_ts: timestamp to add to 005 tag
    @type last_revision_ts: string

    @return: marcxml with 005 tag modified
    """
    recstruct = create_record(revision_xml)[0]
    if "005" in recstruct:
        record_modify_controlfield(recstruct, "005", last_revision_ts,
                                   field_position_local=0)
    else:
        record_add_field(recstruct, "005",
                         controlfield_value=last_revision_ts)
    return record_xml_output(recstruct)
def _prepare_marcxml(recid_a, rn_a, recid_b, rn_b, what_is_a_for_b,
                     what_is_b_for_a, display_in_a=True, display_in_b=True):
    record_a = {}
    record_b = {}
    record_add_field(record_a, "001", controlfield_value=str(recid_a))
    record_add_field(record_a, CFG_OTHER_RELATIONSHIP_ENTRY,
                     ind1=display_in_a and "0" or "1",
                     subfields=[('i', what_is_b_for_a),
                                ('r', rn_b),
                                ('w', str(recid_b))])
    record_add_field(record_b, "001", controlfield_value=str(recid_b))
    record_add_field(record_b, CFG_OTHER_RELATIONSHIP_ENTRY,
                     ind1=display_in_b and "0" or "1",
                     subfields=[('i', what_is_a_for_b),
                                ('r', rn_a),
                                ('w', str(recid_a))])
    return "<collection>\n%s\n%s</collection>" % (record_xml_output(record_a),
                                                  record_xml_output(record_b))
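# Note on the "display_in_a and '0' or '1'" construct above: it is the
# pre-Python-2.5 idiom for a conditional expression, safe here because the
# string '0' is truthy. A modern equivalent:
display_in_a = True
ind1 = "0" if display_in_a else "1"
print(ind1)  # '0'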
def main():
    from invenio.legacy.search_engine import get_record
    from invenio.legacy.bibupload.engine import (
        bibupload,
    )
    from invenio.legacy.bibrecord import (
        record_add_field,
        record_delete_field,
    )

    # Loop through list of records
    for r in RECORDS:
        old_rec = get_record(r)
        rec = get_record(r)
        if not rec:
            break
        print('Processing record: {0}'.format(r))
        # pprint(rec)
        old_690 = [f[0] for f in rec.get('690', [])]
        new_690 = []
        for f in old_690:
            a = f[0]
            b = f[1]
            t = [a, (b[0], VALUES.get(r))] \
                if (a[0] == 'a' and a[1] == 'language_code'
                    and b[0] == 'b' and VALUES.get(r)) \
                else f
            new_690.append(t)
        if new_690 != old_690:
            record_delete_field(rec, '690')
            for f in new_690:
                record_add_field(rec, '690', subfields=f)
            # pprint(rec)
            print('\nOld 690:')
            pprint(old_rec.get('690'))
            print('\nNew 690:')
            pprint(rec.get('690'))
            if raw_input('Bibupload (y/n)? ') == 'y':
                bibupload(rec, 'delete')
                sleep(5)
                bibupload(rec, 'replace')
def create_xml(recid, texkey):
    """ Create the marcxml snippet with the new texkey.

    @param recid: recid of the record to be updated
    @type: int
    @param texkey: texkey that has been generated
    @type: str

    @return: marcxml with the fields to be record_add_field
    @rtype: str
    """
    record = {}
    record_add_field(record, '001', controlfield_value=str(recid))
    subfields_toadd = [('a', texkey), ('9', 'INSPIRETeX')]
    record_add_field(record, tag='035', subfields=subfields_toadd)
    return print_rec(record)
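# Hedged sketch of the kind of MARCXML create_xml() is expected to return
# for recid=1234 and a made-up texkey (exact whitespace and attribute order
# depend on print_rec; this literal is illustrative only):
expected = """<record>
  <controlfield tag="001">1234</controlfield>
  <datafield tag="035" ind1=" " ind2=" ">
    <subfield code="a">Smith:2013abc</subfield>
    <subfield code="9">INSPIRETeX</subfield>
  </datafield>
</record>"""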
def create_marc(form, sub_id, email):
    """
    Generates MARC data used by Invenio from the filled out form, then
    submits it to the Invenio system.
    """
    rec = {}
    recid = create_recid()
    record_add_field(rec, '001', controlfield_value=str(recid))
    add_basic_fields(rec, form, email)
    add_domain_fields(rec, form)
    add_file_info(rec, form, email, sub_id, recid)
    checksum = create_checksum(rec, sub_id)
    add_epic_pid(rec, recid, checksum)
    marc = record_xml_output(rec)
    return recid, marc
def add_file_info(rec, form, email, sub_id, recid):
    """ Adds the path to the file and access rights to the record. """
    CFG_B2SHARE_UPLOAD_FOLDER = current_app.config.get(
        "CFG_B2SHARE_UPLOAD_FOLDER")
    upload_dir = os.path.join(CFG_B2SHARE_UPLOAD_FOLDER, sub_id)
    files = os.listdir(upload_dir)
    if 'open_access' in form:
        fft_status = 'firerole: allow any\n'
    else:
        fft_status = 'firerole: allow email "{0}"\ndeny all'.format(email)
    for f in files:
        path = os.path.join(upload_dir, f)
        if f.startswith('metadata_'):
            # we do not want to load file metadata into Invenio as files;
            # it will be extracted into MARC fields
            continue
        # load corresponding metadata file
        metadata = {}
        metadata_filename = os.path.join(upload_dir, 'metadata_' + f)
        if os.path.isfile(metadata_filename):
            # expecting to load a dict with the following structure:
            # dict(name=name, file=file_path, size=size)
            metadata = pickle.load(open(metadata_filename, 'rb'))
        else:
            current_app.logger.error(
                "Submitted file '%s' is missing metadata file, using default"
                % f)
            metadata = dict(name=f, file=path,
                            size=str(os.path.getsize(path)))
        record_add_field(rec, 'FFT',
                         subfields=[('a', path),
                                    ('n', metadata['name']),  # name of the file
                                    #('t', 'Type'),  # TODO
                                    # unfortunately s is used for a timestamp,
                                    # not file size
                                    #('s', 'timestamp'),
                                    #('w', str(metadata['size'])),  # size
                                    # should be derived automatically, but
                                    # storing it into 'document_moreinfo' field
                                    ('r', fft_status)])
        # seems to be impossible to add file size data, thought this would work
        CFG_SITE_SECURE_URL = current_app.config.get("CFG_SITE_SECURE_URL")
        url = "{0}/record/{1}/files/{2}".format(CFG_SITE_SECURE_URL, recid, f)
        record_add_field(rec, '856', ind1='4',
                         subfields=[('u', url),
                                    ('s', str(os.path.getsize(path))),
                                    ('y', metadata['name'])])
def process_record(self, record):
    """@see: BaseFieldCommand.process_record"""
    # if the tag is empty, we don't make any changes
    if self._tag == "" or self._tag is None:
        return
    field_number = bibrecord.record_add_field(record, self._tag,
                                              self._ind1, self._ind2)
    self._apply_subfield_commands_to_field(record, field_number)
def openaire_altmetric_update(recids, upload=True):
    """ Retrieve Altmetric information for a record. """
    logger.debug("Checking Altmetric for recids %s" % recids)
    a = Altmetric()
    records = []
    for recid in recids:
        logger.debug("Checking Altmetric for recid %s" % recid)
        try:
            # Check if we already have an Altmetric id
            sysno_inst = get_fieldvalues(recid, "035__9")
            if 'Altmetric' in sysno_inst:
                continue
            doi_val = get_fieldvalues(recid, "0247_a")[0]
            logger.debug("Found DOI %s" % doi_val)
            json_res = a.doi(doi_val)
            logger.debug("Altmetric response: %s" % json_res)
            rec = {}
            record_add_field(rec, "001", controlfield_value=str(recid))
            if json_res:
                record_add_field(rec, '035', subfields=[
                    ('a', str(json_res['altmetric_id'])),
                    ('9', 'Altmetric')])
                records.append(rec)
        except AltmetricHTTPException as e:
            logger.warning(
                'Altmetric error for recid %s with DOI %s (status code %s):'
                ' %s' % (recid, doi_val, e.status_code, str(e)))
            register_exception(
                prefix='Altmetric error (status code %s): %s'
                % (e.status_code, str(e)),
                alert_admin=False)
        except IndexError:
            logger.debug("No DOI found")
def create_marc(form, sub_id, email, meta):
    """
    Generates MARC data used by Invenio from the filled out form, then
    submits it to the Invenio system.
    """
    rec = {}
    recid = create_recid()
    record_add_field(rec, '001', controlfield_value=str(recid))
    add_basic_fields(rec, form, meta)
    record_add_field(rec, '856', ind1='0', subfields=[('f', email)])
    add_domain_fields(rec, form, meta)
    add_file_info(rec, form, email, sub_id, recid)
    checksum = create_checksum(rec, sub_id)
    add_epic_pid(rec, recid, checksum)
    marc = record_xml_output(rec)
    return recid, marc
def create_checksum(rec, sub_id, buffersize=64 * 1024):
    """ Creates a checksum of all the files in the record, and adds it to
    the MARC.

    Returns: checksum as a hex string
    """
    sha = hashlib.sha256()
    files_metadata = get_depositing_files_metadata(sub_id)
    files = [f['file'] for f in files_metadata]
    for filepath in sorted(files):
        with open(filepath, 'rb', buffering=0) as fp:
            while True:
                block = fp.read(buffersize)
                if not block:
                    break
                sha.update(block)
    cs = sha.hexdigest()
    record_add_field(rec, '024', ind1='7',
                     subfields=[('2', 'checksum'), ('a', cs)])
    return cs
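# The chunked-hashing pattern above, demonstrated stand-alone so it can be
# run without Invenio. The file names in the usage comment are made up.
import hashlib

def sha256_of_files(paths, buffersize=64 * 1024):
    """Hash several files into one digest, reading in fixed-size blocks."""
    sha = hashlib.sha256()
    for filepath in sorted(paths):
        with open(filepath, 'rb') as fp:
            while True:
                block = fp.read(buffersize)
                if not block:
                    break
                sha.update(block)
    return sha.hexdigest()

# Example (assumes these files exist):
# print(sha256_of_files(['a.dat', 'b.dat']))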
def add_file_info(rec, form, email, sub_id, recid):
    """ Adds the path to the file and access rights to the record. """
    if 'open_access' in form:
        fft_status = 'firerole: allow any\n'
    else:
        fft_status = 'firerole: allow email "{0}"\ndeny all'.format(email)
    if 'embargo_till' in form:
        embargodate = parser.parse(form['embargo_till'])
        embargodate = datetime.strftime(embargodate, '%Y-%m-%d')
        fft_status = 'firerole: deny until "%s"\nallow any\n' % embargodate
    for metadata in get_depositing_files_metadata(sub_id):
        path = metadata['file']
        record_add_field(
            rec, 'FFT',
            subfields=[('a', path),
                       ('n', metadata['name']),  # name of the file
                       #('t', 'Type'),  # TODO
                       # unfortunately s is used for a timestamp, not file size
                       #('s', 'timestamp'),
                       #('w', str(metadata['size'])),  # size should be derived
                       # automatically, but storing it into
                       # 'document_moreinfo' field
                       ('r', fft_status)])
        # seems to be impossible to add file size data, thought this would work
        CFG_SITE_SECURE_URL = current_app.config.get("CFG_SITE_SECURE_URL")
        url = u"{0}/record/{1}/files/{2}".format(CFG_SITE_SECURE_URL, recid,
                                                 metadata['name'])
        record_add_field(rec, '856', ind1='4',
                         subfields=[('u', url),
                                    ('s', str(os.path.getsize(path))),
                                    ('y', metadata['name'])])
def add_domain_fields(rec, form):
    """
    Adds domain-specific fields. These are just added as name/value pairs
    to field 690.
    """
    domain = form['domain'].lower()
    if domain in metadata_classes():
        meta = metadata_classes()[domain]()
    else:
        # no domain stuff
        return
    for fs in meta.fieldsets:
        if fs.name != 'Generic':  # TODO: this is brittle; get from somewhere
            for k in (fs.optional_fields + fs.basic_fields):
                if k in form and form[k]:
                    fields = form.getlist(k)
                    for f in fields:
                        if f and not f.isspace():
                            record_add_field(rec, '690',
                                             subfields=[('a', k), ('b', f)])
def create_checksum(rec, sub_id, buffersize=64 * 1024):
    """ Creates a checksum of all the files in the record, and adds it to
    the MARC.

    Returns: checksum as a hex string
    """
    sha = hashlib.sha256()
    CFG_B2SHARE_UPLOAD_FOLDER = current_app.config.get(
        "CFG_B2SHARE_UPLOAD_FOLDER")
    upload_dir = os.path.join(CFG_B2SHARE_UPLOAD_FOLDER, sub_id)
    files = sorted(os.listdir(upload_dir))
    for f in files:
        filepath = os.path.join(upload_dir, f)
        with open(filepath, 'rb', buffering=0) as fp:
            while True:
                block = fp.read(buffersize)
                if not block:
                    break
                sha.update(block)
    cs = sha.hexdigest()
    record_add_field(rec, '024', ind1='7',
                     subfields=[('2', 'checksum'), ('a', cs)])
    return cs
def merge_record_with_template(rec, template_name):
    """ Extend the record rec with the contents of the template and return it"""
    template = get_record_template(template_name)
    if not template:
        return
    template_bibrec = create_record(template)[0]
    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec, field_tag,
                                 field_instance[1], field_instance[2],
                                 subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(
                    template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(
                                field_instance, code,
                                field_get_subfield_values(
                                    template_field_instance, code)[0])
    return rec
def control_actions(record, curate=None, archive=None, publish=None):
    rec = {}
    record_add_field(rec, '001', controlfield_value=str(record['recid']))
    if curate is None:
        curate = record.get('record_curated_in_project', False)
    if archive is None:
        archive = record.get('record_selected_for_archive', False)
    if publish is None:
        publish = record.get('record_public_from_project', False)
    project_info_fields = [('a', '%s' % curate)]
    record_add_field(rec, tag='983', ind1='_', ind2='_',
                     subfields=project_info_fields)
    project_info_fields = [('b', '%s' % publish)]
    record_add_field(rec, tag='983', ind1='_', ind2='_',
                     subfields=project_info_fields)
    project_info_fields = [('c', '%s' % archive)]
    record_add_field(rec, tag='983', ind1='_', ind2='_',
                     subfields=project_info_fields)
    from invenio.legacy.bibupload.utils import bibupload_record
    bibupload_record(record=rec, file_prefix='project_info', mode='-c',
                     opts=[], alias="project_info")
def find_modified_tags(self, common_tags, record1, record2):
    """
    For each tag common to Record1 and Record2, checks for modifications
    at field-level, indicator-level and subfield-level.

    Returns a dictionary of tags and corresponding fields from Record1
    that have been found to have been modified.
    """
    result = {}
    for tag in common_tags:
        # retrieve tag instances of record1 and record2
        rec1_tag_val = record_get_field_instances(record1, tag, '%', '%')
        rec2_tag_val = record_get_field_instances(record2, tag, '%', '%')
        if rec1_tag_val:
            rec1_ind = self.group_tag_values_by_indicator(rec1_tag_val)
        if rec2_tag_val:
            rec2_ind = self.group_tag_values_by_indicator(rec2_tag_val)

        # NOTE: At this point rec1_ind and rec2_ind will be dictionaries
        # Key ==> (ind1, ind2) tuple
        # Val ==> list of data_tuple => [dt1, dt2]
        # dt(n) => ([sfl], ind1, ind2, ctrlfield, fn)

        # Generating 3 different dictionaries:
        # common/added/deleted ind pairs in record1 based on record2
        (com_ind, add_ind, del_ind) = self.compare_tags_by_ind(rec1_ind,
                                                               rec2_ind)
        if add_ind:
            for ind_pair in add_ind:
                for data_tuple in add_ind[ind_pair]:
                    subfield_list = data_tuple[0]
                    record_add_field(result, tag, ind_pair[0], ind_pair[1],
                                     '', subfields=subfield_list)

        # Indicators that are deleted from record1 w.r.t record2 will be
        # added with a special code
        if del_ind:
            for ind_pair in del_ind:
                record_add_field(result, tag, ind_pair[0], ind_pair[1], '',
                                 [(CFG_BIBUPLOAD_DELETE_CODE,
                                   CFG_BIBUPLOAD_DELETE_VALUE)])

        # Common modified fields. Identifying changes at subfield level
        if com_ind:
            for ind_pair in com_ind:
                # NOTE: sf_rec1 and sf_rec2 are lists of lists of subfields.
                # A simple list comparison is sufficient in this scenario.
                # Any change in the order of fields or changes in subfields
                # will cause the entire list of data_tuples for that ind_pair
                # to be copied from record1 (upload) to result.
                if tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS:
                    cf_rec1 = [data_tuple[3]
                               for data_tuple in rec1_ind[ind_pair]]
                    cf_rec2 = [data_tuple[3]
                               for data_tuple in rec2_ind[ind_pair]]
                    if cf_rec1 != cf_rec2:
                        for data_tuple in com_ind[ind_pair]:
                            record_add_field(result, tag,
                                             controlfield_value=data_tuple[3])
                else:
                    sf_rec1 = [data_tuple[0]
                               for data_tuple in rec1_ind[ind_pair]]
                    sf_rec2 = [data_tuple[0]
                               for data_tuple in rec2_ind[ind_pair]]
                    if sf_rec1 != sf_rec2:
                        # change at subfield level / re-ordered fields
                        for data_tuple in com_ind[ind_pair]:
                            # com_ind will have data_tuples of record1
                            # (upload) and not record2
                            subfield_list = data_tuple[0]
                            record_add_field(result, tag, ind_pair[0],
                                             ind_pair[1], '',
                                             subfields=subfield_list)
    return result
def tweet_to_record(tweet, query):
    """
    Transform a tweet into a record.

    @note: you may want to highly customize this.
    """
    rec = {}
    ## Let's normalize the body of the tweet.
    text = tweet.text.encode('UTF-8')
    text = text.replace('&gt;', '>')
    text = text.replace('&lt;', '<')
    text = text.replace('&quot;', "'")
    text = text.replace('&amp;', '&')

    ## Let's add the creation date
    try:
        creation_date = time.strptime(tweet.created_at,
                                      '%a, %d %b %Y %H:%M:%S +0000')
    except ValueError:
        creation_date = time.strptime(tweet.created_at,
                                      '%a %b %d %H:%M:%S +0000 %Y')
    record_add_field(rec, '260',
                     subfields=[('c', time.strftime('%Y-%m-%dT%H:%M:%SZ',
                                                    creation_date))])

    ## Let's add the Tweet ID
    record_add_field(rec, '970', subfields=[('a', str(tweet.id))])

    ## Let's add the body of the tweet as an abstract
    record_add_field(rec, '520', subfields=[('a', text)])

    ## Let's re-add the body of the tweet as a title.
    record_add_field(rec, '245', subfields=[('a', text)])

    ## Let's fetch information about the user
    try:
        user = _TWITTER_API.GetUser(tweet.from_user)

        ## Let's add the user name as author of the tweet
        record_add_field(rec, '100',
                         subfields=[('a', str(user.name.encode('UTF-8')))])

        ## Let's fetch the icon of the user profile, and let's upload it as
        ## an image (and an icon of itself)
        record_add_field(rec, 'FFT', subfields=[
            ('a', user.profile_image_url.encode('UTF-8')),
            ('x', user.profile_image_url.encode('UTF-8'))])
    except Exception as err:
        write_message("WARNING: issue when fetching the user: %s" % err,
                      stream=sys.stderr)

    if hasattr(tweet, 'iso_language_code'):
        ## Let's add the language of the Tweet if available (also this
        ## depends on the kind of Twitter API call we used)
        record_add_field(rec, '045', subfields=[
            ('a', tweet.iso_language_code.encode('UTF-8'))])

    ## Let's tag this record as a TWEET so that later we can build a
    ## collection out of these records.
    record_add_field(rec, '980', subfields=[('a', 'TWEET'), ('b', query)])

    ## Some smart manipulations: let's parse out URLs and tags from the body
    ## of the Tweet.
    for url in _RE_GET_HTTP.findall(text):
        url = url[0]
        record_add_field(rec, '856', '4', subfields=[('u', url)])

    for tag in _RE_TAGS.findall(text):
        ## And here we add the keywords.
        record_add_field(rec, '653', '1',
                         subfields=[('a', tag), ('9', 'TWITTER')])

    ## Finally we shall serialize everything to MARCXML
    return record_xml_output(rec)
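# _RE_GET_HTTP and _RE_TAGS are defined elsewhere in the tasklet; the
# patterns below are plausible stand-ins (assumptions, not the original
# definitions) that make the URL/hashtag extraction above concrete. The
# outer capture group makes findall() return tuples, which matches the
# url[0] access in the loop.
import re
_RE_GET_HTTP = re.compile(r"((https?)://\S+)")
_RE_TAGS = re.compile(r"#(\w+)")
text = "New paper http://cds.cern.ch/record/1 #physics"
print([url[0] for url in _RE_GET_HTTP.findall(text)])
# ['http://cds.cern.ch/record/1']
print(_RE_TAGS.findall(text))  # ['physics']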
def add_basic_fields(rec, form, meta):
    """
    Adds the basic fields from the form. Note that these fields are mapped
    to specific MARC fields. For information on the fields see the
    www.loc.gov website. For example
    http://www.loc.gov/marc/bibliographic/bd260.html contains information
    on field 260 for publication data.
    """
    # why aren't subfields a dictionary?!
    try:
        if form.get('title'):
            record_add_field(rec, '245', subfields=[
                ('a', remove_html_markup(form['title']))])
        if form.get('creator'):
            fields = form.getlist('creator')
            for f in fields:
                if f and not f.isspace():
                    record_add_field(rec, '100', subfields=[
                        ('a', remove_html_markup(f.strip()))])
        if form.get('domain'):
            record_add_field(rec, '980', subfields=[
                ('a', remove_html_markup(form['domain']))])
        pubfields = []
        pubfields.append(('b', remove_html_markup(
            form.get('publisher', meta.publisher_default))))
        if form.get('publication_date'):
            pubfields.append(('c',
                              remove_html_markup(form['publication_date'])))
        if pubfields:
            record_add_field(rec, '260', subfields=pubfields)
        if 'open_access' in form:
            record_add_field(rec, '542', subfields=[('l', 'open')])
        else:
            record_add_field(rec, '542', subfields=[('l', 'restricted')])
        if form.get('licence'):
            record_add_field(rec, '540', subfields=[
                ('a', remove_html_markup(form['licence']))])
        record_add_field(rec, '520', subfields=[
            ('a', remove_html_markup(form['description']))])
        if form.get('contact_email'):
            record_add_field(rec, '270', subfields=[
                ('m', remove_html_markup(form['contact_email']))])
        if form.get('keywords'):
            for f in form.getlist('keywords'):
                for kw in f.split(','):
                    if kw and not kw.isspace():
                        record_add_field(rec, '653', ind1='1', subfields=[
                            ('a', remove_html_markup(kw.strip()))])
        if form.get('contributors'):
            fields = form.getlist('contributors')
            for f in fields:
                if f and not f.isspace():
                    record_add_field(rec, '700', subfields=[
                        ('a', remove_html_markup(f.strip()))])
        record_add_field(rec, '546', subfields=[
            ('a', remove_html_markup(
                form.get('language', meta.language_default)))])
        if form.get('resource_type'):
            fields = form.getlist('resource_type')
            for f in fields:
                record_add_field(rec, '337', subfields=[
                    ('a', remove_html_markup(f))])
        # Special case for the 'Linguistics' domain:
        # All the ling_resource_type(s) are also resource_type(s),
        # going into '337'
        if form.get('ling_resource_type'):
            fields = form.getlist('ling_resource_type')
            for f in fields:
                record_add_field(rec, '337', subfields=[
                    ('a', remove_html_markup(f))])
        if form.get('alternate_identifier'):
            record_add_field(rec, '024', subfields=[
                ('a', remove_html_markup(form['alternate_identifier']))])
        if form.get('version'):
            record_add_field(rec, '250', subfields=[
                ('a', remove_html_markup(form['version']))])
        if form.get('discipline'):
            fields = form.getlist('discipline')
            for f in fields:
                record_add_field(rec, '526', subfields=[
                    ('a', remove_html_markup(f))])
        CFG_SITE_NAME = current_app.config.get("CFG_SITE_NAME")
        record_add_field(rec, '264', subfields=[
            ('b', CFG_SITE_NAME),
            ('c', str(datetime.utcnow()) + " UTC")])
    except Exception as e:
        current_app.logger.error(e)
        raise
def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    if run_sql("SELECT id FROM schTASK WHERE proc='bibupload:oairepository'"
               " AND status='WAITING'"):
        write_message("Previous requests of oairepository still being"
                      " elaborated. Let's skip this execution.")
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD,
                                               type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported"
                  % (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD, type='e')
        write_message("%s recids should be in %s. Currently %s are in %s"
                      % (len(should_recids), set_spec, len(current_recids),
                         set_spec), verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s"
                      % (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s"
                      % (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should hence be updated for %s"
                      % (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported"
                  % len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should be updated"
                  % (len(all_affected_recids)), verbose=2)
    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                             prefix='oairepository_' +
                             time.strftime("%Y%m%d_%H%M%S_",
                                           time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records."
                             % (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s"
                          % (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s"
                          % (oai_id_entry, recid), verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s"
                      % (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message("Record %s currently doesn't belong anymore to these"
                      " oai_sets: %s"
                      % (recid, ", ".join(current_previous_oai_sets)),
                      verbose=3)

        # Get the sets that should be in this record according to settings
        updated_oai_sets = set(_set for _set, _recids
                               in iteritems(recids_for_set)
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s"
                      % (recid, ", ".join(updated_oai_sets)), verbose=3)

        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets) |
            (current_oai_sets - updated_oai_sets))
        write_message("Record %s now doesn't belong anymore to these"
                      " oai_sets: %s"
                      % (recid, ", ".join(updated_previous_oai_sets)),
                      verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and the OAI ID does not need to be added, then great, nothing to
        # change. Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!"
                          % recid, verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!"
                      % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001",
                         controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename, '-n',
                                              '-Noairepository', '-P', '-1')
                else:
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename,
                                              '-Noairepository', '-P', '-1')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                     prefix='oairepository_' +
                                     time.strftime("%Y%m%d_%H%M%S_",
                                                   time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename)
    else:
        os.remove(filename)

    return True
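# The set bookkeeping in the task above, sketched with plain Python sets
# instead of intbitset (identical semantics for these operators); the
# recids are made up.
should_recids = {1, 2, 3}    # records that should be in the OAI set
current_recids = {2, 3, 4}   # records currently flagged as in the set
to_add = should_recids - current_recids      # {1}
to_remove = current_recids - should_recids   # {4}
affected = to_add | to_remove                # {1, 4}
print("%s %s %s" % (sorted(to_add), sorted(to_remove), sorted(affected)))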
def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD,
                                               type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported"
                  % (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD, type='e')
        write_message("%s recids should be in %s. Currently %s are in %s"
                      % (len(should_recids), set_spec, len(current_recids),
                         set_spec), verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s"
                      % (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s"
                      % (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should hence be updated for %s"
                      % (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported"
                  % len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should be updated"
                  % (len(all_affected_recids)), verbose=2)
    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                             prefix='oairepository_' +
                             time.strftime("%Y%m%d_%H%M%S_",
                                           time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records."
                             % (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s"
                          % (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s"
                          % (oai_id_entry, recid), verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s"
                      % (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message("Record %s currently doesn't belong anymore to these"
                      " oai_sets: %s"
                      % (recid, ", ".join(current_previous_oai_sets)),
                      verbose=3)

        # Get the sets that should be in this record according to settings
        updated_oai_sets = set(_set for _set, _recids
                               in iteritems(recids_for_set)
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s"
                      % (recid, ", ".join(updated_oai_sets)), verbose=3)

        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets) |
            (current_oai_sets - updated_oai_sets))
        write_message("Record %s now doesn't belong anymore to these"
                      " oai_sets: %s"
                      % (recid, ", ".join(updated_previous_oai_sets)),
                      verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and the OAI ID does not need to be added, then great, nothing to
        # change. Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!"
                          % recid, verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!"
                      % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001",
                         controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename, '-n')
                else:
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename)
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                     prefix='oairepository_' +
                                     time.strftime("%Y%m%d_%H%M%S_",
                                                   time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename)
    else:
        os.remove(filename)

    return True
def add_field(self, tag, value, subfields=None):
    """ Add a field """
    tag = tag.replace("_", " ")
    record_add_field(self, tag[:3], tag[3], tag[4], value, subfields)
    self.set_amended("Added field %s" % tag)
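# Stand-alone sketch of the tag handling in add_field() above: a
# 5-character spec such as "980__" is normalized ("_" standing in for a
# blank indicator) and split into tag plus indicators.
tag = "980__".replace("_", " ")
print("%r %r %r" % (tag[:3], tag[3], tag[4]))  # '980' ' ' ' '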
def perform_request_record(requestType, uid, data):
    """Handle 'major' record related requests.

    Handle retrieving, submitting or cancelling the merging session.
    """
    #TODO add checks before submission and cancel, replace get_bibrecord call
    result = {'resultCode': 0, 'resultText': ''}
    recid1 = data["recID1"]
    record1 = _get_record(recid1, uid, result)
    if result['resultCode'] != 0:
        # if record not accessible return error information
        return result

    if requestType == 'submit':
        if 'duplicate' in data:
            recid2 = data['duplicate']
            record2 = _get_record_slave(recid2, result, 'recid', uid)
            if result['resultCode'] != 0:
                # return in case of error
                return result
            (errcode, message) = check_doi_status_after_merge(
                data["recID1"], data['duplicate'], record1, record2,
                record2_marked_as_duplicate_p='duplicate' in data,
                submit_confirmed_p=data.get(
                    'additional_data',
                    {'confirmed_submit': False}).get('confirmed_submit',
                                                     False))
            if errcode:
                result['resultCode'] = errcode
                result['resultText'] = message
                return result

            # mark record2 as deleted
            record_add_field(record2, '980', ' ', ' ', '',
                             [('c', 'DELETED')])
            # mark record2 as duplicate of record1
            record_add_field(record2, '970', ' ', ' ', '',
                             [('d', str(recid1))])
            # add recid of deleted record to master record
            record_add_field(record1, '981', ' ', ' ', '',
                             [('a', str(recid2))])

            # To ensure updates happen in order, use a seq id
            sequence_id = str(random.randrange(1, 4294967296))

            # submit record2 to be deleted
            xml_record2 = record_xml_output(record2)
            save_xml_record(recid2, uid, xml_record2, task_name="bibmerge",
                            sequence_id=sequence_id)

            # submit record1
            xml_record1 = record_xml_output(record1)
            save_xml_record(recid1, uid, xml_record1, task_name="bibmerge",
                            sequence_id=sequence_id)

            # Delete cache file if it exists
            if cache_exists(recid1, uid):
                delete_cache(recid1, uid)

            result['resultText'] = 'Records submitted'
            return result

        (errcode, message) = check_doi_status_after_merge(
            data["recID1"], data["recID2"], record1, None,
            submit_confirmed_p=data.get(
                'additional_data',
                {'confirmed_submit': False}).get('confirmed_submit', False))
        if errcode:
            result['resultCode'] = errcode
            result['resultText'] = message
            return result

        # submit record1 from cache
        save_xml_record(recid1, uid, task_name="bibmerge")

        # Delete cache file if it exists
        if cache_exists(recid1, uid):
            delete_cache(recid1, uid)

        result['resultText'] = 'Record submitted'
        return result

    elif requestType == 'cancel':
        delete_cache(recid1, uid)
        result['resultText'] = 'Cancelled'
        return result

    recid2 = data["recID2"]
    mode = data['record2Mode']
    record2 = _get_record_slave(recid2, result, mode, uid)
    if result['resultCode'] != 0:
        # if record not accessible return error information
        return result

    if requestType == 'getRecordCompare':
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1,
                                                                   record2)
        result['resultText'] = 'Records compared'
    elif requestType == 'recCopy':
        copy_R2_to_R1(record1, record2)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1,
                                                                   record2)
        result['resultText'] = 'Record copied'
    elif requestType == 'recMerge':
        merge_record(record1, record2, merge_conflicting_fields=True)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1,
                                                                   record2)
        result['resultText'] = 'Records merged'
    elif requestType == 'recMergeNC':
        merge_record(record1, record2, merge_conflicting_fields=False)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1,
                                                                   record2)
        result['resultText'] = 'Records merged'
    else:
        result['resultCode'], result['resultText'] = 1, 'Wrong request type'
    return result
def add_basic_fields(rec, form, email):
    """
    Adds the basic fields from the form. Note that these fields are mapped
    to specific MARC fields. For information on the fields see the
    www.loc.gov website. For example
    http://www.loc.gov/marc/bibliographic/bd260.html contains information
    on field 260 for publication data.
    """
    # why aren't subfields a dictionary?!
    try:
        if form['title']:
            record_add_field(rec, '245', subfields=[
                ('a', remove_html_markup(form['title']))])
        if form['creator']:
            fields = form.getlist('creator')
            for f in fields:
                if f and not f.isspace():
                    record_add_field(rec, '100', subfields=[
                        ('a', remove_html_markup(f.strip()))])
        if form['domain']:
            record_add_field(rec, '980', subfields=[
                ('a', remove_html_markup(form['domain']))])
        pubfields = []
        if form['publisher']:
            pubfields.append(('b', remove_html_markup(form['publisher'])))
        if form.get('publication_date'):
            pubfields.append(('c',
                              remove_html_markup(form['publication_date'])))
        if pubfields:
            record_add_field(rec, '260', subfields=pubfields)
        record_add_field(rec, '856', ind1='0', subfields=[('f', email)])
        if 'open_access' in form:
            record_add_field(rec, '542', subfields=[('l', 'open')])
        else:
            record_add_field(rec, '542', subfields=[('l', 'restricted')])
        if form['licence']:
            record_add_field(rec, '540', subfields=[
                ('a', remove_html_markup(form['licence']))])
        record_add_field(rec, '520', subfields=[
            ('a', remove_html_markup(form['description']))])
        if form['contact_email']:
            record_add_field(rec, '270', subfields=[
                ('m', remove_html_markup(form['contact_email']))])
        if form['keywords']:
            for kw in form['keywords'].split(','):
                if kw and not kw.isspace():
                    record_add_field(rec, '653', ind1='1', subfields=[
                        ('a', remove_html_markup(kw.strip()))])
        if 'contributors' in form and form['contributors']:
            fields = form.getlist('contributors')
            for f in fields:
                if f and not f.isspace():
                    record_add_field(rec, '700', subfields=[
                        ('a', remove_html_markup(f.strip()))])
        record_add_field(rec, '546', subfields=[
            ('a', remove_html_markup(form['language']))])
        # copying zenodo here, but I don't think 980 is the right MARC field
        if 'resource_type' in form and form['resource_type']:
            fields = form.getlist('resource_type')
            for f in fields:
                record_add_field(rec, '980', subfields=[
                    ('a', remove_html_markup(f))])
        if 'alternate_identifier' in form and form['alternate_identifier']:
            record_add_field(rec, '024', subfields=[
                ('a', remove_html_markup(form['alternate_identifier']))])
        if 'version' in form and form['version']:
            record_add_field(rec, '250', subfields=[
                ('a', remove_html_markup(form['version']))])
        CFG_SITE_NAME = current_app.config.get("CFG_SITE_NAME")
        record_add_field(rec, '264', subfields=[
            ('b', CFG_SITE_NAME),
            ('c', str(datetime.utcnow()) + " UTC")])
    except Exception as e:
        current_app.logger.error(e)
        raise
def compare_records(self, record1, record2, opt_mode=None):
    """Compare two records to identify added/modified/deleted tags.

    The records are either the upload record, the existing record or the
    archived record. Returns a tuple of dictionaries (for
    modified/added/deleted tags).
    """
    def remove_control_tag(tag_list):
        """Return the list of keys without any control tags."""
        cleaned_list = [item for item in tag_list
                        if item not in CFG_BIBUPLOAD_CONTROLFIELD_TAGS]
        return cleaned_list

    def group_record_tags():
        """Group all the tags in a record as common/added/deleted tags.

        Returns a tuple of three lists, one for each category above.
        """
        rec1_keys = record1.keys()
        rec2_keys = record2.keys()
        com_tag_lst = [key for key in rec1_keys if key in rec2_keys]
        # tags in record2 not present in record1
        del_tag_lst = [key for key in rec2_keys if key not in rec1_keys]
        # additional tags in record1
        add_tag_lst = [key for key in rec1_keys if key not in rec2_keys]
        return (com_tag_lst, add_tag_lst, del_tag_lst)

    # dictionaries to hold the identified patches
    mod_patch = {}
    add_patch = {}
    del_patch = {}
    result = {}

    (common_tags, added_tags, deleted_tags) = group_record_tags()
    if common_tags:
        mod_patch = self.find_modified_tags(common_tags, record1, record2)
    if added_tags:
        for tag in added_tags:
            add_patch[tag] = record1[tag]
    # if the record comes in 'correct' mode, it should already have fields
    # marked with the '0' code; if not, the deleted tag list is only
    # applied in 'replace' or 'delete' mode
    if deleted_tags and opt_mode in ('replace', 'delete'):
        for tag in deleted_tags:
            del_patch[tag] = record2[tag]

    # build a result dictionary with all available patches
    if mod_patch:
        result['MOD'] = mod_patch
    if add_patch:
        result['ADD'] = add_patch
    if del_patch:
        # for a tag that has been deleted in the upload record in replace
        # mode, loop through all the fields of the tag and add an extra
        # subfield with code '0' and value '__DELETE_FIELDS__'
        # NOTE: indicators are taken into consideration while deleting fields
        for tag in del_patch:
            # iterate over a snapshot, since the field list is mutated below
            for data_tuple in list(del_patch[tag]):
                ind1 = data_tuple[1]
                ind2 = data_tuple[2]
                record_delete_field(del_patch, tag, ind1, ind2)
                record_add_field(del_patch, tag, ind1, ind2, "",
                                 [(CFG_BIBUPLOAD_DELETE_CODE,
                                   CFG_BIBUPLOAD_DELETE_VALUE)])
        result['DEL'] = del_patch
    return result
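# Illustration of the patch shapes compare_records produces, using the
# bibrecord field layout (subfields, ind1, ind2, controlfield_value,
# field_position); the record contents are invented and 'self' is assumed
# to be the surrounding uploader instance providing find_modified_tags:
#
#     upload_rec = {'100': [([('a', 'Doe, J.')], ' ', ' ', '', 3)]}
#     existing_rec = {'100': [([('a', 'Doe, J.')], ' ', ' ', '', 3)],
#                     '700': [([('a', 'Roe, R.')], ' ', ' ', '', 4)]}
#     patch = self.compare_records(upload_rec, existing_rec, opt_mode='replace')
#     # patch['DEL']['700'] now ends with the
#     # (CFG_BIBUPLOAD_DELETE_CODE, CFG_BIBUPLOAD_DELETE_VALUE) marker subfield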
def _prepare_marcxml(recid_a, rn_a, recids_and_rns_b, what_is_a_for_b,
                     what_is_b_for_a, display_in_a=True, display_in_b=True,
                     marc_for_a=None, marc_for_b=None, upload_mode='append',
                     consider_empty_p=False):
    output = '<collection>'
    record_a = {}
    record_b = {}
    if what_is_b_for_a is not None:
        marc_tag_for_a, marc_ind1_for_a, marc_ind2_for_a = \
            _prepare_marc(marc_for_a, CFG_OTHER_RELATIONSHIP_ENTRY,
                          display_in_a and "0" or "1")
        record_add_field(record_a, "001", controlfield_value=str(recid_a))
        if upload_mode == 'correct' and not recids_and_rns_b and consider_empty_p:
            # add an empty field in order to account for cases where all
            # linkings are removed by the submitter
            record_add_field(record_a, marc_tag_for_a,
                             ind1=marc_ind1_for_a, ind2=marc_ind2_for_a)
        for recid_b, rn_b in recids_and_rns_b:
            record_add_field(record_a, marc_tag_for_a,
                             ind1=marc_ind1_for_a, ind2=marc_ind2_for_a,
                             subfields=[('i', what_is_b_for_a),
                                        ('r', rn_b),
                                        ('w', str(recid_b))])
        output += record_xml_output(record_a)

    if what_is_a_for_b is not None:
        marc_tag_for_b, marc_ind1_for_b, marc_ind2_for_b = \
            _prepare_marc(marc_for_b, CFG_OTHER_RELATIONSHIP_ENTRY,
                          display_in_b and "0" or "1")
        for recid_b, rn_b in recids_and_rns_b:
            record_b = {}
            record_add_field(record_b, "001", controlfield_value=str(recid_b))
            if upload_mode == 'correct':
                original_linking_fields = _get_record_linking_fields(
                    recid_b, recid_a, marc_tag_for_b,
                    marc_ind1_for_b, marc_ind2_for_b)
                record_add_fields(record_b, marc_tag_for_b,
                                  original_linking_fields)
            record_add_field(record_b, marc_tag_for_b,
                             ind1=marc_ind1_for_b, ind2=marc_ind2_for_b,
                             subfields=[('i', what_is_a_for_b),
                                        ('r', rn_a),
                                        ('w', str(recid_a))])
            output += record_xml_output(record_b)

        # remove linking in remote records where adequate
        if consider_empty_p:
            unlinked_recids = get_unlinked_records(recid_a, marc_for_b,
                                                   display_in_b, upload_mode,
                                                   recids_and_rns_b)
            for recid_b in unlinked_recids:
                record_b = {}
                record_add_field(record_b, "001",
                                 controlfield_value=str(recid_b))
                original_linking_fields = _get_record_linking_fields(
                    recid_b, recid_a, marc_tag_for_b,
                    marc_ind1_for_b, marc_ind2_for_b)
                if not original_linking_fields:
                    # add an empty field in order to account for cases where
                    # all linkings are removed by the submitter
                    record_add_field(record_b, marc_tag_for_b,
                                     ind1=marc_ind1_for_b,
                                     ind2=marc_ind2_for_b)
                record_add_fields(record_b, marc_tag_for_b,
                                  original_linking_fields)
                output += record_xml_output(record_b)

    output += '</collection>'
    return output
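# A hedged sketch of invoking _prepare_marcxml to link one record to two
# others (record ids, report numbers and relationship labels are invented):
#
#     xml = _prepare_marcxml(
#         recid_a=10, rn_a='DEMO-2013-001',
#         recids_and_rns_b=[(11, 'DEMO-2013-002'), (12, 'DEMO-2013-003')],
#         what_is_a_for_b='is referenced by', what_is_b_for_a='references')
#
# The returned string is a <collection> of MARCXML ready for batch upload
# in the requested upload_mode.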
def add_basic_fields(rec, form, meta):
    """Add the basic fields from the form.

    Note that these fields are mapped to specific MARC fields. For
    information on the fields see the www.loc.gov website. For example
    http://www.loc.gov/marc/bibliographic/bd260.html contains information
    on field 260 for publication data.
    """
    # why aren't subfields a dictionary?!
    try:
        if form.get('title'):
            record_add_field(rec, '245',
                             subfields=[('a', remove_html_markup(form['title']))])
        if form.get('creator'):
            fields = form.getlist('creator')
            for f in fields:
                if f and not f.isspace():
                    record_add_field(rec, '100',
                                     subfields=[('a', remove_html_markup(f.strip()))])
        if form.get('domain'):
            record_add_field(rec, '980',
                             subfields=[('a', remove_html_markup(form['domain']))])
        pubfields = []
        pubfields.append(('b', remove_html_markup(
            form.get('publisher', meta.publisher_default))))
        if form.get('publication_date'):
            pubfields.append(('c', remove_html_markup(form['publication_date'])))
        if pubfields:
            record_add_field(rec, '260', subfields=pubfields)
        if 'open_access' in form:
            record_add_field(rec, '542', subfields=[('l', 'open')])
        else:
            record_add_field(rec, '542', subfields=[('l', 'restricted')])
        if form.get('licence'):
            record_add_field(rec, '540',
                             subfields=[('a', remove_html_markup(form['licence']))])
        record_add_field(rec, '520',
                         subfields=[('a', remove_html_markup(form['description']))])
        if form.get('contact_email'):
            record_add_field(rec, '270',
                             subfields=[('m', remove_html_markup(form['contact_email']))])
        if form.get('keywords'):
            for f in form.getlist('keywords'):
                for kw in f.split(','):
                    if kw and not kw.isspace():
                        record_add_field(rec, '653', ind1='1',
                                         subfields=[('a', remove_html_markup(kw.strip()))])
        if form.get('contributors'):
            fields = form.getlist('contributors')
            for f in fields:
                if f and not f.isspace():
                    record_add_field(rec, '700',
                                     subfields=[('a', remove_html_markup(f.strip()))])
        record_add_field(rec, '546',
                         subfields=[('a', remove_html_markup(
                             form.get('language', meta.language_default)))])
        if form.get('resource_type'):
            fields = form.getlist('resource_type')
            for f in fields:
                record_add_field(rec, '337',
                                 subfields=[('a', remove_html_markup(f))])
        # Special case for the 'Linguistics' domain:
        # all the ling_resource_type(s) are also resource_type(s), going into '337'
        if form.get('ling_resource_type'):
            fields = form.getlist('ling_resource_type')
            for f in fields:
                record_add_field(rec, '337',
                                 subfields=[('a', remove_html_markup(f))])
        if form.get('alternate_identifier'):
            record_add_field(rec, '024',
                             subfields=[('a', remove_html_markup(form['alternate_identifier']))])
        if form.get('version'):
            record_add_field(rec, '250',
                             subfields=[('a', remove_html_markup(form['version']))])
        if form.get('discipline'):
            fields = form.getlist('discipline')
            for f in fields:
                record_add_field(rec, '526',
                                 subfields=[('a', remove_html_markup(f))])
        CFG_SITE_NAME = current_app.config.get("CFG_SITE_NAME")
        record_add_field(rec, '264',
                         subfields=[('b', CFG_SITE_NAME),
                                    ('c', str(datetime.utcnow()) + " UTC")])
    except Exception as e:
        current_app.logger.error(e)
        raise
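# Minimal sketch of the meta object this variant expects: judging from the
# attribute accesses above, anything exposing publisher_default and
# language_default will do (both values below are invented):
class _ExampleMeta(object):
    publisher_default = 'Example Press'
    language_default = 'eng'

# add_basic_fields(rec, form, _ExampleMeta()) then falls back to these
# defaults whenever 'publisher' or 'language' are absent from the form.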
def main():
    import itertools
    from pprint import pprint
    import invenio.modules.editor.models
    import invenio.modules.editor.views
    from invenio.legacy.search_engine import get_record
    from invenio.legacy.bibrecord import (
        record_delete_field,
        record_add_field,
    )
    from invenio.legacy.bibupload.engine import (
        bibupload,
    )

    for a in itertools.count(1):
        old_rec = get_record(a)
        rec = get_record(a)
        if not rec:
            break
        print('Processing record: {0}'.format(a))
        old_337 = [f[0] for f in rec.get('337', [])]
        new_337 = old_337[:]
        new_690 = []
        new_980 = []
        for f in rec.get('980', []):
            for sf in f[0]:
                if sf[0] == 'a' and sf[1] in TYPES:
                    if [sf] not in new_337:
                        new_337.append([sf])
                else:
                    if [sf] not in new_980:
                        new_980.append([sf])
        for f in rec.get('690', []):
            sfs = f[0]
            if sfs[0][0] == 'a' and sfs[0][1] == 'ling_resource_type':
                res_type = sfs[1][1]
                if res_type in TYPES:
                    if [('a', res_type)] not in new_337:
                        new_337.append([('a', res_type)])
                else:
                    print("Unrecognized 'ling_resource_type' value! '{0}'"
                          .format(res_type))
            else:
                if sfs not in new_690:
                    new_690.append(sfs)
        if new_337 != old_337:
            record_delete_field(rec, '337')
            record_delete_field(rec, '980')
            record_delete_field(rec, '690')
            for f in new_337:
                record_add_field(rec, '337', subfields=f)
            for f in new_980:
                record_add_field(rec, '980', subfields=f)
            for f in new_690:
                record_add_field(rec, '690', subfields=f)
        print('\nOld 337:')
        pprint(old_rec.get('337'))
        print('New 337:')
        pprint(rec.get('337'))
        print('\nOld 690:')
        pprint(old_rec.get('690'))
        print('New 690:')
        pprint(rec.get('690'))
        print('\nOld 980:')
        pprint(old_rec.get('980'))
        print('New 980:')
        pprint(rec.get('980'))
        if raw_input('Bibupload (y/n)? ') == 'y':
            bibupload(rec, 'replace')
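# Usage note: this migration sketch assumes a module-level TYPES collection
# of recognized resource types, e.g. (values invented):
#
#     TYPES = frozenset(['Text', 'Audio', 'Video', 'Image'])
#
# It is meant to be run interactively so each 337/690/980 rewrite can be
# reviewed before confirming the bibupload in 'replace' mode.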