def main():
    """Interactively fix 690__b values for the records listed in RECORDS.

    For each record, rewrites the 690 field's 'b' subfield with the value
    from the module-level VALUES map when the field is tagged
    ('a', 'language_code'), shows a before/after dump, and asks the
    operator to confirm a delete+replace bibupload.

    NOTE(review): relies on module-level RECORDS, VALUES, pprint,
    raw_input and sleep being in scope (Python 2 one-off script).
    """
    from invenio.legacy.search_engine import get_record
    from invenio.legacy.bibupload.engine import (
        bibupload,
    )
    from invenio.legacy.bibrecord import (
        record_add_field,
        record_delete_field,
    )

    # Loop through list of records
    for r in RECORDS:
        # Fetch two copies: one kept pristine for the before/after diff.
        old_rec = get_record(r)
        rec = get_record(r)

        if not rec:
            # NOTE(review): this aborts the whole run on the first missing
            # record rather than skipping it — confirm that is intended.
            break

        print('Processing record: {0}'.format(r))
        # pprint(rec)

        # Each entry of old_690 is the subfield list of one 690 instance.
        old_690 = [f[0] for f in rec.get('690', [])]
        new_690 = []
        for f in old_690:
            a = f[0]
            b = f[1]
            # Replace the 'b' subfield value only for language_code fields
            # and only when VALUES has an entry for this record.
            t = [a, (b[0], VALUES.get(r))] \
                if (a[0] == 'a' and a[1] == 'language_code'
                    and b[0] == 'b' and VALUES.get(r)) \
                else f
            new_690.append(t)

        if not new_690 == old_690:
            # Rebuild the 690 fields from the modified subfield lists.
            record_delete_field(rec, '690')
            for f in new_690:
                record_add_field(rec, '690', subfields=f)
            # pprint(rec)
            print('\nOld 690:')
            pprint(old_rec.get('690'))
            print('\nNew 690:')
            pprint(rec.get('690'))
            if raw_input('Bibupload (y/n)? ') == 'y':
                # Delete then re-insert so stale fields cannot survive.
                bibupload(rec, 'delete')
                sleep(5)
                bibupload(rec, 'replace')
def match_all_subfields_for_tag(recID, field_tag, subfields_required=None):
    """Test whether the record with recID has at least one field with
    'field_tag' where all of the required subfields in subfields_required
    match a subfield in the given field both in code and value.

    @param recID: record ID
    @type recID: int
    @param field_tag: a 3 digit code for the field tag code
    @type field_tag: string
    @param subfields_required: a list of subfield code/value tuples
    @type subfields_required: list of tuples of strings.
        same format as in get_record():
        e.g. [('w', 't'), ('4', 'XYZ123')]
    @return: boolean
    """
    # Avoid a shared mutable default argument.
    if subfields_required is None:
        subfields_required = []
    required = set(subfields_required)
    rec = get_record(recID)
    # BUGFIX: use .get() so a record without the tag simply matches
    # nothing instead of raising KeyError.
    for field in rec.get(field_tag, []):
        subfields_present = field[0]
        # All required (code, value) pairs must be present in this field.
        if required <= set(subfields_present):
            return True
    return False
def replace_references(recid):
    """Replace references for a record

    The record itself is not updated, the marc xml of the document with
    updated references is returned

    Parameters:
    * recid: the id of the record
    """
    # Extract fresh references and parse them into a record structure.
    references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml)

    # Current stored record.
    record = get_record(recid)

    if not references[0]:
        return None

    new_fields = record_get_field_instances(references[0],
                                            tag='999',
                                            ind1='%',
                                            ind2='%')
    # Swap the old 999 fields for the freshly extracted ones.
    record_delete_fields(record, '999')
    record_add_fields(record, '999', new_fields)
    return record_xml_output(record)
def update_references(recid, overwrite=True):
    """Update references for a record

    First, we extract references from a record.
    Then, we are not updating the record directly but adding a bibupload
    task in -c mode which takes care of updating the record.

    Parameters:
    * recid: the id of the record

    Raises RecordHasReferences when overwrite is disabled and the record
    already has a 999 field, or when the record has curated references
    (999C59 present).
    """
    if not overwrite:
        # Check for references in record
        record = get_record(recid)
        if record and record_has_field(record, '999'):
            raise RecordHasReferences('Record has references and overwrite '
                                      'mode is disabled: %s' % recid)

    # A 999C59 value means a curator touched the references: never clobber.
    if get_fieldvalues(recid, '999C59'):
        raise RecordHasReferences('Record has been curated: %s' % recid)

    # Parse references
    references_xml = extract_references_from_record_xml(recid)

    # Save new record to file (shared tmp dir so the bibupload daemon
    # can read it later).
    (temp_fd, temp_path) = mkstemp(prefix=CFG_REFEXTRACT_FILENAME,
                                   dir=CFG_TMPSHAREDDIR)
    temp_file = os.fdopen(temp_fd, 'w')
    temp_file.write(references_xml)
    temp_file.close()

    # Update record: queue a correct-mode (-c) bibupload task.
    task_low_level_submission('bibupload', 'refextract', '-P', '4',
                              '-c', temp_path)
def perform_get_holdings_information(recid, req, action="borrowal", ln=CFG_SITE_LANG):
    """Display all the copies of an item.

    If the parameter action is 'proposal', display appropriate
    information (publication date, proposal instructions) to the user
    instead of the holdings table.

    @param recid: identify the record. Primary key of bibrec.
    @type recid: int
    @param action: Specifies whether the current record is put up to solicit
                   acquisition proposals (if "proposal") or not ("borrowal").
    @type action: string
    @return: body (html)
    """
    _ = gettext_set_language(ln)
    if action == "proposal":
        tag = AMZ_BOOK_PUBLICATION_DATE_TAG
        publication_date = record_get_field_value(get_record(recid), tag[:3],
                                                  ind1=tag[3], ind2=tag[4],
                                                  code=tag[5])
        msg = ''
        if publication_date:
            cur_date = datetime.date.today()
            try:
                pub_date = time.strptime(publication_date, '%d %b %Y')
                pub_date = datetime.date(pub_date[0], pub_date[1], pub_date[2])
                if cur_date < pub_date:
                    # Not yet published: tell the user when it is due.
                    msg += _("The publication date of this book is %(x_date)s.",
                             x_date=(publication_date))
                    msg += "<br /><br />"
                else:
                    msg += _("This book has no copies in the library. ")
            except ValueError:
                # BUGFIX: narrowed a bare 'except' to ValueError, which is
                # what strptime/date raise on a malformed date string.
                msg += _("This book has no copies in the library. ")

        msg += _("If you think this book is interesting, suggest it and tell us why you consider this \
book is important. The library will consider your opinion and if we decide to buy the \
book, we will issue a loan for you as soon as it arrives and send it by internal mail.")
        # BUGFIX: the original emitted malformed '<br \>' tags.
        msg += "<br /><br />"
        msg += _("In case we decide not to buy the book, we will offer you an interlibrary loan")

        body = bc_templates.tmpl_book_proposal_information(recid, msg, ln=ln)
    else:
        holdings_information = db.get_holdings_information(recid, False)
        body = bc_templates.tmpl_holdings_information(
            recid=recid, req=req, holdings_info=holdings_information, ln=ln)
    return body
def check_arxiv(recid):
    """Return True when the record carries an arXiv report number (037 $a)."""
    record = get_record(recid)
    report_fields = record_get_field_instances(record, "037")
    return any(
        value.startswith('arXiv')
        for field in report_fields
        for value in field_get_subfield_values(field, 'a')
    )
def get_bibrecord(recid):
    """Return record in BibRecord wrapping.

    Prefers the latest stored revision; falls back to the live record when
    no revisions exist. Returns None (implicitly) for non-existing records.
    """
    if not record_exists(recid):
        return None
    revision_ids = get_record_revision_ids(recid)
    if not revision_ids:
        return get_record(recid)
    latest_xml = get_marcxml_of_revision_id(max(revision_ids))
    return create_record(latest_xml)[0]
def get_recstruct_record(recid):
    """Cache the 'recstruct' representation of a record in the Bibfmt table.

    Serializes the structure returned by get_record() via marshal and
    stores it as a new Bibfmt row with format 'recstruct'; commits the
    DB session. Returns None.
    """
    value = serialize_via_marshal(get_record(recid))
    b = Bibfmt(id_bibrec=recid,
               format='recstruct',
               last_updated=db.func.now(),
               value=value)
    db.session.add(b)
    db.session.commit()
def resolve_doi(req, doi, ln=CFG_SITE_LANG, verbose=0):
    """
    Redirect to given DOI, or display error page when DOI cannot be
    resolved.

    (Python 2 request handler: uses old-style raise/except syntax.)
    """
    _ = gettext_set_language(ln)
    # Fetch user ID:
    try:
        uid = getUid(req)
    except Error:
        register_exception(req=req, alert_admin=True)
        return page(title=_("Internal Error"),
                    body=create_error_box(req, verbose=verbose, ln=ln),
                    description="%s - Internal Error" % CFG_SITE_NAME,
                    keywords="%s, Internal Error" % CFG_SITE_NAME,
                    language=ln,
                    req=req,
                    navmenuid='search')
    # Resolve DOI: search, then keep only records whose 'doi' values
    # really contain the requested DOI (case-insensitive).
    recids = perform_request_search(p='doi:"%s"' % doi, of="id", verbose=verbose)
    recids = [recid for recid in recids if doi.lower() in \
              [doi.lower() for doi in get_record(recid).get('doi', '') if doi]]

    # Answer
    if len(recids) == 1:
        # Found unique matching record
        return redirect_to_url(req, CFG_SITE_URL + '/' + CFG_SITE_RECORD + '/' + str(recids[0]))
    elif len(recids) == 0:
        # No corresponding record found
        page_body = '<p>' + (_("Sorry, DOI %(x_doi)s could not be resolved.",
                               x_doi=('<strong>' + str(doi) + '</strong>'))) + '</p>'
        if req.header_only:
            # HEAD request: answer with a bare 404, no body.
            raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND
        return page(title=_('DOI "%(x_doi)s" Not Found', x_doi=cgi.escape(doi)),
                    body=page_body,
                    description=(CFG_SITE_NAME + ' - ' + _("Not found") + ': ' + cgi.escape(str(doi))),
                    keywords="%s" % CFG_SITE_NAME,
                    uid=uid,
                    language=ln,
                    req=req,
                    navmenuid='search')
    else:
        # Found multiple matching records: alert the admins (the raise is
        # only a vehicle for register_exception) and show a disambiguation
        # page to the user.
        try:
            raise Exception('DOI "%s" matched multiple records (%s) -- Please check' %
                            (doi, ', '.join([str(recid) for recid in recids])))
        except Exception, e:
            register_exception(req=req, alert_admin=True)
        page_body = websearch_templates.tmpl_multiple_dois_found_page(doi, recids, ln)
        return page(title=_('Found multiple records matching DOI %(x_doi)s', x_doi=cgi.escape(doi)),
                    body=page_body,
                    description=(CFG_SITE_NAME + ' - ' + _("Found multiple records matching DOI") + ': ' + cgi.escape(str(doi))),
                    keywords="%s" % CFG_SITE_NAME,
                    uid=uid,
                    language=ln,
                    req=req,
                    navmenuid='search')
def test_simple_insert(self):
    """batchuploader - robotupload simple insert"""
    from invenio.legacy.search_engine import get_record
    # Fire the prepared robot-upload request and check it was accepted.
    result = urllib2.urlopen(self.req).read()
    self.failUnless("[INFO]" in result)
    # Run the queued bibupload task synchronously via the CLI.
    current_task = get_last_taskid()
    run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
    # A brand-new record must exist and carry the submitted title (245 $a).
    current_recid = run_sql("SELECT MAX(id) FROM bibrec")[0][0]
    self.failIfEqual(self.last_recid, current_recid)
    record = get_record(current_recid)
    self.assertEqual(record['245'][0][0], [('a', 'The title')])
def _modify_record(self, recid, test_func, replace_func, include_func,
                   append_colls=[], replace_colls=[]):
    """Generate record a MARCXML file.

    Builds a minimal record (001 + rewritten 980 fields) describing the
    collection changes, or returns False when nothing changed or the
    source record has no 980 fields.

    @param test_func: Function to test if a collection id should be changed
    @param replace_func: Function to replace the collection id.
    @param include_func: Function to test if collection should be included
    @param append_colls: collection ids to append as new 980 $a fields
    @param replace_colls: collection ids that, when given, replace ALL
        existing 980 fields
    @return: minimal record dict, or False when no update is needed

    NOTE(review): the list defaults are never mutated here, so the shared
    mutable-default pitfall does not bite, but None defaults would be safer.
    """
    from invenio.legacy.search_engine import get_record
    rec = get_record(recid)
    newcolls = []
    dirty = False
    try:
        # KeyError here (no 980 at all) aborts with False via the handler.
        colls = rec['980']
        if replace_colls:
            # Wholesale replacement: ignore existing 980 fields entirely.
            for c in replace_colls:
                newcolls.append([('a', c)])
                dirty = True
        else:
            for c in colls:
                try:
                    # We are only interested in subfield 'a'
                    code, val = c[0][0]
                    if test_func(code, val):
                        # In-place rewrite of the first subfield tuple.
                        c[0][0] = replace_func(code, val)
                        dirty = True
                    if include_func(code, val):
                        newcolls.append(c[0])
                    else:
                        # Dropping a field is also a modification.
                        dirty = True
                except IndexError:
                    pass
        for c in append_colls:
            newcolls.append([('a', c)])
            dirty = True
    except KeyError:
        return False
    if not dirty:
        return False
    # Emit a fresh minimal record: controlfield 001 plus the new 980s.
    rec = {}
    record_add_field(rec, '001', controlfield_value=str(recid))
    for subfields in newcolls:
        record_add_field(rec, '980', subfields=subfields)
    return rec
def retrieve_field_values(curdir, field_name, separator=None,
                          system_number_file='SN', tag=None):
    """Retrieve form-field values from the submission directory or a record.

    Values found on disk in ``curdir`` (a freshly submitted form) take
    precedence; otherwise, when ``tag`` is given and a record id can be
    read from ``system_number_file``, values are pulled from the stored
    record (e.g. during MBI action).

    @param curdir: the current submission directory.
    @type curdir: string
    @param field_name: the form field name that might exist on disk.
    @type field_name: string
    @param separator: optional separator; when given, multiple values are
        split out of the field content.
    @type separator: string
    @param system_number_file: name of the file in curdir supposed to
        contain the record id.
    @type system_number_file: string
    @param tag: full MARC tag (tag+ind1+ind2+code) holding the values.
    @type tag: 6-chars
    @return: the field value(s).
    @rtype: list of strings.
    """
    field_file = os.path.join(curdir, field_name)
    if os.path.exists(field_file):
        content = open(field_file).read()
        if separator is None:
            return [content.strip()]
        stripped = (part.strip() for part in content.split(separator))
        return [value for value in stripped if value]
    if tag is not None:
        sn_path = os.path.join(curdir, system_number_file)
        if os.path.exists(sn_path):
            recid = int(open(sn_path).read().strip())
            record = get_record(recid)
            if separator:
                return record_get_field_values(record, tag[:3], tag[3],
                                               tag[4], tag[5])
            return [record_get_field_value(record, tag[:3], tag[3],
                                           tag[4], tag[5])]
    return []
def get_record_collections(recid=0, recstruct=None):
    """Return all collections of a record, field 980.

    @param recid: record id to get collections from
    @type: string
    @return: list of collections
    @rtype: list
    """
    record = recstruct if recstruct else get_record(recid)
    return list(record_get_field_values(record, tag="980",
                                        ind1=" ", ind2=" ", code="a"))
def test_simple_insert(self):
    """batchuploader - robotupload simple insert"""
    # Entire test is a no-op unless a local HTTP endpoint is available.
    if CFG_LOCALHOST_OK:
        from invenio.legacy.search_engine import get_record
        # Fire the prepared robot-upload request and check it was accepted.
        result = urllib2.urlopen(self.req).read()
        self.failUnless("[INFO]" in result)
        # Run the queued bibupload task synchronously via the CLI.
        current_task = get_last_taskid()
        run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'],
                          [str(current_task)])
        # A brand-new record must exist and carry the submitted title.
        current_recid = run_sql("SELECT MAX(id) FROM bibrec")[0][0]
        self.failIfEqual(self.last_recid, current_recid)
        record = get_record(current_recid)
        self.assertEqual(record['245'][0][0], [('a', 'The title')])
def can_record_have_physical_copies(recid):
    """Determine if the record can have physical copies
    (addable through the bibCirculation module).
    The information is derieved using the tabs displayed for a given record.
    Only records already saved within the collection may have the physical copies
    @return: True or False
    """
    if get_record(recid) is None:
        return False

    # NOTE(review): col_id is computed but never used, and the function
    # unconditionally returns False below — physical copies appear to be
    # disabled in this variant. Compare with the sibling implementation
    # that inspects the "holdings" tab; confirm whether this is intentional.
    col_id = Collection.query.filter_by(name=guess_primary_collection_of_a_record(recid)).value("id")

    return False
def _get_updated_record(record_id, update_commands):
    """Apply all update commands to the given record and return the result.

    @param record_id: identifier of the record that will be updated
    @param update_commands: list of commands used to update record contents
    @return: updated record structure
    """
    record = search_engine.get_record(recid=record_id)
    # Each command mutates the record structure in place.
    for command in update_commands:
        command.process_record(record)
    return record
def can_record_have_physical_copies(recid):
    """Determine if the record can have physical copies
    (addable through the bibCirculation module).
    The information is derieved using the tabs displayed for a given record.
    Only records already saved within the collection may have the physical copies
    @return: True or False
    """
    if get_record(recid) is None:
        return False

    # NOTE(review): col_id is computed but never used and the function
    # always returns False — physical copies appear disabled in this
    # variant; confirm against the sibling implementation that checks
    # the "holdings" tab visibility.
    col_id = Collection.query.filter_by(
        name=guess_primary_collection_of_a_record(recid)).value('id')

    return False
def get_record_collections(recid=0, recstruct=None):
    """Return all collections of a record, field 980.

    @param recid: record id to get collections from
    @type: string
    @return: list of collections
    @rtype: list
    """
    rec = recstruct or get_record(recid)
    collections = []
    for value in record_get_field_values(rec, tag="980", ind1=" ",
                                         ind2=" ", code="a"):
        collections.append(value)
    return collections
def _modify_record(self, recid, test_func, replace_func, include_func,
                   append_colls=[], replace_colls=[]):
    """ Generate record a MARCXML file

    Builds a minimal record (001 + rewritten 980 fields) describing the
    collection changes, or returns False when nothing changed or the
    source record has no 980 fields.

    @param test_func: Function to test if a collection id should be changed
    @param replace_func: Function to replace the collection id.
    @param include_func: Function to test if collection should be included
    @param append_colls: collection ids appended as new 980 $a fields
    @param replace_colls: collection ids that, when given, replace ALL
        existing 980 fields
    @return: minimal record dict, or False when no update is needed
    """
    from invenio.legacy.search_engine import get_record
    rec = get_record(recid)
    newcolls = []
    dirty = False
    try:
        # KeyError (no 980 at all) bails out with False via the handler.
        colls = rec['980']

        if replace_colls:
            # Wholesale replacement: ignore the existing 980 fields.
            for c in replace_colls:
                newcolls.append([('a', c)])
                dirty = True
        else:
            for c in colls:
                try:
                    # We are only interested in subfield 'a'
                    code, val = c[0][0]
                    if test_func(code, val):
                        # In-place rewrite of the first subfield tuple.
                        c[0][0] = replace_func(code, val)
                        dirty = True
                    if include_func(code, val):
                        newcolls.append(c[0])
                    else:
                        # Dropping a field is also a modification.
                        dirty = True
                except IndexError:
                    pass
        for c in append_colls:
            newcolls.append([('a', c)])
            dirty = True
    except KeyError:
        return False

    if not dirty:
        return False

    # Emit a fresh minimal record: controlfield 001 plus the new 980s.
    rec = {}
    record_add_field(rec, '001', controlfield_value=str(recid))

    for subfields in newcolls:
        record_add_field(rec, '980', subfields=subfields)

    return rec
def perform_get_holdings_information(recid, req, action="borrowal", ln=CFG_SITE_LANG):
    """Display all the copies of an item.

    If the parameter action is 'proposal', display appropriate
    information (publication date, proposal instructions) to the user
    instead of the holdings table.

    @param recid: identify the record. Primary key of bibrec.
    @type recid: int
    @param action: Specifies whether the current record is put up to solicit
                   acquisition proposals (if "proposal") or not ("borrowal").
    @type action: string
    @return: body (html)
    """
    _ = gettext_set_language(ln)

    if action == "proposal":
        tag = AMZ_BOOK_PUBLICATION_DATE_TAG
        publication_date = record_get_field_value(get_record(recid), tag[:3],
                                                  ind1=tag[3], ind2=tag[4],
                                                  code=tag[5])
        msg = ''
        if publication_date:
            cur_date = datetime.date.today()
            try:
                pub_date = time.strptime(publication_date, '%d %b %Y')
                pub_date = datetime.date(pub_date[0], pub_date[1], pub_date[2])
                if cur_date < pub_date:
                    # Not yet published: tell the user when it is due.
                    msg += _("The publication date of this book is %(x_date)s.",
                             x_date=(publication_date))
                    msg += "<br /><br />"
                else:
                    msg += _("This book has no copies in the library. ")
            except ValueError:
                # BUGFIX: narrowed a bare 'except' to ValueError, which is
                # what strptime/date raise on a malformed date string.
                msg += _("This book has no copies in the library. ")

        msg += _("If you think this book is interesting, suggest it and tell us why you consider this \
book is important. The library will consider your opinion and if we decide to buy the \
book, we will issue a loan for you as soon as it arrives and send it by internal mail.")
        # BUGFIX: the original emitted malformed '<br \>' tags.
        msg += "<br /><br />"
        msg += _("In case we decide not to buy the book, we will offer you an interlibrary loan")

        body = bc_templates.tmpl_book_proposal_information(recid, msg, ln=ln)

    else:
        holdings_information = db.get_holdings_information(recid, False)
        body = bc_templates.tmpl_holdings_information(recid=recid,
                                                      req=req,
                                                      holdings_info=holdings_information,
                                                      ln=ln)
    return body
def get_record(self):
    """
    Returns the record structure of this L{BibFormatObject} instance

    Lazily fetches the structure on first access and caches it on the
    instance afterwards.

    @return: the record structure as defined by BibRecord library
    """
    from invenio.legacy.search_engine import get_record

    if self.record is not None:
        return self.record
    # on-the-fly creation if current output is xm
    self.record = get_record(self.recID)
    return self.record
def _get_formated_record(record_id, output_format, update_commands, language,
                         outputTags="", checked=True, displayed_records=None):
    """Returns a record in a given format

    @param record_id: the ID of record to format
    @param output_format: an output format code (or short identifier for the output format)
    @param update_commands: list of commands used to update record contents
    @param language: the language to use to format the record
    @param outputTags: the tags to be shown to the user
    @param checked: is the record checked by the user?
    @param displayed_records: records to be displayed on a given page
    @returns: record formated to be displayed or None
    """
    if update_commands and checked:
        # Modify the bibrecord object with the appropriate actions
        updated_record = _get_updated_record(record_id, update_commands)

    # Options for the MARCXML -> textmarc conversion (correct mode).
    textmarc_options = {"aleph-marc": 0, "correct-mode": 1, "append-mode": 0,
                        "delete-mode": 0, "insert-mode": 0, "replace-mode": 0,
                        "text-marc": 1}

    # Skip records that are not on the page currently shown to the user.
    if record_id not in displayed_records:
        return

    old_record = search_engine.get_record(recid=record_id)
    old_record_textmarc = xmlmarc2textmarc.create_marc_record(old_record,
                                                              sysno="",
                                                              options=textmarc_options)
    if "hm" == output_format:
        # "hm" = human-readable MARC.
        if update_commands and checked:
            # Show a colored diff between the old and the updated record.
            updated_record_textmarc = xmlmarc2textmarc.create_marc_record(updated_record,
                                                                          sysno="",
                                                                          options=textmarc_options)
            result = _get_record_diff(old_record_textmarc,
                                      updated_record_textmarc,
                                      outputTags, record_id)
        else:
            # Optionally restrict the dump to the tags the user selected.
            filter_tags = "All tags" not in outputTags and outputTags
            result = ['<pre>']
            for line in old_record_textmarc.splitlines():
                if not filter_tags or line.split()[0].replace('_', '') in outputTags:
                    # Prefix each line with the zero-padded record id.
                    result.append("%09d " % record_id + line.strip())
            result.append('</pre>')
            result = '\n'.join(result)
    else:
        if update_commands and checked:
            # No coloring of modifications in this case
            xml_record = bibrecord.record_xml_output(updated_record)
        else:
            xml_record = bibrecord.record_xml_output(old_record)
        result = bibformat.format_record(recID=None,
                                         of=output_format,
                                         xml_record=xml_record,
                                         ln=language)
    return result
def get_current_record(curdir, system_number_file='SN'):
    """Return the current record (in case it's being modified).

    @param curdir: the path to the current directory.
    @type curdir: string
    @param system_number_file: is the name of the file on disk in curdir,
        that is supposed to contain the record id.
    @type system_number_file: string
    @return: the record, or {} when no record id is available
    @rtype: as in L{get_record}
    """
    sn_path = os.path.join(curdir, system_number_file)
    if not os.path.exists(sn_path):
        return {}
    contents = open(sn_path).read().strip()
    if not contents:
        return {}
    return get_record(int(contents))
def can_record_have_physical_copies(recid):
    """Determine if the record can have physical copies
    (addable through the bibCirculation module).

    Derived from the tabs displayed for the record: only records whose
    "holdings" tab is visible may have physical copies.

    @return: True or False
    """
    if get_record(recid) is None:
        return False

    col_id = get_colID(guess_primary_collection_of_a_record(recid))
    tabs = get_detailed_page_tabs(col_id, recid)

    if "holdings" in tabs and "visible" in tabs["holdings"]:
        return tabs["holdings"]["visible"] is True
    return False
def update(recid, form):
    """Update a record's metadata from the submitted form.

    Validates the domain-specific metadata form; on success applies file
    modifications, diffs the basic fields against the stored record and,
    when anything changed, queues a 'replace' bibupload. Returns a JSON
    response: on success a redirect URL plus waiting page, otherwise the
    re-rendered form with validation errors.
    """
    if not is_record_editable(recid):
        abort(401)
    from invenio.legacy.search_engine import get_record
    from invenio.legacy.bibupload.engine import bibupload
    from invenio.modules.formatter import engine as bibformat_engine
    bfo = bibformat_engine.BibFormatObject(recid)
    # The record's domain selects which metadata form class applies.
    domain = read_basic_metadata_field_from_marc(bfo, 'domain')
    metaclass, meta, meta_form = _get_meta_form_data(domain, form)
    if meta_form.validate_on_submit():
        current_app.logger.info("Updating record {}".format(recid))
        _bibdoc_modify_files(recid, form)
        rec_changes = {}
        add_basic_fields(rec_changes, form, meta)
        updated = False
        rec = get_record(recid)
        # Apply only the keys whose values actually changed.
        for (k, v) in rec_changes.items():
            if rec.get(k) != v:
                current_app.logger.info("Updating key {} from {} to {}".format(
                    k, rec.get(k), v))
                rec[k] = v
                updated = True
        if updated:
            bibupload(rec, 'replace')
        return jsonify(valid=True,
                       newurl=url_for("record.metadata", recid=recid),
                       html=render_template('record_waitforit.html', recid=recid))
    else:
        # Validation failed: re-render the metadata form with errors.
        html = render_template('b2share-addmeta-table.html', recid=recid,
                               metadata=meta, form=meta_form,
                               domain=metaclass, getattr=getattr)
        return jsonify(valid=False, html=html)
def retrieve_field_values(curdir, field_name, separator=None,
                          system_number_file='SN', tag=None):
    """Retrieve form-field values from the submission directory or a record.

    Values found on disk in ``curdir`` take precedence (a form was just
    submitted); otherwise, when ``tag`` is given and a record id can be
    read from ``system_number_file``, values come from the stored record
    (e.g. during MBI action).

    @param curdir: the current submission directory.
    @type curdir: string
    @param field_name: the form field name that might exist on disk.
    @type field_name: string
    @param separator: optional separator used to split multiple values.
    @type separator: string
    @param system_number_file: name of the file in curdir supposed to
        contain the record id.
    @type system_number_file: string
    @param tag: full MARC tag (tag+ind1+ind2+code) holding the values.
    @type tag: 6-chars
    @return: the field value(s).
    @rtype: list of strings.
    """
    field_file = os.path.join(curdir, field_name)
    if os.path.exists(field_file):
        raw = open(field_file).read()
        if separator is None:
            return [raw.strip()]
        return [piece.strip() for piece in raw.split(separator)
                if piece.strip()]
    if tag is None:
        return []
    sn_file = os.path.join(curdir, system_number_file)
    if not os.path.exists(sn_file):
        return []
    recid = int(open(sn_file).read().strip())
    record = get_record(recid)
    tag3, ind1, ind2, code = tag[:3], tag[3], tag[4], tag[5]
    if separator:
        return record_get_field_values(record, tag3, ind1, ind2, code)
    return [record_get_field_value(record, tag3, ind1, ind2, code)]
def get_record_provenance(recid):
    """
    Return the provenance XML representation of a record, suitable to be put
    in the about tag.

    Reads the OAI provenance subfields from the record's external-OAI-id
    tag and renders one <provenance> element per field instance that has
    a base URL.
    """
    record = get_record(recid)
    provenances = record_get_field_instances(
        record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3],
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3],
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4])
    out = ""
    for provenance in provenances:
        # Collect the known provenance subfields; missing ones stay "".
        base_url = identifier = datestamp = metadata_namespace = origin_description = harvest_date = altered = ""
        for (code, value) in provenance[0]:
            if code == CFG_OAI_PROVENANCE_BASEURL_SUBFIELD:
                base_url = value
            elif code == CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]:
                identifier = value
            elif code == CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD:
                datestamp = value
            elif code == CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD:
                metadata_namespace = value
            elif code == CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD:
                origin_description = value
            elif code == CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD:
                harvest_date = value
            elif code == CFG_OAI_PROVENANCE_ALTERED_SUBFIELD:
                altered = value
        # Only emit provenance when the record declares an OAI base URL.
        if base_url:
            out += """<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">"""
            out += X.originDescription(harvestDate=harvest_date, altered=altered)(
                X.baseURL()(base_url),
                X.identifier()(identifier),
                X.datestamp()(datestamp),
                X.metadataNamespace()(metadata_namespace),
                origin_description and X.originDescription(origin_description) or ''
                ## This is already XML
            )
            out += """</provenance>"""
    return out
def main():
    """Interactively restore records from their stored revisions.

    For each record in RECORDS, prints the available revision ids, parses
    the MARCXML of the most recent revision, and — after operator
    confirmation — replaces the live record with it via bibupload.

    NOTE(review): relies on module-level RECORDS, pprint and raw_input
    being in scope (Python 2 one-off script; note the bare 'print').
    """
    from invenio.legacy.search_engine import get_record
    from invenio.legacy.bibupload.engine import (
        bibupload,
    )
    from invenio.legacy.bibrecord import (
        create_record,
    )
    from invenio.legacy.bibedit.db_layer import get_record_revisions
    from invenio.legacy.bibedit.utils import (
        get_record_revision_ids,
        get_marcxml_of_revision,
    )

    # Loop through list of records
    for r in RECORDS:
        rec = get_record(r)
        if not rec:
            # NOTE(review): aborts the whole run on the first missing
            # record rather than skipping it — confirm that is intended.
            break
        print('Processing record: {0}'.format(r))
        # pprint(rec)
        print(get_record_revision_ids(r))
        print
        revs = get_record_revisions(r)
        print(revs)
        print
        # Only the most recent revision (first slot) is considered.
        for id, rev in revs[0:1]:
            marcxml = get_marcxml_of_revision(r, rev)
            # print(marcxml)
            # print
            rec = create_record(marcxml)[0]
            pprint(rec)
            if raw_input('Bibupload (y/n)? ') == 'y':
                # bibupload(rec, 'delete')
                # sleep(5)
                bibupload(rec, 'replace')
def load_ticket_templates(recId):
    """Load all enabled ticket plugins and call them.

    @return: dictionary with the following structure:
        key: string: name of queue
        value: dict: a dictionary with 2 keys, the template 'subject'
        and 'content' of the queue
    @rtype: dict

    Raises BibEditPluginException for any plugin that failed to load.
    """
    templates = {}
    plugins = load_ticket_plugins()
    record = get_record(recId)
    for name, plugin in plugins.items():
        if not plugin:
            raise BibEditPluginException("Plugin not valid in %s" % (name,))
        queue_data = plugin["get_template_data"](record)
        if queue_data:
            # queue_data = (queue name, subject template, content template)
            templates[queue_data[0]] = {"subject": queue_data[1],
                                        "content": queue_data[2]}
    return templates
def load_ticket_templates(recId):
    """Load all enabled ticket plugins and call them.

    @return: dictionary with the following structure:
        key: string: name of queue
        value: dict: a dictionary with 2 keys, the template 'subject'
        and 'content' of the queue
    @rtype: dict

    Raises BibEditPluginException for any plugin that failed to load.
    """
    result = {}
    plugins = load_ticket_plugins()
    record = get_record(recId)
    for plugin_name in plugins:
        plugin = plugins[plugin_name]
        if not plugin:
            raise BibEditPluginException("Plugin not valid in %s" % (plugin_name,))
        data = plugin['get_template_data'](record)
        if not data:
            continue
        queue_name = data[0]
        result[queue_name] = {'subject': data[1], 'content': data[2]}
    return result
def get_record_provenance(recid):
    """
    Return the provenance XML representation of a record, suitable to be put
    in the about tag.

    Reads the OAI provenance subfields from the record's external-OAI-id
    tag and renders one <provenance> element per field instance that has
    a base URL.
    """
    record = get_record(recid)
    provenances = record_get_field_instances(
        record,
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3],
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3],
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4],
    )
    out = ""
    for provenance in provenances:
        # Collect the known provenance subfields; missing ones stay "".
        base_url = identifier = datestamp = metadata_namespace = origin_description = harvest_date = altered = ""
        for (code, value) in provenance[0]:
            if code == CFG_OAI_PROVENANCE_BASEURL_SUBFIELD:
                base_url = value
            elif code == CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]:
                identifier = value
            elif code == CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD:
                datestamp = value
            elif code == CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD:
                metadata_namespace = value
            elif code == CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD:
                origin_description = value
            elif code == CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD:
                harvest_date = value
            elif code == CFG_OAI_PROVENANCE_ALTERED_SUBFIELD:
                altered = value
        # Only emit provenance when the record declares an OAI base URL.
        if base_url:
            out += """<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">"""
            out += X.originDescription(harvestDate=harvest_date, altered=altered)(
                X.baseURL()(base_url),
                X.identifier()(identifier),
                X.datestamp()(datestamp),
                X.metadataNamespace()(metadata_namespace),
                origin_description and X.originDescription(origin_description) or "",
                ## This is already XML
            )
            out += """</provenance>"""
    return out
def from_recid(cls, recid, provisional=False):
    """Get user communities specified in recid."""
    from invenio.legacy.search_engine import get_record
    rec = get_record(recid)
    key = ('COMMUNITIES_ID_PREFIX_PROVISIONAL' if provisional
           else 'COMMUNITIES_ID_PREFIX')
    prefix = "%s-" % cfg[key]

    found = []
    for field in rec.get('980', []):
        try:
            # Only subfield 'a' carries community identifiers.
            code, val = field[0][0]
            if code == 'a' and val.startswith(prefix):
                community = cls.query.filter_by(id=val[len(prefix):]).first()
                if community:
                    found.append(community)
        except IndexError:
            pass
    return found
def update(recid, form):
    """Update a record's metadata from the submitted form.

    Validates the domain-specific metadata form; on success applies file
    modifications, diffs the basic fields against the stored record and,
    when anything changed, queues a 'replace' bibupload. Returns a JSON
    response: on success a redirect URL plus waiting page, otherwise the
    re-rendered form with validation errors.
    """
    if not is_record_editable(recid):
        abort(401)
    from invenio.legacy.search_engine import get_record
    from invenio.legacy.bibupload.engine import bibupload
    from invenio.modules.formatter import engine as bibformat_engine
    bfo = bibformat_engine.BibFormatObject(recid)
    # The record's domain selects which metadata form class applies.
    domain = read_basic_metadata_field_from_marc(bfo, 'domain')
    metaclass, meta, meta_form = _get_meta_form_data(domain, form)
    if meta_form.validate_on_submit():
        current_app.logger.info("Updating record {}".format(recid))
        _bibdoc_modify_files(recid, form)
        rec_changes = {}
        add_basic_fields(rec_changes, form, meta)
        updated = False
        rec = get_record(recid)
        # Apply only the keys whose values actually changed.
        for (k,v) in rec_changes.items():
            if rec.get(k) != v:
                current_app.logger.info(
                    "Updating key {} from {} to {}".format(k, rec.get(k),v))
                rec[k] = v
                updated = True
        if updated:
            bibupload(rec, 'replace')
        return jsonify(valid=True,
                       newurl=url_for("record.metadata", recid=recid),
                       html=render_template('record_waitforit.html', recid=recid))
    else:
        # Validation failed: re-render the metadata form with errors.
        html = render_template('b2share-addmeta-table.html', recid=recid,
                               metadata=meta, form=meta_form,
                               domain=metaclass, getattr=getattr)
        return jsonify(valid=False, html=html)
def tarballs_by_recids(recids, sdir):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids.

    @param: recids (string): the record id or ids — a comma-separated list
        where each item is a single id or a 'low-high' range
    @param: sdir (string): where the tarballs should live

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    list_of_ids = []

    if ',' in recids:
        for recid in recids.split(','):
            if '-' in recid:
                low, high = recid.split('-')
                # NOTE: range() excludes the upper bound, matching the
                # original behaviour of range(int(low), int(high)).
                list_of_ids.extend(range(int(low), int(high)))
            else:
                list_of_ids.append(int(recid))
    else:
        if '-' in recids:
            # BUGFIX: the original split the undefined name 'recid'
            # here (NameError); it must split the 'recids' argument.
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            # BUGFIX: the original assigned a bare int ('list_of_ids =
            # int(recid)'), which is not iterable in the loop below.
            list_of_ids = [int(recids)]

    arXiv_ids = []
    for recid in list_of_ids:
        rec = get_record(recid)
        for afieldinstance in record_get_field_instances(rec, tag='037'):
            # Only 037 fields whose $9 source is 'arXiv' carry arXiv ids.
            if 'arXiv' == field_get_subfield_values(afieldinstance, '9')[0]:
                arXiv_id = field_get_subfield_values(afieldinstance, 'a')[0]
                arXiv_ids.append(arXiv_id)

    return tarballs_by_arXiv_id(arXiv_ids, sdir)
def tokenize(self, recID):
    """Return the phrases extracted from the record according to self.rules.

    Each rule is a (tag_to_index, necessary_tag, necessary_value) triple:
    the rule's subfield value is collected from every field instance that
    satisfies the (optional) necessary tag/value condition. An empty
    necessary_tag or necessary_value means "no condition".

    @param recID: id of the record to tokenize
    @return: list of phrase strings; [] when a required key is missing
    """
    phrases = []
    try:
        rec = get_record(recID)

        for rule in self.rules:
            tag_to_index, necessary_tag, necessary_value = rule
            core_tag = tag_to_index[0:3]
            ind = tag_to_index[3:5]
            sub_tag = tag_to_index[5]

            fields = [dict(instance[0]) for instance in
                      record_get_field_instances(rec, core_tag, ind[0], ind[1])]
            for field in fields:
                # IMPROVED: 'in' replaces the Python-2-only dict.has_key().
                tag_condition = (necessary_tag and necessary_tag in field
                                 or necessary_tag == '')
                value_condition = (necessary_value and
                                   field.get(necessary_tag, '') == necessary_value
                                   or necessary_value == '')
                if tag_condition and sub_tag in field and value_condition:
                    phrases.append(field[sub_tag])
        return phrases
        # (Removed an unreachable trailing 'return phrases' that followed
        # the except clause in the original.)
    except KeyError:
        return []
def from_recid(cls, recid, provisional=False):
    """ Get user communities specified in recid """
    from invenio.legacy.search_engine import get_record
    rec = get_record(recid)
    config_key = ('COMMUNITIES_ID_PREFIX_PROVISIONAL' if provisional
                  else 'COMMUNITIES_ID_PREFIX')
    prefix = "%s-" % cfg[config_key]

    matches = []
    for field in rec.get('980', []):
        try:
            # Only subfield 'a' carries community identifiers.
            code, val = field[0][0]
            if code == 'a' and val.startswith(prefix):
                ident = val[len(prefix):]
                community = Community.query.filter_by(id=ident).first()
                if community:
                    matches.append(community)
        except IndexError:
            pass
    return matches
def task_run_core():
    """Assign texkeys to records that are missing one.

    Searches HEP for records without a SPIRES/INSPIRE texkey (035),
    generates a new texkey per record, and uploads the changes in chunks.
    Returns True on completion.
    """
    # All HEP records lacking both texkey provenances in 035.
    recids = perform_request_search(
        p='-035:spirestex -035:inspiretex', cc='HEP')
    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)
        # Check that the record does not have already a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct, tag="035",
                                                   ind1="", ind2=""):
            # Subfield 9 is the provenance; the key itself may sit in
            # subfield z or, failing that, subfield a.
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "z")[0]
            except IndexError:
                try:
                    value = field_get_subfield_values(instance, "a")[0]
                except IndexError:
                    value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                write_message(
                    "INFO: Record %s has already texkey %s" %
                    (recid, value))
        if not has_texkey:
            TexKeySeq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = TexKeySeq.next_value(recid)
            except TexkeyNoAuthorError:
                # Cannot build a texkey without an author/collaboration.
                write_message((
                    "WARNING: Record %s has no first author or "
                    "collaboration") % recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d" %
                          (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)
        task_update_progress("Done %d out of %d." % (count, len(recids)))
        task_sleep_now_if_required()
    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)
    # Finally, index all the records processed
    # FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)
    return True
def task_run_core():
    """Assign texkeys to records that are missing one.

    Performs a search to find HEP records without a texkey, generates a
    new one per record, and uploads the changes in chunks.  Returns True
    on completion.
    """
    recids = perform_request_search(p='-035:spirestex -035:inspiretex',
                                    cc='HEP')
    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)
        # Check that the record does not have already a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct, tag="035",
                                                   ind1="", ind2=""):
            # Subfield 9 holds the provenance; the key is in subfield z,
            # falling back to subfield a.
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "z")[0]
            except IndexError:
                try:
                    value = field_get_subfield_values(instance, "a")[0]
                except IndexError:
                    value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                write_message("INFO: Record %s has already texkey %s" %
                              (recid, value))
        if not has_texkey:
            TexKeySeq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = TexKeySeq.next_value(recid)
            except TexkeyNoAuthorError:
                # A texkey needs a first author or collaboration name.
                write_message(
                    "WARNING: Record %s has no first author or collaboration"
                    % recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d" %
                          (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)
        task_update_progress("Done %d out of %d." % (count, len(recids)))
        task_sleep_now_if_required()
    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)
    # Finally, index all the records processed
    # FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)
    return True
def tarballs_by_recids(recids, sdir, docname=None, doctype=None,
                       docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []
    # Parse the id spec: comma-separated list, dash range (end-exclusive
    # via range()), or a single id.
    list_of_ids = []
    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)
    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]
    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if not doctype and not docname and not docformat:
            # Default mode: collect arXiv identifiers from MARC 037
            # (subfield 9 = source, subfield a = identifier).
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                if len(field_get_subfield_values(afieldinstance, '9')) > 0:
                    if 'arXiv' == field_get_subfield_values(afieldinstance,
                                                            '9')[0]:
                        arXiv_id = field_get_subfield_values(afieldinstance,
                                                             'a')[0]
                        arXiv_ids.append(arXiv_id)
        else:
            # Filtered mode: pick attached files via BibRecDocs,
            # narrowing by any of the three optional criteria.
            bibarchive = BibRecDocs(recid)
            all_files = bibarchive.list_latest_files()
            if doctype:
                all_files = [docfile for docfile in all_files
                             if docfile.get_type() == doctype]
            if docname:
                all_files = [docfile for docfile in all_files
                             if docfile.get_name() == docname]
            if docformat:
                all_files = [docfile for docfile in all_files
                             if docfile.get_format() == docformat]
            local_files.extend([(docfile.get_path(), recid)
                                for docfile in all_files])
    if doctype or docname or docformat:
        # Filtered mode returns (path, recid) pairs directly.
        return local_files
    return tarballs_by_arXiv_id(arXiv_ids, sdir)
def oairepositoryupdater_task():
    """Main business logic code of oai_archive.

    Recomputes OAI set membership for all records, writes the updated
    records as MARCXML to temp files (chunked), and submits bibupload
    tasks for them unless --no_upload is set.  Returns True.
    """
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")
    if report > 1:
        # Report-only mode: print status and stop.
        print_repository_status(verbose=report)
        return True
    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)
    task_update_progress("Fetching records to process")
    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)
    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD,
                                               type='e')
    # Start from everything currently exported; recids still wanted by
    # some set are removed below, leaving the ones to un-export.
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)), verbose=2)
    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD, type='e')
        write_message("%s recids should be in %s. Currently %s are in %s" %
                      (len(should_recids), set_spec, len(current_recids),
                       set_spec), verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" %
                      (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids
    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))
    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)
    if not all_affected_recids:
        write_message("Nothing to do!")
        return True
    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                             prefix='oairepository_' + \
                             time.strftime("%Y%m%d_%H%M%S_",
                                           time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")
    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))
        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}
        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)
        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(record_get_field_values(
            record,
            tag=CFG_OAI_SET_FIELD[:3],
            ind1=CFG_OAI_SET_FIELD[3],
            ind2=CFG_OAI_SET_FIELD[4],
            code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)), verbose=3)
        current_previous_oai_sets = set(record_get_field_values(
            record,
            tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
            ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
            ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
            code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message("Record %s currently doesn't belong anymore to these oai_sets: %s" %
                      (recid, ", ".join(current_previous_oai_sets)),
                      verbose=3)
        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set for _set, _recids in
                               iteritems(recids_for_set)
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)), verbose=3)
        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets) |
            (current_oai_sets - updated_oai_sets))
        write_message("Record %s now doesn't belong anymore to these oai_sets: %s" %
                      (recid, ", ".join(updated_previous_oai_sets)),
                      verbose=3)
        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change . Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!"
                          % recid, verbose=3)
            continue  # Jump to next recid
        write_message("Something has changed for record %s, let's update it!"
                      % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))
        # Build a minimal correction record: controlfield 001 plus the
        # regenerated OAI field.
        record_add_field(new_record, tag="001",
                         controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            # Chunk full: close it, submit an upload, start a new file.
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename, '-n')
                else:
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename)
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                     prefix='oairepository_' + \
                                     time.strftime("%Y%m%d_%H%M%S_",
                                                   time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)
    # Flush the last (possibly partial) chunk.
    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)
    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename)
    else:
        # Nothing written since the last chunk: drop the empty file.
        os.remove(filename)
    return True
def record_get_keywords(record, main_field=bconfig.CFG_MAIN_FIELD,
                        others=bconfig.CFG_OTHER_FIELDS):
    """Collect keywordToken objects from a marc record.

    Loads keywords from the configured main field(s) (e.g. 653) and the
    other fields (e.g. 695__a, the old 'DESY' keywords).  Weight defaults
    to zero occurrences of (0, 0) when no weight subfield is present.

    :param record: int recid (record is fetched) or an already-loaded
        marc record structure
    :return: tuple (found, keywords, marcrec) where found counts only
        main-field keywords, keywords maps KeywordToken -> weights, and
        marcrec is the marc record used
    """
    keywords = {}
    # Normalize both field specs to lists.
    if isinstance(main_field, six.string_types):
        main_field = [main_field]
    if isinstance(others, six.string_types):
        others = [others]

    rec = get_record(record) if isinstance(record, int) else record

    found = 0
    for m_field in main_field:
        tag, ind1, ind2 = _parse_marc_code(m_field)
        for field in rec.get(tag, []):
            kw = ""
            weight = 0
            kw_type = ""
            for sf in field[0]:
                code, value = sf[0], sf[1]
                if code == "a":
                    kw = value
                elif code == "n":
                    weight = int(value)
                elif code == "9":
                    kw_type = value
            if kw:
                found += 1
                keywords[bor.KeywordToken(kw, type=kw_type)] = \
                    [[(0, 0)] * weight]

    if others:
        for field_no in others:
            tag, ind1, ind2 = _parse_marc_code(field_no)
            marker = "f%s" % field_no
            for field in rec.get(tag, []):
                # Take the first 'a' subfield of each field instance only.
                for sf in field[0]:
                    if sf[0] == "a":
                        keywords[bor.KeywordToken(sf[1], type=marker)] = \
                            [[(0, 0)]]
                        break
    return found, keywords, rec
def tarballs_by_recids(recids, sdir, docname=None, doctype=None,
                       docformat=None):
    """
    Resolve a recid specification to tarball locations.

    By default look for files with names matching the report number and
    with source field 'arXiv'. This can be changed with C{docname},
    C{doctype}, C{docformat}

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    # Expand "a,b,c" lists and "low-high" ranges into concrete ids.
    ids = []
    if ',' in recids:
        for token in recids.split(','):
            if '-' in token:
                start, stop = token.split('-')
                ids.extend(range(int(start), int(stop)))
            else:
                ids.append(int(token))
    elif '-' in recids:
        start, stop = recids.split('-')
        ids = range(int(start), int(stop))
    else:
        ids = [int(recids)]

    filtered = doctype or docname or docformat
    arxiv_identifiers = []
    matched_files = []
    for recid in ids:
        rec = get_record(recid)
        if not filtered:
            # Default mode: pull arXiv ids out of the 037 fields.
            for inst in record_get_field_instances(rec, tag='037'):
                sources = field_get_subfield_values(inst, '9')
                if len(sources) > 0 and 'arXiv' == sources[0]:
                    arxiv_identifiers.append(
                        field_get_subfield_values(inst, 'a')[0])
        else:
            # Filtered mode: select attached files by the given criteria.
            docs = BibRecDocs(recid).list_latest_files()
            if doctype:
                docs = [d for d in docs if d.get_type() == doctype]
            if docname:
                docs = [d for d in docs if d.get_name() == docname]
            if docformat:
                docs = [d for d in docs if d.get_format() == docformat]
            matched_files.extend([(d.get_path(), recid) for d in docs])

    if filtered:
        return matched_files
    return tarballs_by_arXiv_id(arxiv_identifiers, sdir)
def record_get_keywords(record, main_field=bconfig.CFG_MAIN_FIELD,
                        others=bconfig.CFG_OTHER_FIELDS):
    """Return a dictionary of keywordToken objects from the marc record.

    Weight is set to (0,0) if no weight can be found.

    This will load keywords from the field 653 and 695__a (which are
    the old 'DESY' keywords)

    :param record: int or marc record, if int - marc record is loaded
        from the database. If you pass record instance, keywords are
        extracted from it
    :return: tuple (found, keywords, marcxml)
        found - int indicating how many main_field keywords were found
            the other fields are not counted
        keywords - standard dictionary of keywordToken objects
        marcrec - marc record object loaded with data
    """
    keywords = {}
    # Accept a single field spec or a list for both parameters.
    if isinstance(main_field, six.string_types):
        main_field = [main_field]
    if isinstance(others, six.string_types):
        others = [others]

    if isinstance(record, int):
        rec = get_record(record)
    else:
        rec = record

    found = 0
    for m_field in main_field:
        tag, ind1, ind2 = _parse_marc_code(m_field)
        for field in rec.get(tag, []):
            keyword = ''
            weight = 0
            type = ''
            # Subfield a = keyword text, n = weight, 9 = keyword type.
            for subfield in field[0]:
                if subfield[0] == 'a':
                    keyword = subfield[1]
                elif subfield[0] == 'n':
                    weight = int(subfield[1])
                elif subfield[0] == '9':
                    type = subfield[1]
            if keyword:
                found += 1
                keywords[bor.KeywordToken(keyword, type=type)] = [[
                    (0, 0) for x in range(weight)
                ]]
    if others:
        for field_no in others:
            tag, ind1, ind2 = _parse_marc_code(field_no)
            type = 'f%s' % field_no
            for field in rec.get(tag, []):
                keyword = ''
                # Only the first 'a' subfield per field instance is used.
                for subfield in field[0]:
                    if subfield[0] == 'a':
                        keyword = subfield[1]
                        keywords[bor.KeywordToken(keyword,
                                                  type=type)] = [[(0, 0)]]
                        break
    return found, keywords, rec
def main():
    """Interactively migrate 980/690 resource-type values into field 337.

    Walks all records in sequence, moves recognized type values (TYPES)
    from 980__a and 690 'ling_resource_type' fields into 337, and asks
    for confirmation before each bibupload replace.
    """
    import invenio.modules.editor.models
    import invenio.modules.editor.views
    from invenio.legacy.search_engine import get_record
    from invenio.legacy.bibrecord import (
        record_delete_field,
        record_add_field,
    )
    from invenio.legacy.bibupload.engine import (
        bibupload,
    )
    # Iterate recids from 1 until get_record() returns nothing.
    for a in itertools.count(1):
        # Keep an untouched copy for the before/after diff printed below.
        old_rec = get_record(a)
        rec = get_record(a)
        if not rec:
            break
        print('Processing record: {0}'.format(a))
        old_337 = [f[0] for f in rec.get('337', [])]
        new_337 = old_337[:]
        new_690 = []
        new_980 = []
        # Move recognized type values out of 980 into 337; keep the rest.
        for f in rec.get('980', []):
            for sf in f[0]:
                if sf[0] == 'a' and sf[1] in TYPES:
                    if [sf] not in new_337:
                        new_337.append([sf])
                else:
                    if [sf] not in new_980:
                        new_980.append([sf])
        # Same for 690 fields tagged 'ling_resource_type'.
        for f in rec.get('690', []):
            sfs = f[0]
            if sfs[0][0] == 'a' and sfs[0][1] == 'ling_resource_type':
                res_type = sfs[1][1]
                if res_type in TYPES:
                    if [('a', res_type)] not in new_337:
                        new_337.append([('a', res_type)])
                else:
                    print("Unrecognized 'ling_resource_type' value! '{0}'".
                          format(res_type))
            else:
                if sfs not in new_690:
                    new_690.append(sfs)
        if not new_337 == old_337:
            # Something moved: rewrite all three fields from scratch.
            record_delete_field(rec, '337')
            record_delete_field(rec, '980')
            record_delete_field(rec, '690')
            for f in new_337:
                record_add_field(rec, '337', subfields=f)
            for f in new_980:
                record_add_field(rec, '980', subfields=f)
            for f in new_690:
                record_add_field(rec, '690', subfields=f)
        # Show the before/after state for manual review.
        print('\nOld 337:')
        pprint(old_rec.get('337'))
        print('New 337:')
        pprint(rec.get('337'))
        print('\nOld 690:')
        pprint(old_rec.get('690'))
        print('New 690:')
        pprint(rec.get('690'))
        print('\nOld 980:')
        pprint(old_rec.get('980'))
        print('New 980:')
        pprint(rec.get('980'))
        if raw_input('Bibupload (y/n)? ') == 'y':
            bibupload(rec, 'replace')
def oairepositoryupdater_task():
    """Main business logic code of oai_archive.

    Recomputes OAI set membership for all records, writes updated
    records as MARCXML in chunks, and submits prioritized bibupload
    tasks for them unless --no_upload is set.  Skips the run entirely if
    previous oairepository uploads are still queued.  Returns True.
    """
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")
    if report > 1:
        # Report-only mode: print status and stop.
        print_repository_status(verbose=report)
        return True
    # Avoid piling up work while earlier uploads are still pending.
    if run_sql(
            "SELECT id FROM schTASK WHERE proc='bibupload:oairepository' AND status='WAITING'"
    ):
        write_message(
            "Previous requests of oairepository still being elaborated. Let's skip this execution."
        )
        return True
    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)
    task_update_progress("Fetching records to process")
    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)
    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD,
                                               type='e')
    # Start from everything currently exported; recids still wanted by
    # some set are removed below, leaving the ones to un-export.
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)), verbose=2)
    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
        write_message(
            "%s recids should be in %s. Currently %s are in %s" %
            (len(should_recids), set_spec, len(current_recids), set_spec),
            verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" %
                      (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids
    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))
    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)
    if not all_affected_recids:
        write_message("Nothing to do!")
        return True
    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                             prefix='oairepository_' + \
                             time.strftime("%Y%m%d_%H%M%S_",
                                           time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")
    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))
        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}
        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)
        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)), verbose=3)
        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message(
            "Record %s currently doesn't belong anymore to these oai_sets: %s"
            % (recid, ", ".join(current_previous_oai_sets)), verbose=3)
        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set
                               for _set, _recids in iteritems(recids_for_set)
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)), verbose=3)
        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets) |
            (current_oai_sets - updated_oai_sets))
        write_message(
            "Record %s now doesn't belong anymore to these oai_sets: %s" %
            (recid, ", ".join(updated_previous_oai_sets)), verbose=3)
        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change . Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!"
                          % recid, verbose=3)
            continue  # Jump to next recid
        write_message("Something has changed for record %s, let's update it!"
                      % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))
        # Build a minimal correction record: controlfield 001 plus the
        # regenerated OAI field.
        record_add_field(new_record, tag="001",
                         controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            # Chunk full: close it, submit a prioritized upload, and
            # start a new temp file.
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename, '-n',
                                              '-Noairepository', '-P', '-1')
                else:
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename,
                                              '-Noairepository', '-P', '-1')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                     prefix='oairepository_' + \
                                     time.strftime("%Y%m%d_%H%M%S_",
                                                   time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)
    # Flush the last (possibly partial) chunk.
    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)
    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename)
    else:
        # Nothing written since the last chunk: drop the empty file.
        os.remove(filename)
    return True