def bibupload(record=None, collection=None, file_prefix="", mode="-c"):
    """
    General purpose function that will write a MARCXML file and call
    bibupload on it.
    """
    if collection is None and record is None:
        return

    (file_out, filename) = open_temp_file(file_prefix)

    if collection is not None:
        file_out.write("<collection>")
        tot = 0
        for rec in collection:
            file_out.write(record_xml_output(rec))
            tot += 1
            if tot == MAX_RECORDS:
                file_out.write("</collection>")
                file_out.close()
                logger.debug("Submitting bibupload %s -n %s" % (mode, filename))
                task_low_level_submission('bibupload', 'openaire', mode,
                                          filename, '-n')

                (file_out, filename) = open_temp_file(file_prefix)
                file_out.write("<collection>")
                tot = 0
        file_out.write("</collection>")
    elif record is not None:
        tot = 1
        file_out.write(record_xml_output(record))

    file_out.close()
    if tot > 0:
        logger.debug("Submitting bibupload %s -n %s" % (mode, filename))
        task_low_level_submission('bibupload', 'openaire', mode, filename, '-n')

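# A minimal usage sketch for the helper above, assuming a legacy Invenio
# environment where open_temp_file, MAX_RECORDS, logger and
# task_low_level_submission are available. The record-building code below is
# hypothetical illustration, not part of the original module.
from invenio.legacy.bibrecord import record_add_field

def _make_demo_record(recid, title):
    # Build a minimal bibrecord structure: a 001 controlfield plus a title.
    rec = {}
    record_add_field(rec, '001', controlfield_value=str(recid))
    record_add_field(rec, '245', subfields=[('a', title)])
    return rec

# Batch two records into one MARCXML file and submit it in correct mode;
# collections longer than MAX_RECORDS are split into several uploads.
bibupload(collection=[_make_demo_record(1, 'First'),
                      _make_demo_record(2, 'Second')],
          file_prefix='demo_', mode='-c')
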
def _prepare_marcxml(recid_a, rn_a, recid_b, rn_b, what_is_a_for_b,
                     what_is_b_for_a, display_in_a=True, display_in_b=True):
    record_a = {}
    record_b = {}
    record_add_field(record_a, "001", controlfield_value=str(recid_a))
    record_add_field(record_a, CFG_OTHER_RELATIONSHIP_ENTRY,
                     ind1=display_in_a and "0" or "1",
                     subfields=[('i', what_is_b_for_a),
                                ('r', rn_b),
                                ('w', str(recid_b))])
    record_add_field(record_b, "001", controlfield_value=str(recid_b))
    record_add_field(record_b, CFG_OTHER_RELATIONSHIP_ENTRY,
                     ind1=display_in_b and "0" or "1",
                     subfields=[('i', what_is_a_for_b),
                                ('r', rn_a),
                                ('w', str(recid_a))])
    return "<collection>\n%s\n%s</collection>" % (record_xml_output(record_a),
                                                  record_xml_output(record_b))

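# Hypothetical call illustrating the helper above: it returns a two-record
# <collection> in which each record carries an $i/$r/$w subfield triple
# pointing at the other record. The identifiers and relation labels here are
# made up for illustration.
xml = _prepare_marcxml(recid_a=10, rn_a='REP-A-001',
                       recid_b=20, rn_b='REP-B-002',
                       what_is_a_for_b='is parent of',
                       what_is_b_for_a='is child of')
# `xml` is now ready to be written to a file and handed to bibupload.
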
def save_xml_record(recid, uid, xml_record='', to_upload=True, to_merge=False,
                    task_name="bibedit", sequence_id=None):
    """Write XML record to file. Default behaviour is to read the record from
    a BibEdit cache file, filter out the unchanged volatile subfields, write
    it back to an XML file and then pass this file to BibUpload.

    @param xml_record: give XML as string instead of reading cache file
    @param to_upload: pass the XML file to BibUpload
    @param to_merge: prepare an XML file for BibMerge to use

    """
    if not xml_record:
        # Read record from cache file.
        cache = get_cache_contents(recid, uid)
        if cache:
            record = cache[2]
            used_changes = cache[4]
            xml_record = record_xml_output(record)
            delete_cache(recid, uid)
            delete_disabled_changes(used_changes)
    else:
        record = create_record(xml_record)[0]

    # clean the record from unfilled volatile fields
    record_strip_empty_volatile_subfields(record)
    record_strip_empty_fields(record)

    # order subfields alphabetically before saving the record
    record_order_subfields(record)

    xml_to_write = wash_for_xml(record_xml_output(record))

    # Write XML file.
    if not to_merge:
        fd, file_path = tempfile.mkstemp(dir=cfg['CFG_BIBEDIT_CACHEDIR'],
                                         prefix="%s_" % cfg['CFG_BIBEDIT_FILENAME'],
                                         suffix="_%s_%s.xml" % (recid, uid))
        f = os.fdopen(fd, 'w')
        f.write(xml_to_write)
        f.close()
    else:
        file_path = '%s_%s.xml' % (_get_file_path(recid, uid),
                                   cfg['CFG_BIBEDIT_TO_MERGE_SUFFIX'])
        xml_file = open(file_path, 'w')
        xml_file.write(xml_to_write)
        xml_file.close()

    user_name = get_user_info(uid)[1]
    if to_upload:
        args = ['bibupload', user_name, '-P', '5', '-r', file_path,
                '-u', user_name]
        if task_name == "bibedit":
            args.extend(['--name', 'bibedit'])
        if sequence_id:
            args.extend(["-I", sequence_id])
        args.extend(['--email-logs-on-error'])
        task_low_level_submission(*args)
    return True

def compare_references(test, a, b):
    from invenio.legacy.bibrecord import create_record, record_xml_output, \
        record_delete_field

    ## Let's normalize records to remove the Invenio refextract signature
    a = create_record(a)[0]
    b = create_record(b)[0]
    record_delete_field(a, '999', 'C', '6')
    a = record_xml_output(a)
    b = record_xml_output(b)
    test.assertEqual(a, b)

def _get_formated_record(record_id, output_format, update_commands, language,
                         outputTags="", checked=True, displayed_records=None):
    """Returns a record in a given format

    @param record_id: the ID of record to format
    @param output_format: an output format code (or short identifier for the
        output format)
    @param update_commands: list of commands used to update record contents
    @param language: the language to use to format the record
    @param outputTags: the tags to be shown to the user
    @param checked: is the record checked by the user?
    @param displayed_records: records to be displayed on a given page

    @returns: record formatted to be displayed or None
    """
    if update_commands and checked:
        # Modify the bibrecord object with the appropriate actions
        updated_record = _get_updated_record(record_id, update_commands)

    textmarc_options = {"aleph-marc": 0, "correct-mode": 1, "append-mode": 0,
                        "delete-mode": 0, "insert-mode": 0, "replace-mode": 0,
                        "text-marc": 1}

    if record_id not in displayed_records:
        return

    old_record = search_engine.get_record(recid=record_id)
    old_record_textmarc = xmlmarc2textmarc.create_marc_record(old_record,
                                                              sysno="",
                                                              options=textmarc_options)
    if "hm" == output_format:
        if update_commands and checked:
            updated_record_textmarc = xmlmarc2textmarc.create_marc_record(updated_record,
                                                                          sysno="",
                                                                          options=textmarc_options)
            result = _get_record_diff(old_record_textmarc,
                                      updated_record_textmarc,
                                      outputTags, record_id)
        else:
            filter_tags = "All tags" not in outputTags and outputTags
            result = ['<pre>']
            for line in old_record_textmarc.splitlines():
                if not filter_tags or line.split()[0].replace('_', '') in outputTags:
                    result.append("%09d " % record_id + line.strip())
            result.append('</pre>')
            result = '\n'.join(result)
    else:
        if update_commands and checked:
            # No coloring of modifications in this case
            xml_record = bibrecord.record_xml_output(updated_record)
        else:
            xml_record = bibrecord.record_xml_output(old_record)
        result = bibformat.format_record(recID=None,
                                         of=output_format,
                                         xml_record=xml_record,
                                         ln=language)
    return result

def upload_amendments(records, holdingpen):
    """ Upload a modified record """

    if task_get_option("no_upload", False) or len(records) == 0:
        return

    xml = '<collection xmlns="http://www.loc.gov/MARC21/slim">'
    for record in records:
        xml += record_xml_output(record)
    xml += "</collection>"

    tmp_file_fd, tmp_file = mkstemp(
        suffix='.xml',
        prefix="bibcheckfile_%s" % time.strftime("%Y-%m-%d_%H:%M:%S"),
        dir=CFG_TMPSHAREDDIR
    )
    os.write(tmp_file_fd, xml)
    os.close(tmp_file_fd)
    os.chmod(tmp_file, 0644)

    if holdingpen:
        flag = "-o"
    else:
        flag = "-r"

    task = task_low_level_submission('bibupload', 'bibcheck', flag, tmp_file)
    write_message("Submitted bibupload task %s" % task)

def test_marc_export(self):
    from invenio.modules.records.api import Record
    from invenio.legacy.bibrecord import create_record, record_xml_output

    rec = Record(json=test_record, master_format='marc')
    # Needed to properly set authors when generating MARC
    first = rec['authors'][0]
    additional = rec['authors'][1:]
    rec['_first_author'] = first
    rec['_additional_authors'] = additional

    output_marc = record_xml_output(
        create_record(rec.legacy_export_as_marc())[0]
    )
    try:
        self.assertEqual(test_marc, output_marc)
    except AssertionError:
        # Print diff in case of errors.
        import difflib
        diff = "".join(difflib.unified_diff(
            test_marc.splitlines(1),
            output_marc.splitlines(1)
        ))
        raise AssertionError(diff)

    form_json = rec.produce('json_for_form')
    for k, v in test_form_json.items():
        self.assertEqual(form_json[k], test_form_json[k])

def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record

    The record itself is not updated, the marc xml of the document with
    updated references is returned

    Parameters:
    * recid: the id of the record
    * uid: the id of the user whose BibEdit cache holds the record
    * txt: references in text mode
    * url: the url of a document to extract references from
    """
    # Parse references
    if txt is not None:
        references_xml = extract_references_from_string_xml(txt,
                                                            is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml)

    dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = get_cache_contents(recid, uid)
    out_xml = None

    references_to_add = record_get_field_instances(references[0],
                                                   tag="999",
                                                   ind1="C",
                                                   ind2="5")
    refextract_status = record_get_field_instances(references[0],
                                                   tag="999",
                                                   ind1="C",
                                                   ind2="6")

    if references_to_add:
        # Replace 999 fields
        record_delete_fields(record, "999")
        record_add_fields(record, "999", references_to_add)
        record_add_fields(record, "999", refextract_status)

        # Update record references
        out_xml = record_xml_output(record)

    return out_xml

def replace_references(recid):
    """Replace references for a record

    The record itself is not updated, the marc xml of the document with
    updated references is returned

    Parameters:
    * recid: the id of the record
    """
    # Parse references
    references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml)

    # Record marc xml
    record = get_record(recid)

    if references[0]:
        fields_to_add = record_get_field_instances(references[0],
                                                   tag='999',
                                                   ind1='%',
                                                   ind2='%')
        # Replace 999 fields
        record_delete_fields(record, '999')
        record_add_fields(record, '999', fields_to_add)

        # Update record references
        out_xml = record_xml_output(record)
    else:
        out_xml = None

    return out_xml

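# Sketch of how the function above is typically used (the record id is made
# up): a None result means refextract produced no 999 reference fields, so
# there is nothing to upload.
new_xml = replace_references(123)
if new_xml is not None:
    # e.g. write new_xml to a temp file and submit a bibupload correct task
    pass
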
def save_xml_record(recid, uid, xml_record='', to_upload=True, to_merge=False):
    """Write XML record to file. Default behaviour is to read the record from
    a BibEdit cache file, filter out the unchanged volatile subfields, write
    it back to an XML file and then pass this file to BibUpload.

    @param xml_record: give XML as string instead of reading cache file
    @param to_upload: pass the XML file to BibUpload
    @param to_merge: prepare an XML file for BibMerge to use

    """
    if not xml_record:
        # Read record from cache file.
        cache = get_cache_file_contents(recid, uid)
        if cache:
            record = cache[2]
            used_changes = cache[4]
            xml_record = record_xml_output(record)
            delete_cache_file(recid, uid)
            delete_disabled_changes(used_changes)
    else:
        record = create_record(xml_record)[0]

    # clean the record from unfilled volatile fields
    record_strip_empty_volatile_subfields(record)
    record_strip_empty_fields(record)

    # order subfields alphabetically before saving the record
    record_order_subfields(record)

    xml_to_write = wash_for_xml(record_xml_output(record))

    # Write XML file.
    if not to_merge:
        file_path = '%s.xml' % _get_file_path(recid, uid)
    else:
        file_path = '%s_%s.xml' % (_get_file_path(recid, uid),
                                   CFG_BIBEDIT_TO_MERGE_SUFFIX)
    xml_file = open(file_path, 'w')
    xml_file.write(xml_to_write)
    xml_file.close()

    user_name = get_user_info(uid)[1]
    if to_upload:
        # Pass XML file to BibUpload.
        task_low_level_submission('bibupload', 'bibedit', '-P', '5', '-r',
                                  file_path, '-u', user_name)
    return True

def create_marcxml(record):
    """Create MARCXML based on type of input variable."""
    from invenio_records.api import Record
    if isinstance(record, six.string_types):
        return record
    elif isinstance(record, Record):
        return record.legacy_export_as_marc()
    else:
        return record_xml_output(record)

def create_marcxml(record):
    """Create MARCXML based on type of input variable."""
    from invenio.modules.records.api import Record
    if isinstance(record, six.string_types):
        return record
    elif isinstance(record, Record):
        return record.legacy_export_as_marc()
    else:
        return record_xml_output(record)

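# The two variants above differ only in the Record import path. A hedged
# illustration of the dispatch (the literal string and empty dict below are
# placeholders, not real records):
create_marcxml('<record>...</record>')  # str: returned unchanged
create_marcxml({})                       # bibrecord dict: record_xml_output()
# A Record instance would go through record.legacy_export_as_marc().
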
def _prepare_marcxml(recid_a, rn_a, recids_and_rns_b, what_is_a_for_b,
                     what_is_b_for_a, display_in_a=True, display_in_b=True,
                     marc_for_a=None, marc_for_b=None, upload_mode='append',
                     consider_empty_p=False):
    output = '<collection>'
    record_a = {}
    record_b = {}
    if what_is_b_for_a is not None:
        marc_tag_for_a, marc_ind1_for_a, marc_ind2_for_a = \
            _prepare_marc(marc_for_a, CFG_OTHER_RELATIONSHIP_ENTRY,
                          display_in_a and "0" or "1")
        record_add_field(record_a, "001", controlfield_value=str(recid_a))
        if upload_mode == 'correct' and not recids_and_rns_b and consider_empty_p:
            # Add empty field in order to account for cases where all
            # linkings are removed by the submitter
            record_add_field(record_a, marc_tag_for_a, ind1=marc_ind1_for_a,
                             ind2=marc_ind2_for_a)
        for recid_b, rn_b in recids_and_rns_b:
            record_add_field(record_a, marc_tag_for_a, ind1=marc_ind1_for_a,
                             ind2=marc_ind2_for_a,
                             subfields=[('i', what_is_b_for_a),
                                        ('r', rn_b),
                                        ('w', str(recid_b))])
        output += record_xml_output(record_a)

    if what_is_a_for_b is not None:
        marc_tag_for_b, marc_ind1_for_b, marc_ind2_for_b = \
            _prepare_marc(marc_for_b, CFG_OTHER_RELATIONSHIP_ENTRY,
                          display_in_b and "0" or "1")
        for recid_b, rn_b in recids_and_rns_b:
            record_b = {}
            record_add_field(record_b, "001", controlfield_value=str(recid_b))
            if upload_mode == 'correct':
                original_linking_fields = _get_record_linking_fields(recid_b,
                                                                     recid_a,
                                                                     marc_tag_for_b,
                                                                     marc_ind1_for_b,
                                                                     marc_ind2_for_b)
                record_add_fields(record_b, marc_tag_for_b,
                                  original_linking_fields)
            record_add_field(record_b, marc_tag_for_b, ind1=marc_ind1_for_b,
                             ind2=marc_ind2_for_b,
                             subfields=[('i', what_is_a_for_b),
                                        ('r', rn_a),
                                        ('w', str(recid_a))])
            output += record_xml_output(record_b)

        # Remove linking in remote records where adequate
        if consider_empty_p:
            unlinked_recids = get_unlinked_records(recid_a, marc_for_b,
                                                   display_in_b, upload_mode,
                                                   recids_and_rns_b)
            for recid_b in unlinked_recids:
                record_b = {}
                record_add_field(record_b, "001",
                                 controlfield_value=str(recid_b))
                original_linking_fields = _get_record_linking_fields(recid_b,
                                                                     recid_a,
                                                                     marc_tag_for_b,
                                                                     marc_ind1_for_b,
                                                                     marc_ind2_for_b)
                if not original_linking_fields:
                    # Add empty field in order to account for cases where all
                    # linkings are removed by the submitter
                    record_add_field(record_b, marc_tag_for_b,
                                     ind1=marc_ind1_for_b,
                                     ind2=marc_ind2_for_b)
                record_add_fields(record_b, marc_tag_for_b,
                                  original_linking_fields)
                output += record_xml_output(record_b)

    output += '</collection>'
    return output

def modify_record_timestamp(revision_xml, last_revision_ts):
    """ Modify tag 005 to add the revision passed as parameter.

    @param revision_xml: marcxml representation of the record to modify
    @type revision_xml: string
    @param last_revision_ts: timestamp to add to 005 tag
    @type last_revision_ts: string

    @return: marcxml with 005 tag modified
    """
    recstruct = create_record(revision_xml)[0]
    record_modify_controlfield(recstruct, "005", last_revision_ts,
                               field_position_local=0)
    return record_xml_output(recstruct)

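# Usage sketch: stamp a revision's 005 controlfield with a known timestamp.
# The MARCXML snippet and timestamp below are illustrative only; this variant
# assumes the record already carries a 005 field to modify.
revision_xml = ('<record>'
                '<controlfield tag="001">123</controlfield>'
                '<controlfield tag="005">20000101000000.0</controlfield>'
                '</record>')
print(modify_record_timestamp(revision_xml, '20240101120000.0'))
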
def output_records(container):
    try:
        strio = codecs.open(sys.argv[2], mode='w', encoding='utf-8')
        _print(TF.YELLOW + "Writing to %s" % (sys.argv[2],) + TF.END)
    except Exception:
        strio = sys.stdout
        _print(TF.YELLOW + "Writing to StdOut" + TF.END)
    stream = Streamer(strio, '\n')

    stream.write(MARCXML_COLLECTION_HEADER)
    for record in container:
        marcxml = record_xml_output(record)
        stream.write(marcxml)
    stream.write(MARCXML_COLLECTION_FOOTER)

def create_marc(form, sub_id, email):
    """
    Generates MARC data used by Invenio from the filled out form, then
    submits it to the Invenio system.
    """
    rec = {}
    recid = create_recid()
    record_add_field(rec, '001', controlfield_value=str(recid))

    add_basic_fields(rec, form, email)
    add_domain_fields(rec, form)
    add_file_info(rec, form, email, sub_id, recid)
    checksum = create_checksum(rec, sub_id)
    add_epic_pid(rec, recid, checksum)

    marc = record_xml_output(rec)

    return recid, marc

def modify_record_timestamp(revision_xml, last_revision_ts):
    """ Modify tag 005 to add the revision passed as parameter.

    @param revision_xml: marcxml representation of the record to modify
    @type revision_xml: string
    @param last_revision_ts: timestamp to add to 005 tag
    @type last_revision_ts: string

    @return: marcxml with 005 tag modified
    """
    recstruct = create_record(revision_xml)[0]
    if "005" in recstruct:
        record_modify_controlfield(recstruct, "005", last_revision_ts,
                                   field_position_local=0)
    else:
        record_add_field(recstruct, '005',
                         controlfield_value=last_revision_ts)
    return record_xml_output(recstruct)

def create_marc(form, sub_id, email, meta):
    """
    Generates MARC data used by Invenio from the filled out form, then
    submits it to the Invenio system.
    """
    rec = {}
    recid = create_recid()
    record_add_field(rec, '001', controlfield_value=str(recid))

    add_basic_fields(rec, form, meta)
    record_add_field(rec, '856', ind1='0', subfields=[('f', email)])
    add_domain_fields(rec, form, meta)
    add_file_info(rec, form, email, sub_id, recid)
    checksum = create_checksum(rec, sub_id)
    add_epic_pid(rec, recid, checksum)
    marc = record_xml_output(rec)

    return recid, marc

def _author_list(obj, eng):
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.api import get_tarball_from_arxiv
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout

    from ..utils import find_matching_files

    identifiers = obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        if tarball is None:
            obj.log.error("No tarball found")
            return
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    # FIXME
    tarball = str(tarball)
    sub_dir, dummy = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

    try:
        untar(tarball, sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' % (
            obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    authors = ""

    for xml_file in xml_files_list:
        xml_file_fd = open(xml_file, "r")
        xml_content = xml_file_fd.read()
        xml_file_fd.close()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors = convert(xml_content, stylesheet)
            authorlist_record = create_records(authors)
            if len(authorlist_record) == 1:
                if authorlist_record[0][0] is None:
                    eng.log.error("Error parsing authorlist record for id: %s" % (
                        identifiers,))
                authorlist_record = authorlist_record[0][0]

            author_xml = record_xml_output(authorlist_record)
            if author_xml:
                updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                    + record_xml_output(authorlist_record) + '</collection>'
                new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
                obj.data["authors"] = new_dict_representation["authors"]
                obj.update_task_results(
                    "authors",
                    [{
                        "name": "authors",
                        "results": new_dict_representation["authors"]
                    }]
                )
                obj.update_task_results(
                    "number_of_authors",
                    [{
                        "name": "number_of_authors",
                        "results": new_dict_representation["number_of_authors"]
                    }]
                )
            break

def author_list(obj, eng):
    """Perform the special authorlist extraction step.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex,
                                                 find_matching_files)
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.errors import WorkflowError
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout

    identifiers = obj.data["system_control_number"]["value"]
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        tarball, pdf = harvest_single(
            obj.data["system_control_number"]["value"], extract_path,
            ["tarball"])
        tarball = str(tarball)
        if tarball is None:
            raise WorkflowError(str(
                "Error harvesting tarball from id: %s %s" % (
                    identifiers, extract_path)),
                eng.uuid,
                id_object=obj.id)
        obj.extra_data["_result"]["tarball"] = tarball

    sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"],
                                  cfg['CFG_TMPDIR'], "")

    try:
        untar(obj.extra_data["_result"]["tarball"], sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' % (
            obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])

    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    authors = ""

    for xml_file in xml_files_list:
        xml_file_fd = open(xml_file, "r")
        xml_content = xml_file_fd.read()
        xml_file_fd.close()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            a_stylesheet = obj.extra_data["repository"]["arguments"].get(
                "a_stylesheet"
            ) or "authorlist2marcxml.xsl"
            authors = convert(xml_content, a_stylesheet)
            authorlist_record = create_records(authors)
            if len(authorlist_record) == 1:
                if authorlist_record[0][0] is None:
                    eng.log.error("Error parsing authorlist record for id: %s" % (
                        identifiers,))
                authorlist_record = authorlist_record[0][0]
            # Convert any LaTeX symbols in authornames
            translate_fieldvalues_from_latex(authorlist_record, '100', code='a')
            translate_fieldvalues_from_latex(authorlist_record, '700', code='a')

            updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                + record_xml_output(authorlist_record) + '</collection>'
            if updated_xml is not None:
                # We store the path to the directory the tarball contents live
                # Read and grab MARCXML from plotextractor run
                new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
                obj.data['authors'] = new_dict_representation["authors"]
                obj.data['number_of_authors'] = new_dict_representation[
                    "number_of_authors"]
                obj.add_task_result("authors",
                                    new_dict_representation["authors"])
                obj.add_task_result("number_of_authors",
                                    new_dict_representation["number_of_authors"])
            break

def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot), verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD, type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid), verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD, type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" % (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e')
        write_message("%s recids should be in %s. Currently %s are in %s" %
                      (len(should_recids), set_spec, len(current_recids), set_spec),
                      verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" % (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" % (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" % (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" % len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should be updated" % (len(all_affected_recids)), verbose=2)
    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                             prefix='oairepository_' +
                             time.strftime("%Y%m%d_%H%M%S_", time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")
    tot = 0

    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." %
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid, verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(record_get_field_values(record,
                                                       tag=CFG_OAI_SET_FIELD[:3],
                                                       ind1=CFG_OAI_SET_FIELD[3],
                                                       ind2=CFG_OAI_SET_FIELD[4],
                                                       code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(record_get_field_values(record,
                                                                tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                                                ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                                                ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                                                code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message("Record %s currently doesn't belong anymore to these oai_sets: %s" %
                      (recid, ", ".join(current_previous_oai_sets)), verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set for _set, _recids in iteritems(recids_for_set)
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)), verbose=3)

        updated_previous_oai_sets = set(_set for _set in
                                        (current_previous_oai_sets - updated_oai_sets) |
                                        (current_oai_sets - updated_oai_sets))
        write_message("Record %s now doesn't belong anymore to these oai_sets: %s" %
                      (recid, ", ".join(updated_previous_oai_sets)), verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change. Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" % recid, verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n')
                else:
                    task_low_level_submission('bibupload', 'oairepository', '-c', filename)
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                     prefix='oairepository_' +
                                     time.strftime("%Y%m%d_%H%M%S_", time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository', '-c', filename)
    else:
        os.remove(filename)

    return True

def perform_request_record(requestType, uid, data):
    """Handle 'major' record related requests.

    Handle retrieving, submitting or cancelling the merging session.
    """
    #TODO add checks before submission and cancel, replace get_bibrecord call
    result = {
        'resultCode': 0,
        'resultText': ''
        }

    recid1 = data["recID1"]
    record1 = _get_record(recid1, uid, result)
    if result['resultCode'] != 0:  # if record not accessible return error information
        return result

    if requestType == 'submit':
        if 'duplicate' in data:
            recid2 = data['duplicate']
            record2 = _get_record_slave(recid2, result, 'recid', uid)
            if result['resultCode'] != 0:  # return in case of error
                return result
            # mark record2 as deleted
            record_add_field(record2, '980', ' ', ' ', '', [('c', 'DELETED')])
            # mark record2 as duplicate of record1
            record_add_field(record2, '970', ' ', ' ', '', [('d', str(recid1))])
            # submit record2 to be deleted
            xml_record2 = record_xml_output(record2)
            save_xml_record(recid2, uid, xml_record2)

            # submit record1
            xml_record1 = record_xml_output(record1)
            save_xml_record(recid1, uid, xml_record1)

            result['resultText'] = 'Records submitted'
            return result

        # submit record1 from cache
        save_xml_record(recid1, uid)

        # Delete cache file if it exists
        if cache_exists(recid1, uid):
            delete_cache_file(recid1, uid)

        result['resultText'] = 'Record submitted'
        return result

    elif requestType == 'cancel':
        delete_cache_file(recid1, uid)
        result['resultText'] = 'Cancelled'
        return result

    recid2 = data["recID2"]
    mode = data['record2Mode']
    record2 = _get_record_slave(recid2, result, mode, uid)
    if result['resultCode'] != 0:  # if record not accessible return error information
        return result

    if requestType == 'getRecordCompare':
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Records compared'

    elif requestType == 'recCopy':
        copy_R2_to_R1(record1, record2)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Record copied'

    elif requestType == 'recMerge':
        merge_record(record1, record2, merge_conflicting_fields=True)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Records merged'

    elif requestType == 'recMergeNC':
        merge_record(record1, record2, merge_conflicting_fields=False)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Records merged'

    else:
        result['resultCode'], result['resultText'] = 1, 'Wrong request type'

    return result

def format_with_format_template(format_template_filename, bfo,
                                verbose=0, format_template_code=None,
                                qid="", extra_context=None):
    """ Format a record given a format template.

    Returns a formatted version of the record represented by bfo,
    in the language specified in bfo, and with the specified format template.

    If format_template_code is provided, the template will not be loaded from
    format_template_filename (but format_template_filename will still be used
    to determine if bft or xsl transformation applies). This allows previewing
    format code without having to save the file on disk.

    :param format_template_filename: the filename of a format template
    :param bfo: the object containing parameters for the current formatting
    :param format_template_code: if not empty, use code as template instead of
                                 reading format_template_filename (used for
                                 previews)
    :param verbose: the level of verbosity from 0 to 9 (0: silent,
                                                        5: errors,
                                                        7: errors and warnings,
                                                        9: errors and warnings,
                                                        stop if error (debug mode))
    @return: formatted text
    """
    if format_template_code is not None:
        format_content = str(format_template_code)
    elif not format_template_filename.endswith("." + CFG_BIBFORMAT_FORMAT_JINJA_TEMPLATE_EXTENSION):
        format_content = get_format_template(format_template_filename)['code']

    if format_template_filename.endswith("." + CFG_BIBFORMAT_FORMAT_JINJA_TEMPLATE_EXTENSION):
        evaluated_format = '<!-- empty -->'
        #try:
        from functools import wraps
        from invenio.modules.records.api import \
            create_record as new_create_record, \
            get_record as new_get_record
        from flask_login import current_user
        from invenio.base.helpers import unicodifier

        def _format_record(recid, of='hb', user_info=current_user, *args, **kwargs):
            from invenio.modules.formatter import format_record
            return format_record(recid, of, user_info=user_info, *args, **kwargs)

        # Fixes unicode problems in Jinja2 templates.
        def encode_utf8(f):
            @wraps(f)
            def wrapper(*args, **kwds):
                return unicodifier(f(*args, **kwds))
            return wrapper

        if bfo.xml_record is None:
            record = new_get_record(bfo.recID)
        else:
            record = new_create_record(bfo.xml_record, master_format='marc')
            bfo.recID = bfo.recID if bfo.recID else 0
        record.__getitem__ = encode_utf8(record.__getitem__)
        record.get = encode_utf8(record.get)

        evaluated_format = render_template_to_string(
            'format/record/' + format_template_filename,
            recid=bfo.recID,
            record=record,
            format_record=_format_record,
            qid=qid,
            bfo=bfo, **(extra_context or {})).encode('utf-8')
        needs_2nd_pass = False
    else:
        from invenio.modules.records.api import get_record as new_get_record

        #.xsl
        if bfo.xml_record:
            # bfo was initialized with a custom MARCXML
            xml_record = '<?xml version="1.0" encoding="UTF-8"?>\n' + \
                         record_xml_output(bfo.record)
        else:
            # Fetch MARCXML. On-the-fly xm if we are now formatting in xm
            xml_record = '<?xml version="1.0" encoding="UTF-8"?>\n' + \
                         new_get_record(bfo.recID).legacy_export_as_marc()

        # Transform MARCXML using stylesheet
        evaluated_format = xslt.format(xml_record,
                                       template_source=format_content).decode('utf-8')
        needs_2nd_pass = False

    return evaluated_format, needs_2nd_pass

def perform_request_record(requestType, uid, data):
    """Handle 'major' record related requests.

    Handle retrieving, submitting or cancelling the merging session.
    """
    #TODO add checks before submission and cancel, replace get_bibrecord call
    result = {
        'resultCode': 0,
        'resultText': ''
        }

    recid1 = data["recID1"]
    record1 = _get_record(recid1, uid, result)
    if result['resultCode'] != 0:  # if record not accessible return error information
        return result

    if requestType == 'submit':
        if 'duplicate' in data:
            recid2 = data['duplicate']
            record2 = _get_record_slave(recid2, result, 'recid', uid)
            if result['resultCode'] != 0:  # return in case of error
                return result

            (errcode, message) = check_doi_status_after_merge(
                data["recID1"], data['duplicate'],
                record1, record2,
                record2_marked_as_duplicate_p=data.has_key('duplicate'),
                submit_confirmed_p=data.get('additional_data',
                                            {'confirmed_submit': False}).get('confirmed_submit', False))
            if errcode:
                result['resultCode'] = errcode
                result['resultText'] = message
                return result

            # mark record2 as deleted
            record_add_field(record2, '980', ' ', ' ', '', [('c', 'DELETED')])
            # mark record2 as duplicate of record1
            record_add_field(record2, '970', ' ', ' ', '', [('d', str(recid1))])
            # add recid of deleted record to master record
            record_add_field(record1, '981', ' ', ' ', '', [('a', str(recid2))])

            # To ensure updates happen in order, use a seq id
            sequence_id = str(random.randrange(1, 4294967296))

            # submit record2 to be deleted
            xml_record2 = record_xml_output(record2)
            save_xml_record(recid2, uid, xml_record2, task_name="bibmerge",
                            sequence_id=sequence_id)

            # submit record1
            xml_record1 = record_xml_output(record1)
            save_xml_record(recid1, uid, xml_record1, task_name="bibmerge",
                            sequence_id=sequence_id)

            # Delete cache file if it exists
            if cache_exists(recid1, uid):
                delete_cache(recid1, uid)

            result['resultText'] = 'Records submitted'
            return result

        (errcode, message) = check_doi_status_after_merge(
            data["recID1"], data["recID2"],
            record1, None,
            submit_confirmed_p=data.get('additional_data',
                                        {'confirmed_submit': False}).get('confirmed_submit', False))
        if errcode:
            result['resultCode'] = errcode
            result['resultText'] = message
            return result

        # submit record1 from cache
        save_xml_record(recid1, uid, task_name="bibmerge")

        # Delete cache file if it exists
        if cache_exists(recid1, uid):
            delete_cache(recid1, uid)

        result['resultText'] = 'Record submitted'
        return result

    elif requestType == 'cancel':
        delete_cache(recid1, uid)
        result['resultText'] = 'Cancelled'
        return result

    recid2 = data["recID2"]
    mode = data['record2Mode']
    record2 = _get_record_slave(recid2, result, mode, uid)
    if result['resultCode'] != 0:  # if record not accessible return error information
        return result

    if requestType == 'getRecordCompare':
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Records compared'

    elif requestType == 'recCopy':
        copy_R2_to_R1(record1, record2)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Record copied'

    elif requestType == 'recMerge':
        merge_record(record1, record2, merge_conflicting_fields=True)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Records merged'

    elif requestType == 'recMergeNC':
        merge_record(record1, record2, merge_conflicting_fields=False)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Records merged'

    else:
        result['resultCode'], result['resultText'] = 1, 'Wrong request type'

    return result

def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    if run_sql("SELECT id FROM schTASK WHERE proc='bibupload:oairepository' AND status='WAITING'"):
        write_message("Previous requests of oairepository still being elaborated. Let's skip this execution.")
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot), verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD, type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid), verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD, type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" % (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e')
        write_message("%s recids should be in %s. Currently %s are in %s" %
                      (len(should_recids), set_spec, len(current_recids), set_spec),
                      verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" % (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" % (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" % (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" % len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should be updated" % (len(all_affected_recids)), verbose=2)
    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                             prefix='oairepository_' +
                             time.strftime("%Y%m%d_%H%M%S_", time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")
    tot = 0

    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." %
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid, verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message("Record %s currently doesn't belong anymore to these oai_sets: %s" %
                      (recid, ", ".join(current_previous_oai_sets)), verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set for _set, _recids in iteritems(recids_for_set)
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)), verbose=3)

        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets) |
            (current_oai_sets - updated_oai_sets))
        write_message("Record %s now doesn't belong anymore to these oai_sets: %s" %
                      (recid, ", ".join(updated_previous_oai_sets)), verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change. Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" % recid, verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository', '-c',
                                              filename, '-n', '-Noairepository',
                                              '-P', '-1')
                else:
                    task_low_level_submission('bibupload', 'oairepository', '-c',
                                              filename, '-Noairepository',
                                              '-P', '-1')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                     prefix='oairepository_' +
                                     time.strftime("%Y%m%d_%H%M%S_", time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository', '-c', filename)
    else:
        os.remove(filename)

    return True

def tweet_to_record(tweet, query):
    """
    Transform a tweet into a record.

    @note: you may want to highly customize this.
    """
    rec = {}
    ## Let's normalize the body of the tweet.
    text = tweet.text.encode('UTF-8')
    text = text.replace('&gt;', '>')
    text = text.replace('&lt;', '<')
    text = text.replace('&quot;', "'")
    text = text.replace('&amp;', '&')

    ## Let's add the creation date
    try:
        creation_date = time.strptime(tweet.created_at,
                                      '%a, %d %b %Y %H:%M:%S +0000')
    except ValueError:
        creation_date = time.strptime(tweet.created_at,
                                      '%a %b %d %H:%M:%S +0000 %Y')
    record_add_field(rec, '260__c',
                     time.strftime('%Y-%m-%dZ%H:%M:%ST', creation_date))

    ## Let's add the Tweet ID
    record_add_field(rec, '970', subfields=[('a', str(tweet.id))])

    ## Let's add the body of the tweet as an abstract
    record_add_field(rec, '520', subfields=[('a', text)])

    ## Let's re-add the body of the tweet as a title.
    record_add_field(rec, '245', subfields=[('a', text)])

    ## Let's fetch information about the user
    try:
        user = _TWITTER_API.GetUser(tweet.from_user)

        ## Let's add the user name as author of the tweet
        record_add_field(rec, '100',
                         subfields=[('a', str(user.name.encode('UTF-8')))])

        ## Let's fetch the icon of the user profile, and let's upload it as
        ## an image (and an icon of itself)
        record_add_field(rec, 'FFT',
                         subfields=[('a', user.profile.image_url.encode('UTF-8')),
                                    ('x', user.profile.image_url.encode('UTF-8'))])
    except Exception as err:
        write_message("WARNING: issue when fetching the user: %s" % err,
                      stream=sys.stderr)

    if hasattr(tweet, 'iso_language_code'):
        ## Let's add the language of the Tweet if available (this also depends
        ## on the kind of Twitter API call we used)
        record_add_field(rec, '045',
                         subfields=[('a', tweet.iso_language_code.encode('UTF-8'))])

    ## Let's tag this record as a TWEET so that later we can build a collection
    ## out of these records.
    record_add_field(rec, '980', subfields=[('a', 'TWEET'), ('b', query)])

    ## Some smart manipulations: let's parse out URLs and tags from the body
    ## of the Tweet.
    for url in _RE_GET_HTTP.findall(text):
        url = url[0]
        record_add_field(rec, '856', '4', subfields=[('u', url)])

    for tag in _RE_TAGS.findall(text):
        ## And here we add the keywords.
        record_add_field(rec, '653', '1',
                         subfields=[('a', tag), ('9', 'TWITTER')])

    ## Finally we shall serialize everything to MARCXML
    return record_xml_output(rec)

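# _RE_GET_HTTP and _RE_TAGS are module-level globals not shown in this
# snippet. Hypothetical definitions consistent with the loops above -- note
# that _RE_GET_HTTP needs a second capture group so that findall() yields
# tuples and the `url = url[0]` unpacking works:
import re

_RE_GET_HTTP = re.compile(r'(https?://\S+)(\s|$)')
_RE_TAGS = re.compile(r'#(\w+)')
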
def author_list(obj, eng):
    """
    Performs the special authorlist extraction step
    (Mostly INSPIRE/CERN related).

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex,
                                                 find_matching_files)
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.cli import get_defaults

    identifiers = obj.data["system_number_external"]["value"]
    bibtask.task_sleep_now_if_required()
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = plotextractor_getter.make_single_directory(cfg['CFG_TMPSHAREDDIR'],
                                                                  eng.uuid)
        tarball, pdf = plotextractor_getter.harvest_single(
            obj.data["system_number_external"]["value"], extract_path,
            ["tarball"])
        tarball = str(tarball)
        if tarball is None:
            raise workflows_error.WorkflowError(
                str("Error harvesting tarball from id: %s %s" %
                    (identifiers, extract_path)),
                eng.uuid,
                id_object=obj.id)
        obj.extra_data["_result"]["tarball"] = tarball

    sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"],
                                  cfg['CFG_TMPDIR'], "")

    try:
        untar(obj.extra_data["_result"]["tarball"], sub_dir)
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' %
                      (obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])

    authors = ""

    for xml_file in xml_files_list:
        xml_file_fd = open(xml_file, "r")
        xml_content = xml_file_fd.read()
        xml_file_fd.close()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            authors += match[0]

    # Generate file to store conversion results
    if authors != '':
        authors = convert(authors, "authorlist2marcxml.xsl")
        authorlist_record = create_records(authors)
        if len(authorlist_record) == 1:
            if authorlist_record[0][0] is None:
                eng.log.error("Error parsing authorlist record for id: %s" %
                              (identifiers,))
            authorlist_record = authorlist_record[0][0]
        # Convert any LaTeX symbols in authornames
        translate_fieldvalues_from_latex(authorlist_record, '100', code='a')
        translate_fieldvalues_from_latex(authorlist_record, '700', code='a')

        updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
            + record_xml_output(authorlist_record) + '</collection>'
        if updated_xml is not None:
            # We store the path to the directory the tarball contents live
            # Read and grab MARCXML from plotextractor run
            new_dict_representation = records_api.create_record(
                updated_xml, master_format="marc").dumps()
            obj.data['authors'] = new_dict_representation["authors"]
            obj.data['number_of_authors'] = new_dict_representation["number_of_authors"]
            obj.add_task_result("authors",
                                new_dict_representation["authors"])
            obj.add_task_result("number_of_authors",
                                new_dict_representation["number_of_authors"])
