class oaiharvest_record(object):

    """Sample workflow for OAI harvesting with oai_dc metadataprefix.

    This workflow assumes the incoming data to be a string representation
    of OAI_DC XML.

    NOTE: This workflow blindly inserts records into the database.
    """

    # Label shown for objects of this workflow in Holding Pen.
    object_type = "OAI harvest"

    # Declarative task list executed in order by the workflow engine.
    workflow = [
        # Convert OAI_DC XML -> MARCXML
        # FIXME Remove this step when we have one-step OAI_DC -> JSON
        convert_record("oaidc2marcxml.xsl"),
        # Convert MARCXML -> JSON
        convert_record_to_json,
        # Try to match the record with the database
        # FIXME Add more identifiers to match. By default only control_number.
        workflow_if(quick_match_record(), True),
        [
            # Create record in the database using invenio_records
            create_record,
        ],
        workflow_else,
        [
            log_info("Record is already in the database"),
        ],
    ]
class oaiharvest_record_approval(RecordWorkflow):

    """Workflow run for each record OAI harvested."""

    # Label shown for objects of this workflow in Holding Pen.
    object_type = "OAI harvest"

    # Declarative task list executed in order by the workflow engine.
    workflow = [
        convert_record_with_repository(),
        convert_record_to_bibfield(),
        # NOTE(review): here the condition is the bare callable
        # ``quick_match_record`` while sibling workflows call it as
        # ``quick_match_record()`` -- confirm which form the engine expects.
        workflow_if(quick_match_record, True),
        [
            # Halt the record for manual approval in Holding Pen.
            approve_record,
            workflow_if(was_approved),
            [
                upload_step,
            ],
            workflow_else,
            [log_info("Record has been rejected")]
        ],
        workflow_else,
        [
            log_info("Record is already in the database"),
        ],
    ]
class oaiharvest_record_post_process(RecordWorkflow):

    """Workflow run for each record OAI harvested.

    Each branch runs only when the corresponding single-letter post-process
    flag was selected for the harvest (see ``post_process_selected``).
    """

    # Label shown for objects of this workflow in Holding Pen.
    object_type = "OAI harvest"

    workflow = [
        # Flag "c": convert and check the record.
        workflow_if(post_process_selected("c")),
        [
            convert_record_with_repository(),
            check_record,
            convert_record_to_bibfield(),
        ],
        # Flag "t": download the arXiv fulltext.
        workflow_if(post_process_selected("t")),
        [
            arxiv_fulltext_download,
        ],
        # Flag "p": extract plots.
        workflow_if(post_process_selected("p")),
        [
            plot_extract(),
        ],
        # Flag "a": extract the author list.
        workflow_if(post_process_selected("a")),
        [
            author_list,
        ],
        # Flag "r": extract references.
        workflow_if(post_process_selected("r")),
        [
            refextract,
        ],
        # Flag "f": apply the filter step.
        workflow_if(post_process_selected("f")),
        [
            filter_step,
        ],
        # Flag "u": upload the record.
        workflow_if(post_process_selected("u")),
        [
            upload_step,
        ],
    ]
class oaiharvest_record_approval(object):

    """Sample workflow for OAI harvesting with oai_dc metadataprefix.

    This workflow assumes the incoming data to be a string representation
    of OAI_DC XML.

    NOTE: This workflow makes use of Holding Pen for record approval.
    """

    # Label shown for objects of this workflow in Holding Pen.
    object_type = "OAI harvest"

    # Maps logical field names to dotted JSON record paths used by the
    # Holding Pen column helpers below.
    mapping = {
        "title": "title_statement.title",
        "subject": "subject_added_entry_topical_term.topical_term_or_geographic_name_entry_element",
        "abstract": "summary.summary",
        "ids": "system_control_number.system_control_number"
    }

    workflow = [
        # Convert OAI_DC XML -> MARCXML
        # FIXME Remove this step when we have one-step OAI_DC -> JSON
        convert_record("oaidc2marcxml.xsl"),
        # Convert MARCXML -> JSON
        convert_record_to_json,
        # Try to match the record with the database
        # FIXME Add more identifiers to match. By default only control_number.
        workflow_if(quick_match_record(), True),
        [
            # Halt this record to be approved in the Holding Pen
            approve_record,
            # Check user action taken
            workflow_if(was_approved),
            [
                # Create record in the database using invenio_records
                create_record,
            ],
            workflow_else,
            [log_info("Record has been rejected")]
        ],
        workflow_else,
        [
            log_info("Record is already in the database"),
        ],
    ]

    @staticmethod
    def get_title(bwo, **kwargs):
        """Return the value to put in the title column of HoldingPen."""
        if isinstance(bwo.data, six.string_types):
            # Probably XML, nothing to do here
            return "No title extracted"
        record = Record(bwo.data)
        # First value at the mapped title path.
        return record[oaiharvest_record_approval.mapping["title"]][0]

    @staticmethod
    def get_description(bwo, **kwargs):
        """Return the value to put in the description column of HoldingPen."""
        if isinstance(bwo.data, six.string_types):
            # Probably XML, nothing to do here
            return "Unformatted: <pre>{0}</pre>".format(bwo.data[:100])
        record = Record(bwo.data)
        # NOTE(review): the differing index depths ([0][0] vs [0]) presumably
        # reflect how the converted JSON nests each field -- confirm against
        # the oaidc2marcxml conversion output.
        abstract = record[oaiharvest_record_approval.mapping["abstract"]][0][0]
        categories = record[oaiharvest_record_approval.mapping["subject"]][0]
        identifiers = record[oaiharvest_record_approval.mapping["ids"]][0]
        return render_template('oaiharvester/holdingpen/oai_record.html',
                               object=bwo,
                               categories=categories,
                               abstract=abstract,
                               identifiers=identifiers)

    @staticmethod
    def get_additional(bwo, **kwargs):
        """Return the value to put in the additional column of HoldingPen."""
        return ""

    @staticmethod
    def formatter(obj, **kwargs):
        """Format the object."""
        return "<pre>{0}</pre>".format(obj.data)

    @staticmethod
    def get_sort_data(obj, **kwargs):
        """Return a dictionary of key values useful for sorting in Holding Pen."""
        return {}
class process_record_arxiv(WorkflowBase):

    """Processing workflow for a single arXiv record.

    The records have been harvested via oaiharvester.
    """

    # Label shown for objects of this workflow in Holding Pen.
    object_type = "arXiv"

    workflow = [
        convert_record_with_repository("oaiarXiv2inspire_nofilter.xsl"),
        convert_record_to_bibfield,
        workflow_if(quick_match_record, True),
        [
            plot_extract(["latex"]),
            arxiv_fulltext_download,
            classify_paper_with_oaiharvester(
                taxonomy="HEPont",
                output_mode="dict",
            ),
            refextract,
            author_list,
            inspire_filter_custom(fields=["report_number", "arxiv_category"],
                                  custom_widgeted="*",
                                  custom_accepted="gr",
                                  action="inspire_approval"),
            workflow_if(was_approved),
            [
                upload_step,
            ],
            workflow_else,
            [log_info("Record rejected")]
        ],
        workflow_else,
        [
            log_info("Record already into database"),
        ],
    ]

    @staticmethod
    def get_title(bwo):
        """Return a display title assembled from the record's title field(s)."""
        extracted_title = []
        record = bwo.get_data()
        if hasattr(record, "get") and "title" in record:
            if isinstance(record["title"], str):
                extracted_title = [record["title"]]
            else:
                # Mapping of sub-keys -> title values; collect the values.
                extracted_title = [record["title"][a_title]
                                   for a_title in record["title"]]
        else:
            extracted_title = [" No title"]
        # Join with a trailing space per title (preserves original output).
        return "".join("{0} ".format(i) for i in extracted_title)

    @staticmethod
    def get_description(bwo):
        """Return the description column content for Holding Pen."""
        record = bwo.get_data()
        from invenio.modules.records.api import Record
        try:
            identifiers = Record(record.dumps()).persistent_identifiers
            final_identifiers = [i['value'] for i in identifiers]
        except Exception:
            # Fall back to whatever external system number is present.
            if hasattr(record, "get"):
                final_identifiers = [
                    record.get("system_number_external", {}).get("value",
                                                                 'No ids')
                ]
            else:
                final_identifiers = [' No ids']

        task_results = bwo.get_tasks_results()
        results = []
        if 'bibclassify' in task_results:
            try:
                result = task_results['bibclassify'][0]['result']
                fast_mode = result.get('fast_mode', False)
                result = result['dict']['complete_output']
                result_string = "<strong></br>Bibclassify result:"\
                                "</br></strong>"\
                                "Number of Core keywords: \t%s</br>"\
                                "PACS: \t%s</br>"\
                                % (len(result['Core keywords']),
                                   len(result['Field codes']))
                if fast_mode:
                    result_string += "(This task run at fast mode"\
                                     " taking into consideration"\
                                     " only the title and the abstract)"
                results.append(result_string)
            except (KeyError, IndexError):
                pass

        categories = []
        if hasattr(record, "get"):
            # Determine which field name carries the subject terms.
            if 'subject' in record:
                lookup = ["subject", "term"]
            elif "subject_term" in record:
                # BUGFIX: was ``elif "subject_term":`` -- a bare non-empty
                # string literal is always truthy, so this branch fired even
                # when the record had no such field and the ``lookup = None``
                # branch below was unreachable.
                lookup = ["subject_term", "term"]
            else:
                lookup = None
            if lookup:
                primary, secondary = lookup
                category_list = record.get(primary, [])
                if isinstance(category_list, dict):
                    category_list = [category_list]
                for subject in category_list:
                    category = subject[secondary]
                    if len(subject) == 2:
                        # NOTE: indexing dict.keys() is Python 2 only; the
                        # "other" key (not the term) is taken as the source.
                        if subject.keys()[1] == secondary:
                            source_list = subject[subject.keys()[0]]
                        else:
                            source_list = subject[subject.keys()[1]]
                    else:
                        try:
                            source_list = subject['source']
                        except KeyError:
                            source_list = ""
                    # Only keep categories whose source is INSPIRE.
                    if source_list.lower() == 'inspire':
                        categories.append(category)

        from flask import render_template
        return render_template('workflows/styles/harvesting_record.html',
                               categories=categories,
                               identifiers=final_identifiers,
                               results=results)

    @staticmethod
    def formatter(bwo, **kwargs):
        """Return a formatted version of the data."""
        from invenio.modules.formatter.engine import format_record
        data = bwo.get_data()
        if not data:
            return ''
        formatter = kwargs.get("formatter", None)
        # Renamed local (was ``format``) to avoid shadowing the builtin.
        of = kwargs.get("format", None)
        if formatter:
            # A separate formatter is supplied
            return formatter(data)
        from invenio.modules.records.api import Record
        if isinstance(data, collections.Mapping):
            # Dicts are cool on its own, but maybe its SmartJson (record)
            try:
                data = Record(data.dumps()).legacy_export_as_marc()
            except (TypeError, KeyError):
                # Maybe not, submission?
                return data
        if isinstance(data, string_types):
            # Its a string type, lets try to convert
            if of:
                # We can try formatter!
                # If already XML, format_record does not like it.
                if of != 'xm':
                    try:
                        return format_record(recID=None,
                                             of=of,
                                             xml_record=data)
                    except TypeError:
                        # Wrong kind of type
                        pass
                else:
                    # So, XML then
                    from xml.dom.minidom import parseString
                    try:
                        pretty_data = parseString(data)
                        return pretty_data.toprettyxml()
                    except TypeError:
                        # Probably not proper XML string then
                        return "Data cannot be parsed: %s" % (data, )
                    except Exception:
                        # Some other parsing error
                        pass
            # Just return raw string
            return data
        if isinstance(data, set):
            return list(data)
        # Not any of the above types. How juicy!
        return data
class oaiharvest_harvest_repositories(RecordWorkflow):

    """A workflow for use with OAI harvesting in BibSched."""

    # This object supervises the harvest; the records themselves are
    # processed by spawned sub-workflows.
    object_type = "workflow"
    # Name of the per-record workflow started for each harvested record.
    record_workflow = "oaiharvest_record_post_process"

    workflow = [
        init_harvesting,
        # Loop over every configured repository.
        foreach(get_repositories_list(), "repository"),
        [
            write_something_generic("Harvesting",
                                    [task_update_progress, write_message]),
            harvest_records,
            # Loop over the files produced by the harvest.
            foreach(get_obj_extra_data_key("harvested_files_list")),
            [
                write_something_generic("Starting sub-workflows for file",
                                        [task_update_progress, write_message]),
                # Loop over the individual records in each file.
                foreach(get_records_from_file()),
                [
                    workflow_if(filtering_oai_pmh_identifier),
                    [
                        # Throttle: only spawn when fewer than 10 workflows run.
                        workflow_if(num_workflow_running_greater(10), neg=True),
                        [
                            start_async_workflow(
                                preserve_data=True,
                                preserve_extra_data_keys=["repository",
                                                          "oai_identifier"],
                                get_workflow_from=
                                get_workflow_from_engine_definition,
                            ),
                        ],
                        workflow_else,
                        [
                            # Too many running: wait for one, then spawn.
                            write_something_generic(
                                ["Waiting for workflows to finish"],
                                [task_update_progress, write_message]),
                            wait_for_a_workflow_to_complete(10.0),
                            start_async_workflow(
                                preserve_data=True,
                                preserve_extra_data_keys=["repository",
                                                          "oai_identifier"],
                                get_workflow_from=
                                get_workflow_from_engine_definition,
                            ),
                        ],
                    ],
                ],
                end_for
            ],
            end_for
        ],
        end_for,
        write_something_generic(
            ["Processing: ", get_nb_workflow_created, " records"],
            [task_update_progress, write_message]),
        # Drain: wait for every spawned workflow to finish.
        simple_for(0, get_nb_workflow_created, 1),
        [
            wait_for_a_workflow_to_complete(1.0),
            write_something_generic([get_workflows_progress, "%% complete"],
                                    [task_update_progress, write_message]),
        ],
        end_for,
        workflows_reviews(stop_if_error=True),
        update_last_update(get_repositories_list())
    ]

    @staticmethod
    def get_description(bwo):
        """Return description of object."""
        from flask import render_template
        identifiers = None
        extra_data = bwo.get_extra_data()
        if 'options' in extra_data and 'identifiers' in extra_data["options"]:
            identifiers = extra_data["options"]["identifiers"]
        results = bwo.get_tasks_results()
        if 'review_workflow' in results:
            result_progress = results['review_workflow'][0]['result']
        else:
            result_progress = {}
        current_task = extra_data['_last_task_name']
        return render_template("workflows/styles/harvesting_description.html",
                               identifiers=identifiers,
                               result_progress=result_progress,
                               current_task=current_task)

    @staticmethod
    def get_title(bwo):
        """Return title of object."""
        return "Summary of OAI harvesting from: {0}".format(
            bwo.get_extra_data()["repository"]["name"])

    @staticmethod
    def formatter(bwo):
        """Return description of object."""
        from flask import render_template
        from invenio.modules.workflows.models import BibWorkflowObject
        from invenio.modules.workflows.registry import workflows
        identifiers = None
        extra_data = bwo.get_extra_data()
        if 'options' in extra_data and 'identifiers' in extra_data["options"]:
            identifiers = extra_data["options"]["identifiers"]
        results = bwo.get_tasks_results()
        if 'review_workflow' in results:
            result_progress = results['review_workflow'][0]['result']
        else:
            result_progress = {}
        current_task = extra_data['_last_task_name']
        # Resolve every spawned child object to (id, title) for display.
        related_objects = []
        for id_object in extra_data.get("objects_spawned", list()):
            spawned_object = BibWorkflowObject.query.get(id_object)
            if spawned_object:
                workflow = workflows.get(spawned_object.get_workflow_name())
                related_objects.append(
                    (spawned_object.id,
                     workflow.get_title(spawned_object) or "No title"))
            else:
                # Child object no longer exists in the database.
                related_objects.append((id_object, None))
        return render_template("workflows/styles/harvesting_description.html",
                               identifiers=identifiers,
                               result_progress=result_progress,
                               current_task=current_task,
                               related_objects=related_objects)
class ingestion_arxiv_math(WorkflowBase):

    """Main workflow for harvesting arXiv via OAI-PMH (oaiharvester)."""

    # This object supervises the harvest; records are processed by the
    # spawned "process_record_arxiv" sub-workflows.
    object_type = "workflow"

    workflow = [
        write_something_generic("Initialization",
                                [task_update_progress, write_message]),
        init_harvesting,
        write_something_generic("Starting",
                                [task_update_progress, write_message]),
        # Loop over every configured repository.
        foreach(get_repositories_list(), "repository"),
        [
            write_something_generic("Harvesting",
                                    [task_update_progress, write_message]),
            harvest_records,
            write_something_generic("Reading Files",
                                    [task_update_progress, write_message]),
            # Loop over the files produced by the harvest.
            foreach(get_obj_extra_data_key("harvested_files_list")),
            [
                write_something_generic("Creating Workflows",
                                        [task_update_progress, write_message]),
                # Loop over the individual records in each file.
                foreach(get_records_from_file()),
                [
                    workflow_if(filtering_oai_pmh_identifier),
                    [
                        # Throttle: only spawn when fewer than 10 workflows run.
                        workflow_if(num_workflow_running_greater(10), neg=True),
                        [
                            start_async_workflow("process_record_arxiv",
                                                 preserve_data=True,
                                                 preserve_extra_data_keys=["repository"]),
                            write_something_generic(
                                ["Workflow started: ",
                                 get_nb_workflow_created],
                                [task_update_progress, write_message]),
                        ],
                        workflow_else,
                        [
                            # Too many running: wait for one, then spawn.
                            write_something_generic(
                                ["Max simultaneous workflows reached: ",
                                 "Waiting for one to finish"],
                                [task_update_progress, write_message]),
                            wait_for_a_workflow_to_complete(0.05),
                            start_async_workflow("process_record_arxiv",
                                                 preserve_data=True,
                                                 preserve_extra_data_keys=["repository"]),
                            write_something_generic(["Workflow started :",
                                                     get_nb_workflow_created,
                                                     " "],
                                                    [task_update_progress,
                                                     write_message]),
                        ],
                    ],
                ],
                end_for
            ],
            end_for
        ],
        end_for,
        write_something_generic(["Processing : ",
                                 get_nb_workflow_created,
                                 " records"],
                                [task_update_progress, write_message]),
        # Drain: wait for every spawned workflow to finish.
        simple_for(0, get_nb_workflow_created, 1),
        [
            wait_for_a_workflow_to_complete(),
            write_something_generic([get_workflows_progress,
                                     " % Complete"],
                                    [task_update_progress, write_message]),
        ],
        end_for,
        write_something_generic("Finishing",
                                [task_update_progress, write_message]),
        workflows_reviews(stop_if_error=True),
        update_last_update(get_repositories_list())
    ]

    @staticmethod
    def get_description(bwo):
        """Return description of object."""
        from flask import render_template
        identifiers = None
        extra_data = bwo.get_extra_data()
        if 'options' in extra_data and 'identifiers' in extra_data["options"]:
            identifiers = extra_data["options"]["identifiers"]
        results = bwo.get_tasks_results()
        if 'review_workflow' in results:
            result_progress = results['review_workflow'][0]['result']
        else:
            result_progress = {}
        current_task = extra_data['_last_task_name']
        return render_template("workflows/styles/harvesting_description.html",
                               identifiers=identifiers,
                               result_progress=result_progress,
                               current_task=current_task)

    @staticmethod
    def get_title(bwo):
        """Return title of object."""
        return "Supervising harvesting of {0}".format(
            bwo.get_extra_data()["repository"]["name"])

    @staticmethod
    def formatter(bwo, **kwargs):
        """Return formatted data of object."""
        # Reuse the description rendering for the formatted view.
        return ingestion_arxiv_math.get_description(bwo)
class literature(SimpleRecordDeposition, WorkflowBase):

    """Literature deposit submission."""

    object_type = "submission"

    workflow = [
        # Pre-fill draft with values passed in from request
        prefill_draft(draft_id='default'),
        # Render form and wait for user to submit
        render_form(draft_id='default'),
        add_files_to_task_results,
        # Create the submission information package by merging form data
        # from all drafts (in this case only one draft exists).
        prepare_sip(),
        # Process metadata to match your JSONAlchemy record model. This will
        # call process_sip_metadata() on your subclass.
        process_sip_metadata(),
        # Generate MARC based on metadata dictionary.
        finalize_record_sip(is_dump=False),
        halt_to_render,
        classify_paper_with_deposit(
            taxonomy="HEPont.rdf",
            output_mode="dict",
        ),
        halt_record_with_action(action="inspire_approval",
                                message="Accept submission?"),
        workflow_if(was_approved),
        [send_robotupload()],
        inform_submitter
    ]

    name = "Literature"
    name_plural = "Literature submissions"
    group = "Articles & Preprints"
    draft_definitions = {
        'default': LiteratureForm,
    }

    @staticmethod
    def get_title(bwo):
        """Return title of object."""
        deposit_object = Deposition(bwo)
        sip = deposit_object.get_latest_sip()
        if sip:
            # Get the SmartJSON object
            record = sip.metadata
            return record.get("title", {"title": "No title"}).get("title")
        else:
            return "User submission in progress!!"

    @staticmethod
    def get_description(bwo):
        """Return description of object."""
        deposit_object = Deposition(bwo)
        sip = deposit_object.get_latest_sip()
        if sip:
            record = sip.metadata
            identifiers = [record.get("arxiv_id", "")]
            categories = [record.get("type_of_doc", "")]
            return render_template('workflows/styles/submission_record.html',
                                   categories=categories,
                                   identifiers=identifiers)
        else:
            from invenio.modules.access.control import acc_get_user_email
            id_user = deposit_object.workflow_object.id_user
            return "Submitted by: %s" % str(acc_get_user_email(id_user))

    @staticmethod
    def formatter(bwo, **kwargs):
        """Return formatted data of object."""
        from invenio.modules.formatter.engine import format_record
        deposit_object = Deposition(bwo)
        submission_data = deposit_object.get_latest_sip()
        marcxml = submission_data.package
        of = kwargs.get("format", "hd")
        if of == "xm":
            return marcxml
        else:
            return format_record(recID=None,
                                 of=kwargs.get("format", "hd"),
                                 xml_record=marcxml)

    @classmethod
    # TODO: ensure that this regex is correct
    def match_authors_initials(cls, author_name):
        """Check if author's name contains only its initials."""
        return not bool(re.compile(r'[^A-Z. ]').search(author_name))

    @classmethod
    def process_sip_metadata(cls, deposition, metadata):
        """Map fields to match jsonalchemy configuration."""
        delete_keys = []
        field_list = ['abstract', 'title']

        # maps from a form field to the corresponding MarcXML field
        field_map = {'abstract': "summary",
                     'title': "title",
                     'subject_term': "term",
                     'defense_date': "date",
                     'university': "university",
                     'degree_type': "degree_type",
                     'journal_title': "journal_title",
                     'page_range': "page_artid",
                     'article_id': "page_artid",
                     'volume': "journal_volume",
                     'year': "year",
                     'issue': "journal_issue"}

        # ============================
        # Abstract, Title and Subjects
        # ============================
        for field in field_list:
            if field in metadata:
                tmp_field = metadata[field]
                metadata[field] = {}
                metadata[field][field_map[field]] = tmp_field

        if "subject_term" in metadata:
            tmp_field = metadata["subject_term"]
            metadata["subject_term"] = [{"term": t, "scheme": "INSPIRE"}
                                        for t in tmp_field]

        # =======
        # Authors
        # =======
        # BUGFIX: the filter call was previously unconditional and raised
        # KeyError when the form submitted no 'authors' field at all.
        if 'authors' in metadata:
            metadata['authors'] = filter(None, metadata['authors'])
        if metadata.get('authors'):
            first_author = metadata['authors'][0].get('full_name').split(',')
            if len(first_author) > 1 and \
                    cls.match_authors_initials(first_author[1]):
                first_author[1] = first_author[1].replace(' ', '')
                metadata['authors'][0]['full_name'] = ",".join(first_author)
            metadata['_first_author'] = metadata['authors'][0]
            metadata['_first_author']['email'] = ''
            if metadata['authors'][1:]:
                metadata['_additional_authors'] = metadata['authors'][1:]
                for k in metadata['_additional_authors']:
                    try:
                        additional_author = k.get('full_name').split(',')
                        if len(additional_author) > 1 and \
                                cls.match_authors_initials(additional_author[1]):
                            additional_author[1] = additional_author[
                                1].replace(' ', '')
                            k['full_name'] = ",".join(additional_author)
                        k['email'] = ''
                    except AttributeError:
                        # Entry without a usable full_name; skip it.
                        pass
            delete_keys.append('authors')

        # ===========
        # Supervisors
        # ===========
        if 'supervisors' in metadata and metadata['supervisors']:
            metadata['thesis_supervisor'] = metadata['supervisors'][0]
            metadata['thesis_supervisor']['email'] = ''
            delete_keys.append('supervisors')

        # ==============
        # Thesis related
        # ==============
        thesis_fields = filter(lambda field: field in metadata,
                               ['defense_date', 'university', 'degree_type'])
        if thesis_fields:
            metadata['thesis'] = {}
            for field in thesis_fields:
                metadata['thesis'][field_map[field]] = metadata[field]
            delete_keys.extend(thesis_fields)

        # ========
        # Category
        # ========
        metadata['collections'] = [{'primary': "HEP"}]

        # ===============
        # Abstract source
        # ===============
        if 'title_arXiv' in metadata:
            # NOTE(review): assumes an 'abstract' was submitted whenever
            # 'title_arXiv' is present -- confirm against the form.
            metadata['abstract']['source'] = 'arXiv'

        # ========
        # arXiv ID
        # ========
        if 'arxiv_id' in metadata:
            metadata['report_number'] = "$$9arXiv$$aoai:arXiv.org:" + metadata[
                'arxiv_id']

        # ========
        # Language
        # ========
        metadata['language'] = unicode(
            dict(LiteratureForm.languages).get(metadata['language']))

        # ==========
        # Experiment
        # ==========
        if 'experiment' in metadata:
            metadata['accelerator_experiment'] = {}
            metadata['accelerator_experiment']['experiment'] = metadata[
                'experiment']
            delete_keys.append('experiment')

        # ===============
        # Conference Info
        # ===============
        if 'conf_name' in metadata:
            if 'nonpublic_note' in metadata:
                field = [metadata['nonpublic_note'], metadata['conf_name']]
                metadata['nonpublic_note'] = field
            else:
                metadata['nonpublic_note'] = [metadata['conf_name']]
            metadata['collections'].extend([{'primary': "ConferencePaper"}])
            delete_keys.append('conf_name')

        # =======
        # License
        # =======
        if 'license_url' in metadata:
            metadata['license'] = {}
            metadata['license']['url'] = metadata['license_url']
            delete_keys.append('license_url')

        # ===========
        # Files (FFT)
        # ===========
        if 'fft' in metadata and metadata['fft']:
            fft = metadata['fft']
            metadata['fft'] = {}
            metadata['fft']['url'] = fft[0]['path']

        # ================
        # Publication Info
        # ================
        publication_fields = filter(lambda field: field in metadata,
                                    ['journal_title', 'page_range',
                                     'article_id', 'volume', 'year', 'issue'])
        if publication_fields:
            metadata['publication_info'] = {}
            for field in publication_fields:
                metadata['publication_info'][
                    field_map[field]] = metadata[field]
            if 'page_nr' not in metadata and 'page_range' in publication_fields:
                pages = metadata['page_range'].split('-')
                if len(pages) == 2:
                    try:
                        metadata['page_nr'] = int(pages[1]) - int(pages[0])
                    except ValueError:
                        # Non-numeric page range; leave page_nr unset.
                        pass
            delete_keys.extend(publication_fields)
            if 'nonpublic_note' in metadata and len(
                    metadata['nonpublic_note']) > 1:
                del metadata['nonpublic_note'][0]
            # A published paper is no longer just a conference paper.
            if {'primary': "ConferencePaper"} in metadata['collections']:
                metadata['collections'].remove({'primary': "ConferencePaper"})
            metadata['collections'].append({'primary': "Published"})

        # ===================
        # Delete useless data
        # ===================
        for key in delete_keys:
            del metadata[key]
# NOTE(review): this class is an exact duplicate of an identical
# oaiharvest_harvest_repositories definition earlier in this source (minus
# its helper staticmethods); if both live in the same module the later
# definition silently replaces the earlier one -- confirm intent.
class oaiharvest_harvest_repositories(RecordWorkflow):

    """A workflow for use with OAI harvesting in BibSched."""

    # This object supervises the harvest; the records themselves are
    # processed by spawned sub-workflows.
    object_type = "workflow"
    # Name of the per-record workflow started for each harvested record.
    record_workflow = "oaiharvest_record_post_process"

    workflow = [
        init_harvesting,
        # Loop over every configured repository.
        foreach(get_repositories_list(), "repository"),
        [
            write_something_generic("Harvesting",
                                    [task_update_progress, write_message]),
            harvest_records,
            # Loop over the files produced by the harvest.
            foreach(get_obj_extra_data_key("harvested_files_list")),
            [
                write_something_generic("Starting sub-workflows for file",
                                        [task_update_progress, write_message]),
                # Loop over the individual records in each file.
                foreach(get_records_from_file()),
                [
                    workflow_if(filtering_oai_pmh_identifier),
                    [
                        # Throttle: only spawn when fewer than 10 workflows run.
                        workflow_if(num_workflow_running_greater(10), neg=True),
                        [
                            start_async_workflow(
                                preserve_data=True,
                                preserve_extra_data_keys=["repository",
                                                          "oai_identifier"],
                                get_workflow_from=
                                get_workflow_from_engine_definition,
                            ),
                        ],
                        workflow_else,
                        [
                            # Too many running: wait for one, then spawn.
                            write_something_generic(
                                ["Waiting for workflows to finish"],
                                [task_update_progress, write_message]),
                            wait_for_a_workflow_to_complete(10.0),
                            start_async_workflow(
                                preserve_data=True,
                                preserve_extra_data_keys=["repository",
                                                          "oai_identifier"],
                                get_workflow_from=
                                get_workflow_from_engine_definition,
                            ),
                        ],
                    ],
                ],
                end_for
            ],
            end_for
        ],
        end_for,
        write_something_generic(
            ["Processing: ", get_nb_workflow_created, " records"],
            [task_update_progress, write_message]),
        # Drain: wait for every spawned workflow to finish.
        simple_for(0, get_nb_workflow_created, 1),
        [
            wait_for_a_workflow_to_complete(1.0),
            write_something_generic([get_workflows_progress, "%% complete"],
                                    [task_update_progress, write_message]),
        ],
        end_for,
        workflows_reviews(stop_if_error=True),
        update_last_update(get_repositories_list())
    ]