class oaiharvest_record(object):
    """Sample workflow for OAI harvesting with oai_dc metadataprefix.

    This workflow assumes the incoming data to be a string representation of
    OAI_DC XML.

    NOTE: This workflow blindly inserts records into the database.
    """

    # Label shown for objects of this workflow (e.g. in Holding Pen listings).
    object_type = "OAI harvest"

    # Declarative task list executed in order by the workflow engine.
    workflow = [
        # Convert OAI_DC XML -> MARCXML
        # FIXME Remove this step when we have one-step OAI_DC -> JSON
        convert_record("oaidc2marcxml.xsl"),
        # Convert MARCXML -> JSON
        convert_record_to_json,
        # Try to match the record with the database
        # FIXME Add more identifiers to match. By default only control_number.
        workflow_if(quick_match_record(), True),
        [
            # Create record in the database using invenio_records
            create_record,
        ],
        workflow_else,
        [
            log_info("Record is already in the database"),
        ],
    ]
Example #2
0
class oaiharvest_record_approval(RecordWorkflow):
    """Workflow run for each record OAI harvested.

    Converts the harvested record and, unless it is already present in the
    database (see the else-branch below), halts it for manual approval
    before upload.
    """

    # Label shown for objects of this workflow.
    object_type = "OAI harvest"

    # Declarative task list executed in order by the workflow engine.
    workflow = [
        # Convert the raw harvested record using the repository stylesheet.
        convert_record_with_repository(),
        convert_record_to_bibfield(),
        # True branch: record not found in the database yet.
        workflow_if(quick_match_record, True),
        [
            # Halt in Holding Pen until a user approves or rejects it.
            approve_record,
            workflow_if(was_approved), [
                upload_step,
            ], workflow_else, [log_info("Record has been rejected")]
        ],
        workflow_else,
        [
            log_info("Record is already in the database"),
        ],
    ]
Example #3
0
class oaiharvest_record_post_process(RecordWorkflow):
    """Workflow run for each record OAI harvested."""

    # Label shown for objects of this workflow.
    object_type = "OAI harvest"

    # Each stage is gated on a single-letter post-process flag, so only the
    # steps selected for this harvest run are executed.
    workflow = [
        # "c": convert the record with the repository stylesheet and check it.
        workflow_if(post_process_selected("c")),
        [
            convert_record_with_repository(),
            check_record,
            convert_record_to_bibfield(),
        ],
        # "t": download the arXiv fulltext.
        workflow_if(post_process_selected("t")),
        [
            arxiv_fulltext_download,
        ],
        # "p": extract plots.
        workflow_if(post_process_selected("p")),
        [
            plot_extract(),
        ],
        # "a": attach the author list.
        workflow_if(post_process_selected("a")),
        [
            author_list,
        ],
        # "r": extract references.
        workflow_if(post_process_selected("r")),
        [
            refextract,
        ],
        # "f": run the filter step.
        workflow_if(post_process_selected("f")),
        [
            filter_step,
        ],
        # "u": upload the record.
        workflow_if(post_process_selected("u")),
        [
            upload_step,
        ],
    ]
Example #4
0
class oaiharvest_record_approval(object):
    """Sample workflow for OAI harvesting with oai_dc metadataprefix.

    This workflow assumes the incoming data to be a string representation of
    OAI_DC XML.

    NOTE: This workflow makes use of Holding Pen for record approval.
    """

    # Label shown for objects of this workflow in Holding Pen.
    object_type = "OAI harvest"
    # Logical field name -> dotted record key; used by the Holding Pen
    # column helpers (get_title / get_description) below.
    mapping = {
        "title": "title_statement.title",
        "subject":
        "subject_added_entry_topical_term.topical_term_or_geographic_name_entry_element",
        "abstract": "summary.summary",
        "ids": "system_control_number.system_control_number"
    }

    # Declarative task list executed in order by the workflow engine.
    workflow = [
        # Convert OAI_DC XML -> MARCXML
        # FIXME Remove this step when we have one-step OAI_DC -> JSON
        convert_record("oaidc2marcxml.xsl"),
        # Convert MARCXML -> JSON
        convert_record_to_json,
        # Try to match the record with the database
        # FIXME Add more identifiers to match. By default only control_number.
        workflow_if(quick_match_record(), True),
        [
            # Halt this record to be approved in the Holding Pen
            approve_record,
            # Check user action taken
            workflow_if(was_approved),
            [
                # Create record in the database using invenio_records
                create_record,
            ],
            workflow_else,
            [log_info("Record has been rejected")]
        ],
        workflow_else,
        [
            log_info("Record is already in the database"),
        ],
    ]

    @staticmethod
    def get_title(bwo, **kwargs):
        """Return the value to put in the title column of HoldingPen."""
        if isinstance(bwo.data, six.string_types):
            # Probably XML, nothing to do here
            return "No title extracted"
        record = Record(bwo.data)
        # NOTE(review): assumes the title field is a non-empty list once the
        # record is converted -- confirm against the Record model.
        return record[oaiharvest_record_approval.mapping["title"]][0]

    @staticmethod
    def get_description(bwo, **kwargs):
        """Return the value to put in the description column of HoldingPen."""
        if isinstance(bwo.data, six.string_types):
            # Probably XML, nothing to do here
            return "Unformatted: <pre>{0}</pre>".format(bwo.data[:100])
        record = Record(bwo.data)

        # NOTE(review): the differing index depths ([0][0] vs [0]) follow the
        # nesting of each mapped field -- verify against the record model.
        abstract = record[oaiharvest_record_approval.mapping["abstract"]][0][0]
        categories = record[oaiharvest_record_approval.mapping["subject"]][0]
        identifiers = record[oaiharvest_record_approval.mapping["ids"]][0]

        return render_template('oaiharvester/holdingpen/oai_record.html',
                               object=bwo,
                               categories=categories,
                               abstract=abstract,
                               identifiers=identifiers)

    @staticmethod
    def get_additional(bwo, **kwargs):
        """Return the value to put in the additional column of HoldingPen."""
        # No extra column content for this workflow.
        return ""

    @staticmethod
    def formatter(obj, **kwargs):
        """Format the object."""
        # Raw dump wrapped in <pre> for display.
        return "<pre>{0}</pre>".format(obj.data)

    @staticmethod
    def get_sort_data(obj, **kwargs):
        """Return a dictionary of key values useful for sorting in Holding Pen."""
        # No sortable values provided for this workflow.
        return {}
class process_record_arxiv(WorkflowBase):
    """Processing workflow for a single arXiv record.

    The records have been harvested via oaiharvester.
    """

    # Label shown for objects of this workflow.
    object_type = "arXiv"

    # Declarative task list: convert the harvested record and, when it is not
    # already in the database, enrich it (plots, fulltext, keywords,
    # references, authors), filter it and halt for human approval before
    # uploading.
    workflow = [
        convert_record_with_repository("oaiarXiv2inspire_nofilter.xsl"),
        convert_record_to_bibfield,
        workflow_if(quick_match_record, True),
        [
            plot_extract(["latex"]), arxiv_fulltext_download,
            classify_paper_with_oaiharvester(
                taxonomy="HEPont",
                output_mode="dict",
            ), refextract, author_list,
            inspire_filter_custom(fields=["report_number", "arxiv_category"],
                                  custom_widgeted="*",
                                  custom_accepted="gr",
                                  action="inspire_approval"),
            workflow_if(was_approved), [
                upload_step,
            ], workflow_else, [log_info("Record rejected")]
        ],
        workflow_else,
        [
            log_info("Record already into database"),
        ],
    ]

    @staticmethod
    def get_title(bwo):
        """Return a displayable title string for the object's record.

        Handles both a single title string and a mapping of titles; falls
        back to " No title" when no title can be extracted.
        """
        record = bwo.get_data()
        if hasattr(record, "get") and "title" in record:
            if isinstance(record["title"], str):
                extracted_title = [record["title"]]
            else:
                # Mapping of titles: collect every value.
                extracted_title = [record["title"][a_title]
                                   for a_title in record["title"]]
        else:
            extracted_title = [" No title"]
        # Join with a trailing space after each part (historic format).
        return "".join("{0} ".format(part) for part in extracted_title)

    @staticmethod
    def get_description(bwo):
        """Build the HTML description column for Holding Pen.

        Gathers persistent identifiers, an optional bibclassify summary and
        the INSPIRE subject categories, then renders them via a template.
        """
        record = bwo.get_data()
        from invenio.modules.records.api import Record
        try:
            identifiers = Record(record.dumps()).persistent_identifiers
            final_identifiers = [i['value'] for i in identifiers]
        except Exception:
            # Best effort: fall back to the external system number, if any.
            if hasattr(record, "get"):
                final_identifiers = [
                    record.get("system_number_external",
                               {}).get("value", 'No ids')
                ]
            else:
                final_identifiers = [' No ids']

        task_results = bwo.get_tasks_results()
        results = []
        if 'bibclassify' in task_results:
            try:
                result = task_results['bibclassify'][0]['result']
                fast_mode = result.get('fast_mode', False)
                result = result['dict']['complete_output']
                result_string = "<strong></br>Bibclassify result:"\
                                "</br></strong>"\
                                "Number of Core keywords: \t%s</br>"\
                                "PACS: \t%s</br>"\
                                % (len(result['Core keywords']),
                                   len(result['Field codes']))
                if fast_mode:
                    result_string += "(This task run at fast mode"\
                                     " taking into consideration"\
                                     " only the title and the abstract)"
                results.append(result_string)
            except (KeyError, IndexError):
                # Classification output incomplete; skip the summary.
                pass
        categories = []
        if hasattr(record, "get"):
            if 'subject' in record:
                lookup = ["subject", "term"]
            elif "subject_term" in record:
                # BUGFIX: was `elif "subject_term":` -- a non-empty string
                # literal is always truthy, so the `lookup = None` branch
                # below was unreachable.
                lookup = ["subject_term", "term"]
            else:
                lookup = None
            if lookup:
                primary, secondary = lookup
                category_list = record.get(primary, [])
                if isinstance(category_list, dict):
                    category_list = [category_list]
                for subject in category_list:
                    category = subject[secondary]
                    if len(subject) == 2:
                        # NOTE(review): subscripting dict.keys() is Python 2
                        # only (the file also relies on `unicode`); confirm
                        # before any Python 3 migration.
                        if subject.keys()[1] == secondary:
                            source_list = subject[subject.keys()[0]]
                        else:
                            source_list = subject[subject.keys()[1]]
                    else:
                        try:
                            source_list = subject['source']
                        except KeyError:
                            source_list = ""
                    if source_list.lower() == 'inspire':
                        categories.append(category)

        from flask import render_template
        return render_template('workflows/styles/harvesting_record.html',
                               categories=categories,
                               identifiers=final_identifiers,
                               results=results)

    @staticmethod
    def formatter(bwo, **kwargs):
        """Return a formatted version of the data.

        Keyword arguments:
            formatter -- a callable applied to the raw data; wins over format.
            format -- output format code passed to format_record
                      ('xm' means the data is already XML).
        """
        from invenio.modules.formatter.engine import format_record

        data = bwo.get_data()
        if not data:
            return ''
        custom_formatter = kwargs.get("formatter", None)
        out_format = kwargs.get("format", None)  # renamed: shadowed builtin
        if custom_formatter:
            # A separate formatter is supplied
            return custom_formatter(data)
        from invenio.modules.records.api import Record
        if isinstance(data, collections.Mapping):
            # Dicts are cool on its own, but maybe its SmartJson (record)
            try:
                data = Record(data.dumps()).legacy_export_as_marc()
            except (TypeError, KeyError):
                # Maybe not, submission?
                return data

        if isinstance(data, string_types):
            # It's a string type, let's try to convert
            if out_format:
                # We can try formatter!
                # If already XML, format_record does not like it.
                if out_format != 'xm':
                    try:
                        return format_record(recID=None,
                                             of=out_format,
                                             xml_record=data)
                    except TypeError:
                        # Wrong kind of type
                        pass
                else:
                    # So, XML then
                    from xml.dom.minidom import parseString

                    try:
                        pretty_data = parseString(data)
                        return pretty_data.toprettyxml()
                    except TypeError:
                        # Probably not proper XML string then
                        return "Data cannot be parsed: %s" % (data, )
                    except Exception:
                        # Some other parsing error
                        pass

            # Just return raw string
            return data
        if isinstance(data, set):
            return list(data)
        # Not any of the above types. How juicy!
        return data
class oaiharvest_harvest_repositories(RecordWorkflow):
    """A workflow for use with OAI harvesting in BibSched."""

    # This object represents a whole harvesting run, not a single record.
    object_type = "workflow"
    # Name of the workflow spawned for every harvested record.
    record_workflow = "oaiharvest_record_post_process"

    # Declarative task list: for every configured repository, harvest and
    # start one asynchronous sub-workflow per record (at most 10 running at
    # a time), then wait for all spawned workflows to finish.
    workflow = [
        init_harvesting,
        foreach(get_repositories_list(), "repository"),
        [
            write_something_generic("Harvesting",
                                    [task_update_progress, write_message]),
            harvest_records,
            foreach(get_obj_extra_data_key("harvested_files_list")),
            [
                write_something_generic("Starting sub-workflows for file",
                                        [task_update_progress, write_message]),
                foreach(get_records_from_file()),
                [
                    workflow_if(filtering_oai_pmh_identifier),
                    [
                        # Start immediately when fewer than 10 are running.
                        workflow_if(num_workflow_running_greater(10),
                                    neg=True),
                        [
                            start_async_workflow(
                                preserve_data=True,
                                preserve_extra_data_keys=[
                                    "repository", "oai_identifier"
                                ],
                                get_workflow_from=
                                get_workflow_from_engine_definition,
                            ),
                        ],
                        workflow_else,
                        [
                            # Throttle: wait for a slot before starting.
                            write_something_generic(
                                ["Waiting for workflows to finish"],
                                [task_update_progress, write_message]),
                            wait_for_a_workflow_to_complete(10.0),
                            start_async_workflow(
                                preserve_data=True,
                                preserve_extra_data_keys=[
                                    "repository", "oai_identifier"
                                ],
                                get_workflow_from=
                                get_workflow_from_engine_definition,
                            ),
                        ],
                    ],
                ], end_for
            ], end_for
        ], end_for,
        write_something_generic(
            ["Processing: ", get_nb_workflow_created, " records"],
            [task_update_progress, write_message]),
        simple_for(0, get_nb_workflow_created, 1),
        [
            wait_for_a_workflow_to_complete(1.0),
            write_something_generic([get_workflows_progress, "%% complete"],
                                    [task_update_progress, write_message]),
        ], end_for,
        workflows_reviews(stop_if_error=True),
        update_last_update(get_repositories_list())
    ]

    @staticmethod
    def _render_context(bwo):
        """Collect the template context shared by description and formatter.

        Returns a tuple (identifiers, result_progress, current_task,
        extra_data).  Extracted so get_description() and formatter() no
        longer duplicate this logic.
        """
        extra_data = bwo.get_extra_data()

        identifiers = None
        if 'options' in extra_data and 'identifiers' in extra_data["options"]:
            identifiers = extra_data["options"]["identifiers"]

        results = bwo.get_tasks_results()
        # Progress reported by the review task, when it has run.
        if 'review_workflow' in results:
            result_progress = results['review_workflow'][0]['result']
        else:
            result_progress = {}

        current_task = extra_data['_last_task_name']
        return identifiers, result_progress, current_task, extra_data

    @staticmethod
    def get_description(bwo):
        """Return description of object."""
        from flask import render_template

        (identifiers, result_progress, current_task,
         _) = oaiharvest_harvest_repositories._render_context(bwo)

        return render_template("workflows/styles/harvesting_description.html",
                               identifiers=identifiers,
                               result_progress=result_progress,
                               current_task=current_task)

    @staticmethod
    def get_title(bwo):
        """Return title of object."""
        return "Summary of OAI harvesting from: {0}".format(
            bwo.get_extra_data()["repository"]["name"])

    @staticmethod
    def formatter(bwo):
        """Return description of object, including spawned sub-workflows."""
        from flask import render_template
        from invenio.modules.workflows.models import BibWorkflowObject
        from invenio.modules.workflows.registry import workflows

        (identifiers, result_progress, current_task,
         extra_data) = oaiharvest_harvest_repositories._render_context(bwo)

        related_objects = []
        for id_object in extra_data.get("objects_spawned", list()):
            spawned_object = BibWorkflowObject.query.get(id_object)
            if spawned_object:
                workflow = workflows.get(spawned_object.get_workflow_name())
                related_objects.append(
                    (spawned_object.id, workflow.get_title(spawned_object)
                     or "No title"))
            else:
                # Spawned object no longer exists; keep the id for display.
                related_objects.append((id_object, None))

        return render_template("workflows/styles/harvesting_description.html",
                               identifiers=identifiers,
                               result_progress=result_progress,
                               current_task=current_task,
                               related_objects=related_objects)
class ingestion_arxiv_math(WorkflowBase):

    """Main workflow for harvesting arXiv via OAI-PMH (oaiharvester)."""

    # This object represents a whole harvesting run, not a single record.
    object_type = "workflow"
    # Declarative task list: for every repository, harvest records and spawn
    # one asynchronous "process_record_arxiv" workflow per record, capping
    # concurrency at 10; then wait for all spawned workflows to finish.
    workflow = [
        write_something_generic("Initialization", [task_update_progress, write_message]),
        init_harvesting,
        write_something_generic("Starting", [task_update_progress, write_message]),
        foreach(get_repositories_list(), "repository"),
        [
            write_something_generic("Harvesting", [task_update_progress, write_message]),
            harvest_records,
            write_something_generic("Reading Files", [task_update_progress, write_message]),
            foreach(get_obj_extra_data_key("harvested_files_list")),
            [
                write_something_generic("Creating Workflows", [task_update_progress, write_message]),
                foreach(get_records_from_file()),
                [
                    workflow_if(filtering_oai_pmh_identifier),
                    [
                        # Start immediately when fewer than 10 are running.
                        workflow_if(num_workflow_running_greater(10), neg=True),
                        [
                            start_async_workflow("process_record_arxiv",
                                                 preserve_data=True,
                                                 preserve_extra_data_keys=["repository"]),

                            write_something_generic(
                                ["Workflow started: ",
                                 get_nb_workflow_created],
                                [task_update_progress,
                                 write_message]),
                        ],
                        workflow_else,
                        [
                            # Throttle: wait for a slot before starting.
                            write_something_generic(
                                ["Max simultaneous workflows reached: ",
                                 "Waiting for one to finish"],
                                [task_update_progress,
                                 write_message]),
                            wait_for_a_workflow_to_complete(0.05),
                            start_async_workflow("process_record_arxiv",
                                                 preserve_data=True,
                                                 preserve_extra_data_keys=["repository"]),
                            write_something_generic(["Workflow started :",
                                                     get_nb_workflow_created,
                                                     " "],
                                                    [task_update_progress,
                                                     write_message]),
                        ],
                    ],
                ],
                end_for
            ],
            end_for
        ],
        end_for,
        write_something_generic(["Processing : ", get_nb_workflow_created, " records"],
                                [task_update_progress, write_message]),
        simple_for(0, get_nb_workflow_created, 1),
        [
            wait_for_a_workflow_to_complete(),
            write_something_generic([get_workflows_progress, " % Complete"],
                                    [task_update_progress, write_message]),
        ],
        end_for,
        write_something_generic("Finishing", [task_update_progress, write_message]),
        workflows_reviews(stop_if_error=True),
        update_last_update(get_repositories_list())
    ]

    @staticmethod
    def get_description(bwo):
        """Return description of object."""
        from flask import render_template

        identifiers = None

        extra_data = bwo.get_extra_data()
        if 'options' in extra_data and 'identifiers' in extra_data["options"]:
            identifiers = extra_data["options"]["identifiers"]

        results = bwo.get_tasks_results()

        # Progress reported by the review task, when it has run.
        if 'review_workflow' in results:
            result_progress = results['review_workflow'][0]['result']
        else:
            result_progress = {}

        current_task = extra_data['_last_task_name']

        return render_template("workflows/styles/harvesting_description.html",
                               identifiers=identifiers,
                               result_progress=result_progress,
                               current_task=current_task)

    @staticmethod
    def get_title(bwo):
        """Return title of object."""
        return "Supervising harvesting of {0}".format(
            bwo.get_extra_data()["repository"]["name"])

    @staticmethod
    def formatter(bwo, **kwargs):
        """Return formatted data of object."""
        # Reuse the description as the formatted representation.
        return ingestion_arxiv_math.get_description(bwo)
Example #8
0
class literature(SimpleRecordDeposition, WorkflowBase):
    """Literature deposit submission."""

    # Label shown for objects of this workflow.
    object_type = "submission"

    # Declarative task list: render the submission form, build the SIP,
    # classify the paper, halt for approval, then upload and notify.
    workflow = [
        # Pre-fill draft with values passed in from request
        prefill_draft(draft_id='default'),
        # Render form and wait for user to submit
        render_form(draft_id='default'),
        add_files_to_task_results,
        # Create the submission information package by merging form data
        # from all drafts (in this case only one draft exists).
        prepare_sip(),
        # Process metadata to match your JSONAlchemy record model. This will
        # call process_sip_metadata() on your subclass.
        process_sip_metadata(),
        # Generate MARC based on metadata dictionary.
        finalize_record_sip(is_dump=False),
        halt_to_render,
        classify_paper_with_deposit(
            taxonomy="HEPont.rdf",
            output_mode="dict",
        ),
        halt_record_with_action(action="inspire_approval",
                                message="Accept submission?"),
        workflow_if(was_approved),
        [send_robotupload()],
        inform_submitter
    ]

    name = "Literature"
    name_plural = "Literature submissions"
    group = "Articles & Preprints"
    draft_definitions = {
        'default': LiteratureForm,
    }

    @staticmethod
    def get_title(bwo):
        """Return title of object."""
        deposit_object = Deposition(bwo)
        sip = deposit_object.get_latest_sip()
        if sip:
            # Get the SmartJSON object
            record = sip.metadata
            return record.get("title", {"title": "No title"}).get("title")
        else:
            return "User submission in progress!!"

    @staticmethod
    def get_description(bwo):
        """Return description of object."""
        deposit_object = Deposition(bwo)
        sip = deposit_object.get_latest_sip()
        if sip:
            record = sip.metadata
            identifiers = [record.get("arxiv_id", "")]
            categories = [record.get("type_of_doc", "")]
            return render_template('workflows/styles/submission_record.html',
                                   categories=categories,
                                   identifiers=identifiers)
        else:
            # No SIP yet: show who started the submission instead.
            from invenio.modules.access.control import acc_get_user_email
            id_user = deposit_object.workflow_object.id_user
            return "Submitted by: %s" % str(acc_get_user_email(id_user))

    @staticmethod
    def formatter(bwo, **kwargs):
        """Return formatted data of object.

        Keyword argument `format` selects the output; 'xm' returns the raw
        MARCXML, anything else goes through format_record (default 'hd').
        """
        from invenio.modules.formatter.engine import format_record
        deposit_object = Deposition(bwo)
        submission_data = deposit_object.get_latest_sip()
        marcxml = submission_data.package

        of = kwargs.get("format", "hd")
        if of == "xm":
            # Raw MARCXML requested; no formatting engine round-trip needed.
            return marcxml
        else:
            return format_record(recID=None,
                                 of=of,
                                 xml_record=marcxml)

    #TODO: ensure that this regex is correct
    @classmethod
    def match_authors_initials(cls, author_name):
        """Check if author's name contains only its initials."""
        # True when the name consists solely of capitals, dots and spaces.
        # (First argument renamed self -> cls: this is a classmethod.)
        return not bool(re.compile(r'[^A-Z. ]').search(author_name))

    @classmethod
    def process_sip_metadata(cls, deposition, metadata):
        """Map fields to match jsonalchemy configuration.

        Mutates `metadata` in place, restructuring the flat form fields into
        the nested record model and deleting the consumed form keys.
        """
        delete_keys = []
        field_list = ['abstract', 'title']

        # maps from a form field to the corresponding MarcXML field
        field_map = {
            'abstract': "summary",
            'title': "title",
            'subject_term': "term",
            'defense_date': "date",
            'university': "university",
            'degree_type': "degree_type",
            'journal_title': "journal_title",
            'page_range': "page_artid",
            'article_id': "page_artid",
            'volume': "journal_volume",
            'year': "year",
            'issue': "journal_issue"
        }

        # ============================
        # Abstract, Title and Subjects
        # ============================
        for field in field_list:
            if field in metadata:
                tmp_field = metadata[field]
                metadata[field] = {}
                metadata[field][field_map[field]] = tmp_field

        if "subject_term" in metadata:
            tmp_field = metadata["subject_term"]
            metadata["subject_term"] = [{
                "term": t,
                "scheme": "INSPIRE"
            } for t in tmp_field]

        # =======
        # Authors
        # =======
        if 'authors' in metadata:
            # BUGFIX: the filter used to run before the membership check,
            # raising KeyError when the form submitted no 'authors' at all.
            # (On Python 2, filter() returns a list, which the indexing
            # below relies on.)
            metadata['authors'] = filter(None, metadata['authors'])
        if 'authors' in metadata and metadata['authors']:
            first_author = metadata['authors'][0].get('full_name').split(',')
            if len(first_author) > 1 and \
                    literature.match_authors_initials(first_author[1]):
                # Compact the initials, e.g. "J. R." -> "J.R."
                first_author[1] = first_author[1].replace(' ', '')
                metadata['authors'][0]['full_name'] = ",".join(first_author)
            metadata['_first_author'] = metadata['authors'][0]
            metadata['_first_author']['email'] = ''
            if metadata['authors'][1:]:
                metadata['_additional_authors'] = metadata['authors'][1:]
                for k in metadata['_additional_authors']:
                    try:
                        additional_author = k.get('full_name').split(',')
                        if len(additional_author) > 1 and \
                                literature.match_authors_initials(additional_author[1]):
                            additional_author[1] = additional_author[
                                1].replace(' ', '')
                            k['full_name'] = ",".join(additional_author)
                        k['email'] = ''
                    except AttributeError:
                        # Entry without a full_name attribute; skip it.
                        pass
            delete_keys.append('authors')

        # ===========
        # Supervisors
        # ===========
        if 'supervisors' in metadata and metadata['supervisors']:
            metadata['thesis_supervisor'] = metadata['supervisors'][0]
            metadata['thesis_supervisor']['email'] = ''
            #metadata['_additional_authors'] = metadata['authors'][1:]
            delete_keys.append('supervisors')

        # ==============
        # Thesis related
        # ==============
        thesis_fields = filter(lambda field: field in metadata,
                               ['defense_date', 'university', 'degree_type'])
        if thesis_fields:
            metadata['thesis'] = {}

            for field in thesis_fields:
                metadata['thesis'][field_map[field]] = metadata[field]

            delete_keys.extend(thesis_fields)

        # ========
        # Category
        # ========
        metadata['collections'] = [{'primary': "HEP"}]

        # ===============
        # Abstract source
        # ===============
        if 'title_arXiv' in metadata:
            # NOTE(review): assumes 'abstract' is present whenever
            # 'title_arXiv' is -- confirm with the form definition.
            metadata['abstract']['source'] = 'arXiv'

        # ========
        # arXiv ID
        # ========
        if 'arxiv_id' in metadata:
            metadata['report_number'] = "$$9arXiv$$aoai:arXiv.org:" + metadata[
                'arxiv_id']

        # ========
        # Language
        # ========
        metadata['language'] = unicode(
            dict(LiteratureForm.languages).get(metadata['language']))

        # ==========
        # Experiment
        # ==========
        if 'experiment' in metadata:
            metadata['accelerator_experiment'] = {}
            metadata['accelerator_experiment']['experiment'] = metadata[
                'experiment']
            delete_keys.append('experiment')

        # ===============
        # Conference Info
        # ===============
        if 'conf_name' in metadata:
            if 'nonpublic_note' in metadata:
                field = [metadata['nonpublic_note'], metadata['conf_name']]
                metadata['nonpublic_note'] = field
            else:
                metadata['nonpublic_note'] = [metadata['conf_name']]
            metadata['collections'].extend([{'primary': "ConferencePaper"}])
            delete_keys.append('conf_name')

        # =======
        # License
        # =======
        if 'license_url' in metadata:
            metadata['license'] = {}
            metadata['license']['url'] = metadata['license_url']
            delete_keys.append('license_url')

        # ===========
        # Files (FFT)
        # ===========
        if 'fft' in metadata and metadata['fft']:
            fft = metadata['fft']
            metadata['fft'] = {}
            metadata['fft']['url'] = fft[0]['path']

        # ================
        # Publication Info
        # ================
        publication_fields = filter(lambda field: field in metadata, [
            'journal_title', 'page_range', 'article_id', 'volume', 'year',
            'issue'
        ])
        if publication_fields:
            metadata['publication_info'] = {}

            for field in publication_fields:
                metadata['publication_info'][
                    field_map[field]] = metadata[field]

            # Derive the page count from "first-last" when not supplied.
            if 'page_nr' not in metadata and 'page_range' in publication_fields:
                pages = metadata['page_range'].split('-')
                if len(pages) == 2:
                    try:
                        metadata['page_nr'] = int(pages[1]) - int(pages[0])
                    except ValueError:
                        # Non-numeric page bounds; leave page_nr unset.
                        pass

            delete_keys.extend(publication_fields)

            if 'nonpublic_note' in metadata and len(
                    metadata['nonpublic_note']) > 1:
                del metadata['nonpublic_note'][0]

            # A published paper supersedes the ConferencePaper collection.
            if {'primary': "ConferencePaper"} in metadata['collections']:
                metadata['collections'].remove({'primary': "ConferencePaper"})
            metadata['collections'].append({'primary': "Published"})

        # ===================
        # Delete useless data
        # ===================
        for key in delete_keys:
            del metadata[key]
class oaiharvest_harvest_repositories(RecordWorkflow):
    """A workflow for use with OAI harvesting in BibSched."""

    # This object represents a whole harvesting run, not a single record.
    object_type = "workflow"
    # Name of the workflow spawned for every harvested record.
    record_workflow = "oaiharvest_record_post_process"

    # Declarative task list: for every configured repository, harvest and
    # start one asynchronous sub-workflow per record (at most 10 running at
    # a time), then wait for all spawned workflows to finish.
    workflow = [
        init_harvesting,
        foreach(get_repositories_list(), "repository"),
        [
            write_something_generic("Harvesting",
                                    [task_update_progress, write_message]),
            harvest_records,
            foreach(get_obj_extra_data_key("harvested_files_list")),
            [
                write_something_generic("Starting sub-workflows for file",
                                        [task_update_progress, write_message]),
                foreach(get_records_from_file()),
                [
                    workflow_if(filtering_oai_pmh_identifier),
                    [
                        # Start immediately when fewer than 10 are running.
                        workflow_if(num_workflow_running_greater(10),
                                    neg=True),
                        [
                            start_async_workflow(
                                preserve_data=True,
                                preserve_extra_data_keys=[
                                    "repository", "oai_identifier"
                                ],
                                get_workflow_from=
                                get_workflow_from_engine_definition,
                            ),
                        ],
                        workflow_else,
                        [
                            # Throttle: wait for a slot before starting.
                            write_something_generic(
                                ["Waiting for workflows to finish"],
                                [task_update_progress, write_message]),
                            wait_for_a_workflow_to_complete(10.0),
                            start_async_workflow(
                                preserve_data=True,
                                preserve_extra_data_keys=[
                                    "repository", "oai_identifier"
                                ],
                                get_workflow_from=
                                get_workflow_from_engine_definition,
                            ),
                        ],
                    ],
                ], end_for
            ], end_for
        ], end_for,
        write_something_generic(
            ["Processing: ", get_nb_workflow_created, " records"],
            [task_update_progress, write_message]),
        simple_for(0, get_nb_workflow_created, 1),
        [
            wait_for_a_workflow_to_complete(1.0),
            write_something_generic([get_workflows_progress, "%% complete"],
                                    [task_update_progress, write_message]),
        ], end_for,
        workflows_reviews(stop_if_error=True),
        update_last_update(get_repositories_list())
    ]