Example #1
0
    def __init__(self):
        """
        Set up the crosswalk: reset the validation log and load the XML schema into memory.

        The schema is read from ``self.schema_path``, which this method does not set itself
        (it must already be available on the instance or class).

        :raises exceptions.IngestException: if ``self.schema_path`` is None, or if the schema
            file cannot be opened, parsed or turned into an ``etree.XMLSchema`` (the original
            error is passed along as ``inner``)
        """
        self.validation_log = ""

        # load the schema into memory for more efficient usage in repeat calls to the crosswalk
        if self.schema_path is None:
            raise exceptions.IngestException(message="Unable to validate for DOAJXWalk, as schema path is not set in config")

        try:
            # use a context manager so the file handle is released whether or not parsing succeeds
            with open(self.schema_path) as schema_file:
                schema_doc = etree.parse(schema_file)
            self.schema = etree.XMLSchema(schema_doc)
        except Exception as e:
            raise exceptions.IngestException(message="There was an error attempting to load schema from " + self.schema_path, inner=e)
Example #2
0
def load_crossref_schema(app):
    """
    Load the Crossref XML schema and cache it on the application config.

    The schema location is read from ``app.config["SCHEMAS"]["crossref"]``.  If
    ``app.config["CROSSREF_SCHEMA"]`` already holds a truthy value nothing is loaded and the
    config is left untouched; otherwise the parsed ``etree.XMLSchema`` is stored under that key.

    :param app: object exposing a dict-like ``config`` attribute
    :return: None
    :raises exceptions.IngestException: if the schema cannot be parsed or built (the original
        error is passed along as ``inner``)
    """
    schema_path = app.config["SCHEMAS"].get("crossref")

    # already loaded - nothing to do
    if app.config.get("CROSSREF_SCHEMA"):
        return

    try:
        schema_doc = etree.parse(schema_path)
        app.config["CROSSREF_SCHEMA"] = etree.XMLSchema(schema_doc)
    except Exception as e:
        # str() keeps the message buildable when no path is configured (schema_path is None);
        # plain concatenation would raise TypeError here and hide the real failure
        raise exceptions.IngestException(
            message="There was an error attempting to load schema from " +
            str(schema_path),
            inner=e)
Example #3
0
    def __init__(self):
        """
        Prepare the crosswalk for validation by resetting the log and loading the XML schema.

        The schema document is parsed from ``self.schema_path``.  Unless the ``DOAJENV`` config
        value is ``'production'``, the parsed document is first passed to
        ``self._localise_schema`` before the ``etree.XMLSchema`` is built from it.

        :raises exceptions.IngestException: if ``self.schema_path`` is None, or if anything
            goes wrong while opening, parsing, localising or building the schema (the original
            error is passed along as ``inner``)
        """
        self.validation_log = ""

        # without a schema location there is nothing to validate against
        if self.schema_path is None:
            raise exceptions.IngestException(
                message="Unable to validate for DOAJXWalk, as schema path is not set in config")

        # keep the schema in memory so repeat calls to the crosswalk do not re-read it
        try:
            with open(self.schema_path) as handle:
                parsed = etree.parse(handle)

                # test and dev environments get the schema rewritten to use local paths
                in_production = app.config.get("DOAJENV") == 'production'
                if not in_production:
                    self._localise_schema(parsed)

                self.schema = etree.XMLSchema(parsed)
        except Exception as e:
            failure = "There was an error attempting to load schema from " + self.schema_path
            raise exceptions.IngestException(message=failure, inner=e)
Example #4
0
    def __init__(self):
        """
        Set up the crosswalk: reset the validation log and pick up the shared Crossref schema.

        The schema path is read from ``app.config["SCHEMAS"]["crossref"]``.  The schema object
        itself is not loaded here; it is taken from ``app.config["CROSSREF_SCHEMA"]``, and this
        method blocks until that config value is no longer None.
        NOTE(review): presumably another part of the application populates that key - confirm.

        :raises exceptions.IngestException: if no crossref schema path is configured
        :raises KeyError: if ``CROSSREF_SCHEMA`` is absent from the config altogether
        """
        # local import: only the polling loop below needs it
        import time

        self.validation_log = ""
        self.schema_path = app.config.get("SCHEMAS", {}).get("crossref")

        # load the schema into memory for more efficient usage in repeat calls to the crosswalk
        if self.schema_path is None:
            raise exceptions.IngestException(
                message=
                "Unable to validate for CrossrefXWalk, as schema path is not set in config"
            )

        # wait for the schema to become available; sleep between polls rather than spinning,
        # so the wait does not pin a CPU core
        while app.config["CROSSREF_SCHEMA"] is None:
            time.sleep(0.01)

        self.schema = app.config["CROSSREF_SCHEMA"]
Example #5
0
    def batch_create_articles(self,
                              articles,
                              account,
                              duplicate_check=True,
                              merge_duplicate=True,
                              limit_to_account=True,
                              add_journal_info=False):
        """
        Create a batch of articles as an all-or-nothing operation.

        Every article is first run through ``create_article`` with ``dry_run=True``; only if
        none of those dry runs reports a failure are the articles actually saved.  When
        ``duplicate_check`` is True the batch itself is also checked for duplicates up front.

        :param articles:  The list of article objects
        :param account:     The account creating the articles
        :param duplicate_check:     Whether to check for duplicates in the batch and in the index
        :param merge_duplicate:     Should duplicates be merged.  Passed through to ``create_article``
        :param limit_to_account:    Should the ingest be limited only to articles for journals owned by the account.  Passed through to ``create_article``
        :param add_journal_info:    Should we fetch the journal info and attach it to the article before save?  Passed through to ``create_article``
        :return: a report on the state of the import: {success: x, fail: x, update: x, new: x, shared: set, unowned: set, unmatched: set}
        :raises exceptions.IngestException: if the batch contains duplicates, if a dry run hits
            an ArticleMergeConflict, or if any dry run reports a failure (the duplicate and
            failure cases carry a report as ``result``)
        """
        # validate the incoming arguments: the four flags share the same bool-only rule
        arg_specs = [{
            "arg": articles,
            "instance": list,
            "allow_none": False,
            "arg_name": "articles"
        }, {
            "arg": account,
            "instance": models.Account,
            "allow_none": False,
            "arg_name": "account"
        }]
        flag_values = (("duplicate_check", duplicate_check),
                       ("merge_duplicate", merge_duplicate),
                       ("limit_to_account", limit_to_account),
                       ("add_journal_info", add_journal_info))
        for flag_name, flag_value in flag_values:
            arg_specs.append({
                "arg": flag_value,
                "instance": bool,
                "allow_none": False,
                "arg_name": flag_name
            })
        argvalidate("batch_create_article", arg_specs, exceptions.ArgumentException)

        # 1. reject the whole batch if it contains duplicates within itself
        if duplicate_check and self._batch_contains_duplicates(articles):
            duplicate_report = {
                "success": 0,
                "fail": len(articles),
                "update": 0,
                "new": 0,
                "shared": [],
                "unowned": [],
                "unmatched": []
            }
            raise exceptions.IngestException(
                message=Messages.EXCEPTION_ARTICLE_BATCH_DUPLICATE,
                result=duplicate_report)

        # 2. dry-run each article, accumulating the per-article results
        tallies = {"success": 0, "fail": 0, "update": 0, "new": 0}
        identifiers = {"shared": set(), "unowned": set(), "unmatched": set()}

        for candidate in articles:
            try:
                outcome = self.create_article(candidate,
                                              account,
                                              duplicate_check=duplicate_check,
                                              merge_duplicate=merge_duplicate,
                                              limit_to_account=limit_to_account,
                                              add_journal_info=add_journal_info,
                                              dry_run=True)
            except exceptions.ArticleMergeConflict:
                raise exceptions.IngestException(
                    message=Messages.EXCEPTION_ARTICLE_BATCH_CONFLICT)

            for counter in ("success", "fail", "update", "new"):
                tallies[counter] += outcome.get(counter, 0)
            for group in ("shared", "unowned", "unmatched"):
                identifiers[group].update(outcome.get(group, set()))

        report = dict(tallies)
        report.update(identifiers)

        # any failure in the dry run aborts the batch before anything is saved
        if tallies["fail"] != 0:
            raise exceptions.IngestException(
                message=Messages.EXCEPTION_ARTICLE_BATCH_FAIL, result=report)

        # 3. save everything; block only on the final save, so that when this method
        # returns, all articles are available in the index
        final_position = len(articles) - 1
        for position, candidate in enumerate(articles):
            candidate.save(blocking=(position == final_position))

        # return some stats on the import
        return report