def __init__(self):
    """Initialise the crosswalk and load the DOAJ XML schema.

    The schema is parsed once here so repeat calls to the crosswalk can
    validate against the cached ``self.schema`` without re-reading the file.

    :raises exceptions.IngestException: if ``self.schema_path`` is not
        configured, or if the schema file cannot be read or parsed.
    """
    self.validation_log = ""
    # load the schema into memory for more efficient usage in repeat calls to the crosswalk
    if self.schema_path is None:
        raise exceptions.IngestException(message="Unable to validate for DOAJXWalk, as schema path is not set in config")
    try:
        # FIX: use a context manager so the schema file handle is always
        # closed — the original opened the file and never closed it,
        # leaking a file descriptor per instantiation.
        with open(self.schema_path) as schema_file:
            schema_doc = etree.parse(schema_file)
        self.schema = etree.XMLSchema(schema_doc)
    except Exception as e:
        raise exceptions.IngestException(message="There was an error attempting to load schema from " + self.schema_path, inner=e)
def load_crossref_schema(app):
    """Parse the Crossref XML schema and cache it on ``app.config``.

    Once ``CROSSREF_SCHEMA`` is populated this is a no-op, so the expensive
    parse happens at most once per process.

    :param app: application object whose ``config`` mapping holds the schema
        location (``SCHEMAS['crossref']``) and the ``CROSSREF_SCHEMA`` cache slot
    :raises exceptions.IngestException: if the schema path is not configured
        or the schema cannot be parsed
    """
    # Already loaded and cached — nothing to do.
    if app.config.get("CROSSREF_SCHEMA"):
        return
    schema_path = app.config["SCHEMAS"].get("crossref")
    # FIX: fail with a clear message when the path is unset. Previously
    # etree.parse(None) raised inside the try, and the handler's string
    # concatenation ("..." + None) then raised a TypeError that masked
    # the real problem.
    if schema_path is None:
        raise exceptions.IngestException(message="Unable to load Crossref schema, as schema path is not set in config")
    try:
        schema_doc = etree.parse(schema_path)
        schema = etree.XMLSchema(schema_doc)
        app.config["CROSSREF_SCHEMA"] = schema
    except Exception as e:
        raise exceptions.IngestException(
            message="There was an error attempting to load schema from " + schema_path, inner=e)
def __init__(self):
    """Set up the crosswalk: start with an empty validation log and parse
    the XML schema once so repeated validations can reuse ``self.schema``.

    :raises exceptions.IngestException: if the schema path is unset or the
        schema cannot be read and compiled.
    """
    self.validation_log = ""
    # Without a configured schema path there is nothing to validate against.
    if self.schema_path is None:
        raise exceptions.IngestException(
            message="Unable to validate for DOAJXWalk, as schema path is not set in config"
        )
    try:
        with open(self.schema_path) as fh:
            parsed = etree.parse(fh)
        # Outside production, rewrite the schema to reference local paths
        # (avoids fetching remote schema components in test/dev).
        if app.config.get("DOAJENV") != 'production':
            self._localise_schema(parsed)
        self.schema = etree.XMLSchema(parsed)
    except Exception as e:
        raise exceptions.IngestException(
            message="There was an error attempting to load schema from " + self.schema_path, inner=e)
def __init__(self):
    """Initialise the crosswalk, waiting for the shared Crossref schema.

    The Crossref schema is loaded once into ``app.config["CROSSREF_SCHEMA"]``
    (presumably by ``load_crossref_schema`` elsewhere in this module — TODO
    confirm); this constructor waits for that load to complete and keeps a
    reference on the instance.

    :raises exceptions.IngestException: if no schema path is configured.
    """
    import time  # local import: only needed for the polling loop below

    self.validation_log = ""
    self.schema_path = app.config.get("SCHEMAS", {}).get("crossref")
    # load the schema into memory for more efficient usage in repeat calls to the crosswalk
    if self.schema_path is None:
        raise exceptions.IngestException(
            message="Unable to validate for CrossrefXWalk, as schema path is not set in config"
        )
    # FIX: the original busy-waited (`while ...: continue`), pinning a CPU
    # core until the schema appeared; sleep briefly between polls instead.
    # `.get()` also avoids a KeyError if the cache slot does not exist yet.
    # NOTE(review): like the original, there is no timeout — if the loader
    # never runs this will wait indefinitely.
    while app.config.get("CROSSREF_SCHEMA") is None:
        time.sleep(0.1)
    self.schema = app.config["CROSSREF_SCHEMA"]
def batch_create_articles(self, articles, account, duplicate_check=True, merge_duplicate=True,
                          limit_to_account=True, add_journal_info=False):
    """
    Create a batch of articles in a single operation.  Articles are either all created/updated or none of them are

    This method checks for duplicates within the provided set and within the current database
    (if you set duplicate_check=True)

    :param articles: The list of article objects
    :param account: The account creating the articles
    :param duplicate_check: Whether to check for duplicates in the batch and in the index
    :param merge_duplicate: Should duplicates be merged.  If set to False, this may raise a DuplicateArticleException
    :param limit_to_account: Should the ingest be limited only to articles for journals owned by
        the account.  If set to True, may result in an IngestException
    :param add_journal_info: Should we fetch the journal info and attach it to the article before save?
    :return: a report on the state of the import: {success: x, fail: x, update: x, new: x, shared: [], unowned: [], unmatched: []}
        (note: shared/unowned/unmatched are accumulated as sets in the returned report)
    :raises exceptions.ArgumentException: if any argument fails type validation
    :raises exceptions.IngestException: on in-batch duplicates, merge conflicts, or any per-article failure
    """
    # first validate the incoming arguments to ensure that we've got the right thing
    argvalidate("batch_create_article", [
        {"arg": articles, "instance": list, "allow_none": False, "arg_name": "articles"},
        {"arg": account, "instance": models.Account, "allow_none": False, "arg_name": "account"},
        {"arg": duplicate_check, "instance": bool, "allow_none": False, "arg_name": "duplicate_check"},
        {"arg": merge_duplicate, "instance": bool, "allow_none": False, "arg_name": "merge_duplicate"},
        {"arg": limit_to_account, "instance": bool, "allow_none": False, "arg_name": "limit_to_account"},
        {"arg": add_journal_info, "instance": bool, "allow_none": False, "arg_name": "add_journal_info"}
    ], exceptions.ArgumentException)

    # 1. dedupe the batch
    # Duplicates *within* the submitted batch abort the whole import up front,
    # before any per-article work is attempted.
    if duplicate_check:
        batch_duplicates = self._batch_contains_duplicates(articles)
        if batch_duplicates:
            report = {"success": 0, "fail": len(articles), "update": 0, "new": 0, "shared": [], "unowned": [], "unmatched": []}
            raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_DUPLICATE, result=report)

    # 2. check legitimate ownership
    # First pass: dry-run each article through create_article so nothing is
    # persisted until the whole batch is known to succeed (all-or-nothing).
    success = 0
    fail = 0
    update = 0
    new = 0
    all_shared = set()
    all_unowned = set()
    all_unmatched = set()
    for article in articles:
        try:
            result = self.create_article(article, account,
                                         duplicate_check=duplicate_check,
                                         merge_duplicate=merge_duplicate,
                                         limit_to_account=limit_to_account,
                                         add_journal_info=add_journal_info,
                                         dry_run=True)
        except exceptions.ArticleMergeConflict:
            # A merge conflict anywhere in the batch aborts the entire import.
            raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_CONFLICT)

        # Aggregate the per-article dry-run report into batch totals.
        success += result.get("success", 0)
        fail += result.get("fail", 0)
        update += result.get("update", 0)
        new += result.get("new", 0)
        all_shared.update(result.get("shared", set()))
        all_unowned.update(result.get("unowned", set()))
        all_unmatched.update(result.get("unmatched", set()))

    report = {"success": success, "fail": fail, "update": update, "new": new,
              "shared": all_shared, "unowned": all_unowned, "unmatched": all_unmatched}

    # if there were no failures in the batch, then we can do the save
    if fail == 0:
        for i in range(len(articles)):
            block = i == len(articles) - 1
            # block on the final save, so that when this method returns, all articles are
            # available in the index
            articles[i].save(blocking=block)

        # return some stats on the import
        return report
    else:
        # Any failure means nothing was saved; surface the aggregate report
        # to the caller via the exception.
        raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_FAIL, result=report)