def get(cls, scope):
    """
    Instantiate and return the configured implementation of the base Store class.

    The dotted path of the implementation is read from the app config key
    STORE_IMPL, resolved via the plugin loader, and constructed with *scope*.
    """
    impl_path = app.config.get("STORE_IMPL")
    impl_klass = plugin.load_class(impl_path)
    return impl_klass(scope)
def test_04_last_harvest(self):
    """Round-trip last-harvest dates through a HarvestState record."""
    state = models.HarvestState()
    state.account = "abcdefg"
    state.issn = "2222-2222"
    state.save()

    # first check that we don't get a last harvest date for anyone
    plugins = []
    for harvester_path in app.config.get("HARVESTERS", []):
        harvester = plugin.load_class(harvester_path)()
        plugins.append(harvester)
        assert state.get_last_harvest(harvester.get_name()) is None

    # record a random harvest date for each plugin, remembering what we set
    expected = {}
    for harvester in plugins:
        expected[harvester.get_name()] = dates.random_date()
        state.set_harvested(harvester.get_name(), expected[harvester.get_name()])
    state.save(blocking=True)

    # re-read the record and confirm every date comes back as stored
    reloaded = models.HarvestState.find_by_issn("abcdefg", "2222-2222")
    for harvester in plugins:
        assert reloaded.get_last_harvest(harvester.get_name()) == expected[harvester.get_name()]
def _get_dao_klass(self, cfg):
    """
    Resolve the model (DAO) class configured to handle this query.

    Reads the dotted class name from cfg["dao"] and loads it via the plugin
    loader; raises NoSuchObjectException if no such class can be found.
    """
    dao_path = cfg.get("dao")
    dao_klass = plugin.load_class(dao_path)
    if dao_klass is not None:
        return dao_klass
    raise exceptions.NoSuchObjectException(dao_path)
def tmp(cls):
    """
    Instantiate and return the configured local temp-storage implementation
    of the base Store class.

    In addition to the methods supplied by Store, the implementation must
    also provide a "path" function to give the on-disk path to a file.
    The dotted path of the class comes from the STORE_TMP_IMPL config key.
    """
    impl_path = app.config.get("STORE_TMP_IMPL")
    impl_klass = plugin.load_class(impl_path)
    return impl_klass()
def process_issn(cls, account_id, issn):
    """
    Run every configured harvester plugin against a single ISSN for the
    given account, saving articles and advancing the per-plugin harvest
    state as we go.

    The harvest state is persisted in a ``finally`` block, so even if a
    plugin raises part-way through, the progress made so far is recorded.

    :param account_id: id of the account that owns this ISSN
    :param issn: the ISSN to harvest
    :raises: re-raises any exception from the harvest loop after logging
    """
    app.logger.info("Processing ISSN:{x} for Account:{y}".format(
        y=account_id, x=issn))
    state = HarvestState.find_by_issn(account_id, issn)
    # if this issn is suspended, don't process it
    if state.suspended:
        return

    # record the (non-suspended) state in the run report
    Report.set_state_by_issn(issn, state)

    try:
        # get all the plugins that we need to run
        harvesters = app.config.get("HARVESTERS", [])
        for h in harvesters:
            # each entry is a dotted class path; instantiate the plugin
            p = plugin.load_class(h)()
            p_name = p.get_name()
            # resume from the last recorded harvest for this plugin, or
            # fall back to the configured initial date for a first run
            lh = state.get_last_harvest(p_name)
            if lh is None:
                lh = app.config.get("INITIAL_HARVEST_DATE")
            app.logger.info(
                "Processing ISSN:{x} for Account:{y} with Plugin:{z} Since:{a}"
                .format(y=account_id, x=issn, z=p_name, a=lh))
            Report.set_start_by_issn(p_name, issn, lh)

            # p.iterate yields (article, last-harvest-date) pairs since lh
            for article, lhd in p.iterate(issn, lh):
                saved = HarvesterWorkflow.process_article(
                    account_id, article)
                Report.increment_articles_processed(p_name)
                # if the above worked, then we can update the harvest state
                if saved:
                    state.set_harvested(p_name, lhd)
                    Report.increment_articles_saved_successfully(p_name)
    except Exception:
        # log for traceability, then let the caller see the failure
        app.logger.info(
            "Exception Processing ISSN:{x} for Account:{y} ".format(
                y=account_id, x=issn))
        raise
    finally:
        # once we've finished working with this issn, we should update the state
        # this is especially true if there is an exception, as this will allow us
        # to record where we got to, without having to do a save after each article
        # create
        state.save(blocking=True)
        app.logger.info(
            "Saved state record for ISSN:{x} for Account:{y}".format(
                y=account_id, x=issn))
def _file_upload(cls, username, f, schema, previous):
    """
    Store an uploaded article file on disk and in the index, then validate it.

    A FileUpload record is created first; the incoming file is written to the
    configured UPLOAD_DIR; the file is then validated against the crosswalk
    for *schema*.  On any failure the record is marked failed (or deleted if
    it never got recorded) and the on-disk file is moved aside via
    ``file_failed`` on a best-effort basis, so no file is left behind
    unrecorded.

    :param username: owner of the upload
    :param f: the incoming file object (werkzeug-style: has .filename, .save)
    :param schema: key into the ARTICLE_CROSSWALKS config for validation
    :param previous: list of prior uploads; the new record is prepended
    :return: the id of the FileUpload record on success
    :raises BackgroundException: on any storage or validation failure
    """
    # prep a record to go into the index, to record this upload
    record = models.FileUpload()
    record.upload(username, f.filename)
    record.set_id()

    # the file path that we are going to write to
    xml = os.path.join(app.config.get("UPLOAD_DIR", "."), record.local_filename)

    # it's critical here that no errors cause files to get left behind unrecorded
    # NOTE: narrowed from bare `except:` so KeyboardInterrupt/SystemExit are
    # not swallowed or converted into BackgroundException
    try:
        # write the incoming file out to the XML file
        f.save(xml)
        # save the index entry
        record.save()
    except Exception:
        # if we can't record either of these things, we need to back right off
        # (cleanup is deliberately best-effort: each step may itself fail)
        try:
            file_failed(xml)
        except Exception:
            pass
        try:
            record.delete()
        except Exception:
            pass
        raise BackgroundException(
            "Failed to upload file - please contact an administrator")

    xwalk_name = app.config.get("ARTICLE_CROSSWALKS", {}).get(schema)
    xwalk = plugin.load_class(xwalk_name)()

    # now we have the record in the index and on disk, we can attempt to
    # validate it
    try:
        with open(xml) as handle:
            xwalk.validate_file(handle)
        record.validated(schema)
        record.save()
        previous.insert(0, record)
        return record.id
    except IngestException as e:
        # validation failed: mark the record, move the file aside (best-effort)
        record.failed(e.message, e.inner_message)
        try:
            file_failed(xml)
        except Exception:
            pass
        record.save()
        previous.insert(0, record)
        raise BackgroundException("Failed to upload file: " + e.message +
                                  "; " + str(e.inner_message))
    except Exception as e:
        # anything else reading the file from disk
        record.failed("File system error when reading file")
        try:
            file_failed(xml)
        except Exception:
            pass
        record.save()
        previous.insert(0, record)
        raise BackgroundException(
            "Failed to upload file - please contact an administrator")
def _process(self, file_upload):
    """
    Import the articles from a previously uploaded (and downloaded) file.

    The file is crosswalked into article objects and batch-created via the
    article service; the FileUpload record and the background job's audit
    trail are updated to reflect success, partial success, or failure.
    On success (and when no IngestException occurred) the on-disk file is
    removed.

    :param file_upload: the FileUpload record being processed
    :raises RetryException: if the file is not yet present on disk
    """
    job = self.background_job
    upload_dir = app.config.get("UPLOAD_DIR")
    path = os.path.join(upload_dir, file_upload.local_filename)

    # the file may not have been synced to disk yet; bump the attempt
    # counter and retry later, failing the job once the limit is reached
    if not os.path.exists(path):
        job.add_audit_message(
            u"File not found at path {} . Retrying job later.".format(path))
        count = self.get_param(job.params, "attempts")
        retry_limit = app.config.get("HUEY_TASKS", {}).get(
            "ingest_articles", {}).get("retries", 0)
        self.set_param(job.params, "attempts", count + 1)

        if retry_limit <= count:
            job.add_audit_message(
                u"File still not found at path {} . Giving up.".format(path))
            job.fail()

        raise RetryException()

    job.add_audit_message(u"Importing from {x}".format(x=path))

    articleService = DOAJ.articleService()
    account = models.Account.pull(file_upload.owner)

    # look up the crosswalk class for this file's schema and instantiate it
    xwalk_name = app.config.get("ARTICLE_CROSSWALKS", {}).get(file_upload.schema)
    xwalk = plugin.load_class(xwalk_name)()

    # ingest_exception tracks whether file_failed() already dealt with the
    # file, so we know not to remove it again at the end
    ingest_exception = False
    result = {}
    try:
        with open(path) as handle:
            articles = xwalk.crosswalk_file(
                handle, add_journal_info=False
            )  # don't import the journal info, as we haven't validated ownership of the ISSNs in the article yet
            for article in articles:
                article.set_upload_id(file_upload.id)
            result = articleService.batch_create_articles(
                articles, account, add_journal_info=True)
    except IngestException as e:
        # file could not be crosswalked; record the failure and keep any
        # partial result the exception carries
        # NOTE: e.message is the Python-2-era exception attribute used
        # throughout this codebase
        job.add_audit_message(
            u"IngestException: {msg}. Inner message: {inner}. Stack: {x}".
            format(msg=e.message, inner=e.inner_message, x=e.trace()))
        file_upload.failed(e.message, e.inner_message)
        result = e.result
        try:
            file_failed(path)
            ingest_exception = True
        except:
            job.add_audit_message(
                u"Error cleaning up file which caused IngestException: {x}"
                .format(x=traceback.format_exc()))
    except (DuplicateArticleException, ArticleNotAcceptable) as e:
        # article content problems; mark the upload failed and stop here
        job.add_audit_message(
            u"One or more articles did not contain either a DOI or a Fulltext URL"
        )
        file_upload.failed(
            u"One or more articles did not contain either a DOI or a Fulltext URL"
        )
        try:
            file_failed(path)
        except:
            job.add_audit_message(
                u"Error cleaning up file which caused Exception: {x}".
                format(x=traceback.format_exc()))
        return
    except Exception as e:
        # unexpected error; record the full traceback and stop here
        job.add_audit_message(
            u"Unanticipated error: {x}".format(x=traceback.format_exc()))
        file_upload.failed("Unanticipated error when importing articles")
        try:
            file_failed(path)
        except:
            job.add_audit_message(
                u"Error cleaning up file which caused Exception: {x}".
                format(x=traceback.format_exc()))
        return

    # summarise the batch-create result (also reached after IngestException,
    # which sets result = e.result rather than returning)
    success = result.get("success", 0)
    fail = result.get("fail", 0)
    update = result.get("update", 0)
    new = result.get("new", 0)
    shared = result.get("shared", [])
    unowned = result.get("unowned", [])
    unmatched = result.get("unmatched", [])

    if success == 0 and fail > 0 and not ingest_exception:
        file_upload.failed("All articles in file failed to import")
        job.add_audit_message("All articles in file failed to import")
    if success > 0 and fail == 0:
        file_upload.processed(success, update, new)
    if success > 0 and fail > 0:
        file_upload.partial(success, fail, update, new)
        job.add_audit_message(
            "Some articles in file failed to import correctly, so no articles imported"
        )

    # record the ISSN-level reasons for any failures on the upload record
    file_upload.set_failure_reasons(
        list(shared), list(unowned), list(unmatched))
    job.add_audit_message("Shared ISSNs: " + ", ".join(list(shared)))
    job.add_audit_message("Unowned ISSNs: " + ", ".join(list(unowned)))
    job.add_audit_message("Unmatched ISSNs: " + ", ".join(list(unmatched)))

    # clean up the on-disk file unless file_failed() already handled it
    if not ingest_exception:
        try:
            os.remove(path)  # just remove the file, no need to keep it
        except Exception as e:
            job.add_audit_message(
                u"Error while deleting file {x}: {y}".format(x=path,
                                                             y=e.message))
def _download(self, file_upload):
    """
    Fetch the remote file referenced by *file_upload* into UPLOAD_DIR and
    validate it against the crosswalk for its schema.

    :param file_upload: the FileUpload record describing the remote file
    :return: True if the file was downloaded and validated, False otherwise
    """
    job = self.background_job
    target = os.path.join(app.config.get("UPLOAD_DIR"),
                          file_upload.local_filename)

    # first, determine if ftp or http
    remote = urlparse(file_upload.filename)
    if remote.scheme == 'ftp':
        if not ftp_upload(job, target, remote, file_upload):
            return False
    elif remote.scheme in ['http', "https"]:
        if not http_upload(job, target, file_upload):
            return False
    else:
        # unsupported scheme: audit it, fail the record, bail out
        msg = u"We only support HTTP(s) and FTP uploads by URL. This is a: {x}".format(
            x=remote.scheme)
        job.add_audit_message(msg)
        file_upload.failed(msg)
        return False

    job.add_audit_message(u"Downloaded {x} as {y}".format(
        x=file_upload.filename, y=file_upload.local_filename))

    # resolve and instantiate the crosswalk for this file's schema
    crosswalk_path = app.config.get("ARTICLE_CROSSWALKS", {}).get(
        file_upload.schema)
    crosswalk = plugin.load_class(crosswalk_path)()

    # now we have the record in the index and on disk, we can attempt to
    # validate it
    try:
        with open(target) as handle:
            crosswalk.validate_file(handle)
    except IngestException as e:
        job.add_audit_message(u"IngestException: {x}".format(x=e.trace()))
        file_upload.failed(e.message, e.inner_message)
        try:
            file_failed(target)
        except:
            job.add_audit_message(
                u"Error cleaning up file which caused IngestException: {x}"
                .format(x=traceback.format_exc()))
        return False
    except Exception as e:
        job.add_audit_message(
            u"File system error while downloading file: {x}".format(
                x=traceback.format_exc()))
        file_upload.failed("File system error when downloading file")
        try:
            file_failed(target)
        except:
            job.add_audit_message(
                u"Error cleaning up file which caused Exception: {x}".
                format(x=traceback.format_exc()))
        return False

    # if we get to here then we have a successfully downloaded and validated
    # document, so we can write it to the index
    job.add_audit_message(
        u"Validated file as schema {x}".format(x=file_upload.schema))
    file_upload.validated(file_upload.schema)
    return True