def test_01_sitemap(self):
    user = app.config.get("SYSTEM_USERNAME")
    job = sitemap.SitemapBackgroundTask.prepare(user)
    task = sitemap.SitemapBackgroundTask(job)
    BackgroundApi.execute(task)
    time.sleep(1)
    assert len(os.listdir(os.path.join(self.tmp_dir, "sitemap"))) == 1

def test_0x_check_latest_es_backup(self):
    user = app.config.get("SYSTEM_USERNAME")
    job = check_latest_es_backup.CheckLatestESBackupBackgroundTask.prepare(user)
    task = check_latest_es_backup.CheckLatestESBackupBackgroundTask(job)
    BackgroundApi.execute(task)
def check_latest_es_backup(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = CheckLatestESBackupBackgroundTask(job)
    BackgroundApi.execute(task)

def journal_bulk_delete(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = JournalBulkDeleteBackgroundTask(job)
    BackgroundApi.execute(task)

def async_workflow_notifications(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = AsyncWorkflowBackgroundTask(job)
    BackgroundApi.execute(task)

def public_data_dump(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = PublicDataDumpBackgroundTask(job)
    BackgroundApi.execute(task)

def prune_es_backups(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = PruneESBackupsBackgroundTask(job)
    BackgroundApi.execute(task)

def read_news(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = ReadNewsBackgroundTask(job)
    BackgroundApi.execute(task)

def article_cleanup_sync(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = ArticleCleanupSyncBackgroundTask(job)
    BackgroundApi.execute(task)

def suggestion_bulk_edit(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = SuggestionBulkEditBackgroundTask(job)
    BackgroundApi.execute(task)

def article_duplicate_report(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = ArticleDuplicateReportBackgroundTask(job)
    BackgroundApi.execute(task)

def article_bulk_delete(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = ArticleBulkDeleteBackgroundTask(job)
    BackgroundApi.execute(task)

def journal_csv(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = JournalCSVBackgroundTask(job)
    BackgroundApi.execute(task)

def request_es_backup(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = RequestESBackupBackgroundTask(job)
    BackgroundApi.execute(task)
def test_public_data_dump(self, name, kwargs):
    clean_arg = kwargs.get("clean")
    prune_arg = kwargs.get("prune")
    types_arg = kwargs.get("types")
    journals_arg = kwargs.get("journals")
    articles_arg = kwargs.get("articles")
    batch_size_arg = kwargs.get("batch_size")
    tmp_write_arg = kwargs.get("tmp_write")
    store_write_arg = kwargs.get("store_write")
    status_arg = kwargs.get("status")

    ###############################################
    ## set up

    clean = True if clean_arg == "yes" else False if clean_arg == "no" else None
    prune = True if prune_arg == "yes" else False if prune_arg == "no" else None
    types = types_arg if types_arg != "-" else None

    journal_count = int(journals_arg)
    article_count = int(articles_arg)
    batch_size = int(batch_size_arg)
    journal_file_count = 0 if journal_count == 0 else (journal_count // batch_size) + 1
    article_file_count = 0 if article_count == 0 else (article_count // batch_size) + 1
    first_article_file_records = 0 if article_count == 0 else batch_size if article_count > batch_size else article_count
    first_journal_file_records = 0 if journal_count == 0 else batch_size if journal_count > batch_size else journal_count

    # add the data to the index first, to maximise the time it has to become available for search
    sources = JournalFixtureFactory.make_many_journal_sources(journal_count, in_doaj=True)
    jids = []
    for i in range(len(sources)):
        source = sources[i]
        journal = models.Journal(**source)
        journal.save()
        jids.append((journal.id, journal.last_updated))

    aids = []
    for i in range(article_count):
        source = ArticleFixtureFactory.make_article_source(
            eissn="{x}000-0000".format(x=i),
            pissn="0000-{x}000".format(x=i),
            with_id=False,
            doi="10.123/{x}".format(x=i),
            fulltext="http://example.com/{x}".format(x=i))
        article = models.Article(**source)
        article.save()
        aids.append((article.id, article.last_updated))

    # construct some test data in the local store
    container_id = app.config["STORE_PUBLIC_DATA_DUMP_CONTAINER"]
    localStore = store.StoreLocal(None)
    localStoreFiles = []
    if clean or prune:
        for i in range(5):
            localStore.store(container_id, "doaj_article_data_2018-01-0" + str(i) + ".tar.gz",
                             source_stream=StringIO("test"))
            localStore.store(container_id, "doaj_journal_data_2018-01-0" + str(i) + ".tar.gz",
                             source_stream=StringIO("test"))
        localStoreFiles = localStore.list(container_id)

    app.config["DISCOVERY_RECORDS_PER_FILE"] = batch_size

    # set the mocks for store write failures
    if tmp_write_arg == "fail":
        app.config["STORE_TMP_IMPL"] = StoreMockFactory.no_writes_classpath()

    if store_write_arg == "fail":
        app.config["STORE_IMPL"] = StoreMockFactory.no_writes_classpath()

    # block until all the records are saved
    for jid, lu in jids:
        models.Journal.block(jid, lu, sleep=0.05)
    for aid, lu in aids:
        models.Article.block(aid, lu, sleep=0.05)

    ###########################################################
    # Execution

    job = PublicDataDumpBackgroundTask.prepare("testuser", clean=clean, prune=prune, types=types)
    task = PublicDataDumpBackgroundTask(job)
    BackgroundApi.execute(task)

    # make sure we have a fresh copy of the job
    job = task.background_job
    assert job.status == status_arg

    if job.status != "error":
        article_url = models.cache.Cache.get_public_data_dump().get("article", {}).get("url")
        if types_arg in ["-", "all", "article"]:
            assert article_url is not None
        else:
            assert article_url is None

        journal_url = models.cache.Cache.get_public_data_dump().get("journal", {}).get("url")
        if types_arg in ["-", "all", "journal"]:
            assert journal_url is not None
        else:
            assert journal_url is None

        assert localStore.exists(container_id)
        files = localStore.list(container_id)
        if types_arg in ["-", "all"]:
            assert len(files) == 2
        else:
            assert len(files) == 1

        day_at_start = dates.today()

        if types_arg in ["-", "all", "article"]:
            article_file = "doaj_article_data_" + day_at_start + ".tar.gz"
            assert article_file in files

            stream = localStore.get(container_id, article_file)
            tarball = tarfile.open(fileobj=stream, mode="r:gz")
            members = tarball.getmembers()
            assert len(members) == article_file_count

            if len(members) > 0:
                f = tarball.extractfile(members[0])
                data = json.loads(f.read().decode("utf-8"))
                assert len(data) == first_article_file_records
                record = data[0]
                for key in list(record.keys()):
                    assert key in ["admin", "bibjson", "id", "last_updated", "created_date"]
                if "admin" in record:
                    for key in list(record["admin"].keys()):
                        assert key in ["ticked", "seal"]

        if types_arg in ["-", "all", "journal"]:
            journal_file = "doaj_journal_data_" + day_at_start + ".tar.gz"
            assert journal_file in files

            stream = localStore.get(container_id, journal_file)
            tarball = tarfile.open(fileobj=stream, mode="r:gz")
            members = tarball.getmembers()
            assert len(members) == journal_file_count

            if len(members) > 0:
                f = tarball.extractfile(members[0])
                data = json.loads(f.read().decode("utf-8"))
                assert len(data) == first_journal_file_records
                record = data[0]
                for key in list(record.keys()):
                    assert key in ["admin", "bibjson", "id", "last_updated", "created_date"]
                if "admin" in record:
                    for key in list(record["admin"].keys()):
                        assert key in ["ticked", "seal"]
    else:
        # in the case of an error, we expect the tmp store to have been cleaned up
        tmpStore = store.TempStore()
        assert not tmpStore.exists(container_id)

        # in the case of an error, we expect the main store not to have been touched
        # (for the errors that we are checking for)
        if prune and not clean:
            # no matter what the error, if we didn't specify clean then we expect everything
            # to survive
            survived = localStore.list(container_id)
            assert localStoreFiles == survived
        elif clean:
            # if we specified clean, then it's possible the main store was cleaned before the
            # error occurred, in which case it depends on the error.  This reminds us that
            # clean shouldn't be used in production
            if tmp_write_arg == "fail":
                assert not localStore.exists(container_id)
            else:
                survived = localStore.list(container_id)
                assert localStoreFiles == survived
        else:
            # otherwise, we expect the main store to have survived
            assert not localStore.exists(container_id)
def run_reports(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = ReportingBackgroundTask(job)
    BackgroundApi.execute(task)
""" use this script if you want to manually (and synchronously) execute the sitemap task """ from portality.tasks import sitemap from portality.core import app from portality.background import BackgroundApi if __name__ == "__main__": user = app.config.get("SYSTEM_USERNAME") job = sitemap.SitemapBackgroundTask.prepare(user) task = sitemap.SitemapBackgroundTask(job) BackgroundApi.execute(task)
def ingest_articles(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = IngestArticlesBackgroundTask(job)
    BackgroundApi.execute(task)

def set_in_doaj(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = SetInDOAJBackgroundTask(job)
    BackgroundApi.execute(task)
def test_public_data_dump(self, name, kwargs):
    clean_arg = kwargs.get("clean")
    prune_arg = kwargs.get("prune")
    types_arg = kwargs.get("types")
    journals_arg = kwargs.get("journals")
    articles_arg = kwargs.get("articles")
    batch_size_arg = kwargs.get("batch_size")
    tmp_write_arg = kwargs.get("tmp_write")
    store_write_arg = kwargs.get("store_write")
    status_arg = kwargs.get("status")

    ###############################################
    ## set up

    clean = True if clean_arg == "yes" else False if clean_arg == "no" else None
    prune = True if prune_arg == "yes" else False if prune_arg == "no" else None
    types = types_arg if types_arg != "-" else None

    journal_count = int(journals_arg)
    article_count = int(articles_arg)
    batch_size = int(batch_size_arg)
    journal_file_count = 0 if journal_count == 0 else (journal_count / batch_size) + 1
    article_file_count = 0 if article_count == 0 else (article_count / batch_size) + 1
    first_article_file_records = 0 if article_count == 0 else batch_size if article_count > batch_size else article_count
    first_journal_file_records = 0 if journal_count == 0 else batch_size if journal_count > batch_size else journal_count

    # add the data to the index first, to maximise the time it has to become available for search
    sources = JournalFixtureFactory.make_many_journal_sources(journal_count, in_doaj=True)
    jids = []
    for i in range(len(sources)):
        source = sources[i]
        journal = models.Journal(**source)
        journal.save()
        jids.append((journal.id, journal.last_updated))

    aids = []
    for i in range(article_count):
        source = ArticleFixtureFactory.make_article_source(
            eissn="{x}000-0000".format(x=i),
            pissn="0000-{x}000".format(x=i),
            with_id=False,
            doi="10.123/{x}".format(x=i),
            fulltext="http://example.com/{x}".format(x=i)
        )
        article = models.Article(**source)
        article.save()
        aids.append((article.id, article.last_updated))

    # construct some test data in the local store
    container_id = app.config["STORE_PUBLIC_DATA_DUMP_CONTAINER"]
    localStore = store.StoreLocal(None)
    localStoreFiles = []
    if clean or prune:
        for i in range(5):
            localStore.store(container_id, "doaj_article_data_2018-01-0" + str(i) + ".tar.gz",
                             source_stream=StringIO("test"))
            localStore.store(container_id, "doaj_journal_data_2018-01-0" + str(i) + ".tar.gz",
                             source_stream=StringIO("test"))
        localStoreFiles = localStore.list(container_id)

    app.config["DISCOVERY_RECORDS_PER_FILE"] = batch_size

    # set the mocks for store write failures
    if tmp_write_arg == "fail":
        app.config["STORE_TMP_IMPL"] = StoreMockFactory.no_writes_classpath()

    if store_write_arg == "fail":
        app.config["STORE_IMPL"] = StoreMockFactory.no_writes_classpath()

    # block until all the records are saved
    for jid, lu in jids:
        models.Journal.block(jid, lu, sleep=0.05)
    for aid, lu in aids:
        models.Article.block(aid, lu, sleep=0.05)

    ###########################################################
    # Execution

    job = PublicDataDumpBackgroundTask.prepare("testuser", clean=clean, prune=prune, types=types)
    task = PublicDataDumpBackgroundTask(job)
    BackgroundApi.execute(task)

    # make sure we have a fresh copy of the job
    job = task.background_job
    assert job.status == status_arg

    if job.status != "error":
        article_url = models.cache.Cache.get_public_data_dump().get("article", {}).get("url")
        if types_arg in ["-", "all", "article"]:
            assert article_url is not None
        else:
            assert article_url is None

        journal_url = models.cache.Cache.get_public_data_dump().get("journal", {}).get("url")
        if types_arg in ["-", "all", "journal"]:
            assert journal_url is not None
        else:
            assert journal_url is None

        assert localStore.exists(container_id)
        files = localStore.list(container_id)
        if types_arg in ["-", "all"]:
            assert len(files) == 2
        else:
            assert len(files) == 1

        day_at_start = dates.today()

        if types_arg in ["-", "all", "article"]:
            article_file = "doaj_article_data_" + day_at_start + ".tar.gz"
            assert article_file in files

            stream = localStore.get(container_id, article_file)
            tarball = tarfile.open(fileobj=stream, mode="r:gz")
            members = tarball.getmembers()
            assert len(members) == article_file_count

            if len(members) > 0:
                f = tarball.extractfile(members[0])
                data = json.loads(f.read())
                assert len(data) == first_article_file_records
                record = data[0]
                for key in record.keys():
                    assert key in ["admin", "bibjson", "id", "last_updated", "created_date"]
                if "admin" in record:
                    for key in record["admin"].keys():
                        assert key in ["ticked", "seal"]

        if types_arg in ["-", "all", "journal"]:
            journal_file = "doaj_journal_data_" + day_at_start + ".tar.gz"
            assert journal_file in files

            stream = localStore.get(container_id, journal_file)
            tarball = tarfile.open(fileobj=stream, mode="r:gz")
            members = tarball.getmembers()
            assert len(members) == journal_file_count

            if len(members) > 0:
                f = tarball.extractfile(members[0])
                data = json.loads(f.read())
                assert len(data) == first_journal_file_records
                record = data[0]
                for key in record.keys():
                    assert key in ["admin", "bibjson", "id", "last_updated", "created_date"]
                if "admin" in record:
                    for key in record["admin"].keys():
                        assert key in ["ticked", "seal"]
    else:
        # in the case of an error, we expect the tmp store to have been cleaned up
        tmpStore = store.TempStore()
        assert not tmpStore.exists(container_id)

        # in the case of an error, we expect the main store not to have been touched
        # (for the errors that we are checking for)
        if prune and not clean:
            # no matter what the error, if we didn't specify clean then we expect everything
            # to survive
            survived = localStore.list(container_id)
            assert localStoreFiles == survived
        elif clean:
            # if we specified clean, then it's possible the main store was cleaned before the
            # error occurred, in which case it depends on the error.  This reminds us that
            # clean shouldn't be used in production
            if tmp_write_arg == "fail":
                assert not localStore.exists(container_id)
            else:
                survived = localStore.list(container_id)
                assert localStoreFiles == survived
        else:
            # otherwise, we expect the main store to have survived
            assert not localStore.exists(container_id)
def generate_sitemap(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = SitemapBackgroundTask(job)
    BackgroundApi.execute(task)

def journal_bulk_edit(job_id):
    job = models.BackgroundJob.pull(job_id)
    task = JournalBulkEditBackgroundTask(job)
    BackgroundApi.execute(task)
from portality.core import app
from portality.tasks import read_news
from portality.background import BackgroundApi

if __name__ == "__main__":
    if app.config.get("SCRIPTS_READ_ONLY_MODE", False):
        print("System is in READ-ONLY mode, script cannot run")
        exit()

    user = app.config.get("SYSTEM_USERNAME")
    job = read_news.ReadNewsBackgroundTask.prepare(user)
    task = read_news.ReadNewsBackgroundTask(job)
    BackgroundApi.execute(task)