import esprit, requests

from portality.core import app, initialise_index

# Make sure the local index exists with the right mappings before copying.
initialise_index(app)

# Read-only gateway connection to the live DOAJ index.
live = esprit.raw.Connection(
    app.config.get("DOAJGATE_URL"),
    "doaj",
    auth=requests.auth.HTTPBasicAuth(app.config.get("DOAJGATE_UN"), app.config.get("DOAJGATE_PW")),
    verify_ssl=False,
    port=app.config.get("DOAJGATE_PORT")
)

# Local index that receives the copies.
local = esprit.raw.Connection(app.config.get("ELASTIC_SEARCH_HOST"), app.config.get("ELASTIC_SEARCH_DB"))

# (type, limit) pairs to mirror from live to local; None means copy everything.
copy_specs = [
    ("account", None),
    ("journal", None),
    ("suggestion", None),
    ("article", 100000),
    ("editor_group", None),
]

for doc_type, doc_limit in copy_specs:
    if doc_limit is None:
        esprit.tasks.copy(live, doc_type, local, doc_type, method="GET")
    else:
        esprit.tasks.copy(live, doc_type, local, doc_type, limit=doc_limit, method="GET")
time.sleep(10) still_running = [hrv for hrv in running_harvesters if hrv.is_running()] # Move on to killing the processes if they don't respond to terminate if len(still_running) > 0: print("Old Harvesters are still running. Escalating to SIGKILL.") [h.kill() for h in running_harvesters] time.sleep(10) # Startup complete, change process name to running. setproctitle(RUNNING_PROCTITLE) if __name__ == "__main__": run_only_once() initialise_index(app) sub_prefix = app.config.get('HARVESTER_EMAIL_SUBJECT_PREFIX', '') # Send an email when the harvester starts. mail_prereqs = False fro = app.config.get("HARVESTER_EMAIL_FROM_ADDRESS", '*****@*****.**') if app.config.get("HARVESTER_EMAIL_ON_EVENT", False): to = app.config.get("HARVESTER_EMAIL_RECIPIENTS", None) if to is not None: mail_prereqs = True from portality import app_email as mail mail.send_mail( to=to, fro=fro, subject=sub_prefix + "DOAJ Harvester started at {0}".format(
def init_index(self):
    """(Re)create the index mappings for the test application."""
    target_app = self.app_test
    core.initialise_index(target_app)
def do_import(config):
    """Import anonymised data from the anon_data store into the Elasticsearch index.

    The supplied ``config`` dict may override the app's ES connection
    ("elastic_search_host" / "elastic_search_db"), select which types to
    import (``types`` mapping, each with optional "import" flag and "limit"),
    ask for interactive confirmation ("confirm", default True), and cap the
    bulk-load request size ("max_content_length").

    Side effects: deletes the selected types from the target index,
    re-initialises the mappings, then bulk-loads the stored .bulk.N files.
    Exits the process if the user declines the confirmation prompt.

    NOTE(compat): the original used the Python-2-only ``dict.iteritems()``
    and ``raw_input`` alongside Python-3-style ``print()`` calls; this
    version uses ``.items()`` and a guarded input shim so it runs under
    either interpreter.
    """
    # Allow the config to override the app's ES connection details.
    host = app.config["ELASTIC_SEARCH_HOST"]
    index = app.config["ELASTIC_SEARCH_DB"]
    if config.get("elastic_search_host") is not None:
        host = config.get("elastic_search_host")
        app.config["ELASTIC_SEARCH_HOST"] = host
    if config.get("elastic_search_db") is not None:
        index = config.get("elastic_search_db")
        app.config["ELASTIC_SEARCH_DB"] = index

    print("\n")
    print("Using host {x} and index {y}\n".format(x=host, y=index))
    conn = esprit.raw.make_connection(None, host, None, index)

    # Filter for the types we are going to work with.
    import_types = {}
    for t, s in config.get("types", {}).items():
        if s.get("import", False) is True:
            import_types[t] = s

    print("==Carrying out the following import==")
    for import_type, cfg in import_types.items():
        count = "All" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("{x} from {y}".format(x=count, y=import_type))
    print("\n")

    if config.get("confirm", True):
        # raw_input on Python 2, input on Python 3.
        try:
            prompt = raw_input
        except NameError:
            prompt = input
        text = prompt("Continue? [y/N] ")
        if text.lower() != "y":
            exit()

    # Remove all the types that we are going to import.
    for import_type in import_types.keys():
        esprit.raw.delete(conn, import_type)

    # Re-initialise the index (sorting out mappings, etc).
    print("==Initialising Index for Mappings==")
    initialise_index(app)

    mainStore = StoreFactory.get("anon_data")
    tempStore = StoreFactory.tmp()
    container = app.config.get("STORE_ANON_DATA_CONTAINER")

    print("\n==Importing==")
    for import_type, cfg in import_types.items():
        count = "all" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("Importing {x} from {y}".format(x=count, y=import_type))
        print("Obtaining {x} from storage".format(x=import_type))

        # -1 means unlimited; normalise to None for the bulk loader.
        limit = cfg.get("limit", -1)
        limit = None if limit == -1 else limit

        # Files are stored as <type>.bulk.1, <type>.bulk.2, ... ; stop at the
        # first missing chunk.
        n = 1
        while True:
            filename = import_type + ".bulk" + "." + str(n)
            handle = mainStore.get(container, filename)
            if handle is None:
                break
            tempStore.store(container, filename + ".gz", source_stream=handle)
            print("Retrieved {x} from storage".format(x=filename))
            handle.close()

            print("Unzipping {x} in temporary store".format(x=filename))
            compressed_file = tempStore.path(container, filename + ".gz")
            uncompressed_file = tempStore.path(container, filename, must_exist=False)
            with gzip.open(compressed_file, "rb") as f_in, open(uncompressed_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
            tempStore.delete(container, filename + ".gz")

            print("Importing from {x}".format(x=filename))
            imported_count = esprit.tasks.bulk_load(conn, import_type, uncompressed_file,
                                                    limit=limit,
                                                    max_content_length=config.get("max_content_length", 100000000))
            tempStore.delete(container, filename)

            # Track how many records remain against the configured limit;
            # bulk_load returns -1 when it could not count what it loaded.
            if limit is not None and imported_count != -1:
                limit -= imported_count
            if limit is not None and limit <= 0:
                break

            n += 1

    # Clean up the whole temporary container once all types are imported.
    tempStore.delete(container)
def setUp(self):
    """Reset the search index before each test case runs."""
    application = core.app
    core.initialise_index(application)
    # Pause briefly so Elasticsearch can register the new mappings.
    time.sleep(1)
def init_index(self):
    """Set up index mappings on the test app via the core helper."""
    core.initialise_index(self.app_test)
def do_import(config):
    """Import anonymised data from the anon_data store into the Elasticsearch index.

    The supplied ``config`` dict may override the app's ES connection
    ("elastic_search_host" / "elastic_search_db"), select which types to
    import (``types`` mapping, each with optional "import" flag and "limit"),
    ask for interactive confirmation ("confirm", default True), and cap the
    bulk-load request size ("max_content_length").

    Side effects: deletes the selected types from the target index,
    re-initialises the mappings, then bulk-loads the stored .bulk.N files.
    Exits the process if the user declines the confirmation prompt.

    NOTE(compat): the original used the Python-2-only ``dict.iteritems()``
    and ``raw_input`` alongside Python-3-style ``print()`` calls; this
    version uses ``.items()`` and a guarded input shim so it runs under
    either interpreter.
    """
    # Allow the config to override the app's ES connection details.
    host = app.config["ELASTIC_SEARCH_HOST"]
    index = app.config["ELASTIC_SEARCH_DB"]
    if config.get("elastic_search_host") is not None:
        host = config.get("elastic_search_host")
        app.config["ELASTIC_SEARCH_HOST"] = host
    if config.get("elastic_search_db") is not None:
        index = config.get("elastic_search_db")
        app.config["ELASTIC_SEARCH_DB"] = index

    print("\n")
    print("Using host {x} and index {y}\n".format(x=host, y=index))
    conn = esprit.raw.make_connection(None, host, None, index)

    # Filter for the types we are going to work with.
    import_types = {}
    for t, s in config.get("types", {}).items():
        if s.get("import", False) is True:
            import_types[t] = s

    print("==Carrying out the following import==")
    for import_type, cfg in import_types.items():
        count = "All" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("{x} from {y}".format(x=count, y=import_type))
    print("\n")

    if config.get("confirm", True):
        # raw_input on Python 2, input on Python 3.
        try:
            prompt = raw_input
        except NameError:
            prompt = input
        text = prompt("Continue? [y/N] ")
        if text.lower() != "y":
            exit()

    # Remove all the types that we are going to import.
    for import_type in import_types.keys():
        esprit.raw.delete(conn, import_type)

    # Re-initialise the index (sorting out mappings, etc).
    print("==Initialising Index for Mappings==")
    initialise_index(app)

    mainStore = StoreFactory.get("anon_data")
    tempStore = StoreFactory.tmp()
    container = app.config.get("STORE_ANON_DATA_CONTAINER")

    print("\n==Importing==")
    for import_type, cfg in import_types.items():
        count = "all" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("Importing {x} from {y}".format(x=count, y=import_type))
        print("Obtaining {x} from storage".format(x=import_type))

        # -1 means unlimited; normalise to None for the bulk loader.
        limit = cfg.get("limit", -1)
        limit = None if limit == -1 else limit

        # Files are stored as <type>.bulk.1, <type>.bulk.2, ... ; stop at the
        # first missing chunk.
        n = 1
        while True:
            filename = import_type + ".bulk" + "." + str(n)
            handle = mainStore.get(container, filename)
            if handle is None:
                break
            tempStore.store(container, filename + ".gz", source_stream=handle)
            print("Retrieved {x} from storage".format(x=filename))
            handle.close()

            print("Unzipping {x} in temporary store".format(x=filename))
            compressed_file = tempStore.path(container, filename + ".gz")
            uncompressed_file = tempStore.path(container, filename, must_exist=False)
            with gzip.open(compressed_file, "rb") as f_in, open(uncompressed_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
            tempStore.delete_file(container, filename + ".gz")

            print("Importing from {x}".format(x=filename))
            imported_count = esprit.tasks.bulk_load(conn, import_type, uncompressed_file,
                                                    limit=limit,
                                                    max_content_length=config.get("max_content_length", 100000000))
            tempStore.delete_file(container, filename)

            # Track how many records remain against the configured limit;
            # bulk_load returns -1 when it could not count what it loaded.
            if limit is not None and imported_count != -1:
                limit -= imported_count
            if limit is not None and limit <= 0:
                break

            n += 1

    # Clean up the whole temporary container once all types are imported.
    tempStore.delete_file(container)