Beispiel #1
0
# Mirror the core DOAJ data types from the gated live Elasticsearch instance
# into the local index, re-initialising the local mappings first.
import esprit, requests
from portality.core import app, initialise_index

initialise_index(app)

# Authenticated connection to the live gateway; plain connection to the local ES.
gate_auth = requests.auth.HTTPBasicAuth(app.config.get("DOAJGATE_UN"), app.config.get("DOAJGATE_PW"))
live = esprit.raw.Connection(
    app.config.get("DOAJGATE_URL"),
    "doaj",
    auth=gate_auth,
    verify_ssl=False,
    port=app.config.get("DOAJGATE_PORT")
)
local = esprit.raw.Connection(app.config.get("ELASTIC_SEARCH_HOST"), app.config.get("ELASTIC_SEARCH_DB"))

# (type name, extra copy kwargs) pairs, copied in this order as before.
for es_type, extra in [
    ("account", {}),
    ("journal", {}),
    ("suggestion", {}),
    ("article", {"limit": 100000}),
    ("editor_group", {}),
]:
    esprit.tasks.copy(live, es_type, local, es_type, method="GET", **extra)
Beispiel #2
0
        time.sleep(10)
        still_running = [hrv for hrv in running_harvesters if hrv.is_running()]

    # Move on to killing the processes if they don't respond to terminate
    if len(still_running) > 0:
        print("Old Harvesters are still running. Escalating to SIGKILL.")
        [h.kill() for h in running_harvesters]
        time.sleep(10)

    # Startup complete, change process name to running.
    setproctitle(RUNNING_PROCTITLE)


if __name__ == "__main__":
    run_only_once()
    initialise_index(app)
    sub_prefix = app.config.get('HARVESTER_EMAIL_SUBJECT_PREFIX', '')

    # Send an email when the harvester starts.
    mail_prereqs = False
    fro = app.config.get("HARVESTER_EMAIL_FROM_ADDRESS", '*****@*****.**')
    if app.config.get("HARVESTER_EMAIL_ON_EVENT", False):
        to = app.config.get("HARVESTER_EMAIL_RECIPIENTS", None)

        if to is not None:
            mail_prereqs = True
            from portality import app_email as mail
            mail.send_mail(
                to=to,
                fro=fro,
                subject=sub_prefix + "DOAJ Harvester started at {0}".format(
Beispiel #3
0
 def init_index(self):
     # (Re)create the test application's index mappings via portality core.
     core.initialise_index(self.app_test)
Beispiel #4
0
def do_import(config):
    """Wipe and re-import selected types into an Elasticsearch index.

    :param config: dict controlling the import. Recognised keys:
        - ``elastic_search_host`` / ``elastic_search_db``: override the app's
          configured host/index (also written back into ``app.config``)
        - ``types``: mapping of type name -> {"import": bool, "limit": int};
          only entries with ``import: True`` are processed
        - ``confirm``: ask for interactive confirmation first (default True)
        - ``max_content_length``: bulk request size cap passed to esprit

    Side effects: deletes each selected type from the index, re-initialises
    the index mappings, then bulk-loads the numbered ``<type>.bulk.N`` files
    from the anon_data store, staging each gzip through a temporary store.
    """
    host = app.config["ELASTIC_SEARCH_HOST"]
    index = app.config["ELASTIC_SEARCH_DB"]
    if config.get("elastic_search_host") is not None:
        host = config.get("elastic_search_host")
        app.config["ELASTIC_SEARCH_HOST"] = host
    if config.get("elastic_search_db") is not None:
        index = config.get("elastic_search_db")
        app.config["ELASTIC_SEARCH_DB"] = index

    print("\n")
    print("Using host {x} and index {y}\n".format(x=host, y=index))
    conn = esprit.raw.make_connection(None, host, None, index)

    # filter for the types we are going to work with
    # (.items() instead of the Python-2-only .iteritems(), so this runs on 2 and 3)
    import_types = {}
    for t, s in config.get("types", {}).items():
        if s.get("import", False) is True:
            import_types[t] = s

    print("==Carrying out the following import==")
    for import_type, cfg in import_types.items():
        count = "All" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("{x} from {y}".format(x=count, y=import_type))
    print("\n")

    if config.get("confirm", True):
        # raw_input only exists on Python 2; fall back to input on Python 3
        try:
            read_line = raw_input
        except NameError:
            read_line = input
        text = read_line("Continue? [y/N] ")
        if text.lower() != "y":
            exit()

    # remove all the types that we are going to import
    for import_type in import_types.keys():
        esprit.raw.delete(conn, import_type)

    # re-initialise the index (sorting out mappings, etc)
    print("==Initialising Index for Mappings==")
    initialise_index(app)

    mainStore = StoreFactory.get("anon_data")
    tempStore = StoreFactory.tmp()
    container = app.config.get("STORE_ANON_DATA_CONTAINER")

    print("\n==Importing==")
    for import_type, cfg in import_types.items():
        count = "all" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("Importing {x} from {y}".format(x=count, y=import_type))
        print("Obtaining {x} from storage".format(x=import_type))

        # -1 means "no limit"; normalise to None for the bookkeeping below
        limit = cfg.get("limit", -1)
        limit = None if limit == -1 else limit

        n = 1
        while True:
            # bulk files are numbered <type>.bulk.1, <type>.bulk.2, ...;
            # a missing file marks the end of the sequence
            filename = import_type + ".bulk" + "." + str(n)
            handle = mainStore.get(container, filename)
            if handle is None:
                break
            tempStore.store(container, filename + ".gz", source_stream=handle)
            print("Retrieved {x} from storage".format(x=filename))
            handle.close()

            print("Unzipping {x} in temporary store".format(x=filename))
            compressed_file = tempStore.path(container, filename + ".gz")
            uncompressed_file = tempStore.path(container, filename, must_exist=False)
            with gzip.open(compressed_file, "rb") as f_in, open(uncompressed_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
            tempStore.delete(container, filename + ".gz")

            print("Importing from {x}".format(x=filename))
            imported_count = esprit.tasks.bulk_load(conn, import_type, uncompressed_file,
                                                    limit=limit, max_content_length=config.get("max_content_length", 100000000))
            tempStore.delete(container, filename)

            # bulk_load reports -1 when it couldn't count; only decrement on a real count
            if limit is not None and imported_count != -1:
                limit -= imported_count
            if limit is not None and limit <= 0:
                break

            n += 1

    # clear out the whole temporary container when done
    tempStore.delete(container)
Beispiel #5
0
 def setUp(self):
     # Re-create the index mappings before each test; the sleep gives the
     # index a moment to become ready — NOTE(review): confirm still needed.
     core.initialise_index(core.app)
     time.sleep(1)
Beispiel #6
0
 def init_index(self):
     # (Re)create the test application's index mappings via portality core.
     core.initialise_index(self.app_test)
Beispiel #7
0
def do_import(config):
    """Wipe and re-import selected types into an Elasticsearch index.

    :param config: dict controlling the import. Recognised keys:
        - ``elastic_search_host`` / ``elastic_search_db``: override the app's
          configured host/index (also written back into ``app.config``)
        - ``types``: mapping of type name -> {"import": bool, "limit": int};
          only entries with ``import: True`` are processed
        - ``confirm``: ask for interactive confirmation first (default True)
        - ``max_content_length``: bulk request size cap passed to esprit

    Side effects: deletes each selected type from the index, re-initialises
    the index mappings, then bulk-loads the numbered ``<type>.bulk.N`` files
    from the anon_data store, staging each gzip through a temporary store.
    """
    host = app.config["ELASTIC_SEARCH_HOST"]
    index = app.config["ELASTIC_SEARCH_DB"]
    if config.get("elastic_search_host") is not None:
        host = config.get("elastic_search_host")
        app.config["ELASTIC_SEARCH_HOST"] = host
    if config.get("elastic_search_db") is not None:
        index = config.get("elastic_search_db")
        app.config["ELASTIC_SEARCH_DB"] = index

    print("\n")
    print("Using host {x} and index {y}\n".format(x=host, y=index))
    conn = esprit.raw.make_connection(None, host, None, index)

    # filter for the types we are going to work with
    # (.items() instead of the Python-2-only .iteritems(), so this runs on 2 and 3)
    import_types = {}
    for t, s in config.get("types", {}).items():
        if s.get("import", False) is True:
            import_types[t] = s

    print("==Carrying out the following import==")
    for import_type, cfg in import_types.items():
        count = "All" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("{x} from {y}".format(x=count, y=import_type))
    print("\n")

    if config.get("confirm", True):
        # raw_input only exists on Python 2; fall back to input on Python 3
        try:
            read_line = raw_input
        except NameError:
            read_line = input
        text = read_line("Continue? [y/N] ")
        if text.lower() != "y":
            exit()

    # remove all the types that we are going to import
    for import_type in import_types.keys():
        esprit.raw.delete(conn, import_type)

    # re-initialise the index (sorting out mappings, etc)
    print("==Initialising Index for Mappings==")
    initialise_index(app)

    mainStore = StoreFactory.get("anon_data")
    tempStore = StoreFactory.tmp()
    container = app.config.get("STORE_ANON_DATA_CONTAINER")

    print("\n==Importing==")
    for import_type, cfg in import_types.items():
        count = "all" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("Importing {x} from {y}".format(x=count, y=import_type))
        print("Obtaining {x} from storage".format(x=import_type))

        # -1 means "no limit"; normalise to None for the bookkeeping below
        limit = cfg.get("limit", -1)
        limit = None if limit == -1 else limit

        n = 1
        while True:
            # bulk files are numbered <type>.bulk.1, <type>.bulk.2, ...;
            # a missing file marks the end of the sequence
            filename = import_type + ".bulk" + "." + str(n)
            handle = mainStore.get(container, filename)
            if handle is None:
                break
            tempStore.store(container, filename + ".gz", source_stream=handle)
            print("Retrieved {x} from storage".format(x=filename))
            handle.close()

            print("Unzipping {x} in temporary store".format(x=filename))
            compressed_file = tempStore.path(container, filename + ".gz")
            uncompressed_file = tempStore.path(container, filename, must_exist=False)
            with gzip.open(compressed_file, "rb") as f_in, open(uncompressed_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
            tempStore.delete_file(container, filename + ".gz")

            print("Importing from {x}".format(x=filename))
            imported_count = esprit.tasks.bulk_load(conn, import_type, uncompressed_file,
                                                    limit=limit, max_content_length=config.get("max_content_length", 100000000))
            tempStore.delete_file(container, filename)

            # bulk_load reports -1 when it couldn't count; only decrement on a real count
            if limit is not None and imported_count != -1:
                limit -= imported_count
            if limit is not None and limit <= 0:
                break

            n += 1

    # clear out the whole temporary container when done
    tempStore.delete_file(container)