Example 1: create_res_idx
def create_res_idx(res_name, num_docs):
    """Create the Elasticsearch index for res_name, skipping it if it already exists and re-creation is not requested."""
    global MIG_TOTAL
    MIG_TOTAL[res_name] = num_docs

    idx_desc = es_resources_desc.resources_2_es_mapping.get(res_name, None)
    idx_name = get_index_name(res_name)

    # A non-zero document count is used as a proxy for index existence
    idx_exists = es_util.get_idx_count(idx_name) > 0

    if idx_exists and not DELETE_AND_CREATE_INDEXES:
        MIG_LOG.info('INDEX {0} EXISTS. SKIPPING DELETION.'.format(idx_name))
        return

    n_shards = getattr(idx_desc, 'shards', num_shards_by_num_rows(num_docs))
    n_replicas = getattr(idx_desc, 'replicas', 0)
    res_analysis = getattr(idx_desc, 'analysis', None)
    res_mappings = getattr(idx_desc, 'mappings', None)
    es_util.create_idx(idx_name,
                       shards=n_shards,
                       replicas=n_replicas,
                       analysis=res_analysis,
                       mappings=res_mappings,
                       logger=MIG_LOG)
    MIG_LOG.info(
        "ELASTIC INDEX CREATED: RESOURCE:{0}->INDEX:{1} SHARDS:{2} REPLICAS:{3}"
        .format(res_name, idx_name, n_shards, n_replicas))
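The index creation above falls back to a num_shards_by_num_rows helper that is not shown in this excerpt; a minimal sketch of such a document-count-based heuristic could look like this (the thresholds are illustrative assumptions, not the project's actual values):

def num_shards_by_num_rows(num_rows):
    # Illustrative heuristic only: scale the shard count with the expected
    # number of documents so very large indexes are split across more shards.
    if num_rows < 1000000:
        return 1
    if num_rows < 10000000:
        return 3
    return 5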
Example 2: pre_cache_svg_files
def pre_cache_svg_files():
    """Pre-populate the RDKit and Indigo SVG caches for every molecule that has indexed structures."""
    global CACHING_PB, CACHING_PB_COUNT, WS_REQUEST_POOL, RDKIT_CACHE, INDIGO_CACHE, SVG_FAILURES, BASE_CACHE_PATH
    CACHING_PB = progress_bar_handler.get_new_progressbar(
        'molecule_svg_caching',
        max_val=es_util.get_idx_count(MOLECULE.idx_name))
    CACHING_PB_COUNT = 0

    def __handle_molecule_doc(doc, *args, **kwargs):
        if not STOP_SCAN:
            # Queue the SVG request twice: once with the default arguments and
            # once with the extra positional flag set to True.
            WS_REQUEST_POOL.submit(get_svg_by_chembl_id,
                                   doc['molecule_chembl_id'])
            WS_REQUEST_POOL.submit(get_svg_by_chembl_id,
                                   doc['molecule_chembl_id'], True)

    es_util.scan_index(MOLECULE.idx_name,
                       on_doc=__handle_molecule_doc,
                       query={
                           '_source': 'molecule_chembl_id',
                           'query': {
                               'query_string': {
                                   'query': '_exists_:molecule_structures'
                               }
                           }
                       })
    WS_REQUEST_POOL.join()
    CACHING_PB.finish()
    print('RDKIT SVG data has been cached for {0} CHEMBL IDS'.format(
        len(RDKIT_CACHE)),
          file=sys.stderr)
    print('INDIGO SVG data has been cached for {0} CHEMBL IDS'.format(
        len(INDIGO_CACHE)),
          file=sys.stderr)

    indigo_fails = 0
    rdkit_fails = 0
    both_fails = 0

    # Collapse each per-molecule list of failed renderers into a single label
    # and tally how many molecules failed with Indigo, RDKit or both.
    for key, value in SVG_FAILURES.items():
        if len(value) > 1:
            SVG_FAILURES[key] = 'BOTH'
            both_fails += 1
        else:
            if value[0] == 'INDIGO':
                indigo_fails += 1
            else:
                rdkit_fails += 1
            SVG_FAILURES[key] = value[0]

    failures_file_path = os.path.join(BASE_CACHE_PATH, 'svg_failures.json')
    try:
        with open(failures_file_path, 'w', encoding='utf-8') as failures_file:
            json.dump(SVG_FAILURES, failures_file)
    except Exception:
        traceback.print_exc()
        print('UNABLE TO WRITE FILE AT {0}'.format(failures_file_path),
              file=sys.stderr)

    print('INDIGO FAIL COUNT: {0}'.format(indigo_fails), file=sys.stderr)
    print('RDKIT FAIL COUNT: {0}'.format(rdkit_fails), file=sys.stderr)
    print('BOTH FAIL COUNT: {0}'.format(both_fails), file=sys.stderr)
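The aggregation loop above implies that SVG_FAILURES maps each ChEMBL ID to a list of renderer names before it is collapsed to a single label. A minimal sketch of a failure-recording helper consistent with that shape (the helper name is an assumption; the real recording code is not part of this excerpt):

from collections import defaultdict

# chembl_id -> list of renderers ('RDKIT' and/or 'INDIGO') that failed for it
SVG_FAILURES = defaultdict(list)

def record_svg_failure(chembl_id, renderer):
    # Hypothetical helper: get_svg_by_chembl_id would call this when a render fails
    SVG_FAILURES[chembl_id].append(renderer)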
Example 3: check_ws_vs_es_counts
def check_ws_vs_es_counts():
    """Print, for every resource, the WebServices document count next to the Elasticsearch index count and flag mismatches."""
    for resource_i in ALL_WS_RESOURCES:
        ws_count = get_ws_count(resource_i.res_name)
        es_count = es_util.get_idx_count(resource_i.idx_name)
        mismatch = ws_count == -1 or es_count == -1 or ws_count != es_count
        mismatch_txt = 'MISMATCH' if mismatch else ''
        # Right-align both counts in a 12-character, comma-grouped column
        formatted_ws_count = '{0:>12,}'.format(ws_count)
        formatted_es_count = '{0:>12,}'.format(es_count)
        print_txt = '{0}: ws_count: {1} - es_count: {2}  {3}'\
            .format(resource_i.get_res_name_for_print(), formatted_ws_count, formatted_es_count, mismatch_txt)
        print(print_txt)
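A quick, standalone illustration of the right-aligned, comma-grouped format spec used for the count columns above (the values are made up purely to show the layout):

# Illustrative values only: shows how '{0:>12,}' pads and groups the counts
for count in (512, 2430516):
    print('{0:>12,}'.format(count))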
Example 4: main
def main():
    t_ini = time.time()
    parser = argparse.ArgumentParser(
        description="Migrate ChEMBL data from the WebServices to Elastic Search"
    )
    parser.add_argument(
        "--delete_indexes",
        dest="delete_indexes",
        help="Delete indexes if they exist already in the elastic cluster.",
        action="store_true",
    )
    parser.add_argument(
        "-A",
        "--all",
        dest="migrate_all",
        help="Migrate all the data in the WebServices, "
        "if missing defaults to only 1000 records per resource.",
        action="store_true",
    )
    parser.add_argument(
        "--generate_mappings",
        dest="generate_mappings",
        help="Generate elastic search mapping skeleton files without migrating",
        action="store_true",
    )
    parser.add_argument("--host",
                        dest="es_host",
                        help="Elastic Search Hostname or IP address.",
                        default="localhost")
    parser.add_argument("--user",
                        dest="es_user",
                        help="Elastic Search username.",
                        default=None)
    parser.add_argument("--password",
                        dest="es_password",
                        help="Elastic Search username password.",
                        default=None)
    parser.add_argument("--port",
                        dest="es_port",
                        help="Elastic Search port.",
                        default=9200)
    parser.add_argument(
        "--resource",
        dest="ws_resource",
        help="Comma-separated list of Web Services resources to iterate; "
        "if not specified, all resources are iterated.",
        default=None)
    parser.add_argument(
        "--production",
        dest="ws_prod_env",
        help="If included, use the production WS environment; "
        "otherwise the dev environment is used.",
        action="store_true",
    )
    parser.add_argument(
        "--create_alias",
        dest="create_alias",
        help="If included, create aliases for the configured resources and exit.",
        action="store_true",
    )
    args = parser.parse_args()

    # args.ws_prod_env is already a boolean thanks to action="store_true"
    resources_description.set_ws_env(args.ws_prod_env)
    print('CHEMBL WS URL: {0}'.format(resources_description.WS_URL_TO_USE),
          file=sys.stderr)
    sys.stderr.flush()

    if args.create_alias:
        resources_description.ResourceDescription.create_all_aliases(
            args.es_host, args.es_port, args.es_user, args.es_password)
        sys.exit(0)

    es_util.setup_connection(args.es_host, args.es_port, args.es_user,
                             args.es_password)

    if not es_util.ping():
        print("ERROR: Can't ping the elastic search server.", file=sys.stderr)
        sys.exit(1)

    selected_resources = None
    if args.ws_resource:
        selected_resources = args.ws_resource.split(',')

    if args.generate_mappings:
        migration_common.generate_mappings_for_resources(selected_resources)
        return

    migration_common.DELETE_AND_CREATE_INDEXES = args.delete_indexes
    if migration_common.DELETE_AND_CREATE_INDEXES:
        if not query_yes_no(
                "This procedure will delete and create all indexes again in the server.\n"
                "Do you want to proceed?",
                default="no"):
            return

    es_util.bulk_submitter.start()

    on_start = migration_common.create_res_idx
    on_doc = migration_common.write_res_doc2es_first_id
    on_done = None
    iterate_all = args.migrate_all

    iterator_thread_pool = SharedThreadPool(max_workers=10)

    resources_to_run = resources_description.ALL_WS_RESOURCES
    if selected_resources:
        resources_to_run = []
        for resource_i_str in selected_resources:
            resource_i = resources_description.RESOURCES_BY_RES_NAME.get(
                resource_i_str, None)
            if resource_i is None:
                print('Unknown resource {0}'.format(resource_i_str),
                      file=sys.stderr)
                sys.exit(1)
            resources_to_run.append(resource_i)
    iterators = []
    for resource_i in resources_to_run:
        res_it_i = ResourceIterator(resource_i,
                                    iterator_thread_pool,
                                    on_start=on_start,
                                    on_doc=on_doc,
                                    on_done=on_done,
                                    iterate_all=iterate_all,
                                    redo_failed_chunks=True)
        res_it_i.start()
        iterators.append(res_it_i)
    for res_it_i in iterators:
        res_it_i.join()

    es_util.bulk_submitter.join()
    for res_i in resources_description.ALL_WS_RESOURCES_NAMES:
        if migration_common.MIG_TOTAL[res_i] > 0:
            migration_common.MIG_LOG.info(
                "{0} migrated {1} out of {2} tried out of {3} total".format(
                    res_i,
                    es_util.get_idx_count(
                        migration_common.get_index_name(res_i)),
                    migration_common.MIG_TRIED_COUNT[res_i],
                    migration_common.MIG_TOTAL[res_i]))
    glados.es.ws2es.progress_bar_handler.write_after_progress_bars()

    total_time = time.time() - t_ini
    sec = timedelta(seconds=total_time)
    d = datetime(1, 1, 1) + sec

    migration_common.MIG_LOG.info(
        "Finished in: {0} Day(s), {1} Hour(s), {2} Minute(s) and {3} Second(s)"
        .format(d.day - 1, d.hour, d.minute, d.second))
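If this main() lives in a module that is executed directly, the usual entry-point guard applies; the script name in the example invocation below is an assumption, since it is not part of the excerpt:

if __name__ == '__main__':
    # Example invocation (script name is hypothetical):
    #   python ws2es_migration.py --host localhost -A --delete_indexes
    main()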