Example #1
    def __init__(self,
                 resource: resources_description.ResourceDescription,
                 thread_pool: SharedThreadPool,
                 progress_records_base_path=os.getcwd(),
                 on_start=None,
                 on_doc=None,
                 on_done=None,
                 iterate_all=False,
                 redo_failed_chunks=False):
        super().__init__()
        self.resource = resource
        base_path = os.path.join(progress_records_base_path,
                                 'ws-iterator-progress')
        self.progress_records_base_path = os.path.join(base_path,
                                                       self.resource.res_name)
        os.makedirs(self.progress_records_base_path, exist_ok=True)
        self.fail_file = os.path.join(
            base_path, self.resource.res_name + '-failed-chunks')
        self.count_file = os.path.join(base_path,
                                       self.resource.res_name + '-count')
        ws_endpoint = resources_description.WS_URL_TO_USE
        parsed_url = urlparse(ws_endpoint)

        self.domain = parsed_url.scheme + '://' + parsed_url.netloc
        self.base_url_path = parsed_url.path + '/{0}.json?offset={1}&limit=' + str(
            ResourceIterator.LIMIT)

        self.stop = False
        signal_handler.add_termination_handler(self.stop_iterator)
        self.scheduled_tasks = []
        self.total_count = 0
        self.iterated_count = 0
        self.count_future = None
        self.progress_bar = None
        self.thread_pool = thread_pool
        self.scheduled_tasks_count = 0
        self.sync_lock = Lock()
        self.iterate_all = iterate_all
        self.redo_failed_chunks = redo_failed_chunks
        # Validate the optional callbacks before storing them.
        for cb_name, cb in (('on_start', on_start), ('on_doc', on_doc),
                            ('on_done', on_done)):
            if cb is not None and not callable(cb):
                raise TypeError('{0} parameter is not callable'.format(cb_name))
        self.on_start = on_start
        self.on_doc = on_doc
        self.on_done = on_done
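Every example in this collection registers cleanup callbacks through signal_handler.add_termination_handler, but the handler module itself is never shown. A minimal sketch of what it presumably does, assuming it simply fans SIGINT/SIGTERM out to every registered callback (the names _termination_handlers and _dispatch are hypothetical):

import signal

_termination_handlers = []  # hypothetical registry of cleanup callbacks


def add_termination_handler(handler):
    # Register a callback with the usual (signum, frame) handler signature.
    _termination_handlers.append(handler)


def _dispatch(signum, frame):
    # Fan the termination signal out to every registered callback.
    for handler in _termination_handlers:
        handler(signum, frame)


signal.signal(signal.SIGINT, _dispatch)
signal.signal(signal.SIGTERM, _dispatch)

This matches how the callbacks are used throughout: the stop_* module functions shown below all accept the standard (signal, frame) pair.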
Example #2
    def __init__(self, max_workers=10, label='SharedThreadPool'):
        super().__init__()
        self.max_workers = max_workers
        # Cap the backlog so producers cannot queue unbounded work.
        self.max_queue_size = max_workers * 1000
        self.current_tasks_queue = []
        self.thread_pool = ThreadPoolExecutor(max_workers=max_workers)
        self.stop_thread_pool = False
        self.label = label
        signal_handler.add_termination_handler(self.stop_pool)
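SharedThreadPool caps its backlog at max_workers * 1000 rather than letting producers submit without bound. Its own submit API is not shown here; the following is a rough, hedged equivalent of that pattern on top of the standard library, where submit_with_backpressure is an illustrative name rather than the pool's real method:

import time
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=10)
MAX_QUEUE_SIZE = 10 * 1000  # mirrors max_workers * 1000 above
pending = []


def submit_with_backpressure(fn, *args):
    # Drop completed futures, then block until the backlog is below the cap.
    pending[:] = [f for f in pending if not f.done()]
    while len(pending) >= MAX_QUEUE_SIZE:
        time.sleep(0.1)
        pending[:] = [f for f in pending if not f.done()]
    pending.append(pool.submit(fn, *args))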
Example #3
    def run(self):
        signal_handler.add_termination_handler(self.stop_submitter)
        self.submission_pb = progress_bar_handler.get_new_progressbar(
            'ES-bulk-submitter', 1)
        self.submission_pool.start()
        cur_low_counts = 0
        while not self.stop_submission:
            max_count = self.get_max_queue_count()
            # Flush when any queue holds at least 5 full bulk requests, or
            # when a smaller backlog has waited more than 10 polling ticks.
            if max_count >= self.max_docs_per_request * 5 or (
                    max_count > 0 and cur_low_counts > 10):
                cur_low_counts = 0
                self.check_and_submit_queues()
            else:
                if max_count > 0:
                    cur_low_counts += 1
                time.sleep(1)
                sys.stderr.flush()
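The two-part condition trades throughput against latency: a queue is flushed eagerly once it accumulates five full bulk requests' worth of documents, while a smaller residue is still flushed after surviving more than ten one-second polls, so a trailing partial batch is never held back indefinitely.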
Example #4
def get_new_progressbar(name, max_val=1) -> ProgressBar:
    global PROGRESS_BAR_IDX, PROGRESS_BAR_REQUESTED
    if PROGRESS_BAR_IDX == 0:
        # First bar requested: hook cleanup handlers and clear the screen once.
        signal_handler.add_termination_handler(on_exit)
        atexit.register(on_exit, None, None)
        print(term.clear)
    PROGRESS_BAR_REQUESTED = True
    # Each bar renders on its own terminal row, stacked in creation order.
    writer = Writer((0, PROGRESS_BAR_IDX))
    p_bar = ProgressBar(
        widgets=[
            name + ': ',
            Counter(format='%(value)d out of %(max_value)d'), ' ',
            Percentage(), ' ',
            Bar(), ' ',
            ETA(),
        ],
        fd=writer,
        max_value=max_val,
    ).start(max_value=max_val)
    PROGRESS_BAR_IDX += 1

    return p_bar
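A hedged usage sketch (handle_doc and docs are placeholders, not names from this codebase). Because each bar claims its own terminal row via Writer((0, PROGRESS_BAR_IDX)), several bars can run concurrently without overwriting each other:

def handle_doc(doc):
    pass  # placeholder for real per-document work

docs = range(500)
p_bar = get_new_progressbar('doc-loader', len(docs))
for i, doc in enumerate(docs):
    handle_doc(doc)
    p_bar.update(i + 1)
p_bar.finish()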
Example #5
CACHING_PB_COUNT = 0
RDKIT_CACHE = {}
INDIGO_CACHE = {}
SVG_FAILURES = {}
STOP_SCAN = False
BASE_CACHE_PATH = os.path.join(os.getcwd(), 'svg-files-cache')
os.makedirs(BASE_CACHE_PATH, exist_ok=True)


def stop_scan(signal, frame):
    global STOP_SCAN
    STOP_SCAN = True
    es_util.stop_scan(signal, frame)


signal_handler.add_termination_handler(stop_scan)


@synchronized
def register_fail(molecule_chembl_id, framework):
    # Mutating the module-level dict needs no global declaration.
    SVG_FAILURES.setdefault(molecule_chembl_id, []).append(framework)


def get_svg_by_chembl_id(molecule_chembl_id, indigo=False):
    global BASE_WS_URL, CACHING_PB, CACHING_PB_COUNT, RDKIT_CACHE, INDIGO_CACHE, STOP_SCAN, BASE_CACHE_PATH, \
        SVG_FAILURES
    if STOP_SCAN:
        return None

Example #6
def main():
    t_ini = time.time()
    parser = argparse.ArgumentParser(
        description="Denormalize ChEMBL data existing in Elastic Search")
    parser.add_argument("--host",
                        dest="es_host",
                        help="Elastic Search Hostname or IP address.",
                        default="localhost")
    parser.add_argument("--user",
                        dest="es_user",
                        help="Elastic Search username.",
                        default=None)
    parser.add_argument("--password",
                        dest="es_password",
                        help="Elastic Search username password.",
                        default=None)
    parser.add_argument("--port",
                        dest="es_port",
                        help="Elastic Search port.",
                        default=9200)
    parser.add_argument(
        "--unichem",
        dest="denormalize_unichem",
        help="If included will denormalize the unichem related data.",
        action="store_true",
    )
    parser.add_argument(
        "--activity",
        dest="denormalize_activity",
        help="If included will denormalize the configured activity related data.",
        action="store_true",
    )
    parser.add_argument(
        "--compound_hierarchy",
        dest="denormalize_compound_hierarchy",
        help="If included will denormalize the Compound Hierarchy data.",
        action="store_true",
    )
    parser.add_argument(
        "--mechanism_and_drug_indication",
        dest="denormalize_mechanism_and_drug_indication",
        help="If included will denormalize the Mechanism and Drug Indication data.",
        action="store_true",
    )
    args = parser.parse_args()

    es_util.setup_connection(args.es_host, args.es_port, args.es_user,
                             args.es_password)
    es_util.bulk_submitter.start()

    signal_handler.add_termination_handler(es_util.stop_scan)

    dn_type = None
    if args.denormalize_compound_hierarchy:
        denormalize_compound_hierarchy()
        dn_type = 'COMPOUND-HIERARCHY'
    elif args.denormalize_activity:
        denormalize_activity()
        dn_type = 'ACTIVITY'
    elif args.denormalize_unichem:
        denormalize_unichem()
        dn_type = 'UNICHEM'
    elif args.denormalize_mechanism_and_drug_indication:
        denormalize_mechanism_and_drug_indication()
        dn_type = 'MECHANISMS-AND-DRUG-INDICATION'
    else:
        denormalize_all_but_activity()
        dn_type = 'ALL-NO-ACTIVITY'
    end_msg = 'DENORMALIZATION FOR "{}" FINISHED'.format(dn_type)

    es_util.bulk_submitter.join()
    glados.es.ws2es.progress_bar_handler.write_after_progress_bars()

    total_time = time.time() - t_ini
    sec = timedelta(seconds=total_time)
    d = datetime(1, 1, 1) + sec

    print(end_msg, file=sys.stderr)
    print(
        "Finished in: {0} Day(s), {1} Hour(s), {2} Minute(s) and {3} Second(s)"
        .format(d.day - 1, d.hour, d.minute, d.second),
        file=sys.stderr)
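The datetime(1, 1, 1) + timedelta idiom above borrows the calendar to split a duration into days, hours, minutes and seconds (hence the d.day - 1 correction), and silently misreports runs of a month or more. A more direct equivalent with divmod, for comparison:

total_seconds = int(total_time)
days, rem = divmod(total_seconds, 86400)
hours, rem = divmod(rem, 3600)
minutes, seconds = divmod(rem, 60)
print("Finished in: {0} Day(s), {1} Hour(s), {2} Minute(s) and {3} Second(s)"
      .format(days, hours, minutes, seconds), file=sys.stderr)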
Example #7
                                               resource_desc.res_name)
        doc_ids = list(dn_dict.keys())
        p_bar = progress_bar_handler.get_new_progressbar(
            progressbar_name, len(dn_dict))
        entity_dn_count = 0
        for doc_id_i in doc_ids:
            if DenormalizationHandler.STOP:
                return

            update_doc, update_size = get_update_script_and_size(
                doc_id_i, dn_dict[doc_id_i])
            # Index (overwrite) the document instead of updating it, if requested
            if do_index:
                es_util.index_doc_bulk(resource_desc.idx_name, doc_id_i,
                                       update_doc)
            else:
                es_util.update_doc_bulk(resource_desc.idx_name,
                                        doc_id_i,
                                        doc=update_doc)

            entity_dn_count += 1
            p_bar.update(entity_dn_count)

        es_util.bulk_submitter.finish_current_queues()

        p_bar.finish()


signal_handler.add_termination_handler(
    DenormalizationHandler.stop_denormalization)
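The do_index branch matters because the two bulk actions have different semantics in Elasticsearch: an index action replaces the stored document wholesale, while an update action merges the partial doc into whatever is already indexed. Conceptually, the bulk request bodies behind the two helpers differ like this:

# bulk 'index' action: the source replaces the whole document
#   { "index":  { "_index": "<idx>", "_id": "<id>" } }
#   { ...full document... }
# bulk 'update' action: the partial doc is merged into the existing document
#   { "update": { "_index": "<idx>", "_id": "<id>" } }
#   { "doc": { ...fields to merge... } }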
Example #8
import glados.es.ws2es.signal_handler as signal_handler
import requests
import sys
import zlib
# Assumed import: DefaultMappings is referenced in the mapping below and
# appears to come from the project's es_util module.
from glados.es.ws2es.es_util import DefaultMappings

BASE_EBI_URL = 'https://www.ebi.ac.uk'
UNICHEM_FTP_URL = 'http://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/wholeSourceMapping/src_id1/src1src{0}.txt.gz'
STOP_LOAD = False


def stop_unichem(signal, frame):
    global STOP_LOAD
    STOP_LOAD = True


signal_handler.add_termination_handler(stop_unichem)

UNICHEM_MAPPING = {
    'properties': {
        '_metadata': {
            'properties': {
                'unichem': {
                    'properties': {
                        'id': DefaultMappings.ID,
                        'src_name': DefaultMappings.KEYWORD,
                        'link': DefaultMappings.NO_INDEX_KEYWORD,
                        'src_url': DefaultMappings.NO_INDEX_KEYWORD,
                    }
                }
            }
        }
    }
}

Example #9
def main():
    t_ini = time.time()
    parser = argparse.ArgumentParser(
        description="Replicate ChEMBL data existing in Elastic Search from origin to destination."
    )
    parser.add_argument("--delete_indexes",
                        dest="delete_indexes",
                        help="Delete indexes if they exist already in the elastic cluster.",
                        action="store_true",)
    parser.add_argument("--skip-update-mappings",
                        dest="skip_update_mappings",
                        help="Does not attempt to update the mappings in the destination cluster.",
                        action="store_true",)
    parser.add_argument("--monitoring",
                        dest="monitoring",
                        help="Replicate the monitoring indexes.",
                        action="store_true",)
    parser.add_argument("--resource",
                        dest="es_resource",
                        help="Resource to iterate, if not specified will iterate all the resources.",
                        default=None)
    parser.add_argument("--es-major-version-origin",
                        dest="es_major_version_origin",
                        help="Elastic Search class to use for origin cluster.",
                        default=CURRENT_ES_VERSION)
    parser.add_argument("--host-origin",
                        dest="es_host_origin",
                        help="Elastic Search Hostname or IP address for origin cluster.",
                        default="localhost")
    parser.add_argument("--user-origin",
                        dest="es_user_origin",
                        help="Elastic Search username for origin cluster.",
                        default=None)
    parser.add_argument("--password-origin",
                        dest="es_password_origin",
                        help="Elastic Search username password for origin cluster.",
                        default=None)
    parser.add_argument("--port-origin",
                        dest="es_port_origin",
                        help="Elastic Search port for origin cluster.",
                        default=9200)
    parser.add_argument("--host-destination",
                        dest="es_host_destination",
                        help="Elastic Search Hostname or IP address for destination cluster.",
                        default="localhost")
    parser.add_argument("--user-destination",
                        dest="es_user_destination",
                        help="Elastic Search username for destination cluster.",
                        default=None)
    parser.add_argument("--password-destination",
                        dest="es_password_destination",
                        help="Elastic Search username password for destination cluster.",
                        default=None)
    parser.add_argument("--port-destination",
                        dest="es_port_destination",
                        help="Elastic Search port for destination cluster.",
                        default=9200)
    args = parser.parse_args()

    try:
        args.es_major_version_origin = int(args.es_major_version_origin)
        assert args.es_major_version_origin <= CURRENT_ES_VERSION
    except (ValueError, AssertionError):
        traceback.print_exc()
        print(
            'ERROR: Major version for elastic "{0}" is not valid, it must be an integer no greater than {1}.'
            .format(args.es_major_version_origin, CURRENT_ES_VERSION),
            file=sys.stderr
        )
        sys.exit(1)

    print('ORIGIN:')
    print(args.es_host_origin, args.es_port_origin, args.es_user_origin)
    print('DESTINATION:')
    print(args.es_host_destination, args.es_port_destination, args.es_user_destination)

    selected_resources = None
    if args.es_resource:
        selected_resources = args.es_resource.split(',')
    resources_to_run = resources_description.ALL_MONITORING_RESOURCES if args.monitoring else \
        resources_description.ALL_RELEASE_RESOURCES
    if selected_resources:
        resources_to_run = []
        for resource_i_str in selected_resources:
            resource_i = resources_description.RESOURCES_BY_RES_NAME.get(resource_i_str, None)
            if resource_i is None:
                print('Unknown resource {0}'.format(resource_i_str), file=sys.stderr)
                sys.exit(1)
            resources_to_run.append(resource_i)

    if args.es_host_origin == args.es_host_destination and args.es_port_origin == args.es_port_destination:
        print('ERROR: Origin and destination clusters are the same.', file=sys.stderr)
        return

    if args.delete_indexes:
        if not query_yes_no("This procedure will delete and create all indexes again in the destination server.\n"
                            "Do you want to proceed?", default="no"):
            return

    es_util_origin = ESUtil(es_major_version=args.es_major_version_origin)
    es_util_origin.setup_connection(
        args.es_host_origin, args.es_port_origin, args.es_user_origin, args.es_password_origin
    )
    es_util_destination = ESUtil()
    es_util_destination.setup_connection(
        args.es_host_destination, args.es_port_destination, args.es_user_destination, args.es_password_destination
    )

    ping_failed = False

    if not es_util_origin.ping():
        print('ERROR: Ping failed to origin cluster.', file=sys.stderr)
        ping_failed = True

    if not es_util_destination.ping():
        print('ERROR: Ping failed to destination cluster.', file=sys.stderr)
        ping_failed = True

    if ping_failed:
        return

    es_util_destination.bulk_submitter.start()

    signal_handler.add_termination_handler(es_util_origin.stop_scan)
    signal_handler.add_termination_handler(es_util_destination.stop_scan)
    signal_handler.add_termination_handler(es_util_destination.bulk_submitter.stop_submitter)

    replicate_clusters(
        es_util_origin, es_util_destination, resources_to_run=resources_to_run, delete_dest_idx=args.delete_indexes,
        skip_update_mappings=args.skip_update_mappings
    )

    es_util_destination.bulk_submitter.finish_current_queues()
    es_util_destination.bulk_submitter.join()
    pbh.write_after_progress_bars()

    end_msg = 'REPLICATION FINISHED'

    total_time = time.time() - t_ini
    sec = timedelta(seconds=total_time)
    d = datetime(1, 1, 1) + sec

    print(end_msg, file=sys.stderr)
    print(
        "Finished in: {0} Day(s), {1} Hour(s), {2} Minute(s) and {3} Second(s)"
        .format(d.day-1, d.hour, d.minute, d.second),
        file=sys.stderr
    )
    check_origin_vs_destination_counts(es_util_origin, es_util_destination, resources_to_run=resources_to_run)
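check_origin_vs_destination_counts is not shown; presumably it compares per-index document counts once the bulk queues have drained. A rough sketch under that assumption, where get_doc_count is a hypothetical helper standing in for whatever count API ESUtil actually exposes:

def check_counts(es_origin, es_destination, resources_to_run):
    # Compare per-resource document counts between the two clusters.
    for resource in resources_to_run:
        origin_count = es_origin.get_doc_count(resource.idx_name)            # hypothetical helper
        destination_count = es_destination.get_doc_count(resource.idx_name)  # hypothetical helper
        if origin_count != destination_count:
            print('COUNT MISMATCH {0}: origin={1} destination={2}'
                  .format(resource.idx_name, origin_count, destination_count),
                  file=sys.stderr)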