Example #1
    def complete_cell_n_tissue(self, assay_2_compound: dict, ac_dh_assay_dict: dict):
        # Aggregate the compounds and activity counts related to each cell line.
        pb = get_new_progressbar('cell-completion', len(self.cell_2_assay))
        for i, cell_id in enumerate(self.cell_2_assay):
            if cell_id not in self.cell_dict:
                self.cell_dict[cell_id] = {}
            self.cell_dict[cell_id]['related_activities'] = {
                'count': 0,
                'all_chembl_ids': set()
            }
            self.cell_dict[cell_id]['related_compounds'] = {
                'count': 0,
                'all_chembl_ids': set()
            }
            for assay in self.cell_2_assay.get(cell_id, []):
                compounds = assay_2_compound.get(assay, {})
                for compound_i in compounds:
                    if compound_i not in self.cell_dict[cell_id]['related_compounds']['all_chembl_ids']:
                        self.cell_dict[cell_id]['related_compounds']['count'] += 1
                        self.cell_dict[cell_id]['related_compounds']['all_chembl_ids'].add(compound_i)
                if ac_dh_assay_dict.get(assay, None):
                    self.cell_dict[cell_id]['related_activities']['count'] += ac_dh_assay_dict[assay].get(
                        'related_activities', {}
                    ).get('count', 0)
            pb.update(i + 1)
        pb.finish()

        # Same aggregation, keyed by tissue rather than cell line.
        pb = get_new_progressbar('tissue-completion', len(self.tissue_2_assay))
        for i, tissue_id in enumerate(self.tissue_2_assay):
            if tissue_id not in self.tissue_dict:
                self.tissue_dict[tissue_id] = {}
            self.tissue_dict[tissue_id]['related_activities'] = {
                'count': 0,
                'all_chembl_ids': set()
            }
            self.tissue_dict[tissue_id]['related_compounds'] = {
                'count': 0,
                'all_chembl_ids': set()
            }
            for assay in self.tissue_2_assay.get(tissue_id, []):
                compounds = assay_2_compound.get(assay, {})
                for compound_i in compounds:
                    if compound_i not in self.tissue_dict[tissue_id]['related_compounds']['all_chembl_ids']:
                        self.tissue_dict[tissue_id]['related_compounds']['count'] += 1
                        self.tissue_dict[tissue_id]['related_compounds']['all_chembl_ids'].add(compound_i)
                if ac_dh_assay_dict.get(assay, None):
                    self.tissue_dict[tissue_id]['related_activities']['count'] += ac_dh_assay_dict[assay].get(
                        'related_activities', {}
                    ).get('count', 0)
            pb.update(i + 1)
        pb.finish()
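Both loops rely on the same deduplicated-counting pattern: a set remembers which ChEMBL IDs have already been seen, and the counter is bumped only on first sight. A minimal, self-contained sketch of that pattern (names here are illustrative, not from the source):

    def add_related(bucket, chembl_id):
        # Count each ChEMBL ID once, no matter how many assays mention it.
        if chembl_id not in bucket['all_chembl_ids']:
            bucket['all_chembl_ids'].add(chembl_id)
            bucket['count'] += 1

    bucket = {'count': 0, 'all_chembl_ids': set()}
    for cid in ['CHEMBL25', 'CHEMBL25', 'CHEMBL112']:
        add_related(bucket, cid)
    assert bucket['count'] == 2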
Example #2
    def iterate_resource(self):
        self.count_future = self.thread_pool.submit(self._get_resource_count)
        self.total_count = self.count_future.result()
        self.iterated_count = 0
        chunk_size = ResourceIterator.LIMIT * 3
        self.progress_bar = progress_bar_handler.get_new_progressbar(
            self.resource.res_name, self.total_count)
        if self.on_start:
            try:
                self.on_start(self.resource.res_name, self.total_count)
            except Exception:
                print('Exception on resource "{0}" start.\n'.format(
                    self.resource.res_name),
                      file=sys.stderr)
                print('Exception caught: \n{0}\n'.format(
                    traceback.format_exc()),
                      file=sys.stderr)
                print('ERROR: on_resource_start for {0} failed, exiting now!'.
                      format(self.resource.res_name),
                      file=sys.stderr)
                sys.stderr.flush()
                return
        stop_at = self.total_count if self.iterate_all else (
            ResourceIterator.LIMIT * 10)
        for offset_i in range(0, stop_at, chunk_size):
            if self.stop:
                return
            task = self._submit_iterate_resource_chunk_to_queue(
                offset_i, offset_i + chunk_size)
            if task:
                self.scheduled_tasks.append(task)
        self.check_progress_bar(wait_to_finish=True)

        if not self.stop and self.on_done:
            self.on_done(self.resource.res_name)
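iterate_resource treats on_start and on_done as optional callbacks. Their signatures can be inferred from the call sites above; the bodies below are only a plausible sketch:

    import sys

    def on_resource_start(res_name, total_count):
        # Called once before chunked iteration begins.
        print('Starting {0}: {1} records'.format(res_name, total_count),
              file=sys.stderr)

    def on_resource_done(res_name):
        # Called only when iteration completed without a stop request.
        print('Finished {0}'.format(res_name), file=sys.stderr)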
Example #3
def pre_cache_svg_files():
    global CACHING_PB, CACHING_PB_COUNT, WS_REQUEST_POOL, RDKIT_CACHE, INDIGO_CACHE, SVG_FAILURES, BASE_CACHE_PATH
    CACHING_PB = progress_bar_handler.get_new_progressbar(
        'molecule_svg_caching',
        max_val=es_util.get_idx_count(MOLECULE.idx_name))
    CACHING_PB_COUNT = 0

    def __handle_molecule_doc(doc, *args, **kwargs):
        if not STOP_SCAN:
            WS_REQUEST_POOL.submit(get_svg_by_chembl_id,
                                   doc['molecule_chembl_id'])
            WS_REQUEST_POOL.submit(get_svg_by_chembl_id,
                                   doc['molecule_chembl_id'], True)

    es_util.scan_index(MOLECULE.idx_name,
                       on_doc=__handle_molecule_doc,
                       query={
                           '_source': 'molecule_chembl_id',
                           'query': {
                               'query_string': {
                                   'query': '_exists_:molecule_structures'
                               }
                           }
                       })
    WS_REQUEST_POOL.join()
    CACHING_PB.finish()
    print('RDKIT SVG data has been cached for {0} CHEMBL IDS'.format(
        len(RDKIT_CACHE)),
          file=sys.stderr)
    print('INDIGO SVG data has been cached for {0} CHEMBL IDS'.format(
        len(INDIGO_CACHE)),
          file=sys.stderr)

    indigo_fails = 0
    rdkit_fails = 0
    both_fails = 0

    for key, value in SVG_FAILURES.items():
        if len(value) > 1:
            SVG_FAILURES[key] = 'BOTH'
            both_fails += 1
        else:
            if value[0] == 'INDIGO':
                indigo_fails += 1
            else:
                rdkit_fails += 1
            SVG_FAILURES[key] = value[0]

    failures_file_path = os.path.join(BASE_CACHE_PATH, 'svg_failures.json')
    try:
        with open(failures_file_path, 'w', encoding='utf-8') as failures_file:
            json.dump(SVG_FAILURES, failures_file)
    except Exception:
        traceback.print_exc()
        print('UNABLE TO WRITE FILE AT {0}'.format(failures_file_path),
              file=sys.stderr)

    print('INDIGO FAIL COUNT: {0}'.format(indigo_fails), file=sys.stderr)
    print('RDKIT FAIL COUNT: {0}'.format(rdkit_fails), file=sys.stderr)
    print('BOTH FAIL COUNT: {0}'.format(both_fails), file=sys.stderr)
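The bookkeeping at the end assumes SVG_FAILURES maps a ChEMBL ID to the list of toolkits ('RDKIT', 'INDIGO') that failed to render it, collapsing two-toolkit failures to 'BOTH'. The same collapsing step, rerun in isolation:

    failures = {
        'CHEMBL1': ['INDIGO'],
        'CHEMBL2': ['RDKIT'],
        'CHEMBL3': ['INDIGO', 'RDKIT'],
    }
    counts = {'INDIGO': 0, 'RDKIT': 0, 'BOTH': 0}
    for key, value in failures.items():
        label = 'BOTH' if len(value) > 1 else value[0]
        failures[key] = label  # flatten the list to a single label
        counts[label] += 1
    assert counts == {'INDIGO': 1, 'RDKIT': 1, 'BOTH': 1}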
Example #4
 def scan_index(self, es_index, on_doc=None, query=None):
     if self.es_conn is None:
         print('FATAL ERROR: there is no Elasticsearch connection defined.',
               file=sys.stderr)
         sys.exit(1)
     if query is None:
         query = {}
     query['track_total_hits'] = True
     search_res = self.es_conn.search(index=es_index, body=query)
     total_docs = search_res['hits']['total']['value']
     update_every = min(math.ceil(total_docs * 0.001), 1000)
     scanner = helpers.scan(self.es_conn,
                            index=es_index,
                            scroll='10m',
                            query=query,
                            size=1000)
     count = 0
     p_bar = progress_bar_handler.get_new_progressbar(
         '{0}_es-index-scan'.format(es_index), total_docs)
     for doc_n in scanner:
         if callable(on_doc):
             should_stop = on_doc(doc_n['_source'], doc_n['_id'],
                                  total_docs, count, count == 0,
                                  count == total_docs - 1)
             if should_stop or self.stop_scan:
                 return
         count += 1
         if count % update_every == 0:
             p_bar.update(count)
     p_bar.finish()
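A callback passed as on_doc receives the document source, its id, and scan-progress metadata, and can return a truthy value to abort the scan (as the should_stop check above shows). A compatible sketch, with argument names of my choosing:

    def print_first_doc(doc, doc_id, total_docs, index, first, last):
        if first:
            print('scanning {0} docs, first id: {1}'.format(total_docs, doc_id))
        return False  # returning True here would stop the scan early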
Example #5
 def complete_compound(self):
     pb = get_new_progressbar('compound-completion',
                              len(self.compound_2_assay))
     for i, molecule_chembl_id in enumerate(self.compound_2_assay):
         if molecule_chembl_id not in self.compound_dict:
             self.compound_dict[molecule_chembl_id] = {}
         self.compound_dict[molecule_chembl_id]['related_cell_lines'] = {
             'count': 0,
             'all_chembl_ids': set()
         }
         self.compound_dict[molecule_chembl_id]['related_tissues'] = {
             'count': 0,
             'all_chembl_ids': set()
         }
         for assay in self.compound_2_assay.get(molecule_chembl_id, []):
             cell_n_tissue = self.assay_dh.assay_2_cell_n_tissue.get(
                 assay, {})
             cell_id = cell_n_tissue.get('cell_chembl_id', None)
             tissue_id = cell_n_tissue.get('tissue_chembl_id', None)
             if cell_id and \
                     cell_id not in self.compound_dict[molecule_chembl_id]['related_cell_lines']['all_chembl_ids']:
                 self.compound_dict[molecule_chembl_id][
                     'related_cell_lines']['count'] += 1
                 self.compound_dict[molecule_chembl_id][
                     'related_cell_lines']['all_chembl_ids'].add(cell_id)
             if tissue_id and \
                     tissue_id not in self.compound_dict[molecule_chembl_id]['related_tissues']['all_chembl_ids']:
                 self.compound_dict[molecule_chembl_id]['related_tissues'][
                     'count'] += 1
                 self.compound_dict[molecule_chembl_id]['related_tissues'][
                     'all_chembl_ids'].add(tissue_id)
         pb.update(i)
     pb.finish()

 def get_all_dn_dicts(self):
     total_dn_dict = SummableDict()
     pb = get_new_progressbar('built-dn-hierarchy-dict', len(self.children))
     current = 0
     for node in self.children.values():
         dn_dict_i, _shared_family_data, _node_data = \
             node.get_denormalization_dict()
         for chembl_id, dn_data in dn_dict_i.items():
             total_dn_dict[chembl_id] = dn_data
         current += 1
         pb.update(current)
     pb.finish()
     return total_dn_dict
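get_all_dn_dicts flattens the per-node dicts into a single mapping keyed by ChEMBL ID, with plain last-writer-wins assignment. The merge in isolation:

    total_dn_dict = {}
    per_node_dicts = [{'CHEMBL25': {'count': 1}}, {'CHEMBL112': {'count': 3}}]
    for dn_dict_i in per_node_dicts:
        for chembl_id, dn_data in dn_dict_i.items():
            total_dn_dict[chembl_id] = dn_data  # later nodes win on collisions
    assert set(total_dn_dict) == {'CHEMBL25', 'CHEMBL112'}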
Example #7
 def run(self):
     signal_handler.add_termination_handler(self.stop_submitter)
     self.submission_pb = progress_bar_handler.get_new_progressbar(
         'ES-bulk-submitter', 1)
     self.submission_pool.start()
     cur_low_counts = 0
     while not self.stop_submission:
         max_count = self.get_max_queue_count()
         # Submit when a queue holds 5x the per-request cap, or when a
         # non-empty queue has idled for more than 10 one-second polls.
         if max_count >= self.max_docs_per_request * 5 or (
                 max_count > 0 and cur_low_counts > 10):
             cur_low_counts = 0
             self.check_and_submit_queues()
         else:
             if max_count > 0:
                 cur_low_counts += 1
             time.sleep(1)
             sys.stderr.flush()
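The flush condition reduces to a small predicate: submit when any queue holds at least five requests' worth of documents, or when a non-empty queue has idled through more than ten polls. Extracted as a standalone function (the cap value is illustrative):

    def should_flush(max_count, low_counts, cap=500):
        # cap stands in for max_docs_per_request above.
        return max_count >= cap * 5 or (max_count > 0 and low_counts > 10)

    assert should_flush(2500, 0)    # full enough to submit immediately
    assert should_flush(10, 11)     # small but stale, submit anyway
    assert not should_flush(0, 99)  # nothing queued, keep waiting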
Example #8
def load_all_chembl_unichem_data():
    global STOP_LOAD
    unichem_ds = load_unichem_ds_desc()
    unichem_data_by_chembl_id = {}
    pb = progress_bar_handler.get_new_progressbar('reading-unichem',
                                                  len(unichem_ds) - 1)

    for i, src_id_i in enumerate(sorted(unichem_ds.keys())):
        if STOP_LOAD:
            return
        if src_id_i == 1 or src_id_i == '1':
            continue  # source 1 is ChEMBL itself; nothing to cross-reference
        req_i = requests.get(url=UNICHEM_FTP_URL.format(src_id_i),
                             stream=True,
                             verify=False)
        decoder = zlib.decompressobj(16 + zlib.MAX_WBITS)  # gzip-wrapped stream
        last_row_in_last_chunk = None
        for chunk in req_i.iter_content(chunk_size=1024, decode_unicode=False):
            if STOP_LOAD:
                return
            rows_in_chunk = decoder.decompress(chunk).decode("utf-8")
            if last_row_in_last_chunk:
                rows_in_chunk = last_row_in_last_chunk + rows_in_chunk
            save_last = not rows_in_chunk.endswith('\n')
            records = rows_in_chunk.split('\n')
            if save_last:
                last_row_in_last_chunk = records[-1]
                records = records[:-1]
            else:
                last_row_in_last_chunk = None
            collect_unichem_records(src_id_i, records,
                                    unichem_data_by_chembl_id, unichem_ds)
        last_rows = decoder.flush().decode("utf-8")
        if last_row_in_last_chunk:
            last_rows = last_row_in_last_chunk + last_rows
        records = last_rows.split('\n')
        collect_unichem_records(src_id_i, records, unichem_data_by_chembl_id,
                                unichem_ds)
        pb.update(i)
    pb.finish()
    return unichem_data_by_chembl_id
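The chunk loop handles gzip chunks that rarely end on a row boundary: a trailing partial row is carried over and prepended to the next chunk. The same technique, isolated and testable:

    def split_rows(chunks):
        # Yield complete rows from text chunks split at arbitrary points.
        carry = ''
        for chunk in chunks:
            rows = (carry + chunk).split('\n')
            carry = rows.pop()  # '' when the chunk ended exactly on a newline
            yield from rows
        if carry:
            yield carry

    assert list(split_rows(['ab\ncd', 'ef\n', 'gh'])) == ['ab', 'cdef', 'gh']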
Example #9
    def save_denormalization_dict(
            cls,
            resource_desc: resources_description.ResourceDescription,
            dn_dict: dict,
            get_update_script_and_size,
            new_mappings=None,
            do_index=False):
        if new_mappings:
            es_util.update_doc_type_mappings(resource_desc.idx_name,
                                             new_mappings)

        progressbar_name = '{0}-dn-{1}'.format(cls.RESOURCE.res_name,
                                               resource_desc.res_name)
        doc_ids = list(dn_dict.keys())
        p_bar = progress_bar_handler.get_new_progressbar(
            progressbar_name, len(dn_dict))
        entity_dn_count = 0
        for doc_id_i in doc_ids:
            if DenormalizationHandler.STOP:
                return

            update_doc, _update_size = get_update_script_and_size(
                doc_id_i, dn_dict[doc_id_i])
            # Index the full document instead of updating it, when requested
            if do_index:
                es_util.index_doc_bulk(resource_desc.idx_name, doc_id_i,
                                       update_doc)
            else:
                es_util.update_doc_bulk(resource_desc.idx_name,
                                        doc_id_i,
                                        doc=update_doc)

            entity_dn_count += 1
            p_bar.update(entity_dn_count)

        es_util.bulk_submitter.finish_current_queues()

        p_bar.finish()
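The get_update_script_and_size argument is a callable that turns one doc id plus its denormalization data into an update body and an approximate size; other examples in this listing pass DenormalizationHandler.default_update_script_and_size. A hedged stand-in with the inferred signature, not the project's actual implementation:

    import json

    def update_script_and_size(doc_id, dn_data):
        # Hypothetical: return the update body and a rough byte size.
        update_doc = dict(dn_data)
        return update_doc, len(json.dumps(update_doc))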
Example #10
    def do_complete_data(self, doc: dict, total_docs: int, index: int,
                         first: bool, last: bool):
        if first:
            self.complete_data_pb = progress_bar_handler.get_new_progressbar(
                '{0}-data-completion'.format(self.RESOURCE.idx_name),
                total_docs)
            mappings = self.get_custom_mappings_for_complete_data()
            if len(mappings.keys()) > 0:
                self.update_mappings(mappings)
        update_doc = self.get_doc_for_complete_data(doc)
        if update_doc is not None:
            es_util.update_doc_bulk(self.RESOURCE.idx_name,
                                    self.RESOURCE.get_doc_id(doc),
                                    doc=update_doc)

        es_util.bulk_submitter.set_complete_futures(True)

        if last:
            es_util.bulk_submitter.finish_current_queues()
            es_util.bulk_submitter.set_complete_futures(False)
            self.complete_data_pb.finish()
        else:
            self.complete_data_pb.update(index)
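do_complete_data leaves get_doc_for_complete_data to subclasses: it should return the fields to merge into the stored document, or None to skip the update. A sketch with a purely illustrative body (the field names are assumptions):

    def get_doc_for_complete_data(self, doc):
        # Hypothetical completion: tag documents that have a structure.
        if not doc.get('molecule_structures'):
            return None
        return {'_metadata': {'svg_available': True}}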
Example #11
    def save_denormalization_for_new_index(self):
        es_util.delete_idx(self.generated_resource.idx_name)
        es_util.create_idx(self.generated_resource.idx_name,
                           3,
                           1,
                           analysis=DefaultMappings.COMMON_ANALYSIS,
                           mappings=DrugIndicationDenormalizationHandler.
                           get_new_index_mappings())

        dn_dict = {}

        print('{0} GROUPED RECORDS WERE FOUND'.format(
            len(self.drug_inds_by_grouping_id)),
              file=sys.stderr)
        p_bar = progress_bar_handler.get_new_progressbar(
            'drug_inds_by_parent-dn-generation',
            len(self.drug_inds_by_grouping_id))
        i = 0
        for group_drug_inds in self.drug_inds_by_grouping_id.values():
            base_drug_ind = group_drug_inds[0]
            efo_data = {}
            indication_refs = []
            max_phase_for_ind = 0
            for drug_ind_i in group_drug_inds:

                max_phase_for_ind = max(max_phase_for_ind,
                                        drug_ind_i.get('max_phase_for_ind', 0))

                efo_id_i = drug_ind_i.get('efo_id', None)
                if efo_id_i is not None:
                    efo_data[efo_id_i] = drug_ind_i.get('efo_term', None)

                indication_refs += drug_ind_i.get('indication_refs', [])

            parent_chembl_id, mesh_id = self.get_drug_ind_grouping_id_parts(
                base_drug_ind)

            drug_ind_data = SummableDict(
                **DRUG_INDICATION.get_doc_by_id_from_es(
                    base_drug_ind['drugind_id']))
            drug_ind_data -= ['efo_term', 'efo_id']
            drug_ind_data['efo'] = [{
                'id': efo_id,
                'term': term
            } for efo_id, term in efo_data.items()]
            drug_ind_data['max_phase_for_ind'] = max_phase_for_ind
            drug_ind_data['indication_refs'] = indication_refs

            new_drug_ind_doc = {
                'parent_molecule':
                MOLECULE.get_doc_by_id_from_es(parent_chembl_id),
                'drug_indication': drug_ind_data
            }
            doc_id = self.generated_resource.get_doc_id(new_drug_ind_doc)

            dn_dict[doc_id] = new_drug_ind_doc
            i += 1
            p_bar.update(i)
        p_bar.finish()

        self.save_denormalization_dict(
            self.generated_resource,
            dn_dict,
            DenormalizationHandler.default_update_script_and_size,
            do_index=True)
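Within one group, EFO id/term pairs are deduplicated through a dict and re-emitted as a list of {id, term} objects. The reshaping on its own:

    group = [
        {'efo_id': 'EFO:0000305', 'efo_term': 'breast carcinoma'},
        {'efo_id': 'EFO:0000305', 'efo_term': 'breast carcinoma'},
        {'efo_id': None},
    ]
    efo_data = {}
    for drug_ind_i in group:
        efo_id_i = drug_ind_i.get('efo_id', None)
        if efo_id_i is not None:
            efo_data[efo_id_i] = drug_ind_i.get('efo_term', None)
    efo = [{'id': efo_id, 'term': term} for efo_id, term in efo_data.items()]
    assert efo == [{'id': 'EFO:0000305', 'term': 'breast carcinoma'}]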
Example #12
base_reindex_data = {
    'source': {
        'index': 'unichem_bkp_simple',
        'size': 1000,
        'slice': {
            'id': 2,
            'max': 1000
        }
    },
    'dest': {
        'index': 'unichem_test'
    }
}

num_slices = 1000
initial_time = time.time()
sleep_time = 10
pb_scheduled = get_new_progressbar('scheduled_slices', max_val=num_slices)
pb_reindexed = get_new_progressbar('reindex_slices', max_val=num_slices)
scheduled_slices = 0
completed_slices = 0
slice_reindex_timeout = sleep_time * 1000
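reindex_slice below acquires a lock and schedules one slice; each worker presumably clones the base request body and stamps its own slice id before launching the reindex task. A minimal sketch of that per-slice derivation (assumed, not shown in the source):

    import copy

    def body_for_slice(base_body, slice_id):
        # Hypothetical helper: one sliced-reindex body per worker.
        body = copy.deepcopy(base_body)
        body['source']['slice']['id'] = slice_id
        return body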


def reindex_slice(slice_index):
    global completed_slices, scheduled_slices, task_ids, sleep_time, pb_scheduled, pb_reindexed, slice_reindex_timeout,\
        base_url, base_request_path, base_reindex_data, sync_lock, es_auth, stop_reindex
    if stop_reindex:
        return

    task_id = None
    sync_lock.acquire()
    try:
Example #13
    def save_denormalization(self):
        if self.compound_families_dir:
            es_util.delete_idx(self.generated_resource.idx_name)
            es_util.create_idx(self.generated_resource.idx_name,
                               3,
                               1,
                               analysis=DefaultMappings.COMMON_ANALYSIS,
                               mappings=MechanismDenormalizationHandler.
                               get_new_index_mappings())

            dn_dict = {}

            print('{0} GROUPED RECORDS WERE FOUND'.format(
                len(self.mechanisms_by_grouping_id)),
                  file=sys.stderr)
            p_bar = progress_bar_handler.get_new_progressbar(
                'mechanism_by_parent_target-dn-generation',
                len(self.mechanisms_by_grouping_id))
            i = 0
            for group_mechanisms in self.mechanisms_by_grouping_id.values():
                base_mechanism = group_mechanisms[0]
                action_type = base_mechanism.get('action_type', None)
                bs_id = base_mechanism.get('site_id', None)
                mechanism_refs = []
                mechanism_comments_set = set()
                selectivity_comments_set = set()
                binding_site_comments_set = set()
                max_phase = 0
                for mechanism_i in group_mechanisms:
                    if action_type != mechanism_i.get('action_type', None):
                        print('ACTION TYPE SHOULD BE {0} FOR MECHANISM {1}!'.
                              format(action_type, mechanism_i['mec_id']),
                              file=sys.stderr)
                        print(pprint.pformat(group_mechanisms),
                              file=sys.stderr)
                    if bs_id != mechanism_i.get('site_id', None):
                        print('BINDING SITE SHOULD BE {0} FOR MECHANISM {1}!'.
                              format(bs_id, mechanism_i['mec_id']),
                              file=sys.stderr)
                        print(pprint.pformat(group_mechanisms),
                              file=sys.stderr)
                    if bs_id is None:
                        bs_id = mechanism_i.get('site_id', None)

                    mechanism_i_comment = mechanism_i.get(
                        'mechanism_comment', None)
                    if mechanism_i_comment is not None:
                        mechanism_comments_set.add(mechanism_i_comment)

                    mechanism_i_selectivity_comment = mechanism_i.get(
                        'selectivity_comment', None)
                    if mechanism_i_selectivity_comment is not None:
                        selectivity_comments_set.add(
                            mechanism_i_selectivity_comment)

                    mechanism_i_binding_site_comment = mechanism_i.get(
                        'binding_site_comment', None)
                    if mechanism_i_binding_site_comment is not None:
                        binding_site_comments_set.add(
                            mechanism_i_binding_site_comment)

                    mechanism_refs += mechanism_i.get('mechanism_refs', [])

                    max_phase = max(max_phase, mechanism_i.get('max_phase', 0))

                parent_chembl_id, target_chembl_id, mechanism_of_action = \
                    self.get_mechanism_grouping_id_parts(base_mechanism)

                new_mechanism_doc = {
                    'parent_molecule':
                    MOLECULE.get_doc_by_id_from_es(parent_chembl_id),
                    'target':
                    TARGET.get_doc_by_id_from_es(target_chembl_id),
                    'binding_site':
                    BINDING_SITE.get_doc_by_id_from_es(bs_id),
                    'mechanism_of_action':
                    base_mechanism
                }
                new_mechanism_doc['mechanism_of_action'][
                    'mechanism_comment'] = list(mechanism_comments_set)
                new_mechanism_doc['mechanism_of_action'][
                    'selectivity_comment'] = list(selectivity_comments_set)
                new_mechanism_doc['mechanism_of_action'][
                    'binding_site_comment'] = list(binding_site_comments_set)
                new_mechanism_doc['mechanism_of_action'][
                    'max_phase'] = max_phase
                doc_id = self.generated_resource.get_doc_id(new_mechanism_doc)

                if len(mechanism_comments_set) > 1:
                    print('MULTIPLE MECHANISM COMMENTS FOUND FOR {0}'.format(
                        doc_id),
                          file=sys.stderr)
                if len(selectivity_comments_set) > 1:
                    print('MULTIPLE SELECTIVITY COMMENTS FOUND FOR {0}'.format(
                        doc_id),
                          file=sys.stderr)
                if len(binding_site_comments_set) > 1:
                    print(
                        'MULTIPLE BINDING SITE COMMENTS FOUND FOR {0}'.format(
                            doc_id),
                        file=sys.stderr)

                dn_dict[doc_id] = new_mechanism_doc
                i += 1
                p_bar.update(i)
            p_bar.finish()

            self.save_denormalization_dict(
                self.generated_resource,
                dn_dict,
                DenormalizationHandler.default_update_script_and_size,
                do_index=True)
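Free-text comment fields are merged through sets so each distinct comment survives exactly once, and max_phase is folded with max(). The aggregation pattern in isolation:

    group = [
        {'mechanism_comment': 'Irreversible inhibitor', 'max_phase': 2},
        {'mechanism_comment': 'Irreversible inhibitor', 'max_phase': 4},
        {'mechanism_comment': None},
    ]
    comments = set()
    max_phase = 0
    for mechanism_i in group:
        comment = mechanism_i.get('mechanism_comment', None)
        if comment is not None:
            comments.add(comment)
        max_phase = max(max_phase, mechanism_i.get('max_phase', 0))
    assert comments == {'Irreversible inhibitor'} and max_phase == 4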