    def get_validated_evidence_strings(self,
                                       size=1000,
                                       datasources=None,
                                       is_valid=True):
        # stream every matching document using the scroll API via helpers.scan
        index_name = Loader.get_versioned_index(
            Const.ELASTICSEARCH_VALIDATED_DATA_INDEX_NAME + '*', True)

        doc_type = None
        if datasources:
            doc_type = datasources

        res = helpers.scan(
            client=self.handler,
            query={
                "query": {
                    "match_phrase": {
                        "is_valid": is_valid
                    }
                },
                '_source': True,
                'size': size,
            },
            scroll='12h',
            doc_type=doc_type,
            index=index_name,
            timeout="20m",
        )

        for hit in res:
            yield hit['_source']
    def get_disease_to_targets_vectors(self, threshold=0.1, evidence_count=3):
        '''
        Get all the association objects that are:
        - direct -> to avoid ontology inflation
        - evidence count >= evidence_count -> remove noise
        - overall score >= threshold -> remove very low quality noise
        :param threshold: minimum overall score to consider when fetching association data
        :param evidence_count: minimum number of evidence items to consider when fetching association data
        :return: two dictionaries, one mapping targets to diseases and one the reverse
        '''
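        # illustrative shape of the two returned mappings (ids and scores are
        # placeholders, not real data):
        #   target_results  = {<target.id>: {<disease.id>: <overall score>, ...}, ...}
        #   disease_results = {<disease.id>: {<target.id>: <overall score>, ...}, ...}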
        self.logger.debug('scan es to get all diseases and targets')
        res = helpers.scan(
            client=self.handler,
            query={
                "query": {
                    "term": {
                        "is_direct": True,
                    }
                },
                '_source': {
                    'includes': [
                        "target.id", 'disease.id', 'harmonic-sum',
                        'evidence_count'
                    ]
                },
                'size': 1000,
            },
            scroll='12h',
            index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME, True),
            timeout="10m",
        )

        target_results = dict()
        disease_results = dict()

        self.logger.debug('start getting all targets and diseases from es')
        c = 0
        for hit in res:
            c += 1
            hit = hit['_source']
            if hit['evidence_count']['total'] >= evidence_count and \
                    hit['harmonic-sum']['overall'] >= threshold:
                # store target associations
                if hit['target']['id'] not in target_results:
                    target_results[hit['target']['id']] = SparseFloatDict()
                #TODO: return all counts and scores up to datasource level
                target_results[hit['target']['id']][
                    hit['disease']['id']] = hit['harmonic-sum']['overall']
                # store disease associations
                if hit['disease']['id'] not in disease_results:
                    disease_results[hit['disease']['id']] = SparseFloatDict()
                # TODO: return all counts and scores up to datasource level
                disease_results[hit['disease']['id']][
                    hit['target']['id']] = hit['harmonic-sum']['overall']

            if c % 10000 == 0:
                self.logger.debug('%d elements retrieved', c)

        return target_results, disease_results
    def get_associations_for_disease(self,
                                     disease,
                                     fields=None,
                                     size=100,
                                     get_top_hits=True):
        source = self._get_source_from_fields(fields)

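        # addict.Dict creates nested keys on the fly through attribute access,
        # which keeps the aggregation and query bodies below compact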
        aggs = addict.Dict()
        if get_top_hits:
            aggs.direct_associations.filter.term.is_direct = True
            aggs.direct_associations.aggs.top_direct_ass.top_hits.sort[
                'harmonic-sum.overall'].order = 'desc'
            aggs.direct_associations.aggs.top_direct_ass.top_hits._source = source
            aggs.direct_associations.aggs.top_direct_ass.top_hits.size = size

        q = addict.Dict()
        q.query.constant_score.filter.terms['disease.id'] = [disease]
        q.sort['harmonic-sum.overall'].order = 'desc'
        q._source = source
        q.aggs = aggs
        q.size = size

        res = self.handler.search(
            index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME, True),
            doc_type=Const.ELASTICSEARCH_DATA_ASSOCIATION_DOC_NAME,
            body=q.to_dict())
        return AssociationSummary(res)
    def get_all_evidence_for_datatype(
        self,
        datatype,
        fields=None,
    ):
        # stream every matching document using the scroll API via helpers.scan
        index_name = Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_INDEX_NAME, True)
        res = helpers.scan(
            client=self.handler,
            query={
                "query": {
                    "match": {
                        "type": datatype
                    }
                },
                '_source': self._get_source_from_fields(fields),
                'size': 1000,
            },
            scroll='12h',
            index=index_name,
            timeout="10m",
        )

        for hit in res:
            yield hit['_source']
    def get_evidence_for_target_simple(self, target, expected=None):
        query_body = {
            "query": {
                "constant_score": {
                    "filter": {
                        "term": {
                            "target.id": target
                        }
                    }
                }
            },
            '_source': {
                "includes": [
                    "target.id",
                    "private.efo_codes",
                    "disease.id",
                    "scores.association_score",
                    "sourceID",
                    "id",
                ]
            },
        }

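        # a single search request returns at most index.max_result_window hits
        # (10000 by default), so small result sets are fetched in one request
        # while anything larger falls back to the scroll API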
        if expected is not None and expected < 10000:
            query_body['size'] = 10000
            res = self.handler.search(index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
                                      body=query_body)
            for hit in res['hits']['hits']:
                yield hit['_source']
        else:
            res = helpers.scan(client=self.handler,
                               query=query_body,
                               scroll='1h',
                               index=Loader.get_versioned_index(
                                   Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
                               timeout="1h",
                               request_timeout=2 * 60 * 60,
                               size=1000)
            for hit in res:
                yield hit['_source']
    def count_elements_in_index(self, index_name, doc_type=None, query=None):
        if query is None:
            query = {"match_all": {}}
        res = self.handler.search(index=Loader.get_versioned_index(
            index_name, True),
                                  doc_type=doc_type,
                                  body={
                                      "query": query,
                                      '_source': False,
                                      'size': 0,
                                  })
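        # on the Elasticsearch versions this client targets, hits.total is a
        # plain integer; 7.x and later wrap it in an object instead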
        return res['hits']['total']
    def get_reaction(self, reaction_id):
        res = self.handler.search(
            index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_REACTOME_INDEX_NAME, True),
            doc_type=Const.ELASTICSEARCH_REACTOME_REACTION_DOC_NAME,
            body={
                "query": {
                    "ids": {
                        "values": [reaction_id]
                    }
                },
                '_source': True,
                'size': 1,
            })
        for hit in res['hits']['hits']:
            return hit['_source']
    def get_all_associations(self):
        res = helpers.scan(
            client=self.handler,
            query={
                "query": {
                    "match_all": {}
                },
                '_source': True,
                'size': 1000,
            },
            scroll='1h',
            index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME, True),
            timeout="10m",
        )
        for hit in res:
            yield hit['_source']
    def count_evidence_for_target(self, target):
        res = self.handler.search(index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
                                  body={
                                      "query": {
                                          "constant_score": {
                                              "filter": {
                                                  "term": {
                                                      "target.id": target
                                                  }
                                              }
                                          }
                                      },
                                      '_source': False,
                                      'size': 0
                                  })
        return res['hits']['total']
    def get_all_ensembl_genes(self):
        res = helpers.scan(
            client=self.handler,
            query={
                "query": {
                    "match_all": {}
                },
                '_source': True,
                'size': 1000,
            },
            scroll='1h',
            index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME, True),
            timeout="10m",
        )
        for hit in res:
            yield hit['_source']
    def get_all_uniprot_entries(self):
        res = helpers.scan(
            client=self.handler,
            query={
                "query": {
                    "match_all": {}
                },
                '_source': True,
                'size': 100,
            },
            scroll='12h',
            index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_UNIPROT_INDEX_NAME, True),
            timeout="10m",
        )
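        # each hit stores the full UniProt record as a base64-encoded
        # jsonpickle payload in its 'entry' field; decode it back into an object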
        for hit in res:
            yield jsonpickle.decode(base64.b64decode(hit['_source']['entry']))
    def get_all_reactions(self):
        res = helpers.scan(
            client=self.handler,
            query={
                "query": {
                    "match_all": {}
                },
                '_source': True,
                'size': 1000,
            },
            scroll='1h',
            doc_type=Const.ELASTICSEARCH_REACTOME_REACTION_DOC_NAME,
            index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_REACTOME_INDEX_NAME, True),
            timeout="10m",
        )
        for hit in res:
            yield hit['_source']
    def get_all_target_ids_with_evidence_data(self):
        #TODO: use an aggregation to get those with just data
        res = helpers.scan(
            client=self.handler,
            query={
                "query": {
                    "match_all": {}
                },
                '_source': False,
                'size': 100,
            },
            scroll='12h',
            doc_type=Const.ELASTICSEARCH_GENE_NAME_DOC_NAME,
            index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_GENE_NAME_INDEX_NAME, True),
            timeout="30m",
        )
        for target in res:
            yield target['_id']
    def get_all_diseases(self, fields=None):
        source = self._get_source_from_fields(fields)
        res = helpers.scan(
            client=self.handler,
            query={
                "query": {
                    "match_all": {}
                },
                '_source': source,
                'size': 1000,
            },
            scroll='12h',
            doc_type=Const.ELASTICSEARCH_EFO_LABEL_DOC_NAME,
            index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_EFO_LABEL_INDEX_NAME, True),
            timeout="10m",
        )
        for hit in res:
            yield hit['_source']
    def get_disease_labels(self, ids):
        res = helpers.scan(
            client=self.handler,
            query={
                "query": {
                    "ids": {
                        "values": ids,
                    }
                },
                '_source': 'label',
                'size': 1,
            },
            scroll='12h',
            index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_EFO_LABEL_INDEX_NAME, True),
            timeout="10m",
        )

        return dict((hit['_id'], hit['_source']['label']) for hit in res)
    def get_all_target_disease_pair_from_evidence(self, only_direct=False):

        res = helpers.scan(
            client=self.handler,
            query={
                "query": {
                    "match_all": {}
                },
                '_source':
                self._get_source_from_fields([
                    'target.id', 'disease.id', 'private.efo_codes',
                    'scores.association_score'
                ]),
                'size':
                1000,
            },
            scroll='6h',
            index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
            timeout="1h",
            request_timeout=2 * 60 * 60,
        )

        yielded_pairs = set()
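        # with only_direct, only the evidence's own disease id is paired with
        # the target; otherwise the pair is expanded to every (propagated) EFO
        # code listed in private.efo_codes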
        for hit in res:
            if hit['_source']['scores']['association_score'] > 0:
                if only_direct:
                    pair = '-'.join([
                        hit['_source']['target']['id'],
                        hit['_source']['disease']['id']
                    ])
                    if pair not in yielded_pairs:
                        yield pair
                        yielded_pairs.add(pair)
                else:
                    for efo_id in hit['_source']['private']['efo_codes']:
                        pair = '-'.join(
                            [hit['_source']['target']['id'], efo_id])
                        if pair not in yielded_pairs:
                            yield pair
                            yielded_pairs.add(pair)
    def get_all_evidence(self, fields=None):
        index_name = Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_INDEX_NAME, True)
        res = helpers.scan(
            client=self.handler,
            query={
                "query": {
                    "match_all": {}
                },
                '_source': self._get_source_from_fields(fields),
                'size': 1000,
            },
            scroll='12h',
            index=index_name,
            timeout="10m",
        )

        for hit in res:
            yield hit['_source']
    def delete_data(self,
                    index,
                    query,
                    doc_type='',
                    chunk_size=1000,
                    altered_keys=()):
        '''
        Delete all the documents in an index matching a given query
        :param index: index to use
        :param query: query matching the elements to remove
        :param doc_type: document types, default is to look for all the doc types
        :param chunk_size: size of the bulk action sent to delete
        :param altered_keys: list of fields to fetch data and return as being altered by the delete query
        :return: dict of keys altered by the query
        '''
        # count available data
        res = self.handler.search(
            index=Loader.get_versioned_index(index, True),
            body={
                "query": query,
                '_source': False,
                'size': 0,
            },
            doc_type=doc_type,
        )
        total = res['hits']['total']
        # if anything matches the query, delete it with scan and bulk
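        # (scan-plus-bulk is the client-side equivalent of a server-side
        # delete-by-query, which some Elasticsearch releases only shipped as a
        # plugin)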
        altered = dict()
        for key in altered_keys:
            altered[key] = set()
        if total:
            batch = []
            for hit in helpers.scan(
                    client=self.handler,
                    query={
                        "query": query,
                        '_source': self._get_source_from_fields(altered_keys),
                        'size': chunk_size,
                    },
                    scroll='1h',
                    index=Loader.get_versioned_index(index, True),
                    doc_type=doc_type,
                    timeout='1h',
            ):
                action = {
                    '_op_type': 'delete',
                    '_index': hit['_index'],
                    '_type': hit['_type'],
                    '_id': hit['_id'],
                }
                batch.append(action)
                flat_source = self.flatten(hit['_source'])
                for key in altered_keys:
                    if key in flat_source:
                        altered[key].add(flat_source[key])
                if len(batch) >= chunk_size:
                    self._flush_bulk(batch)
                    batch = []

            self._flush_bulk(batch)
            # flush changes
            self.handler.indices.flush(Loader.get_versioned_index(index, True),
                                       wait_if_ongoing=True)

        return altered
    def get_objects_by_id(self,
                          ids,
                          index,
                          doc_type,
                          source=True,
                          source_exclude=(),
                          realtime=False):
        '''

        :param ids: list of identifiers for documents
        :param index: index for all the documents
        :param doc_type: doc type for all the documents
        :return: generator of documents
        '''
        if isinstance(ids, (list, tuple)):
            res = self.handler.mget(
                index=Loader.get_versioned_index(index, True),
                doc_type=doc_type,
                body=dict(ids=ids),
                _source=source,
                _source_exclude=source_exclude,
                realtime=realtime,
            )
            if not res:
                time.sleep(0.1)
                res = self.handler.mget(
                    index=Loader.get_versioned_index(index, True),
                    doc_type=doc_type,
                    body=dict(ids=ids),
                    _source=source,
                    _source_exclude=source_exclude,
                    realtime=realtime,
                )
            for doc in res['docs']:
                if doc['found']:
                    yield doc['_source']
                else:
                    raise KeyError('object with id %s not found' %
                                   (doc['_id']))

        else:

            try:
                res = self.handler.get(
                    index=Loader.get_versioned_index(index, True),
                    doc_type=doc_type,
                    id=ids,
                    _source=source,
                    _source_exclude=source_exclude,
                    realtime=realtime,
                )
                try:
                    yield res['_source']
                except Exception as e:
                    self.logger.exception(
                        'cannot retrieve single object by id %s ' % ids)
                    raise KeyError('object with id %s not found' % ids)

            except TransportError as te:
                if te.status_code == 404:
                    raise KeyError('object with id %s not found' % ids)
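
# A minimal, hypothetical usage sketch for the ESQuery helpers above; the host
# list is a placeholder and `new_es_client` is the same client factory that
# ScoringProcess uses further down.
if __name__ == '__main__':
    es = new_es_client(['http://localhost:9200'])  # placeholder host
    esquery = ESQuery(es)
    # the scan-backed generators stream hits one at a time, so even very large
    # indexes can be iterated with flat memory use
    count = 0
    for association in esquery.get_all_associations():
        count += 1
    print('associations seen: %d' % count)
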
class DataDrivenRelationProcess(object):

    def __init__(self, es):
        self.es = es
        self.es_query = ESQuery(self.es)
        self.logger = logging.getLogger(__name__)

    def process_all(self, dry_run, 
            ddr_workers_production,
            ddr_workers_score,
            ddr_queue_production_score,
            ddr_queue_score_result):
        start_time = time.time()

        target_data, disease_data = self.es_query.get_disease_to_targets_vectors()

        self.logger.info('Retrieved all the associations data in %i s' % (time.time() - start_time))
        self.logger.info('target data length: %s size in memory: %f Kb' % (len(target_data), sys.getsizeof(target_data) / 1024.))
        self.logger.info('disease data length: %s size in memory: %f Kb' % (len(disease_data), sys.getsizeof(disease_data) / 1024.))
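        # note: sys.getsizeof measures only the outer dict, not the nested
        # SparseFloatDict values, so these figures are lower bounds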

        # sort the lists and always use the same order in all the steps
        disease_keys = sorted(disease_data.keys())
        target_keys = sorted(target_data.keys())

        self.logger.info('getting disease labels')
        disease_id_to_label = self.es_query.get_disease_labels(disease_keys)
        disease_labels = [disease_id_to_label[hit_id] for hit_id in disease_keys]
        self.logger.info('getting target labels')
        target_id_to_label = self.es_query.get_target_labels(target_keys)
        target_labels = [target_id_to_label[hit_id] for hit_id in target_keys]

        #setup elasticsearch
        self.loader = Loader(self.es, dry_run=dry_run)
        if not dry_run:
            #need to directly get the versioned index name for this function
            self.loader.create_new_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
            self.loader.prepare_for_bulk_indexing(self.loader.get_versioned_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME))


        #calculate and store disease-to-disease in multiple processes
        self.logger.info('handling disease-to-disease')
        handle_pairs(RelationType.SHARED_TARGET, disease_labels, disease_data, disease_keys, 
            target_keys, 0.19, 1024, self.loader, dry_run, 
            ddr_workers_production, ddr_workers_score, 
            ddr_queue_production_score, ddr_queue_score_result)
        self.logger.info('handled disease-to-disease')

        #calculate and store target-to-target in multiple processes
        self.logger.info('handling target-to-target')
        handle_pairs(RelationType.SHARED_DISEASE, target_labels, target_data, target_keys, 
            disease_keys, 0.19, 1024, self.loader, dry_run, 
            ddr_workers_production, ddr_workers_score, 
            ddr_queue_production_score, ddr_queue_score_result)
        self.logger.info('handled target-to-target')

        #cleanup elasticsearch
        if not dry_run:
            self.loader.flush_all_and_wait(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.loader.restore_after_bulk_indexing()
    def exists(self, index, doc_type, id, realtime=False):
        return self.handler.exists(index=Loader.get_versioned_index(
            index, True),
                                   doc_type=doc_type,
                                   id=id,
                                   realtime=realtime)
Example #22
class ScoringProcess(object):
    def __init__(self, redis_host, redis_port, es_hosts):

        self.logger = logging.getLogger(__name__)

        self.es_hosts = es_hosts
        self.es = new_es_client(self.es_hosts)
        self.es_loader = Loader(self.es)
        self.es_query = ESQuery(self.es)

        self.redis_host = redis_host
        self.redis_port = redis_port
        self.r_server = new_redis_client(self.redis_host, self.redis_port)

    def process_all(self, scoring_weights, is_direct_do_not_propagate,
                    datasources_to_datatypes, dry_run, num_workers_produce,
                    num_workers_score, max_queued_produce_to_score):

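        # preload the shared lookup tables (disease, target, ECO and HPA data)
        # that the scoring workers will reuse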
        lookup_data = LookUpDataRetriever(
            self.es,
            self.r_server,
            targets=[],
            data_types=(LookUpDataType.DISEASE, LookUpDataType.TARGET,
                        LookUpDataType.ECO, LookUpDataType.HPA)).lookup

        targets = list(self.es_query.get_all_target_ids_with_evidence_data())

        #setup elasticsearch
        if not dry_run:
            self.es_loader.create_new_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            self.es_loader.prepare_for_bulk_indexing(
                self.es_loader.get_versioned_index(
                    Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME))

        self.logger.info('setting up stages')

        #bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(
            produce_evidence_local_init, self.es_hosts, scoring_weights,
            is_direct_do_not_propagate, datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(
            score_producer_local_init, self.es_hosts, self.redis_host,
            self.redis_port, lookup_data, datasources_to_datatypes, dry_run)

        #this doesn't need to be in the external config, since it's so content-light
        #as to be meaningless
        max_queued_score_out = 10000

        #pipeline stage for making the lists of the target/disease pairs and evidence
        pipeline_stage = pr.flat_map(
            produce_evidence,
            targets,
            workers=num_workers_produce,
            maxsize=max_queued_produce_to_score,
            on_start=produce_evidence_local_init_baked,
            on_done=produce_evidence_local_shutdown)

        #pipeline stage for scoring the evidence sets
        #includes writing to elasticsearch
        pipeline_stage = pr.each(score_producer,
                                 pipeline_stage,
                                 workers=num_workers_score,
                                 maxsize=max_queued_score_out,
                                 on_start=score_producer_local_init_baked,
                                 on_done=score_producer_local_shutdown)

        #loop over the end of the pipeline to make sure everything is finished
        self.logger.info('stages created, running scoring and writing')
        pr.run(pipeline_stage)
        self.logger.info('stages created, ran scoring and writing')

        #cleanup elasticsearch
        if not dry_run:
            self.logger.info('flushing data to index')
            self.es_loader.flush_all_and_wait(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.es_loader.restore_after_bulk_indexing()
            self.logger.info('flushed data to index')

        self.logger.info("DONE")

    """
    Run a series of QC tests on EFO elasticsearch index. Returns a dictionary
    of string test names and result objects
    """

    def qc(self, esquery):

        #number of association entries
        association_count = 0
        #Note: try to avoid doing this more than once!
        for association in esquery.get_all_associations():
            association_count += 1
            if association_count % 1000 == 0:
                self.logger.debug("checking %d", association_count)

        #put the metrics into a single dict
        metrics = dict()
        metrics["association.count"] = association_count

        return metrics
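
    # Hypothetical end-to-end QC invocation (hosts and ports are placeholders,
    # not taken from the source):
    #
    #     scoring = ScoringProcess('localhost', 6379, ['http://localhost:9200'])
    #     metrics = scoring.qc(scoring.es_query)
    #     print(metrics)  # -> {'association.count': <int>}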