Example #1
def validation_on_start(eco_scores_uri, schema_uri, excluded_biotypes,
                        datasources_to_datatypes, es_hosts, es_index_gene,
                        es_index_eco, es_index_efo, cache_target,
                        cache_target_u2e, cache_target_contains, cache_eco,
                        cache_efo, cache_efo_contains):
    logger = logging.getLogger(__name__)

    validator = opentargets_validator.helpers.generate_validator_from_schema(
        schema_uri)

    lookup_data = LookUpDataRetriever(
        new_es_client(es_hosts),
        gene_index=es_index_gene,
        gene_cache_size=cache_target,
        gene_cache_u2e_size=cache_target_u2e,
        gene_cache_contains_size=cache_target_contains,
        eco_index=es_index_eco,
        eco_cache_size=cache_eco,
        efo_index=es_index_efo,
        efo_cache_size=cache_efo,
        efo_cache_contains_size=cache_efo_contains).lookup

    evidence_manager = EvidenceManager(lookup_data, eco_scores_uri,
                                       excluded_biotypes,
                                       datasources_to_datatypes)

    return logger, validator, lookup_data, datasources_to_datatypes, evidence_manager
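
These on_start initializers are not called with their arguments at the call site; as Example #7 below shows for a sibling function, they are typically pre-baked with functools.partial so the pipeline library can invoke the hook with no arguments inside each worker. A minimal sketch of that wiring; every value below is a placeholder, not from the source:

import functools

validation_on_start_baked = functools.partial(
    validation_on_start,
    "https://example.org/eco_scores.tsv",    # eco_scores_uri (placeholder)
    "https://example.org/schema.json",       # schema_uri (placeholder)
    ("processed_pseudogene",),               # excluded_biotypes (placeholder)
    {"chembl": "known_drug"},                # datasources_to_datatypes (placeholder)
    ["localhost:9200"],                      # es_hosts (placeholder)
    "gene-index", "eco-index", "efo-index",  # index names (placeholders)
    1024, 1024, 1024, 1024, 1024, 1024)      # the six cache sizes (placeholders)

# a pipeline stage can now call validation_on_start_baked() once per worker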
Example #2
def score_producer_local_init(datasources_to_datatypes, dry_run, es_hosts,
        es_index_gene, es_index_eco, es_index_hpa, es_index_efo):
    scorer = Scorer()
    lookup_data = LookUpDataRetriever(new_es_client(es_hosts), 
        (
            LookUpDataType.DISEASE,
            LookUpDataType.TARGET,
            LookUpDataType.ECO,
            LookUpDataType.HPA
        ),
        gene_index=es_index_gene,
        eco_index=es_index_eco,
        hpa_index=es_index_hpa,
        efo_index=es_index_efo).lookup
    return scorer, lookup_data, datasources_to_datatypes, dry_run
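
The tuple returned here is per-worker state: the pipeline library these examples use (imported as pr, apparently pypeln) appears to hand whatever on_start returns back to each worker call. A self-contained toy, with all names hypothetical, that illustrates the convention:

def toy_on_start():
    # runs once per worker; build expensive shared state here
    return ("shared-state",)

def toy_worker(item, state):
    # every call receives the work item plus the on_start state
    return (item, state)

def run_stage(items, worker, on_start):
    state = on_start()  # the real pipeline does this once per worker process
    return [worker(item, *state) for item in items]

print(run_stage([1, 2, 3], toy_worker, toy_on_start))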
Example #3
def score_producer_local_init(
    datasources_to_datatypes,
    dry_run,
    es_hosts,
    es_index_gene,
    es_index_hpa,
    es_index_efo,
    gene_cache_size,
    hpa_cache_size,
    efo_cache_size,
):
    scorer = Scorer()
    lookup_data = LookUpDataRetriever(new_es_client(es_hosts),
                                      gene_index=es_index_gene,
                                      gene_cache_size=gene_cache_size,
                                      hpa_index=es_index_hpa,
                                      hpa_cache_size=hpa_cache_size,
                                      efo_index=es_index_efo,
                                      efo_cache_size=efo_cache_size).lookup
    return scorer, lookup_data, datasources_to_datatypes, dry_run
Example #4
def validation_on_start(eco_scores_uri, schema_uri, excluded_biotypes,
                        datasources_to_datatypes, es_hosts, es_index_gene,
                        es_index_eco, es_index_efo):
    logger = logging.getLogger(__name__)

    validator = opentargets_validator.helpers.generate_validator_from_schema(
        schema_uri)

    lookup_data = LookUpDataRetriever(
        new_es_client(es_hosts),
        (LookUpDataType.DISEASE, LookUpDataType.TARGET, LookUpDataType.ECO),
        gene_index=es_index_gene,
        eco_index=es_index_eco,
        efo_index=es_index_efo).lookup

    evidence_manager = EvidenceManager(lookup_data, eco_scores_uri,
                                       excluded_biotypes,
                                       datasources_to_datatypes)

    return logger, validator, lookup_data, datasources_to_datatypes, evidence_manager
Example #5
    def generate(self, es):

        # pre-load into indexed shelf dicts

        self.logger.info("Starting pre-loading")

        #create lookup tables
        self.lookup_data = LookUpDataRetriever(
            es, (LookUpDataType.TARGET, LookUpDataType.DISEASE),
            gene_index=self.es_index_gene,
            efo_index=self.es_index_efo).lookup

        # these are all separate files
        # intentional, partly because it's what the chembl API gives us, and partly because
        # it is easier for partners to add information to existing chembl records

        # TODO potentially load these in separate processes?

        self.logger.debug("Loading molecules")
        mols = self.create_shelf_multi(self.chembl_molecule_uris,
                                       get_parent_id)
        self.logger.debug("Loaded %d molecules", len(mols))
        self.logger.debug("Loading indications")
        indications = self.create_shelf_multi(
            self.chembl_indication_uris, lambda x: x["molecule_chembl_id"])
        self.logger.debug("Loaded %d indications", len(indications))
        self.logger.debug("Loading mechanisms")
        mechanisms = self.create_shelf_multi(self.chembl_mechanism_uris,
                                             lambda x: x["molecule_chembl_id"])
        self.logger.debug("Loaded %d mechanisms", len(mechanisms))
        self.logger.debug("Loading targets")
        targets = self.create_shelf(self.chembl_target_uris,
                                    lambda x: x["target_chembl_id"])
        self.logger.debug("Loaded %d targets", len(targets))
        self.logger.info("Completed pre-loading")

        drugs = {}
        #TODO finish

        for ident in mols:

            parent_mol = None
            child_mols = []

            for mol in mols[ident]:
                if mol["molecule_chembl_id"] == ident:
                    #this is the parent
                    assert parent_mol is None
                    parent_mol = mol
                else:
                    #this is a child
                    assert mol not in child_mols
                    child_mols.append(mol)

            assert parent_mol is not None, ident

            #TODO make sure there is no grandparenting

            child_mols = sorted(child_mols,
                                key=lambda x: x["molecule_chembl_id"])

            drug = self.handle_drug(ident, parent_mol, indications, mechanisms,
                                    targets)

            #append information from children
            for child_mol in child_mols:
                self.handle_drug_child(drug, child_mol["molecule_chembl_id"],
                                       child_mol, indications, mechanisms,
                                       targets)

            if "indications" in drug:
                drug["number_of_indications"] = len(drug["indications"])
            else:
                drug["number_of_indications"] = 0

            if "mechanisms_of_action" in drug:
                drug["number_of_mechanisms_of_action"] = len(
                    drug["mechanisms_of_action"])
            else:
                drug["number_of_mechanisms_of_action"] = 0

            # only keep those with indications or mechanisms
            if drug["number_of_indications"] > 0 \
                    or drug["number_of_mechanisms_of_action"] > 0:
                drugs[ident] = drug

        return drugs
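
create_shelf_multi is not defined in these examples; from its use above it fetches records from a list of URIs and groups them by a key function. A minimal in-memory stand-in under that assumption (the real version presumably uses a disk-backed shelf, and fetch_records is a hypothetical helper that yields one dict per record):

from collections import defaultdict

def create_shelf_multi_sketch(uris, key_fn, fetch_records):
    shelf = defaultdict(list)
    for uri in uris:
        for record in fetch_records(uri):
            # group every record under the key the caller derives from it
            shelf[key_fn(record)].append(record)
    return shelf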
Example #6
    def process_all(self, scoring_weights, is_direct_do_not_propagate,
                    datasources_to_datatypes, dry_run, num_workers_produce,
                    num_workers_score, max_queued_produce_to_score):

        lookup_data = LookUpDataRetriever(
            self.es,
            self.r_server,
            targets=[],
            data_types=(LookUpDataType.DISEASE, LookUpDataType.TARGET,
                        LookUpDataType.ECO, LookUpDataType.HPA)).lookup

        targets = list(self.es_query.get_all_target_ids_with_evidence_data())

        #setup elasticsearch
        if not dry_run:
            self.es_loader.create_new_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            self.es_loader.prepare_for_bulk_indexing(
                self.es_loader.get_versioned_index(
                    Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME))

        self.logger.info('setting up stages')

        #bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(
            produce_evidence_local_init, self.es_hosts, scoring_weights,
            is_direct_do_not_propagate, datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(
            score_producer_local_init, self.es_hosts, self.redis_host,
            self.redis_port, lookup_data, datasources_to_datatypes, dry_run)

        #this doesn't need to be in the external config, since it's so
        #content-light as to be meaningless
        max_queued_score_out = 10000

        #pipeline stage for making the lists of the target/disease pairs and evidence
        pipeline_stage = pr.flat_map(
            produce_evidence,
            targets,
            workers=num_workers_produce,
            maxsize=max_queued_produce_to_score,
            on_start=produce_evidence_local_init_baked,
            on_done=produce_evidence_local_shutdown)

        #pipeline stage for scoring the evidence sets
        #includes writing to elasticsearch
        pipeline_stage = pr.each(score_producer,
                                 pipeline_stage,
                                 workers=num_workers_score,
                                 maxsize=max_queued_score_out,
                                 on_start=score_producer_local_init_baked,
                                 on_done=score_producer_local_shutdown)

        #loop over the end of the pipeline to make sure everything is finished
        self.logger.info('stages created, running scoring and writing')
        pr.run(pipeline_stage)
        self.logger.info('stages created, ran scoring and writing')

        #cleanup elasticsearch
        if not dry_run:
            self.logger.info('flushing data to index')
            self.es_loader.flush_all_and_wait(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.es_loader.restore_after_bulk_indexing()
            self.logger.info('flushed data to index')

        self.logger.info("DONE")
Example #7
def process_evidences_pipeline(filenames, first_n, es_client, redis_client,
                               dry_run, output_folder, num_workers,
                               num_writers, max_queued_events, eco_scores_uri,
                               schema_uri, es_hosts, excluded_biotypes,
                               datasources_to_datatypes):
    logger = logging.getLogger(__name__)

    if not filenames:
        logger.error('tried to run with no filenames at all')
        raise RuntimeError("Must specify at least one filename of evidence")

    # files that are not fetchable
    failed_filenames = list(itertools.ifilterfalse(IO.check_to_open,
                                                   filenames))

    for uri in failed_filenames:
        logger.warning('failed to fetch uri %s', uri)

    # get the filenames that are properly fetchable
    #sort the list for consistent behaviour
    checked_filenames = sorted((set(filenames) - set(failed_filenames)))

    logger.info('start evidence processing pipeline')

    #load lookup tables
    lookup_data = LookUpDataRetriever(
        es_client, redis_client, [],
        (LookUpDataType.TARGET, LookUpDataType.DISEASE,
         LookUpDataType.ECO)).lookup

    #create an iterable of lines from all file handles
    evs = IO.make_iter_lines(checked_filenames, first_n)

    #create functions with pre-baked arguments
    validation_on_start_baked = functools.partial(validation_on_start,
                                                  lookup_data, eco_scores_uri,
                                                  schema_uri,
                                                  excluded_biotypes,
                                                  datasources_to_datatypes)

    writer_global_init, writer_local_init, writer_main, writer_local_shutdown, writer_global_shutdown = setup_writers(
        dry_run, es_hosts, output_folder)
    if writer_global_init:
        writer_global_init()

    #here is the pipeline definition
    pl_stage = pr.map(process_evidence,
                      evs,
                      workers=num_workers,
                      maxsize=max_queued_events,
                      on_start=validation_on_start_baked)

    pl_stage = pr.map(writer_main,
                      pl_stage,
                      workers=num_writers,
                      maxsize=max_queued_events,
                      on_start=writer_local_init,
                      on_done=writer_local_shutdown)

    logger.info('run evidence processing pipeline')
    results = reduce_tuple_with_sum(pr.to_iterable(pl_stage))

    #perform any single-thread cleanup
    if writer_global_shutdown:
        writer_global_shutdown()

    logger.info("results (failed: %s, succeed: %s)", results[0], results[1])
    if failed_filenames:
        raise RuntimeError('unable to handle %s' % str(failed_filenames))

    if not results[1]:
        raise RuntimeError("No evidence was sucessful!")
Example #8
    def generate(self, es):

        # pre-load into indexed shelf dicts

        self.logger.info("Starting pre-loading")

        # create lookup tables
        self.lookup_data = LookUpDataRetriever(
            es,
            gene_index=self.es_index_gene,
            gene_cache_size=self.cache_target,
            gene_cache_u2e_size=self.cache_target_u2e,
            gene_cache_contains_size=self.cache_target_contains,
            efo_index=self.es_index_efo,
            efo_cache_size=self.cache_efo,
            efo_cache_contains_size=self.cache_efo_contains).lookup

        # these are all separate files
        # intentional, partly because it's what the chembl API gives us, and partly because
        # it is easier for partners to add information to existing chembl records

        # TODO potentially load these in separate processes?

        self.logger.debug("Loading molecules")
        mols = self.create_shelf_multi(self.chembl_molecule_uris,
                                       get_parent_id)
        self.logger.debug("Loaded %d molecules", len(mols))
        self.logger.debug("Loading indications")
        indications = self.create_shelf_multi(
            self.chembl_indication_uris, lambda x: x["molecule_chembl_id"])
        self.logger.debug("Loaded %d indications", len(indications))
        self.logger.debug("Loading mechanisms")
        mechanisms = self.create_shelf_multi(self.chembl_mechanism_uris,
                                             lambda x: x["molecule_chembl_id"])
        self.logger.debug("Loaded %d mechanisms", len(mechanisms))
        self.logger.debug("Loading targets")
        targets = self.create_shelf(self.chembl_target_uris,
                                    lambda x: x["target_chembl_id"])
        self.logger.debug("Loaded %d targets", len(targets))
        adverse_events = self.create_shelf_multi_csv(self.adverse_events_uris,
                                                     "chembl_id", csv.excel)
        self.logger.debug("Loaded %d adverse events", len(adverse_events))
        # technically this can be a duplicate, e.g. CHEMBL1236107
        drugbank_ids = self.create_shelf_multi_csv(self.drugbank_uris,
                                                   "From src:'1'",
                                                   csv.excel_tab)
        self.logger.debug("Loaded %d drugbank ids", len(drugbank_ids))
        self.logger.info("Completed pre-loading")

        drugs = {}
        # TODO finish
        for ident in mols:  # all keys in mols
            parent_mol = None
            child_mols = []
            # 1. Set parent mol and list of children
            for mol in mols[ident]:
                mol["molecule_chembl_id"] = self.str_hook(
                    mol["molecule_chembl_id"])
                if mol["molecule_chembl_id"] == ident:
                    # this is the parent
                    assert parent_mol is None
                    parent_mol = mol
                else:
                    # this is a child
                    assert mol not in child_mols
                    child_mols.append(mol)

            # ToDo: check with AF
            assert parent_mol is not None, ident

            # TODO make sure there is no grandparenting
            child_mols = sorted(child_mols,
                                key=lambda x: x["molecule_chembl_id"])

            drug = self.handle_drug(ident, parent_mol, indications, mechanisms,
                                    targets, adverse_events, drugbank_ids)

            # append information from children
            for child_mol in child_mols:
                self.handle_drug_child(drug, child_mol["molecule_chembl_id"],
                                       child_mol, indications, mechanisms,
                                       targets, adverse_events, drugbank_ids)

            if "indications" in drug:
                drug["number_of_indications"] = len(drug["indications"])
                # build a summary of therapeutic areas covered by indications
                # TODO avoid repeated EFO lookups by doing this inside handle_indication()
                indication_therapeutic_areas = defaultdict(int)
                for indication in drug["indications"]:
                    efo_id = indication["efo_id"]
                    stored_efo = self.lookup_data.available_efos.get_efo(
                        efo_id)
                    if "therapeutic_codes" in stored_efo and "therapeutic_labels" in stored_efo:
                        for ta_code, ta_label in zip(
                                stored_efo["therapeutic_codes"],
                                stored_efo["therapeutic_labels"]):
                            indication_therapeutic_areas[ta_code,
                                                         ta_label] += 1
                drug["indication_therapeutic_areas"] = []
                for (ta_code, ta_label), value in sorted(
                        indication_therapeutic_areas.items(),
                        key=lambda x: x[1],
                        reverse=True):
                    indication_therapeutic_area = {}
                    indication_therapeutic_area["therapeutic_code"] = ta_code
                    indication_therapeutic_area["therapeutic_label"] = ta_label
                    indication_therapeutic_area["count"] = value
                    drug["indication_therapeutic_areas"].append(
                        indication_therapeutic_area)
                drug["indication_therapeutic_areas"] = tuple(
                    drug["indication_therapeutic_areas"])

            else:
                drug["number_of_indications"] = 0

            if "mechanisms_of_action" in drug:
                drug["number_of_mechanisms_of_action"] = len(
                    drug["mechanisms_of_action"])
            else:
                drug["number_of_mechanisms_of_action"] = 0

            # Aggregate indication refs, empty array if no indications present.
            drug["indication_refs"] = self.generateAggregatedIndicationRefs(
                drug)
            # only keep those with indications or mechanisms
            if drug["number_of_indications"] == 0 \
                    and drug["number_of_mechanisms_of_action"] == 0:
                continue

            drugs[ident] = drug

        return drugs
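
The therapeutic-area counting-and-sorting block above can be written more compactly with collections.Counter, whose most_common() already orders by descending count (equivalent up to the ordering of ties). A standalone version; get_efo stands in for self.lookup_data.available_efos.get_efo:

from collections import Counter

def summarize_therapeutic_areas(indications, get_efo):
    counts = Counter()
    for indication in indications:
        stored_efo = get_efo(indication["efo_id"])
        if "therapeutic_codes" in stored_efo and "therapeutic_labels" in stored_efo:
            counts.update(zip(stored_efo["therapeutic_codes"],
                              stored_efo["therapeutic_labels"]))
    return tuple({"therapeutic_code": code,
                  "therapeutic_label": label,
                  "count": count}
                 for (code, label), count in counts.most_common())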