Example 1
def validation_on_start(eco_scores_uri, schema_uri, excluded_biotypes,
                        datasources_to_datatypes, es_hosts, es_index_gene,
                        es_index_eco, es_index_efo, cache_target,
                        cache_target_u2e, cache_target_contains, cache_eco,
                        cache_efo, cache_efo_contains):
    logger = logging.getLogger(__name__)

    validator = opentargets_validator.helpers.generate_validator_from_schema(
        schema_uri)

    lookup_data = LookUpDataRetriever(
        new_es_client(es_hosts),
        gene_index=es_index_gene,
        gene_cache_size=cache_target,
        gene_cache_u2e_size=cache_target_u2e,
        gene_cache_contains_size=cache_target_contains,
        eco_index=es_index_eco,
        eco_cache_size=cache_eco,
        efo_index=es_index_efo,
        efo_cache_size=cache_efo,
        efo_cache_contains_size=cache_efo_contains).lookup

    datasources_to_datatypes = datasources_to_datatypes
    evidence_manager = EvidenceManager(lookup_data, eco_scores_uri,
                                       excluded_biotypes,
                                       datasources_to_datatypes)

    return logger, validator, lookup_data, datasources_to_datatypes, evidence_manager
Example 2
    def _store_efo(self, dry_run):

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):

            #write into elasticsearch
            chunk_size = 1000 #TODO make configurable
            actions = elasticsearch_actions(self.efos.items(), self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(es, actions,
                            thread_count=self.workers_write,
                            queue_size=self.queue_write, 
                            chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(es, actions,
                            chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" % failcount)
Example 3
def setup_writers(dry_run, es_hosts, output_folder):
    global_init = None
    local_init = None
    main = None
    local_shutdown = None
    global_shutdown = None

    if dry_run:
        main = dry_run_main
    elif es_hosts:
        #have to bake the loader object in so that preparing for bulk indexing works
        es_loader = Loader(new_es_client(es_hosts))
        #use partial to "bake" arguments into the function we return
        global_init = functools.partial(elasticsearch_global_init, es_loader)
        local_init = functools.partial(elasticsearch_local_init, es_hosts)
        main = elasticsearch_main
        local_shutdown = elasticsearch_local_shutdown
        global_shutdown = functools.partial(elasticsearch_global_shutdown,
                                            es_loader)
    elif output_folder:
        #use partial to "bake" arguments into the function we return
        global_init = functools.partial(file_global_init, output_folder)
        local_init = functools.partial(file_local_init, output_folder)
        main = file_main
        local_shutdown = file_local_shutdown
    else:
        raise ValueError(
            "Must specify one of dry_run, es_hosts, output_folder")

    return global_init, local_init, main, local_shutdown, global_shutdown
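
A quick sketch of how the returned callables might be consumed; the host URL here is a placeholder, not a value taken from the pipeline configuration:

# hypothetical wiring; setup_writers itself comes from the snippet above
global_init, local_init, main, local_shutdown, global_shutdown = setup_writers(
    dry_run=False,
    es_hosts=["http://localhost:9200"],  # placeholder hosts
    output_folder=None)
# presumably global_init/global_shutdown run once around the whole write phase,
# while local_init, main and local_shutdown run inside each writer worker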
Example 4
    def process_all(self, dry_run):

        es = new_es_client(self.es_hosts)
        target_data, disease_data = get_disease_to_targets_vectors(
            self.score_threshold, self.evidence_count, es, self.es_index_assoc)

        if len(target_data) == 0 or len(disease_data) == 0:
            raise Exception(
                'Could not find a set of targets AND diseases that had the sufficient number'
                ' of evidences or acceptable harmonic sum score')
        # sort the lists and keep using the same order in all the steps
        disease_keys = sorted(disease_data.keys())
        target_keys = sorted(target_data.keys())

        self.logger.info('getting disease labels')
        disease_id_to_label = get_disease_labels(disease_keys, es,
                                                 self.es_index_efo)
        disease_labels = [
            disease_id_to_label[hit_id] for hit_id in disease_keys
        ]
        self.logger.info('getting target labels')
        target_id_to_label = get_target_labels(target_keys, es,
                                               self.es_index_gen)
        target_labels = [target_id_to_label[hit_id] for hit_id in target_keys]

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):

            #calculate and store disease-to-disease in multiple processes
            self.logger.info('handling disease-to-disease')
            handle_pairs(RelationType.SHARED_TARGET, disease_labels,
                         disease_data, disease_keys, target_keys, 0.19, 1024,
                         es, dry_run, self.ddr_workers_production,
                         self.ddr_workers_score, self.ddr_workers_write,
                         self.ddr_queue_production_score,
                         self.ddr_queue_score_result, self.ddr_queue_write,
                         self.es_index, self.es_doc)
            self.logger.info('handled disease-to-disease')

            #calculate and store target-to-target in multiple processes
            self.logger.info('handling target-to-target')
            handle_pairs(RelationType.SHARED_DISEASE, target_labels,
                         target_data, target_keys, disease_keys, 0.19, 1024,
                         es, dry_run, self.ddr_workers_production,
                         self.ddr_workers_score, self.ddr_workers_write,
                         self.ddr_queue_production_score,
                         self.ddr_queue_score_result, self.ddr_queue_write,
                         self.es_index, self.es_doc)
            self.logger.info('handled target-to-target')
Example 5
def score_producer_local_init(es_hosts, redis_host, redis_port, lookup_data,
                              datasources_to_datatypes, dry_run):

    #set the R server to lookup into
    r_server = new_redis_client(redis_host, redis_port)

    scorer = Scorer()

    loader = Loader(new_es_client(es_hosts))

    return scorer, loader, r_server, lookup_data, datasources_to_datatypes, dry_run
Example 6
    def process_all(self, dry_run):

        self.relations = dict()
        self.g.add_node('root', name="", species="")

        for row in self.downloader.get_pathway_data():
            self.g.add_node(row['id'],
                            name=row['name'],
                            species=row['species'])
        children = set()
        for row in self.downloader.get_pathway_relations():
            self.g.add_edge(row['id'], row['child'])
            children.add(row['child'])

        nodes_without_parent = set(self.g.nodes()) - children
        for node in nodes_without_parent:
            if node != 'root':
                self.g.add_edge('root', node)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):
            #write into elasticsearch
            chunk_size = 1000  #TODO make configurable
            docs = generate_documents(self.g)
            actions = elasticsearch_actions(docs, self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)
Example 7
    def __init__(self, redis_host, redis_port, es_hosts):

        self.logger = logging.getLogger(__name__)

        self.es_hosts = es_hosts
        self.es = new_es_client(self.es_hosts)
        self.es_loader = Loader(self.es)
        self.es_query = ESQuery(self.es)

        self.redis_host = redis_host
        self.redis_port = redis_port
        self.r_server = new_redis_client(self.redis_host, self.redis_port)
Example 8
    def merge_all(self, dry_run):

        es = new_es_client(self.es_hosts)

        #run the actual plugins
        for plugin_name in self.plugin_order:
            plugin = self.simplePluginManager.getPluginByName(plugin_name)

            # TODO remove the former redis object from all plugins
            plugin.plugin_object.merge_data(self.genes, es, None,
                                            self.data_config, self.es_config)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        # Hot fix for issue 643: missing pathway in the association; the reactome functions need a review
        for geneid, gene in self.genes.iterate():
            gene._create_suggestions()
            gene._create_facets()

        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):

            #write into elasticsearch
            chunk_size = 1000  #TODO make configurable
            actions = elasticsearch_actions(self.genes, self.es_index)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)
Example 9
def score_producer_local_init(datasources_to_datatypes, dry_run, es_hosts,
        es_index_gene, es_index_eco, es_index_hpa, es_index_efo):
    scorer = Scorer()
    lookup_data = LookUpDataRetriever(new_es_client(es_hosts), 
        (
            LookUpDataType.DISEASE,
            LookUpDataType.TARGET,
            LookUpDataType.ECO,
            LookUpDataType.HPA
        ),
        gene_index=es_index_gene,
        eco_index=es_index_eco,
        hpa_index=es_index_hpa,
        efo_index=es_index_efo).lookup
    return scorer, lookup_data, datasources_to_datatypes, dry_run
Example 10
    def process_all(self, dry_run):
        ''' process all the objects that need to be returned by the search method
        :return:
        '''

        es = new_es_client(self.es_hosts)
        #setup chembl handler
        self.chembl_handler = ChEMBLLookup(
            self.chembl_target_uri, self.chembl_mechanism_uri,
            self.chembl_component_uri, self.chembl_protein_uri,
            self.chembl_molecule_set_uri_pattern)
        self.chembl_handler.get_molecules_from_evidence(
            es, self.es_index_val_right)
        all_molecules = set()
        for target, molecules in self.chembl_handler.target2molecule.items():
            all_molecules = all_molecules | molecules
        all_molecules = sorted(all_molecules)
        query_batch_size = 100
        for i in range(0, len(all_molecules) + 1, query_batch_size):
            self.chembl_handler.populate_synonyms_for_molecule(
                all_molecules[i:i + query_batch_size],
                self.chembl_handler.molecule2synonyms)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):
            #process targets
            self.logger.info('handling targets')
            targets = self.get_targets(es)
            so_it = self.handle_search_object(targets, es,
                                              SearchObjectTypes.TARGET)
            store_in_elasticsearch(so_it, dry_run, es, self.es_index,
                                   self.es_doc, self.workers_write,
                                   self.queue_write)

            #process diseases
            self.logger.info('handling diseases')
            diseases = self.get_diseases(es)
            so_it = self.handle_search_object(diseases, es,
                                              SearchObjectTypes.DISEASE)
            store_in_elasticsearch(so_it, dry_run, es, self.es_index,
                                   self.es_doc, self.workers_write,
                                   self.queue_write)
Example 11
    def store_data(self, dry_run):
        self.logger.info('store_data called')

        self.logger.debug('calling to create new expression index')

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):

            #write into elasticsearch
            chunk_size = 1000  #TODO make configurable
            actions = elasticsearch_actions(self.hpa_merged_table, dry_run,
                                            self.es_index)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)

        self.logger.info('missing tissues %s', str(_missing_tissues))
Example 12
    def process(self, dry_run):
        def _put_line(line):
            return 1

        self.logger.info('Reading Ensembl gene info from %s' %
                         self.ensembl_filename)

        lines = more_itertools.with_iter(
            URLZSource(self.ensembl_filename).open())

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):
            #write into elasticsearch
            chunk_size = 1000  #TODO make configurable
            actions = elasticsearch_actions(lines, self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)
Example 13
def score_producer_local_init(
    datasources_to_datatypes,
    dry_run,
    es_hosts,
    es_index_gene,
    es_index_hpa,
    es_index_efo,
    gene_cache_size,
    hpa_cache_size,
    efo_cache_size,
):
    scorer = Scorer()
    lookup_data = LookUpDataRetriever(new_es_client(es_hosts),
                                      gene_index=es_index_gene,
                                      gene_cache_size=gene_cache_size,
                                      hpa_index=es_index_hpa,
                                      hpa_cache_size=hpa_cache_size,
                                      efo_index=es_index_efo,
                                      efo_cache_size=efo_cache_size).lookup
    return scorer, lookup_data, datasources_to_datatypes, dry_run
Example 14
def validation_on_start(eco_scores_uri, schema_uri, excluded_biotypes,
                        datasources_to_datatypes, es_hosts, es_index_gene,
                        es_index_eco, es_index_efo):
    logger = logging.getLogger(__name__)

    validator = opentargets_validator.helpers.generate_validator_from_schema(
        schema_uri)

    lookup_data = LookUpDataRetriever(
        new_es_client(es_hosts),
        (LookUpDataType.DISEASE, LookUpDataType.TARGET, LookUpDataType.ECO),
        gene_index=es_index_gene,
        eco_index=es_index_eco,
        efo_index=es_index_efo).lookup

    datasources_to_datatypes = datasources_to_datatypes
    evidence_manager = EvidenceManager(lookup_data, eco_scores_uri,
                                       excluded_biotypes,
                                       datasources_to_datatypes)

    return logger, validator, lookup_data, datasources_to_datatypes, evidence_manager
Example 15
    def process(self, dry_run):
        self.logger.debug("download uniprot uri %s", self.uri)
        self.logger.debug("to generate this file you have to call this url "
                            "https://www.uniprot.org/uniprot/?query=reviewed%3Ayes%2BAND%2Borganism%3A9606&compress=yes&format=xml")

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)
        chunk_size = 1000 # TODO make configurable
        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):

            items = generate_uniprot(self.uri)
            actions = elasticsearch_actions(items, self.es_index, self.es_doc)

            #write into elasticsearch
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(es, actions,
                            thread_count=self.workers_write,
                            queue_size=self.queue_write, 
                            chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(es, actions,
                            chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" % failcount)
Example 16
def main():
    #parse config file, environment, and command line arguments
    mrtarget.cfg.setup_ops_parser()
    args = mrtarget.cfg.get_ops_args()

    #set up logging
    logger = None
    if args.log_config:
        if os.path.isfile(args.log_config) and os.access(
                args.log_config, os.R_OK):
            #read a log configuration file
            logging.config.fileConfig(args.log_config,
                                      disable_existing_loggers=False)
            logger = logging.getLogger(__name__ + ".main()")
        else:
            #unable to read the logging config file, abort
            logging.basicConfig()
            logger = logging.getLogger(__name__ + ".main()")
            logger.error("unable to read file {}".format(args.log_config))
            return 1
    else:
        #no logging config specified, fall back to default
        logging.basicConfig()
        logger = logging.getLogger(__name__ + ".main()")

    if not args.release_tag:
        logger.error('A [release-tag] has to be specified.')
        print('A [release-tag] has to be specified.', file=sys.stderr)
        return 1
    else:
        Config.RELEASE_VERSION = args.release_tag
        logger.info('setting release version %s' % Config.RELEASE_VERSION)

    with RedisManager(args.redis_remote, args.redis_host, args.redis_port):

        es = new_es_client(args.elasticseach_nodes)
        redis = new_redis_client(args.redis_host, args.redis_port)

        #create a single query object for future use
        esquery = ESQuery(es)

        #read the data configuration
        data_config = mrtarget.cfg.get_data_config(args.data_config)

        #create something to accumulate qc metrics into over various steps
        qc_metrics = QCMetrics()

        with Loader(es,
                    chunk_size=ElasticSearchConfiguration.bulk_load_chunk,
                    dry_run=args.dry_run) as loader:

            if args.rea:
                process = ReactomeProcess(
                    loader, data_config.reactome_pathway_data,
                    data_config.reactome_pathway_relation)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.ens:
                process = EnsemblProcess(loader)
                if not args.qc_only:
                    process.process(data_config.ensembl_filename, args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.unic:
                process = UniprotDownloader(loader)
                if not args.qc_only:
                    process.process(data_config.uniprot_uri, args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.hpa:
                process = HPAProcess(loader, redis, args.elasticseach_nodes,
                                     data_config.tissue_translation_map,
                                     data_config.tissue_curation_map,
                                     data_config.hpa_normal_tissue,
                                     data_config.hpa_rna_level,
                                     data_config.hpa_rna_value,
                                     data_config.hpa_rna_zscore)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.gen:
                process = GeneManager(
                    loader,
                    redis,
                    args.gen_plugin_places,
                    data_config.gene_data_plugin_names,
                )
                if not args.qc_only:
                    process.merge_all(data_config, dry_run=args.dry_run)

                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.efo:
                process = EfoProcess(loader, data_config.ontology_efo,
                                     data_config.ontology_hpo,
                                     data_config.ontology_mp,
                                     data_config.disease_phenotype)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.eco:
                process = EcoProcess(loader, data_config.ontology_eco,
                                     data_config.ontology_so)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.val:
                es_output_folder = None
                if "elasticsearch_folder" in vars(
                        args) and args.elasticsearch_folder is not None:
                    es_output_folder = args.elasticsearch_folder

                process_evidences_pipeline(
                    filenames=data_config.input_file,
                    first_n=args.val_first_n,
                    es_client=es,
                    redis_client=redis,
                    dry_run=args.dry_run,
                    output_folder=es_output_folder,
                    num_workers=args.val_workers_validator,
                    num_writers=args.val_workers_writer,
                    max_queued_events=args.val_queue_validator_writer,
                    eco_scores_uri=data_config.eco_scores,
                    schema_uri=data_config.schema,
                    es_hosts=args.elasticseach_nodes,
                    excluded_biotypes=data_config.excluded_biotypes,
                    datasources_to_datatypes=data_config.
                    datasources_to_datatypes)

                #TODO qc

            if args.assoc:
                process = ScoringProcess(args.redis_host, args.redis_port,
                                         args.elasticseach_nodes)
                if not args.qc_only:
                    process.process_all(data_config.scoring_weights,
                                        data_config.is_direct_do_not_propagate,
                                        data_config.datasources_to_datatypes,
                                        args.dry_run,
                                        args.as_workers_production,
                                        args.as_workers_score,
                                        args.as_queue_production_score)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
                    pass

            if args.ddr:
                process = DataDrivenRelationProcess(es)
                if not args.qc_only:
                    process.process_all(args.dry_run,
                                        args.ddr_workers_production,
                                        args.ddr_workers_score,
                                        args.ddr_queue_production_score,
                                        args.ddr_queue_score_result)
                #TODO qc

            if args.sea:
                process = SearchObjectProcess(loader, redis)
                if not args.qc_only:
                    process.process_all(
                        data_config.chembl_target,
                        data_config.chembl_mechanism,
                        data_config.chembl_component,
                        data_config.chembl_protein,
                        data_config.chembl_molecule_set_uri_pattern,
                        args.dry_run)
                #TODO qc

            if args.metric:
                process = Metrics(
                    es, args.metric_file,
                    data_config.datasources_to_datatypes).generate_metrics()

    if args.qc_in:
        #handle reading in previous qc from filename provided, and adding comparative metrics
        qc_metrics.compare_with(args.qc_in)

    if args.qc_out:
        #handle writing out to a tsv file
        qc_metrics.write_out(args.qc_out)

    logger.info('`' + " ".join(sys.argv) + '` - finished')
    return 0
Example 17
# ./data_pipeline_extract.py --index=master_evidence-data --output-dir=~/data/ot/extract --output-filename=evidence.json
# ./data_pipeline_extract.py --index=master_gene-data --output-dir=~/data/ot/extract --output-filename=gene.json
# ./data_pipeline_extract.py --index=master_association-data --output-dir=~/data/ot/extract --output-filename=association.json
# ./data_pipeline_extract.py --index=master_efo-data --output-dir=~/data/ot/extract --output-filename=efo.json --id-field-name=id
# ./data_pipeline_extract.py --index=master_eco-data --output-dir=~/data/ot/extract --output-filename=eco.json
from mrtarget.common.connection import new_es_client
from elasticsearch import helpers
from pathlib import Path
import more_itertools
import tqdm
import argparse
import json
import os
import logging

es = new_es_client('http://elasticsearch:9200')


def get_record_iterator(index, id_field, batch_size=10000):
    # Setup scanner for entire index
    query = {"query": {"match_all": {}}}
    res = helpers.scan(es, query, index=index, size=batch_size, scroll='1h')
    for batch in more_itertools.chunked(tqdm.tqdm(res), batch_size):
        for r in batch:
            rec = r['_source']
            if id_field:
                rec[id_field] = r['_id']
            yield rec

def export(index, out_file, id_field):
    logging.info(f'Beginning export to {out_file}')
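    # (the original snippet is truncated here; the lines below are a hypothetical
    # continuation, assuming export streams one JSON document per line to out_file)
    Path(out_file).parent.mkdir(parents=True, exist_ok=True)
    with open(out_file, 'w') as fh:
        for rec in get_record_iterator(index, id_field):
            fh.write(json.dumps(rec))
            fh.write('\n')
    logging.info(f'Finished export to {out_file}')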
Example 18
def main():
    #parse config file, environment, and command line arguments
    mrtarget.cfg.setup_ops_parser()
    args = mrtarget.cfg.get_ops_args()

    #set up logging
    logger = None
    if args.log_config:
        if os.path.isfile(args.log_config) and os.access(
                args.log_config, os.R_OK):
            #read a log configuration file
            logging.config.fileConfig(args.log_config,
                                      disable_existing_loggers=False)
            logger = logging.getLogger(__name__ + ".main()")
        else:
            #unable to read the logging config file, abort
            logging.basicConfig()
            logger = logging.getLogger(__name__ + ".main()")
            logger.error("unable to read file {}".format(args.log_config))
            return 1
    else:
        #no logging config specified, fall back to default
        logging.basicConfig()
        logger = logging.getLogger(__name__ + ".main()")

    logger.info('`' + " ".join(sys.argv) + '` - starting')

    #read the data configuration
    data_config = mrtarget.cfg.get_config(args.data_config)

    #read the es configuration
    es_config = mrtarget.cfg.get_config(args.es_config)

    #es clients can't be passed around to multiple processes!
    es = new_es_client(args.elasticseach_nodes)

    #create something to accumulate qc metrics into over various steps
    qc_metrics = QCMetrics()

    if args.rea:
        process = ReactomeProcess(args.elasticseach_nodes, es_config.rea.name,
                                  es_config.rea.mapping, es_config.rea.setting,
                                  data_config.reactome_pathway_data,
                                  data_config.reactome_pathway_relation,
                                  args.rea_workers_writer,
                                  args.rea_queue_write)
        if not args.qc_only:
            process.process_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.rea.name))

    if args.gen:
        process = GeneManager(args.elasticseach_nodes, es_config.gen.name,
                              es_config.gen.mapping, es_config.gen.setting,
                              args.gen_plugin_places,
                              data_config.gene_data_plugin_names, data_config,
                              es_config, args.gen_workers_writer,
                              args.gen_queue_write)
        if not args.qc_only:
            process.merge_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.gen.name))

    if args.efo:
        process = EfoProcess(args.elasticseach_nodes, es_config.efo.name,
                             es_config.efo.mapping, es_config.efo.setting,
                             data_config.ontology_efo,
                             data_config.ontology_hpo, data_config.ontology_mp,
                             data_config.disease_phenotype,
                             args.efo_workers_writer, args.efo_queue_write)
        if not args.qc_only:
            process.process_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.efo.name))
    if args.eco:
        process = EcoProcess(args.elasticseach_nodes, es_config.eco.name,
                             es_config.eco.mapping, es_config.eco.setting,
                             data_config.ontology_eco, data_config.ontology_so,
                             args.eco_workers_writer, args.eco_queue_write)
        if not args.qc_only:
            process.process_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.eco.name))

    if args.val:
        process_evidences_pipeline(
            data_config.input_file, args.val_first_n, args.elasticseach_nodes,
            es_config.val_right.name, es_config.val_wrong.name,
            es_config.val_right.mapping, es_config.val_wrong.mapping,
            es_config.val_right.setting, es_config.val_wrong.setting,
            es_config.gen.name, es_config.eco.name, es_config.efo.name,
            args.dry_run, args.val_append_data, args.val_workers_validator,
            args.val_queue_validator, args.val_workers_writer,
            args.val_queue_validator_writer, args.val_cache_target,
            args.val_cache_target_u2e, args.val_cache_target_contains,
            args.val_cache_eco, args.val_cache_efo,
            args.val_cache_efo_contains, data_config.eco_scores,
            data_config.schema, data_config.excluded_biotypes,
            data_config.datasources_to_datatypes)

        #TODO qc

    if args.hpa:
        process = HPAProcess(
            args.elasticseach_nodes, es_config.hpa.name, es_config.hpa.mapping,
            es_config.hpa.setting, data_config.tissue_translation_map,
            data_config.tissue_curation_map, data_config.hpa_normal_tissue,
            data_config.hpa_rna_level, data_config.hpa_rna_value,
            data_config.hpa_rna_zscore, args.hpa_workers_writer,
            args.hpa_queue_write)
        if not args.qc_only:
            process.process_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.hpa.name))

    if args.assoc:
        process = ScoringProcess(
            args.elasticseach_nodes, es_config.asc.name, es_config.asc.mapping,
            es_config.asc.setting, es_config.gen.name,
            es_config.val_right.name, es_config.hpa.name, es_config.efo.name,
            args.as_workers_writer, args.as_workers_production,
            args.as_workers_score, args.as_queue_score,
            args.as_queue_production, args.as_queue_write, args.as_cache_hpa,
            args.as_cache_efo, args.as_cache_target,
            data_config.scoring_weights,
            data_config.is_direct_do_not_propagate,
            data_config.datasources_to_datatypes)
        if not args.qc_only:
            process.process_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.asc.name))

    if args.ddr:
        process = DataDrivenRelationProcess(
            args.elasticseach_nodes, es_config.ddr.name, es_config.ddr.mapping,
            es_config.ddr.setting, es_config.efo.name, es_config.gen.name,
            es_config.asc.name, args.ddr_workers_production,
            args.ddr_workers_score, args.ddr_workers_write,
            args.ddr_queue_production_score, args.ddr_queue_score_result,
            args.ddr_queue_write, data_config.ddr["score-threshold"],
            data_config.ddr["evidence-count"])
        if not args.qc_only:
            process.process_all(args.dry_run)
        #TODO qc

    if args.sea:
        process = SearchObjectProcess(
            args.elasticseach_nodes, es_config.sea.name, es_config.sea.mapping,
            es_config.sea.setting, es_config.gen.name, es_config.efo.name,
            es_config.val_right.name, es_config.asc.name,
            args.sea_workers_writer, args.sea_queue_write,
            data_config.chembl_target, data_config.chembl_mechanism,
            data_config.chembl_component, data_config.chembl_protein,
            data_config.chembl_molecule)
        if not args.qc_only:
            process.process_all(args.dry_run)
        #TODO qc

    if args.drg:
        process = DrugProcess(
            args.elasticseach_nodes, es_config.drg.name, es_config.drg.mapping,
            es_config.drg.setting, es_config.gen.name, es_config.efo.name,
            args.drg_workers_writer, args.drg_queue_write, args.drg_cache_efo,
            args.drg_cache_efo_contains, args.drg_cache_target,
            args.drg_cache_target_u2e, args.drg_cache_target_contains,
            data_config.chembl_target, data_config.chembl_mechanism,
            data_config.chembl_component, data_config.chembl_protein,
            data_config.chembl_molecule, data_config.chembl_indication,
            data_config.adverse_events)
        if not args.qc_only:
            process.process_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.drg.name))

    if args.qc_in:
        #handle reading in previous qc from filename provided, and adding comparative metrics
        qc_metrics.compare_with(args.qc_in)

    if args.qc_out:
        #handle writing out to a tsv file
        qc_metrics.write_out(args.qc_out)

    logger.info('`' + " ".join(sys.argv) + '` - finished')
    return 0
Example 19
def write_on_start(es_hosts):
    kwargs = {}
    es_client = new_es_client(es_hosts)
    kwargs['es_loader'] = Loader(es=es_client)

    return kwargs
Example 20
def produce_evidence_local_init(es_hosts, es_index_val_right,
        scoring_weights, is_direct_do_not_propagate, datasources_to_datatypes):
    es = new_es_client(es_hosts)
    return (es, es_index_val_right, scoring_weights, 
        is_direct_do_not_propagate, datasources_to_datatypes)
Example 21
    def process_all(self, dry_run):
        es = new_es_client(self.es_hosts)

        drugs = self.generate(es)
        self.store(es, dry_run, drugs)
Example 22
def produce_evidence_local_init(es_hosts, scoring_weights,
                                is_direct_do_not_propagate,
                                datasources_to_datatypes):
    es = new_es_client(es_hosts)
    es_query = ESQuery(es)
    return es_query, scoring_weights, is_direct_do_not_propagate, datasources_to_datatypes
Example 23
def process_evidences_pipeline(
        filenames, first_n, es_hosts, es_index_valid, es_index_invalid,
        es_doc_valid, es_doc_invalid, es_mappings_valid, es_mappings_invalid,
        es_settings_valid, es_settings_invalid, es_index_gene, es_index_eco,
        es_index_efo, dry_run, workers_validation, queue_validation,
        workers_write, queue_write, eco_scores_uri, schema_uri,
        excluded_biotypes, datasources_to_datatypes):

    logger = logging.getLogger(__name__)

    # do not pass this es object to other processes, single process only!
    es = new_es_client(es_hosts)

    if not filenames:
        logger.error('tried to run with no filenames at all')
        raise RuntimeError("Must specify at least one filename of evidence")

    # files that are not fetchable
    failed_filenames = list(itertools.ifilterfalse(IO.check_to_open,
                                                   filenames))

    for uri in failed_filenames:
        logger.warning('failed to fetch uri %s', uri)

    # get the filenames that are properly fetchable
    #sort the list for consistent behaviour
    checked_filenames = sorted((set(filenames) - set(failed_filenames)))

    logger.info('start evidence processing pipeline')

    #create a iterable of lines from all file handles
    evs = IO.make_iter_lines(checked_filenames, first_n)

    #create functions with pre-baked arguments
    validation_on_start_baked = functools.partial(validation_on_start,
                                                  eco_scores_uri, schema_uri,
                                                  excluded_biotypes,
                                                  datasources_to_datatypes,
                                                  es_hosts, es_index_gene,
                                                  es_index_eco, es_index_efo)

    #here is the pipeline definition
    pl_stage = pr.map(process_evidence,
                      evs,
                      workers=workers_validation,
                      maxsize=queue_validation,
                      on_start=validation_on_start_baked)

    logger.info('stages created, running scoring and writing')

    with URLZSource(es_mappings_valid).open() as mappings_file:
        mappings_valid = json.load(mappings_file)

    with URLZSource(es_mappings_invalid).open() as mappings_file:
        mappings_invalid = json.load(mappings_file)

    with URLZSource(es_settings_valid).open() as settings_file:
        settings_valid = json.load(settings_file)

    with URLZSource(es_settings_invalid).open() as settings_file:
        settings_invalid = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, es_index_invalid, settings_invalid,
                                       mappings_invalid):
        with ElasticsearchBulkIndexManager(es, es_index_valid, settings_valid,
                                           mappings_valid):
            #load into elasticsearch
            chunk_size = 1000  #TODO make configurable
            actions = elasticsearch_actions(pl_stage, es_index_valid,
                                            es_index_invalid, es_doc_valid,
                                            es_doc_invalid)
            failcount = 0

            if not dry_run:
                results = None
                if workers_write > 0:
                    logger.debug("Using parallel bulk writer for Elasticearch")
                    # this can silently crash ?
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=workers_write,
                        queue_size=queue_write,
                        chunk_size=chunk_size)
                else:
                    logger.debug(
                        "Using streaming bulk writer for Elasticearch")
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)

                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)

            print('stages created, ran scoring and writing')
            logger.info('stages created, ran scoring and writing')

    if failed_filenames:
        raise RuntimeError('unable to handle %s' % str(failed_filenames))
Example 24
    def process_all(self, dry_run):

        # do not pass this es object to other processes, single process only!
        es = new_es_client(self.es_hosts)

        targets = self.get_targets(es)

        self.logger.info('setting up stages')

        #bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(produce_evidence_local_init, 
            self.es_hosts, self.es_index_val_right,
            self.scoring_weights, self.is_direct_do_not_propagate, 
            self.datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(score_producer_local_init,
            self.datasources_to_datatypes, dry_run, self.es_hosts,
            self.es_index_gene, self.es_index_eco, self.es_index_hpa, self.es_index_efo)
        
        #pipeline stage for making the lists of the target/disease pairs and evidence
        pipeline_stage1 = pr.flat_map(produce_evidence, targets, 
            workers=self.workers_production,
            maxsize=self.queue_produce,
            on_start=produce_evidence_local_init_baked)

        #pipeline stage for scoring the evidence sets
        #includes writing to elasticsearch
        pipeline_stage2 = pr.map(score_producer, pipeline_stage1, 
            workers=self.workers_score,
            maxsize=self.queue_score,
            on_start=score_producer_local_init_baked)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
            #load into elasticsearch
            self.logger.info('stages created, running scoring and writing')
            client = es
            chunk_size = 1000 #TODO make configurable
            actions = self.elasticsearch_actions(pipeline_stage2, 
                self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    self.logger.debug("Using parallel bulk writer for Elasticearch")
                    results = elasticsearch.helpers.parallel_bulk(client, actions,
                            thread_count=self.workers_write,
                            queue_size=self.queue_write, 
                            chunk_size=chunk_size)
                else:
                    self.logger.debug("Using streaming bulk writer for Elasticearch")
                    results = elasticsearch.helpers.streaming_bulk(client, actions,
                            chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" % failcount)

        self.logger.info("DONE")
def elasticsearch_local_init(es_hosts):
    return Loader(new_es_client(es_hosts)),
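
Note the trailing comma: the function returns a one-element tuple, which the on_start hooks used with pr.map / pr.flat_map in the other examples presumably unpack into worker arguments. A minimal illustration of that syntax:

def _returns_one_tuple():
    # "return x," is equivalent to "return (x,)"
    return "loader",

assert _returns_one_tuple() == ("loader",)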
Example 26
def get_es_client():
    return new_es_client('http://elasticsearch:9200')
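
A minimal smoke test for the client returned above (a sketch; it assumes the same http://elasticsearch:9200 endpoint is reachable):

es = get_es_client()
# Elasticsearch.info() returns basic cluster metadata and raises if no node responds
print(es.info())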