Example 1
    def _store_efo(self, dry_run):

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):

            #write into elasticsearch
            chunk_size = 1000 #TODO make configurable
            actions = elasticsearch_actions(self.efos.items(), self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(es, actions,
                            thread_count=self.workers_write,
                            queue_size=self.queue_write, 
                            chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(es, actions,
                            chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" % failcount)
Example 2
    def store(self, es, dry_run, data):
        self.logger.info("Starting drug storage")
        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):
            # write into elasticsearch
            chunk_size = 1000  # TODO make configurable
            actions = elasticsearch_actions(list(data.items()), self.es_index)
            failcount = 0
            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)

        self.logger.debug("Completed storage")
Example 3
    def process_all(self, dry_run):

        es = new_es_client(self.es_hosts)
        target_data, disease_data = get_disease_to_targets_vectors(
            self.score_threshold, self.evidence_count, es, self.es_index_assoc)

        if len(target_data) == 0 or len(disease_data) == 0:
            raise Exception(
                'Could not find a set of targets AND diseases with a sufficient number'
                ' of evidence strings or an acceptable harmonic sum score')
        # sort the lists and always use the same order in all the steps
        disease_keys = sorted(disease_data.keys())
        target_keys = sorted(target_data.keys())

        self.logger.info('getting disease labels')
        disease_id_to_label = get_disease_labels(disease_keys, es,
                                                 self.es_index_efo)
        disease_labels = [
            disease_id_to_label[hit_id] for hit_id in disease_keys
        ]
        self.logger.info('getting target labels')
        target_id_to_label = get_target_labels(target_keys, es,
                                               self.es_index_gen)
        target_labels = [target_id_to_label[hit_id] for hit_id in target_keys]

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):

            #calculate and store disease-to-disease in multiple processes
            self.logger.info('handling disease-to-disease')
            handle_pairs(RelationType.SHARED_TARGET, disease_labels,
                         disease_data, disease_keys, target_keys, 0.19, 1024,
                         es, dry_run, self.ddr_workers_production,
                         self.ddr_workers_score, self.ddr_workers_write,
                         self.ddr_queue_production_score,
                         self.ddr_queue_score_result, self.ddr_queue_write,
                         self.es_index, self.es_doc)
            self.logger.info('handled disease-to-disease')

            #calculate and store target-to-target in multiple processes
            self.logger.info('handling target-to-target')
            handle_pairs(RelationType.SHARED_DISEASE, target_labels,
                         target_data, target_keys, disease_keys, 0.19, 1024,
                         es, dry_run, self.ddr_workers_production,
                         self.ddr_workers_score, self.ddr_workers_write,
                         self.ddr_queue_production_score,
                         self.ddr_queue_score_result, self.ddr_queue_write,
                         self.es_index, self.es_doc)
            self.logger.info('handled target-to-target')
Example 4
    def process_all(self, dry_run):

        self.relations = dict()
        self.g.add_node('root', name="", species="")

        for row in self.downloader.get_pathway_data():
            self.g.add_node(row['id'],
                            name=row['name'],
                            species=row['species'])
        children = set()
        for row in self.downloader.get_pathway_relations():
            self.g.add_edge(row['id'], row['child'])
            children.add(row['child'])

        nodes_without_parent = set(self.g.nodes()) - children
        for node in nodes_without_parent:
            if node != 'root':
                self.g.add_edge('root', node)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):
            #write into elasticsearch
            chunk_size = 1000  #TODO make configurable
            docs = generate_documents(self.g)
            actions = elasticsearch_actions(docs, self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)
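A detail worth noting about the failure-counting loops above: both streaming_bulk and parallel_bulk raise BulkIndexError on the first failed document by default (raise_on_error=True), so failcount is rarely incremented in practice. If the intent is to count failures and raise a single summary error at the end, the helpers can be told not to raise per document; a self-contained sketch against an assumed local cluster:

from elasticsearch import Elasticsearch
import elasticsearch.helpers

es = Elasticsearch(["http://localhost:9200"])  # assumed local cluster
chunk_size = 1000
actions = ({"_index": "demo-index", "_source": {"n": i}} for i in range(10))

# raise_on_error=False makes the helper yield (False, info) for each failed
# document instead of raising BulkIndexError mid-stream, which is what the
# fail-counting loops in these examples appear to expect.
failcount = 0
for success, details in elasticsearch.helpers.streaming_bulk(
        es, actions, chunk_size=chunk_size, raise_on_error=False):
    if not success:
        failcount += 1

if failcount:
    raise RuntimeError("%s documents failed to index" % failcount)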
Example 5
    def merge_all(self, dry_run):

        es = new_es_client(self.es_hosts)

        #run the actual plugins
        for plugin_name in self.plugin_order:
            plugin = self.simplePluginManager.getPluginByName(plugin_name)

            # TODO remove the former redis object from all plugins
            plugin.plugin_object.merge_data(self.genes, es, None,
                                            self.data_config, self.es_config)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        # Hot fix for issue 643: missing pathway in the association. Needs a review of the reactome functions
        for geneid, gene in self.genes.iterate():
            gene._create_suggestions()
            gene._create_facets()

        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):

            #write into elasticsearch
            chunk_size = 1000  #TODO make configurable
            actions = elasticsearch_actions(self.genes, self.es_index)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)
Example 6
    def process_all(self, dry_run):
        ''' process all the objects that need to be returned by the search method
        :return:
        '''

        es = new_es_client(self.es_hosts)
        #setup chembl handler
        self.chembl_handler = ChEMBLLookup(
            self.chembl_target_uri, self.chembl_mechanism_uri,
            self.chembl_component_uri, self.chembl_protein_uri,
            self.chembl_molecule_set_uri_pattern)
        self.chembl_handler.get_molecules_from_evidence(
            es, self.es_index_val_right)
        all_molecules = set()
        for target, molecules in self.chembl_handler.target2molecule.items():
            all_molecules = all_molecules | molecules
        all_molecules = sorted(all_molecules)
        query_batch_size = 100
        for i in range(0, len(all_molecules), query_batch_size):
            self.chembl_handler.populate_synonyms_for_molecule(
                all_molecules[i:i + query_batch_size],
                self.chembl_handler.molecule2synonyms)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):
            #process targets
            self.logger.info('handling targets')
            targets = self.get_targets(es)
            so_it = self.handle_search_object(targets, es,
                                              SearchObjectTypes.TARGET)
            store_in_elasticsearch(so_it, dry_run, es, self.es_index,
                                   self.es_doc, self.workers_write,
                                   self.queue_write)

            #process diseases
            self.logger.info('handling diseases')
            diseases = self.get_diseases(es)
            so_it = self.handle_search_object(diseases, es,
                                              SearchObjectTypes.DISEASE)
            store_in_elasticsearch(so_it, dry_run, es, self.es_index,
                                   self.es_doc, self.workers_write,
                                   self.queue_write)
Example 7
    def store_data(self, dry_run):
        self.logger.info('store_data called')

        self.logger.debug('calling to create new expression index')

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):

            #write into elasticsearch
            chunk_size = 1000  #TODO make configurable
            actions = elasticsearch_actions(self.hpa_merged_table, dry_run,
                                            self.es_index)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)

        self.logger.info('missing tissues %s', str(_missing_tissues))
Example 8
    def process(self, dry_run):
        def _put_line(line):
            return 1

        self.logger.info('Reading Ensembl gene info from %s' %
                         self.ensembl_filename)

        lines = more_itertools.with_iter(
            URLZSource(self.ensembl_filename).open())

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):
            #write into elasticsearch
            chunk_size = 1000  #TODO make configurable
            actions = elasticsearch_actions(lines, self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)
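Example 8 streams the Ensembl file through more_itertools.with_iter, which wraps a context manager so the underlying handle is closed once the iterator is exhausted; that lets the line generator outlive the with-block that would otherwise be needed. A tiny sketch on a hypothetical local file:

import more_itertools

# with_iter enters the context manager, yields its items, and closes it when
# the iterator is exhausted - no explicit "with open(...)" block required.
lines = more_itertools.with_iter(open("genes.tsv"))  # hypothetical file
for line in lines:
    print(line.rstrip())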
Example 9
    def process(self, dry_run):
        self.logger.debug("download uniprot uri %s", self.uri)
        self.logger.debug("to generate this file you have to call this url "
                            "https://www.uniprot.org/uniprot/?query=reviewed%3Ayes%2BAND%2Borganism%3A9606&compress=yes&format=xml")

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)
        chunk_size = 1000 # TODO make configurable
        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):

            items = generate_uniprot(self.uri)
            actions = elasticsearch_actions(items, self.es_index, self.es_doc)

            #write into elasticsearch
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(es, actions,
                            thread_count=self.workers_write,
                            queue_size=self.queue_write, 
                            chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(es, actions,
                            chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" % failcount)
Example 10
def process_evidences_pipeline(
        filenames, first_n, es_hosts, es_index_valid, es_index_invalid,
        es_doc_valid, es_doc_invalid, es_mappings_valid, es_mappings_invalid,
        es_settings_valid, es_settings_invalid, es_index_gene, es_index_eco,
        es_index_efo, dry_run, workers_validation, queue_validation,
        workers_write, queue_write, eco_scores_uri, schema_uri,
        excluded_biotypes, datasources_to_datatypes):

    logger = logging.getLogger(__name__)

    # do not pass this es object to other processes, single process only!
    es = new_es_client(es_hosts)

    if not filenames:
        logger.error('tried to run with no filenames at all')
        raise RuntimeError("Must specify at least one filename of evidence")

    # files that are not fetchable
    # (itertools.ifilterfalse is Python 2 only; Python 3 renamed it to
    #  itertools.filterfalse)
    failed_filenames = list(itertools.ifilterfalse(IO.check_to_open,
                                                   filenames))

    for uri in failed_filenames:
        logger.warning('failed to fetch uri %s', uri)

    # get the filenames that are properly fetchable
    #sort the list for consistent behaviour
    checked_filenames = sorted((set(filenames) - set(failed_filenames)))

    logger.info('start evidence processing pipeline')

    #create an iterable of lines from all file handles
    evs = IO.make_iter_lines(checked_filenames, first_n)

    #create functions with pre-baked arguments
    validation_on_start_baked = functools.partial(validation_on_start,
                                                  eco_scores_uri, schema_uri,
                                                  excluded_biotypes,
                                                  datasources_to_datatypes,
                                                  es_hosts, es_index_gene,
                                                  es_index_eco, es_index_efo)

    #here is the pipeline definition
    pl_stage = pr.map(process_evidence,
                      evs,
                      workers=workers_validation,
                      maxsize=queue_validation,
                      on_start=validation_on_start_baked)

    logger.info('stages created, running scoring and writing')

    with URLZSource(es_mappings_valid).open() as mappings_file:
        mappings_valid = json.load(mappings_file)

    with URLZSource(es_mappings_invalid).open() as mappings_file:
        mappings_invalid = json.load(mappings_file)

    with URLZSource(es_settings_valid).open() as settings_file:
        settings_valid = json.load(settings_file)

    with URLZSource(es_settings_invalid).open() as settings_file:
        settings_invalid = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, es_index_invalid, settings_invalid,
                                       mappings_invalid):
        with ElasticsearchBulkIndexManager(es, es_index_valid, settings_valid,
                                           mappings_valid):
            #load into elasticsearch
            chunk_size = 1000  #TODO make configurable
            actions = elasticsearch_actions(pl_stage, es_index_valid,
                                            es_index_invalid, es_doc_valid,
                                            es_doc_invalid)
            failcount = 0

            if not dry_run:
                results = None
                if workers_write > 0:
                    logger.debug("Using parallel bulk writer for Elasticearch")
                    # this can silently crash ?
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=workers_write,
                        queue_size=queue_write,
                        chunk_size=chunk_size)
                else:
                    logger.debug(
                        "Using streaming bulk writer for Elasticearch")
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)

                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)

            logger.info('stages created, ran scoring and writing')

    if failed_filenames:
        raise RuntimeError('unable to handle %s' % str(failed_filenames))
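Examples 10 and 11 pre-bake their worker initialisers with functools.partial so that each pipeline worker can build its own Elasticsearch client and lookup tables instead of sharing one client across processes. The generic pattern, with hypothetical worker_init/process_item names rather than the project's real functions, looks roughly like this:

import functools

def worker_init(hosts, index_name):
    # Runs once per worker; returns per-worker state (in the real pipeline this
    # is where a process-local Elasticsearch client would be created).
    return {"hosts": hosts, "index": index_name}

def process_item(item, state):
    # Uses the state built by worker_init.
    return "%s -> %s" % (item, state["index"])

# Bind the configuration up front so the pipeline stage only needs a
# zero-argument callable, mirroring validation_on_start_baked and
# score_producer_local_init_baked in these snippets.
worker_init_baked = functools.partial(worker_init,
                                      ["localhost:9200"], "demo-index")

if __name__ == "__main__":
    state = worker_init_baked()
    print(process_item("item-1", state))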
Example 11
    def process_all(self, dry_run):

        # do not pass this es object to other processes, single process only!
        es = new_es_client(self.es_hosts)

        targets = self.get_targets(es)

        self.logger.info('setting up stages')

        #bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(produce_evidence_local_init, 
            self.es_hosts, self.es_index_val_right,
            self.scoring_weights, self.is_direct_do_not_propagate, 
            self.datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(score_producer_local_init,
            self.datasources_to_datatypes, dry_run, self.es_hosts,
            self.es_index_gene, self.es_index_eco, self.es_index_hpa, self.es_index_efo)
        
        #pipeline stage for making the lists of the target/disease pairs and evidence
        pipeline_stage1 = pr.flat_map(produce_evidence, targets, 
            workers=self.workers_production,
            maxsize=self.queue_produce,
            on_start=produce_evidence_local_init_baked)

        #pipeline stage for scoring the evidence sets
        #includes writing to elasticsearch
        pipeline_stage2 = pr.map(score_producer, pipeline_stage1, 
            workers=self.workers_score,
            maxsize=self.queue_score,
            on_start=score_producer_local_init_baked)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
            #load into elasticsearch
            self.logger.info('stages created, running scoring and writing')
            client = es
            chunk_size = 1000 #TODO make configurable
            actions = self.elasticsearch_actions(pipeline_stage2, 
                self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    self.logger.debug("Using parallel bulk writer for Elasticearch")
                    results = elasticsearch.helpers.parallel_bulk(client, actions,
                            thread_count=self.workers_write,
                            queue_size=self.queue_write, 
                            chunk_size=chunk_size)
                else:
                    self.logger.debug("Using streaming bulk writer for Elasticearch")
                    results = elasticsearch.helpers.streaming_bulk(client, actions,
                            chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" % failcount)

        self.logger.info("DONE")