Example #1
    def _store_efo(self, dry_run):

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):

            #write into elasticsearch
            chunk_size = 1000 #TODO make configurable
            actions = elasticsearch_actions(self.efos.items(), self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(es, actions,
                            thread_count=self.workers_write,
                            queue_size=self.queue_write, 
                            chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(es, actions,
                            chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" % failcount)
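
Nearly every indexing method in these examples follows the same shape: load the index mappings and settings with URLZSource and json.load, open an ElasticsearchBulkIndexManager, turn the input into bulk actions, and count failures coming back from parallel_bulk or streaming_bulk. Below is a minimal, standalone sketch of that pattern, assuming a plain elasticsearch-py client; the hosts, index name and document generator are placeholders rather than values from the pipeline configuration.

import elasticsearch
import elasticsearch.helpers


def bulk_index(es, index_name, docs, chunk_size=1000):
    # wrap each document in a plain index action for the bulk helpers
    actions = ({"_index": index_name, "_source": doc} for doc in docs)
    failcount = 0
    # raise_on_error=False so failures are counted instead of aborting the stream
    for success, details in elasticsearch.helpers.streaming_bulk(
            es, actions, chunk_size=chunk_size, raise_on_error=False):
        if not success:
            failcount += 1
    if failcount:
        raise RuntimeError("%s documents failed to index" % failcount)


if __name__ == "__main__":
    client = elasticsearch.Elasticsearch(["http://localhost:9200"])
    bulk_index(client, "my-index", ({"value": i} for i in range(10)))
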
    def store(self, es, dry_run, data):
        self.logger.info("Starting drug storage")
        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):
            # write into elasticsearch
            chunk_size = 1000  # TODO make configurable
            actions = elasticsearch_actions(list(data.items()), self.es_index)
            failcount = 0
            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)

        self.logger.debug("Completed storage")
    def process_all(self, dry_run):

        es = new_es_client(self.es_hosts)
        threshold = 0.1
        evidence_count = 3
        target_data, disease_data = get_disease_to_targets_vectors(
            self.score_threshold, self.evidence_count, es, self.es_index_assoc)

        if len(target_data) == 0 or len(disease_data) == 0:
            raise Exception(
                'Could not find a set of targets AND diseases that had the sufficient number'
                ' of evidences or acceptable harmonic sum score')
        # sort the lists and always use the same order in all the steps
        disease_keys = sorted(disease_data.keys())
        target_keys = sorted(target_data.keys())

        self.logger.info('getting disease labels')
        disease_id_to_label = get_disease_labels(disease_keys, es,
                                                 self.es_index_efo)
        disease_labels = [
            disease_id_to_label[hit_id] for hit_id in disease_keys
        ]
        self.logger.info('getting target labels')
        target_id_to_label = get_target_labels(target_keys, es,
                                               self.es_index_gen)
        target_labels = [target_id_to_label[hit_id] for hit_id in target_keys]

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):

            # calculate and store disease-to-disease in multiple processes
            self.logger.info('handling disease-to-disease')
            handle_pairs(RelationType.SHARED_TARGET, disease_labels,
                         disease_data, disease_keys, target_keys, 0.19, 1024,
                         es, dry_run, self.ddr_workers_production,
                         self.ddr_workers_score, self.ddr_workers_write,
                         self.ddr_queue_production_score,
                         self.ddr_queue_score_result, self.ddr_queue_write,
                         self.es_index, self.es_doc)
            self.logger.info('handled disease-to-disease')

            # calculate and store target-to-target in multiple processes
            self.logger.info('handling target-to-target')
            handle_pairs(RelationType.SHARED_DISEASE, target_labels,
                         target_data, target_keys, disease_keys, 0.19, 1024,
                         es, dry_run, self.ddr_workers_production,
                         self.ddr_workers_score, self.ddr_workers_write,
                         self.ddr_queue_production_score,
                         self.ddr_queue_score_result, self.ddr_queue_write,
                         self.es_index, self.es_doc)
            self.logger.info('handled target-to-target')
Example #4
    def build_json(self, filename1, filename2):
        # *** Work through manually curated chemical probes from the different portals ***
        # chemicalprobes column names are Probe, Target, SGClink, CPPlink, OSPlink, Note
        with URLZSource(filename1).open() as r_file:
            for i, row in enumerate(csv.DictReader(r_file,
                                                   dialect='excel-tab'),
                                    start=1):
                # Generate 'line' for current target
                probelinks = []
                if row["SGClink"] != "":
                    probelinks.append({
                        'source': "Structural Genomics Consortium",
                        'link': row["SGClink"]
                    })
                if row["CPPlink"] != "":
                    probelinks.append({
                        'source': "Chemical Probes Portal",
                        'link': row["CPPlink"]
                    })
                if row["OSPlink"] != "":
                    probelinks.append({
                        'source': "Open Science Probes",
                        'link': row["OSPlink"]
                    })

                line = {
                    "gene": row["Target"],
                    "chemicalprobe": row["Probe"],
                    "sourcelinks": probelinks,
                    "note": row["Note"]
                }
                # Add data for current chemical probe to self.chemicalprobes[Target]['portalprobes']
                # If gene has not appeared in chemical probe list yet,
                # initialise self.chemicalprobes with an empty list
                if row["Target"] not in self.chemicalprobes:
                    self.chemicalprobes[row["Target"]] = {}
                    self.chemicalprobes[row["Target"]]['portalprobes'] = []
                self.chemicalprobes[row["Target"]]['portalprobes'].append(line)

        # *** Work through Probe Miner targets ***
        # probeminer column names are hgnc_symbol, uniprot_symbol, nr_of_probes
        with URLZSource(filename2).open() as r_file:
            for i, row in enumerate(csv.DictReader(r_file,
                                                   dialect='excel-tab'),
                                    start=1):
                PMdata = {
                    "probenumber": row["nr_of_probes"],
                    "link": "https://probeminer.icr.ac.uk/#/" + row["uniprot_symbol"]
                }
                if row["hgnc_symbol"] not in self.chemicalprobes:
                    self.chemicalprobes[row["hgnc_symbol"]] = {}
                self.chemicalprobes[row["hgnc_symbol"]]['probeminer'] = PMdata
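
For reference, after both passes above a single entry in self.chemicalprobes combines the manually curated portal probes and the Probe Miner data under one gene key. The sketch below shows that hypothetical shape; the gene symbol, probe name, links and counts are placeholders, not real data.

# hypothetical shape of one self.chemicalprobes entry; all values are placeholders
chemicalprobes_example = {
    "GENE1": {
        "portalprobes": [{
            "gene": "GENE1",
            "chemicalprobe": "Probe-1",
            "sourcelinks": [{
                "source": "Structural Genomics Consortium",
                "link": "https://example.org/sgc/probe-1"
            }],
            "note": ""
        }],
        "probeminer": {
            "probenumber": "3",
            "link": "https://probeminer.icr.ac.uk/#/P12345"
        }
    }
}
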
Example #5
    def process_all(self, dry_run):

        self.relations = dict()
        self.g.add_node('root', name="", species="")

        for row in self.downloader.get_pathway_data():
            self.g.add_node(row['id'],
                            name=row['name'],
                            species=row['species'])
        children = set()
        for row in self.downloader.get_pathway_relations():
            self.g.add_edge(row['id'], row['child'])
            children.add(row['child'])

        nodes_without_parent = set(self.g.nodes()) - children
        for node in nodes_without_parent:
            if node != 'root':
                self.g.add_edge('root', node)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):
            #write into elasticsearch
            chunk_size = 1000  #TODO make configurable
            docs = generate_documents(self.g)
            actions = elasticsearch_actions(docs, self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)
Example #6
    def merge_all(self, dry_run):

        es = new_es_client(self.es_hosts)

        #run the actual plugins
        for plugin_name in self.plugin_order:
            plugin = self.simplePluginManager.getPluginByName(plugin_name)

            # TODO remove the former redis object from all plugins
            plugin.plugin_object.merge_data(self.genes, es, None,
                                            self.data_config, self.es_config)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        # Hot fix for issue 643: missing pathway in the association; the reactome functions need a review
        for geneid, gene in self.genes.iterate():
            gene._create_suggestions()
            gene._create_facets()

        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):

            #write into elasticsearch
            chunk_size = 1000  #TODO make configurable
            actions = elasticsearch_actions(self.genes, self.es_index)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)
Example #7
    def process_all(self, dry_run):
        ''' process all the objects that need to be returned by the search method
        :return:
        '''

        es = new_es_client(self.es_hosts)
        #setup chembl handler
        self.chembl_handler = ChEMBLLookup(
            self.chembl_target_uri, self.chembl_mechanism_uri,
            self.chembl_component_uri, self.chembl_protein_uri,
            self.chembl_molecule_set_uri_pattern)
        self.chembl_handler.get_molecules_from_evidence(
            es, self.es_index_val_right)
        all_molecules = set()
        for target, molecules in self.chembl_handler.target2molecule.items():
            all_molecules = all_molecules | molecules
        all_molecules = sorted(all_molecules)
        query_batch_size = 100
        for i in range(0, len(all_molecules), query_batch_size):
            self.chembl_handler.populate_synonyms_for_molecule(
                all_molecules[i:i + query_batch_size],
                self.chembl_handler.molecule2synonyms)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):
            #process targets
            self.logger.info('handling targets')
            targets = self.get_targets(es)
            so_it = self.handle_search_object(targets, es,
                                              SearchObjectTypes.TARGET)
            store_in_elasticsearch(so_it, dry_run, es, self.es_index,
                                   self.es_doc, self.workers_write,
                                   self.queue_write)

            #process diseases
            self.logger.info('handling diseases')
            diseases = self.get_diseases(es)
            so_it = self.handle_search_object(diseases, es,
                                              SearchObjectTypes.DISEASE)
            store_in_elasticsearch(so_it, dry_run, es, self.es_index,
                                   self.es_doc, self.workers_write,
                                   self.queue_write)
    def create_shelf_csv(self, uris, key_col, dialect):
        # sanity check inputs
        assert uris is not None
        assert len(uris) > 0

        # shelve expects a database file in a specific format; to root it at a
        # temporary path, dbm.open(..., 'n') first creates an empty database
        # file that shelve can then open properly.

        # note: this file is never deleted!
        filename = tempfile.NamedTemporaryFile(delete=True).name
        shelf = shelve.Shelf(dict=dbm.open(filename, 'n'))
        for uri in uris:
            with URLZSource(uri).open() as f_obj:
                f_obj = codecs.getreader("utf-8")(f_obj)
                for row in csv.DictReader(f_obj, dialect=dialect):
                    key_value = row[key_col]
                    key = self.str_hook(key_value)
                    if key is not None:
                        if key in shelf:
                            raise ValueError("Duplicate key %s in uri %s" %
                                             (key, uri))
                        row_dict = dict(row)
                        del row_dict[key_col]
                        shelf[key] = row_dict
        return shelf
    def create_shelf_multi(self, uris, key_f):
        # sanity check inputs
        assert uris is not None
        assert len(uris) > 0

        # shelve expects a database file in a specific format; to root it at a
        # temporary path, dbm.open(..., 'n') first creates an empty database
        # file that shelve can then open properly.

        # note: this file is never deleted!
        filename = tempfile.NamedTemporaryFile(delete=True).name
        shelf = shelve.Shelf(dict=dbm.open(filename, 'n'))
        for uri in uris:
            with URLZSource(uri).open() as f_obj:
                # for python2 we need to decode utf-8
                if sys.version_info < (3, 0):
                    f_obj = codecs.getreader("utf-8")(f_obj)
                for line_no, line in enumerate(f_obj):
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError as e:
                        self.logger.error("Unable to read line %d %s", line_no,
                                          uri)
                        raise e

                    key_value = key_f(obj)
                    key = self.str_hook(key_value)
                    if key is not None:
                        existing = shelf.get(key, [])
                        existing.append(obj)
                        shelf[key] = existing
        return shelf
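
Both create_shelf_csv and create_shelf_multi rely on the shelve/dbm workaround described in their comments. A standalone sketch of that workaround (Python 3; the key and values are placeholders) looks like this:

import dbm
import shelve
import tempfile

# dbm.open(path, 'n') always creates a new, empty database at the given path,
# which shelve.Shelf can wrap directly; the temp file name is only used as a
# root path and the resulting database files are not cleaned up here.
filename = tempfile.NamedTemporaryFile(delete=True).name
shelf = shelve.Shelf(dict=dbm.open(filename, 'n'))

# the read-modify-write pattern used by create_shelf_multi: mutating
# shelf[key] in place would not be persisted, so the whole value is reassigned
existing = shelf.get('some-key', [])
existing.append({'field': 'value'})
shelf['some-key'] = existing

print(shelf['some-key'])
shelf.close()
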
Example #10
    def build_json(self, filename):

        with URLZSource(filename).open() as r_file:
            safety_data = json.load(r_file)
            for genekey in safety_data:
                if genekey not in self.safety:
                    self.safety[genekey] = safety_data[genekey]
    def __extract_protein_classes_from(uris):
        """uris is a list of filenames: str
        returns ({id:[{label:l, id:id},...],...}, {label:id,...})
        """
        classes = {}
        classes_inv_idx = {}

        for uri in uris:
            with URLZSource(uri).open() as f_obj:
                for line in f_obj:
                    i = json.loads(line)
                    protein_class_id = i.pop('protein_class_id')

                    gen = ((k, dict(label=v, id='')) for k, v in i.items()
                           if v)
                    protein_class_data = sorted(gen,
                                                key=lambda x: x[0],
                                                reverse=True)

                    classes[protein_class_id] = protein_class_data

                    label = protein_class_data[0][1]['label']
                    classes_inv_idx[label] = protein_class_id
        # inject missing ids
        for k, v in classes.items():
            for level, data in v:
                label = data['label']
                if label in classes_inv_idx:
                    data['id'] = classes_inv_idx[label]

        return classes, classes_inv_idx
    def create_subset(self, evidence_file, evidence_info):
        count = 0
        path_filename, filename_attr = os.path.split(evidence_file)
        new_filename = "subset_" + filename_attr.replace('.gz', '')
        uri_to_filename = self.output_dir + '/' + new_filename
        if os.path.exists(uri_to_filename):
            os.remove(uri_to_filename)
        self.stats[filename_attr]['ensembl'] = {}
        with open(uri_to_filename, "a+") as file_subset:
            with URLZSource(evidence_file).open() as f_obj:
                for line in f_obj:
                    try:
                        read_line = json.loads(line)
                        new_key = self.deref_multi(read_line,
                                                   evidence_info['subset_key'])
                        new_key = new_key.replace(
                            evidence_info['subset_prefix'], '')
                        count += 1
                        if new_key in self.elem_to_search:
                            file_subset.write(line)
                            key_stats = self.stats[filename_attr]['ensembl']
                            key_stats[new_key] = key_stats.get(new_key, 0) + 1
                    except Exception:
                        logging.info("Line is not valid JSON, skipping it")

            self.stats[filename_attr]['num_key'] = count
        logging.debug("Finished")
        return uri_to_filename
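
create_subset depends on a deref_multi helper (not shown in this excerpt) that walks a nested dict along the list of keys given in evidence_info['subset_key']. A minimal version of such a helper, offered here as an assumption about its behaviour, could be:

from functools import reduce


def deref_multi(data, keys):
    # e.g. deref_multi(doc, ['target', 'id']) -> doc['target']['id']
    return reduce(lambda d, key: d[key], keys, data)
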
Example #13
    def get_pathway_relations(self):
        added_relations = []
        with URLZSource(self.pathway_relation_url).open() as source:
            for i, row in enumerate(csv.DictReader(
                    source,
                    fieldnames=self.headers_pathway_rel,
                    dialect='excel-tab'),
                                    start=1):
                if len(row) != 2:
                    raise ValueError(
                        'Reactome.py: Pathway Relation file format unexpected at line %d.'
                        % i)

                parent_id = row["id"]
                child_id = row["related_id"]

                relation = (parent_id, child_id)
                if relation not in added_relations:
                    if parent_id in self.valid_pathway_ids:
                        yield dict(
                            id=parent_id,
                            child=child_id,
                        )
                        added_relations.append(relation)
                        if len(added_relations) % 1000 == 0:
                            self.logger.debug(
                                "%i rows parsed from reactome_pathway_relation"
                                % len(added_relations))
                else:
                    self.logger.warning(
                        "Pathway relation %s is already loaded, skipping duplicate data"
                        % str(relation))
        self.logger.info('parsed %i rows from reactome_pathway_relation' %
                         len(added_relations))
Example #14
    def create_shelf(self, uris, key_f):
        # shelve expects a database file in a specific format; to root it at a
        # temporary path, dbm.open(..., 'n') first creates an empty database
        # file that shelve can then open properly.

        #note: this file is never deleted!
        filename = tempfile.NamedTemporaryFile(delete=False).name
        shelf = shelve.Shelf(dict=dbm.open(filename, 'n'))
        for uri in uris:
            with URLZSource(uri).open() as f_obj:
                f_obj = codecs.getreader("utf-8")(f_obj)
                for line_no, line in enumerate(f_obj):
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError as e:
                        self.logger.error("Unable to read line %d %s %s",
                                          line_no, uri, e)
                        raise e

                    key = key_f(obj)
                    if key is not None:
                        if str(key) in shelf:
                            raise ValueError("Duplicate key %s in uri %s" %
                                             (key, uri))
                        shelf[str(key)] = obj
        return shelf
Example #15
def main():
    logging.config.fileConfig(file_or_resource('logging.ini'),
                              disable_existing_loggers=False)
    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(description='OpenTargets evs validator')
    parser.add_argument('data_source_file',
                        nargs='?',
                        default='-',
                        help='data source file to validate; default: read from STDIN')
    parser.add_argument("--schema",
                        dest='schema',
                        help="set the schema file to use",
                        action='store')
    parser.add_argument("--log-level",
                        dest='loglevel',
                        help="set the log level def: WARNING",
                        action='store',
                        default='WARNING')
    parser.add_argument(
        "--log-lines",
        dest='loglines',
        help="number of log errors to print out [no longer supported]",
        action='store',
        type=int,
        default=None)

    args = parser.parse_args()

    if args.loglevel:
        try:
            root_logger = logging.getLogger()
            root_logger.setLevel(logging.getLevelName(args.loglevel))
            logger.setLevel(logging.getLevelName(args.loglevel))
        except Exception as e:
            root_logger.exception(e)

    # TODO use a positional argument
    if not args.schema:
        logger.error('A --schema <schemafile> has to be specified.')
        return 1

    # warn and exit when using removed arguments
    if args.loglines is not None:
        logger.error("--log-lines is no longer supported")
        return 3

    valid = True
    if args.data_source_file == '-':
        valid = validate(sys.stdin, args.schema)
    else:
        with URLZSource(args.data_source_file).open() as fh:
            valid = validate(fh, args.schema)

    #if we had any validation errors, exit with status 2
    if not valid:
        return 2

    #if everything was fine, exit with status 0
    return 0
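
main() reports problems through its return value (1 for a missing schema, 2 for validation errors, 3 for removed arguments, 0 on success) instead of calling sys.exit itself. A typical entry point wrapping it, assumed here rather than shown in the excerpt, would be:

import sys

if __name__ == '__main__':
    sys.exit(main())
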
Example #16
    def get_pathway_data(self):
        self.valid_pathway_ids = []
        with URLZSource(self.pathway_data_url).open() as source:
            for i, row in enumerate(csv.DictReader(source,
                                                   fieldnames=self.headers,
                                                   dialect='excel-tab'),
                                    start=1):
                if len(row) != 3:
                    raise ValueError(
                        'Reactome.py: Pathway file format unexpected at line %d.'
                        % i)

                pathway_id = row["id"]
                pathway_name = row["description"]
                species = row["species"]

                if pathway_id not in self.valid_pathway_ids:
                    if species in self.allowed_species:
                        self.valid_pathway_ids.append(pathway_id)
                        yield dict(
                            id=pathway_id,
                            name=pathway_name,
                            species=species,
                        )
                        if len(self.valid_pathway_ids) % 1000 == 0:
                            self.logger.debug(
                                "%i rows parsed for reactome_pathway_data" %
                                len(self.valid_pathway_ids))
                else:
                    self.logger.warning(
                        "Pathway id %s is already loaded, skipping duplicate data"
                        % pathway_id)

        self.logger.info('parsed %i rows for reactome_pathway_data' %
                         len(self.valid_pathway_ids))
Example #17
 def build_json_experimental_toxicity(self, uri):
     with URLZSource(uri).open() as f_obj:
         for row in csv.DictReader(f_obj, dialect='excel-tab'):
             toxicity_json = self.exp_toxicity_json_format(row)
             genekey = row["ensembl_gene_id"].strip()
             if genekey not in self.experimental_toxicity:
                 self.experimental_toxicity[genekey] = []
             self.experimental_toxicity[genekey].append(toxicity_json)
Example #18
 def build_json_safety(self, filename):
     with URLZSource(filename).open() as r_file:
         safety_data = json.load(r_file)
         for genekey in safety_data:
             if genekey not in self.safety:
                 self.safety[genekey] = safety_data[genekey]
             else:
                 self._logger.info("Safety gene id duplicated: " + genekey)
Example #19
    def build_json(self, filename):
        # Just for reference: column names are: "ID_CENSUS_ANNOT", "ID_CENSUS", "ID_GENE", "GENE_NAME", "CELL_TYPE",
        # "PUBMED_PMID", "ID_DATA_CATEGORY", "DESCRIPTION", "DISPLAY", "SHORT", "CELL_LINE", "DESCRIPTION_1")
        with URLZSource(filename).open() as r_file:
            for i, row in enumerate(csv.DictReader(r_file, dialect='excel-tab'), start=1):

                PMID = re.sub(r'^"|"$', '', row["PUBMED_PMID"])
                Short = re.sub(r'^"|"$', '', row["SHORT"])
                GeneSymbol = re.sub(r'^"|"$', '', row["GENE_NAME"])
                Description_1 = re.sub(r'^"|"$', '', row["DESCRIPTION_1"])
                Description_1 = Description_1.rstrip()
                Description = re.sub(r'^"|"$', '', row["DESCRIPTION"])

                if GeneSymbol not in self.hallmarks:
                    self.hallmarks[GeneSymbol] = dict()

                if Description_1 in self.hallmarks_labels:
                    promote  = False
                    suppress = False

                    if Short == 'a': promote = True
                    if Short == 's': suppress = True

                    line = {
                             "label": Description_1,
                             "description": Description,
                             "promote": promote,
                             "suppress": suppress,
                             "pmid": PMID
                            }

                    try:
                        self.hallmarks[GeneSymbol]["cancer_hallmarks"].append(line)
                    except KeyError:
                        self.hallmarks[GeneSymbol]["cancer_hallmarks"] = list()
                        self.hallmarks[GeneSymbol]["cancer_hallmarks"].append(line)

                elif Description_1 == 'function summary':
                    line = {"pmid": PMID, "description": Description}

                    try:
                        self.hallmarks[GeneSymbol]["function_summary"].append(line)
                    except KeyError:
                        self.hallmarks[GeneSymbol]["function_summary"] = list()
                        self.hallmarks[GeneSymbol]["function_summary"].append(line)

                else:
                    line = {
                             "attribute_name": Description_1,
                             "description": Description,
                             "pmid": PMID
                           }

                    try:
                        self.hallmarks[GeneSymbol]["attributes"].append(line)
                    except KeyError:
                        self.hallmarks[GeneSymbol]["attributes"] = list()
                        self.hallmarks[GeneSymbol]["attributes"].append(line)
Example #20
    def store_data(self, dry_run):
        self.logger.info('store_data called')

        self.logger.debug('calling to create new expression index')

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):

            #write into elasticsearch
            chunk_size = 1000  #TODO make configurable
            actions = elasticsearch_actions(self.hpa_merged_table, dry_run,
                                            self.es_index)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)

        if failcount:
            raise RuntimeError("%s failed to index" % failcount)

        self.logger.info('missing tissues %s', str(_missing_tissues))
Example #21
    def retrieve_normal_tissue_data(self):
        """Parse 'normal_tissue' csv file,
        the expression profiles for proteins in human tissues from HPA

        :return: dict
        """
        self.logger.info('get normal tissue rows into dicts')
        table = (petl.fromcsv(
            URLZSource(self.normal_tissue_url), delimiter='\t'
        ).rename({
            'Tissue': 'tissue',
            'Cell type': 'cell_type',
            'Level': 'level',
            'Reliability': 'reliability',
            'Gene': 'gene'
        }).cut('tissue', 'cell_type', 'level', 'reliability', 'gene').addfield(
            'tissue_label',
            lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m)
        ).addfield('tissue_code', lambda rec: code_from_tissue(
            rec['tissue_label'], self.t2m)).addfield(
                'tissue_level',
                lambda rec: level_from_text(rec['level'])).addfield(
                    'anatomical_systems',
                    lambda rec: asys_from_tissue(rec['tissue_label'], self.t2m)
                ).addfield(
                    'organs', lambda rec: organs_from_tissue(
                        rec['tissue_label'], self.t2m)).addfield(
                            'tissue_reliability', lambda rec:
                            reliability_from_text(rec['reliability'])).cut(
                                'gene', 'tissue_code', 'tissue_label',
                                'tissue_level', 'tissue_reliability',
                                'cell_type', 'anatomical_systems',
                                'organs').aggregate(
                                    ('gene', 'tissue_code'),
                                    aggregation={
                                        'cell_types':
                                        (('cell_type', 'tissue_level',
                                          'tissue_reliability'), list),
                                        'tissue_label': ('tissue_label', set),
                                        'anatomical_systems':
                                        ('anatomical_systems', list),
                                        'organs': ('organs', list)
                                    },
                                    presorted=True).
                 aggregate('gene',
                           aggregation={
                               'data':
                               (('tissue_code', 'tissue_label', 'cell_types',
                                 'anatomical_systems', 'organs'), list)
                           },
                           presorted=True).addfield(
                               'result',
                               lambda rec: format_expression(rec)).cut(
                                   'gene', 'result'))

        return table
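
The chain above leans on two petl operations, addfield and aggregate. The toy example below illustrates how they compose; the rows are made up and unrelated to the HPA data.

import petl

rows = petl.fromdicts([
    {'gene': 'G1', 'tissue': 'liver', 'level': 'High'},
    {'gene': 'G1', 'tissue': 'brain', 'level': 'Low'},
    {'gene': 'G2', 'tissue': 'liver', 'level': 'Medium'},
])

toy = (
    rows
    # derive a numeric level, similar in spirit to level_from_text above
    .addfield('level_code',
              lambda rec: {'Low': 1, 'Medium': 2, 'High': 3}[rec['level']])
    # collapse to one row per gene, collecting (tissue, level_code) pairs
    .aggregate('gene',
               aggregation={'tissues': (('tissue', 'level_code'), list)},
               presorted=True)
)

print(petl.look(toy))
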
Example #22
    def process(self, dry_run):
        def _put_line(line):
            return 1

        self.logger.info('Reading Ensembl gene info from %s' %
                         self.ensembl_filename)

        lines = more_itertools.with_iter(
            URLZSource(self.ensembl_filename).open())

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):
            #write into elasticsearch
            chunk_size = 1000  #TODO make configurable
            actions = elasticsearch_actions(lines, self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)
Example #23
    def test_minimal(self):
        resources_path = os.path.dirname(os.path.realpath(__file__))
        data_source_file = os.path.join(resources_path, "resources", "minimal.data.json")
        schema_source_file = os.path.join(resources_path, "resources", "minimal.schema.json")
        schema_uri = "file://" + schema_source_file

        with URLZSource(data_source_file).open() as data_file_handle:
            valid = validate(data_file_handle, schema_uri)
            self.assertTrue(valid)
Example #24
def generate_uniprot(uri):
    with URLZSource(uri).open() as r_file:
        for event, elem in etree.iterparse(r_file, events=("end",), 
                tag='{http://uniprot.org/uniprot}entry'):

            #parse the XML into an object
            entry = Parser(elem, return_raw_comments=False).parse()
            elem.clear()

            yield entry
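
The generator above streams a potentially very large UniProt XML export with lxml's iterparse and clears each element after parsing so memory use stays flat. A self-contained sketch of that pattern, without the pipeline's Parser class and with a placeholder file path, is:

from lxml import etree

UNIPROT_NS = '{http://uniprot.org/uniprot}'


def iter_accessions(xml_path):
    # stream <entry> elements one at a time instead of loading the whole tree
    for _event, elem in etree.iterparse(xml_path, events=('end',),
                                        tag=UNIPROT_NS + 'entry'):
        accession = elem.findtext(UNIPROT_NS + 'accession')
        elem.clear()  # free the subtree that was just processed
        if accession is not None:
            yield accession


# usage sketch: print the first few accessions from a local copy of the export
for i, acc in enumerate(iter_accessions('uniprot_sprot_human.xml')):
    if i >= 5:
        break
    print(acc)
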
Example #25
def get_list_of_file_download(config_file, headers):
    number_elem = len(headers)
    result = {}
    with URLZSource(config_file).open() as source:
        for i, row in enumerate(csv.DictReader(source, fieldnames=headers), start=1):
            if len(row) != number_elem:
                raise ValueError('File format unexpected at line %d.' % i)

            for item in row:
                result[item] = row[item]
            yield result
    def download_molecules_linked_to_target(self):
        '''generate a dictionary with all the synonyms known for a given molecule;
        only retrieves molecules linked to a target. Fetches all the targets
        from chembl and stores their data and a mapping to uniprot id'''

        for uri in self.target_uri:
            with URLZSource(uri).open() as f_obj:
                for line in f_obj:
                    i = json.loads(line)
                    if 'target_components' in i and \
                            i['target_components'] and \
                            'accession' in i['target_components'][0] and \
                            i['target_components'][0]['accession']:
                        uniprot_id = i['target_components'][0]['accession']
                        self.targets[uniprot_id] = i
                        self.uni2chembl[uniprot_id] = i['target_chembl_id']

        allowed_target_chembl_ids = set(self.uni2chembl.values())
        for uri in self.mechanism_uri:
            with URLZSource(uri).open() as f_obj:
                for line in f_obj:
                    i = json.loads(line)
                    self.mechanisms[i['record_id']] = i
                    target_id = i['target_chembl_id']
                    if target_id in allowed_target_chembl_ids:
                        if target_id not in self.target2molecule:
                            self.target2molecule[target_id] = set()
                        self.target2molecule[target_id].add(
                            i['molecule_chembl_id'])

        required_molecules = set()
        self._logger.info('chembl t2m mols')
        for molecules in list(self.target2molecule.values()):
            for molecule in molecules:
                required_molecules.add(molecule)
        required_molecules = list(required_molecules)
        batch_size = 100
        self._logger.debug('chembl populate synonyms')
        for i in range(0, len(required_molecules), batch_size):
            self.populate_synonyms_for_molecule(
                required_molecules[i:i + batch_size], self.molecule2synonyms)
Example #27
    def download_protein_classification(self):
        '''fetches target components from chembl and injects the target class data into self.protein_classification'''

        for uri in self.protein_uri:
            with URLZSource(uri).open() as f_obj:
                for line in f_obj:
                    i = json.loads(line)
                    protein_class_id = i.pop('protein_class_id')
                    protein_class_data = dict((k, dict(label=v, id=''))
                                              for k, v in i.items()
                                              if v)  # remove values with none
                    self.protein_class[protein_class_id] = protein_class_data

                    max_level = 0
                    label = ''
                    for k, v in protein_class_data.items():
                        level = int(k[1])
                        if level >= max_level:
                            max_level = level
                            label = v['label']
                    self.protein_class_label_to_id[label] = protein_class_id
        # inject missing ids
        for k, v in self.protein_class.items():
            for level, data in v.items():
                label = data['label']
                if label in self.protein_class_label_to_id:
                    data['id'] = self.protein_class_label_to_id[label]

        for uri in self.component_uri:
            with URLZSource(uri).open() as f_obj:
                for line in f_obj:
                    i = json.loads(line)
                    if 'accession' in i:
                        if i['accession'] not in self.protein_classification:
                            self.protein_classification[i['accession']] = []
                        for classification in i['protein_classifications']:
                            protein_class_id = classification[
                                'protein_classification_id']
                            self.protein_classification[i['accession']].append(
                                self.protein_class[protein_class_id])
    def merge_data(self, genes, es, r_server, data_config, es_config):

        self._logger.info("HGNC parsing - requesting from URL %s", data_config.hgnc_complete_set)

        with URLZSource(data_config.hgnc_complete_set).open() as source:

            data = json.load(source)

            for row in data['response']['docs']:
                gene = Gene()
                self.load_hgnc_data_from_json(gene, row)
                genes.add_gene(gene)

            self._logger.info("STATS AFTER HGNC PARSING:\n" + genes.get_stats())
Example #29
    def process(self, dry_run):
        self.logger.debug("download uniprot uri %s", self.uri)
        self.logger.debug("to generate this file you have to call this url "
                            "https://www.uniprot.org/uniprot/?query=reviewed%3Ayes%2BAND%2Borganism%3A9606&compress=yes&format=xml")

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)
        chunk_size = 1000 # TODO make configurable
        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):

            items = generate_uniprot(self.uri)
            actions = elasticsearch_actions(items, self.es_index, self.es_doc)

            #write into elasticsearch
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(es, actions,
                            thread_count=self.workers_write,
                            queue_size=self.queue_write, 
                            chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(es, actions,
                            chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" % failcount)
Example #30
    def __init__(self, tissue_translation_map, tissue_curation_map,
                 normal_tissue_url, rna_level_url, rna_value_url,
                 rna_zscore_url):
        self.logger = logging.getLogger(__name__)
        self.tissue_translation_map = tissue_translation_map
        self.tissue_curation_map = tissue_curation_map
        self.normal_tissue_url = normal_tissue_url
        self.rna_level_url = rna_level_url
        self.rna_value_url = rna_value_url
        self.rna_zscore_url = rna_zscore_url

        #load t2m
        t2m = {'tissues': {}, 'curations': {}}

        with URLZSource(self.tissue_translation_map).open(mode='rb') as r_file:
            t2m['tissues'] = json.load(r_file)['tissues']

        with URLZSource(self.tissue_curation_map).open(mode='rb') as r_file:
            for line in r_file:
                line = line.strip()
                line = line.split('\t')
                t2m['curations'][line[0].strip()] = line[1].strip()

        self.t2m = t2m