def _store_efo(self, dry_run):
    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(self.efos.items(), self.es_index, self.es_doc)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(
                    es, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(
                    es, actions, chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s EFO documents failed to index" % failcount)

def store(self, es, dry_run, data):
    self.logger.info("Starting drug storage")

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(list(data.items()), self.es_index)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(
                    es, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(
                    es, actions, chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s drug documents failed to index" % failcount)

    self.logger.debug("Completed storage")

def process_all(self, dry_run):
    es = new_es_client(self.es_hosts)

    threshold = 0.1
    evidence_count = 3
    target_data, disease_data = get_disease_to_targets_vectors(
        self.score_threshold, self.evidence_count, es, self.es_index_assoc)

    if len(target_data) == 0 or len(disease_data) == 0:
        raise Exception(
            'Could not find a set of targets AND diseases that had the sufficient number'
            ' of evidences or acceptable harmonic sum score')

    '''sort the lists and keep using the same order in all the steps'''
    disease_keys = sorted(disease_data.keys())
    target_keys = sorted(target_data.keys())

    self.logger.info('getting disease labels')
    disease_id_to_label = get_disease_labels(disease_keys, es, self.es_index_efo)
    disease_labels = [disease_id_to_label[hit_id] for hit_id in disease_keys]

    self.logger.info('getting target labels')
    target_id_to_label = get_target_labels(target_keys, es, self.es_index_gen)
    target_labels = [target_id_to_label[hit_id] for hit_id in target_keys]

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # calculate and store disease-to-disease in multiple processes
        self.logger.info('handling disease-to-disease')
        handle_pairs(RelationType.SHARED_TARGET, disease_labels, disease_data,
                     disease_keys, target_keys, 0.19, 1024, es, dry_run,
                     self.ddr_workers_production, self.ddr_workers_score,
                     self.ddr_workers_write, self.ddr_queue_production_score,
                     self.ddr_queue_score_result, self.ddr_queue_write,
                     self.es_index, self.es_doc)
        self.logger.info('handled disease-to-disease')

        # calculate and store target-to-target in multiple processes
        self.logger.info('handling target-to-target')
        handle_pairs(RelationType.SHARED_DISEASE, target_labels, target_data,
                     target_keys, disease_keys, 0.19, 1024, es, dry_run,
                     self.ddr_workers_production, self.ddr_workers_score,
                     self.ddr_workers_write, self.ddr_queue_production_score,
                     self.ddr_queue_score_result, self.ddr_queue_write,
                     self.es_index, self.es_doc)
        self.logger.info('handled target-to-target')

def build_json(self, filename1, filename2):
    # *** Work through manually curated chemical probes from the different portals ***
    # chemicalprobes column names are Probe, Target, SGClink, CPPlink, OSPlink, Note
    with URLZSource(filename1).open() as r_file:
        for i, row in enumerate(csv.DictReader(r_file, dialect='excel-tab'), start=1):
            # Generate 'line' for current target
            probelinks = []
            if row["SGClink"] != "":
                probelinks.append({'source': "Structural Genomics Consortium",
                                   'link': row["SGClink"]})
            if row["CPPlink"] != "":
                probelinks.append({'source': "Chemical Probes Portal",
                                   'link': row["CPPlink"]})
            if row["OSPlink"] != "":
                probelinks.append({'source': "Open Science Probes",
                                   'link': row["OSPlink"]})

            line = {
                "gene": row["Target"],
                "chemicalprobe": row["Probe"],
                "sourcelinks": probelinks,
                "note": row["Note"]
            }
            # Add data for current chemical probe to self.chemicalprobes[Target]['portalprobes']
            # If gene has not appeared in chemical probe list yet,
            # initialise self.chemicalprobes with an empty list
            if row["Target"] not in self.chemicalprobes:
                self.chemicalprobes[row["Target"]] = {}
                self.chemicalprobes[row["Target"]]['portalprobes'] = []
            self.chemicalprobes[row["Target"]]['portalprobes'].append(line)

    # *** Work through Probe Miner targets ***
    # probeminer column names are hgnc_symbol, uniprot_symbol, nr_of_probes
    with URLZSource(filename2).open() as r_file:
        for i, row in enumerate(csv.DictReader(r_file, dialect='excel-tab'), start=1):
            PMdata = {
                "probenumber": row["nr_of_probes"],
                "link": "https://probeminer.icr.ac.uk/#/" + row["uniprot_symbol"]
            }
            if row["hgnc_symbol"] not in self.chemicalprobes:
                self.chemicalprobes[row["hgnc_symbol"]] = {}
            self.chemicalprobes[row["hgnc_symbol"]]['probeminer'] = PMdata

def process_all(self, dry_run):
    self.relations = dict()
    self.g.add_node('root', name="", species="")

    for row in self.downloader.get_pathway_data():
        self.g.add_node(row['id'], name=row['name'], species=row['species'])

    children = set()
    for row in self.downloader.get_pathway_relations():
        self.g.add_edge(row['id'], row['child'])
        children.add(row['child'])

    nodes_without_parent = set(self.g.nodes()) - children
    for node in nodes_without_parent:
        if node != 'root':
            self.g.add_edge('root', node)

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        docs = generate_documents(self.g)
        actions = elasticsearch_actions(docs, self.es_index, self.es_doc)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(
                    es, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(
                    es, actions, chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s pathway documents failed to index" % failcount)

def merge_all(self, dry_run):
    es = new_es_client(self.es_hosts)

    # run the actual plugins
    for plugin_name in self.plugin_order:
        plugin = self.simplePluginManager.getPluginByName(plugin_name)
        # TODO remove the former redis object from all plugins
        plugin.plugin_object.merge_data(self.genes, es, None,
                                        self.data_config, self.es_config)

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    # Hot fix for issue 643: missing pathway in the association. The reactome functions need a review.
    for geneid, gene in self.genes.iterate():
        gene._create_suggestions()
        gene._create_facets()

    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(self.genes, self.es_index)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(
                    es, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(
                    es, actions, chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s gene documents failed to index" % failcount)

def process_all(self, dry_run):
    '''
    process all the objects that need to be returned by the search method
    :return:
    '''
    es = new_es_client(self.es_hosts)

    # setup chembl handler
    self.chembl_handler = ChEMBLLookup(
        self.chembl_target_uri,
        self.chembl_mechanism_uri,
        self.chembl_component_uri,
        self.chembl_protein_uri,
        self.chembl_molecule_set_uri_pattern)
    self.chembl_handler.get_molecules_from_evidence(es, self.es_index_val_right)

    all_molecules = set()
    for target, molecules in self.chembl_handler.target2molecule.items():
        all_molecules = all_molecules | molecules
    all_molecules = sorted(all_molecules)

    query_batch_size = 100
    for i in range(0, len(all_molecules) + 1, query_batch_size):
        self.chembl_handler.populate_synonyms_for_molecule(
            all_molecules[i:i + query_batch_size],
            self.chembl_handler.molecule2synonyms)

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # process targets
        self.logger.info('handling targets')
        targets = self.get_targets(es)
        so_it = self.handle_search_object(targets, es, SearchObjectTypes.TARGET)
        store_in_elasticsearch(so_it, dry_run, es, self.es_index, self.es_doc,
                               self.workers_write, self.queue_write)

        # process diseases
        self.logger.info('handling diseases')
        diseases = self.get_diseases(es)
        so_it = self.handle_search_object(diseases, es, SearchObjectTypes.DISEASE)
        store_in_elasticsearch(so_it, dry_run, es, self.es_index, self.es_doc,
                               self.workers_write, self.queue_write)

def create_shelf_csv(self, uris, key_col, dialect):
    # sanity check inputs
    assert uris is not None
    assert len(uris) > 0

    # Shelve creates a file with specific database. Using a temp file requires a workaround to open it.
    # dumbdbm creates an empty database file. In this way shelve can open it properly.
    # note: this file is never deleted!
    filename = tempfile.NamedTemporaryFile(delete=True).name
    shelf = shelve.Shelf(dict=dbm.open(filename, 'n'))

    for uri in uris:
        with URLZSource(uri).open() as f_obj:
            f_obj = codecs.getreader("utf-8")(f_obj)
            for row in csv.DictReader(f_obj, dialect=dialect):
                key_value = row[key_col]
                key = self.str_hook(key_value)
                if key is not None:
                    if key in shelf:
                        raise ValueError("Duplicate key %s in uri %s" % (key, uri))
                    row_dict = dict(row)
                    del row_dict[key_col]
                    shelf[key] = row_dict
    return shelf

def create_shelf_multi(self, uris, key_f):
    # sanity check inputs
    assert uris is not None
    assert len(uris) > 0

    # Shelve creates a file with specific database. Using a temp file requires a workaround to open it.
    # dumbdbm creates an empty database file. In this way shelve can open it properly.
    # note: this file is never deleted!
    filename = tempfile.NamedTemporaryFile(delete=True).name
    shelf = shelve.Shelf(dict=dbm.open(filename, 'n'))

    for uri in uris:
        with URLZSource(uri).open() as f_obj:
            # for python2 we need to decode utf-8
            if sys.version_info < (3, 0):
                f_obj = codecs.getreader("utf-8")(f_obj)
            for line_no, line in enumerate(f_obj):
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError as e:
                    self.logger.error("Unable to read line %d %s", line_no, uri)
                    raise e
                key_value = key_f(obj)
                key = self.str_hook(key_value)
                if key is not None:
                    existing = shelf.get(key, [])
                    existing.append(obj)
                    shelf[key] = existing
    return shelf

def build_json(self, filename):
    with URLZSource(filename).open() as r_file:
        safety_data = json.load(r_file)
        for genekey in safety_data:
            if genekey not in self.safety:
                self.safety[genekey] = safety_data[genekey]

def __extract_protein_classes_from(uris):
    """uris is a list of filenames: str
    returns ({id:[{label:l, id:id},...],...}, {label:id,...})
    """
    classes = {}
    classes_inv_idx = {}

    for uri in uris:
        with URLZSource(uri).open() as f_obj:
            for line in f_obj:
                i = json.loads(line)
                protein_class_id = i.pop('protein_class_id')
                gen = ((k, dict(label=v, id='')) for k, v in i.items() if v)
                protein_class_data = sorted(gen, key=lambda x: x[0], reverse=True)
                classes[protein_class_id] = protein_class_data
                label = protein_class_data[0][1]['label']
                classes_inv_idx[label] = protein_class_id

    '''inject missing ids'''
    for k, v in classes.items():
        for level, data in v:
            label = data['label']
            if label in classes_inv_idx:
                data['id'] = classes_inv_idx[label]

    return classes, classes_inv_idx

def create_subset(self, evidence_file, evidence_info):
    count = 0
    path_filename, filename_attr = os.path.split(evidence_file)
    new_filename = "subset_" + filename_attr.replace('.gz', '')
    uri_to_filename = self.output_dir + '/' + new_filename
    if os.path.exists(uri_to_filename):
        os.remove(uri_to_filename)

    self.stats[filename_attr]['ensembl'] = {}
    with open(uri_to_filename, "a+") as file_subset:
        with URLZSource(evidence_file).open() as f_obj:
            for line in f_obj:
                try:
                    read_line = json.loads(line)
                    new_key = self.deref_multi(read_line, evidence_info['subset_key'])
                    new_key = new_key.replace(evidence_info['subset_prefix'], '')
                    count = count + 1
                    if new_key in self.elem_to_search:
                        file_subset.write(line)
                        if new_key not in self.stats[filename_attr]['ensembl']:
                            self.stats[filename_attr]['ensembl'][new_key] = 1
                        else:
                            self.stats[filename_attr]['ensembl'][new_key] += 1
                except Exception as e:
                    logging.info("Line is not valid JSON, skipping it")

    self.stats[filename_attr]['num_key'] = count
    logging.debug("Finished")
    return uri_to_filename

def get_pathway_relations(self):
    added_relations = []
    with URLZSource(self.pathway_relation_url).open() as source:
        for i, row in enumerate(csv.DictReader(
                source,
                fieldnames=self.headers_pathway_rel,
                dialect='excel-tab'), start=1):
            if len(row) != 2:
                raise ValueError(
                    'Reactome.py: Pathway Relation file format unexpected at line %d.' % i)

            parent_id = row["id"]
            child_id = row["related_id"]

            relation = (parent_id, child_id)
            if relation not in added_relations:
                if parent_id in self.valid_pathway_ids:
                    yield dict(
                        id=parent_id,
                        child=child_id,
                    )
                    added_relations.append(relation)
                    if len(added_relations) % 1000 == 0:
                        self.logger.debug(
                            "%i rows parsed from reactome_pathway_relation" % len(added_relations))
            else:
                self.logger.warning(
                    "Pathway relation %s is already loaded, skipping duplicate data" % str(relation))

    self.logger.info('parsed %i rows from reactome_pathway_relation' % len(added_relations))

def create_shelf(self, uris, key_f):
    # Shelve creates a file with specific database. Using a temp file requires a workaround to open it.
    # dumbdbm creates an empty database file. In this way shelve can open it properly.
    # note: this file is never deleted!
    filename = tempfile.NamedTemporaryFile(delete=False).name
    shelf = shelve.Shelf(dict=dbm.open(filename, 'n'))

    for uri in uris:
        with URLZSource(uri).open() as f_obj:
            f_obj = codecs.getreader("utf-8")(f_obj)
            for line_no, line in enumerate(f_obj):
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError as e:
                    self.logger.error("Unable to read line %d %s %s", line_no, uri, e)
                    raise e
                key = key_f(obj)
                if key is not None:
                    if str(key) in shelf:
                        raise ValueError("Duplicate key %s in uri %s" % (key, uri))
                    shelf[str(key)] = obj
    return shelf

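# Hedged usage sketch for create_shelf (the URI and key field below are hypothetical,
# not taken from the pipeline config): given a JSON-lines file where every object
# carries an "id" field,
#
#     shelf = self.create_shelf(["file:///tmp/objects.json.gz"],
#                               lambda obj: obj["id"])
#     obj = shelf["some-id"]
#
# would build a disk-backed, dict-like store keyed by str(id), raising ValueError
# if the same key appears twice across the input URIs. The _csv and _multi variants
# above follow the same pattern with a CSV column or a list-per-key instead.
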
def main():
    logging.config.fileConfig(file_or_resource('logging.ini'),
                              disable_existing_loggers=False)
    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(description='OpenTargets evs validator')
    parser.add_argument('data_source_file', nargs='?', default='-',
                        help='data source file to validate (default: STDIN)')
    parser.add_argument("--schema", dest='schema',
                        help="set the schema file to use",
                        action='store')
    parser.add_argument("--log-level", dest='loglevel',
                        help="set the log level def: WARNING",
                        action='store', default='WARNING')
    parser.add_argument("--log-lines", dest='loglines',
                        help="number of log errors to print out [no longer supported]",
                        action='store', type=int, default=None)
    args = parser.parse_args()

    if args.loglevel:
        try:
            root_logger = logging.getLogger()
            root_logger.setLevel(logging.getLevelName(args.loglevel))
            logger.setLevel(logging.getLevelName(args.loglevel))
        except Exception as e:
            root_logger.exception(e)

    # TODO use a positional argument
    if not args.schema:
        logger.error('A --schema <schemafile> has to be specified.')
        return 1

    # warn and exit when using removed arguments
    if args.loglines is not None:
        logger.error("--log-lines is no longer supported")
        return 3

    valid = True
    if args.data_source_file == '-':
        valid = validate(sys.stdin, args.schema)
    else:
        with URLZSource(args.data_source_file).open() as fh:
            valid = validate(fh, args.schema)

    # if we had any validation errors, exit with status 2
    if not valid:
        return 2

    # if everything was fine, exit with status 0
    return 0

def get_pathway_data(self):
    self.valid_pathway_ids = []
    with URLZSource(self.pathway_data_url).open() as source:
        for i, row in enumerate(csv.DictReader(source,
                                               fieldnames=self.headers,
                                               dialect='excel-tab'), start=1):
            if len(row) != 3:
                raise ValueError(
                    'Reactome.py: Pathway file format unexpected at line %d.' % i)

            pathway_id = row["id"]
            pathway_name = row["description"]
            species = row["species"]

            if pathway_id not in self.valid_pathway_ids:
                if species in self.allowed_species:
                    self.valid_pathway_ids.append(pathway_id)
                    yield dict(
                        id=pathway_id,
                        name=pathway_name,
                        species=species,
                    )
                    if len(self.valid_pathway_ids) % 1000 == 0:
                        self.logger.debug(
                            "%i rows parsed for reactome_pathway_data" % len(self.valid_pathway_ids))
            else:
                self.logger.warning(
                    "Pathway id %s is already loaded, skipping duplicate data" % pathway_id)

    self.logger.info('parsed %i rows for reactome_pathway_data' % len(self.valid_pathway_ids))

def build_json_experimental_toxicity(self, uri):
    with URLZSource(uri).open() as f_obj:
        for row in csv.DictReader(f_obj, dialect='excel-tab'):
            toxicity_json = self.exp_toxicity_json_format(row)
            genekey = row["ensembl_gene_id"].strip()
            if genekey not in self.experimental_toxicity:
                self.experimental_toxicity[genekey] = []
            self.experimental_toxicity[genekey].append(toxicity_json)

def build_json_safety(self, filename):
    with URLZSource(filename).open() as r_file:
        safety_data = json.load(r_file)
        for genekey in safety_data:
            if genekey not in self.safety:
                self.safety[genekey] = safety_data[genekey]
            else:
                self._logger.info("Safety gene id duplicated: " + genekey)

def build_json(self, filename):
    # Just for reference: column names are: "ID_CENSUS_ANNOT", "ID_CENSUS", "ID_GENE", "GENE_NAME", "CELL_TYPE",
    # "PUBMED_PMID", "ID_DATA_CATEGORY", "DESCRIPTION", "DISPLAY", "SHORT", "CELL_LINE", "DESCRIPTION_1"
    with URLZSource(filename).open() as r_file:
        for i, row in enumerate(csv.DictReader(r_file, dialect='excel-tab'), start=1):
            PMID = re.sub(r'^"|"$', '', row["PUBMED_PMID"])
            Short = re.sub(r'^"|"$', '', row["SHORT"])
            GeneSymbol = re.sub(r'^"|"$', '', row["GENE_NAME"])
            Description_1 = re.sub(r'^"|"$', '', row["DESCRIPTION_1"])
            Description_1 = Description_1.rstrip()
            Description = re.sub(r'^"|"$', '', row["DESCRIPTION"])

            if GeneSymbol not in self.hallmarks:
                self.hallmarks[GeneSymbol] = dict()

            if Description_1 in self.hallmarks_labels:
                promote = False
                suppress = False
                if Short == 'a':
                    promote = True
                if Short == 's':
                    suppress = True

                line = {
                    "label": Description_1,
                    "description": Description,
                    "promote": promote,
                    "suppress": suppress,
                    "pmid": PMID
                }
                try:
                    self.hallmarks[GeneSymbol]["cancer_hallmarks"].append(line)
                except KeyError:
                    self.hallmarks[GeneSymbol]["cancer_hallmarks"] = list()
                    self.hallmarks[GeneSymbol]["cancer_hallmarks"].append(line)
            elif Description_1 == 'function summary':
                line = {"pmid": PMID, "description": Description}
                try:
                    self.hallmarks[GeneSymbol]["function_summary"].append(line)
                except KeyError:
                    self.hallmarks[GeneSymbol]["function_summary"] = list()
                    self.hallmarks[GeneSymbol]["function_summary"].append(line)
            else:
                line = {
                    "attribute_name": Description_1,
                    "description": Description,
                    "pmid": PMID
                }
                try:
                    self.hallmarks[GeneSymbol]["attributes"].append(line)
                except KeyError:
                    self.hallmarks[GeneSymbol]["attributes"] = list()
                    self.hallmarks[GeneSymbol]["attributes"].append(line)

def store_data(self, dry_run):
    self.logger.info('store_data called')
    self.logger.debug('calling to create new expression index')

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(self.hpa_merged_table, dry_run, self.es_index)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(
                    es, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(
                    es, actions, chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s expression documents failed to index" % failcount)

    self.logger.info('missing tissues %s', str(_missing_tissues))

def retrieve_normal_tissue_data(self):
    """Parse the 'normal_tissue' csv file, the expression profiles for proteins in human tissues from HPA

    :return: petl table with 'gene' and 'result' columns
    """
    self.logger.info('get normal tissue rows into dicts')
    table = (
        petl.fromcsv(URLZSource(self.normal_tissue_url), delimiter='\t')
        .rename({
            'Tissue': 'tissue',
            'Cell type': 'cell_type',
            'Level': 'level',
            'Reliability': 'reliability',
            'Gene': 'gene'
        })
        .cut('tissue', 'cell_type', 'level', 'reliability', 'gene')
        .addfield('tissue_label',
                  lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
        .addfield('tissue_code',
                  lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
        .addfield('tissue_level',
                  lambda rec: level_from_text(rec['level']))
        .addfield('anatomical_systems',
                  lambda rec: asys_from_tissue(rec['tissue_label'], self.t2m))
        .addfield('organs',
                  lambda rec: organs_from_tissue(rec['tissue_label'], self.t2m))
        .addfield('tissue_reliability',
                  lambda rec: reliability_from_text(rec['reliability']))
        .cut('gene', 'tissue_code', 'tissue_label', 'tissue_level',
             'tissue_reliability', 'cell_type', 'anatomical_systems', 'organs')
        .aggregate(('gene', 'tissue_code'),
                   aggregation={
                       'cell_types': (('cell_type', 'tissue_level', 'tissue_reliability'), list),
                       'tissue_label': ('tissue_label', set),
                       'anatomical_systems': ('anatomical_systems', list),
                       'organs': ('organs', list)
                   },
                   presorted=True)
        .aggregate('gene',
                   aggregation={
                       'data': (('tissue_code', 'tissue_label', 'cell_types',
                                 'anatomical_systems', 'organs'), list)
                   },
                   presorted=True)
        .addfield('result', lambda rec: format_expression(rec))
        .cut('gene', 'result')
    )
    return table

def process(self, dry_run):
    def _put_line(line):
        return 1

    self.logger.info('Reading Ensembl gene info from %s' % self.ensembl_filename)

    lines = more_itertools.with_iter(URLZSource(self.ensembl_filename).open())

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(lines, self.es_index, self.es_doc)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(
                    es, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(
                    es, actions, chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s Ensembl documents failed to index" % failcount)

def test_minimal(self):
    resources_path = os.path.dirname(os.path.realpath(__file__))
    data_source_file = resources_path + os.path.sep + "resources" + os.path.sep + "minimal.data.json"
    schema_source_file = resources_path + os.path.sep + "resources" + os.path.sep + "minimal.schema.json"
    schema_uri = "file://" + schema_source_file

    with URLZSource(data_source_file).open() as data_file_handle:
        valid = validate(data_file_handle, schema_uri)
    self.assertTrue(valid)

def generate_uniprot(uri):
    with URLZSource(uri).open() as r_file:
        for event, elem in etree.iterparse(r_file, events=("end",),
                                           tag='{http://uniprot.org/uniprot}entry'):
            # parse the XML into an object
            entry = Parser(elem, return_raw_comments=False).parse()
            elem.clear()
            yield entry

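# Note: iterparse() emits one <entry> element at a time and elem.clear() releases it,
# so generate_uniprot() streams entries without holding the whole UniProt XML in
# memory; the process() method in this module consumes it lazily via
# elasticsearch_actions(generate_uniprot(self.uri), ...).
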
def get_list_of_file_download(config_file, headers):
    number_elem = len(headers)
    with URLZSource(config_file).open() as source:
        for i, row in enumerate(csv.DictReader(source, fieldnames=headers), start=1):
            if len(row) != number_elem:
                raise ValueError('File format unexpected at line %d.' % i)
            # build a fresh dict per row so callers can safely keep a reference to it
            result = {}
            for item in row:
                result[item] = row[item]
            yield result

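# Hedged usage sketch (the header names are hypothetical, not the real config columns):
#
#     for entry in get_list_of_file_download("file:///tmp/downloads.csv",
#                                            ["name", "uri"]):
#         print(entry["name"], entry["uri"])
#
# would yield one dict per row of the config file, keyed by the supplied headers.
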
def download_molecules_linked_to_target(self):
    '''generate a dictionary with all the synonyms known for a given molecule.
    Only retrieves molecules linked to a target'''

    '''fetch all the targets from chembl and store their data and a mapping to uniprot id'''
    for uri in self.target_uri:
        with URLZSource(uri).open() as f_obj:
            for line in f_obj:
                i = json.loads(line)
                if 'target_components' in i and \
                        i['target_components'] and \
                        'accession' in i['target_components'][0] and \
                        i['target_components'][0]['accession']:
                    uniprot_id = i['target_components'][0]['accession']
                    self.targets[uniprot_id] = i
                    self.uni2chembl[uniprot_id] = i['target_chembl_id']

    allowed_target_chembl_ids = set(self.uni2chembl.values())

    for uri in self.mechanism_uri:
        with URLZSource(uri).open() as f_obj:
            for line in f_obj:
                i = json.loads(line)
                self.mechanisms[i['record_id']] = i
                target_id = i['target_chembl_id']
                if target_id in allowed_target_chembl_ids:
                    if target_id not in self.target2molecule:
                        self.target2molecule[target_id] = set()
                    self.target2molecule[target_id].add(i['molecule_chembl_id'])

    required_molecules = set()
    self._logger.info('chembl t2m mols')
    for molecules in list(self.target2molecule.values()):
        for molecule in molecules:
            required_molecules.add(molecule)
    required_molecules = list(required_molecules)

    batch_size = 100
    self._logger.debug('chembl populate synonyms')
    for i in range(0, len(required_molecules), batch_size):
        self.populate_synonyms_for_molecule(
            required_molecules[i:i + batch_size], self.molecule2synonyms)

def download_protein_classification(self):
    '''fetches target components from ChEMBL and injects the target class data
    into self.protein_classification'''
    for uri in self.protein_uri:
        with URLZSource(uri).open() as f_obj:
            for line in f_obj:
                i = json.loads(line)
                protein_class_id = i.pop('protein_class_id')
                # remove values with none
                protein_class_data = dict((k, dict(label=v, id=''))
                                          for k, v in i.items() if v)
                self.protein_class[protein_class_id] = protein_class_data

                max_level = 0
                label = ''
                for k, v in protein_class_data.items():
                    level = int(k[1])
                    if level >= max_level:
                        max_level = level
                        label = v['label']
                self.protein_class_label_to_id[label] = protein_class_id

    '''inject missing ids'''
    for k, v in self.protein_class.items():
        for level, data in v.items():
            label = data['label']
            if label in self.protein_class_label_to_id:
                data['id'] = self.protein_class_label_to_id[label]

    for uri in self.component_uri:
        with URLZSource(uri).open() as f_obj:
            for line in f_obj:
                i = json.loads(line)
                if 'accession' in i:
                    if i['accession'] not in self.protein_classification:
                        self.protein_classification[i['accession']] = []
                    for classification in i['protein_classifications']:
                        protein_class_id = classification['protein_classification_id']
                        self.protein_classification[i['accession']].append(
                            self.protein_class[protein_class_id])

def merge_data(self, genes, es, r_server, data_config, es_config):
    self._logger.info("HGNC parsing - requesting from URL %s", data_config.hgnc_complete_set)

    with URLZSource(data_config.hgnc_complete_set).open() as source:
        data = json.load(source)

        for row in data['response']['docs']:
            gene = Gene()
            self.load_hgnc_data_from_json(gene, row)
            genes.add_gene(gene)

    self._logger.info("STATS AFTER HGNC PARSING:\n" + genes.get_stats())

def process(self, dry_run): self.logger.debug("download uniprot uri %s", self.uri) self.logger.debug("to generate this file you have to call this url " "https://www.uniprot.org/uniprot/?query=reviewed%3Ayes%2BAND%2Borganism%3A9606&compress=yes&format=xml") with URLZSource(self.es_mappings).open() as mappings_file: mappings = json.load(mappings_file) with URLZSource(self.es_settings).open() as settings_file: settings = json.load(settings_file) chunk_size = 1000 # TODO make configurable es = new_es_client(self.es_hosts) with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings): items = generate_uniprot(self.uri) actions = elasticsearch_actions(items, self.es_index, self.es_doc) #write into elasticsearch failcount = 0 if not dry_run: results = None if self.workers_write > 0: results = elasticsearch.helpers.parallel_bulk(es, actions, thread_count=self.workers_write, queue_size=self.queue_write, chunk_size=chunk_size) else: results = elasticsearch.helpers.streaming_bulk(es, actions, chunk_size=chunk_size) for success, details in results: if not success: failcount += 1 if failcount: raise RuntimeError("%s relations failed to index" % failcount)
def __init__(self, tissue_translation_map, tissue_curation_map,
             normal_tissue_url, rna_level_url, rna_value_url, rna_zscore_url):
    self.logger = logging.getLogger(__name__)

    self.tissue_translation_map = tissue_translation_map
    self.tissue_curation_map = tissue_curation_map
    self.normal_tissue_url = normal_tissue_url
    self.rna_level_url = rna_level_url
    self.rna_value_url = rna_value_url
    self.rna_zscore_url = rna_zscore_url

    # load t2m
    t2m = {'tissues': {}, 'curations': {}}

    with URLZSource(self.tissue_translation_map).open(mode='rb') as r_file:
        t2m['tissues'] = json.load(r_file)['tissues']

    with URLZSource(self.tissue_curation_map).open(mode='rb') as r_file:
        for line in r_file:
            line = line.strip()
            line = line.split('\t')
            t2m['curations'][line[0].strip()] = line[1].strip()

    self.t2m = t2m