def score_producer(data, scorer, loader, r_server, lookup_data, datasources_to_datatypes, dry_run):
    """Score one target-disease pair and hand the result to ``loader``.

    Args:
        data: sequence unpacked as ``(target, disease, evidence, is_direct)``.
        scorer: object whose ``score(...)`` method builds the association score.
        loader: object whose ``put(...)`` method receives the finished score.
        r_server: handle forwarded to every ``lookup_data`` getter (the lookups
            are described below as a Redis cache).
        lookup_data: exposes ``available_genes``, ``available_hpa`` and
            ``available_efos``.
        datasources_to_datatypes: forwarded unchanged to ``scorer.score``.
        dry_run: when true, the score is built and enriched but never stored.

    Returns:
        None. Nothing is done when ``evidence`` is empty; a falsy score is
        logged as skipped.

    Raises:
        KeyError: when the gene lookup for ``target`` fails (logged at debug
            level, then re-raised).
    """
    target, disease, evidence, is_direct = data
    logger = logging.getLogger(__name__)

    if not evidence:
        return

    score = scorer.score(target, disease, evidence, is_direct, datasources_to_datatypes)
    # skip associations only with data with score 0
    if not score:
        logger.warning('Skipped association with score 0: %s-%s', target, disease)
        return

    gene_data = Gene()
    try:
        gene_data.load_json(
            lookup_data.available_genes.get_gene(target, r_server))
    except KeyError:
        logger.debug('Cannot find gene code "%s" '
                     'in lookup table', target)
        # A missing gene aborts this association; bare raise keeps the
        # original traceback intact.
        raise
    score.set_target_data(gene_data)

    # create a hpa expression empty jsonserializable class
    # to fill from Redis cache lookup_data
    hpa_data = HPAExpression()
    try:
        hpa_data.update(
            lookup_data.available_hpa.get_hpa(target, r_server))
    except KeyError:
        # Best effort: a missing HPA entry leaves the empty object in place.
        pass

    try:
        score.set_hpa_data(hpa_data)
    except KeyError:
        # Best effort: a KeyError while attaching HPA data is ignored.
        pass

    disease_data = EFO()
    try:
        disease_data.load_json(
            lookup_data.available_efos.get_efo(disease, r_server))
    except KeyError as e:
        # Unlike the gene lookup, a missing disease is logged but not fatal:
        # the association continues with an unpopulated EFO object.
        logger.debug('Cannot find EFO code "%s" '
                     'in lookup table', disease)
        logger.exception(e)
    score.set_disease_data(disease_data)

    element_id = '%s-%s' % (target, disease)
    if not dry_run:
        loader.put(Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME,
                   Const.ELASTICSEARCH_DATA_ASSOCIATION_DOC_NAME,
                   element_id,
                   score)
def merge_data(self, genes, loader, r_server, data_config):
    """Merge every Ensembl gene row returned by Elasticsearch into ``genes``.

    Args:
        genes: gene collection supporting ``in``, ``get_gene``, ``add_gene``
            and ``get_stats``.
        loader: object whose ``es`` attribute is handed to ``ESQuery``.
        r_server: unused here; kept for the shared ``merge_data`` signature.
        data_config: unused here; kept for the shared ``merge_data`` signature.

    Raises:
        NotFoundError: when counting the Ensembl index fails; the error is
            logged and re-raised.
    """
    esquery = ESQuery(loader.es)

    # The count value itself is not needed: the call only serves to fail
    # early with NotFoundError when the Ensembl index is missing.
    try:
        esquery.count_elements_in_index(
            Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME)
    except NotFoundError as ex:
        self._logger.error(
            'no Ensembl index in ES. Skipping. Has the --ensembl step been run? Are you pointing to the correct index? %s', ex)
        raise

    for row in esquery.get_all_ensembl_genes():
        # Reuse the gene already known under this id, otherwise start fresh.
        gene = genes.get_gene(row['id']) if row['id'] in genes else Gene()
        gene.load_ensembl_data(row)
        genes.add_gene(gene)

    self._clean_non_reference_genes(genes)

    self._logger.info("STATS AFTER ENSEMBL PARSING:\n" + genes.get_stats())
def score_producer(data, scorer, lookup_data, datasources_to_datatypes, dry_run):
    """Score one target-disease pair and return it in JSON-compatible form.

    Args:
        data: sequence unpacked as ``(target, disease, evidence, is_direct)``.
        scorer: object whose ``score(...)`` method builds the association score.
        lookup_data: exposes ``available_genes``, ``available_hpa`` and
            ``available_efos``.
        datasources_to_datatypes: forwarded unchanged to ``scorer.score``.
        dry_run: not read by this function; kept for the caller's signature.

    Returns:
        ``(element_id, score.to_json())`` where ``element_id`` is
        ``'<target>-<disease>'``, or None when ``evidence`` is empty or the
        computed score is falsy.
    """
    target, disease, evidence, is_direct = data

    if not evidence:
        return None

    score = scorer.score(target, disease, evidence, is_direct, datasources_to_datatypes)
    # skip associations only with data with score 0
    if not score:
        return None

    gene_data = Gene()
    gene_data_index = lookup_data.available_genes.get_gene(target)
    if gene_data_index is not None:
        gene_data.load_json(gene_data_index)
    score.set_target_data(gene_data)

    # create a hpa expression empty jsonserializable class
    hpa_data = HPAExpression()
    try:
        hpa_index = lookup_data.available_hpa.get_hpa(target)
        if hpa_index is not None:
            hpa_data.update(hpa_index)
    except KeyError:
        # Best effort: a missing HPA entry leaves the empty object in place.
        pass

    try:
        score.set_hpa_data(hpa_data)
    except KeyError:
        # Best effort: a KeyError while attaching HPA data is ignored.
        pass

    # NOTE(review): unlike the gene lookup above, the EFO lookup result is
    # not checked for None before load_json -- confirm this is intended.
    disease_data = EFO()
    disease_data.load_json(
        lookup_data.available_efos.get_efo(disease))
    score.set_disease_data(disease_data)

    element_id = '%s-%s' % (target, disease)

    # convert the score into a JSON-compatible object
    # otherwise Python serialization consumes too much memory
    return (element_id, score.to_json())
def merge_data(self, genes, loader, r_server, data_config):
    """Load the HGNC complete set and add one ``Gene`` per document to ``genes``.

    The location is read from ``data_config.hgnc_complete_set``, opened via
    ``URLZSource`` and parsed as a single JSON document whose records live
    under ``['response']['docs']``. ``loader`` and ``r_server`` are not used.
    """
    hgnc_location = data_config.hgnc_complete_set
    self._logger.info("HGNC parsing - requesting from URL %s", hgnc_location)

    with URLZSource(hgnc_location).open() as hgnc_stream:
        payload = json.load(hgnc_stream)
        hgnc_docs = payload['response']['docs']

        for doc in hgnc_docs:
            hgnc_gene = Gene()
            hgnc_gene.load_hgnc_data_from_json(doc)
            genes.add_gene(hgnc_gene)

        stats_report = "STATS AFTER HGNC PARSING:\n" + genes.get_stats()
        self._logger.info(stats_report)
def merge_data(self, genes, es, r_server, data_config, es_config):
    """Scan the Ensembl index and merge each hit into ``genes``.

    The index name comes from ``es_config.ens.name``. A hit whose ``id`` is
    already in ``genes`` updates that existing gene; any other hit populates
    a new ``Gene``. Afterwards non-reference genes are cleaned and the
    collection stats are logged. ``r_server`` and ``data_config`` are unused.
    """
    ensembl_index = es_config.ens.name
    all_hits = Search().using(es).index(ensembl_index).query(MatchAll()).scan()

    for hit in all_hits:
        hit_id = hit['id']
        merged_gene = genes.get_gene(hit_id) if hit_id in genes else Gene()
        self.load_ensembl_data(merged_gene, hit)
        genes.add_gene(merged_gene)

    self._clean_non_reference_genes(genes)

    stats_report = "STATS AFTER ENSEMBL PARSING:\n" + genes.get_stats()
    self._logger.info(stats_report)
def merge_data(self, genes, es, r_server, data_config, es_config):
    """Read Ensembl records from a file and merge each one into ``genes``.

    The file named by ``data_config.ensembl_filename`` is opened through
    ``URLZSource`` and every line is parsed as one JSON record. A record
    whose ``id`` is already in ``genes`` updates that existing gene; any
    other record populates a new ``Gene``. Afterwards non-reference genes
    are cleaned and the collection stats are logged. ``es``, ``r_server``
    and ``es_config`` are unused.
    """
    with URLZSource(data_config.ensembl_filename).open() as records_stream:
        for raw_line in records_stream:
            record = json.loads(raw_line)
            record_id = record['id']
            merged_gene = genes.get_gene(record_id) if record_id in genes else Gene()
            self.load_ensembl_data(merged_gene, record)
            genes.add_gene(merged_gene)

    self._clean_non_reference_genes(genes)

    stats_report = "STATS AFTER ENSEMBL PARSING:\n" + genes.get_stats()
    self._logger.info(stats_report)
def _get_gene_obj(self, geneid):
    """Return a ``Gene`` for ``geneid`` populated from ``self.available_genes``."""
    gene_obj = Gene(geneid)
    stored_json = self.available_genes.get_gene(geneid)
    gene_obj.load_json(stored_json)
    return gene_obj