def get_elastic_settings_with_user_uploads(cls, elastic_dict=None, new_upload_file=None):
    '''Get the updated elastic settings with user uploaded idx_types.

    Reads the mapping of the ``CP_STATS_UD`` index and registers, under the
    key ``UD-<IDX_TYPE>``, every index type for which a content type exists
    whose name ends with ``idx_type + cls.PERMISSION_MODEL_TYPE_SUFFIX`` (only
    content types of the ``cls.PERMISSION_MODEL_APP_NAME`` app are considered).

    @type  elastic_dict: dict
    @keyword elastic_dict: settings to update in place; when None the value
        of ``ElasticSettings.attrs().get('IDX')`` is used.
    @type  new_upload_file: string
    @keyword new_upload_file: idx_type of a new upload; it is registered
        without the content type check.
    @return: the updated settings dict, or None when the mapping response
        contains an ``error`` entry.
    '''
    idx_key = 'CP_STATS_UD'
    idx = ElasticSettings.idx(idx_key)
    elastic_url = ElasticSettings.url()

    # Search.get_mapping is not used here; the original author guessed that it
    # is not a class method.
    response = Search.elastic_request(elastic_url, idx + '/_mapping', is_post=False)
    elastic_mapping = json.loads(response.content.decode("utf-8"))
    if "error" in elastic_mapping:
        logger.warning(elastic_mapping)
        return None

    # If aliasing is used the configured name can differ from the key in the
    # mapping, so continue with the index name found in the response.
    idx = list(elastic_mapping)[0]
    idx_types = list(elastic_mapping[idx]['mappings'])

    if elastic_dict is None:
        elastic_dict = ElasticSettings.attrs().get('IDX')

    existing_ct = [ct.name for ct in
                   ContentType.objects.filter(app_label=cls.PERMISSION_MODEL_APP_NAME)]

    idx_type_dict = {}
    for idx_type in idx_types:
        idx_type_with_suffix = idx_type + cls.PERMISSION_MODEL_TYPE_SUFFIX
        # One meta request per idx_type is enough, however many content types match.
        if not any(ct_name.endswith(idx_type_with_suffix) for ct_name in existing_ct):
            continue

        meta_url = idx + '/' + idx_type + '/_meta/_source'
        meta_response = Search.elastic_request(elastic_url, meta_url, is_post=False)
        try:
            label = json.loads(meta_response.content.decode("utf-8"))['label']
        except (AttributeError, KeyError, TypeError, ValueError):
            # no usable meta document: fall back to a generated label
            label = "UD-" + idx_type

        idx_type_dict['UD-' + idx_type.upper()] = {'label': label, 'type': idx_type}

    if new_upload_file is not None:
        idx_type_dict['UD-' + new_upload_file.upper()] = {'label': "UD-" + new_upload_file,
                                                          'type': new_upload_file}

    elastic_dict[idx_key]['idx_type'] = idx_type_dict
    return elastic_dict
def get_models_to_delete(self):
    '''Get models to delete.

    Walks over the idx_types found in the mapping of the ``CP_STATS_UD``
    index and collects those selected by the document count test or whose
    ``_meta`` document carries an ``uploaded`` date older than 7 days.

    @return: list of idx_type names (each at most once), or None when the
        mapping response contains an ``error`` entry.
    '''
    idx_key = 'CP_STATS_UD'
    idx = ElasticSettings.idx(idx_key)
    elastic_url = ElasticSettings.url()

    response = Search.elastic_request(elastic_url, idx + '/_mapping', is_post=False)
    elastic_mapping = json.loads(response.content.decode("utf-8"))
    if "error" in elastic_mapping:
        logger.warning(elastic_mapping)
        return None

    # fix needed if we deploy aliasing for indices
    idx = list(elastic_mapping)[0]
    idx_types = list(elastic_mapping[idx]['mappings'])

    expire_days = 7  # 1 week
    # the cut-off does not change while looping, so compute it once
    expiry_date = datetime.date.today() - datetime.timedelta(days=expire_days)

    models2go = []
    for idx_type in idx_types:
        ndocs = Search(idx=idx, idx_type=idx_type).get_count()['count']
        # NOTE(review): the original comment here reads "add idx_types that
        # have no docs", yet the test selects types that DO have docs.
        # The condition is kept as found - confirm which one is intended.
        if ndocs > 0:
            models2go.append(idx_type)
            # already selected; skipping the date check avoids a duplicate entry
            continue

        # add idx_types that were not accessed for a given time period
        meta_url = idx + '/' + idx_type + '/_meta'
        meta_response = Search.elastic_request(elastic_url, meta_url, is_post=False)
        elastic_meta = json.loads(meta_response.content.decode("utf-8"))
        if '_source' not in elastic_meta:
            continue
        uploaded_str_date = elastic_meta['_source'].get('uploaded')
        if not uploaded_str_date:
            # no upload date recorded, nothing to compare against
            continue
        # Format: 2015-11-03 14:43:54.099645+00:00 - only the date part is used
        yymmdd_str = uploaded_str_date.split()[0]
        uploaded_date = datetime.datetime.strptime(yymmdd_str, '%Y-%m-%d').date()
        if uploaded_date < expiry_date:
            models2go.append(idx_type)

    return models2go
def get_models_to_delete(self):
    """Get models to delete.

    Walks over the idx_types found in the mapping of the ``CP_STATS_UD``
    index and collects those holding at most one document or whose ``_meta``
    document carries an ``uploaded`` date older than 7 days.

    @return: list of idx_type names (each at most once), or None when the
        mapping response contains an ``error`` entry.
    """
    idx_key = "CP_STATS_UD"
    idx = ElasticSettings.idx(idx_key)
    elastic_url = ElasticSettings.url()

    response = Search.elastic_request(elastic_url, idx + "/_mapping", is_post=False)
    elastic_mapping = json.loads(response.content.decode("utf-8"))
    if "error" in elastic_mapping:
        logger.warning(elastic_mapping)
        return None

    # When the configured name is an alias the mapping is keyed by the
    # concrete index name; fall back to the name elastic returned.
    if idx not in elastic_mapping:
        idx = list(elastic_mapping)[0]
    idx_types = list(elastic_mapping[idx]["mappings"])

    expire_days = 7  # 1 week
    # the cut-off does not change while looping, so compute it once
    expiry_date = datetime.date.today() - datetime.timedelta(days=expire_days)

    models2go = []
    for idx_type in idx_types:
        # add idx_types that have no docs
        # NOTE(review): "<= 1" presumably allows for the _meta document
        # itself being counted - confirm.
        ndocs = Search(idx=idx, idx_type=idx_type).get_count()["count"]
        if ndocs <= 1:
            models2go.append(idx_type)
            # already selected; skipping the date check avoids a duplicate entry
            continue

        # add idx_types that were not accessed for a given time period
        meta_url = idx + "/" + idx_type + "/_meta"
        meta_response = Search.elastic_request(elastic_url, meta_url, is_post=False)
        elastic_meta = json.loads(meta_response.content.decode("utf-8"))
        if "_source" not in elastic_meta:
            continue
        uploaded_str_date = elastic_meta["_source"].get("uploaded")
        if not uploaded_str_date:
            # no upload date recorded, nothing to compare against
            continue
        # Format: 2015-11-03 14:43:54.099645+00:00 - only the date part is used
        yymmdd_str = uploaded_str_date.split()[0]
        uploaded_date = datetime.datetime.strptime(yymmdd_str, "%Y-%m-%d").date()
        if uploaded_date < expiry_date:
            models2go.append(idx_type)

    return models2go
def get_meta_info(cls, idx, idx_type):
    '''Get the ``_meta`` section of the mapping of an index type.

    @type  idx: string
    @param idx: index name; also used as key into the mapping response.
    @type  idx_type: string
    @param idx_type: index type whose mapping is requested.
    @return: the value stored under ``_meta`` in the mapping, or None when
        the response cannot be decoded or holds no such entry.
    '''
    elastic_url = ElasticSettings.url()
    meta_url = idx + '/' + idx_type + '/_mapping'
    meta_response = Search.elastic_request(elastic_url, meta_url, is_post=False)

    try:
        elastic_meta = json.loads(meta_response.content.decode("utf-8"))
        return elastic_meta[idx]['mappings'][idx_type]['_meta']
    except (AttributeError, KeyError, TypeError, ValueError):
        # Best effort: a missing or malformed mapping simply means "no meta
        # info".  Unlike a bare except this lets KeyboardInterrupt/SystemExit
        # propagate.
        return None
def get_elastic_settings_with_user_uploads(cls, elastic_dict=None):
    '''Get the updated elastic settings with user uploaded idx_types.

    Registers every idx_type found in the mapping of the ``CP_STATS_UD``
    index under the key ``UD-<IDX_TYPE>``, labelled with the ``label`` of its
    ``_meta`` document when available and with ``UD-<idx_type>`` otherwise.

    @type  elastic_dict: dict
    @keyword elastic_dict: settings to update in place; when None the value
        of ``ElasticSettings.attrs().get('IDX')`` is used.
    @return: the updated settings dict, or None when the mapping response
        contains an ``error`` entry.
    '''
    idx_key = 'CP_STATS_UD'
    idx = ElasticSettings.idx(idx_key)
    elastic_url = ElasticSettings.url()

    # check if the index exists
    response = Search.elastic_request(elastic_url, idx + '/_mapping', is_post=False)
    elastic_mapping = json.loads(response.content.decode("utf-8"))
    if "error" in elastic_mapping:
        logger.warning(elastic_mapping)
        return None

    # When the configured name is an alias the mapping is keyed by the
    # concrete index name; fall back to the name elastic returned.
    if idx not in elastic_mapping:
        idx = list(elastic_mapping)[0]
    idx_types = list(elastic_mapping[idx]['mappings'])

    if elastic_dict is None:
        elastic_dict = ElasticSettings.attrs().get('IDX')

    idx_type_dict = {}
    for idx_type in idx_types:
        meta_url = idx + '/' + idx_type + '/_meta/_source'
        meta_response = Search.elastic_request(elastic_url, meta_url, is_post=False)
        try:
            label = json.loads(meta_response.content.decode("utf-8"))['label']
        except (AttributeError, KeyError, TypeError, ValueError):
            # no usable meta document: fall back to a generated label
            label = "UD-" + idx_type

        idx_type_dict['UD-' + idx_type.upper()] = {'label': label, 'type': idx_type}

    elastic_dict[idx_key]['idx_type'] = idx_type_dict
    return elastic_dict
def get_criteria_index_types(cls, idx_key):
    '''Get the idx_types defined in the mapping of an index.

    @type  idx_key: string
    @param idx_key: key passed to ``ElasticSettings.idx`` to resolve the
        index name.
    @return: list of idx_type names, or None when the mapping response
        contains an ``error`` entry.
    '''
    idx = ElasticSettings.idx(idx_key)
    elastic_url = ElasticSettings.url()

    response = Search.elastic_request(elastic_url, idx + '/_mappings', is_post=False)
    # decode the body once and reuse it for the error check and the result
    elastic_mapping = json.loads(response.content.decode("utf-8"))
    if "error" in elastic_mapping:
        logger.warning(elastic_mapping)
        return None

    # get idx_types from _mapping
    return list(elastic_mapping[idx]['mappings'])
def test_criteria_mappings(self, idx, idx_types):
    '''Assert that the mapping of each idx_type has the expected properties.

    For every idx_type the mapping must define ``score``, ``disease_tags``
    and ``qid`` plus one property per disease code returned by
    ``CriteriaManager.get_available_diseases()``.

    @type  idx: string
    @param idx: index name, also the key into the mapping response.
    @type  idx_types: list
    @param idx_types: idx_type names to check.
    '''
    main_codes, other_codes = CriteriaManager.get_available_diseases()
    site_enabled_diseases = main_codes + other_codes
    base_url = ElasticSettings.url()

    for type_name in idx_types:
        mapping_response = Search.elastic_request(base_url, idx + '/' + type_name + '/_mapping',
                                                  is_post=False)
        type_mapping = json.loads(mapping_response.content.decode("utf-8"))
        properties = type_mapping[idx]['mappings'][type_name]['properties']
        property_keys = list(properties.keys())

        # check if score and disease_tags and qid are there in mapping
        for fixed_key in ('score', 'disease_tags', 'qid'):
            self.assertIn(fixed_key, property_keys)

        # check if all the enabled diseases are there
        for disease_code in site_enabled_diseases:
            self.assertIn(disease_code, property_keys)
def get_context_models_to_delete(self, *args, **options):
    '''Get models to delete.

    Looks for the idx_type named by ``options['content_type']`` in the
    mapping of the ``CP_STATS_UD`` index.  When that idx_type holds documents,
    every content type whose name ends with ``<content_type>_idx_type`` is
    deleted.  The outcome is written to ``self.stdout`` as JSON:
    ``acknowledged`` is 1 when at least one content type was deleted and 0
    otherwise; ``errorMsg`` carries the elastic response on error.
    '''
    ct = options['content_type']
    retDict = {'acknowledged': 0}
    logger.debug(ct)

    idx_key = 'CP_STATS_UD'
    idx = ElasticSettings.idx(idx_key)
    elastic_url = ElasticSettings.url()

    response = Search.elastic_request(elastic_url, idx + '/_mapping', is_post=False)
    elastic_mapping = json.loads(response.content.decode("utf-8"))
    if "error" in elastic_mapping:
        logger.warning(elastic_mapping)
        retDict['errorMsg'] = elastic_mapping
        self.stdout.write(json.dumps(retDict))
        # Stop here: an error payload is not a mapping, so carrying on would
        # index into it and write a second result.
        return

    # fix needed if we deploy aliasing for indices
    idx = list(elastic_mapping)[0]
    idx_types = list(elastic_mapping[idx]['mappings'])
    logger.debug(idx_types)

    for idx_type in idx_types:
        if idx_type != ct:
            continue

        logger.debug("Found " + idx_type + " equal to " + ct)
        ndocs = Search(idx=idx, idx_type=idx_type).get_count()['count']
        logger.debug("WE have " + str(ndocs))
        if ndocs > 0:
            for cnt in ContentType.objects.filter():
                if str(cnt.name).endswith(ct + '_idx_type'):
                    logger.debug('Matched, finding permissions for %s %s' % (str(cnt.name), str(cnt.id)))
                    logger.debug("deleting %s" % ct)
                    cnt.delete()
                    retDict['acknowledged'] = 1

    self.stdout.write(json.dumps(retDict))
def get_elastic_settings_with_user_uploads(cls,
                                           elastic_dict=None,
                                           new_upload_file=None):
    '''Get the updated elastic settings with user uploaded idx_types.

    Reads the mapping of the ``CP_STATS_UD`` index and registers, under the
    key ``UD-<IDX_TYPE>``, every index type for which a content type exists
    whose name ends with ``idx_type + cls.PERMISSION_MODEL_TYPE_SUFFIX`` (only
    content types of the ``cls.PERMISSION_MODEL_APP_NAME`` app are considered).

    @type  elastic_dict: dict
    @keyword elastic_dict: settings to update in place; when None the value
        of ``ElasticSettings.attrs().get('IDX')`` is used.
    @type  new_upload_file: string
    @keyword new_upload_file: idx_type of a new upload; it is registered
        without the content type check.
    @return: the updated settings dict, or None when the mapping response
        contains an ``error`` entry.
    '''
    idx_key = 'CP_STATS_UD'
    idx = ElasticSettings.idx(idx_key)
    elastic_url = ElasticSettings.url()

    # Search.get_mapping is not used here; the original author guessed that it
    # is not a class method.
    response = Search.elastic_request(elastic_url, idx + '/_mapping',
                                      is_post=False)
    elastic_mapping = json.loads(response.content.decode("utf-8"))
    if "error" in elastic_mapping:
        logger.warning(elastic_mapping)
        return None

    # If aliasing is used the configured name can differ from the key in the
    # mapping, so continue with the index name found in the response.
    idx = list(elastic_mapping)[0]
    idx_types = list(elastic_mapping[idx]['mappings'])

    if elastic_dict is None:
        elastic_dict = ElasticSettings.attrs().get('IDX')

    existing_ct = [
        ct.name for ct in ContentType.objects.filter(
            app_label=cls.PERMISSION_MODEL_APP_NAME)
    ]

    idx_type_dict = {}
    for idx_type in idx_types:
        idx_type_with_suffix = idx_type + cls.PERMISSION_MODEL_TYPE_SUFFIX
        # One meta request per idx_type is enough, however many content
        # types match.
        has_content_type = any(
            ct_name.endswith(idx_type_with_suffix) for ct_name in existing_ct)
        if not has_content_type:
            continue

        meta_url = idx + '/' + idx_type + '/_meta/_source'
        meta_response = Search.elastic_request(elastic_url, meta_url,
                                               is_post=False)
        try:
            elastic_meta = json.loads(meta_response.content.decode("utf-8"))
            label = elastic_meta['label']
        except (AttributeError, KeyError, TypeError, ValueError):
            # no usable meta document: fall back to a generated label
            label = "UD-" + idx_type

        idx_type_dict['UD-' + idx_type.upper()] = {
            'label': label,
            'type': idx_type
        }

    if new_upload_file is not None:
        idx_type_dict['UD-' + new_upload_file.upper()] = {
            'label': "UD-" + new_upload_file,
            'type': new_upload_file
        }

    elastic_dict[idx_key]['idx_type'] = idx_type_dict
    return elastic_dict
def filter_queryset(self, request, queryset, view):
    ''' Get disease regions.

    Builds the list of region objects for a disease and appends gene and
    marker objects when requested.  Only GET parameters named in
    ``view.filter_fields`` are used:

      - ``disease``: disease code, default ``T1D``
      - ``build``:   genome build, default ``settings.DEFAULT_BUILD``
      - ``regions``: the text ``false`` leaves the region objects out
      - ``genes``:   any non-empty value other than ``false`` adds gene objects
      - ``markers``: any non-empty value other than ``false`` adds marker objects

    @return: list with region dicts followed by gene dicts and marker dicts.
    @raise Http404: when a TypeError, ValueError, IndexError or
        ConnectionError occurs while the data is collected.
    '''
    import logging  # local import: no module level logger is visible from this block

    def _is_enabled(value):
        # Query string values are text, so "false" has to be treated as off
        # explicitly; a plain truth test would read it as on.
        return bool(value) and str(value).strip().lower() != 'false'

    try:
        filterable = getattr(view, 'filter_fields', [])
        filters = {k: v for k, v in request.GET.items() if k in filterable}
        dis = filters.get('disease', 'T1D')
        show_genes = _is_enabled(filters.get('genes', False))
        show_markers = _is_enabled(filters.get('markers', False))
        hide_regions = str(filters.get('regions', True)).strip().lower() == 'false'
        build = self._get_build(filters.get('build', settings.DEFAULT_BUILD))

        docs = DiseaseLocusDocument.get_disease_loci_docs(dis)
        if len(docs) == 0:
            messages.error(request, 'No regions found for '+dis+'.')

        visible_hits = DiseaseLocusDocument.get_hits([h for r in docs for h in getattr(r, 'hits')])
        regions = []
        all_markers = []
        all_genes = []
        ens_all_cand_genes = []
        for r in docs:
            region = r.get_disease_region(visible_hits, build=build)
            if region is None:
                continue

            ens_all_cand_genes.extend(region['ens_cand_genes'])
            all_markers.extend(region['markers'])
            region['hits'] = [self._study_hit_obj(s, region) for s in
                              StudyHitDocument.process_hits(r.hit_docs, region['all_diseases'])]

            # genes inside the region and within 500000 either side of it
            (all_coding, all_non_coding) = views.get_genes_for_region(getattr(r, "seqid"),
                                                                      region['rstart']-500000,
                                                                      region['rstop']+500000)
            (region_coding, coding_up, coding_down) = \
                views._region_up_down(all_coding, region['rstart'], region['rstop'])
            (region_non_coding, non_coding_up, non_coding_down) = \
                views._region_up_down(all_non_coding, region['rstart'], region['rstop'])
            region['genes'] = {
                'upstream': {'coding': [g.doc_id() for g in coding_up],
                             'non_coding': [g.doc_id() for g in non_coding_up]},
                'region': {'coding': [g.doc_id() for g in region_coding],
                           'non_coding': [g.doc_id() for g in region_non_coding]},
                'downstream': {'coding': [g.doc_id() for g in coding_down],
                               'non_coding': [g.doc_id() for g in non_coding_down]},
            }
            all_genes.extend(region['genes']['region']['coding'])
            all_genes.extend(region['genes']['region']['non_coding'])
            regions.append(region)

        # look for pleiotropy by looking for diseases for the markers in IC_STATS and other study hits
        stats_query = ElasticQuery.filtered(Query.terms("marker", all_markers),
                                            Filter(RangeQuery("p_value", lte=5E-08)))
        stats_docs = Search(stats_query, idx=ElasticSettings.idx("IC_STATS"),
                            size=len(all_markers)).search().docs
        meta_response = Search.elastic_request(ElasticSettings.url(),
                                               ElasticSettings.idx("IC_STATS") + '/_mapping',
                                               is_post=False)

        for region in regions:
            # add diseases from IC/GWAS stats
            (study_ids, region['marker_stats']) = views._process_stats(stats_docs, region['markers'],
                                                                       meta_response)
            region['all_diseases'].extend([getattr(mstat, 'disease') for mstat in region['marker_stats']])

            # study hits on the region's markers from studies not already covered by the stats
            other_hits_query = ElasticQuery(
                BoolQuery(must_arr=[RangeQuery("tier", lte=2), Query.terms("marker", region['markers'])],
                          must_not_arr=[Query.terms("dil_study_id", study_ids)]))
            other_hits = Search(other_hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'),
                                size=100).search()
            region['extra_markers'] = [self._study_hit_obj(s, region) for s in
                                       StudyHitDocument.process_hits(other_hits.docs, region['all_diseases'])]
            region['all_diseases'] = list(set(region['all_diseases']))

        # get markers
        marker_objs = []
        if show_markers:
            query = ElasticQuery(Query.terms("id", all_markers), sources=['id', 'start'])
            marker_docs = Search(search_query=query, idx=ElasticSettings.idx('MARKER', 'MARKER'),
                                 size=len(all_markers)).search().docs
            mids = {getattr(m, 'id'): getattr(m, 'start') for m in marker_docs}
            marker_objs = [h for r in regions for h in r['hits']]
            marker_objs.extend([h for r in regions for h in r['extra_markers']])
            for m in marker_objs:
                # NOTE(review): raises KeyError (not converted to a 404) when a
                # hit's marker is missing from the marker index - confirm intended.
                m['start'] = mids[m['marker_id']]

        # get genes
        gene_objs = []
        if show_genes:
            all_genes.extend(ens_all_cand_genes)
            gene_docs = GeneDocument.get_genes(all_genes, sources=['start', 'stop', 'chromosome',
                                                                   'symbol', 'biotype'])
            for doc in Document.sorted_alphanum(gene_docs, 'chromosome'):
                ensembl_id = doc.doc_id()
                region_name = ''
                candidate_gene = 0
                # the first region that lists the gene provides its name
                for region in regions:
                    if ('genes' in region and
                            (ensembl_id in region['genes']['region']['coding'] or
                             ensembl_id in region['genes']['region']['non_coding'] or
                             ensembl_id in region['ens_cand_genes'])):
                        region_name = region['region_name']
                        candidate_gene = 1 if ensembl_id in region['ens_cand_genes'] else 0
                        break
                gene_objs.append({
                    'ensembl_id': ensembl_id,
                    'seqid': 'chr'+getattr(doc, 'chromosome'),
                    'start': getattr(doc, 'start'),
                    'end': getattr(doc, 'stop'),
                    'symbol': getattr(doc, 'symbol'),
                    'biotype': getattr(doc, 'biotype'),
                    'region_name': region_name,
                    'candidate_gene': candidate_gene
                })

        if hide_regions:
            regions = []
        regions.extend(gene_objs)
        regions.extend(marker_objs)
        return regions
    except (TypeError, ValueError, IndexError, ConnectionError) as e:
        # log with traceback instead of printing to stdout of the server process
        logging.getLogger(__name__).exception(e)
        raise Http404
def get_regions(cls, request, dis, context):
    '''Add the regions of a disease to the template context.

    Resolves the disease via ``Disease.get_site_diseases``, collects the
    regions of its locus documents together with their study hits and the
    coding / non-coding genes in and around each region, then adds candidate
    gene documents, marker statistics and extra study hits per region.

    @type  dis: string
    @param dis: disease code(s); split on commas for the disease lookup.
    @type  context: dict
    @param context: updated with ``title``, ``regions``, ``disease_code``
        and ``disease``.
    @return: the updated context.
    @raise Http404: when the disease is unknown or has no locus documents.
    '''
    elastic_url = ElasticSettings.url()

    (core, other) = Disease.get_site_diseases(dis_list=dis.upper().split(','))
    nothing_found = len(core) == 0 and len(other) == 0
    if nothing_found:
        messages.error(request, 'Disease '+dis+' not found.')
        raise Http404()

    # core diseases take precedence over the other ones
    if len(core) > 0:
        disease = core[0]
    else:
        disease = other[0]
    context['title'] = getattr(disease, "name")+" Regions"

    docs = DiseaseLocusDocument.get_disease_loci_docs(dis)
    if len(docs) == 0:
        messages.error(request, 'No regions found for '+dis+'.')
        raise Http404()

    locus_hits = [hit for locus in docs for hit in getattr(locus, 'hits')]
    visible_hits = DiseaseLocusDocument.get_hits(locus_hits)
    stats_mapping_url = ElasticSettings.idx("IC_STATS") + '/_mapping'
    meta_response = Search.elastic_request(elastic_url, stats_mapping_url, is_post=False)

    regions = []
    ens_all_cand_genes = []
    all_markers = []
    for locus in docs:
        region = locus.get_disease_region(visible_hits)
        if region is None:
            continue

        ens_all_cand_genes.extend(region['ens_cand_genes'])
        all_markers.extend(region['markers'])
        region['hits'] = StudyHitDocument.process_hits(locus.hit_docs, region['all_diseases'])

        # genes inside the region and within 500000 either side of it
        (all_coding, all_non_coding) = get_genes_for_region(getattr(locus, "seqid"),
                                                            region['rstart']-500000,
                                                            region['rstop']+500000)
        (region_coding, coding_up, coding_down) = _region_up_down(all_coding, region['rstart'],
                                                                  region['rstop'])
        (region_non_coding, non_coding_up, non_coding_down) = _region_up_down(all_non_coding,
                                                                              region['rstart'],
                                                                              region['rstop'])
        upstream = {'coding': coding_up, 'non_coding': non_coding_up}
        inside = {'coding': region_coding, 'non_coding': region_non_coding}
        downstream = {'coding': coding_down, 'non_coding': non_coding_down}
        region['genes'] = {'upstream': upstream, 'region': inside, 'downstream': downstream}
        regions.append(region)

    # look for pleiotropy by looking for diseases for the markers in IC_STATS and other study hits
    significant = Filter(RangeQuery("p_value", lte=5E-08))
    stats_query = ElasticQuery.filtered(Query.terms("marker", all_markers), significant)
    stats_search = Search(stats_query, idx=ElasticSettings.idx("IC_STATS"), size=len(all_markers))
    stats_docs = stats_search.search().docs

    # get ensembl to gene symbol mapping for all candidate genes
    all_cand_genes = gene.utils.get_gene_docs_by_ensembl_id(ens_all_cand_genes)
    for region in regions:
        ens_ids = region.pop("ens_cand_genes", None)
        region['cand_genes'] = {ens_id: all_cand_genes[ens_id] for ens_id in ens_ids}
        (study_ids, region['marker_stats']) = _process_stats(stats_docs, region['markers'], meta_response)

        # add diseases from IC/GWAS stats
        stat_diseases = [getattr(mstat, 'disease') for mstat in region['marker_stats']]
        region['all_diseases'].extend(stat_diseases)

        # study hits on the region's markers from studies not already covered by the stats
        must = [RangeQuery("tier", lte=2), Query.terms("marker", region['markers'])]
        must_not = [Query.terms("dil_study_id", study_ids)]
        other_hits_query = ElasticQuery(BoolQuery(must_arr=must, must_not_arr=must_not))
        other_hits = Search(other_hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'),
                            size=100).search()
        region['extra_markers'] = StudyHitDocument.process_hits(other_hits.docs, region['all_diseases'])

    context['regions'] = regions
    context['disease_code'] = [dis]
    context['disease'] = getattr(disease, "name")
    return context
def get_disease(cls, request, disease, context):
    '''Add the documents of the requested disease(s) to the template context.

    Each disease document found is decorated with ``regions`` (the regions
    of its loci, each with ``cand_genes`` and the ``diseases`` sharing one of
    its markers) and ``studies`` (study documents enriched with date, journal
    and first author surname of the principal paper).

    @type  disease: string
    @param disease: comma separated disease code(s); matched in lower case.
    @type  context: dict
    @param context: updated with ``features`` and ``title``.
    @return: the updated context.
    @raise Http404: when no disease is given, or when the number of hits is
        zero or nine and above.
    '''
    # Validate before calling a str method: lower() on None would raise
    # AttributeError instead of the intended 404.
    if disease is None:
        messages.error(request, 'No disease given.')
        raise Http404()
    disease = disease.lower()

    # NOTE(review): the codes are passed as a list nested in a list - confirm
    # Query.terms copes with that.
    query = ElasticQuery(Query.terms("code", [disease.split(',')]))
    elastic = Search(query, idx=ElasticSettings.idx('DISEASE', 'DISEASE'), size=5)
    res = elastic.search()
    if res.hits_total == 0:
        messages.error(request, 'Disease(s) '+disease+' not found.')
    elif res.hits_total < 9:
        disease_docs = res.docs
        names = ', '.join([getattr(doc, 'name') for doc in disease_docs])

        # mapping of the stats indices; its _meta sections name the disease of each stats type
        meta_response = Search.elastic_request(ElasticSettings.url(),
                                               ElasticSettings.idx("IC_STATS") + '/_mapping',
                                               is_post=False)
        elastic_meta = json.loads(meta_response.content.decode("utf-8"))

        for dis in disease_docs:
            dis_code = getattr(dis, 'code').upper()
            docs = DiseaseLocusDocument.get_disease_loci_docs(dis_code)
            regions = []
            ens_all_cand_genes = []
            all_markers = []
            for r in docs:
                region = r.get_disease_region()
                if region is not None:
                    regions.append(region)
                    ens_all_cand_genes.extend(region['ens_cand_genes'])
                    all_markers.extend(region['markers'])

            # get ensembl to gene symbol mapping for all candidate genes
            all_cand_genes = gene.utils.get_gene_docs_by_ensembl_id(ens_all_cand_genes)
            for region in regions:
                region['cand_genes'] = {cg: all_cand_genes[cg] for cg in region.pop("ens_cand_genes", None)}
            setattr(dis, 'regions', regions)

            # look for pleiotropy by looking for diseases for the markers in IC_STATS and other study hits
            stats_query = ElasticQuery.filtered(Query.terms("marker", all_markers),
                                                Filter(RangeQuery("p_value", lte=5E-08)),
                                                sources=['marker'])
            stats_docs = Search(stats_query, idx=ElasticSettings.idx("IC_STATS"),
                                size=len(all_markers)).search().docs
            other_hits_query = ElasticQuery(
                BoolQuery(must_arr=[RangeQuery("tier", lte=2), Query.terms("marker", all_markers)]),
                sources=['marker', 'disease'])
            other_hits = Search(other_hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'),
                                size=5000).search().docs

            for region in regions:
                diseases = [dis_code]
                for doc in stats_docs:
                    if getattr(doc, 'marker') in region['markers']:
                        meta_info = elastic_meta[doc.index()]['mappings'][doc.type()]['_meta']
                        if meta_info['disease'] not in diseases:
                            diseases.append(meta_info['disease'])
                for doc in other_hits:
                    if getattr(doc, 'marker') in region['markers']:
                        if doc.disease is not None and doc.disease not in diseases:
                            diseases.append(doc.disease)
                region['diseases'] = diseases

            studies = StudyDocument.get_studies(disease_code=dis_code)
            for doc in studies:
                setattr(doc, 'study_id', getattr(doc, 'study_id').replace('GDXHsS00', ''))
                pmid = getattr(doc, 'principal_paper')
                pubs = PublicationDocument.get_publications(pmid, sources=['date', 'authors.name', 'journal'])
                if len(pubs) > 0:
                    authors = getattr(pubs[0], 'authors')
                    setattr(doc, 'date', getattr(pubs[0], 'date'))
                    setattr(doc, 'journal', getattr(pubs[0], 'journal'))
                    # last word of the first author's name
                    setattr(doc, 'author', authors[0]['name'].rsplit(None, 1)[-1] if authors else "")
            setattr(dis, 'studies', studies)

        context['features'] = disease_docs
        context['title'] = names
        return context
    raise Http404()
def process_criteria(cls, feature, section, config, sub_class, test=False):
    ''' Top level function that calls the right criteria implementation based on the subclass passed.
    Iterates over all the documents using the ScanAndScroll and the hits are processed by the inner
    function process_hits. The entire result is stored in result_container (a dict), and at the end
    of the processing, the result is loaded in to the elastic index after creating the mapping.

    In test mode the source index is read in pages of 20 hits until at least one result has been
    collected or no further hits are returned.

    @type  feature: string
    @param feature: feature type, could be 'gene','region', 'marker' etc.,
    @type  section: string
    @keyword section: The section in the criteria.ini file
    @type  config:  string
    @keyword config: The config object initialized from criteria.ini; when None it is read from
                     criteria.ini (test_criteria.ini in test mode).
    @type  sub_class: string
    @param sub_class: The name of the inherited sub_class where the actual implementation is
    @type  test: bool
    @keyword test: run in test mode
    '''
    global gl_result_container
    gl_result_container = {}
    test_mode = test

    if config is None:
        ini_file = 'test_criteria.ini' if test_mode else 'criteria.ini'
        config = CriteriaManager().get_criteria_config(ini_file=ini_file)

    section_config = config[section]
    source_idx = section_config['source_idx']

    if ',' in source_idx:
        # several index keys: resolve each of them and join the index names again
        idxs = source_idx.split(',')
        idx_all = [ElasticSettings.idx(idx) for idx in idxs]
        source_idx = ','.join(idx_all)
    else:
        source_idx = ElasticSettings.idx(section_config['source_idx'])

    source_idx_type = None
    if 'source_idx_type' in section_config:
        source_idx_type = section_config['source_idx_type']

    if source_idx_type is not None:
        source_idx = ElasticSettings.idx(section_config['source_idx'],
                                         idx_type=section_config['source_idx_type'])
    else:
        source_idx_type = ''

    logger.warning(source_idx + ' ' + source_idx_type)

    def process_hits(resp_json):
        # tag every hit, accumulating into the module level result container
        global gl_result_container
        global hit_counter
        hits = resp_json['hits']['hits']
        for hit in hits:
            hit_counter = hit_counter + 1
            result_container = sub_class.tag_feature_to_disease(hit, section, config,
                                                                result_container=gl_result_container)
            gl_result_container = result_container

            # a handful of results is enough in test mode
            if test_mode:
                if gl_result_container is not None and len(gl_result_container) > 5:
                    return

    query = cls.get_elastic_query(section, config)

    if test_mode:
        result_size = len(gl_result_container)
        from_ = 0
        size_ = 20
        url = ElasticSettings.url()
        paged = 'mhc' not in section
        while result_size < 1:
            # NOTE(review): the offset is advanced before the first request, so
            # the first page (from=0) is never read - confirm this is intended.
            from_ = from_ + size_
            if paged:
                url_search = (source_idx + '/_search?from=' + str(from_) + '&size=' + str(size_))
            else:
                url_search = (source_idx + '/_search')

            if query is None:
                match_all = {
                    "query": {"match_all": {}},
                    "size": 20
                }
                response = Search.elastic_request(url, url_search, data=json.dumps(match_all))
            else:
                response = Search.elastic_request(url, url_search, data=json.dumps(query.query))

            resp_json = response.json()
            process_hits(resp_json)
            if gl_result_container is not None:
                result_size = len(gl_result_container)

            # Without this guard the loop never ends when nothing gets tagged:
            # the un-paged request would be repeated unchanged, and a paged
            # request past the last document keeps returning no hits.
            if not paged or not resp_json['hits']['hits']:
                break
    else:
        ScanAndScroll.scan_and_scroll(source_idx, call_fun=process_hits, query=query)

    cls.map_and_load(feature, section, config, gl_result_container)
def marker_is_gwas_significant_in_ic(cls, hit, section=None, config=None, result_container=None):
    """Tag a marker to a disease when its p value is genome wide significant.

    Equivalent search:
    /hg38_gwas_statistics,hg38_ic_statistics/_search?pretty'
    -d '{"query":{"range":{"p_value":{"lt": 0.00000005}}}}'

    The disease and study id are read from the ``_meta`` section of the
    mapping of the hit's index type.

    @type  hit: dict
    @param hit: elastic hit with ``_source``, ``_id``, ``_index`` and ``_type``.
    @type  result_container: dict
    @keyword result_container: results collected so far; a new dict is
        used when omitted.
    @return: the container returned by ``cls.populate_container`` for a
        significant hit, otherwise ``result_container`` unchanged.
    """
    # A mutable default would be shared by every call that omits the
    # argument, so the container is created per call instead.
    if result_container is None:
        result_container = {}

    gw_sig_p = 0.00000005
    feature_doc = hit["_source"]
    feature_doc["_id"] = hit["_id"]
    idx = hit["_index"]
    idx_type = hit["_type"]

    # get meta data: study id and disease
    elastic_url = ElasticSettings.url()
    meta_url = idx + "/" + idx_type + "/_mapping"
    meta_response = Search.elastic_request(elastic_url, meta_url, is_post=False)
    try:
        elastic_meta = json.loads(meta_response.content.decode("utf-8"))
        meta_info = elastic_meta[idx]["mappings"][idx_type]["_meta"]
        disease = meta_info["disease"]
        dil_study_id = meta_info["study"]
    except (AttributeError, KeyError, TypeError, ValueError):
        # missing or malformed meta data: treated as "no disease" below
        disease = None
        dil_study_id = None

    marker = feature_doc["marker"] if "marker" in feature_doc else None
    if marker is None or disease is None:
        return result_container

    p_val = feature_doc["p_value"]
    if p_val is None:
        return result_container

    global counter
    counter = counter + 1

    p_val_to_compare = float(p_val)
    if not p_val_to_compare < gw_sig_p:
        return result_container

    if dil_study_id is None or dil_study_id == "None":
        first_author = "NA"
        dil_study_id = "NA"
    else:
        # the first author of the study document is used as link name
        query = ElasticQuery(Query.ids([dil_study_id]))
        elastic = Search(search_query=query, idx=ElasticSettings.idx("STUDY", "STUDY"), size=1)
        study_doc = elastic.search().docs[0]
        author = getattr(study_doc, "authors")[0]
        first_author = author["name"] + " " + author["initials"]

    fnotes = {
        "linkdata": "pval",
        "linkvalue": p_val_to_compare,
        "linkid": dil_study_id,
        "linkname": first_author,
    }
    return cls.populate_container(
        dil_study_id,
        first_author,
        fnotes=fnotes,
        features=[marker],
        diseases=[disease],
        result_container=result_container,
    )
def add_study_data(self, **options):
    ''' add gwas stats from a study

    Deletes the existing study hits of ``options['study_id']`` and loads one
    study hit document per row of the CSV file ``options['addStudyData']``
    (comma delimited, ``|`` as quote character).  Rows that cannot be loaded
    are skipped; the collected error and warning messages are printed at the
    end.

    CSV columns:
       0 - Marker
       1 - disease
       2 - Chromosome
       3 - Region Start
       4 - Region End
       5 - Position
       6 - Strand
       7 - Major Allele
       8 - Minor allele
       9 - Minor allele frequency
      10 - Discovery P value
      11 - Discovery Odds ratio
      12 - Discovery 95% confidence interval lower limit
      13 - Discovery 95% confidence interval upper limit
      14 - Replication P value
      15 - Replication Odds ratio
      16 - Replication 95% confidence interval lower limit
      17 - Replication 95% confidence interval upper limit
      18 - Combined P value
      19 - Combined Odds ratio
      20 - Combined 95% confidence interval lower limit
      21 - Combined 95% confidence interval upper limit
      22 - PP Colocalisation
      23 - Gene
      24 - PubMed ID
      25 - Other Signal
      26 - Notes
      27 - Curation status/ failed quality control
    '''
    study = options['study_id']
    csv_path = options['addStudyData']
    message = ""
    print("Deleting study hits for " + study)
    Delete.docs_by_query(ElasticSettings.idx('REGION', 'STUDY_HITS'), query=Query.term("dil_study_id", study))

    with open(csv_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            # csv.reader yields an empty list for blank lines; skip those and the header row
            if not row or row[0] == 'Marker':
                continue

            query = ElasticQuery(Query.match("id", row[0]))
            result = Search(search_query=query, idx=ElasticSettings.idx('MARKER', 'MARKER')).search()
            if result.hits_total == 0:
                # Not a known marker id: match it against "rshigh" in the
                # history index and retry with that document's "rscurrent".
                result2 = Search(search_query=ElasticQuery(Query.match("rshigh", row[0])),
                                 idx=ElasticSettings.idx('MARKER', 'HISTORY')).search()
                if result2.hits_total > 0:
                    history_doc = result2.docs[0]
                    new_id = getattr(history_doc, "rscurrent")
                    query = ElasticQuery(Query.match("id", new_id))
                    result = Search(search_query=query, idx=ElasticSettings.idx('MARKER', 'MARKER')).search()

            if result.hits_total != 1:
                message += "ERROR loading row of gwas data for " + row[0] + " - Marker cannot be found; <br />\n"
                # skip the row like the other error cases do; docs[0] would
                # raise IndexError or pick an arbitrary marker
                continue
            marker = result.docs[0]

            query = ElasticQuery(Query.match("code", row[1]))
            result = Search(query, idx=ElasticSettings.idx('DISEASE', 'DISEASE')).search()
            if result.hits_total != 1:
                message += "ERROR loading row of gwas data for " + row[0] + " - Disease cannot be found; <br />\n"
                continue
            disease = result.docs[0]

            if not re.match(r"^\w$", row[7]):
                message += "ERROR loading row of gwas data for " + row[0] + " - Major allele is not set; <br />\n"
                continue
            if not re.match(r"^\w$", row[8]):
                message += "ERROR loading row of gwas data for " + row[0] + " - Minor allele is not set; <br />\n"
                continue
            if float(row[9]) > 0.5:
                message += "WARNING - MAF for " + row[0] + " is >0.5; <br />\n"

            strand = row[6]
            if re.match(r"\d+$", strand):
                # csv values are text; convert before the numeric comparison
                strand = '+' if int(strand) > 0 else '-'
            row[6] = strand

            if not re.match(r"\d+", row[2]):
                row[2] = getattr(marker, "seqid")
            # the position of the marker document always overrides the one in the file
            row[5] = getattr(marker, "start")

            data = {
                "chr_band": self._get_chr_band(row[2], row[5]),
                "other_signal": row[25],
                "species": "Human",
                "disease": getattr(disease, "code"),
                "notes": row[26],
                "disease_locus": "TBC",
                "dil_study_id": study,
                "marker": getattr(marker, "id"),
                "status": "N",
                "pp_probability": row[22],
                "tier": 100,
                "pmid": row[24],
                "genes": self._get_ens_gene(row[23])
            }

            build_info = self._get_current_build_info(row[2], row[5])
            data['build_info'] = [build_info]

            data['p_values'] = {
                'discovery': row[10],
                'replication': row[14],
                'combined': row[18]
            }
            data['odds_ratios'] = {
                'discovery': {"or": row[11], "lower": row[12], "upper": row[13]},
                'replication': {"or": row[15], "lower": row[16], "upper": row[17]},
                'combined': {"or": row[19], "lower": row[20], "upper": row[21]}
            }
            data['alleles'] = {
                'major': row[7],
                'minor': row[8],
                'maf': row[9]
            }
            data['suggest'] = {'input': [], 'weight': 1}

            r = Search.elastic_request(ElasticSettings.url(), ElasticSettings.idx('REGION', 'STUDY_HITS'),
                                       json.dumps(data))
            if r.status_code != 201:
                message += "ERROR loading row of gwas data for " + row[0] + " - Failed to create document; <br />\n"

    print("\n\n" + message)