def test_mapping(self):
    ''' Test retrieving the mapping for an index. '''
    elastic = Search(idx=ElasticSettings.idx('DEFAULT'))
    mapping = elastic.get_mapping()
    self.assertTrue(ElasticSettings.idx('DEFAULT') in mapping, "Database name in mapping result")
    if ElasticSettings.idx('DEFAULT') in mapping:
        self.assertTrue("mappings" in mapping[ElasticSettings.idx('DEFAULT')], "Mapping result found")

    # check using the index type
    mapping = elastic.get_mapping('marker')
    self.assertTrue(ElasticSettings.idx('DEFAULT') in mapping, "Database name in mapping result")

    # error check: an unknown index type should return an error
    mapping = elastic.get_mapping('marker/xx')
    self.assertTrue('error' in mapping, "Error returned for unknown index type")
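# For reference, a sketch (not taken verbatim from the library) of the
# response shapes the assertions above rely on, mirroring Elasticsearch's
# GET _mapping API:
#
#   elastic.get_mapping()            -> {"<idx>": {"mappings": {"marker": {"properties": {...}}}}}
#   elastic.get_mapping('marker/xx') -> {"error": ...}  # unknown index type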
def test_gene_history_loader(self):
    """ Test the gene history loading. """
    call_command("pipeline", "--steps", "load", sections="GENE_HISTORY",
                 dir=TEST_DATA_DIR, ini=MY_INI_FILE)

    INI_CONFIG = IniParser().read_ini(MY_INI_FILE)
    idx = INI_CONFIG["GENE_HISTORY"]["index"]
    idx_type = INI_CONFIG["GENE_HISTORY"]["index_type"]
    elastic = Search(idx=idx, idx_type=idx_type)
    Search.index_refresh(idx)

    self.assertTrue(elastic.get_count()["count"] > 1, "Count documents in the index")

    map1_props = Gene.gene_history_mapping(idx, idx_type, test_mode=True).mapping_properties
    map2_props = elastic.get_mapping()
    if idx not in map2_props:
        logger.error("MAPPING ERROR: " + json.dumps(map2_props))
    self._cmpMappings(map2_props[idx]["mappings"], map1_props, idx_type)
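# A minimal sketch of what a mapping-comparison helper like _cmpMappings()
# might do -- hypothetical, not the project's implementation, and it assumes
# mapping_properties is a dict keyed by index type with a "properties" dict:
# every property defined at load time should appear, with the same type, in
# the mapping reported back by the index.
def _cmp_mappings_sketch(self, idx_mappings, defined_props, idx_type):
    indexed = idx_mappings[idx_type]["properties"]
    for name, props in defined_props[idx_type]["properties"].items():
        self.assertIn(name, indexed, name + " in indexed mapping")
        if "type" in props:
            self.assertEqual(props["type"], indexed[name].get("type"), name)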
def _search_engine(query_dict, user_filters, user):
    ''' Carry out a search and add results to the context object. '''
    user_query = query_dict.get("query")
    query = _gene_lookup(user_query)

    source_filter = [
        'symbol', 'synonyms', "dbxrefs.*", 'biotype', 'description',  # gene
        'id', 'rscurrent', 'rshigh',                                  # marker
        'journal', 'title', 'tags.disease',                           # publication
        'name', 'code',                                               # disease
        'study_id', 'study_name',                                     # study
        'region_name', 'marker']                                      # regions

    if re.compile(r'^[0-9 ]+$').findall(query):
        source_filter.append('pmid')  # publication - possible PMID(s)

    search_fields = []
    maxsize = 20
    if user_filters.getlist("maxsize"):
        maxsize = int(user_filters.get("maxsize"))

    # build search_fields from user input filter fields
    for it in user_filters.items():
        if len(it) == 2:
            if it[0] == 'query':
                continue
            parts = it[1].split(":")
            if len(parts) == 3:
                search_fields.append(parts[1]+"."+parts[2])
            elif len(parts) == 2:
                search_fields.append(parts[1])
    if len(search_fields) == 0:
        search_fields = list(source_filter)
        search_fields.extend(['abstract', 'authors.name',  # publication
                              'authors', 'pmids',          # study
                              'markers', 'genes'])         # study/region
    source_filter.extend(['date', 'pmid', 'build_id', 'ref', 'alt',
                          'chr_band', 'disease_locus', 'disease_loci', 'region_id'])

    idx_name = query_dict.get("idx")
    idx_dict = ElasticSettings.search_props(idx_name, user)
    query_filters = _get_query_filters(user_filters, user)

    highlight = Highlight(search_fields, pre_tags="<strong>", post_tags="</strong>",
                          number_of_fragments=0)
    sub_agg = Agg('idx_top_hits', 'top_hits', {"size": maxsize, "_source": source_filter,
                                               "highlight": highlight.highlight['highlight']})
    aggs = Aggs([Agg("idxs", "terms", {"field": "_index"}, sub_agg=sub_agg),
                 Agg("biotypes", "terms", {"field": "biotype", "size": 0}),
                 Agg("categories", "terms", {"field": "_type", "size": 0})])

    # create score functions
    score_fns = _build_score_functions(idx_dict)
    equery = BoolQuery(must_arr=Query.query_string(query, fields=search_fields),
                       should_arr=_auth_arr(user),
                       b_filter=query_filters,
                       minimum_should_match=1)
    search_query = ElasticQuery(FunctionScoreQuery(equery, score_fns, boost_mode='replace'))

    elastic = Search(search_query=search_query, aggs=aggs, size=0,
                     idx=idx_dict['idx'], idx_type=idx_dict['idx_type'])
    result = elastic.search()
    mappings = elastic.get_mapping()
    _update_mapping_filters(mappings, result.aggs)
    _update_biotypes(user_filters, result)

    return {'data': _top_hits(result), 'aggs': result.aggs,
            'query': user_query, 'idx_name': idx_name, 'fields': search_fields,
            'mappings': mappings, 'hits_total': result.hits_total,
            'maxsize': maxsize, 'took': result.took}
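# Hypothetical usage sketch ('search_page_sketch' and the 'idx=ALL' key are
# illustrative, not from the project): _search_engine() expects QueryDict-like
# arguments, so a Django view could call it roughly like this.
from django.http import QueryDict


def search_page_sketch(request):
    query_dict = QueryDict('query=PTPN22&idx=ALL')  # normally built from request.POST/GET
    context = _search_engine(query_dict, request.GET, request.user)
    # context['data'] holds the top hits grouped by index; context['aggs']
    # carries the biotype/category aggregations for filter widgets.
    return context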
def test_gene_pipeline(self):
    """ Test gene pipeline. """
    INI_CONFIG = IniParser().read_ini(MY_INI_FILE)
    idx = INI_CONFIG["ENSEMBL_GENE_GTF"]["index"]
    idx_type = INI_CONFIG["ENSEMBL_GENE_GTF"]["index_type"]

    """ 1. Test Ensembl GTF loading. """
    call_command("pipeline", "--steps", "stage", "load", sections="ENSEMBL_GENE_GTF",
                 dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)

    elastic = Search(idx=idx, idx_type=idx_type)
    self.assertGreaterEqual(elastic.get_count()["count"], 1, "Count documents in the index")

    map1_props = Gene.gene_mapping(idx, idx_type, test_mode=True).mapping_properties
    map2_props = elastic.get_mapping()
    if idx not in map2_props:
        logger.error("MAPPING ERROR: " + json.dumps(map2_props))
    self._cmpMappings(map2_props[idx]["mappings"], map1_props, idx_type)

    """ 2. Test adding entrez IDs to documents. """
    call_command("pipeline", "--steps", "load", sections="GENE2ENSEMBL",
                 dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)

    query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    self.assertEqual(len(docs), 1)
    self.assertTrue("entrez" in getattr(docs[0], "dbxrefs"))
    self.assertEqual(getattr(docs[0], "dbxrefs")["entrez"], "26191")

    """ 3. Add UniProt IDs and fill in missing entrez fields. """
    call_command("pipeline", "--steps", "download", "load", sections="ENSMART_GENE",
                 dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)

    query = ElasticQuery.query_string("DNMT3L", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    self.assertTrue("entrez" in getattr(docs[0], "dbxrefs"))
    self.assertTrue("swissprot" in getattr(docs[0], "dbxrefs"))

    """ 4. Add gene synonyms and dbxrefs. """
    call_command("pipeline", "--steps", "load", sections="GENE_INFO",
                 dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)

    query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    self.assertTrue("PTPN8" in getattr(docs[0], "synonyms"))

    """ 5. Add PMIDs to gene docs. """
    call_command("pipeline", "--steps", "load", sections="GENE_PUBS",
                 dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)

    query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    self.assertGreater(len(getattr(docs[0], "pmids")), 0)

    """ 6. Add ortholog data. """
    call_command("pipeline", "--steps", "load", sections="ENSMART_HOMOLOG",
                 dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)

    query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    dbxrefs = getattr(docs[0], "dbxrefs")
    self.assertTrue("orthologs" in dbxrefs, dbxrefs)
    self.assertTrue("mmusculus" in dbxrefs["orthologs"], dbxrefs)
    self.assertEqual("ENSMUSG00000027843", dbxrefs["orthologs"]["mmusculus"]["ensembl"])

    query = ElasticQuery.filtered(
        Query.match_all(),
        TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl",
                                     ["ENSMUSG00000027843"]))
    docs = Search(query, idx=idx, size=1).search().docs
    self.assertEqual(len(docs), 1)

    """ 7. Add mouse ortholog link to MGI. """
    call_command("pipeline", "--steps", "load", sections="ENSEMBL2MGI",
                 dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)

    docs = Search(query, idx=idx, size=1).search().docs
    dbxrefs = getattr(docs[0], "dbxrefs")
    self.assertEqual("ENSMUSG00000027843", dbxrefs["orthologs"]["mmusculus"]["ensembl"])
    self.assertEqual("107170", dbxrefs["orthologs"]["mmusculus"]["MGI"])
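# For reference, each call_command() above maps onto a manage.py invocation
# of the pipeline command. Assuming the CLI option names mirror the kwargs
# used here (an assumption, not verified against the command's parser),
# step 6 would look roughly like:
#
#   ./manage.py pipeline --steps load --sections ENSMART_HOMOLOG \
#       --dir <TEST_DATA_DIR> --ini <MY_INI_FILE>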