def test_bulk(self):
    ''' Test Bulk.load().

    Checks that a payload without a trailing newline does not give status
    code 200, that the same payload with a trailing newline adds one
    document to the index, and that updating a document id that does not
    exist is reported through the 'errors' entry of the response. '''
    self.set_up()
    idx = IDX['MARKER']['indexName']
    elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
    hits_total1 = elastic.get_count()['count']

    json_data = '{"index": {"_index": "%s", "_type": "%s"}}\n' % (idx, 'marker')
    json_data += json.dumps({"alt": "G", "start": 946, "seqid": "1", "filter": ".",
                             "ref": "A", "id": "rsXXXXX", "qual": ".", "info": "RS=XXXXX"})
    resp = Bulk.load(idx, '', json_data)
    self.assertNotEqual(resp.status_code, 200)

    # note: needs a trailing line return to work
    Bulk.load(idx, '', json_data + '\n')
    Search.index_refresh(idx)
    hits_total2 = elastic.get_count()['count']
    self.assertEqual(hits_total2, hits_total1+1, "contains documents")

    # produce errors updating doc id that doesn't exist
    # NOTE(review): json_data still has no trailing newline here, so the
    # delete action is appended onto the end of the document line rather
    # than starting a line of its own - confirm this is intended.
    json_data += '{"delete": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
        (idx, 'marker', 'XYZ')
    json_data += '{"update": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
        (idx, 'marker', 'XYZ')
    json_data += '{"doc": {"start": 100, "end": 200}}\n'
    resp = Bulk.load(idx, '', json_data)
    # Parse the response body once and reuse it for both checks.
    body = resp.json()
    self.assertTrue('errors' in body and body['errors'])
def test_pubs_disease_tags(self):
    ''' Check the number of disease publications against the number of
    tags.disease and report differences.

    For each disease one report line is built with the index count and the
    number of PMIDs from _get_pmids(); on a mismatch the extra and missing
    PMIDs are appended. The report is printed and the test fails if any
    disease count differed. '''
    all_counts_match = True
    report_parts = []
    for disease in DiseasePublicationTest.DISEASES:
        pmids = self._get_pmids(disease)
        disease_code = disease.lower()
        tag_query = ElasticQuery(
            BoolQuery(b_filter=Filter(Query.term('tags.disease', disease_code))),
            sources=['pmid'])
        elastic = Search(search_query=tag_query,
                         idx=ElasticSettings.idx('PUBLICATION'),
                         size=len(pmids)*2)
        res = elastic.get_count()
        report_parts.append(disease_code+'\tINDEX: '+str(res['count'])+'\tNCBI: '+str(len(pmids)))

        if res['count'] != len(pmids):
            all_counts_match = False
            # Only fetch the documents when the counts disagree.
            pmids_in_idx = [getattr(doc, 'pmid') for doc in elastic.search().docs]
            extra = [pmid for pmid in pmids_in_idx if pmid not in pmids]
            missing = [pmid for pmid in pmids if pmid not in pmids_in_idx]
            if extra:
                report_parts.append('\textra PMIDs: '+str(extra))
            if missing:
                report_parts.append('\tmissing PMIDs: '+str(missing))
        report_parts.append('\n')

    print(''.join(report_parts))
    self.assertTrue(all_counts_match, 'Count for disease tags')
def test_marker_pipeline(self):
    """ Test marker pipeline.

    Runs the "load" step for the DBSNP and RSMERGEARCH sections in turn and
    checks that the index configured for each section contains documents. """

    def load_section(section):
        # Run the pipeline "load" step for one ini section.
        call_command("pipeline", "--steps", "load", sections=section,
                     dir=TEST_DATA_DIR, ini=MY_INI_FILE)

    def assert_has_docs(section_config):
        # Refresh the section's index and check it holds at least one doc.
        idx = section_config["index"]
        idx_type = section_config["index_type"]
        elastic = Search(idx=idx, idx_type=idx_type)
        Search.index_refresh(idx)
        self.assertGreater(elastic.get_count()["count"], 0)

    load_section("DBSNP")
    ini_config = IniParser().read_ini(MY_INI_FILE)
    assert_has_docs(ini_config["DBSNP"])

    load_section("RSMERGEARCH")
    assert_has_docs(ini_config["RSMERGEARCH"])
def test_delete_docs_by_query(self):
    ''' Test deleting docs using a query.

    Deletes a single document with a term query on "id" and checks the
    count drops by one, then calls Delete.docs_by_query(idx, 'marker') and
    checks the index is empty. '''
    self.set_up()
    idx = IDX['MARKER']['indexName']
    elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
    hits_total1 = elastic.get_count()['count']
    self.assertGreater(hits_total1, 0, "contains documents")

    # delete single doc
    Delete.docs_by_query(idx, query=Query.term("id", "rs2476601"))
    Search.index_refresh(idx)
    hits_total2 = elastic.get_count()['count']
    self.assertEqual(hits_total2, hits_total1-1, "contains documents")

    # delete remaining docs
    Delete.docs_by_query(idx, 'marker')
    Search.index_refresh(idx)
    self.assertEqual(elastic.get_count()['count'], 0, "contains no documents")
def get_pmids(resp_json):
    ''' Collect the unique PMIDs from the hits of a search response and
    check that the publication index has a document for each of them.

    Uses `self` from the enclosing scope for the assertions. '''
    hits = resp_json['hits']['hits']
    pmids = list(set([Document(hit).pmid for hit in hits]))

    ids_query = ElasticQuery(BoolQuery(b_filter=Filter(Query.ids(pmids))), sources=['pmid'])
    elastic = Search(search_query=ids_query,
                     idx=ElasticSettings.idx('PUBLICATION'),
                     size=len(pmids)*2)

    if len(pmids) != elastic.get_count()['count']:
        # check for differences in pmids
        pmids_in_pub_idx = [found.pmid for found in elastic.search().docs]
        pmids_diff = list(set(pmids) - set(pmids_in_pub_idx))
        self.assertListEqual([], pmids_diff, "PMIDs list empty ("+str(pmids_diff)+")")
    self.assertEqual(len(pmids), elastic.get_count()['count'], 'Count for region publications')
def test_gene_history_loader(self):
    """ Test the gene history loading.

    Runs the "load" step for the GENE_HISTORY section, checks that the
    index holds more than one document and compares the index mapping with
    the properties from Gene.gene_history_mapping(). """
    call_command("pipeline", "--steps", "load", sections="GENE_HISTORY",
                 dir=TEST_DATA_DIR, ini=MY_INI_FILE)

    section = IniParser().read_ini(MY_INI_FILE)["GENE_HISTORY"]
    idx = section["index"]
    idx_type = section["index_type"]

    elastic = Search(idx=idx, idx_type=idx_type)
    Search.index_refresh(idx)
    doc_count = elastic.get_count()["count"]
    self.assertTrue(doc_count > 1, "Count documents in the index")

    expected_props = Gene.gene_history_mapping(idx, idx_type, test_mode=True).mapping_properties
    index_mapping = elastic.get_mapping()
    if idx not in index_mapping:
        # Log the unexpected response before the lookup below is attempted.
        logger.error("MAPPING ERROR: " + json.dumps(index_mapping))
    self._cmpMappings(index_mapping[idx]["mappings"], expected_props, idx_type)
def test_gene_pubs(self):
    ''' Check the difference between the pubs indexed and those from the
    gene_pub file from the NCBI. If the publication pipeline has not been
    run recently there is likely to be a difference. This is allowed for
    with the NUM_DIFF variable. If there is a larger difference than this
    then the publication pipeline should be run.

    The downloaded file is removed when the test finishes, whether or not
    an assertion failed. '''
    ini = IniParser()
    config = ini.read_ini('publications.ini')
    section = config['GENE']

    file_name = 'gene_pub_test.tmp'
    download_file = os.path.join(DiseasePublicationTest.TEST_DATA_DIR, file_name)
    try:
        success = FTPDownload().download(urljoin(section['location'], section['files']),
                                         DiseasePublicationTest.TEST_DATA_DIR,
                                         file_name=file_name)
        self.assertTrue(success, 'downloaded gene publications file')

        # Collect the distinct values of the third tab-separated column for
        # rows starting with '9606' (presumably the human taxonomy id with
        # the PMID in the third column - confirm against the file format).
        pmids = set()
        with gzip.open(download_file, 'rt') as gene_pubs:
            for line in gene_pubs:
                if line.startswith('9606\t'):
                    pmids.add(line.split('\t')[2].strip())
        pmids = list(pmids)

        elastic = Search(search_query=ElasticQuery(BoolQuery(b_filter=Filter(Query.ids(pmids))),
                                                   sources=['pmid']),
                         idx=ElasticSettings.idx('PUBLICATION'), size=len(pmids)*2)
        # Only the difference in counts is asserted; a per-PMID comparison
        # (scan and scroll over the publication index) is not performed.
        self.assertLess(len(pmids)-elastic.get_count()['count'],
                        GenePublicationTest.NUM_DIFF,
                        'Count for gene publications')
    finally:
        # Clean up the temporary download even when an assertion failed;
        # the file may not exist if the download itself did not succeed.
        if os.path.exists(download_file):
            os.remove(download_file)
def test_pub_disease_counts(self):
    ''' Check all publications exist in the publication index.

    For each disease the PMIDs from _get_pmids() are counted in the index
    with an ids filter, then scrolled through to confirm that none of them
    is absent. '''
    for disease in DiseasePublicationTest.DISEASES:
        pmids = self._get_pmids(disease)
        disease_code = disease.lower()

        count_query = ElasticQuery(BoolQuery(b_filter=Filter(Query.ids(pmids))))
        elastic = Search(search_query=count_query,
                         idx=ElasticSettings.idx('PUBLICATION'),
                         size=len(pmids)*2)
        self.assertEqual(elastic.get_count()['count'], len(pmids), 'Count for '+disease_code)

        # check for differences in pmids
        pmids_in_idx = []

        def get_pmids(resp_json):
            # Callback: gather the pmid of every hit in this response.
            hits = resp_json['hits']['hits']
            pmids_in_idx.extend([Document(hit).pmid for hit in hits])

        pub_idx = ElasticSettings.idx('PUBLICATION')
        scroll_query = ElasticQuery(BoolQuery(b_filter=Filter(Query.ids(pmids))),
                                    sources=['pmid'])
        ScanAndScroll.scan_and_scroll(idx=pub_idx, call_fun=get_pmids, query=scroll_query)

        pmids_diff = list(set(pmids) - set(pmids_in_idx))
        self.assertEqual(len(pmids_diff), 0)
def get_docs_count(cls, idx, idx_type):
    '''Return the 'count' value reported by Search.get_count() for the
    given index and index type.'''
    return Search(idx=idx, idx_type=idx_type).get_count()['count']
def test_gene_pipeline(self):
    """ Test gene pipeline.

    Runs the gene pipeline sections one after another against the test data
    and checks the indexed gene documents after each section. """
    ini_config = IniParser().read_ini(MY_INI_FILE)
    gtf_section = ini_config["ENSEMBL_GENE_GTF"]
    idx = gtf_section["index"]
    idx_type = gtf_section["index_type"]

    def run_pipeline(section, *steps):
        # Run the given pipeline steps for one ini section, then refresh
        # the gene index so the changes are searchable.
        call_command("pipeline", "--steps", *steps, sections=section,
                     dir=TEST_DATA_DIR, ini=MY_INI_FILE)
        Search.index_refresh(idx)

    def docs_for_symbol(symbol):
        # Search the gene index on the "symbol" field.
        symbol_query = ElasticQuery.query_string(symbol, fields=["symbol"])
        return Search(symbol_query, idx=idx).search().docs

    # 1. Test ensembl GTF loading.
    run_pipeline("ENSEMBL_GENE_GTF", "stage", "load")
    elastic = Search(idx=idx, idx_type=idx_type)
    self.assertGreaterEqual(elastic.get_count()["count"], 1, "Count documents in the index")
    expected_props = Gene.gene_mapping(idx, idx_type, test_mode=True).mapping_properties
    index_mapping = elastic.get_mapping()
    if idx not in index_mapping:
        logger.error("MAPPING ERROR: " + json.dumps(index_mapping))
    self._cmpMappings(index_mapping[idx]["mappings"], expected_props, idx_type)

    # 2. Test adding entrez ID to documents
    run_pipeline("GENE2ENSEMBL", "load")
    docs = docs_for_symbol("PTPN22")
    self.assertEqual(len(docs), 1)
    self.assertTrue("entrez" in docs[0].dbxrefs)
    self.assertEqual(docs[0].dbxrefs["entrez"], "26191")

    # 3. Add uniprot and fill in missing entrez fields.
    run_pipeline("ENSMART_GENE", "download", "load")
    docs = docs_for_symbol("DNMT3L")
    self.assertTrue("entrez" in docs[0].dbxrefs)
    self.assertTrue("swissprot" in docs[0].dbxrefs)

    # 4. Add gene synonyms and dbxrefs.
    run_pipeline("GENE_INFO", "load")
    docs = docs_for_symbol("PTPN22")
    self.assertTrue("PTPN8" in docs[0].synonyms)

    # 5. Add PMIDs to gene docs.
    run_pipeline("GENE_PUBS", "load")
    docs = docs_for_symbol("PTPN22")
    self.assertGreater(len(docs[0].pmids), 0)

    # 6. Add ortholog data.
    run_pipeline("ENSMART_HOMOLOG", "load")
    docs = docs_for_symbol("PTPN22")
    dbxrefs = docs[0].dbxrefs
    self.assertTrue("orthologs" in dbxrefs, dbxrefs)
    self.assertTrue("mmusculus" in dbxrefs["orthologs"], dbxrefs)
    self.assertEqual("ENSMUSG00000027843", dbxrefs["orthologs"]["mmusculus"]["ensembl"])

    ortholog_query = ElasticQuery.filtered(
        Query.match_all(),
        TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl",
                                     ["ENSMUSG00000027843"]),
    )
    docs = Search(ortholog_query, idx=idx, size=1).search().docs
    self.assertEqual(len(docs), 1)

    # 7. Add mouse ortholog link to MGI
    run_pipeline("ENSEMBL2MGI", "load")
    # Re-run the ortholog filter query from step 6 against the updated docs.
    docs = Search(ortholog_query, idx=idx, size=1).search().docs
    dbxrefs = docs[0].dbxrefs
    self.assertEqual("ENSMUSG00000027843", dbxrefs["orthologs"]["mmusculus"]["ensembl"])
    self.assertEqual("107170", dbxrefs["orthologs"]["mmusculus"]["MGI"])
def test_count_with_query(self):
    ''' Test counting the documents returned by a query: a term query on
    "id" against the DEFAULT index is expected to count exactly one. '''
    id_query = ElasticQuery(Query.term("id", "rs768019142"))
    default_idx = ElasticSettings.idx('DEFAULT')
    doc_count = Search(id_query, idx=default_idx).get_count()['count']
    self.assertTrue(doc_count == 1, "Elastic count with a query")
def test_count(self):
    ''' Test counting the documents in an index: the DEFAULT index is
    expected to hold more than one document. '''
    default_idx = ElasticSettings.idx('DEFAULT')
    doc_count = Search(idx=default_idx).get_count()['count']
    self.assertTrue(doc_count > 1, "Elastic count documents in an index")