def test_gene_pubs(self):
    ''' Check the difference between the pubs indexed and those from the
    gene_pub file from the NCBI.

    If the publication pipeline has not been run recently there is likely to
    be a difference. This is allowed for with the NUM_DIFF variable. If there
    is a larger difference than this then the publication pipeline should be
    run.

    The downloaded file is always removed afterwards, including when an
    assertion fails or an exception is raised part-way through.
    '''
    ini = IniParser()
    config = ini.read_ini('publications.ini')
    section = config['GENE']

    file_name = 'gene_pub_test.tmp'
    download_file = os.path.join(DiseasePublicationTest.TEST_DATA_DIR, file_name)
    try:
        success = FTPDownload().download(urljoin(section['location'], section['files']),
                                         DiseasePublicationTest.TEST_DATA_DIR,
                                         file_name=file_name)
        self.assertTrue(success, 'downloaded gene publications file')

        # Collect the distinct values of the third tab-separated column
        # (treated as the PMID) from rows whose first column is '9606'.
        # NOTE(review): 9606 is presumably the NCBI taxonomy id for human - confirm.
        pmids = set()
        with gzip.open(download_file, 'rt') as outf:
            for line in outf:
                if not line.startswith('9606\t'):
                    continue
                pmids.add(line.split('\t')[2].strip())
        pmids = list(pmids)

        elastic = Search(search_query=ElasticQuery(BoolQuery(b_filter=Filter(Query.ids(pmids))),
                                                   sources=['pmid']),
                         idx=ElasticSettings.idx('PUBLICATION'), size=len(pmids)*2)
        self.assertLess(len(pmids)-elastic.get_count()['count'], GenePublicationTest.NUM_DIFF,
                        'Count for gene publications')

        # check for differences in pmids
        # pmids_in_idx = []
        #
        # def get_pmids(resp_json):
        #     hits = resp_json['hits']['hits']
        #     pmids_in_idx.extend([getattr(Document(h), "pmid") for h in hits])
        #
        # ScanAndScroll.scan_and_scroll(idx=ElasticSettings.idx('PUBLICATION'), call_fun=get_pmids,
        #                               query=ElasticQuery(BoolQuery(b_filter=Filter(Query.ids(pmids))),
        #                                                  sources=['pmid']),
        #                               time_to_keep_scoll=30)
        # pmids_diff = list(set(pmids) - set(pmids_in_idx))
        # self.assertLess(len(pmids_diff), GenePublicationTest.NUM_DIFF)
    finally:
        # Clean up even on failure so a stale download never lingers in the
        # test data directory; the file may be absent if the download failed.
        if os.path.exists(download_file):
            os.remove(download_file)
def setUpClass(cls):
    ''' Retrieve the publication list for each disease from NCBI.

    The disease codes are read from the DISEASE index, turned into
    'DISEASE::<CODE>' section names, and every matching section of
    publications.ini is downloaded into cls.TEST_DATA_DIR. Each downloaded
    disease is recorded in DiseasePublicationTest.DISEASES.
    '''
    parser = IniParser()
    config = parser.read_ini('publications.ini')
    disease_docs = Search(ElasticQuery(Query.match_all(), sources=['code']),
                          idx=ElasticSettings.idx('DISEASE')).search().docs

    # Comma-terminated list of section names, one per indexed disease.
    # To restrict the run by hand, replace with e.g.:
    # sections = 'DISEASE::T1D,DISEASE::MS,DISEASE::SLE'
    sections = ''.join('DISEASE::' + doc.code.upper() + ',' for doc in disease_docs)

    # download ncbi publication lists for each disease
    for name in config.sections():
        if not parser._is_section_match(name, sections):
            continue
        parser._inherit_section(name, config)
        logger.debug(name)

        settings = config[name]
        disease = name.split('::')[1]
        url = settings['location'] + "?" + settings['http_params']
        HTTPDownload().download(url, cls.TEST_DATA_DIR,
                                file_name='disease_pub_' + disease + '.tmp')
        DiseasePublicationTest.DISEASES.append(disease)
    print()
def get_criteria_config(cls, ini_file='criteria.ini'):
    ''' Build the criteria config from an ini file.

    The file is looked up relative to the directory two levels above this
    module; names containing 'test' are looked up in its 'test' sub-directory.

    @type  ini_file: string
    @keyword ini_file: Name of the ini file (default 'criteria.ini').
    @return: The parsed config, or None when the resolved file does not exist.
    '''
    package_root = os.path.dirname(os.path.dirname(__file__))
    if 'test' in ini_file:
        path_parts = (package_root, 'test', ini_file)
    else:
        path_parts = (package_root, ini_file)
    ini_path = os.path.join(*path_parts)

    # Nothing to parse: signal the missing file with None.
    if not os.path.isfile(ini_path):
        return None
    return IniParser.read_ini(cls, ini_file=ini_path)
def tag_feature_to_all_diseases(cls, feature_id, section, config, result_container=None):
    ''' Tag the feature to all the site-enabled diseases; used to tag
    features in the MHC region.

    @type  feature_id: string
    @keyword feature_id: Id of the feature (gene => gene_id, region => region_id)
    @type  section: string
    @keyword section: The section in the criteria.ini file (not consulted by
                      this function; kept for interface compatibility).
    @type  config: object
    @keyword config: The config object initialized from criteria.ini. When None
                     it is read from 'criteria.ini'; it is not otherwise used here.
    @type  result_container: dict
    @keyword result_container: Container for the result, keyed by feature_id.
                               Updated in place when supplied; a fresh dict is
                               created when omitted.
    @return: The result container, with result_container[feature_id] holding
             the criteria/disease dict built up over cls.site_enabled_diseases.
    '''
    # Create the default per call: a shared ``{}`` default argument would
    # carry results over between unrelated calls.
    if result_container is None:
        result_container = {}

    if config is None:
        # NOTE(review): read_ini appears to be an instance method, so it is
        # called on an instance rather than on the class - confirm.
        config = IniParser().read_ini(ini_file='criteria.ini')

    for disease in cls.site_enabled_diseases:
        criteria_dict = cls.get_criteria_dict(disease, disease)
        # Extend the feature's existing non-empty entry, otherwise start from
        # an empty dict.
        existing = result_container.get(feature_id)
        seed = existing if existing else {}
        result_container[feature_id] = cls.get_criteria_disease_dict([disease], criteria_dict, seed)

    return result_container