def ensmart_homolog_parse(cls, *args, **kwargs):
    ''' Hand the downloaded ensembl mart result to Gene.ensmart_homolog_parse.

    The download file is opened as text and passed on together with the
    'attrs', 'index' and 'index_type' entries of kwargs['section'].
    '''
    mart_path = cls._get_download_file(*args, **kwargs)
    with open(mart_path, 'rt') as mart_handle:
        section = kwargs['section']
        Gene.ensmart_homolog_parse(mart_handle, section['attrs'], section['index'], section['index_type'])
def gene_info_parse(cls, *args, **kwargs):
    ''' Hand the gzipped gene_info download (NCBI) to Gene.gene_info_parse.

    The file is opened in text mode through gzip and parsed against the
    index named in kwargs['section']['index'].
    '''
    info_path = cls._get_download_file(*args, **kwargs)
    target_index = kwargs['section']['index']
    with gzip.open(info_path, 'rt') as info_handle:
        Gene.gene_info_parse(info_handle, target_index)
def ensembl_gene_parse(cls, *args, **kwargs):
    ''' Parse the gzipped gene GTF download from ensembl into the stage file.

    Gene.gene_mapping is called first with the section's index and index
    type; the result of Gene.ensembl_gene_parse is then dumped as JSON
    (indent=0) to the stage file.
    '''
    stage_path = cls._get_stage_file(*args, **kwargs)
    gtf_path = cls._get_download_file(*args, **kwargs)
    Gene.gene_mapping(kwargs['section']['index'], kwargs['section']['index_type'])
    with gzip.open(gtf_path, 'rt') as gtf_handle, open(stage_path, 'w') as stage_handle:
        parsed_genes = Gene.ensembl_gene_parse(gtf_handle)
        json.dump(parsed_genes, stage_handle, indent=0)
def test__replace_oldids_with_newids(self):
    '''Test that old ids are replaced with new ids and discontinued ids are dropped.'''
    gene_sets = ['339457', '197215', '26191']
    new_gene_ids = {'339457': '85452'}

    # Without discontinued ids only the replacement takes place.
    replaced_gene_sets = Gene._replace_oldids_with_newids(gene_sets, new_gene_ids)
    self.assertEqual(replaced_gene_sets, ['85452', '197215', '26191'], "Replaced 339457 with 85452")

    # With discontinued ids the discontinued entry must also disappear.
    discontinued_ids = ['197215']
    replaced_gene_sets = Gene._replace_oldids_with_newids(gene_sets, new_gene_ids, discontinued_ids)
    self.assertEqual(replaced_gene_sets, ['85452', '26191'],
                     "Replaced 339457 with 85452 and dropped discontinued 197215")
def test__convert_entrezid2ensembl(self):
    '''Test that entrez ids are converted to ensembl ids.'''
    config = IniParser().read_ini("tests/test_download.ini")
    section = config["BIOPLEX"]
    self.assertIsNotNone(section, "Section is not none")

    gene_sets = ['26191']
    ensembl_ids = Gene._convert_entrezid2ensembl(gene_sets, section)
    # assertEqual (rather than assertTrue(len(...) == n)) reports the actual length on failure.
    self.assertEqual(len(ensembl_ids), 1, "Got back one id")
    self.assertEqual(ensembl_ids[0], "ENSG00000134242", "Got back the right ensembl id for 26191")

    gene_sets = ['26191', '339457']
    ensembl_ids = Gene._convert_entrezid2ensembl(gene_sets, section)
    self.assertEqual(len(ensembl_ids), 2, "Got back 2 ensembl ids")
def _process_pathway(cls, download_file, stage_output_file, section, source, is_public, config=None):
    '''Parse a pathway input file (eg: kegg, reactome, go) into a JSON document file.

    INPUT file format (tab-separated, one pathway per line):
        Pathway name <TAB> Pathway url <TAB> List of entrez ids
    e.g.
        REACTOME_RNA_POL_I_TRANSCRIPTION_TERMINATION
        http://www.broadinstitute.org/gsea/msigdb/cards/REACTOME_RNA_POL_I_TRANSCRIPTION_TERMINATION
        1022 2068 2071 25885 284119 2965 2966 2967 2968 4331

    The entrez ids are converted to ensembl ids with
    Gene._entrez_ensembl_lookup; ids missing from the lookup are left out of
    the written gene set.

    download_file     -- path of the tab-separated pathway file
    stage_output_file -- stage path; ".out" is replaced by ".json" to get the JSON target path
    section           -- ini section, passed to the lookup and to cls._load_pathway_mappings
    source            -- value stored under "source" in every document
    is_public         -- value stored under "is_public" in every document
    config            -- passed through to Gene._entrez_ensembl_lookup
    '''
    json_target_file_path = stage_output_file.replace(".out", ".json")

    # Count the lines up front so the last document can be written without a trailing comma.
    with open(download_file, encoding='utf-8') as row_count_file:
        row_count = sum(1 for _ in row_count_file)
    logger.debug('Number of lines in the file ' + str(row_count))

    # First pass: gather every entrez id so the lookup is done in a single call.
    gene_sets = []
    with open(download_file, encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            gene_sets.extend(row[2:])

    ens_look_up = Gene._entrez_ensembl_lookup(gene_sets, section, config)

    # Second pass: write one JSON document per pathway. Both handles are managed
    # by the with-statement so the JSON is flushed and closed before the mappings
    # are loaded below.
    count = 0
    with open(download_file, encoding='utf-8') as csvfile, \
            open(json_target_file_path, mode='w', encoding='utf-8') as json_target_file:
        json_target_file.write('{"docs":[\n')
        reader = csv.reader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            path_object = {
                "pathway_name": row[0],
                "pathway_url": row[1],
                "gene_sets": [ens_look_up[entrez] for entrez in row[2:] if entrez in ens_look_up],
                "source": source,
                "is_public": is_public,
            }
            json_target_file.write(json.dumps(path_object))
            count += 1
            # No comma after the last document, so the "docs" array stays valid JSON.
            json_target_file.write('\n' if row_count == count else ',\n')
        json_target_file.write('\n]}')

    logger.debug("No. genes to load "+str(count))
    logger.debug("Json written to " + json_target_file_path)
    logger.debug("Load mappings")
    status = cls._load_pathway_mappings(section)
    print(status)
def get_ensemb_ids(self, entrez_list):
    ''' Return the result of Gene._entrez_ensembl_lookup for entrez_list.

    The lookup is given a fixed section (index 'genes_hg38_v0.0.2', index
    type 'gene_history') and a config holding that same section under
    'GENE_HISTORY'.
    '''
    section = {'index': 'genes_hg38_v0.0.2', 'index_type': 'gene_history'}
    config = {'GENE_HISTORY': section}
    return Gene._entrez_ensembl_lookup(entrez_list, section, config)
def test__check_gene_history(self):
    '''Test if the right newid is fetched from genehistory'''
    config = IniParser().read_ini("tests/test_download.ini")
    section = config["BIOPLEX"]
    self.assertIsNotNone(section, "Section is not none")

    gene_sets = ['339457', '197215', '26191']
    (newgene_ids, discontinued_ids) = Gene._check_gene_history(gene_sets, section)

    # assertEqual (rather than assertTrue(len(...) == n)) reports the actual length on failure.
    self.assertEqual(len(newgene_ids), 1, "Got back one new id")
    self.assertIn('339457', newgene_ids, "Got back 339457 in new gene ids")
    self.assertEqual(len(discontinued_ids), 1, "Got back one discontinued geneid")
def test__check_gene_history(self):
    '''Test if the right newid is fetched from genehistory'''
    config = IniParser().read_ini(MY_INI_FILE)
    gene_sets = ['56730', '26026', '188', '26191']
    (newgene_ids, discontinued_ids) = Gene._check_gene_history(gene_sets, config)

    # assertEqual (rather than assertTrue(len(...) == n)) reports the actual length on failure.
    self.assertEqual(len(newgene_ids), 2, "Got back two new ids")
    self.assertIn('56730', newgene_ids.keys(), "Got back 56730 in new gene ids -key")
    self.assertIn('84666', newgene_ids.values(), "Got back 84666 in new gene ids -value")
    self.assertIn('188', discontinued_ids, "Got back 188 in new discontinued_ids")
    self.assertEqual(len(discontinued_ids), 1, "Got back one discontinued geneid")
def test_gene_history_loader(self):
    """ Test the gene history loading. """
    call_command("pipeline", "--steps", "load", sections="GENE_HISTORY", dir=TEST_DATA_DIR, ini=MY_INI_FILE)

    ini_config = IniParser().read_ini(MY_INI_FILE)
    idx = ini_config["GENE_HISTORY"]["index"]
    idx_type = ini_config["GENE_HISTORY"]["index_type"]

    elastic = Search(idx=idx, idx_type=idx_type)
    Search.index_refresh(idx)
    # assertGreater reports the actual count on failure, unlike assertTrue(count > 1).
    self.assertGreater(elastic.get_count()["count"], 1, "Count documents in the index")

    # Compare the mapping reported for the index against the one built by Gene.gene_history_mapping.
    map1_props = Gene.gene_history_mapping(idx, idx_type, test_mode=True).mapping_properties
    map2_props = elastic.get_mapping()
    if idx not in map2_props:
        # Log the full response first; the lookup below will then fail with a KeyError.
        logger.error("MAPPING ERROR: " + json.dumps(map2_props))
    self._cmpMappings(map2_props[idx]["mappings"], map1_props, idx_type)
def _process_bioplex(cls, download_file, stage_output_file, section, config):
    '''Process a bioplex data file.

    The interactors are in the first two columns (GeneA, GeneB). They are
    converted to ensembl ids with Gene._entrez_ensembl_lookup and written to
    the stage output file; a pair is written only when both ids are found in
    the lookup.

    Input file format (tab-separated, with header):
        GeneA  GeneB   UniprotA  UniprotB  SymbolA  SymbolB  pW           pNI          pInt
        100    728378  P00813    A5A3E0    ADA      POTEF    2.38086E-09  0.000331856  0.999668142
        100    345651  P00813    Q562R1    ADA      ACTBL2   9.79E-18     0.211914437  0.788085563
    Output file format (tab-separated):
        interactorA      interactorB
        ENSG00000196839  ENSG00000196604
        ENSG00000196839  ENSG00000169067

    download_file     -- path of the bioplex input file
    stage_output_file -- path of the file the converted pairs are written to
    section, config   -- passed to Gene._entrez_ensembl_lookup; section is also
                         passed to cls._process_interaction_out_file
    '''
    # First pass: gather every entrez id so the lookup is done in a single call.
    gene_sets = []
    with open(download_file, encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            gene_sets.extend([row['GeneA'], row['GeneB']])

    ens_look_up = Gene._entrez_ensembl_lookup(gene_sets, section, config)

    # Second pass: write the converted pairs. The with-statement closes the
    # output file even if a row raises part-way through.
    mapped_counter = 0
    unmapped_ids = []
    with open(download_file, encoding='utf-8') as csvfile, \
            open(stage_output_file, 'w') as stage_output_file_handler:
        stage_output_file_handler.write('interactorA' + '\t' + 'interactorB\n')
        reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            interactor_a = row['GeneA']
            interactor_b = row['GeneB']
            if interactor_a in ens_look_up and interactor_b in ens_look_up:
                line = ens_look_up[interactor_a] + '\t' + ens_look_up[interactor_b] + '\n'
                stage_output_file_handler.write(line)
                mapped_counter += 1
            else:
                # Both ids of the pair are recorded, even if only one of them failed to map.
                unmapped_ids.append(interactor_a)
                unmapped_ids.append(interactor_b)

    logger.debug("\n".join(unmapped_ids))
    logger.debug("Mapped {} Unmapped {} " . format(mapped_counter, len(unmapped_ids)))
    cls._process_interaction_out_file(stage_output_file, section, False)
def gene_mgi_parse(cls, *args, **kwargs):
    ''' Open the download file as text and hand it to Gene.gene_mgi_parse.

    The index to work against is read from kwargs['section']['index'].
    '''
    mgi_path = cls._get_download_file(*args, **kwargs)
    with open(mgi_path, 'rt') as mgi_handle:
        target_index = kwargs['section']['index']
        Gene.gene_mgi_parse(mgi_handle, target_index)
def gene_history_parse(cls, *args, **kwargs):
    ''' Hand the gzipped gene_history download (NCBI) to Gene.gene_history_parse.

    Gene.gene_history_mapping is called first with the section's index and
    index type; the file is then opened in text mode through gzip and parsed
    with the same index and index type.
    '''
    history_path = cls._get_download_file(*args, **kwargs)
    section = kwargs['section']
    target_index = section['index']
    target_type = section['index_type']
    Gene.gene_history_mapping(target_index, target_type)
    with gzip.open(history_path, 'rt') as history_handle:
        Gene.gene_history_parse(history_handle, target_index, target_type)
def gene_pub_parse(cls, *args, **kwargs):
    ''' Hand the gzipped gene2pubmed download (NCBI) to Gene.gene_pub_parse.

    The file is opened in text mode through gzip and parsed against the
    index named in kwargs['section']['index'].
    '''
    pubs_path = cls._get_download_file(*args, **kwargs)
    with gzip.open(pubs_path, 'rt') as pubs_handle:
        target_index = kwargs['section']['index']
        Gene.gene_pub_parse(pubs_handle, target_index)
def gene2ensembl_parse(cls, *args, **kwargs):
    ''' Hand the gzipped gene2ensembl download (NCBI) to Gene.gene2ensembl_parse.

    The file is opened in text mode through gzip and parsed with the index
    and index type found in kwargs['section'].
    '''
    mapping_path = cls._get_download_file(*args, **kwargs)
    with gzip.open(mapping_path, 'rt') as mapping_handle:
        section = kwargs['section']
        Gene.gene2ensembl_parse(mapping_handle, section['index'], section['index_type'])
def check_bioplex_data(self, child_doc, parent_doc):
    ''' Cross-check the bioplex interactors of child_doc against the bioplex source file.

    Get all interactors, collect the ensembl ids, convert them to entrez ids.
    Fetch the source file from bioplex and search for the parent entrez id.
    Compare whether the interactor count is the same between the two sets.
    If there is a difference, check whether the missing entrez ids are
    reported by gene_history as discontinued.

    child_doc  -- document carrying 'interaction_source' and 'interactors'
    parent_doc -- document whose doc_id() must equal child_doc.parent() and
                  whose dbxrefs hold the parent entrez id
    '''
    config = IniParser().read_ini("download.ini")
    self.assertEqual(getattr(child_doc, "interaction_source"), 'bioplex', 'interaction_source is bioplex')

    # Get interactors
    interactors = getattr(child_doc, 'interactors')

    # Get ensembl ids
    ensembl_ids_interactors = [interactor['interactor'] for interactor in interactors]

    # Do an ensembl to entrez id lookup
    section = config["ENSEMBL_GENE"]
    ensembl_entrez_dict = Gene._ensembl_entrez_lookup(ensembl_ids_interactors, section)

    # Only the entrez ids (the dict values) are kept; the set removes duplicates.
    entrez_list_pydgin = set()
    for ensembl_id, entrez_id in ensembl_entrez_dict.items():  # @UnusedVariable
        entrez_list_pydgin.add(entrez_id)

    number_of_interactors_pydgin = len(interactors)

    parent_id = parent_doc.doc_id()
    self.assertEqual(parent_id, child_doc.parent(), 'Parent id ok')
    parent_entrez = getattr(parent_doc, "dbxrefs")["entrez"]

    # Download bioplex file from source and search for the parent entrez id interactors
    section_bioplex = config["BIOPLEX"]
    file_url = section_bioplex['location'] + section_bioplex['files']
    status = HTTPDownload.download(file_url, '/tmp', 'bioplex.tmp')

    # Word boundaries keep the parent id from matching inside a longer number.
    my_regex = r"\b" + re.escape(parent_entrez) + r"\b"
    interactor_counter = 0

    # NOTE(review): the nesting below is reconstructed from a whitespace-collapsed
    # source. The comparison is assumed to sit inside `if status:` because
    # entrez_list_bioplex is only bound there - confirm against the original file.
    if status:
        entrez_list_bioplex = set()
        with open('/tmp/bioplex.tmp', "r") as data:
            for line in data:
                if re.search(my_regex, line):
                    # The first two whitespace-separated fields are compared with
                    # the parent id; whichever differs is recorded as a partner.
                    tmp_list = line.split()
                    if tmp_list[0] != parent_entrez:
                        entrez_list_bioplex.add(tmp_list[0])
                    if tmp_list[1] != parent_entrez:
                        entrez_list_bioplex.add(tmp_list[1])
                    interactor_counter += 1

        if(len(entrez_list_pydgin) == len(entrez_list_bioplex)):
            self.assertEqual(number_of_interactors_pydgin, interactor_counter,
                             "Interactor count is correct " + str(number_of_interactors_pydgin))
        else:
            # Find the missing ones: subtract the smaller set from the larger one.
            diff = set()
            if(len(entrez_list_pydgin) > len(entrez_list_bioplex)):
                diff = entrez_list_pydgin - entrez_list_bioplex
            else:
                diff = entrez_list_bioplex - entrez_list_pydgin

            # Now check if these ids exist in gene history: every missing id
            # is expected to come back as discontinued.
            (newgene_ids, discontinued_ids) = Gene._check_gene_history(list(diff), config)  # @UnusedVariable
            self.assertEqual(len(diff), len(discontinued_ids),
                             "The missing ids where found in gene_history as discontinued ids")
def test_gene_pipeline(self):
    """ Test gene pipeline.

    Runs the pipeline sections one after another against the same index and
    checks after each step that the expected fields appear on the gene
    documents. The steps build on each other, so their order matters.
    """
    ini_config = IniParser().read_ini(MY_INI_FILE)
    idx = ini_config["ENSEMBL_GENE_GTF"]["index"]
    idx_type = ini_config["ENSEMBL_GENE_GTF"]["index_type"]

    # 1. Test ensembl GTF loading.
    call_command(
        "pipeline", "--steps", "stage", "load", sections="ENSEMBL_GENE_GTF", dir=TEST_DATA_DIR, ini=MY_INI_FILE
    )
    Search.index_refresh(idx)
    elastic = Search(idx=idx, idx_type=idx_type)
    self.assertGreaterEqual(elastic.get_count()["count"], 1, "Count documents in the index")
    map1_props = Gene.gene_mapping(idx, idx_type, test_mode=True).mapping_properties
    map2_props = elastic.get_mapping()
    if idx not in map2_props:
        # Log the full response first; the lookup below will then fail with a KeyError.
        logger.error("MAPPING ERROR: " + json.dumps(map2_props))
    self._cmpMappings(map2_props[idx]["mappings"], map1_props, idx_type)

    # 2. Test adding entrez ID to documents
    call_command("pipeline", "--steps", "load", sections="GENE2ENSEMBL", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)
    query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    self.assertEqual(len(docs), 1)
    self.assertIn("entrez", getattr(docs[0], "dbxrefs"))
    self.assertEqual(getattr(docs[0], "dbxrefs")["entrez"], "26191")

    # 3. Add uniprot and fill in missing entrez fields.
    call_command(
        "pipeline", "--steps", "download", "load", sections="ENSMART_GENE", dir=TEST_DATA_DIR, ini=MY_INI_FILE
    )
    Search.index_refresh(idx)
    query = ElasticQuery.query_string("DNMT3L", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    self.assertIn("entrez", getattr(docs[0], "dbxrefs"))
    self.assertIn("swissprot", getattr(docs[0], "dbxrefs"))

    # 4. Add gene synonyms and dbxrefs.
    call_command("pipeline", "--steps", "load", sections="GENE_INFO", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)
    query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    self.assertIn("PTPN8", getattr(docs[0], "synonyms"))

    # 5. Add PMIDs to gene docs.
    call_command("pipeline", "--steps", "load", sections="GENE_PUBS", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)
    query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    self.assertGreater(len(getattr(docs[0], "pmids")), 0)

    # 6. Add ortholog data.
    call_command("pipeline", "--steps", "load", sections="ENSMART_HOMOLOG", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)
    query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    dbxrefs = getattr(docs[0], "dbxrefs")
    self.assertIn("orthologs", dbxrefs, dbxrefs)
    self.assertIn("mmusculus", dbxrefs["orthologs"], dbxrefs)
    self.assertEqual("ENSMUSG00000027843", dbxrefs["orthologs"]["mmusculus"]["ensembl"])

    query = ElasticQuery.filtered(
        Query.match_all(),
        TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl", ["ENSMUSG00000027843"]),
    )
    docs = Search(query, idx=idx, size=1).search().docs
    self.assertEqual(len(docs), 1)

    # 7. Add mouse ortholog link to MGI
    call_command("pipeline", "--steps", "load", sections="ENSEMBL2MGI", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)
    # Re-run the ortholog terms-filter query from step 6 against the refreshed index.
    docs = Search(query, idx=idx, size=1).search().docs
    dbxrefs = getattr(docs[0], "dbxrefs")
    self.assertEqual("ENSMUSG00000027843", dbxrefs["orthologs"]["mmusculus"]["ensembl"])
    self.assertEqual("107170", dbxrefs["orthologs"]["mmusculus"]["MGI"])