# Reference list files to load; each file's base name (extension removed)
# becomes a key in `references`.
filenames = [
    '1134_and_Cho.txt',
    '1134_not_Cho.txt',
    'cho_not_1134.txt',
]

# Base name -> set of whatever yield_refs() produces for that file.
references = dict()
for file in filenames:
    print('****************** %s ********************' % file)
    references[os.path.splitext(file)[0]] = set(yield_refs(file))

# get all the HS unigenes from all the files
cell_cycle_hs_unigene = list(chain(*references.values()))

# translate them to ensembl HS genes using the Synergizer
translated = S.translate(
    'ensembl', 'Homo sapiens', 'unigene', 'ensembl_gene_id',
    cell_cycle_hs_unigene)
print('Ensembl translated: %d' % S.how_many_have_translations(translated))
ensembl_hs_genes = S.get_translations(translated)


def yield_mouse_orthologs(hs_genes):
    # NOTE(review): the statements below read the module-level
    # `ensembl_hs_genes` rather than the `hs_genes` argument -- confirm that
    # this is intended.
    # map into mouse orthologs using biomart
    query = B.new_query()
    dataset = B.add_dataset(query, 'hsapiens_gene_ensembl')
    B.add_attribute(dataset, 'ensembl_gene_id')
    B.add_attribute(dataset, 'mouse_ensembl_gene')
    filter = B.add_filter(dataset, name='ensembl_gene_id', value='')
    filter.set('value', ','.join(ensembl_hs_genes))
    # The filter value is overwritten per chunk of at most 50 gene ids.
    for chunk in B.split_big_list(ensembl_hs_genes, 50):
        #logging.info('Querying Ensembl biomart for chunk of %d genes', len(chunk))
        filter.set('value', ','.join(chunk))
'1134_not_Cho.txt', 'cho_not_1134.txt', ] references = dict() for file in filenames: print '****************** %s ********************' % file references[os.path.splitext(file)[0]] = set(yield_refs(file)) # get all the HS unigenes from all the files cell_cycle_hs_unigene = list(chain(*references.values())) # translate them to ensembl HS genes using the Synergizer translated = S.translate('ensembl', 'H**o sapiens', 'unigene', 'ensembl_gene_id', cell_cycle_hs_unigene) print 'Ensembl translated: %d' % S.how_many_have_translations(translated) ensembl_hs_genes = S.get_translations(translated) def yield_mouse_orthologs(hs_genes): # map into mouse orthologs using biomart query = B.new_query() dataset = B.add_dataset(query, 'hsapiens_gene_ensembl') B.add_attribute(dataset, 'ensembl_gene_id') B.add_attribute(dataset, 'mouse_ensembl_gene') filter = B.add_filter(dataset, name='ensembl_gene_id', value='') filter.set('value', ','.join(ensembl_hs_genes)) for chunk in B.split_big_list(ensembl_hs_genes, 50): #logging.info('Querying Ensembl biomart for chunk of %d genes', len(chunk)) filter.set('value', ','.join(chunk)) for row in B.yield_csv_query_results(query):
""" muscle_tfs = [ 'MEF2c', # MEF2 'SP1', 'SRF', 'MyoD1', # EBOX (MyoD) 'TEF', ] """ Muscle study TFBS (based on work by Wasserman and Fickett, 1998) """ tf_sets = { 'Haematopoietic': haematopoietic_tfs, 'Liver': liver_tfs, 'Muscle': muscle_tfs } import biopsy.identifiers.synergizer as S import biopsy.identifiers.biomart as B for tag, tfs in tf_sets.iteritems(): translated = S.translate('ensembl', 'Mus musculus', 'mgi_symbol', 'ensembl_gene_id', tfs) print translated print 'Ensembl translated: %d/%d' % ( S.how_many_have_translations(translated), len(tfs)) ensembl_genes = S.get_translations(translated) open('%s-ensembl.txt' % tag, 'w').write('\n'.join(ensembl_genes))
"PAX5", "SP1", ] """ Haematopoietic TFBS """ liver_tfs = ["HNF1a", "Foxe3", "HNF4a", "CEBPa"] # HNF1 # HNF3 # HNF4 # CEBP """ Liver study TFBS (based on work by Krivan and Wasserman, 2001) """ muscle_tfs = ["MEF2c", "SP1", "SRF", "MyoD1", "TEF"] # MEF2 # EBOX (MyoD) """ Muscle study TFBS (based on work by Wasserman and Fickett, 1998) """ tf_sets = {"Haematopoietic": haematopoietic_tfs, "Liver": liver_tfs, "Muscle": muscle_tfs} import biopsy.identifiers.synergizer as S import biopsy.identifiers.biomart as B for tag, tfs in tf_sets.iteritems(): translated = S.translate("ensembl", "Mus musculus", "mgi_symbol", "ensembl_gene_id", tfs) print translated print "Ensembl translated: %d/%d" % (S.how_many_have_translations(translated), len(tfs)) ensembl_genes = S.get_translations(translated) open("%s-ensembl.txt" % tag, "w").write("\n".join(ensembl_genes))
# Copyright John Reid 2009 # """ Targets of liver specific transcription factors in Wasserman's predictive model for liver paper """ from utils import * from itertools import imap import biopsy.identifiers.synergizer as S liver_targets = ["G6PC", "IGF1", "PAH", "IGFBP1", "CFB", "FABP2", "GUCA2B", "HOXA4", "SLC34A1"] # BF translated = S.translate("ensembl", "Mus musculus", "mgi_symbol", "ensembl_gene_id", liver_targets) print translated print "Ensembl translated: %d/%d" % (S.how_many_have_translations(translated), len(liver_targets)) ensembl_genes = S.get_translations(translated) open("liver-targets.txt", "w").write("\n".join(ensembl_genes)) liver_ensembl_targets = { "G6PC": "ENSMUSG00000078650", "IGF1": "ENSMUSG00000020053", "PAH": "ENSMUSG00000020051", "IGFBP1": "ENSMUSG00000020429", "CFB": "ENSMUSG00000024371", "FABP2": "ENSMUSG00000023057", "GUCA2B": "ENSMUSG00000032978", "HOXA4": "ENSMUSG00000000942",
from itertools import imap import biopsy.identifiers.synergizer as S liver_targets = [ 'G6PC', 'IGF1', 'PAH', 'IGFBP1', 'CFB', # BF 'FABP2', 'GUCA2B', 'HOXA4', 'SLC34A1' ] translated = S.translate('ensembl', 'Mus musculus', 'mgi_symbol', 'ensembl_gene_id', liver_targets) print translated print 'Ensembl translated: %d/%d' % (S.how_many_have_translations(translated), len(liver_targets)) ensembl_genes = S.get_translations(translated) open('liver-targets.txt', 'w').write('\n'.join(ensembl_genes)) liver_ensembl_targets = { 'G6PC': 'ENSMUSG00000078650', 'IGF1': 'ENSMUSG00000020053', 'PAH': 'ENSMUSG00000020051', 'IGFBP1': 'ENSMUSG00000020429', 'CFB': 'ENSMUSG00000024371', 'FABP2': 'ENSMUSG00000023057', 'GUCA2B': 'ENSMUSG00000032978', 'HOXA4': 'ENSMUSG00000000942',