'1134_not_Cho.txt', 'cho_not_1134.txt', ] references = dict() for file in filenames: print '****************** %s ********************' % file references[os.path.splitext(file)[0]] = set(yield_refs(file)) # get all the HS unigenes from all the files cell_cycle_hs_unigene = list(chain(*references.values())) # translate them to ensembl HS genes using the Synergizer translated = S.translate('ensembl', 'H**o sapiens', 'unigene', 'ensembl_gene_id', cell_cycle_hs_unigene) print 'Ensembl translated: %d' % S.how_many_have_translations(translated) ensembl_hs_genes = S.get_translations(translated) def yield_mouse_orthologs(hs_genes): # map into mouse orthologs using biomart query = B.new_query() dataset = B.add_dataset(query, 'hsapiens_gene_ensembl') B.add_attribute(dataset, 'ensembl_gene_id') B.add_attribute(dataset, 'mouse_ensembl_gene') filter = B.add_filter(dataset, name='ensembl_gene_id', value='') filter.set('value', ','.join(ensembl_hs_genes)) for chunk in B.split_big_list(ensembl_hs_genes, 50): #logging.info('Querying Ensembl biomart for chunk of %d genes', len(chunk)) filter.set('value', ','.join(chunk)) for row in B.yield_csv_query_results(query):
'cho_not_1134.txt', ] references = dict() for file in filenames: print '****************** %s ********************' % file references[os.path.splitext(file)[0]] = set(yield_refs(file)) # get all the HS unigenes from all the files cell_cycle_hs_unigene = list(chain(*references.values())) # translate them to ensembl HS genes using the Synergizer translated = S.translate('ensembl', 'H**o sapiens', 'unigene', 'ensembl_gene_id', cell_cycle_hs_unigene) print 'Ensembl translated: %d' % S.how_many_have_translations(translated) ensembl_hs_genes = S.get_translations(translated) def yield_mouse_orthologs(hs_genes): # map into mouse orthologs using biomart query = B.new_query() dataset = B.add_dataset(query, 'hsapiens_gene_ensembl') B.add_attribute(dataset, 'ensembl_gene_id') B.add_attribute(dataset, 'mouse_ensembl_gene') filter = B.add_filter(dataset, name='ensembl_gene_id', value='') filter.set('value', ','.join(ensembl_hs_genes)) for chunk in B.split_big_list(ensembl_hs_genes, 50): #logging.info('Querying Ensembl biomart for chunk of %d genes', len(chunk)) filter.set('value', ','.join(chunk)) for row in B.yield_csv_query_results(query): if row[1]:
"PAX5", "SP1", ] """ Haematopoietic TFBS """ liver_tfs = ["HNF1a", "Foxe3", "HNF4a", "CEBPa"] # HNF1 # HNF3 # HNF4 # CEBP """ Liver study TFBS (based on work by Krivan and Wasserman, 2001) """ muscle_tfs = ["MEF2c", "SP1", "SRF", "MyoD1", "TEF"] # MEF2 # EBOX (MyoD) """ Muscle study TFBS (based on work by Wasserman and Fickett, 1998) """ tf_sets = {"Haematopoietic": haematopoietic_tfs, "Liver": liver_tfs, "Muscle": muscle_tfs} import biopsy.identifiers.synergizer as S import biopsy.identifiers.biomart as B for tag, tfs in tf_sets.iteritems(): translated = S.translate("ensembl", "Mus musculus", "mgi_symbol", "ensembl_gene_id", tfs) print translated print "Ensembl translated: %d/%d" % (S.how_many_have_translations(translated), len(tfs)) ensembl_genes = S.get_translations(translated) open("%s-ensembl.txt" % tag, "w").write("\n".join(ensembl_genes))
""" muscle_tfs = [ 'MEF2c', # MEF2 'SP1', 'SRF', 'MyoD1', # EBOX (MyoD) 'TEF', ] """ Muscle study TFBS (based on work by Wasserman and Fickett, 1998) """ tf_sets = { 'Haematopoietic': haematopoietic_tfs, 'Liver': liver_tfs, 'Muscle': muscle_tfs } import biopsy.identifiers.synergizer as S import biopsy.identifiers.biomart as B for tag, tfs in tf_sets.iteritems(): translated = S.translate('ensembl', 'Mus musculus', 'mgi_symbol', 'ensembl_gene_id', tfs) print translated print 'Ensembl translated: %d/%d' % ( S.how_many_have_translations(translated), len(tfs)) ensembl_genes = S.get_translations(translated) open('%s-ensembl.txt' % tag, 'w').write('\n'.join(ensembl_genes))
""" Targets of liver specific transcription factors in Wasserman's predictive model for liver paper """

from utils import *
from itertools import imap
import biopsy.identifiers.synergizer as S

# Gene symbols of the liver target genes to be translated.
liver_targets = [
    "G6PC",
    "IGF1",
    "PAH",
    "IGFBP1",
    "CFB",
    "FABP2",
    "GUCA2B",
    "HOXA4",
    "SLC34A1",
]
# BF
# NOTE(review): the "BF" remark above was found trailing the list; which entry
# it annotated is not recoverable from here -- confirm.

# Translate the symbols from the "mgi_symbol" namespace to "ensembl_gene_id"
# for "Mus musculus" through the Synergizer wrapper.
translated = S.translate("ensembl", "Mus musculus", "mgi_symbol", "ensembl_gene_id", liver_targets)

# Single-argument print(...) behaves the same as the print statement on
# Python 2 and is valid syntax on Python 3.
print(translated)
print("Ensembl translated: %d/%d" % (S.how_many_have_translations(translated), len(liver_targets)))

# Write the translated ids joined by newlines (no trailing newline). The
# context manager closes the file deterministically instead of relying on
# garbage collection of the anonymous file object.
ensembl_genes = S.get_translations(translated)
with open("liver-targets.txt", "w") as out:
    out.write("\n".join(ensembl_genes))

# Literal table of gene symbol -> Ensembl gene id (ENSMUSG...) for the same
# nine symbols; no statement visible in this block reads it.
liver_ensembl_targets = {
    "G6PC": "ENSMUSG00000078650",
    "IGF1": "ENSMUSG00000020053",
    "PAH": "ENSMUSG00000020051",
    "IGFBP1": "ENSMUSG00000020429",
    "CFB": "ENSMUSG00000024371",
    "FABP2": "ENSMUSG00000023057",
    "GUCA2B": "ENSMUSG00000032978",
    "HOXA4": "ENSMUSG00000000942",
    "SLC34A1": "ENSMUSG00000021490",
}