def load_mouse_ortho_data(
    filename = os.path.join(biopsy.get_data_dir(), 'TreeFam', 'orthologs', 'MOUSE_ORTHO.tsv')
):
    """
    Read the tab-separated mouse orthologue file and hand the database
    references generated from it to one_to_many().

    @param filename: Path of the TSV file. Defaults to MOUSE_ORTHO.tsv in the
        TreeFam/orthologs directory below the biopsy data directory.
    @return: Whatever one_to_many() builds from the references produced by
        generate_db_refs_for_mouse_ortho().
    """
    # Binary mode is what the csv module expects under Python 2.
    # NOTE(review): the handle is never closed explicitly; a `with` block is
    # only safe here if one_to_many() consumes its input eagerly - confirm.
    tsv_handle = open(filename, "rb")
    tsv_rows = csv.reader(tsv_handle, delimiter='\t')
    db_refs = generate_db_refs_for_mouse_ortho(tsv_rows)
    return one_to_many(db_refs)
def to_mouse_map(ortho_filename = os.path.join(biopsy.get_data_dir(), 'TreeFam', 'orthologs', 'MOUSE_ORTHO.tsv')):
    """
    Build a dict from the tab-separated orthologue file that maps the parsed
    second column of each row to the parsed first column.

    Rows with fewer than two fields are skipped. When the same key occurs in
    several rows the last row wins.

    @param ortho_filename: Path of the TSV file. Defaults to MOUSE_ORTHO.tsv
        in the TreeFam/orthologs directory below the biopsy data directory.
    @return: dict of biopsy.DbRef.try_to_parse(row[1]) ->
        biopsy.DbRef.try_to_parse(row[0]).
    """
    parse = biopsy.DbRef.try_to_parse
    # dict() consumes the reader completely before the with-block exits, so
    # the file can be closed deterministically instead of being leaked.
    with open(ortho_filename) as ortho_file:
        return dict(
            (parse(row[1]), parse(row[0]))
            for row in csv.reader(ortho_file, delimiter = '\t')
            if len(row) > 1
        )
mapper[T.db.entrez_gene] = entrez_gene_mapper mapper[T.db.swissprot] = uniprot_mapper return map, mapper def build_map(): map, mapper = default_map_and_mapper() for m in T.Matrix.all(): mapper(m.acc.as_db_ref()) for s in T.Site.all(): if s.id.factor != 'CONS': continue mapper(s.acc.as_db_ref()) return map identifier_map = PersistedCache(build_map, os.path.join(biopsy.get_data_dir(), 'identifiers', 'identifier_map.pickle')) def small_test_map(): 'Return a small map for testing purposes.' map, mapper = default_map_and_mapper() mapper(T.DbRef.parse_as('71431', T.db.entrez_gene)) return map def matrices_that_map_to(map, db): """ Return a set of those matrices that map to the at least one entry in the given database type """ matrices = set() for m in T.Matrix.all():
mouse_genes.add(r) return mouse_genes def write_ensembl_mouse_genes_file(filename): """ Writes all the ensembl mouse gene references in transfac gene table to a file for use at http://www.informatics.jax.org/javawi2/servlet/WIFetch?page=batchQF. """ f = open(filename, "w") for g in ensembl_mouse_genes_in_transfac(): f.write(str(g)) f.write('\n') f.close() accession_map_filename = os.path.join(biopsy.get_data_dir(), 'identifiers', 'mgi', 'MRK_Dump1.rpt') def parse_accession_map(): """ Parse the MGI accession map flat file and yield tuples (mgi identifier ref, mgi accession) """ for id, acc in csv.reader(open(accession_map_filename, 'rb'), delimiter='\t'): yield (T.DbRef.parse_as(id, T.db.mgi), acc) def build_acc_2_id_map(): """ Returns a dict mapping MGI accessions to ids. """ return dict((acc, id) for id, acc in parse_accession_map())
def get_kegg_pathways():
    """
    Read every file in the KEGG/pathways directory below the biopsy data
    directory.

    @return: A list of (file name, set of stripped lines) tuples, one per
        directory entry, in the order os.listdir() reports them.
    """
    pathway_dir = os.path.join(biopsy.get_data_dir(), 'KEGG', 'pathways')
    pathways = []
    for name in os.listdir(pathway_dir):
        # Close each pathway file as soon as it has been read instead of
        # leaking one handle per file.
        with open(os.path.join(pathway_dir, name)) as pathway_file:
            members = set(line.strip() for line in pathway_file)
        pathways.append((name, members))
    return pathways
def get_data_dir():
    """
    @return: The path of the 'SymAtlas' sub-directory of the biopsy data
        directory.
    """
    biopsy_root = biopsy.get_data_dir()
    return os.path.join(biopsy_root, 'SymAtlas')
Query Entrez to get a map from its protein accessions to ids and xrefs """ result = ProteinMap( acc_2_id = cookbook.DictOfSets(), xrefs = cookbook.DictOfSets() ) for acc, id, refs in refs_for_mouse_protein_accs(): result.acc_2_id[acc].add(id.acc) for ref in refs: result.xrefs[id.acc].add(ref) return result _proteins_pickle_file = os.path.join(biopsy.get_data_dir(), 'identifiers', 'entrez', 'proteins.pickle') proteins = lazy.PersistableLazyInitialiser(get_protein_map, _proteins_pickle_file) def write_mouse_protein_ids(filename): from Bio.EUtils import HistoryClient f = open(filename, 'w') results = HistoryClient.HistoryClient().search(db='protein', term='mouse[orgn]') for id in results.dbids.ids: f.write(id) f.write('\n') f.close()
self.marginal_prior, self.joint_prior, self.mi_threshold ) def sequences_from_jaspar_file(jaspar_file): length = None for l in jaspar_file: if l.startswith('>'): continue if not l.strip(): continue site = ''.join( [ c for c in l if c.isupper() ] ) if length == None: length = len(site) elif len(site) != length: continue yield site _jaspar_dir = os.path.join(biopsy.get_data_dir(), 'Jaspar', 'JASPAR_CORE') _jaspar_phylofacts_dir = os.path.join(biopsy.get_data_dir(), 'Jaspar', 'JASPAR_PHYLOFACTS') def jaspar_sequences(dir = _jaspar_dir): import os for filename in os.listdir(dir): f = open(os.path.join(dir, filename), 'r') yield ( filename.split('.')[0], [ s for s in sequences_from_jaspar_file(f) ] ) def jaspar_phylofacts_sequences(): for x in jaspar_sequences(_jaspar_phylofacts_dir): yield x if '__main__' == __name__:
def get_data_dir():
    """
    @return: The path of the 'biobase/transpro' sub-directory of the biopsy
        data directory.
    """
    biopsy_root = biopsy.get_data_dir()
    return os.path.join(biopsy_root, 'biobase', 'transpro')
def get_programs_dir(): "@return: The directory to put program specific info into." #ensure_dir_exists(_programs_dir) return _programs_dir _summaries_dir = os.path.join(options.output_dir, 'summaries') "the directory to put DPM summaries into" def get_summaries_dir(): "@return: The directory to put DPM summaries into." #ensure_dir_exists(_summaries_dir) return _summaries_dir _site_dpm_data_dir = os.path.join(biopsy.get_data_dir(), 'site-dpm') "The directory where results that are reused across runs are cached." def get_site_dpm_data_dir(): "@return: The directory where results that are reused across runs are cached." #ensure_dir_exists(_site_dpm_data_dir) return _site_dpm_data_dir try: import pylab except: import warnings warnings.warn('Could not set matplotlib figure size') print sys.exc_info()
# # Copyright John Reid 2006-2010 # import gzip, os.path, re, biopsy _base_data_dir = os.path.join(biopsy.get_data_dir(), 'ensembl', 'genomes') def get_genome_dir(genome): return os.path.join(_base_data_dir, genome) def get_chromosome_file(genome, chromosome): """Returns an open file handle for the chromosome file in the given genome""" genome_dir = get_genome_dir(genome) files = os.listdir(genome_dir) chr_re = re.compile('\.dna\.chromosome\.%s$' % (str(chromosome))) matched_files = [f for f in files if chr_re.search(f)] if 0 == len(matched_files): raise RuntimeError('Did not find chromosome "%s" in directory: %s' % (str(chromosome), genome_dir)) if len(matched_files) > 1: raise RuntimeError('Expecting only one match in genome directory: %s' % genome_dir) filename = os.path.join(genome_dir, matched_files[0]) print 'File: %s' % filename return open(filename, 'r') def get_chromosome_sequence(chromosome_file, offset, length):
# Copyright John Reid 2007 # """ Code to parse UniProt data. www.ensembl.org/ """ import gzip, sys, cookbook, biopsy from . import lazy T = biopsy.transfac _uniprot_file = os.path.join(biopsy.get_data_dir(), "UniProt", "uniprot_sprot.dat.gz") def data(): """ Returns a handle to the uniprot data. """ return gzip.open(_uniprot_file) def yield_records(handle): """ Takes a file like handle and separates it into records. """ record = [] for line in handle:
#
# Copyright John Reid 2006-2010
#

import gzip, os.path, re, biopsy

_base_data_dir = os.path.join(biopsy.get_data_dir(), 'ensembl', 'genomes')


def get_genome_dir(genome):
    """
    @param genome: Name of the genome.
    @return: The path of the genome's sub-directory in the ensembl genomes
        data directory.
    """
    return os.path.join(_base_data_dir, genome)


def get_chromosome_file(genome, chromosome):
    """
    Returns an open file handle for the chromosome file in the given genome.

    The genome directory is searched for file names that end in
    '.dna.chromosome.<chromosome>'; exactly one file must match. The path of
    the matching file is printed before it is opened.

    @param genome: Name of the genome (see get_genome_dir).
    @param chromosome: Chromosome identifier; converted with str() and
        matched literally.
    @return: The matching file opened with mode 'r'. Closing it is left to
        the caller.
    @raise RuntimeError: If no file, or more than one file, matches.
    """
    genome_dir = get_genome_dir(genome)
    files = os.listdir(genome_dir)
    # Raw string so the regex escapes are not string escapes; re.escape so
    # characters such as '.' in the chromosome name are matched literally
    # instead of acting as wildcards.
    chr_re = re.compile(r'\.dna\.chromosome\.%s$' % re.escape(str(chromosome)))
    matched_files = [f for f in files if chr_re.search(f)]
    if not matched_files:
        raise RuntimeError('Did not find chromosome "%s" in directory: %s' % (str(chromosome), genome_dir))
    if len(matched_files) > 1:
        raise RuntimeError('Expecting only one match in genome directory: %s' % genome_dir)
    filename = os.path.join(genome_dir, matched_files[0])
    print('File: %s' % filename)
    return open(filename, 'r')