Ejemplo n.º 1
0
def load_mouse_ortho_data(
        filename = os.path.join(biopsy.get_data_dir(), 'TreeFam', 'orthologs', 'MOUSE_ORTHO.tsv')
):
    """
    Read the tab-separated mouse orthologs file and return the result of
    passing its rows through generate_db_refs_for_mouse_ortho and then
    one_to_many.

    @param filename: Path of the file to read. Defaults to
        TreeFam/orthologs/MOUSE_ORTHO.tsv under the biopsy data directory.
    @return: Whatever one_to_many returns for the generated references.
    """
    # NOTE(review): the handle is never closed explicitly. Whether it could be
    # closed before returning depends on one_to_many consuming its input
    # eagerly - confirm before wrapping this in a 'with' block.
    ortho_file = open(filename, "rb")
    rows = csv.reader(ortho_file, delimiter='\t')
    db_refs = generate_db_refs_for_mouse_ortho(rows)
    return one_to_many(db_refs)
Ejemplo n.º 2
0
def to_mouse_map(ortho_filename = os.path.join(biopsy.get_data_dir(), 'TreeFam', 'orthologs', 'MOUSE_ORTHO.tsv')):
    """
    Build a dict from the tab-separated orthologs file.

    For every row with at least two columns, the key is
    biopsy.DbRef.try_to_parse applied to the second column and the value is
    biopsy.DbRef.try_to_parse applied to the first column. Shorter rows are
    ignored. When a key occurs more than once the last row wins.

    @param ortho_filename: Path of the file to read. Defaults to
        TreeFam/orthologs/MOUSE_ORTHO.tsv under the biopsy data directory.
    @return: The dict described above.
    """
    # dict() consumes the generator completely before the 'with' block exits,
    # so the file can be closed deterministically here.
    with open(ortho_filename) as ortho_file:
        return dict(
                (biopsy.DbRef.try_to_parse(row[1]), biopsy.DbRef.try_to_parse(row[0]))
                for row
                in csv.reader(ortho_file, delimiter = '\t')
                if len(row) > 1
        )
Ejemplo n.º 3
0
    mapper[T.db.entrez_gene] = entrez_gene_mapper
    mapper[T.db.swissprot] = uniprot_mapper

    return map, mapper

def build_map():
    """
    Create the default map and mapper, feed the mapper the accession reference
    of every matrix and of every site whose id factor is 'CONS', then return
    the map.
    """
    identifiers, add_ref = default_map_and_mapper()
    for matrix in T.Matrix.all():
        add_ref(matrix.acc.as_db_ref())
    for site in T.Site.all():
        # Only consensus ('CONS') sites are passed to the mapper.
        if site.id.factor == 'CONS':
            add_ref(site.acc.as_db_ref())
    return identifiers

# Module-level cache object constructed from build_map and the path of a pickle
# file under the biopsy data directory. Presumably PersistedCache stores
# build_map's result in that file - confirm against PersistedCache.
identifier_map = PersistedCache(build_map, os.path.join(biopsy.get_data_dir(), 'identifiers', 'identifier_map.pickle'))

def small_test_map():
    """
    Return a small map for testing purposes.

    The map is the default one after the mapper has been applied to the single
    Entrez gene reference '71431'.
    """
    test_map, add_ref = default_map_and_mapper()
    gene_ref = T.DbRef.parse_as('71431', T.db.entrez_gene)
    add_ref(gene_ref)
    return test_map


def matrices_that_map_to(map, db):
    """
    Return a set of those matrices that map to the at least one entry in the
    given database type
    """
    matrices = set()
    for m in T.Matrix.all():
Ejemplo n.º 4
0
                mouse_genes.add(r)
    return mouse_genes

def write_ensembl_mouse_genes_file(filename):
    """
    Writes all the ensembl mouse gene references in transfac gene table to a
    file for use at
    http://www.informatics.jax.org/javawi2/servlet/WIFetch?page=batchQF.

    Each reference yielded by ensembl_mouse_genes_in_transfac is written as
    str(reference) on its own line.

    @param filename: Path of the file to (over)write.
    """
    # The 'with' block guarantees the file is closed even if producing or
    # writing a reference raises.
    with open(filename, "w") as f:
        for g in ensembl_mouse_genes_in_transfac():
            f.write(str(g))
            f.write('\n')

# Path of the MGI flat file (identifiers/mgi/MRK_Dump1.rpt under the biopsy data
# directory) that parse_accession_map reads.
accession_map_filename = os.path.join(biopsy.get_data_dir(), 'identifiers', 'mgi', 'MRK_Dump1.rpt')

def parse_accession_map():
    """
    Parse the MGI accession map flat file and yield tuples

    (mgi identifier ref, mgi accession)

    The file named by accession_map_filename is read as tab-separated values.
    Every row is unpacked into exactly two fields, so a row with any other
    number of columns raises ValueError. The first field is converted with
    T.DbRef.parse_as(..., T.db.mgi); the second is yielded unchanged.
    """
    # Opening inside a 'with' block closes the file once the generator is
    # exhausted or closed, instead of leaking the handle.
    with open(accession_map_filename, 'rb') as acc_file:
        for mgi_id, acc in csv.reader(acc_file, delimiter='\t'):
            yield (T.DbRef.parse_as(mgi_id, T.db.mgi), acc)

def build_acc_2_id_map():
    """
    Return a dict keyed by MGI accession whose values are the identifier
    references yielded alongside them by parse_accession_map.

    If an accession appears more than once, the last pair wins.
    """
    acc_2_id = {}
    for mgi_ref, accession in parse_accession_map():
        acc_2_id[accession] = mgi_ref
    return acc_2_id
Ejemplo n.º 5
0
def get_kegg_pathways():
    """
    Load every file in the KEGG/pathways sub-directory of the biopsy data
    directory.

    @return: A list of (file name, set of stripped lines) tuples, one per
        directory entry, in the order os.listdir returns them.
    """
    pathway_dir = os.path.join(biopsy.get_data_dir(), 'KEGG', 'pathways')
    pathways = []
    for name in os.listdir(pathway_dir):
        # Read each file inside a 'with' block so its handle is closed before
        # the next one is opened.
        with open(os.path.join(pathway_dir, name)) as pathway_file:
            members = set(line.strip() for line in pathway_file)
        pathways.append((name, members))
    return pathways
Ejemplo n.º 6
0
def get_data_dir():
    """Return the 'SymAtlas' sub-directory of the biopsy data directory."""
    base_dir = biopsy.get_data_dir()
    return os.path.join(base_dir, 'SymAtlas')
Ejemplo n.º 7
0
    Query Entrez to get a map from its protein accessions to ids and xrefs
    """
    result = ProteinMap(
            acc_2_id = cookbook.DictOfSets(),
            xrefs = cookbook.DictOfSets()
    )
    for acc, id, refs in refs_for_mouse_protein_accs():
        result.acc_2_id[acc].add(id.acc)
        for ref in refs:
            result.xrefs[id.acc].add(ref)
    return result




# Path of the pickle file passed to the lazy initialiser below.
_proteins_pickle_file = os.path.join(biopsy.get_data_dir(), 'identifiers', 'entrez', 'proteins.pickle')

# Constructed from get_protein_map and the pickle path. Presumably the map is
# computed on first use and persisted to that file - confirm against
# lazy.PersistableLazyInitialiser.
proteins = lazy.PersistableLazyInitialiser(get_protein_map, _proteins_pickle_file)

def write_mouse_protein_ids(filename):
    """
    Search the 'protein' database for the term 'mouse[orgn]' using
    Bio.EUtils' HistoryClient and write every id in the result, one per line.

    @param filename: Path of the file to (over)write.
    """
    from Bio.EUtils import HistoryClient
    # The 'with' block closes the file even when the search or a write fails.
    with open(filename, 'w') as f:
        results = HistoryClient.HistoryClient().search(db='protein', term='mouse[orgn]')
        for protein_id in results.dbids.ids:
            f.write(protein_id)
            f.write('\n')



Ejemplo n.º 8
0
    self.marginal_prior,
    self.joint_prior,
    self.mi_threshold
        )

def sequences_from_jaspar_file(jaspar_file):
    """
    Yield the site found on each line of a JASPAR sites file.

    Lines starting with '>' and lines that are blank after stripping are
    skipped. The site of a remaining line is the concatenation of its
    upper-case characters. The first site fixes the expected length; later
    sites of any other length are silently dropped.

    @param jaspar_file: An iterable of text lines, e.g. an open file.
    @return: A generator of site strings, all of the same length.
    """
    length = None
    for line in jaspar_file:
        if line.startswith('>') or not line.strip():
            continue
        site = ''.join(c for c in line if c.isupper())
        if length is None:
            # The first site seen determines the length every other must have.
            length = len(site)
        elif len(site) != length:
            continue
        yield site

# Directories under the biopsy data directory holding the JASPAR_CORE and
# JASPAR_PHYLOFACTS files read by jaspar_sequences.
_jaspar_dir = os.path.join(biopsy.get_data_dir(), 'Jaspar', 'JASPAR_CORE')
_jaspar_phylofacts_dir = os.path.join(biopsy.get_data_dir(), 'Jaspar', 'JASPAR_PHYLOFACTS')
def jaspar_sequences(dir = _jaspar_dir):
    """
    Yield the sites stored in every file of a directory.

    @param dir: Directory to scan; every entry is opened as a text file.
        Defaults to the JASPAR_CORE directory.
    @return: A generator of (name, sites) tuples where name is the file name
        up to its first '.' and sites is the list produced by
        sequences_from_jaspar_file for that file.
    """
    import os
    for filename in os.listdir(dir):
        # Read the sites while the file is open and close it before yielding,
        # so no handle stays open while the caller holds the generator.
        with open(os.path.join(dir, filename), 'r') as f:
            sites = list(sequences_from_jaspar_file(f))
        yield (filename.split('.')[0], sites)
def jaspar_phylofacts_sequences():
    """Yield every item jaspar_sequences produces for the JASPAR_PHYLOFACTS directory."""
    phylofacts = jaspar_sequences(_jaspar_phylofacts_dir)
    for entry in phylofacts:
        yield entry

if '__main__' == __name__:
Ejemplo n.º 9
0
def get_data_dir():
    """Return the 'biobase/transpro' sub-directory of the biopsy data directory."""
    base_dir = biopsy.get_data_dir()
    return os.path.join(base_dir, 'biobase', 'transpro')
Ejemplo n.º 10
0
def get_programs_dir():
    "@return: The directory to put program specific info into."
    # NOTE(review): the ensure_dir_exists call below is commented out, so this
    # function only returns the path and does not create the directory.
    #ensure_dir_exists(_programs_dir)
    return _programs_dir


_summaries_dir = os.path.join(options.output_dir, 'summaries')
"the directory to put DPM summaries into"

def get_summaries_dir():
    "@return: The directory to put DPM summaries into."
    # NOTE(review): the ensure_dir_exists call below is commented out, so this
    # function only returns the path and does not create the directory.
    #ensure_dir_exists(_summaries_dir)
    return _summaries_dir


_site_dpm_data_dir = os.path.join(biopsy.get_data_dir(), 'site-dpm')
"The directory where results that are reused across runs are cached."

def get_site_dpm_data_dir():
    "@return: The directory where results that are reused across runs are cached."
    # NOTE(review): the ensure_dir_exists call below is commented out, so this
    # function only returns the path and does not create the directory.
    #ensure_dir_exists(_site_dpm_data_dir)
    return _site_dpm_data_dir



# pylab is optional: if importing it fails, warn and report the error instead
# of aborting the module import.
try:
    import pylab
except Exception:
    # Catch Exception rather than using a bare 'except' so that
    # KeyboardInterrupt and SystemExit still propagate.
    # NOTE(review): the warning text mentions the figure size although only the
    # import is attempted here - confirm the intended message.
    import warnings
    warnings.warn('Could not set matplotlib figure size')
    print(sys.exc_info())
Ejemplo n.º 11
0
def get_data_dir():
    """Return the 'biobase/transpro' sub-directory of the biopsy data directory."""
    base_dir = biopsy.get_data_dir()
    return os.path.join(base_dir, 'biobase', 'transpro')
Ejemplo n.º 12
0
#
# Copyright John Reid 2006-2010
#

import gzip, os.path, re, biopsy

_base_data_dir = os.path.join(biopsy.get_data_dir(), 'ensembl', 'genomes')


def get_genome_dir(genome):
    """Return the path of the given genome's directory below the base genomes directory."""
    genome_dir = os.path.join(_base_data_dir, genome)
    return genome_dir


def get_chromosome_file(genome, chromosome):
    """Returns an open file handle for the chromosome file in the given genome

    The genome directory is searched for the single file whose name ends in
    '.dna.chromosome.<chromosome>'.

    @param genome: Name of the genome; passed to get_genome_dir.
    @param chromosome: Chromosome identifier. It is converted with str() and
        matched literally.
    @return: The matching file opened for reading. The caller is responsible
        for closing it.
    @raise RuntimeError: If no file or more than one file matches.
    """
    genome_dir = get_genome_dir(genome)
    # Escape the identifier so regex metacharacters in it cannot widen the
    # match, and use a raw string for the pattern itself.
    chr_re = re.compile(r'\.dna\.chromosome\.%s$' % re.escape(str(chromosome)))
    matched_files = [f for f in os.listdir(genome_dir) if chr_re.search(f)]
    if not matched_files:
        raise RuntimeError('Did not find chromosome "%s" in directory: %s' %
                           (str(chromosome), genome_dir))
    if len(matched_files) > 1:
        raise RuntimeError('Expecting only one match in genome directory: %s' %
                           genome_dir)
    filename = os.path.join(genome_dir, matched_files[0])
    # Parenthesised single-argument print produces the same output as the
    # print statement and also parses under Python 3.
    print('File: %s' % filename)
    return open(filename, 'r')


def get_chromosome_sequence(chromosome_file, offset, length):
Ejemplo n.º 13
0
# Copyright John Reid 2007
#

"""
Code to parse UniProt data.

www.uniprot.org/
"""

import os
import gzip, sys, cookbook, biopsy
from . import lazy

T = biopsy.transfac


_uniprot_file = os.path.join(biopsy.get_data_dir(), "UniProt", "uniprot_sprot.dat.gz")


def data():
    """
    Open the gzipped UniProt data file and return the resulting handle.
    """
    handle = gzip.open(_uniprot_file)
    return handle


def yield_records(handle):
    """
    Takes a file like handle and separates it into records.
    """
    record = []
    for line in handle:
Ejemplo n.º 14
0
#
# Copyright John Reid 2006-2010
#

import gzip, os.path, re, biopsy

_base_data_dir = os.path.join(biopsy.get_data_dir(), 'ensembl', 'genomes')

def get_genome_dir( genome ):
    """Return the path of the given genome's directory below the base genomes directory."""
    genome_dir = os.path.join( _base_data_dir, genome )
    return genome_dir

def get_chromosome_file( genome, chromosome ):
    """Returns an open file handle for the chromosome file in the given genome

    The genome directory is searched for the single file whose name ends in
    '.dna.chromosome.<chromosome>'.

    @param genome: Name of the genome; passed to get_genome_dir.
    @param chromosome: Chromosome identifier. It is converted with str() and
        matched literally.
    @return: The matching file opened for reading. The caller is responsible
        for closing it.
    @raise RuntimeError: If no file or more than one file matches.
    """
    genome_dir = get_genome_dir( genome )
    # Escape the identifier so regex metacharacters in it cannot widen the
    # match, and use a raw string for the pattern itself.
    chr_re = re.compile( r'\.dna\.chromosome\.%s$' % re.escape( str( chromosome ) ) )
    matched_files = [ f for f in os.listdir( genome_dir ) if chr_re.search( f ) ]
    if not matched_files:
        raise RuntimeError(
                'Did not find chromosome "%s" in directory: %s' % (
                        str( chromosome ),
                        genome_dir
                )
        )
    if len( matched_files ) > 1:
        raise RuntimeError(
                'Expecting only one match in genome directory: %s' % genome_dir
        )
    filename = os.path.join( genome_dir, matched_files[0] )
    # Parenthesised single-argument print produces the same output as the
    # print statement and also parses under Python 3.
    print( 'File: %s' % filename )
    return open( filename, 'r' )