コード例 #1
0
def _translate_gene_list(gene_list):
    """Resolve a gene list specification into (pathway_name, gene_list).

    gene_list is either a sequence of strings, which is handed back as is
    with no pathway name, or a string naming a file that holds the genes.
    A file name is tried as given, then inside the data directory, then
    inside the pathways directory. The first existing file is read with
    read_strings_from_file() and its basename (without the extension)
    becomes the pathway name.

    Returns (None, gene_list) for a non-string argument, (pathway_name,
    genes) when a file was found and (None, None) when none exists.
    """
    if not isinstance(gene_list, basestring):
        # cannot be a filename, so treat it as the gene sequence itself
        return None, gene_list

    pathway_name, _extension = splitext(basename(gene_list))
    search_dirs = [project_dirs.data_dir(), project_dirs.pathways_dir()]
    candidates = [gene_list] + [join(d, gene_list) for d in search_dirs]
    for candidate in candidates:
        if not isfile(candidate):
            continue
        if cfg.verbosity > 0:
            print('Reading gene list from {}'.format(candidate))
        return pathway_name, read_strings_from_file(candidate)

    # none of the candidate paths exists
    return None, None
コード例 #2
0
ファイル: load_data.py プロジェクト: ronniemaor/timefit
def load_17_pathways_breakdown(b_unique=False, short_names=False):
    """Load the pathway dictionary pickled in '17pathways-breakdown.pkl'.

    The file is looked up in the project data directory.

    b_unique: when true, the loaded dictionary is passed through
        unique_genes_only() before being returned.
    short_names: when true, every key is replaced by its entry in the
        mapping returned by get_17_pathways_short_names().

    Returns the resulting dictionary (values are whatever the pickle holds).
    """
    pickle_path = join(project_dirs.data_dir(), '17pathways-breakdown.pkl')
    with open(pickle_path) as pickle_file:
        pathways = pickle.load(pickle_file)
    if b_unique:
        pathways = unique_genes_only(pathways)
    if not short_names:
        return pathways
    short_name_of = get_17_pathways_short_names()
    return dict(
        (short_name_of[full_name], genes)
        for full_name, genes in pathways.iteritems()
    )
コード例 #3
0
    def load(dataset):
        """Read '<dataset>_allGenes.mat' from the data directory as a OneDataset.

        Steps performed on the loaded arrays:
          * a 2D expression array gets a trailing axis of length 1 so that
            it always has (ages, genes, regions) axes
          * genes that appear more than once are collapsed into one entry
            holding the mean of their expression; gene names end up sorted
          * all per-sample arrays are reordered so that ages are ascending
          * exons are fetched with _get_exons() only if cfg.exon_level is set

        The dataset is restricted to the 'all' pathway and, if
        cfg.sorted_regions has an entry for this dataset, to those regions.
        """
        path = join(project_dirs.data_dir(), '{}_allGenes.mat'.format(dataset))
        if cfg.verbosity > 0:
            print('Loading dataset {} from {}'.format(dataset, path))
        mat = loadmat(path)

        def strings_array(key):
            # matlab cell array of strings -> numpy array of strings
            return np.array(matlab_cell_array_to_list_of_strings(mat[key]))

        ages = np.array(mat['ages'].flat)
        gene_names = strings_array('gene_names')
        region_names = strings_array('region_names')
        genders = strings_array('genders') if 'genders' in mat else None
        expression = mat['expression']
        if expression.ndim == 2:
            # a single region: add the missing region axis
            expression.shape = tuple(expression.shape) + (1,)

        # collapse duplicate genes into their mean expression
        occurrences = defaultdict(list)  # gene name -> positions in gene_names
        for position, name in enumerate(gene_names):
            occurrences[name].append(position)
        unique_names = sorted(set(gene_names))
        averaged = np.empty(
            [len(ages), len(unique_names), len(region_names)])
        for column, name in enumerate(unique_names):
            duplicates = expression[:, occurrences[name], :]
            averaged[:, column, :] = duplicates.mean(axis=1)
        gene_names = np.array(unique_names)
        expression = averaged

        # ages are not guaranteed to be sorted (colantuoni has 2 datapoints
        # out of order), so reorder every per-sample array by age
        order = np.argsort(ages)
        ages = ages[order]
        if genders is not None:
            genders = genders[order]
        expression = expression[order, :, :]

        exons = None
        if cfg.exon_level:
            exons = _get_exons(gene_names)

        full_dataset = OneDataset(
            expression=expression,
            gene_names=gene_names,
            region_names=region_names,
            genders=genders,
            ages=ages,
            name=dataset,
            exons=exons,
        )
        res = full_dataset.restrict_pathway('all')
        region_order = cfg.sorted_regions.get(dataset)
        if region_order is None:
            return res
        return res.restrict_regions(region_order)
コード例 #4
0
ファイル: load_data.py プロジェクト: ronniemaor/timefit
def load_kang_tree_distances():
    """Load 'kangTreeDistances.mat' from the project data directory.

    Returns a Bunch with:
        regions: the region names converted from the 'regions' cell array
        dct_regions: dictionary mapping each region name to its index in
            regions
        distances: the 'developingTreeDistances' variable of the file
    """
    mat_path = join(project_dirs.data_dir(), 'kangTreeDistances.mat')
    contents = loadmat(mat_path)
    tree_distances = contents['developingTreeDistances']
    region_names = matlab_cell_array_to_list_of_strings(contents['regions'])
    index_of_region = dict(
        (name, position) for position, name in enumerate(region_names))
    return Bunch(
        regions=region_names,
        dct_regions=index_of_region,
        distances=tree_distances,
    )
コード例 #5
0
def load_kang_tree_distances():
    """Read the tree distances stored in 'kangTreeDistances.mat'.

    The file is taken from the project data directory. The result is a
    Bunch holding the region names (regions), a name -> index lookup for
    those names (dct_regions) and the 'developingTreeDistances' variable
    of the file (distances).
    """
    data = loadmat(join(project_dirs.data_dir(), 'kangTreeDistances.mat'))
    result = Bunch(
        distances=data['developingTreeDistances'],
        regions=matlab_cell_array_to_list_of_strings(data['regions']),
        dct_regions={},
    )
    # position of every region name inside result.regions
    for idx, region in enumerate(result.regions):
        result.dct_regions[region] = idx
    return result
コード例 #6
0
def load_17_pathways_breakdown(b_unique=False, short_names=False):
    """Return the dictionary stored in '17pathways-breakdown.pkl'.

    The pickle is read from the project data directory. With b_unique the
    dictionary is first run through unique_genes_only(). With short_names
    the keys are translated using get_17_pathways_short_names().
    """
    with open(join(project_dirs.data_dir(),
                   '17pathways-breakdown.pkl')) as pkl:
        breakdown = pickle.load(pkl)
    if b_unique:
        breakdown = unique_genes_only(breakdown)
    if short_names:
        short_name_of = get_17_pathways_short_names()
        renamed = {}
        for long_name, genes in breakdown.iteritems():
            renamed[short_name_of[long_name]] = genes
        breakdown = renamed
    return breakdown
コード例 #7
0
def load_entrez_to_symbol_mapping():
    """Build a dictionary from entrez id (int) to gene symbol (string).

    The mapping is parsed from 'human_entrez_conversion.txt' in the project
    data directory. The first line is treated as a header. Every other line
    is split on whitespace: lines with fewer than three fields and lines
    whose entrez field is 'NA' are ignored, otherwise the second field is
    the entrez id and the third field is the symbol.

    Raises ValueError if a line has more than three fields or if an entrez
    field other than 'NA' is not an integer.
    """
    table_path = join(project_dirs.data_dir(), 'human_entrez_conversion.txt')
    with open(table_path) as table:
        rows = table.readlines()

    entrez_to_symbol = {}
    for row in rows[1:]:  # rows[0] is the header
        columns = row.split()
        if len(columns) >= 3:
            _, entrez_id, gene_symbol = columns
            if entrez_id != 'NA':
                entrez_to_symbol[int(entrez_id)] = gene_symbol
    return entrez_to_symbol
コード例 #8
0
ファイル: load_data.py プロジェクト: ronniemaor/timefit
    def load(dataset):
        """Build a OneDataset from '<dataset>_allGenes.mat' in the data directory.

        The expression array is given a trailing region axis if it is 2D,
        duplicate gene names are merged by averaging their expression (the
        resulting gene names are sorted) and all per-sample arrays are put
        in ascending age order. The result is restricted to the 'all'
        pathway and, when cfg.sorted_regions has an entry for the dataset,
        to the regions listed there.
        """
        data_root = project_dirs.data_dir()
        path = join(data_root, '{}_allGenes.mat'.format(dataset))
        if cfg.verbosity > 0:
            print('Loading dataset {} from {}'.format(dataset, path))
        mat = loadmat(path)

        to_strings = matlab_cell_array_to_list_of_strings
        ages = np.array(mat['ages'].flat)
        gene_names = np.array(to_strings(mat['gene_names']))
        region_names = np.array(to_strings(mat['region_names']))
        genders = None
        if 'genders' in mat:
            genders = np.array(to_strings(mat['genders']))
        expression = mat['expression']
        if expression.ndim == 2:
            # only one region: append the region axis
            expression.shape = tuple(expression.shape) + (1,)

        # merge duplicate genes by averaging their expression
        indices_by_gene = defaultdict(list)  # gene name -> where it appears
        for index, gene in enumerate(gene_names):
            indices_by_gene[gene].append(index)
        distinct_genes = sorted(set(gene_names))
        merged_shape = [len(ages), len(distinct_genes), len(region_names)]
        merged = np.empty(merged_shape)
        for slot, gene in enumerate(distinct_genes):
            rows = indices_by_gene[gene]
            merged[:, slot, :] = expression[:, rows, :].mean(axis=1)
        gene_names = np.array(distinct_genes)
        expression = merged

        # put samples in ascending age order (colantuoni has 2 datapoints
        # that are out of order)
        by_age = np.argsort(ages)
        ages = ages[by_age]
        if genders is not None:
            genders = genders[by_age]
        expression = expression[by_age, :, :]

        unrestricted = OneDataset(
            expression=expression,
            gene_names=gene_names,
            region_names=region_names,
            genders=genders,
            ages=ages,
            name=dataset,
        )
        res = unrestricted.restrict_pathway('all')
        wanted_regions = cfg.sorted_regions.get(dataset)
        if wanted_regions is not None:
            res = res.restrict_regions(wanted_regions)
        return res
コード例 #9
0
ファイル: load_data.py プロジェクト: ronniemaor/timefit
def _translate_gene_list(gene_list):
    """Turn gene_list into a (pathway_name, genes) pair.

    A gene_list that is not a string is assumed to already be a sequence of
    gene names and is returned unchanged, with None as the pathway name.

    A string is taken to be a file name. It is looked up as given, then
    relative to the data directory, then relative to the pathways
    directory. The first file found is read with read_strings_from_file()
    (any format that function supports) and the pathway name is the file's
    basename without its extension. If no file is found the result is
    (None, None).
    """
    if isinstance(gene_list, basestring):
        name = splitext(basename(gene_list))[0]
        in_data_dir = join(project_dirs.data_dir(), gene_list)
        in_pathways_dir = join(project_dirs.pathways_dir(), gene_list)
        for location in (gene_list, in_data_dir, in_pathways_dir):
            if isfile(location):
                if cfg.verbosity > 0:
                    print('Reading gene list from {}'.format(location))
                genes = read_strings_from_file(location)
                return name, genes
        # no matching file
        return None, None

    # not a possible filename: assume it is the sequence of strings itself
    return None, gene_list
コード例 #10
0
def get_entrez_pathways():
    """Collect the genes of the pathways listed below from 'gene_pathways.mat'.

    The 'path_genes_mat' variable of that file (in the project data
    directory) is scanned for nonzero entries. Each nonzero entry at
    (row, column) assigns gene number column+1 to pathway number row+1.
    Only pathway numbers present in the table below are kept.

    Returns a defaultdict(set) mapping pathway name to its set of gene
    numbers.
    """
    pathway_numbers = {
        4020: 'Calcium signaling',  # http://www.genome.jp/kegg-bin/show_pathway?map=hsa04020
        4080: 'Neuroactive ligand-receptor interaction',
        4724: 'Glutamatergic synapse',
        4725: 'Cholinergic synapse',
        4726: 'Serotonergic synapse',
        4727: 'GABAergic synapse',
        4728: 'Dopaminergic synapse',
        4730: 'Long-term depression',
        5010: "Alzheimer's disease",
        5012: "Parkinson's disease",
        5014: 'Amyotrophic lateral sclerosis (ALS)',
        5016: "Huntington's disease",
        5030: 'Cocaine addiction',
        5031: 'Amphetamine addiction',
        5032: 'Morphine addiction',
        5033: 'Nicotine addiction',
        5034: 'Alcoholism',
    }

    mat_path = join(project_dirs.data_dir(), 'gene_pathways.mat')
    membership = loadmat(mat_path)['path_genes_mat']

    rows, columns = np.nonzero(membership)
    dct_pathways = defaultdict(set)
    # the matrix was created in matlab (1 based) while python is 0 based,
    # hence the +1 on both indices
    for pathway_number, gene in zip(rows + 1, columns + 1):
        pathway_name = pathway_numbers.get(pathway_number)
        if pathway_name is not None:
            dct_pathways[pathway_name].add(gene)
    return dct_pathways
コード例 #11
0
    dct_mapping = {}
    for line in lines[1:]: # skip header
        fields = line.split()
        if len(fields) < 3:
            continue
        _,entrez,symbol = fields
        if entrez == 'NA':
            continue
        dct_mapping[int(entrez)] = symbol
    return dct_mapping
    
# Convert the entrez based pathways into gene symbols and save the result
# in the data directory: the per-pathway breakdown as a pickle and the
# union of all pathway genes as a text file (one symbol per line, sorted).
dct_pathways_entrez = get_entrez_pathways()
dct_mapping = load_entrez_to_symbol_mapping()

dct_pathways = {}
for pathway, entrez_genes in dct_pathways_entrez.iteritems():
    # entrez ids without a known symbol map to None and are dropped
    mapped = (dct_mapping.get(eg) for eg in entrez_genes)
    dct_pathways[pathway] = set(s for s in mapped if s is not None)

outfile = join(project_dirs.data_dir(), '17pathways-breakdown.pkl')
with open(outfile, 'w') as f:
    pickle.dump(dct_pathways, f)

all_genes = set().union(*dct_pathways.itervalues())
outfile = join(project_dirs.data_dir(), '17pathways-full.txt')
with open(outfile, 'w') as f:
    f.writelines(symbol + '\n' for symbol in sorted(all_genes))