def _translate_gene_list(gene_list):
    """Turn *gene_list* into a ``(pathway_name, genes)`` pair.

    gene_list is either already a sequence of strings, or a string naming a
    file that contains a list of strings. A file name is tried as given
    first, then joined onto the data directory, then joined onto the
    pathways directory. The file can be in any format supported by
    read_strings_from_file().

    Returns:
        ``(None, gene_list)`` when gene_list is not a string (used as-is);
        ``(name, strings)`` when a file is found, where name is the file's
        basename without its extension;
        ``(None, None)`` when gene_list is a string but no file matches.
    """
    if isinstance(gene_list, basestring):
        # the pathway is named after the file: no directory, no extension
        stem, _extension = splitext(basename(gene_list))
        candidates = [
            gene_list,
            join(project_dirs.data_dir(), gene_list),
            join(project_dirs.pathways_dir(), gene_list),
        ]
        for candidate in candidates:
            if not isfile(candidate):
                continue
            if cfg.verbosity > 0:
                print('Reading gene list from {}'.format(candidate))
            return stem, read_strings_from_file(candidate)
        # a string was given but none of the candidate paths exists
        return None, None

    # not something that could be a filename: assume a sequence of strings
    return None, gene_list
def load_17_pathways_breakdown(b_unique=False, short_names=False):
    """Load the pickled pathway dictionary from the data directory.

    The data is read from '17pathways-breakdown.pkl'.

    Args:
        b_unique: when true, the loaded dictionary is passed through
            unique_genes_only() before being returned.
        short_names: when true, every key is replaced by its entry in
            get_17_pathways_short_names(); a key that is missing from that
            mapping raises KeyError.

    Returns:
        The (possibly filtered / re-keyed) pathway dictionary.
    """
    pickle_path = join(project_dirs.data_dir(), '17pathways-breakdown.pkl')
    # NOTE(review): the pickle is opened in the default (text) mode - confirm
    # this matches how the file is written before switching to 'rb'.
    with open(pickle_path) as handle:
        breakdown = pickle.load(handle)

    if b_unique:
        breakdown = unique_genes_only(breakdown)

    if not short_names:
        return breakdown

    short_name_of = get_17_pathways_short_names()
    renamed = {}
    for full_name, genes in breakdown.iteritems():
        renamed[short_name_of[full_name]] = genes
    return renamed
def load(dataset):
    """Load '<dataset>_allGenes.mat' from the data directory as a OneDataset.

    Processing steps, in order:
      1. read ages, gene names, region names, optional genders and the
         expression array from the .mat file;
      2. give a 2-D expression array a trailing axis of length one, so that
         it is always indexed as (age, gene, region);
      3. collapse duplicate gene names by averaging their expression, leaving
         the genes sorted by name;
      4. reorder ages, genders and expression so that ages are ascending;
      5. build the OneDataset (with exon information when cfg.exon_level is
         set), restrict it to pathway 'all', and, if cfg.sorted_regions has an
         entry for this dataset, restrict it to those regions.

    Args:
        dataset: dataset name; also stored as the OneDataset's name.

    Returns:
        The resulting OneDataset.
    """
    path = join(project_dirs.data_dir(), '{}_allGenes.mat'.format(dataset))
    if cfg.verbosity > 0:
        print('Loading dataset {} from {}'.format(dataset, path))
    mat = loadmat(path)

    ages = np.array(mat['ages'].flat)
    gene_names = np.array(
        matlab_cell_array_to_list_of_strings(mat['gene_names']))
    region_names = np.array(
        matlab_cell_array_to_list_of_strings(mat['region_names']))
    genders = None
    if 'genders' in mat:
        genders = np.array(
            matlab_cell_array_to_list_of_strings(mat['genders']))

    expression = mat['expression']
    if expression.ndim == 2:
        # extend shape to represent a single region name
        expression.shape = list(expression.shape) + [1]

    # average expression for duplicate genes
    occurrences = defaultdict(list)  # gene name -> indices where it appears
    for position, name in enumerate(gene_names):
        occurrences[name].append(position)
    unique_genes = sorted(set(gene_names))
    averaged = np.empty(
        [len(ages), len(unique_genes), len(region_names)])
    for column, name in enumerate(unique_genes):
        duplicates = occurrences[name]
        averaged[:, column, :] = expression[:, duplicates, :].mean(axis=1)
    gene_names = np.array(unique_genes)
    expression = averaged

    # make sure ages are sorted (for colantuoni there are 2 datapoints that aren't)
    order = np.argsort(ages)
    ages = ages[order]
    if genders is not None:
        genders = genders[order]
    expression = expression[order, :, :]

    exons = None
    if cfg.exon_level:
        exons = _get_exons(gene_names)

    full_dataset = OneDataset(
        expression=expression,
        gene_names=gene_names,
        region_names=region_names,
        genders=genders,
        ages=ages,
        name=dataset,
        exons=exons,
    )
    res = full_dataset.restrict_pathway('all')

    region_order = cfg.sorted_regions.get(dataset)
    if region_order is None:
        return res
    return res.restrict_regions(region_order)
def load_kang_tree_distances():
    """Read 'kangTreeDistances.mat' from the data directory.

    Returns:
        A Bunch with three fields:
          regions     - mat['regions'] converted by
                        matlab_cell_array_to_list_of_strings();
          dct_regions - maps each region name to its position in regions;
          distances   - mat['developingTreeDistances'], unmodified.
    """
    mat = loadmat(join(project_dirs.data_dir(), 'kangTreeDistances.mat'))
    tree_distances = mat['developingTreeDistances']
    region_names = matlab_cell_array_to_list_of_strings(mat['regions'])

    # reverse lookup: region name -> index into region_names
    position_of = {}
    for position, region in enumerate(region_names):
        position_of[region] = position

    return Bunch(
        regions=region_names,
        dct_regions=position_of,
        distances=tree_distances,
    )
def load_kang_tree_distances():
    """Load the tree distances stored in 'kangTreeDistances.mat'.

    The file is looked up in the data directory.

    Returns:
        Bunch(regions, dct_regions, distances) where regions comes from
        mat['regions'] (converted with matlab_cell_array_to_list_of_strings),
        dct_regions maps a region name to its index in regions, and distances
        is mat['developingTreeDistances'] as loaded.
    """
    data_path = join(project_dirs.data_dir(), 'kangTreeDistances.mat')
    contents = loadmat(data_path)

    fields = {}
    fields['distances'] = contents['developingTreeDistances']
    fields['regions'] = matlab_cell_array_to_list_of_strings(
        contents['regions'])
    # index of every region name within the regions sequence
    fields['dct_regions'] = dict(
        (region, idx) for idx, region in enumerate(fields['regions']))
    return Bunch(**fields)
def load_17_pathways_breakdown(b_unique=False, short_names=False):
    """Return the pathway dictionary pickled in '17pathways-breakdown.pkl'.

    The pickle file is read from the data directory.

    Args:
        b_unique: if true, apply unique_genes_only() to the loaded dictionary.
        short_names: if true, translate every key through the mapping returned
            by get_17_pathways_short_names() (KeyError if a key is absent from
            that mapping).

    Returns:
        The loaded dictionary after the requested transformations.
    """
    source = join(project_dirs.data_dir(), '17pathways-breakdown.pkl')
    # NOTE(review): opened in the default (text) mode - confirm against the
    # code that writes this pickle before changing the mode.
    with open(source) as pickled:
        pathways = pickle.load(pickled)

    pathways = unique_genes_only(pathways) if b_unique else pathways

    if short_names:
        to_short = get_17_pathways_short_names()
        pathways = dict(
            (to_short[long_name], members)
            for long_name, members in pathways.iteritems()
        )
    return pathways
def load_entrez_to_symbol_mapping():
    """Build an Entrez-id -> gene-symbol dictionary.

    The mapping is read from 'human_entrez_conversion.txt' in the data
    directory. The first line is treated as a header and skipped. Every
    other line is split on whitespace: the second field is the Entrez id and
    the third is the symbol (the first field is ignored, as are any fields
    after the third).

    Lines with fewer than three fields, and lines whose Entrez id is the
    literal 'NA', are skipped.

    Returns:
        dict mapping int Entrez id -> symbol string. When an id appears on
        several lines, the last one wins.

    Raises:
        ValueError: if an Entrez field is neither 'NA' nor an integer literal.
    """
    conversion_filename = join(project_dirs.data_dir(),
                               'human_entrez_conversion.txt')
    dct_mapping = {}
    with open(conversion_filename) as f:
        next(f, None)  # skip header (tolerates a completely empty file)
        for line in f:
            fields = line.split()
            if len(fields) < 3:
                continue
            # Only the first three columns are meaningful here. Slicing keeps
            # a line with extra trailing columns from blowing up the unpack
            # with "too many values to unpack".
            _, entrez, symbol = fields[:3]
            if entrez == 'NA':
                continue
            dct_mapping[int(entrez)] = symbol
    return dct_mapping
def load(dataset):
    """Read '<dataset>_allGenes.mat' from the data directory into a OneDataset.

    The expression array is always handled as (age, gene, region): a 2-D
    array from the file gets a trailing length-one region axis. Genes that
    appear more than once are merged by averaging their expression and the
    resulting genes are sorted by name. All per-sample arrays (ages, genders,
    expression) are then reordered so that ages are ascending.

    The dataset is restricted to pathway 'all' and, when cfg.sorted_regions
    has an entry for this dataset, to the regions listed there.

    Args:
        dataset: dataset name; also used as the name of the OneDataset.

    Returns:
        The resulting OneDataset.
    """
    mat_path = join(project_dirs.data_dir(),
                    '{}_allGenes.mat'.format(dataset))
    if cfg.verbosity > 0:
        print('Loading dataset {} from {}'.format(dataset, mat_path))
    contents = loadmat(mat_path)

    to_strings = matlab_cell_array_to_list_of_strings
    ages = np.array(contents['ages'].flat)
    gene_names = np.array(to_strings(contents['gene_names']))
    region_names = np.array(to_strings(contents['region_names']))
    if 'genders' in contents:
        genders = np.array(to_strings(contents['genders']))
    else:
        genders = None

    expression = contents['expression']
    if expression.ndim == 2:
        # extend shape to represent a single region name
        n_ages, n_genes = expression.shape
        expression.shape = [n_ages, n_genes, 1]

    # average expression for duplicate genes
    indices_of = defaultdict(list)  # gene_name -> indices where it appears
    for index, gene in enumerate(gene_names):
        indices_of[gene].append(index)
    merged_names = sorted(set(gene_names))
    merged_shape = [len(ages), len(merged_names), len(region_names)]
    merged_expression = np.empty(merged_shape)
    for target, gene in enumerate(merged_names):
        per_copy = expression[:, indices_of[gene], :]
        merged_expression[:, target, :] = per_copy.mean(axis=1)
    gene_names = np.array(merged_names)
    expression = merged_expression

    # make sure ages are sorted (for colantuoni there are 2 datapoints that aren't)
    age_order = np.argsort(ages)
    ages = ages[age_order]
    expression = expression[age_order, :, :]
    if genders is not None:
        genders = genders[age_order]

    res = OneDataset(
        expression=expression,
        gene_names=gene_names,
        region_names=region_names,
        genders=genders,
        ages=ages,
        name=dataset,
    ).restrict_pathway('all')

    if cfg.sorted_regions.get(dataset) is not None:
        res = res.restrict_regions(cfg.sorted_regions[dataset])
    return res
def _translate_gene_list(gene_list):
    """Resolve a gene list given either directly or as a file name.

    gene_list may already be a sequence of strings, in which case it is
    returned untouched, or it may be a string naming a file holding the list.
    The name is tried as given, then joined onto the data directory, then
    joined onto the pathways directory; the first existing file is read with
    read_strings_from_file().

    Returns:
        A ``(pathway_name, gene_list)`` tuple:
          - ``(None, gene_list)`` if gene_list is not a string;
          - ``(basename without extension, strings from the file)`` if a
            matching file exists;
          - ``(None, None)`` if gene_list is a string but no file was found.
    """
    # if it's not something that could be a filename assume it's a sequence of strings
    could_be_filename = isinstance(gene_list, basestring)
    if not could_be_filename:
        return None, gene_list

    # use the file's basename (without extension) as the pathway name
    pathway_name = splitext(basename(gene_list))[0]
    search_paths = [
        gene_list,
        join(project_dirs.data_dir(), gene_list),
        join(project_dirs.pathways_dir(), gene_list),
    ]

    # first candidate that exists on disk, or None when nothing matches
    found = next((path for path in search_paths if isfile(path)), None)
    if found is None:
        return None, None

    if cfg.verbosity > 0:
        print('Reading gene list from {}'.format(found))
    return pathway_name, read_strings_from_file(found)
def get_entrez_pathways():
    """Collect the genes belonging to a fixed set of pathways.

    Loads the 'path_genes_mat' matrix from 'gene_pathways.mat' in the data
    directory. Every nonzero entry at (row, column) is read as "pathway
    number row+1 contains gene number column+1"; the +1 is there because the
    matrix was created in matlab, which is 1-based. Pathways whose number is
    not in the table below are ignored.

    Returns:
        defaultdict(set) mapping pathway name -> set of gene numbers.
        Pathways from the table that have no nonzero entry are absent.
    """
    pathway_numbers = {
        4020: 'Calcium signaling',  # http://www.genome.jp/kegg-bin/show_pathway?map=hsa04020
        4080: 'Neuroactive ligand-receptor interaction',
        4724: 'Glutamatergic synapse',
        4725: 'Cholinergic synapse',
        4726: 'Serotonergic synapse',
        4727: 'GABAergic synapse',
        4728: 'Dopaminergic synapse',
        4730: 'Long-term depression',
        5010: "Alzheimer's disease",
        5012: "Parkinson's disease",
        5014: 'Amyotrophic lateral sclerosis (ALS)',
        5016: "Huntington's disease",
        5030: 'Cocaine addiction',
        5031: 'Amphetamine addiction',
        5032: 'Morphine addiction',
        5033: 'Nicotine addiction',
        5034: 'Alcoholism',
    }

    mat_path = join(project_dirs.data_dir(), 'gene_pathways.mat')
    membership = loadmat(mat_path)['path_genes_mat']

    rows, columns = np.nonzero(membership)
    # the matrix was created in matlab which is 1 based. python is 0 based
    pathway_ids = rows + 1
    gene_ids = columns + 1

    genes_by_pathway = defaultdict(set)
    for pathway_id, gene_id in zip(pathway_ids, gene_ids):
        if pathway_id not in pathway_numbers:
            continue
        genes_by_pathway[pathway_numbers[pathway_id]].add(gene_id)
    return genes_by_pathway
dct_mapping = {} for line in lines[1:]: # skip header fields = line.split() if len(fields) < 3: continue _,entrez,symbol = fields if entrez == 'NA': continue dct_mapping[int(entrez)] = symbol return dct_mapping dct_pathways_entrez = get_entrez_pathways() dct_mapping = load_entrez_to_symbol_mapping() dct_pathways = {} for pathway,entrez_genes in dct_pathways_entrez.iteritems(): symbols = set(dct_mapping.get(eg) for eg in entrez_genes) symbols = set(x for x in symbols if x is not None) dct_pathways[pathway] = symbols outfile = join(project_dirs.data_dir(),'17pathways-breakdown.pkl') with open(outfile,'w') as f: pickle.dump(dct_pathways,f) all_genes = set(g for pwy in dct_pathways.itervalues() for g in pwy) outfile = join(project_dirs.data_dir(),'17pathways-full.txt') with open(outfile,'w') as f: for g in sorted(all_genes): f.write(g + '\n')