Esempio n. 1
0
    def load(dataset):
        """Load the dataset called `dataset` and wrap it in a OneDataset.

        The data is read from ``<dataset>_allGenes.mat`` inside
        ``project_dirs.data_dir()``. The file must provide the entries
        'ages', 'gene_names', 'region_names' and 'expression'; 'genders'
        is optional and becomes None when absent.

        Processing steps:
          * a 2-D expression array gets a trailing axis of length 1 so that
            it is always indexed as (age, gene, region);
          * genes appearing more than once are collapsed to one entry whose
            expression is the mean over the duplicates; the resulting gene
            names are sorted;
          * samples are reordered so that ages are ascending;
          * exons are fetched with _get_exons when cfg.exon_level is truthy.

        Returns the OneDataset restricted to pathway 'all' and, when
        cfg.sorted_regions has an entry for `dataset`, restricted to those
        regions.
        """
        datadir = project_dirs.data_dir()
        filename = '{}_allGenes.mat'.format(dataset)
        path = join(datadir, filename)
        if cfg.verbosity > 0:
            # Parenthesised single-argument form: valid both as the Python 2
            # print statement and as the Python 3 print function.
            print('Loading dataset {} from {}'.format(dataset, path))
        mat = loadmat(path)
        ages = np.array(mat['ages'].flat)
        gene_names = np.array(
            matlab_cell_array_to_list_of_strings(mat['gene_names']))
        region_names = np.array(
            matlab_cell_array_to_list_of_strings(mat['region_names']))
        if 'genders' in mat:
            genders = np.array(
                matlab_cell_array_to_list_of_strings(mat['genders']))
        else:
            genders = None
        expression = mat['expression']
        if expression.ndim == 2:
            # Extend the shape to represent a single region name. reshape is
            # used rather than assigning to .shape, which NumPy discourages.
            expression = expression.reshape(tuple(expression.shape) + (1,))

        # Average expression for duplicate genes.
        indices_by_gene = defaultdict(list)  # gene_name -> indices where it appears
        for i, g in enumerate(gene_names):
            indices_by_gene[g].append(i)
        new_gene_names = sorted(indices_by_gene)
        new_expression = np.empty(
            [len(ages), len(new_gene_names), len(region_names)])
        for i, g in enumerate(new_gene_names):
            idx = indices_by_gene[g]
            new_expression[:, i, :] = expression[:, idx, :].mean(axis=1)
        gene_names = np.array(new_gene_names)
        expression = new_expression

        # Make sure ages are sorted (for colantuoni there are 2 datapoints
        # that aren't); every per-sample array is reordered the same way.
        inds = np.argsort(ages)
        ages = ages[inds]
        if genders is not None:
            genders = genders[inds]
        expression = expression[inds, :, :]

        exons = _get_exons(gene_names) if cfg.exon_level else None

        res = OneDataset(expression=expression,
                         gene_names=gene_names,
                         region_names=region_names,
                         genders=genders,
                         ages=ages,
                         name=dataset,
                         exons=exons).restrict_pathway('all')
        sorted_regions = cfg.sorted_regions.get(dataset)
        if sorted_regions is not None:
            # Reuse the value fetched above instead of a second lookup.
            res = res.restrict_regions(sorted_regions)
        return res
Esempio n. 2
0
    def load(dataset):
        """Load the dataset called `dataset` and wrap it in a OneDataset.

        The data is read from ``<dataset>_allGenes.mat`` inside
        ``project_dirs.data_dir()``. The file must provide the entries
        'ages', 'gene_names', 'region_names' and 'expression'; 'genders'
        is optional and becomes None when absent.

        Processing steps:
          * a 2-D expression array gets a trailing axis of length 1 so that
            it is always indexed as (age, gene, region);
          * genes appearing more than once are collapsed to one entry whose
            expression is the mean over the duplicates; the resulting gene
            names are sorted;
          * samples are reordered so that ages are ascending.

        Returns the OneDataset restricted to pathway 'all' and, when
        cfg.sorted_regions has an entry for `dataset`, restricted to those
        regions.
        """
        datadir = project_dirs.data_dir()
        filename = '{}_allGenes.mat'.format(dataset)
        path = join(datadir, filename)
        if cfg.verbosity > 0:
            # Parenthesised single-argument form: valid both as the Python 2
            # print statement and as the Python 3 print function.
            print('Loading dataset {} from {}'.format(dataset, path))
        mat = loadmat(path)
        ages = np.array(mat['ages'].flat)
        gene_names = np.array(matlab_cell_array_to_list_of_strings(mat['gene_names']))
        region_names = np.array(matlab_cell_array_to_list_of_strings(mat['region_names']))
        if 'genders' in mat:
            genders = np.array(matlab_cell_array_to_list_of_strings(mat['genders']))
        else:
            genders = None
        expression = mat['expression']
        if expression.ndim == 2:
            # Extend the shape to represent a single region name. reshape is
            # used rather than assigning to .shape, which NumPy discourages.
            expression = expression.reshape(tuple(expression.shape) + (1,))

        # Average expression for duplicate genes.
        indices_by_gene = defaultdict(list)  # gene_name -> indices where it appears
        for i, g in enumerate(gene_names):
            indices_by_gene[g].append(i)
        new_gene_names = sorted(indices_by_gene)
        new_expression = np.empty([len(ages), len(new_gene_names), len(region_names)])
        for i, g in enumerate(new_gene_names):
            idx = indices_by_gene[g]
            new_expression[:, i, :] = expression[:, idx, :].mean(axis=1)
        gene_names = np.array(new_gene_names)
        expression = new_expression

        # Make sure ages are sorted (for colantuoni there are 2 datapoints
        # that aren't); every per-sample array is reordered the same way.
        inds = np.argsort(ages)
        ages = ages[inds]
        if genders is not None:
            genders = genders[inds]
        expression = expression[inds, :, :]

        res = OneDataset(
            expression=expression,
            gene_names=gene_names,
            region_names=region_names,
            genders=genders,
            ages=ages,
            name=dataset,
        ).restrict_pathway('all')
        sorted_regions = cfg.sorted_regions.get(dataset)
        if sorted_regions is not None:
            # Reuse the value fetched above instead of a second lookup.
            res = res.restrict_regions(sorted_regions)
        return res
Esempio n. 3
0
def load_kang_tree_distances():
    """Read 'kangTreeDistances.mat' from the project data directory.

    Returns a Bunch with three fields:
      regions     -- the 'regions' entry converted by
                     matlab_cell_array_to_list_of_strings
      dct_regions -- dict mapping each region name to its position in
                     `regions`
      distances   -- the 'developingTreeDistances' entry, as loaded
    """
    mat_path = join(project_dirs.data_dir(), 'kangTreeDistances.mat')
    contents = loadmat(mat_path)
    tree_distances = contents['developingTreeDistances']
    region_list = matlab_cell_array_to_list_of_strings(contents['regions'])
    # Reverse lookup: region name -> index into region_list.
    index_of_region = dict(
        (name, pos) for pos, name in enumerate(region_list))
    return Bunch(
        regions=region_list,
        dct_regions=index_of_region,
        distances=tree_distances,
    )
Esempio n. 4
0
def load_kang_tree_distances():
    """Load the Kang tree distances stored in 'kangTreeDistances.mat'.

    The file is looked up in project_dirs.data_dir(). The result is a Bunch
    holding the region names (`regions`), a name -> index dictionary for
    those regions (`dct_regions`), and the 'developingTreeDistances' entry
    of the file unchanged (`distances`).
    """
    data = loadmat(join(project_dirs.data_dir(), 'kangTreeDistances.mat'))
    dist = data['developingTreeDistances']
    names = matlab_cell_array_to_list_of_strings(data['regions'])

    # Record the position of every region name.
    name_to_position = {}
    for position, name in enumerate(names):
        name_to_position[name] = position

    return Bunch(regions=names, dct_regions=name_to_position, distances=dist)