Esempio n. 1
0
def process_mousebrain_multiinds_data(data_dir, result_dir, sample_ind, pred_ind,
        region="FC", sample=False, sample_seed=0, celltype_gran=0):
    '''Build train/test data for predicting one mouse brain individual from
    all the other individuals of the same region.

    @data_dir: passed through to load_mousebrain_data.load_brain_adata
    @result_dir: not used by this function
    @sample_ind: not used by this function (kept so callers keep working)
    @pred_ind: the individual to predict; must be one of the individuals
        listed below for `region`
    @region: "FC" or "HC"
    @sample: if True, randomly downsample the training cells to the average
        number of cells per training individual
    @sample_seed: seed given to random.seed before downsampling
    @celltype_gran: passed through to load_brain_adata
    @return: (train_adata, test_adata)
    @raise ValueError: if `region` is not "FC"/"HC", or `pred_ind` is not an
        individual of that region
    '''
    ## individuals for brain regions
    region_inds = {
        "FC": ["P60FCAldh1l1Rep1", "P60FCCx3cr1Rep1", "P60FCRep1", "P60FCRep2",
               "P60FCRep3", "P60FCRep4", "P60FCRep6"],
        "HC": ["P60HippoRep1", "P60HippoRep2", "P60HippoRep3",
               "P60HippoRep4", "P60HippoRep5", "P60HippoRep6"],
    }

    ## fail early with a readable message instead of an unbound-variable error
    if region not in region_inds:
        raise ValueError("Unsupported region %r; expected one of %s"
                % (region, sorted(region_inds)))
    if pred_ind not in region_inds[region]:
        raise ValueError("Individual %r does not belong to region %r; expected one of %s"
                % (pred_ind, region, region_inds[region]))

    ## train on every individual of the region except the predicted one
    exclude_list = [ind for ind in region_inds[region] if ind != pred_ind]
    exclude_inds = '_'.join(exclude_list)
    train_adata = load_mousebrain_data.load_brain_adata(data_dir, region=region,
            ind=exclude_inds, celltype_gran=celltype_gran)

    ## downsample to the average number of cells per training individual
    if sample:
        avg_number = train_adata.shape[0] // len(exclude_list)

        print("=== Downsample to number:", avg_number)
        random.seed(sample_seed)
        sampled_cells = random.sample(list(train_adata.obs_names), k=avg_number)
        train_adata = train_adata[sampled_cells]

    test_adata = load_mousebrain_data.load_brain_adata(data_dir, region=region,
            ind=pred_ind, celltype_gran=celltype_gran)
    return train_adata, test_adata
Esempio n. 2
0
def process_mousebrain_regions(data_dir, result_dir, region1, region2,
        celltype_gran=0):
    '''Load one mouse brain region for training and another for testing.

    Each region argument is either "REGION" or "REGION_ind1_ind2...": the
    part before the first underscore is the region, everything after it is
    forwarded as the individual selection (None when there is no underscore).

    @region1: region (optionally with individuals) used for training
    @region2: region (optionally with individuals) used for testing
    @celltype_gran: forwarded to load_brain_adata
    @return: (train_adata, test_adata); both are loaded with curate=True
    '''
    ## split "REGION_inds" into its two parts for both inputs before loading
    train_region, train_sep, train_rest = region1.partition('_')
    test_region, test_sep, test_rest = region2.partition('_')
    train_inds = train_rest if train_sep else None
    test_inds = test_rest if test_sep else None

    ## load training data first, then test data
    train_adata = load_mousebrain_data.load_brain_adata(
            data_dir, region=train_region, ind=train_inds,
            celltype_gran=celltype_gran, curate=True)
    test_adata = load_mousebrain_data.load_brain_adata(
            data_dir, region=test_region, ind=test_inds,
            celltype_gran=celltype_gran, curate=True)
    return train_adata, test_adata
Esempio n. 3
0
def _load_cortex_protocol_or_mouseFC_adata(data_dir, dataset_input):
    '''Load the data described by one "dataset[_inds]" specification string.

    @dataset_input: "mouseFC[_inds]" or "MouseProtocol+exp+protocol[_inds]"
    @raise ValueError: if the dataset is not recognised or a MouseProtocol
        specification lacks the exp/protocol fields
    '''
    tokens = dataset_input.split('_')
    dataset = tokens[0]
    inds = '_'.join(tokens[1:]) if len(tokens) > 1 else None

    if "mouseFC" in dataset:
        return load_mousebrain_data.load_brain_adata(data_dir, region="FC",
                ind=inds, celltype_gran=0, curate=True)

    if "MouseProtocol" in dataset:
        info = dataset.split('+')
        if len(info) < 3:
            raise ValueError("Expected 'MouseProtocol+exp+protocol', got %r" % dataset)
        exp, protocol = info[1], info[2]
        ## "Both" means no experiment filter, which the loader takes as exp=None
        return load_mouseprotocol_data.load_mouseprotocol_adata(data_dir,
                exp=None if exp == "Both" else exp, protocol=protocol, curate=True)

    raise ValueError("Unrecognised dataset %r; expected 'mouseFC' or "
            "'MouseProtocol+exp+protocol'" % dataset)


def process_mousecortex_protocols_mouseFC_data(data_dir, result_dir,
        input1, input2, curate=False):
    '''Load mouse cortex protocol data and/or mouse FC data as a train/test pair.

    Mouse FC data is always loaded with celltype_gran=0 (major cell types).
    The original note on this function says the Cortex data does not provide
    sub-cell types.

    @input1: training specification, "MouseProtocol+exp+protocol[_inds]" or
        "mouseFC[_inds]"; exp == "Both" loads without an experiment filter
    @input2: test specification, same format as input1
    @result_dir: not used by this function
    @curate: currently NOT used; every loader call passes curate=True.
        Kept in the signature so existing callers keep working.
    @return: (train_adata, test_adata)
    @raise ValueError: if either specification is not recognised
    '''
    train_adata = _load_cortex_protocol_or_mouseFC_adata(data_dir, input1)
    test_adata = _load_cortex_protocol_or_mouseFC_adata(data_dir, input2)
    return train_adata, test_adata
Esempio n. 4
0
def process_mousebrain_data(data_dir, result_dir, ind1, ind2, region="FC",
        celltype_gran=0, curate=False):
    '''Load two individual selections of one mouse brain region as train/test.

    @ind1: individual(s) used for training, e.g. for FC:
        P60FCAldh1l1Rep1/P60FCCx3cr1Rep1/P60FCRep1/P60FCRep2/P60FCRep3/P60FCRep4/P60FCRep6
    @ind2: individual(s) used for testing
    @region: brain region forwarded to load_brain_adata
    @celltype_gran: granularity of cell types. 0: major cell types, 1: sub-celltypes
    @curate: whether to curate mouse brain data by combining different neurons/interneurons
    @result_dir: not used by this function
    @return: (train_adata, test_adata)
    '''
    ## both loads share everything except the individual selection
    shared_kwargs = dict(region=region, celltype_gran=celltype_gran, curate=curate)
    train_adata, test_adata = (
        load_mousebrain_data.load_brain_adata(data_dir, ind=ind, **shared_kwargs)
        for ind in (ind1, ind2)
    )
    return train_adata, test_adata
Esempio n. 5
0
def _load_crossdataset_inds_adata(data_dir, inds_spec, curate):
    '''Load and inner-join every dataset named in a "+"-separated spec.

    Each "+"-separated entry is checked for the substrings 'mFC', 'pFC' and
    'allen' independently; the matching prefix ('mFC_', 'pFC_', 'allen_') is
    stripped and the remainder is forwarded as the individual selection.
    Entries that match none of the three are skipped.

    @raise ValueError: if no entry matched any dataset
    '''
    adata_list = []
    for dataset_ind in inds_spec.split('+'):
        if 'mFC' in dataset_ind:
            adata_list.append(load_mousebrain_data.load_brain_adata(
                data_dir, region="FC",
                ind=dataset_ind.replace('mFC_', ''), curate=curate))
        if 'pFC' in dataset_ind:
            adata_list.append(load_mouseFC_data.load_FC_adata(
                data_dir, devstage='Adult',
                ind=dataset_ind.replace('pFC_', ''), curate=curate))
        if 'allen' in dataset_ind:
            adata_list.append(load_mouseAllen_data.load_mouseAllen_10X_data(
                data_dir, ind=dataset_ind.replace('allen_', ''), curate=curate))

    ## concatenate() needs at least one AnnData; report the bad spec clearly
    if not adata_list:
        raise ValueError("No dataset recognised in %r; expected '+'-separated "
                "entries containing 'mFC', 'pFC' or 'allen'" % inds_spec)
    return anndata.AnnData.concatenate(*adata_list, join='inner')


def process_mousebrain_crossdataset_inds_data(data_dir, result_dir, ref_inds, tgt_inds='mFC_P60FCCx3cr1Rep1',
        celltype_gran=0, curate=True):
    '''Build a train/test pair from combinations of mouse FC (mFC), pFC and
    Allen brain individuals.

    @ref_inds: reference (training) spec, "+"-separated entries such as
        "pFC_<inds>+allen_<inds>"
    @tgt_inds: target (test) spec in the same format, e.g. "mFC_P60FCCx3cr1Rep1"
    @curate: forwarded to every loader
    @result_dir: not used by this function
    @celltype_gran: not used by this function
    @return: (train_adata, test_adata), each the inner join of its datasets
    @raise ValueError: if either spec names no recognised dataset
    '''
    train_adata = _load_crossdataset_inds_adata(data_dir, ref_inds, curate)
    test_adata = _load_crossdataset_inds_adata(data_dir, tgt_inds, curate)
    return train_adata, test_adata
Esempio n. 6
0
def process_FCdatasets_mouseFC_data(data_dir, result_dir, datasets_ref, mousebrain_tgt,
        celltype_gran=0, curate=False):
    '''Combine mouse brain protocol + pFC datasets together to predict the
    mouse FC target individual

    @datasets_ref: "_"-separated dataset specs, each "name+field1+field2":
        "MouseProtocol+exp+protocol" (exp == "Both" loads without an
        experiment filter) or "pFC+devstage+treatment",
        e.g. MouseProtocol+Both+DroNc-seq_pFC+Adult+Saline
    @mousebrain_tgt: "region" or "region_inds" of the mouse brain target
    @celltype_gran: forwarded to the pFC and mouse brain loaders
    @curate: forwarded to every loader
    @result_dir: not used by this function
    @return: (train_adata, test_adata)
    @raise ValueError: if a reference spec has an unknown dataset name or is
        missing one of its two "+"-separated fields
    '''
    adata_list = []
    for dataset in datasets_ref.split('_'):
        infos = dataset.split('+')
        dataset_name = infos[0]
        if dataset_name not in ("MouseProtocol", "pFC"):
            ## without this check the previous iteration's adata would be
            ## appended a second time (or an unbound name would be hit)
            raise ValueError("Unknown reference dataset %r in %r; expected "
                    "'MouseProtocol' or 'pFC'" % (dataset_name, datasets_ref))
        if len(infos) < 3:
            raise ValueError("Reference dataset %r needs two '+'-separated "
                    "fields after its name" % dataset)

        if dataset_name == "MouseProtocol":
            exp, protocol = infos[1], infos[2]
            ## "Both" means no experiment filter, which the loader takes as exp=None
            adata = load_mouseprotocol_data.load_mouseprotocol_adata(data_dir,
                    exp=None if exp == "Both" else exp,
                    protocol=protocol, curate=curate)
        else:
            devstage, treatment = infos[1], infos[2]
            adata = load_mouseFC_data.load_FC_adata(data_dir, devstage=devstage,
                    treatment=treatment, celltype_gran=celltype_gran, curate=curate)
        adata_list.append(adata)

    ## keep only the obs columns shared by the datasets, then concatenate
    common_columns = ['barcode', 'nGene', 'nUMI', 'percent.mito', 'cell.type']
    for adata in adata_list:
        adata.obs = adata.obs[common_columns]
    train_adata = anndata.AnnData.concatenate(*adata_list, join='inner')
    del adata_list  ## release space

    ## get target individuals list
    tgt_list = mousebrain_tgt.split('_')
    region = tgt_list[0]
    tgt_inds = '_'.join(tgt_list[1:]) if len(tgt_list) > 1 else None

    test_adata = load_mousebrain_data.load_brain_adata(data_dir, region=region,
            ind=tgt_inds, celltype_gran=celltype_gran, curate=curate)
    return train_adata, test_adata
Esempio n. 7
0
def process_allenbrain_cross_data(data_dir, result_dir, dataset1, dataset2="Allen",
        sample_seed=0, celltype_gran=0, curate=True):
    '''Predict a held-out 20% of the Allen brain SS cells from a mouse FC or
    pFC individual.

    @dataset1: "FC" (trains on mouse brain FC individual P60FCRep1) or
        "pFC" (trains on Adult pFC individual PFCSample1)
    @dataset2: not used by this function
    @sample_seed: seed given to random.seed before drawing the 80% of cells
        that are left out of the test set
    @celltype_gran: forwarded to the mouse brain loader (FC only)
    @curate: whether to curate mouse brain cell types
    @result_dir: not used by this function
    @return: (train_adata, test_adata)
    @raise ValueError: if dataset1 is neither "FC" nor "pFC"
    '''
    ## validate before the expensive load; otherwise train_adata stays unbound
    if dataset1 not in ("FC", "pFC"):
        raise ValueError("Unsupported dataset1 %r; expected 'FC' or 'pFC'" % (dataset1,))

    ## 20% test cells from allen brain dataset
    adata = load_mouseAllen_data.load_mouseAllen_SS_data(data_dir, curate=curate)
    random.seed(sample_seed)
    train_cells = set(random.sample(adata.obs_names.tolist(), round(0.8*adata.shape[0])))
    ## keep obs_names order so the test set is reproducible for a fixed seed;
    ## a set difference would order cells by string hash, which varies per run
    ## (assumes obs_names are unique)
    test_cells = [cell for cell in adata.obs_names.tolist() if cell not in train_cells]
    test_adata = adata[test_cells]

    ## construct training dataset
    if dataset1 == "FC":
        train_adata = load_mousebrain_data.load_brain_adata(data_dir, region="FC",
            ind="P60FCRep1", celltype_gran=celltype_gran, curate=curate)
    else:
        train_adata = load_mouseFC_data.load_FC_adata(data_dir, devstage="Adult",
            ind="PFCSample1", curate=curate)
    return train_adata, test_adata
Esempio n. 8
0
def _split_FC_dataset_input(dataset_input):
    '''Split "name_ind1_ind2" into ("name", "ind1_ind2"); the second item is
    None when the string contains no underscore.'''
    tokens = dataset_input.split('_')
    inds = '_'.join(tokens[1:]) if len(tokens) > 1 else None
    return tokens[0], inds


def _load_FC_dataset_adata(data_dir, dataset_name, dataset_inds):
    '''Load one dataset, choosing the loader from the dataset name.

    A brain region name goes to load_brain_adata. A "stage" or
    "stage-treatment" name goes to load_FC_adata; when a treatment is given,
    the individual selection is not forwarded.

    @raise ValueError: if the name is neither a known region nor a known stage
    '''
    regions = ("FC", "HC", "CB", "STR", "SN", "PC")
    stages = ("Adult", "P21")

    if dataset_name in regions:
        return load_mousebrain_data.load_brain_adata(data_dir,
                region=dataset_name, ind=dataset_inds, curate=True)

    info = dataset_name.split('-')
    if info[0] in stages:
        if len(info) > 1:
            return load_mouseFC_data.load_FC_adata(data_dir, devstage=info[0],
                    treatment=info[1], curate=True)
        return load_mouseFC_data.load_FC_adata(data_dir, devstage=info[0],
                ind=dataset_inds, curate=True)

    raise ValueError("Unrecognised dataset %r; expected a region in %s or a "
            "stage in %s (optionally 'stage-treatment')"
            % (dataset_name, list(regions), list(stages)))


def process_mousebrain_FC_datasets(data_dir, result_dir, dataset1_input, dataset2_input,
        sample=False, sample_seed=0, celltype_gran=0):
    '''Use FC from one dataset to predict another

    @dataset1_input: training spec, "FC_inds" style (brain region) or
        "Adult_inds" / "Adult-treatment" style (pFC stage). If sample == True,
        the selected individuals only determine the number of cells to
        downsample to.
    @dataset2_input: test spec, same format; the two can be interchanged
    @sample: if True, take the cell count of the selected training data and
        draw that many cells at random. When the training name is exactly
        "FC" or "Adult" the cells are drawn from that dataset reloaded with
        ind=None; otherwise from the already selected cells.
    @sample_seed: seed given to random.seed before sampling
    @result_dir: not used by this function
    @celltype_gran: not used by this function
    @return: (train_adata, test_adata); all loads use curate=True
    @raise ValueError: if either dataset name is not recognised
    '''
    dataset1_name, dataset1_inds = _split_FC_dataset_input(dataset1_input)
    dataset2_name, dataset2_inds = _split_FC_dataset_input(dataset2_input)

    train_adata = _load_FC_dataset_adata(data_dir, dataset1_name, dataset1_inds)
    test_adata = _load_FC_dataset_adata(data_dir, dataset2_name, dataset2_inds)

    if sample:
        ## sample number to the given individual cell numbers
        avg_number = train_adata.shape[0]
        if dataset1_name == "FC":
            train_adata = load_mousebrain_data.load_brain_adata(data_dir,
                    region=dataset1_name, ind=None, curate=True)

        if dataset1_name == "Adult":
            train_adata = load_mouseFC_data.load_FC_adata(data_dir, dataset1_name,
                    ind=None, curate=True)

        random.seed(sample_seed)
        sampled_cells = random.sample(list(train_adata.obs_names), k=avg_number)
        train_adata = train_adata[sampled_cells]
    return train_adata, test_adata