def study_type(study):
    """Describe the cell types of a study as a '; '-joined string of names.

    For every experiment returned by ``v5.study_to_exps(study)`` the
    experiment's cell types and all ancestors of those cell types (via
    ``co.get_ancestors``) are collected.  When the study has a single
    experiment that list is used as-is; otherwise the terms of all
    experiments are merged with a set union.  The merged terms are reduced
    with ``co.get_terms_without_children`` and translated to names with
    ``co.get_term_name``.

    Args:
        study: study identifier accepted by ``v5.study_to_exps``.

    Returns:
        str: the term names joined by ``'; '``.  An empty string is returned
        when the study has no experiments (previously this raised
        ``UnboundLocalError``).
    """
    exps = v5.study_to_exps(study)
    if len(exps) == 0:
        return ''

    ancestor_dict = {}
    for exp in exps:
        celltypes = v5.exp_to_celltypes(exp)
        # Work on a copy: extending the returned list in place would modify
        # an object owned by the helper.
        terms = list(celltypes)
        for celltype in celltypes:
            terms += co.get_ancestors(celltype)
        ancestor_dict[exp] = terms

    if len(exps) == 1:
        merged = ancestor_dict[exps[0]]
    else:
        # NOTE(review): this is a union, although the result is called
        # "common"; an intersection may have been intended -- confirm with
        # the author before changing it.
        merged = list(set().union(*ancestor_dict.values()))

    names = [
        co.get_term_name(term_id)
        for term_id in co.get_terms_without_children(list(merged))
    ]
    return '; '.join(names)
def exp_to_celltypes(exp):
    """Return the specific cell-type ids recorded for an experiment.

    The ids stored in ``type_file[exp]`` are first reduced with
    ``co.get_terms_without_children``.  A fixed set of very general labels is
    then filtered out; if nothing is left after filtering, the unfiltered ids
    are returned instead (so the result may still contain general labels).
    """
    # PBMC, blood cell, somatic cell, animal cell, cultured cell:
    # an executive decision was made to exclude these cell types.
    too_general = (
        "CL:2000001",
        "CL:0000081",
        "CL:0002371",
        "CL:0000548",
        "CL:0000010",
    )
    leaf_ids = co.get_terms_without_children(type_file[exp])

    kept = []
    for term in leaf_ids:
        if term in too_general:
            continue
        kept.append(term)

    if kept:
        return kept
    # Only general labels were present: fall back to the original ids.
    return leaf_ids
# used 6 times
def create_reference(index_list):
    """Given a set of experiments to draw from, create a reference matrix.

    The cell types of the experiments at positions ``index_list`` of
    ``exp_acc`` are collected, de-duplicated and reduced to leaf terms with
    ``co.get_terms_without_children``.  For every leaf, the signatures of all
    experiments returned by ``celltype_to_exp(leaf)`` are fetched with
    ``get_signatures``; a single experiment is used as-is, several are
    averaged over axis 1.  The per-leaf profiles become the columns of the
    reference.

    Args:
        index_list: iterable of positions into the module-level ``exp_acc``.

    Returns:
        dict with keys ``"gene_ids"`` (module-level ``gene_ids``),
        ``"reference"`` (the stacked profiles; with exactly one leaf the
        un-stacked profile is returned, as before) and ``"cell_types"``
        (leaf names from ``co.get_term_name``).

    Raises:
        ValueError: if no leaf cell type is found (previously this surfaced
            as ``UnboundLocalError``).
    """
    cell_types = []
    for i in index_list:
        cell_types.extend(exp_to_celltypes(exp_acc[i]))

    # TEST
    print("Yes!" if 'CL:0000236' in cell_types else "No!")

    leaves = co.get_terms_without_children(list(set(cell_types)))
    print("Yes! Two" if 'CL:0000236' in leaves else "No! Two")

    if len(leaves) == 0:
        raise ValueError("no leaf cell types found for the given experiments")

    # Experiment positions per leaf.
    leaf_index = {
        leaf: [exp_to_index(exp) for exp in celltype_to_exp(leaf)]
        for leaf in leaves
    }

    # One profile per leaf.  They are collected first and stacked once at the
    # end: np.column_stack turns each 1-D profile into a column, so the result
    # has one column per leaf, and stacking once avoids re-copying the growing
    # matrix for every leaf.
    # assumes get_signatures returns genes along axis 0 -- TODO confirm
    profiles = []
    for leaf in leaves:
        signatures = get_signatures(leaf_index[leaf])
        if len(leaf_index[leaf]) == 1:
            profiles.append(signatures)
        else:
            profiles.append(np.mean(signatures, axis=1))

    if len(profiles) == 1:
        # Kept for backward compatibility: a single leaf is returned unstacked.
        reference = profiles[0]
    else:
        reference = np.column_stack(profiles)

    names = [co.get_term_name(leaf) for leaf in leaves]
    return {"gene_ids": gene_ids, "reference": reference, "cell_types": names}
def multi_core(cell_exp_count, studies, exp_acc, gene_ids, countspermillion,
               qualified_cell_type_name, cell_type_file, qualified_cell_type):
    """Write a noisy signature, a noise-free signature and a variance matrix.

    For every cell type in ``qualified_cell_type`` one experiment is drawn at
    random; its expression row forms the "noisy" signature and its study is
    held out.  The "noise-free" signature of the cell type is the mean
    expression of its experiments from all other studies.  Finally a per-gene
    variance (between-study variance plus the summed within-study terms) is
    computed per cell type from the non-held-out studies.

    Files written (``<n>`` is ``str(cell_exp_count)``):
        ~/IndependentStudy/Data/SignatureSimulation/<n>_signature.tsv
        ~/IndependentStudy/Data/SignatureSimulation/<n>_signature_noise.tsv
        ~/IndependentStudy/Data/Variance/<n>_variance.txt

    Args:
        cell_exp_count: value used as the prefix of the output file names.
        studies: study of each experiment, aligned with ``exp_acc``.
        exp_acc: experiment accessions, aligned with rows of
            ``countspermillion``.
        gene_ids: column labels of the signature tables.
        countspermillion: expression matrix indexed by experiment position.
        qualified_cell_type_name: row labels of the noise-free signature.
        cell_type_file: mapping experiment accession -> cell type ids.
        qualified_cell_type: cell type ids to process.

    Returns:
        None.

    Raises:
        ValueError: if a cell type has no experiment outside its held-out
            study.  Previously the matrix of the preceding cell type was
            silently reused in that case.
    """
    exp_acc_list = list(exp_acc)

    # ---- noisy reference: one randomly chosen experiment per cell type ----
    select_study_list = {}
    noisy_rows = []
    for cell_type in qualified_cell_type:
        select_sample = random.choice(V5.celltype_to_exp(cell_type))
        select_study_list[cell_type] = V5.exp_to_study(select_sample)
        noisy_rows.append(countspermillion[exp_acc_list.index(select_sample)])
    signature_noise_np = np.vstack(noisy_rows)

    # ---- noise-free reference: mean over experiments of the other studies --
    reference_noise_free = []
    for cell_type in qualified_cell_type:
        held_out_study = select_study_list[cell_type]
        other_rows = [
            countspermillion[exp_acc_list.index(exp)]
            for exp in V5.celltype_to_exp(cell_type)
            if V5.exp_to_study(exp) != held_out_study
        ]
        if not other_rows:
            raise ValueError(
                "cell type %s has no experiment outside study %s"
                % (cell_type, held_out_study))
        if len(other_rows) > 1:
            reference_noise_free.append(
                np.mean(np.vstack(other_rows), axis=0))
        else:
            reference_noise_free.append(other_rows[0])
    signature_np = np.array(reference_noise_free)

    # Transform to pandas
    signature_pd = pd.DataFrame(data=signature_np, columns=gene_ids,
                                index=qualified_cell_type_name)
    signature_noise_np_pd = pd.DataFrame(
        data=signature_noise_np, columns=gene_ids,
        index=[co.get_term_name(i) for i in qualified_cell_type])

    # Save the signature and noisy signature for future analysis
    signature_pd.to_csv('~/IndependentStudy/Data/SignatureSimulation/' +
                        str(cell_exp_count) + '_signature.tsv', sep='\t')
    signature_noise_np_pd.to_csv(
        '~/IndependentStudy/Data/SignatureSimulation/' +
        str(cell_exp_count) + '_signature_noise.tsv', sep='\t')

    # ---- variance data set ------------------------------------------------
    # Eliminate the redundant cell types in all experiments
    cell_type_specific_file = {
        exp: co.get_terms_without_children(terms)
        for exp, terms in cell_type_file.items()
    }
    # Experiment -> study lookup
    expstudyMap = dict(zip(exp_acc, studies))

    variance_matrix = []
    for cell_type in qualified_cell_type:
        held_out_study = select_study_list[cell_type]

        # Experiments of this cell type, grouped by study
        exps_by_study = {}
        for exp, leaves in cell_type_specific_file.items():
            if cell_type in leaves:
                exps_by_study.setdefault(expstudyMap[exp], []).append(exp)

        study_means = []
        within_study_var = []
        for study, members in exps_by_study.items():
            # Equality, not ``in``: on strings ``in`` is a substring test and
            # would also skip studies whose id is contained in the held-out
            # id.  This matches the ``!=`` used for the noise-free reference.
            if study == held_out_study:
                continue

            member_set = set(members)
            indices = [i for i, acc in enumerate(exp_acc)
                       if acc in member_set]
            signatures = get_signatures(indices, countspermillion)

            if len(members) == 1:
                # A single experiment is its own study mean and contributes
                # no within-study term.
                study_means.append(signatures)
                within_study_var.append(np.zeros(signatures.shape[0]))
                continue

            # assumes get_signatures returns genes x experiments -- TODO confirm
            study_mean = np.mean(signatures, axis=1)
            study_means.append(study_mean)
            # NOTE(review): this is the variance of the *absolute* residues,
            # kept as in the original -- confirm that this is intended.
            residues = np.array([
                np.abs(get_signatures([idx], countspermillion) - study_mean)
                for idx in indices
            ])
            within_study_var.append(np.var(residues, axis=0))

        # Between-study variance of the study means
        study_variance = np.var(np.array(study_means), axis=0)

        # We assume the variance sum law here
        total_variance = np.zeros(study_variance.shape[0]) + study_variance
        for within in within_study_var:
            total_variance = total_variance + within
        variance_matrix.append(total_variance)

    variance_matrix = np.array(variance_matrix)
    print(variance_matrix.shape)

    # np.savetxt does not expand '~', so expand it explicitly; this keeps the
    # variance file under the same home directory as the signature tables.
    variance_path = os.path.expanduser(
        '~/IndependentStudy/Data/Variance/' + str(cell_exp_count) +
        '_variance.txt')
    np.savetxt(variance_path, variance_matrix, delimiter="\t")
def build_model():
    """Fit a linear spline of log(std) against log(mean CPM + 1).

    Experiments whose only leaf cell type is ``'CL:2000001'`` are selected
    from ``cell_types.json``, keeping the first such experiment of every
    study listed in ``expression_CPM.h5``.  Their signatures (via
    ``get_signatures``) give a per-gene mean and standard deviation.  Genes
    with ``log(mean + 1) == 0`` or a non-finite ``log(std)`` are dropped, a
    scatter plot is drawn on a new matplotlib figure, one ``y`` value is
    drawn at random for every distinct ``x`` and a ``UnivariateSpline`` with
    ``k=1`` is fitted.

    Returns:
        scipy.interpolate.UnivariateSpline: the fitted spline.
    """
    # Load h5 data; the context manager closes the file even if a read fails.
    with h5py.File("expression_CPM.h5", 'r') as cpm:
        studies = np.array(cpm.get('study')).astype(str)
        exp_acc = np.array(cpm.get('experiment_accession')).astype(str)

    with open('cell_types.json', 'r') as type_file:
        cell_type_file = json.load(type_file)

    # Experiments whose leaf cell types are exactly ['CL:2000001']
    selected_exps = [
        exp for exp, terms in cell_type_file.items()
        if co.get_terms_without_children(terms) == ['CL:2000001']
    ]

    # Experiment -> study lookup
    expstudyMap = dict(zip(exp_acc, studies))

    # Each study only contributes its first selected experiment
    seen_studies = set()
    one_per_study = set()
    for exp in selected_exps:
        study = expstudyMap[exp]
        if study not in seen_studies:
            seen_studies.add(study)
            one_per_study.add(exp)

    # Transform to row positions and build the expression matrix
    specific_cell_exp_index = [
        i for i, acc in enumerate(exp_acc) if acc in one_per_study
    ]
    specific_cell_exp_signature = get_signatures(specific_cell_exp_index)

    # assumes get_signatures returns genes x experiments -- TODO confirm
    x = np.log(np.mean(specific_cell_exp_signature, axis=1) + 1)
    with np.errstate(divide='ignore'):
        # log(0) is -inf for genes with zero spread; filtered out below.
        y = np.log(np.std(specific_cell_exp_signature, axis=1))

    # Drop genes with zero mean, and NaN/Inf values that the spline fit
    # cannot handle.
    keep = (x != 0) & np.isfinite(y)
    x = x[keep]
    y = y[keep]

    _, ax1 = plt.subplots()
    ax1.scatter(x, y, alpha=0.6)
    ax1.set_title("Std-Mean Plot in Log Space")
    ax1.set_ylabel("log(STD)")
    ax1.set_xlabel("log(MeanCPM+1)")

    # Record all y values observed for every distinct x
    xyDict = {}
    for x_val, y_val in zip(x, y):
        xyDict.setdefault(x_val, []).append(y_val)

    # Randomly select one y for every expression level.  random.choice is
    # uniform; the former ``randint(0, n) - 1`` produced -1..n-1 and so picked
    # the last element twice as often.
    interpolatex = sorted(xyDict)
    interpolatey = [random.choice(xyDict[x_val]) for x_val in interpolatex]

    sp1 = UnivariateSpline(np.array(interpolatex), np.array(interpolatey),
                           k=1)
    return sp1
def cell_type_variance_model():
    """Fit a degree-5 spline of log(sqrt(variance) + 1) on log(mean + 1).

    For each of 48 hard-coded cell types the experiments carrying that leaf
    cell type (according to ``cell_types.json``) are grouped by study (from
    ``expression_CPM.h5``).  Per cell type the between-study variance of the
    study means and the within-study terms are added up.  The variances are
    then summed over cell types (``y``) and paired with the mean of all study
    means (``x``).  Genes with ``x == 0`` are dropped, one ``y`` is drawn at
    random per distinct ``x`` and a ``UnivariateSpline`` with ``k=5`` is
    fitted.

    Returns:
        scipy.interpolate.UnivariateSpline: the fitted spline.
    """
    # Load h5 data; the context manager closes the file even if a read fails.
    with h5py.File("expression_CPM.h5", 'r') as cpm:
        studies = np.array(cpm.get('study')).astype(str)
        exp_acc = np.array(cpm.get('experiment_accession')).astype(str)

    with open('cell_types.json', 'r') as type_file:
        cell_type_file = json.load(type_file)

    # Eliminate the redundant cell types in all experiments
    cell_type_specific_file = {
        exp: co.get_terms_without_children(terms)
        for exp, terms in cell_type_file.items()
    }

    # Experiment -> study lookup
    expstudyMap = dict(zip(exp_acc, studies))

    cell_types_selected = [
        'CL:1000274', 'CL:0002618', 'CL:0000501', 'CL:0000765', 'CL:2000001',
        'CL:0002341', 'CL:0000583', 'CL:0000127', 'CL:0002631', 'CL:0000936',
        'CL:0002327', 'CL:0000023', 'CL:0000216', 'CL:0000557', 'CL:0000018',
        'CL:0000905', 'CL:0000182', 'CL:0000895', 'CL:0000096', 'CL:0002340',
        'CL:0011001', 'CL:0000050', 'CL:0002633', 'CL:0000232', 'CL:0000019',
        'CL:0000792', 'CL:0002063', 'CL:0000836', 'CL:0000904', 'CL:0002399',
        'CL:0000233', 'CL:0002038', 'CL:0000788', 'CL:0000900', 'CL:0000670',
        'CL:0002057', 'CL:0000351', 'CL:0001069', 'CL:0000091', 'CL:0000359',
        'CL:0010004', 'CL:0000171', 'CL:0000169', 'CL:0000017', 'CL:0000623',
        'CL:0001057', 'CL:0002394', 'CL:0000129'
    ]

    variance_matrix = []
    cell_types_48 = []  # study means of every cell type, in processing order
    for cell_co in cell_types_selected:
        # Experiments of this cell type, grouped by study
        tmp_exp_study = {}
        for exp, leaves in cell_type_specific_file.items():
            if cell_co in leaves:
                tmp_exp_study.setdefault(expstudyMap[exp], []).append(exp)

        tmp_mean = []
        within_study_var = []
        for members in tmp_exp_study.values():
            member_set = set(members)
            specific_cell_exp_index = [
                i for i, acc in enumerate(exp_acc) if acc in member_set
            ]
            specific_cell_exp_signature = get_signatures(
                specific_cell_exp_index)

            if len(members) == 1:
                # A single experiment is its own study mean and contributes
                # no within-study term.
                tmp_mean.append(specific_cell_exp_signature)
                within_study_var.append(
                    np.zeros(specific_cell_exp_signature.shape[0]))
                continue

            # assumes get_signatures returns genes x experiments -- TODO confirm
            study_mean = np.mean(specific_cell_exp_signature, axis=1)
            tmp_mean.append(study_mean)
            # NOTE(review): variance of the *absolute* residues, kept as in
            # the original -- confirm that this is intended.
            tmp_residue_list = np.array([
                np.abs(get_signatures([index]) - study_mean)
                for index in specific_cell_exp_index
            ])
            within_study_var.append(np.var(tmp_residue_list, axis=0))

        cell_types_48 += tmp_mean

        # Between-study variance of the study means
        study_variance = np.var(np.array(tmp_mean), axis=0)

        # We assume the variance sum law here
        total_variance = np.zeros(study_variance.shape[0]) + study_variance
        for within in within_study_var:
            total_variance = total_variance + within
        variance_matrix.append(total_variance)

    variance_matrix = np.array(variance_matrix)
    variance_matrix_sum = np.sum(variance_matrix, axis=0)

    x = np.log(np.mean(cell_types_48, axis=0) + 1)
    y = np.log(np.sqrt(variance_matrix_sum) + 1)

    # Drop genes whose log mean is exactly zero
    keep = x != 0
    x = x[keep]
    y = y[keep]

    # Record all y values observed for every distinct x
    xyDict = {}
    for x_val, y_val in zip(x, y):
        xyDict.setdefault(x_val, []).append(y_val)

    # Randomly select one y for every expression level.  random.choice is
    # uniform; the former ``randint(0, n) - 1`` produced -1..n-1 and so picked
    # the last element twice as often.
    interpolatex = sorted(xyDict)
    interpolatey = [random.choice(xyDict[x_val]) for x_val in interpolatex]

    sp1 = UnivariateSpline(np.array(interpolatex), np.array(interpolatey),
                           k=5)
    return sp1
def cell_type_variance():
    """Return the per-gene variance of 48 hard-coded cell types.

    For each cell type the experiments carrying that leaf cell type
    (according to ``cell_types.json``) are grouped by study (from
    ``expression_CPM.h5``).  The result row of a cell type is the
    between-study variance of the study means plus the within-study term of
    every study.

    Returns:
        numpy.ndarray: one row per cell type, in the order of the hard-coded
        list.
    """
    # Load h5 data; the context manager closes the file even if a read fails.
    with h5py.File("expression_CPM.h5", 'r') as cpm:
        studies = np.array(cpm.get('study')).astype(str)
        exp_acc = np.array(cpm.get('experiment_accession')).astype(str)

    with open('cell_types.json', 'r') as type_file:
        cell_type_file = json.load(type_file)

    # Eliminate the redundant cell types in all experiments
    cell_type_specific_file = {
        exp: co.get_terms_without_children(terms)
        for exp, terms in cell_type_file.items()
    }

    # Experiment -> study lookup
    expstudyMap = dict(zip(exp_acc, studies))

    cell_types_selected = [
        'CL:1000274', 'CL:0002618', 'CL:0000501', 'CL:0000765', 'CL:2000001',
        'CL:0002341', 'CL:0000583', 'CL:0000127', 'CL:0002631', 'CL:0000936',
        'CL:0002327', 'CL:0000023', 'CL:0000216', 'CL:0000557', 'CL:0000018',
        'CL:0000905', 'CL:0000182', 'CL:0000895', 'CL:0000096', 'CL:0002340',
        'CL:0011001', 'CL:0000050', 'CL:0002633', 'CL:0000232', 'CL:0000019',
        'CL:0000792', 'CL:0002063', 'CL:0000836', 'CL:0000904', 'CL:0002399',
        'CL:0000233', 'CL:0002038', 'CL:0000788', 'CL:0000900', 'CL:0000670',
        'CL:0002057', 'CL:0000351', 'CL:0001069', 'CL:0000091', 'CL:0000359',
        'CL:0010004', 'CL:0000171', 'CL:0000169', 'CL:0000017', 'CL:0000623',
        'CL:0001057', 'CL:0002394', 'CL:0000129'
    ]

    variance_matrix = []
    for cell_co in cell_types_selected:
        # Experiments of this cell type, grouped by study
        tmp_exp_study = {}
        for exp, leaves in cell_type_specific_file.items():
            if cell_co in leaves:
                tmp_exp_study.setdefault(expstudyMap[exp], []).append(exp)

        tmp_mean = []
        within_study_var = []
        for members in tmp_exp_study.values():
            member_set = set(members)
            specific_cell_exp_index = [
                i for i, acc in enumerate(exp_acc) if acc in member_set
            ]
            specific_cell_exp_signature = get_signatures(
                specific_cell_exp_index)

            if len(members) == 1:
                # A single experiment is its own study mean and contributes
                # no within-study term.
                tmp_mean.append(specific_cell_exp_signature)
                within_study_var.append(
                    np.zeros(specific_cell_exp_signature.shape[0]))
                continue

            # assumes get_signatures returns genes x experiments -- TODO confirm
            study_mean = np.mean(specific_cell_exp_signature, axis=1)
            tmp_mean.append(study_mean)
            # NOTE(review): variance of the *absolute* residues, kept as in
            # the original -- confirm that this is intended.
            tmp_residue_list = np.array([
                np.abs(get_signatures([index]) - study_mean)
                for index in specific_cell_exp_index
            ])
            within_study_var.append(np.var(tmp_residue_list, axis=0))

        # Between-study variance of the study means
        study_variance = np.var(np.array(tmp_mean), axis=0)

        # We assume the variance sum law here
        total_variance = np.zeros(study_variance.shape[0]) + study_variance
        for within in within_study_var:
            total_variance = total_variance + within
        variance_matrix.append(total_variance)

    variance_matrix = np.array(variance_matrix)
    return variance_matrix
# Load h5 data.  The context manager closes the file even when one of the
# reads raises; ``cpm`` stays bound to the (closed) file object afterwards.
with h5py.File("expression_CPM.h5", 'r') as cpm:
    studies = np.array(cpm.get('study')).astype(str)
    exp_acc = np.array(cpm.get('experiment_accession')).astype(str)
    gene_ids = np.array(cpm.get('gene')).astype(str)
    countspermillion = np.array(cpm.get('cpm'))

with open('cell_types.json', 'r') as type_file:
    cell_type_file = json.load(type_file)

# Eliminate the redundant cell types in all experiments
cell_type_specific_file = {}
for i in cell_type_file:
    cell_type_specific_file[i] = co.get_terms_without_children(
        cell_type_file[i])

# Experiments whose only leaf cell type is 'CL:2000001'
cellExpDict = {}
for i in cell_type_specific_file:
    if cell_type_specific_file[i] == ['CL:2000001']:
        cellExpDict[i] = ['CL:2000001']

# Build the lookups: study -> experiments and experiment -> study
studyexpMap = {}
expstudyMap = {}
for i in range(len(exp_acc)):
    expstudyMap[exp_acc[i]] = studies[i]
    studyexpMap.setdefault(studies[i], []).append(exp_acc[i])