def predict(cv):
    """Assign HAL clusters to the mouse scRNA-seq count matrix and save results.

    Loads the pre-trained HAL model, predicts a cluster for every cell in
    count.csv at confidence ``cv``, then writes per-cluster median expression
    and cluster frequencies to CSV and pickles everything to results.pkl.

    NOTE(review): this function is shadowed by a second ``def predict`` later
    in the file; rename one of them if both are meant to be callable.

    Parameters
    ----------
    cv : float
        Confidence threshold passed to HAL's cluster prediction.
    """
    count_path = '/home/ubuntu/data/scRNAseq/mice/count.csv'

    # First CSV column holds gene names; skip the header row.
    # Use a context manager so the file handle is closed (the original
    # open(...).readlines() leaked it).
    with open(count_path, 'r') as fh:
        col_names = [line.split(',')[0] for line in fh][1:]

    model = HAL(warm_start=True, n_cluster_init=50, clf_type='rf')
    model.load()
    ypossible = model.possible_clusters(cv)

    # Drop the header row and the gene-name column, then transpose so rows
    # are cells and columns are genes.
    X = np.genfromtxt(count_path, delimiter=',')
    X = X[1:, 1:].T

    ypred = model.predict(X, cv)

    # BUGFIX: col_names is a plain list of strings; the original indexed it
    # like a 2-D array (col_names[:, 1]), which raises TypeError.
    col_names_all = list(col_names)

    # Median expression of each gene within each possible cluster.
    df_median_expression = pd.DataFrame(
        np.array([np.median(X[ypred == yu], axis=0) for yu in ypossible]),
        index=list(ypossible),
        columns=col_names_all)

    # BUGFIX: the original used an undefined name ``f`` as the column label
    # and an undefined dict ``result`` — both NameErrors. Use the dataset
    # file name as the sample label and create the result dict here.
    label = 'count.csv'
    df_frequency = pd.DataFrame(
        [np.count_nonzero(ypred == yu) / len(ypred) for yu in ypossible],
        index=ypossible,
        columns=[label])

    df_frequency.to_csv('/home/ubuntu/data/scRNAseq/mice/Frequencies.csv')
    df_median_expression.to_csv(
        '/home/ubuntu/data/scRNAseq/mice/Median_expression.csv')

    result = {label: [ypred, df_median_expression, df_frequency]}
    with open('results.pkl', 'wb') as out:
        pickle.dump(result, out)
def predict(cv):
    """Assign HAL clusters to every .fcs file in the CyTOF pilot directory.

    For each FCS file: loads it, restricts to the channels listed in
    columns.txt, arcsinh-transforms, predicts clusters at confidence ``cv``,
    and accumulates per-file median expression and cluster frequencies.
    Writes Frequencies.csv / Median_expression.csv (overwritten each
    iteration, so they end up holding the LAST file's tables — preserved
    from the original) and pickles all per-file results to results.pkl.

    Parameters
    ----------
    cv : float
        Confidence threshold passed to HAL's cluster prediction.
    """
    data_dir = ('/Users/mukherjeer2/Documents/Data/CyTOF/'
                '20190209_B6_IdU_Pilot/Live_Single_Cells/')

    # Collect the FCS files to process (the original if/continue/else/continue
    # reduced to a simple filter).
    file_name_list = [fn for fn in os.listdir(data_dir)
                      if fn.endswith(".fcs")]

    # columns.txt: column 0 = channel id used to index the data,
    # column 1 = human-readable marker name used for output columns.
    col = np.loadtxt('columns.txt', delimiter='\t', dtype=str)

    result = {}
    model = HAL(warm_start=True, n_cluster_init=50)
    model.load()
    ypossible = model.possible_clusters(cv)

    for f in file_name_list:
        print(f)
        data = load(f)
        data = data[col[:, 0]]
        # arcsinh is the standard CyTOF variance-stabilizing transform.
        X = np.arcsinh(data)
        ypred = model.predict(X, cv)

        # BUGFIX: the original read ``col_names[:, 1]`` but ``col_names`` is
        # never defined in this function — the marker names live in ``col``.
        col_names_all = list(col[:, 1].flatten())

        # Median marker expression within each possible cluster.
        df_median_expression = pd.DataFrame(
            np.array([np.median(X[ypred == yu], axis=0) for yu in ypossible]),
            index=list(ypossible),
            columns=col_names_all)

        # Fraction of this file's cells assigned to each cluster.
        df_frequency = pd.DataFrame(
            [np.count_nonzero(ypred == yu) / len(ypred) for yu in ypossible],
            index=ypossible,
            columns=[f])

        df_frequency.to_csv(
            '/Users/mukherjeer2/Documents/Data/CyTOF/20190209_B6_IdU_Pilot/Live_Single_Cells/Frequencies.csv'
        )
        df_median_expression.to_csv(
            '/Users/mukherjeer2/Documents/Data/CyTOF/20190209_B6_IdU_Pilot/Live_Single_Cells/Median_expression.csv'
        )

        result[f] = [ypred, df_median_expression, df_frequency]

    with open('results.pkl', 'wb') as out:
        pickle.dump(result, out)
def analyze(cv):
    """Cluster-map the per-file cluster frequencies stored in results.pkl.

    Builds a samples x clusters frequency table from the pickled prediction
    results, draws a seaborn clustermap of arcsinh(frequency*1000), and
    pickles the table to frequencies.pkl.

    Parameters
    ----------
    cv : float
        Confidence threshold used to enumerate the possible clusters.
    """
    with open('results.pkl', 'rb') as fh:
        results = pickle.load(fh)

    model = HAL(warm_start=True, n_cluster_init=50)
    model.load()
    ypossible = model.possible_clusters(cv=cv)

    freq_rows = []
    sample_labels = []
    for fname, entry in results.items():
        # entry = [ypred, df_median_expression, df_frequency]
        freq_rows.append(entry[2].values.flatten())
        # Derive a short sample label: strip the trailing '_Singlets.fcs'
        # and keep everything after the second-to-last underscore.
        stem = fname[:fname.find('Singlets.fcs') - 1]
        sample_labels.append(stem[find_second_last(stem, '_') + 1:])

    # Column labels (cluster ids) are taken from a known reference file's
    # frequency DataFrame index.
    ref_key = 'c11_20190209_BoneMarrow_1-10_01_BM_1_Singlets.fcs'
    df = pd.DataFrame(np.array(freq_rows),
                      index=sample_labels,
                      columns=results[ref_key][2].index)

    grid = sns.clustermap(np.arcsinh(df * 1000).T,
                          xticklabels=df.index,
                          cbar_kws={'label': 'arcsinh(frequency*1000)'},
                          figsize=(15, 15))
    plt.setp(grid.ax_heatmap.xaxis.get_majorticklabels(),
             rotation=90, ha='right')
    plt.setp(grid.ax_heatmap.yaxis.get_majorticklabels(),
             rotation=0, ha='left')
    plt.show()

    with open('frequencies.pkl', 'wb') as out:
        pickle.dump(df, out)