def predict(cv):
    """Cluster the mouse scRNA-seq count matrix with a saved HAL model and export summaries.

    The count matrix is read from ``count.csv``, its header row and label
    column are dropped, and it is transposed before being handed to
    ``model.predict``. For every cluster id returned by
    ``model.possible_clusters(cv)`` the per-column median and the fraction of
    predicted rows are computed and written to ``Median_expression.csv`` and
    ``Frequencies.csv`` next to the input file. The predictions and both
    tables are also pickled to ``results.pkl`` in the current working
    directory, keyed by the input file name.

    cv: forwarded unchanged to ``model.possible_clusters`` and
        ``model.predict`` (presumably the cross-validation score used to pick
        the clustering level -- confirm against the HAL documentation).

    Returns None; all results are written to disk.
    """
    data_dir = '/home/ubuntu/data/scRNAseq/mice/'
    sample_name = 'count.csv'
    count_path = data_dir + sample_name

    # The first field of every non-header row labels that row. Because the
    # matrix is transposed below, these labels name the columns of X.
    # Blank lines are skipped here because np.genfromtxt skips them as well.
    with open(count_path, 'r') as handle:
        col_names = [line.split(',')[0] for line in handle if line.strip()][1:]

    model = HAL(warm_start=True, n_cluster_init=50, clf_type='rf')
    model.load()
    ypossible = model.possible_clusters(cv)

    X = np.genfromtxt(count_path, delimiter=',')
    X = X[1:, 1:].T  # drop header row and label column, then transpose

    ypred = model.predict(X, cv)

    # col_names is a plain list, so it is used directly as the column labels
    # (2-D indexing such as col_names[:, 1] is not valid on a list).
    col_names_all = list(col_names)

    df_median_expression = pd.DataFrame(
        np.array([np.median(X[ypred == yu], axis=0) for yu in ypossible]),
        index=list(ypossible),
        columns=col_names_all)
    df_frequency = pd.DataFrame(
        [np.count_nonzero(ypred == yu) / len(ypred) for yu in ypossible],
        index=ypossible,
        columns=[sample_name])

    df_frequency.to_csv(data_dir + 'Frequencies.csv')
    df_median_expression.to_csv(data_dir + 'Median_expression.csv')

    # Only one input file is processed, so the result dictionary has a single
    # entry keyed by that file's name.
    result = {sample_name: [ypred, df_median_expression, df_frequency]}
    with open('results.pkl', 'wb') as handle:
        pickle.dump(result, handle)
def predict(cv):
    """Cluster every ``.fcs`` file of the CyTOF pilot run with a saved HAL model.

    ``columns.txt`` (tab separated, read from the current working directory)
    supplies two columns: column 0 selects the channels taken from each
    loaded file and column 1 supplies the labels used for the exported
    median-expression table. Each file is arcsinh-transformed and passed to
    ``model.predict``. For every cluster id returned by
    ``model.possible_clusters(cv)`` the per-channel median and the fraction
    of predicted events are computed.

    cv: forwarded unchanged to ``model.possible_clusters`` and
        ``model.predict`` (presumably the cross-validation score used to pick
        the clustering level -- confirm against the HAL documentation).

    Returns None. Per-file results are collected in a dictionary keyed by
    file name and pickled to ``results.pkl`` in the current working directory.
    """
    data_dir = '/Users/mukherjeer2/Documents/Data/CyTOF/20190209_B6_IdU_Pilot/Live_Single_Cells/'
    file_name_list = [
        filename for filename in os.listdir(data_dir)
        if filename.endswith(".fcs")
    ]

    col = np.loadtxt('columns.txt', delimiter='\t', dtype=str)
    # Display labels come from the second column of the table loaded above;
    # they do not depend on the file being processed, so build them once.
    col_names_all = list(col[:, 1].flatten())

    result = {}

    model = HAL(warm_start=True, n_cluster_init=50)
    model.load()
    ypossible = model.possible_clusters(cv)

    for f in file_name_list:
        print(f)

        # NOTE(review): `f` is a bare file name from os.listdir, so `load`
        # must resolve it itself (e.g. relative to the working directory) --
        # confirm that this is intended.
        data = load(f)
        data = data[col[:, 0]]
        X = np.arcsinh(data)

        ypred = model.predict(X, cv)

        df_median_expression = pd.DataFrame(
            np.array([np.median(X[ypred == yu], axis=0) for yu in ypossible]),
            index=list(ypossible),
            columns=col_names_all)
        df_frequency = pd.DataFrame(
            [np.count_nonzero(ypred == yu) / len(ypred) for yu in ypossible],
            index=ypossible,
            columns=[f])

        # NOTE(review): both CSV paths are the same for every file, so each
        # iteration overwrites the previous one and only the last file's
        # tables remain on disk. The complete per-file results are kept in
        # `result` and pickled below.
        df_frequency.to_csv(data_dir + 'Frequencies.csv')
        df_median_expression.to_csv(data_dir + 'Median_expression.csv')

        result[f] = [ypred, df_median_expression, df_frequency]

    with open('results.pkl', 'wb') as handle:
        pickle.dump(result, handle)
def defineModel(self, markers, score, output_dir=None, retrain=False):
    """
    Build a subsampled training set, cluster it with HAL and cache the results.

    An equal share of ``self.n_tsne`` rows is drawn at random from every
    sample in ``self.data`` (samples are the distinct values of level 0 of
    its index). Depending on what is found in ``<output_dir>/serialized`` the
    method then takes one of three paths:

    1. the label frame for this ``score`` and ``self.name`` exists and
       ``retrain`` is False: model, scaler and label frame are loaded from
       disk and returned without any fitting;
    2. a previously serialized model is detected and ``retrain`` is False:
       model, training subset and scaler are loaded from disk;
    3. otherwise: ``./info_hal`` is removed and a new HAL object is created.

    In cases 2 and 3 the samples are optionally arcsinh-transformed
    (``self.arcsinh``) and optionally scaled with a ``MinMaxScaler``
    (``self.scaling``), the model is fitted, labels are predicted with
    ``cv=score``, and the model, scaler, training subset and label frame are
    written to ``<output_dir>/serialized`` (the label frame also as CSV in
    ``output_dir``).

    markers: a list of strings corresponding to columns in the input data frames
    score: score to use in new model (passed to ``model.predict`` as ``cv``)
    output_dir (default is None): output directory, None indicates the current working directory
    retrain (default is False): when True, cached labels and models on disk are
        ignored and a new HAL object is fitted; ``self.model`` must already be set

    Returns a tuple ``(labels, output)``: the cluster labels as a numpy array
    and the data frame holding the marker values together with the
    "clusters" and "origin" columns (after ``self.addTsne`` in cases 2 and 3).

    Side effects: sets ``self.model`` and may set ``self.scaler_obj``.
    """
    assert (not retrain) or (self.model is not None), \
        "retrain=True requires an existing model"

    if output_dir is None:
        output_dir = os.getcwd()

    # Make sure the cache directory exists so that a fresh output directory
    # does not fail on the directory listing below.
    serialized_dir = os.path.join(output_dir, 'serialized')
    os.makedirs(serialized_dir, exist_ok=True)

    def _load_pickle(file_name):
        # Read one cached object, closing the file handle deterministically.
        # NOTE: unpickling runs arbitrary code; only use trusted cache directories.
        with open(os.path.join(serialized_dir, file_name), 'rb') as handle:
            return pickle.load(handle)

    def _dump_pickle(obj, file_name):
        # Write one object to the cache, closing (and flushing) the file handle.
        with open(os.path.join(serialized_dir, file_name), 'wb') as handle:
            pickle.dump(obj, handle)

    data = copy.copy(self.data)  # frame whose index level 0 identifies the sample
    sample_ids = data.index.get_level_values(0)
    samples = list(set(sample_ids))  # sample/file names
    # number of data points selected from each sample
    n_cells = np.floor(self.n_tsne / len(samples))

    origins = []  # tracks which sample each selected row came from
    selected = []  # one array of marker values per sample
    for sample in samples:
        sample_data = data[sample_ids == sample]
        sample_size = int(np.min([n_cells, sample_data.shape[0]]))
        # NOTE(review): np.random.choice samples with replacement by default,
        # so a row may be selected more than once -- confirm this is intended.
        random_choice = np.random.choice(sample_data.shape[0], sample_size)
        origins.extend([sample] * len(random_choice))
        selected.append(sample_data[markers].iloc[random_choice, :].values)
    data_samples = np.concatenate(selected)

    # Determine whether the current experiment has been processed (with any
    # CV score).
    # NOTE(review): the pattern requires a 'model_0' prefix, whereas this
    # method writes 'model_' + self.name + '.pkl' -- confirm which naming
    # scheme is intended.
    serialized_files = os.listdir(serialized_dir)
    model_pattern = r'model_0.*%s\.pkl' % self.name
    redundant = any(
        re.fullmatch(model_pattern, file_name) is not None
        for file_name in serialized_files)

    model_file = 'model_' + self.name + '.pkl'
    scaler_file = 'scaler_' + self.name + '.pkl'
    subset_file = 'tSNE_subset_' + self.name + '.pkl'
    label_file = 'Labels_tSNE_' + str(score) + self.name + '.pkl'

    if (label_file in serialized_files) and not retrain:
        # re-run experiment with same CV score: everything comes from the cache
        model = _load_pickle(model_file)
        self.scaler_obj = _load_pickle(scaler_file)
        tsne_frame = _load_pickle(label_file)
        labels_tSNE = tsne_frame['clusters']
        # Not used further in this branch; the lookup fails with a KeyError
        # if the cached frame lacks one of the requested markers.
        data_samples = tsne_frame.loc[:, markers].values
        output = tsne_frame
    else:
        if redundant and not retrain:
            # re-run experiment with different CV score
            model = _load_pickle(model_file)
            data_samples = _load_pickle(subset_file)
            self.scaler_obj = _load_pickle(scaler_file)
        else:
            # create HAL object and fit model to data (using only training data)
            try:
                shutil.rmtree('./info_hal')  # remove old info_hal folder
            except FileNotFoundError:
                pass
            model = HAL(clf_type=self.clf_type,
                        outlier_ratio=0.1,
                        late_exag=900,
                        alpha_late=2.0,
                        n_cluster_init=150,
                        warm_start=True)

        # apply arcsinh transformation when self.arcsinh is set
        if self.arcsinh:
            transformed_samples = np.arcsinh(data_samples)
        else:
            transformed_samples = data_samples

        # apply min-max scaling when self.scaling is set; an existing scaler
        # is reused as-is, otherwise a new MinMaxScaler is fitted here
        if self.scaling:
            if self.scaler_obj is None:
                self.scaler_obj = MinMaxScaler()
                scaled_data = self.scaler_obj.fit_transform(
                    transformed_samples)
            else:
                scaled_data = self.scaler_obj.transform(transformed_samples)
        else:
            # do not use this option without a good reason!
            scaled_data = transformed_samples

        model.fit(scaled_data)
        _dump_pickle(model, model_file)
        _dump_pickle(self.scaler_obj, scaler_file)

        # create a frame with the clusters and samples for each data point
        labels_tSNE = model.predict(scaled_data, cv=score)
        output = pd.DataFrame(data_samples)
        output.columns = markers
        output["clusters"] = labels_tSNE
        output["origin"] = origins
        output = self.addTsne(output)

        output.to_csv(
            os.path.join(output_dir,
                         'Labels_tSNE_' + str(score) + self.name + '.csv'))
        _dump_pickle(data_samples, subset_file)
        _dump_pickle(output, label_file)

    self.model = model
    labels_only = np.array(labels_tSNE)
    return labels_only, output  # do not return samples of origin with labels