import numpy as np

from hal import HAL


def main():
    # load the Tabula Muris FACS marrow count matrix
    X = np.genfromtxt(
        '/home/ubuntu/data/scRNAseq/TabulaMuris/FACS/Marrow-counts.csv',
        delimiter=',')
    # drop the label row and column, then transpose so rows are cells
    X = X[1:, 1:].T
    model = HAL(n_cluster_init=50, clf_type='rf')
    model.fit(X)
def main():
    data = load()  # load the input data frame (load() is defined elsewhere)
    # np.savetxt('columns.txt', data.columns.values, fmt='%s')
    col = np.loadtxt('columns.txt', delimiter='\t', dtype=str)
    data = data[col[:, 0]]  # keep only the columns listed in columns.txt
    X = np.arcsinh(data)  # variance-stabilizing transform before clustering
    model = HAL(n_cluster_init=50)
    model.fit(X)
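# Neither example script above includes an entry point; a standard guard
# (an addition, not part of the original snippets) would be appended to each:

if __name__ == '__main__':
    main()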
import copy
import os
import pickle
import re
import shutil

import numpy as np
import pandas as pd
from hal import HAL
from sklearn.preprocessing import MinMaxScaler


def defineModel(self, markers, score, output_dir=None, retrain=False):
    """
    Create a training set and run the clustering algorithm.

    markers: list of strings corresponding to columns in the input data frames
    score: cross-validation score to use for the new model
    arcsinh: whether to apply the arcsinh transformation prior to clustering
        (usually a good idea)
    scaling: whether to scale the features prior to clustering
        (considered best practice)
    output_dir (default None): output directory; None means the current
        working directory
    retrain (default False): whether to retrain an existing model instead of
        creating a new one
    """
    # retraining requires an existing model
    assert (not retrain) or (self.model is not None)

    if output_dir is None:
        output_dir = os.getcwd()  # default to the current working directory
    # make sure the folder for serialized objects exists
    os.makedirs(output_dir + '/serialized', exist_ok=True)

    data = copy.copy(self.data)  # dictionary of samples (file names are keys)

    # number of data points to select from each sample
    n_cells = np.floor(self.n_tsne / len(set(data.index.get_level_values(0))))

    samples = list(set(data.index.get_level_values(0)))  # sample/file names
    origins = []  # tracks which sample each selected point came from

    for ii, sample in enumerate(samples):  # iterate through the samples
        sample_data = data[data.index.get_level_values(0) == sample]
        # number of points to select from this sample
        sample_size = int(np.min([n_cells, sample_data.shape[0]]))
        # note: np.random.choice samples with replacement by default
        random_choice = np.random.choice(sample_data.shape[0], sample_size)
        origins.extend([sample] * len(random_choice))  # record point origins
        if ii == 0:
            # start the array of selected data points
            data_samples = sample_data[markers].iloc[random_choice, :].values
        else:
            data_samples = np.concatenate([
                data_samples,
                sample_data[markers].iloc[random_choice, :].values
            ])

    '''
    for i, current_marker in enumerate(markers):
        print(current_marker)
        print(stats.entropy(np.arcsinh(data_samples[:, i])))
        plt.hist(np.arcsinh(data_samples[:, i]))
        plt.show()
    '''

    # determine whether the current experiment has already been processed
    # (with any CV score)
    redundant = False
    for file_name in os.listdir(output_dir + '/serialized'):
        if re.fullmatch(r'model_.*%s\.pkl' % self.name, file_name):
            redundant = True
            break

    model_file = 'model_' + self.name + '.pkl'
    scaler_file = 'scaler_' + self.name + '.pkl'
    label_file = 'Labels_tSNE_' + str(score) + self.name + '.pkl'

    if (label_file in os.listdir(output_dir + '/serialized')) and not retrain:
        # re-run of an experiment with the same CV score: load everything
        model = pickle.load(
            open(output_dir + '/serialized/' + model_file, 'rb'))
        self.scaler_obj = pickle.load(
            open(output_dir + '/serialized/' + scaler_file, 'rb'))
        tsne_frame = pickle.load(
            open(output_dir + '/serialized/' + label_file, 'rb'))
        labels_tSNE = tsne_frame['clusters']
        data_samples = tsne_frame.loc[:, markers].values
        output = tsne_frame
    else:
        if redundant and not retrain:
            # re-run of an experiment with a different CV score: reuse the
            # fitted model, the sampled subset and the scaler
            model = pickle.load(
                open(output_dir + '/serialized/' + model_file, 'rb'))
            data_samples = pickle.load(
                open(output_dir + '/serialized/tSNE_subset_' + self.name + '.pkl', 'rb'))
            self.scaler_obj = pickle.load(
                open(output_dir + '/serialized/' + scaler_file, 'rb'))
        else:
            # create a HAL object and fit the model to the training data
            try:
                shutil.rmtree('./info_hal')  # remove the old info_hal folder
            except FileNotFoundError:
                pass
            model = HAL(clf_type=self.clf_type,
                        outlier_ratio=0.1,
                        late_exag=900,
                        alpha_late=2.0,
                        n_cluster_init=150,
                        warm_start=True)

        # apply the arcsinh transformation (enabled by default)
        if self.arcsinh:
            transformed_samples = np.arcsinh(data_samples)
        else:
            transformed_samples = data_samples

        # apply feature scaling (enabled by default); the scaler is fitted
        # once and reused on later calls
        if self.scaling:
            if self.scaler_obj is None:
                self.scaler_obj = MinMaxScaler()
                scaled_data = self.scaler_obj.fit_transform(transformed_samples)
            else:
                scaled_data = self.scaler_obj.transform(transformed_samples)
        else:
            # do not disable scaling without a good reason!
            scaled_data = transformed_samples

        model.fit(scaled_data)
        pickle.dump(model,
                    open(output_dir + '/serialized/' + model_file, 'wb'))
        pickle.dump(self.scaler_obj,
                    open(output_dir + '/serialized/' + scaler_file, 'wb'))

        # build a frame with the cluster and sample of origin of each data point
        labels_tSNE = model.predict(scaled_data, cv=score)
        output = pd.DataFrame(data_samples)
        output.columns = markers
        output["clusters"] = labels_tSNE
        output["origin"] = origins
        output = self.addTsne(output)
        output.to_csv(output_dir + '/Labels_tSNE_' + str(score) + self.name + '.csv')
        pickle.dump(
            data_samples,
            open(output_dir + '/serialized/tSNE_subset_' + self.name + '.pkl', 'wb'))
        pickle.dump(
            output,
            open(output_dir + '/serialized/Labels_tSNE_' + str(score) + self.name + '.pkl', 'wb'))

    self.model = model
    labels_only = np.array(labels_tSNE)
    return labels_only, output  # the labels are returned without the samples of origin
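# Usage sketch. The class that owns defineModel (its name, constructor and the
# attributes self.data, self.n_tsne, self.name, self.clf_type, self.arcsinh and
# self.scaling) is not shown in this section, so the class name `Pipeline`, its
# constructor arguments and the marker names below are hypothetical placeholders,
# not the actual API:
#
#     pipeline = Pipeline(...)                     # hypothetical constructor
#     markers = ['CD3', 'CD4', 'CD8', 'CD19']      # hypothetical marker columns
#     labels, frame = pipeline.defineModel(markers, score=0.8)
#     labels, frame = pipeline.defineModel(markers, score=0.9)
#
# The second call with a different CV score passes the `redundant` check, so it
# reloads the pickled model, scaler and sampled subset instead of re-drawing the
# data, and writes a new Labels_tSNE CSV/pickle for that score.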