import numpy as np
from hal import HAL  # HAL clustering (hal-x package; import path assumed)


def main():
    # Tabula Muris FACS marrow count matrix (genes x cells, with a header row and label column)
    X = np.genfromtxt(
        '/home/ubuntu/data/scRNAseq/TabulaMuris/FACS/Marrow-counts.csv',
        delimiter=',')
    X = X[1:, 1:].T  # drop the header row and label column, transpose so rows are cells
    model = HAL(n_cluster_init=50, clf_type='rf')  # random-forest classifier backend
    model.fit(X)
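    # After fitting, per-cell cluster labels can be pulled out at a chosen
    # cross-validation purity threshold, mirroring the model.predict(scaled_data, cv=score)
    # call in defineModel further below; the 0.9 threshold here is only illustrative.
    labels = model.predict(X, cv=0.9)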
Example #2
def main():
    # assumes the same imports as the first example (numpy as np, HAL from the hal-x package);
    # load() is defined elsewhere in this project and returns a pandas DataFrame
    data = load()
    # np.savetxt('columns.txt', data.columns.values, fmt='%s')
    col = np.loadtxt('columns.txt', delimiter='\t', dtype=str)

    data = data[col[:, 0]]  # keep only the columns listed in columns.txt

    X = np.arcsinh(data)  # variance-stabilizing transform for count-like data

    model = HAL(n_cluster_init=50)

    model.fit(X)
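
The load() helper used above is not shown in this snippet. A minimal sketch of what such a loader might look like, assuming the per-cell marker measurements live in a single CSV file (the path and layout are hypothetical):

import pandas as pd


def load():
    # Hypothetical loader: read per-cell marker measurements into a DataFrame.
    # The file path and column structure are assumptions, not part of the original code.
    return pd.read_csv('/path/to/marker_measurements.csv', index_col=0)
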
    def defineModel(self, markers, score, output_dir=None, retrain=False):
        """ This method creates a training set and runs the clustering algorithm

        markers: a list of strings corresponding to columns in the input data frames
        score: score to use in new model
        arcsinh: whether to apply the arcsinh transformation prior to clustering (this is usually a good idea)
        scaling: z-score scaling (this is considered best practice)
        output_dir (default is None): output directory, None indicates the current working directory
        retrain (default is False): specifies whether to retrain with existing model or create a new one
        """
        assert (not retrain) or (self.model is not None)  # cannot retrain without an existing model

        if output_dir is None:
            output_dir = os.getcwd()  # default to the current working directory

        data = copy.copy(self.data)  # data frame of all samples (file names in index level 0)

        samples = list(set(data.index.get_level_values(0)))  # sample/file names
        n_cells = np.floor(self.n_tsne / len(samples))  # number of data points selected from each sample
        origins = []  # tracks which sample each selected point came from

        for ii, sample in enumerate(samples):  # iterate through samples
            sample_data = data[data.index.get_level_values(0) == sample]
            sample_size = int(np.min([n_cells, sample_data.shape[0]]))  # number of points to select from this sample
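            # note: np.random.choice samples with replacement by default, so duplicate rows
            # are possible; pass replace=False if each cell should be drawn at most once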
            random_choice = np.random.choice(sample_data.shape[0], sample_size)
            origins.extend([sample] * len(random_choice))  # note where these points came from
            if ii == 0:
                data_samples = sample_data[markers].iloc[random_choice, :].values  # start the array of selected points
            else:
                data_samples = np.concatenate([
                    data_samples,
                    sample_data[markers].iloc[random_choice, :].values
                ])
        '''
        for i, current_marker in enumerate(markers):
            print(current_marker)
            print(stats.entropy(np.arcsinh(data_samples[:, i])))
            plt.hist(np.arcsinh(data_samples[:, i]))
            plt.show()
        '''
        # determine whether the current experiment has been processed (with any CV score)
        redundant = False
        for file_name in os.listdir(output_dir + '/serialized'):
            match = re.fullmatch(r'model_0.*%s\.pkl' % self.name, file_name)
            if match is not None:
                redundant = True
                break
        # create new model if not retraining
        model_file = 'model_' + self.name + '.pkl'
        scaler_file = 'scaler_' + self.name + '.pkl'
        label_file = 'Labels_tSNE_' + str(score) + self.name + '.pkl'
        if (label_file
                in os.listdir(output_dir + '/serialized')) and not retrain:
            # re-run experiment with same CV score
            model = pickle.load(
                open(output_dir + '/serialized/' + model_file, 'rb'))
            self.scaler_obj = pickle.load(
                open(output_dir + '/serialized/' + scaler_file, 'rb'))
            tsne_frame = pickle.load(
                open(output_dir + '/serialized/' + label_file, 'rb'))
            labels_tSNE = tsne_frame['clusters']
            data_samples = tsne_frame.loc[:, markers].values
            output = tsne_frame
        else:
            if redundant and not retrain:
                # re-run experiment with different CV score
                model = pickle.load(
                    open(output_dir + '/serialized/' + model_file, 'rb'))
                data_samples = pickle.load(
                    open(
                        output_dir + '/serialized/tSNE_subset_' + self.name +
                        '.pkl', 'rb'))
                self.scaler_obj = pickle.load(
                    open(output_dir + '/serialized/' + scaler_file, 'rb'))
            else:
                # create HAL object and fit model to data (using only training data)
                try:
                    shutil.rmtree('./info_hal')  # remove old info_hal folder
                except FileNotFoundError:
                    pass
                model = HAL(clf_type=self.clf_type,
                            outlier_ratio=0.1,
                            late_exag=900,
                            alpha_late=2.0,
                            n_cluster_init=150,
                            warm_start=True)
            # apply arcsinh transformation (enabled by default)
            if self.arcsinh:
                transformed_samples = np.arcsinh(data_samples)
            else:
                transformed_samples = data_samples
            # apply feature scaling with a MinMaxScaler (enabled by default)
            if self.scaling:
                if self.scaler_obj is None:
                    self.scaler_obj = MinMaxScaler()
                    scaled_data = self.scaler_obj.fit_transform(
                        transformed_samples)
                else:
                    scaled_data = self.scaler_obj.transform(
                        transformed_samples)
            else:
                scaled_data = transformed_samples  # do not use this option without a good reason!
            model.fit(scaled_data)
            pickle.dump(model,
                        open(output_dir + '/serialized/' + model_file, 'wb'))
            pickle.dump(self.scaler_obj,
                        open(output_dir + '/serialized/' + scaler_file, 'wb'))
            # create a frame with the clusters and samples for each data point
            labels_tSNE = model.predict(scaled_data, cv=score)
            output = pd.DataFrame(data_samples)
            output.columns = markers
            output["clusters"] = labels_tSNE
            output["origin"] = origins
            output = self.addTsne(output)
            output.to_csv(output_dir + '/Labels_tSNE_' + str(score) +
                          self.name + '.csv')
            pickle.dump(
                data_samples,
                open(
                    output_dir + '/serialized/tSNE_subset_' + self.name +
                    '.pkl', "wb"))
            pickle.dump(
                output,
                open(
                    output_dir + '/serialized/Labels_tSNE_' + str(score) +
                    self.name + '.pkl', "wb"))

        self.model = model
        labels_only = np.array(labels_tSNE)

        return labels_only, output  # do not return samples of origin with labels
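
For context, a hedged sketch of how defineModel might be called. The wrapper class name, its constructor arguments, the marker names, and the file names below are hypothetical; only the call signature and return values (cluster labels plus the annotated frame) come from the method above.

# Hypothetical usage -- `ClusterPipeline` and its constructor are assumptions, not part of the excerpt.
pipeline = ClusterPipeline(files=['sample1.csv', 'sample2.csv'], clf_type='rf')
labels, clustered = pipeline.defineModel(markers=['CD3', 'CD19', 'CD56'],
                                         score=0.8,
                                         output_dir='./results')
print(clustered[['clusters', 'origin']].head())  # cluster label and sample of origin per cell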