Code example #1
import pickle

import numpy as np
import pandas as pd

from hal import HAL  # HAL-x clustering library (import path assumed)


def predict(cv):
    # Gene names sit in the first column of count.csv; skip the header row.
    col_names = [
        line.split(',')[0] for line in open(
            '/home/ubuntu/data/scRNAseq/mice/count.csv', 'r').readlines()
    ][1:]

    # Reload the previously fitted model and ask which clusters exist at this cv score.
    model = HAL(warm_start=True, n_cluster_init=50, clf_type='rf')
    model.load()
    ypossible = model.possible_clusters(cv)

    # Load the count matrix, drop the header row and the gene-name column,
    # and transpose so rows are cells and columns are genes.
    X = np.genfromtxt('/home/ubuntu/data/scRNAseq/mice/count.csv',
                      delimiter=',')
    X = X[1:, 1:].T

    ypred = model.predict(X, cv)

    # Per-cluster median expression (genes as columns) and cluster frequencies.
    df_median_expression = pd.DataFrame(
        np.array([np.median(X[ypred == yu], axis=0) for yu in ypossible]),
        index=list(ypossible),
        columns=col_names)
    df_frequency = pd.DataFrame(
        [np.count_nonzero(ypred == yu) / len(ypred) for yu in ypossible],
        index=ypossible,
        columns=['frequency'])

    df_frequency.to_csv('/home/ubuntu/data/scRNAseq/mice/Frequencies.csv')
    df_median_expression.to_csv(
        '/home/ubuntu/data/scRNAseq/mice/Median_expression.csv')

    # Keyed by input file name so the layout matches the multi-file variant below.
    result = {'count.csv': [ypred, df_median_expression, df_frequency]}
    pickle.dump(result, open('results.pkl', 'wb'))
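The function returns nothing; it writes the two CSVs and results.pkl as side effects. A minimal, hypothetical way to invoke it is sketched below; the value 0.9 is illustrative, and cv appears to be the cross-validation score threshold that HAL-x uses to choose the clustering granularity.

# Hypothetical invocation; 0.9 is an illustrative cv score, not from the original code.
if __name__ == '__main__':
    predict(0.9)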
Code example #2
import os
import pickle

import numpy as np
import pandas as pd

from hal import HAL  # HAL-x clustering library (import path assumed)

DATA_DIR = '/Users/mukherjeer2/Documents/Data/CyTOF/20190209_B6_IdU_Pilot/Live_Single_Cells/'


def predict(cv):
    # Collect every .fcs file in the data directory.
    file_name_list = [
        filename for filename in os.listdir(DATA_DIR)
        if filename.endswith('.fcs')
    ]

    # columns.txt maps channel names (column 0) to marker names (column 1).
    col = np.loadtxt('columns.txt', delimiter='\t', dtype=str)
    result = {}

    # Reload the previously fitted model and ask which clusters exist at this cv score.
    model = HAL(warm_start=True, n_cluster_init=50)
    model.load()
    ypossible = model.possible_clusters(cv)

    for f in file_name_list:
        print(f)
        # `load` reads an .fcs file into a pandas DataFrame; it is not defined
        # in this example (see the helper sketch after this example).
        data = load(f)

        # Keep only the channels listed in columns.txt, then arcsinh-transform.
        data = data[col[:, 0]]
        X = np.arcsinh(data)

        ypred = model.predict(X, cv)

        col_names_all = list(col[:, 1])

        # Per-cluster median expression and per-cluster frequencies for this file.
        df_median_expression = pd.DataFrame(
            np.array([np.median(X[ypred == yu], axis=0) for yu in ypossible]),
            index=list(ypossible),
            columns=col_names_all)

        df_frequency = pd.DataFrame(
            [np.count_nonzero(ypred == yu) / len(ypred) for yu in ypossible],
            index=ypossible,
            columns=[f])

        # Note: these two CSVs are overwritten on every iteration, so they end
        # up holding the results for the last file only.
        df_frequency.to_csv(DATA_DIR + 'Frequencies.csv')
        df_median_expression.to_csv(DATA_DIR + 'Median_expression.csv')

        result[f] = [ypred, df_median_expression, df_frequency]

    pickle.dump(result, open('results.pkl', 'wb'))
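The `load` helper used above is not defined in this example. A minimal sketch of what it might look like, assuming the fcsparser package is used to read the .fcs files; the function body and the reuse of DATA_DIR are assumptions, not part of the original code.

import fcsparser  # assumption: fcsparser is the library used to read .fcs files

DATA_DIR = '/Users/mukherjeer2/Documents/Data/CyTOF/20190209_B6_IdU_Pilot/Live_Single_Cells/'


def load(filename):
    # fcsparser.parse returns a (metadata, data) tuple; only the event data
    # (a pandas DataFrame of channel measurements) is needed here.
    meta, data = fcsparser.parse(DATA_DIR + filename, reformat_meta=True)
    return data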
Code example #3
    def defineModel(self, markers, score, output_dir=None, retrain=False):
        """ This method creates a training set and runs the clustering algorithm

        markers: a list of strings corresponding to columns in the input data frames
        score: score to use in new model
        arcsinh: whether to apply the arcsinh transformation prior to clustering (this is usually a good idea)
        scaling: z-score scaling (this is considered best practice)
        output_dir (default is None): output directory, None indicates the current working directory
        retrain (default is False): specifies whether to retrain with existing model or create a new one
        """
        assert (not retrain) or (
            self.model is not None
        )  # exception if retraining without existing model

        if output_dir is None:
            output_dir = os.getcwd()  # default to the current working directory

        # DataFrame of all samples; the first index level is the sample/file name.
        data = copy.copy(self.data)

        samples = list(set(data.index.get_level_values(0)))  # sample/file names
        n_cells = np.floor(self.n_tsne / len(samples))  # data points drawn per sample
        origins = []  # tracks which sample each selected point came from

        for ii, sample in enumerate(samples):  # iterate through samples
            sample_data = data[data.index.get_level_values(0) == sample]
            sample_size = int(np.min([
                n_cells, sample_data.shape[0]
            ]))  # number of points to select per sample
            random_choice = np.random.choice(sample_data.shape[0], sample_size)
            origins.extend(
                [sample] *
                len(random_choice))  # note where data points came from
            if ii == 0:
                data_samples = sample_data[markers].iloc[
                    random_choice, :].values  # start list of data points
            else:
                data_samples = np.concatenate([
                    data_samples,
                    sample_data[markers].iloc[random_choice, :].values
                ])
        '''
        for i, current_marker in enumerate(markers):
            print(current_marker)
            print(stats.entropy(np.arcsinh(data_samples[:, i])))
            plt.hist(np.arcsinh(data_samples[:, i]))
            plt.show()
        '''
        # determine whether the current experiment has been processed (with any CV score)
        redundant = False
        for file_name in os.listdir(output_dir + '/serialized'):
            match = re.fullmatch(r'model_0.*%s\.pkl' % self.name, file_name)
            if match is not None:
                redundant = True
                break
        # create new model if not retraining
        model_file = 'model_' + self.name + '.pkl'
        scaler_file = 'scaler_' + self.name + '.pkl'
        label_file = 'Labels_tSNE_' + str(score) + self.name + '.pkl'
        if (label_file
                in os.listdir(output_dir + '/serialized')) and not retrain:
            # re-run experiment with same CV score
            model = pickle.load(
                open(output_dir + '/serialized/' + model_file, 'rb'))
            self.scaler_obj = pickle.load(
                open(output_dir + '/serialized/' + scaler_file, 'rb'))
            tsne_frame = pickle.load(
                open(output_dir + '/serialized/' + label_file, 'rb'))
            labels_tSNE = tsne_frame['clusters']
            data_samples = tsne_frame.loc[:, markers].values
            output = tsne_frame
        else:
            if redundant and not retrain:
                # re-run experiment with different CV score
                model = pickle.load(
                    open(output_dir + '/serialized/' + model_file, 'rb'))
                data_samples = pickle.load(
                    open(
                        output_dir + '/serialized/tSNE_subset_' + self.name +
                        '.pkl', 'rb'))
                self.scaler_obj = pickle.load(
                    open(output_dir + '/serialized/' + scaler_file, 'rb'))
            else:
                # create HAL object and fit model to data (using only training data)
                try:
                    shutil.rmtree('./info_hal')  # remove old info_hal folder
                except FileNotFoundError:
                    pass
                model = HAL(clf_type=self.clf_type,
                            outlier_ratio=0.1,
                            late_exag=900,
                            alpha_late=2.0,
                            n_cluster_init=150,
                            warm_start=True)
            # apply arcsinh transformation (enabled by default)
            if self.arcsinh:
                transformed_samples = np.arcsinh(data_samples)
            else:
                transformed_samples = data_samples
            # apply feature scaling (MinMaxScaler; enabled by default)
            if self.scaling:
                if self.scaler_obj is None:
                    self.scaler_obj = MinMaxScaler()
                    scaled_data = self.scaler_obj.fit_transform(
                        transformed_samples)
                else:
                    scaled_data = self.scaler_obj.transform(
                        transformed_samples)
            else:
                scaled_data = transformed_samples  # do not use this option without a good reason!
            model.fit(scaled_data)
            pickle.dump(model,
                        open(output_dir + '/serialized/' + model_file, 'wb'))
            pickle.dump(self.scaler_obj,
                        open(output_dir + '/serialized/' + scaler_file, 'wb'))
            # create a frame with the clusters and samples for each data point
            labels_tSNE = model.predict(scaled_data, cv=score)
            output = pd.DataFrame(data_samples)
            output.columns = markers
            output["clusters"] = labels_tSNE
            output["origin"] = origins
            output = self.addTsne(output)
            output.to_csv(output_dir + '/Labels_tSNE_' + str(score) +
                          self.name + '.csv')
            pickle.dump(
                data_samples,
                open(
                    output_dir + '/serialized/tSNE_subset_' + self.name +
                    '.pkl', "wb"))
            pickle.dump(
                output,
                open(
                    output_dir + '/serialized/Labels_tSNE_' + str(score) +
                    self.name + '.pkl', "wb"))

        self.model = model
        labels_only = np.array(labels_tSNE)

        return labels_only, output  # do not return samples of origin with labels
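For orientation, a hypothetical call to defineModel is sketched below; the instance name `pipeline`, the marker names, and the score value are illustrative assumptions rather than anything taken from the original code.

# Hypothetical usage; `pipeline` stands for an instance of the class that owns defineModel.
# The call assumes output_dir/serialized already exists, since the method lists that folder.
markers = ['CD3', 'CD4', 'CD8', 'CD19']  # illustrative marker columns
labels, clustered = pipeline.defineModel(markers, score=0.85, output_dir='./results')

# `clustered` holds the sampled points with their cluster labels and sample of origin.
print(clustered.groupby('clusters').size())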