# Example no. 1 (score: 0) — header from the original source, kept as a comment
def predict(cv):
    """Cluster a single scRNA-seq count matrix and persist the summaries.

    Loads the mouse count CSV, predicts cluster labels at coarse-graining
    level ``cv`` with a pre-trained HAL model, then writes per-cluster
    median expression and cluster frequencies to CSV and pickles the result.

    NOTE(review): this function is shadowed by a second ``predict`` defined
    later in this file — only the later definition is reachable by name.

    Parameters
    ----------
    cv : coarse-graining level passed through to ``HAL.possible_clusters``
        and ``HAL.predict`` (semantics defined by the HAL package).
    """
    csv_path = '/home/ubuntu/data/scRNAseq/mice/count.csv'

    # First field of every data row holds the gene/feature name; [1:] skips
    # the header line.  Use a context manager so the handle is closed.
    with open(csv_path, 'r') as fh:
        col_names = [line.split(',')[0] for line in fh][1:]

    model = HAL(warm_start=True, n_cluster_init=50, clf_type='rf')
    model.load()
    ypossible = model.possible_clusters(cv)

    X = np.genfromtxt(csv_path, delimiter=',')
    # Drop the header row and the name column, then transpose so that
    # rows are cells and columns are genes (matching len(col_names)).
    X = X[1:, 1:].T

    ypred = model.predict(X, cv)

    # BUG FIX: the original did `col_names[:, 1].flatten()`, but `col_names`
    # is a plain Python list, so tuple indexing raised TypeError.  The gene
    # names are already usable as column labels.
    df_median_expression = pd.DataFrame(
        np.array([np.median(X[ypred == yu], axis=0) for yu in ypossible]),
        index=list(ypossible),
        columns=col_names)

    # BUG FIX: the original used an undefined `f` as the frequency column
    # label; this single-input variant labels it with the input file name.
    label = 'count.csv'
    df_frequency = pd.DataFrame(
        [np.count_nonzero(ypred == yu) / len(ypred) for yu in ypossible],
        index=ypossible,
        columns=[label])

    df_frequency.to_csv('/home/ubuntu/data/scRNAseq/mice/Frequencies.csv')
    df_median_expression.to_csv(
        '/home/ubuntu/data/scRNAseq/mice/Median_expression.csv')

    # BUG FIX: `result` was never defined in the original (NameError).
    result = {label: [ypred, df_median_expression, df_frequency]}
    with open('results.pkl', 'wb') as out:
        pickle.dump(result, out)
def predict(cv):
    """Cluster every .fcs file in the CyTOF pilot directory and save results.

    For each .fcs file: load it, keep the configured channels, arcsinh-
    transform, predict cluster labels at level ``cv`` with a pre-trained HAL
    model, and build per-cluster median-expression and frequency tables.
    All per-file results are pickled to ``results.pkl``.

    Parameters
    ----------
    cv : coarse-graining level passed through to ``HAL.possible_clusters``
        and ``HAL.predict`` (semantics defined by the HAL package).
    """
    data_dir = ('/Users/mukherjeer2/Documents/Data/CyTOF/'
                '20190209_B6_IdU_Pilot/Live_Single_Cells/')

    # Collect the .fcs files (comprehension replaces the append/continue loop).
    file_name_list = [fn for fn in os.listdir(data_dir) if fn.endswith(".fcs")]

    # columns.txt maps raw channel names (col[:, 0]) to display names
    # (col[:, 1]) — inferred from the two usages below; TODO confirm.
    col = np.loadtxt('columns.txt', delimiter='\t', dtype=str)
    result = {}

    model = HAL(warm_start=True, n_cluster_init=50)
    model.load()
    ypossible = model.possible_clusters(cv)

    # BUG FIX: the original referenced an undefined `col_names` inside the
    # loop; the display names live in the second column of `col`.  Also
    # hoisted out of the loop since it is loop-invariant.
    col_names_all = list(col[:, 1].flatten())

    for f in file_name_list:
        print(f)
        data = load(f)
        data = data[col[:, 0]]   # keep only the configured channels
        X = np.arcsinh(data)     # arcsinh transform of the raw counts

        ypred = model.predict(X, cv)

        df_median_expression = pd.DataFrame(
            np.array([np.median(X[ypred == yu], axis=0) for yu in ypossible]),
            index=list(ypossible),
            columns=col_names_all)

        df_frequency = pd.DataFrame(
            [np.count_nonzero(ypred == yu) / len(ypred) for yu in ypossible],
            index=ypossible,
            columns=[f])

        # NOTE(review): these CSVs are rewritten on every iteration, so only
        # the last file's tables survive on disk — confirm this is intended
        # (the full per-file data is kept in `result` below).
        df_frequency.to_csv(data_dir + 'Frequencies.csv')
        df_median_expression.to_csv(data_dir + 'Median_expression.csv')

        result[f] = [ypred, df_median_expression, df_frequency]

    # Close the pickle handle deterministically (original leaked it).
    with open('results.pkl', 'wb') as out:
        pickle.dump(result, out)
def analyze(cv):
    """Build and plot a clustermap of per-file cluster frequencies.

    Reads the per-file results pickled by ``predict`` (``results.pkl``),
    assembles a (file x cluster) frequency DataFrame, shows a seaborn
    clustermap of arcsinh(frequency * 1000), and pickles the DataFrame
    to ``frequencies.pkl``.

    Parameters
    ----------
    cv : coarse-graining level; forwarded to ``HAL.possible_clusters``.
    """
    with open('results.pkl', 'rb') as fh:
        results = pickle.load(fh)

    # NOTE(review): the model and `possible_clusters` result are not used
    # below; kept in case `model.load()` has required side effects — confirm.
    model = HAL(warm_start=True, n_cluster_init=50)
    model.load()
    model.possible_clusters(cv=cv)

    data = []
    idx = []
    for k, v in results.items():
        _, _, df_freq = v
        data.append(df_freq.values.flatten())
        # Row label: the substring between the second-to-last '_' and the
        # '_Singlets.fcs' suffix of the file name.
        stem = k[:k.find('Singlets.fcs') - 1]
        idx.append(stem[find_second_last(stem, '_') + 1:])

    # BUG FIX: the original indexed `results` with a hard-coded file name
    # ('c11_20190209_BoneMarrow_...'), which raises KeyError on any other
    # dataset.  Every frequency table shares the same cluster index, so take
    # it from an arbitrary entry instead.
    any_freq = next(iter(results.values()))[2]
    df = pd.DataFrame(np.array(data), index=idx, columns=any_freq.index)

    ax = sns.clustermap(np.arcsinh(df * 1000).T,
                        xticklabels=df.index,
                        cbar_kws={'label': 'arcsinh(frequency*1000)'},
                        figsize=(15, 15))
    plt.setp(ax.ax_heatmap.xaxis.get_majorticklabels(),
             rotation=90,
             ha='right')
    plt.setp(ax.ax_heatmap.yaxis.get_majorticklabels(), rotation=0, ha='left')

    plt.show()

    with open('frequencies.pkl', 'wb') as out:
        pickle.dump(df, out)