Python filter_RNASeqの例、topslam.filtering.filter_RNASeq Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_pseudotime.py プロジェクト: mzwiessele/topslam

    def testRNASeq(self):
        """Run filter_RNASeq on simulated data, with and without the log1p transform."""
        from topslam.simulation.simulate_trajectory import rnaseq_simulation

        n_dims = 30
        # Only the second element (the sampling callable) is used below; the
        # six-way unpacking is kept so a changed return arity still fails here.
        _Xsim, draw_sample, _t, _c, _labels, _seed = rnaseq_simulation(
            n_dims, 4, 2, 1234, split_prob=.5
        )

        # Seed right before sampling so the drawn data set is reproducible.
        np.random.seed(42)
        Y, d = draw_sample()

        from topslam.filtering import filter_RNASeq
        import pandas as pd

        def exp_frame():
            # A fresh frame per call: the two filter runs never share a buffer.
            return pd.DataFrame(np.exp(Y))

        # Default settings: every filtered value must be strictly positive.
        filtered = filter_RNASeq(exp_frame())
        np.testing.assert_array_less(-filtered.values, 0)

        # Without the log1p transform.
        filtered = filter_RNASeq(exp_frame(), transform_log1p=False)
        # The entries of Y selected by d must average to zero.
        np.testing.assert_allclose(-Y[d].mean(), 0)
        # The filtered data must contain strictly fewer zeros than Y.
        zeros_in_filtered = (filtered.values == 0).sum()
        zeros_in_raw = (Y == 0).sum()
        np.testing.assert_array_less(zeros_in_filtered, zeros_in_raw)

コード例 #2

0

ファイルを表示

ファイル: test_pseudotime.py プロジェクト: kant/topslam

    def testRNASeq(self):
        """Exercise filter_RNASeq on a simulated sample, default and transform_log1p=False."""
        from topslam.simulation.simulate_trajectory import rnaseq_simulation

        latent_dims = 30
        simulated = rnaseq_simulation(latent_dims, 4, 2, 1234, split_prob=.5)
        # Unpack all six returned values (only the sampler is needed) so the
        # arity check of the tuple assignment is preserved.
        _X, sampler, _time, _cells, _labs, _seed = simulated

        # Fix the global numpy RNG before drawing the sample.
        np.random.seed(42)
        Y, d = sampler()

        from topslam.filtering import filter_RNASeq
        import pandas as pd

        check = np.testing

        # Default filtering: all resulting values are strictly greater than 0.
        with_log = filter_RNASeq(pd.DataFrame(np.exp(Y)))
        check.assert_array_less(-with_log.values, 0)

        # Filtering with the log1p transform switched off.
        without_log = filter_RNASeq(
            pd.DataFrame(np.exp(Y)), transform_log1p=False)
        # Mean of Y over the entries selected by d is expected to be zero.
        check.assert_allclose(-Y[d].mean(), 0)
        # Strictly fewer zero entries after filtering than in Y itself.
        check.assert_array_less(
            (without_log.values == 0).sum(), (Y == 0).sum())

コード例 #3

0

ファイルを表示

ファイル: examples.py プロジェクト: mzwiessele/topslam

def example_deng(optimize=True, plot=True):
    """Build a topslam model for the Deng single-cell RNA-seq data set.

    The filtered expression matrix is cached as
    ``~/tmp/Deng/filtered_expression_values.csv``. When that file is missing,
    the data is obtained through ``GPy.util.datasets.singlecell_rna_seq_deng``,
    passed through ``filter_RNASeq`` and written to the cache.

    :param optimize: when true, call ``optimize_model`` on the model.
    :param plot: when true, build a ``ManifoldCorrectionTree`` from the model
        and pass it to ``plot_comparison``.
    :returns: the model returned by ``create_model``, with ``Ymean``,
        ``Ystd``, ``data_labels``, ``data_ulabels``, ``data``, ``X_init`` and
        ``dims`` attached as attributes.
    """
    import os

    import GPy
    import numpy as np
    import pandas as pd
    from topslam.filtering import filter_RNASeq

    # Reproducibility: BGPLVM has local optima
    np.random.seed(42)

    # Stage names, in the order rows are selected when the cache is rebuilt.
    stage_order = ['Zygote',
                   '2-cell embryo',
                   'Early 2-cell blastomere', 'Mid 2-cell blastomere', 'Late 2-cell blastomere',
                   '4-cell blastomere', '8-cell blastomere', '16-cell blastomere',
                   'Early blastocyst cell', 'Mid blastocyst cell', 'Late blastocyst cell',
                   'fibroblast',
                   'adult liver',
                  ]

    folder_path = os.path.expanduser('~/tmp/Deng')
    csv_file = os.path.join(folder_path, 'filtered_expression_values.csv')

    if os.path.exists(csv_file):
        print('Loading previous filtered data: {}'.format(csv_file))
        Y_bgplvm = pd.read_csv(csv_file, index_col=[0,1,2], header=0)
    else:
        print('Loading data:')
        data = GPy.util.datasets.singlecell_rna_seq_deng()
        if not os.path.exists(folder_path):
            # makedirs also creates a missing parent (~/tmp); os.mkdir would
            # raise when the parent directory does not exist yet.
            os.makedirs(folder_path)
        Ydata = data['Y'].copy()
        Ydata.columns = Ydata.columns.to_series().apply(str.upper)
        # Build a three-level row index (labels, level_0, index) and select
        # the rows stage by stage in the order given by stage_order.
        Ydata = Ydata.reset_index().set_index('index', append=True)
        Ydata['labels'] = data['labels'].values
        Ydata = Ydata.set_index('labels', append=True)
        Ydata = Ydata.reorder_levels([0,2,1])
        Ydata = Ydata.reset_index([0,2]).loc[stage_order].set_index(['level_0', 'index'], append=True)

        Y = Ydata.copy()
        # Keep only the part of each column name before the first '.'.
        Y.columns = [c.split('.')[0] for c in Y.columns]
        Y_bgplvm = filter_RNASeq(Y)
        print('\nSaving data to tmp file: {}'.format(csv_file))
        Y_bgplvm.to_csv(csv_file)

    # Copy so the relabeling below only ever writes to an array owned here.
    labels = Y_bgplvm.index.get_level_values(0).values.copy()
    Ymean = Y_bgplvm.values.mean()
    Ystd = Y_bgplvm.values.std()

    # NOTE(review): the normalisation is applied in place to the array
    # returned by ``.values``; whether that also rescales ``Y_bgplvm`` (stored
    # as ``m.data`` below) depends on pandas handing out a view - confirm.
    Y_m = Y_bgplvm.values
    Y_m -= Ymean
    Y_m /= Ystd

    def _stage_labels(stage, short):
        """Return one label per row of `stage`: `short`, or `short + ' split'`.

        A row counts as split when the text before the first '-' of its
        level-1 index value contains 'split'. Pooled and non-pooled split
        rows receive the same label.
        """
        names = Y_bgplvm.loc[stage].index.get_level_values(1)
        return [
            short + ' split' if 'split' in name.split('-')[0] else short
            for name in names
        ]

    # get the labels right for split experiments (8-cell and 16-cell stages)
    labels[labels == '8-cell blastomere'] = _stage_labels('8-cell blastomere', '8')
    labels[labels == '16-cell blastomere'] = _stage_labels('16-cell blastomere', '16')

    # Unique labels in order of first appearance.
    ulabels = []
    for lab in labels:
        if lab not in ulabels:
            ulabels.append(lab)

    short_labels = labels.copy()
    # Short names, paired positionally with ulabels by the zip below.
    _ulabels_convert = np.array([
            'Z',# Z',
            'E',# Em',
            '2',# Bm E',
            '2',# Bm M',
            '2',# Bm L',
            '4',
            '8',
            '8 s',
            '16',
            '16 s',
            'Bz',# E',
            'Bz',# M',
            'Bz',# L'
            'F',
            'L'
        ])

    short_ulabels = []
    for lab, nlab in zip(ulabels, _ulabels_convert):
        short_labels[short_labels == lab] = nlab
        if nlab not in short_ulabels:
            short_ulabels.append(nlab)

    from topslam.optimization import run_methods, methods, create_model, optimize_model
    X_init, dims = run_methods(Y_m, methods)

    m = create_model(Y_m, X_init, num_inducing=25)
    m.Ymean = Ymean
    m.Ystd = Ystd
    m.data_labels = short_labels
    m.data_ulabels = short_ulabels
    m.data = Y_bgplvm

    m.X_init = X_init
    m.dims = dims

    if optimize:
        optimize_model(m)
    if plot:
        # ManifoldCorrectionTree and plot_comparison are not imported in this
        # function; they are presumably provided at module level - verify.
        mc = ManifoldCorrectionTree(m)
        plot_comparison(mc, X_init, dims, m.data_labels, m.data_ulabels, 0)

    return m

コード例 #4

0

ファイルを表示

ファイル: examples.py プロジェクト: mzwiessele/topslam

def example_deng(optimize=True, plot=True):
    """Build a topslam model for the Deng single-cell RNA-seq data set.

    The filtered expression matrix is cached as
    ``~/tmp/Deng/filtered_expression_values.csv``. When that file is missing,
    the data is obtained through ``GPy.util.datasets.singlecell_rna_seq_deng``,
    passed through ``filter_RNASeq`` and written to the cache.

    :param optimize: when true, call ``optimize_model`` on the model.
    :param plot: when true, build a ``ManifoldCorrectionTree`` from the model
        and pass it to ``plot_comparison``.
    :returns: the model returned by ``create_model``, with ``Ymean``,
        ``Ystd``, ``data_labels``, ``data_ulabels``, ``data``, ``X_init`` and
        ``dims`` attached as attributes.
    """
    import os

    import GPy
    import numpy as np
    import pandas as pd
    from topslam.filtering import filter_RNASeq

    # Reproducibility: BGPLVM has local optima
    np.random.seed(42)

    # Stage names, in the order rows are selected when the cache is rebuilt.
    stage_order = [
        'Zygote',
        '2-cell embryo',
        'Early 2-cell blastomere',
        'Mid 2-cell blastomere',
        'Late 2-cell blastomere',
        '4-cell blastomere',
        '8-cell blastomere',
        '16-cell blastomere',
        'Early blastocyst cell',
        'Mid blastocyst cell',
        'Late blastocyst cell',
        'fibroblast',
        'adult liver',
    ]

    folder_path = os.path.expanduser('~/tmp/Deng')
    csv_file = os.path.join(folder_path, 'filtered_expression_values.csv')

    if os.path.exists(csv_file):
        print('Loading previous filtered data: {}'.format(csv_file))
        Y_bgplvm = pd.read_csv(csv_file, index_col=[0, 1, 2], header=0)
    else:
        print('Loading data:')
        data = GPy.util.datasets.singlecell_rna_seq_deng()
        if not os.path.exists(folder_path):
            # makedirs also creates a missing parent (~/tmp); os.mkdir would
            # raise when the parent directory does not exist yet.
            os.makedirs(folder_path)
        Ydata = data['Y'].copy()
        Ydata.columns = Ydata.columns.to_series().apply(str.upper)
        # Build a three-level row index (labels, level_0, index) and select
        # the rows stage by stage in the order given by stage_order.
        Ydata = Ydata.reset_index().set_index('index', append=True)
        Ydata['labels'] = data['labels'].values
        Ydata = Ydata.set_index('labels', append=True)
        Ydata = Ydata.reorder_levels([0, 2, 1])
        Ydata = Ydata.reset_index([0, 2]).loc[stage_order].set_index(
            ['level_0', 'index'], append=True)

        Y = Ydata.copy()
        # Keep only the part of each column name before the first '.'.
        Y.columns = [c.split('.')[0] for c in Y.columns]
        Y_bgplvm = filter_RNASeq(Y)
        print('\nSaving data to tmp file: {}'.format(csv_file))
        Y_bgplvm.to_csv(csv_file)

    # Copy so the relabeling below only ever writes to an array owned here.
    labels = Y_bgplvm.index.get_level_values(0).values.copy()
    Ymean = Y_bgplvm.values.mean()
    Ystd = Y_bgplvm.values.std()

    # NOTE(review): the normalisation is applied in place to the array
    # returned by ``.values``; whether that also rescales ``Y_bgplvm`` (stored
    # as ``m.data`` below) depends on pandas handing out a view - confirm.
    Y_m = Y_bgplvm.values
    Y_m -= Ymean
    Y_m /= Ystd

    def _stage_labels(stage, short):
        """Return one label per row of `stage`: `short`, or `short + ' split'`.

        A row counts as split when the text before the first '-' of its
        level-1 index value contains 'split'. Pooled and non-pooled split
        rows receive the same label.
        """
        names = Y_bgplvm.loc[stage].index.get_level_values(1)
        return [
            short + ' split' if 'split' in name.split('-')[0] else short
            for name in names
        ]

    # get the labels right for split experiments (8-cell and 16-cell stages)
    labels[labels == '8-cell blastomere'] = _stage_labels(
        '8-cell blastomere', '8')
    labels[labels == '16-cell blastomere'] = _stage_labels(
        '16-cell blastomere', '16')

    # Unique labels in order of first appearance.
    ulabels = []
    for lab in labels:
        if lab not in ulabels:
            ulabels.append(lab)

    short_labels = labels.copy()
    # Short names, paired positionally with ulabels by the zip below.
    _ulabels_convert = np.array([
        'Z',  # Z',
        'E',  # Em',
        '2',  # Bm E',
        '2',  # Bm M',
        '2',  # Bm L',
        '4',
        '8',
        '8 s',
        '16',
        '16 s',
        'Bz',  # E',
        'Bz',  # M',
        'Bz',  # L'
        'F',
        'L'
    ])

    short_ulabels = []
    for lab, nlab in zip(ulabels, _ulabels_convert):
        short_labels[short_labels == lab] = nlab
        if nlab not in short_ulabels:
            short_ulabels.append(nlab)

    from topslam.optimization import run_methods, methods, create_model, optimize_model
    X_init, dims = run_methods(Y_m, methods)

    m = create_model(Y_m, X_init, num_inducing=25)
    m.Ymean = Ymean
    m.Ystd = Ystd
    m.data_labels = short_labels
    m.data_ulabels = short_ulabels
    m.data = Y_bgplvm

    m.X_init = X_init
    m.dims = dims

    if optimize:
        optimize_model(m)
    if plot:
        # ManifoldCorrectionTree and plot_comparison are not imported in this
        # function; they are presumably provided at module level - verify.
        mc = ManifoldCorrectionTree(m)
        plot_comparison(mc, X_init, dims, m.data_labels, m.data_ulabels, 0)

    return m