def load_data(batch_size=32,
              valid_set_size=0.0,
              test_set_size=1.0,
              file_path=''):
    """Load the designed APA MPRA array data (master_seq store) and build
    isolearn DataGenerators over it.

    Parameters
    ----------
    batch_size : int
        Mini-batch size used by every generator.
    valid_set_size : float
        Fraction of rows held out for validation.
    test_set_size : float
        Fraction of rows held out for testing.
    file_path : str
        Directory prefix of the isoio data store.

    Returns
    -------
    dict
        Maps 'all', 'train', 'valid' and 'test' to an iso.DataGenerator
        yielding (seq, lib, distal_pas) inputs and a prox_usage output.
    """

    #Load array data
    array_dict = isoio.load(file_path + 'apa_array_data_master_seq')
    array_df = array_dict['array_df']
    array_cuts = array_dict['pooled_cuts']

    #Generate training and test set indexes
    #(np.int was removed in NumPy 1.24 -- use the builtin int instead)
    array_index = np.arange(len(array_df), dtype=int)

    print('Designed MPRA size = ' + str(array_index.shape[0]))

    #NOTE(review): if valid_set_size + test_set_size == 0 the first slice
    #degenerates to [:-0] == [:0] (empty train split); callers always pass a
    #non-zero test fraction here.
    array_train_index = array_index[:-int(
        len(array_df) * (valid_set_size + test_set_size))]
    array_valid_index = array_index[
        array_train_index.shape[0]:-int(len(array_df) * test_set_size)]
    array_test_index = array_index[array_train_index.shape[0] +
                                   array_valid_index.shape[0]:]

    print('Training set size = ' + str(array_train_index.shape[0]))
    print('Validation set size = ' + str(array_valid_index.shape[0]))
    print('Test set size = ' + str(array_test_index.shape[0]))

    #Manually set sublibrary intercept terms: (library_index, distal_pas)
    #per gene, in the same order the original per-gene assignments ran.
    library_settings = {
        'doubledope': (20, 1),
        'simple': (22, 0),
        'tomm5': (8, 1),
        'aar': (30, 0),
        'atr': (31, 0),
        'hsp': (32, 0),
        'snh': (33, 0),
        'sox': (34, 0),
        'wha': (35, 0),
    }

    array_df['library_index'] = np.zeros(len(array_df), dtype=int)
    array_df['distal_pas'] = np.ones(len(array_df))
    for gene, (lib_index, distal_pas) in library_settings.items():
        gene_mask = array_df['gene'] == gene
        array_df.loc[gene_mask, 'library_index'] = lib_index
        array_df.loc[gene_mask, 'distal_pas'] = distal_pas

    split_indexes = [('all', array_index), ('train', array_train_index),
                     ('valid', array_valid_index), ('test', array_test_index)]

    array_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': array_df, 'cuts': array_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    #One-hot encoded 185 nt sequence window
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('seq_ext',
                                                       start_pos=200 + 1,
                                                       end_pos=200 + 1 + 185),
                    'encoder': iso.OneHotEncoder(seq_length=185),
                    'dim': (1, 185, 4),
                    'sparsify': False
                },
                {
                    #Categorical sublibrary intercept (36 possible libraries)
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['library_index'],
                    'encoder': iso.CategoricalEncoder(
                        n_categories=36,
                        categories=list(range(36))),
                    'sparsify': False
                },
                {
                    #Binary indicator for the distal PAS intercept term
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['distal_pas'],
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    #Normalized proximal isoform usage from pooled cut counts
                    'id': 'prox_usage',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=200 + 1,
                                                    end_pos=200 + 1 + 185,
                                                    static_poses=[-1],
                                                    sparse_source=True),
                    'transformer': lambda t: iso_normalizer(t),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False)
        for gen_id, idx in split_indexes
    }

    return array_gens
# Beispiel #2
# 0
def load_data(batch_size=32,
              valid_set_size=0.0,
              test_set_size=1.0,
              file_path='',
              data_version=''):
    """Load a versioned APA MPRA array data store and build isolearn
    DataGenerators over it.

    Parameters
    ----------
    batch_size : int
        Mini-batch size used by every generator.
    valid_set_size : float
        Fraction of rows held out for validation.
    test_set_size : float
        Fraction of rows held out for testing.
    file_path : str
        Directory prefix of the isoio data store.
    data_version : str
        Suffix appended to the 'apa_array_data' store name.

    Returns
    -------
    dict
        Maps 'all', 'train', 'valid' and 'test' to an iso.DataGenerator
        yielding (seq, lib, distal_pas) inputs and a prox_usage output.
    """

    #Load array data
    array_dict = isoio.load(file_path + 'apa_array_data' + data_version)
    array_df = array_dict['array_df']
    array_cuts = array_dict['pooled_cuts']

    #Generate training and test set indexes
    #(np.int was removed in NumPy 1.24 -- use the builtin int instead)
    array_index = np.arange(len(array_df), dtype=int)

    print('Designed MPRA size = ' + str(array_index.shape[0]))

    array_train_index = array_index[:-int(
        len(array_df) * (valid_set_size + test_set_size))]
    array_valid_index = array_index[
        array_train_index.shape[0]:-int(len(array_df) * test_set_size)]
    array_test_index = array_index[array_train_index.shape[0] +
                                   array_valid_index.shape[0]:]

    print('Training set size = ' + str(array_train_index.shape[0]))
    print('Validation set size = ' + str(array_valid_index.shape[0]))
    print('Test set size = ' + str(array_test_index.shape[0]))

    #Library names used by the categorical 'lib' input encoder
    #(np.object was removed in NumPy 1.24 -- use the builtin object instead)
    unique_libraries = np.array([
        'tomm5_up_n20c20_dn_c20', 'tomm5_up_c20n20_dn_c20',
        'tomm5_up_n20c20_dn_n20', 'tomm5_up_c20n20_dn_n20', 'doubledope',
        'simple', 'atr', 'hsp', 'snh', 'sox', 'wha', 'array', 'aar'
    ], dtype=object)

    split_indexes = [('all', array_index), ('train', array_train_index),
                     ('valid', array_valid_index), ('test', array_test_index)]

    array_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': array_df, 'cuts': array_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    #One-hot encoded 205 nt sequence window
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('seq_ext',
                                                       start_pos=180,
                                                       end_pos=180 + 205),
                    'encoder': iso.OneHotEncoder(seq_length=205),
                    'dim': (205, 4, 1),
                    'sparsify': False
                },
                {
                    #Every row of this store belongs to the 'array' library
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: 'array',
                    'encoder': iso.CategoricalEncoder(
                        n_categories=len(unique_libraries),
                        categories=unique_libraries),
                    'sparsify': False
                },
                {
                    #Constant distal PAS indicator for the array library
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: 1,
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    #Normalized proximal isoform usage from pooled cut counts
                    'id': 'prox_usage',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180,
                                                    end_pos=180 + 205,
                                                    static_poses=[-1],
                                                    sparse_source=True),
                    'transformer': lambda t: iso_normalizer(t),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False)
        for gen_id, idx in split_indexes
    }

    return array_gens
def load_data(batch_size=32, valid_set_size=0.025, test_set_size=0.025, file_path='', kept_libraries=None, canonical_pas=False, no_dse_canonical_pas=False) :
    """Load the legacy APA plasmid data store, optionally filter it, and
    build isolearn prediction DataGenerators over it.

    Parameters
    ----------
    batch_size : int
        Mini-batch size used by every generator.
    valid_set_size, test_set_size : float or int
        Held-out fraction when <= 1.0, absolute row count otherwise.
    file_path : str
        Directory prefix of the isoio data store.
    kept_libraries : list or None
        If given, keep only rows whose library_index is in this list.
    canonical_pas : bool
        If True, keep only rows with AATAAA at sequence positions 50-56.
    no_dse_canonical_pas : bool
        If True, drop rows with another AATAAA downstream of position 56.

    Returns
    -------
    dict
        Maps 'all', 'train', 'valid' and 'test' to an iso.DataGenerator
        yielding (seq, lib, distal_pas) inputs and prox_usage/prox_cuts
        outputs.
    """

    #Load plasmid data
    plasmid_dict = isoio.load(file_path + 'apa_plasmid_data_legacy')
    plasmid_df = plasmid_dict['plasmid_df']
    plasmid_cuts = plasmid_dict['plasmid_cuts']

    #Restrict to the requested sublibraries (cut matrix rows track df rows)
    if kept_libraries is not None :
        keep_index = np.nonzero(plasmid_df.library_index.isin(kept_libraries))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    #Keep only sequences with a canonical AATAAA PAS
    if canonical_pas :
        keep_index = np.nonzero(plasmid_df.seq.str.slice(50, 56) == 'AATAAA')[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    #Drop sequences with a competing canonical PAS in the downstream element
    if no_dse_canonical_pas :
        keep_index = np.nonzero(~plasmid_df.seq.str.slice(56).str.contains('AATAAA'))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    #Generate training and test set indexes
    #(np.int was removed in NumPy 1.24 -- use the builtin int instead)
    plasmid_index = np.arange(len(plasmid_df), dtype=int)

    plasmid_train_index, plasmid_valid_index, plasmid_test_index = None, None, None

    #Set sizes <= 1.0 are fractions of the data; larger values are treated
    #as absolute row counts.
    if valid_set_size <= 1.0 and test_set_size <= 1.0 :
        plasmid_train_index = plasmid_index[:-int(len(plasmid_df) * (valid_set_size + test_set_size))]
        plasmid_valid_index = plasmid_index[plasmid_train_index.shape[0]:-int(len(plasmid_df) * test_set_size)]
        plasmid_test_index = plasmid_index[plasmid_train_index.shape[0] + plasmid_valid_index.shape[0]:]
    else :
        plasmid_train_index = plasmid_index[:-(valid_set_size + test_set_size)]
        plasmid_valid_index = plasmid_index[plasmid_train_index.shape[0]:-test_set_size]
        plasmid_test_index = plasmid_index[plasmid_train_index.shape[0] + plasmid_valid_index.shape[0]:]

    print('Training set size = ' + str(plasmid_train_index.shape[0]))
    print('Validation set size = ' + str(plasmid_valid_index.shape[0]))
    print('Test set size = ' + str(plasmid_test_index.shape[0]))

    plasmid_prediction_gens = {
        gen_id : iso.DataGenerator(
            idx,
            {'df' : plasmid_df, 'cuts' : plasmid_cuts},
            batch_size=batch_size,
            inputs = [
                {
                    #One-hot encoded 185 nt sequence window
                    'id' : 'seq',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : iso.SequenceExtractor('seq', start_pos=1, end_pos=1 + 185),
                    'encoder' : iso.OneHotEncoder(seq_length=185),
                    'dim' : (1, 185, 4),
                    'sparsify' : False
                },
                {
                    #Categorical sublibrary intercept (36 possible libraries)
                    'id' : 'lib',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : lambda row, index: row['library_index'],
                    'encoder' : iso.CategoricalEncoder(n_categories=36, categories=list(range(36))),
                    'sparsify' : False
                },
                {
                    #Sublibraries 2, 5, 8, 11 and 20 carry a distal PAS
                    'id' : 'distal_pas',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : lambda row, index: 1 if row['library_index'] in [2, 5, 8, 11, 20] else 0,
                    'encoder' : None,
                    'sparsify' : False
                }
            ],
            outputs = [
                {
                    #Observed proximal usage fraction from the count columns
                    'id' : 'prox_usage',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : lambda row, index: row['proximal_count'] / row['total_count'],
                    'transformer' : lambda t: t,
                    'dim' : (1,),
                    'sparsify' : False
                },
                {
                    #Raw per-position cut counts over the 186 nt window
                    'id' : 'prox_cuts',
                    'source_type' : 'matrix',
                    'source' : 'cuts',
                    'extractor' : iso.CountExtractor(start_pos=0, end_pos=186, sparse_source=False),
                    'transformer' : lambda t: t,
                    'dim' : (186,),
                    'sparsify' : False
                }
            ],
            randomizers = [],
            shuffle = False,
            densify_batch_matrices=True
        ) for gen_id, idx in [('all', plasmid_index), ('train', plasmid_train_index), ('valid', plasmid_valid_index), ('test', plasmid_test_index)]
    }

    return plasmid_prediction_gens
# Beispiel #4
# 0
def load_data(batch_size=64, valid_set_size=0.025, test_set_size=0.025, file_path='', data_version='_v2', kept_libraries=None, canonical_pas=False, no_dse_canonical_pas=False, no_clinvar_wt=True) :
    """Load a versioned APA plasmid data store, optionally filter it, and
    build isolearn training and prediction DataGenerators over it.

    Parameters
    ----------
    batch_size : int
        Mini-batch size used by every generator.
    valid_set_size, test_set_size : float
        Fractions of rows held out for validation / testing.
    file_path : str
        Directory prefix of the isoio data store.
    data_version : str
        Suffix appended to the 'apa_plasmid_data' store name.
    kept_libraries : list or None
        If given, keep only rows whose library_index is in this list.
    canonical_pas : bool
        If True, keep only rows with AATAAA at sequence positions 70-76.
    no_dse_canonical_pas : bool
        If True, drop rows with another AATAAA downstream of position 76.
    no_clinvar_wt : bool
        If True, drop rows from the 'clinvar_wt' sublibrary.

    Returns
    -------
    (dict, dict)
        Training generators (shuffled, count/usage/cut inputs plus a dummy
        output) and prediction generators (unshuffled), each keyed by
        'all', 'train', 'valid' and 'test'.
    """

    #Load plasmid data
    plasmid_dict = isoio.load(file_path + 'apa_plasmid_data' + data_version)
    plasmid_df = plasmid_dict['plasmid_df']
    plasmid_cuts = plasmid_dict['plasmid_cuts']

    #Library names used by the categorical 'lib' input encoder
    #(np.object was removed in NumPy 1.24 -- use the builtin object instead)
    unique_libraries = np.array(['tomm5_up_n20c20_dn_c20', 'tomm5_up_c20n20_dn_c20', 'tomm5_up_n20c20_dn_n20', 'tomm5_up_c20n20_dn_n20', 'doubledope', 'simple', 'atr', 'hsp', 'snh', 'sox', 'wha', 'array', 'aar'], dtype=object)

    #Restrict to the requested sublibraries (cut matrix rows track df rows)
    if kept_libraries is not None :
        keep_index = np.nonzero(plasmid_df.library_index.isin(kept_libraries))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    #Keep only sequences with a canonical AATAAA PAS
    if canonical_pas :
        keep_index = np.nonzero(plasmid_df.seq.str.slice(70, 76) == 'AATAAA')[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    #Drop sequences with a competing canonical PAS in the downstream element
    if no_dse_canonical_pas :
        keep_index = np.nonzero(~plasmid_df.seq.str.slice(76).str.contains('AATAAA'))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    #Drop the clinvar wildtype sublibrary
    if no_clinvar_wt :
        print("size before filtering out clinvar_wt = " + str(len(plasmid_df)))
        keep_index = np.nonzero(plasmid_df.sublibrary != 'clinvar_wt')[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]
        print("size after filtering out clinvar_wt = " + str(len(plasmid_df)))

    #Generate training and test set indexes
    #(np.int was removed in NumPy 1.24 -- use the builtin int instead)
    plasmid_index = np.arange(len(plasmid_df), dtype=int)

    plasmid_train_index = plasmid_index[:-int(len(plasmid_df) * (valid_set_size + test_set_size))]
    plasmid_valid_index = plasmid_index[plasmid_train_index.shape[0]:-int(len(plasmid_df) * test_set_size)]
    plasmid_test_index = plasmid_index[plasmid_train_index.shape[0] + plasmid_valid_index.shape[0]:]

    print('Training set size = ' + str(plasmid_train_index.shape[0]))
    print('Validation set size = ' + str(plasmid_valid_index.shape[0]))
    print('Test set size = ' + str(plasmid_test_index.shape[0]))

    #NOTE(review): the original computed prox_range/norm_range here but never
    #used them (and they relied on the removed np.int alias) -- dropped.

    split_indexes = [('all', plasmid_index), ('train', plasmid_train_index), ('valid', plasmid_valid_index), ('test', plasmid_test_index)]

    plasmid_training_gens = {
        gen_id : iso.DataGenerator(
            idx,
            {'df' : plasmid_df, 'cuts' : plasmid_cuts},
            batch_size=batch_size,
            inputs = [
                {
                    #One-hot encoded 205 nt sequence window
                    'id' : 'seq',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : iso.SequenceExtractor('padded_seq', start_pos=180, end_pos=180 + 205),
                    'encoder' : iso.OneHotEncoder(seq_length=205),
                    'dim' : (1, 205, 4),
                    'sparsify' : False
                },
                {
                    #Categorical sublibrary intercept
                    'id' : 'lib',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : lambda row, index: row['library'],
                    'encoder' : iso.CategoricalEncoder(n_categories=len(unique_libraries), categories=unique_libraries),
                    'sparsify' : False
                },
                {
                    #Total cut count over the window (incl. the distal bin)
                    'id' : 'total_count',
                    'source_type' : 'matrix',
                    'source' : 'cuts',
                    'extractor' : iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer' : lambda t: np.sum(t),
                    'dim' : (1,),
                    'sparsify' : False
                },
                {
                    #Normalized proximal isoform usage
                    'id' : 'prox_usage',
                    'source_type' : 'matrix',
                    'source' : 'cuts',
                    'extractor' : iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer' : lambda t: iso_normalizer(t),
                    'dim' : (1,),
                    'sparsify' : False
                },
                {
                    #Normalized per-position cut distribution
                    'id' : 'prox_cuts',
                    'source_type' : 'matrix',
                    'source' : 'cuts',
                    'extractor' : iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer' : lambda t: cut_normalizer(t),
                    'dim' : (206,),
                    'sparsify' : False
                }
            ],
            outputs = [
                {
                    #The model's loss consumes the inputs; output is a placeholder
                    'id' : 'dummy_output',
                    'source_type' : 'zeros',
                    'dim' : (1,),
                    'sparsify' : False
                }
            ],
            randomizers = [],
            shuffle = True,
            densify_batch_matrices=True
        ) for gen_id, idx in split_indexes
    }

    plasmid_prediction_gens = {
        gen_id : iso.DataGenerator(
            idx,
            {'df' : plasmid_df, 'cuts' : plasmid_cuts},
            batch_size=batch_size,
            inputs = [
                {
                    #One-hot encoded 205 nt sequence window
                    'id' : 'seq',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : iso.SequenceExtractor('padded_seq', start_pos=180, end_pos=180 + 205),
                    'encoder' : iso.OneHotEncoder(seq_length=205),
                    'dim' : (1, 205, 4),
                    'sparsify' : False
                },
                {
                    #Categorical sublibrary intercept
                    'id' : 'lib',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : lambda row, index: row['library'],
                    'encoder' : iso.CategoricalEncoder(n_categories=len(unique_libraries), categories=unique_libraries),
                    'sparsify' : False
                }
            ],
            outputs = [
                {
                    #Normalized proximal isoform usage
                    'id' : 'prox_usage',
                    'source_type' : 'matrix',
                    'source' : 'cuts',
                    'extractor' : iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer' : lambda t: iso_normalizer(t),
                    'dim' : (1,),
                    'sparsify' : False
                },
                {
                    #Normalized per-position cut distribution
                    'id' : 'prox_cuts',
                    'source_type' : 'matrix',
                    'source' : 'cuts',
                    'extractor' : iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer' : lambda t: cut_normalizer(t),
                    'dim' : (206,),
                    'sparsify' : False
                }
            ],
            randomizers = [],
            shuffle = False,
            densify_batch_matrices=True
        ) for gen_id, idx in split_indexes
    }

    return plasmid_training_gens, plasmid_prediction_gens