Example #1
def __data_generation(self, sequences):
    'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
    # Initialization: one-hot encode the batch of raw sequences
    X = cf.seq_one_hot(sequences, seq_type=self.seq_type,
                       max_len=self.max_len,
                       seq_resize=self.seq_resize,
                       skip_first=self.skip_first)

    return X
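The method above is only the encoding step of a batch generator. Below is a minimal sketch of how it typically plugs into keras.utils.Sequence; the class name, constructor fields, and batching logic are illustrative assumptions, and only the __data_generation body comes from the snippet.

import numpy as np
from tensorflow.keras.utils import Sequence

import cnn_functions as cf  # project module providing seq_one_hot


class SeqBatchGenerator(Sequence):
    # Hypothetical wrapper class; attribute names mirror what
    # __data_generation reads (seq_type, max_len, seq_resize, skip_first).
    def __init__(self, sequences, batch_size, seq_type, max_len,
                 seq_resize=False, skip_first=0):
        self.sequences = list(sequences)
        self.batch_size = batch_size
        self.seq_type = seq_type
        self.max_len = max_len
        self.seq_resize = seq_resize
        self.skip_first = skip_first

    def __len__(self):
        # number of batches per epoch
        return int(np.ceil(len(self.sequences) / self.batch_size))

    def __getitem__(self, index):
        # slice out one batch of raw sequences and one-hot encode it;
        # returns inputs only, as suited to predict()-style generators
        batch = self.sequences[index * self.batch_size:
                               (index + 1) * self.batch_size]
        return self.__data_generation(batch)

    def __data_generation(self, sequences):
        'Generates data containing batch_size samples'
        return cf.seq_one_hot(sequences, seq_type=self.seq_type,
                              max_len=self.max_len,
                              seq_resize=self.seq_resize,
                              skip_first=self.skip_first)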
Example #2
    annotation_ydata_df = pd.DataFrame({
        'ydata': range(num_classes),
        'annotation': uniq_anno
    })
    seq_df = pd.merge(seq_df, annotation_ydata_df, on='annotation')
    # Cluster == -1 is the DBSCAN/HDBSCAN noise label; separate clustered
    # points from noise before building the splits
    seq_cluster = seq_df.loc[seq_df['Cluster'] > -1]
    seq_cluster_noise = seq_df.loc[seq_df['Cluster'] == -1]
    # second handle for the annotation-level splits; .drop() below rebinds it
    # without touching seq_cluster
    seq_cluster_a = seq_cluster

    #%%
    #generate training data for annotation/cluster datasets
    ##annotations
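    # draw sample_n sequences per annotation class, then remove them from the pool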
    train_a = seq_cluster_a.groupby(['annotation']).sample(n=sample_n)
    seq_cluster_a = seq_cluster_a.drop(train_a.index)
    train_a_one_hot = cf.seq_one_hot(train_a['sequence'],
                                     seq_type=seq_type,
                                     max_len=max_len,
                                     seq_resize=seq_resize)

    ##clusters
    train_c = seq_cluster.groupby(['annotation', 'Cluster']).sample(n=sample_n)
    seq_cluster = seq_cluster.drop(train_c.index)
    train_c_one_hot = cf.seq_one_hot(train_c['sequence'],
                                     seq_type=seq_type,
                                     max_len=max_len,
                                     seq_resize=seq_resize)

    #%%
    #generate validation data for annotation/cluster datasets
    ##annotation
    validation_a = seq_cluster_a.groupby(['annotation']).sample(n=sample_n)
    seq_cluster_a = seq_cluster_a.drop(validation_a.index)
Example #3
df = pd.read_csv('data/swiss_data_variants/swiss_n1_unknown_first_removed.tsv',
                 sep='\t')
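# map each unique annotation string to an integer class id ('ydata')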
uniq_anno = df.annotation.unique()
num_classes = len(uniq_anno)
anno_categorical = pd.DataFrame({
    'ydata': range(num_classes),
    'annotation': uniq_anno
})
df = pd.merge(df, anno_categorical, on='annotation')

# if the first position of each sequence is skipped, the effective length drops by one
if skip_first == 1:
    max_len -= 1

one_hot_seqs = cf.seq_one_hot(df['sequence'],
                              seq_type=seq_type,
                              max_len=max_len,
                              seq_resize=seq_resize,
                              skip_first=skip_first)

# one-hot encode the integer class labels
y_cat = to_categorical(np.array(df.ydata, dtype='uint32'), num_classes)

# hold out 15% as a stratified test set
X_train, X_test, y_train, y_test = train_test_split(one_hot_seqs,
                                                    y_cat,
                                                    stratify=y_cat,
                                                    train_size=0.85)

# split the remaining 85% into train/validation (70%/15% of the full set);
# splitting X_train rather than one_hot_seqs keeps the test set disjoint
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train, stratify=y_train, train_size=0.70 / 0.85)

model = cf.original_blstm(num_classes,
                          num_letters,
Example #4
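# Commented-out alternative: split seq_df into batch_size chunks and predict
# each chunk in parallel with joblib (Parallel/delayed), writing one CSV per chunk.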
#     min_i=i*max_n
#     if i<batch_size-1:
#         max_i=(i+1)*max_n-1
#     else:
#         max_i=seq_df.shape[0]-1

#     sub=seq_df.iloc[min_i:max_i]
#     test_one_hot=seq_one_hot(sub['sequence'],
#                                   seq_type=seq_type,
#                                   max_len=max_len,
#                                   seq_resize=seq_resize)
#     tmp=model.predict(test_one_hot)
#     size_reshape=tmp.shape[0]
#     sub['prediction']=tmp.reshape(-1,size_reshape)[0]
#     sub.to_csv('data/tara/tara_predict_parallel/subsample_'+str(i)+"_"+str(i+1)+".csv")

# ytest=np.array(test[regress_var],dtype=float)

# Parallel(n_jobs=n)(delayed(parallel_predict_tara)(seq_df,i,batch_size,model) for i in range(batch_size))
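# non-parallel path: one-hot encode and score the full dataset in one call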
test_one_hot = seq_one_hot(seq_df['sequence'],
                           seq_type=seq_type,
                           max_len=max_len,
                           seq_resize=seq_resize)

tmp = model.predict(test_one_hot)
seq_df['prediction'] = tmp.ravel()  # flatten the (n, 1) model output to 1-D
seq_df_mean = seq_df.groupby('Site')[['prediction', 'T_C']].mean()
seq_df_mean.to_csv('data/tara/temperature/mae.csv')
print(mean_absolute_error(seq_df_mean.prediction, seq_df_mean.T_C))
Example #5
        annotation_ydata_df = pd.DataFrame({
            'ydata': range(num_classes),
            'annotation': uniq_anno
        })
        seq_df = pd.merge(seq_df, annotation_ydata_df, on='annotation')
        seq_df = seq_df.loc[seq_df['Cluster'] > -1]
        seq_df = seq_df.drop(
            ['Component_1', 'Component_2', 'Cluster', 'id', 'annotation'],
            axis=1)

        seq_df_shuffle = cf.randomize_groups(seq_df, 'ydata', f)

        #%%
        #generate training data for annotation/cluster datasets
        ##annotations
        train_a = seq_df_shuffle.groupby(['ydata']).sample(n=sample_n)
        seq_df_shuffle = seq_df_shuffle.drop(train_a.index)
        train_a_one_hot = cf.seq_one_hot(train_a['sequence'],
                                         seq_type=seq_type,
                                         max_len=max_len,
                                         seq_resize=seq_resize)

        #%%
        #generate validation data for annotation/cluster datasets
        ##annotation
        validation_a = seq_df_shuffle.groupby(['ydata']).sample(n=sample_n)
        seq_df_shuffle = seq_df_shuffle.drop(validation_a.index)
        validation_a_one_hot = cf.seq_one_hot(validation_a['sequence'],
                                              seq_type=seq_type,
                                              max_len=max_len,
                                              seq_resize=seq_resize)

        #generate test data for annotation/cluster datasets
        ##annotation
Example #6
import sys

import pandas as pd
import numpy as np
from tensorflow.keras.models import Model, load_model  # or keras.models, per the project's Keras

sys.path.insert(1, 'scripts/python/tmr/')
import cnn_functions as cf

emb_data_path = 'data/swiss_data_variants/swiss_n1.tsv'
write_path = 'data/tara/swiss_n1_11182020_embedding.tsv'
model_path = '/home/troyalty/Documents/projects/sequence_cnn/data/models/iteration/iteration_swiss_n1/swiss_iteration_0.h5'
sep = '\t'
max_len = 300
embed_size = 256
batch_size = 100
seq_type = 'aa'
seq_resize = False
layer = "lstm_1"
n_components = 2

model = load_model(model_path)
emb_data = pd.read_csv(emb_data_path, sep=sep)
embed_model = Model(inputs=model.input, outputs=model.get_layer(layer).output)
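# truncating the network at `layer` makes predict() return that layer's
# activations, i.e. one embedding vector per sequence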
#	embed_model.summary()
new_seq = cf.seq_one_hot(emb_data['sequence'],
                         seq_type=seq_type,
                         max_len=max_len,
                         seq_resize=seq_resize)
embed = embed_model.predict(new_seq)
emb_data = pd.concat([emb_data, pd.DataFrame(embed)], axis=1)

emb_data.to_csv(write_path, sep='\t')
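n_components = 2 is defined above but never used in this snippet; presumably the embeddings are reduced to two components downstream (the Component_1/Component_2 columns dropped in Example #5 hint at this). A minimal sketch of that step with scikit-learn PCA, an assumed choice; the original pipeline may well use a different reducer:

# Hypothetical follow-up, not part of the original snippet: project the
# LSTM embeddings down to n_components dimensions.
from sklearn.decomposition import PCA

reducer = PCA(n_components=n_components)
components = reducer.fit_transform(embed)  # shape: (n_sequences, n_components)
emb_data[['Component_1', 'Component_2']] = components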