def __data_generation(self, sequences):
    'Generates data containing batch_size samples'
    # X : (n_samples, *dim, n_channels)
    # one-hot encode the batch of raw sequences
    X = cf.seq_one_hot(sequences,
                       seq_type=self.seq_type,
                       max_len=self.max_len,
                       seq_resize=self.seq_resize,
                       skip_first=self.skip_first)
    return X
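# __data_generation above is the batch-encoding hook of a Keras data
# generator. A minimal sketch of the surrounding class, assuming a
# tensorflow.keras.utils.Sequence subclass; the class name, constructor,
# and __getitem__ wiring are illustrative, not the project's actual code.
import math
from tensorflow.keras.utils import Sequence
import cnn_functions as cf  # project module, assumed importable

class OneHotSeqGenerator(Sequence):
    def __init__(self, sequences, batch_size, seq_type, max_len,
                 seq_resize=False, skip_first=0):
        self.sequences = list(sequences)
        self.batch_size = batch_size
        self.seq_type = seq_type
        self.max_len = max_len
        self.seq_resize = seq_resize
        self.skip_first = skip_first

    def __len__(self):
        # number of batches per epoch
        return math.ceil(len(self.sequences) / self.batch_size)

    def __getitem__(self, index):
        # slice one batch of raw sequences and one-hot encode it;
        # returns X only, which is suitable for model.predict()
        batch = self.sequences[index * self.batch_size:
                               (index + 1) * self.batch_size]
        return self.__data_generation(batch)

    def __data_generation(self, sequences):
        'Generates data containing batch_size samples'
        return cf.seq_one_hot(sequences, seq_type=self.seq_type,
                              max_len=self.max_len,
                              seq_resize=self.seq_resize,
                              skip_first=self.skip_first)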
annotation_ydata_df = pd.DataFrame({
    'ydata': range(num_classes),
    'annotation': uniq_anno
})
seq_df = pd.merge(seq_df, annotation_ydata_df, on='annotation')
seq_cluster = seq_df.loc[seq_df['Cluster'] > -1]
seq_cluster_noise = seq_df.loc[seq_df['Cluster'] == -1]
seq_cluster_a = seq_cluster

#%%
# generate training data for annotation/cluster datasets
## annotations
train_a = seq_cluster_a.groupby(['annotation']).sample(n=sample_n)
seq_cluster_a = seq_cluster_a.drop(train_a.index)
train_a_one_hot = cf.seq_one_hot(train_a['sequence'],
                                 seq_type=seq_type,
                                 max_len=max_len,
                                 seq_resize=seq_resize)

## clusters
train_c = seq_cluster.groupby(['annotation', 'Cluster']).sample(n=sample_n)
seq_cluster = seq_cluster.drop(train_c.index)
train_c_one_hot = cf.seq_one_hot(train_c['sequence'],
                                 seq_type=seq_type,
                                 max_len=max_len,
                                 seq_resize=seq_resize)

#%%
# generate validation data for annotation/cluster datasets
## annotation
validation_a = seq_cluster_a.groupby(['annotation']).sample(n=sample_n)
seq_cluster_a = seq_cluster_a.drop(validation_a.index)
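# The pattern above (groupby(...).sample(n=...) followed by
# drop(sample.index)) draws a fixed number of rows per class and removes
# them from the pool, keeping the successive splits disjoint. A
# self-contained toy illustration of that pattern (data and names are
# illustrative only, not part of the pipeline):
import pandas as pd

toy = pd.DataFrame({'annotation': ['a'] * 4 + ['b'] * 4,
                    'sequence': list('MKVTLAGH')})
toy_train = toy.groupby('annotation').sample(n=2)
toy_rest = toy.drop(toy_train.index)
# every row lands in exactly one of the two frames
assert toy_train.index.intersection(toy_rest.index).empty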
df = pd.read_csv('data/swiss_data_variants/swiss_n1_unknown_first_removed.tsv',
                 sep='\t')
uniq_anno = df.annotation.unique()
num_classes = len(uniq_anno)
anno_categorical = pd.DataFrame({
    'ydata': range(num_classes),
    'annotation': uniq_anno
})
df = pd.merge(df, anno_categorical, on='annotation')

if skip_first == 1:
    max_len -= 1

one_hot_seqs = cf.seq_one_hot(df['sequence'],
                              seq_type=seq_type,
                              max_len=max_len,
                              seq_resize=seq_resize,
                              skip_first=skip_first)
y_cat = to_categorical(np.array(df.ydata, dtype='uint32'), num_classes)

# hold out 15% for testing, then carve the validation set out of the
# remaining training portion: 0.85 * (0.70 / 0.85) = 0.70 of the full
# dataset for training, leaving 0.15 for validation
X_train, X_test, y_train, y_test = train_test_split(one_hot_seqs, y_cat,
                                                    stratify=y_cat,
                                                    train_size=0.85)
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train, stratify=y_train, train_size=0.70 / 0.85)

model = cf.original_blstm(num_classes, num_letters,
# Earlier chunked/parallel version, kept for reference: each job encodes
# and predicts one slice of seq_df and writes it to its own CSV.
# def parallel_predict_tara(seq_df, i, batch_size, model):
#     min_i = i * max_n
#     if i < batch_size - 1:
#         max_i = (i + 1) * max_n - 1
#     else:
#         max_i = seq_df.shape[0] - 1
#     sub = seq_df.iloc[min_i:max_i]
#     test_one_hot = seq_one_hot(sub['sequence'],
#                                seq_type=seq_type,
#                                max_len=max_len,
#                                seq_resize=seq_resize)
#     tmp = model.predict(test_one_hot)
#     size_reshape = tmp.shape[0]
#     sub['prediction'] = tmp.reshape(-1, size_reshape)[0]
#     sub.to_csv('data/tara/tara_predict_parallel/subsample_'
#                + str(i) + "_" + str(i + 1) + ".csv")
#
# ytest = np.array(test[regress_var], dtype=float)
# Parallel(n_jobs=n)(delayed(parallel_predict_tara)(seq_df, i, batch_size, model)
#                    for i in range(batch_size))

test_one_hot = seq_one_hot(seq_df['sequence'],
                           seq_type=seq_type,
                           max_len=max_len,
                           seq_resize=seq_resize)
tmp = model.predict(test_one_hot)
# flatten the (n_samples, 1) regression output into a single column
size_reshape = tmp.shape[0]
seq_df['prediction'] = tmp.reshape(-1, size_reshape)[0]
seq_df_mean = seq_df.groupby('Site')[['prediction', 'T_C']].mean()
seq_df_mean.to_csv('data/tara/temperature/mae.csv')
print(mean_absolute_error(seq_df_mean.prediction, seq_df_mean.T_C))
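# If the full dataset is too large for a single predict() call, the same
# predictions can be produced chunk by chunk, echoing the parallel
# version above. A minimal sketch reusing the variables already defined;
# the chunk count of 10 is illustrative, not a tuned value.
import numpy as np

chunk_preds = []
for chunk in np.array_split(seq_df['sequence'], 10):
    chunk_one_hot = seq_one_hot(chunk, seq_type=seq_type,
                                max_len=max_len, seq_resize=seq_resize)
    chunk_preds.append(model.predict(chunk_one_hot).ravel())
# should match the single-call predictions computed above
assert np.allclose(np.concatenate(chunk_preds), seq_df['prediction'])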
})
seq_df = pd.merge(seq_df, annotation_ydata_df, on='annotation')
seq_df = seq_df.loc[seq_df['Cluster'] > -1]
seq_df = seq_df.drop(
    ['Component_1', 'Component_2', 'Cluster', 'id', 'annotation'], axis=1)
seq_df_shuffle = cf.randomize_groups(seq_df, 'ydata', f)

#%%
# generate training data for annotation/cluster datasets
## annotations
train_a = seq_df_shuffle.groupby(['ydata']).sample(n=sample_n)
seq_df_shuffle = seq_df_shuffle.drop(train_a.index)
train_a_one_hot = cf.seq_one_hot(train_a['sequence'],
                                 seq_type=seq_type,
                                 max_len=max_len,
                                 seq_resize=seq_resize)

#%%
# generate validation data for annotation/cluster datasets
## annotation
validation_a = seq_df_shuffle.groupby(['ydata']).sample(n=sample_n)
seq_df_shuffle = seq_df_shuffle.drop(validation_a.index)
validation_a_one_hot = cf.seq_one_hot(validation_a['sequence'],
                                      seq_type=seq_type,
                                      max_len=max_len,
                                      seq_resize=seq_resize)

# generate test data for annotation/cluster datasets
## annotation
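# cf.randomize_groups is project code whose implementation isn't shown in
# this excerpt. A plausible reading of the (df, column, fraction)
# signature is a label-shuffling control: permute the 'ydata' labels of a
# fraction f of rows so downstream accuracy can be compared against
# partially randomized labels. A hypothetical sketch of that assumed
# behavior (name and semantics are assumptions, not the project's code):
import numpy as np

def randomize_groups_sketch(df, col, f, seed=None):
    rng = np.random.default_rng(seed)
    out = df.copy()
    # pick a fraction f of rows and permute their labels among themselves
    idx = out.sample(frac=f, random_state=seed).index
    out.loc[idx, col] = rng.permutation(out.loc[idx, col].to_numpy())
    return out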
import sys
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model, load_model

sys.path.insert(1, 'scripts/python/tmr/')
import cnn_functions as cf

emb_data_path = 'data/swiss_data_variants/swiss_n1.tsv'
write_path = 'data/tara/swiss_n1_11182020_embedding.tsv'
model_path = ('/home/troyalty/Documents/projects/sequence_cnn/data/models/'
              'iteration/iteration_swiss_n1/swiss_iteration_0.h5')
sep = '\t'
max_len = 300
embed_size = 256
batch_size = 100
seq_type = 'aa'
seq_resize = False
layer = "lstm_1"
n_components = 2

model = load_model(model_path)
emb_data = pd.read_csv(emb_data_path, sep=sep)
# expose the activations of the LSTM layer as the sequence embedding
embed_model = Model(inputs=model.input, outputs=model.get_layer(layer).output)
# embed_model.summary()

new_seq = cf.seq_one_hot(emb_data['sequence'],
                         seq_type=seq_type,
                         max_len=max_len,
                         seq_resize=seq_resize)
embed = embed_model.predict(new_seq)
emb_data = pd.concat([emb_data, pd.DataFrame(embed)], axis=1)
emb_data.to_csv(write_path, sep='\t')
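# n_components is defined above but not used in this excerpt; presumably
# the 256-d LSTM embedding is reduced to two components downstream (the
# Component_1/Component_2 columns seen elsewhere in this project). A
# minimal sketch with scikit-learn PCA; the choice of reducer is an
# assumption, as the project may use UMAP or another method:
from sklearn.decomposition import PCA

coords = PCA(n_components=n_components).fit_transform(embed)
emb_data[['Component_1', 'Component_2']] = coords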