def train_model(path_config, args=None):
    x = get_data1(path_config[args.model_name]['train'], args)
    x_test = get_data2(path_config[args.model_name]['test'], args)
    hyperparams = utils.get_model_hyperparams('lstm_vae')
    dirpath_results = path_config[args.model_name]['results']
    logger = utils.get_logger()
    logger.info((x.shape, x_test.shape))

    input_dim = x.shape[-1]
    timesteps = x.shape[1]
    vae, enc, gen = create_lstm_vae(input_dim,
                                    timesteps,
                                    hyperparams['batch_size'],
                                    hyperparams['intermediate_dim'],
                                    hyperparams['latent_dim'],
                                    hyperparams['epsilon_std'])

    epochs = hyperparams['num_iterations']
    if args.is_demo:
        epochs = 1
    vae.fit(x, x, epochs=epochs)
    vae.save_weights(dirpath_results + args.model_name + "_weights.h5")
    # Keras' summary() prints instead of returning a string, so route it
    # through the logger rather than logging its None return value.
    vae.summary(print_fn=logger.info)

    # Embed the test set with the trained encoder and persist the embeddings.
    predicted = enc.predict(x_test, batch_size=hyperparams['batch_size'])
    np.save(dirpath_results + args.model_name + "_predicted", predicted)
    logger.info(predicted.shape)
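# Illustrative usage of train_model (assumption: an argparse-style namespace
# and a dict-backed path config; the concrete paths are hypothetical
# placeholders, not the repo's actual layout):
#
#     args = argparse.Namespace(model_name='lstm_vae', is_demo=True)
#     path_config = {'lstm_vae': {'train': 'data/lstm_vae/train.csv',
#                                 'test': 'data/lstm_vae/test.csv',
#                                 'results': 'results/lstm_vae/'}}
#     train_model(path_config, args)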
def save_loss_curve(dirpath_results, model_name):
    logger = utils.get_logger()
    df_metrics = pd.read_csv('{}/{}_metrics.csv'.format(dirpath_results,
                                                        model_name),
                             index_col=0)
    fpath_plot = '{}/{}_loss.png'.format(dirpath_results, model_name)

    train_ces = df_metrics['train_ce'].values
    valid_ces = df_metrics['valid_ce'].values
    if 'kmer' in model_name:
        # Clip early-iteration spikes so the k-mer curves stay readable.
        train_ces = np.clip(train_ces, a_min=min(train_ces), a_max=0.07)
        valid_ces = np.clip(valid_ces, a_min=min(valid_ces), a_max=0.07)

    # Metrics are recorded every 50 training iterations.
    iterations = [i * 50 for i in range(df_metrics.shape[0])]
    plt.plot(iterations, train_ces, color=COLORS[0], label='Train CE')
    plt.plot(iterations, valid_ces, color=COLORS[1], label='Validation CE')
    plt.xlabel('Iteration')
    plt.ylabel('Cross Entropy')
    plt.title('Cross entropy curves {}'.format(model_name), fontsize=16)
    plt.legend()
    plt.savefig(fpath_plot)
    plt.clf()
    logger.info('Saved loss curves in {}'.format(fpath_plot))
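# For reference: save_loss_curve expects the metrics CSV written by train_vae
# below, i.e. one row per 50 training iterations, an index column, and (at
# least) 'train_ce' and 'valid_ce' columns. A minimal sketch of a compatible
# file (the values are made up):
#
#     ,train_ce,valid_ce
#     0,1.92,1.95
#     1,1.40,1.52
#     2,0.98,1.21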
def generate_kmers(fpath_hierarchy, fpath_kmer):
    logger = utils.get_logger()
    for level in ['phylum', 'class', 'order']:
        logger.info('Generating K-Mers for {}'.format(level))
        fpath_level = os.path.join(fpath_kmer, level)
        if not os.path.exists(fpath_level):
            os.makedirs(fpath_level)
        for k in range(1, 7):
            logger.info('K={}'.format(k))
            kmerize_data(
                '{hie}/{level}/train.csv'.format(hie=fpath_hierarchy,
                                                 level=level),
                '{fpath}/train_{k}mer.csv'.format(fpath=fpath_level, k=k),
                k)
            kmerize_data(
                '{hie}/{level}/val.csv'.format(hie=fpath_hierarchy,
                                               level=level),
                '{fpath}/val_{k}mer.csv'.format(fpath=fpath_level, k=k),
                k)
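# kmerize_data is imported from elsewhere in the repo. A minimal sketch of
# the technique it implements, assuming the input CSVs carry 'sequence' and
# 'label' columns (the real helper evidently also carries the id/taxonomy
# columns through, as the VAE's generate_kmers below expects): count every
# length-k substring over the DNA alphabet and emit one count column per
# k-mer.
import pandas as pd
from itertools import product

def kmerize_data_sketch(fpath_in, fpath_out, k):
    df = pd.read_csv(fpath_in)
    # Enumerate all 4**k possible k-mers so every row has the same columns.
    kmers = [''.join(p) for p in product('ACGT', repeat=k)]
    rows = []
    for seq in df['sequence'].values:
        seq = seq.upper()
        counts = dict.fromkeys(kmers, 0)
        # Slide a window of length k across the sequence.
        for i in range(len(seq) - k + 1):
            sub = seq[i:i + k]
            if sub in counts:
                counts[sub] += 1
        rows.append(counts)
    df_kmer = pd.DataFrame(rows)
    df_kmer['label'] = df['label'].values
    df_kmer.to_csv(fpath_out, index=False)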
def embed_and_plot(df_data, embeddings, embed_config):
    logger = utils.get_logger()
    embed_data = embed_config['model'].fit_transform(embeddings)
    # Plot the three most frequent phyla.
    phy_labels = sorted(list(df_data['phylum'].value_counts().index[:3]))
    fpath_plot = embed_config['fpath_plot']
    for phy_idx, phy_label in enumerate(phy_labels):
        data_indices = np.where(df_data['phylum'].values == phy_label)
        data_pts = embed_data[data_indices]
        plt.scatter(data_pts[:, 0], data_pts[:, 1],
                    color=COLORS[phy_idx], label=phy_label)
    plt.xlabel(embed_config['xlabel'])
    # The original reused 'xlabel' for the y axis; prefer a 'ylabel' key and
    # fall back to 'xlabel' if the config does not provide one.
    plt.ylabel(embed_config.get('ylabel', embed_config['xlabel']))
    plt.title(embed_config['title'], fontsize=16)
    plt.legend()
    plt.savefig(fpath_plot)
    plt.clf()
    logger.info(embed_config['info_msg'].format(path=fpath_plot))
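# Illustrative only: one way to build the embed_config dict consumed above,
# using scikit-learn's TSNE. The keys mirror exactly what embed_and_plot
# reads; the path, labels, and title are hypothetical.
#
#     from sklearn.manifold import TSNE
#     embed_config = {
#         'model': TSNE(n_components=2),
#         'fpath_plot': 'results/vae/tsne_embeddings.png',
#         'xlabel': 't-SNE 1',
#         'ylabel': 't-SNE 2',
#         'title': 't-SNE of VAE embeddings by phylum',
#         'info_msg': 'Saved t-SNE plot in {path}',
#     }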
def split_data(df_taxa, level, n_samples_per_class, fpath_hierarchy):
    logger = utils.get_logger()
    fpath_level = os.path.join(fpath_hierarchy, level)
    if not os.path.exists(fpath_level):
        os.makedirs(fpath_level)
    logger.info('Creating hierarchy data for {} level'.format(level))

    # Shuffle, then cap every class at n_samples_per_class samples.
    df_group = df_taxa.sample(frac=1).groupby(by=level)
    X = []
    y = []
    for name, group in df_group:
        gX, gy = get_xy_from_df(group, level)
        X.append(gX[:int(n_samples_per_class)])
        y.append(gy[:int(n_samples_per_class)])
    X = np.vstack(X)
    y = np.concatenate(y)

    # Stratified train/test/val split.
    (train_data, testval_data, train_labels, testval_labels) = \
        train_test_split(X, y, test_size=0.4, stratify=y, shuffle=True)
    (test_data, val_data, test_labels, val_labels) = \
        train_test_split(testval_data, testval_labels, test_size=0.5,
                         stratify=testval_labels, shuffle=True)

    create_split_csv(train_data, train_labels,
                     '{}/train.csv'.format(fpath_level))
    create_split_csv(test_data, test_labels,
                     '{}/test.csv'.format(fpath_level))
    create_split_csv(val_data, val_labels,
                     '{}/val.csv'.format(fpath_level))
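# Note on the split arithmetic above: test_size=0.4 leaves 60% for training,
# and splitting the held-out 40% in half (test_size=0.5) yields a stratified
# 60/20/20 train/test/val split. For example, with 1000 samples:
#
#     train_test_split(X, y, test_size=0.4)        -> 600 train, 400 held out
#     train_test_split(X_held, y_held, test_size=0.5) -> 200 test, 200 val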
def train_model(path_config, args=None):
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    logger = utils.get_logger()
    hyperparams = utils.get_model_hyperparams("rnn")
    learning_rate = hyperparams['learning_rate']
    path_config = path_config['rnn']
    dirpath_results = path_config['dirpath_rnn']

    # Train one classifier per taxonomy level; each level has its own
    # Parameters (label count) and its own data splits.
    for level in ['phylum', 'class', 'order']:
        logger.info('Training RNN {} Classifier'.format(level.capitalize()))
        parameters = Parameters(level)
        data_config = path_config[level]
        train_data = load_data(data_config['train'])
        valid_data = load_data(data_config['val'])
        test_data = load_data(data_config['test'])
        model = LSTMClassifier(parameters.embedding_dim, parameters.hidden_dim,
                               parameters.vocab_size, parameters.label_size,
                               device).to(device)
        loss_fun = nn.NLLLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        train(train_data, valid_data, test_data, model, loss_fun, optimizer,
              dirpath_results, parameters, device, logger, args.is_demo,
              'rnn_{}'.format(level))
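# Parameters is defined elsewhere in the repo. Based only on how it is used
# here (constructed from a level name, read by LSTMClassifier), a minimal
# sketch might look like the following; all dimensions are hypothetical
# placeholders, and label_size would be the class count for the level.
class ParametersSketch:
    def __init__(self, level):
        self.embedding_dim = 64   # hypothetical
        self.hidden_dim = 128     # hypothetical
        self.vocab_size = 6       # e.g. A, C, G, T, N plus padding (assumed)
        self.label_size = {'phylum': 3, 'class': 5, 'order': 10}[level]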
def generate_kmers(fpath_embeds, dirpath_vae, k):
    logger = utils.get_logger()
    logger.info('Generating {}-mers for VAE'.format(k))
    fpath_embed_kmer = '{}/embeds_{}mer.csv'.format(dirpath_vae, k)
    kmerize_data(fpath_embeds, fpath_embed_kmer, k)
    # Drop the id/taxonomy columns and persist the raw k-mer feature matrix.
    df_kmer = pd.read_csv(fpath_embed_kmer).drop(
        ['id', 'phylum', 'class', 'order'], axis=1)
    data_kmer = df_kmer.values.astype(np.float16)
    np.save('{}/features_{}mer.npy'.format(dirpath_vae, k), data_kmer)
def create_hierarchy(fpath_taxa, fpath_hierarchy):
    logger = utils.get_logger()
    logger.info('Creating hierarchy files from taxa.csv')
    if not os.path.exists(fpath_hierarchy):
        os.makedirs(fpath_hierarchy)
    df_taxa = pd.read_csv(fpath_taxa)
    split_data(df_taxa, 'phylum', MAX_SAMPLES_PER_CLASS, fpath_hierarchy)
    split_data(df_taxa, 'class', MAX_SAMPLES_PER_CLASS, fpath_hierarchy)
    # Rare order-level labels are grouped into an 'other' bucket first.
    df_group = group_labels(df_taxa, ORDER_OTHER_LABELS)
    split_data(df_group, 'order', MAX_SAMPLES_PER_CLASS, fpath_hierarchy)
def generate_vae_data(fpath_embeds, dirpath_vae):
    logger = utils.get_logger()
    logger.info('Generating data files for VAE training')
    if not os.path.exists(dirpath_vae):
        os.makedirs(dirpath_vae)
    df_embeds = pd.read_csv(fpath_embeds)
    seqs = df_embeds['sequence'].values
    with open('{}/ordinal_sequences.txt'.format(dirpath_vae), 'w') as seqfile:
        seqfile.write('\n'.join(seqs))
    generate_kmers(fpath_embeds, dirpath_vae, 4)
    generate_kmers(fpath_embeds, dirpath_vae, 5)
def train_basic(models, dirpath_kmer, dirpath_output, kmin, kmax, is_demo):
    logger = utils.get_logger()
    for model in models:
        model_str = type(model).__name__.lower()
        for level in ['phylum', 'class', 'order']:
            combined_train_data = []
            combined_val_data = []
            for k in range(kmin, kmax + 1):
                df_train = pd.read_csv('{}/{}/train_{}mer.csv'.format(
                    dirpath_kmer, level, k))
                df_val = pd.read_csv('{}/{}/val_{}mer.csv'.format(
                    dirpath_kmer, level, k))
                train_f1, pred_val = train_kmer_for_level(
                    model, level, k, df_train, df_val)
                # Collect per-k features (the last two columns are labels)
                # for the combined run below.
                combined_train_data.append(
                    df_train.values[:, :-2].astype(np.float16))
                combined_val_data.append(
                    df_val.values[:, :-2].astype(np.float16))
                logger.info(
                    'Train F1 Score for {} model for {} level and k={} is '
                    '{:.3f}'.format(model_str, level, k, train_f1))
                np.save('{}/{}_preds_{}_{}mer.npy'.format(
                    dirpath_output, model_str, level, k), pred_val)

            if not is_demo:
                # Labels are identical for every k, so the last loaded split
                # provides them.
                combined_train_data = np.hstack(combined_train_data)
                combined_val_data = np.hstack(combined_val_data)
                train_labels = df_train['label'].values.astype(np.int8)
                val_labels = df_val['label'].values.astype(np.int8)
                combined_f1, combined_pred = train_kmer_combined(
                    model, level, combined_train_data, train_labels,
                    combined_val_data, val_labels)
                logger.info(
                    ('Train F1 Score for {} model for {} level and combined '
                     'K from {}-{} is {:.3f}').format(
                         model_str, level, kmin, kmax, combined_f1))
                np.save('{}/{}_preds_{}_combined.npy'.format(
                    dirpath_output, model_str, level), combined_pred)
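# Example invocation, mirroring how the basic_kmer branch of train_model
# below wires this up (the paths are hypothetical placeholders):
#
#     models = [SVC(), RandomForestClassifier()]
#     train_basic(models, dirpath_kmer='data/kmer',
#                 dirpath_output='results/basic_kmer',
#                 kmin=1, kmax=6, is_demo=False)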
def get_embeddings(fpath_data, is_kmer, feature_type, use_gpu,
                   dirpath_results, model_name):
    logger = utils.get_logger()
    logger.info('Running inference and obtaining embeddings for {}'.format(
        feature_type))
    if is_kmer:
        kval = int(feature_type.split('_')[1])
        fpath_features = '{}/features_{}mer.npy'.format(fpath_data, kval)
        features = np.load(fpath_features)
    else:
        fpath_features = '{}/ordinal_sequences.txt'.format(fpath_data)
        with open(fpath_features, 'r') as f:
            features = f.read().split('\n')

    batch_loader = BatchLoader(data_path=fpath_features, is_kmer=is_kmer)
    parameters = Parameters(batch_loader.vocab_size,
                            feature_type=feature_type)
    vae = VAE(parameters)
    if use_gpu:
        vae = vae.cuda()
    vae.load_state_dict(
        t.load('{}/{}_best.pth'.format(dirpath_results, model_name)))

    # Split the features into fixed-size batches; the last batch may be short.
    batch_size = 5000
    batch_indices = np.arange(start=batch_size, stop=len(features),
                              step=batch_size)
    feature_batches = np.split(features, batch_indices)
    embeddings = []
    for feature_batch in feature_batches:
        if not is_kmer:
            idxs = [batch_loader._get_idxs(seq) for seq in feature_batch]
            encoder_input, _, _ = batch_loader._wrap_tensor(
                idxs, use_cuda=use_gpu)
        else:
            encoder_input, _, _ = batch_loader._wrap_tensor(
                feature_batch, use_cuda=use_gpu)
        # Use the posterior mean as the embedding.
        mu, _ = vae.inference(encoder_input)
        embeddings.extend(mu.data.cpu().numpy())
    return np.array(embeddings)
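# The batching above relies on np.split semantics: splitting at indices
# [batch_size, 2 * batch_size, ...] yields chunks of at most batch_size
# items, with a final short chunk when the length is not a multiple.
# For example:
#
#     np.split(np.arange(7), [3, 6])
#     # -> [array([0, 1, 2]), array([3, 4, 5]), array([6])]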
def plot_kmer_metrics(path_config, args):
    logger = utils.get_logger()
    path_config = path_config['basic_kmer']
    dirpath_kmer = path_config['dirpath_kmer']
    dirpath_results = path_config['results']
    model_display = {'svc': 'SVM', 'randomforestclassifier': 'Random Forest'}
    xticks = [str(i) for i in range(1, 7)]
    xticks.append('combined')

    for model in ['svc', 'randomforestclassifier']:
        model_scores = {}
        for i, level in enumerate(['phylum', 'class', 'order']):
            model_scores[level] = []
            # Ground-truth labels are identical for every k; the 1-mer file
            # is just a convenient source.
            df_data = pd.read_csv('{}/{}/val_1mer.csv'.format(
                dirpath_kmer, level))
            gt_y = df_data['label'].values.astype(np.int8)
            for k in range(1, 7):
                pred_y = np.load('{}/{}_preds_{}_{}mer.npy'.format(
                    dirpath_results, model, level, k))
                model_scores[level].append(
                    f1_score(gt_y, pred_y, average='macro'))
            pred_y = np.load('{}/{}_preds_{}_combined.npy'.format(
                dirpath_results, model, level))
            model_scores[level].append(
                f1_score(gt_y, pred_y, average='macro'))
            plt.plot(range(1, 8), model_scores[level], label=level,
                     color=COLORS[i])
        fpath_plot = '{}/{}_kmer.png'.format(dirpath_results, model)
        plt.xticks(range(1, 8), xticks)
        plt.xlabel('K-Mer length')
        plt.ylabel('F1 Score')
        plt.title('F1 Metrics for {}'.format(model_display[model]))
        plt.legend()
        plt.savefig(fpath_plot)
        logger.info('Saving K-Mer comparison plot for {} in {}'.format(
            model_display[model], fpath_plot))
        plt.clf()
def test_model(path_config, args=None):
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    logger = utils.get_logger()
    path_config = path_config['rnn']
    dirpath_results = path_config['dirpath_rnn']

    # Evaluate the per-level classifiers trained above.
    for level in ['phylum', 'class', 'order']:
        logger.info('Testing RNN {} Classifier'.format(level.capitalize()))
        parameters = Parameters(level)
        data_config = path_config[level]
        test_data = load_data(data_config['test'])
        model = LSTMClassifier(parameters.embedding_dim, parameters.hidden_dim,
                               parameters.vocab_size, parameters.label_size,
                               device).to(device)
        evaluate(test_data, model, dirpath_results, parameters, device,
                 logger, 'rnn_{}'.format(level))
def train_model(path_config, args=None):
    if args.model_name == 'basic_kmer':
        logger = utils.get_logger()
        path_config = path_config['basic_kmer']
        kmer_config = utils.get_model_hyperparams('basic_kmer')
        models = [
            SVC(**kmer_config['svm']),
            RandomForestClassifier(**kmer_config['random_forest'])
        ]
        if args.is_demo:
            logger.info(
                'WARNING! Running in demo mode. SVM training is slow, so '
                'training runs only for k=1 to 4 and the combined-data run '
                'is skipped.')
            kmer_config['kmax'] = 4
        kmer_train_basic(models,
                         dirpath_kmer=path_config['dirpath_kmer'],
                         dirpath_output=path_config['results'],
                         kmin=kmer_config['kmin'],
                         kmax=kmer_config['kmax'],
                         is_demo=args.is_demo)
    elif args.model_name == 'basic_vector':
        path_config = path_config['basic_vector']
        vector_train_basic(dirpath_vector=path_config['dirpath_vector'],
                           dirpath_output=path_config['results'])
    elif args.model_name == 'basic_onehot':
        path_config = path_config['basic_onehot']
        onehot_train_basic(dirpath_vector=path_config['dirpath_onehot'],
                           dirpath_output=path_config['results'])
    else:
        raise ValueError('Basic ML model {} not supported'.format(
            args.model_name))
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.utils.data as torch_utils

from ml.utils import get_logger

plt.style.use('seaborn')
warnings.filterwarnings("ignore")
torch.set_num_threads(1)
LOGGER = get_logger()


def read_data_from_csv(path):
    df = pd.read_csv(path)
    X = df["sequence"].values
    y = df["label"].values
    y_names = df["class_name"].values
    return X, y, y_names


def load_train_val_test_data(base_path, level, analyze=True,
                             return_label_names=False):
    base_path_on_level = os.path.join(base_path, level)
    # Minimal completion, under the assumption that each level directory
    # holds the train/val/test CSVs produced by split_data.
    splits = {}
    for split in ['train', 'val', 'test']:
        fpath = os.path.join(base_path_on_level, '{}.csv'.format(split))
        X, y, y_names = read_data_from_csv(fpath)
        if analyze:
            LOGGER.info('{} {}: {} samples, {} classes'.format(
                level, split, len(y), len(np.unique(y))))
        splits[split] = (X, y, y_names) if return_label_names else (X, y)
    return splits['train'], splits['val'], splits['test']
def train_basic(dirpath_vector, dirpath_output):
    logger = utils.get_logger()

    def read_lines(fpath):
        # Each CSV is read as raw lines; the first line is the header.
        return np.genfromtxt(fpath, delimiter='\n', dtype=None, encoding=None)

    def encode_and_pad(lines):
        # Ordinal-encode the sequence column (index 3) and zero-pad every
        # sequence to the longest one, giving an (n, max_len, 5) array.
        encoded = [ordinal_encoder(string_to_array(line.split(",")[3]))
                   for line in lines[1:]]
        max_len = max(len(seq) for seq in encoded)
        padded = np.zeros((len(encoded), max_len, 5))
        for i, seq in enumerate(encoded):
            padded[i][:len(seq)] = seq
        return padded

    def build_ordinal_h5(dirpath_level):
        # Cache the padded train/test/val encodings as HDF5 datasets.
        with h5py.File(dirpath_level + '/ordinal.h5', 'w') as hf:
            hf.create_dataset('dataset_1', data=encode_and_pad(
                read_lines(dirpath_level + '/train.csv')))
            hf.create_dataset('dataset_2', data=encode_and_pad(
                read_lines(dirpath_level + '/test.csv')))
            hf.create_dataset('dataset_3', data=encode_and_pad(
                read_lines(dirpath_level + '/val.csv')))

    def load_ordinal_h5(dirpath_level):
        # dataset_1 is train, dataset_2 is test (dataset_3, val, is unused here).
        with h5py.File(dirpath_level + '/ordinal.h5', 'r') as hf:
            return np.array(hf.get('dataset_1')), np.array(hf.get('dataset_2'))

    def phylum_labels(lines):
        # Phylum labels are derived from the first character of the id
        # column: 'A' -> 0, 'F' -> 1, anything else -> 2.
        labels = []
        for line in lines[1:]:
            first_char = line.split(",")[0][0]
            if first_char == "A":
                labels.append(0)
            elif first_char == "F":
                labels.append(1)
            else:
                labels.append(2)
        return np.array(labels)

    def class_labels(lines):
        # Class labels are stored directly in the third column.
        return np.array([int(line.split(",")[2]) for line in lines[1:]])

    def train_level(level, get_labels):
        X, Y = load_ordinal_h5(dirpath_vector + '/' + level)
        label = get_labels(read_lines(
            dirpath_vector + '/' + level + '/train.csv'))
        # Flatten (n, max_len, 5) into (n, max_len * 5) feature vectors.
        new_X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
        new_Y = Y.reshape(Y.shape[0], Y.shape[1] * Y.shape[2])
        svm = SVC(kernel='rbf')
        rf = RandomForestClassifier()
        svm.fit(new_X, label)
        rf.fit(new_X, label)
        # Persist test-set predictions; the SVM "scores" are its maximum
        # decision margin per sample, the RF "scores" its raw predictions.
        np.save(dirpath_output + '/SVM_{}_predictions'.format(level),
                svm.predict(new_Y))
        np.save(dirpath_output + '/RF_{}_predictions'.format(level),
                rf.predict(new_Y))
        np.save(dirpath_output + '/SVM_{}_scores'.format(level),
                np.amax(svm.decision_function(new_Y), axis=1))
        np.save(dirpath_output + '/RF_{}_scores'.format(level),
                rf.predict(new_Y))
        # Report training-set metrics for both models.
        for name, model in [('SVM', svm), ('Random Forest', rf)]:
            preds = model.predict(new_X)
            accuracy = float(np.sum(preds == label)) / preds.shape[0]
            p, r, f1, _ = precision_recall_fscore_support(
                label, preds, average='weighted')
            logger.info(('Train Accuracy, precision, recall and F1 Score for '
                         '{} model for {} level is {:.3f}, {:.3f}, {:.3f}, '
                         '{:.3f}').format(name, level, accuracy, p, r, f1))

    build_ordinal_h5(dirpath_vector + '/phylum')
    build_ordinal_h5(dirpath_vector + '/class')
    train_level('phylum', phylum_labels)
    train_level('class', class_labels)
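# string_to_array and ordinal_encoder are imported helpers not shown here.
# A minimal sketch of the pattern they typically implement for DNA data
# (an assumption: the real encoders may use different codes, though the
# 5-wide arrays train_basic pads to suggest a per-base vector over
# {A, C, G, T, other}):
import re
import numpy as np

def string_to_array_sketch(seq):
    # Lowercase and replace anything outside a/c/g/t with a placeholder.
    seq = re.sub('[^acgt]', 'n', seq.lower())
    return np.array(list(seq))

def ordinal_encoder_sketch(char_array):
    # Map each base to a one-hot row over the 5-letter alphabet.
    alphabet = {'a': 0, 'c': 1, 'g': 2, 't': 3, 'n': 4}
    encoded = np.zeros((len(char_array), 5))
    for i, base in enumerate(char_array):
        encoded[i, alphabet[base]] = 1.0
    return encoded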
def train_vae(path_config, feature_type, hyperparams, model_name):
    t.cuda.empty_cache()
    logger = utils.get_logger()
    is_kmer = ('kmer' in feature_type)
    fpath_data = (path_config['features'] if is_kmer
                  else path_config['sequences'])
    dirpath_results = path_config['results']
    use_gpu = t.cuda.is_available()

    batch_loader = BatchLoader(data_path=fpath_data, is_kmer=is_kmer)
    parameters = Parameters(batch_loader.vocab_size,
                            feature_type=feature_type)
    vae = VAE(parameters)
    if use_gpu:
        vae = vae.cuda()
    optimizer = Adam(vae.parameters(), hyperparams['learning_rate'])

    metrics = []
    min_ce = 1000
    num = hyperparams['num_iterations']
    for iteration in tqdm(range(num), total=num):
        # Train step: reconstruction CE + weighted auxiliary CE + KL term.
        input, decoder_input, target = batch_loader.next_batch(
            hyperparams['batch_size'], 'train', use_gpu)
        target = target.view(-1)
        logits, aux_logits, kld = vae(hyperparams['dropout'], input,
                                      decoder_input)
        logits = logits.view(-1, batch_loader.vocab_size)
        cross_entropy = F.cross_entropy(logits, target, reduction='sum')
        aux_logits = aux_logits.view(-1, batch_loader.vocab_size)
        aux_cross_entropy = F.cross_entropy(aux_logits, target,
                                            reduction='sum')
        loss = cross_entropy + hyperparams['aux'] * aux_cross_entropy + kld
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Validation step: metrics only, no parameter updates.
        with t.no_grad():
            input, decoder_input, target = batch_loader.next_batch(
                hyperparams['batch_size'], 'valid', use_gpu)
            target = target.view(-1)
            logits, aux_logits, valid_kld = vae(hyperparams['dropout'], input,
                                                decoder_input)
            logits = logits.view(-1, batch_loader.vocab_size)
            valid_cross_entropy = F.cross_entropy(logits, target,
                                                  reduction='sum')
            aux_logits = aux_logits.view(-1, batch_loader.vocab_size)
            valid_aux_cross_entropy = F.cross_entropy(aux_logits, target,
                                                      reduction='sum')

        if iteration % 50 == 0:
            metrics_dict = get_metrics_dict(cross_entropy, aux_cross_entropy,
                                            kld, valid_cross_entropy,
                                            valid_aux_cross_entropy,
                                            valid_kld,
                                            hyperparams['batch_size'])
            metrics.append(metrics_dict)
            valid_ce = metrics_dict['valid_ce']
            if valid_ce <= min_ce:
                min_ce = valid_ce
                logger.info(
                    'Saving best model in iteration {}'.format(iteration))
                t.save(vae.state_dict(),
                       '{}/{}_best.pth'.format(dirpath_results, model_name))

    logger.info('Saving final metrics')
    df_metrics = pd.DataFrame(metrics)
    df_metrics.to_csv('{}/{}_metrics.csv'.format(dirpath_results, model_name))
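# For reference, these are the hyperparameter keys train_vae reads from its
# hyperparams dict; the values below are illustrative placeholders, not the
# repo's actual configuration:
#
#     hyperparams = {
#         'learning_rate': 5e-4,
#         'num_iterations': 10000,
#         'batch_size': 32,
#         'dropout': 0.3,
#         'aux': 0.2,   # weight on the auxiliary reconstruction term
#     }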
def test_basic(dirpath_vector, dirpath_output, verbose=True):
    logger = utils.get_logger()
    if not os.path.exists(dirpath_output):
        os.makedirs(dirpath_output)

    def read_lines(fpath):
        # Each CSV is read as raw lines; the first line is the header.
        return np.genfromtxt(fpath, delimiter='\n', dtype=None, encoding=None)

    def encode_and_pad(lines):
        # Ordinal-encode the sequence column (index 3) and zero-pad to the
        # longest sequence, giving an (n, max_len) matrix.
        encoded = [ordinal_encoder(string_to_array(line.split(",")[3]))
                   for line in lines[1:]]
        max_len = max(len(seq) for seq in encoded)
        padded = np.zeros((len(encoded), max_len))
        for i, seq in enumerate(encoded):
            padded[i][:len(seq)] = seq
        return padded

    def build_ordinal_h5(dirpath_level):
        # Cache the padded train/test/val encodings as HDF5 datasets.
        with h5py.File(dirpath_level + '/ordinal.h5', 'w') as hf:
            hf.create_dataset('dataset_1', data=encode_and_pad(
                read_lines(dirpath_level + '/train.csv')))
            hf.create_dataset('dataset_2', data=encode_and_pad(
                read_lines(dirpath_level + '/test.csv')))
            hf.create_dataset('dataset_3', data=encode_and_pad(
                read_lines(dirpath_level + '/val.csv')))

    def load_ordinal_h5(dirpath_level):
        # dataset_1 is train, dataset_2 is test (dataset_3, val, is unused here).
        with h5py.File(dirpath_level + '/ordinal.h5', 'r') as hf:
            return np.array(hf.get('dataset_1')), np.array(hf.get('dataset_2'))

    def phylum_labels(lines):
        # 'A' -> 0, 'F' -> 1, anything else -> 2, from the id column.
        labels = []
        for line in lines[1:]:
            first_char = line.split(",")[0][0]
            if first_char == "A":
                labels.append(0)
            elif first_char == "F":
                labels.append(1)
            else:
                labels.append(2)
        return np.array(labels)

    def class_labels(lines):
        return np.array([int(line.split(",")[2]) for line in lines[1:]])

    def evaluate_level(level, get_labels, title):
        X, Y = load_ordinal_h5(dirpath_vector + '/' + level)
        train_label = get_labels(read_lines(
            dirpath_vector + '/' + level + '/train.csv'))
        test_label = get_labels(read_lines(
            dirpath_vector + '/' + level + '/test.csv'))
        svm = SVC(kernel='rbf')
        rf = RandomForestClassifier()
        svm.fit(X, train_label)
        rf.fit(X, train_label)
        svm_preds = svm.predict(Y)
        rf_preds = rf.predict(Y)
        # ROC with class 2 as the positive label; the SVM is scored by its
        # maximum decision margin, the RF by its raw predictions (matching
        # the scores saved during training).
        svm_score = np.amax(svm.decision_function(Y), axis=1)
        fpr_svm, tpr_svm, _ = roc_curve(test_label, svm_score, pos_label=2)
        fpr_rf, tpr_rf, _ = roc_curve(test_label, rf_preds, pos_label=2)
        plt.figure()
        plt.plot(fpr_svm, tpr_svm, lw=1,
                 label='SVM (AUC = %0.2f)' % auc(fpr_svm, tpr_svm))
        plt.plot(fpr_rf, tpr_rf, lw=1,
                 label='RandomForest (AUC = %0.2f)' % auc(fpr_rf, tpr_rf))
        plt.plot([0, 1], [0, 1], 'k--')
        plt.title('ROC curve SVM vs RF - {} Level'.format(title))
        plt.legend()
        plt.xlabel('fpr')
        plt.ylabel('tpr')
        plt.savefig(dirpath_output + '/ROC_' + title)
        # Report test-set metrics for both models.
        for name, preds in [('SVM', svm_preds), ('Random Forest', rf_preds)]:
            accuracy = float(np.sum(preds == test_label)) / preds.shape[0]
            p, r, f1, _ = precision_recall_fscore_support(
                test_label, preds, average='weighted')
            logger.info(('Test Accuracy, precision, recall and F1 Score for '
                         '{} model for {} level is {:.3f}, {:.3f}, {:.3f}, '
                         '{:.3f}').format(name, level, accuracy, p, r, f1))

    build_ordinal_h5(dirpath_vector + '/phylum')
    build_ordinal_h5(dirpath_vector + '/class')
    evaluate_level('phylum', phylum_labels, 'Phylum')
    evaluate_level('class', class_labels, 'Class')
def test_model(path_config, args=None):
    logger = utils.get_logger()
    dirpath_results = path_config[args.model_name]['results']
    predicted = np.load(dirpath_results + "/" + args.model_name +
                        "_predicted.npy")
    lab = np.genfromtxt(path_config[args.model_name]['test'], delimiter='\n',
                        dtype=None, encoding=None)
    scheme = [
        'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
        'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan'
    ]

    # Ground-truth labels: derived from the id column for the ordinal model,
    # read from the last column otherwise. Cast to int so they can index the
    # colour scheme below.
    labels = []
    if args.model_name == "lstm_vae_ordinal":
        for item in lab[1:]:
            if item.split(",")[0][0] == "P":
                labels.append(0)
            elif item.split(",")[0][0] == "F":
                labels.append(1)
            else:
                labels.append(2)
    else:
        for item in lab[1:]:
            labels.append(int(float(item.split(",")[-1])))
    label = np.array(labels)

    # Hierarchical clustering in 3D: 10 order-level clusters, then 5
    # class-level clusters of their centers, then 3 phylum-level clusters.
    pred_new = PCA(n_components=3).fit_transform(predicted)
    est = KMeans(n_clusters=10, init=predicted[:10])
    plotFigs3D(est, pred_new, 'hclustering_order_10-3D', args, predicted,
               path_config, scheme)
    est2 = KMeans(n_clusters=5)
    pred_new2 = PCA(n_components=3).fit_transform(est.cluster_centers_)
    plotFigs3D(est2, pred_new2, 'hclustering_class_5-3D', args,
               est.cluster_centers_, path_config, scheme)
    est3 = KMeans(n_clusters=3)
    pred_new3 = PCA(n_components=3).fit_transform(est2.cluster_centers_)
    plotFigs3D(est3, pred_new3, 'hclustering_phylum_3-3D', args,
               est2.cluster_centers_, path_config, scheme)

    # Ground-truth 3D scatter for comparison.
    fig = plt.figure(figsize=(4, 3))
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    labelslis = [scheme[label[i]] for i in range(label.shape[0])]
    ax.scatter(pred_new[:, 0], pred_new[:, 1], pred_new[:, 2], c=labelslis,
               edgecolor='k')
    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_zlabel('PC3')
    ax.set_title('3D_groundClustering')
    plt.savefig(dirpath_results + '/3D_groundClustering')
    logger.info('3D clustering completed, plot stored in ' + dirpath_results)

    # Repeat the cluster hierarchy in 2D.
    pred_new = PCA(n_components=2).fit_transform(predicted)
    est = KMeans(n_clusters=10, init=predicted[:10])
    plotFigs2D(est, pred_new, 'hclustering_order_10-2D', args, predicted,
               path_config, scheme)
    pred_new2 = PCA(n_components=2).fit_transform(est.cluster_centers_)
    est2 = KMeans(n_clusters=5)
    plotFigs2D(est2, pred_new2, 'hclustering_class_5-2D', args,
               est.cluster_centers_, path_config, scheme)
    pred_new3 = PCA(n_components=2).fit_transform(est2.cluster_centers_)
    est3 = KMeans(n_clusters=3)
    plotFigs2D(est3, pred_new3, 'hclustering_phylum_3-2D', args,
               est2.cluster_centers_, path_config, scheme)

    # Ground-truth 2D scatter.
    pred_new_ground = PCA(n_components=2).fit_transform(predicted)
    plt.figure(figsize=(4, 3))
    plt.scatter(pred_new_ground[:, 0], pred_new_ground[:, 1], c=labelslis,
                edgecolor='k')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('hclustering_ground2D')
    plt.savefig(dirpath_results + '/hclustering_ground2D')
    logger.info('2D clustering completed, plot stored in ' + dirpath_results)
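# plotFigs2D / plotFigs3D are defined elsewhere in the repo. A minimal sketch
# of the 2D variant, inferred from the call sites above (an assumption: the
# real helper may differ): fit KMeans on the raw embeddings, then scatter the
# PCA-projected points coloured by cluster assignment. Note the call sites
# access est.cluster_centers_ afterwards, so the helper must fit est.
def plotFigs2D_sketch(est, pred_new, name, args, data, path_config, scheme):
    est.fit(data)
    colors = [scheme[c] for c in est.labels_]
    plt.figure(figsize=(4, 3))
    plt.scatter(pred_new[:, 0], pred_new[:, 1], c=colors, edgecolor='k')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title(name)
    plt.savefig(path_config[args.model_name]['results'] + '/' + name)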