def tune_and_evaluate(self, estimator, parameters, score='f1_macro', file_name='results'):
    '''
    :param estimator: scikit-learn estimator to be tuned
    :param parameters: parameter grid for the grid search
    :param score: scoring key used for refitting
    :param file_name: directory/tuning/classifier/features/
    :return:
    '''
    # inner cross-validation
    self.greed_search = GridSearchCV(estimator=estimator, param_grid=parameters,
                                     cv=self.inner_cv, scoring=self.scoring,
                                     refit=score, error_score=0)
    # nested CV with parameter optimization
    self.nested_score = cross_val_score(self.greed_search, X=self.X, y=self.Y,
                                        cv=self.outer_cv)
    # saving (FileUtility.save_obj takes the file name first, then the object)
    FileUtility.save_obj(file_name, [self.greed_search, self.nested_score])
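# Self-contained sketch of the nested cross-validation pattern above: the inner
# grid search tunes hyper-parameters, the outer loop scores the tuned model on
# held-out folds (toy data; estimator and grid are illustrative, not the repo's).
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC

X, Y = load_iris(return_X_y=True)
inner = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}, cv=4, scoring='f1_macro')
nested_scores = cross_val_score(inner, X=X, y=Y, cv=5)
print(nested_scores.mean())  # generalization estimate unbiased by the tuning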
def tune_and_evaluate(self, estimator, parameters, score='macro_f1', n_jobs=-1, file_name='results'):
    '''
    :param estimator: scikit-learn estimator to be tuned
    :param parameters: parameter grid for the grid search
    :param score: scoring key used for refitting
    :param n_jobs: number of parallel jobs
    :param file_name: directory/tuning/classifier/features/
    :return:
    '''
    # grid search
    self.greed_search = GridSearchCV(estimator=estimator, param_grid=parameters,
                                     cv=self.cv, scoring=self.scoring,
                                     refit=score, error_score=0, n_jobs=n_jobs)
    label_set = list(set(self.Y))
    # fitting
    self.greed_search.fit(X=self.X, y=self.Y)
    y_predicted = cross_val_predict(self.greed_search.best_estimator_, self.X, self.Y)
    conf = confusion_matrix(self.Y, y_predicted, labels=label_set)
    # save in file
    FileUtility.save_obj(file_name, [label_set, conf,
                                     self.greed_search.best_score_,
                                     self.greed_search.best_estimator_,
                                     self.greed_search.cv_results_,
                                     self.greed_search.best_params_,
                                     y_predicted])
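# Self-contained sketch of the tune/predict/confusion pattern implemented by the
# method above, on scikit-learn toy data (all names here are illustrative).
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.svm import SVC

X, Y = load_iris(return_X_y=True)
grid = GridSearchCV(SVC(), {'C': [0.1, 1, 10]},
                    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
                    scoring='f1_macro', error_score=0)
grid.fit(X, Y)
y_hat = cross_val_predict(grid.best_estimator_, X, Y, cv=5)
print(confusion_matrix(Y, y_hat, labels=sorted(set(Y))))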
def motif_extraction(self, topn=100):
    # term-frequency matrix over whitespace-tokenized, CPE-extended sequences
    cpe_vectorizer = TfidfVectorizer(use_idf=False, analyzer='word', norm=None,
                                     stop_words=[], lowercase=True, binary=False,
                                     tokenizer=str.split)
    tf_vec = cpe_vectorizer.fit_transform(self.extended_sequences)
    vocab = cpe_vectorizer.get_feature_names()
    # chi-square feature selection with FDR correction; keep positively associated motifs
    CH = Chi2Analysis(tf_vec, self.labels, vocab)
    vocab_binary = [x[0] for x in CH.extract_features_fdr(self.output_path + '/motifs.txt',
                                                          N=topn, alpha=5e-2,
                                                          direction=True,
                                                          allow_subseq=True,
                                                          binarization=True,
                                                          remove_redundant_markers=False)
                    if x[1] > 0]
    vocab_binary = vocab_binary[0:min(topn, len(vocab_binary))]
    idxs = [vocab.index(v) for v in vocab_binary]
    # pairwise symmetric KL divergence between motif profiles over the positive samples
    pos_matrix = tf_vec.toarray()[0:len(self.pos), idxs]
    DIST = get_sym_kl_rows(pos_matrix.T)
    FileUtility.save_obj(self.output_path + '/sym_KL', DIST)
    # HC = HierarchicalClutering(DIST, vocab_binary)
    self.motifs = vocab_binary
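# A minimal sketch of what get_sym_kl_rows presumably computes (the repo's own
# helper lives elsewhere and may differ): the symmetric KL divergence between
# every pair of rows, after normalizing each row into a distribution.
import numpy as np

def sym_kl_rows_sketch(mat, eps=1e-10):
    P = np.asarray(mat, dtype=float) + eps
    P /= P.sum(axis=1, keepdims=True)           # rows as probability distributions
    logP = np.log(P)
    # kl[i, j] = KL(P_i || P_j) = sum_k P_ik * (log P_ik - log P_jk)
    kl = (P[:, None, :] * (logP[:, None, :] - logP[None, :, :])).sum(axis=2)
    return kl + kl.T                            # symmetrize: KL(i||j) + KL(j||i)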
sampled_lengths = [10000, 20000, 50000, 100000, 200000, 500000, -1]
triples = dict()
for i in sampled_lengths:
    print(i)
    f = open('../data_config/swissprot_ppe', 'r')
    CPE_Applier = CPE(f, separator='', merge_size=i)
    sequences = FileUtility.read_fasta_sequences('../data_config/ss_N.txt')
    for pdb_idx, (x, y) in tqdm.tqdm(enumerate(pairwise(sequences))):
        segments = CPE_Applier.segment(x).split()
        label_segments = according_segmentation(segments, y)
        if i not in triples:
            triples[i] = []
        triples[i] += [(seg, label_segments[idx], pdb_idx)
                       for idx, seg in enumerate(segments)]

for i in sampled_lengths:
    FileUtility.save_obj('../data_config/pdbsegments_' + str(i), triples[i])

# mapping of motifs to PDB ids
seq_ids = [x.strip() for x in FileUtility.load_list('../data_config/ss_N.txt')
           if x.strip()[0] == '>']
idx2pdb = {idx: ':'.join(val[1::].split(':')[0:2])
           for idx, val in enumerate(seq_ids[::2])}
pdb2idx = {':'.join(val[1::].split(':')[0:2]): idx
           for idx, val in enumerate(seq_ids[::2])}
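# The script above relies on two helpers defined elsewhere in the repo; the
# sketches below reflect their apparent contracts and are assumptions, not the
# repo's code. `pairwise` groups the FASTA entries into non-overlapping
# (sequence, per-residue label) pairs, and `according_segmentation` splits the
# label string into chunks matching the CPE segment lengths.
def pairwise(iterable):
    it = iter(iterable)
    return zip(it, it)  # (0, 1), (2, 3), ... non-overlapping pairs

def according_segmentation(segments, labels):
    out, pos = [], 0
    for seg in segments:
        out.append(labels[pos:pos + len(seg)])  # labels aligned to this segment
        pos += len(seg)
    return out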
def tune_and_evaluate(self, estimator, parameters, cv_inner=5, score='f1_macro',
                      n_jobs=-1, file_name='results', NUM_TRIALS=3):
    '''
    :param estimator: scikit-learn estimator to be tuned
    :param parameters: parameter grid for the grid search
    :param cv_inner: number of folds in the inner loop
    :param score: scoring key used for refitting
    :param n_jobs: number of parallel jobs
    :param file_name: directory/tuning/classifier/features/
    :param NUM_TRIALS: number of nested-CV trials with different inner splits
    :return:
    '''
    self.nested_scores = []
    cv_dicts = []
    test_predictions_in_trials = []
    best_params_in_trials = []

    # loop over the trials
    for i in tqdm.tqdm(range(NUM_TRIALS)):
        # choose the inner cross-validation technique independently of the
        # dataset, e.g. GroupKFold, LeaveOneOut, LeaveOneGroupOut, etc.
        inner_cv = StratifiedKFold(n_splits=cv_inner, shuffle=True, random_state=i)
        # parameter search and scoring
        self.greed_search = GridSearchCV(estimator=estimator, param_grid=parameters,
                                         cv=inner_cv, scoring=self.scoring,
                                         refit=score, error_score=0,
                                         n_jobs=n_jobs, verbose=0)
        # nested CV with parameter optimization
        nested_score = cross_val_score(self.greed_search, X=self.X, y=self.Y,
                                       cv=self.cv, n_jobs=1, scoring=score)
        self.nested_scores.append(nested_score)
        # nested CV predictions
        cv_dict_pred = cross_val_predict(self.greed_search, X=self.X, y=self.Y,
                                         cv=self.cv, n_jobs=1)
        cv_dicts.append(cv_dict_pred)

    # collect the cross-validation results
    cv_predictions_pred = []
    cv_predictions_trues = []
    # non-nested parameter search and scoring
    self.greed_search = GridSearchCV(estimator=estimator, param_grid=parameters,
                                     cv=self.cv, scoring=self.scoring,
                                     refit=score, error_score=0,
                                     n_jobs=n_jobs, verbose=0)
    self.greed_search.fit(X=self.X, y=self.Y)

    isolates = []
    for train, test in self.cv:
        self.greed_search.best_estimator_.fit(self.X[train, :],
                                              [self.Y[idx] for idx in train])
        preds = self.greed_search.best_estimator_.predict(self.X[test, :])
        trues = [self.Y[idx] for idx in test]
        cv_predictions_pred.extend(preds)
        cv_predictions_trues.extend(trues)
        isolates.extend(test)

    label_set = sorted(set(self.Y))
    isolates = [self.train_isolate_list[iso] for iso in isolates]
    conf = confusion_matrix(cv_predictions_trues, cv_predictions_pred, labels=label_set)
    Y_test_pred = self.greed_search.best_estimator_.predict(self.X_test)

    # save in file
    FileUtility.save_obj(file_name, [self.nested_scores, cv_dicts, label_set, conf,
                                     self.greed_search.best_score_,
                                     self.greed_search.best_estimator_,
                                     self.greed_search.cv_results_,
                                     self.greed_search.best_params_,
                                     (cv_predictions_pred, cv_predictions_trues, isolates),
                                     (Y_test_pred, self.Y_test)])
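# The method above iterates over self.cv more than once (cross_val_score,
# cross_val_predict, and the explicit fold loop), so it has to be a reusable
# sequence of (train, test) index pairs, not a one-shot generator. A minimal
# preparation sketch on toy data:
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold

X, Y = load_iris(return_X_y=True)
outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
cv_splits = list(outer.split(X, Y))  # a list can be re-iterated; a generator cannot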
def save_me(self, file_name):
    '''
    :param file_name: file name to be saved
    :return:
    '''
    FileUtility.save_obj(self.output_dir + file_name, self)
def cross_validation(self, result_filename, gpu_dev='2', n_fold=5, epochs=50,
                     batch_size=100, model_strct='mlp', pretrained_model=False,
                     trainable=False):
    '''
    :param result_filename: path prefix for the saved results
    :param gpu_dev: CUDA device to use
    :param n_fold: number of stratified folds
    :param epochs: training epochs per fold
    :param batch_size: mini-batch size
    :param model_strct: model architecture to train
    :param pretrained_model: whether to start from a pretrained model
    :param trainable: whether the pretrained layers stay trainable
    :return:
    '''
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_dev
    skf = StratifiedKFold(n_splits=n_fold, shuffle=True)
    p_micro, p_macro = [], []
    r_micro, r_macro = [], []
    f1_micro, f1_macro = [], []
    for train_index, valid_index in skf.split(self.X, self.Y):
        print('\nEvaluation on a new fold is starting ...')
        X_train = self.X[train_index, :]
        y_train = self.onehot_y[train_index, :]
        y_class_train = self.encoded_Y[train_index]
        X_valid = self.X[valid_index, :]
        y_valid = self.onehot_y[valid_index, :]
        y_class_valid = self.encoded_Y[valid_index]
        if pretrained_model:
            model = self.get_pretrained_model(model_strct, trainable)
        elif model_strct == 'mlp':
            model = self.get_MLP_model()
        # fitting
        history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                            shuffle=True, validation_data=(X_valid, y_valid), verbose=0)
        pred = model.predict_classes(X_valid)
        # score calculations
        f1_micro.append(f1_score(y_class_valid, pred, average='micro'))
        f1_macro.append(f1_score(y_class_valid, pred, average='macro'))
        p_micro.append(precision_score(y_class_valid, pred, average='micro'))
        p_macro.append(precision_score(y_class_valid, pred, average='macro'))
        r_micro.append(recall_score(y_class_valid, pred, average='micro'))
        r_macro.append(recall_score(y_class_valid, pred, average='macro'))
    # mean values
    f1mac, f1mic = np.mean(f1_macro), np.mean(f1_micro)
    prmac, prmic = np.mean(p_macro), np.mean(p_micro)
    remac, remic = np.mean(r_macro), np.mean(r_micro)
    # standard deviations
    sf1mac, sf1mic = np.std(f1_macro), np.std(f1_micro)
    sprmac, sprmic = np.std(p_macro), np.std(p_micro)
    sremac, sremic = np.std(r_macro), np.std(r_micro)
    # LaTeX table row: precision/recall/F1, micro then macro, as mean ± std
    latex_line = ' & '.join([str(np.round(x, 2)) + ' $\\pm$ ' + str(np.round(y, 2))
                             for x, y in [[prmic, sprmic], [remic, sremic],
                                          [f1mic, sf1mic], [prmac, sprmac],
                                          [remac, sremac], [f1mac, sf1mac]]])
    print(latex_line)
    # learning curves of the last fold
    history_dict = history.history
    loss_values = history_dict['loss']
    val_loss_values = history_dict['val_loss']
    epoch_range = range(1, len(loss_values) + 1)
    # saving the results
    if pretrained_model:
        model_strct = 'pretrained'
    # print(model.summary())
    FileUtility.save_obj('_'.join([result_filename, model_strct,
                                   '-'.join([str(x) for x in self.model_arch]),
                                   str(np.round(f1mac, 2))]),
                         [latex_line, p_micro, r_micro, f1_micro,
                          p_macro, r_macro, f1_macro,
                          (loss_values, val_loss_values, epoch_range)])
    # saving the parameters and weights
    weights = [layer.get_weights() for layer in model.layers]
    FileUtility.save_obj('_'.join([result_filename, 'layers', model_strct,
                                   '-'.join([str(x) for x in self.model_arch]),
                                   str(np.round(f1mac, 2))]),
                         weights)
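# Minimal sketch of the LaTeX row assembled above, with dummy per-fold scores
# (values illustrative only): each cell is "mean $\pm$ std" over the folds.
import numpy as np

fold_scores = [[0.81, 0.85, 0.79],   # e.g. precision (micro) per fold
               [0.80, 0.84, 0.78]]   # e.g. recall (micro) per fold
print(' & '.join(str(np.round(np.mean(v), 2)) + ' $\\pm$ ' + str(np.round(np.std(v), 2))
                 for v in fold_scores))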
def training_loop(**kwargs):
    run_parameters = kwargs['run_parameters']
    model_paramters = kwargs['model_paramters']
    # resolve the model-building function by name
    model = eval(kwargs['deep_learning_model'])

    # which GPU to use
    os.environ["CUDA_VISIBLE_DEVICES"] = str(run_parameters['gpu'])

    # read files
    train_file = 'datasets/train.txt'
    test_file = 'datasets/test.txt'
    LD = LabelingData(train_file, test_file)
    train_lengths = [int(j) for j in FileUtility.load_list(
        '/'.join(train_file.split('/')[0:-1]) + '/train_length.txt')]
    test_lengths = [int(i) for i in FileUtility.load_list(
        '/'.join(test_file.split('/')[0:-1]) + '/test_length.txt')]

    # train/test batch parameters
    train_batch_size = run_parameters['train_batch_size']
    test_batch_size = run_parameters['test_batch_size']
    patience = run_parameters['patience']
    epochs = run_parameters['epochs']

    # model
    model, params = model(LD.n_classes, **model_paramters)

    # output directory
    FileUtility.ensure_dir('results/')
    FileUtility.ensure_dir('results/' + run_parameters['domain_name'] + '/')
    FileUtility.ensure_dir('results/' + run_parameters['domain_name'] + '/' +
                           run_parameters['setting_name'] + '/')
    FileUtility.ensure_dir('results/' + run_parameters['domain_name'] + '/' +
                           run_parameters['setting_name'] + '/' + params + '/')
    full_path = ('results/' + run_parameters['domain_name'] + '/' +
                 run_parameters['setting_name'] + '/' + params + '/')

    # save the model summary
    with open(full_path + 'config.txt', 'w') as fh:
        model.summary(print_fn=lambda x: fh.write(x + '\n'))

    # checkpoints
    filepath = full_path + "weights-improvement-{epoch:02d}-{weighted_acc:.3f}-{val_weighted_acc:.3f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_weighted_acc', verbose=1,
                                 save_best_only=True, mode='max', period=1)
    earlystopping = EarlyStopping(monitor='val_weighted_acc', min_delta=0,
                                  patience=patience, verbose=0, mode='max',
                                  baseline=None)
    callbacks_list = [checkpoint, earlystopping]

    # number of batches per epoch / validation pass (ceiling division)
    steps_per_epoch = (len(train_lengths) + train_batch_size - 1) // train_batch_size
    validation_steps = (len(test_lengths) + test_batch_size - 1) // test_batch_size

    # feed the model
    h = model.fit_generator(train_batch_generator_408(train_batch_size),
                            steps_per_epoch=steps_per_epoch,
                            validation_data=validation_batch_generator_408(test_batch_size),
                            validation_steps=validation_steps,
                            shuffle=False, epochs=epochs, verbose=1,
                            callbacks=callbacks_list)

    # analysis of the performance
    pred_test = [(model.predict_on_batch(x), y, w)
                 for x, y, w in tqdm.tqdm(validation_batches_fortest_408(1))]
    acc_test, conf_mat, conf_mat_column_mapping, contingency_metric, chi2_res_pval, gtest_res_pval = generate_report(pred_test)

    # save the history
    FileUtility.save_obj(full_path + 'history', h.history)
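# A minimal sketch of the generator contract expected by fit_generator above
# (the real train_batch_generator_408 lives elsewhere in the repo; the name,
# padding, and sample weights are assumptions). Keras generators must cycle
# indefinitely and yield (inputs, targets, sample_weights) tuples.
def batch_generator_sketch(X, y, w, batch_size):
    n = len(X)
    while True:                                   # loop forever; Keras stops it
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            yield X[start:end], y[start:end], w[start:end]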
def biomarker_extraction(self, labeler, label_mapper, phenoname,
                         p_value_threshold=0.05, pos_label=None, neg_label=None,
                         excel=0):
    '''
    :param labeler: callable or dict mapping a file name to its raw label
    :param label_mapper: dict mapping raw labels to phenotype classes
    :param phenoname: name of the phenotype, used in output file names
    :param p_value_threshold: significance threshold for marker selection
    :param pos_label: label of the positive class (for the heatmap)
    :param neg_label: label of the negative class (for the heatmap)
    :param excel: set to 1 to also generate the marker excel file
    :return:
    '''
    print('\t✔ NPE marker detection started ..')
    start = time.time()
    rep_base_path = (self.output_directory_inter + 'npe_representation/' +
                     self.dbname + '_uniquepiece_' + str(self.rep_sampling_depth))
    filenames = [x.split('/')[-1]
                 for x in FileUtility.load_list(rep_base_path + '_meta')]

    # check the existing labels
    if callable(labeler):
        selected_samples = [idx for idx, file in enumerate(filenames)
                            if labeler(file) in label_mapper]
        Y = [str(label_mapper[labeler(filenames[sample_id])])
             for sample_id in selected_samples]
    else:
        selected_samples = [idx for idx, file in enumerate(filenames)
                            if labeler[file] in label_mapper]
        Y = [str(label_mapper[labeler[filenames[sample_id]]])
             for sample_id in selected_samples]
    FileUtility.save_list(rep_base_path + '_' + phenoname + '_Y.txt', Y)

    DiTaxaWorkflow.ensure_dir(self.output_directory_inter + 'npe_marker_files/')
    if self.override == 1 or not DiTaxaWorkflow.exists(
            self.output_directory_inter + 'npe_marker_files/' +
            '_'.join([phenoname, 'chi2_relative.fasta'])):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            G16s = NPEMarkerDetection(rep_base_path + '.npz',
                                      rep_base_path + '_' + phenoname + '_Y.txt',
                                      rep_base_path + '_features',
                                      self.output_directory_inter + 'npe_marker_files/' + phenoname,
                                      selected_samples)
            G16s.extract_markers()
        end = time.time()
        spent = end - start
        print('\t✔ Biomarker extraction ' + phenoname + ' took ' + str(spent) +
              ' seconds, using ' + str(self.num_p) + ' cores')
        self.log_file.append('biomarker extraction ' + phenoname + ' ' + str(spent) +
                             ' seconds, using ' + str(self.num_p) + ' cores')
    else:
        print('\t✔ Biomarkers are already extracted; the statistical test was bypassed')
        self.log_file.append('Biomarkers are already extracted; thus, the statistical test was bypassed')
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)

    print('\t✔ Taxonomic assignment of the markers ..')
    if callable(labeler):
        phenotypes = [labeler(filenames[sample_id]) for sample_id in selected_samples]
    else:
        phenotypes = [labeler[filenames[sample_id]] for sample_id in selected_samples]

    fasta_file = (self.output_directory_inter + 'npe_marker_files/' +
                  phenoname + '_chi2_relative.fasta')
    matrix_path = rep_base_path + '.npz'
    feature_file_path = rep_base_path + '_features'
    # redundant-marker removal is too costly for large marker sets
    remove_redundants = len(FileUtility.read_fasta_sequences(fasta_file)) <= 2000

    FileUtility.ensure_dir(self.output_directory + 'final_outputs/save_states/')
    if self.override == 1 or not DiTaxaWorkflow.exists(
            self.output_directory + 'final_outputs/save_states/' + phenoname + '.pickle'):
        start = time.time()
        Final_OBJ = NPEMarkerAnlaysis(fasta_file, matrix_path, feature_file_path,
                                      phenotypes, label_mapper, selected_samples,
                                      p_value_threshold=p_value_threshold,
                                      remove_redundants=remove_redundants,
                                      num_p=self.num_p,
                                      blastn_path=self.blastn_path)
        end = time.time()
        spent = end - start
        DiTaxaWorkflow.ensure_dir(self.output_directory + 'final_outputs/')
        FileUtility.save_obj(self.output_directory + 'final_outputs/save_states/' + phenoname,
                             Final_OBJ)
        print('\t✔ Marker analysis and alignment ' + phenoname + ' took ' + str(spent) +
              ' seconds, using ' + str(self.num_p) + ' cores')
        self.log_file.append('Marker analysis and alignment ' + phenoname + ' ' + str(spent) +
                             ' seconds, using ' + str(self.num_p) + ' cores')
    else:
        Final_OBJ = FileUtility.load_obj(self.output_directory +
                                         'final_outputs/save_states/' + phenoname + '.pickle')
        print('\t✔ The aligned markers already existed and were loaded!')
        self.log_file.append('The aligned markers already existed and were loaded!')
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)

    # generating the tree
    Final_OBJ.generate_tree(self.output_directory + 'final_outputs/', phenoname)

    if excel == 1:
        print('\t✔ Creating the marker excel file ..')
        Final_OBJ.generate_excel(self.output_directory + 'final_outputs/' +
                                 phenoname + '.xlsx', phenoname)

    # paths of the representation, features, markers, and labels used for plotting
    X_addr = (self.output_directory_inter + 'npe_representation/' + self.dbname +
              '_uniquepiece_' + str(self.rep_sampling_depth) + '.npz')
    feature_addr = (self.output_directory_inter + 'npe_representation/' + self.dbname +
                    '_uniquepiece_' + str(self.rep_sampling_depth) + '_features')
    markers = self.output_directory_inter + 'npe_marker_files/' + phenoname + '_finalmarker_list.txt'
    Y = (self.output_directory_inter + 'npe_representation/' + self.dbname +
         '_uniquepiece_' + str(self.rep_sampling_depth) + '_' + phenoname + '_Y.txt')

    if excel == 1:
        print('\t✔ Creating the t-SNE plot ..')
        DiTaxaWorkflow.plot_res(self.output_directory + 'final_outputs/' + phenoname + '_tsne.pdf',
                                X_addr, feature_addr, markers, Y,
                                labels=['Negative', 'Positive'])

    if pos_label and neg_label:
        print('\t✔ Creating the marker heatmap ..')
        Final_OBJ.update_matrix_by_markers_N()
        Final_OBJ.generate_heatmap(self.output_directory + 'final_outputs/' + phenoname + '_heatmap',
                                   pos_label=pos_label, neg_label=neg_label)
        if not excel == 1:
            print('\t✔ Creating the t-SNE plot ..')
            DiTaxaWorkflow.plot_res(self.output_directory + 'final_outputs/' + phenoname + '_tsne.pdf',
                                    X_addr, feature_addr, markers, Y,
                                    labels=[neg_label, pos_label])

    DiTaxaWorkflow.temp_cleanup()
    print('\t⬛ Marker detection and analysis completed. You can find the results at ' +
          self.output_directory + ', in particular in the final_outputs subdirectory.')
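# Hedged usage sketch for biomarker_extraction. The DiTaxaWorkflow constructor
# arguments and the file-naming scheme assumed by the labeler are illustrative;
# the method itself accepts either a callable or a dict as `labeler`.
workflow = DiTaxaWorkflow(...)  # constructed as elsewhere in the pipeline
workflow.biomarker_extraction(
    labeler=lambda filename: 'case' if filename.startswith('case') else 'control',
    label_mapper={'case': 1, 'control': 0},
    phenoname='case_vs_control',
    p_value_threshold=0.05,
    pos_label='1', neg_label='0',
    excel=1)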
def biomarker_extraction(self, labeler, label_mapper, name_setting,
                         p_value_threshold=0.05, pos_label=None, neg_label=None):
    '''
    :param labeler: callable or dict mapping a file name to its raw label
    :param label_mapper: dict mapping raw labels to phenotype classes
    :param name_setting: name of the setting, used in output file names
    :param p_value_threshold: significance threshold for marker selection
    :param pos_label: label of the positive class (for the heatmap)
    :param neg_label: label of the negative class (for the heatmap)
    :return:
    '''
    print('NPE marker detection started')
    DiTaxaWorkflow.blockPrint()
    start = time.time()
    rep_base_path = (self.output_directory + 'npe_representation/' + self.dbname +
                     '_uniquepiece_' + str(self.rep_sampling_depth))
    filenames = [x.split('/')[-1]
                 for x in FileUtility.load_list(rep_base_path + '_meta')]

    # check the existing labels
    if callable(labeler):
        selected_samples = [idx for idx, file in enumerate(filenames)
                            if labeler(file) in label_mapper]
        Y = [str(label_mapper[labeler(filenames[sample_id])])
             for sample_id in selected_samples]
    else:
        selected_samples = [idx for idx, file in enumerate(filenames)
                            if labeler[file] in label_mapper]
        Y = [str(label_mapper[labeler[filenames[sample_id]]])
             for sample_id in selected_samples]
    FileUtility.save_list(rep_base_path + '_' + name_setting + '_Y.txt', Y)

    DiTaxaWorkflow.ensure_dir(self.output_directory + 'npe_marker_files/')
    G16s = NPEMarkerDetection(rep_base_path + '.npz',
                              rep_base_path + '_' + name_setting + '_Y.txt',
                              rep_base_path + '_features',
                              self.output_directory + 'npe_marker_files/' + name_setting,
                              selected_samples)
    G16s.extract_markers()

    end = time.time()
    spent = end - start
    self.log_file.append('biomarker extraction ' + name_setting + ' ' + str(spent) +
                         ' seconds, using ' + str(self.num_p) + ' cores')
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)
    DiTaxaWorkflow.enablePrint()

    print('NPE marker taxonomic detection started')
    start = time.time()
    if callable(labeler):
        phenotypes = [labeler(filenames[sample_id]) for sample_id in selected_samples]
    else:
        phenotypes = [labeler[filenames[sample_id]] for sample_id in selected_samples]

    fasta_file = self.output_directory + 'npe_marker_files/' + name_setting + '_chi2_relative.fasta'
    matrix_path = rep_base_path + '.npz'
    feature_file_path = rep_base_path + '_features'
    # redundant-marker removal is too costly for large marker sets
    remove_redundants = len(FileUtility.read_fasta_sequences(fasta_file)) <= 2000

    Final_OBJ = NPEMarkerAnlaysis(fasta_file, matrix_path, feature_file_path,
                                  phenotypes, label_mapper, selected_samples,
                                  p_value_threshold=p_value_threshold,
                                  remove_redundants=remove_redundants,
                                  num_p=self.num_p)
    end = time.time()
    spent = end - start
    DiTaxaWorkflow.ensure_dir(self.output_directory + 'final_outputs/')
    FileUtility.save_obj(self.output_directory + 'final_outputs/' + name_setting, Final_OBJ)
    Final_OBJ.generate_tree(self.output_directory + 'final_outputs/', name_setting)
    self.log_file.append('blasting extraction ' + name_setting + ' ' + str(spent) +
                         ' seconds, using ' + str(self.num_p) + ' cores')
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)

    if pos_label and neg_label:
        Final_OBJ.generate_heatmap(self.output_directory + 'final_outputs/' +
                                   name_setting + '_heatmap',
                                   pos_label=pos_label, neg_label=neg_label)