def launch(self, data, classifiers, output_dir, plot, cv):
    """
    Launch the classifiers evaluation and save the output results

    Args:
        cv: int
            Cross-validation parameter passed to the stratifier (number of folds)
        data: pandas.DataFrame
            Dataframe with the preprocessed data corresponding to the selected
            mode (complete data, selected predictors)
        classifiers: list
            List of classifiers tested
        output_dir: str
            Name of the output directory
        plot: bool
            If enabled, save the different result plots into the output directory

    Returns:
        None
    """
    self.logger.info('Encoding data...')
    encode_data, labels, classes, predicted_labels, le = ML.encode(data)
    results_proba, dict_y_test, classifiers = self.stratifier(
        encode_data, labels, classifiers, cv, output_dir)

    self.logger.info('Saving outputs...')
    ML.dump_models(classifiers, output_dir)

    if plot is True:
        utils.mkdir(output_dir + "/Plots")
        self.logger.info('Saving plots and results...')
        visualization.plot_roc_curve_training(dict_y_test, results_proba,
                                              output_dir)
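# --- Illustrative sketch (not part of the pipeline) ---------------------------
# ML.encode is not shown in this file; the toy stand-in below only illustrates
# one plausible way a dataframe could be split into an encoded feature matrix
# and a label vector with scikit-learn. The column name 'True_Label' comes from
# the rest of this code base; everything else here is an assumption.
from sklearn.preprocessing import LabelEncoder


def encode_example(df):
    """Toy stand-in for an encode step: returns features, labels, classes and the encoder."""
    le = LabelEncoder()
    labels = le.fit_transform(df['True_Label'])   # e.g. -1/1 -> 0/1
    features = df.drop(columns=['True_Label'])    # remaining predictors
    return features, labels, le.classes_, le
# ------------------------------------------------------------------------------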
def save_args_and_results(args, results, loggers):
    """Pickle the run arguments, results and loggers under ../model/<run>/."""
    print('Saving Args and Results')
    mkdir('../model/{}'.format(args.run))
    datetime_str = create_datetime_str()
    file_save_path = "../model/{}/{}.res".format(
        args.run,
        datetime_str,
    )
    print('Saving Args and Results at: {}'.format(file_save_path))
    with open(file_save_path, 'wb') as fh:
        pickle.dump({
            'args': args,
            'res': results,
            'loggers': loggers
        }, fh)
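# --- Illustrative sketch (hypothetical usage) ---------------------------------
# Reading a saved .res file back; the path below is made up for the example and
# would normally be the file written by save_args_and_results.
import pickle

with open('../model/my_run/example_run.res', 'rb') as fh:
    saved = pickle.load(fh)
print(saved['args'])
print(type(saved['res']), type(saved['loggers']))
# ------------------------------------------------------------------------------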
def dump_models(classifiers, output_dir):
    """
    Serialize the best estimator of each classifier to
    <output_dir>/Models/<ClassifierName>.pkl

    Args:
        classifiers: list
            Fitted classifiers (plain estimators or grid-search objects)
        output_dir: str
            Output directory

    Returns:
        None
    """
    utils.mkdir(output_dir + "/Models")
    for clf in classifiers:
        try:
            # Grid-search objects expose the refitted estimator
            best_clf = clf.best_estimator_
        except AttributeError:
            best_clf = clf
        clf_name = "/Models/" + best_clf.__class__.__name__
        joblib.dump(best_clf, output_dir + clf_name + '.pkl')
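# --- Illustrative sketch (hypothetical counterpart to dump_models) ------------
# Loading the serialized estimators back from <output_dir>/Models; the glob and
# joblib calls mirror what dump_models writes. The helper name is an example,
# not the project's load_classifiers implementation.
import glob
import os
import joblib


def load_models_example(output_dir):
    models = {}
    for path in glob.glob(os.path.join(output_dir, 'Models', '*.pkl')):
        models[os.path.splitext(os.path.basename(path))[0]] = joblib.load(path)
    return models
# ------------------------------------------------------------------------------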
def __init__(self, input_data, output_dir, model_dir, standardize, logger,
             threshold):
    """
    Init

    Args:
        input_data: str
            Path to the input file (CSV format)
        output_dir: str
            Output directory
        model_dir: str
            Directory containing the trained models
        standardize: bool
            If enabled, standardize the dataframe
        logger: logging.Logger
            Logger instance
        threshold: float
            Decision threshold applied to the predicted probabilities
    """
    warnings.filterwarnings('ignore')
    utils.mkdir(output_dir)

    self.logger = logger
    self.logger.info('\n')
    self.logger.info('=' * 100)
    self.logger.info(
        'You will TEST the trained model on selected data : {}'.format(
            os.path.basename(input_data)))
    self.logger.info('=' * 100)
    self.logger.info('\n')

    df = utils.prepare_input_data(
        input_data=input_data,
        standardize=standardize,
    )
    df = df.reset_index(drop=True)
    logger.info('TESTING on {} samples'.format(df.shape[0]))

    if model_dir.endswith("Models"):
        model = model_dir
    elif 'Model' in model_dir:
        model = model_dir
    else:
        model = model_dir + "/TRAIN/Models"

    classifiers = self.load_classifiers(model_dir=model)

    output_dir = output_dir + "/TEST"
    utils.mkdir(output_dir)

    self.launch(data=df,
                classifiers=classifiers,
                output_dir=output_dir,
                threshold=threshold)
def __init__(
        self,
        input_data,
        output,
        classifiers,
        standardize,
        logger,
        cv,
        plot=True,
):
    """
    Init method for Classification class

    Parameters
    -----------
    input_data : str
        Path to the input file (CSV format)
    output : str
        Output directory, default = current directory
    classifiers : list
        List of specific classifiers selected to test on the dataset,
        default = GaussianNB, LogisticRegression.
        Complete list : MLPClassifier, KNeighborsClassifier, SVC, NuSVC,
        DecisionTreeClassifier, RandomForestClassifier, AdaBoostClassifier,
        GradientBoostingClassifier, GaussianNB, LinearDiscriminantAnalysis,
        QuadraticDiscriminantAnalysis, LogisticRegression
    standardize : bool
        If enabled, standardize the dataframe (mu=0, std=1) with
        StandardScaler() (see scikit-learn), missing values being handled with
        http://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
    logger : logging.Logger
        Logger instance
    cv : int
        Number of cross-validation folds
    plot : bool
        If enabled, save graphs and subplots in the output directory
    """
    utils.mkdir(output)
    starttime = datetime.now()
    self.logger = logger
    self.logger.info('Processing of input data : {}'.format(
        os.path.splitext(input_data)[0]))

    print('\n')
    print('=' * 100)
    self.logger.info('You will TRAIN outputs on selected data : {}'.format(
        os.path.splitext(input_data)[0]))
    print('=' * 100)
    print('\n')

    df = utils.prepare_input_data(
        input_data=input_data,
        standardize=standardize,
    )

    pd.set_option('display.float_format', lambda x: '%.3f' % x)
    logger.info('TRAINING on {} samples'.format(df.shape[0]))

    output = output + "/TRAIN"
    self.launch(data=df,
                classifiers=classifiers,
                output_dir=output,
                plot=plot,
                cv=cv)

    endtime = datetime.now()
    self.logger.info("Script duration : " +
                     str(endtime - starttime).split('.', 2)[0])
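# --- Illustrative sketch (hypothetical usage) ---------------------------------
# How this constructor might be called; the file name, logger setup and
# classifier list are placeholders, not values taken from the project
# configuration.
import logging

logging.basicConfig(level=logging.INFO)
TrainingClassification(
    input_data='training.csv.gz',
    output='results',
    classifiers=['GaussianNB', 'LogisticRegression'],
    standardize=True,
    logger=logging.getLogger('train'),
    cv=10,
    plot=True,
)
# ------------------------------------------------------------------------------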
def training_and_testing(ARGS):
    # Check conditions
    if ARGS['list_columns']:
        list_columns = list(sorted(ARGS['list_columns']))
    if not ARGS['list_columns']:
        list_columns = [
            'CADD_phred', 'SIFTval', 'VEST4_score', 'gnomAD_exomes_AF'
        ]
    if ARGS['flag']:
        flag = list(sorted(ARGS['flag']))
    if not ARGS['flag']:
        flag = [
            "REVEL_score",
            "ClinPred_score",
            "M-CAP_score",
            "fathmm-XF_coding_score",
            "Eigen-raw_coding",
            "PrimateAI_score",
        ]

    if not os.path.exists(ARGS['output_dir'] +
                          '/TRAIN/training.csv.gz') or not os.path.exists(
                              ARGS['output_dir'] + '/TEST/testing.csv.gz'):
        logger.warn(
            '--train_and_test mode selected but training and testing files not found, '
            'creating them with the following parameters : '
            '--ratio : ' + str(ARGS['ratio']) + ', --proportion : ' +
            str(ARGS['proportion']))
        ARGS['force_datasets'] = True

    if os.path.exists(ARGS['output_dir'] +
                      '/TRAIN/training.csv.gz') or os.path.exists(
                          ARGS['output_dir'] + '/TEST/testing.csv.gz'):
        logger.info('Training and testing file found')

    if ARGS['combinatory'] is True:
        pass

    # If enabled, erase previously generated training and testing files built
    # from a global dataframe and create new ones
    if ARGS['force_datasets'] is True:
        utils.mkdir(ARGS['output_dir'])
        utils.mkdir(ARGS['output_dir'] + '/TRAIN')
        utils.mkdir(ARGS['output_dir'] + '/TEST')
        logger.warn('Creating new files or overwriting old ones')

        prop = ARGS['proportion']
        t = float(round(prop / (1 - prop), 2))
        ratio = ARGS['ratio']

        tmp = pd.read_csv(filepath_or_buffer=ARGS['input'],
                          sep='\t',
                          compression='gzip',
                          encoding='utf-8',
                          low_memory=False)

        if list_columns and flag:
            # Selection of specific columns to be used from a global dataframe
            # Example : df with 10 columns, --list_columns column1 column2 column5
            tmp = select_columns_pandas.select_columns_pandas(
                tmp, list_columns, flag)

        logger.info(tmp)

        # Use the input parameters (proportion, ratio of data between train and
        # test) to build the training and testing dataframes. Special attention
        # is paid to removing overlap between the evaluation/test sets and the
        # training dataset to prevent any overfitting.
        complete_data_path = tmp.loc[tmp['True_Label'] == 1]
        complete_data_path = complete_data_path.sample(frac=1)

        complete_data_begn = tmp.loc[tmp['True_Label'] == -1]
        complete_data_begn = complete_data_begn.sample(frac=1)

        max_size = max(complete_data_path.shape[0],
                       complete_data_begn.shape[0])
        min_size = min(complete_data_path.shape[0],
                       complete_data_begn.shape[0])

        if max_size > (t * min_size):
            max_size = min_size * t
        elif max_size < (t * min_size):
            min_size = max_size / t

        if min_size < 1000 and min(
                complete_data_path.shape[0],
                complete_data_begn.shape[0]) == complete_data_path.shape[0]:
            logger.warn(
                'CAREFUL : Size of the pathogenic dataset will be < 1000 samples'
            )

        eval_test_size = ratio

        train_path = complete_data_path.head(
            n=int(round(min_size * (1 - eval_test_size))))
        train_begn = complete_data_begn.head(
            n=int(round(max_size * (1 - eval_test_size))))
        eval_path = complete_data_path.tail(
            n=int(round(min_size * eval_test_size)))
        eval_begn = complete_data_begn.tail(
            n=int(round(min_size * eval_test_size)))

        eval_path.dropna(inplace=True)
        eval_begn.dropna(inplace=True)

        complete_training = pd.concat([train_path, train_begn
                                       ]).drop_duplicates(keep='first')
        complete_training = complete_training[complete_training.columns.drop(
            list(complete_training.filter(regex='pred|flag')))]
        complete_training.dropna(inplace=True)

        # Some stats on pathogenic and benign variant counts in both the
        # training and testing dataframes
        logger.info('Training - Path : ' + str(complete_training[
            complete_training['True_Label'] == 1].shape[0]))
        logger.info('Training - Benign : ' + str(complete_training[
            complete_training['True_Label'] == -1].shape[0]))

        min_size_eval = min(eval_path.shape[0], eval_begn.shape[0])
        complete_eval = pd.concat([
            eval_path.sample(frac=1).head(min_size_eval),
            eval_begn.sample(frac=1).head(min_size_eval)
        ]).drop_duplicates(keep='first')

        logger.info(
            'Testing - Path : ' +
            str(complete_eval[complete_eval['True_Label'] == 1].shape[0]))
        logger.info(
            'Testing - Benign : ' +
            str(complete_eval[complete_eval['True_Label'] == -1].shape[0]))

        # Dumping data
        complete_training.to_csv(path_or_buf=ARGS['output_dir'] +
                                 '/TRAIN/training.csv.gz',
                                 sep='\t',
                                 compression='gzip',
                                 encoding='utf-8',
                                 index=False)
        complete_eval.to_csv(path_or_buf=ARGS['output_dir'] +
                             '/TEST/testing.csv.gz',
                             sep='\t',
                             compression='gzip',
                             encoding='utf-8',
                             index=False)

    check_dir_train = False
    if os.path.isdir(ARGS['output_dir'] + '/TRAIN/Models'):
        check_dir_train = True

    if (ARGS['force_training'] is True) or (check_dir_train is False):
        # Training model
        # TrainingClassification(input_data=ARGS['output_dir'] + '/TRAIN/training.csv.gz',
        #                        classifiers=classifiers,
        #                        standardize=ARGS['standardize'],
        #                        output=ARGS["output_dir"],
        #                        logger=logger,
        #                        cv=ARGS['cross_validation']
        #                        )

        TestingClassification(input_data=ARGS['output_dir'] +
                              '/TEST/testing.csv.gz',
                              standardize=ARGS['standardize'],
                              output_dir=ARGS["output_dir"],
                              model_dir=ARGS['model'],
                              logger=logger,
                              threshold=ARGS['threshold'])

        # Generation of a histogram to see the most important features used in the built model
        # histo_weights.histo_and_metrics(folder=ARGS['output_dir'], logger=logger)

    # This parameter, if enabled, builds all possible combinations from a single
    # dataframe when sources are mentioned.
    # Example : a global dataframe based on 3 databases (2 pathogenic : ClinVar
    # and HGMD, 1 benign : gnomAD) was generated. The following lines generate
    # 2 evaluation sets : (ClinVar|gnomAD) and (HGMD|gnomAD) with various MAF
    # thresholds (<0.01, <0.001, <0.0001, AC=1 (singleton), AF=0), and each of
    # these combinations is tested with the previously generated models.
    # (Overlap is checked between these combinations and the training dataset.)
    if ARGS['eval'] and ARGS['eval'].endswith('.csv.gz'):
        # TODO : CHANGE NAME
        print('\n\n')
        logger.info('--BUILDING & TESTING ON EVALUATION SETS--')
        output_dir = ARGS['output_dir']
        eval_output_dir = output_dir
        eval_output_dir = eval_output_dir.split('/')
        eval_output_dir[-1] = 'EVALUATION_SETS_' + eval_output_dir[-1]
        eval_output_dir = "/".join(eval_output_dir)

        if not os.path.isdir(eval_output_dir):
            utils.mkdir(eval_output_dir)

        # if ARGS['list_columns'] and ARGS['flag']:
        combination_pandas.combination_pandas(
            ARGS['eval'],
            output_dir + '/TRAIN/training.csv.gz',
            eval_output_dir,
            logger,
            list_columns,
            flag,
            CV=ARGS['cross_validation_evaluation'])
        # else:
        #     combination_pandas.combination_pandas(ARGS['eval'], ARGS['output_dir'] + '/TRAIN/training.csv.gz', output_dir, CV=ARGS['cross_validation_evaluation'])

        l_dir = os.listdir(eval_output_dir)
        print(list(zip(l_dir)))
        parmap.starmap(test_eval_mp,
                       list(zip(l_dir)),
                       pm_pbar=True,
                       pm_processes=ARGS['threads'])

        # Plots are automatically generated to visualize performance across the
        # various scenarios for the different combinations
        print('\n\n')
        logger.info('--GENERATING PLOTS & STATS--')
        utils.mkdir(eval_output_dir + '/PLOTS_AND_MEAN_TABLE')
        # maf_plot.violin_plot_scores(eval_output_dir, logger)
        # maf_plot.maf_plot_maf_0(eval_output_dir, ARGS['cross_validation_evaluation'], logger)
        maf_plot.maf_plot_others(eval_output_dir,
                                 ARGS['cross_validation_evaluation'], logger)
def prediction(ARGS):
    """Score an input variant file with the serialized models and return (or write) the annotated dataframe."""
    return_df = True

    # BASIC
    list_columns = list(sorted(ARGS['list_columns']))
    flag = list(sorted(ARGS['flag']))
    input_file = ARGS['input']
    output_file = input_file.replace('.csv.gz', '_MISTIC.csv.gz')
    output_dir = ARGS['output_dir']
    model_dir = ARGS['model']
    select = ARGS['wt_select']
    utils.mkdir(output_dir)

    # IMPORT DF
    data = pd.read_csv(filepath_or_buffer=input_file,
                       sep='\t',
                       compression='gzip',
                       encoding='utf-8',
                       low_memory=False)
    data['ID'] = data['ID'].str.lstrip('chr_')

    # SELECT GOOD COLUMNS
    if select is True:
        data = select_columns_pandas.select_columns_pandas(data,
                                                           list_columns,
                                                           flag,
                                                           progress_bar=False,
                                                           fill=True,
                                                           dropna=False)
        col_ordered = ['ID', 'True_Label'] + list(
            sorted(set(list(data.columns)) - set(['ID', 'True_Label'])))
        data = data[col_ordered]
    if select is False:
        data = data[list_columns + flag]

    data['True_Label'] = data['True_Label'].replace(-1, 0)

    if 'Amino_acids' in list_columns:
        l_cols = [e for e in list_columns if e != 'Amino_acids']
    else:
        l_cols = list_columns

    data_scoring = data.dropna(subset=l_cols)

    # IMPORT SKLEARN MODELS
    classifiers = dict()
    log = list()
    for mod in glob.glob(model_dir + "/*.pkl"):
        sk_model = joblib.load(mod)
        classifiers[os.path.basename(mod).replace('.pkl', '')] = sk_model
        name = os.path.basename(mod).replace('.pkl', '')
        data_scoring[name + '_proba'] = sk_model.predict_proba(
            data_scoring[l_cols])[:, 1]
        data_scoring[name + '_pred'] = sk_model.predict(data_scoring[l_cols])
        data = pd.concat(
            [data, data_scoring[[name + '_proba', name + '_pred']]], axis=1)

    col_ordered = ['ID', 'True_Label'] + list(
        sorted(set(list(data.columns)) - set(['ID', 'True_Label'])))
    data = data[col_ordered]

    # Variants with a non-zero gnomAD allele frequency keep the MISTIC_VC
    # prediction, the others keep the MISTIC_LR one; the two parts are then
    # merged back together
    with_maf = data[data['gnomAD_exomes_AF'] != 0]
    without_maf = data[data['gnomAD_exomes_AF'] == 0]
    data['MISTIC_pred'] = pd.concat(
        [with_maf['MISTIC_VC_pred'], without_maf['MISTIC_LR_pred']],
        axis=0).sort_index()
    data['MISTIC_proba'] = pd.concat(
        [with_maf['MISTIC_VC_proba'], without_maf['MISTIC_LR_proba']],
        axis=0).sort_index()
    data.drop([
        'MISTIC_VC_pred', 'MISTIC_VC_proba', 'MISTIC_LR_pred',
        'MISTIC_LR_proba'
    ],
              axis=1,
              inplace=True)

    if return_df is False:
        data.to_csv(output_file, compression='gzip', index=False, sep='\t')
    elif return_df is True:
        return data
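# --- Illustrative sketch (hypothetical usage) ---------------------------------
# Example of the argument dictionary prediction() expects; paths and column
# lists are placeholders chosen to match the keys used above. The model
# directory is assumed to contain MISTIC_VC.pkl and MISTIC_LR.pkl, since the
# function merges columns named after those models.
example_args = {
    'list_columns': ['CADD_phred', 'SIFTval', 'VEST4_score',
                     'gnomAD_exomes_AF', 'ID', 'True_Label'],
    'flag': ['REVEL_score', 'ClinPred_score'],
    'input': 'variants.csv.gz',
    'output_dir': 'predictions',
    'model': 'results/TRAIN/Models',
    'wt_select': True,
}
scored = prediction(example_args)  # returns the scored dataframe (return_df is True)
# ------------------------------------------------------------------------------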
import os
import argparse
from DatasetClass import MyDataset
from datetime import datetime
from utils.engine import train_one_epoch, evaluate
import utils.helper as helper
import src.utils.utils as utils

# ----------------------------------------------- Default Arguments & Variables ----------------------------------------

# File name of this runtime
now = datetime.now()
filename = now.strftime("%Y_%b_%d_%Hh_%Mm")  # %M = minutes

# Make dir to save the resulting data from training
PATH = '../models/model_ces_' + filename
utils.mkdir(PATH)

# Defaults
batch_size = 1
epochs = 1
optimizer_type = 'sgd'
lr = 0.1

# Aux
best_mAP = 0

# ----------------------------------------------- Parsed Arguments -----------------------------------------------------

# Initiate the parser
parser = argparse.ArgumentParser()

# Add long and short argument
parser.add_argument("--batch_size", help="Set batch size.")
def train_fsa_rnn(args, paths):
    logger = Logger()
    # config = Config_Integrate(args)

    dset = load_classification_dataset(args.dataset)
    t2i, i2t, in2i, i2in = dset['t2i'], dset['i2t'], dset['in2i'], dset['i2in']
    query_train, intent_train = dset['query_train'], dset['intent_train']
    query_dev, intent_dev = dset['query_dev'], dset['intent_dev']
    query_test, intent_test = dset['query_test'], dset['intent_test']

    len_stats(query_train)
    len_stats(query_dev)
    len_stats(query_test)

    # extend the padding
    # add pad <pad> to the last of vocab
    i2t[len(i2t)] = '<pad>'
    t2i['<pad>'] = len(i2t) - 1

    train_query, train_query_inverse, train_lengths = pad_dataset(
        query_train, args, t2i['<pad>'])
    dev_query, dev_query_inverse, dev_lengths = pad_dataset(
        query_dev, args, t2i['<pad>'])
    test_query, test_query_inverse, test_lengths = pad_dataset(
        query_test, args, t2i['<pad>'])

    shots = int(len(train_query) * args.train_portion)

    if args.use_unlabel:
        all_pred_train, all_pred_dev, all_pred_test, all_out_train, all_out_dev, all_out_test = PredictByRE(
            args)
        intent_data_train = ATISIntentBatchDatasetUtilizeUnlabel(
            train_query, train_query_inverse, train_lengths, intent_train,
            all_pred_train, all_out_train, shots)
    elif args.train_portion == 0:
        # special case when train portion == 0 and unlabeled data is not used:
        # there should be no training data
        intent_data_train = None
    else:
        intent_data_train = ATISIntentBatchDatasetBidirection(
            train_query, train_query_inverse, train_lengths, intent_train,
            shots)

    # should have no/few dev data in the low-resource setting
    if args.train_portion == 0:
        intent_data_dev = None
    elif args.train_portion <= 0.01:
        intent_data_dev = ATISIntentBatchDatasetBidirection(
            dev_query, dev_query_inverse, dev_lengths, intent_dev, shots)
    else:
        intent_data_dev = ATISIntentBatchDatasetBidirection(
            dev_query, dev_query_inverse, dev_lengths, intent_dev)

    intent_data_test = ATISIntentBatchDatasetBidirection(
        test_query,
        test_query_inverse,
        test_lengths,
        intent_test,
    )

    intent_dataloader_train = DataLoader(
        intent_data_train, batch_size=args.bz) if intent_data_train else None
    intent_dataloader_dev = DataLoader(
        intent_data_dev, batch_size=args.bz) if intent_data_dev else None
    intent_dataloader_test = DataLoader(intent_data_test, batch_size=args.bz)

    print('len train dataset {}'.format(
        len(intent_data_train) if intent_data_train else 0))
    print('len dev dataset {}'.format(
        len(intent_data_dev) if intent_data_dev else 0))
    print('len test dataset {}'.format(len(intent_data_test)))
    print('num labels: {}'.format(len(in2i)))
    print('num vocabs: {}'.format(len(t2i)))

    forward_params = dict()
    forward_params['V_embed_extend'], forward_params['pretrain_embed_extend'], forward_params['mat'], forward_params['bias'], \
        forward_params['D1'], forward_params['D2'], forward_params['language_mask'], forward_params['language'], forward_params['wildcard_mat'], \
        forward_params['wildcard_mat_origin_extend'] = \
        get_init_params(args, in2i, i2in, t2i, paths[0])

    if args.bidirection:
        backward_params = dict()
        backward_params['V_embed_extend'], backward_params['pretrain_embed_extend'], backward_params['mat'], backward_params['bias'], \
            backward_params['D1'], backward_params['D2'], backward_params['language_mask'], backward_params['language'], backward_params['wildcard_mat'], \
            backward_params['wildcard_mat_origin_extend'] = \
            get_init_params(args, in2i, i2in, t2i, paths[1])

    # get h1 for FSAGRU
    h1_forward = None
    h1_backward = None
    if args.farnn == 1:
        args.farnn = 0
        temp_model = FSARNNIntegrateEmptyStateSaperateGRU(
            pretrained_embed=forward_params['pretrain_embed_extend'],
            trans_r_1=forward_params['D1'],
            trans_r_2=forward_params['D2'],
            embed_r=forward_params['V_embed_extend'],
            trans_wildcard=forward_params['wildcard_mat'],
            config=args,
        )
        input_x = torch.LongTensor([[t2i['BOS']]])
        if torch.cuda.is_available():
            temp_model.cuda()
            input_x = input_x.cuda()
        h1_forward = temp_model.viterbi(input_x, None).detach()
        h1_forward = h1_forward.reshape(-1)

        if args.bidirection:
            temp_model = FSARNNIntegrateEmptyStateSaperateGRU(
                pretrained_embed=backward_params['pretrain_embed_extend'],
                trans_r_1=backward_params['D1'],
                trans_r_2=backward_params['D2'],
                embed_r=backward_params['V_embed_extend'],
                trans_wildcard=backward_params['wildcard_mat'],
                config=args,
            )
            input_x = torch.LongTensor([[t2i['EOS']]])
            if torch.cuda.is_available():
                temp_model.cuda()
                input_x = input_x.cuda()
            h1_backward = temp_model.viterbi(input_x, None).detach()
            h1_backward = h1_backward.reshape(-1)

        args.farnn = 1

    if args.bidirection:
        model = IntentIntegrateSaperateBidirection_B(
            pretrained_embed=forward_params['pretrain_embed_extend'],
            forward_params=forward_params,
            backward_params=backward_params,
            config=args,
            h1_forward=h1_forward,
            h1_backward=h1_backward)
    else:
        model = IntentIntegrateSaperate_B(
            pretrained_embed=forward_params['pretrain_embed_extend'],
            trans_r_1=forward_params['D1'],
            trans_r_2=forward_params['D2'],
            embed_r=forward_params['V_embed_extend'],
            trans_wildcard=forward_params['wildcard_mat'],
            config=args,
            mat=forward_params['mat'],
            bias=forward_params['bias'],
            h1_forward=h1_forward,
        )

    if args.loss_type == 'CrossEntropy':
        criterion = torch.nn.CrossEntropyLoss()
    elif args.loss_type == 'NormalizeNLL':
        criterion = relu_normalized_NLLLoss
    else:
        print("Wrong loss function")

    if args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    weight_decay=0)
    if args.optimizer == 'ADAM':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=0)

    if torch.cuda.is_available():
        model = model.cuda()

    pytorch_total_params = sum(p.numel() for p in model.parameters()
                               if p.requires_grad)
    print('ALL TRAINABLE PARAMETERS: {}'.format(pytorch_total_params))

    # TRAIN
    acc_train_init, avg_loss_train_init, train_init_p, train_init_r = val(
        model,
        intent_dataloader_train,
        epoch=0,
        mode='TRAIN',
        config=args,
        i2in=i2in,
        logger=logger,
        criterion=criterion)
    # DEV
    acc_dev_init, avg_loss_dev_init, dev_init_p, dev_init_r = val(
        model,
        intent_dataloader_dev,
        epoch=0,
        mode='DEV',
        config=args,
        i2in=i2in,
        logger=logger,
        criterion=criterion)
    # TEST
    acc_test_init, avg_loss_test_init, test_init_p, test_init_r = val(
        model,
        intent_dataloader_test,
        epoch=0,
        mode='TEST',
        config=args,
        i2in=i2in,
        logger=logger,
        criterion=criterion)

    print("{} INITIAL: ACC: {}, LOSS: {}, P: {}, R: {}".format(
        'TRAIN', acc_train_init, avg_loss_train_init, train_init_p,
        train_init_r))
    print("{} INITIAL: ACC: {}, LOSS: {}, P: {}, R: {}".format(
        'DEV', acc_dev_init, avg_loss_dev_init, dev_init_p, dev_init_r))
    print("{} INITIAL: ACC: {}, LOSS: {}, P: {}, R: {}".format(
        'TEST', acc_test_init, avg_loss_test_init, test_init_p, test_init_r))
    logger.add("{} INITIAL: ACC: {}, LOSS: {}, P: {}, R: {}".format(
        'TRAIN', acc_train_init, avg_loss_train_init, train_init_p,
        train_init_r))
    logger.add("{} INITIAL: ACC: {}, LOSS: {}, P: {}, R: {}".format(
        'DEV', acc_dev_init, avg_loss_dev_init, dev_init_p, dev_init_r))
    logger.add("{} INITIAL: ACC: {}, LOSS: {}, P: {}, R: {}".format(
        'TEST', acc_test_init, avg_loss_test_init, test_init_p, test_init_r))

    if args.only_probe:
        exit(0)

    best_dev_acc = acc_dev_init
    counter = 0
    best_dev_model = deepcopy(model)

    if not intent_dataloader_train:
        args.epoch = 0

    for epoch in range(1, args.epoch + 1):
        avg_loss = 0
        acc = 0

        pbar_train = tqdm(intent_dataloader_train)
        pbar_train.set_description("TRAIN EPOCH {}".format(epoch))

        model.train()
        for batch in pbar_train:
            optimizer.zero_grad()

            x_forward = batch['x_forward']
            x_backward = batch['x_backward']
            label = batch['i'].view(-1)
            lengths = batch['l']

            if torch.cuda.is_available():
                x_forward = batch['x_forward'].cuda()
                x_backward = batch['x_backward'].cuda()
                lengths = lengths.cuda()
                label = label.cuda()

            if args.bidirection:
                scores = model(x_forward, x_backward, lengths)
            else:
                scores = model(x_forward, lengths)

            loss = criterion(scores, label)
            loss.backward()
            optimizer.step()

            avg_loss += loss.item()
            acc += (scores.argmax(1) == label).sum().item()
            pbar_train.set_postfix_str(
                "{} - total right: {}, total loss: {}".format(
                    'TRAIN', acc, loss))

        acc = acc / len(intent_data_train)
        avg_loss = avg_loss / len(intent_data_train)
        print("{} Epoch: {} | ACC: {}, LOSS: {}".format(
            'TRAIN', epoch, acc, avg_loss))
        logger.add("{} Epoch: {} | ACC: {}, LOSS: {}".format(
            'TRAIN', epoch, acc, avg_loss))

        # DEV
        acc_dev, avg_loss_dev, p, r = val(model,
                                          intent_dataloader_dev,
                                          epoch,
                                          'DEV',
                                          logger,
                                          config=args,
                                          criterion=criterion)

        counter += 1  # counter for early stopping
        if (acc_dev is None) or (acc_dev > best_dev_acc):
            counter = 0
            best_dev_acc = acc_dev
            best_dev_model = deepcopy(model)
        if counter > args.early_stop:
            break

    best_dev_test_acc, avg_loss_test, best_dev_test_p, best_dev_test_r \
        = val(best_dev_model, intent_dataloader_test, epoch, 'TEST', logger,
              config=args, criterion=criterion)

    # Save the model
    datetime_str = create_datetime_str()
    model_save_path = "../model/{}/D{:.4f}-T{:.4f}-DI{:.4f}-TI{:.4f}-{}-{}-{}".format(
        args.run, best_dev_acc, best_dev_test_acc, acc_dev_init,
        acc_test_init, datetime_str, args.dataset, args.seed)
    mkdir("../model/{}/".format(args.run))
    mkdir(model_save_path)
    print("SAVING MODEL {} .....".format(model_save_path))
    torch.save(model.state_dict(), model_save_path + '.model')

    return acc_dev_init, acc_test_init, best_dev_acc, best_dev_test_acc, \
        best_dev_test_p, best_dev_test_r, logger.record
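# --- Illustrative sketch (hypothetical usage) ---------------------------------
# Reloading the weights saved above with torch.save(model.state_dict(), ...);
# the path is a placeholder, and the model must be rebuilt with the same
# constructor arguments as during training before loading the state dict.
import torch

state_dict = torch.load('path/to/saved.model', map_location='cpu')
# model = IntentIntegrateSaperate_B(...)  # rebuild with the training-time init params
# model.load_state_dict(state_dict)
# model.eval()
# ------------------------------------------------------------------------------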
def stats_exomes_1000G(directory):
    pred_dict = {
        "ClinPred_score_pred": "ClinPred_flag",
        "PrimateAI_score_pred": "PrimateAI_flag",
        "M-CAP_score_pred": "M-CAP_flag",
        "REVEL_score_pred": "REVEL_flag",
        "Eigen-raw_coding_pred": "Eigen-raw_coding_flag",
        "fathmm-XF_coding_score_pred": "fathmm-XF_coding_flag",
        "MISTIC_pred": "MISTIC_proba",
    }

    check = [
        'Eigen-raw_coding_pred', 'hsEigen-raw_coding_pred',
        'M-CAP_score_pred', 'hsM-CAP_score_pred', 'PrimateAI_score_pred',
        'hsPrimateAI_score_pred', 'ClinPred_score_pred',
        'hsClinPred_score_pred', 'REVEL_score_pred', 'hsREVEL_score_pred',
        'fathmm-XF_coding_score_pred', 'hsfathmm-XF_coding_score_pred',
        'MISTIC_pred', 'hsMISTIC_pred'
    ]

    mkdir(directory + '/PLOTS_AND_TABLES')

    m = multiprocessing.Manager()
    df_l = m.list()

    l_dir = list(sorted(os.listdir(directory)))
    directory = directory + '/' if directory.endswith('/') is False else directory
    l_dir = [directory + f for f in l_dir if 'PLOTS' not in f]

    parmap.starmap(mp_exome,
                   list(zip(l_dir)),
                   pred_dict,
                   df_l,
                   check,
                   pm_pbar=True)
    df_l = list(df_l)

    sorter = ['Eigen', 'PrimateAI', 'FATHMM-XF', 'ClinPred', 'REVEL', 'M-CAP', 'MISTIC']

    df_stats = pd.DataFrame(df_l).sort_values(by=['Classifier'])
    df_stats.Classifier = df_stats.Classifier.str.split('_')
    df_stats.Classifier = df_stats.Classifier.str[0]
    df_stats.Classifier = df_stats.Classifier.str.replace('-raw', '')
    df_stats.Classifier = df_stats.Classifier.str.replace('fathmm-XF', 'FATHMM-XF')

    hs_df = df_stats.copy()
    hs_df.Classifier = hs_df.Classifier.astype('category')
    hs_df.Classifier.cat.set_categories(sorter, inplace=True)
    hs_df.loc[hs_df['Percentile index'].isna() == True, 'Causative index'] = np.nan

    for ylim, name in zip([(0, 50), (-5, 350)], ['zoom', 'full_scale']):
        f = plt.figure(figsize=(6, 4))
        sns.violinplot(x="Classifier",
                       y="Causative index",
                       data=hs_df,
                       palette=["#0c2461", "#22a6b3", "#9b59b6", "#2196F3",
                                "#4CAF50", "#FF9800", "#f44336"],
                       showfliers=False,
                       cut=0.1,
                       linewidth=2,
                       inner="box",
                       scale='width')
        plt.xticks(rotation=45)
        plt.xlabel('')
        plt.ylabel('Ranking of causative deleterious variants')
        plt.ylim(ylim[0], ylim[1])
        f.tight_layout()
        plt.savefig(directory + '/PLOTS_AND_TABLES/Ranking_exomes_index_{}.png'.format(name), dpi=300)
        plt.close()

    for ylim, name in zip([(80, 100), (0, 100)], ['zoom', 'full_scale']):
        f = plt.figure(figsize=(8, 6))
        sns.boxplot(x="Classifier",
                    y="Percentile index",
                    data=hs_df,
                    palette=["#bdc3c7"],
                    showfliers=False)
        sns.stripplot(x="Classifier",
                      y="Percentile index",
                      data=hs_df,
                      palette=["#0c2461", "#22a6b3", "#9b59b6", "#2196F3",
                               "#4CAF50", "#FF9800", "#f44336"],
                      alpha=0.25,
                      linewidth=0.4)
        f.tight_layout()
        plt.xlabel('')
        plt.ylabel('Percentile rank of causative variant')
        plt.ylim(ylim[0], ylim[1])
        plt.savefig(directory + '/PLOTS_AND_TABLES/Ranking_exomes_percentile_{}.png'.format(name), dpi=300)
        plt.close()

    sns.set_context("paper", font_scale=1)
    plt.figure(figsize=(6, 4))
    hs_df['Path_amount'] = 100 * (hs_df['Pathogenic detected']) / (
        hs_df['Pathogenic detected'] + hs_df['Benign detected'])
    plt.grid(axis='y', alpha=0.2)
    sns.violinplot(x="Classifier",
                   y="Ratio_path_percentage",
                   data=hs_df,
                   palette=["#0c2461", "#22a6b3", "#9b59b6", "#2196F3",
                            "#4CAF50", "#FF9800", "#f44336"],
                   showfliers=False,
                   inner="box",
                   scale='width',
                   linewidth=2)
    plt.xticks(rotation=45)
    plt.ylabel('Percentage of predicted deleterious variants')
    plt.xlabel('')

    l_hs = list()
    for clf in list(hs_df['Classifier'].unique()):
        d_hs = dict()
        values_path_amount = hs_df.loc[hs_df['Classifier'] == clf]['Ratio_path_percentage'].values
        d_hs['Classifier'] = clf
        d_hs['Path_amount_mean'] = np.mean(values_path_amount)
        d_hs['Path_amount_median'] = np.median(values_path_amount)
        d_hs['Path_amount_std'] = np.std(values_path_amount)
        values_causative_index = hs_df.loc[hs_df['Classifier'] == clf]['Causative index'].values
        d_hs['Causative_index_mean'] = np.mean(values_causative_index)
        d_hs['Causative_index_median'] = np.median(values_causative_index)
        d_hs['Causative_index_std'] = np.std(values_causative_index)
        l_hs.append(d_hs)

    final_results_mean_df = pd.DataFrame(l_hs)
    final_results_mean_df.Classifier = final_results_mean_df.Classifier.astype('category')
    final_results_mean_df.Classifier.cat.set_categories(sorter, inplace=True)
    final_results_mean_df.sort_values(by='Classifier').T.to_excel(
        directory + '/PLOTS_AND_TABLES/Results_mean_median_std.xlsx')

    plt.tight_layout()
    plt.savefig(directory + '/PLOTS_AND_TABLES/Pathogenic_number_test.png', dpi=600)
    plt.close()

    df_stats = df_stats[[
        'Classifier', 'Filename', 'Exome_name', 'Source', 'Causative_variant',
        'Benign detected', 'No score', 'Pathogenic detected',
        'Ratio_path_percentage', 'Causative index', 'Percentile index',
        'Score', 'Prediction'
    ]]
    df_stats.to_csv(directory + '/PLOTS_AND_TABLES/Stats_exomes_raw.csv', sep='\t')

    cols = [
        'Benign detected', 'Pathogenic detected', 'Ratio_path_percentage',
        'No score', 'Causative index', 'Percentile index', 'Score',
        'Prediction'
    ]
    merge_df = pd.DataFrame()
    for col in cols:
        df_stats_predictors = df_stats[[
            'Classifier', 'Filename', 'Exome_name', 'Source', col
        ]].pivot(index='Exome_name', columns='Classifier',
                 values=col).add_suffix('_' + col.replace(' ', '_'))
        df_stats_predictors.index.name = None
        df_stats_predictors.columns.name = None
        merge_df = pd.concat([merge_df, df_stats_predictors], axis=1)

    df_stats = df_stats[['Filename', 'Exome_name', 'Source', 'Causative_variant']].drop_duplicates(keep='first')
    df_stats.set_index('Exome_name', inplace=True)

    concat_df = pd.concat([df_stats, merge_df], axis=1, sort=True)
    concat_df = concat_df[concat_df.columns.drop(list(concat_df.filter(regex='^hs')))]
    concat_df = concat_df[concat_df.columns.drop(
        list(concat_df.filter(regex='_detected|_Prediction|Source|No_score')))]

    stats_df = concat_df.copy()
    stats_df.loc['mean'] = concat_df.mean()
    stats_df.loc['25'] = concat_df.quantile(0.25)
    stats_df.loc['50'] = concat_df.quantile(0.5)
    stats_df.loc['75'] = concat_df.quantile(0.75)
    stats_df.loc['90'] = concat_df.quantile(0.9)
    stats_df.loc['95'] = concat_df.quantile(0.95)
    stats_df.loc['99'] = concat_df.quantile(0.99)
    stats_df.loc['std'] = concat_df.std()
    stats_df.to_excel(directory + '/PLOTS_AND_TABLES/Stats_exomes.xlsx', index=True)

    clfs = ['Eigen', 'PrimateAI', 'FATHMM-XF', 'ClinPred', 'REVEL', 'M-CAP', 'MISTIC']
    d = collections.defaultdict(dict)
    for clf in clfs:
        d[clf]['Path_detected_t_test'] = ttest_ind(
            hs_df[hs_df['Classifier'] == clf]['Ratio_path_percentage'].dropna().values,
            hs_df[hs_df['Classifier'] == 'MISTIC']['Ratio_path_percentage'].dropna().values)[1]
        d[clf]['Causative_index_t_test'] = ttest_ind(
            hs_df[hs_df['Classifier'] == clf]['Causative index'].dropna().values,
            hs_df[hs_df['Classifier'] == 'MISTIC']['Causative index'].dropna().values)[1]
        d[clf]['Path_detected_mannwhitneyu'] = mannwhitneyu(
            hs_df[hs_df['Classifier'] == clf]['Ratio_path_percentage'].dropna().values,
            hs_df[hs_df['Classifier'] == 'MISTIC']['Ratio_path_percentage'].dropna().values)[1]
        d[clf]['Causative_index_mannwhitneyu'] = mannwhitneyu(
            hs_df[hs_df['Classifier'] == clf]['Causative index'].dropna().values,
            hs_df[hs_df['Classifier'] == 'MISTIC']['Causative index'].dropna().values)[1]
        d[clf]['Path_detected_kruskal'] = kruskal(
            hs_df[hs_df['Classifier'] == clf]['Ratio_path_percentage'].dropna().values,
            hs_df[hs_df['Classifier'] == 'MISTIC']['Ratio_path_percentage'].dropna().values)[1]
        d[clf]['Causative_index_kruskal'] = kruskal(
            hs_df[hs_df['Classifier'] == clf]['Causative index'].dropna().values,
            hs_df[hs_df['Classifier'] == 'MISTIC']['Causative index'].dropna().values)[1]

    pd.DataFrame(d).to_excel(directory + '/PLOTS_AND_TABLES/Test_stats.xlsx')
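# --- Illustrative sketch (toy example of the pivot used above) ----------------
# The per-exome long table is pivoted to one column per classifier and metric;
# the toy frame below only illustrates that reshaping, the values are made up.
import pandas as pd

toy = pd.DataFrame({
    'Exome_name': ['ex1', 'ex1', 'ex2', 'ex2'],
    'Classifier': ['MISTIC', 'REVEL', 'MISTIC', 'REVEL'],
    'Causative index': [3, 12, 1, 7],
})
wide = toy.pivot(index='Exome_name', columns='Classifier',
                 values='Causative index').add_suffix('_Causative_index')
print(wide)  # one row per exome, one column per classifier
# ------------------------------------------------------------------------------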
dict(
    list_columns=list_columns[1:],
    flag=flag,
    input=deleterious_df_path,
    output_dir=os.path.dirname(deleterious_df_path),
    model=model_dir,
    wt_select=True,
))

deleterious_df = deleterious_df.sample(frac=1, random_state=1)

directory = directory + '/' if directory.endswith('/') is False else directory
output_dir = output_dir + '/' if output_dir.endswith('/') is False else output_dir
utils.mkdir(output_dir)

l_dir = list(sorted(os.listdir(directory)))[:deleterious_df.shape[0]]
exomes_id = [e.replace('.csv.gz', '') for e in l_dir]
l_dir = [directory + file for file in l_dir if file.endswith('.csv.gz')]

deleterious_df = deleterious_df.filter(
    regex='flag|pred|proba|ID|True_Label', axis=1)
deleterious_df['Source'] = pd.Series(exomes_id)

col_ordered = ['ID', 'True_Label'] + list(
    sorted(
        set(list(deleterious_df.columns)) -
        set(['ID', 'True_Label', 'MISTIC_proba', 'MISTIC_pred']))) + [
            'MISTIC_proba', 'MISTIC_pred'
        ]
deleterious_df = deleterious_df[col_ordered]
def plot_roc_curve_training(dict_y_test, results_proba, output_dir):
    """
    Plot ROC comparisons between algorithms

    Args:
        dict_y_test: dict
            Stores the y_test used for each iteration of CV
        results_proba: dict
            Stores the probabilities obtained for each iteration of CV for
            every algorithm used
        output_dir: str
            Directory where the plots will be saved

    Returns:
        None
    """
    output_dir = output_dir + '/Plots'
    utils.mkdir(output_dir)

    dict_auc = dict()
    ordered_dict = collections.defaultdict(dict)
    matplotlib.rcParams.update({'font.size': 8})

    for algo, results in results_proba.items():
        fig_algo = figure(num=algo, dpi=180, facecolor='w', edgecolor='k')
        plt.figure(algo)
        tprs = list()
        aucs = list()
        mean_fpr = np.linspace(0, 1, 100)

        for index, arrays in results.items():
            fpr, tpr, thresholds = metrics.roc_curve(dict_y_test[index], arrays)
            tprs.append(np.interp(mean_fpr, fpr, tpr))
            tprs[-1][0] = 0.0
            roc_auc = metrics.auc(fpr, tpr)
            aucs.append(roc_auc)
            plt.plot(fpr,
                     tpr,
                     lw=1,
                     alpha=0.3,
                     label=r'ROC CV n°%s (AUC = %0.2f)' % (index, roc_auc))

        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = metrics.auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        plt.plot(mean_fpr,
                 mean_tpr,
                 color='red',
                 label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
                 lw=2,
                 alpha=.8)

        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

        dict_auc[algo] = mean_auc
        ordered_dict[algo]['mean_tpr'] = mean_tpr
        ordered_dict[algo]['mean_fpr'] = mean_fpr
        ordered_dict[algo]['std_auc'] = std_auc
        ordered_dict[algo]['std_tpr'] = std_tpr
        ordered_dict[algo]['tprs_upper'] = tprs_upper
        ordered_dict[algo]['tprs_lower'] = tprs_lower

        plt.fill_between(mean_fpr,
                         tprs_lower,
                         tprs_upper,
                         color='grey',
                         alpha=.2,
                         label=r'$\pm$ 1 std. dev.')
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating curve : ' + str(algo) + ' | ' +
                  str(index) + ' fold cross-validation')
        plt.legend(loc="lower right")
        plt.savefig(output_dir + "/ROC_" + str(algo) + ".png",
                    bbox_inches='tight',
                    dpi=180)
        plt.close(fig_algo)

    fig_glob = figure(num='Global', dpi=180, facecolor='w', edgecolor='k')
    plt.figure('Global')
    ordered_dict_auc = sorted(dict_auc.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    for elem in ordered_dict_auc:
        algo = elem[0]
        mean_auc = elem[1]
        try:
            plt.plot(ordered_dict[algo]['mean_fpr'],
                     ordered_dict[algo]['mean_tpr'],
                     label=r'Mean ROC : ' + str(algo) +
                     ' - AUC = %0.2f $\pm$ %0.2f' %
                     (mean_auc, ordered_dict[algo]['std_auc']),
                     lw=2,
                     alpha=.8)
        except:
            plt.plot(ordered_dict[algo]['mean_fpr'],
                     ordered_dict[algo]['mean_tpr'],
                     label=r'Mean ROC : ' + str(algo) +
                     ' - AUC = %0.2f' % mean_auc,
                     lw=2,
                     alpha=.8)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating curve : Global results | ' + str(index) +
              ' fold cross-validation')
    plt.legend(loc="lower right")
    plt.savefig(output_dir + "/ROC_Summary.png", bbox_inches='tight', dpi=180)
    plt.close(fig_glob)
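# --- Illustrative sketch (hypothetical usage) ---------------------------------
# The expected structures: dict_y_test maps a CV-fold index to its y_test array,
# and results_proba maps algorithm name -> {fold index -> predicted probabilities}.
# The values below are random toy data and the output directory is an example.
import numpy as np

rng = np.random.RandomState(0)
dict_y_test = {i: rng.randint(0, 2, size=50) for i in range(1, 4)}
results_proba = {
    'LogisticRegression': {i: rng.rand(50) for i in range(1, 4)},
    'GaussianNB': {i: rng.rand(50) for i in range(1, 4)},
}
plot_roc_curve_training(dict_y_test, results_proba, output_dir='example_output')
# ------------------------------------------------------------------------------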