Example #1
    def launch(self, data, classifiers, output_dir, plot, cv):
        """
        Launch classifiers evaluation and allow to save output results

        Args:
            cv:
            data: Pandas Dataframe
                Dataframe with the preprocessed data corresponding to the selected mode
                (complete data, selected predictors)
            classifiers: list
                List of classifiers tested
            output_dir: str
                Name of the output directory
            plot: bool
                If enable, save the different results plots into the output directory
        Returns: None

        """
        self.logger.info('Encoding data...')
        encode_data, labels, classes, predicted_labels, le = ML.encode(data)
        results_proba, dict_y_test, classifiers = self.stratifier(
            encode_data, labels, classifiers, cv, output_dir)
        self.logger.info('Saving outputs...')
        ML.dump_models(classifiers, output_dir)
        if plot is True:
            utils.mkdir(output_dir + "/Plots")
            self.logger.info('Saving plots and results...')
            visualization.plot_roc_curve_training(dict_y_test, results_proba,
                                                  output_dir)
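
The flow above starts from ML.encode; as a rough guide only, here is a minimal sketch of what such an encoding step could look like if it wraps scikit-learn's LabelEncoder (the helper name, the 'True_Label' column and the reduced return signature are assumptions, not the actual ML.encode implementation):

from sklearn.preprocessing import LabelEncoder

def encode(data, label_col='True_Label'):
    # Hypothetical helper: separate the feature matrix from the labels and
    # encode the class names as integers
    le = LabelEncoder()
    labels = le.fit_transform(data[label_col])       # integer labels 0..n_classes-1
    encode_data = data.drop(columns=[label_col])     # remaining feature columns
    classes = list(le.classes_)                      # original class names
    return encode_data, labels, classes, le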
Example #2
def save_args_and_results(args, results, loggers):
    print('Saving Args and Results')
    mkdir('../model/{}'.format(args.run))
    datetime_str = create_datetime_str()
    file_save_path = "../model/{}/{}.res".format(
        args.run, datetime_str,
    )
    print('Saving Args and Results at: {}'.format(file_save_path))
    pickle.dump({
        'args': args,
        'res': results,
        'loggers': loggers
    }, open(file_save_path, 'wb'))
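
Reading such a result file back is the mirror image of the dump above; a minimal sketch (the loader name is hypothetical, the keys match the dict written by save_args_and_results):

import pickle

def load_args_and_results(file_save_path):
    # Read back the dict written by save_args_and_results
    with open(file_save_path, 'rb') as handle:
        payload = pickle.load(handle)
    return payload['args'], payload['res'], payload['loggers']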
Example #3
def dump_models(classifiers, output_dir):
	"""

	Args:
		classifiers:
		output_dir:

	Returns:

	"""
	utils.mkdir(output_dir + "/Models")
	for clf in classifiers:
		try:
			best_clf = clf.best_estimator_
		except AttributeError:
			best_clf = clf
		clf_name = "/Models/" + best_clf.__class__.__name__
		joblib.dump(best_clf, output_dir + clf_name + '.pkl')
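
As a counterpart to dump_models, a minimal sketch of loading the serialized estimators back (the function name is hypothetical; it only assumes the Models/<ClassName>.pkl layout produced above):

import glob
import os
import joblib

def load_models(output_dir):
    # Rebuild a {classifier name: estimator} mapping from the .pkl files
    # written by dump_models into output_dir + "/Models"
    models = {}
    for path in glob.glob(output_dir + "/Models/*.pkl"):
        name = os.path.basename(path).replace('.pkl', '')
        models[name] = joblib.load(path)
    return models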
Example #4
    def __init__(self, input_data, output_dir, model_dir, standardize, logger,
                 threshold):
        """
        Init
        Args:
            input_data:
            output_dir:
            model_dir:
            logger:
        """
        warnings.filterwarnings('ignore')

        utils.mkdir(output_dir)
        self.logger = logger
        self.logger.info('\n')
        self.logger.info('=' * 100)
        self.logger.info(
            'You will TEST the trained model on selected data : {}'.format(
                os.path.basename(input_data)))
        self.logger.info('=' * 100)
        self.logger.info('\n')
        df = utils.prepare_input_data(
            input_data=input_data,
            standardize=standardize,
        )
        df = df.reset_index(drop=True)
        logger.info('TESTING on {} samples'.format(df.shape[0]))
        # Resolve the directory that actually contains the serialized models
        if model_dir.endswith("Models") or 'Model' in model_dir:
            model = model_dir
        else:
            model = model_dir + "/TRAIN/Models"
        classifiers = self.load_classifiers(model_dir=model)
        output_dir = output_dir + "/TEST"
        utils.mkdir(output_dir)
        self.launch(data=df,
                    classifiers=classifiers,
                    output_dir=output_dir,
                    threshold=threshold)
Example #5
    def __init__(
        self,
        input_data,
        output,
        classifiers,
        standardize,
        logger,
        cv,
        plot=True,
    ):
        """
        Init method for Classification class

        Parameters
        -----------
        plot : bool
            If enable, save graphs and subplots in the output directory
        predictors : list
            List of predictors present in the header of the dataset, default=Complete table
        standardize : bool
            If enable, standardize the dataframe (mu=0, std=1) with StandardScaler() (see scikit-learn)
        split_columns : bool
            If enable, allows to split columns in the dataframe
            check : http://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
        classifiers : list
            list of specific classifiers selected to test on the dataset, default = GaussianNB, LogisticRegression.
            Complete list : 'MLPClassifier, KNeighborsClassifier,
            SVC, NuSVC, DecisionTreeClassifier, RandomForestClassifier,
            AdaBoostClassifier, GradientBoostingClassifier, GaussianNB,
            LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis, LogisticRegression')
        output : str
            output: Output_directory, default = current directory
        input_data : String reference the input file (CSV format)
            path to the input file
        full_clf : bool
            Enable test of all available Classification Algorithms

        """
        utils.mkdir(output)
        starttime = datetime.now()
        self.logger = logger
        self.logger.info('Processing of input data: {}'.format(
            os.path.splitext(input_data)[0]))
        print('\n')
        print('=' * 100)
        self.logger.info('You will TRAIN outputs on selected data : {}'.format(
            os.path.splitext(input_data)[0]))
        print('=' * 100)
        print('\n')

        df = utils.prepare_input_data(
            input_data=input_data,
            standardize=standardize,
        )

        pd.set_option('display.float_format', lambda x: '%.3f' % x)

        logger.info('TRAINING on {} samples'.format(df.shape[0]))
        output = output + "/TRAIN"
        self.launch(data=df,
                    classifiers=classifiers,
                    output_dir=output,
                    plot=plot,
                    cv=cv)

        endtime = datetime.now()
        self.logger.info("Script duration : " +
                         str(endtime - starttime).split('.', 2)[0])
Example #6
def training_and_testing(ARGS):
    # Check conditions
    if ARGS['list_columns']:
        list_columns = list(sorted(ARGS['list_columns']))
    if not ARGS['list_columns']:
        list_columns = [
            'CADD_phred', 'SIFTval', 'VEST4_score', 'gnomAD_exomes_AF'
        ]

    if ARGS['flag']:
        flag = list(sorted(ARGS['flag']))
    if not ARGS['flag']:
        flag = [
            "REVEL_score",
            "ClinPred_score",
            "M-CAP_score",
            "fathmm-XF_coding_score",
            "Eigen-raw_coding",
            "PrimateAI_score",
        ]

    if not os.path.exists(ARGS['output_dir'] +
                          '/TRAIN/training.csv.gz') or not os.path.exists(
                              ARGS['output_dir'] + '/TEST/testing.csv.gz'):
        logger.warn(
            '--train_and_test mode selected but training and testing file not found, creation with following parameters :'
            '--ratio : ' + str(ARGS['ratio']) + ', --proportion : ' +
            str(ARGS['proportion']))
        ARGS['force_datasets'] = True
    if os.path.exists(ARGS['output_dir'] +
                      '/TRAIN/training.csv.gz') or os.path.exists(
                          ARGS['output_dir'] + '/TEST/testing.csv.gz'):
        logger.info('Training and testing file found')

    if ARGS['combinatory'] is True:
        pass

    # If enabled, erase the previously generated training and testing files derived from a global dataframe and create new ones
    if ARGS['force_datasets'] is True:

        utils.mkdir(ARGS['output_dir'])
        utils.mkdir(ARGS['output_dir'] + '/TRAIN')
        utils.mkdir(ARGS['output_dir'] + '/TEST')
        logger.warn('Creating new files or overwriting old ones')
        prop = ARGS['proportion']
        t = float(round(prop / (1 - prop), 2))

        ratio = ARGS['ratio']
        tmp = pd.read_csv(filepath_or_buffer=ARGS['input'],
                          sep='\t',
                          compression='gzip',
                          encoding='utf-8',
                          low_memory=False)

        if list_columns and flag:
            # Selection of specific columns to be used from a global dataframe
            # Example : df with 10 columns, --list_columns column1 column2 column5
            tmp = select_columns_pandas.select_columns_pandas(
                tmp, list_columns, flag)
            logger.info(tmp)

        # Use the input parameters (proportion, ratio of data between train and test) to build the training and testing dataframes
        # Special attention is paid to removing overlap between the evaluation/test sets and the training dataset to prevent any overfitting

        complete_data_path = tmp.loc[tmp['True_Label'] == 1]
        complete_data_path = complete_data_path.sample(frac=1)
        complete_data_begn = tmp.loc[tmp['True_Label'] == -1]
        complete_data_begn = complete_data_begn.sample(frac=1)
        max_size = max(complete_data_path.shape[0],
                       complete_data_begn.shape[0])
        min_size = min(complete_data_path.shape[0],
                       complete_data_begn.shape[0])

        if max_size > (t * min_size):
            max_size = min_size * t
        elif max_size < (t * min_size):
            min_size = max_size / t
        if min_size < 1000 and min(complete_data_path.shape[0], complete_data_begn.shape[0]) == \
          complete_data_path.shape[0]:
            logger.warn(
                'CAREFUL : Size of the pathogenic dataset will be < 1000 samples'
            )

        eval_test_size = ratio
        train_path = complete_data_path.head(
            n=int(round(min_size * (1 - eval_test_size))))
        train_begn = complete_data_begn.head(
            n=int(round(max_size * (1 - eval_test_size))))
        eval_path = complete_data_path.tail(
            n=int(round(min_size * eval_test_size)))
        eval_begn = complete_data_begn.tail(
            n=int(round(min_size * eval_test_size)))

        eval_path.dropna(inplace=True)
        eval_begn.dropna(inplace=True)

        complete_training = pd.concat([train_path, train_begn
                                       ]).drop_duplicates(keep='first')
        complete_training = complete_training[complete_training.columns.drop(
            list(complete_training.filter(regex='pred|flag')))]
        complete_training.dropna(inplace=True)

        # Some stats on Pathogenic and Benign variant numbers in both training and testing dataframes

        logger.info('Training - Path : ' + str(complete_training[
            complete_training['True_Label'] == 1].shape[0]))
        logger.info('Training - Benign : ' + str(complete_training[
            complete_training['True_Label'] == -1].shape[0]))

        min_size_eval = min(eval_path.shape[0], eval_begn.shape[0])
        complete_eval = pd.concat([
            eval_path.sample(frac=1).head(min_size_eval),
            eval_begn.sample(frac=1).head(min_size_eval)
        ]).drop_duplicates(keep='first')

        logger.info(
            'Testing - Path : ' +
            str(complete_eval[complete_eval['True_Label'] == 1].shape[0]))
        logger.info(
            'Testing - Benign : ' +
            str(complete_eval[complete_eval['True_Label'] == -1].shape[0]))

        # Dumping data

        complete_training.to_csv(path_or_buf=ARGS['output_dir'] +
                                 '/TRAIN/training.csv.gz',
                                 sep='\t',
                                 compression='gzip',
                                 encoding='utf-8',
                                 index=False)

        complete_eval.to_csv(path_or_buf=ARGS['output_dir'] +
                             '/TEST/testing.csv.gz',
                             sep='\t',
                             compression='gzip',
                             encoding='utf-8',
                             index=False)

    check_dir_train = False
    if os.path.isdir(ARGS['output_dir'] + '/TRAIN/Models'):
        check_dir_train = True
    if (ARGS['force_training'] is True) or (check_dir_train is False):
        # Training model
        # TrainingClassification(input_data=ARGS['output_dir'] + '/TRAIN/training.csv.gz',
        #                        classifiers=classifiers,
        #                        standardize=ARGS['standardize'],
        #                        output=ARGS["output_dir"],
        #                        logger=logger,
        #                        cv=ARGS['cross_validation']
        #                        )

        TestingClassification(input_data=ARGS['output_dir'] +
                              '/TEST/testing.csv.gz',
                              standardize=ARGS['standardize'],
                              output_dir=ARGS["output_dir"],
                              model_dir=ARGS['model'],
                              logger=logger,
                              threshold=ARGS['threshold'])

        # Generation of a histogram showing the most important features used in the built model
        # histo_weights.histo_and_metrics(folder=ARGS['output_dir'], logger=logger)

    # This parameter, if enabled, builds all possible combinations from a single dataframe if sources are mentioned
    # Example : A global dataframe based on 3 databases (2 pathogenic : Clinvar and HGMD and 1 benign : gnomAD) was generated
    # The following lines will generate 2 evaluation sets : (clinvar|gnomAD) and (HGMD|gnomAD) with various MAF thresholds (<0.01, <0.001, 0.0001, AC=1(singleton), AF=0)
    # and each of these combinations will be tested with the previously generated outputs. (Overlap is checked between these combinations and the training dataset)

    if ARGS['eval'] and ARGS['eval'].endswith('.csv.gz'):
        # TODO : CHANGE NAME
        print('\n\n')
        logger.info('--BUILDING & TESTING ON EVALUATION SETS--')

        output_dir = ARGS['output_dir']
        eval_output_dir = output_dir
        eval_output_dir = eval_output_dir.split('/')
        eval_output_dir[-1] = 'EVALUATION_SETS_' + eval_output_dir[-1]
        eval_output_dir = "/".join(eval_output_dir)

        if os.path.isdir(eval_output_dir):
            pass
        else:
            utils.mkdir(eval_output_dir)

            # if ARGS['list_columns'] and ARGS['flag']:
            combination_pandas.combination_pandas(
                ARGS['eval'],
                output_dir + '/TRAIN/training.csv.gz',
                eval_output_dir,
                logger,
                list_columns,
                flag,
                CV=ARGS['cross_validation_evaluation'])
        # else:
        # 	combination_pandas.combination_pandas(ARGS['eval'], ARGS['output_dir'] + '/TRAIN/training.csv.gz', output_dir, CV=ARGS['cross_validation_evaluation'])
        l_dir = os.listdir(eval_output_dir)
        print(list(zip(l_dir)))
        parmap.starmap(test_eval_mp,
                       list(zip(l_dir)),
                       pm_pbar=True,
                       pm_processes=ARGS['threads'])

    # Plots are automatically generated to visualize performance across various scenarios for the different combinations
    print('\n\n')
    logger.info('--GENERATING PLOTS & STATS--')
    utils.mkdir(eval_output_dir + '/PLOTS_AND_MEAN_TABLE')
    # maf_plot.violin_plot_scores(eval_output_dir, logger)
    # maf_plot.maf_plot_maf_0(eval_output_dir, ARGS['cross_validation_evaluation'], logger)
    maf_plot.maf_plot_others(eval_output_dir,
                             ARGS['cross_validation_evaluation'], logger)
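
The dataset balancing above boils down to a small piece of arithmetic: t = proportion / (1 - proportion) caps the majority class at t times the minority class, and ratio sets the share reserved for evaluation. A standalone sketch of that computation (not part of the original module, shown only to make the sizing explicit):

def balanced_sizes(n_path, n_benign, proportion, ratio):
    # t is the allowed majority/minority ratio derived from --proportion
    t = round(proportion / (1 - proportion), 2)
    max_size, min_size = max(n_path, n_benign), min(n_path, n_benign)
    if max_size > t * min_size:
        max_size = min_size * t
    elif max_size < t * min_size:
        min_size = max_size / t
    # --ratio is the share kept aside for the evaluation/test split
    n_train_minority = int(round(min_size * (1 - ratio)))
    n_train_majority = int(round(max_size * (1 - ratio)))
    n_eval = int(round(min_size * ratio))
    return n_train_minority, n_train_majority, n_eval

# Example: proportion=0.5 (balanced classes), ratio=0.2 keeps 20% of the minority class for testing
print(balanced_sizes(n_path=1200, n_benign=50000, proportion=0.5, ratio=0.2))  # (960, 960, 240)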
Example #7
def prediction(ARGS):
    return_df = True
    # BASIC
    list_columns = list(sorted(ARGS['list_columns']))
    flag = list(sorted(ARGS['flag']))
    input_file = ARGS['input']
    output_file = input_file.replace('.csv.gz', '_MISTIC.csv.gz')

    output_dir = ARGS['output_dir']
    model_dir = ARGS['model']
    select = ARGS['wt_select']

    utils.mkdir(output_dir)

    # IMPORT DF
    data = pd.read_csv(filepath_or_buffer=input_file,
                       sep='\t',
                       compression='gzip',
                       encoding='utf-8',
                       low_memory=False)
    data['ID'] = data['ID'].str.lstrip('chr_')

    # SELECT GOOD COLUMNS
    if select is True:
        data = select_columns_pandas.select_columns_pandas(data,
                                                           list_columns,
                                                           flag,
                                                           progress_bar=False,
                                                           fill=True,
                                                           dropna=False)
        col_ordered = ['ID', 'True_Label'] + list(
            sorted(set(list(data.columns)) - set(['ID', 'True_Label'])))
        data = data[col_ordered]
    if select is False:
        data = data[list_columns + flag]

    data['True_Label'] = data['True_Label'].replace(-1, 0)

    if 'Amino_acids' in list_columns:
        l_cols = [e for e in list_columns if e != 'Amino_acids']
    else:
        l_cols = list_columns

    data_scoring = data.dropna(subset=l_cols)

    # IMPORT SKLEARN MODELS
    classifiers = dict()
    log = list()
    for mod in glob.glob(model_dir + "/*.pkl"):
        sk_model = joblib.load(mod)
        classifiers[os.path.basename(mod).replace('.pkl', '')] = sk_model
        name = os.path.basename(mod).replace('.pkl', '')
        data_scoring[name + '_proba'] = sk_model.predict_proba(
            data_scoring[l_cols])[:, 1]
        data_scoring[name + '_pred'] = sk_model.predict(data_scoring[l_cols])
        data = pd.concat(
            [data, data_scoring[[name + '_proba', name + '_pred']]], axis=1)

    col_ordered = ['ID', 'True_Label'] + list(
        sorted(set(list(data.columns)) - set(['ID', 'True_Label'])))
    data = data[col_ordered]
    with_maf = data[data['gnomAD_exomes_AF'] != 0]
    without_maf = data[data['gnomAD_exomes_AF'] == 0]
    data['MISTIC_pred'] = pd.concat(
        [with_maf['MISTIC_VC_pred'], without_maf['MISTIC_LR_pred']],
        axis=0).sort_index()
    data['MISTIC_proba'] = pd.concat(
        [with_maf['MISTIC_VC_proba'], without_maf['MISTIC_LR_proba']],
        axis=0).sort_index()
    data.drop([
        'MISTIC_VC_pred', 'MISTIC_VC_proba', 'MISTIC_LR_pred',
        'MISTIC_LR_proba'
    ],
              axis=1,
              inplace=True)

    if return_df is False:
        data.to_csv(output_file, compression='gzip', index=False, sep='\t')
    elif return_df is True:
        return data
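
The final combined score above is assembled by routing each variant to one of two sub-models depending on the gnomAD allele frequency and re-aligning on the original index; a toy illustration of that pd.concat / sort_index pattern (column and model names here are placeholders, not the real MISTIC columns):

import pandas as pd

df = pd.DataFrame({
    'gnomAD_exomes_AF': [0.0, 0.01, 0.0, 0.2],
    'model_A_proba':    [0.9, 0.4, 0.7, 0.1],   # used when AF != 0
    'model_B_proba':    [0.8, 0.3, 0.6, 0.2],   # used when AF == 0
})
with_maf = df[df['gnomAD_exomes_AF'] != 0]
without_maf = df[df['gnomAD_exomes_AF'] == 0]
# Concatenating the two disjoint slices and sorting by index restores one value per row
df['combined_proba'] = pd.concat(
    [with_maf['model_A_proba'], without_maf['model_B_proba']], axis=0).sort_index()
print(df['combined_proba'].tolist())  # [0.8, 0.4, 0.6, 0.1]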
Example #8
import os
import argparse
from DatasetClass import MyDataset
from datetime import datetime
from utils.engine import train_one_epoch, evaluate
import utils.helper as helper
import src.utils.utils as utils

# ----------------------------------------------- Default Arguments & Variables ----------------------------------------

# File name of this runtime
now = datetime.now()
filename = now.strftime("%Y_%b_%d_%Hh_%Mm")
# Make dir to save the resulting data from training
PATH = '../models/model_ces_' + filename
utils.mkdir(PATH)
# Defaults
batch_size = 1
epochs = 1
optimizer_type = 'sgd'
lr = 0.1
# Aux
best_mAP = 0

# ----------------------------------------------- Parsed Arguments -----------------------------------------------------

# Initiate the parser
parser = argparse.ArgumentParser()

# Add long and short argument
parser.add_argument("--batch_size", help="Set batch size.")
Example #9
def train_fsa_rnn(args, paths):
    logger = Logger()

    # config = Config_Integrate(args)

    dset = load_classification_dataset(args.dataset)
    t2i, i2t, in2i, i2in = dset['t2i'], dset['i2t'], dset['in2i'], dset['i2in']
    query_train, intent_train = dset['query_train'], dset['intent_train']
    query_dev, intent_dev = dset['query_dev'], dset['intent_dev']
    query_test, intent_test = dset['query_test'], dset['intent_test']

    len_stats(query_train)
    len_stats(query_dev)
    len_stats(query_test)
    # extend the padding
    # add pad <pad> to the last of vocab
    i2t[len(i2t)] = '<pad>'
    t2i['<pad>'] = len(i2t) - 1

    train_query, train_query_inverse, train_lengths = pad_dataset(
        query_train, args, t2i['<pad>'])
    dev_query, dev_query_inverse, dev_lengths = pad_dataset(
        query_dev, args, t2i['<pad>'])
    test_query, test_query_inverse, test_lengths = pad_dataset(
        query_test, args, t2i['<pad>'])
    shots = int(len(train_query) * args.train_portion)
    if args.use_unlabel:
        all_pred_train, all_pred_dev, all_pred_test, all_out_train, all_out_dev, all_out_test = PredictByRE(
            args)
        intent_data_train = ATISIntentBatchDatasetUtilizeUnlabel(
            train_query, train_query_inverse, train_lengths, intent_train,
            all_pred_train, all_out_train, shots)
    elif args.train_portion == 0:
        # special case when train portion==0 and do not use unlabel data, should have no data
        intent_data_train = None
    else:
        intent_data_train = ATISIntentBatchDatasetBidirection(
            train_query, train_query_inverse, train_lengths, intent_train,
            shots)

    # should have no/few dev data in low-resource setting
    if args.train_portion == 0:
        intent_data_dev = None
    elif args.train_portion <= 0.01:
        intent_data_dev = ATISIntentBatchDatasetBidirection(
            dev_query, dev_query_inverse, dev_lengths, intent_dev, shots)
    else:
        intent_data_dev = ATISIntentBatchDatasetBidirection(
            dev_query, dev_query_inverse, dev_lengths, intent_dev)
    intent_data_test = ATISIntentBatchDatasetBidirection(
        test_query,
        test_query_inverse,
        test_lengths,
        intent_test,
    )

    intent_dataloader_train = DataLoader(
        intent_data_train, batch_size=args.bz) if intent_data_train else None
    intent_dataloader_dev = DataLoader(
        intent_data_dev, batch_size=args.bz) if intent_data_dev else None
    intent_dataloader_test = DataLoader(intent_data_test, batch_size=args.bz)

    print('len train dataset {}'.format(
        len(intent_data_train) if intent_data_train else 0))
    print('len dev dataset {}'.format(
        len(intent_data_dev) if intent_data_dev else 0))
    print('len test dataset {}'.format(len(intent_data_test)))
    print('num labels: {}'.format(len(in2i)))
    print('num vocabs: {}'.format(len(t2i)))

    forward_params = dict()
    forward_params['V_embed_extend'], forward_params['pretrain_embed_extend'], forward_params['mat'], forward_params['bias'], \
    forward_params['D1'], forward_params['D2'], forward_params['language_mask'], forward_params['language'], forward_params['wildcard_mat'], \
    forward_params['wildcard_mat_origin_extend'] = \
        get_init_params(args, in2i, i2in, t2i, paths[0])

    if args.bidirection:
        backward_params = dict()
        backward_params['V_embed_extend'], backward_params['pretrain_embed_extend'], backward_params['mat'], backward_params['bias'], \
        backward_params['D1'], backward_params['D2'], backward_params['language_mask'], backward_params['language'], backward_params['wildcard_mat'], \
        backward_params['wildcard_mat_origin_extend'] = \
            get_init_params(args, in2i, i2in, t2i, paths[1])

    # get h1 for FSAGRU
    h1_forward = None
    h1_backward = None
    if args.farnn == 1:
        args.farnn = 0
        temp_model = FSARNNIntegrateEmptyStateSaperateGRU(
            pretrained_embed=forward_params['pretrain_embed_extend'],
            trans_r_1=forward_params['D1'],
            trans_r_2=forward_params['D2'],
            embed_r=forward_params['V_embed_extend'],
            trans_wildcard=forward_params['wildcard_mat'],
            config=args,
        )
        input_x = torch.LongTensor([[t2i['BOS']]])
        if torch.cuda.is_available():
            temp_model.cuda()
            input_x = input_x.cuda()
        h1_forward = temp_model.viterbi(input_x, None).detach()
        h1_forward = h1_forward.reshape(-1)

        if args.bidirection:
            temp_model = FSARNNIntegrateEmptyStateSaperateGRU(
                pretrained_embed=backward_params['pretrain_embed_extend'],
                trans_r_1=backward_params['D1'],
                trans_r_2=backward_params['D2'],
                embed_r=backward_params['V_embed_extend'],
                trans_wildcard=backward_params['wildcard_mat'],
                config=args,
            )
            input_x = torch.LongTensor([[t2i['EOS']]])
            if torch.cuda.is_available():
                temp_model.cuda()
                input_x = input_x.cuda()
            h1_backward = temp_model.viterbi(input_x, None).detach()
            h1_backward = h1_backward.reshape(-1)

        args.farnn = 1

    if args.bidirection:
        model = IntentIntegrateSaperateBidirection_B(
            pretrained_embed=forward_params['pretrain_embed_extend'],
            forward_params=forward_params,
            backward_params=backward_params,
            config=args,
            h1_forward=h1_forward,
            h1_backward=h1_backward)
    else:
        model = IntentIntegrateSaperate_B(
            pretrained_embed=forward_params['pretrain_embed_extend'],
            trans_r_1=forward_params['D1'],
            trans_r_2=forward_params['D2'],
            embed_r=forward_params['V_embed_extend'],
            trans_wildcard=forward_params['wildcard_mat'],
            config=args,
            mat=forward_params['mat'],
            bias=forward_params['bias'],
            h1_forward=h1_forward,
        )

    if args.loss_type == 'CrossEntropy':
        criterion = torch.nn.CrossEntropyLoss()
    elif args.loss_type == 'NormalizeNLL':
        criterion = relu_normalized_NLLLoss
    else:
        print("Wrong loss function")

    if args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    weight_decay=0)
    if args.optimizer == 'ADAM':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=0)

    if torch.cuda.is_available():
        model = model.cuda()

    pytorch_total_params = sum(p.numel() for p in model.parameters()
                               if p.requires_grad)
    print('ALL TRAINABLE PARAMETERS: {}'.format(pytorch_total_params))

    # TRAIN
    acc_train_init, avg_loss_train_init, train_init_p, train_init_r = val(
        model,
        intent_dataloader_train,
        epoch=0,
        mode='TRAIN',
        config=args,
        i2in=i2in,
        logger=logger,
        criterion=criterion)
    # DEV
    acc_dev_init, avg_loss_dev_init, dev_init_p, dev_init_r = val(
        model,
        intent_dataloader_dev,
        epoch=0,
        mode='DEV',
        config=args,
        i2in=i2in,
        logger=logger,
        criterion=criterion)
    # TEST
    acc_test_init, avg_loss_test_init, test_init_p, test_init_r = val(
        model,
        intent_dataloader_test,
        epoch=0,
        mode='TEST',
        config=args,
        i2in=i2in,
        logger=logger,
        criterion=criterion)

    print("{} INITIAL: ACC: {}, LOSS: {}, P: {}, R: {}".format(
        'TRAIN', acc_train_init, avg_loss_train_init, train_init_p,
        train_init_r))
    print("{} INITIAL: ACC: {}, LOSS: {}, P: {}, R: {}".format(
        'DEV', acc_dev_init, avg_loss_dev_init, dev_init_p, dev_init_r))
    print("{} INITIAL: ACC: {}, LOSS: {}, P: {}, R: {}".format(
        'TEST', acc_test_init, avg_loss_test_init, test_init_p, test_init_r))
    logger.add("{} INITIAL: ACC: {}, LOSS: {}, P: {}, R: {}".format(
        'TRAIN', acc_train_init, avg_loss_train_init, train_init_p,
        train_init_r))
    logger.add("{} INITIAL: ACC: {}, LOSS: {}, P: {}, R: {}".format(
        'DEV', acc_dev_init, avg_loss_dev_init, dev_init_p, dev_init_r))
    logger.add("{} INITIAL: ACC: {}, LOSS: {}, P: {}, R: {}".format(
        'TEST', acc_test_init, avg_loss_test_init, test_init_p, test_init_r))

    if args.only_probe:
        exit(0)

    best_dev_acc = acc_dev_init
    counter = 0
    best_dev_model = deepcopy(model)

    if not intent_dataloader_train: args.epoch = 0

    for epoch in range(1, args.epoch + 1):
        avg_loss = 0
        acc = 0

        pbar_train = tqdm(intent_dataloader_train)
        pbar_train.set_description("TRAIN EPOCH {}".format(epoch))

        model.train()
        for batch in pbar_train:

            optimizer.zero_grad()

            x_forward = batch['x_forward']
            x_backward = batch['x_backward']
            label = batch['i'].view(-1)
            lengths = batch['l']

            if torch.cuda.is_available():
                x_forward = batch['x_forward'].cuda()
                x_backward = batch['x_backward'].cuda()
                lengths = lengths.cuda()
                label = label.cuda()

            if args.bidirection:
                scores = model(x_forward, x_backward, lengths)
            else:
                scores = model(x_forward, lengths)

            loss = criterion(scores, label)

            loss.backward()
            optimizer.step()
            avg_loss += loss.item()

            acc += (scores.argmax(1) == label).sum().item()

            pbar_train.set_postfix_str(
                "{} - total right: {}, total loss: {}".format(
                    'TRAIN', acc, loss))

        acc = acc / len(intent_data_train)
        avg_loss = avg_loss / len(intent_data_train)
        print("{} Epoch: {} | ACC: {}, LOSS: {}".format(
            'TRAIN', epoch, acc, avg_loss))
        logger.add("{} Epoch: {} | ACC: {}, LOSS: {}".format(
            'TRAIN', epoch, acc, avg_loss))

        # DEV
        acc_dev, avg_loss_dev, p, r = val(model,
                                          intent_dataloader_dev,
                                          epoch,
                                          'DEV',
                                          logger,
                                          config=args,
                                          criterion=criterion)
        counter += 1  # counter for early stopping

        if (acc_dev is None) or (acc_dev > best_dev_acc):
            counter = 0
            best_dev_acc = acc_dev
            best_dev_model = deepcopy(model)

        if counter > args.early_stop:
            break

    best_dev_test_acc, avg_loss_test, best_dev_test_p, best_dev_test_r \
        = val(best_dev_model, intent_dataloader_test, epoch, 'TEST', logger, config=args, criterion=criterion)

    # Save the model
    datetime_str = create_datetime_str()
    model_save_path = "../model/{}/D{:.4f}-T{:.4f}-DI{:.4f}-TI{:.4f}-{}-{}-{}".format(
        args.run, best_dev_acc, best_dev_test_acc, acc_dev_init, acc_test_init,
        datetime_str, args.dataset, args.seed)
    mkdir("../model/{}/".format(args.run))
    mkdir(model_save_path)
    print("SAVING MODEL {} .....".format(model_save_path))
    torch.save(model.state_dict(), model_save_path + '.model')

    return acc_dev_init, acc_test_init, best_dev_acc, best_dev_test_acc, best_dev_test_p, best_dev_test_r, logger.record
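
The training loop above uses a patience counter for early stopping: every epoch increments the counter, any dev improvement resets it and snapshots the model, and training stops once the counter exceeds args.early_stop. A condensed sketch of that pattern in isolation (function names are placeholders):

def early_stopping_loop(train_epoch, evaluate_dev, max_epochs, patience):
    # train_epoch() runs one epoch; evaluate_dev() returns the current dev accuracy
    best_dev_acc = evaluate_dev()
    counter = 0
    for epoch in range(1, max_epochs + 1):
        train_epoch()
        acc_dev = evaluate_dev()
        counter += 1                      # one more epoch without improvement
        if acc_dev > best_dev_acc:
            best_dev_acc = acc_dev
            counter = 0                   # improvement: reset the patience counter
        if counter > patience:
            break                         # no improvement for `patience` epochs
    return best_dev_acc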
Example #10
def stats_exomes_1000G(directory):
	pred_dict = {
		"ClinPred_score_pred": "ClinPred_flag",
		"PrimateAI_score_pred": "PrimateAI_flag",
		"M-CAP_score_pred": "M-CAP_flag",
		"REVEL_score_pred": "REVEL_flag",
		"Eigen-raw_coding_pred": "Eigen-raw_coding_flag",
		"fathmm-XF_coding_score_pred": "fathmm-XF_coding_flag",
		"MISTIC_pred": "MISTIC_proba",
	}

	check = ['Eigen-raw_coding_pred', 'hsEigen-raw_coding_pred', 'M-CAP_score_pred', 'hsM-CAP_score_pred',
	         'PrimateAI_score_pred', 'hsPrimateAI_score_pred', 'ClinPred_score_pred', 'hsClinPred_score_pred',
	         'REVEL_score_pred', 'hsREVEL_score_pred', 'fathmm-XF_coding_score_pred',
	         'hsfathmm-XF_coding_score_pred', 'MISTIC_pred', 'hsMISTIC_pred']

	mkdir(directory + '/PLOTS_AND_TABLES')

	m = multiprocessing.Manager()
	df_l = m.list()

	l_dir = list(sorted(os.listdir(directory)))
	directory = directory + '/' if directory.endswith('/') is False else directory
	l_dir = [directory + f for f in l_dir if 'PLOTS' not in f]
	parmap.starmap(mp_exome, list(zip(l_dir)), pred_dict, df_l, check, pm_pbar=True)
	df_l = list(df_l)


	sorter = ['Eigen', 'PrimateAI', 'FATHMM-XF', 'ClinPred', 'REVEL', 'M-CAP', 'MISTIC']
	df_stats = pd.DataFrame(df_l).sort_values(by=['Classifier'])
	df_stats.Classifier = df_stats.Classifier.str.split('_')
	df_stats.Classifier = df_stats.Classifier.str[0]
	df_stats.Classifier = df_stats.Classifier.str.replace('-raw', '')
	df_stats.Classifier = df_stats.Classifier.str.replace('fathmm-XF', 'FATHMM-XF')


	hs_df = df_stats.copy()
	hs_df.Classifier = hs_df.Classifier.astype('category')
	hs_df.Classifier.cat.set_categories(sorter, inplace=True)



	hs_df.loc[hs_df['Percentile index'].isna() == True, 'Causative index'] = np.nan


	for ylim, name in zip([(0,50),(-5,350)], ['zoom', 'full_scale']):

		f = plt.figure(figsize=(6, 4))
		sns.violinplot(x="Classifier", y="Causative index", data=hs_df,   palette=["#0c2461", "#22a6b3", "#9b59b6", "#2196F3", "#4CAF50", "#FF9800", "#f44336"], showfliers=False, cut=0.1, linewidth=2, inner="box", scale='width')
		plt.xticks(rotation=45)

		plt.xlabel('')
		plt.ylabel('Ranking of causative deleterious variants')
		plt.ylim(ylim[0], ylim[1])
		f.tight_layout()
		plt.savefig(directory + '/PLOTS_AND_TABLES/Ranking_exomes_index_{}.png'.format(name), dpi=300)
		plt.close()

	for ylim, name in zip([(80,100),(0,100)], ['zoom', 'full_scale']):

		f = plt.figure(figsize=(8, 6))
		sns.boxplot(x="Classifier", y="Percentile index", data=hs_df,   palette=["#bdc3c7"], showfliers=False,)
		sns.stripplot(x="Classifier", y="Percentile index", data=hs_df,  palette=["#0c2461", "#22a6b3", "#9b59b6", "#2196F3", "#4CAF50", "#FF9800", "#f44336"], alpha=0.25, linewidth=0.4)
		f.tight_layout()
		plt.xlabel('')
		plt.ylabel('Percentile rank of causative variant')
		plt.ylim(ylim[0], ylim[1])
		plt.savefig(directory + '/PLOTS_AND_TABLES/Ranking_exomes_percentile_{}.png'.format(name), dpi=300)
		plt.close()

	sns.set_context("paper", font_scale=1)


	plt.figure(figsize=(6, 4))
	hs_df['Path_amount'] = 100 * (hs_df['Pathogenic detected']) / (
				hs_df['Pathogenic detected'] + hs_df['Benign detected'])
	plt.grid(axis='y', alpha=0.2)

	sns.violinplot(x="Classifier", y="Ratio_path_percentage", data=hs_df, palette=["#0c2461", "#22a6b3", "#9b59b6", "#2196F3", "#4CAF50", "#FF9800", "#f44336"], showfliers=False, inner="box", scale='width', linewidth=2)
	plt.xticks(rotation=45)

	plt.ylabel('Percentage of predicted deleterious variants')
	plt.xlabel('')

	l_hs = list()
	for clf in list(hs_df['Classifier'].unique()):
		d_hs = dict()
		values_path_amount = hs_df.loc[hs_df['Classifier'] == clf]['Ratio_path_percentage'].values
		d_hs['Classifier'] = clf
		d_hs['Path_amount_mean'] = np.mean(values_path_amount)
		d_hs['Path_amount_median'] = np.median(values_path_amount)
		d_hs['Path_amount_std'] = np.std(values_path_amount)
		values_causative_index = hs_df.loc[hs_df['Classifier'] == clf]['Causative index'].values
		d_hs['Causative_index_mean'] = np.mean(values_causative_index)
		d_hs['Causative_index_median'] = np.median(values_causative_index)
		d_hs['Causative_index_std'] = np.std(values_causative_index)
		l_hs.append(d_hs)

	final_results_mean_df = pd.DataFrame(l_hs)
	final_results_mean_df.Classifier = final_results_mean_df.Classifier.astype('category')
	final_results_mean_df.Classifier.cat.set_categories(sorter, inplace=True)
	final_results_mean_df.sort_values(by='Classifier').T.to_excel(directory + '/PLOTS_AND_TABLES/Results_mean_median_std.xlsx')


	plt.tight_layout()
	plt.savefig(directory + '/PLOTS_AND_TABLES/Pathogenic_number_test.png', dpi=600)
	plt.close()

	df_stats = df_stats[['Classifier', 'Filename', 'Exome_name', 'Source', 'Causative_variant', 'Benign detected', 'No score',
	                     'Pathogenic detected', 'Ratio_path_percentage', 'Causative index', 'Percentile index', 'Score', 'Prediction']]

	df_stats.to_csv(directory + '/PLOTS_AND_TABLES/Stats_exomes_raw.csv', sep='\t')

	cols = ['Benign detected', 'Pathogenic detected', 'Ratio_path_percentage', 'No score', 'Causative index', 'Percentile index', 'Score', 'Prediction']
	merge_df = pd.DataFrame()
	for col in cols:
		df_stats_predictors = df_stats[['Classifier', 'Filename', 'Exome_name', 'Source', col]].pivot(
			index='Exome_name', columns='Classifier', values=col).add_suffix('_' + col.replace(' ', '_'))
		df_stats_predictors.index.name = None
		df_stats_predictors.columns.name = None
		merge_df = pd.concat([merge_df, df_stats_predictors], axis=1)
	df_stats = df_stats[['Filename', 'Exome_name', 'Source', 'Causative_variant']].drop_duplicates(keep='first')
	df_stats.set_index('Exome_name', inplace=True, )
	concat_df = pd.concat([df_stats, merge_df], axis=1, sort=True)
	concat_df = concat_df[concat_df.columns.drop(list(concat_df.filter(regex='^hs')))]
	concat_df = concat_df[concat_df.columns.drop(list(concat_df.filter(regex='_detected|_Prediction|Source|No_score')))]


	stats_df = concat_df.copy()
	stats_df.loc['mean'] = concat_df.mean()
	stats_df.loc['25']   = concat_df.quantile(0.25)
	stats_df.loc['50']   = concat_df.quantile(0.5)
	stats_df.loc['75']   = concat_df.quantile(0.75)
	stats_df.loc['90']   = concat_df.quantile(0.9)
	stats_df.loc['95']   = concat_df.quantile(0.95)
	stats_df.loc['99']   = concat_df.quantile(0.99)
	stats_df.loc['std']  = concat_df.std()
	stats_df.to_excel(directory + '/PLOTS_AND_TABLES/Stats_exomes.xlsx', index=True)


	clfs = ['Eigen', 'PrimateAI', 'FATHMM-XF', 'ClinPred', 'REVEL', 'M-CAP', 'MISTIC', ]
	d = collections.defaultdict(dict)
	for clf in clfs:
		d[clf]['Path_detected_t_test'] = ttest_ind(hs_df[hs_df['Classifier'] == clf]['Ratio_path_percentage'].dropna().values, hs_df[hs_df['Classifier'] == 'MISTIC']['Ratio_path_percentage'].dropna().values)[1]
		d[clf]['Causative_index_t_test'] = ttest_ind(hs_df[hs_df['Classifier'] == clf]['Causative index'].dropna().values, hs_df[hs_df['Classifier'] == 'MISTIC']['Causative index'].dropna().values)[1]
		d[clf]['Path_detected_mannwhitneyu'] = mannwhitneyu(hs_df[hs_df['Classifier'] == clf]['Ratio_path_percentage'].dropna().values, hs_df[hs_df['Classifier'] == 'MISTIC']['Ratio_path_percentage'].dropna().values)[1]
		d[clf]['Causative_index_mannwhitneyu'] = mannwhitneyu(hs_df[hs_df['Classifier'] == clf]['Causative index'].dropna().values, hs_df[hs_df['Classifier'] == 'MISTIC']['Causative index'].dropna().values)[1]
		d[clf]['Path_detected_kruskal'] = kruskal(hs_df[hs_df['Classifier'] == clf]['Ratio_path_percentage'].dropna().values, hs_df[hs_df['Classifier'] == 'MISTIC']['Ratio_path_percentage'].dropna().values)[1]
		d[clf]['Causative_index_kruskal'] = kruskal(hs_df[hs_df['Classifier'] == clf]['Causative index'].dropna().values, hs_df[hs_df['Classifier'] == 'MISTIC']['Causative index'].dropna().values)[1]
	pd.DataFrame(d).to_excel(directory + '/PLOTS_AND_TABLES/Test_stats.xlsx')
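
The wide per-exome table above is produced by pivoting each metric on Exome_name x Classifier and suffixing the resulting columns; a toy illustration of that reshaping step with made-up values:

import pandas as pd

long_df = pd.DataFrame({
    'Exome_name': ['E1', 'E1', 'E2', 'E2'],
    'Classifier': ['MISTIC', 'REVEL', 'MISTIC', 'REVEL'],
    'Causative index': [1, 4, 2, 7],
})
wide = (long_df.pivot(index='Exome_name', columns='Classifier', values='Causative index')
               .add_suffix('_Causative_index'))
print(wide)
# Columns become MISTIC_Causative_index and REVEL_Causative_index, one row per exome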
Example #11
        dict(
            list_columns=list_columns[1:],
            flag=flag,
            input=deleterious_df_path,
            output_dir=os.path.dirname(deleterious_df_path),
            model=model_dir,
            wt_select=True,
        ))

    deleterious_df = deleterious_df.sample(frac=1, random_state=1)

    directory = directory + '/' if directory.endswith(
        '/') is False else directory
    output_dir = output_dir + '/' if output_dir.endswith(
        '/') is False else output_dir
    utils.mkdir(output_dir)

    l_dir = list(sorted(os.listdir(directory)))[:deleterious_df.shape[0]]
    exomes_id = [e.replace('.csv.gz', '') for e in l_dir]
    l_dir = [directory + file for file in l_dir if file.endswith('.csv.gz')]
    deleterious_df = deleterious_df.filter(
        regex='flag|pred|proba|ID|True_Label', axis=1)

    deleterious_df['Source'] = pd.Series(exomes_id)
    col_ordered = ['ID', 'True_Label'] + list(
        sorted(
            set(list(deleterious_df.columns)) -
            set(['ID', 'True_Label', 'MISTIC_proba', 'MISTIC_pred']))) + [
                'MISTIC_proba', 'MISTIC_pred'
            ]
    deleterious_df = deleterious_df[col_ordered]
Example #12
def plot_roc_curve_training(dict_y_test, results_proba, output_dir):
    """
    Method for plot ROC comparison between algorithms

    Args:
        dict_y_test: dict
            Store the y_test used for each iteration of CV
        results_proba : dict
            Store the proba obtained for each iteration of CV for every algorithm used
        output_dir: str
            Directory where will be save the plots
    Returns:
        None
    """

    output_dir = output_dir + '/Plots'
    utils.mkdir(output_dir)
    dict_auc = dict()
    ordered_dict = collections.defaultdict(dict)

    matplotlib.rcParams.update({'font.size': 8})

    for algo, results in results_proba.items():
        fig_algo = figure(num=algo, dpi=180, facecolor='w', edgecolor='k')
        plt.figure(algo)
        tprs = list()
        aucs = list()
        mean_fpr = np.linspace(0, 1, 100)
        for index, arrays in results.items():
            fpr, tpr, thresholds = metrics.roc_curve(dict_y_test[index],
                                                     arrays)
            tprs.append(np.interp(mean_fpr, fpr, tpr))
            tprs[-1][0] = 0.0
            roc_auc = metrics.auc(fpr, tpr)
            aucs.append(roc_auc)

            plt.plot(fpr,
                     tpr,
                     lw=1,
                     alpha=0.3,
                     label=r'ROC CV n°%s (AUC = %0.2f)' % (index, roc_auc))

        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = metrics.auc(mean_fpr, mean_tpr)

        std_auc = np.std(aucs)
        plt.plot(mean_fpr,
                 mean_tpr,
                 color='red',
                 label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' %
                 (mean_auc, std_auc),
                 lw=2,
                 alpha=.8)
        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

        dict_auc[algo] = mean_auc

        ordered_dict[algo]['mean_tpr'] = mean_tpr
        ordered_dict[algo]['mean_fpr'] = mean_fpr
        ordered_dict[algo]['std_auc'] = std_auc
        ordered_dict[algo]['std_tpr'] = std_tpr
        ordered_dict[algo]['tprs_upper'] = tprs_upper
        ordered_dict[algo]['tprs_lower'] = tprs_lower

        plt.fill_between(mean_fpr,
                         tprs_lower,
                         tprs_upper,
                         color='grey',
                         alpha=.2,
                         label=r'$\pm$ 1 std. dev.')
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating curve : ' + str(algo) + ' | ' +
                  str(index) + ' fold cross-validation')
        plt.legend(loc="lower right")

        plt.savefig(output_dir + "/ROC_" + str(algo) + ".png",
                    bbox_inches='tight',
                    dpi=180)

        plt.close(fig_algo)

    fig_glob = figure(num='Global', dpi=180, facecolor='w', edgecolor='k')
    plt.figure('Global')

    ordered_dict_auc = sorted(dict_auc.items(),
                              key=operator.itemgetter(1),
                              reverse=True)

    for elem in ordered_dict_auc:
        algo = elem[0]
        mean_auc = elem[1]

        try:

            plt.plot(ordered_dict[algo]['mean_fpr'],
                     ordered_dict[algo]['mean_tpr'],
                     label=r'Mean ROC : ' + str(algo) +
                     ' - AUC = %0.2f $\pm$ %0.2f' %
                     (mean_auc, ordered_dict[algo]['std_auc']),
                     lw=2,
                     alpha=.8)
        except:
            plt.plot(ordered_dict[algo]['mean_fpr'],
                     ordered_dict[algo]['mean_tpr'],
                     label=r'Mean ROC : ' + str(algo) + ' - AUC = %0.2f' %
                     (mean_auc),
                     lw=2,
                     alpha=.8)

        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating curve : Global results | ' + str(index) +
                  ' fold cross-validation')
        plt.legend(loc="lower right")
    plt.savefig(output_dir + "/ROC_Summary.png", bbox_inches='tight', dpi=180)
    plt.close(fig_glob)
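
The per-fold curves in plot_roc_curve_training are averaged by interpolating every fold's TPR onto a common FPR grid before taking the mean; the core of that step on synthetic scores (the data here is random and only meant to exercise the pattern):

import numpy as np
from sklearn import metrics

rng = np.random.RandomState(0)
mean_fpr = np.linspace(0, 1, 100)
tprs = []
for _ in range(3):                                   # three mock CV folds
    y_true = rng.randint(0, 2, size=200)
    y_score = np.clip(y_true * 0.6 + rng.rand(200) * 0.6, 0, 1)
    fpr, tpr, _ = metrics.roc_curve(y_true, y_score)
    interp_tpr = np.interp(mean_fpr, fpr, tpr)       # resample onto the shared grid
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
print('Mean AUC over folds: %.3f' % metrics.auc(mean_fpr, mean_tpr))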