Exemple #1
0
    def create_report_biblecom(self):
        '''
        Fill the ``verses`` column of ``self.df_biblecom`` and write the
        bible.com crawl report.

        Every ``*.biblecom.txt`` file found under ``self.output_path`` is
        parsed as ``<iso>_<trans_ID>.biblecom.txt``; the number of entries
        returned by ``FileUtility.load_list`` for that file is stored in the
        row whose ``language_iso`` and ``trans_ID`` match. Rows without a
        file keep the initial value 0. The table is written to
        ``reports/crawl_report_biblecom.tsv`` (tab separated, no index) and
        ``self.generate_final_rep()`` is called afterwards.
        '''
        self.df_biblecom['verses'] = 0

        biblecom_files = FileUtility.recursive_glob(self.output_path + '/', '*.biblecom.txt')
        for bib_file in biblecom_files:
            # "<iso>_<trans_ID>.biblecom.txt" -> "<iso>_<trans_ID>"
            stem = bib_file.split('/')[-1].split('.')[0:-1][0:-1][-1]
            iso, code = stem.split('_')
            length = len(FileUtility.load_list(bib_file))
            row_mask = (self.df_biblecom['language_iso'] == iso) & (self.df_biblecom['trans_ID'] == int(code))
            # A single .loc call is required here: chained indexing
            # (.loc[:, col][mask] = value) may assign to a temporary copy
            # and leave the frame itself unchanged.
            self.df_biblecom.loc[row_mask, 'verses'] = length
        # trans_ID stays a regular column because it is listed in `columns`.
        self.df_biblecom.to_csv(self.output_path + '/reports/crawl_report_biblecom.tsv', sep='\t', index=False, columns=['language_iso','trans_ID','language_name','verses'])
        self.generate_final_rep()
Exemple #2
0
 def generate_final_rep(self):
     '''
     Merge all per-source crawl reports into one aggregated report.

     Every ``crawl_report_<source>.tsv`` under ``<output_path>/reports/`` is
     read, reduced to the columns trans_ID, language_iso, language_name and
     verses, and tagged with a ``source`` column holding ``<source>``.
     The concatenation is kept in ``self.aggregated_rep`` and written to
     ``<output_path>/reports/final_rep.tsv`` (tab separated, no index).

     :raises ValueError: from ``pd.concat`` when no report file is found
     '''
     rep_files = FileUtility.recursive_glob(self.output_path + '/reports/', 'crawl_report_*.tsv')
     report_columns = ['trans_ID', 'language_iso', 'language_name', 'verses']
     per_source = []
     for report_file in rep_files:
         # "crawl_report_<source>.tsv" -> "<source>"
         version = report_file.split('/')[-1].split('.')[0].split('_')[-1]
         # assign() returns a new frame, so nothing is ever set on a slice
         # of the parsed table and no extra copy is needed.
         per_source.append(pd.read_table(report_file)[report_columns].assign(source=version))
     df_s = pd.concat(per_source)
     # trans_ID stays a regular column because it is listed in `columns`.
     self.aggregated_rep = df_s
     df_s.to_csv(self.output_path + '/reports/final_rep.tsv', sep='\t', index=False, columns=['language_iso','trans_ID','language_name','verses','source'])
Exemple #3
0
 def get_list_of_bible_trans_path_by_lang(self, lang):
     '''
     Collect the translation files available for one language.

     :param lang: 3-letter code of the language; files matching
                  ``<lang>*.txt`` under ``self.path`` are considered
     :return: list of [translation name, file path] pairs,
              e.g. [[goodnews, path1], ...]
     '''
     id_path_pairs = []
     for trans_path in FileUtility.recursive_glob(self.path, lang + '*.txt'):
         # file name up to its first dot
         base_name = trans_path.split('/')[-1].split('.')[0]
         # the translation name is the part after the last '-', spaces removed
         trans_name = base_name.split('-')[-1].replace(' ', '')
         id_path_pairs.append([trans_name, trans_path])
     return id_path_pairs
    def get_stats_samples(self, k_mer):
        '''
        Get the D_R and D_S statistics for every sampling size.

        ``self.M`` files are drawn at random (from ``self.input_dir`` when it
        is already a collection, otherwise from the files ending in
        ``self.seqtype`` below that directory). For each sampling size and
        each file two k-mer distributions are requested from
        ``self._get_kmer_distribution``: the resampled ones
        (``sample_size``, ``self.n_resamples``) and one with arguments
        ``(-1, 1)``, which is compared against every resample.

        :param k_mer: k-mer size handed to ``self._get_kmer_distribution``
        :return: tuple ``(x, y, error, y_tot, error_tot)`` of equally long
                 lists: the sampling sizes, mean/std over files of the mean
                 ``get_kl_rows`` value among the resamples, and mean/std of
                 the values between each resample and the ``(-1, 1)``
                 distribution
        :raises ValueError: from ``random.sample`` when fewer than
                 ``self.M`` files are available
        '''
        x = []
        y = []
        y_tot = []
        error = []
        error_tot = []
        # To find the files
        if isinstance(self.input_dir, str):
            sample_files = FileUtility.recursive_glob(self.input_dir,
                                                      "*" + self.seqtype)
        else:
            sample_files = self.input_dir
        sample_files = random.sample(sample_files, self.M)

        # To iterate over the sampling sizes
        for sample_size in self.sampling_sizes:
            distance_i = []
            tot_dist_i = []
            print(' sampling size ', sample_size, ' is started ...')
            # To iterate over random files
            for sample_file in sample_files:
                comp_dist = self._get_kmer_distribution(
                    sample_file, k_mer, -1, 1)
                resamples_kmers = np.array(
                    self._get_kmer_distribution(sample_file, k_mer,
                                                sample_size,
                                                self.n_resamples))
                distance_i.append(np.mean(get_kl_rows(resamples_kmers)))
                # The reference distribution is stacked as the last row, so
                # its column index equals the number of resample rows. The
                # index used to be hard-coded to 10, which is only right for
                # exactly ten resamples.
                # assumes get_kl_rows returns a square row-by-row matrix
                # -- TODO confirm
                n_rows = resamples_kmers.shape[0]
                with_reference = get_kl_rows(
                    np.vstack((resamples_kmers, comp_dist[0])))
                tot_dist_i.extend(with_reference[0:n_rows, n_rows])
            print(' sampling size ', sample_size, ' is completed.')
            x.append(sample_size)
            y.append(np.mean(distance_i))
            error.append(np.std(distance_i))
            y_tot.append(np.mean(tot_dist_i))
            error_tot.append(np.std(tot_dist_i))
        return x, y, error, y_tot, error_tot
Exemple #5
0
def create_excel_file(input_path, output_path):
    '''
    Summarize pickled classification results in a two-sheet Excel workbook.

    Every ``*.pickle`` file below ``input_path`` is expected to be named
    ``<feature>_CV_<scheme>_<classifier>.pickle`` and to hold a 9-item
    result whose last two items are
    ``(cv_predictions_pred, cv_predictions_trues, isolates)`` and
    ``(Y_test_pred, Y_test)``. For each file precision, recall, F1, macro F1
    and accuracy (rounded to 2 digits) are computed for the held-out test
    predictions and for the cross-validation predictions.

    The sheets ``Test`` and ``Cross-validation`` are sorted by macro F1
    (descending), then feature and classifier. Nothing is written when no
    pickle file is found.

    :param input_path: directory searched recursively for ``*.pickle`` files
    :param output_path: path of the Excel file to create
    '''
    files_cv = FileUtility.recursive_glob(input_path, '*.pickle')
    if len(files_cv) == 0:
        return
    files_cv.sort()

    import warnings
    warnings.filterwarnings('ignore')

    columns = ['feature', 'CV', 'classifier', 'accuracy', 'Precision', 'Recall', 'F1', 'macroF1']
    sort_keys = ['macroF1', 'feature', 'classifier']
    sort_order = [False, True, True]

    def score_row(rep, cv_scheme, classifier, y_true, y_pred):
        # one table row: identification of the run plus its rounded metrics
        return {
            'feature': rep,
            'CV': cv_scheme,
            'classifier': classifier,
            'Precision': np.round(precision_score(y_true, y_pred), 2),
            'Recall': np.round(recall_score(y_true, y_pred), 2),
            'F1': np.round(f1_score(y_true, y_pred), 2),
            'macroF1': np.round(f1_score(y_true, y_pred, average='macro'), 2),
            'accuracy': np.round(accuracy_score(y_true, y_pred), 2),
        }

    rows_test = []
    rows_cv = []
    for pickle_path in files_cv:
        # NOTE(review): load_obj presumably unpickles the file; only feed it
        # result files from a trusted source.
        (_, _, _, _, _, _, _,
         (cv_predictions_pred, cv_predictions_trues, _),
         (Y_test_pred, Y_test)) = FileUtility.load_obj(pickle_path)
        # Parse the file name only, so a '_CV_' in a directory name cannot
        # shift the fields.
        name_parts = pickle_path.split('/')[-1].split('_CV_')
        rep = name_parts[0]
        CV_scheme = name_parts[1].split('_')[0]
        classifier = name_parts[1].split('_')[1].split('.')[0]

        rows_test.append(score_row(rep, CV_scheme, classifier, Y_test, Y_test_pred))
        rows_cv.append(score_row(rep, CV_scheme, classifier, cv_predictions_trues, cv_predictions_pred))

    # Build the frames once, after the loop, instead of on every iteration.
    df_test = pd.DataFrame(rows_test, columns=columns).sort_values(sort_keys, ascending=sort_order)
    df_cv = pd.DataFrame(rows_cv, columns=columns).sort_values(sort_keys, ascending=sort_order)

    # The context manager saves and closes the workbook; ExcelWriter.save()
    # no longer exists in pandas 2.x.
    with pd.ExcelWriter(output_path) as writer:
        df_test.to_excel(writer, sheet_name='Test', index=False)
        df_cv.to_excel(writer, sheet_name='Cross-validation', index=False)
Exemple #6
0
 def __init__(self, path):
     '''
     Locate the bible corpus and index its translation files.

     Sets ``self.bible_id_rgx`` (matches a line that starts, after optional
     whitespace, with a digit), ``self.path`` and ``self.all_bible_files``
     (all ``*.txt`` files found recursively below ``path``).

     :param path: directory of bible corpus
     :raises SystemExit: with status 1 when ``path`` is not accessible
     '''
     # Raw string: '\s' inside a normal literal is an invalid escape sequence.
     self.bible_id_rgx = re.compile(r'^\s*[0-9]')
     if not os.access(path, os.F_OK):
         print(
             "\nError: Permission denied or could not find the directory of bible corpus!"
         )
         # Raise SystemExit directly: the exit() helper is only injected by
         # the site module and reports success (status 0).
         raise SystemExit(1)
     print('Bible directory has been found successfully!')
     self.path = path
     self.all_bible_files = FileUtility.recursive_glob(self.path, '*.txt')
     print('%d bible translations have been found!' %
           len(self.all_bible_files))
    @staticmethod
    def load_precalculated(file_path):
        '''
        Read back a previously stored result object.

        :param file_path: location handed to FileUtility.load_obj
        :return: whatever FileUtility.load_obj yields for that location
        '''
        precalculated = FileUtility.load_obj(file_path)
        return precalculated


if __name__ == '__main__':
    # test-case: k-mer bootstrapping over the samples listed in the
    # body-site configuration files

    # every entry of every *.txt list names one sample; add the extension
    list_of_files = [
        sample_name + '.fsa'
        for site_list in FileUtility.recursive_glob(
            '/mounts/data/proj/asgari/github_repos/microbiomephenotype/data_config/bodysites/',
            '*.txt')
        for sample_name in FileUtility.load_list(site_list)
    ]
    fasta_files, mapping = FileUtility.read_fasta_directory(
        '/mounts/data/proj/asgari/dissertation/datasets/deepbio/microbiome/hmb_data/',
        'fsa',
        only_files=list_of_files)
    BS = BootStrapping(fasta_files, 'body', seqtype='fsa', M=10)
    for k in range(3, 9):
        print(k)
        BS.add_kmer_sampling(k)
Exemple #8
0
    def get_stats_samples_npe(self, npe_file, npe_size):
        '''
        NPE counterpart of the k-mer sampling statistics.

        An ``NPE`` segmenter is built from ``npe_file`` and a count
        vectorizer is set up over the vocabulary read from the same file.
        ``self.M`` files are drawn at random; for every sampling size and
        file the resampled NPE distributions are compared with each other
        and with the distribution obtained with arguments ``(-1, 1)``.

        :param npe_file: path of the NPE merge-operations file
        :param npe_size: merge size handed to ``NPE``
        :return: tuple ``(x, y, error, y_tot, error_tot)`` of equally long
                 lists: the sampling sizes, mean/std over files of the mean
                 ``get_kl_rows`` value among the resamples, and mean/std of
                 the values between each resample and the ``(-1, 1)``
                 distribution
        :raises ValueError: from ``random.sample`` when fewer than
                 ``self.M`` files are available
        '''
        x = []
        y = []
        y_tot = []
        error = []
        error_tot = []

        # NOTE(review): the handle is left open on purpose; whether NPE still
        # reads from it after construction cannot be verified from here.
        f = open(npe_file, 'r')
        npe_Applier = NPE(f, separator='', merge_size=npe_size)
        # Vocabulary: every entry after the first one, with whitespace and
        # the '</w>' marker removed, lower-cased, de-duplicated and sorted.
        npe_vocab = sorted({
            ''.join(entry.split()).replace('</w>', '').lower()
            for entry in FileUtility.load_list(npe_file)[1::]
        })
        npe_vectorizer = TfidfVectorizer(use_idf=False,
                                         vocabulary=npe_vocab,
                                         analyzer='word',
                                         norm=None,
                                         stop_words=[],
                                         lowercase=True,
                                         binary=False,
                                         tokenizer=str.split)
        # To find the files
        if isinstance(self.input_dir, str):
            sample_files = FileUtility.recursive_glob(self.input_dir,
                                                      "*" + self.seqtype)
        else:
            sample_files = self.input_dir
        sample_files = random.sample(sample_files, self.M)

        # To iterate over the sampling sizes
        for sample_size in self.sampling_sizes:
            distance_i = []
            tot_dist_i = []
            print(' sampling size ', sample_size, ' is started ...')
            # To iterate over random files
            for sample_file in sample_files:
                comp_dist = self._get_npe_distribution(sample_file,
                                                       npe_Applier,
                                                       npe_vectorizer, -1, 1)
                resamples_npes = np.array(
                    self._get_npe_distribution(sample_file, npe_Applier,
                                               npe_vectorizer, sample_size,
                                               self.n_resamples))
                distance_i.append(np.mean(get_kl_rows(resamples_npes)))
                # The reference distribution is stacked as the last row, so
                # its column index equals the number of resample rows. The
                # index used to be hard-coded to 10, which is only right for
                # exactly ten resamples.
                # assumes get_kl_rows returns a square row-by-row matrix
                # -- TODO confirm
                n_rows = resamples_npes.shape[0]
                with_reference = get_kl_rows(
                    np.vstack((resamples_npes, comp_dist[0])))
                tot_dist_i.extend(with_reference[0:n_rows, n_rows])
            print(' sampling size ', sample_size, ' is completed.')
            x.append(sample_size)
            y.append(np.mean(distance_i))
            error.append(np.std(distance_i))
            y_tot.append(np.mean(tot_dist_i))
            error_tot.append(np.std(tot_dist_i))
        return x, y, error, y_tot, error_tot
Exemple #9
0
    def read_data(self):
        '''
        Parse the project XML at ``self.genml_path`` and build what it declares.

        Steps, in order:
        - read the output directory and the name from the first <project>
          element; when ``self.override`` is set and the output directory
          exists, ask interactively whether to delete it; create it if absent
        - create an intermediate representation for every ``*.uniq.mat`` file
          of each <tables> directory, for each single <table>, and a k-mer
          table for each <sequence>
        - copy the phenotype list and the phylogenetic tree into
          ``<output>/metadata/`` (only when missing or ``self.override``)
          and call ``tree2mat_group`` on the tree copy
        - save the collected log lines to ``<output>/logfile``

        Sets ``self.xmldoc``, ``self.project``, ``self.output``,
        ``self.project_name``, ``self.representation_path`` and
        ``self.metadata_path``.
        '''
        self.xmldoc = minidom.parse(self.genml_path)

        # project section
        self.project = self.xmldoc.getElementsByTagName('project')
        project_node = self.project[0]
        self.output = project_node.attributes['output'].value
        self.project_name = project_node.attributes['name'].value

        if self.override and os.path.exists(self.output):
            answer = input("Delete existing files at the output path? (y/n)")
            if answer == 'y':
                shutil.rmtree(self.output)
        if not os.path.exists(self.output):
            os.makedirs(self.output)

        log_file = self.output + '/' + 'logfile'
        log_info = ['Project ' + self.project_name]

        self.representation_path = self.output + '/intermediate_rep/'
        IC = IntermediateRepCreate(self.representation_path)

        # directories holding several tables
        for table_dir in self.xmldoc.getElementsByTagName('tables'):
            dir_path = table_dir.attributes['path'].value
            dir_normalization = table_dir.attributes['normalization'].value
            dir_prefix = table_dir.firstChild.nodeValue.strip() + '_'
            # only the appended '_' is present: the element text was empty
            if len(dir_prefix) == 1:
                dir_prefix = ''
            for mat_file in FileUtility.recursive_glob(dir_path, '*.uniq.mat'):
                table_name = dir_prefix + mat_file.split('/')[-1]
                log_info.append(
                    IC.create_table(mat_file, table_name, dir_normalization, self.override))

        # individually declared tables
        for table_node in self.xmldoc.getElementsByTagName('table'):
            table_path = table_node.attributes['path'].value
            table_normalization = table_node.attributes['normalization'].value
            given_name = table_node.firstChild.nodeValue.strip()
            # fall back to the file name when the element text is empty
            if given_name == '':
                table_name = table_path.split('/')[-1]
            else:
                table_name = given_name
            log_info.append(
                IC.create_table(table_path, table_name, table_normalization, self.override))

        # sequences -> k-mer tables (at most 4 cores are used)
        for sequence_node in self.xmldoc.getElementsByTagName('sequence'):
            sequence_path = sequence_node.attributes['path'].value
            kmer_size = int(sequence_node.attributes['kmer'].value)
            log_info.append(
                IC.create_kmer_table(sequence_path, kmer_size, cores=min(self.cores, 4), override=self.override))

        # metadata directory
        self.metadata_path = self.output + '/metadata/'
        if not os.path.exists(self.metadata_path):
            os.makedirs(self.metadata_path)

        # phenotype list first, then the tree: copy each one unless a copy
        # already exists and overriding is off
        for tag_name, copy_name in (('phenotype', 'phenotypes.txt'),
                                    ('phylogentictree', 'phylogentictree.txt')):
            nodes = self.xmldoc.getElementsByTagName(tag_name)
            copy_path = self.metadata_path + copy_name
            if self.override or not os.path.exists(copy_path):
                FileUtility.save_list(copy_path,
                                      FileUtility.load_list(nodes[0].attributes['path'].value))
        tree2mat_group(self.metadata_path + 'phylogentictree.txt', n_group=20)

        FileUtility.save_list(log_file, log_info)
Exemple #10
0
    def predict_block(self, ultimate=False):
        '''
        Run the classification experiments declared by the <predict>
        elements of ``self.xmldoc``.

        For every <predict> element and every phenotype of
        ``GenotypePhenotypeAccess(self.output)`` the cross-validation folds
        are created (tree based when the <eval> text is 'tree', random
        otherwise) and each requested classifier (svm, rf, lr) is tuned on
        every feature combination whose result pickle is missing or when
        ``self.override`` is set.

        :param ultimate: when True, fold creation, model tuning and the
                         selection of top features are skipped; only the
                         output directories are prepared
        :return: None
        '''
        import warnings
        from sklearn.exceptions import DataConversionWarning, FitFailedWarning, UndefinedMetricWarning, ConvergenceWarning
        # silence the warning categories listed below for the rest of the process
        warnings.filterwarnings(action='ignore', category=DataConversionWarning)
        warnings.filterwarnings(action='ignore', category=FitFailedWarning)
        warnings.filterwarnings(action='ignore', category=DeprecationWarning)
        warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
        warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
        
        predict_blocks = self.xmldoc.getElementsByTagName('predict')
        predict_path=self.output+'/classifications/'

        # iterate over predict block
        for predict in predict_blocks:
            # Sub prediction
            FileUtility.ensure_dir(predict_path)
            setting_name=predict.attributes['name'].value
            subdir=predict_path+setting_name+'/'

            FileUtility.ensure_dir(subdir)
            ## label mapping
            # maps the text of each <label> element to its integer 'value' attribute
            labels=predict.getElementsByTagName('labels')[0].getElementsByTagName('label')
            mapping=dict()
            for label in labels:
                val=label.attributes['value'].value
                phenotype=label.firstChild.nodeValue.strip()
                mapping[phenotype]=int(val)

            ## optimizing for ..
            optimization=predict.getElementsByTagName('optimize')[0].firstChild.nodeValue.strip()
            ## number of folds
            # the text of <eval> selects the fold scheme; its attributes give the sizes
            self.cvbasis=predict.getElementsByTagName('eval')[0].firstChild.nodeValue.strip()
            folds=int(predict.getElementsByTagName('eval')[0].attributes['folds'].value)
            test_ratio=float(predict.getElementsByTagName('eval')[0].attributes['test'].value)

            # NOTE(review): an unsupported score is only reported; execution
            # continues and `optimization` is not used again in this method.
            if optimization not in ['accuracy','scores_r_1','scores_f1_1','scores_f1_0','f1_macro','f1_micro']:
                print ('Error in choosing optimization score')

            ## Genotype tables
            GPA=GenotypePhenotypeAccess(self.output)
            ## iterate over phenotypes if there exist more than one
            # NOTE: the loop variable reuses the name `phenotype` from the label loop above
            for phenotype in GPA.phenotypes:
                print ('working on phenotype ',phenotype)
                FileUtility.ensure_dir(subdir+phenotype+'/')
                ## create cross-validation
                FileUtility.ensure_dir(subdir+phenotype+'/cv/')
                # both stay empty strings when `ultimate` is True
                cv_file=''
                cv_test_file=''
                if not ultimate:
                    if self.cvbasis=='tree':
                        FileUtility.ensure_dir(subdir+phenotype+'/cv/tree/')
                        # folds are (re)created only when missing or when overriding
                        if self.override or not FileUtility.exists(subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_folds.txt'])):
                            GPA.create_treefold(subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_folds.txt']), self.metadata_path + 'phylogentictree.txt', folds, test_ratio, phenotype, mapping)
                        cv_file=subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_folds.txt'])
                        cv_test_file=subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_test.txt'])
                    else:
                        FileUtility.ensure_dir(subdir+phenotype+'/cv/rand/')
                        if self.override or not FileUtility.exists(subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_folds.txt'])):
                            GPA.create_randfold(subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_folds.txt']), folds, test_ratio, phenotype, mapping)
                        cv_file=subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_folds.txt'])
                        cv_test_file=subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_test.txt'])

                # feature names: file names of the *.npz files below the
                # representation directory with '_feature_vect.npz' removed
                features=[x.split('/')[-1].replace('_feature_vect.npz','') for x in FileUtility.recursive_glob(self.representation_path, '*.npz')]
                feature_combinations=[]
                ## TODO: ask as an input
                max_length_feature_comb = 3#len(features)

                # with the maximum fixed to 3 the range is just (3,): only
                # combinations of exactly three features are produced, and
                # none at all when fewer than three features exist
                for x in [[list(x) for x in list(itertools.combinations(features,r))] for r in range(3,max_length_feature_comb+1)]:
                    feature_combinations+=x


                ## iterate over feature sets
                for feature_setting in feature_combinations:
                    # classifier names: element names found under every
                    # <model> block (text nodes are skipped)
                    classifiers=[]
                    for model in predict.getElementsByTagName('model'):
                        for x in model.childNodes:
                            if not x.nodeName=="#text":
                                classifiers.append(x.nodeName)
                    if not ultimate:
                        X, Y, feature_names, final_strains = GPA.get_xy_prediction_mats(feature_setting, phenotype, mapping)

                        # drop the last dot-separated part of each feature name
                        # (the remaining parts are joined without dots)
                        feature_setting =[''.join(feature.split('.')[0:-1]) if len(feature.split('.'))>1 else feature for feature in feature_setting]
                        feature_text='##'.join(feature_setting)

                        ## iterate over classifiers
                        for classifier in tqdm.tqdm(classifiers):
                            # a model is tuned only when its result pickle is
                            # missing or when overriding is requested
                            basepath_cls=subdir+phenotype+'/'+feature_text+'_CV_'+self.cvbasis
                            if classifier.lower()=='svm' and (not FileUtility.exists(basepath_cls+'_SVM.pickle') or self.override):
                                Model = SVM(X, Y)
                                Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,njobs=self.cores, feature_names=feature_names, params=[{'C': [1000, 500, 200, 100, 50, 20, 10, 5, 2, 1, 0.2, 0.5, 0.01, 0.02, 0.05, 0.001]}])
                            if classifier.lower()=='rf' and  (not FileUtility.exists(basepath_cls+'_RF.pickle') or self.override):
                                Model = RFClassifier(X, Y)
                                Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,njobs=self.cores, feature_names=feature_names)
                            if classifier.lower()=='lr' and (not FileUtility.exists(basepath_cls+'_LR.pickle') or self.override):
                                Model = LogRegression(X, Y)
                                Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,njobs=self.cores, feature_names=feature_names)
                            #if classifier.lower()=='dnn':
                            #    Model = DNN(X, Y)
                            #    Model.tune_and_eval(subdir+phenotype+'/'+'_'.join([feature]),njobs=self.cores, kfold=10)
                        # generate selected features
                        # (runs once per feature combination, after its classifiers)
                        FileUtility.ensure_dir(self.output+'/'+'ultimate_outputs/')
                        print ('Select the top markers..')
                        generate_top_features(self.output, [x.upper() for x in classifiers], topk=200)
                FileUtility.ensure_dir(subdir+phenotype+'/'+'final_results/')
                #create_excel_file(subdir+phenotype+'/', subdir+phenotype+'/final_results/classification_res.xlsx')


        FileUtility.ensure_dir(self.output+'/'+'ultimate_outputs/')
Exemple #11
0
def checkArgs(args):
    '''
    Parse the command line and run the requested functionality.

    ``--bootstrapping`` is exclusive: when it is given, the other modes are
    not evaluated. ``--genkmer`` and ``--train_predictor`` may be combined
    and run in that order.

    :param args: unused, kept for backward compatibility; argparse reads
                 ``sys.argv`` itself
    :return: a non-empty error message when the arguments are invalid or no
             mode was requested, otherwise False
    '''
    parsedArgs = _build_arg_parser().parse_args()

    if parsedArgs.bootstrapping:
        return _run_bootstrapping(parsedArgs)

    mode_requested = False

    if parsedArgs.genkmer:
        mode_requested = True
        err = _run_genkmer(parsedArgs)
        if err:
            return err

    if parsedArgs.train_predictor:
        mode_requested = True
        err = _run_train_predictor(parsedArgs)
        if err:
            return err

    if not mode_requested:
        # The error used to be built in an `else` attached to the
        # train_predictor test and was then discarded by `return False`.
        print('others')
        return "\nError: You need to specify an input corpus file!"

    return False


def _build_arg_parser():
    '''Create the argument parser; -h or wrong usage prints the correct usage.'''
    parser = argparse.ArgumentParser()

    # top level ######################################################################################################
    parser.add_argument('--bootstrapping',
                        action='store_true',
                        help='To enable classification and parameter tuning')

    parser.add_argument(
        '--genkmer',
        action='store_true',
        help=
        'To enable generation of representations for input fasta file or directory of 16S rRNA samples'
    )

    parser.add_argument('--train_predictor',
                        action='store_true',
                        help='To enable classification and parameter tuning')

    # boot strapping #################################################################################################
    parser.add_argument('--indir',
                        action='store',
                        dest='input_dir_bootstrapping',
                        default=False,
                        type=str,
                        help='bootstrapping: directory of 16S rRNA samples',
                        required='--bootstrapping' in sys.argv)

    # generate k-mers ################################################################################################
    parser.add_argument(
        '--inaddr',
        action='store',
        dest='genrep_input_addr',
        default=False,
        type=str,
        help=
        'genkmer: Generate representations for input fasta file or directory of 16S rRNA samples',
        required='--genkmer' in sys.argv)

    # classification ################################################################################################
    parser.add_argument(
        '--x',
        action='store',
        dest='X',
        type=str,
        default=False,
        help=
        'train_predictor: The data in the npy format rows are instances and columns are features'
    )

    parser.add_argument(
        '--y',
        action='store',
        dest='Y',
        type=str,
        default=False,
        help=
        'train_predictor: The labels associated with the rows of classifyX, each line is a associated with a row'
    )

    parser.add_argument(
        '--model',
        action='store',
        dest='model',
        type=str,
        default=False,
        choices=[False, 'RF', 'SVM', 'DNN'],
        help='train_predictor: choice of classifier from RF, SVM, DNN')

    parser.add_argument(
        '--batchsize',
        action='store',
        dest='batch_size',
        type=int,
        default=10,
        help='train_predictor-model/DNN: batch size for deep learning')

    parser.add_argument(
        '--gpu_id',
        action='store',
        dest='gpu_id',
        type=str,
        default='0',
        help='train_predictor-model/DNN: GPU id for deep learning')

    parser.add_argument(
        '--epochs',
        action='store',
        dest='epochs',
        type=int,
        default=100,
        help='train_predictor-model/DNN: number of epochs for deep learning')

    parser.add_argument(
        '--arch',
        action='store',
        dest='dnn_arch',
        type=str,
        default='1024,0.2,512',
        help=
        'train_predictor-model/DNN: The comma separated definition of neural network layers connected to eahc other, you do not need to specify the input and output layers, values between 0 and 1 will be considered as dropouts'
    )

    # general to bootstrap  and rep ##################################################################################
    parser.add_argument('--filetype',
                        action='store',
                        dest='filetype',
                        type=str,
                        default='fastq',
                        help='fasta fsa fastq etc')

    # bootstrap ################################################################################
    parser.add_argument('--kvals',
                        action='store',
                        dest='kvals',
                        type=str,
                        default='3,4,5,6,7,8',
                        help='Comma separated k-mer values 2,3,4,5,6')

    parser.add_argument('--nvals',
                        action='store',
                        dest='nvals',
                        type=str,
                        default='10,20,50,100,200,500,1000,2000,5000,10000',
                        help='Comma separated sample sizes')

    # rep / classifier ################################################################################
    parser.add_argument('--cores',
                        action='store',
                        dest='cores',
                        default=4,
                        type=int,
                        help='Number of cores to be used')

    # rep ##################################################################################
    parser.add_argument(
        '--KN',
        action='store',
        dest='K_N',
        default=None,
        type=str,
        help=
        'pair of comma separated Kmer:sub-sample-size ==> 2:100,6:-1 (N=-1 means using all sequences)'
    )

    parser.add_argument('--out',
                        action='store',
                        dest='output_addr',
                        type=str,
                        default='out',
                        help='Out put directory')

    parser.add_argument('--in',
                        action='store',
                        dest='input_addr',
                        type=str,
                        default=None,
                        help='Input fasta file or directory of samples')

    parser.add_argument('--name',
                        action='store',
                        dest='data_name',
                        type=str,
                        default=None,
                        help='name of the dataset')

    return parser


def _path_exists(path):
    '''Return True when *path* was supplied and exists; a missing value (default False) never reaches os.access.'''
    return bool(path) and os.access(path, os.F_OK)


def _ensure_output_dir(path):
    '''Create *path* when nothing exists there; return True only if it was created.'''
    if os.path.exists(path):
        return False
    os.makedirs(path)
    return True


def _last_path_component(path):
    '''Return the last component of *path*, ignoring trailing slashes.'''
    return path.rstrip('/').split('/')[-1]


def _run_bootstrapping(parsedArgs):
    '''Validate the bootstrapping arguments and run MicroPheno.bootstrapping; return an error message or False.'''
    print('Bootstrapping requested..\n')
    if not _path_exists(parsedArgs.input_dir_bootstrapping):
        return "\nError: Permission denied or could not find the directory!"

    _ensure_output_dir(parsedArgs.output_addr)

    if len(
            FileUtility.recursive_glob(parsedArgs.input_dir_bootstrapping,
                                       '*' + parsedArgs.filetype)) == 0:
        return "\nThe filetype " + parsedArgs.filetype + " could not find the directory!"

    if not parsedArgs.data_name:
        parsedArgs.data_name = _last_path_component(
            parsedArgs.input_dir_bootstrapping)

    try:
        k_values = [int(x) for x in parsedArgs.kvals.split(',')]
        n_values = [int(x) for x in parsedArgs.nvals.split(',')]
    except ValueError:
        return "\n k-mers or sampling sizes are not fed correctly; see the help with -h!"

    MicroPheno.bootstrapping(parsedArgs.input_dir_bootstrapping,
                             parsedArgs.output_addr,
                             parsedArgs.data_name,
                             filetype=parsedArgs.filetype,
                             k_values=k_values,
                             sampling_sizes=n_values)
    return False


def _run_genkmer(parsedArgs):
    '''Validate the representation-creation arguments and run it for a directory input; return an error message or False.'''
    if not _path_exists(parsedArgs.genrep_input_addr):
        return "\nError: Permission denied or could not find the directory!"

    if not os.path.isdir(parsedArgs.genrep_input_addr):
        # a single input file is only acknowledged; nothing is generated for it
        print('Representation creation requested for file ' +
              parsedArgs.genrep_input_addr + '\n')
        return False

    print('Representation creation requested for directory ' +
          parsedArgs.genrep_input_addr + '\n')
    _ensure_output_dir(parsedArgs.output_addr)

    if len(
            FileUtility.recursive_glob(parsedArgs.genrep_input_addr,
                                       '*' + parsedArgs.filetype)) == 0:
        return "\nThe filetype " + parsedArgs.filetype + " could not find the directory!"

    if not parsedArgs.data_name:
        parsedArgs.data_name = _last_path_component(
            parsedArgs.genrep_input_addr)

    if not parsedArgs.K_N:
        return "\nWrong format for KN (k-mer sample sizes)!"
    sampling_dict = dict()
    try:
        # "k:n,k:n,..." -> {k: [n, ...]}
        for pair in parsedArgs.K_N.split(','):
            k, n = pair.split(':')
            sampling_dict.setdefault(int(k), []).append(int(n))
    except ValueError:
        return "\nWrong format for KN (k-mer sample sizes)!"

    MicroPheno.representation_creation_dir(parsedArgs.genrep_input_addr,
                                           parsedArgs.output_addr,
                                           parsedArgs.data_name,
                                           parsedArgs.cores,
                                           filetype=parsedArgs.filetype,
                                           sampling_dict=sampling_dict)
    return False


def _run_train_predictor(parsedArgs):
    '''Validate the classification arguments and train the chosen model; return an error message or False.'''
    print('Classification and parameter tuning requested..\n')
    err = ""
    if not parsedArgs.model:
        err = err + "\nNo classification model is specified"
    if not _path_exists(parsedArgs.X):
        return err + "\nError: Permission denied or could not find the X!"
    if not _path_exists(parsedArgs.Y):
        return err + "\nError: Permission denied or could not find the Y!"
    if err:
        # report the missing model before any directory is created
        return err

    if _ensure_output_dir(parsedArgs.output_addr):
        print(parsedArgs.output_addr, ' directory created')

    if not parsedArgs.data_name:
        parsedArgs.data_name = parsedArgs.X.split('/')[-1].split('.')[0]

    if parsedArgs.model == 'DNN':
        # Deep learning: values above 1 are layer sizes, the others dropouts
        try:
            arch = [
                int(layer) if float(layer) > 1 else float(layer)
                for layer in parsedArgs.dnn_arch.split(',')
            ]
        except ValueError:
            return "\nWrong format for arch (DNN layers)!"
        MicroPheno.DNN_classifier(parsedArgs.X, parsedArgs.Y, arch,
                                  parsedArgs.output_addr,
                                  parsedArgs.data_name, parsedArgs.gpu_id,
                                  parsedArgs.epochs, parsedArgs.batch_size)
    elif parsedArgs.model in ['SVM', 'RF']:
        # SVM and Random Forest
        MicroPheno.classical_classifier(parsedArgs.X, parsedArgs.Y,
                                        parsedArgs.model,
                                        parsedArgs.output_addr,
                                        parsedArgs.data_name,
                                        parsedArgs.cores)
    else:
        return "\nNot able to recognize the model!"
    return False
# This file displays results from the pickle files.
# Provide the location of the result files as the first command-line argument (stored in path2res).

# Works for .pickle files only.
# Every pickle file found under that path is read, so make sure it only contains the resulting pickle files.

import sys
# add the current working directory to the import path; this has to happen
# before the project import below
sys.path.append('./')
from utility.file_utility import FileUtility
import numpy as np
# first command-line argument: location of the result files
path2res = sys.argv[1]
# every *.pickle file found below that location
files = FileUtility.recursive_glob(path2res, '*.pickle')
import warnings
# silence all warnings from here on
warnings.filterwarnings("ignore")

# In[8]:


def get_cv_res(filename):
    try:
        [
            label_set, conf, best_score_, best_estimator_, cv_results_,
            best_params_, pred
        ] = FileUtility.load_obj(filename)

        res = dict()
        print(filename.split('/')[-1] + " ", conf)
        #print (cv_results_.keys())
        idx = np.argmax(cv_results_['mean_test_f1_macro'])
        res['f1_macro'] = np.round(cv_results_['mean_test_f1_macro'][idx], 2)
        res['f1_macro*'] = str(