def create_report_biblecom(self):
    """Write the bible.com crawl report and refresh the aggregated report.

    Resets the ``verses`` column of ``self.df_biblecom`` to 0, then, for every
    ``*.biblecom.txt`` file found under ``self.output_path``, stores the number
    of entries returned by ``FileUtility.load_list`` in the row whose
    ``language_iso`` and ``trans_ID`` match the file name. The table is written
    to ``<output_path>/reports/crawl_report_biblecom.tsv`` and
    ``self.generate_final_rep()`` is called afterwards.

    :raises ValueError: if a file name does not have the
        ``<iso>_<trans_ID>.biblecom.txt`` shape (unpacking / ``int`` fails).
    """
    self.df_biblecom['verses'] = 0
    biblecom_files = FileUtility.recursive_glob(self.output_path + '/', '*.biblecom.txt')
    for bib_file in biblecom_files:
        # The third dot-separated part from the end is "<iso>_<trans_ID>".
        iso, code = bib_file.split('/')[-1].split('.')[-3].split('_')
        length = len(FileUtility.load_list(bib_file))
        row_mask = (self.df_biblecom['language_iso'] == iso) & (self.df_biblecom['trans_ID'] == int(code))
        # Single .loc call: the former chained form `.loc[:, 'verses'][mask] = ...`
        # may assign into a temporary copy and leave the frame unchanged.
        self.df_biblecom.loc[row_mask, 'verses'] = length
    # NOTE: the previous `self.df_biblecom.set_index('trans_ID')` discarded its
    # result and therefore had no effect; it has been dropped.
    self.df_biblecom.to_csv(self.output_path + '/reports/crawl_report_biblecom.tsv',
                            sep='\t', index=False,
                            columns=['language_iso', 'trans_ID', 'language_name', 'verses'])
    self.generate_final_rep()
def generate_final_rep(self):
    """Merge all per-source crawl reports into one aggregated report.

    Every ``crawl_report_<source>.tsv`` under ``<output_path>/reports/`` is
    read, tagged with a ``source`` column taken from the last ``_``-separated
    part of the file stem, and concatenated. The result is stored in
    ``self.aggregated_rep`` and written to ``<output_path>/reports/final_rep.tsv``.

    :raises ValueError: raised by ``pd.concat`` when no report file is found.
    """
    report_columns = ['trans_ID', 'language_iso', 'language_name', 'verses']
    rep_files = FileUtility.recursive_glob(self.output_path + '/reports/', 'crawl_report_*.tsv')
    frames = []
    for report_file in rep_files:
        version = report_file.split('/')[-1].split('.')[0].split('_')[-1]
        # assign() returns a new frame, so no column is added to a slice of the
        # parsed table (avoids SettingWithCopy issues) and no extra copy is needed.
        frames.append(pd.read_table(report_file)[report_columns].assign(source=version))
    aggregated = pd.concat(frames)
    # NOTE: the previous `df_s.set_index('trans_ID')` discarded its result and
    # therefore had no effect; it has been dropped.
    self.aggregated_rep = aggregated
    aggregated.to_csv(self.output_path + '/reports/final_rep.tsv', sep='\t', index=False,
                      columns=['language_iso', 'trans_ID', 'language_name', 'verses', 'source'])
def get_list_of_bible_trans_path_by_lang(self, lang):
    '''
    List the translations available for one language.

    :param lang: 3letters code of the language
    :return: list of [translation_id, path] pairs, e.g. [[goodnews, path1],...];
             the id is the part of the file stem after the last '-' with
             spaces removed.
    '''
    id_and_path = []
    for translation_path in FileUtility.recursive_glob(self.path, lang + '*.txt'):
        base_name = translation_path.split('/')[-1]
        stem = base_name.split('.')[0]
        translation_id = stem.split('-')[-1].replace(' ', '')
        id_and_path.append([translation_id, translation_path])
    return id_and_path
def get_stats_samples(self, k_mer):
    '''
    get the D_R and D_S

    For each sampling size, ``self.M`` randomly chosen sample files are
    resampled ``self.n_resamples`` times and two statistics are collected with
    ``get_kl_rows`` (presumably a pairwise KL-divergence matrix between rows —
    TODO confirm):

    * the mean of ``get_kl_rows`` over the resamples of a file, and
    * the entries relating every resample to the distribution computed with
      sample size -1 (appended as the last row).

    :param k_mer: k-mer length passed to ``self._get_kmer_distribution``
    :return: ``x, y, error, y_tot, error_tot`` — the sampling sizes, then the
             mean/std of the first statistic and the mean/std of the second
             statistic, one entry per sampling size.
    '''
    x = []
    y = []
    y_tot = []
    error = []
    error_tot = []

    # To find the files
    if isinstance(self.input_dir, str):
        sample_files = FileUtility.recursive_glob(self.input_dir, "*" + self.seqtype)
    else:
        sample_files = self.input_dir
    sample_files = random.sample(sample_files, self.M)

    # To iterate over the sampling sizes
    for sample_size in self.sampling_sizes:
        distance_i = []
        tot_dist_i = []
        print(' sampling size ', sample_size, ' is started ...')
        # To iterate over random files
        for sample_file in sample_files:
            comp_dist = self._get_kmer_distribution(sample_file, k_mer, -1, 1)
            resamples_kmers = np.array(
                self._get_kmer_distribution(sample_file, k_mer, sample_size, self.n_resamples))
            distance_i.append(np.mean(get_kl_rows(resamples_kmers)))
            # The complete distribution is stacked as the last row, so its
            # index equals the number of resample rows. This used to be the
            # hard-coded [0:10, 10], which is only right for 10 resamples.
            n_rows = resamples_kmers.shape[0]
            kl_with_complete = get_kl_rows(np.vstack((resamples_kmers, comp_dist[0])))
            tot_dist_i.extend(kl_with_complete[0:n_rows, n_rows])
        print(' sampling size ', sample_size, ' is completed.')

        x.append(sample_size)
        y.append(np.mean(distance_i))
        error.append(np.std(distance_i))
        y_tot.append(np.mean(tot_dist_i))
        error_tot.append(np.std(tot_dist_i))
    return x, y, error, y_tot, error_tot
def create_excel_file(input_path, output_path):
    """Summarise pickled classification results in an Excel workbook.

    Every ``*.pickle`` file under ``input_path`` is loaded with
    ``FileUtility.load_obj``. Precision, recall, F1, macro-F1 and accuracy
    (rounded to 2 decimals) are computed for the held-out test predictions and
    for the cross-validation predictions, and written to the sheets ``Test``
    and ``Cross-validation`` of ``output_path``, sorted by macro-F1
    (descending), then feature and classifier. Nothing is written when no
    pickle file is found.

    The feature name, CV scheme and classifier are parsed from the file name,
    which is expected to look like ``<feature>_CV_<scheme>_<classifier>.pickle``.

    :param input_path: directory searched recursively for ``*.pickle`` files
    :param output_path: path of the Excel file to create
    """
    files_cv = FileUtility.recursive_glob(input_path, '*.pickle')
    if len(files_cv) == 0:
        return
    files_cv.sort()

    metric_keys = ['classifier', 'feature', 'CV', 'Precision', 'Recall', 'F1', 'macroF1', 'accuracy']
    table_test = {key: [] for key in metric_keys}
    table_cv = {key: [] for key in metric_keys}

    # Kept from the original implementation: silences warnings process-wide.
    import warnings
    warnings.filterwarnings('ignore')

    def _append_scores(table, rep, classifier, cv_scheme, y_true, y_pred):
        # Add one result row (identifiers + rounded scores) to a column table.
        table['feature'].append(rep)
        table['classifier'].append(classifier)
        table['CV'].append(cv_scheme)
        table['Precision'].append(np.round(precision_score(y_true, y_pred), 2))
        table['Recall'].append(np.round(recall_score(y_true, y_pred), 2))
        table['F1'].append(np.round(f1_score(y_true, y_pred), 2))
        table['macroF1'].append(np.round(f1_score(y_true, y_pred, average='macro'), 2))
        table['accuracy'].append(np.round(accuracy_score(y_true, y_pred), 2))

    for file in files_cv:
        # NOTE(security): load_obj presumably unpickles the file; only use on
        # result files you produced yourself — TODO confirm.
        [label_set, conf, label_set, best_score_, best_estimator_, cv_results_, best_params_,
         (cv_predictions_pred, cv_predictions_trues, isolates),
         (Y_test_pred, Y_test)] = FileUtility.load_obj(file)
        rep = file.split('/')[-1].split('_CV_')[0]
        CV_scheme = file.split('_CV_')[1].split('_')[0]
        classifier = file.split('_CV_')[1].split('_')[1].split('.')[0]
        _append_scores(table_test, rep, classifier, CV_scheme, Y_test, Y_test_pred)
        _append_scores(table_cv, rep, classifier, CV_scheme, cv_predictions_trues, cv_predictions_pred)

    column_order = ['feature', 'CV', 'classifier', 'accuracy', 'Precision', 'Recall', 'F1', 'macroF1']
    sort_keys = ['macroF1', 'feature', 'classifier']
    df1 = pd.DataFrame(data=table_test, columns=column_order).sort_values(
        sort_keys, ascending=[False, True, True])
    df2 = pd.DataFrame(data=table_cv, columns=column_order).sort_values(
        sort_keys, ascending=[False, True, True])

    # Context manager instead of writer.save(): save() no longer exists in
    # pandas >= 2.0, and the workbook is closed even if writing fails.
    with pd.ExcelWriter(output_path) as writer:
        df1.to_excel(writer, sheet_name='Test', index=False)
        df2.to_excel(writer, sheet_name='Cross-validation', index=False)
def __init__(self, path):
    '''
    Locate the bible corpus and index its translation files.

    Sets ``self.bible_id_rgx`` (matches lines starting with optional
    whitespace followed by a digit), ``self.path`` and
    ``self.all_bible_files`` (all ``*.txt`` files found under ``path``).

    :param path: directory of bible corpus
    :raises SystemExit: if ``path`` does not exist or is not accessible
    '''
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    self.bible_id_rgx = re.compile(r'^\s*[0-9]')
    if not os.access(path, os.F_OK):
        print(
            "\nError: Permission denied or could not find the directory of bible corpus!"
        )
        # Same effect as the former `exit()`, without relying on the helper
        # that the `site` module injects for interactive use.
        raise SystemExit()
    print('Bible directory has been found successfully!')
    self.path = path
    self.all_bible_files = FileUtility.recursive_glob(self.path, '*.txt')
    print('%d bible translations have been found!' % len(self.all_bible_files))
@staticmethod
def load_precalculated(file_path):
    '''
    Load previously computed results.

    :param file_path: path of the stored object
    :return: whatever ``FileUtility.load_obj`` returns for ``file_path``
    '''
    precalculated = FileUtility.load_obj(file_path)
    return precalculated


if __name__ == '__main__':
    # test-case: bootstrap the body-site samples for k = 3..8
    config_files = FileUtility.recursive_glob(
        '/mounts/data/proj/asgari/github_repos/microbiomephenotype/data_config/bodysites/',
        '*.txt')
    sample_names = []
    for config_file in config_files:
        sample_names.extend(FileUtility.load_list(config_file))
    list_of_files = [sample_name + '.fsa' for sample_name in sample_names]
    fasta_files, mapping = FileUtility.read_fasta_directory(
        '/mounts/data/proj/asgari/dissertation/datasets/deepbio/microbiome/hmb_data/',
        'fsa',
        only_files=list_of_files)
    BS = BootStrapping(fasta_files, 'body', seqtype='fsa', M=10)
    for k in range(3, 9):
        print(k)
        BS.add_kmer_sampling(k)
def get_stats_samples_npe(self, npe_file, npe_size):
    '''
    Same statistics as the k-mer variant, computed on NPE segment counts.

    The NPE merge file is used both to build the segmenter (``NPE``) and,
    skipping its first line, the fixed vocabulary of a count vectorizer. For
    each sampling size, ``self.M`` randomly chosen sample files are resampled
    ``self.n_resamples`` times and summarised with ``get_kl_rows`` (presumably
    a pairwise KL-divergence matrix between rows — TODO confirm).

    :param npe_file: path of the NPE merge-operations file
    :param npe_size: number of merge operations handed to ``NPE`` as ``merge_size``
    :return: ``x, y, error, y_tot, error_tot`` — the sampling sizes, the
             mean/std of the between-resample statistic and the mean/std of
             the resample-vs-complete-sample statistic, per sampling size.
    '''
    x = []
    y = []
    y_tot = []
    error = []
    error_tot = []

    # Keep the handle open for as long as the applier is in use (whether NPE
    # reads it eagerly is not visible here) and close it reliably afterwards;
    # it used to be leaked.
    with open(npe_file, 'r') as f:
        npe_Applier = NPE(f, separator='', merge_size=npe_size)
        npe_vocab = sorted({
            ''.join(entry.split()).replace('</w>', '').lower()
            for entry in FileUtility.load_list(npe_file)[1::]
        })
        npe_vectorizer = TfidfVectorizer(use_idf=False, vocabulary=npe_vocab, analyzer='word',
                                         norm=None, stop_words=[], lowercase=True,
                                         binary=False, tokenizer=str.split)

        # To find the files
        if isinstance(self.input_dir, str):
            sample_files = FileUtility.recursive_glob(self.input_dir, "*" + self.seqtype)
        else:
            sample_files = self.input_dir
        sample_files = random.sample(sample_files, self.M)

        # To iterate over the sampling sizes
        for sample_size in self.sampling_sizes:
            distance_i = []
            tot_dist_i = []
            print(' sampling size ', sample_size, ' is started ...')
            # To iterate over random files
            for sample_file in sample_files:
                comp_dist = self._get_npe_distribution(sample_file, npe_Applier,
                                                       npe_vectorizer, -1, 1)
                resamples_npes = np.array(
                    self._get_npe_distribution(sample_file, npe_Applier, npe_vectorizer,
                                               sample_size, self.n_resamples))
                distance_i.append(np.mean(get_kl_rows(resamples_npes)))
                # The complete distribution is stacked as the last row, so its
                # index equals the number of resample rows. This used to be the
                # hard-coded [0:10, 10], which is only right for 10 resamples.
                n_rows = resamples_npes.shape[0]
                kl_with_complete = get_kl_rows(np.vstack((resamples_npes, comp_dist[0])))
                tot_dist_i.extend(kl_with_complete[0:n_rows, n_rows])
            print(' sampling size ', sample_size, ' is completed.')

            x.append(sample_size)
            y.append(np.mean(distance_i))
            error.append(np.std(distance_i))
            y_tot.append(np.mean(tot_dist_i))
            error_tot.append(np.std(tot_dist_i))
    return x, y, error, y_tot, error_tot
def read_data(self):
    """Parse the project XML and build the intermediate representations.

    Reads ``self.genml_path`` and, in this order:

    1. takes the output directory and project name from the first
       ``<project>`` element (optionally deleting an existing output
       directory after an interactive confirmation when ``self.override``
       is set);
    2. creates feature tables through ``IntermediateRepCreate`` for every
       ``<tables>`` directory (all ``*.uniq.mat`` files), every single
       ``<table>`` file and every ``<sequence>`` element (k-mer tables);
    3. copies the phenotype and phylogenetic-tree files into
       ``<output>/metadata/``;
    4. saves the collected log messages to ``<output>/logfile``.

    Sets ``self.xmldoc``, ``self.project``, ``self.output``,
    ``self.project_name``, ``self.representation_path`` and
    ``self.metadata_path``.

    NOTE(review): the nesting below was reconstructed from whitespace-collapsed
    source; in particular confirm whether ``tree2mat_group`` belongs inside the
    "tree file missing or override" branch.
    """
    self.xmldoc = minidom.parse(self.genml_path)
    # parse project part
    self.project = self.xmldoc.getElementsByTagName('project')
    self.output = self.project[0].attributes['output'].value
    self.project_name = self.project[0].attributes['name'].value
    # With override set, ask before wiping a previous run's output.
    if self.override and os.path.exists(self.output):
        var = input("Delete existing files at the output path? (y/n)")
        if var == 'y':
            shutil.rmtree(self.output)
    if not os.path.exists(self.output):
        os.makedirs(self.output)
    log_file = self.output + '/' + 'logfile'
    log_info = ['Project ' + self.project_name]
    self.representation_path = self.output + '/intermediate_rep/'
    IC = IntermediateRepCreate(self.representation_path)
    # load tables
    tabless = self.xmldoc.getElementsByTagName('tables')
    for tables in tabless:
        path = tables.attributes['path'].value
        normalization = tables.attributes['normalization'].value
        prefix = tables.firstChild.nodeValue.strip() + '_'
        # Only the appended '_' is left, i.e. the element text was empty: no prefix.
        if len(prefix) == 1:
            prefix = ''
        for file in FileUtility.recursive_glob(path, '*.uniq.mat'):
            log=IC.create_table(file, prefix + file.split('/')[-1], normalization, self.override)
            log_info.append(log)
    tables = self.xmldoc.getElementsByTagName('table')
    for table in tables:
        path = table.attributes['path'].value
        normalization = table.attributes['normalization'].value
        prefix = table.firstChild.nodeValue.strip()
        # The conditional covers the whole second argument: the file name is
        # used when the element text is empty, otherwise the element text itself.
        log=IC.create_table(path, prefix + path.split('/')[-1] if prefix=='' else prefix, normalization, self.override)
        log_info.append(log)
    # load sequences
    sequences = self.xmldoc.getElementsByTagName('sequence')
    for sequence in sequences:
        path = sequence.attributes['path'].value
        kmer = int(sequence.attributes['kmer'].value)
        # k-mer table creation is limited to at most 4 cores.
        log=IC.create_kmer_table(path,kmer,cores=min(self.cores,4),override=self.override)
        log_info.append(log)
    ## Adding metadata
    self.metadata_path = self.output + '/metadata/'
    if not os.path.exists(self.metadata_path):
        os.makedirs(self.metadata_path)
    # phenotype
    phenotype = self.xmldoc.getElementsByTagName('phenotype')
    if not os.path.exists(self.metadata_path + 'phenotypes.txt') or self.override:
        FileUtility.save_list(self.metadata_path + 'phenotypes.txt', FileUtility.load_list(phenotype[0].attributes['path'].value))
    # tree
    phylogentictree = self.xmldoc.getElementsByTagName('phylogentictree')
    if not os.path.exists(self.metadata_path + 'phylogentictree.txt') or self.override:
        FileUtility.save_list(self.metadata_path + 'phylogentictree.txt', FileUtility.load_list(phylogentictree[0].attributes['path'].value))
        tree2mat_group(self.metadata_path + 'phylogentictree.txt',n_group=20)
    FileUtility.save_list(log_file, log_info)
def predict_block(self, ultimate=False):
    '''
    Run every ``<predict>`` block of the project XML.

    For each predict block and each phenotype of ``GenotypePhenotypeAccess``:
    cross-validation fold files are prepared (tree-based when the ``<eval>``
    text is ``tree``, random otherwise), all combinations of exactly three
    feature tables found in ``self.representation_path`` are enumerated, and
    the classifiers named under ``<model>`` (svm / rf / lr) are tuned and
    evaluated unless their result pickle already exists and ``self.override``
    is not set. Finally the top markers are generated into
    ``<output>/ultimate_outputs/``.

    NOTE(review): the nesting below was reconstructed from whitespace-collapsed
    source; confirm in particular which statements are guarded by
    ``if not ultimate`` and at which loop level ``generate_top_features`` runs.

    :param ultimate: when True, fold creation and loading of the prediction
                     matrices are skipped
    :return: None
    '''
    import warnings
    from sklearn.exceptions import DataConversionWarning, FitFailedWarning, UndefinedMetricWarning, ConvergenceWarning
    # Silence the warnings that are expected during large parameter sweeps.
    warnings.filterwarnings(action='ignore', category=DataConversionWarning)
    warnings.filterwarnings(action='ignore', category=FitFailedWarning)
    warnings.filterwarnings(action='ignore', category=DeprecationWarning)
    warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
    warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
    predict_blocks = self.xmldoc.getElementsByTagName('predict')
    predict_path=self.output+'/classifications/'
    # iterate over predict block
    for predict in predict_blocks:
        # Sub prediction
        FileUtility.ensure_dir(predict_path)
        setting_name=predict.attributes['name'].value
        subdir=predict_path+setting_name+'/'
        FileUtility.ensure_dir(subdir)
        ## label mapping
        labels=predict.getElementsByTagName('labels')[0].getElementsByTagName('label')
        mapping=dict()
        for label in labels:
            val=label.attributes['value'].value
            phenotype=label.firstChild.nodeValue.strip()
            mapping[phenotype]=int(val)
        ## optimizing for ..
        optimization=predict.getElementsByTagName('optimize')[0].firstChild.nodeValue.strip()
        ## number of folds
        self.cvbasis=predict.getElementsByTagName('eval')[0].firstChild.nodeValue.strip()
        folds=int(predict.getElementsByTagName('eval')[0].attributes['folds'].value)
        test_ratio=float(predict.getElementsByTagName('eval')[0].attributes['test'].value)
        # An unknown score is only reported; processing continues regardless.
        if optimization not in ['accuracy','scores_r_1','scores_f1_1','scores_f1_0','f1_macro','f1_micro']:
            print ('Error in choosing optimization score')
        ## Genotype tables
        GPA=GenotypePhenotypeAccess(self.output)
        ## iterate over phenotypes if there exist more than one
        for phenotype in GPA.phenotypes:
            print ('working on phenotype ',phenotype)
            FileUtility.ensure_dir(subdir+phenotype+'/')
            ## create cross-validation
            FileUtility.ensure_dir(subdir+phenotype+'/cv/')
            cv_file=''
            cv_test_file=''
            if not ultimate:
                if self.cvbasis=='tree':
                    FileUtility.ensure_dir(subdir+phenotype+'/cv/tree/')
                    if self.override or not FileUtility.exists(subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_folds.txt'])):
                        GPA.create_treefold(subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_folds.txt']), self.metadata_path + 'phylogentictree.txt', folds, test_ratio, phenotype, mapping)
                    cv_file=subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_folds.txt'])
                    cv_test_file=subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_test.txt'])
                else:
                    FileUtility.ensure_dir(subdir+phenotype+'/cv/rand/')
                    if self.override or not FileUtility.exists(subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_folds.txt'])):
                        GPA.create_randfold(subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_folds.txt']), folds, test_ratio, phenotype, mapping)
                    cv_file=subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_folds.txt'])
                    cv_test_file=subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_test.txt'])
            features=[x.split('/')[-1].replace('_feature_vect.npz','') for x in FileUtility.recursive_glob(self.representation_path, '*.npz')]
            feature_combinations=[]
            ## TODO: ask as an input
            max_length_feature_comb = 3#len(features)
            # With the bound fixed at 3, range(3, 4) yields only the 3-element combinations.
            for x in [[list(x) for x in list(itertools.combinations(features,r))] for r in range(3,max_length_feature_comb+1)]:
                feature_combinations+=x
            ## iterate over feature sets
            for feature_setting in feature_combinations:
                # Element children of <model> name the classifiers; text nodes are skipped.
                classifiers=[]
                for model in predict.getElementsByTagName('model'):
                    for x in model.childNodes:
                        if not x.nodeName=="#text":
                            classifiers.append(x.nodeName)
                if not ultimate:
                    X, Y, feature_names, final_strains = GPA.get_xy_prediction_mats(feature_setting, phenotype, mapping)
                    # Drop the last dot-separated part of each feature name before building the result file name.
                    feature_setting =[''.join(feature.split('.')[0:-1]) if len(feature.split('.'))>1 else feature for feature in feature_setting]
                    feature_text='##'.join(feature_setting)
                    ## iterate over classifiers
                    for classifier in tqdm.tqdm(classifiers):
                        basepath_cls=subdir+phenotype+'/'+feature_text+'_CV_'+self.cvbasis
                        # Each model is trained only if its pickle is missing or override is set.
                        if classifier.lower()=='svm' and (not FileUtility.exists(basepath_cls+'_SVM.pickle') or self.override):
                            Model = SVM(X, Y)
                            Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,njobs=self.cores, feature_names=feature_names, params=[{'C': [1000, 500, 200, 100, 50, 20, 10, 5, 2, 1, 0.2, 0.5, 0.01, 0.02, 0.05, 0.001]}])
                        if classifier.lower()=='rf' and (not FileUtility.exists(basepath_cls+'_RF.pickle') or self.override):
                            Model = RFClassifier(X, Y)
                            Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,njobs=self.cores, feature_names=feature_names)
                        if classifier.lower()=='lr' and (not FileUtility.exists(basepath_cls+'_LR.pickle') or self.override):
                            Model = LogRegression(X, Y)
                            Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,njobs=self.cores, feature_names=feature_names)
                        #if classifier.lower()=='dnn':
                        #    Model = DNN(X, Y)
                        #    Model.tune_and_eval(subdir+phenotype+'/'+'_'.join([feature]),njobs=self.cores, kfold=10)
            # generate selected features
            FileUtility.ensure_dir(self.output+'/'+'ultimate_outputs/')
            print ('Select the top markers..')
            generate_top_features(self.output, [x.upper() for x in classifiers], topk=200)
            FileUtility.ensure_dir(subdir+phenotype+'/'+'final_results/')
            #create_excel_file(subdir+phenotype+'/', subdir+phenotype+'/final_results/classification_res.xlsx')
            FileUtility.ensure_dir(self.output+'/'+'ultimate_outputs/')
def checkArgs(args):
    '''
    This function checks the input arguments and returns the errors (if exist)
    otherwise reads the parameters and runs the requested functionality.

    Exactly one of ``--bootstrapping``, ``--genkmer`` or ``--train_predictor``
    is expected on the command line.

    :param args: not used; the options are read from ``sys.argv`` by
                 ``parser.parse_args()`` (kept for backward compatibility).
    :return: a non-empty error message (str) when validation fails, otherwise
             ``False``.
    '''

    def _ensure_output_dir(path):
        # Create the output directory if it is missing; report whether it was created.
        if os.path.exists(path):
            return False
        os.mkdir(path)
        return True

    # keep all errors
    err = ""
    # Using the argument parser in case of -h or wrong usage the correct argument usage
    # will be prompted
    parser = argparse.ArgumentParser()

    # top level ######################################################################################################
    parser.add_argument('--bootstrapping', action='store_true',
                        help='To enable classification and parameter tuning')
    parser.add_argument(
        '--genkmer', action='store_true',
        help='To enable generation of representations for input fasta file or directory of 16S rRNA samples')
    parser.add_argument('--train_predictor', action='store_true',
                        help='To enable classification and parameter tuning')

    # boot strapping #################################################################################################
    parser.add_argument('--indir', action='store', dest='input_dir_bootstrapping', default=False, type=str,
                        help='bootstrapping: directory of 16S rRNA samples',
                        required='--bootstrapping' in sys.argv)

    # generate k-mers ################################################################################################
    parser.add_argument(
        '--inaddr', action='store', dest='genrep_input_addr', default=False, type=str,
        help='genkmer: Generate representations for input fasta file or directory of 16S rRNA samples',
        required='--genkmer' in sys.argv)

    # classification ################################################################################################
    parser.add_argument(
        '--x', action='store', dest='X', type=str, default=False,
        help='train_predictor: The data in the npy format rows are instances and columns are features')
    parser.add_argument(
        '--y', action='store', dest='Y', type=str, default=False,
        help='train_predictor: The labels associated with the rows of classifyX, each line is a associated with a row')
    parser.add_argument(
        '--model', action='store', dest='model', type=str, default=False,
        choices=[False, 'RF', 'SVM', 'DNN'],
        help='train_predictor: choice of classifier from RF, SVM, DNN')
    parser.add_argument(
        '--batchsize', action='store', dest='batch_size', type=int, default=10,
        help='train_predictor-model/DNN: batch size for deep learning')
    parser.add_argument(
        '--gpu_id', action='store', dest='gpu_id', type=str, default='0',
        help='train_predictor-model/DNN: GPU id for deep learning')
    parser.add_argument(
        '--epochs', action='store', dest='epochs', type=int, default=100,
        help='train_predictor-model/DNN: number of epochs for deep learning')
    parser.add_argument(
        '--arch', action='store', dest='dnn_arch', type=str, default='1024,0.2,512',
        help='train_predictor-model/DNN: The comma separated definition of neural network layers connected to eahc other, you do not need to specify the input and output layers, values between 0 and 1 will be considered as dropouts')

    # general to bootstrap and rep ##################################################################################
    parser.add_argument('--filetype', action='store', dest='filetype', type=str, default='fastq',
                        help='fasta fsa fastq etc')

    # bootstrap ################################################################################
    parser.add_argument('--kvals', action='store', dest='kvals', type=str, default='3,4,5,6,7,8',
                        help='Comma separated k-mer values 2,3,4,5,6')
    parser.add_argument('--nvals', action='store', dest='nvals', type=str,
                        default='10,20,50,100,200,500,1000,2000,5000,10000',
                        help='Comma separated sample sizes')

    # rep / classifier ################################################################################
    parser.add_argument('--cores', action='store', dest='cores', default=4, type=int,
                        help='Number of cores to be used')

    # rep ##################################################################################
    parser.add_argument(
        '--KN', action='store', dest='K_N', default=None, type=str,
        help='pair of comma separated Kmer:sub-sample-size ==> 2:100,6:-1 (N=-1 means using all sequences)')
    parser.add_argument('--out', action='store', dest='output_addr', type=str, default='out',
                        help='Out put directory')
    parser.add_argument('--in', action='store', dest='input_addr', type=str, default=None,
                        help='Input fasta file or directory of samples')
    parser.add_argument('--name', action='store', dest='data_name', type=str, default=None,
                        help='name of the dataset')

    parsedArgs = parser.parse_args()

    if parsedArgs.bootstrapping:
        # bootstrapping functionality
        print('Bootstrapping requested..\n')
        if not os.access(parsedArgs.input_dir_bootstrapping, os.F_OK):
            err = err + "\nError: Permission denied or could not find the directory!"
            return err
        _ensure_output_dir(parsedArgs.output_addr)
        if len(FileUtility.recursive_glob(parsedArgs.input_dir_bootstrapping,
                                          '*' + parsedArgs.filetype)) == 0:
            err = err + "\nThe filetype " + parsedArgs.filetype + " could not find the directory!"
            return err
        if not parsedArgs.data_name:
            parsedArgs.data_name = parsedArgs.input_dir_bootstrapping.split('/')[-1]
        try:
            k_values = [int(x) for x in parsedArgs.kvals.split(',')]
            n_values = [int(x) for x in parsedArgs.nvals.split(',')]
        except ValueError:
            err = err + "\n k-mers or sampling sizes are not fed correctly; see the help with -h!"
            return err
        MicroPheno.bootstrapping(parsedArgs.input_dir_bootstrapping, parsedArgs.output_addr,
                                 parsedArgs.data_name, filetype=parsedArgs.filetype,
                                 k_values=k_values, sampling_sizes=n_values)
        return False

    if parsedArgs.genkmer:
        # Representation creation functionality
        if not os.access(parsedArgs.genrep_input_addr, os.F_OK):
            err = err + "\nError: Permission denied or could not find the directory!"
            return err
        elif os.path.isdir(parsedArgs.genrep_input_addr):
            print('Representation creation requested for directory ' + parsedArgs.genrep_input_addr + '\n')
            _ensure_output_dir(parsedArgs.output_addr)
            if len(FileUtility.recursive_glob(parsedArgs.genrep_input_addr,
                                              '*' + parsedArgs.filetype)) == 0:
                err = err + "\nThe filetype " + parsedArgs.filetype + " could not find the directory!"
                return err
            if not parsedArgs.data_name:
                parsedArgs.data_name = parsedArgs.genrep_input_addr.split('/')[-1]
            try:
                sampling_dict = dict()
                for x in parsedArgs.K_N.split(','):
                    k, n = x.split(':')
                    sampling_dict.setdefault(int(k), []).append(int(n))
            except (AttributeError, ValueError):
                # AttributeError: --KN was not given (None has no split);
                # ValueError: an entry is not of the form "<int>:<int>".
                err = err + "\nWrong format for KN (k-mer sample sizes)!"
                return err
            MicroPheno.representation_creation_dir(parsedArgs.genrep_input_addr, parsedArgs.output_addr,
                                                   parsedArgs.data_name, parsedArgs.cores,
                                                   filetype=parsedArgs.filetype,
                                                   sampling_dict=sampling_dict)
        else:
            print('Representation creation requested for file ' + parsedArgs.genrep_input_addr + '\n')

    if parsedArgs.train_predictor:
        print('Classification and parameter tuning requested..\n')
        if not parsedArgs.model:
            err = err + "\nNo classification model is specified"
        # --x / --y default to False; os.access(False, ...) raises TypeError,
        # so a missing option is reported like a missing file instead.
        if not parsedArgs.X or not os.access(parsedArgs.X, os.F_OK):
            err = err + "\nError: Permission denied or could not find the X!"
            return err
        if not parsedArgs.Y or not os.access(parsedArgs.Y, os.F_OK):
            err = err + "\nError: Permission denied or could not find the Y!"
            return err
        if _ensure_output_dir(parsedArgs.output_addr):
            print(parsedArgs.output_addr, ' directory created')
        if not parsedArgs.data_name:
            parsedArgs.data_name = parsedArgs.X.split('/')[-1].split('.')[0]
        if parsedArgs.model == 'DNN':
            # Deep learning: values > 1 are layer sizes, values <= 1 are dropout rates.
            arch = [
                int(layer) if float(layer) > 1 else float(layer)
                for layer in parsedArgs.dnn_arch.split(',')
            ]
            MicroPheno.DNN_classifier(parsedArgs.X, parsedArgs.Y, arch, parsedArgs.output_addr,
                                      parsedArgs.data_name, parsedArgs.gpu_id, parsedArgs.epochs,
                                      parsedArgs.batch_size)
        elif parsedArgs.model in ['SVM', 'RF']:
            # SVM and Random Forest
            MicroPheno.classical_classifier(parsedArgs.X, parsedArgs.Y, parsedArgs.model,
                                            parsedArgs.output_addr, parsedArgs.data_name,
                                            parsedArgs.cores)
        else:
            return "\nNot able to recognize the model!"
    else:
        # Historical behaviour, kept for callers: reaching this point without
        # --train_predictor is only logged and is not reported as an error.
        print('others')
    return False
# This file displays results from the picle files. # provide the location of result files as input to path2res # works fine for .pickle files # can read all pickle in the path so make sure that you only supply the files whi are the resulting pickle files. import sys sys.path.append('./') from utility.file_utility import FileUtility import numpy as np path2res = sys.argv[1] files = FileUtility.recursive_glob(path2res, '*.pickle') import warnings warnings.filterwarnings("ignore") # In[8]: def get_cv_res(filename): try: [ label_set, conf, best_score_, best_estimator_, cv_results_, best_params_, pred ] = FileUtility.load_obj(filename) res = dict() print(filename.split('/')[-1] + " ", conf) #print (cv_results_.keys()) idx = np.argmax(cv_results_['mean_test_f1_macro']) res['f1_macro'] = np.round(cv_results_['mean_test_f1_macro'][idx], 2) res['f1_macro*'] = str(