def RA_healthy():
    '''
    Run DiTaxa biomarker extraction on the RA 16S dataset (untreated vs. treated)
    '''
    Pipeline = DiTaxaWorkflow(
        '/mounts/data/proj/asgari/dissertation/datasets/deepbio/microbiome/RA/',
        'fastq',
        '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/RAoutput/',
        'RA', 50000, 5000, -1, num_p=20)
    # Pipeline.train_npe()
    # Pipeline.representation_npe()
    labels = FileUtility.load_list(
        '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/ra/rep/labels.txt')
    labels = {
        x.split('/')[-1]: labels[idx]
        for idx, x in enumerate(
            FileUtility.load_list(
                '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/ra/rep/ra_selfposnpe_10000_npe_5000_meta'))
    }
    Pipeline.biomarker_extraction(labels, {'untreated_RA': 1, 'treated_RA': 0},
                                  'untreated_vs_treated')
def __init__(self, pos_fasta, neg_fasta, output_path, segmentation_schemes=10, topN=100):
    '''
    :param pos_fasta: positive sequences, given as a list or as a path to a .txt/.fasta file
    :param neg_fasta: negative sequences, given as a list or as a path to a .txt/.fasta file
    :param output_path: output directory for the extracted motifs
    :param segmentation_schemes: number of segmentation schemes to sample
    :param topN: number of top motifs to extract
    '''
    # positive class: accept an in-memory list or load from .txt/.fasta
    if not isinstance(pos_fasta, str):
        self.pos = pos_fasta
    elif pos_fasta.split('.')[-1] == 'txt':
        self.pos = FileUtility.load_list(pos_fasta)
    elif pos_fasta.split('.')[-1] == 'fasta':
        self.pos = FileUtility.read_fasta_sequences(pos_fasta)
    # negative class: same conventions
    if not isinstance(neg_fasta, str):
        self.neg = neg_fasta
    elif neg_fasta.split('.')[-1] == 'txt':
        self.neg = FileUtility.load_list(neg_fasta)
    elif neg_fasta.split('.')[-1] == 'fasta':
        self.neg = FileUtility.read_fasta_sequences(neg_fasta)
    self.seqs = [seq.lower() for seq in self.pos + self.neg]
    self.labels = [1] * len(self.pos) + [0] * len(self.neg)
    self.segmentation_schemes = segmentation_schemes
    self.load_alpha_distribution()
    self.prepare_segmentations()
    print(output_path)
    FileUtility.ensure_dir(output_path)
    self.output_path = output_path
    self.motif_extraction(topN)
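# Usage sketch (hypothetical class and file names; this __init__ is assumed to
# belong to the repo's motif-extraction class, here called MotifExtraction):
#
#   motifs = MotifExtraction('data/positives.fasta', 'data/negatives.fasta',
#                            'results/motifs/', segmentation_schemes=10, topN=100)
#
# Construction runs the whole pipeline: sequences are loaded (from a list, a
# .txt file, or a .fasta file), lowercased, labeled 1/0, segmented under the
# sampled schemes, and the topN motifs are written under output_path.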
def __init__(self, fasta_file, matrix_path, feature_file_path, phenotypes,
             phenotype_mapping, selected_samples, p_value_threshold=0.01,
             remove_redundants=False, num_p=4):
    '''
    :param fasta_file: fasta file of the marker sequences
    :param matrix_path: sparse representation matrix (.npz)
    :param feature_file_path: feature names, one per line
    :param phenotypes: phenotypes of the samples
    :param phenotype_mapping: mapping from phenotype values to class labels
    :param selected_samples: row indices of the samples to keep
    :param p_value_threshold: significance threshold for the marker alignment
    :param remove_redundants: whether to remove redundant markers
    :param num_p: number of parallel processes
    '''
    self.num_p = num_p
    self.seq_IDS = FileUtility.read_fasta_sequences_ids(fasta_file)
    self.remove_redundants = remove_redundants
    # EzTaxon index-to-taxonomy lookup
    self.ez_taxa_dict = {
        x.split()[0]: x.split()[1].split(';')
        for x in FileUtility.load_list('../db/ez_idx_taxonomy.txt')
    }
    # load the representation and restrict it to the selected samples
    self.mat = FileUtility.load_sparse_csr(matrix_path)
    self.mat = self.mat.toarray()
    self.mat = self.mat[selected_samples, :]
    self.mat = csr_matrix(self.mat)
    self.features = FileUtility.load_list(feature_file_path)
    self.align_markers_parallel(p_value_threshold)
    self.redundant_columns_indentification()
    self.phenotype_mapping = phenotype_mapping
    self.phenotypes = phenotypes
def __init__(self, X_file, Y_file, features_file, path, selected_samples):
    '''
    :param X_file: sparse feature matrix (.npz)
    :param Y_file: labels, one integer per line
    :param features_file: feature names, one per line
    :param path: output path
    :param selected_samples: row indices of the samples to keep
    '''
    self.X = FileUtility.load_sparse_csr(X_file)
    self.X = self.X.toarray()
    self.X = self.X[selected_samples, :]
    self.X = csr_matrix(self.X)
    self.Y = [int(x) for x in FileUtility.load_list(Y_file)]
    self.features = FileUtility.load_list(features_file)
    self.path = path
def __init__(self, X, Y, isolate_list, fold_file, test_file):
    '''
    :param X: feature matrix
    :param Y: labels
    :param isolate_list: isolate name for each row of X
    :param fold_file: predefined folds, one whitespace-separated isolate list per line
    :param test_file: single line listing the held-out test isolates
    '''
    CrossValidator.__init__(self, X, Y)
    map_to_idx = {isolate: idx for idx, isolate in enumerate(isolate_list)}
    test_idx = [
        map_to_idx[test]
        for test in FileUtility.load_list(test_file)[0].split()
        if test in map_to_idx
    ]
    # all isolates listed in the fold file
    train_idx = [
        map_to_idx[train]
        for train in list(
            itertools.chain(
                *[l.split() for l in FileUtility.load_list(fold_file)]))
    ]
    self.X_test = X[test_idx, :]
    self.Y_test = [Y[idy] for idy in test_idx]
    # the training set is taken as the complement of the test set
    train_idx = list(set(map_to_idx.values()) - set(test_idx))
    X = X[train_idx, :]
    Y = [Y[idy] for idy in train_idx]
    isolate_list = [isolate_list[idx] for idx in train_idx]
    self.train_isolate_list = isolate_list
    # re-index the remaining isolates and map the predefined folds onto them
    map_to_idx = {isolate: idx for idx, isolate in enumerate(isolate_list)}
    splits = [[
        map_to_idx[item] for item in fold_list.split() if item in map_to_idx
    ] for fold_list in FileUtility.load_list(fold_file)]
    new_splits = []
    for i in range(len(splits)):
        train = [j for fold in splits[:i] + splits[i + 1:] for j in fold]
        test = splits[i]
        new_splits.append([train, test])
    self.cv = new_splits
    self.X = X
    self.Y = Y
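# Usage sketch (hypothetical class and file names; fold_file is assumed to hold
# one whitespace-separated list of isolate names per line, test_file a single
# such line):
#
#   cv = PredefinedFoldCV(X, Y, isolate_list,
#                         'folds/phylo_folds.txt', 'folds/phylo_test.txt')
#   # cv.cv is a list of [train_indices, test_indices] pairs over the training
#   # isolates only, e.g. usable as the cv argument of sklearn's GridSearchCV,
#   # while cv.X_test / cv.Y_test hold the held-out test split.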
def load_data(self, dir, prefix_list):
    '''
    Load the feature matrix, feature names, and isolate list for each prefix
    :param dir: directory containing the representation files
    :param prefix_list: representation prefixes to load
    :return:
    '''
    for save_pref in prefix_list:
        print('@@@' + '_'.join([dir + save_pref, 'feature', 'vect.npz']))
        self.X[save_pref] = FileUtility.load_sparse_csr(
            '_'.join([dir + save_pref, 'feature', 'vect.npz']))
        self.feature_names[save_pref] = FileUtility.load_list(
            '_'.join([dir + save_pref, 'feature', 'list.txt']))
        self.isolates[save_pref] = FileUtility.load_list(
            '_'.join([dir + save_pref, 'isolates', 'list.txt']))
def load_data(self, prefix_list=None):
    '''
    Load the feature matrix, feature names, and strain list for each prefix
    :param prefix_list: representation prefixes to load
    :return:
    '''
    for save_pref in prefix_list:
        print('@@@' + '_'.join(
            [self.representation_path + save_pref, 'feature', 'vect.npz']))
        self.X[save_pref] = FileUtility.load_sparse_csr('_'.join(
            [self.representation_path + save_pref, 'feature', 'vect.npz']))
        self.feature_names[save_pref] = FileUtility.load_list('_'.join(
            [self.representation_path + save_pref, 'feature', 'list.txt']))
        self.strains[save_pref] = FileUtility.load_list('_'.join(
            [self.representation_path + save_pref, 'strains', 'list.txt']))
def __init__(self, file_directory, file_extenstion, npe_file, onlyfiles=[],
             sampling_number=3000, num_p=20, vocab_size=-1):
    '''
    :param file_directory: directory of the fasta/fastq samples
    :param file_extenstion: file extension to look for
    :param npe_file: NPE segmentation model ('.model' for a seqpiece model,
                     otherwise a plain BPE merge list)
    :param onlyfiles: optional filter restricting which files are read
    :param sampling_number: number of sequences sampled per file
    :param num_p: number of parallel processes
    :param vocab_size: vocabulary size (-1 keeps the full vocabulary)
    '''
    self.file_directory = file_directory
    self.file_extenstion = file_extenstion
    self.fasta_files, self.indexing = FileUtility.read_fasta_directory(
        self.file_directory, self.file_extenstion, only_files=onlyfiles)
    print(str(len(self.fasta_files)), 'fasta files found in',
          self.file_directory)
    self.num_p = num_p
    self.sampling_number = sampling_number
    self.npe_file = npe_file
    if '.model' in npe_file:
        self.model_type = 'seqpiece'
        self.npe_vocab = [
            x.split()[0] for x in FileUtility.load_list(
                npe_file.replace('.model', '.vocab'))
        ]
    else:
        self.model_type = 'normal_bpe'
        self.npe_vocab = [
            ''.join(x.split()).replace('</w>', '').lower()
            for x in FileUtility.load_list(npe_file)[1::]
        ]
    self.npe_vocab = list(set(self.npe_vocab))
    self.vocab_size = vocab_size
    self.npe_vocab.sort()
    # raw segment counts: no idf weighting, no normalization
    self.npe_vectorizer = TfidfVectorizer(use_idf=False,
                                          vocabulary=self.npe_vocab,
                                          analyzer='word',
                                          norm=None,
                                          stop_words=[],
                                          lowercase=True,
                                          binary=False,
                                          tokenizer=str.split)
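# Usage sketch (hypothetical class and paths; this __init__ is assumed to
# belong to the repo's NPE representation builder, here called
# NPERepresentation):
#
#   rep = NPERepresentation('samples/', 'fastq', 'segmentation/16s.model',
#                           sampling_number=3000, num_p=20)
#
# A '.model' file selects the 'seqpiece' branch and reads the vocabulary from
# the matching '.vocab' file; any other file is treated as a plain BPE merge
# list. npe_vectorizer then produces raw NPE-segment counts (use_idf=False,
# norm=None) over that fixed vocabulary.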
def make_labels(self, mapping=None):
    '''
    Load the mapping from strains to phenotype labels
    '''
    label_file_address = self.metadata_path + 'phenotypes.txt'
    rows = FileUtility.load_list(label_file_address)
    self.strain2labelvector = {
        str(entry.split()[0]): [str(x) for x in entry.split('\t')[1::]]
        for entry in rows[1::]
    }
    self.labeled_strains = list(self.strain2labelvector)
    self.labeled_strains.sort()
    # the header row lists the phenotype names
    self.phenotypes = [x for x in rows[0].rstrip().split()[1::]]
    # init
    for phenotype in self.phenotypes:
        self.phenotype2labeled_strains_mapping[phenotype] = []
    # only consider non-empty values
    for strain, phenotype_vec in self.strain2labelvector.items():
        for idx, val in enumerate(phenotype_vec):
            if mapping:
                if val in mapping:
                    self.phenotype2labeled_strains_mapping[
                        self.phenotypes[idx]].append((strain, mapping[val]))
            else:
                self.phenotype2labeled_strains_mapping[
                    self.phenotypes[idx]].append((strain, val))
    # generate dict of labels for each class
    for phenotype in self.phenotypes:
        self.phenotype2labeled_strains_mapping[phenotype] = dict(
            self.phenotype2labeled_strains_mapping[phenotype])
def org_classification():
    '''
    10-fold cross-validation of the multiclass 16S MLP on the 6-mer 'org' dataset
    '''
    X = FileUtility.load_sparse_csr(
        '../../datasets/processed_data/org/K/6-mer_org_restrictedkmer.npz').toarray()
    Y = FileUtility.load_list(
        '../../datasets/processed_data/org/K/org_label_restrictedkmer.txt')
    DNN = DNNMutliclass16S(X, Y, model_arch=[1024, 0.2, 256, 0.1, 256, 0.1, 128, 0.1, 64])
    DNN.cross_validation('../../datasets/results/org/classifier/nn',
                         gpu_dev='2',
                         n_fold=10,
                         epochs=30,
                         batch_size=100,
                         model_strct='mlp')
def create_report_png(self):
    report = {
        'language_iso': [],
        'trans_ID': [],
        'language_name': [],
        'verses': []
    }
    self.df_png = pd.DataFrame(report)
    png_files = FileUtility.recursive_glob(self.output_path + '/', '*.png.txt')
    for png_file in png_files:
        # strip the trailing '.png.txt' and split the base name into iso code and translation ID
        iso, code = png_file.split('/')[-1].split('.')[0:-1][0:-1][-1].split('_')
        length = len(FileUtility.load_list(png_file))
        lang_name = self.lang_dict[iso] if iso in self.lang_dict else 'ISO: ' + iso
        self.df_png = self.df_png.append(
            {
                'language_iso': iso,
                'trans_ID': code,
                'language_name': lang_name,
                'verses': length
            },
            ignore_index=True)
    self.df_png.to_csv(
        self.output_path + '/reports/crawl_report_png.tsv',
        sep='\t',
        index=False,
        columns=['language_iso', 'trans_ID', 'language_name', 'verses'])
    self.generate_final_rep()
def create_report_cloud(self):
    report = {
        'language_iso': [],
        'trans_ID': [],
        'language_name': [],
        'Description': [],
        'verses': []
    }
    for trID in self.df_cloud.trans_ID:
        iso = self.id2iso_dict[trID]
        cloud_file = self.output_path + '/' + iso + '_' + trID + '.cloud.txt'
        if not FileUtility.exists(cloud_file):
            length = 0
        else:
            length = len(FileUtility.load_list(cloud_file))
        report['language_iso'].append(iso)
        report['trans_ID'].append(trID)
        report['language_name'].append(self.id2lang_dict[trID])
        report['Description'].append(self.id2version[trID])
        report['verses'].append(length)
    report = pd.DataFrame(report)
    report.to_csv(self.output_path + '/reports/crawl_report_cloud.tsv',
                  sep='\t',
                  index=False,
                  columns=[
                      'language_iso', 'trans_ID', 'language_name',
                      'Description', 'verses'
                  ])
    self.generate_final_rep()
def train_batch_generator_408(batch_size=64):
    '''
    Generator over the training set yielding (X, Y, W) batches, where W masks
    positions beyond each sequence's true length
    :param batch_size:
    :return:
    '''
    start_idx = 0
    train_lengths = [
        int(j) for j in FileUtility.load_list('datasets/train_length.txt')
    ]
    X_train = np.load('datasets/X_train_408.npy')
    Y_train = np.array(np.load('datasets/train_mat_Y.npy'))
    while True:
        # wrap around at the end of the epoch
        if not start_idx < len(train_lengths):
            start_idx = 0
        end_idx = min(start_idx + batch_size, len(train_lengths))
        # crop the batch to the length of its last sequence
        X = X_train[start_idx:end_idx, 0:train_lengths[end_idx - 1]]
        Y = Y_train[start_idx:end_idx, 0:train_lengths[end_idx - 1], :]
        W = []
        for idx in range(start_idx, end_idx):
            W.append([
                1 if l < train_lengths[idx] else 0
                for l in range(0, train_lengths[end_idx - 1])
            ])
        start_idx += batch_size
        yield X, Y, np.array(W)
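# Usage sketch (assumes a Keras sequence-labeling model; since the generator
# yields (X, Y, W) with a per-position 0/1 mask W, the model would be compiled
# with sample_weight_mode='temporal'):
#
#   model.compile(optimizer='adam', loss='categorical_crossentropy',
#                 sample_weight_mode='temporal')
#   n_train = len(FileUtility.load_list('datasets/train_length.txt'))
#   model.fit_generator(train_batch_generator_408(batch_size=64),
#                       steps_per_epoch=int(np.ceil(n_train / 64)),
#                       epochs=10)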
def create_report_biblecom(self):
    self.df_biblecom['verses'] = 0
    biblecom_files = FileUtility.recursive_glob(self.output_path + '/',
                                                '*.biblecom.txt')
    for bib_file in biblecom_files:
        # strip the trailing '.biblecom.txt' and split the base name
        file_parts = bib_file.split('/')[-1].split('.')[0:-1][0:-1][-1].split('_')
        num_file_parts = len(file_parts)
        if num_file_parts == 2:
            iso, code = file_parts
        elif num_file_parts == 3:
            # handle iso identifiers that themselves contain an underscore
            iso = "_".join(file_parts[:2])
            code = file_parts[2]
        else:
            continue
        length = len(FileUtility.load_list(bib_file))
        # single .loc assignment so the write is not lost to chained indexing
        mask = (self.df_biblecom['language_iso'] == iso) & (
            self.df_biblecom['trans_ID'] == int(code))
        self.df_biblecom.loc[mask, 'verses'] = length
    self.df_biblecom.to_csv(
        self.output_path + '/reports/crawl_report_biblecom.tsv',
        sep='\t',
        index=False,
        columns=['language_iso', 'trans_ID', 'language_name', 'verses'])
    self.generate_final_rep()
def validation_batch_generator_408(batch_size=100):
    '''
    Generator over the test set yielding (X, Y, W) batches, where W masks
    positions beyond each sequence's true length
    :param batch_size:
    :return:
    '''
    test_lengths = [
        int(i) for i in FileUtility.load_list('datasets/test_length.txt')
    ]
    X_test = np.load('datasets/X_test_408.npy')
    Y_test = np.array(np.load('datasets/test_mat_Y.npy'))
    start_idx = 0
    while True:
        # wrap around at the end of the epoch
        if not start_idx < len(test_lengths):
            start_idx = 0
        end_idx = min(start_idx + batch_size, len(test_lengths))
        # crop the batch to the length of its last sequence
        X = X_test[start_idx:end_idx, 0:test_lengths[end_idx - 1]]
        Y = Y_test[start_idx:end_idx, 0:test_lengths[end_idx - 1], :]
        W = []
        for idx in range(start_idx, end_idx):
            W.append([
                1 if l < test_lengths[idx] else 0
                for l in range(0, test_lengths[end_idx - 1])
            ])
        start_idx += batch_size
        yield X, Y, np.array(W)
def create_treefold(self, path, tree_addr, cv, test_ratio, phenotype, mapping=None):
    ## find a mapping from strains to the phenotypes
    if mapping:
        mapping_isolate_label = dict(self.get_new_labeling(mapping)[phenotype])
    else:
        mapping_isolate_label = self.phenotype2labeled_strains_mapping[phenotype]
    # get common strains
    list_of_list_of_strains = list(self.strains.values())
    list_of_list_of_strains.append(list(mapping_isolate_label.keys()))
    final_strains = GenotypePhenotypeAccess.get_common_strains(list_of_list_of_strains)
    final_strains.sort()
    # prepare the test split, grouped by phylogenetic cluster
    Y = [mapping_isolate_label[strain] for strain in final_strains]
    isolate_to_group = dict([
        tuple(l.split('\t')) for l in FileUtility.load_list(
            tree_addr.replace(tree_addr.split('/')[-1],
                              'phylogenetic_nodes_and_clusters.txt'))
    ])
    groups = [int(isolate_to_group[iso]) for iso in final_strains]
    group_kfold = GroupKFold(n_splits=round(1 / test_ratio))
    train_index, test_index = list(group_kfold.split(final_strains, Y, groups))[0]
    X_test = [final_strains[x] for x in test_index]
    FileUtility.save_list(path.replace('_folds.txt', '_test.txt'),
                          ['\t'.join(X_test)])
    # build the cross-validation folds over the remaining strains
    final_strains = [final_strains[ix] for ix in train_index]
    group_kfold = GroupKFold(n_splits=cv)
    folds = []
    for _, test_index in group_kfold.split(
            train_index, [Y[idx] for idx in train_index],
            [groups[idx] for idx in train_index]):
        folds.append(test_index)
    folds = ['\t'.join([final_strains[x] for x in fold.tolist()]) for fold in folds]
    FileUtility.save_list(path, folds)
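# Usage sketch (hypothetical names; assumes 'phylogenetic_nodes_and_clusters.txt'
# sits next to the tree file and maps "isolate<TAB>cluster_id" per line, as the
# parsing above implies):
#
#   gpa.create_treefold('folds/ciprofloxacin_folds.txt', 'tree/phylo.nwk',
#                       cv=10, test_ratio=0.2, phenotype='ciprofloxacin',
#                       mapping={'R': 1, 'S': 0})
#
# Grouping folds by phylogenetic cluster keeps closely related isolates in the
# same fold, so the evaluation is not inflated by population structure.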
def eco_all_classification_transfer_learning():
    '''
    Transfer learning: initialize from the pretrained eco_10000 MLP and keep
    its layers frozen (trainable=False)
    '''
    #[1024,0.2,256,0.1,256,0.1,128,0.1,64]
    X = FileUtility.load_sparse_csr(
        '../../datasets/processed_data/eco_all_classes/6-mer_eco_restrictedmer_all.npz').toarray()
    Y = FileUtility.load_list(
        '../../datasets/processed_data/eco_all_classes/eco_label_restrictedkmer_all.txt')
    DNN = DNNMutliclass16S(X, Y, model_arch=[512, 0.1, 256, 0.1, 128])
    DNN.cross_validation(
        '../../datasets/results/eco_all/nn',
        gpu_dev='6',
        pretrained_model=True,
        trainable=False,
        n_fold=5,
        epochs=10,
        batch_size=10,
        model_strct='../../datasets/results/eco_10000/classifiers/nn_layers_mlp_1024-0.2-512-0.2-512_0.88.pickle')
def eco_all_classification():
    '''
    10-fold cross-validation of the multiclass 16S MLP on the 6-mer eco
    dataset (all classes)
    '''
    #[1024,0.2,256,0.1,256,0.1,128,0.1,64]
    X = FileUtility.load_sparse_csr(
        '../../datasets/processed_data/eco_all_classes/6-mer_eco_restrictedmer_all.npz').toarray()
    Y = FileUtility.load_list(
        '../../datasets/processed_data/eco_all_classes/eco_label_restrictedkmer_all.txt')
    DNN = DNNMutliclass16S(X, Y, model_arch=[1024, 0.2, 512, 0.2, 512, 0.1, 256])
    DNN.cross_validation('../../datasets/results/eco_all/nn',
                         gpu_dev='1',
                         n_fold=10,
                         epochs=20,
                         batch_size=10,
                         model_strct='mlp')
def crohns_disease():
    '''
    Cross-validation of the multiclass 16S MLP on the Crohn's disease 6-mer dataset
    '''
    #[1024,0.2,256,0.1,256,0.1,128,0.1,64]
    X = FileUtility.load_sparse_csr(
        '../../datasets/processed_data/crohn/sample-size/6-mers_rate_complete1359_seq_5000.npz').toarray()
    Y = FileUtility.load_list(
        '../../datasets/processed_data/crohn/data_config/labels_disease_complete1359.txt')
    DNN = DNNMutliclass16S(X, Y, model_arch=[512, 0.2, 256, 0.2, 128, 0.1, 64, 16])
    DNN.cross_validation('../../datasets/results/crohn/classifier/nn',
                         gpu_dev='2',
                         n_fold=3,
                         epochs=25,
                         batch_size=10,
                         model_strct='mlp')
def load_book_map(self):
    '''
    Load the book-number mapping
    '''
    self.book_map = dict()
    for l in FileUtility.load_list('../meta/books2numbers.txt'):
        # each line: book number <TAB> comma-separated book identifiers
        for y in l.split('\t')[1].split(','):
            self.book_map[y] = l.split('\t')[0]
def generate_top_features(path, classifier_list, topk=200):
    writer = pd.ExcelWriter(path + '/ultimate_outputs/selected_features.xls',
                            engine='xlsxwriter')
    final_results = dict()
    # for each classifier, keep the feature-selection runs whose file names
    # have the most '##'-separated components (the most specific runs)
    for classifier in classifier_list:
        feature_files = FileUtility.recursive_glob(path + '/feature_selection/',
                                                   '*_' + classifier)
        res = dict()
        for file in feature_files:
            phenotype = file.split('/')[0:-1][-1]
            if not phenotype in res:
                res[phenotype] = [file]
            else:
                if file.split('/')[-1].count('##') > res[phenotype][0].split(
                        '/')[-1].count('##'):
                    res[phenotype] = [file]
                elif file.split('/')[-1].count('##') == res[phenotype][0].split(
                        '/')[-1].count('##'):
                    res[phenotype].append(file)
        for phenotype in res.keys():
            if phenotype not in final_results:
                final_results[phenotype] = []
            final_results[phenotype] += res[phenotype]
    for phenotype, files in final_results.items():
        # reciprocal rank of each feature within each file's top-k list
        selected = [{
            x.split('\t')[0]: 1 / (idx + 1)
            for idx, x in enumerate(FileUtility.load_list(file)[1:topk])
        } for file in files]
        # features confirmed by every file (currently unused downstream)
        res = set(selected[0])
        for set_select in selected[1::]:
            res = res.intersection(set_select)
        # accumulate the sum of reciprocal ranks and the confirmation count
        geno_val_res = dict()
        for dict_geno_val in selected:
            for x, val in dict_geno_val.items():
                if x not in geno_val_res:
                    geno_val_res[x] = [val, 1]
                else:
                    geno_val_res[x][0] += val
                    geno_val_res[x][1] += 1
        df_dict = {'feature_name': [], 'mrr': [], 'freq_confirmation': []}
        for name, values in geno_val_res.items():
            rr, nr = values
            df_dict['feature_name'].append(name)
            df_dict['mrr'].append(rr / nr)
            df_dict['freq_confirmation'].append(nr)
        df = pd.DataFrame(df_dict)
        df.sort_values(['freq_confirmation', 'mrr', 'feature_name'],
                       ascending=[False, False, False],
                       inplace=True)
        df = df.copy()
        df.to_excel(writer, sheet_name=phenotype, index=False)
    # flush the workbook to disk
    writer.save()
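# Worked example of the aggregation above: a feature ranked 1st in one file's
# top-k list and 3rd in another's receives reciprocal ranks 1 and 1/3, giving
# mrr = (1 + 1/3) / 2 = 0.67 and freq_confirmation = 2. Sorting by
# (freq_confirmation, mrr) therefore puts features that are confirmed by more
# classifiers, and ranked higher within them, at the top of each sheet.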
def test():
    X = FileUtility.load_sparse_csr('../body-sites/npe_rate_5000.npz').toarray()
    Y = FileUtility.load_list(
        '../body-sites/npe_representations_labels/labels_phen.txt')
    DNN = DNNMutliclass16S(X, Y, model_arch=[512, 0.2, 256, 0.2, 128, 0.1, 64])
    DNN.cross_validation('../body-sites/nn',
                         gpu_dev='2',
                         n_fold=3,
                         epochs=300,
                         batch_size=10,
                         model_strct='mlp')
def sequence_lengths(input_file):
    # blocks of token lines separated by blank lines, one block per protein
    train = FileUtility.load_list(input_file)
    training_data = [line.split() for line in train]
    final_list = list()
    temp = []
    for x in training_data:
        if x == []:
            final_list.append(temp)
            temp = []
        else:
            temp.append(x)
    # flush the last block in case the file does not end with a blank line
    if temp:
        final_list.append(temp)
    return [len(prot) for prot in final_list]
def DNN_classifier(out_dir, X_file, Y_file, arch, gpu_id, epochs, batch_size):
    # k-mer data
    X = FileUtility.load_sparse_csr(X_file).toarray()
    # labels
    Y = [int(y) for y in FileUtility.load_list(Y_file)]
    DeepNN = DNN(X, Y, model_arch=arch)
    DeepNN.cross_validation(out_dir,
                            gpu_dev=gpu_id,
                            n_fold=10,
                            epochs=epochs,
                            batch_size=batch_size,
                            model_strct='mlp')
def DNN_classifier(X_file, Y_file, arch, out_dir, dataset_name, gpu_id, epochs,
                   batch_size):
    # k-mer data
    X = FileUtility.load_sparse_csr(X_file).toarray()
    # labels
    Y = FileUtility.load_list(Y_file)
    DNN = DNNMutliclass16S(X, Y, model_arch=arch)
    DNN.cross_validation(out_dir + 'nn_classification_results_' + dataset_name,
                         gpu_dev=gpu_id,
                         n_fold=10,
                         epochs=epochs,
                         batch_size=batch_size,
                         model_strct='mlp')
def jump_url(self):
    '''
    :return: the next unseen, reachable script URL, or None when the list is exhausted
    '''
    # hard-coded bound: pngscript_filenames.txt is assumed to hold 1188 entries
    while self.counter < 1188:
        self.counter += 1
        url_select = '/'.join(self.url.split('/')[0:-1]) + '/' + \
            FileUtility.load_list('../meta/pngscript_filenames.txt')[self.counter]
        if url_select not in self.seen and url_select not in self.useless_url:
            if requests.get(url_select).status_code == 404:
                # if the parent directory is gone too, stop trying altogether
                if requests.get('/'.join(self.url.split('/')[0:-1])).status_code == 404:
                    self.counter = 1189
                    return None
                self.useless_url.add(url_select)
            else:
                url = url_select
                self.useless_url.add(url)
                return url
    return None
def convert_to_kmer(input_file, out_file, n=3):
    # parse blank-line-separated blocks of (residue, label) lines, one per protein
    train = FileUtility.load_list(input_file)
    training_data = [line.split() for line in train]
    final_list = list()
    temp = []
    for x in training_data:
        if x == []:
            final_list.append(temp)
            temp = []
        else:
            temp.append(x)
    # flush the last block in case the file does not end with a blank line
    if temp:
        final_list.append(temp)
    res = []
    for prot in final_list:
        # pad with sentence markers; note the single '$'/'#' padding matches
        # n == 3, other n would misalign windows and labels
        sentence = ''.join(['$'] + [aa[0] for aa in prot] + ['#'])
        res += [(sentence[i:i + n], prot[i][1])
                for i in range(len(sentence) - n + 1)]
        res += ['']
    FileUtility.save_list(out_file, [' '.join(list(x)) for x in res])
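# Worked example (assuming the usual two-column input of residue and label per
# line, with blank lines separating proteins): for a protein M K V with labels
# H H C and n=3, the padded sentence is '$MKV#', which yields the pairs
# ('$MK', 'H'), ('MKV', 'H'), ('KV#', 'C') -- one window per residue, each
# carrying that residue's label.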
def __init__(self, file_directory, file_extenstion, output_directory, dbname,
             vocab_size, seg_train_depth, rep_sampling_depth, blastn_path,
             num_p=1, onlyfiles=[], override=1):
    '''
    :param file_directory: the samples directory
    :param file_extenstion: the file extension fastq or fasta
    :param output_directory: directory for all generated outputs
    :param dbname: name of the dataset/run
    :param vocab_size: segmentation vocabulary size
    :param seg_train_depth: sampling depth for training the segmentation
    :param rep_sampling_depth: sampling depth for building the representation
    :param blastn_path: path to the blastn executable
    :param num_p: number of parallel processes
    :param onlyfiles: filter a list of files
    :param override: whether to overwrite existing intermediate files
    '''
    self.override = override
    self.file_directory = file_directory
    self.file_extenstion = file_extenstion
    self.fasta_files, self.filename_mapping = FileUtility.read_fasta_directory(
        self.file_directory, self.file_extenstion, only_files=onlyfiles)
    print(str(len(self.fasta_files)), ' fasta files found in',
          self.file_directory)
    self.dbname = dbname
    self.vocab_size = vocab_size
    self.seg_train_depth = seg_train_depth
    self.rep_sampling_depth = rep_sampling_depth
    self.num_p = num_p
    self.output_directory = output_directory
    self.output_directory_inter = (
        output_directory[0:-1] if output_directory[-1] == '/'
        else output_directory) + '/intermediate_files/'
    self.blastn_path = blastn_path
    DiTaxaWorkflow.ensure_dir(self.output_directory)
    # resume the log file if one already exists
    if not os.path.exists(self.output_directory + 'logfile.txt'):
        self.log_file = []
    else:
        self.log_file = FileUtility.load_list(self.output_directory + 'logfile.txt')
    print('\t✔ DiTaxa workflow is getting started')
def IBD():
    '''
    Run the full DiTaxa workflow on the Crohn's disease 16S dataset (CD vs. healthy)
    '''
    Pipeline = DiTaxaWorkflow(
        '/mounts/data/proj/asgari/dissertation/datasets/deepbio/microbiome/crohn/',
        'fastq',
        '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/IBDout/',
        'IBD', 50000, 5000, -1, num_p=20)
    Pipeline.train_npe()
    Pipeline.representation_npe()
    labels = dict([
        (x.split()[0] + '.fastq', x.split()[1]) for x in FileUtility.load_list(
            '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/crohns/rep/Crohns_lables.txt')
    ])
    Pipeline.biomarker_extraction(labels, {'CD': 1, 'no': 0, 'control': 0},
                                  'CD_vs_healthy')
def classical_classifier(out_dir, X_file, Y_file, model, cores):
    # k-mer data
    X = FileUtility.load_sparse_csr(X_file)
    # labels
    Y = [int(y) for y in FileUtility.load_list(Y_file)]
    if model == 'RF':
        #### Random Forest classifier
        MRF = RFClassifier(X, Y)
        # the best parameters, confusion matrix, best estimator, and results
        # on the folds will be stored at this address
        MRF.tune_and_eval(out_dir, njobs=cores)
    elif model == 'SVM':
        #### Support Vector Machine classifier
        MSVM = SVM(X, Y)
        # the best parameters, confusion matrix, best estimator, and results
        # on the folds will be stored at this address
        MSVM.tune_and_eval(out_dir, njobs=cores)
    elif model == 'LR':
        #### Logistic regression classifier
        MLR = LogRegression(X, Y)
        # the best parameters, confusion matrix, best estimator, and results
        # on the folds will be stored at this address
        MLR.tune_and_eval(out_dir, njobs=cores)
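# Usage sketch (hypothetical paths; Y_file is assumed to hold one integer label
# per line, matching the parsing above):
#
#   classical_classifier('results/RF/', 'rep/6-mers.npz', 'rep/labels.txt',
#                        model='RF', cores=8)
#
# Each branch calls tune_and_eval, which writes the best parameters, confusion
# matrix, best estimator, and per-fold results under out_dir.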