def tune_and_evaluate(self, estimator, parameters, score='macro_f1', n_jobs=-1, file_name='results'):
    '''
    :param estimator: scikit-learn estimator to be tuned
    :param parameters: parameter grid for the grid search
    :param score: scoring key used for refitting the best model
    :param n_jobs: number of parallel jobs
    :param file_name: directory/tuning/classifier/features/
    :return:
    '''
    # grid search over the parameter grid with cross-validation
    self.greed_search = GridSearchCV(estimator=estimator,
                                     param_grid=parameters,
                                     cv=self.cv,
                                     scoring=self.scoring,
                                     refit=score,
                                     error_score=0,
                                     n_jobs=n_jobs)
    label_set = list(set(self.Y))
    # fitting
    self.greed_search.fit(X=self.X, y=self.Y)
    y_predicted = cross_val_predict(self.greed_search.best_estimator_, self.X, self.Y)
    conf = confusion_matrix(self.Y, y_predicted, labels=label_set)
    # save the results in a file
    FileUtility.save_obj(file_name, [
        label_set, conf, self.greed_search.best_score_,
        self.greed_search.best_estimator_, self.greed_search.cv_results_,
        self.greed_search.best_params_, y_predicted
    ])
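# Illustrative, self-contained sketch (not part of the original codebase) of the pattern used
# above: a grid search with a refit metric, followed by cross-validated predictions and a
# confusion matrix. The demo dataset and parameter grid are assumptions for the example only.
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

X_demo, y_demo = load_iris(return_X_y=True)
search = GridSearchCV(SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
                      cv=5, scoring={'macro_f1': 'f1_macro'}, refit='macro_f1')
search.fit(X_demo, y_demo)
y_pred_demo = cross_val_predict(search.best_estimator_, X_demo, y_demo, cv=5)
print(confusion_matrix(y_demo, y_pred_demo))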
def __init__(self, triple, crawl=True, parse=True, remove_after_parse=False, printing=False):
    '''
    :param triple: (url, destination_directory, output_file)
    :param crawl: whether to crawl the pages
    :param parse: whether to parse the crawled pages
    '''
    # get parameters
    self.url, self.destination_directory, output_file = triple
    response = requests.get(self.url)
    if response.status_code == 200:
        try:
            self.output_file = self.destination_directory + output_file
            self.print = printing
            # crawl the pages
            if crawl:
                BibleCrawler.run_crawler(self, '//a[@class = "next"]/@href',
                                         self.url, self.destination_directory)
            if parse:
                # find the language ID in the website
                self.lang_directory = '/'.join(self.url.split('/')[3:7]) + '/'
                self.url = self.url[self.url.find('.com') + 5::]
                if '.' in self.url.split('/')[-1]:
                    self.lang_directory = '/'.join(self.url.split('/')[3:-1]) + '/'
                books = self.destination_directory + self.lang_directory
                # parse the crawled pages into the output file
                self.run_parser(books, self.output_file)
            if remove_after_parse:
                # remove the intermediate crawling directory
                FileUtility.remove_dir(self.destination_directory + self.lang_directory)
        except:
            return None
    else:
        return None
    return None
def create_report_cloud(self):
    report = {
        'language_iso': [],
        'trans_ID': [],
        'language_name': [],
        'Description': [],
        'verses': []
    }
    for trID in self.df_cloud.trans_ID:
        iso = self.id2iso_dict[trID]
        if not FileUtility.exists(self.output_path + '/' + iso + '_' + trID + '.cloud.txt'):
            length = 0
        else:
            length = len(FileUtility.load_list(self.output_path + '/' + iso + '_' + trID + '.cloud.txt'))
        report['language_iso'].append(iso)
        report['trans_ID'].append(trID)
        report['language_name'].append(self.id2lang_dict[trID])
        report['Description'].append(self.id2version[trID])
        report['verses'].append(length)
    report = pd.DataFrame(report)
    report.set_index('trans_ID')
    report.to_csv(self.output_path + '/reports/crawl_report_cloud.tsv',
                  sep='\t',
                  index=False,
                  columns=['language_iso', 'trans_ID', 'language_name', 'Description', 'verses'])
    self.generate_final_rep()
def tune_and_evaluate(self, estimator, parameters, score='f1_macro', file_name='results'):
    '''
    :param estimator: scikit-learn estimator to be tuned
    :param parameters: parameter grid for the grid search
    :param score: scoring key used for refitting the best model
    :param file_name: directory/tuning/classifier/features/
    :return:
    '''
    # inner cross-validation: grid search over the parameter grid
    self.greed_search = GridSearchCV(estimator=estimator,
                                     param_grid=parameters,
                                     cv=self.inner_cv,
                                     scoring=self.scoring,
                                     refit=score,
                                     error_score=0)
    # nested CV with parameter optimization in the inner loop
    self.nested_score = cross_val_score(self.greed_search, X=self.X, y=self.Y, cv=self.outer_cv)
    # saving (save_obj takes the file name first, as in the non-nested variant)
    FileUtility.save_obj(file_name, [self.greed_search, self.nested_score])
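# Illustrative, self-contained sketch (not part of the original codebase) of nested
# cross-validation: an inner GridSearchCV is itself evaluated by an outer cross_val_score.
# The demo dataset and grid are assumptions for the example only.
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.svm import SVC

X_demo, y_demo = load_iris(return_X_y=True)
inner = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}, cv=StratifiedKFold(5), scoring='f1_macro')
outer_scores = cross_val_score(inner, X=X_demo, y=y_demo, cv=StratifiedKFold(5))
print(outer_scores.mean())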
def RA_healthy():
    Pipeline = DiTaxaWorkflow(
        '/mounts/data/proj/asgari/dissertation/datasets/deepbio/microbiome/RA/',
        'fastq',
        '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/RAoutput/',
        'RA', 50000, 5000, -1, num_p=20)
    #Pipeline.train_npe()
    #Pipeline.representation_npe()
    labels = FileUtility.load_list(
        '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/ra/rep/labels.txt')
    labels = {
        x.split('/')[-1]: labels[idx]
        for idx, x in enumerate(
            FileUtility.load_list(
                '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/ra/rep/ra_selfposnpe_10000_npe_5000_meta'))
    }
    Pipeline.biomarker_extraction(labels, {'untreated_RA': 1, 'treated_RA': 0}, 'untreated_vs_treated')
def crawl_bpc(self, nump=20, update_meta=False, override=False, repeat=1):
    # update the metadata table
    if update_meta:
        self.update_meta_data()
    # read the metadata table
    self.df_biblecom = pd.read_table('../meta/biblecom.tsv', sep='\t')
    urliso = self.df_biblecom[['url', 'language_iso']].values.tolist()
    if not override:
        new_list = []
        for url, iso in urliso:
            num = url.split('/')[0:-1][-1]
            if not FileUtility.exists(self.output_path + '/' + iso + '_' + num + '.biblecom.txt'):
                new_list.append([url, iso])
        urliso = new_list
    res = BibleComAPl.make_parallel(min(nump, len(urliso)), self.crawl_a_lang, urliso)
    # iterating for max coverage
    continue_iter = True
    count = 0
    while continue_iter and count < repeat:
        # update the list of still-missing translations
        new_list = []
        for url, iso in urliso:
            num = url.split('/')[0:-1][-1]
            if not FileUtility.exists(self.output_path + '/' + iso + '_' + num + '.biblecom.txt'):
                new_list.append([url, iso])
        if len(new_list) == len(urliso):
            continue_iter = False
        count += 1
        urliso = new_list
        print('Double check for the missing translations..')
        res = BibleComAPl.make_parallel(min(nump, len(urliso)), self.crawl_a_lang, urliso)
    self.create_report_biblecom()
def org_classification():
    '''
    '''
    X = FileUtility.load_sparse_csr('../../datasets/processed_data/org/K/6-mer_org_restrictedkmer.npz').toarray()
    Y = FileUtility.load_list('../../datasets/processed_data/org/K/org_label_restrictedkmer.txt')
    DNN = DNNMutliclass16S(X, Y, model_arch=[1024, 0.2, 256, 0.1, 256, 0.1, 128, 0.1, 64])
    DNN.cross_validation('../../datasets/results/org/classifier/nn',
                         gpu_dev='2',
                         n_fold=10,
                         epochs=30,
                         batch_size=100,
                         model_strct='mlp')
def __init__(self):
    # initialize the static parser and the dynamic parser
    dynamic_path = "%s/input/qe_config_dynamic.JSON" % os.path.abspath(os.pardir)
    static_path = "%s/input/qe_config_static.JSON" % os.path.abspath(os.pardir)
    self.dynamic_parser = json_parser.ReadJson(dynamic_path)
    self.static_parser = json_parser.ReadJson(static_path)
    # initialize the start and end time of the aging (burn-in) test
    self.start_time = self.dynamic_parser.get_first_layer("START_TIME")
    self.end_time = self.dynamic_parser.get_first_layer("END_TIME")
    # initialize the time interval (in seconds) used for aggregating the data:
    # every CAP_TIME is divided by time_interval to obtain a time_index, discretizing continuous time
    self.time_interval = self.static_parser.get_first_layer("AGGREGATION_INTERVAL")
    # sometimes we do not care whether values above the upper measurement range are accurate,
    # e.g. PM25 > 500 or PM10 > 600
    self.max_exp_val = self.static_parser.get_first_layer("MAX_EXP_VAL")
    # the file_utility object creates the output directories and provides the locations
    # where data, figures, and output files are stored
    self.file_utility = FileUtility()
    # since each device under quality evaluation is processed separately, keep track of the
    # variable currently being processed and whether this is the first device for that variable;
    # these flags exist mainly for writing the output data
    self.first_qe_device = True
    self.current_var = ''
def representation_npe(self):
    '''
    :return:
    '''
    print('npe generation started..')
    start = time.time()
    G16s = NPESegmentApplyMetagenomics(
        self.file_directory,
        self.file_extenstion,
        self.output_directory + 'npe_segmentatation/' + self.dbname + '_' +
        '_'.join(['unique', str(self.vocab_size), 'v', str(self.seg_train_depth), 's.model']),
        sampling_number=self.rep_sampling_depth,
        num_p=self.num_p)
    DiTaxaWorkflow.ensure_dir(self.output_directory + 'npe_representation/')
    G16s.generate_npes_all(save=self.output_directory + 'npe_representation/' +
                           self.dbname + '_uniquepiece_' + str(self.rep_sampling_depth))
    end = time.time()
    spent = end - start
    self.log_file.append('generating the representations npe_representation/' +
                         self.dbname + '_uniquepiece_' + str(self.rep_sampling_depth) +
                         ' ' + str(spent) + ' seconds , using ' + str(self.num_p) + ' cores')
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)
def generate_LR_important_features(self, clf_LR, feature_names, results_file, N=1000):
    '''
    :param clf_LR: trained logistic regression classifier
    :param feature_names: names of the features (columns)
    :param results_file: where to write the ranked features
    :param N: number of top features to keep
    :return:
    '''
    results_file = results_file.replace('/classifications/', '/feature_selection/classifications/')
    FileUtility.ensure_dir(results_file)
    file_name = results_file + '_LR'
    # rank features by the absolute value of their logistic-regression coefficients
    idxs = argsort(np.abs(clf_LR.coef_.tolist()[0]).tolist(), rev=True)[0:N]
    f = codecs.open(file_name, 'w')
    f.write('\t'.join(['feature', 'score']) + '\n')
    for idx in idxs:
        f.write('\t'.join([feature_names[idx], str(clf_LR.coef_.tolist()[0][idx])]) + '\n')
    f.close()
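# Illustrative, self-contained sketch (not part of the original codebase) of ranking features
# by the magnitude of logistic-regression coefficients, the same idea as the method above.
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

data = load_breast_cancer()
clf = LogisticRegression(max_iter=5000).fit(data.data, data.target)
order = np.argsort(np.abs(clf.coef_[0]))[::-1][:10]          # indices of the 10 largest |coefficients|
for idx in order:
    print(data.feature_names[idx], clf.coef_[0][idx])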
def create_report_png(self):
    report = {
        'language_iso': [],
        'trans_ID': [],
        'language_name': [],
        'verses': []
    }
    self.df_png = pd.DataFrame(report)
    png_files = FileUtility.recursive_glob(self.output_path + '/', '*.png.txt')
    for png_file in png_files:
        iso, code = png_file.split('/')[-1].split('.')[0:-1][0:-1][-1].split('_')
        length = len(FileUtility.load_list(png_file))
        lang_name = self.lang_dict[iso] if iso in self.lang_dict else 'ISO: ' + iso
        self.df_png = self.df_png.append(
            {
                'language_iso': iso,
                'trans_ID': code,
                'language_name': lang_name,
                'verses': length
            },
            ignore_index=True)
    self.df_png.set_index('trans_ID')
    self.df_png.to_csv(self.output_path + '/reports/crawl_report_png.tsv',
                       sep='\t',
                       index=False,
                       columns=['language_iso', 'trans_ID', 'language_name', 'verses'])
    self.generate_final_rep()
def write_in_file(filename, pos, neg):
    lines = [['direction', 'marker', 'p-value']]
    for marker, pval in pos:
        lines.append(['+', marker, str(pval)])
    for marker, pval in neg:
        lines.append(['-', marker, str(pval)])
    FileUtility.save_list(filename, ['\t'.join(line) for line in lines])
def generate(self, vocab_size, sample_size, output_dir, num_p=4, backend='Sentencepiece'):
    '''
    :param vocab_size: the size of the final vocabulary
    :param sample_size: how many reads to sample from each file
    :param output_dir: where to write the results
    :param num_p: number of cores
    :param backend: 'Sentencepiece' or 'normalbpe'
    :return:
    '''
    start = timeit.default_timer()
    fasta_files = [(x, sample_size) for x in self.fasta_files]
    corpus = []
    pool = Pool(processes=num_p)
    for ky, v in tqdm.tqdm(pool.imap_unordered(self._get_corpus, fasta_files, chunksize=num_p),
                           total=len(fasta_files)):
        corpus = corpus + v
    pool.close()
    print('Corpus size for training NPE is ', len(corpus))
    if backend == 'Sentencepiece':
        FileUtility.save_list('../tmp/tmp_txt', corpus)
        spm.SentencePieceTrainer.Train(
            '--input=../tmp/tmp_txt --model_prefix=' + output_dir +
            ' --add_dummy_prefix false --max_sentencepiece_length=512 --model_type=bpe' +
            ' --mining_sentence_size=5000000 --input_sentence_size=10000000' +
            ' --vocab_size=' + str(vocab_size))
        # keep only a small sample of the temporary corpus after training
        FileUtility.save_list('../tmp/tmp_txt', corpus[0:10])
    elif backend == 'normalbpe':
        train_npe(corpus, output_dir, vocab_size, output_dir + '_freq')
    print('The segmentation training took ', timeit.default_timer() - start, ' seconds.')
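# Illustrative, self-contained sketch (not part of the original codebase) of training and
# applying a SentencePiece BPE model; the file names and the tiny toy corpus are assumptions
# made only for this demo.
import sentencepiece as spm

with open('toy_corpus.txt', 'w') as handle:
    handle.write('\n'.join(['ACGTACGTAGCT', 'ACGTTTGCATGC', 'ACGTACGGGGCT'] * 100))
spm.SentencePieceTrainer.Train('--input=toy_corpus.txt --model_prefix=toy_bpe '
                               '--model_type=bpe --vocab_size=50 --hard_vocab_limit=false')
sp = spm.SentencePieceProcessor()
sp.Load('toy_bpe.model')
print(sp.EncodeAsPieces('ACGTACGTAGCT'))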
def __init__(self, fasta_file, matrix_path, feature_file_path, phenotypes,
             phenotype_mapping, selected_samples, p_value_threshold=0.01,
             remove_redundants=False, num_p=4, blastn_path=''):
    if len(blastn_path) > 0:
        os.environ['PATH'] += ':' + blastn_path
    self.num_p = num_p
    self.seq_IDS = FileUtility.read_fasta_sequences_ids(fasta_file)
    self.remove_redundants = remove_redundants
    self.ez_taxa_dict = {
        x.split()[0]: x.split()[1].split(';')
        for x in FileUtility.load_list('db/ez_idx_taxonomy.txt')
    }
    # restrict the representation matrix to the selected samples
    self.mat = FileUtility.load_sparse_csr(matrix_path)
    self.mat = self.mat.toarray()
    self.mat = self.mat[selected_samples, :]
    self.mat = csr_matrix(self.mat)
    self.features = FileUtility.load_list(feature_file_path)
    self.align_markers_parallel(p_value_threshold)
    self.redundant_columns_indentification()
    self.phenotype_mapping = phenotype_mapping
    self.phenotypes = phenotypes
def train_npe(self):
    '''
    :return:
    '''
    print('npe training started.. it might take more than 1 hour for more than 1000 samples')
    DiTaxaWorkflow.blockPrint()
    start = time.time()
    G16s = NPESegmentTrainMetagenomics(self.file_directory, self.file_extenstion)
    DiTaxaWorkflow.ensure_dir(self.output_directory + 'npe_segmentatation/')
    G16s.generate(self.vocab_size,
                  self.seg_train_depth,
                  self.output_directory + 'npe_segmentatation/' + self.dbname + '_' +
                  '_'.join(['unique', str(self.vocab_size), 'v', str(self.seg_train_depth), 's']),
                  backend='Sentencepiece',
                  num_p=self.num_p)
    end = time.time()
    spent = end - start
    self.log_file.append('training segmentation ' +
                         '_'.join(['unique', str(self.vocab_size), 'v', str(self.seg_train_depth), 's ']) +
                         str(spent) + ' seconds , using ' + str(self.num_p) + ' cores')
    DiTaxaWorkflow.enablePrint()
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)
def create_report_biblecom(self):
    self.df_biblecom['verses'] = 0
    biblecom_files = FileUtility.recursive_glob(self.output_path + '/', '*.biblecom.txt')
    for bib_file in biblecom_files:
        file_parts = bib_file.split('/')[-1].split('.')[0:-1][0:-1][-1].split('_')
        num_file_parts = len(file_parts)
        if num_file_parts == 2:
            iso, code = file_parts
        elif num_file_parts == 3:
            iso = "_".join(file_parts[:2])
            code = file_parts[2]
        else:
            continue
        length = len(FileUtility.load_list(bib_file))
        # assign with a single .loc call instead of chained indexing, which may silently write to a copy
        mask = (self.df_biblecom['language_iso'] == iso) & (self.df_biblecom['trans_ID'] == int(code))
        self.df_biblecom.loc[mask, 'verses'] = length
    self.df_biblecom.set_index('trans_ID')
    self.df_biblecom.to_csv(self.output_path + '/reports/crawl_report_biblecom.tsv',
                            sep='\t',
                            index=False,
                            columns=['language_iso', 'trans_ID', 'language_name', 'verses'])
    self.generate_final_rep()
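# Minimal sketch (not part of the original codebase) of boolean-mask assignment with .loc,
# the pattern used above to fill the 'verses' column; the toy frame is an assumption for the demo.
import pandas as pd

df = pd.DataFrame({'language_iso': ['eng', 'deu'], 'trans_ID': [1, 2], 'verses': [0, 0]})
df.loc[(df['language_iso'] == 'eng') & (df['trans_ID'] == 1), 'verses'] = 31102
print(df)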
def create_treefold(self, path, tree_addr, cv, test_ratio, phenotype, mapping=None):
    ## find a mapping from strains to the phenotypes
    if mapping:
        mapping_isolate_label = dict(self.get_new_labeling(mapping)[phenotype])
    else:
        mapping_isolate_label = self.phenotype2labeled_strains_mapping[phenotype]
    # get common strains
    list_of_list_of_strains = list(self.strains.values())
    list_of_list_of_strains.append(list(mapping_isolate_label.keys()))
    final_strains = GenotypePhenotypeAccess.get_common_strains(list_of_list_of_strains)
    final_strains.sort()
    # prepare test
    Y = [mapping_isolate_label[strain] for strain in final_strains]
    isolate_to_group = dict([
        tuple(l.split('\t')) for l in FileUtility.load_list(
            tree_addr.replace(tree_addr.split('/')[-1], 'phylogenetic_nodes_and_clusters.txt'))
    ])
    groups = [int(isolate_to_group[iso]) for iso in final_strains]
    group_kfold = GroupKFold(n_splits=round(1 / test_ratio))
    train_index, test_index = list(group_kfold.split(final_strains, Y, groups))[0]
    X_test = [final_strains[x] for x in test_index]
    FileUtility.save_list(path.replace('_folds.txt', '_test.txt'), ['\t'.join(X_test)])
    final_strains = [final_strains[ix] for ix in train_index]
    group_kfold = GroupKFold(n_splits=cv)
    folds = []
    for _, test_index in group_kfold.split(train_index, [Y[idx] for idx in train_index],
                                           [groups[idx] for idx in train_index]):
        folds.append(test_index)
    folds = ['\t'.join([final_strains[x] for x in fold.tolist()]) for fold in folds]
    FileUtility.save_list(path, folds)
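# Illustrative, self-contained sketch (not part of the original codebase) of GroupKFold,
# the splitter used above so that samples sharing a phylogenetic group never appear on both
# the train and the test side of a fold. The toy samples and groups are assumptions for the demo.
from sklearn.model_selection import GroupKFold

samples = ['s1', 's2', 's3', 's4', 's5', 's6']
labels = [0, 0, 1, 1, 0, 1]
groups = [1, 1, 2, 2, 3, 3]                      # e.g. phylogenetic clusters
for train_idx, test_idx in GroupKFold(n_splits=3).split(samples, labels, groups):
    print([samples[i] for i in test_idx])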
def __init__(self, triple, crawl=True, parse=True, remove_after_parse=False, printing=False):
    '''
    :param triple: (url, destination_directory, output_file)
    :param crawl: whether to crawl the pages
    :param parse: whether to parse the crawled pages
    '''
    try:
        # get parameters
        self.url, self.destination_directory, output_file = triple
        self.output_file = self.destination_directory + output_file
        self.print = printing
        # crawl the pages
        # to be fixed
        if crawl:
            BibleCrawler.run_crawler(self, '//a[@class = "chapter-nav-right"]/@href',
                                     self.url, self.destination_directory)
        if parse:
            # find the language ID in the website
            self.lang_directory = self.url.split('/')[3]
            # parse the crawled pages into the output file
            books = self.destination_directory + self.lang_directory
            self.run_parser(books, self.output_file)
        if remove_after_parse:
            # remove the intermediate crawling directory
            FileUtility.remove_dir(self.destination_directory + self.lang_directory)
    except:
        try:
            print(triple)
        except:
            return None
    return None
def eco_all_classification_transfer_learning():
    '''
    '''
    #[1024,0.2,256,0.1,256,0.1,128,0.1,64]
    X = FileUtility.load_sparse_csr('../../datasets/processed_data/eco_all_classes/6-mer_eco_restrictedmer_all.npz').toarray()
    Y = FileUtility.load_list('../../datasets/processed_data/eco_all_classes/eco_label_restrictedkmer_all.txt')
    DNN = DNNMutliclass16S(X, Y, model_arch=[512, 0.1, 256, 0.1, 128])
    DNN.cross_validation('../../datasets/results/eco_all/nn',
                         gpu_dev='6',
                         pretrained_model=True,
                         trainable=False,
                         n_fold=5,
                         epochs=10,
                         batch_size=10,
                         model_strct='../../datasets/results/eco_10000/classifiers/nn_layers_mlp_1024-0.2-512-0.2-512_0.88.pickle')
def eco_all_classification():
    '''
    '''
    #[1024,0.2,256,0.1,256,0.1,128,0.1,64]
    X = FileUtility.load_sparse_csr('../../datasets/processed_data/eco_all_classes/6-mer_eco_restrictedmer_all.npz').toarray()
    Y = FileUtility.load_list('../../datasets/processed_data/eco_all_classes/eco_label_restrictedkmer_all.txt')
    DNN = DNNMutliclass16S(X, Y, model_arch=[1024, 0.2, 512, 0.2, 512, 0.1, 256])
    DNN.cross_validation('../../datasets/results/eco_all/nn',
                         gpu_dev='1',
                         n_fold=10,
                         epochs=20,
                         batch_size=10,
                         model_strct='mlp')
def crohns_disease():
    '''
    '''
    #[1024,0.2,256,0.1,256,0.1,128,0.1,64]
    X = FileUtility.load_sparse_csr('../../datasets/processed_data/crohn/sample-size/6-mers_rate_complete1359_seq_5000.npz').toarray()
    Y = FileUtility.load_list('../../datasets/processed_data/crohn/data_config/labels_disease_complete1359.txt')
    DNN = DNNMutliclass16S(X, Y, model_arch=[512, 0.2, 256, 0.2, 128, 0.1, 64, 16])
    DNN.cross_validation('../../datasets/results/crohn/classifier/nn',
                         gpu_dev='2',
                         n_fold=3,
                         epochs=25,
                         batch_size=10,
                         model_strct='mlp')
def __init__(self, output_path):
    '''
    Constructor
    '''
    # set the parameters
    self.output_path = output_path
    FileUtility.ensure_dir(self.output_path + '/biblecom_intermediate/')
    FileUtility.ensure_dir(self.output_path + '/reports/')
def write_in_fastafile(filename, res, min_length=50):
    corpus = []
    labels = []
    for seq, score, pval, _, _ in res:
        if len(seq) > min_length and pval < 0.05:
            corpus.append(seq)
            labels.append(' '.join(['+' if score > 0 else '-', 'p-val:' + str(pval)]))
    FileUtility.create_fasta_file(filename, corpus, labels)
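# Minimal sketch (not part of the original codebase) of writing the selected sequences to a
# FASTA file with their direction and p-value in the header, mirroring the filtering above.
# The simplified result tuples and the output file name are assumptions for the demo.
res_demo = [('ACGT' * 20, 1.5, 0.01), ('ACGT' * 5, -0.7, 0.2)]   # (sequence, score, p-value)
with open('selected_markers.fasta', 'w') as handle:
    for seq, score, pval in res_demo:
        if len(seq) > 50 and pval < 0.05:
            handle.write('>' + ('+' if score > 0 else '-') + ' p-val:' + str(pval) + '\n')
            handle.write(seq + '\n')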
def generate_top_features(path, classifier_list, topk=200):
    ## TODO: ask topk as an input
    writer = pd.ExcelWriter(path + '/ultimate_outputs/selected_features.xls', engine='xlsxwriter')
    final_results = dict()
    for classifier in classifier_list:
        feature_files = FileUtility.recursive_glob(path + '/feature_selection/', '*_' + classifier)
        res = dict()
        for file in feature_files:
            phenotype = file.split('/')[0:-1][-1]
            if not phenotype in res:
                res[phenotype] = [file]
            else:
                if file.split('/')[-1].count('##') > res[phenotype][0].split('/')[-1].count('##'):
                    res[phenotype] = [file]
                elif file.split('/')[-1].count('##') == res[phenotype][0].split('/')[-1].count('##'):
                    res[phenotype].append(file)
        for phenotype in res.keys():
            if phenotype not in final_results:
                final_results[phenotype] = []
            final_results[phenotype] += res[phenotype]
    for phenotype, files in final_results.items():
        # reciprocal-rank score per feature (1 / rank within each ranked file)
        selected = [{
            x.split('\t')[0]: 1 / (idx + 1)
            for idx, x in enumerate(FileUtility.load_list(file)[1:topk])
        } for file in files]
        res = set(selected[0])
        for set_select in selected[1::]:
            res = res.intersection(set_select)
        geno_val_res = dict()
        for dict_geno_val in selected:
            for x, val in dict_geno_val.items():
                if x not in geno_val_res:
                    geno_val_res[x] = [val, 1]
                else:
                    geno_val_res[x][0] += val
                    geno_val_res[x][1] += 1
        df_dict = {'feature_name': [], 'mrr': [], 'freq_confirmation': []}
        for name, values in geno_val_res.items():
            rr, nr = values
            df_dict['feature_name'].append(name)
            df_dict['mrr'].append(rr / nr)
            df_dict['freq_confirmation'].append(nr)
        df = pd.DataFrame(df_dict)
        df.sort_values(['freq_confirmation', 'mrr', 'feature_name'],
                       ascending=[False, False, False], inplace=True)
        df = df.copy()
        df.to_excel(writer, sheet_name=phenotype, index=False)
    # flush the workbook to disk
    writer.save()
def train_resampling_npe(sentenses, outfile, num_symbols, frequency_file,
                         min_frequency=2, verbose=False, is_dict=False,
                         resample_size=10000, N=10):
    """Learn num_symbols BPE operations from vocabulary, and write to outfile."""
    outfile_name = outfile
    list_of_seg = []
    outfile = codecs.open(outfile, 'w', 'utf-8')
    f = codecs.open(frequency_file, 'w', 'utf-8')
    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility
    outfile.write('#version: 0.2\n')
    list_of_seg.append('#version: 0.2')
    vocab = get_vocabulary(sentenses, is_dict)
    vocab = dict([(tuple(x[:-1]) + (x[-1] + '</w>',), y) for (x, y) in vocab.items()])
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
    stats, indices = get_pair_statistics(sorted_vocab)
    big_stats = copy.deepcopy(stats)
    # threshold is inspired by Zipfian assumption, but should only affect speed
    threshold = max(stats.values()) / 10
    for i in tqdm.tqdm(range(num_symbols)):
        if stats:
            most_frequent = max(stats, key=lambda x: (stats[x], x))
        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i / (i + 10000.0)
            prune_stats(stats, big_stats, threshold)
        if stats[most_frequent] < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break
        f.write('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        list_of_seg.append('{0} {1} '.format(*most_frequent))
        #print('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        if verbose:
            sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(
                i, most_frequent[0], most_frequent[1], stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        stats[most_frequent] = 0
        if not i % 100:
            prune_stats(stats, big_stats, threshold)
        if not i % 100:
            FileUtility.save_list(outfile_name + '_temp', list_of_seg)
    f.close()
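# Minimal, self-contained sketch (not part of the original codebase) of the core BPE idea used
# above: repeatedly count adjacent symbol pairs in the vocabulary and merge the most frequent
# pair. The toy vocabulary below is an assumption made only for this demo.
from collections import Counter

toy_vocab = {('l', 'o', 'w', '</w>'): 5,
             ('l', 'o', 'w', 'e', 'r', '</w>'): 2,
             ('n', 'e', 'w', 'e', 's', 't', '</w>'): 6}

def pair_counts(vocab):
    # count adjacent symbol pairs, weighted by word frequency
    counts = Counter()
    for word, freq in vocab.items():
        for a, b in zip(word, word[1:]):
            counts[(a, b)] += freq
    return counts

def merge(pair, vocab):
    # replace every occurrence of the pair by its concatenation
    merged = {}
    for word, freq in vocab.items():
        out, i = [], 0
        while i < len(word):
            if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
                out.append(word[i] + word[i + 1])
                i += 2
            else:
                out.append(word[i])
                i += 1
        merged[tuple(out)] = freq
    return merged

for _ in range(5):                               # learn 5 merge operations
    best = pair_counts(toy_vocab).most_common(1)[0][0]
    print('merge:', best)
    toy_vocab = merge(best, toy_vocab)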
def create_tsne_web(X, Y, tsne_file_coor, tsne_file_label):
    classes = list(set(Y))
    classes.sort()
    L = [classes.index(y) for y in Y]
    tsne_res = np.hstack((X, np.array([L]).T))
    tsne_res[:, 0:2] = np.round(tsne_res[:, 0:2], 2)
    tsne_lines = []
    for l in tsne_res:
        tsne_lines.append('\t'.join([str(l[0]), str(l[1]), str(int(l[2]))]))
    FileUtility.save_list(tsne_file_coor, tsne_lines)
    FileUtility.save_list(tsne_file_label, Y)
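# Illustrative, self-contained sketch (not part of the original codebase) of producing the
# 2-D coordinates this helper expects, using scikit-learn's t-SNE on a small demo dataset.
from sklearn.datasets import load_iris
from sklearn.manifold import TSNE

data = load_iris()
coords = TSNE(n_components=2, random_state=0).fit_transform(data.data)   # shape (n_samples, 2)
labels = [data.target_names[t] for t in data.target]
print(coords[:3], labels[:3])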
def test():
    X = FileUtility.load_sparse_csr('../body-sites/npe_rate_5000.npz').toarray()
    Y = FileUtility.load_list('../body-sites/npe_representations_labels/labels_phen.txt')
    DNN = DNNMutliclass16S(X, Y, model_arch=[512, 0.2, 256, 0.2, 128, 0.1, 64])
    DNN.cross_validation('../body-sites/nn',
                         gpu_dev='2',
                         n_fold=3,
                         epochs=300,
                         batch_size=10,
                         model_strct='mlp')
def sequential_crawl(triples, override=False):
    if not override:
        new_list = []
        for x, y, z in triples:
            if not FileUtility.exists(y + z):
                new_list.append((x, y, z))
        triples = new_list
    print('Start crawling..')
    for x in tqdm.tqdm(triples):
        PNGScriptRetrieve(x)
    FileUtility.save_list(triples[0][1] + 'log.txt', PNGScriptRetrieve.log)
def DNN_classifier(out_dir, X_file, Y_file, arch, gpu_id, epochs, batch_size):
    # k-mer data
    X = FileUtility.load_sparse_csr(X_file).toarray()
    # labels
    Y = [int(y) for y in FileUtility.load_list(Y_file)]
    DeepNN = DNN(X, Y, model_arch=arch)
    DeepNN.cross_validation(out_dir,
                            gpu_dev=gpu_id,
                            n_fold=10,
                            epochs=epochs,
                            batch_size=batch_size,
                            model_strct='mlp')
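# Minimal sketch (not part of the original codebase) of the same idea using scikit-learn's
# MLPClassifier in place of the project's Keras-based DNN class: cross-validate a small
# multi-layer perceptron on a feature matrix with integer labels. The demo dataset and the
# hidden-layer sizes are assumptions for the example only.
from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

X_demo, y_demo = load_digits(return_X_y=True)
mlp = MLPClassifier(hidden_layer_sizes=(512, 128), max_iter=200)
print(cross_val_score(mlp, X_demo, y_demo, cv=3, scoring='f1_macro').mean())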
def numpy2trainfiles(file, name, out='../data/s8_features/'):
    '''
    test_file='/mounts/data/proj/asgari/dissertation/datasets/deepbio/protein_general/ss/data/cb513+profile_split1.npy'
    train_file='/mounts/data/proj/asgari/dissertation/datasets/deepbio/protein_general/ss/data/cullpdb+profile_6133_filtered.npy'
    :param name:
    :param out:
    :return:
    '''
    db = np.load(file)
    # column selection: 0..20 hold the one-hot residue encoding, 35..55 hold the profile features
    a = np.arange(0, 21)
    b = np.arange(35, 56)
    c = np.hstack((a, b))
    # reshape the flat rows into (protein, residue position, 57 per-residue features)
    db = np.reshape(db, (db.shape[0], int(db.shape[1] / 57), 57))
    seq = ['A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y', 'X', 'NoSeq']
    label = ['L', 'B', 'E', 'G', 'I', 'H', 'S', 'T']
    sequences = []
    labels = []
    possible_features = dict()
    for i in range(0, db.shape[0]):
        sequences.append(''.join([seq[np.argmax(x)] if np.max(x) == 1 else '' for x in db[i, :, 0:21]]).lower())
        labels.append(''.join([label[np.argmax(y)] if np.max(y) == 1 else '' for y in db[i, :, 22:30]]).lower())
    # sort proteins by length
    lengths = [len(x) for x in sequences]
    sorted_idxs = argsort(lengths)
    lengths.sort()
    sequences = [sequences[i] for i in sorted_idxs]
    labels = [labels[i] for i in sorted_idxs]
    FileUtility.save_list(out + name, [
        '\n'.join([' '.join([elx, labels[idx][idy]]) for idy, elx in enumerate(list(seq))] + [''])
        for idx, seq in enumerate(sequences)
    ])
    db_new = db[sorted_idxs, :, :]
    label_encoding = [[([0] if np.max(row) == 1 else [1]) + row for row in db_new[i, :, 22:30].tolist()]
                      for i in range(0, db.shape[0])]
    np.save(out + name + '_mat_Y', label_encoding)
    db_new = db_new[:, :, c]
    np.save(out + name + '_mat_X', db_new)
    FileUtility.save_list(out + name + '_length.txt', [str(l) for l in lengths])