def crawl_bpc(self, nump=20, update_meta=False, override=False, repeat=1):
    # update the metadata table
    if update_meta:
        self.update_meta_data()

    # read the metadata table
    self.df_biblecom = pd.read_table('../meta/biblecom.tsv', sep='\t')
    urliso = self.df_biblecom[['url', 'language_iso']].values.tolist()

    # unless override is set, keep only translations whose output file is missing
    if not override:
        new_list = []
        for url, iso in urliso:
            num = url.split('/')[0:-1][-1]
            if not FileUtility.exists(self.output_path + '/' + iso + '_' + num + '.biblecom.txt'):
                new_list.append([url, iso])
        urliso = new_list

    res = BibleComAPl.make_parallel(min(nump, len(urliso)), self.crawl_a_lang, urliso)

    # iterating for max coverage: re-crawl the still-missing translations
    continue_iter = True
    count = 0
    while continue_iter and count < repeat:
        # update the list of missing translations
        new_list = []
        for url, iso in urliso:
            num = url.split('/')[0:-1][-1]
            if not FileUtility.exists(self.output_path + '/' + iso + '_' + num + '.biblecom.txt'):
                new_list.append([url, iso])
        # stop if the last pass made no progress
        if len(new_list) == len(urliso):
            continue_iter = False
        count += 1
        urliso = new_list
        print('Double check for the missing translations..')
        res = BibleComAPl.make_parallel(min(nump, len(urliso)), self.crawl_a_lang, urliso)

    self.create_report_biblecom()
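# Illustrative sketch (not part of the crawler module): the coverage loop used by
# crawl_bpc above, isolated into a standalone helper. The names retry_missing,
# crawl_fn, and expected_path are hypothetical; only the control flow mirrors the
# method above (re-run the crawl over items whose output file is still missing,
# and stop once a pass makes no progress or the repeat budget is exhausted).
import os


def retry_missing(items, crawl_fn, expected_path, repeat=1):
    # keep only the items whose expected output file does not exist yet
    missing = [it for it in items if not os.path.exists(expected_path(it))]
    crawl_fn(missing)
    count = 0
    while missing and count < repeat:
        still_missing = [it for it in missing if not os.path.exists(expected_path(it))]
        if len(still_missing) == len(missing):
            break  # no progress in the last pass
        missing = still_missing
        count += 1
        crawl_fn(missing)
    # whatever is still missing after all passes
    return [it for it in missing if not os.path.exists(expected_path(it))]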
def create_report_cloud(self):
    report = {'language_iso': [], 'trans_ID': [], 'language_name': [], 'Description': [], 'verses': []}
    for trID in self.df_cloud.trans_ID:
        iso = self.id2iso_dict[trID]
        # count the crawled verses for this translation (0 if the file is missing)
        if not FileUtility.exists(self.output_path + '/' + iso + '_' + trID + '.cloud.txt'):
            length = 0
        else:
            length = len(FileUtility.load_list(self.output_path + '/' + iso + '_' + trID + '.cloud.txt'))
        report['language_iso'].append(iso)
        report['trans_ID'].append(trID)
        report['language_name'].append(self.id2lang_dict[trID])
        report['Description'].append(self.id2version[trID])
        report['verses'].append(length)
    report = pd.DataFrame(report)
    report.set_index('trans_ID')
    report.to_csv(self.output_path + '/reports/crawl_report_cloud.tsv', sep='\t', index=False,
                  columns=['language_iso', 'trans_ID', 'language_name', 'Description', 'verses'])
    self.generate_final_rep()
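# Illustrative sketch (standalone): the report-building pattern used above, i.e.
# accumulate parallel lists in a dict, convert to a DataFrame, and dump a TSV.
# The values and file name here are made up for the example.
import pandas as pd

example_report = {'language_iso': ['eng', 'deu'], 'verses': [31102, 0]}
pd.DataFrame(example_report).to_csv('example_report.tsv', sep='\t', index=False,
                                    columns=['language_iso', 'verses'])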
def ret_bible_books(self, nump=10, trList=[], override=False):
    '''
    Retrieving all bibles
    :param nump: number of parallel processes
    :param trList: restrict retrieval to these translation IDs (all translations if empty)
    :param override: re-retrieve translations whose output files already exist
    :return: dictionary of retrieval results keyed by translation ID
    '''
    # parallel input creation
    tr_meta = []
    exists = dict()
    for x in (self.df['trans_ID'].unique().tolist() if len(trList) == 0 else trList):
        if not FileUtility.exists(self.output_path + '/' + self.id2iso_dict[x] + '_' + x + '.api.txt') or override:
            tr_meta.append((self.df[self.df['trans_ID'] == x]['language_iso'].tolist()[0], x,
                            self.df[self.df['trans_ID'] == x]['fcbh_id'].tolist()))
        else:
            exists[x] = 'existed'
    # call in parallel
    print('Retrieving the bible translations')
    res = BDPAPl.make_parallel(min(nump, len(tr_meta)), self.ret_a_book, tr_meta)
    res.update(exists)
    return res
def crawl_all_found_langs(self, nump=20, override=False, repeat=3):
    table = []
    inputs = []
    self.lang_dict = dict()
    for code, rec in tqdm.tqdm(self.url_dict.items()):
        inputs.append((rec[2], self.output_path + '/pngscripture_intermediate/' + rec[1] + '/', rec[0], rec[1], rec[3]))
        self.lang_dict[rec[0]] = rec[3]

    # unless override is set, keep only languages whose output file is missing
    if not override:
        new_list = []
        for url, outpath, iso, code, langname in inputs:
            if not FileUtility.exists(self.output_path + '/' + iso + '_' + code.replace('_', '-') + '.png.txt'):
                new_list.append((url, outpath, iso, code, langname))
        inputs = new_list

    res = PNGAPl.make_parallel(min(nump, len(inputs)), self.download_zipfile, inputs)

    # iterating for max coverage: re-crawl the still-missing languages
    continue_iter = True
    count = 0
    while continue_iter and count < repeat:
        # update the list of missing languages
        new_list = []
        for url, outpath, iso, code, langname in inputs:
            if not FileUtility.exists(self.output_path + '/' + iso + '_' + code.replace('_', '-') + '.png.txt'):
                new_list.append((url, outpath, iso, code, langname))
        # stop if the last pass made no progress (checked before updating inputs)
        if len(new_list) == len(inputs):
            continue_iter = False
        inputs = new_list
        count += 1
        print('Double check for the missing translations..')
        res = PNGAPl.make_parallel(min(nump, len(inputs)), self.download_zipfile, inputs)
def sequential_crawl(triples, override=False):
    if not override:
        new_list = []
        for x, y, z in triples:
            if not FileUtility.exists(y + z):
                new_list.append((x, y, z))
        triples = new_list
    print('Start crawling..')
    for x in tqdm.tqdm(triples):
        PNGScriptRetrieve(x)
    FileUtility.save_list(triples[0][1] + 'log.txt', PNGScriptRetrieve.log)
def crawl_bible_cloud(self, nump=20, override=False, repeat=3):
    urls = ('https://bible.cloud/inscript/content/texts/' + self.df_cloud['trans_ID'] + '/MT1.html').tolist()
    outputs = [self.output_path + '/biblecloud_intermediate/'] * len(self.df_cloud['trans_ID'].tolist())
    txt_files = ('../' + self.df_cloud['language_iso'] + '_' + self.df_cloud['trans_ID'] + '.cloud.txt').tolist()
    triples = [(url, outputs[idx], txt_files[idx]) for idx, url in enumerate(urls)]

    if not override:
        new_list = []
        for x, y, z in triples:
            if not FileUtility.exists(y + z):
                new_list.append((x, y, z))
        triples = new_list

    BibleCloud.parallel_crawl(triples, min(nump, len(triples)), True)

    # iterating for max coverage
    continue_iter = True
    count = 0
    while continue_iter and count < repeat:
        # update list
        new_list = []
        for x, y, z in triples:
            if not FileUtility.exists(y + z):
                new_list.append((x, y, z))
        if len(new_list) == len(triples):
            continue_iter = False
        triples = new_list
        count += 1
        print('Double check for the missing translations..')
        BibleCloud.parallel_crawl(triples, min(nump, len(triples)), True)

    self.create_report_cloud()
def parallel_crawl(triples, num_p, override=False):
    if not override:
        new_list = []
        for x, y, z in triples:
            if not FileUtility.exists(y + z):
                new_list.append((x, y, z))
        triples = new_list
    if len(triples) > 0:
        print('Start parallel crawling..')
        pool = Pool(processes=num_p)
        res = []
        for x in tqdm.tqdm(pool.imap_unordered(PNGScriptRetrieve, triples, chunksize=num_p), total=len(triples)):
            res.append(x)
        pool.close()
        FileUtility.save_list(triples[0][1] + 'log.txt', PNGScriptRetrieve.log)
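# Illustrative sketch (standalone): the multiprocessing pattern used by
# parallel_crawl above, with a toy worker in place of PNGScriptRetrieve.
# Pool.imap_unordered yields results as soon as workers finish, so wrapping it
# in tqdm gives a live progress bar over the whole job list.
from multiprocessing import Pool

import tqdm


def toy_worker(job):
    # stands in for PNGScriptRetrieve; any picklable top-level callable works
    return job * job


if __name__ == '__main__':
    jobs = list(range(100))
    results = []
    with Pool(processes=4) as pool:
        for res in tqdm.tqdm(pool.imap_unordered(toy_worker, jobs, chunksize=4), total=len(jobs)):
            results.append(res)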
def sequential_crawl(triples, override=False):
    '''
    Crawl the given triples one by one.
    :param triples: list of (url, output path, file name) tuples
    :param override: re-crawl triples whose output files already exist
    :return:
    '''
    if not override:
        new_list = []
        for x, y, z in triples:
            if not FileUtility.exists(y + z):
                new_list.append((x, y, z))
        triples = new_list
    print('Start crawling..')
    for x in tqdm.tqdm(triples):
        BibleCom(x)
    FileUtility.save_list(triples[0][1] + 'log.txt', BibleCom.log)
def predict_block(self, ultimate=False):
    '''
    Run the classification experiments defined in the <predict> blocks of the configuration.
    :return:
    '''
    import warnings
    from sklearn.exceptions import DataConversionWarning, FitFailedWarning, UndefinedMetricWarning, ConvergenceWarning
    warnings.filterwarnings(action='ignore', category=DataConversionWarning)
    warnings.filterwarnings(action='ignore', category=FitFailedWarning)
    warnings.filterwarnings(action='ignore', category=DeprecationWarning)
    warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
    warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

    predict_blocks = self.xmldoc.getElementsByTagName('predict')
    predict_path = self.output + '/classifications/'

    # iterate over predict blocks
    for predict in predict_blocks:
        # sub prediction
        FileUtility.ensure_dir(predict_path)
        setting_name = predict.attributes['name'].value
        subdir = predict_path + setting_name + '/'
        FileUtility.ensure_dir(subdir)

        ## label mapping
        labels = predict.getElementsByTagName('labels')[0].getElementsByTagName('label')
        mapping = dict()
        for label in labels:
            val = label.attributes['value'].value
            phenotype = label.firstChild.nodeValue.strip()
            mapping[phenotype] = int(val)

        ## optimization score
        optimization = predict.getElementsByTagName('optimize')[0].firstChild.nodeValue.strip()

        ## cross-validation setting: basis, number of folds, and test ratio
        self.cvbasis = predict.getElementsByTagName('eval')[0].firstChild.nodeValue.strip()
        folds = int(predict.getElementsByTagName('eval')[0].attributes['folds'].value)
        test_ratio = float(predict.getElementsByTagName('eval')[0].attributes['test'].value)

        if optimization not in ['accuracy', 'scores_r_1', 'scores_f1_1', 'scores_f1_0', 'f1_macro', 'f1_micro']:
            print('Error in choosing optimization score')

        ## genotype tables
        GPA = GenotypePhenotypeAccess(self.output)

        ## iterate over phenotypes if there exist more than one
        for phenotype in GPA.phenotypes:
            print('working on phenotype ', phenotype)
            FileUtility.ensure_dir(subdir + phenotype + '/')

            ## create cross-validation folds
            FileUtility.ensure_dir(subdir + phenotype + '/cv/')
            cv_file = ''
            cv_test_file = ''
            if not ultimate:
                if self.cvbasis == 'tree':
                    FileUtility.ensure_dir(subdir + phenotype + '/cv/tree/')
                    if self.override or not FileUtility.exists(subdir + phenotype + '/cv/tree/' + ''.join([phenotype, '_', setting_name, '_folds.txt'])):
                        GPA.create_treefold(subdir + phenotype + '/cv/tree/' + ''.join([phenotype, '_', setting_name, '_folds.txt']),
                                            self.metadata_path + 'phylogentictree.txt', folds, test_ratio, phenotype, mapping)
                    cv_file = subdir + phenotype + '/cv/tree/' + ''.join([phenotype, '_', setting_name, '_folds.txt'])
                    cv_test_file = subdir + phenotype + '/cv/tree/' + ''.join([phenotype, '_', setting_name, '_test.txt'])
                else:
                    FileUtility.ensure_dir(subdir + phenotype + '/cv/rand/')
                    if self.override or not FileUtility.exists(subdir + phenotype + '/cv/rand/' + ''.join([phenotype, '_', setting_name, '_folds.txt'])):
                        GPA.create_randfold(subdir + phenotype + '/cv/rand/' + ''.join([phenotype, '_', setting_name, '_folds.txt']),
                                            folds, test_ratio, phenotype, mapping)
                    cv_file = subdir + phenotype + '/cv/rand/' + ''.join([phenotype, '_', setting_name, '_folds.txt'])
                    cv_test_file = subdir + phenotype + '/cv/rand/' + ''.join([phenotype, '_', setting_name, '_test.txt'])

            ## enumerate feature combinations
            features = [x.split('/')[-1].replace('_feature_vect.npz', '') for x in FileUtility.recursive_glob(self.representation_path, '*.npz')]
            feature_combinations = []
            ## TODO: ask as an input
            max_length_feature_comb = 3  # len(features)
            for x in [[list(x) for x in list(itertools.combinations(features, r))] for r in range(3, max_length_feature_comb + 1)]:
                feature_combinations += x

            ## iterate over feature sets
            for feature_setting in feature_combinations:
                classifiers = []
                for model in predict.getElementsByTagName('model'):
                    for x in model.childNodes:
                        if not x.nodeName == "#text":
                            classifiers.append(x.nodeName)

                if not ultimate:
                    X, Y, feature_names, final_strains = GPA.get_xy_prediction_mats(feature_setting, phenotype, mapping)
                feature_setting = [''.join(feature.split('.')[0:-1]) if len(feature.split('.')) > 1 else feature for feature in feature_setting]
                feature_text = '##'.join(feature_setting)

                ## iterate over classifiers
                for classifier in tqdm.tqdm(classifiers):
                    basepath_cls = subdir + phenotype + '/' + feature_text + '_CV_' + self.cvbasis
                    if classifier.lower() == 'svm' and (not FileUtility.exists(basepath_cls + '_SVM.pickle') or self.override):
                        Model = SVM(X, Y)
                        Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,
                                                       njobs=self.cores, feature_names=feature_names,
                                                       params=[{'C': [1000, 500, 200, 100, 50, 20, 10, 5, 2, 1, 0.2, 0.5, 0.01, 0.02, 0.05, 0.001]}])
                    if classifier.lower() == 'rf' and (not FileUtility.exists(basepath_cls + '_RF.pickle') or self.override):
                        Model = RFClassifier(X, Y)
                        Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,
                                                       njobs=self.cores, feature_names=feature_names)
                    if classifier.lower() == 'lr' and (not FileUtility.exists(basepath_cls + '_LR.pickle') or self.override):
                        Model = LogRegression(X, Y)
                        Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,
                                                       njobs=self.cores, feature_names=feature_names)
                    # if classifier.lower() == 'dnn':
                    #     Model = DNN(X, Y)
                    #     Model.tune_and_eval(subdir + phenotype + '/' + '_'.join([feature]), njobs=self.cores, kfold=10)

        # generate selected features
        FileUtility.ensure_dir(self.output + '/' + 'ultimate_outputs/')
        print('Select the top markers..')
        generate_top_features(self.output, [x.upper() for x in classifiers], topk=200)
        FileUtility.ensure_dir(subdir + phenotype + '/' + 'final_results/')
        # create_excel_file(subdir + phenotype + '/', subdir + phenotype + '/final_results/classification_res.xlsx')
        FileUtility.ensure_dir(self.output + '/' + 'ultimate_outputs/')
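# Illustrative sketch (standalone): how the feature-set combinations above are
# enumerated with itertools.combinations. The feature names below are made up;
# with max_length_feature_comb = 3 and range(3, 4), only triples of feature
# representations are produced, while lowering the start of the range would
# also include pairs and single features.
import itertools

features = ['snps', 'gene_exp', 'kmers_6', 'gpa']
max_length_feature_comb = 3
feature_combinations = []
for r in range(3, max_length_feature_comb + 1):
    feature_combinations += [list(c) for c in itertools.combinations(features, r)]
# e.g. [['snps', 'gene_exp', 'kmers_6'], ['snps', 'gene_exp', 'gpa'], ...]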
def create_BPC(self, nump=20, update_meta_data=False, override=False, repeat=4):
    '''
    Creating the PBC (parallel Bible corpus)
    '''
    # update the metadata file through an API call
    if update_meta_data:
        self.update_meta_data()

    # read the metadata file and create the dataframe
    for line in codecs.open('../meta/api_volumes.txt', 'r', 'utf-8'):
        books = json.loads(line)
        books_filtered = [x for x in books if x['media'] == 'text']
        df = pd.DataFrame(books_filtered)
        df['version'] = df[['version_code', 'volume_name']].apply(lambda x: ' # '.join(x), axis=1)
        df['trans_ID'] = df['fcbh_id'].str[0:6]
        self.df = df[['language_iso', 'trans_ID', 'fcbh_id', 'language_english', 'language_name', 'version']]

    # bible retrieval: lookup tables keyed by translation ID
    self.id2iso_dict = Series(self.df['language_iso'].values, index=self.df['trans_ID']).to_dict()
    self.id2langeng_dict = Series(self.df['language_english'].values, index=self.df['trans_ID']).to_dict()
    self.id2lang_dict = Series(self.df['language_name'].values, index=self.df['trans_ID']).to_dict()
    self.id2version = Series(self.df['version'].values, index=self.df['trans_ID']).to_dict()

    # report creation
    report = {'language_iso': [], 'trans_ID': [], 'language_english': [], 'language_name': [], 'version': [], 'verses': []}

    # retrieve all bibles
    bible_ids = self.ret_bible_books(nump=nump, override=override)
    bible_ids = list(bible_ids.keys())
    bible_ids.sort()

    # iterating for max coverage
    continue_iter = True
    prev_missings = []
    missing_tr_list = []
    count = 0
    while continue_iter and count < repeat:
        prev_missings = missing_tr_list
        missing_tr_list = []
        for trID in bible_ids:
            iso = self.id2iso_dict[trID]
            if not FileUtility.exists(self.output_path + '/' + iso + '_' + trID + '.api.txt'):
                length = 0
                missing_tr_list.append(trID)
            else:
                length = len(FileUtility.load_list(self.output_path + '/' + iso + '_' + trID + '.api.txt'))
            report['language_iso'].append(iso)
            report['trans_ID'].append(trID)
            report['language_english'].append(self.id2langeng_dict[trID])
            report['language_name'].append(self.id2lang_dict[trID])
            report['version'].append(self.id2version[trID])
            report['verses'].append(length)
        print('Double check for the missing translations..')
        bible_ids_new = self.ret_bible_books(nump=nump, trList=missing_tr_list)
        bible_ids_new = list(bible_ids_new.keys())
        bible_ids_new.sort()
        count += 1
        # stop if no additional translations could be retrieved in the last pass
        if missing_tr_list == prev_missings:
            continue_iter = False

    report = pd.DataFrame(report)
    report.set_index('trans_ID')
    report.to_csv(self.output_path + '/reports/crawl_report_API.tsv', sep='\t', index=False,
                  columns=['language_iso', 'trans_ID', 'language_english', 'language_name', 'version', 'verses'])
    self.generate_final_rep()
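# Illustrative sketch (standalone): the lookup-table idiom used in create_BPC,
# i.e. build a dict from one DataFrame column to another via a pandas Series.
# The data below is made up for the example.
import pandas as pd
from pandas import Series

df = pd.DataFrame({'trans_ID': ['ENGESV', 'DEULUT'], 'language_iso': ['eng', 'deu']})
id2iso = Series(df['language_iso'].values, index=df['trans_ID']).to_dict()
# id2iso == {'ENGESV': 'eng', 'DEULUT': 'deu'}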