Code example #1
    def crawl_bpc(self, nump=20, update_meta=False, override=False, repeat=1):
        # update the metadata table
        if update_meta:
            self.update_meta_data()
        # read the metadata table
        self.df_biblecom = pd.read_table('../meta/biblecom.tsv', sep='\t')
        urliso = self.df_biblecom[['url', 'language_iso']].values.tolist()

        # skip translations whose output file already exists
        if not override:
            new_list = []
            for url, iso in urliso:
                num = url.split('/')[-2]
                if not FileUtility.exists(self.output_path + '/' + iso + '_' + num + '.biblecom.txt'):
                    new_list.append([url, iso])
            urliso = new_list
        res = BibleComAPl.make_parallel(min(nump, len(urliso)), self.crawl_a_lang, urliso)

        # iterating for max coverage
        continue_iter = True
        count = 0
        while continue_iter and count < repeat:
            # update the list of still-missing translations
            new_list = []
            for url, iso in urliso:
                num = url.split('/')[-2]
                if not FileUtility.exists(self.output_path + '/' + iso + '_' + num + '.biblecom.txt'):
                    new_list.append([url, iso])
            if len(new_list) == len(urliso):
                # no progress on this pass; stop after one last attempt
                continue_iter = False
            count += 1
            urliso = new_list
            print('Double check for the missing translations..')
            res = BibleComAPl.make_parallel(min(nump, len(urliso)), self.crawl_a_lang, urliso)

        self.create_report_biblecom()
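
Note: the retry pattern above (crawl, then re-scan for missing output files up to `repeat` times) recurs in examples #4, #6, and #10 below. A minimal, self-contained sketch of that loop, with hypothetical crawl and is_done callables standing in for the project's make_parallel call and FileUtility.exists check:

def crawl_until_stable(items, crawl, is_done, repeat=3):
    """Re-run crawl() on the items whose output is still missing,
    stopping after `repeat` passes or when a pass makes no progress."""
    missing = [it for it in items if not is_done(it)]
    crawl(missing)
    for _ in range(repeat):
        still_missing = [it for it in missing if not is_done(it)]
        if len(still_missing) == len(missing):  # no progress on this pass
            break
        missing = still_missing
        crawl(missing)
    return [it for it in items if not is_done(it)]

# toy usage: "crawling" only ever succeeds for even numbers
done = set()
leftover = crawl_until_stable(
    items=list(range(6)),
    crawl=lambda xs: done.update(x for x in xs if x % 2 == 0),
    is_done=lambda x: x in done,
)
print(leftover)  # [1, 3, 5]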
Code example #2
File: biblecloudAPI.py Project: jjdelvalle/1000Langs
    def create_report_cloud(self):
        report = {
            'language_iso': [],
            'trans_ID': [],
            'language_name': [],
            'Description': [],
            'verses': []
        }
        for trID in self.df_cloud.trans_ID:
            iso = self.id2iso_dict[trID]
            if not FileUtility.exists(self.output_path + '/' + iso + '_' +
                                      trID + '.cloud.txt'):
                # crawled file is missing; this translation is left out of the report
                length = 0
            else:
                length = len(
                    FileUtility.load_list(self.output_path + '/' + iso + '_' +
                                          trID + '.cloud.txt'))
                report['language_iso'].append(iso)
                report['trans_ID'].append(trID)
                report['language_name'].append(self.id2lang_dict[trID])
                report['Description'].append(self.id2version[trID])
                report['verses'].append(length)
        report = pd.DataFrame(report)
        report.to_csv(self.output_path + '/reports/crawl_report_cloud.tsv',
                      sep='\t',
                      index=False,
                      columns=[
                          'language_iso', 'trans_ID', 'language_name',
                          'Description', 'verses'
                      ])
        self.generate_final_rep()
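
Note: the report above follows the usual dict-of-lists to DataFrame to TSV pattern. A standalone sketch of that pattern with made-up column values; keep in mind that DataFrame.set_index returns a new frame, so it only has an effect if the result is assigned (or inplace=True is passed):

import pandas as pd

report = {'language_iso': [], 'trans_ID': [], 'verses': []}
for iso, tr_id, verses in [('eng', 'ENGESV', 31102), ('deu', 'DEUL17', 30999)]:
    report['language_iso'].append(iso)
    report['trans_ID'].append(tr_id)
    report['verses'].append(verses)

df = pd.DataFrame(report)
df.to_csv('crawl_report.tsv', sep='\t', index=False,
          columns=['language_iso', 'trans_ID', 'verses'])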
Code example #3
    def ret_bible_books(self, nump=10, trList=[], override=False):
        '''
        Retrieve all bible translations
        :param nump: number of parallel processes
        :param trList: translation IDs to fetch; all known IDs when empty
        :param override: re-download even when the output file already exists
        :return: dictionary of crawl results keyed by translation ID
                 (already-downloaded translations are marked 'existed')
        '''

        # parallel input creation
        tr_meta = []
        exists = dict()
        for x in (self.df['trans_ID'].unique().tolist()
                  if len(trList) == 0 else trList):

            if not FileUtility.exists(self.output_path + '/' +
                                      self.id2iso_dict[x] + '_' + x +
                                      '.api.txt') or override:
                tr_meta.append(
                    (self.df[self.df['trans_ID'] ==
                             x]['language_iso'].tolist()[0], x,
                     self.df[self.df['trans_ID'] == x]['fcbh_id'].tolist()))
            else:
                exists[x] = 'existed'

        # call in parallel
        print('Retrieving the bible translations')
        res = BDPAPl.make_parallel(min(nump, len(tr_meta)), self.ret_a_book,
                                   tr_meta)
        res.update(exists)

        return res
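
Note: the trList=[] default above is only read, never mutated, so it is harmless here; still, the common defensive idiom for list defaults is a None sentinel. A tiny illustration with made-up names:

def ret_items(tr_list=None):
    # a fresh list per call; a shared [] default object would persist across calls
    tr_list = [] if tr_list is None else tr_list
    return list(tr_list)

print(ret_items())            # []
print(ret_items(['ENGESV']))  # ['ENGESV']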
Code example #4
File: pngAPI.py Project: JHurricane96/1000Langs
    def crawl_all_found_langs(self, nump=20, override=False, repeat=3):
        table = []
        inputs = []
        self.lang_dict = dict()
        for code, rec in tqdm.tqdm(self.url_dict.items()):
            inputs.append(
                (rec[2], self.output_path + '/pngscripture_intermediate/' +
                 rec[1] + '/', rec[0], rec[1], rec[3]))
            self.lang_dict[rec[0]] = rec[3]

        if not override:
            new_list = []
            for url, outpath, iso, code, langname in inputs:
                if not FileUtility.exists(self.output_path + '/' + iso + '_' +
                                          code.replace('_', '-') + '.png.txt'):
                    new_list.append((url, outpath, iso, code, langname))
            inputs = new_list

        res = PNGAPl.make_parallel(min(nump, len(inputs)),
                                   self.download_zipfile, inputs)

        # iterating for max coverage
        continue_iter = True
        count = 0
        while continue_iter and count < repeat:
            # update list
            new_list = []
            for url, outpath, iso, code, langname in inputs:
                if not FileUtility.exists(self.output_path + '/' + iso + '_' +
                                          code.replace('_', '-') + '.png.txt'):
                    new_list.append((url, outpath, iso, code, langname))
            if len(new_list) == len(inputs):
                # no progress on this pass; stop after one last attempt
                continue_iter = False
            inputs = new_list
            count += 1
            print('Double check for the missing translations..')
            res = PNGAPl.make_parallel(min(nump, len(inputs)),
                                       self.download_zipfile, inputs)
Code example #5
    def sequential_crawl(triples, override=False):

        if not override:
            new_list = []
            for x, y, z in triples:
                if not FileUtility.exists(y + z):
                    new_list.append((x, y, z))
            triples = new_list

        print('Start crawling..')
        for x in tqdm.tqdm(triples):
            PNGScriptRetrieve(x)
        if triples:
            # the log path is taken from the first triple's output directory
            FileUtility.save_list(triples[0][1] + 'log.txt', PNGScriptRetrieve.log)
Code example #6
File: biblecloudAPI.py Project: jjdelvalle/1000Langs
    def crawl_bible_cloud(self, nump=20, override=False, repeat=3):
        urls = ('https://bible.cloud/inscript/content/texts/' +
                self.df_cloud['trans_ID'] + '/MT1.html').tolist()
        outputs = [self.output_path + '/biblecloud_intermediate/'] * len(
            self.df_cloud['trans_ID'].tolist())
        txt_files = ('../' + self.df_cloud['language_iso'] + '_' +
                     self.df_cloud['trans_ID'] + '.cloud.txt').tolist()
        triples = [(url, outputs[idx], txt_files[idx])
                   for idx, url in enumerate(urls)]

        if not override:
            new_list = []
            for x, y, z in triples:
                if not FileUtility.exists(y + z):
                    new_list.append((x, y, z))
            triples = new_list

        BibleCloud.parallel_crawl(triples, min(nump, len(triples)), True)

        # iterating for max coverage
        continue_iter = True
        count = 0
        while continue_iter and count < repeat:
            # update list
            new_list = []
            for x, y, z in triples:
                if not FileUtility.exists(y + z):
                    new_list.append((x, y, z))
            if len(new_list) == len(triples):
                continue_iter = False
            triples = new_list
            count += 1
            print('Double check for the missing translations..')
            BibleCloud.parallel_crawl(triples, min(nump, len(triples)), True)

        self.create_report_cloud()
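
Note: the urls, outputs and txt_files lists above are built by concatenating string literals with pandas Series, which broadcasts the literal parts across every row. A small standalone illustration with made-up values:

import pandas as pd

df = pd.DataFrame({'trans_ID': ['ENGESV', 'DEUL17'],
                   'language_iso': ['eng', 'deu']})

urls = ('https://example.org/texts/' + df['trans_ID'] + '/MT1.html').tolist()
txt_files = ('../' + df['language_iso'] + '_' + df['trans_ID'] + '.cloud.txt').tolist()

print(urls[0])       # https://example.org/texts/ENGESV/MT1.html
print(txt_files[1])  # ../deu_DEUL17.cloud.txt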
Code example #7
    def parallel_crawl(triples, num_p, override=False):
        if not override:
            new_list = []
            for x, y, z in triples:
                if not FileUtility.exists(y + z):
                    new_list.append((x, y, z))
            triples = new_list
        if len(triples) > 0:
            print('Start parallel crawling..')
            pool = Pool(processes=num_p)
            res = []
            for x in tqdm.tqdm(pool.imap_unordered(PNGScriptRetrieve, triples, chunksize=num_p),
                               total=len(triples)):
                res.append(x)
            pool.close()
            FileUtility.save_list(triples[0][1] + 'log.txt', PNGScriptRetrieve.log)
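
Note: wrapping Pool.imap_unordered in tqdm, as above, is a standard way to show progress for a process pool. A self-contained sketch with a toy worker in place of PNGScriptRetrieve (the worker must be a picklable top-level callable):

from multiprocessing import Pool

import tqdm


def square(x):
    # toy stand-in for the real crawler callable
    return x * x


if __name__ == '__main__':
    jobs = list(range(100))
    results = []
    with Pool(processes=4) as pool:
        for r in tqdm.tqdm(pool.imap_unordered(square, jobs, chunksize=4),
                           total=len(jobs)):
            results.append(r)
    print(sorted(results)[:5])  # [0, 1, 4, 9, 16]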
Code example #8
File: bibleCOM.py Project: jjdelvalle/1000Langs
    def sequential_crawl(triples, override=False):
        '''
        :param triples: (url, output directory, output file name) tuples to crawl
        :param override: re-crawl even when the output file already exists
        :return:
        '''
        if not override:
            new_list = []
            for x, y, z in triples:
                if not FileUtility.exists(y + z):
                    new_list.append((x, y, z))
            triples = new_list

        print('Start crawling..')
        for x in tqdm.tqdm(triples):
            BibleCom(x)
        if triples:
            FileUtility.save_list(triples[0][1] + 'log.txt', BibleCom.log)
Code example #9
File: geno2pheno.py Project: ehsanasgari/Geno2Pheno
    def predict_block(self, ultimate=False):
        '''
        Run the <predict> blocks of the experiment XML: build cross-validation
        folds and tune/evaluate the requested classifiers for each phenotype
        :return:
        '''
        import warnings
        from sklearn.exceptions import DataConversionWarning, FitFailedWarning, UndefinedMetricWarning, ConvergenceWarning
        warnings.filterwarnings(action='ignore', category=DataConversionWarning)
        warnings.filterwarnings(action='ignore', category=FitFailedWarning)
        warnings.filterwarnings(action='ignore', category=DeprecationWarning)
        warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
        warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
        
        predict_blocks = self.xmldoc.getElementsByTagName('predict')
        predict_path=self.output+'/classifications/'

        # iterate over predict block
        for predict in predict_blocks:
            # Sub prediction
            FileUtility.ensure_dir(predict_path)
            setting_name=predict.attributes['name'].value
            subdir=predict_path+setting_name+'/'

            FileUtility.ensure_dir(subdir)
            ## label mapping
            labels=predict.getElementsByTagName('labels')[0].getElementsByTagName('label')
            mapping=dict()
            for label in labels:
                val=label.attributes['value'].value
                phenotype=label.firstChild.nodeValue.strip()
                mapping[phenotype]=int(val)

            ## optimizing for ..
            optimization=predict.getElementsByTagName('optimize')[0].firstChild.nodeValue.strip()
            ## number of folds
            self.cvbasis=predict.getElementsByTagName('eval')[0].firstChild.nodeValue.strip()
            folds=int(predict.getElementsByTagName('eval')[0].attributes['folds'].value)
            test_ratio=float(predict.getElementsByTagName('eval')[0].attributes['test'].value)

            if optimization not in ['accuracy', 'scores_r_1', 'scores_f1_1', 'scores_f1_0', 'f1_macro', 'f1_micro']:
                print('Error in choosing optimization score')

            ## Genotype tables
            GPA=GenotypePhenotypeAccess(self.output)
            ## iterate over phenotypes if there exist more than one
            for phenotype in GPA.phenotypes:
                print('working on phenotype', phenotype)
                FileUtility.ensure_dir(subdir+phenotype+'/')
                ## create cross-validation
                FileUtility.ensure_dir(subdir+phenotype+'/cv/')
                cv_file=''
                cv_test_file=''
                if not ultimate:
                    if self.cvbasis=='tree':
                        FileUtility.ensure_dir(subdir+phenotype+'/cv/tree/')
                        if self.override or not FileUtility.exists(subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_folds.txt'])):
                            GPA.create_treefold(subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_folds.txt']), self.metadata_path + 'phylogentictree.txt', folds, test_ratio, phenotype, mapping)
                        cv_file=subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_folds.txt'])
                        cv_test_file=subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_test.txt'])
                    else:
                        FileUtility.ensure_dir(subdir+phenotype+'/cv/rand/')
                        if self.override or not FileUtility.exists(subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_folds.txt'])):
                            GPA.create_randfold(subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_folds.txt']), folds, test_ratio, phenotype, mapping)
                        cv_file=subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_folds.txt'])
                        cv_test_file=subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_test.txt'])

                features=[x.split('/')[-1].replace('_feature_vect.npz','') for x in FileUtility.recursive_glob(self.representation_path, '*.npz')]
                feature_combinations=[]
                ## TODO: ask as an input
                max_length_feature_comb = 3  # len(features)

                for combs in [[list(c) for c in itertools.combinations(features, r)]
                              for r in range(3, max_length_feature_comb + 1)]:
                    feature_combinations += combs


                ## iterate over feature sets
                for feature_setting in feature_combinations:
                    classifiers=[]
                    for model in predict.getElementsByTagName('model'):
                        for x in model.childNodes:
                            if not x.nodeName=="#text":
                                classifiers.append(x.nodeName)
                    if not ultimate:
                        X, Y, feature_names, final_strains = GPA.get_xy_prediction_mats(feature_setting, phenotype, mapping)

                        feature_setting = [''.join(feature.split('.')[0:-1]) if len(feature.split('.')) > 1 else feature for feature in feature_setting]
                        feature_text = '##'.join(feature_setting)

                        ## iterate over classifiers
                        for classifier in tqdm.tqdm(classifiers):
                            basepath_cls=subdir+phenotype+'/'+feature_text+'_CV_'+self.cvbasis
                            if classifier.lower()=='svm' and (not FileUtility.exists(basepath_cls+'_SVM.pickle') or self.override):
                                Model = SVM(X, Y)
                                Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,njobs=self.cores, feature_names=feature_names, params=[{'C': [1000, 500, 200, 100, 50, 20, 10, 5, 2, 1, 0.2, 0.5, 0.01, 0.02, 0.05, 0.001]}])
                            if classifier.lower()=='rf' and  (not FileUtility.exists(basepath_cls+'_RF.pickle') or self.override):
                                Model = RFClassifier(X, Y)
                                Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,njobs=self.cores, feature_names=feature_names)
                            if classifier.lower()=='lr' and (not FileUtility.exists(basepath_cls+'_LR.pickle') or self.override):
                                Model = LogRegression(X, Y)
                                Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,njobs=self.cores, feature_names=feature_names)
                            #if classifier.lower()=='dnn':
                            #    Model = DNN(X, Y)
                            #    Model.tune_and_eval(subdir+phenotype+'/'+'_'.join([feature]),njobs=self.cores, kfold=10)
                        # generate selected features
                        FileUtility.ensure_dir(self.output+'/'+'ultimate_outputs/')
                        print('Select the top markers..')
                        generate_top_features(self.output, [x.upper() for x in classifiers], topk=200)
                FileUtility.ensure_dir(subdir+phenotype+'/'+'final_results/')
                #create_excel_file(subdir+phenotype+'/', subdir+phenotype+'/final_results/classification_res.xlsx')


        FileUtility.ensure_dir(self.output+'/'+'ultimate_outputs/')
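
Note: the feature-set enumeration above uses itertools.combinations to build every combination of feature names within a size range. A standalone sketch with made-up feature names:

import itertools

features = ['snps', 'gene_presence', 'kmers']
min_r, max_r = 1, len(features)

feature_combinations = []
for r in range(min_r, max_r + 1):
    feature_combinations += [list(c) for c in itertools.combinations(features, r)]

print(feature_combinations[:4])
# [['snps'], ['gene_presence'], ['kmers'], ['snps', 'gene_presence']]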
Code example #10
    def create_BPC(self,
                   nump=20,
                   update_meta_data=False,
                   override=False,
                   repeat=4):
        '''
            Creating the BPC (parallel Bible corpus)
        '''
        # update metadata file through api call
        if update_meta_data:
            self.update_meta_data()

        # read the metadata file and create the dataframe
        for line in codecs.open('../meta/api_volumes.txt', 'r', 'utf-8'):
            books = json.loads(line)
        books_filtered = ([x for x in books if x['media'] == 'text'])
        df = pd.DataFrame(books_filtered)
        df['version'] = df[['version_code',
                            'volume_name']].apply(lambda x: ' # '.join(x),
                                                  axis=1)
        df['trans_ID'] = df['fcbh_id'].str[0:6]
        self.df = df[[
            'language_iso', 'trans_ID', 'fcbh_id', 'language_english',
            'language_name', 'version'
        ]]

        # bible retrieval
        self.id2iso_dict = Series(self.df['language_iso'].values,
                                  index=self.df['trans_ID']).to_dict()
        self.id2langeng_dict = Series(self.df['language_english'].values,
                                      index=self.df['trans_ID']).to_dict()
        self.id2lang_dict = Series(self.df['language_name'].values,
                                   index=self.df['trans_ID']).to_dict()
        self.id2version = Series(self.df['version'].values,
                                 index=self.df['trans_ID']).to_dict()

        # report creation
        report = {
            'language_iso': [],
            'trans_ID': [],
            'language_english': [],
            'language_name': [],
            'version': [],
            'verses': []
        }

        # retrieve all bibles
        bible_ids = self.ret_bible_books(nump=nump, override=override)
        bible_ids = list(bible_ids.keys())
        bible_ids.sort()

        # iterating for max coverage
        continue_iter = True
        prev_missings = []
        missing_tr_list = []
        count = 0
        while continue_iter and count < repeat:
            prev_missings = missing_tr_list
            missing_tr_list = []
            # rebuild the report on every pass so repeated passes do not add duplicate rows
            report = {key: [] for key in report}
            for trID in bible_ids:
                iso = self.id2iso_dict[trID]
                if not FileUtility.exists(self.output_path + '/' + iso + '_' +
                                          trID + '.api.txt'):
                    length = 0
                    missing_tr_list.append(trID)
                else:
                    length = len(
                        FileUtility.load_list(self.output_path + '/' + iso +
                                              '_' + trID + '.api.txt'))
                    report['language_iso'].append(iso)
                    report['trans_ID'].append(trID)
                    report['language_english'].append(
                        self.id2langeng_dict[trID])
                    report['language_name'].append(self.id2lang_dict[trID])
                    report['version'].append(self.id2version[trID])
                    report['verses'].append(length)
            print('Double check for the missing translations..')
            bible_ids_new = self.ret_bible_books(nump=nump,
                                                 trList=missing_tr_list)
            bible_ids_new = list(bible_ids_new.keys())
            bible_ids_new.sort()
            count += 1
            if missing_tr_list == prev_missings:
                continue_iter = False

        report = pd.DataFrame(report)
        report.to_csv(self.output_path + '/reports/crawl_report_API.tsv',
                      sep='\t',
                      index=False,
                      columns=[
                          'language_iso', 'trans_ID', 'language_english',
                          'language_name', 'version', 'verses'
                      ])
        self.generate_final_rep()