Example no. 1
    def tune_and_evaluate(self,
                          estimator,
                          parameters,
                          score='macro_f1',
                          n_jobs=-1,
                          file_name='results'):
        '''
        :param estimator: scikit-learn estimator to be tuned
        :param parameters: parameter grid for the grid search
        :param score: scoring key used to refit the best estimator
        :param n_jobs: number of parallel jobs
        :param file_name: directory/tuning/classifier/features/
        :return:
        '''
        # grid search
        self.greed_search = GridSearchCV(estimator=estimator,
                                         param_grid=parameters,
                                         cv=self.cv,
                                         scoring=self.scoring,
                                         refit=score,
                                         error_score=0,
                                         n_jobs=n_jobs)

        label_set = list(set(self.Y))
        # fitting
        self.greed_search.fit(X=self.X, y=self.Y)
        y_predicted = cross_val_predict(self.greed_search.best_estimator_,
                                        self.X, self.Y)
        conf = confusion_matrix(self.Y, y_predicted, labels=label_set)
        # save in file
        FileUtility.save_obj(file_name, [
            label_set, conf, self.greed_search.best_score_,
            self.greed_search.best_estimator_, self.greed_search.cv_results_,
            self.greed_search.best_params_, y_predicted
        ])
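A minimal standalone sketch of the same multi-metric GridSearchCV pattern used above (refit on macro F1, confusion matrix from cross-validated predictions); the toy data, the SVC estimator and its parameter grid are assumptions for illustration only.

from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.svm import SVC

# toy multi-class problem standing in for self.X / self.Y
X, Y = make_classification(n_samples=200, n_classes=3, n_informative=5, random_state=0)
scoring = {'accuracy': 'accuracy', 'macro_f1': 'f1_macro'}

grid = GridSearchCV(estimator=SVC(),
                    param_grid={'C': [0.1, 1, 10]},
                    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
                    scoring=scoring,
                    refit='macro_f1',
                    error_score=0,
                    n_jobs=-1)
grid.fit(X, Y)

# confusion matrix of the refitted best estimator, as in tune_and_evaluate
y_predicted = cross_val_predict(grid.best_estimator_, X, Y)
print(grid.best_params_, grid.best_score_)
print(confusion_matrix(Y, y_predicted))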
Example no. 2
 def __init__(self, triple, crawl=True, parse=True, remove_after_parse=False, printing=False):
     '''
     :param triple: (url, destination_directory, output_file)
     :param crawl: whether to crawl the pages
     :param parse: whether to parse the crawled pages into the output file
     :param remove_after_parse: remove the crawled directory after parsing
     :param printing: verbose printing
     '''
     # get parameters
     self.url, self.destination_directory, output_file = triple
     response = requests.get(self.url)
     if response.status_code == 200:
         try:
             self.output_file = self.destination_directory + output_file
             self.print = printing
             # crawl the pages
             if crawl:
                 BibleCrawler.run_crawler(self, '//a[@class = "next"]/@href', self.url, self.destination_directory)
             if parse:
                 # find the lang ID in the website
                 self.lang_directory = '/'.join(self.url.split('/')[3:7]) + '/'
                 self.url = self.url[self.url.find('.com') + 5::]
                 if '.' in self.url.split('/')[-1]:
                     self.lang_directory = '/'.join(self.url.split('/')[3:-1]) + '/'
                 books = self.destination_directory + self.lang_directory
                 self.run_parser(books, self.output_file)
                 if remove_after_parse:
                     # parse the output file
                     # remove the directory
                     FileUtility.remove_dir(self.destination_directory + self.lang_directory)
         except:
             return None
     else:
         return None
     return None
Example no. 3
 def create_report_cloud(self):
     report = {
         'language_iso': [],
         'trans_ID': [],
         'language_name': [],
         'Description': [],
         'verses': []
     }
     for trID in self.df_cloud.trans_ID:
         iso = self.id2iso_dict[trID]
         if not FileUtility.exists(self.output_path + '/' + iso + '_' +
                                   trID + '.cloud.txt'):
             length = 0
         else:
             length = len(
                 FileUtility.load_list(self.output_path + '/' + iso + '_' +
                                       trID + '.cloud.txt'))
         # record the row even when the cloud file is missing (verses = 0)
         report['language_iso'].append(iso)
         report['trans_ID'].append(trID)
         report['language_name'].append(self.id2lang_dict[trID])
         report['Description'].append(self.id2version[trID])
         report['verses'].append(length)
     report = pd.DataFrame(report)
     report.set_index('trans_ID')
     report.to_csv(self.output_path + '/reports/crawl_report_cloud.tsv',
                   sep='\t',
                   index=False,
                   columns=[
                       'language_iso', 'trans_ID', 'language_name',
                       'Description', 'verses'
                   ])
     self.generate_final_rep()
Example no. 4
    def tune_and_evaluate(self,
                          estimator,
                          parameters,
                          score='f1_macro',
                          file_name='results'):
        '''
        :param estimator: scikit-learn estimator to be tuned
        :param parameters: parameter grid for the inner grid search
        :param score: scoring key used to refit the best estimator
        :param file_name: directory/tuning/classifier/features/
        :return:
        '''
        # inner cross_validation
        self.greed_search = GridSearchCV(estimator=estimator,
                                         param_grid=parameters,
                                         cv=self.inner_cv,
                                         scoring=self.scoring,
                                         refit=score,
                                         error_score=0)
        # Nested CV with parameter optimization
        self.nested_score = cross_val_score(self.greed_search,
                                            X=self.X,
                                            y=self.Y,
                                            cv=self.outer_cv)

        # saving
        FileUtility.save_obj(file_name, [self.greed_search, self.nested_score])
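A standalone sketch of the nested cross-validation pattern above with concrete inner/outer splitters; the toy data and the logistic-regression grid are assumptions for illustration only.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

X, Y = make_classification(n_samples=150, random_state=0)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

grid = GridSearchCV(estimator=LogisticRegression(max_iter=1000),
                    param_grid={'C': [0.01, 0.1, 1, 10]},
                    cv=inner_cv,
                    scoring='f1_macro',
                    refit=True)
# every outer fold reruns the inner grid search, so the reported scores
# are not biased by the hyper-parameter selection
nested_score = cross_val_score(grid, X=X, y=Y, cv=outer_cv)
print(nested_score.mean(), nested_score.std())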
Example no. 5
def RA_healthy():
    Pipeline = DiTaxaWorkflow(
        '/mounts/data/proj/asgari/dissertation/datasets/deepbio/microbiome/RA/',
        'fastq',
        '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/RAoutput/',
        'RA',
        50000,
        5000,
        -1,
        num_p=20)
    #Pipeline.train_npe()
    #Pipeline.representation_npe()
    labels = FileUtility.load_list(
        '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/ra/rep/labels.txt'
    )
    labels = {
        x.split('/')[-1]: labels[idx]
        for idx, x in enumerate(
            FileUtility.load_list(
                '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/ra/rep/ra_selfposnpe_10000_npe_5000_meta'
            ))
    }
    Pipeline.biomarker_extraction(labels, {
        'untreated_RA': 1,
        'treated_RA': 0
    }, 'untreated_vs_treated')
Example no. 6
 def crawl_bpc(self,nump=20,update_meta=False, override=False, repeat=1):
     # update the metadata table
     if update_meta:
         self.update_meta_data()
     # read the metadata table
     self.df_biblecom=pd.read_table('../meta/biblecom.tsv', sep='\t')
     urliso=self.df_biblecom[['url','language_iso']].values.tolist()
     
     if not override:
         new_list=[]
         for url, iso in urliso:
             num=url.split('/')[0:-1][-1]
             if not FileUtility.exists(self.output_path+'/'+iso+'_'+num+'.biblecom.txt'):
                 new_list.append([url,iso])
         urliso=new_list
     res=BibleComAPl.make_parallel(min(nump,len(urliso)),self.crawl_a_lang,urliso)
     
     # iterating for max coverage
     continue_iter = True
     count = 0
     while continue_iter and count < repeat:
         # update list
         new_list=[]
         for url, iso in urliso:
             num=url.split('/')[0:-1][-1]
             if not FileUtility.exists(self.output_path+'/'+iso+'_'+num+'.biblecom.txt'):
                 new_list.append([url,iso])
         if len(new_list)==len(urliso):
             continue_iter=False
         count += 1
         urliso=new_list
         print ('Double check for the missing translations..')
         res=BibleComAPl.make_parallel(min(nump,len(urliso)),self.crawl_a_lang,urliso)
     
     self.create_report_biblecom()
Example no. 7
def org_classification():
    '''
    Organism-level classification from 6-mer representations with a multilayer perceptron (10-fold CV).
    '''
    X=FileUtility.load_sparse_csr('../../datasets/processed_data/org/K/6-mer_org_restrictedkmer.npz').toarray()
    Y=FileUtility.load_list('../../datasets/processed_data/org/K/org_label_restrictedkmer.txt')
    DNN=DNNMutliclass16S(X,Y,model_arch=[1024,0.2,256,0.1,256,0.1,128,0.1,64])
    DNN.cross_validation('../../datasets/results/org/classifier/nn', gpu_dev='2', n_fold=10, epochs=30, batch_size=100, model_strct='mlp')
Example no. 8
    def __init__(self):
        # initialize the static_parser and dynamic_parser
        dynamic_path = "%s/input/qe_config_dynamic.JSON" % os.path.abspath(
            os.pardir)
        static_path = "%s/input/qe_config_static.JSON" % os.path.abspath(
            os.pardir)

        self.dynamic_parser = json_parser.ReadJson(dynamic_path)
        self.static_parser = json_parser.ReadJson(static_path)

        # initialize the start and end times of the aging (burn-in) run
        self.start_time = self.dynamic_parser.get_first_layer("START_TIME")
        self.end_time = self.dynamic_parser.get_first_layer("END_TIME")

        # initialize the time interval (in seconds) used to aggregate the data
        # each CAP_TIME is divided by time_interval to get a time_index, discretizing the continuous timestamps
        self.time_interval = self.static_parser.get_first_layer(
            "AGGREGATION_INTERVAL")

        # sometimes we do not care whether values beyond the upper range are accurate, e.g. PM25 > 500 or PM10 > 600
        self.max_exp_val = self.static_parser.get_first_layer("MAX_EXP_VAL")

        # the file_utility object creates the output directories and resolves where data, figures and output files are stored
        self.file_utility = FileUtility()

        # each device under quality evaluation is processed separately; these flags record which variable is currently
        # being processed and whether the current device is the first one for that variable, mainly for writing the output
        self.first_qe_device = True
        self.current_var = ''
Example no. 9
 def representation_npe(self):
     '''
     :return:
     '''
     print('npe generation started..')
     start = time.time()
     G16s = NPESegmentApplyMetagenomics(
         self.file_directory,
         self.file_extenstion,
         self.output_directory + 'npe_segmentatation/' + self.dbname + '_' +
         '_'.join([
             'unique',
             str(self.vocab_size), 'v',
             str(self.seg_train_depth), 's.model'
         ]),
         sampling_number=self.rep_sampling_depth,
         num_p=self.num_p)
     DiTaxaWorkflow.ensure_dir(self.output_directory +
                               'npe_representation/')
     G16s.generate_npes_all(save=self.output_directory +
                            'npe_representation/' + self.dbname +
                            '_uniquepiece_' + str(self.rep_sampling_depth))
     end = time.time()
     spent = end - start
     self.log_file.append(
         'generating the representations npe_representation/' +
         self.dbname + '_uniquepiece_' + str(self.rep_sampling_depth) +
          '  ' + str(spent) + ' seconds, using ' + str(self.num_p) +
          ' cores')
     FileUtility.save_list(self.output_directory + 'logfile.txt',
                           self.log_file)
Example no. 10
    def generate_LR_important_features(self,
                                       clf_LR,
                                       feature_names,
                                       results_file,
                                       N=1000):
        '''
        :param clf_LR: trained logistic regression classifier
        :param feature_names: list of feature names aligned with the coefficient vector
        :param results_file: path prefix for the output file
        :param N: number of top features to report
        :return:
        '''

        results_file = results_file.replace(
            '/classifications/', '/feature_selection/classifications/')
        FileUtility.ensure_dir(results_file)
        file_name = results_file + '_LR'

        idxs = argsort(np.abs(clf_LR.coef_.tolist()[0]).tolist(),
                       rev=True)[0:N]

        f = codecs.open(file_name, 'w')
        f.write('\t'.join(['feature', 'score']) + '\n')
        for idx in idxs:
            f.write('\t'.join(
                [feature_names[idx],
                 str(clf_LR.coef_.tolist()[0][idx])]) + '\n')
        f.close()
Example no. 11
 def create_report_png(self):
     report = {
         'language_iso': [],
         'trans_ID': [],
         'language_name': [],
         'verses': []
     }
     self.df_png = pd.DataFrame(report)
     png_files = FileUtility.recursive_glob(self.output_path + '/',
                                            '*.png.txt')
     for png_file in png_files:
         iso, code = png_file.split('/')[-1].split(
             '.')[0:-1][0:-1][-1].split('_')
         length = len(FileUtility.load_list(png_file))
         lang_name = self.lang_dict[
             iso] if iso in self.lang_dict else 'ISO: ' + iso
         self.df_png = self.df_png.append(
             {
                 'language_iso': iso,
                 'trans_ID': code,
                 'language_name': lang_name,
                 'verses': length
             },
             ignore_index=True)
     self.df_png.set_index('trans_ID')
     self.df_png.to_csv(
         self.output_path + '/reports/crawl_report_png.tsv',
         sep='\t',
         index=False,
         columns=['language_iso', 'trans_ID', 'language_name', 'verses'])
     self.generate_final_rep()
Example no. 12
 def write_in_file(filename, pos, neg):
     lines = [['direction', 'marker', 'p-value']]
     for marker, pval in pos:
         lines.append(['+', marker, str(pval)])
     for marker, pval in neg:
         lines.append(['-', marker, str(pval)])
     FileUtility.save_list(filename, ['\t'.join(line) for line in lines])
Example no. 13
 def generate(self,
              vocab_size,
              sample_size,
              output_dir,
              num_p=4,
              backend='Sentencepiece'):
     '''
     :param vocab_size: the size of final vocabulary
     :param sample_size: how many reads from each file
     :param output_dir: where to write the results
     :param num_p: number of cores
     :param backend: segmentation backend, 'Sentencepiece' or 'normalbpe'
     :return:
     '''
     start = timeit.default_timer()
     fasta_files = [(x, sample_size) for x in self.fasta_files]
     corpus = []
     pool = Pool(processes=num_p)
     for ky, v in tqdm.tqdm(pool.imap_unordered(self._get_corpus,
                                                fasta_files,
                                                chunksize=num_p),
                            total=len(fasta_files)):
         corpus = corpus + v
     pool.close()
     print('Corpus size for training NPE is ', len(corpus))
     if backend == 'Sentencepiece':
         FileUtility.save_list('../tmp/tmp_txt', corpus)
         spm.SentencePieceTrainer.Train(
             '--input=../tmp/tmp_txt --model_prefix=' + output_dir +
             ' --add_dummy_prefix false --max_sentencepiece_length=512 --model_type=bpe --mining_sentence_size=5000000 --input_sentence_size=10000000 --vocab_size=' + str(vocab_size)
         )
         FileUtility.save_list('../tmp/tmp_txt', corpus[0:10])
     elif backend == 'normalbpe':
         train_npe(corpus, output_dir, vocab_size, output_dir + '_freq')
     print(' The segmentation training took ',
           timeit.default_timer() - start, ' seconds.')
Example no. 14
 def __init__(self,
              fasta_file,
              matrix_path,
              feature_file_path,
              phenotypes,
              phenotype_mapping,
              selected_samples,
              p_value_threshold=0.01,
              remove_redundants=False,
              num_p=4,
              blastn_path=''):
     if len(blastn_path) > 0:
         os.environ['PATH'] += ':' + blastn_path
     self.num_p = num_p
     self.seq_IDS = FileUtility.read_fasta_sequences_ids(fasta_file)
     self.remove_redundants = remove_redundants
     self.ez_taxa_dict = {
         x.split()[0]: x.split()[1].split(';')
         for x in FileUtility.load_list('db/ez_idx_taxonomy.txt')
     }
     self.mat = FileUtility.load_sparse_csr(matrix_path)
     self.mat = self.mat.toarray()
     self.mat = self.mat[selected_samples, :]
     self.mat = csr_matrix(self.mat)
     self.features = FileUtility.load_list(feature_file_path)
     self.align_markers_parallel(p_value_threshold)
     self.redundant_columns_indentification()
     self.phenotype_mapping = phenotype_mapping
     self.phenotypes = phenotypes
Example no. 15
 def train_npe(self):
     '''
     :return:
     '''
     print(
         'npe training started.. it might take more than 1 hour for more than 1000 samples'
     )
     DiTaxaWorkflow.blockPrint()
     start = time.time()
     G16s = NPESegmentTrainMetagenomics(self.file_directory,
                                        self.file_extenstion)
     DiTaxaWorkflow.ensure_dir(self.output_directory +
                               'npe_segmentatation/')
     G16s.generate(self.vocab_size,
                   self.seg_train_depth,
                   self.output_directory + 'npe_segmentatation/' +
                   self.dbname + '_' + '_'.join([
                       'unique',
                       str(self.vocab_size), 'v',
                       str(self.seg_train_depth), 's'
                   ]),
                   backend='Sentencepiece',
                   num_p=self.num_p)
     end = time.time()
     spent = (end - start)
     self.log_file.append('training segmentation ' + '_'.join([
         'unique',
         str(self.vocab_size), 'v',
         str(self.seg_train_depth), 's '
     ]) + str(spent) + ' seconds, using ' + str(self.num_p) + ' cores')
     DiTaxaWorkflow.enablePrint()
     FileUtility.save_list(self.output_directory + 'logfile.txt',
                           self.log_file)
Example no. 16
    def create_report_biblecom(self):
        self.df_biblecom['verses'] = 0

        biblecom_files = FileUtility.recursive_glob(self.output_path + '/',
                                                    '*.biblecom.txt')
        for bib_file in biblecom_files:
            file_parts = bib_file.split('/')[-1].split(
                '.')[0:-1][0:-1][-1].split('_')
            num_file_parts = len(file_parts)
            if num_file_parts == 2:
                iso, code = file_parts
            elif num_file_parts == 3:
                iso = "_".join(file_parts[:2])
                code = file_parts[2]
            else:
                continue
            length = len(FileUtility.load_list(bib_file))
            # use a single .loc assignment instead of chained indexing, which may silently fail
            self.df_biblecom.loc[
                (self.df_biblecom['language_iso'] == iso)
                & (self.df_biblecom['trans_ID'] == int(code)),
                'verses'] = length
        self.df_biblecom.set_index('trans_ID')
        self.df_biblecom.to_csv(
            self.output_path + '/reports/crawl_report_biblecom.tsv',
            sep='\t',
            index=False,
            columns=['language_iso', 'trans_ID', 'language_name', 'verses'])
        self.generate_final_rep()
Example no. 17
    def create_treefold(self, path, tree_addr, cv, test_ratio, phenotype, mapping=None):

        ## find a mapping from strains to the phenotypes
        if mapping:
            mapping_isolate_label = dict(self.get_new_labeling(mapping)[phenotype])
        else:
            mapping_isolate_label = self.phenotype2labeled_strains_mapping[phenotype]

        # get common strains
        list_of_list_of_strains = list(self.strains.values())
        list_of_list_of_strains.append(list(mapping_isolate_label.keys()))
        final_strains = GenotypePhenotypeAccess.get_common_strains(list_of_list_of_strains)
        final_strains.sort()

        # prepare test
        Y = [mapping_isolate_label[strain] for strain in final_strains]

        isolate_to_group=dict([tuple(l.split('\t')) for l in FileUtility.load_list(tree_addr.replace(tree_addr.split('/')[-1], 'phylogenetic_nodes_and_clusters.txt'))])

        groups=[int(isolate_to_group[iso]) for iso in final_strains]
        group_kfold = GroupKFold(n_splits=round(1/test_ratio))

        train_index, test_index = list(group_kfold.split(final_strains, Y, groups))[0]
        X_test=[final_strains[x] for x in test_index]
        FileUtility.save_list(path.replace('_folds.txt', '_test.txt'), ['\t'.join(X_test)])
        final_strains = [final_strains[ix] for ix in train_index]
        group_kfold = GroupKFold(n_splits=cv)

        folds=[]
        for _, test_index in group_kfold.split(train_index, [Y[idx] for idx in train_index],  [groups[idx] for idx in train_index]):
            folds.append(test_index)
        folds=['\t'.join([final_strains[x] for x in fold.tolist()]) for fold in  folds]
        FileUtility.save_list(path, folds)
Example no. 18
 def __init__(self, triple, crawl=True, parse=True, remove_after_parse=False, printing=False):
     '''
      :param triple: (url, destination_directory, output_file)
      :param crawl: whether to crawl the pages
      :param parse: whether to parse the crawled pages into the output file
      :param remove_after_parse: remove the crawled directory after parsing
      :param printing: verbose printing
     '''
     try:
         # get parameters
         self.url, self.destination_directory, output_file = triple
         self.output_file = self.destination_directory + output_file
         self.print = printing
         # crawl the pages
         # to be fixed
         if crawl:
             BibleCrawler.run_crawler(self, '//a[@class = "chapter-nav-right"]/@href', self.url, self.destination_directory)
         if parse:
             self.lang_directory = self.url.split('/')[3]
             # crawl the pages
             books=self.destination_directory + self.lang_directory
             self.run_parser(books, self.output_file)
             if remove_after_parse:
                 # parse the output file
                 # remove the directory
                 FileUtility.remove_dir(self.destination_directory + self.lang_directory)
     except:
         try:
             print(triple)
         except:
             return None
     return None
Example no. 19
def eco_all_classification_transfer_learning():
    '''
    Transfer-learning classification over all eco classes from 6-mer representations, initialized from a pretrained MLP.
    '''
    #[1024,0.2,256,0.1,256,0.1,128,0.1,64]
    X=FileUtility.load_sparse_csr('../../datasets/processed_data/eco_all_classes/6-mer_eco_restrictedmer_all.npz').toarray()
    Y=FileUtility.load_list('../../datasets/processed_data/eco_all_classes/eco_label_restrictedkmer_all.txt')
    DNN=DNNMutliclass16S(X,Y,model_arch=[512,0.1,256, 0.1,128])
    DNN.cross_validation('../../datasets/results/eco_all/nn', gpu_dev='6', pretrained_model=True,trainable=False, n_fold=5, epochs=10, batch_size=10, model_strct='../../datasets/results/eco_10000/classifiers/nn_layers_mlp_1024-0.2-512-0.2-512_0.88.pickle')
Example no. 20
def eco_all_classification():
    '''
    Classification over all eco classes from 6-mer representations with an MLP (10-fold CV).
    '''
    #[1024,0.2,256,0.1,256,0.1,128,0.1,64]
    X=FileUtility.load_sparse_csr('../../datasets/processed_data/eco_all_classes/6-mer_eco_restrictedmer_all.npz').toarray()
    Y=FileUtility.load_list('../../datasets/processed_data/eco_all_classes/eco_label_restrictedkmer_all.txt')
    DNN=DNNMutliclass16S(X,Y,model_arch=[1024,0.2,512,0.2,512,0.1,256])
    DNN.cross_validation('../../datasets/results/eco_all/nn', gpu_dev='1', n_fold=10, epochs=20, batch_size=10, model_strct='mlp')
Example no. 21
def crohns_disease():
    '''
    Crohn's disease classification from 6-mer representations with an MLP (3-fold CV).
    '''
    #[1024,0.2,256,0.1,256,0.1,128,0.1,64]
    X=FileUtility.load_sparse_csr('../../datasets/processed_data/crohn/sample-size/6-mers_rate_complete1359_seq_5000.npz').toarray()
    Y=FileUtility.load_list('../../datasets/processed_data/crohn/data_config/labels_disease_complete1359.txt')
    DNN=DNNMutliclass16S(X,Y,model_arch=[512,0.2,256,0.2,128,0.1,64,16])
    DNN.cross_validation('../../datasets/results/crohn/classifier/nn', gpu_dev='2', n_fold=3, epochs=25, batch_size=10, model_strct='mlp')
Example no. 22
 def __init__(self, output_path):
     '''
         Constructor
     '''
     # set the parameters
     self.output_path = output_path
     FileUtility.ensure_dir(self.output_path + '/biblecom_intermediate/')
     FileUtility.ensure_dir(self.output_path + '/reports/')
Example no. 23
 def write_in_fastafile(filename, res, min_length=50):
     corpus = []
     labels = []
     for seq, score, pval, _, _ in res:
         if len(seq) > min_length and pval < 0.05:
             corpus.append(seq)
             labels.append(' '.join(
                 ['+' if score > 0 else '-', 'p-val:' + str(pval)]))
     FileUtility.create_fasta_file(filename, corpus, labels)
Example no. 24
def generate_top_features(path, classifier_list, topk=200):
    ## TODO: expose topk as an input parameter

    writer = pd.ExcelWriter(path + '/ultimate_outputs/selected_features.xls',
                            engine='xlsxwriter')

    final_results = dict()
    for classifier in classifier_list:
        feature_files = FileUtility.recursive_glob(
            path + '/feature_selection/', '*_' + classifier)
        res = dict()
        for file in feature_files:
            phenotype = file.split('/')[0:-1][-1]
            if not phenotype in res:
                res[phenotype] = [file]
            else:
                if file.split('/')[-1].count('##') > res[phenotype][0].split(
                        '/')[-1].count('##'):
                    res[phenotype] = [file]
                elif file.split('/')[-1].count(
                        '##') == res[phenotype][0].split('/')[-1].count('##'):
                    res[phenotype].append(file)
        for phenotype in res.keys():
            if phenotype not in final_results:
                final_results[phenotype] = []
            final_results[phenotype] += res[phenotype]
    for phenotype, files in final_results.items():
        selected = [{
            x.split('\t')[0]: 1 / (idx + 1)
            for idx, x in enumerate(FileUtility.load_list(file)[1:topk])
        } for file in files]
        res = set(selected[0])
        for set_select in selected[1::]:
            res = res.intersection(set_select)

        geno_val_res = dict()
        for dict_geno_val in selected:
            for x, val in dict_geno_val.items():
                if x not in geno_val_res:
                    geno_val_res[x] = [val, 1]
                else:
                    geno_val_res[x][0] += val
                    geno_val_res[x][1] += 1

        df_dict = {'feature_name': [], 'mrr': [], 'freq_confirmation': []}
        for name, values in geno_val_res.items():
            rr, nr = values
            df_dict['feature_name'].append(name)
            df_dict['mrr'].append(rr / nr)
            df_dict['freq_confirmation'].append(nr)
        df = pd.DataFrame(df_dict)
        df.sort_values(['freq_confirmation', 'mrr', 'feature_name'],
                       ascending=[False, False, False],
                       inplace=True)
        df = df.copy()
        df.to_excel(writer, sheet_name=phenotype, index=False)
    # close the writer so the workbook is actually written to disk
    writer.close()
Example no. 25
def train_resampling_npe(sentenses, outfile, num_symbols, frequency_file, min_frequency=2, verbose=False, is_dict=False, resample_size=10000, N=10):
    """Learn num_symbols BPE operations from vocabulary, and write to outfile.
    """
    outfile_name=outfile
    list_of_seg=[]
    outfile = codecs.open(outfile, 'w', 'utf-8')
    f = codecs.open(frequency_file, 'w', 'utf-8')
    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility
    outfile.write('#version: 0.2\n')
    list_of_seg.append('#version: 0.2')


    vocab = get_vocabulary(sentenses, is_dict)
    vocab = dict([(tuple(x[:-1]) + (x[-1] + '</w>',), y) for (x, y) in vocab.items()])
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

    stats, indices = get_pair_statistics(sorted_vocab)
    big_stats = copy.deepcopy(stats)
    # threshold is inspired by Zipfian assumption, but should only affect speed
    threshold = max(stats.values()) / 10
    for i in tqdm.tqdm(range(num_symbols)):
        if stats:
            most_frequent = max(stats, key=lambda x: (stats[x], x))

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i / (i + 10000.0)
            prune_stats(stats, big_stats, threshold)

        if stats[most_frequent] < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break

        f.write('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        list_of_seg.append('{0} {1} '.format(*most_frequent))
        #print('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        if verbose:
            sys.stderr.write(
                'pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1],
                                                                       stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        stats[most_frequent] = 0
        if not i % 100:
            prune_stats(stats, big_stats, threshold)
        if not i % 100:
            FileUtility.save_list(outfile_name+'_temp',list_of_seg)


    f.close()
    outfile.close()
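A self-contained sketch of the core byte-pair-encoding loop that train_resampling_npe builds on: count adjacent symbol pairs over the corpus vocabulary and repeatedly merge the most frequent pair. It omits the '</w>' end-of-word handling, the pruning/threshold logic and the resampling of the full implementation, and is illustrative only.

from collections import Counter

def learn_bpe_merges(sentences, num_symbols):
    # word -> frequency, each word kept as a tuple of single-character symbols
    vocab = Counter()
    for sent in sentences:
        for word in sent.split():
            vocab[tuple(word)] += 1
    merges = []
    for _ in range(num_symbols):
        # count adjacent symbol pairs, weighted by word frequency
        pairs = Counter()
        for word, freq in vocab.items():
            for a, b in zip(word, word[1:]):
                pairs[(a, b)] += freq
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        merges.append(best)
        # apply the chosen merge to every word in the vocabulary
        merged_vocab = Counter()
        for word, freq in vocab.items():
            new_word, i = [], 0
            while i < len(word):
                if i < len(word) - 1 and (word[i], word[i + 1]) == best:
                    new_word.append(word[i] + word[i + 1])
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            merged_vocab[tuple(new_word)] += freq
        vocab = merged_vocab
    return merges

print(learn_bpe_merges(['ACGT ACGA ACGT', 'ACGG ACGT'], 5))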
Example no. 26
def create_tsne_web(X, Y, tsne_file_coor, tsne_file_label):
    classes = list(set(Y))
    classes.sort()
    L = [classes.index(y) for y in Y]
    tsne_res = np.hstack((X, np.array([L]).T))
    tsne_res[:, 0:2] = np.round(tsne_res[:, 0:2], 2)
    tsne_lines = []
    for l in tsne_res:
        tsne_lines.append('\t'.join([str(l[0]), str(l[1]), str(int(l[2]))]))
    FileUtility.save_list(tsne_file_coor, tsne_lines)
    FileUtility.save_list(tsne_file_label, Y)
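A sketch of how the 2-D coordinates passed to create_tsne_web might be produced; the sklearn TSNE call and the random feature matrix are assumptions, and the call at the end assumes the surrounding module's FileUtility is importable. Only a two-column coordinate array and one label per sample are required by the function above.

import numpy as np
from sklearn.manifold import TSNE

features = np.random.rand(100, 50)            # e.g. k-mer or NPE representations
labels = ['healthy'] * 50 + ['disease'] * 50  # one label per sample
coords = TSNE(n_components=2, random_state=0).fit_transform(features)

create_tsne_web(coords, labels, 'tsne_coordinates.txt', 'tsne_labels.txt')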
Example no. 27
def test():
    X = FileUtility.load_sparse_csr(
        '../body-sites/npe_rate_5000.npz').toarray()
    Y = FileUtility.load_list(
        '../body-sites/npe_representations_labels/labels_phen.txt')
    DNN = DNNMutliclass16S(X, Y, model_arch=[512, 0.2, 256, 0.2, 128, 0.1, 64])
    DNN.cross_validation('../body-sites/nn',
                         gpu_dev='2',
                         n_fold=3,
                         epochs=300,
                         batch_size=10,
                         model_strct='mlp')
Example no. 28
    def sequential_crawl(triples, override=False):

        if not override:
            new_list=[]
            for x,y,z in triples:
                if not FileUtility.exists(y+z):
                    new_list.append((x,y,z))
            triples=new_list

        print ('Start crawling..')
        for x in tqdm.tqdm(triples):
            PNGScriptRetrieve(x)
        FileUtility.save_list(triples[0][1]+'log.txt',PNGScriptRetrieve.log)
Example no. 29
 def DNN_classifier(out_dir, X_file, Y_file, arch, gpu_id, epochs,
                    batch_size):
     # k-mer data
     X = FileUtility.load_sparse_csr(X_file).toarray()
     # labels
     Y = [int(y) for y in FileUtility.load_list(Y_file)]
     DeepNN = DNN(X, Y, model_arch=arch)
     DeepNN.cross_validation(out_dir,
                             gpu_dev=gpu_id,
                             n_fold=10,
                             epochs=epochs,
                             batch_size=batch_size,
                             model_strct='mlp')
Example no. 30
 def numpy2trainfiles(file, name, out='../data/s8_features/'):
     '''
     test_file='/mounts/data/proj/asgari/dissertation/datasets/deepbio/protein_general/ss/data/cb513+profile_split1.npy'
     train_file='/mounts/data/proj/asgari/dissertation/datasets/deepbio/protein_general/ss/data/cullpdb+profile_6133_filtered.npy'
     :param file: path to the .npy dataset (e.g. the cb513 or cullpdb files above)
     :param name: base name for the generated training files
     :param out: output directory
     :return:
     '''
     db = np.load(file)
     a = np.arange(0, 21)
     b = np.arange(35, 56)
     c = np.hstack((a, b))
     db = np.reshape(db, (db.shape[0], int(db.shape[1] / 57), 57))
     seq = [
         'A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q',
         'P', 'S', 'R', 'T', 'W', 'V', 'Y', 'X', 'NoSeq'
     ]
     label = ['L', 'B', 'E', 'G', 'I', 'H', 'S', 'T']
     sequences = []
     labels = []
     possible_features = dict()
     for i in range(0, db.shape[0]):
         sequences.append(''.join([
             seq[np.argmax(x)] if np.max(x) == 1 else ''
             for x in db[i, :, 0:21]
         ]).lower())
         labels.append(''.join([
             label[np.argmax(y)] if np.max(y) == 1 else ''
             for y in db[i, :, 22:30]
         ]).lower())
     lengths = [len(x) for x in sequences]
     sorted_idxs = argsort(lengths)
     lengths.sort()
     sequences = [sequences[i] for i in sorted_idxs]
     labels = [labels[i] for i in sorted_idxs]
     FileUtility.save_list(out + name, [
         '\n'.join([
             ' '.join([elx, labels[idx][idy]])
             for idy, elx in enumerate(list(seq))
         ] + ['']) for idx, seq in enumerate(sequences)
     ])
     db_new = db[sorted_idxs, :, :]
     label_encoding = [[([0] if np.max(row) == 1 else [1]) + row
                        for row in db_new[i, :, 22:30].tolist()]
                       for i in range(0, db.shape[0])]
     np.save(out + name + '_mat_Y', label_encoding)
     db_new = db_new[:, :, c]
     np.save(out + name + '_mat_X', db_new)
     FileUtility.save_list(out + name + '_length.txt',
                           [str(l) for l in lengths])