Python FileUtility.save_objの例、utility.file_utility.FileUtility.save_obj Pythonの例

コード例 #1

0

ファイルを表示

ファイル: cross_validation.py プロジェクト: ehsanasgari/Geno2Pheno

    def tune_and_evaluate(self,
                          estimator,
                          parameters,
                          score='f1_macro',
                          file_name='results'):
        '''
        Run a nested cross-validation around a grid search and save the outcome.

        The inner loop is a GridSearchCV over ``parameters`` using
        ``self.inner_cv``; the outer loop scores that search object with
        ``cross_val_score`` over ``self.outer_cv``.

        :param estimator: estimator handed to GridSearchCV
        :param parameters: parameter grid, passed as ``param_grid``
        :param score: passed to GridSearchCV as ``refit``
        :param file_name: directory/tuning/classifier/features/
        :return: None; results are kept on ``self.greed_search`` /
                 ``self.nested_score`` and written via FileUtility.save_obj
        '''
        # inner cross_validation
        self.greed_search = GridSearchCV(estimator=estimator,
                                         param_grid=parameters,
                                         cv=self.inner_cv,
                                         scoring=self.scoring,
                                         refit=score,
                                         error_score=0)
        # Nested CV with parameter optimization
        self.nested_score = cross_val_score(self.greed_search,
                                            X=self.X,
                                            y=self.Y,
                                            cv=self.outer_cv)

        # saving: the target path comes first and the payload second, as in
        # every other FileUtility.save_obj call (the arguments were swapped).
        FileUtility.save_obj(file_name,
                             [self.greed_search, self.nested_score])

コード例 #2

0

ファイルを表示

    def tune_and_evaluate(self,
                          estimator,
                          parameters,
                          score='macro_f1',
                          n_jobs=-1,
                          file_name='results'):
        '''
        Tune ``estimator`` with a grid search, build a cross-validated
        confusion matrix for the best estimator and save everything.

        :param estimator: estimator handed to GridSearchCV
        :param parameters: parameter grid, passed as ``param_grid``
        :param score: passed to GridSearchCV as ``refit``
        :param n_jobs: passed to GridSearchCV as ``n_jobs``
        :param file_name: directory/tuning/classifier/features/
        :return: None; the saved list holds, in order: label_set, confusion
                 matrix, best_score_, best_estimator_, cv_results_,
                 best_params_, cross-validated predictions
        '''
        # greed_search
        self.greed_search = GridSearchCV(estimator=estimator,
                                         param_grid=parameters,
                                         cv=self.cv,
                                         scoring=self.scoring,
                                         refit=score,
                                         error_score=0,
                                         n_jobs=n_jobs)

        # Sorted so the row/column order of the confusion matrix is the same
        # on every run; plain set iteration order is not reproducible.
        label_set = sorted(set(self.Y))

        # fitting
        self.greed_search.fit(X=self.X, y=self.Y)

        # No cv argument is given here, so cross_val_predict uses its own
        # default splitting rather than self.cv.
        y_predicted = cross_val_predict(self.greed_search.best_estimator_,
                                        self.X, self.Y)
        conf = confusion_matrix(self.Y, y_predicted, labels=label_set)

        # save in file
        FileUtility.save_obj(file_name, [
            label_set, conf, self.greed_search.best_score_,
            self.greed_search.best_estimator_, self.greed_search.cv_results_,
            self.greed_search.best_params_, y_predicted
        ])

コード例 #3

0

ファイルを表示

 def motif_extraction(self, topn=100):
     '''
     Select the top positively associated motifs and save the symmetric-KL
     matrix computed over their counts in the positive rows.

     :param topn: number of features requested from the chi2 analysis and the
                  maximum number of motifs kept (default 100)
     :return: None; the kept motifs are stored in ``self.motifs`` and the
              matrix is written to ``self.output_path + '/sym_KL'``
     '''
     # Raw term counts per sequence: no idf weighting and no normalisation.
     cpe_vectorizer = TfidfVectorizer(use_idf=False, analyzer='word',
                                      norm=None, stop_words=[], lowercase=True, binary=False, tokenizer=str.split)
     tf_vec = cpe_vectorizer.fit_transform(self.extended_sequences)
     vocab = cpe_vectorizer.get_feature_names()

     CH = Chi2Analysis(tf_vec, self.labels, vocab)
     selected = CH.extract_features_fdr(self.output_path + '/motifs.txt', N=topn, alpha=5e-2, direction=True,
                                        allow_subseq=True, binarization=True, remove_redundant_markers=False)
     # Keep only features whose second field is positive.
     vocab_binary = [x[0] for x in selected if x[1] > 0]
     # Honour the caller's topn; the cap used to be hard-coded to 100.
     vocab_binary = vocab_binary[:topn]

     idxs = [vocab.index(v) for v in vocab_binary]
     # Rows 0..len(self.pos)-1 are taken as the positive block
     # (NOTE(review): assumes positives come first in extended_sequences).
     pos_matrix = tf_vec.toarray()[0:len(self.pos), idxs]
     DIST = get_sym_kl_rows(pos_matrix.T)
     FileUtility.save_obj(self.output_path + '/sym_KL', DIST)
     #HC=HierarchicalClutering(DIST,vocab_binary)
     self.motifs = vocab_binary

コード例 #4

0

ファイルを表示

# For every merge size, segment each sequence with CPE and collect
# (segment, label_segment, pdb_idx) triples, then save one list per merge size.
sampled_lengths = [10000, 20000, 50000, 100000, 200000, 500000, -1]
triples = dict()
for i in sampled_lengths:
    print(i)
    # Make sure the key exists even when no pairs are produced, so the save
    # loop below cannot hit a KeyError.
    triples.setdefault(i, [])
    # Keep the merge file open for as long as the CPE object is used and
    # close it afterwards (it used to be leaked on every iteration).
    with open('../data_config/swissprot_ppe', 'r') as f:
        CPE_Applier = CPE(f, separator='', merge_size=i)
        sequences = FileUtility.read_fasta_sequences('../data_config/ss_N.txt')
        for pdb_idx, (x, y) in tqdm.tqdm(enumerate(pairwise(sequences))):
            segments = CPE_Applier.segment(x).split()
            label_segments = according_segmentation(segments, y)
            triples[i] += [(seg, label_segments[idx], pdb_idx)
                           for idx, seg in enumerate(segments)]
for i in sampled_lengths:
    FileUtility.save_obj('../data_config/pdbsegments_' + str(i), triples[i])

## mapping of motifs to PDB ids
# Header lines start with '>'; startswith() also tolerates blank lines, where
# indexing [0] would raise IndexError.
seq_ids = [
    line
    for line in (x.strip()
                 for x in FileUtility.load_list('../data_config/ss_N.txt'))
    if line.startswith('>')
]
# Only every other header is used; the key is the first two ':'-separated
# fields of the header without its leading '>'.
pdb_keys = [':'.join(val[1::].split(':')[0:2]) for val in seq_ids[::2]]
idx2pdb = dict(enumerate(pdb_keys))
pdb2idx = {key: idx for idx, key in enumerate(pdb_keys)}

コード例 #5

0

ファイルを表示

ファイル: cross_validation.py プロジェクト: ehsanasgari/Geno2Pheno

    def tune_and_evaluate(self,
                          estimator,
                          parameters,
                          cv_inner=5,
                          score='f1_macro',
                          n_jobs=-1,
                          file_name='results',
                          NUM_TRIALS=3):
        '''
        Nested and non-nested tuning of ``estimator`` followed by evaluation
        on the folds in ``self.cv`` and on the held-out test set.

        :param estimator: estimator handed to GridSearchCV
        :param parameters: parameter grid, passed as ``param_grid``
        :param cv_inner: number of splits of the inner StratifiedKFold
        :param score: used as ``refit`` for the searches and as ``scoring``
                      for the nested cross_val_score
        :param n_jobs: passed to GridSearchCV as ``n_jobs``
        :param file_name: directory/tuning/classifier/features/
        :param NUM_TRIALS: number of nested-CV repetitions; the trial index is
                           the random_state of the inner splitter
        :return: None; the saved list holds, in order: nested_scores,
                 cv_dicts, label_set, confusion matrix, label_set (again),
                 best_score_, best_estimator_, cv_results_, best_params_,
                 (fold predictions, fold truths, isolates),
                 (test predictions, self.Y_test)
        '''
        # Function-scope import so the block does not depend on the file's
        # import section.
        from sklearn.base import clone

        self.nested_scores = []
        cv_dicts = []

        # Loop for each trial
        for i in tqdm.tqdm(range(NUM_TRIALS)):

            # Choose cross-validation techniques for the inner and outer loops,
            # independently of the dataset.
            # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
            inner_cv = StratifiedKFold(n_splits=cv_inner,
                                       shuffle=True,
                                       random_state=i)

            # parameter search and scoring
            self.greed_search = GridSearchCV(estimator=estimator,
                                             param_grid=parameters,
                                             cv=inner_cv,
                                             scoring=self.scoring,
                                             refit=score,
                                             error_score=0,
                                             n_jobs=n_jobs,
                                             verbose=0)

            # Nested CV with parameter optimization: scores per outer fold
            nested_score = cross_val_score(self.greed_search,
                                           X=self.X,
                                           y=self.Y,
                                           cv=self.cv,
                                           n_jobs=1,
                                           scoring=score)
            self.nested_scores.append(nested_score)

            # Nested CV with parameter optimization: predictions per sample
            cv_dict_pred = cross_val_predict(self.greed_search,
                                             X=self.X,
                                             y=self.Y,
                                             cv=self.cv,
                                             n_jobs=1)
            cv_dicts.append(cv_dict_pred)

        # Non_nested parameter search and scoring
        self.greed_search = GridSearchCV(estimator=estimator,
                                         param_grid=parameters,
                                         cv=self.cv,
                                         scoring=self.scoring,
                                         refit=score,
                                         error_score=0,
                                         n_jobs=n_jobs,
                                         verbose=0)

        self.greed_search.fit(X=self.X, y=self.Y)

        # get the cv results
        cv_predictions_pred = []
        cv_predictions_trues = []
        isolates = []
        for train, test in self.cv:
            # Fit a fresh clone per fold. Refitting best_estimator_ in place
            # would leave it trained on the last fold only, and both the
            # test-set prediction and the saved estimator below would then
            # come from that partial model.
            fold_estimator = clone(self.greed_search.best_estimator_)
            fold_estimator.fit(self.X[train, :],
                               [self.Y[idx] for idx in train])
            cv_predictions_pred.extend(fold_estimator.predict(self.X[test, :]))
            cv_predictions_trues.extend(self.Y[idx] for idx in test)
            isolates.extend(test)

        label_set = list(set(self.Y))
        label_set.sort()

        # Map sample indices to isolate names
        isolates = [self.train_isolate_list[iso] for iso in isolates]
        conf = confusion_matrix(cv_predictions_trues,
                                cv_predictions_pred,
                                labels=label_set)

        Y_test_pred = self.greed_search.best_estimator_.predict(self.X_test)

        # save in file (layout unchanged, including the repeated label_set,
        # so existing readers of the saved list keep working)
        FileUtility.save_obj(file_name, [
            self.nested_scores, cv_dicts, label_set, conf, label_set,
            self.greed_search.best_score_, self.greed_search.best_estimator_,
            self.greed_search.cv_results_, self.greed_search.best_params_,
            (cv_predictions_pred, cv_predictions_trues, isolates),
            (Y_test_pred, self.Y_test)
        ])

コード例 #6

0

ファイルを表示

ファイル: bootstrapping.py プロジェクト: aebustion/MicroPheno

 def save_me(self, file_name):
     '''
     Save this instance through FileUtility.save_obj.

     :param file_name: name appended to ``self.output_dir`` to form the target
     :return: None
     '''
     target_path = self.output_dir + file_name
     FileUtility.save_obj(target_path, self)

コード例 #7

0

ファイルを表示

    def cross_validation(self, result_filename, gpu_dev='2', n_fold=5, epochs=50, batch_size=100, model_strct='mlp', pretrained_model=False, trainable=False):
        '''
        Stratified k-fold evaluation of a neural model; prints a LaTeX table
        row and saves the per-fold metrics and the last model's weights.

        :param result_filename: prefix of the two saved result files
        :param gpu_dev: value written to the CUDA_VISIBLE_DEVICES variable
        :param n_fold: number of stratified folds
        :param epochs: training epochs per fold
        :param batch_size: training batch size
        :param model_strct: model structure; only 'mlp' is handled when
                            pretrained_model is False
        :param pretrained_model: build the model via get_pretrained_model
        :param trainable: forwarded to get_pretrained_model
        :return: None
        :raises ValueError: unsupported model_strct without a pretrained model
        '''
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu_dev

        # Fail early with a clear message; previously `model` was simply left
        # unbound and the first fit() call crashed.
        if not pretrained_model and model_strct != 'mlp':
            raise ValueError("Unsupported model_strct %r: only 'mlp' is available without a pretrained model" % (model_strct,))

        skf = StratifiedKFold(n_splits=n_fold, shuffle=True)

        p_micro = []
        p_macro = []
        r_micro = []
        r_macro = []
        f1_micro = []
        f1_macro = []

        for train_index, valid_index in skf.split(self.X, self.Y):
            print ('\n Evaluation on a new fold is now get started ..')
            X_train = self.X[train_index, :]
            y_train = self.onehot_y[train_index, :]
            X_valid = self.X[valid_index, :]
            y_valid = self.onehot_y[valid_index, :]
            y_class_valid = self.encoded_Y[valid_index]

            # A new model is built for every fold
            if pretrained_model:
                model = self.get_pretrained_model(model_strct, trainable)
            else:
                model = self.get_MLP_model()

            # fitting
            history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, shuffle=True, validation_data=(X_valid, y_valid), verbose=0)
            pred = model.predict_classes(X_valid)
            # score-calculations
            f1_micro.append(f1_score(y_class_valid, pred, average='micro'))
            f1_macro.append(f1_score(y_class_valid, pred, average='macro'))
            p_micro.append(precision_score(y_class_valid, pred, average='micro'))
            p_macro.append(precision_score(y_class_valid, pred, average='macro'))
            r_micro.append(recall_score(y_class_valid, pred, average='micro'))
            r_macro.append(recall_score(y_class_valid, pred, average='macro'))

        # mean values
        f1mac = np.mean(f1_macro)
        f1mic = np.mean(f1_micro)
        prmac = np.mean(p_macro)
        prmic = np.mean(p_micro)
        remac = np.mean(r_macro)
        remic = np.mean(r_micro)
        # std values
        sf1mac = np.std(f1_macro)
        sf1mic = np.std(f1_micro)
        sprmac = np.std(p_macro)
        sprmic = np.std(p_micro)
        sremac = np.std(r_macro)
        sremic = np.std(r_micro)
        # table: micro P/R/F1 followed by macro P/R/F1, each as mean ± std
        latex_line = ' & '.join([str(np.round(x, 2)) + ' $\\pm$ ' + str(np.round(y, 2)) for x, y in [[prmic, sprmic], [remic, sremic], [f1mic, sf1mic], [prmac, sprmac], [remac, sremac], [f1mac, sf1mac]]])

        print (latex_line)

        # Loss curves come from the last fold only (history is overwritten
        # on every iteration of the loop above).
        history_dict = history.history
        loss_values = history_dict['loss']
        val_loss_values = history_dict['val_loss']
        # Separate name so the `epochs` parameter is no longer shadowed
        epoch_range = range(1, len(loss_values) + 1)

        # Saving the results
        if pretrained_model:
            model_strct = 'pretrained'
            #print (model.summary())
        arch_tag = '-'.join([str(x) for x in self.model_arch])
        score_tag = str(np.round(f1mac, 2))
        FileUtility.save_obj('_'.join([result_filename, model_strct, arch_tag, score_tag]), [latex_line, p_micro, r_micro, f1_micro, p_macro, r_macro, f1_macro, (loss_values, val_loss_values, epoch_range)])

        # Saving the parameters and weights (of the last fold's model)
        weights = [layer.get_weights() for layer in model.layers]
        FileUtility.save_obj('_'.join([result_filename, 'layers', model_strct, arch_tag, score_tag]), weights)

コード例 #8

0

ファイルを表示

ファイル: training.py プロジェクト: mrzResearchArena/DeepPrime2Sec

def training_loop(**kwargs):
    """Build, train and evaluate a model, writing artefacts under results/.

    Expected keyword arguments (as read below):
        run_parameters: dict with the keys 'gpu', 'train_batch_size',
            'test_batch_size', 'patience', 'epochs', 'domain_name' and
            'setting_name'.
        model_paramters: dict forwarded as keyword arguments to the model
            builder (the key spelling is part of the interface).
        deep_learning_model: string evaluated to obtain the model builder.

    Returns:
        None. The model summary, checkpoints and the training history are
        written into results/<domain_name>/<setting_name>/<params>/.
    """
    run_parameters = kwargs['run_parameters']
    model_paramters = kwargs['model_paramters']
    # NOTE(security): eval() executes arbitrary code. This is only acceptable
    # while 'deep_learning_model' comes from a trusted configuration; consider
    # an explicit name -> builder mapping instead.
    model_builder = eval(kwargs['deep_learning_model'])

    # which GPU to use
    os.environ["CUDA_VISIBLE_DEVICES"] = str(run_parameters['gpu'])

    # read files
    train_file = 'datasets/train.txt'
    test_file = 'datasets/test.txt'
    LD = LabelingData(train_file, test_file)
    # The length files live next to the train/test files
    train_lengths = [int(j) for j in FileUtility.load_list('/'.join(train_file.split('/')[0:-1]) + '/train_length.txt')]
    test_lengths = [int(i) for i in FileUtility.load_list('/'.join(test_file.split('/')[0:-1]) + '/test_length.txt')]

    # train/test batch parameters
    train_batch_size = run_parameters['train_batch_size']
    test_batch_size = run_parameters['test_batch_size']
    patience = run_parameters['patience']
    epochs = run_parameters['epochs']

    # model
    model, params = model_builder(LD.n_classes, **model_paramters)

    # output directory: create every level from results/ down to the run dir
    full_path = 'results/'
    FileUtility.ensure_dir(full_path)
    for part in (run_parameters['domain_name'], run_parameters['setting_name'], params):
        full_path += part + '/'
        FileUtility.ensure_dir(full_path)

    # save model
    with open(full_path + 'config.txt', 'w') as fh:
        model.summary(print_fn=lambda x: fh.write(x + '\n'))

    # check points
    filepath = full_path + "/weights-improvement-{epoch:02d}-{weighted_acc:.3f}-{val_weighted_acc:.3f}.hdf5"

    checkpoint = ModelCheckpoint(filepath, monitor='val_weighted_acc', verbose=1, save_best_only=True, mode='max',
                                 period=1)
    earlystopping = EarlyStopping(monitor='val_weighted_acc', min_delta=0, patience=patience, verbose=0, mode='max',
                                  baseline=None)
    callbacks_list = [checkpoint, earlystopping]

    # calculate the sizes: number of batches needed to cover every sample.
    # Integer ceiling division keeps both counts ints; steps_per_epoch used
    # to become a float whenever the sample count divided evenly.
    steps_per_epoch = (len(train_lengths) + train_batch_size - 1) // train_batch_size
    validation_steps = (len(test_lengths) + test_batch_size - 1) // test_batch_size

    # feed model
    h = model.fit_generator(train_batch_generator_408(train_batch_size), steps_per_epoch=steps_per_epoch,
                            validation_data=validation_batch_generator_408(test_batch_size),
                            validation_steps=validation_steps,
                            shuffle=False, epochs=epochs, verbose=1, callbacks=callbacks_list)

    # Analysis of the performance
    pred_test = [(model.predict_on_batch(x), y, w) for x, y, w in tqdm.tqdm(validation_batches_fortest_408(1))]

    # The returned values are not used further here; the call itself is kept.
    acc_test, conf_mat, conf_mat_column_mapping, contingency_metric, chi2_res_pval, gtest_res_pval = generate_report(pred_test)

    # save the history
    FileUtility.save_obj(full_path + 'history', h.history)

コード例 #9

0

ファイルを表示

ファイル: DiTaxa.py プロジェクト: seedpcseed/DiTaxa

    def biomarker_extraction(self,
                             labeler,
                             label_mapper,
                             phenoname,
                             p_value_threshold=0.05,
                             pos_label=None,
                             neg_label=None,
                             excel=0):
        '''
        Detect NPE markers for a phenotype, analyse them and create the final
        outputs (tree, optional excel file, heatmap and t-sne plot).

        :param labeler: callable or mapping giving the label of a file name
        :param label_mapper: mapping of the labels to keep; samples whose
                             label is not a key are skipped
        :param phenoname: phenotype name used in all output file names
        :param p_value_threshold: forwarded to NPEMarkerAnlaysis
        :param pos_label: positive label for the heatmap / t-sne plot
        :param neg_label: negative label for the heatmap / t-sne plot
        :param excel: 1 to also create the excel file and a t-sne plot
        :return: None
        '''
        print('\t✔ NPE Marker detection is started..')
        start = time.time()
        rep_base_path = self.output_directory_inter + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
            self.rep_sampling_depth)
        filenames = [
            x.split('/')[-1]
            for x in FileUtility.load_list(rep_base_path + '_meta')
        ]

        def label_of(file_name):
            # labeler is either a callable or an indexable mapping
            return labeler(file_name) if callable(labeler) else labeler[file_name]

        # CHECK EXISTING LABELS: keep the samples whose label is mapped
        selected_samples = []
        phenotypes = []
        for idx, file_name in enumerate(filenames):
            label = label_of(file_name)
            if label in label_mapper:
                selected_samples.append(idx)
                phenotypes.append(label)
        Y = [str(label_mapper[label]) for label in phenotypes]

        # Paths shared by the detection, analysis and plotting steps
        marker_dir = self.output_directory_inter + 'npe_marker_files/'
        final_dir = self.output_directory + 'final_outputs/'
        state_dir = final_dir + 'save_states/'
        y_path = rep_base_path + '_' + phenoname + '_Y.txt'
        matrix_path = rep_base_path + '.npz'
        feature_file_path = rep_base_path + '_features'
        fasta_file = marker_dir + phenoname + '_chi2_relative.fasta'
        markers = marker_dir + phenoname + '_finalmarker_list.txt'
        tsne_path = final_dir + phenoname + '_tsne.pdf'

        FileUtility.save_list(y_path, Y)
        DiTaxaWorkflow.ensure_dir(marker_dir)

        if self.override == 1 or not DiTaxaWorkflow.exists(fasta_file):
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                G16s = NPEMarkerDetection(matrix_path, y_path,
                                          feature_file_path,
                                          marker_dir + phenoname,
                                          selected_samples)
                G16s.extract_markers()

            spent = time.time() - start
            message = ('biomarker extraction ' + phenoname + '  ' + str(spent) +
                       ' seconds , using ' + str(self.num_p) + ' cores')
            print('\t✔ ' + message)
            self.log_file.append(message)
        else:
            print(
                '\t✔ Biomarker are already extracted. Thus, the statistical test was bypassed'
            )
            self.log_file.append(
                ' Biomarker are already extracted. Thus, the statistical test was bypassed'
            )

        FileUtility.save_list(self.output_directory + 'logfile.txt',
                              self.log_file)

        print('\t✔ Taxonomic assignment of the markers..')

        # Redundant markers are only removed for at most 2000 sequences
        remove_redundants = len(
            FileUtility.read_fasta_sequences(fasta_file)) <= 2000

        FileUtility.ensure_dir(state_dir)
        if self.override == 1 or not DiTaxaWorkflow.exists(
                state_dir + phenoname + '.pickle'):
            start = time.time()
            Final_OBJ = NPEMarkerAnlaysis(fasta_file,
                                          matrix_path,
                                          feature_file_path,
                                          phenotypes,
                                          label_mapper,
                                          selected_samples,
                                          p_value_threshold=p_value_threshold,
                                          remove_redundants=remove_redundants,
                                          num_p=self.num_p,
                                          blastn_path=self.blastn_path)
            spent = time.time() - start
            DiTaxaWorkflow.ensure_dir(final_dir)
            FileUtility.save_obj(state_dir + phenoname, Final_OBJ)
            message = ('Marker analysis and alignment ' + phenoname + '  ' +
                       str(spent) + ' seconds, using ' + str(self.num_p) +
                       'cores')
            print('\t✔ ' + message)
            self.log_file.append(message)
        else:
            Final_OBJ = FileUtility.load_obj(state_dir + phenoname + '.pickle')
            print('\t✔ The aligned markers already existed and are loaded!')
            self.log_file.append(
                'The aligned markers already existed and are loaded!')
        FileUtility.save_list(self.output_directory + 'logfile.txt',
                              self.log_file)

        # generating the tree
        Final_OBJ.generate_tree(final_dir, phenoname)

        if excel == 1:
            print('\t✔ Creating marker excel file..')
            Final_OBJ.generate_excel(final_dir + phenoname + '.xlsx',
                                     phenoname)
            print('\t✔ Creating t-sne plot..')
            DiTaxaWorkflow.plot_res(tsne_path,
                                    matrix_path,
                                    feature_file_path,
                                    markers,
                                    y_path,
                                    labels=['Negative', 'Positive'])

        if pos_label and neg_label:
            print('\t✔ Creating marker heatmap..')
            Final_OBJ.update_matrix_by_markers_N()
            Final_OBJ.generate_heatmap(final_dir + phenoname + '_heatmap',
                                       pos_label=pos_label,
                                       neg_label=neg_label)
            if not excel == 1:
                # The plot inputs are defined unconditionally above; they
                # used to exist only when excel == 1, so this branch raised
                # NameError.
                print('\t✔ Creating t-sne plot..')
                DiTaxaWorkflow.plot_res(tsne_path,
                                        matrix_path,
                                        feature_file_path,
                                        markers,
                                        y_path,
                                        labels=[neg_label, pos_label])
        DiTaxaWorkflow.temp_cleanup()
        print(
            '\t⬛ Marker detection and analysis completed. You can find the results at '
            + self.output_directory +
            ', in partuclar at final_outputs subdirectory.')

コード例 #10

0

ファイルを表示

ファイル: DiTaxa.py プロジェクト: llpberkeley/DiTaxa

    def biomarker_extraction(self,
                             labeler,
                             label_mapper,
                             name_setting,
                             p_value_threshold=0.05,
                             pos_label=None,
                             neg_label=None):
        '''
        Detect NPE markers for a setting, analyse them and create the final
        outputs (saved analysis object, tree and optional heatmap).

        :param labeler: callable or mapping giving the label of a file name
        :param label_mapper: mapping of the labels to keep; samples whose
                             label is not a key are skipped
        :param name_setting: name used in all output file names
        :param p_value_threshold: forwarded to NPEMarkerAnlaysis
        :param pos_label: positive label for the heatmap
        :param neg_label: negative label for the heatmap
        :return: None
        '''
        print('npe marker detection started')
        DiTaxaWorkflow.blockPrint()
        # Always re-enable printing, even when the detection step raises;
        # otherwise the blocked state would outlive this call.
        try:
            start = time.time()
            rep_base_path = self.output_directory + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
                self.rep_sampling_depth)
            filenames = [
                x.split('/')[-1]
                for x in FileUtility.load_list(rep_base_path + '_meta')
            ]

            def label_of(file_name):
                # labeler is either a callable or an indexable mapping
                return labeler(file_name) if callable(labeler) else labeler[file_name]

            # CHECK EXISTING LABELS: keep the samples whose label is mapped
            selected_samples = []
            phenotypes = []
            for idx, file_name in enumerate(filenames):
                label = label_of(file_name)
                if label in label_mapper:
                    selected_samples.append(idx)
                    phenotypes.append(label)
            Y = [str(label_mapper[label]) for label in phenotypes]

            y_path = rep_base_path + '_' + name_setting + '_Y.txt'
            matrix_path = rep_base_path + '.npz'
            feature_file_path = rep_base_path + '_features'
            marker_dir = self.output_directory + 'npe_marker_files/'

            FileUtility.save_list(y_path, Y)
            DiTaxaWorkflow.ensure_dir(marker_dir)
            G16s = NPEMarkerDetection(matrix_path, y_path, feature_file_path,
                                      marker_dir + name_setting,
                                      selected_samples)
            G16s.extract_markers()
            spent = time.time() - start
            self.log_file.append('biomarker extraction ' + name_setting +
                                 '  ' + str(spent) + ' seconds , using ' +
                                 str(self.num_p) + 'cores')
            FileUtility.save_list(self.output_directory + 'logfile.txt',
                                  self.log_file)
        finally:
            DiTaxaWorkflow.enablePrint()

        print('npe marker taxonomic detection started')
        start = time.time()

        fasta_file = marker_dir + name_setting + '_chi2_relative.fasta'
        final_dir = self.output_directory + 'final_outputs/'

        # Redundant markers are only removed for at most 2000 sequences
        remove_redundants = len(
            FileUtility.read_fasta_sequences(fasta_file)) <= 2000

        Final_OBJ = NPEMarkerAnlaysis(fasta_file,
                                      matrix_path,
                                      feature_file_path,
                                      phenotypes,
                                      label_mapper,
                                      selected_samples,
                                      p_value_threshold=p_value_threshold,
                                      remove_redundants=remove_redundants,
                                      num_p=self.num_p)
        spent = time.time() - start
        DiTaxaWorkflow.ensure_dir(final_dir)
        FileUtility.save_obj(final_dir + name_setting, Final_OBJ)
        Final_OBJ.generate_tree(final_dir, name_setting)
        self.log_file.append('blasting extraction ' + name_setting + '  ' +
                             str(spent) + ' seconds, using ' +
                             str(self.num_p) + 'cores')
        FileUtility.save_list(self.output_directory + 'logfile.txt',
                              self.log_file)
        if pos_label and neg_label:
            Final_OBJ.generate_heatmap(final_dir + name_setting + '_heatmap',
                                       pos_label=pos_label,
                                       neg_label=neg_label)