def tune_and_eval(self,
                   results_file,
                   params=None,
                   feature_names=None,
                   njobs=50,
                   kfold=10,
                   optimized_for='f1_macro'):
     '''
     K-fold cross-validation tuning and evaluation for the random forest
     :param results_file: file to save the results
     :param params: parameters to be tuned; defaults to RFClassifier.params_tuning
     :param feature_names: if provided, feature importances are extracted
     :param njobs: number of cores
     :param kfold: number of folds
     :param optimized_for: scoring metric used for model selection
     :return:
     '''
     if params is None:
         params = RFClassifier.params_tuning
     self.CV = KFoldCrossVal(self.X, self.Y, folds=kfold)
     self.CV.tune_and_evaluate(self.model,
                               parameters=params,
                               score=optimized_for,
                               file_name=results_file + '_RF',
                               n_jobs=njobs)
     if feature_names is not None:
         [
             label_set, conf, label_set, best_score_, best_estimator_,
             cv_results_, best_params_,
             (cv_predictions_pred, cv_predictions_trues, isolates),
             (Y_test_pred, Y_test)
         ] = FileUtility.load_obj(results_file + '_RF.pickle')
         self.generate_RF_important_features(best_estimator_, feature_names,
                                             results_file)
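
A minimal usage sketch for the method above; the RFClassifier constructor is an assumption, only tune_and_eval and its arguments appear in the snippet:

# hypothetical driver: X is the feature matrix, Y the label list
classifier = RFClassifier(X, Y)  # assumed constructor signature
classifier.tune_and_eval('results/rf_run',             # written to results/rf_run_RF.pickle
                         feature_names=feature_names,  # enables feature-importance extraction
                         njobs=10,
                         kfold=10,
                         optimized_for='f1_macro')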
Example 2
 def load_precalculated(file_path):
     '''
     Load precalculated results
     :param file_path: path to the pickle file
     :return: the loaded object
     '''
     return FileUtility.load_obj(file_path)
 def tune_and_eval(self,
                   results_file,
                   params=None,
                   njobs=50,
                   kfold=10,
                   feature_names=None,
                   optimized_for='f1_macro'):
     '''
     K-fold cross-validation
     :param results_file: file to save the results
     :param params: parameters to be tuned; defaults to SVM.params_tuning
     :param njobs: number of cores
     :param kfold: number of folds
     :param feature_names: if provided, feature importances are extracted
     :param optimized_for: scoring metric used for model selection
     :return:
     '''
     if params is None:
         params = SVM.params_tuning
     CV = KFoldCrossVal(self.X, self.Y, folds=kfold)
     CV.tune_and_evaluate(self.model,
                          parameters=params,
                          score=optimized_for,
                          file_name=results_file + '_SVM',
                          n_jobs=njobs)
     if feature_names is not None:
         [
             nested_scores, cv_dicts, label_set, conf, label_set,
             best_score_, best_estimator_, cv_results_, best_params_,
             (cv_predictions_pred, cv_predictions_trues, isolates),
             (Y_test_pred, Y_test)
         ] = FileUtility.load_obj(results_file + '_SVM.pickle')
         self.generate_SVM_important_features(best_estimator_,
                                              feature_names, results_file)
Example 4
def get_cv_res(filename):
    [label_set, conf, best_score_, best_estimator_, cv_results_,
        best_params_, pred] = FileUtility.load_obj(filename)
    res = dict()
    print(conf)
    # print(cv_results_.keys())
    idx = np.argmax(cv_results_['mean_test_f1_macro'])

    def mean_std(metric):
        # format "mean $\pm$ std" of a metric at the best parameter setting
        return str(np.round(cv_results_['mean_test_' + metric][idx], 2)) + \
            ' $\\pm$ ' + str(np.round(cv_results_['std_test_' + metric][idx], 2))

    res['f1_macro'] = np.round(cv_results_['mean_test_f1_macro'][idx], 2)
    res['f1_macro*'] = mean_std('f1_macro')
    res['f1_micro'] = mean_std('f1_micro')
    res['precision_micro'] = mean_std('precision_micro')
    res['precision_macro'] = mean_std('precision_macro')
    res['recall_micro'] = mean_std('recall_micro')
    res['recall_macro'] = mean_std('recall_macro')
    # res['accuracy'] = mean_std('accuracy')
    res['file'] = filename  # was `file`, which is undefined in this function
    res['auc_macro'] = str(conf['auc_macro'])
    res['score'] = str(best_score_)
    return res
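
A hedged sketch of how get_cv_res can be aggregated downstream (the glob pattern and output path are assumptions): each returned dictionary becomes one row of a pandas DataFrame.

import pandas as pd

# hypothetical aggregation over a directory of result pickles
rows = [get_cv_res(f) for f in FileUtility.recursive_glob('results/', '*.pickle')]
summary = pd.DataFrame(rows).sort_values('f1_macro', ascending=False)
summary.to_csv('cv_summary.tsv', sep='\t', index=False)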
Example 5
 def tune_and_eval_predefined(self,
                              results_file,
                              isolates,
                              folds,
                              params=None,
                              feature_names=None,
                              njobs=50):
     '''
     Tune and evaluate the random forest using predefined folds
     :param results_file: file to save the results
     :param isolates: isolate (sample) identifiers
     :param folds: predefined fold assignment
     :param params: parameters to be tuned
     :param feature_names: if provided, feature importances are extracted
     :param njobs: number of cores
     :return:
     '''
     if params is None:
         params = [{
             "n_estimators": [100, 200, 500, 1000],
             "criterion": ["entropy"],  # "gini",
             'max_features': ['sqrt', 'auto'],  # 'auto',
             'min_samples_split': [2, 5, 10],  # 2,5,10
             'min_samples_leaf': [1, 2],
             'class_weight': ['balanced', None]
         }]
     self.CV = PredefinedFoldCrossVal(self.X, self.Y, isolates, folds)
     self.CV.tune_and_evaluate(self.model,
                               parameters=params,
                               score='f1_macro',
                               file_name=results_file + '_RF',
                               n_jobs=njobs)
     if feature_names is not None:
         try:
             [
                 label_set, conf, best_score_, best_estimator_, cv_results_,
                 best_params_, (y_predicted, Y, label_set)
             ] = FileUtility.load_obj(results_file + '_RF.pickle')
         except ValueError:  # fall back to the older result-file layout
             [
                 label_set, best_score_, best_estimator_, cv_results_,
                 best_params_, (Y, label_set)
             ] = FileUtility.load_obj(results_file + '_RF.pickle')
         self.generate_RF_important_features(best_estimator_, feature_names,
                                             results_file, 1000)
 def __init__(self):
     self.seq2freqstructs = FileUtility.load_obj(
         '../data_config/seg2sec.pickle')
     # color dictionary for secondary structures
     color_dict = {
         'e': 'yellow',
         'g': 'blue',
         'h': 'blue',
         'n': 'red',
         's': 'red',
         't': 'red'
     }
Example 7
    def load_alpha_distribution(self):
        swiss_size_change = FileUtility.load_obj('data_config/swiss_1000_samples.pickle')
        all_samples = []
        for i in tqdm.tqdm(range(1000)):
            sample = []
            for vocab in np.arange(10000, 1000000, 10000):
                sample.append(swiss_size_change[vocab][i])
            all_samples.append(-np.diff(sample))

        sample_mat = np.mean(normalize_mat(all_samples), axis=0)
        sample_mat_std = np.std(normalize_mat(all_samples), axis=0)
        self.alpha_param = st.alpha.fit(sample_mat)
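
Here st is scipy.stats, and alpha.fit returns the maximum-likelihood (a, loc, scale) parameters of an alpha distribution. A self-contained sanity check on synthetic data:

import scipy.stats as st

data = st.alpha.rvs(3.5, size=500, random_state=0)  # synthetic alpha-distributed sample
a, loc, scale = st.alpha.fit(data)
print(a, loc, scale)  # fitted shape should be close to 3.5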
Example 8
def create_excel_file(input_path, output_path):
    files_cv = FileUtility.recursive_glob(input_path, '*.pickle')
    if len(files_cv) > 0:
        files_cv.sort()
        table_test = {'classifier': [], 'feature': [], 'CV': [], 'Precision': [], 'Recall': [], 'F1': [],'macroF1': [], 'accuracy': []}
        table_cv = {'classifier': [], 'feature': [], 'CV': [], 'Precision': [], 'Recall': [], 'F1': [], 'macroF1': [],'accuracy': []}

        import warnings
        warnings.filterwarnings('ignore')
        df1=[]
        df2=[]
        for file in files_cv:
            [label_set, conf, label_set, best_score_, best_estimator_,
             cv_results_, best_params_, (cv_predictions_pred, cv_predictions_trues, isolates),
             (Y_test_pred, Y_test)] = FileUtility.load_obj(file)
            rep = file.split('/')[-1].split('_CV_')[0]
            CV_scheme = file.split('_CV_')[1].split('_')[0]
            classifier = file.split('_CV_')[1].split('_')[1].split('.')[0]

            table_test['feature'].append(rep)
            table_test['classifier'].append(classifier)
            table_test['CV'].append(CV_scheme)
            table_test['Precision'].append(np.round(precision_score(Y_test, Y_test_pred), 2))
            table_test['Recall'].append(np.round(recall_score(Y_test, Y_test_pred), 2))
            table_test['F1'].append(np.round(f1_score(Y_test, Y_test_pred), 2))
            table_test['macroF1'].append(np.round(f1_score(Y_test, Y_test_pred,average='macro'), 2))
            table_test['accuracy'].append(np.round(accuracy_score(Y_test, Y_test_pred), 2))

            table_cv['feature'].append(rep)
            table_cv['classifier'].append(classifier)
            table_cv['CV'].append(CV_scheme)
            table_cv['Precision'].append(np.round(precision_score(cv_predictions_trues, cv_predictions_pred), 2))
            table_cv['Recall'].append(np.round(recall_score(cv_predictions_trues, cv_predictions_pred), 2))
            table_cv['F1'].append(np.round(f1_score(cv_predictions_trues, cv_predictions_pred), 2))
            table_cv['macroF1'].append(np.round(f1_score(cv_predictions_trues, cv_predictions_pred,average='macro'), 2))
            table_cv['accuracy'].append(np.round(accuracy_score(cv_predictions_trues, cv_predictions_pred), 2))

        # build the dataframes once, after all files have been collected
        df1 = pd.DataFrame(data=table_test,
                           columns=['feature', 'CV', 'classifier', 'accuracy', 'Precision', 'Recall', 'F1', 'macroF1'])
        df2 = pd.DataFrame(data=table_cv,
                           columns=['feature', 'CV', 'classifier', 'accuracy', 'Precision', 'Recall', 'F1', 'macroF1'])
        writer = pd.ExcelWriter(output_path)
        df1.sort_values(['macroF1','feature','classifier'], ascending=[False, True, True], inplace=True)
        df1.to_excel(writer, 'Test', index=False)

        df2.sort_values(['macroF1','feature','classifier'], ascending=[False, True, True], inplace=True)
        df2.to_excel(writer, 'Cross-validation', index=False)
        writer.save()
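
The parsing above assumes result pickles named like <feature>_CV_<scheme>_<classifier>.pickle; a minimal call with placeholder paths:

create_excel_file('results/classifications/', 'results/summary.xlsx')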
Example 9
 def make_activation_function(file_name, X, last_layer=None):
     pretrained_weights = FileUtility.load_obj(file_name)
     if last_layer:
         h_sizes = [
             float(x)
             for x in file_name.split('/')[-1].split('_')[3].split('-')
         ] + [last_layer]
     else:
         h_sizes = [
             float(x)
             for x in file_name.split('/')[-1].split('_')[3].split('-')
         ]
     model = Sequential()
     for layer_idx, h_layer_size in enumerate(h_sizes):
         if layer_idx == 0:
             model.add(
                 Dense(int(h_layer_size),
                       input_dim=X.shape[1],
                       weights=pretrained_weights[0],
                       activation='relu'))
         else:
             if h_layer_size < 1:
                 model.add(
                     Dropout(h_layer_size,
                             weights=pretrained_weights[layer_idx]))
             else:
                 if layer_idx == len(h_sizes) - 1 and last_layer:
                     model.add(
                         Dense(int(h_layer_size),
                               weights=pretrained_weights[layer_idx],
                               activation='softmax'))
                 else:
                     model.add(
                         Dense(int(h_layer_size),
                               weights=pretrained_weights[layer_idx],
                               activation='relu'))
     activations = model.predict(X)
     np.savetxt(
         file_name.replace(
             file_name.split('/')[-1].split('_')[0], 'activationlayer'),
         activations)
     return activations
Example 10
 def get_pretrained_model(self, file_name, trainable):
     pretrained_weights = FileUtility.load_obj(file_name)

     h_sizes = [float(x) for x in file_name.split('/')[-1].split('_')[3].split('-')]
     model = Sequential()
     for layer_idx, h_layer_size in enumerate(h_sizes):
         if layer_idx == 0:
             model.add(Dense(int(h_layer_size), input_dim=self.X.shape[1], weights=pretrained_weights[0], activation='relu', trainable=trainable))
         else:
             if h_layer_size < 1:
                 model.add(Dropout(h_layer_size, weights=pretrained_weights[layer_idx], trainable=trainable))
             else:
                 model.add(Dense(int(h_layer_size), weights=pretrained_weights[layer_idx], activation='relu', trainable=trainable))
     if self.model_arch:
         for layer_idx, h_layer_size in enumerate(self.model_arch):
             if h_layer_size < 1:
                 model.add(Dropout(h_layer_size))
             else:
                 model.add(Dense(h_layer_size, activation='relu'))
     model.add(Dense(self.C, activation='softmax'))
     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
     return model
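
The network architecture is encoded in the fourth underscore-separated field of the weight file name: values >= 1 become Dense layer sizes and values < 1 become Dropout rates. A small standalone illustration of the parsing (the file name is made up):

fname = 'weights/ae_swiss_model_1024-0.2-512_v1.pickle'  # hypothetical name
h_sizes = [float(x) for x in fname.split('/')[-1].split('_')[3].split('-')]
print(h_sizes)  # [1024.0, 0.2, 512.0] -> Dense(1024), Dropout(0.2), Dense(512)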
    def tune_and_eval_predefined(self,
                                 results_file,
                                 isolates,
                                 folds_file,
                                 test_file,
                                 params=None,
                                 njobs=50,
                                 feature_names=None,
                                 optimized_for='f1_macro'):
        '''
        Tune and evaluate the SVM using predefined folds
        :param results_file: file to save the results
        :param isolates: isolate (sample) identifiers
        :param folds_file: file with the predefined fold assignment
        :param test_file: file with the test-set split
        :param params: parameters to be tuned; defaults to SVM.params_tuning
        :param njobs: number of cores
        :param feature_names: if provided, feature importances are extracted
        :param optimized_for: scoring metric used for model selection
        :return:
        '''

        if params is None:
            params = SVM.params_tuning
        self.CV = PredefinedFoldCrossVal(self.X, self.Y, isolates, folds_file,
                                         test_file)
        self.CV.tune_and_evaluate(self.model,
                                  parameters=params,
                                  score=optimized_for,
                                  file_name=results_file + '_SVM',
                                  n_jobs=njobs)
        if feature_names is not None:
            [
                nested_scores, cv_dicts, label_set, conf, label_set,
                best_score_, best_estimator_, cv_results_, best_params_,
                (cv_predictions_pred, cv_predictions_trues, isolates),
                (Y_test_pred, Y_test)
            ] = FileUtility.load_obj(results_file + '_SVM.pickle')
            self.generate_SVM_important_features(best_estimator_,
                                                 feature_names, results_file)
Example 12
 def load_history(filename, fileout):
     '''
     Plot the training/validation loss history
     :param filename: pickle file with the metrics and loss history
     :param fileout: output path prefix for the PDF plot
     :return:
     '''
     [
         latex_line, p_micro, r_micro, f1_micro, p_macro, r_macro, f1_macro,
         history
     ] = FileUtility.load_obj(filename)
     (loss_values, val_loss_values, epochs) = history
     matplotlib.rcParams['mathtext.fontset'] = 'stix'
     matplotlib.rcParams['font.family'] = 'STIXGeneral'
     matplotlib.rcParams['mathtext.fontset'] = 'custom'
     matplotlib.rcParams['mathtext.rm'] = 'Bitstream Vera Sans'
     matplotlib.rcParams['mathtext.it'] = 'Bitstream Vera Sans:italic'
     matplotlib.rcParams['mathtext.bf'] = 'Bitstream Vera Sans:bold'
     matplotlib.rcParams["axes.edgecolor"] = "black"
     matplotlib.rcParams["axes.linewidth"] = 0.6
     plt.rc('text', usetex=True)
     plt.plot(epochs, loss_values, 'ro', label='Loss for train set')
     plt.plot(epochs, val_loss_values, 'b+', label='Loss for test set')
     plt.xlabel('Epochs')
     plt.ylabel('Loss')
     plt.legend(loc=1,
                prop={'size': 8},
                ncol=1,
                edgecolor='black',
                facecolor='white',
                frameon=True)
     plt.title(
         'Loss with respect to the number of epochs for train and test sets'
     )
     plt.savefig(fileout + '.pdf')
     plt.show()
Example 13
    def tune_and_evaluate(self,
                          estimator,
                          parameters,
                          cv_inner=5,
                          score='f1_macro',
                          n_jobs=-1,
                          file_name='results',
                          NUM_TRIALS=3):
        '''
        Nested cross-validation: tune the hyper-parameters and evaluate the tuned model
        :param estimator: scikit-learn estimator to tune
        :param parameters: parameter grid for the search
        :param cv_inner: number of inner folds
        :param score: scoring metric used for refitting
        :param n_jobs: number of cores
        :param file_name: directory/tuning/classifier/features/
        :param NUM_TRIALS: number of nested-CV trials
        :return:
        '''

        print('reached here ==============================================')
        self.nested_scores = []
        cv_dicts = []
        test_predictions_in_trials = []
        best_params_in_trials = []

        # Loop for each trial
        for i in tqdm.tqdm(range(NUM_TRIALS)):
            # Choose cross-validation techniques for the inner and outer loops,
            # independently of the dataset.
            # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
            inner_cv = StratifiedKFold(n_splits=cv_inner,
                                       shuffle=True,
                                       random_state=i)

            # parameter search and scoring
            self.greed_search = GridSearchCV(estimator=estimator,
                                             param_grid=parameters,
                                             cv=inner_cv,
                                             scoring=self.scoring,
                                             refit=score,
                                             error_score=0,
                                             n_jobs=n_jobs,
                                             verbose=0)

            # Nested CV with parameter optimization
            nested_score = cross_val_score(self.greed_search,
                                           X=self.X,
                                           y=self.Y,
                                           cv=self.cv,
                                           n_jobs=1,
                                           scoring=score)
            self.nested_scores.append(nested_score)

            # Cross-validated predictions with the tuned parameters
            cv_dict_pred = cross_val_predict(self.greed_search,
                                             X=self.X,
                                             y=self.Y,
                                             cv=self.cv,
                                             n_jobs=1)
            cv_dicts.append(cv_dict_pred)

        # get the cv results
        cv_predictions_pred = []
        cv_predictions_trues = []

        # Non_nested parameter search and scoring
        self.greed_search = GridSearchCV(estimator=estimator,
                                         param_grid=parameters,
                                         cv=self.cv,
                                         scoring=self.scoring,
                                         refit=score,
                                         error_score=0,
                                         n_jobs=n_jobs,
                                         verbose=0)

        self.greed_search.fit(X=self.X, y=self.Y)

        isolates = []
        for train, test in self.cv:
            self.greed_search.best_estimator_.fit(
                self.X[train, :], [self.Y[idx] for idx in train])
            preds = self.greed_search.best_estimator_.predict(self.X[test, :])
            trues = [self.Y[idx] for idx in test]
            cv_predictions_pred.extend(preds)
            cv_predictions_trues.extend(trues)
            for i in test:
                isolates.append(i)

        label_set = list(set(self.Y))
        label_set.sort()

        isolates = [self.train_isolate_list[iso] for iso in isolates]
        conf = confusion_matrix(cv_predictions_trues,
                                cv_predictions_pred,
                                labels=label_set)

        Y_test_pred = self.greed_search.best_estimator_.predict(self.X_test)

        # save in file
        FileUtility.save_obj(file_name, [
            self.nested_scores, cv_dicts, label_set, conf, label_set,
            self.greed_search.best_score_, self.greed_search.best_estimator_,
            self.greed_search.cv_results_, self.greed_search.best_params_,
            (cv_predictions_pred, cv_predictions_trues, isolates),
            (Y_test_pred, self.Y_test)
        ])
        # sanity-check reload of the saved results
        [
            nested_scores, cv_dicts, label_set, conf, label_set, best_score_,
            best_estimator_, cv_results_, best_params_,
            (cv_predictions_pred, cv_predictions_trues, isolates),
            (Y_test_pred, Y_test)
        ] = FileUtility.load_obj(file_name + '.pickle')
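
For reference, the nested-CV pattern above reduces to the following self-contained scikit-learn sketch (the estimator and grid are illustrative, not the ones used in this codebase):

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
search = GridSearchCV(SVC(), {'C': [1, 10], 'gamma': ['scale', 0.01]},
                      cv=inner_cv, scoring='f1_macro')
# the inner loop tunes the hyper-parameters, the outer loop estimates generalization
nested = cross_val_score(search, X=X, y=y, cv=outer_cv, scoring='f1_macro')
print(nested.mean(), nested.std())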
Example 14
 def result_visualization(filename):
     [latex_line, p_micro, r_micro, f1_micro, p_macro, r_macro, f1_macro,
      (loss_values, val_loss_values, epochs)] = FileUtility.load_obj(filename)
     print(latex_line)
def generate_report(full_path, pred_test, domain, setting):
    '''
    :param full_path: output directory for the report artifacts
    :param pred_test: test results
    :param domain: experiment domain name
    :param setting: experiment setting name
    :return:
    '''
    # Error location analysis
    error_edge = 0
    error_NOTedge = 0
    correct_edge = 0
    correct_NOTedge = 0

    all_pred = []
    all_true = []

    for i in tqdm.tqdm(range(0, 514)):
        pred = np.array([np.argmax(x, axis=1) for x in pred_test[i][0]])
        true = np.array([np.argmax(x, axis=1) for x in pred_test[i][1]])
        all_pred = all_pred + pred.tolist()
        all_true = all_true + true.tolist()
        diff = np.diff(true)
        errors = [y for x, y in np.argwhere(pred != true)]
        corrects = list(set(list(range(len(pred[0])))) - set(errors))
        edges_edge = [y for x, y in np.argwhere(diff != 0)]
        edges_before = [x - 1 for x in edges_edge if x - 1 >= 0]
        edges_after = [x + 1 for x in edges_edge if x + 1 < len(pred[0])]
        edges = list(set(edges_edge + edges_before + edges_after))
        # contingency matrix
        error_edge = error_edge + len(list(set(errors).intersection(edges)))
        error_NOTedge = error_NOTedge + len(list(set(errors) - set(edges)))

        correct_edge = correct_edge + len(list(set(corrects).intersection(edges)))
        correct_NOTedge = correct_NOTedge + len(list(set(corrects) - set(edges)))

    all_pred = list(itertools.chain(*all_pred))
    all_true = list(itertools.chain(*all_true))

    acc_test = accuracy_score(all_true, all_pred)
    f1_macro = f1_score(all_true, all_pred, average='macro')
    f1_micro = f1_score(all_true, all_pred, average='micro')

    conf_mat = confusion_matrix(all_true, all_pred, labels=list(range(1,9)))
    conf_mat_column_mapping = {1: 'L (Loop)', 2: 'B (Beta bridge)', 3: 'E (Beta sheet)',
                               4: 'G (3-10 Helix)', 5: 'I (Pi Helix)', 6: 'H (Alpha helix)',
                               7: 'S (Bend)', 8: 'T (Turn)'}

    contingency_metric = [[error_edge, error_NOTedge],[correct_edge, correct_NOTedge]]

    # Chi-square test on the contingency table
    chi2_res = scipy.stats.chi2_contingency(contingency_metric, correction=True)
    chi2_res_pval = chi2_res[1]

    # log-likelihood ratio (the "G-test")
    gtest_res = scipy.stats.chi2_contingency(contingency_metric, lambda_="log-likelihood", correction=True)
    gtest_res_pval = gtest_res[1]
    #https://stackoverflow.com/questions/51864730/python-what-is-the-process-to-create-pdf-reports-with-charts-from-a-db

    cmap = sns.cubehelix_palette(light=1, as_cmap=True)
    create_mat_plot(conf_mat, [conf_mat_column_mapping[x] for x in list(range(1, 9))],
                    'Confusion matrix of protein secondary structure prediction',
                    full_path + 'confusion' + F"{domain}_{setting}",
                    'Predicted Label', 'True Label', filetype='png', annot=False, cmap=cmap)


    pdf = MyFPDF()
    pdf.add_page()
    pdf.set_xy(0, 0)

    html = F"""
    
    <h2>DeepPrime2Sec Report on Protein Secondary Structure Prediction</h2>
    <h3>Experiment name: {domain} - {setting} </h3>
    <hr/>
    
    <H3 align="left">The performance on CB513</H3>
    <h4>Report on the accuracy</h4>
    
    <table border="1" align="center" width="70%">
    <thead><tr><th width="30%">Test-set Accuray</th><th width="30%">Test-set micro F1</th><th width="30%">Test-set macro F1</th></tr></thead>
    <tbody>
    <tr><td>{round(acc_test,3)}</td><td>{round(f1_micro,3)}</td><td>{round(f1_macro,3)}</td></tr>
    </tbody>
    </table>

    <hr/>
    
    <h4>Confusion matrix</h4>
    
    
    """

    pdf.write_html(html)
    pdf.image(full_path+'confusion'+F"{domain}_{setting}"+'.png', x = 50, y = None, w = 100, h = 0, type = '', link = '')

    html=F"""
    <center>
    <image src='confusion{domain}_{setting}.png'/>
    </center>
    
    <hr/>
    
    <h4>Error analysis</h4>
    
    <h5>Contingency table for location analysis of the misclassified amino acids</h5>
    <table border="1" align="center" width="100%">
    <thead><tr><th width="30%">\</th><th width="30%">Located at the PSS transition</th><th width="30%">NOT Located at the PSS transition</th></tr></thead>
    <tbody>
    <tr><td><b>Misclassified</b></td><td>{error_edge}</td><td>{error_NOTedge}</td></tr>
    <tr><td><b>Correctly classified</b></td><td>{correct_edge}</td><td>{correct_NOTedge}</td></tr>
    </tbody>
    </table>
    <br/>
    <b>P-value for Chi-square test</b> = {chi2_res_pval}
    <br/>
    <b>P-value for G-test</b> = {gtest_res_pval}
    
    <hr/>
    <br/>
    <br/>
    <br/>
    
    <h4>Learning curve</h4>
    """
    pdf.write_html(html)

    # learning curve
    history_dict=FileUtility.load_obj(full_path+'history.pickle')
    plt.clf()
    loss_values = history_dict['loss']
    val_loss_values = history_dict['val_loss']
    epochs = range(1, len(loss_values) + 1)
    matplotlib.rcParams['mathtext.fontset'] = 'stix'
    matplotlib.rcParams['font.family'] = 'STIXGeneral'
    matplotlib.rcParams['mathtext.fontset'] = 'custom'
    matplotlib.rcParams['mathtext.rm'] = 'Bitstream Vera Sans'
    matplotlib.rcParams['mathtext.it'] = 'Bitstream Vera Sans:italic'
    matplotlib.rcParams['mathtext.bf'] = 'Bitstream Vera Sans:bold'
    matplotlib.rcParams["axes.edgecolor"] = "black"
    matplotlib.rcParams["axes.linewidth"] = 0.6
    plt.plot(epochs, loss_values, 'ro', label='Loss for train set')
    plt.plot(epochs, val_loss_values, 'b', label='Loss for test set')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend(loc=1, prop={'size': 8},ncol=1, edgecolor='black', facecolor='white', frameon=True)
    plt.title('Loss with respect to the number of epochs for train and test sets')
    plt.savefig(full_path + 'learning_curve'+F"{domain}_{setting}"+'.png', dpi=300)
    pdf.image(full_path + 'learning_curve'+F"{domain}_{setting}"+'.png', x = 50, y = None, w = 100, h = 0, type = '', link = '')


    pdf.output(full_path+'final_report.pdf', 'F')

    return acc_test, conf_mat, conf_mat_column_mapping, contingency_metric, chi2_res_pval, gtest_res_pval
Example 16
    def biomarker_extraction(self,
                             labeler,
                             label_mapper,
                             phenoname,
                             p_value_threshold=0.05,
                             pos_label=None,
                             neg_label=None,
                             excel=0):
        '''
        Extract NPE biomarkers for a phenotype, assign taxonomy, and generate the outputs
        :param labeler: callable or dict mapping a sample file name to its raw label
        :param label_mapper: dict mapping raw labels to class labels
        :param phenoname: phenotype name used in the output file names
        :param p_value_threshold: significance threshold for marker selection
        :param pos_label: positive-class label for the heatmap
        :param neg_label: negative-class label for the heatmap
        :param excel: if 1, also generate the marker Excel file and t-SNE plot
        :return:
        '''
        print('\t✔ NPE marker detection started...')
        start = time.time()
        rep_base_path = self.output_directory_inter + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
            self.rep_sampling_depth)
        filenames = [
            x.split('/')[-1]
            for x in FileUtility.load_list(rep_base_path + '_meta')
        ]

        # CHECK EXISTING LABELS
        if callable(labeler):
            selected_samples = [
                idx for idx, file in enumerate(filenames)
                if labeler(file) in label_mapper
            ]
        else:
            selected_samples = [
                idx for idx, file in enumerate(filenames)
                if labeler[file] in label_mapper
            ]

        if callable(labeler):
            Y = [
                str(label_mapper[labeler(filenames[sample_id])])
                for sample_id in selected_samples
            ]
        else:
            Y = [
                str(label_mapper[labeler[filenames[sample_id]]])
                for sample_id in selected_samples
            ]

        FileUtility.save_list(rep_base_path + '_' + phenoname + '_Y.txt', Y)
        DiTaxaWorkflow.ensure_dir(self.output_directory_inter +
                                  'npe_marker_files/')

        if self.override == 1 or not DiTaxaWorkflow.exists(
                self.output_directory_inter + 'npe_marker_files/' +
                '_'.join([phenoname, 'chi2_relative.fasta'])):
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                G16s = NPEMarkerDetection(
                    rep_base_path + '.npz',
                    rep_base_path + '_' + phenoname + '_Y.txt',
                    rep_base_path + '_features', self.output_directory_inter +
                    'npe_marker_files/' + phenoname, selected_samples)
                G16s.extract_markers()

            end = time.time()
            spent = end - start
            print('\t✔ biomarker extraction ' + phenoname + ' ' + str(spent) +
                  ' seconds, using ' + str(self.num_p) + ' cores')
            self.log_file.append('biomarker extraction ' + phenoname + ' ' +
                                 str(spent) + ' seconds, using ' +
                                 str(self.num_p) + ' cores')
        else:
            print(
                '\t✔ Biomarkers were already extracted; the statistical test was bypassed'
            )
            self.log_file.append(
                'Biomarkers were already extracted; the statistical test was bypassed'
            )

        FileUtility.save_list(self.output_directory + 'logfile.txt',
                              self.log_file)

        print('\t✔ Taxonomic assignment of the markers..')

        if callable(labeler):
            phenotypes = [
                labeler(filenames[sample_id]) for sample_id in selected_samples
            ]
        else:
            phenotypes = [
                labeler[filenames[sample_id]] for sample_id in selected_samples
            ]

        fasta_file = self.output_directory_inter + 'npe_marker_files/' + phenoname + '_chi2_relative.fasta'
        matrix_path = rep_base_path + '.npz'
        feature_file_path = rep_base_path + '_features'

        # redundancy removal is too expensive for very large marker sets
        remove_redundants = len(FileUtility.read_fasta_sequences(fasta_file)) <= 2000

        FileUtility.ensure_dir(self.output_directory +
                               'final_outputs/save_states/')
        if self.override == 1 or not DiTaxaWorkflow.exists(
                self.output_directory + 'final_outputs/save_states/' +
                phenoname + '.pickle'):
            start = time.time()
            Final_OBJ = NPEMarkerAnlaysis(fasta_file,
                                          matrix_path,
                                          feature_file_path,
                                          phenotypes,
                                          label_mapper,
                                          selected_samples,
                                          p_value_threshold=p_value_threshold,
                                          remove_redundants=remove_redundants,
                                          num_p=self.num_p,
                                          blastn_path=self.blastn_path)
            end = time.time()
            spent = end - start
            DiTaxaWorkflow.ensure_dir(self.output_directory + 'final_outputs/')
            FileUtility.save_obj(
                self.output_directory + 'final_outputs/save_states/' +
                phenoname, Final_OBJ)
            print('\t✔ Marker analysis and alignment ' + phenoname + ' ' +
                  str(spent) + ' seconds, using ' + str(self.num_p) + ' cores')
            self.log_file.append('Marker analysis and alignment ' + phenoname +
                                 ' ' + str(spent) + ' seconds, using ' +
                                 str(self.num_p) + ' cores')
        else:
            Final_OBJ = FileUtility.load_obj(self.output_directory +
                                             'final_outputs/save_states/' +
                                             phenoname + '.pickle')
            print('\t✔ The aligned markers already exist and were loaded.')
            self.log_file.append(
                'The aligned markers already exist and were loaded.')
        FileUtility.save_list(self.output_directory + 'logfile.txt',
                              self.log_file)

        # generating the tree
        Final_OBJ.generate_tree(self.output_directory + 'final_outputs/',
                                phenoname)

        # These paths are used by both the Excel export and the t-SNE plot in
        # the pos/neg branch below, so define them for every branch
        X_addr = rep_base_path + '.npz'
        feature_addr = rep_base_path + '_features'
        markers = self.output_directory_inter + 'npe_marker_files/' + phenoname + '_finalmarker_list.txt'
        Y = rep_base_path + '_' + phenoname + "_Y.txt"

        if excel == 1:
            print('\t✔ Creating marker excel file..')
            Final_OBJ.generate_excel(
                self.output_directory + 'final_outputs/' + phenoname + '.xlsx',
                phenoname)
            print('\t✔ Creating t-sne plot..')
            DiTaxaWorkflow.plot_res(self.output_directory + 'final_outputs/' +
                                    phenoname + '_tsne.pdf',
                                    X_addr,
                                    feature_addr,
                                    markers,
                                    Y,
                                    labels=['Negative', 'Positive'])

        if pos_label and neg_label:
            print('\t✔ Creating marker heatmap..')
            Final_OBJ.update_matrix_by_markers_N()
            Final_OBJ.generate_heatmap(self.output_directory +
                                       'final_outputs/' + phenoname +
                                       '_heatmap',
                                       pos_label=pos_label,
                                       neg_label=neg_label)
            if excel != 1:
                print('\t✔ Creating t-sne plot..')
                DiTaxaWorkflow.plot_res(self.output_directory +
                                        'final_outputs/' + phenoname +
                                        '_tsne.pdf',
                                        X_addr,
                                        feature_addr,
                                        markers,
                                        Y,
                                        labels=[neg_label, pos_label])
        DiTaxaWorkflow.temp_cleanup()
        print(
            '\t⬛ Marker detection and analysis completed. You can find the results at '
            + self.output_directory +
            ', in particular in the final_outputs subdirectory.')
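
A hedged call sketch for the workflow method above; the workflow instance and the labeling scheme are assumptions, only biomarker_extraction's own signature comes from the snippet:

# hypothetical driver: workflow is an already-constructed DiTaxaWorkflow instance
labeler = lambda filename: 'case' if 'patient' in filename else 'control'
label_mapper = {'case': 1, 'control': 0}
workflow.biomarker_extraction(labeler, label_mapper, 'disease_phenotype',
                              p_value_threshold=0.05,
                              pos_label='Positive', neg_label='Negative',
                              excel=1)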