Example 1
    def __apply_filter(self, protein):
        """Return True if the protein's evaluation meta data passes all
        configured filters for every benchmarked method."""

        filter_operators = {
            'greater': np.greater,
            'less': np.less,
            'greater_equal': np.greater_equal,
            'less_equal': np.less_equal,
            'equal': np.equal,
            'not_equal': np.not_equal
        }

        for method in self.benchmark_methods:
            eval_file = self.eval_dir + "/" + protein + "." + method
            eval_meta = io.read_json_from_mat(eval_file)

            for f in self.filter:
                filter_res = u.find_dict_key(f['key'], eval_meta)
                if not filter_operators[f['operator']](filter_res, f['value']):
                    print(
                        "{0} did not pass filter for {1} {2} {3}: {4}".format(
                            method, f['key'], f['operator'], f['value'],
                            filter_res))
                    return False

        return True
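
The filter specification consumed above is not shown in this excerpt; the following is a minimal sketch of what __apply_filter assumes, with hypothetical keys and thresholds (b stands for the benchmark object):

# Hypothetical filter spec: each dict names a meta-data key, one of the
# numpy comparison operators from filter_operators, and a threshold value.
b.filter = [
    {'key': 'neff', 'operator': 'greater_equal', 'value': 50},
    {'key': 'ncol', 'operator': 'less', 'value': 600},
]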
    def compute_evaluation_statistics_protein(self, protein, ranks, seqsep, contact_thr, noncontact_thr):
        """Compute per-protein precision, recall and mean error for every
        benchmarked method at the given ranks."""

        evaluation_file_protein = self.eval_dir + "/" + protein + ".protein"
        if not os.path.exists(evaluation_file_protein):
            print("Evaluation file for protein {0} does not exist: {1}!".format(protein, evaluation_file_protein))
            return

        eval_df = pd.DataFrame(io.read_matfile(evaluation_file_protein))
        eval_df.columns = ['cb_distance', 'i', 'j']

        ### add evaluation statistics for all methods ============================================================
        for method in self.benchmark_methods:
            evaluation_file_method = self.eval_dir + "/" + protein + "."  + method
            eval_df[method] = io.read_matfile(evaluation_file_method)

        ### apply constraints ====================================================================================
        eval_df['class'] = (eval_df['cb_distance'] <= contact_thr) * 1
        eval_df = eval_df[eval_df['j'] >= (eval_df['i'] + seqsep)]

        if noncontact_thr > contact_thr:
            eval_df = eval_df[(eval_df['cb_distance'] <= contact_thr) | (eval_df['cb_distance'] > noncontact_thr)]

        eval_df.sort_values(by=['i', 'j'], inplace=True)
        eval_df.reset_index(inplace=True)

        ### read protein info =====================================================================================
        protein_eval_metrics = io.read_json_from_mat(evaluation_file_protein)
        if 'L' not in protein_eval_metrics:
            print("Protein length L is missing in evaluation meta file for protein {0}!".format(protein))
            return

        L = protein_eval_metrics['L']
        if "cath class" in protein_eval_metrics:
            protein_eval_metrics["cath class"] = int(protein_eval_metrics["cath class"].split(".")[0])

        ### determine the ranks according to protein length L =====================================================
        # if there are fewer scored pairs than max(ranks_L): drop those ranks
        ranks_L = np.round(L * ranks).astype(int)
        ranks_L = np.array([rank for rank in ranks_L if rank < len(eval_df)])

        ### compute precision and recall values ==================================================================
        protein_eval_metrics['methods'] = {}
        for method in self.benchmark_methods:

            precision, recall, threshold = bu.compute_precision_recall(eval_df['class'], eval_df[method])
            mean_error                   = bu.compute_mean_error(eval_df['cb_distance'], eval_df[method], contact_thr)

            protein_eval_metrics['methods'][method] = {}
            protein_eval_metrics['methods'][method]['precision']    = [np.nan] * len(ranks)
            protein_eval_metrics['methods'][method]['mean_error']   = [np.nan] * len(ranks)
            protein_eval_metrics['methods'][method]['recall']       = [np.nan] * len(ranks)
            for rank_id, rank in enumerate(ranks_L):
                protein_eval_metrics['methods'][method]['precision'][rank_id]   = np.array(precision)[rank]
                protein_eval_metrics['methods'][method]["mean_error"][rank_id]  = np.array(mean_error)[rank]
                protein_eval_metrics['methods'][method]["recall"][rank_id]      = np.array(recall)[rank]


        return protein_eval_metrics
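
To make the rank handling concrete, a small worked example of how the fractional ranks are mapped to absolute ranks (the numbers are made up):

import numpy as np

ranks = np.array([0.1, 0.25, 0.5, 1.0])    # fractions of protein length
L = 150
ranks_L = np.round(L * ranks).astype(int)  # array([ 15,  38,  75, 150])
# Ranks that exceed the number of scored pairs are dropped, so the
# precision/recall/mean_error lists keep np.nan at those positions.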
def get_meta_property(b, method, key):
    """Collect the value of a meta-data key from every evaluation file of a method."""
    eval_files_method = [eval_file for eval_file in b.evaluation_files if method == eval_file.split(".")[1]]

    ### iterate over all proteins in evaluation suite for that method ==========================
    values = np.zeros(len(eval_files_method))
    for idx, eval_file in enumerate(eval_files_method):
        eval_meta = io.read_json_from_mat(b.eval_dir + "/" + eval_file)
        values[idx] = u.find_dict_key(key, eval_meta)

    return values
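
A usage sketch for get_meta_property, assuming b is a benchmark object whose evaluation_files follow the <protein>.<method> naming used throughout (the method name "pLL" is hypothetical):

# Collect the 'neff' recorded for one method across all proteins and summarize.
neff_values = get_meta_property(b, "pLL", "neff")
print(np.mean(neff_values), np.median(neff_values))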
Example 4
    def __add_method_evaluation_file(self, eval_file, mat, meta):
        """
            Write new evaluation file for a protein:
            scores will be extracted from mat and written to file together with meta data

        :param eval_file: path to evaluation file
        :param mat: matrix of scores for all residue pairs
        :param meta: meta data for method for this protein
        :return: None
        """

        # get indices of residue pairs and filter the scores
        protein = os.path.basename(eval_file).split(".")[0]
        protein_eval_file = self.eval_dir + "/" + protein + ".protein"

        if not os.path.exists(protein_eval_file):
            print(
                "Protein evaluation file {0} for protein {1} does not exist!".
                format(protein_eval_file, protein))
            return

        protein_eval = pd.DataFrame(io.read_matfile(protein_eval_file),
                                    columns=['cb_distance', 'i', 'j'])
        protein_eval = protein_eval.astype({
            'cb_distance': float,
            'i': int,
            'j': int
        })

        ij_scores = mat[protein_eval['i'], protein_eval['j']]

        # read meta data from eval_file if it exists
        if os.path.exists(eval_file):
            meta_method_protein = io.read_json_from_mat(eval_file)
            meta_method_protein.update(meta)
        else:
            meta_method_protein = meta

        #write new eval file for method/protein with meta data
        io.write_matfile(ij_scores, eval_file, meta_method_protein)

        ##### update statistics of class to ensure consistency
        method_name = eval_file.split(".")[-1]
        if eval_file not in self.evaluation_files:
            self.evaluation_files.append(eval_file)

            if method_name not in self.methods:
                self.methods.append(method_name)
                self.methods_count[method_name] = 1
            else:
                self.methods_count[method_name] += 1

        print(
            "Successfully added evaluation file for protein {0} and method {1}!"
            .format(protein, method_name))
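
The score extraction above relies on NumPy fancy indexing: passing the i and j columns as index arrays picks exactly one matrix entry per residue pair. A self-contained sketch with toy data:

import numpy as np
import pandas as pd

mat = np.arange(16).reshape(4, 4)          # toy 4x4 score matrix
protein_eval = pd.DataFrame({'i': [0, 0, 1], 'j': [2, 3, 3]})

ij_scores = mat[protein_eval['i'], protein_eval['j']]  # array([2, 3, 7])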
Example 7
    def add_method_from_file(self,
                             method_name,
                             method_dir,
                             is_mat_file,
                             apc=True,
                             update=True,
                             filter=""):

        print("Will add scores to evaluation files for method {0} \n "
              "from files of *{1}* in {2}".format(method_name, filter,
                                                  method_dir))

        for i, protein_name in enumerate(self.proteins):

            print("{0}/{1} {2}".format(i + 1, len(self.proteins),
                                       protein_name))

            method_file = glob.glob(method_dir + "/" + protein_name + "*" +
                                    filter + "*")

            if len(method_file) == 0:
                continue

            if not os.path.exists(method_file[0]):
                raise IOError("File " + str(method_file[0]) +
                              " cannot be found.")

            eval_file = self.eval_dir + "/" + protein_name + "." + method_name
            if os.path.exists(eval_file) and not update:
                print(
                    "Evaluation file {0} for protein {1} already exists. Do not update."
                    .format(eval_file, protein_name))
                continue

            if is_mat_file:
                mat = io.read_matfile(method_file[0])
                if apc:
                    mat = bu.compute_apc_corrected_matrix(mat)
                meta = io.read_json_from_mat(method_file[0])
            else:
                braw = raw.parse_msgpack(method_file[0])
                mat = bu.compute_l2norm_from_braw(braw, apc)
                meta = braw.meta

            self.__add_method_evaluation_file(eval_file, mat, meta)
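
A usage sketch under the obvious assumptions (a benchmark object b; method names, directories, and filter suffixes are placeholders):

# Add APC-corrected scores from precomputed .mat files ...
b.add_method_from_file("pll-apc", "/path/to/mat_files", is_mat_file=True,
                       apc=True, filter="frobenius.mat")
# ... and from raw coupling ('braw') files for a second method.
b.add_method_from_file("pcd-apc", "/path/to/braw_files", is_mat_file=False,
                       apc=True, filter="braw.gz")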
Example 8
    def add_protein_meta_data(self, protein, meta_data):

        if not isinstance(meta_data, dict):
            print("meta_data must be a dictionary!")
            return

        eval_file = self.eval_dir + "/" + protein + ".protein"
        if not os.path.exists(eval_file):
            print("There is no eval file for protein {0}".format(protein))
            return

        mat = io.read_matfile(eval_file)
        meta = io.read_json_from_mat(eval_file)

        # update meta data for all keys in meta_data
        for key, value in meta_data.items():
            meta[key] = value

        #write back to file
        io.write_matfile(mat, eval_file, meta)
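
A brief usage sketch (the protein id and meta-data values are hypothetical):

# Attach a CATH annotation and protein length to the per-protein eval file.
b.add_protein_meta_data("1mkcA00", {"cath class": "2.40.50.100", "L": 94})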
Example 11
def main():

    args = parse_args()

    mat_dir_method1 = args.braw_dir_method1
    mat_dir_method2 = args.braw_dir_method2
    plot_dir = args.plot_dir
    method_1 = args.method1
    method_2 = args.method2
    seq_sep = args.seq_sep
    alignment_dir = args.alignment_dir
    pdb_dir = args.pdb_dir

    ### debug
    method_1 = "persistent contrastive divergence"
    method_2 = "pseudo-likelihood maximization"

    method_1_short="PCD"
    method_2_short="PLL"

    # protein="1g2rA"
    # mat_file_1="/home/vorberg/work/data/ccmgen/psicov/predictions_pcd/" + protein + ".frobenius.mat"
    # mat_file_2="/home/vorberg/work/data/ccmgen/psicov/predictions_pll/" + protein + ".frobenius.mat"

    mat_dir_method1 = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd/"
    mat_dir_method2 = "/home/vorberg/work/data/ccmgen/psicov/predictions_pll/"

    alignment_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/psicov/"
    alignment_dir = "/home/vorberg/work/data/ccmgen/psicov/alignments/"

    pdb_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/pdb_renum_combs/"
    pdb_dir = "/home/vorberg/work/data/ccmgen/psicov/pdb/"

    seq_sep = 4
    #plot_dir = "/home/vorberg/work/plots/benchmark_full_likelihood_optimization/compare_cd_pll/"
    plot_dir = "/home/vorberg//work/plots/ccmgen/psicov/pll_vs_pcd_comparison/"

    #braw_file_1 = glob.glob(coupling_dir_1 +'/*' + protein + '*')
    #braw_file_2 = glob.glob(coupling_dir_2 + '/*' + protein + '*')

    #braw_1 = raw.parse_msgpack(braw_file_1)
    #braw_2 = raw.parse_msgpack(braw_file_2)

    mat_files_method2 = glob.glob(mat_dir_method2 + "/*.frobenius.mat")

    stats_dict = {}
    for mat_file_2 in mat_files_method2:

        protein = os.path.basename(mat_file_2).split(".")[0]
        #mat_file_2 = glob.glob(mat_dir_method2 +"/"+protein+"*mat")[0]
        print(protein)

        mat_file_1 = glob.glob(mat_dir_method1 + "/" + protein + "*.frobenius.mat")

        if len(mat_file_1) == 0:
            print("There is no mat file for protein {0} in directory {1}. Skipping protein.".format(protein, mat_dir_method1))
            continue
        mat_file_1 = mat_file_1[0]

        if alignment_dir is None:
            alignment_file = None
        else:
            alignment_file = alignment_dir + "/" + protein + ".aln"

        if pdb_dir is None:
            pdb_file = None
        else:
            pdb_file = pdb_dir + "/"+ protein + ".pdb"

        mat_1 = io.read_matfile(mat_file_1)
        mat_2 = io.read_matfile(mat_file_2)
        mat_meta = io.read_json_from_mat(mat_file_2)
        L = mat_1.shape[0]
        Neff = np.round(u.find_dict_key("neff", mat_meta), decimals=2)

        ### Plot Scatter of Frobenius Norm scores for both methods
        # alignment = io.read_alignment(alignment_file)
        # single_counts, pairwise_counts = au.compute_counts(alignment)
        # single_counts_binary = (single_counts[:, :20] > 0) * 1
        # sum_counts = np.sum(single_counts_binary, axis=1)
        # color_vector = np.multiply.outer(sum_counts, sum_counts)
        # color_vector = color_vector[np.triu_indices(L, k=1)]
        #
        # plot_file = plot_dir + "/scatter_for_" + method_1.replace(" ", "_") + "_vs_" + method_2.replace(" ", "_") + "_"+protein +".html"
        # x_axis_title = method_1
        # y_axis_title = method_2
        # title = protein + " L: "+str(L)+" Neff: "+str(Neff)+"<br>"
        # plot_scatter_comparison(title, x_axis_title, y_axis_title, mat_1, mat_2, plot_file, color_vector=color_vector)


        ### Compute APC corrected Frobenius Score
        mat_apc_1 = bu.compute_apc_corrected_matrix(mat_1)
        mat_apc_2 = bu.compute_apc_corrected_matrix(mat_2)


        ### Plot Scatter and QQPlot of Frobenius Norm + APC scores for both methods
        plot_file = plot_dir + "/scatter_for_" + method_1_short.replace(" ", "_") + "vs_" +  method_2_short.replace(" ", "_") + "_apc_"+protein +".html"
        x_axis_title = method_1
        y_axis_title = method_2
        title = "APC corrected contact scores for protein {0}".format(protein)
        plot_scatter_comparison(title, x_axis_title, y_axis_title, mat_apc_1, mat_apc_2, plot_file, qqplot=True)


        ### Plot Ranks for both methods
        # plot_ranked_predictions_sidebyside(protein, method_1, method_2, mat_apc_1, mat_apc_2, seq_sep, plot_dir, rank_only=False)
        #plot_ranked_predictions_sidebyside(protein, method_1, method_2, mat_apc_1, mat_apc_2, seq_sep, plot_dir, rank_only=True)



        ### Plot Contact Maps for L2norm + APC score for both methods
        # plot_file = plot_dir + "/contact_map_" + method_1.replace(" ", "_") + "_apc_"+protein +".html"
        # title = protein + " L: "+str(L)+" Neff: "+str(Neff)+"<br>" + method_1
        # plot_contact_map(mat_apc_1, seq_sep, 8, plot_file, title, alignment_file=alignment_file, pdb_file=pdb_file)
        #
        # plot_file = plot_dir + "/contact_map_" + method_2.replace(" ", "_") + "_apc_"+protein +".html"
        # title = protein + " L: "+str(L)+" Neff: "+str(Neff)+"<br>" + method_2
        # plot_contact_map(mat_apc_2, seq_sep, 8, plot_file, title, alignment_file=alignment_file, pdb_file=pdb_file)

        ### Plot Precision vs Rank
        # dict_scores = {
        #     method_1 + "_apc": mat_apc_1,
        #     method_2 + "_apc": mat_apc_2
        # }
        # plot_precision_vs_rank(dict_scores, pdb_file, seq_sep, 8, plot_dir)


        scores_1 = mat_apc_1[np.triu_indices(mat_apc_1.shape[0], k=1)]
        scores_2 = mat_apc_2[np.triu_indices(mat_apc_2.shape[0], k=1)]
        stats_dict[protein] = {
            "pearson": pearsonr(scores_1, scores_2),
            "kolmogorov-smirnov":  ks_2samp(scores_1, scores_2),
            "spearmanrho": spearmanr(scores_1, scores_2),
            "kendalltau": kendalltau(scores_1, scores_2),
            "Neff": Neff,
            "L": L,
            "linreg": linregress(scores_1, scores_2)
        }

    stats_dump_file = plot_dir + "/stats_dump.json"
    with open(stats_dump_file, 'w') as outfile:
        json.dump(stats_dict, outfile)


    plot_boxplot_correlation(stats_dict, method_1, method_2, ["Pearson r", "Spearman rho", "Kendalls tau", "linear fit slope"], plot_dir)
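
Both scripts lean on bu.compute_apc_corrected_matrix; for orientation, here is a minimal sketch of the average product correction (APC) as it is commonly defined for contact score matrices, not necessarily identical to the benchmark utility's implementation:

import numpy as np

def apc_corrected(mat):
    """Subtract the average product correction from a symmetric score matrix."""
    col_mean = np.mean(mat, axis=0)
    apc = np.outer(col_mean, col_mean) / np.mean(mat)
    return mat - apc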
def main():
    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')

    group_append = parser.add_mutually_exclusive_group(required=True)
    group_append.add_argument('-m',
                              '--mat_file',
                              type=str,
                              dest='mat_file',
                              help='path to mat file')
    group_append.add_argument('-b',
                              '--braw_file',
                              type=str,
                              dest='braw_file',
                              help='path to braw file')

    parser.add_argument("-o",
                        "--plot-out",
                        dest="plot_out",
                        type=str,
                        help="directory for plot")

    parser.add_argument("--seqsep",
                        type=int,
                        default=6,
                        help="sequence separation")
    parser.add_argument(
        "--contact_threshold",
        type=int,
        default=8,
        help="contact definition; C_beta distance between residue pairs")
    parser.add_argument(
        "--pdb_file",
        type=str,
        help="path to pdb file [optional] - plotting true contacts")
    parser.add_argument(
        "--alignment_file",
        type=str,
        help="path to alignment file [optional] - plotting coverage")
    parser.add_argument("--apc",
                        action="store_true",
                        default=False,
                        help="Apply average product correction")
    parser.add_argument("--entropy_correction",
                        action="store_true",
                        default=False,
                        help="Apply entropy correction")

    args = parser.parse_args()

    if args.mat_file is None and args.braw_file is None:
        print("Either mat_file or braw_file needs to be set.")
        return

    plot_out = args.plot_out
    seqsep = args.seqsep
    contact_threshold = args.contact_threshold
    apc = args.apc
    entropy_correction = args.entropy_correction

    alignment_file = args.alignment_file
    pdb_file = args.pdb_file

    ##### debugging

    protein = "2hs1A"
    topology = "binary"
    topology = "star"

    alignment_file = "/home/vorberg/work/data/ccmgen/psicov/alignments/" + protein + ".aln"
    alignment_file = None
    #alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_incmr_pc100/" + protein + "." + topology + ".aln"
    # alignment_format = "psicov"

    # braw_file = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/ccmpred-pll-centerv/braw/" + protein + ".filt.braw.gz"
    # braw_file = "/home/vorberg/" + protein + ".gx.gz"
    # braw_file = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/count_correction/braw_ec_correction/" + protein + ".braw.ec.gz"
    # braw_file = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/count_correction/braw/" + protein + ".filt.braw.gz"
    #
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pll/" + protein + ".frobenius.apc.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd/" + protein + ".frobenius.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd_cheating_12_pc100/" + protein + ".frobenius.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd_cheating_12_pc100/" + protein + ".frobenius.apc.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd_cheating_12_pc100/" + protein + ".frobenius.ec.20.log2.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/recover_pcd_cheating_12_incmr_pc100/" + protein + "." + topology + ".frobenius.apc.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/recover_pcd_cheating_12_incmr_pc100/" + protein + "." + topology + ".frobenius.ec.20.log2.mat"

    # pdb_file = "/home/vorberg/work/data/ccmgen/psicov/pdb/" + protein + ".pdb"
    # # pdb_file=None

    # seqsep = 4
    # # seqsep = 1

    # contact_threshold = 8

    # plot_out = "/home/vorberg/work/plots/ccmgen/psicov/contact_maps/"

    apc = True
    apc = False
    entropy_correction = True
    entropy_correction = False

    ### Compute l2norm score from braw
    if args.braw_file is not None:
        braw_file = args.braw_file
        protein_name = '.'.join(os.path.basename(braw_file).split('.')[:-1])
        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
        lambda_w = np.round(u.find_dict_key("lambda_pair", meta_info),
                            decimals=3)

        if entropy_correction:
            alignment = io.read_alignment(alignment_file)
            single_freq, pair_freq = au.calculate_frequencies(
                alignment, au.uniform_pseudocounts)
            mat = bu.compute_corrected_mat_entropy(braw.x_pair,
                                                   single_freq,
                                                   neff,
                                                   lambda_w,
                                                   entropy=True,
                                                   squared=False,
                                                   nr_states=20)
        else:
            mat = bu.compute_l2norm_from_braw(braw, apc)

    ### Read score from mat
    if args.mat_file is not None:
        mat_file = args.mat_file
        mat = io.read_matfile(mat_file)
        if apc:
            mat = bu.compute_apc_corrected_matrix(mat)
        meta_info = io.read_json_from_mat(mat_file)
        protein_name = os.path.basename(mat_file).split('.')[0]

    correction = ""
    if apc:
        correction = "_apc"
    if entropy_correction:
        correction = "_ec"
    plot_file = plot_out + protein_name + "_seqsep" + str(
        seqsep) + "_contacthr" + str(contact_threshold) + correction + ".html"
    neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
    N = u.find_dict_key("nrow", meta_info)
    L = u.find_dict_key("ncol", meta_info)
    title = protein_name + "<br>L: " + str(L) + " N: " + str(
        N) + " Neff: " + str(neff) + " diversity: " + str(
            np.round(np.sqrt(N) / L, decimals=3))
    plot_contact_map(mat,
                     seqsep,
                     contact_threshold,
                     title,
                     plot_file,
                     alignment_file=alignment_file,
                     pdb_file=pdb_file)
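
Assuming this script is saved as plot_contact_map.py (and the debugging block above is removed or commented out), a hypothetical invocation could look like:

# python plot_contact_map.py -m 1abcA.frobenius.mat -o ./plots/ \
#        --seqsep 6 --contact_threshold 8 --apc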