Exemple #1
0
    def __apply_filter(self, protein):
        """Tell whether `protein` satisfies every filter for every method.

        For each entry of `self.benchmark_methods` the evaluation file
        `<self.eval_dir>/<protein>.<method>` is read and each filter in
        `self.filter` (a dict with the keys 'key', 'operator' and 'value')
        is compared against the value looked up in the file's meta data.

        Returns False, after printing the offending filter, as soon as one
        comparison fails; returns True otherwise.
        """
        # maps the operator name used in a filter onto the numpy comparison
        comparators = dict(
            greater=np.greater,
            less=np.less,
            greater_equal=np.greater_equal,
            less_equal=np.less_equal,
            equal=np.equal,
            not_equal=np.not_equal,
        )

        for method in self.benchmark_methods:
            meta = io.read_json_from_mat(
                "/".join([self.eval_dir, protein + "." + method]))

            for rule in self.filter:
                observed = u.find_dict_key(rule['key'], meta)
                compare = comparators[rule['operator']]
                if compare(observed, rule['value']):
                    continue

                # first violated filter decides: report it and stop
                message = "{0} did not pass filter for {1} {2} {3}: {4}"
                print(message.format(method, rule['key'], rule['operator'],
                                     rule['value'], observed))
                return False

        return True
def get_meta_property(b, method, key):
    """Collect one meta data value per evaluation file of a method.

    Evaluation files are selected from `b.evaluation_files` when the second
    dot-separated component of the file name equals `method`. For each
    selected file the meta data is read from `b.eval_dir` and the value
    stored under `key` is looked up.

    Returns a float numpy array with one entry per selected file, in the
    order of `b.evaluation_files`.
    """
    selected = list(
        filter(lambda name: method == name.split(".")[1], b.evaluation_files))

    # one slot per protein evaluated with that method
    values = np.zeros(len(selected))
    for position, file_name in enumerate(selected):
        meta = io.read_json_from_mat("/".join((b.eval_dir, file_name)))
        values[position] = u.find_dict_key(key, meta)

    return values
def get_meta_property(b, method, key):
    """Return the value stored under `key` for every evaluation file of `method`.

    A file from `b.evaluation_files` belongs to `method` when the second
    dot-separated part of its name equals `method`. The meta data of each
    such file is read from the directory `b.eval_dir`.

    The result is a float numpy array holding one value per matching file,
    ordered like `b.evaluation_files`.
    """
    matching_files = []
    for candidate in b.evaluation_files:
        if method == candidate.split(".")[1]:
            matching_files.append(candidate)

    # fill the result protein by protein
    values = np.zeros(len(matching_files))
    for slot, candidate in enumerate(matching_files):
        meta = io.read_json_from_mat(b.eval_dir + "/" + candidate)
        values[slot] = u.find_dict_key(key, meta)

    return values
    def __apply_filter(self, protein):
        """Check `protein` against all filters for all benchmark methods.

        The evaluation file `<self.eval_dir>/<protein>.<method>` is read for
        every method in `self.benchmark_methods`. Each filter in
        `self.filter` is a dict with the keys 'key', 'operator' and 'value';
        the value found for 'key' in the file's meta data is compared with
        'value' using the named operator.

        Prints the first failing filter and returns False; returns True when
        nothing fails.
        """
        # operator name -> numpy comparison function
        comparators = dict(
            greater=np.greater,
            less=np.less,
            greater_equal=np.greater_equal,
            less_equal=np.less_equal,
            equal=np.equal,
            not_equal=np.not_equal,
        )

        for method in self.benchmark_methods:
            meta = io.read_json_from_mat(
                "/".join([self.eval_dir, protein + "." + method]))

            for rule in self.filter:
                observed = u.find_dict_key(rule['key'], meta)
                compare = comparators[rule['operator']]
                if compare(observed, rule['value']):
                    continue

                # a single failing filter rejects the protein
                message = "{0} did not pass filter for {1} {2} {3}: {4}"
                print(message.format(method, rule['key'], rule['operator'],
                                     rule['value'], observed))
                return False

        return True
def main():
    """Compare the contact scores of two methods over a set of proteins.

    For every ``*.frobenius.mat`` file found for method 2, the matching file
    of method 1 is looked up; proteins without one are skipped. Both score
    matrices are APC corrected, a scatter plot is written (``qqplot=True``)
    and correlation statistics over the upper triangles are collected per
    protein. The statistics are dumped to ``stats_dump.json`` in the plot
    directory and handed to ``plot_boxplot_correlation``.
    """
    args = parse_args()

    mat_dir_method1 = args.braw_dir_method1
    mat_dir_method2 = args.braw_dir_method2
    plot_dir = args.plot_dir
    method_1 = args.method1
    method_2 = args.method2
    seq_sep = args.seq_sep
    alignment_dir = args.alignment_dir
    pdb_dir = args.pdb_dir

    ### debug
    # NOTE(review): every assignment in this debug section replaces the value
    # parsed from the command line above; remove the section to make the
    # command line arguments effective again.
    method_1 = "persistent contrastive divergence"
    method_2 = "pseudo-likelihood maximization"

    method_1_short = "PCD"
    method_2_short = "PLL"

    # protein="1g2rA"
    # mat_file_1="/home/vorberg/work/data/ccmgen/psicov/predictions_pcd/" + protein + ".frobenius.mat"
    # mat_file_2="/home/vorberg/work/data/ccmgen/psicov/predictions_pll/" + protein + ".frobenius.mat"

    mat_dir_method1 = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd/"
    mat_dir_method2 = "/home/vorberg/work/data/ccmgen/psicov/predictions_pll/"

    alignment_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/psicov/"
    alignment_dir = "/home/vorberg/work/data/ccmgen/psicov/alignments/"

    pdb_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/pdb_renum_combs/"
    pdb_dir = "/home/vorberg/work/data/ccmgen/psicov/pdb/"

    seq_sep = 4
    #plot_dir = "/home/vorberg/work/plots/benchmark_full_likelihood_optimization/compare_cd_pll/"
    plot_dir = "/home/vorberg//work/plots/ccmgen/psicov/pll_vs_pcd_comparison/"

    #braw_file_1 = glob.glob(coupling_dir_1 +'/*' + protein + '*')
    #braw_file_2 = glob.glob(coupling_dir_2 + '/*' + protein + '*')

    #braw_1 = raw.parse_msgpack(braw_file_1)
    #braw_2 = raw.parse_msgpack(braw_file_2)

    mat_files_method2 = glob.glob(mat_dir_method2 + "/*.frobenius.mat")

    stats_dict = {}
    for mat_file_2 in mat_files_method2:

        protein = os.path.basename(mat_file_2).split(".")[0]
        #mat_file_2 = glob.glob(mat_dir_method2 +"/"+protein+"*mat")[0]
        print(protein)

        # Check the glob result BEFORE indexing it: indexing an empty list
        # raises IndexError, so the protein could never be skipped.
        mat_files_method1 = glob.glob(
            mat_dir_method1 + "/" + protein + "*.frobenius.mat")
        if not mat_files_method1:
            print("There is no mat file for protein {0} in directory {1}. Skip protein".format(
                protein, mat_dir_method1))
            continue
        mat_file_1 = mat_files_method1[0]

        # alignment_file and pdb_file are only needed by the optional plots
        # that are commented out further down
        if alignment_dir is None:
            alignment_file = None
        else:
            alignment_file = alignment_dir + "/" + protein + ".aln"

        if pdb_dir is None:
            pdb_file = None
        else:
            pdb_file = pdb_dir + "/" + protein + ".pdb"

        mat_1 = io.read_matfile(mat_file_1)
        mat_2 = io.read_matfile(mat_file_2)
        mat_meta = io.read_json_from_mat(mat_file_2)
        L = mat_1.shape[0]
        Neff = np.round(u.find_dict_key("neff", mat_meta), decimals=2)

        ### Plot Scatter of Frobenius Norm scores for both methods
        # alignment = io.read_alignment(alignment_file)
        # single_counts, pairwise_counts = au.compute_counts(alignment)
        # single_counts_binary = (single_counts[:, :20] > 0) * 1
        # sum_counts = np.sum(single_counts_binary, axis=1)
        # color_vector = np.multiply.outer(sum_counts, sum_counts)
        # color_vector = color_vector[np.triu_indices(L, k=1)]
        #
        # plot_file = plot_dir + "/scatter_for_" + method_1.replace(" ", "_") + "_vs_" + method_2.replace(" ", "_") + "_"+protein +".html"
        # x_axis_title = method_1
        # y_axis_title = method_2
        # title = protein + " L: "+str(L)+" Neff: "+str(Neff)+"<br>"
        # plot_scatter_comparison(title, x_axis_title, y_axis_title, mat_1, mat_2, plot_file, color_vector=color_vector)

        ### Compute APC corrected Frobenius Score
        mat_apc_1 = bu.compute_apc_corrected_matrix(mat_1)
        mat_apc_2 = bu.compute_apc_corrected_matrix(mat_2)

        ### Plot Scatter and QQPlot of Frobenius Norm + APC scores for both methods
        plot_file = plot_dir + "/scatter_for_" + method_1_short.replace(" ", "_") + "vs_" + method_2_short.replace(" ", "_") + "_apc_" + protein + ".html"
        x_axis_title = method_1
        y_axis_title = method_2
        title = "APC corrected contact scores for protein {0}".format(protein)
        plot_scatter_comparison(title, x_axis_title, y_axis_title, mat_apc_1, mat_apc_2, plot_file, qqplot=True)

        ### Plot Ranks for both methods
        # plot_ranked_predictions_sidebyside(protein, method_1, method_2, mat_apc_1, mat_apc_2, seq_sep, plot_dir, rank_only=False)
        #plot_ranked_predictions_sidebyside(protein, method_1, method_2, mat_apc_1, mat_apc_2, seq_sep, plot_dir, rank_only=True)

        ### Plot Contact Maps for L2norm + APC score for both methods
        # plot_file = plot_dir + "/contact_map_" + method_1.replace(" ", "_") + "_apc_"+protein +".html"
        # title = protein + " L: "+str(L)+" Neff: "+str(Neff)+"<br>" + method_1
        # plot_contact_map(mat_apc_1, seq_sep, 8, plot_file, title, alignment_file=alignment_file, pdb_file=pdb_file)
        #
        # plot_file = plot_dir + "/contact_map_" + method_2.replace(" ", "_") + "_apc_"+protein +".html"
        # title = protein + " L: "+str(L)+" Neff: "+str(Neff)+"<br>" + method_2
        # plot_contact_map(mat_apc_2, seq_sep, 8, plot_file, title, alignment_file=alignment_file, pdb_file=pdb_file)

        ### Plot Precision vs Rank
        # dict_scores = {
        #     method_1 + "_apc": mat_apc_1,
        #     method_2 + "_apc": mat_apc_2
        # }
        # plot_precision_vs_rank(dict_scores, pdb_file, seq_sep, 8, plot_dir)

        # statistics are computed on the upper triangle (diagonal excluded)
        scores_1 = mat_apc_1[np.triu_indices(mat_apc_1.shape[0], k=1)]
        scores_2 = mat_apc_2[np.triu_indices(mat_apc_2.shape[0], k=1)]
        stats_dict[protein] = {
            "pearson": pearsonr(scores_1, scores_2),
            "kolmogorov-smirnov": ks_2samp(scores_1, scores_2),
            "spearmanrho": spearmanr(scores_1, scores_2),
            "kendalltau": kendalltau(scores_1, scores_2),
            "Neff": Neff,
            "L": L,
            "linreg": linregress(scores_1, scores_2)
        }

    stats_dump_file = plot_dir + "/stats_dump.json"
    with open(stats_dump_file, 'w') as outfile:
        json.dump(stats_dict, outfile)

    plot_boxplot_correlation(stats_dict, method_1, method_2, ["Pearson r", "Spearman rho", "Kendalls tau", "linear fit slope"], plot_dir)
def main():
    """Plot a contact map for a score matrix given on the command line.

    The scores are either read from a mat file (``--mat_file``) or computed
    from a braw file (``--braw_file``); argparse enforces that exactly one of
    the two is given. ``--apc`` and ``--entropy_correction`` select the
    correction and are reflected in the name of the output file, which is
    written into the ``--plot-out`` directory (the current directory when the
    option is omitted).
    """
    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')

    group_append = parser.add_mutually_exclusive_group(required=True)
    group_append.add_argument('-m',
                              '--mat_file',
                              type=str,
                              dest='mat_file',
                              help='path to mat file')
    group_append.add_argument('-b',
                              '--braw_file',
                              type=str,
                              dest='braw_file',
                              help='path to braw file')

    parser.add_argument("-o",
                        "--plot-out",
                        dest="plot_out",
                        type=str,
                        help="directory for plot")

    parser.add_argument("--seqsep",
                        type=int,
                        default=6,
                        help="sequence separation")
    parser.add_argument(
        "--contact_threshold",
        type=int,
        default=8,
        help="contact definition; C_beta distance between residue pairs")
    parser.add_argument(
        "--pdb_file",
        type=str,
        help="path to pdb file [optional] -  plotting true contacs")
    parser.add_argument(
        "--alignment_file",
        type=str,
        help="path to alignment file [optional] - plotting coverage")
    parser.add_argument("--apc",
                        action="store_true",
                        default=False,
                        help="Apply average product correction")
    parser.add_argument("--entropy_correction",
                        action="store_true",
                        default=False,
                        help="Apply entropy correction")

    args = parser.parse_args()

    # The options are used exactly as parsed. Earlier debug assignments
    # replaced apc, entropy_correction and alignment_file with fixed values,
    # which silently disabled the corresponding command line options.
    plot_out = args.plot_out or ""  # -o is optional: default to current dir
    seqsep = args.seqsep
    contact_threshold = args.contact_threshold
    apc = args.apc
    entropy_correction = args.entropy_correction

    alignment_file = args.alignment_file
    pdb_file = args.pdb_file

    if args.braw_file is not None:
        ### Compute l2norm score from braw
        if entropy_correction and alignment_file is None:
            # the entropy correction is computed from alignment frequencies
            parser.error(
                "--entropy_correction with --braw_file requires --alignment_file")

        braw_file = args.braw_file
        protein_name = '.'.join(os.path.basename(braw_file).split('.')[:-1])
        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
        lambda_w = np.round(u.find_dict_key("lambda_pair", meta_info),
                            decimals=3)

        if entropy_correction:
            alignment = io.read_alignment(alignment_file)
            single_freq, pair_freq = au.calculate_frequencies(
                alignment, au.uniform_pseudocounts)
            mat = bu.compute_corrected_mat_entropy(braw.x_pair,
                                                   single_freq,
                                                   neff,
                                                   lambda_w,
                                                   entropy=True,
                                                   squared=False,
                                                   nr_states=20)
        else:
            mat = bu.compute_l2norm_from_braw(braw, apc)
    else:
        ### Read score from mat
        # the required, mutually exclusive group guarantees mat_file is set
        mat_file = args.mat_file
        mat = io.read_matfile(mat_file)
        if apc:
            mat = bu.compute_apc_corrected_matrix(mat)
        meta_info = io.read_json_from_mat(mat_file)
        protein_name = os.path.basename(mat_file).split('.')[0]

    # entropy correction takes precedence over apc in the file name
    correction = ""
    if apc:
        correction = "_apc"
    if entropy_correction:
        correction = "_ec"

    # join instead of concatenating so that a directory given without a
    # trailing separator still yields a file inside that directory
    plot_file = os.path.join(
        plot_out, protein_name + "_seqsep" + str(seqsep) + "_contacthr" +
        str(contact_threshold) + correction + ".html")

    neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
    N = u.find_dict_key("nrow", meta_info)
    L = u.find_dict_key("ncol", meta_info)
    title = protein_name + "<br>L: " + str(L) + " N: " + str(
        N) + " Neff: " + str(neff) + " diversity: " + str(
            np.round(np.sqrt(N) / L, decimals=3))
    plot_contact_map(mat,
                     seqsep,
                     contact_threshold,
                     title,
                     plot_file,
                     alignment_file=alignment_file,
                     pdb_file=pdb_file)
def main():
    """Compare the contact scores of two methods over a set of proteins.

    For every ``*.frobenius.mat`` file found for method 2, the matching file
    of method 1 is looked up; proteins without one are skipped. Both score
    matrices are APC corrected, a scatter plot is written (``qqplot=True``)
    and correlation statistics over the upper triangles are collected per
    protein. The statistics are dumped to ``stats_dump.json`` in the plot
    directory and handed to ``plot_boxplot_correlation``.
    """
    args = parse_args()

    mat_dir_method1 = args.braw_dir_method1
    mat_dir_method2 = args.braw_dir_method2
    plot_dir = args.plot_dir
    method_1 = args.method1
    method_2 = args.method2
    seq_sep = args.seq_sep
    alignment_dir = args.alignment_dir
    pdb_dir = args.pdb_dir

    ### debug
    # NOTE(review): every assignment in this debug section replaces the value
    # parsed from the command line above; remove the section to make the
    # command line arguments effective again.
    method_1 = "persistent contrastive divergence"
    method_2 = "pseudo-likelihood maximization"

    method_1_short = "PCD"
    method_2_short = "PLL"

    # protein="1g2rA"
    # mat_file_1="/home/vorberg/work/data/ccmgen/psicov/predictions_pcd/" + protein + ".frobenius.mat"
    # mat_file_2="/home/vorberg/work/data/ccmgen/psicov/predictions_pll/" + protein + ".frobenius.mat"

    mat_dir_method1 = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd/"
    mat_dir_method2 = "/home/vorberg/work/data/ccmgen/psicov/predictions_pll/"

    alignment_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/psicov/"
    alignment_dir = "/home/vorberg/work/data/ccmgen/psicov/alignments/"

    pdb_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/pdb_renum_combs/"
    pdb_dir = "/home/vorberg/work/data/ccmgen/psicov/pdb/"

    seq_sep = 4
    #plot_dir = "/home/vorberg/work/plots/benchmark_full_likelihood_optimization/compare_cd_pll/"
    plot_dir = "/home/vorberg//work/plots/ccmgen/psicov/pll_vs_pcd_comparison/"

    #braw_file_1 = glob.glob(coupling_dir_1 +'/*' + protein + '*')
    #braw_file_2 = glob.glob(coupling_dir_2 + '/*' + protein + '*')

    #braw_1 = raw.parse_msgpack(braw_file_1)
    #braw_2 = raw.parse_msgpack(braw_file_2)

    mat_files_method2 = glob.glob(mat_dir_method2 + "/*.frobenius.mat")

    stats_dict = {}
    for mat_file_2 in mat_files_method2:

        protein = os.path.basename(mat_file_2).split(".")[0]
        #mat_file_2 = glob.glob(mat_dir_method2 +"/"+protein+"*mat")[0]
        print(protein)

        # Check the glob result BEFORE indexing it: indexing an empty list
        # raises IndexError, so the protein could never be skipped.
        mat_files_method1 = glob.glob(mat_dir_method1 + "/" + protein +
                                      "*.frobenius.mat")
        if not mat_files_method1:
            print(
                "There is no mat file for protein {0} in directory {1}. Skip protein"
                .format(protein, mat_dir_method1))
            continue
        mat_file_1 = mat_files_method1[0]

        # alignment_file and pdb_file are only needed by the optional plots
        # that are commented out further down
        if alignment_dir is None:
            alignment_file = None
        else:
            alignment_file = alignment_dir + "/" + protein + ".aln"

        if pdb_dir is None:
            pdb_file = None
        else:
            pdb_file = pdb_dir + "/" + protein + ".pdb"

        mat_1 = io.read_matfile(mat_file_1)
        mat_2 = io.read_matfile(mat_file_2)
        mat_meta = io.read_json_from_mat(mat_file_2)
        L = mat_1.shape[0]
        Neff = np.round(u.find_dict_key("neff", mat_meta), decimals=2)

        ### Plot Scatter of Frobenius Norm scores for both methods
        # alignment = io.read_alignment(alignment_file)
        # single_counts, pairwise_counts = au.compute_counts(alignment)
        # single_counts_binary = (single_counts[:, :20] > 0) * 1
        # sum_counts = np.sum(single_counts_binary, axis=1)
        # color_vector = np.multiply.outer(sum_counts, sum_counts)
        # color_vector = color_vector[np.triu_indices(L, k=1)]
        #
        # plot_file = plot_dir + "/scatter_for_" + method_1.replace(" ", "_") + "_vs_" + method_2.replace(" ", "_") + "_"+protein +".html"
        # x_axis_title = method_1
        # y_axis_title = method_2
        # title = protein + " L: "+str(L)+" Neff: "+str(Neff)+"<br>"
        # plot_scatter_comparison(title, x_axis_title, y_axis_title, mat_1, mat_2, plot_file, color_vector=color_vector)

        ### Compute APC corrected Frobenius Score
        mat_apc_1 = bu.compute_apc_corrected_matrix(mat_1)
        mat_apc_2 = bu.compute_apc_corrected_matrix(mat_2)

        ### Plot Scatter and QQPlot of Frobenius Norm + APC scores for both methods
        plot_file = plot_dir + "/scatter_for_" + method_1_short.replace(
            " ", "_") + "vs_" + method_2_short.replace(
                " ", "_") + "_apc_" + protein + ".html"
        x_axis_title = method_1
        y_axis_title = method_2
        title = "APC corrected contact scores for protein {0}".format(protein)
        plot_scatter_comparison(title,
                                x_axis_title,
                                y_axis_title,
                                mat_apc_1,
                                mat_apc_2,
                                plot_file,
                                qqplot=True)

        ### Plot Ranks for both methods
        # plot_ranked_predictions_sidebyside(protein, method_1, method_2, mat_apc_1, mat_apc_2, seq_sep, plot_dir, rank_only=False)
        #plot_ranked_predictions_sidebyside(protein, method_1, method_2, mat_apc_1, mat_apc_2, seq_sep, plot_dir, rank_only=True)

        ### Plot Contact Maps for L2norm + APC score for both methods
        # plot_file = plot_dir + "/contact_map_" + method_1.replace(" ", "_") + "_apc_"+protein +".html"
        # title = protein + " L: "+str(L)+" Neff: "+str(Neff)+"<br>" + method_1
        # plot_contact_map(mat_apc_1, seq_sep, 8, plot_file, title, alignment_file=alignment_file, pdb_file=pdb_file)
        #
        # plot_file = plot_dir + "/contact_map_" + method_2.replace(" ", "_") + "_apc_"+protein +".html"
        # title = protein + " L: "+str(L)+" Neff: "+str(Neff)+"<br>" + method_2
        # plot_contact_map(mat_apc_2, seq_sep, 8, plot_file, title, alignment_file=alignment_file, pdb_file=pdb_file)

        ### Plot Precision vs Rank
        # dict_scores = {
        #     method_1 + "_apc": mat_apc_1,
        #     method_2 + "_apc": mat_apc_2
        # }
        # plot_precision_vs_rank(dict_scores, pdb_file, seq_sep, 8, plot_dir)

        # statistics are computed on the upper triangle (diagonal excluded)
        scores_1 = mat_apc_1[np.triu_indices(mat_apc_1.shape[0], k=1)]
        scores_2 = mat_apc_2[np.triu_indices(mat_apc_2.shape[0], k=1)]
        stats_dict[protein] = {
            "pearson": pearsonr(scores_1, scores_2),
            "kolmogorov-smirnov": ks_2samp(scores_1, scores_2),
            "spearmanrho": spearmanr(scores_1, scores_2),
            "kendalltau": kendalltau(scores_1, scores_2),
            "Neff": Neff,
            "L": L,
            "linreg": linregress(scores_1, scores_2)
        }

    stats_dump_file = plot_dir + "/stats_dump.json"
    with open(stats_dump_file, 'w') as outfile:
        json.dump(stats_dict, outfile)

    plot_boxplot_correlation(
        stats_dict, method_1, method_2,
        ["Pearson r", "Spearman rho", "Kendalls tau", "linear fit slope"],
        plot_dir)
def main():
    """Scatter the APC term against the entropy correction term per protein.

    Every ``*braw.gz`` file in the braw directory is processed: columns
    reported as gapped by ``io.remove_gapped_positions`` (called with
    ``max_gap_percentage=50``) are dropped, the entropy correction term and
    the APC term are computed on the remaining columns, their upper triangles
    are plotted against each other and their Pearson correlation is stored.
    Finally all correlation coefficients are passed to
    ``plot_boxplot_correlation``.
    """
    cli = parse_args()

    braw_dir, alignment_dir, plot_dir = (cli.braw_dir, cli.alignment_dir,
                                         cli.plot_dir)

    # debug: fixed locations replace the values from the command line
    braw_dir = "/home/vorberg//work/data/ccmgen/psicov/predictions_pcd/"
    alignment_dir = "/home/vorberg//work/data/ccmgen/psicov/alignments/"
    plot_dir = "/home/vorberg//work/plots/ccmgen/psicov/scatter_apc_vs_ec/pcd/"

    correlations, protein_names = [], []
    for braw_path in glob.glob(braw_dir + "/*braw.gz"):

        name = os.path.basename(braw_path).split('.')[0]
        protein_names.append(name)
        print(name)

        # couplings and the meta data stored alongside them
        couplings = raw.parse_msgpack(braw_path)
        meta = couplings.meta
        neff = np.round(u.find_dict_key("neff", meta), decimals=3)
        lambda_w = np.round(u.find_dict_key("lambda_pair", meta), decimals=3)
        n_columns = couplings.ncol

        # alignment and its single-column frequencies
        msa = io.read_alignment(alignment_dir + "/" + name + ".aln")
        single_freq, _pair_freq = au.calculate_frequencies(
            msa, au.uniform_pseudocounts)

        # columns kept for the analysis and the index helpers built on them
        _msa_ungapped, gapped = io.remove_gapped_positions(
            msa, max_gap_percentage=50)
        kept = [pos for pos in range(n_columns) if pos not in gapped]
        rows, cols = np.triu_indices(len(kept), k=1)
        kept_block = np.ix_(kept, kept)

        def upper_triangle(matrix):
            # flatten the strict upper triangle of a kept-columns matrix
            return matrix[rows, cols]

        # entropy correction term, restricted to the kept columns
        uij, scaling_factor = bu.compute_entropy_correction(
            single_freq,
            neff,
            lambda_w,
            couplings.x_pair,
            entropy=True,
            squared=False,
            nr_states=20)
        ec_full = scaling_factor * np.sqrt(np.sum(uij, axis=(3, 2)))
        ec_kept = ec_full[kept_block]

        #compute joint EC instead of geometric mean of per-column entropies
        # uij, scaling_factor = bu.compute_joint_entropy_correction(pair_freq, neff, lambda_w, braw.x_pair, nr_states = 20)
        # ec_term = scaling_factor * uij
        # ec_term_ungapped = ec_term[non_gapped_positions, :]
        # ec_term_ungapped = ec_term_ungapped[:, non_gapped_positions]

        # uncorrected contact scores, restricted to the kept columns
        scores = bu.compute_l2norm_from_braw(couplings.x_pair,
                                             apc=False,
                                             squared=False)
        scores_kept = scores[kept_block]

        # APC term: product of column means divided by the overall mean
        column_means = np.mean(scores_kept, axis=0)
        apc_kept = (column_means[:, None] * column_means[None, :] /
                    np.mean(scores_kept))

        # scatter plot with the residue pair as hover text
        hover_text = [
            "i: {0}<br>j: {1}".format(i, j) for i, j in zip(rows, cols)
        ]
        plot_scatter(upper_triangle(apc_kept), upper_triangle(ec_kept),
                     hover_text, plot_dir + "/" + name + "_apc_vs_ec.html")

        # Pearson correlation coefficient between both terms
        coefficient = pearsonr(upper_triangle(apc_kept),
                               upper_triangle(ec_kept))[0]
        correlations.append(coefficient)

    # boxplot over all proteins
    plot_boxplot_correlation(correlations, protein_names,
                             plot_dir + "/boxplot_pearsonr_apc_vs_ec.html")
def main():
    """Scatter the APC term against the entropy correction term per protein.

    Every ``*braw.gz`` file in the braw directory is processed: columns
    reported as gapped by ``io.remove_gapped_positions`` (called with
    ``max_gap_percentage=50``) are dropped, the entropy correction term and
    the APC term are computed on the remaining columns, their upper triangles
    are plotted against each other and their Pearson correlation is stored.
    Finally all correlation coefficients are passed to
    ``plot_boxplot_correlation``.
    """
    cli = parse_args()

    braw_dir, alignment_dir, plot_dir = cli.braw_dir, cli.alignment_dir, cli.plot_dir

    # debug: fixed locations replace the values from the command line
    braw_dir = "/home/vorberg//work/data/ccmgen/psicov/predictions_pcd/"
    alignment_dir = "/home/vorberg//work/data/ccmgen/psicov/alignments/"
    plot_dir = "/home/vorberg//work/plots/ccmgen/psicov/scatter_apc_vs_ec/pcd/"

    correlations, protein_names = [], []
    for braw_path in glob.glob(braw_dir + "/*braw.gz"):

        name = os.path.basename(braw_path).split('.')[0]
        protein_names.append(name)
        print(name)

        # couplings and the meta data stored alongside them
        couplings = raw.parse_msgpack(braw_path)
        meta = couplings.meta
        neff = np.round(u.find_dict_key("neff", meta), decimals=3)
        lambda_w = np.round(u.find_dict_key("lambda_pair", meta), decimals=3)
        n_columns = couplings.ncol

        # alignment and its single-column frequencies
        msa = io.read_alignment(alignment_dir + "/" + name + ".aln")
        single_freq, _pair_freq = au.calculate_frequencies(msa, au.uniform_pseudocounts)

        # columns kept for the analysis and the index helpers built on them
        _msa_ungapped, gapped = io.remove_gapped_positions(msa, max_gap_percentage=50)
        kept = [pos for pos in range(n_columns) if pos not in gapped]
        rows, cols = np.triu_indices(len(kept), k=1)
        kept_block = np.ix_(kept, kept)

        def upper_triangle(matrix):
            # flatten the strict upper triangle of a kept-columns matrix
            return matrix[rows, cols]

        # entropy correction term, restricted to the kept columns
        uij, scaling_factor = bu.compute_entropy_correction(
            single_freq, neff, lambda_w, couplings.x_pair,
            entropy=True, squared=False, nr_states=20)
        ec_full = scaling_factor * np.sqrt(np.sum(uij, axis=(3, 2)))
        ec_kept = ec_full[kept_block]

        #compute joint EC instead of geometric mean of per-column entropies
        # uij, scaling_factor = bu.compute_joint_entropy_correction(pair_freq, neff, lambda_w, braw.x_pair, nr_states = 20)
        # ec_term = scaling_factor * uij
        # ec_term_ungapped = ec_term[non_gapped_positions, :]
        # ec_term_ungapped = ec_term_ungapped[:, non_gapped_positions]

        # uncorrected contact scores, restricted to the kept columns
        scores = bu.compute_l2norm_from_braw(couplings.x_pair, apc=False, squared=False)
        scores_kept = scores[kept_block]

        # APC term: product of column means divided by the overall mean
        column_means = np.mean(scores_kept, axis=0)
        apc_kept = column_means[:, None] * column_means[None, :] / np.mean(scores_kept)

        # scatter plot with the residue pair as hover text
        hover_text = ["i: {0}<br>j: {1}".format(i, j) for i, j in zip(rows, cols)]
        plot_scatter(
            upper_triangle(apc_kept),
            upper_triangle(ec_kept),
            hover_text,
            plot_dir + "/" + name + "_apc_vs_ec.html")

        # Pearson correlation coefficient between both terms
        coefficient = pearsonr(upper_triangle(apc_kept), upper_triangle(ec_kept))[0]
        correlations.append(coefficient)

    # boxplot over all proteins
    plot_boxplot_correlation(correlations, protein_names, plot_dir + "/boxplot_pearsonr_apc_vs_ec.html")
def main():
    """Plot a contact map for a score matrix given on the command line.

    The scores are either read from a mat file (``--mat_file``) or computed
    from a braw file (``--braw_file``); argparse enforces that exactly one of
    the two is given. ``--apc`` and ``--entropy_correction`` select the
    correction and are reflected in the name of the output file, which is
    written into the ``--plot-out`` directory (the current directory when the
    option is omitted).
    """
    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')

    group_append = parser.add_mutually_exclusive_group(required=True)
    group_append.add_argument('-m', '--mat_file', type=str, dest='mat_file', help='path to mat file')
    group_append.add_argument('-b', '--braw_file', type=str, dest='braw_file', help='path to braw file')

    parser.add_argument("-o", "--plot-out", dest="plot_out", type=str, help="directory for plot")

    parser.add_argument("--seqsep", type=int, default=6, help="sequence separation")
    parser.add_argument("--contact_threshold", type=int, default=8,
                        help="contact definition; C_beta distance between residue pairs")
    parser.add_argument("--pdb_file", type=str, help="path to pdb file [optional] -  plotting true contacs")
    parser.add_argument("--alignment_file", type=str, help="path to alignment file [optional] - plotting coverage")
    parser.add_argument("--apc", action="store_true", default=False, help="Apply average product correction")
    parser.add_argument("--entropy_correction", action="store_true", default=False, help="Apply entropy correction")

    args = parser.parse_args()

    # The options are used exactly as parsed. Earlier debug assignments
    # replaced apc, entropy_correction and alignment_file with fixed values,
    # which silently disabled the corresponding command line options.
    plot_out = args.plot_out or ""  # -o is optional: default to current dir
    seqsep = args.seqsep
    contact_threshold = args.contact_threshold
    apc = args.apc
    entropy_correction = args.entropy_correction

    alignment_file = args.alignment_file
    pdb_file = args.pdb_file

    if args.braw_file is not None:
        ### Compute l2norm score from braw
        if entropy_correction and alignment_file is None:
            # the entropy correction is computed from alignment frequencies
            parser.error("--entropy_correction with --braw_file requires --alignment_file")

        braw_file = args.braw_file
        protein_name = '.'.join(os.path.basename(braw_file).split('.')[:-1])
        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
        lambda_w = np.round(u.find_dict_key("lambda_pair", meta_info), decimals=3)

        if entropy_correction:
            alignment = io.read_alignment(alignment_file)
            single_freq, pair_freq = au.calculate_frequencies(alignment, au.uniform_pseudocounts)
            mat = bu.compute_corrected_mat_entropy(braw.x_pair, single_freq, neff, lambda_w, entropy=True, squared=False, nr_states=20)
        else:
            mat = bu.compute_l2norm_from_braw(braw, apc)
    else:
        ### Read score from mat
        # the required, mutually exclusive group guarantees mat_file is set
        mat_file = args.mat_file
        mat = io.read_matfile(mat_file)
        if apc:
            mat = bu.compute_apc_corrected_matrix(mat)
        meta_info = io.read_json_from_mat(mat_file)
        protein_name = os.path.basename(mat_file).split('.')[0]

    # entropy correction takes precedence over apc in the file name
    correction = ""
    if apc:
        correction = "_apc"
    if entropy_correction:
        correction = "_ec"

    # join instead of concatenating so that a directory given without a
    # trailing separator still yields a file inside that directory
    plot_file = os.path.join(
        plot_out,
        protein_name + "_seqsep" + str(seqsep) + "_contacthr" + str(contact_threshold) + correction + ".html")

    neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
    N = u.find_dict_key("nrow", meta_info)
    L = u.find_dict_key("ncol", meta_info)
    title = protein_name + "<br>L: " + str(L) + " N: " + str(N) + " Neff: " + str(neff) + " diversity: " + str(
        np.round(np.sqrt(N) / L, decimals=3))
    plot_contact_map(mat, seqsep, contact_threshold, title, plot_file, alignment_file=alignment_file, pdb_file=pdb_file)