def plot_contact_map(mat, seqsep, contact_threshold, title, plot_file=None, alignment_file=None, pdb_file=None):
    L = len(mat)
    indices_upper_tri = np.triu_indices(L, seqsep)

    ### if an alignment file is specified, add a gap-percentage subplot
    if alignment_file:
        alignment = io.read_alignment(alignment_file)
        gaps_percentage_plot = aligncov.plot_percentage_gaps_per_position(alignment, plot_file=None)
    else:
        gaps_percentage_plot = None

    plot_matrix = pd.DataFrame()

    ### compute distance map from pdb file
    if pdb_file:
        observed_distances = pdb.distance_map(pdb_file, L)
        plot_matrix['distance'] = observed_distances[indices_upper_tri]
        plot_matrix['contact'] = ((plot_matrix.distance < contact_threshold) * 1).tolist()

    # add scores (1-based residue numbering)
    plot_matrix['residue_i'] = indices_upper_tri[0] + 1
    plot_matrix['residue_j'] = indices_upper_tri[1] + 1
    plot_matrix['confidence'] = mat[indices_upper_tri]

    ### Plot Contact Map
    plot.plot_contact_map_someScore_plotly(plot_matrix, title, seqsep, gaps_percentage_plot, plot_file=plot_file)
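# Hypothetical usage sketch (the score matrix and paths below are made up;
# requires the io/pdb/plot/aligncov helper modules this script imports):
# scores = np.random.rand(150, 150)
# plot_contact_map(scores, seqsep=6, contact_threshold=8,
#                  title="demo contact map",
#                  plot_file="/tmp/contact_map.html",
#                  alignment_file="/tmp/demo.aln",
#                  pdb_file="/tmp/demo.pdb")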
def main():

    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting amino acid distribution over the alignment.')
    parser.add_argument("alignment_file",       type=str,   help="path to aligment file")
    parser.add_argument("plot_file",            type=str,   help="path to plot file")

    args = parser.parse_args()

    alignment_file              = str(args.alignment_file)
    plot_file                   = str(args.plot_file)

    #protein='2cuaA'
    #alignment_file="/home/vorberg/work/data/ccmgen/psicov/alignments/" + protein + ".aln"
    #alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_incmr/" + protein + ".star.aln"
    #plot_file = "/home/vorberg/alignment_"+protein+".cheating_12_incmr.html"


    alignment = io.read_alignment(alignment_file)
    protein = os.path.basename(alignment_file).split(".")[0]
    N = float(len(alignment))
    L = len(alignment[0])

    title="Distribution of Amino Acids per position in alignment of " + str(protein) + \
          "<br> N="+str(N) + ", L="+str(L)

    #compute amino acid counts only once
    aa_counts_single, aa_counts_pair = au.compute_counts(alignment, compute_weights=False)

    # note: both calls write to the same plot_file, so the frequency plot
    # (freq=True) is overwritten by the count plot (freq=False)
    plot_amino_acid_distribution_per_position(aa_counts_single, title, plot_file, freq=True)
    plot_amino_acid_distribution_per_position(aa_counts_single, title, plot_file, freq=False)
def plot_alignment_entropy(alignment_file, plot_dir=None):

    # read alignment
    protein = os.path.basename(alignment_file).split(".")[0]
    alignment = io.read_alignment(alignment_file)
    N = float(len(alignment))
    L = len(alignment[0])

    alignment = alignment.transpose()

    #determine amino acid frequencies (without any pseudocounts)
    aa_freq_per_pos = np.zeros((21, L))
    for position in range(L):
        aa_counts = Counter(alignment[position])
        for aa, counts in aa_counts.items():
            freq = counts / N
            aa_freq_per_pos[aa, position] = freq

    aa_freq_per_pos = aa_freq_per_pos[1:]  #remove gaps (state 0)
    aa_freq_per_pos = aa_freq_per_pos.transpose()

    entropy_per_position = [
        entropy(aa_freq_per_pos[pos], base=2) for pos in range(L)
    ]

    #create plot
    data = []
    data.append(
        go.Scatter(x=list(range(L)),
                   y=entropy_per_position,
                   name="entropy",
                   mode="lines"))

    layout = {
        'title':
        "Entropy (base 2) in alignment of " + str(protein) + "<br> N=" +
        str(N) + ", L=" + str(L),
        'xaxis': {
            'title': "Alignment Position"
        },
        'yaxis': {
            'title': "Entropy"
        },
        'font': {
            'size': 18
        }
    }

    plot = {'data': data, 'layout': layout}
    if plot_dir is None:
        return plot
    else:
        plot_file = plot_dir + "/alignment_entropy_" + protein + ".html"
        plotly_plot(plot, filename=plot_file, auto_open=False)
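# Sanity check for the entropy values above (assuming `entropy` is
# scipy.stats.entropy): a column uniform over four residues has 2 bits,
# a fully conserved column has 0 bits.
# >>> entropy([0.25, 0.25, 0.25, 0.25], base=2)   # -> 2.0
# >>> entropy([1.0, 0.0, 0.0, 0.0], base=2)       # -> 0.0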
    def __init__(self, alignment_file, seq_separation=8, contact_threshold=8, non_contact_threshold=25):

        self.alignment_file = alignment_file
        self.protein = os.path.basename(self.alignment_file).split(".")[0]
        self.msa = io.read_alignment(alignment_file)

        self.seq_separation = seq_separation
        self.contact_threshold = contact_threshold
        self.non_contact_threshold = non_contact_threshold
        self.max_gap_percentage = 0.9

        self.L = self.msa.shape[1]
        self.N = self.msa.shape[0]
        self.weights = None
        self.neff = None

        #indices of upper triangle without diagonal
        self.ij_ind_upper = np.triu_indices(self.L, k=self.seq_separation)

        #with gaps and without pseudocounts!
        self.single_counts = None
        self.pairwise_counts = None

        #without gaps and with pseudocounts!
        self.single_frequencies = None
        self.pairwise_frequencies = None

        self.Ni = None
        self.Nij = None

        self.features = {'global': {}, 'single': {}, 'pair': {}}

        self.compute_frequencies(pseudocounts='background')
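# Note on np.triu_indices as used above: with k=seq_separation it returns the
# (i, j) index arrays of all pairs with j - i >= seq_separation, e.g.
# >>> np.triu_indices(4, k=2)
# (array([0, 0, 1]), array([2, 3, 3]))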
def main():

    ### Parse arguments
    parser = argparse.ArgumentParser(
        description='Plotting sequence similarity matrix.')
    parser.add_argument("alignment_file",
                        type=str,
                        help="path to aligment file")
    parser.add_argument("plot_dir", type=str, help="path to plot dir")

    args = parser.parse_args()

    alignment_file = str(args.alignment_file)
    plot_dir = str(args.plot_dir)

    plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/seq_identity_matrices_alignments/"
    protein = '1dqgA'  #'1i5gA' # '1dqgA'#'1ag6A'#'1ej0A'#'1g2rA'
    topology = ""
    topology = ".star"
    topology = ".binary"
    # alignment_file="/home/vorberg/" + protein +topology+".mr50.aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/alignments/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12/" + protein +topology+".aln"
    alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_incmr_4/" + protein + topology + ".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr100/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr10/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr1/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_lfactor1e-3_cheating_12_mr100/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_lfactor1e-3_cheating_12_mr10/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_lfactor1e-3_cheating_12_mr1/" + protein +topology+".aln"

    alignment = io.read_alignment(alignment_file)
    protein = os.path.basename(alignment_file).split(".")[0]

    #compute amino acid counts only once
    similarity_matrix = compute_seq_identities(alignment)
    #similarity_matrix = hamming_distance_matrix(alignment)
    print(np.mean(similarity_matrix[-100:, :-100]))  # mean identity: last 100 sequences vs the rest
    print(np.min(similarity_matrix))
    print(np.mean(similarity_matrix))

    #plot seq similarity matrix
    plot_file = plot_dir + "/sequence_similarity_matrix_" + protein + topology + ".html"
    plot_seq_id_matrix(similarity_matrix, plot_file=plot_file)

    #plot dendrogram with similarity matrix - use hamming distance matrix
    plot_file = plot_dir + "/sequence_similarity_matrix_dendrogram_" + protein + topology + ".html"
    plot_seq_id_matrix_with_dendrogram(alignment,
                                       similarity_matrix,
                                       plot_file=plot_file)

    #plot boxplot of pairwise sequence identities for one protein and different methods
    plot_file = plot_dir + "/boxplot_sequence_similarities_" + topology + ".html"
    alignment_dir_list = [
        "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr1/",
        "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr3/",
        "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr10/",
        "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr100/"
    ]
    plot_seq_id_boxplot(alignment_dir_list,
                        topology,
                        plot_file=plot_file,
                        protein=None)

    plot_file = plot_dir + "/boxplot_sequence_similarities" + topology + "." + protein + ".html"
    plot_seq_id_boxplot(alignment_dir_list,
                        topology,
                        plot_file,
                        protein=protein)
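# compute_seq_identities is not defined in this excerpt. A minimal sketch of a
# pairwise sequence-identity matrix for a numeric MSA (rows = sequences,
# relying on the `np` alias used throughout this module) could look like this;
# here matching gap states also count as identical positions:
def seq_identity_matrix_sketch(alignment):
    N = alignment.shape[0]
    identities = np.ones((N, N))
    for i in range(N):
        # fraction of identical columns between sequence i and every sequence
        identities[i] = (alignment == alignment[i]).mean(axis=1)
    return identities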
def main():

    parser = argparse.ArgumentParser(description="Filter alignments by size, protein length, gap content and number of contacts")

    parser.add_argument("-a", "--alignment",    dest="ali",                                 help="path to alignment files")
    parser.add_argument("-p", "--pdb",          dest="pdb",                                 help="path to pdb files")
    parser.add_argument("-o", "--output",       dest="output",                              help="path to filter directory")
    parser.add_argument("--min-N",              dest="minN",    default=10,     type=int,   help="Minimum number of sequences")
    parser.add_argument("--max-gap-percentage", dest="maxGap",  default=0.8,    type=float, help="Maximum percentage of gaps in alignment")
    parser.add_argument("--max-L",              dest="maxL",    default=600,    type=float, help="Maximum length of protein")
    parser.add_argument("--min-L",              dest="minL",    default=20,     type=float, help="Minimum length of protein")
    parser.add_argument("--min-contacts",       dest="mincontacts", default=1,  type=int,   help="Minimum number of contacts")
    parser.add_argument("--contact-threshold",  dest="contact_threshold", default=8, type=int, help="Contact defined as distance between Cbeta atoms < threshold")
    parser.add_argument("--sequence-separation",  dest="seqsep", default=12, type=int,      help="Consider only residues separated by this many positions in sequence.")

    args = parser.parse_args()
    alignment_dir  = args.ali
    pdb_dir  = args.pdb
    output_dir     = args.output

    minL = args.minL
    maxL = args.maxL
    minN = args.minN
    maxgappercentage = args.maxGap
    mincontacts = args.mincontacts
    contact_threshold = args.contact_threshold
    seqsep = args.seqsep

    aln_files = glob.glob(alignment_dir + "/*")


    for alignment_file in aln_files:
        protein = os.path.basename(alignment_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip protein.".format(pdb_file))
            continue

        alignment = io.read_alignment(alignment_file, format="psicov")

        N = alignment.shape[0]
        L = alignment.shape[1]

        percent_gaps = np.mean(ali_ut.compute_gaps_per_position(alignment))

        distance_map = pdb.distance_map(pdb_file, L)
        nr_contacts = np.sum((distance_map[np.triu_indices(L, k=seqsep)] < contact_threshold) * 1)

        # avoid shadowing the built-in filter()
        filter_protein = False
        if N < minN:
            print("Alignment size {0} is smaller than filter threshold of {1}".format(N, minN))
            filter_protein = True

        if L < minL:
            print("Protein length {0} is smaller than filter threshold of {1}".format(L, minL))
            filter_protein = True

        if L > maxL:
            print("Protein length {0} is bigger than filter threshold of {1}".format(L, maxL))
            filter_protein = True

        if percent_gaps > maxgappercentage:
            print("Percentage of gaps in alignment ({0}) is larger than filter threshold of {1}".format(percent_gaps, maxgappercentage))
            filter_protein = True

        if nr_contacts < mincontacts:
            print("Number of contacts (contact_thr = {0}, sequence separation = {1}) in protein structure ({2}) is less than {3}".format(contact_threshold, seqsep, nr_contacts, mincontacts))
            filter_protein = True


        if filter_protein:
            dest_alignment_file = output_dir + "/" + os.path.basename(alignment_file)
            os.rename(alignment_file, dest_alignment_file)
            print("Successfully moved {0} to {1}".format(alignment_file, dest_alignment_file))
def plot_seq_id_boxplot(alignment_dir_list, topology, plot_file, protein=None):

    data = []

    for alignment_dir in alignment_dir_list:
        method = os.path.basename(os.path.abspath(alignment_dir))
        print(method)

        box_data = []

        if protein is not None:
            alignment_file = alignment_dir + "/" + protein + topology + ".aln"
            alignment = io.read_alignment(alignment_file)
            similarity_matrix = compute_seq_identities(alignment)
            box_data = similarity_matrix[np.triu_indices(
                similarity_matrix.shape[0], k=1)]

        else:
            alignment_files = glob.glob(alignment_dir + "/*" + topology +
                                        ".aln")

            for alignment_file in alignment_files:
                alignment = io.read_alignment(alignment_file)
                similarity_matrix = compute_seq_identities(alignment)
                # mean pairwise sequence identity for this protein; note: do not
                # reassign `protein` here, or the `protein is None` check below breaks
                mean_seq_id = np.mean(similarity_matrix[np.triu_indices(
                    similarity_matrix.shape[0], k=1)])

                box_data.append(mean_seq_id)

        box = go.Box(y=box_data,
                     boxmean='sd',
                     boxpoints='outliers',
                     name=method,
                     marker=dict(opacity=1),
                     orientation='v',
                     showlegend=False)

        data.append(box)

    plot = {
        "data": data,
        "layout": go.Layout(yaxis=dict(exponentformat='e',
                                       showexponent='all',
                                       range=[0, 1]),
                            font=dict(size=18))
    }

    if protein is None:
        plot['layout']['title'] = "Mean sequence Ids for all proteins"
        plot['layout']['yaxis']['title'] = "mean sequence id"
    else:
        plot['layout']['title'] = "Pairwise sequence identities for protein {0}".format(protein)
        plot['layout']['yaxis']['title'] = "pairwise sequence id"

    plotly_plot(plot, filename=plot_file, auto_open=False)
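# Hypothetical usage sketch (directories are made up):
# plot_seq_id_boxplot(
#     ["/data/sampled_pcd_cheating_12_mr1/", "/data/sampled_pcd_cheating_12_mr10/"],
#     topology=".star",
#     plot_file="/tmp/boxplot_seq_id.html",
#     protein=None)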
    def __create_evaluation_file(self, protein, pdb_file, aln_file, seqsep):
        """
        Create evaluation file for a protein that contains information about:
             - cb_distance
             - i
             - j

        :param protein:     protein identifier
        :param pdb_file:    path to pdb file for protein
        :param aln_file:    path to alignment file for protein
        :param seqsep:      minimal assumed sequence separation
        :return:
        """

        if not os.path.exists(pdb_file):
            raise IOError("PDB File " + str(pdb_file) + " does not exist.")

        if not os.path.exists(aln_file):
            raise IOError("Alignment File " + str(aln_file) + " does not exist.")

        # determine indices that are resolved in PDB and have minimal required seq sep
        distance_matrix = pdb.distance_map(pdb_file)

        # get residue pairs that are resolved and (j-i) > seqsep
        indices_pairs_resolved = list(zip(*np.where(~np.isnan(distance_matrix))))
        indices_pairs_seqsep = list(zip(*np.triu_indices(len(distance_matrix), seqsep)))
        ij_indices = list(set(indices_pairs_resolved).intersection(indices_pairs_seqsep))

        # Create the evaluation file
        ij_i = [i for i, j in ij_indices]
        ij_j = [j for i, j in ij_indices]
        eval_df = pd.DataFrame(
            {
                'i': ij_i,
                'j': ij_j,
                'cb_distance': distance_matrix[ij_i, ij_j],
            }
        )
        eval_df.sort_values(by=['i', 'j'], inplace=True)

        #read alignment
        alignment = io.read_alignment(aln_file)

        #compute percentage of gaps
        percent_gaps = np.mean(ali_ut.compute_gaps_per_position(alignment))

        #compute effective number of sequences
        weights = weighting.calculate_weights_simple(alignment, 0.8, False)
        neff = np.sum(weights)

        meta_protein = {
            'name': protein,
            'L': alignment.shape[1],
            'N': alignment.shape[0],
            'diversity': np.sqrt(alignment.shape[0]) / alignment.shape[1],
            'gap_percentage': percent_gaps,
            'neff': neff
        }

        # write evaluation data to file
        evaluation_file = self.eval_dir + "/" + protein + ".protein"
        io.write_matfile(eval_df.values, evaluation_file, meta_protein)

        #add to proteins in evaluation suite
        if protein not in self.proteins:
            self.proteins.append(protein)
def collect_data(braw_dir, alignment_dir, pdb_dir, size, diversity_thr,
                 contact_threshold, noncontact_threshold, Nij_threshold):

    braw_files = glob.glob(braw_dir + "/*braw.gz")

    couplings_df = pd.DataFrame()
    nr_contacts = 0
    nr_noncontacts = 0
    sequence_separation = 10

    for braw_file in braw_files:
        #braw_file = braw_files[0]
        if nr_contacts >= size and nr_noncontacts >= size:
            break

        if not os.path.exists(braw_file):
            print("Braw File " + str(braw_file) + "cannot be found. ")
            continue

        braw = raw.parse_msgpack(braw_file)
        L = braw.ncol
        if 'msafile' in braw.meta['workflow'][0]:
            N = braw.meta['workflow'][0]['msafile']['nrow']
        else:
            N = braw.meta['workflow'][0]['parameters']['msafile']['nrow']
        diversity = np.sqrt(N) / L
        if diversity < diversity_thr:
            continue

        protein = os.path.basename(braw_file).split(".")[0]

        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(alignment_file):
            print("Alignment File " + str(alignment_file) +
                  " cannot be found. ")
            continue

        pdb_file = pdb_dir + "/" + protein.replace("_", "") + ".pdb"
        if not os.path.exists(pdb_file):
            print("PDB File " + str(pdb_file) + " cannot be found. ")
            continue

        print(protein, "N =", N, "L =", L, "diversity =", diversity)

        indices_upper_tri = np.triu_indices(L, k=sequence_separation)

        #filter pair indices that have specified Cb distances
        dist_matrix = pdb.distance_map(pdb_file, L)
        indices_contact = np.where(
            (dist_matrix[indices_upper_tri] < contact_threshold))[0]
        indices_noncontact = np.where(
            (dist_matrix[indices_upper_tri] > noncontact_threshold))[0]

        #filter pair indices that have more than Nij_threshold ungapped sequences
        alignment = io.read_alignment(alignment_file)
        weights = weighting.calculate_weights_simple(alignment, 0.8, True)
        pairwise_counts = counts.pair_counts(alignment, weights)
        Nij = pairwise_counts[:, :, :20, :20].sum(3).sum(2)
        indices_Nij_true = np.where(Nij[indices_upper_tri] > Nij_threshold)[0]

        #get pair indices that fulfill both requirements
        indices_contact = list(
            set(indices_contact).intersection(indices_Nij_true))
        indices_noncontact = list(
            set(indices_noncontact).intersection(indices_Nij_true))

        #get couplings for filtered pairs
        braw_reshaped = braw.x_pair[:, :, :20, :20].reshape(L, L, 400)
        if nr_contacts < size:
            couplings_contact = pd.DataFrame(
                braw_reshaped[indices_upper_tri][indices_contact])
            couplings_contact['distance'] = dist_matrix[indices_upper_tri][
                indices_contact]
            couplings_df = pd.concat([couplings_df, couplings_contact])
            nr_contacts += len(indices_contact)

        if nr_noncontacts < size:
            couplings_noncontact = pd.DataFrame(
                braw_reshaped[indices_upper_tri][indices_noncontact])
            couplings_noncontact['distance'] = dist_matrix[indices_upper_tri][
                indices_noncontact]
            couplings_df = pd.concat([couplings_df, couplings_noncontact])
            nr_noncontacts += len(indices_noncontact)

        print("Nr of couplings contact: {0} and non-contact: {1}".format(
            nr_contacts, nr_noncontacts))

    couplings_df['class'] = (couplings_df['distance'] < contact_threshold) * 1

    return couplings_df
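# Hypothetical usage sketch (directories and thresholds are made up):
# couplings = collect_data("/data/braw/", "/data/alignments/", "/data/pdb/",
#                          size=10000, diversity_thr=0.3,
#                          contact_threshold=8, noncontact_threshold=25,
#                          Nij_threshold=100)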
    def collect_data(self, protein_set=None):
        """
        Set up a list of residue pairs that will be used for training
            - get the same amount of contacts/non-contacts
            - according to some filtering criteria (seqsep, diversity, etc)

        :param protein_set: list of protein identifiers or None
                            if None, protein list will be parsed from braw files
        :return:
        """

        if not protein_set:
            protein_set = []
            braw_files = glob.glob(self.braw_dir + "/*braw*")
            for braw in braw_files:
                protein_set.append(os.path.basename(braw).split(".")[0])

        # shuffle rows WITH seed for reproducibility ! ! !
        random.seed(self.seed)
        random.shuffle(protein_set)

        print('\nNumber of available proteins: {0}. Selecting {1} contacts and {2} non-contacts...'.format(
            len(protein_set), self.number_of_pairs, self.number_of_pairs * self.balance))

        nr_pairs_contact_crossval = 0
        nr_pairs_noncontact_crossval = 0
        nr_pairs_contacts = 0
        nr_pairs_bg = 0

        # Iterate over protein files
        for p in protein_set:
            # p = protein_set[0]

            # set up file names
            psicov_file     = self.psicov_dir   + "/" + p + ".filt.psc"
            braw_file_gz    = self.braw_dir     + "/" + p + ".filt.braw.gz"
            qijabfile       = self.qijab_dir    + "/" + p + ".filt.bqij.gz"
            pdb_file        = self.pdb_dir      + "/" + p + ".pdb"
            # p_short = p.replace("_", "")
            # psicov_file     = self.psicov_dir   + "/" + p + ".psc"
            # braw_file_gz    = self.braw_dir     + "/" + p + ".braw.gz"
            # qijabfile       = self.qijab_dir    + "/" + p + ".bqijab.gz"
            # pdb_file        = self.pdb_dir      + "/" + p_short + "_ren.pdb"


            # check if braw file exists, otherwise continue
            if not os.path.isfile(braw_file_gz):
                print("Binary raw file {0} for protein {1} could not be found!".format(braw_file_gz, p))
                continue

            if not os.path.isfile(psicov_file):
                print("Alignment file {0} for protein {1} could not be found!".format(psicov_file, p))
                continue

            if not os.path.isfile(qijabfile):
                print("qij file {0} for protein {1} could not be found!".format(qijabfile, p))
                continue

            if not os.path.isfile(pdb_file):
                print("PDB file {0} for protein {1} could not be found!".format(pdb_file, p))
                continue

            psicov = io.read_alignment(psicov_file)
            N = len(psicov)
            L = len(psicov[0])
            diversity = np.sqrt(N) / L

            # skip proteins with low diversities
            if diversity < self.diversity_thr:
                continue

            indices_contact, indices_non_contact = self.get_residue_pairs_from_protein(
                braw_file_gz, qijabfile, pdb_file, psicov)

            # if no data
            if len(indices_contact[0]) == 0 and len(indices_non_contact[0]) == 0:
                continue

            protein_data = {}
            protein_data['N'] = N
            protein_data['L'] = L
            protein_data['diversity'] = diversity
            protein_data['braw_file_path'] = braw_file_gz
            protein_data['msafilename'] = psicov_file
            protein_data['qijabfilename'] = qijabfile
            protein_data['residue_i'] = []
            protein_data['residue_j'] = []
            protein_data['contact'] = []


            # shuffle indices so as not to introduce bias when choosing only the first X
            # pairs from each protein; re-seeding before each shuffle applies the same
            # permutation to the paired i and j lists (random.shuffle no longer accepts
            # a second argument in Python 3)
            for index_list in (indices_contact[0], indices_contact[1],
                               indices_non_contact[0], indices_non_contact[1]):
                random.seed(self.seed)
                random.shuffle(index_list)


            if len(indices_contact[0]) > 0 and (nr_pairs_contacts < self.number_of_pairs):
                protein_data['residue_i'].extend(indices_contact[0][:self.maxcontacts_per_protein])
                protein_data['residue_j'].extend(indices_contact[1][:self.maxcontacts_per_protein])
                protein_data['contact'].extend([1] * len(indices_contact[0][:self.maxcontacts_per_protein]))
                nr_pairs_contacts += len(indices_contact[0][:self.maxcontacts_per_protein])
                self.training_data[p] = protein_data

            if len(indices_non_contact[0]) > 0 and nr_pairs_bg < (self.number_of_pairs * self.balance):
                protein_data['residue_i'].extend(indices_non_contact[0][:self.maxnoncontacts_per_protein])
                protein_data['residue_j'].extend(indices_non_contact[1][:self.maxnoncontacts_per_protein])
                protein_data['contact'].extend([0] * len(indices_non_contact[0][:self.maxnoncontacts_per_protein]))
                nr_pairs_bg += len(indices_non_contact[0][:self.maxnoncontacts_per_protein])
                self.training_data[p] = protein_data


            if p not in self.training_data:

                if len(indices_contact[0]) > 0 and nr_pairs_contact_crossval < self.nr_crossval_pairs:
                    protein_data['residue_i'].extend(indices_contact[0][:self.maxcontacts_per_protein])
                    protein_data['residue_j'].extend(indices_contact[1][:self.maxcontacts_per_protein])
                    protein_data['contact'].extend([1] * len(indices_contact[0][:self.maxcontacts_per_protein]))
                    nr_pairs_contact_crossval += len(indices_contact[0][:self.maxcontacts_per_protein])
                    self.test_data[p] = protein_data

                if len(indices_non_contact[0]) > 0 and nr_pairs_noncontact_crossval < (self.nr_crossval_pairs * self.balance):
                    protein_data['residue_i'].extend(indices_non_contact[0][:self.maxnoncontacts_per_protein])
                    protein_data['residue_j'].extend(indices_non_contact[1][:self.maxnoncontacts_per_protein])
                    protein_data['contact'].extend([0] * len(indices_non_contact[0][:self.maxnoncontacts_per_protein]))
                    nr_pairs_noncontact_crossval += len(indices_non_contact[0][:self.maxnoncontacts_per_protein])
                    self.test_data[p] = protein_data


            print("{0}, #pairs in training set: contact={1} bg={2}, #pairs in testset: contact={3} bg={4}".format(
                    p,  nr_pairs_contacts, nr_pairs_bg, nr_pairs_contact_crossval, nr_pairs_noncontact_crossval))

            # stop condition
            condition_training = [nr_pairs_contacts >= self.number_of_pairs,
                                  nr_pairs_bg >= (self.number_of_pairs * self.balance)]
            condition_test = [nr_pairs_contact_crossval >= self.nr_crossval_pairs,
                              nr_pairs_noncontact_crossval >= self.nr_crossval_pairs]
            if (all(condition_training) and all(condition_test)):
                break

        self.nr_pairs_contact = nr_pairs_contacts
        self.nr_pairs_noncontact = nr_pairs_bg
        self.nr_pairs_contact_cross_val = nr_pairs_contact_crossval
        self.nr_pairs_noncontact_cross_val = nr_pairs_noncontact_crossval
def main():

    ### Parse arguments
    parser = argparse.ArgumentParser(
        description='Plotting empirical vs model alignment statistics.')
    parser.add_argument("observed_alignments",
                        type=str,
                        help="path to original aligment files")
    parser.add_argument("sampled_alignments_pll",
                        type=str,
                        help="path to sampled alignment files (using PLL)")
    parser.add_argument("sampled_alignments_pcd",
                        type=str,
                        help="path to sampled alignment files (using PCD)")
    parser.add_argument("plot_dir",
                        type=str,
                        help="path to output directory for plots")

    args = parser.parse_args()

    observed_alignments_path = args.observed_alignments
    sampled_alignments_paths_pll = args.sampled_alignments_pll
    sampled_alignments_paths_pcd = args.sampled_alignments_pcd
    plot_dir = args.plot_dir
    log = False
    max_gap_pos = 50

    #debug
    # observed_alignments_path = "/home/vorberg/work/data/ccmgen/psicov/alignments/"
    # sampled_alignments_paths_pll = "/home/vorberg/work/data/ccmgen/psicov/sampled_pll/"
    # sampled_alignments_paths_pcd = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/pll_vs_pcd_comparison/alignment_statistics_correlation/"

    data_dict = {
        'pseudo-likelihood': {
            'x': [],
            'y': []
        },
        'contrastive divergence': {
            'x': [],
            'y': []
        }
    }

    observed_alignments = glob.glob(observed_alignments_path + "/*aln")
    for obs_aln_file in observed_alignments:
        protein = os.path.basename(obs_aln_file).split(".")[0]
        sampled_aln_file_pll = glob.glob(sampled_alignments_paths_pll + "/" +
                                         protein + "*.ind.aln")
        sampled_aln_file_pcd = glob.glob(sampled_alignments_paths_pcd + "/" +
                                         protein + "*.ind.aln")

        if len(sampled_aln_file_pll) == 0 or not os.path.exists(sampled_aln_file_pll[0]):
            print("Sampled PLL alignment file for {0} does not exist!".format(protein))
            continue

        if len(sampled_aln_file_pcd) == 0 or not os.path.exists(sampled_aln_file_pcd[0]):
            print("Sampled PCD alignment file for {0} does not exist!".format(protein))
            continue

        print(protein)

        #read in alignments and remove columns with >50% gaps
        alignment_o = io.read_alignment(obs_aln_file,
                                        max_gap_pos=100,
                                        max_gap_seq=100)
        L_original = alignment_o.shape[1]
        alignment_o, gapped_positions = io.remove_gapped_positions(
            alignment_o, max_gap_percentage=max_gap_pos)
        non_gapped_positions = [
            i for i in range(L_original) if i not in gapped_positions
        ]
        alignment_s_pll = io.read_alignment(sampled_aln_file_pll[0],
                                            max_gap_pos=100,
                                            max_gap_seq=100)
        alignment_s_pll = np.ascontiguousarray(
            alignment_s_pll[:, non_gapped_positions])
        alignment_s_pcd = io.read_alignment(sampled_aln_file_pcd[0],
                                            max_gap_pos=100,
                                            max_gap_seq=100)
        alignment_s_pcd = np.ascontiguousarray(
            alignment_s_pcd[:, non_gapped_positions])

        # compute amino acid counts
        single_freq_observed, pairwise_freq_observed = au.calculate_frequencies(
            alignment_o, au.uniform_pseudocounts)
        single_freq_sampled_pll, pairwise_freq_sampled_pll = au.calculate_frequencies(
            alignment_s_pll, au.uniform_pseudocounts)
        single_freq_sampled_pcd, pairwise_freq_sampled_pcd = au.calculate_frequencies(
            alignment_s_pcd, au.uniform_pseudocounts)

        # degap the frequencies (ignore gap frequencies)
        single_freq_observed = au.degap(single_freq_observed, False)
        single_freq_sampled_pll = au.degap(single_freq_sampled_pll, False)
        single_freq_sampled_pcd = au.degap(single_freq_sampled_pcd, False)
        pairwise_freq_observed = au.degap(pairwise_freq_observed, False)
        pairwise_freq_sampled_pll = au.degap(pairwise_freq_sampled_pll, False)
        pairwise_freq_sampled_pcd = au.degap(pairwise_freq_sampled_pcd, False)

        #reshape frequencies
        L = alignment_o.shape[1]
        indices_upper_triangle = np.triu_indices(L, k=1)

        x_single = single_freq_observed.flatten().tolist()
        y_single_pll = single_freq_sampled_pll.flatten().tolist()
        y_single_pcd = single_freq_sampled_pcd.flatten().tolist()
        pair_freq_observed = pairwise_freq_observed[
            indices_upper_triangle[0],
            indices_upper_triangle[1], :, :].flatten().tolist()
        pair_freq_sampled_pll = pairwise_freq_sampled_pll[
            indices_upper_triangle[0],
            indices_upper_triangle[1], :, :].flatten().tolist()
        pair_freq_sampled_pcd = pairwise_freq_sampled_pcd[
            indices_upper_triangle[0],
            indices_upper_triangle[1], :, :].flatten().tolist()
        cov_observed = [
            pairwise_freq_observed[i, j, a, b] -
            (single_freq_observed[i, a] * single_freq_observed[j, b])
            for i in range(L - 1) for j in range(i + 1, L) for a in range(20)
            for b in range(20)
        ]
        cov_sampled_pll = [
            pairwise_freq_sampled_pll[i, j, a, b] -
            (single_freq_sampled_pll[i, a] * single_freq_sampled_pll[j, b])
            for i in range(L - 1) for j in range(i + 1, L) for a in range(20)
            for b in range(20)
        ]
        cov_sampled_pcd = [
            pairwise_freq_sampled_pcd[i, j, a, b] -
            (single_freq_sampled_pcd[i, a] * single_freq_sampled_pcd[j, b])
            for i in range(L - 1) for j in range(i + 1, L) for a in range(20)
            for b in range(20)
        ]

        if log:
            x_single = np.log(x_single)
            y_single_pll = np.log(y_single_pll)
            y_single_pcd = np.log(y_single_pcd)
            pair_freq_observed = np.log(pair_freq_observed)
            pair_freq_sampled_pll = np.log(pair_freq_sampled_pll)
            pair_freq_sampled_pcd = np.log(pair_freq_sampled_pcd)

        #compute pearson correlation coefficient
        data_dict['pseudo-likelihood']['x'].append(
            np.corrcoef(x_single, y_single_pll)[0, 1])
        data_dict['pseudo-likelihood']['y'].append('single site frequencies')

        data_dict['pseudo-likelihood']['x'].append(
            np.corrcoef(pair_freq_observed, pair_freq_sampled_pll)[0, 1])
        data_dict['pseudo-likelihood']['y'].append('pairwise frequencies')

        data_dict['pseudo-likelihood']['x'].append(
            np.corrcoef(cov_observed, cov_sampled_pll)[0, 1])
        data_dict['pseudo-likelihood']['y'].append('Covariances')

        data_dict['contrastive divergence']['x'].append(
            np.corrcoef(x_single, y_single_pcd)[0, 1])
        data_dict['contrastive divergence']['y'].append(
            'single site frequencies')

        data_dict['contrastive divergence']['x'].append(
            np.corrcoef(pair_freq_observed, pair_freq_sampled_pcd)[0, 1])
        data_dict['contrastive divergence']['y'].append('pairwise frequencies')

        data_dict['contrastive divergence']['x'].append(
            np.corrcoef(cov_observed, cov_sampled_pcd)[0, 1])
        data_dict['contrastive divergence']['y'].append('Covariances')

    #plot boxplot
    plot_boxplot_correlation_alignment_statistics_pll_vs_pcd(
        data_dict, plot_dir)
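# Example invocation (hypothetical script name and paths):
#   python plot_alignment_statistics.py \
#       /data/alignments/ /data/sampled_pll/ /data/sampled_pcd/ /plots/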
def collect_data(braw_dir, alignment_dir, pdb_dir,
                 size, diversity_thr, contact_threshold, noncontact_threshold, Nij_threshold):


    braw_files = glob.glob(braw_dir + "/*braw.gz")

    couplings_df = pd.DataFrame()
    nr_contacts = 0
    nr_noncontacts = 0
    sequence_separation=10

    for braw_file in braw_files:
        #braw_file = braw_files[0]
        if nr_contacts >= size and nr_noncontacts >= size:
            break

        if not os.path.exists(braw_file):
            print("Braw File " + str(braw_file) + "cannot be found. ")
            continue

        braw = raw.parse_msgpack(braw_file)
        L  = braw.ncol
        if 'msafile' in braw.meta['workflow'][0]:
            N = braw.meta['workflow'][0]['msafile']['nrow']
        else:
            N = braw.meta['workflow'][0]['parameters']['msafile']['nrow']
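        # alignment diversity = sqrt(N)/L; skip shallow alignments below the threshold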
        diversity = np.sqrt(N)/L
        if diversity < diversity_thr:
            continue

        protein = os.path.basename(braw_file).split(".")[0]

        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(alignment_file):
            print("Alignment File " + str(alignment_file) + " cannot be found. ")
            continue

        pdb_file = pdb_dir + "/" + protein.replace("_", "") + ".pdb"
        if not os.path.exists(pdb_file):
            print("PDB File " + str(pdb_file) + " cannot be found. ")
            continue

        print("{0}: N = {1}, L = {2}, diversity = {3}".format(protein, N, L, diversity))

        indices_upper_tri = np.triu_indices(L, k=sequence_separation)

        #filter pair indices that have specified Cb distances
        dist_matrix = pdb.distance_map(pdb_file, L)
        indices_contact = np.where((dist_matrix[indices_upper_tri] < contact_threshold))[0]
        indices_noncontact = np.where((dist_matrix[indices_upper_tri] > noncontact_threshold))[0]

        #filter pair indices that have more than Nij_threshold ungapped sequences
        alignment = io.read_alignment(alignment_file)
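        # sequence reweighting; the 0.8 is presumably the pairwise identity threshold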
        weights = weighting.calculate_weights_simple(alignment, 0.8, True)
        pairwise_counts = counts.pair_counts(alignment, weights)
        Nij = pairwise_counts[:, :, :20, :20].sum(3).sum(2)
        indices_Nij_true = np.where(Nij[indices_upper_tri] > Nij_threshold)[0]

        #get pair indices that fulfill both requirements
        indices_contact = list(set(indices_contact).intersection(indices_Nij_true))
        indices_noncontact = list(set(indices_noncontact).intersection(indices_Nij_true))

        #get couplings for filtered pairs
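        # flatten each 20x20 coupling block w_ij(a,b) into a 400-dimensional feature vector per pair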
        braw_reshaped = braw.x_pair[:, :, :20, :20].reshape(L, L, 400)
        if nr_contacts < size:
            couplings_contact = pd.DataFrame(braw_reshaped[indices_upper_tri][indices_contact])
            couplings_contact['distance'] = dist_matrix[indices_upper_tri][indices_contact]
            couplings_df = couplings_df.append(couplings_contact)
            nr_contacts += len(indices_contact)

        if nr_noncontacts < size:
            couplings_noncontact = pd.DataFrame(braw_reshaped[indices_upper_tri][indices_noncontact])
            couplings_noncontact['distance'] = dist_matrix[indices_upper_tri][indices_noncontact]
            couplings_df = couplings_df.append(couplings_noncontact)
            nr_noncontacts += len(indices_noncontact)

        print "Nr of couplings contact: {0} and non-contact: {1}".format(nr_contacts, nr_noncontacts)

    couplings_df['class'] = (couplings_df['distance'] < contact_threshold) * 1

    return couplings_df
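# Minimal usage sketch for collect_data (hypothetical paths and threshold values,
# matching the signature above):
# couplings_df = collect_data("/path/to/braw", "/path/to/alignments", "/path/to/pdb",
#                             size=10000, diversity_thr=0.3, contact_threshold=8,
#                             noncontact_threshold=20, Nij_threshold=100)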
def main():

    args = parse_args()

    alignment_dir = args.alignment_dir
    sampled_alignment_dir = args.sampled_alignment_dir
    filter = args.filter
    plot_dir = args.plot_dir

    #debug
    #alignment_dir = "/home/vorberg/work/data/ccmgen/psicov/alignments/"
    #sampled_alignment_dir = "/home/vorberg/work/data/ccmgen/psicov/sampled_pll/"
    #sampled_alignment_dir = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/"
    #plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/pca/"
    #filter = "ind."
    #filter = "ind-rand."
    #filter = "ind-rand-gap."
    #filter = "star."
    #filter = "binary."

    alignment_files = glob.glob(alignment_dir + "/*aln")
    for alignment_file in alignment_files:
        #alignment_file='/home/vorberg/work/data/ccmgen/psicov/alignments/1gmxA.aln'
        #alignment_file='/home/vorberg/work/data/ccmgen/psicov/alignments/1bkrA.aln'

        #read alignment
        alignment = io.read_alignment(alignment_file, max_gap_pos=100, max_gap_seq=100)
        L_original=alignment.shape[1]
        alignment, gapped_positions = io.remove_gapped_positions(alignment, max_gap_percentage=50)
        non_gapped_positions = [i for i in range(L_original) if i not in gapped_positions]
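        # remember which original columns survived the gap filter, so sampled
        # alignments can later be subset to exactly the same positions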

        name=os.path.basename(alignment_file).split(".")[0]
        print("{0}: N={1}, L={2}".format(name, alignment.shape[0], alignment.shape[1]))

        #one hot encoding: transform alignment into binary dummy variables
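        # 21 categories per column: 20 amino acids + gap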
        enc = OneHotEncoder(n_values=21)
        enc.fit(alignment)
        alignment_one_hot = enc.transform(alignment).toarray()

        pca = PCA(n_components=2)
        pca.fit(alignment_one_hot)
        alignment_transformed = pca.transform(alignment_one_hot)
        print("N={0}, L={1}".format(alignment_transformed.shape[0], alignment_transformed.shape[1]))
        print('explained variance ratio (first two components): %s' % str(pca.explained_variance_ratio_))

        plot_dict={}
        plot_dict['name']=name

        plot_dict['data']=[
            {
                'name': 'Pfam',
                'x': alignment_transformed[:, 0],
                'y': alignment_transformed[:, 1],
                'seq': alignment,
                'N': alignment.shape[0],
                'L': alignment.shape[1],
                'neff(weights)': au.compute_neff(alignment),
                'neff(entropy)': au.compute_neff_hhblits(alignment)
            }
        ]

        plot_projection_on_two_components_gapstructure(
            plot_dict, plot_out=plot_dir + "/" + name + ".original.PCA_projection.gapstructure.html")

        #read in sampled alignment
        sampled_alignment_file = glob.glob(sampled_alignment_dir + "/" + name + "*" + filter + "*aln")
        #sampled_alignment_file=["/home/vorberg/1bkrA.binary.5.aln"]
        #sampled_alignment_file=["/home/vorberg/1bkrA.star.5.aln"]
        if len(sampled_alignment_file) > 0:
            sampled_alignment = io.read_alignment(sampled_alignment_file[0], max_gap_pos=100, max_gap_seq=100)
            sampled_alignment = np.ascontiguousarray(sampled_alignment[:, non_gapped_positions])
            method=os.path.dirname(sampled_alignment_file[0]).split("/")[-1]

            #one hot encoding
            enc = OneHotEncoder(n_values=21)
            enc.fit(sampled_alignment)
            sampled_alignment_one_hot = enc.transform(sampled_alignment).toarray()

            sampled_alignment_transformed = pca.transform(sampled_alignment_one_hot)
            print("N={0}, L={1}".format(sampled_alignment_transformed.shape[0], sampled_alignment_transformed.shape[1]))

            plot_dict['data'].append(
                {
                    #'name': method+ "." + filter,
                    'name': "MCMC PCD",
                    'x': sampled_alignment_transformed[:, 0],
                    'y': sampled_alignment_transformed[:, 1],
                    'seq': sampled_alignment,
                    'N': sampled_alignment.shape[0],
                    'L': sampled_alignment.shape[1],
                    'neff(weights)': au.compute_neff(sampled_alignment),
                    'neff(entropy)': au.compute_neff_hhblits(sampled_alignment)
                }
            )

            title=""
            # title = "Projection onto first 2 PC for protein " + plot_dict['name']
            # for plot_data in plot_dict['data']:
            #     title += "<br>{0}: N={1}, L={2}, Neff(weights)={3}, Neff(entropy)={4}".format(
            #         plot_data['name'],
            #         plot_data['N'], plot_data['L'],
            #         np.round(plot_data['neff(weights)'], decimals=3),
            #         np.round(plot_data['neff(entropy)'], decimals=3)
            #     )

            plot_projection_on_two_components(
                plot_dict,
                title=title,
                plot_out=plot_dir + "/" + name + "." + method + "." + filter + "PCA_projection.html"
            )

    for alignment_file in alignment_files[5:]:
        #alignment_file=alignment_files[0]

        # read alignment
        alignment = io.read_alignment(alignment_file, max_gap_pos=100, max_gap_seq=100)
        L_original=alignment.shape[1]
        alignment, gapped_positions = io.remove_gapped_positions(alignment, max_gap_percentage=50)
        non_gapped_positions = [i for i in range(L_original) if i not in gapped_positions]
        name = os.path.basename(alignment_file).split(".")[0]
        print("{0}: N={1}, L={2}".format(
            name, alignment.shape[0],alignment.shape[1])
        )

        # multiple correspondence analysis (MCA) for categorical data:
        # MCA is "essentially PCA for categorical variables" -- it is defined as weighted,
        # unstandardized PCA applied to the complete disjunctive table
        # (CDT, aka indicator matrix G) of the categorical variables
        df = pd.DataFrame(alignment)
        df.columns=['col'+str(i) for i in range(1, alignment.shape[1]+1)]

        mca_df = mca.MCA(df, benzecri=False)
        #fs_r much slower than fs_r_sup....
        #alignment_transformed = mca_df.fs_r(N=2)
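        # component signs are arbitrary (projections are defined only up to sign), so the
        # negation below presumably just fixes a consistent orientation for plotting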
        alignment_transformed = -mca_df.fs_r_sup(df, N=2)

        plot_dict={}
        plot_dict['name']=name
        plot_dict['N'] = alignment.shape[0]
        plot_dict['L'] = alignment.shape[1]
        plot_dict['neff(weights)'] = au.compute_neff(alignment)
        plot_dict['neff(entropy)'] = au.compute_neff_hhblits(alignment)
        plot_dict['data']=[
            {
                'name': 'original',
                'x': alignment_transformed[:, 0],
                'y': alignment_transformed[:, 1],
                'N': alignment.shape[0],
                'L': alignment.shape[1],
                'neff(weights)': au.compute_neff(alignment),
                'neff(entropy)': au.compute_neff_hhblits(alignment)
            }
        ]


        #read in sampled alignment
        sampled_alignment_file = glob.glob(sampled_alignment_dir + "/" + name + "*" + filter + "*.aln")
        #sampled_alignment_file=["/home/vorberg/1gmxA.star.2.aln"]
        if len(sampled_alignment_file) > 0:
            sampled_alignment = io.read_alignment(sampled_alignment_file[0], max_gap_pos=100, max_gap_seq=100)
            sampled_alignment = np.ascontiguousarray(sampled_alignment[:, non_gapped_positions])
            method=os.path.dirname(sampled_alignment_file[0]).split("/")[-1]

            df = pd.DataFrame(sampled_alignment)
            df.columns = ['col' + str(i) for i in range(1, alignment.shape[1] + 1)]

            sampled_alignment_transformed = -mca_df.fs_r_sup(df, N=2)
            print("N={0}, L={1}".format(sampled_alignment_transformed.shape[0], sampled_alignment_transformed.shape[1]))

            plot_dict['data'].append(
                {
                    'name': method+ "." + filter,
                    'x': sampled_alignment_transformed[:, 0],
                    'y': sampled_alignment_transformed[:, 1],
                    'N': sampled_alignment.shape[0],
                    'L': sampled_alignment.shape[1],
                    'neff(weights)': au.compute_neff(sampled_alignment),
                    'neff(entropy)': au.compute_neff_hhblits(sampled_alignment)
                }
            )

            plot_projection_on_two_components(
                plot_dict,
                plot_out=plot_dir + "/" + name + "." + method+ "." + filter + ".MCA_projection.html"
            )
def main():

    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting empirical vs model alignment statistics.')
    parser.add_argument("observed_alignments",   type=str,   help="path to original aligment files")
    parser.add_argument("sampled_alignments_pll",    type=str,   help="path to sampled alignment files (using PLL)")
    parser.add_argument("sampled_alignments_pcd", type=str, help="path to sampled alignment files (using PCD)")
    parser.add_argument("plot_dir",             type=str,   help="path to output directory for plots")


    args = parser.parse_args()

    observed_alignments_path = args.observed_alignments
    sampled_alignments_paths_pll = args.sampled_alignments_pll
    sampled_alignments_paths_pcd = args.sampled_alignments_pcd
    plot_dir = args.plot_dir
    log=False
    max_gap_pos=50


    #debug
    # observed_alignments_path = "/home/vorberg/work/data/ccmgen/psicov/alignments/"
    # sampled_alignments_paths_pll = "/home/vorberg/work/data/ccmgen/psicov/sampled_pll/"
    # sampled_alignments_paths_pcd = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/pll_vs_pcd_comparison/alignment_statistics_correlation/"


    data_dict = {
        'pseudo-likelihood': {
            'x': [],
            'y': []
        },
        'contrastive divergence': {
            'x': [],
            'y': []
        }
    }


    observed_alignments = glob.glob(observed_alignments_path+"/*aln")
    for obs_aln_file in observed_alignments:
        protein= os.path.basename(obs_aln_file).split(".")[0]
        sampled_aln_file_pll = glob.glob(sampled_alignments_paths_pll + "/" + protein + "*.ind.aln")
        sampled_aln_file_pcd = glob.glob(sampled_alignments_paths_pcd + "/" + protein + "*.ind.aln")

        if len(sampled_aln_file_pll) == 0 or not os.path.exists(sampled_aln_file_pll[0]):
            print("Sampled PLL alignment file for {0} does not exist!".format(protein))
            continue

        if len(sampled_aln_file_pcd) == 0 or not os.path.exists(sampled_aln_file_pcd[0]):
            print("Sampled PCD alignment file for {0} does not exist!".format(protein))
            continue

        print(protein)

        #read in alignments and remove columns with >50% gaps
        alignment_o = io.read_alignment(obs_aln_file, max_gap_pos=100, max_gap_seq=100)
        L_original = alignment_o.shape[1]
        alignment_o, gapped_positions = io.remove_gapped_positions(alignment_o, max_gap_percentage=max_gap_pos)
        non_gapped_positions = [i for i in range(L_original) if i not in gapped_positions]
        alignment_s_pll = io.read_alignment(sampled_aln_file_pll[0], max_gap_pos=100, max_gap_seq=100)
        alignment_s_pll = np.ascontiguousarray(alignment_s_pll[:, non_gapped_positions])
        alignment_s_pcd = io.read_alignment(sampled_aln_file_pcd[0], max_gap_pos=100, max_gap_seq=100)
        alignment_s_pcd = np.ascontiguousarray(alignment_s_pcd[:, non_gapped_positions])

        # compute amino acid frequencies (with uniform pseudocounts)
        single_freq_observed, pairwise_freq_observed = au.calculate_frequencies(alignment_o, au.uniform_pseudocounts)
        single_freq_sampled_pll, pairwise_freq_sampled_pll = au.calculate_frequencies(alignment_s_pll, au.uniform_pseudocounts)
        single_freq_sampled_pcd, pairwise_freq_sampled_pcd = au.calculate_frequencies(alignment_s_pcd, au.uniform_pseudocounts)

        # degap the frequencies (ignore gap frequencies)
        single_freq_observed = au.degap(single_freq_observed, False)
        single_freq_sampled_pll = au.degap(single_freq_sampled_pll, False)
        single_freq_sampled_pcd = au.degap(single_freq_sampled_pcd, False)
        pairwise_freq_observed = au.degap(pairwise_freq_observed, False)
        pairwise_freq_sampled_pll = au.degap(pairwise_freq_sampled_pll, False)
        pairwise_freq_sampled_pcd = au.degap(pairwise_freq_sampled_pcd, False)

        #reshape frequencies
        L = alignment_o.shape[1]
        indices_upper_triangle = np.triu_indices(L, k=1)

        x_single = single_freq_observed.flatten().tolist()
        y_single_pll = single_freq_sampled_pll.flatten().tolist()
        y_single_pcd = single_freq_sampled_pcd.flatten().tolist()
        pair_freq_observed = pairwise_freq_observed[
                             indices_upper_triangle[0],
                             indices_upper_triangle[1], :, :].flatten().tolist()
        pair_freq_sampled_pll = pairwise_freq_sampled_pll[
                            indices_upper_triangle[0],
                            indices_upper_triangle[1], :, :].flatten().tolist()
        pair_freq_sampled_pcd = pairwise_freq_sampled_pcd[
                            indices_upper_triangle[0],
                            indices_upper_triangle[1], :, :].flatten().tolist()
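        # empirical covariances: cov_ij(a,b) = q_ij(a,b) - q_i(a) * q_j(b) for all 20x20 amino acid pairs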
        cov_observed = [
            pairwise_freq_observed[i, j, a, b] - (single_freq_observed[i, a] * single_freq_observed[j, b])
            for i in range(L - 1) for j in range(i + 1, L) for a in range(20) for b in range(20)]
        cov_sampled_pll = [pairwise_freq_sampled_pll[i, j, a, b] - (single_freq_sampled_pll[i, a] * single_freq_sampled_pll[j, b])
                       for i in range(L - 1) for j in range(i + 1, L) for a in range(20) for b in range(20)]
        cov_sampled_pcd = [pairwise_freq_sampled_pcd[i, j, a, b] - (single_freq_sampled_pcd[i, a] * single_freq_sampled_pcd[j, b])
                       for i in range(L - 1) for j in range(i + 1, L) for a in range(20) for b in range(20)]


        if log:
            x_single = np.log(x_single)
            y_single_pll = np.log(y_single_pll)
            y_single_pcd = np.log(y_single_pcd)
            pair_freq_observed = np.log(pair_freq_observed)
            pair_freq_sampled_pll = np.log(pair_freq_sampled_pll)
            pair_freq_sampled_pcd = np.log(pair_freq_sampled_pcd)


        #compute pearson correlation coefficient
        data_dict['pseudo-likelihood']['x'].append(np.corrcoef(x_single, y_single_pll)[0, 1])
        data_dict['pseudo-likelihood']['y'].append('single site frequencies')

        data_dict['pseudo-likelihood']['x'].append(np.corrcoef(pair_freq_observed, pair_freq_sampled_pll)[0, 1])
        data_dict['pseudo-likelihood']['y'].append('pairwise frequencies')

        data_dict['pseudo-likelihood']['x'].append(np.corrcoef(cov_observed, cov_sampled_pll)[0, 1])
        data_dict['pseudo-likelihood']['y'].append('Covariances')

        data_dict['contrastive divergence']['x'].append(np.corrcoef(x_single, y_single_pcd)[0, 1])
        data_dict['contrastive divergence']['y'].append('single site frequencies')

        data_dict['contrastive divergence']['x'].append(np.corrcoef(pair_freq_observed, pair_freq_sampled_pcd)[0, 1])
        data_dict['contrastive divergence']['y'].append('pairwise frequencies')

        data_dict['contrastive divergence']['x'].append(np.corrcoef(cov_observed, cov_sampled_pcd)[0, 1])
        data_dict['contrastive divergence']['y'].append('Covariances')



    #plot boxplot
    plot_boxplot_correlation_alignment_statistics_pll_vs_pcd(data_dict, plot_dir)
def main():


    args = parse_args()

    braw_dir = args.braw_dir
    alignment_dir = args.alignment_dir
    plot_dir = args.plot_dir


    #debug
    braw_dir = "/home/vorberg//work/data/ccmgen/psicov/predictions_pcd/"
    alignment_dir = "/home/vorberg//work/data/ccmgen/psicov/alignments/"
    plot_dir = "/home/vorberg//work/plots/ccmgen/psicov/scatter_apc_vs_ec/pcd/"


    pearson_r_list = []
    proteins = []
    for braw_file in glob.glob(braw_dir + "/*braw.gz"):

        protein_name = os.path.basename(braw_file).split('.')[0]
        proteins.append(protein_name)
        print(protein_name)

        #read braw file
        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta
        neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
        lambda_w = np.round(u.find_dict_key("lambda_pair", meta_info), decimals=3)
        L = braw.ncol

        # read alignment file
        alignment_file = alignment_dir + "/" + protein_name + ".aln"
        alignment = io.read_alignment(alignment_file)
        single_freq, pair_freq = au.calculate_frequencies(alignment, au.uniform_pseudocounts)

        #get the highly gapped positions that need to be excluded from analysis
        alignment_ungapped, gapped_positions = io.remove_gapped_positions(alignment, max_gap_percentage=50)
        non_gapped_positions = [i for i in range(L) if i not in gapped_positions]
        indices_i, indices_j = np.triu_indices(len(non_gapped_positions), k=1)

        #compute ec
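        # correction term per pair: ec_term[i,j] = scaling_factor * sqrt(sum_{a,b} uij[i,j,a,b])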
        uij, scaling_factor = bu.compute_entropy_correction(
            single_freq, neff, lambda_w, braw.x_pair,
            entropy=True, squared=False, nr_states = 20)
        ec_term = scaling_factor * np.sqrt(np.sum(uij, axis=(3, 2)))
        ec_term_ungapped = ec_term[non_gapped_positions, :]
        ec_term_ungapped = ec_term_ungapped[:, non_gapped_positions]

        #compute joint EC instead of geometric mean of per-column entropies
        # uij, scaling_factor = bu.compute_joint_entropy_correction(pair_freq, neff, lambda_w, braw.x_pair, nr_states = 20)
        # ec_term = scaling_factor * uij
        # ec_term_ungapped = ec_term[non_gapped_positions, :]
        # ec_term_ungapped = ec_term_ungapped[:, non_gapped_positions]

        #compute contact matrix for ungapped positions
        cmat = bu.compute_l2norm_from_braw(braw.x_pair, apc=False, squared=False)

        #compute apc
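        # average product correction: apc[i,j] = mean_i * mean_j / overall mean of the raw score matrix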
        cmat_ungapped = cmat[non_gapped_positions, :]
        cmat_ungapped = cmat_ungapped[:, non_gapped_positions]
        mean = np.mean(cmat_ungapped, axis=0)
        apc_term_ungapped = mean[:, np.newaxis] * mean[np.newaxis, :] / np.mean(cmat_ungapped)

        #plot
        plot_file = plot_dir + "/" + protein_name + "_apc_vs_ec.html"
        plot_scatter(
            apc_term_ungapped[indices_i, indices_j],
            ec_term_ungapped[indices_i, indices_j],
            ["i: " + str(i) + "<br>j: " + str(j) for i,j in zip(indices_i, indices_j)],
            plot_file)


        #compute pearson correlation coefficient
        pearson_r_list.append(pearsonr(apc_term_ungapped[indices_i, indices_j], ec_term_ungapped[indices_i, indices_j])[0])

    #plot boxplot with jitter
    plot_file = plot_dir + "/boxplot_pearsonr_apc_vs_ec.html"
    plot_boxplot_correlation(pearson_r_list, proteins, plot_file)
def main():


    # ===============================================================================
    ### Parse arguments
    # ===============================================================================

    parser = argparse.ArgumentParser(description='plot statistics about dataset.')
    parser.add_argument("-d", "--dataset_files",    type=str, help="path to directory with dataset description files")
    parser.add_argument("-a", "--alignments",       type=str, help="path to directory with alignment files")
    parser.add_argument("-o", "--plot_out",         type=str, help="path to directory where to put plot")

    args = parser.parse_args()

    plot_out = args.plot_out
    alignment_path = args.alignments
    dataset_files = args.dataset_files

    print ("--------------------------------------------------------")
    print ("plot_out: \t"                   + str(plot_out))
    print ("path to alignemnt files: \t"    + str(alignment_path))
    print ("path to dataset files: \t"      + str(dataset_files))
    print ("--------------------------------------------------------")

    #plot_out           = "/home/vorberg/work/plots/bayesian_framework/dataset_statistics/dataset_cath4.1/"
    #alignment_path     = "/home/vorberg/work/data/benchmarkset_cathV4.1/psicov/"
    #dataset_files      = "/home/vorberg/work/data/benchmarkset_cathV4.1/dataset/dataset_properties/"


    stats = {
        'protein' :        [],
        'diversity' :      [],
        'N':               [],
        'L':               [],
        'percent_gaps':    []
    }

    if dataset_files is not None:

        dataset_folds = {}
        for dataset_file in glob.glob(dataset_files + "/*n5e01*"):
            fold_id = os.path.basename(dataset_file).split("_")[2]
            dataset_folds[fold_id] = pd.read_table(dataset_file, skipinitialspace=True)
            dataset_folds[fold_id].columns = ['domain', 'resolution', 'CATH', 'L', 'N']

        stats['dataset'] = []
        stats['cath_topology'] = []

        cath_classes = {
            1 : 'CATH class 1 (mainly alpha)',
            2 : 'CATH class 2 (mainly beta)',
            3 : 'CATH class 3 (alpha beta)'
        }

        for fold in dataset_folds.keys():
            for index, row in dataset_folds[fold].iterrows():

                protein = row['domain']
                cath = row['CATH']

                psicov_file = alignment_path + "/" + protein +".filt.psc"

                #if it does not exist, it has been filtered due to
                #combs ambiguity or alignment filter
                if os.path.exists(psicov_file):
                    alignment = io.read_alignment(psicov_file)

                    L = len(alignment[0])
                    N = len(alignment)

                    percent_gaps = ali_ut.compute_gaps_per_position(alignment)
                    percent_gaps_alignment = np.mean(percent_gaps)

                    stats['protein'].append(protein)
                    stats['diversity'].append(np.sqrt(N)/L)
                    stats['N'].append(N)
                    stats['L'].append(L)
                    stats['percent_gaps'].append(percent_gaps_alignment)
                    stats['dataset'].append(fold)
                    stats['cath_topology'].append(cath_classes[int(cath.split(".")[0])])

        stats_df = pd.DataFrame(stats)


    if dataset_files is None:

        psicov_files = glob.glob(alignment_path + "/*")

        for psicov_file in psicov_files:
            protein = os.path.basename(psicov_file).split(".")[0]
            print(protein)

            alignment = io.read_alignment(psicov_file)
            L = len(alignment[0])
            N = len(alignment)

            percent_gaps = ali_ut.compute_gaps_per_position(alignment)
            percent_gaps_alignment = np.mean(percent_gaps)

            stats['protein'].append(protein)
            stats['diversity'].append(np.sqrt(N) / L)
            stats['N'].append(N)
            stats['L'].append(L)
            stats['percent_gaps'].append(percent_gaps_alignment)

        stats_df = pd.DataFrame(stats)

        ### Plot
        plot_boxplot_all_stats(stats_df, plot_out=plot_out+"/dataset_stats.html")

    #===============================================================================
    ### Plot
    #===============================================================================

    plot_boxplot_for_statistic(
        stats_df, 'diversity', 'Distribution of Diversity (sqrt(N)/L)', jitter_pos=2,
        plot_out=plot_out +"/diversity_dataset_boxplot.html"
    )

    plot_boxplot_for_statistic(
        stats_df, 'diversity', '', jitter_pos=2,
        plot_out=plot_out +"/diversity_dataset_boxplot_notitle.html"
    )

    plot_boxplot_for_statistic(
        stats_df, 'N', 'Distribution of MSA size (# sequences)', jitter_pos=2,
        plot_out=plot_out + "/msa_size_dataset_boxplot.html")

    plot_boxplot_for_statistic(
        stats_df, 'N', '', jitter_pos=2,
        plot_out=plot_out + "/msa_size_dataset_boxplot_notitle.html")


    plot_boxplot_for_statistic(
        stats_df, 'L', 'Distribution of protein lengths', jitter_pos=2,
        plot_out=plot_out + "/protein_length_dataset_boxplot.html")

    plot_boxplot_for_statistic(
        stats_df, 'L', '', jitter_pos=2,
        plot_out=plot_out + "/protein_length_dataset_boxplot_notitle.html")


    plot_boxplot_for_statistic(
        stats_df, 'percent_gaps', 'Distribution of gap percentage',  jitter_pos=2,
        plot_out=plot_out +"/gap_percentage_boxplot.html")

    plot_boxplot_for_statistic(
        stats_df, 'percent_gaps', '',  jitter_pos=2,
        plot_out=plot_out +"/gap_percentage_boxplot_notitle.html")

    plot_stacked_barchart_cath(
        stats_df, 'Proportion of CATH classes in all datasets',
        plot_out=plot_out + "/cath_topologies_stacked_relative.html"
    )

    plot_stacked_barchart_cath(
        stats_df, '',
        plot_out=plot_out + "/cath_topologies_stacked_relative_notitle.html"
    )
def main():
    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')

    group_append = parser.add_mutually_exclusive_group(required=True)
    group_append.add_argument('-m', '--mat_file', type=str, dest='mat_file', help='path to mat file')
    group_append.add_argument('-b', '--braw_file', type=str, dest='braw_file', help='path to braw file')

    parser.add_argument("-o", "--plot-out", dest="plot_out", type=str, help="directory for plot")

    parser.add_argument("--seqsep", type=int, default=6, help="sequence separation")
    parser.add_argument("--contact_threshold", type=int, default=8,
                        help="contact definition; C_beta distance between residue pairs")
    parser.add_argument("--pdb_file", type=str, help="path to pdb file [optional] -  plotting true contacs")
    parser.add_argument("--alignment_file", type=str, help="path to alignment file [optional] - plotting coverage")
    parser.add_argument("--apc", action="store_true", default=False, help="Apply average product correction")
    parser.add_argument("--entropy_correction", action="store_true", default=False, help="Apply entropy correction")

    args = parser.parse_args()

    if args.mat_file is None and args.braw_file is None:
        print("Either mat_file or braw_file need to be set.")

    plot_out = args.plot_out
    seqsep = args.seqsep
    contact_threshold = args.contact_threshold
    apc = args.apc
    entropy_correction = args.entropy_correction

    alignment_file = args.alignment_file
    pdb_file = args.pdb_file

    ##### debugging

    protein = "2hs1A"
    topology = "binary"
    topology = "star"

    alignment_file = "/home/vorberg/work/data/ccmgen/psicov/alignments/" + protein + ".aln"
    alignment_file = None
    #alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_incmr_pc100/" + protein + "." + topology + ".aln"
    # alignment_format = "psicov"

    # braw_file = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/ccmpred-pll-centerv/braw/" + protein + ".filt.braw.gz"
    # braw_file = "/home/vorberg/" + protein + ".gx.gz"
    # braw_file = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/count_correction/braw_ec_correction/" + protein + ".braw.ec.gz"
    # braw_file = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/count_correction/braw/" + protein + ".filt.braw.gz"
    #
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pll/" + protein + ".frobenius.apc.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd/" + protein + ".frobenius.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd_cheating_12_pc100/" + protein + ".frobenius.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd_cheating_12_pc100/" + protein + ".frobenius.apc.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd_cheating_12_pc100/" + protein + ".frobenius.ec.20.log2.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/recover_pcd_cheating_12_incmr_pc100/" + protein + "." + topology + ".frobenius.apc.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/recover_pcd_cheating_12_incmr_pc100/" + protein + "." + topology + ".frobenius.ec.20.log2.mat"



    # pdb_file = "/home/vorberg/work/data/ccmgen/psicov/pdb/" + protein + ".pdb"
    # # pdb_file=None

    # seqsep = 4
    # # seqsep = 1

    # contact_threshold = 8

    # plot_out = "/home/vorberg/work/plots/ccmgen/psicov/contact_maps/"

    apc=True
    apc = False
    entropy_correction = True
    entropy_correction = False



    ### Compute l2norm score from braw
    if args.braw_file is not None:
        braw_file = args.braw_file
        protein_name = '.'.join(os.path.basename(braw_file).split('.')[:-1])
        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
        lambda_w = np.round(u.find_dict_key("lambda_pair", meta_info), decimals=3)

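        # entropy correction needs the alignment's single-site frequencies;
        # otherwise use the plain l2norm of the couplings (optionally APC-corrected)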
        if entropy_correction:
            alignment = io.read_alignment(alignment_file)
            single_freq, pair_freq = au.calculate_frequencies(alignment, au.uniform_pseudocounts)
            mat = bu.compute_corrected_mat_entropy(braw.x_pair, single_freq, neff, lambda_w, entropy=True, squared=False, nr_states = 20)
        else:
            mat = bu.compute_l2norm_from_braw(braw, apc)

    ### Read score from mat
    if args.mat_file is not None:
        mat_file = args.mat_file
        mat = io.read_matfile(mat_file)
        if (apc):
            mat = bu.compute_apc_corrected_matrix(mat)
        meta_info = io.read_json_from_mat(mat_file)
        protein_name = os.path.basename(mat_file).split('.')[0]

    correction=""
    if apc:
        correction = "_apc"
    if entropy_correction:
        correction ="_ec"
    plot_file = plot_out + protein_name + "_seqsep" + str(seqsep) + "_contacthr" + str(
        contact_threshold) + correction + ".html"
    neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
    N = u.find_dict_key("nrow", meta_info)
    L = u.find_dict_key("ncol", meta_info)
    title = protein_name + "<br>L: " + str(L) + " N: " + str(N) + " Neff: " + str(neff) + " diversity: " + str(
        np.round(np.sqrt(N) / L, decimals=3))
    plot_contact_map(mat, seqsep, contact_threshold, title, plot_file, alignment_file=alignment_file, pdb_file=pdb_file)
def main():

    ### Parse arguments
    parser = argparse.ArgumentParser(
        description='Plotting empirical vs model alignment statistics.')
    parser.add_argument("observed_alignment",
                        type=str,
                        help="path to original aligment file")
    parser.add_argument("sampled_alignment",
                        type=str,
                        help="path to sampled alignment file")
    parser.add_argument("plot_dir",
                        type=str,
                        help="path to output directory for plots")

    args = parser.parse_args()

    observed_alignment_file = args.observed_alignment
    sampled_alignment_file = args.sampled_alignment
    plot_dir = args.plot_dir
    max_gap_pos = 50

    ######debugging
    protein = "1bkrA"
    observed_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/alignments/" + protein + ".aln"

    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/" + protein + ".star.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pcd_ccmgen_star/"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/" + protein + ".binary.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pcd_ccmgen_binary/"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/" + protein + ".ind.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pcd/"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pll/" + protein + ".ind.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pll/"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_lfactor1e-3_cheating_12/" + protein + ".star.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pcd_1e-3_cheating_12/"

    sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_incmr/" + protein + ".binary.aln"
    plot_dir = "/home/vorberg/"

    #
    # sampled_alignment_file = "/home/vorberg/" + protein + ".binary.5.aln"
    # sampled_alignment_file = "/home/vorberg/" + protein + ".star.5.aln"
    # plot_dir = "/home/vorberg/"

    #read both alignments
    alignment_o = io.read_alignment(observed_alignment_file,
                                    max_gap_pos=100,
                                    max_gap_seq=100)
    L_original = alignment_o.shape[1]
    alignment_o, gapped_positions = io.remove_gapped_positions(
        alignment_o, max_gap_percentage=max_gap_pos)
    non_gapped_positions = [
        i for i in range(L_original) if i not in gapped_positions
    ]
    alignment_s = io.read_alignment(sampled_alignment_file,
                                    max_gap_pos=100,
                                    max_gap_seq=100)
    alignment_s = np.ascontiguousarray(alignment_s[:, non_gapped_positions])
    print(alignment_o.shape, alignment_s.shape)

    #alignment dimensions
    N_o = alignment_o.shape[0]
    N_s = alignment_s.shape[0]
    L = alignment_o.shape[1]
    div = np.round(np.sqrt(N_o) / L, decimals=3)
    neff_weights_o = np.round(au.compute_neff(alignment_o), decimals=3)
    neff_weights_s = np.round(au.compute_neff(alignment_s), decimals=3)
    neff_entropy_o = np.round(au.compute_neff_hhblits(alignment_o), decimals=3)
    neff_entropy_s = np.round(au.compute_neff_hhblits(alignment_s), decimals=3)

    #compute amino acid frequencies only once (with uniform pseudocounts)
    single_freq_observed, pairwise_freq_observed = au.calculate_frequencies(
        alignment_o, au.uniform_pseudocounts)
    single_freq_sampled, pairwise_freq_sampled = au.calculate_frequencies(
        alignment_s, au.uniform_pseudocounts)

    #degap the frequencies (ignore gap frequencies)
    single_freq_observed = au.degap(single_freq_observed, False)
    single_freq_sampled = au.degap(single_freq_sampled, False)
    pairwise_freq_observed = au.degap(pairwise_freq_observed, False)
    pairwise_freq_sampled = au.degap(pairwise_freq_sampled, False)

    #prepare plot properties
    protein = os.path.basename(observed_alignment_file).split(".")[0]
    method = os.path.basename(sampled_alignment_file).split(".")[1]

    title = "Observed and model alignment statistics for {0}".format(protein)
    title += "<br>original: N={0}, L={1}, div={2}, neff(weights)={3}, neff(entropy)={4}".format(
        N_o, L, div, neff_weights_o, neff_entropy_o)
    title += "<br>sampled: N={0}, L={1}, neff(weights)={2}, neff(entropy)={3}".format(
        N_s, L, neff_weights_s, neff_entropy_s)
    #title=""

    #plot in normal and in log space
    plot_out = plot_dir + "/" + protein + ".empirical_vs_model_alignment_stats_" + method + ".html"
    plot_empirical_vs_model_statistics(single_freq_observed,
                                       single_freq_sampled,
                                       pairwise_freq_observed,
                                       pairwise_freq_sampled,
                                       title=title,
                                       plot_out=plot_out,
                                       log=False,
                                       width=1200)

    plot_out = plot_dir + "/" + protein + ".empirical_vs_model_alignment_stats_" + method + "_log.html"
    plot_empirical_vs_model_statistics(single_freq_observed,
                                       single_freq_sampled,
                                       pairwise_freq_observed,
                                       pairwise_freq_sampled,
                                       title=title,
                                       plot_out=plot_out,
                                       log=True)
def main():
    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')

    group_append = parser.add_mutually_exclusive_group(required=True)
    group_append.add_argument('-m',
                              '--mat_file',
                              type=str,
                              dest='mat_file',
                              help='path to mat file')
    group_append.add_argument('-b',
                              '--braw_file',
                              type=str,
                              dest='braw_file',
                              help='path to braw file')

    parser.add_argument("-o",
                        "--plot-out",
                        dest="plot_out",
                        type=str,
                        help="directory for plot")

    parser.add_argument("--seqsep",
                        type=int,
                        default=6,
                        help="sequence separation")
    parser.add_argument(
        "--contact_threshold",
        type=int,
        default=8,
        help="contact definition; C_beta distance between residue pairs")
    parser.add_argument(
        "--pdb_file",
        type=str,
        help="path to pdb file [optional] -  plotting true contacs")
    parser.add_argument(
        "--alignment_file",
        type=str,
        help="path to alignment file [optional] - plotting coverage")
    parser.add_argument("--apc",
                        action="store_true",
                        default=False,
                        help="Apply average product correction")
    parser.add_argument("--entropy_correction",
                        action="store_true",
                        default=False,
                        help="Apply entropy correction")

    args = parser.parse_args()

    if args.mat_file is None and args.braw_file is None:
        print("Either mat_file or braw_file need to be set.")

    plot_out = args.plot_out
    seqsep = args.seqsep
    contact_threshold = args.contact_threshold
    apc = args.apc
    entropy_correction = args.entropy_correction

    alignment_file = args.alignment_file
    pdb_file = args.pdb_file

    ##### debugging

    protein = "2hs1A"
    topology = "binary"
    topology = "star"

    alignment_file = "/home/vorberg/work/data/ccmgen/psicov/alignments/" + protein + ".aln"
    alignment_file = None
    #alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_incmr_pc100/" + protein + "." + topology + ".aln"
    # alignment_format = "psicov"

    # braw_file = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/ccmpred-pll-centerv/braw/" + protein + ".filt.braw.gz"
    # braw_file = "/home/vorberg/" + protein + ".gx.gz"
    # braw_file = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/count_correction/braw_ec_correction/" + protein + ".braw.ec.gz"
    # braw_file = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/count_correction/braw/" + protein + ".filt.braw.gz"
    #
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pll/" + protein + ".frobenius.apc.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd/" + protein + ".frobenius.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd_cheating_12_pc100/" + protein + ".frobenius.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd_cheating_12_pc100/" + protein + ".frobenius.apc.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd_cheating_12_pc100/" + protein + ".frobenius.ec.20.log2.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/recover_pcd_cheating_12_incmr_pc100/" + protein + "." + topology + ".frobenius.apc.mat"
    mat_file = "/home/vorberg/work/data/ccmgen/psicov/recover_pcd_cheating_12_incmr_pc100/" + protein + "." + topology + ".frobenius.ec.20.log2.mat"

    # pdb_file = "/home/vorberg/work/data/ccmgen/psicov/pdb/" + protein + ".pdb"
    # # pdb_file=None

    # seqsep = 4
    # # seqsep = 1

    # contact_threshold = 8

    # plot_out = "/home/vorberg/work/plots/ccmgen/psicov/contact_maps/"

    apc = True
    apc = False
    entropy_correction = True
    entropy_correction = False

    ### Compute l2norm score from braw
    if args.braw_file is not None:
        braw_file = args.braw_file
        protein_name = '.'.join(os.path.basename(braw_file).split('.')[:-1])
        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
        lambda_w = np.round(u.find_dict_key("lambda_pair", meta_info),
                            decimals=3)

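        # entropy correction needs the alignment's single-site frequencies;
        # otherwise use the plain l2norm of the couplings (optionally APC-corrected)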
        if entropy_correction:
            alignment = io.read_alignment(alignment_file)
            single_freq, pair_freq = au.calculate_frequencies(
                alignment, au.uniform_pseudocounts)
            mat = bu.compute_corrected_mat_entropy(braw.x_pair,
                                                   single_freq,
                                                   neff,
                                                   lambda_w,
                                                   entropy=True,
                                                   squared=False,
                                                   nr_states=20)
        else:
            mat = bu.compute_l2norm_from_braw(braw, apc)

    ### Read score from mat
    if args.mat_file is not None:
        mat_file = args.mat_file
        mat = io.read_matfile(mat_file)
        if (apc):
            mat = bu.compute_apc_corrected_matrix(mat)
        meta_info = io.read_json_from_mat(mat_file)
        protein_name = os.path.basename(mat_file).split('.')[0]

    correction = ""
    if apc:
        correction = "_apc"
    if entropy_correction:
        correction = "_ec"
    plot_file = plot_out + protein_name + "_seqsep" + str(
        seqsep) + "_contacthr" + str(contact_threshold) + correction + ".html"
    neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
    N = u.find_dict_key("nrow", meta_info)
    L = u.find_dict_key("ncol", meta_info)
    title = protein_name + "<br>L: " + str(L) + " N: " + str(
        N) + " Neff: " + str(neff) + " diversity: " + str(
            np.round(np.sqrt(N) / L, decimals=3))
    plot_contact_map(mat,
                     seqsep,
                     contact_threshold,
                     title,
                     plot_file,
                     alignment_file=alignment_file,
                     pdb_file=pdb_file)
def main():

    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting empirical vs model alignment statistics.')
    parser.add_argument("observed_alignment",   type=str,   help="path to original aligment file")
    parser.add_argument("sampled_alignment",    type=str,   help="path to sampled alignment file")
    parser.add_argument("plot_dir",             type=str,   help="path to output directory for plots")


    args = parser.parse_args()

    observed_alignment_file = args.observed_alignment
    sampled_alignment_file  = args.sampled_alignment
    plot_dir                = args.plot_dir
    max_gap_pos = 50

    ######debugging
    protein="1bkrA"
    observed_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/alignments/" + protein + ".aln"

    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/" + protein + ".star.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pcd_ccmgen_star/"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/" + protein + ".binary.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pcd_ccmgen_binary/"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/" + protein + ".ind.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pcd/"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pll/" + protein + ".ind.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pll/"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_lfactor1e-3_cheating_12/" + protein + ".star.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pcd_1e-3_cheating_12/"

    sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_incmr/" + protein + ".binary.aln"
    plot_dir = "/home/vorberg/"

    #
    # sampled_alignment_file = "/home/vorberg/" + protein + ".binary.5.aln"
    # sampled_alignment_file = "/home/vorberg/" + protein + ".star.5.aln"
    # plot_dir = "/home/vorberg/"



    #read both alignments
    alignment_o = io.read_alignment(observed_alignment_file, max_gap_pos=100, max_gap_seq=100)
    L_original = alignment_o.shape[1]
    alignment_o, gapped_positions = io.remove_gapped_positions(alignment_o, max_gap_percentage=max_gap_pos)
    non_gapped_positions = [i for i in range(L_original) if i not in gapped_positions]
    alignment_s = io.read_alignment(sampled_alignment_file, max_gap_pos=100, max_gap_seq=100)
    alignment_s = np.ascontiguousarray(alignment_s[:, non_gapped_positions])
    print(alignment_o.shape, alignment_s.shape)

    #alignment dimensions
    N_o = alignment_o.shape[0]
    N_s = alignment_s.shape[0]
    L = alignment_o.shape[1]
    div=np.round(np.sqrt(N_o)/L, decimals=3)
    neff_weights_o = np.round(au.compute_neff(alignment_o), decimals=3)
    neff_weights_s = np.round(au.compute_neff(alignment_s), decimals=3)
    neff_entropy_o = np.round(au.compute_neff_hhblits(alignment_o), decimals=3)
    neff_entropy_s = np.round(au.compute_neff_hhblits(alignment_s), decimals=3)

    #compute amino acid frequencies only once (with uniform pseudocounts)
    single_freq_observed, pairwise_freq_observed = au.calculate_frequencies(alignment_o, au.uniform_pseudocounts)
    single_freq_sampled, pairwise_freq_sampled = au.calculate_frequencies(alignment_s, au.uniform_pseudocounts)

    #degap the frequencies (ignore gap frequencies)
    single_freq_observed = au.degap(single_freq_observed, False)
    single_freq_sampled = au.degap(single_freq_sampled, False)
    pairwise_freq_observed = au.degap(pairwise_freq_observed, False)
    pairwise_freq_sampled = au.degap(pairwise_freq_sampled, False)

    #prepare plot properties
    protein = os.path.basename(observed_alignment_file).split(".")[0]
    method = os.path.basename(sampled_alignment_file).split(".")[1]

    title="Observed and model alignment statistics for {0}".format(protein)
    title+="<br>original: N={0}, L={1}, div={2}, neff(weights)={3}, neff(entropy)={4}".format(N_o,L,div,neff_weights_o, neff_entropy_o)
    title+="<br>sampled: N={0}, L={1}, neff(weights)={2}, neff(entropy)={3}".format(N_s,L,neff_weights_s, neff_entropy_s)
    #title=""

    #plot in normal and in log space
    plot_out = plot_dir + "/"+ protein + ".empirical_vs_model_alignment_stats_"+method+".html"
    plot_empirical_vs_model_statistics(
        single_freq_observed, single_freq_sampled,
        pairwise_freq_observed, pairwise_freq_sampled,
        title=title, plot_out=plot_out, log=False, width=1200)

    plot_out = plot_dir + "/"+ protein + ".empirical_vs_model_alignment_stats_"+method+"_log.html"
    plot_empirical_vs_model_statistics(
        single_freq_observed, single_freq_sampled,
        pairwise_freq_observed, pairwise_freq_sampled,
        title=title, plot_out=plot_out, log=True)
def main():

    args = parse_args()

    braw_dir = args.braw_dir
    alignment_dir = args.alignment_dir
    plot_dir = args.plot_dir

    #debug
    braw_dir = "/home/vorberg//work/data/ccmgen/psicov/predictions_pcd/"
    alignment_dir = "/home/vorberg//work/data/ccmgen/psicov/alignments/"
    plot_dir = "/home/vorberg//work/plots/ccmgen/psicov/scatter_apc_vs_ec/pcd/"

    pearson_r_list = []
    proteins = []
    for braw_file in glob.glob(braw_dir + "/*braw.gz"):

        protein_name = os.path.basename(braw_file).split('.')[0]
        proteins.append(protein_name)
        print(protein_name)

        #read braw file
        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta
        neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
        lambda_w = np.round(u.find_dict_key("lambda_pair", meta_info),
                            decimals=3)
        L = braw.ncol

        # read alignment file
        alignment_file = alignment_dir + "/" + protein_name + ".aln"
        alignment = io.read_alignment(alignment_file)
        single_freq, pair_freq = au.calculate_frequencies(
            alignment, au.uniform_pseudocounts)

        #get the highly gapped positions that need to be excluded from analysis
        alignment_ungapped, gapped_positions = io.remove_gapped_positions(
            alignment, max_gap_percentage=50)
        non_gapped_positions = [
            i for i in range(L) if i not in gapped_positions
        ]
        indices_i, indices_j = np.triu_indices(len(non_gapped_positions), k=1)

        #compute ec
        uij, scaling_factor = bu.compute_entropy_correction(single_freq,
                                                            neff,
                                                            lambda_w,
                                                            braw.x_pair,
                                                            entropy=True,
                                                            squared=False,
                                                            nr_states=20)
        ec_term = scaling_factor * np.sqrt(np.sum(uij, axis=(3, 2)))
        ec_term_ungapped = ec_term[non_gapped_positions, :]
        ec_term_ungapped = ec_term_ungapped[:, non_gapped_positions]

        #compute joint EC instead of geometric mean of per-column entropies
        # uij, scaling_factor = bu.compute_joint_entropy_correction(pair_freq, neff, lambda_w, braw.x_pair, nr_states = 20)
        # ec_term = scaling_factor * uij
        # ec_term_ungapped = ec_term[non_gapped_positions, :]
        # ec_term_ungapped = ec_term_ungapped[:, non_gapped_positions]

        #compute contact matrix for ungapped positions
        cmat = bu.compute_l2norm_from_braw(braw.x_pair,
                                           apc=False,
                                           squared=False)

        #compute apc
        cmat_ungapped = cmat[non_gapped_positions, :]
        cmat_ungapped = cmat_ungapped[:, non_gapped_positions]
        mean = np.mean(cmat_ungapped, axis=0)
        apc_term_ungapped = mean[:, np.newaxis] * mean[
            np.newaxis, :] / np.mean(cmat_ungapped)

        #plot
        plot_file = plot_dir + "/" + protein_name + "_apc_vs_ec.html"
        plot_scatter(apc_term_ungapped[indices_i, indices_j],
                     ec_term_ungapped[indices_i, indices_j], [
                         "i: " + str(i) + "<br>j: " + str(j)
                         for i, j in zip(indices_i, indices_j)
                     ], plot_file)

        #compute pearson correlation coefficient
        pearson_r_list.append(
            pearsonr(apc_term_ungapped[indices_i, indices_j],
                     ec_term_ungapped[indices_i, indices_j])[0])

    #plot boxplot with jitter
    plot_file = plot_dir + "/boxplot_pearsonr_apc_vs_ec.html"
    plot_boxplot_correlation(pearson_r_list, proteins, plot_file)