Beispiel #1
0
    def generate_coupling_decoy_set(self, size=1000):
        """Collect coupling matrices of contacts and non-contacts for plotting.

        Walks over the proteins in ``self.training_data``, parses the binary
        raw file ``<self.braw_dir>/<protein>.filt.braw.gz`` of each one and
        gathers the flattened ``[:20, :20]`` slice of ``braw.x_pair[i, j]``
        for residue pairs labelled ``contact == 1`` and ``contact == 0``.
        At most 100 pairs of each kind are taken per protein, and a kind is
        no longer extended once ``size`` entries were collected for it. The
        loop ends early as soon as both kinds reached ``size``.

        Sets on the instance:
            couplings_contacts: list with at most ``size`` flattened
                coupling matrices of contacts.
            couplings_noncontacts: the same for non-contacts.
            avg_lambda_pair: mean of the ``lambda_pair`` regularization
                values found in the meta data of all parsed files.

        Args:
            size: number of coupling matrices to keep per kind.
        """
        max_pairs_per_protein = 100

        print("Get couplings for Plotting (size={0})...".format(size))

        couplings = []
        non_couplings = []
        lambda_pair = []

        # print() with a single argument behaves the same on Python 2 and 3
        print(len(self.training_data))

        for p in self.training_data:

            braw_file_gz = self.braw_dir + "/" + p + ".filt.braw.gz"
            braw = raw.parse_msgpack(braw_file_gz)

            protein_data = self.training_data[p]
            residue_i = np.array(protein_data['residue_i'])
            residue_j = np.array(protein_data['residue_j'])
            contact = np.array(protein_data['contact'])

            # the regularization settings are stored either below
            # 'parameters' or directly in the workflow entry
            workflow = braw.meta['workflow'][0]
            if 'regularization' in workflow['parameters']:
                lambda_pair.append(
                    workflow['parameters']['regularization']['lambda_pair'])
            else:
                lambda_pair.append(workflow['regularization']['lambda_pair'])

            # (pairs of one kind, list collecting the couplings of that kind)
            pair_sets = (
                (contact == 1, couplings),
                (contact == 0, non_couplings),
            )
            for mask, collected in pair_sets:
                if len(collected) >= size:
                    continue
                selected_i = residue_i[mask][:max_pairs_per_protein]
                selected_j = residue_j[mask][:max_pairs_per_protein]
                for i, j in zip(selected_i, selected_j):
                    collected.append(braw.x_pair[i, j][:20, :20].flatten())

            # stop condition
            if len(non_couplings) >= size and len(couplings) >= size:
                break

        self.couplings_contacts = couplings[:size]
        self.couplings_noncontacts = non_couplings[:size]
        self.avg_lambda_pair = np.mean(lambda_pair)
    def f_df_py_protein_parallel(protein,
                                 parameters_transformed_back,
                                 prec_wrt_L,
                                 nr_components,
                                 fixed_parameters,
                                 status=True,
                                 compute_gradients=True):
        """Compute the likelihood value and its gradients for one protein.

        Args:
            protein: tuple ``(protein_name, protein_data)``; ``protein_data``
                provides the keys 'braw_file_path', 'qijabfilename',
                'residue_i', 'residue_j' and 'contact'.
            parameters_transformed_back: mapping that holds, for every
                component ``k``, the precision values under ``'prec_<k>'``.
                It is not modified by this function.
            prec_wrt_L: if true, the precision values are multiplied by the
                number of columns ``L`` of the protein's raw file.
            nr_components: number of components.
            fixed_parameters: forwarded to
                ``LikelihoodProtein.set_parameters_parallel``.
            status: print a progress line when true.
            compute_gradients: forwarded to ``compute_f_df``.

        Returns:
            Tuple ``(f_protein, grad_protein)`` as returned by
            ``LikelihoodProtein.get_f()`` and ``get_gradients()``
            (``{}`` if no gradients are calculated).
        """
        protein_name, protein_data = protein
        if status:
            print("Compute likelihood and gradients for {0}".format(
                protein_name))

        braw = raw.parse_msgpack(protein_data['braw_file_path'])
        Nij, qij = io.read_qij(protein_data['qijabfilename'], braw.ncol)
        L = braw.ncol

        precision_keys = [
            'prec_' + str(component) for component in range(nr_components)
        ]

        #prepare parameters
        if prec_wrt_L:
            # Scale on a shallow copy: rescaling the caller's dict in place
            # would compound the factor L whenever the same dict is reused
            # for another protein.
            parameters_transformed_back = dict(parameters_transformed_back)
            for parameter in precision_keys:
                parameters_transformed_back[parameter] = list(
                    np.array(parameters_transformed_back[parameter]) * L)

        # diagonal covariance (inverse precision) and log determinant of the
        # diagonal precision matrix of every component
        covMatdiag = np.zeros((nr_components, 400))
        log_det_precMat = np.zeros(nr_components)
        for component, parameter in enumerate(precision_keys):
            covMatdiag[component] = 1.0 / np.array(
                parameters_transformed_back[parameter])
            log_det_precMat[component] = np.sum(
                np.log(parameters_transformed_back[parameter]))

        lik_protein = LikelihoodProtein(braw, Nij, qij)

        lik_protein.set_pairs(protein_data['residue_i'],
                              protein_data['residue_j'],
                              protein_data['contact'])

        lik_protein.set_parameters_parallel(parameters_transformed_back,
                                            covMatdiag, log_det_precMat,
                                            nr_components, fixed_parameters,
                                            prec_wrt_L)

        #compute f and gradients
        lik_protein.compute_f_df(compute_gradients=compute_gradients)
        f_protein = lik_protein.get_f()
        grad_protein = lik_protein.get_gradients()

        return f_protein, grad_protein
def generate_coupling_decoy_set(size, braw_dir, pdb_dir):
    """Collect coupling matrices of contacts and non-contacts from raw files.

    Looks at the first ``size`` files matching ``<braw_dir>/*braw*``. For
    each of them the residue pair indices of contacts and non-contacts are
    determined from ``<pdb_dir>/<protein>.pdb`` via
    ``pdb.determine_residue_pair_indices`` and the flattened ``[:20, :20]``
    slice of ``braw.x_pair[i, j]`` is stored for at most 100 pairs of each
    kind. A kind is no longer extended once ``size`` entries were collected
    for it, and the loop ends as soon as both kinds reached ``size``.
    Files that cannot be parsed are reported and skipped.

    Args:
        size: number of coupling matrices to collect per kind.
        braw_dir: directory with the binary raw files.
        pdb_dir: directory with the pdb files.

    Returns:
        dict with the keys 'contact' and 'bg'; each value is the transposed
        numpy array built from at most ``size`` collected coupling matrices.
    """
    seqsep = 8
    non_contact_thr = 25
    contact_thr = 8
    max_pairs_per_protein = 100

    couplings = []
    non_couplings = []

    braw_files = glob.glob(braw_dir + "/*braw*")

    for braw_file in braw_files[:size]:
        p = os.path.basename(braw_file).split(".")[0]

        pdb_file = pdb_dir + "/" + p + ".pdb"
        try:
            braw = raw.parse_msgpack(braw_file)
        except Exception:
            # skip unreadable files, but let KeyboardInterrupt/SystemExit
            # propagate
            print("Problems reading {0}".format(braw_file))
            continue

        indices_contact, indices_non_contact = pdb.determine_residue_pair_indices(
            pdb_file, seqsep, non_contact_thr, contact_thr
        )

        if len(couplings) < size and len(indices_contact) > 0:
            nr_pairs = min(len(indices_contact[0]), max_pairs_per_protein)
            for index in range(nr_pairs):
                i = indices_contact[0][index]
                j = indices_contact[1][index]
                couplings.append(braw.x_pair[i, j][:20, :20].flatten())

        # Collected independently of the contacts: nested under the contact
        # branch, non-contacts would stop growing once enough contacts were
        # found and the stop condition below would never be reached again.
        if len(non_couplings) < size and len(indices_non_contact) > 0:
            nr_pairs = min(len(indices_non_contact[0]), max_pairs_per_protein)
            for index in range(nr_pairs):
                i = indices_non_contact[0][index]
                j = indices_non_contact[1][index]
                non_couplings.append(braw.x_pair[i, j][:20, :20].flatten())

        # stop condition
        if len(non_couplings) >= size and len(couplings) >= size:
            break

    evaluation_set = {
        'contact': np.array(couplings[:size]).transpose(),
        'bg': np.array(non_couplings[:size]).transpose(),
    }

    return evaluation_set
    def generate_coupling_decoy_set(self, size=1000):
        """Gather coupling matrices of contacts and non-contacts for plotting.

        For every protein in ``self.training_data`` the binary raw file
        ``<self.braw_dir>/<protein>.filt.braw.gz`` is parsed. The flattened
        ``[:20, :20]`` slice of ``braw.x_pair[i, j]`` is stored for residue
        pairs labelled ``contact == 1`` and ``contact == 0``, at most 100
        pairs of each kind per protein. A kind is no longer extended once it
        holds ``size`` entries; the loop stops when both kinds reached
        ``size``.

        Sets on the instance:
            couplings_contacts: at most ``size`` flattened coupling matrices
                of contacts.
            couplings_noncontacts: at most ``size`` flattened coupling
                matrices of non-contacts.
            avg_lambda_pair: mean ``lambda_pair`` regularization value over
                the meta data of all parsed files.

        Args:
            size: number of coupling matrices to keep per kind.
        """
        max_pairs_per_protein = 100

        print("Get couplings for Plotting (size={0})...".format(size))

        couplings = []
        non_couplings = []
        lambda_pair = []

        # single-argument print() is valid on Python 2 and Python 3
        print(len(self.training_data))

        for p in self.training_data:

            braw_file_gz = self.braw_dir + "/" + p + ".filt.braw.gz"
            braw = raw.parse_msgpack(braw_file_gz)

            protein_data = self.training_data[p]
            residue_i = np.array(protein_data['residue_i'])
            residue_j = np.array(protein_data['residue_j'])
            contact = np.array(protein_data['contact'])

            # regularization settings live either below 'parameters' or
            # directly in the workflow entry
            workflow = braw.meta['workflow'][0]
            if 'regularization' in workflow['parameters']:
                lambda_pair.append(workflow['parameters']['regularization']['lambda_pair'])
            else:
                lambda_pair.append(workflow['regularization']['lambda_pair'])

            is_contact = contact == 1
            is_non_contact = contact == 0

            if len(couplings) < size:
                pairs = zip(residue_i[is_contact][:max_pairs_per_protein],
                            residue_j[is_contact][:max_pairs_per_protein])
                for i, j in pairs:
                    couplings.append(braw.x_pair[i, j][:20, :20].flatten())

            if len(non_couplings) < size:
                pairs = zip(residue_i[is_non_contact][:max_pairs_per_protein],
                            residue_j[is_non_contact][:max_pairs_per_protein])
                for i, j in pairs:
                    non_couplings.append(braw.x_pair[i, j][:20, :20].flatten())

            # stop condition
            if len(non_couplings) >= size and len(couplings) >= size:
                break

        self.couplings_contacts = couplings[:size]
        self.couplings_noncontacts = non_couplings[:size]
        self.avg_lambda_pair = np.mean(lambda_pair)
Beispiel #5
0
    def contact_likelihood_py(self, coupling_prior_parameter_file, braw_file,
                              qij_file):
        """Evaluate the coupling-prior log likelihood for the stored pairs.

        Loads the coupling prior parameters (plus their ``.settings``
        metadata) and evaluates the pairwise negative log likelihood of the
        residue pairs ``self.residues_i`` / ``self.residues_j`` twice: once
        with every pair labelled 0 and once with every pair labelled 1
        (presumably the non-contact / contact class -- confirm against
        ``LikelihoodProtein.set_pairs``). The negated values are written to
        ``self.contact_log_likelihood_mat[label, i, j]``, a zero-initialised
        array of shape ``(2, self.L, self.L)``.

        Also sets ``self.contact_likelihood_parameters`` and
        ``self.contact_likelihood_meta`` (input files, sequence separation
        and the parameter settings).

        Args:
            coupling_prior_parameter_file: path of the parameter file; the
                metadata is read from this path with ``.settings`` appended.
            braw_file: path of the binary raw file.
            qij_file: path of the qij file.
        """
        # load parameters for coupling prior
        prior = self.contact_likelihood_parameters = Parameters("")
        prior.read_parameters_metadata(
            coupling_prior_parameter_file + ".settings")
        prior.read_parameters(coupling_prior_parameter_file, transform=True)

        braw = raw.parse_msgpack(braw_file)
        Nij, qij = io.read_qij(qij_file, braw.ncol)

        # likelihood object of the coupling prior for this single protein
        lik_protein = LikelihoodProtein(braw, Nij, qij)
        lik_protein.set_parameters(prior)

        labels = (0, 1)
        self.contact_log_likelihood_mat = np.zeros(
            (len(labels), self.L, self.L))

        for label in labels:
            pair_labels = [label] * len(self.residues_i)
            lik_protein.set_pairs(self.residues_i, self.residues_j,
                                  pair_labels)
            lik_protein.compute_f_df(compute_gradients=False)

            log_likelihood = -np.array(lik_protein.get_f_pairwise())
            self.contact_log_likelihood_mat[
                label, self.residues_i, self.residues_j] = log_likelihood

        # collect meta data
        self.contact_likelihood_meta = {
            'dataset': {
                'braw_file': braw_file,
                'qij_file': qij_file,
                'sequence_separation': self.sequence_separation,
            },
            'parameters': prior.get_settings(),
        }
    def f_df_py_protein_parallel(protein, parameters_transformed_back, prec_wrt_L, nr_components, fixed_parameters, status=True, compute_gradients=True):
        """Compute likelihood value and gradients for a single protein.

        Args:
            protein: tuple ``(protein_name, protein_data)``; ``protein_data``
                provides the keys 'braw_file_path', 'qijabfilename',
                'residue_i', 'residue_j' and 'contact'.
            parameters_transformed_back: mapping with the precision values
                of component ``k`` stored under ``'prec_<k>'``. The mapping
                passed in is left untouched.
            prec_wrt_L: if true, precision values are multiplied by the
                number of columns ``L`` of the protein's raw file.
            nr_components: number of components.
            fixed_parameters: passed on to
                ``LikelihoodProtein.set_parameters_parallel``.
            status: print a progress line when true.
            compute_gradients: passed on to ``compute_f_df``.

        Returns:
            Tuple ``(f_protein, grad_protein)`` from
            ``LikelihoodProtein.get_f()`` and ``get_gradients()``
            (``{}`` if no gradients are calculated).
        """
        protein_name, protein_data = protein
        if status:
            print("Compute likelihood and gradients for {0}".format(protein_name))

        braw = raw.parse_msgpack(protein_data['braw_file_path'])
        Nij, qij = io.read_qij(protein_data['qijabfilename'], braw.ncol)
        L = braw.ncol

        precision_keys = ['prec_' + str(component) for component in range(nr_components)]

        #prepare parameters
        if prec_wrt_L:
            # Rescale a shallow copy instead of the caller's dict; in-place
            # scaling would multiply by L again on every reuse of the dict.
            parameters_transformed_back = dict(parameters_transformed_back)
            for parameter in precision_keys:
                parameters_transformed_back[parameter] = list(np.array(parameters_transformed_back[parameter]) * L)

        # diagonal covariance (inverse precision) and log determinant of the
        # diagonal precision matrix per component
        covMatdiag = np.zeros((nr_components, 400))
        log_det_precMat = np.zeros(nr_components)
        for component, parameter in enumerate(precision_keys):
            covMatdiag[component] = 1.0 / np.array(parameters_transformed_back[parameter])
            log_det_precMat[component] = np.sum(np.log(parameters_transformed_back[parameter]))

        lik_protein = LikelihoodProtein(braw, Nij, qij)

        lik_protein.set_pairs(
            protein_data['residue_i'],
            protein_data['residue_j'],
            protein_data['contact']
        )

        lik_protein.set_parameters_parallel(
            parameters_transformed_back, covMatdiag, log_det_precMat, nr_components, fixed_parameters, prec_wrt_L)

        #compute f and gradients
        lik_protein.compute_f_df(compute_gradients=compute_gradients)
        f_protein = lik_protein.get_f()
        grad_protein = lik_protein.get_gradients()

        return f_protein, grad_protein
    def contact_likelihood_py(self, coupling_prior_parameter_file, braw_file, qij_file):
        """Compute coupling-prior log likelihoods for the stored residue pairs.

        The coupling prior parameters and their ``.settings`` metadata are
        read first. Then, for the labels 0 and 1 (presumably non-contact and
        contact -- confirm against ``LikelihoodProtein.set_pairs``), all
        pairs ``self.residues_i`` / ``self.residues_j`` are assigned that
        label and their pairwise negative log likelihood is evaluated. The
        negated result is stored in
        ``self.contact_log_likelihood_mat[label, i, j]``, an array of shape
        ``(2, self.L, self.L)`` initialised with zeros.

        ``self.contact_likelihood_parameters`` and
        ``self.contact_likelihood_meta`` are set as well.

        Args:
            coupling_prior_parameter_file: parameter file path; metadata is
                read from the same path with ``.settings`` appended.
            braw_file: path of the binary raw file.
            qij_file: path of the qij file.
        """
        # load parameters for coupling prior
        self.contact_likelihood_parameters = Parameters("")
        parameters = self.contact_likelihood_parameters
        settings_file = coupling_prior_parameter_file + ".settings"
        parameters.read_parameters_metadata(settings_file)
        parameters.read_parameters(coupling_prior_parameter_file, transform=True)

        braw = raw.parse_msgpack(braw_file)
        Nij, qij = io.read_qij(qij_file, braw.ncol)

        # object computing the neg likelihood of the coupling prior for one protein
        lik_protein = LikelihoodProtein(braw, Nij, qij)
        lik_protein.set_parameters(parameters)

        labels = [0, 1]
        nr_pairs = len(self.residues_i)
        self.contact_log_likelihood_mat = np.zeros((len(labels), self.L, self.L))

        for label in labels:
            lik_protein.set_pairs(self.residues_i, self.residues_j, [label] * nr_pairs)
            lik_protein.compute_f_df(compute_gradients=False)

            neg_log_likelihood = np.array(lik_protein.get_f_pairwise())
            self.contact_log_likelihood_mat[label, self.residues_i, self.residues_j] = -neg_log_likelihood

        # collect meta data
        dataset_meta = {
            'braw_file': braw_file,
            'qij_file': qij_file,
            'sequence_separation': self.sequence_separation,
        }
        self.contact_likelihood_meta = {
            'dataset': dataset_meta,
            'parameters': parameters.get_settings(),
        }
Beispiel #8
0
def main():
    """Plot coupling values of contacts against alignment statistics.

    Command line arguments (all positional): ``braw_dir``, ``pdb_dir``,
    ``alignment_dir``, ``nr_couplings``, ``plot_out``, ``max_per_protein``.

    For every file matching ``<braw_dir>/*braw*`` the residue pairs with a
    distance below 8 in ``<pdb_dir>/<protein>.pdb`` are determined, pairs
    that involve an alignment column with more than 20% gaps (entries equal
    to 0 in ``<alignment_dir>/<protein>.filt.psc``) are dropped, and a
    random sample of at most ``max_per_protein`` of the remaining pairs
    contributes its 20x20 coupling matrix (flattened to 400 values) together
    with L, Neff, diversity, the coupling sum and ``0.2 * L / Neff``.
    Collection stops once more than ``nr_couplings`` rows were gathered.
    Four html plots are written below ``plot_out``.

    The parsed command line arguments are used as given; they are not
    replaced by hard-coded debugging paths.

    Raises:
        IOError: if ``braw_dir`` does not exist.
    """

    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir", type=str, help="path to binary raw files")
    parser.add_argument("pdb_dir", type=str, help="path to pdb files")
    parser.add_argument("alignment_dir",
                        type=str,
                        help="path to alignment files")
    parser.add_argument("nr_couplings",
                        type=int,
                        default=10000,
                        help="number of couplings")
    parser.add_argument("plot_out", type=str, help="path to plot file")
    parser.add_argument("max_per_protein",
                        type=int,
                        default=100,
                        help="maximum numbr couplings per protein")

    args = parser.parse_args()

    braw_dir = args.braw_dir
    pdb_dir = args.pdb_dir
    alignment_dir = args.alignment_dir
    nr_couplings = args.nr_couplings
    plot_out = args.plot_out
    max_per_protein = args.max_per_protein

    if not os.path.exists(braw_dir):
        raise IOError("Braw Path {0} does not exist.".format(braw_dir))

    # Per-protein frames are concatenated once at the end; appending to a
    # growing DataFrame copies all rows on every iteration.
    frames = []
    nr_collected = 0

    braw_files = glob.glob(braw_dir + "/*braw*")
    for braw_file in braw_files:

        if nr_collected > nr_couplings:
            break

        protein = os.path.basename(braw_file).split(".")[0]
        print(protein)

        #-------------get couplings and metadata -----------------------------
        braw = raw.parse_msgpack(braw_file)
        msa_meta = braw.meta['workflow'][0]['parameters']['msafile']
        neff = msa_meta['neff']
        L = msa_meta['ncol']
        N = msa_meta['nrow']
        diversity = np.sqrt(N) / L

        #-------------filter contacts ----------------------------------------
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        dist_matrix = pdb.distance_map(pdb_file)

        # True for residue pairs whose distance is below the threshold of 8
        contact_map = dist_matrix < 8

        # every pair once (upper triangle without the diagonal)
        contact_i, contact_j = np.where(np.triu(contact_map, k=1))

        #--------------filter gap columns ------------------------------------
        psicov_file = alignment_dir + "/" + protein + ".filt.psc"
        psicov = io.read_alignment(psicov_file)

        # columns in which more than 20% of the entries are 0
        gapped_columns = set(
            column for column in range(L)
            if float(psicov[:, column].tolist().count(0)) / N > 0.2)

        # keep only pairs where neither residue lies in a gapped column
        keep = np.array(
            [i not in gapped_columns and j not in gapped_columns
             for i, j in zip(contact_i, contact_j)],
            dtype=bool)
        contact_i = contact_i[keep]
        contact_j = contact_j[keep]

        nr_contacts = len(contact_i)

        if nr_contacts == 0:
            continue

        random_sample = np.random.choice(nr_contacts,
                                         replace=False,
                                         size=min(max_per_protein,
                                                  nr_contacts))
        couplings = braw.x_pair[
            contact_i[random_sample],
            contact_j[random_sample], :20, :20].reshape(
                len(random_sample), 400)
        df = pd.DataFrame(couplings)
        df['L'] = L
        df['Neff'] = neff
        df['Diversity'] = diversity
        df['sum_wij'] = couplings.sum(1)
        df['ratio_0.2L_Neff'] = 0.2 * L / neff

        frames.append(df)
        nr_collected += len(df)
        print("nr of couplings: {0}".format(nr_collected))

    if frames:
        coupling_df = pd.concat(frames)
    else:
        coupling_df = pd.DataFrame(columns=list(range(400)) + ['Neff'])

    # (column of coupling_df, tag used in the plot file name)
    plot_targets = (
        ('Neff', 'neff'),
        ('Diversity', 'diversity'),
        ('L', 'L'),
        ('ratio_0.2L_Neff', 'ratio_0.2L_Neff'),
    )
    for column, tag in plot_targets:
        plot_file = plot_out + "/coupling_matrix_" + tag + "_" + str(
            nr_couplings) + ".html"
        plots.plot_coupling_vs_neff(coupling_df, column, plot_file)
Beispiel #9
0
def main():
    """Plot entropy and/or count-statistic corrections for one residue pair.

    Command line: positional ``binary_raw_file``, ``alignment_file`` and
    ``plot_out``; options ``--residue_i``, ``--residue_j``,
    ``--entropy-correction`` and ``--count-statistic-correction``.

    Reads the couplings and the ``lambda_pair`` regularization value from
    the binary raw file, computes Neff and amino acid frequencies from the
    alignment and, for each requested correction, calls
    ``bu.compute_correction_ij`` and draws a "heatmap" and a "bubble" plot
    via ``plot_correction``.

    The parsed command line arguments are used as given; they are not
    replaced by hard-coded debugging values.

    Raises:
        IOError: if ``binary_raw_file`` does not exist.
        SystemExit: raised by argparse for invalid arguments, including a
            requested correction plot without both residue positions.
    """

    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("binary_raw_file",  type=str,   help="path to binary_raw_file")
    parser.add_argument("alignment_file",  type=str,   help="path to alignment_file")
    parser.add_argument("plot_out",         type=str,   help="path to plot file")
    # An option string cannot be combined with a positional name in one
    # add_argument call (argparse raises ValueError); the attribute name is
    # given through dest instead.
    parser.add_argument("--residue_i",  dest="residue_i",    default=None, type=int,   help="position of residue i")
    parser.add_argument("--residue_j",  dest="residue_j",    default=None, type=int,   help="position of residue j")
    parser.add_argument("--entropy-correction", dest="entropy_correction",   default=False, action="store_true",   help="plot entropy correction")
    parser.add_argument("--count-statistic-correction", dest="count_statistic_correction",    default=False, action="store_true",  help="plot coutn stat correction")

    args = parser.parse_args()

    binary_raw_file             = args.binary_raw_file
    alignment_file              = args.alignment_file
    plot_out                    = args.plot_out
    residue_i                   = args.residue_i
    residue_j                   = args.residue_j
    plot_entropy_correction          = args.entropy_correction
    plot_count_statistic_correction  = args.count_statistic_correction

    # both residue positions are used as indices below; fail early with a
    # usage message instead of a TypeError on None
    correction_requested = plot_entropy_correction or plot_count_statistic_correction
    if correction_requested and (residue_i is None or residue_j is None):
        parser.error("--residue_i and --residue_j are required to plot a correction")

    if not os.path.exists(binary_raw_file):
        raise IOError("Braw file " + str(binary_raw_file) + " cannot be found. ")

    #get couplings and lambda_w
    braw = raw.parse_msgpack(binary_raw_file)
    alignment = io.read_alignment(alignment_file)
    protein = os.path.basename(binary_raw_file).split(".")[0]
    L = braw.ncol
    # the regularization settings are stored either directly in the workflow
    # entry or below 'parameters'
    workflow = braw.meta['workflow'][0]
    if "regularization" in workflow:
        lambda_w = workflow['regularization']['lambda_pair']
    else:
        lambda_w = workflow['parameters']['regularization']['lambda_pair']

    #read amino acid frequencies with pseudo-counts
    neff = au.compute_neff(alignment)
    single_freq, pair_freq = au.calculate_frequencies(alignment, au.uniform_pseudocounts)

    # (correction_type passed to plot_correction, entropy flag of the computation)
    requested_corrections = []
    if plot_entropy_correction:
        requested_corrections.append(("entropy", True))
    if plot_count_statistic_correction:
        requested_corrections.append(("count-statistic", False))

    for correction_type, use_entropy in requested_corrections:
        ui, correction, eta = bu.compute_correction_ij(single_freq, neff, lambda_w, braw.x_pair, residue_i, residue_j, entropy=use_entropy, squared=True)
        for plot_type in ("heatmap", "bubble"):
            plot_correction(correction, ui[residue_i-1, :], ui[residue_j-1, :], protein, residue_i, residue_j, eta, plot_out, correction_type=correction_type, plot_type=plot_type)
def main():
    """Plot coupling values of contacts against alignment statistics.

    Command line arguments (all positional): ``braw_dir``, ``pdb_dir``,
    ``alignment_dir``, ``nr_couplings``, ``plot_out``, ``max_per_protein``.

    For every file matching ``<braw_dir>/*braw*`` the residue pairs with a
    distance below 8 in ``<pdb_dir>/<protein>.pdb`` are determined, pairs
    involving an alignment column with more than 20% gaps (entries equal to
    0 in ``<alignment_dir>/<protein>.filt.psc``) are removed, and a random
    sample of at most ``max_per_protein`` remaining pairs contributes its
    20x20 coupling matrix (flattened to 400 values) plus L, Neff, diversity,
    the coupling sum and ``0.2 * L / Neff``. Collection stops once more than
    ``nr_couplings`` rows were gathered. Four html plots are written below
    ``plot_out``.

    The parsed command line arguments are used as given; they are not
    replaced by hard-coded debugging paths.

    Raises:
        IOError: if ``braw_dir`` does not exist.
    """

    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir",         type=str,   help="path to binary raw files")
    parser.add_argument("pdb_dir",          type=str,   help="path to pdb files")
    parser.add_argument("alignment_dir",    type=str,   help="path to alignment files")
    parser.add_argument("nr_couplings",     type=int,   default=10000, help="number of couplings")
    parser.add_argument("plot_out",         type=str,   help="path to plot file")
    parser.add_argument("max_per_protein",  type=int,   default=100, help="maximum numbr couplings per protein")

    args = parser.parse_args()

    braw_dir        = args.braw_dir
    pdb_dir         = args.pdb_dir
    alignment_dir   = args.alignment_dir
    nr_couplings    = args.nr_couplings
    plot_out        = args.plot_out
    max_per_protein = args.max_per_protein

    if not os.path.exists(braw_dir):
        raise IOError("Braw Path {0} does not exist.".format(braw_dir))

    # Frames are collected and concatenated once after the loop; appending to
    # a growing DataFrame copies all previous rows on each iteration.
    frames = []
    nr_collected = 0

    braw_files = glob.glob(braw_dir + "/*braw*")
    for braw_file in braw_files:

        if nr_collected > nr_couplings:
            break

        protein = os.path.basename(braw_file).split(".")[0]
        print(protein)

        #-------------get couplings and metadata ---------------------------------------------------------------------
        braw = raw.parse_msgpack(braw_file)
        msa_meta = braw.meta['workflow'][0]['parameters']['msafile']
        neff = msa_meta['neff']
        L = msa_meta['ncol']
        N = msa_meta['nrow']
        diversity = np.sqrt(N)/L
        #-------------------------------------------------------------------------------------------------------------

        #-------------filter contacts -------------------------------------------------------------------------------
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        dist_matrix = pdb.distance_map(pdb_file)

        # True for residue pairs whose distance is below the threshold of 8
        contact_map = dist_matrix < 8

        # every pair once (upper triangle without the diagonal)
        contact_i, contact_j = np.where(np.triu(contact_map, k=1))
        #-------------------------------------------------------------------------------------------------------------

        #--------------filter gap columns ---------------------------------------------------------------------------
        psicov_file = alignment_dir + "/" + protein + ".filt.psc"
        psicov = io.read_alignment(psicov_file)

        # columns in which more than 20% of the entries are 0
        gapped_columns = set(
            column for column in range(L)
            if float(psicov[:, column].tolist().count(0)) / N > 0.2)

        # keep only pairs where neither residue lies in a gapped column
        keep = np.array(
            [i not in gapped_columns and j not in gapped_columns for i, j in zip(contact_i, contact_j)],
            dtype=bool)
        contact_i = contact_i[keep]
        contact_j = contact_j[keep]
        #-------------------------------------------------------------------------------------------------------------

        nr_contacts = len(contact_i)

        if nr_contacts == 0:
            continue

        random_sample = np.random.choice(nr_contacts, replace=False, size=min(max_per_protein, nr_contacts))
        couplings = braw.x_pair[contact_i[random_sample], contact_j[random_sample], :20, :20].reshape(len(random_sample), 400)
        df = pd.DataFrame(couplings)
        df['L'] = L
        df['Neff'] = neff
        df['Diversity'] = diversity
        df['sum_wij'] = couplings.sum(1)
        df['ratio_0.2L_Neff'] = 0.2 * L / neff

        frames.append(df)
        nr_collected += len(df)
        print("nr of couplings: {0}".format(nr_collected))

    if frames:
        coupling_df = pd.concat(frames)
    else:
        coupling_df = pd.DataFrame(columns=list(range(400)) + ['Neff'])

    # (column of coupling_df, tag used in the plot file name)
    plot_targets = (
        ('Neff', 'neff'),
        ('Diversity', 'diversity'),
        ('L', 'L'),
        ('ratio_0.2L_Neff', 'ratio_0.2L_Neff'),
    )
    for column, tag in plot_targets:
        plot_file = plot_out + "/coupling_matrix_" + tag + "_" + str(nr_couplings) + ".html"
        plots.plot_coupling_vs_neff(coupling_df, column, plot_file)
def collect_data(braw_dir, alignment_dir, pdb_dir, ab, cd, cb_lower, cb_upper):
    """Collect couplings of two amino acid pairs for pairs in a distance range.

    Iterates over the files ``<braw_dir>/*braw.gz``. A protein is skipped
    when its pdb, braw or alignment file is missing, or when its diversity
    ``sqrt(N) / L`` is below 0.3. Positions whose gap fraction
    ``1 - Ni / neff`` exceeds 0.3 are masked in the distance map. From the
    residue pairs with ``cb_lower < distance < cb_upper`` only those are
    kept whose evidence ``Nij * q_i * q_j`` exceeds 80 for both amino acid
    pairs. At most 500 couplings per protein are added for each pair, and
    the iteration stops once 5000 couplings were gathered for ``ab``.

    Args:
        braw_dir: directory with the ``*braw.gz`` files.
        alignment_dir: directory with the ``<protein>.filt.psc`` files.
        pdb_dir: directory with the ``<protein>.pdb`` files.
        ab: label of the first amino acid pair; the characters at positions
            0 and 2 are looked up in ``io.AMINO_INDICES``.
        cd: label of the second amino acid pair, read the same way.
        cb_lower: exclusive lower bound on the distance map values.
        cb_upper: exclusive upper bound on the distance map values.

    Returns:
        dict mapping ``ab`` and ``cd`` to the lists of collected couplings.
    """
    # fixed settings of the collection
    per_protein_limit = 500
    sequence_separation = 10
    min_evidence = 80
    total_limit = 5000
    min_diversity = 0.3

    couplings = {ab: [], cd: []}

    # the two amino acids of a pair sit at positions 0 and 2 of its label
    a, b = ab[0], ab[2]
    c, d = cd[0], cd[2]

    # iterate over proteins
    for braw_file in glob.glob(braw_dir + "/*braw.gz"):
        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        # report the first missing input file (checked in this order)
        required_files = (
            ("PDB file {0} does not exist. Skip this protein.", pdb_file),
            ("Braw file {0} does not exist. Skip this protein.", braw_file),
            ("Alignment file {0} does not exist. Skip this protein.",
             alignment_file),
        )
        missing = next(
            ((message, path) for message, path in required_files
             if not os.path.exists(path)),
            None)
        if missing is not None:
            print(missing[0].format(missing[1]))
            continue

        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < min_diversity:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        # mask rows and columns of highly gapped positions
        gap_fraction = 1 - (AF.Ni / AF.neff)
        gapped_positions = np.where(np.array(gap_fraction) > 0.3)[0]
        distance_map[gapped_positions, :] = np.nan
        distance_map[:, gapped_positions] = np.nan

        # residue pairs inside the requested distance range
        in_range = (distance_map > cb_lower) & (distance_map < cb_upper)
        residue_i, residue_j = np.where(in_range)

        Nij = AF.Nij[residue_i, residue_j]
        idx_a = io.AMINO_INDICES[a]
        idx_b = io.AMINO_INDICES[b]
        idx_c = io.AMINO_INDICES[c]
        idx_d = io.AMINO_INDICES[d]

        evidence_ab = (Nij * AF.single_frequencies[residue_i, idx_a]
                       * AF.single_frequencies[residue_j, idx_b])
        evidence_cd = (Nij * AF.single_frequencies[residue_i, idx_c]
                       * AF.single_frequencies[residue_j, idx_d])

        enough_evidence = ((evidence_ab > min_evidence)
                           & (evidence_cd > min_evidence))
        residue_i = residue_i[enough_evidence]
        residue_j = residue_j[enough_evidence]

        if len(residue_i) == 0:
            continue

        for label, first, second in ((ab, idx_a, idx_b), (cd, idx_c, idx_d)):
            values = braw.x_pair[residue_i, residue_j, first, second].tolist()
            couplings[label].extend(values[:per_protein_limit])

        print("\nprotein {0}  size: {1}".format(protein, len(couplings[ab])))

        # stop condition: enough couplings collected
        if len(couplings[ab]) >= total_limit:
            break

    return couplings
Beispiel #12
0
def collect_data(braw_dir, alignment_dir, pdb_dir, ab):
    """Collect couplings w_ij(a, b) grouped into Cb-distance bins.

    Every ``*braw.gz`` file found in ``braw_dir`` is treated as one protein.
    A protein is skipped when its PDB file or alignment file is missing, or
    when its alignment diversity ``sqrt(N) / L`` is below 0.3.  For each
    remaining protein and each distance bin, residue pairs whose distance lies
    strictly between the bin bounds are selected, filtered by the evidence
    ``Nij * q_i(a) * q_j(b)``, and at most 500 couplings per protein are
    appended to the bin.

    Args:
        braw_dir: directory containing ``<protein>....braw.gz`` files.
        alignment_dir: directory containing ``<protein>.filt.psc`` files.
        pdb_dir: directory containing ``<protein>.pdb`` files.
        ab: amino acid pair written as ``"a-b"``; characters 0 and 2 are
            used as keys into ``io.AMINO_INDICES``.

    Returns:
        dict mapping a bin name to ``{'couplings': list, 'lower': number,
        'upper': number}``.
    """

    # define distance bins (bounds are exclusive; note that the bins overlap)
    # NOTE(review): distances are presumably in Angstrom -- confirm against
    # pdb.distance_map.
    couplings_per_bin = {
        'bin1': {'couplings': [], 'lower': 0, 'upper': 8},
        'bin2': {'couplings': [], 'lower': 5, 'upper': 10},
        'bin3': {'couplings': [], 'lower': 8, 'upper': 12},
        'bin4': {'couplings': [], 'lower': 10, 'upper': 15},
        'bin5': {'couplings': [], 'lower': 20, 'upper': 50},
    }

    max_nr_couplings_per_protein = 500
    sequence_separation = 10
    evidence_threshold = 100
    max_couplings_per_bin = 10000
    a = ab[0]
    b = ab[2]

    # bins are always visited in the same (reverse alphabetical) order
    bin_names = sorted(couplings_per_bin.keys(), reverse=True)

    # iterate over proteins
    for braw_file in glob.glob(braw_dir + "/*braw.gz"):

        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(
                pdb_file))
            continue

        if not os.path.exists(braw_file):
            print("Braw file {0} does not exist. Skip this protein.".format(
                braw_file))
            continue

        if not os.path.exists(alignment_file):
            print(
                "Alignment file {0} does not exist. Skip this protein.".format(
                    alignment_file))
            continue

        # The two trailing arguments are passed through as-is; their meaning
        # is defined by AlignmentFeatures, which is not visible here.
        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < 0.3:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        # mask highly gapped positions: NaN never satisfies the distance
        # comparisons below, so these rows/columns are never selected
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        index_a = io.AMINO_INDICES[a]
        index_b = io.AMINO_INDICES[b]

        # iterate over pairs for bins
        for bin_name in bin_names:
            bin_couplings = couplings_per_bin[bin_name]['couplings']

            if len(bin_couplings) >= max_couplings_per_bin:
                continue

            cb_lower = couplings_per_bin[bin_name]['lower']
            cb_upper = couplings_per_bin[bin_name]['upper']

            residue_i, residue_j = np.where((distance_map > cb_lower)
                                            & (distance_map < cb_upper))

            # Pair counts must be indexed with (i, j).  The previous
            # [residue_i, residue_i] lookup read the diagonal entry N_ii and
            # therefore ignored position j entirely.
            Nij = AF.Nij[residue_i, residue_j]
            q_i_a = AF.single_frequencies[residue_i, index_a]
            q_j_b = AF.single_frequencies[residue_j, index_b]

            enough_evidence = (Nij * q_i_a * q_j_b) > evidence_threshold
            residue_i = residue_i[enough_evidence]
            residue_j = residue_j[enough_evidence]

            if len(residue_i) == 0:
                continue

            ab_coupling = braw.x_pair[
                residue_i, residue_j, index_a,
                index_b].tolist()[:max_nr_couplings_per_protein]
            bin_couplings.extend(ab_coupling)

        for bin_name in bin_names:
            print("\nprotein {0} {1:<8} size: {2}".format(
                protein, bin_name,
                len(couplings_per_bin[bin_name]['couplings'])))

        # stop condition: all bins are full
        if all(
                len(bindict['couplings']) >= max_couplings_per_bin
                for bindict in couplings_per_bin.values()):
            break

    return couplings_per_bin
def main():
    """Plot the density of couplings for two amino acid pairs ``ab`` and ``cd``.

    Reads ten positional command line arguments, collects couplings from all
    ``*braw.gz`` files in ``braw_dir`` for residue pairs (upper triangle,
    i < j) that satisfy the Cb distance window and the ``Nij_threshold``, and
    passes them to ``plots.plot_pairwise_couplings_density``.  The output is an
    HTML file in ``plot_dir``.
    """

    ### Parse arguments
    # All arguments are required positionals.  argparse only honours
    # ``default`` on positionals declared with nargs='?' or '*', so the
    # previously given defaults (0, 8, 100) never took effect and are omitted.
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir",         type=str,   help="path to binary_raw_files")
    parser.add_argument("alignment_dir",    type=str,   help="path to alignment files")
    parser.add_argument("pdb_dir",          type=str,   help="path to pdb files")
    parser.add_argument("ab",               type=str,   help="ab in range(400)")
    parser.add_argument("cd",               type=str,   help="cd in range(400)")
    parser.add_argument("dist_lower",       type=int,   help="Lower Cbeta distance threshold")
    parser.add_argument("dist_upper",       type=int,   help="Upper Cbeta distance threshold")
    parser.add_argument("Nij_threshold",    type=int,   help="Minimum number of non-gapped sequences at positions i and j ")
    parser.add_argument("size",             type=int,   help="number of pairs ij")
    parser.add_argument("plot_dir",         type=str,   help="where to save the plot")

    args = parser.parse_args()

    braw_dir        = args.braw_dir
    pdb_dir         = args.pdb_dir
    alignment_dir   = args.alignment_dir
    ab              = args.ab
    cd              = args.cd
    dist_lower      = args.dist_lower
    dist_upper      = args.dist_upper
    Nij_threshold   = args.Nij_threshold
    size            = args.size
    plot_dir        = args.plot_dir

    braw_files = glob.glob(braw_dir + "/*braw.gz")

    couplings = {}
    couplings[ab] = []
    couplings[cd] = []
    for braw_file in braw_files:
        # stop as soon as the requested number of pairs has been reached
        if len(couplings[ab]) >= size:
            break

        if not os.path.exists(braw_file):
            print("Braw File " + str(braw_file) + " cannot be found. ")
            continue

        protein = os.path.basename(braw_file).split(".")[0]

        # run the cheap existence checks before parsing the braw file
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(alignment_file):
            print("Alignment File " + str(alignment_file) + " cannot be found. ")
            continue

        # PDB files are looked up without underscores in the protein name
        pdb_file = pdb_dir + "/" + protein.replace("_", "") + ".pdb"
        if not os.path.exists(pdb_file):
            print("PDB File " + str(pdb_file) + " cannot be found. ")
            continue

        print(protein)

        braw = raw.parse_msgpack(braw_file)
        L = braw.ncol

        indices_upper_tri = np.triu_indices(L, k=1)

        # filter pair indices that have specified Cb distances
        # (both bounds are exclusive)
        dist_upper_tri = pdb.distance_map(pdb_file, L)[indices_upper_tri]
        indices_dist_true = np.where((dist_upper_tri > dist_lower)
                                     & (dist_upper_tri < dist_upper))[0]

        # filter pair indices that have more than Nij_threshold ungapped
        # sequences; the counts are summed over the first 20 states of both
        # state axes (presumably the 20 amino acids without the gap state)
        alignment = io.read_alignment(alignment_file)
        weights = weighting.calculate_weights_simple(alignment, 0.8, True)
        pairwise_counts = counts.pair_counts(alignment, weights)
        Nij = pairwise_counts[:, :, :20, :20].sum(3).sum(2)
        indices_Nij_true = np.where(Nij[indices_upper_tri] > Nij_threshold)[0]

        # get pair indices that fulfill both requirements (sorted, so the
        # order of the collected couplings is deterministic)
        indices_merge = np.intersect1d(indices_dist_true, indices_Nij_true)

        # get couplings for filtered pairs; select the pairs only once
        braw_reshaped = braw.x_pair[:, :, :20, :20].reshape(L, L, 400)
        selected = braw_reshaped[indices_upper_tri][indices_merge]
        couplings[ab].extend(selected[:, io.AB_INDICES[ab]])
        couplings[cd].extend(selected[:, io.AB_INDICES[cd]])

        print("Nr of couplings: {0}".format(len(couplings[ab])))

    plot_file = plot_dir + "/pairwise_couplings_" + ab + "_" + cd + "_Nijthreshold" + str(Nij_threshold) + "_Cbdistance_" + str(dist_lower) + "_" + str(dist_upper) + ".html"
    title = "Couplings {0} vs  {1} <br> Nij threshold: {2},  {3} <= Cb_ij <= {4}".format(ab, cd, Nij_threshold, dist_lower, dist_upper)
    plots.plot_pairwise_couplings_density(couplings, title, plot_out=plot_file)
def main():
    """Plot the density of couplings for two amino acid pairs ``ab`` and ``cd``.

    Command line driver: parses ten positional arguments, gathers couplings
    from every ``*braw.gz`` file in ``braw_dir`` for upper-triangle residue
    pairs that lie inside the Cb distance window and exceed
    ``Nij_threshold``, and hands the result to
    ``plots.plot_pairwise_couplings_density``, which writes an HTML file into
    ``plot_dir``.
    """

    ### Parse arguments
    # Every argument is a required positional.  argparse ignores ``default``
    # for positionals unless nargs is '?' or '*', so the defaults that used to
    # be declared here (0, 8, 100) were dead and have been dropped.
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir", type=str, help="path to binary_raw_files")
    parser.add_argument("alignment_dir",
                        type=str,
                        help="path to alignment files")
    parser.add_argument("pdb_dir", type=str, help="path to pdb files")
    parser.add_argument("ab", type=str, help="ab in range(400)")
    parser.add_argument("cd", type=str, help="cd in range(400)")
    parser.add_argument("dist_lower",
                        type=int,
                        help="Lower Cbeta distance threshold")
    parser.add_argument("dist_upper",
                        type=int,
                        help="Upper Cbeta distance threshold")
    parser.add_argument(
        "Nij_threshold",
        type=int,
        help="Minimum number of non-gapped sequences at positions i and j ")
    parser.add_argument("size", type=int, help="number of pairs ij")
    parser.add_argument("plot_dir", type=str, help="where to save the plot")

    args = parser.parse_args()

    braw_dir = args.braw_dir
    pdb_dir = args.pdb_dir
    alignment_dir = args.alignment_dir
    ab = args.ab
    cd = args.cd
    dist_lower = args.dist_lower
    dist_upper = args.dist_upper
    Nij_threshold = args.Nij_threshold
    size = args.size
    plot_dir = args.plot_dir

    braw_files = glob.glob(braw_dir + "/*braw.gz")

    couplings = {}
    couplings[ab] = []
    couplings[cd] = []
    for braw_file in braw_files:
        # enough pairs collected: do not load another protein
        if len(couplings[ab]) >= size:
            break

        if not os.path.exists(braw_file):
            print("Braw File " + str(braw_file) + " cannot be found. ")
            continue

        protein = os.path.basename(braw_file).split(".")[0]

        # check the companion files first; parsing the braw file is the
        # expensive step and pointless if one of them is missing
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(alignment_file):
            print("Alignment File " + str(alignment_file) +
                  " cannot be found. ")
            continue

        # the PDB file name carries the protein name without underscores
        pdb_file = pdb_dir + "/" + protein.replace("_", "") + ".pdb"
        if not os.path.exists(pdb_file):
            print("PDB File " + str(pdb_file) + " cannot be found. ")
            continue

        print(protein)

        braw = raw.parse_msgpack(braw_file)
        L = braw.ncol

        indices_upper_tri = np.triu_indices(L, k=1)

        # filter pair indices that have specified Cb distances
        # (strict inequalities on both sides)
        dist_upper_tri = pdb.distance_map(pdb_file, L)[indices_upper_tri]
        indices_dist_true = np.where((dist_upper_tri > dist_lower)
                                     & (dist_upper_tri < dist_upper))[0]

        # filter pair indices that have more than Nij_threshold ungapped
        # sequences; counts are summed over the first 20 states of the last
        # two axes (presumably amino acids only, gap state excluded)
        alignment = io.read_alignment(alignment_file)
        weights = weighting.calculate_weights_simple(alignment, 0.8, True)
        pairwise_counts = counts.pair_counts(alignment, weights)
        Nij = pairwise_counts[:, :, :20, :20].sum(3).sum(2)
        indices_Nij_true = np.where(Nij[indices_upper_tri] > Nij_threshold)[0]

        # get pair indices that fulfill both requirements; intersect1d
        # returns them sorted, which keeps the output order reproducible
        indices_merge = np.intersect1d(indices_dist_true, indices_Nij_true)

        # get couplings for filtered pairs (pair selection is done once and
        # shared by both amino acid pairs)
        braw_reshaped = braw.x_pair[:, :, :20, :20].reshape(L, L, 400)
        selected = braw_reshaped[indices_upper_tri][indices_merge]
        couplings[ab].extend(selected[:, io.AB_INDICES[ab]])
        couplings[cd].extend(selected[:, io.AB_INDICES[cd]])

        print("Nr of couplings: {0}".format(len(couplings[ab])))

    plot_file = plot_dir + "/pairwise_couplings_" + ab + "_" + cd + "_Nijthreshold" + str(
        Nij_threshold) + "_Cbdistance_" + str(dist_lower) + "_" + str(
            dist_upper) + ".html"
    title = "Couplings {0} vs  {1} <br> Nij threshold: {2},  {3} <= Cb_ij <= {4}".format(
        ab, cd, Nij_threshold, dist_lower, dist_upper)
    plots.plot_pairwise_couplings_density(couplings, title, plot_out=plot_file)
def collect_data(braw_dir, alignment_dir, pdb_dir, ab):
    """Gather couplings for one amino acid pair, split by Cb-distance bin.

    Each ``*braw.gz`` file in ``braw_dir`` stands for one protein.  Proteins
    without a PDB file or alignment file, or with a diversity
    ``sqrt(N) / L`` below 0.3, are skipped.  For every other protein the
    residue pairs falling strictly inside a bin's distance bounds are kept if
    their evidence ``Nij * q_i(a) * q_j(b)`` exceeds 100; at most 500
    couplings per protein and bin are stored.  Iteration stops once every bin
    holds at least 10000 couplings.

    Args:
        braw_dir: directory with ``<protein>....braw.gz`` files.
        alignment_dir: directory with ``<protein>.filt.psc`` files.
        pdb_dir: directory with ``<protein>.pdb`` files.
        ab: amino acid pair in the form ``"a-b"``; ``ab[0]`` and ``ab[2]``
            are looked up in ``io.AMINO_INDICES``.

    Returns:
        dict: bin name -> ``{'couplings': list, 'lower': number,
        'upper': number}``.
    """

    #define distance bins (exclusive bounds; the bins overlap)
    # NOTE(review): unit is presumably Angstrom -- confirm with
    # pdb.distance_map.
    couplings_per_bin = {
        'bin1': {'couplings': [], 'lower': 0, 'upper': 8},
        'bin2': {'couplings': [], 'lower': 5, 'upper': 10},
        'bin3': {'couplings': [], 'lower': 8, 'upper': 12},
        'bin4': {'couplings': [], 'lower': 10, 'upper': 15},
        'bin5': {'couplings': [], 'lower': 20, 'upper': 50},
    }

    max_nr_couplings_per_protein = 500
    sequence_separation = 10
    evidence_threshold = 100
    max_couplings_per_bin = 10000
    a = ab[0]
    b = ab[2]

    # fixed visiting order of the bins (reverse alphabetical)
    bin_names = sorted(couplings_per_bin.keys(), reverse=True)

    # iterate over proteins
    braw_files = glob.glob(braw_dir + "/*braw.gz")
    for braw_file in braw_files:

        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(pdb_file))
            continue

        if not os.path.exists(braw_file):
            print("Braw file {0} does not exist. Skip this protein.".format(braw_file))
            continue

        if not os.path.exists(alignment_file):
            print("Alignment file {0} does not exist. Skip this protein.".format(alignment_file))
            continue

        # Meaning of the two trailing arguments is defined by
        # AlignmentFeatures (not visible in this file section).
        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < 0.3:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        #mask highly gapped positions; NaN fails both distance comparisons,
        #so masked rows and columns are never selected below
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        index_a = io.AMINO_INDICES[a]
        index_b = io.AMINO_INDICES[b]

        # iterate over pairs for bins
        for bin_name in bin_names:
            current_bin = couplings_per_bin[bin_name]

            if len(current_bin['couplings']) >= max_couplings_per_bin:
                continue

            residue_i, residue_j = np.where(
                (distance_map > current_bin['lower'])
                & (distance_map < current_bin['upper']))

            # Look up the pair count at (i, j).  Indexing with
            # [residue_i, residue_i] -- as this code did before -- returns the
            # diagonal N_ii and silently drops the dependence on position j.
            Nij = AF.Nij[residue_i, residue_j]
            q_i_a = AF.single_frequencies[residue_i, index_a]
            q_j_b = AF.single_frequencies[residue_j, index_b]

            keep = (Nij * q_i_a * q_j_b) > evidence_threshold
            residue_i = residue_i[keep]
            residue_j = residue_j[keep]

            if len(residue_i) == 0:
                continue

            ab_coupling = braw.x_pair[residue_i, residue_j, index_a, index_b].tolist()[:max_nr_couplings_per_protein]
            current_bin['couplings'].extend(ab_coupling)

        for bin_name in bin_names:
            print("\nprotein {0} {1:<8} size: {2}".format(
                protein, bin_name, len(couplings_per_bin[bin_name]['couplings'])))

        # stop condition: all bins are full
        if all(len(bindict['couplings']) >= max_couplings_per_bin for bindict in couplings_per_bin.values()):
            break

    return couplings_per_bin
def collect_data(braw_dirs, alignment_dir, pdb_dir, bin_size, ab):
    """Collect couplings per method and per Cb-distance bin.

    Every ``*psc`` alignment in ``alignment_dir`` stands for one protein.  A
    protein is used only if a ``<protein>.filt.braw.gz`` file exists for every
    method, its PDB file exists and its diversity ``sqrt(N) / L`` is at least
    0.3.  For each distance bin the matching residue pairs are shuffled, at
    most 500 of them are kept, and the corresponding values are appended to
    every method whose bin is not yet full.

    Args:
        braw_dirs: dict mapping a method name to the directory that holds its
            braw files.
        alignment_dir: directory containing the ``*psc`` alignment files.
        pdb_dir: directory containing ``<protein>.pdb`` files.
        bin_size: number of values after which a bin counts as full.
        ab: ``'all'`` to use the matrix returned by
            ``bu.compute_l2norm_from_brawfile(..., apc=True)``, otherwise an
            amino acid pair ``"a-b"`` whose characters 0 and 2 are looked up
            in ``io.AMINO_INDICES``.

    Returns:
        dict: method -> bin name -> list of collected values.
    """

    #define distance bins (consecutive edges; both bounds are exclusive)
    bins = [0, 5, 8, 12, 15, 20, np.inf]

    max_nr_couplings_per_protein = 500

    # list() keeps methods indexable on Python 3 as well
    methods = list(braw_dirs.keys())
    bin_names = [
        str(k + 1) + ": " + str(bins[k]) + "-" + str(bins[k + 1])
        for k in range(len(bins) - 1)
    ]
    couplings_per_bin = {}
    for method in methods:
        couplings_per_bin[method] = {}
        for bin_name in bin_names:
            couplings_per_bin[method][bin_name] = []

    # nothing to collect without at least one method
    if not methods:
        return couplings_per_bin

    # progress reporting and the stop condition look at the first method only
    reference_method = methods[0]

    # iterate over proteins
    psc_files = glob.glob(alignment_dir + "/*psc")
    for psc_file in psc_files:

        protein = os.path.basename(psc_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        # check if ALL braw files exist
        braw_files = {}
        for method in methods:
            braw_files[method] = braw_dirs[method] + "/" + protein + ".filt.braw.gz"

        if not all(os.path.exists(braw_files[method]) for method in methods):
            print("Skip this protein (braw files does not exist).")
            continue

        if not os.path.exists(pdb_file):
            print("Skip this protein (pdb file does not exist).")
            continue

        alignment = io.read_alignment(psc_file, format="psicov")

        # test diversity before computing the distance map
        diversity = np.sqrt(alignment.shape[0]) / alignment.shape[1]
        if diversity < 0.3:
            print("Skip this protein (low diversity = {0}).".format(diversity))
            continue

        distance_map = pdb.distance_map(pdb_file, alignment.shape[1])

        # read braw files
        braw = {}
        for method in methods:
            if ab == 'all':
                braw[method] = bu.compute_l2norm_from_brawfile(braw_files[method], apc=True)
            else:
                braw[method] = raw.parse_msgpack(braw_files[method])

        # mask highly gapped positions; NaN fails both comparisons below
        gaps = ali.compute_gaps_per_position(alignment)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # iterate over pairs for bins
        for bin_index, bin_name in enumerate(bin_names):
            cb_lower = bins[bin_index]
            cb_upper = bins[bin_index + 1]

            rows, cols = np.where((distance_map > cb_lower) & (distance_map < cb_upper))

            # An empty selection used to crash: zip(*[]) yields nothing, so
            # residue_indices[0] raised an IndexError.
            if len(rows) > 0:
                #shuffle indices to remove positioning bias; shuffling a list
                #of positions yields the same permutation as shuffling the
                #(i, j) tuples, and truncating first avoids indexing pairs
                #that would be thrown away anyway
                order = list(range(len(rows)))
                random.shuffle(order)
                order = order[:max_nr_couplings_per_protein]
                rows = rows[order]
                cols = cols[order]

                for method in methods:
                    if len(couplings_per_bin[method][bin_name]) < bin_size:
                        if ab == 'all':
                            ab_coupling = braw[method][rows, cols].tolist()
                        else:
                            ab_coupling = braw[method].x_pair[rows, cols, io.AMINO_INDICES[ab[0]], io.AMINO_INDICES[ab[2]]].tolist()

                        couplings_per_bin[method][bin_name].extend(ab_coupling)

            print("\nprotein {0} bin: {1:<8} size: {2}".format(
                protein, bin_name, len(couplings_per_bin[reference_method][bin_name])))

        # stop condition: all bins are full
        if all(len(v) >= bin_size for v in couplings_per_bin[reference_method].values()):
            break

    return couplings_per_bin
def collect_data(braw_dirs, alignment_dir, pdb_dir, bin_size, ab):
    """Gather values per method and Cb-distance bin over a set of proteins.

    Proteins are taken from the ``*psc`` files in ``alignment_dir``.  A
    protein contributes only when every method has a
    ``<protein>.filt.braw.gz`` file, the PDB file exists and the diversity
    ``sqrt(N) / L`` is at least 0.3.  Within each distance bin the selected
    residue pairs are shuffled and capped at 500 per protein before their
    values are added to every method whose bin still has room.

    Args:
        braw_dirs: dict of method name -> directory with that method's braw
            files.
        alignment_dir: directory with the ``*psc`` alignment files.
        pdb_dir: directory with ``<protein>.pdb`` files.
        bin_size: fill level at which a bin is considered full.
        ab: ``'all'`` selects the matrix from
            ``bu.compute_l2norm_from_brawfile(..., apc=True)``; any other
            value is read as ``"a-b"`` and ``ab[0]`` / ``ab[2]`` are looked up
            in ``io.AMINO_INDICES``.

    Returns:
        dict: method -> bin name -> list of collected values.
    """

    #define distance bins (neighbouring edges, both bounds exclusive)
    bins = [0, 5, 8, 12, 15, 20, np.inf]

    max_nr_couplings_per_protein = 500

    # materialise the keys so that methods[0] also works on Python 3
    methods = list(braw_dirs.keys())
    bin_names = [
        str(k + 1) + ": " + str(bins[k]) + "-" + str(bins[k + 1])
        for k in range(len(bins) - 1)
    ]
    couplings_per_bin = {}
    for method in methods:
        couplings_per_bin[method] = {}
        for bin_name in bin_names:
            couplings_per_bin[method][bin_name] = []

    # without any method there is nothing to fill
    if not methods:
        return couplings_per_bin

    # the first method drives progress output and the stop condition
    reference_method = methods[0]

    # iterate over proteins
    psc_files = glob.glob(alignment_dir + "/*psc")
    for psc_file in psc_files:

        protein = os.path.basename(psc_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        # check if ALL braw files exist
        braw_files = {}
        for method in methods:
            braw_files[
                method] = braw_dirs[method] + "/" + protein + ".filt.braw.gz"

        if not all(os.path.exists(braw_files[method]) for method in methods):
            print("Skip this protein (braw files does not exist).")
            continue

        if not os.path.exists(pdb_file):
            print("Skip this protein (pdb file does not exist).")
            continue

        alignment = io.read_alignment(psc_file, format="psicov")

        # reject low-diversity alignments before the distance map is built
        diversity = np.sqrt(alignment.shape[0]) / alignment.shape[1]
        if diversity < 0.3:
            print("Skip this protein (low diversity = {0}).".format(diversity))
            continue

        distance_map = pdb.distance_map(pdb_file, alignment.shape[1])

        # read braw files
        braw = {}
        for method in methods:
            if ab == 'all':
                braw[method] = bu.compute_l2norm_from_brawfile(
                    braw_files[method], apc=True)
            else:
                braw[method] = raw.parse_msgpack(braw_files[method])

        # mask highly gapped positions; NaN never passes the range test
        gaps = ali.compute_gaps_per_position(alignment)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # iterate over pairs for bins
        for bin_index, bin_name in enumerate(bin_names):
            cb_lower = bins[bin_index]
            cb_upper = bins[bin_index + 1]

            rows, cols = np.where((distance_map > cb_lower)
                                  & (distance_map < cb_upper))

            # Guard against bins without any pair: the former zip(*c) on an
            # empty list produced nothing to index and raised an IndexError.
            if len(rows) > 0:
                #shuffle indices to remove positioning bias; a shuffled list
                #of positions gives the same permutation as shuffling the
                #(i, j) tuples, and it is cut to the per-protein maximum
                #before any value is looked up
                order = list(range(len(rows)))
                random.shuffle(order)
                order = order[:max_nr_couplings_per_protein]
                rows = rows[order]
                cols = cols[order]

                for method in methods:
                    if len(couplings_per_bin[method][bin_name]) < bin_size:
                        if ab == 'all':
                            ab_coupling = braw[method][rows, cols].tolist()
                        else:
                            ab_coupling = braw[method].x_pair[
                                rows, cols,
                                io.AMINO_INDICES[ab[0]],
                                io.AMINO_INDICES[ab[2]]].tolist()

                        couplings_per_bin[method][bin_name].extend(ab_coupling)

            print("\nprotein {0} bin: {1:<8} size: {2}".format(
                protein, bin_name,
                len(couplings_per_bin[reference_method][bin_name])))

        # stop condition: all bins are full
        if all(
                len(v) >= bin_size
                for v in couplings_per_bin[reference_method].values()):
            break

    return couplings_per_bin
def collect_data(braw_dir, alignment_dir, pdb_dir, pairs, lower_cb_distance, upper_cb_distance):
    """Collect couplings for several amino acid pairs within one distance window.

    Each ``*braw.gz`` file in ``braw_dir`` is treated as one protein.
    Proteins without a PDB file or alignment file, or with a diversity
    ``sqrt(N) / L`` below 0.3, are skipped.  Residue pairs whose distance lies
    strictly between ``lower_cb_distance`` and ``upper_cb_distance`` are kept
    for an amino acid pair (a, b) when, per residue pair, the larger of
    ``Nij * q_i(a) * q_j(b)`` and ``Nij * q_ij(a, b)`` exceeds 100.  At most
    500 couplings per protein are stored for each amino acid pair, and
    iteration stops once every amino acid pair has at least 1000 couplings.

    Args:
        braw_dir: directory with ``<protein>....braw.gz`` files.
        alignment_dir: directory with ``<protein>.filt.psc`` files.
        pdb_dir: directory with ``<protein>.pdb`` files.
        pairs: iterable of amino acid pairs written as ``"a-b"``; characters
            0 and 2 are looked up in ``io.AMINO_INDICES``.
        lower_cb_distance: exclusive lower distance bound.
        upper_cb_distance: exclusive upper distance bound.

    Returns:
        dict mapping each entry of ``pairs`` to its list of couplings.
    """

    couplings_per_pair = {}
    for pair in pairs:
        couplings_per_pair[pair] = []

    max_nr_couplings_per_protein = 500
    sequence_separation = 8
    evidence_threshold = 100
    max_couplings_per_bin = 1000

    # iterate over proteins
    braw_files = glob.glob(braw_dir + "/*braw.gz")
    for braw_file in braw_files:

        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(pdb_file))
            continue

        if not os.path.exists(braw_file):
            print("Braw file {0} does not exist. Skip this protein.".format(braw_file))
            continue

        if not os.path.exists(alignment_file):
            print("Alignment file {0} does not exist. Skip this protein.".format(alignment_file))
            continue

        # Meaning of the two trailing arguments is defined by
        # AlignmentFeatures (not visible in this file section).
        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < 0.3:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        #mask highly gapped positions; NaN fails both distance comparisons
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # The distance selection is the same for every amino acid pair, so
        # it is computed once per protein.
        in_range_i, in_range_j = np.where((distance_map > lower_cb_distance) & (distance_map < upper_cb_distance))

        if len(in_range_i) > 0:
            # Pair counts at (i, j); the former [residue_i, residue_i] lookup
            # returned the diagonal N_ii instead.
            Nij = AF.Nij[in_range_i, in_range_j]

            # iterate over amino acid pairs
            for pair in pairs:

                if len(couplings_per_pair[pair]) >= max_couplings_per_bin:
                    continue

                index_a = io.AMINO_INDICES[pair[0]]
                index_b = io.AMINO_INDICES[pair[2]]

                q_i_a = AF.single_frequencies[in_range_i, index_a]
                q_j_b = AF.single_frequencies[in_range_j, index_b]
                q_ij_ab = AF.pairwise_frequencies[in_range_i, in_range_j, index_a, index_b]

                # Element-wise maximum: np.max([...]) reduced both arrays to a
                # single scalar, so the mask below was one bool for all
                # residue pairs instead of one decision per residue pair.
                evidence = np.maximum(Nij * q_i_a * q_j_b, Nij * q_ij_ab)

                keep = evidence > evidence_threshold
                residue_i = in_range_i[keep]
                residue_j = in_range_j[keep]

                if len(residue_i) == 0:
                    continue

                ab_coupling = braw.x_pair[residue_i, residue_j, index_a, index_b].tolist()[:max_nr_couplings_per_protein]
                couplings_per_pair[pair].extend(ab_coupling)

        # progress report (local name no longer shadows the builtin ``str``)
        summary = "\n\nprotein {0}".format(protein)
        for pair in sorted(couplings_per_pair.keys()):
            summary += "\n{0:<8} : {1}".format(pair, len(couplings_per_pair[pair]))
        print(summary)

        # stop condition: all amino acid pairs are full
        if all(len(couplings_per_pair[pair]) >= max_couplings_per_bin for pair in pairs):
            break

    return couplings_per_pair