def generate_coupling_decoy_set(self, size=1000):
    """Collect couplings of contact and non-contact pairs for plotting.

    Iterates over the proteins in ``self.training_data``; each entry is read
    through its 'residue_i', 'residue_j' and 'contact' keys. For every protein
    the file ``<self.braw_dir>/<protein>.filt.braw.gz`` is parsed and the
    flattened ``[:20, :20]`` coupling slice of up to 100 pairs labelled
    ``contact == 1`` and up to 100 pairs labelled ``contact == 0`` is
    gathered, until ``size`` couplings of both classes are available.

    Results are stored on the instance:

    * ``self.couplings_contacts``    -- at most ``size`` contact couplings
    * ``self.couplings_noncontacts`` -- at most ``size`` non-contact couplings
    * ``self.avg_lambda_pair``       -- mean ``lambda_pair`` over parsed files

    :param size: number of couplings to collect per class
    """
    max_pairs_per_protein = 100

    print("Get couplings for Plotting (size={0})...".format(size))
    # single-argument print() behaves identically on Python 2 and 3
    print(len(self.training_data))

    couplings = []
    non_couplings = []
    lambda_pair = []

    for protein in self.training_data:
        braw = raw.parse_msgpack(
            self.braw_dir + "/" + protein + ".filt.braw.gz")

        pairs = self.training_data[protein]
        residue_i = np.array(pairs['residue_i'])
        residue_j = np.array(pairs['residue_j'])
        contact = np.array(pairs['contact'])

        # the regularization settings live at one of two places in the meta
        # data, depending on the file that is read
        workflow = braw.meta['workflow'][0]
        if 'regularization' in workflow['parameters']:
            lambda_pair.append(
                workflow['parameters']['regularization']['lambda_pair'])
        else:
            lambda_pair.append(workflow['regularization']['lambda_pair'])

        # The original guard `len(indices_contact) > 0` measured a 2-element
        # list and was therefore always true; iterating over the (possibly
        # empty) index arrays makes the emptiness check implicit.
        if len(couplings) < size:
            is_contact = contact == 1
            for i, j in zip(residue_i[is_contact][:max_pairs_per_protein],
                            residue_j[is_contact][:max_pairs_per_protein]):
                couplings.append(braw.x_pair[i, j][:20, :20].flatten())

        if len(non_couplings) < size:
            is_non_contact = contact == 0
            for i, j in zip(residue_i[is_non_contact][:max_pairs_per_protein],
                            residue_j[is_non_contact][:max_pairs_per_protein]):
                non_couplings.append(braw.x_pair[i, j][:20, :20].flatten())

        # stop condition: both classes are full
        if len(non_couplings) >= size and len(couplings) >= size:
            break

    # a protein may push the lists past `size`, hence the truncation
    self.couplings_contacts = couplings[:size]
    self.couplings_noncontacts = non_couplings[:size]
    self.avg_lambda_pair = np.mean(lambda_pair)
def f_df_py_protein_parallel(protein, parameters_transformed_back, prec_wrt_L,
                             nr_components, fixed_parameters, status=True,
                             compute_gradients=True):
    """Compute the likelihood value and gradients for a single protein.

    :param protein: tuple ``(protein_name, protein_data)``; ``protein_data``
        is read through the keys 'braw_file_path', 'qijabfilename',
        'residue_i', 'residue_j' and 'contact'
    :param parameters_transformed_back: mapping of parameter names to values;
        must contain ``'prec_<k>'`` for every component ``k``. The mapping
        passed by the caller is NOT modified.
    :param prec_wrt_L: if true, every ``'prec_<k>'`` entry is multiplied by
        the protein length (``braw.ncol``) before use
    :param nr_components: number of ``'prec_<k>'`` components
    :param fixed_parameters: forwarded to
        ``LikelihoodProtein.set_parameters_parallel``
    :param status: print a progress message when true
    :param compute_gradients: forwarded to ``compute_f_df``
    :return: tuple ``(f_protein, grad_protein)``; according to the original
        author's note the gradients are ``{}`` when none are calculated
    """
    protein_name, protein_data = protein

    if status:
        print("Compute likelihood and gradients for {0}".format(protein_name))

    braw = raw.parse_msgpack(protein_data['braw_file_path'])
    Nij, qij = io.read_qij(protein_data['qijabfilename'], braw.ncol)
    L = braw.ncol

    precision_keys = ['prec_' + str(component)
                      for component in range(nr_components)]

    # prepare parameters
    if prec_wrt_L:
        # Work on a shallow copy: the original code overwrote the entries of
        # the caller's dict, so reusing the same dict for the next protein
        # multiplied the precision by L again.
        parameters_transformed_back = dict(parameters_transformed_back)
        for key in precision_keys:
            parameters_transformed_back[key] = list(
                np.array(parameters_transformed_back[key]) * L)

    covMatdiag = np.zeros((nr_components, 400))
    log_det_precMat = np.zeros(nr_components)
    for component, key in enumerate(precision_keys):
        covMatdiag[component] = 1.0 / np.array(parameters_transformed_back[key])
        log_det_precMat[component] = np.sum(
            np.log(parameters_transformed_back[key]))

    lik_protein = LikelihoodProtein(braw, Nij, qij)
    lik_protein.set_pairs(
        protein_data['residue_i'],
        protein_data['residue_j'],
        protein_data['contact']
    )
    lik_protein.set_parameters_parallel(
        parameters_transformed_back,
        covMatdiag,
        log_det_precMat,
        nr_components,
        fixed_parameters,
        prec_wrt_L
    )

    # compute f and gradients
    lik_protein.compute_f_df(compute_gradients=compute_gradients)

    f_protein = lik_protein.get_f()
    grad_protein = lik_protein.get_gradients()

    return f_protein, grad_protein
def generate_coupling_decoy_set(size, braw_dir, pdb_dir):
    """Build an evaluation set of contact and background couplings.

    Scans at most ``size`` files matching ``<braw_dir>/*braw*``. For every
    file that can be parsed, the residue pair indices returned by
    ``pdb.determine_residue_pair_indices`` for ``<pdb_dir>/<protein>.pdb``
    are used to gather the flattened ``[:20, :20]`` coupling slice of up to
    100 contact pairs and up to 100 non-contact pairs.

    :param size: number of couplings per class (also caps the number of
        braw files that are inspected)
    :param braw_dir: directory containing the binary raw files
    :param pdb_dir: directory containing the PDB files
    :return: dict with the keys 'contact' and 'bg', each holding the
        transposed array of at most ``size`` collected couplings
    """
    seqsep = 8
    non_contact_thr = 25
    contact_thr = 8
    max_pairs_per_protein = 100

    couplings = []
    non_couplings = []

    braw_files = glob.glob(braw_dir + "/*braw*")
    for braw_file in braw_files[:size]:
        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        try:
            braw = raw.parse_msgpack(braw_file)
        except Exception:
            # Unreadable files are skipped on purpose; `Exception` (instead
            # of a bare except) lets KeyboardInterrupt/SystemExit propagate.
            print("Problems reading {0}".format(braw_file))
            continue

        indices_contact, indices_non_contact = pdb.determine_residue_pair_indices(
            pdb_file, seqsep, non_contact_thr, contact_thr
        )

        if len(couplings) < size and len(indices_contact) > 0:
            for i, j in zip(indices_contact[0][:max_pairs_per_protein],
                            indices_contact[1][:max_pairs_per_protein]):
                couplings.append(braw.x_pair[i, j][:20, :20].flatten())

        if len(non_couplings) < size and len(indices_non_contact) > 0:
            for i, j in zip(indices_non_contact[0][:max_pairs_per_protein],
                            indices_non_contact[1][:max_pairs_per_protein]):
                non_couplings.append(braw.x_pair[i, j][:20, :20].flatten())

        # stop condition: both classes are full
        if len(non_couplings) >= size and len(couplings) >= size:
            break

    evaluation_set = {
        'contact': np.array(couplings[:size]).transpose(),
        'bg': np.array(non_couplings[:size]).transpose(),
    }

    return evaluation_set
def generate_coupling_decoy_set(self, size=1000):
    """Collect couplings of contact and non-contact pairs for plotting.

    Iterates over the proteins in ``self.training_data``; each entry is read
    through its 'residue_i', 'residue_j' and 'contact' keys. For every protein
    the file ``<self.braw_dir>/<protein>.filt.braw.gz`` is parsed and the
    flattened ``[:20, :20]`` coupling slice of up to 100 pairs labelled
    ``contact == 1`` and up to 100 pairs labelled ``contact == 0`` is
    gathered, until ``size`` couplings of both classes are available.

    Results are stored on the instance:

    * ``self.couplings_contacts``    -- at most ``size`` contact couplings
    * ``self.couplings_noncontacts`` -- at most ``size`` non-contact couplings
    * ``self.avg_lambda_pair``       -- mean ``lambda_pair`` over parsed files

    :param size: number of couplings to collect per class
    """
    max_pairs_per_protein = 100

    print("Get couplings for Plotting (size={0})...".format(size))
    # single-argument print() behaves identically on Python 2 and 3
    print(len(self.training_data))

    couplings = []
    non_couplings = []
    lambda_pair = []

    for protein in self.training_data:
        braw = raw.parse_msgpack(
            self.braw_dir + "/" + protein + ".filt.braw.gz")

        pairs = self.training_data[protein]
        residue_i = np.array(pairs['residue_i'])
        residue_j = np.array(pairs['residue_j'])
        contact = np.array(pairs['contact'])

        # the regularization settings live at one of two places in the meta
        # data, depending on the file that is read
        workflow = braw.meta['workflow'][0]
        if 'regularization' in workflow['parameters']:
            lambda_pair.append(
                workflow['parameters']['regularization']['lambda_pair'])
        else:
            lambda_pair.append(workflow['regularization']['lambda_pair'])

        # The original guard `len(indices_contact) > 0` measured a 2-element
        # list and was therefore always true; iterating over the (possibly
        # empty) index arrays makes the emptiness check implicit.
        if len(couplings) < size:
            is_contact = contact == 1
            for i, j in zip(residue_i[is_contact][:max_pairs_per_protein],
                            residue_j[is_contact][:max_pairs_per_protein]):
                couplings.append(braw.x_pair[i, j][:20, :20].flatten())

        if len(non_couplings) < size:
            is_non_contact = contact == 0
            for i, j in zip(residue_i[is_non_contact][:max_pairs_per_protein],
                            residue_j[is_non_contact][:max_pairs_per_protein]):
                non_couplings.append(braw.x_pair[i, j][:20, :20].flatten())

        # stop condition: both classes are full
        if len(non_couplings) >= size and len(couplings) >= size:
            break

    # a protein may push the lists past `size`, hence the truncation
    self.couplings_contacts = couplings[:size]
    self.couplings_noncontacts = non_couplings[:size]
    self.avg_lambda_pair = np.mean(lambda_pair)
def contact_likelihood_py(self, coupling_prior_parameter_file, braw_file, qij_file):
    """Evaluate the coupling prior for all residue pairs of one protein.

    The coupling prior parameters are read from
    ``coupling_prior_parameter_file`` (settings from the accompanying
    ``.settings`` file). All pairs in ``self.residues_i``/``self.residues_j``
    are then evaluated twice, once with every pair labelled 0 and once with
    every pair labelled 1. The negated pairwise values returned by
    ``LikelihoodProtein.get_f_pairwise`` are written to
    ``self.contact_log_likelihood_mat[label, i, j]``.

    Side effects: sets ``self.contact_likelihood_parameters``,
    ``self.contact_log_likelihood_mat`` and ``self.contact_likelihood_meta``.

    :param coupling_prior_parameter_file: path to the coupling prior parameters
    :param braw_file: path to the binary raw file of the protein
    :param qij_file: path to the qij file of the protein
    """
    # load parameters for coupling prior
    self.contact_likelihood_parameters = Parameters("")
    prior = self.contact_likelihood_parameters
    prior.read_parameters_metadata(coupling_prior_parameter_file + ".settings")
    prior.read_parameters(coupling_prior_parameter_file, transform=True)

    braw = raw.parse_msgpack(braw_file)
    Nij, qij = io.read_qij(qij_file, braw.ncol)

    # object computing the neg likelihood of the coupling prior for one protein
    lik_protein = LikelihoodProtein(braw, Nij, qij)
    lik_protein.set_parameters(prior)

    labels = [0, 1]
    nr_pairs = len(self.residues_i)
    self.contact_log_likelihood_mat = np.zeros((len(labels), self.L, self.L))

    for label in labels:
        # pretend that every pair carries the current label
        lik_protein.set_pairs(self.residues_i, self.residues_j, [label] * nr_pairs)
        lik_protein.compute_f_df(compute_gradients=False)

        pairwise_neg_log_lik = lik_protein.get_f_pairwise()
        self.contact_log_likelihood_mat[
            label, self.residues_i, self.residues_j
        ] = -np.array(pairwise_neg_log_lik)

    # collect meta data
    dataset_meta = {
        'braw_file': braw_file,
        'qij_file': qij_file,
        'sequence_separation': self.sequence_separation,
    }
    self.contact_likelihood_meta = {
        'dataset': dataset_meta,
        'parameters': prior.get_settings(),
    }
def f_df_py_protein_parallel(protein, parameters_transformed_back, prec_wrt_L,
                             nr_components, fixed_parameters, status=True,
                             compute_gradients=True):
    """Compute the likelihood value and gradients for a single protein.

    :param protein: tuple ``(protein_name, protein_data)``; ``protein_data``
        is read through the keys 'braw_file_path', 'qijabfilename',
        'residue_i', 'residue_j' and 'contact'
    :param parameters_transformed_back: mapping of parameter names to values;
        must contain ``'prec_<k>'`` for every component ``k``. The mapping
        passed by the caller is NOT modified.
    :param prec_wrt_L: if true, every ``'prec_<k>'`` entry is multiplied by
        the protein length (``braw.ncol``) before use
    :param nr_components: number of ``'prec_<k>'`` components
    :param fixed_parameters: forwarded to
        ``LikelihoodProtein.set_parameters_parallel``
    :param status: print a progress message when true
    :param compute_gradients: forwarded to ``compute_f_df``
    :return: tuple ``(f_protein, grad_protein)``; according to the original
        author's note the gradients are ``{}`` when none are calculated
    """
    protein_name, protein_data = protein

    if status:
        print("Compute likelihood and gradients for {0}".format(protein_name))

    braw = raw.parse_msgpack(protein_data['braw_file_path'])
    Nij, qij = io.read_qij(protein_data['qijabfilename'], braw.ncol)
    L = braw.ncol

    precision_keys = ['prec_' + str(component)
                      for component in range(nr_components)]

    # prepare parameters
    if prec_wrt_L:
        # Work on a shallow copy: the original code overwrote the entries of
        # the caller's dict, so reusing the same dict for the next protein
        # multiplied the precision by L again.
        parameters_transformed_back = dict(parameters_transformed_back)
        for key in precision_keys:
            parameters_transformed_back[key] = list(
                np.array(parameters_transformed_back[key]) * L)

    covMatdiag = np.zeros((nr_components, 400))
    log_det_precMat = np.zeros(nr_components)
    for component, key in enumerate(precision_keys):
        covMatdiag[component] = 1.0 / np.array(parameters_transformed_back[key])
        log_det_precMat[component] = np.sum(
            np.log(parameters_transformed_back[key]))

    lik_protein = LikelihoodProtein(braw, Nij, qij)
    lik_protein.set_pairs(
        protein_data['residue_i'],
        protein_data['residue_j'],
        protein_data['contact']
    )
    lik_protein.set_parameters_parallel(
        parameters_transformed_back,
        covMatdiag,
        log_det_precMat,
        nr_components,
        fixed_parameters,
        prec_wrt_L
    )

    # compute f and gradients
    lik_protein.compute_f_df(compute_gradients=compute_gradients)

    f_protein = lik_protein.get_f()
    grad_protein = lik_protein.get_gradients()

    return f_protein, grad_protein
def contact_likelihood_py(self, coupling_prior_parameter_file, braw_file, qij_file):
    """Evaluate the coupling prior for all residue pairs of one protein.

    The coupling prior parameters are read from
    ``coupling_prior_parameter_file`` (settings from the accompanying
    ``.settings`` file). All pairs in ``self.residues_i``/``self.residues_j``
    are then evaluated twice, once with every pair labelled 0 and once with
    every pair labelled 1. The negated pairwise values returned by
    ``LikelihoodProtein.get_f_pairwise`` are written to
    ``self.contact_log_likelihood_mat[label, i, j]``.

    Side effects: sets ``self.contact_likelihood_parameters``,
    ``self.contact_log_likelihood_mat`` and ``self.contact_likelihood_meta``.

    :param coupling_prior_parameter_file: path to the coupling prior parameters
    :param braw_file: path to the binary raw file of the protein
    :param qij_file: path to the qij file of the protein
    """
    # load parameters for coupling prior
    self.contact_likelihood_parameters = Parameters("")
    prior = self.contact_likelihood_parameters
    prior.read_parameters_metadata(coupling_prior_parameter_file + ".settings")
    prior.read_parameters(coupling_prior_parameter_file, transform=True)

    braw = raw.parse_msgpack(braw_file)
    Nij, qij = io.read_qij(qij_file, braw.ncol)

    # object computing the neg likelihood of the coupling prior for one protein
    lik_protein = LikelihoodProtein(braw, Nij, qij)
    lik_protein.set_parameters(prior)

    labels = [0, 1]
    nr_pairs = len(self.residues_i)
    self.contact_log_likelihood_mat = np.zeros((len(labels), self.L, self.L))

    for label in labels:
        # pretend that every pair carries the current label
        lik_protein.set_pairs(self.residues_i, self.residues_j, [label] * nr_pairs)
        lik_protein.compute_f_df(compute_gradients=False)

        pairwise_neg_log_lik = lik_protein.get_f_pairwise()
        self.contact_log_likelihood_mat[
            label, self.residues_i, self.residues_j
        ] = -np.array(pairwise_neg_log_lik)

    # collect meta data
    dataset_meta = {
        'braw_file': braw_file,
        'qij_file': qij_file,
        'sequence_separation': self.sequence_separation,
    }
    self.contact_likelihood_meta = {
        'dataset': dataset_meta,
        'parameters': prior.get_settings(),
    }
def main():
    """Plot coupling values of contacts against Neff, diversity and length.

    Command line arguments (all positional): ``braw_dir``, ``pdb_dir``,
    ``alignment_dir``, ``nr_couplings``, ``plot_out``, ``max_per_protein``.

    For every file matching ``<braw_dir>/*braw*`` the residue pairs closer
    than 8 in the PDB distance map are determined, pairs touching an
    alignment column with more than 20% gaps are dropped, and a random sample
    of at most ``max_per_protein`` pairs contributes its 400 coupling values.
    Collection stops once more than ``nr_couplings`` rows were gathered.
    Four HTML plots are written below ``plot_out``.

    :raises IOError: if ``braw_dir`` does not exist
    """
    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir", type=str, help="path to binary raw files")
    parser.add_argument("pdb_dir", type=str, help="path to pdb files")
    parser.add_argument("alignment_dir", type=str, help="path to alignment files")
    # NOTE: `default` has no effect on a required positional argument; it is
    # kept so that the command line interface stays exactly as it was.
    parser.add_argument("nr_couplings", type=int, default=10000, help="number of couplings")
    parser.add_argument("plot_out", type=str, help="path to plot file")
    parser.add_argument("max_per_protein", type=int, default=100, help="maximum numbr couplings per protein")

    args = parser.parse_args()

    # The hard-coded "debugging" paths that used to overwrite every parsed
    # argument were removed so that the command line is actually honoured.
    braw_dir = args.braw_dir
    pdb_dir = args.pdb_dir
    alignment_dir = args.alignment_dir
    nr_couplings = args.nr_couplings
    plot_out = args.plot_out
    max_per_protein = args.max_per_protein

    if not os.path.exists(braw_dir):
        raise IOError("Braw Path {0} does not exist.".format(braw_dir))

    # Frames are collected in a list and concatenated once: repeated
    # DataFrame.append is quadratic and was removed in pandas 2.0.
    frames = []
    nr_collected = 0

    for braw_file in glob.glob(braw_dir + "/*braw*"):
        if nr_collected > nr_couplings:
            break

        protein = os.path.basename(braw_file).split(".")[0]
        print(protein)

        # ------------- get couplings and metadata
        braw = raw.parse_msgpack(braw_file)
        msa_meta = braw.meta['workflow'][0]['parameters']['msafile']
        neff = msa_meta['neff']
        L = msa_meta['ncol']
        N = msa_meta['nrow']
        diversity = np.sqrt(N) / L

        # ------------- filter contacts
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        dist_matrix = pdb.distance_map(pdb_file)
        contact_map = dist_matrix < 8
        # upper triangle only, so that every pair is counted once
        contact_i, contact_j = np.where(np.triu(contact_map, k=1))

        # ------------- filter gap columns (value 0 is counted as a gap)
        psicov = io.read_alignment(alignment_dir + "/" + protein + ".filt.psc")
        gap_fraction = (np.asarray(psicov)[:, :L] == 0).sum(axis=0) / float(N)
        columns_with_many_gaps = np.where(gap_fraction > 0.2)[0]

        # drop pairs where either residue sits in a highly gapped column
        keep = ~(np.isin(contact_i, columns_with_many_gaps) |
                 np.isin(contact_j, columns_with_many_gaps))
        contact_i = contact_i[keep]
        contact_j = contact_j[keep]

        nr_contacts = len(contact_i)
        if nr_contacts == 0:
            continue

        random_sample = np.random.choice(
            nr_contacts, size=min(max_per_protein, nr_contacts), replace=False)

        couplings = braw.x_pair[
            contact_i[random_sample], contact_j[random_sample], :20, :20
        ].reshape(len(random_sample), 400)

        df = pd.DataFrame(couplings)
        df['L'] = L
        df['Neff'] = neff
        df['Diversity'] = diversity
        df['sum_wij'] = couplings.sum(1)
        df['ratio_0.2L_Neff'] = 0.2 * L / neff

        frames.append(df)
        nr_collected += len(df)
        print("nr of couplings: {0}".format(nr_collected))

    if frames:
        coupling_df = pd.concat(frames)
    else:
        # list(range(...)) keeps the expression valid on Python 3 as well
        coupling_df = pd.DataFrame(columns=list(range(400)) + ['Neff'])

    # (data frame column, file name tag)
    plot_specs = (
        ('Neff', 'neff'),
        ('Diversity', 'diversity'),
        ('L', 'L'),
        ('ratio_0.2L_Neff', 'ratio_0.2L_Neff'),
    )
    for column, tag in plot_specs:
        plot_file = plot_out + "/coupling_matrix_" + tag + "_" + str(nr_couplings) + ".html"
        plots.plot_coupling_vs_neff(coupling_df, column, plot_file)
def main():
    """Plot the entropy and/or count-statistic correction for one pair.

    Command line arguments: positional ``binary_raw_file``,
    ``alignment_file`` and ``plot_out``; options ``--residue_i`` and
    ``--residue_j`` (both needed), ``--entropy-correction`` and
    ``--count-statistic-correction``.

    For every requested correction ``bu.compute_correction_ij`` is evaluated
    and the result is drawn twice via ``plot_correction``: as "heatmap" and
    as "bubble" plot.

    :raises IOError: if the binary raw file does not exist
    """
    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("binary_raw_file", type=str, help="path to binary_raw_file")
    parser.add_argument("alignment_file", type=str, help="path to alignment_file")
    parser.add_argument("plot_out", type=str, help="path to plot file")
    # The attribute name has to be passed as `dest=`: handing it over as a
    # second positional string makes argparse raise ValueError, because an
    # option string and a positional name cannot be mixed.
    parser.add_argument("--residue_i", dest="residue_i", default=None, type=int, help="position of residue i")
    parser.add_argument("--residue_j", dest="residue_j", default=None, type=int, help="position of residue j")
    parser.add_argument("--entropy-correction", dest="entropy_correction", default=False, action="store_true", help="plot entropy correction")
    parser.add_argument("--count-statistic-correction", dest="count_statistic_correction", default=False, action="store_true", help="plot coutn stat correction")

    args = parser.parse_args()

    # The hard-coded "debugging" values that used to overwrite every parsed
    # argument were removed so that the command line is actually honoured.
    binary_raw_file = args.binary_raw_file
    alignment_file = args.alignment_file
    plot_out = args.plot_out
    residue_i = args.residue_i
    residue_j = args.residue_j

    # both positions are used as array indices further down
    if residue_i is None or residue_j is None:
        parser.error("--residue_i and --residue_j are both required")

    if not os.path.exists(binary_raw_file):
        raise IOError("Braw file " + str(binary_raw_file) + " cannot be found. ")

    # get couplings and lambda_w
    braw = raw.parse_msgpack(binary_raw_file)
    alignment = io.read_alignment(alignment_file)
    protein = os.path.basename(binary_raw_file).split(".")[0]

    # the regularization settings live at one of two places in the meta data
    workflow = braw.meta['workflow'][0]
    if "regularization" in workflow:
        lambda_w = workflow['regularization']['lambda_pair']
    else:
        lambda_w = workflow['parameters']['regularization']['lambda_pair']

    # read amino acid frequencies with pseudo-counts
    neff = au.compute_neff(alignment)
    single_freq, _pair_freq = au.calculate_frequencies(alignment, au.uniform_pseudocounts)

    # (correction_type label, value of the `entropy` flag)
    requested = []
    if args.entropy_correction:
        requested.append(("entropy", True))
    if args.count_statistic_correction:
        requested.append(("count-statistic", False))

    for correction_type, use_entropy in requested:
        ui, correction, eta = bu.compute_correction_ij(
            single_freq, neff, lambda_w, braw.x_pair, residue_i, residue_j,
            entropy=use_entropy, squared=True)

        for plot_type in ("heatmap", "bubble"):
            # residue positions are shifted by one to index into `ui`
            plot_correction(
                correction, ui[residue_i - 1, :], ui[residue_j - 1, :],
                protein, residue_i, residue_j, eta, plot_out,
                correction_type=correction_type, plot_type=plot_type)
def main():
    """Plot coupling values of contacts against Neff, diversity and length.

    Command line arguments (all positional): ``braw_dir``, ``pdb_dir``,
    ``alignment_dir``, ``nr_couplings``, ``plot_out``, ``max_per_protein``.

    For every file matching ``<braw_dir>/*braw*`` the residue pairs closer
    than 8 in the PDB distance map are determined, pairs touching an
    alignment column with more than 20% gaps are dropped, and a random sample
    of at most ``max_per_protein`` pairs contributes its 400 coupling values.
    Collection stops once more than ``nr_couplings`` rows were gathered.
    Four HTML plots are written below ``plot_out``.

    :raises IOError: if ``braw_dir`` does not exist
    """
    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir", type=str, help="path to binary raw files")
    parser.add_argument("pdb_dir", type=str, help="path to pdb files")
    parser.add_argument("alignment_dir", type=str, help="path to alignment files")
    # NOTE: `default` has no effect on a required positional argument; it is
    # kept so that the command line interface stays exactly as it was.
    parser.add_argument("nr_couplings", type=int, default=10000, help="number of couplings")
    parser.add_argument("plot_out", type=str, help="path to plot file")
    parser.add_argument("max_per_protein", type=int, default=100, help="maximum numbr couplings per protein")

    args = parser.parse_args()

    # The hard-coded "debugging" paths that used to overwrite every parsed
    # argument were removed so that the command line is actually honoured.
    braw_dir = args.braw_dir
    pdb_dir = args.pdb_dir
    alignment_dir = args.alignment_dir
    nr_couplings = args.nr_couplings
    plot_out = args.plot_out
    max_per_protein = args.max_per_protein

    if not os.path.exists(braw_dir):
        raise IOError("Braw Path {0} does not exist.".format(braw_dir))

    # Frames are collected in a list and concatenated once: repeated
    # DataFrame.append is quadratic and was removed in pandas 2.0.
    frames = []
    nr_collected = 0

    for braw_file in glob.glob(braw_dir + "/*braw*"):
        if nr_collected > nr_couplings:
            break

        protein = os.path.basename(braw_file).split(".")[0]
        print(protein)

        # ------------- get couplings and metadata
        braw = raw.parse_msgpack(braw_file)
        msa_meta = braw.meta['workflow'][0]['parameters']['msafile']
        neff = msa_meta['neff']
        L = msa_meta['ncol']
        N = msa_meta['nrow']
        diversity = np.sqrt(N) / L

        # ------------- filter contacts
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        dist_matrix = pdb.distance_map(pdb_file)
        contact_map = dist_matrix < 8
        # upper triangle only, so that every pair is counted once
        contact_i, contact_j = np.where(np.triu(contact_map, k=1))

        # ------------- filter gap columns (value 0 is counted as a gap)
        psicov = io.read_alignment(alignment_dir + "/" + protein + ".filt.psc")
        gap_fraction = (np.asarray(psicov)[:, :L] == 0).sum(axis=0) / float(N)
        columns_with_many_gaps = np.where(gap_fraction > 0.2)[0]

        # drop pairs where either residue sits in a highly gapped column
        keep = ~(np.isin(contact_i, columns_with_many_gaps) |
                 np.isin(contact_j, columns_with_many_gaps))
        contact_i = contact_i[keep]
        contact_j = contact_j[keep]

        nr_contacts = len(contact_i)
        if nr_contacts == 0:
            continue

        random_sample = np.random.choice(
            nr_contacts, size=min(max_per_protein, nr_contacts), replace=False)

        couplings = braw.x_pair[
            contact_i[random_sample], contact_j[random_sample], :20, :20
        ].reshape(len(random_sample), 400)

        df = pd.DataFrame(couplings)
        df['L'] = L
        df['Neff'] = neff
        df['Diversity'] = diversity
        df['sum_wij'] = couplings.sum(1)
        df['ratio_0.2L_Neff'] = 0.2 * L / neff

        frames.append(df)
        nr_collected += len(df)
        print("nr of couplings: {0}".format(nr_collected))

    if frames:
        coupling_df = pd.concat(frames)
    else:
        # list(range(...)) keeps the expression valid on Python 3 as well
        coupling_df = pd.DataFrame(columns=list(range(400)) + ['Neff'])

    # (data frame column, file name tag)
    plot_specs = (
        ('Neff', 'neff'),
        ('Diversity', 'diversity'),
        ('L', 'L'),
        ('ratio_0.2L_Neff', 'ratio_0.2L_Neff'),
    )
    for column, tag in plot_specs:
        plot_file = plot_out + "/coupling_matrix_" + tag + "_" + str(nr_couplings) + ".html"
        plots.plot_coupling_vs_neff(coupling_df, column, plot_file)
def collect_data(braw_dir, alignment_dir, pdb_dir, ab, cd, cb_lower, cb_upper):
    """Gather couplings for the amino acid pairs ``ab`` and ``cd``.

    Proteins are taken from ``<braw_dir>/*braw.gz``. A protein is skipped if
    one of its input files is missing or if its diversity (sqrt(N)/L) is
    below 0.3. Residue pairs are restricted to distances strictly between
    ``cb_lower`` and ``cb_upper`` after masking positions with more than 30%
    gaps, and both ``Nij * q_i * q_j`` evidence values have to exceed 80.
    At most 500 couplings per protein are kept; collection stops once 5000
    couplings were gathered for ``ab``.

    :param braw_dir: directory with binary raw files
    :param alignment_dir: directory with ``<protein>.filt.psc`` alignments
    :param pdb_dir: directory with ``<protein>.pdb`` files
    :param ab: amino acid pair; characters 0 and 2 are used (e.g. 'R-E')
    :param cd: second amino acid pair, same format as ``ab``
    :param cb_lower: lower distance bound (exclusive)
    :param cb_upper: upper distance bound (exclusive)
    :return: dict mapping ``ab`` and ``cd`` to lists of couplings
    """
    couplings = {ab: [], cd: []}

    max_nr_couplings_per_protein = 500
    sequence_separation = 10
    evidence_threshold = 80
    max_nr_couplings = 5000
    diversity_thr = 0.3

    a, b = ab[0], ab[2]
    c, d = cd[0], cd[2]

    # iterate over proteins
    for braw_file in glob.glob(braw_dir + "/*braw.gz"):
        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        # report only the first missing input and skip the protein
        required_files = (
            (pdb_file, "PDB file {0} does not exist. Skip this protein."),
            (braw_file, "Braw file {0} does not exist. Skip this protein."),
            (alignment_file, "Alignment file {0} does not exist. Skip this protein."),
        )
        missing_message = None
        for path, message in required_files:
            if not os.path.exists(path):
                missing_message = message.format(path)
                break
        if missing_message is not None:
            print(missing_message)
            continue

        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < diversity_thr:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        # mask highly gapped positions
        gap_fraction = 1 - (AF.Ni / AF.neff)
        gapped_positions = np.where(np.array(gap_fraction) > 0.3)[0]
        distance_map[gapped_positions, :] = np.nan
        distance_map[:, gapped_positions] = np.nan

        # residue pairs within the requested distance range
        in_range = (distance_map > cb_lower) & (distance_map < cb_upper)
        residue_i, residue_j = np.where(in_range)

        Nij = AF.Nij[residue_i, residue_j]
        evidence_ab = (Nij
                       * AF.single_frequencies[residue_i, io.AMINO_INDICES[a]]
                       * AF.single_frequencies[residue_j, io.AMINO_INDICES[b]])
        evidence_cd = (Nij
                       * AF.single_frequencies[residue_i, io.AMINO_INDICES[c]]
                       * AF.single_frequencies[residue_j, io.AMINO_INDICES[d]])

        enough_evidence = ((evidence_ab > evidence_threshold) &
                           (evidence_cd > evidence_threshold))
        residue_i = residue_i[enough_evidence]
        residue_j = residue_j[enough_evidence]

        if len(residue_i) == 0:
            continue

        for pair_key, first, second in ((ab, a, b), (cd, c, d)):
            values = braw.x_pair[
                residue_i, residue_j,
                io.AMINO_INDICES[first], io.AMINO_INDICES[second]
            ].tolist()
            couplings[pair_key].extend(values[:max_nr_couplings_per_protein])

        print("\nprotein {0} size: {1}".format(protein, len(couplings[ab])))

        # stop condition: enough couplings collected
        if len(couplings[ab]) >= max_nr_couplings:
            break

    return couplings
def collect_data(braw_dir, alignment_dir, pdb_dir, ab):
    """Gather couplings of amino acid pair ``ab`` for several distance bins.

    Proteins are taken from ``<braw_dir>/*braw.gz``. A protein is skipped if
    one of its input files is missing or if its diversity (sqrt(N)/L) is
    below 0.3. For every bin that is not yet full, the residue pairs with a
    distance strictly between the bin bounds (positions with more than 30%
    gaps are masked) and an evidence ``Nij * q_i(a) * q_j(b)`` above 100
    contribute at most 500 couplings per protein. Collection stops when every
    bin holds at least 10000 couplings.

    :param braw_dir: directory with binary raw files
    :param alignment_dir: directory with ``<protein>.filt.psc`` alignments
    :param pdb_dir: directory with ``<protein>.pdb`` files
    :param ab: amino acid pair; characters 0 and 2 are used (e.g. 'R-E')
    :return: dict mapping bin names to dicts with the keys 'couplings',
        'lower' and 'upper'
    """
    # define distance bins
    couplings_per_bin = {
        'bin1': {'couplings': [], 'lower': 0, 'upper': 8},
        'bin2': {'couplings': [], 'lower': 5, 'upper': 10},
        'bin3': {'couplings': [], 'lower': 8, 'upper': 12},
        'bin4': {'couplings': [], 'lower': 10, 'upper': 15},
        'bin5': {'couplings': [], 'lower': 20, 'upper': 50},
    }

    max_nr_couplings_per_protein = 500
    sequence_separation = 10
    evidence_threshold = 100
    max_couplings_per_bin = 10000

    a = ab[0]
    b = ab[2]

    bin_names = sorted(couplings_per_bin.keys(), reverse=True)

    # iterate over proteins
    braw_files = glob.glob(braw_dir + "/*braw.gz")
    for braw_file in braw_files:
        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(pdb_file))
            continue

        if not os.path.exists(braw_file):
            print("Braw file {0} does not exist. Skip this protein.".format(braw_file))
            continue

        if not os.path.exists(alignment_file):
            print("Alignment file {0} does not exist. Skip this protein.".format(alignment_file))
            continue

        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < 0.3:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        # mask highly gapped positions
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # iterate over pairs for bins
        for bin_name in bin_names:
            bin_data = couplings_per_bin[bin_name]
            if len(bin_data['couplings']) >= max_couplings_per_bin:
                continue

            cb_lower = bin_data['lower']
            cb_upper = bin_data['upper']

            residue_i, residue_j = np.where(
                (distance_map > cb_lower) & (distance_map < cb_upper))

            # Pair counts belong to (i, j); the previous `[residue_i,
            # residue_i]` read the diagonal instead of the pair entry.
            Nij = AF.Nij[residue_i, residue_j]
            q_i_a = AF.single_frequencies[residue_i, io.AMINO_INDICES[a]]
            q_j_b = AF.single_frequencies[residue_j, io.AMINO_INDICES[b]]

            evidence = Nij * q_i_a * q_j_b
            enough_evidence = evidence > evidence_threshold
            residue_i = residue_i[enough_evidence]
            residue_j = residue_j[enough_evidence]

            if len(residue_i) == 0:
                continue

            ab_coupling = braw.x_pair[
                residue_i, residue_j, io.AMINO_INDICES[a], io.AMINO_INDICES[b]
            ].tolist()[:max_nr_couplings_per_protein]
            bin_data['couplings'].extend(ab_coupling)

        for bin_name in bin_names:
            print("\nprotein {0} {1:<8} size: {2}".format(
                protein, bin_name, len(couplings_per_bin[bin_name]['couplings'])))

        # stop condition: all bins are full
        if all(len(bindict['couplings']) >= max_couplings_per_bin
               for bindict in couplings_per_bin.values()):
            break

    return couplings_per_bin
def main():
    """Plot the density of couplings for amino acid pair ``ab`` versus ``cd``.

    Command line arguments (all positional): ``braw_dir``, ``alignment_dir``,
    ``pdb_dir``, ``ab``, ``cd``, ``dist_lower``, ``dist_upper``,
    ``Nij_threshold``, ``size``, ``plot_dir``.

    For every ``<braw_dir>/*braw.gz`` file with a matching alignment and PDB
    file, the residue pairs (upper triangle) whose distance lies strictly
    between ``dist_lower`` and ``dist_upper`` and whose pair count over the
    first 20 states exceeds ``Nij_threshold`` contribute their ``ab`` and
    ``cd`` couplings. Collection stops once more than ``size`` couplings were
    gathered; the result is written as an HTML plot into ``plot_dir``.
    """
    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir", type=str, help="path to binary_raw_files")
    parser.add_argument("alignment_dir", type=str, help="path to alignment files")
    parser.add_argument("pdb_dir", type=str, help="path to pdb files")
    parser.add_argument("ab", type=str, help="ab in range(400)")
    parser.add_argument("cd", type=str, help="cd in range(400)")
    # NOTE: `default` has no effect on required positional arguments; the
    # values are kept so that the command line interface stays as it was.
    parser.add_argument("dist_lower", type=int, default=0, help="Lower Cbeta distance threshold")
    parser.add_argument("dist_upper", type=int, default=8, help="Upper Cbeta distance threshold")
    parser.add_argument("Nij_threshold", type=int, default=100, help="Minimum number of non-gapped sequences at positions i and j ")
    parser.add_argument("size", type=int, help="number of pairs ij")
    parser.add_argument("plot_dir", type=str, help="where to save the plot")

    args = parser.parse_args()

    braw_dir = args.braw_dir
    pdb_dir = args.pdb_dir
    alignment_dir = args.alignment_dir
    ab = args.ab
    cd = args.cd
    dist_lower = args.dist_lower
    dist_upper = args.dist_upper
    Nij_threshold = args.Nij_threshold
    size = args.size
    plot_dir = args.plot_dir

    braw_files = glob.glob(braw_dir + "/*braw.gz")

    couplings = {}
    couplings[ab] = []
    couplings[cd] = []

    for braw_file in braw_files:
        if len(couplings[ab]) > size:
            break

        if not os.path.exists(braw_file):
            # a blank was missing between the file name and "cannot"
            print("Braw File " + str(braw_file) + " cannot be found. ")
            continue

        braw = raw.parse_msgpack(braw_file)
        L = braw.ncol

        protein = os.path.basename(braw_file).split(".")[0]

        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(alignment_file):
            print("Alignment File " + str(alignment_file) + " cannot be found. ")
            continue

        pdb_file = pdb_dir + "/" + protein.replace("_", "") + ".pdb"
        if not os.path.exists(pdb_file):
            print("PDB File " + str(pdb_file) + " cannot be found. ")
            continue

        print(protein)

        indices_upper_tri = np.triu_indices(L, k=1)

        # filter pair indices that have specified Cb distances
        dist_matrix = pdb.distance_map(pdb_file, L)
        upper_tri_distances = dist_matrix[indices_upper_tri]
        indices_dist_true = np.where(
            (upper_tri_distances > dist_lower) &
            (upper_tri_distances < dist_upper))[0]

        # filter pair indices that have more than Nij_threshold ungapped sequences
        alignment = io.read_alignment(alignment_file)
        weights = weighting.calculate_weights_simple(alignment, 0.8, True)
        pairwise_counts = counts.pair_counts(alignment, weights)
        Nij = pairwise_counts[:, :, :20, :20].sum(3).sum(2)
        indices_Nij_true = np.where(Nij[indices_upper_tri] > Nij_threshold)[0]

        # pair indices that fulfil both requirements; intersect1d returns
        # them sorted, so the order of the couplings is deterministic
        indices_merge = np.intersect1d(indices_dist_true, indices_Nij_true)

        # get couplings for filtered pairs (selection is done only once)
        braw_reshaped = braw.x_pair[:, :, :20, :20].reshape(L, L, 400)
        selected_couplings = braw_reshaped[indices_upper_tri][indices_merge]
        couplings[ab].extend(selected_couplings[:, io.AB_INDICES[ab]])
        couplings[cd].extend(selected_couplings[:, io.AB_INDICES[cd]])

        print("Nr of couplings: {0}".format(len(couplings[ab])))

    plot_file = (plot_dir + "/pairwise_couplings_" + ab + "_" + cd +
                 "_Nijthreshold" + str(Nij_threshold) +
                 "_Cbdistance_" + str(dist_lower) + "_" + str(dist_upper) +
                 ".html")
    title = "Couplings {0} vs {1} <br> Nij threshold: {2}, {3} <= Cb_ij <= {4}".format(
        ab, cd, Nij_threshold, dist_lower, dist_upper)
    plots.plot_pairwise_couplings_density(couplings, title, plot_out=plot_file)
def main():
    """Plot the densities of two coupling types (``ab`` vs ``cd``).

    Command-line entry point. For every ``*braw.gz`` file in ``braw_dir`` the
    matching alignment (``<protein>.filt.psc`` in ``alignment_dir``) and PDB
    file (``<protein without "_">.pdb`` in ``pdb_dir``) are looked up; proteins
    with a missing file are skipped with a message.

    A residue pair i<j is kept when
      * its entry in ``pdb.distance_map`` lies strictly between ``dist_lower``
        and ``dist_upper``, and
      * its pair count summed over the first 20 states is larger than
        ``Nij_threshold``.

    The ``ab`` and ``cd`` couplings of all kept pairs are collected (collection
    stops once more than ``size`` values were gathered) and handed to
    ``plots.plot_pairwise_couplings_density`` together with an ``.html`` output
    path inside ``plot_dir``.

    ``ab`` and ``cd`` are used as keys of ``io.AB_INDICES``; the values used
    for debugging in the original script were ``'R-E'`` and ``'E-R'``.
    """

    ### Parse arguments
    # NOTE: all arguments are required positionals. argparse only applies
    # `default` to positionals declared with nargs='?' or '*', so the defaults
    # given below are never used.
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir", type=str, help="path to binary_raw_files")
    parser.add_argument(
        "alignment_dir", type=str, help="path to alignment files")
    parser.add_argument("pdb_dir", type=str, help="path to pdb files")
    parser.add_argument("ab", type=str, help="ab in range(400)")
    parser.add_argument("cd", type=str, help="cd in range(400)")
    parser.add_argument(
        "dist_lower", type=int, default=0,
        help="Lower Cbeta distance threshold")
    parser.add_argument(
        "dist_upper", type=int, default=8,
        help="Upper Cbeta distance threshold")
    parser.add_argument(
        "Nij_threshold", type=int, default=100,
        help="Minimum number of non-gapped sequences at positions i and j ")
    parser.add_argument("size", type=int, help="number of pairs ij")
    parser.add_argument("plot_dir", type=str, help="where to save the plot")

    args = parser.parse_args()

    braw_dir = args.braw_dir
    pdb_dir = args.pdb_dir
    alignment_dir = args.alignment_dir
    ab = args.ab
    cd = args.cd
    dist_lower = args.dist_lower
    dist_upper = args.dist_upper
    Nij_threshold = args.Nij_threshold
    size = args.size
    plot_dir = args.plot_dir

    braw_files = glob.glob(braw_dir + "/*braw.gz")

    couplings = {ab: [], cd: []}
    for braw_file in braw_files:
        # stop as soon as enough couplings have been gathered
        if len(couplings[ab]) > size:
            break

        if not os.path.exists(braw_file):
            print("Braw File " + str(braw_file) + " cannot be found. ")
            continue

        protein = os.path.basename(braw_file).split(".")[0]

        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(alignment_file):
            print("Alignment File " + str(alignment_file) +
                  " cannot be found. ")
            continue

        pdb_file = pdb_dir + "/" + protein.replace("_", "") + ".pdb"
        if not os.path.exists(pdb_file):
            print("PDB File " + str(pdb_file) + " cannot be found. ")
            continue

        # parse the braw file only after all required inputs are known to exist
        braw = raw.parse_msgpack(braw_file)
        L = braw.ncol

        print(protein)

        indices_upper_tri = np.triu_indices(L, k=1)

        # filter pair indices that have specified Cb distances
        dist_matrix = pdb.distance_map(pdb_file, L)
        dist_upper_tri = dist_matrix[indices_upper_tri]
        indices_dist_true = np.where(
            (dist_upper_tri > dist_lower) & (dist_upper_tri < dist_upper))[0]

        # filter pair indices that have more than Nij_threshold ungapped sequences
        alignment = io.read_alignment(alignment_file)
        weights = weighting.calculate_weights_simple(alignment, 0.8, True)
        pairwise_counts = counts.pair_counts(alignment, weights)
        Nij = pairwise_counts[:, :, :20, :20].sum(3).sum(2)
        indices_Nij_true = np.where(Nij[indices_upper_tri] > Nij_threshold)[0]

        # get pair indices that fulfill both requirements
        indices_merge = list(
            set(indices_dist_true).intersection(indices_Nij_true))

        # get couplings for filtered pairs; the same selection (and therefore
        # the same pair order) is used for ab and cd
        braw_reshaped = braw.x_pair[:, :, :20, :20].reshape(L, L, 400)
        selected_pairs = braw_reshaped[indices_upper_tri][indices_merge]
        couplings[ab].extend(selected_pairs[:, io.AB_INDICES[ab]])
        couplings[cd].extend(selected_pairs[:, io.AB_INDICES[cd]])

        print("Nr of couplings: {0}".format(len(couplings[ab])))

    plot_file = (plot_dir + "/pairwise_couplings_" + ab + "_" + cd
                 + "_Nijthreshold" + str(Nij_threshold)
                 + "_Cbdistance_" + str(dist_lower) + "_" + str(dist_upper)
                 + ".html")
    title = "Couplings {0} vs {1} <br> Nij threshold: {2}, {3} <= Cb_ij <= {4}".format(
        ab, cd, Nij_threshold, dist_lower, dist_upper)
    plots.plot_pairwise_couplings_density(couplings, title, plot_out=plot_file)
def collect_data(braw_dir, alignment_dir, pdb_dir, ab):
    """Collect couplings of one amino acid pair, grouped by distance bin.

    Iterates over all ``*braw.gz`` files in ``braw_dir``. A protein is skipped
    when its PDB file (``<protein>.pdb``), braw file or alignment file
    (``<protein>.filt.psc``) is missing, or when ``sqrt(AF.N) / AF.L`` is
    below 0.3. Positions whose gap fraction ``1 - Ni / neff`` exceeds 0.3 are
    masked out of the distance map.

    For every bin that is not yet full, residue pairs whose distance lies
    strictly between the bin's ``lower`` and ``upper`` bound and whose
    evidence ``Nij * q_i(a) * q_j(b)`` exceeds 100 contribute their ``a``-``b``
    coupling; at most 500 couplings are taken per protein and bin. A bin is
    only checked for fullness before it is extended, so it may end up holding
    somewhat more than 10000 values.

    :param braw_dir: directory containing ``*braw.gz`` files
    :param alignment_dir: directory containing ``<protein>.filt.psc`` files
    :param pdb_dir: directory containing ``<protein>.pdb`` files
    :param ab: amino acid pair; characters 0 and 2 are looked up in
        ``io.AMINO_INDICES`` (e.g. ``"R-E"``)
    :return: dict mapping bin name to a dict with keys ``'couplings'`` (list),
        ``'lower'`` and ``'upper'``
    """

    # define distance bins (bounds are exclusive; note that the bins overlap)
    couplings_per_bin = {
        'bin1': {'couplings': [], 'lower': 0, 'upper': 8},
        'bin2': {'couplings': [], 'lower': 5, 'upper': 10},
        'bin3': {'couplings': [], 'lower': 8, 'upper': 12},
        'bin4': {'couplings': [], 'lower': 10, 'upper': 15},
        'bin5': {'couplings': [], 'lower': 20, 'upper': 50},
    }

    max_nr_couplings_per_protein = 500
    sequence_separation = 10
    evidence_threshold = 100
    max_couplings_per_bin = 10000

    a = ab[0]
    b = ab[2]

    # iterate over proteins
    braw_files = glob.glob(braw_dir + "/*braw.gz")
    for braw_file in braw_files:
        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(pdb_file))
            continue

        if not os.path.exists(braw_file):
            print("Braw file {0} does not exist. Skip this protein.".format(braw_file))
            continue

        if not os.path.exists(alignment_file):
            print("Alignment file {0} does not exist. Skip this protein.".format(alignment_file))
            continue

        # NOTE(review): meaning of the two trailing 8s is defined by
        # AlignmentFeatures and not visible here
        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < 0.3:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        # mask highly gapped positions (comparisons with NaN are False, so
        # masked positions never fall into any bin)
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        index_a = io.AMINO_INDICES[a]
        index_b = io.AMINO_INDICES[b]

        # iterate over pairs for bins
        for bin_name in sorted(couplings_per_bin.keys(), reverse=True):
            current_bin = couplings_per_bin[bin_name]

            if len(current_bin['couplings']) >= max_couplings_per_bin:
                continue

            cb_lower = current_bin['lower']
            cb_upper = current_bin['upper']

            residue_i, residue_j = np.where(
                (distance_map > cb_lower) & (distance_map < cb_upper))

            # Pair count of (i, j). The original indexed [residue_i, residue_i],
            # i.e. the diagonal, which ignores position j.
            # NOTE(review): assumes AF.Nij is indexed [i, j] -- confirm.
            Nij = AF.Nij[residue_i, residue_j]
            q_i_a = AF.single_frequencies[residue_i, index_a]
            q_j_b = AF.single_frequencies[residue_j, index_b]

            evidence = Nij * q_i_a * q_j_b
            enough_evidence = evidence > evidence_threshold

            residue_i = residue_i[enough_evidence]
            residue_j = residue_j[enough_evidence]

            if len(residue_i) == 0:
                continue

            ab_coupling = braw.x_pair[
                residue_i, residue_j, index_a, index_b
            ].tolist()[:max_nr_couplings_per_protein]
            current_bin['couplings'].extend(ab_coupling)

        for bin_name in sorted(couplings_per_bin.keys(), reverse=True):
            print("\nprotein {0} {1:<8} size: {2}".format(
                protein, bin_name, len(couplings_per_bin[bin_name]['couplings'])))

        # stop condition: all bins are full
        if all(len(bindict['couplings']) >= max_couplings_per_bin
               for bindict in couplings_per_bin.values()):
            break

    return couplings_per_bin
def collect_data(braw_dirs, alignment_dir, pdb_dir, bin_size, ab):
    """Collect couplings per method and distance bin.

    Iterates over all ``*psc`` alignment files in ``alignment_dir``. A protein
    is skipped when a braw file (``<protein>.filt.braw.gz``) is missing for any
    method, when its PDB file (``<protein>.pdb``) is missing, or when
    ``sqrt(N) / L`` of the alignment is below 0.3. Positions with a gap
    fraction above 0.3 are masked out of the distance map.

    For every distance bin the residue pairs whose distance lies strictly
    between the bin bounds are shuffled (with the ``random`` module) and at
    most 500 of them contribute a value to every method whose bin is not yet
    full. All methods use the same pairs.

    :param braw_dirs: dict mapping method name to the directory with that
        method's braw files
    :param alignment_dir: directory containing the ``*psc`` alignment files
    :param pdb_dir: directory containing ``<protein>.pdb`` files
    :param bin_size: number of couplings after which a bin counts as full
    :param ab: ``'all'`` to use the matrix returned by
        ``bu.compute_l2norm_from_brawfile(..., apc=True)``, otherwise an amino
        acid pair whose characters 0 and 2 are looked up in
        ``io.AMINO_INDICES``
    :return: dict ``method -> bin name -> list of couplings``; bin names look
        like ``"1: 0-5"``
    """

    # define distance bins
    bins = [0, 5, 8, 12, 15, 20, np.inf]
    max_nr_couplings_per_protein = 500

    # list() keeps `methods` indexable on Python 3, where keys() is a view
    methods = list(braw_dirs.keys())

    # bin names in bin order
    bin_names = [
        str(k + 1) + ": " + str(bins[k]) + "-" + str(bins[k + 1])
        for k in range(len(bins) - 1)
    ]

    couplings_per_bin = {}
    for method in methods:
        couplings_per_bin[method] = {bin_name: [] for bin_name in bin_names}

    # nothing to collect without any method
    if not methods:
        return couplings_per_bin

    # iterate over proteins
    psc_files = glob.glob(alignment_dir + "/*psc")
    for psc_file in psc_files:
        protein = os.path.basename(psc_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        # check if ALL braw files exist
        braw_files = {}
        for method in methods:
            braw_files[method] = braw_dirs[method] + "/" + protein + ".filt.braw.gz"

        if any(not os.path.exists(braw_files[method]) for method in methods):
            print("Skip this protein (braw files does not exist).")
            continue

        if not os.path.exists(pdb_file):
            print("Skip this protein (PDB file {0} does not exist).".format(pdb_file))
            continue

        alignment = io.read_alignment(psc_file, format="psicov")

        diversity = np.sqrt(alignment.shape[0]) / alignment.shape[1]
        if diversity < 0.3:
            print("Skip this protein (low diversity = {0}).".format(diversity))
            continue

        # only needed for proteins that passed the diversity filter
        distance_map = pdb.distance_map(pdb_file, alignment.shape[1])

        # read braw files
        braw = {}
        for method in methods:
            if ab == 'all':
                braw[method] = bu.compute_l2norm_from_brawfile(braw_files[method], apc=True)
            else:
                braw[method] = raw.parse_msgpack(braw_files[method])

        # mask highly gapped positions (comparisons with NaN are False)
        gaps = ali.compute_gaps_per_position(alignment)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # iterate over pairs for bins
        for bin_index, bin_name in enumerate(bin_names):
            cb_lower = bins[bin_index]
            cb_upper = bins[bin_index + 1]

            residue_i, residue_j = np.where(
                (distance_map > cb_lower) & (distance_map < cb_upper))

            # Shuffle to remove positioning bias. Shuffling a permutation of
            # positions instead of the zipped pairs also works when the bin
            # holds no pair at all (zip(*[]) cannot be indexed).
            order = list(range(len(residue_i)))
            random.shuffle(order)
            keep = np.array(order[:max_nr_couplings_per_protein], dtype=int)
            shuffled_i = residue_i[keep]
            shuffled_j = residue_j[keep]

            for method in methods:
                if len(couplings_per_bin[method][bin_name]) < bin_size:
                    if ab == 'all':
                        ab_coupling = braw[method][shuffled_i, shuffled_j].tolist()
                    else:
                        ab_coupling = braw[method].x_pair[
                            shuffled_i, shuffled_j,
                            io.AMINO_INDICES[ab[0]], io.AMINO_INDICES[ab[2]]
                        ].tolist()

                    couplings_per_bin[method][bin_name].extend(ab_coupling)

            print("\nprotein {0} bin: {1:<8} size: {2}".format(
                protein, bin_name, len(couplings_per_bin[methods[0]][bin_name])))

        # stop condition: all bins are full
        if all(len(v) >= bin_size for v in couplings_per_bin[methods[0]].values()):
            break

    return couplings_per_bin
def collect_data(braw_dirs, alignment_dir, pdb_dir, bin_size, ab):
    """Collect couplings per method and distance bin.

    Iterates over all ``*psc`` alignment files in ``alignment_dir``. A protein
    is skipped when a braw file (``<protein>.filt.braw.gz``) is missing for any
    method, when its PDB file (``<protein>.pdb``) is missing, or when
    ``sqrt(N) / L`` of the alignment is below 0.3. Positions with a gap
    fraction above 0.3 are masked out of the distance map.

    For every distance bin the residue pairs whose distance lies strictly
    between the bin bounds are shuffled (with the ``random`` module) and at
    most 500 of them contribute a value to every method whose bin is not yet
    full. All methods use the same pairs.

    :param braw_dirs: dict mapping method name to the directory with that
        method's braw files
    :param alignment_dir: directory containing the ``*psc`` alignment files
    :param pdb_dir: directory containing ``<protein>.pdb`` files
    :param bin_size: number of couplings after which a bin counts as full
    :param ab: ``'all'`` to use the matrix returned by
        ``bu.compute_l2norm_from_brawfile(..., apc=True)``, otherwise an amino
        acid pair whose characters 0 and 2 are looked up in
        ``io.AMINO_INDICES``
    :return: dict ``method -> bin name -> list of couplings``; bin names look
        like ``"1: 0-5"``
    """

    # define distance bins
    bins = [0, 5, 8, 12, 15, 20, np.inf]
    max_nr_couplings_per_protein = 500

    # list() keeps `methods` indexable on Python 3, where keys() is a view
    methods = list(braw_dirs.keys())

    # bin names in bin order
    bin_names = [
        str(k + 1) + ": " + str(bins[k]) + "-" + str(bins[k + 1])
        for k in range(len(bins) - 1)
    ]

    couplings_per_bin = {}
    for method in methods:
        couplings_per_bin[method] = {bin_name: [] for bin_name in bin_names}

    # nothing to collect without any method
    if not methods:
        return couplings_per_bin

    # iterate over proteins
    psc_files = glob.glob(alignment_dir + "/*psc")
    for psc_file in psc_files:
        protein = os.path.basename(psc_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        # check if ALL braw files exist
        braw_files = {}
        for method in methods:
            braw_files[method] = (
                braw_dirs[method] + "/" + protein + ".filt.braw.gz")

        if any(not os.path.exists(braw_files[method]) for method in methods):
            print("Skip this protein (braw files does not exist).")
            continue

        if not os.path.exists(pdb_file):
            print("Skip this protein (PDB file {0} does not exist).".format(
                pdb_file))
            continue

        alignment = io.read_alignment(psc_file, format="psicov")

        diversity = np.sqrt(alignment.shape[0]) / alignment.shape[1]
        if diversity < 0.3:
            print("Skip this protein (low diversity = {0}).".format(diversity))
            continue

        # only needed for proteins that passed the diversity filter
        distance_map = pdb.distance_map(pdb_file, alignment.shape[1])

        # read braw files
        braw = {}
        for method in methods:
            if ab == 'all':
                braw[method] = bu.compute_l2norm_from_brawfile(
                    braw_files[method], apc=True)
            else:
                braw[method] = raw.parse_msgpack(braw_files[method])

        # mask highly gapped positions (comparisons with NaN are False)
        gaps = ali.compute_gaps_per_position(alignment)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # iterate over pairs for bins
        for bin_index, bin_name in enumerate(bin_names):
            cb_lower = bins[bin_index]
            cb_upper = bins[bin_index + 1]

            residue_i, residue_j = np.where(
                (distance_map > cb_lower) & (distance_map < cb_upper))

            # Shuffle to remove positioning bias. Shuffling a permutation of
            # positions instead of the zipped pairs also works when the bin
            # holds no pair at all (zip(*[]) cannot be indexed).
            order = list(range(len(residue_i)))
            random.shuffle(order)
            keep = np.array(order[:max_nr_couplings_per_protein], dtype=int)
            shuffled_i = residue_i[keep]
            shuffled_j = residue_j[keep]

            for method in methods:
                if len(couplings_per_bin[method][bin_name]) < bin_size:
                    if ab == 'all':
                        ab_coupling = braw[method][
                            shuffled_i, shuffled_j].tolist()
                    else:
                        ab_coupling = braw[method].x_pair[
                            shuffled_i, shuffled_j,
                            io.AMINO_INDICES[ab[0]],
                            io.AMINO_INDICES[ab[2]]].tolist()

                    couplings_per_bin[method][bin_name].extend(ab_coupling)

            print("\nprotein {0} bin: {1:<8} size: {2}".format(
                protein, bin_name,
                len(couplings_per_bin[methods[0]][bin_name])))

        # stop condition: all bins are full
        if all(len(v) >= bin_size
               for v in couplings_per_bin[methods[0]].values()):
            break

    return couplings_per_bin
def collect_data(braw_dir, alignment_dir, pdb_dir, pairs, lower_cb_distance, upper_cb_distance):
    """Collect couplings of several amino acid pairs within one distance window.

    Iterates over all ``*braw.gz`` files in ``braw_dir``. A protein is skipped
    when its PDB file (``<protein>.pdb``), braw file or alignment file
    (``<protein>.filt.psc``) is missing, or when ``sqrt(AF.N) / AF.L`` is
    below 0.3. Positions whose gap fraction ``1 - Ni / neff`` exceeds 0.3 are
    masked out of the distance map.

    For every amino acid pair that is not yet full, residue pairs whose
    distance lies strictly between ``lower_cb_distance`` and
    ``upper_cb_distance`` and whose evidence
    ``max(Nij * q_i(a) * q_j(b), Nij * q_ij(a, b))`` exceeds 100 contribute
    their ``a``-``b`` coupling; at most 500 couplings are taken per protein
    and pair. Collection stops once every pair holds at least 1000 values.

    :param braw_dir: directory containing ``*braw.gz`` files
    :param alignment_dir: directory containing ``<protein>.filt.psc`` files
    :param pdb_dir: directory containing ``<protein>.pdb`` files
    :param pairs: iterable of amino acid pairs; characters 0 and 2 of each
        are looked up in ``io.AMINO_INDICES`` (e.g. ``"R-E"``)
    :param lower_cb_distance: exclusive lower distance bound
    :param upper_cb_distance: exclusive upper distance bound
    :return: dict mapping each pair to its list of couplings
    """

    couplings_per_pair = {}
    for pair in pairs:
        couplings_per_pair[pair] = []

    max_nr_couplings_per_protein = 500
    sequence_separation = 8
    evidence_threshold = 100
    max_couplings_per_bin = 1000

    # iterate over proteins
    braw_files = glob.glob(braw_dir + "/*braw.gz")
    for braw_file in braw_files:
        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(pdb_file))
            continue

        if not os.path.exists(braw_file):
            print("Braw file {0} does not exist. Skip this protein.".format(braw_file))
            continue

        if not os.path.exists(alignment_file):
            print("Alignment file {0} does not exist. Skip this protein.".format(alignment_file))
            continue

        # NOTE(review): meaning of the two trailing 8s is defined by
        # AlignmentFeatures and not visible here
        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < 0.3:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        # mask highly gapped positions (comparisons with NaN are False)
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # residue pairs inside the distance window; identical for every
        # amino acid pair, so computed once per protein
        window_i, window_j = np.where(
            (distance_map > lower_cb_distance) & (distance_map < upper_cb_distance))

        # iterate over amino acid pairs
        for pair in pairs:
            if len(couplings_per_pair[pair]) >= max_couplings_per_bin:
                continue

            if len(window_i) == 0:
                continue

            a = pair[0]
            b = pair[2]
            index_a = io.AMINO_INDICES[a]
            index_b = io.AMINO_INDICES[b]

            # Pair count of (i, j). The original indexed [residue_i, residue_i],
            # i.e. the diagonal, which ignores position j.
            # NOTE(review): assumes AF.Nij is indexed [i, j] -- confirm.
            Nij = AF.Nij[window_i, window_j]
            q_i_a = AF.single_frequencies[window_i, index_a]
            q_j_b = AF.single_frequencies[window_j, index_b]
            q_ij_ab = AF.pairwise_frequencies[window_i, window_j, index_a, index_b]

            # element-wise maximum per residue pair; np.max([x, y]) would
            # collapse both arrays into one single scalar
            evidence = np.maximum(Nij * q_i_a * q_j_b, Nij * q_ij_ab)
            enough_evidence = evidence > evidence_threshold

            residue_i = window_i[enough_evidence]
            residue_j = window_j[enough_evidence]

            if len(residue_i) == 0:
                continue

            ab_coupling = braw.x_pair[
                residue_i, residue_j, index_a, index_b
            ].tolist()[:max_nr_couplings_per_protein]
            couplings_per_pair[pair].extend(ab_coupling)

        summary = "\n\nprotein {0}".format(protein)
        for pair in sorted(couplings_per_pair.keys()):
            summary += "\n{0:<8} : {1}".format(pair, len(couplings_per_pair[pair]))
        print(summary)

        # stop condition: all pairs are full
        if all(len(couplings_per_pair[pair]) >= max_couplings_per_bin for pair in pairs):
            break

    return couplings_per_pair