def main():
    """Render the pseudo-likelihood vs. persistent-contrastive-divergence benchmark.

    Takes ``data_dir`` from the parsed command-line arguments, registers the
    ``apc.mat`` and ``raw.mat`` score matrices found in ``predictions_pll/``
    and ``predictions_pcd/`` with a ``Benchmark`` built on ``<data_dir>/pdb/``,
    and hands the resulting precision-vs-rank plot to
    ``plot_pll_vs_pcd_benchmark_figure`` together with the plot directory
    ``<data_dir>/plots/benchmarks/`` (created if missing).
    """
    cli = parse_args()
    data_dir = cli.data_dir

    plot_dir = data_dir + "/plots/benchmarks/"
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)

    # (display name, prediction sub-directory, matrix file suffix)
    methods = (
        ("pseudo-likelihood APC", "/predictions_pll/", "apc.mat"),
        ("pseudo-likelihood raw", "/predictions_pll/", "raw.mat"),
        ("persistent contrastive divergence APC", "/predictions_pcd/", "apc.mat"),
        ("persistent contrastive divergence raw", "/predictions_pcd/", "raw.mat"),
    )

    benchmark = Benchmark(data_dir + "/pdb/")
    for display_name, sub_dir, suffix in methods:
        benchmark.add_method(display_name, data_dir + sub_dir, suffix)

    # Constraint on the "opt_code" meta field (intended to keep only
    # proteins whose MRF optimization exited successfully).
    benchmark.add_constraint("opt_code", 0, "greater_equal")

    # Precision of the predictions at sequence separation 6 and an 8 A
    # contact / non-contact threshold.
    benchmark.compute_evaluation_statistics(
        seqsep=6, contact_thr=8, noncontact_thr=8)

    # Format the benchmark plot to resemble the one in Fig 1C.
    precision_plot = benchmark.plot_precision_vs_rank()
    plot_pll_vs_pcd_benchmark_figure(
        precision_plot, plot_dir, height=500, width=1000)
def _optional_input_path(directory, protein, suffix):
    """Return ``directory/protein + suffix``, or None when no directory was given."""
    if directory is None:
        return None
    return directory + "/" + protein + suffix


def main():
    """Compute contact-prior matrices for test-set proteins and add them to the benchmark.

    All paths and settings come from the parsed command-line arguments. The
    dataset property files are read to find the proteins of benchmark
    datasets 6, 7 and 8. Proteins are then visited in shuffled chunks of
    ``n_proteins`` until ``n_proteins`` of them have been evaluated or the
    test set is exhausted. A protein is skipped when its evaluation file
    already exists, or when the alignment, psipred, netsurfp or any requested
    optional input file (pLL/CD/PCD braw, Bayesian posterior / Bayes factor
    matrix) is missing.
    """
    args = parse_args()

    property_files_dir = args.property_files_dir
    alignment_dir = args.alignment_dir
    psipred_dir = args.psipred_dir
    netsurfp_dir = args.netsurfp_dir
    mi_dir = args.mi_dir
    omes_dir = args.omes_dir
    model_file = args.model_file
    evaluation_dir = args.evaluation_dir
    method_name = args.method
    n_proteins = args.n_proteins
    n_threads = args.n_threads
    sequence_separation = args.sequence_separation
    contact_threshold = args.contact_threshold

    pll_braw_dir = args.pll_braw
    cd_braw_dir = args.cd_braw
    pcd_braw_dir = args.pcd_braw
    bayposterior_mat_dir = args.bayposterior_mat
    bayesfactor_mat_dir = args.bayesfactor_mat

    print("Add evaluation files for method {0} to {1}".format(method_name, evaluation_dir))

    print("\nPaths to data:\n")
    print("Alignment dir: \t\t {0}".format(alignment_dir))
    print("Psipred dir: \t\t {0}".format(psipred_dir))
    print("Netsurfp dir: \t\t {0}".format(netsurfp_dir))
    print("MI dir: \t\t {0}".format(mi_dir))
    print("OMES dir: \t\t {0}".format(omes_dir))
    print("Modelfile dir: \t\t {0}".format(model_file))

    print("\nPaths to additional data:\n")
    print("pLL Braw dir: \t\t {0}".format(pll_braw_dir))
    print("CD Braw dir: \t\t {0}".format(cd_braw_dir))
    print("PCD Braw dir: \t\t {0}".format(pcd_braw_dir))
    print("BayPost Mat dir: \t\t {0}".format(bayposterior_mat_dir))
    print("BayFactor Mat dir: \t\t {0}".format(bayesfactor_mat_dir))

    # Existing evaluation files are skipped rather than overwritten.
    update = False

    # Setup dataset_id: every property file gets a 1-based id in sorted file
    # order. Frames are collected and concatenated once, because
    # DataFrame.append was removed in pandas 2.0 and copies on every call.
    property_frames = []
    for dataset_id, property_file in enumerate(
            sorted(glob.glob(property_files_dir + "/*")), start=1):
        properties = pd.read_table(property_file)
        properties['id'] = dataset_id
        properties.columns = ['protein', 'resol', 'CATH-topology',
                              'domlength', 'alilength', 'dataset_id']
        property_frames.append(properties)
    if property_frames:
        dataset_properties = pd.concat(property_frames, ignore_index=True)
    else:
        dataset_properties = pd.DataFrame()

    # Setup Benchmark framework
    b = Benchmark(evaluation_dir)

    # Benchmark on these datasets. The name is referenced via "@" inside the
    # query string below, so it must stay a local variable with this name.
    benchmark_dataset_id = [6, 7, 8]

    # Load model
    rf_clf, rf_meta = BayesianContactPredictor.load_contact_prior_model(model_file)

    proteins_in_testset = dataset_properties.query(
        'dataset_id in @benchmark_dataset_id')['protein'].values

    print("Start processing alignment files...")

    # Iterate over proteins in chunks of n_proteins until enough were evaluated.
    counter = 0
    it = -1
    while counter < n_proteins:
        it += 1
        proteins_subset = proteins_in_testset[(it * n_proteins):((it + 1) * n_proteins)]

        # Without this guard an exhausted test set yields empty chunks forever
        # and the while loop never terminates.
        if len(proteins_subset) == 0:
            print("Test set exhausted: evaluated {0} of {1} requested proteins.".format(
                counter, n_proteins))
            break

        np.random.shuffle(proteins_subset)

        for protein in proteins_subset:
            if counter >= n_proteins:
                break

            protein = protein.strip()

            alignment_file = alignment_dir + "/" + protein + ".filt.psc"
            psipred_file = psipred_dir + "/" + protein + ".filt.withss.a3m.ss2"
            netsurfp_file = netsurfp_dir + "/" + protein + ".filt.netsurfp"
            mi_file = mi_dir + "/" + protein + ".filt.mi.pc.mat"
            # The separating dot before "filt" was missing here, so the path
            # never matched the "<protein>.filt.omes.fodoraldrich.mat" files.
            omes_file = omes_dir + "/" + protein + ".filt.omes.fodoraldrich.mat"
            eval_file = evaluation_dir + "/" + protein + "." + method_name

            # Optional inputs stay None when their directory was not given.
            pll_braw_file = _optional_input_path(pll_braw_dir, protein, ".filt.braw.gz")
            cd_braw_file = _optional_input_path(cd_braw_dir, protein, ".filt.braw.gz")
            pcd_braw_file = _optional_input_path(pcd_braw_dir, protein, ".filt.braw.gz")
            bayposterior_mat_file = _optional_input_path(
                bayposterior_mat_dir, protein, ".bayesian_3comp_pLL.mat")
            bayfactor_mat_file = _optional_input_path(
                bayesfactor_mat_dir, protein, ".bayesian_3comp_pLL.mat")

            if os.path.exists(eval_file) and not update:
                print("Evaluation file {0} already exists. Skip this protein. ".format(eval_file))
                continue

            if not os.path.exists(alignment_file):
                print("Alignment file {0} does not exist. Skip this protein. ".format(alignment_file))
                continue

            # Inputs that must exist, checked in order; the first missing one
            # is reported and the protein is skipped.
            checked_inputs = (
                ("Psipred", psipred_file),
                ("NetsurfP", netsurfp_file),
                ("pLL braw", pll_braw_file),
                ("CD braw", cd_braw_file),
                ("PCD braw", pcd_braw_file),
                ("bayesian posterior mat", bayposterior_mat_file),
                ("bayes factor mat", bayfactor_mat_file),
            )
            first_missing = next(
                ((label, path) for label, path in checked_inputs
                 if path is not None and not os.path.exists(path)),
                None)
            if first_missing is not None:
                print(" {0} file {1} does not exist. Skip protein {2}!".format(
                    first_missing[0], first_missing[1], protein))
                continue

            BCP = BayesianContactPredictor(alignment_file)
            BCP.set_contact_prior_model(rf_clf, rf_meta)

            BCP.set_n_threads(n_threads)
            BCP.set_sequence_separation(sequence_separation)
            BCP.set_contact_threshold(contact_threshold)

            BCP.contact_prior(
                psipred_file, netsurfp_file, mi_file, omes_file,
                pll_braw_file, cd_braw_file, pcd_braw_file,
                bayposterior_mat_file, bayfactor_mat_file
            )

            contact_prior_mat = BCP.get_contact_prior(contact=1)
            meta = {
                'opt_code': 1,
                'rf': rf_meta
            }

            b.add_method(protein, method_name, contact_prior_mat, meta, apc=False, update=update)
            counter += 1
def _benchmark_plot_for_topology(pdb_dir, predictions_dir, topology,
                                 seqsep, contact_thr, noncontact_thr):
    """Return the precision-vs-rank plot for one tree topology.

    ``topology`` is the token embedded in the matrix file suffixes
    (``apc.<topology>.mat``, ``ec.<topology>.mat``, ``raw.<topology>.mat``).
    """
    b = Benchmark(pdb_dir)

    # specify methods to benchmark
    b.add_method("APC", predictions_dir, "apc." + topology + ".mat")
    b.add_method("EC", predictions_dir, "ec." + topology + ".mat")
    b.add_method("no APC", predictions_dir, "raw." + topology + ".mat")

    # Constraint on the "opt_code" meta field (intended to keep only
    # proteins whose MRF optimization exited successfully).
    b.add_constraint("opt_code", 0, "greater_equal")

    # compute the precision of predictions
    b.compute_evaluation_statistics(
        seqsep=seqsep, contact_thr=contact_thr, noncontact_thr=noncontact_thr)

    return b.plot_precision_vs_rank()


def main():
    """Create the star-topology, binary-topology and noise-quantification figures.

    Takes ``data_dir`` from the parsed command-line arguments, benchmarks the
    matrices in ``<data_dir>/recover_pcd_constrained/`` against
    ``<data_dir>/pdb/`` for both topologies and writes ``fig_6b.html`` (star),
    ``fig_6a.html`` (binary) and ``fig_6c.html`` (noise quantification) into
    ``<data_dir>/plots/benchmarks/`` (created if missing).
    """
    args = parse_args()

    data_dir = args.data_dir
    plot_dir = data_dir + "/plots/benchmarks/"
    pdb_dir = data_dir + "/pdb/"
    predictions_dir = data_dir + "/recover_pcd_constrained/"

    sequence_separation = 6
    contact_thr = 8
    non_contact_thr = 8

    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)

    # benchmark plot for star-tree topologies
    benchmark_plot_star = _benchmark_plot_for_topology(
        pdb_dir, predictions_dir, "star",
        sequence_separation, contact_thr, non_contact_thr)
    plot_file = plot_dir + "/" + "fig_6b.html"
    plot_ccmgen_benchmark_figure(
        benchmark_plot_star, 'star topology', plot_file, height=350, width=500)

    # benchmark plot for binary-tree topologies
    benchmark_plot_binary = _benchmark_plot_for_topology(
        pdb_dir, predictions_dir, "binary",
        sequence_separation, contact_thr, non_contact_thr)
    plot_file = plot_dir + "/" + "fig_6a.html"
    plot_ccmgen_benchmark_figure(
        benchmark_plot_binary, 'binary topology', plot_file, height=350, width=500)

    # quantification of noise plot, built from both benchmark plots
    plot_file = plot_dir + "/" + "fig_6c.html"
    plot_ccmgen_noise_quant_figure(
        benchmark_plot_star, benchmark_plot_binary, plot_file, height=350, width=500)
def main():
    """Compute Bayesian contact posteriors for test-set proteins and add them to the benchmark.

    All paths and settings come from the parsed command-line arguments. The
    dataset property files are read to find the proteins of benchmark
    datasets 6, 7 and 8; of those with an existing braw file, the first
    ``n_proteins`` are processed in random order. For every protein the
    contact prior, likelihood and posterior are computed and stored under
    ``method_name`` (posterior) and ``"rf_contact_prior"`` (prior).
    Optionally the log Bayes factor (``method_name + "_logbf"``) and the
    normalized likelihood (``method_name + "_llik"``) are stored as well.
    Proteins with a missing alignment, braw or qij file are skipped.
    """
    args = parse_args()

    property_files_dir = args.property_files_dir
    alignment_dir = args.alignment_dir
    psipred_dir = args.psipred_dir
    netsurfp_dir = args.netsurfp_dir
    mi_dir = args.mi_dir
    omes_dir = args.omes_dir
    braw_dir = args.braw_dir
    qij_dir = args.qij_dir
    evaluation_dir = args.evaluation_dir
    n_proteins = args.n_proteins
    n_threads = args.n_threads
    sequence_separation = args.sequence_separation
    contact_threshold = args.contact_threshold
    evaluate_likelihood = args.evaluate_likelihood
    evaluate_bayes_factor = args.evaluate_bayes_factor
    contact_prior_model_file = args.contact_prior_model_file
    coupling_prior_parameters_file = args.coupling_prior_parameters_file
    method_name = args.name

    # Setup dataset_id: every property file gets a 1-based id in sorted file
    # order. Frames are collected and concatenated once, because
    # DataFrame.append was removed in pandas 2.0 and copies on every call.
    property_frames = []
    for dataset_id, property_file in enumerate(
            sorted(glob.glob(property_files_dir + "/*")), start=1):
        properties = pd.read_table(property_file)
        properties['id'] = dataset_id
        properties.columns = ['protein', 'resol', 'CATH-topology',
                              'domlength', 'alilength', 'dataset_id']
        property_frames.append(properties)
    if property_frames:
        dataset_properties = pd.concat(property_frames, ignore_index=True)
    else:
        dataset_properties = pd.DataFrame()

    # Setup Benchmark framework
    b = Benchmark(evaluation_dir)

    # Load contact prior model
    rf_clf, rf_meta = BayesianContactPredictor.load_contact_prior_model(
        contact_prior_model_file)

    # Load coupling prior parameters
    coupling_prior_parameters = BayesianContactPredictor.load_coupling_prior_hyperparameters(
        coupling_prior_parameters_file)

    # Get all existing braw files. benchmark_dataset_id is referenced via "@"
    # inside the query string, so it must stay a local with this name.
    benchmark_dataset_id = [6, 7, 8]
    proteins_in_testset = dataset_properties.query(
        'dataset_id in @benchmark_dataset_id')['protein'].values

    braw_files = [braw_dir + "/" + protein.strip() + ".filt.braw.gz"
                  for protein in proteins_in_testset]
    braw_files_existing = [braw_file for braw_file in braw_files
                           if os.path.exists(braw_file)]

    # Take the first n_proteins existing files, then randomize their order.
    selected_braw_files = braw_files_existing[:n_proteins]
    braw_files_shuff = random.sample(selected_braw_files, len(selected_braw_files))

    print("Start processing {0} braw files...".format(len(braw_files_shuff)))

    # Iterate over proteins
    for braw_file in braw_files_shuff:
        # The protein name is the file name up to its first dot.
        protein = os.path.basename(braw_file).split(".")[0]

        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        psipred_file = psipred_dir + "/" + protein + ".filt.withss.a3m.ss2"
        netsurfp_file = netsurfp_dir + "/" + protein + ".filt.netsurfp"
        mi_file = mi_dir + "/" + protein + ".filt.mi.pc.mat"
        omes_file = omes_dir + "/" + protein + ".filt.omes.fodoraldrich.mat"
        qij_file = qij_dir + "/" + protein + ".filt.bqij.gz"

        if not os.path.exists(alignment_file):
            print("Alignment file {0} does not exist. Skip this protein. ".format(alignment_file))
            continue

        if not os.path.exists(braw_file):
            print("binary Raw file {0} does not exist. Skip this protein. ".format(braw_file))
            continue

        if not os.path.exists(qij_file):
            print("Qij file {0} does not exist. Skip this protein. ".format(qij_file))
            continue

        print("Compute posterior probabilities for contact with Bayesian model for protein {0}".format(protein))

        BCP = BayesianContactPredictor(alignment_file)
        BCP.set_contact_prior_model(rf_clf, rf_meta)
        BCP.set_coupling_prior_parameters(coupling_prior_parameters)

        BCP.set_n_threads(n_threads)
        BCP.set_sequence_separation(sequence_separation)
        BCP.set_contact_threshold(contact_threshold)

        BCP.contact_prior(psipred_file, netsurfp_file, mi_file, omes_file)
        BCP.contact_likelihood(braw_file, qij_file)
        BCP.contact_posterior()

        contact_prior_mat = BCP.get_contact_prior(contact=1)
        contact_posterior_mat = BCP.get_contact_posterior(contact=1)
        posterior_meta = BCP.get_meta()

        b.add_method(protein.strip(), method_name, contact_posterior_mat,
                     posterior_meta, apc=False, update=True)
        b.add_method(protein.strip(), "rf_contact_prior", contact_prior_mat,
                     posterior_meta, apc=False, update=True)

        if evaluate_bayes_factor:
            contact_likelihood_mat = BCP.get_contact_likelihood(
                contact=1, normalized=False, bayes_factor=True)
            b.add_method(protein.strip(), method_name + "_logbf", contact_likelihood_mat,
                         posterior_meta, apc=False, update=True)

        if evaluate_likelihood:
            contact_likelihood_mat = BCP.get_contact_likelihood(
                contact=1, normalized=True, bayes_factor=False)
            b.add_method(protein.strip(), method_name + "_llik", contact_likelihood_mat,
                         posterior_meta, apc=False, update=True)
def main():
    """Compute Bayesian contact posteriors for test-set proteins and add them to the benchmark.

    All paths and settings come from the parsed command-line arguments. The
    dataset property files are read to find the proteins of benchmark
    datasets 6, 7 and 8; of those with an existing braw file, the first
    ``n_proteins`` are processed in random order. For every protein the
    contact prior, likelihood and posterior are computed and stored under
    ``method_name`` (posterior) and ``"rf_contact_prior"`` (prior).
    Optionally the log Bayes factor (``method_name + "_logbf"``) and the
    normalized likelihood (``method_name + "_llik"``) are stored as well.
    Proteins with a missing alignment, braw or qij file are skipped.
    """
    args = parse_args()

    property_files_dir = args.property_files_dir
    alignment_dir = args.alignment_dir
    psipred_dir = args.psipred_dir
    netsurfp_dir = args.netsurfp_dir
    mi_dir = args.mi_dir
    omes_dir = args.omes_dir
    braw_dir = args.braw_dir
    qij_dir = args.qij_dir
    evaluation_dir = args.evaluation_dir
    n_proteins = args.n_proteins
    n_threads = args.n_threads
    sequence_separation = args.sequence_separation
    contact_threshold = args.contact_threshold
    evaluate_likelihood = args.evaluate_likelihood
    evaluate_bayes_factor = args.evaluate_bayes_factor
    contact_prior_model_file = args.contact_prior_model_file
    coupling_prior_parameters_file = args.coupling_prior_parameters_file
    method_name = args.name

    # Setup dataset_id: every property file gets a 1-based id in sorted file
    # order. Frames are collected and concatenated once, because
    # DataFrame.append was removed in pandas 2.0 and copies on every call.
    property_frames = []
    for dataset_id, property_file in enumerate(
            sorted(glob.glob(property_files_dir + "/*")), start=1):
        properties = pd.read_table(property_file)
        properties['id'] = dataset_id
        properties.columns = [
            'protein', 'resol', 'CATH-topology', 'domlength', 'alilength',
            'dataset_id'
        ]
        property_frames.append(properties)
    if property_frames:
        dataset_properties = pd.concat(property_frames, ignore_index=True)
    else:
        dataset_properties = pd.DataFrame()

    # Setup Benchmark framework
    b = Benchmark(evaluation_dir)

    # Load contact prior model
    rf_clf, rf_meta = BayesianContactPredictor.load_contact_prior_model(
        contact_prior_model_file)

    # Load coupling prior parameters
    coupling_prior_parameters = BayesianContactPredictor.load_coupling_prior_hyperparameters(
        coupling_prior_parameters_file)

    # Get all existing braw files. benchmark_dataset_id is referenced via "@"
    # inside the query string, so it must stay a local with this name.
    benchmark_dataset_id = [6, 7, 8]
    proteins_in_testset = dataset_properties.query(
        'dataset_id in @benchmark_dataset_id')['protein'].values

    braw_files = [
        braw_dir + "/" + protein.strip() + ".filt.braw.gz"
        for protein in proteins_in_testset
    ]
    braw_files_existing = [
        braw_file for braw_file in braw_files if os.path.exists(braw_file)
    ]

    # Take the first n_proteins existing files, then randomize their order.
    selected_braw_files = braw_files_existing[:n_proteins]
    braw_files_shuff = random.sample(selected_braw_files,
                                     len(selected_braw_files))

    print("Start processing {0} braw files...".format(len(braw_files_shuff)))

    # Iterate over proteins
    for braw_file in braw_files_shuff:
        # The protein name is the file name up to its first dot.
        protein = os.path.basename(braw_file).split(".")[0]

        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        psipred_file = psipred_dir + "/" + protein + ".filt.withss.a3m.ss2"
        netsurfp_file = netsurfp_dir + "/" + protein + ".filt.netsurfp"
        mi_file = mi_dir + "/" + protein + ".filt.mi.pc.mat"
        omes_file = omes_dir + "/" + protein + ".filt.omes.fodoraldrich.mat"
        qij_file = qij_dir + "/" + protein + ".filt.bqij.gz"

        if not os.path.exists(alignment_file):
            print("Alignment file {0} does not exist. Skip this protein. ".
                  format(alignment_file))
            continue

        if not os.path.exists(braw_file):
            print("binary Raw file {0} does not exist. Skip this protein. ".
                  format(braw_file))
            continue

        if not os.path.exists(qij_file):
            print("Qij file {0} does not exist. Skip this protein. ".format(
                qij_file))
            continue

        print(
            "Compute posterior probabilities for contact with Bayesian model for protein {0}"
            .format(protein))

        BCP = BayesianContactPredictor(alignment_file)
        BCP.set_contact_prior_model(rf_clf, rf_meta)
        BCP.set_coupling_prior_parameters(coupling_prior_parameters)

        BCP.set_n_threads(n_threads)
        BCP.set_sequence_separation(sequence_separation)
        BCP.set_contact_threshold(contact_threshold)

        BCP.contact_prior(psipred_file, netsurfp_file, mi_file, omes_file)
        BCP.contact_likelihood(braw_file, qij_file)
        BCP.contact_posterior()

        contact_prior_mat = BCP.get_contact_prior(contact=1)
        contact_posterior_mat = BCP.get_contact_posterior(contact=1)
        posterior_meta = BCP.get_meta()

        b.add_method(protein.strip(),
                     method_name,
                     contact_posterior_mat,
                     posterior_meta,
                     apc=False,
                     update=True)
        b.add_method(protein.strip(),
                     "rf_contact_prior",
                     contact_prior_mat,
                     posterior_meta,
                     apc=False,
                     update=True)

        if evaluate_bayes_factor:
            contact_likelihood_mat = BCP.get_contact_likelihood(
                contact=1, normalized=False, bayes_factor=True)
            b.add_method(protein.strip(),
                         method_name + "_logbf",
                         contact_likelihood_mat,
                         posterior_meta,
                         apc=False,
                         update=True)

        if evaluate_likelihood:
            contact_likelihood_mat = BCP.get_contact_likelihood(
                contact=1, normalized=True, bayes_factor=False)
            b.add_method(protein.strip(),
                         method_name + "_llik",
                         contact_likelihood_mat,
                         posterior_meta,
                         apc=False,
                         update=True)