def __predict_contacts(method, bin_dir, prot_id, fasta_file, aln_file, modified_aln_file, config, cpu):
    ''' Run one contact-prediction pipeline for a single protein.

    The shell command first sources ``<bin_dir>/set_envs.sh`` with the method
    name and then launches the pipeline:

    * "deepcontact": ``<bin_dir>/contacts/deepcontact.py`` on the fasta file.
    * "deepconpred": ``<bin_dir>/contacts/deepconpred.py`` on the fasta file
      and the alignment.
    * any other value: the ``deepcov.sh`` script found under the "deepcov"
      path of the configuration, fed with the modified alignment.

    The prediction is written to ``<method>/<prot_id>.con``.
    '''
    env_file = "{}/set_envs.sh".format(bin_dir)
    out = "{}/{}.con".format(method, prot_id)
    banner = "Run {} pipeline".format(method).upper()

    if method in ("deepcontact", "deepconpred"):
        pipeline = "{}/contacts/{}.py".format(bin_dir, method)
        if method == "deepcontact":
            # This wrapper is given the fasta file only, no alignment
            command = "source {} {} && python {} {} {} -cpu {}".format(
                env_file, method, pipeline, fasta_file, out, cpu)
        else:
            command = "source {} {} && python {} {} {} {} -cpu {}".format(
                env_file, method, pipeline, fasta_file, aln_file, out, cpu)
    else:
        # Fallback branch: everything else is run through the DeepCov script
        pipeline = "{}/deepcov.sh".format(config.get("deepcov", "path"))
        model = config.get("deepcov", "model")
        receptive_field = config.get("deepcov", "receptive_field")
        command = "source {} {} && bash {} -m {} -r {} -i {} -o {}".format(
            env_file, method, pipeline, model, receptive_field,
            modified_aln_file, out)

    __run_command(banner, command, separator="#" * 60)
def __compute_contacts(contact_script,methods,cpu,ids):
    ''' Extract native contacts, predict contacts with every requested
    method and build the contactbench files.

    For each protein in ``ids`` the contact script is launched once with all
    methods listed after ``-run``. Afterwards, for every (method, protein)
    pair, the prediction and the native contact file are handed to
    ``__rr_to_contactbench``, which receives the ``.contactbench`` output
    path as third argument.
    '''
    __extract_native(ids)

    # Method names, each followed by one space (trailing space included)
    run = "".join(name + " " for name in methods)
    for prot in ids:
        banner = "Predicting contacts for " + prot + " using " + str(methods) + "\n"
        command = "python {} seq/{}.fa contacts/ -run {} -cpu {}".format(
            contact_script, prot, run, cpu)
        __run_command(banner, command, separator="=" * 60)

    # Contactbench
    for method in methods:
        method_dir = "contacts/" + method + "/"
        for prot in ids:
            __rr_to_contactbench(
                method_dir + prot + ".con",
                "contacts/native/" + prot + ".con",
                method_dir + prot + ".contactbench")
def __spider3(fasta_file, prot_id, feature_out_prefix, config, env_file, outname, only_sse):
    ''' Use Spider3 to predict secondary structure.

    The function changes the working directory to the Spider3 installation
    (configuration section "spider3-deepconpred", option "path") and does
    NOT change it back. The fasta file and the ``.pssm``/``.hhm`` profiles
    found at ``feature_out_prefix`` are copied there under ``prot_id``,
    Spider3 is run on them, and the resulting ``<prot_id>.spd33`` is moved to
    ``<feature_out_prefix>.spd33``. Every remaining file whose name starts
    with ``prot_id`` and the task list are deleted from the Spider3 directory.

    When ``only_sse`` is exactly ``True`` the prediction is also copied to
    ``outname``.
    '''
    # Change to Spider3 directory
    spider_dir = config.get("spider3-deepconpred", "path")
    os.chdir(spider_dir)

    # Copy input files to Spider3 directory
    inputs = [(fasta_file, prot_id),
              (feature_out_prefix + ".pssm", prot_id + ".pssm"),
              (feature_out_prefix + ".hhm", prot_id + ".hhm")]
    for source, target in inputs:
        shutil.copy(source, target)

    # Write the task list directly: no shell involved, so prot_id is never
    # interpreted by a shell and an I/O failure raises instead of being ignored
    with open("tlist", "w") as task_list:
        task_list.write("{} {}.pssm {}.hhm\n".format(prot_id, prot_id, prot_id))

    # Run Spider3
    description_prog = "Spider3: prediction of secondary structure. It is executed from Spider3 directory"
    command = "source {} spider && bash ./scripts/impute_script.sh tlist".format(
        env_file)
    __run_command(description_prog, command, separator="=" * 60)

    # Organize files
    shutil.move(prot_id + ".spd33", feature_out_prefix + ".spd33")
    for leftover in glob.glob(prot_id + "*"):
        os.remove(leftover)
    os.remove("tlist")
    if only_sse is True:
        shutil.copy(feature_out_prefix + ".spd33", outname)
def __hhblits_aln(fasta_file, aln_file, modified_aln_file, config, cpu):
    ''' Generate the alignments used by DeepConPred2 and DeepCov.

    HHBlits is run with the options of the "alignment-deepcov-deepconpred"
    configuration section and writes an a3m alignment to ``aln_file``. The
    a3m file is then converted to the psicov format (header lines dropped,
    lower-case letters removed, identical rows collapsed) and stored in
    ``modified_aln_file``; this is the format accepted by DeepCov.

    DeepContact is not covered here: it already contains its own MSA step,
    run with different parameters, so its pipeline currently takes a fasta
    file instead of an alignment. Finding one optimal set of MSA parameters
    for all predictors would save time and is left for the future.
    '''
    section = "alignment-deepcov-deepconpred"
    hhblits_args = [
        config.get(section, "command"),
        fasta_file,
        config.get(section, "database"),
        aln_file,
        config.get(section, "niterations"),
        config.get(section, "diff"),
        config.get(section, "cov"),
        cpu,
    ]
    command = "{} -i {} -d {} -oa3m {} -n {} -diff {} -cov {} -cpu {}".format(
        *hhblits_args)
    __run_command("HHBlits: generate MSA", command, separator="#" * 60)

    # a3m -> psicov conversion, removing identical rows
    to_psicov = "egrep -v '^>' {} | sed 's/[a-z]//g' | sort -u > {}".format(
        aln_file, modified_aln_file)
    os.system(to_psicov)
def __manage_sse(sse_dir,script,ids,actions,methods,cpu):
    ''' Make sure ``sse/<prot>.sse`` exists for every protein in ``ids``.

    Three sources are tried, in this order:

    1. ``sse_dir`` is given: the user files ``<sse_dir>/<prot>.sse`` are
       copied (unless they already are the target files).
    2. "contacts" is among ``actions`` and "deepconpred" among ``methods``:
       the ``.spd33`` files found in the deepconpred feature folders are
       copied.
    3. Otherwise ``script`` is run with ``-only_sse`` for each protein and
       its ``.spd33`` output is renamed to ``.sse``.
    '''
    # Create sse folder
    if not os.path.exists("sse"):
        os.makedirs("sse")

    # Case 1: files supplied by the user
    if sse_dir is not None:
        for prot in ids:
            provided = sse_dir+"/"+prot+".sse"
            target = "sse/"+prot+".sse"
            if os.path.abspath(provided) != os.path.abspath(target):
                shutil.copy(provided, target)
        return

    # Case 2: reuse the files found in the deepconpred feature folders
    if "contacts" in actions and "deepconpred" in methods:
        for prot in ids:
            predicted = "contacts/deepconpred/"+prot+"_deepconpred_features/"+prot+".spd33"
            shutil.copy(predicted, "sse/"+prot+".sse")
        return

    # Case 3: predict them now
    for prot in ids:
        banner = "Predicting SSEs with Spider3 for " + prot
        command = "python {} seq/{}.fa sse/ -cpu {} -only_sse".format(script, prot, cpu)
        __run_command(banner, command, separator="=" * 60)
        os.rename("sse/"+prot+".spd33", "sse/"+prot+".sse")
def __deepconpred(deepconpred_dir, fasta_file, prot_id, feature_out_prefix, prediction_map):
    ''' Refine the contact map with DeepConPred2.

    The working directory is switched to ``deepconpred_dir`` (and stays
    there). The fasta file and the ``.PSSM``, ``.spd33`` and ``.ccmpred``
    features are copied into ``<deepconpred_dir>/data`` under ``prot_id``,
    ``DeepConPred2.py`` is run, and ``result/<prot_id>.contactP`` is moved to
    ``prediction_map``. Finally every ``data/<prot_id>*`` file is removed.
    '''
    # DeepConPred2 works in the directory where the program is installed
    os.chdir(deepconpred_dir)
    data_dir = "{}/data".format(deepconpred_dir)
    result_dir = "{}/result".format(deepconpred_dir)

    # Stage the inputs in the data directory
    staged = "{}/{}".format(data_dir, prot_id)
    shutil.copy(fasta_file, staged)
    for extension in (".PSSM", ".spd33", ".ccmpred"):
        shutil.copy(feature_out_prefix + extension, staged + extension)

    # Run DeepConPred2
    __run_command(
        "DeepConPred2: refinement of contact map. It is executed from DeepConPred2 directory",
        "python DeepConPred2.py {}".format(prot_id),
        separator="=" * 60)

    # Collect the result and clean up the staged files
    shutil.move(result_dir + "/" + prot_id + ".contactP", prediction_map)
    for leftover in glob.glob(data_dir + "/" + prot_id + "*"):
        os.remove(leftover)
def __compute_rf(scripts,paths,config):
    ''' Calculate RF score.

    Runs the bash script ``scripts["rf"]`` with ``paths["fasta_name"]`` and
    the "method", "mode" and "mexp" options of the "tree3d" configuration
    section as arguments.
    '''
    rf_script = scripts["rf"]
    fasta_name = paths["fasta_name"]
    method, mode, mexp = [config.get("tree3d", option)
                          for option in ("method", "mode", "mexp")]
    command = "bash {} {} {} {} {}".format(rf_script, fasta_name, method, mode, mexp)
    __run_command("Calculate RF score", command, "=" * 60)
def booster_tbe(treelist, outf):
    ''' Run booster on every ordered pair of trees.

    ``treelist`` maps tree names to tree file paths. For each pair
    (including a tree paired with itself) booster is called with the first
    tree as ``-i`` and the second as ``-b``; the output goes to
    ``<outf>/<name1>_<name2>.nwk``. Pairs whose output file already exists
    are skipped.
    '''
    import os
    from other.run_command import __run_command

    for first_name, first_path in treelist.items():
        for second_name, second_path in treelist.items():
            out = "{}/{}_{}.nwk".format(outf, first_name, second_name)
            if os.path.isfile(out):
                # Result already on disk, nothing to do for this pair
                continue
            __run_command(
                " ",
                "booster -i {} -b {} -o {}".format(first_path, second_path, out),
                " ")
def __run_xplor(ID, seq_file, tbl, tbl_sse, out_dir, bin_dir, args, config):
    ''' Run simulated annealing using Xplor-NIH.

    The Xplor-NIH executable is taken from the "xplor" section of the
    configuration. It runs ``<bin_dir>/simulations/annealing.py`` with
    ``args.cpu`` as the ``-smp`` value and ``<ID>.md1.out`` as the ``-o``
    value; the sequence file, both restraint tables, the Xplor mode, the
    output directory and the model counts follow as positional arguments.
    '''
    annealing_out = ID + ".md1.out"
    script = bin_dir + "/simulations/annealing.py"
    xplor_args = (
        config.get("xplor", "command"),
        args.cpu,
        annealing_out,
        script,
        seq_file,
        tbl,
        tbl_sse,
        args.xplor_mode,
        out_dir,
        args.xplor_nmodels,
        args.xplor_topavg,
    )
    command = "{} -smp {} -py -o {} {} {} {} {} {} {} {} {}".format(*xplor_args)
    __run_command("Running Xplor-NIH", command, " ")
def __hhblits(fasta_file, feature_out_prefix, config, cpu):
    ''' Generate an HMM profile with HHBlits.

    The profile is written to ``<feature_out_prefix>.hhm``; Spider3 uses it
    for the prediction of secondary structure. Executable, database and the
    ``-v``, ``-maxres`` and ``-Z`` values come from the "hhblits-deepconpred"
    configuration section.
    '''
    section = "hhblits-deepconpred"
    hhblits_args = (
        config.get(section, "command"),
        fasta_file,
        feature_out_prefix,
        config.get(section, "database"),
        config.get(section, "v"),
        config.get(section, "maxres"),
        cpu,
        config.get(section, "z"),
    )
    command = "{} -i {} -ohhm {}.hhm -d {} -v{} -maxres {} -cpu {} -Z {}".format(
        *hhblits_args)
    __run_command("HHBlits: generation of HMM profile", command, separator="=" * 60)
def __ccmpred(feature_out_prefix, aln_file, config, cpu):
    ''' Predict residue contacts with CCMPred (plmDCA).

    ``aln_file`` (a3m) is first converted to the psicov format accepted by
    CCMPred and saved as ``<feature_out_prefix>.psicov``: header lines are
    dropped, lower-case letters removed and identical rows collapsed.
    CCMPred then writes ``<feature_out_prefix>.ccmpred``, with ``cpu`` passed
    as its ``-t`` value.
    '''
    psicov_file = "{}.psicov".format(feature_out_prefix)
    conversion = "egrep -v '^>' {} | sed 's/[a-z]//g' | sort -u > {}".format(
        aln_file, psicov_file)
    os.system(conversion)

    # Predict contacts
    ccmpred = config.get("ccmpred-deepconpred", "command")
    command = "{} {} {}.ccmpred -t {}".format(
        ccmpred, psicov_file, feature_out_prefix, cpu)
    __run_command(
        "CCMPred: prediction of residue contacts based on plmDCA",
        command,
        separator="=" * 60)
def __compute_trees(config,paths,scripts,cpu,ids):
    ''' Compute structure-based trees.

    ``paths["str_list"]`` is read as a space-separated list with two columns,
    folder and name. One tree is computed per listed set of structures plus
    one for the experimental structures in ``<out_dir>/pdb_matched`` (a
    listed entry named "experimental" is ignored). Blank lines and repeated
    spaces in the list are tolerated.

    For each set the pdb files of ``ids`` are gathered in
    ``<out_dir>/trees/<name>`` (copied for the experimental set, passed
    through ``scripts["reformat_pdb"]`` otherwise) and the tree script is run
    on that folder. The expected tree files are recorded in
    ``<out_dir>/trees/tree_list`` (path and name columns, experimental
    first), the path of that list is stored in ``paths["tree_list"]``, and
    ``__compute_rf`` is called last.
    '''
    # Read list of structures into a dict {name : path/to/folder}
    strdic = {}
    with open(paths["str_list"]) as f:
        for line in f:
            # Drop empty fields so blank lines and double spaces do not
            # shift the columns or raise IndexError
            fields = [field for field in line.strip().split(" ") if field]
            if not fields:
                continue
            if fields[1] != "experimental":
                strdic[fields[1]] = fields[0]

    # Experimental structures
    strdic["experimental"] = paths["out_dir"] + "/pdb_matched"

    # List of trees with two columns: 1) path to tree.nwk and 2) tree name
    if not os.path.exists("trees"):
        os.makedirs("trees")
    paths["tree_list"] = paths["out_dir"] + "/trees/tree_list"
    msa, mode, mexp = "tmalign", 4, 2
    treename = paths["fasta_name"] + "_unw_d4_" + msa + str(mode) + "-" + str(mexp) + ".mat_1.txt.nwk"

    # The list must be closed before __compute_rf runs; the with-block also
    # closes it if any step below raises
    with open(paths["tree_list"], "wt") as tree_list:
        tree_list.write(paths["out_dir"] + "/trees/experimental/out/tree3d/" + treename + " experimental\n")

        # Compute 3D-tree for every set of structures
        for name, orifolder in strdic.items():
            out_dir = paths["out_dir"] + "/trees/" + name
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)

            for prot in ids:
                if name == "experimental":
                    shutil.copy(orifolder+"/"+prot+".pdb", out_dir+"/"+prot+".pdb")
                else:
                    # Reformat pdb files
                    prediction = orifolder+"/"+prot+".pdb"
                    experimental = "pdb_matched/" + prot + ".pdb"
                    out = out_dir+"/"+prot+".pdb"
                    os.system("bash {} {} {} {}".format(scripts["reformat_pdb"], prediction, experimental, out))

            description_prog = "Compute trees for " + name
            command = "source {} tree && python {} {} {} {} {}".format(
                scripts["environment"], scripts["trees"], paths["fastauniprot"],
                paths["ref_file"], out_dir, cpu)
            __run_command(description_prog, command, "=" * 60)

            # The experimental entry was already written as the first line
            if name != "experimental":
                tree_list.write(paths["out_dir"] + "/trees/" + name + "/out/tree3d/" + treename + " " + name + "\n")

    __compute_rf(scripts, paths, config)
def calc_rf(treelist, outf):
    ''' Determine RF score.

    The content of every tree file in ``treelist`` (a mapping whose values
    are file paths) is concatenated into ``<outf>/trees.nwk``, one line per
    input line. The ``rf.R`` script located next to this module is then run
    with ``<outf>/treelist``, ``<outf>/trees.nwk``, ``<outf>/rf.txt`` and
    ``<outf>/rf.xlsx`` as arguments.
    '''
    import os
    from other.run_command import __run_command

    # Write trees.nwk. The file has to be closed (and therefore flushed)
    # before Rscript is started, otherwise R may read an incomplete file.
    with open(outf + "/trees.nwk", "w") as trees:
        for path in treelist.values():
            with open(path) as f:
                for line in f:
                    # Normalise the line ending to exactly one newline
                    trees.write(line.strip("\n") + "\n")

    # Calculate RF --> rf.txt
    script = os.path.dirname(os.path.abspath(__file__)) + "/rf.R"
    command = "Rscript {} {}/treelist {}/trees.nwk {}/rf.txt {}/rf.xlsx".format(
        script, outf, outf, outf, outf)
    __run_command(" ", command, " ")
def __psiblast(fasta_file, feature_out_prefix, config, cpu):
    ''' Create a position-specific scoring matrix (PSSM) with PSI-Blast.

    PSI-Blast writes ``<feature_out_prefix>.bla`` and
    ``<feature_out_prefix>.pssm``. The matrix is used by Spider3 for the
    prediction of secondary structure and, after being rewritten by
    ``__modify_pssm`` into ``<feature_out_prefix>.PSSM``, by DeepConPred2
    for the prediction of contacts.
    '''
    section = "psiblast-deepconpred"
    psiblast_args = (
        config.get(section, "command"),
        config.get(section, "database"),
        config.get(section, "niterations"),
        config.get(section, "nalignments"),
        cpu,
        fasta_file,
        feature_out_prefix,
        feature_out_prefix,
    )
    command = "{} -db {} -num_iterations {} -num_alignments {} -num_threads {} -query {} -out {}.bla -out_ascii_pssm {}.pssm".format(
        *psiblast_args)
    __run_command("PSI-Blast: generation of PSSM", command, separator="=" * 60)

    # Transform PSSM format to be used in DeepConPred2
    __modify_pssm(
        "{}.pssm".format(feature_out_prefix),
        "{}.PSSM".format(feature_out_prefix))
def __deepcontact():
    ''' Run the three DeepContact steps one after the other: generation of
    input files, extraction of features from those files, and prediction of
    contacts.

    Script paths, yaml files and input/output paths are not parameters; they
    are taken from the enclosing scope.
    '''
    def run_stage(description, command):
        # All three stages are reported with a single blank as separator
        __run_command(description, command, separator=" ")

    # Step 1: generation of input files
    run_stage(
        "Generation of input files",
        "python {} {} {} {}".format(run_pipeline, default_yaml, fasta_file, feature_dir))

    # Step 2: feature extraction. The original script does not need ProtID;
    # the script is changed to manipulate input filenames.
    run_stage(
        "Extract features from input files",
        "python {} {} {} {} {}".format(
            feature_generation, feature_yaml, feature_dir, prot_id, feature_pkl))

    # Step 3: prediction of residue contacts
    run_stage(
        "Prediction of protein residue contacts",
        "python {} {} {}".format(main_script, feature_pkl, prediction_pkl))
def __compute_structures(config,paths,scripts,args,ids,cids):
    ''' Compute 3D structures through simulated annealing constrained by
    contact restraints and dihedral angle restraints.

    Steps:

    1. If "contacts" is among ``args.actions``, write
       ``<out_dir>/contacts/con_list`` with one line per method in
       ``args.method``: contacts folder, run name and score threshold.
       Otherwise ``paths["con_list"]`` must already point to such a list.
    2. If ``args.cscore`` is set, filter contacts by conservation score.
    3. Prepare the SSE files.
    4. Run the Xplor-NIH script ``args.xplor_nexp`` times for every entry of
       the contact list and every protein in ``ids`` that has a contact
       file, and copy ``annealing_ave.pdb`` to
       ``simulations/<name>/n<k>/<prot>.pdb``.
    5. If "trees" is among ``args.actions``, record every
       ``simulations/<name>/n<k>`` folder in ``<out_dir>/simulations/str_list``.

    Raises:
        ValueError: if ``args.method`` contains a method other than
            "deepcov", "deepconpred" or "deepcontact".
    '''
    # Attribute of args holding the score threshold of each method. The
    # attribute is looked up lazily so unused thresholds are never touched.
    score_attribute = {
        "deepcov": "pdeepcov",
        "deepconpred": "pdeepconpred",
        "deepcontact": "pdeepcontact",
    }

    # List of contacts
    if "contacts" in args.actions:
        paths["con_list"] = paths["out_dir"] + "/contacts/con_list"
        with open(paths["con_list"], "wt") as f:
            for method in args.method:
                if method not in score_attribute:
                    # Fail loudly instead of reusing the previous method's
                    # threshold (or hitting an unbound variable)
                    raise ValueError("Unknown contact prediction method: " + str(method))
                score = getattr(args, score_attribute[method])
                name = method + "_score" + str(score).replace(".", "-") + "_seqsep" + str(args.seqsep)
                if args.cscore is not None:
                    name += "_cscore" + str(args.cscore).replace(".", "-")
                f.write(paths["out_dir"] + "/contacts/" + method + " " + name + " " + str(score) + "\n")

    # Filter by conservation score
    if args.cscore is not None:
        __apply_cscore(args, paths, scripts, ids, cids)

    # Manage SSE files
    __manage_sse(paths["sse_dir"], scripts["contacts_sse"], ids, args.actions, args.method, args.cpu)

    # List of structures
    str_list = None
    if "trees" in args.actions:
        paths["str_list"] = paths["out_dir"] + "/simulations/str_list"
        str_list = open(paths["str_list"], "wt")

    try:
        # Run Xplor-NIH n times for each protein & method
        nexperiment = int(args.xplor_nexp)
        for n in range(1, nexperiment + 1):

            # Read contacts folder, name, threshold {name : (path/to/folder, threshold)}
            cons = {}
            with open(paths["con_list"]) as f:
                for line in f:
                    # A list (not filter()) so that the emptiness check also
                    # works on Python 3, where filter() returns an iterator
                    fields = [field for field in line.strip().split(" ") if field]
                    if not fields:
                        continue
                    cons[fields[1]] = (fields[0], fields[2])

            # Run simulations
            for name, (confolder, threshold) in cons.items():
                run_dir = "simulations/" + name + "/n" + str(n)
                for prot in ids:
                    fasta_file = "seq/" + prot + ".fa"
                    con_file = confolder + "/" + prot + ".con"
                    sse_file = "sse/" + prot + ".sse"
                    out_dir = run_dir + "/" + prot + "_models"
                    if not os.path.isfile(con_file):
                        print("Warning: There is no " + con_file)
                        continue
                    if not os.path.exists(out_dir):
                        os.makedirs(out_dir)
                    description_prog = "Run Xplor-NIH for " + prot + " (" + name + ") score>=" + threshold + " seqsep>=" + str(args.seqsep)
                    command = "python {} -fa {} -con {} -sse {} -o {} -cpu {} -score {} -seqsep {} -xplor_mode {} -xplor_nmodels {} -xplor_topavg {} -xplor_betaoo {} -xplor_nexp {}".format(
                        scripts["xplor"], fasta_file,
                        con_file, sse_file, out_dir, args.cpu,
                        threshold, args.seqsep,
                        args.xplor_mode, args.xplor_nmodels, args.xplor_topavg, args.xplor_betaoo, args.xplor_nexp)
                    __run_command(description_prog, command, "=" * 60)
                    shutil.copy(out_dir + "/annealing_ave.pdb", run_dir + "/" + prot + ".pdb")
                    print(command)

                # Write list of structures
                if str_list is not None:
                    str_list.write(paths["out_dir"] + "/simulations/" + name + "/n" + str(n) + " " + name + "_n" + str(n) + "\n")
    finally:
        # Close the structure list even when a simulation step fails
        if str_list is not None:
            str_list.close()
def __apply_cscore(args,paths,scripts,ids,cids):
    ''' Calculate the conservation of the predicted contacts and filter them
    according to the Cscore.

    ``paths["con_list"]`` is read as a space-separated list with three
    columns: contacts folder, name and score threshold. Blank lines and
    repeated spaces are tolerated. Entries named "native" are passed through
    unchanged.

    Steps:

    1. If contacts were not predicted in this run ("contacts" not among
       ``args.actions``), extract native contacts for ``cids``, copy the
       listed ``.con`` files into ``contacts/<name>/`` and build their
       contactbench files.
    2. For every entry, write ``<folder>/cscore/<prot>.contactbench`` for
       each protein in ``cids``, keeping only the lines whose third field is
       >= ``args.seqsep`` and whose fourth field is >= the threshold.
    3. Run ``scripts["cscore"]`` for each protein in ``ids``.
    4. Rewrite ``<out_dir>/contacts/con_list`` so every entry points to its
       ``cscore`` sub-folder, and store that path in ``paths["con_list"]``.
    '''
    def split_fields(line):
        # Drop empty fields so blank lines and double spaces are harmless
        return [field for field in line.strip().split(" ") if field]

    # Copy contacts and create contactbench files
    if "contacts" not in args.actions:
        __extract_native(cids)
        with open(paths["con_list"]) as f:
            for line in f:
                fields = split_fields(line)
                if not fields:
                    continue
                folder, name = fields[0], fields[1]
                if name == "native":
                    continue
                if not os.path.exists("contacts/"+name):
                    os.makedirs("contacts/"+name)
                for prot in cids:
                    pred = folder + "/" + prot + ".con"
                    if os.path.isfile(pred):
                        prediction = "contacts/"+name+"/"+prot+".con"
                        native = "contacts/native/"+prot+".con"
                        contactbench = "contacts/"+name+"/"+prot+".contactbench"
                        shutil.copy(pred, prediction)
                        __rr_to_contactbench(prediction, native, contactbench)

    # Read con_list {name : (path/to/folder, threshold)}
    con_list = {}
    with open(paths["con_list"]) as f:
        for line in f:
            fields = split_fields(line)
            if not fields:
                continue
            name = fields[1]
            p = float(fields[2])
            if "contacts" in args.actions:
                folder = fields[0]
            else:
                # Contacts were copied to the local contacts folder above
                folder = paths["out_dir"] + "/contacts/" + name
            # Create folder
            if not os.path.exists(folder + "/cscore") and name != "native":
                os.makedirs(folder + "/cscore")
            con_list[name] = (folder, p)

    # Calculation of conservation score
    paths["con_list"] = paths["out_dir"] + "/contacts/con_list"
    with open(paths["con_list"], "wt") as w:
        for name, (folder, p) in con_list.items():
            if name == "native":
                w.write(paths["out_dir"] + "/contacts/native native 0\n")
                continue

            # Filter contactbench files
            for prot in cids:
                outprefix = folder + "/cscore/" + prot
                contactbench1 = folder + "/" + prot + ".contactbench"
                contactbench2 = outprefix + ".contactbench"
                if not os.path.isfile(contactbench1):
                    print("Warning: no contact file for " + prot + " " + name +" is provided")
                    print("Warning: The conservation score will be computed without considering the contact predictions of the protein " + prot)
                else:
                    with open(contactbench2, "wt") as cont, open(contactbench1) as f:
                        for line in f:
                            fields = line.strip().split(" ")
                            if int(fields[2]) >= args.seqsep and float(fields[3]) >= p:
                                cont.write(line)

            # Estimate Cscore
            for prot in ids:
                outprefix = folder + "/cscore/" + prot
                contactbench = outprefix + ".contactbench"
                description_prog = "Estimating the conservation of the predicted contacts"
                command = "bash {} {} {} {} {} {} {}".format(
                    scripts["cscore"], paths["c_aln"], contactbench, outprefix,
                    p, args.cscore, args.cweight)
                __run_command(description_prog, command, separator=" ")

            # Rewrite con_list: the entry now points to the filtered contacts
            w.write(folder + "/cscore" + " " + name + " " + str(p) + "\n")