Exemple #1
0
def __predict_contacts(method, bin_dir, prot_id, fasta_file, aln_file,
                       modified_aln_file, config, cpu):
    ''' Build and launch the contact-prediction command for one protein.

    The shell command first sources ``<bin_dir>/set_envs.sh`` (passing the
    method name as its argument) and then starts the predictor:

    * ``deepcontact``: ``<bin_dir>/contacts/deepcontact.py`` on the fasta file.
    * ``deepconpred``: ``<bin_dir>/contacts/deepconpred.py`` on the fasta file
      and the alignment.
    * any other value: ``deepcov.sh`` from the ``deepcov``/``path`` config
      entry, on the modified alignment, with the configured model and
      receptive field.

    The output path given to the predictor is ``<method>/<prot_id>.con``.
    The command is handed to ``__run_command``; nothing is returned.
    '''

    contact_map = "{}/{}.con".format(method, prot_id)
    banner = "Run {} pipeline".format(method).upper()
    script = "{}/contacts/{}.py".format(bin_dir, method)

    # Every variant starts by activating the environment of the method
    activate = "source {} {} && ".format("{}/set_envs.sh".format(bin_dir),
                                         method)

    if method == "deepcontact":
        launch = "python {} {} {} -cpu {}".format(script, fasta_file,
                                                  contact_map, cpu)
    elif method == "deepconpred":
        launch = "python {} {} {} {} -cpu {}".format(
            script, fasta_file, aln_file, contact_map, cpu)
    else:
        # The config is only consulted when the DeepCov branch is taken
        launch = "bash {} -m {} -r {} -i {} -o {}".format(
            "{}/deepcov.sh".format(config.get("deepcov", "path")),
            config.get("deepcov", "model"),
            config.get("deepcov", "receptive_field"), modified_aln_file,
            contact_map)

    __run_command(banner, activate + launch, separator="#" * 60)
Exemple #2
0
def __compute_contacts(contact_script,methods,cpu,ids):
    ''' Extract native contacts, predict contacts and build contactbench files.

    Steps, in order:
      1. ``__extract_native(ids)``.
      2. For every protein in ``ids``, one call of ``contact_script`` on
         ``seq/<prot>.fa`` with all requested ``methods`` passed after ``-run``.
      3. For every method and protein, ``__rr_to_contactbench`` on
         ``contacts/<method>/<prot>.con`` against ``contacts/native/<prot>.con``.
    '''

    # Step 1: native contacts
    __extract_native(ids)

    # Step 2: the methods are passed as one space-separated argument string
    # (each name is followed by a blank, including the last one)
    run_flags = "".join(name + " " for name in methods)
    for prot_id in ids:
        banner = "".join(["Predicting contacts for ", prot_id, " using ",
                          str(methods), "\n"])
        __run_command(
            banner,
            "python {} seq/{}.fa contacts/ -run {} -cpu {}".format(
                contact_script, prot_id, run_flags, cpu),
            separator="="*60)

    # Step 3: contactbench files, one per (method, protein) pair
    for name in methods:
        for prot_id in ids:
            predicted = "contacts/" + name + "/" + prot_id + ".con"
            reference = "contacts/native/" + prot_id + ".con"
            bench = "contacts/" + name + "/" + prot_id + ".contactbench"
            __rr_to_contactbench(predicted, reference, bench)
Exemple #3
0
def __spider3(fasta_file, prot_id, feature_out_prefix, config, env_file,
              outname, only_sse):
    ''' Run Spider3 on one protein and collect its ``.spd33`` output.

    The process changes its working directory to the Spider3 installation
    (``spider3-deepconpred``/``path`` in the config) and does not change back.
    The fasta file and the ``.pssm``/``.hhm`` files found under
    ``feature_out_prefix`` are copied there under the name ``prot_id``, a
    one-line ``tlist`` file is written, and ``scripts/impute_script.sh`` is
    launched on it after sourcing ``env_file``.

    Afterwards ``<prot_id>.spd33`` is moved to ``<feature_out_prefix>.spd33``,
    the remaining ``<prot_id>*`` files and ``tlist`` are deleted and, when
    ``only_sse`` is exactly ``True``, the result is also copied to ``outname``.
    '''

    # Spider3 is executed from its own directory
    os.chdir(config.get("spider3-deepconpred", "path"))

    # Stage the inputs next to the Spider3 scripts
    shutil.copy(fasta_file, prot_id)
    for ext in (".pssm", ".hhm"):
        shutil.copy(feature_out_prefix + ext, prot_id + ext)

    # The task list holds: sequence file, PSSM file, HHM file
    os.system("echo {} {}.pssm {}.hhm > tlist".format(prot_id, prot_id,
                                                      prot_id))
    __run_command(
        "Spider3: prediction of secondary structure. It is executed from Spider3 directory",
        "source {} spider && bash ./scripts/impute_script.sh tlist".format(
            env_file),
        separator="=" * 60)

    # Keep the prediction, drop everything else that was staged or produced
    result = feature_out_prefix + ".spd33"
    shutil.move(prot_id + ".spd33", result)
    for leftover in glob.glob(prot_id + "*"):
        os.remove(leftover)
    os.remove("tlist")
    if only_sse is True:
        shutil.copy(result, outname)
Exemple #4
0
def __hhblits_aln(fasta_file, aln_file, modified_aln_file, config, cpu):
    ''' Generate the alignments used by DeepConPred2 and DeepCov.

    The search command and its options (database, number of iterations,
    ``-diff`` and ``-cov``) come from the ``alignment-deepcov-deepconpred``
    section of the config; the a3m alignment is written to ``aln_file``.

    ``aln_file`` is then converted into ``modified_aln_file`` by dropping the
    header lines (those starting with ``>``), deleting every lowercase letter
    and keeping one copy of each distinct row.

    DeepContact is not covered here: according to the original author its
    pipeline builds its own MSA (with different parameters) and therefore
    takes a fasta file as input. Finding one parameter set that suits all
    predictors was left as future work.
    '''

    section = "alignment-deepcov-deepconpred"
    search = "{} -i {} -d {} -oa3m {} -n {} -diff {} -cov {} -cpu {}".format(
        config.get(section, "command"), fasta_file,
        config.get(section, "database"), aln_file,
        config.get(section, "niterations"),
        config.get(section, "diff"),
        config.get(section, "cov"), cpu)
    __run_command("HHBlits: generate MSA", search, separator="#" * 60)

    # a3m -> psicov format (identical rows removed); this is the alignment
    # format accepted by DeepCov
    convert = "egrep -v '^>' {} | sed 's/[a-z]//g' | sort -u > {}".format(
        aln_file, modified_aln_file)
    os.system(convert)
Exemple #5
0
def __manage_sse(sse_dir,script,ids,actions,methods,cpu):
    ''' Make sure ``sse/<prot>.sse`` exists for every protein in ``ids``.

    Exactly one of three sources is used:
      * ``sse_dir`` is given: copy ``<sse_dir>/<prot>.sse`` (skipped when
        source and destination are the same file path).
      * contacts were computed with deepconpred: copy the ``.spd33`` file
        from the deepconpred feature folder of each protein.
      * otherwise: run ``script`` with ``-only_sse`` and rename the resulting
        ``sse/<prot>.spd33`` to ``sse/<prot>.sse``.
    '''

    # Output folder
    if not os.path.exists("sse"):
        os.makedirs("sse")

    # Case 1: SSE files provided by the user
    if sse_dir is not None:
        for prot_id in ids:
            provided = sse_dir+"/"+prot_id+".sse"
            target = "sse/"+prot_id+".sse"
            if os.path.abspath(provided) == os.path.abspath(target):
                continue
            shutil.copy(provided,target)
        return

    # Case 2: reuse the Spider3 prediction made for DeepConPred2
    if "contacts" in actions and "deepconpred" in methods:
        for prot_id in ids:
            predicted = "contacts/deepconpred/"+prot_id+"_deepconpred_features/"+prot_id+".spd33"
            target = "sse/"+prot_id+".sse"
            shutil.copy(predicted,target)
        return

    # Case 3: predict the SSEs now
    for prot_id in ids:
        __run_command(
            "Predicting SSEs with Spider3 for " + prot_id,
            "python {} seq/{}.fa sse/ -cpu {} -only_sse".format(script,prot_id,cpu),
            separator="="*60)
        os.rename("sse/"+prot_id+".spd33","sse/"+prot_id+".sse")
Exemple #6
0
def __deepconpred(deepconpred_dir, fasta_file, prot_id, feature_out_prefix,
                  prediction_map):
    ''' Run DeepConPred2 for one protein and collect the predicted map.

    The working directory is changed to ``deepconpred_dir`` (and not changed
    back). The fasta file and the ``.PSSM``, ``.spd33`` and ``.ccmpred``
    files found under ``feature_out_prefix`` are copied into
    ``<deepconpred_dir>/data`` under the name ``prot_id``;
    ``DeepConPred2.py <prot_id>`` is then launched.
    ``<deepconpred_dir>/result/<prot_id>.contactP`` is moved to
    ``prediction_map`` and the staged ``data/<prot_id>*`` files are removed.
    '''

    # DeepConPred2 works in the directory where the program is installed
    os.chdir(deepconpred_dir)

    # Stage the inputs: <data>/<prot_id> plus one file per feature suffix
    staged = "{}/data/{}".format(deepconpred_dir, prot_id)
    shutil.copy(fasta_file, staged)
    for ext in (".PSSM", ".spd33", ".ccmpred"):
        shutil.copy(feature_out_prefix + ext, staged + ext)

    # Launch the predictor
    __run_command(
        "DeepConPred2: refinement of contact map. It is executed from DeepConPred2 directory",
        "python DeepConPred2.py {}".format(prot_id),
        separator="=" * 60)

    # Collect the result and clean the data directory
    shutil.move("{}/result/{}.contactP".format(deepconpred_dir, prot_id),
                prediction_map)
    for leftover in glob.glob(staged + "*"):
        os.remove(leftover)
Exemple #7
0
def __compute_rf(scripts,paths,config):
    ''' Launch the RF-score script.

    Runs ``bash <scripts["rf"]> <paths["fasta_name"]>`` followed by the
    ``method``, ``mode`` and ``mexp`` entries of the ``tree3d`` config section.
    '''
    rf_args = [scripts["rf"], paths["fasta_name"]]
    rf_args.extend(config.get("tree3d", key) for key in ("method", "mode", "mexp"))
    __run_command("Calculate RF score",
                  "bash {} {} {} {} {}".format(*rf_args),
                  "="*60)
Exemple #8
0
def booster_tbe(treelist, outf):
    ''' Run ``booster`` on every ordered pair of trees.

    ``treelist`` maps a tree name to the path of its file. For each pair
    (including a tree paired with itself) the first tree is passed with
    ``-i`` and the second with ``-b``; the output goes to
    ``<outf>/<name1>_<name2>.nwk``. Pairs whose output file already exists
    are skipped.
    '''

    import os
    from other.run_command import __run_command

    pairs = [(first, second)
             for first in treelist.items()
             for second in treelist.items()]

    for (ref_name, ref_path), (boot_name, boot_path) in pairs:
        target = "{}/{}_{}.nwk".format(outf, ref_name, boot_name)
        if os.path.isfile(target):
            continue  # already computed
        __run_command(
            " ",
            "booster -i {} -b {} -o {}".format(ref_path, boot_path, target),
            " ")
def __run_xplor(ID, seq_file, tbl, tbl_sse, out_dir, bin_dir, args, config):
    ''' Launch the Xplor-NIH simulated-annealing script for one protein.

    The executable is taken from the ``xplor``/``command`` config entry and
    runs ``<bin_dir>/simulations/annealing.py`` with ``args.cpu`` as the
    ``-smp`` value and ``<ID>.md1.out`` as the ``-o`` value. The script
    receives, in order: sequence file, ``tbl``, ``tbl_sse``, Xplor mode,
    output directory, number of models and ``args.xplor_topavg``.
    '''

    log_file = ID + ".md1.out"
    annealing = bin_dir + "/simulations/annealing.py"

    fields = [
        config.get("xplor", "command"), args.cpu, log_file, annealing,
        seq_file, tbl, tbl_sse, args.xplor_mode, out_dir,
        args.xplor_nmodels, args.xplor_topavg,
    ]
    __run_command(
        "Running Xplor-NIH",
        "{} -smp {} -py -o {} {} {} {} {} {} {} {} {}".format(*fields),
        " ")
Exemple #10
0
def __hhblits(fasta_file, feature_out_prefix, config, cpu):
    ''' Generate an HMM profile with HHBlits.

    The profile is written to ``<feature_out_prefix>.hhm``. Executable,
    database and the ``-v``, ``-maxres`` and ``-Z`` values are read from the
    ``hhblits-deepconpred`` config section. According to the original
    author, the profile is consumed by Spider3 for secondary-structure
    prediction.
    '''

    def option(key):
        # Shortcut for the config section used by this step
        return config.get("hhblits-deepconpred", key)

    search = "{} -i {} -ohhm {}.hhm -d {} -v{} -maxres {} -cpu {} -Z {}".format(
        option("command"), fasta_file, feature_out_prefix,
        option("database"), option("v"), option("maxres"), cpu,
        option("z"))

    __run_command("HHBlits: generation of HMM profile", search,
                  separator="=" * 60)
Exemple #11
0
def __ccmpred(feature_out_prefix, aln_file, config, cpu):
    ''' Predict residue contacts with CCMPred (plmDCA).

    ``aln_file`` is first converted to ``<feature_out_prefix>.psicov``:
    header lines (starting with ``>``) are dropped, lowercase letters are
    deleted and duplicate rows are removed. The executable from the
    ``ccmpred-deepconpred``/``command`` config entry is then launched on
    that file, writing ``<feature_out_prefix>.ccmpred`` with ``-t <cpu>``.
    '''

    # a3m -> psicov format (identical rows removed); this is the alignment
    # format accepted by CCMPred
    psicov = "{}.psicov".format(feature_out_prefix)
    convert = "egrep -v '^>' {} | sed 's/[a-z]//g' | sort -u > {}".format(
        aln_file, psicov)
    os.system(convert)

    # Contact prediction
    predict = "{} {} {}.ccmpred -t {}".format(
        config.get("ccmpred-deepconpred", "command"), psicov,
        feature_out_prefix, cpu)
    __run_command("CCMPred: prediction of residue contacts based on plmDCA",
                  predict, separator="=" * 60)
Exemple #12
0
def __compute_trees(config,paths,scripts,cpu,ids):
    ''' Compute structure-based trees for every set of structures.

    ``paths["str_list"]`` is read as a list of ``<folder> <name>`` lines; the
    experimental set (``<out_dir>/pdb_matched``) is always added. For each
    set, the pdb files of the proteins in ``ids`` are gathered in
    ``<out_dir>/trees/<name>`` (copied as-is for the experimental set,
    passed through ``scripts["reformat_pdb"]`` otherwise) and
    ``scripts["trees"]`` is launched on that folder.

    A tree list (``<path/to/tree.nwk> <name>`` per line, experimental first)
    is written to ``<out_dir>/trees/tree_list`` and its path is stored in
    ``paths["tree_list"]``. Finally ``__compute_rf`` is called.

    Args:
        config: configuration object, forwarded to ``__compute_rf``.
        paths: dict of paths; reads ``str_list``, ``out_dir``, ``fasta_name``,
            ``fastauniprot`` and ``ref_file``; writes ``tree_list``.
        scripts: dict of script paths; reads ``reformat_pdb``,
            ``environment`` and ``trees`` (and is forwarded to
            ``__compute_rf``).
        cpu: value forwarded to the tree script.
        ids: protein identifiers.
    '''

    # Read list of structures into a dic {name : path/to/folder}
    strdic = {}
    with open(paths["str_list"]) as str_list:
        for line in str_list:
            fields = [tok for tok in line.strip().split(" ") if tok]
            if not fields:
                continue  # tolerate blank lines instead of failing on fields[1]
            if fields[1] != "experimental":
                strdic[fields[1]] = fields[0]

    # Experimental structures
    strdic["experimental"] = paths["out_dir"] + "/pdb_matched"

    # List of trees with two columns: 1) path to tree.nwk  and  2) tree name
    # NOTE(review): the folder is created relative to the working directory
    # while the list is opened under paths["out_dir"]; this presumably relies
    # on the process running from out_dir -- confirm.
    if not os.path.exists("trees"):
        os.makedirs("trees")
    paths["tree_list"] = paths["out_dir"] + "/trees/tree_list"
    msa,mode,mexp = "tmalign",4,2
    treename = paths["fasta_name"] + "_unw_d4_" + msa + str(mode) + "-" + str(mexp) + ".mat_1.txt.nwk"

    # The context manager guarantees the list is closed (and flushed) before
    # __compute_rf runs, and also when one of the steps below raises.
    with open(paths["tree_list"],"wt") as tree_list:
        tree_list.write(paths["out_dir"] + "/trees/experimental/out/tree3d/" + treename + " experimental\n")

        # Compute 3D-tree for every set of structures
        for name, orifolder in strdic.items():
            out_dir = paths["out_dir"] + "/trees/" + name
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            for prot in ids:
                if name == "experimental":
                    shutil.copy(orifolder+"/"+prot+".pdb",out_dir+"/"+prot+".pdb")
                else:
                    # Reformat pdb files
                    prediction = orifolder+"/"+prot+".pdb"
                    experimental = "pdb_matched/" + prot + ".pdb"
                    out = out_dir+"/"+prot+".pdb"
                    os.system("bash {} {} {} {}".format(scripts["reformat_pdb"],prediction,experimental,out))
            description_prog = "Compute trees for " + name
            command = "source {} tree && python {} {} {} {} {}".format(scripts["environment"],scripts["trees"],paths["fastauniprot"],paths["ref_file"],out_dir,cpu)
            __run_command(description_prog,command,"="*60)

            # The experimental entry was already written as the first line
            if name != "experimental":
                tree_list.write(paths["out_dir"] + "/trees/" + name + "/out/tree3d/" + treename + " " + name + "\n")

    __compute_rf(scripts,paths,config)
Exemple #13
0
def calc_rf(treelist, outf):
    ''' Determine RF score.

    All tree files listed in ``treelist`` (a mapping whose values are file
    paths) are concatenated, one line per input line, into
    ``<outf>/trees.nwk``. The ``rf.R`` script located next to this module is
    then launched with ``<outf>/treelist`` and ``<outf>/trees.nwk`` as inputs
    and ``<outf>/rf.txt`` and ``<outf>/rf.xlsx`` as the remaining arguments.

    Args:
        treelist: mapping whose values are paths to tree files.
        outf: output folder; ``<outf>/treelist`` is expected to exist already
            (it is not written here).
    '''

    import os
    from other.run_command import __run_command

    # Write trees.nwk. The file must be closed before the R script is
    # started: otherwise buffered lines may not have reached the disk yet
    # and the script would read a truncated (or empty) file.
    with open(outf + "/trees.nwk", "w") as trees:
        for path in treelist.values():
            with open(path) as f:
                for line in f:
                    # Normalise the line ending so every line ends with one "\n"
                    trees.write(line.strip("\n") + "\n")

    # Calculate RF --> rf.txt
    script = os.path.dirname(os.path.abspath(__file__)) + "/rf.R"
    command = "Rscript {} {}/treelist {}/trees.nwk {}/rf.txt {}/rf.xlsx".format(
        script, outf, outf, outf, outf)
    __run_command(" ", command, " ")
Exemple #14
0
def __psiblast(fasta_file, feature_out_prefix, config, cpu):
    ''' Create a position-specific scoring matrix (PSSM) with PSI-Blast.

    Executable, database, number of iterations and number of alignments are
    read from the ``psiblast-deepconpred`` config section. The search output
    goes to ``<feature_out_prefix>.bla`` and the ASCII PSSM to
    ``<feature_out_prefix>.pssm``; ``__modify_pssm`` then derives
    ``<feature_out_prefix>.PSSM`` from it.

    According to the original author, the ``.pssm`` file is used by Spider3
    and the modified ``.PSSM`` file by DeepConPred2.
    '''

    def option(key):
        # Shortcut for the config section used by this step
        return config.get("psiblast-deepconpred", key)

    search = "{} -db {} -num_iterations {} -num_alignments {} -num_threads {} -query {} -out {}.bla -out_ascii_pssm {}.pssm".format(
        option("command"), option("database"), option("niterations"),
        option("nalignments"), cpu, fasta_file, feature_out_prefix,
        feature_out_prefix)
    __run_command("PSI-Blast: generation of PSSM", search,
                  separator="=" * 60)

    # Transform PSSM format to be used in DeepConPred2
    __modify_pssm("{}.pssm".format(feature_out_prefix),
                  "{}.PSSM".format(feature_out_prefix))
Exemple #15
0
def __deepcontact():
    ''' Run the three DeepContact steps in sequence.

    1. generation of input files, 2. feature extraction, 3. contact
    prediction. The function takes no arguments: script paths, yaml files,
    the fasta file, the feature directory, the protein id and the pickle
    paths are all read from the enclosing scope.
    '''

    # Step 1: generation of input files
    step_cmd = "python {} {} {} {}".format(run_pipeline, default_yaml,
                                           fasta_file, feature_dir)
    __run_command("Generation of input files", step_cmd, separator=" ")

    # Step 2: feature extraction. The original script does not need ProtID;
    # the script is changed to manipulate input filenames.
    step_cmd = "python {} {} {} {} {}".format(
        feature_generation, feature_yaml, feature_dir, prot_id, feature_pkl)
    __run_command("Extract features from input files", step_cmd,
                  separator=" ")

    # Step 3: prediction of residue contacts
    step_cmd = "python {} {} {}".format(main_script, feature_pkl,
                                        prediction_pkl)
    __run_command("Prediction of protein residue contacts", step_cmd,
                  separator=" ")
Exemple #16
0
def __compute_structures(config,paths,scripts,args,ids,cids):
    ''' Compute 3D structures through simulated annealing

    constrained by contact restraints and dihedral angle restraints.

    Steps:
      1. When "contacts" is among ``args.actions``, write the contact list
         ``<out_dir>/contacts/con_list`` with one ``<folder> <name> <score>``
         line per method in ``args.method`` and store its path in
         ``paths["con_list"]``. Otherwise ``paths["con_list"]`` must already
         be set by the caller.
      2. When ``args.cscore`` is set, call ``__apply_cscore``.
      3. Call ``__manage_sse``.
      4. For each of the ``args.xplor_nexp`` experiments, each contact-list
         entry and each protein in ``ids``, launch ``scripts["xplor"]`` and
         copy ``annealing_ave.pdb`` to
         ``simulations/<name>/n<k>/<prot>.pdb``. Proteins without a contact
         file are skipped with a warning.
      5. When "trees" is among ``args.actions``, record every
         ``simulations/<name>/n<k>`` folder in ``<out_dir>/simulations/str_list``
         (path stored in ``paths["str_list"]``).

    Args:
        config: configuration object (not used directly in this function).
        paths: dict of paths; reads ``out_dir``, ``sse_dir`` and
            ``con_list``; may write ``con_list`` and ``str_list``.
        scripts: dict of script paths; reads ``contacts_sse`` and ``xplor``.
        args: parsed command-line options.
        ids: protein identifiers to model.
        cids: protein identifiers forwarded to ``__apply_cscore``.

    Raises:
        ValueError: if ``args.method`` contains a method without a known
            score option.
    '''

    # List of contacts
    if "contacts" in args.actions:
        paths["con_list"] = paths["out_dir"] + "/contacts/con_list"
        # Name of the option holding the score threshold of each method.
        # Unknown methods are rejected explicitly; the former if/elif chain
        # left the score unbound, or silently reused the previous one.
        score_option = {
            "deepcov": "pdeepcov",
            "deepconpred": "pdeepconpred",
            "deepcontact": "pdeepcontact",
        }
        entries = []
        #entries.append(paths["out_dir"] + "/contacts/native native 0\n")   # Reconstruct structures from native contacts
        for method in args.method:
            if method not in score_option:
                raise ValueError("Unknown contact prediction method: " + str(method))
            score = getattr(args, score_option[method])
            name = method + "_score" + str(score).replace(".","-") + "_seqsep" + str(args.seqsep)
            if args.cscore is not None:
                name += "_cscore" + str(args.cscore).replace(".","-")
            entries.append(paths["out_dir"] + "/contacts/" + method + " " + name + " " + str(score) + "\n")
        # All lines are validated before the file is (re)written
        with open(paths["con_list"],"wt") as f:
            f.writelines(entries)


    # Filter by conservation score
    if args.cscore is not None:
        __apply_cscore(args,paths,scripts,ids,cids)


    # Manage SSE files
    __manage_sse(paths["sse_dir"],scripts["contacts_sse"],ids,args.actions,args.method,args.cpu)


    # List of structures (only needed when trees are computed afterwards)
    str_list = None
    if "trees" in args.actions:
        paths["str_list"] = paths["out_dir"] + "/simulations/str_list"
        str_list = open(paths["str_list"],"wt")

    try:
        # Run Xplor-NIH n times for each protein & method
        nexperiment = int(args.xplor_nexp)
        for n in range(1,nexperiment+1):
            # Read contacts folder, name, threshold {name : (path/to/folder, threshold)}
            cons = {}
            with open(paths["con_list"]) as f:
                for line in f:
                    # A list (not a lazy filter object) so that the emptiness
                    # test and the indexing also work on Python 3
                    fields = [tok for tok in line.strip().split(" ") if tok]
                    if not fields:
                        continue
                    folder = fields[0]
                    name = fields[1]
                    threshold = fields[2]
                    cons[name] = (folder,threshold)
            # Run simulations
            for name, (confolder, threshold) in cons.items():
                for prot in ids:
                    fasta_file = "seq/" + prot + ".fa"
                    con_file = confolder + "/" + prot + ".con"
                    sse_file = "sse/" + prot + ".sse"
                    out_dir = "simulations/" + name + "/n" + str(n) + "/" + prot + "_models"
                    if not os.path.isfile(con_file):
                        print("Warning: There is no " + con_file)
                        continue
                    if not os.path.exists(out_dir):
                        os.makedirs(out_dir)
                    description_prog = "Run Xplor-NIH for " + prot + " (" + name + ") score>=" + threshold + " seqsep>=" + str(args.seqsep)
                    command = "python {} -fa {} -con {} -sse {} -o {} -cpu {} -score {} -seqsep {} -xplor_mode {} -xplor_nmodels {} -xplor_topavg {} -xplor_betaoo {} -xplor_nexp {}".format(
                                        scripts["xplor"],fasta_file,
                                        con_file,sse_file,out_dir,args.cpu,
                                        threshold,args.seqsep,
                                        args.xplor_mode,args.xplor_nmodels,args.xplor_topavg,args.xplor_betaoo,args.xplor_nexp)
                    __run_command(description_prog,command,"="*60)
                    a = out_dir + "/annealing_ave.pdb"
                    b = "simulations/" + name + "/n" + str(n) + "/" + prot + ".pdb"
                    shutil.copy(a,b)
                    print(command)

                # Write list of structures
                if str_list is not None:
                    str_list.write(paths["out_dir"] + "/simulations/" + name + "/n" + str(n) + " " + name + "_n" + str(n) + "\n")
    finally:
        # Close the list even when a simulation step raises
        if str_list is not None:
            str_list.close()
Exemple #17
0
def __apply_cscore(args,paths,scripts,ids,cids):
    ''' Calculate the conservation of the predicted contacts and filter them according to the Cscore.

    Steps:
      1. When contacts were not computed in this run ("contacts" not in
         ``args.actions``): extract native contacts for ``cids``, copy every
         available ``<folder>/<prot>.con`` of the contact list into
         ``contacts/<name>/`` and create the matching contactbench files.
      2. Read the contact list (``<folder> <name> <threshold>`` per line)
         and create a ``cscore`` sub-folder for every non-native entry.
      3. For every entry: filter the contactbench file of each protein in
         ``cids`` by sequence separation (``args.seqsep``) and score
         threshold, then launch ``scripts["cscore"]`` for each protein in
         ``ids``.
      4. Rewrite the contact list at ``<out_dir>/contacts/con_list`` so that
         each non-native entry points to its ``cscore`` sub-folder; the new
         path is stored in ``paths["con_list"]``.

    Args:
        args: parsed options; reads ``actions``, ``seqsep``, ``cscore`` and
            ``cweight``.
        paths: dict of paths; reads ``con_list``, ``out_dir`` and ``c_aln``;
            rewrites ``con_list``.
        scripts: dict of script paths; reads ``cscore``.
        ids: proteins for which the conservation score is estimated.
        cids: proteins whose contactbench files are filtered.
    '''

    def _fields(raw_line):
        # Same tolerant parsing as the con_list reader in
        # __compute_structures: repeated blanks are ignored and an empty
        # list signals a blank line.
        return [tok for tok in raw_line.strip().split(" ") if tok]

    # Copy contacts and create contactbench files
    if "contacts" not in args.actions:
        __extract_native(cids)
        with open(paths["con_list"]) as f:
            for line in f:
                fields = _fields(line)
                if not fields:
                    continue
                folder = fields[0]
                name = fields[1]
                if name == "native":
                    continue
                if not os.path.exists("contacts/"+name):
                    os.makedirs("contacts/"+name)
                for prot in cids:
                    pred = folder + "/" + prot + ".con"
                    if os.path.isfile(pred):
                        prediction = "contacts/"+name+"/"+prot+".con"
                        native = "contacts/native/"+prot+".con"
                        contactbench = "contacts/"+name+"/"+prot+".contactbench"
                        shutil.copy(pred,prediction)
                        __rr_to_contactbench(prediction,native,contactbench)

    # Read con_list {name : (path/to/folder, threshold)}
    con_list = {}
    with open(paths["con_list"]) as f:
        for line in f:
            fields = _fields(line)
            if not fields:
                continue
            name = fields[1]
            p = float(fields[2])
            if "contacts" in args.actions:
                folder = fields[0]
            else:
                # The predictions were copied under out_dir in the step above
                folder = paths["out_dir"] + "/contacts/" + name
            # Create folder
            if not os.path.exists(folder + "/cscore") and name != "native":
                os.makedirs(folder + "/cscore")
            con_list[name] = (folder,p)

    # Calculation of conservation score
    # The list is rewritten only after it has been read completely; the
    # context manager closes it even when one of the steps below raises.
    paths["con_list"] = paths["out_dir"] + "/contacts/con_list"
    with open(paths["con_list"],"wt") as w:
        for name, (folder, p) in con_list.items():
            if name == "native":
                w.write(paths["out_dir"] + "/contacts/native native 0\n")
                continue
            # Filter contactbench files
            for prot in cids:
                outprefix = folder + "/cscore/" + prot
                contactbench1 = folder + "/" + prot + ".contactbench"
                contactbench2 = outprefix + ".contactbench"
                if not os.path.isfile(contactbench1):
                    print("Warning: no contact file for " + prot + " " + name +" is provided")
                    print("Warning: The conservation score will be computed without considering the contact predictions of the protein " + prot)
                else:
                    with open(contactbench2,"wt") as cont, open(contactbench1) as f:
                        for line in f:
                            fields = line.strip().split(" ")
                            # Keep contacts whose third column reaches the
                            # sequence separation and whose fourth column
                            # reaches the score threshold
                            if int(fields[2]) >= args.seqsep and float(fields[3]) >= p:
                                cont.write(line)
            # Estimate Cscore
            # NOTE(review): filtering iterates over cids while the estimate
            # iterates over ids -- confirm that this is intended.
            for prot in ids:
                outprefix = folder + "/cscore/" + prot
                contactbench = outprefix + ".contactbench"
                description_prog = "Estimating the conservation of the predicted contacts"
                command = "bash {} {} {} {} {} {} {}".format(scripts["cscore"],paths["c_aln"],contactbench,outprefix,p,args.cscore,args.cweight)
                __run_command(description_prog,command,separator=" ")
            # Rewrite con_list
            w.write(folder + "/cscore" + " " + name + " " + str(p) + "\n")