def roc(scores_path, true_path, auroc_path, curve_path): #Read in scores data lines = filelines2deeplist(scores_path, skipComments=True, skipEmptyLines=True) scores = {pair[0]:pair[1] for pair in lines} #Read in true classification data lines = filelines2deeplist(true_path, skipComments=True, skipEmptyLines=True) true = {pair[0]:pair[1] for pair in lines} data = list() for seq, classification in true.items(): if classification == 'weak': class_type = 0 elif classification == 'strong': class_type = 1 else: continue if seq in scores: data.append((class_type, scores[seq])) else: print 'Error: ' + seq + ' not found in results.' import pyroc roc_data = pyroc.ROCData(data) if auroc_path is not None: auroc = roc_data.auc() list2file([auroc], auroc_path) if curve_path is not None: roc_data.plot_and_save(curve_path, 'STATIUM-based Binding Prediction Model')
def main(argv): helpdoc = """usage: wrapper.py quickrun (--in_pdb=A --position_pairs=B --pdb_lib=C --ip_lib=D) [--out=E] [--noverbose] wrapper.py renumber (--in_pdb=A) [--out_pdb=B --chains=C --SRN=1 --SAN=1] [--noverbose] wrapper.py create_res (--in_pdb_orig=A --in_pdb_renum=B) [--out_res=C --position_pairs=D] [--noverbose] wrapper.py preprocess (--in_dir=A) [--out_dir=B --ip_dist_cutoff=C] [--noverbose] [-r] wrapper.py run_statium (--in_res=A --in_pdb=B --pdb_lib=C) [--ip_lib=D --out=E --ip_dist_cutoff=F --matching_res_dist_cutoffs=G --counts] [--noverbose] wrapper.py [-f] energy (--in_res=A | --in_pdb=B) (--in_probs=C --in_seqs=D) [--out=E] [-z | --zscore] [--histogram=E] [--noverbose] wrapper.py random (--seq_length=A --num_seqs=B) [--out=C] [--noverbose] wrapper.py get_orig_seq (--in_res=A --in_pdb_orig=B --in_pdb_renum=C) [--noverbose] wrapper.py calc_top_seqs (--in_res=A --in_probs=B --N=C) [--out=D] [--noverbose] wrapper.py roc (--scores=A --true=B) [--curve=C --auroc=D] [-noverbose] wrapper.py print_merged (--scores=A --true=B) [--out=C] [--noverbose] wrapper.py [-h | --help] Options: --in_pdb=A Input PDB file path --position_pairs=B Positions to include in the binding sequence --pdb_lib=C Input directory of library PDB files --ip_lib=D Input directory of library IP files --out=E Output directory --in_pdb=A Input PDB file path --out_pdb=B Output PDB file path --chains=C Chosen ligand chains --SRN=1 Starting residue number --SAN=1 Starting atom number --in_pdb_orig=A Input PDB file path (original) --in_pdb_renum=B Input PDB file path (renumbered) --out_res=C Output RES file path --position_pairs=D Positions to include in the binding sequence --in_dir=A Directory containing library PDBs --out_dir=B Output directory for JSON objects --ip_dist_cutoff=C Threshold for interacting pair designation --in_res=A Input .res file path --in_pdb=B Input renumbered PDB path --pdb_lib=C Input directory of library PDB files --ip_lib=D Input directory of library IP files --out=E Output directory --ip_dist_cutoff=F Threshold for interacting pair designation --matching_res_dist_cutoffs=G Thresholds for matching IP designation --in_res=A Input .res file path --in_pdb=B Input .pdb file path --in_probs=C STATIUM probabilities file --in_seqs=D Sequence patter or path to a file of sequence patterns to be scored --out=E File path to output score (if -f flag is present) --histogram=F File path to output histogram (absence outputs nothing) --seq_length=A Length of the random sequences --num_seqs=B Number of random sequences --out=C Output file path --in_res=A Input .res file path --in_pdb_orig=B Input PDB file path (original) --in_pdb_renum-C Input PDB file path (renumbered) --in_res=A --in_probs=B STATIUM output file --N=C Number of sequences to be found --out=D Output file path --scores=A Sequences w/ energy file path --true=B Sequences' true binding classification file path --auroc=C File path to output auroc --curve=D File path to output ROC curve --scores=A Sequences w/ energy file path --true=B Sequences' true binding classification file path --out=C Output file path """ options = docopt(helpdoc, argv, help = True, version = "3.0.0", options_first=False) verbose = not options['--noverbose'] if options['quickrun']: in_pdb = options['--in_pdb'] stem = in_pdb[:-4] renum_pdb = stem + '_renumbered.pdb' res = stem + '.res' pdb_lib = options['--pdb_lib'] ip_lib = options['--ip_lib'] out_dir = options['--out'] if options['--out'] is not None else stem positions = parse_position_pairs(options['--position_pairs']) chains = [term[0] for term in positions] default_match_dist = {'A':2, 'C':6, 'D':6, 'E':6, 'F':6, 'G':2, 'H':6, 'I':6, 'K':6, 'L':6, 'M':6, 'N':6, 'P':6, 'Q':6, 'R':6, 'S':6, 'T':6, 'V':6, 'W':6, 'Y':6, 'X':0} ip_dist = 6.0 if verbose: print "Renumbering PDB file: " + in_pdb renumber(1, 1, chains, in_pdb, renum_pdb) if verbose: print "Creating .res file using: " + in_pdb + " and " + renum_pdb create_res(in_pdb, renum_pdb, res, positions) if verbose: print "Running STATIUM with: " + renum_pdb + " " + res + " " + pdb_lib + ' and IP lib: ' + ip_lib statium(res, renum_pdb, pdb_lib, ip_lib, out_dir, ip_dist, default_match_dist, False, verbose) if verbose: print 'Done' elif options['renumber']: in_pdb = options['--in_pdb'] out_pdb = options['--out_pdb'] if options['--out_pdb'] is not None else in_pdb[:-4]+'_renumbered.pdb' SRN = 1 if options['--SRN'] == None else int(options['--SRN']) SAN = 1 if options['--SAN'] == None else int(options['--SAN']) chains = {'B'} if options['--chains'] == None else set(options['--chains'].split(',')) if verbose: print "Renumbering PDB file: " + in_pdb renumber(SRN, SAN, chains, in_pdb, out_pdb) if verbose: print "Done. Renumbered file: " + out_pdb elif options['create_res']: pdb_orig = options['--in_pdb_orig'] pdb_renum = options['--in_pdb_renum'] res = pdb_orig[:-4]+'.res' if options['--out_res'] is None else options['--out_res'] position_pairs = options['--position_pairs'] if options['--position_pairs'] else 'B' if verbose: print "Creating .res file using: " + pdb_orig + " and " + pdb_renum create_res(pdb_orig, pdb_renum, res, position_pairs) if verbose: print "Done. .res file: " + res elif options['preprocess']: in_dir = options['--in_dir'] out_dir = options['--out_dir'] if options['--out_dir'] else in_dir + '_JSON_preprocessed' ip_dist = float(options['--ip_dist_cutoff']) if options['--ip_dist_cutoff'] is not None else 5.0 restart = options['-r'] if verbose: print 'Preprocessing library: %s' % in_dir preprocess(in_dir, out_dir, ip_dist, restart, verbose) if verbose: print 'Done: %s' % out_dir elif options['run_statium']: res = options['--in_res'] pdb = options['--in_pdb'] pdb_lib = options['--pdb_lib'] ip_lib = options['--ip_lib'] out = options['--out'] if options['--out'] is not None else res[:-4] + '.out' ip_dist = float(options['--ip_dist_cutoff']) if options['--ip_dist_cutoff'] is not None else 6.0 default = {'A':2, 'C':6, 'D':6, 'E':6, 'F':6, 'G':2, 'H':6, 'I':6, 'K':6, 'L':6, 'M':6, 'N':6, 'P':6, 'Q':6, 'R':6, 'S':6, 'T':6, 'V':6, 'W':6, 'Y':6, 'X':0} match_dist = ast.literal_eval(options['--matching_res_dist_cutoffs']) if options['--matching_res_dist_cutoffs'] else default count = options['--counts'] if verbose: print "\nRunning STATIUM with: " + pdb + " " + res + " " + pdb_lib + ' and IP lib: ' + str(ip_lib) statium(res, pdb, pdb_lib, ip_lib, out, ip_dist, match_dist, count, verbose) if verbose: print "Done. STATIUM probabilities in output directory: " + out_dir elif options['energy']: zscores = options['-z'] or options['--zscore'] histogram = options['--histogram'] in_res = options['--in_res'] in_probs = options['--in_probs'] isfile = options['-f'] in_seqs = options['--in_seqs'] outfile = options['--out'] if verbose: print "Writing to file: " + str(isfile) + ". Calculating z-score: " + str(zscores) if zscores: if verbose: print 'Generating random distribution of energies...' distribution = generate_random_distribution(in_res, in_probs) if histogram: if verbose: print 'Drawing histogram...' import matplotlib.pyplot as plt import numpy as np hist, bins = np.histogram(distribution[1], bins=50) width = 0.7 * (bins[1] - bins[0]) center = (bins[:-1] + bins[1:]) / 2 plt.bar(center, hist, align='center', width=width) plt.savefig(histogram) if verbose: print 'Done generating random distribution.' if isfile: if verbose: print 'Reading sequences from file...' lines = filelines2list(in_seqs) out_lines = [] for line in lines: if line != '' and line[0] != '#': seq = line.strip() energy = calc_seq_energy(in_res, in_probs, seq) out = seq + "\t" + str(energy) if(zscores): out += "\t" + str(calc_seq_zscore(distribution[2], distribution[3], energy)) out_lines.append(out) list2file(out_lines, outfile) print('Done.') else: energy = calc_seq_energy(in_res, in_probs, in_seqs) print("Sequence energy for " + in_seqs + " is: " + str(energy)) if(zscores): zscore = calc_seq_zscore(distribution[2], distribution[3], energy) print('Z-score is ' + str(zscore)) #Get the original AA sequence of chain B, along with stats like the length and position of that chain elif options['get_orig_seq']: res = options['--in_res'] orig_pdb = options['--in_pdb_orig'] renum_pdb = options['--in_pdb_renum'] get_orig_seq(res, orig_pdb, renum_pdb) elif options['random']: num_seqs = int(options['--num_seqs']) seq_length = int(options['--seq_length']) out = options['--out'] if verbose: print("Generating " + str(num_seqs) + " random sequences of length " + str(seq_length)) sequences = [generate_random_seq(seq_length) for _ in range(num_seqs)] if out is None: for sequence in sequences: print(sequence) else: list2file(sequences, out) if(verbose): print("Random sequences written to " + out) elif options['calc_top_seqs']: in_res = options['--in_res'] probs_dir = options['--in_probs'] N = int(options['--N']) out = options['--out'] if verbose: print 'Calculating ' + str(N) + ' sequences with lowest energy.' results = calc_top_seqs(in_res, in_probs, N) if out: to_print = [seq + '\t' + str(energy) + '\n' for seq, energy in results] list2file(to_print, out) else: print results elif options['roc']: scores = options['--scores'] true = options['--true'] auroc = options['--auroc'] curve = options['--curve'] if verbose: print 'Calculating AUROC for ' + scores + ' with true classifications in ' + true roc(scores, true, auroc, curve) if verbose: print 'Done. Printed AUROC to: ' + str(auroc) + ' and ROC curve to ' + str(curve) elif options['print_merged']: scores = options['--scores'] true = options['--true'] out = options['--out'] if options['--out'] is not None else 'merged.txt' print_merged(scores, true, out) print 'Done. Printed to ' + out