import bz2
import uuid
import cPickle  ## this module targets python 2; use pickle on python 3
from itertools import chain

## ParameterGrid is assumed to come from scikit-learn (sklearn.grid_search in
## releases contemporary with this code, sklearn.model_selection in current
## ones). pg (the cluster job dispatcher), compressed_pickle, data_loader,
## MultitaskShuffleSplitThreeWay and the helper functions (data_process_depot,
## predict_site_region, reduce_pred_score, ...) are project-local modules.


def calculate_pred_score(svm_file, org, example_type="pos", signal="tss",
                         data_path="SRA-rnaseq"):
    """ calculate the svm prediction score around the true signal site """

    local = False  ## switch between local and compute cluster

    ## cluster compute options
    cluster_resource = {'pvmem': '8gb', 'pmem': '8gb', 'mem': '8gb',
                        'vmem': '8gb', 'ppn': '1', 'nodes': '1',
                        'walltime': '24:00:00'}
    #cluster_resource = {'mem':'6000', 'nodes':'1', 'walltime':'08:00'}

    num_seq_ex = 10  ## number of sequences packed into a single job
    center_offset = 500  ## width of the nearby region FIXME currently unused here

    args_req_list = data_process_depot(svm_file, org, example_type, signal,
                                       data_path, num_seq_ex)

    ## job dispatching
    intm_ret = pg.pg_map(predict_site_region, args_req_list,
                         param=cluster_resource, local=local,
                         maxNumThreads=1, mem="8gb")
    print("Done with calculating the score for the center region of example sequences")

    pred_out_val = reduce_pred_score(intm_ret)
    print("Done with collecting scores from different workers")

    ## save the scores
    fname = "%s_%s_ex_pred_score_%s" % (signal, example_type, uuid.uuid1())
    compressed_pickle.save(fname, pred_out_val)
    print("saving the scores in file %s" % fname)
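## Usage sketch for calculate_pred_score; the model file name and organism
## identifier below are hypothetical placeholders, not project defaults.
def _example_calculate_pred_score():
    calculate_pred_score("tss_svm_model.pkl", "A_thaliana",  ## hypothetical inputs
                         example_type="pos", signal="tss",
                         data_path="SRA-rnaseq")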
def setup_splits(signal, method_name, method, param, num_folds, test_size,
                 random_state):
    """ split the example data into train/dev/test groups """

    data = data_loader.load_all(signal)
    sizes = dict((org, len(data[org]["labels"])) for org in data.keys())

    ## set up the splitting strategy; dev and test each take test_size
    kf = MultitaskShuffleSplitThreeWay(sizes, n_iter=num_folds, indices=True,
                                       test_size=test_size * 2,
                                       random_state=random_state)
    param_grid = list(ParameterGrid(param))

    ## one job per (fold, grid point) pair
    argument_list = []
    for fold_idx, (train_idx, dev_idx, test_idx) in enumerate(kf):
        for grid_idx, grid_point in enumerate(param_grid):
            arg = [signal, method, fold_idx, train_idx, dev_idx, test_idx,
                   grid_idx, grid_point]
            argument_list.append(arg)

    local = False  ## switch between local and compute cluster

    ## cluster compute options (distinct from the hyperparameter grid `param`)
    cluster_resource = {'vmem': '4gb', 'pvmem': '4gb', 'pmem': '4gb',
                        'mem': '4gb', 'ppn': '1', 'nodes': '1',
                        'walltime': '2:00:00'}

    intermediate_ret = pg.pg_map(compute_core, argument_list,
                                 param=cluster_resource, local=local,
                                 maxNumThreads=1, mem="4gb")
    print("DONE with computation")

    flat_intermediate = list(chain.from_iterable(intermediate_ret))
    perf_dev, perf_test = reduce_result(flat_intermediate)
    print("DONE reducing")

    return perf_dev, perf_test
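## Usage sketch for setup_splits; the method identifier, hyperparameter grid
## and split settings below are illustrative assumptions, not project defaults.
def _example_setup_splits():
    param = {"cost": [0.1, 1.0, 10.0]}  ## hypothetical svm cost grid
    perf_dev, perf_test = setup_splits("tss", "individual", "svm", param,
                                       num_folds=5, test_size=0.1,
                                       random_state=42)
    return perf_dev, perf_test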
def shift_signal_position(svm_file, org, example_type="pos", signal="tss",
                          data_path="SRA-rnaseq"):
    """ manually look at the positions around the original signal position """

    local = False  ## switch between local and compute cluster

    ## cluster compute options
    cluster_resource = {'pvmem': '4gb', 'pmem': '4gb', 'mem': '4gb',
                        'vmem': '4gb', 'ppn': '1', 'nodes': '1',
                        'walltime': '24:00:00'}

    num_seq_ex = 2  ## number of sequences packed into a single job

    args_req_list = data_process_depot(svm_file, org, example_type, signal,
                                       data_path, num_seq_ex)

    ## job dispatching
    intm_ret = pg.pg_map(recenter_examples, args_req_list,
                         param=cluster_resource, local=local,
                         maxNumThreads=1, mem="4gb")
    print("Done with trimming example sequences")

    fixed_example_seq = reduce_modified_seq(intm_ret)
    print("Done with collecting the trimmed examples")

    write_fasta_rec(fixed_example_seq, signal, example_type)
    print("Done with writing examples in fasta format")
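## Usage sketch for shift_signal_position; the model file name and organism
## identifier below are hypothetical placeholders.
def _example_shift_signal_position():
    shift_signal_position("tss_svm_model.pkl", "A_thaliana",  ## hypothetical inputs
                          example_type="pos", signal="tss")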
def manual_pos_shift(svm_file, org, signal="tss", data_path="SRA-rnaseq"):
    """ manually look at the positions around the original signal position """

    ## loading data
    data = load_examples_from_fasta(signal, org, data_path)
    assert len(data["examples"]) == len(data["labels"])

    ## unpack the trained model
    fh = bz2.BZ2File(svm_file, "rb")
    model = cPickle.load(fh)
    fh.close()

    ## getting the model information
    center_pos = model.param["center_pos"]
    center_offset = model.param["center_offset"]
    print("model - center pos: %i, center reg: %i" % (center_pos, center_offset))

    start_scan = center_pos - center_offset
    stop_scan = center_pos + center_offset

    cnt = 0
    data_set = []
    argument_list = []
    label_type = -1  ## label_type will be +1/-1

    ## pack the examples whose label differs from label_type into jobs,
    ## to recenter the signal position manually
    for idx, single_example in enumerate(data["examples"]):
        datum = [single_example]
        label_info = data["labels"][idx]
        if label_info != label_type:
            cnt += 1
            data_set.append(datum)
            if cnt % 10 == 0:  ## packing 10 seq into one job
                argument_list.append([start_scan, stop_scan, model, data_set])
                data_set = []
    if data_set:  ## dispatch the remaining partial batch as well
        argument_list.append([start_scan, stop_scan, model, data_set])

    local = False  ## switch between local and compute cluster

    ## cluster compute options
    cluster_resource = {'pvmem': '4gb', 'pmem': '4gb', 'mem': '4gb',
                        'vmem': '4gb', 'ppn': '1', 'nodes': '1',
                        'walltime': '4:00:00'}

    task_type = 0  ## 1 recenter seq, 0 predict score
    if task_type:
        intm_ret = pg.pg_map(predict_and_recenter, argument_list,
                             param=cluster_resource, local=local,
                             maxNumThreads=2, mem="4gb")
        print("Done with computation")

        fixed_example_seq = reduce_modified_seq(intm_ret)
        print("Done reducing the results")

        write_fasta_rec(fixed_example_seq, signal)
    else:
        intm_ret = pg.pg_map(predict_around_region, argument_list,
                             param=cluster_resource, local=local,
                             maxNumThreads=2, mem="4gb")
        print("Done with computation")

        pred_out_val = reduce_pred_score(intm_ret)
        print("Done reducing the results")

        ## save the scores
        fname = "%s_pred_score_%s" % (signal, uuid.uuid1())
        compressed_pickle.save(fname, pred_out_val)
        print("saving the score in file %s" % fname)
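## Minimal driver sketch; manual_pos_shift expects a bzip2-compressed cPickle
## model file, and the file name and organism identifier below are
## hypothetical placeholders.
if __name__ == "__main__":
    manual_pos_shift("tss_svm_model.pkl.bz2", "A_thaliana", signal="tss")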