def calculate_pred_score(svm_file, org, example_type="pos", signal="tss", data_path="SRA-rnaseq"):
    """
    calculate svm prediction score around the true signal site

    @args svm_file: file containing the trained svm model
    @type svm_file: str
    @args org: organism name
    @type org: str
    @args example_type: example label type (default: pos)
    @type example_type: str
    @args signal: genomic signal type (default: tss)
    @type signal: str
    @args data_path: file path for example data points
    @type data_path: str
    """
    local = False ## switch between local and compute cluster

    ## cluster compute options
    cluster_resource = {'pvmem':'8gb', 'pmem':'8gb', 'mem':'8gb', 'vmem':'8gb','ppn':'1', 'nodes':'1', 'walltime':'24:00:00'}
    #cluster_resource = {'mem':'6000', 'nodes':'1', 'walltime':'08:00'}

    num_seq_ex = 10 ## number of sequences are in single job
    ## FIX: removed unused local `center_offset = 500` — it was never read
    ## in this function (the FIXME it carried still applies to the workers).

    ## pack the example sequences into per-job argument lists
    args_req_list = data_process_depot(svm_file, org, example_type, signal, data_path, num_seq_ex)

    ## distribute the scoring jobs
    intm_ret = pg.pg_map(predict_site_region, args_req_list, param=cluster_resource, local=local, maxNumThreads=1, mem="8gb")
    print("Done with calculating the score for center region of example sequences")

    ## collect the scores from the different workers
    pred_out_val = reduce_pred_score(intm_ret)
    print("Done with collecting scores from different workers")

    ## save the scores
    fname = "%s_%s_ex_pred_score_%s" % (signal, example_type, uuid.uuid1())
    compressed_pickle.save(fname, pred_out_val)
    print("saving the scores in file %s" % fname)
def train_combined_wdspeck_svm(org_list_file, signal="tss", data_path="SRA-seq"):
    """
    train a global classifier for multiple organisms

    @args org_list_file: organism name in a text file
    @type org_list_file: str
    @args signal: genomic signal type (default: tss)
    @type signal: str
    @args data_path: file path for training data points
    @type data_path: str
    """
    t0 = time.time()

    ## getting multiple oragnisms
    ORG_LIST = get_orgdb(org_list_file)

    ## loading data
    train_examples = []
    train_labels = []
    for ORG_NAME in ORG_LIST:
        local_data_path = "%s/%s/set_1" % (data_path, ORG_NAME) ## FIXME common data path
        data = load_examples_from_fasta(signal, ORG_NAME, local_data_path)
        ## FIX: explicit validation instead of `assert` (asserts vanish under -O)
        if len(data["examples"]) != len(data["labels"]):
            raise ValueError("examples/labels count mismatch for organism %s" % ORG_NAME)

        ## split the data
        train_examples.extend(data["examples"])
        train_labels.extend(data["labels"])

    ## set parameters TODO
    param = {}
    param["cost"] = 1.0
    param["degree"] = 4
    param["degree_spectrum"] = 4
    param["center_pos"] = 1200
    param["center_offset"] = 50
    param["shifts"] = 32
    param["kernel_cache"] = 10000

    ## invoke training
    svm = ShogunPredictor(param)
    svm.train(train_examples, train_labels)

    ## save the model
    fname = "%s_model_%s" % (signal, uuid.uuid1())
    compressed_pickle.save(fname, svm)
    print("saving the model in file %s" % fname)

    time_taken = time.time() - t0
    print("time taken for the experiment: ", time_taken)

    return fname
def train_combined_wdspeck_svm(org_list_file, signal="tss", data_path="SRA-seq"):
    """
    train a global classifier for multiple organisms

    @args org_list_file: organism name in a text file
    @type org_list_file: str
    @args signal: genomic signal type (default: tss)
    @type signal: str
    @args data_path: file path for training data points
    @type data_path: str
    """
    start_time = time.time()

    ## fetch the list of organisms to combine
    org_names = get_orgdb(org_list_file)

    ## accumulate examples and labels across every organism
    combined_examples = []
    combined_labels = []
    for org_name in org_names:
        org_data_path = "%s/%s/set_1" % (data_path, org_name) ## FIXME common data path
        org_data = load_examples_from_fasta(signal, org_name, org_data_path)
        assert(len(org_data["examples"]) == len(org_data["labels"]))

        combined_examples.extend(org_data["examples"])
        combined_labels.extend(org_data["labels"])

    ## model hyperparameters TODO
    param = {
        "cost": 1.0,
        "degree": 4,
        "degree_spectrum": 4,
        "center_pos": 1200,
        "center_offset": 50,
        "shifts": 32,
        "kernel_cache": 10000,
    }

    ## train the combined classifier
    svm = ShogunPredictor(param)
    svm.train(combined_examples, combined_labels)

    ## persist the trained model under a unique name
    fname = "%s_model_%s" % (signal, uuid.uuid1())
    compressed_pickle.save(fname, svm)
    print("saving the model in file %s" % fname)

    time_taken = time.time() - start_time
    print("time taken for the experiment: ", time_taken)

    return fname
def calculate_pred_score(svm_file, org, example_type="pos", signal="tss", data_path="SRA-rnaseq"):
    """
    calculate svm prediction score around the true signal site

    @args svm_file: file containing the trained svm model
    @type svm_file: str
    @args org: organism name
    @type org: str
    @args example_type: example label type (default: pos)
    @type example_type: str
    @args signal: genomic signal type (default: tss)
    @type signal: str
    @args data_path: file path for example data points
    @type data_path: str
    """
    ## run locally or submit to the compute cluster
    local = False

    ## resource request for each cluster job
    cluster_resource = {
        'pvmem': '8gb',
        'pmem': '8gb',
        'mem': '8gb',
        'vmem': '8gb',
        'ppn': '1',
        'nodes': '1',
        'walltime': '24:00:00',
    }
    #cluster_resource = {'mem':'6000', 'nodes':'1', 'walltime':'08:00'}

    num_seq_ex = 10 ## number of sequences are in single job
    center_offset = 500 ## nearby regions FIXME

    ## split the examples into per-job argument bundles
    args_req_list = data_process_depot(svm_file, org, example_type, signal, data_path, num_seq_ex)

    ## fan the scoring work out to the workers
    intm_ret = pg.pg_map(predict_site_region, args_req_list, param=cluster_resource, local=local, maxNumThreads=1, mem="8gb")
    print("Done with calculating the score for center region of example sequences")

    ## merge the per-worker results
    pred_out_val = reduce_pred_score(intm_ret)
    print("Done with collecting scores from different workers")

    ## save the scores
    fname = "%s_%s_ex_pred_score_%s" % (signal, example_type, uuid.uuid1())
    compressed_pickle.save(fname, pred_out_val)
    print("saving the scores in file %s" % fname)
def train_wdspeck_svm(org_code, signal="tss", data_path="SRA-rnaseq"):
    """
    train SVM based on the examples from different sources

    @args org_code: organism name (ex: A_thaliana)
    @type org_code: str
    @args signal: genomic signal type (default: tss)
    @type signal: str
    @args data_path: file path for training data points
    @type data_path: str
    """
    t0 = time.time()

    ## loading data
    data = load_examples_from_fasta(signal, org_code, data_path)
    ## FIX: explicit validation instead of `assert` (asserts vanish under -O)
    if len(data["examples"]) != len(data["labels"]):
        raise ValueError("examples/labels count mismatch for organism %s" % org_code)

    ## split the data
    train_examples = data["examples"]
    train_labels = data["labels"]

    ## set parameters TODO
    param = {}
    param["cost"] = 1.0
    param["degree"] = 4
    param["degree_spectrum"] = 4
    param["center_pos"] = 1200
    param["center_offset"] = 50
    param["shifts"] = 32
    param["kernel_cache"] = 10000

    ## invoke training
    svm = ShogunPredictor(param)
    svm.train(train_examples, train_labels)

    ## save the model
    fname = "%s_%s_model_%s" % (org_code, signal, uuid.uuid1())
    compressed_pickle.save(fname, svm)
    print("saving the model in file %s" % fname)

    time_taken = time.time() - t0
    print("time taken for the experiment: ", time_taken)

    return fname
def train_wdspeck_svm(org_code, signal="tss", data_path="SRA-rnaseq"):
    """
    train SVM based on the examples from different sources

    @args org_code: organism name (ex: A_thaliana)
    @type org_code: str
    @args signal: genomic signal type (default: tss)
    @type signal: str
    @args data_path: file path for training data points
    @type data_path: str
    """
    start_time = time.time()

    ## read the training examples for this organism
    fasta_data = load_examples_from_fasta(signal, org_code, data_path)
    assert(len(fasta_data["examples"]) == len(fasta_data["labels"]))

    ## pull out examples and their labels
    seq_examples = fasta_data["examples"]
    seq_labels = fasta_data["labels"]

    ## model hyperparameters TODO
    param = {
        "cost": 1.0,
        "degree": 4,
        "degree_spectrum": 4,
        "center_pos": 1200,
        "center_offset": 50,
        "shifts": 32,
        "kernel_cache": 10000,
    }

    ## fit the classifier
    svm = ShogunPredictor(param)
    svm.train(seq_examples, seq_labels)

    ## persist the trained model under a unique name
    fname = "%s_%s_model_%s" % (org_code, signal, uuid.uuid1())
    compressed_pickle.save(fname, svm)
    print("saving the model in file %s" % fname)

    time_taken = time.time() - start_time
    print("time taken for the experiment: ", time_taken)

    return fname
def train_shifted_wdk_svm(org_code, signal="tss", data_path="SRA-rnaseq"):
    """
    train SVM based on the examples from different sources

    @args org_code: organism name (ex: A_thaliana)
    @type org_code: str
    @args signal: genomic signal type (default: tss)
    @type signal: str
    @args data_path: file path for training data points
    @type data_path: str
    """
    ## FIX: dropped the redundant function-local `import time` — the sibling
    ## training functions use the module-level import.
    t0 = time.time()

    ## loading data
    data = load_examples_from_fasta(signal, org_code, data_path)
    if len(data["examples"]) != len(data["labels"]):
        raise ValueError("examples/labels count mismatch for organism %s" % org_code)

    ## split the data
    train_examples = data["examples"]
    train_labels = data["labels"]

    ## set parameters
    param = {}
    param["cost"] = 1.0
    param["degree"] = 4
    param["degree_spectrum"] = 4
    param["center_pos"] = 1200
    param["center_offset"] = 50
    param["shifts"] = 32
    param["kernel_cache"] = 10000

    ## invoke training
    svm = ShogunPredictor(param)
    svm.train(train_examples, train_labels)

    ## save the model
    fname = "%s_%s_model_%s" % (org_code, signal, uuid.uuid1())
    compressed_pickle.save(fname, svm)
    print("saving the model in file %s" % fname)

    time_taken = time.time() - t0
    print("time taken for the experiment: ", time_taken)

    return fname
def manual_pos_shift(svm_file, org, signal="tss", data_path="SRA-rnaseq"):
    """
    manually look at the position around the original position

    @args svm_file: file containing the trained svm model
    @type svm_file: str
    @args org: organism name
    @type org: str
    @args signal: genomic signal type (default: tss)
    @type signal: str
    @args data_path: file path for example data points
    @type data_path: str
    """
    ## loading data
    data = load_examples_from_fasta(signal, org, data_path)
    assert len(data["examples"]) == len(data["labels"])

    ## unpack the model
    ## FIX: `cPickle` is Python 2 only; `pickle` works on both.
    ## NOTE(security): pickle.load runs arbitrary code on load — only open
    ## model files from trusted sources.
    import bz2
    import pickle
    fh = bz2.BZ2File(svm_file, "rb")
    try:
        model = pickle.load(fh)
    finally:
        fh.close()

    ## getting the model information
    center_pos = model.param["center_pos"]
    center_offset = model.param["center_offset"]

    print("model - center pos: %i, center reg: %i" % (center_pos, center_offset))

    ## scan window around the signal position recorded in the model
    start_scan = center_pos - center_offset
    stop_scan = center_pos + center_offset

    cnt = 0
    data_set = []
    argument_list = []
    label_type = -1 ## label_type will be +1/-1

    ## get the individual examples to recenter the signal position manually
    for idx, single_example in enumerate(data["examples"]):
        datum = [single_example]
        label_info = data["labels"][idx]
        if label_info != label_type:
            cnt += 1
            data_set.append(datum)
            if cnt % 10 == 0: ## packing 10 seq to one job
                argument_list.append([start_scan, stop_scan, model, data_set])
                data_set = []
    ## FIX: flush the final partial batch — previously any trailing group of
    ## fewer than 10 sequences was silently dropped.
    if data_set:
        argument_list.append([start_scan, stop_scan, model, data_set])

    local = False
    cluster_resource = {
        "pvmem": "4gb",
        "pmem": "4gb",
        "mem": "4gb",
        "vmem": "4gb",
        "ppn": "1",
        "nodes": "1",
        "walltime": "4:00:00",
    }

    task_type = 0 # 1 recenter seq, 0 predict score
    if task_type:
        intm_ret = pg.pg_map(
            predict_and_recenter, argument_list, param=cluster_resource, local=local, maxNumThreads=2, mem="4gb"
        )
        print("Done with computation")

        fixed_example_seq = reduce_modified_seq(intm_ret)
        print("Done reducing the results")

        write_fasta_rec(fixed_example_seq, signal)
    else:
        intm_ret = pg.pg_map(
            predict_around_region, argument_list, param=cluster_resource, local=local, maxNumThreads=2, mem="4gb"
        )
        print("Done with computation")

        pred_out_val = reduce_pred_score(intm_ret)
        print("Done reducing the results")

        ## save the scores
        fname = "%s_pred_score_%s" % (signal, uuid.uuid1())
        compressed_pickle.save(fname, pred_out_val)
        ## FIX: moved inside the else branch — `fname` is undefined on the
        ## recenter branch, so the original trailing print was a latent NameError.
        print("saving the score in file %s" % fname)