import time
import uuid

## project-local helpers used below (pg for cluster/grid mapping, compressed_pickle,
## load_examples_from_fasta, data_process_depot, predict_site_region, reduce_pred_score,
## reduce_modified_seq, predict_and_recenter, predict_around_region, write_fasta_rec,
## get_orgdb, ShogunPredictor) are assumed to be importable from the surrounding project.

def calculate_pred_score(svm_file, org, example_type="pos", signal="tss", data_path="SRA-rnaseq"):
    """
    calculate the SVM prediction score around the true signal site

    @args svm_file: file name of a trained model saved with compressed_pickle
    @type svm_file: str
    @args org: organism name (ex: A_thaliana)
    @type org: str
    @args example_type: example type to score (default: pos)
    @type example_type: str
    @args signal: genomic signal type (default: tss)
    @type signal: str
    @args data_path: file path for the example data points
    @type data_path: str
    """

    local = False  ## switch between local execution and the compute cluster
    ## cluster compute options   
    cluster_resource = {'pvmem':'8gb', 'pmem':'8gb', 'mem':'8gb', 'vmem':'8gb','ppn':'1', 'nodes':'1', 'walltime':'24:00:00'}
    #cluster_resource = {'mem':'6000', 'nodes':'1', 'walltime':'08:00'}

    num_seq_ex = 10  ## number of sequences packed into a single job
    center_offset = 500  ## flanking region around the signal site (FIXME: currently unused here)
    args_req_list = data_process_depot(svm_file, org, example_type, signal, data_path, num_seq_ex)

    intm_ret = pg.pg_map(predict_site_region, args_req_list, param=cluster_resource, local=local, maxNumThreads=1, mem="8gb") 
    print("Done with calculating the score for center region of example sequences")

    pred_out_val = reduce_pred_score(intm_ret) 
    print("Done with collecting scores from different workers")

    ## save the scores 
    fname = "%s_%s_ex_pred_score_%s" % (signal, example_type, uuid.uuid1()) 
    compressed_pickle.save(fname, pred_out_val) 

    print("saving the scores in file %s" % fname)
def train_combined_wdspeck_svm(org_list_file, signal="tss", data_path="SRA-seq"):
    """
    train a global classifier for multiple organisms

    @args org_list_file: text file listing organism names
    @type org_list_file: str 
    @args signal: genomic signal type (default: tss) 
    @type signal: str 
    @args data_path: file path for training data points 
    @type data_path: str 
    """

    t0 = time.time()
    ## load the list of organisms
    ORG_LIST = get_orgdb(org_list_file)

    ## loading data
    train_examples = []
    train_labels = []
    for ORG_NAME in ORG_LIST:
        local_data_path = "%s/%s/set_1" % (data_path, ORG_NAME)  ## FIXME common data path
        data = load_examples_from_fasta(signal, ORG_NAME, local_data_path)
        assert len(data["examples"]) == len(data["labels"])

        ## pool the examples across organisms
        train_examples.extend(data["examples"])
        train_labels.extend(data["labels"])

    ## set parameters TODO
    param = {}
    param["cost"] = 1.0
    param["degree"] = 4
    param["degree_spectrum"] = 4
    param["center_pos"] = 1200
    param["center_offset"] = 50
    param["shifts"] = 32
    param["kernel_cache"] = 10000

    ## invoke training
    svm = ShogunPredictor(param)
    svm.train(train_examples, train_labels)

    ## save the model
    fname = "%s_model_%s" % (signal, uuid.uuid1())
    compressed_pickle.save(fname, svm)
    print ("saving the model in file %s" % fname)

    time_taken = time.time() - t0
    print ("time taken for the experiment: ", time_taken)

    return fname
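
## get_orgdb is a project-local helper; a minimal sketch of the behavior assumed
## above (one organism code per line in a plain-text file, blank and comment lines
## skipped) -- the real parser may differ:
def _get_orgdb_sketch(org_list_file):
    with open(org_list_file) as fh:
        return [line.strip() for line in fh
                if line.strip() and not line.startswith("#")]
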
def train_wdspeck_svm(org_code, signal="tss", data_path="SRA-rnaseq"):
    """
    train SVM based on the examples from different sources 

    @args org_code: organism name (ex: A_thaliana)
    @type org_code: str 
    @args signal: genomic signal type (default: tss) 
    @type signal: str 
    @args data_path: file path for training data points 
    @type data_path: str 
    """
    t0 = time.time()

    ## loading data
    data = load_examples_from_fasta(signal, org_code, data_path)
    assert len(data["examples"]) == len(data["labels"])

    ## collect the training data
    train_examples = data["examples"]
    train_labels = data["labels"]

    ## set parameters TODO
    param = {}
    param["cost"] = 1.0
    param["degree"] = 4
    param["degree_spectrum"] = 4
    param["center_pos"] = 1200
    param["center_offset"] = 50
    param["shifts"] = 32
    param["kernel_cache"] = 10000

    ## invoke training
    svm = ShogunPredictor(param)
    svm.train(train_examples, train_labels)

    ## save the model
    fname = "%s_%s_model_%s" % (org_code, signal, uuid.uuid1())
    compressed_pickle.save(fname, svm)
    print ("saving the model in file %s" % fname)

    time_taken = time.time() - t0
    print ("time taken for the experiment: ", time_taken)

    return fname
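
## Usage sketch chaining the steps above; the organism code is only an example:
def _demo_train_and_score():
    model_file = train_wdspeck_svm("A_thaliana", signal="tss", data_path="SRA-rnaseq")
    calculate_pred_score(model_file, "A_thaliana", example_type="pos",
                         signal="tss", data_path="SRA-rnaseq")
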
def train_shifted_wdk_svm(org_code, signal="tss", data_path="SRA-rnaseq"):
    """
    train SVM based on the examples from different sources 
    """

    t0 = time.time()

    ## loading data
    data = load_examples_from_fasta(signal, org_code, data_path)
    assert len(data["examples"]) == len(data["labels"])

    ## collect the training data
    train_examples = data["examples"]
    train_labels = data["labels"]

    ## set parameters
    param = {}
    param["cost"] = 1.0
    param["degree"] = 4
    param["degree_spectrum"] = 4
    param["center_pos"] = 1200
    param["center_offset"] = 50
    param["shifts"] = 32
    param["kernel_cache"] = 10000

    ## invoke training
    svm = ShogunPredictor(param)
    svm.train(train_examples, train_labels)

    ## save the model
    fname = "%s_%s_model_%s" % (org_code, signal, uuid.uuid1())
    compressed_pickle.save(fname, svm)
    print ("saving the model in file %s" % fname)

    time_taken = time.time() - t0
    print ("time taken for the experiment: ", time_taken)

    return fname
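
## compressed_pickle is a project-local helper; a minimal sketch of the assumed
## save/load pair (the load side mirrors the bz2 + pickle unpacking done in
## manual_pos_shift below; the exact file-name handling is an assumption):
import bz2
import pickle

def _compressed_pickle_save_sketch(fname, obj):
    with bz2.BZ2File(fname, "wb") as fh:
        pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)

def _compressed_pickle_load_sketch(fname):
    with bz2.BZ2File(fname, "rb") as fh:
        return pickle.load(fh)
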
def manual_pos_shift(svm_file, org, signal="tss", data_path="SRA-rnaseq"):
    """
    manually examine positions around the original signal position
    """

    ## loading data
    data = load_examples_from_fasta(signal, org, data_path)
    assert len(data["examples"]) == len(data["labels"])

    ## unpack the model
    import bz2
    import pickle

    fh = bz2.BZ2File(svm_file, "rb")
    model = pickle.load(fh)
    fh.close()

    ## getting the model information
    center_pos = model.param["center_pos"]
    center_offset = model.param["center_offset"]

    print ("model - center pos: %i, center reg: %i" % (center_pos, center_offset))

    start_scan = center_pos - center_offset
    stop_scan = center_pos + center_offset

    cnt = 0
    data_set = []
    argument_list = []

    label_type = -1  ## label_type will be +1/-1

    ## get the individual examples to recenter the signal position manually
    for idx, single_example in enumerate(data["examples"]):

        datum = [single_example]
        label_info = data["labels"][idx]

        if label_info != label_type:
            cnt += 1

            if cnt % 10 == 0:  ## pack 10 sequences into one job
                data_set.append(datum)

                arg = [start_scan, stop_scan, model, data_set]
                argument_list.append(arg)

                data_set = []
            else:
                data_set.append(datum)

    ## flush any remaining partial batch of sequences
    if data_set:
        argument_list.append([start_scan, stop_scan, model, data_set])

    local = False
    cluster_resource = {
        "pvmem": "4gb",
        "pmem": "4gb",
        "mem": "4gb",
        "vmem": "4gb",
        "ppn": "1",
        "nodes": "1",
        "walltime": "4:00:00",
    }
    task_type = 0  ## 1: recenter sequences, 0: predict scores

    if task_type:
        intm_ret = pg.pg_map(
            predict_and_recenter, argument_list, param=cluster_resource, local=local, maxNumThreads=2, mem="4gb"
        )
        print "Done with computation"

        fixed_example_seq = reduce_modified_seq(intm_ret)
        print "Done reducing the results"

        write_fasta_rec(fixed_example_seq, signal)

    else:
        intm_ret = pg.pg_map(
            predict_around_region, argument_list, param=cluster_resource, local=local, maxNumThreads=2, mem="4gb"
        )
        print "Done with computation"

        pred_out_val = reduce_pred_score(intm_ret)
        print "Done reducing the results"

        ## save the scores
        fname = "%s_pred_score_%s" % (signal, uuid.uuid1())
        compressed_pickle.save(fname, pred_out_val)

        print ("saving the score in file %s" % fname)