Example no. 1
0
def train_combined_wdspeck_svm(org_list_file, signal="tss", data_path="SRA-seq"):
    """
    train a global classifier over examples pooled from multiple organisms

    @args org_list_file: text file listing one organism name per line 
    @type org_list_file: str 
    @args signal: genomic signal type (default: tss) 
    @type signal: str 
    @args data_path: base path holding the per-organism training data 
    @type data_path: str 

    returns the file name under which the trained model was saved
    """

    t0 = time.time()
    ## getting multiple organisms
    ORG_LIST = get_orgdb(org_list_file)

    ## loading and pooling data from every organism
    train_examples = []
    train_labels = []
    for ORG_NAME in ORG_LIST:
        local_data_path = "%s/%s/set_1" % (data_path, ORG_NAME)  ## FIXME common data path
        data = load_examples_from_fasta(signal, ORG_NAME, local_data_path)
        ## explicit check instead of `assert`: asserts are stripped under `python -O`
        if len(data["examples"]) != len(data["labels"]):
            raise ValueError("examples/labels length mismatch for organism %s" % ORG_NAME)

        ## pool the data
        train_examples.extend(data["examples"])
        train_labels.extend(data["labels"])

    ## set parameters TODO
    param = {}
    param["cost"] = 1.0
    param["degree"] = 4
    param["degree_spectrum"] = 4
    param["center_pos"] = 1200
    param["center_offset"] = 50
    param["shifts"] = 32
    param["kernel_cache"] = 10000

    ## invoke training
    svm = ShogunPredictor(param)
    svm.train(train_examples, train_labels)

    ## save the model under a unique, signal-tagged file name
    fname = "%s_model_%s" % (signal, uuid.uuid1())
    compressed_pickle.save(fname, svm)
    print ("saving the model in file %s" % fname)

    time_taken = time.time() - t0
    print ("time taken for the experiment: ", time_taken)

    return fname
Example no. 2
0
def train_combined_wdspeck_svm(org_list_file, signal="tss", data_path="SRA-seq"):
    """
    train a global classifier for multiple organisms

    @args org_list_file: organism name in a text file 
    @type org_list_file: str 
    @args signal: genomic signal type (default: tss) 
    @type signal: str 
    @args data_path: file path for training data points 
    @type data_path: str 
    """

    start_time = time.time()
    ## organisms whose training data will be pooled together
    organisms = get_orgdb(org_list_file)

    ## collect examples and labels across all organisms
    train_examples, train_labels = [], []
    for org in organisms:
        org_data_dir = "%s/%s/set_1" % (data_path, org) ## FIXME common data path
        fasta_data = load_examples_from_fasta(signal, org, org_data_dir)
        assert(len(fasta_data["examples"]) == len(fasta_data["labels"]))

        ## pool into the combined training set
        train_examples += fasta_data["examples"]
        train_labels += fasta_data["labels"]

    ## training hyper-parameters TODO
    param = dict(
        cost=1.0,
        degree=4,
        degree_spectrum=4,
        center_pos=1200,
        center_offset=50,
        shifts=32,
        kernel_cache=10000,
    )

    ## fit the SVM on the pooled data
    svm = ShogunPredictor(param)
    svm.train(train_examples, train_labels)

    ## persist the trained model under a unique name
    model_file = "%s_model_%s" % (signal, uuid.uuid1())
    compressed_pickle.save(model_file, svm)
    print("saving the model in file %s" % model_file)

    elapsed = time.time() - start_time
    print("time taken for the experiment: ", elapsed)

    return model_file
Example no. 3
0
def train_wdspeck_svm(org_code, signal="tss", data_path="SRA-rnaseq"):
    """
    train SVM based on the examples from different sources 

    @args org_code: organism name (ex: A_thaliana)
    @type org_code: str 
    @args signal: genomic signal type (default: tss) 
    @type signal: str 
    @args data_path: file path for training data points 
    @type data_path: str 

    returns the file name under which the trained model was saved
    """
    t0 = time.time()

    ## loading data
    data = load_examples_from_fasta(signal, org_code, data_path)
    ## explicit check instead of `assert`: asserts are stripped under `python -O`
    if len(data["examples"]) != len(data["labels"]):
        raise ValueError("examples/labels length mismatch for organism %s" % org_code)

    ## use the full data set for training
    train_examples = data["examples"]
    train_labels = data["labels"]

    ## set parameters TODO
    param = {}
    param["cost"] = 1.0
    param["degree"] = 4
    param["degree_spectrum"] = 4
    param["center_pos"] = 1200
    param["center_offset"] = 50
    param["shifts"] = 32
    param["kernel_cache"] = 10000

    ## invoke training
    svm = ShogunPredictor(param)
    svm.train(train_examples, train_labels)

    ## save the model under a unique, organism- and signal-tagged file name
    fname = "%s_%s_model_%s" % (org_code, signal, uuid.uuid1())
    compressed_pickle.save(fname, svm)
    print ("saving the model in file %s" % fname)

    time_taken = time.time() - t0
    print ("time taken for the experiment: ", time_taken)

    return fname
Example no. 4
0
def train_wdspeck_svm(org_code, signal="tss", data_path="SRA-rnaseq"):
    """
    train SVM based on the examples from different sources 

    @args org_code: organism name (ex: A_thaliana)
    @type org_code: str 
    @args signal: genomic signal type (default: tss) 
    @type signal: str 
    @args data_path: file path for training data points 
    @type data_path: str 
    """
    start_time = time.time()

    ## fetch the labelled examples for this organism
    fasta_data = load_examples_from_fasta(signal, org_code, data_path)
    assert(len(fasta_data["examples"]) == len(fasta_data["labels"]))

    ## the whole data set goes into training
    train_examples = fasta_data["examples"]
    train_labels = fasta_data["labels"]

    ## training hyper-parameters TODO 
    param = dict(
        cost=1.0,
        degree=4,
        degree_spectrum=4,
        center_pos=1200,
        center_offset=50,
        shifts=32,
        kernel_cache=10000,
    )

    ## fit the SVM
    svm = ShogunPredictor(param)
    svm.train(train_examples, train_labels)

    ## persist the trained model under a unique name
    model_file = "%s_%s_model_%s" % (org_code, signal, uuid.uuid1())
    compressed_pickle.save(model_file, svm)
    print("saving the model in file %s" % model_file)

    elapsed = time.time() - start_time
    print("time taken for the experiment: ", elapsed)

    return model_file
Example no. 5
0
def train_shifted_wdk_svm(org_code, signal="tss", data_path="SRA-rnaseq"):
    """
    train SVM based on the examples from different sources 

    @args org_code: organism name (ex: A_thaliana)
    @type org_code: str 
    @args signal: genomic signal type (default: tss) 
    @type signal: str 
    @args data_path: file path for training data points 
    @type data_path: str 

    returns the file name under which the trained model was saved
    """

    ## NOTE(review): local import kept — the module top is not visible here, so a
    ## module-level `import time` cannot be assumed for this function
    import time

    t0 = time.time()

    ## loading data
    data = load_examples_from_fasta(signal, org_code, data_path)
    ## explicit check instead of `assert`: asserts are stripped under `python -O`
    if len(data["examples"]) != len(data["labels"]):
        raise ValueError("examples/labels length mismatch for organism %s" % org_code)

    ## use the full data set for training
    train_examples = data["examples"]
    train_labels = data["labels"]

    ## set parameters
    param = {}
    param["cost"] = 1.0
    param["degree"] = 4
    param["degree_spectrum"] = 4
    param["center_pos"] = 1200
    param["center_offset"] = 50
    param["shifts"] = 32
    param["kernel_cache"] = 10000

    ## invoke training
    svm = ShogunPredictor(param)
    svm.train(train_examples, train_labels)

    ## save the model under a unique, organism- and signal-tagged file name
    fname = "%s_%s_model_%s" % (org_code, signal, uuid.uuid1())
    compressed_pickle.save(fname, svm)
    print ("saving the model in file %s" % fname)

    time_taken = time.time() - t0
    print ("time taken for the experiment: ", time_taken)

    return fname