def check_C_testset(mss_id):
    
    import pylab
    import expenv
    import numpy
    from helper import Options
    from method_hierarchy_svm_new import Method
    #from method_augmented_svm_new import Method
    
    
    #costs = 10000 #[float(c) for c in numpy.exp(numpy.linspace(numpy.log(10), numpy.log(20000), 6))]
    costs = [float(c) for c in numpy.exp(numpy.linspace(numpy.log(0.4), numpy.log(10), 6))] 
    
    print costs
    
    mss = expenv.MultiSplitSet.get(mss_id)
    
    train = mss.get_train_data(-1)
    test = mss.get_eval_data(-1)
    
    au_roc = []
    au_prc = []
    
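    # NOTE: the value swept here is applied to param.transform below, while
    # the SVM cost itself stays fixed at 10000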
    for cost in costs:
        #create mock param object by freezable struct
        param = Options()
        param.kernel = "WeightedDegreeStringKernel"
        param.wdk_degree = 10
        param.transform = cost
        param.base_similarity = 1.0
        param.taxonomy = mss.taxonomy
        param.id = 666
    
        #param.cost = cost
        param.cost = 10000
        param.freeze()
    
        # train
        mymethod = Method(param)
        mymethod.train(train)
    
        assessment = mymethod.evaluate(test)
        
        au_roc.append(assessment.auROC)
        au_prc.append(assessment.auPRC)
        
        print assessment
        assessment.destroySelf()

    pylab.title("auROC")
    pylab.semilogx(costs, au_roc, "-o")
    
    pylab.show()
    pylab.figure()
    pylab.title("auPRC")
    pylab.semilogx(costs, au_prc, "-o")
    pylab.show()
    
    return (costs, au_roc, au_prc)
def main():
    
    
    print "starting debugging:"

    SPLIT_POINTER = -1

    from expenv import MultiSplitSet
    from helper import Options 
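    # NOTE: Method is assumed to be imported at module level in the
    # original file (e.g. from method_hierarchy_svm_new import Method)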
    
    
    # select dataset
    #multi_split_set = MultiSplitSet.get(387)
    #multi_split_set = MultiSplitSet.get(407)
    multi_split_set = MultiSplitSet.get(399)

    #dataset_name = multi_split_set.description

    
    # create mock param object by freezable struct
    param = Options()
    param.kernel = "WeightedDegreeRBFKernel"  # alternatives: "WeightedDegreeStringKernel", "PolyKernel"
    param.wdk_degree = 2
    param.cost = 1.0
    param.transform = 0.2
    param.base_similarity = 1.0
    param.taxonomy = multi_split_set.taxonomy
    param.id = 666
    
    flags = {}
    #flags["boosting"] = "ones"
    #flags["boosting"] = "L1"
    flags["boosting"] = "L2"
    #flags["boosting"] = "L2_reg"
    flags["signum"] = False
    flags["normalize_cost"] = True
    flags["all_positions"] = False
    
    flags["wdk_rbf_on"] = False
    
    param.flags = flags
    
    param.freeze()
    

    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)


    # train
    mymethod = Method(param)
    mymethod.train(data_train)


    assessment = mymethod.evaluate(data_eval)
    
    print assessment
    
    assessment.destroySelf()
Example #3
def main():
    
    
    print "starting debugging:"

    SPLIT_POINTER = -1

    from expenv import MultiSplitSet
    from helper import Options 
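    # NOTE: Method is assumed to be imported at module level in the
    # original file (e.g. from method_hierarchy_svm_new import Method)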
    
    
    # select dataset
    #multi_split_set = MultiSplitSet.get(387)
    #multi_split_set = MultiSplitSet.get(407)
    multi_split_set = MultiSplitSet.get(399)

    #dataset_name = multi_split_set.description

    
    # create mock param object by freezable struct
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"  # alternative: "PolyKernel"
    param.wdk_degree = 2
    param.cost = 1.0
    param.transform = 0.2
    param.base_similarity = 1
    param.taxonomy = multi_split_set.taxonomy
    param.id = 666
    
    flags = {}
    #flags["boosting"] = "ones"
    flags["boosting"] = "L1"
    #flags["boosting"] = "L2"
    #flags["boosting"] = "L2_reg"
    flags["signum"] = False
    flags["normalize_cost"] = True
    flags["all_positions"] = False
    flags["wdk_rbf_on"] = False
    
    param.flags = flags
    
    param.freeze()
    

    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)


    # train
    mymethod = Method(param)
    mymethod.train(data_train)


    assessment = mymethod.evaluate(data_eval)
    
    print assessment
    
    assessment.destroySelf()
def main():
    
    
    print "starting debugging:"

    SPLIT_POINTER = 1

    from expenv import MultiSplitSet
    from helper import Options 
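    # NOTE: Method is assumed to be imported at module level in the
    # original file (e.g. from method_hierarchy_svm_new import Method)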
    
    
    # select dataset
    multi_split_set = MultiSplitSet.get(434)

    
    # flags
    flags = {}
    flags["normalize_cost"] = False
    flags["epsilon"] = 1.0 
    #0.005
    flags["kernel_cache"] = 200
    flags["use_bias"] = False 

    # arts params
    flags["svm_type"] = "liblineardual"

    flags["degree"] = 24
    flags["degree_spectrum"] = 4
    flags["shifts"] = 0 #32
    flags["center_offset"] = 70
    flags["train_factor"] = 1

    #create mock param object by freezable struct
    param = Options()
    param.kernel = "Promoter"
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.flags = flags
    param.taxonomy = multi_split_set.taxonomy
    
    param.freeze()

    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)


    # train
    mymethod = Method(param)
    mymethod.train(data_train)

    print "training done"

    assessment = mymethod.evaluate(data_eval)
    
    print assessment
    
    assessment.destroySelf()
Example #5
def main():
    
    
    print "starting debugging:"

    SPLIT_POINTER = 1

    from expenv import MultiSplitSet
    from helper import Options
    # NOTE: Method is assumed to be imported at module level in the
    # original file (e.g. from method_hierarchy_svm_new import Method)

    # select dataset
    multi_split_set = MultiSplitSet.get(384)
    
    # flags
    flags = {}
    flags["normalize_cost"] = False
    #flags["epsilon"] = 0.005
    flags["kernel_cache"] = 200
    flags["use_bias"] = False 

    # arts params
    #flags["svm_type"] = "liblineardual"

    flags["degree"] = 24

    flags["local"] = False
    flags["mem"] = "6G"
    flags["maxNumThreads"] = 1
    
    
    #create mock param object by freezable struct
    param = Options()
    #param.kernel = "GaussianKernel"
    param.kernel = "PolyKernel"
    param.sigma = 3.0
    param.cost = 10.0
    param.transform = 1.0
    param.id = 666
    param.flags = flags
    param.taxonomy = multi_split_set.taxonomy.data
    
    param.freeze()
    
    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)

    # train
    mymethod = Method(param)
    mymethod.train(data_train)

    print "training done"

    assessment = mymethod.evaluate(data_eval)
    
    print assessment
    
    assessment.destroySelf()
Example #6
def main():

    print "starting debugging:"

    SPLIT_POINTER = 1

    from expenv import MultiSplitSet
    from helper import Options
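    # NOTE: Method is assumed to be imported at module level in the
    # original file (e.g. from method_hierarchy_svm_new import Method)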

    # select dataset
    multi_split_set = MultiSplitSet.get(432)

    # flags
    flags = {}
    flags["normalize_cost"] = False
    #flags["epsilon"] = 0.005
    flags["kernel_cache"] = 200
    flags["use_bias"] = False

    # arts params
    flags["svm_type"] = "liblineardual"

    flags["degree"] = 24
    flags["degree_spectrum"] = 4
    flags["shifts"] = 0  #32
    flags["center_offset"] = 70
    flags["train_factor"] = 1

    flags["local"] = False
    flags["mem"] = "6G"
    flags["maxNumThreads"] = 1

    #create mock param object by freezable struct
    param = Options()
    param.kernel = "Promoter"
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.flags = flags
    param.taxonomy = multi_split_set.taxonomy.data

    param.freeze()

    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)

    # train
    mymethod = Method(param)
    mymethod.train(data_train)

    print "training done"

    assessment = mymethod.evaluate(data_eval)

    print assessment

    assessment.destroySelf()
def main():
    
    
    print "starting debugging:"

    SPLIT_POINTER = -1

    from expenv import MultiSplitSet
    from helper import Options 
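    # NOTE: Method is assumed to be imported at module level in the
    # original file (e.g. from method_hierarchy_svm_new import Method)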
    
    
    # select dataset
    multi_split_set = MultiSplitSet.get(399)

    #dataset_name = multi_split_set.description
    flags = {}
    flags["normalize_cost"] = False
    flags["epsilon"] = 0.05
    flags["cache_size"] = 7
    #flags["solver_type"] = "ST_DIRECT" #ST_CPLEX #ST_GLPK) #ST_DIRECT) #ST_NEWTON)
    flags["normalize_trace"] = True
    flags["interleaved"] = True
    
    
    #create mock param object by freezable struct
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 1
    param.cost = 1
    param.transform = 1 #2.0
    param.taxonomy = multi_split_set.taxonomy
    param.id = 666
    
    
    param.flags = flags
    
    param.freeze()
    

    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)


    # train
    mymethod = Method(param)
    mymethod.train(data_train)


    assessment = mymethod.evaluate(data_eval)
    
    print assessment
    
    assessment.destroySelf()
Example #8
def main():
    
    
    print "starting debugging:"

    SPLIT_POINTER = 1

    from expenv import MultiSplitSet
    from helper import Options 
    from task_similarities import fetch_gammas
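    # NOTE: create_plot_inner is assumed to be defined elsewhere in the
    # original module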
    
    
    # select dataset
    multi_split_set = MultiSplitSet.get(317)
    #multi_split_set = MultiSplitSet.get(374)
    #multi_split_set = MultiSplitSet.get(2)

    dataset_name = multi_split_set.description

    transform = 1.0
    base = 1.0
    similarity_matrix = fetch_gammas(transform, base, dataset_name) 
        

    #create mock taxonomy object by freezable struct
    taxonomy = Options()
    taxonomy.data = similarity_matrix
    taxonomy.description = dataset_name
    taxonomy.freeze()
    
    
    #create mock param object by freezable struct
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 1
    param.cost = 1.0
    param.transform = 1.0
    param.taxonomy = taxonomy
    param.id = 666
    
    param.freeze()
    


    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)

    create_plot_inner(param, data_train, data_eval)
def main():
        
    print "starting debugging:"

    SPLIT_POINTER = 1

    from expenv import MultiSplitSet
    from helper import Options 
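    # NOTE: Method is assumed to be imported at module level in the
    # original file (e.g. from method_hierarchy_svm_new import Method)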
    
    
    # select dataset
    multi_split_set = MultiSplitSet.get(379)

    dataset_name = multi_split_set.description

    print "dataset_name", dataset_name
    
    #create mock taxonomy object by freezable struct
    #taxonomy = Options()
    #taxonomy.data = taxonomy_graph.data
    #taxonomy.description = dataset_name
    #taxonomy.freeze()
    
    
    #create mock param object by freezable struct
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 1
    param.cost = 1.0
    param.transform = 2.0
    param.taxonomy = multi_split_set.taxonomy
    param.id = 666
    
    param.freeze()
    

    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)

    # train hierarchical xval
    mymethod = Method(param)
    mymethod.train(data_train)
    
    assessment = mymethod.evaluate(data_eval)
    
    print assessment
    
    assessment.destroySelf()
Example #10
def training_for_sigma(sigma):

    print "starting debugging:"


    from expenv import MultiSplitSet
    from helper import Options
    # NOTE: Method is assumed to be imported at module level in the
    # original file (e.g. from method_hierarchy_svm_new import Method)

    # select dataset
    multi_split_set = MultiSplitSet.get(393)

    SPLIT_POINTER = 1
    
    #create mock param object by freezable struct
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"  # alternative: "WeightedDegreeRBFKernel"
    param.wdk_degree = 2
    param.cost = 1.0
    param.transform = 1.0 
    param.id = 666
    param.base_similarity = sigma
    param.degree = 2
    param.flags = {}
    
    param.flags["wdk_rbf_on"] = False   
    param.freeze()
    

    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)


    # train
    mymethod = Method(param)
    mymethod.train(data_train)

    print "training done"

    assessment = mymethod.evaluate(data_eval)
    
    print assessment
    
    assessment.destroySelf()
    return assessment.auROC
Example #13
def main():
    
    
    print "starting debugging:"

    SPLIT_POINTER = 1

    from expenv import MultiSplitSet
    from helper import Options 
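    # NOTE: Method is assumed to be imported at module level in the
    # original file (e.g. from method_hierarchy_svm_new import Method)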
    
    
    # select dataset
    multi_split_set = MultiSplitSet.get(399)

    
    #create mock param object by freezable struct
    param = Options()
    param.kernel = "WeightedDegreeRBFKernel"  # alternative: "WeightedDegreeStringKernel"
    param.wdk_degree = 1
    param.cost = 1.0
    param.transform = 1.0
    param.sigma = 1.0
    param.id = 666
    param.base_similarity = 1
    param.degree = 2
    
    param.freeze()
    

    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)


    # train
    mymethod = Method(param)
    mymethod.train(data_train)

    print "training done"

    assessment = mymethod.evaluate(data_eval)
    
    print assessment
    
    assessment.destroySelf()
Example #14
def main():
    
    
    print "starting debugging:"

    SPLIT_POINTER = -1

    from expenv import MultiSplitSet
    from helper import Options 
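    # NOTE: Method is assumed to be imported at module level in the
    # original file (e.g. from method_hierarchy_svm_new import Method)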
    
    
    # select dataset
    #multi_split_set = MultiSplitSet.get(387)
    multi_split_set = MultiSplitSet.get(386)

    #dataset_name = multi_split_set.description

    
    # create mock param object by freezable struct
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"  # alternative: "PolyKernel"
    param.wdk_degree = 1
    param.cost = 100
    param.transform = 2 #2.0
    param.taxonomy = multi_split_set.taxonomy
    param.id = 666
    
    param.freeze()
    

    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)


    # train
    mymethod = Method(param)
    mymethod.train(data_train)


    assessment = mymethod.evaluate(data_eval)
    
    print assessment
    
    assessment.destroySelf()
Example #15
def main():
    
    
    print "starting debugging:"

    SPLIT_POINTER = -1

    from expenv import MultiSplitSet
    from helper import Options 
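    # NOTE: Method is assumed to be imported at module level in the
    # original file (e.g. from method_hierarchy_svm_new import Method)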
    
    
    # select dataset
    #multi_split_set = MultiSplitSet.get(387)
    multi_split_set = MultiSplitSet.get(386)

    #dataset_name = multi_split_set.description

    
    # create mock param object by freezable struct
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"  # alternative: "PolyKernel"
    param.wdk_degree = 1
    param.cost = 1
    param.transform = 2 #2.0
    param.taxonomy = multi_split_set.taxonomy
    param.id = 666
    
    param.freeze()
    

    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)


    # train
    mymethod = Method(param)
    mymethod.train(data_train)


    assessment = mymethod.evaluate(data_eval)
    
    print assessment
    
    assessment.destroySelf()
def test_data():
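    # NOTE: this snippet relies on module-level imports not shown here,
    # assumed to include numpy, expenv, helper, shogun_factory, Options
    # (from helper), PreparedMultitaskData, SVMLight and the multitask
    # kernel normalizer classes from shogun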
    
    ##################################################################
    # select MSS
    ##################################################################
    
    mss = expenv.MultiSplitSet.get(379)
    
    
    
    ##################################################################
    # data
    ##################################################################
    
    # fetch data
    instance_set = mss.get_train_data(-1)
    
    # prepare data
    data = PreparedMultitaskData(instance_set, shuffle=True)
    
    # set parameters
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 4
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.freeze()
    ##################################################################
    # taxonomy
    ##################################################################
    
    
    taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)
    
    
    support = numpy.linspace(0, 100, 4)
    
    
    distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]]  # (defined but not used below)
    
    # create tree normalizer 
    tree_normalizer = MultitaskKernelPlifNormalizer(support, data.task_vector_names)
    task_names = data.get_task_names()
    
    
    FACTOR = 1.0
    
    
    # init gamma matrix
    gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))
    
    for t1_name in task_names:
        for t2_name in task_names:
            
            similarity = taxonomy.compute_node_similarity(taxonomy.get_id(t1_name), taxonomy.get_id(t2_name))        
            gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] = similarity
    
    helper.save("/tmp/gammas", gammas)
    
    
    gammas = gammas * FACTOR
    
    cost = param.cost * numpy.sqrt(FACTOR) 
    
    print gammas
    
    
    ##########
    # regular normalizer
    
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)
    
    for t1_name in task_names:
        for t2_name in task_names:
                    
            similarity = gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)]
            normalizer.set_task_similarity(data.name_to_id(t1_name), data.name_to_id(t2_name), similarity)
    
                
    ##################################################################
    # Train SVMs
    ##################################################################
    
    # create shogun objects
    wdk_tree = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)
    
    wdk_tree.set_normalizer(tree_normalizer)
    wdk_tree.init_normalizer()
    
    print "--->",wdk_tree.get_normalizer().get_name()
    
    svm_tree = SVMLight(cost, wdk_tree, lab)
    svm_tree.set_linadd_enabled(False)
    svm_tree.set_batch_computation_enabled(False)
    
    svm_tree.train()
    
    del wdk_tree
    del tree_normalizer
    
    print "finished training tree-norm SVM:", svm_tree.get_objective()
    
    
    wdk = shogun_factory.create_kernel(data.examples, param)
    wdk.set_normalizer(normalizer)
    wdk.init_normalizer()
    
    print "--->",wdk.get_normalizer().get_name()
    
    svm = SVMLight(cost, wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)
    
    svm.train()
    
    print "finished training manually set SVM:", svm.get_objective()
    
    
    alphas_tree = svm_tree.get_alphas()
    alphas = svm.get_alphas()
    
    assert len(alphas_tree) == len(alphas)

    for i in xrange(len(alphas)):
        assert abs(alphas_tree[i] - alphas[i]) < 0.0001
        
    print "success: all alphas are the same"
##################################################################
# data
##################################################################

# fetch data
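# NOTE: "mss" is assumed to be a MultiSplitSet defined earlier in the
# original module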
instance_set = mss.get_train_data(-1)

# prepare data
data = PreparedMultitaskData(instance_set, shuffle=True)

# set parameters
param = Options()
param.kernel = "WeightedDegreeStringKernel"
param.wdk_degree = 4
param.cost = 1.0
param.transform = 1.0
param.id = 666
param.freeze()

##################################################################
# taxonomy
##################################################################

taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)

# create tree normalizer
tree_normalizer = MultitaskKernelTreeNormalizer(data.task_vector_names,
                                                data.task_vector_names,
                                                taxonomy)

task_names = data.get_task_names()
def main():
    
    
    print "starting debugging:"
    

    from expenv import MultiSplitSet
    from helper import Options 
    from task_similarities import dataset_to_hierarchy
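    # NOTE: SPLIT_POINTER, RANGE, TARGET_PARAM, TARGET_TASK and TARGET_MEASURE
    # are module-level globals in the original file, as are create_plot_inner
    # and create_plot_regular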
    
    # select dataset
    #multi_split_set = MultiSplitSet.get(317)
    multi_split_set = MultiSplitSet.get(432)
    #multi_split_set = MultiSplitSet.get(2) #small splicing
    #multi_split_set = MultiSplitSet.get(377) #medium splicing

    dataset_name = multi_split_set.description

    # flags
    flags = {}
    flags["normalize_cost"] = False
    flags["epsilon"] = 1.0 
    #0.005
    flags["kernel_cache"] = 1000
    flags["use_bias"] = False 

    # arts params
    flags["svm_type"] = "liblineardual"

    flags["degree"] = 24
    flags["degree_spectrum"] = 4
    flags["shifts"] = 0 #32
    flags["train_factor"] = 1
    flags["center_offset"] = 70
    flags["center_pos"] = 500


    #create mock param object by freezable struct
    param = Options()
    param.kernel = "Promoter"
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.flags = flags
    param.taxonomy = multi_split_set.taxonomy
    
    param.freeze()


    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)

    (perf_xval, final_pred, best_idx_cost) = create_plot_inner(param, data_train, data_eval)
    perf_regular = create_plot_regular(param, data_train, data_eval)


    # plot performances
      
    import pylab
    
    if TARGET_PARAM == "both":


        #X,Y = pylab.meshgrid(range(len(RANGE)), range(len(RANGE)))
        
        cmap = pylab.cm.get_cmap('jet', 20)    # 20 discrete colors
        
        pylab.contourf(RANGE, RANGE, perf_xval, cmap=cmap)
        #im = pylab.imshow(perf_xval, cmap=cmap, interpolation='bilinear')
        pylab.axis('on')
        pylab.colorbar()
        
        pylab.title("mss:" + str(multi_split_set.id) + ", task:" + TARGET_TASK + ", param:" + TARGET_PARAM + ", split:" + str(SPLIT_POINTER))
        
        pylab.show()
    
    else:
        
        pylab.semilogx(RANGE, perf_regular, "g-o")
        pylab.semilogx(RANGE, perf_xval, "b-o")
        #pylab.semilogx([a*0.66 for a in RANGE], perf_xval, "b-o")
        
        #pylab.plot(numpy.array(perf_regular) - numpy.array(perf_xval), "y-o")
        
        #pylab.plot([best_idx_cost], [final_pred], "r+")
        pylab.axhline(y=final_pred, color="r")
        pylab.axvline(x=RANGE[best_idx_cost], color="r")
        pylab.axvline(x=1.0, color="g")
        
        pylab.ylabel(TARGET_MEASURE)
        pylab.xlabel(TARGET_PARAM)
        
        pylab.legend(("outer", "inner xval"), loc="best")
        # NOTE: param.wdk_degree is not set for the "Promoter" kernel above,
        # so this title line assumes a WDK-style parameter object
        pylab.title("mss:" + str(multi_split_set.id) + ", task:" + TARGET_TASK + ", degree:" + str(param.wdk_degree) + ", split:" + str(SPLIT_POINTER))
        
        pylab.show()
Example #22
    def __init__(self, degree, sigma, active_set, wdk_rbf_on):
        '''
        loads data into handler
        '''
        # NOTE: numpy, shogun_factory and Options (from helper) are assumed
        # to be imported at module level in the original file

        self.active_set = active_set

        fn = "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHCsequenzen/pseudo.txt"

        tmp_key = ""
        tmp_idx = 0

        self.seqs = []
        self.keys = []
        self.name_to_id = {}

        # parse file (">" header lines carry the key, other lines the sequence)
        for line in file(fn):

            if line.startswith(">"):
                tmp_key = line.strip()[1:]
            else:

                if active_set.count(tmp_key) > 0:

                    assert self.keys.count(tmp_key) == 0, "key %s is already contained in self.keys" % (tmp_key)

                    self.seqs.append(line.strip())
                    self.keys.append(tmp_key)
                    self.name_to_id[tmp_key] = tmp_idx

                    tmp_idx += 1

                    assert len(self.seqs) == tmp_idx, "incorrect number of sequences %i != %i" % (len(self.seqs), tmp_idx)
                    assert len(self.keys) == tmp_idx, "incorrect number of keys %i != %i" % (len(self.keys), tmp_idx)

        # setup kernel
        param = Options()

        if wdk_rbf_on:
            param.kernel = "WeightedDegreeRBFKernel"
        else:
            param.kernel = "WeightedDegreeStringKernel"
        param.wdk_degree = degree
        param.transform = sigma

        self.kernel = shogun_factory.create_kernel(self.seqs, param)

        #######################
        # compute kernel
        #######################

        num_tasks = len(self.seqs)

        self.similarity = numpy.zeros((num_tasks, num_tasks))

        for i in xrange(num_tasks):
            for j in xrange(num_tasks):
                self.similarity[i, j] = self.kernel.kernel(i, j)

        # normalize kernel
        my_min = numpy.min(self.similarity)
        my_max = numpy.max(self.similarity)
        my_diff = my_max - my_min

        # scale to interval [0,1]
        #self.similarity = (self.similarity - my_min) / my_diff
        self.similarity = (self.similarity) / my_max

        print self.similarity