labels[8] = 1
labels[19] = 1


feat = StringCharFeatures(DNA)
feat.set_features(examples)
wdk = WeightedDegreeStringKernel(feat, feat, 1)
lab = Labels(numpy.array(labels))


svm = SVMLight(1, wdk, lab)
svm.train()

svm.set_shrinking_enabled(False)

print "simple svm", svm.get_objective()

print "len(examples)", len(examples)

print "##############"


#print "##############"
#print "svm light"

#svm_light = SVMLight(1.0,wdk,lab)

#svm_light.train()


#print "svmlight objective", svm_light.get_objective()
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """

        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details))
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """

        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details))
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################
    # set up toy data and svm

    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                 axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    print("unserializing SVM")
    svm2 = load(fn)

    print("comparing objectives")

    svm2.train()

    print("objective before serialization:", svm.get_objective())
    print("objective after serialization:", svm2.get_objective())

    print("comparing predictions")

    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in xrange(len(out)):
        assert abs(out[i] - out2[i]) < 0.000001

    print("all checks passed.")

    return True
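A minimal driver for the example above, assuming a parameter_list of (num, dist, width, C) entries in the style of the other Shogun modular examples (the concrete values here are purely illustrative):

# hypothetical parameter list: (num, dist, width, C); values are illustrative only
parameter_list = [[20, 0.8, 1.5, 1.0]]

if __name__ == '__main__':
    print('Serialization SVMLight')
    serialization_svmlight_modular(*parameter_list[0])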
Example #3
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        ########################################################
        print "creating a kernel for each node:"
        ########################################################

        # init seq handler
        task_kernel = SequencesHandlerRbf(1, param.base_similarity,
                                          data.get_task_names(),
                                          param.flags["wdk_rbf_on"])
        similarities = numpy.zeros(
            (data.get_num_tasks(), data.get_num_tasks()))

        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                # convert similarity with simple transformation
                similarity = task_kernel.get_similarity(
                    task_name_lhs, task_name_rhs)

                print similarity

                print "similarity (%s,%s)=%f" % (task_name_lhs, task_name_rhs,
                                                 similarity)

                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

                # save for later
                similarities[data.name_to_id(task_name_lhs),
                             data.name_to_id(task_name_rhs)] = similarity

        # set normalizer
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()

        # set up svm
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        # normalize cost
        norm_c_pos = param.cost / float(len([l
                                             for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(
            len([l for l in data.labels if l == -1]))

        svm.set_C(norm_c_neg, norm_c_pos)

        # start training
        svm.train()

        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        #self.additional_information["distances"] = distances
        self.additional_information["similarities"] = similarities

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in data.get_task_names():

            task_num = data.name_to_id(task_name)

            # save svm and task_num
            svms[task_name] = (task_num, param, svm)

        return svms
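The cost normalization in _train above divides the shared cost C by the size of each class, so the rarer class receives a proportionally larger per-example penalty. A self-contained illustration with made-up labels:

# illustration of the per-class cost normalization used above (made-up labels)
labels = [1, 1, 1, -1, -1, -1, -1, -1, -1, -1]
cost = 1.0

norm_c_pos = cost / float(len([l for l in labels if l == 1]))   # 1.0 / 3
norm_c_neg = cost / float(len([l for l in labels if l == -1]))  # 1.0 / 7

# svm.set_C(norm_c_neg, norm_c_pos) then penalizes mistakes on the rarer
# positive class more heavily than mistakes on the abundant negative class
print norm_c_pos, norm_c_neg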
	#print pickle.dumps(kernel)
	#
	#print "svm"
	#print pickle.dumps(svm)
	#
	#print "#################################"

	fn = "serialized_svm.bz2"
	#print "serializing SVM to file", fn

	save(fn, svm)

	#print "#################################"

	#print "unserializing SVM"
	svm2 = load(fn)


	#print "#################################"
	#print "comparing training"

	svm2.train()

	#print "objective before serialization:", svm.get_objective()
	#print "objective after serialization:", svm2.get_objective()
	return svm, svm.get_objective(), svm2, svm2.get_objective()

if __name__=='__main__':
	print 'Serialization SVMLight'
	serialization_svmlight_modular(*parameter_list[0])
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """


        assert(param.base_similarity >= 1)
        
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)
        
        
        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)
        
        # load data
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
        f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt")
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")
        
        num_lines = int(f.readline().strip())
        task_distances = numpy.zeros((num_lines, num_lines))
        name_to_id = {}
        for (i, line) in enumerate(f):
            tokens = line.strip().split("\t")
            name = str(tokens[0])
            name_to_id[name] = i
            entry = numpy.array([v for (j,v) in enumerate(tokens) if j!=0])
            assert len(entry)==num_lines, "len_entry %i, num_lines %i" % (len(entry), num_lines)
            task_distances[i,:] = entry
            
        
        # cut relevant submatrix
        active_ids = [name_to_id[name] for name in data.get_task_names()] 
        tmp_distances = task_distances[active_ids, :]
        tmp_distances = tmp_distances[:, active_ids]
        print "distances ", tmp_distances.shape

        
        # normalize distances
        task_distances = task_distances / numpy.max(tmp_distances)
        
        
        similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))
                                
        
        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():
                
                
                # convert similarity with simple transformation
                similarity = param.base_similarity - task_distances[name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
                normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)
                
                # save for later
                similarities[data.name_to_id(task_name_lhs),data.name_to_id(task_name_rhs)] = similarity
                
                
        # set normalizer                
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()
        

        # set up svm
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        
        
        # normalize cost
        norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
        
        svm.set_C(norm_c_neg, norm_c_pos)
        
        
        # start training
        svm.train()


        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        #self.additional_information["distances"] = distances
        self.additional_information["similarities"] = similarities


        # wrap up predictors
        svms = {}
        
        # use a reference to the same svm several times
        for task_name in data.get_task_names():
            
            task_num = data.name_to_id(task_name)
            
            # save svm and task_num
            svms[task_name] = (task_num, param, svm)

        return svms
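The MHC distance file read above is not included here; from the parsing code, the expected layout appears to be a first line giving the number of tasks, followed by one tab-separated row per task (task name, then one distance per task). A small in-memory sketch with made-up values exercises the same parsing logic:

# made-up stand-in for the task-distance file parsed above
import numpy

lines = ["3",
         "taskA\t0.0\t1.0\t2.0",
         "taskB\t1.0\t0.0\t1.5",
         "taskC\t2.0\t1.5\t0.0"]

num_lines = int(lines[0].strip())
task_distances = numpy.zeros((num_lines, num_lines))
name_to_id = {}
for (i, line) in enumerate(lines[1:]):
    tokens = line.strip().split("\t")
    name_to_id[str(tokens[0])] = i
    task_distances[i, :] = [float(v) for v in tokens[1:]]

print name_to_id
print task_distances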
wdk_tree.set_normalizer(tree_normalizer)
wdk_tree.init_normalizer()

print "--->", wdk_tree.get_normalizer().get_name()

svm_tree = SVMLight(cost, wdk_tree, lab)
svm_tree.set_linadd_enabled(False)
svm_tree.set_batch_computation_enabled(False)

svm_tree.train()

del wdk_tree
del tree_normalizer

print "finished training tree-norm SVM:", svm_tree.get_objective()

wdk = shogun_factory.create_kernel(data.examples, param)
wdk.set_normalizer(normalizer)
wdk.init_normalizer()

print "--->", wdk.get_normalizer().get_name()

svm = SVMLight(cost, wdk, lab)
svm.set_linadd_enabled(False)
svm.set_batch_computation_enabled(False)

svm.train()

print "finished training manually set SVM:", svm.get_objective()
print "features"
print feats_train.to_string()

print "kernel"
print kernel.to_string()

print "svm"
print svm.to_string()

print "#################################"

fn = "serialized_svm.bz2"
print "serializing SVM to file", fn

save(fn, svm)

print "#################################"

print "unserializing SVM"
svm2 = load(fn)


print "#################################"
print "comparing training"

svm2.train()

print "objective before serialization:", svm.get_objective()
print "objective after serialization:", svm2.get_objective()

def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
		save object to file using pickle
		
		@param filename: name of destination file
		@type filename: str
		@param myobj: object to save (has to be pickleable)
		@type myobj: obj
		"""

        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details))
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
		Load from filename using pickle
		
		@param filename: name of file to load from
		@type filename: str
		"""

        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details))
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################

    seed(17)
    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                 axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################

    #print("labels:")
    #print(pickle.dumps(labels))
    #
    #print("features")
    #print(pickle.dumps(feats_train))
    #
    #print("kernel")
    #print(pickle.dumps(kernel))
    #
    #print("svm")
    #print(pickle.dumps(svm))
    #
    #print("#################################")

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)

    save(fn, svm)

    #print("#################################")

    #print("unserializing SVM")
    svm2 = load(fn)

    #print("#################################")
    #print("comparing training")

    svm2.train()

    #print("objective before serialization:", svm.get_objective())
    #print("objective after serialization:", svm2.get_objective())
    return svm, svm.get_objective(), svm2, svm2.get_objective()
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect


    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """

        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details))
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()



    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """

        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details))
            return

        myobj = pickle.load(f)
        f.close()
        return myobj


    ##################################################
    # set up toy data and svm

    traindata_real = concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
    testdata_real = concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    print("unserializing SVM")
    svm2 = load(fn)

    print("comparing objectives")

    svm2.train()

    print("objective before serialization:", svm.get_objective())
    print("objective after serialization:", svm2.get_objective())

    print("comparing predictions")

    out =  svm.apply(feats_test).get_labels()
    out2 =  svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in xrange(len(out)):
        assert abs(out[i] - out2[i]) < 0.000001

    print("all checks passed.")

    return True
Example #10
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
          
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)
        
        
        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        ########################################################
        print "creating a kernel for each node:"
        ########################################################

        
        # init seq handler 
        task_kernel = SequencesHandlerRbf(1, param.base_similarity, data.get_task_names(), param.flags["wdk_rbf_on"])
        similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))
        
        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():
                
                
                 
                
                # convert similarity with simple transformation
                similarity = task_kernel.get_similarity(task_name_lhs, task_name_rhs)
                
                print similarity
                
                print "similarity (%s,%s)=%f" % (task_name_lhs, task_name_rhs, similarity)
                
                normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)
                
                # save for later
                similarities[data.name_to_id(task_name_lhs),data.name_to_id(task_name_rhs)] = similarity
                
                
        # set normalizer                
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()
        

        # set up svm
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        
        
        # normalize cost
        norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
        
        svm.set_C(norm_c_neg, norm_c_pos)
        
        
        # start training
        svm.train()


        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        #self.additional_information["distances"] = distances
        self.additional_information["similarities"] = similarities


        # wrap up predictors
        svms = {}
        
        # use a reference to the same svm several times
        for task_name in data.get_task_names():
            
            task_num = data.name_to_id(task_name)
            
            # save svm and task_num
            svms[task_name] = (task_num, param, svm)

        return svms
Example #11
labels_presvm[15] = 1
labels_presvm[8] = 1
labels_presvm[19] = 1

feat_presvm = StringCharFeatures(DNA)
feat_presvm.set_features(examples_presvm)
wdk_presvm = WeightedDegreeStringKernel(feat_presvm, feat_presvm, 1)
lab_presvm = Labels(numpy.array(labels_presvm))

presvm = SVMLight(1, wdk_presvm, lab_presvm)
presvm.train()

presvm2 = LibSVM(1, wdk_presvm, lab_presvm)
presvm2.train()

print "svmlight", presvm.get_objective()
print "libsvm", presvm2.get_objective()

assert (abs(presvm.get_objective() - presvm2.get_objective()) <= 0.001)

print "simple svm", presvm.get_objective()

print "len(examples_presvm)", len(examples_presvm)

print "##############"

#############################################
#    compute linear term manually
#############################################

examples = [i.example for i in d[subset_size:subset_size * 2]]
        else:
            normalizer.set_task_similarity(i,j, 1.0)


base_wdk.set_normalizer(normalizer)

print base_wdk.get_kernel_matrix()
print "--->",base_wdk.get_normalizer().get_name()

svm = SVMLight(1, base_wdk, lab)
svm.set_linadd_enabled(False)
svm.set_batch_computation_enabled(False)

svm.train(feat)

print "interally modified kernel. objective:", svm.get_objective()



##################################################################
# regular SVM
##################################################################


wdk = WeightedDegreeStringKernel(feat, feat, 1)

normalizer = IdentityKernelNormalizer()
wdk.set_normalizer(normalizer)

svm = SVMLight(1, wdk, lab)
svm.set_linadd_enabled(False)
def test_data():

    ##################################################################
    # select MSS
    ##################################################################

    mss = expenv.MultiSplitSet.get(379)

    ##################################################################
    # data
    ##################################################################

    # fetch data
    instance_set = mss.get_train_data(-1)

    # prepare data
    data = PreparedMultitaskData(instance_set, shuffle=True)

    # set parameters
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 4
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.freeze()

    ##################################################################
    # taxonomy
    ##################################################################

    taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)

    support = numpy.linspace(0, 100, 4)

    distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]]

    # create tree normalizer
    tree_normalizer = MultitaskKernelPlifNormalizer(support,
                                                    data.task_vector_names)

    task_names = data.get_task_names()

    FACTOR = 1.0

    # init gamma matrix
    gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    for t1_name in task_names:
        for t2_name in task_names:

            similarity = taxonomy.compute_node_similarity(
                taxonomy.get_id(t1_name), taxonomy.get_id(t2_name))
            gammas[data.name_to_id(t1_name),
                   data.name_to_id(t2_name)] = similarity

    helper.save("/tmp/gammas", gammas)

    gammas = gammas * FACTOR

    cost = param.cost * numpy.sqrt(FACTOR)

    print gammas

    ##########
    # regular normalizer

    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    for t1_name in task_names:
        for t2_name in task_names:

            similarity = gammas[data.name_to_id(t1_name),
                                data.name_to_id(t2_name)]
            normalizer.set_task_similarity(data.name_to_id(t1_name),
                                           data.name_to_id(t2_name),
                                           similarity)

    ##################################################################
    # Train SVMs
    ##################################################################

    # create shogun objects
    wdk_tree = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    wdk_tree.set_normalizer(tree_normalizer)
    wdk_tree.init_normalizer()

    print "--->", wdk_tree.get_normalizer().get_name()

    svm_tree = SVMLight(cost, wdk_tree, lab)
    svm_tree.set_linadd_enabled(False)
    svm_tree.set_batch_computation_enabled(False)

    svm_tree.train()

    del wdk_tree
    del tree_normalizer

    print "finished training tree-norm SVM:", svm_tree.get_objective()

    wdk = shogun_factory.create_kernel(data.examples, param)
    wdk.set_normalizer(normalizer)
    wdk.init_normalizer()

    print "--->", wdk.get_normalizer().get_name()

    svm = SVMLight(cost, wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    svm.train()

    print "finished training manually set SVM:", svm.get_objective()

    alphas_tree = svm_tree.get_alphas()
    alphas = svm.get_alphas()

    assert (len(alphas_tree) == len(alphas))

    for i in xrange(len(alphas)):
        assert (abs(alphas_tree[i] - alphas[i]) < 0.0001)

    print "success: all alphas are the same"
Example #14
    #print pickle.dumps(kernel)
    #
    #print "svm"
    #print pickle.dumps(svm)
    #
    #print "#################################"

    fn = "serialized_svm.bz2"
    #print "serializing SVM to file", fn

    save(fn, svm)

    #print "#################################"

    #print "unserializing SVM"
    svm2 = load(fn)

    #print "#################################"
    #print "comparing training"

    svm2.train()

    #print "objective before serialization:", svm.get_objective()
    #print "objective after serialization:", svm2.get_objective()
    return svm, svm.get_objective(), svm2, svm2.get_objective()


if __name__ == '__main__':
    print 'Serialization SVMLight'
    serialization_svmlight_modular(*parameter_list[0])
Example #15
    def _inner_train(self, train_data, param):
        """
        perform inner training by processing the tree
        """

        data_keys = []
        # top-down processing of taxonomy


        classifiers = []
        classifier_at_node = {}

        root = param.taxonomy.data

        grey_nodes = [root]
        
        while len(grey_nodes)>0:
           
            node = grey_nodes.pop(0) # pop first item
            
            # enqueue children
            if node.children != None:
                grey_nodes.extend(node.children)
    

    
            #####################################################
            #     init data structures
            #####################################################

            # get data below current node
            data = [train_data[key] for key in node.get_data_keys()]
            
            data_keys.append(node.get_data_keys())
    
            print "data at current level"
            for instance_set in data:        
                print instance_set[0].dataset
            
            
            # initialize containers
            examples = []
            labels = []       
    

            # concatenate data
            for instance_set in data:
      
                print "train split_set:", instance_set[0].dataset.organism
                
                for inst in instance_set:
                    examples.append(inst.example)
                    labels.append(inst.label)
    

            # create shogun data objects
            k = shogun_factory.create_kernel(examples, param)
            lab = shogun_factory.create_labels(labels)


            #####################################################
            #    train weak learners    
            #####################################################
            
            cost = param.cost
            
            # set up svm
            svm = SVMLight(cost, k, lab)
                        
            if param.flags["normalize_cost"]:
                # set class-specific Cs
                norm_c_pos = param.cost / float(len([l for l in labels if l==1]))
                norm_c_neg = param.cost / float(len([l for l in labels if l==-1]))
                svm.set_C(norm_c_neg, norm_c_pos)

                print "using cost: negative class=%f, positive class=%f" % (norm_c_neg, norm_c_pos)
            
            # enable output
            svm.io.enable_progress()
            svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
            
            # train
            svm.train()
            
            # append svm object
            classifiers.append(svm)
            classifier_at_node[node.name] = svm                            
            
            # save some information
            self.additional_information[node.name + " svm obj"] = svm.get_objective()
            self.additional_information[node.name + " svm num sv"] = svm.get_num_support_vectors()
            self.additional_information[node.name + " runtime"] = svm.get_runtime()


        return (classifiers, classifier_at_node)
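The while-loop in _inner_train above is a plain breadth-first traversal of the taxonomy: nodes are taken from the front of grey_nodes and their children appended to the back, so parents are always processed before their descendants. A minimal sketch with a hypothetical Node class standing in for the real taxonomy objects (param.taxonomy.data):

# breadth-first traversal sketch; Node is a made-up stand-in for the taxonomy nodes
class Node(object):
    def __init__(self, name, children=None):
        self.name = name
        self.children = children

root = Node("root", [Node("A", [Node("A1"), Node("A2")]), Node("B")])

grey_nodes = [root]
visited = []
while len(grey_nodes) > 0:
    node = grey_nodes.pop(0)              # pop first item (FIFO queue)
    if node.children != None:
        grey_nodes.extend(node.children)  # enqueue children
    visited.append(node.name)

print visited   # ['root', 'A', 'B', 'A1', 'A2']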
def serialization_svmlight_modular(num, dist, width, C):
	from shogun.IO import MSG_DEBUG
	from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
	from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
	from shogun.Classifier import SVMLight
	from numpy import concatenate, ones
	from numpy.random import randn, seed

	import sys
	import types
	import random
	import bz2
	try:
		import cPickle as pickle
	except ImportError:
		import pickle as pickle	
	import inspect


	def save(filename, myobj):
		"""
		save object to file using pickle
		
		@param filename: name of destination file
		@type filename: str
		@param myobj: object to save (has to be pickleable)
		@type myobj: obj
		"""

		try:
			f = bz2.BZ2File(filename, 'wb')
		except IOError as details:
			sys.stderr.write('File ' + filename + ' cannot be written\n')
			sys.stderr.write(str(details))
			return

		pickle.dump(myobj, f, protocol=2)
		f.close()



	def load(filename):
		"""
		Load from filename using pickle
		
		@param filename: name of file to load from
		@type filename: str
		"""
		
		try:
			f = bz2.BZ2File(filename, 'rb')
		except IOError as details:
			sys.stderr.write('File ' + filename + ' cannot be read\n')
			sys.stderr.write(str(details))
			return

		myobj = pickle.load(f)
		f.close()
		return myobj


	##################################################

	seed(17)
	traindata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
	testdata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)

	trainlab=concatenate((-ones(num), ones(num)))
	testlab=concatenate((-ones(num), ones(num)))

	feats_train=RealFeatures(traindata_real)
	feats_test=RealFeatures(testdata_real)
	kernel=GaussianKernel(feats_train, feats_train, width)
	#kernel.io.set_loglevel(MSG_DEBUG)

	labels=BinaryLabels(trainlab)

	svm=SVMLight(C, kernel, labels)
	svm.train()
	#svm.io.set_loglevel(MSG_DEBUG)

	##################################################

	#print("labels:")
	#print(pickle.dumps(labels))
	#
	#print("features")
	#print(pickle.dumps(feats_train))
	#
	#print("kernel")
	#print(pickle.dumps(kernel))
	#
	#print("svm")
	#print(pickle.dumps(svm))
	#
	#print("#################################")

	fn = "serialized_svm.bz2"
	#print("serializing SVM to file", fn)

	save(fn, svm)

	#print("#################################")

	#print("unserializing SVM")
	svm2 = load(fn)


	#print("#################################")
	#print("comparing training")

	svm2.train()

	#print("objective before serialization:", svm.get_objective())
	#print("objective after serialization:", svm2.get_objective())
	return svm, svm.get_objective(), svm2, svm2.get_objective()
labels_presvm[19] = 1


feat_presvm = StringCharFeatures(DNA)
feat_presvm.set_features(examples_presvm)
wdk_presvm = WeightedDegreeStringKernel(feat_presvm, feat_presvm, 1)
lab_presvm = Labels(numpy.array(labels_presvm))


presvm = SVMLight(1, wdk_presvm, lab_presvm)
presvm.train()

presvm2 = LibSVM(1, wdk_presvm, lab_presvm)
presvm2.train()

print "svmlight", presvm.get_objective()
print "libsvm", presvm2.get_objective()

assert(abs(presvm.get_objective() - presvm2.get_objective())<= 0.001)

print "simple svm", presvm.get_objective()

print "len(examples_presvm)", len(examples_presvm)

print "##############"


#############################################
#    compute linear term manually
#############################################
def test_data():
    
    ##################################################################
    # select MSS
    ##################################################################
    
    mss = expenv.MultiSplitSet.get(379)
    
    
    
    ##################################################################
    # data
    ##################################################################
    
    # fetch data
    instance_set = mss.get_train_data(-1)
    
    # prepare data
    data = PreparedMultitaskData(instance_set, shuffle=True)
    
    # set parameters
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 4
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.freeze()
    
    
    
    
    ##################################################################
    # taxonomy
    ##################################################################
    
    
    taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)
    
    
    support = numpy.linspace(0, 100, 4)
    
    
    distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]]
    
    # create tree normalizer 
    tree_normalizer = MultitaskKernelPlifNormalizer(support, data.task_vector_names)
    
    
    
    
    task_names = data.get_task_names()
    
    
    FACTOR = 1.0
    
    
    # init gamma matrix
    gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))
    
    for t1_name in task_names:
        for t2_name in task_names:
            
            similarity = taxonomy.compute_node_similarity(taxonomy.get_id(t1_name), taxonomy.get_id(t2_name))        
            gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] = similarity
    
    helper.save("/tmp/gammas", gammas)
    
    
    gammas = gammas * FACTOR
    
    cost = param.cost * numpy.sqrt(FACTOR) 
    
    print gammas
    
    
    ##########
    # regular normalizer
    
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)
    
    for t1_name in task_names:
        for t2_name in task_names:
                    
            similarity = gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)]
            normalizer.set_task_similarity(data.name_to_id(t1_name), data.name_to_id(t2_name), similarity)
    
                
    ##################################################################
    # Train SVMs
    ##################################################################
    
    # create shogun objects
    wdk_tree = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)
    
    wdk_tree.set_normalizer(tree_normalizer)
    wdk_tree.init_normalizer()
    
    print "--->",wdk_tree.get_normalizer().get_name()
    
    svm_tree = SVMLight(cost, wdk_tree, lab)
    svm_tree.set_linadd_enabled(False)
    svm_tree.set_batch_computation_enabled(False)
    
    svm_tree.train()
    
    del wdk_tree
    del tree_normalizer
    
    print "finished training tree-norm SVM:", svm_tree.get_objective()
    
    
    wdk = shogun_factory.create_kernel(data.examples, param)
    wdk.set_normalizer(normalizer)
    wdk.init_normalizer()
    
    print "--->",wdk.get_normalizer().get_name()
    
    svm = SVMLight(cost, wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)
    
    svm.train()
    
    print "finished training manually set SVM:", svm.get_objective()
    
    
    alphas_tree = svm_tree.get_alphas()
    alphas = svm.get_alphas()
    
    assert(len(alphas_tree)==len(alphas))
    
    for i in xrange(len(alphas)):
        assert(abs(alphas_tree[i] - alphas[i]) < 0.0001)
        
    print "success: all alphas are the same"
Example #19
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        assert (param.base_similarity >= 1)

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        # load data
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
        f = file(
            "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt"
        )
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")

        num_lines = int(f.readline().strip())
        task_distances = numpy.zeros((num_lines, num_lines))
        name_to_id = {}
        for (i, line) in enumerate(f):
            tokens = line.strip().split("\t")
            name = str(tokens[0])
            name_to_id[name] = i
            entry = numpy.array([v for (j, v) in enumerate(tokens) if j != 0])
            assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (
                len(entry), num_lines)
            task_distances[i, :] = entry

        # cut relevant submatrix
        active_ids = [name_to_id[name] for name in data.get_task_names()]
        tmp_distances = task_distances[active_ids, :]
        tmp_distances = tmp_distances[:, active_ids]
        print "distances ", tmp_distances.shape

        # normalize distances
        task_distances = task_distances / numpy.max(tmp_distances)

        similarities = numpy.zeros(
            (data.get_num_tasks(), data.get_num_tasks()))

        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                # convert similarity with simple transformation
                similarity = param.base_similarity - task_distances[
                    name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

                # save for later
                similarities[data.name_to_id(task_name_lhs),
                             data.name_to_id(task_name_rhs)] = similarity

        # set normalizer
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()

        # set up svm
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        # normalize cost
        norm_c_pos = param.cost / float(len([l
                                             for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(
            len([l for l in data.labels if l == -1]))

        svm.set_C(norm_c_neg, norm_c_pos)

        # start training
        svm.train()

        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        #self.additional_information["distances"] = distances
        self.additional_information["similarities"] = similarities

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in data.get_task_names():

            task_num = data.name_to_id(task_name)

            # save svm and task_num
            svms[task_name] = (task_num, param, svm)

        return svms
wdk_tree.set_normalizer(tree_normalizer)
wdk_tree.init_normalizer()

print "--->",wdk_tree.get_normalizer().get_name()

svm_tree = SVMLight(cost, wdk_tree, lab)
svm_tree.set_linadd_enabled(False)
svm_tree.set_batch_computation_enabled(False)

svm_tree.train()

del wdk_tree
del tree_normalizer

print "finished training tree-norm SVM:", svm_tree.get_objective()


wdk = shogun_factory.create_kernel(data.examples, param)
wdk.set_normalizer(normalizer)
wdk.init_normalizer()

print "--->",wdk.get_normalizer().get_name()

svm = SVMLight(cost, wdk, lab)
svm.set_linadd_enabled(False)
svm.set_batch_computation_enabled(False)

svm.train()

print "finished training manually set SVM:", svm.get_objective()
Example #21
            normalizer.set_task_similarity(i, j, 4.0)
        else:
            normalizer.set_task_similarity(i, j, 1.0)

base_wdk.set_normalizer(normalizer)

print base_wdk.get_kernel_matrix()
print "--->", base_wdk.get_normalizer().get_name()

svm = SVMLight(1, base_wdk, lab)
svm.set_linadd_enabled(False)
svm.set_batch_computation_enabled(False)

svm.train(feat)

print "interally modified kernel. objective:", svm.get_objective()

##################################################################
# regular SVM
##################################################################

wdk = WeightedDegreeStringKernel(feat, feat, 1)

normalizer = IdentityKernelNormalizer()
wdk.set_normalizer(normalizer)

svm = SVMLight(1, wdk, lab)
svm.set_linadd_enabled(False)
svm.train()

#print "unmodified svm. objective:", svm.get_objective()