def _union_train(self, prepared_data, param):
        """
        perform inner training by processing the tree
        """

        normalizer = MultitaskKernelNormalizer(prepared_data.task_vector_nums)

        # set similarity
        for task_name_lhs in prepared_data.get_task_names():
            for task_name_rhs in prepared_data.get_task_names():

                similarity = 1.0

                normalizer.set_task_similarity(
                    prepared_data.name_to_id(task_name_lhs),
                    prepared_data.name_to_id(task_name_rhs), similarity)

        lab = shogun_factory.create_labels(prepared_data.labels)

        print "creating empty kernel"
        kernel = shogun_factory.create_kernel(prepared_data.examples, param)

        print "setting normalizer"
        kernel.set_normalizer(normalizer)
        kernel.init_normalizer()

        svm = shogun_factory.create_svm(param, kernel, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        # train SVM
        svm.train()

        return svm
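# A minimal numpy sketch (not part of the original class) of what the
# MultitaskKernelNormalizer computes: each base kernel entry K[i, j] is
# rescaled by the similarity gamma[task_i, task_j] of the tasks the two
# examples belong to. With all similarities set to 1.0, as in _union_train
# above, the multitask kernel reduces to plain pooling of all tasks.
import numpy

def multitask_kernel(K_base, task_vector, gamma):
    # K_mt[i, j] = gamma[task_i, task_j] * K_base[i, j]
    t = numpy.array(task_vector)
    return gamma[numpy.ix_(t, t)] * K_base

K_base = numpy.array([[1.0, 0.5], [0.5, 1.0]])
task_vector = [0, 1]              # example i belongs to task task_vector[i]
gamma = numpy.ones((2, 2))        # union case: every task pair fully similar

assert numpy.allclose(multitask_kernel(K_base, task_vector, gamma), K_base)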
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        for task_id in train_data.keys():
            print "task_id:", task_id

        root = param.taxonomy.data

        # process each leaf against its nearest neighbor in the taxonomy
        for node in root.get_leaves():

            #####################################################
            #    train parent predictor
            #####################################################

            parent_node = node.get_nearest_neighbor()

            cost = param.cost

            (examples, labels) = self.get_data(parent_node, train_data)

            # create shogun data objects
            k_parent = shogun_factory_new.create_kernel(examples, param)
            lab_parent = shogun_factory_new.create_labels(labels)

            parent_svm = SVMLight(cost, k_parent, lab_parent)

            parent_svm.train()

            #####################################################
            #    train leaf predictor (regularized towards parent)
            #####################################################

            (examples, labels) = self.get_data(node, train_data)

            # create shogun data objects
            k = shogun_factory_new.create_kernel(examples, param)
            lab = shogun_factory_new.create_labels(labels)

            # regularize vs parent predictor

            weight = param.transform
            print "current edge_weight:", weight, " ,name:", node.name

            svm = DomainAdaptationSVM(cost, k, lab, parent_svm, weight)
            svm.train()

            # attach svm to node
            node.predictor = svm

        #####################################################
        #    Wrap things up
        #####################################################

        # wrap up predictors for later use
        predictors = {}

        for leaf in root.get_leaves():

            predictors[leaf.name] = leaf.predictor

            assert leaf.predictor is not None

        sym_diff_keys = set(train_data.keys()).symmetric_difference(set(predictors.keys()))
        assert len(sym_diff_keys) == 0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)

        return predictors
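# Hedged sketch, assuming the usual domain adaptation SVM formulation: the
# child predictor trained above is regularized towards the parent, and at
# prediction time the parent's decision values enter weighted by the transfer
# parameter B (param.transform here). f_child/f_parent are placeholders for
# per-example decision values.
import numpy

def da_decision(f_child, f_parent, B):
    # combined decision value: child output plus B-weighted parent output
    return numpy.asarray(f_child) + B * numpy.asarray(f_parent)

print da_decision([0.2, -0.5], [1.0, -1.0], 0.5)    # -> [ 0.7 -1. ]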
Example #4
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """


        assert param.base_similarity >= 1

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)
        
        # load data
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
        f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt")
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")
        
        num_lines = int(f.readline().strip())
        task_distances = numpy.zeros((num_lines, num_lines))
        name_to_id = {}

        for (i, line) in enumerate(f):
            tokens = line.strip().split("\t")
            name = str(tokens[0])
            name_to_id[name] = i
            # skip the name column and parse the distance entries as floats
            entry = numpy.array([float(v) for (j, v) in enumerate(tokens) if j != 0])
            assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (len(entry), num_lines)
            task_distances[i, :] = entry

        
        # cut relevant submatrix
        active_ids = [name_to_id[name] for name in data.get_task_names()] 
        tmp_distances = task_distances[active_ids, :]
        tmp_distances = tmp_distances[:, active_ids]
        print "distances ", tmp_distances.shape

        
        # normalize distances
        task_distances = task_distances / numpy.max(tmp_distances)
        
        
        similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))
                                
        
        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():
                
                
                # convert distance to similarity with a simple linear transformation
                similarity = param.base_similarity - task_distances[name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
                normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)
                
                # save for later
                similarities[data.name_to_id(task_name_lhs),data.name_to_id(task_name_rhs)] = similarity
                
                
        # set normalizer
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()

        # set up svm
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        
        
        # normalize cost by class size
        norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))

        svm.set_C(norm_c_neg, norm_c_pos)
        
        
        # start training
        svm.train()


        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        #self.additional_information["distances"] = distances
        self.additional_information["similarities"] = similarities


        # wrap up predictors
        svms = {}
        
        # use a reference to the same svm several times
        for task_name in data.get_task_names():
            
            task_num = data.name_to_id(task_name)
            
            # save svm and task_num
            svms[task_name] = (task_num, param, svm)

        return svms
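# Toy illustration (made-up values) of the distance-to-similarity transform in
# the method above: distances are scaled by the maximum over the active tasks,
# then similarity = base_similarity - distance.
import numpy

task_distances = numpy.array([[0.0, 2.0], [2.0, 4.0]])
base_similarity = 2.0

task_distances = task_distances / numpy.max(task_distances)   # scale to [0, 1]
similarities = base_similarity - task_distances

print similarities    # [[ 2.   1.5], [ 1.5  1. ]]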
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # support
        support = numpy.linspace(0, 1, 5)

        # set normalizer
        normalizer = MultitaskKernelPlifNormalizer(support,
                                                   data.task_vector_nums)

        # fetch taxonomy from parameter object
        taxonomy = param.taxonomy.data

        # debugging aid: render the taxonomy and open the plot in a viewer
        taxonomy.plot()
        import os
        os.system("evince demo.png &")

        # compute distances
        distances = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

        for (i, task_name_lhs) in enumerate(data.get_task_names()):
            for (j, task_name_rhs) in enumerate(data.get_task_names()):

                distances[i, j] = task_similarities.compute_hop_distance(
                    taxonomy, task_name_lhs, task_name_rhs)

        # normalize distances
        distances = distances / numpy.max(distances)

        # set distances
        for (i, task_name_lhs) in enumerate(data.get_task_names()):
            for (j, task_name_rhs) in enumerate(data.get_task_names()):

                normalizer.set_task_distance(i, j, distances[i, j])

        # assign normalizer
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()

        svm = None

        debug_weights = {}

        num_subk = base_wdk.get_num_subkernels()

        print "num subkernels:", num_subk

        #print "subkernel weights:", base_wdk.get_subkernel_weights()

        debug_weights["before"] = [
            normalizer.get_beta(i) for i in range(num_subk)
        ]

        print "using MKL:", (param.transform >= 1.0)

        if param.transform >= 1.0:

            num_threads = 4

            svm = MKLClassification()

            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #GLPK) #DIRECT) #NEWTON)#ST_CPLEX)

            svm.set_C(param.cost, param.cost)

            svm.set_kernel(base_wdk)
            svm.set_labels(lab)

            svm.parallel.set_num_threads(num_threads)
            svm.set_linadd_enabled(False)
            svm.set_batch_computation_enabled(False)

            svm.train()

            #print "subkernel weights (after):", base_wdk.get_subkernel_weights()

        else:

            # create SVM (disable unsupported optimizations)
            svm = SVMLight(param.cost, base_wdk, lab)
            svm.set_linadd_enabled(False)
            svm.set_batch_computation_enabled(False)

            svm.train()

        print "svm objective:", svm.get_objective()

        debug_weights["after"] = [
            normalizer.get_beta(i) for i in range(num_subk)
        ]

        # debugging output
        print "debug weights (before/after):"
        print debug_weights["before"]
        print debug_weights["after"]
        print ""

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (svm, data.name_to_id(task_name))

        return svms
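# Sketch of the idea behind MultitaskKernelPlifNormalizer, assuming it
# evaluates a piecewise linear function (PLIF) of the normalized task distance
# over the given support points, with the beta values at those points learned
# by MKL. numpy.interp plays the role of the PLIF; the betas are illustrative.
import numpy

support = numpy.linspace(0, 1, 5)                  # as in _train above
betas = numpy.array([1.0, 0.8, 0.5, 0.2, 0.1])     # hypothetical learned weights

def plif_similarity(distance):
    # piecewise linear interpolation between (support[k], betas[k]) points
    return numpy.interp(distance, support, betas)

print plif_similarity(0.0), plif_similarity(0.6), plif_similarity(1.0)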
Example #9
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
                
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        kernel_matrix = base_wdk.get_kernel_matrix()
        lab = shogun_factory.create_labels(data.labels)
        

        # fetch taxonomy from parameter object
        taxonomy = param.taxonomy.data

        # create name to leaf map
        nodes = taxonomy.get_all_nodes()


        ########################################################
        print "creating a kernel for each node:"
        ########################################################


        # assemble combined kernel
        from shogun.Kernel import CombinedKernel, CustomKernel
        
        combined_kernel = CombinedKernel()
        
        # indicator to which task each example belongs
        task_vector = data.task_vector_names
        
        for node in nodes:
            
            print "creating kernel for ", node.name
            
            # fetch sub-tree
            leaf_names = [leaf.name for leaf in node.get_leaves()]
            
            print "masking all entries other than:", leaf_names
            
            # init matrix
            kernel_matrix_node = numpy.zeros(kernel_matrix.shape)
            
            # fill matrix for node
            for (i, task_lhs) in enumerate(task_vector):
                for (j, task_rhs) in enumerate(task_vector):
                    
                    # only copy values, if both tasks are present in subtree
                    if task_lhs in leaf_names and task_rhs in leaf_names:
                        kernel_matrix_node[i,j] = kernel_matrix[i,j]
                    
            # create custom kernel
            kernel_node = CustomKernel()
            kernel_node.set_full_kernel_matrix_from_full(kernel_matrix_node)
            
            
            # append custom kernel to CombinedKernel
            combined_kernel.append_kernel(kernel_node)                
            
            print "------"
        

        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None

        print "using MKL:", (param.transform >= 1.0)

        if param.transform >= 1.0:

            num_threads = 4

            svm = MKLClassification()

            svm.set_mkl_norm(param.transform)
            svm.set_solver_type(ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX)
        
            svm.set_C(param.cost, param.cost)
            
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)
            
            svm.parallel.set_num_threads(num_threads)
            #svm.set_linadd_enabled(False)
            #svm.set_batch_computation_enabled(False)
            
            svm.train()
        
            print "subkernel weights (after):", combined_kernel.get_subkernel_weights()    
            
        else:
            
            # create SVM (disable unsupported optimizations)
            svm = SVMLight(param.cost, combined_kernel, lab)
            svm.set_linadd_enabled(False)
            svm.set_batch_computation_enabled(False)
            
            svm.train()


        ########################################################
        print "svm objective:"
        print svm.get_objective()
        ########################################################
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_id in train_data.keys():
            svms[task_id] = svm


        return svms
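# Compact numpy version (illustrative values) of the per-node masking loop
# above: a kernel entry survives only if both examples belong to tasks that
# are leaves of the current node's subtree.
import numpy

kernel_matrix = numpy.array([[2.0, 1.0, 0.5],
                             [1.0, 2.0, 0.3],
                             [0.5, 0.3, 2.0]])
task_vector = ["t1", "t1", "t2"]
leaf_names = ["t1"]                # leaves below the current node

in_subtree = numpy.array([t in leaf_names for t in task_vector])
mask = numpy.outer(in_subtree, in_subtree)
kernel_matrix_node = numpy.where(mask, kernel_matrix, 0.0)

print kernel_matrix_node           # only the t1/t1 block is non-zero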
Example #10
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """


        assert(param.base_similarity >= 1)
        
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)
        
        
        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)


        # create normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        # load hard-coded task-similarity
        task_similarity = helper.load("/fml/ag-raetsch/home/cwidmer/svn/projects/alt_splice_code/src/task_sim_tis.bz2")


        # set similarity
        similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))
        
        for (i, task_name_lhs) in enumerate(data.get_task_names()):
            
            #max_value_row = max(task_similarity.get_row(task_name_lhs))
            max_value_row = 1.0
            
            for (j, task_name_rhs) in enumerate(data.get_task_names()):
                
                similarity = task_similarity.get_value(task_name_lhs, task_name_rhs) / max_value_row
                normalizer.set_task_similarity(i, j, similarity)
                similarities[i,j] = similarity
                
        
        pprint.pprint(similarities)
        
        # set normalizer
        #print "WARNING MTK disabled!!!!!!!!!!!!!!!!!!!!!"                
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()
        
        
        # set up svm
        param.flags["svm_type"] = "svmlight" #fix svm type
        
        svm = shogun_factory.create_svm(param, base_wdk, lab)
        
        # make sure these parameters are set correctly
        #print "WARNING MTK WONT WORK WITH THESE SETTINGS!!!!!!!!!!!!!!!!!!!!!"
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        

        assert svm.get_linadd_enabled() == False, "linadd should be disabled"
        assert svm.get_batch_computation_enabled() == False, "batch compute should be disabled"
        
        # start training
        svm.train()
        
        
        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        #self.additional_information["distances"] = distances
        self.additional_information["similarities"] = similarities
        
        
        # wrap up predictors
        svms = {}
        
        # use a reference to the same svm several times
        for task_name in data.get_task_names():
            
            task_num = data.name_to_id(task_name)
            
            # save svm and task_num
            svms[task_name] = (task_num, svm)

        return svms
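# Sketch of the row-wise normalization hinted at by the commented-out
# max_value_row computation above (toy values): each row of the similarity
# matrix is divided by its own maximum, so every row peaks at 1.0.
import numpy

similarities = numpy.array([[4.0, 2.0], [1.0, 5.0]])
row_normalized = similarities / similarities.max(axis=1, keepdims=True)
print row_normalized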
def test_data():

    ##################################################################
    # select MSS
    ##################################################################

    mss = expenv.MultiSplitSet.get(379)

    ##################################################################
    # data
    ##################################################################

    # fetch data
    instance_set = mss.get_train_data(-1)

    # prepare data
    data = PreparedMultitaskData(instance_set, shuffle=True)

    # set parameters
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 4
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.freeze()

    ##################################################################
    # taxonomy
    ##################################################################

    taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)

    support = numpy.linspace(0, 100, 4)

    # example hop distances between the four tasks (not used further below)
    distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]]

    # create tree normalizer
    tree_normalizer = MultitaskKernelPlifNormalizer(support,
                                                    data.task_vector_names)

    task_names = data.get_task_names()

    FACTOR = 1.0

    # init gamma matrix
    gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    for t1_name in task_names:
        for t2_name in task_names:

            similarity = taxonomy.compute_node_similarity(
                taxonomy.get_id(t1_name), taxonomy.get_id(t2_name))
            gammas[data.name_to_id(t1_name),
                   data.name_to_id(t2_name)] = similarity

    helper.save("/tmp/gammas", gammas)

    gammas = gammas * FACTOR

    cost = param.cost * numpy.sqrt(FACTOR)

    print gammas

    ##########
    # regular normalizer

    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    for t1_name in task_names:
        for t2_name in task_names:

            similarity = gammas[data.name_to_id(t1_name),
                                data.name_to_id(t2_name)]
            normalizer.set_task_similarity(data.name_to_id(t1_name),
                                           data.name_to_id(t2_name),
                                           similarity)

    ##################################################################
    # Train SVMs
    ##################################################################

    # create shogun objects
    wdk_tree = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    wdk_tree.set_normalizer(tree_normalizer)
    wdk_tree.init_normalizer()

    print "--->", wdk_tree.get_normalizer().get_name()

    svm_tree = SVMLight(cost, wdk_tree, lab)
    svm_tree.set_linadd_enabled(False)
    svm_tree.set_batch_computation_enabled(False)

    svm_tree.train()

    del wdk_tree
    del tree_normalizer

    print "finished training tree-norm SVM:", svm_tree.get_objective()

    wdk = shogun_factory.create_kernel(data.examples, param)
    wdk.set_normalizer(normalizer)
    wdk.init_normalizer()

    print "--->", wdk.get_normalizer().get_name()

    svm = SVMLight(cost, wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    svm.train()

    print "finished training manually set SVM:", svm.get_objective()

    alphas_tree = svm_tree.get_alphas()
    alphas = svm.get_alphas()

    assert (len(alphas_tree) == len(alphas))

    for i in xrange(len(alphas)):
        assert (abs(alphas_tree[i] - alphas[i]) < 0.0001)

    print "success: all alphas are the same"
    def _inner_train(self, prepared_data, param):
        """
        train the inner classifiers (multitask-kernel SVM and dirac SVM)
        """

        # init seq handler

        classifiers = []

        #################
        # mtk
        normalizer = MultitaskKernelNormalizer(prepared_data.task_vector_nums)

        from method_mhc_rbf import SequencesHandlerRbf
        task_kernel = SequencesHandlerRbf(1, param.base_similarity,
                                          prepared_data.get_task_names(),
                                          param.flags["wdk_rbf_on"])

        # set similarity
        for task_name_lhs in prepared_data.get_task_names():
            for task_name_rhs in prepared_data.get_task_names():

                similarity = task_kernel.get_similarity(
                    task_name_lhs, task_name_rhs)

                normalizer.set_task_similarity(
                    prepared_data.name_to_id(task_name_lhs),
                    prepared_data.name_to_id(task_name_rhs), similarity)

        lab = shogun_factory.create_labels(prepared_data.labels)

        print "creating empty kernel"
        kernel = shogun_factory.create_kernel(prepared_data.examples, param)

        print "setting normalizer"
        kernel.set_normalizer(normalizer)
        kernel.init_normalizer()

        svm = shogun_factory.create_svm(param, kernel, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        # train SVM
        svm.train()

        classifiers.append(svm)

        #################
        # dirac
        #import pdb
        #pdb.set_trace()

        svm_dirac = self._dirac_train(prepared_data, param)

        classifiers.append(svm_dirac)

        ##
        #union

        #svm_union = self._union_train(prepared_data, param)

        #classifiers.append(svm_union)

        return classifiers
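# Hedged sketch (not from the original code) of how the classifier list
# returned by _inner_train might be combined at prediction time: average the
# per-example decision values of the individual SVMs.
import numpy

def combine_decisions(list_of_decision_values):
    # simple unweighted ensemble: mean decision value per example
    return numpy.mean(numpy.array(list_of_decision_values), axis=0)

print combine_decisions([[0.5, -1.0], [0.1, -0.4]])    # -> [ 0.3 -0.7]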
Example #16
    def __init__(self, degree, sigma, active_set, wdk_rbf_on):
        '''
        loads data into handler
        '''

        self.active_set = active_set

        fn = "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHCsequenzen/pseudo.txt"

        tmp_key = ""
        tmp_idx = 0

        self.seqs = []
        self.keys = []
        self.name_to_id = {}

        # parse file
        for line in file(fn):

            if line.startswith(">"):
                tmp_key = line.strip()[1:]
            else:

                if active_set.count(tmp_key) > 0:

                    assert self.keys.count(tmp_key) == 0, "key %s is already contained in self.keys" % tmp_key

                    self.seqs.append(line.strip())
                    self.keys.append(tmp_key)
                    self.name_to_id[tmp_key] = tmp_idx

                    tmp_idx += 1

                    assert len(self.seqs) == tmp_idx, "incorrect number of sequences %i != %i" % (len(self.seqs), tmp_idx)
                    assert len(self.keys) == tmp_idx, "incorrect number of keys %i != %i" % (len(self.keys), tmp_idx)

        # setup kernel
        param = Options()

        if wdk_rbf_on:
            param.kernel = "WeightedDegreeRBFKernel"
        else:
            param.kernel = "WeightedDegreeStringKernel"
        param.wdk_degree = degree
        param.transform = sigma

        self.kernel = shogun_factory.create_kernel(self.seqs, param)

        #######################
        # compute kernel
        #######################

        num_tasks = len(self.seqs)

        self.similarity = numpy.zeros((num_tasks, num_tasks))

        for i in xrange(num_tasks):
            for j in xrange(num_tasks):
                self.similarity[i, j] = self.kernel.kernel(i, j)

        # normalize kernel
        my_min = numpy.min(self.similarity)
        my_max = numpy.max(self.similarity)
        my_diff = my_max - my_min  # only needed for the commented-out min-max scaling

        # scale by the maximum (min-max scaling left commented out)
        #self.similarity = (self.similarity - my_min) / my_diff
        self.similarity = self.similarity / my_max

        print self.similarity
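# Hypothetical usage sketch for SequencesHandlerRbf (task names and parameters
# are made up; get_similarity is the accessor used by the training code in
# this file). Left as comments because the constructor reads a fixed data file:
#
#   handler = SequencesHandlerRbf(1, 2.0, ["A0101", "A0201"], False)
#   sim = handler.get_similarity("A0101", "A0201")
#   print "similarity:", sim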
Example #17
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # fetch taxonomy from parameter object
        taxonomy = shogun_factory.create_taxonomy(param.taxonomy.data)

        # set normalizer
        normalizer = MultitaskKernelTreeNormalizer(data.task_vector_names,
                                                   data.task_vector_names,
                                                   taxonomy)

        ########################################################
        gammas = self.taxonomy_to_gammas(data, taxonomy)
        print "gammas before MKL:"
        print gammas
        ########################################################

        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()

        svm = None

        num_subk = base_wdk.get_num_subkernels()

        print "num subkernels:", num_subk

        #print "subkernel weights:", base_wdk.get_subkernel_weights()

        self.additional_information["weights_before"] = [
            normalizer.get_beta(i) for i in range(num_subk)
        ]

        print "using MKL:", (param.transform >= 1.0)

        if param.transform >= 1.0:

            num_threads = 4

            svm = MKLClassification()

            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #GLPK) #DIRECT) #NEWTON)#ST_CPLEX)

            svm.set_kernel(base_wdk)
            svm.set_labels(lab)

            svm.parallel.set_num_threads(num_threads)
            svm.set_linadd_enabled(False)
            svm.set_batch_computation_enabled(False)

            if param.flags["normalize_cost"]:
                # normalize cost
                norm_c_pos = param.cost / float(
                    len([l for l in data.labels if l == 1]))
                norm_c_neg = param.cost / float(
                    len([l for l in data.labels if l == -1]))

                svm.set_C(norm_c_neg, norm_c_pos)
            else:
                svm.set_C(param.cost, param.cost)

            svm.train()

            #print "subkernel weights (after):", base_wdk.get_subkernel_weights()

        else:

            # create SVM (disable unsupported optimizations)
            svm = SVMLight(param.cost, base_wdk, lab)
            svm.set_linadd_enabled(False)
            svm.set_batch_computation_enabled(False)

            svm.train()

        print "svm objective:", svm.get_objective()

        self.additional_information["weights"] = [
            normalizer.get_beta(i) for i in range(num_subk)
        ]
        self.additional_information["gammas"] = self.taxonomy_to_gammas(
            data, taxonomy)

        print "debug weights:"
        print self.additional_information
        print ""

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_id in train_data.keys():
            svms[task_id] = svm

        return svms
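# Illustrative numpy sketch (toy values) of what the MKL training above
# optimizes: the combined kernel is a beta-weighted sum of subkernels,
# K = sum_k beta_k * K_k, and normalizer.get_beta(i) reports those weights.
import numpy

K_1 = numpy.eye(3)
K_2 = numpy.ones((3, 3))
betas = [0.7, 0.3]

K_combined = betas[0] * K_1 + betas[1] * K_2
print K_combined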
Example #18
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
          
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)
        
        
        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        ########################################################
        print "computing task similarities from sequence kernel:"
        ########################################################

        # init seq handler
        task_kernel = SequencesHandlerRbf(1, param.base_similarity, data.get_task_names(), param.flags["wdk_rbf_on"])
        similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))
        
        # fill similarity matrix from the sequence kernel
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():
                
                
                 
                
                # look up similarity from the sequence kernel
                similarity = task_kernel.get_similarity(task_name_lhs, task_name_rhs)

                print "similarity (%s,%s)=%f" % (task_name_lhs, task_name_rhs, similarity)

                normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)
                
                # save for later
                similarities[data.name_to_id(task_name_lhs),data.name_to_id(task_name_rhs)] = similarity
                
                
        # set normalizer                
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()
        

        # set up svm
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        
        
        # normalize cost
        norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
        
        svm.set_C(norm_c_neg, norm_c_pos)
        
        
        # start training
        svm.train()


        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        #self.additional_information["distances"] = distances
        self.additional_information["similarities"] = similarities


        # wrap up predictors
        svms = {}
        
        # use a reference to the same svm several times
        for task_name in data.get_task_names():
            
            task_num = data.name_to_id(task_name)
            
            # save svm and task_num
            svms[task_name] = (task_num, param, svm)

        return svms
Example #19
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        for task_id in train_data.keys():
            print "task_id:", task_id

        root = param.taxonomy.data

        grey_nodes = [root]

        # top-down processing of taxonomy

        while len(grey_nodes) > 0:

            node = grey_nodes.pop(0)  # pop first item

            # enqueue children
            if node.children != None:
                grey_nodes.extend(node.children)

            #####################################################
            #     init data structures
            #####################################################

            # get data below current node
            data = [train_data[key] for key in node.get_data_keys()]

            print "data at current level"
            for instance_set in data:
                print instance_set[0].dataset

            # initialize containers
            examples = []
            labels = []

            # concatenate data
            for instance_set in data:

                print "train split_set:", instance_set[0].dataset.organism

                for inst in instance_set:
                    examples.append(inst.example)
                    labels.append(inst.label)

            # create shogun data objects
            k = shogun_factory_new.create_kernel(examples, param)
            lab = shogun_factory_new.create_labels(labels)

            cost = param.cost
            #cost = node.cost

            print "using cost:", cost

            #####################################################
            #    train predictors
            #####################################################

            # init predictor variable
            svm = None

            # set up SVM
            if node.is_root():

                print "training svm at top level"
                svm = SVMLight(cost, k, lab)

            else:

                # regularize vs parent predictor

                #weight = node.edge_weight
                weight = param.transform

                print "current edge_weight:", weight, " ,name:", node.name

                parent_svm = node.parent.predictor

                svm = DomainAdaptationSVM(cost, k, lab, parent_svm, weight)
                #svm.set_train_factor(param.base_similarity)

            if param.flags["normalize_cost"]:

                norm_c_pos = param.cost / float(
                    len([l for l in lab.get_labels() if l == 1]))
                norm_c_neg = param.cost / float(
                    len([l for l in lab.get_labels() if l == -1]))
                svm.set_C(norm_c_neg, norm_c_pos)

            # set epsilon
            if param.flags.has_key("epsilon"):
                svm.set_epsilon(param.flags["epsilon"])

            # enable output
            svm.io.enable_progress()
            svm.io.set_loglevel(shogun.Classifier.MSG_INFO)

            svm.set_train_factor(param.flags["train_factor"])
            svm.train()

            # attach svm to node
            node.predictor = svm

            # save some information
            self.additional_information[node.name + " svm obj"] = svm.get_objective()
            self.additional_information[node.name + " svm num sv"] = svm.get_num_support_vectors()
            self.additional_information[node.name + " runtime"] = svm.get_runtime()

        #####################################################
        #    Wrap things up
        #####################################################

        # wrap up predictors for later use
        predictors = {}

        for leaf in root.get_leaves():

            predictors[leaf.name] = leaf.predictor

            assert (leaf.predictor != None)

        # make sure we have the same keys (potentially in a different order)
        sym_diff_keys = set(train_data.keys()).symmetric_difference(set(predictors.keys()))
        assert len(sym_diff_keys) == 0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)

        # save graph plot
        mypath = "/fml/ag-raetsch/share/projects/multitask/graphs/"
        filename = mypath + "graph_" + str(param.id)
        root.plot(filename)  #, plot_cost=True, plot_B=True)

        return predictors
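# Stand-alone illustration of the grey_nodes traversal used above: nodes are
# processed top-down by popping the front of the queue and enqueueing the
# children (a breadth-first walk). Node is a minimal stand-in for the
# taxonomy nodes.
class Node(object):
    def __init__(self, name, children=None):
        self.name = name
        self.children = children

root = Node("root", [Node("left"), Node("right")])
grey_nodes = [root]

while len(grey_nodes) > 0:
    node = grey_nodes.pop(0)       # pop first item
    if node.children != None:
        grey_nodes.extend(node.children)
    print "processing node:", node.name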
Beispiel #20
0
    def __init__(self, degree, sigma, active_set, wdk_rbf_on):
        '''
        loads data into handler
        '''

        self.active_set = active_set

        fn = "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHCsequenzen/pseudo.txt"

        tmp_key = ""
        tmp_idx = 0

        self.seqs = []
        self.keys = []
        self.name_to_id = {}

        # parse FASTA-style file: header lines carry the key,
        # the following line carries the pseudo-sequence
        for line in file(fn):

            if line.startswith(">"):
                tmp_key = line.strip()[1:]
            else:

                if active_set.count(tmp_key) > 0:

                    assert self.keys.count(tmp_key) == 0, "key %s is already contained in self.keys" % (tmp_key)

                    self.seqs.append(line.strip())
                    self.keys.append(tmp_key)
                    self.name_to_id[tmp_key] = tmp_idx

                    tmp_idx += 1

                    assert len(self.seqs) == tmp_idx, "incorrect number of sequences %i != %i" % (len(self.seqs), tmp_idx)
                    assert len(self.keys) == tmp_idx, "incorrect number of keys %i != %i" % (len(self.keys), tmp_idx)

        # setup kernel
        param = Options()

        if wdk_rbf_on:
            param.kernel = "WeightedDegreeRBFKernel"
        else:
            param.kernel = "WeightedDegreeStringKernel"
        param.wdk_degree = degree
        param.transform = sigma

        self.kernel = shogun_factory.create_kernel(self.seqs, param)

        #######################
        # compute similarity matrix
        #######################

        num_tasks = len(self.seqs)

        self.similarity = numpy.zeros((num_tasks, num_tasks))

        for i in xrange(num_tasks):
            for j in xrange(num_tasks):
                self.similarity[i, j] = self.kernel.kernel(i, j)

        # normalize kernel
        my_min = numpy.min(self.similarity)
        my_max = numpy.max(self.similarity)
        my_diff = my_max - my_min

        # scale so that the largest entry is 1 (min-max scaling left commented out)
        #self.similarity = (self.similarity - my_min) / my_diff
        self.similarity = self.similarity / my_max

        print self.similarity
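A minimal usage sketch for this handler. The enclosing class name is not shown in this snippet; judging by the constructor signature it is the SequencesHandlerRbf constructed the same way in Example #26, and the task keys below are placeholders:

# hypothetical usage (class name and task keys are assumptions, see above)
handler = SequencesHandlerRbf(1, 10.0, ["B_0702", "B_2705"], True)
print handler.similarity    # task-by-task similarity matrix, scaled by its max
print handler.name_to_id    # maps task key to row/column index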
Example #21
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # fix dimensions
        M = len(train_data)

        N = 0
        for key in train_data.keys():
            N += len(train_data[key])

        # init containers
        examples = []
        labels = []

        # vector to indicate to which task each example belongs
        task_vector = []
        task_num = 0
        tmp_examples = 0

        label_matrix = numpy.zeros((M, N))

        # extract training data
        for (task_id, instance_set) in train_data.items():

            print "train task id:", task_id
            #assert(instance_set[0].dataset.organism==task_id)

            examples.extend([inst.example for inst in instance_set])

            tmp_labels = [inst.label for inst in instance_set]
            labels.extend(tmp_labels)

            begin_idx = tmp_examples
            end_idx = tmp_examples + len(tmp_labels)

            # fill matrix row
            label_matrix[task_num, begin_idx:end_idx] = tmp_labels

            task_vector.extend([task_num] * len(instance_set))

            task_num += 1
            tmp_examples += len(tmp_labels)

        # fetch gammas from parameter object
        # TODO: compute gammas outside of this
        gammas = numpy.ones((M, M)) + numpy.eye(M)
        #gammas = numpy.eye(M)

        # create kernel
        kernel = shogun_factory.create_kernel(examples, param)

        y = numpy.array(labels)

        print "computing kernel matrix"

        km = kernel.get_kernel_matrix()
        km = reweight_kernel_matrix(km, gammas, task_vector)
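        # note: reweight_kernel_matrix is not shown in this snippet; presumably
        # it rescales entries task-pair-wise, i.e.
        # km[i, j] *= gammas[task_vector[i], task_vector[j]]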

        # "add" labels to Q-matrix
        km = numpy.transpose(y.flatten() * (km * y.flatten()).transpose())

        print "done computing kernel matrix, calling solver"

        f = -numpy.ones(N)
        b = numpy.zeros((M, 1))

        # set up QP
        p = QP(km,
               f,
               Aeq=label_matrix,
               beq=b,
               lb=numpy.zeros(N),
               ub=param.cost * numpy.ones(N))
        p.debug = 1

        # run solver
        r = p.solve('cvxopt_qp', iprint=0)

        print "done with training"

        alphas = r.xf
        objective = r.ff

        print "alphas:", alphas

        predictors = {}

        for (k, task_id) in enumerate(train_data.keys()):
            # pack all relevant information in predictor
            predictors[task_id] = (alphas, param, task_vector, k, gammas,
                                   examples, labels)

        return predictors
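In matrix form, the program handed to the solver above is the standard multitask SVM dual, with one equality constraint per task (the rows of `label_matrix`):

$$ \min_{\alpha}\ \tfrac{1}{2}\,\alpha^{\top} Q\,\alpha \;-\; \mathbf{1}^{\top}\alpha \qquad \text{s.t.}\qquad Y\alpha = 0,\quad 0 \le \alpha \le C\,\mathbf{1}, $$

where \(Q_{ij} = y_i y_j \tilde{K}_{ij}\), \(\tilde{K}\) is the gamma-reweighted kernel matrix, and \(Y\) is the \(M \times N\) `label_matrix` whose row m holds the labels of task m (and zeros elsewhere).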
Example #22
    def _inner_train(self, train_data, param):
        """
        perform inner training by processing the tree
        """

        data_keys = []
        # top-down processing of taxonomy


        classifiers = []
        classifier_at_node = {}

        root = param.taxonomy.data

        grey_nodes = [root]
        
        while len(grey_nodes)>0:
           
            node = grey_nodes.pop(0) # pop first item
            
            # enqueue children
            if node.children != None:
                grey_nodes.extend(node.children)

            #####################################################
            #     init data structures
            #####################################################

            # get data below current node
            data = [train_data[key] for key in node.get_data_keys()]
            
            data_keys.append(node.get_data_keys())
    
            print "data at current level"
            for instance_set in data:        
                print instance_set[0].dataset
            
            
            # initialize containers
            examples = []
            labels = []       
    

            # concatenate data
            for instance_set in data:
      
                print "train split_set:", instance_set[0].dataset.organism
                
                for inst in instance_set:
                    examples.append(inst.example)
                    labels.append(inst.label)
    

            # create shogun data objects
            k = shogun_factory.create_kernel(examples, param)
            lab = shogun_factory.create_labels(labels)


            #####################################################
            #    train weak learners    
            #####################################################
            
            cost = param.cost

            # set up svm
            svm = SVMLight(cost, k, lab)

            if param.flags["normalize_cost"]:
                # set class-specific Cs
                norm_c_pos = param.cost / float(len([l for l in labels if l==1]))
                norm_c_neg = param.cost / float(len([l for l in labels if l==-1]))
                svm.set_C(norm_c_neg, norm_c_pos)

                # only defined when normalization is on, so print inside the branch
                print "using cost: negative class=%f, positive class=%f" % (norm_c_neg, norm_c_pos)

            # enable output
            svm.io.enable_progress()
            svm.io.set_loglevel(shogun.Classifier.MSG_INFO)

            # train
            svm.train()
            
            # append svm object
            classifiers.append(svm)
            classifier_at_node[node.name] = svm                            
            
            # save some information
            self.additional_information[node.name + " svm obj"] = svm.get_objective()
            self.additional_information[node.name + " svm num sv"] = svm.get_num_support_vectors()
            self.additional_information[node.name + " runtime"] = svm.get_runtime()


        return (classifiers, classifier_at_node)
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
 
        
                
        # fix dimensions
        M = len(train_data)

        N = 0
        for key in train_data.keys():
            N += len(train_data[key])
        
        # init containers
        examples = []
        labels = []


        # vector to indicate to which task each example belongs
        task_vector = []
        task_num = 0
        tmp_examples = 0

        label_matrix = numpy.zeros((M,N))


        # extract training data
        for (task_id, instance_set) in train_data.items():
  
            print "train task id:", task_id
            #assert(instance_set[0].dataset.organism==task_id)
            
            examples.extend([inst.example for inst in instance_set])
            
            tmp_labels = [inst.label for inst in instance_set]
            labels.extend(tmp_labels)
            
            begin_idx = tmp_examples
            end_idx = tmp_examples + len(tmp_labels) 
            
            # fill matrix row
            label_matrix[task_num, begin_idx:end_idx] = tmp_labels

            task_vector.extend([task_num]*len(instance_set))

            task_num += 1
            tmp_examples += len(tmp_labels)


        # fetch gammas from parameter object
        # TODO: compute gammas outside of this
        gammas = numpy.ones((M,M)) + numpy.eye(M)
        #gammas = numpy.eye(M)
        

        # create kernel
        kernel = shogun_factory.create_kernel(examples, param)


        y = numpy.array(labels)

        print "computing kernel matrix"

        km = kernel.get_kernel_matrix()
        km = reweight_kernel_matrix(km, gammas, task_vector)

        # "add" labels to Q-matrix
        km = numpy.transpose(y.flatten() * (km*y.flatten()).transpose())

        print "done computing kernel matrix, calling solver"


        f = -numpy.ones(N)
        b = numpy.zeros((M,1))

        # set up QP
        p = QP(km, f, Aeq=label_matrix, beq=b, lb=numpy.zeros(N), ub=param.cost*numpy.ones(N))
        p.debug = 1

        # run solver
        r = p.solve('cvxopt_qp', iprint=0)

        print "done with training"

        alphas = r.xf
        objective = r.ff


        print "alphas:", alphas

        predictors = {}

        for (k, task_id) in enumerate(train_data.keys()):
            # pack all relevant information in predictor
            predictors[task_id] = (alphas, param, task_vector, k, gammas, examples, labels)

        return predictors
Example #24
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        assert (param.base_similarity >= 1)

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        # load data
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
        f = file(
            "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt"
        )
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")

        num_lines = int(f.readline().strip())
        task_distances = numpy.zeros((num_lines, num_lines))
        name_to_id = {}
        for (i, line) in enumerate(f):
            tokens = line.strip().split("\t")
            name = str(tokens[0])
            name_to_id[name] = i
            entry = numpy.array(
                [float(v) for (j, v) in enumerate(tokens) if j != 0])
            assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (
                len(entry), num_lines)
            task_distances[i, :] = entry

        # cut relevant submatrix
        active_ids = [name_to_id[name] for name in data.get_task_names()]
        tmp_distances = task_distances[active_ids, :]
        tmp_distances = tmp_distances[:, active_ids]
        print "distances ", tmp_distances.shape

        # normalize distances
        task_distances = task_distances / numpy.max(tmp_distances)

        similarities = numpy.zeros(
            (data.get_num_tasks(), data.get_num_tasks()))

        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                # convert similarity with simple transformation
                similarity = param.base_similarity - task_distances[
                    name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

                # save for later
                similarities[data.name_to_id(task_name_lhs),
                             data.name_to_id(task_name_rhs)] = similarity

        # set normalizer
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()

        # set up svm
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        # normalize cost
        norm_c_pos = param.cost / float(len([l
                                             for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(
            len([l for l in data.labels if l == -1]))

        svm.set_C(norm_c_neg, norm_c_pos)

        # start training
        svm.train()

        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        #self.additional_information["distances"] = distances
        self.additional_information["similarities"] = similarities

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in data.get_task_names():

            task_num = data.name_to_id(task_name)

            # save svm and task_num
            svms[task_name] = (task_num, param, svm)

        return svms
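A toy check of the distance-to-similarity transform used above, with made-up distances: after scaling by the maximum, all distances lie in [0, 1], so `base_similarity >= 1` (enforced by the assert at the top of the method) keeps every similarity non-negative.

import numpy

# made-up pairwise task distances
d = numpy.array([[0.0, 0.4, 0.8],
                 [0.4, 0.0, 0.6],
                 [0.8, 0.6, 0.0]])

base_similarity = 1.0
sim = base_similarity - d / numpy.max(d)   # entries lie in [base-1, base]
print sim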
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # split for training weak_learners and boosting
        (train_weak, train_boosting) = split_data(train_data, 4)
          
        # merge data sets
        data = PreparedMultitaskData(train_weak, shuffle=True)
        
        # create shogun label
        lab = shogun_factory.create_labels(data.labels)
        


        ##################################################
        # define pockets
        ##################################################
        
        pockets = [0]*9
        
        pockets[0] = [1, 5, 6, 7, 8, 31, 32, 33, 34]
        pockets[1] = [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31]
        pockets[2] = [11, 20, 21, 22, 29, 31]
        pockets[3] = [8, 30, 31, 32]
        pockets[4] = [10, 11, 30]
        pockets[5] = [10, 11, 12, 13, 20, 29]
        pockets[6] = [10, 12, 20, 22, 26, 27, 28, 29]
        pockets[7] = [12, 14, 15, 26]
        pockets[8] = [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26]
        
        # NOTE: the hand-defined pocket groups above are overridden here by
        # singleton pockets, one per pseudo-sequence position
        pockets = []
        for i in xrange(35):
            pockets.append([i])


        #new_pockets = []
        
        # merge neighboring pockets
        #for i in range(8):
        #    new_pockets.append(list(set(pockets[i]).union(set(pockets[i+1]))))
            
        #pockets = new_pockets
        
        
        ########################################################
        print "creating a kernel:"
        ########################################################
        
        
        # init seq handler 
        pseudoseqs = SequencesHandler()

        
        classifiers = []


        for pocket in pockets:

            print "creating normalizer"
            #import pdb
            #pdb.set_trace()
            
            normalizer = MultitaskKernelNormalizer(data.task_vector_nums)
            
            print "processing pocket", pocket

            # set similarity
            for task_name_lhs in data.get_task_names():
                for task_name_rhs in data.get_task_names():
                    
                    similarity = 0.0
                    
                    for pseudo_seq_pos in pocket:
                        similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos-1))
                    
                    # normalize
                    similarity = similarity / float(len(pocket))
                    
                    print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)
                    
                    normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)
               

            print "creating empty kernel"
            kernel = shogun_factory.create_kernel(data.examples, param)
            
            print "setting normalizer"
            kernel.set_normalizer(normalizer)

            print "training SVM for pocket", pocket
            svm = self._train_single_svm(param, kernel, lab)

            classifiers.append(svm)
        
        
        print "done obtaining weak learners"
            
        
        # save additional info
        #self.additional_information["svm_objective"] = svm.get_objective()
        #self.additional_information["svm num sv"] = svm.get_num_support_vectors()
        #self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()
        
        #print self.additional_information 
        


        ##################################################
        # combine weak learners for each task
        ##################################################
        
        
        # set constants
        
        some = 0.9
        import cvxmod
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_name in train_boosting.keys():
            
            instances = train_boosting[task_name]
            
            N = len(instances)
            F = len(pockets)
            
            examples = [inst.example for inst in instances]
            labels = [inst.label for inst in instances]
            
            # dim = (F x N)
            out = cvxmod.zeros((N,F))
            
            for i in xrange(F):
                svm = classifiers[i]
                tmp_out = self._predict_weak(svm, examples, data.name_to_id(task_name))

                out[:,i] = numpy.sign(tmp_out)
                #out[:,i] = tmp_out
            

            #TODO: fix
            helper.save("/tmp/out_sparse", (out, labels))
            import pdb
            pdb.set_trace()

            weights = solve_boosting(out, labels, some, solver="mosek")

            # NOTE: `weights` is computed but never stored; only the last
            # weak-learner svm ends up in the returned tuple
            svms[task_name] = (data.name_to_id(task_name), svm)

        
        return svms
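`solve_boosting` is not shown in this snippet. Judging from its arguments (the matrix of weak-learner outputs, the labels, the constant `some`, and an LP solver), it presumably solves an LP-boosting-style problem that picks convex combination weights over the F weak learners, with `some` playing the role of the usual nu parameter:

$$ \max_{\rho,\,w,\,\xi}\ \rho - \frac{1}{\nu N}\sum_{n}\xi_n \qquad \text{s.t.}\qquad y_n \sum_{f} w_f\,\mathrm{out}_{nf} \;\ge\; \rho - \xi_n,\quad \xi \ge 0,\quad w \ge 0,\quad \textstyle\sum_f w_f = 1. $$

This is a hedged reading, not the function's actual source.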
Example #26
    def _inner_train(self, prepared_data, param):
        """
        perform inner training by processing the tree
        """


        # init seq handler
        pseudoseqs = SequencesHandler()

        from method_mhc_rbf import SequencesHandlerRbf

        classifiers = []

        for pocket in self.get_pockets(param.flags["all_positions"]):

            print "creating normalizer"
            #import pdb
            #pdb.set_trace()

            normalizer = MultitaskKernelNormalizer(prepared_data.task_vector_nums)

            task_kernel = SequencesHandlerRbf(1, param.base_similarity, prepared_data.get_task_names(), param.flags["wdk_rbf_on"])
            print "processing pocket", pocket
            
            M = prepared_data.get_num_tasks()
            save_sim_p = numpy.zeros((M,M))
            save_sim_t = numpy.zeros((M,M))

            # set similarity
            for task_name_lhs in prepared_data.get_task_names():
                for task_name_rhs in prepared_data.get_task_names():
                    
                    similarity = 0.0
                    
                    for pseudo_seq_pos in pocket:
                        similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos-1))
                    
                    
                    # normalize
                    similarity = similarity / float(len(pocket))
                    
                    similarity_task = task_kernel.get_similarity(task_name_lhs, task_name_rhs)
                    
                    print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)
                    
                    normalizer.set_task_similarity(prepared_data.name_to_id(task_name_lhs), prepared_data.name_to_id(task_name_rhs), similarity)
               
                    save_sim_p[prepared_data.name_to_id(task_name_lhs), prepared_data.name_to_id(task_name_rhs)] = similarity
                    save_sim_t[prepared_data.name_to_id(task_name_lhs), prepared_data.name_to_id(task_name_rhs)] = similarity_task
            
            
            #from IPython.Shell import IPShellEmbed
            #IPShellEmbed([])()
            
            lab = shogun_factory.create_labels(prepared_data.labels)
            
            print "creating empty kernel"
            kernel = shogun_factory.create_kernel(prepared_data.examples, param)
            
            print "setting normalizer"
            kernel.set_normalizer(normalizer)
            kernel.init_normalizer()

            print "training SVM for pocket", pocket
            svm = shogun_factory.create_svm(param, kernel, lab)
            svm.set_linadd_enabled(False)
            svm.set_batch_computation_enabled(False)
    
            # train SVM
            svm.train()
            
            #import pdb
            #pdb.set_trace()

            classifiers.append(svm)



        return classifiers
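Written out, the per-pocket task similarity assembled in the loop above is simply the pseudo-sequence similarity averaged over the pocket's positions:

$$ \mathrm{sim}_P(s, t) \;=\; \frac{1}{|P|} \sum_{p \in P} \mathrm{sim}_{\text{pseudo}}(s, t, p) $$

with one MultitaskKernelNormalizer (and hence one weak learner) per pocket P.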
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """



        for task_id in train_data.keys():
            print "task_id:", task_id
            
        
        root = param.taxonomy.data
        

        grey_nodes = [root]
        

        # top-down processing of taxonomy
        
        while len(grey_nodes)>0:
           
            node = grey_nodes.pop(0) # pop first item
            
            # enqueue children
            if node.children != None:
                grey_nodes.extend(node.children)

            #####################################################
            #     init data structures
            #####################################################

            # get data below current node
            data = [train_data[key] for key in node.get_data_keys()]
            
    
            print "data at current level"
            for instance_set in data:        
                print instance_set[0].dataset
            
            
            # initialize containers
            examples = []
            labels = []       
    

            # concatenate data
            for instance_set in data:
      
                print "train split_set:", instance_set[0].dataset.organism
                
                for inst in instance_set:
                    examples.append(inst.example)
                    labels.append(inst.label)
    

            # create shogun data objects
            k = shogun_factory_new.create_kernel(examples, param)
            lab = shogun_factory_new.create_labels(labels)

            
            cost = param.cost
            #cost = node.cost
            
            print "using cost:", cost




            #####################################################
            #    train predictors    
            #####################################################
            
                
            # init predictor variable
            svm = None
            

            # set up SVM
            if node.is_root():
                
                print "training svm at top level"
                svm = SVMLight(cost, k, lab)

            else:
                
                # regularize vs parent predictor
                
                #weight = node.edge_weight
                weight = param.transform
                
                print "current edge_weight:", weight, " ,name:", node.name
                
                parent_svm = node.parent.predictor
                
                svm = DomainAdaptationSVM(cost, k, lab, parent_svm, weight)
                #svm.set_train_factor(param.base_similarity)
             

            if param.flags["normalize_cost"]:
                
                norm_c_pos = param.cost / float(len([l for l in lab.get_labels() if l==1]))
                norm_c_neg = param.cost / float(len([l for l in lab.get_labels() if l==-1]))
                svm.set_C(norm_c_neg, norm_c_pos)
             

            # set epsilon
            if param.flags.has_key("epsilon"):
                svm.set_epsilon(param.flags["epsilon"])

               
            # enable output
            svm.io.enable_progress()
            svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
            

            svm.set_train_factor(param.flags["train_factor"])
            svm.train()
            
            # attach svm to node
            node.predictor = svm
            
            # save some information
            self.additional_information[node.name + " svm obj"] = svm.get_objective()
            self.additional_information[node.name + " svm num sv"] = svm.get_num_support_vectors()
            self.additional_information[node.name + " runtime"] = svm.get_runtime()
            


        #####################################################
        #    Wrap things up    
        #####################################################
 
        # wrap up predictors for later use
        predictors = {}

        for leaf in root.get_leaves():

            predictors[leaf.name] = leaf.predictor

            assert leaf.predictor != None

        # make sure we have the same keys (potentially in a different order)
        sym_diff_keys = set(train_data.keys()).symmetric_difference(set(predictors.keys()))
        assert len(sym_diff_keys) == 0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)



        # save graph plot
        mypath = "/fml/ag-raetsch/share/projects/multitask/graphs/"
        filename = mypath + "graph_" + str(param.id)
        root.plot(filename)#, plot_cost=True, plot_B=True)


        return predictors
Example #28
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        ########################################################
        print "creating a kernel for each node:"
        ########################################################

        # init seq handler
        task_kernel = SequencesHandlerRbf(1, param.base_similarity,
                                          data.get_task_names(),
                                          param.flags["wdk_rbf_on"])
        similarities = numpy.zeros(
            (data.get_num_tasks(), data.get_num_tasks()))

        # compute pairwise task similarities
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                # look up similarity from the task kernel
                similarity = task_kernel.get_similarity(
                    task_name_lhs, task_name_rhs)

                print "similarity (%s,%s)=%f" % (task_name_lhs, task_name_rhs,
                                                 similarity)

                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

                # save for later
                similarities[data.name_to_id(task_name_lhs),
                             data.name_to_id(task_name_rhs)] = similarity

        # set normalizer
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()

        # set up svm
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        # normalize cost
        norm_c_pos = param.cost / float(len([l
                                             for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(
            len([l for l in data.labels if l == -1]))

        svm.set_C(norm_c_neg, norm_c_pos)

        # start training
        svm.train()

        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        #self.additional_information["distances"] = distances
        self.additional_information["similarities"] = similarities

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in data.get_task_names():

            task_num = data.name_to_id(task_name)

            # save svm and task_num
            svms[task_name] = (task_num, param, svm)

        return svms
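A note on the normalizer used throughout these examples: MultitaskKernelNormalizer implements the multitask kernel by entry-wise rescaling, so for examples x, z drawn from tasks s, t the effective kernel is (summarizing shogun's behavior, not quoting its source)

$$ \tilde{k}\big((x, s), (z, t)\big) \;=\; \gamma_{s,t} \, k(x, z), $$

with the coefficients gamma set through set_task_similarity.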
# build the normalizer used by the tree-guided kernel below
tree_normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

for t1_name in task_names:
    for t2_name in task_names:

        similarity = gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)]
        tree_normalizer.set_task_similarity(data.name_to_id(t1_name), data.name_to_id(t2_name), similarity)

##################################################################
# Train SVMs
##################################################################

# create shogun objects
wdk_tree = shogun_factory.create_kernel(data.examples, param)
lab = shogun_factory.create_labels(data.labels)

wdk_tree.set_normalizer(tree_normalizer)
wdk_tree.init_normalizer()

print "--->",wdk_tree.get_normalizer().get_name()

svm_tree = SVMLight(cost, wdk_tree, lab)
svm_tree.set_linadd_enabled(False)
svm_tree.set_batch_computation_enabled(False)

svm_tree.train()

del wdk_tree
del tree_normalizer
Example #30
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # split for training weak_learners and boosting
        (train_weak, train_boosting) = split_data(train_data, 4)
          
        # merge data sets
        data = PreparedMultitaskData(train_weak, shuffle=True)
        
        # create shogun label
        lab = shogun_factory.create_labels(data.labels)

        ########################################################
        print "creating a kernel:"
        ########################################################
        
        
        # init seq handler 
        pseudoseqs = SequencesHandler()

        
        classifiers = []

        # NOTE: `pockets` is not defined in this snippet; assuming the same
        # singleton pockets (one pseudo-sequence position each) that the
        # otherwise-identical method under Example #24 uses
        pockets = [[i] for i in xrange(35)]

        for pocket in pockets:

            print "creating normalizer"
            #import pdb
            #pdb.set_trace()
            
            normalizer = MultitaskKernelNormalizer(data.task_vector_nums)
            
            print "processing pocket", pocket

            # set similarity
            for task_name_lhs in data.get_task_names():
                for task_name_rhs in data.get_task_names():
                    
                    similarity = 0.0
                    
                    for pseudo_seq_pos in pocket:
                        similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos-1))
                    
                    # normalize
                    similarity = similarity / float(len(pocket))
                    
                    print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)
                    
                    normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)
               

            print "creating empty kernel"
            kernel = shogun_factory.create_kernel(data.examples, param)
            
            print "setting normalizer"
            kernel.set_normalizer(normalizer)

            print "training SVM for pocket", pocket
            svm = self._train_single_svm(param, kernel, lab)

            classifiers.append(svm)
        
        
        print "done obtaining weak learners"
            
        
        # save additional info
        #self.additional_information["svm_objective"] = svm.get_objective()
        #self.additional_information["svm num sv"] = svm.get_num_support_vectors()
        #self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()
        
        #print self.additional_information 
        


        ##################################################
        # combine weak learners for each task
        ##################################################
        
        
        # set constants
        
        some = 0.9
        import cvxmod
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_name in train_boosting.keys():
            
            instances = train_boosting[task_name]
            
            N = len(instances)
            F = len(pockets)
            
            examples = [inst.example for inst in instances]
            labels = [inst.label for inst in instances]
            
            # dim = (F x N)
            out = cvxmod.zeros((N,F))
            
            for i in xrange(F):
                svm = classifiers[i]
                tmp_out = self._predict_weak(svm, examples, data.name_to_id(task_name))

                out[:,i] = numpy.sign(tmp_out)
                #out[:,i] = tmp_out
            

            #TODO: fix
            helper.save("/tmp/out_sparse", (out, labels))
            import pdb
            pdb.set_trace()

            weights = solve_boosting(out, labels, some, solver="mosek")

            # NOTE: `weights` is computed but never stored; only the last
            # weak-learner svm ends up in the returned tuple
            svms[task_name] = (data.name_to_id(task_name), svm)

        
        return svms