def get_weighted_spectrum_kernel(subfeats_list, options):
	"""build weighted spectrum kernel with non-redundant k-mer list (removing reverse complement)

	Arguments:
	subfeats_list -- list of sub-feature objects
	options -- object containing option data 

	Return:
	CombinedFeatures of StringWord(Ulong)Features, CombinedKernel of CommWord(Ulong)StringKernel 
	"""
	kmerlen = options.kmerlen
	kmerlen2 = options.kmerlen2

	subkernels = 0
	kernel = CombinedKernel()
	feats = CombinedFeatures()

	for subfeats in subfeats_list:
		feats.append_feature_obj(subfeats)

	for k in xrange(kmerlen, kmerlen2+1):
		if k <= 8:
			subkernel = CommWordStringKernel(10, False)
		else:
			subkernel = CommUlongStringKernel(10, False)

		kernel.append_kernel(subkernel)
		subkernels+=1

	kernel.init(feats, feats)

	kernel.set_subkernel_weights(numpy.array([1/float(subkernels)]*subkernels, numpy.dtype('float64')))

	return kernel
def kernel_combined_modular(fm_train_real=traindat,fm_test_real=testdat,fm_train_dna=traindna,fm_test_dna=testdna ):
	from shogun.Kernel import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
	from shogun.Features import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

	kernel=CombinedKernel()
	feats_train=CombinedFeatures()
	feats_test=CombinedFeatures()

	subkfeats_train=RealFeatures(fm_train_real)
	subkfeats_test=RealFeatures(fm_test_real)
	subkernel=GaussianKernel(10, 1.1)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
	subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
	degree=3
	subkernel=FixedDegreeStringKernel(10, degree)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
	subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
	subkernel=LocalAlignmentStringKernel(10)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	kernel.init(feats_train, feats_train)
	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
def mkl_multiclass ():
	print 'mkl_multiclass'

	from shogun.Features import CombinedFeatures, RealFeatures, Labels
	from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel,PolyKernel
	from shogun.Classifier import MKLMultiClass


	width=1.2
	C=1.2
	epsilon=1e-5
	num_threads=1


	kernel=CombinedKernel()
	feats_train=CombinedFeatures()
	feats_test=CombinedFeatures()

	subkfeats_train=RealFeatures(fm_train_real)
	subkfeats_test=RealFeatures(fm_test_real)
	subkernel=GaussianKernel(10, width)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)


	subkfeats_train=RealFeatures(fm_train_real)
	subkfeats_test=RealFeatures(fm_test_real)
	subkernel=LinearKernel()
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)


	subkfeats_train=RealFeatures(fm_train_real)
	subkfeats_test=RealFeatures(fm_test_real)
	subkernel=PolyKernel(10,2)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)
	
	kernel.init(feats_train, feats_train)


	labels=Labels(label_train_multiclass)

	mkl=MKLMultiClass(C, kernel, labels)
	
	mkl.set_epsilon(epsilon);
	mkl.parallel.set_num_threads(num_threads)
	mkl.set_mkl_epsilon(0.001)

	mkl.train()

	kernel.init(feats_train, feats_test)

	out= mkl.classify().get_labels()

	print out
def mkl_binclass_modular (train_data, testdata, train_labels, test_labels, d1, d2):
        # create some Gaussian train/test matrix
    	tfeats = RealFeatures(train_data)
    	tkernel = GaussianKernel(128, d1)
    	tkernel.init(tfeats, tfeats)
    	K_train = tkernel.get_kernel_matrix()

    	pfeats = RealFeatures(test_data)
    	tkernel.init(tfeats, pfeats)
    	K_test = tkernel.get_kernel_matrix()

    	# create combined train features
    	feats_train = CombinedFeatures()
    	feats_train.append_feature_obj(RealFeatures(train_data))

    	# and corresponding combined kernel
    	kernel = CombinedKernel()
    	kernel.append_kernel(CustomKernel(K_train))
    	kernel.append_kernel(GaussianKernel(128, d2))
    	kernel.init(feats_train, feats_train)

    	# train mkl
    	labels = Labels(train_labels)
    	mkl = MKLClassification()
	
        # not to use svmlight
        mkl.set_interleaved_optimization_enabled(0)

    	# which norm to use for MKL
    	mkl.set_mkl_norm(2)

    	# set cost (neg, pos)
    	mkl.set_C(1, 1)

    	# set kernel and labels
    	mkl.set_kernel(kernel)
    	mkl.set_labels(labels)

    	# train
    	mkl.train()

    	# test
	# create combined test features
    	feats_pred = CombinedFeatures()
    	feats_pred.append_feature_obj(RealFeatures(test_data))

    	# and corresponding combined kernel
    	kernel = CombinedKernel()
    	kernel.append_kernel(CustomKernel(K_test))
    	kernel.append_kernel(GaussianKernel(128, d2))
    	kernel.init(feats_train, feats_pred)

	# and classify
    	mkl.set_kernel(kernel)
    	output = mkl.apply().get_labels()
	output = [1.0 if i>0 else -1.0 for i in output]
	accu = len(where(output == test_labels)[0]) / float(len(output))
	return accu
Exemple #5
0
def training_run(options):
    """Conduct a training run and return a trained SVM kernel"""
    settings = MotifFinderSettings(kirmes_ini.MOTIF_LENGTH, options.window_width, options.replace)
    positives = MotifFinder(finder_settings=settings)
    positives.setFastaFile(options.positives)
    positives.setMotifs(options.pgff)
    pmotifs, ppositions = positives.getResults()
    negatives = MotifFinder(finder_settings=settings)
    negatives.setFastaFile(options.negatives)
    negatives.setMotifs(options.ngff)
    nmotifs, npositions = negatives.getResults()

    wds_kparams = kirmes_ini.WDS_KERNEL_PARAMETERS
    wds_svm = EasySVM.EasySVM(wds_kparams)
    num_positives = len(pmotifs.values()[0])
    num_negatives = len(nmotifs.values()[0])
    # Creating Kernel Objects
    kernel = CombinedKernel()
    features = CombinedFeatures()
    kernel_array = []
    motifs = pmotifs.keys()
    motifs.sort()
    # Adding Kmer Kernels
    for motif in motifs:
        all_examples = pmotifs[motif] + nmotifs[motif]
        motif_features = wds_svm.createFeatures(all_examples)
        wds_kernel = WeightedDegreePositionStringKernel(motif_features, motif_features, wds_kparams["degree"])
        wds_kernel.set_shifts(wds_kparams["shift"] * ones(wds_kparams["seqlength"], dtype=int32))
        features.append_feature_obj(motif_features)
        kernel_array.append(wds_kernel)
        kernel.append_kernel(wds_kernel)
    rbf_svm = EasySVM.EasySVM(kirmes_ini.RBF_KERNEL_PARAMETERS)
    positions = array(ppositions + npositions, dtype=float64).T
    position_features = rbf_svm.createFeatures(positions)
    features.append_feature_obj(position_features)
    motif_labels = append(ones(num_positives), -ones(num_negatives))
    complete_labels = Labels(motif_labels)
    rbf_kernel = GaussianKernel(position_features, position_features, kirmes_ini.RBF_KERNEL_PARAMETERS["width"])
    kernel_array.append(rbf_kernel)
    kernel.append_kernel(rbf_kernel)
    # Kernel init
    kernel.init(features, features)
    kernel.set_cache_size(kirmes_ini.K_CACHE_SIZE)
    svm = LibSVM(kirmes_ini.K_COMBINED_C, kernel, complete_labels)
    svm.parallel.set_num_threads(kirmes_ini.K_NUM_THREADS)
    # Training
    svm.train()
    if not os.path.exists(options.output_path):
        os.mkdir(options.output_path)
    html = {}
    if options.contrib:
        html["contrib"] = contrib(svm, kernel, motif_labels, kernel_array, motifs)
    if options.logos:
        html["poims"] = poims(svm, kernel, kernel_array, motifs, options.output_path)
    if options.query:
        html["query"] = evaluate(options, svm, kernel, features, motifs)
    htmlize(html, options.output_html)
    def train(self, data, labels):
        """
        model training 
        """

        # centered WDK/WDK-shift
        if self.param["shifts"] == 0:
            kernel_center = WeightedDegreeStringKernel(self.param["degree"])
        else:
            kernel_center = WeightedDegreePositionStringKernel(
                10, self.param["degree"])
            shifts_vector = numpy.ones(
                self.param["center_offset"] * 2,
                dtype=numpy.int32) * self.param["shifts"]
            kernel_center.set_shifts(shifts_vector)

        kernel_center.set_cache_size(self.param["kernel_cache"] / 3)

        # border spetrum kernels
        size = self.param["kernel_cache"] / 3
        use_sign = False
        kernel_left = WeightedCommWordStringKernel(size, use_sign)
        kernel_right = WeightedCommWordStringKernel(size, use_sign)

        # assemble combined kernel
        kernel = CombinedKernel()
        kernel.append_kernel(kernel_center)
        kernel.append_kernel(kernel_left)
        kernel.append_kernel(kernel_right)

        ## building features
        feat = create_features(data, self.param["center_offset"],
                               self.param["center_pos"])

        # init combined kernel
        kernel.init(feat, feat)

        print "len(labels) = %i" % (len(labels))
        lab = BinaryLabels(numpy.double(labels))
        self.svm = SVMLight(self.param["cost"], kernel, lab)

        # show debugging output
        self.svm.io.enable_progress()
        self.svm.io.set_loglevel(MSG_DEBUG)

        # optimization settings
        num_threads = 2
        self.svm.parallel.set_num_threads(num_threads)
        self.svm.set_epsilon(10e-8)

        self.svm.train()

        return self
Exemple #7
0
def get_weighted_spectrum_kernel(subfeats_list, options):
    """build weighted spectrum kernel with non-redundant k-mer list (removing reverse complement)

	Arguments:
	subfeats_list -- list of sub-feature objects
	options -- object containing option data 

	Return:
	CombinedFeatures of StringWord(Ulong)Features, CombinedKernel of CommWord(Ulong)StringKernel 
	"""
    kmerlen = options.kmerlen
    kmerlen2 = options.kmerlen2

    subkernels = 0
    kernel = CombinedKernel()
    feats = CombinedFeatures()

    weights = []

    i = 0
    for subfeats in subfeats_list:
        feats.append_feature_obj(subfeats)

        combine_kcount = Counter()
        for i in xrange(subfeats.get_num_vectors()):
            fv = list(subfeats.get_feature_vector(i))
            combine_kcount += Counter(fv)
            number = len(combine_kcount)
            klen = kmerlen + i

    for k in xrange(kmerlen, kmerlen2 + 1):
        if k <= 8:
            subkernel = CommWordStringKernel(10, False)
        else:
            subkernel = CommUlongStringKernel(10, False)

        kernel.append_kernel(subkernel)
        subkernels += 1

    kernel.init(feats, feats)
    # here the weight for each k-mer is uniform
    '''
	subkernels = 8
	numpy.array([1 / float(subkernels)] * subkernels, numpy.dtype('float64'))
	array([ 0.125,  0.125,  0.125,  0.125,  0.125,  0.125,  0.125,  0.125])
	'''
    kernel.set_subkernel_weights(
        numpy.array([1 / float(subkernels)] * subkernels,
                    numpy.dtype('float64')))

    return kernel
def mkl_multiclass_modular(fm_train_real, fm_test_real, label_train_multiclass,
                           width, C, epsilon, num_threads, mkl_epsilon,
                           mkl_norm):

    from shogun.Features import CombinedFeatures, RealFeatures, MulticlassLabels
    from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
    from shogun.Classifier import MKLMulticlass

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = GaussianKernel(10, width)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = LinearKernel()
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = PolyKernel(10, 2)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = MulticlassLabels(label_train_multiclass)

    mkl = MKLMulticlass(C, kernel, labels)

    mkl.set_epsilon(epsilon)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(mkl_norm)

    mkl.train()

    kernel.init(feats_train, feats_test)

    out = mkl.apply().get_labels()
    return out
def mkl_multiclass_modular(fm_train_real, fm_test_real, label_train_multiclass,
                           width, C, epsilon, num_threads, mkl_epsilon,
                           mkl_norm):

    from shogun.Features import CombinedFeatures, RealFeatures, Labels
    from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
    from shogun.Classifier import MKLMultiClass

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = GaussianKernel(10, width)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = LinearKernel()
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = PolyKernel(10, 2)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = Labels(label_train_multiclass)

    mkl = MKLMultiClass(C, kernel, labels)

    mkl.set_epsilon(epsilon)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(mkl_norm)

    mkl.train()

    kernel.init(feats_train, feats_test)

    out = mkl.apply().get_labels()
    return out
Exemple #10
0
    def train(self, data, labels):
        """
        model training 
        """

        # centered WDK/WDK-shift
        if self.param["shifts"] == 0:
            kernel_center = WeightedDegreeStringKernel(self.param["degree"])
        else:
            kernel_center = WeightedDegreePositionStringKernel(10, self.param["degree"])
            shifts_vector = numpy.ones(self.param["center_offset"]*2, dtype=numpy.int32)*self.param["shifts"]
            kernel_center.set_shifts(shifts_vector)

        kernel_center.set_cache_size(self.param["kernel_cache"]/3)

        # border spetrum kernels
        size = self.param["kernel_cache"]/3
        use_sign = False
        kernel_left = WeightedCommWordStringKernel(size, use_sign)
        kernel_right = WeightedCommWordStringKernel(size, use_sign)
        
        # assemble combined kernel
        kernel = CombinedKernel()
        kernel.append_kernel(kernel_center)
        kernel.append_kernel(kernel_left)
        kernel.append_kernel(kernel_right)

        ## building features 
        feat = create_features(data, self.param["center_offset"], self.param["center_pos"])
        
        # init combined kernel
        kernel.init(feat, feat)

        print "len(labels) = %i" % (len(labels))
        lab = BinaryLabels(numpy.double(labels))
        self.svm = SVMLight(self.param["cost"], kernel, lab)

        # show debugging output
        self.svm.io.enable_progress()
        self.svm.io.set_loglevel(MSG_DEBUG)

        # optimization settings
        num_threads = 2
        self.svm.parallel.set_num_threads(num_threads)
        self.svm.set_epsilon(10e-8)

        self.svm.train()

        return self
def kernel_combined_custom_poly_modular(fm_train_real=traindat,
                                        fm_test_real=testdat,
                                        fm_label_twoclass=label_traindat):
    from shogun.Features import CombinedFeatures, RealFeatures, Labels
    from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
    from shogun.Classifier import LibSVM

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()

    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, tfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    subkfeats_train = RealFeatures(fm_train_real)
    feats_train.append_feature_obj(subkfeats_train)
    subkernel = PolyKernel(10, 2)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = Labels(fm_label_twoclass)
    svm = LibSVM(1.0, kernel, labels)
    svm.train()

    kernel = CombinedKernel()
    feats_pred = CombinedFeatures()

    pfeats = RealFeatures(fm_test_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, pfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    subkfeats_test = RealFeatures(fm_test_real)
    feats_pred.append_feature_obj(subkfeats_test)
    subkernel = PolyKernel(10, 2)
    kernel.append_kernel(subkernel)
    kernel.init(feats_train, feats_pred)

    svm.set_kernel(kernel)
    svm.classify()
    km_train = kernel.get_kernel_matrix()
    return km_train, kernel
def mkl_binclass_modular(fm_train_real=traindat,
                         fm_test_real=testdat,
                         fm_label_twoclass=label_traindat):

    sc1 = StringCharFeatures(strings1, RAWBYTE)
    sc2 = StringCharFeatures(strings2, RAWBYTE)

    sfeats1 = StringWordFeatures(RAWBYTE)
    sfeats1.obtain_from_char(sc1, 0, 2, 0, False)
    skernel1 = CommWordStringKernel(10, False)
    skernel1.init(sfeats1, sfeats1)
    sfeats2 = StringWordFeatures(RAWBYTE)
    sfeats2.obtain_from_char(sc2, 0, 2, 0, False)
    skernel2 = CommWordStringKernel(10, False)
    skernel2.init(sfeats2, sfeats2)

    ffeats = RealFeatures(traindat)
    fkernel = LinearKernel(ffeats, ffeats)

    fffeats = RealFeatures(traindat)
    ffkernel = GaussianKernel(fffeats, fffeats, 1.0)

    # COMBINING LINADD FEATURES/KERNELS LEAD TO FAIL
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(ffeats)
    #feats_train.append_feature_obj(fffeats)
    feats_train.append_feature_obj(sfeats2)

    print feats_train.get_num_vectors()

    kernel = CombinedKernel()
    kernel.append_kernel(fkernel)
    #kernel.append_kernel(ffkernel)
    kernel.append_kernel(skernel2)
    kernel.init(feats_train, feats_train)

    labels = RegressionLabels(fm_label_twoclass)
    mkl = MKLRegression()

    mkl.set_mkl_norm(1)  #2,3
    mkl.set_C(1, 1)
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)

    mkl.io.enable_file_and_line()
    mkl.io.set_loglevel(MSG_DEBUG)
def kernel_combined_custom_poly_modular(fm_train_real = traindat,fm_test_real = testdat,fm_label_twoclass=label_traindat):
    from shogun.Features import CombinedFeatures, RealFeatures, BinaryLabels
    from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
    from shogun.Classifier import LibSVM
   
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    
    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10,3)
    tkernel.init(tfeats, tfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))
        
    subkfeats_train = RealFeatures(fm_train_real)
    feats_train.append_feature_obj(subkfeats_train)
    subkernel = PolyKernel(10,2)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)
    
    labels = BinaryLabels(fm_label_twoclass)
    svm = LibSVM(1.0, kernel, labels)
    svm.train()

    kernel = CombinedKernel()
    feats_pred = CombinedFeatures()

    pfeats = RealFeatures(fm_test_real)
    tkernel = PolyKernel(10,3)
    tkernel.init(tfeats, pfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    subkfeats_test = RealFeatures(fm_test_real)
    feats_pred.append_feature_obj(subkfeats_test)
    subkernel = PolyKernel(10, 2)
    kernel.append_kernel(subkernel)
    kernel.init(feats_train, feats_pred)

    svm.set_kernel(kernel)
    svm.apply()
    km_train=kernel.get_kernel_matrix()
    return km_train,kernel
def create_combined_wd_kernel(instances, param):
    """
    creates a combined wd kernel object
    """

    num_features = len(instances[0])

    # contruct combined features
    kernel = CombinedKernel()
    
    for idx in range(num_features): 
        
        param.kernel = "WeightedDegreeStringKernel"
        
        tmp_kernel = create_empty_kernel(param)
        kernel.append_kernel(tmp_kernel)

    combined_features = create_combined_wd_features(instances, feat_type="dna")

    kernel.init(combined_features, combined_features)
    
    return kernel
def create_combined_wd_kernel(instances, param):
    """
    creates a combined wd kernel object
    """

    num_features = len(instances[0])

    # contruct combined features
    kernel = CombinedKernel()

    for idx in range(num_features):

        param.kernel = "WeightedDegreeStringKernel"

        tmp_kernel = create_empty_kernel(param)
        kernel.append_kernel(tmp_kernel)

    combined_features = create_combined_wd_features(instances, feat_type="dna")

    kernel.init(combined_features, combined_features)

    return kernel
def kernel_combined_modular(fm_train_real=traindat,
                            fm_test_real=testdat,
                            fm_train_dna=traindna,
                            fm_test_dna=testdna):
    from shogun.Kernel import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from shogun.Features import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = GaussianKernel(10, 1.1)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = StringCharFeatures(fm_train_dna, DNA)
    subkfeats_test = StringCharFeatures(fm_test_dna, DNA)
    degree = 3
    subkernel = FixedDegreeStringKernel(10, degree)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = StringCharFeatures(fm_train_dna, DNA)
    subkfeats_test = StringCharFeatures(fm_test_dna, DNA)
    subkernel = LocalAlignmentStringKernel(10)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
Exemple #17
0
def get_weighted_spectrum_kernel(subfeats_list, options):
    """build weighted spectrum kernel with non-redundant k-mer list (removing reverse complement)

	Arguments:
	subfeats_list -- list of sub-feature objects
	options -- object containing option data 

	Return:
	CombinedFeatures of StringWord(Ulong)Features, CombinedKernel of CommWord(Ulong)StringKernel 
	"""
    kmerlen = options.kmerlen
    kmerlen2 = options.kmerlen2

    subkernels = 0
    kernel = CombinedKernel()
    feats = CombinedFeatures()

    for subfeats in subfeats_list:
        feats.append_feature_obj(subfeats)

    for k in xrange(kmerlen, kmerlen2 + 1):
        if k <= 8:
            subkernel = CommWordStringKernel(10, False)
        else:
            subkernel = CommUlongStringKernel(10, False)

        kernel.append_kernel(subkernel)
        subkernels += 1

    kernel.init(feats, feats)

    kernel.set_subkernel_weights(
        numpy.array([1 / float(subkernels)] * subkernels,
                    numpy.dtype('float64')))

    return kernel
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
        # dict to save additional information for later analysis
        self.additional_information = {}
        
          
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)

                
        # create shogun label
        lab = shogun_factory.create_labels(data.labels)
        


        ########################################################
        print "creating a kernel for each node:"
        ########################################################


        # assemble combined kernel
        
        combined_kernel = CombinedKernel()
        
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
        
        
        base_features = shogun_factory.create_features(data.examples, param)
        
        combined_features = CombinedFeatures()
        
        
        
        
        ##################################################
        # intra-domain blocks (dirac kernel)
        
        
        intra_block_vec = PairiiVec()
        
        for task_id in data.get_task_ids():
            intra_block_vec.push_back(Pairii(task_id, task_id))
        
        
        
        # create mask-based normalizer
        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)        
        kernel = shogun_factory.create_empty_kernel(param)
        kernel.set_normalizer(normalizer)
        
        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel)
    
        # append features
        combined_features.append_feature_obj(base_features)

        print "------"
        
        ##################################################
        # all blocks (full kernel matrix)
        
        
        all_block_vec = PairiiVec()
        
        for task_id_1 in data.get_task_ids():
            for task_id_2 in data.get_task_ids():
                all_block_vec.push_back(Pairii(task_id_1, task_id_2))
                
        
        # create mask-based normalizer
        normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)        
        kernel_all = shogun_factory.create_empty_kernel(param)
        kernel_all.set_normalizer(normalizer_all)
                
        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel_all)
    
        # append features
        combined_features.append_feature_obj(base_features)

        
        ##################################################
        # hack
        
        
        #        hack_block_vec = PairiiVec()
        #        
        #        for task_id_1 in data.get_task_ids():
        #            for task_id_2 in data.get_task_ids():
        #                hack_block_vec.push_back(Pairii(task_id_1, task_id_2))
        #        
        #        hack_block_vec.push_back(Pairii(data.name_to_id("B_2705"), data.name_to_id("B_4001")))
        #        other_group = ["B_0702", "B_1501", "B_5801"]
        #        for task_id_1 in other_group:
        #            for task_id_2 in other_group:
        #                hack_block_vec.push_back(Pairii(data.name_to_id(task_id_1), data.name_to_id(task_id_2)))
        #        
        #        
        #        
        #        # create mask-based normalizer
        #        normalizer_hack = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, hack_block_vec)        
        #        kernel_hack = shogun_factory.create_empty_kernel(param)
        #        kernel_hack.set_normalizer(normalizer_hack)
        #                
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel_hack)
        #    
        #        # append features
        #        combined_features.append_feature_obj(base_features)
        
        
        
            
        ##################################################
        # init combined kernel
        
        combined_kernel.init(combined_features, combined_features)    
        
            
        #combined_kernel.precompute_subkernels()
        self.additional_information["mkl weights before"] = combined_kernel.get_subkernel_weights()
        
        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None
                
        
        print "using MKL:", (param.flags["mkl_q"] >= 1.0)
        
        if param.flags["mkl_q"] >= 1.0:
            
            svm = MKLClassification()
            
            svm.set_mkl_norm(param.flags["mkl_q"])
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)
        
        else:
            
            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)
            svm = SVMLight(param.cost, combined_kernel, lab)


        num_threads = 8
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
        
        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
    
        svm.set_epsilon(0.03)
        
        # set cost
        if param.flags["normalize_cost"]:
            
            norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
            norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
            svm.set_C(norm_c_neg, norm_c_pos)
            
        else:

            svm.set_C(param.cost, param.cost)
        
        svm.train()
    
    
        print "subkernel weights (after):", combined_kernel.get_subkernel_weights()

        ########################################################
        print "svm objective:"
        print svm.get_objective()
        
        
        self.additional_information["svm_objective"] = svm.get_objective()
        self.additional_information["svm num sv"] = svm.get_num_support_vectors()
        self.additional_information["mkl weights post-training"] = combined_kernel.get_subkernel_weights()
         
        ########################################################
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm, param)

        
        return svms
Exemple #19
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_empty_kernel(param)
        lab = shogun_factory.create_labels(data.labels)

        combined_kernel = CombinedKernel()
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
        base_features = shogun_factory.create_features(data.examples)
        combined_features = CombinedFeatures()

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        # load data
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
        f = file(
            "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt"
        )
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")

        num_lines = int(f.readline().strip())
        task_distances = numpy.zeros((num_lines, num_lines))
        name_to_id = {}
        for (i, line) in enumerate(f):
            tokens = line.strip().split("\t")
            name = str(tokens[0])
            name_to_id[name] = i
            entry = numpy.array([v for (j, v) in enumerate(tokens) if j != 0])
            assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (
                len(entry), num_lines)
            task_distances[i, :] = entry

        # cut relevant submatrix
        active_ids = [name_to_id[name] for name in data.get_task_names()]
        tmp_distances = task_distances[active_ids, :]
        tmp_distances = tmp_distances[:, active_ids]
        print "distances ", tmp_distances.shape

        # normalize distances
        task_distances = task_distances / numpy.max(tmp_distances)

        similarities = numpy.zeros(
            (data.get_num_tasks(), data.get_num_tasks()))

        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                # convert similarity with simple transformation
                similarity = param.base_similarity - task_distances[
                    name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

                # save for later
                similarities[data.name_to_id(task_name_lhs),
                             data.name_to_id(task_name_rhs)] = similarity

        # set normalizer
        base_wdk.set_normalizer(normalizer)
        #base_wdk.init_normalizer()

        combined_features.append_feature_obj(base_features)
        combined_kernel.append_kernel(base_wdk)

        ##################################################
        # intra-domain blocks

        intra_block_vec = PairiiVec()

        for task_id in data.get_task_ids():
            intra_block_vec.push_back(Pairii(task_id, task_id))

        # create mask-based normalizer
        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums,
                                                       intra_block_vec)
        kernel = shogun_factory.create_empty_kernel(param)
        kernel.set_normalizer(normalizer)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel)

        # append features
        combined_features.append_feature_obj(base_features)

        # set mixing factor (used if MKL is OFF)
        assert (param.base_similarity <= 1)
        assert (param.base_similarity >= 0)
        combined_kernel.set_subkernel_weights(
            [param.base_similarity, 1 - param.base_similarity])

        combined_kernel.init(combined_features, combined_features)

        svm = None

        print "using MKL:", (param.transform >= 1.0)

        if param.transform >= 1.0:

            svm = MKLClassification()

            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto

            svm.set_C(param.cost, param.cost)

            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)

        else:

            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)

            svm = SVMLight(param.cost, combined_kernel, lab)

        # set up SVM
        num_threads = 8
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        print "WARNING: custom epsilon set"
        svm.set_epsilon(0.05)

        # normalize cost
        norm_c_pos = param.cost / float(len([l
                                             for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(
            len([l for l in data.labels if l == -1]))

        svm.set_C(norm_c_neg, norm_c_pos)

        # start training
        svm.train()

        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        self.additional_information["similarities"] = similarities
        self.additional_information[
            "post_weights"] = combined_kernel.get_subkernel_weights()

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in data.get_task_names():

            task_num = data.name_to_id(task_name)

            # save svm and task_num
            svms[task_name] = (task_num, combined_kernel, svm)

        return svms
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        

        #numpy.random.seed(1337)
        numpy.random.seed(666)

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)

                
        # create shogun label
        lab = shogun_factory.create_labels(data.labels)


        # assemble combined kernel
        combined_kernel = CombinedKernel()
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_DEBUG)    
        # set kernel cache
        if param.flags.has_key("cache_size"):
            combined_kernel.set_cache_size(param.flags["cache_size"])
        

        # create features
        base_features = shogun_factory.create_features(data.examples)
        
        combined_features = CombinedFeatures()
        


        ########################################################
        print "creating a masked kernel for each node:"
        ########################################################
        

        # fetch taxonomy from parameter object
        taxonomy = param.taxonomy.data

        # create name to leaf map
        nodes = taxonomy.get_all_nodes()

        
        for node in nodes:
            
            print "creating kernel for ", node.name
            
            # fetch sub-tree
            active_task_ids = [data.name_to_id(leaf.name) for leaf in node.get_leaves()]
            
            print "masking all entries other than:", active_task_ids
            
        
            # create mask-based normalizer
            normalizer = MultitaskKernelMaskNormalizer(data.task_vector_nums, data.task_vector_nums, active_task_ids)
            
            # normalize trace
            if param.flags.has_key("normalize_trace") and param.flags["normalize_trace"]:
                norm_factor = len(node.get_leaves()) / len(active_task_ids)
                normalizer.set_normalization_constant(norm_factor)
            
            # create kernel
            kernel = shogun_factory.create_empty_kernel(param)
            kernel.set_normalizer(normalizer)
            
            
            # append current kernel to CombinedKernel
            combined_kernel.append_kernel(kernel)
        
            # append features
            combined_features.append_feature_obj(base_features)

            print "------"
        

        combined_kernel.init(combined_features, combined_features)                
        #combined_kernel.precompute_subkernels()
                
        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None
                        
        print "using MKL:", (param.flags["mkl_q"] >= 1.0)

        
        if param.flags["mkl_q"] >= 1.0:
            
            # set up MKL    
            svm = MKLClassification()

            # set the "q" in q-norm MKL
            svm.set_mkl_norm(param.flags["mkl_q"])
            
            # set interleaved optimization
            if param.flags.has_key("interleaved"):
                svm.set_interleaved_optimization_enabled(param.flags["interleaved"])
            
            # set solver type
            if param.flags.has_key("solver_type") and param.flags["solver_type"]:
                if param.flags["solver_type"] == "ST_CPLEX":
                    svm.set_solver_type(ST_CPLEX)
                if param.flags["solver_type"] == "ST_DIRECT":
                    svm.set_solver_type(ST_DIRECT)
                if param.flags["solver_type"] == "ST_NEWTON":
                    svm.set_solver_type(ST_NEWTON)
                if param.flags["solver_type"] == "ST_GLPK":
                    svm.set_solver_type(ST_GLPK)
            
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)
            
        else:
            # create vanilla SVM 
            svm = SVMLight(param.cost, combined_kernel, lab)


        # optimization settings
        num_threads = 4
        svm.parallel.set_num_threads(num_threads)
        
        if param.flags.has_key("epsilon"):
            svm.set_epsilon(param.flags["epsilon"])
        
        
        # enable output        
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
        
        
        # disable unsupported optimizations (due to special normalizer)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        
        
        # set cost
        if param.flags["normalize_cost"]:
            
            norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
            norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
            svm.set_C(norm_c_neg, norm_c_pos)
            
        else:
            
            svm.set_C(param.cost, param.cost)
        
        
        # start training
        svm.train()


        ########################################################
        print "svm objective:"
        print svm.get_objective()
        ########################################################
        
        # store additional info
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["weights"] = combined_kernel.get_subkernel_weights()
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), len(nodes), combined_kernel, svm)

        
        return svms
def mkl_binclass_modular(fm_train_real=traindat, fm_test_real=testdat, fm_label_twoclass=label_traindat):

    ##################################
    # set up and train

    # create some poly train/test matrix
    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, tfeats)
    K_train = tkernel.get_kernel_matrix()

    pfeats = RealFeatures(fm_test_real)
    tkernel.init(tfeats, pfeats)
    K_test = tkernel.get_kernel_matrix()

    # create combined train features
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(fm_train_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_train))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_train)

    # train mkl
    labels = BinaryLabels(fm_label_twoclass)
    mkl = MKLClassification()

    # which norm to use for MKL
    mkl.set_mkl_norm(1)  # 2,3

    # set cost (neg, pos)
    mkl.set_C(1, 1)

    # set kernel and labels
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)

    # train
    mkl.train()
    # w=kernel.get_subkernel_weights()
    # kernel.set_subkernel_weights(w)

    ##################################
    # test

    # create combined test features
    feats_pred = CombinedFeatures()
    feats_pred.append_feature_obj(RealFeatures(fm_test_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_test))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_pred)

    # and classify
    mkl.set_kernel(kernel)
    mkl.apply()
    return mkl.apply(), kernel
Exemple #22
0
class SignalSensor(object):
    """
    A collection of sensors
    """
    def __init__(self):
        self.sensors = list()
        self.kernel = CombinedKernel()
        self.svs = CombinedFeatures()
        self.svm = None
        self.window = (+100000, -1000000)

    def from_file(self, file):
        sys.stderr.write('loading model file')
        l = file.readline()

        if l != '%arts version: 1.0\n':
            sys.stderr.write("\nfile not an arts definition file\n")
            return None

        bias = None
        alphas = None
        num_kernels = None

        while l:
            # skip comment or empty line
            if not (l.startswith('%') or l.startswith('\n')):
                if bias is None: bias = parse_float(l, 'b')
                if alphas is None: alphas = parse_vector(l, file, 'alphas')
                if num_kernels is None:
                    num_kernels = parse_int(l, 'num_kernels')

                if num_kernels and bias and alphas is not None:
                    for i in xrange(num_kernels):
                        s = Sensor()
                        (k, f) = s.from_file(file, i + 1)
                        k.io.enable_progress()
                        self.window = (min(self.window[0], s.window[0]),
                                       max(self.window[1], s.window[2]))
                        self.sensors.append(s)
                        self.kernel.append_kernel(k)
                        self.svs.append_feature_obj(f)

                    self.kernel.init(self.svs, self.svs)
                    self.svm = KernelMachine(
                        self.kernel, alphas,
                        numpy.arange(len(alphas), dtype=numpy.int32), bias)
                    self.svm.io.set_target_to_stderr()
                    self.svm.io.enable_progress()
                    self.svm.parallel.set_num_threads(
                        self.svm.parallel.get_num_cpus())
                    sys.stderr.write('done\n')
                    return

            l = file.readline()

        sys.stderr.write('error loading model file\n')

    def predict(self, seq, chunk_size=int(10e6)):
        """
        predicts on whole contig, splits up sequence in chunks of size chunk_size
        """

        seq_len = len(seq)
        num_chunks = int(numpy.ceil(float(seq_len) / float(chunk_size)))
        assert (num_chunks > 0)

        sys.stderr.write("number of chunks for contig: %i\n" % (num_chunks))

        start = 0
        stop = min(chunk_size, seq_len)

        out = []

        # iterate over chunks
        for chunk_idx in range(num_chunks):

            sys.stderr.write("processing chunk #%i\n" % (chunk_idx))

            assert (start < stop)
            chunk = seq[start:stop]

            assert (len(self.sensors) > 0)
            tf = CombinedFeatures()
            for i in xrange(len(self.sensors)):
                f = self.sensors[i].get_test_features(chunk, self.window)
                tf.append_feature_obj(f)

            sys.stderr.write("initialising kernel...")
            self.kernel.init(self.svs, tf)
            sys.stderr.write("..done\n")

            self.svm.set_kernel(self.kernel)
            lab_out = self.svm.apply().get_values()

            assert (len(lab_out) > 0)
            out.extend(lab_out)

            # increment chunk
            start = stop
            stop = min(stop + chunk_size, seq_len)

        l = (-self.window[0]) * [-42]
        r = self.window[1] * [-42]

        # concatenate
        ret = l + out + r

        assert (len(ret) == len(seq))

        return ret
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
          
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)
        
        # create shogun label
        lab = shogun_factory.create_labels(data.labels)
        


        ########################################################
        print "creating a kernel for each node:"
        ########################################################


        # assemble combined kernel
        
        combined_kernel = CombinedKernel()
        
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
        
        
        base_features = shogun_factory.create_features(data.examples)
        
        combined_features = CombinedFeatures()
        
        
        
        
        ##################################################
        # intra-domain blocks
        
        
        #        intra_block_vec = PairiiVec()
        #        
        #        for task_id in data.get_task_ids():
        #            intra_block_vec.push_back(Pairii(task_id, task_id))
        #        
        #        
        #        
        #        # create mask-based normalizer
        #        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)        
        #        kernel = shogun_factory.create_empty_kernel(param)
        #        kernel.set_normalizer(normalizer)
        #        
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel)
        #    
        #        # append features
        #        combined_features.append_feature_obj(base_features)
        #
        #        print "------"
        #        
        #        ##################################################
        #        # all blocks
        #        
        #        
        #        all_block_vec = PairiiVec()
        #        
        #        for task_id_1 in data.get_task_ids():
        #            for task_id_2 in data.get_task_ids():
        #                all_block_vec.push_back(Pairii(task_id_1, task_id_2))
        #                
        #        
        #        # create mask-based normalizer
        #        normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)        
        #        kernel_all = shogun_factory.create_empty_kernel(param)
        #        kernel_all.set_normalizer(normalizer_all)
        #                
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel_all)
        #    
        #        # append features
        #        combined_features.append_feature_obj(base_features)

        
        ##################################################
        # add one kernel per similarity position
        
        
        # init seq handler 
        pseudoseqs = SequencesHandler()
        pseudoseq_length = pseudoseqs.seq_length


        for pos in range(pseudoseq_length):
            
            print "appending kernel for pos %i" % (pos)
        
            print "nums", data.task_vector_nums

    
            pos_block_vec = PairiiVec()
    
            # set similarity
            for task_name_lhs in data.get_task_names():
                for task_name_rhs in data.get_task_names():
                    
                    similarity = pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pos)
                    #print "computing similarity for tasks (%s, %s) = %i" % (task_name_lhs, task_name_rhs, similarity)
                    
                    if similarity == 1:                    
                        tmp_pair = Pairii(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs))
                        pos_block_vec.push_back(tmp_pair)

            print "creating normalizer"
            normalizer_pos = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, pos_block_vec)   

            print "creating empty kernel"
            kernel_pos = shogun_factory.create_empty_kernel(param)
            
            print "setting normalizer"
            kernel_pos.set_normalizer(normalizer_pos)
                
            print "appending kernel"
            # append current kernel to CombinedKernel
            combined_kernel.append_kernel(kernel_pos)
    
            print "appending features"
            # append features
            combined_features.append_feature_obj(base_features)
        
        
        print "done constructing combined kernel"
        
        ##################################################
        # init combined kernel
        
        combined_kernel.init(combined_features, combined_features)    
        
            

                
        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None
                
        
        print "using MKL:", (param.transform >= 1.0)
        
        if param.transform >= 1.0:
            
            svm = MKLClassification()
            
            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto
        
            svm.set_C(param.cost, param.cost)
            
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)
            
                
        else:
            
            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)
            
            svm = SVMLight(param.cost, combined_kernel, lab)


        # set up SVM
        num_threads = 8
        svm.io.enable_progress()
        #svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
        
        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        
        print "WARNING: custom epsilon set"
        svm.set_epsilon(0.05)    
        
        # normalize cost
        norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
        
        svm.set_C(norm_c_neg, norm_c_pos)
        
        
        # start training
        svm.train()
    
        
        # save additional info
        self.additional_information["svm_objective"] = svm.get_objective()
        self.additional_information["svm num sv"] = svm.get_num_support_vectors()
        self.additional_information["mkl weights post-training"] = combined_kernel.get_subkernel_weights()
        
        print self.additional_information 
        
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm)

        
        return svms
Exemple #24
0
class SignalSensor(object):
    """
    A collection of sensors
    """
    def __init__(self):
        self.sensors = list()
        self.kernel = CombinedKernel()
        self.svs = CombinedFeatures()
        self.svm = None
        self.window = (+100000, -1000000)

    def from_file(self, file):
        sys.stderr.write('loading model file')
        l = file.readline();

        if l != '%arts version: 1.0\n':
            sys.stderr.write("\nfile not an arts definition file\n")
            return None

        bias = None
        alphas = None
        num_kernels = None

        while l:
            # skip comment or empty line
            if not (l.startswith('%') or l.startswith('\n')):
                if bias is None: bias = parse_float(l, 'b')
                if alphas is None: alphas = parse_vector(l, file, 'alphas')
                if num_kernels is None: num_kernels = parse_int(l, 'num_kernels')

                if num_kernels and bias and alphas is not None:
                    for i in xrange(num_kernels):
                        s = Sensor()
                        (k, f) = s.from_file(file, i + 1)
                        k.io.enable_progress()
                        self.window = (min(self.window[0], s.window[0]),
                                max(self.window[1], s.window[2]))
                        self.sensors.append(s)
                        self.kernel.append_kernel(k)
                        self.svs.append_feature_obj(f)

                    self.kernel.init(self.svs, self.svs)
                    self.svm = SVM(self.kernel, alphas,
                            numpy.arange(len(alphas), dtype=numpy.int32), bias)
                    self.svm.io.set_target_to_stderr()
                    self.svm.io.enable_progress()
                    self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus())
                    sys.stderr.write('done\n')
                    return

            l = file.readline()

        sys.stderr.write('error loading model file\n')


    def predict(self, seq, chunk_size = int(10e6)):
        """
        predicts on whole contig, splits up sequence in chunks of size chunk_size
        """

        seq_len = len(seq)
        num_chunks = int(numpy.ceil(float(seq_len) / float(chunk_size)))
        assert(num_chunks > 0)

    	sys.stderr.write("number of chunks for contig: %i\n" % (num_chunks))

        start = 0
        stop = min(chunk_size, seq_len)
		
        out = []

        # iterate over chunks
        for chunk_idx in range(num_chunks):

            sys.stderr.write("processing chunk #%i\n" % (chunk_idx))

            assert (start < stop)
            chunk = seq[start:stop]

            assert(len(self.sensors) > 0)
            tf = CombinedFeatures()
            for i in xrange(len(self.sensors)):
                f = self.sensors[i].get_test_features(chunk, self.window)
                tf.append_feature_obj(f)

            sys.stderr.write("initialising kernel...")
            self.kernel.init(self.svs, tf)
            sys.stderr.write("..done\n")

            lab_out = self.svm.apply()

            # work around problem with get_labels()
            tmp_out = [lab_out.get_label(idx) for idx in range(0, lab_out.get_num_labels())]
            assert(len(tmp_out) > 0)
            out.extend(tmp_out)

            print "len out", len(out)

            # increment chunk
            start = stop
            stop = min(stop+chunk_size, seq_len)


        l = (-self.window[0]) * [-42]
        r = self.window[1] * [-42]

        # concatenate
        ret = l + out + r

        assert(len(ret) == len(seq))

        return ret
Exemple #25
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        import numpy
        numpy.random.seed(666)

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)

        # create shogun label
        lab = shogun_factory.create_labels(data.labels)

        # assemble combined kernel
        combined_kernel = CombinedKernel()
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_DEBUG)
        # set kernel cache
        if param.flags.has_key("cache_size"):
            combined_kernel.set_cache_size(param.flags["cache_size"])

        # create features
        base_features = shogun_factory.create_features(data.examples, param)

        combined_features = CombinedFeatures()

        ########################################################
        print "creating a masked kernel for possible subset:"
        ########################################################

        power_set_tasks = power_set(data.get_task_ids())

        for active_task_ids in power_set_tasks:

            print "masking all entries other than:", active_task_ids

            # create mask-based normalizer
            normalizer = MultitaskKernelMaskNormalizer(data.task_vector_nums,
                                                       data.task_vector_nums,
                                                       active_task_ids)

            # normalize trace
            if param.flags.has_key(
                    "normalize_trace") and param.flags["normalize_trace"]:
                norm_factor = len(data.get_task_ids()) / len(active_task_ids)
                normalizer.set_normalization_constant(norm_factor)

            kernel = shogun_factory.create_empty_kernel(param)
            kernel.set_normalizer(normalizer)

            # append current kernel to CombinedKernel
            combined_kernel.append_kernel(kernel)

            # append features
            combined_features.append_feature_obj(base_features)

            print "------"

        combined_kernel.init(combined_features, combined_features)

        #combined_kernel.precompute_subkernels()

        self.additional_information[
            "weights before trainng"] = combined_kernel.get_subkernel_weights(
            )
        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None

        print "using MKL:", (param.flags["mkl_q"] >= 1.0)

        if param.flags["mkl_q"] >= 1.0:

            svm = MKLClassification()

            svm.set_mkl_norm(param.flags["mkl_q"])

            # set interleaved optimization
            if param.flags.has_key("interleaved"):
                svm.set_interleaved_optimization_enabled(
                    param.flags["interleaved"])

            # set solver type
            if param.flags.has_key(
                    "solver_type") and param.flags["solver_type"]:
                if param.flags["solver_type"] == "ST_CPLEX":
                    svm.set_solver_type(ST_CPLEX)
                if param.flags["solver_type"] == "ST_DIRECT":
                    svm.set_solver_type(ST_DIRECT)
                if param.flags["solver_type"] == "ST_NEWTON":
                    svm.set_solver_type(ST_NEWTON)
                if param.flags["solver_type"] == "ST_GLPK":
                    svm.set_solver_type(ST_GLPK)

            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)

        else:

            svm = SVMLight(param.cost, combined_kernel, lab)

        # optimization settings
        num_threads = 4
        svm.parallel.set_num_threads(num_threads)

        if param.flags.has_key("epsilon"):
            svm.set_epsilon(param.flags["epsilon"])

        # enable output
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

        # disable unsupported optimizations (due to special normalizer)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        # set cost
        if param.flags["normalize_cost"]:

            norm_c_pos = param.cost / float(
                len([l for l in data.labels if l == 1]))
            norm_c_neg = param.cost / float(
                len([l for l in data.labels if l == -1]))
            svm.set_C(norm_c_neg, norm_c_pos)

        else:

            svm.set_C(param.cost, param.cost)

        svm.train()

        # prepare mapping
        weight_map = {}
        weights = combined_kernel.get_subkernel_weights()
        for (i, pset) in enumerate(power_set_tasks):
            print pset
            subset_str = str([data.id_to_name(task_idx) for task_idx in pset])
            weight_map[subset_str] = weights[i]

        # store additional info
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["weight_map"] = weight_map

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name),
                               len(power_set_tasks), combined_kernel, svm,
                               param)

        return svms
Exemple #26
0
def training_run(options):
    """Conduct a training run and return a trained SVM kernel"""
    settings = MotifFinderSettings(kirmes_ini.MOTIF_LENGTH,
                                   options.window_width, options.replace)
    positives = MotifFinder(finder_settings=settings)
    positives.setFastaFile(options.positives)
    positives.setMotifs(options.pgff)
    pmotifs, ppositions = positives.getResults()
    negatives = MotifFinder(finder_settings=settings)
    negatives.setFastaFile(options.negatives)
    negatives.setMotifs(options.ngff)
    nmotifs, npositions = negatives.getResults()

    wds_kparams = kirmes_ini.WDS_KERNEL_PARAMETERS
    wds_svm = EasySVM.EasySVM(wds_kparams)
    num_positives = len(pmotifs.values()[0])
    num_negatives = len(nmotifs.values()[0])
    #Creating Kernel Objects
    kernel = CombinedKernel()
    features = CombinedFeatures()
    kernel_array = []
    motifs = pmotifs.keys()
    motifs.sort()
    #Adding Kmer Kernels
    for motif in motifs:
        all_examples = pmotifs[motif] + nmotifs[motif]
        motif_features = wds_svm.createFeatures(all_examples)
        wds_kernel = WeightedDegreePositionStringKernel(motif_features, motif_features, \
                                                        wds_kparams['degree'])
        wds_kernel.set_shifts(wds_kparams['shift'] *
                              ones(wds_kparams['seqlength'], dtype=int32))
        features.append_feature_obj(motif_features)
        kernel_array.append(wds_kernel)
        kernel.append_kernel(wds_kernel)
    rbf_svm = EasySVM.EasySVM(kirmes_ini.RBF_KERNEL_PARAMETERS)
    positions = array(ppositions + npositions, dtype=float64).T
    position_features = rbf_svm.createFeatures(positions)
    features.append_feature_obj(position_features)
    motif_labels = append(ones(num_positives), -ones(num_negatives))
    complete_labels = Labels(motif_labels)
    rbf_kernel = GaussianKernel(position_features, position_features, \
                                kirmes_ini.RBF_KERNEL_PARAMETERS['width'])
    kernel_array.append(rbf_kernel)
    kernel.append_kernel(rbf_kernel)
    #Kernel init
    kernel.init(features, features)
    kernel.set_cache_size(kirmes_ini.K_CACHE_SIZE)
    svm = LibSVM(kirmes_ini.K_COMBINED_C, kernel, complete_labels)
    svm.parallel.set_num_threads(kirmes_ini.K_NUM_THREADS)
    #Training
    svm.train()
    if not os.path.exists(options.output_path):
        os.mkdir(options.output_path)
    html = {}
    if options.contrib:
        html["contrib"] = contrib(svm, kernel, motif_labels, kernel_array,
                                  motifs)
    if options.logos:
        html["poims"] = poims(svm, kernel, kernel_array, motifs,
                              options.output_path)
    if options.query:
        html["query"] = evaluate(options, svm, kernel, features, motifs)
    htmlize(html, options.output_html)
# features_train = RealFeatures(train_feats.T)
# features_test = RealFeatures(test_feats.T)
# labels_train = BinaryLabels(train_label.T)
# labels_test = BinaryLabels(test_label.T)
epsilon = 0.001
C = 100000

combined_kernel = CombinedKernel()

#gauss_kernel_1 = GaussianKernel(features_train, features_train, 15)
gauss_kernel_1 = GaussianKernel(features_train, features_train, 10)
gauss_kernel_2 = GaussianKernel(features_train, features_train, 1.0)

combined_kernel.append_kernel(gauss_kernel_1)
combined_kernel.append_kernel(gauss_kernel_2)
combined_kernel.init(features_train, features_train)


libsvm = LibSVM()
svm = MKLClassification(libsvm)
svm.set_interleaved_optimization_enabled(False)
svm.set_kernel(combined_kernel)
svm.set_labels(labels_train)

svm.train()

beta = combined_kernel.get_subkernel_weights()
alpha = svm.get_alphas()

combined_kernel.init(features_train, features_test)
labels_predict = svm.apply()
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
          
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)
        
        # create shogun label
        lab = shogun_factory.create_labels(data.labels)
        


        ##################################################
        # define pockets
        ##################################################
        
        pockets = [0]*9
        
        pockets[0] = [1, 5, 6, 7, 8, 31, 32, 33, 34]
        pockets[1] = [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31]
        pockets[2] = [11, 20, 21, 22, 29, 31]
        pockets[3] = [8, 30, 31, 32]
        pockets[4] = [10, 11, 30]
        pockets[5] = [10, 11, 12, 13, 20, 29]
        pockets[6] = [10, 12, 20, 22, 26, 27, 28, 29]
        pockets[7] = [12, 14, 15, 26]
        pockets[8] = [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26]
        

        #new_pockets = []
        
        # merge neighboring pockets
        #for i in range(8):
        #    new_pockets.append(list(set(pockets[i]).union(set(pockets[i+1]))))
            
        #pockets = new_pockets
        
        
        ########################################################
        print "creating a kernel:"
        ########################################################


        # assemble combined kernel
        
        combined_kernel = CombinedKernel()
        
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
        
        
        base_features = shogun_factory.create_features(data.examples)
        
        combined_features = CombinedFeatures()
        
        
        
        ##################################################
        # intra-domain blocks
        
        
        #        intra_block_vec = PairiiVec()
        #        
        #        for task_id in data.get_task_ids():
        #            intra_block_vec.push_back(Pairii(task_id, task_id))
        #        
        #        
        #        
        #        # create mask-based normalizer
        #        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)        
        #        kernel = shogun_factory.create_empty_kernel(param)
        #        kernel.set_normalizer(normalizer)
        #        
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel)
        #    
        #        # append features
        #        combined_features.append_feature_obj(base_features)
        #
        #        print "------"
        #        
        #        ##################################################
        #        # all blocks
        #        
        #        
        #        all_block_vec = PairiiVec()
        #        
        #        for task_id_1 in data.get_task_ids():
        #            for task_id_2 in data.get_task_ids():
        #                all_block_vec.push_back(Pairii(task_id_1, task_id_2))
        #                
        #        
        #        # create mask-based normalizer
        #        normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)        
        #        kernel_all = shogun_factory.create_empty_kernel(param)
        #        kernel_all.set_normalizer(normalizer_all)
        #                
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel_all)
        #    
        #        # append features
        #        combined_features.append_feature_obj(base_features)

        
        ##################################################
        # add one kernel per similarity position
        
        
        # init seq handler 
        pseudoseqs = SequencesHandler()



        for pocket in pockets:

            print "creating normalizer"
            #import pdb
            #pdb.set_trace()
            
            normalizer = MultitaskKernelNormalizer(data.task_vector_nums)
            
            print "processing pocket", pocket


            # set similarity
            for task_name_lhs in data.get_task_names():
                for task_name_rhs in data.get_task_names():
                    
                    similarity = 0.0
                    
                    for pseudo_seq_pos in pocket:
                        similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos-1))
                    
                    # normalize
                    similarity = similarity / float(len(pocket))
                    
                    print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)
                    
                    normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)
               

            print "creating empty kernel"
            kernel_pos = shogun_factory.create_empty_kernel(param)
            
            print "setting normalizer"
            kernel_pos.set_normalizer(normalizer)
                
            print "appending kernel"
            # append current kernel to CombinedKernel
            combined_kernel.append_kernel(kernel_pos)
    
            print "appending features"
            # append features
            combined_features.append_feature_obj(base_features)

        
        
        print "done constructing combined kernel"
        
        ##################################################
        # init combined kernel

        
        # init weights
        # combined_kernel.set_subkernel_weights([1.0/2.85]*combined_kernel.get_num_subkernels())
        
        
        combined_kernel.init(combined_features, combined_features)    
        
        

                
        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None
                
        
        print "using MKL:", (param.transform >= 1.0)
        
        if param.transform >= 1.0:
            
            svm = MKLClassification()
            
            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto
        
            svm.set_C(param.cost, param.cost)
            
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)
            
                
        else:
            
            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)
            
            svm = SVMLight(param.cost, combined_kernel, lab)


        # set up SVM
        num_threads = 8
        svm.io.enable_progress()
        #svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
        
        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        
        #print "WARNING: custom epsilon set"
        #svm.set_epsilon(0.05)    
        
        # normalize cost
        norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
        
        svm.set_C(norm_c_neg, norm_c_pos)
        
        
        # start training
        svm.train()
    
        
        # save additional info
        self.additional_information["svm_objective"] = svm.get_objective()
        self.additional_information["svm num sv"] = svm.get_num_support_vectors()
        self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()
        
        print self.additional_information 
        
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm)

        
        return svms
Exemple #29
0
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))

labels = Labels(trainlab)

# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(GaussianKernel(10, 2.0))
kernel.append_kernel(GaussianKernel(10, 0.25))
kernel.append_kernel(GaussianKernel(10, 0.062))
kernel.append_kernel(GaussianKernel(10, 8.0))
kernel.append_kernel(GaussianKernel(10, 10.0))
kernel.init(feats_train, feats_train)

# Create a classifier
classifier=MKLClassification(LibSVM())
classifier.set_interleaved_optimization_enabled(False)
classifier.set_kernel(kernel)
classifier.set_labels(labels)
classifier.set_C(1, 1)

param_tree_root=ModelSelectionParameters()

# () C1 parameter to the tree
c1=ModelSelectionParameters("C1"); 
c1.build_values(-4.0, 4.0, R_EXP);
param_tree_root.append_child(c1)
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # dict to save additional information for later analysis
        self.additional_information = {}

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)

        # create shogun label
        lab = shogun_factory.create_labels(data.labels)

        ########################################################
        print "creating a kernel for each node:"
        ########################################################

        # assemble combined kernel

        combined_kernel = CombinedKernel()

        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

        base_features = shogun_factory.create_features(data.examples, param)

        combined_features = CombinedFeatures()

        ##################################################
        # intra-domain blocks (dirac kernel)

        intra_block_vec = PairiiVec()

        for task_id in data.get_task_ids():
            intra_block_vec.push_back(Pairii(task_id, task_id))

        # create mask-based normalizer
        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums,
                                                       intra_block_vec)
        kernel = shogun_factory.create_empty_kernel(param)
        kernel.set_normalizer(normalizer)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel)

        # append features
        combined_features.append_feature_obj(base_features)

        print "------"

        ##################################################
        # all blocks (full kernel matrix)

        all_block_vec = PairiiVec()

        for task_id_1 in data.get_task_ids():
            for task_id_2 in data.get_task_ids():
                all_block_vec.push_back(Pairii(task_id_1, task_id_2))

        # create mask-based normalizer
        normalizer_all = MultitaskKernelMaskPairNormalizer(
            data.task_vector_nums, all_block_vec)
        kernel_all = shogun_factory.create_empty_kernel(param)
        kernel_all.set_normalizer(normalizer_all)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel_all)

        # append features
        combined_features.append_feature_obj(base_features)

        ##################################################
        # hack

        #        hack_block_vec = PairiiVec()
        #
        #        for task_id_1 in data.get_task_ids():
        #            for task_id_2 in data.get_task_ids():
        #                hack_block_vec.push_back(Pairii(task_id_1, task_id_2))
        #
        #        hack_block_vec.push_back(Pairii(data.name_to_id("B_2705"), data.name_to_id("B_4001")))
        #        other_group = ["B_0702", "B_1501", "B_5801"]
        #        for task_id_1 in other_group:
        #            for task_id_2 in other_group:
        #                hack_block_vec.push_back(Pairii(data.name_to_id(task_id_1), data.name_to_id(task_id_2)))
        #
        #
        #
        #        # create mask-based normalizer
        #        normalizer_hack = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, hack_block_vec)
        #        kernel_hack = shogun_factory.create_empty_kernel(param)
        #        kernel_hack.set_normalizer(normalizer_hack)
        #
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel_hack)
        #
        #        # append features
        #        combined_features.append_feature_obj(base_features)

        ##################################################
        # init combined kernel

        combined_kernel.init(combined_features, combined_features)

        #combined_kernel.precompute_subkernels()
        self.additional_information[
            "mkl weights before"] = combined_kernel.get_subkernel_weights()

        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None

        print "using MKL:", (param.flags["mkl_q"] >= 1.0)

        if param.flags["mkl_q"] >= 1.0:

            svm = MKLClassification()

            svm.set_mkl_norm(param.flags["mkl_q"])
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)

        else:

            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)
            svm = SVMLight(param.cost, combined_kernel, lab)

        num_threads = 8
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_INFO)

        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        svm.set_epsilon(0.03)

        # set cost
        if param.flags["normalize_cost"]:

            norm_c_pos = param.cost / float(
                len([l for l in data.labels if l == 1]))
            norm_c_neg = param.cost / float(
                len([l for l in data.labels if l == -1]))
            svm.set_C(norm_c_neg, norm_c_pos)

        else:

            svm.set_C(param.cost, param.cost)

        svm.train()

        print "subkernel weights (after):", combined_kernel.get_subkernel_weights(
        )

        ########################################################
        print "svm objective:"
        print svm.get_objective()

        self.additional_information["svm_objective"] = svm.get_objective()
        self.additional_information[
            "svm num sv"] = svm.get_num_support_vectors()
        self.additional_information[
            "mkl weights post-training"] = combined_kernel.get_subkernel_weights(
            )

        ########################################################

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), combined_kernel,
                               svm, param)

        return svms
def mkl_binclass_modular(fm_train_real=traindat,
                         fm_test_real=testdat,
                         fm_label_twoclass=label_traindat):

    ##################################
    # set up and train

    # create some poly train/test matrix
    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, tfeats)
    K_train = tkernel.get_kernel_matrix()

    pfeats = RealFeatures(fm_test_real)
    tkernel.init(tfeats, pfeats)
    K_test = tkernel.get_kernel_matrix()

    # create combined train features
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(fm_train_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_train))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_train)

    # train mkl
    labels = BinaryLabels(fm_label_twoclass)
    mkl = MKLClassification()

    # which norm to use for MKL
    mkl.set_mkl_norm(1)  #2,3

    # set cost (neg, pos)
    mkl.set_C(1, 1)

    # set kernel and labels
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)

    # train
    mkl.train()
    #w=kernel.get_subkernel_weights()
    #kernel.set_subkernel_weights(w)

    ##################################
    # test

    # create combined test features
    feats_pred = CombinedFeatures()
    feats_pred.append_feature_obj(RealFeatures(fm_test_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_test))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_pred)

    # and classify
    mkl.set_kernel(kernel)
    mkl.apply()
    return mkl.apply(), kernel
Exemple #32
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)

        # create shogun label
        lab = shogun_factory.create_labels(data.labels)

        ##################################################
        # define pockets
        ##################################################

        pockets = [0] * 9

        pockets[0] = [1, 5, 6, 7, 8, 31, 32, 33, 34]
        pockets[1] = [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31]
        pockets[2] = [11, 20, 21, 22, 29, 31]
        pockets[3] = [8, 30, 31, 32]
        pockets[4] = [10, 11, 30]
        pockets[5] = [10, 11, 12, 13, 20, 29]
        pockets[6] = [10, 12, 20, 22, 26, 27, 28, 29]
        pockets[7] = [12, 14, 15, 26]
        pockets[8] = [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26]

        #new_pockets = []

        # merge neighboring pockets
        #for i in range(8):
        #    new_pockets.append(list(set(pockets[i]).union(set(pockets[i+1]))))

        #pockets = new_pockets

        ########################################################
        print "creating a kernel:"
        ########################################################

        # assemble combined kernel

        combined_kernel = CombinedKernel()

        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

        base_features = shogun_factory.create_features(data.examples)

        combined_features = CombinedFeatures()

        ##################################################
        # intra-domain blocks

        #        intra_block_vec = PairiiVec()
        #
        #        for task_id in data.get_task_ids():
        #            intra_block_vec.push_back(Pairii(task_id, task_id))
        #
        #
        #
        #        # create mask-based normalizer
        #        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)
        #        kernel = shogun_factory.create_empty_kernel(param)
        #        kernel.set_normalizer(normalizer)
        #
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel)
        #
        #        # append features
        #        combined_features.append_feature_obj(base_features)
        #
        #        print "------"
        #
        #        ##################################################
        #        # all blocks
        #
        #
        #        all_block_vec = PairiiVec()
        #
        #        for task_id_1 in data.get_task_ids():
        #            for task_id_2 in data.get_task_ids():
        #                all_block_vec.push_back(Pairii(task_id_1, task_id_2))
        #
        #
        #        # create mask-based normalizer
        #        normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)
        #        kernel_all = shogun_factory.create_empty_kernel(param)
        #        kernel_all.set_normalizer(normalizer_all)
        #
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel_all)
        #
        #        # append features
        #        combined_features.append_feature_obj(base_features)

        ##################################################
        # add one kernel per similarity position

        # init seq handler
        pseudoseqs = SequencesHandler()

        for pocket in pockets:

            print "creating normalizer"
            #import pdb
            #pdb.set_trace()

            normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

            print "processing pocket", pocket

            # set similarity
            for task_name_lhs in data.get_task_names():
                for task_name_rhs in data.get_task_names():

                    similarity = 0.0

                    for pseudo_seq_pos in pocket:
                        similarity += float(
                            pseudoseqs.get_similarity(task_name_lhs,
                                                      task_name_rhs,
                                                      pseudo_seq_pos - 1))

                    # normalize
                    similarity = similarity / float(len(pocket))

                    print "pocket %s (%s, %s) = %f" % (
                        str(pocket), task_name_lhs, task_name_rhs, similarity)

                    normalizer.set_task_similarity(
                        data.name_to_id(task_name_lhs),
                        data.name_to_id(task_name_rhs), similarity)

            print "creating empty kernel"
            kernel_pos = shogun_factory.create_empty_kernel(param)

            print "setting normalizer"
            kernel_pos.set_normalizer(normalizer)

            print "appending kernel"
            # append current kernel to CombinedKernel
            combined_kernel.append_kernel(kernel_pos)

            print "appending features"
            # append features
            combined_features.append_feature_obj(base_features)

        print "done constructing combined kernel"

        ##################################################
        # init combined kernel

        # init weights
        # combined_kernel.set_subkernel_weights([1.0/2.85]*combined_kernel.get_num_subkernels())

        combined_kernel.init(combined_features, combined_features)

        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None

        print "using MKL:", (param.transform >= 1.0)

        if param.transform >= 1.0:

            svm = MKLClassification()

            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto

            svm.set_C(param.cost, param.cost)

            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)

        else:

            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)

            svm = SVMLight(param.cost, combined_kernel, lab)

        # set up SVM
        num_threads = 8
        svm.io.enable_progress()
        #svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        #print "WARNING: custom epsilon set"
        #svm.set_epsilon(0.05)

        # normalize cost
        norm_c_pos = param.cost / float(len([l
                                             for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(
            len([l for l in data.labels if l == -1]))

        svm.set_C(norm_c_neg, norm_c_pos)

        # start training
        svm.train()

        # save additional info
        self.additional_information["svm_objective"] = svm.get_objective()
        self.additional_information[
            "svm num sv"] = svm.get_num_support_vectors()
        self.additional_information[
            "post_weights"] = combined_kernel.get_subkernel_weights()

        print self.additional_information

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), combined_kernel,
                               svm)

        return svms