コード例 #1
0
def evaluation_cross_validation_multiclass_storage(traindat=traindat, label_traindat=label_traindat):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import CrossValidationPrintOutput
    from shogun.Evaluation import CrossValidationMKLStorage, CrossValidationMulticlassStorage
    from shogun.Evaluation import MulticlassAccuracy, F1Measure
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.Features import MulticlassLabels
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Classifier import MKLMulticlass
    from shogun.Mathematics import Statistics, MSG_DEBUG

    # training data, combined features all on same data
    features=RealFeatures(traindat)
    comb_features=CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels=MulticlassLabels(label_traindat)
    
    # kernel, different Gaussians combined
    kernel=CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create mkl using libsvm, due to a mem-bug, interleaved is not possible
    svm=MKLMulticlass(1.0,kernel,labels);
    svm.set_kernel(kernel);

    # splitting strategy for 5 fold cross-validation (for classification its better
    # to use "StratifiedCrossValidation", but the standard
    # "StratifiedCrossValidationSplitting" is also available
    splitting_strategy=StratifiedCrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium=MulticlassAccuracy()

    # cross-validation instance
    cross_validation=CrossValidation(svm, comb_features, labels,
        splitting_strategy, evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross vlaidation output classes
    #cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    #mkl_storage=CrossValidationMKLStorage()
    #cross_validation.add_cross_validation_output(mkl_storage)
    multiclass_storage=CrossValidationMulticlassStorage()
    multiclass_storage.append_binary_evaluation(F1Measure())
    cross_validation.add_cross_validation_output(multiclass_storage)
    cross_validation.set_num_runs(3)
    
    # perform cross-validation
    result=cross_validation.evaluate()

    roc_0_0_0 = multiclass_storage.get_fold_ROC(0,0,0)
    #print roc_0_0_0
    auc_0_0_0 = multiclass_storage.get_fold_evaluation_result(0,0,0,0)
    #print auc_0_0_0
    return roc_0_0_0, auc_0_0_0
コード例 #2
0
def create_combined_wd_features(instances, feat_type):
    """
    creates a combined wd feature object
    """

    num_features = len(instances[0])

    # contruct combined features
    feat = CombinedFeatures()

    for idx in range(num_features):

        # cut column idx
        data = [instance[idx] for instance in instances]

        seq_len = len(data[0])
        for seq in data:
            if len(seq) != seq_len:
                print "warning, seq lengths differ", len(
                    seq), seq_len, "in", idx, "num_feat", num_features

        tmp_feat = get_wd_features(data, feat_type)
        feat.append_feature_obj(tmp_feat)

    return feat
コード例 #3
0
ファイル: signal_sensor.py プロジェクト: theoryno3/shogun
    def predict(self, seq, chunk_size=int(10e6)):
        """
        predicts on whole contig, splits up sequence in chunks of size chunk_size
        """

        seq_len = len(seq)
        num_chunks = int(numpy.ceil(float(seq_len) / float(chunk_size)))
        assert (num_chunks > 0)

        sys.stderr.write("number of chunks for contig: %i\n" % (num_chunks))

        start = 0
        stop = min(chunk_size, seq_len)

        out = []

        # iterate over chunks
        for chunk_idx in range(num_chunks):

            sys.stderr.write("processing chunk #%i\n" % (chunk_idx))

            assert (start < stop)
            chunk = seq[start:stop]

            assert (len(self.sensors) > 0)
            tf = CombinedFeatures()
            for i in xrange(len(self.sensors)):
                f = self.sensors[i].get_test_features(chunk, self.window)
                tf.append_feature_obj(f)

            sys.stderr.write("initialising kernel...")
            self.kernel.init(self.svs, tf)
            sys.stderr.write("..done\n")

            self.svm.set_kernel(self.kernel)
            lab_out = self.svm.apply()

            # work around problem with get_labels()
            tmp_out = [
                lab_out.get_label(idx)
                for idx in range(0, lab_out.get_num_labels())
            ]
            assert (len(tmp_out) > 0)
            out.extend(tmp_out)

            print "len out", len(out)

            # increment chunk
            start = stop
            stop = min(stop + chunk_size, seq_len)

        l = (-self.window[0]) * [-42]
        r = self.window[1] * [-42]

        # concatenate
        ret = l + out + r

        assert (len(ret) == len(seq))

        return ret
コード例 #4
0
ファイル: kirmes.py プロジェクト: veniciusgrjr/oqtans_tools
def evaluate(options, svm, kernel, features, motifs):
    """Evaluate examples using a trained kernel"""
    query = MotifFinder(finder_settings=MotifFinderSettings(kirmes_ini.MOTIF_LENGTH, options.window_width))
    query.setFastaFile(options.query)
    query.setMotifs(options.qgff)
    qmotifs, qpositions = query.getResults()
    feats_query = CombinedFeatures()
    wds_svm = EasySVM.EasySVM(kirmes_ini.WDS_KERNEL_PARAMETERS)
    try:
        assert set(qmotifs.keys()).issuperset(set(motifs))
    except AssertionError:
        print "The motif positions in the query sequence are incomplete, there are no positions for:"
        print set(motifs).difference(qmotifs.keys())
        raise
    for motif in motifs:
        feats_query.append_feature_obj(wds_svm.createFeatures(qmotifs[motif]))
    query_positions = array(qpositions, dtype=float64)
    query_positions = query_positions.T
    rbf_svm = EasySVM.EasySVM(kirmes_ini.RBF_KERNEL_PARAMETERS)
    feats_query.append_feature_obj(rbf_svm.createFeatures(query_positions))
    kernel.init(features, feats_query)
    out = svm.classify().get_labels()
    qgenes = query.getGenes()
    ret_str = ""
    print "#example\toutput\tsplit"
    for i in xrange(len(out)):
        if out[i] >= 0:
            classif = "\tpositive\t"
        else:
            classif = "\tnegative\t"
        ret_str += qgenes[i] + classif + str(out[i]) + "\n"
        print str(i) + "\t" + str(out[i]) + "\t0"
    return ret_str
コード例 #5
0
    def _predict(self, predictor, examples, task_id):
        """
        make prediction on examples using trained predictor

        @param predictor: trained predictor
        @type predictor: dict<str, tuple<SVM, int> >
        @param examples: list of examples
        @type examples: list<str> 
        @param task_id: task identifier
        @type task_id: str
        """

        (task_num, combined_kernel, svm) = predictor

        # shogun data
        base_feat = shogun_factory.create_features(examples)
        feat = CombinedFeatures()
        feat.append_feature_obj(base_feat)
        feat.append_feature_obj(base_feat)

        # update normalizers
        normalizer = combined_kernel.get_kernel(0).get_normalizer()
        normalizer = KernelNormalizerToMultitaskKernelNormalizer(normalizer)
        normalizer.set_task_vector_rhs([task_num] * len(examples))

        normalizer_dirac = combined_kernel.get_kernel(1).get_normalizer()
        normalizer_dirac = KernelNormalizerToMultitaskKernelMaskPairNormalizer(
            normalizer_dirac)
        normalizer_dirac.set_task_vector_rhs([task_num] * len(examples))

        # predict
        out = svm.classify(feat).get_labels()

        return out
コード例 #6
0
def create_combined_wd_features(instances, feat_type):
    """
    creates a combined wd feature object
    """

    num_features = len(instances[0])
    
    # contruct combined features
    feat = CombinedFeatures()
        
    for idx in range(num_features): 
    
        # cut column idx
        data = [instance[idx] for instance in instances]
    
        seq_len = len(data[0])
        for seq in data:
            if len(seq) != seq_len:
                print "warning, seq lengths differ", len(seq), seq_len, "in", idx, "num_feat", num_features
    
        tmp_feat = get_wd_features(data, feat_type)
        feat.append_feature_obj(tmp_feat)

    
    return feat
コード例 #7
0
def get_weighted_spectrum_kernel(subfeats_list, options):
	"""build weighted spectrum kernel with non-redundant k-mer list (removing reverse complement)

	Arguments:
	subfeats_list -- list of sub-feature objects
	options -- object containing option data 

	Return:
	CombinedFeatures of StringWord(Ulong)Features, CombinedKernel of CommWord(Ulong)StringKernel 
	"""
	kmerlen = options.kmerlen
	kmerlen2 = options.kmerlen2

	subkernels = 0
	kernel = CombinedKernel()
	feats = CombinedFeatures()

	for subfeats in subfeats_list:
		feats.append_feature_obj(subfeats)

	for k in xrange(kmerlen, kmerlen2+1):
		if k <= 8:
			subkernel = CommWordStringKernel(10, False)
		else:
			subkernel = CommUlongStringKernel(10, False)

		kernel.append_kernel(subkernel)
		subkernels+=1

	kernel.init(feats, feats)

	kernel.set_subkernel_weights(numpy.array([1/float(subkernels)]*subkernels, numpy.dtype('float64')))

	return kernel
コード例 #8
0
ファイル: signal_sensor.py プロジェクト: AlexBinder/shogun
    def predict(self, seq, chunk_size = int(10e6)):
        """
        predicts on whole contig, splits up sequence in chunks of size chunk_size
        """

        seq_len = len(seq)
        num_chunks = int(numpy.ceil(float(seq_len) / float(chunk_size)))
        assert(num_chunks > 0)

    	sys.stderr.write("number of chunks for contig: %i\n" % (num_chunks))

        start = 0
        stop = min(chunk_size, seq_len)
		
        out = []

        # iterate over chunks
        for chunk_idx in range(num_chunks):

            sys.stderr.write("processing chunk #%i\n" % (chunk_idx))

            assert (start < stop)
            chunk = seq[start:stop]

            assert(len(self.sensors) > 0)
            tf = CombinedFeatures()
            for i in xrange(len(self.sensors)):
                f = self.sensors[i].get_test_features(chunk, self.window)
                tf.append_feature_obj(f)

            sys.stderr.write("initialising kernel...")
            self.kernel.init(self.svs, tf)
            sys.stderr.write("..done\n")

            self.svm.set_kernel(self.kernel)
            lab_out = self.svm.apply()

            # work around problem with get_labels()
            tmp_out = [lab_out.get_label(idx) for idx in range(0, lab_out.get_num_labels())]
            assert(len(tmp_out) > 0)
            out.extend(tmp_out)

            print "len out", len(out)

            # increment chunk
            start = stop
            stop = min(stop+chunk_size, seq_len)


        l = (-self.window[0]) * [-42]
        r = self.window[1] * [-42]

        # concatenate
        ret = l + out + r

        assert(len(ret) == len(seq))

        return ret
コード例 #9
0
ファイル: kirmes.py プロジェクト: veniciusgrjr/oqtans_tools
def training_run(options):
    """Conduct a training run and return a trained SVM kernel"""
    settings = MotifFinderSettings(kirmes_ini.MOTIF_LENGTH, options.window_width, options.replace)
    positives = MotifFinder(finder_settings=settings)
    positives.setFastaFile(options.positives)
    positives.setMotifs(options.pgff)
    pmotifs, ppositions = positives.getResults()
    negatives = MotifFinder(finder_settings=settings)
    negatives.setFastaFile(options.negatives)
    negatives.setMotifs(options.ngff)
    nmotifs, npositions = negatives.getResults()

    wds_kparams = kirmes_ini.WDS_KERNEL_PARAMETERS
    wds_svm = EasySVM.EasySVM(wds_kparams)
    num_positives = len(pmotifs.values()[0])
    num_negatives = len(nmotifs.values()[0])
    # Creating Kernel Objects
    kernel = CombinedKernel()
    features = CombinedFeatures()
    kernel_array = []
    motifs = pmotifs.keys()
    motifs.sort()
    # Adding Kmer Kernels
    for motif in motifs:
        all_examples = pmotifs[motif] + nmotifs[motif]
        motif_features = wds_svm.createFeatures(all_examples)
        wds_kernel = WeightedDegreePositionStringKernel(motif_features, motif_features, wds_kparams["degree"])
        wds_kernel.set_shifts(wds_kparams["shift"] * ones(wds_kparams["seqlength"], dtype=int32))
        features.append_feature_obj(motif_features)
        kernel_array.append(wds_kernel)
        kernel.append_kernel(wds_kernel)
    rbf_svm = EasySVM.EasySVM(kirmes_ini.RBF_KERNEL_PARAMETERS)
    positions = array(ppositions + npositions, dtype=float64).T
    position_features = rbf_svm.createFeatures(positions)
    features.append_feature_obj(position_features)
    motif_labels = append(ones(num_positives), -ones(num_negatives))
    complete_labels = Labels(motif_labels)
    rbf_kernel = GaussianKernel(position_features, position_features, kirmes_ini.RBF_KERNEL_PARAMETERS["width"])
    kernel_array.append(rbf_kernel)
    kernel.append_kernel(rbf_kernel)
    # Kernel init
    kernel.init(features, features)
    kernel.set_cache_size(kirmes_ini.K_CACHE_SIZE)
    svm = LibSVM(kirmes_ini.K_COMBINED_C, kernel, complete_labels)
    svm.parallel.set_num_threads(kirmes_ini.K_NUM_THREADS)
    # Training
    svm.train()
    if not os.path.exists(options.output_path):
        os.mkdir(options.output_path)
    html = {}
    if options.contrib:
        html["contrib"] = contrib(svm, kernel, motif_labels, kernel_array, motifs)
    if options.logos:
        html["poims"] = poims(svm, kernel, kernel_array, motifs, options.output_path)
    if options.query:
        html["query"] = evaluate(options, svm, kernel, features, motifs)
    htmlize(html, options.output_html)
コード例 #10
0
ファイル: kernel.py プロジェクト: manantomar/test
def _run_combined ():
	"""Run Combined kernel."""

	kern=kernel.CombinedKernel()
	feats={'train': CombinedFeatures(), 'test': CombinedFeatures()}
	output={}
	params={
		'name': 'Combined',
		'accuracy': 1e-7
	}
	subkdata=[
		{
			'name': 'FixedDegreeString',
			'feature_class': 'string',
			'feature_type': 'Char',
			'args': {'key': ('size', 'degree'), 'val': (10, 3)}
		},
		{
			'name': 'PolyMatchString',
			'feature_class': 'string',
			'feature_type': 'Char',
			'args': {
				'key': ('size', 'degree', 'inhomogene'),
				'val': (10, 3, True)
			}
		},
		{
			'name': 'LocalAlignmentString',
			'feature_class': 'string',
			'feature_type': 'Char',
			'args': {'key': ('size',), 'val': (10,)}
		}
	]

	i=0
	for sd in subkdata:
		kfun=eval('kernel.'+sd['name']+'Kernel')
		subk=kfun(*sd['args']['val'])
		sd['data']=dataop.get_dna()
		subkfeats=featop.get_features(
			sd['feature_class'], sd['feature_type'], sd['data'])
		output.update(
			fileop.get_output(category.KERNEL, sd, 'subkernel'+str(i)+'_'))

		kern.append_kernel(subk)
		feats['train'].append_feature_obj(subkfeats['train'])
		feats['test'].append_feature_obj(subkfeats['test'])

		i+=1

	output.update(fileop.get_output(category.KERNEL, params))
	kern.init(feats['train'], feats['train'])
	output['kernel_matrix_train']=kern.get_kernel_matrix()
	kern.init(feats['train'], feats['test'])
	output['kernel_matrix_test']=kern.get_kernel_matrix()

	fileop.write(category.KERNEL, output)
コード例 #11
0
def statistics_linear_time_mmd_kernel_choice():
	from shogun.Features import RealFeatures, CombinedFeatures
	from shogun.Kernel import GaussianKernel, CombinedKernel
	from shogun.Statistics import LinearTimeMMD
	from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN

	# note that the linear time statistic is designed for much larger datasets
	n=50000
	dim=5
	difference=2

	# data is standard normal distributed. only one dimension of Y has a mean
	# shift of difference
	(X,Y)=gen_data.create_mean_data(n,dim,difference)
	
	# concatenate since MMD class takes data as one feature object
	# (it is possible to give two, but then data is copied)
	Z=concatenate((X,Y), axis=1)
	print "dimension means of X", [mean(x) for x in X]
	print "dimension means of Y", [mean(x) for x in Y]

	# create kernels/features to choose from
	# here: just a bunch of Gaussian Kernels with different widths
	# real sigmas are 2^-5, ..., 2^10
	sigmas=array([pow(2,x) for x in range(-5,10)])
	
	# shogun has a different parametrization of the Gaussian kernel
	shogun_sigmas=array([x*x*2 for x in sigmas])
	
	# We will use multiple kernels
	kernel=CombinedKernel()
	
	# two separate feature objects here, could also be one with appended data
	features=CombinedFeatures()
	
	# all kernels work on same features
	for i in range(len(sigmas)):
		kernel.append_kernel(GaussianKernel(10, shogun_sigmas[i]))
		features.append_feature_obj(RealFeatures(Z))
	
	mmd=LinearTimeMMD(kernel,features, n)
	
	print "start learning kernel weights"
	mmd.set_opt_regularization_eps(10E-5)
	mmd.set_opt_low_cut(10E-5)
	mmd.set_opt_max_iterations(1000)
	mmd.set_opt_epsilon(10E-7)
	mmd.optimize_kernel_weights()
	weights=kernel.get_subkernel_weights()
	print "learned weights:", weights
	#pyplot.plot(array(range(len(sigmas))), weights)
	#pyplot.show()
	print "index of max weight", weights.argmax()
コード例 #12
0
def statistics_linear_time_mmd_kernel_choice():
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Features import MeanShiftRealDataGenerator
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN

    # note that the linear time statistic is designed for much larger datasets
    n = 50000
    dim = 5
    difference = 2

    # use data generator class to produce example data
    # in pratice, this generate data function could be replaced by a method
    # that obtains data from a stream
    data = DataGenerator.generate_mean_data(n, dim, difference)

    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n:2 * n + 1].T)

    # create kernels/features to choose from
    # here: just a bunch of Gaussian Kernels with different widths
    # real sigmas are 2^-5, ..., 2^10
    sigmas = array([pow(2, x) for x in range(-5, 10)])

    # shogun has a different parametrization of the Gaussian kernel
    shogun_sigmas = array([x * x * 2 for x in sigmas])

    # We will use multiple kernels
    kernel = CombinedKernel()

    # two separate feature objects here, could also be one with appended data
    features = CombinedFeatures()

    # all kernels work on same features
    for i in range(len(sigmas)):
        kernel.append_kernel(GaussianKernel(10, shogun_sigmas[i]))
        features.append_feature_obj(RealFeatures(data))

    mmd = LinearTimeMMD(kernel, features, n)

    print "start learning kernel weights"
    mmd.set_opt_regularization_eps(10E-5)
    mmd.set_opt_low_cut(10E-5)
    mmd.set_opt_max_iterations(1000)
    mmd.set_opt_epsilon(10E-7)
    mmd.optimize_kernel_weights()
    weights = kernel.get_subkernel_weights()
    print "learned weights:", weights
    #pyplot.plot(array(range(len(sigmas))), weights)
    #pyplot.show()
    print "index of max weight", weights.argmax()
コード例 #13
0
def statistics_linear_time_mmd_kernel_choice():
	from shogun.Features import RealFeatures, CombinedFeatures
	from shogun.Features import DataGenerator
	from shogun.Kernel import GaussianKernel, CombinedKernel
	from shogun.Statistics import LinearTimeMMD
	from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN

	# note that the linear time statistic is designed for much larger datasets
	n=50000
	dim=5
	difference=2

	# use data generator class to produce example data
	# in pratice, this generate data function could be replaced by a method
	# that obtains data from a stream
	data=DataGenerator.generate_mean_data(n,dim,difference)
	
	print "dimension means of X", mean(data.T[0:n].T)
	print "dimension means of Y", mean(data.T[n:2*n+1].T)

	# create kernels/features to choose from
	# here: just a bunch of Gaussian Kernels with different widths
	# real sigmas are 2^-5, ..., 2^10
	sigmas=array([pow(2,x) for x in range(-5,10)])
	
	# shogun has a different parametrization of the Gaussian kernel
	shogun_sigmas=array([x*x*2 for x in sigmas])
	
	# We will use multiple kernels
	kernel=CombinedKernel()
	
	# two separate feature objects here, could also be one with appended data
	features=CombinedFeatures()
	
	# all kernels work on same features
	for i in range(len(sigmas)):
		kernel.append_kernel(GaussianKernel(10, shogun_sigmas[i]))
		features.append_feature_obj(RealFeatures(data))
	
	mmd=LinearTimeMMD(kernel,features, n)
	
	print "start learning kernel weights"
	mmd.set_opt_regularization_eps(10E-5)
	mmd.set_opt_low_cut(10E-5)
	mmd.set_opt_max_iterations(1000)
	mmd.set_opt_epsilon(10E-7)
	mmd.optimize_kernel_weights()
	weights=kernel.get_subkernel_weights()
	print "learned weights:", weights
	#pyplot.plot(array(range(len(sigmas))), weights)
	#pyplot.show()
	print "index of max weight", weights.argmax()
コード例 #14
0
def get_weighted_spectrum_kernel(subfeats_list, options):
    """build weighted spectrum kernel with non-redundant k-mer list (removing reverse complement)

	Arguments:
	subfeats_list -- list of sub-feature objects
	options -- object containing option data 

	Return:
	CombinedFeatures of StringWord(Ulong)Features, CombinedKernel of CommWord(Ulong)StringKernel 
	"""
    kmerlen = options.kmerlen
    kmerlen2 = options.kmerlen2

    subkernels = 0
    kernel = CombinedKernel()
    feats = CombinedFeatures()

    weights = []

    i = 0
    for subfeats in subfeats_list:
        feats.append_feature_obj(subfeats)

        combine_kcount = Counter()
        for i in xrange(subfeats.get_num_vectors()):
            fv = list(subfeats.get_feature_vector(i))
            combine_kcount += Counter(fv)
            number = len(combine_kcount)
            klen = kmerlen + i

    for k in xrange(kmerlen, kmerlen2 + 1):
        if k <= 8:
            subkernel = CommWordStringKernel(10, False)
        else:
            subkernel = CommUlongStringKernel(10, False)

        kernel.append_kernel(subkernel)
        subkernels += 1

    kernel.init(feats, feats)
    # here the weight for each k-mer is uniform
    '''
	subkernels = 8
	numpy.array([1 / float(subkernels)] * subkernels, numpy.dtype('float64'))
	array([ 0.125,  0.125,  0.125,  0.125,  0.125,  0.125,  0.125,  0.125])
	'''
    kernel.set_subkernel_weights(
        numpy.array([1 / float(subkernels)] * subkernels,
                    numpy.dtype('float64')))

    return kernel
コード例 #15
0
def create_promoter_features(data, param):
    """
    creates promoter combined features
    
    @param examples:
    @param param:
    """

    print "creating promoter features"

    (center, left, right) = split_data_promoter(data, param["center_offset"],
                                                param["center_pos"])

    # set up base features
    feat_center = StringCharFeatures(DNA)
    feat_center.set_features(center)
    feat_left = get_spectrum_features(left)
    feat_right = get_spectrum_features(right)

    # construct combined features
    feat = CombinedFeatures()
    feat.append_feature_obj(feat_center)
    feat.append_feature_obj(feat_left)
    feat.append_feature_obj(feat_right)

    return feat
コード例 #16
0
def create_promoter_features(data, param):
    """
    creates promoter combined features
    
    @param examples:
    @param param:
    """

    print "creating promoter features"

    (center, left, right) = split_data_promoter(data, param["center_offset"], param["center_pos"])

    # set up base features
    feat_center = StringCharFeatures(DNA)
    feat_center.set_features(center)
    feat_left = get_spectrum_features(left)
    feat_right = get_spectrum_features(right)

    # construct combined features
    feat = CombinedFeatures()
    feat.append_feature_obj(feat_center)
    feat.append_feature_obj(feat_left)
    feat.append_feature_obj(feat_right)

    return feat
コード例 #17
0
    def _predict(self, predictor, examples, task_name):
        """
        make prediction on examples using trained predictor

        @param predictor: trained predictor (task_id, num_nodes, combined_kernel, predictor)
        @type predictor: tuple<int, int, CombinedKernel, SVM>
        @param examples: list of examples
        @type examples: list<object>
        @param task_name: task name
        @type task_name: str
        """

        (task_id, combined_kernel, svm, param) = predictor

        # shogun data
        base_feat = shogun_factory.create_features(examples, param)
                
        # construct combined kernel
        feat = CombinedFeatures()
        
        for i in xrange(combined_kernel.get_num_subkernels()):
            feat.append_feature_obj(base_feat)

            # fetch kernel normalizer
            normalizer = combined_kernel.get_kernel(i).get_normalizer()
            
            # cast using dedicated SWIG-helper function
            normalizer = KernelNormalizerToMultitaskKernelMaskPairNormalizer(normalizer)
            
            # set task vector
            normalizer.set_task_vector_rhs([task_id]*len(examples))


        combined_kernel = svm.get_kernel()
        combined_kernel.init(combined_kernel.get_lhs(), feat)
        
        # predict
        out = svm.classify().get_labels()

        # predict
        #out = svm.classify(feat).get_labels()
        
        
        return out
    def _predict(self, predictor, examples, task_name):
        """
        make prediction on examples using trained predictor

        @param predictor: trained predictor (task_id, num_nodes, combined_kernel, predictor)
        @type predictor: tuple<int, int, CombinedKernel, SVM>
        @param examples: list of examples
        @type examples: list<object>
        @param task_name: task name
        @type task_name: str
        """

        (task_id, combined_kernel, svm, param) = predictor

        # shogun data
        base_feat = shogun_factory.create_features(examples, param)

        # construct combined kernel
        feat = CombinedFeatures()

        for i in xrange(combined_kernel.get_num_subkernels()):
            feat.append_feature_obj(base_feat)

            # fetch kernel normalizer
            normalizer = combined_kernel.get_kernel(i).get_normalizer()

            # cast using dedicated SWIG-helper function
            normalizer = KernelNormalizerToMultitaskKernelMaskPairNormalizer(
                normalizer)

            # set task vector
            normalizer.set_task_vector_rhs([task_id] * len(examples))

        combined_kernel = svm.get_kernel()
        combined_kernel.init(combined_kernel.get_lhs(), feat)

        # predict
        out = svm.classify().get_labels()

        # predict
        #out = svm.classify(feat).get_labels()

        return out
コード例 #19
0
def create_combined_spectrum_features(instances, feat_type):
    """
    creates a combined spectrum feature object
    """

    num_features = len(instances[0])

    # contruct combined features
    feat = CombinedFeatures()

    for idx in range(num_features):

        # cut column idx
        data = [instance[idx] for instance in instances]

        tmp_feat = get_spectrum_features(data, feat_type)
        feat.append_features(tmp_feat)

    return feat
コード例 #20
0
def construct_features(features):
    """
    makes a list
    """

    feat_all = [inst for inst in features]
    feat_lhs = [inst[0:15] for inst in features]
    feat_rhs = [inst[15:] for inst in features]

    feat_wd = get_wd_features(feat_all)
    feat_spec_1 = get_spectrum_features(feat_lhs, order=3)
    feat_spec_2 = get_spectrum_features(feat_rhs, order=3)

    feat_comb = CombinedFeatures()
    feat_comb.append_feature_obj(feat_wd)
    feat_comb.append_feature_obj(feat_spec_1)
    feat_comb.append_feature_obj(feat_spec_2)

    return feat_comb
コード例 #21
0
def create_combined_spectrum_features(instances, feat_type):
    """
    creates a combined spectrum feature object
    """

    num_features = len(instances[0])
    
    # contruct combined features
    feat = CombinedFeatures()
        
    for idx in range(num_features): 
    
        # cut column idx
        data = [instance[idx] for instance in instances]
    
        tmp_feat = get_spectrum_features(data, feat_type)
        feat.append_features(tmp_feat)

    
    return feat
コード例 #22
0
ファイル: kmersvm_train.py プロジェクト: aasoni/igm-research
def init_weighted_spectrum_kernel(kern, subfeats_list_lhs, subfeats_list_rhs):
    """initialize weighted spectrum kernel (wrapper function)
	"""
    feats_lhs = CombinedFeatures()
    feats_rhs = CombinedFeatures()

    for subfeats in subfeats_list_lhs:
        feats_lhs.append_feature_obj(subfeats)

    for subfeats in subfeats_list_rhs:
        feats_rhs.append_feature_obj(subfeats)

    kern.init(feats_lhs, feats_rhs)
コード例 #23
0
def evaluate(options, svm, kernel, features, motifs):
    """Evaluate examples using a trained kernel"""
    query = MotifFinder(finder_settings=MotifFinderSettings(
        kirmes_ini.MOTIF_LENGTH, options.window_width))
    query.setFastaFile(options.query)
    query.setMotifs(options.qgff)
    qmotifs, qpositions = query.getResults()
    feats_query = CombinedFeatures()
    wds_svm = EasySVM.EasySVM(kirmes_ini.WDS_KERNEL_PARAMETERS)
    try:
        assert set(qmotifs.keys()).issuperset(set(motifs))
    except AssertionError:
        print "The motif positions in the query sequence are incomplete, there are no positions for:"
        print set(motifs).difference(qmotifs.keys())
        raise
    for motif in motifs:
        feats_query.append_feature_obj(wds_svm.createFeatures(qmotifs[motif]))
    query_positions = array(qpositions, dtype=float64)
    query_positions = query_positions.T
    rbf_svm = EasySVM.EasySVM(kirmes_ini.RBF_KERNEL_PARAMETERS)
    feats_query.append_feature_obj(rbf_svm.createFeatures(query_positions))
    kernel.init(features, feats_query)
    out = svm.classify().get_labels()
    qgenes = query.getGenes()
    ret_str = ""
    print "#example\toutput\tsplit"
    for i in xrange(len(out)):
        if out[i] >= 0:
            classif = "\tpositive\t"
        else:
            classif = "\tnegative\t"
        ret_str += qgenes[i] + classif + str(out[i]) + "\n"
        print str(i) + "\t" + str(out[i]) + "\t0"
    return ret_str
コード例 #24
0
def mkl_binclass_modular (train_data, testdata, train_labels, test_labels, d1, d2):
        # create some Gaussian train/test matrix
    	tfeats = RealFeatures(train_data)
    	tkernel = GaussianKernel(128, d1)
    	tkernel.init(tfeats, tfeats)
    	K_train = tkernel.get_kernel_matrix()

    	pfeats = RealFeatures(test_data)
    	tkernel.init(tfeats, pfeats)
    	K_test = tkernel.get_kernel_matrix()

    	# create combined train features
    	feats_train = CombinedFeatures()
    	feats_train.append_feature_obj(RealFeatures(train_data))

    	# and corresponding combined kernel
    	kernel = CombinedKernel()
    	kernel.append_kernel(CustomKernel(K_train))
    	kernel.append_kernel(GaussianKernel(128, d2))
    	kernel.init(feats_train, feats_train)

    	# train mkl
    	labels = Labels(train_labels)
    	mkl = MKLClassification()
	
        # not to use svmlight
        mkl.set_interleaved_optimization_enabled(0)

    	# which norm to use for MKL
    	mkl.set_mkl_norm(2)

    	# set cost (neg, pos)
    	mkl.set_C(1, 1)

    	# set kernel and labels
    	mkl.set_kernel(kernel)
    	mkl.set_labels(labels)

    	# train
    	mkl.train()

    	# test
	# create combined test features
    	feats_pred = CombinedFeatures()
    	feats_pred.append_feature_obj(RealFeatures(test_data))

    	# and corresponding combined kernel
    	kernel = CombinedKernel()
    	kernel.append_kernel(CustomKernel(K_test))
    	kernel.append_kernel(GaussianKernel(128, d2))
    	kernel.init(feats_train, feats_pred)

	# and classify
    	mkl.set_kernel(kernel)
    	output = mkl.apply().get_labels()
	output = [1.0 if i>0 else -1.0 for i in output]
	accu = len(where(output == test_labels)[0]) / float(len(output))
	return accu
コード例 #25
0
ファイル: kmersvm_train.py プロジェクト: aasoni/igm-research
def get_weighted_spectrum_kernel(subfeats_list, options):
    """build weighted spectrum kernel with non-redundant k-mer list (removing reverse complement)

	Arguments:
	subfeats_list -- list of sub-feature objects
	options -- object containing option data 

	Return:
	CombinedFeatures of StringWord(Ulong)Features, CombinedKernel of CommWord(Ulong)StringKernel 
	"""
    kmerlen = options.kmerlen
    kmerlen2 = options.kmerlen2

    subkernels = 0
    kernel = CombinedKernel()
    feats = CombinedFeatures()

    for subfeats in subfeats_list:
        feats.append_feature_obj(subfeats)

    for k in xrange(kmerlen, kmerlen2 + 1):
        if k <= 8:
            subkernel = CommWordStringKernel(10, False)
        else:
            subkernel = CommUlongStringKernel(10, False)

        kernel.append_kernel(subkernel)
        subkernels += 1

    kernel.init(feats, feats)

    kernel.set_subkernel_weights(
        numpy.array([1 / float(subkernels)] * subkernels,
                    numpy.dtype('float64')))

    return kernel
コード例 #26
0
def kernel_combined_custom_poly_modular(fm_train_real=traindat,
                                        fm_test_real=testdat,
                                        fm_label_twoclass=label_traindat):
    from shogun.Features import CombinedFeatures, RealFeatures, Labels
    from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
    from shogun.Classifier import LibSVM

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()

    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, tfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    subkfeats_train = RealFeatures(fm_train_real)
    feats_train.append_feature_obj(subkfeats_train)
    subkernel = PolyKernel(10, 2)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = Labels(fm_label_twoclass)
    svm = LibSVM(1.0, kernel, labels)
    svm.train()

    kernel = CombinedKernel()
    feats_pred = CombinedFeatures()

    pfeats = RealFeatures(fm_test_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, pfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    subkfeats_test = RealFeatures(fm_test_real)
    feats_pred.append_feature_obj(subkfeats_test)
    subkernel = PolyKernel(10, 2)
    kernel.append_kernel(subkernel)
    kernel.init(feats_train, feats_pred)

    svm.set_kernel(kernel)
    svm.classify()
    km_train = kernel.get_kernel_matrix()
    return km_train, kernel
コード例 #27
0
def evaluation_cross_validation_mkl_weight_storage(traindat=traindat, label_traindat=label_traindat):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import CrossValidationPrintOutput
    from shogun.Evaluation import CrossValidationMKLStorage
    from shogun.Evaluation import ContingencyTableEvaluation, ACCURACY
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.Features import BinaryLabels
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Classifier import LibSVM, MKLClassification
    from shogun.Mathematics import Statistics

    # training data, combined features all on same data
    features=RealFeatures(traindat)
    comb_features=CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels=BinaryLabels(label_traindat)
    
    # kernel, different Gaussians combined
    kernel=CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create mkl using libsvm, due to a mem-bug, interleaved is not possible
    svm=MKLClassification(LibSVM());
    svm.set_interleaved_optimization_enabled(False);
    svm.set_kernel(kernel);

    # splitting strategy for 5 fold cross-validation (for classification its better
    # to use "StratifiedCrossValidation", but the standard
    # "StratifiedCrossValidationSplitting" is also available
    splitting_strategy=StratifiedCrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium=ContingencyTableEvaluation(ACCURACY)

    # cross-validation instance
    cross_validation=CrossValidation(svm, comb_features, labels,
        splitting_strategy, evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross vlaidation output classes
    #cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    mkl_storage=CrossValidationMKLStorage()
    cross_validation.add_cross_validation_output(mkl_storage)
    cross_validation.set_num_runs(3)
    
    # perform cross-validation
    result=cross_validation.evaluate()

    # print mkl weights
    weights=mkl_storage.get_mkl_weights()
コード例 #28
0
ファイル: experiment.py プロジェクト: mcopik/shogun
def create_combined_kernel(kname, kparam, examples, train_mode, preproc):
    """A wrapper for creating combined kernels.

    kname, kparam and examples are lists.

    """
    num_kernels = len(kname)
    feats['combined'] = CombinedFeatures()
    kernel = CombinedKernel()

    for kix in xrange(num_kernels):
        cur_kname = '%s%d' % (kname[kix],kix)
        (cur_feats, cur_preproc) = create_features(kname[kix], examples[kix], kparam[kix], train_mode, preproc)
        feats[cur_kname] = cur_feats
        cur_kernel = create_kernel(kname[kix], kparam[kix], cur_feats)
        kernel.append_kernel(cur_kernel)

    return (feats,kernel)
コード例 #29
0
def init_weighted_spectrum_kernel(kern, subfeats_list_lhs, subfeats_list_rhs):
	"""initialize weighted spectrum kernel (wrapper function)
	"""
	feats_lhs = CombinedFeatures()
	feats_rhs = CombinedFeatures()

	for subfeats in subfeats_list_lhs:
		feats_lhs.append_feature_obj(subfeats)

	for subfeats in subfeats_list_rhs:
		feats_rhs.append_feature_obj(subfeats)

	kern.init(feats_lhs, feats_rhs)
コード例 #30
0
def mkl_binclass_modular(fm_train_real=traindat,
                         fm_test_real=testdat,
                         fm_label_twoclass=label_traindat):

    sc1 = StringCharFeatures(strings1, RAWBYTE)
    sc2 = StringCharFeatures(strings2, RAWBYTE)

    sfeats1 = StringWordFeatures(RAWBYTE)
    sfeats1.obtain_from_char(sc1, 0, 2, 0, False)
    skernel1 = CommWordStringKernel(10, False)
    skernel1.init(sfeats1, sfeats1)
    sfeats2 = StringWordFeatures(RAWBYTE)
    sfeats2.obtain_from_char(sc2, 0, 2, 0, False)
    skernel2 = CommWordStringKernel(10, False)
    skernel2.init(sfeats2, sfeats2)

    ffeats = RealFeatures(traindat)
    fkernel = LinearKernel(ffeats, ffeats)

    fffeats = RealFeatures(traindat)
    ffkernel = GaussianKernel(fffeats, fffeats, 1.0)

    # COMBINING LINADD FEATURES/KERNELS LEAD TO FAIL
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(ffeats)
    #feats_train.append_feature_obj(fffeats)
    feats_train.append_feature_obj(sfeats2)

    print feats_train.get_num_vectors()

    kernel = CombinedKernel()
    kernel.append_kernel(fkernel)
    #kernel.append_kernel(ffkernel)
    kernel.append_kernel(skernel2)
    kernel.init(feats_train, feats_train)

    labels = RegressionLabels(fm_label_twoclass)
    mkl = MKLRegression()

    mkl.set_mkl_norm(1)  #2,3
    mkl.set_C(1, 1)
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)

    mkl.io.enable_file_and_line()
    mkl.io.set_loglevel(MSG_DEBUG)
コード例 #31
0
def create_features(data, center_offset, center_pos):
    """
    combined features from different kernel perspective 
    """

    #print("creating promoter features - center_offset %d, center_pos %d - %s" % (center_offset, center_pos, data))

    (center, left, right) = split_data_promoter(data, center_offset,
                                                center_pos)
    # sanity check sequences
    assert len(center) == len(left) == len(right)

    feat_center = StringCharFeatures(DNA)
    feat_center.set_features(center)
    feat_left = get_spectrum_features(left)
    feat_right = get_spectrum_features(right)

    # contruct combined features
    feat = CombinedFeatures()
    feat.append_feature_obj(feat_center)
    feat.append_feature_obj(feat_left)
    feat.append_feature_obj(feat_right)

    return feat
コード例 #32
0
def kernel_combined_custom_poly_modular(fm_train_real = traindat,fm_test_real = testdat,fm_label_twoclass=label_traindat):
    from shogun.Features import CombinedFeatures, RealFeatures, BinaryLabels
    from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
    from shogun.Classifier import LibSVM
   
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    
    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10,3)
    tkernel.init(tfeats, tfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))
        
    subkfeats_train = RealFeatures(fm_train_real)
    feats_train.append_feature_obj(subkfeats_train)
    subkernel = PolyKernel(10,2)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)
    
    labels = BinaryLabels(fm_label_twoclass)
    svm = LibSVM(1.0, kernel, labels)
    svm.train()

    kernel = CombinedKernel()
    feats_pred = CombinedFeatures()

    pfeats = RealFeatures(fm_test_real)
    tkernel = PolyKernel(10,3)
    tkernel.init(tfeats, pfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    subkfeats_test = RealFeatures(fm_test_real)
    feats_pred.append_feature_obj(subkfeats_test)
    subkernel = PolyKernel(10, 2)
    kernel.append_kernel(subkernel)
    kernel.init(feats_train, feats_pred)

    svm.set_kernel(kernel)
    svm.apply()
    km_train=kernel.get_kernel_matrix()
    return km_train,kernel
コード例 #33
0
def construct_features(features):
    """
    makes a list
    """

    feat_all = [inst for inst in features]
    feat_lhs = [inst[0:15] for inst in features]
    feat_rhs = [inst[15:] for inst in features]

    feat_wd = get_wd_features(feat_all)
    feat_spec_1 = get_spectrum_features(feat_lhs, order=3)
    feat_spec_2 = get_spectrum_features(feat_rhs, order=3)

    feat_comb = CombinedFeatures()
    feat_comb.append_feature_obj(feat_wd)
    feat_comb.append_feature_obj(feat_spec_1)
    feat_comb.append_feature_obj(feat_spec_2)

    return feat_comb
コード例 #34
0
ファイル: promoter_kernel.py プロジェクト: kuod/genomeutils
def create_features(data, center_offset, center_pos):
    """
    combined features from different kernel perspective 
    """
    
    #print("creating promoter features - center_offset %d, center_pos %d - %s" % (center_offset, center_pos, data))

    (center, left, right) = split_data_promoter(data, center_offset, center_pos)
    # sanity check sequences
    assert len(center) == len(left) == len(right)

    feat_center = StringCharFeatures(DNA)
    feat_center.set_features(center)
    feat_left = get_spectrum_features(left)
    feat_right = get_spectrum_features(right)

    # contruct combined features
    feat = CombinedFeatures()
    feat.append_feature_obj(feat_center)
    feat.append_feature_obj(feat_left)
    feat.append_feature_obj(feat_right)

    return feat
コード例 #35
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_empty_kernel(param)
        lab = shogun_factory.create_labels(data.labels)

        combined_kernel = CombinedKernel()
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
        base_features = shogun_factory.create_features(data.examples)
        combined_features = CombinedFeatures()

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        # load data
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
        f = file(
            "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt"
        )
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")

        num_lines = int(f.readline().strip())
        task_distances = numpy.zeros((num_lines, num_lines))
        name_to_id = {}
        for (i, line) in enumerate(f):
            tokens = line.strip().split("\t")
            name = str(tokens[0])
            name_to_id[name] = i
            entry = numpy.array([v for (j, v) in enumerate(tokens) if j != 0])
            assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (
                len(entry), num_lines)
            task_distances[i, :] = entry

        # cut relevant submatrix
        active_ids = [name_to_id[name] for name in data.get_task_names()]
        tmp_distances = task_distances[active_ids, :]
        tmp_distances = tmp_distances[:, active_ids]
        print "distances ", tmp_distances.shape

        # normalize distances
        task_distances = task_distances / numpy.max(tmp_distances)

        similarities = numpy.zeros(
            (data.get_num_tasks(), data.get_num_tasks()))

        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                # convert similarity with simple transformation
                similarity = param.base_similarity - task_distances[
                    name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

                # save for later
                similarities[data.name_to_id(task_name_lhs),
                             data.name_to_id(task_name_rhs)] = similarity

        # set normalizer
        base_wdk.set_normalizer(normalizer)
        #base_wdk.init_normalizer()

        combined_features.append_feature_obj(base_features)
        combined_kernel.append_kernel(base_wdk)

        ##################################################
        # intra-domain blocks

        intra_block_vec = PairiiVec()

        for task_id in data.get_task_ids():
            intra_block_vec.push_back(Pairii(task_id, task_id))

        # create mask-based normalizer
        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums,
                                                       intra_block_vec)
        kernel = shogun_factory.create_empty_kernel(param)
        kernel.set_normalizer(normalizer)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel)

        # append features
        combined_features.append_feature_obj(base_features)

        # set mixing factor (used if MKL is OFF)
        assert (param.base_similarity <= 1)
        assert (param.base_similarity >= 0)
        combined_kernel.set_subkernel_weights(
            [param.base_similarity, 1 - param.base_similarity])

        combined_kernel.init(combined_features, combined_features)

        svm = None

        print "using MKL:", (param.transform >= 1.0)

        if param.transform >= 1.0:

            svm = MKLClassification()

            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto

            svm.set_C(param.cost, param.cost)

            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)

        else:

            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)

            svm = SVMLight(param.cost, combined_kernel, lab)

        # set up SVM
        num_threads = 8
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        print "WARNING: custom epsilon set"
        svm.set_epsilon(0.05)

        # normalize cost
        norm_c_pos = param.cost / float(len([l
                                             for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(
            len([l for l in data.labels if l == -1]))

        svm.set_C(norm_c_neg, norm_c_pos)

        # start training
        svm.train()

        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        self.additional_information["similarities"] = similarities
        self.additional_information[
            "post_weights"] = combined_kernel.get_subkernel_weights()

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in data.get_task_names():

            task_num = data.name_to_id(task_name)

            # save svm and task_num
            svms[task_name] = (task_num, combined_kernel, svm)

        return svms
コード例 #36
0
def kernel_combined_modular(fm_train_real=traindat,fm_test_real=testdat,fm_train_dna=traindna,fm_test_dna=testdna ):
	from shogun.Kernel import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
	from shogun.Features import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

	kernel=CombinedKernel()
	feats_train=CombinedFeatures()
	feats_test=CombinedFeatures()

	subkfeats_train=RealFeatures(fm_train_real)
	subkfeats_test=RealFeatures(fm_test_real)
	subkernel=GaussianKernel(10, 1.1)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
	subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
	degree=3
	subkernel=FixedDegreeStringKernel(10, degree)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
	subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
	subkernel=LocalAlignmentStringKernel(10)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	kernel.init(feats_train, feats_train)
	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
コード例 #37
0
def mkl_multiclass_modular(fm_train_real, fm_test_real, label_train_multiclass,
                           width, C, epsilon, num_threads, mkl_epsilon,
                           mkl_norm):

    from shogun.Features import CombinedFeatures, RealFeatures, Labels
    from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
    from shogun.Classifier import MKLMultiClass

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = GaussianKernel(10, width)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = LinearKernel()
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = PolyKernel(10, 2)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = Labels(label_train_multiclass)

    mkl = MKLMultiClass(C, kernel, labels)

    mkl.set_epsilon(epsilon)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(mkl_norm)

    mkl.train()

    kernel.init(feats_train, feats_test)

    out = mkl.apply().get_labels()
    return out
コード例 #38
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        

        #numpy.random.seed(1337)
        numpy.random.seed(666)

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)

                
        # create shogun label
        lab = shogun_factory.create_labels(data.labels)


        # assemble combined kernel
        combined_kernel = CombinedKernel()
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_DEBUG)    
        # set kernel cache
        if param.flags.has_key("cache_size"):
            combined_kernel.set_cache_size(param.flags["cache_size"])
        

        # create features
        base_features = shogun_factory.create_features(data.examples)
        
        combined_features = CombinedFeatures()
        


        ########################################################
        print "creating a masked kernel for each node:"
        ########################################################
        

        # fetch taxonomy from parameter object
        taxonomy = param.taxonomy.data

        # create name to leaf map
        nodes = taxonomy.get_all_nodes()

        
        for node in nodes:
            
            print "creating kernel for ", node.name
            
            # fetch sub-tree
            active_task_ids = [data.name_to_id(leaf.name) for leaf in node.get_leaves()]
            
            print "masking all entries other than:", active_task_ids
            
        
            # create mask-based normalizer
            normalizer = MultitaskKernelMaskNormalizer(data.task_vector_nums, data.task_vector_nums, active_task_ids)
            
            # normalize trace
            if param.flags.has_key("normalize_trace") and param.flags["normalize_trace"]:
                norm_factor = len(node.get_leaves()) / len(active_task_ids)
                normalizer.set_normalization_constant(norm_factor)
            
            # create kernel
            kernel = shogun_factory.create_empty_kernel(param)
            kernel.set_normalizer(normalizer)
            
            
            # append current kernel to CombinedKernel
            combined_kernel.append_kernel(kernel)
        
            # append features
            combined_features.append_feature_obj(base_features)

            print "------"
        

        combined_kernel.init(combined_features, combined_features)                
        #combined_kernel.precompute_subkernels()
                
        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None
                        
        print "using MKL:", (param.flags["mkl_q"] >= 1.0)

        
        if param.flags["mkl_q"] >= 1.0:
            
            # set up MKL    
            svm = MKLClassification()

            # set the "q" in q-norm MKL
            svm.set_mkl_norm(param.flags["mkl_q"])
            
            # set interleaved optimization
            if param.flags.has_key("interleaved"):
                svm.set_interleaved_optimization_enabled(param.flags["interleaved"])
            
            # set solver type
            if param.flags.has_key("solver_type") and param.flags["solver_type"]:
                if param.flags["solver_type"] == "ST_CPLEX":
                    svm.set_solver_type(ST_CPLEX)
                if param.flags["solver_type"] == "ST_DIRECT":
                    svm.set_solver_type(ST_DIRECT)
                if param.flags["solver_type"] == "ST_NEWTON":
                    svm.set_solver_type(ST_NEWTON)
                if param.flags["solver_type"] == "ST_GLPK":
                    svm.set_solver_type(ST_GLPK)
            
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)
            
        else:
            # create vanilla SVM 
            svm = SVMLight(param.cost, combined_kernel, lab)


        # optimization settings
        num_threads = 4
        svm.parallel.set_num_threads(num_threads)
        
        if param.flags.has_key("epsilon"):
            svm.set_epsilon(param.flags["epsilon"])
        
        
        # enable output        
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
        
        
        # disable unsupported optimizations (due to special normalizer)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        
        
        # set cost
        if param.flags["normalize_cost"]:
            
            norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
            norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
            svm.set_C(norm_c_neg, norm_c_pos)
            
        else:
            
            svm.set_C(param.cost, param.cost)
        
        
        # start training
        svm.train()


        ########################################################
        print "svm objective:"
        print svm.get_objective()
        ########################################################
        
        # store additional info
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["weights"] = combined_kernel.get_subkernel_weights()
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), len(nodes), combined_kernel, svm)

        
        return svms
コード例 #39
0
def mkl_multiclass_modular(fm_train_real, fm_test_real, label_train_multiclass,
	width, C, epsilon, num_threads, mkl_epsilon, mkl_norm):

	from shogun.Features import CombinedFeatures, RealFeatures, Labels
	from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel,PolyKernel
	from shogun.Classifier import MKLMultiClass

	kernel = CombinedKernel()
	feats_train = CombinedFeatures()
	feats_test = CombinedFeatures()

	subkfeats_train = RealFeatures(fm_train_real)
	subkfeats_test = RealFeatures(fm_test_real)
	subkernel = GaussianKernel(10, width)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	subkfeats_train = RealFeatures(fm_train_real)
	subkfeats_test = RealFeatures(fm_test_real)
	subkernel = LinearKernel()
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	subkfeats_train = RealFeatures(fm_train_real)
	subkfeats_test = RealFeatures(fm_test_real)
	subkernel = PolyKernel(10,2)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)
	
	kernel.init(feats_train, feats_train)

	labels = Labels(label_train_multiclass)

	mkl = MKLMultiClass(C, kernel, labels)
	
	mkl.set_epsilon(epsilon);
	mkl.parallel.set_num_threads(num_threads)
	mkl.set_mkl_epsilon(mkl_epsilon)
	mkl.set_mkl_norm(mkl_norm)

	mkl.train()

	kernel.init(feats_train, feats_test)

	out =  mkl.classify().get_labels()
	return out
コード例 #40
0
ファイル: experiment.py プロジェクト: mcopik/shogun
def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con):
    """Converts numpy arrays or sequences into shogun features"""

    if kname == 'gauss' or kname == 'linear' or kname == 'poly':
        examples = numpy.array(examples)
        feats = RealFeatures(examples)
        
    elif kname == 'wd' or kname == 'localalign' or kname == 'localimprove':
        if seq_source == 'dna': 
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA)
        elif seq_source == 'protein':
            examples = non_aminoacid_converter(examples, nuc_con) 
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)

    elif kname == 'spec' or kname == 'cumspec':
        if seq_source == 'dna':
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA) 
        elif seq_source == 'protein':    
            examples = non_aminoacid_converter(examples, nuc_con)
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)
       
        wf = StringUlongFeatures( feats.get_alphabet() )
        wf.obtain_from_char(feats, kparam['degree']-1, kparam['degree'], 0, kname=='cumspec')
        del feats

        if train_mode:
            preproc = SortUlongString()
            preproc.init(wf)
        wf.add_preproc(preproc)
        ret = wf.apply_preproc()
        #assert(ret)

        feats = wf
    elif kname == 'spec2' or kname == 'cumspec2':
        # spectrum kernel on two sequences
        feats = {}
        feats['combined'] = CombinedFeatures()

        reversed = kname=='cumspec2'

        (ex0,ex1) = zip(*examples)

        f0 = StringCharFeatures(list(ex0), DNA)
        wf = StringWordFeatures(f0.get_alphabet())
        wf.obtain_from_char(f0, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f0

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessors()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f0'] = wf

        f1 = StringCharFeatures(list(ex1), DNA)
        wf = StringWordFeatures( f1.get_alphabet() )
        wf.obtain_from_char(f1, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f1

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preproc(preproc)
        ret = wf.apply_preproc()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f1'] = wf

    else:
        print 'Unknown kernel %s' % kname
    
    return (feats,preproc)
コード例 #41
0
ファイル: signal_sensor.py プロジェクト: Anshul-Bansal/gsoc
class SignalSensor(object):
    """
    A collection of sensors
    """
    def __init__(self):
        self.sensors = list()
        self.kernel = CombinedKernel()
        self.svs = CombinedFeatures()
        self.svm = None
        self.window = (+100000, -1000000)

    def from_file(self, file):
        sys.stderr.write('loading model file')
        l = file.readline();

        if l != '%arts version: 1.0\n':
            sys.stderr.write("\nfile not an arts definition file\n")
            return None

        bias = None
        alphas = None
        num_kernels = None

        while l:
            # skip comment or empty line
            if not (l.startswith('%') or l.startswith('\n')):
                if bias is None: bias = parse_float(l, 'b')
                if alphas is None: alphas = parse_vector(l, file, 'alphas')
                if num_kernels is None: num_kernels = parse_int(l, 'num_kernels')

                if num_kernels and bias and alphas is not None:
                    for i in xrange(num_kernels):
                        s = Sensor()
                        (k, f) = s.from_file(file, i + 1)
                        k.io.enable_progress()
                        self.window = (min(self.window[0], s.window[0]),
                                max(self.window[1], s.window[2]))
                        self.sensors.append(s)
                        self.kernel.append_kernel(k)
                        self.svs.append_feature_obj(f)

                    self.kernel.init(self.svs, self.svs)
                    self.svm = SVM(self.kernel, alphas,
                            numpy.arange(len(alphas), dtype=numpy.int32), bias)
                    self.svm.io.set_target_to_stderr()
                    self.svm.io.enable_progress()
                    self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus())
                    sys.stderr.write('done\n')
                    return

            l = file.readline()

        sys.stderr.write('error loading model file\n')


    def predict(self, seq, chunk_size = int(10e6)):
        """
        predicts on whole contig, splits up sequence in chunks of size chunk_size
        """

        seq_len = len(seq)
        num_chunks = int(numpy.ceil(float(seq_len) / float(chunk_size)))
        assert(num_chunks > 0)

    	sys.stderr.write("number of chunks for contig: %i\n" % (num_chunks))

        start = 0
        stop = min(chunk_size, seq_len)
		
        out = []

        # iterate over chunks
        for chunk_idx in range(num_chunks):

            sys.stderr.write("processing chunk #%i\n" % (chunk_idx))

            assert (start < stop)
            chunk = seq[start:stop]

            assert(len(self.sensors) > 0)
            tf = CombinedFeatures()
            for i in xrange(len(self.sensors)):
                f = self.sensors[i].get_test_features(chunk, self.window)
                tf.append_feature_obj(f)

            sys.stderr.write("initialising kernel...")
            self.kernel.init(self.svs, tf)
            sys.stderr.write("..done\n")

            lab_out = self.svm.apply()

            # work around problem with get_labels()
            tmp_out = [lab_out.get_label(idx) for idx in range(0, lab_out.get_num_labels())]
            assert(len(tmp_out) > 0)
            out.extend(tmp_out)

            print "len out", len(out)

            # increment chunk
            start = stop
            stop = min(stop+chunk_size, seq_len)


        l = (-self.window[0]) * [-42]
        r = self.window[1] * [-42]

        # concatenate
        ret = l + out + r

        assert(len(ret) == len(seq))

        return ret
コード例 #42
0
ファイル: signal_sensor.py プロジェクト: nuoanunu/shogun-olaf
 def __init__(self):
     self.sensors = list()
     self.kernel = CombinedKernel()
     self.svs = CombinedFeatures()
     self.svm = None
     self.window = (+100000, -1000000)
コード例 #43
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
          
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)
        
        # create shogun label
        lab = shogun_factory.create_labels(data.labels)
        


        ########################################################
        print "creating a kernel for each node:"
        ########################################################


        # assemble combined kernel
        
        combined_kernel = CombinedKernel()
        
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
        
        
        base_features = shogun_factory.create_features(data.examples)
        
        combined_features = CombinedFeatures()
        
        
        
        
        ##################################################
        # intra-domain blocks
        
        
        #        intra_block_vec = PairiiVec()
        #        
        #        for task_id in data.get_task_ids():
        #            intra_block_vec.push_back(Pairii(task_id, task_id))
        #        
        #        
        #        
        #        # create mask-based normalizer
        #        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)        
        #        kernel = shogun_factory.create_empty_kernel(param)
        #        kernel.set_normalizer(normalizer)
        #        
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel)
        #    
        #        # append features
        #        combined_features.append_feature_obj(base_features)
        #
        #        print "------"
        #        
        #        ##################################################
        #        # all blocks
        #        
        #        
        #        all_block_vec = PairiiVec()
        #        
        #        for task_id_1 in data.get_task_ids():
        #            for task_id_2 in data.get_task_ids():
        #                all_block_vec.push_back(Pairii(task_id_1, task_id_2))
        #                
        #        
        #        # create mask-based normalizer
        #        normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)        
        #        kernel_all = shogun_factory.create_empty_kernel(param)
        #        kernel_all.set_normalizer(normalizer_all)
        #                
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel_all)
        #    
        #        # append features
        #        combined_features.append_feature_obj(base_features)

        
        ##################################################
        # add one kernel per similarity position
        
        
        # init seq handler 
        pseudoseqs = SequencesHandler()
        pseudoseq_length = pseudoseqs.seq_length


        for pos in range(pseudoseq_length):
            
            print "appending kernel for pos %i" % (pos)
        
            print "nums", data.task_vector_nums

    
            pos_block_vec = PairiiVec()
    
            # set similarity
            for task_name_lhs in data.get_task_names():
                for task_name_rhs in data.get_task_names():
                    
                    similarity = pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pos)
                    #print "computing similarity for tasks (%s, %s) = %i" % (task_name_lhs, task_name_rhs, similarity)
                    
                    if similarity == 1:                    
                        tmp_pair = Pairii(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs))
                        pos_block_vec.push_back(tmp_pair)

            print "creating normalizer"
            normalizer_pos = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, pos_block_vec)   

            print "creating empty kernel"
            kernel_pos = shogun_factory.create_empty_kernel(param)
            
            print "setting normalizer"
            kernel_pos.set_normalizer(normalizer_pos)
                
            print "appending kernel"
            # append current kernel to CombinedKernel
            combined_kernel.append_kernel(kernel_pos)
    
            print "appending features"
            # append features
            combined_features.append_feature_obj(base_features)
        
        
        print "done constructing combined kernel"
        
        ##################################################
        # init combined kernel
        
        combined_kernel.init(combined_features, combined_features)    
        
            

                
        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None
                
        
        print "using MKL:", (param.transform >= 1.0)
        
        if param.transform >= 1.0:
            
            svm = MKLClassification()
            
            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto
        
            svm.set_C(param.cost, param.cost)
            
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)
            
                
        else:
            
            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)
            
            svm = SVMLight(param.cost, combined_kernel, lab)


        # set up SVM
        num_threads = 8
        svm.io.enable_progress()
        #svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
        
        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        
        print "WARNING: custom epsilon set"
        svm.set_epsilon(0.05)    
        
        # normalize cost
        norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
        
        svm.set_C(norm_c_neg, norm_c_pos)
        
        
        # start training
        svm.train()
    
        
        # save additional info
        self.additional_information["svm_objective"] = svm.get_objective()
        self.additional_information["svm num sv"] = svm.get_num_support_vectors()
        self.additional_information["mkl weights post-training"] = combined_kernel.get_subkernel_weights()
        
        print self.additional_information 
        
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm)

        
        return svms
コード例 #44
0
ファイル: signal_sensor.py プロジェクト: nuoanunu/shogun-olaf
class SignalSensor(object):
    """
    A collection of sensors
    """
    def __init__(self):
        self.sensors = list()
        self.kernel = CombinedKernel()
        self.svs = CombinedFeatures()
        self.svm = None
        self.window = (+100000, -1000000)

    def from_file(self, file):
        sys.stderr.write('loading model file')
        l = file.readline()

        if l != '%arts version: 1.0\n':
            sys.stderr.write("\nfile not an arts definition file\n")
            return None

        bias = None
        alphas = None
        num_kernels = None

        while l:
            # skip comment or empty line
            if not (l.startswith('%') or l.startswith('\n')):
                if bias is None: bias = parse_float(l, 'b')
                if alphas is None: alphas = parse_vector(l, file, 'alphas')
                if num_kernels is None:
                    num_kernels = parse_int(l, 'num_kernels')

                if num_kernels and bias and alphas is not None:
                    for i in xrange(num_kernels):
                        s = Sensor()
                        (k, f) = s.from_file(file, i + 1)
                        k.io.enable_progress()
                        self.window = (min(self.window[0], s.window[0]),
                                       max(self.window[1], s.window[2]))
                        self.sensors.append(s)
                        self.kernel.append_kernel(k)
                        self.svs.append_feature_obj(f)

                    self.kernel.init(self.svs, self.svs)
                    self.svm = KernelMachine(
                        self.kernel, alphas,
                        numpy.arange(len(alphas), dtype=numpy.int32), bias)
                    self.svm.io.set_target_to_stderr()
                    self.svm.io.enable_progress()
                    self.svm.parallel.set_num_threads(
                        self.svm.parallel.get_num_cpus())
                    sys.stderr.write('done\n')
                    return

            l = file.readline()

        sys.stderr.write('error loading model file\n')

    def predict(self, seq, chunk_size=int(10e6)):
        """
        predicts on whole contig, splits up sequence in chunks of size chunk_size
        """

        seq_len = len(seq)
        num_chunks = int(numpy.ceil(float(seq_len) / float(chunk_size)))
        assert (num_chunks > 0)

        sys.stderr.write("number of chunks for contig: %i\n" % (num_chunks))

        start = 0
        stop = min(chunk_size, seq_len)

        out = []

        # iterate over chunks
        for chunk_idx in range(num_chunks):

            sys.stderr.write("processing chunk #%i\n" % (chunk_idx))

            assert (start < stop)
            chunk = seq[start:stop]

            assert (len(self.sensors) > 0)
            tf = CombinedFeatures()
            for i in xrange(len(self.sensors)):
                f = self.sensors[i].get_test_features(chunk, self.window)
                tf.append_feature_obj(f)

            sys.stderr.write("initialising kernel...")
            self.kernel.init(self.svs, tf)
            sys.stderr.write("..done\n")

            self.svm.set_kernel(self.kernel)
            lab_out = self.svm.apply().get_values()

            assert (len(lab_out) > 0)
            out.extend(lab_out)

            # increment chunk
            start = stop
            stop = min(stop + chunk_size, seq_len)

        l = (-self.window[0]) * [-42]
        r = self.window[1] * [-42]

        # concatenate
        ret = l + out + r

        assert (len(ret) == len(seq))

        return ret
コード例 #45
0
def mkl_multiclass ():
	print 'mkl_multiclass'

	from shogun.Features import CombinedFeatures, RealFeatures, Labels
	from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel,PolyKernel
	from shogun.Classifier import MKLMultiClass


	width=1.2
	C=1.2
	epsilon=1e-5
	num_threads=1


	kernel=CombinedKernel()
	feats_train=CombinedFeatures()
	feats_test=CombinedFeatures()

	subkfeats_train=RealFeatures(fm_train_real)
	subkfeats_test=RealFeatures(fm_test_real)
	subkernel=GaussianKernel(10, width)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)


	subkfeats_train=RealFeatures(fm_train_real)
	subkfeats_test=RealFeatures(fm_test_real)
	subkernel=LinearKernel()
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)


	subkfeats_train=RealFeatures(fm_train_real)
	subkfeats_test=RealFeatures(fm_test_real)
	subkernel=PolyKernel(10,2)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)
	
	kernel.init(feats_train, feats_train)


	labels=Labels(label_train_multiclass)

	mkl=MKLMultiClass(C, kernel, labels)
	
	mkl.set_epsilon(epsilon);
	mkl.parallel.set_num_threads(num_threads)
	mkl.set_mkl_epsilon(0.001)

	mkl.train()

	kernel.init(feats_train, feats_test)

	out= mkl.classify().get_labels()

	print out
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # dict to save additional information for later analysis
        self.additional_information = {}

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)

        # create shogun label
        lab = shogun_factory.create_labels(data.labels)

        ########################################################
        print "creating a kernel for each node:"
        ########################################################

        # assemble combined kernel

        combined_kernel = CombinedKernel()

        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

        base_features = shogun_factory.create_features(data.examples, param)

        combined_features = CombinedFeatures()

        ##################################################
        # intra-domain blocks (dirac kernel)

        intra_block_vec = PairiiVec()

        for task_id in data.get_task_ids():
            intra_block_vec.push_back(Pairii(task_id, task_id))

        # create mask-based normalizer
        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums,
                                                       intra_block_vec)
        kernel = shogun_factory.create_empty_kernel(param)
        kernel.set_normalizer(normalizer)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel)

        # append features
        combined_features.append_feature_obj(base_features)

        print "------"

        ##################################################
        # all blocks (full kernel matrix)

        all_block_vec = PairiiVec()

        for task_id_1 in data.get_task_ids():
            for task_id_2 in data.get_task_ids():
                all_block_vec.push_back(Pairii(task_id_1, task_id_2))

        # create mask-based normalizer
        normalizer_all = MultitaskKernelMaskPairNormalizer(
            data.task_vector_nums, all_block_vec)
        kernel_all = shogun_factory.create_empty_kernel(param)
        kernel_all.set_normalizer(normalizer_all)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel_all)

        # append features
        combined_features.append_feature_obj(base_features)

        ##################################################
        # hack

        #        hack_block_vec = PairiiVec()
        #
        #        for task_id_1 in data.get_task_ids():
        #            for task_id_2 in data.get_task_ids():
        #                hack_block_vec.push_back(Pairii(task_id_1, task_id_2))
        #
        #        hack_block_vec.push_back(Pairii(data.name_to_id("B_2705"), data.name_to_id("B_4001")))
        #        other_group = ["B_0702", "B_1501", "B_5801"]
        #        for task_id_1 in other_group:
        #            for task_id_2 in other_group:
        #                hack_block_vec.push_back(Pairii(data.name_to_id(task_id_1), data.name_to_id(task_id_2)))
        #
        #
        #
        #        # create mask-based normalizer
        #        normalizer_hack = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, hack_block_vec)
        #        kernel_hack = shogun_factory.create_empty_kernel(param)
        #        kernel_hack.set_normalizer(normalizer_hack)
        #
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel_hack)
        #
        #        # append features
        #        combined_features.append_feature_obj(base_features)

        ##################################################
        # init combined kernel

        combined_kernel.init(combined_features, combined_features)

        #combined_kernel.precompute_subkernels()
        self.additional_information[
            "mkl weights before"] = combined_kernel.get_subkernel_weights()

        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None

        print "using MKL:", (param.flags["mkl_q"] >= 1.0)

        if param.flags["mkl_q"] >= 1.0:

            svm = MKLClassification()

            svm.set_mkl_norm(param.flags["mkl_q"])
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)

        else:

            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)
            svm = SVMLight(param.cost, combined_kernel, lab)

        num_threads = 8
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_INFO)

        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        svm.set_epsilon(0.03)

        # set cost
        if param.flags["normalize_cost"]:

            norm_c_pos = param.cost / float(
                len([l for l in data.labels if l == 1]))
            norm_c_neg = param.cost / float(
                len([l for l in data.labels if l == -1]))
            svm.set_C(norm_c_neg, norm_c_pos)

        else:

            svm.set_C(param.cost, param.cost)

        svm.train()

        print "subkernel weights (after):", combined_kernel.get_subkernel_weights(
        )

        ########################################################
        print "svm objective:"
        print svm.get_objective()

        self.additional_information["svm_objective"] = svm.get_objective()
        self.additional_information[
            "svm num sv"] = svm.get_num_support_vectors()
        self.additional_information[
            "mkl weights post-training"] = combined_kernel.get_subkernel_weights(
            )

        ########################################################

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), combined_kernel,
                               svm, param)

        return svms
コード例 #47
0
ファイル: mklms.py プロジェクト: nickponline/mkl
seed(42)
traindata_real = concatenate((randn(2,num)-dist, randn(2,num)+dist),   axis=1)
testdata_real  = concatenate((randn(2,numt)-dist, randn(2,numt)+dist), axis=1);

# BinaryLabels
trainlab = concatenate((-ones(num), ones(num)));
testlab  = concatenate((-ones(numt), ones(numt)));

# Split into pos/neg train/test for plotting
trainpos = traindata_real[:,trainlab ==  1]
trainneg = traindata_real[:,trainlab == -1]
testpos  = testdata_real[:,testlab   ==  1]
testneg  = testdata_real[:,testlab   == -1]

# create combined train features
feats_train = CombinedFeatures()
feats_train.append_feature_obj(RealFeatures(traindata_real))
feats_train.append_feature_obj(RealFeatures(traindata_real))
feats_train.append_feature_obj(RealFeatures(traindata_real))
feats_train.append_feature_obj(RealFeatures(traindata_real))
feats_train.append_feature_obj(RealFeatures(traindata_real))

feats_test = CombinedFeatures()
feats_test.append_feature_obj(RealFeatures(testdata_real))
feats_test.append_feature_obj(RealFeatures(testdata_real))
feats_test.append_feature_obj(RealFeatures(testdata_real))
feats_test.append_feature_obj(RealFeatures(testdata_real))
feats_test.append_feature_obj(RealFeatures(testdata_real))

labels = BinaryLabels(trainlab)
コード例 #48
0
ファイル: irimklms-500.py プロジェクト: nickponline/mkl
#traindata_real = concatenate((randn(2,num)-dist, randn(2,num)+dist),   axis=1)
#testdata_real  = concatenate((randn(2,numt)-dist, randn(2,numt)+dist), axis=1);

# Labels
trainlab = labs_1
#testlab  = concatenate((-ones(numt), ones(numt)));

# Split into pos/neg train/test for plotting
#trainpos = traindata_real[:,trainlab ==  1]
#trainneg = traindata_real[:,trainlab == -1]
#testpos  = testdata_real[:,testlab   ==  1]
#testneg  = testdata_real[:,testlab   == -1]

# create combined train features
feats_train = CombinedFeatures()
feats_train.append_feature_obj(RealFeatures(data_1))
feats_train.append_feature_obj(RealFeatures(data_2))
feats_train.append_feature_obj(RealFeatures(data_3))

#feats_test = CombinedFeatures()
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))

labels = BinaryLabels(trainlab)

# and corresponding combined kernel
kernel = CombinedKernel()
コード例 #49
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
          
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)
        
        # create shogun label
        lab = shogun_factory.create_labels(data.labels)
        


        ##################################################
        # define pockets
        ##################################################
        
        pockets = [0]*9
        
        pockets[0] = [1, 5, 6, 7, 8, 31, 32, 33, 34]
        pockets[1] = [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31]
        pockets[2] = [11, 20, 21, 22, 29, 31]
        pockets[3] = [8, 30, 31, 32]
        pockets[4] = [10, 11, 30]
        pockets[5] = [10, 11, 12, 13, 20, 29]
        pockets[6] = [10, 12, 20, 22, 26, 27, 28, 29]
        pockets[7] = [12, 14, 15, 26]
        pockets[8] = [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26]
        

        #new_pockets = []
        
        # merge neighboring pockets
        #for i in range(8):
        #    new_pockets.append(list(set(pockets[i]).union(set(pockets[i+1]))))
            
        #pockets = new_pockets
        
        
        ########################################################
        print "creating a kernel:"
        ########################################################


        # assemble combined kernel
        
        combined_kernel = CombinedKernel()
        
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
        
        
        base_features = shogun_factory.create_features(data.examples)
        
        combined_features = CombinedFeatures()
        
        
        
        ##################################################
        # intra-domain blocks
        
        
        #        intra_block_vec = PairiiVec()
        #        
        #        for task_id in data.get_task_ids():
        #            intra_block_vec.push_back(Pairii(task_id, task_id))
        #        
        #        
        #        
        #        # create mask-based normalizer
        #        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)        
        #        kernel = shogun_factory.create_empty_kernel(param)
        #        kernel.set_normalizer(normalizer)
        #        
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel)
        #    
        #        # append features
        #        combined_features.append_feature_obj(base_features)
        #
        #        print "------"
        #        
        #        ##################################################
        #        # all blocks
        #        
        #        
        #        all_block_vec = PairiiVec()
        #        
        #        for task_id_1 in data.get_task_ids():
        #            for task_id_2 in data.get_task_ids():
        #                all_block_vec.push_back(Pairii(task_id_1, task_id_2))
        #                
        #        
        #        # create mask-based normalizer
        #        normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)        
        #        kernel_all = shogun_factory.create_empty_kernel(param)
        #        kernel_all.set_normalizer(normalizer_all)
        #                
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel_all)
        #    
        #        # append features
        #        combined_features.append_feature_obj(base_features)

        
        ##################################################
        # add one kernel per similarity position
        
        
        # init seq handler 
        pseudoseqs = SequencesHandler()



        for pocket in pockets:

            print "creating normalizer"
            #import pdb
            #pdb.set_trace()
            
            normalizer = MultitaskKernelNormalizer(data.task_vector_nums)
            
            print "processing pocket", pocket


            # set similarity
            for task_name_lhs in data.get_task_names():
                for task_name_rhs in data.get_task_names():
                    
                    similarity = 0.0
                    
                    for pseudo_seq_pos in pocket:
                        similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos-1))
                    
                    # normalize
                    similarity = similarity / float(len(pocket))
                    
                    print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)
                    
                    normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)
               

            print "creating empty kernel"
            kernel_pos = shogun_factory.create_empty_kernel(param)
            
            print "setting normalizer"
            kernel_pos.set_normalizer(normalizer)
                
            print "appending kernel"
            # append current kernel to CombinedKernel
            combined_kernel.append_kernel(kernel_pos)
    
            print "appending features"
            # append features
            combined_features.append_feature_obj(base_features)

        
        
        print "done constructing combined kernel"
        
        ##################################################
        # init combined kernel

        
        # init weights
        # combined_kernel.set_subkernel_weights([1.0/2.85]*combined_kernel.get_num_subkernels())
        
        
        combined_kernel.init(combined_features, combined_features)    
        
        

                
        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None
                
        
        print "using MKL:", (param.transform >= 1.0)
        
        if param.transform >= 1.0:
            
            svm = MKLClassification()
            
            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto
        
            svm.set_C(param.cost, param.cost)
            
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)
            
                
        else:
            
            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)
            
            svm = SVMLight(param.cost, combined_kernel, lab)


        # set up SVM
        num_threads = 8
        svm.io.enable_progress()
        #svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
        
        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        
        #print "WARNING: custom epsilon set"
        #svm.set_epsilon(0.05)    
        
        # normalize cost
        norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
        
        svm.set_C(norm_c_neg, norm_c_pos)
        
        
        # start training
        svm.train()
    
        
        # save additional info
        self.additional_information["svm_objective"] = svm.get_objective()
        self.additional_information["svm num sv"] = svm.get_num_support_vectors()
        self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()
        
        print self.additional_information 
        
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm)

        
        return svms
コード例 #50
0
def mkl_binclass_modular(fm_train_real=traindat, fm_test_real=testdat, fm_label_twoclass=label_traindat):

    ##################################
    # set up and train

    # create some poly train/test matrix
    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, tfeats)
    K_train = tkernel.get_kernel_matrix()

    pfeats = RealFeatures(fm_test_real)
    tkernel.init(tfeats, pfeats)
    K_test = tkernel.get_kernel_matrix()

    # create combined train features
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(fm_train_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_train))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_train)

    # train mkl
    labels = BinaryLabels(fm_label_twoclass)
    mkl = MKLClassification()

    # which norm to use for MKL
    mkl.set_mkl_norm(1)  # 2,3

    # set cost (neg, pos)
    mkl.set_C(1, 1)

    # set kernel and labels
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)

    # train
    mkl.train()
    # w=kernel.get_subkernel_weights()
    # kernel.set_subkernel_weights(w)

    ##################################
    # test

    # create combined test features
    feats_pred = CombinedFeatures()
    feats_pred.append_feature_obj(RealFeatures(fm_test_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_test))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_pred)

    # and classify
    mkl.set_kernel(kernel)
    mkl.apply()
    return mkl.apply(), kernel
コード例 #51
0
ファイル: signal_sensor.py プロジェクト: Anshul-Bansal/gsoc
 def __init__(self):
     self.sensors = list()
     self.kernel = CombinedKernel()
     self.svs = CombinedFeatures()
     self.svm = None
     self.window = (+100000, -1000000)
コード例 #52
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
        # dict to save additional information for later analysis
        self.additional_information = {}
        
          
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)

                
        # create shogun label
        lab = shogun_factory.create_labels(data.labels)
        


        ########################################################
        print "creating a kernel for each node:"
        ########################################################


        # assemble combined kernel
        
        combined_kernel = CombinedKernel()
        
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
        
        
        base_features = shogun_factory.create_features(data.examples, param)
        
        combined_features = CombinedFeatures()
        
        
        
        
        ##################################################
        # intra-domain blocks (dirac kernel)
        
        
        intra_block_vec = PairiiVec()
        
        for task_id in data.get_task_ids():
            intra_block_vec.push_back(Pairii(task_id, task_id))
        
        
        
        # create mask-based normalizer
        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)        
        kernel = shogun_factory.create_empty_kernel(param)
        kernel.set_normalizer(normalizer)
        
        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel)
    
        # append features
        combined_features.append_feature_obj(base_features)

        print "------"
        
        ##################################################
        # all blocks (full kernel matrix)
        
        
        all_block_vec = PairiiVec()
        
        for task_id_1 in data.get_task_ids():
            for task_id_2 in data.get_task_ids():
                all_block_vec.push_back(Pairii(task_id_1, task_id_2))
                
        
        # create mask-based normalizer
        normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)        
        kernel_all = shogun_factory.create_empty_kernel(param)
        kernel_all.set_normalizer(normalizer_all)
                
        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel_all)
    
        # append features
        combined_features.append_feature_obj(base_features)

        
        ##################################################
        # hack
        
        
        #        hack_block_vec = PairiiVec()
        #        
        #        for task_id_1 in data.get_task_ids():
        #            for task_id_2 in data.get_task_ids():
        #                hack_block_vec.push_back(Pairii(task_id_1, task_id_2))
        #        
        #        hack_block_vec.push_back(Pairii(data.name_to_id("B_2705"), data.name_to_id("B_4001")))
        #        other_group = ["B_0702", "B_1501", "B_5801"]
        #        for task_id_1 in other_group:
        #            for task_id_2 in other_group:
        #                hack_block_vec.push_back(Pairii(data.name_to_id(task_id_1), data.name_to_id(task_id_2)))
        #        
        #        
        #        
        #        # create mask-based normalizer
        #        normalizer_hack = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, hack_block_vec)        
        #        kernel_hack = shogun_factory.create_empty_kernel(param)
        #        kernel_hack.set_normalizer(normalizer_hack)
        #                
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel_hack)
        #    
        #        # append features
        #        combined_features.append_feature_obj(base_features)
        
        
        
            
        ##################################################
        # init combined kernel
        
        combined_kernel.init(combined_features, combined_features)    
        
            
        #combined_kernel.precompute_subkernels()
        self.additional_information["mkl weights before"] = combined_kernel.get_subkernel_weights()
        
        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None
                
        
        print "using MKL:", (param.flags["mkl_q"] >= 1.0)
        
        if param.flags["mkl_q"] >= 1.0:
            
            svm = MKLClassification()
            
            svm.set_mkl_norm(param.flags["mkl_q"])
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)
        
        else:
            
            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)
            svm = SVMLight(param.cost, combined_kernel, lab)


        num_threads = 8
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
        
        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
    
        svm.set_epsilon(0.03)
        
        # set cost
        if param.flags["normalize_cost"]:
            
            norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
            norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
            svm.set_C(norm_c_neg, norm_c_pos)
            
        else:

            svm.set_C(param.cost, param.cost)
        
        svm.train()
    
    
        print "subkernel weights (after):", combined_kernel.get_subkernel_weights()

        ########################################################
        print "svm objective:"
        print svm.get_objective()
        
        
        self.additional_information["svm_objective"] = svm.get_objective()
        self.additional_information["svm num sv"] = svm.get_num_support_vectors()
        self.additional_information["mkl weights post-training"] = combined_kernel.get_subkernel_weights()
         
        ########################################################
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm, param)

        
        return svms
コード例 #53
0
def evaluation_cross_validation_multiclass_storage(
        traindat=traindat, label_traindat=label_traindat):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import CrossValidationPrintOutput
    from shogun.Evaluation import CrossValidationMKLStorage, CrossValidationMulticlassStorage
    from shogun.Evaluation import MulticlassAccuracy, F1Measure
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.Features import MulticlassLabels
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Classifier import MKLMulticlass
    from shogun.Mathematics import Statistics, MSG_DEBUG

    # training data, combined features all on same data
    features = RealFeatures(traindat)
    comb_features = CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels = MulticlassLabels(label_traindat)

    # kernel, different Gaussians combined
    kernel = CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create mkl using libsvm, due to a mem-bug, interleaved is not possible
    svm = MKLMulticlass(1.0, kernel, labels)
    svm.set_kernel(kernel)

    # splitting strategy for 5 fold cross-validation (for classification its better
    # to use "StratifiedCrossValidation", but the standard
    # "StratifiedCrossValidationSplitting" is also available
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = MulticlassAccuracy()

    # cross-validation instance
    cross_validation = CrossValidation(svm, comb_features, labels,
                                       splitting_strategy,
                                       evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross vlaidation output classes
    #cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    #mkl_storage=CrossValidationMKLStorage()
    #cross_validation.add_cross_validation_output(mkl_storage)
    multiclass_storage = CrossValidationMulticlassStorage()
    multiclass_storage.append_binary_evaluation(F1Measure())
    cross_validation.add_cross_validation_output(multiclass_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result = cross_validation.evaluate()

    roc_0_0_0 = multiclass_storage.get_fold_ROC(0, 0, 0)
    #print roc_0_0_0
    auc_0_0_0 = multiclass_storage.get_fold_evaluation_result(0, 0, 0, 0)
    #print auc_0_0_0
    return roc_0_0_0, auc_0_0_0
コード例 #54
0
def kernel_combined_modular(fm_train_real=traindat,
                            fm_test_real=testdat,
                            fm_train_dna=traindna,
                            fm_test_dna=testdna):
    from shogun.Kernel import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from shogun.Features import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = GaussianKernel(10, 1.1)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = StringCharFeatures(fm_train_dna, DNA)
    subkfeats_test = StringCharFeatures(fm_test_dna, DNA)
    degree = 3
    subkernel = FixedDegreeStringKernel(10, degree)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = StringCharFeatures(fm_train_dna, DNA)
    subkfeats_test = StringCharFeatures(fm_test_dna, DNA)
    subkernel = LocalAlignmentStringKernel(10)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
コード例 #55
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        import numpy
        numpy.random.seed(666)

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)

        # create shogun label
        lab = shogun_factory.create_labels(data.labels)

        # assemble combined kernel
        combined_kernel = CombinedKernel()
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_DEBUG)
        # set kernel cache
        if param.flags.has_key("cache_size"):
            combined_kernel.set_cache_size(param.flags["cache_size"])

        # create features
        base_features = shogun_factory.create_features(data.examples, param)

        combined_features = CombinedFeatures()

        ########################################################
        print "creating a masked kernel for possible subset:"
        ########################################################

        power_set_tasks = power_set(data.get_task_ids())

        for active_task_ids in power_set_tasks:

            print "masking all entries other than:", active_task_ids

            # create mask-based normalizer
            normalizer = MultitaskKernelMaskNormalizer(data.task_vector_nums,
                                                       data.task_vector_nums,
                                                       active_task_ids)

            # normalize trace
            if param.flags.has_key(
                    "normalize_trace") and param.flags["normalize_trace"]:
                norm_factor = len(data.get_task_ids()) / len(active_task_ids)
                normalizer.set_normalization_constant(norm_factor)

            kernel = shogun_factory.create_empty_kernel(param)
            kernel.set_normalizer(normalizer)

            # append current kernel to CombinedKernel
            combined_kernel.append_kernel(kernel)

            # append features
            combined_features.append_feature_obj(base_features)

            print "------"

        combined_kernel.init(combined_features, combined_features)

        #combined_kernel.precompute_subkernels()

        self.additional_information[
            "weights before trainng"] = combined_kernel.get_subkernel_weights(
            )
        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None

        print "using MKL:", (param.flags["mkl_q"] >= 1.0)

        if param.flags["mkl_q"] >= 1.0:

            svm = MKLClassification()

            svm.set_mkl_norm(param.flags["mkl_q"])

            # set interleaved optimization
            if param.flags.has_key("interleaved"):
                svm.set_interleaved_optimization_enabled(
                    param.flags["interleaved"])

            # set solver type
            if param.flags.has_key(
                    "solver_type") and param.flags["solver_type"]:
                if param.flags["solver_type"] == "ST_CPLEX":
                    svm.set_solver_type(ST_CPLEX)
                if param.flags["solver_type"] == "ST_DIRECT":
                    svm.set_solver_type(ST_DIRECT)
                if param.flags["solver_type"] == "ST_NEWTON":
                    svm.set_solver_type(ST_NEWTON)
                if param.flags["solver_type"] == "ST_GLPK":
                    svm.set_solver_type(ST_GLPK)

            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)

        else:

            svm = SVMLight(param.cost, combined_kernel, lab)

        # optimization settings
        num_threads = 4
        svm.parallel.set_num_threads(num_threads)

        if param.flags.has_key("epsilon"):
            svm.set_epsilon(param.flags["epsilon"])

        # enable output
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

        # disable unsupported optimizations (due to special normalizer)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        # set cost
        if param.flags["normalize_cost"]:

            norm_c_pos = param.cost / float(
                len([l for l in data.labels if l == 1]))
            norm_c_neg = param.cost / float(
                len([l for l in data.labels if l == -1]))
            svm.set_C(norm_c_neg, norm_c_pos)

        else:

            svm.set_C(param.cost, param.cost)

        svm.train()

        # prepare mapping
        weight_map = {}
        weights = combined_kernel.get_subkernel_weights()
        for (i, pset) in enumerate(power_set_tasks):
            print pset
            subset_str = str([data.id_to_name(task_idx) for task_idx in pset])
            weight_map[subset_str] = weights[i]

        # store additional info
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["weight_map"] = weight_map

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name),
                               len(power_set_tasks), combined_kernel, svm,
                               param)

        return svms
コード例 #56
0
def training_run(options):
    """Conduct a training run and return a trained SVM kernel"""
    settings = MotifFinderSettings(kirmes_ini.MOTIF_LENGTH,
                                   options.window_width, options.replace)
    positives = MotifFinder(finder_settings=settings)
    positives.setFastaFile(options.positives)
    positives.setMotifs(options.pgff)
    pmotifs, ppositions = positives.getResults()
    negatives = MotifFinder(finder_settings=settings)
    negatives.setFastaFile(options.negatives)
    negatives.setMotifs(options.ngff)
    nmotifs, npositions = negatives.getResults()

    wds_kparams = kirmes_ini.WDS_KERNEL_PARAMETERS
    wds_svm = EasySVM.EasySVM(wds_kparams)
    num_positives = len(pmotifs.values()[0])
    num_negatives = len(nmotifs.values()[0])
    #Creating Kernel Objects
    kernel = CombinedKernel()
    features = CombinedFeatures()
    kernel_array = []
    motifs = pmotifs.keys()
    motifs.sort()
    #Adding Kmer Kernels
    for motif in motifs:
        all_examples = pmotifs[motif] + nmotifs[motif]
        motif_features = wds_svm.createFeatures(all_examples)
        wds_kernel = WeightedDegreePositionStringKernel(motif_features, motif_features, \
                                                        wds_kparams['degree'])
        wds_kernel.set_shifts(wds_kparams['shift'] *
                              ones(wds_kparams['seqlength'], dtype=int32))
        features.append_feature_obj(motif_features)
        kernel_array.append(wds_kernel)
        kernel.append_kernel(wds_kernel)
    rbf_svm = EasySVM.EasySVM(kirmes_ini.RBF_KERNEL_PARAMETERS)
    positions = array(ppositions + npositions, dtype=float64).T
    position_features = rbf_svm.createFeatures(positions)
    features.append_feature_obj(position_features)
    motif_labels = append(ones(num_positives), -ones(num_negatives))
    complete_labels = Labels(motif_labels)
    rbf_kernel = GaussianKernel(position_features, position_features, \
                                kirmes_ini.RBF_KERNEL_PARAMETERS['width'])
    kernel_array.append(rbf_kernel)
    kernel.append_kernel(rbf_kernel)
    #Kernel init
    kernel.init(features, features)
    kernel.set_cache_size(kirmes_ini.K_CACHE_SIZE)
    svm = LibSVM(kirmes_ini.K_COMBINED_C, kernel, complete_labels)
    svm.parallel.set_num_threads(kirmes_ini.K_NUM_THREADS)
    #Training
    svm.train()
    if not os.path.exists(options.output_path):
        os.mkdir(options.output_path)
    html = {}
    if options.contrib:
        html["contrib"] = contrib(svm, kernel, motif_labels, kernel_array,
                                  motifs)
    if options.logos:
        html["poims"] = poims(svm, kernel, kernel_array, motifs,
                              options.output_path)
    if options.query:
        html["query"] = evaluate(options, svm, kernel, features, motifs)
    htmlize(html, options.output_html)
コード例 #57
0
def mkl_binclass_modular(fm_train_real=traindat,
                         fm_test_real=testdat,
                         fm_label_twoclass=label_traindat):

    ##################################
    # set up and train

    # create some poly train/test matrix
    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, tfeats)
    K_train = tkernel.get_kernel_matrix()

    pfeats = RealFeatures(fm_test_real)
    tkernel.init(tfeats, pfeats)
    K_test = tkernel.get_kernel_matrix()

    # create combined train features
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(fm_train_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_train))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_train)

    # train mkl
    labels = BinaryLabels(fm_label_twoclass)
    mkl = MKLClassification()

    # which norm to use for MKL
    mkl.set_mkl_norm(1)  #2,3

    # set cost (neg, pos)
    mkl.set_C(1, 1)

    # set kernel and labels
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)

    # train
    mkl.train()
    #w=kernel.get_subkernel_weights()
    #kernel.set_subkernel_weights(w)

    ##################################
    # test

    # create combined test features
    feats_pred = CombinedFeatures()
    feats_pred.append_feature_obj(RealFeatures(fm_test_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_test))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_pred)

    # and classify
    mkl.set_kernel(kernel)
    mkl.apply()
    return mkl.apply(), kernel
コード例 #58
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)

        # create shogun label
        lab = shogun_factory.create_labels(data.labels)

        ##################################################
        # define pockets
        ##################################################

        pockets = [0] * 9

        pockets[0] = [1, 5, 6, 7, 8, 31, 32, 33, 34]
        pockets[1] = [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31]
        pockets[2] = [11, 20, 21, 22, 29, 31]
        pockets[3] = [8, 30, 31, 32]
        pockets[4] = [10, 11, 30]
        pockets[5] = [10, 11, 12, 13, 20, 29]
        pockets[6] = [10, 12, 20, 22, 26, 27, 28, 29]
        pockets[7] = [12, 14, 15, 26]
        pockets[8] = [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26]

        #new_pockets = []

        # merge neighboring pockets
        #for i in range(8):
        #    new_pockets.append(list(set(pockets[i]).union(set(pockets[i+1]))))

        #pockets = new_pockets

        ########################################################
        print "creating a kernel:"
        ########################################################

        # assemble combined kernel

        combined_kernel = CombinedKernel()

        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

        base_features = shogun_factory.create_features(data.examples)

        combined_features = CombinedFeatures()

        ##################################################
        # intra-domain blocks

        #        intra_block_vec = PairiiVec()
        #
        #        for task_id in data.get_task_ids():
        #            intra_block_vec.push_back(Pairii(task_id, task_id))
        #
        #
        #
        #        # create mask-based normalizer
        #        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)
        #        kernel = shogun_factory.create_empty_kernel(param)
        #        kernel.set_normalizer(normalizer)
        #
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel)
        #
        #        # append features
        #        combined_features.append_feature_obj(base_features)
        #
        #        print "------"
        #
        #        ##################################################
        #        # all blocks
        #
        #
        #        all_block_vec = PairiiVec()
        #
        #        for task_id_1 in data.get_task_ids():
        #            for task_id_2 in data.get_task_ids():
        #                all_block_vec.push_back(Pairii(task_id_1, task_id_2))
        #
        #
        #        # create mask-based normalizer
        #        normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)
        #        kernel_all = shogun_factory.create_empty_kernel(param)
        #        kernel_all.set_normalizer(normalizer_all)
        #
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel_all)
        #
        #        # append features
        #        combined_features.append_feature_obj(base_features)

        ##################################################
        # add one kernel per similarity position

        # init seq handler
        pseudoseqs = SequencesHandler()

        for pocket in pockets:

            print "creating normalizer"
            #import pdb
            #pdb.set_trace()

            normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

            print "processing pocket", pocket

            # set similarity
            for task_name_lhs in data.get_task_names():
                for task_name_rhs in data.get_task_names():

                    similarity = 0.0

                    for pseudo_seq_pos in pocket:
                        similarity += float(
                            pseudoseqs.get_similarity(task_name_lhs,
                                                      task_name_rhs,
                                                      pseudo_seq_pos - 1))

                    # normalize
                    similarity = similarity / float(len(pocket))

                    print "pocket %s (%s, %s) = %f" % (
                        str(pocket), task_name_lhs, task_name_rhs, similarity)

                    normalizer.set_task_similarity(
                        data.name_to_id(task_name_lhs),
                        data.name_to_id(task_name_rhs), similarity)

            print "creating empty kernel"
            kernel_pos = shogun_factory.create_empty_kernel(param)

            print "setting normalizer"
            kernel_pos.set_normalizer(normalizer)

            print "appending kernel"
            # append current kernel to CombinedKernel
            combined_kernel.append_kernel(kernel_pos)

            print "appending features"
            # append features
            combined_features.append_feature_obj(base_features)

        print "done constructing combined kernel"

        ##################################################
        # init combined kernel

        # init weights
        # combined_kernel.set_subkernel_weights([1.0/2.85]*combined_kernel.get_num_subkernels())

        combined_kernel.init(combined_features, combined_features)

        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None

        print "using MKL:", (param.transform >= 1.0)

        if param.transform >= 1.0:

            svm = MKLClassification()

            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto

            svm.set_C(param.cost, param.cost)

            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)

        else:

            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)

            svm = SVMLight(param.cost, combined_kernel, lab)

        # set up SVM
        num_threads = 8
        svm.io.enable_progress()
        #svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        #print "WARNING: custom epsilon set"
        #svm.set_epsilon(0.05)

        # normalize cost
        norm_c_pos = param.cost / float(len([l
                                             for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(
            len([l for l in data.labels if l == -1]))

        svm.set_C(norm_c_neg, norm_c_pos)

        # start training
        svm.train()

        # save additional info
        self.additional_information["svm_objective"] = svm.get_objective()
        self.additional_information[
            "svm num sv"] = svm.get_num_support_vectors()
        self.additional_information[
            "post_weights"] = combined_kernel.get_subkernel_weights()

        print self.additional_information

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), combined_kernel,
                               svm)

        return svms