# assumes numpy and the legacy Shogun modular interface are imported, e.g.:
#   import numpy
#   from shogun.Features import CombinedFeatures
#   from shogun.Kernel import CombinedKernel, CommWordStringKernel, CommUlongStringKernel
def get_weighted_spectrum_kernel(subfeats_list, options):
	"""build weighted spectrum kernel with a non-redundant k-mer list (reverse complements removed)

	Arguments:
	subfeats_list -- list of sub-feature objects, one per k-mer length
	options -- option object providing kmerlen and kmerlen2 (the k-mer length range)

	Return:
	CombinedKernel of CommWord(Ulong)StringKernel subkernels, initialized on a
	CombinedFeatures built from the given StringWord(Ulong)Features
	"""
	kmerlen = options.kmerlen
	kmerlen2 = options.kmerlen2

	subkernels = 0
	kernel = CombinedKernel()
	feats = CombinedFeatures()

	for subfeats in subfeats_list:
		feats.append_feature_obj(subfeats)

	for k in xrange(kmerlen, kmerlen2+1):
		# 16-bit words hold DNA k-mers up to k=8 (2 bits per base);
		# longer k-mers need the 64-bit ulong spectrum kernel
		if k <= 8:
			subkernel = CommWordStringKernel(10, False)   # args: cache size (MB), use_sign
		else:
			subkernel = CommUlongStringKernel(10, False)

		kernel.append_kernel(subkernel)
		subkernels+=1

	kernel.init(feats, feats)

	# uniform weight for each subkernel (weights sum to 1)
	kernel.set_subkernel_weights(numpy.array([1/float(subkernels)]*subkernels, numpy.dtype('float64')))

	return kernel
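A minimal usage sketch (not part of the original source): it assumes the legacy Shogun modular interface, a list of DNA strings named `sequences`, and an `options` object carrying `kmerlen`/`kmerlen2`. For k <= 8 a sorted StringWordFeatures object is built per k-mer length; k > 8 would need the StringUlongFeatures/SortUlongString variants instead.

from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
from shogun.PreProc import SortWordString

charfeats = StringCharFeatures(sequences, DNA)
subfeats_list = []
for k in xrange(options.kmerlen, options.kmerlen2 + 1):
    subfeats = StringWordFeatures(charfeats.get_alphabet())
    subfeats.obtain_from_char(charfeats, k - 1, k, 0, False)
    preproc = SortWordString()          # CommWordStringKernel expects sorted word features
    preproc.init(subfeats)
    subfeats.add_preproc(preproc)
    subfeats.apply_preproc()
    subfeats_list.append(subfeats)

kernel = get_weighted_spectrum_kernel(subfeats_list, options)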
Example #2
def get_weighted_spectrum_kernel(subfeats_list, options):
    """build weighted spectrum kernel with non-redundant k-mer list (removing reverse complement)

	Arguments:
	subfeats_list -- list of sub-feature objects
	options -- object containing option data 

	Return:
	CombinedFeatures of StringWord(Ulong)Features, CombinedKernel of CommWord(Ulong)StringKernel 
	"""
    kmerlen = options.kmerlen
    kmerlen2 = options.kmerlen2

    subkernels = 0
    kernel = CombinedKernel()
    feats = CombinedFeatures()

    # NOTE: besides appending each sub-feature object to `feats`, the loop below
    # tallies k-mer counts with collections.Counter, but `weights`, `number` and
    # `klen` are never used afterwards (the subkernel weights set further down are
    # uniform). `Counter` would also need to be imported from collections, and the
    # inner loop index shadows the outer `i`.
    weights = []

    i = 0
    for subfeats in subfeats_list:
        feats.append_feature_obj(subfeats)

        combine_kcount = Counter()
        for i in xrange(subfeats.get_num_vectors()):
            fv = list(subfeats.get_feature_vector(i))
            combine_kcount += Counter(fv)
            number = len(combine_kcount)
            klen = kmerlen + i

    for k in xrange(kmerlen, kmerlen2 + 1):
        if k <= 8:
            subkernel = CommWordStringKernel(10, False)
        else:
            subkernel = CommUlongStringKernel(10, False)

        kernel.append_kernel(subkernel)
        subkernels += 1

    kernel.init(feats, feats)
    # the weight for each subkernel (one per k-mer length) is uniform, e.g. with
    # subkernels = 8:
    #   numpy.array([1 / float(subkernels)] * subkernels, numpy.dtype('float64'))
    #   -> array([ 0.125,  0.125,  0.125,  0.125,  0.125,  0.125,  0.125,  0.125])
    kernel.set_subkernel_weights(
        numpy.array([1 / float(subkernels)] * subkernels,
                    numpy.dtype('float64')))

    return kernel
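Rather than fixing uniform subkernel weights, the per-k weights can be learned with multiple kernel learning. A rough sketch, assuming the legacy Shogun modular interface, with `kernel` being the CombinedKernel returned above and hypothetical label array `trainlab` and cost value `C`:

from shogun.Features import Labels
from shogun.Classifier import MKLClassification

labels = Labels(trainlab)          # trainlab: numpy array of +1/-1 labels
mkl = MKLClassification()
mkl.set_mkl_norm(1.0)              # L1-norm MKL yields sparse subkernel weights
mkl.set_C(C, C)
mkl.set_kernel(kernel)             # the CombinedKernel returned above
mkl.set_labels(labels)
mkl.train()
learned_weights = kernel.get_subkernel_weights()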
Example #4
c2.build_values(-4.0, 4.0, R_EXP)  # exponential grid of candidate C2 values
param_tree_root.append_child(c2)

splitting_strategy   = StratifiedCrossValidationSplitting(labels, 50)
evaluation_criterium = ContingencyTableEvaluation(ACCURACY)
cross_validation     = CrossValidation(classifier, feats_train, labels, splitting_strategy, evaluation_criterium)
model_selection      = GridSearchModelSelection(param_tree_root, cross_validation)
best_parameters      = model_selection.select_model(True)

print "Best parameters: ",
best_parameters.print_tree()
best_parameters.apply_to_machine(classifier)

classifier.train()
w=kernel.get_subkernel_weights()
kernel.set_subkernel_weights(w)

# Plot the training-set ROC curve (plotting functions assume pylab, e.g. `from pylab import *`)
subplot(111)
ROC_evaluation = ROCEvaluation()
ROC_evaluation.evaluate(classifier.apply(feats_train), Labels(trainlab))
roc = ROC_evaluation.get_ROC()
plot(roc[0], roc[1])
fill_between(roc[0], roc[1], 0, alpha=0.1)
grid(True)
xlabel('FPR')
ylabel('TPR')
title('Train ROC (Width=%.3f, C1=%.3f, C2=%.3f), auROC = %.3f' % (10, classifier.get_C1(), classifier.get_C2(), ROC_evaluation.get_auROC()), size=10)
savefig("data/iri/mkl.png")
"""
subplot(222)
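The same ROC evaluation can be repeated on held-out data. A short sketch assuming test features `feats_test` and a test label array `testlab` prepared like the training data (neither name, nor the output path, appears in the original snippet):

ROC_test = ROCEvaluation()
ROC_test.evaluate(classifier.apply(feats_test), Labels(testlab))
roc_test = ROC_test.get_ROC()
plot(roc_test[0], roc_test[1])
title('Test ROC (auROC = %.3f)' % ROC_test.get_auROC(), size=10)
savefig("data/iri/mkl_test.png")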
Example #5
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_empty_kernel(param)
        lab = shogun_factory.create_labels(data.labels)

        combined_kernel = CombinedKernel()
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
        base_features = shogun_factory.create_features(data.examples)
        combined_features = CombinedFeatures()

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        # load data
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
        f = file(
            "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt"
        )
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")

        num_lines = int(f.readline().strip())
        task_distances = numpy.zeros((num_lines, num_lines))
        name_to_id = {}
        for (i, line) in enumerate(f):
            tokens = line.strip().split("\t")
            name = str(tokens[0])
            name_to_id[name] = i
            # first token is the task name; the rest are distances, parsed as floats
            entry = numpy.array(tokens[1:], dtype=numpy.float64)
            assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (
                len(entry), num_lines)
            task_distances[i, :] = entry

        # cut relevant submatrix
        active_ids = [name_to_id[name] for name in data.get_task_names()]
        tmp_distances = task_distances[active_ids, :]
        tmp_distances = tmp_distances[:, active_ids]
        print "distances ", tmp_distances.shape

        # normalize distances
        task_distances = task_distances / numpy.max(tmp_distances)

        similarities = numpy.zeros(
            (data.get_num_tasks(), data.get_num_tasks()))

        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                # convert distance to similarity with a simple linear transformation
                similarity = param.base_similarity - task_distances[
                    name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

                # save for later
                similarities[data.name_to_id(task_name_lhs),
                             data.name_to_id(task_name_rhs)] = similarity

        # set normalizer
        base_wdk.set_normalizer(normalizer)
        #base_wdk.init_normalizer()

        combined_features.append_feature_obj(base_features)
        combined_kernel.append_kernel(base_wdk)

        ##################################################
        # intra-domain blocks

        intra_block_vec = PairiiVec()

        for task_id in data.get_task_ids():
            intra_block_vec.push_back(Pairii(task_id, task_id))

        # create mask-based normalizer
        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums,
                                                       intra_block_vec)
        kernel = shogun_factory.create_empty_kernel(param)
        kernel.set_normalizer(normalizer)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel)

        # append features
        combined_features.append_feature_obj(base_features)

        # set mixing factor (used if MKL is OFF)
        assert (param.base_similarity <= 1)
        assert (param.base_similarity >= 0)
        combined_kernel.set_subkernel_weights(
            numpy.array([param.base_similarity, 1.0 - param.base_similarity],
                        numpy.dtype('float64')))

        combined_kernel.init(combined_features, combined_features)

        svm = None

        print "using MKL:", (param.transform >= 1.0)

        if param.transform >= 1.0:

            svm = MKLClassification()

            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto

            svm.set_C(param.cost, param.cost)

            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)

        else:

            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)

            svm = SVMLight(param.cost, combined_kernel, lab)

        # set up SVM
        num_threads = 8
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        print "WARNING: custom epsilon set"
        svm.set_epsilon(0.05)

        # normalize cost
        norm_c_pos = param.cost / float(len([l
                                             for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(
            len([l for l in data.labels if l == -1]))

        svm.set_C(norm_c_neg, norm_c_pos)

        # start training
        svm.train()

        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        self.additional_information["similarities"] = similarities
        self.additional_information[
            "post_weights"] = combined_kernel.get_subkernel_weights()

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in data.get_task_names():

            task_num = data.name_to_id(task_name)

            # save svm and task_num
            svms[task_name] = (task_num, combined_kernel, svm)

        return svms
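For intuition about the distance-to-similarity step in `_train()` above (`similarity = base_similarity - normalized_distance`), here is a tiny self-contained illustration with toy numbers (not from the original data):

import numpy

base_similarity = 1.0
task_distances = numpy.array([[0.0, 0.4],
                              [0.4, 0.0]])

# normalize by the largest distance among the active tasks, as in _train()
task_distances = task_distances / numpy.max(task_distances)

# identical tasks get similarity 1.0, the most distant pair gets 0.0
similarities = base_similarity - task_distances
print similarities
# [[ 1.  0.]
#  [ 0.  1.]]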