# Example 1
def fit_and_predict(load_test_data, train_data, test_feature_matrics,
                    train_label, test_label_OR_test_data):
    """Train an AveragedPerceptron and either write a submission or score it.

    Parameters
    ----------
    load_test_data : bool
        True  -> ``test_label_OR_test_data`` is the raw test DataFrame;
        predictions are inserted and written to ``submission.csv``.
        False -> ``test_label_OR_test_data`` is the ground-truth label
        vector; accuracy and F1 are printed instead.
    train_data, test_feature_matrics : feature matrices for RealFeatures.
    train_label : binary (+1/-1) training label vector.
    """
    features_train = RealFeatures(train_data)
    features_test = RealFeatures(test_feature_matrics)
    labels_train = BinaryLabels(train_label)

    learn_rate = 1.0
    max_iter = 1000
    perceptron = AveragedPerceptron(features_train, labels_train)
    perceptron.set_learn_rate(learn_rate)
    perceptron.set_max_iter(max_iter)
    perceptron.train()
    perceptron.set_features(features_test)
    labels_predict = perceptron.apply()
    # BUG FIX: the original referenced the undefined names ``prediction80``
    # and ``prediction``; derive the prediction vector from the classifier
    # output instead.
    prediction = labels_predict.get_labels()
    if load_test_data:
        del test_label_OR_test_data['question_text']
        test_label_OR_test_data.insert(1, 'prediction', prediction)
        test_label_OR_test_data.to_csv('submission.csv', index=False)
        return prediction
    else:
        labels_test = BinaryLabels(test_label_OR_test_data)
        accEval = AccuracyMeasure()
        accuracy = accEval.evaluate(labels_predict, labels_test)
        f1Eval = F1Measure()
        f1_score = f1Eval.evaluate(labels_predict, labels_test)
        print('#accuracy is: ', accuracy)
        print('#F1 score is: ', f1_score)
# Example 2
def evaluation_thresholds(index):
    """Cross-check ROC rates at one threshold against directly computed rates.

    Builds a deterministic noisy score vector whose first half is labelled -1
    and second half +1, runs shogun's ROCEvaluation, and returns the curve
    values at position *index* together with the hand-computed TPR/FPR at the
    matching threshold.
    """
    from shogun import BinaryLabels, ROCEvaluation
    import numpy

    numpy.random.seed(17)  # reproducible scores
    scores = numpy.arange(-1, 1, 0.001)
    scores = 0.3 * scores + 0.7 * (numpy.random.rand(len(scores)) - 0.5)

    half = len(scores) // 2
    truth_values = numpy.array([-1.0] * half + [1.0] * half)

    predicted = BinaryLabels(scores)
    ground_truth = BinaryLabels(truth_values)

    roc = ROCEvaluation()
    roc.evaluate(predicted, ground_truth)

    fp, tp = roc.get_ROC()
    threshold = roc.get_thresholds()[index]

    manual_tpr = numpy.mean(scores[truth_values > 0] > threshold)
    manual_fpr = numpy.mean(scores[truth_values < 0] > threshold)
    return tp[index], fp[index], manual_tpr, manual_fpr
# Example 3
def classifier_domainadaptationsvm(fm_train_dna=traindna, fm_test_dna=testdna,
                                   label_train_dna=label_traindna,
                                   label_test_dna=label_testdna,
                                   fm_train_dna2=traindna2,
                                   fm_test_dna2=testdna2,
                                   label_train_dna2=label_traindna2,
                                   label_test_dna2=label_testdna2,
                                   C=1, degree=3):
    """Train a DomainAdaptationSVM on a second domain, regularized against an
    SVMLight trained on the first domain.

    Returns the binary predictions on the second domain's test data.
    """
    # stage 1: plain SVMLight on the source-domain data
    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)
    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
    labels = BinaryLabels(label_train_dna)
    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    #####################################

    # stage 2: obtain DA SVM from the previously trained SVM.
    # BUG FIX: the original rebuilt the stage-2 features, kernel and labels
    # from the *first* domain's data, leaving every ``*_dna2`` argument
    # unused; use the second domain's data as the signature promises.
    feats_train2 = StringCharFeatures(fm_train_dna2, DNA)
    feats_test2 = StringCharFeatures(fm_test_dna2, DNA)
    kernel2 = WeightedDegreeStringKernel(feats_train2, feats_train2, degree)
    labels2 = BinaryLabels(label_train_dna2)

    # we regularize against the previously obtained solution
    dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
    dasvm.train()

    out = dasvm.apply_binary(feats_test2)

    return out  #,dasvm TODO
# Example 4
def modelselection_grid_search_kernel(num_subsets, num_vectors, dim_vectors):
    """Grid-search model selection with cross-validation on random data.

    Returns (classifier, raw CV result, mean accuracy of the best model).
    """
    # fixed seeds so the "random" data is reproducible
    Math.init_random(1)
    random.seed(1)

    # random (non-sense) feature matrix: dim_vectors x num_vectors
    features = RealFeatures()
    features.set_feature_matrix(random.rand(dim_vectors, num_vectors))

    # alternating +1/-1 labels, two classes
    labels = BinaryLabels(num_vectors)
    for idx in range(num_vectors):
        labels.set_label(idx, 1 if idx % 2 == 0 else -1)

    classifier = LibSVM()

    # stratified splits keep the class balance in every fold
    splitting = StratifiedCrossValidationSplitting(labels, num_subsets)

    # accuracy as the model-selection criterion
    criterion = ContingencyTableEvaluation(ACCURACY)

    # cross-validation machinery used inside the grid search
    cross = CrossValidation(classifier, features, labels, splitting,
                            criterion)
    cross.set_num_runs(1)  # a single run is enough while searching

    # search the parameter tree built by create_param_tree()
    grid_search = GridSearchModelSelection(cross, create_param_tree())
    best_combination = grid_search.select_model(False)
    best_combination.apply_to_machine(classifier)

    # re-evaluate the winner with more runs to reduce variance
    cross.set_num_runs(10)
    result = cross.evaluate()
    casted = CrossValidationResult.obtain_from_generic(result)
    return classifier, result, casted.get_mean()
def modelselection_grid_search_kernel (num_subsets, num_vectors, dim_vectors):
	"""Select kernel parameters by grid search with cross-validation.

	NOTE(review): this is a near byte-for-byte duplicate of the function
	defined earlier in this file; being defined later, it shadows the first
	definition at import time — presumably a scraping artifact, confirm and
	remove one copy.

	Returns (classifier, raw CV result, mean accuracy of the best model).
	"""
	# init seed for reproducability
	Math.init_random(1)
	random.seed(1);

	# create some (non-sense) data
	matrix=random.rand(dim_vectors, num_vectors)

	# create num_feautres 2-dimensional vectors
	features=RealFeatures()
	features.set_feature_matrix(matrix)

	# create labels, two classes
	labels=BinaryLabels(num_vectors)
	for i in range(num_vectors):
		labels.set_label(i, 1 if i%2==0 else -1)

	# create svm
	classifier=LibSVM()

	# splitting strategy
	splitting_strategy=StratifiedCrossValidationSplitting(labels, num_subsets)

	# accuracy evaluation
	evaluation_criterion=ContingencyTableEvaluation(ACCURACY)

	# cross validation class for evaluation in model selection
	cross=CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterion)
	cross.set_num_runs(1)

	# print all parameter available for modelselection
	# Dont worry if yours is not included, simply write to the mailing list
	#classifier.print_modsel_params()

	# model parameter selection
	param_tree=create_param_tree()
	#param_tree.print_tree()

	grid_search=GridSearchModelSelection(cross, param_tree)

	print_state=False
	best_combination=grid_search.select_model(print_state)
	#print("best parameter(s):")
	#best_combination.print_tree()

	best_combination.apply_to_machine(classifier)

	# larger number of runs to have less variance
	cross.set_num_runs(10)
	result=cross.evaluate()
	casted=CrossValidationResult.obtain_from_generic(result);
	#print "result mean:", casted.mean

	return classifier,result,casted.get_mean()
# Example 6
def evaluation_rocevaluation(ground_truth, predicted):
    """Return the ROC curve and the area under it for binary predictions."""
    from shogun import BinaryLabels
    from shogun import ROCEvaluation

    truth_labels = BinaryLabels(ground_truth)
    pred_labels = BinaryLabels(predicted)

    roc = ROCEvaluation()
    roc.evaluate(pred_labels, truth_labels)

    return roc.get_ROC(), roc.get_auROC()
def evaluation_cross_validation_classification(traindat=traindat,
                                               label_traindat=label_traindat):
    """Run 10x 5-fold stratified cross-validation of a LibLinear SVM.

    Returns the cross-validation result object.
    BUG FIX: the original computed the result and then fell off the end of
    the function, returning None.
    """
    from shogun import CrossValidation, CrossValidationResult
    from shogun import ContingencyTableEvaluation, ACCURACY
    from shogun import StratifiedCrossValidationSplitting
    from shogun import BinaryLabels
    from shogun import RealFeatures
    from shogun import LibLinear, L2R_L2LOSS_SVC

    # training data
    features = RealFeatures(traindat)
    labels = BinaryLabels(label_traindat)

    # classifier
    classifier = LibLinear(L2R_L2LOSS_SVC)

    # splitting strategy for 5-fold cross-validation; stratified splits keep
    # the class proportions in every fold (plain CrossValidationSplitting
    # is also available)
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = ContingencyTableEvaluation(ACCURACY)

    # cross-validation instance
    cross_validation = CrossValidation(classifier, features, labels,
                                       splitting_strategy,
                                       evaluation_criterium)
    cross_validation.set_autolock(False)

    # repeat x-val 10 times for a less noisy estimate
    cross_validation.set_num_runs(10)

    result = cross_validation.evaluate()
    return result
# Example 8
def classifier_svmlight(fm_train_dna=traindat,
                        fm_test_dna=testdat,
                        label_train_dna=label_traindat,
                        C=1.2,
                        epsilon=1e-5,
                        num_threads=1):
    """Train SVMLight with a weighted-degree string kernel on DNA data.

    Returns the kernel, re-initialized on (train, test) features.
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel
    try:
        from shogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    train_feats = StringCharFeatures(DNA)
    train_feats.set_features(fm_train_dna)
    test_feats = StringCharFeatures(DNA)
    test_feats.set_features(fm_test_dna)

    degree = 20  # weighted-degree kernel order
    wd_kernel = WeightedDegreeStringKernel(train_feats, train_feats, degree)
    train_labels = BinaryLabels(label_train_dna)

    svm = SVMLight(C, wd_kernel, train_labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    # switch the kernel to train-vs-test and classify (labels discarded)
    wd_kernel.init(train_feats, test_feats)
    svm.apply().get_labels()
    return wd_kernel
def transfer_multitask_clustered_logistic_regression(fm_train=traindat,
                                                     fm_test=testdat,
                                                     label_train=label_traindat
                                                     ):
    """Run clustered multitask logistic regression on three synthetic tasks.

    NOTE(review): like the upstream shogun example, this builds its data from
    the module-level ``traindat`` rather than the ``fm_train`` argument —
    confirm before relying on the parameters.
    """
    from shogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG
    try:
        from shogun import MultitaskClusteredLogisticRegression
    except ImportError:
        print("MultitaskClusteredLogisticRegression not available")
        exit()

    # three transformed copies of the data -> three tasks
    features = RealFeatures(hstack((traindat, sin(traindat), cos(traindat))))
    labels = BinaryLabels(hstack((label_train, label_train, label_train)))

    total = features.get_num_vectors()
    tasks = TaskGroup()
    tasks.append_task(Task(0, total // 3))
    tasks.append_task(Task(total // 3, 2 * total // 3))
    tasks.append_task(Task(2 * total // 3, total))

    clf = MultitaskClusteredLogisticRegression(1.0, 100.0, features, labels,
                                               tasks, 2)
    #clf.io.set_loglevel(MSG_DEBUG)
    clf.set_tolerance(1e-3)  # use 1e-3 tolerance
    clf.set_max_iter(100)
    clf.train()
    clf.set_current_task(0)
    return clf.apply_regression().get_labels()
# Example 10
def kernel_salzberg_word_string(fm_train_dna=traindat,
                                fm_test_dna=testdat,
                                label_train_dna=label_traindat,
                                order=3,
                                gap=0,
                                reverse=False):
    """Salzberg word-string kernel matrices from a trained PluginEstimate.

    Returns (train kernel matrix, test kernel matrix, kernel).
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import SalzbergWordStringKernel
    from shogun import PluginEstimate

    def to_word_feats(dna):
        # raw DNA chars -> k-mer (word) features of the requested order
        chars = StringCharFeatures(dna, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    feats_train = to_word_feats(fm_train_dna)
    feats_test = to_word_feats(fm_test_dna)

    labels = BinaryLabels(label_train_dna)
    pie = PluginEstimate()
    pie.set_labels(labels)
    pie.set_features(feats_train)
    pie.train()

    kernel = SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.apply().get_labels()  # classify test set (result unused upstream)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
# Example 11
def classifier_svmocas(train_fname=traindat,
                       test_fname=testdat,
                       label_fname=label_traindat,
                       C=0.9,
                       epsilon=1e-5,
                       num_threads=1):
    """Train a linear SVMOcas classifier from CSV files and classify the
    test set.

    Returns (prediction labels object, trained svm, raw label array).
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import CSVFile
    try:
        from shogun import SVMOcas
    except ImportError:
        print("SVMOcas not available")
        return

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    train_labels = BinaryLabels(CSVFile(label_fname))

    svm = SVMOcas(C, train_feats, train_labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(False)  # no bias term
    svm.train()

    bias = svm.get_bias()  # fetched as in the upstream example (unused)
    w = svm.get_w()
    predictions = svm.apply(test_feats)
    return predictions, svm, predictions.get_labels()
# Example 12
def classifier_ssk(fm_train_dna=traindat,
                   fm_test_dna=testdat,
                   label_train_dna=label_traindat,
                   C=1,
                   maxlen=1,
                   decay=1):
    """Train LibSVM with a subsequence string kernel (SSK).

    Returns the predicted labels for the test set.
    """
    from shogun import StringCharFeatures, BinaryLabels
    from shogun import LibSVM, SubsequenceStringKernel, DNA
    from shogun import ErrorRateMeasure

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)
    train_labels = BinaryLabels(label_train_dna)
    ssk = SubsequenceStringKernel(train_feats, train_feats, maxlen, decay)

    svm = LibSVM(C, ssk, train_labels)
    svm.train()

    # training error, computed only for inspection
    train_out = svm.apply(train_feats)
    ErrorRateMeasure().evaluate(train_out, train_labels)

    ssk.init(train_feats, test_feats)
    return svm.apply(test_feats).get_labels()
def transfer_multitask_logistic_regression(fm_train=traindat,
                                           fm_test=testdat,
                                           label_train=label_traindat):
    """Run multitask logistic regression on two identical synthetic tasks.

    NOTE(review): like the upstream shogun example, this builds its data from
    the module-level ``traindat`` rather than the ``fm_train`` argument —
    confirm before relying on the parameters.
    """
    from shogun import BinaryLabels, RealFeatures, Task, TaskGroup
    try:
        from shogun import MultitaskLogisticRegression
    except ImportError:
        print("MultitaskLogisticRegression not available")
        exit()

    features = RealFeatures(hstack((traindat, traindat)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    total = features.get_num_vectors()
    tasks = TaskGroup()
    tasks.append_task(Task(0, total // 2))
    tasks.append_task(Task(total // 2, total))

    clf = MultitaskLogisticRegression(0.1, features, labels, tasks)
    clf.set_regularization(1)  # use regularization ratio
    clf.set_tolerance(1e-2)  # use 1e-2 tolerance
    clf.train()
    clf.set_current_task(0)
    return clf.apply().get_labels()
# Example 14
def kernel_histogram_word_string(fm_train_dna=traindat,
                                 fm_test_dna=testdat,
                                 label_train_dna=label_traindat,
                                 order=3,
                                 ppseudo_count=1,
                                 npseudo_count=1):
    """Histogram word-string kernel backed by a trained PluginEstimate.

    Returns (train kernel matrix, test kernel matrix, kernel).
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import HistogramWordStringKernel, AvgDiagKernelNormalizer
    from shogun import PluginEstimate  #, MSG_DEBUG

    def word_features(dna):
        # raw DNA chars -> order-k word features (no gap, no reverse)
        chars = StringCharFeatures(DNA)
        chars.set_features(dna)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, 0, False)
        return words

    feats_train = word_features(fm_train_dna)
    feats_test = word_features(fm_test_dna)

    pie = PluginEstimate(ppseudo_count, npseudo_count)
    labels = BinaryLabels(label_train_dna)
    pie.set_labels(labels)
    pie.set_features(feats_train)
    pie.train()

    kernel = HistogramWordStringKernel(feats_train, feats_train, pie)
    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.apply().get_labels()  # classify test set; result not used here
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def classifier_featureblock_logistic_regression(fm_train=traindat,
                                                fm_test=testdat,
                                                label_train=label_traindat):
    """Logistic regression with two feature blocks (each half of the dims).

    NOTE(review): like the upstream shogun example, this uses the
    module-level ``traindat`` rather than the ``fm_train`` argument —
    confirm before relying on the parameters.
    """
    from shogun import BinaryLabels, RealFeatures, IndexBlock, IndexBlockGroup
    try:
        from shogun import FeatureBlockLogisticRegression
    except ImportError:
        print("FeatureBlockLogisticRegression not available")
        exit(0)

    features = RealFeatures(hstack((traindat, traindat)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    dims = features.get_num_features()
    blocks = IndexBlockGroup()
    blocks.add_block(IndexBlock(0, dims // 2))
    blocks.add_block(IndexBlock(dims // 2, dims))

    clf = FeatureBlockLogisticRegression(0.1, features, labels, blocks)
    clf.set_regularization(1)  # use regularization ratio
    clf.set_tolerance(1e-2)  # use 1e-2 tolerance
    clf.train()
    return clf.apply().get_labels()
def classifier_svmlight_linear_term(fm_train_dna=traindna, fm_test_dna=testdna,
                                    label_train_dna=label_traindna, degree=3,
                                    C=10, epsilon=1e-5, num_threads=1):
    """Train SVMLight with a custom (negative) linear term on DNA strings.

    Returns (predicted labels for the test features, kernel).
    BUG FIX: the original mixed tabs and spaces inside the try block, which
    raises TabError under Python 3; indentation is now uniform.
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel
    try:
        from shogun import SVMLight
    except ImportError:
        print("SVMLight is not available")
        exit(0)

    feats_train = StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = BinaryLabels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_qpsize(3)
    # fixed linear term of length 10 — assumes 10 training examples,
    # as in the bundled example data (TODO confirm)
    svm.set_linear_term(
        -numpy.array([1, 2, 3, 4, 5, 6, 7, 8, 7, 6], dtype=numpy.double))
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)
    out = svm.apply().get_labels()
    return out, kernel
# Example 17
def classifier_gpbtsvm(train_fname=traindat,
                       test_fname=testdat,
                       label_fname=label_traindat,
                       width=2.1,
                       C=1,
                       epsilon=1e-5):
    """Train a GPBTSVM with a Gaussian kernel on CSV data.

    Returns (prediction labels object, trained svm, raw label array).
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import GaussianKernel
    from shogun import CSVFile
    try:
        from shogun import GPBTSVM
    except ImportError:
        print("GPBTSVM not available")
        exit(0)

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    train_labels = BinaryLabels(CSVFile(label_fname))
    gauss = GaussianKernel(train_feats, train_feats, width)

    svm = GPBTSVM(C, gauss, train_labels)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(test_feats)
    return predictions, svm, predictions.get_labels()
def evaluation_contingencytableevaluation(ground_truth, predicted):
    """Evaluate binary predictions with all contingency-table based measures.

    Returns (accuracy, errorrate, bal, wracc, f1, crosscorrelation, recall,
    precision, specificity).
    """
    from shogun import BinaryLabels
    from shogun import ContingencyTableEvaluation
    from shogun import AccuracyMeasure, ErrorRateMeasure, BALMeasure
    from shogun import WRACCMeasure, F1Measure, CrossCorrelationMeasure
    from shogun import RecallMeasure, PrecisionMeasure, SpecificityMeasure

    truth = BinaryLabels(ground_truth)
    pred = BinaryLabels(predicted)

    # base evaluator: run once as in the upstream example (result unused)
    ContingencyTableEvaluation().evaluate(pred, truth)

    # one score per specialised measure, in the original return order
    measures = (AccuracyMeasure, ErrorRateMeasure, BALMeasure,
                WRACCMeasure, F1Measure, CrossCorrelationMeasure,
                RecallMeasure, PrecisionMeasure, SpecificityMeasure)
    return tuple(measure().evaluate(pred, truth) for measure in measures)
# Example 19
def kernel_auc(train_fname=traindat, label_fname=label_traindat, width=1.7):
    """Build an AUC-maximization kernel on top of a Gaussian sub-kernel."""
    from shogun import GaussianKernel, AUCKernel, RealFeatures
    from shogun import BinaryLabels, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    gaussian = GaussianKernel(train_feats, train_feats, width)

    auc_kernel = AUCKernel(0, gaussian)
    auc_kernel.setup_auc_maximization(BinaryLabels(CSVFile(label_fname)))
    auc_kernel.get_kernel_matrix()  # computed as in the original (unused)
    return auc_kernel
# Example 20
def get_labels(raw=False, type='binary'):
    """Return labels for 2*NUM_EXAMPLES points: first half -1, second half +1.

    With raw=True the bare numpy array is returned; otherwise it is wrapped
    in BinaryLabels or RegressionLabels depending on *type* (None for an
    unknown type).
    """
    data = concatenate(
        array((-ones(NUM_EXAMPLES, dtype=double),
               ones(NUM_EXAMPLES, dtype=double))))
    if raw:
        return data
    if type == 'binary':
        return BinaryLabels(data)
    if type == 'regression':
        return RegressionLabels(data)
    return None
# Example 21
def evaluation_director_contingencytableevaluation(ground_truth, predicted):
    """Score predictions with a user-defined (director) contingency measure.

    The custom measure maximizes WRACC + BAL.  Returns (score, custom score).
    """
    try:
        from shogun import DirectorContingencyTableEvaluation, ED_MAXIMIZE
    except ImportError:
        print("recompile shogun with --enable-swig-directors")
        return

    class SimpleWeightedBinaryEvaluator(DirectorContingencyTableEvaluation):
        def __init__(self):
            DirectorContingencyTableEvaluation.__init__(self)

        def get_custom_direction(self):
            # larger scores are better
            return ED_MAXIMIZE

        def get_custom_score(self):
            return self.get_WRACC() + self.get_BAL()

    from shogun import BinaryLabels

    evaluator = SimpleWeightedBinaryEvaluator()
    score = evaluator.evaluate(BinaryLabels(ground_truth),
                               BinaryLabels(predicted))
    custom = evaluator.get_custom_score()
    print(score, custom)

    return score, custom
# Example 22
def features_from_file(fileName):
    """Load a CSV file (header skipped) into shogun features and labels.

    Column 1 holds the binary label and columns 2.. the feature values;
    column 0 is ignored (presumably a row id — TODO confirm against the
    data files).

    Returns (RealFeatures, raw nested feature list, BinaryLabels).
    """
    features = []
    labels = []
    # BUG FIX: close the file deterministically (the original leaked the
    # handle) and drop the ``numpy.float`` alias, removed in NumPy 1.24,
    # in favor of the builtin float.
    with open(fileName) as fileHandle:
        fileHandle.readline()  # skip the header line
        for line in fileHandle:
            tokens = line.split(',')
            labels.append(float(tokens[1]))
            features.append([float(token) for token in tokens[2:]])

    return RealFeatures(numpy.transpose(
        numpy.array(features))), features, BinaryLabels(
            numpy.array(labels, float))
# Example 23
def classifier_svmsgd(train_fname=traindat, test_fname=testdat,
                      label_fname=label_traindat, C=0.9, num_threads=1,
                      num_iter=5):
    """Train a linear SVM with stochastic gradient descent (SVMSGD).

    Returns (prediction labels object, trained svm, raw label array).
    """
    from shogun import RealFeatures, SparseRealFeatures, BinaryLabels
    from shogun import SVMSGD, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    train_labels = BinaryLabels(CSVFile(label_fname))

    svm = SVMSGD(C, train_feats, train_labels)
    svm.set_epochs(num_iter)
    #svm.io.set_loglevel(0)
    svm.train()

    bias = svm.get_bias()  # fetched as in the upstream example (unused)
    w = svm.get_w()
    predictions = svm.apply(test_feats)
    return predictions, svm, predictions.get_labels()
# Example 24
def kernel_combined_custom_poly(train_fname=traindat,
                                test_fname=testdat,
                                train_label_fname=label_traindat):
    """Train LibSVM on a combined kernel: a precomputed poly-3 custom kernel
    plus a live poly-2 kernel.

    Returns (kernel matrix of the prediction-time kernel, combined kernel).
    """
    from shogun import CombinedFeatures, RealFeatures, BinaryLabels
    from shogun import CombinedKernel, PolyKernel, CustomKernel
    from shogun import LibSVM, CSVFile

    # --- training: custom (precomputed poly-3) kernel + poly-2 sub-kernel
    train_kernel = CombinedKernel()
    feats_train = CombinedFeatures()

    tfeats = RealFeatures(CSVFile(train_fname))
    poly3 = PolyKernel(10, 3)
    poly3.init(tfeats, tfeats)
    train_kernel.append_kernel(CustomKernel(poly3.get_kernel_matrix()))

    subkfeats_train = RealFeatures(CSVFile(train_fname))
    feats_train.append_feature_obj(subkfeats_train)
    train_kernel.append_kernel(PolyKernel(10, 2))

    train_kernel.init(feats_train, feats_train)

    labels = BinaryLabels(CSVFile(train_label_fname))
    svm = LibSVM(1.0, train_kernel, labels)
    svm.train()

    # --- prediction: rebuild the combined kernel against the test data
    pred_kernel = CombinedKernel()
    feats_pred = CombinedFeatures()

    pfeats = RealFeatures(CSVFile(test_fname))
    poly3 = PolyKernel(10, 3)
    poly3.init(tfeats, pfeats)
    pred_kernel.append_kernel(CustomKernel(poly3.get_kernel_matrix()))

    subkfeats_test = RealFeatures(CSVFile(test_fname))
    feats_pred.append_feature_obj(subkfeats_test)
    pred_kernel.append_kernel(PolyKernel(10, 2))
    pred_kernel.init(feats_train, feats_pred)

    svm.set_kernel(pred_kernel)
    svm.apply()
    km_train = pred_kernel.get_kernel_matrix()
    return km_train, pred_kernel
# Example 25
def classifier_lda(train_fname=traindat,
                   test_fname=testdat,
                   label_fname=label_traindat,
                   gamma=3,
                   num_threads=1):
    """Train (regularized) LDA on CSV data and classify the test set.

    Returns (trained lda, predicted label array).
    """
    from shogun import RealFeatures, BinaryLabels, LDA, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    train_labels = BinaryLabels(CSVFile(label_fname))

    lda = LDA(gamma, train_feats, train_labels)
    lda.train()

    bias = lda.get_bias()  # fetched as in the upstream example (unused)
    w = lda.get_w()
    predicted = lda.apply(test_feats).get_labels()
    return lda, predicted
# Example 26
def classifier_svmlin(train_fname=traindat, test_fname=testdat,
                      label_fname=label_traindat, C=0.9, epsilon=1e-5,
                      num_threads=1):
    """Train a linear SVMLin classifier (with bias term) on CSV data.

    Returns (prediction labels object, trained svm, raw label array).
    """
    from shogun import RealFeatures, SparseRealFeatures, BinaryLabels
    from shogun import SVMLin, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    train_labels = BinaryLabels(CSVFile(label_fname))

    svm = SVMLin(C, train_feats, train_labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(True)
    svm.train()

    bias = svm.get_bias()  # fetched as in the upstream example (unused)
    w = svm.get_w()
    predictions = svm.apply(test_feats)
    return predictions, svm, predictions.get_labels()
# Example 27
def classifier_custom_kernel(C=1, dim=7):
    """Train LibSVM on a precomputed (custom) kernel matrix.

    Returns (trained svm, predicted label array).
    """
    from shogun import RealFeatures, BinaryLabels, CustomKernel, LibSVM
    from numpy import diag, ones, sign
    from numpy.random import rand, seed

    seed((C, dim))  # deterministic data for a given (C, dim)

    lab = sign(2 * rand(dim) - 1)
    data = rand(dim, dim)
    # symmetrize and add a diagonal ridge so the matrix is a valid kernel
    symdata = data * data.T + diag(ones(dim))

    kernel = CustomKernel()
    # BUG FIX: the original passed the raw, asymmetric ``data`` and left
    # ``symdata`` unused; a custom kernel matrix must be symmetric.
    kernel.set_full_kernel_matrix_from_full(symdata)
    labels = BinaryLabels(lab)
    svm = LibSVM(C, kernel, labels)
    svm.train()
    # apply once and reuse the result (the original called apply() twice)
    predictions = svm.apply()
    out = predictions.get_labels()
    return svm, out
# Example 28
def runShogunSVMDNASpectrumKernel(train_xt, train_lt, test_xt):
    """Run LibSVM with a (sorted) spectrum string kernel on DNA data.

    Relies on the module-level constants K (k-mer order), GAP and SVMC.
    Returns (train labels, test labels, train decision values,
    test decision values).

    BUG FIX: the original used Python-2 ``print`` statements and mixed
    tab/space indentation, both syntax errors under Python 3.
    """
    # set up training features: k-mer counts, sorted for fast dot products
    charfeat_train = StringCharFeatures(train_xt, DNA)
    feats_train = StringWordFeatures(DNA)
    feats_train.obtain_from_char(charfeat_train, K - 1, K, GAP, False)
    preproc = SortWordString()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()

    # the same preprocessor (fitted on the training set) is reused for test
    charfeat_test = StringCharFeatures(test_xt, DNA)
    feats_test = StringWordFeatures(DNA)
    feats_test.obtain_from_char(charfeat_test, K - 1, K, GAP, False)
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()

    kernel = CommWordStringKernel(feats_train, feats_train, False)
    kernel.io.set_loglevel(MSG_DEBUG)

    # init kernel
    labels = BinaryLabels(train_lt)

    # run svm model
    print("Ready to train!")
    svm = LibSVM(SVMC, kernel, labels)
    svm.io.set_loglevel(MSG_DEBUG)
    svm.train()

    # predictions
    print("Making predictions!")
    out1DecisionValues = svm.apply(feats_train)
    out1 = out1DecisionValues.get_labels()
    kernel.init(feats_train, feats_test)
    out2DecisionValues = svm.apply(feats_test)
    out2 = out2DecisionValues.get_labels()

    return out1, out2, out1DecisionValues, out2DecisionValues
def classifier_svmlight_batch_linadd(fm_train_dna, fm_test_dna,
                                     label_train_dna, degree, C, epsilon,
                                     num_threads):
    """Train SVMLight on DNA strings, then classify the test set twice:
    once with batch computation / linadd disabled and once enabled.

    Returns (labels from the batch-enabled run, trained svm).
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel, MSG_DEBUG
    try:
        from shogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    feats_train = StringCharFeatures(DNA)
    #feats_train.io.set_loglevel(MSG_DEBUG)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)
    # BUG FIX: the original unconditionally reassigned ``degree = 20`` here,
    # silently ignoring the caller-supplied ``degree`` argument.

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = BinaryLabels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)

    #print('SVMLight Objective: %f num_sv: %d' % \)
    #	(svm.get_objective(), svm.get_num_support_vectors())
    # slow path: per-example evaluation with batch/linadd optimisations off
    svm.set_batch_computation_enabled(False)
    svm.set_linadd_enabled(False)
    svm.apply().get_labels()

    # fast path: batch computation re-enabled
    svm.set_batch_computation_enabled(True)
    labels = svm.apply().get_labels()
    return labels, svm
# Example 30
def classifier_mpdsvm(train_fname=traindat,
                      test_fname=testdat,
                      label_fname=label_traindat,
                      C=1,
                      epsilon=1e-5):
    """Train an MPDSVM with a Gaussian kernel (width 2.1) on CSV data.

    Returns (prediction labels object, trained svm, raw label array).
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import GaussianKernel
    from shogun import MPDSVM, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    train_labels = BinaryLabels(CSVFile(label_fname))
    kernel_width = 2.1
    gauss = GaussianKernel(train_feats, train_feats, kernel_width)

    svm = MPDSVM(C, gauss, train_labels)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(test_feats)
    return predictions, svm, predictions.get_labels()
# Example 31
def classifier_perceptron(n=100,
                          dim=2,
                          distance=5,
                          learn_rate=1.,
                          max_iter=1000,
                          num_threads=1,
                          seed=1):
    """Train a Perceptron on two well-separated Gaussian blobs.

    Returns (trained perceptron, predicted labels on fresh test data).
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import Perceptron

    random.seed(seed)

    # two Gaussian clouds shifted by +/- distance -> (probably) linearly
    # separable training and test data
    pos_train = array(random.randn(dim, n)) + distance
    neg_train = array(random.randn(dim, n)) - distance
    pos_test = array(random.randn(dim, n)) + distance
    neg_test = array(random.randn(dim, n)) - distance
    label_train_twoclass = hstack((ones(n), -ones(n)))

    feats_train = RealFeatures(hstack((pos_train, neg_train)))
    feats_test = RealFeatures(hstack((pos_test, neg_test)))

    labels = BinaryLabels(label_train_twoclass)

    perceptron = Perceptron(feats_train, labels)
    perceptron.set_learn_rate(learn_rate)
    perceptron.set_max_iter(max_iter)
    # only guaranteed to converge for separable data
    perceptron.train()

    perceptron.set_features(feats_test)
    out_labels = perceptron.apply().get_labels()
    return perceptron, out_labels