Ejemplo n.º 1
0
def __assign_shogun_version():
    """Record the installed shogun version in the ``versions`` registry.

    Does nothing when ``versions`` already has a 'shogun' entry.  Otherwise
    three keys are stored:

    - 'shogun:rev'  : value of ``Version_get_version_revision()``
    - 'shogun:full' : release string with any leading 'v' characters removed
    - 'shogun'      : the release string cut at its first underscore
    """
    if 'shogun' not in versions:
        # Imported lazily so shogun is only required when first queried.
        import shogun.Classifier as __sc
        versions['shogun:rev'] = __sc.Version_get_version_revision()
        release = __sc.Version_get_version_release().lstrip('v')
        versions['shogun:full'] = release
        # Keep only the part in front of the first '_' (whole string if none).
        versions['shogun'] = release.split('_', 1)[0]
Ejemplo n.º 2
0
def _run_lda():
	"""Train an LDA classifier, classify the test features and store the result.

	Data comes from dataop.get_clouds(2); the parameters together with the
	predicted labels are written out through fileop under
	category.CLASSIFIER.
	"""

	params={
		'name': 'LDA',
		'type': 'lda',
		'gamma': 0.1,
		'num_threads': 1,
		'data': dataop.get_clouds(2),
		'feature_class': 'simple',
		'feature_type': 'Real',
		'label_type': 'twoclass',
		'accuracy': 1e-7
	}

	features=featop.get_features(
		params['feature_class'], params['feature_type'], params['data'])
	train_feats=features['train']

	# get_labels returns a pair: the value kept in params and the label
	# object handed to the classifier.
	label_values, label_obj=dataop.get_labels(
		train_feats.get_num_vectors(), params['label_type'])
	params['labels']=label_values

	model=classifier.LDA(params['gamma'], train_feats, label_obj)
	model.parallel.set_num_threads(params['num_threads'])
	model.train()

	# Switch to the test features before classifying.
	model.set_features(features['test'])
	predicted=model.classify()
	params['classified']=predicted.get_labels()

	record=fileop.get_output(category.CLASSIFIER, params)
	fileop.write(category.CLASSIFIER, record)
Ejemplo n.º 3
0
def _run_perceptron():
	"""Train a Perceptron, classify the test features and store the result.

	Data comes from dataop.get_clouds(2); the parameters, the learned bias
	and the predicted labels are written out through fileop under
	category.CLASSIFIER.
	"""

	params={
		'name': 'Perceptron',
		'type': 'perceptron',
		'num_threads': 1,
		'learn_rate': .1,
		'max_iter': 1000,
		'data': dataop.get_clouds(2),
		'feature_class': 'simple',
		'feature_type': 'Real',
		'label_type': 'twoclass',
		'accuracy': 1e-7
	}

	features=featop.get_features(
		params['feature_class'], params['feature_type'], params['data'])
	train_feats=features['train']

	# get_labels returns a pair: the value kept in params and the label
	# object handed to the classifier.
	label_values, label_obj=dataop.get_labels(
		train_feats.get_num_vectors(), params['label_type'])
	params['labels']=label_values

	model=classifier.Perceptron(train_feats, label_obj)
	model.parallel.set_num_threads(params['num_threads'])
	model.set_learn_rate(params['learn_rate'])
	model.set_max_iter(params['max_iter'])
	model.train()

	# The bias is read right after training, before the features are swapped.
	params['bias']=model.get_bias()
	model.set_features(features['test'])
	predicted=model.classify()
	params['classified']=predicted.get_labels()

	record=fileop.get_output(category.CLASSIFIER, params)
	fileop.write(category.CLASSIFIER, record)
Ejemplo n.º 4
0
def bench_shogun(X, y, T, valid):
    """Benchmark shogun's KNN classifier.

    Trains on ``X`` / ``y`` (both arrays are passed to shogun transposed),
    classifies ``T`` and compares the predicted labels with ``valid``.
    The number of neighbours is read from the module-level ``n_neighbors``.

    Returns a ``(score, elapsed)`` tuple: the mean of the element-wise
    equality between predictions and ``valid``, and the elapsed time as the
    difference of two ``datetime.now()`` readings (the shogun import itself
    is not timed).
    """
    from shogun import Classifier, Features, Distance

    t0 = datetime.now()
    train_feat = Features.RealFeatures(X.T)
    metric = Distance.EuclidianDistance(train_feat, train_feat)
    train_labels = Features.Labels(y.astype(np.float64))
    query_feat = Features.RealFeatures(T.T)

    model = Classifier.KNN(n_neighbors, metric, train_labels)
    model.train()

    predicted = model.classify(query_feat).get_labels()
    accuracy = np.mean(predicted == valid)
    return accuracy, datetime.now() - t0
Ejemplo n.º 5
0
def __check_shogun(bottom_version, custom_versions=()):
    """Check if version of shogun is high enough (or custom known) to
    be enabled in the testsuite

    Parameters
    ----------
    bottom_version : int
      Bottom version which must be satisfied
    custom_versions : sequence of int
      Arbitrary collection of versions which could have been patched for
      a specific issue

    Returns
    -------
    bool
      True when the installed revision is listed in `custom_versions` or
      is at least `bottom_version`.

    Raises
    ------
    ImportError
      If the installed revision satisfies neither condition.
    """
    import shogun.Classifier as __sc
    ver = __sc.Version_get_version_revision()
    # Record the version details as a side effect, before any failure.
    __assign_shogun_version()
    if ver in custom_versions or ver >= bottom_version:
        return True
    # Call-style raise is valid on both Python 2 and Python 3.
    raise ImportError('Version %s is smaller than needed %s'
                      % (ver, bottom_version))
Ejemplo n.º 6
0
def _run_knn():
	"""Train a K-Nearest-Neighbour classifier and store its predictions.

	A distance object is built on the training features first; its output
	record is then extended with the classifier record and written through
	fileop under category.CLASSIFIER.
	"""

	dist_params={
		'name': 'EuclidianDistance',
		'data': dataop.get_clouds(2),
		'feature_class': 'simple',
		'feature_type': 'Real'
	}
	features=featop.get_features(
		dist_params['feature_class'], dist_params['feature_type'],
		dist_params['data'])
	# The distance class is resolved from the (hard-coded) name above.
	distance_cls=eval(dist_params['name'])
	distance=distance_cls(features['train'], features['train'])
	record=fileop.get_output(category.DISTANCE, dist_params)

	knn_params={
		'name': 'KNN',
		'type': 'knn',
		'num_threads': 1,
		'k': 3,
		'label_type': 'twoclass',
		'accuracy': 1e-8
	}
	label_values, label_obj=dataop.get_labels(
		features['train'].get_num_vectors(), knn_params['label_type'])
	knn_params['labels']=label_values

	model=classifier.KNN(knn_params['k'], distance, label_obj)
	model.parallel.set_num_threads(knn_params['num_threads'])
	model.train()

	# Re-initialise the distance between train and test before classifying.
	distance.init(features['train'], features['test'])
	predicted=model.classify()
	knn_params['classified']=predicted.get_labels()

	record.update(fileop.get_output(category.CLASSIFIER, knn_params))
	fileop.write(category.CLASSIFIER, record)
def _add_feature_paths(path, suffix, target):
    """Append the feature file(s) designated by *path* to the list *target*.

    A directory contributes every entry whose name ends with *suffix*
    (paths already present in *target* are skipped); any other path is
    appended unconditionally.
    """
    if os.path.isdir(path):
        sep = '' if path.endswith('/') else '/'
        for fn in os.listdir(path):
            if not fn.endswith(suffix):
                continue
            full_fn = path + sep + fn
            if full_fn not in target:
                target.append(full_fn)
    else:
        target.append(path)


def parse_config_file(config_files, feature_file_suffix='.bed'):
    """Parse classifier configuration files and return the classifiers.

    Each file is read line by line; blank lines and lines starting with
    '#' are ignored.  The first whitespace-separated token of a line is a
    keyword:

    - ``CLASSIFIER <name...>``: name of the classifier (tokens joined by '_').
    - ``KERNEL <spec>``: starts a new kernel; ``Linear``,
      ``CommWordString_<k>`` / ``Spectrum_<k>`` or ``Gaussian_<sigma>``.
    - ``KERNEL_NORM``, ``KERNEL_NAME``, ``REV_COMP``, ``SEQ``,
      ``BIN_FEATURE`` (alias ``FEATURE``), ``CON_FEATURE``, ``PP_FEATURE``:
      settings of the most recently started kernel.  A feature path that
      is a directory is expanded to its files ending with
      `feature_file_suffix`.
    - ``C <values...>``, ``MKL_NORM <int>``: classifier-wide settings.
    - ``END``: builds a ``Classifier`` from the collected state and resets
      the state for the next definition in the same file.

    A definition that is not terminated by ``END`` is discarded.

    Parameters
    ----------
    config_files : iterable of str
      Paths of the configuration files to read.
    feature_file_suffix : str
      Suffix a file must have to be picked up from a feature directory.

    Returns
    -------
    list
      The ``Classifier`` objects in the order their ``END`` lines appear.

    Exits through ``sys.exit`` with an error message on an unknown keyword,
    kernel or normalizer, on a keyword missing its argument, and on a
    per-kernel keyword that appears before any ``KERNEL`` line.
    """
    valid_words = ('CLASSIFIER', 'KERNEL', 'KERNEL_NAME', 'KERNEL_NORM',
                   'FEATURE', 'BIN_FEATURE', 'CON_FEATURE', 'PP_FEATURE',
                   'SEQ', 'END', 'C', 'MKL_NORM', 'REV_COMP')
    # Keywords that configure the kernel opened by the latest KERNEL line.
    per_kernel_words = ('KERNEL_NAME', 'KERNEL_NORM', 'FEATURE',
                        'BIN_FEATURE', 'CON_FEATURE', 'PP_FEATURE', 'SEQ',
                        'REV_COMP')
    # Keywords whose handler reads t[1].
    needs_arg_words = per_kernel_words + ('KERNEL', 'MKL_NORM')

    classifiers = []

    for config_file in config_files:
        name = None
        kernels = []
        c = None
        bin_features = []
        con_features = []
        pp_features = []
        seqs = []
        ks = []
        kern_names = []
        kern_norms = []
        rev_comps = []
        mkl_norm = 2

        # 'with' guarantees the handle is closed, also on sys.exit().
        with open(config_file) as config:
            for line_num, line in enumerate(config, 1):
                line = line.strip()
                if line.startswith('#') or not line:
                    continue

                t = line.split()
                keyword = t[0]
                if keyword not in valid_words:
                    sys.exit('ERROR! %s not recognized in line %d: %s.'
                             % (keyword, line_num, line))
                if keyword in needs_arg_words and len(t) < 2:
                    sys.exit('ERROR! %s requires an argument in line %d: %s.'
                             % (keyword, line_num, line))
                if keyword in per_kernel_words and not kernels:
                    sys.exit('ERROR! %s appears before any KERNEL in line %d: %s.'
                             % (keyword, line_num, line))

                if keyword == 'CLASSIFIER':
                    name = '_'.join(t[1:])

                elif keyword == 'KERNEL':
                    # Open one slot per kernel in every per-kernel list.
                    bin_features.append([])
                    con_features.append([])
                    pp_features.append([])
                    seqs.append([])
                    kern_norms.append(None)
                    kern_names.append(None)
                    rev_comps.append(False)
                    ks.append(None)
                    if t[1] == 'Linear':
                        kernels.append(LinearKernel())
                    elif t[1].startswith(('CommWordString_', 'Spectrum_')):
                        ks[-1] = int(t[1].split('_')[-1])
                        kernels.append(CommWordStringKernel(10, False))
                    elif t[1].startswith('Gaussian_'):
                        sigma = float(t[1].split('_')[-1])
                        kernels.append(GaussianKernel(10, sigma))
                    else:
                        sys.exit('ERROR! %s is not a valid kernel.' % t[1])

                elif keyword == 'KERNEL_NORM':
                    # Separate instances are stored and attached to the kernel.
                    if t[1] == 'VarianceKernelNormalizer':
                        kern_norms[-1] = VarianceKernelNormalizer()
                        kernels[-1].set_normalizer(VarianceKernelNormalizer())
                    elif t[1] == 'SqrtDiagKernelNormalizer':
                        kern_norms[-1] = SqrtDiagKernelNormalizer()
                        kernels[-1].set_normalizer(SqrtDiagKernelNormalizer())
                    elif t[1] == 'AvgDiagKernelNormalizer':
                        kern_norms[-1] = AvgDiagKernelNormalizer()
                        kernels[-1].set_normalizer(AvgDiagKernelNormalizer())
                    else:
                        sys.exit('ERROR! %s is not a recognized kernel normalizer.' % t[1])

                elif keyword == 'KERNEL_NAME':
                    kern_names[-1] = t[1]

                elif keyword == 'REV_COMP':
                    rev_comps[-1] = bool(int(t[1]))

                elif keyword in ('BIN_FEATURE', 'FEATURE'):  # FEATURE: BW compatibility
                    _add_feature_paths(t[1], feature_file_suffix,
                                       bin_features[-1])

                elif keyword == 'CON_FEATURE':
                    _add_feature_paths(t[1], feature_file_suffix,
                                       con_features[-1])

                elif keyword == 'PP_FEATURE':
                    _add_feature_paths(t[1], feature_file_suffix,
                                       pp_features[-1])

                elif keyword == 'SEQ':
                    seqs[-1].append(t[1])

                elif keyword == 'C':
                    c = [float(x) for x in t[1:]]

                elif keyword == 'MKL_NORM':
                    mkl_norm = int(t[1])

                elif keyword == 'END':
                    # Single-argument form prints the same on Python 2 and 3.
                    print('\t%s' % name)
                    classifiers.append(Classifier(name, kernels, bin_features, con_features,
                                                  pp_features=pp_features, seqs=seqs,
                                                  kern_norms=kern_norms, ks=ks, c=c,
                                                  kern_names=kern_names,
                                                  mkl_norm=mkl_norm, rev_comps=rev_comps))

                    # Start from fresh containers so the classifier just
                    # built is never mutated by the next definition.
                    name = None
                    kernels = []
                    c = None
                    bin_features = []
                    con_features = []
                    pp_features = []
                    seqs = []
                    ks = []
                    kern_norms = []
                    kern_names = []
                    rev_comps = []
                    mkl_norm = 2

    return classifiers
def train_attribute(attribute_id, C, split=0):
    """Train an SVM for one attribute and predict it for the test classes.

    Pipeline, as implemented below:

    1. Select train/test classes (predefined lists for ``split == 0``,
       otherwise a block of ten entries of the class-name list is held out).
    2. Estimate one kernel width per feature type from the median
       chi-square distance on a subsample of the training data.
    3. Train a LibSVM with a combined Chi2 kernel on 9/10 of the training
       samples; the remaining 1/10 is used to fit the sigmoid
       (``SigmoidTrain``) that turns SVM outputs into probabilities.
    4. Classify the test classes and write predictions, probabilities and
       labels to ``./DAP/``; kernel matrices are dumped to ``/scratch/chl/``.

    Parameters
    ----------
    attribute_id : int or str
      Index of the attribute; converted with ``int()``.
    C : float or str
      SVM regularisation constant; converted with ``float()``.
    split : int
      0 selects the predefined train/test class files; a value ``k > 0``
      holds out ``classnames[(k-1)*10 : k*10]`` as test classes.

    Returns
    -------
    tuple
      ``(prediction, probabilities, Ltst)`` for the test samples.

    Notes
    -----
    The ``Xtrn`` / ``Xtst`` containers returned by ``create_data`` are
    emptied entry by entry while the shogun feature objects are built.
    The ``#sg(...)`` comments appear to record the equivalent calls of an
    older interface and are kept for reference.
    NOTE(review): the label arithmetic (``sign``, ``(Lprior + 1.) * 0.5``)
    presumably expects labels in {-1, +1} -- confirm against create_data.
    """
    from shogun import Classifier, Features, Kernel, Distance
    attribute_id = int(attribute_id)
    print "# attribute ", attributenames[attribute_id]
    C = float(C)
    print "# C ", C

    if split == 0:
        # Default split: class lists are read from fixed files.
        train_classes = loadstr(
            '/nfs3group/chlgrp/datasets/Animals_with_Attributes/trainclasses.txt'
        )
        test_classes = loadstr(
            '/nfs3group/chlgrp/datasets/Animals_with_Attributes/testclasses.txt'
        )
    else:
        # Cross-validation fold: hold out entries [startid, stopid) of the
        # class-name list and train on everything else.
        classnames = loadstr(
            '/nfs3group/chlgrp/datasets/Animals_with_Attributes/classnames.txt'
        )
        startid = (split - 1) * 10
        stopid = split * 10
        test_classes = classnames[startid:stopid]
        train_classes = classnames[0:startid] + classnames[stopid:]

    Xtrn, Ltrn = create_data(train_classes, attribute_id)
    Xtst, Ltst = create_data(test_classes, attribute_id)

    if min(Ltrn) == max(Ltrn):  # only 1 class
        # Nothing to learn: predict the single training label everywhere.
        # This early exit returns without writing any output file.
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(
            len(Ltst))  # fallback
        return prediction, probabilities, Ltst

    #sg('loglevel', 'WARN')
    # Kernel width per feature type: median of the pairwise chi-square
    # distances over every 50th column of the training matrix.
    widths = {}
    for feature in all_features:
        traindata = array(Xtrn[feature][:, ::50], float)  # used to be 5*offset
        trainfeat = Features.RealFeatures(traindata)
        DM = Distance.ChiSquareDistance(trainfeat,
                                        trainfeat).get_distance_matrix()
        widths[feature] = median(DM.flatten())
        del traindata, trainfeat, DM

    s = Classifier.LibSVM()  #sg('new_svm', 'LIBSVM')
    #sg('use_mkl', False)     # we use fixed weights here

    #sg('clean_features', 'TRAIN')
    #sg('clean_features', 'TEST')

    # Strided 90/10 split of the training labels; the feature columns are
    # split with the same strides below so that both stay aligned.
    Lplatt_trn = concatenate([Ltrn[i::10]
                              for i in range(9)])  # 90% for training
    Lplatt_val = Ltrn[9::10]  # remaining 10% for platt scaling

    feats_trn = Features.CombinedFeatures()
    feats_val = Features.CombinedFeatures()
    for feature in all_features:
        Xplatt_trn = concatenate([Xtrn[feature][:, i::10] for i in range(9)],
                                 axis=1)
        feats_trn.append_feature_obj(
            Features.RealFeatures(ascontiguousarray(Xplatt_trn)))
        #sg('add_features', 'TRAIN', Xplatt_trn)
        Xplatt_val = Xtrn[feature][:, 9::10]
        feats_val.append_feature_obj(
            Features.RealFeatures(ascontiguousarray(Xplatt_val)))
        #sg('add_features', 'TEST', Xplatt_val)
        # Drop the raw arrays (including the entry in Xtrn) once wrapped.
        del Xplatt_trn, Xplatt_val, Xtrn[feature]

    labels_trn = Features.Labels(Lplatt_trn)
    #sg('set_labels', 'TRAIN', Lplatt_trn)

    # One Chi2 sub-kernel per feature type, appended in the same order as
    # the feature objects above.
    kernel = Kernel.CombinedKernel()
    #sg('set_kernel', 'COMBINED', 5000)
    for featureset in all_features:
        kernel.append_kernel(Kernel.Chi2Kernel(5000, widths[featureset] / 5.))
        #sg('add_kernel', 1., 'CHI2', 'REAL', 10, widths[featureset]/5. )

    # Train-vs-train kernel; the matrix is also dumped to scratch storage.
    kernel.init(feats_trn, feats_trn)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-trn.kernel' %
             (split, C, attribute_id))
    del K

    # NOTE(review): 600 * 60 = 36000; if the unit is seconds this is ten
    # hours, not the one hour mentioned in the legacy comment -- confirm.
    s.set_max_train_time(600 * 60.)
    #sg('svm_max_train_time', 600*60.) # one hour should be plenty
    s.set_C(C, C)
    #sg('c', C)

    s.set_kernel(kernel)
    s.set_labels(labels_trn)
    #sg('init_kernel', 'TRAIN')
    try:
        s.train()
        #sg('train_classifier')
    except (RuntimeWarning, RuntimeError
            ):  # can't train, e.g. all samples have the same labels
        # Training failed: fall back to the label prior, store the
        # constant predictions and return early.
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(len(Ltst))
        savetxt('./DAP/cvfold%d_C%g_%02d.txt' % (split, C, attribute_id),
                prediction)
        savetxt('./DAP/cvfold%d_C%g_%02d.prob' % (split, C, attribute_id),
                probabilities)
        savetxt('./DAP/cvfold%d_C%g_%02d.labels' % (split, C, attribute_id),
                Ltst)
        return prediction, probabilities, Ltst

    # bias and alphas are fetched but not used further in this function.
    bias = s.get_bias()
    alphas = s.get_alphas()
    #[bias, alphas]=sg('get_svm')
    #print bias,alphas

    # Train-vs-validation kernel for fitting the sigmoid parameters.
    kernel.init(feats_trn, feats_val)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-val.kernel' %
             (split, C, attribute_id))
    del K

    #sg('init_kernel', 'TEST')
    try:
        prediction = s.classify().get_labels()
        #prediction=sg('classify')
        platt_params = SigmoidTrain(prediction, Lplatt_val)
        probabilities = SigmoidPredict(prediction, platt_params)

        savetxt('./DAP/cvfold%d_C%g_%02d-val.txt' % (split, C, attribute_id),
                prediction)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.prob' % (split, C, attribute_id),
                probabilities)
        savetxt(
            './DAP/cvfold%d_C%g_%02d-val.labels' % (split, C, attribute_id),
            Lplatt_val)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.platt' % (split, C, attribute_id),
                platt_params)
        #print '#train-perf ',attribute_id,C,mean((prediction*Lplatt_val)>0),mean(Lplatt_val>0)
        #print '#platt-perf ',attribute_id,C,mean((sign(probabilities-0.5)*Lplatt_val)>0),mean(Lplatt_val>0)
    except RuntimeError:
        # Validation failed: only platt_params matters from here on, since
        # prediction and probabilities are recomputed for the test set below.
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(len(Ltst))
        print >> sys.stderr, "#Error during testing. Using constant platt scaling"
        platt_params = [1., 0.]

    # ----------------------------- now apply to test classes ------------------

    feats_tst = Features.CombinedFeatures()
    #sg('clean_features', 'TEST')
    for feature in all_features:
        feats_tst.append_feature_obj(
            Features.RealFeatures(ascontiguousarray(Xtst[feature])))
        # Drop the raw test array once it has been wrapped.
        del Xtst[feature]

    # Train-vs-test kernel; dumped to scratch storage like the others.
    kernel.init(feats_trn, feats_tst)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-tst.kernel' %
             (split, C, attribute_id))
    del K

    #sg('init_kernel', 'TEST')
    prediction = s.classify().get_labels()
    #prediction=sg('classify')
    # Map the SVM outputs to probabilities with the fitted (or fallback)
    # sigmoid parameters.
    probabilities = SigmoidPredict(prediction, platt_params)

    savetxt('./DAP/cvfold%d_C%g_%02d.txt' % (split, C, attribute_id),
            prediction)
    savetxt('./DAP/cvfold%d_C%g_%02d.prob' % (split, C, attribute_id),
            probabilities)
    savetxt('./DAP/cvfold%d_C%g_%02d.labels' % (split, C, attribute_id), Ltst)

    #print '#test-perf ',attribute_id,C,mean((prediction*Ltst)>0),mean(Ltst>0)
    #print '#platt-perf ',attribute_id,C,mean((sign(probabilities-0.5)*Ltst)>0),mean(Ltst>0)
    return prediction, probabilities, Ltst