Example #1
def get_string_complex(ftype,
                       data,
                       alphabet=features.DNA,
                       order=WORDSTRING_ORDER,
                       gap=WORDSTRING_GAP,
                       reverse=WORDSTRING_REVERSE):
    """Return complex StringFeatures.

    @param ftype Feature type suffix, e.g. Word or Ulong
    @param data Train/test data for feature creation
    @param alphabet Alphabet for feature creation
    @param order Order of the feature
    @param gap Gap of the feature
    @param reverse Is feature reverse?
    @return Dict with complex StringFeatures train/test
    """

    feats = {}

    charfeat = features.StringCharFeatures(data['train'], alphabet)
    feat = getattr(features, 'String' + ftype + 'Features')(alphabet)
    feat.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    feats['train'] = feat

    charfeat = features.StringCharFeatures(data['test'], alphabet)
    feat = getattr(features, 'String' + ftype + 'Features')(alphabet)
    feat.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    feats['test'] = feat

    if ftype in ('Word', 'Ulong'):
        name = 'Sort' + ftype + 'String'
        return add_preproc(name, feats)
    else:
        return feats
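
A minimal usage sketch (hypothetical data; assumes `features` is shogun's modular features module and the `WORDSTRING_*` defaults from the surrounding script):

data = {
    'train': ['ACGTACGT', 'TTGCAACG'],
    'test': ['ACGGTTAC', 'CGTACGTA'],
}
# 'Word' selects features.StringWordFeatures; the result is routed through
# the SortWordString preprocessor branch above.
string_feats = get_string_complex('Word', data)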
Example #2
    def _stop_training(self):
        super(ShogunSVMClassifier, self)._stop_training()
        self.normalizer = _LabelNormalizer(self.labels)
        labels = self.normalizer.normalize(self.labels)
        # shogun expects float labels
        labels = sgFeatures.Labels(labels.astype(float))

        features = sgFeatures.RealFeatures(self.data.transpose())

        self.classifier.set_train_features(features, labels)
        self.classifier.train()
Example #3
def bench_shogun(X, y, T, valid):
    #
    #       .. Shogun ..
    #
    from shogun import Classifier, Features, Distance
    from datetime import datetime
    import numpy as np
    start = datetime.now()
    feat = Features.RealFeatures(X.T)
    # note: old-style shogun spells this class name "EuclidianDistance"
    distance = Distance.EuclidianDistance(feat, feat)
    labels = Features.Labels(y.astype(np.float64))
    test_feat = Features.RealFeatures(T.T)
    knn = Classifier.KNN(n_neighbors, distance, labels)  # n_neighbors: module-level constant
    knn.train()
    score = np.mean(knn.classify(test_feat).get_labels() == valid)
    return score, datetime.now() - start
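
A hedged usage sketch for the benchmark (shapes and the `n_neighbors` module-level constant are assumptions; old-style shogun expects samples as columns, hence the transposes inside):

import numpy as np
n_neighbors = 5  # assumed global read by bench_shogun
X = np.random.randn(100, 10)                                   # 100 train samples, 10 features
y = (np.random.randint(0, 2, 100) * 2 - 1).astype(np.float64)  # labels in {-1, +1}
T = np.random.randn(20, 10)                                    # 20 test samples
valid = (np.random.randint(0, 2, 20) * 2 - 1).astype(np.float64)
score, elapsed = bench_shogun(X, y, T, valid)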
Example #4
    def _label(self, x):
        """Classify the input data 'x'
        """
        test_features = sgFeatures.RealFeatures(x.transpose())

        labels = self.classifier.label(test_features)

        if self.normalizer:
            return self.normalizer.revert(labels)
        else:
            return labels
Example #5
    def _label(self, x):
        """Classify the input data 'x'
        
        :param x: The input data to classify.
        :return: The corresponding labels for the input.
        """
        test_features = sgFeatures.RealFeatures(x.transpose())

        labels = self.classifier.label(test_features)

        if self.normalizer:
            return self.normalizer.revert(labels)
        else:
            return labels
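
For orientation, a toy stand-in showing the normalize/revert contract that Examples #2, #4 and #5 assume of `_LabelNormalizer` (a hypothetical illustration, not the actual mdp implementation):

class ToyLabelNormalizer(object):
    """Map two arbitrary label values to the -1/+1 floats shogun expects, and back."""

    def __init__(self, labels):
        self.ordered = sorted(set(labels))  # e.g. ['cat', 'dog']

    def normalize(self, labels):
        return [(-1.0 if l == self.ordered[0] else 1.0) for l in labels]

    def revert(self, labels):
        return [self.ordered[0] if l < 0 else self.ordered[1] for l in labels]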
Example #6
def get_wd(data, order=WORDSTRING_ORDER):
    """Return WDFeatures.

    @param data Train/test data for feature creation
    @param order Order of the feature
    @return Dict with WDFeatures train/test
    """

    feats = {}

    charfeat = features.StringCharFeatures(data['train'], features.DNA)
    bytefeat = features.StringByteFeatures(features.RAWDNA)
    bytefeat.obtain_from_char(charfeat, 0, 1, 0, False)
    feats['train'] = features.WDFeatures(bytefeat, order, order)

    charfeat = features.StringCharFeatures(data['test'], features.DNA)
    bytefeat = features.StringByteFeatures(features.RAWDNA)
    bytefeat.obtain_from_char(charfeat, 0, 1, 0, False)
    feats['test'] = features.WDFeatures(bytefeat, order, order)

    return feats
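
A minimal usage sketch (hypothetical DNA strings; `WORDSTRING_ORDER` comes from the surrounding script):

data = {
    'train': ['ACGTACGTAC', 'GTCAGTCAGT'],
    'test': ['ACGGTTACGT'],
}
wd_feats = get_wd(data, order=3)  # {'train': WDFeatures, 'test': WDFeatures}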
Example #7
def train_attribute(attribute_id, C, split=0):
    # loadstr, create_data, attributenames, all_features, SigmoidTrain,
    # SigmoidPredict and the numpy names (mean, ones, sign, savetxt, ...)
    # are defined in the surrounding Animals-with-Attributes (DAP) script.
    import sys
    from shogun import Classifier, Features, Kernel, Distance
    attribute_id = int(attribute_id)
    print "# attribute ", attributenames[attribute_id]
    C = float(C)
    print "# C ", C

    if split == 0:
        train_classes = loadstr(
            '/nfs3group/chlgrp/datasets/Animals_with_Attributes/trainclasses.txt'
        )
        test_classes = loadstr(
            '/nfs3group/chlgrp/datasets/Animals_with_Attributes/testclasses.txt'
        )
    else:
        classnames = loadstr(
            '/nfs3group/chlgrp/datasets/Animals_with_Attributes/classnames.txt'
        )
        startid = (split - 1) * 10
        stopid = split * 10
        test_classes = classnames[startid:stopid]
        train_classes = classnames[0:startid] + classnames[stopid:]

    Xtrn, Ltrn = create_data(train_classes, attribute_id)
    Xtst, Ltst = create_data(test_classes, attribute_id)

    if min(Ltrn) == max(Ltrn):  # only 1 class
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(len(Ltst))  # fallback
        return prediction, probabilities, Ltst

    #sg('loglevel', 'WARN')
    widths = {}
    for feature in all_features:
        traindata = array(Xtrn[feature][:, ::50], float)  # used to be 5*offset
        trainfeat = Features.RealFeatures(traindata)
        DM = Distance.ChiSquareDistance(trainfeat,
                                        trainfeat).get_distance_matrix()
        widths[feature] = median(DM.flatten())
        del traindata, trainfeat, DM

    s = Classifier.LibSVM()  #sg('new_svm', 'LIBSVM')
    #sg('use_mkl', False)     # we use fixed weights here

    #sg('clean_features', 'TRAIN')
    #sg('clean_features', 'TEST')

    Lplatt_trn = concatenate([Ltrn[i::10]
                              for i in range(9)])  # 90% for training
    Lplatt_val = Ltrn[9::10]  # remaining 10% for Platt scaling

    feats_trn = Features.CombinedFeatures()
    feats_val = Features.CombinedFeatures()
    for feature in all_features:
        Xplatt_trn = concatenate([Xtrn[feature][:, i::10] for i in range(9)],
                                 axis=1)
        feats_trn.append_feature_obj(
            Features.RealFeatures(ascontiguousarray(Xplatt_trn)))
        #sg('add_features', 'TRAIN', Xplatt_trn)
        Xplatt_val = Xtrn[feature][:, 9::10]
        feats_val.append_feature_obj(
            Features.RealFeatures(ascontiguousarray(Xplatt_val)))
        #sg('add_features', 'TEST', Xplatt_val)
        del Xplatt_trn, Xplatt_val, Xtrn[feature]

    labels_trn = Features.Labels(Lplatt_trn)
    #sg('set_labels', 'TRAIN', Lplatt_trn)

    kernel = Kernel.CombinedKernel()
    #sg('set_kernel', 'COMBINED', 5000)
    for featureset in all_features:
        kernel.append_kernel(Kernel.Chi2Kernel(5000, widths[featureset] / 5.))
        #sg('add_kernel', 1., 'CHI2', 'REAL', 10, widths[featureset]/5. )

    kernel.init(feats_trn, feats_trn)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-trn.kernel' %
             (split, C, attribute_id))
    del K

    s.set_max_train_time(600 * 60.)
    #sg('svm_max_train_time', 600*60.) # 600*60 s = ten hours; should be plenty
    s.set_C(C, C)
    #sg('c', C)

    s.set_kernel(kernel)
    s.set_labels(labels_trn)
    #sg('init_kernel', 'TRAIN')
    try:
        s.train()
        #sg('train_classifier')
    except (RuntimeWarning, RuntimeError):
        # cannot train, e.g. when all samples have the same label
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(len(Ltst))
        savetxt('./DAP/cvfold%d_C%g_%02d.txt' % (split, C, attribute_id),
                prediction)
        savetxt('./DAP/cvfold%d_C%g_%02d.prob' % (split, C, attribute_id),
                probabilities)
        savetxt('./DAP/cvfold%d_C%g_%02d.labels' % (split, C, attribute_id),
                Ltst)
        return prediction, probabilities, Ltst

    bias = s.get_bias()
    alphas = s.get_alphas()
    #[bias, alphas]=sg('get_svm')
    #print bias,alphas

    kernel.init(feats_trn, feats_val)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-val.kernel' %
             (split, C, attribute_id))
    del K

    #sg('init_kernel', 'TEST')
    try:
        prediction = s.classify().get_labels()
        #prediction=sg('classify')
        platt_params = SigmoidTrain(prediction, Lplatt_val)
        probabilities = SigmoidPredict(prediction, platt_params)

        savetxt('./DAP/cvfold%d_C%g_%02d-val.txt' % (split, C, attribute_id),
                prediction)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.prob' % (split, C, attribute_id),
                probabilities)
        savetxt(
            './DAP/cvfold%d_C%g_%02d-val.labels' % (split, C, attribute_id),
            Lplatt_val)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.platt' % (split, C, attribute_id),
                platt_params)
        #print '#train-perf ',attribute_id,C,mean((prediction*Lplatt_val)>0),mean(Lplatt_val>0)
        #print '#platt-perf ',attribute_id,C,mean((sign(probabilities-0.5)*Lplatt_val)>0),mean(Lplatt_val>0)
    except RuntimeError:
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(len(Ltst))
        print >> sys.stderr, "#Error during testing. Using constant Platt scaling"
        platt_params = [1., 0.]

    # ----------------------------- now apply to test classes ------------------

    feats_tst = Features.CombinedFeatures()
    #sg('clean_features', 'TEST')
    for feature in all_features:
        feats_tst.append_feature_obj(
            Features.RealFeatures(ascontiguousarray(Xtst[feature])))
        del Xtst[feature]

    kernel.init(feats_trn, feats_tst)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-tst.kernel' %
             (split, C, attribute_id))
    del K

    #sg('init_kernel', 'TEST')
    prediction = s.classify().get_labels()
    #prediction=sg('classify')
    probabilities = SigmoidPredict(prediction, platt_params)

    savetxt('./DAP/cvfold%d_C%g_%02d.txt' % (split, C, attribute_id),
            prediction)
    savetxt('./DAP/cvfold%d_C%g_%02d.prob' % (split, C, attribute_id),
            probabilities)
    savetxt('./DAP/cvfold%d_C%g_%02d.labels' % (split, C, attribute_id), Ltst)

    #print '#test-perf ',attribute_id,C,mean((prediction*Ltst)>0),mean(Ltst>0)
    #print '#platt-perf ',attribute_id,C,mean((sign(probabilities-0.5)*Ltst)>0),mean(Ltst>0)
    return prediction, probabilities, Ltst
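
The SigmoidTrain/SigmoidPredict pair used above implements Platt scaling, i.e. fitting p(y=1|f) = 1/(1 + exp(A*f + B)) to the raw SVM decision values. A minimal sketch of the predict step, assuming platt_params = [A, B] as in the constant fallback [1., 0.] above (a hypothetical stand-in, not the project's helper):

from numpy import exp

def sigmoid_predict(decision_values, platt_params):
    # Map raw SVM decision values to probabilities with a fitted sigmoid.
    A, B = platt_params
    return 1.0 / (1.0 + exp(A * decision_values + B))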