def Do_alpha(self):
        """The observed disagreement for the alpha coefficient.

        The alpha coefficient, unlike the other metrics, uses this rather than
        observed agreement.
        """
        total = 0.0
        for i, itemdata in self._grouped_data('item'):
            label_freqs = FreqDist(x['labels'] for x in itemdata)

            # sum the pairwise distances between all label assignments for
            # this item, weighted by how often each label was assigned
            for j, nj in compat.iteritems(label_freqs):
                for l, nl in compat.iteritems(label_freqs):
                    total += float(nj * nl) * self.distance(l, j)
        ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
        log.debug("Observed disagreement: %f", ret)
        return ret
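For context, the alpha coefficient itself is then 1 - Do/De. A minimal usage sketch with nltk's AnnotationTask, assuming (coder, item, label) triples as input:

from nltk.metrics.agreement import AnnotationTask

# data is an iterable of (coder, item, label) triples
data = [('c1', '1', 'v1'), ('c2', '1', 'v1'),
        ('c1', '2', 'v1'), ('c2', '2', 'v2')]
task = AnnotationTask(data=data)
print(task.alpha())  # Krippendorff's alpha = 1 - Do/De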
Example No. 2
    def svm_label_name(self, label):
        """
        searches values of _labelmapping to resolve +1 or -1 to a string

        :param label: the string label to look up
        """
        labelname = [k for k, v in compat.iteritems(self._labelmapping) if v == label][0]
        return labelname
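A small worked example (the mapping below is hypothetical, for illustration only):

# suppose self._labelmapping = {'pos': 1, 'neg': -1}
# svm_label_name(1)  -> 'pos'
# svm_label_name(-1) -> 'neg'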
Example No. 3
    def _apply_filter(self, fn=lambda ngram, freq: False):
        """Generic filter: removes ngrams from the frequency distribution
        if the function returns True when passed an (ngram, freq) pair.
        """
        tmp_ngram = FreqDist()
        for ngram, freq in iteritems(self.ngram_fd):
            if not fn(ngram, freq):
                # keep only the ngrams the filter does not reject
                tmp_ngram[ngram] = freq
        self.ngram_fd = tmp_ngram
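In practice this is reached through the public filter helpers on the collocation finders; a minimal sketch:

from nltk.collocations import BigramCollocationFinder

words = ['a', 'rose', 'is', 'a', 'rose', 'is', 'a', 'rose']
finder = BigramCollocationFinder.from_words(words)
# apply_freq_filter(2) delegates to _apply_filter with
# fn = lambda ngram, freq: freq < 2
finder.apply_freq_filter(2)
print(finder.ngram_fd.most_common())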
Example No. 4
    def pi(self):
        """Scott 1955; here, multi-pi.
        Equivalent to K from Siegel and Castellan (1988).

        """
        total = 0.0
        label_freqs = FreqDist(x['labels'] for x in self.data)
        for k, f in compat.iteritems(label_freqs):
            total += f ** 2
        # expected agreement Ae from the squared marginal label frequencies
        Ae = total / float((len(self.I) * len(self.C)) ** 2)
        return (self.avg_Ao() - Ae) / (1 - Ae)
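For reference, this implements pi = (Ao - Ae) / (1 - Ae), where Ao is the average observed agreement returned by avg_Ao() and Ae = sum_k n_k^2 / (i * c)^2 is the expected agreement, with n_k the frequency of label k, i the number of items, and c the number of coders.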
Example No. 5
    def _featuresets_to_array(self, featuresets):
        """Convert featureset to NumPy array."""

        X = np.zeros((len(featuresets), len(self._feature_index)),
                     dtype=self._dtype)

        for i, fs in enumerate(featuresets):
            for f, v in compat.iteritems(fs):
                try:
                    X[i, self._feature_index[f]] = self._dtype(v)
                except KeyError:    # feature not seen in training
                    pass

        return X
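The mapping self._feature_index is assumed to have been built at training time; a hypothetical sketch of how such an index might be constructed (not the library's actual code):

# labeled_featuresets: hypothetical [(featureset, label), ...] training data
feature_index = {}
for fs, _label in labeled_featuresets:
    for f in fs:
        # assign each distinct feature name the next dense column index
        feature_index.setdefault(f, len(feature_index))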
Example No. 6
    def train(featuresets):
        """
        given a set of training instances in nltk format:
        [ ( {feature:value, ..}, str(label) ) ]
        train a support vector machine

        :param featuresets: training instances
        """

        _raise_if_svmlight_is_missing()

        # build a unique list of labels
        labels = set()
        for (features, label) in featuresets:
            labels.add(label)

        # this is a binary classifier only; != 2 also rejects the degenerate
        # one-label case, which would otherwise fail at labels[1] below
        if len(labels) != 2:
            raise ValueError('Can only do binary classification (labels: ' +
                             str(labels) + ')')

        # we need ordering, so a set's no good
        labels = list(labels)

        # next, assign -1 and 1
        labelmapping = {labels[0]: -1, labels[1]: 1}

        # now for feature conversion
        # iterate through instances, building a set of feature:type:str(value) triples
        svmfeatures = set()
        for (features, label) in featuresets:
            for k, v in compat.iteritems(features):
                svmfeatures.add(featurename(k, v))
        # svmfeatures is indexable by integer svm feature number
        # svmfeatureindex is the inverse (svm feature name -> number)
        svmfeatures = list(svmfeatures)
        svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures))))

        # build svm feature set case by case
        svmfeatureset = []
        for instance in featuresets:
            svmfeatureset.append(
                map_instance_to_svm(instance, labelmapping, svmfeatureindex))

        # train the svm
        # TODO: implement passing of SVMlight parameters from train() to learn()
        return SvmClassifier(
            labels, labelmapping, svmfeatures,
            svmlight.learn(svmfeatureset, type='classification'))
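A hedged usage sketch, assuming the standard nltk classifier interface on the surrounding SvmClassifier class:

# hypothetical training data with two labels
train_data = [({'outlook': 'sunny', 'windy': False}, 'no'),
              ({'outlook': 'rainy', 'windy': True}, 'no'),
              ({'outlook': 'sunny', 'windy': True}, 'yes')]
classifier = SvmClassifier.train(train_data)
# classify() is assumed from nltk's ClassifierI interface
print(classifier.classify({'outlook': 'sunny', 'windy': True}))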
Example No. 7
def map_features_to_svm(features, svmfeatureindex):
    """
    :param features: a dict of features in the format {'feature': value}
    :param svmfeatureindex: a mapping from feature:value pairs to integer
        SVMlight feature labels
    """
    instancefeatures = []
    # SVMlight supports sparse feature sets, so we simply omit features
    # that are not present
    for k, v in compat.iteritems(features):
        # each feature is an (int, float) tuple: the int is the SVMlight
        # feature label and the float is the value. Scalar features are not
        # supported; each value a feature may take on is treated as a
        # discrete, independent label, so presence is always marked as 1.0
        svmfeaturename = featurename(k, v)
        if svmfeaturename not in svmfeatureindex:
            # skip feature:value pairs that were not seen in the training
            # data and so have no mapping
            continue
        instancefeatures.append((svmfeatureindex[svmfeaturename], 1.0))
    return instancefeatures
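A small worked example (the index contents and featurename() output below are hypothetical):

# suppose featurename() yields 'feature:type:value' strings and the
# training index contains:
#   {'outlook:str:sunny': 3, 'windy:bool:True': 7}
# then map_features_to_svm({'outlook': 'sunny', 'windy': True}, index)
# returns [(3, 1.0), (7, 1.0)] (pair order follows dict iteration)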
Example No. 8
    def _featuresets_to_coo(self, featuresets):
        """Convert featuresets to sparse matrix (COO format)."""

        i_ind = []
        j_ind = []
        values = []

        for i, fs in enumerate(featuresets):
            for f, v in compat.iteritems(fs):
                try:
                    j = self._feature_index[f]
                    i_ind.append(i)
                    j_ind.append(j)
                    values.append(self._dtype(v))
                except KeyError:  # feature not seen in training
                    pass

        # use len(featuresets) rather than i + 1 so an empty input does not
        # hit an undefined loop variable
        shape = (len(featuresets), len(self._feature_index))
        return coo_matrix((values, (i_ind, j_ind)), shape=shape, dtype=self._dtype)
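To illustrate the COO layout itself, a self-contained sketch with scipy:

from scipy.sparse import coo_matrix

values = [1.0, 2.0, 3.0]
i_ind = [0, 0, 1]  # row indices (instances)
j_ind = [2, 5, 0]  # column indices (feature positions)
X = coo_matrix((values, (i_ind, j_ind)), shape=(2, 6))
print(X.toarray())
# [[0. 0. 1. 0. 0. 2.]
#  [3. 0. 0. 0. 0. 0.]]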