Example 1
 def setdefault(self, property, default=None):
     assert chktype(1, property, str)
     assert chktype(2, default, self._checkval)
     if ((property == 'LOC') and not isinstance(default, LocationI)
         and default is not None):
         raise TypeError("The 'LOC' property must contain a Location")
     return super(SafeToken, self).setdefault(property, default)
def tagger_accuracy(tagger, gold_standard):
    """
    Score the accuracy of the tagger against the gold standard.
    Strip the tags from the gold standard text, retag it using
    the tagger, then compute the accuracy score.

    @type tagger: C{TaggerI}
    @param tagger: The tagger being evaluated.
    @type gold_standard: C{list} of C{Token}
    @param gold_standard: The list of tagged tokens to score
      the tagger on; each must have the 'SUBTOKENS' attribute.
    @rtype: C{float}
    """

    # NB: replace tagger._property_names with tagger.property_names()?

    assert chktype(1, tagger, TaggerI)
    assert chktype(2, gold_standard, (Token,), [Token])
    TAG = tagger.property('TAG')
    SUBTOKENS = tagger.property('SUBTOKENS')

    gold_toks = []
    test_toks = []
    for gold_doc in gold_standard:
        test_doc = gold_doc.exclude(TAG)
        tagger.tag(test_doc)
        gold_toks += gold_doc[SUBTOKENS]
        test_toks += test_doc[SUBTOKENS]
    return accuracy(gold_toks, test_toks)
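The scoring above works by deleting the tag property from a copy of the gold corpus, re-tagging that copy, and comparing the two token lists element by element. A minimal standalone sketch of the same pattern, using plain (word, tag) pairs and a hypothetical tag_fn instead of the NLTK Token and Tagger classes:

def sketch_tagger_accuracy(tag_fn, gold):
    """gold is a list of (word, gold_tag) pairs; tag_fn maps a word to a tag."""
    predicted = [tag_fn(word) for word, _ in gold]
    correct = sum(1 for (_, g), p in zip(gold, predicted) if g == p)
    return correct / float(len(gold))

# A stand-in tagger that tags every word as 'NN'.
print(sketch_tagger_accuracy(lambda w: 'NN',
                             [('the', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')]))
# 0.3333...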
Example 3
 def __setitem__(self, property, value):
     assert chktype(1, property, str)
     assert chktype(2, value, self._checkval)
     if ((property == 'LOC') and not isinstance(value, LocationI)
         and value is not None):
         raise TypeError("The 'LOC' property must contain a Location")
     return super(SafeToken, self).__setitem__(property, value)
    def __init__(self, states=None, symbols=None, **properties):
        """
        Creates an HMM trainer to induce an HMM with the given states and
        output symbol alphabet. Both supervised and unsupervised training
        methods may be used. If either the states or the symbols are not
        given, they may be derived from supervised training.

        @param states:  the set of state labels
        @type states:   sequence of any
        @param symbols: the set of observation symbols
        @type symbols:  sequence of any
        @param properties: alternative names to be used for the TEXT,
                        SUBTOKENS and TAG properties
        """
        assert chktype(1,symbols,types.TupleType,types.ListType,types.NoneType)
        assert chktype(2,states,types.TupleType,types.ListType,types.NoneType)
        if states:
            self._states = states
        else:
            self._states = []
        if symbols:
            self._symbols = symbols
        else:
            self._symbols = []
        self._properties = properties
Example 5
    def pp(self, margin=70, indent=0, nodesep=':', parens='()'):
        """
        @return: A pretty-printed string representation of this tree.
        @rtype: C{string}
        @param margin: The right margin at which to do line-wrapping.
        @type margin: C{int}
        @param indent: The indentation level at which printing
            begins.  This number is used to decide how far to indent
            subsequent lines.
        @type indent: C{int}
        @param nodesep: A string that is used to separate the node
            from the children.  E.g., the default value C{':'} gives
            trees like C{(S: (NP: I) (VP: (V: saw) (NP: it)))}.
        """
        assert chktype(1, margin, types.IntType)
        assert chktype(2, indent, types.IntType)

        # Try writing it on one line.
        s = self._ppflat(nodesep, parens)
        if len(s)+indent < margin:
            return s

        # If it doesn't fit on one line, then write it on multi-lines.
        s = '%s%s%s' % (parens[0], self.node, nodesep)
        for child in self:
            if isinstance(child, Tree):
                s += '\n'+' '*(indent+2)+child.pp(margin, indent+2,
                                                  nodesep, parens)
            else:
                s += '\n'+' '*(indent+2)+repr(child)
        return s+parens[1]
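The method first tries a flat one-line rendering and only falls back to the indented multi-line form when that rendering would cross the margin. A small standalone sketch of the same strategy, using nested (node, child, ...) tuples instead of the Tree class (all names here are hypothetical):

def _flat(tree, nodesep=':', parens='()'):
    if not isinstance(tree, tuple):
        return repr(tree)
    children = ' '.join(_flat(c, nodesep, parens) for c in tree[1:])
    return '%s%s%s %s%s' % (parens[0], tree[0], nodesep, children, parens[1])

def pp_sketch(tree, margin=70, indent=0, nodesep=':', parens='()'):
    # Try writing it on one line.
    s = _flat(tree, nodesep, parens)
    if len(s) + indent < margin:
        return s
    # Otherwise, one child per line, indented two extra spaces.
    s = '%s%s%s' % (parens[0], tree[0], nodesep)
    for child in tree[1:]:
        if isinstance(child, tuple):
            s += '\n' + ' ' * (indent + 2) + pp_sketch(child, margin, indent + 2,
                                                       nodesep, parens)
        else:
            s += '\n' + ' ' * (indent + 2) + repr(child)
    return s + parens[1]

print(pp_sketch(('S', ('NP', 'I'), ('VP', ('V', 'saw'), ('NP', 'it'))), margin=25))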
    def cluster(self, tokens, assign_clusters=False, trace=False):
        assert chktype(1, tokens, [Token])
        assert chktype(2, assign_clusters, bool)
        assert chktype(3, trace, bool)
        assert len(tokens) > 0
        vectors = map(lambda tk: tk['FEATURES'], tokens)

        # normalise the vectors
        if self._should_normalise:
            vectors = map(self._normalise, vectors)

        # use SVD to reduce the dimensionality
        if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
            [u, d, vt] = numarray.linear_algebra.singular_value_decomposition(
                            numarray.transpose(numarray.array(vectors)))
            S = d[:self._svd_dimensions] * \
                numarray.identity(self._svd_dimensions, numarray.Float64)
            T = u[:,:self._svd_dimensions]
            Dt = vt[:self._svd_dimensions,:]
            vectors = numarray.transpose(numarray.matrixmultiply(S, Dt))
            self._Tt = numarray.transpose(T)
            
        # call abstract method to cluster the vectors
        self.cluster_vectorspace(vectors, trace)

        # assign the tokens to clusters
        if assign_clusters:
            for token in tokens:
                self.classify(token)
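numarray has long been unmaintained; the SVD step translates directly to numpy. A hedged sketch of just that reduction: decompose the transposed data matrix, keep the top k singular values, and keep the projection matrix (the role played by self._Tt above) so later vectors can be mapped into the reduced space:

import numpy as np

def svd_reduce(vectors, k):
    """Reduce equal-length vectors to k dimensions; return (reduced, Tt)."""
    X = np.transpose(np.array(vectors, dtype=float))    # features x samples
    u, d, vt = np.linalg.svd(X, full_matrices=False)
    S = np.diag(d[:k])                                   # top-k singular values
    Dt = vt[:k, :]
    reduced = np.transpose(S @ Dt)                       # samples x k
    Tt = np.transpose(u[:, :k])                          # projects new vectors
    return reduced, Tt

vecs = [[1.0, 0.0, 0.0], [0.9, 0.1, 0.0], [0.0, 0.0, 1.0]]
reduced, Tt = svd_reduce(vecs, 2)
print(reduced.shape, (Tt @ np.array(vecs[0])).shape)     # (3, 2) (2,)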
 def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
     assert chktype(1, num_clusters, int)
     assert chktype(2, normalise, bool)
     assert chktype(3, svd_dimensions, int, types.NoneType)
     VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
     self._num_clusters = num_clusters
     self._dendogram = None
     self._groups_values = None
 def __init__(self, normalise=False, svd_dimensions=None):
     """
     @param normalise:       should vectors be normalised to length 1
     @type normalise:        boolean
     @param svd_dimensions:  number of dimensions to use in reducing vector
                              dimensionality with SVD
     @type svd_dimensions:   int 
     """
     assert chktype(1, normalise, bool)
     assert chktype(2, svd_dimensions, int, types.NoneType)
     self._Tt = None
     self._should_normalise = normalise
     self._svd_dimensions = svd_dimensions
 def _centroid(self, cluster):
     assert chktype(1, cluster, [])
     assert len(cluster) > 0
     centroid = copy.copy(cluster[0])
     for vector in cluster[1:]:
         centroid += vector
     return centroid / float(len(cluster))
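The centroid is simply the element-wise mean of the cluster's vectors; with numpy arrays the whole loop collapses to one call (a hedged equivalence, assuming the vectors support element-wise addition):

import numpy as np

cluster = [np.array([1.0, 2.0]), np.array([3.0, 4.0]), np.array([5.0, 0.0])]
print(np.mean(cluster, axis=0))    # [3. 2.], same as summing and dividing above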
Example 10
    def __init__(self, n, reverse=False, cutoff=0, **property_names):
        """
        Construct a new I{n}-th order stochastic tagger.  The new
        tagger should be trained, using the L{train()} method, before
        it is used to tag data.
        
        @param n: The order of the new C{NthOrderTagger}.
        @type n: int
        @param reverse: If true, then assign tags to subtokens in
            reverse sequential order (i.e., from last to first).
        @type reverse: C{boolean}
        @type cutoff: C{int}
        @param cutoff: A count-cutoff for the tagger's frequency
            distribution.  If the tagger saw fewer than
            C{cutoff} examples of a given context in training,
            then it will return a tag of C{None} for that context.
        @type property_names: C{dict}
        @param property_names: A dictionary that can be used to override
            the default property names.  Each entry maps from a
            default property name to a new property name.
        """
        assert chktype(1, n, types.IntType)
        if n < 0: raise ValueError('n must be non-negative')
        SequentialTagger.__init__(self, reverse, **property_names)
        self._freqdist = ConditionalFreqDist()
        self._n = n
        self._cutoff = cutoff

        # Record the start & end indices of the context window for
        # tags.
        if self._reverse:
            self._left = 1
            self._right = 1+n
        else:
            self._left = -n
            self._right = 0
Example 11
    def read_token(self, s, add_contexts=False, add_locs=False, 
                   source=None):
        assert chktype(1, s, str)

        TEXT = self.property('TEXT')
        LOC = self.property('LOC')
        CONTEXT = self.property('CONTEXT')
        SENTS = self.property('SENTS')
        TREE = self.property('TREE')

        sentences = re.findall(r'(?s)\S.*?/\.', s)
        sent_toks = []
        for sent_num, sentence in enumerate(sentences):
            sent_loc = SentIndexLocation(sent_num, source)
            sent_tok = self._sent_reader.read_token(
                sentence, add_contexts=add_contexts,
                add_locs=add_locs, source=sent_loc)
            sent_toks.append(sent_tok)
        tok = Token(**{SENTS: sent_toks})

        # Add context pointers, if requested
        if add_contexts:
            for i, sent_tok in enumerate(tok[SENTS]):
                sent_tok[CONTEXT] = SubtokenContextPointer(tok, SENTS, i)

        # Return the finished token.
        return tok
Example 12
    def point_entropy(self, unlabelled_sequence):
        """
        Returns the pointwise entropy over the possible states at each
        position in the chain, given the observation sequence.
        """
        assert chktype(1, unlabelled_sequence, Token)

        SUBTOKENS = 'SUBTOKENS'
        TAG = 'TAG'

        symbols = unlabelled_sequence[SUBTOKENS]
        T = len(symbols)
        N = len(self._states)

        alpha = self._forward_probability(unlabelled_sequence)
        beta = self._backward_probability(unlabelled_sequence)
        normalisation = _log_add(*alpha[T-1, :])

        entropies = zeros(T, Float64)
        probs = zeros(N, Float64)
        for t in range(T):
            for s in range(N):
                probs[s] = alpha[t, s] + beta[t, s] - normalisation

            for s in range(N):
                entropies[t] -= exp(probs[s]) * probs[s]

        return entropies
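At each position t the value computed is the entropy of the posterior state distribution P(state_t | observations), obtained by combining the forward and backward log probabilities and normalising by the total log likelihood. A standalone numpy sketch of that final step, assuming log-domain alpha and beta arrays are already available (the example numbers are illustrative, not from a real HMM):

import numpy as np

def pointwise_entropy(log_alpha, log_beta):
    """log_alpha, log_beta: (T, N) forward/backward log probabilities."""
    log_posterior = log_alpha + log_beta                # log P(obs, state_t)
    log_z = np.logaddexp.reduce(log_alpha[-1])          # log P(obs)
    log_posterior -= log_z                              # rows now sum to 1
    return -np.sum(np.exp(log_posterior) * log_posterior, axis=1)

la = np.log(np.array([[0.6, 0.2], [0.3, 0.1]]))
lb = np.log(np.array([[0.5, 0.5], [1.0, 1.0]]))
print(pointwise_entropy(la, lb))                        # about [0.562 0.562]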
Example 13
    def train(self, tagged_token):
        """
        Train this C{NthOrderTagger} using the given training data.
        If this method is called multiple times, then the training
        data will be combined.
        
        @param tagged_token: A tagged corpus.  Each subtoken in
            C{tagged_token} should define the C{text} and C{tag}
            properties.
        @type tagged_token: L{Token}
        """
        assert chktype(1, tagged_token, Token)
        SUBTOKENS = self.property('SUBTOKENS')
        TEXT = self.property('TEXT')
        TAG = self.property('TAG')
        left, right = self._left, self._right
        
        # Extract the list of subtokens & list of tags.
        subtokens = tagged_token[SUBTOKENS]
        tags = tuple([t[TAG] for t in subtokens])

        for i, subtok in enumerate(subtokens):
            if i+left<0: continue
            # Construct the context from the current subtoken's text
            # and the adjacent tokens' tags.
            context = (tags[i+left:i+right], subtok[TEXT])

            # Record the current token in the frequency distribution.
            tag = subtok[TAG]
            self._freqdist[context].inc(tag)
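Each training step keys a frequency distribution on the pair (window of neighbouring tags, current word). A standalone sketch of the same bookkeeping with collections.Counter in place of ConditionalFreqDist, for the non-reversed first-order case (left = -1, right = 0); the names are hypothetical:

from collections import Counter, defaultdict

def train_sketch(tagged_sents, n=1):
    """tagged_sents: lists of (word, tag) pairs.
    Returns {(previous_n_tags, word): Counter of tags}."""
    freqdist = defaultdict(Counter)
    left, right = -n, 0
    for sent in tagged_sents:
        tags = tuple(tag for _, tag in sent)
        for i, (word, tag) in enumerate(sent):
            if i + left < 0:
                continue                       # not enough left context yet
            context = (tags[i + left:i + right], word)
            freqdist[context][tag] += 1
    return freqdist

fd = train_sketch([[('the', 'DT'), ('dog', 'NN'), ('dog', 'VB')]])
print(fd[(('DT',), 'dog')])                    # Counter({'NN': 1})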
Example 14
 def raw_tokenize(self, text):
     assert chktype(1, text, str)
     TEXT = self.property('TEXT')
     SUBTOKENS = self.property('SUBTOKENS')
     token = Token({TEXT:text})
     self.tokenize(token)
     return [subtok[TEXT] for subtok in token[SUBTOKENS]]
Example 15
    def tokenize(self, token, add_locs=False, add_contexts=False):
        assert chktype(1, token, Token)
        TEXT = self.property('TEXT')
        LOC = self.property('LOC')
        SUBTOKENS = self.property('SUBTOKENS')

        # If we're not adding locations, then just delegate to
        # raw_tokenize.
        if not add_locs:
            self._tokenize_from_raw(token, add_locs, add_contexts)
            return

        # This split will return a list of alternating matches and
        # non-matches.  If negative=1, then we want the even elements;
        # if negative=0, then we want the odd elements.
        words = self._regexp.split(token[TEXT])

        # Get the input token's source and start position.
        if token.has(LOC):
            source = token[LOC].source()
            pos = token[LOC].start()
        else:
            source = None
            pos = 0

        # Generate a list of subtokens with locations.
        subtoks = []
        for i, w in enumerate(words):
            if (i%2==0) == self._negative and w!='':
                loc = CharSpanLocation(pos, pos+len(w), source)
                subtoks.append(Token({TEXT:w, LOC:loc}))
            pos += len(w)
            
        # Write subtoks to the SUBTOKENS property.
        token[SUBTOKENS] = subtoks
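The key trick is that re.split with a capturing group returns an alternating list of non-matches and matches, so keeping a running character offset gives a (start, end) span for every piece that is kept. A standalone sketch for the negative=False case, where the pattern matches the tokens themselves (the pattern and names are illustrative):

import re

def tokenize_with_spans(text, pattern=r'(\w+)'):
    """Return (word, start, end) triples using split-with-a-group."""
    pieces = re.split(pattern, text)
    subtoks, pos = [], 0
    for i, piece in enumerate(pieces):
        if i % 2 == 1 and piece != '':        # odd indices are the matches
            subtoks.append((piece, pos, pos + len(piece)))
        pos += len(piece)
    return subtoks

print(tokenize_with_spans('the  dog barks.'))
# [('the', 0, 3), ('dog', 5, 8), ('barks', 9, 14)]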
Example 16
    def best_path(self, unlabelled_sequence):
        """
        Returns the state sequence of the optimal (most probable) path through
        the HMM. Uses the Viterbi algorithm to calculate this part by dynamic
        programming.

        @return: the state sequence
        @rtype: sequence of any
        @param unlabelled_sequence: the sequence of unlabelled symbols 
        @type unlabelled_sequence: Token
        """
        assert chktype(1, unlabelled_sequence, Token)

        SUBTOKENS = 'SUBTOKENS'
        TEXT = 'TEXT'

        symbols = unlabelled_sequence[SUBTOKENS]
        T = len(symbols)
        N = len(self._states)
        V = zeros((T, N), Float64)
        B = {}

        # find the starting log probabilities for each state
        symbol = symbols[0][TEXT]
        for i, state in enumerate(self._states):
            V[0, i] = self._priors.logprob(state) + \
                      self._output_logprob(state, symbol)
            B[0, state] = None

        # find the maximum log probabilities for reaching each state at time t
        for t in range(1, T):
            symbol = symbols[t][TEXT]
            for j in range(N):
                sj = self._states[j]
                best = None
                for i in range(N):
                    si = self._states[i]
                    va = V[t-1, i] + self._transitions[si].logprob(sj)
                    if not best or va > best[0]:
                        best = (va, si)
                V[t, j] = best[0] + self._output_logprob(sj, symbol)
                B[t, sj] = best[1]

        # find the highest probability final state
        best = None
        for i in range(N):
            val = V[T-1, i]
            if not best or val > best[0]:
                best = (val, self._states[i])

        # traverse the back-pointers B to find the state sequence
        current = best[1]
        sequence = [current]
        for t in range(T-1, 0, -1):
            last = B[t, current]
            sequence.append(last)
            current = last

        sequence.reverse()
        return sequence
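A standalone Viterbi sketch with plain dictionaries of natural-log probabilities, following the same structure as best_path: initialise from the priors, take the best predecessor for each state at each step, keep back-pointers, and trace back from the best final state. The toy weather/activity numbers are illustrative only:

import math

def viterbi(observations, states, log_prior, log_trans, log_emit):
    T, V, B = len(observations), [], {}
    # initialisation from the priors
    V.append({s: log_prior[s] + log_emit[s][observations[0]] for s in states})
    # recursion: best predecessor for each state at each time step
    for t in range(1, T):
        V.append({})
        for s in states:
            best = max((V[t-1][r] + log_trans[r][s], r) for r in states)
            V[t][s] = best[0] + log_emit[s][observations[t]]
            B[t, s] = best[1]
    # termination and back-trace
    current = max((V[T-1][s], s) for s in states)[1]
    path = [current]
    for t in range(T-1, 0, -1):
        current = B[t, current]
        path.append(current)
    return list(reversed(path))

ln = math.log
states = ['Rainy', 'Sunny']
prior = {'Rainy': ln(0.6), 'Sunny': ln(0.4)}
trans = {'Rainy': {'Rainy': ln(0.7), 'Sunny': ln(0.3)},
         'Sunny': {'Rainy': ln(0.4), 'Sunny': ln(0.6)}}
emit = {'Rainy': {'walk': ln(0.1), 'shop': ln(0.4), 'clean': ln(0.5)},
        'Sunny': {'walk': ln(0.6), 'shop': ln(0.3), 'clean': ln(0.1)}}
print(viterbi(['walk', 'shop', 'clean'], states, prior, trans, emit))
# ['Sunny', 'Rainy', 'Rainy']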
 def __init__(self, states=None, symbols=None, **properties):
     """
     Creates an HMM trainer to induce an HMM with the given states and
     output symbol alphabet. Only a supervised training method may be used.
     """
     assert chktype(1,symbols,types.TupleType,types.ListType,types.NoneType)
     assert chktype(2,states,types.TupleType,types.ListType,types.NoneType)
     if states:
         self._states = states
     else:
         self._states = []
     if symbols:
         self._symbols = symbols
     else:
         self._symbols = []
     self._properties = properties
Example 18
 def raw_xtokenize(self, text):
     assert chktype(1, text, str)
     TEXT = self.property('TEXT')
     SUBTOKENS = self.property('SUBTOKENS')
     token = Token({TEXT:text})
     self.xtokenize(token)
     for subtok in token[SUBTOKENS]:
         yield subtok[TEXT]
Example 19
 def likelihood(self, labelled_token):
     assert chktype(1, labelled_token, Token)
     vector = labelled_token['FEATURES']
     #assert chktype('features', vector, numarray.array([]), SparseArray)
     if self._should_normalise:
         vector = self._normalise(vector)
     if self._Tt != None:
         vector = numarray.matrixmultiply(self._Tt, vector)
     return self.likelihood_vectorspace(vector, labelled_token['CLUSTER'])
Example 20
 def xtokenize(self, token, add_locs=False, add_contexts=False):
     assert chktype(1, token, Token)
     TEXT = self.property('TEXT')
     SUBTOKENS = self.property('SUBTOKENS')
     text = token[TEXT]
     if hasattr(text, '__iter__') and hasattr(text, 'next'):
         token[TEXT] = ''.join(text)
     self.tokenize(token, add_locs, add_contexts)
     token[SUBTOKENS] = iter(token[SUBTOKENS])
Example 21
 def __init__(self, items=[]):
     """
     @param  items: the items at the leaves of the dendogram
     @type   items: sequence of (any)
     """
     assert chktype(1, items, [])
     self._items = [_DendogramNode(item) for item in items]
     self._original_items = copy.copy(self._items)
     self._merge = 1
Example 22
    def __init__(self, reference, test):
        """
        Construct a new confusion matrix from a list of reference
        values and a corresponding list of test values.
        
        @type reference: C{list}
        @param reference: An ordered list of reference values.
        @type test: C{list}
        @param test: A list of values to compare against the
            corresponding reference values.
        @raise ValueError: If C{reference} and C{test} do not have
            the same length.
        """
        assert chktype(1, reference, [])
        assert chktype(2, test, [])
        
        if len(reference) != len(test):
            raise ValueError('Lists must have the same length.')
            
        # Get a list of all values.
        values = dict([(val,1) for val in reference+test]).keys()

        # Construct a value->index dictionary
        indices = dict([(val,i) for (i,val) in enumerate(values)])

        # Make a confusion matrix table.
        confusion = [[0 for val in values] for val in values]
        max_conf = 0 # Maximum confusion
        for w,g in zip(reference, test):
            confusion[indices[w]][indices[g]] += 1
            max_conf = max(max_conf, confusion[indices[w]][indices[g]])

        #: A list of all values in C{reference} or C{test}.
        self._values = values
        #: A dictionary mapping values in L{self._values} to their indices.
        self._indices = indices
        #: The confusion matrix itself (as a list of lists of counts).
        self._confusion = confusion
        #: The greatest count in L{self._confusion} (used for printing).
        self._max_conf = max_conf
        #: The total number of values in the confusion matrix.
        self._total = len(reference)
        #: The number of correct (on-diagonal) values in the matrix.
        self._correct = sum([confusion[i][i] for i in range(len(values))])
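The bookkeeping reduces to counting (reference, test) pairs and reading the number correct off the diagonal. A standalone sketch of the same idea with collections.Counter (names hypothetical):

from collections import Counter

def confusion_sketch(reference, test):
    if len(reference) != len(test):
        raise ValueError('Lists must have the same length.')
    counts = Counter(zip(reference, test))      # (gold, predicted) -> count
    values = sorted(set(reference) | set(test))
    correct = sum(counts[(v, v)] for v in values)
    return counts, values, correct

counts, values, correct = confusion_sketch(['DT', 'NN', 'NN', 'VB'],
                                           ['DT', 'NN', 'VB', 'VB'])
print(counts[('NN', 'VB')], correct)            # 1 3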
Example 23
def recall(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the percentage of reference values that appear in the test set.
    In particular, return |C{reference}S{cap}C{test}|/|C{reference}|.
    If C{reference} is empty, then return C{None}.
    
    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference set.
    @rtype: C{float} or C{None}
    """
    assert chktype(1, reference, sets.BaseSet)
    assert chktype(2, test, sets.BaseSet)
    if len(reference) == 0:
        return None
    else:
        return float(len(reference.intersection(test)))/len(reference)
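The old sets module used above has since been removed from Python; with the built-in set type the same measure is a two-liner. A minimal standalone sketch:

def recall_sketch(reference, test):
    """|reference & test| / |reference|, or None for an empty reference set."""
    if not reference:
        return None
    return len(reference & test) / float(len(reference))

print(recall_sketch({'cat', 'dog', 'fish'}, {'dog', 'fish', 'bird'}))   # 0.666...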
Example 24
 def classify(self, token):
     assert chktype(1, token, Token)
     vector = token['FEATURES']
     #assert chktype('features', vector, numarray.array([]), SparseArray)
     if self._should_normalise:
         vector = self._normalise(vector)
     if self._Tt != None:
         vector = numarray.matrixmultiply(self._Tt, vector)
     cluster = self.classify_vectorspace(vector)
     token['CLUSTER'] = self.cluster_name(cluster)
Example 25
    def tag(self, token):
        assert chktype(1, token, Token)
        SUBTOKENS = self.property('SUBTOKENS')
        TAG = self.property('TAG')

        # Tag each token, in sequential order.
        subtokens = token[SUBTOKENS]
        for i, subtoken in enumerate(subtokens):
            tag = self.tag_subtoken(subtokens, i)
            subtoken[TAG] = tag
Example 26
 def __init__(self, num_means, distance, repeats=1,
                    conv_test=1e-6, initial_means=None,
                    normalise=False, svd_dimensions=None,
                    rng=None):
     """
     @param  num_means:  the number of means to use (may use fewer)
     @type   num_means:  int
     @param  distance:   measure of distance between two vectors
      @type   distance:   function taking two vectors and returning a float
     @param  repeats:    number of randomised clustering trials to use
     @type   repeats:    int
      @param  conv_test:  maximum variation in mean differences before
                          the means are deemed to have converged
     @type   conv_test:  number
     @param  initial_means: set of k initial means
     @type   initial_means: sequence of vectors
     @param  normalise:  should vectors be normalised to length 1
     @type   normalise:  boolean
     @param svd_dimensions: number of dimensions to use in reducing vector
                             dimensionality with SVD
     @type svd_dimensions: int 
     @param  rng:        random number generator (or None)
     @type   rng:        Random
     """
     assert chktype(1, num_means, int)
     #assert chktype(2, distance, ...)
     assert chktype(3, repeats, int)
     assert chktype(4, conv_test, int, float)
     #assert chktype(5, initial_means, [numarray.array([])], [SparseArray])
     assert chktype(6, normalise, bool)
     assert chktype(7, svd_dimensions, int, types.NoneType)
     VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
     self._num_means = num_means
     self._distance = distance
     self._max_difference = conv_test
     assert not initial_means or len(initial_means) == num_means
     self._means = initial_means
     assert repeats >= 1
     assert not (initial_means and repeats > 1)
     self._repeats = repeats
     if rng: self._rng = rng
     else:   self._rng = random.Random()
Example 27
 def groups(self, n):
     """
     Finds the n-groups of items (leaves) reachable from a cut at depth n.
     @param  n: number of groups
     @type   n: int
     """
     assert chktype(1, n, int)
     if len(self._items) > 1:
         root = _DendogramNode(self._merge, *self._items)
     else:
         root = self._items[0]
     return root.groups(n)
Example 28
def accuracy(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    values, return the percentage of corresponding values that are
    equal.  In particular, return the percentage of indices
    C{0<=i<len(test)} such that C{test[i] == reference[i]}.

    @type reference: C{list}
    @param reference: An ordered list of reference values.
    @type test: C{list}
    @param test: A list of values to compare against the corresponding
        reference values.
    @raise ValueError: If C{reference} and C{test} do not have the
        same length.
    """
    assert chktype(1, reference, [])
    assert chktype(2, test, [])
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    num_correct = [1 for x,y in zip(reference, test) if x==y]
    return float(len(num_correct)) / len(reference)
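This is the same element-wise scoring used by tagger_accuracy in Example 1. A self-contained equivalent (without the chktype asserts) plus a tiny usage example:

def accuracy_sketch(reference, test):
    if len(reference) != len(test):
        raise ValueError('Lists must have the same length.')
    return sum(1 for x, y in zip(reference, test) if x == y) / float(len(reference))

print(accuracy_sketch(['DT', 'NN', 'VB'], ['DT', 'NN', 'NN']))   # 0.666...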
Example 29
 def vector(self, token):
     """
     Returns the vector after normalisation and dimensionality reduction
     for the given token's FEATURES.
     """
     assert chktype(1, token, Token)
     vector = token['FEATURES']
     #assert chktype('features', vector, numarray.array([]), SparseArray)
     if self._should_normalise:
         vector = self._normalise(vector)
     if self._Tt != None:
         vector = numarray.matrixmultiply(self._Tt, vector)
     return vector
Example 30
 def copy(self, deep=True):
     """
     @rtype: L{Token}
     @return: A new copy of this token.
     @param deep: If false, then the new token will use the same
         objects to encode feature values that the original token
         did.  If true, then the new token will use deep copies of
         the original token's feature values.  The default value
         of C{True} is almost always the correct choice.
     """
     assert chktype(1, deep, bool)
     if deep: return copy.deepcopy(self)
     else: return copy.copy(self)
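The difference between the two modes shows up as soon as a feature value is mutated; a tiny standalone illustration with a plain dict and list standing in for token feature values:

import copy

original = {'TEXT': 'dog', 'FEATURES': [1, 2, 3]}
shallow = copy.copy(original)          # shares the same list object
deep = copy.deepcopy(original)         # gets its own copy of the list
original['FEATURES'].append(4)
print(shallow['FEATURES'])             # [1, 2, 3, 4]  (shared)
print(deep['FEATURES'])                # [1, 2, 3]     (independent)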