def setdefault(self, property, default=None):
    assert chktype(1, property, str)
    assert chktype(2, default, self._checkval)
    if ((property == 'LOC') and not isinstance(default, LocationI)
        and default is not None):
        raise TypeError("The 'LOC' property must contain a Location")
    return super(SafeToken, self).setdefault(property, default)
def tagger_accuracy(tagger, gold_standard):
    """
    Score the accuracy of the tagger against the gold standard.
    Strip the tags from the gold standard text, retag it using
    the tagger, then compute the accuracy score.

    @type tagger: C{Tagger}
    @param tagger: The tagger being evaluated.
    @type gold_standard: C{list} of C{Token}
    @param gold_standard: The list of tagged tokens to score the
        tagger on; each must have the 'SUBTOKENS' attribute.
    @rtype: C{float}
    """
    # NB: replace tagger._property_names with tagger.property_names()?
    assert chktype(1, tagger, TaggerI)
    assert chktype(2, gold_standard, (Token,), [Token])

    TAG = tagger.property('TAG')
    SUBTOKENS = tagger.property('SUBTOKENS')

    gold_toks = []
    test_toks = []
    for gold_doc in gold_standard:
        test_doc = gold_doc.exclude(TAG)
        tagger.tag(test_doc)
        gold_toks += gold_doc[SUBTOKENS]
        test_toks += test_doc[SUBTOKENS]

    return accuracy(gold_toks, test_toks)
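# A hedged usage sketch for tagger_accuracy(): it expects an already-trained
# tagger and a list of tagged documents, each carrying a 'SUBTOKENS' list of
# tagged subtokens.  The names `my_tagger` and `gold_docs` below are
# placeholders for this illustration, not part of the module:
#
#     score = tagger_accuracy(my_tagger, gold_docs)
#     print 'tagger accuracy: %.1f%%' % (100 * score)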
def __setitem__(self, property, value):
    assert chktype(1, property, str)
    assert chktype(2, value, self._checkval)
    if ((property == 'LOC') and not isinstance(value, LocationI)
        and value is not None):
        raise TypeError("The 'LOC' property must contain a Location")
    return super(SafeToken, self).__setitem__(property, value)
def __init__(self, states=None, symbols=None, **properties):
    """
    Creates an HMM trainer to induce an HMM with the given states
    and output symbol alphabet.  Both supervised and unsupervised
    training methods may be used.  If either the states or the
    symbols are not given, they may be derived from supervised
    training.

    @param states: the set of state labels
    @type states: sequence of any
    @param symbols: the set of observation symbols
    @type symbols: sequence of any
    @param properties: alternative names to be used for the TEXT,
        SUBTOKENS and TAG properties
    """
    assert chktype(1, states, types.TupleType, types.ListType, types.NoneType)
    assert chktype(2, symbols, types.TupleType, types.ListType, types.NoneType)
    if states:
        self._states = states
    else:
        self._states = []
    if symbols:
        self._symbols = symbols
    else:
        self._symbols = []
    self._properties = properties
def pp(self, margin=70, indent=0, nodesep=':', parens='()'):
    """
    @return: A pretty-printed string representation of this tree.
    @rtype: C{string}
    @param margin: The right margin at which to do line-wrapping.
    @type margin: C{int}
    @param indent: The indentation level at which printing begins.
        This number is used to decide how far to indent subsequent
        lines.
    @type indent: C{int}
    @param nodesep: A string that is used to separate the node from
        the children.  E.g., the default value C{':'} gives trees
        like C{(S: (NP: I) (VP: (V: saw) (NP: it)))}.
    @param parens: A two-character string giving the opening and
        closing delimiters used around each constituent.
    @type parens: C{string}
    """
    assert chktype(1, margin, types.IntType)
    assert chktype(2, indent, types.IntType)

    # Try writing it on one line.
    s = self._ppflat(nodesep, parens)
    if len(s)+indent < margin:
        return s

    # If it doesn't fit on one line, then write it on multiple lines.
    s = '%s%s%s' % (parens[0], self.node, nodesep)
    for child in self:
        if isinstance(child, Tree):
            s += '\n'+' '*(indent+2)+child.pp(margin, indent+2,
                                              nodesep, parens)
        else:
            s += '\n'+' '*(indent+2)+repr(child)
    return s+parens[1]
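# Illustration only: with a margin small enough to force wrapping (say
# margin=30), a tree built with the usual Tree(node, *children) constructor
# would be pretty-printed roughly as:
#
#     (S:
#       (NP: I)
#       (VP: (V: saw) (NP: it)))
#
# i.e. each child that no longer fits on the parent's line starts on its own
# line, indented two further spaces.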
def cluster(self, tokens, assign_clusters=False, trace=False):
    assert chktype(1, tokens, [Token])
    assert chktype(2, assign_clusters, bool)
    assert chktype(3, trace, bool)
    assert len(tokens) > 0
    vectors = map(lambda tk: tk['FEATURES'], tokens)

    # normalise the vectors
    if self._should_normalise:
        vectors = map(self._normalise, vectors)

    # use SVD to reduce the dimensionality
    if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
        [u, d, vt] = numarray.linear_algebra.singular_value_decomposition(
            numarray.transpose(numarray.array(vectors)))
        # Keep only the largest svd_dimensions singular values/vectors.
        # The reduced token vectors are the rows of transpose(S * Dt),
        # and _Tt is used later to project unseen vectors into the same
        # reduced space (see classify() and vector()).
        S = d[:self._svd_dimensions] * \
            numarray.identity(self._svd_dimensions, numarray.Float64)
        T = u[:,:self._svd_dimensions]
        Dt = vt[:self._svd_dimensions,:]
        vectors = numarray.transpose(numarray.matrixmultiply(S, Dt))
        self._Tt = numarray.transpose(T)

    # call abstract method to cluster the vectors
    self.cluster_vectorspace(vectors, trace)

    # assign the tokens to clusters
    if assign_clusters:
        for token in tokens:
            self.classify(token)
def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
    assert chktype(1, num_clusters, int)
    assert chktype(2, normalise, bool)
    assert chktype(3, svd_dimensions, int, types.NoneType)
    VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
    self._num_clusters = num_clusters
    self._dendogram = None
    self._groups_values = None
def __init__(self, normalise=False, svd_dimensions=None):
    """
    @param normalise: should vectors be normalised to length 1
    @type normalise: boolean
    @param svd_dimensions: number of dimensions to use in reducing
        vector dimensionality with SVD
    @type svd_dimensions: int
    """
    assert chktype(1, normalise, bool)
    assert chktype(2, svd_dimensions, int, types.NoneType)
    self._Tt = None
    self._should_normalise = normalise
    self._svd_dimensions = svd_dimensions
def _centroid(self, cluster):
    assert chktype(1, cluster, [])
    assert len(cluster) > 0
    centroid = copy.copy(cluster[0])
    for vector in cluster[1:]:
        centroid += vector
    return centroid / float(len(cluster))
def __init__(self, n, reverse=False, cutoff=0, **property_names):
    """
    Construct a new I{n}-th order stochastic tagger.  The new
    tagger should be trained, using the L{train()} method, before
    it is used to tag data.

    @param n: The order of the new C{NthOrderTagger}.
    @type n: int
    @param reverse: If true, then assign tags to subtokens in
        reverse sequential order (i.e., from last to first).
    @type reverse: C{boolean}
    @type cutoff: C{int}
    @param cutoff: A count-cutoff for the tagger's frequency
        distribution.  If the tagger saw fewer than C{cutoff}
        examples of a given context in training, then it will
        return a tag of C{None} for that context.
    @type property_names: C{dict}
    @param property_names: A dictionary that can be used to
        override the default property names.  Each entry maps from
        a default property name to a new property name.
    """
    assert chktype(1, n, types.IntType)
    if n < 0:
        raise ValueError('n must be non-negative')
    SequentialTagger.__init__(self, reverse, **property_names)
    self._freqdist = ConditionalFreqDist()
    self._n = n
    self._cutoff = cutoff

    # Record the start & end indices of the context window for tags.
    if self._reverse:
        self._left = 1
        self._right = 1+n
    else:
        self._left = -n
        self._right = 0
def read_token(self, s, add_contexts=False, add_locs=False, source=None):
    assert chktype(1, s, str)
    TEXT = self.property('TEXT')
    LOC = self.property('LOC')
    CONTEXT = self.property('CONTEXT')
    SENTS = self.property('SENTS')
    TREE = self.property('TREE')

    sentences = re.findall('(?s)\S.*?/\.', s)
    sent_toks = []
    for sent_num, sentence in enumerate(sentences):
        sent_loc = SentIndexLocation(sent_num, source)
        sent_tok = self._sent_reader.read_token(
            sentence, add_contexts=add_contexts,
            add_locs=add_locs, source=sent_loc)
        sent_toks.append(sent_tok)
    tok = Token(**{SENTS: sent_toks})

    # Add context pointers, if requested
    if add_contexts:
        for i, sent_tok in enumerate(tok[SENTS]):
            sent_tok[CONTEXT] = SubtokenContextPointer(tok, SENTS, i)

    # Return the finished token.
    return tok
def point_entropy(self, unlabelled_sequence):
    """
    Returns the pointwise entropy over the possible states at each
    position in the chain, given the observation sequence.
    """
    assert chktype(1, unlabelled_sequence, Token)
    SUBTOKENS = 'SUBTOKENS'
    TAG = 'TAG'
    symbols = unlabelled_sequence[SUBTOKENS]
    T = len(symbols)
    N = len(self._states)

    alpha = self._forward_probability(unlabelled_sequence)
    beta = self._backward_probability(unlabelled_sequence)
    normalisation = _log_add(*alpha[T-1, :])

    entropies = zeros(T, Float64)
    probs = zeros(N, Float64)
    for t in range(T):
        # log posterior probability of each state at time t
        for s in range(N):
            probs[s] = alpha[t, s] + beta[t, s] - normalisation

        # entropy of the posterior: -sum_s p(s) log p(s), where
        # probs[s] holds log p(s)
        for s in range(N):
            entropies[t] -= exp(probs[s]) * probs[s]

    return entropies
def train(self, tagged_token):
    """
    Train this C{NthOrderTagger} using the given training data.
    If this method is called multiple times, then the training
    data will be combined.

    @param tagged_token: A tagged corpus.  Each subtoken in
        C{tagged_token} should define the C{TEXT} and C{TAG}
        properties.
    @type tagged_token: L{Token}
    """
    assert chktype(1, tagged_token, Token)
    SUBTOKENS = self.property('SUBTOKENS')
    TEXT = self.property('TEXT')
    TAG = self.property('TAG')
    left, right = self._left, self._right

    # Extract the list of subtokens & list of tags.
    subtokens = tagged_token[SUBTOKENS]
    tags = tuple([t[TAG] for t in subtokens])

    for i, subtok in enumerate(subtokens):
        if i+left < 0: continue
        # Construct the context from the current subtoken's text
        # and the adjacent tokens' tags.
        context = (tags[i+left:i+right], subtok[TEXT])
        # Record the current token in the frequency distribution.
        tag = subtok[TAG]
        self._freqdist[context].inc(tag)
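# A minimal, illustrative sketch (not part of the original module) of the
# context-window arithmetic used by train() above, for a second-order tagger
# running left-to-right (left = -n, right = 0).  The helper name
# _demo_context_window and the toy data are hypothetical.
def _demo_context_window():
    n = 2
    left, right = -n, 0
    words = ['the', 'dog', 'barked']
    tags = ('DT', 'NN', 'VB')
    contexts = []
    for i, word in enumerate(words):
        if i + left < 0:
            continue  # not enough preceding tags to fill the window
        contexts.append((tags[i+left:i+right], word))
    # contexts == [(('DT', 'NN'), 'barked')]; the observed tag 'VB' would be
    # recorded against that context in the frequency distribution.
    return contexts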
def raw_tokenize(self, text):
    assert chktype(1, text, str)
    TEXT = self.property('TEXT')
    SUBTOKENS = self.property('SUBTOKENS')
    token = Token({TEXT: text})
    self.tokenize(token)
    return [subtok[TEXT] for subtok in token[SUBTOKENS]]
def tokenize(self, token, add_locs=False, add_contexts=False):
    assert chktype(1, token, Token)
    TEXT = self.property('TEXT')
    LOC = self.property('LOC')
    SUBTOKENS = self.property('SUBTOKENS')

    # If we're not adding locations, then just delegate to
    # raw_tokenize.
    if not add_locs:
        self._tokenize_from_raw(token, add_locs, add_contexts)
        return

    # This split will return a list of alternating matches and
    # non-matches.  If negative=1, then we want the even elements;
    # if negative=0, then we want the odd elements.
    words = self._regexp.split(token[TEXT])

    # Get the input token's source and start position.
    if token.has(LOC):
        source = token[LOC].source()
        pos = token[LOC].start()
    else:
        source = None
        pos = 0

    # Generate a list of subtokens with locations.
    subtoks = []
    for i, w in enumerate(words):
        if (i%2==0) == self._negative and w != '':
            loc = CharSpanLocation(pos, pos+len(w), source)
            subtoks.append(Token({TEXT: w, LOC: loc}))
        pos += len(w)

    # Write subtoks to the SUBTOKENS property.
    token[SUBTOKENS] = subtoks
def best_path(self, unlabelled_sequence):
    """
    Returns the state sequence of the optimal (most probable)
    path through the HMM.  Uses the Viterbi algorithm to
    calculate this path by dynamic programming.

    @return: the state sequence
    @rtype: sequence of any
    @param unlabelled_sequence: the sequence of unlabelled symbols
    @type unlabelled_sequence: Token
    """
    assert chktype(1, unlabelled_sequence, Token)
    SUBTOKENS = 'SUBTOKENS'
    TEXT = 'TEXT'
    symbols = unlabelled_sequence[SUBTOKENS]
    T = len(symbols)
    N = len(self._states)
    V = zeros((T, N), Float64)
    B = {}

    # find the starting log probabilities for each state
    symbol = symbols[0][TEXT]
    for i, state in enumerate(self._states):
        V[0, i] = self._priors.logprob(state) + \
                  self._output_logprob(state, symbol)
        B[0, state] = None

    # find the maximum log probabilities for reaching each state at time t
    for t in range(1, T):
        symbol = symbols[t][TEXT]
        for j in range(N):
            sj = self._states[j]
            best = None
            for i in range(N):
                si = self._states[i]
                va = V[t-1, i] + self._transitions[si].logprob(sj)
                if not best or va > best[0]:
                    best = (va, si)
            V[t, j] = best[0] + self._output_logprob(sj, symbol)
            B[t, sj] = best[1]

    # find the highest probability final state
    best = None
    for i in range(N):
        val = V[T-1, i]
        if not best or val > best[0]:
            best = (val, self._states[i])

    # traverse the back-pointers B to find the state sequence
    current = best[1]
    sequence = [current]
    for t in range(T-1, 0, -1):
        last = B[t, current]
        sequence.append(last)
        current = last
    sequence.reverse()
    return sequence
def __init__(self, states=None, symbols=None, **properties):
    """
    Creates an HMM trainer to induce an HMM with the given states
    and output symbol alphabet.  Only a supervised training method
    may be used.
    """
    assert chktype(1, states, types.TupleType, types.ListType, types.NoneType)
    assert chktype(2, symbols, types.TupleType, types.ListType, types.NoneType)
    if states:
        self._states = states
    else:
        self._states = []
    if symbols:
        self._symbols = symbols
    else:
        self._symbols = []
    self._properties = properties
def raw_xtokenize(self, text):
    assert chktype(1, text, str)
    TEXT = self.property('TEXT')
    SUBTOKENS = self.property('SUBTOKENS')
    token = Token({TEXT: text})
    self.xtokenize(token)
    for subtok in token[SUBTOKENS]:
        yield subtok[TEXT]
def likelihood(self, labelled_token):
    assert chktype(1, labelled_token, Token)
    vector = labelled_token['FEATURES']
    #assert chktype('features', vector, numarray.array([]), SparseArray)
    if self._should_normalise:
        vector = self._normalise(vector)
    if self._Tt is not None:
        vector = numarray.matrixmultiply(self._Tt, vector)
    return self.likelihood_vectorspace(vector, labelled_token['CLUSTER'])
def xtokenize(self, token, add_locs=False, add_contexts=False):
    assert chktype(1, token, Token)
    TEXT = self.property('TEXT')
    SUBTOKENS = self.property('SUBTOKENS')
    text = token[TEXT]
    # If TEXT is an iterator of string pieces, join it into a single
    # string before delegating to tokenize().
    if hasattr(text, '__iter__') and hasattr(text, 'next'):
        token[TEXT] = ''.join(text)
    self.tokenize(token, add_locs, add_contexts)
    token[SUBTOKENS] = iter(token[SUBTOKENS])
def __init__(self, items=[]):
    """
    @param items: the items at the leaves of the dendogram
    @type items: sequence of (any)
    """
    assert chktype(1, items, [])
    self._items = [_DendogramNode(item) for item in items]
    self._original_items = copy.copy(self._items)
    self._merge = 1
def __init__(self, reference, test):
    """
    Construct a new confusion matrix from a list of reference
    values and a corresponding list of test values.

    @type reference: C{list}
    @param reference: An ordered list of reference values.
    @type test: C{list}
    @param test: A list of values to compare against the
        corresponding reference values.
    @raise ValueError: If C{reference} and C{test} do not have
        the same length.
    """
    assert chktype(1, reference, [])
    assert chktype(2, test, [])
    if len(reference) != len(test):
        raise ValueError('Lists must have the same length.')

    # Get a list of all values.
    values = dict([(val,1) for val in reference+test]).keys()

    # Construct a value->index dictionary
    indices = dict([(val,i) for (i,val) in enumerate(values)])

    # Make a confusion matrix table.
    confusion = [[0 for val in values] for val in values]
    max_conf = 0 # Maximum confusion
    for w,g in zip(reference, test):
        confusion[indices[w]][indices[g]] += 1
        max_conf = max(max_conf, confusion[indices[w]][indices[g]])

    #: A list of all values in C{reference} or C{test}.
    self._values = values
    #: A dictionary mapping values in L{self._values} to their indices.
    self._indices = indices
    #: The confusion matrix itself (as a list of lists of counts).
    self._confusion = confusion
    #: The greatest count in L{self._confusion} (used for printing).
    self._max_conf = max_conf
    #: The total number of values in the confusion matrix.
    self._total = len(reference)
    #: The number of correct (on-diagonal) values in the matrix.
    self._correct = sum([confusion[i][i] for i in range(len(values))])
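# A hedged usage sketch: the constructor above builds the matrix from two
# parallel lists of values (e.g. reference tags vs. tags assigned by a
# tagger).  The class name ConfusionMatrix is assumed here for illustration;
# only the constructor body is shown in this excerpt:
#
#     ref  = 'DET NN VB DET JJ NN NN IN DET NN'.split()
#     test = 'DET VB VB DET NN NN NN IN DET NN'.split()
#     cm = ConfusionMatrix(ref, test)
#     # cm._total == 10, cm._correct == 8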
def recall(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the fraction of reference values that appear in the test set.
    In particular, return |C{reference}S{cap}C{test}|/|C{reference}|.
    If C{reference} is empty, then return C{None}.

    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference set.
    @rtype: C{float} or C{None}
    """
    assert chktype(1, reference, sets.BaseSet)
    assert chktype(2, test, sets.BaseSet)
    if len(reference) == 0:
        return None
    else:
        return float(len(reference.intersection(test))) / len(reference)
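# A small illustrative check (the helper name _demo_recall is hypothetical;
# it simply exercises recall() as defined above):
def _demo_recall():
    import sets
    reference = sets.Set(['a', 'b', 'c', 'd'])
    test = sets.Set(['b', 'c', 'd', 'e'])
    # |reference & test| / |reference| = 3/4
    return recall(reference, test)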
def classify(self, token):
    assert chktype(1, token, Token)
    vector = token['FEATURES']
    #assert chktype('features', vector, numarray.array([]), SparseArray)
    if self._should_normalise:
        vector = self._normalise(vector)
    if self._Tt is not None:
        vector = numarray.matrixmultiply(self._Tt, vector)
    cluster = self.classify_vectorspace(vector)
    token['CLUSTER'] = self.cluster_name(cluster)
def tag(self, token):
    assert chktype(1, token, Token)
    SUBTOKENS = self.property('SUBTOKENS')
    TAG = self.property('TAG')

    # Tag each token, in sequential order.
    subtokens = token[SUBTOKENS]
    for i, subtoken in enumerate(subtokens):
        tag = self.tag_subtoken(subtokens, i)
        subtoken[TAG] = tag
def __init__(self, num_means, distance, repeats=1,
             conv_test=1e-6, initial_means=None,
             normalise=False, svd_dimensions=None,
             rng=None):
    """
    @param num_means: the number of means to use (may use fewer)
    @type num_means: int
    @param distance: measure of distance between two vectors
    @type distance: function taking two vectors and returning a float
    @param repeats: number of randomised clustering trials to use
    @type repeats: int
    @param conv_test: maximum variation in mean differences before
        deemed convergent
    @type conv_test: number
    @param initial_means: set of k initial means
    @type initial_means: sequence of vectors
    @param normalise: should vectors be normalised to length 1
    @type normalise: boolean
    @param svd_dimensions: number of dimensions to use in reducing
        vector dimensionality with SVD
    @type svd_dimensions: int
    @param rng: random number generator (or None)
    @type rng: Random
    """
    assert chktype(1, num_means, int)
    #assert chktype(2, distance, ...)
    assert chktype(3, repeats, int)
    assert chktype(4, conv_test, int, float)
    #assert chktype(5, initial_means, [numarray.array([])], [SparseArray])
    assert chktype(6, normalise, bool)
    assert chktype(7, svd_dimensions, int, types.NoneType)
    VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
    self._num_means = num_means
    self._distance = distance
    self._max_difference = conv_test
    assert not initial_means or len(initial_means) == num_means
    self._means = initial_means
    assert repeats >= 1
    assert not (initial_means and repeats > 1)
    self._repeats = repeats
    if rng:
        self._rng = rng
    else:
        self._rng = random.Random()
def groups(self, n):
    """
    Finds the n groups of items (leaves) reachable from a cut at
    depth n.

    @param n: number of groups
    @type n: int
    """
    assert chktype(1, n, int)
    if len(self._items) > 1:
        root = _DendogramNode(self._merge, *self._items)
    else:
        root = self._items[0]
    return root.groups(n)
def accuracy(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    values, return the fraction of corresponding values that are
    equal.  In particular, return the fraction of indices
    C{0<=i<len(test)} such that C{test[i] == reference[i]}.

    @type reference: C{list}
    @param reference: An ordered list of reference values.
    @type test: C{list}
    @param test: A list of values to compare against the corresponding
        reference values.
    @raise ValueError: If C{reference} and C{test} do not have the
        same length.
    """
    assert chktype(1, reference, [])
    assert chktype(2, test, [])
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    num_correct = len([1 for x, y in zip(reference, test) if x == y])
    return float(num_correct) / len(reference)
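# A small illustrative check (the helper name _demo_accuracy is hypothetical;
# it simply exercises accuracy() as defined above):
def _demo_accuracy():
    reference = ['DT', 'NN', 'VB', 'IN']
    test = ['DT', 'NN', 'NN', 'IN']
    # 3 of the 4 positions match, so this returns 0.75
    return accuracy(reference, test)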
def vector(self, token):
    """
    Returns the vector after normalisation and dimensionality
    reduction for the given token's FEATURES.
    """
    assert chktype(1, token, Token)
    vector = token['FEATURES']
    #assert chktype('features', vector, numarray.array([]), SparseArray)
    if self._should_normalise:
        vector = self._normalise(vector)
    if self._Tt is not None:
        vector = numarray.matrixmultiply(self._Tt, vector)
    return vector
def copy(self, deep=True):
    """
    @rtype: L{Token}
    @return: A new copy of this token.
    @param deep: If false, then the new token will use the same
        objects to encode feature values that the original token
        did.  If true, then the new token will use deep copies of
        the original token's feature values.  The default value
        of C{True} is almost always the correct choice.
    """
    assert chktype(1, deep, bool)
    if deep:
        return copy.deepcopy(self)
    else:
        return copy.copy(self)