Example #1
    def __init__(self, fname, load=True):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        self.model_file = fname
        if load:
            self.load(self.model_file)
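All of these examples construct an AveragedPerceptron without showing its definition. A minimal sketch of the interface the snippets rely on (predict, update, average_weights), loosely following Honnibal's reference implementation, is given below; the exact details vary between the repositories these examples come from, so treat this as an assumption rather than the canonical class.

from collections import defaultdict

class AveragedPerceptron(object):
    def __init__(self, classes=None):
        self.weights = {}                  # feature -> {class: weight}
        self.classes = set(classes or [])
        self._totals = defaultdict(int)    # accumulated (feature, class) weights
        self._tstamps = defaultdict(int)   # timestep of each weight's last update
        self.i = 0                         # number of updates seen

    def predict(self, features):
        '''Dot-product the features with each class's weights; return the best class.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            for label, weight in self.weights[feat].items():
                scores[label] += value * weight
        # break ties deterministically via the label's string form
        return max(self.classes, key=lambda label: (scores[label], str(label)))

    def update(self, truth, guess, features):
        '''Reward the true class and penalise the guess for each active feature.'''
        def upd_feat(c, f, w, v):
            key = (f, c)
            # bank the weight as it stood since its last change, then move it
            self._totals[key] += (self.i - self._tstamps[key]) * w
            self._tstamps[key] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)

    def average_weights(self):
        '''Replace each weight with its average over all update steps.'''
        for feat, weights in self.weights.items():
            new_weights = {}
            for label, weight in weights.items():
                key = (feat, label)
                total = self._totals[key] + (self.i - self._tstamps[key]) * weight
                averaged = round(total / float(self.i), 3) if self.i else 0.0
                if averaged:
                    new_weights[label] = averaged
            self.weights[feat] = new_weights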
Example #2
    def __init__(self, fname, load=True):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        # initialise GloVe word vectors via the gensim downloader
        self.wv = api.load('glove-twitter-25')
        self.model_file = fname
        if load:
            self.load(self.model_file)
    def train(self, essay_feats, save_loc=None, nr_iter=5, verbose=True):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.
        :param essay_feats: A list of essay feature objects.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''

        # Copy, as we do an in-place shuffle below
        cp_essay_feats = list(essay_feats)

        tag_freq = defaultdict(int)
        for essay in cp_essay_feats:
            for tagged_sentence in essay.sentences:
                for wd in tagged_sentence:
                    fs_tags = self.__get_tags_(wd.tags)
                    tag_freq[fs_tags] += 1

        self.classes = {fs for fs, cnt in tag_freq.items() if cnt >= self.combo_freq_threshold}
        self.model = AveragedPerceptron(self.classes)

        for iter_ in range(nr_iter):
            class2predictions = defaultdict(list)
            class2tags = defaultdict(list)

            for essay_ix, essay in enumerate(cp_essay_feats):
                for sent_ix, tagged_sentence in enumerate(essay.sentences):
                    """ Start Sentence """
                    prev = list(self.START)

                    for i, wd in enumerate(tagged_sentence):
                        # Don't mutate the feat dictionary
                        shared_features = dict(wd.features.items())
                        # get all tagger predictions for previous 2 tags
                        self._add_secondary_tag_features(shared_features, prev)

                        tagger_feats = dict(shared_features.items())
                        # add more in depth features for this tag
                        actual = self.__get_tags_(wd.tags)

                        if self.use_tag_features:
                            self._add_tag_features(tagger_feats, wd.word, prev[-1], prev[-2])

                        guess = self.model.predict(tagger_feats)
                        self.model.update(actual, guess, tagger_feats)

                        prev.append(guess)
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in guess else 0)
                            class2tags[cls].append(1 if cls in actual else 0)

            random.shuffle(cp_essay_feats)
            class2metrics = ResultsProcessor.compute_metrics(class2tags, class2predictions)
            micro_metrics = micro_rpfa(class2metrics.values())
            if verbose:
                logging.info("Iter {0}: Micro Avg Metrics: {1}".format(iter_, str(micro_metrics)))

        self.model.average_weights()
        return None
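Example #2 loads GloVe Twitter vectors in __init__ but the excerpt never shows them being used. One plausible, purely hypothetical use is to fold the embedding into the feature dictionary, one real-valued feature per dimension (add_embedding_features is an invented helper name; wv is the gensim KeyedVectors object loaded above):

def add_embedding_features(wv, feats, word):
    # Hypothetical: add each embedding dimension as a real-valued feature.
    if word in wv:  # gensim KeyedVectors supports membership tests
        for d, value in enumerate(wv[word]):
            feats['emb_%d' % d] = float(value)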
Example #4
    def __init__(self, classes, tag_history):
        self.tag_history = tag_history
        self.classes = set(classes)
        self.class2model = {}
        for cls in classes:
            self.class2model[cls] = AveragedPerceptron()
            self.class2model[cls].classes = {
                self.NEGATIVE_CLASS, self.POSITIVE_CLASS}
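Example #4 keeps one binary AveragedPerceptron per tag rather than a single multi-class model. A sketch of how prediction could then combine them, querying every per-tag model and keeping the positives (predict_tags is a hypothetical method name; it assumes each model returns POSITIVE_CLASS or NEGATIVE_CLASS as configured above):

    def predict_tags(self, features):
        # Ask every per-tag binary model; keep the tags predicted positive.
        tags = set()
        for cls, model in self.class2model.items():
            if model.predict(features) == self.POSITIVE_CLASS:
                tags.add(cls)
        return tags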
Example #5
    def __init__(self, load=None):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        self.graphdict = pickle.load(open("../pos_dict.pickle", "rb"))
        self.nodeperm = pickle.load(open("../pos_nodeperm_dict.pickle", "rb"))
        self.graph = nx.DiGraph()
        if load:
            self.load(load)
        with open("../gen_pos_graph.txt", "r") as pos_file:
            for line in pos_file:
                first, second = map(int, line.split())
                if first >= len(self.nodeperm) or second >= len(self.nodeperm):
                    continue
                first_idx = self.nodeperm[first]
                first_word = self.graphdict[first_idx]
                second_idx = self.nodeperm[second]
                second_word = self.graphdict[second_idx]
                self.graph.add_edge(first_word, second_word)
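The constructor above expects ../gen_pos_graph.txt to hold one edge per line as a pair of integers, each an index into the node permutation. A self-contained sketch of the same mapping with in-memory stand-ins (nodeperm, graphdict and the edge list are all hypothetical):

import networkx as nx

nodeperm = {0: 'k0', 1: 'k1', 2: 'k2'}            # index -> permuted key
graphdict = {'k0': 'DT', 'k1': 'NN', 'k2': 'VB'}  # key -> word
graph = nx.DiGraph()
for line in ["0 1", "1 2", "2 99"]:               # "first second" per line
    first, second = map(int, line.split())
    if first >= len(nodeperm) or second >= len(nodeperm):
        continue                                  # skip out-of-range indices
    graph.add_edge(graphdict[nodeperm[first]], graphdict[nodeperm[second]])
print(sorted(graph.edges()))                      # [('DT', 'NN'), ('NN', 'VB')]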
Example #6
class PerceptronTagger:
    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.

    See more implementation details here:
        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/

    :param load: Load the pickled model upon instantiation.
    '''
    START = ['-START-', '-START2-', '-START3-']
    END = ['-END-', '-END2-', '-END3-']

    # START = ['-START-', '-START2-']
    # END = ['-END-', '-END2-']

    def __init__(self, fname, load=True):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        self.model_file = fname
        if load:
            self.load(self.model_file)

    def tag(self, corpus, tokenise=False):
        '''Tags an open CoNLL-U file object `corpus`, printing tagged rows.'''
        # Assume untokenised corpus has \n between sentences and ' ' between words
        #s_split = SentenceTokenizer().tokenise if tokenise else lambda t: t.split('\n')
        #w_split = WordTokenizer().tokenise if tokenise else lambda s: s.split()

        reading = True
        sentence = []
        line = corpus.readline()
        while reading:
            if line == '\n':
                # sentence boundary
                prev, prev2, prev3 = self.START
                context = self.START + [
                    self._normalise(w[1]) for w in sentence
                ] + self.END
                for i, token in enumerate(sentence):
                    tag = self.tagdict.get(token[1])
                    if not tag:
                        # if the word isn't "unambiguous", extract features
                        features = self._get_features(
                            i, token[1], context, prev, prev2, prev3)
                        # make the prediction
                        tag = self.model.predict(features)
                    sentence[i][3] = tag
                    prev3 = prev2
                    prev2 = prev
                    prev = tag
                # print out the tokens and their tags
                for row in sentence:
                    print('\t'.join(row))
                print()
                sentence = []
            elif line == '':
                # we reached the end of the input
                reading = False
            elif line[0] == '#':
                # line is a comment line
                print(line.strip())
                line = corpus.readline()
                continue
            else:
                # normal conllu line
                row = line.strip().split('\t')
                sentence.append(row)

            # read the next line
            line = corpus.readline()

        return

    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.

        :param sentences: A list of sentences, each a list of 10-field CoNLL-U rows.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for sentence in sentences:
                print(n, end='', file=sys.stderr)
                prev, prev2, prev3 = self.START
                context = self.START + [
                    self._normalise(w[1]) for w in sentence
                ] + self.END
                tags = [w[3] for w in sentence]
                for i, token in enumerate(sentence):
                    word = token[1]
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features(i, word, context, prev,
                                                   prev2, prev3)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev3 = prev2
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
                print('\r', end='', file=sys.stderr)
            random.shuffle(sentences)
            print()
            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)),
                  file=sys.stderr)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                        open(save_loc, 'wb'), -1)
        return None

    def load(self, loc):
        '''Load a pickled model.'''
        try:
            w_td_c = pickle.load(open(loc, 'rb'))
        except IOError:
            print("Missing " + loc + " file.")
            sys.exit(-1)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None

    def _normalise(self, word):
        '''Normalisation used in pre-processing.

        - Hyphenated words are represented as !HYPHEN
        - All words are lower cased
        - Four-digit numbers are represented as !YEAR
        - Other digits are represented as !DIGITS

        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2, prev3):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name, ) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(float)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        # add some features @hw
        add('i-3 tag', prev3)
        add('i tag+i-2 tag+i-3 tag', prev, prev2, prev3)
        add('i-2 tag+i-1 tag+i word', prev, prev2, context[i])
        add('i-3 tag+i-2 tag+i-1 tag+i word', prev, prev2, prev3, context[i])
        add('i-3 word', context[i - 3])
        add('i-1 suffix2', context[i - 1][-2:])
        add('i-1 suffix1', context[i - 1][-1:])
        add('i+1 suffix2', context[i + 1][-2:])
        add('i+1 suffix1', context[i + 1][-1:])
        # print(word, '|||', features)
        return features

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for sentence in sentences:
            for token in sentence:
                word = token[1]
                tag = token[3]
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag
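The thresholds in _make_tagdict deserve a worked example: a word is frozen to a single tag only when it is frequent (n >= 20) and nearly unambiguous (its modal tag covers at least 97% of occurrences). With hypothetical counts:

counts = {'the': {'DET': 1000},             # frequent and unambiguous -> frozen
          'back': {'NOUN': 50, 'ADV': 40},  # frequent but ambiguous   -> skipped
          'zygote': {'NOUN': 3}}            # unambiguous but rare     -> skipped
tagdict = {}
for word, tag_freqs in counts.items():
    tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
    n = sum(tag_freqs.values())
    if n >= 20 and (float(mode) / n) >= 0.97:
        tagdict[word] = tag
print(tagdict)  # {'the': 'DET'}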
Example #7
class PerceptronTagger(BaseTagger):
    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    def __init__(self, load=None):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        self.graphdict = pickle.load(open("../pos_dict.pickle", "rb"))
        self.nodeperm = pickle.load(open("../pos_nodeperm_dict.pickle", "rb"))
        self.graph = nx.DiGraph()
        if load:
            self.load(load)
        with open("../gen_pos_graph.txt", "r") as pos_file:
            for line in pos_file:
                first, second = map(int, line.split())
                if first >= len(self.nodeperm) or second >= len(self.nodeperm):
                    continue
                first_idx = self.nodeperm[first]
                first_word = self.graphdict[first_idx]
                second_idx = self.nodeperm[second]
                second_word = self.graphdict[second_idx]
                self.graph.add_edge(first_word, second_word)

    def tag(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            context = self.START + [self._normalize(w) for w in sentence] + self.END
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev, prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def tag_graph(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features_graph(i, word, prev, self.graph)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def tag_graph2(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features_graph2(i, word, prev, self.graph)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def tag_graph_deg(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features_graph_deg(i, word, prev, self.graph)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def tag_ngram(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features_ngram(i, word, prev)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print "iteration: " ,iter_
            for tups in sentences:
                words = map(operator.itemgetter(0), tups)
                tags = map(operator.itemgetter(1), tups)
                prev, prev2 = self.START #do this complicatedly
                context = self.START + [self._normalize(w) for w in words] \
                                                                    + self.END
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        # this is the operative part
                        feats = self._get_features(i, word, context, prev, prev2)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                         open(save_loc, 'wb'), -1)
        return None

    def train_ngram(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model with n-gram features, and save it at ``save_loc``.
        ``nr_iter`` controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print "iteration: " ,iter_
            for tups in sentences:
                if n % 1000 == 0:
                    print "n : ", n
                words = map(operator.itemgetter(0), tups)
                tags = map(operator.itemgetter(1), tups)
                prev = self.START[0]
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features_ngram(i, word, prev)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                         open(save_loc, 'wb'), -1)
        return None

    def train_graph(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model with graph-based features, and save it at ``save_loc``.
        ``nr_iter`` controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print "iteration: " ,iter_
            for tups in sentences:
                if n % 1000 == 0:
                    print "n : ", n
                words = map(operator.itemgetter(0), tups)
                tags = map(operator.itemgetter(1), tups)
                prev = self.START[0]
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features_graph(i, word, prev, self.graph)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                         open(save_loc, 'wb'), -1)
        return None

    def train_graph2(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model with graph-based features, and save it at ``save_loc``.
        ``nr_iter`` controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print "iteration: " ,iter_
            for tups in sentences:
                if n % 1000 == 0:
                    print "n : ", n
                words = map(operator.itemgetter(0), tups)
                tags = map(operator.itemgetter(1), tups)
                prev = self.START[0]
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features_graph2(i, word, prev, self.graph)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                         open(save_loc, 'wb'), -1)
        return None

    def train_graph_deg(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model with graph-based features, and save it at ``save_loc``.
        ``nr_iter`` controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print "iteration: " ,iter_
            for tups in sentences:
                if n % 1000 == 0:
                    print "n : ", n
                words = map(operator.itemgetter(0), tups)
                tags = map(operator.itemgetter(1), tups)
                prev = self.START[0]
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features_graph_deg(i, word, prev, self.graph)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                         open(save_loc, 'wb'), -1)
        return None

    def load(self, loc):
        '''Load a pickled model.'''
        try:
            w_td_c = pickle.load(open(loc, 'rb'))
        except IOError:
            msg = "Missing trontagger.pickle file."
            raise MissingCorpusError(msg)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None

    def _normalize(self, word):
        '''Normalization used in pre-processing.

        - All words are lower cased
        - Four-digit numbers are represented as !YEAR;
        - Other digits are represented as !DIGITS

        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        return features

    def _get_features_graph(self, i, word, prev, graph):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        #get the i-1 tag from the graph
        #therefore, the graph should be a digraph
        #see how that performance works
        for parent, _ in graph.in_edges([prev]):
            add('i-1 tag parent', parent)
        return features

    def _get_features_graph2(self, i, word, prev, graph):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        #get the i-1 tag from the graph
        #therefore, the graph should be a digraph
        #see how that performance works
        for _, child in graph.out_edges([prev]):
            add('i-1 tag children', child)
        return features

    def _get_features_graph_deg(self, i, word, prev, graph):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        #get the i-1 tag from the graph
        #therefore, the graph should be a digraph
        #see how that performance works
        deg = graph.degree(prev)
        if isinstance(deg, int):
            add('i-1 tag degree', str(deg))
        else:  # older networkx returns a dict of node -> degree
            add('i-1 tag degree', str(deg.values()))
        return features

    def _get_features_ngram(self, i, word, prev):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i prev', prev)
        return features

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for sentence in sentences:
            for word, tag in sentence:
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag
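The isinstance check in _get_features_graph_deg above guards against a networkx API difference: older releases could return a dict from degree(), while a single-node query in modern networkx returns an int. A small version-tolerant helper, as a sketch (node_degree is an invented name; assumes graph is an nx.DiGraph):

import networkx as nx

def node_degree(graph, node):
    # Degree of a single node as an int; 0 for nodes absent from the graph.
    if node not in graph:
        return 0
    return int(graph.degree(node))

g = nx.DiGraph()
g.add_edge('DT', 'NN')
print(node_degree(g, 'NN'))  # 1 (in-degree + out-degree)
print(node_degree(g, 'VB'))  # 0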
class PerceptronTaggerLabelPowerset(object):

    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
    See more implementation details here:
        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    :param load: Load the pickled model upon instantiation.
    '''

    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    POSITIVE_CLASS = 1.0
    NEGATIVE_CLASS = 0.0

    def __init__(self, individual_tags, tag_history, combo_freq_threshold=1, tag_plus_word=0, tag_ngram_size=0):

        self.combo_freq_threshold = combo_freq_threshold
        self.classes = set()
        self.model = None
        self.individual_tags = set(individual_tags)

        self.tag_history = tag_history
        self.tag_plus_word = tag_plus_word
        self.tag_ngram_size = tag_ngram_size

    def _add_tag_features(self, wd, feats, prev_tags):
        for ix, prev in enumerate(prev_tags[-self.tag_history:]):
            offset = ix - self.tag_history
            feats["HIST_TAG " + str(offset) + " : " + str(prev)] = 1

        for ix, prev in enumerate(prev_tags[-self.tag_plus_word:]):
            offset = ix - self.tag_plus_word
            feats["[HIST_TAG | wd] " + str(offset) + " : " + str(prev) + "|" + wd] = 1

        if self.tag_ngram_size > 0:
            tag_hist = prev_tags[-self.tag_ngram_size:]
            tag_ngram = "|".join(map(str, tag_hist))
            feats["HIST_TAG_NGRAM: " + tag_ngram] = 1

    def predict(self, essay_feats, output_scores=False):
        '''Predicts tags for each word in ``essay_feats``.
            Outputs a dictionary mapping each class to a list of binary predictions
        '''

        class2predictions = defaultdict(list)
        for essay_ix, essay in enumerate(essay_feats):
            for sent_ix, tagged_sentence in enumerate(essay.sentences):
                """ Start Sentence """
                class2prev = defaultdict(list)
                for cls in self.classes:
                    class2prev[cls] = list(self.START)

                prev = list(self.START)
                for i, wd in enumerate(tagged_sentence):
                    # Don't mutate the feat dictionary
                    tagger_feats = dict(wd.features.items())
                    tagger_feats["bias"] = 1
                    # get all tagger predictions for previous 2 tags

                    self._add_tag_features(wd.word, tagger_feats, prev)

                    scores_by_class = self.model.decision_function(tagger_feats)
                    guess = max(self.model.classes, key=lambda label: (scores_by_class[label], label))
                    prev.append(guess)

                    if output_scores:
                        max_score_per_class = defaultdict(float)
                        for fset_tags, score in scores_by_class.items():
                            for tag in fset_tags:
                                max_score_per_class[tag] = max(max_score_per_class[tag], score)

                        for cls in self.individual_tags:
                            class2predictions[cls].append(max_score_per_class[cls])
                    else:
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in guess else 0)

        np_class2predictions = dict()
        for key, lst in class2predictions.items():
            np_class2predictions[key] = np.asarray(lst)
        return np_class2predictions

    def decision_function(self, essay_feats):
        '''Scores each word in ``essay_feats``.
            Outputs a dictionary mapping each class to a list of scores
        '''
        return self.predict(essay_feats, output_scores=True)

    def __get_tags_(self, tags):
        return frozenset((t for t in tags if t in self.individual_tags))

    def train(self, essay_feats, nr_iter=5, verbose=True, average_weights=True):
        '''Train a model from sentences. ``nr_iter`` controls the number of
        Perceptron training iterations.
        @param essay_feats:     A list of essay feature objects.
        @param nr_iter:         Number of training iterations.
        @param verbose:         Print learning progress
        '''

        # Copy, as we do an in-place shuffle below
        cp_essay_feats = list(essay_feats)

        if self.model is None:
            tag_freq = defaultdict(int)
            for essay in cp_essay_feats:
                for tagged_sentence in essay.sentences:
                    for wd in tagged_sentence:
                        fs_tags = self.__get_tags_(wd.tags)
                        tag_freq[fs_tags] += 1

            self.classes = {fs for fs, cnt in tag_freq.items() if cnt >= self.combo_freq_threshold}
            self.model = AveragedPerceptron(self.classes)

        for iter_ in range(nr_iter):
            class2predictions = defaultdict(list)
            class2tags = defaultdict(list)

            for essay_ix, essay in enumerate(cp_essay_feats):
                for sent_ix, tagged_sentence in enumerate(essay.sentences):
                    """ Start Sentence """
                    prev = list(self.START)

                    for i, wd in enumerate(tagged_sentence):
                        # Don't mutate the feat dictionary
                        tagger_feats = dict(wd.features.items())
                        tagger_feats["bias"] = 1

                        # get all tagger predictions for previous 2 tags
                        self._add_tag_features(wd.word, tagger_feats, prev)
                        # add more in depth features for this tag
                        actual = self.__get_tags_(wd.tags)

                        guess = self.model.predict(tagger_feats)
                        self.model.update(actual, guess, tagger_feats)

                        prev.append(guess)
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in guess else 0)
                            class2tags[cls].append(1 if cls in actual else 0)

            random.shuffle(cp_essay_feats)
            if verbose:
                class2metrics = ResultsProcessor.compute_metrics(class2tags, class2predictions)
                micro_metrics = micro_rpfa(class2metrics.values())
                logging.info("Iter {0}: Micro Avg Metrics: {1}".format(iter_, str(micro_metrics)))

        if average_weights:
            self.model.average_weights()
        return None

    def _normalize(self, word):
        '''Normalization used in pre-processing.
        - All words are lower cased
        - Four-digit numbers are represented as !YEAR;
        - Other digits are represented as !DIGITS
        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i-1])
        add('i-1 suffix', context[i-1][-3:])
        add('i-2 word', context[i-2])
        add('i+1 word', context[i+1])
        add('i+1 suffix', context[i+1][-3:])
        add('i+2 word', context[i+2])
        return features
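The label-powerset trick used by PerceptronTaggerLabelPowerset is easy to miss in the code above: each distinct combination of tags becomes one perceptron class, built as a frozenset by __get_tags_. In miniature:

individual_tags = {'A', 'B'}

def get_tags(tags):
    # Mirror of __get_tags_: keep only known tags, as a hashable class label.
    return frozenset(t for t in tags if t in individual_tags)

print(get_tags({'A'}))       # frozenset({'A'})
print(get_tags({'A', 'B'}))  # frozenset({'A', 'B'}) -- a distinct class
print(get_tags({'C'}))       # frozenset() -- the empty combination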
Example #9
from classifier import BinaryClassifier
from perceptron import Perceptron, AveragedPerceptron
from naive_bayes import NaiveBayes
from utils import read_data, build_vocab
import utils
from config import args

if __name__ == '__main__':
    filepath = '../data/given/'
    build_vocab(filepath, vocab_size=args.vocab_size)
    train_data, test_data = read_data(filepath)

    perc_classifier = Perceptron(args)
    perc_classifier.fit(train_data)
    acc, prec, rec, f1 = perc_classifier.evaluate(test_data)
    print('Perceptron Results:')
    print('Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f' % (acc, prec, rec, f1))

    avg_perc_classifier = AveragedPerceptron(args)
    avg_perc_classifier.fit(train_data)
    acc, prec, rec, f1 = avg_perc_classifier.evaluate(test_data)
    print('\nAveraged Perceptron Results:')
    print('Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f' % (acc, prec, rec, f1))

    nb_classifier = NaiveBayes(args)
    nb_classifier.fit(train_data)
    acc, prec, rec, f1 = nb_classifier.evaluate(test_data)
    print('\nNaive Bayes Performance:')
    print('Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f' % (acc, prec, rec, f1))
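For reference, the accuracy/precision/recall/F1 figures printed by the script reduce to the usual confusion-matrix formulas; a quick sketch with hypothetical counts (the classifiers' own evaluate methods are not shown here):

def prf(tp, fp, fn, tn):
    # Standard binary-classification metrics from confusion-matrix counts.
    acc = (tp + tn) / float(tp + fp + fn + tn)
    prec = tp / float(tp + fp) if tp + fp else 0.0
    rec = tp / float(tp + fn) if tp + fn else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
    return acc, prec, rec, f1

print('Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f' % prf(40, 10, 5, 45))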
class PerceptronTaggerMultiClassCombo(object):
    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
    See more implementation details here:
        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    :param load: Load the pickled model upon instantiation.
    '''

    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    POSITIVE_CLASS = 1.0
    NEGATIVE_CLASS = 0.0

    def __init__(self,
                 individual_tags,
                 tag_history,
                 combo_freq_threshold,
                 load=False,
                 use_tag_features=True):
        self.use_tag_features = use_tag_features
        self.combo_freq_threshold = combo_freq_threshold
        self.tag_history = tag_history
        self.classes = set()
        self.individual_tags = set(individual_tags)

    def _add_tag_features(self, feats, word, prev, prev2):
        sprev, sprev2 = str(prev), str(prev2)
        feats["bias"] = 1
        # Commenting out the single previous tag features as included with the
        # tag history parameter
        #feats["TAG -1 " + sprev]                  =      1     # included in other
        feats["TAG -1 wd " + sprev + "|" + word] = 1
        #feats["TAG -2 " + sprev2]                 =      1     # included in other
        feats["TAG -2 wd " + sprev2 + "|" + word] = 1
        feats["TAG -1, -2 " + sprev + "|" + sprev2] = 1

    def _add_secondary_tag_features(self, feats, prev_tags):
        for ix, prev in enumerate(prev_tags[-self.tag_history:]):
            offset = ix - self.tag_history
            feats["HIST_TAG " + str(offset) + " " + str(prev)] = 1

    def predict(self, essay_feats, output_scores=False):
        '''Predicts tags for each word in ``essay_feats``.
            Outputs a dictionary mapping each class to a list of binary predictions
        '''

        class2predictions = defaultdict(list)
        for essay_ix, essay in enumerate(essay_feats):
            for sent_ix, tagged_sentence in enumerate(essay.sentences):
                """ Start Sentence """
                class2prev = defaultdict(list)
                for cls in self.classes:
                    class2prev[cls] = list(self.START)

                prev = list(self.START)
                for i, wd in enumerate(tagged_sentence):
                    # Don't mutate the feat dictionary
                    shared_features = dict(wd.features.items())
                    # get all tagger predictions for previous 2 tags

                    self._add_secondary_tag_features(shared_features, prev)

                    tagger_feats = dict(shared_features.items())
                    if self.use_tag_features:
                        self._add_tag_features(tagger_feats, wd.word, prev[-1],
                                               prev[-2])

                    scores_by_class = self.model.decision_function(
                        tagger_feats)
                    guess = max(self.model.classes,
                                key=lambda label:
                                (scores_by_class[label], label))
                    prev.append(guess)

                    if output_scores:
                        max_score_per_class = defaultdict(float)
                        for fset_tags, score in scores_by_class.items():
                            for tag in fset_tags:
                                max_score_per_class[tag] = max(
                                    max_score_per_class[tag], score)

                        for cls in self.individual_tags:
                            class2predictions[cls].append(
                                max_score_per_class[cls])
                    else:
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in
                                                          guess else 0)

        np_class2predictions = dict()
        for key, lst in class2predictions.items():
            np_class2predictions[key] = np.asarray(lst)
        return np_class2predictions

    def decision_function(self, essay_feats):
        '''Scores each word in ``essay_feats``.
            Outputs a dictionary mapping each class to a list of scores
        '''
        return self.predict(essay_feats, output_scores=True)

    def __get_tags_(self, tags):
        return frozenset((t for t in tags if t in self.individual_tags))

    def train(self, essay_feats, save_loc=None, nr_iter=5, verbose=True):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.
        :param essay_feats: A list of essay feature objects.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''

        # Copy, as we do an in-place shuffle below
        cp_essay_feats = list(essay_feats)

        tag_freq = defaultdict(int)
        for essay in cp_essay_feats:
            for tagged_sentence in essay.sentences:
                for wd in tagged_sentence:
                    fs_tags = self.__get_tags_(wd.tags)
                    tag_freq[fs_tags] += 1

        self.classes = {
            fs for fs, cnt in tag_freq.items()
            if cnt >= self.combo_freq_threshold
        }
        self.model = AveragedPerceptron(self.classes)

        for iter_ in range(nr_iter):
            class2predictions = defaultdict(list)
            class2tags = defaultdict(list)

            for essay_ix, essay in enumerate(cp_essay_feats):
                for sent_ix, tagged_sentence in enumerate(essay.sentences):
                    """ Start Sentence """
                    prev = list(self.START)

                    for i, wd in enumerate(tagged_sentence):
                        # Don't mutate the feat dictionary
                        shared_features = dict(wd.features.items())
                        # get all tagger predictions for previous 2 tags
                        self._add_secondary_tag_features(shared_features, prev)

                        tagger_feats = dict(shared_features.items())
                        # add more in depth features for this tag
                        actual = self.__get_tags_(wd.tags)

                        if self.use_tag_features:
                            self._add_tag_features(tagger_feats, wd.word,
                                                   prev[-1], prev[-2])

                        guess = self.model.predict(tagger_feats)
                        self.model.update(actual, guess, tagger_feats)

                        prev.append(guess)
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in
                                                          guess else 0)
                            class2tags[cls].append(1 if cls in actual else 0)

            random.shuffle(cp_essay_feats)
            class2metrics = ResultsProcessor.compute_metrics(
                class2tags, class2predictions)
            micro_metrics = micro_rpfa(class2metrics.values())
            if verbose:
                logging.info("Iter {0}: Micro Avg Metrics: {1}".format(
                    iter_, str(micro_metrics)))

        self.model.average_weights()
        return None

    def _normalize(self, word):
        '''Normalization used in pre-processing.
        - All words are lower cased
        - Four-digit numbers are represented as !YEAR;
        - Other digits are represented as !DIGITS
        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name, ) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features
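Finally, a hypothetical end-to-end use of PerceptronTaggerMultiClassCombo, assuming essay feature objects shaped as the code above expects (essay.sentences -> sentences -> words carrying .word, .features and .tags); the tag names and the train/test variables are placeholders:

tagger = PerceptronTaggerMultiClassCombo(individual_tags={'Causer', 'Result'},
                                         tag_history=5, combo_freq_threshold=5)
tagger.train(train_essay_feats, nr_iter=5, verbose=True)   # placeholder data
class2preds = tagger.predict(test_essay_feats)             # {tag: np.array of 0/1}
class2scores = tagger.decision_function(test_essay_feats)  # {tag: np.array of scores}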