Example #1
    def __init__(self, fname, load=True):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        self.model_file = fname
        if load:
            self.load(self.model_file)
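All of these examples construct an AveragedPerceptron without showing its definition. A minimal sketch of the interface the snippets rely on (predict, update, average_weights), loosely following Honnibal's reference implementation, is given below; the exact details vary between the repositories these examples come from, so treat this as an assumption rather than the canonical class.

from collections import defaultdict

class AveragedPerceptron(object):
    def __init__(self, classes=None):
        self.weights = {}                  # feature -> {class: weight}
        self.classes = set(classes or [])
        self._totals = defaultdict(int)    # accumulated (feature, class) weights
        self._tstamps = defaultdict(int)   # timestep of each weight's last update
        self.i = 0                         # number of updates seen

    def predict(self, features):
        '''Dot-product the features with each class's weights; return the best class.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            for label, weight in self.weights[feat].items():
                scores[label] += value * weight
        # break ties deterministically via the label's string form
        return max(self.classes, key=lambda label: (scores[label], str(label)))

    def update(self, truth, guess, features):
        '''Reward the true class and penalise the guess for each active feature.'''
        def upd_feat(c, f, w, v):
            key = (f, c)
            # bank the weight as it stood since its last change, then move it
            self._totals[key] += (self.i - self._tstamps[key]) * w
            self._tstamps[key] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)

    def average_weights(self):
        '''Replace each weight with its average over all update steps.'''
        for feat, weights in self.weights.items():
            new_weights = {}
            for label, weight in weights.items():
                key = (feat, label)
                total = self._totals[key] + (self.i - self._tstamps[key]) * weight
                averaged = round(total / float(self.i), 3) if self.i else 0.0
                if averaged:
                    new_weights[label] = averaged
            self.weights[feat] = new_weights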
Example #2
    def __init__(self, fname, load=True):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        # initialise GloVe word vectors via the gensim downloader
        self.wv = api.load('glove-twitter-25')
        self.model_file = fname
        if load:
            self.load(self.model_file)
    def train(self, essay_feats, save_loc=None, nr_iter=5, verbose=True):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.
        :param essay_feats: A list of essay feature objects.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''

        # Copy, as we do an in-place shuffle below
        cp_essay_feats = list(essay_feats)

        tag_freq = defaultdict(int)
        for essay in cp_essay_feats:
            for tagged_sentence in essay.sentences:
                for wd in tagged_sentence:
                    fs_tags = self.__get_tags_(wd.tags)
                    tag_freq[fs_tags] += 1

        self.classes = {fs for fs, cnt in tag_freq.items() if cnt >= self.combo_freq_threshold}
        self.model = AveragedPerceptron(self.classes)

        for iter_ in range(nr_iter):
            class2predictions = defaultdict(list)
            class2tags = defaultdict(list)

            for essay_ix, essay in enumerate(cp_essay_feats):
                for sent_ix, tagged_sentence in enumerate(essay.sentences):
                    """ Start Sentence """
                    prev = list(self.START)

                    for i, wd in enumerate(tagged_sentence):
                        # Don't mutate the feat dictionary
                        shared_features = dict(wd.features.items())
                        # get all tagger predictions for previous 2 tags
                        self._add_secondary_tag_features(shared_features, prev)

                        tagger_feats = dict(shared_features.items())
                        # add more in depth features for this tag
                        actual = self.__get_tags_(wd.tags)

                        if self.use_tag_features:
                            self._add_tag_features(tagger_feats, wd.word, prev[-1], prev[-2])

                        guess = self.model.predict(tagger_feats)
                        self.model.update(actual, guess, tagger_feats)

                        prev.append(guess)
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in guess else 0)
                            class2tags[cls].append(1 if cls in actual else 0)

            random.shuffle(cp_essay_feats)
            class2metrics = ResultsProcessor.compute_metrics(class2tags, class2predictions)
            micro_metrics = micro_rpfa(class2metrics.values())
            if verbose:
                logging.info("Iter {0}: Micro Avg Metrics: {1}".format(iter_, str(micro_metrics)))

        self.model.average_weights()
        return None
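Example #2 loads GloVe Twitter vectors in __init__ but the excerpt never shows them being used. One plausible, purely hypothetical use is to fold the embedding into the feature dictionary, one real-valued feature per dimension (add_embedding_features is an invented helper name; wv is the gensim KeyedVectors object loaded above):

def add_embedding_features(wv, feats, word):
    # Hypothetical: add each embedding dimension as a real-valued feature.
    if word in wv:  # gensim KeyedVectors supports membership tests
        for d, value in enumerate(wv[word]):
            feats['emb_%d' % d] = float(value)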
Example #4
    def __init__(self, classes, tag_history):
        self.tag_history = tag_history
        self.classes = set(classes)
        self.class2model = {}
        for cls in classes:
            self.class2model[cls] = AveragedPerceptron()
            self.class2model[cls].classes = {
                self.NEGATIVE_CLASS, self.POSITIVE_CLASS}
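Example #4 keeps one binary AveragedPerceptron per tag rather than a single multi-class model. A sketch of how prediction could then combine them, querying every per-tag model and keeping the positives (predict_tags is a hypothetical method name; it assumes each model returns POSITIVE_CLASS or NEGATIVE_CLASS as configured above):

    def predict_tags(self, features):
        # Ask every per-tag binary model; keep the tags predicted positive.
        tags = set()
        for cls, model in self.class2model.items():
            if model.predict(features) == self.POSITIVE_CLASS:
                tags.add(cls)
        return tags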
Example #5
    def __init__(self, load=None):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        self.graphdict = pickle.load(open("../pos_dict.pickle", "rb"))
        self.nodeperm = pickle.load(open("../pos_nodeperm_dict.pickle", "rb"))
        self.graph = nx.DiGraph()
        if load:
            self.load(load)
        with open("../gen_pos_graph.txt", "r") as pos_file:
            for line in pos_file:
                first, second = map(int, line.split())
                if first >= len(self.nodeperm) or second >= len(self.nodeperm):
                    continue
                first_idx = self.nodeperm[first]
                first_word = self.graphdict[first_idx]
                second_idx = self.nodeperm[second]
                second_word = self.graphdict[second_idx]
                self.graph.add_edge(first_word, second_word)
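The constructor above expects ../gen_pos_graph.txt to hold one edge per line as a pair of integers, each an index into the node permutation. A self-contained sketch of the same mapping with in-memory stand-ins (nodeperm, graphdict and the edge list are all hypothetical):

import networkx as nx

nodeperm = {0: 'k0', 1: 'k1', 2: 'k2'}            # index -> permuted key
graphdict = {'k0': 'DT', 'k1': 'NN', 'k2': 'VB'}  # key -> word
graph = nx.DiGraph()
for line in ["0 1", "1 2", "2 99"]:               # "first second" per line
    first, second = map(int, line.split())
    if first >= len(nodeperm) or second >= len(nodeperm):
        continue                                  # skip out-of-range indices
    graph.add_edge(graphdict[nodeperm[first]], graphdict[nodeperm[second]])
print(sorted(graph.edges()))                      # [('DT', 'NN'), ('NN', 'VB')]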
Example #6
class PerceptronTagger:
    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.

    See more implementation details here:
        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/

    :param load: Load the pickled model upon instantiation.
    '''
    START = ['-START-', '-START2-', '-START3-']
    END = ['-END-', '-END2-', '-END3-']

    # START = ['-START-', '-START2-']
    # END = ['-END-', '-END2-']

    def __init__(self, fname, load=True):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        self.model_file = fname
        if load:
            self.load(self.model_file)

    def tag(self, corpus, tokenise=False):
        '''Tags an open CoNLL-U file object `corpus`, printing tagged rows.'''
        # Assume untokenised corpus has \n between sentences and ' ' between words
        #s_split = SentenceTokenizer().tokenise if tokenise else lambda t: t.split('\n')
        #w_split = WordTokenizer().tokenise if tokenise else lambda s: s.split()

        reading = True
        sentence = []
        line = corpus.readline()
        while reading:
            if line == '\n':
                # sentence boundary
                prev, prev2, prev3 = self.START
                context = self.START + [
                    self._normalise(w[1]) for w in sentence
                ] + self.END
                for i, token in enumerate(sentence):
                    tag = self.tagdict.get(token[1])
                    if not tag:
                        # if the word isn't "unambiguous", extract features
                        features = self._get_features(
                            i, token[1], context, prev, prev2, prev3)
                        # make the prediction
                        tag = self.model.predict(features)
                    sentence[i][3] = tag
                    prev3 = prev2
                    prev2 = prev
                    prev = tag
                # print out the tokens and their tags
                for row in sentence:
                    print('\t'.join(row))
                print()
                sentence = []
            elif line == '':
                # we reached the end of the input
                reading = False
            elif line[0] == '#':
                # line is a comment line
                print(line.strip())
                line = corpus.readline()
                continue
            else:
                # normal conllu line
                row = line.strip().split('\t')
                sentence.append(row)

            # read the next line
            line = corpus.readline()

        return

    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.

        :param sentences: A list of sentences, each a list of 10-field CoNLL-U rows.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for sentence in sentences:
                print(n, end='', file=sys.stderr)
                prev, prev2, prev3 = self.START
                context = self.START + [
                    self._normalise(w[1]) for w in sentence
                ] + self.END
                tags = [w[3] for w in sentence]
                for i, token in enumerate(sentence):
                    word = token[1]
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features(i, word, context, prev,
                                                   prev2, prev3)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev3 = prev2
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
                print('\r', end='', file=sys.stderr)
            random.shuffle(sentences)
            print()
            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)),
                  file=sys.stderr)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                        open(save_loc, 'wb'), -1)
        return None

    def load(self, loc):
        '''Load a pickled model.'''
        try:
            w_td_c = pickle.load(open(loc, 'rb'))
        except IOError:
            print("Missing " + loc + " file.")
            sys.exit(-1)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None

    def _normalise(self, word):
        '''Normalisation used in pre-processing.

        - Hyphenated words are represented as !HYPHEN
        - All words are lower cased
        - Four-digit numbers are represented as !YEAR
        - Other digits are represented as !DIGITS

        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2, prev3):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name, ) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(float)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        # add some features @hw
        add('i-3 tag', prev3)
        add('i tag+i-2 tag+i-3 tag', prev, prev2, prev3)
        add('i-2 tag+i-1 tag+i word', prev, prev2, context[i])
        add('i-3 tag+i-2 tag+i-1 tag+i word', prev, prev2, prev3, context[i])
        add('i-3 word', context[i - 3])
        add('i-1 suffix2', context[i - 1][-2:])
        add('i-1 suffix1', context[i - 1][-1:])
        add('i+1 suffix2', context[i + 1][-2:])
        add('i+1 suffix1', context[i + 1][-1:])
        # print(word, '|||', features)
        return features

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for sentence in sentences:
            for token in sentence:
                word = token[1]
                tag = token[3]
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag
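The thresholds in _make_tagdict deserve a worked example: a word is frozen to a single tag only when it is frequent (n >= 20) and nearly unambiguous (its modal tag covers at least 97% of occurrences). With hypothetical counts:

counts = {'the': {'DET': 1000},             # frequent and unambiguous -> frozen
          'back': {'NOUN': 50, 'ADV': 40},  # frequent but ambiguous   -> skipped
          'zygote': {'NOUN': 3}}            # unambiguous but rare     -> skipped
tagdict = {}
for word, tag_freqs in counts.items():
    tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
    n = sum(tag_freqs.values())
    if n >= 20 and (float(mode) / n) >= 0.97:
        tagdict[word] = tag
print(tagdict)  # {'the': 'DET'}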
Example #7
class PerceptronTagger(BaseTagger):
    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    def __init__(self, load=None):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        self.graphdict = pickle.load(open("../pos_dict.pickle", "rb"))
        self.nodeperm = pickle.load(open("../pos_nodeperm_dict.pickle", "rb"))
        self.graph = nx.DiGraph()
        if load:
            self.load(load)
        with open("../gen_pos_graph.txt", "r") as pos_file:
            for line in pos_file:
                first, second = map(int, line.split())
                if first >= len(self.nodeperm) or second >= len(self.nodeperm):
                    continue
                first_idx = self.nodeperm[first]
                first_word = self.graphdict[first_idx]
                second_idx = self.nodeperm[second]
                second_word = self.graphdict[second_idx]
                self.graph.add_edge(first_word, second_word)

    def tag(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            context = self.START + [self._normalize(w) for w in sentence] + self.END
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev, prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def tag_graph(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features_graph(i, word, prev, self.graph)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def tag_graph2(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features_graph2(i, word, prev, self.graph)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def tag_graph_deg(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features_graph_deg(i, word, prev, self.graph)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def tag_ngram(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features_ngram(i, word, prev)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print "iteration: " ,iter_
            for tups in sentences:
                words = map(operator.itemgetter(0), tups)
                tags = map(operator.itemgetter(1), tups)
                prev, prev2 = self.START #do this complicatedly
                context = self.START + [self._normalize(w) for w in words] \
                                                                    + self.END
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        # this is the operative part
                        feats = self._get_features(i, word, context, prev, prev2)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                         open(save_loc, 'wb'), -1)
        return None

    def train_ngram(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model with n-gram features, and save it at ``save_loc``.
        ``nr_iter`` controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print "iteration: " ,iter_
            for tups in sentences:
                if n % 1000 == 0:
                    print "n : ", n
                words = map(operator.itemgetter(0), tups)
                tags = map(operator.itemgetter(1), tups)
                prev = self.START[0]
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features_ngram(i, word, prev)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                         open(save_loc, 'wb'), -1)
        return None

    def train_graph(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model with graph-based features, and save it at ``save_loc``.
        ``nr_iter`` controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print "iteration: " ,iter_
            for tups in sentences:
                if n % 1000 == 0:
                    print "n : ", n
                words = map(operator.itemgetter(0), tups)
                tags = map(operator.itemgetter(1), tups)
                prev = self.START[0]
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features_graph(i, word, prev, self.graph)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                         open(save_loc, 'wb'), -1)
        return None

    def train_graph2(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model with graph-based features, and save it at ``save_loc``.
        ``nr_iter`` controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print "iteration: " ,iter_
            for tups in sentences:
                if n % 1000 == 0:
                    print "n : ", n
                words = map(operator.itemgetter(0), tups)
                tags = map(operator.itemgetter(1), tups)
                prev = self.START[0]
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features_graph2(i, word, prev, self.graph)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                         open(save_loc, 'wb'), -1)
        return None

    def train_graph_deg(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model with graph-based features, and save it at ``save_loc``.
        ``nr_iter`` controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print "iteration: " ,iter_
            for tups in sentences:
                if n % 1000 == 0:
                    print "n : ", n
                words = map(operator.itemgetter(0), tups)
                tags = map(operator.itemgetter(1), tups)
                prev = self.START[0]
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features_graph_deg(i, word, prev, self.graph)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                         open(save_loc, 'wb'), -1)
        return None

    def load(self, loc):
        '''Load a pickled model.'''
        try:
            w_td_c = pickle.load(open(loc, 'rb'))
        except IOError:
            msg = "Missing trontagger.pickle file."
            raise MissingCorpusError(msg)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None

    def _normalize(self, word):
        '''Normalization used in pre-processing.

        - All words are lower cased
        - Four-digit numbers are represented as !YEAR;
        - Other digits are represented as !DIGITS

        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        return features

    def _get_features_graph(self, i, word, prev, graph):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        #get the i-1 tag from the graph
        #therefore, the graph should be a digraph
        #see how that performance works
        for parent, _ in graph.in_edges([prev]):
            add('i-1 tag parent', parent)
        return features

    def _get_features_graph2(self, i, word, prev, graph):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        #get the i-1 tag from the graph
        #therefore, the graph should be a digraph
        #see how that performance works
        for _, child in graph.out_edges([prev]):
            add('i-1 tag children', child)
        return features

    def _get_features_graph_deg(self, i, word, prev, graph):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        #get the i-1 tag from the graph
        #therefore, the graph should be a digraph
        #see how that performance works
        deg = graph.degree(prev)
        if isinstance(deg, int):
            add('i-1 tag degree', str(deg))
        else:  # older networkx returns a dict of node -> degree
            add('i-1 tag degree', str(deg.values()))
        return features

    def _get_features_ngram(self, i, word, prev):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i prev', prev)
        return features

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for sentence in sentences:
            for word, tag in sentence:
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag
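The isinstance check in _get_features_graph_deg above guards against a networkx API difference: older releases could return a dict from degree(), while a single-node query in modern networkx returns an int. A small version-tolerant helper, as a sketch (node_degree is an invented name; assumes graph is an nx.DiGraph):

import networkx as nx

def node_degree(graph, node):
    # Degree of a single node as an int; 0 for nodes absent from the graph.
    if node not in graph:
        return 0
    return int(graph.degree(node))

g = nx.DiGraph()
g.add_edge('DT', 'NN')
print(node_degree(g, 'NN'))  # 1 (in-degree + out-degree)
print(node_degree(g, 'VB'))  # 0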
class PerceptronTaggerLabelPowerset(object):

    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
    See more implementation details here:
        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    :param load: Load the pickled model upon instantiation.
    '''

    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    POSITIVE_CLASS = 1.0
    NEGATIVE_CLASS = 0.0

    def __init__(self, individual_tags, tag_history, combo_freq_threshold=1, tag_plus_word=0, tag_ngram_size=0):

        self.combo_freq_threshold = combo_freq_threshold
        self.classes = set()
        self.model = None
        self.individual_tags = set(individual_tags)

        self.tag_history = tag_history
        self.tag_plus_word = tag_plus_word
        self.tag_ngram_size = tag_ngram_size

    def _add_tag_features(self, wd, feats, prev_tags):
        for ix, prev in enumerate(prev_tags[-self.tag_history:]):
            offset = ix - self.tag_history
            feats["HIST_TAG " + str(offset) + " : " + str(prev)] = 1

        for ix, prev in enumerate(prev_tags[-self.tag_plus_word:]):
            offset = ix - self.tag_plus_word
            feats["[HIST_TAG | wd] " + str(offset) + " : " + str(prev) + "|" + wd] = 1

        if self.tag_ngram_size > 0:
            tag_hist = prev_tags[-self.tag_ngram_size:]
            tag_ngram = "|".join(map(str, tag_hist))
            feats["HIST_TAG_NGRAM: " + tag_ngram] = 1

    def predict(self, essay_feats, output_scores=False):
        '''Predicts tags for each word in ``essay_feats``.
            Outputs a dictionary mapping each class to a list of binary predictions
        '''

        class2predictions = defaultdict(list)
        for essay_ix, essay in enumerate(essay_feats):
            for sent_ix, tagged_sentence in enumerate(essay.sentences):
                """ Start Sentence """
                class2prev = defaultdict(list)
                for cls in self.classes:
                    class2prev[cls] = list(self.START)

                prev = list(self.START)
                for i, wd in enumerate(tagged_sentence):
                    # Don't mutate the feat dictionary
                    tagger_feats = dict(wd.features.items())
                    tagger_feats["bias"] = 1
                    # get all tagger predictions for previous 2 tags

                    self._add_tag_features(wd.word, tagger_feats, prev)

                    scores_by_class = self.model.decision_function(tagger_feats)
                    guess = max(self.model.classes, key=lambda label: (scores_by_class[label], label))
                    prev.append(guess)

                    if output_scores:
                        max_score_per_class = defaultdict(float)
                        for fset_tags, score in scores_by_class.items():
                            for tag in fset_tags:
                                max_score_per_class[tag] = max(max_score_per_class[tag], score)

                        for cls in self.individual_tags:
                            class2predictions[cls].append(max_score_per_class[cls])
                    else:
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in guess else 0)

        np_class2predictions = dict()
        for key, lst in class2predictions.items():
            np_class2predictions[key] = np.asarray(lst)
        return np_class2predictions

    def decision_function(self, essay_feats):
        '''Scores each word in ``essay_feats``.
            Outputs a dictionary mapping each class to a list of scores
        '''
        return self.predict(essay_feats, output_scores=True)

    def __get_tags_(self, tags):
        return frozenset((t for t in tags if t in self.individual_tags))

    def train(self, essay_feats, nr_iter=5, verbose=True, average_weights=True):
        '''Train a model from sentences. ``nr_iter`` controls the number of
        Perceptron training iterations.
        @param essay_feats:     A list of essay feature objects.
        @param nr_iter:         Number of training iterations.
        @param verbose:         Print learning progress
        '''

        # Copy, as we do an in-place shuffle below
        cp_essay_feats = list(essay_feats)

        if self.model is None:
            tag_freq = defaultdict(int)
            for essay in cp_essay_feats:
                for tagged_sentence in essay.sentences:
                    for wd in tagged_sentence:
                        fs_tags = self.__get_tags_(wd.tags)
                        tag_freq[fs_tags] += 1

            self.classes = {fs for fs, cnt in tag_freq.items() if cnt >= self.combo_freq_threshold}
            self.model = AveragedPerceptron(self.classes)

        for iter_ in range(nr_iter):
            class2predictions = defaultdict(list)
            class2tags = defaultdict(list)

            for essay_ix, essay in enumerate(cp_essay_feats):
                for sent_ix, tagged_sentence in enumerate(essay.sentences):
                    """ Start Sentence """
                    prev = list(self.START)

                    for i, wd in enumerate(tagged_sentence):
                        # Don't mutate the feat dictionary
                        tagger_feats = dict(wd.features.items())
                        tagger_feats["bias"] = 1

                        # get all tagger predictions for previous 2 tags
                        self._add_tag_features(wd.word, tagger_feats, prev)
                        # add more in depth features for this tag
                        actual = self.__get_tags_(wd.tags)

                        guess = self.model.predict(tagger_feats)
                        self.model.update(actual, guess, tagger_feats)

                        prev.append(guess)
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in guess else 0)
                            class2tags[cls].append(1 if cls in actual else 0)

            random.shuffle(cp_essay_feats)
            if verbose:
                class2metrics = ResultsProcessor.compute_metrics(class2tags, class2predictions)
                micro_metrics = micro_rpfa(class2metrics.values())
                logging.info("Iter {0}: Micro Avg Metrics: {1}".format(iter_, str(micro_metrics)))

        if average_weights:
            self.model.average_weights()
        return None

    def _normalize(self, word):
        '''Normalization used in pre-processing.
        - All words are lower cased
        - Four-digit numbers are represented as !YEAR;
        - Other digits are represented as !DIGITS
        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i-1])
        add('i-1 suffix', context[i-1][-3:])
        add('i-2 word', context[i-2])
        add('i+1 word', context[i+1])
        add('i+1 suffix', context[i+1][-3:])
        add('i+2 word', context[i+2])
        return features
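The label-powerset trick used by PerceptronTaggerLabelPowerset is easy to miss in the code above: each distinct combination of tags becomes one perceptron class, built as a frozenset by __get_tags_. In miniature:

individual_tags = {'A', 'B'}

def get_tags(tags):
    # Mirror of __get_tags_: keep only known tags, as a hashable class label.
    return frozenset(t for t in tags if t in individual_tags)

print(get_tags({'A'}))       # frozenset({'A'})
print(get_tags({'A', 'B'}))  # frozenset({'A', 'B'}) -- a distinct class
print(get_tags({'C'}))       # frozenset() -- the empty combination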
Example #9
from classifier import BinaryClassifier
from perceptron import Perceptron, AveragedPerceptron
from naive_bayes import NaiveBayes
from utils import read_data, build_vocab
import utils
from config import args

if __name__ == '__main__':
    filepath = '../data/given/'
    build_vocab(filepath, vocab_size=args.vocab_size)
    train_data, test_data = read_data(filepath)

    perc_classifier = Perceptron(args)
    perc_classifier.fit(train_data)
    acc, prec, rec, f1 = perc_classifier.evaluate(test_data)
    print('Perceptron Results:')
    print('Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f' % (acc, prec, rec, f1))

    avg_perc_classifier = AveragedPerceptron(args)
    avg_perc_classifier.fit(train_data)
    acc, prec, rec, f1 = avg_perc_classifier.evaluate(test_data)
    print('\nAveraged Perceptron Results:')
    print('Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f' % (acc, prec, rec, f1))

    nb_classifier = NaiveBayes(args)
    nb_classifier.fit(train_data)
    acc, prec, rec, f1 = nb_classifier.evaluate(test_data)
    print('\nNaive Bayes Performance:')
    print('Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f' % (acc, prec, rec, f1))
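For reference, the accuracy/precision/recall/F1 figures printed by the script reduce to the usual confusion-matrix formulas; a quick sketch with hypothetical counts (the classifiers' own evaluate methods are not shown here):

def prf(tp, fp, fn, tn):
    # Standard binary-classification metrics from confusion-matrix counts.
    acc = (tp + tn) / float(tp + fp + fn + tn)
    prec = tp / float(tp + fp) if tp + fp else 0.0
    rec = tp / float(tp + fn) if tp + fn else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
    return acc, prec, rec, f1

print('Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f' % prf(40, 10, 5, 45))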
class PerceptronTaggerMultiClassCombo(object):
    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
    See more implementation details here:
        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    :param load: Load the pickled model upon instantiation.
    '''

    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    POSITIVE_CLASS = 1.0
    NEGATIVE_CLASS = 0.0

    def __init__(self,
                 individual_tags,
                 tag_history,
                 combo_freq_threshold,
                 load=False,
                 use_tag_features=True):
        self.use_tag_features = use_tag_features
        self.combo_freq_threshold = combo_freq_threshold
        self.tag_history = tag_history
        self.classes = set()
        self.individual_tags = set(individual_tags)

    def _add_tag_features(self, feats, word, prev, prev2):
        sprev, sprev2 = str(prev), str(prev2)
        feats["bias"] = 1
        # Commenting out the single previous tag features as included with the
        # tag history parameter
        #feats["TAG -1 " + sprev]                  =      1     # included in other
        feats["TAG -1 wd " + sprev + "|" + word] = 1
        #feats["TAG -2 " + sprev2]                 =      1     # included in other
        feats["TAG -2 wd " + sprev2 + "|" + word] = 1
        feats["TAG -1, -2 " + sprev + "|" + sprev2] = 1

    def _add_secondary_tag_features(self, feats, prev_tags):
        for ix, prev in enumerate(prev_tags[-self.tag_history:]):
            offset = ix - self.tag_history
            feats["HIST_TAG " + str(offset) + " " + str(prev)] = 1

    def predict(self, essay_feats, output_scores=False):
        '''Predicts tags for each word in ``essay_feats``.
            Outputs a dictionary mapping each class to a list of binary predictions
        '''

        class2predictions = defaultdict(list)
        for essay_ix, essay in enumerate(essay_feats):
            for sent_ix, tagged_sentence in enumerate(essay.sentences):
                """ Start Sentence """
                class2prev = defaultdict(list)
                for cls in self.classes:
                    class2prev[cls] = list(self.START)

                prev = list(self.START)
                for i, wd in enumerate(tagged_sentence):
                    # Don't mutate the feat dictionary
                    shared_features = dict(wd.features.items())
                    # get all tagger predictions for previous 2 tags

                    self._add_secondary_tag_features(shared_features, prev)

                    tagger_feats = dict(shared_features.items())
                    if self.use_tag_features:
                        self._add_tag_features(tagger_feats, wd.word, prev[-1],
                                               prev[-2])

                    scores_by_class = self.model.decision_function(
                        tagger_feats)
                    guess = max(self.model.classes,
                                key=lambda label:
                                (scores_by_class[label], label))
                    prev.append(guess)

                    if output_scores:
                        max_score_per_class = defaultdict(float)
                        for fset_tags, score in scores_by_class.items():
                            for tag in fset_tags:
                                max_score_per_class[tag] = max(
                                    max_score_per_class[tag], score)

                        for cls in self.individual_tags:
                            class2predictions[cls].append(
                                max_score_per_class[cls])
                    else:
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in
                                                          guess else 0)

        np_class2predictions = dict()
        for key, lst in class2predictions.items():
            np_class2predictions[key] = np.asarray(lst)
        return np_class2predictions

    def decision_function(self, essay_feats):
        '''Scores each word in ``essay_feats``.
            Outputs a dictionary mapping each class to a list of scores
        '''
        return self.predict(essay_feats, output_scores=True)

    def __get_tags_(self, tags):
        return frozenset((t for t in tags if t in self.individual_tags))

    def train(self, essay_feats, save_loc=None, nr_iter=5, verbose=True):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.
        :param essay_feats: A list of essay feature objects.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''

        # Copy, as we do an in-place shuffle below
        cp_essay_feats = list(essay_feats)

        tag_freq = defaultdict(int)
        for essay in cp_essay_feats:
            for tagged_sentence in essay.sentences:
                for wd in tagged_sentence:
                    fs_tags = self.__get_tags_(wd.tags)
                    tag_freq[fs_tags] += 1

        self.classes = {
            fs for fs, cnt in tag_freq.items()
            if cnt >= self.combo_freq_threshold
        }
        self.model = AveragedPerceptron(self.classes)

        for iter_ in range(nr_iter):
            class2predictions = defaultdict(list)
            class2tags = defaultdict(list)

            for essay_ix, essay in enumerate(cp_essay_feats):
                for sent_ix, tagged_sentence in enumerate(essay.sentences):
                    """ Start Sentence """
                    prev = list(self.START)

                    for i, wd in enumerate(tagged_sentence):
                        # Don't mutate the feat dictionary
                        shared_features = dict(wd.features.items())
                        # get all tagger predictions for previous 2 tags
                        self._add_secondary_tag_features(shared_features, prev)

                        tagger_feats = dict(shared_features.items())
                        # add more in depth features for this tag
                        actual = self.__get_tags_(wd.tags)

                        if self.use_tag_features:
                            self._add_tag_features(tagger_feats, wd.word,
                                                   prev[-1], prev[-2])

                        guess = self.model.predict(tagger_feats)
                        self.model.update(actual, guess, tagger_feats)

                        prev.append(guess)
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in
                                                          guess else 0)
                            class2tags[cls].append(1 if cls in actual else 0)

            random.shuffle(cp_essay_feats)
            class2metrics = ResultsProcessor.compute_metrics(
                class2tags, class2predictions)
            micro_metrics = micro_rpfa(class2metrics.values())
            if verbose:
                logging.info("Iter {0}: Micro Avg Metrics: {1}".format(
                    iter_, str(micro_metrics)))

        self.model.average_weights()
        return None

    def _normalize(self, word):
        '''Normalization used in pre-processing.
        - All words are lower cased
        - Four-digit numbers are represented as !YEAR;
        - Other digits are represented as !DIGITS
        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name, ) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features
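Finally, a hypothetical end-to-end use of PerceptronTaggerMultiClassCombo, assuming essay feature objects shaped as the code above expects (essay.sentences -> sentences -> words carrying .word, .features and .tags); the tag names and the train/test variables are placeholders:

tagger = PerceptronTaggerMultiClassCombo(individual_tags={'Causer', 'Result'},
                                         tag_history=5, combo_freq_threshold=5)
tagger.train(train_essay_feats, nr_iter=5, verbose=True)   # placeholder data
class2preds = tagger.predict(test_essay_feats)             # {tag: np.array of 0/1}
class2scores = tagger.decision_function(test_essay_feats)  # {tag: np.array of scores}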