Python PerceptronTagger Examples

Programming Language: Python

Namespace/Package Name: external.tagger.taggers

Class/Type: PerceptronTagger

Examples at hotexamples.com: 4

Python PerceptronTagger - 4 examples found. These are the top-rated real-world Python examples of external.tagger.taggers.PerceptronTagger, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

PerceptronTagger(1)

tag(1)

Example #1

Show file

File: grammar.py Project: xhydras/scan

 def __init__(self, load=True):
     """Create the tagger and obtain the table of known-good tag sequences.

     :param load: when true (the default) the sequence counts are
         unpickled from ``self.data_path``; when false they are rebuilt
         from ``self.data`` and written back to ``self.data_path``.
     """
     super(GrammarCorrector, self).__init__()
     self.data_path = os.path.abspath(os.path.join(settings.DATA_PATH, PICKLE_FILENAME))
     self.model = PerceptronTagger()
     if load:
         # Pickle streams are binary: reading them in text mode fails on
         # Python 3 and can corrupt data on Windows, so always use 'rb'.
         # NOTE: unpickling can execute arbitrary code; only load files
         # that this application wrote itself.
         with open(self.data_path, 'rb') as handle:
             self.good_sequences = pickle.load(handle)
     else:
         # NOTE(review): self.data is not set here; presumably the base
         # class provides it -- confirm.
         _, self.good_sequences = self.get_sequences(self.data)
         # The context manager guarantees the file is flushed and closed.
         with open(self.data_path, 'wb') as handle:
             pickle.dump(self.good_sequences, handle)

Example #2

Show file

File: grammar.py Project: fototo/scan

 def __init__(self, load=True):
     """Initialise the corrector with a tagger and its good-sequence counts.

     :param load: if true (default), read the pickled sequence counts
         from ``self.data_path``; if false, recompute them from
         ``self.data`` and store the result at ``self.data_path``.
     """
     super(GrammarCorrector, self).__init__()
     self.data_path = os.path.abspath(os.path.join(settings.DATA_PATH, PICKLE_FILENAME))
     self.model = PerceptronTagger()
     if not load:
         # NOTE(review): self.data is not assigned in this method;
         # presumably it comes from the base class -- confirm.
         _, self.good_sequences = self.get_sequences(self.data)
         # "with" closes the handle even if pickling raises.
         with open(self.data_path, "wb") as pickle_file:
             pickle.dump(self.good_sequences, pickle_file)
     else:
         # Pickle data must be read in binary mode: text mode breaks on
         # Python 3 and may mangle bytes on Windows.
         # NOTE: pickle.load can run arbitrary code; the file must be
         # trusted (i.e. produced by this application).
         with open(self.data_path, "rb") as pickle_file:
             self.good_sequences = pickle.load(pickle_file)

Example #3

Show file

File: grammar.py Project: xhydras/scan

class GrammarCorrector(BaseCorrector):
    """
    Flag runs of tokens whose part-of-speech n-grams are rare.

    Every n-gram of POS tags in the input is looked up in a table of
    counts (``good_sequences``); n-grams seen fewer than ``min_count``
    times are treated as suspicious and the tokens they cover are wrapped
    in ``<grammar>...</grammar>`` markers.

    Adapted from http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    """

    # N-gram sizes (in tags) that are extracted and checked.
    lengths = [4]
    # An n-gram with a count in good_sequences below this value is "bad".
    min_count = 3

    def __init__(self, load=True):
        """Create the tagger and obtain the table of known-good sequences.

        :param load: when true (the default) the sequence counts are
            unpickled from ``self.data_path``; when false they are rebuilt
            from ``self.data`` and written back to ``self.data_path``.
        """
        super(GrammarCorrector, self).__init__()
        self.data_path = os.path.abspath(os.path.join(settings.DATA_PATH, PICKLE_FILENAME))
        self.model = PerceptronTagger()
        if load:
            # Pickle streams are binary: reading them in text mode fails on
            # Python 3 and can corrupt data on Windows, so always use 'rb'.
            # NOTE: unpickling can execute arbitrary code; only load files
            # that this application wrote itself.
            with open(self.data_path, 'rb') as handle:
                self.good_sequences = pickle.load(handle)
        else:
            # NOTE(review): self.data is not set in this class; presumably
            # BaseCorrector provides it -- confirm.
            _, self.good_sequences = self.get_sequences(self.data)
            # The context manager guarantees the file is flushed and closed.
            with open(self.data_path, 'wb') as handle:
                pickle.dump(self.good_sequences, handle)

    def find_bad_sequences(self, sequences):
        """Return the set of keys in *sequences* that are too rare.

        A key is "bad" when its count in ``self.good_sequences`` (0 when
        absent) is below ``self.min_count``.
        """
        return set(
            key for key in sequences
            if self.good_sequences.get(key, 0) < self.min_count
        )

    def correct_string(self, string_corpus):
        """Tag *string_corpus* and mark the tokens covered by bad n-grams.

        :returns: a tuple ``(error_ratio, marked_text, new_tags)`` where
            ``error_ratio`` is the number of bad n-grams divided by the
            number of distinct n-grams plus one (the +1 keeps the
            denominator non-zero), ``marked_text`` is the tokens joined by
            spaces with ``<grammar>``/``</grammar>`` around flagged runs,
            and ``new_tags`` is a list of ``[token, tag, flagged]`` lists.
        """
        tags, sequences = self.get_sequences(string_corpus)
        bad_sequences = self.find_bad_sequences(sequences)
        error_ratio = len(bad_sequences) / (float(len(sequences)) + 1)
        new_tags = [[t[0], t[1], False] for t in tags]

        for l in self.lengths:
            # Slide over every window of size l, including the one that
            # ends on the final token (it used to be skipped).
            for start in range(len(new_tags) - l + 1):
                window = new_tags[start:start + l]
                seq = "_".join([t[1] for t in window])
                if seq in bad_sequences:
                    for entry in window:
                        entry[2] = True

        # Wrap each maximal run of flagged tokens in markers. The first
        # and last tokens are handled too, so a run touching either end of
        # the text still gets balanced opening and closing markers.
        last = len(new_tags) - 1
        for i, entry in enumerate(new_tags):
            if not entry[2]:
                continue
            if i == 0 or not new_tags[i - 1][2]:
                entry[0] = "<grammar>" + entry[0]
            if i == last or not new_tags[i + 1][2]:
                entry[0] += "</grammar>"

        words = [nt[0] for nt in new_tags]
        return error_ratio, " ".join(words), new_tags

    def tag(self, string_corpus):
        """Delegate tagging of *string_corpus* to the underlying model."""
        return self.model.tag(string_corpus)

    def get_pos(self, tags):
        """Return the second element (the tag) of every item in *tags*."""
        return [t[1] for t in tags]

    def get_sequences(self, string_corpus):
        """Tag *string_corpus* and count its POS n-grams.

        :returns: ``(tags, sequences)`` -- the tagger output and a dict
            mapping each ``"_"``-joined n-gram to its number of occurrences.
        """
        tags = self.tag(string_corpus)
        pos = self.get_pos(tags)
        sequences = self._get_sequences(pos, self.lengths)
        return tags, sequences

    def _get_sequences(self, pos, lengths):
        """Merge the n-gram counts of *pos* for every size in *lengths*."""
        sequences = {}
        for l in lengths:
            sequences.update(self._get_sequence(pos, l))
        return sequences

    def _get_sequence(self, pos, length):
        """Count every run of *length* consecutive tags in *pos*.

        :returns: dict mapping ``"_".join(run)`` to its occurrence count;
            empty when *pos* is empty or shorter than *length*.
        """
        sequences = {}
        total = len(pos)
        if not pos or length > total:
            return sequences

        # "+ 1" so that the final n-gram (ending on the last tag) is
        # counted, and input of exactly *length* tags yields one n-gram.
        for i in range(total - length + 1):
            if i % 100000 == 0 and i != 0:
                log.debug("{0}% done with sequence.".format((i/float(total)) * 100))
            val = "_".join(pos[i:i + length])
            sequences[val] = sequences.get(val, 0) + 1
        return sequences

Example #4

Show file

File: grammar.py Project: fototo/scan

class GrammarCorrector(BaseCorrector):
    """
    Highlight token runs whose part-of-speech n-grams are uncommon.

    POS-tag n-grams from the input are compared against a table of counts
    (``good_sequences``). Any n-gram counted fewer than ``min_count`` times
    is considered bad, and the tokens it spans are surrounded by
    ``<grammar>...</grammar>`` markers.

    Adapted from http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    """

    # Sizes of the tag n-grams that get extracted and checked.
    lengths = [4]
    # N-grams whose count in good_sequences is below this are flagged.
    min_count = 3

    def __init__(self, load=True):
        """Initialise the corrector with a tagger and its sequence counts.

        :param load: if true (default), read the pickled sequence counts
            from ``self.data_path``; if false, recompute them from
            ``self.data`` and store the result at ``self.data_path``.
        """
        super(GrammarCorrector, self).__init__()
        self.data_path = os.path.abspath(os.path.join(settings.DATA_PATH, PICKLE_FILENAME))
        self.model = PerceptronTagger()
        if not load:
            # NOTE(review): self.data is not assigned in this class;
            # presumably it comes from BaseCorrector -- confirm.
            _, self.good_sequences = self.get_sequences(self.data)
            # "with" closes the handle even if pickling raises.
            with open(self.data_path, "wb") as pickle_file:
                pickle.dump(self.good_sequences, pickle_file)
        else:
            # Pickle data must be read in binary mode: text mode breaks on
            # Python 3 and may mangle bytes on Windows.
            # NOTE: pickle.load can run arbitrary code; the file must be
            # trusted (i.e. produced by this application).
            with open(self.data_path, "rb") as pickle_file:
                self.good_sequences = pickle.load(pickle_file)

    def find_bad_sequences(self, sequences):
        """Collect the keys of *sequences* that are rarer than allowed.

        A key qualifies when ``self.good_sequences`` holds a count for it
        that is below ``self.min_count`` (missing keys count as 0).
        """
        bad_sequences = set()
        for key in sequences:
            if self.good_sequences.get(key, 0) < self.min_count:
                bad_sequences.add(key)
        return bad_sequences

    def correct_string(self, string_corpus):
        """Tag *string_corpus* and mark tokens that sit inside bad n-grams.

        :returns: ``(error_ratio, marked_text, new_tags)``. ``error_ratio``
            is ``len(bad n-grams) / (len(distinct n-grams) + 1)``; the +1
            prevents division by zero. ``marked_text`` is the
            space-joined tokens with ``<grammar>``/``</grammar>`` around
            each flagged run. ``new_tags`` is a list of
            ``[token, tag, flagged]`` lists.
        """
        tags, sequences = self.get_sequences(string_corpus)
        bad_sequences = self.find_bad_sequences(sequences)
        error_ratio = len(bad_sequences) / (float(len(sequences)) + 1)
        new_tags = [[t[0], t[1], False] for t in tags]
        token_count = len(new_tags)

        for l in self.lengths:
            # Visit every window of l tokens; the upper bound is inclusive
            # of the window ending on the last token, which was previously
            # never examined.
            for begin in range(token_count - l + 1):
                end = begin + l
                seq = "_".join([t[1] for t in new_tags[begin:end]])
                if seq in bad_sequences:
                    for j in range(begin, end):
                        new_tags[j][2] = True

        # Emit an opening marker where a flagged run starts and a closing
        # marker where it ends. Text boundaries count as run boundaries so
        # that markers stay balanced when the first or last token is
        # flagged.
        for i in range(token_count):
            if not new_tags[i][2]:
                continue
            starts_run = i == 0 or not new_tags[i - 1][2]
            ends_run = i == token_count - 1 or not new_tags[i + 1][2]
            if starts_run:
                new_tags[i][0] = "<grammar>" + new_tags[i][0]
            if ends_run:
                new_tags[i][0] += "</grammar>"

        words = [nt[0] for nt in new_tags]
        return error_ratio, " ".join(words), new_tags

    def tag(self, string_corpus):
        """Return the result of ``self.model.tag`` for *string_corpus*."""
        return self.model.tag(string_corpus)

    def get_pos(self, tags):
        """Extract element 1 (the tag) from each item of *tags*."""
        return [t[1] for t in tags]

    def get_sequences(self, string_corpus):
        """Tag *string_corpus* and tally its POS n-grams.

        :returns: ``(tags, sequences)`` where ``tags`` is the tagger output
            and ``sequences`` maps each ``"_"``-joined n-gram to its count.
        """
        tags = self.tag(string_corpus)
        pos = self.get_pos(tags)
        sequences = self._get_sequences(pos, self.lengths)
        return tags, sequences

    def _get_sequences(self, pos, lengths):
        """Combine the n-gram counts of *pos* over all sizes in *lengths*."""
        sequences = {}
        for l in lengths:
            sequences.update(self._get_sequence(pos, l))
        return sequences

    def _get_sequence(self, pos, length):
        """Tally each run of *length* consecutive tags found in *pos*.

        :returns: dict from ``"_".join(run)`` to how often the run occurs;
            empty when *pos* is empty or has fewer than *length* tags.
        """
        sequences = {}
        tag_count = len(pos)
        if not pos or length > tag_count:
            return sequences

        # The last valid start index is tag_count - length, hence "+ 1";
        # without it the final n-gram would be silently dropped.
        for i in range(tag_count - length + 1):
            if i % 100000 == 0 and i != 0:
                log.debug("{0}% done with sequence.".format((i / float(tag_count)) * 100))
            val = "_".join(pos[i : i + length])
            sequences[val] = sequences.get(val, 0) + 1
        return sequences