Beispiel #1
0
 def __init__(self, model, vocab=None):
     """Wrap ``model`` and create a companion ``Linear`` instance.

     A second ``Linear`` object (``self._shadow``) is built with the same
     size as ``model.weight`` and a counter is started at ``1.0``.
     NOTE(review): presumably the shadow weights and counter of an
     averaged-perceptron trainer -- confirm against the update code.

     Args:
         model: Object exposing a sized ``weight`` attribute; kept as
             ``self.model``.
         vocab: Optional second argument forwarded to ``Linear``. When it
             is ``None`` (the default) ``Linear`` receives the size only.
     """
     # Identity test: ``== None`` would dispatch to a user-defined
     # ``__eq__`` and can give surprising results for container types.
     if vocab is None:
         self._shadow = Linear(len(model.weight))
     else:
         self._shadow = Linear(len(model.weight), vocab)
     self.model = model
     self._counter = 1.0
Beispiel #2
0
 def __init__(self, vocab_words, vocab_tags, encoded=True):
     """Store both vocabularies and build the underlying ``Linear`` model.

     The model is sized by ``len(vocab_tags)``. When ``encoded`` is falsy,
     ``vocab_tags`` itself is additionally handed to ``Linear`` as its
     second argument.
     """
     self.vocab_words, self.vocab_tags = vocab_words, vocab_tags
     # Empty tuple -> single-argument call; otherwise forward the tag vocab.
     extra_args = () if encoded else (vocab_tags,)
     self.model = Linear(len(vocab_tags), *extra_args)
Beispiel #3
0
 def __init__(self, vocab_words, vocab_tags, encoded=True):
     """Store the vocabularies, reset lookup tables and build the model.

     ``most_frequent`` and ``pre_sufix`` start as empty dicts and
     ``fallback`` as ``None``; nothing in this method fills them.
     The ``Linear`` model is sized by ``len(vocab_tags)`` and, when
     ``encoded`` is falsy, also receives ``vocab_tags`` as second argument.
     """
     self.vocab_words = vocab_words
     self.vocab_tags = vocab_tags
     self.most_frequent, self.pre_sufix, self.fallback = {}, {}, None
     num_tags = len(vocab_tags)
     self.model = Linear(num_tags) if encoded else Linear(num_tags, vocab_tags)
Beispiel #4
0
class PerceptronTagger(Tagger):
    """Tagger that labels a sentence left to right with a ``Linear`` model."""

    def __init__(self, vocab_words, vocab_tags, encoded=True):
        """Store both vocabularies and build the ``Linear`` model.

        The model is sized by ``len(vocab_tags)``; when ``encoded`` is
        falsy, ``vocab_tags`` is also passed to ``Linear``.
        """
        self.vocab_words, self.vocab_tags = vocab_words, vocab_tags
        extra_args = () if encoded else (vocab_tags,)
        self.model = Linear(len(vocab_tags), *extra_args)

    def featurize(self, words, i, pred_tags):
        """Return the four ``(feature_index, value)`` pairs for position ``i``.

        Features: 0 = current word, 1 = previous word, 2 = next word,
        3 = previously predicted tag. Positions outside the sentence (and
        the missing previous tag at ``i == 0``) use ``self.vocab_words[PAD]``.
        """
        pad = self.vocab_words[PAD]
        word = words[i]

        left = pad
        if i != 0:
            left = words[i - 1]

        right = pad
        if i != len(words) - 1:
            right = words[i + 1]

        last_tag = pad
        if i != 0:
            last_tag = pred_tags[i - 1]

        values = (word, left, right, last_tag)
        return list(enumerate(values))

    def predict(self, words):
        """Return one predicted tag per word, decided greedily in order.

        Each decision sees the tags already chosen for earlier positions
        and takes the key with the highest score in the model's output.
        """
        tags_so_far = []
        for position in range(len(words)):
            scores = self.model.forward(
                self.featurize(words, position, tags_so_far)
            )
            tags_so_far.append(max(scores, key=scores.get))
        return tags_so_far
Beispiel #5
0
class GoldTagger(PerceptronTagger):
    """Perceptron tagger with affix, shape and next-tag features.

    In addition to the word/tag context, ``featurize`` looks up prefixes
    and suffixes of the current word in ``self.pre_sufix`` and receives a
    tag for the following word from the caller.
    """

    def __init__(self, vocab_words, vocab_tags, encoded=True):
        """Store the vocabularies, reset lookup tables and build the model.

        Args:
            vocab_words: Word vocabulary; must map ``PAD`` to its padding
                value (read in ``featurize``).
            vocab_tags: Tag vocabulary; its length sizes the model.
            encoded: When falsy, ``vocab_tags`` is also passed to
                ``Linear`` as second argument.
        """
        self.vocab_words = vocab_words
        self.vocab_tags = vocab_tags
        # Lookup tables start empty; nothing in this class fills them.
        self.most_frequent = {}
        self.pre_sufix = {}
        self.fallback = None
        if encoded:
            self.model = Linear(len(vocab_tags))
        else:
            self.model = Linear(len(vocab_tags), vocab_tags)

    def featurize(self, words, i, pred_tags, next_tag):
        """Return the list of ``(feature_index, value)`` pairs for word ``i``.

        Feature layout:
            0       current word
            1, 2    previous / next word
            3       previous predicted tag
            4       pair (tag two back, previous tag)
            5-8     ``pre_sufix`` entry for the prefixes of length 1-4
            9-12    ``pre_sufix`` entry for the suffixes of length 1-4
            13      ``next_tag`` as supplied by the caller
            14      1 if the word contains a hyphen
            15      1 if the word contains a digit
        Any value that is unavailable is replaced by
        ``self.vocab_words[PAD]``.

        Args:
            words: Sentence as a sequence of strings (the word is sliced
                and regex-searched, so it must be a ``str``).
            i: Index of the word to featurize.
            pred_tags: Tags predicted so far for positions ``< i``.
            next_tag: Value used verbatim for feature 13.
        """
        pad = self.vocab_words[PAD]
        word = words[i]
        prev_prev_tag = pred_tags[i - 2] if i > 1 else pad
        prev_tag = pred_tags[i - 1] if i != 0 else pad

        features = [
            (0, word),
            # Bug fix: the conditional now sits *inside* the tuple. The old
            # code appended a bare padding value (not a pair) when i == 0.
            (1, words[i - 1] if i != 0 else pad),
            (2, words[i + 1] if i + 1 < len(words) else pad),
            (3, prev_tag),
            (4, (prev_prev_tag, prev_tag)),
        ]

        # Features 5-8: prefixes of length 1-4; features 9-12: suffixes.
        affixes = [word[:k] for k in range(1, 5)]
        affixes += [word[-k:] for k in range(1, 5)]
        for index, affix in enumerate(affixes, start=5):
            features.append((index, self.pre_sufix.get(affix, pad)))

        features.append((13, next_tag))
        features.append((14, 1 if re.search(r'\w*-\w*', word) else pad))
        features.append((15, 1 if re.search(r'\d', word) else pad))
        return features

    def predict(self, words, tags):
        """Return one predicted tag per word, decided greedily in order.

        Args:
            words: Sentence as a sequence of strings.
            tags: Sequence supplying the next-word tag feature;
                ``tags[i + 1]`` is passed for position ``i``.
        """
        pred_tags = []
        for i in range(len(words)):
            # NOTE(review): the fallback here is the raw PAD symbol, while
            # featurize pads with self.vocab_words[PAD]. Left unchanged so
            # features keep matching however the model was trained.
            next_tag = tags[i + 1] if i + 1 < len(tags) else PAD
            features = self.featurize(words, i, pred_tags, next_tag)
            output_vector = self.model.forward(features)
            pred_tags.append(max(output_vector, key=output_vector.get))
        return pred_tags
Beispiel #6
0
class PerceptronParser(Parser):
    """Transition-based parser whose moves are chosen by a ``Linear`` model.

    A configuration is indexed as ``config[0]`` (position of the next
    input word), ``config[1]`` (stack of word positions) and ``config[2]``
    (the heads returned by ``predict``).
    """

    def __init__(self, vocab_words, vocab_tags):
        """Store the vocabularies and build a model sized by ``len(vocab_tags)``."""
        self.vocab_words = vocab_words
        self.vocab_tags = vocab_tags
        self.model = Linear(len(vocab_tags))

    def __calc_dist(self, dist):
        """Bucket ``dist``: 1 below 6, 2 for 6 through 11, 3 otherwise.

        Not called anywhere in the visible class.
        """
        if dist < 6:
            return 1
        if 6 <= dist <= 11:
            return 2
        return 3

    def featurize(self, words, tags, config):
        """Return six ``(feature_index, value)`` pairs for ``config``.

        Features 0-2 are the next input word, the word on top of the stack
        and the word below it; features 3-5 are the tags at the same three
        positions. Missing positions use ``self.vocab_words[PAD]``.
        """
        ENC_PAD = self.vocab_words[PAD]
        i = config[0]
        stack = config[1]

        w_next = ENC_PAD if i == len(words) else words[i]
        w_top = ENC_PAD if len(stack) == 0 else words[stack[-1]]
        w_sec = ENC_PAD if len(stack) < 2 else words[stack[-2]]
        t_next = ENC_PAD if i == len(words) else tags[i]
        t_top = ENC_PAD if len(stack) == 0 else tags[stack[-1]]
        t_sec = ENC_PAD if len(stack) < 2 else tags[stack[-2]]

        return [(0, w_next), (1, w_top), (2, w_sec), (3, t_next), (4, t_top),
                (5, t_sec)]

    def predict(self, words, tags):
        """Parse the sentence greedily and return ``config[2]`` (the heads).

        Starting from the initial configuration, the best-scoring *valid*
        move is applied repeatedly until no valid move remains.
        """
        # 1. Start in the initial configuration for the input sentence.
        parser = Parser()
        config = parser.initial_config(len(words))

        # 2. As long as there are valid moves, ask the model for the next
        #    move to take.
        valid = self.valid_moves(config)
        while len(valid) != 0:
            features = self.featurize(words, tags, config)
            output_vector = self.model.forward(features)
            # Choose only among valid moves: the unrestricted argmax could
            # pick a transition that is illegal in this configuration.
            # Moves the model has no score for rank last.
            move = max(
                valid,
                key=lambda m: output_vector.get(m, float("-inf")),
            )
            config = self.next_config(config, move)
            valid = self.valid_moves(config)

        # 3. Return the list of heads associated with the final configuration.
        return config[2]
 def __init__(self, vocab_words, vocab_tags):
     """Store both vocabularies and build a ``Linear`` model.

     The model is sized by ``len(vocab_tags)``.
     """
     self.vocab_words, self.vocab_tags = vocab_words, vocab_tags
     num_outputs = len(vocab_tags)
     self.model = Linear(num_outputs)