Example #1
0
 def __init__(self, n_classes, width, depth, get_bow, rho=1e-5, eta=0.005,
              eps=1e-6, bias=0.0, update_step='adadelta'):
     """Build the network: `depth` hidden layers of `width` units feeding an
     `n_classes` output layer, with one `width`-wide embedding table.
     """
     layer_sizes = [width]
     layer_sizes.extend([width] * depth)
     layer_sizes.append(n_classes)
     NeuralNet.__init__(self, tuple(layer_sizes), embed=((width,), (0,)),
                        rho=rho, eta=eta, eps=eps, bias=bias,
                        update_step=update_step)
     self.get_bow = get_bow
Example #2
0
 def start_training(self, sentences):
     """Collect the tag inventory from `sentences`, then build the net."""
     self._make_tagdict(sentences)
     n_in = self.ex.input_length
     layer_widths = [n_in]
     layer_widths += [self.hidden_width] * self.depth
     layer_widths.append(len(self.classes))
     self.model = NeuralNet(
         layer_widths,
         embed=(self.ex.tables, self.ex.slots),
         rho=self.L2, eta=self.learn_rate, update_step=self.solver)
     print(self.model.widths)
Example #3
0
class Tagger(object):
    """Greedy left-to-right sequence tagger backed by a feed-forward NeuralNet.

    Each token's predicted tag is fed back into the feature extractor as
    history for the following token, both at training and inference time.
    """

    def __init__(self, depth, hidden_width, extractor, learn_rate=0.01, L2=1e6, solver='adam',
        classes=None, load=False):
        # NOTE(review): L2=1e6 is an unusually large regularisation strength;
        # comparable callers of NeuralNet use values around 1e-5 -- confirm.
        self.ex = extractor
        self.tagdict = {}  # word -> class index, for frequent unambiguous words
        if classes:
            self.classes = classes
        else:
            self.classes = {}  # tag -> class index, populated by _make_tagdict
        self.depth = depth
        self.hidden_width = hidden_width
        self.learn_rate = learn_rate
        self.L2 = L2
        self.solver = solver
        self.model = None  # constructed lazily by start_training()

    def start_training(self, sentences):
        """Build the tag inventory from `sentences` and construct the network.

        Must be called before tag()/train_one(), which assume self.model.
        """
        self._make_tagdict(sentences)
        input_length = self.ex.input_length
        widths = [input_length] + [self.hidden_width] * self.depth + [len(self.classes)]
        self.model = NeuralNet(
            widths,
            embed=(self.ex.tables, self.ex.slots),
            rho=self.L2, eta=self.learn_rate, update_step=self.solver)
        print(self.model.widths)

    def tag(self, words):
        """Return the greedily-predicted tag sequence for `words`."""
        tags = DefaultList('')
        context = START + [w for w in words] + END
        inverted_classes = {i: tag for tag, i in self.classes.items()}
        eg = self.model.Example([])
        for i, word in enumerate(words):
            # Reuse one Example object, wiping it between tokens.
            eg.wipe(self.model.widths)
            features = self.ex(i, word, context, tags)
            eg.set_features(features)
            eg = self.model.predict_example(eg)
            tag = inverted_classes[eg.guess]
            tags.append(tag)
        return tags

    def train_one(self, words, tags):
        """Train on one (words, tags) sentence; return the summed loss.

        The *predicted* tag -- not the gold one -- is appended to the history
        used for feature extraction, matching inference-time conditions.
        """
        tag_history = DefaultList('')
        context = START + [w for w in words] + END
        inverted_classes = {i: tag for tag, i in self.classes.items()}
        loss = 0.0
        eg = self.model.Example([])
        for i, word in enumerate(words):
            eg.wipe(self.model.widths)
            features = self.ex(i, word, context, tag_history)
            eg.set_features(features)
            eg.set_label(self.classes[tags[i]])
            eg = self.model.train_example(eg)
            tag_history.append(inverted_classes[eg.guess])
            loss += eg.loss
        return loss

    def save(self):
        """Pickle (weights, tagdict, classes) as a binary file.

        NOTE(review): writes to PerceptronTagger.model_loc rather than any
        attribute of this class -- looks like a copy-paste from a perceptron
        tagger; confirm the intended destination.
        """
        # `with` guarantees the handle is closed even if pickling fails.
        with open(PerceptronTagger.model_loc, 'wb') as f:
            pickle.dump((self.model.weights, self.tagdict, self.classes), f, -1)

    def load(self, loc):
        """Restore weights, tagdict and classes pickled by save().

        Assumes self.model was already built (start_training), since only its
        weights are replaced here.
        """
        # Security: pickle.load executes arbitrary code -- only load trusted
        # files.
        with open(loc, 'rb') as f:
            w_td_c = pickle.load(f)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes

    def _normalize(self, word):
        """Collapse hyphenated words, years and digit-initial tokens to
        placeholder symbols; lowercase everything else.

        Currently not applied anywhere in this class -- kept for callers.
        """
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.

        Also assigns a class index to every tag seen in `sentences`.
        Each sentence is a (words, tags) pair iterated in lockstep.
        '''
        counts = defaultdict(lambda: defaultdict(int))
        for sent in sentences:
            for word, tag in zip(sent[0], sent[1]):
                counts[word][tag] += 1
                if tag not in self.classes:
                    self.classes[tag] = len(self.classes)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = self.classes[tag]