class Tagger(object):
    """Greedy left-to-right POS tagger backed by a feed-forward ``NeuralNet``.

    Tags are predicted one token at a time; each prediction may condition on
    the tags already assigned (via the feature extractor), so training uses
    the model's own previous guesses (``train_one``).

    Attributes:
        ex: feature extractor; called as ``ex(i, word, context, tags)`` and
            must expose ``input_length``, ``tables`` and ``slots``.
        tagdict: word -> class index for frequent, unambiguous words.
        classes: tag string -> class index.
        model: the ``NeuralNet``; ``None`` until ``start_training`` is called.
    """

    def __init__(self, depth, hidden_width, extractor, learn_rate=0.01,
                 L2=1e6, solver='adam', classes=None, load=False):
        """Store hyper-parameters; the network itself is built lazily.

        Args:
            depth: number of hidden layers.
            hidden_width: units per hidden layer.
            extractor: feature extractor (see class docstring).
            learn_rate: learning-rate (eta) passed to the net.
            L2: regularisation strength (rho).  NOTE(review): 1e6 is an
                unusually large default — confirm the intended scale.
            solver: update rule name passed as ``update_step``.
            classes: optional pre-built tag -> index mapping.
            load: accepted for interface compatibility; not used here.
        """
        self.ex = extractor
        self.tagdict = {}
        if classes:
            self.classes = classes
        else:
            self.classes = {}
        self.depth = depth
        self.hidden_width = hidden_width
        self.learn_rate = learn_rate
        self.L2 = L2
        self.solver = solver
        self.model = None  # built in start_training(), once output size is known

    def start_training(self, sentences):
        """Build the tag dictionary from *sentences* and construct the net.

        Must be called before ``tag``/``train_one``: the output layer width
        is ``len(self.classes)``, which is only known after scanning the data.
        """
        self._make_tagdict(sentences)
        input_length = self.ex.input_length
        # Layer widths: input, `depth` hidden layers, one output per tag class.
        widths = [input_length] + [self.hidden_width] * self.depth \
                 + [len(self.classes)]
        self.model = NeuralNet(
            widths,
            embed=(self.ex.tables, self.ex.slots),
            rho=self.L2,
            eta=self.learn_rate,
            update_step=self.solver)
        print(self.model.widths)

    def tag(self, words):
        """Predict a tag for each word in *words*; returns a DefaultList of tags."""
        tags = DefaultList('')
        # Pad with START/END sentinels so window features are well-defined
        # at the sentence boundaries.
        context = START + [w for w in words] + END
        inverted_classes = {i: tag for tag, i in self.classes.items()}
        eg = self.model.Example([])
        for i, word in enumerate(words):
            # Reuse one Example object; wipe() resets it between tokens.
            eg.wipe(self.model.widths)
            features = self.ex(i, word, context, tags)
            eg.set_features(features)
            eg = self.model.predict_example(eg)
            tag = inverted_classes[eg.guess]
            tags.append(tag)
        return tags

    def train_one(self, words, tags):
        """Train on one sentence; returns the summed per-token loss.

        The tag history fed to the extractor is the model's *own* guesses,
        not the gold tags, so training conditions match tagging conditions.
        """
        tag_history = DefaultList('')
        context = START + [w for w in words] + END
        inverted_classes = {i: tag for tag, i in self.classes.items()}
        loss = 0.0
        eg = self.model.Example([])
        for i, word in enumerate(words):
            eg.wipe(self.model.widths)
            features = self.ex(i, word, context, tag_history)
            eg.set_features(features)
            eg.set_label(self.classes[tags[i]])
            eg = self.model.train_example(eg)
            tag_history.append(inverted_classes[eg.guess])
            loss += eg.loss
        return loss

    def save(self, loc=None):
        """Pickle (weights, tagdict, classes) to *loc* as a binary file.

        Args:
            loc: target path.  Defaults to ``PerceptronTagger.model_loc`` for
                backward compatibility with the original hard-coded path —
                NOTE(review): ``PerceptronTagger`` is not defined in this
                file; confirm it is importable wherever the default is used.
        """
        if loc is None:
            loc = PerceptronTagger.model_loc
        with open(loc, 'wb') as f:
            # -1 selects the highest available pickle protocol.
            pickle.dump((self.model.weights, self.tagdict, self.classes), f, -1)

    def load(self, loc):
        """Restore (weights, tagdict, classes) pickled by ``save``.

        NOTE(review): assumes ``self.model`` has already been constructed
        (e.g. via ``start_training``); on a fresh Tagger ``self.model`` is
        ``None`` and this raises AttributeError.
        SECURITY: ``pickle.load`` executes arbitrary code — only load
        model files from trusted sources.
        """
        with open(loc, 'rb') as f:
            w_td_c = pickle.load(f)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes

    def _normalize(self, word):
        """Collapse rare word shapes to coarse placeholder tokens."""
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        # sentences: iterable of (words, tags) pairs — counted in lockstep.
        counts = defaultdict(lambda: defaultdict(int))
        for sent in sentences:
            for word, tag in zip(sent[0], sent[1]):
                counts[word][tag] += 1
                if tag not in self.classes:
                    self.classes[tag] = len(self.classes)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = self.classes[tag]