def test_save_load(self):
    """Round-trip an IdManager through save()/load().

    Verifies that a manager restored from disk produces the same
    vector (after assigning a value to a known token) as the manager
    that was saved.
    """
    ids1 = IdManager()
    ids1.add_token('hi')
    ids1.add_token('hello')
    # exist_ok=True replaces the isdir()/mkdir() check-then-act pair,
    # avoiding the race where the directory appears between the two calls
    os.makedirs('temp', exist_ok=True)
    ids1.save('temp/temp')
    ids2 = IdManager()
    ids2.load('temp/temp')
    vec1 = ids1.vector()
    vec2 = ids2.vector()
    ids1.assign(vec1, 'hello', 3)
    ids2.assign(vec2, 'hello', 3)
    assert vec1 == vec2
class SimpleIntent(object):
    """General intent used to match sentences or phrases"""

    # Target output assigned to deliberately "polluted" training samples
    # (sentences padded with ':null:' tokens) so extra noise tokens are
    # penalized, but not as harshly as a genuine non-match.
    LENIENCE = 0.6

    def __init__(self, name=''):
        self.name = name
        self.ids = IdManager(Ids)
        # FANN network; built lazily by configure_net() during train()
        # or restored from disk by from_file()
        self.net = None

    def match(self, sent):
        """Return the network's confidence for *sent*, clamped to >= 0."""
        return max(0, self.net.run(self.vectorize(sent))[0])

    def vectorize(self, sent):
        """Convert a token list into the fixed-size input vector for the net.

        Known tokens are set to 1.0; for non-empty sentences the ratio of
        unknown tokens and four sentence-length features (len/1 .. len/4)
        are written into dedicated Ids slots.
        """
        vector = self.ids.vector()
        unknown = 0
        for token in sent:
            if token in self.ids:
                self.ids.assign(vector, token, 1.0)
            else:
                unknown += 1
        if len(sent) > 0:
            self.ids.assign(vector, Ids.unknown_tokens,
                            unknown / float(len(sent)))
            self.ids.assign(vector, Ids.w_1, len(sent) / 1)
            self.ids.assign(vector, Ids.w_2, len(sent) / 2.)
            self.ids.assign(vector, Ids.w_3, len(sent) / 3.)
            self.ids.assign(vector, Ids.w_4, len(sent) / 4.)
        return vector

    def configure_net(self):
        """(Re)create the FANN net: input layer sized to the id set,
        one hidden layer of 10 neurons, single confidence output.

        Training stops on bit-fail count; a sample "fails" when its
        output is off by more than 0.1.
        """
        self.net = fann.neural_net()
        self.net.create_standard_array([len(self.ids), 10, 1])
        self.net.set_activation_function_hidden(
            fann.SIGMOID_SYMMETRIC_STEPWISE)
        self.net.set_activation_function_output(
            fann.SIGMOID_SYMMETRIC_STEPWISE)
        self.net.set_train_stop_function(fann.STOPFUNC_BIT)
        self.net.set_bit_fail_limit(0.1)

    def train(self, train_data):
        """Build the vocabulary and train the network on train_data.

        Positive samples come from train_data.my_sents(self.name),
        negatives from other_sents(); extra synthetic samples are added
        via the pollute()/weight() closures below. Retries training up
        to 10 times with a freshly configured net until zero bit-fails.
        """
        # Register every token of this intent's sentences in the id set
        for sent in train_data.my_sents(self.name):
            self.ids.add_sent(sent)
        inputs = []
        outputs = []

        def add(vec, out):
            # Append one (vectorized sentence, expected output) sample
            inputs.append(self.vectorize(vec))
            outputs.append([out])

        def pollute(sent, p):
            # Insert ':null:' filler tokens at position p (copy first) and
            # train the result toward LENIENCE rather than a full match
            sent = sent[:]
            for _ in range(int((len(sent) + 2) / 3)):
                sent.insert(p, ':null:')
            add(sent, self.LENIENCE)

        def weight(sent):
            # Train each word alone, weighted by its relative length^3;
            # '{...}' placeholder tokens get weight 0
            def calc_weight(w):
                return pow(len(w), 3.0)
            total_weight = 0.0
            for word in sent:
                total_weight += calc_weight(word)
            for word in sent:
                weight = 0 if word.startswith('{') else calc_weight(word)
                add([word], weight / total_weight)

        for sent in train_data.my_sents(self.name):
            add(sent, 1.0)
            weight(sent)
            # Only pollute sentences without special ':'-prefixed tokens
            if not any(word[0] == ':' for word in sent):
                pollute(sent, 0)
                pollute(sent, len(sent))
        for sent in train_data.other_sents(self.name):
            add(sent, 0.0)
        # The empty sentence is explicitly a non-match
        add([], 0.0)

        inputs, outputs = resolve_conflicts(inputs, outputs)

        # NOTE: rebinds the train_data parameter to a FANN dataset
        train_data = fann.training_data()
        train_data.set_train_data(inputs, outputs)

        for _ in range(10):
            self.configure_net()
            self.net.train_on_data(train_data, 1000, 0, 0)
            self.net.test_data(train_data)
            # Stop once every sample is within the bit-fail limit
            if self.net.get_bit_fail() == 0:
                break

    def save(self, prefix):
        """Write the network and id set to <prefix>.intent[.net]."""
        prefix += '.intent'
        self.net.save(str(prefix + '.net'))  # Must have str()
        self.ids.save(prefix)

    @classmethod
    def from_file(cls, name, prefix):
        """Alternate constructor: restore a saved intent from disk."""
        prefix += '.intent'
        self = cls(name)
        self.net = fann.neural_net()
        self.net.create_from_file(str(prefix + '.net'))  # Must have str()
        self.ids.load(prefix)
        return self
class EntityEdge(object):
    """
    Represents the left or right side of an entity (a PosIntent)

    Args:
        direction (int): -1 for left and +1 for right
        token (str): token to attach to (something like {word})
        intent_name (str): name of parent intent
    """

    def __init__(self, direction, token, intent_name):
        self.ids = IdManager(Ids)
        self.intent_name = intent_name
        self.token = token
        self.dir = direction
        # FANN network; built by configure_net() during train()
        # or restored by load()
        self.net = None

    def get_end(self, sent):
        """Exclusive end index for iterating from a position in self.dir:
        len(sent) when scanning right, -1 when scanning left."""
        return len(sent) if self.dir > 0 else -1

    def vectorize(self, sent, pos):
        """Encode the context on self.dir's side of *pos* as an input vector.

        Each known token contributes 1/distance from pos; the distance to
        the sentence edge is stored in the Ids.end slot.
        """
        unknown = 0  # counted but currently unused in this vectorization
        vector = self.ids.vector()
        end_pos = self.get_end(sent)
        for i in range(pos + self.dir, end_pos, self.dir):
            if sent[i] in self.ids:
                # Weight decays with distance from the candidate position
                self.ids.assign(vector, sent[i], 1.0 / abs(i - pos))
            else:
                unknown += 1
        self.ids.assign(vector, Ids.end, 1.0 / abs(end_pos - pos))
        return vector

    def match(self, sent, pos):
        """Return the net's confidence that *pos* borders this edge."""
        return self.net.run(self.vectorize(sent, pos))[0]

    def configure_net(self):
        """(Re)create the FANN net: inputs sized to the id set, 3 hidden
        neurons, one output; stops training on bit-fail count."""
        layers = [len(self.ids), 3, 1]
        self.net = fann.neural_net()
        self.net.create_standard_array(layers)
        self.net.set_activation_function_hidden(
            fann.SIGMOID_SYMMETRIC_STEPWISE)
        self.net.set_activation_function_output(fann.SIGMOID_STEPWISE)
        self.net.set_train_stop_function(fann.STOPFUNC_BIT)
        self.net.set_bit_fail_limit(0.1)

    def save(self, prefix):
        """Write the network and id set to <prefix>.l/.r[.net]
        depending on direction."""
        prefix += '.' + {-1: 'l', +1: 'r'}[self.dir]
        self.net.save(str(prefix + '.net'))  # Must have str()
        self.ids.save(prefix)

    def load(self, prefix):
        """Restore the network and id set saved by save().

        Raises:
            FileNotFoundError: if the .net file cannot be loaded
        """
        prefix += '.' + {-1: 'l', +1: 'r'}[self.dir]
        self.net = fann.neural_net()
        if not self.net.create_from_file(
                str(prefix + '.net')):  # Must have str()
            raise FileNotFoundError(str(prefix + '.net'))
        self.ids.load(prefix)

    def train(self, train_data):
        """Build the vocabulary from context around self.token and train.

        Positive samples are positions equal to self.token in this
        intent's sentences; negatives are every other position plus all
        positions in other intents' sentences. Retries training up to
        10 times with a fresh net until zero bit-fails.
        """
        # Collect non-placeholder tokens on this edge's side of the token
        for sent in train_data.my_sents(self.intent_name):
            if self.token in sent:
                for i in range(sent.index(self.token) + self.dir,
                               self.get_end(sent), self.dir):
                    if sent[i][0] != '{':
                        self.ids.add_token(sent[i])
        inputs, outputs = [], []

        def pollute(sent, i, out_val):
            """Simulates multiple token words in adjacent entities"""
            for j, check_token in enumerate(sent):
                d = j - i
                # sign(d) == self.dir: the placeholder lies on this side
                if int(d > 0) - int(
                        d < 0) == self.dir and check_token.startswith('{'):
                    for pol_len in range(1, 4):
                        # Replace the placeholder with 1-3 filler tokens
                        s = sent[:j] + [':0'] * pol_len + sent[j + 1:]
                        # When scanning left, the insertion shifts pos
                        p = i + (pol_len - 1) * int(self.dir < 0)
                        inputs.append(self.vectorize(s, p))
                        outputs.append([out_val])

        def add_sents(sents, out_fn):
            # One sample per (sentence, position); out_fn decides the label
            for sent in sents:
                for i, token in enumerate(sent):
                    out_val = out_fn(token)
                    inputs.append(self.vectorize(sent, i))
                    outputs.append([out_val])
                    if out_val == 1.0:
                        pollute(sent, i, 1.0)

        add_sents(train_data.my_sents(self.intent_name),
                  lambda x: float(x == self.token))
        add_sents(train_data.other_sents(self.intent_name), lambda x: 0.0)
        inputs, outputs = resolve_conflicts(inputs, outputs)

        data = fann.training_data()
        data.set_train_data(inputs, outputs)

        for _ in range(10):
            self.configure_net()
            self.net.train_on_data(data, 1000, 0, 0)
            self.net.test_data(data)
            # Stop once every sample is within the bit-fail limit
            if self.net.get_bit_fail() == 0:
                break