# Example 1
    def test_save_load(self):
        """Round-trip an IdManager through save/load and verify that both
        instances produce identical vectors for the same assignment."""
        ids1 = IdManager()
        ids1.add_token('hi')
        ids1.add_token('hello')

        # makedirs with exist_ok=True replaces the isdir()/mkdir() pair,
        # avoiding the check-then-act race (TOCTOU) of the LBYL form.
        os.makedirs('temp', exist_ok=True)
        ids1.save('temp/temp')

        ids2 = IdManager()
        ids2.load('temp/temp')

        vec1 = ids1.vector()
        vec2 = ids2.vector()
        # Assign through the same token; if load restored the token->index
        # mapping correctly, both vectors end up identical.
        ids1.assign(vec1, 'hello', 3)
        ids2.assign(vec2, 'hello', 3)

        assert vec1 == vec2
# Example 2
class SimpleIntent(object):
    """General intent used to match sentences or phrases"""
    # Confidence assigned to training sentences polluted with filler tokens
    LENIENCE = 0.6

    def __init__(self, name=''):
        self.name = name
        self.ids = IdManager(Ids)  # maps tokens to vector positions
        self.net = None  # fann network; built in configure_net()/from_file()

    def match(self, sent):
        """Return the network's confidence for *sent*, clamped to >= 0."""
        return max(0, self.net.run(self.vectorize(sent))[0])

    def vectorize(self, sent):
        """Encode the token list *sent* as the network's input vector.

        Known tokens set their slot to 1.0; unknown tokens are counted and
        folded into a single unknown-ratio feature.  Several length features
        (len/1 .. len/4) are appended for non-empty sentences.
        """
        vector = self.ids.vector()
        unknown = 0
        for token in sent:
            if token in self.ids:
                self.ids.assign(vector, token, 1.0)
            else:
                unknown += 1
        if len(sent) > 0:
            self.ids.assign(vector, Ids.unknown_tokens,
                            unknown / float(len(sent)))
            # Written as float divisions for consistency (true division
            # already yields floats here).
            self.ids.assign(vector, Ids.w_1, len(sent) / 1.0)
            self.ids.assign(vector, Ids.w_2, len(sent) / 2.0)
            self.ids.assign(vector, Ids.w_3, len(sent) / 3.0)
            self.ids.assign(vector, Ids.w_4, len(sent) / 4.0)
        return vector

    def configure_net(self):
        """(Re)create the FANN network: len(ids) inputs -> 10 hidden -> 1 out."""
        self.net = fann.neural_net()
        self.net.create_standard_array([len(self.ids), 10, 1])
        self.net.set_activation_function_hidden(
            fann.SIGMOID_SYMMETRIC_STEPWISE)
        self.net.set_activation_function_output(
            fann.SIGMOID_SYMMETRIC_STEPWISE)
        # Stop when every output is within the bit-fail limit of its target
        self.net.set_train_stop_function(fann.STOPFUNC_BIT)
        self.net.set_bit_fail_limit(0.1)

    def train(self, train_data):
        """Build a training set from *train_data* and train the network.

        Positive examples are this intent's own sentences (plus per-word and
        filler-polluted variants); negatives are all other intents' sentences.
        """
        for sent in train_data.my_sents(self.name):
            self.ids.add_sent(sent)

        inputs = []
        outputs = []

        def add(vec, out):
            # Record one (vectorized sentence, expected confidence) pair
            inputs.append(self.vectorize(vec))
            outputs.append([out])

        def pollute(sent, p):
            """Insert filler tokens at position *p*; expect reduced confidence."""
            sent = sent[:]
            for _ in range(int((len(sent) + 2) / 3)):
                sent.insert(p, ':null:')
            add(sent, self.LENIENCE)

        def weight(sent):
            """Add each word alone, weighted by its share of len(word)**3."""
            def calc_weight(w):
                return pow(len(w), 3.0)

            total_weight = 0.0
            for word in sent:
                total_weight += calc_weight(word)
            for word in sent:
                # Entity placeholders like {word} carry no weight on their own.
                # (Renamed from `weight` to stop shadowing the closure name.)
                word_weight = 0 if word.startswith('{') else calc_weight(word)
                add([word], word_weight / total_weight)

        for sent in train_data.my_sents(self.name):
            add(sent, 1.0)
            weight(sent)
            # Only pollute sentences with no special ':'-prefixed tokens
            if not any(word[0] == ':' for word in sent):
                pollute(sent, 0)
                pollute(sent, len(sent))

        for sent in train_data.other_sents(self.name):
            add(sent, 0.0)
        # The empty sentence is always a negative example
        add([], 0.0)

        inputs, outputs = resolve_conflicts(inputs, outputs)

        # Named `data` (not rebinding the `train_data` parameter) to match
        # EntityEdge.train and keep the parameter readable throughout.
        data = fann.training_data()
        data.set_train_data(inputs, outputs)

        # Retrain from scratch up to 10 times; stop once every sample is
        # within the bit-fail limit.
        for _ in range(10):
            self.configure_net()
            self.net.train_on_data(data, 1000, 0, 0)
            self.net.test_data(data)
            if self.net.get_bit_fail() == 0:
                break

    def save(self, prefix):
        """Persist the network and id mapping under *prefix* + '.intent'."""
        prefix += '.intent'
        self.net.save(str(prefix + '.net'))  # Must have str()
        self.ids.save(prefix)

    @classmethod
    def from_file(cls, name, prefix):
        """Alternate constructor: load a previously saved intent."""
        prefix += '.intent'
        self = cls(name)
        self.net = fann.neural_net()
        self.net.create_from_file(str(prefix + '.net'))  # Must have str()
        self.ids.load(prefix)
        return self
# Example 3
class EntityEdge(object):
    """
    Represents the left or right side of an entity (a PosIntent)

    Args:
        direction (int): -1 for left and +1 for right
        token (str): token to attach to (something like {word})
        intent_name (str): name of parent intent
    """
    def __init__(self, direction, token, intent_name):
        self.ids = IdManager(Ids)  # token -> vector-slot mapping
        self.intent_name = intent_name
        self.token = token
        self.dir = direction  # -1 scans left of a position, +1 scans right
        self.net = None  # fann network; built in configure_net() or load()

    def get_end(self, sent):
        # Exclusive end index for a range() stepping by self.dir:
        # len(sent) when scanning right, -1 when scanning left.
        return len(sent) if self.dir > 0 else -1

    def vectorize(self, sent, pos):
        """Encode the tokens on self.dir's side of *pos* as an input vector.

        Known tokens are weighted by 1/distance from *pos* (closer tokens
        weigh more); a special end feature encodes 1/distance to the
        sentence boundary.
        """
        unknown = 0
        vector = self.ids.vector()
        end_pos = self.get_end(sent)
        for i in range(pos + self.dir, end_pos, self.dir):
            if sent[i] in self.ids:
                self.ids.assign(vector, sent[i], 1.0 / abs(i - pos))
            else:
                # NOTE(review): `unknown` is counted but never used below —
                # looks like dead code or a dropped feature; confirm.
                unknown += 1
        self.ids.assign(vector, Ids.end, 1.0 / abs(end_pos - pos))
        return vector

    def match(self, sent, pos):
        # Network confidence that *pos* is this edge of the entity
        return self.net.run(self.vectorize(sent, pos))[0]

    def configure_net(self):
        # len(ids) inputs -> 3 hidden -> 1 output
        layers = [len(self.ids), 3, 1]

        self.net = fann.neural_net()
        self.net.create_standard_array(layers)
        self.net.set_activation_function_hidden(
            fann.SIGMOID_SYMMETRIC_STEPWISE)
        self.net.set_activation_function_output(fann.SIGMOID_STEPWISE)
        # Stop training once every output is within the bit-fail limit
        self.net.set_train_stop_function(fann.STOPFUNC_BIT)
        self.net.set_bit_fail_limit(0.1)

    def save(self, prefix):
        # Suffix encodes the side: '.l' for left (-1), '.r' for right (+1)
        prefix += '.' + {-1: 'l', +1: 'r'}[self.dir]
        self.net.save(str(prefix + '.net'))  # Must have str()
        self.ids.save(prefix)

    def load(self, prefix):
        """Load the network and id mapping saved by save().

        Raises:
            FileNotFoundError: if the '.net' file cannot be loaded
        """
        prefix += '.' + {-1: 'l', +1: 'r'}[self.dir]
        self.net = fann.neural_net()
        if not self.net.create_from_file(
                str(prefix + '.net')):  # Must have str()
            raise FileNotFoundError(str(prefix + '.net'))
        self.ids.load(prefix)

    def train(self, train_data):
        """Train the edge detector on *train_data*'s sentences.

        Positive examples are positions of self.token in this intent's own
        sentences; all other positions (and other intents' sentences) are
        negatives.
        """
        # Register every plain token on our side of self.token
        for sent in train_data.my_sents(self.intent_name):
            if self.token in sent:
                for i in range(
                        sent.index(self.token) + self.dir, self.get_end(sent),
                        self.dir):
                    if sent[i][0] != '{':
                        self.ids.add_token(sent[i])

        inputs, outputs = [], []

        def pollute(sent, i, out_val):
            """Simulates multiple token words in adjacent entities"""
            for j, check_token in enumerate(sent):
                d = j - i
                # sign(d) == self.dir: check_token lies on our side of i
                if int(d > 0) - int(
                        d < 0) == self.dir and check_token.startswith('{'):
                    for pol_len in range(1, 4):
                        # Replace the placeholder with pol_len filler tokens
                        s = sent[:j] + [':0'] * pol_len + sent[j + 1:]
                        # Shift i right by the inserted amount when the
                        # insertion happens on our left
                        p = i + (pol_len - 1) * int(self.dir < 0)
                        inputs.append(self.vectorize(s, p))
                        outputs.append([out_val])

        def add_sents(sents, out_fn):
            # Add every position of every sentence, labeled by out_fn(token)
            for sent in sents:
                for i, token in enumerate(sent):
                    out_val = out_fn(token)
                    inputs.append(self.vectorize(sent, i))
                    outputs.append([out_val])
                    if out_val == 1.0:
                        pollute(sent, i, 1.0)

        add_sents(train_data.my_sents(self.intent_name),
                  lambda x: float(x == self.token))
        add_sents(train_data.other_sents(self.intent_name), lambda x: 0.0)
        inputs, outputs = resolve_conflicts(inputs, outputs)

        data = fann.training_data()
        data.set_train_data(inputs, outputs)

        # Retrain from scratch up to 10 times; stop once every sample is
        # within the bit-fail limit
        for _ in range(10):
            self.configure_net()
            self.net.train_on_data(data, 1000, 0, 0)
            self.net.test_data(data)
            if self.net.get_bit_fail() == 0:
                break