Exemple #1
0
    def train(self, data):
        """Build the template lookup table from training entries.

        The ``source`` of each entry is passed through ``utils.split_struct``
        and ``utils.delexicalize_struct``. Every resulting sentence is
        rendered as one string: ``<SNT>``, the element at index 1 of each of
        its items, then ``</SNT>``, joined by single spaces. The tuple of
        these strings is the lookup key; the value is the list of
        space-joined ``output`` sequences of the entry's targets.

        Args:
            data: iterable of dicts holding a ``'source'`` value and a
                ``'targets'`` iterable whose items carry an ``'output'``
                sequence of strings.

        Returns:
            The newly built ``self.model`` dict (any previous model is
            discarded).
        """
        self.model = {}
        for entry in data:
            delexicalized = utils.delexicalize_struct(
                utils.split_struct(entry['source']))

            key = tuple(
                ' '.join(['<SNT>'] + [item[1] for item in snt] + ['</SNT>'])
                for snt in delexicalized
            )

            # The key is registered even when the entry has no targets, so an
            # empty list is a possible value in the model.
            outputs = self.model.setdefault(key, [])
            outputs.extend(
                ' '.join(tgt['output']) for tgt in entry['targets'])

        return self.model
Exemple #2
0
    def predict(self, source):
        """Generate a token sequence for ``source`` from stored templates.

        ``source`` is split into sentences and delexicalized through
        ``utils``; each sentence is rendered as a ``<SNT> ... </SNT>`` string
        in the same way ``train`` builds its keys. The method then walks the
        sentence list greedily: starting from the widest window
        ``struct[start:end]``, it shrinks the window from the right until the
        window is a key of ``self.model``, picks one of the stored templates
        at random, and continues after the matched window. A single sentence
        that matches nothing is skipped.

        Args:
            source: input structure accepted by ``utils.split_struct``.

        Returns:
            list of output tokens. Tokens found in the mappings returned by
            ``self.track_entity`` are replaced by their mapped values (first
            with the per-window mapping, then with the whole-input mapping).
        """
        sentences = utils.split_struct(source)
        triples = utils.delexicalize_struct(sentences)
        struct = [
            ' '.join(['<SNT>'] + [t[1] for t in snt] + ['</SNT>'])
            for snt in triples
        ]

        target = []
        # Try to extract a full template first, then progressively shorter
        # windows starting at the same position.
        start, end = 0, len(struct)
        while start < len(struct):
            snts = tuple(struct[start:end])
            entities, _ = self.track_entity(sentences[start:end])

            # A key may exist with an empty template list (``train`` registers
            # the key before looking at the targets). Treat that as a miss:
            # randint(0, -1) would raise ValueError otherwise.
            candidates = self.model.get(snts)
            if candidates:
                pos = randint(0, len(candidates) - 1)
                template = [
                    entities[w] if w in entities else w
                    for w in candidates[pos].split()
                ]
                target.extend(template)

                # Continue right after the matched window.
                start = end
                end = len(struct)
            else:
                end -= 1

                # jump a triple if it is not on training set
                if start == end:
                    start += 1
                    end = len(struct)

        # Second replacement pass with the mapping computed over the whole
        # input rather than a single window.
        _, entitytag = self.track_entity(sentences)
        return [entitytag[w] if w in entitytag else w for w in target]
Exemple #3
0
    def load(self, path):
        """Parse the corpus at ``path`` into entries and vocabularies.

        Only entries whose ``modifiedtripleset`` has more than one element
        are used. For each lexicalization of such an entry, the ordered
        triple set is converted with ``load.snt_source`` and stripped of
        ``<SNT>``/``</SNT>`` markers; every distinct, non-blank source
        produces one data item. Its targets are built from all
        lexicalizations of the same entry whose marker-free second return
        value of ``load.snt_source`` equals this source's marker-free second
        return value.

        Args:
            path: location handed to ``parsing.run_parser``.

        Returns:
            ``(data, vocab)`` where ``data`` is a list of dicts with keys
            ``eid``, ``category``, ``size``, ``source`` and ``targets``, and
            ``vocab`` is ``{'input': [...], 'output': [...]}`` holding the
            de-duplicated tokens (plus ``'unk'``) seen on each side.
        """
        markers = ('<SNT>', '</SNT>')

        def flat(struct):
            # Drop the sentence boundary markers, keep everything else.
            return [w for w in struct if w not in markers]

        entryset = parsing.run_parser(path)

        data = []
        size = 0
        invocab = []
        outvocab = []
        for entry in entryset:
            # Inverted copy of the entry's entity map (values become keys).
            entitymap = {
                value: key
                for key, value in entry.entitymap_to_dict().items()
            }

            if len(entry.modifiedtripleset) <= 1:
                continue

            seen_sources = []
            for lex in entry.lexEntries:
                # process ordered tripleset
                raw_source, raw_delex, _ = load.snt_source(
                    lex.orderedtripleset, entitymap, {})
                source = flat(raw_source)
                delex_source = flat(raw_delex)

                # Skip sources already handled for this entry and blank ones.
                if source in seen_sources or ' '.join(source).strip() == '':
                    continue
                seen_sources.append(source)
                invocab.extend(source)

                targets = []
                for candidate in entry.lexEntries:
                    _, text, _ = load.snt_source(
                        candidate.orderedtripleset, entitymap, {})
                    if delex_source != flat(text):
                        continue

                    # One <SNT> ... </SNT> group per sentence of the target.
                    trgt_preds = []
                    for snt in utils.split_struct(text):
                        trgt_preds.extend(
                            ['<SNT>'] + [t[1] for t in snt] + ['</SNT>'])

                    targets.append({
                        'lid': candidate.lid,
                        'comment': candidate.comment,
                        'output': trgt_preds
                    })
                    outvocab.extend(trgt_preds)

                data.append({
                    'eid': entry.eid,
                    'category': entry.category,
                    'size': entry.size,
                    'source': source,
                    'targets': targets
                })
                size += len(targets)

        invocab.append('unk')
        outvocab.append('unk')

        vocab = {
            'input': list(set(invocab)),
            'output': list(set(outvocab))
        }

        print('Path:', path, 'Size: ', size)
        return data, vocab