Ejemplo n.º 1
0
    def structing_analysis(self, structing):
        """Score each gold entry by how many of its source-triple predicates
        were recovered in the corresponding structured output.

        Each slot of ``structing[i]`` may be matched at most once; the count
        is stored on the entry under the key ``'structing'``.
        """
        for idx, entry in enumerate(self.gold):
            source_triples = utils.split_triples(entry['source'])

            matched, used_slots = 0, []
            for src_triple in source_triples:
                for slot, pred in enumerate(structing[idx]):
                    if pred == src_triple[1] and slot not in used_slots:
                        matched += 1
                        used_slots.append(slot)
            # How many predicates in the modified tripleset are present in the result?
            entry['structing'] = matched
        return self.gold
Ejemplo n.º 2
0
    def train(self, data):
        """Build a frequency model keyed by the source predicate tuple.

        For every entry, the tuple of source predicates maps to a Counter of
        the space-joined target predicate sequences observed for it.
        """
        self.model = {}
        for entry in data:
            triples = utils.split_triples(entry['source'])
            key = tuple(t[1] for t in triples)
            bucket = self.model.setdefault(key, [])
            for target in entry['targets']:
                bucket.append(' '.join(target['output']))

        # Collapse each list of realisations into occurrence counts.
        for key in self.model:
            self.model[key] = Counter(self.model[key])
        return self.model
Ejemplo n.º 3
0
def run(out_path, entries_path, task):
    """Pair each pipeline output with its entry and yield the converted
    input for the next pipeline stage, selected by *task*.

    ``task`` is 'ordering', 'structing', or anything else (lexicalisation).
    """
    with open(out_path) as f:
        outputs = [line.split() for line in f.read().split('\n')]

    with open(entries_path) as f:
        raw_lines = f.read().split('\n')
    parsed = [utils.split_triples(line.split()) for line in raw_lines]

    for i, triples in enumerate(parsed):
        if task == 'ordering':
            yield orderout2structin(ordering_out=outputs[i], triples=triples)
        elif task == 'structing':
            yield structout2lexin(struct_out=outputs[i], triples=triples)
        else:
            yield lexout2regin(lex_out=outputs[i], triples=triples)
Ejemplo n.º 4
0
    def __call__(self, in_path, order_path, out_path):
        """Realize every entry from *in_path* using entity maps built from
        the ordered triples in *order_path*, writing one line per entry to
        *out_path*."""
        with open(in_path) as f:
            entries = f.read().split('\n')

        with open(order_path) as f:
            ordered = [utils.split_triples(line.split())
                       for line in f.read().split('\n')]

        maps = [utils.entity_mapping(triples) for triples in ordered]

        realized = []
        for i, entry in enumerate(entries):
            print('Progress: ', round(i / len(entries), 2), end='\r')
            realized.append(self.realize(entry, maps[i]))

        with open(out_path, 'w') as f:
            f.write('\n'.join(' '.join(tokens) for tokens in realized))
Ejemplo n.º 5
0
    def predict(self, source):
        """Randomly partition the source triples into sentence spans and
        return their predicates wrapped in <SNT> ... </SNT> markers."""
        triples = utils.split_triples(source)
        predicates = [t[1] for t in triples]

        # Draw random cut points until the whole sequence is covered.
        spans = []
        lo, hi = 0, -1
        while hi < len(triples) and len(triples) > 0:
            hi = randint(lo + 1, len(triples))
            spans.append((lo, hi))
            lo = hi

        struct = []
        for lo, hi in spans:
            struct.append('<SNT>')
            struct.extend(predicates[lo:hi])
            struct.append('</SNT>')
        return struct
Ejemplo n.º 6
0
    def load(self, path, augment=True):
        """Parse the corpus at *path* into training data and vocabularies.

        Each data item pairs a linearised source tripleset with the target
        predicate sequences of its lexicalisations.  When *augment* is True,
        up to min(len(permutations) - 1, 49) random permutations of each
        tripleset are added as extra ('augmented': True) entries.

        Returns:
            tuple: (data, {'input': invocab, 'output': outvocab}), where the
            vocab lists are deduplicated and always contain 'unk'.
        """
        entryset = parsing.run_parser(path)

        data, size = [], 0
        invocab, outvocab = [], []

        for i, entry in enumerate(entryset):
            progress = round(float(i) / len(entryset), 2)
            print('Progress: {0}'.format(progress), end='   \r')
            try:
                # Only triplesets with more than one triple are useful here.
                if len(entry.modifiedtripleset) > 1:
                    # Invert the entity map: surface form -> entity tag.
                    entitymap = {
                        b: a
                        for a, b in entry.entitymap_to_dict().items()
                    }
                    source, _, entities = load.source(entry.modifiedtripleset,
                                                      entitymap, {})
                    invocab.extend(source)

                    targets = []
                    for lex in entry.lexEntries:
                        # Linearise the ordered tripleset, then drop sentence
                        # boundary markers before extracting predicates.
                        _, text, _ = load.snt_source(lex.orderedtripleset,
                                                     entitymap, entities)
                        text = [
                            w for w in text if w not in ['<SNT>', '</SNT>']
                        ]
                        trg_preds = [t[1] for t in utils.split_triples(text)]

                        target = {
                            'lid': lex.lid,
                            'comment': lex.comment,
                            'output': trg_preds
                        }
                        targets.append(target)
                        outvocab.extend(trg_preds)

                    data.append({
                        'eid': entry.eid,
                        'category': entry.category,
                        'augmented': False,
                        'size': entry.size,
                        'source': source,
                        'targets': targets
                    })
                    size += len(targets)

                    # Augment: keep the original order and sample distinct
                    # permutations of the tripleset as additional entries.
                    if augment:
                        triplesize = len(entry.modifiedtripleset)
                        perm = list(permutations(entry.modifiedtripleset))
                        perm = [
                            load.source(src, entitymap, {}) for src in perm
                        ]
                        entitylist = [w[2] for w in perm]
                        perm = [w[0] for w in perm]

                        taken = []
                        # Pick the minimum between the number of
                        # permutations - 1 and 49 extra samples.
                        X = min(len(perm) - 1, 49)
                        for _ in range(X):
                            found = False
                            while not found and triplesize != 1:
                                pos = randint(0, len(perm) - 1)
                                src, entities = perm[pos], entitylist[pos]

                                # Accept only unseen permutations that differ
                                # from the original source order.
                                if pos not in taken and src != source:
                                    taken.append(pos)
                                    found = True

                                    targets = []
                                    for lex in entry.lexEntries:
                                        # process ordered tripleset
                                        _, text, _ = load.snt_source(
                                            lex.orderedtripleset, entitymap,
                                            entities)
                                        text = [
                                            w for w in text
                                            if w not in ['<SNT>', '</SNT>']
                                        ]
                                        trg_preds = [
                                            t[1]
                                            for t in utils.split_triples(text)
                                        ]

                                        target = {
                                            'lid': lex.lid,
                                            'comment': lex.comment,
                                            'output': trg_preds
                                        }
                                        targets.append(target)
                                        outvocab.extend(trg_preds)

                                    data.append({
                                        'eid': entry.eid,
                                        'category': entry.category,
                                        'augmented': True,
                                        'size': entry.size,
                                        'source': src,
                                        'targets': targets
                                    })
                                    size += len(targets)
            except Exception:
                # Best-effort: skip malformed entries.  Narrowed from a bare
                # `except:` so KeyboardInterrupt/SystemExit still propagate.
                print('Preprocessing error...')

        invocab.append('unk')
        outvocab.append('unk')

        invocab = list(set(invocab))
        outvocab = list(set(outvocab))
        vocab = {'input': invocab, 'output': outvocab}

        print('Path:', path, 'Size: ', size)
        return data, vocab
Ejemplo n.º 7
0
    def load_simple(self, path):
        """Parse the corpus at *path* into (data, vocab) without augmentation.

        Unlike ``load``, the source triples are first put into a fixed,
        automatically sorted order (by "predicate subject object" string)
        so the model sees a canonical triple order.

        Returns:
            tuple: (data, {'input': invocab, 'output': outvocab}), where the
            vocab lists are deduplicated and always contain 'unk'.
        """
        entryset = parsing.run_parser(path)

        data, size = [], 0
        invocab, outvocab = [], []

        for i, entry in enumerate(entryset):
            progress = round(float(i) / len(entryset), 2)
            print('Progress: {0}'.format(progress), end='   \r')
            try:
                # Only triplesets with more than one triple are useful here.
                if len(entry.modifiedtripleset) > 1:
                    # Build (original_index, "pred subj obj") pairs.  The
                    # inner index is named `j` to avoid shadowing the outer
                    # loop variable `i`.
                    tripleset = []
                    for j, triple in enumerate(entry.modifiedtripleset):
                        striple = triple.predicate + ' ' + triple.subject + ' ' + triple.object
                        tripleset.append((j, striple))
                    # Fix the order by sorting the triples automatically
                    # (predicate - subject - object).
                    tripleset = sorted(tripleset, key=lambda x: x[1])
                    triples = [
                        entry.modifiedtripleset[t[0]] for t in tripleset
                    ]

                    # Invert the entity map: surface form -> entity tag.
                    entitymap = {
                        b: a
                        for a, b in entry.entitymap_to_dict().items()
                    }
                    source, _, entities = load.source(triples, entitymap, {})
                    invocab.extend(source)

                    targets = []
                    for lex in entry.lexEntries:
                        # Linearise the ordered tripleset, then drop sentence
                        # boundary markers before extracting predicates.
                        _, text, _ = load.snt_source(lex.orderedtripleset,
                                                     entitymap, entities)
                        text = [
                            w for w in text if w not in ['<SNT>', '</SNT>']
                        ]
                        trg_preds = [t[1] for t in utils.split_triples(text)]

                        target = {
                            'lid': lex.lid,
                            'comment': lex.comment,
                            'output': trg_preds
                        }
                        targets.append(target)
                        outvocab.extend(trg_preds)

                    data.append({
                        'eid': entry.eid,
                        'category': entry.category,
                        'augmented': False,
                        'size': entry.size,
                        'source': source,
                        'targets': targets
                    })
                    size += len(targets)
            except Exception:
                # Best-effort: skip malformed entries.  Narrowed from a bare
                # `except:` so KeyboardInterrupt/SystemExit still propagate.
                print('Preprocessing error...')

        invocab.append('unk')
        outvocab.append('unk')

        invocab = list(set(invocab))
        outvocab = list(set(outvocab))
        vocab = {'input': invocab, 'output': outvocab}

        print('Path:', path, 'Size: ', size)
        return data, vocab