Code Example #1
File: preprocess.py  Project: maqboolkhan/DeepNLG
    def load_simple(self, path):
        entryset = parsing.run_parser(path)

        data, size = [], 0
        invocab, outvocab = [], []

        for i, entry in enumerate(entryset):
            progress = round(float(i) / len(entryset), 2)
            print('Progress: {0}'.format(progress), end='   \r')
            try:
                # only process entries with more than one triple
                if len(entry.modifiedtripleset) > 1:
                    # process source
                    tripleset = []
                    for j, triple in enumerate(entry.modifiedtripleset):
                        striple = triple.predicate + ' ' + triple.subject + ' ' + triple.object
                        tripleset.append((j, striple))
                    # give the set of triples a fixed order by sorting them lexically (predicate - subject - object)
                    tripleset = sorted(tripleset, key=lambda x: x[1])
                    triples = [entry.modifiedtripleset[t[0]] for t in tripleset]

                    entitymap = {b: a for a, b in entry.entitymap_to_dict().items()}
                    source, _, entities = load.source(triples, entitymap, {})
                    invocab.extend(source)

                    targets = []
                    for lex in entry.lexEntries:
                        # process ordered tripleset
                        _, text, _ = load.snt_source(lex.orderedtripleset, entitymap, entities)
                        text = [w for w in text if w not in ['<SNT>', '</SNT>']]
                        trg_preds = [t[1] for t in utils.split_triples(text)]

                        target = { 'lid': lex.lid, 'comment': lex.comment, 'output': trg_preds }
                        targets.append(target)
                        outvocab.extend(trg_preds)

                    data.append({
                        'eid': entry.eid,
                        'category': entry.category,
                        'augmented': False,
                        'size': entry.size,
                        'source': source,
                        'targets': targets })
                    size += len(targets)
            except Exception:
                print('Preprocessing error...')

        invocab.append('unk')
        outvocab.append('unk')

        invocab = list(set(invocab))
        outvocab = list(set(outvocab))
        vocab = { 'input': invocab, 'output': outvocab }

        print('Path:', path, 'Size: ', size)
        return data, vocab
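A minimal usage sketch for the method above, assuming a Preprocessor class in preprocess.py that exposes load_simple and a WebNLG-style data path (both the class name and the path are hypothetical, not confirmed by the snippet):

    from preprocess import Preprocessor  # hypothetical class name

    prep = Preprocessor()
    data, vocab = prep.load_simple('data/webnlg/train')  # hypothetical path

    # each entry pairs a linearized triple source with per-lexicalization
    # predicate-order targets
    print(data[0]['source'][:5], data[0]['targets'][0]['output'])
    print(len(vocab['input']), len(vocab['output']))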
Code Example #2
File: preprocess.py  Project: xc15071347094/DeepNLG
    def load(self, path):
        entryset = parsing.run_parser(path)

        data, size = [], 0
        invocab, outvocab, surfacevocab = [], [], []
        nerrors = 0
        for i, entry in enumerate(entryset):
            progress = round(i / len(entryset), 2)
            print('Progress: {0} \t Errors: {1}'.format(
                progress, round(nerrors / len(entryset), 2)),
                  end='\r')
            entitymap = {b: a for a, b in entry.entitymap_to_dict().items()}

            visited = []
            for lex in entry.lexEntries:
                # process ordered tripleset
                source, delex_source, _ = load.snt_source(
                    lex.orderedtripleset, entitymap, {})

                if source not in visited:
                    visited.append(source)
                    invocab.extend(source)

                    targets = []
                    for lex2 in entry.lexEntries:
                        _, target, entities = load.snt_source(
                            lex2.orderedtripleset, entitymap, {})

                        if delex_source == target:
                            try:
                                template, vocab = self.extractor.extract(
                                    lex2.template)
                                surfacevocab.extend(vocab)
                                for j, word in enumerate(template):
                                    if word in entities:
                                        template[j] = entities[word]
                                target = {
                                    'lid': lex.lid,
                                    'comment': lex.comment,
                                    'output': template
                                }
                                targets.append(target)
                                outvocab.extend(template)
                            except Exception:
                                nerrors += 1
                                print('Parsing Error...')

                    data.append({
                        'eid': entry.eid,
                        'category': entry.category,
                        'size': entry.size,
                        'source': source,
                        'targets': targets
                    })
                    size += len(targets)

        invocab.append('<unk>')
        outvocab.append('<unk>')

        invocab = list(set(invocab))
        outvocab = list(set(outvocab))
        vocab = {'input': invocab, 'output': outvocab}

        print('Path:', path, 'Size: ', size)
        return data, vocab, surfacevocab
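The visited list above ensures each distinct triple ordering is emitted only once, with every lexicalization whose delexicalized ordering matches it grouped under that source as a target. A self-contained sketch of that grouping pattern, with plain strings standing in for the parsed sources and templates:

    entries = [('src-A', 'tpl-1'), ('src-B', 'tpl-2'), ('src-A', 'tpl-3')]

    visited, data = [], []
    for source, _ in entries:
        if source not in visited:
            visited.append(source)
            # collect every template whose source matches
            targets = [tpl for src, tpl in entries if src == source]
            data.append({'source': source, 'targets': targets})

    print(data)
    # [{'source': 'src-A', 'targets': ['tpl-1', 'tpl-3']},
    #  {'source': 'src-B', 'targets': ['tpl-2']}]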
Code Example #3
    def load(self, path, augment=True):
        entryset = parsing.run_parser(path)

        data, size = [], 0
        invocab, outvocab = [], []

        for i, entry in enumerate(entryset):
            progress = round(float(i) / len(entryset), 2)
            print('Progress: {0}'.format(progress), end='   \r')
            try:
                # process source
                entitymap = {
                    b: a
                    for a, b in entry.entitymap_to_dict().items()
                }
                source, _, entities = load.source(entry.modifiedtripleset,
                                                  entitymap, {})
                invocab.extend(source)

                targets = []
                for lex in entry.lexEntries:
                    # process ordered tripleset
                    text = self.tokenize(text=lex.text)

                    target = {
                        'lid': lex.lid,
                        'comment': lex.comment,
                        'output': text,
                        'text': lex.text.replace('@', ' ')
                    }
                    targets.append(target)
                    outvocab.extend(text)

                data.append({
                    'eid': entry.eid,
                    'category': entry.category,
                    'augmented': False,
                    'size': entry.size,
                    'source': source,
                    'targets': targets
                })
                size += len(targets)

                # besides the original order, sample extra permutations of the triple set (up to min(len(perm) - 1, 49), see below)
                if augment:
                    triplesize = len(entry.modifiedtripleset)
                    perm = list(permutations(entry.modifiedtripleset))
                    perm = [load.source(src, entitymap, {}) for src in perm]
                    entitylist = [w[2] for w in perm]
                    perm = [w[0] for w in perm]

                    taken = []
                    # to augment the corpus, pick the minimum between the number of permutations - 1 and 49
                    X = min(len(perm) - 1, 49)
                    for _ in range(X):
                        found = False
                        while not found and triplesize != 1:
                            pos = randint(0, len(perm) - 1)
                            src, entities = perm[pos], entitylist[pos]

                            if pos not in taken and src != source:
                                taken.append(pos)
                                found = True

                                targets = []
                                for lex in entry.lexEntries:
                                    # process ordered tripleset
                                    text = self.tokenize(text=lex.text)

                                    target = {
                                        'lid': lex.lid,
                                        'comment': lex.comment,
                                        'output': text,
                                        'text': lex.text.replace('@', ' ')
                                    }
                                    targets.append(target)
                                    outvocab.extend(text)

                                data.append({
                                    'eid': entry.eid,
                                    'category': entry.category,
                                    'augmented': True,
                                    'size': entry.size,
                                    'source': src,
                                    'targets': targets
                                })
                                size += len(targets)
            except Exception:
                print('Preprocessing error...')

        invocab.append('unk')
        outvocab.append('unk')

        invocab = list(set(invocab))
        outvocab = list(set(outvocab))
        vocab = {'input': invocab, 'output': outvocab}

        print('Path:', path, 'Size: ', size)
        return data, vocab
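The augmentation block samples distinct reorderings of the triple set, capped at 49 and always skipping the original order. A self-contained sketch of that sampling loop, with plain strings standing in for the project's triple objects:

    from itertools import permutations
    from random import randint

    source = ['t1', 't2', 't3']
    perm = [list(p) for p in permutations(source)]

    taken, picked = [], []
    X = min(len(perm) - 1, 49)
    for _ in range(X):
        found = False
        while not found and len(source) != 1:
            pos = randint(0, len(perm) - 1)
            # accept only unseen positions that differ from the original order
            if pos not in taken and perm[pos] != source:
                taken.append(pos)
                picked.append(perm[pos])
                found = True

    print(len(picked))  # up to min(len(perm) - 1, 49) augmented orderings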
Code Example #4
File: preprocess.py  Project: xc15071347094/DeepNLG
    def load_index(self, path):
        entryset = parsing.run_parser(path)

        data, size = [], 0
        invocab, outvocab = [], []

        for i, entry in enumerate(entryset):
            progress = round(float(i) / len(entryset), 2)
            print('Progress: {0}'.format(progress), end='   \r')
            try:
                # only process entries with more than one triple
                if len(entry.modifiedtripleset) > 1:
                    # process source
                    tripleset = []
                    for j, triple in enumerate(entry.modifiedtripleset):
                        striple = triple.predicate + ' ' + triple.subject + ' ' + triple.object
                        tripleset.append((j, striple))
                    # give the set of triples a fixed order by sorting them lexically (predicate - subject - object)
                    tripleset = sorted(tripleset, key=lambda x: x[1])
                    triples = [
                        entry.modifiedtripleset[t[0]] for t in tripleset
                    ]

                    entitymap = {
                        b: a
                        for a, b in entry.entitymap_to_dict().items()
                    }
                    source, _, entities = load.source(triples, entitymap, {})
                    invocab.extend(source)

                    targets = []
                    for lex in entry.lexEntries:
                        # process ordered tripleset
                        trg_idx = []
                        orderedtripleset = [
                            item for sublist in lex.orderedtripleset
                            for item in sublist
                        ]
                        for sorted_triple in orderedtripleset:
                            for j, src_triple in enumerate(triples):
                                if (sorted_triple.subject == src_triple.subject
                                        and sorted_triple.predicate == src_triple.predicate
                                        and sorted_triple.object == src_triple.object
                                        and str(j + 1) not in trg_idx):
                                    trg_idx.append(str(j + 1))

                        target = {
                            'lid': lex.lid,
                            'comment': lex.comment,
                            'output': trg_idx
                        }
                        targets.append(target)
                        outvocab.extend(trg_idx)

                    data.append({
                        'eid': entry.eid,
                        'category': entry.category,
                        'augmented': False,
                        'size': entry.size,
                        'source': source,
                        'targets': targets
                    })
                    size += len(targets)
            except Exception:
                print('Preprocessing error...')

        invocab.append('unk')
        outvocab.append('unk')

        invocab = list(set(invocab))
        outvocab = list(set(outvocab))
        vocab = {'input': invocab, 'output': outvocab}

        print('Path:', path, 'Size: ', size)
        return data, vocab
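load_index reduces each lexicalization to the 1-based positions of its triples in the sorted source list, matching on subject, predicate, and object. A sketch of that alignment, with plain tuples in place of the project's triple objects (tuple equality replaces the field-by-field comparison above):

    triples = [('A', 'capital', 'B'), ('A', 'leader', 'C')]   # sorted source order
    ordered = [('A', 'leader', 'C'), ('A', 'capital', 'B')]   # order used by one lexicalization

    trg_idx = []
    for sorted_triple in ordered:
        for j, src_triple in enumerate(triples):
            if sorted_triple == src_triple and str(j + 1) not in trg_idx:
                trg_idx.append(str(j + 1))

    print(trg_idx)  # ['2', '1']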
Code Example #5
File: preprocess.py  Project: xc15071347094/DeepNLG
    def process(self, entry_path):
        entryset = parser.run_parser(entry_path)

        data, size = [], 0
        invocab, outvocab = [], []

        for i, entry in enumerate(entryset):
            progress = round(float(i) / len(entryset), 2)
            print('Progress: {0}'.format(progress), end='   \r')
            # process source
            entitymap = entry.entitymap_to_dict()

            for lex in entry.lexEntries:
                # extract the lexicalization template
                template = self.temp_extractor.extract(lex.template)[0]
                template = ' '.join(template).split()

                refcount = {}
                for reference in lex.references:
                    tag = reference.tag
                    if tag not in refcount:
                        refcount[tag] = 0
                    refcount[tag] += 1

                    entity = '_'.join(reference.entity.split())
                    if entity != '':
                        refex = self.tokenize(reference.refex)

                        isDigit = entity.replace('.', '').strip().isdigit()
                        regex = '([0-9]{4})-([0-9]{2})-([0-9]{2})'
                        isDate = len(re.findall(regex, entity)) > 0
                        if entity[0] not in ['\'', '\"'] and not isDigit and not isDate:
                            context, pos = [], 0
                            for k, w in enumerate(template):
                                if w.strip() == tag.strip():
                                    pos += 1
                                    if pos == refcount[tag]:
                                        pre_context = copy.copy(context)
                                        pos_context = []
                                        for j in range(k + 1, len(template)):
                                            if template[j].strip() not in entitymap:
                                                pos_context.append(template[j].lower())
                                            else:
                                                pos_context.append('_'.join(entitymap[template[j]].split()))

                                        data.append({
                                            'entity': entity,
                                            'category': entry.category,
                                            'pre_context': pre_context,
                                            'pos_context': pos_context,
                                            'refex': refex
                                        })
                                        size += 1
                                        invocab.extend(pre_context)
                                        invocab.extend(pos_context)
                                        invocab.append(entity)
                                        outvocab.extend(refex)
                                else:
                                    if w.strip() not in entitymap:
                                        context.append(w.lower())
                                    else:
                                        context.append('_'.join(entitymap[w].split()))

        invocab.append('unk')
        outvocab.append('unk')
        invocab.append('eos')
        outvocab.append('eos')

        invocab = list(set(invocab))
        outvocab = list(set(outvocab))
        vocab = { 'input': invocab, 'output': outvocab }

        print('Path:', entry_path, 'Size: ', size)
        return data, vocab
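The inner loop in process splits the template around the k-th occurrence of the current reference tag: pre_context is everything collected so far, pos_context everything after, with entity tags resolved through the entity map. A toy reconstruction of that split (the entity map and template below are invented for illustration, and an early break simplifies the original's full scan):

    entitymap = {'ENT-1': 'Alan Turing', 'ENT-2': 'United Kingdom'}
    template = ['ENT-1', 'was', 'born', 'in', 'ENT-2', '.']
    tag, k = 'ENT-2', 1  # target the first occurrence of ENT-2

    context, pos = [], 0
    for i, w in enumerate(template):
        if w == tag:
            pos += 1
            if pos == k:
                pre_context = list(context)
                pos_context = ['_'.join(entitymap[t].split()) if t in entitymap
                               else t.lower() for t in template[i + 1:]]
                break
        else:
            # resolve other entity tags; lowercase ordinary words
            context.append('_'.join(entitymap[w].split()) if w in entitymap
                           else w.lower())

    print(pre_context)  # ['Alan_Turing', 'was', 'born', 'in']
    print(pos_context)  # ['.']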
Code Example #6
File: preprocess.py  Project: maqboolkhan/DeepNLG
    def load(self, path):
        def flat(struct):
            return [w for w in struct if w not in ['<SNT>', '</SNT>']]

        entryset = parsing.run_parser(path)

        data, size = [], 0
        invocab, outvocab = [], []
        for entry in entryset:
            entitymap = {b: a for a, b in entry.entitymap_to_dict().items()}

            if len(entry.modifiedtripleset) > 1:
                visited = []
                for lex in entry.lexEntries:
                    # process ordered tripleset
                    source, delex_source, _ = load.snt_source(
                        lex.orderedtripleset, entitymap, {})
                    source, delex_source = flat(source), flat(delex_source)

                    if source not in visited and ' '.join(
                            source).strip() != '':
                        visited.append(source)
                        invocab.extend(source)

                        targets = []
                        for lex2 in entry.lexEntries:
                            _, text, _ = load.snt_source(
                                lex2.orderedtripleset, entitymap, {})
                            flatten = flat(text)
                            if delex_source == flatten:
                                trgt_preds = []
                                for snt in utils.split_struct(text):
                                    trgt_preds.append('<SNT>')
                                    trgt_preds.extend([t[1] for t in snt])
                                    trgt_preds.append('</SNT>')
                                target = {
                                    'lid': lex2.lid,
                                    'comment': lex2.comment,
                                    'output': trgt_preds
                                }
                                targets.append(target)
                                outvocab.extend(trgt_preds)

                        data.append({
                            'eid': entry.eid,
                            'category': entry.category,
                            'size': entry.size,
                            'source': source,
                            'targets': targets
                        })
                        size += len(targets)

        invocab.append('unk')
        outvocab.append('unk')

        invocab = list(set(invocab))
        outvocab = list(set(outvocab))
        vocab = {'input': invocab, 'output': outvocab}

        print('Path:', path, 'Size: ', size)
        return data, vocab
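The target side here is a sequence of predicates bracketed per sentence with <SNT> markers. A sketch of how trgt_preds is assembled, with a toy stand-in for the sentence structure returned by utils.split_struct (whose exact output format is not shown above):

    sentences = [
        [('A', 'capital', 'B')],                      # first sentence: one triple
        [('A', 'leader', 'C'), ('A', 'area', 'D')],   # second sentence: two triples
    ]

    trgt_preds = []
    for snt in sentences:
        trgt_preds.append('<SNT>')
        trgt_preds.extend(t[1] for t in snt)  # keep only the predicates
        trgt_preds.append('</SNT>')

    print(trgt_preds)
    # ['<SNT>', 'capital', '</SNT>', '<SNT>', 'leader', 'area', '</SNT>']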