def load_simple(self, path):
    """Load entries, sorting the input triples into a fixed order and
    extracting the predicates of each lexicalization as the target."""
    entryset = parsing.run_parser(path)

    data, size = [], 0
    invocab, outvocab = [], []
    for i, entry in enumerate(entryset):
        progress = round(float(i) / len(entryset), 2)
        print('Progress: {0}'.format(progress), end=' \r')
        try:
            # only entries with more than one triple
            if len(entry.modifiedtripleset) > 1:
                # process source
                tripleset = []
                for j, triple in enumerate(entry.modifiedtripleset):
                    striple = triple.predicate + ' ' + triple.subject + ' ' + triple.object
                    tripleset.append((j, striple))
                # give the triples a fixed order by sorting them on their
                # 'predicate subject object' string
                tripleset = sorted(tripleset, key=lambda x: x[1])
                triples = [entry.modifiedtripleset[t[0]] for t in tripleset]

                entitymap = {b: a for a, b in entry.entitymap_to_dict().items()}
                source, _, entities = load.source(triples, entitymap, {})
                invocab.extend(source)

                targets = []
                for lex in entry.lexEntries:
                    # process ordered tripleset
                    _, text, _ = load.snt_source(lex.orderedtripleset,
                                                 entitymap, entities)
                    text = [w for w in text if w not in ['<SNT>', '</SNT>']]
                    trg_preds = [t[1] for t in utils.split_triples(text)]

                    target = {
                        'lid': lex.lid,
                        'comment': lex.comment,
                        'output': trg_preds
                    }
                    targets.append(target)
                    outvocab.extend(trg_preds)

                data.append({
                    'eid': entry.eid,
                    'category': entry.category,
                    'augmented': False,
                    'size': entry.size,
                    'source': source,
                    'targets': targets
                })
                size += len(targets)
        except Exception:
            print('Preprocessing error...')

    invocab.append('unk')
    outvocab.append('unk')
    invocab = list(set(invocab))
    outvocab = list(set(outvocab))
    vocab = {'input': invocab, 'output': outvocab}

    print('Path:', path, 'Size: ', size)
    return data, vocab
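
# Illustrative sketch (hypothetical helper, not called by the loaders above):
# the deterministic ordering in `load_simple` keys each triple on its
# 'predicate subject object' string, so the source order no longer depends on
# the order the triples arrive in. Assumes triple objects expose .predicate,
# .subject and .object, as entry.modifiedtripleset items do above.
def _sort_triples_sketch(triples):
    return sorted(triples,
                  key=lambda t: ' '.join([t.predicate, t.subject, t.object]))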
def load(self, path):
    """Load entries, pairing each distinct ordered source with the surface
    templates of the lexicalizations that share its structure."""
    entryset = parsing.run_parser(path)

    data, size = [], 0
    invocab, outvocab, surfacevocab = [], [], []

    nerrors = 0
    for i, entry in enumerate(entryset):
        progress = round(i / len(entryset), 2)
        print('Progress: {0} \t Errors: {1}'.format(
            progress, round(nerrors / len(entryset), 2)), end='\r')

        entitymap = {b: a for a, b in entry.entitymap_to_dict().items()}

        visited = []
        for lex in entry.lexEntries:
            # process ordered tripleset
            source, delex_source, _ = load.snt_source(
                lex.orderedtripleset, entitymap, {})

            if source not in visited:
                visited.append(source)
                invocab.extend(source)

                targets = []
                for lex2 in entry.lexEntries:
                    _, target, entities = load.snt_source(
                        lex2.orderedtripleset, entitymap, {})
                    if delex_source == target:
                        try:
                            template, lexvocab = self.extractor.extract(
                                lex2.template)
                            surfacevocab.extend(lexvocab)

                            # relexicalize: map entity tags in the template
                            # back to their surface forms
                            for j, word in enumerate(template):
                                if word in entities:
                                    template[j] = entities[word]

                            target = {
                                'lid': lex2.lid,
                                'comment': lex2.comment,
                                'output': template
                            }
                            targets.append(target)
                            outvocab.extend(template)
                        except Exception:
                            nerrors += 1
                            print('Parsing Error...')

                data.append({
                    'eid': entry.eid,
                    'category': entry.category,
                    'size': entry.size,
                    'source': source,
                    'targets': targets
                })
                size += len(targets)

    invocab.append('<unk>')
    outvocab.append('<unk>')
    invocab = list(set(invocab))
    outvocab = list(set(outvocab))
    vocab = {'input': invocab, 'output': outvocab}

    print('Path:', path, 'Size: ', size)
    return data, vocab, surfacevocab
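
# Illustrative sketch (hypothetical helper, not called by the loaders above):
# the relexicalization step in `load` replaces entity tags in an extracted
# template with their surface forms, as returned by load.snt_source.
def _relexicalize_sketch(template, entities):
    # entities: tag -> surface form, e.g. {'PATIENT-1': 'Alan_Bean'}
    # (example mapping is hypothetical); unknown words pass through unchanged
    return [entities.get(word, word) for word in template]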
def load(self, path, augment=True):
    """Load entries and, optionally, augment the corpus with up to 49 extra
    permutations of each input tripleset."""
    entryset = parsing.run_parser(path)

    data, size = [], 0
    invocab, outvocab = [], []
    for i, entry in enumerate(entryset):
        progress = round(float(i) / len(entryset), 2)
        print('Progress: {0}'.format(progress), end=' \r')
        try:
            # only entries with more than one triple
            if len(entry.modifiedtripleset) > 1:
                # process source
                entitymap = {
                    b: a for a, b in entry.entitymap_to_dict().items()
                }
                source, _, entities = load.source(entry.modifiedtripleset,
                                                  entitymap, {})
                invocab.extend(source)

                targets = []
                for lex in entry.lexEntries:
                    # process ordered tripleset
                    _, text, _ = load.snt_source(lex.orderedtripleset,
                                                 entitymap, entities)
                    text = [w for w in text if w not in ['<SNT>', '</SNT>']]
                    trg_preds = [t[1] for t in utils.split_triples(text)]

                    target = {
                        'lid': lex.lid,
                        'comment': lex.comment,
                        'output': trg_preds
                    }
                    targets.append(target)
                    outvocab.extend(trg_preds)

                data.append({
                    'eid': entry.eid,
                    'category': entry.category,
                    'augmented': False,
                    'size': entry.size,
                    'source': source,
                    'targets': targets
                })
                size += len(targets)

                # keep the original order and add N extra permutations,
                # where N = min(len(permutations) - 1, 49)
                if augment:
                    triplesize = len(entry.modifiedtripleset)
                    perm = list(permutations(entry.modifiedtripleset))
                    perm = [load.source(src, entitymap, {}) for src in perm]
                    entitylist = [w[2] for w in perm]
                    perm = [w[0] for w in perm]

                    taken = []
                    X = min(len(perm) - 1, 49)
                    for _ in range(X):
                        # draw a permutation that has not been taken yet and
                        # differs from the original source
                        found = False
                        while not found and triplesize != 1:
                            pos = randint(0, len(perm) - 1)
                            src, entities = perm[pos], entitylist[pos]
                            if pos not in taken and src != source:
                                taken.append(pos)
                                found = True

                        targets = []
                        for lex in entry.lexEntries:
                            # process ordered tripleset
                            _, text, _ = load.snt_source(
                                lex.orderedtripleset, entitymap, entities)
                            text = [
                                w for w in text
                                if w not in ['<SNT>', '</SNT>']
                            ]
                            trg_preds = [
                                t[1] for t in utils.split_triples(text)
                            ]

                            target = {
                                'lid': lex.lid,
                                'comment': lex.comment,
                                'output': trg_preds
                            }
                            targets.append(target)
                            outvocab.extend(trg_preds)

                        data.append({
                            'eid': entry.eid,
                            'category': entry.category,
                            'augmented': True,
                            'size': entry.size,
                            'source': src,
                            'targets': targets
                        })
                        size += len(targets)
        except Exception:
            print('Preprocessing error...')

    invocab.append('unk')
    outvocab.append('unk')
    invocab = list(set(invocab))
    outvocab = list(set(outvocab))
    vocab = {'input': invocab, 'output': outvocab}

    print('Path:', path, 'Size: ', size)
    return data, vocab
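
# Illustrative sketch (hypothetical helper, not called by the loaders above):
# the augmentation in `load` draws up to min(len(perm) - 1, 49) orderings
# distinct from the original. random.sample draws without replacement, which
# replaces the rejection loop over randint in the loader. Note the loader
# compares delexicalized sources rather than raw tuples; this sketch
# simplifies that to an element-wise comparison.
def _sample_permutations_sketch(tripleset, limit=49):
    from itertools import permutations
    from random import sample

    candidates = [p for p in permutations(tripleset)
                  if list(p) != list(tripleset)]
    return sample(candidates, min(len(candidates), limit))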
def load(self, path):
    """Load entries, pairing each distinct flattened source with the
    <SNT>-tagged predicate sequences of the matching lexicalizations."""
    def flat(struct):
        # drop the sentence boundary tokens
        return [w for w in struct if w not in ['<SNT>', '</SNT>']]

    entryset = parsing.run_parser(path)

    data, size = [], 0
    invocab, outvocab = [], []
    for entry in entryset:
        entitymap = {b: a for a, b in entry.entitymap_to_dict().items()}

        if len(entry.modifiedtripleset) > 1:
            visited = []
            for lex in entry.lexEntries:
                # process ordered tripleset
                source, delex_source, _ = load.snt_source(
                    lex.orderedtripleset, entitymap, {})
                source, delex_source = flat(source), flat(delex_source)

                if source not in visited and ' '.join(source).strip() != '':
                    visited.append(source)
                    invocab.extend(source)

                    targets = []
                    for lex2 in entry.lexEntries:
                        _, text, _ = load.snt_source(
                            lex2.orderedtripleset, entitymap, {})
                        flatten = flat(text)

                        if delex_source == flatten:
                            # wrap the predicates of each sentence in
                            # boundary tokens
                            trgt_preds = []
                            for snt in utils.split_struct(text):
                                trgt_preds.append('<SNT>')
                                trgt_preds.extend([t[1] for t in snt])
                                trgt_preds.append('</SNT>')

                            target = {
                                'lid': lex2.lid,
                                'comment': lex2.comment,
                                'output': trgt_preds
                            }
                            targets.append(target)
                            outvocab.extend(trgt_preds)

                    data.append({
                        'eid': entry.eid,
                        'category': entry.category,
                        'size': entry.size,
                        'source': source,
                        'targets': targets
                    })
                    size += len(targets)

    invocab.append('unk')
    outvocab.append('unk')
    invocab = list(set(invocab))
    outvocab = list(set(outvocab))
    vocab = {'input': invocab, 'output': outvocab}

    print('Path:', path, 'Size: ', size)
    return data, vocab
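
# Illustrative sketch (hypothetical helper, not called by the loaders above):
# the target construction in `load` wraps the predicates of each sentence
# plan in <SNT> ... </SNT> boundary tokens. Assumes struct is a list of
# sentences, each a list of triples with the predicate at index 1, as
# produced by utils.split_struct.
def _wrap_sentence_preds_sketch(struct):
    out = []
    for snt in struct:
        out.append('<SNT>')
        out.extend(t[1] for t in snt)
        out.append('</SNT>')
    return out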