import json

from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate

# Grammar, SimpleGrammar, get_grammar2index, get_word2index, resolve_dict,
# and rem_terminals_tokenize_string are project-local and assumed in scope.


def save_2index():
    grammar = Grammar('sql_simple_transition_2.bnf')
    # This shouldn't be necessary, but it is.
    grammar2 = Grammar('sql_simple_transition_2.bnf')
    sellist_gram = grammar.get_subgrammar('<sellist>')
    assert '<modifyop>' in grammar.gram_keys
    modifyopgram = grammar2.get_subgrammar('<modifyop>')
    dontuse = ['<start>']
    db_dict = []

    _, gram2ind = get_grammar2index(sellist_gram, db_dict, dontuse=dontuse)
    with open('grammar2index_grammar_2_sellist.json', 'w') as f:
        json.dump(gram2ind, f)

    _, gram2ind = get_grammar2index(modifyopgram, db_dict, dontuse=dontuse)
    with open('grammar2index_grammar_2_modifyop.json', 'w') as f:
        json.dump(gram2ind, f)

    _, w2ind = get_word2index(grammar, db_dict, dontuse=dontuse)
    with open('terminals2index_grammar_2.json', 'w') as f:
        json.dump(w2ind, f)

    with open('spider_tables_lowercase.json', 'r') as f:
        spider_db = json.loads(f.read())

    tab2ind = {}
    col2ind = {}
    allcols_db2ind = {}
    for db in spider_db.keys():
        dbp = db.lower()
        # Table name -> index within this database.
        tab2ind[dbp] = {
            tab.lower(): i for i, tab in enumerate(spider_db[db].keys())
        }
        # Deduplicated column names across all tables of this database.
        all_cols = list(
            set(c.lower() for t in spider_db[db].keys()
                for c in spider_db[db][t]))
        allcols_db2ind[dbp] = {c.lower(): i for i, c in enumerate(all_cols)}
        # Column name -> index within each table.
        col2ind[dbp] = {}
        for tab in spider_db[db].keys():
            tabp = tab.lower()
            col2ind[dbp][tabp] = {
                col.lower(): i for i, col in enumerate(spider_db[db][tab])
            }

    with open('spider_tab2index.json', 'w') as f:
        json.dump(tab2ind, f)
    with open('spider_col2index.json', 'w') as f:
        json.dump(col2ind, f)
    with open('spider_db_cols2ind.json', 'w') as f:
        json.dump(allcols_db2ind, f)
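
# Minimal lookup sketch for the index files written by save_2index() above.
# The file name comes from save_2index(); which db/table names appear
# depends on the Spider data that was processed.
def _demo_index_lookup():
    with open('spider_tab2index.json', 'r') as f:
        tab2ind = json.load(f)
    db = next(iter(tab2ind))          # some database name
    table = next(iter(tab2ind[db]))   # first table of that database
    print(db, table, tab2ind[db][table])
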
class AugmentedDataLoader(DataLoader):

    def __init__(self, dataset, filen, batch_size=1, shuffle=False,
                 sampler=None, batch_sampler=None, num_workers=0,
                 collate_fn=default_collate, pin_memory=False,
                 drop_last=False, timeout=0, worker_init_fn=None):
        super().__init__(dataset, batch_size, shuffle, sampler,
                         batch_sampler, num_workers, collate_fn, pin_memory,
                         drop_last, timeout, worker_init_fn)
        self.grammar = SimpleGrammar(filen)
        # Assumption: bookkeeping list; the original statement was a bare
        # (no-op) attribute reference, which raises AttributeError.
        self.database_calls = []

    def get_choices_key(self, key):
        return self.grammar.gr[key]['items']

    def get_values_terminal(self, terminal):
        if terminal in resolve_dict:
            # Terminals that need resolution are handled elsewhere.
            return None
        lst = self.grammar.gr[terminal]
        assert len(lst) == 1
        return lst[0]

    def is_terminal_on_path(self, tok, terminal):
        return self.grammar.from_terminal_to_token(tok, terminal)
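
# Usage sketch, assuming the .bnf and JSON files used elsewhere in this
# module exist on disk. AugmentedDataset (defined below) does not implement
# the Dataset protocol, so this only exercises the grammar helpers, not
# actual batching.
def _demo_loader():
    dataset = AugmentedDataset('spider_train_subset.json',
                               'sql_simple_transition.bnf')
    loader = AugmentedDataLoader(dataset, 'sql_simple_transition.bnf')
    # '<sellist>' is assumed to be a key of the loaded grammar.
    print(loader.get_choices_key('<sellist>'))
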
class AugmentedDataset:  # (Dataset)

    def __init__(self, jsonfile, grammar_file):
        self.grammar = SimpleGrammar(grammar_file)
        self.grammar_terminals = self.grammar.get_terminal_toks()
        with open(jsonfile, 'r') as f:
            self.data = json.loads(f.read())

    def augment_data(self, test_string):
        to_learn = self.grammar.learn_

    def test(self):
        # Count how many stored SQL strings fail the grammar check, and how
        # many of those fail because of resolution.
        counter = 0
        not_resolved = 0
        kk = [k for k in self.data.keys()]
        for k in kk:
            string = self.data[k]['sql']
            val, reas = self.grammar.check_string_tokens(string, verbose=True)
            if not val:
                counter += 1
                if reas == 'res':
                    not_resolved += 1
        print('{} out of {} are errors, {} are with resolution'.format(
            counter, len(kk), not_resolved))
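
# Quick-check sketch: validate every stored SQL string against the grammar.
# The paths mirror the __main__ block at the bottom of this module and may
# need adjusting for your setup.
def _demo_dataset_check():
    ds = AugmentedDataset(
        '/home/jq/software/triplet-all/spider_train_subset.json',
        'sql_simple_transition.bnf')
    ds.test()  # prints error counts over the dataset
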
def __init__(self, filen: str):
    self.grammar = SimpleGrammar(filen)
    self.setup_conditions()
def all_toks_not_matched(toks, tables):
    # NOTE: the function header and outer loop are reconstructed from the
    # call in iterate_through_data_tables below; only the inner matching
    # logic appeared in the source.
    toks_not_in = []
    for tok in toks:
        matched = False
        for key in tables:
            if tok == key:
                matched = True
            elif tok in tables[key]:
                matched = True
            if matched:
                break
        if not matched:
            toks_not_in.append(tok)
    return set(toks_not_in)


def iterate_through_data_tables(grammar, data, tables):
    for k in data.keys():
        toks = rem_terminals_tokenize_string(grammar, data[k]['sql'])
        toks = all_toks_not_matched(toks, tables)
        print('{} unmatched tokens in {}'.format(len(toks), data[k]['sql']))
        print(toks)


if __name__ == '__main__':
    jsonfile = '/home/jq/software/triplet-all/spider_train_subset.json'
    with open(jsonfile, 'r') as f:
        data = json.loads(f.read())
    with open('spider_tables.json', 'r') as f:
        spider_tables = json.loads(f.read())
    g = SimpleGrammar('sql_simple_transition.bnf')
    iterate_through_data_tables(g, data, spider_tables)