@classmethod
def from_file(cls, root, dspider, dcache, debug=False):
    train_database, dev_database = editsql_preprocess.read_db_split(dspider)
    conv = converter.Converter()
    kmaps = evaluation.build_foreign_key_map_from_json(os.path.join(dspider, 'tables.json'))

    splits = {}
    for k in ['train', 'dev']:
        with open(os.path.join(root, '{}.json'.format(k)), 'rb') as f:
            splits[k] = []
            for ex in json.load(f):
                splits[k].append(ex)
                if debug and len(splits[k]) > 100:
                    break

    tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL, cache_dir=dcache)

    sql_voc = Vocab(['PAD', 'EOS', 'GO', 'SEP', '`', "'", '1', '%', 'yes', '2', '.', '5', 'f', 'm', 'name', 'song', 't', 'l'])

    # make contexts and populate vocab
    for s, data in splits.items():
        proc = []
        for i, ex in enumerate(tqdm.tqdm(data, desc='preprocess {}'.format(s))):
            for turn_i, turn in enumerate(ex['interaction']):
                turn['id'] = '{}/{}:{}'.format(ex['database_id'], i, turn_i)
                turn['db_id'] = ex['database_id']
                turn['prev'] = ex['interaction'][turn_i - 1] if turn_i > 0 else None
                new = cls.make_example(turn, tokenizer, sql_voc, kmaps, conv, train=s == 'train')
                if new is not None and (s != 'train' or not new['invalid']):
                    proc.append(new)
        splits[s] = proc

    # make candidate list using vocab
    for s, data in splits.items():
        for ex in data:
            ex['cands_query'], ex['cands_value'] = cls.make_cands(ex, sql_voc)
        splits[s] = data

    # make pointers for training data
    for ex in splits['train']:
        ex['pointer_query'], ex['pointer_value'] = cls.make_query_pointer(ex['sup_query'], ex['cands_query'], ex['cands_value'], sql_voc)

    # look up pretrained word embeddings
    emb = E.ConcatEmbedding([E.GloveEmbedding(), E.KazumaCharEmbedding()], default='zero')
    sql_emb = torch.tensor([emb.emb(w) for w in sql_voc._index2word])
    ext = dict(sql_voc=sql_voc, sql_emb=sql_emb)
    return splits, ext
@classmethod
def from_file(cls, root, dcache, debug=False):
    conv = converter.Converter(os.path.join(root, 'tables.json'))

    splits = {}
    for k in ['train', 'dev']:
        with open(os.path.join(root, '{}.json'.format(k)), 'rb') as f:
            splits[k] = []
            for ex in json.load(f):
                ex['query_orig'] = ex['query']
                splits[k].append(ex)
                if debug and len(splits[k]) > 100:
                    break

    tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL, cache_dir=dcache)

    utt_voc = Vocab(['PAD', 'EOS', 'GO'])

    # make contexts and populate vocab
    for s, data in splits.items():
        proc = []
        for i, ex in enumerate(tqdm.tqdm(data, desc='preprocess {}'.format(s))):
            ex['id'] = '{}/{}'.format(ex['db_id'], i)
            new = cls.make_example(ex, tokenizer, utt_voc, conv, train=s == 'train')
            if new is not None and (s != 'train' or not new['invalid']):
                proc.append(new)
        splits[s] = proc

    # make candidate list using vocab
    for s, data in splits.items():
        for ex in data:
            ex['cands_question'] = cls.make_cands(ex, utt_voc)
        splits[s] = data

    # make pointers for training data
    for ex in splits['train']:
        ex['pointer_question'] = cls.make_question_pointer(ex['sup_question'], ex['cands_question'], utt_voc)

    # look up pretrained word embeddings
    emb = E.ConcatEmbedding([E.GloveEmbedding(), E.KazumaCharEmbedding()], default='zero')
    utt_emb = torch.tensor([emb.emb(w) for w in utt_voc._index2word])
    ext = dict(utt_voc=utt_voc, utt_emb=utt_emb)
    return splits, ext
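# Usage sketch (not part of the original source; the class and directory names
# below are assumptions for illustration only). Both from_file variants above
# are classmethods on dataset classes, so they might be driven roughly like:
#
#     splits, ext = Nl2SqlDataset.from_file(
#         root='data/cosql',      # hypothetical dir holding train.json / dev.json
#         dspider='data/spider',  # hypothetical Spider release containing tables.json
#         dcache='cache/bert',    # HuggingFace tokenizer cache dir
#         debug=True,             # truncate each split to ~100 examples
#     )
#     train, dev = splits['train'], splits['dev']
#     sql_voc, sql_emb = ext['sql_voc'], ext['sql_emb']
#
#     # the second variant only needs the data root and the tokenizer cache
#     splits, ext = Sql2NlDataset.from_file(root='data/spider', dcache='cache/bert')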
if not os.path.isdir(dout):
    os.makedirs(dout)

print('Flattening train')
train = create_split(train_trees, vocab)
print('Flattening dev')
dev = create_split(dev_trees, vocab)

par = joblib.Parallel(12)
print('Segmenting train')
train_ba = par(joblib.delayed(segment)(ex, vocab) for ex in tqdm(train))
train_filtered = []
for ex, ba in zip(train, train_ba):
    if ba:
        ex.update(ba)
        train_filtered.append(ex)
print('filtered train from {} to {}'.format(len(train), len(train_filtered)))

print('vocab size {}'.format(len(vocab)))
emb = embeddings.ConcatEmbedding(
    [embeddings.GloveEmbedding(), embeddings.KazumaCharEmbedding()], default='zero')
mat = torch.Tensor([emb.emb(w) for w in vocab._index2word])
torch.save({'vocab': vocab, 'emb': mat}, dout + '/vocab.pt')
torch.save(train_filtered, dout + '/proc_train.pt')
torch.save(dev, dout + '/proc_dev.pt')
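# Sketch of reading the saved artifacts back (not in the original script; shown
# only to illustrate how the outputs above are laid out). Unpickling requires the
# same Vocab class to be importable, since torch.save serialized it above.
saved = torch.load(os.path.join(dout, 'vocab.pt'))
loaded_vocab, emb_mat = saved['vocab'], saved['emb']            # Vocab object + embedding matrix
proc_train = torch.load(os.path.join(dout, 'proc_train.pt'))    # filtered, segmented train examples
proc_dev = torch.load(os.path.join(dout, 'proc_dev.pt'))        # flattened dev examples
print('loaded {} train / {} dev examples, emb shape {}'.format(
    len(proc_train), len(proc_dev), tuple(emb_mat.shape)))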