import json
import logging
import os
from collections import defaultdict
from pprint import pformat

# Project-local names assumed by these snippets: config, draw, dann, splits,
# download, missing_files, read_json, Dataset, Ontology, Vocab.


def process_data(lang):
    # download the raw WOZ files for this language on first run
    if not os.path.isdir(os.path.join(config.RAW_DATA_DIR, lang)):
        os.makedirs(os.path.join(config.RAW_DATA_DIR, lang))
        download(
            'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_train_{}.json'
            .format(lang),
            os.path.join(config.RAW_DATA_DIR, lang, 'train.json'))
        download(
            'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_validate_{}.json'
            .format(lang),
            os.path.join(config.RAW_DATA_DIR, lang, 'dev.json'))
        download(
            'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_test_{}.json'
            .format(lang),
            os.path.join(config.RAW_DATA_DIR, lang, 'test.json'))
    print('Processing language: {}'.format(lang))
    splits = ['dev', 'train', 'test']
    dir_to_save_files = os.path.join(config.RAW_DATA_DIR, lang, 'preprocessed')
    # create the output folder if it does not exist
    if not os.path.exists(dir_to_save_files):
        os.makedirs(dir_to_save_files)
    # delete any existing preprocessed files
    for f in os.listdir(dir_to_save_files):
        # join properly; the original 'dir_to_save_files + f' dropped the path separator
        os.remove(os.path.join(dir_to_save_files, f))
    ontology = Ontology()
    vocab = Vocab()
    vocab.word2index(['pad', 'sos', 'eos', config.NONE_TOKEN], train=True)
    for s in splits:
        fname = '{}.json'.format(s)
        print('Annotating {}'.format(s))
        dataset = Dataset.annotate_raw(
            os.path.join(config.RAW_DATA_DIR, lang, fname))
        dataset.numericalize_(vocab)
        ontology = ontology + dataset.extract_ontology()
        with open(os.path.join(dir_to_save_files, fname), 'wt') as f:
            json.dump(dataset.to_dict(), f)
    ontology.numericalize_(vocab)
    with open(os.path.join(dir_to_save_files, 'ontology.json'), 'wt') as f:
        json.dump(ontology.to_dict(), f)
    with open(os.path.join(dir_to_save_files, 'vocab.json'), 'wt') as f:
        json.dump(vocab.to_dict(), f)
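# Hypothetical usage sketch (not from the original source): run the
# preprocessing once per language. The 'en'/'de'/'it' codes are an
# assumption based on the neural-belief-tracker file naming above; adjust
# to whichever languages your config targets.
for lang in ('en', 'de', 'it'):
    process_data(lang)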
def load_dataset(base_path):
    dataset = {}
    dataset['train'] = Dataset.from_dict(read_json(base_path / 'train.json'))
    dataset['dev'] = Dataset.from_dict(read_json(base_path / 'dev.json'))
    dataset['test'] = Dataset.from_dict(read_json(base_path / 'test.json'))
    ontology = Ontology.from_dict(read_json(base_path / 'ontology.json'))
    return dataset, ontology
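# The pathlib-based variant above relies on a read_json helper that is not
# shown. A minimal sketch, assuming it simply parses a JSON file:
def read_json(path):
    with open(path) as f:
        return json.load(f)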
def load_dataset(splits=('train', 'dev', 'test')):
    with open(os.path.join(dann, 'ontology.json')) as f:
        ontology = Ontology.from_dict(json.load(f))
    with open(os.path.join(dann, 'vocab.json')) as f:
        vocab = Vocab.from_dict(json.load(f))
    with open(os.path.join(dann, 'emb.json')) as f:
        E = json.load(f)
    dataset = {}
    for split in splits:
        with open(os.path.join(dann, '{}.json'.format(split))) as f:
            logging.warning('loading split {}'.format(split))
            dataset[split] = Dataset.from_dict(json.load(f))
    logging.info('dataset sizes: {}'.format(
        pformat({k: len(v) for k, v in dataset.items()})))
    return dataset, ontology, vocab, E
def load_dataset(emb=False, splits=('train', 'dev', 'test')):
    with open(os.path.join(config.DATA_DIR, 'ontology.json')) as f:
        ontology = Ontology.from_dict(json.load(f))
    with open(os.path.join(config.DATA_DIR, 'vocab.json')) as f:
        vocab = Vocab.from_dict(json.load(f))
    if emb:
        with open(os.path.join(config.DATA_DIR, 'emb.json')) as f:
            E = json.load(f)
    dataset = {}
    for split in splits:
        with open(os.path.join(config.DATA_DIR, '{}.json'.format(split))) as f:
            dataset[split] = Dataset.from_dict(json.load(f))
    print('dataset sizes: {}'.format(
        pformat({k: len(v) for k, v in dataset.items()})))
    if emb:
        return dataset, ontology, vocab, E
    return dataset, ontology, vocab
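# Usage sketch for the emb-flag variant above: with emb=False the function
# returns (dataset, ontology, vocab); with emb=True it additionally returns
# the embedding matrix E loaded from emb.json.
dataset, ontology, vocab = load_dataset()
dataset, ontology, vocab, E = load_dataset(emb=True)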
os.makedirs(draw)
download(
    'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_train_en.json',
    os.path.join(draw, 'train.json'))
download(
    'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_validate_en.json',
    os.path.join(draw, 'dev.json'))
download(
    'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_test_en.json',
    os.path.join(draw, 'test.json'))

if missing_files(dann, files=splits + ['ontology', 'vocab', 'emb']):
    if not os.path.isdir(dann):
        os.makedirs(dann)
    dataset = {}
    ontology = Ontology()
    vocab = Vocab()
    vocab.word2index(['<sos>', '<eos>'], train=True)
    for s in splits:
        fname = '{}.json'.format(s)
        logging.warning('Annotating {}'.format(s))  # logging.warn is deprecated
        dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname))
        dataset[s].numericalize_(vocab)
        ontology = ontology + dataset[s].extract_ontology()
        with open(os.path.join(dann, fname), 'wt') as f:
            json.dump(dataset[s].to_dict(), f)
    ontology.numericalize_(vocab)
    with open(os.path.join(dann, 'ontology.json'), 'wt') as f:
        json.dump(ontology.to_dict(), f)
    with open(os.path.join(dann, 'vocab.json'), 'wt') as f:
        json.dump(vocab.to_dict(), f)
os.makedirs(draw)
download(
    'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_train_en.json',
    os.path.join(draw, 'train.json'))
download(
    'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_validate_en.json',
    os.path.join(draw, 'dev.json'))
download(
    'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_test_en.json',
    os.path.join(draw, 'test.json'))

if missing_files(dann, files=splits + ['ontology', 'vocab', 'emb']):
    if not os.path.isdir(dann):
        os.makedirs(dann)
    dataset = {}
    ontology = Ontology()
    vocab = Vocab()
    vocab.word2index(['<sos>', '<eos>'], train=True)
    for s in splits:
        fname = '{}.json'.format(s)
        logging.warning('Annotating {}'.format(s))
        dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname))
        dataset[s].numericalize_(vocab)
        ontology = ontology + dataset[s].extract_ontology()
    # add a '<eos>' placeholder ("none") value for each inform slot
    unk_svs = defaultdict(set)
    for slot in ontology.slots:
        if slot != 'request':
            unk_svs[slot].add('<eos>')
    ontology = ontology + Ontology(
        sorted(list(ontology.slots)),
        # NOTE: the original snippet is truncated at this call; the values
        # argument below is an assumed reconstruction that merges in the
        # '<eos>' placeholders collected above, matching the
        # Ontology(slots, values) signature used elsewhere in this file.
        {slot: sorted(values) for slot, values in unk_svs.items()})
if __name__ == '__main__':
    # if missing_files(draw, splits):
    #     if not os.path.isdir(draw):
    #         os.makedirs(draw)
    #     download('https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_train_en.json', os.path.join(draw, 'train.json'))
    #     download('https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_validate_en.json', os.path.join(draw, 'dev.json'))
    #     download('https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_test_en.json', os.path.join(draw, 'test.json'))
    if missing_files(dann, files=splits + ['ontology', 'vocab', 'emb']):
        if not os.path.isdir(dann):
            os.makedirs(dann)
        dataset = {}
        # ontology = Ontology()
        # read the ontology from file instead of extracting it from the data
        ont = json.load(open(os.path.join(draw, 'ontology.json')))
        ontology = Ontology(slots=ont['slots'], values=ont['values'])
        vocab = Vocab()
        vocab.word2index(['<sos>', '<eos>'], train=True)
        for s in splits:
            fname = '{}.json'.format(s)
            logging.warning('Annotating {}'.format(s))
            dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname))
            dataset[s].numericalize_(vocab)
            # ontology = ontology + dataset[s].extract_ontology()
            with open(os.path.join(dann, fname), 'wt') as f:
                json.dump(dataset[s].to_dict(), f)
        ontology.numericalize_(vocab)
        with open(os.path.join(dann, 'ontology.json'), 'wt') as f:
            json.dump(ontology.to_dict(), f)
        with open(os.path.join(dann, 'vocab.json'), 'wt') as f:
            json.dump(vocab.to_dict(), f)
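# The snippets above call download(url, path) and missing_files(dir, files)
# helpers that are not defined here. Minimal sketches under those assumed
# signatures; the real project versions may differ (e.g. checksums, retries).
from urllib.request import urlretrieve


def download(url, path):
    # fetch the file over HTTP(S) and write it to path
    urlretrieve(url, path)


def missing_files(d, files):
    # true if any expected '<name>.json' file is absent from directory d
    return not all(
        os.path.isfile(os.path.join(d, '{}.json'.format(name)))
        for name in files)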