Example #1
def process_data(lang):
    if not os.path.isdir(os.path.join(config.RAW_DATA_DIR, lang)):
        os.makedirs(os.path.join(config.RAW_DATA_DIR, lang))
    download(
        'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_train_{}.json'
        .format(lang), os.path.join(config.RAW_DATA_DIR, lang, 'train.json'))
    download(
        'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_validate_{}.json'
        .format(lang), os.path.join(config.RAW_DATA_DIR, lang, 'dev.json'))
    download(
        'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_test_{}.json'
        .format(lang), os.path.join(config.RAW_DATA_DIR, lang, 'test.json'))

    print('Processing language: {}'.format(lang))

    splits = ['dev', 'train', 'test']

    dir_to_save_files = os.path.join(config.RAW_DATA_DIR, lang, 'preprocessed')
    # create folder if not exists
    if not os.path.exists(dir_to_save_files):
        os.makedirs(dir_to_save_files)

    # delete any existing files
    for f in os.listdir(dir_to_save_files):
        os.remove(os.path.join(dir_to_save_files, f))

    ontology = Ontology()

    vocab = Vocab()
    vocab.word2index(['pad', 'sos', 'eos', config.NONE_TOKEN], train=True)

    for s in splits:
        fname = '{}.json'.format(s)
        print('Annotating {}'.format(s))
        dataset = Dataset.annotate_raw(
            os.path.join(config.RAW_DATA_DIR, lang, fname))
        dataset.numericalize_(vocab)

        ontology = ontology + dataset.extract_ontology()

        with open(os.path.join(dir_to_save_files, fname), 'wt') as f:
            json.dump(dataset.to_dict(), f)

    ontology.numericalize_(vocab)
    with open(os.path.join(dir_to_save_files, 'ontology.json'), 'wt') as f:
        json.dump(ontology.to_dict(), f)

    with open(os.path.join(dir_to_save_files, 'vocab.json'), 'wt') as f:
        json.dump(vocab.to_dict(), f)
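A plausible driver for Example #1, assuming process_data and the project's config and download helpers are importable from the same module; 'en', 'de' and 'it' are the language codes of the multilingual WOZ 2.0 files hosted in the neural-belief-tracker repository (sketch, not part of the original snippet):

if __name__ == '__main__':
    # Hypothetical driver: preprocess every WOZ 2.0 language in turn.
    for lang in ('en', 'de', 'it'):
        process_data(lang)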
Example #2
def load_dataset(base_path):
    dataset = {}
    dataset['train'] = Dataset.from_dict(read_json(base_path / 'train.json'))
    dataset['dev'] = Dataset.from_dict(read_json(base_path / 'dev.json'))
    dataset['test'] = Dataset.from_dict(read_json(base_path / 'test.json'))
    ontology = Ontology.from_dict(read_json(base_path / 'ontology.json'))
    return dataset, ontology
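A minimal way to call Example #2's loader, assuming the preprocessed JSON files already exist and that read_json, Dataset and Ontology come from the same project; base_path only needs to support the / operator, e.g. a pathlib.Path. The path below is hypothetical:

from pathlib import Path

# Hypothetical location; point this at wherever the preprocessed JSON files live.
base_path = Path('data/raw/en/preprocessed')
dataset, ontology = load_dataset(base_path)
for split, data in dataset.items():
    print(split, len(data))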
Example #3
def load_dataset(splits=('train', 'dev', 'test')):
    with open(os.path.join(dann, 'ontology.json')) as f:
        ontology = Ontology.from_dict(json.load(f))
    with open(os.path.join(dann, 'vocab.json')) as f:
        vocab = Vocab.from_dict(json.load(f))
    with open(os.path.join(dann, 'emb.json')) as f:
        E = json.load(f)
    dataset = {}
    for split in splits:
        with open(os.path.join(dann, '{}.json'.format(split))) as f:
            logging.warning('loading split {}'.format(split))
            dataset[split] = Dataset.from_dict(json.load(f))

    logging.info('dataset sizes: {}'.format(pformat({k: len(v) for k, v in dataset.items()})))
    return dataset, ontology, vocab, E
Example #4
def load_dataset(emb=False, splits=('train', 'dev', 'test')):
    with open(os.path.join(config.DATA_DIR, 'ontology.json')) as f:
        ontology = Ontology.from_dict(json.load(f))

    with open(os.path.join(config.DATA_DIR, 'vocab.json')) as f:
        vocab = Vocab.from_dict(json.load(f))

    if emb:
        with open(os.path.join(config.DATA_DIR, 'emb.json')) as f:
            E = json.load(f)

    dataset = {}
    for split in splits:
        with open(os.path.join(config.DATA_DIR, '{}.json'.format(split))) as f:
            dataset[split] = Dataset.from_dict(json.load(f))

    print('dataset sizes: {}'.format(
        pformat({k: len(v)
                 for k, v in dataset.items()})))

    if emb:
        return dataset, ontology, vocab, E
    else:
        return dataset, ontology, vocab
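Example #4 returns the raw JSON content of emb.json when emb=True. If that file stores one embedding vector per vocabulary word, as its pairing with vocab suggests (an assumption, not guaranteed by the snippet), it can be turned into a dense matrix like this:

import numpy as np

dataset, ontology, vocab, E = load_dataset(emb=True)
# Under the assumption above, each row is the embedding of one vocabulary word.
emb_matrix = np.array(E, dtype=np.float32)
print(emb_matrix.shape)  # (num_words, embedding_dim)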
Example #5
    if missing_files(draw, splits):
        if not os.path.isdir(draw):
            os.makedirs(draw)
        download(
            'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_train_en.json',
            os.path.join(draw, 'train.json'))
        download(
            'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_validate_en.json',
            os.path.join(draw, 'dev.json'))
        download(
            'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_test_en.json',
            os.path.join(draw, 'test.json'))

    if missing_files(dann, files=splits + ['ontology', 'vocab', 'emb']):
        if not os.path.isdir(dann):
            os.makedirs(dann)
        dataset = {}
        ontology = Ontology()
        vocab = Vocab()
        vocab.word2index(['<sos>', '<eos>'], train=True)
        for s in splits:
            fname = '{}.json'.format(s)
            logging.warning('Annotating {}'.format(s))
            dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname))
            dataset[s].numericalize_(vocab)
            ontology = ontology + dataset[s].extract_ontology()
            with open(os.path.join(dann, fname), 'wt') as f:
                json.dump(dataset[s].to_dict(), f)
        ontology.numericalize_(vocab)
        with open(os.path.join(dann, 'ontology.json'), 'wt') as f:
            json.dump(ontology.to_dict(), f)
        with open(os.path.join(dann, 'vocab.json'), 'wt') as f:
            json.dump(vocab.to_dict(), f)
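Examples #5 to #7 rely on a missing_files helper that is not shown in any snippet. A minimal sketch of what such a helper could look like, purely an assumption about its contract (return True when any expected <name>.json file is absent from the directory):

import os

def missing_files(d, files):
    # True if the directory is missing any of the expected '<name>.json' files.
    return not all(
        os.path.isfile(os.path.join(d, '{}.json'.format(name))) for name in files)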
Example #6
    if missing_files(draw, splits):
        if not os.path.isdir(draw):
            os.makedirs(draw)
        download(
            'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_train_en.json',
            os.path.join(draw, 'train.json'))
        download(
            'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_validate_en.json',
            os.path.join(draw, 'dev.json'))
        download(
            'https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_test_en.json',
            os.path.join(draw, 'test.json'))

    if missing_files(dann, files=splits + ['ontology', 'vocab', 'emb']):
        if not os.path.isdir(dann):
            os.makedirs(dann)
        dataset = {}
        ontology = Ontology()
        vocab = Vocab()
        vocab.word2index(['<sos>', '<eos>'], train=True)
        for s in splits:
            fname = '{}.json'.format(s)
            logging.warning('Annotating {}'.format(s))
            dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname))
            dataset[s].numericalize_(vocab)
            ontology = ontology + dataset[s].extract_ontology()
            # add 'none' value for each inform slot
            unk_svs = defaultdict(set)
            for slot in ontology.slots:
                if slot != 'request':
                    unk_svs[slot].add('<eos>')
            ontology = ontology + Ontology(
                sorted(list(ontology.slots)),
Example #7
if __name__ == '__main__':
    # if missing_files(draw, splits):
    #     if not os.path.isdir(draw):
    #         os.makedirs(draw)
    # download('https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_train_en.json', os.path.join(draw, 'train.json'))
    # download('https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_validate_en.json', os.path.join(draw, 'dev.json'))
    # download('https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_test_en.json', os.path.join(draw, 'test.json'))

    if missing_files(dann, files=splits + ['ontology', 'vocab', 'emb']):
        if not os.path.isdir(dann):
            os.makedirs(dann)
        dataset = {}
        # ontology = Ontology()
        with open(os.path.join(draw, 'ontology.json')) as f:
            ont = json.load(f)
        ontology = Ontology(slots=ont['slots'], values=ont['values'])
        vocab = Vocab()
        vocab.word2index(['<sos>', '<eos>'], train=True)
        for s in splits:
            fname = '{}.json'.format(s)
            logging.warning('Annotating {}'.format(s))
            dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname))
            dataset[s].numericalize_(vocab)
            # ontology = ontology + dataset[s].extract_ontology()
            with open(os.path.join(dann, fname), 'wt') as f:
                json.dump(dataset[s].to_dict(), f)
        ontology.numericalize_(vocab)
        with open(os.path.join(dann, 'ontology.json'), 'wt') as f:
            json.dump(ontology.to_dict(), f)
        with open(os.path.join(dann, 'vocab.json'), 'wt') as f:
            json.dump(vocab.to_dict(), f)
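All of the examples also call a download(url, path) helper that is never shown. A self-contained sketch of one possible implementation using only the standard library (the original project may well implement it differently, e.g. with requests):

import urllib.request

def download(url, to_file):
    # Fetch `url` and write the response body to `to_file`.
    urllib.request.urlretrieve(url, to_file)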