Example 1
    def __init__(self, logger, config):
        if torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')

        self.logger = logger
        self.train_config = registry.instantiate(TrainConfig, config['train'])
        self.data_random = random_state.RandomContext(
            self.train_config.data_seed)
        self.model_random = random_state.RandomContext(
            self.train_config.model_seed)

        self.init_random = random_state.RandomContext(
            self.train_config.init_seed)
        with self.init_random:
            # 0. Construct preprocessors
            self.model_preproc = registry.instantiate(registry.lookup(
                'model', config['model']).Preproc,
                                                      config['model'],
                                                      unused_keys=('name', ))
            self.model_preproc.load()

            # 1. Construct model
            self.model = registry.construct('model',
                                            config['model'],
                                            unused_keys=('encoder_preproc',
                                                         'decoder_preproc'),
                                            preproc=self.model_preproc,
                                            device=device)
            self.model.to(device)
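
All of these constructors scope their randomness with `random_state.RandomContext`, which is not shown in the excerpts. Below is a minimal sketch of how such a context manager could work, assuming it only needs to isolate Python's and NumPy's global generators; the real helper presumably also handles torch seeding.

import random

import numpy as np


class RandomContext:
    """Hypothetical sketch: holds private RNG state, swaps it into the
    global generators on entry, and restores the caller's state on exit."""

    def __init__(self, seed=None):
        self._py_state = random.Random(seed).getstate()
        self._np_state = np.random.RandomState(seed).get_state()

    def __enter__(self):
        # Save the caller's state, then install this context's state.
        self._saved_py = random.getstate()
        self._saved_np = np.random.get_state()
        random.setstate(self._py_state)
        np.random.set_state(self._np_state)

    def __exit__(self, exc_type, exc_value, traceback):
        # Capture this context's advanced state so re-entering continues the
        # same stream, then hand the global generators back to the caller.
        self._py_state = random.getstate()
        self._np_state = np.random.get_state()
        random.setstate(self._saved_py)
        np.random.set_state(self._saved_np)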
Example 2
    def __init__(self, config):
        self.config = config
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
            torch.set_num_threads(1)

        # 0. Construct preprocessors
        self.model_preproc = registry.instantiate(
            registry.lookup('model', config['model']).Preproc, config['model'])
        self.model_preproc.load()
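
Every example resolves classes through the `registry` module, which is also not included. Here is a rough sketch of the three calls used (`lookup`, `instantiate`, `construct`); the `unused_keys` handling is inferred from the call sites, and the real module presumably also inspects constructor signatures rather than relying on callers to list every extra key.

_registry = {}  # (kind, name) -> class


def register(kind, name):
    # Decorator that files a class under a (kind, name) key.
    def decorator(cls):
        _registry[(kind, name)] = cls
        return cls
    return decorator


def lookup(kind, config):
    # Resolve a class from a config dict (or a bare name string).
    name = config['name'] if isinstance(config, dict) else config
    return _registry[(kind, name)]


def instantiate(cls, config, unused_keys=(), **kwargs):
    # Call cls with the config's entries as keyword arguments, dropping
    # keys the constructor is declared not to take.
    clean = {k: v for k, v in config.items() if k not in unused_keys}
    return cls(**clean, **kwargs)


def construct(kind, config, unused_keys=(), **kwargs):
    # lookup + instantiate in one step; 'name' picks the class and is
    # therefore never forwarded to the constructor.
    return instantiate(lookup(kind, config), config,
                       unused_keys=tuple(unused_keys) + ('name',), **kwargs)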
Example 3
    def __init__(
        self,
        device,
        preproc,
        word_emb_size=128,
        recurrent_size=256,
        dropout=0.,
        question_encoder=('emb', 'bilstm'),
        column_encoder=('emb', 'bilstm'),
        table_encoder=('emb', 'bilstm'),
        update_config={},
        include_in_memory=('question', 'column', 'table'),
        batch_encs_update=True,
    ):
        super().__init__()
        self._device = device
        self.preproc = preproc
        self.vocab = preproc.vocab

        self.word_emb_size = word_emb_size
        self.recurrent_size = recurrent_size
        # Bidirectional encoders split recurrent_size across the two
        # directions, so it must be even.
        assert self.recurrent_size % 2 == 0
        self.include_in_memory = set(include_in_memory)
        self.dropout = dropout

        self.question_encoder = self._build_modules(question_encoder)
        self.column_encoder = self._build_modules(column_encoder)
        self.table_encoder = self._build_modules(table_encoder)

        update_modules = {
            'relational_transformer':
            spider_enc_modules.RelationalTransformerUpdate,
            'none': spider_enc_modules.NoOpUpdate,
        }

        self.encs_update = registry.instantiate(
            update_modules[update_config['name']],
            update_config,
            device=self._device,
            hidden_size=recurrent_size,
        )
        self.batch_encs_update = batch_encs_update
Example 4
    def __init__(self,
                 device,
                 num_layers,
                 num_heads,
                 hidden_size,
                 tie_layers=False,
                 ff_size=None,
                 dropout=0.1,
                 relation_providers=[
                     {
                         'name': 'schema'
                     },
                 ]):
        super().__init__()
        self._device = device

        registered_relation_providers = {
            'schema': SchemaRelationProvider,
        }
        self.relation_providers = [
            registry.instantiate(registered_relation_providers[config['name']],
                                 config,
                                 unused_keys=('name', ))
            for config in relation_providers
        ]
        self.relation_ids = {}

        # Assign each relation type a consecutive integer id, in provider order.
        for provider in self.relation_providers:
            for key in provider.all_relation_types:
                self.relation_ids[key] = len(self.relation_ids)

        if ff_size is None:
            ff_size = hidden_size * 4
        self.encoder = transformer.Encoder(
            lambda: transformer.EncoderLayer(
                hidden_size,
                transformer.MultiHeadedAttentionWithRelations(
                    num_heads, hidden_size, dropout),
                transformer.PositionwiseFeedForward(hidden_size, ff_size,
                                                    dropout),
                len(self.relation_ids), dropout), hidden_size, num_layers,
            tie_layers)
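
Example 3 feeds one of these update modules through `registry.instantiate(update_modules[update_config['name']], update_config, ...)`. A plausible `update_config` follows, with keys taken from the constructor above and illustrative values; note that `hidden_size` is supplied by the encoder as `recurrent_size`, not by the config.

update_config = {
    'name': 'relational_transformer',  # selects RelationalTransformerUpdate
    'num_layers': 4,                   # illustrative value
    'num_heads': 8,                    # illustrative value
    'dropout': 0.1,
    # ff_size defaults to hidden_size * 4 when omitted
}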
Example 5
    def __init__(self, N_word, N_h, N_depth, word_embedding_layer, encode_cols,
                 spider_enc_config, gpu):
        super(Seq2structEncoder, self).__init__()
        self.N_word = N_word
        self.N_h = N_h
        self.N_depth = N_depth

        self.word_embedding_layer = word_embedding_layer
        self.encode_cols = encode_cols
        self.zero_emb = np.zeros(self.N_word, dtype=np.float32)
        self.gpu = gpu

        self.spider_enc = registry.instantiate(
            spider_enc.SpiderEncoderV2,
            spider_enc_config,
            device=torch.device('cuda') if gpu else torch.device('cpu'),
            preproc=FakePreproc,
            word_emb_size=N_word,
            recurrent_size=N_h,
        )
Example 6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', required=True)
    parser.add_argument('--config-args')
    args = parser.parse_args()

    if args.config_args:
        config = json.loads(_jsonnet.evaluate_file(args.config, tla_codes={'args': args.config_args}))
    else:
        config = json.loads(_jsonnet.evaluate_file(args.config))

    model_preproc = registry.instantiate(
        registry.lookup('model', config['model']).Preproc,
        config['model'])

    for section in config['data']:
        data = registry.construct('dataset', config['data'][section])
        for item in tqdm.tqdm(data, desc=section, dynamic_ncols=True):
            to_add, validation_info = model_preproc.validate_item(item, section)
            if to_add:
                model_preproc.add_item(item, section, validation_info)
    model_preproc.save()
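
Taken together, the examples call a small, consistent surface on the model's `Preproc`. A hypothetical interface collecting exactly the methods used above:

class Preproc:
    """Hypothetical interface assembled from the calls in these examples."""

    def validate_item(self, item, section):
        # Return (should_add, validation_info) for one raw example.
        raise NotImplementedError

    def add_item(self, item, section, validation_info):
        # Preprocess and store one validated example under `section`.
        raise NotImplementedError

    def save(self):
        # Persist all preprocessed sections to disk.
        raise NotImplementedError

    def load(self):
        # Load previously saved preprocessed data.
        raise NotImplementedError

    def dataset(self, section):
        # Return a torch-compatible Dataset for `section`.
        raise NotImplementedError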
Example 7
    def __init__(self, config):
        self.config = config
        self.model_preproc = registry.instantiate(
            registry.lookup('model', config['model']).Preproc,
            config['model'])
Example 8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', required=True)
    parser.add_argument('--config', required=True)
    parser.add_argument('--config-args')
    args = parser.parse_args()

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    if args.config_args:
        config = json.loads(_jsonnet.evaluate_file(args.config, tla_codes={'args': args.config_args}))
    else:
        config = json.loads(_jsonnet.evaluate_file(args.config))

    if 'model_name' in config:
        args.logdir = os.path.join(args.logdir, config['model_name'])
    train_config = registry.instantiate(TrainConfig, config['train'])

    reopen_to_flush = config.get('log', {}).get('reopen_to_flush')
    logger = Logger(os.path.join(args.logdir, 'log.txt'), reopen_to_flush)
    with open(os.path.join(args.logdir,
          'config-{}.json'.format(
            datetime.datetime.now().strftime('%Y%m%dT%H%M%S%Z'))), 'w') as f:
        json.dump(config, f, sort_keys=True, indent=4)
    logger.log('Logging to {}'.format(args.logdir))

    init_random = random_state.RandomContext(train_config.init_seed)
    data_random = random_state.RandomContext(train_config.data_seed)
    model_random = random_state.RandomContext(train_config.model_seed)

    with init_random:
        # 0. Construct preprocessors
        model_preproc = registry.instantiate(
            registry.lookup('model', config['model']).Preproc,
            config['model'],
            unused_keys=('name',))
        model_preproc.load()

        # 1. Construct model
        model = registry.construct('model', config['model'],
                unused_keys=('encoder_preproc', 'decoder_preproc'), preproc=model_preproc, device=device)
        model.to(device)

        optimizer = registry.construct('optimizer', config['optimizer'], params=model.parameters())
        lr_scheduler = registry.construct(
                'lr_scheduler',
                config.get('lr_scheduler', {'name': 'noop'}),
                optimizer=optimizer)

    # 2. Restore its parameters
    saver = saver_mod.Saver(
        model, optimizer, keep_every_n=train_config.keep_every_n)
    last_step = saver.restore(args.logdir)

    # 3. Get training data somewhere
    with data_random:
        train_data = model_preproc.dataset('train')
        train_data_loader = yield_batches_from_epochs(
            torch.utils.data.DataLoader(
                train_data,
                batch_size=train_config.batch_size,
                shuffle=True,
                drop_last=True,
                collate_fn=lambda x: x))
    train_eval_data_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=train_config.eval_batch_size,
            collate_fn=lambda x: x)

    val_data = model_preproc.dataset('val')
    val_data_loader = torch.utils.data.DataLoader(
            val_data,
            batch_size=train_config.eval_batch_size,
            collate_fn=lambda x: x)

    # 4. Start training loop
    with data_random:
        for batch in train_data_loader:
            # Quit if too long
            if last_step >= train_config.max_steps:
                break

            # Evaluate model
            if last_step % train_config.eval_every_n == 0:
                if train_config.eval_on_train:
                    eval_model(logger, model, last_step, train_eval_data_loader, 'train', num_eval_items=train_config.num_eval_items)
                if train_config.eval_on_val:
                    eval_model(logger, model, last_step, val_data_loader, 'val', num_eval_items=train_config.num_eval_items)

            # Compute and apply gradient
            with model_random:
                optimizer.zero_grad()
                loss = model.compute_loss(batch)
                loss.backward()
                lr_scheduler.update_lr(last_step)
                optimizer.step()

            # Report metrics
            if last_step % train_config.report_every_n == 0:
                logger.log('Step {}: loss={:.4f}'.format(last_step, loss.item()))

            last_step += 1
            # Run saver
            if last_step % train_config.save_every_n == 0:
                saver.save(args.logdir, last_step)
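
`yield_batches_from_epochs` is called above but not defined in the excerpt. Its usage (wrapping a DataLoader and being iterated indefinitely by the step-bounded loop) suggests a sketch as simple as:

def yield_batches_from_epochs(loader):
    # Re-iterate the DataLoader forever so the training loop can consume
    # a single infinite stream of batches across epochs.
    while True:
        for batch in loader:
            yield batch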
Example 9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', required=True)
    parser.add_argument('--config-args')
    args = parser.parse_args()

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    if args.config_args:
        config = json.loads(
            _jsonnet.evaluate_file(args.config,
                                   tla_codes={'args': args.config_args}))
    else:
        config = json.loads(_jsonnet.evaluate_file(args.config))

    # 0. Construct preprocessors
    model_preproc = registry.instantiate(registry.lookup(
        'model', config['model']).Preproc,
                                         config['model'],
                                         unused_keys=('name', ))
    model_preproc.load()

    # 1. Construct model
    model = registry.construct('model',
                               config['model'],
                               unused_keys=('encoder_preproc',
                                            'decoder_preproc'),
                               preproc=model_preproc,
                               device=device)
    model.to(device)
    model.eval()

    # 3. Get training data somewhere
    train_data = model_preproc.dataset('train')
    train_eval_data_loader = torch.utils.data.DataLoader(
        train_data, batch_size=10, collate_fn=lambda x: x)

    batch = next(iter(train_eval_data_loader))
    descs = [x for x, y in batch]

    q0, qb = test_enc_equal([descs[0]['question']],
                            [[desc['question']] for desc in descs],
                            model.encoder.question_encoder)

    c0, cb = test_enc_equal(descs[0]['columns'],
                            [desc['columns'] for desc in descs],
                            model.encoder.column_encoder)

    t0, tb = test_enc_equal(descs[0]['tables'],
                            [desc['tables'] for desc in descs],
                            model.encoder.table_encoder)

    q0_enc, c0_enc, t0_enc = model.encoder.encs_update.forward_unbatched(
        descs[0], q0[0], c0[0], c0[1], t0[0], t0[1])
    qb_enc, cb_enc, tb_enc = model.encoder.encs_update.forward(
        descs, qb[0], cb[0], cb[1], tb[0], tb[1])

    check_close(q0_enc.squeeze(1), qb_enc.select(0))
    check_close(c0_enc.squeeze(1), cb_enc.select(0))
    check_close(t0_enc.squeeze(1), tb_enc.select(0))
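
`test_enc_equal` and `check_close` are not shown. Presumably `test_enc_equal` encodes the same items unbatched and batched and returns both results, and `check_close` asserts they match numerically; a sketch of the latter (tolerances are assumptions):

def check_close(a, b, rtol=1e-5, atol=1e-6):
    # Fail loudly if the unbatched and batched encodings diverge.
    assert torch.allclose(a, b, rtol=rtol, atol=atol), \
        'max abs diff: {:.3e}'.format((a - b).abs().max().item())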
Example 10
def main():
    # NOTE: `args` is used throughout this excerpt but never defined in it;
    # the parser below is reconstructed from the flags the body actually
    # reads (defaults and `choices` are assumptions).
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', required=True)
    parser.add_argument('--config', required=True)
    parser.add_argument('--config-args')
    parser.add_argument('--section', required=True)
    parser.add_argument('--output', required=True)
    parser.add_argument('--step', type=int)
    parser.add_argument('--limit', type=int)
    parser.add_argument('--mode',
                        choices=['infer', 'debug', 'visualize_attention'],
                        default='infer')
    parser.add_argument('--beam-size', type=int, default=1)
    parser.add_argument('--output-history', action='store_true')
    args = parser.parse_args()

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
        torch.set_num_threads(1)
    if args.config_args:
        config = json.loads(
            _jsonnet.evaluate_file(args.config,
                                   tla_codes={'args': args.config_args}))
    else:
        config = json.loads(_jsonnet.evaluate_file(args.config))

    if 'model_name' in config:
        args.logdir = os.path.join(args.logdir, config['model_name'])

    output_path = args.output.replace('__LOGDIR__', args.logdir)
    if os.path.exists(output_path):
        print('Output file {} already exists'.format(output_path))
        sys.exit(1)

    # 0. Construct preprocessors
    model_preproc = registry.instantiate(
        registry.lookup('model', config['model']).Preproc, config['model'])
    model_preproc.load()

    # 1. Construct model
    model = registry.construct('model',
                               config['model'],
                               preproc=model_preproc,
                               device=device)
    model.to(device)
    model.eval()
    model.visualize_flag = False

    optimizer = registry.construct('optimizer',
                                   config['optimizer'],
                                   params=model.parameters())

    # 2. Restore its parameters
    saver = saver_mod.Saver(model, optimizer)
    last_step = saver.restore(args.logdir, step=args.step, map_location=device)
    if not last_step:
        raise Exception('Attempting to infer on untrained model')

    # 3. Get training data somewhere
    output = open(output_path, 'w')
    data = registry.construct('dataset', config['data'][args.section])
    if args.limit:
        sliced_data = itertools.islice(data, args.limit)
    else:
        sliced_data = data

    with torch.no_grad():
        if args.mode == 'infer':
            orig_data = registry.construct('dataset',
                                           config['data'][args.section])
            preproc_data = model_preproc.dataset(args.section)
            if args.limit:
                sliced_orig_data = itertools.islice(orig_data, args.limit)
                sliced_preproc_data = itertools.islice(preproc_data, args.limit)
            else:
                sliced_orig_data = orig_data
                sliced_preproc_data = preproc_data
            assert len(orig_data) == len(preproc_data)
            infer(model, args.beam_size, args.output_history, sliced_orig_data,
                  sliced_preproc_data, output)
        elif args.mode == 'debug':
            data = model_preproc.dataset(args.section)
            if args.limit:
                sliced_data = itertools.islice(data, args.limit)
            else:
                sliced_data = data
            debug(model, sliced_data, output)
        elif args.mode == 'visualize_attention':
            model.visualize_flag = True
            model.decoder.visualize_flag = True
            data = registry.construct('dataset', config['data'][args.section])
            if args.limit:
                sliced_data = itertools.islice(data, args.limit)
            else:
                sliced_data = data
            visualize_attention(model, args.beam_size, args.output_history,
                                sliced_data, output)
Example 11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', required=True)
    parser.add_argument('--config-args')
    parser.add_argument('--section', default='train')
    parser.add_argument('--num-iters', type=int, default=100)
    parser.add_argument('--vis-out')
    args = parser.parse_args()

    if args.config_args:
        config = json.loads(
            _jsonnet.evaluate_file(args.config,
                                   tla_codes={'args': args.config_args}))
    else:
        config = json.loads(_jsonnet.evaluate_file(args.config))

    # 0. Construct preprocessors
    model_preproc = registry.instantiate(
        registry.lookup('model', config['model']).Preproc, config['model'])
    model_preproc.load()

    # 3. Get training data somewhere
    preproc_data = model_preproc.dataset(args.section)
    all_trees = [dec.tree for enc, dec in preproc_data]
    tree_bpe = TreeBPE(model_preproc.dec_preproc.grammar)
    for i in tqdm.tqdm(range(args.num_iters), dynamic_ncols=True):
        tree_bpe.run_iteration(all_trees)
    tree_bpe.finish(all_trees)
    print('Finished')

    if args.vis_out:
        f = open(args.vis_out, 'w')
        f.write('''# Documentation
#
# Idiom trees are printed like this:
#   NodeType
#   ├─field1 [field1_type]
#   ├─field2 [field2_type]?
#   └─field3 [field3_type]*
# ? indicates the field is optional.
# * indicates the field is sequential.
#
# If a field has a known primitive value, it is written like this:
#   └─field3 [str]
#     └─'value'
#
# If a field has a known type for its value, it is written like this:
#   └─field3 [field3_type]
#     └─Field3NodeType
#       └─...
#
# If a field:
# - does not have a known value, or
# - is sequential and the idiom allows for further entries at the end
# it is written like this:
#   └─field3 [field3_type]
#     └─??
# 
# If a field:
# - is optional and known to lack a value, or
# - is sequential and the idiom does not allow for further entries at the end
# then there is no ??.

Initial node type frequency:
''')

        for k, v in tree_bpe.pre_iteration_counts[0].most_common():
            print('- {}: {}'.format(k, v), file=f)
        print(file=f)

        for i, type_info in enumerate(tree_bpe.created_types):
            print('# Idiom {} [{}]'.format(i, type_info.name), file=f)
            print('# Descended from {} by setting {} to {}'.format(
                *type_info.predecessor_triple),
                  file=f)
            print('# Frequency at creation: {}'.format(
                tree_bpe.pre_iteration_counts[i + 1][type_info.name]),
                  file=f)
            print(tree_bpe.visualize(type_info), file=f)
        f.close()
    else:
        import IPython
        IPython.embed()