Example #1
0
            # Periodic progress report: fires whenever n_iter is a multiple of 500.
            if n_iter % 500 == 0:
                # (key in `stats`, human-readable label) pairs to report
                stats_str = [('DIS_COSTS', 'Discriminator loss')]
                # mean of each non-empty stat list accumulated since the last report
                stats_log = ['%s: %.4f' % (v, np.mean(stats[k]))
                             for k, v in stats_str if len(stats[k]) > 0]
                # throughput: n_words_proc divided by the seconds elapsed since `tic`
                stats_log.append('%i samples/s' % int(n_words_proc / (time.time() - tic)))
                logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

                # reset the timer, the processed counter and (in place) the stat lists
                tic = time.time()
                n_words_proc = 0
                for k, _ in stats_str:
                    del stats[k][:]

        # embeddings / discriminator evaluation
        # (both evaluator calls receive `to_log`; presumably they add their
        # metrics to it in place -- confirm against Evaluator)
        to_log = OrderedDict({'n_epoch': n_epoch})
        evaluator.all_eval(to_log)
        evaluator.eval_dis(to_log)

        # JSON log / save best model / end of epoch
        # (`to_log` must be JSON-serializable at this point)
        logger.info("__log__:%s" % json.dumps(to_log))
        trainer.save_best(to_log, VALIDATION_METRIC)
        logger.info('End of epoch %i.\n\n' % n_epoch)

        # update the learning rate (stop if too small)
        trainer.update_lr(to_log, VALIDATION_METRIC)
        # NOTE(review): the threshold actually tested is params.min_lr, but the
        # message below hard-codes "1e-6"; it is misleading whenever min_lr is
        # set to another value.
        if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
            logger.info('Learning rate < 1e-6. BREAK.')
            break


"""
Example #2
0
def main():
    """Command-line entry point: fit a mapping between embeddings and evaluate it.

    Steps, in order:
      1. parse the command-line arguments;
      2. sanity-check them (each failed check raises ``AssertionError`` with
         a message naming the offending option);
      3. build the logger, the model and the ``Trainer``;
      4. load the training dictionary and select the validation metric;
      5. call ``trainer.fit`` with the requested ``--fitting_method``;
      6. build the ``Evaluator`` (only after fitting), run ``all_eval`` and
         log the collected metrics as one JSON line prefixed by ``__log__:``.

    NOTE(review): ``--n_refinement`` and the selected validation metric are
    parsed / logged here but not otherwise used in this function.
    """
    VALIDATION_METRIC_SUP = 'precision_at_1-csls_knn_10'
    VALIDATION_METRIC_UNSUP = 'mean_cosine-csls_knn_10-S2T-10000'

    # main
    parser = argparse.ArgumentParser(description='Supervised training')
    parser.add_argument("--seed", type=int, default=-1, help="Initialization seed")
    parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)")
    parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models")
    parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name")
    parser.add_argument("--exp_id", type=str, default="", help="Experiment ID")
    parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU")
    parser.add_argument("--export", type=str, default="txt", help="Export embeddings after training (txt / pth)")

    # data
    parser.add_argument("--src_lang", type=str, default='en', help="Source language")
    parser.add_argument("--tgt_lang", type=str, default='es', help="Target language")
    parser.add_argument("--aux_lang", type=str, default='', help="Auxiliary language")
    parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension")
    parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)")
    # training refinement
    parser.add_argument("--n_refinement", type=int, default=5, help="Number of refinement iterations (0 to disable the refinement procedure)")
    # dictionary creation parameters (for refinement)
    parser.add_argument("--dico_train", type=str, default="default", help="Path to training dictionary (default: use identical character strings)")
    parser.add_argument("--dico_eval", type=str, default="default", help="Path to evaluation dictionary")
    parser.add_argument("--dico_method", type=str, default='csls_knn_10', help="Method used for dictionary generation (nn/invsm_beta_30/csls_knn_10)")
    parser.add_argument("--dico_build", type=str, default='S2T&T2S', help="S2T,T2S,S2T|T2S,S2T&T2S")
    parser.add_argument("--dico_threshold", type=float, default=0, help="Threshold confidence for dictionary generation")
    parser.add_argument("--dico_max_rank", type=int, default=10000, help="Maximum dictionary words rank (0 to disable)")
    parser.add_argument("--dico_min_size", type=int, default=0, help="Minimum generated dictionary size (0 to disable)")
    parser.add_argument("--dico_max_size", type=int, default=0, help="Maximum generated dictionary size (0 to disable)")
    # reload pre-trained embeddings
    parser.add_argument("--src_emb", type=str, default='', help="Reload source embeddings")
    parser.add_argument("--tgt_emb", type=str, default='', help="Reload target embeddings")
    parser.add_argument("--aux_emb", type=str, default='', help="Reload auxiliary embeddings")
    parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training")
    parser.add_argument("--fitting_method", type=str, default="non_iterative", help="Method of fitting, one of [non_iterative, em, gauss_seidel, gradient_based]")

    # parse parameters
    params = parser.parse_args()

    # check parameters (every assertion names the value that failed, so no
    # separate debug print of the paths is needed)
    assert not params.cuda or torch.cuda.is_available(), \
        "--cuda is enabled but no CUDA device is available"
    assert params.dico_train in ["identical_char", "default"] or os.path.isfile(params.dico_train), \
        "--dico_train is neither a known keyword nor an existing file: %r" % params.dico_train
    assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"], \
        "invalid --dico_build: %r" % params.dico_build
    # a rank limit of 0 means "disabled", so it must not constrain the size
    assert (params.dico_max_size == 0 or params.dico_max_rank == 0
            or params.dico_max_size < params.dico_max_rank), \
        "--dico_max_size (%i) must be smaller than --dico_max_rank (%i)" % (
            params.dico_max_size, params.dico_max_rank)
    assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size, \
        "--dico_max_size (%i) must be greater than --dico_min_size (%i)" % (
            params.dico_max_size, params.dico_min_size)
    assert os.path.isfile(params.src_emb), \
        "--src_emb is not an existing file: %r" % params.src_emb
    assert os.path.isfile(params.tgt_emb), \
        "--tgt_emb is not an existing file: %r" % params.tgt_emb
    assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval), \
        "--dico_eval is neither 'default' nor an existing file: %r" % params.dico_eval
    assert params.export in ["", "txt", "pth"], \
        "invalid --export: %r" % params.export

    # build logger / model / trainer / evaluator
    logger = initialize_exp(params)
    # the fifth value returned by build_model is not needed here
    src_emb, tgt_emb, aux_emb, mapping, _ = build_model(params, False)

    trainer = Trainer(src_emb, tgt_emb, aux_emb, mapping, None, params)

    # load a training dictionary. if a dictionary path is not provided, use a default
    # one ("default") or create one based on identical character strings ("identical_char")
    trainer.load_training_dico(params.dico_train)

    # define the validation metric
    VALIDATION_METRIC = VALIDATION_METRIC_UNSUP if params.dico_train == 'identical_char' else VALIDATION_METRIC_SUP
    logger.info("Validation metric: %s" % VALIDATION_METRIC)

    # apply the PCCA solution
    trainer.fit(fitting_method=params.fitting_method)

    # IMPORTANT: EVALUATOR SHOULD BE CREATED AFTER TRAINER HAS BEEN FITTED
    evaluator = Evaluator(trainer)

    # embeddings evaluation
    to_log = OrderedDict()
    evaluator.all_eval(to_log)

    logger.info("__log__:%s" % json.dumps(to_log))
Example #3
0
"""
Learning loop for Procrustes Iterative Learning
"""
# Iterative Procrustes: one pass for each n_iter in 0..params.n_refinement.
for n_iter in range(0, params.n_refinement + 1):

    logger.info('Starting iteration %i...' % (n_iter,))

    # The very first pass keeps an already-loaded dictionary (trainer.dico);
    # every other pass regenerates it from the currently aligned embeddings.
    if not (n_iter == 0 and hasattr(trainer, 'dico')):
        trainer.build_dictionary()

    # closed-form Procrustes update of the mapping
    trainer.procrustes()

    # evaluate the embeddings; results are collected in to_log
    to_log = OrderedDict([('n_iter', n_iter)])
    evaluator.all_eval(to_log)

    # emit the JSON log line, keep the best model, close the iteration
    logger.info("__log__:%s" % (json.dumps(to_log),))
    trainer.save_best(to_log, VALIDATION_METRIC)
    logger.info('End of iteration %i.\n\n' % (n_iter,))


# When an export format is set, restore the best saved mapping first and
# then write the embeddings out.
if params.export:
    trainer.reload_best()
    trainer.export()
Example #4
0
    """
    for n_iter in range(params.n_refinement + 1):

        logger.info('Starting iteration %i...' % n_iter)

        # build a dictionary from aligned embeddings (unless
        # it is the first iteration and we use the init one)
        if n_iter > 0 or not hasattr(trainer, 'dico'):
            trainer.build_dictionary()

        # apply the Procrustes solution
        trainer.procrustes()

        # embeddings evaluation
        to_log = OrderedDict({'n_iter': n_iter})
        evaluator.all_eval(to_log, exclude=code)

        # JSON log / save best model / end of epoch
        logger.info("__log__:%s" % json.dumps(to_log))
        trainer.save_best(to_log, params.val_metric, n_iter)
        logger.info('End of iteration %i.\n\n' % n_iter)

    #get rank of left-out code
    trainer.reload_best()
    desc_repr = trainer.tgt_emb.weight[trainer.tgt_dico.word2id[word]]
    code_sims = cos(trainer.mapping(trainer.src_emb.weight), desc_repr.unsqueeze(0)).data.cpu().numpy()
    print("getting similarity rank of code %s" % code)
    rank = len(code_sims) - np.where(np.argsort(code_sims) == trainer.src_dico.word2id[code])[0][0]

    code_sims_unaligned = cos(trainer.src_emb.weight, desc_repr.unsqueeze(0)).data.cpu().numpy()
    rank_u = len(code_sims) - np.where(np.argsort(code_sims_unaligned) == trainer.src_dico.word2id[code])[0][0]
    if n_iter > params.n_refinement - params.fine_tuning:
        support = False

    logger.info('Starting iteration %i...' % n_iter)

    # build a dictionary from aligned embeddings (unless
    # it is the first iteration and we use the init one)
    if n_iter > 0 or not hasattr(trainer, 'dico'):
        trainer.build_dictionary(support)

    # apply the Procrustes solution
    if params.generalized:
        trainer.generalized_procrustes(support, n_iter == 0)
    else:
        trainer.simple_procrustes()

    # embeddings evaluation
    to_log = OrderedDict({'n_iter': n_iter})
    biling_dict = True
    evaluator.all_eval(to_log, biling_dict)

    # JSON log / save best model / end of epoch
    logger.info("__log__:%s" % json.dumps(to_log))
    trainer.save_best(to_log, VALIDATION_METRIC.format(params.tgt_lang[-1]))
    logger.info('End of iteration %i.\n\n' % n_iter)

# export embeddings
if params.export:
    trainer.reload_best()
    trainer.export()
Example #6
0
                    '%s: %.4f' % (v, np.mean(stats[k])) for k, v in stats_str
                    if len(stats[k]) > 0
                ]
                stats_log.append('%i samples/s' % int(n_words_proc /
                                                      (time.time() - tic)))
                logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

                # reset
                tic = time.time()
                n_words_proc = 0
                for k, _ in stats_str:
                    del stats[k][:]

        # embeddings / discriminator evaluation
        to_log = OrderedDict({'n_epoch': n_epoch})
        evaluator.all_eval(to_log, n_epoch)
        evaluator.eval_dis(to_log)

        # JSON log / save best model / end of epoch
        logger.info("__log__:%s" % json.dumps(to_log))
        trainer.save_best(to_log, VALIDATION_METRIC)
        logger.info('End of epoch %i.\n\n' % n_epoch)

        # update the learning rate (stop if too small)
        trainer.update_lr(to_log, VALIDATION_METRIC)
        if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
            logger.info('Learning rate < 1e-6. BREAK.')
            break
"""
Learning loop for Procrustes Iterative Refinement
"""
Example #7
0
                    '%s: %.4f' % (v, np.mean(stats[k])) for k, v in stats_str
                    if len(stats[k]) > 0
                ]
                stats_log.append('%i samples/s' % int(n_words_proc /
                                                      (time.time() - tic)))
                logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

                # reset
                tic = time.time()
                n_words_proc = 0
                for k, _ in stats_str:
                    del stats[k][:]

        # embeddings / discriminator evaluation
        to_log = OrderedDict({'n_epoch': n_epoch})
        evaluator.all_eval(to_log, 0)
        evaluator.eval_dis(to_log)

        # JSON log / save best model / end of epoch
        logger.info("__log__:%s" % json.dumps(to_log))
        trainer.save_best(to_log, VALIDATION_METRIC)
        logger.info('End of epoch %i.\n\n' % n_epoch)

        # update the learning rate (stop if too small)
        trainer.update_lr(to_log, VALIDATION_METRIC)
        if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
            logger.info('Learning rate < 1e-6. BREAK.')
            break
"""
Learning loop for Procrustes Iterative Refinement
"""
Example #8
0
# Build the trainer from the embeddings, the mapping and the discriminator,
# then wrap it in an evaluator.
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)

evaluator = Evaluator(trainer)
# NOTE(review): NumPy's global RNG is seeded only here, i.e. after the trainer
# and evaluator have been constructed -- confirm nothing above draws from it.
np.random.seed(params.seed)

# init generator parameters with artetxe's methods
# (extract_initial_mapping must return a NumPy array, since its result is
# handed to torch.from_numpy before being installed as the mapping weights)
if params.map_init == "second_order":
    m_init = extract_initial_mapping(params, src_emb, tgt_emb)
    trainer.set_mapping_weights(torch.from_numpy(m_init))

# if we initialize the generator from a supervised mapping, evaluate before training for sanity check
# NOTE(review): the guard actually tests `not params.map_id_init`; presumably a
# falsy map_id_init means a non-identity initialization -- confirm that this
# matches the intent stated above.
if not params.map_id_init:
    # embeddings / discriminator evaluation
    # n_epoch = -1 marks this record as coming before the first training epoch
    to_log = OrderedDict({'n_epoch': -1})
    evaluator.all_eval(to_log)
    evaluator.eval_dis(to_log)
"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_words_proc = 0
        stats = {'DIS_COSTS': []}
Example #9
0
                    '%s: %.4f' % (v, np.mean(stats[k])) for k, v in stats_str
                    if len(stats[k]) > 0
                ]
                stats_log.append('%i samples/s' % int(n_words_proc /
                                                      (time.time() - tic)))
                logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

                # reset
                tic = time.time()
                n_words_proc = 0
                for k, _ in stats_str:
                    del stats[k][:]

        # embeddings / discriminator evaluation
        to_log = OrderedDict({'n_epoch': n_epoch})
        evaluator.all_eval(to_log, map='to_tgt')
        evaluator.eval_dis(to_log)

        # JSON log / save best model / end of epoch
        logger.info("__log__:%s" % json.dumps(to_log))
        trainer.save_best(to_log, VALIDATION_METRIC, map='to_tgt')
        logger.info('End of epoch %i.\n\n' % n_epoch)

        # update the learning rate (stop if too small)
        trainer.update_lr(to_log, VALIDATION_METRIC)
        if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
            logger.info('Learning rate < 1e-6. BREAK.')
            break
"""
Learning loop for Procrustes Iterative Refinement
"""
                    '%s: %.4f' % (v, np.mean(stats[k])) for k, v in stats_str
                    if len(stats[k]) > 0
                ]
                stats_log.append('%i samples/s' % int(n_words_proc /
                                                      (time.time() - tic)))
                logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

                # reset
                tic = time.time()
                n_words_proc = 0
                for k, _ in stats_str:
                    del stats[k][:]

        # embeddings / discriminator evaluation
        to_log = OrderedDict({'n_epoch': n_epoch})
        evaluator.all_eval(to_log, True)
        evaluator.eval_dis(to_log)

        # JSON log / save best model / end of epoch
        logger.info("__log__:%s" % json.dumps(to_log))
        trainer.save_best(to_log,
                          VALIDATION_METRIC.format(params.tgt_lang[-1]))
        logger.info('End of epoch %i.\n\n' % n_epoch)

        # update the learning rate (stop if too small)
        trainer.update_lr(to_log,
                          VALIDATION_METRIC.format(params.tgt_lang[-1]))
        if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
            logger.info('Learning rate < 1e-6. BREAK.')
            break
"""
Example #11
0
def _run_adversarial_training(trainer, evaluator, params, logger, validation_metric):
    """Run the adversarial epochs, stopping early once the learning rate is too small."""
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # (key in `stats`, human-readable label) pairs reported in the progress log
    stats_str = [('DIS_COSTS', 'Discriminator loss')]

    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_words_proc = 0
        stats = {'DIS_COSTS': []}

        for n_iter in range(0, params.epoch_size, params.batch_size):

            # discriminator training
            for _ in range(params.dis_steps):
                trainer.dis_step(stats)

            # mapping training (discriminator fooling)
            n_words_proc += trainer.mapping_step(stats)

            # log stats whenever n_iter is a multiple of 500
            if n_iter % 500 == 0:
                stats_log = [
                    '%s: %.4f' % (v, np.mean(stats[k]))
                    for k, v in stats_str if len(stats[k]) > 0
                ]
                stats_log.append('%i samples/s' % int(n_words_proc /
                                                      (time.time() - tic)))
                logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

                # reset the timer, the counter and (in place) the stat lists
                tic = time.time()
                n_words_proc = 0
                for k, _ in stats_str:
                    del stats[k][:]

        # embeddings / discriminator evaluation
        to_log = OrderedDict({'n_epoch': n_epoch})
        evaluator.all_eval(to_log)
        evaluator.eval_dis(to_log)

        # JSON log / save best model / end of epoch
        logger.info("__log__:%s" % json.dumps(to_log))
        trainer.save_best(to_log, validation_metric)
        logger.info('End of epoch %i.\n\n' % n_epoch)

        # update the learning rate (stop if too small); the message reports
        # the configured threshold rather than a hard-coded value
        trainer.update_lr(to_log, validation_metric)
        if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
            logger.info('Learning rate < %g. BREAK.' % params.min_lr)
            break


def _run_procrustes_refinement(trainer, evaluator, params, logger, validation_metric):
    """Reload the best saved mapping, then run params.n_refinement Procrustes iterations."""
    logger.info('----> ITERATIVE PROCRUSTES REFINEMENT <----\n\n')
    # start from the best mapping according to validation_metric
    trainer.reload_best()

    for n_iter in range(params.n_refinement):

        logger.info('Starting refinement iteration %i...' % n_iter)

        # build a dictionary from aligned embeddings
        trainer.build_dictionary()

        # apply the Procrustes solution
        trainer.procrustes()

        # embeddings evaluation
        to_log = OrderedDict({'n_iter': n_iter})
        evaluator.all_eval(to_log)

        # JSON log / save best model / end of iteration
        logger.info("__log__:%s" % json.dumps(to_log))
        trainer.save_best(to_log, validation_metric)
        logger.info('End of refinement iteration %i.\n\n' % n_iter)


def learning(params, src_data, tgt_data, options, n_rounds=10):
    """Alternate RVSML_OT_Learning with adversarial + Procrustes alignment.

    Each round:
      1. sets ``options.initialize`` (True on the first round only) and
         replaces ``src_data`` / ``tgt_data`` with the values returned by
         ``RVSML_OT_Learning``;
      2. builds a fresh model, ``Trainer`` and ``Evaluator``;
      3. runs adversarial training, then iterative Procrustes refinement;
      4. right-multiplies ``src_data.trans_mat`` by the trainer's mapping
         weight matrix.

    Args:
        params: experiment settings; this function reads ``n_epochs``,
            ``epoch_size``, ``batch_size``, ``dis_steps``, ``min_lr`` and
            ``n_refinement`` and forwards the object to the helpers it calls.
        src_data: source-side data object; must expose ``dataname`` (used for
            the logger name) and ``trans_mat``.
        tgt_data: target-side data object.
        options: options object forwarded to ``RVSML_OT_Learning``; its
            ``initialize`` attribute is overwritten every round.
        n_rounds: number of outer rounds (default 10, the previously
            hard-coded value).

    Returns:
        The ``(src_data, tgt_data)`` pair as of the last round (the inputs
        themselves when ``n_rounds`` is 0).
    """
    VALIDATION_METRIC = 'mean_cosine-csls_knn_10-S2T-10000'
    logger = logging.getLogger('{}Log'.format(src_data.dataname))

    for i in range(n_rounds):
        # only the first round asks for initialization
        options.initialize = (i == 0)

        # the "回目" suffix in the two messages below reads "-th time"
        logger.info("src_learning {}回目".format(i + 1))
        src_data = RVSML_OT_Learning(src_data, options, params)

        logger.info("tgt_learning {}回目".format(i + 1))
        tgt_data = RVSML_OT_Learning(tgt_data, options, params)

        # build model / trainer / evaluator
        # (the trailing True is presumably the "with discriminator" switch
        # of build_model -- confirm)
        src_emb, tgt_emb, mapping, discriminator = build_model(
            params, src_data, tgt_data, True)

        trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
        evaluator = Evaluator(trainer)

        _run_adversarial_training(trainer, evaluator, params, logger,
                                  VALIDATION_METRIC)
        _run_procrustes_refinement(trainer, evaluator, params, logger,
                                   VALIDATION_METRIC)

        # NOTE(review): reload_best() is not called after the refinement
        # loop, so the matrix composed here is the mapping left by the last
        # refinement iteration, not necessarily the best saved one -- confirm
        # this is intended.
        src_data.trans_mat = torch.mm(src_data.trans_mat,
                                      trainer.mapping.weight.data)

    return src_data, tgt_data
Example #12
0
                    '%s: %.4f' % (v, np.mean(stats[k])) for k, v in stats_str
                    if len(stats[k]) > 0
                ]
                stats_log.append('%i samples/s' % int(n_words_proc /
                                                      (time.time() - tic)))
                logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

                # reset
                tic = time.time()
                n_words_proc = 0
                for k, _ in stats_str:
                    del stats[k][:]

        # embeddings / discriminator evaluation
        to_log = OrderedDict({'n_epoch': n_epoch})
        evaluator.all_eval(to_log)  #AssertionError
        evaluator.eval_dis(to_log)

        # JSON log / save best model / end of epoch
        logger.info("__log__:%s" % json.dumps(to_log))
        trainer.save_best(to_log, VALIDATION_METRIC)
        logger.info('End of epoch %i.\n\n' % n_epoch)

        # update the learning rate (stop if too small)
        trainer.update_lr(to_log, VALIDATION_METRIC)
        if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
            logger.info('Learning rate < 1e-6. BREAK.')
            break
"""
Learning loop for Procrustes Iterative Refinement
"""