Example #1
def main(opt):
    """ Spawns 1 process per GPU """
    nb_gpu = len(opt.gpuid)
    logger = get_logger(opt.log_file)
    mp = torch.multiprocessing.get_context('spawn')

    # Create a thread to listen for errors in the child processes.
    error_queue = mp.SimpleQueue()
    error_handler = ErrorHandler(error_queue)

    # Train with multiprocessing.
    procs = []
    for i in range(nb_gpu):
        opt.gpu_rank = i
        opt.device_id = i

        procs.append(
            mp.Process(target=run, args=(
                opt,
                error_queue,
            ), daemon=True))
        procs[i].start()
        logger.info(" Starting process pid: %d  " % procs[i].pid)
        error_handler.add_child(procs[i].pid)
    for p in procs:
        p.join()
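
A minimal sketch of the child side of this pattern, showing how a worker might report failures through the shared error_queue so that ErrorHandler can shut everything down; this run body and the train_single entry point are assumptions for illustration, not the actual implementation.

import traceback

def run(opt, error_queue):
    """Hypothetical child worker: run single-GPU training and forward any
    crash to the parent process through error_queue."""
    try:
        train_single(opt)  # assumed per-GPU training entry point
    except KeyboardInterrupt:
        pass  # interrupted by the parent; nothing to report
    except Exception:
        # Ship the GPU rank and the full traceback back to the parent.
        error_queue.put((opt.gpu_rank, traceback.format_exc()))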
def init_model(model, use_gpu=False):
    opt = FakeOpt(model=model, gpu=0 if use_gpu else -1)

    translator = build_translator(opt,
                                  report_score=False,
                                  logger=get_logger(),
                                  use_output=False)
    return translator
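
A quick usage sketch for init_model; the checkpoint path below is hypothetical.

# Hypothetical checkpoint path; use_gpu=True maps to GPU 0 in FakeOpt.
translator = init_model('models/en-es-model.pt', use_gpu=True)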
def _modify(corpus=None, neurons=None, values=None, model=None):

    opt = FakeOpt(model=model)

    translator = build_translator(opt,
                                  report_score=False,
                                  logger=get_logger(),
                                  use_output=False)

    sources, toggles = zip(*corpus)
    print('Toggles as originally given:')
    print(toggles[:10])
    _, toggles = zip(*toggles)

    print(sources[:10], toggles[:10])

    def intervene(layer_data, sentence_index, index):
        for (layer, neuron), value in zip(neurons, values):
            if index == layer:
                for i in toggles[sentence_index]:
                    tqdm.write('Successfully modifying %d %d %d %f' %
                               (i, layer, neuron, value))
                    layer_data[i][0][neuron] = value
        return layer_data

    modified = []
    for i, source in enumerate(tqdm(sources)):
        stream = io.StringIO()

        # Logging:
        tqdm.write('Source: %s' % ' '.join(source))
        tqdm.write('Target: %s' % ' '.join(source[j] for j in toggles[i]))

        translator.translate(src_data_iter=[' '.join(source)],
                             src_dir='',
                             batch_size=1,
                             attn_debug=False,
                             intervention=lambda l, j: intervene(l, i, j),
                             out_file=stream)
        translation = stream.getvalue()

        # Logging:
        tqdm.write('Result: %s' % translation)

        modified.append(translation.strip().split(' '))
    return modified
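
A hedged usage sketch for _modify, inferring shapes from the unpacking above: each corpus entry is (source_tokens, (tag, token_indices)), and neurons/values are parallel lists of (layer, neuron) pairs and activation values. Every concrete value and the model path below is hypothetical.

corpus = [(['the', 'cat', 'sits'], ('present', [2]))]  # hypothetical sentence/toggle
neurons = [(1, 250)]   # hypothetical (layer, neuron) pair
values = [5.0]         # hypothetical activation value to write
modified = _modify(corpus=corpus, neurons=neurons, values=values,
                   model='models/en-es-model.pt')  # hypothetical checkpoint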
Example #4
def translate(
        sentences=None,
        modifications=None,
        model=None):

    global translator
    global translator_model_name

    if model != translator_model_name:
        opt = FakeOpt(model=model)

        translator = build_translator(opt, report_score=False, logger=get_logger(), use_output=False)
        translator_model_name = model

    def intervene(layer_data, sentence_index, index):
        for token, layer, neuron, value in modifications[sentence_index]:
            if layer == index:
                print('Successfully flipping %d %d %d %f' % (token, layer, neuron, value))
                layer_data[token][0][neuron] = value
        return layer_data

    modified = []
    dumps = []

    # NB: passing streams around and also returning them is somewhat hacky;
    # the plumbing could be deduplicated later, but everything works as-is.
    for i, source in enumerate(sentences):
        stream = io.StringIO()
        layer_dump, scores, predictions = translator.translate(src_data_iter=[source],
                             src_dir='',
                             batch_size=1,
                             attn_debug=False,
                             intervention=lambda l, j: intervene(l, i, j),
                             out_file=stream)
        translation = stream.getvalue()

        sys.stdout.flush()
        modified.append(translation)
        dumps.append(layer_dump)

    return modified, dumps
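
A hedged call sketch: modifications is indexed by sentence and holds (token, layer, neuron, value) tuples, exactly as intervene reads them above; the sentence, numbers, and checkpoint path are hypothetical.

outputs, layer_dumps = translate(
    sentences=['the cat sits on the mat'],
    modifications=[[(1, 0, 250, 5.0)]],         # (token, layer, neuron, value)
    model='models/en-es-model.pt')              # hypothetical checkpoint path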
Example #5
def preprocess_main(opt):
    logger = get_logger(opt.log_file)
    src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src,
                                            'src')
    tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt,
                                            'tgt')
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt, logger)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files, fields, opt, logger)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt, logger)
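
A sketch of driving preprocess_main from the standard preprocessing option group of this codebase (onmt.opts.preprocess_opts); all file paths are hypothetical.

import argparse
import onmt.opts

parser = argparse.ArgumentParser(description='preprocess.py')
onmt.opts.add_md_help_argument(parser)
onmt.opts.preprocess_opts(parser)
opt = parser.parse_args(['-train_src', 'data/src-train.txt',   # hypothetical paths
                         '-train_tgt', 'data/tgt-train.txt',
                         '-valid_src', 'data/src-val.txt',
                         '-valid_tgt', 'data/tgt-val.txt',
                         '-save_data', 'data/demo'])
preprocess_main(opt)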
Example #6
    def __init__(self,
                 opt,
                 model_id,
                 tokenizer_opt=None,
                 load=False,
                 timeout=-1,
                 on_timeout="to_cpu",
                 model_root="./"):
        """
            Args:
                opt: (dict) options for the Translator
                model_id: (int) model id
                tokenizer_opt: (dict) options for the tokenizer or None
                load: (bool) whether to load the model during __init__
                timeout: (int) seconds before running `do_timeout`
                         Negative values mean no timeout
                on_timeout: (str) in ["to_cpu", "unload"] set what to do on
                            timeout (see function `do_timeout`)
                model_root: (str) path to the model directory
                            it must contain the model and tokenizer files

        """
        self.model_root = model_root
        self.opt = self.parse_opt(opt)
        if self.opt.n_best > 1:
            raise ValueError("Values of n_best > 1 are not supported")

        self.model_id = model_id
        self.tokenizer_opt = tokenizer_opt
        self.timeout = timeout
        self.on_timeout = on_timeout

        self.unload_timer = None
        self.user_opt = opt
        self.tokenizer = None
        self.logger = get_logger(opt.log_file)

        if load:
            self.load()
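
A construction sketch for this wrapper; the class name ServerModel and every option value below are assumptions made for illustration, not taken from the snippet.

# Hypothetical: wrap one checkpoint for a translation server, load it now,
# and move it to CPU after ten idle minutes.
server_model = ServerModel(
    {'model': 'en-de-model.pt', 'beam_size': 5, 'n_best': 1},  # hypothetical opt dict
    model_id=0,
    tokenizer_opt=None,
    load=True,
    timeout=600,
    on_timeout='to_cpu',
    model_root='./available_models')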
    def intervene(layer_data, layer_index):
        rnn_size = layer_data.shape[2]
        start_range = layer_index * rnn_size
        end_range = start_range + rnn_size
        neurons = [n - start_range for n in neurons_to_ablate
                   if start_range <= n < end_range]

        layer_data[:,:,neurons] = 0

        return layer_data

    translator.translate(src_path=opt.src,
                         tgt_path=opt.tgt,
                         src_dir=opt.src_dir,
                         batch_size=opt.batch_size,
                         attn_debug=opt.attn_debug,
                         intervention=lambda l, i: intervene(l, i),
                )

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='ablate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    onmt.opts.add_md_help_argument(parser)
    onmt.opts.translate_opts(parser)

    parser.add_argument('-neurons-to-ablate', dest='neurons', type=str, default="")

    opt = parser.parse_args()
    logger = get_logger(opt.log_file)
    main(opt)
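
The intervene closure above treats neuron ids as global across layers; a small hypothetical helper makes the start_range/end_range arithmetic explicit (the rnn_size default is an assumption).

def split_neuron_index(n, rnn_size=500):
    """Hypothetical helper: map a global neuron id to (layer, within-layer index)."""
    return n // rnn_size, n % rnn_size

assert split_neuron_index(1234, rnn_size=500) == (2, 234)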
def main(opt):
    logger = get_logger(opt.log_file)

    opt = training_opt_postprocessing(opt, logger)

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
    else:
        checkpoint = None
        model_opt = opt

    # Peek the first dataset to determine the data_type.
    # (All datasets have the same data_type).
    first_dataset = next(lazily_load_dataset("train", opt, logger))
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = _load_fields(first_dataset, data_type, opt, checkpoint, logger)

    # Report src/tgt features.

    src_features, tgt_features = _collect_report_features(fields)
    for j, feat in enumerate(src_features):
        logger.info(' * src feature %d size = %d' %
                    (j, len(fields[feat].vocab)))
    for j, feat in enumerate(tgt_features):
        logger.info(' * tgt feature %d size = %d' %
                    (j, len(fields[feat].vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint, logger)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = build_optim(model, opt, checkpoint)

    # Build model saver
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(opt,
                            model,
                            fields,
                            optim,
                            data_type,
                            logger,
                            model_saver=model_saver)

    def train_iter_fct():
        return build_dataset_iter(lazily_load_dataset("train", opt, logger),
                                  fields, opt)

    def valid_iter_fct():
        return build_dataset_iter(lazily_load_dataset("valid", opt, logger),
                                  fields, opt)

    # Do training.
    trainer.train(train_iter_fct, valid_iter_fct, opt.train_steps,
                  opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
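
A sketch of invoking this training entry point with options built from the codebase's standard option groups; the data prefix, checkpoint prefix, and step count are hypothetical.

import argparse
import onmt.opts

parser = argparse.ArgumentParser(description='train.py')
onmt.opts.add_md_help_argument(parser)
onmt.opts.model_opts(parser)
onmt.opts.train_opts(parser)
opt = parser.parse_args(['-data', 'data/demo',          # hypothetical preprocessed prefix
                         '-save_model', 'demo-model',   # hypothetical checkpoint prefix
                         '-train_steps', '100000'])
main(opt)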
    def forward(self, input):
        laplacian = input.exp() + self.eps
        output = input.clone()
        for b in range(input.size(0)):
            lap = laplacian[b].masked_fill(
                torch.eye(input.size(1)).cuda().ne(0), 0)
            lap = -lap + torch.diag(lap.sum(0))
            # store roots on diagonal
            lap[0] = input[b].diag().exp()
            inv_laplacian = lap.inverse()

            factor = inv_laplacian.diag().unsqueeze(1)\
                                         .expand_as(input[b]).transpose(0, 1)
            term1 = input[b].exp().mul(factor).clone()
            term2 = input[b].exp().mul(inv_laplacian.transpose(0, 1)).clone()
            term1[:, 0] = 0
            term2[0] = 0
            output[b] = term1 - term2
            roots_output = input[b].diag().exp().mul(
                inv_laplacian.transpose(0, 1)[0])
            output[b] = output[b] + torch.diag(roots_output)
        return output


if __name__ == "__main__":
    logger = get_logger('StructuredAttention.log')
    dtree = MatrixTree()
    q = torch.rand(1, 5, 5).cuda()
    marg = dtree.forward(q)
    logger.info(marg.sum(1))
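
As a hedged check on the snippet above: under the matrix-tree construction (edge marginals plus root marginals on the diagonal), each column of marg should sum to approximately one, so the logged marg.sum(1) is expected to be close to a vector of ones.

# Hypothetical sanity check on the marginals computed above.
print(torch.allclose(marg.sum(1), torch.ones(1, 5).cuda(), atol=1e-3))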
Example #10
    filtered_enc_embeddings, enc_count = match_embeddings(
        enc_vocab, embeddings_enc, opt)
    filtered_dec_embeddings, dec_count = match_embeddings(
        dec_vocab, embeddings_dec, opt)
    logger.info("\nMatching: ")
    match_percent = [
        _['match'] / (_['match'] + _['miss']) * 100
        for _ in [enc_count, dec_count]
    ]
    logger.info("\t* enc: %d match, %d missing, (%.2f%%)" %
                (enc_count['match'], enc_count['miss'], match_percent[0]))
    logger.info("\t* dec: %d match, %d missing, (%.2f%%)" %
                (dec_count['match'], dec_count['miss'], match_percent[1]))

    logger.info("\nFiltered embeddings:")
    logger.info("\t* enc: ", filtered_enc_embeddings.size())
    logger.info("\t* dec: ", filtered_dec_embeddings.size())

    enc_output_file = opt.output_file + ".enc.pt"
    dec_output_file = opt.output_file + ".dec.pt"
    logger.info("\nSaving embedding as:\n\t* enc: %s\n\t* dec: %s" %
                (enc_output_file, dec_output_file))
    torch.save(filtered_enc_embeddings, enc_output_file)
    torch.save(filtered_dec_embeddings, dec_output_file)
    logger.info("\nDone.")


if __name__ == "__main__":
    logger = get_logger('embeddings_to_torch.log')
    main()
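
A hedged follow-up sketch: the two tensors saved above can be reloaded directly with torch.load; the output prefix below is hypothetical.

import torch

enc_emb = torch.load('embeddings.enc.pt')  # hypothetical opt.output_file prefix
dec_emb = torch.load('embeddings.dec.pt')
print(enc_emb.size(), dec_emb.size())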
Example #11
    fields = onmt.inputters.load_fields_from_vocab(checkpoint['vocab'])

    model_opt = checkpoint['opt']
    for arg in dummy_opt.__dict__:
        if arg not in model_opt:
            model_opt.__dict__[arg] = dummy_opt.__dict__[arg]

    model = onmt.model_builder.build_base_model(model_opt, fields,
                                                use_gpu(opt), checkpoint)
    encoder = model.encoder
    decoder = model.decoder

    encoder_embeddings = encoder.embeddings.word_lut.weight.data.tolist()
    decoder_embeddings = decoder.embeddings.word_lut.weight.data.tolist()

    logger.info("Writing source embeddings")
    write_embeddings(opt.output_dir + "/src_embeddings.txt", src_dict,
                     encoder_embeddings)

    logger.info("Writing target embeddings")
    write_embeddings(opt.output_dir + "/tgt_embeddings.txt", tgt_dict,
                     decoder_embeddings)

    logger.info('... done.')
    logger.info('Converting model...')


if __name__ == "__main__":
    logger = get_logger('extract_embeddings.log')
    main()
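
A small hypothetical check on the lists produced by .tolist() above: each entry is one embedding row, and all rows share the same dimensionality.

# Hypothetical check on encoder_embeddings as built above.
emb_dim = len(encoder_embeddings[0])
assert all(len(row) == emb_dim for row in encoder_embeddings)
print('embedding dimension: %d, vocab size: %d' % (emb_dim, len(encoder_embeddings)))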