Example #1
    def reload(path, params):
        """
        Create a sentence embedder from a pretrained model.
        """
        # reload model
        reloaded = torch.load(path)
        state_dict = reloaded['model']

        # handle models from multi-GPU checkpoints
        if 'checkpoint' in path:
            state_dict = {(k[7:] if k.startswith('module.') else k): v
                          for k, v in state_dict.items()}

        # reload dictionary and model parameters
        dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                          reloaded['dico_counts'])
        pretrain_params = AttrDict(reloaded['params'])
        pretrain_params.n_words = len(dico)
        pretrain_params.bos_index = dico.index(BOS_WORD)
        pretrain_params.eos_index = dico.index(EOS_WORD)
        pretrain_params.pad_index = dico.index(PAD_WORD)
        pretrain_params.unk_index = dico.index(UNK_WORD)
        pretrain_params.mask_index = dico.index(MASK_WORD)

        # build model and reload weights
        model = TransformerModel(pretrain_params, dico, True, True)
        model.load_state_dict(state_dict)
        model.eval()

        # adding missing parameters
        params.max_batch_size = 0

        return MyModel(model, dico, pretrain_params, params)
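A hedged usage sketch of the loader above (the checkpoint path is a placeholder; AttrDict and MyModel come from the surrounding project):

# Hypothetical usage; 'checkpoint.pth' is a placeholder path.
params = AttrDict({})
embedder = reload('checkpoint.pth', params)  # returns a MyModel sentence embedder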
Example #2
def load_xlm_embeddings(path, model_name="model"):
    """
    Load all xlm embeddings
    Params:
        path:
        model_name: model name in the reloaded path, "model" for pretrained xlm encoder; "encoder" for encoder of translation model "decoder" for decoder of translation model
    """
    reloaded = torch.load(path)

    assert model_name in ["model", "encoder", "decoder"]
    state_dict = reloaded[model_name]

    # handle models from multi-GPU checkpoints
    state_dict = {(k[7:] if k.startswith('module.') else k): v
                  for k, v in state_dict.items()}

    # reload dictionary and model parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    pretrain_params = AttrDict(reloaded['params'])
    pretrain_params.n_words = len(dico)
    pretrain_params.bos_index = dico.index(BOS_WORD)
    pretrain_params.eos_index = dico.index(EOS_WORD)
    pretrain_params.pad_index = dico.index(PAD_WORD)
    pretrain_params.unk_index = dico.index(UNK_WORD)
    pretrain_params.mask_index = dico.index(MASK_WORD)

    # build model and reload weights
    if model_name != "decoder":
        model = TransformerModel(pretrain_params, dico, True, True)
    else:
        model = TransformerModel(pretrain_params, dico, False, True)
    model.load_state_dict(state_dict)

    return model.embeddings.weight.data, dico
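A minimal usage sketch for the loader above, assuming a pretrained XLM checkpoint on disk (the file name is a placeholder):

# Hypothetical usage; the checkpoint path is a placeholder.
embeddings, dico = load_xlm_embeddings('mlm_tlm_xnli15_1024.pth', model_name='model')
print(embeddings.size())  # (n_words, emb_dim)
vec = embeddings[dico.index('the')]  # embedding of one in-vocabulary word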
Example #3
def initialize_model():
    """
    """
    print('launching model')

    chemin = getcwd()
    curPath = chemin if "xlm" in chemin else (os.path.join(getcwd(), 'xlm'))

    onlyfiles = [f for f in listdir(chemin) if isfile(join(chemin, f))]
    print(onlyfiles)

    print(os.path.normpath(os.path.join(getcwd(),
                                        './mlm_tlm_xnli15_1024.pth')))
    model_path = os.path.normpath(
        os.path.join(getcwd(), './mlm_tlm_xnli15_1024.pth'))
    reloaded = torch.load(model_path)

    #     print('allez le model')
    #     response = requests.get(url)
    #     print('response downloaded')
    #     f = io.BytesIO(response.content)
    #     reloaded = torch.load(f)
    #     print('file downloaded')

    #    reloaded = Reloaded.serve()

    params = AttrDict(reloaded['params'])
    print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

    # build dictionary / update parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    params.n_words = len(dico)
    params.bos_index = dico.index(BOS_WORD)
    params.eos_index = dico.index(EOS_WORD)
    params.pad_index = dico.index(PAD_WORD)
    params.unk_index = dico.index(UNK_WORD)
    params.mask_index = dico.index(MASK_WORD)

    # build model / reload weights
    model = TransformerModel(params, dico, True, True)
    model.load_state_dict(reloaded['model'])

    #    bpe = fastBPE.fastBPE(
    #            path.normpath(path.join(curPath, "./codes_xnli_15") ),
    #            path.normpath(path.join(curPath, "./vocab_xnli_15") )  )
    print('done loading')

    return model, params, dico
Example #4
def reload_ar_checkpoint(path):
    """ Reload autoregressive params, dictionary, model from a given path """
    # Load dictionary/model/datasets first
    reloaded = torch.load(path)
    params = AttrDict(reloaded['params'])

    # build dictionary / update parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    params.n_words = len(dico)
    params.n_langs = 1
    params.bos_index = dico.index(BOS_WORD)
    params.eos_index = dico.index(EOS_WORD)
    params.pad_index = dico.index(PAD_WORD)
    params.unk_index = dico.index(UNK_WORD)
    params.mask_index = dico.index(MASK_WORD)

    # build Transformer model
    model = TransformerModel(params, is_encoder=False, with_output=True)
    model.load_state_dict(reloaded['model'])
    return params, dico, model
Example #5
def reload_checkpoint(path):
    """ Reload params, dictionary, model from a given path """
    # Load dictionary/model/datasets first
    reloaded = torch.load(path)
    params = AttrDict(reloaded['params'])
    print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

    # build dictionary / update parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    params.n_words = len(dico)
    params.bos_index = dico.index(BOS_WORD)
    params.eos_index = dico.index(EOS_WORD)
    params.pad_index = dico.index(PAD_WORD)
    params.unk_index = dico.index(UNK_WORD)
    params.mask_index = dico.index(MASK_WORD)

    # build model / reload weights
    model = TransformerModel(params, dico, True, True)
    model.load_state_dict(reloaded['model'])

    return params, dico, model
Example #6
def load_model(params):
    # check parameters
    assert os.path.isdir(params.data_path)
    assert os.path.isfile(params.model_path)
    reloaded = torch.load(params.model_path)

    encoder_model_params = AttrDict(reloaded['enc_params'])
    decoder_model_params = AttrDict(reloaded['dec_params'])

    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])

    params.n_langs = encoder_model_params['n_langs']
    params.id2lang = encoder_model_params['id2lang']
    params.lang2id = encoder_model_params['lang2id']
    params.n_words = len(dico)
    params.bos_index = dico.index(BOS_WORD)
    params.eos_index = dico.index(EOS_WORD)
    params.pad_index = dico.index(PAD_WORD)
    params.unk_index = dico.index(UNK_WORD)
    params.mask_index = dico.index(MASK_WORD)

    encoder = TransformerModel(encoder_model_params,
                               dico,
                               is_encoder=True,
                               with_output=False)
    decoder = TransformerModel(decoder_model_params,
                               dico,
                               is_encoder=False,
                               with_output=True)

    def _process_state_dict(state_dict):
        return {(k[7:] if k.startswith('module.') else k): v
                for k, v in state_dict.items()}

    encoder.load_state_dict(_process_state_dict(reloaded['encoder']))
    decoder.load_state_dict(_process_state_dict(reloaded['decoder']))

    return encoder, decoder, dico
Example #7
#%%
model_path = '/checkpoint/guismay/dumped/clm_test3/10431904/periodic-23.pth'
reloaded = torch.load(model_path)
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

#%% [markdown]
# ## Build dictionary / update parameters / build model

#%%
# build dictionary / update parameters
dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                  reloaded['dico_counts'])
assert params.n_words == len(dico)
assert params.bos_index == dico.index(BOS_WORD)
assert params.eos_index == dico.index(EOS_WORD)
assert params.pad_index == dico.index(PAD_WORD)
assert params.unk_index == dico.index(UNK_WORD)
assert params.mask_index == dico.index(MASK_WORD)

# build model / reload weights
model = TransformerModel(params, dico, True, True)
model.load_state_dict(reloaded['model'])
model.cuda()
model.eval()

#%%
FASTBPE_PATH = '/private/home/guismay/tools/fastBPE/fast'
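The notebook ends after defining FASTBPE_PATH; a hedged sketch of a follow-up cell that applies BPE codes to a text file with fastBPE's applybpe command (the input, output, and codes paths are placeholders):

#%%
# Hypothetical cell; 'input.txt', 'input.bpe', and 'codes' are placeholder paths.
import subprocess
subprocess.run([FASTBPE_PATH, 'applybpe', 'input.bpe', 'input.txt', 'codes'],
               check=True)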
Example #8
def main(params):

    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    # set the random seed; NB: multi-GPU also needs torch.cuda.manual_seed_all(params.seed)
    torch.manual_seed(params.seed)
    assert (params.sample_temperature
            == 0) or (params.beam_size == 1), 'Cannot sample with beam search.'
    assert params.amp <= 1, f'params.amp == {params.amp} not yet supported.'
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" %
                ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in [
            'n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index',
            'mask_index'
    ]:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    encoder = TransformerModel(model_params,
                               dico,
                               is_encoder=True,
                               with_output=False).cuda().eval()
    decoder = TransformerModel(model_params,
                               dico,
                               is_encoder=False,
                               with_output=True).cuda().eval()
    if all([k.startswith('module.') for k in reloaded['encoder'].keys()]):
        reloaded['encoder'] = {
            k[len('module.'):]: v
            for k, v in reloaded['encoder'].items()
        }
    encoder.load_state_dict(reloaded['encoder'])
    if all([k.startswith('module.') for k in reloaded['decoder'].keys()]):
        reloaded['decoder'] = {
            k[len('module.'):]: v
            for k, v in reloaded['decoder'].items()
        }
    decoder.load_state_dict(reloaded['decoder'])

    if params.amp != 0:
        models = apex.amp.initialize([encoder, decoder],
                                     opt_level=('O%i' % params.amp))
        encoder, decoder = models

    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # read sentences from stdin
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)
    logger.info("Read %i sentences from stdin. Translating ..." %
                len(src_sent))

    # f = io.open(params.output_path, 'w', encoding='utf-8')

    hypothesis = [[] for _ in range(params.beam_size)]
    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch
        word_ids = [
            torch.LongTensor([dico.index(w) for w in s.strip().split()])
            for s in src_sent[i:i + params.batch_size]
        ]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(),
                                 lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode source batch and translate it
        encoded = encoder('fwd',
                          x=batch.cuda(),
                          lengths=lengths.cuda(),
                          langs=langs.cuda(),
                          causal=False)
        encoded = encoded.transpose(0, 1)
        max_len = int(1.5 * lengths.max().item() + 10)
        if params.beam_size == 1:
            decoded, dec_lengths = decoder.generate(
                encoded,
                lengths.cuda(),
                params.tgt_id,
                max_len=max_len,
                sample_temperature=(None if params.sample_temperature == 0 else
                                    params.sample_temperature))
        else:
            decoded, dec_lengths, all_hyp_strs = decoder.generate_beam(
                encoded,
                lengths.cuda(),
                params.tgt_id,
                beam_size=params.beam_size,
                length_penalty=params.length_penalty,
                early_stopping=params.early_stopping,
                max_len=max_len,
                output_all_hyps=True)
        # hypothesis.extend(convert_to_text(decoded, dec_lengths, dico, params))

        # convert sentences to words
        for j in range(decoded.size(1)):

            # remove delimiters
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation
            source = src_sent[i + j].strip().replace('<unk>', '<<unk>>')
            target = " ".join([dico[sent[k].item()] for k in range(len(sent))
                               ]).replace('<unk>', '<<unk>>')
            if params.beam_size == 1:
                hypothesis[0].append(target)
            else:
                for hyp_rank in range(params.beam_size):
                    print(
                        all_hyp_strs[j]
                        [hyp_rank if hyp_rank < len(all_hyp_strs[j]) else -1])
                    hypothesis[hyp_rank].append(
                        all_hyp_strs[j]
                        [hyp_rank if hyp_rank < len(all_hyp_strs[j]) else -1])

            sys.stderr.write("%i / %i: %s -> %s\n" %
                             (i + j, len(src_sent), source.replace(
                                 '@@ ', ''), target.replace('@@ ', '')))
            # f.write(target + "\n")

    # f.close()

    # export sentences to reference and hypothesis files / restore BPE segmentation
    save_dir, split = params.output_path.rsplit('/', 1)
    for hyp_rank in range(len(hypothesis)):
        hyp_name = f'hyp.st={params.sample_temperature}.bs={params.beam_size}.lp={params.length_penalty}.es={params.early_stopping}.seed={params.seed if (len(hypothesis) == 1) else str(hyp_rank)}.{params.src_lang}-{params.tgt_lang}.{split}.txt'
        hyp_path = os.path.join(save_dir, hyp_name)
        with open(hyp_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(hypothesis[hyp_rank]) + '\n')
        restore_segmentation(hyp_path)

        # evaluate BLEU score
        if params.ref_path:
            bleu = eval_moses_bleu(params.ref_path, hyp_path)
            logger.info("BLEU %s %s : %f" % (hyp_path, params.ref_path, bleu))
Example #9
def main():
    # a parser must exist before add_argument; assume it is created here
    parser = argparse.ArgumentParser(description="Encode sentences with a pretrained XLM model")
    parser.add_argument("--input", type=str, default="", help="input file")
    parser.add_argument("--model", type=str, default="", help="model path")
    parser.add_argument("--spm_model",
                        type=str,
                        default="",
                        help="spm model path")
    parser.add_argument("--batch_size",
                        type=int,
                        default=64,
                        help="batch size")
    parser.add_argument("--max_words", type=int, default=100, help="max words")
    parser.add_argument("--cuda", type=str, default="True", help="use cuda")
    parser.add_argument("--output", type=str, default="", help="output file")
    args = parser.parse_args()

    # Reload a pretrained model
    reloaded = torch.load(args.model)
    params = AttrDict(reloaded['params'])

    # Reload the SPM model
    spm_model = spm.SentencePieceProcessor()
    spm_model.Load(args.spm_model)

    # cuda
    assert args.cuda in ["True", "False"]
    args.cuda = (args.cuda == "True")  # avoid eval() on user input

    # build dictionary / update parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    params.n_words = len(dico)
    params.bos_index = dico.index(BOS_WORD)
    params.eos_index = dico.index(EOS_WORD)
    params.pad_index = dico.index(PAD_WORD)
    params.unk_index = dico.index(UNK_WORD)
    params.mask_index = dico.index(MASK_WORD)

    # build model / reload weights
    model = TransformerModel(params, dico, True, True)
    reloaded['model'] = OrderedDict({
        key.replace('module.', ''): reloaded['model'][key]
        for key in reloaded['model']
    })
    model.load_state_dict(reloaded['model'])
    model.eval()

    if args.cuda:
        model.cuda()

    # load sentences
    sentences = []
    with open(args.input) as f:
        for line in f:
            line = spm_model.EncodeAsPieces(line.rstrip())
            line = line[:args.max_words - 1]
            sentences.append(line)

    # encode sentences
    embs = []
    for i in range(0, len(sentences), args.batch_size):
        batch = sentences[i:i + args.batch_size]
        lengths = torch.LongTensor([len(s) + 1 for s in batch])
        bs, slen = len(batch), lengths.max().item()
        assert slen <= args.max_words

        x = torch.LongTensor(slen, bs).fill_(params.pad_index)
        for k in range(bs):
            sent = torch.LongTensor([params.eos_index] +
                                    [dico.index(w) for w in batch[k]])
            x[:len(sent), k] = sent

        if args.cuda:
            x = x.cuda()
            lengths = lengths.cuda()

        with torch.no_grad():
            embedding = model('fwd',
                              x=x,
                              lengths=lengths,
                              langs=None,
                              causal=False).contiguous()[0].cpu()

        embs.append(embedding)

    # save embeddings
    torch.save(torch.cat(embs, dim=0).squeeze(0), args.output)
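A hedged invocation sketch for this script (the script name and all paths are placeholders):

# Hypothetical command line; every name below is a placeholder.
# python embed_sentences.py --input sents.txt --model mlm_tlm_xnli15_1024.pth \
#     --spm_model spm.model --batch_size 64 --cuda True --output embs.pt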
Example #10
def main(params):

    # initialize the experiment
    logger = initialize_exp(params)
    parser = get_parser()
    params = parser.parse_args()
    models_path = params.model_path.split(',')

    # generate parser / parse parameters
    models_reloaded = []
    for model_path in models_path:
        models_reloaded.append(torch.load(model_path))
    model_params = AttrDict(models_reloaded[0]['params'])
    logger.info("Supported languages: %s" %
                ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in [
            'n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index',
            'mask_index'
    ]:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(models_reloaded[0]['dico_id2word'],
                      models_reloaded[0]['dico_word2id'],
                      models_reloaded[0]['dico_counts'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    encoders = []
    decoders = []

    def package_module(modules):
        state_dict = OrderedDict()
        for k, v in modules.items():
            if k.startswith('module.'):
                state_dict[k[7:]] = v
            else:
                state_dict[k] = v
        return state_dict

    for reloaded in models_reloaded:
        encoder = TransformerModel(model_params,
                                   dico,
                                   is_encoder=True,
                                   with_output=True).to(params.device).eval()
        decoder = TransformerModel(model_params,
                                   dico,
                                   is_encoder=False,
                                   with_output=True).to(params.device).eval()
        encoder.load_state_dict(package_module(reloaded['encoder']))
        decoder.load_state_dict(package_module(reloaded['decoder']))

        # float16
        if params.fp16:
            assert torch.backends.cudnn.enabled
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

        encoders.append(encoder)
        decoders.append(decoder)

    #src_sent = ['Poly@@ gam@@ ie statt Demokratie .']
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)

    f = io.open(params.output_path, 'w', encoding='utf-8')

    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch
        word_ids = [
            torch.LongTensor([dico.index(w) for w in s.strip().split()])
            for s in src_sent[i:i + params.batch_size]
        ]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(),
                                 lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode source batch and translate it
        encodeds = []
        for encoder in encoders:
            encoded = encoder('fwd',
                              x=batch.to(params.device),
                              lengths=lengths.to(params.device),
                              langs=langs.to(params.device),
                              causal=False)
            encoded = encoded.transpose(0, 1)
            encodeds.append(encoded)

            assert encoded.size(0) == lengths.size(0)

        decoded, dec_lengths = generate_beam(
            decoders,
            encodeds,
            lengths.to(params.device),
            params.tgt_id,
            beam_size=params.beam,
            length_penalty=params.length_penalty,
            early_stopping=False,
            max_len=int(1.5 * lengths.max().item() + 10),
            params=params)

        # convert sentences to words
        for j in range(decoded.size(1)):

            # remove delimiters
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation
            source = src_sent[i + j].strip()
            target = " ".join([dico[sent[k].item()] for k in range(len(sent))])
            sys.stderr.write("%i / %i: %s -> %s\n" %
                             (i + j, len(src_sent), source, target))
            f.write(target + "\n")

    f.close()
Example #11
class Translate():
    def __init__(self,
                 model_path,
                 tgt_lang,
                 src_lang,
                 dump_path="./dumped/",
                 exp_name="translate",
                 exp_id="test",
                 batch_size=32):

        # parse parameters
        parser = argparse.ArgumentParser(description="Translate sentences")

        # main parameters
        parser.add_argument("--dump_path",
                            type=str,
                            default=dump_path,
                            help="Experiment dump path")
        parser.add_argument("--exp_name",
                            type=str,
                            default=exp_name,
                            help="Experiment name")
        parser.add_argument("--exp_id",
                            type=str,
                            default=exp_id,
                            help="Experiment ID")
        parser.add_argument("--batch_size",
                            type=int,
                            default=batch_size,
                            help="Number of sentences per batch")
        # model / output paths
        parser.add_argument("--model_path",
                            type=str,
                            default=model_path,
                            help="Model path")
        # parser.add_argument("--max_vocab", type=int, default=-1, help="Maximum vocabulary size (-1 to disable)")
        # parser.add_argument("--min_count", type=int, default=0, help="Minimum vocabulary count")
        # source language / target language
        parser.add_argument("--src_lang",
                            type=str,
                            default=src_lang,
                            help="Source language")
        parser.add_argument("--tgt_lang",
                            type=str,
                            default=tgt_lang,
                            help="Target language")
        parser.add_argument('-d',
                            "--text",
                            type=str,
                            default="",
                            nargs='+',
                            help="Text to be translated")

        params = parser.parse_args()
        assert params.src_lang != '' and params.tgt_lang != '' and params.src_lang != params.tgt_lang

        # initialize the experiment
        logger = initialize_exp(params)

        # no GPU available, so load the checkpoint on CPU
        #reloaded = torch.load(params.model_path)
        reloaded = torch.load(params.model_path,
                              map_location=torch.device('cpu'))
        model_params = AttrDict(reloaded['params'])
        self.supported_languages = model_params.lang2id.keys()
        logger.info("Supported languages: %s" %
                    ", ".join(self.supported_languages))

        # update dictionary parameters
        for name in [
                'n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index',
                'mask_index'
        ]:
            try:
                setattr(params, name, getattr(model_params, name))
            except AttributeError:
                key = list(model_params.meta_params.keys())[0]
                attr = getattr(model_params.meta_params[key], name)
                setattr(params, name, attr)
                setattr(model_params, name, attr)

        # build dictionary / build encoder / build decoder / reload weights
        self.dico = Dictionary(reloaded['dico_id2word'],
                               reloaded['dico_word2id'],
                               reloaded['dico_counts'])
        #self.encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
        self.encoder = TransformerModel(model_params,
                                        self.dico,
                                        is_encoder=True,
                                        with_output=True).eval()
        #self.decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
        self.decoder = TransformerModel(model_params,
                                        self.dico,
                                        is_encoder=False,
                                        with_output=True).eval()
        self.encoder.load_state_dict(reloaded['encoder'])
        self.decoder.load_state_dict(reloaded['decoder'])
        params.src_id = model_params.lang2id[params.src_lang]
        params.tgt_id = model_params.lang2id[params.tgt_lang]
        self.model_params = model_params
        self.params = params

    def translate(self, src_sent=[]):
        flag = False
        if type(src_sent) == str:
            src_sent = [src_sent]
            flag = True
        tgt_sent = []
        for i in range(0, len(src_sent), self.params.batch_size):
            # prepare batch
            word_ids = [
                torch.LongTensor(
                    [self.dico.index(w) for w in s.strip().split()])
                for s in src_sent[i:i + self.params.batch_size]
            ]
            lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
            batch = torch.LongTensor(lengths.max().item(),
                                     lengths.size(0)).fill_(
                                         self.params.pad_index)
            batch[0] = self.params.eos_index
            for j, s in enumerate(word_ids):
                if lengths[j] > 2:  # if sentence not empty
                    batch[1:lengths[j] - 1, j].copy_(s)
                batch[lengths[j] - 1, j] = self.params.eos_index
            langs = batch.clone().fill_(self.params.src_id)

            # encode source batch and translate it
            #encoded = self.encoder('fwd', x=batch.cuda(), lengths=lengths.cuda(), langs=langs.cuda(), causal=False)
            encoded = self.encoder('fwd',
                                   x=batch,
                                   lengths=lengths,
                                   langs=langs,
                                   causal=False)
            encoded = encoded.transpose(0, 1)
            #decoded, dec_lengths = self.decoder.generate(encoded, lengths.cuda(), self.params.tgt_id, max_len=int(1.5 * lengths.max().item() + 10))
            decoded, dec_lengths = self.decoder.generate(
                encoded,
                lengths,
                self.params.tgt_id,
                max_len=int(1.5 * lengths.max().item() + 10))

            # convert sentences to words
            for j in range(decoded.size(1)):

                # remove delimiters
                sent = decoded[:, j]
                delimiters = (sent == self.params.eos_index).nonzero().view(-1)
                assert len(delimiters) >= 1 and delimiters[0].item() == 0
                sent = sent[1:] if len(
                    delimiters) == 1 else sent[1:delimiters[1]]

                # output translation
                source = src_sent[i + j].strip()
                target = " ".join(
                    [self.dico[sent[k].item()] for k in range(len(sent))])
                sys.stderr.write("%i / %i: %s -> %s\n" %
                                 (i + j, len(src_sent), source, target))
                tgt_sent.append(target)

        if flag:
            return tgt_sent[0]
        return tgt_sent
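A hedged usage sketch of the Translate class (the checkpoint path is a placeholder, and the input must already be BPE-segmented the way the model expects):

# Hypothetical usage; 'mlm_enfr_1024.pth' is a placeholder checkpoint path.
translator = Translate('mlm_enfr_1024.pth', tgt_lang='fr', src_lang='en')
print(translator.translate('how are you ?'))  # a str in returns a str out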
Example #12
def main(params):

    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        encoder = network_to_half(encoder)
        decoder = network_to_half(decoder)

    # read sentences from stdin
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)
    logger.info("Read %i sentences from stdin. Translating ..." % len(src_sent))

    f = io.open(params.output_path, 'w', encoding='utf-8')

    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch
        word_ids = [torch.LongTensor([dico.index(w) for w in s.strip().split()])
                    for s in src_sent[i:i + params.batch_size]]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode source batch and translate it
        encoded = encoder('fwd', x=batch.cuda(), lengths=lengths.cuda(), langs=langs.cuda(), causal=False)
        encoded = encoded.transpose(0, 1)
        decoded, dec_lengths = decoder.generate(encoded, lengths.cuda(), params.tgt_id, max_len=int(1.5 * lengths.max().item() + 10))

        # convert sentences to words
        for j in range(decoded.size(1)):

            # remove delimiters
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation
            source = src_sent[i + j].strip()
            target = " ".join([dico[sent[k].item()] for k in range(len(sent))])
            sys.stderr.write("%i / %i: %s -> %s\n" % (i + j, len(src_sent), source, target))
            f.write(target + "\n")

    f.close()
Example #13
def main(params):

    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" %
                ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in [
            'n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index',
            'mask_index'
    ]:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    encoder = TransformerModel(model_params,
                               dico,
                               is_encoder=True,
                               with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder = None
    #    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    #    decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # read sentences from stdin
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)
    logger.info("Read %i sentences from stdin. Translating ..." %
                len(src_sent))

    all_encodings = []
    # For each sentence...
    for i in range(0, len(src_sent), params.batch_size):
        # prepare batch
        word_ids = [
            torch.LongTensor([dico.index(w) for w in s.strip().split()])
            for s in src_sent[i:i + params.batch_size]
        ]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(),
                                 lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode the source batch; encoderouts (defined elsewhere) handles padding
        encodings = encoderouts(encoder, batch, lengths, langs)

        # the batch preserves the original sentence order; append each encoding
        for idx in encodings:
            all_encodings.append(idx.cpu().numpy())

    # Save all encodings to npy
    np.save(params.output_path, np.stack(all_encodings))
Example #14
def main(params):
    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])

    # update dictionary parameters
    for name in ['src_n_words', 'tgt_n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    source_dico = Dictionary(reloaded['source_dico_id2word'], reloaded['source_dico_word2id'])
    target_dico = Dictionary(reloaded['target_dico_id2word'], reloaded['target_dico_word2id'])
    encoder = TransformerEncoder(model_params, source_dico, with_output=False).cuda().eval()
    decoder = TransformerDecoder(model_params, target_dico, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])

    # read sentences from stdin
    table_lines = []
    table_inf = open(params.table_path, 'r', encoding='utf-8')

    for table_line in table_inf:
        table_lines.append(table_line)

    outf = io.open(params.output_path, 'w', encoding='utf-8')

    for i in range(0, len(table_lines), params.batch_size):
        # prepare batch
        enc_x1_ids = []
        enc_x2_ids = []
        enc_x3_ids = []
        enc_x4_ids = []
        for table_line in table_lines[i:i + params.batch_size]:
            record_seq = [each.split('|') for each in table_line.split()]
            assert all([len(x) == 4 for x in record_seq])
            enc_x1_ids.append(torch.LongTensor([source_dico.index(x[0]) for x in record_seq]))
            enc_x2_ids.append(torch.LongTensor([source_dico.index(x[1]) for x in record_seq]))
            enc_x3_ids.append(torch.LongTensor([source_dico.index(x[2]) for x in record_seq]))
            enc_x4_ids.append(torch.LongTensor([source_dico.index(x[3]) for x in record_seq]))

        enc_xlen = torch.LongTensor([len(x) + 2 for x in enc_x1_ids])
        enc_x1 = torch.LongTensor(enc_xlen.max().item(), enc_xlen.size(0)).fill_(params.pad_index)
        enc_x1[0] = params.eos_index
        enc_x2 = torch.LongTensor(enc_xlen.max().item(), enc_xlen.size(0)).fill_(params.pad_index)
        enc_x2[0] = params.eos_index
        enc_x3 = torch.LongTensor(enc_xlen.max().item(), enc_xlen.size(0)).fill_(params.pad_index)
        enc_x3[0] = params.eos_index
        enc_x4 = torch.LongTensor(enc_xlen.max().item(), enc_xlen.size(0)).fill_(params.pad_index)
        enc_x4[0] = params.eos_index

        for j, (s1,s2,s3,s4) in enumerate(zip(enc_x1_ids, enc_x2_ids, enc_x3_ids, enc_x4_ids)):
            if enc_xlen[j] > 2:  # if sentence not empty
                enc_x1[1:enc_xlen[j] - 1, j].copy_(s1)
                enc_x2[1:enc_xlen[j] - 1, j].copy_(s2)
                enc_x3[1:enc_xlen[j] - 1, j].copy_(s3)
                enc_x4[1:enc_xlen[j] - 1, j].copy_(s4)
            enc_x1[enc_xlen[j] - 1, j] = params.eos_index
            enc_x2[enc_xlen[j] - 1, j] = params.eos_index
            enc_x3[enc_xlen[j] - 1, j] = params.eos_index
            enc_x4[enc_xlen[j] - 1, j] = params.eos_index

        enc_x1 = enc_x1.cuda()
        enc_x2 = enc_x2.cuda()
        enc_x3 = enc_x3.cuda()
        enc_x4 = enc_x4.cuda()
        enc_xlen = enc_xlen.cuda()

        # encode source batch and translate it
        encoder_output = encoder('fwd', x1=enc_x1, x2=enc_x2, x3=enc_x3, x4=enc_x4, lengths=enc_xlen)
        encoder_output = encoder_output.transpose(0, 1)

        # max_len = int(1.5 * enc_xlen.max().item() + 10)
        max_len = 602
        if params.beam_size <= 1:
            decoded, dec_lengths = decoder.generate(encoder_output, enc_xlen, max_len=max_len)
        elif params.beam_size > 1:
            decoded, dec_lengths = decoder.generate_beam(encoder_output, enc_xlen, params.beam_size, 
                                            params.length_penalty, params.early_stopping, max_len=max_len)

        for j in range(decoded.size(1)):

            # remove delimiters
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation
            source = table_lines[i + j].strip()
            target = " ".join([target_dico[sent[k].item()] for k in range(len(sent))])
            sys.stderr.write("%i / %i: %s\n" % (i + j, len(table_lines), target))
            outf.write(target + "\n")

    outf.close()
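For reference, a hedged sketch of the table input the batch loop above expects: whitespace-separated records, each made of four '|'-joined fields (the field values and their semantics below are invented):

# Hypothetical input line: three records, four '|'-joined fields each.
example_line = "2010|year|r1|c1 45.3|sales|r1|c2 fr|country|r1|c3"
records = [rec.split('|') for rec in example_line.split()]
assert all(len(r) == 4 for r in records)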
Example #15
def main(params):
    # generate parser / parse parameters
    #parser = get_parser()
    #params = parser.parse_args()
    reloaded = torch.load(params.model_path)

    model_params = AttrDict(reloaded['params'])

    # update dictionary parameters
    for name in [
            'src_n_words', 'tgt_n_words', 'bos_index', 'eos_index',
            'pad_index', 'unk_index', 'mask_index'
    ]:
        setattr(params, name, getattr(model_params, name))
    # print(f'src {getattr(model_params, "src_n_words")}')
    # print(f'tgt {getattr(model_params, "tgt_n_words")}')
    # build dictionary / build encoder / build decoder / reload weights
    source_dico = Dictionary(reloaded['source_dico_id2word'],
                             reloaded['source_dico_word2id'])
    target_dico = Dictionary(reloaded['target_dico_id2word'],
                             reloaded['target_dico_word2id'])
    # originalDecoder = reloaded['decoder'].copy()
    encoder = TransformerEncoder(model_params, source_dico,
                                 with_output=False).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder = TransformerDecoder(model_params, target_dico,
                                 with_output=True).cuda().eval()
    decoder.load_state_dict(reloaded['decoder'])
    # read sentences from stdin
    table_lines = []
    title_lines = []
    table_inf = open(params.table_path, 'r', encoding='utf-8')
    for table_line in table_inf:
        table_lines.append(table_line)
    with open(params.title_path, 'r', encoding='utf-8') as title_inf:
        for title_line in title_inf:
            title_lines.append(title_line)

    assert len(title_lines) == len(table_lines)

    outf = io.open(params.output_path, 'w', encoding='utf-8')

    fillers = [
        'in', 'the', 'and', 'or', 'an', 'as', 'can', 'be', 'a', ':', '-', 'to',
        'but', 'is', 'of', 'it', 'on', '.', 'at', '(', ')', ',', 'with'
    ]

    for i in range(0, len(table_lines), params.batch_size):
        # prepare batch
        """valueLengths = []
        xLabelLengths = []
        yLabelLengths = []
        titleLengths = []"""
        enc_x1_ids = []
        enc_x2_ids = []
        enc_x3_ids = []
        enc_x4_ids = []
        for table_line, title_line in zip(table_lines[i:i + params.batch_size],
                                          title_lines[i:i +
                                                      params.batch_size]):
            record_seq = [each.split('|') for each in table_line.split()]
            assert all([len(x) == 4 for x in record_seq])

            enc_x1_ids.append(
                torch.LongTensor([source_dico.index(x[0])
                                  for x in record_seq]))
            enc_x2_ids.append(
                torch.LongTensor([source_dico.index(x[1])
                                  for x in record_seq]))
            enc_x3_ids.append(
                torch.LongTensor([source_dico.index(x[2])
                                  for x in record_seq]))
            enc_x4_ids.append(
                torch.LongTensor([source_dico.index(x[3])
                                  for x in record_seq]))

            xLabel = record_seq[1][0].split('_')
            yLabel = record_seq[0][0].split('_')
            """cleanXLabel = len([item for item in xLabel if item not in fillers])
            cleanYLabel = len([item for item in yLabel if item not in fillers])
            cleanTitle = len([word for word in title_line.split() if word not in fillers])

            xLabelLengths.append(cleanXLabel)
            yLabelLengths.append(cleanYLabel)
            titleLengths.append(cleanTitle)
            valueLengths.append(round(len(record_seq)/2))"""

        enc_xlen = torch.LongTensor([len(x) + 2 for x in enc_x1_ids])
        enc_x1 = torch.LongTensor(enc_xlen.max().item(),
                                  enc_xlen.size(0)).fill_(params.pad_index)
        enc_x1[0] = params.eos_index
        enc_x2 = torch.LongTensor(enc_xlen.max().item(),
                                  enc_xlen.size(0)).fill_(params.pad_index)
        enc_x2[0] = params.eos_index
        enc_x3 = torch.LongTensor(enc_xlen.max().item(),
                                  enc_xlen.size(0)).fill_(params.pad_index)
        enc_x3[0] = params.eos_index
        enc_x4 = torch.LongTensor(enc_xlen.max().item(),
                                  enc_xlen.size(0)).fill_(params.pad_index)
        enc_x4[0] = params.eos_index

        for j, (s1, s2, s3, s4) in enumerate(
                zip(enc_x1_ids, enc_x2_ids, enc_x3_ids, enc_x4_ids)):
            if enc_xlen[j] > 2:  # if sentence not empty
                enc_x1[1:enc_xlen[j] - 1, j].copy_(s1)
                enc_x2[1:enc_xlen[j] - 1, j].copy_(s2)
                enc_x3[1:enc_xlen[j] - 1, j].copy_(s3)
                enc_x4[1:enc_xlen[j] - 1, j].copy_(s4)
            enc_x1[enc_xlen[j] - 1, j] = params.eos_index
            enc_x2[enc_xlen[j] - 1, j] = params.eos_index
            enc_x3[enc_xlen[j] - 1, j] = params.eos_index
            enc_x4[enc_xlen[j] - 1, j] = params.eos_index

        enc_x1 = enc_x1.cuda()
        enc_x2 = enc_x2.cuda()
        enc_x3 = enc_x3.cuda()
        enc_x4 = enc_x4.cuda()
        enc_xlen = enc_xlen.cuda()

        # encode source batch and translate it
        encoder_output = encoder('fwd',
                                 x1=enc_x1,
                                 x2=enc_x2,
                                 x3=enc_x3,
                                 x4=enc_x4,
                                 lengths=enc_xlen)
        encoder_output = encoder_output.transpose(0, 1)

        max_len = 602
        if params.beam_size <= 1:
            decoded, dec_lengths = decoder.generate(encoder_output,
                                                    enc_xlen,
                                                    max_len=max_len)
        elif params.beam_size > 1:
            decoded, dec_lengths = decoder.generate_beam(encoder_output,
                                                         enc_xlen,
                                                         params.beam_size,
                                                         params.length_penalty,
                                                         params.early_stopping,
                                                         max_len=max_len)

        for j in range(decoded.size(1)):
            # remove delimiters
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]
            # print(sent)
            # output translation
            # source = table_lines[i + j].strip()
            # print(source)
            tokens = []
            for k in range(len(sent)):
                ids = sent[k].item()
                #if ids in removedDict:
                #    print('index error')
                word = target_dico[ids]
                tokens.append(word)
            target = " ".join(tokens)
            sys.stderr.write("%i / %i: %s\n" %
                             (i + j, len(table_lines), target))
            outf.write(target + "\n")
    outf.close()
Example #16
def run_xnlg():
    params = get_params()

    # initialize the experiment / build sentence embedder
    logger = initialize_exp(params)

    if params.tokens_per_batch > -1:
        params.group_by_size = True

    # check parameters
    assert os.path.isdir(params.data_path)
    assert os.path.isfile(params.model_path)

    # tasks
    params.transfer_tasks = params.transfer_tasks.split(',')
    assert len(params.transfer_tasks) > 0
    assert all([task in TASKS for task in params.transfer_tasks])

    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" %
                ", ".join(model_params.lang2id.keys()))
    params.n_langs = model_params['n_langs']
    params.id2lang = model_params['id2lang']
    params.lang2id = model_params['lang2id']

    if "enc_params" in reloaded:
        encoder_model_params = AttrDict(reloaded["enc_params"])
    elif params.n_enc_layers == model_params.n_layers or params.n_enc_layers == 0:
        encoder_model_params = model_params
    else:
        encoder_model_params = AttrDict(reloaded['params'])
        encoder_model_params.n_layers = params.n_enc_layers
        assert model_params.n_layers != encoder_model_params.n_layers

    if "dec_params" in reloaded:
        decoder_model_params = AttrDict(reloaded["dec_params"])
    elif params.n_dec_layers == model_params.n_layers or params.n_dec_layers == 0:
        decoder_model_params = model_params
    else:
        decoder_model_params = AttrDict(reloaded['params'])
        decoder_model_params.n_layers = params.n_dec_layers
        assert model_params.n_layers != decoder_model_params.n_layers

    params.encoder_model_params = encoder_model_params
    params.decoder_model_params = decoder_model_params

    if params.emb_dim != -1:
        encoder_model_params.emb_dim = params.emb_dim
        decoder_model_params.emb_dim = params.emb_dim

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])

    for p in [params, encoder_model_params, decoder_model_params]:
        p.n_words = len(dico)
        p.bos_index = dico.index(BOS_WORD)
        p.eos_index = dico.index(EOS_WORD)
        p.pad_index = dico.index(PAD_WORD)
        p.unk_index = dico.index(UNK_WORD)
        p.mask_index = dico.index(MASK_WORD)

    encoder = TransformerModel(encoder_model_params,
                               dico,
                               is_encoder=True,
                               with_output=False)
    decoder = TransformerModel(decoder_model_params,
                               dico,
                               is_encoder=False,
                               with_output=True)

    def _process_state_dict(state_dict):
        return {(k[7:] if k.startswith('module.') else k): v
                for k, v in state_dict.items()}

    if params.no_init == "all":
        logger.info("All Models will not load state dict.!!!")
    elif params.reload_emb != "":
        logger.info("Reloading embedding from %s ..." % params.reload_emb)
        word2id, embeddings = read_txt_embeddings(logger, params.reload_emb)
        set_pretrain_emb(logger, encoder, dico, word2id, embeddings)
        set_pretrain_emb(logger, decoder, dico, word2id, embeddings)
    else:
        if "model" in reloaded:
            if params.no_init != "encoder":
                encoder.load_state_dict(_process_state_dict(reloaded['model']),
                                        strict=False)
            if params.no_init != "decoder":
                decoder.load_state_dict(_process_state_dict(reloaded['model']),
                                        strict=False)
        else:
            if params.no_init != "encoder":
                encoder.load_state_dict(_process_state_dict(
                    reloaded['encoder']),
                                        strict=False)
            if params.no_init != "decoder":
                decoder.load_state_dict(
                    _process_state_dict(reloaded['decoder']))

    scores = {}

    # run
    for task in params.transfer_tasks:
        if task == "XQG":
            XQG_v3(encoder, decoder, scores, dico, params).run()
        elif task == "XSumm":
            XSumm(encoder, decoder, scores, dico, params).run()
Example #17
def main():

    # Load pre-trained model
    model_path = './models/mlm_tlm_xnli15_1024.pth'
    reloaded = torch.load(model_path)
    params = AttrDict(reloaded['params'])

    # build dictionary / update parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    params.n_words = len(dico)
    params.bos_index = dico.index(BOS_WORD)
    params.eos_index = dico.index(EOS_WORD)
    params.pad_index = dico.index(PAD_WORD)
    params.unk_index = dico.index(UNK_WORD)
    params.mask_index = dico.index(MASK_WORD)

    # build model / reload weights
    model = TransformerModel(params, dico, True, True)
    #model.cuda() #if using GPU
    model.load_state_dict(reloaded['model'])
    """ """

    with open(args.filename, "r") as f:
        sentence_list = f.readlines()[args.sn[0]:args.sn[1]]

    # remove new line symbols
    for i in range(0, len(sentence_list)):
        sentence_list[i] = sentence_list[i].replace("\n", "")

    # save as dataframe and add language tokens
    sentence_df = pd.DataFrame(sentence_list)
    sentence_df.columns = ['sentence']
    sentence_df['language'] = 'en'

    # match xlm format
    sentences = list(zip(sentence_df.sentence, sentence_df.language))
    """ from XLM repo """
    # add </s> sentence delimiters
    sentences = [(('</s> %s </s>' % sent.strip()).split(), lang)
                 for sent, lang in sentences]

    # Create batch
    bs = len(sentences)
    slen = max([len(sent) for sent, _ in sentences])

    word_ids = torch.LongTensor(slen, bs).fill_(params.pad_index)
    for i in range(len(sentences)):
        sent = torch.LongTensor([dico.index(w) for w in sentences[i][0]])
        word_ids[:len(sent), i] = sent

    lengths = torch.LongTensor([len(sent) for sent, _ in sentences])
    langs = torch.LongTensor([params.lang2id[lang] for _, lang in sentences
                              ]).unsqueeze(0).expand(slen, bs)

    #if using GPU:
    #word_ids=word_ids.cuda()
    #lengths=lengths.cuda()
    #langs=langs.cuda()

    tensor = model('fwd',
                   x=word_ids,
                   lengths=lengths,
                   langs=langs,
                   causal=False).contiguous()
    print(tensor.size())

    # The variable tensor has shape (sequence_length, batch_size, model_dimension).
    # tensor[0], of shape (batch_size, model_dimension), corresponds to the first
    # hidden state of the last layer for each sentence; this is the vector used
    # to fine-tune on the GLUE and XNLI tasks.

    torch.save(tensor[0], args.o)
Example #18
from src.utils import AttrDict
from src.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD
from src.model.transformer import TransformerModel

logger = getLogger()


# NOTE: remember to replace the model path here
model_path = './dumped/XLM_bora_es/abcedf/checkpoint.pth'
reloaded = torch.load(model_path)
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
params.n_words = len(dico)
params.bos_index = dico.index(BOS_WORD)
params.eos_index = dico.index(EOS_WORD)
params.pad_index = dico.index(PAD_WORD)
params.unk_index = dico.index(UNK_WORD)
params.mask_index = dico.index(MASK_WORD)

# build model / reload weights
model = TransformerModel(params, dico, True, True)
model.eval()
model.load_state_dict(reloaded['model'])

codes = "./data/processed/XLM_bora_es/60k/codes"  # path to the codes of the model
fastbpe = os.path.join(os.getcwd(), 'tools/fastBPE/fast')


def to_bpe(sentences):
    # NOTE: the original snippet is truncated here; this body is a reconstruction
    # based on the XLM demo (an assumption): write the sentences to a temp file,
    # apply the BPE codes with fastBPE, and read the BPE-ized sentences back.
    with open('/tmp/sentences', 'w') as fwrite:
        for sent in sentences:
            fwrite.write(sent + '\n')
    os.system('%s applybpe /tmp/sentences.bpe /tmp/sentences %s' % (fastbpe, codes))
    sentences_bpe = []
    with open('/tmp/sentences.bpe') as fread:
        for line in fread:
            sentences_bpe.append(line.rstrip())
    return sentences_bpe
Example #19
def main(params):

    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]
    logger.info("encoder: {}".format(encoder))
    logger.info("decoder: {}".format(decoder))

    # read sentences from stdin
    src_sent = []
    with open(params.sentences_path, 'r') as file1:
        for line in file1:
            src_sent.append(line)

    logger.info("Read %i sentences from sentences file.Writing them to a src file. Translating ..." % len(src_sent))
    f = io.open(params.output_path + 'src_sent', 'w', encoding='utf-8')
    for sentence in src_sent:
        f.write(sentence.rstrip('\n') + "\n")  # input lines keep their newline; avoid doubling it
    f.close()
    logger.info("Wrote them to a src file")
    f = io.open(params.output_path, 'w', encoding='utf-8')

    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch
        word_ids = [torch.LongTensor([dico.index(w) for w in s.strip().split()])
                    for s in src_sent[i:i + params.batch_size]]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode source batch and translate it
        encoded, _ = encoder('fwd', x=batch.cuda(), lengths=lengths.cuda(), langs=langs.cuda(), causal=False,
                             encoder_only=False, extra_adapters_flag=True)
        encoded = encoded.transpose(0, 1)
        # decoded, dec_lengths = decoder.generate(encoded, lengths.cuda(), params.tgt_id, max_len=int(1.5 * lengths.max().item() + 10))
        decoded, dec_lengths = decoder.generate_beam(
            encoded, lengths.cuda(), params.tgt_id, beam_size=params.beam_size, length_penalty=params.length_penalty,
            early_stopping=params.early_stopping, max_len=int(1.5 * lengths.cuda().max().item() + 10),
            extra_adapters_flag=True)
        # convert sentences to words
        for j in range(decoded.size(1)):

            # remove delimiters
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation
            source = src_sent[i + j].strip()
            target = " ".join([dico[sent[k].item()] for k in range(len(sent))])
            #logger.info("%i / %i: %s -> %s\n" % (i + j, len(src_sent), source, target))
            if (i+j)%10000 == 0:
                logger.info("Translation of %i / %i:\n Source sentence: %s \n Translation: %s\n" % (i + j, len(src_sent), source, target))

            # sys.stderr.write("%i / %i: %s -> %s\n" % (i + j, len(src_sent), source, target))
            f.write(target + "\n")

    f.close()
Example #20
import subprocess
from torch.nn.modules.distance import CosineSimilarity

cm = CosineSimilarity(dim=0)

# initialize the model
model_path = './mlm_tlm_xnli15_1024.pth'
reloaded = torch.load(model_path)
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

# build dictionary / update parameters
dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                  reloaded['dico_counts'])
params.n_words = len(dico)
params.bos_index = dico.index(BOS_WORD)
params.eos_index = dico.index(EOS_WORD)
params.pad_index = dico.index(PAD_WORD)
params.unk_index = dico.index(UNK_WORD)
params.mask_index = dico.index(MASK_WORD)

# build model / reload weights
model = TransformerModel(params, dico, True, True)
model.load_state_dict(reloaded['model'])

# use .get() so a missing TERM_PROGRAM does not raise a KeyError
local = os.environ.get("TERM_PROGRAM") == "Apple_Terminal"


# create the helper functions (the snippet is truncated here)
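A minimal sketch of one such helper, assuming 1-D sentence embeddings taken from the model above (the helper name is hypothetical):

# Hypothetical helper: cosine similarity between two sentence embeddings,
# each a 1-D tensor of shape (model_dimension,), using cm defined above.
def embedding_similarity(emb_a, emb_b):
    return cm(emb_a, emb_b).item()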
Example #21
import os
import torch

from src.utils import AttrDict
from src.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD
from src.model.transformer import TransformerModel

model_path = '/Users/pengjie/Documents/code/XLM/model/mlm_tlm_xnli15_1024.pth'
reloaded = torch.load(model_path)
params = AttrDict(reloaded['params'])

# build dictionary / update parameters
dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                  reloaded['dico_counts'])
params.n_words = len(dico)
params.bos_index = dico.index(BOS_WORD)
params.eos_index = dico.index(EOS_WORD)
params.pad_index = dico.index(PAD_WORD)
params.unk_index = dico.index(UNK_WORD)
params.mask_index = dico.index(MASK_WORD)

# build model / reload weights
model = TransformerModel(params, dico, True, True)
model.eval()
model.load_state_dict(reloaded['model'])

# list of (sentences, lang)
sentences = [
    ('the following secon@@ dary charac@@ ters also appear in the nov@@ el .',
     'en'),
    ('les zones rurales offr@@ ent de petites routes , a deux voies .', 'fr'),