Example #1
0
def preprocess(voc_path, txt_path):
    """Binarize `txt_path` with the vocabulary at `voc_path` and log statistics.

    The binarized data is written next to the text file as `txt_path + ".pth"`.
    """
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)

    logger = create_logger(None, 0)

    bin_path = txt_path + ".pth"

    dico = Dictionary.read_vocab(voc_path)
    logger.info("")

    data = Dictionary.index_data(txt_path, bin_path, dico)
    n_sentences = len(data['positions'])
    # each sentence contributes one extra position entry, hence the subtraction
    n_words = len(data['sentences']) - n_sentences
    logger.info("%i words (%i unique) in %i sentences." %
                (n_words, len(data['dico']), n_sentences))

    unk_words = data['unk_words']
    if len(unk_words) > 0:
        n_unk = sum(unk_words.values())
        logger.info(
            "%i unknown words (%i unique), covering %.2f%% of the data." %
            (n_unk, len(unk_words), n_unk * 100. / n_words))
        # only dump the individual unknown words when the list is short
        if len(unk_words) < 30:
            for word, count in sorted(unk_words.items(),
                                      key=lambda kv: kv[1])[::-1]:
                logger.info("%s: %i" % (word, count))
Example #2
0
def load_xlm_embeddings(path, model_name="model"):
    """Return the embedding weights and dictionary stored in an XLM checkpoint.

    Params:
        path: checkpoint file to load.
        model_name: which sub-model to read from the checkpoint — "model" for a
            pretrained XLM encoder, "encoder" / "decoder" for the corresponding
            half of a translation model.
    """
    reloaded = torch.load(path)

    assert model_name in ["model", "encoder", "decoder"]
    state_dict = reloaded[model_name]

    # checkpoints saved from multi-GPU (DataParallel) runs prefix keys with "module."
    state_dict = {
        (key[7:] if key.startswith('module.') else key): weight
        for key, weight in state_dict.items()
    }

    # rebuild the dictionary and sync the derived vocabulary parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    pretrain_params = AttrDict(reloaded['params'])
    pretrain_params.n_words = len(dico)
    pretrain_params.bos_index = dico.index(BOS_WORD)
    pretrain_params.eos_index = dico.index(EOS_WORD)
    pretrain_params.pad_index = dico.index(PAD_WORD)
    pretrain_params.unk_index = dico.index(UNK_WORD)
    pretrain_params.mask_index = dico.index(MASK_WORD)

    # only the "decoder" sub-model is built as a decoder (is_encoder=False)
    is_encoder = model_name != "decoder"
    model = TransformerModel(pretrain_params, dico, is_encoder, True)
    model.load_state_dict(state_dict)

    return model.embeddings.weight.data, dico
Example #3
0
    def reload(path, params):
        """
        Build a sentence embedder from the pretrained checkpoint at `path`.
        """
        # load the serialized checkpoint
        reloaded = torch.load(path)
        state_dict = reloaded['model']

        # checkpoints saved from multi-GPU runs prefix keys with "module.";
        # only applied when the path looks like a raw training checkpoint
        if 'checkpoint' in path:
            state_dict = {
                (key[7:] if key.startswith('module.') else key): weight
                for key, weight in state_dict.items()
            }

        # rebuild the dictionary and sync the derived vocabulary parameters
        dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                          reloaded['dico_counts'])
        pretrain_params = AttrDict(reloaded['params'])
        pretrain_params.n_words = len(dico)
        pretrain_params.bos_index = dico.index(BOS_WORD)
        pretrain_params.eos_index = dico.index(EOS_WORD)
        pretrain_params.pad_index = dico.index(PAD_WORD)
        pretrain_params.unk_index = dico.index(UNK_WORD)
        pretrain_params.mask_index = dico.index(MASK_WORD)

        # build the encoder and restore its weights; eval mode for inference
        model = TransformerModel(pretrain_params, dico, True, True)
        model.load_state_dict(state_dict)
        model.eval()

        # the embedder interface expects this attribute on params
        params.max_batch_size = 0

        return MyModel(model, dico, pretrain_params, params)
Example #4
0
def initialize_model():
    """Load the XLM checkpoint from the working directory and build the model.

    Returns:
        (model, params, dico): the TransformerModel with reloaded weights, its
        AttrDict of parameters, and the reloaded Dictionary.
    """
    print('launching model')

    chemin = getcwd()

    # debug aid: show which files are visible next to the expected checkpoint
    onlyfiles = [f for f in listdir(chemin) if isfile(join(chemin, f))]
    print(onlyfiles)

    # resolve the checkpoint path once and reuse it (the original computed it
    # twice and also built an unused `curPath` variable)
    model_path = os.path.normpath(
        os.path.join(getcwd(), './mlm_tlm_xnli15_1024.pth'))
    print(model_path)
    reloaded = torch.load(model_path)

    params = AttrDict(reloaded['params'])
    print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

    # build dictionary / update parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    params.n_words = len(dico)
    params.bos_index = dico.index(BOS_WORD)
    params.eos_index = dico.index(EOS_WORD)
    params.pad_index = dico.index(PAD_WORD)
    params.unk_index = dico.index(UNK_WORD)
    params.mask_index = dico.index(MASK_WORD)

    # build model / reload weights
    model = TransformerModel(params, dico, True, True)
    model.load_state_dict(reloaded['model'])

    print('fin lecture')

    return model, params, dico
Example #5
0
def main(args):
    """Index a table file and its labels with the table vocabulary.

    Missing companion paths (labels / vocab) default to the table path plus a
    conventional suffix; the binarized output goes to `args.table + ".pth"`.
    """
    if args.table_label is None:
        args.table_label = args.table + "_label"
    if args.table_vocab is None:
        args.table_vocab = args.table + "_vocab"

    # all three inputs must exist before indexing
    for required in (args.table, args.table_label, args.table_vocab):
        assert os.path.isfile(required)

    print_args(args)

    table_dico = Dictionary.read_vocab(args.table_vocab)

    table_data = Dictionary.index_table(args.table, args.table_label,
                                        table_dico, args.table + ".pth")
Example #6
0
    def __init__(self, model_path, tgt_lang, src_lang,dump_path = "./dumped/", exp_name="translate", exp_id="test", batch_size=32):
        """Build a translator: reload the checkpoint at `model_path` on CPU
        and construct the encoder/decoder pair for src_lang -> tgt_lang.

        NOTE(review): the constructor arguments are only used as argparse
        defaults, and parser.parse_args() below reads sys.argv — command-line
        flags can silently override the constructor values. Presumably
        intentional (reuses the script's CLI plumbing); confirm before reuse.
        """
        # parse parameters
        parser = argparse.ArgumentParser(description="Translate sentences")
        
        # main parameters
        parser.add_argument("--dump_path", type=str, default=dump_path, help="Experiment dump path")
        parser.add_argument("--exp_name", type=str, default=exp_name, help="Experiment name")
        parser.add_argument("--exp_id", type=str, default=exp_id, help="Experiment ID")
        parser.add_argument("--batch_size", type=int, default=batch_size, help="Number of sentences per batch")
        # model / output paths
        parser.add_argument("--model_path", type=str, default=model_path, help="Model path")
        # parser.add_argument("--max_vocab", type=int, default=-1, help="Maximum vocabulary size (-1 to disable)")
        # parser.add_argument("--min_count", type=int, default=0, help="Minimum vocabulary count")
        # source language / target language
        parser.add_argument("--src_lang", type=str, default=src_lang, help="Source language")
        parser.add_argument("--tgt_lang", type=str, default=tgt_lang, help="Target language")

        params = parser.parse_args()
        # both languages must be set and distinct
        assert params.src_lang != '' and params.tgt_lang != '' and params.src_lang != params.tgt_lang

        # initialize the experiment
        logger = initialize_exp(params)
        
        # No GPU available: load the checkpoint on CPU
        #reloaded = torch.load(params.model_path)
        reloaded = torch.load(params.model_path, map_location=torch.device('cpu'))
        model_params = AttrDict(reloaded['params'])
        self.supported_languages = model_params.lang2id.keys() 
        logger.info("Supported languages: %s" % ", ".join(self.supported_languages))

        # update dictionary parameters
        for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
            setattr(params, name, getattr(model_params, name))

        # build dictionary / build encoder / build decoder / reload weights
        self.dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
        # CPU variants of the models (the .cuda() versions are kept commented out)
        #self.encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
        self.encoder = TransformerModel(model_params, self.dico, is_encoder=True, with_output=True).eval()
        #self.decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
        self.decoder = TransformerModel(model_params, self.dico, is_encoder=False, with_output=True).eval()
        self.encoder.load_state_dict(reloaded['encoder'])
        self.decoder.load_state_dict(reloaded['decoder'])
        params.src_id = model_params.lang2id[params.src_lang]
        params.tgt_id = model_params.lang2id[params.tgt_lang]
        self.model_params = model_params
        self.params = params
def main(args):
    """Index a summary file and its labels with the summary vocabulary.

    Missing companion paths (vocab / labels) default to the summary path plus
    a conventional suffix; the binarized output goes to `args.summary + ".pth"`.
    """
    if args.summary_vocab is None:
        args.summary_vocab = args.summary + "_vocab"
    if args.summary_label is None:
        args.summary_label = args.summary + "_label"

    # all three inputs must exist before indexing
    for required in (args.summary, args.summary_vocab, args.summary_label):
        assert os.path.isfile(required)

    print_args(args)

    summary_dico = Dictionary.read_vocab(args.summary_vocab)
    summary_data = Dictionary.index_summary(args.summary,
                                            args.summary_label,
                                            summary_dico,
                                            args.summary + ".pth",
                                            max_len=args.summary_max_length)
Example #8
0
def reload_ar_checkpoint(path):
    """Reload autoregressive params, dictionary and model from `path`."""
    # load checkpoint contents first
    reloaded = torch.load(path)
    params = AttrDict(reloaded['params'])

    # rebuild the dictionary and sync the derived parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    params.n_words = len(dico)
    params.n_langs = 1
    for attr, token in (('bos_index', BOS_WORD), ('eos_index', EOS_WORD),
                        ('pad_index', PAD_WORD), ('unk_index', UNK_WORD),
                        ('mask_index', MASK_WORD)):
        setattr(params, attr, dico.index(token))

    # build a decoder-only Transformer and restore its weights
    # NOTE(review): unlike sibling loaders here, no `dico` argument is passed —
    # presumably this TransformerModel variant takes (params, is_encoder,
    # with_output); confirm against its definition.
    model = TransformerModel(params, is_encoder=False, with_output=True)
    model.load_state_dict(reloaded['model'])
    return params, dico, model
Example #9
0
def reload_checkpoint(path):
    """Reload params, dictionary and model from the checkpoint at `path`."""
    # load checkpoint contents first
    reloaded = torch.load(path)
    params = AttrDict(reloaded['params'])
    print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

    # rebuild the dictionary and sync the derived vocabulary parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    params.n_words = len(dico)
    for attr, token in (('bos_index', BOS_WORD), ('eos_index', EOS_WORD),
                        ('pad_index', PAD_WORD), ('unk_index', UNK_WORD),
                        ('mask_index', MASK_WORD)):
        setattr(params, attr, dico.index(token))

    # build the model and restore its weights
    model = TransformerModel(params, dico, True, True)
    model.load_state_dict(reloaded['model'])

    return params, dico, model
Example #10
0
def load_model(params):
    """Rebuild the encoder/decoder pair from `params.model_path`.

    Updates `params` in place with the language setup and vocabulary indices
    from the checkpoint, and returns (encoder, decoder, dico).
    """
    # check parameters
    assert os.path.isdir(params.data_path)
    assert os.path.isfile(params.model_path)
    reloaded = torch.load(params.model_path)

    # encoder and decoder were saved with separate parameter sets
    encoder_model_params = AttrDict(reloaded['enc_params'])
    decoder_model_params = AttrDict(reloaded['dec_params'])

    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])

    # copy the language setup from the encoder parameters and sync vocab indices
    params.n_langs = encoder_model_params['n_langs']
    params.id2lang = encoder_model_params['id2lang']
    params.lang2id = encoder_model_params['lang2id']
    params.n_words = len(dico)
    params.bos_index = dico.index(BOS_WORD)
    params.eos_index = dico.index(EOS_WORD)
    params.pad_index = dico.index(PAD_WORD)
    params.unk_index = dico.index(UNK_WORD)
    params.mask_index = dico.index(MASK_WORD)

    def _strip_module_prefix(state_dict):
        # drop the "module." prefix added by multi-GPU (DataParallel) checkpoints
        return {(name[7:] if name.startswith('module.') else name): tensor
                for name, tensor in state_dict.items()}

    encoder = TransformerModel(encoder_model_params, dico,
                               is_encoder=True, with_output=False)
    decoder = TransformerModel(decoder_model_params, dico,
                               is_encoder=False, with_output=True)

    encoder.load_state_dict(_strip_module_prefix(reloaded['encoder']))
    decoder.load_state_dict(_strip_module_prefix(reloaded['decoder']))

    return encoder, decoder, dico
Example #11
0
def create_binary(txt_path, bin_path, dico):
    """Binarize `txt_path` into `bin_path` using `dico` and log statistics."""
    data = Dictionary.index_data(txt_path, bin_path, dico)
    n_sentences = len(data['positions'])
    # each sentence contributes one extra position entry, hence the subtraction
    n_words = len(data['sentences']) - n_sentences
    logger.info("%i words (%i unique) in %i sentences." % (
        n_words,
        len(data['dico']),
        n_sentences
    ))

    unk_words = data['unk_words']
    if not unk_words:
        logger.info("0 unknown word.")
        return

    n_unk = sum(unk_words.values())
    logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
        n_unk,
        len(unk_words),
        n_unk * 100. / n_words
    ))
    # only dump the individual unknown words when the list is short
    if len(unk_words) < 30:
        for word, count in sorted(unk_words.items(), key=lambda kv: kv[1])[::-1]:
            logger.info("%s: %i" % (word, count))
Example #12
0
from src.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD
from src.model.transformer import TransformerModel

import subprocess
from torch.nn.modules.distance import CosineSimilarity

cm = CosineSimilarity(dim=0)

# Initialize the model from the local checkpoint.
model_path = './mlm_tlm_xnli15_1024.pth'
reloaded = torch.load(model_path)
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

# build dictionary / update parameters
dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                  reloaded['dico_counts'])
params.n_words = len(dico)
params.bos_index = dico.index(BOS_WORD)
params.eos_index = dico.index(EOS_WORD)
params.pad_index = dico.index(PAD_WORD)
params.unk_index = dico.index(UNK_WORD)
params.mask_index = dico.index(MASK_WORD)

# build model / reload weights
model = TransformerModel(params, dico, True, True)
model.load_state_dict(reloaded['model'])

# Detect whether we run in a local Apple Terminal session.
# FIX: use .get() — TERM_PROGRAM is not set in every environment, and direct
# indexing (os.environ["TERM_PROGRAM"]) raised KeyError outside macOS terminals.
local = os.environ.get("TERM_PROGRAM") == "Apple_Terminal"
Example #13
0
import sys

from src.logger import create_logger
from src.data.dictionary import Dictionary

# NOTE(review): this region appears truncated/garbled by extraction — the
# logger.info call below is cut mid-expression and unrelated training code
# (seeding, pandas loading, dataset building) follows it, ending with two
# incomplete assignments. Code kept byte-identical; only comments added or
# translated.
if __name__ == "__main__":

    logger = create_logger(None, 0)

    # argv: [1] vocabulary file, [2] text file to binarize
    voc_path = sys.argv[1]
    txt_path = sys.argv[2]
    bin_path = sys.argv[2] + ".pth"
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)

    dico = Dictionary.read_vocab(voc_path)
    logger.info("")

    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data["sentences"]) - len(data["positions"]),
        len(data["dico"]),
        len(data["positions"]),
    ))
    if len(data["unk_words"]) > 0:
        logger.info(
            "%i unknown words (%i unique), covering %.2f%% of the data." % (
                sum(data["unk_words"].values()),
                len(data["unk_words"]),
                sum(data["unk_words"].values()) * 100.0 /
                (len(data["sentences"]) - len(data["positions"])),
    # --- unrelated training-script fragment begins here (extraction artifact) ---
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # ensure identical results on every run

    start_time = time.time()
    logger.info("Loading data...")

    logger.info('Building dictionary ...')

    data = pd.read_csv(config.train_file, sep='\t')
    if args.word:
        data = data['text'].values.tolist()
    else:
        data = data['text'].apply(lambda x: " ".join("".join(x.split())))
    if args.dictionary is None:
        dictionary = Dictionary()
        dictionary.build_dictionary(data)
        del data
        joblib.dump(dictionary, config.root_path + '/model/vocab.bin')
    else:
        dictionary = joblib.load(args.dictionary)
    if not args.model.isupper():
        tokenizer = config.tokenizer
    else:
        tokenizer = None

    logger.info('Making dataset & dataloader...')
    ### TODO
    # 1. Build a DataLoader using the custom MyDataset
    train_dataset =
    train_dataloader =
Example #15
0
    # NOTE(review): fragment of a larger __main__ block; the final logger.info
    # call is truncated by extraction. Code kept byte-identical.
    # src_txt_path = 'data/all.zh.bpe'
    # tgt_voc_path = 'data/vocab.en'
    # tgt_txt_path = 'data/all.en.bpe'

    # bin_path = 'data/cwmt.bin'
    # argv: [1] src text, [2] tgt text, [3] src vocab, [4] tgt vocab, [5] output
    src_voc_path = sys.argv[3]
    src_txt_path = sys.argv[1]
    tgt_voc_path = sys.argv[4]
    tgt_txt_path = sys.argv[2]
    bin_path = sys.argv[5]
    assert os.path.isfile(src_voc_path)
    assert os.path.isfile(src_txt_path)
    assert os.path.isfile(tgt_voc_path)
    assert os.path.isfile(tgt_txt_path)

    src_dico = Dictionary.read_vocab(src_voc_path)
    tgt_dico = Dictionary.read_vocab(tgt_voc_path)

    # index both sides of the parallel corpus into a single binary file
    data = Dictionary.index_data(src_txt_path, tgt_txt_path, src_dico,
                                 tgt_dico, bin_path)
    if data is None:
        exit(0)
    logger.info("%i words (%i unique) in %i sentences." %
                (len(data['src_sentences']) - len(data['src_positions']),
                 len(data['src_dico']), len(data['src_positions'])))
    logger.info("%i words (%i unique) in %i sentences." %
                (len(data['tgt_sentences']) - len(data['tgt_positions']),
                 len(data['tgt_dico']), len(data['tgt_positions'])))
    if len(data['src_unk_words']) > 0:
        logger.info(
            "%i unknown words (%i unique), covering %.2f%% of the data." %
Example #16
0
from src.logger import create_logger
from src.data.dictionary import Dictionary


# NOTE(review): this example is truncated by extraction — the final
# logger.info call is missing its closing parentheses. Code kept byte-identical.
if __name__ == '__main__':

    logger = create_logger(None, 0)

    # argv: [1] vocabulary (plain text or a .pth checkpoint), [2] text file
    voc_path = sys.argv[1]
    txt_path = sys.argv[2]
    bin_path = sys.argv[2] + '.pth'
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)
    # a .pth vocabulary is a checkpoint whose saved dictionary is reused directly
    if voc_path[-4:] == '.pth':
        reload = torch.load(voc_path)
        dico = Dictionary(id2word = reload['dico_id2word'], word2id = reload['dico_word2id'], counts = reload['dico_counts'])
    else:
        dico = Dictionary.read_vocab(voc_path)
    logger.info("")

    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data['sentences']) - len(data['positions']),
        len(data['dico']),
        len(data['positions'])
    ))
    if len(data['unk_words']) > 0:
        logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
            sum(data['unk_words'].values()),
            len(data['unk_words']),
            sum(data['unk_words'].values()) * 100. / (len(data['sentences']) - len(data['positions']))
Example #17
0
def main(params):
    """Translate sentences read from stdin and write them to params.output_path.

    Reads one tokenized sentence per line from stdin, translates
    params.src_lang -> params.tgt_lang with a reloaded encoder/decoder pair,
    and writes one translation per line. Requires CUDA.
    """
    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        encoder = network_to_half(encoder)
        decoder = network_to_half(decoder)

    # read sentences from stdin (empty lines are rejected)
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)
    logger.info("Read %i sentences from stdin. Translating ..." % len(src_sent))

    # FIX: open the output file in a context manager so the handle is closed
    # even if translation raises (the original leaked it on error).
    with io.open(params.output_path, 'w', encoding='utf-8') as f:

        for i in range(0, len(src_sent), params.batch_size):

            # prepare batch: indices padded to the longest sentence, with an
            # eos token at both ends of every sentence (hence lengths + 2)
            word_ids = [torch.LongTensor([dico.index(w) for w in s.strip().split()])
                        for s in src_sent[i:i + params.batch_size]]
            lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
            batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index)
            batch[0] = params.eos_index
            for j, s in enumerate(word_ids):
                if lengths[j] > 2:  # if sentence not empty
                    batch[1:lengths[j] - 1, j].copy_(s)
                batch[lengths[j] - 1, j] = params.eos_index
            langs = batch.clone().fill_(params.src_id)

            # encode source batch and translate it
            encoded = encoder('fwd', x=batch.cuda(), lengths=lengths.cuda(), langs=langs.cuda(), causal=False)
            encoded = encoded.transpose(0, 1)
            decoded, dec_lengths = decoder.generate(encoded, lengths.cuda(), params.tgt_id, max_len=int(1.5 * lengths.max().item() + 10))

            # convert sentences to words
            for j in range(decoded.size(1)):

                # remove delimiters (first eos opens the sentence, second ends it)
                sent = decoded[:, j]
                delimiters = (sent == params.eos_index).nonzero().view(-1)
                assert len(delimiters) >= 1 and delimiters[0].item() == 0
                sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

                # output translation
                source = src_sent[i + j].strip()
                target = " ".join([dico[sent[k].item()] for k in range(len(sent))])
                sys.stderr.write("%i / %i: %s -> %s\n" % (i + j, len(src_sent), source, target))
                f.write(target + "\n")
Example #18
0
def main(params):
    """Encode stdin sentences with a reloaded XLM encoder and save the
    encodings to params.output_path as a stacked numpy array.

    NOTE(review): `encoderouts` is defined elsewhere in this project —
    presumably it runs the encoder forward pass and pools per-sentence
    vectors; confirm against its definition.
    """
    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" %
                ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in [
            'n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index',
            'mask_index'
    ]:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    encoder = TransformerModel(model_params,
                               dico,
                               is_encoder=True,
                               with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    # decoder not needed for encoding-only runs; kept as None
    decoder = None
    #    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    #    decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # read sentences from stdin (empty lines are rejected)
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)
    logger.info("Read %i sentences from stdin. Translating ..." %
                len(src_sent))

    all_encodings = []
    # For each sentence...
    for i in range(0, len(src_sent), params.batch_size):
        # prepare batch: indices padded to the longest sentence, with an eos
        # token at both ends of every sentence (hence len(s) + 2)
        word_ids = [
            torch.LongTensor([dico.index(w) for w in s.strip().split()])
            for s in src_sent[i:i + params.batch_size]
        ]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(),
                                 lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode source batch and translate it, deal with padding
        encodings = encoderouts(encoder, batch, lengths, langs)

        # batch is actually in original order, append each sent to all_encodings
        for idx in encodings:
            all_encodings.append(idx.cpu().numpy())

    # Save all encodings to npy
    np.save(params.output_path, np.stack(all_encodings))
Example #19
0
#   12.0149  |   12   |   0.3   | /checkpoint/guismay/dumped/clm_test3/10431904/train.log
#   12.5228  |   18   |   0.1   | /checkpoint/guismay/dumped/clm_test2/10403079/train.log

#%%
# model_path = '/checkpoint/guismay/dumped/clm_test3/10431904/periodic-23.pth'
# Notebook cell: load a CLM checkpoint and rebuild its model for inspection.
model_path = '/checkpoint/guismay/dumped/clm_test3/10431904/periodic-23.pth'
reloaded = torch.load(model_path)
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

#%% [markdown]
# ## Build dictionary / update parameters / build model

#%%
# build dictionary / update parameters
dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                  reloaded['dico_counts'])
# the saved params are expected to already match the dictionary — assert
# instead of overwriting (unlike the loader functions elsewhere in this file)
assert params.n_words == len(dico)
assert params.bos_index == dico.index(BOS_WORD)
assert params.eos_index == dico.index(EOS_WORD)
assert params.pad_index == dico.index(PAD_WORD)
assert params.unk_index == dico.index(UNK_WORD)
assert params.mask_index == dico.index(MASK_WORD)

# build model / reload weights, then move to GPU and switch to eval mode
model = TransformerModel(params, dico, True, True)
model.load_state_dict(reloaded['model'])
model.cuda()
model.eval()

#%%
Example #20
0
def main(params):
    """Translate a binarized dataset with a reloaded encoder/decoder pair.

    Loads sentences from the binarized file params.input, writes translations
    to params.output_path and a text dump of the inputs to
    <dump_path>/input.txt. Requires CUDA.
    """
    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        encoder = network_to_half(encoder)
        decoder = network_to_half(decoder)

    input_data = torch.load(params.input)
    eval_dataset = Dataset(input_data["sentences"], input_data["positions"], params)

    # optionally restrict translation to a slice of the dataset
    if params.subset_start is not None:
        assert params.subset_end
        eval_dataset.select_data(params.subset_start, params.subset_end)

    eval_dataset.remove_empty_sentences()
    eval_dataset.remove_long_sentences(params.max_len)

    n_batch = 0

    logger.info("logging to {}".format(os.path.join(params.dump_path, 'input.txt')))

    # FIX: the original opened params.output_path twice (once via io.open and
    # once via the `with open(...)` below), leaking the first handle, and then
    # called close() on the already-closed context variable; inp_dump was also
    # not exception-safe. Open each file exactly once, in context managers.
    with io.open(params.output_path, "w", encoding="utf-8") as out, \
            io.open(os.path.join(params.dump_path, "input.txt"), "w", encoding="utf-8") as inp_dump:

        for batch in eval_dataset.get_iterator(shuffle=False):
            n_batch += 1

            (x1, len1) = batch
            # dump the source text alongside the translations for inspection
            input_text = convert_to_text(x1, len1, input_data["dico"], params)
            inp_dump.write("\n".join(input_text))
            inp_dump.write("\n")

            langs1 = x1.clone().fill_(params.src_id)

            # cuda
            x1, len1, langs1 = to_cuda(x1, len1, langs1)

            # encode source sentence
            enc1 = encoder("fwd", x=x1, lengths=len1, langs=langs1, causal=False)
            enc1 = enc1.transpose(0, 1)

            # generate translation - translate / convert to text
            max_len = int(1.5 * len1.max().item() + 10)
            if params.beam_size == 1:
                generated, lengths = decoder.generate(enc1, len1, params.tgt_id, max_len=max_len)
            else:
                generated, lengths = decoder.generate_beam(
                    enc1, len1, params.tgt_id, beam_size=params.beam_size,
                    length_penalty=params.length_penalty,
                    early_stopping=params.early_stopping,
                    max_len=max_len)

            hypotheses_batch = convert_to_text(generated, lengths, input_data["dico"], params)

            out.write("\n".join(hypotheses_batch))
            out.write("\n")

            if n_batch % 100 == 0:
                logger.info("{} batches processed".format(n_batch))
Example #21
0
def run_xnlg():
    """
    Run XNLG cross-lingual transfer tasks (XQG / XSumm).

    Loads a pretrained checkpoint, rebuilds the shared dictionary and the
    encoder/decoder transformers (whose depths may be overridden on the
    command line), optionally reloads pretrained embeddings, and launches
    every task listed in --transfer_tasks.
    """
    params = get_params()

    # initialize the experiment / build sentence embedder
    logger = initialize_exp(params)

    # batching by token count requires size-sorted batches
    if params.tokens_per_batch > -1:
        params.group_by_size = True

    # check parameters
    assert os.path.isdir(params.data_path)
    assert os.path.isfile(params.model_path)

    # tasks
    params.transfer_tasks = params.transfer_tasks.split(',')
    assert len(params.transfer_tasks) > 0
    assert all([task in TASKS for task in params.transfer_tasks])

    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" %
                ", ".join(model_params.lang2id.keys()))
    params.n_langs = model_params['n_langs']
    params.id2lang = model_params['id2lang']
    params.lang2id = model_params['lang2id']

    # encoder hyper-parameters: taken from the checkpoint when stored there,
    # otherwise inherited from the main model or overridden by --n_enc_layers
    if "enc_params" in reloaded:
        encoder_model_params = AttrDict(reloaded["enc_params"])
    elif params.n_enc_layers == model_params.n_layers or params.n_enc_layers == 0:
        encoder_model_params = model_params
    else:
        encoder_model_params = AttrDict(reloaded['params'])
        encoder_model_params.n_layers = params.n_enc_layers
        # BUG FIX: compare integer values with != instead of the identity
        # test `is not`, which is unreliable for ints (interning-dependent)
        assert model_params.n_layers != encoder_model_params.n_layers

    # decoder hyper-parameters, symmetric to the encoder above
    if "dec_params" in reloaded:
        decoder_model_params = AttrDict(reloaded["dec_params"])
    elif params.n_dec_layers == model_params.n_layers or params.n_dec_layers == 0:
        decoder_model_params = model_params
    else:
        decoder_model_params = AttrDict(reloaded['params'])
        decoder_model_params.n_layers = params.n_dec_layers
        # BUG FIX: value comparison instead of identity test (see above)
        assert model_params.n_layers != decoder_model_params.n_layers

    params.encoder_model_params = encoder_model_params
    params.decoder_model_params = decoder_model_params

    # optional embedding-dimension override
    if params.emb_dim != -1:
        encoder_model_params.emb_dim = params.emb_dim
        decoder_model_params.emb_dim = params.emb_dim

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])

    # keep special-token indices consistent across all parameter sets
    for p in [params, encoder_model_params, decoder_model_params]:
        p.n_words = len(dico)
        p.bos_index = dico.index(BOS_WORD)
        p.eos_index = dico.index(EOS_WORD)
        p.pad_index = dico.index(PAD_WORD)
        p.unk_index = dico.index(UNK_WORD)
        p.mask_index = dico.index(MASK_WORD)

    encoder = TransformerModel(encoder_model_params,
                               dico,
                               is_encoder=True,
                               with_output=False)
    decoder = TransformerModel(decoder_model_params,
                               dico,
                               is_encoder=False,
                               with_output=True)

    def _process_state_dict(state_dict):
        # strip the 'module.' prefix left by multi-GPU (DataParallel) checkpoints
        return {(k[7:] if k.startswith('module.') else k): v
                for k, v in state_dict.items()}

    if params.no_init == "all":
        logger.info("All Models will not load state dict.!!!")
    elif params.reload_emb != "":
        # reload pretrained word embeddings only
        logger.info("Reloading embedding from %s ..." % params.reload_emb)
        word2id, embeddings = read_txt_embeddings(logger, params.reload_emb)
        set_pretrain_emb(logger, encoder, dico, word2id, embeddings)
        set_pretrain_emb(logger, decoder, dico, word2id, embeddings)
    else:
        if "model" in reloaded:
            # single-model checkpoint: both sides reload from the same weights
            if params.no_init != "encoder":
                encoder.load_state_dict(_process_state_dict(reloaded['model']),
                                        strict=False)
            if params.no_init != "decoder":
                decoder.load_state_dict(_process_state_dict(reloaded['model']),
                                        strict=False)
        else:
            # separate encoder / decoder weights stored in the checkpoint
            if params.no_init != "encoder":
                encoder.load_state_dict(_process_state_dict(
                    reloaded['encoder']),
                                        strict=False)
            if params.no_init != "decoder":
                decoder.load_state_dict(
                    _process_state_dict(reloaded['decoder']))

    scores = {}

    # run
    for task in params.transfer_tasks:
        if task == "XQG":
            XQG_v3(encoder, decoder, scores, dico, params).run()
        elif task == "XSumm":
            XSumm(encoder, decoder, scores, dico, params).run()
Example #22
0
def main(params):
    """
    Translate the sentences in `params.sentences_path` from src_lang to
    tgt_lang with a reloaded encoder-decoder model, dumping the sources
    next to `params.output_path` and writing one hypothesis per line.
    """

    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]
    logger.info("encoder: {}".format(encoder))
    logger.info("decoder: {}".format(decoder))

    # read sentences from the input file
    # BUG FIX: strip the trailing newline here; the original kept it and then
    # wrote `sentence + "\n"` below, producing doubled newlines in the dump
    src_sent = []
    with open(params.sentences_path, 'r') as file1:
        for line in file1:
            src_sent.append(line.rstrip('\n'))

    logger.info("Read %i sentences from sentences file.Writing them to a src file. Translating ..." % len(src_sent))
    # use context managers so the files are closed even on failure
    with io.open(params.output_path + 'src_sent', 'w', encoding='utf-8') as src_dump:
        for sentence in src_sent:
            src_dump.write(sentence + "\n")
    logger.info("Wrote them to a src file")

    with io.open(params.output_path, 'w', encoding='utf-8') as f:

        for i in range(0, len(src_sent), params.batch_size):

            # prepare batch: pad to the longest sentence, frame with eos tokens
            word_ids = [torch.LongTensor([dico.index(w) for w in s.strip().split()])
                        for s in src_sent[i:i + params.batch_size]]
            lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
            batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index)
            batch[0] = params.eos_index
            for j, s in enumerate(word_ids):
                if lengths[j] > 2:  # if sentence not empty
                    batch[1:lengths[j] - 1, j].copy_(s)
                batch[lengths[j] - 1, j] = params.eos_index
            langs = batch.clone().fill_(params.src_id)

            # encode source batch and translate it with beam search
            encoded, _ = encoder('fwd', x=batch.cuda(), lengths=lengths.cuda(), langs=langs.cuda(), causal=False,
                                 encoder_only=False, extra_adapters_flag=True)
            encoded = encoded.transpose(0, 1)
            decoded, dec_lengths = decoder.generate_beam(
                encoded, lengths.cuda(), params.tgt_id, beam_size=params.beam_size, length_penalty=params.length_penalty,
                early_stopping=params.early_stopping, max_len=int(1.5 * lengths.cuda().max().item() + 10),
                extra_adapters_flag=True)

            # convert sentences to words
            for j in range(decoded.size(1)):

                # remove delimiters (first eos opens the sentence; a second one,
                # if present, closes it)
                sent = decoded[:, j]
                delimiters = (sent == params.eos_index).nonzero().view(-1)
                assert len(delimiters) >= 1 and delimiters[0].item() == 0
                sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

                # output translation; log a sample every 10k sentences
                source = src_sent[i + j].strip()
                target = " ".join([dico[sent[k].item()] for k in range(len(sent))])
                if (i + j) % 10000 == 0:
                    logger.info("Translation of %i / %i:\n Source sentence: %s \n Translation: %s\n" % (i + j, len(src_sent), source, target))

                f.write(target + "\n")
Example #23
0
def main(args):
    """
    Score parallel sentence pairs with a reloaded sentence-pair classifier.

    For every (src, tgt) parallel dataset found under args.data_path, each
    sentence pair is embedded by the model and a sigmoid score is written to
    <dump_path>/<src>-<tgt>.pred, one score per line.
    """
    # NOTE(review): rng appears unused in this function
    rng = np.random.RandomState(0)

    # Make dump path (refuse to run into a non-empty directory)
    if not os.path.exists(args.dump_path):
        subprocess.Popen("mkdir -p %s" % args.dump_path, shell=True).wait()
    else:
        if os.listdir(args.dump_path):
            m = "Directory {} is not empty.".format(args.dump_path)
            raise ValueError(m)
    # optionally mirror progress messages into a log file
    if len(args.log_file):
        write_log = True
    else:
        write_log = False

    # load model parameters (pickled alongside the checkpoint)
    model_dir = os.path.dirname(args.load_model)
    params_path = os.path.join(model_dir, 'params.pkl')
    with open(params_path, "rb") as f:
        params = pickle.load(f)

    # load data parameters and model parameters from checkpoint
    checkpoint_path = os.path.join(model_dir, 'checkpoint.pth')
    assert os.path.isfile(checkpoint_path)
    data = torch.load(
        checkpoint_path,
        map_location=lambda storage, loc: storage.cuda(params.local_rank))
    # checkpoint params override the pickled ones
    for k, v in data["params"].items():
        params.__dict__[k] = v
    dico = Dictionary(data["dico_id2word"], data["dico_word2id"],
                      data["dico_counts"])

    # Print score
    for k, v in data["best_metrics"].items():
        print("- {}: {}".format(k, v))

    # Fix some of the params we pass to load_data
    params.debug_train = False
    params.max_vocab = -1
    params.min_count = 0
    params.tokens_per_batch = -1
    params.max_batch_size = args.batch_size
    params.batch_size = args.batch_size

    # load data
    data = load_data(args.data_path, params)

    # Print data summary
    for (src, tgt), dataset in data['para'].items():
        datatype = "Para data (%s)" % (
            "WITHOUT labels" if dataset.labels is None else "WITH labels")
        m = '{: <27} - {: >12}:{: >10}'.format(datatype, '%s-%s' % (src, tgt),
                                               len(dataset))
        print(m)

    # Fix some of the params we pass to the model builder
    params.reload_model = args.load_model

    # build model (encoder-only checkpoints return a single model; otherwise
    # only the encoder side is used for scoring)
    if params.encoder_only:
        model = build_model(params, dico)
    else:
        encoder, decoder = build_model(params, dico)
        model = encoder

    # Predict
    model = model.module if params.multi_gpu else model
    model.eval()
    start = time.time()
    for (src, tgt), dataset in data['para'].items():
        path = os.path.join(args.dump_path, "{}-{}.pred".format(src, tgt))
        scores_file = open(path, "w")
        lang1_id = params.lang2id[src]
        lang2_id = params.lang2id[tgt]
        diffs = []
        nb_written = 0
        for batch in dataset.get_iterator(False,
                                          group_by_size=False,
                                          n_sentences=-1,
                                          return_indices=False):
            (sent1, len1), (sent2, len2), labels = batch
            # truncate over-long sentences, then concatenate the pair into a
            # single sequence with per-sentence positions
            sent1, len1 = truncate(sent1, len1, params.max_len,
                                   params.eos_index)
            sent2, len2 = truncate(sent2, len2, params.max_len,
                                   params.eos_index)
            x, lengths, positions, langs = concat_batches(sent1,
                                                          len1,
                                                          lang1_id,
                                                          sent2,
                                                          len2,
                                                          lang2_id,
                                                          params.pad_index,
                                                          params.eos_index,
                                                          reset_positions=True)
            x, lengths, positions, langs = to_cuda(x, lengths, positions,
                                                   langs)
            with torch.no_grad():
                # Get sentence pair embedding
                h = model('fwd',
                          x=x,
                          lengths=lengths,
                          positions=positions,
                          langs=langs,
                          causal=False)[0]
                CLF_ID1, CLF_ID2 = 8, 9  # very hacky, use embeddings to make weights for the classifier
                emb = (model.module
                       if params.multi_gpu else model).embeddings.weight
                pred = F.linear(h, emb[CLF_ID1].unsqueeze(0), emb[CLF_ID2, 0])
                pred = torch.sigmoid(pred)
                pred = pred.view(-1).cpu().numpy().tolist()
            # one score per pair; empty pairs get a fixed zero score
            for p, l1, l2 in zip(pred, len1, len2):
                if l1.item() == 0 and l2.item() == 0:
                    scores_file.write("0.00000000\n")
                else:
                    scores_file.write("{:.8f}\n".format(p))
            nb_written += len(pred)
            # periodic progress report (elapsed time as hh:mm:ss)
            if nb_written % 1000 == 0:
                elapsed = int(time.time() - start)
                lpss = elapsed % 60
                lpsm = elapsed // 60
                lpsh = lpsm // 60
                lpsm = lpsm % 60
                msg = "[{:02d}:{:02d}:{:02d} {}-{}]".format(
                    lpsh, lpsm, lpss, src, tgt)
                msg += " {}/{} ({:.2f}%) sentences processed".format(
                    nb_written, len(dataset), 100 * nb_written / len(dataset))
                print(msg)
                if write_log:
                    with open(args.log_file, "a") as fout:
                        fout.write(msg + "\n")
            # Try reversing order: score the same pair as (sent2, sent1) to
            # measure how symmetric the classifier is
            if TEST_REVERSE:
                x, lengths, positions, langs = concat_batches(
                    sent2,
                    len2,
                    lang2_id,
                    sent1,
                    len1,
                    lang1_id,
                    params.pad_index,
                    params.eos_index,
                    reset_positions=True)
                x, lengths, positions, langs = to_cuda(x, lengths, positions,
                                                       langs)
                with torch.no_grad():
                    # Get sentence pair embedding
                    h = model('fwd',
                              x=x,
                              lengths=lengths,
                              positions=positions,
                              langs=langs,
                              causal=False)[0]
                    CLF_ID1, CLF_ID2 = 8, 9  # very hacky, use embeddings to make weights for the classifier
                    emb = (model.module
                           if params.multi_gpu else model).embeddings.weight
                    pred_rev = F.linear(h, emb[CLF_ID1].unsqueeze(0),
                                        emb[CLF_ID2, 0])
                    pred_rev = torch.sigmoid(pred_rev)
                    pred_rev = pred_rev.view(-1).cpu().numpy().tolist()
                    # track score asymmetry between (l1,l2) and (l2,l1)
                    for p, pp in zip(pred, pred_rev):
                        diffs.append(p - pp)

        if TEST_REVERSE:
            print(
                "Average absolute diff between score(l1,l2) and score(l2,l1): {}"
                .format(np.mean(np.abs(diffs))))

        scores_file.close()
if __name__ == '__main__':
    # command-line interface for indexing a summary corpus
    arg_parser = argparse.ArgumentParser(description="")
    arg_parser.add_argument('--summary', help="summary data")
    arg_parser.add_argument('--summary_vocab', help="summary data vocab")
    arg_parser.add_argument('--summary_label', help="summary data label")
    arg_parser.add_argument('--summary_max_length',
                            type=int,
                            default=600,
                            help="summmary maximum length")
    args = arg_parser.parse_args()

    # derive default companion paths from the main summary file
    for attr, suffix in (("summary_vocab", "_vocab"),
                         ("summary_label", "_label")):
        if getattr(args, attr) is None:
            setattr(args, attr, args.summary + suffix)

    # all three input files must exist
    for required in (args.summary, args.summary_vocab, args.summary_label):
        assert os.path.isfile(required)

    print_args(args)

    summary_dico = Dictionary.read_vocab(args.summary_vocab)
    summary_data = Dictionary.index_summary(args.summary,
                                            args.summary_label,
                                            summary_dico,
                                            args.summary + ".pth",
                                            max_len=args.summary_max_length)
Example #25
0
def main(params):
    """
    Ensemble translation: reload one or more encoder-decoder checkpoints
    (comma-separated in --model_path), encode sentences read from stdin with
    every encoder, and beam-search over the joint set of decoders.
    """

    # initialize the experiment
    logger = initialize_exp(params)
    parser = get_parser()
    params = parser.parse_args()
    models_path = params.model_path.split(',')

    # reload every checkpoint; the first one provides the shared parameters
    models_reloaded = []
    for model_path in models_path:
        models_reloaded.append(torch.load(model_path))
    model_params = AttrDict(models_reloaded[0]['params'])
    logger.info("Supported languages: %s" %
                ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in [
            'n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index',
            'mask_index'
    ]:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    # (the dictionary is shared: taken from the first checkpoint)
    dico = Dictionary(models_reloaded[0]['dico_id2word'],
                      models_reloaded[0]['dico_word2id'],
                      models_reloaded[0]['dico_counts'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    encoders = []
    decoders = []

    def package_module(modules):
        # strip the 'module.' prefix left by multi-GPU (DataParallel) checkpoints
        state_dict = OrderedDict()
        for k, v in modules.items():
            if k.startswith('module.'):
                state_dict[k[7:]] = v
            else:
                state_dict[k] = v
        return state_dict

    # build one encoder/decoder pair per reloaded checkpoint
    for reloaded in models_reloaded:
        encoder = TransformerModel(model_params,
                                   dico,
                                   is_encoder=True,
                                   with_output=True).to(params.device).eval()
        decoder = TransformerModel(model_params,
                                   dico,
                                   is_encoder=False,
                                   with_output=True).to(params.device).eval()
        encoder.load_state_dict(package_module(reloaded['encoder']))
        decoder.load_state_dict(package_module(reloaded['decoder']))

        # float16
        if params.fp16:
            assert torch.backends.cudnn.enabled
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

        encoders.append(encoder)
        decoders.append(decoder)

    #src_sent = ['Poly@@ gam@@ ie statt Demokratie .']
    # read source sentences from stdin (one per line, must be non-empty)
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)

    f = io.open(params.output_path, 'w', encoding='utf-8')

    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch: each sentence is framed by two eos tokens and padded
        # up to the longest sentence in the batch
        word_ids = [
            torch.LongTensor([dico.index(w) for w in s.strip().split()])
            for s in src_sent[i:i + params.batch_size]
        ]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(),
                                 lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode the source batch with every encoder of the ensemble
        encodeds = []
        for encoder in encoders:
            encoded = encoder('fwd',
                              x=batch.to(params.device),
                              lengths=lengths.to(params.device),
                              langs=langs.to(params.device),
                              causal=False)
            encoded = encoded.transpose(0, 1)
            encodeds.append(encoded)

            assert encoded.size(0) == lengths.size(0)

        # joint beam search over all decoders and encoded inputs
        decoded, dec_lengths = generate_beam(
            decoders,
            encodeds,
            lengths.to(params.device),
            params.tgt_id,
            beam_size=params.beam,
            length_penalty=params.length_penalty,
            early_stopping=False,
            max_len=int(1.5 * lengths.max().item() + 10),
            params=params)

        # convert sentences to words
        for j in range(decoded.size(1)):

            # remove delimiters (first eos opens the sentence; a second one,
            # if present, closes it)
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation
            source = src_sent[i + j].strip()
            target = " ".join([dico[sent[k].item()] for k in range(len(sent))])
            sys.stderr.write("%i / %i: %s -> %s\n" %
                             (i + j, len(src_sent), source, target))
            f.write(target + "\n")

    f.close()
Example #26
0
def main(params):
    """
    Generate text from table records with a reloaded table-to-text model and
    write one generated sentence per input line to `params.output_path`.
    """
    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])

    # update dictionary parameters
    for name in ['src_n_words', 'tgt_n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    source_dico = Dictionary(reloaded['source_dico_id2word'], reloaded['source_dico_word2id'])
    target_dico = Dictionary(reloaded['target_dico_id2word'], reloaded['target_dico_word2id'])
    encoder = TransformerEncoder(model_params, source_dico, with_output=False).cuda().eval()
    decoder = TransformerDecoder(model_params, target_dico, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])

    # read the table records (one table per line)
    table_inf = open(params.table_path, 'r', encoding='utf-8')
    table_lines = list(table_inf)

    outf = io.open(params.output_path, 'w', encoding='utf-8')

    for start in range(0, len(table_lines), params.batch_size):
        # split each "a|b|c|d" record into four parallel id sequences
        field_ids = [[], [], [], []]
        for table_line in table_lines[start:start + params.batch_size]:
            record_seq = [each.split('|') for each in table_line.split()]
            assert all(len(rec) == 4 for rec in record_seq)
            for col in range(4):
                field_ids[col].append(torch.LongTensor(
                    [source_dico.index(rec[col]) for rec in record_seq]))

        enc_xlen = torch.LongTensor([len(seq) + 2 for seq in field_ids[0]])
        max_xlen, n_seq = enc_xlen.max().item(), enc_xlen.size(0)

        # build the four padded input tensors, framed by eos tokens
        enc_inputs = []
        for col in range(4):
            tensor = torch.LongTensor(max_xlen, n_seq).fill_(params.pad_index)
            tensor[0] = params.eos_index
            for j, seq in enumerate(field_ids[col]):
                if enc_xlen[j] > 2:  # if sentence not empty
                    tensor[1:enc_xlen[j] - 1, j].copy_(seq)
                tensor[enc_xlen[j] - 1, j] = params.eos_index
            enc_inputs.append(tensor.cuda())
        enc_x1, enc_x2, enc_x3, enc_x4 = enc_inputs
        enc_xlen = enc_xlen.cuda()

        # encode source batch and translate it
        encoder_output = encoder('fwd', x1=enc_x1, x2=enc_x2, x3=enc_x3, x4=enc_x4, lengths=enc_xlen)
        encoder_output = encoder_output.transpose(0, 1)

        max_len = 602  # fixed generation cap
        if params.beam_size <= 1:
            decoded, dec_lengths = decoder.generate(encoder_output, enc_xlen, max_len=max_len)
        else:
            decoded, dec_lengths = decoder.generate_beam(encoder_output, enc_xlen, params.beam_size,
                                            params.length_penalty, params.early_stopping, max_len=max_len)

        # convert generated ids back into words
        for j in range(decoded.size(1)):

            # remove delimiters: the first eos opens the sentence; a second
            # one (if any) closes it
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output generated text
            target = " ".join([target_dico[sent[k].item()] for k in range(len(sent))])
            sys.stderr.write("%i / %i: %s\n" % (start + j, len(table_lines), target))
            outf.write(target + "\n")

    outf.close()

def print_args(args):
    """Print the table-related CLI arguments, one `name:<TAB>value` per line."""
    for field in ("table", "table_label", "table_vocab"):
        print("{}:\t{}".format(field, getattr(args, field)))


if __name__ == '__main__':
    # command-line interface for indexing a table corpus
    arg_parser = argparse.ArgumentParser(description="")
    arg_parser.add_argument('--table', help="table dataOld")
    arg_parser.add_argument('--table_label', help="table label")
    arg_parser.add_argument('--table_vocab', help="table vocab")
    args = arg_parser.parse_args()

    # fill in default companion paths derived from the table file
    for attr, suffix in (("table_label", "_label"), ("table_vocab", "_vocab")):
        if getattr(args, attr) is None:
            setattr(args, attr, args.table + suffix)

    # all three input files must exist
    for required in (args.table, args.table_label, args.table_vocab):
        assert os.path.isfile(required)

    print_args(args)

    table_dico = Dictionary.read_vocab(args.table_vocab)
    table_data = Dictionary.index_table(args.table, args.table_label,
                                        table_dico, args.table + ".pth")
Example #28
0
class Translate():
    """
    CPU-only wrapper around a pretrained XLM encoder-decoder checkpoint that
    translates sentences between a fixed source and target language.
    """

    def __init__(self,
                 model_path,
                 tgt_lang,
                 src_lang,
                 dump_path="./dumped/",
                 exp_name="translate",
                 exp_id="test",
                 batch_size=32):
        """
        Reload the checkpoint at `model_path` and build the encoder/decoder.

        The constructor arguments only seed the argparse defaults, so they
        can still be overridden on the command line.
        """

        # parse parameters
        parser = argparse.ArgumentParser(description="Translate sentences")

        # main parameters
        parser.add_argument("--dump_path",
                            type=str,
                            default=dump_path,
                            help="Experiment dump path")
        parser.add_argument("--exp_name",
                            type=str,
                            default=exp_name,
                            help="Experiment name")
        parser.add_argument("--exp_id",
                            type=str,
                            default=exp_id,
                            help="Experiment ID")
        parser.add_argument("--batch_size",
                            type=int,
                            default=batch_size,
                            help="Number of sentences per batch")
        # model / output paths
        parser.add_argument("--model_path",
                            type=str,
                            default=model_path,
                            help="Model path")
        # source language / target language
        parser.add_argument("--src_lang",
                            type=str,
                            default=src_lang,
                            help="Source language")
        parser.add_argument("--tgt_lang",
                            type=str,
                            default=tgt_lang,
                            help="Target language")
        parser.add_argument('-d',
                            "--text",
                            type=str,
                            default="",
                            nargs='+',
                            help="Text to be translated")

        params = parser.parse_args()
        assert params.src_lang != '' and params.tgt_lang != '' and params.src_lang != params.tgt_lang

        # initialize the experiment
        logger = initialize_exp(params)

        # no GPU assumed: map the checkpoint onto the CPU
        reloaded = torch.load(params.model_path,
                              map_location=torch.device('cpu'))
        model_params = AttrDict(reloaded['params'])
        self.supported_languages = model_params.lang2id.keys()
        logger.info("Supported languages: %s" %
                    ", ".join(self.supported_languages))

        # update dictionary parameters (meta-learning checkpoints store them
        # inside meta_params instead of at the top level)
        for name in [
                'n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index',
                'mask_index'
        ]:
            try:
                setattr(params, name, getattr(model_params, name))
            except AttributeError:
                key = list(model_params.meta_params.keys())[0]
                attr = getattr(model_params.meta_params[key], name)
                setattr(params, name, attr)
                setattr(model_params, name, attr)

        # build dictionary / build encoder / build decoder / reload weights
        self.dico = Dictionary(reloaded['dico_id2word'],
                               reloaded['dico_word2id'],
                               reloaded['dico_counts'])
        self.encoder = TransformerModel(model_params,
                                        self.dico,
                                        is_encoder=True,
                                        with_output=True).eval()
        self.decoder = TransformerModel(model_params,
                                        self.dico,
                                        is_encoder=False,
                                        with_output=True).eval()
        self.encoder.load_state_dict(reloaded['encoder'])
        self.decoder.load_state_dict(reloaded['decoder'])
        params.src_id = model_params.lang2id[params.src_lang]
        params.tgt_id = model_params.lang2id[params.tgt_lang]
        self.model_params = model_params
        self.params = params

    def translate(self, src_sent=None):
        """
        Translate a sentence or a list of sentences.

        Args:
            src_sent: a single string or a list of strings; defaults to an
                empty list. BUG FIX: the original used a mutable default
                argument (`src_sent=[]`).
        Returns:
            A single translated string when a string was passed, otherwise a
            list of translations.
        """
        if src_sent is None:
            src_sent = []
        flag = False
        if isinstance(src_sent, str):  # fixed: isinstance over type() ==
            src_sent = [src_sent]
            flag = True
        tgt_sent = []
        for i in range(0, len(src_sent), self.params.batch_size):
            # prepare batch: pad to the longest sentence, frame with eos tokens
            word_ids = [
                torch.LongTensor(
                    [self.dico.index(w) for w in s.strip().split()])
                for s in src_sent[i:i + self.params.batch_size]
            ]
            lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
            batch = torch.LongTensor(lengths.max().item(),
                                     lengths.size(0)).fill_(
                                         self.params.pad_index)
            batch[0] = self.params.eos_index
            for j, s in enumerate(word_ids):
                if lengths[j] > 2:  # if sentence not empty
                    batch[1:lengths[j] - 1, j].copy_(s)
                batch[lengths[j] - 1, j] = self.params.eos_index
            langs = batch.clone().fill_(self.params.src_id)

            # encode source batch and translate it (CPU tensors throughout)
            encoded = self.encoder('fwd',
                                   x=batch,
                                   lengths=lengths,
                                   langs=langs,
                                   causal=False)
            encoded = encoded.transpose(0, 1)
            decoded, dec_lengths = self.decoder.generate(
                encoded,
                lengths,
                self.params.tgt_id,
                max_len=int(1.5 * lengths.max().item() + 10))

            # convert sentences to words
            for j in range(decoded.size(1)):

                # remove delimiters (first eos opens the sentence; a second
                # one, if present, closes it)
                sent = decoded[:, j]
                delimiters = (sent == self.params.eos_index).nonzero().view(-1)
                assert len(delimiters) >= 1 and delimiters[0].item() == 0
                sent = sent[1:] if len(
                    delimiters) == 1 else sent[1:delimiters[1]]

                # output translation
                source = src_sent[i + j].strip()
                target = " ".join(
                    [self.dico[sent[k].item()] for k in range(len(sent))])
                sys.stderr.write("%i / %i: %s -> %s\n" %
                                 (i + j, len(src_sent), source, target))
                tgt_sent.append(target)

        if flag:
            return tgt_sent[0]
        return tgt_sent
Example #29
0
def main(params):
    """Translate sentences read from stdin with a pretrained encoder/decoder.

    Reloads an XLM-style translation checkpoint (``params.model_path``),
    rebuilds the dictionary and Transformer encoder/decoder on GPU, translates
    every stdin line from ``params.src_lang`` to ``params.tgt_lang`` (greedy or
    beam search), writes one hypothesis file per beam rank next to
    ``params.output_path``, and optionally reports BLEU against
    ``params.ref_path``.

    Note: requires CUDA (models are moved with ``.cuda()`` and
    ``torch.load`` is called without ``map_location``).
    """

    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    # NOTE(review): this re-parses sys.argv and REPLACES the `params` the
    # caller passed in (which was already used by initialize_exp above).
    # Looks redundant/fragile — confirm the caller parses with the same parser.
    parser = get_parser()
    params = parser.parse_args()
    torch.manual_seed(
        params.seed
    )  # Set random seed. NB: Multi-GPU also needs torch.cuda.manual_seed_all(params.seed)
    # Sampling and beam search are mutually exclusive; amp levels > 1 unsupported.
    # NOTE(review): `assert` is stripped under `python -O`; consider raising instead.
    assert (params.sample_temperature
            == 0) or (params.beam_size == 1), 'Cannot sample with beam search.'
    assert params.amp <= 1, f'params.amp == {params.amp} not yet supported.'
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" %
                ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    # Copy the special-token indices from the checkpoint so batch construction
    # below matches the vocabulary the model was trained with.
    for name in [
            'n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index',
            'mask_index'
    ]:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                      reloaded['dico_counts'])
    encoder = TransformerModel(model_params,
                               dico,
                               is_encoder=True,
                               with_output=False).cuda().eval()
    decoder = TransformerModel(model_params,
                               dico,
                               is_encoder=False,
                               with_output=True).cuda().eval()
    # Strip the 'module.' prefix left by DataParallel/DistributedDataParallel
    # checkpoints before loading the state dicts.
    if all([k.startswith('module.') for k in reloaded['encoder'].keys()]):
        reloaded['encoder'] = {
            k[len('module.'):]: v
            for k, v in reloaded['encoder'].items()
        }
    encoder.load_state_dict(reloaded['encoder'])
    if all([k.startswith('module.') for k in reloaded['decoder'].keys()]):
        reloaded['decoder'] = {
            k[len('module.'):]: v
            for k, v in reloaded['decoder'].items()
        }
    decoder.load_state_dict(reloaded['decoder'])

    # Optional mixed-precision via NVIDIA apex (amp == 1 -> opt level "O1").
    if params.amp != 0:
        models = apex.amp.initialize([encoder, decoder],
                                     opt_level=('O%i' % params.amp))
        encoder, decoder = models

    # Language ids used to build the `langs` tensor and select the decoder language.
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # read sentences from stdin (one sentence per line; empty lines rejected)
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)
    logger.info("Read %i sentences from stdin. Translating ..." %
                len(src_sent))

    # f = io.open(params.output_path, 'w', encoding='utf-8')

    # hypothesis[r] collects the rank-r hypothesis for every input sentence
    # (a single list when beam_size == 1).
    hypothesis = [[] for _ in range(params.beam_size)]
    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch: each sentence becomes <eos> w1 ... wn <eos>, padded to
        # the max length of the batch; shape is (max_len, batch_size).
        word_ids = [
            torch.LongTensor([dico.index(w) for w in s.strip().split()])
            for s in src_sent[i:i + params.batch_size]
        ]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])  # +2 for the two <eos> delimiters
        batch = torch.LongTensor(lengths.max().item(),
                                 lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        # Every position is tagged with the source language id.
        langs = batch.clone().fill_(params.src_id)

        # encode source batch and translate it
        encoded = encoder('fwd',
                          x=batch.cuda(),
                          lengths=lengths.cuda(),
                          langs=langs.cuda(),
                          causal=False)
        # generate() expects (batch, seq, dim) — transpose from (seq, batch, dim).
        encoded = encoded.transpose(0, 1)
        # Heuristic cap on output length relative to the longest source sentence.
        max_len = int(1.5 * lengths.max().item() + 10)
        if params.beam_size == 1:
            # Greedy decoding, optionally with temperature sampling
            # (sample_temperature == 0 means plain argmax).
            decoded, dec_lengths = decoder.generate(
                encoded,
                lengths.cuda(),
                params.tgt_id,
                max_len=max_len,
                sample_temperature=(None if params.sample_temperature == 0 else
                                    params.sample_temperature))
        else:
            # Beam search; all_hyp_strs holds the full n-best list per sentence.
            decoded, dec_lengths, all_hyp_strs = decoder.generate_beam(
                encoded,
                lengths.cuda(),
                params.tgt_id,
                beam_size=params.beam_size,
                length_penalty=params.length_penalty,
                early_stopping=params.early_stopping,
                max_len=max_len,
                output_all_hyps=True)
        # hypothesis.extend(convert_to_text(decoded, dec_lengths, dico, params))

        # convert sentences to words
        for j in range(decoded.size(1)):

            # remove delimiters: output starts with <eos>; keep tokens up to the
            # second <eos> (or everything after the first if none follows).
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation; '<unk>' is escaped so restore_segmentation /
            # downstream tooling can distinguish it from literal text.
            source = src_sent[i + j].strip().replace('<unk>', '<<unk>>')
            target = " ".join([dico[sent[k].item()] for k in range(len(sent))
                               ]).replace('<unk>', '<<unk>>')
            if params.beam_size == 1:
                hypothesis[0].append(target)
            else:
                # When a sentence has fewer hypotheses than beam_size, repeat
                # its last hypothesis for the remaining ranks (index -1).
                for hyp_rank in range(params.beam_size):
                    print(
                        all_hyp_strs[j]
                        [hyp_rank if hyp_rank < len(all_hyp_strs[j]) else -1])
                    hypothesis[hyp_rank].append(
                        all_hyp_strs[j]
                        [hyp_rank if hyp_rank < len(all_hyp_strs[j]) else -1])

            # Progress log on stderr with BPE markers ('@@ ') removed for readability.
            sys.stderr.write("%i / %i: %s -> %s\n" %
                             (i + j, len(src_sent), source.replace(
                                 '@@ ', ''), target.replace('@@ ', '')))
            # f.write(target + "\n")

    # f.close()

    # export sentences to reference and hypothesis files / restore BPE segmentation
    # One file per beam rank; the filename encodes the decoding hyper-parameters.
    save_dir, split = params.output_path.rsplit('/', 1)
    for hyp_rank in range(len(hypothesis)):
        hyp_name = f'hyp.st={params.sample_temperature}.bs={params.beam_size}.lp={params.length_penalty}.es={params.early_stopping}.seed={params.seed if (len(hypothesis) == 1) else str(hyp_rank)}.{params.src_lang}-{params.tgt_lang}.{split}.txt'
        hyp_path = os.path.join(save_dir, hyp_name)
        with open(hyp_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(hypothesis[hyp_rank]) + '\n')
        restore_segmentation(hyp_path)

        # evaluate BLEU score
        if params.ref_path:
            bleu = eval_moses_bleu(params.ref_path, hyp_path)
            logger.info("BLEU %s %s : %f" % (hyp_path, params.ref_path, bleu))
import os

import torch

from logging import getLogger
from src.utils import AttrDict
from src.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD
from src.model.transformer import TransformerModel

logger = getLogger()


# NOTE: remember to replace the model path here
model_path = './dumped/XLM_bora_es/abcedf/checkpoint.pth'
reloaded = torch.load(model_path)
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
params.n_words = len(dico)
params.bos_index = dico.index(BOS_WORD)
params.eos_index = dico.index(EOS_WORD)
params.pad_index = dico.index(PAD_WORD)
params.unk_index = dico.index(UNK_WORD)
params.mask_index = dico.index(MASK_WORD)

# build model / reload weights
model = TransformerModel(params, dico, True, True)
model.eval()
model.load_state_dict(reloaded['model'])

codes = "./data/processed/XLM_bora_es/60k/codes"  # path to the codes of the model
fastbpe = os.path.join(os.getcwd(), 'tools/fastBPE/fast')