# Example #1
from tqdm import tqdm
from pathlib import Path

import params
from run import load_model, get_tokenizers
from retrieval.data.loaders import get_loader
from retrieval.model import model
from retrieval.train.train import Trainer
from retrieval.utils import file_utils, helper
from retrieval.utils.logger import create_logger
from run import load_yaml_opts, parse_loader_name, get_data_path

if __name__ == '__main__':
    args = params.get_test_params()
    opt = load_yaml_opts(args.options)
    logger = create_logger(level='debug' if opt.engine.debug else 'info')

    logger.info(f'Used args   : \n{args}')
    logger.info(f'Used options: \n{opt}')

    data_path = get_data_path(opt)

    loaders = []
    for data_info in opt.dataset.val.data:
        _, lang = parse_loader_name(data_info)
        loaders.append(
            get_loader(data_split=args.data_split,
                       data_path=data_path,
                       data_info=data_info,
                       loader_name=opt.dataset.loader_name,
                       local_rank=args.local_rank,
# Example #2 — NOTE(review): different file fragment; the get_loader(...) call above is truncated.
    for k, v in tqdm(tokenizer.vocab.idx2word.items(), total=len(tokenizer)):
        try:
            word_matrix[k] = torch.tensor(emb_model[v])
        except KeyError:
            word_matrix[k] = emb_model['<unk>']
            total_unk += 1
    return word_matrix, total_unk


def load_tokenizer(args):
    """Restore a previously saved ``Tokenizer``.

    Deserializes the vocabulary from ``args.vocab_path`` and returns the
    ready-to-use tokenizer instance.
    """
    tok = Tokenizer()
    tok.load(args.vocab_path)
    return tok


if __name__ == '__main__':
    # Parse CLI arguments for the vocab-alignment job and log verbosely.
    args = get_vocab_alignment_params()
    logger = create_logger(level='debug')

    # Tokenizer holding the target vocabulary (loaded from args.vocab_path).
    tok = load_tokenizer(args)
    # Pre-trained embedding lookup — exact semantics depend on loadEmbModel.
    embeddings = loadEmbModel(args.emb_path, logger)

    # Build an embedding matrix row-aligned with the tokenizer's vocabulary;
    # also returns how many vocab entries fell back to the <unk> vector.
    matrix, unk_count = align_vocabs(embeddings, tok)
    logger.info(f'Finished. Total UNK: {unk_count}')

    # Persist the aligned matrix for later model initialization.
    torch.save(matrix, args.outpath)
    logger.info(f'Saved into: {args.outpath}')