from tqdm import tqdm from pathlib import Path import params from run import load_model, get_tokenizers from retrieval.data.loaders import get_loader from retrieval.model import model from retrieval.train.train import Trainer from retrieval.utils import file_utils, helper from retrieval.utils.logger import create_logger from run import load_yaml_opts, parse_loader_name, get_data_path if __name__ == '__main__': args = params.get_test_params() opt = load_yaml_opts(args.options) logger = create_logger(level='debug' if opt.engine.debug else 'info') logger.info(f'Used args : \n{args}') logger.info(f'Used options: \n{opt}') data_path = get_data_path(opt) loaders = [] for data_info in opt.dataset.val.data: _, lang = parse_loader_name(data_info) loaders.append( get_loader(data_split=args.data_split, data_path=data_path, data_info=data_info, loader_name=opt.dataset.loader_name, local_rank=args.local_rank,
def align_vocabs(emb_model, tokenizer):
    """Build an embedding matrix aligned to the tokenizer's vocabulary.

    For every (index, word) pair in the tokenizer vocab, copy the word's
    vector from ``emb_model``; words missing from the embedding model fall
    back to the ``'<unk>'`` vector and are counted.

    Args:
        emb_model: mapping from word -> embedding vector; must contain
            an ``'<unk>'`` entry (e.g. a gensim KeyedVectors-like object).
        tokenizer: tokenizer whose ``vocab.idx2word`` maps index -> word
            and whose ``len()`` is the vocab size.

    Returns:
        (word_matrix, total_unk): the (vocab_size, emb_dim) tensor and the
        number of out-of-vocabulary words replaced by ``'<unk>'``.
    """
    # NOTE(review): the original `def` line and the initialization of
    # `word_matrix` / `total_unk` were lost when this file was collapsed onto
    # a single line. The signature is inferred from the call site below
    # (`align_vocabs(emb_model, tokenizer)`); the initializers are a minimal
    # reconstruction — TODO confirm against the original source.
    emb_dim = len(emb_model['<unk>'])
    word_matrix = torch.zeros(len(tokenizer), emb_dim)
    total_unk = 0
    for k, v in tqdm(tokenizer.vocab.idx2word.items(), total=len(tokenizer)):
        try:
            word_matrix[k] = torch.tensor(emb_model[v])
        except KeyError:
            # Out-of-vocabulary word: substitute the '<unk>' vector.
            word_matrix[k] = emb_model['<unk>']
            total_unk += 1
    return word_matrix, total_unk


def load_tokenizer(args):
    """Load and return a Tokenizer from the vocab file at ``args.vocab_path``."""
    tokenizer = Tokenizer()
    tokenizer.load(args.vocab_path)
    return tokenizer


if __name__ == '__main__':
    args = get_vocab_alignment_params()
    logger = create_logger(level='debug')

    # Load vocabulary and pretrained embedding model, then align them.
    tokenizer = load_tokenizer(args)
    emb_model = loadEmbModel(args.emb_path, logger)

    word_matrix, total_unk = align_vocabs(emb_model, tokenizer)
    logger.info(f'Finished. Total UNK: {total_unk}')

    # Persist the aligned embedding matrix for later use as an nn.Embedding init.
    torch.save(word_matrix, args.outpath)
    logger.info(f'Saved into: {args.outpath}')