def __init__(self, from_model_name, from_model_checkpt, to_model_name, to_model_checkpt,
             is_load_from_github=True, tokenzier_name='moses', bpe_name='fastbpe', device='cuda'):
    """Load the two fairseq translation models used by this object.

    Args:
        from_model_name: ``torch.hub`` entry name when ``is_load_from_github``
            is true, otherwise the local directory of the first model.
        from_model_checkpt: Checkpoint file name of the first model.
        to_model_name: Same as ``from_model_name``, for the second model.
        to_model_checkpt: Checkpoint file name of the second model.
        is_load_from_github: When true both models are fetched through
            ``torch.hub`` from ``pytorch/fairseq``; otherwise they are read
            from local directories.
        tokenzier_name: Tokenizer name forwarded to fairseq.
        bpe_name: BPE name forwarded to fairseq.
        device: Forwarded to the parent constructor.

    Raises:
        ModuleNotFoundError: fairseq cannot be imported.
        ValueError: A local model could not be loaded (fairseq raised
            ``TypeError``); the original error is chained as the cause.
    """
    super().__init__(device, temperature=None, top_k=None, top_p=None)

    try:
        from fairseq.models.transformer import TransformerModel
    except ModuleNotFoundError as err:
        raise ModuleNotFoundError(
            'Missed fairseq library. Install fairseq by https://github.com/pytorch/fairseq'
        ) from err

    self.from_model_name = from_model_name
    self.from_model_checkpt = from_model_checkpt
    self.to_model_name = to_model_name
    self.to_model_checkpt = to_model_checkpt
    self.is_load_from_github = is_load_from_github
    self.tokenzier_name = tokenzier_name
    self.bpe_name = bpe_name

    def _load_from_hub(model_name, checkpoint):
        # Fetch a published model through torch.hub.
        return torch.hub.load(
            github='pytorch/fairseq', model=model_name,
            checkpoint_file=checkpoint,
            tokenizer=tokenzier_name, bpe=bpe_name)

    def _load_from_disk(model_dir, checkpoint):
        # Load a model from a local directory, turning fairseq's TypeError
        # into a ValueError that lists the parameters worth checking.
        try:
            return TransformerModel.from_pretrained(
                # join with '' guarantees a trailing path separator
                model_name_or_path=os.path.join(model_dir, ''),
                checkpoint_file=checkpoint,
                tokenizer=tokenzier_name, bpe=bpe_name)
        except TypeError as err:
            err_msg = (
                'Cannot load model from local path. You may check the following parameters are correct or not.'
                f' Model Directory: {model_dir}'
                f', Checkpoint File Name: {checkpoint}'
                f', Tokenizer Name: {tokenzier_name}'
                f', BPE Name: {bpe_name}'
            )
            raise ValueError(err_msg) from err

    load_model = _load_from_hub if is_load_from_github else _load_from_disk
    self.from_model = load_model(from_model_name, from_model_checkpt)
    self.to_model = load_model(to_model_name, to_model_checkpt)

    self.from_model.eval()
    self.to_model.eval()
    # self.device is not assigned in this method; presumably the parent
    # constructor sets it from `device` -- TODO confirm.
    if self.device == 'cuda':
        self.from_model.cuda()
        self.to_model.cuda()
def translate(model_dir, in_file, out_file, batch_size, model_name, num_shards,
              shard_id, moses, spiece, lenpen, beam, ):
    """Translate one shard of ``in_file`` and write one hypothesis per line.

    The model is loaded from ``model_dir`` with either a SentencePiece model
    (``spiece`` truthy, file ``spiece.model``) or subword-nmt codes (file
    ``bpecodes``). Moses tokenization is enabled when ``moses`` is truthy.
    The line range of the shard comes from ``get_line_ids``. Sentences are
    translated in batches of ``batch_size`` on the GPU and all outputs are
    written to ``out_file`` at the end.
    """
    tokenizer = 'moses' if moses else None
    if spiece:
        bpe_options = {
            'bpe': 'sentencepiece',
            'sentencepiece_model': os.path.join(model_dir, 'spiece.model'),
        }
    else:
        bpe_options = {
            'bpe': 'subword_nmt',
            'bpe_codes': os.path.join(model_dir, 'bpecodes'),
        }
    model = TransformerModel.from_pretrained(
        model_dir,
        checkpoint_file=model_name,
        data_name_or_path=model_dir,
        tokenizer=tokenizer,
        **bpe_options,
    )

    start_id, end_id = get_line_ids(in_file, num_shards, shard_id)
    print(start_id, end_id)
    model.cuda()

    # Keep only the lines that belong to this shard.
    with open(in_file) as fin:
        src_sents = [
            raw.strip()
            for line_no, raw in enumerate(fin)
            if start_id <= line_no < end_id
        ]

    # Ceiling division: the last batch may be shorter.
    nb_batches = (len(src_sents) + batch_size - 1) // batch_size
    outputs = []
    for batch_no in range(nb_batches):
        print('Batch ID: {}/{}'.format(batch_no, nb_batches))
        first = batch_no * batch_size
        chunk = src_sents[first:first + batch_size]
        outputs.extend(model.translate(chunk, lenpen=lenpen, beam=beam))

    with open(out_file, 'wt') as fout:
        for hypothesis in outputs:
            fout.write(hypothesis)
            fout.write('\n')
def load(self, device: str):
    """
    Load user-selected task-specific model

    Only configurations whose ``n_model`` contains ``"transformer.large"``
    are handled; for any other value the method returns ``None``.

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    # Unused in this method; kept because the original imported it here.
    from pororo.tasks import PororoPosFactory  # noqa: F401

    if "transformer.large" not in self.config.n_model:
        return None

    from fairseq.models.transformer import TransformerModel

    def _load_pickle(resource: str):
        # Resolve the resource to a local path and unpickle it, closing the
        # file afterwards (the original left the handles open).
        # NOTE: pickle can execute arbitrary code while loading; this is
        # only safe if the files behind download_or_load are trusted.
        path = download_or_load(resource, self.config.lang)
        with open(path, "rb") as handle:
            return pickle.load(handle)

    load_dict = download_or_load(
        f"transformer/{self.config.n_model}",
        self.config.lang,
    )

    model = (TransformerModel.from_pretrained(
        model_name_or_path=load_dict.path,
        checkpoint_file=f"{self.config.n_model}.pt",
        data_name_or_path=load_dict.dict_path,
        source_lang=load_dict.src_dict,
        target_lang=load_dict.tgt_dict,
    ).eval().to(device))

    morph2idx = _load_pickle(f"misc/morph2idx.{self.config.lang}.pkl")
    tag2idx = _load_pickle(f"misc/tag2idx.{self.config.lang}.pkl")
    # The pickled tuple has four entries; the last one is not used here.
    query2origin, query2meaning, query2eng, _ = _load_pickle(
        f"misc/wsd-dicts.{self.config.lang}.pkl")

    return PororoTransformerWsd(
        model,
        morph2idx,
        tag2idx,
        query2origin,
        query2meaning,
        query2eng,
        self.config,
    )
def initialize(self, context):
    """Load the translation model described by the serving ``context``.

    Reads ``model_dir`` and ``gpu_id`` from ``context.system_properties``,
    loads ``setup_config.json`` from the model directory when it exists,
    then loads ``model.pt`` with the Moses tokenizer and the BPE scheme named
    by ``setup_config["bpe"]``. ``self.initialized`` becomes ``True`` only
    after the model is on the target device and in eval mode.

    Args:
        context: Serving context exposing ``manifest`` and
            ``system_properties``.
    """
    self._context = context
    # Not ready until the model is loaded; the original set this to True
    # up front, which reported a failed load as initialized.
    self.initialized = False
    self.manifest = context.manifest
    properties = context.system_properties
    model_dir = properties.get("model_dir")

    # Use the GPU only when CUDA is available AND a gpu_id was supplied;
    # "cuda:None" is not a valid device string.
    gpu_id = properties.get("gpu_id")
    if torch.cuda.is_available() and gpu_id is not None:
        self.device = torch.device("cuda:" + str(gpu_id))
    else:
        self.device = torch.device("cpu")

    # read configs for the model_name, bpe etc. from setup_config.json
    setup_config_path = os.path.join(model_dir, "setup_config.json")
    if os.path.isfile(setup_config_path):
        with open(setup_config_path) as setup_config_file:
            self.setup_config = json.load(setup_config_file)
    else:
        # NOTE(review): self.setup_config is not assigned on this path, so
        # the lookup below fails unless it is defined elsewhere -- confirm.
        logger.warning('Missing the setup_config.json file.')

    # load the model
    self.model = TransformerModel.from_pretrained(
        model_dir,
        checkpoint_file='model.pt',
        data_name_or_path=model_dir,
        tokenizer='moses',
        bpe=self.setup_config["bpe"])
    self.model.to(self.device)
    self.model.eval()
    self.initialized = True
def transformer(*args, **kwargs):
    """
    Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017)
    <https://arxiv.org/abs/1706.03762>`_.

    An interactive-generation parser is created and passed as the first
    argument to ``TransformerModel.from_pretrained``, followed by the
    caller's positional and keyword arguments. The loaded model is returned.
    """
    return TransformerModel.from_pretrained(
        options.get_interactive_generation_parser(), *args, **kwargs)
def __init__(self):
    """Load the ar->zh and zh->ar models and the Farasa segmenter.

    Both models share the same checkpoint file name, binarized data
    directory and subword-nmt codes; only the checkpoint directory differs.
    """
    self.langs = ["zh -> ar", "ar -> zh", "ar -> fr", "fr -> ar"]

    # Options common to both translation directions.
    shared_options = dict(
        checkpoint_file='checkpoint_best.pt',
        data_name_or_path='data-bin',
        bpe='subword_nmt',
        bpe_codes='data-bin/code',
    )
    self.model_ar2zh = TransformerModel.from_pretrained(
        "checkpoints-ar2zh", **shared_options)
    self.model_zh2ar = TransformerModel.from_pretrained(
        "checkpoints-zh2ar", **shared_options)

    self.segmenter = FarasaSegmenter(interactive=True)
    self.models = dict(ar2zh=self.model_ar2zh, zh2ar=self.model_zh2ar)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Only configurations whose ``n_model`` contains ``"multi"`` are handled;
    any other value yields ``None``.

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    from pororo.tasks import PororoTokenizationFactory

    def sent_tokenizer(text, lang):
        # Build the sentence tokenizer for `lang` on demand and apply it.
        factory = PororoTokenizationFactory(
            task="tokenization",
            lang=lang,
            model=f"sent_{lang}",
        )
        return factory.load(device).predict(text)

    if "multi" not in self.config.n_model:
        return None

    from fairseq.models.transformer import TransformerModel

    from pororo.tasks.utils.tokenizer import CustomTokenizer

    load_dict = download_or_load(
        f"transformer/{self.config.n_model}",
        self.config.lang,
    )

    pretrained = TransformerModel.from_pretrained(
        model_name_or_path=load_dict.path,
        checkpoint_file=f"{self.config.n_model}.pt",
        data_name_or_path=load_dict.dict_path,
        source_lang=load_dict.src_dict,
        target_lang=load_dict.tgt_dict,
    )
    model = pretrained.eval().to(device)

    tokenizer = CustomTokenizer.from_file(
        vocab_filename=f"{load_dict.src_tok}/vocab.json",
        merges_filename=f"{load_dict.src_tok}/merges.txt",
    )

    # The language-token style is chosen from the model name;
    # "mtpg" takes precedence over "m2m".
    langtok_style = "basic"
    if "mtpg" in self.config.n_model:
        langtok_style = "mbart"
    elif "m2m" in self.config.n_model:
        langtok_style = "multilingual"

    return PororoTransformerTransMulti(
        model,
        self.config,
        tokenizer,
        sent_tokenizer,
        langtok_style,
    )
def load_model(seed):
    """
    Load the checkpoint trained with the given seed (an integer).

    The model is read from ``work/checkpoints_seed<seed>``, moved to the
    module-level ``device`` and switched to eval mode before being returned.
    """
    checkpoint_dir = f'work/checkpoints_seed{seed!s}'
    loaded = TransformerModel.from_pretrained(
        checkpoint_dir,
        checkpoint_file='checkpoint_best.pt',
        data_name_or_path='work/processed_data/fairseq_preprocessed_data',
    )
    model = loaded.to(device)
    model.eval()
    return model
def english_to_french(text):
    """Translate ``text`` with the bundled wmt14 en-fr model.

    The current working directory is printed first because the model paths
    are relative. The model is loaded on every call.
    """
    print("path ---->", os.getcwd())
    model_dir = 'translator_app/core/pretrained_models/wmt14.en-fr.fconv-py'
    translator = TransformerModel.from_pretrained(
        model_dir,
        checkpoint_file='model.pt',
        data_name_or_path=model_dir,
        bpe='fastbpe',
        bpe_codes=model_dir + '/bpecodes',
    )
    return translator.translate(text)
def load_smi_to_iupac_model():
    """Load the smi -> iupac model in eval mode and return it.

    The absolute current directory is passed as the model path; checkpoint,
    binarized data and BPE codes are located under the module-level
    ``root_dir``.
    """
    working_dir = str(Path().absolute())
    checkpoint = f'{root_dir}/checkpoints/checkpoint_best.pt'
    data_dir = f'{root_dir}/data-bin/smi_iupac.smi-iupac/'
    codes = f'{root_dir}/preprocess/smi_iupac/code'

    model = TransformerModel.from_pretrained(
        working_dir,
        checkpoint_file=checkpoint,
        data_name_or_path=data_dir,
        bpe='subword_nmt',
        bpe_codes=codes,
    )
    model.eval()
    print('Load the model OK!')
    return model
def load(self, device: str):
    """
    Load user-selected task-specific model

    ``"charbert"`` models give a spacing corrector. ``"transformer"`` models
    give a character-level corrector when the name also contains ``"char"``,
    otherwise a corrector that may carry a subword tokenizer. Any other
    model name yields ``None``.

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    n_model = self.config.n_model

    if "charbert" in n_model:
        from pororo.models.brainbert import CharBrainRobertaModel

        bert = CharBrainRobertaModel.load_model(
            f"bert/{n_model}",
            self.config.lang,
        )
        model = bert.eval().to(device)
        print(
            "As of now, this beta model tries to correct spacing errors in Korean text."
        )
        return PororoBertSpacing(model, self.config)

    if "transformer" not in n_model:
        return None

    from fairseq.models.transformer import TransformerModel

    from pororo.tasks.utils.tokenizer import CustomTokenizer

    load_dict = download_or_load(
        f"transformer/{n_model}",
        self.config.lang,
    )

    pretrained = TransformerModel.from_pretrained(
        model_name_or_path=load_dict.path,
        checkpoint_file=f"{n_model}.pt",
        data_name_or_path=load_dict.dict_path,
        source_lang=load_dict.src_dict,
        target_lang=load_dict.tgt_dict,
    )
    model = pretrained.eval().to(device)

    if "char" in n_model:
        return PororoTransformerGecChar(model, self.config)

    # A tokenizer is built only when the download provides one.
    tokenizer = None
    if load_dict.src_tok:
        tokenizer = CustomTokenizer.from_file(
            vocab_filename=f"{load_dict.src_tok}/vocab.json",
            merges_filename=f"{load_dict.src_tok}/merges.txt",
        )

    return PororoTransformerGec(model, tokenizer, device, self.config)
def __init__(self, model_dir, model_file, tokenizer='moses', bpe='subword_nmt', use_cuda=True):
    """Load a fairseq model and optionally move it to the GPU.

    Args:
        model_dir: First positional argument to ``from_pretrained``.
        model_file: Second positional argument to ``from_pretrained``.
        tokenizer: Tokenizer name forwarded to fairseq.
        bpe: BPE name forwarded to fairseq.
        use_cuda: Move the model to the GPU when CUDA is also available.
    """
    preprocessing = {'tokenizer': tokenizer, 'bpe': bpe}
    self.model = TransformerModel.from_pretrained(
        model_dir, model_file, **preprocessing)
    if not use_cuda:
        return
    if torch.cuda.is_available():
        self.model.cuda()
def __init__(self):
    """Load one fairseq model per supported dialect.

    The base directory comes from the ``G2P_MODEL_DIR`` environment variable
    and falls back to ``/data/models/g2p/fairseq/``. Loaded models are
    stored in ``self.dialect_models`` keyed by dialect name.
    """
    self.possible_dialects = ['standard', 'north', 'north_east', 'south']
    self.dialect_models = {}
    model_dir = os.getenv("G2P_MODEL_DIR", "/data/models/g2p/fairseq/")

    # Select the paths based on dialect
    for dialect in self.possible_dialects:
        data_dir = f'{model_dir}/data-bin/{dialect}'
        checkpoint_file = (
            f'{model_dir}/checkpoints/{dialect}'
            '-256-.3-s-s/checkpoint_last.pt'
        )
        dialect_model = TransformerModel.from_pretrained(
            data_dir, checkpoint_file)
        self.dialect_models[dialect] = dialect_model
def backtranslation_using_en_de_model(args):
    """Augment the training set by en -> de -> en back-translation.

    Seeds the Python, NumPy and torch RNGs from ``args.seed``, loads the
    WMT19 en-de and de-en single models from ``args.cache``, translates
    ``text_a`` of every training example to German and back, and writes
    ``label<TAB>back_translation`` rows to ``<output_dir>/bt_aug.tsv``.

    Args:
        args: Namespace providing ``task_name``, ``output_dir``, ``seed``,
            ``data_dir`` and ``cache``.
    """
    task_name = args.task_name
    os.makedirs(args.output_dir, exist_ok=True)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.deterministic = True

    processor = get_task_processor(task_name, args.data_dir)
    # load train data
    train_examples = processor.get_train_examples()

    # load the translation models; they stay on the device fairseq loads
    # them to (no explicit .to(device) call)
    en_de_model = TransformerModel.from_pretrained(
        os.path.join(args.cache, "wmt19.en-de.joined-dict.single_model"),
        checkpoint_file="model.pt",
        tokenizer='moses',
        bpe='fastbpe')
    de_en_model = TransformerModel.from_pretrained(
        os.path.join(args.cache, "wmt19.de-en.joined-dict.single_model"),
        checkpoint_file="model.pt",
        tokenizer='moses',
        bpe='fastbpe')

    save_train_path = os.path.join(args.output_dir, "bt_aug.tsv")
    # The context manager guarantees the rows are flushed and the file is
    # closed (the original never closed it); newline='' is what the csv
    # module requires of files handed to csv.writer.
    with open(save_train_path, 'w', newline='') as save_train_file:
        tsv_writer = csv.writer(save_train_file, delimiter='\t')
        for example in train_examples:
            text = example.text_a
            de_example = en_de_model.translate(text, remove_bpe=True)
            back_translated_example = de_en_model.translate(
                de_example, remove_bpe=True)
            tsv_writer.writerow([example.label, back_translated_example])
def load(self, device: str):
    """
    Load user-selected task-specific model

    Only ``"transformer"`` models are handled. The POS tagger paired with
    the translation model depends on the configured language (``ko``, ``en``
    or ``zh``); any other combination yields ``None``.

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "transformer" not in self.config.n_model:
        return None

    from fairseq.models.transformer import TransformerModel

    from pororo.tasks import PororoPosFactory

    load_dict = download_or_load(
        f"transformer/{self.config.n_model}",
        self.config.lang,
    )

    pretrained = TransformerModel.from_pretrained(
        model_name_or_path=load_dict.path,
        checkpoint_file=f"{self.config.n_model}.pt",
        data_name_or_path=load_dict.dict_path,
        source_lang=load_dict.src_dict,
        target_lang=load_dict.tgt_dict,
    )
    model = pretrained.eval().to(device)

    # POS tagger backend used for each supported language.
    tagger_models = {"ko": "mecab-ko", "en": "nltk", "zh": "jieba"}
    lang = self.config.lang
    if lang not in tagger_models:
        return None

    tagger = PororoPosFactory(
        task="pos",
        model=tagger_models[lang],
        lang=self.config.lang,
    ).load(device)

    if lang == "ko":
        return PororoTransConstKo(model, tagger, self.config)
    if lang == "en":
        return PororoTransConstEn(model, tagger, self.config)
    return PororoTransConstZh(model, tagger, self.config)
def build_model():
    """Return ``(src_encoder, tgt_encoder)``.

    ``src_encoder`` is the pretrained ``xlm-roberta-large`` model.
    ``tgt_encoder`` is the submodule registered as ``models.0.encoder``
    inside the wmt14 en-fr fairseq transformer.
    """
    src_encoder = XLMRobertaModel.from_pretrained('xlm-roberta-large')

    model_dir = '/home/mindreese/xencoder/wmt14.en-fr.joined-dict.transformer/'
    en2fr = TransformerModel.from_pretrained(
        model_dir,
        checkpoint_file='model.pt',
        bpe='subword_nmt',
        bpe_codes=model_dir + 'bpecodes',
    )

    # Collect every module with the wanted name and take the first one.
    encoders = []
    for module_name, module in en2fr.named_modules():
        if module_name == 'models.0.encoder':
            encoders.append(module)
    tgt_encoder = encoders[0]

    return src_encoder, tgt_encoder
def __init__(self, lang: str):
    """Load the ``mbart50.ft.nn`` model with ``en_XX`` as the source language.

    Args:
        lang: Target language code passed to fairseq as ``target_lang``
            and stored on the instance.
    """
    model_dir = "mbart50.ft.nn"
    self.bart = TransformerModel.from_pretrained(
        model_dir,
        checkpoint_file="model.pt",
        data_name_or_path=model_dir,
        bpe="sentencepiece",
        sentencepiece_model=model_dir + "/sentence.bpe.model",
        lang_dict=model_dir + "/ML50_langs.txt",
        target_lang=lang,
        source_lang="en_XX",
        encoder_langtok="src",
    )
    self.bart.eval()

    # Prefer the GPU when one is available.
    if torch.cuda.is_available():
        target_device = torch.device("cuda")
    else:
        target_device = torch.device("cpu")
    self.bart.to(target_device)

    self.lang = lang
def load(self, device: str):
    """
    Load user-selected task-specific model

    ``p2g.zh`` builds a ``P2gM`` model from three downloaded files.
    ``p2g.ja`` loads a fairseq transformer. Any other model name yields
    ``None``.

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    n_model = self.config.n_model
    lang = self.config.lang

    if n_model == "p2g.zh":
        from pororo.models.p2g import P2gM

        pinyin = download_or_load(f"misc/pinyin2idx.{lang}.pkl", lang)
        char = download_or_load(f"misc/char2idx.{lang}.pkl", lang)
        ckpt = download_or_load(f"misc/{n_model}.pt", lang)
        return PororoP2GZh(P2gM(pinyin, char, ckpt, device), self.config)

    if n_model == "p2g.ja":
        from fairseq.models.transformer import TransformerModel

        load_dict = download_or_load(
            "transformer/transformer.base.ja.p2g",
            lang,
        )
        pretrained = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file="transformer.base.ja.p2g.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        )
        return PororoP2GJa(pretrained.eval().to(device), self.config)

    return None
def __init__(self):
    """Load the en -> is model on the CPU in eval mode.

    The model uses the GPT-2 style BPE whose vocabulary and merges files
    share one path prefix, with beam 5 and length penalty 0.6.
    """
    model_dir = "/data/models/eng-isl-base-v1"
    # Common prefix of the BPE vocabulary and merges files.
    bbpe_prefix = (
        "/data/models/fairseq-eng-isl-base-std-parice/eng-isl-bbpe/"
        "eng-isl-bbpe-32k/eng-isl-bbpe-32k"
    )
    options = dict(
        checkpoint_file="checkpoint.en-is.avg8.pt",
        data_name_or_path=model_dir,
        gpt2_encoder_json=bbpe_prefix + "-vocab.json",
        gpt2_vocab_bpe=bbpe_prefix + "-merges.txt",
        source_lang="en",
        target_lang="is",
        bpe="gpt2",
        beam=5,
        len_penalty=0.6,
        task="translation_with_backtranslation",
    )
    self.model = TransformerModel.from_pretrained(model_dir, **options)
    self.model.to("cpu")
    self.model.eval()
def __init__(
        self,
        src_vocab_path,
        tgt_vocab_path,
        fairseq_path="/home/acb11204eq/data/wmt14_ende_fair/wmt14_ende_fairseq"
):
    """Build the vocabulary maps and load the underlying transformer.

    Args:
        src_vocab_path: Source vocabulary file, mapped against
            ``<fairseq_path>/dict.src.txt`` by ``build_vocab_map``.
        tgt_vocab_path: Target vocabulary file, mapped against
            ``<fairseq_path>/dict.tgt.txt``.
        fairseq_path: Directory holding the fairseq dictionaries and
            ``checkpoint.pt``.
    """
    src_dict_path = f"{fairseq_path}/dict.src.txt"
    self.src_vmap = self.build_vocab_map(src_vocab_path, src_dict_path)
    tgt_dict_path = f"{fairseq_path}/dict.tgt.txt"
    self.tgt_vmap = self.build_vocab_map(tgt_vocab_path, tgt_dict_path)

    hub_model = TransformerModel.from_pretrained(
        fairseq_path,
        checkpoint_file=f"{fairseq_path}/checkpoint.pt",
        data_name_or_path=fairseq_path,
    )
    if torch.cuda.is_available():
        hub_model.cuda()

    # Keep only the first entry of the "models" submodule, with training
    # mode switched off.
    self.transformer = hub_model._modules["models"][0]
    self.transformer.train(False)
def __init__(self):
    """Create the English Moses tokenizer and load the EN->TH model.

    The model files are installed with ``_download_install`` before the
    checkpoint and vocabulary locations are resolved.
    """
    self._tokenizer = MosesTokenizer("en")
    self._model_name = _EN_TH_MODEL_NAME
    _download_install(self._model_name)

    models_dir = _get_translate_path(
        self._model_name, _EN_TH_FILE_NAME, "models")
    vocab_dir = _get_translate_path(
        self._model_name, _EN_TH_FILE_NAME, "vocab")
    self._model = TransformerModel.from_pretrained(
        model_name_or_path=models_dir,
        checkpoint_file="checkpoint.pt",
        data_name_or_path=vocab_dir,
    )
def load_translate(dataset, testset):
    """Translate a BPE-encoded test set and return hypotheses and references.

    Loads ``./checkpoint/<dataset>/checkpoint_best.pt`` with the binarized
    data of ``testset``, translates every line of
    ``intermediate_datasets/BPE/<testset>/test.en`` on the GPU and writes the
    hypotheses, one per line, to
    ``generation_results/<dataset>on<testset>.txt``.

    Args:
        dataset: Name of the checkpoint directory under ``./checkpoint``.
        testset: Name of the test set directory.

    Returns:
        tuple: ``(hyp_lines, ref_lines)`` where ``ref_lines`` are the lines
        of ``datasets/<testset>/test.ta``.
    """
    model = TransformerModel.from_pretrained(
        f'./checkpoint/{dataset}',
        checkpoint_file='checkpoint_best.pt',
        data_name_or_path=f'data-bin/{testset}',
        bpe='sentencepiece',
        sentencepiece_model='./bpe_model/ta.wiki.bpe.vs50000.model')
    model.eval()
    # The original called `en2de.cuda()`, a name that does not exist in this
    # function; the model loaded above is the one that has to move.
    model.cuda()

    with open(f'intermediate_datasets/BPE/{testset}/test.en') as f:
        src_sentences = f.read().splitlines()
    with open(f'datasets/{testset}/test.ta') as f:
        ref_lines = f.read().splitlines()

    # tqdm only wraps the iterable to show progress.
    hyp_lines = model.translate(tqdm(src_sentences))

    with open(f'generation_results/{dataset}on{testset}.txt', 'w') as f:
        f.writelines(f'{sentence}\n' for sentence in hyp_lines)

    return hyp_lines, ref_lines
def __init__(self):
    """Install and load the TH->EN model with its SentencePiece model.

    Checkpoint, vocabulary and ``spm.th.model`` locations are resolved
    through ``_get_translate_path`` after ``_download_install`` has run.
    """
    self._model_name = _TH_EN_MODEL_NAME
    _download_install(self._model_name)

    models_dir = _get_translate_path(
        self._model_name, _TH_EN_FILE_NAME, "models")
    vocab_dir = _get_translate_path(
        self._model_name, _TH_EN_FILE_NAME, "vocab")
    spm_model = _get_translate_path(
        self._model_name, _TH_EN_FILE_NAME, "bpe", "spm.th.model")

    self._model = TransformerModel.from_pretrained(
        model_name_or_path=models_dir,
        checkpoint_file="checkpoint.pt",
        data_name_or_path=vocab_dir,
        bpe="sentencepiece",
        sentencepiece_model=spm_model,
    )
def translate(input_file, output_file, device, folder, beam_size=3,
              batch_size=256, replace_unk=False):
    """Translate ``input_file`` line by line into ``output_file``.

    Lines are read in chunks of ``batch_size`` and translated with the
    ``checkpoint_best.pt`` model found in ``folder``. Each source line and
    its translation are echoed to stdout, and the translation is written to
    the output file.

    Args:
        input_file: Path of the text file to translate.
        output_file: Path of the file receiving one translation per line.
        device: Device the model is moved to.
        folder: Directory containing ``checkpoint_best.pt``.
        beam_size: Beam size forwarded to fairseq.
        batch_size: Number of lines translated per call.
        replace_unk: When true, unknown-token markers are removed from the
            translations.
    """
    translator = TransformerModel.from_pretrained(
        folder, checkpoint_file='checkpoint_best.pt', beam=beam_size)
    translator.to(device)
    translator.eval()

    # Both files are closed (and the output flushed) even when translation
    # fails part-way; the original never closed either handle.
    with open(input_file, "r") as input_f, open(output_file, "w") as output_f:
        for batch in tqdm(chunked(input_f, batch_size)):
            for src, sentence in zip(batch, translator.translate(batch)):
                if replace_unk:
                    sentence = sentence.replace("<unk>", "")
                    sentence = sentence.replace("▁< unk >", "")
                    # NOTE(review): as written this swaps a space for a
                    # space (no-op); the literal may originally have been a
                    # double or non-breaking space -- confirm.
                    sentence = sentence.replace(" ", " ")
                print("Source text: {}".format(src.strip()))
                print("Translation text: {}".format(sentence))
                print(sentence, file=output_f)
def main():
    """
    Translate a TSV file from English to German.

    Give the path of the source file (in tsv format) as an argument on the
    command line: "run translator --source-file=...". The target file, where
    the German output is saved, is hard-coded in ``TARGET_FILE``.

    Raises:
        Exception: the source path is not an existing ``.tsv`` file.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--source-file', required=True)
    args = parser.parse_args()
    source_file = args.source_file

    # check if source file is a valid file
    # (the original referenced the undefined name `source_file_tsv` here,
    # so every invalid path ended in a NameError instead of this message)
    if not os.path.isfile(source_file) or not source_file.endswith('.tsv'):
        raise Exception(f"{source_file} is no valide file")

    en2de = TransformerModel.from_pretrained(
        f'{DATA_DIR}',
        checkpoint_file=f'{DATA_DIR}/model4.pt',
        data_name_or_path=f'{DATA_DIR}',
        bpe='fastbpe',
        bpe_codes=f'{DATA_DIR}/bpecodes',
        tokenizer='moses')

    lines_en = convert_tsv_lines_utf8_en_de(source_file)

    # newline='' is what the csv module requires of files given to a writer.
    with open(TARGET_FILE, 'w', newline='') as target_tsv:
        target_tsv_writer = csv.writer(target_tsv, delimiter='\t')
        for line in lines_en:
            # Translate every cell of the row, keeping the column order.
            new_line_de = [en2de.translate(text_en) for text_en in line]
            target_tsv_writer.writerow(new_line_de)

    print('SUCCESS!')
""" Use AIResearch MT model easily """ import os os.system('pip install sentencepiece') os.system('pip install git+https://github.com/pytorch/fairseq@6f6461b') from fairseq.models.transformer import TransformerModel # download model url = 'https://github.com/vistec-AI/model-releases/releases/download/SCB_1M%2BTBASE_v1.0/SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz' os.system(f'curl -L {url} | tar xz') model = TransformerModel.from_pretrained( model_name_or_path= 'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0/models/', checkpoint_file='checkpoint.pt', data_name_or_path= 'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0/vocab/', bpe='sentencepiece', sentencepiece_vocab= 'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.th.model') # function en2th.translate translate = model.translate
def main():
    """Paraphrase input lines by round-trip translation (en -> fr -> en).

    Each non-empty input line is translated to French once and then back to
    English once per expert of the mixture-of-experts fr2en model. Every
    resulting paraphrase is printed on its own line.

    Raises:
        RuntimeError: the translation_moe user directory does not exist.
    """
    cli = argparse.ArgumentParser(description='')
    cli.add_argument('--en2fr', required=True, help='path to en2fr model')
    cli.add_argument('--fr2en', required=True,
                     help='path to fr2en mixture of experts model')
    cli.add_argument(
        '--user-dir',
        help='path to fairseq examples/translation_moe/src directory')
    cli.add_argument('--num-experts', type=int, default=10,
                     help='(keep at 10 unless using a different model)')
    cli.add_argument('files', nargs='*', default=['-'],
                     help='input files to paraphrase; "-" for stdin')
    args = cli.parse_args()

    if args.user_dir is None:
        # Default: <examples>/translation_moe/src, two levels above this file.
        this_file = os.path.abspath(__file__)
        examples_dir = os.path.dirname(os.path.dirname(this_file))  # examples/
        args.user_dir = os.path.join(examples_dir, 'translation_moe', 'src')

    if not os.path.exists(args.user_dir):
        raise RuntimeError(
            'cannot find fairseq examples/translation_moe/src '
            '(tried looking here: {})'.format(args.user_dir))
    logging.info('found user_dir:' + args.user_dir)

    logging.info('loading en2fr model from:' + args.en2fr)
    en2fr = TransformerModel.from_pretrained(
        model_name_or_path=args.en2fr,
        tokenizer='moses',
        bpe='sentencepiece',
    ).eval()

    logging.info('loading fr2en model from:' + args.fr2en)
    fr2en = TransformerModel.from_pretrained(
        model_name_or_path=args.fr2en,
        tokenizer='moses',
        bpe='sentencepiece',
        user_dir=args.user_dir,
        task='translation_moe',
    ).eval()

    def gen_paraphrases(en):
        # One French pivot, then one English rendering per expert.
        fr = en2fr.translate(en)
        paraphrases = []
        for expert in range(args.num_experts):
            paraphrases.append(
                fr2en.translate(fr, inference_step_args={'expert': expert}))
        return paraphrases

    logging.info('Type the input sentence and press return:')
    for raw_line in fileinput.input(args.files):
        sentence = raw_line.strip()
        if not sentence:
            continue
        for paraphrase in gen_paraphrases(sentence):
            print(paraphrase)
def run(self):
    """Score every parallel sentence with each context-aware model.

    For each document returned by ``self.load()`` and each sentence pair
    whose score reaches ``self.score_threhold``, the source sentence is
    translated by every model in ``self.context_aware_translation_models``
    (keyed by context bias). For each (sentence, bias) the negated BLEU of
    the translation and the criterion's ``loss`` and ``nll_loss`` are
    collected. The nested result
    ``{doc_id: {sent_id: {"bleu" | "loss" | "nll_loss": {bias: value}}}}``
    is passed to ``self.dump``.
    """

    def tokenize_for_bleu(target):
        # Undo the SentencePiece segmentation; for Japanese, re-segment with
        # MeCab and keep the first tab-separated field of every output line
        # except the last two.
        target = tokenizer.decode_pieces(target.split())
        if self.target_lang == "ja":
            target = " ".join(
                map(
                    lambda x: x.split("\t")[0],
                    tagger.parse(target).split("\n")[:-2],
                ))
        return target

    docs = self.load()
    tagger = MeCab.Tagger()
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load(self.context_aware_sentencepiece_model)

    # One half-precision, GPU-resident, eval-mode model per context bias.
    translation_models = {}
    for bias, path in self.context_aware_translation_models.items():
        base_path, checkpoint_path = os.path.split(path)
        model = (TransformerModel.from_pretrained(
            base_path, checkpoint_file=checkpoint_path).half().cuda().eval())
        model.args.max_source_positions = self.max_source_positions
        model.args.max_target_positions = self.max_target_positions
        translation_models[int(bias)] = model

    # args, task and criterion are taken from the model stored under key -1
    # (the bias for which no context sentence is prepended below) and are
    # reused for every model.
    args = translation_models[-1].args
    task = translation_models[-1].task
    criterion = task.build_criterion(args)

    results = collections.defaultdict(dict)
    for doc_id, doc in tqdm.tqdm(docs.items(), total=len(docs)):
        # Sentence ids whose alignment score passes the threshold.
        parallel_doc = set([
            sent_id for sent_id, score in doc["pairs"]
            if score >= self.score_threhold
        ])
        # batches[context_bias][sent_id] -> source string, or None if skipped
        batches = collections.defaultdict(dict)
        targets = {}
        for sent_id in parallel_doc:
            source, target = [
                tokenizer.encode_as_pieces(doc[lang][sent_id])
                for lang in (self.source_lang, self.target_lang)
            ]
            # Indices of earlier sentences with a non-empty source side.
            available_index = [
                index for index in range(0, sent_id)
                if doc[self.source_lang][index]
            ]
            # context_bias is the parameter which the model is trained with.
            # context_sent_index is the index of the actual used contextual
            # sentence.
            targets[sent_id] = " ".join(target)
            for context_bias, _ in translation_models.items():
                context_sent_index = None
                if context_bias != -1:
                    if len(available_index) < context_bias:
                        # NOTE(review): -1 selects the LAST sentence of the
                        # document as context here -- confirm this is the
                        # intended fallback.
                        context_sent_index = -1
                    else:
                        context_sent_index = available_index[-context_bias]
                    source_context = tokenizer.encode_as_pieces(
                        docs[doc_id][self.source_lang][context_sent_index])
                    real_source = source_context + [CONCAT_TOKEN] + source
                else:
                    real_source = source
                # Empty or over-long inputs are recorded as None and skipped
                # when the batch is built.
                if real_source and len(
                        real_source) < self.max_source_positions:
                    source_sentence = " ".join(real_source)
                else:
                    source_sentence = None
                batches[context_bias][sent_id] = source_sentence

        # batch_results[sent_id][metric][context_bias] -> value
        batch_results = collections.defaultdict(
            lambda: collections.defaultdict(dict))
        for context_bias, batch in batches.items():
            data = [sentence for sentence in batch.values() if sentence]
            if not data:
                continue
            # Same filter and iteration order as `data`, so the two line up
            # when zipped below.
            real_targets = {
                sent_id: targets[sent_id]
                for sent_id in batch if batch[sent_id]
            }
            model = translation_models[context_bias]
            args.max_source_positions = self.max_source_positions
            args.max_target_positions = self.max_target_positions
            translated = model.translate(data)

            # Compute BLEU score
            # Make the BLEU negative to ease the results computation
            for trans, (sent_id, target) in zip(translated,
                                                real_targets.items()):
                batch_results[sent_id]["bleu"][
                    context_bias] = -sacrebleu.corpus_bleu(
                        tokenize_for_bleu(trans),
                        tokenize_for_bleu(target)).score

            # Compute loss
            src_tokens = [
                model.src_dict.encode_line(
                    real_source,
                    line_tokenizer=lambda x: x.split(),
                    add_if_not_exist=False,
                ).long() for real_source in data
            ]
            src_lengths = [tokens.numel() for tokens in src_tokens]
            tgt_tokens = [
                model.tgt_dict.encode_line(
                    target,
                    line_tokenizer=lambda x: x.split(),
                    add_if_not_exist=False,
                ).long() for target in real_targets.values()
            ]
            tgt_lengths = [tokens.numel() for tokens in tgt_tokens]
            temp_dataset = LanguagePairDataset(
                src_tokens,
                src_lengths,
                model.src_dict,
                tgt_tokens,
                tgt_lengths,
                left_pad_source=args.left_pad_source,
                left_pad_target=args.left_pad_target,
                max_source_positions=self.max_source_positions,
                max_target_positions=self.max_target_positions,
            )
            reports = collections.defaultdict(list)
            iterator = task.get_batch_iterator(
                dataset=temp_dataset,
                max_sentences=self.max_sentences,
            )
            for sample in iterator.next_epoch_itr(shuffle=False):
                # Move the tensors the criterion reads onto the GPU.
                sample["net_input"]["src_tokens"] = sample["net_input"][
                    "src_tokens"].cuda()
                sample["net_input"]["src_lengths"] = sample["net_input"][
                    "src_lengths"].cuda()
                sample["net_input"]["prev_output_tokens"] = sample[
                    "net_input"]["prev_output_tokens"].cuda()
                sample["target"] = sample["target"].cuda()
                with torch.no_grad():
                    _, _, report = criterion(model.models[0], sample, False)
                for key, value in report.items():
                    reports[key].append(value)
            # NOTE(review): this pairs the concatenated report values with
            # real_targets by position; it presumably relies on the criterion
            # reporting one value per sentence and on the iterator keeping
            # dataset order -- confirm.
            for key in ("loss", "nll_loss"):
                for value, (sent_id, _) in zip(torch.cat(reports[key]),
                                               real_targets.items()):
                    batch_results[sent_id][key][context_bias] = float(
                        value)
        for sent_id, value in batch_results.items():
            results[doc_id][sent_id] = value
    self.dump(dict(results))
def main():
    """Generate paraphrases by translating en -> fr -> en.

    Reads sentences from the given files (or stdin), pivots each non-empty
    one through French, and prints one English paraphrase per expert of the
    mixture-of-experts fr2en model.

    Raises:
        RuntimeError: the translation_moe user directory cannot be found.
    """
    parser = argparse.ArgumentParser(description="")
    # (flag, keyword options) pairs, registered in this order.
    argument_specs = [
        ("--en2fr", dict(required=True, help="path to en2fr model")),
        ("--fr2en", dict(required=True,
                         help="path to fr2en mixture of experts model")),
        ("--user-dir", dict(
            help="path to fairseq examples/translation_moe/src directory")),
        ("--num-experts", dict(
            type=int, default=10,
            help="(keep at 10 unless using a different model)")),
        ("files", dict(nargs="*", default=["-"],
                       help='input files to paraphrase; "-" for stdin')),
    ]
    for flag, spec in argument_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    if args.user_dir is None:
        # Fall back to <examples>/translation_moe/src next to this script.
        examples_root = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))  # examples/
        args.user_dir = os.path.join(examples_root, "translation_moe", "src")

    user_dir_found = os.path.exists(args.user_dir)
    if not user_dir_found:
        raise RuntimeError(
            "cannot find fairseq examples/translation_moe/src "
            "(tried looking here: {})".format(args.user_dir)
        )
    logging.info("found user_dir:" + args.user_dir)

    # Preprocessing options shared by both translation directions.
    preprocessing = {"tokenizer": "moses", "bpe": "sentencepiece"}

    logging.info("loading en2fr model from:" + args.en2fr)
    en2fr = TransformerModel.from_pretrained(
        model_name_or_path=args.en2fr, **preprocessing
    ).eval()

    logging.info("loading fr2en model from:" + args.fr2en)
    fr2en = TransformerModel.from_pretrained(
        model_name_or_path=args.fr2en,
        user_dir=args.user_dir,
        task="translation_moe",
        **preprocessing,
    ).eval()

    def gen_paraphrases(en):
        # Translate to French once, then back to English with each expert.
        pivot = en2fr.translate(en)
        return [
            fr2en.translate(pivot, inference_step_args={"expert": expert_id})
            for expert_id in range(args.num_experts)
        ]

    logging.info("Type the input sentence and press return:")
    for line in fileinput.input(args.files):
        stripped = line.strip()
        if stripped:
            for paraphrase in gen_paraphrases(stripped):
                print(paraphrase)
sys.exit("'BPE codes' argument missing! Should be subword-nmt created with learn_bpe.py") if len(sys.argv) > 4: input_file = os.path.abspath(sys.argv[4]) else: sys.exit("'Input text' argument missing!") if len(sys.argv) > 5: output_file = os.path.abspath(sys.argv[5]) else: sys.exit("'Output text' argument missing!") with open(input_file, 'r') as f: text = f.read().strip().splitlines() fout = open(output_file, 'x') nopuncts2puncts = TransformerModel.from_pretrained( model_path, checkpoint_file='checkpoint_best.pt', data_name_or_path=data_path, bpe='subword_nmt', bpe_codes=bpe_codes ) # Punctuate textout = nopuncts2puncts.translate(text) fout.write('\n'.join(textout)) fout.close()