def __init__(self):
    self._tokenizer = MosesTokenizer("en")
    self._model_name = _EN_TH_MODEL_NAME
    _download_install(self._model_name)
    self._model = TransformerModel.from_pretrained(
        model_name_or_path=_get_translate_path(
            self._model_name,
            _EN_TH_FILE_NAME,
            "models",
        ),
        checkpoint_file="checkpoint.pt",
        data_name_or_path=_get_translate_path(
            self._model_name,
            _EN_TH_FILE_NAME,
            "vocab",
        ),
    )
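# Hedged usage sketch: the __init__ above is assumed to belong to an
# English-to-Thai wrapper class, called EnThTranslator here for illustration
# only (a sacremoses-style tokenizer API is also an assumption):
translator = EnThTranslator()
tokens = " ".join(translator._tokenizer.tokenize("Hello world"))
print(translator._model.translate(tokens))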
def _create_model(cls, checkpoints, device, beam_size, use_fp16):
    model = TransformerModel.build_model(checkpoints.args, checkpoints.task)

    # Custom make_generation_fast_: temporarily neutralize eval() so that
    # make_generation_fast_ cannot flip the model into eval mode itself,
    # then restore the original eval/train methods.
    eval_fn, train_fn = model.eval, model.train
    model.eval = lambda: None
    model.make_generation_fast_(
        beamable_mm_beam_size=None if beam_size == 0 else beam_size,
        need_attn=True,  # --print-alignment
    )
    model.eval, model.train = eval_fn, train_fn

    if device is not None:
        torch.cuda.set_device(device)
        model = model.cuda(device)
    if use_fp16:
        model.half()
    return model
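# Hedged usage sketch: `checkpoints` above is assumed to be a small holder
# exposing .args and .task (attribute names taken from the snippet itself),
# and _create_model is presumably a classmethod on a wrapper class.
from types import SimpleNamespace

from fairseq import checkpoint_utils, tasks

# Hypothetical checkpoint path; older checkpoints store "args", newer ones "cfg".
state = checkpoint_utils.load_checkpoint_to_cpu("checkpoints/checkpoint_best.pt")
ckpt_args = state["args"]
task = tasks.setup_task(ckpt_args)
checkpoints = SimpleNamespace(args=ckpt_args, task=task)
# Called through its owning class (name not given in the snippet):
# model = MyTranslator._create_model(checkpoints, device=0, beam_size=5, use_fp16=True)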
def load_translate(dataset, testset):
    model = TransformerModel.from_pretrained(
        f'./checkpoint/{dataset}',
        checkpoint_file='checkpoint_best.pt',
        data_name_or_path=f'data-bin/{testset}',
        bpe='sentencepiece',
        sentencepiece_model='./bpe_model/ta.wiki.bpe.vs50000.model')
    model.eval()
    model.cuda()
    with open(f'intermediate_datasets/BPE/{testset}/test.en') as f:
        src_sentences = f.read().splitlines()
    with open(f'datasets/{testset}/test.ta') as f:
        ref_lines = f.read().splitlines()
    hyp_lines = model.translate(tqdm(src_sentences))
    with open(f'generation_results/{dataset}on{testset}.txt', 'w') as f:
        f.writelines(f'{sentence}\n' for sentence in hyp_lines)
    return hyp_lines, ref_lines
def __init__(self):
    self._model_name = _TH_EN_MODEL_NAME
    _download_install(self._model_name)
    self._model = TransformerModel.from_pretrained(
        model_name_or_path=_get_translate_path(
            self._model_name,
            _TH_EN_FILE_NAME,
            "models",
        ),
        checkpoint_file="checkpoint.pt",
        data_name_or_path=_get_translate_path(
            self._model_name,
            _TH_EN_FILE_NAME,
            "vocab",
        ),
        bpe="sentencepiece",
        sentencepiece_model=_get_translate_path(
            self._model_name,
            _TH_EN_FILE_NAME,
            "bpe",
            "spm.th.model",
        ),
    )
def translate(input_file,
              output_file,
              device,
              folder,
              beam_size=3,
              batch_size=256,
              replace_unk=False):
    translator = TransformerModel.from_pretrained(
        folder, checkpoint_file='checkpoint_best.pt', beam=beam_size)
    translator.to(device)
    translator.eval()
    input_f = open(input_file, "r")
    output_f = open(output_file, "w")
    for batch in tqdm(chunked(input_f, batch_size)):
        for src, sentence in zip(batch, translator.translate(batch)):
            if replace_unk:
                # Strip <unk> tokens (plain and BPE-split forms), then
                # collapse the double spaces left behind.
                sentence = sentence.replace("<unk>", "")
                sentence = sentence.replace("▁< unk >", "")
                sentence = sentence.replace("  ", " ")
            print("Source text: {}".format(src.strip()))
            print("Translation text: {}".format(sentence))
            print(sentence, file=output_f)
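# Hedged usage example of the helper above (all paths and the device are
# placeholders, not taken from the source):
translate(
    input_file="data/test.src",
    output_file="out/test.hyp",
    device="cuda:0",
    folder="checkpoints/my_model",  # directory containing checkpoint_best.pt
    beam_size=5,
    batch_size=64,
    replace_unk=True,
)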
def main(): """ Give the path of the source file (in tsv format) as an argument to the command in command line "run translator --source-file=..." Hard code the Target file, where the ouptut in german should be saved. """ parser = argparse.ArgumentParser(description='') parser.add_argument('--source-file', required=True) args = parser.parse_args() source_file = args.source_file #check if source file is a valid file if not os.path.isfile(source_file) or not source_file_tsv.endswith('.tsv'): raise Exception(f"{source_file_tsv} is no valide file") en2de = TransformerModel.from_pretrained( f'{DATA_DIR}', checkpoint_file=f'{DATA_DIR}/model4.pt', data_name_or_path=f'{DATA_DIR}', bpe='fastbpe', bpe_codes=f'{DATA_DIR}/bpecodes', tokenizer='moses') lines_en = convert_tsv_lines_utf8_en_de(source_file) with open(TARGET_FILE, 'w') as target_tsv: target_tsv_writer = csv.writer(target_tsv, delimiter='\t') for line in lines_en: new_line_de = [] for text_en in line: text_de = en2de.translate(text_en) new_line_de.append(text_de) target_tsv_writer.writerow(new_line_de) print('SUCCESS!')
import re

import sentencepiece as spm
from fairseq.models.transformer import TransformerModel

sp = spm.SentencePieceProcessor()
sp.load("models/jsec.ja.model")

ja2en = TransformerModel.from_pretrained(
    'checkpoints/98subwords/',
    checkpoint_file='checkpoint_best.pt',
    data_name_or_path='data/bin/98_subwords/')


def raw2subword(text):
    # Normalize whitespace, then segment into SentencePiece subword pieces.
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    subwords = sp.EncodeAsPieces(text)
    text = ' '.join(subwords)
    return text


def subword2raw(text):
    # Undo subword segmentation: drop spaces, turn '▁' markers back into spaces.
    text = re.sub(' ', '', text)
    text = re.sub(r'▁', ' ', text)
    text = text[1:]
    return text.capitalize()


def translate(text):
    # Full pipeline: segment the input, translate, then restore raw text.
    text = raw2subword(text)
    text = ja2en.translate(text)
    return subword2raw(text)
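# Hedged usage example of the pipeline above (the source snippet was truncated
# inside a commented-out translate() draft, so the function body above and the
# Japanese sentence here are illustrative completions):
print(translate("これはテストです。"))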
def setUp(self):
    self.task, self.parser = get_dummy_task_and_parser()
    TransformerModel.add_args(self.parser)
    self.args = self.parser.parse_args([])
    self.args.encoder_layers = 2
    self.args.decoder_layers = 1
def add_args(parser):
    # Models can override this method to add new command-line arguments.
    # Here we'll add some new command-line arguments to configure dropout
    # and the dimensionality of the embeddings and hidden states.
    parser.add_argument(
        '--model-type', type=str, default='lstm',
        help='Type of encoder and decoder to use: 1) lstm (default), 2) transformer',
    )
    parser.add_argument('--encoder-hidden-dim', type=int, default=256,
                        help="Size of encoder's hidden layer")
    parser.add_argument('--decoder-hidden-dim', type=int, default=256,
                        help="Size of decoder's hidden layer")
    parser.add_argument('--decoder-out-embed-dim', type=int, default=256,
                        help="Size of decoder's output embeddings")
    parser.add_argument('--num-of-inputs', type=int, default=1,
                        help='Number of different input item sequences')
    parser.add_argument(
        '--source-index', type=int, default=0,
        help='Index of the source among those provided as input '
             '(used for training a single task in a multi-task framework)',
    )
    parser.add_argument(
        '--target-index', type=int, default=1,
        help='Index of the target among those provided as input '
             '(used for training a single task in a multi-task framework)',
    )
    parser.add_argument(
        '--match-source-len', action='store_true', default=False,
        help='For scheduled-sampling decoding, same behavior as for fairseq-generate',
    )
    parser.add_argument(
        '--max-len-a', type=float, default=0.4,
        help='For scheduled-sampling decoding, same behavior as for fairseq-generate',
    )
    parser.add_argument(
        '--max-len-b', type=int, default=1,
        help='For scheduled-sampling decoding, same behavior as for fairseq-generate',
    )
    TransformerModel.add_args(parser)
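# Hedged usage sketch: wiring the add_args hook above into a bare argparse
# parser (fairseq's task/trainer machinery normally does this; calling it
# standalone assumes the function is reachable at module level):
import argparse

parser = argparse.ArgumentParser()
add_args(parser)
args = parser.parse_args(['--model-type', 'transformer', '--encoder-hidden-dim', '512'])
print(args.model_type, args.encoder_hidden_dim)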
import os

from fastapi import FastAPI
from pydantic import BaseModel

from fairseq.models.transformer import TransformerModel

app = FastAPI()

TH_EN_MODEL = 'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0'
EN_TH_MODEL = 'SCB_1M-MT_OPUS+TBASE_en-th_spm-spm_32000-joined_v1.0'

th2en = TransformerModel.from_pretrained(
    model_name_or_path=os.path.join(TH_EN_MODEL, 'models'),
    checkpoint_file='checkpoint.pt',
    data_name_or_path=os.path.join(TH_EN_MODEL, 'vocab'),
    bpe='sentencepiece',
    sentencepiece_vocab=os.path.join(TH_EN_MODEL, 'bpe', 'spm.th.model')
)

en2th = TransformerModel.from_pretrained(
    model_name_or_path=os.path.join(EN_TH_MODEL, 'models'),
    checkpoint_file='checkpoint.pt',
    data_name_or_path=os.path.join(EN_TH_MODEL, 'vocab'),
    bpe='sentencepiece',
    sentencepiece_vocab=os.path.join(EN_TH_MODEL, 'bpe', 'spm.en.model')
)


class Request(BaseModel):
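    # Hedged completion: the source snippet is truncated right after the class
    # header; the field below and the endpoints that follow are illustrative
    # assumptions, not from the source.
    text: str


@app.post("/translate/th2en")
def translate_th2en(req: Request):
    return {"translation": th2en.translate(req.text)}


@app.post("/translate/en2th")
def translate_en2th(req: Request):
    return {"translation": en2th.translate(req.text)}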
def add_args(parser): TransformerModel.add_args(parser) parser.add_argument("--full-mask", action="store_true", help="Full masking")
def main(): parser = argparse.ArgumentParser(description="") parser.add_argument("--en2fr", required=True, help="path to en2fr model") parser.add_argument( "--fr2en", required=True, help="path to fr2en mixture of experts model" ) parser.add_argument( "--user-dir", help="path to fairseq examples/translation_moe/src directory" ) parser.add_argument( "--num-experts", type=int, default=10, help="(keep at 10 unless using a different model)", ) parser.add_argument( "files", nargs="*", default=["-"], help='input files to paraphrase; "-" for stdin', ) args = parser.parse_args() if args.user_dir is None: args.user_dir = os.path.join( os.path.dirname(os.path.dirname(os.path.abspath(__file__))), # examples/ "translation_moe", "src", ) if os.path.exists(args.user_dir): logging.info("found user_dir:" + args.user_dir) else: raise RuntimeError( "cannot find fairseq examples/translation_moe/src " "(tried looking here: {})".format(args.user_dir) ) logging.info("loading en2fr model from:" + args.en2fr) en2fr = TransformerModel.from_pretrained( model_name_or_path=args.en2fr, tokenizer="moses", bpe="sentencepiece", ).eval() logging.info("loading fr2en model from:" + args.fr2en) fr2en = TransformerModel.from_pretrained( model_name_or_path=args.fr2en, tokenizer="moses", bpe="sentencepiece", user_dir=args.user_dir, task="translation_moe", ).eval() def gen_paraphrases(en): fr = en2fr.translate(en) return [ fr2en.translate(fr, inference_step_args={"expert": i}) for i in range(args.num_experts) ] logging.info("Type the input sentence and press return:") for line in fileinput.input(args.files): line = line.strip() if len(line) == 0: continue for paraphrase in gen_paraphrases(line): print(paraphrase)
def run(self):
    def tokenize_for_bleu(target):
        target = tokenizer.decode_pieces(target.split())
        if self.target_lang == "ja":
            target = " ".join(
                map(
                    lambda x: x.split("\t")[0],
                    tagger.parse(target).split("\n")[:-2],
                ))
        return target

    docs = self.load()
    tagger = MeCab.Tagger()
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load(self.context_aware_sentencepiece_model)

    translation_models = {}
    for bias, path in self.context_aware_translation_models.items():
        base_path, checkpoint_path = os.path.split(path)
        model = (TransformerModel.from_pretrained(
            base_path, checkpoint_file=checkpoint_path).half().cuda().eval())
        model.args.max_source_positions = self.max_source_positions
        model.args.max_target_positions = self.max_target_positions
        translation_models[int(bias)] = model

    args = translation_models[-1].args
    task = translation_models[-1].task
    criterion = task.build_criterion(args)
    results = collections.defaultdict(dict)

    for doc_id, doc in tqdm.tqdm(docs.items(), total=len(docs)):
        parallel_doc = set([
            sent_id for sent_id, score in doc["pairs"]
            if score >= self.score_threhold
        ])
        batches = collections.defaultdict(dict)
        targets = {}
        for sent_id in parallel_doc:
            source, target = [
                tokenizer.encode_as_pieces(doc[lang][sent_id])
                for lang in (self.source_lang, self.target_lang)
            ]
            available_index = [
                index for index in range(0, sent_id)
                if doc[self.source_lang][index]
            ]
            # context_bias is the parameter which the model is trained with.
            # context_sent_index is the index of the actually used contextual
            # sentence.
            targets[sent_id] = " ".join(target)
            for context_bias, _ in translation_models.items():
                context_sent_index = None
                if context_bias != -1:
                    if len(available_index) < context_bias:
                        context_sent_index = -1
                    else:
                        context_sent_index = available_index[-context_bias]
                    source_context = tokenizer.encode_as_pieces(
                        docs[doc_id][self.source_lang][context_sent_index])
                    real_source = source_context + [CONCAT_TOKEN] + source
                else:
                    real_source = source
                if real_source and len(real_source) < self.max_source_positions:
                    source_sentence = " ".join(real_source)
                else:
                    source_sentence = None
                batches[context_bias][sent_id] = source_sentence

        batch_results = collections.defaultdict(
            lambda: collections.defaultdict(dict))
        for context_bias, batch in batches.items():
            data = [sentence for sentence in batch.values() if sentence]
            if not data:
                continue
            real_targets = {
                sent_id: targets[sent_id]
                for sent_id in batch if batch[sent_id]
            }
            model = translation_models[context_bias]
            args.max_source_positions = self.max_source_positions
            args.max_target_positions = self.max_target_positions
            translated = model.translate(data)

            # Compute the BLEU score.
            # Make the BLEU negative to ease the results computation.
            for trans, (sent_id, target) in zip(translated,
                                                real_targets.items()):
                batch_results[sent_id]["bleu"][
                    context_bias] = -sacrebleu.corpus_bleu(
                        tokenize_for_bleu(trans),
                        tokenize_for_bleu(target)).score

            # Compute the loss.
            src_tokens = [
                model.src_dict.encode_line(
                    real_source,
                    line_tokenizer=lambda x: x.split(),
                    add_if_not_exist=False,
                ).long() for real_source in data
            ]
            src_lengths = [tokens.numel() for tokens in src_tokens]
            tgt_tokens = [
                model.tgt_dict.encode_line(
                    target,
                    line_tokenizer=lambda x: x.split(),
                    add_if_not_exist=False,
                ).long() for target in real_targets.values()
            ]
            tgt_lengths = [tokens.numel() for tokens in tgt_tokens]
            temp_dataset = LanguagePairDataset(
                src_tokens,
                src_lengths,
                model.src_dict,
                tgt_tokens,
                tgt_lengths,
                left_pad_source=args.left_pad_source,
                left_pad_target=args.left_pad_target,
                max_source_positions=self.max_source_positions,
                max_target_positions=self.max_target_positions,
            )
            reports = collections.defaultdict(list)
            iterator = task.get_batch_iterator(
                dataset=temp_dataset,
                max_sentences=self.max_sentences,
            )
            for sample in iterator.next_epoch_itr(shuffle=False):
                sample["net_input"]["src_tokens"] = sample["net_input"][
                    "src_tokens"].cuda()
                sample["net_input"]["src_lengths"] = sample["net_input"][
                    "src_lengths"].cuda()
                sample["net_input"]["prev_output_tokens"] = sample[
                    "net_input"]["prev_output_tokens"].cuda()
                sample["target"] = sample["target"].cuda()
                with torch.no_grad():
                    _, _, report = criterion(model.models[0], sample, False)
                for key, value in report.items():
                    reports[key].append(value)
            for key in ("loss", "nll_loss"):
                for value, (sent_id, _) in zip(torch.cat(reports[key]),
                                               real_targets.items()):
                    batch_results[sent_id][key][context_bias] = float(value)

        for sent_id, value in batch_results.items():
            results[doc_id][sent_id] = value

    self.dump(dict(results))
def add_args(parser):
    TransformerModel.add_args(parser)
    parser.add_argument('--share-encoders', action='store_true',
                        help='share encoders across languages')
    parser.add_argument('--share-decoders', action='store_true',
                        help='share decoders across languages')
def __init__(self,
             vocab: Vocabulary,
             dataset_reader: DatasetReader,
             source_embedder: TextFieldEmbedder,
             lang2_namespace: str = "tokens",
             use_bleu: bool = True) -> None:
    super().__init__(vocab)
    self._lang1_namespace = lang2_namespace  # TODO: do not hardcode this
    self._lang2_namespace = lang2_namespace  # TODO: do not hardcode this

    self._backtranslation_src_langs = ["en", "ru"]
    self._coeff_denoising = 1
    self._coeff_backtranslation = 1
    self._coeff_translation = 1

    self._label_smoothing = 0.1

    self._pad_index_lang1 = vocab.get_token_index(DEFAULT_PADDING_TOKEN,
                                                  self._lang1_namespace)
    self._oov_index_lang1 = vocab.get_token_index(DEFAULT_OOV_TOKEN,
                                                  self._lang1_namespace)
    self._end_index_lang1 = self.vocab.get_token_index(
        END_SYMBOL, self._lang1_namespace)

    self._pad_index_lang2 = vocab.get_token_index(DEFAULT_PADDING_TOKEN,
                                                  self._lang2_namespace)
    self._oov_index_lang2 = vocab.get_token_index(DEFAULT_OOV_TOKEN,
                                                  self._lang2_namespace)
    self._end_index_lang2 = self.vocab.get_token_index(
        END_SYMBOL, self._lang2_namespace)

    self._reader = dataset_reader
    self._langs_list = self._reader._langs_list
    self._ae_steps = self._reader._ae_steps
    self._bt_steps = self._reader._bt_steps
    self._para_steps = self._reader._para_steps

    if use_bleu:
        self._bleu = Average()
    else:
        self._bleu = None

    args = ArgsStub()
    transformer_iwslt_de_en(args)

    # build encoder
    if not hasattr(args, 'max_source_positions'):
        args.max_source_positions = 1024
    if not hasattr(args, 'max_target_positions'):
        args.max_target_positions = 1024

    # Dense embedding of source vocab tokens.
    self._source_embedder = source_embedder

    # Dense embedding of vocab words in the target space.
    num_tokens_lang1 = self.vocab.get_vocab_size(self._lang1_namespace)
    num_tokens_lang2 = self.vocab.get_vocab_size(self._lang2_namespace)

    args.share_decoder_input_output_embed = False  # TODO implement shared embeddings

    lang1_dict = DictStub(num_tokens=num_tokens_lang1,
                          pad=self._pad_index_lang1,
                          unk=self._oov_index_lang1,
                          eos=self._end_index_lang1)
    lang2_dict = DictStub(num_tokens=num_tokens_lang2,
                          pad=self._pad_index_lang2,
                          unk=self._oov_index_lang2,
                          eos=self._end_index_lang2)

    # instantiate fairseq classes
    emb_golden_tokens = FairseqEmbedding(num_tokens_lang2,
                                         args.decoder_embed_dim,
                                         self._pad_index_lang2)

    self._encoder = TransformerEncoder(args, lang1_dict, self._source_embedder)
    self._decoder = TransformerDecoder(args, lang2_dict, emb_golden_tokens)
    self._model = TransformerModel(self._encoder, self._decoder)

    # TODO: do not hardcode max_len_b and beam size
    self._sequence_generator_greedy = FairseqBeamSearchWrapper(
        SequenceGenerator(tgt_dict=lang2_dict, beam_size=1, max_len_b=20))
    self._sequence_generator_beam = FairseqBeamSearchWrapper(
        SequenceGenerator(tgt_dict=lang2_dict, beam_size=7, max_len_b=20))
def build_model(cls, args, task):
    # set any default arguments
    transformer_align(args)

    transformer_model = TransformerModel.build_model(args, task)
    return TransformerAlignModel(transformer_model.encoder,
                                 transformer_model.decoder, args)
def main() -> None:
    """
    Main function to read, translate and write data to disk
    """
    args = parse_arguments(subtype="translate")

    # get verbosity
    if args.verbosity == 1:
        logger = logging.getLogger('base')
    else:
        logger = logging.getLogger('root')

    # get batch-size
    batch_size = args.batch_size
    # model subsets
    model_subset = args.model_subset
    # local model glob
    model_checkpoints_glob = args.checkpoints_glob
    # initialize model names
    model_names = []

    # create path dictionary
    path_dict = {
        "wmt": [[
            "./data/wmt19/wmt19.test.truecased.de.ref",
            "./data/wmt19_paraphrased/wmt19-ende-wmtp.ref"
        ]],
        "ar": [[
            "./data/wmt19_paraphrased/wmt19-ende-ar.ref",
            "./data/wmt19_paraphrased/wmt19-ende-arp.ref"
        ]]
    }
    path_dict["both"] = path_dict["wmt"] + path_dict["ar"]

    # define available models for de-en
    if model_subset in ["local", "both"]:
        model_names.extend(glob(model_checkpoints_glob))
    if model_subset in ["hub", "both"]:
        model_names.append("transformer.wmt19.de-en.single_model")

    # loop over respective models
    for model_name in model_names:
        # add rules for loading models
        if model_name == "transformer.wmt19.de-en.single_model":
            model = torch.hub.load("pytorch/fairseq",
                                   model_name,
                                   tokenizer="moses",
                                   bpe="fastbpe")
            model_name = "torch_hub." + model_name
        else:
            model = TransformerModel.from_pretrained(
                os.path.dirname(model_name),
                checkpoint_file=os.path.basename(model_name),
                bpe="fastbpe",
                tokenizer="moses",
                data_name_or_path="./bpe/",
                bpe_codes=os.path.join(os.path.dirname(model_name), "bpe",
                                       "bpe.32000"))
            model_name = "%s.%s.%s" % (
                "local", os.path.basename(os.path.dirname(model_name)),
                os.path.basename(model_name).replace(".pt", ""))

        # disable dropout for prediction
        model.eval()

        # enable GPU hardware acceleration if GPU/CUDA present
        if torch.cuda.is_available():
            model.cuda()

        # log model used in current loop
        logger.info("Translating with model: %s", model_name)

        # loop over paraphrase files
        for input_paths in path_dict[args.wmt_references]:
            base = os.path.basename(input_paths[0])

            # read original de data here
            logger.info("Reading reference data: %s", base)
            de_input_original = read_data(input_paths[0])

            # read de paraphrase data
            logger.info("Reading paraphrased reference data: %s",
                        os.path.basename(input_paths[1]))
            de_input_paraphrased = read_data(input_paths[1])

            # assemble combined input data
            logger.info("Interweaving 'de' input data")
            de_input = interweave(de_input_original, de_input_paraphrased)

            logger.info("Translating and processing to 'en'")
            # translate and process
            store = translate_process(model, de_input, batch_size)

            # modify metadata
            if all(re.search(r"-arp?.ref$", path) for path in input_paths):
                metadata = "wmt19.ar.arp"
            else:
                metadata = "wmt19.wmt.wmtp"

            # write json to disk
            write_to_file(model_name, metadata, store)
import os
import random
import time

from fairseq.models.transformer import TransformerModel


def get_lines(file):
    with open(file) as reader:
        return reader.readlines()


lines = get_lines('test.ru.txt')

ru2en = TransformerModel.from_pretrained(
    '/home/aleksei/Documents/wmt19.ru-en.ensemble',
    checkpoint_file='model1.pt'
)

result_lines = []
for line in lines:
    translated = ru2en.translate(line)
    print(translated)
    result_lines.append(translated)

print('translated, saving to file...')
with open('output.txt', 'a') as the_file:
    for line in result_lines:
        the_file.write('%s\n' % line)
print('saved')
def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off TransformerModel.add_args(parser)
def build_model(self) -> GeneratorHubInterface:
    model = TransformerModel.from_pretrained(**self.model_args)
    model.to(self.device)
    model.eval()
    return model
def run(self):
    def load_doc(doc: List, doc_id: str):
        doc_df = pd.DataFrame(doc)
        for sent_id, row in doc_df.iterrows():
            docs[doc_id]["en"].append(str(row["en_sentence"]).strip())
            docs[doc_id]["ja"].append(str(row["ja_sentence"]).strip())
            docs[doc_id]["pairs"].append((sent_id, 1.0))

    docs = collections.defaultdict(lambda: collections.defaultdict(list))
    lang1, lang2 = self.data_langs
    if self.dataset_name == "jiji":
        read_raw_jiji(docs, self.source_path)
    elif not isinstance(self.source_path, str):
        docs = read_seperate_files(
            self.dataset_name, self.source_path, self.sentence_level
        )
    elif os.path.splitext(self.source_path)[1] == ".tsv":
        docs = read_tsv_file(
            self.dataset_name,
            self.source_path,
            self.data_langs,
            self.sentence_level,
        )
    else:
        if os.path.isfile(self.source_path):
            with open(self.source_path) as source:
                data = json.load(source)
            for doc_index, doc in enumerate(data):
                load_doc(doc, f"{self.dataset_name}_{doc_index}")
        elif os.path.isdir(self.source_path):
            for file_path in glob.glob(self.source_path + "/*.json"):
                doc_id = os.path.splitext(file_path)[0].split("/")[-1]
                with open(file_path) as source:
                    load_doc(json.load(source), doc_id)

    if not self.sentence_level:
        for _, doc in docs.items():
            doc[lang1].append(" ")
            doc[lang2].append(" ")

    # Add translated source to the data
    if self.translation_model_name:
        langs = list(self.translation_models.keys())
        source_target_dict = {
            lang: langs[1 - index] for index, lang in enumerate(langs)
        }
        translation_models = {}
        for source, path in self.translation_models.items():
            base_path, checkpoint_path = os.path.split(path)
            model = TransformerModel.from_pretrained(
                base_path, checkpoint_file=checkpoint_path
            )
            model.to("cuda")
            spm_processor = spm.SentencePieceProcessor()
            spm_processor.load(self.sentencepiece_models[source])
            translation_models[source] = (model, spm_processor)

        for doc_id, doc in tqdm.tqdm(docs.items(), total=len(docs)):
            for lang in translation_models.keys():
                model = translation_models[source_target_dict[lang]][0]
                tokenizer = translation_models[source_target_dict[lang]][1]
                detokenizer = translation_models[lang][1]
                sources = []
                no_translation = {}
                for index, sent in enumerate(doc[source_target_dict[lang]]):
                    if not sent or sent == " ":
                        no_translation[index] = sent
                    else:
                        sources.append(sent)
                targets = [
                    detokenizer.decode_pieces(target.split())
                    for target in model.translate(
                        [
                            " ".join(tokenizer.encode_as_pieces(source))
                            for source in sources
                        ]
                    )
                ]
                for sent_id, sent in no_translation.items():
                    targets.insert(sent_id, sent)
                doc[f"{lang}_translated"] = targets
    self.dump(dict(docs))
""" Use AIResearch MT model easily """ import os os.system('pip install sentencepiece') os.system('pip install git+https://github.com/pytorch/fairseq@6f6461b') from fairseq.models.transformer import TransformerModel # download model url = 'https://github.com/vistec-AI/model-releases/releases/download/SCB_1M%2BTBASE_v1.0/SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz' os.system(f'curl -L {url} | tar xz') model = TransformerModel.from_pretrained( model_name_or_path= 'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0/models/', checkpoint_file='checkpoint.pt', data_name_or_path= 'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0/vocab/', bpe='sentencepiece', sentencepiece_vocab= 'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.th.model') # function en2th.translate translate = model.translate
sys.exit("'BPE codes' argument missing! Should be subword-nmt created with learn_bpe.py") if len(sys.argv) > 4: input_file = os.path.abspath(sys.argv[4]) else: sys.exit("'Input text' argument missing!") if len(sys.argv) > 5: output_file = os.path.abspath(sys.argv[5]) else: sys.exit("'Output text' argument missing!") with open(input_file, 'r') as f: text = f.read().strip().splitlines() fout = open(output_file, 'x') nopuncts2puncts = TransformerModel.from_pretrained( model_path, checkpoint_file='checkpoint_best.pt', data_name_or_path=data_path, bpe='subword_nmt', bpe_codes=bpe_codes ) # Punctuate textout = nopuncts2puncts.translate(text) fout.write('\n'.join(textout)) fout.close()
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "multi" in self.config.n_model:
        from fairseq.models.transformer import TransformerModel

        from pororo.tasks.utils.tokenizer import CustomTokenizer

        load_dict = download_or_load(
            f"transformer/{self.config.n_model}",
            "multi",
        )

        model = (TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{self.config.n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        ).eval().to(device))

        tokenizer = CustomTokenizer.from_file(
            vocab_filename=f"{load_dict.src_tok}/vocab.json",
            merges_filename=f"{load_dict.src_tok}/merges.txt",
        )

        return PororoTransformerTransMulti(
            model,
            self.config,
            tokenizer,
        )

    if "transformer" in self.config.n_model:
        from fairseq.models.transformer import TransformerModel

        load_dict = download_or_load(
            f"transformer/{self.config.n_model}",
            self.config.lang,
        )

        tokenizer = None
        model = (TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{self.config.n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        ).eval().to(device))

        if self.config.lang != "zh":
            from pororo.tasks.utils.tokenizer import CustomTokenizer

            tokenizer = CustomTokenizer.from_file(
                vocab_filename=f"{load_dict.src_tok}/vocab.json",
                merges_filename=f"{load_dict.src_tok}/merges.txt",
            )

        return PororoTransformerParaphrase(model, self.config, tokenizer)
def add_args(parser):
    TransformerModel.add_args(parser)
    # Arguments related to parameter initialization
    parser.add_argument('--apply-bert-init', action='store_true',
                        help='use custom param initialization for BERT')
import os
from functools import partial

import torch
from tqdm import tqdm

from fairseq.models.transformer import TransformerModel
from pythainlp.tokenize import word_tokenize as th_word_tokenize
# MosesTokenizer / MosesDetokenizer are assumed to come from sacremoses
# (the import was missing from the snippet):
from sacremoses import MosesDetokenizer, MosesTokenizer

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# %%
en_word_tokenize = MosesTokenizer('en')
en_word_detokenize = MosesDetokenizer('en')

en2th_word2bpe = TransformerModel.from_pretrained(
    model_name_or_path='/home/users/v-sumeth/AIS/Vistech_SCB_mt_models/SCB_1M+TBASE_en-th_moses-newmm_space_130000-130000_v1.0/models/',
    checkpoint_file='checkpoint.pt',
    data_name_or_path='/home/users/v-sumeth/AIS/Vistech_SCB_mt_models/SCB_1M+TBASE_en-th_moses-newmm_space_130000-130000_v1.0/vocab/'
)

th_word_tokenize = partial(th_word_tokenize, keep_whitespace=False)

th2en = TransformerModel.from_pretrained(
    model_name_or_path="/home/users/v-sumeth/AIS/Vistech_SCB_mt_models/SCB_1M+TBASE_th-en_newmm-moses_130000-130000_v1.0/models",
    checkpoint_file='checkpoint.pt',
    data_name_or_path="/home/users/v-sumeth/AIS/Vistech_SCB_mt_models/SCB_1M+TBASE_th-en_newmm-moses_130000-130000_v1.0/vocab",
)

en2th_word2bpe.to(torch.device('cuda'))
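# Hedged usage sketch for the th→en direction configured above (the sacremoses
# API matches the imports assumed above; treat this as a sketch, not the
# source's own pipeline):
def th_to_en(text):
    tokenized = " ".join(th_word_tokenize(text))  # newmm-tokenized Thai in
    translated = th2en.translate(tokenized)       # Moses-tokenized English out
    return en_word_detokenize.detokenize(translated.split())

print(th_to_en("สวัสดีชาวโลก"))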
def add_args(parser):
    TransformerModel.add_args(parser)
    parser.add_argument('--user-mode', type=str, help='user-mode')
    parser.add_argument('--results-dir', type=str, help='results-dir')
def build_model(cls, args, task): """Build a new model instance.""" # from fairseq.tasks.multilingual_translation import MultilingualTranslationTask # assert isinstance(task, MultilingualTranslationTask) # make sure all arguments are present in older models base_architecture(args) if args.share_encoders: args.share_encoder_embeddings = True ### nat model # build shared embeddings (if applicable) src_dict, tgt_dict = task.source_dictionary, task.target_dictionary if args.share_all_embeddings: if src_dict != tgt_dict: raise ValueError("--share-all-embeddings requires a joined dictionary") if args.encoder_embed_dim != args.decoder_embed_dim: raise ValueError( "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" ) if args.decoder_embed_path and ( args.decoder_embed_path != args.encoder_embed_path ): raise ValueError( "--share-all-embeddings not compatible with --decoder-embed-path" ) encoder_embed_tokens = TransformerModel.build_embedding( args, src_dict, args.encoder_embed_dim, args.encoder_embed_path ) decoder_embed_tokens = encoder_embed_tokens args.share_decoder_input_output_embed = True else: encoder_embed_tokens = TransformerModel.build_embedding( args, src_dict, args.encoder_embed_dim, args.encoder_embed_path ) decoder_embed_tokens = TransformerModel.build_embedding( args, tgt_dict, args.decoder_embed_dim, args.decoder_embed_path ) student_cls = ARCH_MODEL_REGISTRY[args.student_arch] encoder = student_cls.build_encoder(args, src_dict, encoder_embed_tokens) decoder = student_cls.build_decoder(args, tgt_dict, decoder_embed_tokens) student = student_cls(args,encoder,decoder) teacher_cls = ARCH_MODEL_REGISTRY[args.teacher_arch] if not issubclass(teacher_cls, NATransformerModel): teacher_cls = PatchedTransformerModel teacher_encoder = teacher_cls.build_encoder( args, src_dict, encoder_embed_tokens if args.share_encoder_embeddings else TransformerModel.build_embedding( args, src_dict, args.encoder_embed_dim, args.encoder_embed_path ) ) teacher_decoder = teacher_cls.build_decoder( args, tgt_dict, decoder_embed_tokens if args.share_decoder_embeddings else TransformerModel.build_embedding( args, tgt_dict, args.decoder_embed_dim, args.decoder_embed_path ) ) teacher = teacher_cls(args,teacher_encoder,teacher_decoder) return cls(args, student, teacher)
# -*- coding:utf-8 -*-
"""
-------------------------------------------------
   Project Name: toolkits
   File Name:    mt_demo.py
   Author:       gaoyw
   Create Date:  2021/1/22
-------------------------------------------------
"""
from fairseq.models.transformer import TransformerModel

zh2en = TransformerModel.from_pretrained(
    '/path/to/checkpoints',
    checkpoint_file='checkpoint_best.pt',
    data_name_or_path='data-bin/wmt17_zh_en_full',
    bpe='subword_nmt',
    bpe_codes='data-bin/wmt17_zh_en_full/zh.code')

# "你好 世界" means "Hello world"; print so the demo actually shows its output.
print(zh2en.translate('你好 世界'))