Example #1
    def load_marian_model(self) -> MarianMTModel:
        state_dict, cfg = self.state_dict, self.hf_config

        assert cfg.static_position_embeddings, "config.static_position_embeddings should be True"
        model = MarianMTModel(cfg)

        assert "hidden_size" not in cfg.to_dict()
        load_layers_(
            model.model.encoder.layers, state_dict, BART_CONVERTER,
        )
        load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True)

        # handle tensors not associated with layers
        wemb_tensor = torch.nn.Parameter(torch.FloatTensor(self.wemb))
        bias_tensor = torch.nn.Parameter(torch.FloatTensor(self.final_bias))
        model.model.shared.weight = wemb_tensor
        model.model.encoder.embed_tokens = model.model.decoder.embed_tokens = model.model.shared

        model.final_logits_bias = bias_tensor

        if "Wpos" in state_dict:
            print("Unexpected: got Wpos")
            wpos_tensor = torch.tensor(state_dict["Wpos"])
            model.model.encoder.embed_positions.weight = wpos_tensor
            model.model.decoder.embed_positions.weight = wpos_tensor

        if cfg.normalize_embedding:
            assert "encoder_emb_ln_scale_pre" in state_dict
            raise NotImplementedError("Need to convert layernorm_embedding")

        assert not self.extra_keys, f"Failed to convert {self.extra_keys}"
        assert (
            model.model.shared.padding_idx == self.pad_token_id
        ), f"Padding tokens {model.model.shared.padding_idx} and {self.pad_token_id} mismatched"
        return model
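A quick way to sanity-check a converted checkpoint is to run a translation through it; a minimal sketch, assuming the public Helsinki-NLP/opus-mt-en-de weights (a locally converted model loads the same way from its directory path):

# smoke test for a (converted) Marian checkpoint
from transformers import MarianMTModel, MarianTokenizer

tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-de")
batch = tokenizer(["Machine translation is useful."], return_tensors="pt", padding=True)
print(tokenizer.batch_decode(model.generate(**batch), skip_special_tokens=True))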
Example #2
    def combobox_changed(self):
        if self.inputComboBox.currentText() == "English":
            self.inputTextEdit.setAlignment(Qt.AlignLeft)
            if self.model_en is None:
                # "Loading the dictionary, please wait"
                self.label.setText('جاري تحميل القاموس، الرجاء الانتظار')
                self.repaint()
                self.tokenizer_en = MarianTokenizer.from_pretrained(
                    model_en_path)
                self.model_en = MarianMTModel.from_pretrained(model_en_path)
                # "The dictionary has been loaded"
                self.label.setText('تم تحميل القاموس')
        elif self.inputComboBox.currentText() == "Russian":
            self.inputTextEdit.setAlignment(Qt.AlignLeft)
            if self.model_ru is None:
                # "Loading the dictionary, please wait"
                self.label.setText('جاري تحميل القاموس، الرجاء الانتظار')
                self.repaint()
                self.tokenizer_ru = MarianTokenizer.from_pretrained(
                    model_ru_path)
                self.model_ru = MarianMTModel.from_pretrained(model_ru_path)
                # "The dictionary has been loaded"
                self.label.setText('تم تحميل القاموس')
        elif self.inputComboBox.currentText() == "Hebrew":
            # Hebrew is written right-to-left
            self.inputTextEdit.setAlignment(Qt.AlignRight)
            if self.model_he is None:
                # "Loading the dictionary, please wait"
                self.label.setText('جاري تحميل القاموس، الرجاء الانتظار')
                self.repaint()
                self.tokenizer_he = MarianTokenizer.from_pretrained(
                    model_he_path)
                self.model_he = MarianMTModel.from_pretrained(model_he_path)
                # "The dictionary has been loaded"
                self.label.setText('تم تحميل القاموس')
        self.repaint()
Example #3
    def load_marian_model(self) -> MarianMTModel:
        state_dict, cfg = self.state_dict, self.hf_config

        if not cfg.static_position_embeddings:
            raise ValueError(
                "config.static_position_embeddings should be True")
        model = MarianMTModel(cfg)

        if "hidden_size" in cfg.to_dict():
            raise ValueError("hidden_size is in config")
        load_layers_(
            model.model.encoder.layers,
            state_dict,
            BART_CONVERTER,
        )
        load_layers_(model.model.decoder.layers,
                     state_dict,
                     BART_CONVERTER,
                     is_decoder=True)

        # handle tensors not associated with layers
        if self.cfg["tied-embeddings-src"]:
            wemb_tensor = nn.Parameter(torch.FloatTensor(self.wemb))
            bias_tensor = nn.Parameter(torch.FloatTensor(self.final_bias))
            model.model.shared.weight = wemb_tensor
            model.model.encoder.embed_tokens = model.model.decoder.embed_tokens = model.model.shared
        else:
            wemb_tensor = nn.Parameter(torch.FloatTensor(self.wemb))
            model.model.encoder.embed_tokens.weight = wemb_tensor

            decoder_wemb_tensor = nn.Parameter(torch.FloatTensor(
                self.dec_wemb))
            bias_tensor = nn.Parameter(torch.FloatTensor(self.final_bias))
            model.model.decoder.embed_tokens.weight = decoder_wemb_tensor

        model.final_logits_bias = bias_tensor

        if "Wpos" in state_dict:
            print("Unexpected: got Wpos")
            wpos_tensor = torch.tensor(state_dict["Wpos"])
            model.model.encoder.embed_positions.weight = wpos_tensor
            model.model.decoder.embed_positions.weight = wpos_tensor

        if cfg.normalize_embedding:
            if not ("encoder_emb_ln_scale_pre" in state_dict):
                raise ValueError(
                    "encoder_emb_ln_scale_pre is not in state dictionary")
            raise NotImplementedError("Need to convert layernorm_embedding")

        if self.extra_keys:
            raise ValueError(f"Failed to convert {self.extra_keys}")

        if model.get_input_embeddings().padding_idx != self.pad_token_id:
            raise ValueError(
                f"Padding tokens {model.get_input_embeddings().padding_idx} and {self.pad_token_id} mismatched"
            )
        return model
Example #4
    def __init__(
        self,
        context,
        translation_artifacts_english,
        translation_artifacts_spanish,
        model="microsoft/DialoGPT-small",
        tokenizer="microsoft/DialoGPT-small",
        translate=True,
        sentiment_analisis=False,
        seed=44,
    ):
        """This is a deep learning chatbot with traduction

        Args:
            context (Chatbot): context
            traduction_english_artifacts (dict): Dictionary of artifacts
            traduction_spanish_artifacts (dict): Dictionary of artifacts
            translate (bool, optional): Input and output will be translated?.
            seed (int, optional): random seed. Defaults to 44.
            sentiment_analisis (bool, optional):
        """

        self.generator = pipeline("text-generation",
                                  model=model,
                                  tokenizer=tokenizer)

        self.translate = translate
        self.context = context
        self.translation_artifacts_english = translation_artifacts_english
        self.translation_artifacts_spanish = translation_artifacts_spanish
        self.sentiment_analisis = sentiment_analisis

        self.parsed_context = self.generator.tokenizer.eos_token.join(
            context.split("\n"))

        self.temporal_context = []

        set_seed(seed)

        if sentiment_analisis:
            self.sentiment_engine = SentimentIntensityAnalyzer()

        if translate:
            # ENG -> SPANISH
            self.model_name_en_t_es = "Helsinki-NLP/opus-mt-en-ROMANCE"
            self.tokenizer_en_t_es = MarianTokenizer.from_pretrained(
                self.model_name_en_t_es)
            self.model_en_t_es = MarianMTModel.from_pretrained(
                self.model_name_en_t_es)

            # ESP -> ENGLISH
            self.model_name_es_t_en = "Helsinki-NLP/opus-mt-ROMANCE-en"
            self.tokenizer_es_t_en = MarianTokenizer.from_pretrained(
                self.model_name_es_t_en)
            self.model_es_t_en = MarianMTModel.from_pretrained(
                self.model_name_es_t_en)
Example #5
def get_model_tokenizer_files(romance_lang: str = "ROMANCE"):
    ROMANCE = romance_lang

    target_model_name = f"Helsinki-NLP/opus-mt-en-{ROMANCE}"
    target_tokenizer = MarianTokenizer.from_pretrained(target_model_name)
    target_model = MarianMTModel.from_pretrained(target_model_name)

    en_model_name = f"Helsinki-NLP/opus-mt-{ROMANCE}-en"
    en_tokenizer = MarianTokenizer.from_pretrained(en_model_name)
    en_model = MarianMTModel.from_pretrained(en_model_name)
    return en_model, en_tokenizer, target_model, target_tokenizer
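The two model/tokenizer pairs returned above are enough for a back-translation round trip; a sketch under the assumption that the multilingual en-ROMANCE checkpoint is used, which expects a target-language token such as ">>fr<<" at the start of each input:

# English -> French -> English round trip (illustrative)
en_model, en_tokenizer, target_model, target_tokenizer = get_model_tokenizer_files()
batch = target_tokenizer([">>fr<< How are you today?"], return_tensors="pt", padding=True)
french = target_tokenizer.batch_decode(target_model.generate(**batch), skip_special_tokens=True)
back = en_tokenizer(french, return_tensors="pt", padding=True)
print(en_tokenizer.batch_decode(en_model.generate(**back), skip_special_tokens=True))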
Example #6
    def __init__(self, language):
        self.language = language

        target_model_name = f'Helsinki-NLP/opus-mt-en-{self.language}'
        self.target_tokenizer = MarianTokenizer.from_pretrained(
            target_model_name)
        self.target_model = MarianMTModel.from_pretrained(
            target_model_name).to('cuda')

        en_model_name = f'Helsinki-NLP/opus-mt-{self.language}-en'
        self.en_tokenizer = MarianTokenizer.from_pretrained(en_model_name)
        self.en_model = MarianMTModel.from_pretrained(en_model_name).to('cuda')
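The hard-coded .to('cuda') above crashes on CPU-only machines; a hedged variant of the same constructor line with a device fallback (an adaptation, not the original code):

# fall back to CPU when no GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
en_model = MarianMTModel.from_pretrained(en_model_name).to(device)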
Example #7
    def _create_models_and_tokenizers(self):
        model1 = MarianMTModel.from_pretrained(self.models_list[0])
        tokenizer1 = MarianTokenizer.from_pretrained(self.models_list[0])
        if len(self.models_list) == 1:
            model2 = None
            tokenizer2 = None
        elif len(self.models_list) == 2:
            model2 = MarianMTModel.from_pretrained(self.models_list[1])
            tokenizer2 = MarianTokenizer.from_pretrained(self.models_list[1])
        else:
            raise ValueError(
                "Expected the argument models_list to have length 1 or 2")

        return model1, tokenizer1, model2, tokenizer2
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, required=True)
    parser.add_argument('--input_path', type=str, required=True)
    parser.add_argument('--output_path', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=32)
    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = MarianTokenizer.from_pretrained(args.model_name)
    model = MarianMTModel.from_pretrained(args.model_name).to(device)

    # both files are closed automatically, even if translation raises
    with open(args.input_path) as src, open(args.output_path, 'w') as tgt:
        batch = []
        for s in src.read().split('\n'):
            if len(s) == 0:
                continue
            batch.append(s)
            if len(batch) == args.batch_size:
                tgt_text = translate(model, tokenizer, device, batch)
                for t in tgt_text:
                    tgt.write(t + '\n')
                batch = []
        # flush the final, possibly partial batch
        if len(batch) > 0:
            tgt_text = translate(model, tokenizer, device, batch)
            for t in tgt_text:
                tgt.write(t + '\n')
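The translate helper this script calls is not part of the snippet; a plausible implementation, labeled as an assumption rather than the original code:

# sketch of the undefined translate() helper: tokenize, generate, decode
def translate(model, tokenizer, device, batch):
    inputs = tokenizer(batch, return_tensors="pt", padding=True).to(device)
    generated = model.generate(**inputs)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)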
Example #9
def translateDocx(source, target, file):
    global model_name
    global tokenizer
    global model

    if source == 'en' and target == 'de':
        model_name = 'Helsinki-NLP/opus-mt-en-de'
    if source == 'de' and target == 'en':
        model_name = 'Helsinki-NLP/opus-mt-de-en'

    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    text = docx2txt.process(file).split("\n")
    text = list(filter(None, text))
    text = [s for s in text if p.match(s)]

    text = [">>de<< " + s for s in text]

    i = 0
    document = Document()
    for textblock in chunks(text, 2):
        i = i + 1
        print("batch #%i (len: %s)" % (i, len(textblock)), file=sys.stderr)
        print("\t " + str(tuple(textblock)))
        target, duration = translat(textblock)
        document.add_paragraph(target)
        print("\t " + str(tuple(target)))
        print('translate took {:.3f} ms'.format(duration), file=sys.stderr)
        print("\n\n")
    end_time = time.time()
    duration = (start_time - end_time) * 1000.0
    print('Total translate took {:.3f} ms'.format(duration))
    return document
Example #10
def get_docx_text(lst):
    global model_name
    global tokenizer
    global model

    model_name = 'Helsinki-NLP/opus-mt-en-de'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    document = Document()  # create once so every item ends up in one file
    for item in lst:  # do not shadow the builtin str(): it is called below
        print(item)
        text = item.split("\n")
        text = list(filter(None, text))
        text = [s for s in text if p.match(s)]

        text = [">>de<< " + s for s in text]

        i = 0
        for textblock in chunks(text, 5):
            i = i + 1
            print("batch #%i (len: %s)" % (i, len(textblock)), file=sys.stderr)
            print("\t " + str(tuple(textblock)))
            target, duration = translat(textblock)
            document.add_paragraph(target)
            print("\t " + str(tuple(target)))
            print('translate took {:.3f} ms'.format(duration), file=sys.stderr)
            print("\n\n")
    document.save("new.docx")
Example #11
def replace_string2(filename):
    global model_name
    global tokenizer
    global model

    model_name = 'Helsinki-NLP/opus-mt-en-de'

    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    document = zipfile.ZipFile(filename)
    xml_content = document.read('word/document.xml')
    tree = XML(xml_content)
    # using lxml instead of xml preserves the comments

    for paragraph in tree.iter(PARA):
        text_nodes = [node for node in paragraph.iter(TEXT) if node.text]
        texts = [node.text for node in text_nodes]
        if texts:
            # assumes translat() returns one translation per input string;
            # str.replace() on paragraph.text returned a new string and
            # discarded it, so nothing was actually replaced before
            target, duration = translat(texts)
            for node, translated in zip(text_nodes, target):
                node.text = translated

    # zipfile.ZipFile has no save(); write a new archive and swap in the
    # modified document.xml (tostring is xml.etree.ElementTree.tostring)
    with zipfile.ZipFile("new.docx", "w") as out:
        for item in document.infolist():
            if item.filename == 'word/document.xml':
                out.writestr(item, tostring(tree))
            else:
                out.writestr(item, document.read(item.filename))
    document.close()
Example #12
    def __init__(self, source_language: str, target_language: str):
        name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model = MarianMTModel.from_pretrained(name).to(self.device)
        self.tokenizer = MarianTokenizer.from_pretrained(name)
Example #13
def replace_string2(filename):
    global model_name
    global tokenizer
    global model

    model_name = 'Helsinki-NLP/opus-mt-en-de'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    doc = Document(filename)
    for p in doc.paragraphs:
        if not p.text:
            continue
        for translatedE in translat([p.text]):
            print(p.text)
            print(translatedE)
            # assigning to p.text replaces the runs but leaves the
            # paragraph style untouched, so no save/restore is needed
            p.text = translatedE

    # doc.save(filename)
    doc.save('test.docx')
    return 1
Example #14
def main():
    parser = argparse.ArgumentParser(
        description='translate using MarianMT from the Hugging Face '
                    'transformers library')
    parser.add_argument('-i', '--input', default=sys.stdin.fileno(),
                        help='The input file (defaults to stdin)')
    parser.add_argument('-o', '--output', default=sys.stdout.fileno(),
                        help='The output annotated file (defaults to stdout)')
    parser.add_argument('--encoding', default='utf-8',
                        help='The character encoding for input/output '
                             '(it defaults to UTF-8)')
    parser.add_argument('--src', help='The source language')
    parser.add_argument('--tgt', help='The target language')
    # '-c, --chunks' as one string defined a single odd flag; the short and
    # long option names must be separate arguments
    parser.add_argument('-c', '--chunks', type=int,
                        help='Number of chunks to divide the corpus')
    args = parser.parse_args()

    model_name = f"Helsinki-NLP/opus-mt-{args.src}-{args.tgt}"
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)

    output_file = open(args.output, mode='w', encoding=args.encoding)
    with open(args.input, 'r', encoding=args.encoding) as f:
        src_lines = f.readlines()
        labels, words = get_label_doc(src_lines)
        tgt_text = translate(words, args.chunks, model, tokenizer)
        write_to_file(output_file, labels, tgt_text)
    output_file.close()
Example #15
def main(args):
    with open(args.dataset_info, 'rb') as rf:
        dataset_info = pickle.load(rf)
    tokenizer = MarianTokenizer.from_pretrained(args.model_string)
    tokenizer.add_special_tokens({'pad_token': PAD_TOKEN})
    pad_id = tokenizer.encode(PAD_TOKEN)[0]
    model = MarianMTModel.from_pretrained(args.model_string, return_dict=True).to(args.device)
    model.eval()

    checkpoint = torch.load(args.ckpt, map_location=args.device)
    model_args = checkpoint['args']
    conditioning_model = Model(model_args, pad_id, len(dataset_info.index2word)) # no need to get the glove embeddings when reloading since they're saved in model ckpt anyway
    conditioning_model.load_state_dict(checkpoint['state_dict'])
    conditioning_model = conditioning_model.to(args.device)
    conditioning_model.eval()
    print("=> loaded checkpoint '{}' (epoch {})"
            .format(args.ckpt, checkpoint['epoch']))
    print('num params', num_params(conditioning_model))

    while True:
        results = predict_formality(model, 
                        tokenizer, 
                        conditioning_model, 
                        [args.input_text], 
                        dataset_info, 
                        precondition_topk=args.precondition_topk,
                        do_sample=args.do_sample,
                        length_cutoff=args.length_cutoff,
                        condition_lambda=args.condition_lambda,
                        device=args.device)
        print(results)
        import pdb; pdb.set_trace()
Example #16
    def load_model(self, model_name):
        if model_name in self.models:
            self.models[model_name]['last_loaded'] = time.time()
            return self.models[model_name]['tokenizer'], self.models[
                model_name]['model']
        else:
            logger.info("Load model: " + model_name)
            tokenizer = MarianTokenizer.from_pretrained(model_name)
            model = MarianMTModel.from_pretrained(model_name)
            model.eval()

            if len(self.models) >= self.max_loaded_models:
                # evict the least recently used model; the loop variable must
                # not shadow the model_name parameter, or the new model would
                # be stored under the evicted model's key below
                oldest_time = time.time()
                oldest_model = None
                for loaded_name in self.models:
                    if self.models[loaded_name]['last_loaded'] <= oldest_time:
                        oldest_model = loaded_name
                        oldest_time = self.models[loaded_name]['last_loaded']
                del self.models[oldest_model]

            self.models[model_name] = {
                'tokenizer': tokenizer,
                'model': model,
                'last_loaded': time.time()
            }
            return tokenizer, model
Example #17
    def __init__(self,
                 model_name_or_path: str,
                 max_seq_length: int = 128,
                 task="translation",
                 model_args: Optional[Dict] = None,
                 cache_dir: Optional[str] = None,
                 freeze_encoder=False):
        super(EncDecModel, self).__init__()
        self.config_keys = ['max_seq_length']
        self.max_seq_length = max_seq_length

        # avoid a mutable default argument: fall back to an empty dict here
        model_args = model_args or {}
        config = AutoConfig.from_pretrained(model_name_or_path,
                                            **model_args,
                                            cache_dir=cache_dir)
        self.model = MarianMTModel.from_pretrained(model_name_or_path,
                                                   config=config,
                                                   cache_dir=cache_dir)

        self.tokenizer = MarianTokenizer.from_pretrained(model_name_or_path,
                                                         cache_dir=cache_dir)

        self.config = self.model.config
        self.config_class = self.model.config_class
        #self.device = self.model.device
        self.dtype = self.model.dtype
        self.task = task

        self.output_attentions = True
        #self.output_hidden_states = True
        self.config.output_attentions = True
        #self.config.output_hidden_states = True

        self.freeze_encoder = freeze_encoder

        self.add_pooling_layer()
Example #18
    def __init__(self, model_name=None, device=None, half=False):
        """
        basic wrapper around MarianMT model for language translation

        Args:
          model_name(str): Helsinki-NLP model
          device(str): device to use (e.g., 'cuda', 'cpu')
          half(bool): If True, use half precision.
        """
        if model_name is None or 'Helsinki-NLP' not in model_name:
            raise ValueError(
                'Translator requires a Helsinki-NLP model: https://huggingface.co/Helsinki-NLP'
            )
        try:
            import torch
        except ImportError:
            raise Exception('Translator requires PyTorch to be installed.')
        self.torch_device = device
        if self.torch_device is None:
            self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        from transformers import MarianMTModel, MarianTokenizer
        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
        self.model = MarianMTModel.from_pretrained(model_name).to(
            self.torch_device)
        if half:
            self.model = self.model.half()
Example #19
	def __init__(self):
		self.src = 'en'
		self.trg = 'fr'
		self.mname = f'Helsinki-NLP/opus-mt-{self.src}-{self.trg}'

		self.tokenizer = MarianTokenizer.from_pretrained(self.mname)
		self.model = MarianMTModel.from_pretrained(self.mname)
Example #20
    def test_generate_fp16(self):
        config, input_dict = self.model_tester.prepare_config_and_inputs()
        input_ids = input_dict["input_ids"]
        attention_mask = input_ids.ne(1).to(torch_device)
        model = MarianMTModel(config).eval().to(torch_device)
        if torch_device == "cuda":
            model.half()
        model.generate(input_ids, attention_mask=attention_mask)
        model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
Example #21
def TranslatePt2En(text):
    # no placeholders, so a plain string literal suffices (was an f-string)
    translation_model_name = 'Helsinki-NLP/opus-mt-roa-en'
    model = MarianMTModel.from_pretrained(translation_model_name)
    tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
    # Translate the text
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    gen = model.generate(**inputs)
    return tokenizer.batch_decode(gen, skip_special_tokens=True)
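TranslatePt2En reloads the weights on every call, which dominates the runtime; a minimal cache, as one possible fix:

# cache loaded models so repeated calls are cheap (illustrative)
from functools import lru_cache

@lru_cache(maxsize=4)
def load_marian(name):
    return MarianMTModel.from_pretrained(name), MarianTokenizer.from_pretrained(name)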
Example #22
    def __init__(self, target_language: str, device='auto'):
        super(BackTranslation, self).__init__()

        target_model_name = f'Helsinki-NLP/opus-mt-ko-{target_language}'
        self.tar_tokenizer = MarianTokenizer.from_pretrained(target_model_name)
        self.tar_model = MarianMTModel.from_pretrained(target_model_name)

        source_model_name = f'Helsinki-NLP/opus-mt-{target_language}-ko'
        self.src_tokenizer = MarianTokenizer.from_pretrained(source_model_name)
        self.src_model = MarianMTModel.from_pretrained(source_model_name)

        if device == 'auto':
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = device

        # use the resolved self.device; .to(device) would fail for 'auto'
        self.tar_model.to(self.device)
        self.src_model.to(self.device)
Example #23
    def __init__(self, src_lang: str, trg_lang: str):
        super(Marian, self).__init__()
        model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{trg_lang}'
        self.model = MarianMTModel.from_pretrained(model_name,
                                                   normalize_embedding=True)
        self.tokenizer = MarianTokenizer.from_pretrained(model_name,
                                                         bos_token='<bos>',
                                                         eos_token='<eos>')
        self._added_tokens = []
Example #24
def get_model(param):
    """
    Load a Hugging Face Marian machine-translation model and tokenizer.
    :param param: Hugging Face MarianMT Helsinki-NLP/{model_name} to load
        (https://huggingface.co/Helsinki-NLP); param[0]=label, param[1]=model_name
    :return: a tuple (MarianMT model, MarianMT tokenizer, MarianMT label)
    """
    mt_model = MarianMTModel.from_pretrained(
        param[1])  # param[0]=label; param[1]=model_name to load
    mt_tokenizer = MarianTokenizer.from_pretrained(param[1])  # load tokenizer
    return mt_model, mt_tokenizer, param[0]
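A hypothetical call site, assuming param is a (label, model_name) pair as the docstring describes:

mt_model, mt_tokenizer, label = get_model(("en-de", "Helsinki-NLP/opus-mt-en-de"))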
Example #25
    def __init__(self, src="en", trg="ro", use_cuda=True):
        self.src = src
        self.trg = trg
        self.use_cuda = use_cuda
        self.mname = f"Helsinki-NLP/opus-mt-{self.src}-{self.trg}"
        with torch.no_grad():
            self.model = MarianMTModel.from_pretrained(self.mname)
            if self.use_cuda:
                self.model = self.model.cuda()
            self.tok = MarianTokenizer.from_pretrained(self.mname)
Example #26
    def load_model(self, route):
        model = f'opus-mt-{route}'
        path = os.path.join(self.models_dir, model)
        try:
            model = MarianMTModel.from_pretrained(path)
            tok = MarianTokenizer.from_pretrained(path)
        except Exception:
            return 0, f"Make sure you have downloaded the model for {route} translation"
        self.models[route] = (model, tok)
        return 1, f"Successfully loaded model for {route} translation"
Example #27
def translate(sentences,inp_lang,out_lang):
	tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-{}-{}".format(inp_lang,out_lang))
	model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-{}-{}".format(inp_lang,out_lang))

	output = []
	for sentence in tqdm(sentences):
		translated = model.generate(**tokenizer.prepare_translation_batch([sentence]))
		output.extend([tokenizer.decode(t, skip_special_tokens=True) for t in translated])

	return output
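MarianTokenizer.prepare_translation_batch was deprecated and later removed; on transformers 4.x the equivalent loop calls the tokenizer directly (a sketch, not the original code):

for sentence in tqdm(sentences):
    batch = tokenizer([sentence], return_tensors="pt", padding=True)
    translated = model.generate(**batch)
    output.extend([tokenizer.decode(t, skip_special_tokens=True) for t in translated])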
Example #28
def get_models(
    src: str, tgt: str, verbose: int = 0
) -> Tuple[
    transformers.models.marian.tokenization_marian.MarianTokenizer,
    transformers.models.marian.tokenization_marian.MarianTokenizer,
    transformers.models.marian.modeling_marian.MarianMTModel,
    transformers.models.marian.modeling_marian.MarianMTModel,
]:
    # use the src/tgt parameters; the original strings referenced the globals
    # SRC_TO/TGT_TO and silently ignored the arguments
    model_to_name = "Helsinki-NLP/opus-mt-{src}-{tgt}".format(src=src, tgt=tgt)
    model_from_name = "Helsinki-NLP/opus-mt-{src}-{tgt}".format(src=tgt, tgt=src)

    if verbose > 0:
        print("Loading models: {} and {}".format(model_to_name, model_from_name))
    tokenizer_to = MarianTokenizer.from_pretrained(model_to_name)
    model_to = MarianMTModel.from_pretrained(model_to_name)
    tokenizer_from = MarianTokenizer.from_pretrained(model_from_name)
    model_from = MarianMTModel.from_pretrained(model_from_name)

    return tokenizer_to, tokenizer_from, model_to, model_from
Example #29
def get_batch_opustranslator(src, tgt):
    from transformers import MarianTokenizer, MarianMTModel
    model_name = f'Helsinki-NLP/opus-mt-{src}-{tgt}'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    # a named function reads better than the original multi-line lambda;
    # prepare_seq2seq_batch is deprecated, and calling the tokenizer
    # directly builds the same "pt" batch
    def translator(texts):
        batch = tokenizer(texts, return_tensors="pt", padding=True)
        return tokenizer.batch_decode(model.generate(**batch),
                                      skip_special_tokens=True)

    return translator
Example #30
    def __init__(self,
                 src_lang: str = "en",
                 tgt_lang: str = "fr",
                 device: str = "cpu"):
        super().__init__(src_lang, tgt_lang, device)
        self.model_name = "Helsinki-NLP/opus-mt-{src}-{tgt}".format(
            src=src_lang, tgt=tgt_lang)
        self.tokenizer = MarianTokenizer.from_pretrained(self.model_name)
        self.model = MarianMTModel.from_pretrained(self.model_name)
        self.model = self.model.to(self.device)
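Only __init__ is shown above; a translate() method this wrapper might pair with (an assumption, using plain beam search):

    def translate(self, texts):
        # tokenize on the configured device, generate, and decode
        batch = self.tokenizer(texts, return_tensors="pt", padding=True).to(self.device)
        generated = self.model.generate(**batch, num_beams=4)
        return self.tokenizer.batch_decode(generated, skip_special_tokens=True)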