    def __init__(self):
        self._tokenizer = MosesTokenizer("en")

        self._model_name = _EN_TH_MODEL_NAME

        _download_install(self._model_name)
        self._model = TransformerModel.from_pretrained(
            model_name_or_path=_get_translate_path(
                self._model_name,
                _EN_TH_FILE_NAME,
                "models",
            ),
            checkpoint_file="checkpoint.pt",
            data_name_or_path=_get_translate_path(
                self._model_name,
                _EN_TH_FILE_NAME,
                "vocab",
            ),
        )
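
For orientation, a hedged sketch of how a wrapper like this might be used afterwards; the translate method below is hypothetical and assumes the Moses tokenizer prepares input for self._model.translate:

    def translate(self, text: str) -> str:
        # hypothetical helper: pre-tokenize with Moses, then decode with fairseq
        tokenized = " ".join(self._tokenizer.tokenize(text, escape=False))
        return self._model.translate(tokenized)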
Example #2
    def _create_model(cls, checkpoints, device, beam_size, use_fp16):
        model = TransformerModel.build_model(checkpoints.args, checkpoints.task)

        # Custom make_generation_fast_: fairseq's make_generation_fast_ switches
        # the model to eval mode and replaces model.train with a stub that raises,
        # so temporarily no-op eval here and restore both functions afterwards.
        eval_fn, train_fn = model.eval, model.train
        model.eval = lambda: None

        model.make_generation_fast_(
            beamable_mm_beam_size=None if beam_size == 0 else beam_size,
            need_attn=True,  # --print-alignment
        )

        model.eval, model.train = eval_fn, train_fn

        if device is not None:
            torch.cuda.set_device(device)
            model = model.cuda(device)

        if use_fp16:
            model.half()

        return model
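
A hedged example of calling this factory, assuming checkpoints exposes the args and task attributes loaded from a fairseq checkpoint (the Translator class name is illustrative):

# hypothetical call; `checkpoints` comes from fairseq checkpoint loading
model = Translator._create_model(checkpoints, device=0, beam_size=5, use_fp16=True)
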
def load_translate(dataset, testset):
    model = TransformerModel.from_pretrained(
        f'./checkpoint/{dataset}',
        checkpoint_file='checkpoint_best.pt',
        data_name_or_path=f'data-bin/{testset}',
        bpe='sentencepiece',
        sentencepiece_model='./bpe_model/ta.wiki.bpe.vs50000.model')

    model.eval()

    model.cuda()

    with open(f'intermediate_datasets/BPE/{testset}/test.en') as f:
        src_sentences = f.read().splitlines()

    with open(f'datasets/{testset}/test.ta') as f:
        ref_lines = f.read().splitlines()
    hyp_lines = model.translate(tqdm(src_sentences))

    with open(f'generation_results/{dataset}on{testset}.txt', 'w') as f:
        f.writelines(f'{sentence}\n' for sentence in hyp_lines)

    return hyp_lines, ref_lines
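
As a follow-up, the returned hypothesis and reference lines can be scored, for example with sacrebleu (a sketch; the dataset and testset names are placeholders):

import sacrebleu

hyp_lines, ref_lines = load_translate('my_dataset', 'my_testset')
# corpus_bleu expects the references as a list of reference streams
print(sacrebleu.corpus_bleu(hyp_lines, [ref_lines]).score)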
Example #4
    def __init__(self):
        self._model_name = _TH_EN_MODEL_NAME

        _download_install(self._model_name)
        self._model = TransformerModel.from_pretrained(
            model_name_or_path=_get_translate_path(
                self._model_name,
                _TH_EN_FILE_NAME,
                "models",
            ),
            checkpoint_file="checkpoint.pt",
            data_name_or_path=_get_translate_path(
                self._model_name,
                _TH_EN_FILE_NAME,
                "vocab",
            ),
            bpe="sentencepiece",
            sentencepiece_model=_get_translate_path(
                self._model_name,
                _TH_EN_FILE_NAME,
                "bpe",
                "spm.th.model",
            ),
        )
Example #5
def translate(input_file,
              output_file,
              device,
              folder,
              beam_size=3,
              batch_size=256,
              replace_unk=False):
    translator = TransformerModel.from_pretrained(
        folder, checkpoint_file='checkpoint_best.pt', beam=beam_size)
    translator.to(device)
    translator.eval()

    # use context managers so both files are closed (and output flushed) on exit
    with open(input_file, "r") as input_f, open(output_file, "w") as output_f:
        for batch in tqdm(chunked(input_f, batch_size)):
            for src, sentence in zip(batch, translator.translate(batch)):
                if replace_unk:
                    sentence = sentence.replace("<unk>", "")
                    sentence = sentence.replace("▁< unk >", "")
                    sentence = sentence.replace("  ", " ")
                print("Source text: {}".format(src.strip()))
                print("Translation text: {}".format(sentence))
                print(sentence, file=output_f)
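
A minimal sketch of invoking this function (paths and checkpoint folder are assumptions):

translate(
    input_file='data/test.src',
    output_file='data/test.hyp',
    device='cuda:0',
    folder='checkpoints/my_model',  # must contain checkpoint_best.pt
)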
Example #6
def main():
    """
    Pass the path of the source file (in TSV format) on the command line:
    "run translator --source-file=...". The target file, where the German
    output is saved, is hard-coded as TARGET_FILE.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--source-file', required=True)

    args = parser.parse_args()

    source_file = args.source_file

    # check that the source file exists and is a .tsv file
    if not os.path.isfile(source_file) or not source_file.endswith('.tsv'):
        raise Exception(f"{source_file} is not a valid .tsv file")

    en2de = TransformerModel.from_pretrained(
        f'{DATA_DIR}',
        checkpoint_file=f'{DATA_DIR}/model4.pt',
        data_name_or_path=f'{DATA_DIR}',
        bpe='fastbpe',
        bpe_codes=f'{DATA_DIR}/bpecodes',
        tokenizer='moses')

    lines_en = convert_tsv_lines_utf8_en_de(source_file)

    with open(TARGET_FILE, 'w') as target_tsv:
        target_tsv_writer = csv.writer(target_tsv, delimiter='\t')
        for line in lines_en:
            new_line_de = []
            for text_en in line:
                text_de = en2de.translate(text_en)
                new_line_de.append(text_de)
            target_tsv_writer.writerow(new_line_de)

    print('SUCCESS!')
Example #7
import sentencepiece as spm
import re
from fairseq.models.transformer import TransformerModel

sp = spm.SentencePieceProcessor()
sp.load("models/jsec.ja.model")

ja2en = TransformerModel.from_pretrained(
    'checkpoints/98subwords/',
    checkpoint_file='checkpoint_best.pt',
    data_name_or_path='data/bin/98_subwords/')


def raw2subword(text):
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    subwords = sp.EncodeAsPieces(text)
    text = ' '.join(subwords)
    return text


def subword2raw(text):
    text = re.sub(' ', '', text)
    text = re.sub(r'▁', ' ', text)
    text = text[1:]
    return text.capitalize()
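
A quick round-trip sketch of the two helpers above; the actual segmentation depends on models/jsec.ja.model:

pieces = raw2subword('これはテストです。')
restored = subword2raw(pieces)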


'''
def translate(text):
    text = raw2subword(text)
Example #8
 def setUp(self):
     self.task, self.parser = get_dummy_task_and_parser()
     TransformerModel.add_args(self.parser)
     self.args = self.parser.parse_args([])
     self.args.encoder_layers = 2
     self.args.decoder_layers = 1
 def add_args(parser):
     # Models can override this method to add new command-line arguments.
     # Here we'll add some new command-line arguments to configure dropout
     # and the dimensionality of the embeddings and hidden states.
     parser.add_argument(
         '--model-type',
         type=str,
         default='lstm',
         help=
         'Type of encoder and decoder to use: 1) lstm (default), 2) transformer',
     )
     parser.add_argument('--encoder-hidden-dim',
                         type=int,
                         default=256,
                         help="Size of encoder's hidden layer")
     parser.add_argument('--decoder-hidden-dim',
                         type=int,
                         default=256,
                         help="Size of decoder's hidden layer")
     parser.add_argument('--decoder-out-embed-dim',
                         type=int,
                         default=256,
                         help="Size of decoder's output embeddings")
     parser.add_argument('--num-of-inputs',
                         type=int,
                         default=1,
                         help='Number of different input item sequences')
     parser.add_argument(
         '--source-index',
         type=int,
         default=0,
         help=
         'Index of the source among those provided as input (used for training a single task in a multi-task framework)',
     )
     parser.add_argument(
         '--target-index',
         type=int,
         default=1,
         help=
         'Index of the target among those provided as input (used for training a single task in a multi-task framework)',
     )
     parser.add_argument(
         '--match-source-len',
         action='store_true',
         default=False,
         help=
         'For scheduled-sampling decoding, same behavior as for fairseq-generate',
     )
     parser.add_argument(
         '--max-len-a',
         type=float,
         default=0.4,
         help=
         'For scheduled-sampling decoding, same behavior as for fairseq-generate',
     )
     parser.add_argument(
         '--max-len-b',
         type=int,
         default=1,
         help=
         'For scheduled-sampling decoding, same behavior as for fairseq-generate',
     )
     TransformerModel.add_args(parser)
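
For reference, a sketch of wiring this add_args into a fresh parser (assuming it is exposed as a static method on a model class, here called MyModel):

import argparse

parser = argparse.ArgumentParser()
MyModel.add_args(parser)
args = parser.parse_args(['--model-type', 'transformer', '--encoder-hidden-dim', '512'])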
Example #10
import os
from fastapi import FastAPI
from pydantic import BaseModel
from fairseq.models.transformer import TransformerModel

app = FastAPI()


TH_EN_MODEL = 'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0'
EN_TH_MODEL = 'SCB_1M-MT_OPUS+TBASE_en-th_spm-spm_32000-joined_v1.0'


th2en = TransformerModel.from_pretrained(
    model_name_or_path=os.path.join(TH_EN_MODEL, 'models'),
    checkpoint_file='checkpoint.pt',
    data_name_or_path=os.path.join(TH_EN_MODEL, 'vocab'),
    bpe='sentencepiece',
    sentencepiece_vocab=os.path.join(TH_EN_MODEL, 'bpe', 'spm.th.model')
)


en2th = TransformerModel.from_pretrained(
    model_name_or_path=os.path.join(EN_TH_MODEL, 'models'),
    checkpoint_file='checkpoint.pt',
    data_name_or_path=os.path.join(EN_TH_MODEL, 'vocab'),
    bpe='sentencepiece',
    sentencepiece_vocab=os.path.join(EN_TH_MODEL, 'bpe', 'spm.en.model')
)


class Request(BaseModel):
Example #11
 def add_args(parser):
     TransformerModel.add_args(parser)
     parser.add_argument("--full-mask",
                         action="store_true",
                         help="Full masking")
Example #12
def main():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--en2fr", required=True, help="path to en2fr model")
    parser.add_argument(
        "--fr2en", required=True, help="path to fr2en mixture of experts model"
    )
    parser.add_argument(
        "--user-dir", help="path to fairseq examples/translation_moe/src directory"
    )
    parser.add_argument(
        "--num-experts",
        type=int,
        default=10,
        help="(keep at 10 unless using a different model)",
    )
    parser.add_argument(
        "files",
        nargs="*",
        default=["-"],
        help='input files to paraphrase; "-" for stdin',
    )
    args = parser.parse_args()

    if args.user_dir is None:
        args.user_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),  # examples/
            "translation_moe",
            "src",
        )
        if os.path.exists(args.user_dir):
            logging.info("found user_dir:" + args.user_dir)
        else:
            raise RuntimeError(
                "cannot find fairseq examples/translation_moe/src "
                "(tried looking here: {})".format(args.user_dir)
            )

    logging.info("loading en2fr model from:" + args.en2fr)
    en2fr = TransformerModel.from_pretrained(
        model_name_or_path=args.en2fr,
        tokenizer="moses",
        bpe="sentencepiece",
    ).eval()

    logging.info("loading fr2en model from:" + args.fr2en)
    fr2en = TransformerModel.from_pretrained(
        model_name_or_path=args.fr2en,
        tokenizer="moses",
        bpe="sentencepiece",
        user_dir=args.user_dir,
        task="translation_moe",
    ).eval()

    def gen_paraphrases(en):
        fr = en2fr.translate(en)
        return [
            fr2en.translate(fr, inference_step_args={"expert": i})
            for i in range(args.num_experts)
        ]

    logging.info("Type the input sentence and press return:")
    for line in fileinput.input(args.files):
        line = line.strip()
        if len(line) == 0:
            continue
        for paraphrase in gen_paraphrases(line):
            print(paraphrase)
Example #13
    def run(self):
        def tokenize_for_bleu(target):
            target = tokenizer.decode_pieces(target.split())
            if self.target_lang == "ja":
                target = " ".join(
                    map(
                        lambda x: x.split("\t")[0],
                        tagger.parse(target).split("\n")[:-2],
                    ))
            return target

        docs = self.load()
        tagger = MeCab.Tagger()
        tokenizer = spm.SentencePieceProcessor()
        tokenizer.load(self.context_aware_sentencepiece_model)
        translation_models = {}
        for bias, path in self.context_aware_translation_models.items():
            base_path, checkpoint_path = os.path.split(path)
            model = (TransformerModel.from_pretrained(
                base_path,
                checkpoint_file=checkpoint_path).half().cuda().eval())
            model.args.max_source_positions = self.max_source_positions
            model.args.max_target_positions = self.max_target_positions
            translation_models[int(bias)] = model
        args = translation_models[-1].args
        task = translation_models[-1].task
        criterion = task.build_criterion(args)
        results = collections.defaultdict(dict)
        for doc_id, doc in tqdm.tqdm(docs.items(), total=len(docs)):
            parallel_doc = set([
                sent_id for sent_id, score in doc["pairs"]
                if score >= self.score_threhold
            ])
            batches = collections.defaultdict(dict)
            targets = {}
            for sent_id in parallel_doc:
                source, target = [
                    tokenizer.encode_as_pieces(doc[lang][sent_id])
                    for lang in (self.source_lang, self.target_lang)
                ]
                available_index = [
                    index for index in range(0, sent_id)
                    if doc[self.source_lang][index]
                ]
                # context_bias is the parameter which the model is trained with.
                # context_sent_index is the index of the actual used contextual
                # sentence.
                targets[sent_id] = " ".join(target)
                for context_bias, _ in translation_models.items():
                    context_sent_index = None
                    if context_bias != -1:
                        if len(available_index) < context_bias:
                            context_sent_index = -1
                        else:
                            context_sent_index = available_index[-context_bias]
                        source_context = tokenizer.encode_as_pieces(
                            docs[doc_id][self.source_lang][context_sent_index])
                        real_source = source_context + [CONCAT_TOKEN] + source
                    else:
                        real_source = source
                    if real_source and len(
                            real_source) < self.max_source_positions:
                        source_sentence = " ".join(real_source)
                    else:
                        source_sentence = None
                    batches[context_bias][sent_id] = source_sentence
            batch_results = collections.defaultdict(
                lambda: collections.defaultdict(dict))
            for context_bias, batch in batches.items():
                data = [sentence for sentence in batch.values() if sentence]
                if not data:
                    continue
                real_targets = {
                    sent_id: targets[sent_id]
                    for sent_id in batch if batch[sent_id]
                }
                model = translation_models[context_bias]
                args.max_source_positions = self.max_source_positions
                args.max_target_positions = self.max_target_positions
                translated = model.translate(data)
                # Compute BLEU score
                # Make the BLEU negative to ease the results computation
                for trans, (sent_id, target) in zip(translated,
                                                    real_targets.items()):
                    batch_results[sent_id]["bleu"][
                        context_bias] = -sacrebleu.corpus_bleu(
                            tokenize_for_bleu(trans),
                            tokenize_for_bleu(target)).score
                # Compute loss
                src_tokens = [
                    model.src_dict.encode_line(
                        real_source,
                        line_tokenizer=lambda x: x.split(),
                        add_if_not_exist=False,
                    ).long() for real_source in data
                ]
                src_lengths = [tokens.numel() for tokens in src_tokens]
                tgt_tokens = [
                    model.tgt_dict.encode_line(
                        target,
                        line_tokenizer=lambda x: x.split(),
                        add_if_not_exist=False,
                    ).long() for target in real_targets.values()
                ]
                tgt_lengths = [tokens.numel() for tokens in tgt_tokens]
                temp_dataset = LanguagePairDataset(
                    src_tokens,
                    src_lengths,
                    model.src_dict,
                    tgt_tokens,
                    tgt_lengths,
                    left_pad_source=args.left_pad_source,
                    left_pad_target=args.left_pad_target,
                    max_source_positions=self.max_source_positions,
                    max_target_positions=self.max_target_positions,
                )
                reports = collections.defaultdict(list)
                iterator = task.get_batch_iterator(
                    dataset=temp_dataset,
                    max_sentences=self.max_sentences,
                )
                for sample in iterator.next_epoch_itr(shuffle=False):
                    sample["net_input"]["src_tokens"] = sample["net_input"][
                        "src_tokens"].cuda()
                    sample["net_input"]["src_lengths"] = sample["net_input"][
                        "src_lengths"].cuda()
                    sample["net_input"]["prev_output_tokens"] = sample[
                        "net_input"]["prev_output_tokens"].cuda()
                    sample["target"] = sample["target"].cuda()
                    with torch.no_grad():
                        _, _, report = criterion(model.models[0], sample,
                                                 False)
                    for key, value in report.items():
                        reports[key].append(value)
                for key in ("loss", "nll_loss"):
                    for value, (sent_id, _) in zip(torch.cat(reports[key]),
                                                   real_targets.items()):
                        batch_results[sent_id][key][context_bias] = float(
                            value)
            for sent_id, value in batch_results.items():
                results[doc_id][sent_id] = value
        self.dump(dict(results))
 def add_args(parser):
     TransformerModel.add_args(parser)
     parser.add_argument('--share-encoders', action='store_true',
                         help='share encoders across languages')
     parser.add_argument('--share-decoders', action='store_true',
                         help='share decoders across languages')
Example #15
    def __init__(self,
                 vocab: Vocabulary,
                 dataset_reader: DatasetReader,
                 source_embedder: TextFieldEmbedder,
                 lang2_namespace: str = "tokens",
                 use_bleu: bool = True) -> None:
        super().__init__(vocab)
        self._lang1_namespace = lang2_namespace  # TODO: DO NOT HARDCODE IT
        self._lang2_namespace = lang2_namespace

        # TODO: do not hardcode this
        self._backtranslation_src_langs = ["en", "ru"]
        self._coeff_denoising = 1
        self._coeff_backtranslation = 1
        self._coeff_translation = 1

        self._label_smoothing = 0.1

        self._pad_index_lang1 = vocab.get_token_index(DEFAULT_PADDING_TOKEN,
                                                      self._lang1_namespace)
        self._oov_index_lang1 = vocab.get_token_index(DEFAULT_OOV_TOKEN,
                                                      self._lang1_namespace)
        self._end_index_lang1 = self.vocab.get_token_index(
            END_SYMBOL, self._lang1_namespace)

        self._pad_index_lang2 = vocab.get_token_index(DEFAULT_PADDING_TOKEN,
                                                      self._lang2_namespace)
        self._oov_index_lang2 = vocab.get_token_index(DEFAULT_OOV_TOKEN,
                                                      self._lang2_namespace)
        self._end_index_lang2 = self.vocab.get_token_index(
            END_SYMBOL, self._lang2_namespace)

        self._reader = dataset_reader
        self._langs_list = self._reader._langs_list
        self._ae_steps = self._reader._ae_steps
        self._bt_steps = self._reader._bt_steps
        self._para_steps = self._reader._para_steps

        if use_bleu:
            self._bleu = Average()
        else:
            self._bleu = None

        args = ArgsStub()

        transformer_iwslt_de_en(args)

        # build encoder
        if not hasattr(args, 'max_source_positions'):
            args.max_source_positions = 1024
        if not hasattr(args, 'max_target_positions'):
            args.max_target_positions = 1024

        # Dense embedding of source vocab tokens.
        self._source_embedder = source_embedder

        # Dense embedding of vocab words in the target space.
        num_tokens_lang1 = self.vocab.get_vocab_size(self._lang1_namespace)
        num_tokens_lang2 = self.vocab.get_vocab_size(self._lang2_namespace)

        args.share_decoder_input_output_embed = False  # TODO implement shared embeddings

        lang1_dict = DictStub(num_tokens=num_tokens_lang1,
                              pad=self._pad_index_lang1,
                              unk=self._oov_index_lang1,
                              eos=self._end_index_lang1)

        lang2_dict = DictStub(num_tokens=num_tokens_lang2,
                              pad=self._pad_index_lang2,
                              unk=self._oov_index_lang2,
                              eos=self._end_index_lang2)

        # instantiate fairseq classes
        emb_golden_tokens = FairseqEmbedding(num_tokens_lang2,
                                             args.decoder_embed_dim,
                                             self._pad_index_lang2)

        self._encoder = TransformerEncoder(args, lang1_dict,
                                           self._source_embedder)
        self._decoder = TransformerDecoder(args, lang2_dict, emb_golden_tokens)
        self._model = TransformerModel(self._encoder, self._decoder)

        # TODO: do not hardcode max_len_b and beam size
        self._sequence_generator_greedy = FairseqBeamSearchWrapper(
            SequenceGenerator(tgt_dict=lang2_dict, beam_size=1, max_len_b=20))
        self._sequence_generator_beam = FairseqBeamSearchWrapper(
            SequenceGenerator(tgt_dict=lang2_dict, beam_size=7, max_len_b=20))
Example #16
    def build_model(cls, args, task):
        # set any default arguments
        transformer_align(args)

        transformer_model = TransformerModel.build_model(args, task)
        return TransformerAlignModel(transformer_model.encoder, transformer_model.decoder, args)
def main() -> None:
    """
    Main function to read, translate and write data to disk
    """
    args = parse_arguments(subtype="translate")
    # get verbosity
    if args.verbosity == 1:
        logger = logging.getLogger('base')
    else:
        logger = logging.getLogger('root')
    # get batch-size
    batch_size = args.batch_size
    # model subsets
    model_subset = args.model_subset
    # local model glob
    model_checkpoints_glob = args.checkpoints_glob
    # initialize model names
    model_names = []
    # create path dictionary
    path_dict = {
        "wmt": [[
            "./data/wmt19/wmt19.test.truecased.de.ref",
            "./data/wmt19_paraphrased/wmt19-ende-wmtp.ref"
        ]],
        "ar": [[
            "./data/wmt19_paraphrased/wmt19-ende-ar.ref",
            "./data/wmt19_paraphrased/wmt19-ende-arp.ref"
        ]]
    }
    path_dict["both"] = path_dict["wmt"] + path_dict["ar"]
    # define available models for de-en
    if model_subset in ["local", "both"]:
        model_names.extend(glob(model_checkpoints_glob))
    if model_subset in ["hub", "both"]:
        model_names.append("transformer.wmt19.de-en.single_model")
    # loop over respective models
    for model_name in model_names:
        # add rules for loading models
        if model_name == "transformer.wmt19.de-en.single_model":
            model = torch.hub.load("pytorch/fairseq",
                                   model_name,
                                   tokenizer="moses",
                                   bpe="fastbpe")
            model_name = "torch_hub." + model_name
        else:
            model = TransformerModel.from_pretrained(
                os.path.dirname(model_name),
                checkpoint_file=os.path.basename(model_name),
                bpe="fastbpe",
                tokenizer="moses",
                data_name_or_path="./bpe/",
                bpe_codes=os.path.join(os.path.dirname(model_name), "bpe",
                                       "bpe.32000"))
            model_name = "%s.%s.%s" % (
                "local", os.path.basename(os.path.dirname(model_name)),
                os.path.basename(model_name).replace(".pt", ""))
        # disable dropout for prediction
        model.eval()
        # enable GPU hardware acceleration if GPU/CUDA present
        if torch.cuda.is_available():
            model.cuda()
        # log model used in current loop
        logger.info("Translating with model: %s", model_name)
        # loop over paraphrase files
        for input_paths in path_dict[args.wmt_references]:
            base = os.path.basename(input_paths[0])
            # read original de data here
            logger.info("Reading reference data: %s", base)
            de_input_original = read_data(input_paths[0])
            # read de paraphrase data
            logger.info("Reading paraphrased reference data: %s",
                        os.path.basename(input_paths[1]))
            de_input_paraphrased = read_data(input_paths[1])
            # assemble combined input data
            logger.info("Interweaving 'de' input data")
            de_input = interweave(de_input_original, de_input_paraphrased)
            logger.info("Translating and processing to 'en'")
            # translate and process
            store = translate_process(model, de_input, batch_size)
            # modify metadata
            if all(re.search(r"-arp?.ref$", path) for path in input_paths):
                metadata = "wmt19.ar.arp"
            else:
                metadata = "wmt19.wmt.wmtp"
            # write json to disk
            write_to_file(model_name, metadata, store)
Example #18
import random
import os
import time

def get_lines(file):
    reader = open(file)
    train_lines = reader.readlines()
    reader.close()

    return train_lines

lines = get_lines('test.ru.txt') 

from fairseq.models.transformer import TransformerModel
ru2en = TransformerModel.from_pretrained(
  '/home/aleksei/Documents/wmt19.ru-en.ensemble',
  checkpoint_file='model1.pt'
)

result_lines = []
for line in lines:
    translated = ru2en.translate(line)
    print(translated)
    result_lines.append(translated)

print('translated, saving to file...')

with open('output.txt', 'a') as the_file:
    for line in result_lines:
        the_file.write('%s\n' % line)

print('saved')
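
Translating line by line is slow; fairseq's hub interface also accepts a list of sentences, so the loop above could be replaced with a single batched call (a sketch):

result_lines = ru2en.translate([line.strip() for line in lines])
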
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off
     TransformerModel.add_args(parser)
Example #20
 def build_model(self) -> GeneratorHubInterface:
     model = TransformerModel.from_pretrained(**self.model_args)
     model.to(self.device)
     model.eval()
     return model
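
The model_args mapping is expanded directly into from_pretrained, so it might look like this (all values hypothetical):

model_args = {
    'model_name_or_path': 'checkpoints/',
    'checkpoint_file': 'checkpoint_best.pt',
    'data_name_or_path': 'data-bin/',
}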
Example #21
    def run(self):
        def load_doc(doc: List, doc_id: str):
            doc_df = pd.DataFrame(doc)
            for sent_id, row in doc_df.iterrows():
                docs[doc_id]["en"].append(str(row["en_sentence"]).strip())
                docs[doc_id]["ja"].append(str(row["ja_sentence"]).strip())
                docs[doc_id]["pairs"].append((sent_id, 1.0))

        docs = collections.defaultdict(lambda: collections.defaultdict(list))
        lang1, lang2 = self.data_langs
        if self.dataset_name == "jiji":
            read_raw_jiji(docs, self.source_path)
        elif not isinstance(self.source_path, str):
            docs = read_seperate_files(
                self.dataset_name, self.source_path, self.sentence_level
            )
        elif os.path.splitext(self.source_path)[1] == ".tsv":
            docs = read_tsv_file(
                self.dataset_name,
                self.source_path,
                self.data_langs,
                self.sentence_level,
            )
        else:
            if os.path.isfile(self.source_path):
                with open(self.source_path) as source:
                    data = json.load(source)
                for doc_index, doc in enumerate(data):
                    load_doc(doc, f"{self.dataset_name}_{doc_index}")
            elif os.path.isdir(self.source_path):
                for file_path in glob.glob(self.source_path + "/*.json"):
                    doc_id = os.path.splitext(file_path)[0].split("/")[-1]
                    with open(file_path) as source:
                        load_doc(json.load(source), doc_id)
        if not self.sentence_level:
            for _, doc in docs.items():
                doc[lang1].append(" ")
                doc[lang2].append(" ")

        # Add translated source to the data
        if self.translation_model_name:
            langs = list(self.translation_models.keys())
            source_target_dict = {
                lang: langs[1 - index] for index, lang in enumerate(langs)
            }
            translation_models = {}
            for source, path in self.translation_models.items():
                base_path, checkpoint_path = os.path.split(path)
                model = TransformerModel.from_pretrained(
                    base_path, checkpoint_file=checkpoint_path
                )
                model.to("cuda")
                spm_processor = spm.SentencePieceProcessor()
                spm_processor.load(self.sentencepiece_models[source])
                translation_models[source] = (model, spm_processor)
            for doc_id, doc in tqdm.tqdm(docs.items(), total=len(docs)):
                for lang in translation_models.keys():
                    model = translation_models[source_target_dict[lang]][0]
                    tokenizer = translation_models[source_target_dict[lang]][1]
                    detokenizer = translation_models[lang][1]
                    sources = []
                    no_translation = {}
                    for index, sent in enumerate(doc[source_target_dict[lang]]):
                        if not sent or sent == " ":
                            no_translation[index] = sent
                        else:
                            sources.append(sent)
                    targets = [
                        detokenizer.decode_pieces(target.split())
                        for target in model.translate(
                            [
                                " ".join(tokenizer.encode_as_pieces(source))
                                for source in sources
                            ]
                        )
                    ]
                    for sent_id, sent in no_translation.items():
                        targets.insert(sent_id, sent)
                    doc[f"{lang}_translated"] = targets
        self.dump(dict(docs))
Example #22
"""
Use AIResearch MT model easily
"""

import os

os.system('pip install sentencepiece')
os.system('pip install git+https://github.com/pytorch/fairseq@6f6461b')

from fairseq.models.transformer import TransformerModel

# download model
url = 'https://github.com/vistec-AI/model-releases/releases/download/SCB_1M%2BTBASE_v1.0/SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz'
os.system(f'curl -L {url} | tar xz')

model = TransformerModel.from_pretrained(
    model_name_or_path=
    'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0/models/',
    checkpoint_file='checkpoint.pt',
    data_name_or_path=
    'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0/vocab/',
    bpe='sentencepiece',
    sentencepiece_vocab=
    'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.th.model')

# function th2en.translate (the model translates Thai to English)
translate = model.translate
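
Example call; the input string is illustrative:

print(translate('สวัสดีครับ'))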
Example #23
        sys.exit("'BPE codes' argument missing! Should be subword-nmt created with learn_bpe.py")

    if len(sys.argv) > 4:
        input_file = os.path.abspath(sys.argv[4])
    else:
        sys.exit("'Input text' argument missing!")

    if len(sys.argv) > 5:
        output_file = os.path.abspath(sys.argv[5])
    else:
        sys.exit("'Output text' argument missing!")

    with open(input_file, 'r') as f:
        text = f.read().strip().splitlines()

    fout = open(output_file, 'x')

    nopuncts2puncts = TransformerModel.from_pretrained(
        model_path,
        checkpoint_file='checkpoint_best.pt',
        data_name_or_path=data_path,
        bpe='subword_nmt',
        bpe_codes=bpe_codes
    )

    # Punctuate
    textout = nopuncts2puncts.translate(text)

    fout.write('\n'.join(textout))
    fout.close()
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "multi" in self.config.n_model:
            from fairseq.models.transformer import TransformerModel

            from pororo.tasks.utils.tokenizer import CustomTokenizer

            load_dict = download_or_load(
                f"transformer/{self.config.n_model}",
                "multi",
            )

            model = (TransformerModel.from_pretrained(
                model_name_or_path=load_dict.path,
                checkpoint_file=f"{self.config.n_model}.pt",
                data_name_or_path=load_dict.dict_path,
                source_lang=load_dict.src_dict,
                target_lang=load_dict.tgt_dict,
            ).eval().to(device))

            tokenizer = CustomTokenizer.from_file(
                vocab_filename=f"{load_dict.src_tok}/vocab.json",
                merges_filename=f"{load_dict.src_tok}/merges.txt",
            )

            return PororoTransformerTransMulti(
                model,
                self.config,
                tokenizer,
            )

        if "transformer" in self.config.n_model:
            from fairseq.models.transformer import TransformerModel

            load_dict = download_or_load(
                f"transformer/{self.config.n_model}",
                self.config.lang,
            )

            tokenizer = None
            model = (TransformerModel.from_pretrained(
                model_name_or_path=load_dict.path,
                checkpoint_file=f"{self.config.n_model}.pt",
                data_name_or_path=load_dict.dict_path,
                source_lang=load_dict.src_dict,
                target_lang=load_dict.tgt_dict,
            ).eval().to(device))

            if self.config.lang != "zh":
                from pororo.tasks.utils.tokenizer import CustomTokenizer

                tokenizer = CustomTokenizer.from_file(
                    vocab_filename=f"{load_dict.src_tok}/vocab.json",
                    merges_filename=f"{load_dict.src_tok}/merges.txt",
                )

            return PororoTransformerParaphrase(model, self.config, tokenizer)
Example #25
    def add_args(parser):
        TransformerModel.add_args(parser)

        # Arguments related to parameter initialization
        parser.add_argument('--apply-bert-init', action='store_true',
                            help='use custom param initialization for BERT')
Example #26
from pythainlp.tokenize import word_tokenize as th_word_tokenize
from functools import partial
from tqdm import tqdm
import torch
import os

# imports required by the code below (missing from the original fragment)
from sacremoses import MosesTokenizer, MosesDetokenizer
from fairseq.models.transformer import TransformerModel

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# %%
en_word_tokenize = MosesTokenizer('en')
en_word_detokenize = MosesDetokenizer('en')

en2th_word2bpe = TransformerModel.from_pretrained(
    model_name_or_path=
    '/home/users/v-sumeth/AIS/Vistech_SCB_mt_models/SCB_1M+TBASE_en-th_moses-newmm_space_130000-130000_v1.0/models/',
    checkpoint_file='checkpoint.pt',
    data_name_or_path=
    '/home/users/v-sumeth/AIS/Vistech_SCB_mt_models/SCB_1M+TBASE_en-th_moses-newmm_space_130000-130000_v1.0/vocab/'
)

th_word_tokenize = partial(th_word_tokenize, keep_whitespace=False)

th2en = TransformerModel.from_pretrained(
    model_name_or_path=
    "/home/users/v-sumeth/AIS/Vistech_SCB_mt_models/SCB_1M+TBASE_th-en_newmm-moses_130000-130000_v1.0/models",
    checkpoint_file='checkpoint.pt',
    data_name_or_path=
    "/home/users/v-sumeth/AIS/Vistech_SCB_mt_models/SCB_1M+TBASE_th-en_newmm-moses_130000-130000_v1.0/vocab",
)

en2th_word2bpe.to(torch.device('cuda'))
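
A hedged sketch of one en-th translation with this setup, pre-tokenizing with Moses as the model name (moses-newmm) suggests:

src = ' '.join(en_word_tokenize.tokenize('Hello world', escape=False))
print(en2th_word2bpe.translate(src))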
Example #27
def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--en2fr', required=True, help='path to en2fr model')
    parser.add_argument('--fr2en',
                        required=True,
                        help='path to fr2en mixture of experts model')
    parser.add_argument(
        '--user-dir',
        help='path to fairseq examples/translation_moe/src directory')
    parser.add_argument('--num-experts',
                        type=int,
                        default=10,
                        help='(keep at 10 unless using a different model)')
    parser.add_argument('files',
                        nargs='*',
                        default=['-'],
                        help='input files to paraphrase; "-" for stdin')
    args = parser.parse_args()

    if args.user_dir is None:
        args.user_dir = os.path.join(
            os.path.dirname(os.path.dirname(
                os.path.abspath(__file__))),  # examples/
            'translation_moe',
            'src',
        )
        if os.path.exists(args.user_dir):
            logging.info('found user_dir:' + args.user_dir)
        else:
            raise RuntimeError(
                'cannot find fairseq examples/translation_moe/src '
                '(tried looking here: {})'.format(args.user_dir))

    logging.info('loading en2fr model from:' + args.en2fr)
    en2fr = TransformerModel.from_pretrained(
        model_name_or_path=args.en2fr,
        tokenizer='moses',
        bpe='sentencepiece',
    ).eval()

    logging.info('loading fr2en model from:' + args.fr2en)
    fr2en = TransformerModel.from_pretrained(
        model_name_or_path=args.fr2en,
        tokenizer='moses',
        bpe='sentencepiece',
        user_dir=args.user_dir,
        task='translation_moe',
    ).eval()

    def gen_paraphrases(en):
        fr = en2fr.translate(en)
        return [
            fr2en.translate(fr, inference_step_args={'expert': i})
            for i in range(args.num_experts)
        ]

    logging.info('Type the input sentence and press return:')
    for line in fileinput.input(args.files):
        line = line.strip()
        if len(line) == 0:
            continue
        for paraphrase in gen_paraphrases(line):
            print(paraphrase)
 def add_args(parser):
     TransformerModel.add_args(parser)
     parser.add_argument('--user-mode', type=str, help='user-mode')
     parser.add_argument('--results-dir', type=str, help='results-dir')
Example #29
    def build_model(cls, args, task):
        """Build a new model instance."""
        # from fairseq.tasks.multilingual_translation import MultilingualTranslationTask
        # assert isinstance(task, MultilingualTranslationTask)

        # make sure all arguments are present in older models
        base_architecture(args)

        if args.share_encoders:
            args.share_encoder_embeddings = True

        ### nat model
        # build shared embeddings (if applicable)
        src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
        if args.share_all_embeddings:
            if src_dict != tgt_dict:
                raise ValueError("--share-all-embeddings requires a joined dictionary")
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
                )
            if args.decoder_embed_path and (
                args.decoder_embed_path != args.encoder_embed_path
            ):
                raise ValueError(
                    "--share-all-embeddings not compatible with --decoder-embed-path"
                )
            encoder_embed_tokens = TransformerModel.build_embedding(
                args, src_dict, args.encoder_embed_dim, args.encoder_embed_path
            )
            decoder_embed_tokens = encoder_embed_tokens
            args.share_decoder_input_output_embed = True
        else:
            encoder_embed_tokens = TransformerModel.build_embedding(
                args, src_dict, args.encoder_embed_dim, args.encoder_embed_path
            )
            decoder_embed_tokens = TransformerModel.build_embedding(
                args, tgt_dict, args.decoder_embed_dim, args.decoder_embed_path
            )


        student_cls = ARCH_MODEL_REGISTRY[args.student_arch]
        encoder = student_cls.build_encoder(args, src_dict, encoder_embed_tokens)
        decoder = student_cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
        student = student_cls(args, encoder, decoder)

        teacher_cls = ARCH_MODEL_REGISTRY[args.teacher_arch]
        if not issubclass(teacher_cls, NATransformerModel):
            teacher_cls = PatchedTransformerModel

        teacher_encoder = teacher_cls.build_encoder(
            args, src_dict,
            encoder_embed_tokens if args.share_encoder_embeddings else TransformerModel.build_embedding(
                args, src_dict, args.encoder_embed_dim, args.encoder_embed_path
                )
            )
        teacher_decoder = teacher_cls.build_decoder(
            args, tgt_dict,
            decoder_embed_tokens if args.share_decoder_embeddings else TransformerModel.build_embedding(
                args, tgt_dict, args.decoder_embed_dim, args.decoder_embed_path
                )
            )
        teacher = teacher_cls(args, teacher_encoder, teacher_decoder)

        return cls(args, student, teacher)
Example #30
# -*- coding:utf-8 -*-
"""
-------------------------------------------------
Project Name: toolkits
File Name: mt_demo.py
Author: gaoyw
Create Date: 2021/1/22
-------------------------------------------------
"""

from fairseq.models.transformer import TransformerModel
zh2en = TransformerModel.from_pretrained(
    '/path/to/checkpoints',
    checkpoint_file='checkpoint_best.pt',
    data_name_or_path='data-bin/wmt17_zh_en_full',
    bpe='subword_nmt',
    bpe_codes='data-bin/wmt17_zh_en_full/zh.code')
print(zh2en.translate('你好 世界'))  # print the English translation