Code Example #1
File: gui.py Project: niedev/argos-translate
 def clicked_handler(self):
     info("WorkerStatusButton clicked_handler")
     if self.status == self.Status.NOT_STARTED:
         self.worker_thread = WorkerThread(self.bound_worker_function)
         self.worker_thread.finished.connect(self.finished_handler)
         self.set_status(self.Status.RUNNING)
         self.worker_thread.start()
Code Example #2
def translate_tags(underlying_translation, tag):
    """Translate an ITag or str

    Recursively takes either an ITag or a str, modifies it in place, and returns the translated tag tree

    Args:
        underlying_translation (translate.ITranslation): The translation to apply
        tag (ITag or str): The tag tree to translate

    Returns:
        ITag or str: The translated tag tree
    """
    if type(tag) is str:
        return translate_preserve_formatting(underlying_translation, tag)
    elif tag.translateable is False:
        return tag
    elif depth(tag) == 2:
        tag_injection = inject_tags_inference(underlying_translation, tag)
        if tag_injection is not None:
            info("translate_tags", "tag injection successful")
            return tag_injection
    else:
        tag.children = [
            translate_tags(underlying_translation, child)
            for child in tag.children
        ]

    return tag
Code Example #3
def get(url, retry_count=3):
    """Downoads data from a url and returns it

    Args:
        url (str): The url to download (http, https)
        retry_count (int): The number of retries to attempt if the initial download fails.
                If retry_count is 0 the download will only be attempted once.

    Returns:
        bytes: The downloaded data, None is returned if the download fails
    """
    if get_protocol(url) not in supported_protocols:
        return None
    info(f"Downloading {url}")
    download_attempts_count = 0
    while download_attempts_count <= retry_count:
        try:
            response = urllib.request.urlopen(url)
            data = response.read()
            info(f"Got {url}")
            return data
        except Exception as err:
            download_attempts_count += 1
            error(err)
    return None
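
A minimal usage sketch for get() above; the URL and output filename are placeholders, and error() is assumed to be the same logging helper used in the snippet.

# Hypothetical usage of get(); the URL and filename are placeholders.
data = get("https://example.com/translate-en_es.argosmodel", retry_count=2)
if data is None:
    error("download failed after all retry attempts")
else:
    with open("translate-en_es.argosmodel", "wb") as outfile:
        outfile.write(data)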
Code Example #4
def apply_packaged_translation(pkg, input_text, translator, num_hypotheses=4):
    """Applies the translation in pkg to translate input_text.

    Args:
        pkg (Package): The package that provides the translation.
        input_text (str): The text to be translated.
        translator (ctranslate2.Translator): The CTranslate2 Translator
        num_hypotheses (int): The number of hypotheses to generate

    Returns:
        [Hypothesis]: A list of Hypothesis objects for translating input_text

    """

    info('apply_packaged_translation')
    sp_model_path = str(pkg.package_path / 'sentencepiece.model')
    sp_processor = spm.SentencePieceProcessor(model_file=sp_model_path)
    stanza_pipeline = stanza.Pipeline(lang=pkg.from_code,
                                      dir=str(pkg.package_path / 'stanza'),
                                      processors='tokenize',
                                      use_gpu=False,
                                      logging_level='WARNING')
    stanza_sbd = stanza_pipeline(input_text)
    sentences = [sentence.text for sentence in stanza_sbd.sentences]
    info('sentences', sentences)
    tokenized = [
        sp_processor.encode(sentence, out_type=str) for sentence in sentences
    ]
    info('tokenized', tokenized)
    BATCH_SIZE = 32
    assert (len(sentences) <= BATCH_SIZE)
    translated_batches = translator.translate_batch(
        tokenized,
        replace_unknowns=True,
        max_batch_size=BATCH_SIZE,
        beam_size=num_hypotheses,
        num_hypotheses=num_hypotheses,
        length_penalty=0.2)
    info('translated_batches', translated_batches)

    # Build hypotheses
    value_hypotheses = []
    for i in range(num_hypotheses):
        translated_tokens = []
        cumulative_score = 0
        for translated_batch in translated_batches:
            translated_tokens += translated_batch[i]['tokens']
            cumulative_score += translated_batch[i]['score']
        detokenized = ''.join(translated_tokens)
        detokenized = detokenized.replace('▁', ' ')
        value = detokenized
        if len(value) > 0 and value[0] == ' ':
            # Remove space at the beginning of the translation added
            # by the tokenizer.
            value = value[1:]
        hypothesis = Hypothesis(value, cumulative_score)
        value_hypotheses.append(hypothesis)
    info('value_hypotheses', value_hypotheses)
    return value_hypotheses
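
A short sketch of how the returned hypotheses might be consumed; pkg, input_text, and translator stand in for real objects from the surrounding code, and picking the highest-scoring hypothesis is only an illustration, not the project's actual selection logic.

# Hypothetical consumer of apply_packaged_translation().
hypotheses = apply_packaged_translation(pkg, input_text, translator,
                                        num_hypotheses=4)
# Each Hypothesis carries a detokenized value and a cumulative beam score.
best = max(hypotheses, key=lambda h: h.score)
info("best hypothesis", best.value, best.score)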
Code Example #5
def parse_fewshot_response(response_text):
    response = response_text.split(FEWSHOT_BOUNDARY_TOKEN)
    info("parse_fewshot_response", response)
    if len(response) < 2:
        return None
    response = response[-2].split("\n")
    if len(response) < 2:
        return None
    return response[-1]
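
A self-contained sketch of the parsing above. The boundary token value here is a placeholder for the project's FEWSHOT_BOUNDARY_TOKEN constant, and the layout of the model response is assumed.

# Placeholder value for illustration only; the real constant is defined in
# the project's few-shot module.
FEWSHOT_BOUNDARY_TOKEN = "--------------------"

response_text = (
    "English: Hello\n"
    + "Spanish: Hola\n"
    + FEWSHOT_BOUNDARY_TOKEN
    + "\nEnglish: Good morning"
    + "\nSpanish: Buenos días"
    + FEWSHOT_BOUNDARY_TOKEN
)

# The chunk before the final boundary token is kept and its last line is the
# completion for the new prompt, so this prints "Spanish: Buenos días".
print(parse_fewshot_response(response_text))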
Code Example #6
def apply_packaged_translation(pkg, input_text, translator, nresults):
    """Applies the translation in pkg to translate input_text.

    Args:
        pkg (Package): The package that provides the translation.
        input_text (str): The text to be translated.
        translator (ctranslate2.Translator): The CTranslate2 Translator
        nresults (int): The number of translation hypotheses to generate

    Returns:
        [str]: A list of nresults translated texts for input_text.

    """

    info('apply_packaged_translation')
    sp_model_path = str(pkg.package_path / 'sentencepiece.model')
    sp_processor = spm.SentencePieceProcessor(model_file=sp_model_path)
    stanza_pipeline = stanza.Pipeline(lang=pkg.from_code,
                                      dir=str(pkg.package_path / 'stanza'),
                                      processors='tokenize',
                                      use_gpu=False,
                                      logging_level='WARNING')
    stanza_sbd = stanza_pipeline(input_text)
    sentences = [sentence.text for sentence in stanza_sbd.sentences]
    info('sentences', sentences)
    tokenized = [
        sp_processor.encode(sentence, out_type=str) for sentence in sentences
    ]
    info('tokenized', tokenized)
    translated_batches = translator.translate_batch(tokenized,
                                                    replace_unknowns=True,
                                                    max_batch_size=32,
                                                    beam_size=nresults,
                                                    num_hypotheses=nresults,
                                                    length_penalty=0.2)
    info('translated_batches', translated_batches)
    result_batches = []
    for i in range(nresults):
        translated_tokens = []
        for translated_batch in translated_batches:
            translated_tokens += translated_batch[i]['tokens']
        detokenized = ''.join(translated_tokens)
        detokenized = detokenized.replace('▁', ' ')
        to_return = detokenized
        if len(to_return) > 0 and to_return[0] == ' ':
            # Remove space at the beginning of the translation added
            # by the tokenizer.
            to_return = to_return[1:]
        result_batches.append(to_return)
    info('result_batches', result_batches)
    return result_batches
Code Example #7
 def multi_translate(self, input_text, nresults=4):
     new_cache = dict()  # 'text': ['t1'...('tN')]
     paragraphs = self.split_into_paragraphs(input_text)
     translated_paragraphs = []
     for paragraph in paragraphs:
         translated_paragraph = self.cache.get(paragraph)
         if translated_paragraph is None:
             translated_paragraph = self.underlying.multi_translate(
                 paragraph, nresults)
         new_cache[paragraph] = translated_paragraph
         translated_paragraphs.append(translated_paragraph)
     self.cache = new_cache
     info("cached translated_paragraphs", translated_paragraphs)
     pre_combine_paragraphs = [[s[i] for s in translated_paragraphs]
                               for i in range(nresults)]
     info("cached pre_combine_paragraphs", pre_combine_paragraphs)
     return self.combine_paragraphs(pre_combine_paragraphs, nresults)
Code Example #8
def process_seq2seq_sbd(input_text, sbd_translated_guess):
    sbd_translated_guess_index = sbd_translated_guess.find(SENTENCE_BOUNDARY_TOKEN)
    if sbd_translated_guess_index != -1:
        sbd_translated_guess = sbd_translated_guess[:sbd_translated_guess_index]
        info("sbd_translated_guess:", sbd_translated_guess)
        best_index = None
        best_ratio = 0.0
        for i in range(len(input_text)):
            candidate_sentence = input_text[:i]
            sm = SequenceMatcher()
            sm.set_seqs(candidate_sentence, sbd_translated_guess)
            ratio = sm.ratio()
            if best_index is None or ratio > best_ratio:
                best_index = i
                best_ratio = ratio
        return best_index
    else:
        return -1
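
A self-contained illustration of the prefix search in process_seq2seq_sbd: difflib's SequenceMatcher scores every prefix of the source text against the model's guess at the first sentence, and the best-scoring cut point is taken as the boundary. The inputs are made up.

from difflib import SequenceMatcher

# Toy inputs: the "guess" plays the role of the seq2seq model's echo of the
# first sentence, with everything after the boundary token already stripped.
input_text = "Hello world. This is the second sentence."
sbd_translated_guess = "Hello world."

best_index, best_ratio = None, 0.0
for i in range(len(input_text)):
    ratio = SequenceMatcher(None, input_text[:i], sbd_translated_guess).ratio()
    if best_index is None or ratio > best_ratio:
        best_index, best_ratio = i, ratio

print(best_index)  # 12, the index just after the first sentence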
Code Example #9
def detect_sentence(input_text, sbd_translation, sentence_guess_length=150):
    """Given input text, return the index after the end of the first sentence.

    Args:
        input_text (str): The text to detect the first sentence of.
        sbd_translation (translate.ITranslation): An ITranslation for detecting sentences.
        sentence_guess_length (int): Estimated number of characters, expected to be longer than most sentences.

    Returns:
        int: The index of the character after the end of the sentence.
                -1 if not found.
    """
    # TODO: Cache
    sentence_guess = input_text[:sentence_guess_length]
    info("sentence_guess:", sentence_guess)
    sbd_translated_guess = sbd_translation.translate(
        DETECT_SENTENCE_BOUNDARIES_TOKEN + sentence_guess
    )
    return process_seq2seq_sbd(input_text, sbd_translated_guess)
Code Example #10
    def hypotheses(self, input_text, num_hypotheses):
        if self.translator is None:
            model_path = str(self.pkg.package_path / 'model')
            self.translator = ctranslate2.Translator(model_path)
        paragraphs = ITranslation.split_into_paragraphs(input_text)
        info("paragraphs", paragraphs)
        translated_paragraphs = []
        for paragraph in paragraphs:
            translated_paragraphs.append(
                apply_packaged_translation(self.pkg, paragraph,
                                           self.translator, num_hypotheses))
        info("translated_paragraphs", translated_paragraphs)

        # Construct new hypotheses using all paragraphs
        hypotheses_to_return = [
            Hypothesis('', 0) for i in range(num_hypotheses)
        ]
        for i in range(num_hypotheses):
            for translated_paragraph in translated_paragraphs:
                value = ITranslation.combine_paragraphs([
                    hypotheses_to_return[i].value,
                    translated_paragraph[i].value
                ])
                score = hypotheses_to_return[i].score + translated_paragraph[
                    i].score
                hypotheses_to_return[i] = Hypothesis(value, score)
            hypotheses_to_return[i].value = hypotheses_to_return[
                i].value.lstrip('\n')
        info('hypotheses_to_return', hypotheses_to_return)
        return hypotheses_to_return
Code Example #11
def detect_sentence(input_text, sentence_guess_length=150):
    """Given input text, return the index after the end of the first sentence.

    Args:
        input_text (str): The text to detect the first sentence of.
        sentence_guess_length (int): Estimated number of characters, expected to be longer than most sentences.

    Returns:
        int: The index of the character after the end of the sentence.
                -1 if not found.
    """
    # TODO: Cache
    sbd_translation = get_sbd_translation()
    sentence_guess = input_text[:sentence_guess_length]
    info('sentence_guess', sentence_guess)
    sbd_translated_guess = sbd_translation.translate(
        DETECT_SENTENCE_BOUNDARIES_TOKEN + sentence_guess)
    sbd_translated_guess_index = sbd_translated_guess.find(
        SENTENCE_BOUNDARY_TOKEN)
    if sbd_translated_guess_index != -1:
        sbd_translated_guess = sbd_translated_guess[:sbd_translated_guess_index]
        info('sbd_translated_guess', sbd_translated_guess)
        best_index = None
        best_ratio = 0.0
        for i in range(len(input_text)):
            candidate_sentence = input_text[:i]
            sm = SequenceMatcher()
            sm.set_seqs(candidate_sentence, sbd_translated_guess)
            ratio = sm.ratio()
            if best_index is None or ratio > best_ratio:
                best_index = i
                best_ratio = ratio
        return best_index
    else:
        return -1
Code Example #12
 def multi_translate(self, input_text, nresults=4):
     if self.translator is None:
         model_path = str(self.pkg.package_path / 'model')
         self.translator = ctranslate2.Translator(model_path)
     paragraphs = self.split_into_paragraphs(input_text)
     info("paragraphs", paragraphs)
     translated_paragraphs = []
     for paragraph in paragraphs:
         translated_paragraphs.append(
             apply_packaged_translation(self.pkg, paragraph,
                                        self.translator, nresults))
     info("translated_paragraphs", translated_paragraphs)
     pre_combine_paragraphs = [[s[i] for s in translated_paragraphs]
                               for i in range(nresults)]
     info("pre_combine_paragraphs", pre_combine_paragraphs)
     return self.combine_paragraphs(pre_combine_paragraphs, nresults)
Code Example #13
def apply_packaged_translation(pkg, input_text, translator, num_hypotheses=4):
    """Applies the translation in pkg to translate input_text.

    Args:
        pkg (Package): The package that provides the translation.
        input_text (str): The text to be translated.
        translator (ctranslate2.Translator): The CTranslate2 Translator
        num_hypotheses (int): The number of hypotheses to generate

    Returns:
        [Hypothesis]: A list of Hypothesis objects for translating input_text

    """

    info("apply_packaged_translation")

    # Sentence boundary detection
    if pkg.type == "sbd":
        sentences = [input_text]
    elif settings.stanza_available:
        stanza_pipeline = stanza.Pipeline(
            lang=pkg.from_code,
            dir=str(pkg.package_path / "stanza"),
            processors="tokenize",
            use_gpu=False,
            logging_level="WARNING",
        )
        stanza_sbd = stanza_pipeline(input_text)
        sentences = [sentence.text for sentence in stanza_sbd.sentences]
    else:
        DEFAULT_SENTENCE_LENGTH = 250
        sentences = []
        start_index = 0
        while start_index < len(input_text) - 1:
            detected_sentence_index = detect_sentence(input_text[start_index:])
            if detected_sentence_index == -1:
                # Couldn't find sentence boundary
                sbd_index = start_index + DEFAULT_SENTENCE_LENGTH
            else:
                sbd_index = start_index + detected_sentence_index
            sentences.append(input_text[start_index:sbd_index])
            info("start_index", start_index)
            info("sbd_index", sbd_index)
            info(input_text[start_index:sbd_index])
            start_index = sbd_index
    info("sentences", sentences)

    # Tokenization
    sp_model_path = str(pkg.package_path / "sentencepiece.model")
    sp_processor = spm.SentencePieceProcessor(model_file=sp_model_path)
    tokenized = [
        sp_processor.encode(sentence, out_type=str) for sentence in sentences
    ]
    info("tokenized", tokenized)

    # Translation
    BATCH_SIZE = 32
    translated_batches = translator.translate_batch(
        tokenized,
        replace_unknowns=True,
        max_batch_size=BATCH_SIZE,
        beam_size=num_hypotheses,
        num_hypotheses=num_hypotheses,
        length_penalty=0.2,
    )
    info("translated_batches", translated_batches)

    # Build hypotheses
    value_hypotheses = []
    for i in range(num_hypotheses):
        translated_tokens = []
        cumulative_score = 0
        for translated_batch in translated_batches:
            translated_tokens += translated_batch[i]["tokens"]
            cumulative_score += translated_batch[i]["score"]
        detokenized = "".join(translated_tokens)
        detokenized = detokenized.replace("▁", " ")
        value = detokenized
        if len(value) > 0 and value[0] == " ":
            # Remove space at the beginning of the translation added
            # by the tokenizer.
            value = value[1:]
        hypothesis = Hypothesis(value, cumulative_score)
        value_hypotheses.append(hypothesis)
    info("value_hypotheses", value_hypotheses)
    return value_hypotheses
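
A self-contained sketch of the detokenization step used above: SentencePiece marks word boundaries with the "▁" character, so joining a beam's tokens and replacing that marker with a space recovers the surface text. The token list is made up for illustration.

# Example beam output; "▁" is SentencePiece's word-boundary marker.
translated_tokens = ["▁Hola", "▁mund", "o", "."]

detokenized = "".join(translated_tokens).replace("▁", " ")
if detokenized.startswith(" "):
    # Drop the leading space introduced by the first "▁" token.
    detokenized = detokenized[1:]
print(detokenized)  # "Hola mundo."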
Code Example #14
File: gui.py Project: ml-ai-nlp-ir/argos-translate
 def clicked_handler(self):
     info('WorkerStatusButton clicked_handler')
     self.status = self.Status.RUNNING
     self.update_status_indicator()
     self.worker_thread.start()
Code Example #15
File: gui.py Project: niedev/argos-translate
 def finished_handler(self):
     info("WorkerStatusButton finished_handler")
     self.set_status(self.Status.DONE)
Code Example #16
File: gui.py Project: ml-ai-nlp-ir/argos-translate
 def finished_handler(self):
     info('WorkerStatusButton finished_handler')
     self.status = self.Status.DONE
     self.update_status_indicator()
Code Example #17
def load_available_packages():
    """Deprecated 1.2, use get_available_packages"""
    info(
        "Using deprecated function load_available_packages, use get_available_packages instead"
    )
    return get_available_packages()
Code Example #18
def inject_tags_inference(underlying_translation, tag):
    """Returns translated tag tree with injection tags, None if not possible

    tag is only modified in place if tag injection is successful.

    Args:
        underlying_translation (translate.ITranslation): The translation to apply to the tags.
        tag (ITag): A depth=2 tag tree to attempt injection on.
 
    Returns:
        ITag: A translated version of tag, None if not possible to tag inject
    """
    MAX_SEQUENCE_LENGTH = 200

    text = tag.text()
    if len(text) > MAX_SEQUENCE_LENGTH:
        return None

    translated_text = translate_preserve_formatting(underlying_translation,
                                                    text)

    class InjectionTag:
        """

        Attributes:
            text (str): The text of the tag
            tag (ITag): The depth 1 ITag it represents
            injection_index: The index in the outer translated string that
                    this tag can be injected into.
        """
        def __init__(self, text, tag):
            self.text = text
            self.tag = tag
            self.injection_index = None

    injection_tags = []
    for child in tag.children:
        if depth(child) == 1:
            translated = translate_preserve_formatting(underlying_translation,
                                                       child.text())
            injection_tags.append(InjectionTag(translated, child))
        elif type(child) is not str:
            info("inject_tags_inference", "can't inject depth 0 ITag")
            return None

    for injection_tag in injection_tags:
        injection_index = translated_text.find(injection_tag.text)
        if injection_index != -1:
            injection_tag.injection_index = injection_index
        else:
            info(
                "inject_tags_inference",
                "injection text not found in translated text",
                translated_text,
                injection_tag.text,
            )
            return None

    # Check for overlap
    injection_tags.sort(key=lambda x: x.injection_index)
    for i in range(len(injection_tags) - 1):
        injection_tag = injection_tags[i]
        next_injection_tag = injection_tags[i + 1]
        if (injection_tag.injection_index + len(injection_tag.text) >=
                next_injection_tag.injection_index):
            info(
                "inject_tags_inference",
                "injection tags overlap",
                injection_tag,
                next_injection_tag,
            )
            return None

    to_return = []
    i = 0
    for injection_tag in injection_tags:
        if i < injection_tag.injection_index:
            to_return.append(translated_text[i:injection_tag.injection_index])
        to_return.append(injection_tag.tag)
        i = injection_tag.injection_index + len(injection_tag.text)
    if i < len(translated_text):
        to_return.append(translated_text[i:])

    tag.children = to_return

    return tag
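
A simplified, string-only illustration of the splice performed at the end of inject_tags_inference: the translated child text is located inside the full translated sentence, and the surrounding plain-text segments are rebuilt around the re-injected child. The <b> markup here stands in for a depth-1 ITag; all values are made up.

# String-only sketch of the injection splice.
translated_text = "Esta es una frase importante de ejemplo."
injected_child_text = "importante"  # translated text of the depth-1 child
injection_index = translated_text.find(injected_child_text)

if injection_index != -1:
    rebuilt = [
        translated_text[:injection_index],        # plain text before the tag
        "<b>" + injected_child_text + "</b>",     # the re-injected child
        translated_text[injection_index + len(injected_child_text):],
    ]
    print("".join(rebuilt))  # "Esta es una frase <b>importante</b> de ejemplo."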
Code Example #19
    def hypotheses(self, input_text, num_hypotheses=1):
        # Split into sentences
        DEFAULT_SENTENCE_LENGTH = 250
        sentences = []
        start_index = 0
        while start_index < len(input_text) - 1:
            prompt = sbd.generate_fewshot_sbd_prompt(input_text[start_index:])
            response = sbd.parse_fewshot_response(
                self.language_model.infer(prompt))
            detected_sentence_index = sbd.process_seq2seq_sbd(
                input_text[start_index:], response)
            if detected_sentence_index == -1:
                # Couldn't find sentence boundary
                sbd_index = start_index + DEFAULT_SENTENCE_LENGTH
            else:
                sbd_index = start_index + detected_sentence_index
            sentences.append(input_text[start_index:sbd_index])
            info("start_index", start_index)
            info("sbd_index", sbd_index)
            info(input_text[start_index:sbd_index])
            start_index = sbd_index

        to_return = ""
        for sentence in sentences:
            prompt = fewshot.generate_prompt(
                sentence,
                self.from_lang.name,
                self.from_lang.code,
                self.to_lang.name,
                self.to_lang.code,
            )
            info("fewshot prompt", prompt)
            response = self.language_model.infer(prompt)
            info("fewshot response", response)
            result = fewshot.parse_inference(response)
            info("fewshot result", result)
            to_return += result
        return [Hypothesis(to_return, 0)] * num_hypotheses
Code Example #20
def get_installed_languages():
    """Returns a list of Languages installed from packages"""

    info("get_installed_languages")

    if settings.model_provider == settings.ModelProvider.OPENNMT:
        packages = package.get_installed_packages()

        # If stanza not available filter for sbd available
        if not settings.stanza_available:
            sbd_packages = list(filter(lambda x: x.type == "sbd", packages))
            sbd_available_codes = set()
            for sbd_package in sbd_packages:
                sbd_available_codes = sbd_available_codes.union(
                    sbd_package.from_codes)
            packages = list(
                filter(lambda x: x.from_code in sbd_available_codes, packages))

        # Filter for translate packages
        packages = list(filter(lambda x: x.type == "translate", packages))

        # Load languages and translations from packages
        language_of_code = dict()
        for pkg in packages:
            if pkg.from_code not in language_of_code:
                language_of_code[pkg.from_code] = Language(
                    pkg.from_code, pkg.from_name)
            if pkg.to_code not in language_of_code:
                language_of_code[pkg.to_code] = Language(
                    pkg.to_code, pkg.to_name)
            from_lang = language_of_code[pkg.from_code]
            to_lang = language_of_code[pkg.to_code]
            translation_to_add = CachedTranslation(
                PackageTranslation(from_lang, to_lang, pkg))
            from_lang.translations_from.append(translation_to_add)
            to_lang.translations_to.append(translation_to_add)

        languages = list(language_of_code.values())

        # Add translations so everything can translate to itself
        for language in languages:
            identity_translation = IdentityTranslation(language)
            language.translations_from.append(identity_translation)
            language.translations_to.append(identity_translation)

        # Pivot through intermediate languages to add translations
        # that don't already exist
        for language in languages:
            keep_adding_translations = True
            while keep_adding_translations:
                keep_adding_translations = False
                for translation in language.translations_from:
                    for translation_2 in translation.to_lang.translations_from:
                        if language.get_translation(
                                translation_2.to_lang) is None:
                            # The language currently doesn't have a way to translate
                            # to this language
                            keep_adding_translations = True
                            composite_translation = CompositeTranslation(
                                translation, translation_2)
                            language.translations_from.append(
                                composite_translation)
                            translation_2.to_lang.translations_to.append(
                                composite_translation)

    elif settings.model_provider == settings.ModelProvider.LIBRETRANSLATE:
        # TODO: Add API key and custom URL support
        libretranslate_api = apis.LibreTranslateAPI()
        # [{"code":"en", "name":"English"}]
        supported_languages = libretranslate_api.languages()
        languages = [
            Language(l["code"], l["name"]) for l in supported_languages
        ]
        for from_lang in languages:
            for to_lang in languages:
                translation = LibreTranslateTranslation(
                    from_lang, to_lang, libretranslate_api)
                from_lang.translations_from.append(translation)
                to_lang.translations_to.append(translation)

    elif settings.model_provider == settings.ModelProvider.OPENAI:
        language_model = apis.OpenAIAPI(settings.openai_api_key)
        # TODO
        languages = [Language("en", "English"), Language("es", "Spanish")]
        for from_lang in languages:
            for to_lang in languages:
                translation = FewShotTranslation(from_lang, to_lang,
                                                 language_model)
                from_lang.translations_from.append(translation)
                to_lang.translations_to.append(translation)

    # Put English first if available so it shows up as the from language in the gui
    en_index = None
    for i, language in enumerate(languages):
        if language.code == "en":
            en_index = i
            break
    english = None
    if en_index is not None:
        english = languages.pop(en_index)
    languages.sort(key=lambda x: x.name)
    if english is not None:
        languages = [english] + languages

    return languages
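
A hedged usage sketch for get_installed_languages(); it assumes an English-to-Spanish package is installed and that Language.get_translation() and ITranslation.translate() behave as in the surrounding snippets.

# Hypothetical usage; assumes an en -> es package is installed.
languages = get_installed_languages()
from_lang = next(lang for lang in languages if lang.code == "en")
to_lang = next(lang for lang in languages if lang.code == "es")
translation = from_lang.get_translation(to_lang)
print(translation.translate("Hello, world!"))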
Code Example #21
def apply_packaged_translation(pkg, input_text, translator, num_hypotheses=4):
    """Applies the translation in pkg to translate input_text.

    Args:
        pkg (Package): The package that provides the translation.
        input_text (str): The text to be translated.
        translator (ctranslate2.Translator): The CTranslate2 Translator
        num_hypotheses (int): The number of hypotheses to generate

    Returns:
        [Hypothesis]: A list of Hypothesis objects for translating input_text

    """

    info('apply_packaged_translation')

    # Sentence boundary detection
    if pkg.from_code == SBD_FROM_CODE:
        sentences = [input_text]
    elif settings.experimental_enabled:
        DEFAULT_SENTENCE_LENGTH = 110
        sentences = []
        start_index = 0
        while start_index < len(input_text) - 1:
            detected_sentence_index = detect_sentence(input_text[start_index:])
            if detected_sentence_index == -1:
                # Couldn't find sentence boundary
                sbd_index = start_index + DEFAULT_SENTENCE_LENGTH
            else:
                sbd_index = start_index + detected_sentence_index
            sentences.append(input_text[start_index:sbd_index])
            print('=' * 20)
            print('start_index', start_index)
            print('sbd_index', sbd_index)
            print(input_text[start_index:sbd_index])
            start_index = sbd_index
    else:
        stanza_pipeline = stanza.Pipeline(lang=pkg.from_code,
                                          dir=str(pkg.package_path / 'stanza'),
                                          processors='tokenize',
                                          use_gpu=False,
                                          logging_level='WARNING')
        stanza_sbd = stanza_pipeline(input_text)
        sentences = [sentence.text for sentence in stanza_sbd.sentences]
    info('sentences', sentences)

    # Tokenization
    sp_model_path = str(pkg.package_path / 'sentencepiece.model')
    sp_processor = spm.SentencePieceProcessor(model_file=sp_model_path)
    tokenized = [
        sp_processor.encode(sentence, out_type=str) for sentence in sentences
    ]
    info('tokenized', tokenized)

    # Translation
    BATCH_SIZE = 32
    translated_batches = translator.translate_batch(
        tokenized,
        replace_unknowns=True,
        max_batch_size=BATCH_SIZE,
        beam_size=num_hypotheses,
        num_hypotheses=num_hypotheses,
        length_penalty=0.2)
    info('translated_batches', translated_batches)

    # Build hypotheses
    value_hypotheses = []
    for i in range(num_hypotheses):
        translated_tokens = []
        cumulative_score = 0
        for translated_batch in translated_batches:
            translated_tokens += translated_batch[i]['tokens']
            cumulative_score += translated_batch[i]['score']
        detokenized = ''.join(translated_tokens)
        detokenized = detokenized.replace('▁', ' ')
        value = detokenized
        if len(value) > 0 and value[0] == ' ':
            # Remove space at the beginning of the translation added
            # by the tokenizer.
            value = value[1:]
        hypothesis = Hypothesis(value, cumulative_score)
        value_hypotheses.append(hypothesis)
    info('value_hypotheses', value_hypotheses)
    return value_hypotheses
Code Example #22
def get_installed_languages():
    """Returns a list of Languages installed from packages"""

    info('get_installed_languages')

    packages = package.get_installed_packages()

    # Load languages and translations from packages
    language_of_code = dict()
    for pkg in packages:
        if pkg.from_code not in language_of_code:
            language_of_code[pkg.from_code] = Language(pkg.from_code,
                                                       pkg.from_name)
        if pkg.to_code not in language_of_code:
            language_of_code[pkg.to_code] = Language(pkg.to_code, pkg.to_name)
        from_lang = language_of_code[pkg.from_code]
        to_lang = language_of_code[pkg.to_code]
        translation_to_add = CachedTranslation(
            PackageTranslation(from_lang, to_lang, pkg))
        from_lang.translations_from.append(translation_to_add)
        to_lang.translations_to.append(translation_to_add)

    languages = list(language_of_code.values())

    # Add translations so everything can translate to itself
    for language in languages:
        identity_translation = IdentityTranslation(language)
        language.translations_from.append(identity_translation)
        language.translations_to.append(identity_translation)

    # Pivot through intermediate languages to add translations
    # that don't already exist
    for language in languages:
        keep_adding_translations = True
        while keep_adding_translations:
            keep_adding_translations = False
            for translation in language.translations_from:
                for translation_2 in translation.to_lang.translations_from:
                    if language.get_translation(translation_2.to_lang) is None:
                        # The language currently doesn't have a way to translate
                        # to this language
                        keep_adding_translations = True
                        composite_translation = CompositeTranslation(
                            translation, translation_2)
                        language.translations_from.append(
                            composite_translation)
                        translation_2.to_lang.translations_to.append(
                            composite_translation)

    # Put English first if available so it shows up as the from language in the gui
    en_index = None
    for i, language in enumerate(languages):
        if language.code == 'en':
            en_index = i
            break
    english = None
    if en_index is not None:
        english = languages.pop(en_index)
    languages.sort(key=lambda x: x.name)
    if english is not None:
        languages = [english] + languages

    return languages
Code Example #23
def generate_fewshot_sbd_prompt(input_text, sentence_guess_length=150):
    sentence_guess = input_text[:sentence_guess_length]
    to_return = fewshot_prompt + "<detect-sentence-boundaries> " + sentence_guess
    info("generate_fewshot_sbd_prompt", to_return)
    return to_return
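
A hedged sketch of how the few-shot sentence-boundary helpers above could fit together; language_model and its infer() method are placeholders borrowed from the hypotheses() snippet in Code Example #19, and fewshot_prompt is assumed to be the module-level prompt prefix used by generate_fewshot_sbd_prompt.

# Hypothetical wiring of the few-shot SBD helpers; language_model.infer() is
# assumed from the hypotheses() snippet in Code Example #19.
input_text = "First sentence here. Second sentence follows."

prompt = generate_fewshot_sbd_prompt(input_text)
response = parse_fewshot_response(language_model.infer(prompt))
if response is not None:
    boundary_index = process_seq2seq_sbd(input_text, response)
    if boundary_index != -1:
        info("first sentence", input_text[:boundary_index])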