def clicked_handler(self): info("WorkerStatusButton clicked_handler") if self.status == self.Status.NOT_STARTED: self.worker_thread = WorkerThread(self.bound_worker_function) self.worker_thread.finished.connect(self.finished_handler) self.set_status(self.Status.RUNNING) self.worker_thread.start()
def translate_tags(underlying_translation, tag):
    """Translate an ITag or str

    Recursively takes either an ITag or a str, modifies it in place, and
    returns the translated tag tree.

    Args:
        underlying_translation (translate.ITranslation): The translation to apply
        tag (ITag or str): The tag tree to translate

    Returns:
        ITag or str: The translated tag tree
    """
    if type(tag) is str:
        return translate_preserve_formatting(underlying_translation, tag)
    elif tag.translateable is False:
        return tag
    elif depth(tag) == 2:
        tag_injection = inject_tags_inference(underlying_translation, tag)
        if tag_injection is not None:
            info("translate_tags", "tag injection successful")
            return tag_injection
    else:
        tag.children = [
            translate_tags(underlying_translation, child) for child in tag.children
        ]
    return tag
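# Worked example of the dispatch above (tree shapes illustrative; ITag and
# depth are defined elsewhere in this module). For a depth-2 tree like
# <p>Hello <b>world</b></p>, translate_tags attempts inject_tags_inference:
# the outer text "Hello world" is translated whole and the translated <b>
# child is re-injected into it. Deeper trees skip injection and recurse into
# their children, translating str leaves with translate_preserve_formatting.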
def get(url, retry_count=3):
    """Downloads data from a url and returns it

    Args:
        url (str): The url to download (http, https)
        retry_count (int): The number of retries to attempt if the initial
            download fails. If retry_count is 0 the download will only be
            attempted once.

    Returns:
        bytes: The downloaded data, None is returned if the download fails
    """
    if get_protocol(url) not in supported_protocols:
        return None
    info(f"Downloading {url}")
    download_attempts_count = 0
    while download_attempts_count <= retry_count:
        try:
            response = urllib.request.urlopen(url)
            data = response.read()
            info(f"Got {url}")
            return data
        except Exception as err:
            download_attempts_count += 1
            error(err)
    return None
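# Hedged usage sketch for get() (URL is illustrative): download with up to
# two retries after the first failure, falling back to None.
data = get("https://example.com/packages/index.json", retry_count=2)
if data is None:
    error("download failed")
else:
    info(f"downloaded {len(data)} bytes")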
def apply_packaged_translation(pkg, input_text, translator, num_hypotheses=4):
    """Applies the translation in pkg to translate input_text.

    Args:
        pkg (Package): The package that provides the translation.
        input_text (str): The text to be translated.
        translator (ctranslate2.Translator): The CTranslate2 Translator
        num_hypotheses (int): The number of hypotheses to generate

    Returns:
        [Hypothesis]: A list of Hypothesis objects for translating input_text
    """
    info('apply_packaged_translation')
    sp_model_path = str(pkg.package_path / 'sentencepiece.model')
    sp_processor = spm.SentencePieceProcessor(model_file=sp_model_path)
    stanza_pipeline = stanza.Pipeline(lang=pkg.from_code,
                                      dir=str(pkg.package_path / 'stanza'),
                                      processors='tokenize',
                                      use_gpu=False,
                                      logging_level='WARNING')
    stanza_sbd = stanza_pipeline(input_text)
    sentences = [sentence.text for sentence in stanza_sbd.sentences]
    info('sentences', sentences)
    tokenized = [
        sp_processor.encode(sentence, out_type=str) for sentence in sentences
    ]
    info('tokenized', tokenized)
    BATCH_SIZE = 32
    assert len(sentences) <= BATCH_SIZE
    translated_batches = translator.translate_batch(
        tokenized,
        replace_unknowns=True,
        max_batch_size=BATCH_SIZE,
        beam_size=num_hypotheses,
        num_hypotheses=num_hypotheses,
        length_penalty=0.2)
    info('translated_batches', translated_batches)

    # Build hypotheses
    value_hypotheses = []
    for i in range(num_hypotheses):
        translated_tokens = []
        cumulative_score = 0
        for translated_batch in translated_batches:
            translated_tokens += translated_batch[i]['tokens']
            cumulative_score += translated_batch[i]['score']
        detokenized = ''.join(translated_tokens)
        detokenized = detokenized.replace('▁', ' ')
        value = detokenized
        if len(value) > 0 and value[0] == ' ':
            # Remove space at the beginning of the translation added
            # by the tokenizer.
            value = value[1:]
        hypothesis = Hypothesis(value, cumulative_score)
        value_hypotheses.append(hypothesis)
    info('value_hypotheses', value_hypotheses)
    return value_hypotheses
def parse_fewshot_response(response_text):
    response = response_text.split(FEWSHOT_BOUNDARY_TOKEN)
    info("parse_fewshot_response", response)
    if len(response) < 2:
        return None
    response = response[-2].split("\n")
    if len(response) < 2:
        return None
    return response[-1]
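# Worked example of parse_fewshot_response (the boundary token value is an
# assumption; the real FEWSHOT_BOUNDARY_TOKEN is defined elsewhere):
#
#   FEWSHOT_BOUNDARY_TOKEN = "-" * 20
#   response_text = (
#       "...fewshot examples..." + FEWSHOT_BOUNDARY_TOKEN
#       + "Translation:\nHola mundo" + FEWSHOT_BOUNDARY_TOKEN + "trailing text"
#   )
#   parse_fewshot_response(response_text)  # -> "Hola mundo"
#
# Splitting on the boundary token isolates the second-to-last section (the
# model's completion), and the last line of that section is the answer.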
def apply_packaged_translation(pkg, input_text, translator, nresults):
    """Applies the translation in pkg to translate input_text.

    Args:
        pkg (Package): The package that provides the translation.
        input_text (str): The text to be translated.
        translator (ctranslate2.Translator): The CTranslate2 Translator
        nresults (int): The number of translation hypotheses to generate

    Returns:
        [str]: The translated text, one string per hypothesis.
    """
    info('apply_packaged_translation')
    sp_model_path = str(pkg.package_path / 'sentencepiece.model')
    sp_processor = spm.SentencePieceProcessor(model_file=sp_model_path)
    stanza_pipeline = stanza.Pipeline(lang=pkg.from_code,
                                      dir=str(pkg.package_path / 'stanza'),
                                      processors='tokenize',
                                      use_gpu=False,
                                      logging_level='WARNING')
    stanza_sbd = stanza_pipeline(input_text)
    sentences = [sentence.text for sentence in stanza_sbd.sentences]
    info('sentences', sentences)
    tokenized = [
        sp_processor.encode(sentence, out_type=str) for sentence in sentences
    ]
    info('tokenized', tokenized)
    translated_batches = translator.translate_batch(tokenized,
                                                    replace_unknowns=True,
                                                    max_batch_size=32,
                                                    beam_size=nresults,
                                                    num_hypotheses=nresults,
                                                    length_penalty=0.2)
    info('translated_batches', translated_batches)
    result_batches = []
    for i in range(nresults):
        translated_tokens = []
        for translated_batch in translated_batches:
            translated_tokens += translated_batch[i]['tokens']
        detokenized = ''.join(translated_tokens)
        detokenized = detokenized.replace('▁', ' ')
        to_return = detokenized
        if len(to_return) > 0 and to_return[0] == ' ':
            # Remove space at the beginning of the translation added
            # by the tokenizer.
            to_return = to_return[1:]
        result_batches.append(to_return)
    info('result_batches', result_batches)
    return result_batches
def multi_translate(self, input_text, nresults=4):
    new_cache = dict()  # 'text': ['t1'...('tN')]
    paragraphs = self.split_into_paragraphs(input_text)
    translated_paragraphs = []
    for paragraph in paragraphs:
        translated_paragraph = self.cache.get(paragraph)
        if translated_paragraph is None:
            translated_paragraph = self.underlying.multi_translate(
                paragraph, nresults)
        new_cache[paragraph] = translated_paragraph
        translated_paragraphs.append(translated_paragraph)
    self.cache = new_cache
    info("cached translated_paragraphs", translated_paragraphs)
    pre_combine_paragraphs = [[s[i] for s in translated_paragraphs]
                              for i in range(nresults)]
    info("cached pre_combine_paragraphs", pre_combine_paragraphs)
    return self.combine_paragraphs(pre_combine_paragraphs, nresults)
def process_seq2seq_sbd(input_text, sbd_translated_guess):
    sbd_translated_guess_index = sbd_translated_guess.find(SENTENCE_BOUNDARY_TOKEN)
    if sbd_translated_guess_index != -1:
        sbd_translated_guess = sbd_translated_guess[:sbd_translated_guess_index]
        info("sbd_translated_guess:", sbd_translated_guess)
        best_index = None
        best_ratio = 0.0
        for i in range(len(input_text)):
            candidate_sentence = input_text[:i]
            sm = SequenceMatcher()
            sm.set_seqs(candidate_sentence, sbd_translated_guess)
            ratio = sm.ratio()
            if best_index is None or ratio > best_ratio:
                best_index = i
                best_ratio = ratio
        return best_index
    else:
        return -1
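# Standalone sketch of the fuzzy prefix search used above (difflib is in the
# standard library; the example strings are illustrative, not from the source):
from difflib import SequenceMatcher

guess = "Hello world."                    # seq2seq model's guess at sentence one
text = "Hello world. And a second one."   # full source text
best_index, best_ratio = None, 0.0
for i in range(len(text)):
    sm = SequenceMatcher()
    sm.set_seqs(text[:i], guess)
    ratio = sm.ratio()
    if best_index is None or ratio > best_ratio:
        best_index, best_ratio = i, ratio
print(best_index)  # 12: just past "Hello world.", the prefix most like guess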
def detect_sentence(input_text, sbd_translation, sentence_guess_length=150):
    """Given input text, return the index after the end of the first sentence.

    Args:
        input_text (str): The text to detect the first sentence of.
        sbd_translation (translate.ITranslation): An ITranslation for detecting sentences.
        sentence_guess_length (int): Estimated number of chars > than most sentences.

    Returns:
        int: The index of the character after the end of the sentence.
            -1 if not found.
    """
    # TODO: Cache
    sentence_guess = input_text[:sentence_guess_length]
    info("sentence_guess:", sentence_guess)
    sbd_translated_guess = sbd_translation.translate(
        DETECT_SENTENCE_BOUNDARIES_TOKEN + sentence_guess
    )
    return process_seq2seq_sbd(input_text, sbd_translated_guess)
def hypotheses(self, input_text, num_hypotheses):
    if self.translator is None:
        model_path = str(self.pkg.package_path / 'model')
        self.translator = ctranslate2.Translator(model_path)
    paragraphs = ITranslation.split_into_paragraphs(input_text)
    info("paragraphs", paragraphs)
    translated_paragraphs = []
    for paragraph in paragraphs:
        translated_paragraphs.append(
            apply_packaged_translation(self.pkg, paragraph, self.translator,
                                       num_hypotheses))
    info("translated_paragraphs", translated_paragraphs)

    # Construct new hypotheses using all paragraphs
    hypotheses_to_return = [Hypothesis('', 0) for i in range(num_hypotheses)]
    for i in range(num_hypotheses):
        for translated_paragraph in translated_paragraphs:
            value = ITranslation.combine_paragraphs([
                hypotheses_to_return[i].value, translated_paragraph[i].value
            ])
            score = hypotheses_to_return[i].score + translated_paragraph[i].score
            hypotheses_to_return[i] = Hypothesis(value, score)
        hypotheses_to_return[i].value = hypotheses_to_return[i].value.lstrip('\n')
    info('hypotheses_to_return', hypotheses_to_return)
    return hypotheses_to_return
def detect_sentence(input_text, sentence_guess_length=150):
    """Given input text, return the index after the end of the first sentence.

    Args:
        input_text (str): The text to detect the first sentence of.
        sentence_guess_length (int): Estimated number of chars > than most sentences.

    Returns:
        int: The index of the character after the end of the sentence.
            -1 if not found.
    """
    # TODO: Cache
    sbd_translation = get_sbd_translation()
    sentence_guess = input_text[:sentence_guess_length]
    info('sentence_guess', sentence_guess)
    sbd_translated_guess = sbd_translation.translate(
        DETECT_SENTENCE_BOUNDARIES_TOKEN + sentence_guess)
    sbd_translated_guess_index = sbd_translated_guess.find(
        SENTENCE_BOUNDARY_TOKEN)
    if sbd_translated_guess_index != -1:
        sbd_translated_guess = sbd_translated_guess[:sbd_translated_guess_index]
        info('sbd_translated_guess', sbd_translated_guess)
        best_index = None
        best_ratio = 0.0
        for i in range(len(input_text)):
            candidate_sentence = input_text[:i]
            sm = SequenceMatcher()
            sm.set_seqs(candidate_sentence, sbd_translated_guess)
            ratio = sm.ratio()
            if best_index is None or ratio > best_ratio:
                best_index = i
                best_ratio = ratio
        return best_index
    else:
        return -1
def multi_translate(self, input_text, nresults=4):
    if self.translator is None:
        model_path = str(self.pkg.package_path / 'model')
        self.translator = ctranslate2.Translator(model_path)
    paragraphs = self.split_into_paragraphs(input_text)
    info("paragraphs", paragraphs)
    translated_paragraphs = []
    for paragraph in paragraphs:
        translated_paragraphs.append(
            apply_packaged_translation(self.pkg, paragraph, self.translator,
                                       nresults))
    info("translated_paragraphs", translated_paragraphs)
    pre_combine_paragraphs = [[s[i] for s in translated_paragraphs]
                              for i in range(nresults)]
    info("pre_combine_paragraphs", pre_combine_paragraphs)
    return self.combine_paragraphs(pre_combine_paragraphs, nresults)
def apply_packaged_translation(pkg, input_text, translator, num_hypotheses=4):
    """Applies the translation in pkg to translate input_text.

    Args:
        pkg (Package): The package that provides the translation.
        input_text (str): The text to be translated.
        translator (ctranslate2.Translator): The CTranslate2 Translator
        num_hypotheses (int): The number of hypotheses to generate

    Returns:
        [Hypothesis]: A list of Hypothesis objects for translating input_text
    """
    info("apply_packaged_translation")

    # Sentence boundary detection
    if pkg.type == "sbd":
        sentences = [input_text]
    elif settings.stanza_available:
        stanza_pipeline = stanza.Pipeline(
            lang=pkg.from_code,
            dir=str(pkg.package_path / "stanza"),
            processors="tokenize",
            use_gpu=False,
            logging_level="WARNING",
        )
        stanza_sbd = stanza_pipeline(input_text)
        sentences = [sentence.text for sentence in stanza_sbd.sentences]
    else:
        DEFAULT_SENTENCE_LENGTH = 250
        sentences = []
        start_index = 0
        while start_index < len(input_text) - 1:
            detected_sentence_index = detect_sentence(input_text[start_index:])
            if detected_sentence_index == -1:
                # Couldn't find sentence boundary
                sbd_index = start_index + DEFAULT_SENTENCE_LENGTH
            else:
                sbd_index = start_index + detected_sentence_index
            sentences.append(input_text[start_index:sbd_index])
            info("start_index", start_index)
            info("sbd_index", sbd_index)
            info(input_text[start_index:sbd_index])
            start_index = sbd_index
    info("sentences", sentences)

    # Tokenization
    sp_model_path = str(pkg.package_path / "sentencepiece.model")
    sp_processor = spm.SentencePieceProcessor(model_file=sp_model_path)
    tokenized = [sp_processor.encode(sentence, out_type=str) for sentence in sentences]
    info("tokenized", tokenized)

    # Translation
    BATCH_SIZE = 32
    translated_batches = translator.translate_batch(
        tokenized,
        replace_unknowns=True,
        max_batch_size=BATCH_SIZE,
        beam_size=num_hypotheses,
        num_hypotheses=num_hypotheses,
        length_penalty=0.2,
    )
    info("translated_batches", translated_batches)

    # Build hypotheses
    value_hypotheses = []
    for i in range(num_hypotheses):
        translated_tokens = []
        cumulative_score = 0
        for translated_batch in translated_batches:
            translated_tokens += translated_batch[i]["tokens"]
            cumulative_score += translated_batch[i]["score"]
        detokenized = "".join(translated_tokens)
        detokenized = detokenized.replace("▁", " ")
        value = detokenized
        if len(value) > 0 and value[0] == " ":
            # Remove space at the beginning of the translation added
            # by the tokenizer.
            value = value[1:]
        hypothesis = Hypothesis(value, cumulative_score)
        value_hypotheses.append(hypothesis)
    info("value_hypotheses", value_hypotheses)
    return value_hypotheses
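# Usage sketch for apply_packaged_translation, mirroring the hypotheses()
# methods above (pkg stands for a hypothetical installed Package):
translator = ctranslate2.Translator(str(pkg.package_path / "model"))
for hypothesis in apply_packaged_translation(
    pkg, "Hello world. How are you?", translator, num_hypotheses=4
):
    info(hypothesis.value, hypothesis.score)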
def clicked_handler(self):
    info('WorkerStatusButton clicked_handler')
    self.status = self.Status.RUNNING
    self.update_status_indicator()
    self.worker_thread.start()
def finished_handler(self): info("WorkerStatusButton finished_handler") self.set_status(self.Status.DONE)
def finished_handler(self):
    info('WorkerStatusButton finished_handler')
    self.status = self.Status.DONE
    self.update_status_indicator()
def load_available_packages():
    """Deprecated 1.2, use get_available_packages"""
    info(
        "Using deprecated function load_available_packages, use get_available_packages instead"
    )
    return get_available_packages()
def inject_tags_inference(underlying_translation, tag):
    """Returns translated tag tree with injection tags, None if not possible

    tag is only modified in place if tag injection is successful.

    Args:
        underlying_translation (translate.ITranslation): The translation to apply
            to the tags.
        tag (ITag): A depth=2 tag tree to attempt injection on.

    Returns:
        ITag: A translated version of tag, None if not possible to tag inject
    """
    MAX_SEQUENCE_LENGTH = 200
    text = tag.text()
    if len(text) > MAX_SEQUENCE_LENGTH:
        return None

    translated_text = translate_preserve_formatting(underlying_translation, text)

    class InjectionTag:
        """
        Attributes:
            text (str): The text of the tag
            tag (ITag): The depth 1 ITag it represents
            injection_index: The index in the outer translated string that
                this tag can be injected into.
        """

        def __init__(self, text, tag):
            self.text = text
            self.tag = tag
            self.injection_index = None

    injection_tags = []
    for child in tag.children:
        if depth(child) == 1:
            translated = translate_preserve_formatting(
                underlying_translation, child.text()
            )
            injection_tags.append(InjectionTag(translated, child))
        elif type(child) is not str:
            info("inject_tags_inference", "can't inject depth 0 ITag")
            return None

    for injection_tag in injection_tags:
        injection_index = translated_text.find(injection_tag.text)
        if injection_index != -1:
            injection_tag.injection_index = injection_index
        else:
            info(
                "inject_tags_inference",
                "injection text not found in translated text",
                translated_text,
                injection_tag.text,
            )
            return None

    # Check for overlap
    injection_tags.sort(key=lambda x: x.injection_index)
    for i in range(len(injection_tags) - 1):
        injection_tag = injection_tags[i]
        next_injection_tag = injection_tags[i + 1]
        if (
            injection_tag.injection_index + len(injection_tag.text)
            >= next_injection_tag.injection_index
        ):
            info(
                "inject_tags_inference",
                "injection tags overlap",
                injection_tag,
                next_injection_tag,
            )
            return None

    to_return = []
    i = 0
    for injection_tag in injection_tags:
        if i < injection_tag.injection_index:
            to_return.append(translated_text[i:injection_tag.injection_index])
        to_return.append(injection_tag.tag)
        i = injection_tag.injection_index + len(injection_tag.text)
    if i < len(translated_text):
        to_return.append(translated_text[i:])

    tag.children = to_return
    return tag
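# Worked example of the injection above (strings illustrative): for a tag
# <p>The <b>quick</b> fox</p>, the outer text "The quick fox" is translated
# whole, and the depth-1 child "quick" is translated on its own. If each
# child's translation is found in the outer translation and the matches don't
# overlap, the child tags are re-injected at those indices; otherwise None is
# returned and the caller handles the tag without injection.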
def hypotheses(self, input_text, num_hypotheses=1):
    # Split into sentences
    DEFAULT_SENTENCE_LENGTH = 250
    sentences = []
    start_index = 0
    while start_index < len(input_text) - 1:
        prompt = sbd.generate_fewshot_sbd_prompt(input_text[start_index:])
        response = sbd.parse_fewshot_response(self.language_model.infer(prompt))
        detected_sentence_index = sbd.process_seq2seq_sbd(
            input_text[start_index:], response)
        if detected_sentence_index == -1:
            # Couldn't find sentence boundary
            sbd_index = start_index + DEFAULT_SENTENCE_LENGTH
        else:
            sbd_index = start_index + detected_sentence_index
        sentences.append(input_text[start_index:sbd_index])
        info("start_index", start_index)
        info("sbd_index", sbd_index)
        info(input_text[start_index:sbd_index])
        start_index = sbd_index

    to_return = ""
    for sentence in sentences:
        prompt = fewshot.generate_prompt(
            sentence,
            self.from_lang.name,
            self.from_lang.code,
            self.to_lang.name,
            self.to_lang.code,
        )
        info("fewshot prompt", prompt)
        response = self.language_model.infer(prompt)
        info("fewshot response", response)
        result = fewshot.parse_inference(response)
        info("fewshot result", result)
        to_return += result
    return [Hypothesis(to_return, 0)] * num_hypotheses
def get_installed_languages():
    """Returns a list of Languages installed from packages"""
    info("get_installed_languages")

    if settings.model_provider == settings.ModelProvider.OPENNMT:
        packages = package.get_installed_packages()

        # If stanza not available filter for sbd available
        if not settings.stanza_available:
            sbd_packages = list(filter(lambda x: x.type == "sbd", packages))
            sbd_available_codes = set()
            for sbd_package in sbd_packages:
                sbd_available_codes = sbd_available_codes.union(
                    sbd_package.from_codes)
            packages = list(
                filter(lambda x: x.from_code in sbd_available_codes, packages))

        # Filter for translate packages
        packages = list(filter(lambda x: x.type == "translate", packages))

        # Load languages and translations from packages
        language_of_code = dict()
        for pkg in packages:
            if pkg.from_code not in language_of_code:
                language_of_code[pkg.from_code] = Language(
                    pkg.from_code, pkg.from_name)
            if pkg.to_code not in language_of_code:
                language_of_code[pkg.to_code] = Language(
                    pkg.to_code, pkg.to_name)
            from_lang = language_of_code[pkg.from_code]
            to_lang = language_of_code[pkg.to_code]
            translation_to_add = CachedTranslation(
                PackageTranslation(from_lang, to_lang, pkg))
            from_lang.translations_from.append(translation_to_add)
            to_lang.translations_to.append(translation_to_add)

        languages = list(language_of_code.values())

        # Add translations so everything can translate to itself
        for language in languages:
            identity_translation = IdentityTranslation(language)
            language.translations_from.append(identity_translation)
            language.translations_to.append(identity_translation)

        # Pivot through intermediate languages to add translations
        # that don't already exist
        for language in languages:
            keep_adding_translations = True
            while keep_adding_translations:
                keep_adding_translations = False
                for translation in language.translations_from:
                    for translation_2 in translation.to_lang.translations_from:
                        if language.get_translation(translation_2.to_lang) is None:
                            # The language currently doesn't have a way to translate
                            # to this language
                            keep_adding_translations = True
                            composite_translation = CompositeTranslation(
                                translation, translation_2)
                            language.translations_from.append(
                                composite_translation)
                            translation_2.to_lang.translations_to.append(
                                composite_translation)

    elif settings.model_provider == settings.ModelProvider.LIBRETRANSLATE:
        # TODO: Add API key and custom URL support
        libretranslate_api = apis.LibreTranslateAPI()
        supported_languages = (
            libretranslate_api.languages()
        )  # [{"code":"en", "name":"English"}]
        languages = [Language(l["code"], l["name"]) for l in supported_languages]
        for from_lang in languages:
            for to_lang in languages:
                translation = LibreTranslateTranslation(
                    from_lang, to_lang, libretranslate_api)
                from_lang.translations_from.append(translation)
                to_lang.translations_to.append(translation)

    elif settings.model_provider == settings.ModelProvider.OPENAI:
        language_model = apis.OpenAIAPI(settings.openai_api_key)
        # TODO
        languages = [Language("en", "English"), Language("es", "Spanish")]
        for from_lang in languages:
            for to_lang in languages:
                translation = FewShotTranslation(from_lang, to_lang,
                                                 language_model)
                from_lang.translations_from.append(translation)
                to_lang.translations_to.append(translation)

    # Put English first if available so it shows up as the from language in the gui
    en_index = None
    for i, language in enumerate(languages):
        if language.code == "en":
            en_index = i
            break
    english = None
    if en_index is not None:
        english = languages.pop(en_index)
    languages.sort(key=lambda x: x.name)
    if english is not None:
        languages = [english] + languages
    return languages
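# Hedged usage sketch of the public flow (mirrors how this API is typically
# called; which languages are present depends on installed packages):
languages = get_installed_languages()
from_lang = next((l for l in languages if l.code == "en"), None)
to_lang = next((l for l in languages if l.code == "es"), None)
if from_lang is not None and to_lang is not None:
    translation = from_lang.get_translation(to_lang)
    info(translation.translate("Hello world"))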
def apply_packaged_translation(pkg, input_text, translator, num_hypotheses=4):
    """Applies the translation in pkg to translate input_text.

    Args:
        pkg (Package): The package that provides the translation.
        input_text (str): The text to be translated.
        translator (ctranslate2.Translator): The CTranslate2 Translator
        num_hypotheses (int): The number of hypotheses to generate

    Returns:
        [Hypothesis]: A list of Hypothesis objects for translating input_text
    """
    info('apply_packaged_translation')

    # Sentence boundary detection
    if pkg.from_code == SBD_FROM_CODE:
        sentences = [input_text]
    elif settings.experimental_enabled:
        DEFAULT_SENTENCE_LENGTH = 110
        sentences = []
        start_index = 0
        while start_index < len(input_text) - 1:
            # detect_sentence returns -1 when it can't find a boundary, so
            # check its return value before offsetting by start_index.
            detected_sentence_index = detect_sentence(input_text[start_index:])
            if detected_sentence_index == -1:
                sbd_index = start_index + DEFAULT_SENTENCE_LENGTH
            else:
                sbd_index = start_index + detected_sentence_index
            sentences.append(input_text[start_index:sbd_index])
            print('=' * 20)
            print('start_index', start_index)
            print('sbd_index', sbd_index)
            print(input_text[start_index:sbd_index])
            start_index = sbd_index
    else:
        stanza_pipeline = stanza.Pipeline(lang=pkg.from_code,
                                          dir=str(pkg.package_path / 'stanza'),
                                          processors='tokenize',
                                          use_gpu=False,
                                          logging_level='WARNING')
        stanza_sbd = stanza_pipeline(input_text)
        sentences = [sentence.text for sentence in stanza_sbd.sentences]
    info('sentences', sentences)

    # Tokenization
    sp_model_path = str(pkg.package_path / 'sentencepiece.model')
    sp_processor = spm.SentencePieceProcessor(model_file=sp_model_path)
    tokenized = [
        sp_processor.encode(sentence, out_type=str) for sentence in sentences
    ]
    info('tokenized', tokenized)

    # Translation
    BATCH_SIZE = 32
    translated_batches = translator.translate_batch(
        tokenized,
        replace_unknowns=True,
        max_batch_size=BATCH_SIZE,
        beam_size=num_hypotheses,
        num_hypotheses=num_hypotheses,
        length_penalty=0.2)
    info('translated_batches', translated_batches)

    # Build hypotheses
    value_hypotheses = []
    for i in range(num_hypotheses):
        translated_tokens = []
        cumulative_score = 0
        for translated_batch in translated_batches:
            translated_tokens += translated_batch[i]['tokens']
            cumulative_score += translated_batch[i]['score']
        detokenized = ''.join(translated_tokens)
        detokenized = detokenized.replace('▁', ' ')
        value = detokenized
        if len(value) > 0 and value[0] == ' ':
            # Remove space at the beginning of the translation added
            # by the tokenizer.
            value = value[1:]
        hypothesis = Hypothesis(value, cumulative_score)
        value_hypotheses.append(hypothesis)
    info('value_hypotheses', value_hypotheses)
    return value_hypotheses
def get_installed_languages():
    """Returns a list of Languages installed from packages"""
    info('get_installed_languages')
    packages = package.get_installed_packages()

    # Load languages and translations from packages
    language_of_code = dict()
    for pkg in packages:
        if pkg.from_code not in language_of_code:
            language_of_code[pkg.from_code] = Language(pkg.from_code,
                                                       pkg.from_name)
        if pkg.to_code not in language_of_code:
            language_of_code[pkg.to_code] = Language(pkg.to_code, pkg.to_name)
        from_lang = language_of_code[pkg.from_code]
        to_lang = language_of_code[pkg.to_code]
        translation_to_add = CachedTranslation(
            PackageTranslation(from_lang, to_lang, pkg))
        from_lang.translations_from.append(translation_to_add)
        to_lang.translations_to.append(translation_to_add)

    languages = list(language_of_code.values())

    # Add translations so everything can translate to itself
    for language in languages:
        identity_translation = IdentityTranslation(language)
        language.translations_from.append(identity_translation)
        language.translations_to.append(identity_translation)

    # Pivot through intermediate languages to add translations
    # that don't already exist
    for language in languages:
        keep_adding_translations = True
        while keep_adding_translations:
            keep_adding_translations = False
            for translation in language.translations_from:
                for translation_2 in translation.to_lang.translations_from:
                    if language.get_translation(translation_2.to_lang) is None:
                        # The language currently doesn't have a way to translate
                        # to this language
                        keep_adding_translations = True
                        composite_translation = CompositeTranslation(
                            translation, translation_2)
                        language.translations_from.append(
                            composite_translation)
                        translation_2.to_lang.translations_to.append(
                            composite_translation)

    # Put English first if available so it shows up as the from language in the gui
    en_index = None
    for i, language in enumerate(languages):
        if language.code == 'en':
            en_index = i
            break
    english = None
    if en_index is not None:
        english = languages.pop(en_index)
    languages.sort(key=lambda x: x.name)
    if english is not None:
        languages = [english] + languages
    return languages
def generate_fewshot_sbd_prompt(input_text, sentence_guess_length=150):
    sentence_guess = input_text[:sentence_guess_length]
    to_return = fewshot_prompt + "<detect-sentence-boundaries> " + sentence_guess
    info("generate_fewshot_sbd_prompt", to_return)
    return to_return
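# End-to-end sketch of the fewshot sentence boundary flow, as wired together
# in the fewshot hypotheses() method above (language_model is a hypothetical
# stand-in for the APIs used there):
#
#   prompt = generate_fewshot_sbd_prompt(input_text)
#   response = parse_fewshot_response(language_model.infer(prompt))
#   boundary_index = process_seq2seq_sbd(input_text, response)
#   first_sentence = input_text[:boundary_index]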