def identify(
        self, text,
        constrain_to_discussion_locales=SECURE_IDENTIFICATION_LIMIT):
    """Try to identify locale of text. Boost if one of the expected locales.

    Candidate languages come from ``detect_langs(text)``; each candidate
    exposes a ``lang`` code and a ``prob`` score.

    :param text: the text to identify.
    :param constrain_to_discussion_locales: length threshold. When it is
        truthy and ``self.strlen_nourl(text)`` is below it, only candidates
        compatible with the discussion locales are kept. Otherwise every
        candidate is kept and the score of candidates whose root locale is
        a discussion locale is multiplied by 5.
    :returns: a pair ``(locale, scores)``. ``locale`` is the best candidate
        when its (possibly boosted) score exceeds 0.5, else
        ``Locale.UNDEFINED``; ``scores`` maps each retained language code
        to its score. Empty text yields ``Locale.UNDEFINED`` and text whose
        ``strlen_nourl`` is below 5 yields ``Locale.NON_LINGUISTIC``, each
        with a one-entry score mapping.
    """
    if not text:
        return Locale.UNDEFINED, {Locale.UNDEFINED: 1}
    len_nourl = self.strlen_nourl(text)
    if len_nourl < 5:
        # Return a pair like every other exit path; a bare locale code
        # would break callers that unpack ``(locale, scores)``.
        return Locale.NON_LINGUISTIC, {Locale.NON_LINGUISTIC: 1}
    expected_locales = {
        Locale.extract_root_locale(loc)
        for loc in self.discussion.discussion_locales}
    language_data = detect_langs(text)
    if constrain_to_discussion_locales and (
            len_nourl < constrain_to_discussion_locales):
        # Short text: only accept candidates among the discussion locales.
        data = [
            (x.prob, x.lang) for x in language_data
            if Locale.any_compatible(
                Locale.extract_root_locale(x.lang), expected_locales)]
    else:
        # boost with discussion locales.
        data = [
            (x.prob * (
                5 if Locale.extract_root_locale(x.lang) in expected_locales
                else 1), x.lang)
            for x in language_data]
    # Tuples sort by score first, so the best candidate ends up in front.
    data.sort(reverse=True)
    top = data[0][1] if (data and data[0][0] > 0.5) else Locale.UNDEFINED
    return top, {lang: prob for (prob, lang) in data}
def identify(self, text,
             constrain_to_discussion_locales=SECURE_IDENTIFICATION_LIMIT):
    """Try to identify locale of text. Boost if one of the expected locales.

    Candidates are obtained from ``detect_langs(text)``; each one carries a
    ``lang`` code and a ``prob`` score.

    :param text: the text to identify.
    :param constrain_to_discussion_locales: length threshold. If truthy and
        ``self.strlen_nourl(text)`` is smaller than it, candidates that are
        not compatible with the discussion locales are discarded. In every
        other case all candidates are kept, and those whose root locale is
        one of the discussion locales get their score multiplied by 5.
    :returns: ``(locale, scores)`` where ``locale`` is the top-scoring
        candidate if its score is above 0.5 (``Locale.UNDEFINED``
        otherwise) and ``scores`` maps retained language codes to scores.
        Empty text returns ``Locale.UNDEFINED``; text with a
        ``strlen_nourl`` under 5 returns ``Locale.NON_LINGUISTIC``; both
        come with a single-entry score mapping.
    """
    if not text:
        return Locale.UNDEFINED, {Locale.UNDEFINED: 1}
    len_nourl = self.strlen_nourl(text)
    if len_nourl < 5:
        # Keep the return shape uniform: callers unpacking the result
        # would fail on a bare locale code.
        return Locale.NON_LINGUISTIC, {Locale.NON_LINGUISTIC: 1}
    expected_locales = {
        Locale.extract_root_locale(locale_code)
        for locale_code in self.discussion.discussion_locales}
    language_data = detect_langs(text)
    if constrain_to_discussion_locales and (
            len_nourl < constrain_to_discussion_locales):
        # Below the threshold, restrict candidates to discussion locales.
        data = [
            (x.prob, x.lang) for x in language_data
            if Locale.any_compatible(
                Locale.extract_root_locale(x.lang), expected_locales)]
    else:
        # boost with discussion locales.
        data = [
            (x.prob * (
                5 if Locale.extract_root_locale(x.lang) in expected_locales
                else 1), x.lang)
            for x in language_data]
    # Sorting (score, lang) tuples in reverse puts the best score first.
    data.sort(reverse=True)
    top = data[0][1] if (data and data[0][0] > 0.5) else Locale.UNDEFINED
    return top, {lang: prob for (prob, lang) in data}
def translate_lse(
        self, source_lse, target, retranslate=False, is_html=False,
        constrain_locale_threshold=SECURE_IDENTIFICATION_LIMIT):
    """Translate one langstring entry into the ``target`` locale.

    :param source_lse: the entry to translate; its ``value``,
        ``locale_code``, ``langstring`` and ``db`` attributes are used.
    :param target: the target locale; only ``target.code`` is read.
    :param retranslate: when false, an existing machine-translation entry
        that has a fatal error is returned as-is instead of being retried.
    :param is_html: forwarded unchanged to ``self.translate``.
    :param constrain_locale_threshold: length threshold forwarded to
        ``self.confirm_locale``. When truthy and the source is shorter
        than it (per ``self.strlen_nourl``), a language detected by the
        translation call is rejected unless it is compatible with the
        discussion locales.
    :returns: ``source_lse`` itself when no translation applies (empty
        value, non-linguistic content, source already compatible with the
        target, or detected language rejected); otherwise the target
        entry, which is added to ``source_lse.db`` when newly created.
        Translation failures are recorded on the target entry through
        ``self.set_error`` rather than raised.
    """
    if not source_lse.value:
        # don't translate empty strings
        return source_lse
    source_locale = source_lse.locale_code
    if source_locale == Locale.NON_LINGUISTIC:
        return source_lse
    # TODO: Handle MULTILINGUAL
    if (source_locale == Locale.UNDEFINED and
            self.strlen_nourl(source_lse.value) < 5):
        # Too short to identify: flag as non-linguistic and stop.
        source_lse.identify_locale(Locale.NON_LINGUISTIC, None, True)
        return source_lse
    if (source_locale == Locale.UNDEFINED and
            self.distinct_identify_step):
        self.confirm_locale(
            source_lse,
            constrain_locale_threshold=constrain_locale_threshold)
        # TODO: bail if identification failed
        source_locale = source_lse.locale_code
    # TODO: Handle script differences
    if Locale.compatible(source_locale, target.code):
        return source_lse
    target_lse = None
    is_new_lse = False
    if (source_locale != Locale.UNDEFINED or
            not self.distinct_identify_step or
            self.has_fatal_error(source_lse)):
        # We try to avoid ???-mt-from-und locales in the DB.
        # This is only stored if both identification and translation
        # failed to identify a language.
        mt_target_name = self.get_mt_name(source_locale, target.code)
        target_lse = source_lse.langstring.entries_as_dict.get(
            Locale.get_id_of(mt_target_name), None)
        if target_lse and not retranslate:
            if self.has_fatal_error(target_lse):
                return target_lse
    if target_lse is None:
        # Placeholder entry; its real locale is assigned near the end.
        target_lse = LangStringEntry(
            langstring_id=source_lse.langstring_id,
            locale_id=Locale.UNDEFINED_LOCALEID,
            value='')
        is_new_lse = True
    if self.canTranslate(source_locale, target.code):
        try:
            trans, lang = self.translate(
                source_lse.value,
                target.code,
                is_html,
                source=(source_locale
                        if source_locale != Locale.UNDEFINED else None),
                db=source_lse.db)
            lang = self.asPosixLocale(lang)
            # What if detected language is not a discussion language?
            if source_locale == Locale.UNDEFINED:
                if constrain_locale_threshold and (
                        self.strlen_nourl(source_lse.value) <
                        constrain_locale_threshold):
                    if (not lang) or not Locale.any_compatible(
                            lang, self.discussion.discussion_locales):
                        # str() matters: this branch is reached when lang
                        # is empty/None, and concatenating None would
                        # raise a TypeError that the handler below would
                        # then report against target_lse instead.
                        self.set_error(
                            source_lse,
                            LangStringStatus.IDENTIFIED_TO_UNKNOWN,
                            "Identified to " + str(lang))
                        return source_lse
                source_lse.identify_locale(lang, dict(
                    service=self.__class__.__name__))
                # This should never actually happen, because
                # it would mean that the language id. was forgotten.
                # Still, to be sure that all cases are covered.
                mt_target_name = self.get_mt_name(lang, target.code)
                other_target_lse = \
                    source_lse.langstring.entries_as_dict.get(
                        Locale.get_id_of(mt_target_name), None)
                if other_target_lse:
                    target_lse = other_target_lse
                    is_new_lse = False
                source_locale = source_lse.locale_code
            if Locale.compatible(source_locale, target.code):
                return source_lse
            target_lse.value = trans
            target_lse.error_count = 0
            target_lse.error_code = None
            target_lse.locale_identification_data_json = dict(
                service=self.__class__.__name__)
            if trans.strip() == source_lse.value.strip():
                # TODO: Check modulo spaces in the middle
                target_lse.error_count = 1
                target_lse.error_code = \
                    LangStringStatus.IDENTICAL_TRANSLATION.value
        except Exception as e:
            # Best effort: record the failure on the target entry.
            print_exc()
            self.set_error(target_lse, *self.decode_exception(e))
            target_lse.value = None
    else:
        # Note: when retranslating, we may lose a valid translation.
        if source_locale == Locale.UNDEFINED:
            if not self.distinct_identify_step:
                # At least do this much.
                # NOTE(review): constrain_locale_threshold is not
                # forwarded here, unlike the earlier confirm_locale
                # call -- confirm whether that is intended.
                self.confirm_locale(source_lse)
                source_locale = source_lse.locale_code
        self.set_error(
            target_lse, LangStringStatus.CANNOT_TRANSLATE,
            "cannot translate")
        target_lse.value = None
    if (not target_lse.locale or
            (source_locale != Locale.UNDEFINED and
             Locale.extract_base_locale(
                 target_lse.locale_code) == Locale.UNDEFINED)):
        # Assign the machine-translation locale, or replace one whose
        # base is undefined now that the source locale is known.
        mt_target_name = self.get_mt_name(
            source_lse.locale_code, target.code)
        target_lse.locale = Locale.get_or_create(
            mt_target_name, source_lse.db)
    if is_new_lse:
        source_lse.db.add(target_lse)
    return target_lse
def translate_lse(
        self, source_lse, target, retranslate=False,
        constrain_to_discussion_locales=SECURE_IDENTIFICATION_LIMIT):
    """Translate one langstring entry into the ``target`` locale.

    :param source_lse: the entry to translate; its ``value``,
        ``locale_code``, ``langstring`` and ``db`` attributes are used.
    :param target: the target locale; only ``target.code`` is read.
    :param retranslate: when false, an existing machine-translation entry
        that has a fatal error is returned as-is instead of being retried.
    :param constrain_to_discussion_locales: length threshold passed on to
        ``self.confirm_locale``. When truthy and the source is shorter
        than it (per ``self.strlen_nourl``), a language detected by the
        translation call is rejected unless it is compatible with the
        discussion locales.
    :returns: ``source_lse`` itself when no translation applies (empty
        value, non-linguistic content, source already compatible with the
        target, or detected language rejected); otherwise the target
        entry, which is added to ``source_lse.db`` when newly created.
        Translation failures are recorded on the target entry through
        ``self.set_error`` rather than raised.
    """
    if not source_lse.value:
        # don't translate empty strings
        return source_lse
    source_locale = source_lse.locale_code
    if source_locale == Locale.NON_LINGUISTIC:
        return source_lse
    # TODO: Handle MULTILINGUAL
    if (source_locale == Locale.UNDEFINED and
            self.strlen_nourl(source_lse.value) < 5):
        # Too short to identify: flag as non-linguistic and stop.
        source_lse.identify_locale(Locale.NON_LINGUISTIC, None, True)
        return source_lse
    if (source_locale == Locale.UNDEFINED and
            self.distinct_identify_step):
        self.confirm_locale(source_lse, constrain_to_discussion_locales)
        # TODO: bail if identification failed
        source_locale = source_lse.locale_code
    # TODO: Handle script differences
    if Locale.compatible(source_locale, target.code):
        return source_lse
    target_lse = None
    is_new_lse = False
    if (source_locale != Locale.UNDEFINED or
            not self.distinct_identify_step or
            self.has_fatal_error(source_lse)):
        # We try to avoid ???-mt-from-und locales in the DB.
        # This is only stored if both identification and translation
        # failed to identify a language.
        mt_target_name = self.get_mt_name(source_locale, target.code)
        target_lse = source_lse.langstring.entries_as_dict.get(
            Locale.get_id_of(mt_target_name), None)
        if target_lse and not retranslate:
            if self.has_fatal_error(target_lse):
                return target_lse
    if target_lse is None:
        # Placeholder entry; its real locale is assigned near the end.
        target_lse = LangStringEntry(
            langstring_id=source_lse.langstring_id,
            locale_id=Locale.UNDEFINED_LOCALEID,
            value='')
        is_new_lse = True
    if self.canTranslate(source_locale, target.code):
        try:
            trans, lang = self.translate(
                source_lse.value,
                target.code,
                source_locale if source_locale != Locale.UNDEFINED
                else None,
                source_lse.db)
            lang = self.asPosixLocale(lang)
            # What if detected language is not a discussion language?
            if source_locale == Locale.UNDEFINED:
                if constrain_to_discussion_locales and (
                        self.strlen_nourl(source_lse.value) <
                        constrain_to_discussion_locales):
                    if (not lang) or not Locale.any_compatible(
                            lang, self.discussion.discussion_locales):
                        # str() matters: this branch is reached when lang
                        # is empty/None, and concatenating None would
                        # raise a TypeError that the handler below would
                        # then report against target_lse instead.
                        self.set_error(
                            source_lse,
                            LangStringStatus.IDENTIFIED_TO_UNKNOWN,
                            "Identified to " + str(lang))
                        return source_lse
                source_lse.identify_locale(lang, dict(
                    service=self.__class__.__name__))
                # This should never actually happen, because
                # it would mean that the language id. was forgotten.
                # Still, to be sure that all cases are covered.
                mt_target_name = self.get_mt_name(lang, target.code)
                other_target_lse = \
                    source_lse.langstring.entries_as_dict.get(
                        Locale.get_id_of(mt_target_name), None)
                if other_target_lse:
                    target_lse = other_target_lse
                    is_new_lse = False
                source_locale = source_lse.locale_code
            if Locale.compatible(source_locale, target.code):
                return source_lse
            target_lse.value = trans
            target_lse.error_count = 0
            target_lse.error_code = None
            target_lse.locale_identification_data_json = dict(
                service=self.__class__.__name__)
            if trans.strip() == source_lse.value.strip():
                # TODO: Check modulo spaces in the middle
                target_lse.error_count = 1
                target_lse.error_code = \
                    LangStringStatus.IDENTICAL_TRANSLATION.value
        except Exception as e:
            # Best effort: record the failure on the target entry.
            print_exc()
            self.set_error(target_lse, *self.decode_exception(e))
            target_lse.value = None
    else:
        # Note: when retranslating, we may lose a valid translation.
        if source_locale == Locale.UNDEFINED:
            if not self.distinct_identify_step:
                # At least do this much.
                # NOTE(review): constrain_to_discussion_locales is not
                # forwarded here, unlike the earlier confirm_locale
                # call -- confirm whether that is intended.
                self.confirm_locale(source_lse)
                source_locale = source_lse.locale_code
        self.set_error(
            target_lse, LangStringStatus.CANNOT_TRANSLATE,
            "cannot translate")
        target_lse.value = None
    if (not target_lse.locale or
            (source_locale != Locale.UNDEFINED and
             Locale.extract_base_locale(
                 target_lse.locale_code) == Locale.UNDEFINED)):
        # Assign the machine-translation locale, or replace one whose
        # base is undefined now that the source locale is known.
        mt_target_name = self.get_mt_name(
            source_lse.locale_code, target.code)
        target_lse.locale = Locale.get_or_create(
            mt_target_name, source_lse.db)
    if is_new_lse:
        source_lse.db.add(target_lse)
    return target_lse