コード例 #1
0
 def identify(
         self, text,
         constrain_to_discussion_locales=SECURE_IDENTIFICATION_LIMIT):
     """Try to identify the locale of text, favouring the expected locales.

     Args:
         text: the string to analyse.
         constrain_to_discussion_locales: when truthy, it is compared with
             the length reported by ``self.strlen_nourl(text)``; shorter
             texts only keep candidates compatible with the discussion
             locales. Otherwise all candidates are kept and those whose
             root locale is a discussion locale get their score
             multiplied by 5.

     Returns:
         A ``(locale, scores)`` pair on every path. ``locale`` is the best
         candidate when its (possibly boosted) score exceeds 0.5, else
         ``Locale.UNDEFINED``. ``scores`` maps each retained candidate to
         its (possibly boosted) score.
     """
     if not text:
         return Locale.UNDEFINED, {Locale.UNDEFINED: 1}
     len_nourl = self.strlen_nourl(text)
     if len_nourl < 5:
         # Keep the (locale, scores) shape of the other return paths so
         # that callers can always unpack two values.
         return Locale.NON_LINGUISTIC, {Locale.NON_LINGUISTIC: 1}
     expected_locales = {
         Locale.extract_root_locale(loc)
         for loc in self.discussion.discussion_locales}
     language_data = detect_langs(text)
     if constrain_to_discussion_locales and (
             len_nourl < constrain_to_discussion_locales):
         # Short text: only trust candidates among the expected locales.
         data = [(x.prob, x.lang)
                 for x in language_data
                 if Locale.any_compatible(
                     Locale.extract_root_locale(x.lang),
                     expected_locales)]
     else:
         # boost with discussion locales.
         data = [
             (x.prob * (
                 5 if Locale.extract_root_locale(x.lang)
                 in expected_locales else 1
             ), x.lang) for x in language_data]
     # Tuples sort on score first, so the best candidate ends up in front.
     data.sort(reverse=True)
     top = data[0][1] if (data and (data[0][0] > 0.5)
                          ) else Locale.UNDEFINED
     return top, {lang: prob for (prob, lang) in data}
コード例 #2
0
 def identify(self,
              text,
              constrain_to_discussion_locales=SECURE_IDENTIFICATION_LIMIT):
     """Try to identify the locale of text, favouring the expected locales.

     Args:
         text: the string to analyse.
         constrain_to_discussion_locales: when truthy, it is compared with
             the length reported by ``self.strlen_nourl(text)``; shorter
             texts only keep candidates compatible with the discussion
             locales. Otherwise all candidates are kept and those whose
             root locale is a discussion locale get their score
             multiplied by 5.

     Returns:
         A ``(locale, scores)`` pair on every path. ``locale`` is the best
         candidate when its (possibly boosted) score exceeds 0.5, else
         ``Locale.UNDEFINED``. ``scores`` maps each retained candidate to
         its (possibly boosted) score.
     """
     if not text:
         return Locale.UNDEFINED, {Locale.UNDEFINED: 1}
     len_nourl = self.strlen_nourl(text)
     if len_nourl < 5:
         # Keep the (locale, scores) shape of the other return paths so
         # that callers can always unpack two values.
         return Locale.NON_LINGUISTIC, {Locale.NON_LINGUISTIC: 1}
     expected_locales = {Locale.extract_root_locale(loc)
                         for loc in self.discussion.discussion_locales}
     language_data = detect_langs(text)
     if constrain_to_discussion_locales and (
             len_nourl < constrain_to_discussion_locales):
         # Short text: only trust candidates among the expected locales.
         data = [(x.prob, x.lang) for x in language_data
                 if Locale.any_compatible(
                     Locale.extract_root_locale(x.lang), expected_locales)]
     else:
         # boost with discussion locales.
         data = [(x.prob * (5 if Locale.extract_root_locale(x.lang)
                            in expected_locales else 1), x.lang)
                 for x in language_data]
     # Tuples sort on score first, so the best candidate ends up in front.
     data.sort(reverse=True)
     top = data[0][1] if (data and (data[0][0] > 0.5)) else Locale.UNDEFINED
     return top, {lang: prob for (prob, lang) in data}
コード例 #3
0
 def translate_lse(
         self, source_lse, target, retranslate=False, is_html=False,
         constrain_locale_threshold=SECURE_IDENTIFICATION_LIMIT):
     """Translate a langstring entry into the target locale.

     Args:
         source_lse: the entry to translate; its ``value``,
             ``locale_code``, ``langstring``, ``langstring_id`` and ``db``
             are used.
         target: the target locale; only ``target.code`` is read.
         retranslate: when false, an existing machine translation that
             carries a fatal error is returned as is instead of being
             attempted again.
         is_html: forwarded to ``self.translate``.
         constrain_locale_threshold: when truthy, a source of undefined
             locale whose ``strlen_nourl`` length is below this value must
             be identified to a locale compatible with the discussion
             locales; otherwise the source entry is flagged
             ``IDENTIFIED_TO_UNKNOWN`` and returned.

     Returns:
         ``source_lse`` itself when no translation entry is produced
         (empty value, non-linguistic content, source compatible with the
         target, or identification rejected); otherwise the target entry,
         holding either the translation or an error status.
     """
     if not source_lse.value:
         # don't translate empty strings
         return source_lse
     source_locale = source_lse.locale_code
     if source_locale == Locale.NON_LINGUISTIC:
         return source_lse
     # TODO: Handle MULTILINGUAL
     if (source_locale == Locale.UNDEFINED and
             self.strlen_nourl(source_lse.value) < 5):
         # Too short to identify: mark the source as non-linguistic.
         source_lse.identify_locale(Locale.NON_LINGUISTIC, None, True)
         return source_lse
     if (source_locale == Locale.UNDEFINED
             and self.distinct_identify_step):
         self.confirm_locale(
             source_lse,
             constrain_locale_threshold=constrain_locale_threshold)
         # TODO: bail if identification failed
         source_locale = source_lse.locale_code
     # TODO: Handle script differences
     if Locale.compatible(source_locale, target.code):
         return source_lse
     target_lse = None
     is_new_lse = False
     if (source_locale != Locale.UNDEFINED
             or not self.distinct_identify_step
             or self.has_fatal_error(source_lse)):
         # We try to avoid ???-mt-from-und locales in the DB.
         # This is only stored if both identification and translation
         # failed to identify a language.
         mt_target_name = self.get_mt_name(source_locale, target.code)
         target_lse = source_lse.langstring.entries_as_dict.get(
             Locale.get_id_of(mt_target_name), None)
         if target_lse and not retranslate:
             if self.has_fatal_error(target_lse):
                 return target_lse
     if target_lse is None:
         # No reusable entry: start from a blank one with undefined locale;
         # its real locale is assigned near the end of the method.
         target_lse = LangStringEntry(
             langstring_id=source_lse.langstring_id,
             locale_id=Locale.UNDEFINED_LOCALEID,
             value='')
         is_new_lse = True
     if self.canTranslate(source_locale, target.code):
         try:
             trans, lang = self.translate(
                 source_lse.value,
                 target.code,
                 is_html,
                 source=source_locale if source_locale != Locale.UNDEFINED
                 else None,
                 db=source_lse.db)
             lang = self.asPosixLocale(lang)
             # What if detected language is not a discussion language?
             if source_locale == Locale.UNDEFINED:
                 if constrain_locale_threshold and (
                         self.strlen_nourl(source_lse.value) <
                         constrain_locale_threshold):
                     if (not lang) or not Locale.any_compatible(
                             lang, self.discussion.discussion_locales):
                         # ``lang`` may be empty here; format it rather
                         # than concatenate, so that a None does not raise
                         # a TypeError which the handler below would
                         # record on the target entry instead.
                         self.set_error(
                             source_lse,
                             LangStringStatus.IDENTIFIED_TO_UNKNOWN,
                             "Identified to %s" % (lang,))
                         return source_lse
                 source_lse.identify_locale(lang, dict(
                     service=self.__class__.__name__))
                 # This should never actually happen, because
                 # it would mean that the language id. was forgotten.
                 # Still, to be sure that all cases are covered.
                 mt_target_name = self.get_mt_name(lang, target.code)
                 other_target_lse = source_lse.langstring.entries_as_dict.get(
                     Locale.get_id_of(mt_target_name), None)
                 if other_target_lse:
                     target_lse = other_target_lse
                     is_new_lse = False
             # Identification may have changed the source locale.
             source_locale = source_lse.locale_code
             if Locale.compatible(source_locale, target.code):
                 return source_lse
             target_lse.value = trans
             target_lse.error_count = 0
             target_lse.error_code = None
             target_lse.locale_identification_data_json = dict(
                 service=self.__class__.__name__)
             if trans.strip() == source_lse.value.strip():
                 # TODO: Check modulo spaces in the middle
                 target_lse.error_count = 1
                 target_lse.error_code = \
                     LangStringStatus.IDENTICAL_TRANSLATION.value
         except Exception as e:
             # Best effort: record the failure on the target entry rather
             # than propagate it.
             print_exc()
             self.set_error(target_lse, *self.decode_exception(e))
             target_lse.value = None
     else:
         # Note: when retranslating, we may lose a valid translation.
         if source_locale == Locale.UNDEFINED:
             if not self.distinct_identify_step:
                 # At least do this much.
                 self.confirm_locale(source_lse)
                 source_locale = source_lse.locale_code
         self.set_error(
             target_lse, LangStringStatus.CANNOT_TRANSLATE,
             "cannot translate")
         target_lse.value = None
     # Give the target entry its machine-translation locale when it has
     # none, or when it is still based on the undefined locale although
     # the source locale is now known.
     if (not target_lse.locale or
             (source_locale != Locale.UNDEFINED
              and Locale.extract_base_locale(
                 target_lse.locale_code) == Locale.UNDEFINED)):
         mt_target_name = self.get_mt_name(
             source_lse.locale_code, target.code)
         target_lse.locale = Locale.get_or_create(
             mt_target_name, source_lse.db)
     if is_new_lse:
         source_lse.db.add(target_lse)
     return target_lse
コード例 #4
0
 def translate_lse(
         self, source_lse, target, retranslate=False,
         constrain_to_discussion_locales=SECURE_IDENTIFICATION_LIMIT):
     """Translate a langstring entry into the target locale.

     Args:
         source_lse: the entry to translate; its ``value``,
             ``locale_code``, ``langstring``, ``langstring_id`` and ``db``
             are used.
         target: the target locale; only ``target.code`` is read.
         retranslate: when false, an existing machine translation that
             carries a fatal error is returned as is instead of being
             attempted again.
         constrain_to_discussion_locales: when truthy, a source of
             undefined locale whose ``strlen_nourl`` length is below this
             value must be identified to a locale compatible with the
             discussion locales; otherwise the source entry is flagged
             ``IDENTIFIED_TO_UNKNOWN`` and returned.

     Returns:
         ``source_lse`` itself when no translation entry is produced
         (empty value, non-linguistic content, source compatible with the
         target, or identification rejected); otherwise the target entry,
         holding either the translation or an error status.
     """
     if not source_lse.value:
         # don't translate empty strings
         return source_lse
     source_locale = source_lse.locale_code
     if source_locale == Locale.NON_LINGUISTIC:
         return source_lse
     # TODO: Handle MULTILINGUAL
     if (source_locale == Locale.UNDEFINED and
             self.strlen_nourl(source_lse.value) < 5):
         # Too short to identify: mark the source as non-linguistic.
         source_lse.identify_locale(Locale.NON_LINGUISTIC, None, True)
         return source_lse
     if (source_locale == Locale.UNDEFINED
             and self.distinct_identify_step):
         self.confirm_locale(source_lse, constrain_to_discussion_locales)
         # TODO: bail if identification failed
         source_locale = source_lse.locale_code
     # TODO: Handle script differences
     if Locale.compatible(source_locale, target.code):
         return source_lse
     target_lse = None
     is_new_lse = False
     if (source_locale != Locale.UNDEFINED
             or not self.distinct_identify_step
             or self.has_fatal_error(source_lse)):
         # We try to avoid ???-mt-from-und locales in the DB.
         # This is only stored if both identification and translation
         # failed to identify a language.
         mt_target_name = self.get_mt_name(source_locale, target.code)
         target_lse = source_lse.langstring.entries_as_dict.get(
             Locale.get_id_of(mt_target_name), None)
         if target_lse and not retranslate:
             if self.has_fatal_error(target_lse):
                 return target_lse
     if target_lse is None:
         # No reusable entry: start from a blank one with undefined locale;
         # its real locale is assigned near the end of the method.
         target_lse = LangStringEntry(
             langstring_id=source_lse.langstring_id,
             locale_id=Locale.UNDEFINED_LOCALEID,
             value='')
         is_new_lse = True
     if self.canTranslate(source_locale, target.code):
         try:
             trans, lang = self.translate(
                 source_lse.value,
                 target.code,
                 source_locale if source_locale != Locale.UNDEFINED
                 else None,
                 source_lse.db)
             lang = self.asPosixLocale(lang)
             # What if detected language is not a discussion language?
             if source_locale == Locale.UNDEFINED:
                 if constrain_to_discussion_locales and (
                         self.strlen_nourl(source_lse.value) <
                         constrain_to_discussion_locales):
                     if (not lang) or not Locale.any_compatible(
                             lang, self.discussion.discussion_locales):
                         # ``lang`` may be empty here; format it rather
                         # than concatenate, so that a None does not raise
                         # a TypeError which the handler below would
                         # record on the target entry instead.
                         self.set_error(
                             source_lse,
                             LangStringStatus.IDENTIFIED_TO_UNKNOWN,
                             "Identified to %s" % (lang,))
                         return source_lse
                 source_lse.identify_locale(lang, dict(
                     service=self.__class__.__name__))
                 # This should never actually happen, because
                 # it would mean that the language id. was forgotten.
                 # Still, to be sure that all cases are covered.
                 mt_target_name = self.get_mt_name(lang, target.code)
                 other_target_lse = source_lse.langstring.entries_as_dict.get(
                     Locale.get_id_of(mt_target_name), None)
                 if other_target_lse:
                     target_lse = other_target_lse
                     is_new_lse = False
             # Identification may have changed the source locale.
             source_locale = source_lse.locale_code
             if Locale.compatible(source_locale, target.code):
                 return source_lse
             target_lse.value = trans
             target_lse.error_count = 0
             target_lse.error_code = None
             target_lse.locale_identification_data_json = dict(
                 service=self.__class__.__name__)
             if trans.strip() == source_lse.value.strip():
                 # TODO: Check modulo spaces in the middle
                 target_lse.error_count = 1
                 target_lse.error_code = \
                     LangStringStatus.IDENTICAL_TRANSLATION.value
         except Exception as e:
             # Best effort: record the failure on the target entry rather
             # than propagate it.
             print_exc()
             self.set_error(target_lse, *self.decode_exception(e))
             target_lse.value = None
     else:
         # Note: when retranslating, we may lose a valid translation.
         if source_locale == Locale.UNDEFINED:
             if not self.distinct_identify_step:
                 # At least do this much.
                 self.confirm_locale(source_lse)
                 source_locale = source_lse.locale_code
         self.set_error(
             target_lse, LangStringStatus.CANNOT_TRANSLATE,
             "cannot translate")
         target_lse.value = None
     # Give the target entry its machine-translation locale when it has
     # none, or when it is still based on the undefined locale although
     # the source locale is now known.
     if (not target_lse.locale or
             (source_locale != Locale.UNDEFINED
              and Locale.extract_base_locale(
                 target_lse.locale_code) == Locale.UNDEFINED)):
         mt_target_name = self.get_mt_name(
             source_lse.locale_code, target.code)
         target_lse.locale = Locale.get_or_create(
             mt_target_name, source_lse.db)
     if is_new_lse:
         source_lse.db.add(target_lse)
     return target_lse