def get_glossary_terms(unit):
    """Return a list of term pairs for a unit."""
    if unit.glossary_terms is not None:
        return unit.glossary_terms
    translation = unit.translation
    language = translation.language
    component = translation.component
    project = component.project
    source_language = component.source_language
    units = (
        Unit.objects.prefetch().select_related("source_unit").order_by(Lower("source"))
    )
    if language == source_language:
        return units.none()
    # Build complete source for matching; the leading and trailing separators
    # ensure the boundary checks below never index out of range
    parts = [""]
    for text in unit.get_source_plurals() + [unit.context]:
        text = text.lower().strip()
        if text:
            parts.append(text)
    parts.append("")
    source = PLURAL_SEPARATOR.join(parts)
    uses_ngram = source_language.uses_ngram()
    matches = set()
    automaton = project.glossary_automaton
    if automaton.kind == ahocorasick.AHOCORASICK:
        # Extract terms present in the source, accepting only matches that
        # fall on word boundaries (unless the language uses ngrams)
        for end, term in automaton.iter(source):
            if uses_ngram or (
                NON_WORD_RE.match(source[end - len(term)])
                and NON_WORD_RE.match(source[end + 1])
            ):
                matches.add(term)
    if using_postgresql():
        match = r"^({})$".format("|".join(re_escape(term) for term in matches))
        # Use a regex, as that utilizes the pg_trgm index
        query = Q(source__iregex=match) | Q(variant__unit__source__iregex=match)
    else:
        # On MySQL we rely on the case-insensitive lookup of __in
        query = Q(source__in=matches) | Q(variant__unit__source__in=matches)
    units = units.filter(
        query,
        translation__component__in=project.glossaries,
        translation__component__source_language=source_language,
        translation__language=language,
    ).distinct()
    # Store in a unit cache
    unit.glossary_terms = units
    return units
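# A minimal sketch of how the cached project.glossary_automaton used above
# could be built with pyahocorasick; the helper name and the lowercased term
# list are assumptions for illustration, not the actual implementation.
import ahocorasick

def build_glossary_automaton(terms):
    automaton = ahocorasick.Automaton()
    # Map each term to itself so that automaton.iter(source) yields
    # (end_index, term) pairs over the searched string
    for term in terms:
        automaton.add_word(term.lower(), term.lower())
    # Convert the trie into a searchable Aho-Corasick automaton; only then
    # does automaton.kind equal ahocorasick.AHOCORASICK, which is why the
    # caller guards on it before iterating
    automaton.make_automaton()
    return automaton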
def get_words(self, unit):
    """Return a list of word pairs for a unit."""
    words = set()
    # Prepare analyzers
    # - standard analyzer simply splits words
    # - stemming extracts stems, to catch things like plurals
    analyzers = [
        (SimpleAnalyzer(), True),
        (SimpleAnalyzer(expression=SPLIT_RE, gaps=True), True),
        (StandardAnalyzer(), False),
        (StemmingAnalyzer(), False),
    ]
    source_language = unit.translation.subproject.project.source_language
    lang_code = source_language.base_code()
    # Add a per-language analyzer if Whoosh has one
    if has_stemmer(lang_code):
        analyzers.append((LanguageAnalyzer(lang_code), False))
    # Add an ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append((NgramAnalyzer(4), False))
    # Extract words from all plurals and from the context
    for text in unit.get_source_plurals() + [unit.context]:
        for analyzer, combine in analyzers:
            # Some Whoosh analyzers break on unicode
            new_words = []
            try:
                new_words = [token.text for token in analyzer(text)]
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error, sys.exc_info())
            words.update(new_words)
            # Add combined strings to allow matching multi-word entries,
            # joining up to 5 consecutive words (see the sketch below)
            if combine:
                words.update(
                    ' '.join(new_words[x:y])
                    for x in range(len(new_words))
                    for y in range(1, min(x + 6, len(new_words) + 1))
                    if x != y
                )
    # Grab all matching words from the dictionary
    dictionary = self.filter(
        project=unit.translation.subproject.project,
        language=unit.translation.language,
    )
    if '' in words:
        words.remove('')
    if len(words) == 0:
        # No extracted words, no dictionary
        dictionary = dictionary.none()
    else:
        # Build the query for fetching the words
        # Cannot use __in as we want a case-insensitive lookup
        dictionary = dictionary.filter(source__iregex=r'^({0})$'.format(
            '|'.join(re_escape(word) for word in words)))
    return dictionary
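# A standalone trace of the combining step above, with a made-up token list;
# note the empty string produced by slices where y < x, which is why the
# caller strips '' from the set afterwards:
new_words = ["per", "language", "analyzer"]
combined = {
    ' '.join(new_words[x:y])
    for x in range(len(new_words))
    for y in range(1, min(x + 6, len(new_words) + 1))
    if x != y
}
assert combined == {
    '', 'per', 'per language', 'per language analyzer',
    'language', 'language analyzer', 'analyzer',
}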
def get_glossary_terms(unit):
    """Return a list of term pairs for a unit."""
    if unit.glossary_terms is not None:
        return unit.glossary_terms
    translation = unit.translation
    language = translation.language
    component = translation.component
    source_language = component.source_language
    glossaries = component.project.glossaries
    units = (
        Unit.objects.prefetch().select_related("source_unit").order_by(Lower("source"))
    )
    if language == source_language:
        return units.none()
    # Chain the terms of all glossaries
    terms = set(
        chain.from_iterable(glossary.glossary_sources for glossary in glossaries)
    )
    # Build complete source for matching
    parts = []
    for text in unit.get_source_plurals() + [unit.context]:
        text = text.lower().strip()
        if text:
            parts.append(text)
    source = PLURAL_SEPARATOR.join(parts)
    # Extract terms present in the source
    # This might use a suffix tree for improved performance
    matches = [
        term for term in terms
        if re.search(r"\b{}\b".format(re.escape(term)), source)
    ]
    if using_postgresql():
        match = r"^({})$".format("|".join(re_escape(term) for term in matches))
        # Use a regex, as that utilizes the pg_trgm index
        query = Q(source__iregex=match) | Q(variant__unit__source__iregex=match)
    else:
        # On MySQL we rely on the case-insensitive lookup of __in
        query = Q(source__in=matches) | Q(variant__unit__source__in=matches)
    units = units.filter(
        query,
        translation__component__in=glossaries,
        translation__component__source_language=source_language,
        translation__language=language,
    ).distinct()
    # Store in a unit cache
    unit.glossary_terms = units
    return units
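# A tiny runnable illustration of the word-boundary matching the list
# comprehension above relies on (sample terms and source are made up):
import re

terms = {"cat", "category"}
source = "the category page"
matches = [
    term for term in terms
    if re.search(r"\b{}\b".format(re.escape(term)), source)
]
# \b keeps "cat" from matching inside "category"
assert matches == ["category"]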
def has_field(self, text, context: Dict):  # noqa: C901
    if text == "plural":
        return Q(source__contains=PLURAL_SEPARATOR)
    if text == "suggestion":
        return Q(suggestion__isnull=False)
    if text == "explanation":
        return ~Q(source_unit__explanation="")
    if text == "note":
        return ~Q(note="")
    if text == "comment":
        return Q(comment__resolved=False)
    if text in ("resolved-comment", "resolved_comment"):
        return Q(comment__resolved=True)
    if text in ("check", "failing-check", "failing_check"):
        return Q(check__dismissed=False)
    if text in (
        "dismissed-check",
        "dismissed_check",
        "ignored-check",
        "ignored_check",
    ):
        return Q(check__dismissed=True)
    if text == "translation":
        return Q(state__gte=STATE_TRANSLATED)
    if text in ("variant", "shaping"):
        return Q(variant__isnull=False)
    if text == "label":
        return Q(source_unit__labels__isnull=False) | Q(labels__isnull=False)
    if text == "context":
        return ~Q(context="")
    if text == "screenshot":
        return Q(screenshots__isnull=False) | Q(source_unit__screenshots__isnull=False)
    if text == "flags":
        return ~Q(source_unit__extra_flags="")
    if text == "glossary":
        project = context.get("project")
        if not project:
            return Q(source__isnull=True)
        terms = set(
            chain.from_iterable(
                glossary.glossary_sources for glossary in project.glossaries
            )
        )
        if not terms:
            return Q(source__isnull=True)
        if using_postgresql():
            template = r"[[:<:]]({})[[:>:]]"
        else:
            template = r"(^|[ \t\n\r\f\v])({})($|[ \t\n\r\f\v])"
        return Q(
            source__iregex=template.format(
                "|".join(re_escape(term) for term in terms)
            )
        )
    raise ValueError(f"Unsupported has lookup: {text}")
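# A quick check of the fallback template above, using Python's re as a
# stand-in for the database regex engine (the sample terms are made up):
import re

template = r"(^|[ \t\n\r\f\v])({})($|[ \t\n\r\f\v])"
pattern = template.format("|".join(("cat", "dog")))
assert re.search(pattern, "a dog ran", re.IGNORECASE)
# Punctuation defeats the whitespace approximation of a word boundary;
# the native PostgreSQL [[:<:]]...[[:>:]] markers would still match here
assert not re.search(pattern, "a dog.", re.IGNORECASE)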
def get_words(self, unit):
    """Return a list of word pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language
    # Filter out stop words for the language
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()
    # Prepare analyzers
    # - simple analyzer just splits words based on a regexp
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]
    # Add an ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))
    # Extract words from all plurals and from the context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error)
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break
    if '' in words:
        words.remove('')
    if not words:
        # No extracted words, no dictionary
        return self.none()
    # Build the query for fetching the words
    # We want a case-insensitive lookup
    return self.filter(
        project=unit.translation.component.project,
        language=unit.translation.language,
        source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
            '|'.join(re_escape(word) for word in islice(words, 1000))
        ),
    )
def get_words(self, unit):
    """Return a list of word pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language
    # Prepare analyzers
    # - simple analyzer just splits words based on a regexp
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True),
        LanguageAnalyzer(source_language.base_code),
    ]
    # Add an ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))
    # Extract words from all plurals and from the context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            new_words = []
            try:
                new_words = [token.text for token in analyzer(text)]
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error)
            words.update(new_words)
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break
    if '' in words:
        words.remove('')
    if not words:
        # No extracted words, no dictionary
        return self.none()
    # Build the query for fetching the words
    # We want a case-insensitive lookup
    return self.filter(
        project=unit.translation.component.project,
        language=unit.translation.language,
        source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
            '|'.join(re_escape(word) for word in islice(words, 1000))
        ),
    )
def get_terms(self, unit):
    """Return a list of term pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language
    # Filter out stop words for the language
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()
    # Prepare analyzers
    # - basic simple analyzer to split on non-word characters
    # - simple analyzer splitting on a regexp to catch in-word dashes
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer() | stopfilter,
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]
    # Add an ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))
    # Extract words from all plurals and from the context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError):
                report_error(cause="Term words parsing")
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break
    if "" in words:
        words.remove("")
    if not words:
        # No extracted words, no glossary
        return self.none()
    # Build the query for fetching the words
    # We want a case-insensitive lookup
    words = islice(words, 1000)
    if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.postgresql":
        # Use a regex, as that utilizes the pg_trgm index
        results = self.filter(
            source__iregex=r"(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])".format(
                "|".join(re_escape(word) for word in words)
            ),
        )
    else:
        # On MySQL, OR together per-word search lookups
        results = self.filter(
            reduce(
                lambda x, y: x | y,
                (models.Q(source__search=word) for word in words),
            ),
        )
    return results.for_project(unit.translation.component.project).filter(
        language=unit.translation.language
    )
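# A minimal standalone sketch of the Whoosh analyzer chaining used above;
# the sample strings are made up, and the shown outputs assume Whoosh's
# default English stop words and stemmer:
from whoosh.analysis import LanguageAnalyzer, SimpleAnalyzer, StopFilter

analyzer = SimpleAnalyzer() | StopFilter()
print([token.text for token in analyzer("The quick brown fox")])
# ['quick', 'brown', 'fox'] -- the stop word "the" is dropped

# LanguageAnalyzer additionally stems, so plurals and inflections of a
# term collapse to the same token:
stemming = LanguageAnalyzer("en")
print([token.text for token in stemming("translating translations")])
# ['translat', 'translat']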
def test_re_escape(self):
    self.assertEqual(re_escape('[a-z]'), '\\[a\\-z\\]')
    self.assertEqual(re_escape('a{1,4}'), 'a\\{1,4\\}')
def test_re_escape(self):
    self.assertEqual(re_escape("[a-z]"), "\\[a\\-z\\]")
    self.assertEqual(re_escape("a{1,4}"), "a\\{1,4\\}")
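# The two tests pin down a custom re_escape: unlike Python's re.escape() it
# leaves characters such as "," untouched while still escaping regex
# metacharacters, keeping the patterns portable to database regex engines.
# A minimal sketch consistent with both assertions; the exact escape set
# here is an assumption:
ESCAPED = frozenset(".\\+*?[^]$(){}=!<>|:-")

def re_escape(pattern):
    return "".join(
        "\\" + char if char in ESCAPED else char for char in pattern
    )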