def get_words(self, unit):
    """Return list of word pairs for an unit."""
    project = unit.translation.component.project
    source_language = project.source_language
    extracted = set()

    # Build a stop-word filter for the source language; fall back to the
    # default stop-word list when none exists for it.
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()

    # Tokenizers to run over every text:
    # - regexp based splitter piped through the stop-word filter
    # - language specific analyzer when available (it is for English)
    analyzers = [
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]
    # Languages such as Chinese or Japanese need ngram tokenization
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Collect tokens from every source plural and from the context
    flags = unit.all_flags
    for raw_text in unit.get_source_plurals() + [unit.context]:
        cleaned = strip_string(raw_text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                extracted.update(token.text for token in analyzer(cleaned))
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error)
            if len(extracted) > 1000:
                break
        if len(extracted) > 1000:
            break

    # Analyzers may emit empty tokens; drop them
    if '' in extracted:
        extracted.remove('')

    if not extracted:
        # No extracted words, no dictionary
        return self.none()

    # Case insensitive word-boundary lookup over at most 1000 words
    pattern = r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
        '|'.join(re_escape(word) for word in islice(extracted, 1000)))
    return self.filter(
        project=project,
        language=unit.translation.language,
        source__iregex=pattern,
    )
def get_words(self, unit):
    """Return list of word pairs for an unit.

    Tokenizes all source plurals and the unit context with the available
    Whoosh analyzers and builds a case insensitive regex lookup against
    this queryset, restricted to the unit's project and language.
    Returns an empty queryset when no words could be extracted.
    """
    words = set()
    source_language = unit.translation.component.project.source_language

    # Prepare analyzers
    # - simple analyzer just splits words based on regexp
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True),
        LanguageAnalyzer(source_language.base_code),
    ]

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode; tokens are produced
            # lazily while update() consumes the generator, so the update
            # itself must live inside the try block (this also matches the
            # other extraction helpers in this file).
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error)
            # Cap extraction to keep the regex below a sane size
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break

    # Analyzers may emit empty tokens; discard() is a no-op when absent
    words.discard('')

    if not words:
        # No extracted words, no dictionary
        return self.none()

    # Build the query for fetching the words
    # We want case insensitive lookup
    return self.filter(
        project=unit.translation.component.project,
        language=unit.translation.language,
        source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
            '|'.join(re_escape(word) for word in islice(words, 1000))
        ),
    )
def get_terms(self, unit):
    """Return list of term pairs for an unit.

    Tokenizes all source plurals and the unit context, then looks up the
    extracted words in this queryset for the unit's project and language,
    using a trigram-indexed regex on PostgreSQL and full-text search
    elsewhere.
    """
    words = set()
    source_language = unit.translation.component.project.source_language
    # Filters stop words for a language; fall back to the default stop
    # list when the language has no dedicated one
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()
    # Prepare analyzers
    # - basic simple analyzer to split on non-word chars
    # - simple analyzer just splits words based on regexp to catch in word dashes
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer() | stopfilter,
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]
    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))
    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError):
                report_error(cause="Term words parsing")
            # Cap extraction so the generated query stays bounded
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break
    # Analyzers may emit empty tokens; drop them
    if "" in words:
        words.remove("")
    if not words:
        # No extracted words, no glossary
        return self.none()
    # Build the query for fetching the words
    # We want case insensitive lookup
    # NOTE: islice yields a one-shot iterator; exactly one branch below
    # consumes it, so it is never iterated twice
    words = islice(words, 1000)
    if settings.DATABASES["default"][
            "ENGINE"] == "django.db.backends.postgresql":
        # Use regex as that is utilizing pg_trgm index
        results = self.filter(
            source__iregex=r"(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])".
            format("|".join(re_escape(word) for word in words)),
        )
    else:
        # MySQL: OR together one full-text Q object per word
        results = self.filter(
            reduce(
                lambda x, y: x | y,
                (models.Q(source__search=word) for word in words),
            ),
        )
    return results.for_project(unit.translation.component.project).filter(
        language=unit.translation.language)
def get_words(self, unit):
    """Return list of word pairs for an unit.

    Tokenizes all source plurals and the unit context, then looks up the
    extracted words in this queryset for the unit's project and language,
    using PostgreSQL full-text search when available and plain per-word
    full-text search otherwise.
    """
    words = set()
    source_language = unit.translation.component.project.source_language

    # Filters stop words for a language; fall back to the default stop
    # list when the language has no dedicated one
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()

    # Prepare analyzers
    # - simple analyzer just splits words based on regexp
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error)
            # Cap extraction so the generated query stays bounded
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break

    # Analyzers may emit empty tokens; discard() is a no-op when absent
    words.discard("")

    if not words:
        # No extracted words, no dictionary
        return self.none()

    # Build the query for fetching the words; we want case insensitive
    # lookup. Materialize the capped word list so it is a reusable
    # sequence instead of a one-shot islice iterator.
    words = list(islice(words, 1000))
    if settings.DATABASES["default"][
            "ENGINE"] == "django.db.backends.postgresql":
        # PostgreSQL: OR all words into a single combined SearchQuery
        results = self.filter(
            source__search=reduce(
                lambda x, y: x | y,
                (SearchQuery(word) for word in words),
            ),
        )
    else:
        # MySQL: OR together one full-text Q object per word
        results = self.filter(
            reduce(
                lambda x, y: x | y,
                (models.Q(source__search=word) for word in words),
            ),
        )
    return results.filter(
        project=unit.translation.component.project,
        language=unit.translation.language,
    )