Example #1
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Filter stop words for the source language, falling back to the default list
        try:
            stopfilter = StopFilter(lang=source_language.base_code)
        except NoStopWords:
            stopfilter = StopFilter()

        # Prepare analyzers
        # - simple analyzer just splits words based on regexp
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(token.text for token in analyzer(text))
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error)
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if '' in words:
            words.remove('')

        if not words:
            # No extracted words, no dictionary
            return self.none()

        # Build the query for fetching the words
        # We want case insensitive lookup
        return self.filter(
            project=unit.translation.component.project,
            language=unit.translation.language,
            source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
                '|'.join(re_escape(word) for word in islice(words, 1000))),
        )
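
As a side note, the Whoosh analyzer pipeline used in this example can be exercised on its own. A minimal sketch, assuming Whoosh is installed (the sample sentence is made up):

    from whoosh.analysis import SimpleAnalyzer, StopFilter
    from whoosh.lang import NoStopWords

    # Compose a tokenizer with a stop word filter, as the example does
    try:
        stopfilter = StopFilter(lang="en")
    except NoStopWords:
        stopfilter = StopFilter()

    analyzer = SimpleAnalyzer() | stopfilter

    # Each token exposes the extracted word via .text
    print([token.text for token in analyzer("The quick brown fox")])
    # ['quick', 'brown', 'fox'] -- "the" is dropped as a stop word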
Example #2
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Prepare analyzers
        # - simple analyzer just splits words based on regexp
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True),
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on Unicode; buffer the tokens so
                # a failing analyzer contributes nothing
                new_words = []
                try:
                    new_words = [token.text for token in analyzer(text)]
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error)
                words.update(new_words)
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if '' in words:
            words.remove('')

        if not words:
            # No extracted words, no dictionary
            return self.none()

        # Build the query for fetching the words
        # We want case insensitive lookup
        return self.filter(
            project=unit.translation.component.project,
            language=unit.translation.language,
            source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
                '|'.join([re_escape(word) for word in islice(words, 1000)])
            )
        )
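
The closing source__iregex lookup only matches the collected words as whole, whitespace-delimited tokens. That pattern can be verified with the standard re module alone (a sketch; re.escape stands in for the re_escape helper used above):

    import re
    from itertools import islice

    words = {"hello", "world"}
    pattern = r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
        '|'.join(re.escape(word) for word in islice(words, 1000)))

    print(bool(re.search(pattern, "say hello there", re.IGNORECASE)))  # True
    print(bool(re.search(pattern, "helloworld", re.IGNORECASE)))       # False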
Example #3
    def get_terms(self, unit):
        """Return list of term pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Filter stop words for the source language, falling back to the default list
        try:
            stopfilter = StopFilter(lang=source_language.base_code)
        except NoStopWords:
            stopfilter = StopFilter()

        # Prepare analyzers
        # - basic simple analyzer to split on non-word chars
        # - simple analyzer splitting on a regexp to catch in-word dashes
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer() | stopfilter,
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(token.text for token in analyzer(text))
                except (UnicodeDecodeError, IndexError):
                    report_error(cause="Term words parsing")
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if "" in words:
            words.remove("")

        if not words:
            # No extracted words, no glossary
            return self.none()

        # Build the query for fetching the words
        # We want case insensitive lookup
        words = islice(words, 1000)
        if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.postgresql":
            # Use a regex lookup as that utilizes the pg_trgm index
            results = self.filter(
                source__iregex=r"(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])".format(
                    "|".join(re_escape(word) for word in words)
                ),
            )
        else:
            # MySQL full-text search
            results = self.filter(
                reduce(
                    lambda x, y: x | y,
                    (models.Q(source__search=word) for word in words),
                ),
            )

        return results.for_project(unit.translation.component.project).filter(
            language=unit.translation.language
        )
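
In the MySQL branch, reduce folds one Q object per word into a single OR condition. A standalone sketch of that folding (the word list is hypothetical):

    from functools import reduce
    from django.db import models

    words = ["term", "glossary"]
    query = reduce(
        lambda x, y: x | y,
        (models.Q(source__search=word) for word in words),
    )
    # Equivalent to Q(source__search="term") | Q(source__search="glossary")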
Example #4
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Filter stop words for the source language, falling back to the default list
        try:
            stopfilter = StopFilter(lang=source_language.base_code)
        except NoStopWords:
            stopfilter = StopFilter()

        # Prepare analyzers
        # - simple analyzer just splits words based on regexp
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(token.text for token in analyzer(text))
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error)
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if "" in words:
            words.remove("")

        if not words:
            # No extracted words, no dictionary
            return self.none()

        # Build the query for fetching the words
        # We want case insensitive lookup
        words = islice(words, 1000)
        if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.postgresql":
            # PostgreSQL full-text search
            results = self.filter(
                source__search=reduce(
                    lambda x, y: x | y, (SearchQuery(word) for word in words)
                ),
            )
        else:
            # MySQL full-text search
            results = self.filter(
                reduce(
                    lambda x, y: x | y,
                    (models.Q(source__search=word) for word in words),
                ),
            )

        return results.filter(
            project=unit.translation.component.project,
            language=unit.translation.language,
        )
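
On PostgreSQL this variant instead folds SearchQuery objects, which also combine with the | operator. A minimal sketch, assuming django.contrib.postgres is available (the queryset line is illustrative):

    from functools import reduce
    from django.contrib.postgres.search import SearchQuery

    words = ["hello", "world"]
    combined = reduce(lambda x, y: x | y, (SearchQuery(word) for word in words))
    # A queryset would then use: queryset.filter(source__search=combined)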