Code example #1
File: models.py Project: lcsouzamenezes/weblate
def get_glossary_terms(unit):
    """Return list of term pairs for an unit."""
    if unit.glossary_terms is not None:
        return unit.glossary_terms
    translation = unit.translation
    language = translation.language
    component = translation.component
    project = component.project
    source_language = component.source_language

    units = (
        Unit.objects.prefetch().select_related("source_unit").order_by(Lower("source"))
    )
    if language == source_language:
        return units.none()

    # Build complete source for matching
    parts = [""]
    for text in unit.get_source_plurals() + [unit.context]:
        text = text.lower().strip()
        if text:
            parts.append(text)
    parts.append("")
    source = PLURAL_SEPARATOR.join(parts)

    uses_ngram = source_language.uses_ngram()

    matches = set()
    automaton = project.glossary_automaton
    if automaton.kind == ahocorasick.AHOCORASICK:
        # Extract terms present in the source
        for end, term in automaton.iter(source):
            if uses_ngram or (
                NON_WORD_RE.match(source[end - len(term)])
                and NON_WORD_RE.match(source[end + 1])
            ):
                matches.add(term)

    if using_postgresql():
        match = r"^({})$".format("|".join(re_escape(term) for term in matches))
        # Use regex as that is utilizing pg_trgm index
        query = Q(source__iregex=match) | Q(variant__unit__source__iregex=match)
    else:
        # MySQL does case-insensitive lookups by default, so plain __in works
        query = Q(source__in=matches) | Q(variant__unit__source__in=matches)

    units = units.filter(
        query,
        translation__component__in=project.glossaries,
        translation__component__source_language=source_language,
        translation__language=language,
    ).distinct()

    # Store in a unit cache
    unit.glossary_terms = units

    return units
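
The boundary check above relies on the source string being padded with PLURAL_SEPARATOR on both ends, so that indexing one character before and after each match never falls outside the string. Here is a minimal standalone sketch of the same matching, assuming the pyahocorasick package; the NON_WORD_RE pattern and separator below are illustrative stand-ins, not Weblate's exact values:

import re

import ahocorasick

NON_WORD_RE = re.compile(r"\W")  # illustrative stand-in for Weblate's pattern
SEPARATOR = "\x1e\x1e"           # illustrative stand-in for PLURAL_SEPARATOR

automaton = ahocorasick.Automaton()
for term in ("hello", "world"):
    automaton.add_word(term, term)
automaton.make_automaton()  # automaton.kind becomes ahocorasick.AHOCORASICK

# Pad the text so the boundary checks below can never index out of range
source = SEPARATOR.join(["", "hello world", ""])

matches = set()
for end, term in automaton.iter(source):
    # end is the index of the last matched character; require a non-word
    # character immediately before and after the match
    if NON_WORD_RE.match(source[end - len(term)]) and NON_WORD_RE.match(
        source[end + 1]
    ):
        matches.add(term)

print(sorted(matches))  # ['hello', 'world']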
Code example #2
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()

        # Prepare analyzers
        # - standard analyzer simply splits words
        # - stemming extracts stems, to catch things like plurals
        analyzers = [
            (SimpleAnalyzer(), True),
            (SimpleAnalyzer(expression=SPLIT_RE, gaps=True), True),
            (StandardAnalyzer(), False),
            (StemmingAnalyzer(), False),
        ]
        source_language = unit.translation.subproject.project.source_language
        lang_code = source_language.base_code()
        # Add per language analyzer if Whoosh has it
        if has_stemmer(lang_code):
            analyzers.append((LanguageAnalyzer(lang_code), False))
        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append((NgramAnalyzer(4), False))

        # Extract words from all plurals and from context
        for text in unit.get_source_plurals() + [unit.context]:
            for analyzer, combine in analyzers:
                # Some Whoosh analyzers break on unicode
                new_words = []
                try:
                    new_words = [token.text for token in analyzer(text)]
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error, sys.exc_info())
                words.update(new_words)
                # Add combined strings to match multi-word entries
                # (up to 5 consecutive words)
                if combine:
                    words.update([
                        ' '.join(new_words[x:y]) for x in range(len(new_words))
                        for y in range(1, min(x + 6,
                                              len(new_words) + 1)) if x != y
                    ])

        # Grab all words in the dictionary
        dictionary = self.filter(project=unit.translation.subproject.project,
                                 language=unit.translation.language)

        if '' in words:
            words.remove('')

        if len(words) == 0:
            # No extracted words, no dictionary
            dictionary = dictionary.none()
        else:
            # Build the query for fetching the words
            # Cannot use __in as we want a case-insensitive lookup
            dictionary = dictionary.filter(source__iregex=r'^({0})$'.format(
                '|'.join([re_escape(word) for word in words])))

        return dictionary
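
The combine branch is easiest to see on a concrete token list. A cleaned-up equivalent of the slice expression, with y starting at x + 1 so no empty slices are produced (the original generates those and later drops them via words.remove('')):

new_words = ["translation", "memory", "match"]
combos = [
    " ".join(new_words[x:y])
    for x in range(len(new_words))
    for y in range(x + 1, min(x + 6, len(new_words) + 1))
]
print(combos)
# ['translation', 'translation memory', 'translation memory match',
#  'memory', 'memory match', 'match']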
Code example #3
def get_glossary_terms(unit):
    """Return list of term pairs for an unit."""
    if unit.glossary_terms is not None:
        return unit.glossary_terms
    translation = unit.translation
    language = translation.language
    component = translation.component
    source_language = component.source_language
    glossaries = component.project.glossaries

    units = (Unit.objects.prefetch().select_related("source_unit").order_by(
        Lower("source")))
    if language == source_language:
        return units.none()

    # Chain terms
    terms = set(
        chain.from_iterable(glossary.glossary_sources
                            for glossary in glossaries))

    # Build complete source for matching
    parts = []
    for text in unit.get_source_plurals() + [unit.context]:
        text = text.lower().strip()
        if text:
            parts.append(text)
    source = PLURAL_SEPARATOR.join(parts)

    # Extract terms present in the source
    # This might use a suffix tree for improved performance
    matches = [
        term for term in terms
        if re.search(r"\b{}\b".format(re.escape(term)), source)
    ]

    if using_postgresql():
        match = r"^({})$".format("|".join(re_escape(term) for term in matches))
        # Use regex as that is utilizing pg_trgm index
        query = Q(source__iregex=match) | Q(
            variant__unit__source__iregex=match)
    else:
        # MySQL does case-insensitive lookups by default, so plain __in works
        query = Q(source__in=matches) | Q(variant__unit__source__in=matches)

    units = units.filter(
        query,
        translation__component__in=glossaries,
        translation__component__source_language=source_language,
        translation__language=language,
    ).distinct()

    # Store in a unit cache
    unit.glossary_terms = units

    return units
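
This variant scans each glossary term with a plain re.search using \b word boundaries, which also matches multi-word terms. A tiny standalone demo with illustrative terms and source:

import re

terms = {"weblate", "translation memory"}
source = "weblate keeps a translation memory."
matches = [
    term
    for term in terms
    if re.search(r"\b{}\b".format(re.escape(term)), source)
]
print(sorted(matches))  # ['translation memory', 'weblate']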
Code example #4
File: search.py Project: nijel/weblate
    def has_field(self, text, context: Dict):  # noqa: C901
        if text == "plural":
            return Q(source__contains=PLURAL_SEPARATOR)
        if text == "suggestion":
            return Q(suggestion__isnull=False)
        if text == "explanation":
            return ~Q(source_unit__explanation="")
        if text == "note":
            return ~Q(note="")
        if text == "comment":
            return Q(comment__resolved=False)
        if text in ("resolved-comment", "resolved_comment"):
            return Q(comment__resolved=True)
        if text in ("check", "failing-check", "failing_check"):
            return Q(check__dismissed=False)
        if text in (
                "dismissed-check",
                "dismissed_check",
                "ignored-check",
                "ignored_check",
        ):
            return Q(check__dismissed=True)
        if text == "translation":
            return Q(state__gte=STATE_TRANSLATED)
        if text in ("variant", "shaping"):
            return Q(variant__isnull=False)
        if text == "label":
            return Q(source_unit__labels__isnull=False) | Q(
                labels__isnull=False)
        if text == "context":
            return ~Q(context="")
        if text == "screenshot":
            return Q(screenshots__isnull=False) | Q(
                source_unit__screenshots__isnull=False)
        if text == "flags":
            return ~Q(source_unit__extra_flags="")
        if text == "glossary":
            project = context.get("project")
            if not project:
                return Q(source__isnull=True)
            terms = set(
                chain.from_iterable(glossary.glossary_sources
                                    for glossary in project.glossaries))
            if not terms:
                return Q(source__isnull=True)
            if using_postgresql():
                template = r"[[:<:]]({})[[:>:]]"
            else:
                template = r"(^|[ \t\n\r\f\v])({})($|[ \t\n\r\f\v])"
            return Q(source__iregex=template.format("|".join(
                re_escape(term) for term in terms)))

        raise ValueError(f"Unsupported has lookup: {text}")
Code example #5
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Filters stop words for a language
        try:
            stopfilter = StopFilter(lang=source_language.base_code)
        except NoStopWords:
            stopfilter = StopFilter()

        # Prepare analyzers
        # - simple analyzer just splits words based on regexp
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(token.text for token in analyzer(text))
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error)
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if '' in words:
            words.remove('')

        if not words:
            # No extracted words, no dictionary
            return self.none()

        # Build the query for fetching the words
        # We want case insensitive lookup
        return self.filter(
            project=unit.translation.component.project,
            language=unit.translation.language,
            source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
                '|'.join(re_escape(word) for word in islice(words, 1000))),
        )
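
Whoosh analyzers compose with the | operator, which is how the stop-word filter is attached above. A minimal sketch, assuming Whoosh is installed:

from whoosh.analysis import SimpleAnalyzer, StopFilter

analyzer = SimpleAnalyzer() | StopFilter(lang="en")
print([token.text for token in analyzer("the quick brown fox")])
# ['quick', 'brown', 'fox'] -- the stop word "the" is dropped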
Code example #6
File: dictionary.py Project: nijel/weblate
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Prepare analyzers
        # - simple analyzer just splits words based on regexp
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True),
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                new_words = []
                try:
                    new_words = [token.text for token in analyzer(text)]
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error)
                words.update(new_words)
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if '' in words:
            words.remove('')

        if not words:
            # No extracted words, no dictionary
            return self.none()

        # Build the query for fetching the words
        # We want case insensitive lookup
        return self.filter(
            project=unit.translation.component.project,
            language=unit.translation.language,
            source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
                '|'.join([re_escape(word) for word in islice(words, 1000)])
            )
        )
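
For languages without whitespace word boundaries, the NgramAnalyzer(4) branch emits overlapping character 4-grams instead of words. A quick illustration, again assuming Whoosh is installed:

from whoosh.analysis import NgramAnalyzer

analyzer = NgramAnalyzer(4)
print([token.text for token in analyzer("ソフトウェア")])
# ['ソフトウ', 'フトウェ', 'トウェア']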
Code example #7
File: models.py Project: thecraftman/weblate
    def get_terms(self, unit):
        """Return list of term pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Filters stop words for a language
        try:
            stopfilter = StopFilter(lang=source_language.base_code)
        except NoStopWords:
            stopfilter = StopFilter()

        # Prepare analyzers
        # - basic simple analyzer to split on non-word chars
        # - simple analyzer just splits words based on regexp to catch in word dashes
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer() | stopfilter,
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(token.text for token in analyzer(text))
                except (UnicodeDecodeError, IndexError):
                    report_error(cause="Term words parsing")
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if "" in words:
            words.remove("")

        if not words:
            # No extracted words, no glossary
            return self.none()

        # Build the query for fetching the words
        # We want case insensitive lookup
        words = islice(words, 1000)
        if settings.DATABASES["default"][
                "ENGINE"] == "django.db.backends.postgresql":
            # Use regex as that is utilizing pg_trgm index
            results = self.filter(
                source__iregex=r"(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])".
                format("|".join(re_escape(word) for word in words)), )
        else:
            # MySQL
            results = self.filter(
                reduce(
                    lambda x, y: x | y,
                    (models.Q(source__search=word) for word in words),
                )
            )

        return results.for_project(unit.translation.component.project).filter(
            language=unit.translation.language)
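
The MySQL branch ORs together one Q(source__search=word) per extracted word via reduce. Constructing the combined Q object is easy to inspect on its own (Django is assumed installed; no database is touched):

from functools import reduce

from django.db import models

words = ["cache", "thread", "lock"]
query = reduce(
    lambda x, y: x | y,
    (models.Q(source__search=word) for word in words),
)
print(query)
# <Q: (OR: ('source__search', 'cache'), ('source__search', 'thread'),
#  ('source__search', 'lock'))>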
Code example #8
    def test_re_escape(self):
        self.assertEqual(re_escape('[a-z]'), '\\[a\\-z\\]')
        self.assertEqual(re_escape('a{1,4}'), 'a\\{1,4\\}')
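
Both assertions hinge on re_escape escaping regex metacharacters such as - and { while leaving , alone (older Python re.escape implementations escaped the comma too). A minimal implementation that satisfies the test, not necessarily Weblate's exact helper:

import re

ESCAPE_RE = re.compile(r"[\[\](){}?*+\-|^$\\.]")

def re_escape(text):
    # Backslash-escape regex metacharacters; "," is intentionally left as-is
    return ESCAPE_RE.sub(lambda match: "\\" + match.group(0), text)

assert re_escape('[a-z]') == '\\[a\\-z\\]'
assert re_escape('a{1,4}') == 'a\\{1,4\\}'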