Example #1
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()

        # Prepare analyzers
        # - standard analyzer simply splits words
        # - stemming extracts stems, to catch things like plurals
        analyzers = [
            (SimpleAnalyzer(), True),
            (SimpleAnalyzer(expression=SPLIT_RE, gaps=True), True),
            (StandardAnalyzer(), False),
            (StemmingAnalyzer(), False),
        ]
        source_language = unit.translation.subproject.project.source_language
        lang_code = source_language.base_code()
        # Add per language analyzer if Whoosh has it
        if has_stemmer(lang_code):
            analyzers.append((LanguageAnalyzer(lang_code), False))
        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append((NgramAnalyzer(4), False))

        # Extract words from all plurals and from context
        for text in unit.get_source_plurals() + [unit.context]:
            for analyzer, combine in analyzers:
                # Some Whoosh analyzers break on unicode
                new_words = []
                try:
                    new_words = [token.text for token in analyzer(text)]
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error, sys.exc_info())
                words.update(new_words)
                # Add combined strings to allow matching against multi-word
                # dictionary entries (combining up to 5 consecutive words)
                if combine:
                    words.update([
                        ' '.join(new_words[x:y]) for x in range(len(new_words))
                        for y in range(1, min(x + 6,
                                              len(new_words) + 1)) if x != y
                    ])

        # Grab all words in the dictionary
        dictionary = self.filter(project=unit.translation.subproject.project,
                                 language=unit.translation.language)

        if '' in words:
            words.remove('')

        if len(words) == 0:
            # No extracted words, no dictionary
            dictionary = dictionary.none()
        else:
            # Build the query for fetching the words
            # Cannot use __in as we want a case-insensitive lookup
            dictionary = dictionary.filter(source__iregex=r'^({0})$'.format(
                '|'.join([re_escape(word) for word in words])))

        return dictionary
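
A minimal standalone sketch of the sliding-window combination used above (the helper name and sample tokens are illustrative, not from the original code); unlike the original comprehension, the inner range here starts at x + 1, so it never yields the empty string that the original filters out afterwards:

    def combine_words(new_words, max_words=5):
        """Return every run of 1..max_words consecutive tokens as one string."""
        return {
            ' '.join(new_words[x:y])
            for x in range(len(new_words))
            for y in range(x + 1, min(x + max_words + 1, len(new_words) + 1))
        }

    print(sorted(combine_words(['add', 'new', 'translation'])))
    # ['add', 'add new', 'add new translation', 'new', 'new translation', 'translation']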
Example #2
    def get_words(self, unit):
        """
        Returns list of word pairs for a unit.
        """
        words = set()

        # Prepare analyzers
        # - standard analyzer simply splits words
        # - stemming extracts stems, to catch things like plurals
        analyzers = [
            StandardAnalyzer(),
            StemmingAnalyzer(),
        ]
        source_language = unit.translation.subproject.project.source_language
        lang_code = source_language.base_code()
        # Add per language analyzer if Whoosh has it
        if has_stemmer(lang_code):
            analyzers.append(LanguageAnalyzer(lang_code))
        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        for text in unit.get_source_plurals() + [unit.context]:
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(
                        [token.text for token in analyzer(force_text(text))]
                    )
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error, sys.exc_info())

        # Grab all words in the dictionary
        dictionary = self.filter(
            project=unit.translation.subproject.project,
            language=unit.translation.language
        )

        if len(words) == 0:
            # No extracted words, no dictionary
            dictionary = dictionary.none()
        else:
            # Build the query for fetching the words
            # Cannot use __in as we want a case-insensitive lookup
            query = Q()
            for word in words:
                query |= Q(source__iexact=word)

            # Filter dictionary
            dictionary = dictionary.filter(query)

        return dictionary
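
The explicit loop above can equivalently be written with functools.reduce; a minimal sketch (the sample words are invented, and constructing Q objects needs no configured Django project, only Django on the path):

    from functools import reduce
    import operator

    from django.db.models import Q

    words = {'translation', 'plural', 'context'}
    # OR together one case-insensitive exact match per extracted word
    query = reduce(operator.or_, (Q(source__iexact=word) for word in words))
    print(query)
    # dictionary.filter(query) then runs a single case-insensitive lookup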
Example #3
    def __init__(self, index_path, language):
        from whoosh import index as whoosh_index
        from whoosh.fields import Schema, TEXT, ID
        from whoosh import qparser
        from whoosh.highlight import UppercaseFormatter
        from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
        from whoosh.lang import has_stemmer, has_stopwords
        import os
        import sys

        if not has_stemmer(language) or not has_stopwords(language):
            # TODO Display a warning?
            analyzer = SimpleAnalyzer()
        else:
            analyzer = LanguageAnalyzer(language)

        self.schema = Schema(path=ID(unique=True, stored=True),
                             body=TEXT(analyzer=analyzer))
        self.formatter = UppercaseFormatter()

        self.index_path = index_path

        if not os.path.exists(index_path):
            try:
                os.mkdir(index_path)
            except OSError as e:
                sys.exit("Error creating Whoosh index: %s" % e)

        if whoosh_index.exists_in(index_path):
            try:
                self.search_index = whoosh_index.open_dir(index_path)
            except whoosh_index.IndexError as e:
                sys.exit("Error opening whoosh index: {0}".format(e))
        else:
            self.search_index = whoosh_index.create_in(index_path, self.schema)

        self.query_parser = qparser.MultifieldParser(["body", "path"],
                                                     schema=self.schema)
        self.query_parser.add_plugin(qparser.FuzzyTermPlugin())
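
A hedged usage sketch of the same Whoosh pattern in isolation, assuming a throwaway index directory; the document and query below are invented for illustration, but every call is standard Whoosh API:

    import tempfile

    from whoosh import index as whoosh_index
    from whoosh import qparser
    from whoosh.analysis import SimpleAnalyzer
    from whoosh.fields import ID, TEXT, Schema

    index_path = tempfile.mkdtemp()
    schema = Schema(path=ID(unique=True, stored=True),
                    body=TEXT(analyzer=SimpleAnalyzer()))
    ix = whoosh_index.create_in(index_path, schema)

    writer = ix.writer()
    writer.add_document(path=u"docs/intro.txt",
                        body=u"Whoosh is a pure-Python search library")
    writer.commit()

    parser = qparser.MultifieldParser(["body", "path"], schema=schema)
    parser.add_plugin(qparser.FuzzyTermPlugin())

    with ix.searcher() as searcher:
        # 'whosh~' exercises the fuzzy-term plugin (edit distance 1)
        for hit in searcher.search(parser.parse(u"whosh~")):
            print(hit["path"])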