Example #1
    def download_translations(self, source, language, text, unit, user):
        """Download list of possible translations from a service."""
        matching_units = Unit.objects.prefetch().filter(
            translation__component__project__in=Project.objects.all_acl(user)
        ).more_like_this(unit, 1000)

        comparer = Comparer()

        result = {
            self.format_unit_match(
                munit, comparer.similarity(text, munit.get_source_plurals()[0])
            )
            for munit in matching_units
        }
        result.discard(None)
        return result
Example #2
    def download_translations(self, source, language, text, unit, user):
        """Download list of possible translations from a service."""
        matching_units = Unit.objects.prefetch().filter(
            translation__component__project__in=user.allowed_projects
        ).more_like_this(unit, 1000)

        comparer = Comparer()

        result = {
            self.format_unit_match(
                munit,
                comparer.similarity(text, munit.get_source_plurals()[0])
            )
            for munit in matching_units
        }
        result.discard(None)
        return result
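
Both versions above build a set of formatted matches and then strip the None entries produced for matches that could not be formatted. A minimal standalone sketch of that pattern, with illustrative names standing in for the Weblate API:

def format_match(text, quality, threshold=75):
    # Stand-in for format_unit_match(): return None for poor matches so
    # they can be filtered out of the result set afterwards.
    if quality < threshold:
        return None
    return (quality, text)

candidates = [('Hello', 100), ('Hallo', 40), ('Hello!', 90)]
result = {format_match(text, quality) for text, quality in candidates}
result.discard(None)  # removes None without raising when it is absent
print(sorted(result, reverse=True))  # [(100, 'Hello'), (90, 'Hello!')]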
Example #3
    def download_translations(self, source, language, text, unit, user, search):
        """Download list of possible translations from a service."""
        comparer = Comparer()
        for result in Memory.objects.lookup(
            source,
            language,
            text,
            user,
            unit.translation.component.project,
            unit.translation.component.project.use_shared_tm,
        ).iterator():
            quality = comparer.similarity(text, result.source)
            if quality < 10 or (quality < 75 and not search):
                continue
            yield {
                "text": result.target,
                "quality": quality,
                "service": self.name,
                "origin": result.get_origin_display(),
                "source": result.source,
            }
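
The two thresholds above decide whether a memory hit is worth showing: below quality 10 it is always dropped, and between 10 and 75 it is shown only for an explicit search. A standalone sketch of that rule with illustrative values:

def keep_match(quality, search):
    # Inverse of the skip condition above:
    # quality < 10 or (quality < 75 and not search)
    return quality >= 10 and (quality >= 75 or search)

assert keep_match(80, search=False)      # strong match: always shown
assert not keep_match(50, search=False)  # mid match hidden in suggestion mode
assert keep_match(50, search=True)       # mid match shown on explicit search
assert not keep_match(5, search=True)    # noise: never shown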
Example #4
class TranslationMemory(WhooshIndex):
    LOCATION = 'memory'
    SCHEMA = TMSchema

    def __init__(self):
        self.index = self.open_index()
        self.parser = qparser.QueryParser(
            'source',
            schema=self.index.schema,
            group=qparser.OrGroup.factory(0.9),
            termclass=query.FuzzyTerm,
            plugins=[],
        )
        self.comparer = Comparer()

    def __del__(self):
        self.close()

    @cached_property
    def searcher(self):
        return self.index.searcher()

    def doc_count(self):
        return self.searcher.doc_count()

    def close(self):
        if 'searcher' in self.__dict__:
            self.searcher.close()
            del self.__dict__['searcher']

    def refresh(self):
        if 'searcher' in self.__dict__:
            self.__dict__['searcher'] = self.searcher.refresh()

    def writer(self):
        return self.index.writer()

    @staticmethod
    def get_language_code(code, langmap):
        language = Language.objects.auto_get_or_create(code)
        if langmap and language.code in langmap:
            language = Language.objects.auto_get_or_create(
                langmap[language.code])
        return language.code

    @staticmethod
    def get_category(category=None, project=None, user=None, use_file=False):
        if project:
            return CATEGORY_PRIVATE_OFFSET + project.pk
        if user:
            return CATEGORY_USER_OFFSET + user.pk
        if use_file:
            return CATEGORY_FILE
        return category

    @classmethod
    def import_file(cls,
                    request,
                    fileobj,
                    langmap=None,
                    category=None,
                    project=None,
                    user=None,
                    use_file=False):
        origin = force_text(os.path.basename(fileobj.name)).lower()
        category = cls.get_category(category, project, user, use_file)
        name, extension = os.path.splitext(origin)
        if len(name) > 25:
            origin = '{}...{}'.format(name[:25], extension)
        if extension == '.tmx':
            result = cls.import_tmx(request, fileobj, langmap, category,
                                    origin)
        elif extension == '.json':
            result = cls.import_json(request, fileobj, category, origin)
        else:
            raise MemoryImportError(_('Unsupported file!'))
        if not result:
            raise MemoryImportError(
                _('No valid entries found in the uploaded file!'))
        return result

    @classmethod
    def import_json(cls, request, fileobj, category=None, origin=None):
        from weblate.memory.tasks import update_memory_task
        content = fileobj.read()
        try:
            data = json.loads(force_text(content))
        except (ValueError, UnicodeDecodeError) as error:
            report_error(error, request, prefix='Failed to parse')
            raise MemoryImportError(_('Failed to parse JSON file!'))
        updates = {}
        fields = cls.SCHEMA().names()
        if category:
            updates = {
                'category': category,
                'origin': origin,
            }
        found = 0
        if isinstance(data, list):
            for entry in data:
                if not isinstance(entry, dict):
                    continue
                # Apply overrides
                entry.update(updates)
                # Ensure all fields are set
                if not all(entry.get(field) for field in fields):
                    continue
                # Ensure there are no extra fields
                record = {field: entry[field] for field in fields}
                update_memory_task.delay(**record)
                found += 1
        return found

    @classmethod
    def import_tmx(cls,
                   request,
                   fileobj,
                   langmap=None,
                   category=None,
                   origin=None):
        from weblate.memory.tasks import update_memory_task
        if category is None:
            category = CATEGORY_FILE
        try:
            storage = tmxfile.parsefile(fileobj)
        except (SyntaxError, AssertionError) as error:
            report_error(error, request, prefix='Failed to parse')
            raise MemoryImportError(_('Failed to parse TMX file!'))
        header = next(storage.document.getroot().iterchildren(
            storage.namespaced("header")))
        source_language_code = header.get('srclang')
        source_language = cls.get_language_code(source_language_code, langmap)

        languages = {}
        found = 0
        for unit in storage.units:
            # Parse translations (translate-toolkit does not care about
            # languages here, it just picks first and second XML elements)
            translations = {}
            for node in unit.getlanguageNodes():
                lang, text = get_node_data(unit, node)
                if not lang or not text:
                    continue
                translations[lang] = text
                if lang not in languages:
                    languages[lang] = cls.get_language_code(lang, langmap)

            try:
                source = translations.pop(source_language_code)
            except KeyError:
                # Skip if source language is not present
                continue

            for lang, text in translations.items():
                update_memory_task.delay(
                    source_language=source_language,
                    target_language=languages[lang],
                    source=source,
                    target=text,
                    origin=origin,
                    category=category,
                )
                found += 1
        return found

    @staticmethod
    def get_filter(user, project, use_shared, use_file):
        """Create query to filter categories based on selection."""
        # Always include file imported memory
        if use_file:
            category_filter = [query.Term('category', CATEGORY_FILE)]
        else:
            category_filter = []
        # Per user memory
        if user:
            category_filter.append(
                query.Term('category', CATEGORY_USER_OFFSET + user.id))
        # Private project memory
        if project:
            category_filter.append(
                query.Term('category', CATEGORY_PRIVATE_OFFSET + project.id))
        # Shared memory
        if use_shared:
            category_filter.append(query.Term('category', CATEGORY_SHARED))
        return query.Or(category_filter)

    def list_documents(self, user=None, project=None, use_file=False):
        catfilter = self.get_filter(user, project, False, use_file)
        return self.searcher.search(catfilter, limit=None)

    def lookup(self, source_language, target_language, text, user, project,
               use_shared):
        langfilter = query.And([
            query.Term('source_language', source_language),
            query.Term('target_language', target_language),
            self.get_filter(user, project, use_shared, True),
        ])
        text_query = self.parser.parse(text)
        matches = self.searcher.search(text_query,
                                       filter=langfilter,
                                       limit=20000)

        for match in matches:
            similarity = self.comparer.similarity(text, match['source'])
            if similarity < 30:
                continue
            yield (match['source'], match['target'], similarity,
                   match['category'], match['origin'])

    def delete(self,
               origin=None,
               category=None,
               project=None,
               user=None,
               use_file=False):
        """Delete entries based on filter."""
        category = self.get_category(category, project, user, use_file)
        with self.writer() as writer:
            if origin:
                return writer.delete_by_term('origin', origin)
            return writer.delete_by_term('category', category)

    def empty(self):
        """Recreates translation memory."""
        self.close()
        self.cleanup()
        self.index = self.open_index()

    def get_values(self, field):
        return [
            force_text(x) for x in self.searcher.reader().field_terms(field)
        ]

    def dump(self, handle, indent=2):
        """Dump memory content to JSON file."""
        json.dump(
            list(self.searcher.documents()),
            handle,
            indent=indent,
        )
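
A hedged usage sketch for the class above; the import path and the exact tuple layout yielded by lookup() are inferred from this snippet rather than verified against the Weblate API:

from weblate.memory.storage import TranslationMemory  # assumed module path

memory = TranslationMemory()
print(memory.doc_count())
# lookup() yields (source, target, similarity, category, origin) tuples
# with similarity >= 30, searching file-imported plus shared memory here.
for source, target, similarity, category, origin in memory.lookup(
    'en', 'cs', 'Save changes', user=None, project=None, use_shared=True
):
    print(similarity, source, '->', target)
memory.close()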
Example #5
class TranslationMemory(WhooshIndex):
    LOCATION = 'memory'
    SCHEMA = TMSchema

    def __init__(self):
        self.index = self.open_index()
        self.parser = qparser.QueryParser(
            'source',
            schema=self.index.schema,
            group=qparser.OrGroup.factory(0.9),
            termclass=query.FuzzyTerm,
        )
        self.searcher = None
        self.comparer = Comparer()

    def __del__(self):
        self.close()

    def open_searcher(self):
        if self.searcher is None:
            self.searcher = self.index.searcher()

    def doc_count(self):
        self.open_searcher()
        return self.searcher.doc_count()

    def close(self):
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None

    def writer(self):
        return self.index.writer()

    def get_language_code(self, code, langmap):
        language = Language.objects.auto_get_or_create(code)
        if langmap and language.code in langmap:
            language = Language.objects.auto_get_or_create(
                langmap[language.code]
            )
        return language.code

    def import_tmx(self, fileobj, langmap=None):
        origin = force_text(os.path.basename(fileobj.name))
        storage = tmxfile.parsefile(fileobj)
        header = next(
            storage.document.getroot().iterchildren(
                storage.namespaced("header")
            )
        )
        source_language_code = header.get('srclang')
        source_language = self.get_language_code(source_language_code, langmap)

        languages = {}
        with self.writer() as writer:
            for unit in storage.units:
                # Parse translations (translate-toolkit does not care about
                # languages here, it just picks first and second XML elements)
                translations = {}
                for node in unit.getlanguageNodes():
                    lang, text = get_node_data(unit, node)
                    translations[lang] = text
                    if lang not in languages:
                        languages[lang] = self.get_language_code(lang, langmap)

                try:
                    source = translations.pop(source_language_code)
                except KeyError:
                    # Skip if source language is not present
                    continue

                for lang, text in translations.items():
                    writer.add_document(
                        source_language=source_language,
                        target_language=languages[lang],
                        source=source,
                        target=text,
                        origin=origin,
                        category=CATEGORY_FILE,
                    )

    def lookup(self, source_language, target_language, text):
        langfilter = query.And([
            query.Term('source_language', source_language),
            query.Term('target_language', target_language),
        ])
        self.open_searcher()
        text_query = self.parser.parse(text)
        matches = self.searcher.search(
            text_query, filter=langfilter, limit=20000
        )

        for match in matches:
            similarity = self.comparer.similarity(text, match['source'])
            if similarity < 30:
                continue
            yield (
                match['source'], match['target'], similarity, match['origin']
            )

    def delete(self, origin):
        """Delete entries by origin."""
        with self.writer() as writer:
            return writer.delete_by_term('origin', origin)

    def empty(self):
        """Recreates translation memory."""
        self.cleanup()
        self.index = self.open_index()
        self.searcher = None

    def get_origins(self):
        self.open_searcher()
        return [
            force_text(x) for x in self.searcher.lexicon('origin')
        ]
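
Unlike the cached_property searcher in the previous example, this version manages the searcher by hand via open_searcher() and close(). A standalone sketch of that lazy-resource pattern, with a plain object standing in for the Whoosh searcher so the example runs anywhere:

class LazyIndex:
    def __init__(self):
        self.searcher = None

    def open_searcher(self):
        # Create the expensive resource only on first use.
        if self.searcher is None:
            self.searcher = object()  # stand-in for self.index.searcher()
        return self.searcher

    def close(self):
        # Drop the resource; the next open_searcher() call recreates it.
        if self.searcher is not None:
            self.searcher = None

index = LazyIndex()
assert index.open_searcher() is index.open_searcher()  # reused, not rebuilt
index.close()
assert index.searcher is None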
Example #6
class TranslationMemory(WhooshIndex):
    LOCATION = 'memory'
    SCHEMA = TMSchema

    def __init__(self):
        self.index = self.open_index()
        self.parser = qparser.QueryParser(
            'source',
            schema=self.index.schema,
            group=qparser.OrGroup.factory(0.9),
            termclass=query.FuzzyTerm,
        )
        self.searcher = None
        self.comparer = Comparer()

    def __del__(self):
        self.close()

    def open_searcher(self):
        if self.searcher is None:
            self.searcher = self.index.searcher()

    def doc_count(self):
        self.open_searcher()
        return self.searcher.doc_count()

    def close(self):
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None

    def writer(self):
        return self.index.writer()

    @staticmethod
    def get_language_code(code, langmap):
        language = Language.objects.auto_get_or_create(code)
        if langmap and language.code in langmap:
            language = Language.objects.auto_get_or_create(
                langmap[language.code]
            )
        return language.code

    @staticmethod
    def get_category(category=None, project=None, user=None):
        if project:
            return CATEGORY_PRIVATE_OFFSET + project.pk
        if user:
            return CATEGORY_USER_OFFSET + user.pk
        return category

    @classmethod
    def import_file(cls, request, fileobj, langmap=None, category=None,
                    project=None, user=None):
        origin = force_text(os.path.basename(fileobj.name)).lower()
        category = cls.get_category(category, project, user)
        name, extension = os.path.splitext(origin)
        if len(name) > 25:
            origin = '{}...{}'.format(name[:25], extension)
        if extension == '.tmx':
            result = cls.import_tmx(request, fileobj, langmap, category, origin)
        elif extension == '.json':
            result = cls.import_json(request, fileobj, category, origin)
        else:
            raise MemoryImportError(_('Unsupported file!'))
        if not result:
            raise MemoryImportError(
                _('No valid entries found in the uploaded file!')
            )
        return result

    @classmethod
    def import_json(cls, request, fileobj, category=None, origin=None):
        from weblate.memory.tasks import update_memory_task
        content = fileobj.read()
        try:
            data = json.loads(force_text(content))
        except (ValueError, UnicodeDecodeError) as error:
            report_error(error, request)
            raise MemoryImportError(_('Failed to parse JSON file!'))
        updates = {}
        fields = cls.SCHEMA().names()
        if category:
            updates = {
                'category': category,
                'origin': origin,
            }
        found = 0
        if isinstance(data, list):
            for entry in data:
                if not isinstance(entry, dict):
                    continue
                # Apply overrides
                entry.update(updates)
                # Ensure all fields are set
                if not all(entry.get(field) for field in fields):
                    continue
                # Ensure there are no extra fields
                record = {field: entry[field] for field in fields}
                update_memory_task.delay(**record)
                found += 1
        return found

    @classmethod
    def import_tmx(cls, request, fileobj, langmap=None, category=None, origin=None):
        from weblate.memory.tasks import update_memory_task
        if category is None:
            category = CATEGORY_FILE
        try:
            storage = tmxfile.parsefile(fileobj)
        except SyntaxError as error:
            report_error(error, request)
            raise MemoryImportError(_('Failed to parse TMX file!'))
        header = next(
            storage.document.getroot().iterchildren(
                storage.namespaced("header")
            )
        )
        source_language_code = header.get('srclang')
        source_language = cls.get_language_code(source_language_code, langmap)

        languages = {}
        found = 0
        for unit in storage.units:
            # Parse translations (translate-toolkit does not care about
            # languages here, it just picks first and second XML elements)
            translations = {}
            for node in unit.getlanguageNodes():
                lang, text = get_node_data(unit, node)
                if not lang or not text:
                    continue
                translations[lang] = text
                if lang not in languages:
                    languages[lang] = cls.get_language_code(lang, langmap)

            try:
                source = translations.pop(source_language_code)
            except KeyError:
                # Skip if source language is not present
                continue

            for lang, text in translations.items():
                update_memory_task.delay(
                    source_language=source_language,
                    target_language=languages[lang],
                    source=source,
                    target=text,
                    origin=origin,
                    category=category,
                )
                found += 1
        return found

    @staticmethod
    def get_filter(user, project, use_shared, use_file):
        """Create query to filter categories based on selection."""
        # Always include file imported memory
        if use_file:
            category_filter = [query.Term('category', CATEGORY_FILE)]
        else:
            category_filter = []
        # Per user memory
        if user:
            category_filter.append(
                query.Term('category', CATEGORY_USER_OFFSET + user.id)
            )
        # Private project memory
        if project:
            category_filter.append(
                query.Term('category', CATEGORY_PRIVATE_OFFSET + project.id)
            )
        # Shared memory
        if use_shared:
            category_filter.append(query.Term('category', CATEGORY_SHARED))
        return query.Or(category_filter)

    def list_documents(self, user=None, project=None):
        catfilter = self.get_filter(user, project, False, False)
        self.open_searcher()
        return self.searcher.search(catfilter, limit=None)

    def lookup(self, source_language, target_language, text, user,
               project, use_shared):
        langfilter = query.And([
            query.Term('source_language', source_language),
            query.Term('target_language', target_language),
            self.get_filter(user, project, use_shared, True),
        ])
        self.open_searcher()
        text_query = self.parser.parse(text)
        matches = self.searcher.search(
            text_query, filter=langfilter, limit=20000
        )

        for match in matches:
            similarity = self.comparer.similarity(text, match['source'])
            if similarity < 30:
                continue
            yield (
                match['source'], match['target'], similarity,
                match['category'], match['origin']
            )

    def delete(self, origin=None, category=None, project=None, user=None):
        """Delete entries based on filter."""
        category = self.get_category(category, project, user)
        with self.writer() as writer:
            if origin:
                return writer.delete_by_term('origin', origin)
            return writer.delete_by_term('category', category)

    def empty(self):
        """Recreates translation memory."""
        self.cleanup()
        self.index = self.open_index()
        self.searcher = None

    def get_values(self, field):
        self.open_searcher()
        return [
            force_text(x) for x in self.searcher.reader().field_terms(field)
        ]

    def dump(self, handle, indent=2):
        """Dump memory content to JSON file."""
        self.open_searcher()
        json.dump(
            list(self.searcher.documents()),
            handle,
            indent=indent,
        )
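
The category arithmetic shared by get_category() and get_filter() packs the memory scope (file import, shared, per project, per user) into a single integer field. A standalone sketch with illustrative offset constants; the real values are defined alongside this class in Weblate:

CATEGORY_FILE = 1  # illustrative stand-ins for the real constants
CATEGORY_SHARED = 2
CATEGORY_PRIVATE_OFFSET = 10000000
CATEGORY_USER_OFFSET = 20000000

def category_for(project_pk=None, user_pk=None):
    # Project-private memory takes precedence over per-user memory,
    # mirroring the order of checks in get_category() above.
    if project_pk is not None:
        return CATEGORY_PRIVATE_OFFSET + project_pk
    if user_pk is not None:
        return CATEGORY_USER_OFFSET + user_pk
    return CATEGORY_FILE

assert category_for(project_pk=42) == CATEGORY_PRIVATE_OFFSET + 42
assert category_for(user_pk=7) == CATEGORY_USER_OFFSET + 7
assert category_for() == CATEGORY_FILE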