def __init__(self):
    self.index = FileStorage(data_dir('memory')).open_index()
    self.parser = qparser.QueryParser(
        'source',
        schema=self.index.schema,
        group=qparser.OrGroup.factory(0.9),
        termclass=query.FuzzyTerm,
    )
    self.searcher = None
    self.comparer = Comparer()
def __init__(self):
    self.index = self.open_index()
    self.parser = qparser.QueryParser(
        'source',
        schema=self.index.schema,
        group=qparser.OrGroup.factory(0.9),
        termclass=query.FuzzyTerm,
        plugins=[],
    )
    self.comparer = Comparer()
def download_translations(self, source, language, text, unit, user):
    """Download list of possible translations from a service."""
    matching_units = Unit.objects.prefetch().filter(
        translation__component__project__in=Project.objects.all_acl(user)
    ).more_like_this(unit, 1000)
    comparer = Comparer()
    result = set(
        self.format_unit_match(
            munit, comparer.similarity(text, munit.get_source_plurals()[0])
        )
        for munit in matching_units
    )
    if None in result:
        result.remove(None)
    return result
def __init__(self): """Create new machine translation object.""" self.mtid = self.name.lower().replace(" ", "-") self.rate_limit_cache = "{}-rate-limit".format(self.mtid) self.languages_cache = "{}-languages".format(self.mtid) self.comparer = Comparer() self.supported_languages_error = None
def __init__(self): """Create new machine translation object.""" self.mtid = self.name.lower().replace(' ', '-') self.rate_limit_cache = '{}-rate-limit'.format(self.mtid) self.languages_cache = '{}-languages'.format(self.mtid) self.request_url = None self.request_params = None self.comparer = Comparer()
def download_translations(self, source, language, text, unit, user):
    """Download list of possible translations from a service."""
    matching_units = Unit.objects.prefetch().filter(
        translation__component__project__in=user.allowed_projects
    ).more_like_this(unit, 1000)
    comparer = Comparer()
    result = set(
        self.format_unit_match(
            munit, comparer.similarity(text, munit.get_source_plurals()[0])
        )
        for munit in matching_units
    )
    if None in result:
        result.remove(None)
    return result
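Both download_translations variants above rely on format_unit_match apparently returning None for matches it rejects, and then strip that marker from the result set. The same filtering can be written a touch more directly with set.discard, which needs no membership test; a minimal sketch of the tail of either method:

    result = {
        self.format_unit_match(
            munit, comparer.similarity(text, munit.get_source_plurals()[0])
        )
        for munit in matching_units
    }
    # discard() is a no-op when None is absent, unlike remove().
    result.discard(None)
    return result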
def __init__(self):
    self.index = self.open_index()
    self.parser = qparser.QueryParser(
        'source',
        schema=self.index.schema,
        group=qparser.OrGroup.factory(0.9),
        termclass=query.FuzzyTerm,
    )
    self.searcher = None
    self.comparer = Comparer()
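The parser configuration shared by the __init__ variants above combines two Whoosh features: qparser.OrGroup.factory(0.9) makes any single matching term sufficient while still scoring documents that match more of the query higher, and termclass=query.FuzzyTerm lets terms match despite small typos. A self-contained sketch of that behavior against a throwaway in-memory index (the schema and documents here are made up for the demo):

from whoosh import qparser, query
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage

schema = Schema(source=TEXT(stored=True))
index = RamStorage().create_index(schema)
with index.writer() as writer:
    writer.add_document(source='Hello, world!')
    writer.add_document(source='Hello, Weblate!')

parser = qparser.QueryParser(
    'source',
    schema=schema,
    group=qparser.OrGroup.factory(0.9),
    termclass=query.FuzzyTerm,
)
with index.searcher() as searcher:
    # 'helo' is one edit away from 'hello', so FuzzyTerm still matches;
    # both documents are printed.
    for hit in searcher.search(parser.parse('helo')):
        print(hit['source'])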
def download_translations(self, source, language, text, unit, user, search):
    """Download list of possible translations from a service."""
    comparer = Comparer()
    for result in Memory.objects.lookup(
        source,
        language,
        text,
        user,
        unit.translation.component.project,
        unit.translation.component.project.use_shared_tm,
    ).iterator():
        quality = comparer.similarity(text, result.source)
        if quality < 10 or (quality < 75 and not search):
            continue
        yield {
            "text": result.target,
            "quality": quality,
            "service": self.name,
            "origin": result.get_origin_display(),
            "source": result.source,
        }
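The two thresholds above encode two modes of operation: an interactive search keeps anything scoring at least 10, while automatic suggestions require at least 75. The same rule as a standalone predicate (the function name is illustrative, not part of the code above):

def accept_match(quality, search):
    """Keep matches scoring >= 75, or >= 10 when the user is searching."""
    return quality >= 10 and (search or quality >= 75)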
def test_long(self):
    self.assertLessEqual(
        Comparer().similarity('a' * 200000, 'b' * 200000), 50
    )
def test_unicode(self):
    self.assertEqual(Comparer().similarity('NICHOLASŸ', 'NICHOLAS'), 88)
def test_same(self):
    self.assertEqual(Comparer().similarity('a', 'a'), 100)
def test_different(self):
    self.assertLessEqual(Comparer().similarity('a', 'b'), 50)
def test_long(self):
    # This is expected to raise MemoryError inside jellyfish
    self.assertLessEqual(Comparer().similarity("a" * 200000, "b" * 200000), 50)
def test_unicode(self):
    # Test fallback to Python implementation in jellyfish
    # for unicode strings
    self.assertEqual(Comparer().similarity("NICHOLASŸ", "NICHOLAS"), 88)
def test_same(self):
    self.assertEqual(Comparer().similarity("a", "a"), 100)
def test_different(self):
    self.assertLessEqual(Comparer().similarity("a", "b"), 50)
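The tests above pin down the scoring contract: identical strings score 100, one edit across nine characters scores 88, and multi-hundred-kilobyte inputs must degrade instead of crashing. A minimal Comparer consistent with those expectations, assuming a jellyfish Damerau-Levenshtein distance with a difflib fallback (the real implementation may differ in its exact formula and fallback conditions):

from difflib import SequenceMatcher

from jellyfish import damerau_levenshtein_distance


class Comparer:
    """Compare two strings and return a similarity score from 0 to 100."""

    def similarity(self, first, second):
        try:
            # One edit in 'NICHOLASŸ' vs 'NICHOLAS' gives
            # 100 - 100 / 9, truncated to 88.
            distance = damerau_levenshtein_distance(first, second)
            return int(
                100 - 100.0 * distance / max(len(first), len(second), 1)
            )
        except MemoryError:
            # The distance matrix for 200000-character inputs is too
            # large; fall back to difflib's cheaper ratio.
            return int(SequenceMatcher(None, first, second).ratio() * 100)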
class TranslationMemory(WhooshIndex):
    LOCATION = 'memory'
    SCHEMA = TMSchema

    def __init__(self):
        self.index = self.open_index()
        self.parser = qparser.QueryParser(
            'source',
            schema=self.index.schema,
            group=qparser.OrGroup.factory(0.9),
            termclass=query.FuzzyTerm,
        )
        self.searcher = None
        self.comparer = Comparer()

    def __del__(self):
        self.close()

    def open_searcher(self):
        if self.searcher is None:
            self.searcher = self.index.searcher()

    def doc_count(self):
        self.open_searcher()
        return self.searcher.doc_count()

    def close(self):
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None

    def writer(self):
        return self.index.writer()

    @staticmethod
    def get_language_code(code, langmap):
        language = Language.objects.auto_get_or_create(code)
        if langmap and language.code in langmap:
            language = Language.objects.auto_get_or_create(
                langmap[language.code]
            )
        return language.code

    @staticmethod
    def get_category(category=None, project=None, user=None):
        if project:
            return CATEGORY_PRIVATE_OFFSET + project.pk
        if user:
            return CATEGORY_USER_OFFSET + user.pk
        return category

    @classmethod
    def import_file(cls, request, fileobj, langmap=None, category=None,
                    project=None, user=None):
        origin = force_text(os.path.basename(fileobj.name)).lower()
        category = cls.get_category(category, project, user)
        name, extension = os.path.splitext(origin)
        if len(name) > 25:
            origin = '{}...{}'.format(name[:25], extension)
        if extension == '.tmx':
            result = cls.import_tmx(
                request, fileobj, langmap, category, origin
            )
        elif extension == '.json':
            result = cls.import_json(request, fileobj, category, origin)
        else:
            raise MemoryImportError(_('Unsupported file!'))
        if not result:
            raise MemoryImportError(
                _('No valid entries found in the uploaded file!')
            )
        return result

    @classmethod
    def import_json(cls, request, fileobj, category=None, origin=None):
        from weblate.memory.tasks import update_memory_task
        content = fileobj.read()
        try:
            data = json.loads(force_text(content))
        except (ValueError, UnicodeDecodeError) as error:
            report_error(error, request)
            raise MemoryImportError(_('Failed to parse JSON file!'))
        updates = {}
        fields = cls.SCHEMA().names()
        if category:
            updates = {
                'category': category,
                'origin': origin,
            }
        found = 0
        if isinstance(data, list):
            for entry in data:
                if not isinstance(entry, dict):
                    continue
                # Apply overrides
                entry.update(updates)
                # Ensure all fields are set
                if not all(entry.get(field) for field in fields):
                    continue
                # Ensure there are no extra fields
                record = {field: entry[field] for field in fields}
                update_memory_task.delay(**record)
                found += 1
        return found

    @classmethod
    def import_tmx(cls, request, fileobj, langmap=None, category=None,
                   origin=None):
        from weblate.memory.tasks import update_memory_task
        if category is None:
            category = CATEGORY_FILE
        try:
            storage = tmxfile.parsefile(fileobj)
        except SyntaxError as error:
            report_error(error, request)
            raise MemoryImportError(_('Failed to parse TMX file!'))
        header = next(
            storage.document.getroot().iterchildren(
                storage.namespaced("header")
            )
        )
        source_language_code = header.get('srclang')
        source_language = cls.get_language_code(source_language_code, langmap)
        languages = {}
        found = 0
        for unit in storage.units:
            # Parse translations (translate-toolkit does not care about
            # languages here, it just picks first and second XML elements)
            translations = {}
            for node in unit.getlanguageNodes():
                lang, text = get_node_data(unit, node)
                if not lang or not text:
                    continue
                translations[lang] = text
                if lang not in languages:
                    languages[lang] = cls.get_language_code(lang, langmap)
            try:
                source = translations.pop(source_language_code)
            except KeyError:
                # Skip if source language is not present
                continue
            for lang, text in translations.items():
                update_memory_task.delay(
                    source_language=source_language,
                    target_language=languages[lang],
                    source=source,
                    target=text,
                    origin=origin,
                    category=category,
                )
                found += 1
        return found

    @staticmethod
    def get_filter(user, project, use_shared, use_file):
        """Create query to filter categories based on selection."""
        # Always include file imported memory
        if use_file:
            category_filter = [query.Term('category', CATEGORY_FILE)]
        else:
            category_filter = []
        # Per user memory
        if user:
            category_filter.append(
                query.Term('category', CATEGORY_USER_OFFSET + user.id)
            )
        # Private project memory
        if project:
            category_filter.append(
                query.Term('category', CATEGORY_PRIVATE_OFFSET + project.id)
            )
        # Shared memory
        if use_shared:
            category_filter.append(query.Term('category', CATEGORY_SHARED))
        return query.Or(category_filter)

    def list_documents(self, user=None, project=None):
        catfilter = self.get_filter(user, project, False, False)
        self.open_searcher()
        return self.searcher.search(catfilter, limit=None)

    def lookup(self, source_language, target_language, text, user,
               project, use_shared):
        langfilter = query.And([
            query.Term('source_language', source_language),
            query.Term('target_language', target_language),
            self.get_filter(user, project, use_shared, True),
        ])
        self.open_searcher()
        text_query = self.parser.parse(text)
        matches = self.searcher.search(
            text_query, filter=langfilter, limit=20000
        )
        for match in matches:
            similarity = self.comparer.similarity(text, match['source'])
            if similarity < 30:
                continue
            yield (
                match['source'],
                match['target'],
                similarity,
                match['category'],
                match['origin'],
            )

    def delete(self, origin=None, category=None, project=None, user=None):
        """Delete entries based on filter."""
        category = self.get_category(category, project, user)
        with self.writer() as writer:
            if origin:
                return writer.delete_by_term('origin', origin)
            return writer.delete_by_term('category', category)

    def empty(self):
        """Recreate translation memory."""
        self.cleanup()
        self.index = self.open_index()
        self.searcher = None

    def get_values(self, field):
        self.open_searcher()
        return [
            force_text(x)
            for x in self.searcher.reader().field_terms(field)
        ]

    def dump(self, handle, indent=2):
        """Dump memory content to JSON file."""
        self.open_searcher()
        json.dump(
            list(self.searcher.documents()),
            handle,
            indent=indent,
        )
class TranslationMemory(WhooshIndex):
    LOCATION = 'memory'
    SCHEMA = TMSchema

    def __init__(self):
        self.index = self.open_index()
        self.parser = qparser.QueryParser(
            'source',
            schema=self.index.schema,
            group=qparser.OrGroup.factory(0.9),
            termclass=query.FuzzyTerm,
            plugins=[],
        )
        self.comparer = Comparer()

    def __del__(self):
        self.close()

    @cached_property
    def searcher(self):
        return self.index.searcher()

    def doc_count(self):
        return self.searcher.doc_count()

    def close(self):
        if 'searcher' in self.__dict__:
            self.searcher.close()
            del self.__dict__['searcher']

    def refresh(self):
        if 'searcher' in self.__dict__:
            self.__dict__['searcher'] = self.searcher.refresh()

    def writer(self):
        return self.index.writer()

    @staticmethod
    def get_language_code(code, langmap):
        language = Language.objects.auto_get_or_create(code)
        if langmap and language.code in langmap:
            language = Language.objects.auto_get_or_create(
                langmap[language.code]
            )
        return language.code

    @staticmethod
    def get_category(category=None, project=None, user=None, use_file=False):
        if project:
            return CATEGORY_PRIVATE_OFFSET + project.pk
        if user:
            return CATEGORY_USER_OFFSET + user.pk
        if use_file:
            return CATEGORY_FILE
        return category

    @classmethod
    def import_file(cls, request, fileobj, langmap=None, category=None,
                    project=None, user=None, use_file=False):
        origin = force_text(os.path.basename(fileobj.name)).lower()
        category = cls.get_category(category, project, user, use_file)
        name, extension = os.path.splitext(origin)
        if len(name) > 25:
            origin = '{}...{}'.format(name[:25], extension)
        if extension == '.tmx':
            result = cls.import_tmx(
                request, fileobj, langmap, category, origin
            )
        elif extension == '.json':
            result = cls.import_json(request, fileobj, category, origin)
        else:
            raise MemoryImportError(_('Unsupported file!'))
        if not result:
            raise MemoryImportError(
                _('No valid entries found in the uploaded file!')
            )
        return result

    @classmethod
    def import_json(cls, request, fileobj, category=None, origin=None):
        from weblate.memory.tasks import update_memory_task
        content = fileobj.read()
        try:
            data = json.loads(force_text(content))
        except (ValueError, UnicodeDecodeError) as error:
            report_error(error, request, prefix='Failed to parse')
            raise MemoryImportError(_('Failed to parse JSON file!'))
        updates = {}
        fields = cls.SCHEMA().names()
        if category:
            updates = {
                'category': category,
                'origin': origin,
            }
        found = 0
        if isinstance(data, list):
            for entry in data:
                if not isinstance(entry, dict):
                    continue
                # Apply overrides
                entry.update(updates)
                # Ensure all fields are set
                if not all(entry.get(field) for field in fields):
                    continue
                # Ensure there are no extra fields
                record = {field: entry[field] for field in fields}
                update_memory_task.delay(**record)
                found += 1
        return found

    @classmethod
    def import_tmx(cls, request, fileobj, langmap=None, category=None,
                   origin=None):
        from weblate.memory.tasks import update_memory_task
        if category is None:
            category = CATEGORY_FILE
        try:
            storage = tmxfile.parsefile(fileobj)
        except (SyntaxError, AssertionError) as error:
            report_error(error, request, prefix='Failed to parse')
            raise MemoryImportError(_('Failed to parse TMX file!'))
        header = next(
            storage.document.getroot().iterchildren(
                storage.namespaced("header")
            )
        )
        source_language_code = header.get('srclang')
        source_language = cls.get_language_code(source_language_code, langmap)
        languages = {}
        found = 0
        for unit in storage.units:
            # Parse translations (translate-toolkit does not care about
            # languages here, it just picks first and second XML elements)
            translations = {}
            for node in unit.getlanguageNodes():
                lang, text = get_node_data(unit, node)
                if not lang or not text:
                    continue
                translations[lang] = text
                if lang not in languages:
                    languages[lang] = cls.get_language_code(lang, langmap)
            try:
                source = translations.pop(source_language_code)
            except KeyError:
                # Skip if source language is not present
                continue
            for lang, text in translations.items():
                update_memory_task.delay(
                    source_language=source_language,
                    target_language=languages[lang],
                    source=source,
                    target=text,
                    origin=origin,
                    category=category,
                )
                found += 1
        return found

    @staticmethod
    def get_filter(user, project, use_shared, use_file):
        """Create query to filter categories based on selection."""
        # Always include file imported memory
        if use_file:
            category_filter = [query.Term('category', CATEGORY_FILE)]
        else:
            category_filter = []
        # Per user memory
        if user:
            category_filter.append(
                query.Term('category', CATEGORY_USER_OFFSET + user.id)
            )
        # Private project memory
        if project:
            category_filter.append(
                query.Term('category', CATEGORY_PRIVATE_OFFSET + project.id)
            )
        # Shared memory
        if use_shared:
            category_filter.append(query.Term('category', CATEGORY_SHARED))
        return query.Or(category_filter)

    def list_documents(self, user=None, project=None, use_file=False):
        catfilter = self.get_filter(user, project, False, use_file)
        return self.searcher.search(catfilter, limit=None)

    def lookup(self, source_language, target_language, text, user,
               project, use_shared):
        langfilter = query.And([
            query.Term('source_language', source_language),
            query.Term('target_language', target_language),
            self.get_filter(user, project, use_shared, True),
        ])
        text_query = self.parser.parse(text)
        matches = self.searcher.search(
            text_query, filter=langfilter, limit=20000
        )
        for match in matches:
            similarity = self.comparer.similarity(text, match['source'])
            if similarity < 30:
                continue
            yield (
                match['source'],
                match['target'],
                similarity,
                match['category'],
                match['origin'],
            )

    def delete(self, origin=None, category=None, project=None, user=None,
               use_file=False):
        """Delete entries based on filter."""
        category = self.get_category(category, project, user, use_file)
        with self.writer() as writer:
            if origin:
                return writer.delete_by_term('origin', origin)
            return writer.delete_by_term('category', category)

    def empty(self):
        """Recreate translation memory."""
        self.close()
        self.cleanup()
        self.index = self.open_index()

    def get_values(self, field):
        return [
            force_text(x)
            for x in self.searcher.reader().field_terms(field)
        ]

    def dump(self, handle, indent=2):
        """Dump memory content to JSON file."""
        json.dump(
            list(self.searcher.documents()),
            handle,
            indent=indent,
        )
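For reference, import_json in both class versions accepts a list of flat objects whose keys are the schema field names; extra keys are stripped and incomplete entries are skipped. A minimal payload sketch (the field list is inferred from the TMX import path above; TMSchema is assumed to define exactly these keys, and the category value 1 standing for CATEGORY_FILE is an assumption):

MEMORY_JSON = '''
[
    {
        "source_language": "en",
        "target_language": "cs",
        "source": "Hello",
        "target": "Ahoj",
        "origin": "example.json",
        "category": 1
    }
]
'''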
class TranslationMemory(WhooshIndex):
    LOCATION = 'memory'
    SCHEMA = TMSchema

    def __init__(self):
        self.index = self.open_index()
        self.parser = qparser.QueryParser(
            'source',
            schema=self.index.schema,
            group=qparser.OrGroup.factory(0.9),
            termclass=query.FuzzyTerm,
        )
        self.searcher = None
        self.comparer = Comparer()

    def __del__(self):
        self.close()

    def open_searcher(self):
        if self.searcher is None:
            self.searcher = self.index.searcher()

    def doc_count(self):
        self.open_searcher()
        return self.searcher.doc_count()

    def close(self):
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None

    def writer(self):
        return self.index.writer()

    def get_language_code(self, code, langmap):
        language = Language.objects.auto_get_or_create(code)
        if langmap and language.code in langmap:
            language = Language.objects.auto_get_or_create(
                langmap[language.code]
            )
        return language.code

    def import_tmx(self, fileobj, langmap=None):
        origin = force_text(os.path.basename(fileobj.name))
        storage = tmxfile.parsefile(fileobj)
        header = next(
            storage.document.getroot().iterchildren(
                storage.namespaced("header")
            )
        )
        source_language_code = header.get('srclang')
        source_language = self.get_language_code(
            source_language_code, langmap
        )
        languages = {}
        with self.writer() as writer:
            for unit in storage.units:
                # Parse translations (translate-toolkit does not care about
                # languages here, it just picks first and second XML
                # elements)
                translations = {}
                for node in unit.getlanguageNodes():
                    lang, text = get_node_data(unit, node)
                    translations[lang] = text
                    if lang not in languages:
                        languages[lang] = self.get_language_code(
                            lang, langmap
                        )
                try:
                    source = translations.pop(source_language_code)
                except KeyError:
                    # Skip if source language is not present
                    continue
                for lang, text in translations.items():
                    writer.add_document(
                        source_language=source_language,
                        target_language=languages[lang],
                        source=source,
                        target=text,
                        origin=origin,
                        category=CATEGORY_FILE,
                    )

    def lookup(self, source_language, target_language, text):
        langfilter = query.And([
            query.Term('source_language', source_language),
            query.Term('target_language', target_language),
        ])
        self.open_searcher()
        text_query = self.parser.parse(text)
        matches = self.searcher.search(
            text_query, filter=langfilter, limit=20000
        )
        for match in matches:
            similarity = self.comparer.similarity(text, match['source'])
            if similarity < 30:
                continue
            yield (
                match['source'],
                match['target'],
                similarity,
                match['origin'],
            )

    def delete(self, origin):
        """Delete entries by origin."""
        with self.writer() as writer:
            return writer.delete_by_term('origin', origin)

    def empty(self):
        """Recreate translation memory."""
        self.cleanup()
        self.index = self.open_index()
        self.searcher = None

    def get_origins(self):
        self.open_searcher()
        return [
            force_text(x)
            for x in self.searcher.lexicon('origin')
        ]
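A hedged usage sketch of this last, simplest variant; the TMX file name and the language codes are made up, and the surrounding setup (Django models, index location) is omitted:

memory = TranslationMemory()
with open('example.tmx', 'rb') as handle:
    memory.import_tmx(handle)
# Unlike the later versions, this lookup yields 4-tuples without a category.
for source, target, similarity, origin in memory.lookup('en', 'cs', 'Hello'):
    print('{}% {} -> {}'.format(similarity, source, target))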