Example #1
    def add_dict(self, unit, source_lang, target_lang, commit=True):
        """inserts units represented as dictionaries in database"""
        source_lang = data.normalize_code(source_lang)
        target_lang = data.normalize_code(target_lang)
        try:
            try:
                self.cursor.execute(
                    "INSERT INTO sources (text, context, lang, length) VALUES(?, ?, ?, ?)",
                    (unit["source"], unit["context"], source_lang,
                     len(unit["source"])))
                sid = self.cursor.lastrowid
            except dbapi2.IntegrityError:
                # source string already exists in db, run query to find sid
                self.cursor.execute(
                    "SELECT sid FROM sources WHERE text=? AND context=? and lang=?",
                    (unit["source"], unit["context"], source_lang))
                sid = self.cursor.fetchone()
                (sid, ) = sid
            try:
                # FIXME: get time info from translation store
                # FIXME: do we need to store target length?
                self.cursor.execute(
                    "INSERT INTO targets (sid, text, lang, time) VALUES (?, ?, ?, ?)",
                    (sid, unit["target"], target_lang, int(time.time())))
            except dbapi2.IntegrityError:
                # target string already exists in db, do nothing
                pass

            if commit:
                self.connection.commit()
        except Exception:
            if commit:
                self.connection.rollback()
            raise
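
A minimal usage sketch for add_dict (the `tmdb` handle is hypothetical; the class wiring and SQLite schema are not part of the snippet):

# Hypothetical: `tmdb` is an instance of the class defining add_dict above,
# with an open connection and the sources/targets tables already created.
unit = {"source": "Open file", "target": "Abrir ficheiro", "context": ""}
tmdb.add_dict(unit, "en_GB", "pt_BR")  # codes are stored as "en-gb"/"pt-br"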
Example #2
    def _match_normalized_langcode(self, langcode):
        # list() so the keys can be indexed (Python 3 dict views can't be)
        languages_keys = list(self.languages.keys())
        normalized_keys = [
            data.normalize_code(lang) for lang in languages_keys
        ]
        i = normalized_keys.index(data.normalize_code(langcode))
        return languages_keys[i]
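
For illustration, a self-contained version of the same normalized lookup (the `languages` mapping here is invented):

from translate.lang import data

languages = {"pt_BR": "Portuguese (Brazil)", "af_ZA": "Afrikaans"}
keys = list(languages.keys())
normalized = [data.normalize_code(k) for k in keys]
# "pt-br", "pt_BR" and "pt@BR" all normalize to "pt-br", so any spelling
# recovers the original key:
print(keys[normalized.index(data.normalize_code("pt-br"))])  # pt_BR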
Example #3
    def translate_unit(self, unit_source, source_langs, target_langs):
        """return TM suggestions for unit_source"""
        if isinstance(unit_source, str):
            unit_source = unicode(unit_source, "utf-8")
        if isinstance(source_langs, list):
            source_langs = [data.normalize_code(lang) for lang in source_langs]
            source_langs = ','.join(source_langs)
        else:
            source_langs = data.normalize_code(source_langs)
        if isinstance(target_langs, list):
            target_langs = [data.normalize_code(lang) for lang in target_langs]
            target_langs = ','.join(target_langs)
        else:
            target_langs = data.normalize_code(target_langs)

        minlen = min_levenshtein_length(len(unit_source), self.min_similarity)
        maxlen = max_levenshtein_length(len(unit_source), self.min_similarity,
                                        self.max_length)

        # split source into words, remove punctuation and special
        # chars, keep words that are at least 3 chars long
        unit_words = STRIP_REGEXP.sub(' ', unit_source).split()
        unit_words = filter(lambda word: len(word) > 2, unit_words)

        if self.fulltext and len(unit_words) > 3:
            logging.debug("fulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid JOIN fulltext f ON s.sid = f.docid
                       WHERE s.lang IN (?) AND t.lang IN (?) AND s.length BETWEEN ? AND ?
                       AND fulltext MATCH ?"""
            search_str = " OR ".join(unit_words)
            self.cursor.execute(
                query,
                (source_langs, target_langs, minlen, maxlen, search_str))
        else:
            logging.debug("nonfulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid
            WHERE s.lang IN (?) AND t.lang IN (?)
            AND s.length >= ? AND s.length <= ?"""
            self.cursor.execute(query,
                                (source_langs, target_langs, minlen, maxlen))

        results = []
        for row in self.cursor:
            quality = self.comparer.similarity(unit_source, row[0],
                                               self.min_similarity)
            if quality >= self.min_similarity:
                results.append({
                    'source': row[0],
                    'target': row[1],
                    'context': row[2],
                    'quality': quality,
                })
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:self.max_candidates]
        logging.debug("results: %s", unicode(results))
        return results
Example #4
    def translate_unit(self, unit_source, source_langs, target_langs):
        """return TM suggestions for unit_source"""
        if isinstance(unit_source, bytes):
            unit_source = unit_source.decode("utf-8")
        if isinstance(source_langs, list):
            source_langs = [data.normalize_code(lang) for lang in source_langs]
            source_langs = ','.join(source_langs)
        else:
            source_langs = data.normalize_code(source_langs)
        if isinstance(target_langs, list):
            target_langs = [data.normalize_code(lang) for lang in target_langs]
            target_langs = ','.join(target_langs)
        else:
            target_langs = data.normalize_code(target_langs)

        minlen = min_levenshtein_length(len(unit_source), self.min_similarity)
        maxlen = max_levenshtein_length(len(unit_source), self.min_similarity,
                                        self.max_length)

        # split source into words, remove punctuation and special
        # chars, keep words that are at least 3 chars long
        unit_words = STRIP_REGEXP.sub(' ', unit_source).split()
        unit_words = list(filter(lambda word: len(word) > 2, unit_words))

        if self.fulltext and len(unit_words) > 3:
            logging.debug("fulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid JOIN fulltext f ON s.sid = f.docid
                       WHERE s.lang IN (?) AND t.lang IN (?) AND s.length BETWEEN ? AND ?
                       AND fulltext MATCH ?"""
            search_str = " OR ".join(unit_words)
            self.cursor.execute(query, (source_langs, target_langs, minlen,
                                maxlen, search_str))
        else:
            logging.debug("nonfulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid
            WHERE s.lang IN (?) AND t.lang IN (?)
            AND s.length >= ? AND s.length <= ?"""
            self.cursor.execute(query, (source_langs, target_langs, minlen,
                                        maxlen))

        results = []
        for row in self.cursor:
            quality = self.comparer.similarity(unit_source, row[0],
                                               self.min_similarity)
            if quality >= self.min_similarity:
                results.append({
                    'source': row[0],
                    'target': row[1],
                    'context': row[2],
                    'quality': quality,
                })
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:self.max_candidates]
        logging.debug("results: %s", six.text_type(results))
        return results
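
Examples 3 and 4 are the Python 2 and Python 3 versions of the same method. The min_levenshtein_length and max_levenshtein_length helpers are not shown in either snippet; a plausible sketch of the length bounds they compute, derived from the usual Levenshtein similarity argument (an assumption, not the verified toolkit source):

import math

def min_levenshtein_length(length, min_similarity):
    # A candidate shorter than length * s/100 cannot reach s% similarity.
    return int(math.ceil(max(length * (min_similarity / 100.0), 2)))

def max_levenshtein_length(length, min_similarity, max_length):
    # A candidate longer than length / (s/100) cannot reach s% either;
    # max_length caps the query regardless.
    return int(math.floor(min(length / (min_similarity / 100.0), max_length)))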
Example #5
def get_language_supported(lang_code, supported):
    normalized = data.normalize_code(data.simplify_to_common(lang_code))
    if normalized in supported:
        return normalized

    # FIXME: horribly slow way of dealing with languages with @ in them
    for lang in supported.keys():
        if normalized == data.normalize_code(lang):
            return lang

    return None
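
The FIXME points at the linear scan over supported. One way to avoid it is a reverse index built once from the normalized keys (a sketch, assuming supported does not change between lookups):

from translate.lang import data

def build_normalized_index(supported):
    # Normalized code -> original key, computed once instead of per call.
    return {data.normalize_code(lang): lang for lang in supported}

def get_language_supported_fast(lang_code, index):
    normalized = data.normalize_code(data.simplify_to_common(lang_code))
    return index.get(normalized)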
Example #6
def get_alt_src_langs(request, profile, translation_project):
    language = translation_project.language
    project = translation_project.project
    source_language = project.source_language

    langs = profile.alt_src_langs.exclude(id__in=(language.id, source_language.id)).filter(
        translationproject__project=project
    )

    if not profile.alt_src_langs.count():
        from pootle_language.models import Language

        accept = request.META.get("HTTP_ACCEPT_LANGUAGE", "")
        for accept_lang, unused in parse_accept_lang_header(accept):
            if accept_lang == "*":
                continue
            normalized = to_locale(data.normalize_code(data.simplify_to_common(accept_lang)))
            code = to_locale(accept_lang)
            if normalized in ("en", "en_US", source_language.code, language.code) or code in (
                "en",
                "en_US",
                source_language.code,
                language.code,
            ):
                continue
            langs = Language.objects.filter(code__in=(normalized, code), translationproject__project=project)
            if langs.count():
                break
    return langs
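
For reference, Django's to_locale converts HTTP-style language codes into locale names, which is why the function checks both spellings:

from django.utils.translation import to_locale

print(to_locale("pt-br"))  # pt_BR
print(to_locale("en-us"))  # en_US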
Example #7
File: views.py Project: arky/pootle
def get_alt_src_langs(request, user, translation_project):
    if request.user.is_anonymous:
        return
    language = translation_project.language
    project = translation_project.project
    source_language = project.source_language
    langs = list(
        user.alt_src_langs.exclude(
            id__in=(language.id, source_language.id)
        ).filter(
            translationproject__project=project))
    if langs:
        return langs
    accept = request.META.get('HTTP_ACCEPT_LANGUAGE', '')
    for accept_lang, __ in parse_accept_lang_header(accept):
        if accept_lang == '*':
            continue
        normalized = to_locale(
            data.normalize_code(
                data.simplify_to_common(accept_lang)))
        code = to_locale(accept_lang)
        is_source_lang = any(
            langcode in ('en', 'en_US', source_language.code, language.code)
            for langcode in [code, normalized])
        if is_source_lang:
            continue

        langs = list(
            Language.objects.filter(
                code__in=(normalized, code),
                translationproject__project=project))
        if langs:
            return langs
Example #8
    def gettargetlanguage(self):
        """Get the target language for this .qph file.

        :return: ISO code e.g. af, fr, pt_BR
        :rtype: String
        """
        return data.normalize_code(self.header.get('language'))
Example #9
def get_alt_src_langs(request, user, translation_project):
    language = translation_project.language
    project = translation_project.project
    source_language = project.source_language

    langs = user.alt_src_langs.exclude(
        id__in=(language.id, source_language.id)).filter(
            translationproject__project=project)

    if not user.alt_src_langs.count():
        from pootle_language.models import Language
        accept = request.META.get('HTTP_ACCEPT_LANGUAGE', '')

        for accept_lang, __ in parse_accept_lang_header(accept):
            if accept_lang == '*':
                continue

            simplified = data.simplify_to_common(accept_lang)
            normalized = to_locale(data.normalize_code(simplified))
            code = to_locale(accept_lang)
            if (normalized in ('en', 'en_US', source_language.code,
                               language.code) or
                    code in ('en', 'en_US', source_language.code,
                             language.code)):
                continue

            langs = Language.objects.filter(
                code__in=(normalized, code),
                translationproject__project=project,
            )
            if langs.count():
                break

    return langs
Example #10
    def gettargetlanguage(self):
        """Get the target language for this .ts file.

        @return: ISO code e.g. af, fr, pt_BR
        @rtype: String
        """
        return data.normalize_code(self.header.get("language"))
Example #11
def get_alt_src_langs(request, user, translation_project):
    language = translation_project.language
    project = translation_project.project
    source_language = project.source_language

    langs = user.alt_src_langs.exclude(
        id__in=(language.id, source_language.id)
    ).filter(translationproject__project=project)

    if not user.alt_src_langs.count():
        from pootle_language.models import Language
        accept = request.META.get('HTTP_ACCEPT_LANGUAGE', '')

        for accept_lang, unused in parse_accept_lang_header(accept):
            if accept_lang == '*':
                continue

            simplified = data.simplify_to_common(accept_lang)
            normalized = to_locale(data.normalize_code(simplified))
            code = to_locale(accept_lang)
            if (normalized in ('en', 'en_US', source_language.code,
                               language.code) or
                    code in ('en', 'en_US', source_language.code,
                             language.code)):
                continue

            langs = Language.objects.filter(
                code__in=(normalized, code),
                translationproject__project=project,
            )
            if langs.count():
                break

    return langs
Example #12
def get_lang_from_http_header(request, supported):
    """If the user's browser sends a list of preferred languages in the
    HTTP_ACCEPT_LANGUAGE header, parse it into a list. Then walk through
    the list, and for each entry, we check whether we have a matching
    pootle translation project. If so, we return it.

    If nothing is found, return None."""
    accept = request.META.get('HTTP_ACCEPT_LANGUAGE', '')
    for accept_lang, unused in trans_real.parse_accept_lang_header(accept):
        if accept_lang == '*':
            return None
        normalized = data.normalize_code(data.simplify_to_common(accept_lang, supported))
        if normalized in ['en-us', 'en']:
            return None
        if normalized in supported:
            return normalized

        # FIXME: horribly slow way of dealing with languages with @ in them
        for lang in supported.keys():
            if normalized == data.normalize_code(lang):
                return lang
    return None
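
For reference, the Django parser used here yields (code, quality) pairs ordered by descending quality (returned as a tuple rather than a list on newer Django versions):

from django.utils.translation.trans_real import parse_accept_lang_header

print(parse_accept_lang_header("pt-br,pt;q=0.8,en;q=0.5"))
# (('pt-br', 1.0), ('pt', 0.8), ('en', 0.5))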
Example #13
    def add_dict(self, unit, source_lang, target_lang, commit=True):
        """inserts units represented as dictionaries in database"""
        source_lang = data.normalize_code(source_lang)
        target_lang = data.normalize_code(target_lang)
        try:
            try:
                self.cursor.execute("INSERT INTO sources (text, context, lang, length) VALUES(?, ?, ?, ?)",
                                    (unit["source"],
                                     unit["context"],
                                     source_lang,
                                     len(unit["source"])))
                sid = self.cursor.lastrowid
            except dbapi2.IntegrityError:
                # source string already exists in db, run query to find sid
                self.cursor.execute("SELECT sid FROM sources WHERE text=? AND context=? and lang=?",
                                    (unit["source"],
                                     unit["context"],
                                     source_lang))
                sid = self.cursor.fetchone()
                (sid,) = sid
            try:
                # FIXME: get time info from translation store
                # FIXME: do we need to store target length?
                self.cursor.execute("INSERT INTO targets (sid, text, lang, time) VALUES (?, ?, ?, ?)",
                                    (sid,
                                     unit["target"],
                                     target_lang,
                                     int(time.time())))
            except dbapi2.IntegrityError:
                # target string already exists in db, do nothing
                pass

            if commit:
                self.connection.commit()
        except Exception:
            if commit:
                self.connection.rollback()
            raise
Example #14
    def getsourcelanguage(self):
        """Get the source language for this .qph file.

        We don't implement setsourcelanguage as users really shouldn't be
        altering the source language in .qph files, it should be set correctly
        by the extraction tools.

        :return: ISO code e.g. af, fr, pt_BR
        :rtype: String
        """
        lang = data.normalize_code(self.header.get('sourcelanguage', "en"))
        if lang == 'en-us':
            return 'en'
        return lang
Example #15
def get_alt_src_langs(request, profile, language):
    langs = profile.alt_src_langs.exclude(id=language.id)
    if not langs.count():
        accept = request.META.get('HTTP_ACCEPT_LANGUAGE', '')
        codes = []
        for accept_lang, unused in parse_accept_lang_header(accept):
            if accept_lang == '*':
                continue
            normalized = to_locale(data.normalize_code(data.simplify_to_common(accept_lang)))
            if normalized in ['en_US', 'en', language.code]:
                continue
            codes.append(normalized)
        if codes:
            from pootle_language.models import Language
            langs = Language.objects.filter(code__in=codes)
    return langs
Example #16
    def getsourcelanguage(self):
        """Get the source language for this .ts file.

        The 'sourcelanguage' attribute was only added to the TS format in
        Qt v4.5. We return 'en' if there is no sourcelanguage set.

        We don't implement setsourcelanguage as users really shouldn't be
        altering the source language in .ts files, it should be set correctly
        by the extraction tools.

        :return: ISO code e.g. af, fr, pt_BR
        :rtype: String
        """
        lang = data.normalize_code(self.header.get('sourcelanguage', "en"))
        if lang == 'en-us':
            return 'en'
        return lang
Example #17
    def create_suggestions(self, suggestion):
        # Skip any suggestions where the suggested translation contains parentheses
        if re.match(r'\(.*\)', suggestion['text']):
            return []

        units = []

        for proj in suggestion['projects']:
            # Skip fuzzy matches:
            if proj['flags'] != 0:
                continue

            source = proj['orig_phrase'].strip()
            # Skip strings that are too short
            if len(source) < MIN_TERM_LENGTH:
                continue
            # Skip any units containing parentheses
            if re.match(r'\(.*\)', source):
                continue
            unit = TranslationUnit(source)

            target = suggestion['text'].strip()

            # Skip phrases already found:
            old_unit = self.store.findunit(proj['orig_phrase'])
            if old_unit and old_unit.target == target:
                continue
            # We mostly want to work with lowercase strings, but in German (and
            # some languages with a related writing style), this will probably
            # irritate more often than help, since nouns are always written to
            # start with capital letters.
            target_lang_code = self.main_controller.lang_controller.target_lang.code
            if data.normalize_code(target_lang_code) not in (
                    'de', 'de-de', 'lb', 'als', 'ksh', 'stq', 'vmf'):
                # unless the string contains multiple consecutive uppercase
                # characters or using some type of camel case, we take it to
                # lower case
                if not is_case_sensitive(target):
                    target = target.lower()
            unit.target = target
            units.append(unit)
        return units
Example #18
    def create_suggestions(self, suggestion):
        # Skip any suggestions where the suggested translation contains parentheses
        if re.match(r"\(.*\)", suggestion["text"]):
            return []

        units = []

        for proj in suggestion["projects"]:
            # Skip fuzzy matches:
            if proj["flags"] != 0:
                continue

            source = proj["orig_phrase"].strip()
            # Skip strings that are too short
            if len(source) < MIN_TERM_LENGTH:
                continue
            # Skip any units containing parentheses
            if re.match(r"\(.*\)", source):
                continue
            unit = TranslationUnit(source)

            target = suggestion["text"].strip()

            # Skip phrases already found:
            old_unit = self.store.findunit(proj["orig_phrase"])
            if old_unit and old_unit.target == target:
                continue
            # We mostly want to work with lowercase strings, but in German (and
            # some languages with a related writing style), this will probably
            # irritate more often than help, since nouns are always written to
            # start with capital letters.
            target_lang_code = self.main_controller.lang_controller.target_lang.code
            if data.normalize_code(target_lang_code) not in ("de", "de-de", "lb", "als", "ksh", "stq", "vmf"):
                # unless the string contains multiple consecutive uppercase
                # characters or using some type of camel case, we take it to
                # lower case
                if not is_case_sensitive(target):
                    target = target.lower()
            unit.target = target
            units.append(unit)
        return units
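
Note that the membership test in both variants relies on normalization, so locale-style spellings also match the hyphenated entries:

from translate.lang import data

print(data.normalize_code("de_DE") in ("de", "de-de", "lb"))  # True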
Example #19
def test_normalise_code():
    """test the normalisation of language codes"""
    assert data.normalize_code("af_ZA") == "af-za"
    assert data.normalize_code("xx@Latin") == "xx-latin"