Example #1
    def __init__(self,
                 db_file,
                 max_candidates=3,
                 min_similarity=75,
                 max_length=1000):

        self.max_candidates = max_candidates
        self.min_similarity = min_similarity
        self.max_length = max_length

        if not isinstance(db_file, unicode):
            db_file = unicode(db_file)  # don't know which encoding
        self.db_file = db_file
        # share connections to same database file between different instances
        if db_file not in self._tm_dbs:
            self._tm_dbs[db_file] = {}
        self._tm_db = self._tm_dbs[db_file]

        # FIXME: do we want to do any checks before we initialize the DB?
        self.init_database()
        self.fulltext = False
        self.init_fulltext()

        self.comparer = LevenshteinComparer(self.max_length)

        self.preload_db()
Example #2
    def _check_xliff_alttrans(self, unit):
        if not hasattr(unit, 'getalttrans'):
            return []
        alttrans = unit.getalttrans()
        if not alttrans:
            return []

        from translate.search.lshtein import LevenshteinComparer
        lcomparer = LevenshteinComparer(max_len=1000)

        results = []
        for alt in alttrans:
            tmsource = _('This file')

            xmlelement = getattr(alt, 'xmlelement', None)
            if xmlelement is not None:
                origin = alt.xmlelement.get('origin', '')
                if origin:
                    tmsource += "\n" + origin

            results.append({
                'source': alt.source,
                'target': alt.target,
                'quality': lcomparer.similarity(unit.source, alt.source, 0),
                'tmsource': tmsource,
            })
        return results
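
All of these snippets converge on the same call: construct a LevenshteinComparer with a maximum string length, then ask it for similarity(a, b, stoppercentage). The result is a 0 to 100 score, and the stop percentage lets the comparison bail out early once a candidate can no longer reach it. A minimal usage sketch, assuming the translate-toolkit package is installed (the strings and score are illustrative, not from the examples):

# Illustrative sketch; translate-toolkit provides translate.search.lshtein.
from translate.search.lshtein import LevenshteinComparer

comparer = LevenshteinComparer(max_len=1000)
# similarity() returns a percentage; the third argument is the stop
# percentage below which the comparison may be abandoned early.
quality = comparer.similarity("Open the file", "Open a file", 75)
print(quality)  # a high score for near-identical strings like these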
Example #3
    def _check_alttrans(self, unit):
        if not hasattr(unit, 'getalttrans'):
            return []
        alttrans = unit.getalttrans()
        if not alttrans:
            return []

        from translate.search.lshtein import LevenshteinComparer
        lcomparer = LevenshteinComparer(max_len=1000)

        results = []
        for alt in alttrans:
            quality = lcomparer.similarity(unit.source, alt.source,
                                           self.controller.min_quality - 15)
            # let's check if it is useful, but be more lenient
            if quality < self.controller.min_quality - 10:
                continue
            tmsource = _('This file')

            xmlelement = getattr(alt, 'xmlelement', None)
            if xmlelement is not None:
                origin = alt.xmlelement.get('origin', '')
                if origin:
                    if origin == "lmc":
                        # Experimental code to test lmc research. Everything
                        # in a try block, just in case.
                        try:
                            from lxml import etree
                            import os.path
                            extras = xmlelement.xpath(
                                'processing-instruction()')
                            meta = dict((pi.target, pi.text) for pi in extras)
                            tmsource = [
                                meta.get("contact-name", ""),
                                meta.get("category", ""),
                                os.path.splitext(meta.get("original", ""))[0]
                            ]
                            tmsource = u"\n".join(filter(None, tmsource))
                        except Exception, e:
                            import logging
                            logging.info(e)

                    tmsource += "\n" + origin

            results.append({
                'source': alt.source,
                'target': alt.target,
                'quality': quality,
                'tmsource': tmsource,
            })
        return results
Example #4
    def _check_alttrans(self, unit):
        if not hasattr(unit, 'getalttrans'):
            return []
        alttrans = unit.getalttrans()
        if not alttrans:
            return []

        from translate.search.lshtein import LevenshteinComparer
        lcomparer = LevenshteinComparer(max_len=1000)

        results = []
        for alt in alttrans:
            quality = lcomparer.similarity(unit.source, alt.source, self.controller.min_quality - 15)
            # let's check if it is useful, but be more lenient
            if quality < self.controller.min_quality - 10:
                continue
            tmsource = _('This file')

            xmlelement = getattr(alt, 'xmlelement', None)
            if xmlelement is not None:
                origin = alt.xmlelement.get('origin', '')
                if origin:
                    if origin == "lmc":
                        # Experimental code to test lmc research. Everything
                        # in a try block, just in case.
                        try:
                            from lxml import etree
                            import os.path
                            extras = xmlelement.xpath('processing-instruction()')
                            meta = dict((pi.target, pi.text) for pi in extras)
                            tmsource = [meta.get("contact-name", ""), meta.get("category", ""), os.path.splitext(meta.get("original", ""))[0]]
                            tmsource = u"\n".join(filter(None, tmsource))
                        except Exception, e:
                            import logging
                            logging.info(e)

                    tmsource += "\n" + origin

            results.append({
                'source': alt.source,
                'target': alt.target,
                'quality': quality,
                'tmsource': tmsource,
            })
        return results
Example #5
    def __init__(self, max_candidates=3, min_similarity=75, max_length=1000):
        gobject.GObject.__init__(self)
        HTTPClient.__init__(self)

        self.max_candidates = max_candidates
        self.min_similarity = min_similarity
        self.comparer = LevenshteinComparer(max_length)
        self.last_suggestions = []  # used by the open-tran terminology backend

        self._languages = set()

        self.source_lang = None
        self.target_lang = None
        # detect supported languages

        self.url_getlanguages = 'http://open-tran.eu/json/supported'
        self.url_translate = 'http://%s.%s.open-tran.eu/json/suggest'
        langreq = RESTRequest(self.url_getlanguages, id='')
        self.add(langreq)
        langreq.connect('http-success',
                        lambda langreq, response: self.got_languages(response))
Example #6
    def __init__(self, db_file, max_candidates=3, min_similarity=75, max_length=1000):

        self.max_candidates = max_candidates
        self.min_similarity = min_similarity
        self.max_length = max_length

        if not isinstance(db_file, six.text_type):
            db_file = six.text_type(db_file)  # don't know which encoding
        self.db_file = db_file
        # share connections to same database file between different instances
        if db_file not in self._tm_dbs:
            self._tm_dbs[db_file] = {}
        self._tm_db = self._tm_dbs[db_file]

        # FIXME: do we want to do any checks before we initialize the DB?
        self.init_database()
        self.fulltext = False
        self.init_fulltext()

        self.comparer = LevenshteinComparer(self.max_length)

        self.preload_db()
Example #7
def get_tm_results(request, unit):
    """Gets a list of TM results for the current object.

    :return: JSON string with a list of TM results.
    """

    max_len = settings.LV_MAX_LENGTH
    min_similarity = settings.LV_MIN_SIMILARITY

    results = []

    # Short-circuit the Levenshtein comparer: the distance, by definition,
    # can't be less than the difference in string lengths
    diff_len = unit.source_length * (100 - min_similarity)/100
    max_unit_len = unit.source_length + diff_len
    min_unit_len = unit.source_length - diff_len

    criteria = {
        'target_lang': unit.store.translation_project.language,
        'source_lang': unit.store.translation_project.project.source_language,
        'source_length__range': (min_unit_len, max_unit_len),
    }
    tmunits = TMUnit.objects.filter(**criteria).exclude(unit=unit)

    comparer = LevenshteinComparer(max_len)
    for tmunit in tmunits:
        quality = comparer.similarity(tmunit.source, unit.source,
                                      min_similarity)
        if quality >= min_similarity:
            project = tmunit.project
            profile = tmunit.submitted_by
            result = {
                'source': tmunit.source,
                'target': tmunit.target,
                'quality': quality,
                'project': {
                    'project': project.code,
                    'projectname': project.fullname,
                    'absolute_url': project.get_absolute_url(),
                    'icon': _get_project_icon(project),
                }
            }

            if profile is not None:
                submissions = Submission.objects.filter(
                                submitter=profile,
                                type=SubmissionTypes.NORMAL,
                                ).distinct().count()
                suggestions = SuggestionStat.objects.filter(
                                suggester=profile,
                                ).distinct().count()
                translations = submissions - suggestions  # XXX: is this correct?
                title = _("By %s on %s<br/><br/>%s translations<br/>%s suggestions" % (
                            profile.user.get_full_name(),
                            tmunit.submitted_on,
                            translations, suggestions))

                result['translator'] = {
                    'username': unicode(profile.user),
                    'title': title,
                    'absolute_url': profile.get_absolute_url(),
                    'gravatar': profile.gravatar_url(24),
                }

            results.append(result)

    return HttpResponse(jsonify(results), mimetype="application/json")
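
The length filter above works because the Levenshtein distance between two strings can never be smaller than the difference of their lengths, so candidates outside a length window can be skipped before any comparison runs. A worked example with illustrative numbers:

# Worked example of the length shortcut (values are illustrative).
source_length = 40
min_similarity = 75
diff_len = source_length * (100 - min_similarity) // 100  # 10
# only candidates of 30 to 50 characters can still score >= 75%
print(source_length - diff_len, source_length + diff_len)  # 30 50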
Example #8
class TMDB(object):
    _tm_dbs = {}

    def __init__(self,
                 db_file,
                 max_candidates=3,
                 min_similarity=75,
                 max_length=1000):

        self.max_candidates = max_candidates
        self.min_similarity = min_similarity
        self.max_length = max_length

        if not isinstance(db_file, unicode):
            db_file = unicode(db_file)  # don't know which encoding
        self.db_file = db_file
        # share connections to same database file between different instances
        if db_file not in self._tm_dbs:
            self._tm_dbs[db_file] = {}
        self._tm_db = self._tm_dbs[db_file]

        # FIXME: do we want to do any checks before we initialize the DB?
        self.init_database()
        self.fulltext = False
        self.init_fulltext()

        self.comparer = LevenshteinComparer(self.max_length)

        self.preload_db()

    def _get_connection(self, index):
        current_thread = threading.currentThread()
        if current_thread not in self._tm_db:
            connection = dbapi2.connect(self.db_file.encode('utf-8'))
            cursor = connection.cursor()
            self._tm_db[current_thread] = (connection, cursor)
        return self._tm_db[current_thread][index]

    connection = property(lambda self: self._get_connection(0))
    cursor = property(lambda self: self._get_connection(1))

    def init_database(self):
        """creates database tables and indices"""

        script = """
CREATE TABLE IF NOT EXISTS sources (
       sid INTEGER PRIMARY KEY AUTOINCREMENT,
       text VARCHAR NOT NULL,
       context VARCHAR DEFAULT NULL,
       lang VARCHAR NOT NULL,
       length INTEGER NOT NULL
);
CREATE INDEX IF NOT EXISTS sources_context_idx ON sources (context);
CREATE INDEX IF NOT EXISTS sources_lang_idx ON sources (lang);
CREATE INDEX IF NOT EXISTS sources_length_idx ON sources (length);
CREATE UNIQUE INDEX IF NOT EXISTS sources_uniq_idx ON sources (text, context, lang);

CREATE TABLE IF NOT EXISTS targets (
       tid INTEGER PRIMARY KEY AUTOINCREMENT,
       sid INTEGER NOT NULL,
       text VARCHAR NOT NULL,
       lang VARCHAR NOT NULL,
       time INTEGER DEFAULT NULL,
       FOREIGN KEY (sid) references sources(sid)
);
CREATE INDEX IF NOT EXISTS targets_sid_idx ON targets (sid);
CREATE INDEX IF NOT EXISTS targets_lang_idx ON targets (lang);
CREATE INDEX IF NOT EXISTS targets_time_idx ON targets (time);
CREATE UNIQUE INDEX IF NOT EXISTS targets_uniq_idx ON targets (sid, text, lang);
"""

        try:
            self.cursor.executescript(script)
            self.connection.commit()
        except Exception:
            self.connection.rollback()
            raise

    def init_fulltext(self):
        """detects if fts3 fulltext indexing module exists, initializes fulltext table if it does"""

        # HACKISH: no better way to detect fts3 support except trying to
        # construct a dummy table?!
        try:
            script = """
DROP TABLE IF EXISTS test_for_fts3;
CREATE VIRTUAL TABLE test_for_fts3 USING fts3;
DROP TABLE test_for_fts3;
"""
            self.cursor.executescript(script)
            logging.debug("fts3 supported")
            # for some reason CREATE VIRTUAL TABLE doesn't support IF NOT
            # EXISTS syntax; check whether the fulltext table exists manually
            self.cursor.execute(
                "SELECT name FROM sqlite_master WHERE name = 'fulltext'")
            if not self.cursor.fetchone():
                # create fulltext index table, and index all strings in sources
                script = """
CREATE VIRTUAL TABLE fulltext USING fts3(text);
"""
                logging.debug("fulltext table not exists, creating")
                self.cursor.executescript(script)
                logging.debug("created fulltext table")
            else:
                logging.debug("fulltext table already exists")

            # create triggers that would sync sources table with fulltext index
            script = """
INSERT INTO fulltext (rowid, text) SELECT sid, text FROM sources WHERE sid NOT IN (SELECT rowid FROM fulltext);
CREATE TRIGGER IF NOT EXISTS sources_insert_trig AFTER INSERT ON sources FOR EACH ROW
BEGIN
    INSERT INTO fulltext (docid, text) VALUES (NEW.sid, NEW.text);
END;
CREATE TRIGGER IF NOT EXISTS sources_update_trig AFTER UPDATE OF text ON sources FOR EACH ROW
BEGIN
    UPDATE fulltext SET text = NEW.text WHERE docid = NEW.sid;
END;
CREATE TRIGGER IF NOT EXISTS sources_delete_trig AFTER DELETE ON sources FOR EACH ROW
BEGIN
    DELETE FROM fulltext WHERE docid = OLD.sid;
END;
"""
            self.cursor.executescript(script)
            self.connection.commit()
            logging.debug("created fulltext triggers")
            self.fulltext = True

        except dbapi2.OperationalError as e:
            self.fulltext = False
            logging.debug("failed to initialize fts3 support: " + str(e))
            script = """
DROP TRIGGER IF EXISTS sources_insert_trig;
DROP TRIGGER IF EXISTS sources_update_trig;
DROP TRIGGER IF EXISTS sources_delete_trig;
"""
            self.cursor.executescript(script)

    def preload_db(self):
        """ugly hack to force caching of sqlite db file in memory for
        improved performance"""
        if self.fulltext:
            query = """SELECT COUNT(*) FROM sources s JOIN fulltext f ON s.sid = f.docid JOIN targets t on s.sid = t.sid"""
        else:
            query = """SELECT COUNT(*) FROM sources s JOIN targets t on s.sid = t.sid"""
        self.cursor.execute(query)
        (numrows, ) = self.cursor.fetchone()
        logging.debug("tmdb has %d records" % numrows)
        return numrows

    def add_unit(self, unit, source_lang=None, target_lang=None, commit=True):
        """inserts unit in the database"""
        # TODO: is that really the best way to handle unspecified
        # source and target languages? what about conflicts between
        # unit attributes and passed arguments
        if unit.getsourcelanguage():
            source_lang = unit.getsourcelanguage()
        if unit.gettargetlanguage():
            target_lang = unit.gettargetlanguage()

        if not source_lang:
            raise LanguageError("undefined source language")
        if not target_lang:
            raise LanguageError("undefined target language")

        unitdict = {
            "source": unit.source,
            "target": unit.target,
            "context": unit.getcontext(),
        }
        self.add_dict(unitdict, source_lang, target_lang, commit)

    def add_dict(self, unit, source_lang, target_lang, commit=True):
        """inserts units represented as dictionaries in database"""
        source_lang = data.normalize_code(source_lang)
        target_lang = data.normalize_code(target_lang)
        try:
            try:
                self.cursor.execute(
                    "INSERT INTO sources (text, context, lang, length) VALUES(?, ?, ?, ?)",
                    (unit["source"], unit["context"], source_lang,
                     len(unit["source"])))
                sid = self.cursor.lastrowid
            except dbapi2.IntegrityError:
                # source string already exists in db, run query to find sid
                self.cursor.execute(
                    "SELECT sid FROM sources WHERE text=? AND context=? and lang=?",
                    (unit["source"], unit["context"], source_lang))
                sid = self.cursor.fetchone()
                (sid, ) = sid
            try:
                # FIXME: get time info from translation store
                # FIXME: do we need to store target length?
                self.cursor.execute(
                    "INSERT INTO targets (sid, text, lang, time) VALUES (?, ?, ?, ?)",
                    (sid, unit["target"], target_lang, int(time.time())))
            except dbapi2.IntegrityError:
                # target string already exists in db, do nothing
                pass

            if commit:
                self.connection.commit()
        except Exception:
            if commit:
                self.connection.rollback()
            raise

    def add_store(self, store, source_lang, target_lang, commit=True):
        """insert all units in store in database"""
        count = 0
        for unit in store.units:
            if unit.istranslatable() and unit.istranslated():
                self.add_unit(unit, source_lang, target_lang, commit=False)
                count += 1
        if commit:
            self.connection.commit()
        return count

    def add_list(self, units, source_lang, target_lang, commit=True):
        """insert all units in list into the database, units are
        represented as dictionaries"""
        count = 0
        for unit in units:
            self.add_dict(unit, source_lang, target_lang, commit=False)
            count += 1
        if commit:
            self.connection.commit()
        return count

    def translate_unit(self, unit_source, source_langs, target_langs):
        """return TM suggestions for unit_source"""
        if isinstance(unit_source, str):
            unit_source = unicode(unit_source, "utf-8")
        if isinstance(source_langs, list):
            source_langs = [data.normalize_code(lang) for lang in source_langs]
            source_langs = ','.join(source_langs)
        else:
            source_langs = data.normalize_code(source_langs)
        if isinstance(target_langs, list):
            target_langs = [data.normalize_code(lang) for lang in target_langs]
            target_langs = ','.join(target_langs)
        else:
            target_langs = data.normalize_code(target_langs)

        minlen = min_levenshtein_length(len(unit_source), self.min_similarity)
        maxlen = max_levenshtein_length(len(unit_source), self.min_similarity,
                                        self.max_length)

        # split source into words, remove punctuation and special
        # chars, keep words that are at least 3 chars long
        unit_words = STRIP_REGEXP.sub(' ', unit_source).split()
        unit_words = filter(lambda word: len(word) > 2, unit_words)

        if self.fulltext and len(unit_words) > 3:
            logging.debug("fulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid JOIN fulltext f ON s.sid = f.docid
                       WHERE s.lang IN (?) AND t.lang IN (?) AND s.length BETWEEN ? AND ?
                       AND fulltext MATCH ?"""
            search_str = " OR ".join(unit_words)
            self.cursor.execute(
                query,
                (source_langs, target_langs, minlen, maxlen, search_str))
        else:
            logging.debug("nonfulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid
            WHERE s.lang IN (?) AND t.lang IN (?)
            AND s.length >= ? AND s.length <= ?"""
            self.cursor.execute(query,
                                (source_langs, target_langs, minlen, maxlen))

        results = []
        for row in self.cursor:
            quality = self.comparer.similarity(unit_source, row[0],
                                               self.min_similarity)
            if quality >= self.min_similarity:
                results.append({
                    'source': row[0],
                    'target': row[1],
                    'context': row[2],
                    'quality': quality,
                })
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:self.max_candidates]
        logging.debug("results: %s", unicode(results))
        return results
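
Putting the class together, end-to-end usage would look roughly like the sketch below. The import path translate.storage.tmdb and the sample strings are assumptions, not taken from the example:

# Hypothetical usage sketch for the TMDB class above; the import path
# and the sample data are assumptions.
from translate.storage.tmdb import TMDB

tmdb = TMDB("/tmp/tm.db", max_candidates=3, min_similarity=75)
# add_dict() expects the same keys that add_unit() builds:
# source, target and context
tmdb.add_dict({"source": "Open file", "target": "Ouvrir le fichier",
               "context": ""}, "en", "fr")
for match in tmdb.translate_unit("Open the file", "en", "fr"):
    # each match carries 'source', 'target', 'context' and 'quality'
    print(match["quality"], match["target"])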
Example #9
 def similarity(self, a, b, stoppercentage=50):
     return int(LevenshteinComparer.similarity(self, a, b, stoppercentage))
Example #10
 def __init__(self, max_len=10000):
     LevenshteinComparer.__init__(self, max_len)
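
Examples #9 and #10 read like two methods of a single subclass that pins the similarity score to an integer. Assembled, it would look like the sketch below (the class name is an assumption):

from translate.search.lshtein import LevenshteinComparer

class IntSimilarityComparer(LevenshteinComparer):
    def __init__(self, max_len=10000):
        LevenshteinComparer.__init__(self, max_len)

    def similarity(self, a, b, stoppercentage=50):
        # truncate the base class's score to an integer percentage
        return int(LevenshteinComparer.similarity(self, a, b,
                                                  stoppercentage))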
Example #11
 def comparer(self):
     if not hasattr(self, '_comparer'):
         max_length = current_app.config.get('MAX_LENGTH', 1000)
         self._comparer = LevenshteinComparer(max_length)
     return self._comparer
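
Example #11 caches the comparer lazily behind a hasattr check so the Flask config is only read on first access. On Python 3.8+ the same pattern can be written with functools.cached_property; a sketch, not taken from the original code:

from functools import cached_property

from translate.search.lshtein import LevenshteinComparer

class TMBackend:
    # hypothetical host class; only the caching pattern matters here
    @cached_property
    def comparer(self):
        # computed on first access, then stored on the instance
        return LevenshteinComparer(1000)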
Example #12
class TMDB(object):
    _tm_dbs = {}

    def __init__(self, db_file, max_candidates=3, min_similarity=75,
                 max_length=1000):

        self.max_candidates = max_candidates
        self.min_similarity = min_similarity
        self.max_length = max_length

        if not isinstance(db_file, six.text_type):
            db_file = six.text_type(db_file)  # don't know which encoding
        self.db_file = db_file
        # share connections to same database file between different instances
        if db_file not in self._tm_dbs:
            self._tm_dbs[db_file] = {}
        self._tm_db = self._tm_dbs[db_file]

        # FIXME: do we want to do any checks before we initialize the DB?
        self.init_database()
        self.fulltext = False
        self.init_fulltext()

        self.comparer = LevenshteinComparer(self.max_length)

        self.preload_db()

    def _get_connection(self, index):
        current_thread = threading.currentThread()
        if current_thread not in self._tm_db:
            connection = dbapi2.connect(self.db_file.encode('utf-8') if six.PY2 else self.db_file)
            cursor = connection.cursor()
            self._tm_db[current_thread] = (connection, cursor)
        return self._tm_db[current_thread][index]

    connection = property(lambda self: self._get_connection(0))
    cursor = property(lambda self: self._get_connection(1))

    def init_database(self):
        """creates database tables and indices"""

        script = """
CREATE TABLE IF NOT EXISTS sources (
       sid INTEGER PRIMARY KEY AUTOINCREMENT,
       text VARCHAR NOT NULL,
       context VARCHAR DEFAULT NULL,
       lang VARCHAR NOT NULL,
       length INTEGER NOT NULL
);
CREATE INDEX IF NOT EXISTS sources_context_idx ON sources (context);
CREATE INDEX IF NOT EXISTS sources_lang_idx ON sources (lang);
CREATE INDEX IF NOT EXISTS sources_length_idx ON sources (length);
CREATE UNIQUE INDEX IF NOT EXISTS sources_uniq_idx ON sources (text, context, lang);

CREATE TABLE IF NOT EXISTS targets (
       tid INTEGER PRIMARY KEY AUTOINCREMENT,
       sid INTEGER NOT NULL,
       text VARCHAR NOT NULL,
       lang VARCHAR NOT NULL,
       time INTEGER DEFAULT NULL,
       FOREIGN KEY (sid) references sources(sid)
);
CREATE INDEX IF NOT EXISTS targets_sid_idx ON targets (sid);
CREATE INDEX IF NOT EXISTS targets_lang_idx ON targets (lang);
CREATE INDEX IF NOT EXISTS targets_time_idx ON targets (time);
CREATE UNIQUE INDEX IF NOT EXISTS targets_uniq_idx ON targets (sid, text, lang);
"""

        try:
            self.cursor.executescript(script)
            self.connection.commit()
        except Exception:
            self.connection.rollback()
            raise

    def init_fulltext(self):
        """detects if fts3 fulltext indexing module exists, initializes fulltext table if it does"""

        # HACKISH: no better way to detect fts3 support except trying to
        # construct a dummy table?!
        try:
            script = """
DROP TABLE IF EXISTS test_for_fts3;
CREATE VIRTUAL TABLE test_for_fts3 USING fts3;
DROP TABLE test_for_fts3;
"""
            self.cursor.executescript(script)
            logging.debug("fts3 supported")
            # for some reason CREATE VIRTUAL TABLE doesn't support IF NOT
            # EXISTS syntax; check whether the fulltext table exists manually
            self.cursor.execute("SELECT name FROM sqlite_master WHERE name = 'fulltext'")
            if not self.cursor.fetchone():
                # create fulltext index table, and index all strings in sources
                script = """
CREATE VIRTUAL TABLE fulltext USING fts3(text);
"""
                logging.debug("fulltext table not exists, creating")
                self.cursor.executescript(script)
                logging.debug("created fulltext table")
            else:
                logging.debug("fulltext table already exists")

            # create triggers that would sync sources table with fulltext index
            script = """
INSERT INTO fulltext (rowid, text) SELECT sid, text FROM sources WHERE sid NOT IN (SELECT rowid FROM fulltext);
CREATE TRIGGER IF NOT EXISTS sources_insert_trig AFTER INSERT ON sources FOR EACH ROW
BEGIN
    INSERT INTO fulltext (docid, text) VALUES (NEW.sid, NEW.text);
END;
CREATE TRIGGER IF NOT EXISTS sources_update_trig AFTER UPDATE OF text ON sources FOR EACH ROW
BEGIN
    UPDATE fulltext SET text = NEW.text WHERE docid = NEW.sid;
END;
CREATE TRIGGER IF NOT EXISTS sources_delete_trig AFTER DELETE ON sources FOR EACH ROW
BEGIN
    DELETE FROM fulltext WHERE docid = OLD.sid;
END;
"""
            self.cursor.executescript(script)
            self.connection.commit()
            logging.debug("created fulltext triggers")
            self.fulltext = True

        except dbapi2.OperationalError as e:
            self.fulltext = False
            logging.debug("failed to initialize fts3 support: " + str(e))
            script = """
DROP TRIGGER IF EXISTS sources_insert_trig;
DROP TRIGGER IF EXISTS sources_update_trig;
DROP TRIGGER IF EXISTS sources_delete_trig;
"""
            self.cursor.executescript(script)

    def preload_db(self):
        """ugly hack to force caching of sqlite db file in memory for improved
        performance
        """
        if self.fulltext:
            query = """SELECT COUNT(*) FROM sources s JOIN fulltext f ON s.sid = f.docid JOIN targets t on s.sid = t.sid"""
        else:
            query = """SELECT COUNT(*) FROM sources s JOIN targets t on s.sid = t.sid"""
        self.cursor.execute(query)
        (numrows,) = self.cursor.fetchone()
        logging.debug("tmdb has %d records" % numrows)
        return numrows

    def add_unit(self, unit, source_lang=None, target_lang=None, commit=True):
        """inserts unit in the database"""
        # TODO: is that really the best way to handle unspecified
        # source and target languages? what about conflicts between
        # unit attributes and passed arguments
        if unit.getsourcelanguage():
            source_lang = unit.getsourcelanguage()
        if unit.gettargetlanguage():
            target_lang = unit.gettargetlanguage()

        if not source_lang:
            raise LanguageError("undefined source language")
        if not target_lang:
            raise LanguageError("undefined target language")

        unitdict = {
            "source": unit.source,
            "target": unit.target,
            "context": unit.getcontext(),
        }
        self.add_dict(unitdict, source_lang, target_lang, commit)

    def add_dict(self, unit, source_lang, target_lang, commit=True):
        """inserts units represented as dictionaries in database"""
        source_lang = data.normalize_code(source_lang)
        target_lang = data.normalize_code(target_lang)
        try:
            try:
                self.cursor.execute("INSERT INTO sources (text, context, lang, length) VALUES(?, ?, ?, ?)",
                                    (unit["source"],
                                     unit["context"],
                                     source_lang,
                                     len(unit["source"])))
                sid = self.cursor.lastrowid
            except dbapi2.IntegrityError:
                # source string already exists in db, run query to find sid
                self.cursor.execute("SELECT sid FROM sources WHERE text=? AND context=? and lang=?",
                                    (unit["source"],
                                     unit["context"],
                                     source_lang))
                sid = self.cursor.fetchone()
                (sid,) = sid
            try:
                # FIXME: get time info from translation store
                # FIXME: do we need to store target length?
                self.cursor.execute("INSERT INTO targets (sid, text, lang, time) VALUES (?, ?, ?, ?)",
                                    (sid,
                                     unit["target"],
                                     target_lang,
                                     int(time.time())))
            except dbapi2.IntegrityError:
                # target string already exists in db, do nothing
                pass

            if commit:
                self.connection.commit()
        except Exception:
            if commit:
                self.connection.rollback()
            raise

    def add_store(self, store, source_lang, target_lang, commit=True):
        """insert all units in store in database"""
        count = 0
        for unit in store.units:
            if unit.istranslatable() and unit.istranslated():
                self.add_unit(unit, source_lang, target_lang, commit=False)
                count += 1
        if commit:
            self.connection.commit()
        return count

    def add_list(self, units, source_lang, target_lang, commit=True):
        """insert all units in list into the database, units are represented as
        dictionaries
        """
        count = 0
        for unit in units:
            self.add_dict(unit, source_lang, target_lang, commit=False)
            count += 1
        if commit:
            self.connection.commit()
        return count

    def translate_unit(self, unit_source, source_langs, target_langs):
        """return TM suggestions for unit_source"""
        if isinstance(unit_source, bytes):
            unit_source = unit_source.decode("utf-8")
        if isinstance(source_langs, list):
            source_langs = [data.normalize_code(lang) for lang in source_langs]
            source_langs = ','.join(source_langs)
        else:
            source_langs = data.normalize_code(source_langs)
        if isinstance(target_langs, list):
            target_langs = [data.normalize_code(lang) for lang in target_langs]
            target_langs = ','.join(target_langs)
        else:
            target_langs = data.normalize_code(target_langs)

        minlen = min_levenshtein_length(len(unit_source), self.min_similarity)
        maxlen = max_levenshtein_length(len(unit_source), self.min_similarity,
                                        self.max_length)

        # split source into words, remove punctuation and special
        # chars, keep words that are at least 3 chars long
        unit_words = STRIP_REGEXP.sub(' ', unit_source).split()
        unit_words = list(filter(lambda word: len(word) > 2, unit_words))

        if self.fulltext and len(unit_words) > 3:
            logging.debug("fulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid JOIN fulltext f ON s.sid = f.docid
                       WHERE s.lang IN (?) AND t.lang IN (?) AND s.length BETWEEN ? AND ?
                       AND fulltext MATCH ?"""
            search_str = " OR ".join(unit_words)
            self.cursor.execute(query, (source_langs, target_langs, minlen,
                                maxlen, search_str))
        else:
            logging.debug("nonfulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid
            WHERE s.lang IN (?) AND t.lang IN (?)
            AND s.length >= ? AND s.length <= ?"""
            self.cursor.execute(query, (source_langs, target_langs, minlen,
                                        maxlen))

        results = []
        for row in self.cursor:
            quality = self.comparer.similarity(unit_source, row[0],
                                               self.min_similarity)
            if quality >= self.min_similarity:
                results.append({
                    'source': row[0],
                    'target': row[1],
                    'context': row[2],
                    'quality': quality,
                })
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:self.max_candidates]
        logging.debug("results: %s", six.text_type(results))
        return results
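
Example #12 is the same TMDB class ported to run on both Python 2 and 3: unicode checks go through six.text_type, incoming bytes are decoded explicitly, and the sqlite path is only encoded under Python 2. A tiny sketch of the text normalization it relies on, assuming six is installed:

import six

def to_text(value, encoding="utf-8"):
    # decode bytes explicitly rather than passing them through str(),
    # which on Python 3 would produce a repr like "b'tm.db'"
    if isinstance(value, bytes):
        return value.decode(encoding)
    return six.text_type(value)

print(to_text(b"tm.db") == to_text(u"tm.db"))  # True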
Example #13
def get_tm_results(request, unit):
    """Gets a list of TM results for the current object.

    :return: JSON string with a list of TM results.
    """

    max_len = settings.LV_MAX_LENGTH
    min_similarity = settings.LV_MIN_SIMILARITY

    results = []

    # Short-circuit the Levenshtein comparer: the distance, by definition,
    # can't be less than the difference in string lengths
    diff_len = unit.source_length * (100 - min_similarity)/100
    max_unit_len = unit.source_length + diff_len
    min_unit_len = unit.source_length - diff_len

    criteria = {
        'target_lang': unit.store.translation_project.language,
        'source_lang': unit.store.translation_project.project.source_language,
        'source_length__range': (min_unit_len, max_unit_len),
    }
    tmunits = TMUnit.objects.filter(**criteria).exclude(unit=unit)

    comparer = LevenshteinComparer(max_len)
    for tmunit in tmunits:
        quality = comparer.similarity(tmunit.source, unit.source,
                                      min_similarity)
        if quality >= min_similarity:
            project = tmunit.project
            profile = tmunit.submitted_by
            result = {
                'source': tmunit.source,
                'target': tmunit.target,
                'quality': quality,
                'project': {
                    'project': project.code,
                    'projectname': project.fullname,
                    'absolute_url': project.get_absolute_url(),
                    'icon': _get_project_icon(project),
                }
            }

            if profile is not None:
                submissions = Submission.objects.filter(
                                submitter=profile,
                                type=SubmissionTypes.NORMAL,
                                ).distinct().count()
                suggestions = SuggestionStat.objects.filter(
                                suggester=profile,
                                ).distinct().count()
                translations = submissions - suggestions  # XXX: is this correct?
                title = _("By %s on %s<br/><br/>%s translations<br/>%s suggestions" % (
                            profile.user.get_full_name(),
                            tmunit.submitted_on,
                            translations, suggestions))

                result['translator'] = {
                    'username': unicode(profile.user),
                    'title': title,
                    'absolute_url': profile.get_absolute_url(),
                    'gravatar': profile.gravatar_url(24),
                }

            results.append(result)

    return HttpResponse(jsonify(results), mimetype="application/json")
Example #14
 def similarity(self, a, b, stoppercentage=50):
     return int(
         LevenshteinComparer.similarity(self, a, b, stoppercentage)
     )
Example #15
 def __init__(self, max_len=10000):
     LevenshteinComparer.__init__(self, max_len)