Esempio n. 1
0
def loadDictionary(langKey, langVal, minLen=0):
    """Build a `gizmo.Dictionary` translating words of `langKey` to `langVal`.

    The dictionary asset for the pair (langKey, langVal) is used when it has
    a local version; otherwise the asset for the reversed pair is tried and
    its entries are added with key and value swapped.

    Parameters
    ----------
    langKey, langVal
        Language codes, looked up with `languages.get(code3=...)`.
    minLen
        Minimum word length, used for a language whose `ngrams` setting is
        falsy; otherwise the `ngrams` value itself is the minimum.

    Raises
    ------
    Error
        When the asset finally selected has no local version.
    """
    keyInfo = languages.get(code3=langKey)
    valInfo = languages.get(code3=langVal)

    keyMinLen = keyInfo.ngrams or minLen
    valMinLen = valInfo.ngrams or minLen

    dictionary = gizmo.Dictionary()

    def addEntry(key, val):
        # Drop the pair when either word is shorter than its minimum.
        if len(key) < keyMinLen or len(val) < valMinLen:
            return

        # Words of right-to-left languages are stored reversed.
        if keyInfo.rightToLeft:
            key = key[::-1]
        if valInfo.rightToLeft:
            val = val[::-1]

        # Every key fragment is paired with every value fragment; only the
        # key side is lower-cased.
        for keyPart in splitNgrams(key, keyInfo.ngrams):
            for valPart in splitNgrams(val, valInfo.ngrams):
                dictionary.add(keyPart.lower(), valPart)

    asset = assets.getAsset('dict', (langKey, langVal))
    swapped = not asset.localVersion()
    if swapped:
        # No direct dictionary available, fall back to the reversed pair.
        asset = assets.getAsset('dict', (langVal, langKey))

    if not swapped or asset.localVersion():
        for first, second in asset.readDictionary():
            if swapped:
                addEntry(second, first)
            else:
                addEntry(first, second)

    if not asset.localVersion():
        message = _('There is no dictionary for transaltion from {} to {}') \
                .format(langKey, langVal)
        raise Error(message) \
                .add('language1', langKey) \
                .add('language2', langVal)

    logger.info('dictionary ready with %u entries', dictionary.size())
    return dictionary
Esempio n. 2
0
def getLangFromPath(path):
    """Extract a language code from the end of a file name.

    The text after the last dot is dropped, then the run of alphabetic
    characters ending the remaining name is measured. Only a run of exactly
    2 or 3 letters is treated as a language code.

    Returns
    -------
    str
        The `code3` attribute of `languages.get()` for the lower-cased
        letters, or `None` when the name does not end with a 2- or 3-letter
        run.

    Examples
    --------
    - `subtitles.eng.srt` - eng
    - `subtitles-fr.srt` - fre
    """

    stem = path.rsplit('.', 1)[0]

    # Walk backwards to the first non-letter character.
    start = len(stem)
    while start > 0 and stem[start - 1].isalpha():
        start -= 1

    candidate = stem[start:]
    if len(candidate) in (2, 3):
        return languages.get(candidate.lower()).code3
    return None
Esempio n. 3
0
    def format(self, pattern, sub, ref):
        """Format an output path from `pattern` for the given sub/ref pair.

        Returns `None` when any argument is `None`. The last result is kept
        in `self.cache` as `(key, pattern, path)`: an identical key and
        pattern returns the cached path, and an identical key with a new
        pattern reuses the field dictionary `self.d` built earlier.
        """
        if any(arg is None for arg in (pattern, sub, ref)):
            return None

        cacheKey = (sub.path, sub.no, sub.lang, ref.path, ref.no, ref.lang)
        sameStreams = self.cache[0] == cacheKey

        if sameStreams and self.cache[1] == pattern:
            return self.cache[2]

        if not sameStreams:
            # Streams changed: rebuild the substitution fields from scratch.
            fields = self.d = {}
            for prefix, item in (('sub_', sub), ('ref_', ref)):
                fields[prefix + 'path'] = item.path
                # Stream number is exposed one-based.
                fields[prefix + 'no'] = str(item.no + 1)
                fields[prefix + 'lang'] = item.lang or ''
                langInfo = languages.get(code3=item.lang)
                fields[prefix + 'lang2'] = langInfo.code2 or ''
                fileName = os.path.basename(item.path)
                fields[prefix + 'name'] = os.path.splitext(fileName)[0]
                fields[prefix + 'dir'] = os.path.dirname(item.path)

        path = _formatPattern(pattern, self.d)
        self.cache = (cacheKey, pattern, path)
        return path
Esempio n. 4
0
    def __init__(self, stream, runCb=None):
        ''' Subtitle words extraction pipeline:

        Demux --> SubtitleDec  --[words]--> {NgramSplitter} --[words]--> ...

        The decoder is configured from the stream language (right-to-left
        text, n-gram splitting) and from the stream encoding when one is set.
        `self.sink` ends up being the n-gram splitter when one is created,
        otherwise the decoder itself.
        '''

        super().__init__(stream, runCb)
        self.dec = gizmo.SubtitleDec()
        self.dec.setMinWordLen(settings().minWordLen)
        self.ngramSplitter = None
        self.sink = self.dec

        langInfo = stream.lang and languages.get(code3=stream.lang.lower())
        if langInfo:
            if langInfo.rightToLeft:
                logger.info('switching to right-to-left for file "%s"',
                            stream.path)
                self.dec.setRightToLeft(True)

            if langInfo.ngrams:
                logger.info('switching to %i-gram for file "%s"',
                            langInfo.ngrams, stream.path)
                # The n-gram size replaces the configured minimum word
                # length, and words are routed through the splitter.
                self.dec.setMinWordLen(langInfo.ngrams)
                self.ngramSplitter = gizmo.NgramSplitter(langInfo.ngrams)
                self.dec.addWordsListener(self.ngramSplitter.pushWord)
                self.sink = self.ngramSplitter

        # Identity check: only a missing encoding (None) is skipped.
        if stream.enc is not None:
            self.dec.setEncoding(stream.enc)

        # NOTE(review): self.demux is not created here - presumably set up by
        # the base class constructor; confirm.
        self.demux.connectDec(self.dec, stream.no)
Esempio n. 5
0
def detectEncoding(path, lang, probeSize=32 * 1024):
    """Guess the text encoding of a file by trial decoding.

    Candidate encodings are tried in order: UTF-8, then the encodings listed
    for the language by `languages.get(lang).encodings`, then the default
    locale encoding if it is not already on the list. The first candidate
    that decodes the beginning of the file without a `UnicodeError` wins.

    Parameters
    ----------
    path
        Path of the file to probe.
    lang
        Language code used to select candidate encodings. When falsy, the
        language part of the default locale is used instead, if available.
    probeSize
        Amount of text read from the file for each candidate encoding
        (passed to `read()` of a file opened in text mode).

    Returns
    -------
    str
        Name of the first encoding that decoded the probe, or `None` when
        no candidate worked.

    Raises
    ------
    Error
        When the file does not exist.
    """
    try:
        dlang, denc = locale.getdefaultlocale()
    except Exception as e:
        logger.warning('getdefaultlocale failed, %r', e)
        dlang, denc = None, None

    if not lang and dlang:
        # Locale names look like "en_US"; keep only the language part.
        lang = dlang.split('_', 1)[0]

    encs = ['UTF-8'] + languages.get(lang).encodings
    if denc and denc not in encs:
        encs.append(denc)

    try:
        for enc in encs:
            with open(path, 'r', encoding=enc) as fp:
                try:
                    # Honour the caller-supplied probe size.
                    fp.read(probeSize)
                    logger.info('detected encoding %s for file "%s"', enc,
                                path)
                    return enc
                except UnicodeError:
                    # This candidate cannot decode the probe, try the next.
                    pass
    except FileNotFoundError:
        raise Error('File not found').add('path', path)

    logger.info('couldn\'t detect encoding for file "%s", tried %s', path,
                encs)
    return None
Esempio n. 6
0
def getLangFromPath(path):
    ''' Returns the language code found at the end of a file name, just
    before the extension, e.g. subtitles.eng.srt or subtitles-fr.srt

    Only a trailing run of exactly two or three letters is looked up in
    `languages` (lower-cased) and its `code3` is returned; any other name
    yields None.
    '''

    stem = path.rsplit('.', 1)[0]

    # Count letters from the end of the name up to the first non-letter.
    letters = 0
    for ch in stem[::-1]:
        if not ch.isalpha():
            break
        letters += 1

    if letters not in (2, 3):
        return None

    return languages.get(stem[-letters:].lower()).code3
Esempio n. 7
0
    def genDefaultFileName(self, path, suffix=None):
        """Build a default `.srt` file name from the base name of `path`.

        The result is `<name>[.<suffix or language code>].srt`. An explicit
        `suffix` takes precedence; otherwise the `appendLangCode` setting
        decides whether the language of `self.task.sub` is appended as is
        or as the `code2` attribute of its `languages` entry.

        Any exception is logged as a warning and `None` is returned.
        """
        try:
            stem = os.path.splitext(os.path.basename(path))[0]
            parts = [stem]

            if suffix:
                parts.append(suffix)

            elif settings().appendLangCode and self.task.sub.lang:
                if settings().appendLangCode in [3, True]:
                    parts.append(self.task.sub.lang)
                elif settings().appendLangCode == 2:
                    code2 = languages.get(code=self.task.sub.lang).code2
                    if code2:
                        parts.append(code2)

            return '.'.join(parts + ['srt'])
        except Exception as err:
            logger.warning('%r', err)
Esempio n. 8
0
    def __init__(self, stream, runCb=None):
        ''' Builds the speech recognition pipeline for an audio stream:

        Demux --> AudioDec --> Resampler --> SpeechRecognition --[words]--> {NgramSplitter} --[words]--> ...

        `self.sink` is the n-gram splitter when the stream language uses
        n-grams, otherwise the speech recognizer itself.
        '''

        super().__init__(stream, runCb)

        model = speech.loadSpeechModel(stream.lang)
        self.dec = gizmo.AudioDec()

        audioFormat = speech.getSpeechAudioFormat(model)
        logger.info('speech recognition audio format: %s', audioFormat)

        self.speechRec = recognizer = speech.createSpeechRec(model)
        recognizer.setMinWordProb(settings().minWordProb)
        recognizer.setMinWordLen(settings().minWordLen)
        self.ngramSplitter = None
        self.sink = recognizer

        langInfo = stream.lang and languages.get(code3=stream.lang.lower())
        ngrams = langInfo.ngrams if langInfo else None
        if ngrams:
            logger.info('switching to %i-gram for audio "%s"', ngrams,
                        stream.path)
            # The n-gram size replaces the configured minimum word length,
            # and recognized words are routed through the splitter.
            recognizer.setMinWordLen(ngrams)
            splitter = gizmo.NgramSplitter(ngrams)
            self.ngramSplitter = splitter
            recognizer.addWordsListener(splitter.pushWord)
            self.sink = splitter

        self.resampler = gizmo.Resampler()
        self.channels = stream.channels
        self.resampler.connectFormatChangeCallback(self.onAudioFormatChanged)

        # Wire the stages together: demux -> decoder -> resampler -> speech.
        self.demux.connectDec(self.dec, stream.no)
        self.dec.connectOutput(self.resampler)
        self.resampler.connectOutput(recognizer, audioFormat)