def detect_language(text: str, language_codes: Optional[Iterable[str]] = None, use_cld2full: bool = False):
    """Detect the languages of *text* and return them ranked by score.

    Args:
        text: The text handed to the detector.
        language_codes: Language codes to accept; first passed through
            ``check_languages`` together with ``SUPPORTED_LANGUAGES``.
        use_cld2full: When true, ``cld2full`` is used instead of ``cld2``
            and scores are divided by 500 rather than 1000.

    Returns:
        A list of ``(language_code, scaled_score)`` tuples ordered from the
        highest to the lowest scaled score. The list is empty when the
        detector reports the result as unreliable.
    """
    language_codes = check_languages(language_codes, SUPPORTED_LANGUAGES)

    detector = cld2full if use_cld2full else cld2
    is_reliable, _bytes_found, details = detector.detect(text, bestEffort=True)
    if not is_reliable:
        return []

    # NOTE(review): the divisors presumably normalise each detector's raw
    # score range -- confirm against the cld2 / cld2full documentation.
    divisor = 500 if use_cld2full else 1000

    ranked = []
    for _language_name, code, percent, score in details:
        # Keep only the part before the first '-' (e.g. a script or region
        # suffix is dropped).
        base_code = code.split('-')[0]
        if base_code not in language_codes or score < 1 or percent < 50:
            continue
        ranked.append((base_code, score / divisor))

    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked
    def get(self):
        """Handle GET: report the likely language(s) of the ``q`` argument.

        Sends a 400 error when ``q`` is empty. When ``cld2`` is truthy, its
        detection result is mapped to ``{alpha3_code: percent}`` (entries
        coded ``'un'`` are dropped); an unreliable detection yields the
        placeholder ``{'nob': 100}``. Otherwise ``getCoverages`` is run in a
        one-process pool and its result is sent, or a 408 error if it does
        not finish within ``self.timeout`` seconds.
        """
        text = self.get_argument('q')
        if not text:
            return self.send_error(400, explanation='Missing q argument')

        if cld2:
            cldResults = cld2.detect(text)
            if cldResults[0]:
                self.sendResponse({
                    toAlpha3Code(possibleLang[1]): possibleLang[2]
                    for possibleLang in cldResults[2]
                    if possibleLang[1] != 'un'
                })
            else:
                self.sendResponse({'nob': 100})  # TODO: Some more reasonable response
            return

        pool = Pool(processes=1)
        # No callback here: apply_async callbacks run on the pool's
        # result-handler thread, and the response must be written from the
        # thread that is handling the request.
        result = pool.apply_async(getCoverages, [text, self.analyzers], {'penalize': True})
        pool.close()
        try:
            # NOTE(review): AsyncResult.get raises multiprocessing.TimeoutError,
            # which is not the builtin TimeoutError -- confirm which one is
            # in scope in this module.
            coverages = result.get(timeout=self.timeout)
        except TimeoutError:
            self.send_error(408, explanation='Request timed out')
            pool.terminate()
        else:
            self.sendResponse(coverages)
Exemple #3
0
    def get(self):
        """Handle GET: report the likely language(s) of the ``q`` argument.

        An empty ``q`` produces a 400 error. If ``cld2`` is truthy its
        detection result is sent as ``{alpha3_code: percent}`` (entries coded
        ``'un'`` excluded), or the placeholder ``{'nob': 100}`` when the
        detection is flagged unreliable. If ``cld2`` is falsy, the result of
        ``get_coverages`` is awaited for at most ``self.timeout`` seconds and
        sent; on timeout a 408 error is sent instead.
        """
        text = self.get_argument('q')
        if not text:
            return self.send_error(400, explanation='Missing q argument')

        if not cld2:
            # Fallback path: analyzer coverages, bounded by the timeout.
            try:
                coverages = yield gen.with_timeout(
                    timedelta(seconds=self.timeout),
                    get_coverages(text, self.analyzers, penalize=True),
                )
                self.send_response(coverages)
            except gen.TimeoutError:
                self.send_error(408, explanation='Request timed out')
            return

        detection = cld2.detect(text)
        if not detection[0]:
            # First element of the result is the reliability flag.
            self.send_response({'nob': 100})  # TODO: Some more reasonable response
            return

        self.send_response({
            to_alpha3_code(candidate[1]): candidate[2]
            for candidate in detection[2]
            if candidate[1] != 'un'
        })
Exemple #4
0
    def get(self):
        """Handle GET: report the likely language(s) of the ``q`` argument.

        Responds with a 400 error when ``q`` is empty. With a truthy
        ``cld2``, the detection details are sent as ``{alpha3_code: percent}``
        with ``'un'`` entries removed, or ``{'nob': 100}`` as a placeholder
        when the detection is not reliable. Without ``cld2``,
        ``getCoverages`` runs in a single-process pool; its result is sent
        if it arrives within ``self.timeout`` seconds, otherwise a 408 error
        is sent and the pool is terminated.
        """
        text = self.get_argument('q')
        if not text:
            return self.send_error(400, explanation='Missing q argument')

        if cld2:
            cldResults = cld2.detect(text)
            if cldResults[0]:
                self.sendResponse({
                    toAlpha3Code(possibleLang[1]): possibleLang[2]
                    for possibleLang in cldResults[2]
                    if possibleLang[1] != 'un'
                })
            else:
                self.sendResponse({'nob':
                                   100})  # TODO: Some more reasonable response
            return

        pool = Pool(processes=1)
        # The result is fetched and sent below instead of via an apply_async
        # callback, because such callbacks execute on the pool's
        # result-handler thread rather than the thread serving the request.
        result = pool.apply_async(getCoverages, [text, self.analyzers],
                                  {'penalize': True})
        pool.close()
        try:
            # NOTE(review): the timeout raised here is
            # multiprocessing.TimeoutError, distinct from the builtin
            # TimeoutError -- confirm the module imports the former.
            coverages = result.get(timeout=self.timeout)
        except TimeoutError:
            self.send_error(408, explanation='Request timed out')
            pool.terminate()
        else:
            self.sendResponse(coverages)
Exemple #5
0
def add_langs(doc, langs, langdocs, text=True):
    """Accumulate per-language statistics for one document.

    Runs ``cld.detect(doc, text)`` and, for every reported language, adds
    that language's share of the detected length to ``langs`` and -- when
    its percentage is positive -- increments its document count in
    ``langdocs``. Both dictionaries are updated in place, keyed by the
    language name.

    Args:
        doc: The document handed to the detector.
        langs: Mapping of language name to accumulated length.
        langdocs: Mapping of language name to number of documents.
        text: Forwarded as the second positional argument of ``cld.detect``.

    Returns:
        The length value reported by the detector for this document.
    """
    _success, detected_length, detections = cld.detect(doc, text)
    for lang_name, _code, percent, _score in detections:
        share = detected_length * percent / 100
        langs[lang_name] = langs.get(lang_name, 0.0) + share
        if percent > 0:
            langdocs[lang_name] = langdocs.get(lang_name, 0) + 1
    return detected_length
    def get(self):
        """Handle GET: report the likely language(s) of the ``q`` argument.

        Sends a 400 error for an empty ``q``. When ``cld2`` is truthy, one
        response is sent: ``{alpha3_code: percent}`` built from the detection
        details (skipping entries coded ``'un'``) if the detection is
        reliable, else the placeholder ``{'nob': 100}``. When ``cld2`` is
        falsy, ``get_coverages`` is awaited with a ``self.timeout``-second
        limit and its result sent; a timeout results in a 408 error.
        """
        text = self.get_argument('q')
        if not text:
            return self.send_error(400, explanation='Missing q argument')

        if cld2:
            outcome = cld2.detect(text)
            if outcome[0]:
                payload = {
                    to_alpha3_code(entry[1]): entry[2]
                    for entry in outcome[2]
                    if entry[1] != 'un'
                }
            else:
                payload = {'nob': 100}  # TODO: Some more reasonable response
            self.send_response(payload)
            return

        try:
            coverages = yield gen.with_timeout(
                timedelta(seconds=self.timeout),
                get_coverages(text, self.analyzers, penalize=True),
            )
            self.send_response(coverages)
        except gen.TimeoutError:
            self.send_error(408, explanation='Request timed out')
            raise RuntimeError('malformed line %d: %s' % (lineCount, line))
    # Pull the three captured fields out of the matched line: the expected
    # language tag, the sample's source, and the sample text itself.
    lang = m.group(1)
    source = m.group(2)
    text = m.group(3)

    # Ignore odd combinations:
    if lang in (
            'ar-Latn',  # Arabic
            'hr-Cyrl',  # Croatian
            'ko-Latn',  # Korean
            'fa-Latn'):  # Persian
        print('NOTE: skip odd lang/script combination %s: source=%s, text=%s' %
              (lang, source, text))
        continue

    # Run detection on the sample, telling the detector it is plain text.
    isReliable, textBytesFound, details = cld2detect.detect(text,
                                                            isPlainText=True)
    # Only the part of the expected tag before the first '-' is compared.
    langCode = lang.split('-')[0]
    # Presumably details[0][1] is the code of the top-ranked candidate --
    # confirm against the detector's return format.
    if langCode == details[0][1]:
        #if langCode in [x[1] for x in details]:
        correct += 1
    else:
        wrong += 1
        print("wrong: %s vs %s: %s" % (langCode, details, text))
        #print('%s: %s, %s' % (lang, isReliable, details))
        #print('%s: %s' % (lang, source))

t1 = time.time()
total = correct + wrong
print('Took %.1f sec (%.3f msec per test); %d correct of %d total: %.3f %% accuracy' % \
      (t1-t0,
       1000*(t1-t0)/total,
Exemple #8
0
        m = reOneLine.match(line)
        if m is None:
            raise RuntimeError('malformed line %d: %s' % (lineCount, line))
    # Pull the three captured fields out of the matched line: the expected
    # language tag, the sample's source, and the sample text itself.
    lang = m.group(1)
    source = m.group(2)
    text = m.group(3)

    # Ignore odd combinations:
    if lang in ('ar-Latn',  # Arabic
                'hr-Cyrl',  # Croatian
                'ko-Latn',  # Korean
                'fa-Latn'):  # Persian
        print('NOTE: skip odd lang/script combination %s: source=%s, text=%s' % (lang, source, text))
        continue

    # Run detection on the sample, telling the detector it is plain text.
    isReliable, textBytesFound, details = cld2detect.detect(text, isPlainText=True)
    # Only the part of the expected tag before the first '-' is compared.
    langCode = lang.split('-')[0]
    # Presumably details[0][1] is the code of the top-ranked candidate --
    # confirm against the detector's return format.
    if langCode == details[0][1]:
        #if langCode in [x[1] for x in details]:
        correct += 1
    else:
        wrong += 1
        print("wrong: %s vs %s: %s" % (langCode, details, text))
        #print('%s: %s, %s' % (lang, isReliable, details))
        #print('%s: %s' % (lang, source))

t1 = time.time()
total = correct + wrong
print('Took %.1f sec (%.3f msec per test); %d correct of %d total: %.3f %% accuracy' % \
      (t1-t0,
       1000*(t1-t0)/total,