def detect_language(text: str, language_codes: Optional[Iterable[str]] = None, use_cld2full: bool = False):
    """Detect the languages of *text* and return them ranked by score.

    Args:
        text: The text handed to the detector.
        language_codes: Candidate language codes; passed through
            ``check_languages`` together with ``SUPPORTED_LANGUAGES`` before
            being used as the allow-list (exact normalisation is defined by
            that helper).
        use_cld2full: When true, ``cld2full`` is used and scores are divided
            by 500; otherwise ``cld2`` is used and scores are divided by 1000.

    Returns:
        A list of ``(language_code, scaled_score)`` tuples sorted by score in
        descending order, or an empty list when the detector reports the
        result as not reliable.
    """
    language_codes = check_languages(language_codes, SUPPORTED_LANGUAGES)

    # Pick the detector module together with the divisor applied to its scores.
    detector, divisor = (cld2full, 500) if use_cld2full else (cld2, 1000)
    is_reliable, _bytes_found, details = detector.detect(text, bestEffort=True)
    if not is_reliable:
        return []

    ranked = []
    for _name, raw_code, percent, score in details:
        # Keep only the part before the first '-' (e.g. 'zh-Hant' -> 'zh').
        code = raw_code.split('-')[0] if '-' in raw_code else raw_code
        # Drop languages outside the allow-list, with a score below 1, or
        # covering less than 50 percent of the text.
        if code not in language_codes or score < 1 or percent < 50:
            continue
        ranked.append((code, score / divisor))

    ranked.sort(key=lambda pair: pair[1:], reverse=True)
    return ranked
def get(self):
    """Handle a GET request that identifies the language of the ``q`` argument.

    Behaviour:
      * Empty ``q``: replies with HTTP 400 and returns the result of
        ``send_error``.
      * ``cld2`` is truthy: runs ``cld2.detect`` on the text. If element 0 of
        the result is truthy, responds with a dict built from the entries in
        element 2, skipping those whose ``[1]`` is ``'un'`` and mapping
        ``toAlpha3Code(entry[1])`` to ``entry[2]``. Otherwise responds with
        the placeholder ``{'nob': 100}``.
      * ``cld2`` is falsy: computes ``getCoverages`` in a one-process pool,
        waits at most ``self.timeout`` for it and responds with the result,
        or with HTTP 408 when the wait times out.
    """
    text = self.get_argument('q')
    if not text:
        return self.send_error(400, explanation='Missing q argument')

    if cld2:
        cldResults = cld2.detect(text)
        if cldResults[0]:
            possibleLangs = filter(lambda x: x[1] != 'un', cldResults[2])
            self.sendResponse({toAlpha3Code(possibleLang[1]): possibleLang[2] for possibleLang in possibleLangs})
        else:
            self.sendResponse({'nob': 100})  # TODO: Some more reasonable response
        return

    pool = Pool(processes=1)
    # No apply_async callback: multiprocessing would invoke it on the pool's
    # result-handling thread. The response is sent below, from this method,
    # once the result has actually been fetched.
    result = pool.apply_async(getCoverages, [text, self.analyzers], {'penalize': True})
    pool.close()
    try:
        # NOTE(review): AsyncResult.get raises multiprocessing.TimeoutError;
        # confirm that is the TimeoutError name in scope in this module.
        coverages = result.get(timeout=self.timeout)
    except TimeoutError:
        self.send_error(408, explanation='Request timed out')
        pool.terminate()
    else:
        self.sendResponse(coverages)
def get(self):
    """Respond with language-identification results for the ``q`` argument.

    An empty ``q`` yields HTTP 400. When ``cld2`` is truthy its detection
    result is used: a truthy element 0 produces a dict mapping
    ``to_alpha3_code(entry[1])`` to ``entry[2]`` for every entry of element 2
    whose ``[1]`` is not ``'un'``; otherwise the placeholder ``{'nob': 100}``
    is sent. When ``cld2`` is falsy, ``get_coverages`` is awaited for at most
    ``self.timeout`` seconds and its result is sent, or HTTP 408 on timeout.

    This is a generator (it yields the timed-out-wrapped coverages future);
    presumably it is driven by a coroutine decorator outside this block.
    """
    text = self.get_argument('q')
    if not text:
        return self.send_error(400, explanation='Missing q argument')

    if not cld2:
        # Fallback path: analyzer coverages, bounded by the handler's timeout.
        try:
            coverages = yield gen.with_timeout(
                timedelta(seconds=self.timeout),
                get_coverages(text, self.analyzers, penalize=True),
            )
            self.send_response(coverages)
        except gen.TimeoutError:
            self.send_error(408, explanation='Request timed out')
        return

    detection = cld2.detect(text)
    if not detection[0]:
        self.send_response({'nob': 100})  # TODO: Some more reasonable response
        return

    self.send_response({
        to_alpha3_code(entry[1]): entry[2]
        for entry in detection[2]
        if entry[1] != 'un'
    })
def get(self):
    """Identify the language of the ``q`` request argument and send the result.

    * Empty ``q``: HTTP 400 is sent and the value of ``send_error`` returned.
    * ``cld2`` truthy: ``cld2.detect(text)`` is consulted. With a truthy
      element 0, the response maps ``toAlpha3Code(entry[1])`` to ``entry[2]``
      for each entry of element 2 whose ``[1]`` differs from ``'un'``;
      otherwise the placeholder ``{'nob': 100}`` is sent.
    * ``cld2`` falsy: ``getCoverages`` runs in a single-process pool; the
      handler waits up to ``self.timeout`` and sends the coverages, or sends
      HTTP 408 and terminates the pool if the wait times out.
    """
    text = self.get_argument('q')
    if not text:
        return self.send_error(400, explanation='Missing q argument')

    if cld2:
        cldResults = cld2.detect(text)
        if cldResults[0]:
            possibleLangs = filter(lambda x: x[1] != 'un', cldResults[2])
            self.sendResponse({
                toAlpha3Code(possibleLang[1]): possibleLang[2]
                for possibleLang in possibleLangs
            })
        else:
            self.sendResponse({'nob': 100})  # TODO: Some more reasonable response
        return

    pool = Pool(processes=1)
    # The result is delivered by this method after result.get() returns,
    # instead of by an apply_async callback (which multiprocessing runs on
    # the pool's result-handling thread and whose outcome was never checked).
    result = pool.apply_async(getCoverages, [text, self.analyzers], {'penalize': True})
    pool.close()
    try:
        # NOTE(review): AsyncResult.get raises multiprocessing.TimeoutError;
        # confirm that is the TimeoutError name in scope in this module.
        coverages = result.get(timeout=self.timeout)
    except TimeoutError:
        self.send_error(408, explanation='Request timed out')
        pool.terminate()
    else:
        self.sendResponse(coverages)
def add_langs(doc, langs, langdocs, text=True):
    """Accumulate per-language statistics for one document.

    Runs ``cld.detect(doc, text)`` and, for every detected language entry
    ``(name, code, percent, score)``:

    * adds ``length * percent / 100`` to ``langs[name]`` (starting from 0.0),
      where ``length`` is the second element of the detection result
      (presumably the amount of text the detector examined -- confirm
      against the ``cld`` binding in use);
    * increments ``langdocs[name]`` (starting from 0) when ``percent`` is
      greater than zero.

    Both mappings are updated in place.

    Returns:
        The ``length`` value reported by the detector.
    """
    _reliable, doc_length, detections = cld.detect(doc, text)
    for lang_name, _code, percent, _score in detections:
        weighted = doc_length * percent / 100
        langs[lang_name] = langs.get(lang_name, 0.0) + weighted
        if percent > 0:
            # Count the document once for each language it actually contains.
            langdocs[lang_name] = langdocs.get(lang_name, 0) + 1
    return doc_length
def get(self):
    """Serve a language-identification request for the ``q`` argument.

    Sends HTTP 400 when ``q`` is empty. With a truthy ``cld2``, the response
    is derived from ``cld2.detect(text)``: if element 0 is truthy, a dict of
    ``to_alpha3_code(item[1])`` to ``item[2]`` over the items of element 2
    whose ``[1]`` is not ``'un'``; if not, the placeholder ``{'nob': 100}``.
    With a falsy ``cld2``, the handler yields ``get_coverages`` wrapped in a
    ``self.timeout``-second timeout and sends its result, replying with
    HTTP 408 if the timeout fires.

    The method is a generator; presumably a coroutine decorator outside this
    block drives it.
    """
    text = self.get_argument('q')
    if not text:
        return self.send_error(400, explanation='Missing q argument')

    if not cld2:
        # Without CLD2, fall back to analyzer coverages under a time limit.
        try:
            coverages = yield gen.with_timeout(
                timedelta(seconds=self.timeout),
                get_coverages(text, self.analyzers, penalize=True),
            )
            self.send_response(coverages)
        except gen.TimeoutError:
            self.send_error(408, explanation='Request timed out')
        return

    outcome = cld2.detect(text)
    if outcome[0]:
        self.send_response({
            to_alpha3_code(item[1]): item[2]
            for item in outcome[2]
            if item[1] != 'un'
        })
    else:
        self.send_response({'nob': 100})  # TODO: Some more reasonable response
raise RuntimeError('malformed line %d: %s' % (lineCount, line)) lang = m.group(1) source = m.group(2) text = m.group(3) # Ignore odd combinations: if lang in ( 'ar-Latn', # Arabic 'hr-Cyrl', # Croatian 'ko-Latn', # Korean 'fa-Latn'): print('NOTE: skip odd lang/script combination %s: source=%s, text=%s' % (lang, source, text)) continue isReliable, textBytesFound, details = cld2detect.detect(text, isPlainText=True) langCode = lang.split('-')[0] if langCode == details[0][1]: #if langCode in [x[1] for x in details]: correct += 1 else: wrong += 1 print("wrong: %s vs %s: %s" % (langCode, details, text)) #print('%s: %s, %s' % (lang, isReliable, details)) #print('%s: %s' % (lang, source)) t1 = time.time() total = correct + wrong print('Took %.1f sec (%.3f msec per test); %d correct of %d total: %.3f %% accuracy' % \ (t1-t0, 1000*(t1-t0)/total,
m = reOneLine.match(line) if m is None: raise RuntimeError('malformed line %d: %s' % (lineCount, line)) lang = m.group(1) source = m.group(2) text = m.group(3) # Ignore odd combinations: if lang in ('ar-Latn', # Arabic 'hr-Cyrl', # Croatian 'ko-Latn', # Korean 'fa-Latn'): print('NOTE: skip odd lang/script combination %s: source=%s, text=%s' % (lang, source, text)) continue isReliable, textBytesFound, details = cld2detect.detect(text, isPlainText=True) langCode = lang.split('-')[0] if langCode == details[0][1]: #if langCode in [x[1] for x in details]: correct += 1 else: wrong += 1 print("wrong: %s vs %s: %s" % (langCode, details, text)) #print('%s: %s, %s' % (lang, isReliable, details)) #print('%s: %s' % (lang, source)) t1 = time.time() total = correct + wrong print('Took %.1f sec (%.3f msec per test); %d correct of %d total: %.3f %% accuracy' % \ (t1-t0, 1000*(t1-t0)/total,