def feature_language(sentence, code):
    if code == "nb":
        code = "no"

    reliable = False
    bytes = 0
    details = ()

    try:
        reliable, bytes, details = pycld2.detect(sentence)
    except:
        sent2 = "".join(filter(lambda x: x in string.printable, sentence))
        reliable, bytes, details = pycld2.detect(sent2)

    if not reliable:
        return 0.0
    else:
        score = float(details[0][2]) / 100.0

    if details[0][1] != code:
        if code == "gl" and (details[0][1] == "pt" or details[0][1] == "es"):
            return score
        if code == "no" and details[0][1] == "da":
            return score
        if code == "nn" and (details[0][1] == "no" or details[0][1] == "da"):
            return score
        else:
            return 0.0
    else:
        return score
def detect(self, text):
    """Decide which language is used to write the text.

    The method tries first to detect the language with high reliability. If
    that is not possible, the method switches to best effort strategy.

    Args:
      text (string): A snippet of text, the longer it is the more reliable we
                     can detect the language used to write the text.
    """
    t = text.encode("utf-8")
    reliable, index, top_3_choices = cld2.detect(t, bestEffort=False)

    if not reliable:
        self.reliable = False
        reliable, index, top_3_choices = cld2.detect(t, bestEffort=True)

        if not self.quiet:
            if not reliable:
                raise UnknownLanguage("Try passing a longer snippet of text")
            else:
                logger.warning("Detector is not able to detect the language reliably.")

    self.languages = [Language(x) for x in top_3_choices]
    self.language = self.languages[0]
    return self.language
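# A minimal sketch of the two-stage strategy the docstring above describes,
# written directly against pycld2. The helper name and sample usage are ours,
# not part of the original snippet.
import pycld2

def detect_with_fallback(text):
    encoded = text.encode("utf-8")
    # First pass: only accept a confident answer.
    reliable, _, top_3 = pycld2.detect(encoded, bestEffort=False)
    if not reliable:
        # Second pass: best-effort guess, still flagged as unreliable.
        reliable, _, top_3 = pycld2.detect(encoded, bestEffort=True)
    # Each candidate is a (languageName, languageCode, percent, score) tuple.
    return reliable, top_3[0]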
def handle_client(reader, writer):
    request = None
    classifier = 'langid'
    langhint = None
    received = ''

    while '<<CLASSIFY>>' not in received:
        request = (yield from reader.read(255)).decode('utf8')
        if request:
            # print("received "+request)
            received += request

    response = None
    text = ''
    received = received.replace('<<CLASSIFY>>', '')
    lines = received.split("\n")
    for l in lines:
        if 'CLASSIFIER=' in l:
            classifier = l.split('=')[1]
        elif 'LANGHINT=' in l:
            langhint = l.split('=')[1]
        else:
            text += l

    # print("classify "+text)
    if classifier == 'cld2':
        if langhint:
            isReliable, textBytesFound, details = cld2.detect(
                text, bestEffort=True, hintLanguage=langhint)
        else:
            isReliable, textBytesFound, details = cld2.detect(text, bestEffort=True)
        response = str((details[0][1], isReliable, details))
    else:
        response = str(identifier.classify(text))

    writer.write(response.encode('utf8'))
def c_different_language(left, right):
    l_reliable = False
    l_bytes = 0
    l_details = ()
    try:
        l_reliable, l_bytes, l_details = pycld2.detect(left)
    except:
        return False  # encoding error -> noise

    r_reliable = False
    r_bytes = 0
    r_details = ()
    try:
        r_reliable, r_bytes, r_details = pycld2.detect(right)
    except:
        return False  # encoding error -> noise

    if l_reliable and r_reliable and l_details[0][1] != r_details[0][1]:
        return True
    elif not l_reliable or not r_reliable:
        return True
    else:
        return False
def fe_title_lang(raw: RawData) -> RawData:
    raw.train["title_lang"] = (
        raw.train["title"].fillna("").map(lambda x: cld2.detect(x)[2][0][1])
    )
    raw.test["title_lang"] = (
        raw.test["title"].fillna("").map(lambda x: cld2.detect(x)[2][0][1])
    )
    return raw
def detect_language(text):
    t = text.encode("utf-8")
    reliable, index, top_3_choices = cld2.detect(t, bestEffort=False)

    if not reliable:
        reliable, index, top_3_choices = cld2.detect(t, bestEffort=True)
        if not reliable:
            raise UnknownLanguage("Try passing a longer snippet of text")

    return top_3_choices[0]
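# Illustrative usage of detect_language above; the sentence is invented and we
# only rely on the (name, code, percent, score) candidate layout CLD2 returns.
# UnknownLanguage is assumed to be defined alongside the function.
name, code, percent, score = detect_language("Ceci est une phrase en français.")
print(code, name)  # expected: something like 'fr' FRENCH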
def detect_lang(text):
    try:
        is_reliable, text_bytes_found, details = cld2.detect(text)
    except:
        text = ''.join(x for x in text if x in string.printable)
        is_reliable, text_bytes_found, details = cld2.detect(text)

    # print('detected: %s' % detectedLangName)
    # print('reliable: %s' % (isReliable != 0))
    # print('textBytes: %s' % textBytesFound)
    # print('details: %s' % str(details))
    return details[0][1]
def is_accepted(line, accept, reject):
    # isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True)
    if accept:
        isReliable, textBytesFound, details = cld2.detect(line, hintLanguage=accept, bestEffort=True)
        if details[0][1] == accept:
            if isReliable:
                return True
        if args.verbose:
            print("language mismatch: " + details[0][1] + " != " + accept + ", " + line,
                  file=sys.stderr, flush=True)
    else:
        isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True)
        if details[0][1] != reject:
            return True
        if args.verbose:
            print("reject because detected: " + details[0][1] + ", " + line,
                  file=sys.stderr, flush=True)
def generate_stats(corpus):
    confidence = 0
    pages = 0
    words = 0
    chars = 0
    for text in corpus.texts:
        if type(text.author) is list:
            author = '|'.join(text.author)
        else:
            author = text.author
        confidence += text.avg_word_confidence * text.words
        pages += text.pages
        words += text.words
        chars += len(text.text)

        vid = text.book_id
        if text.volume:
            vid += ('_%02d' % text.volume)

        isReliable, textBytesFound, langDetails = pycld2.detect(text.title.encode('utf-8'), isPlainText=True)
        if isReliable:
            language = langDetails[0][0]
            langCode = langDetails[0][1]
        else:
            language = 'Unknown'
            langCode = '--'

        print('\t'.join([vid, str(text.avg_word_confidence), langCode, str(text.pages),
                         str(text.words), str(len(text.text)), author, text.title]))
    return (confidence, pages, words, chars)
def detect_langage(text, method='cld2'):
    '''For each piece of text input, this function uses the method passed to
    return the detected languages.

    Pass the 'method' parameter for different models.
    Valid params = [cld2, langdetect, polyglot]'''
    ## Encode to utf-8
    text = text.encode('utf-8').decode("utf-8", "ignore")
    try:
        if method == 'cld2':
            # Pass to cld2
            result = cld2.detect(text, bestEffort=False)
        elif method == 'langdetect':
            ### TODO : return values properly
            result = detect_langs(text)
        elif method == 'polyglot':
            ### TODO : implement polyglot
            result = tuple()
        else:
            result = tuple()
    except Exception as e:
        # logger.error(e)
        result = tuple()

    # Now, compute the probabilities
    _p = compute_lang_prob(result)
    return _p
def check_page(url, lang):
    content = requests.get(url).content
    isReliable, textBytesFound, details = cld2.detect(content)
    for lname, lcode, conf, _ in details:
        if lcode == lang:
            return True
    return False
def c_reliable_long_language(sentence, language):
    if language == "nb":
        language = "no"

    reliable = False
    bytes = 0
    details = ()

    try:
        reliable, bytes, details = pycld2.detect(sentence)
    except:
        return True  # encoding error -> noise

    if len(sentence) > 30 and reliable and details[0][1] != language:
        if language == "gl" and (details[0][1] == "pt" or details[0][1] == "es"):
            return True
        if language == "no" and details[0][1] == "da":
            return True
        if language == "nn" and (details[0][1] == "no" or details[0][1] == "da"):
            return True
        # print(sentence + " " + str(details[0][1]))
        return False
    else:
        return True
def func_dbpedia_spotlight(d):
    """
    Helper function for processing a paper in a thread with DBpedia Spotlight
    :param d: content of the paper
    :return: result of the annotation with DBpedia Spotlight in JSON || None if the JSON annotation exists already
    """
    d_json = {}
    paper_id = d['paper_id']
    title = d["metadata"]["title"]
    if os.path.isfile(path_output + '/dbpedia-spotlight/' + folder + '/' + paper_id + '.json'):
        pbar.update()
        return None

    try:
        body_text = cotools.text(d)
        isreliable, textbytesfound, details, vectors = pycld2.detect(
            body_text, returnVectors=True)
        lang = vectors[0][3]
    # None or out of range
    except:
        lang = 'en'

    if os.path.isfile('/data/CORD19-Annotation-multi/entity-fishing/' + folder + '/' + paper_id + '.json'):
        return None

    d_json["paper_id"] = paper_id
    d_json["lang"] = lang

    try:
        abstract = cotools.abstract(d)
        d_json["abstract"] = wa.request_dbpedia_spotlight(abstract, lang)
    # no abstract
    except Exception:
        pass

    d_json["title"] = wa.request_dbpedia_spotlight(title, lang)
    d_json["body_text"] = wa.request_dbpedia_spotlight(body_text, lang)

    d_json["ref_entries"] = {}
    for key, value in d["ref_entries"].items():
        d_json["ref_entries"][key] = wa.request_dbpedia_spotlight(
            value["text"])

    #d_json["bib_entries"] = {}
    #for key, value in d["bib_entries"].items():
    #    d_json["bib_entries"][key] = wa.request_dbpedia_spotlight(value["title"])

    d_json["back_matter"] = []
    for matter in d["back_matter"]:
        for key, value in matter.items():
            if key == 'text':
                text = {'text': wa.request_dbpedia_spotlight(value)}
                d_json["back_matter"].append(text)

    Output().save_json(
        d_json,
        path_output + '/dbpedia-spotlight/' + folder + '/' + d["paper_id"] + '.json')
    pbar.update()
    return d_json
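# Small sketch of the returnVectors=True output used above (the sample text is
# invented). Each vector entry describes one contiguous span of the input.
import pycld2

_, _, _, vectors = pycld2.detect("This is English text.", returnVectors=True)
# vectors is a tuple of (byte offset, byte length, language name, language code)
# tuples, e.g. (0, 21, 'ENGLISH', 'en'); func_dbpedia_spotlight above takes the
# code of the first span (vectors[0][3]) as the document language.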
def english_check(corpus):
    # It's not that we are cultural imperialists, but the people at textstat,
    # and nltk may have been, so we are also forced into this tacit agreement.
    # Japanese characters massively distort information theory estimates, as
    # they are potentially very concise.
    _, _, details = cld2.detect(" ".join(corpus), bestEffort=True)
    detectedLangName, _ = details[0][:2]
    return bool(detectedLangName == "ENGLISH")
def try_get_lang(content):
    try:
        reliable, _, details = cld.detect(content)
        if reliable:
            return details[0][1]
    except cld_error:
        pass
    return None
def feature_language(sentence, code):
    reliable = False
    bytes = 0
    details = ()

    try:
        reliable, bytes, details = pycld2.detect(sentence)
    except:
        sent2 = "".join(filter(lambda x: x in string.printable, sentence))
        reliable, bytes, details = pycld2.detect(sent2)

    if not reliable:
        return 0.0

    if details[0][1] != code:
        return 0.0
    else:
        return float(details[0][2]) / 100.0
def get_lang(text):
    rel, _, matches = cld2.detect(text)
    if not rel:
        return

    matches = list(filter(lambda m: m[1] in ['ru', 'uk', 'en'], matches))
    if len(matches) == 0:
        return langid.classify(text)[0]

    return matches[0][1]
def detect_language(text):
    """
    Uses CLD2 to detect the language of text. Returns the BCP 47 language code,
    and a boolean indicating whether the result is confident.

    We modify CLD2's confidence value to say it's not confident if:

    - The detected language is a non-language placeholder like 'xx'
    - The detected language appears to be incorrect because it contains
      particular characters from a different script
    - The text is shorter than 50 bytes
    """
    # Format of pycld2.detect:
    #   (Confident in result: bool,
    #    Number of bytes of text: Int,
    #    Triples of detected languages in order of certainty:
    #      (Language name: str,
    #       Language code: str,
    #       Percent of text in this language: float,
    #       Confidence score: float))
    text = CLD2_BAD_CHARS_RE.sub('', text)

    det_result = pycld2.detect(text)
    confident = det_result[0]
    lang = det_result[2][0][1]

    # Normalize the language code: 'iw' becomes 'he', and 'zh-Hant'
    # becomes 'zh', for example
    code = CLD2_LANGUAGE_MAP.get(lang, lang)

    if len(text.encode('utf-8')) < 50:
        confident = False
    elif code not in CLD2_LANGUAGES:
        confident = False
    elif code == 'sh':
        # Fix cases of Arabic being detected as Bosnian
        if 'ا' in text:
            code = 'ar'
            confident = False
        # Fix cases of Russian being detected as Serbian
        if CYRILLIC_RE.search(text):
            confident = False

    return code, confident
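# A small, self-contained sketch of the raw pycld2.detect output that the
# comment block above documents; the sample sentence is made up.
import pycld2

sample = "Este es un ejemplo corto en español."
is_confident, num_bytes, candidates = pycld2.detect(sample)
top_name, top_code, top_percent, top_score = candidates[0]
# is_confident -> whether CLD2 is confident overall,
# num_bytes    -> bytes of text actually analysed,
# candidates   -> up to three (name, code, percent, score) tuples,
#                 ordered from most to least likely.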
def guess_lang_from_data2(data):
    try:
        reliable, text_bytes, detected_languages = cld2.detect(
            data, isPlainText=False)
    except:
        sys.stderr.write("error guessing language")
        return False, None

    return True, detected_languages[0][1]
def get_lang(tweet):
    cleaned = ' '.join([x for x in tweet.split() if is_valid(x)])
    try:
        lang = cld2.detect(cleaned)[2][0][1]
    except:
        return 'unk'
    if lang == 'un' or lang == 'xxx':
        return 'unk'
    return lang
def detect_language(text: str):
    cleaned_text = ''.join(
        x for x in text if x in printable
    )  # https://github.com/mikemccand/chromium-compact-language-detector/issues/22
    _, _, details = cld2.detect(cleaned_text)  # Tuple of up to 3 detected languages
    (languageName, languageCode, percent, score) = details[0]
    # percent is what percentage of the original text was detected as this
    # language and score is the confidence score for that language.
    return (languageName, score, text)
def detect_lang(s):
    '''
    Return the language of string s.

    Naive Bayes classifier under the hood - results are less certain for
    strings that are too short. CLD2 reports up to three candidate languages
    with confidence scores; this helper returns only the name of the top
    candidate.

    More on usage: https://pypi.org/project/pycld2/
    '''
    _, _, details = cld2.detect(s)
    return details[0][0]
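# Illustrative call to detect_lang above; the input string is invented and only
# the documented details layout is assumed. Note it returns the language *name*
# (details[0][0]); use details[0][1] instead if the ISO-style code is wanted.
print(detect_lang("Dit is een korte Nederlandse zin."))  # expected: e.g. 'DUTCH'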
def language_keyword(keyword):
    r = {
        'en': Rake(stopwords=ENGLISH_STOP_WORDS, punctuations=punctuation),
        'es': Rake(stopwords=SPANISH_STOP_WORDS, punctuations=punctuation),
        'ca': Rake(stopwords=CATALAN_STOP_WORDS, punctuations=punctuation)
    }
    _, _, details = cld2.detect(keyword.encode('utf-8', 'replace'),
                                isPlainText=True, bestEffort=True)
    lang = details[0][1]
    if lang not in r.keys():
        lang = 'en'  # Default language
    return lang
def _extract_page_info(article: dict, url: str) -> dict:
    """Extracts additional page information."""
    if not article:
        return {}

    language = detect(article.get('content_text'))
    if len(language) > 2 and len(language[2]) > 1:
        language_code = language[2][0][1]
    else:
        language_code = None

    return {'url': url, 'language': language_code}
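# Minimal usage sketch for _extract_page_info above. The article dict and URL
# are invented, and `detect` is assumed to be the cld2/pycld2 detect used
# throughout these snippets.
article = {"content_text": "Ein kurzer Beispieltext auf Deutsch."}
info = _extract_page_info(article, "https://example.org/page")
# -> {'url': 'https://example.org/page', 'language': 'de'} when detection
#    succeeds; the language falls back to None when no usable candidate exists.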
def get_lang_info(self, txt):
    try:
        txt = txt.encode('utf8')
        isReliable, textBytesFound, details = cld2.detect(txt)
    except:
        # Handle invalid utf-8 chars: fall back to the printable subset of the
        # text (decode first, since txt may already have been encoded above).
        if isinstance(txt, bytes):
            txt = txt.decode('utf8', 'ignore')
        txt = ''.join(x for x in txt if x in string.printable)
        isReliable, textBytesFound, details = cld2.detect(txt)

    outstr = str(textBytesFound)
    out_dict = {"ur": 0}
    for item in details:  # Iterate 3 languages
        if item[0] != "Unknown":
            outstr += '&' + item[0] + '-' + str(item[2]) + '-' + str(int(item[3]))
            if item[0] == "URDU":
                out_dict["ur"] = item[2]
    out_dict["result"] = outstr
    return out_dict
def language_detect(text: str) -> str:
    """:return: the language short string detected in the text or 'un' if an error occurs"""
    if isinstance(text, tuple):
        text = text[0]
    try:
        return cast(
            str,
            pycld2.detect(text.replace('\x7f', '').replace('\b', ''))[2][0][1])
    except pycld2.error:
        _LOGGER.exception('couldn\'t process input: %s', text)
        return 'un'
def check_language(line, expected):
    # Manipuri unsupported by cld2, but normally detected as Bengali
    if expected == "mni":
        expected = "bn"
    try:
        _, _, details = pycld2.detect(line, hintLanguage=expected)
        if expected != details[0][1]:
            print("WRONG {} != {} {}".format(details[0][1], expected, line), file=sys.stderr)
        return expected == details[0][1]
    except pycld2.error:
        print("pycld2.error", file=sys.stderr)
        return False
def html_to_text(args):
    html, meta = args
    try:
        html = html.decode('utf-8')
    except UnicodeDecodeError:
        # try to figure out encoding if not utf-8
        guess = chardet.detect(html)['encoding']
        if not guess or guess == 'UTF-8':
            return
        try:
            html = html.decode(guess)
        except (UnicodeDecodeError, LookupError):
            # still can't figure out encoding, give up
            return
    try:
        try:
            _, _, details = cld2.detect(html)
        except:
            # cld2 doesn't like control characters
            # https://github.com/mikemccand/chromium-compact-language-detector/issues/22#issuecomment-435904616
            html_no_ctrl_chars = ''.join([l for l in html if unicodedata.category(l)[0] not in ['C',]])
            _, _, details = cld2.detect(html_no_ctrl_chars)

        if details[0][1] == 'en':
            meta = {
                'primary_language': 'en',
                'lang_detector': 'pycld2',
                'lang_detector_extra_info': details,
                **meta
            }
            return [x.text for x in justext.justext(html, justext.get_stoplist('English'))
                    if not x.is_boilerplate], meta
    except lxml.etree.ParserError:
        return
    except:
        traceback.print_exc()
def apiResult(sentence):
    '''
    Returns JSON format form data. Sentence is passed to the url with + to act
    as spaces, i.e. "habari+yako"
    '''
    cleanSentence = sentence.split('+')
    cleanSentence = ' '.join(cleanSentence)
    get1, get2, get3 = cld2.detect(sentence)
    reliability = (get1)
    lang = (get3[0][0])
    match = ('{0:.4f} %'.format(get3[0][2]))
    output = {'reliablity': reliability, 'lang': lang, 'match': match}
    return jsonify(output)
def check_message(msg):
    dct = json.loads(msg)
    if 'owner_id' not in dct:
        raise ValueError("message should contain owner_id key")
    if 'text' not in dct:
        raise ValueError("message should contain text key")

    isReliable, textBytesFound, details = cld2.detect(dct["text"])
    lang = details[0][1]
    if lang != "en":
        raise NotEnglishLanguageError(lang)

    return {"owner_id": dct["owner_id"],
            "text": dct["text"],
            "id": dct.get("id", ""),
            "source": dct.get("source", "")}
def tag_records(partition):
    for warc_record in partition:
        parser = BeautifulSoup(warc_record.html_source, 'html.parser')
        plaintext = ' '.join(parser.stripped_strings)
        plaintext_stripped = sub('\\s+', ' ', plaintext)

        if plaintext_stripped is None or plaintext_stripped == '':
            yield ()  # empty tuple
        else:
            cleaned_text = ''.join(x for x in plaintext_stripped if x in printable)
            _, _, details = pycld2.detect(cleaned_text)
            (languageName, languageCode, percent, score) = details[0]
            yield warc_record.target_uri, languageCode, str(score)
def detect_language(text):
    try:
        # text = bytes(text, 'utf-8').decode('utf-8', 'backslashreplace')
        detected = cld2.detect(text)
        if detected[0]:
            lang = detected[2][0][0].lower()
            lang_code = detected[2][0][1]
        else:
            lang = lang_code = None
    except Exception as err:
        raise Exception("TextProcessor::detect_language: " + str(err))
    return lang, lang_code
def result():
    '''
    Get language, reliability percentage and match
    '''
    if request.method == 'POST':
        if not request.form['word']:
            return redirect(url_for('show_home'))
        data = str(request.form['word'])
        get1, get2, get3 = cld2.detect(data)
        reliability = (get1)
        lang = (get3[0][0])
        match = ('{0:.4f} %'.format(get3[0][2]))
        output = {'reliablity': reliability, 'lang': lang, 'match': match}
        return render_template('find.html', **output)
def cld2_detect_language(text):
    """
    Uses CLD2 to detect the language.
    """
    # Format of pycld2.detect:
    #   (Confident in result: bool,
    #    Number of bytes of text: Int,
    #    Triples of detected languages in order of certainty:
    #      (Language name: str,
    #       Language code: str,
    #       Percent of text in this language: float,
    #       Confidence score: float))
    text = CLD2_BAD_CHARS_RE.sub('', text)
    return pycld2.detect(text)[2][0][1]
def run(text, args):
    """
    Run the language identification module.

    :param text: the input text
    :param args: the command line arguments
    :return: a tuple in the form (language_code, text)
    """
    _, _, languages = pycld2.detect(text.strip().encode('utf-8'))
    if len(list(languages)) > 0:
        language = languages[0]
        _, lang_code, _, _ = language
    else:
        lang_code = 'un'
    return lang_code, text
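# Illustrative call to run above; the text is invented, and args is unused by
# the function body, so None is passed here.
lang_code, original = run("Questa è una breve frase italiana.", None)
# lang_code -> e.g. 'it' (CLD2 itself reports 'un' for text it cannot classify)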
def cld2_detect_language(text):
    """
    Uses CLD2 to detect the language.
    """
    # Format of pycld2.detect:
    #   (Confident in result: bool,
    #    Number of bytes of text: Int,
    #    Triples of detected languages in order of certainty:
    #      (Language name: str,
    #       Language code: str,
    #       Percent of text in this language: float,
    #       Confidence score: float))
    text = CLD2_BAD_CHARS_RE.sub('', text)
    lang = pycld2.detect(text)[2][0][1]

    # Normalize the language code: 'iw' becomes 'he', and 'zh-Hant'
    # becomes 'zh'
    code = langcodes.get(lang).language
    return code
def detect(self, text):
    t = text.encode("utf-8")
    reliable, index, top_3_choices = cld2.detect(t, bestEffort=False)
    self.language = [Language(x) for x in top_3_choices][0]
    return self.language
def main():
    lookup = dict()
    with open('metadata/crosswalk.tsv') as infile:
        for line in infile:
            if line.startswith('Print Id'):
                continue
            ids = line.split()
            if len(ids) < 2 or ids[0] == 'None':
                # Lines like '\tNNNN' split into a single element
                lookup[ids[0]] = None
            else:
                lookup[ids[1]] = ids[0]

    total = 0
    found = 0
    all_agree = 0
    new_agree = 0
    mismatch = 0
    langs = Counter()

    with codecs.open('metadata/booklist.tsv', 'r', 'utf-8') as infile:
        for line in infile:
            line = line.rstrip('\n')
            if line.startswith('Aleph'):
                # handle header
                print('Print sysnum\tFlag\tDetected Lang\tBest Lang\t%s' % line)
                continue
            total += 1
            fields = line.split('\t')
            scanId = fields[0]
            title = fields[7]
            title = cleanTitle(title)

            lang1 = 'unk'
            try:
                lang1 = langdetect.detect(title)
            except LangDetectException:
                # print(('Language detection failed for %s' % line).encode('utf-8'))
                pass

            title = title.encode('utf-8')  # CLD2 needs UTF-8 bytes
            isReliable, textBytesFound, langDetails = cld2.detect(title)  # @UnusedVariable
            lang2 = 'unk'
            if isReliable:
                lang2 = langDetails[0][1]

            origLang = fields[2]
            if origLang and not origLang in ['und', 'mul']:
                origLang = bib2std(origLang)
            if not origLang:
                origLang = 'und'

            newLang = 'unk'
            flag = ''
            bestLang = origLang
            if lang1 == lang2:
                newLang = lang1
                if lang1 == origLang:
                    all_agree += 1
                elif not origLang in ['und', 'mul']:
                    mismatch += 1
                    flag = '*'
                    bestLang = lang1
                else:
                    new_agree += 1
                    if origLang != 'mul':
                        bestLang = lang1
            langs[newLang + '-' + origLang] += 1

            printId = 'None'
            if scanId in lookup:
                printId = lookup[scanId]
                found += 1

            # TODO: Blacklist pig latin, Klingon, etc
            # if lang == 'zzp':
            #     print(lang, title, line)
            print(('%s\t%s\t%s\t%s\t%s' % (printId, flag, newLang, bestLang, line)).encode('utf-8'))

    print('Found print ID for %d of %d total' % (found, total))
    print('Found %d title language mismatches, %d agreed new, %d all 3 agree, total = %d' %
          (mismatch, new_agree, all_agree, total))
    print('Language pair count: %d' % len(langs))
    # Print our language pairs (basically a confusion matrix)
    for k, v in langs.most_common(40):
        if k.find('-mul') < 0:  # Skip multiple languages
            print("%s\t%5d\t%4.2f%%" % (k, v, v * 100.0 / total))
def detect_language(text):
    _, _, unsorted_results = pycld2.detect(text, hintLanguageHTTPHeaders='en,de')
    sorted_results = sorted([d for d in unsorted_results], key=lambda x: -x[3])
    return sorted_results[0][1]
with open(test_file_name_in, "r") as myfile:
    data = myfile.read()

words_test = re.findall("\w+", data)
ngram_test = sys.argv[2]
ngram_test_nr = int(ngram_test, 0)
bigram_counter_test = Counter(ngram_words(words_test, ngram_test_nr))
bigram_letter_counter_test = convert_ngram(bigram_counter_test)

counter = collections.defaultdict(int)
for ngram, value in bigram_letter_counter_test.iteritems():
    actual_list = collections.defaultdict(int)
    for word in ngram.split():
        actual_list[word] += 1

    print 'pycld2'
    print cld2.detect(ngram)
    print 'langid'
    print langid.classify(ngram)

    result1 = test_part(actual_list, order, trigram_witten_bell1)
    result2 = test_part(actual_list, order, trigram_witten_bell2)
    result3 = test_part(actual_list, order, trigram_witten_bell3)
    result4 = test_part(actual_list, order, trigram_witten_bell4)

    test_result[language1] = result1
    test_result[language2] = result2
    test_result[language3] = result3
    test_result[language4] = result4

    sorted_test = sorted(test_result.iteritems(), key=lambda (k, v): v, reverse=True)
    counter[sorted_test[0][0]] += 1