def detect_utterance_langs(reply, history, candidates):
    # Try the reply alone first, then fall back to progressively larger
    # contexts (history, then candidates) until a whitelisted language is found.
    try:
        c = Detector(reply, quiet=True).language.code
        if c in LANG_LIST:
            return c
        raise ValueError(c)
    except Exception:
        try:
            txt = " ".join(history) + " " + reply
            c = Detector(txt, quiet=True).language.code
            if c in LANG_LIST:
                return c
            raise ValueError(c)
        except Exception:
            try:
                txt = " ".join(history) + " " + reply + " " + " ".join(candidates)
                c = Detector(txt, quiet=True).language.code
                if c in LANG_LIST:
                    return c
                return 'unk'
            except Exception:
                return 'unk'

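# Usage sketch for detect_utterance_langs (assumption: LANG_LIST is a
# module-level whitelist of ISO 639-1 codes in the original project; the
# list below is hypothetical).
from polyglot.detect import Detector

LANG_LIST = ["en", "fr", "de", "es"]
code = detect_utterance_langs(
    "Bonjour à tous, j'espère que vous passez une excellente journée.",
    ["Hi there, how is everyone doing?"],
    ["Hello again"],
)
print(code)  # typically 'fr'; falls back to 'unk' if nothing whitelisted matches
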
def language_detection(table_dict, english):
    # Labels each entry as 'en' or 'non-eng' based on its first seven tweets.
    # Note: the original assigned to table_dict['lang'] instead of
    # table_dict[k]['lang'], which discarded the per-key result; fixed here.
    if not english:
        for k in list(table_dict):
            tweets = " ".join(table_dict[k]['tweets'][0:7])
            try:
                detector = Detector(tweets)
                conf = detector.language.confidence
                # Original heuristic: low-confidence detections count as English.
                if conf <= 70.0:
                    table_dict[k]['lang'] = 'en'
                else:
                    table_dict[k]['lang'] = 'non-eng'
            except Exception:
                table_dict[k]['lang'] = 'non-eng'
    else:
        for k in list(table_dict):
            tweets = " ".join(table_dict[k]['tweets'][0:7])
            try:
                detector = Detector(tweets)
                conf = detector.language.confidence
                if conf <= 70.0:
                    table_dict[k]['lang'] = 'en'
            except Exception:
                continue
    return table_dict

def determine_text_languages(string):
    # Split the input into sentences and merge consecutive sentences that
    # share a detected language into chunks. (The original passed the
    # builtin `input` to Text() instead of the `string` parameter; fixed.)
    input_object = Text(string)
    temp_list = []
    for sentence in input_object.sentences:
        detect = Detector(str(sentence))
        if len(temp_list) == 0:
            temp_list.append([str(sentence), detect.language.code])
        elif temp_list[-1][1] == detect.language.code:
            temp_list[-1][0] = temp_list[-1][0] + " " + str(sentence)
        else:
            temp_list.append([str(sentence), detect.language.code])
    new_list = [chunk for chunk, _ in temp_list]
    # Re-detect each merged chunk and record its (start, end) character span.
    output = []
    start = 0
    for chunk in new_list:
        detect = Detector(chunk)
        end = start + len(chunk)
        position = (start, end)
        start = end + 1
        output.append((detect.language.code, position, detect.language.confidence))
    return output

def getWebpageMeanVector(response) -> list:
    metaDescription: str = response.xpath(
        "//meta[@property='og:description']/@content").extract_first()
    if metaDescription:
        metaTitle: str = response.xpath(
            "//meta[@property='og:title']/@content").extract_first()
        if metaTitle:
            webPageTopic: str = metaTitle
        else:
            webPageHeader: str = getPropertyFromHTMLResponse(response, "header").strip()
            webPageTitle: str = getPropertyFromHTMLResponse(response, "title").strip()
            webPageTopic = webPageHeader + ". " + webPageTitle
        return [
            getTextVectors(webPageTopic),
            metaDescription,
            Detector(metaDescription).language.name,
        ]
    else:
        webPageBody: str = getPropertyFromHTMLResponse(response, "body").strip()
        webPageHeader: str = getPropertyFromHTMLResponse(response, "header").strip()
        webPageTitle: str = getPropertyFromHTMLResponse(response, "title").strip()
        wholeWebPageText: str = webPageBody + ". " + webPageHeader + ". " + webPageTitle
        return [
            getTextVectors(wholeWebPageText),
            webPageBody,
            Detector(wholeWebPageText).language.name,
        ]

def process_tu(self, tu, num_of_finished_scans):
    src_lang = Detector(tu.src_phrase, quiet=True).language.code
    trg_lang = Detector(tu.trg_phrase, quiet=True).language.code
    # Accept the translation unit only if both sides match the configured
    # language(s); src_language/trg_language may be a single code or a
    # collection of codes, hence the double check.
    if src_lang != self.src_language and src_lang not in self.src_language:
        return [0]
    if trg_lang != self.trg_language and trg_lang not in self.trg_language:
        return [0]
    return [1]

def decide(self, tu):
    src_lang = Detector(tu.src_phrase, quiet=True).language.code
    trg_lang = Detector(tu.trg_phrase, quiet=True).language.code
    # print("PO: ", src_lang + " to " + trg_lang)
    if src_lang != self.src_language and src_lang not in self.src_language:
        return 'reject'
    if trg_lang != self.trg_language and trg_lang not in self.trg_language:
        return 'reject'
    return 'accept'

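# Usage sketch for decide() (assumption: in the original project this is a
# method on a filter object with src_language/trg_language attributes, and
# `tu` carries src_phrase/trg_phrase; the stubs below are hypothetical).
from types import SimpleNamespace
from polyglot.detect import Detector

flt = SimpleNamespace(src_language="en", trg_language="de")
tu = SimpleNamespace(src_phrase="Good morning, everyone.",
                     trg_phrase="Guten Morgen, alle zusammen.")
print(decide(flt, tu))  # 'accept' if both sides are detected as configured
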
def filter_out_non_english_posts(dataobject):
    """
    given a list of posts, filter in clean monolingual english posts
    :param dataobject: user to posts object
    :return: user to posts clean dictionary
    """
    clean_data = {}
    data = Serialization.load_obj(dataobject)
    for author in data:
        print('processing:', author)
        author_eng_posts = []
        for post in data[author]:
            sentences = []
            for sentence in re.split(r'\.|\! |\? |\n', post):
                # Skip very short fragments; detection is unreliable on them.
                if len(sentence.split()) < 10:
                    continue
                try:
                    detector = Detector(sentence)
                except Exception:
                    continue
                if detector.languages[0].name == 'English' and \
                        detector.languages[0].confidence > DETECTOR_CONFIDENCE:
                    sentences.append(sentence)
            if len(sentences) == 0:
                continue
            author_eng_posts.append('. '.join(sentences))
        if len(author_eng_posts) == 0:
            continue
        clean_data[author] = author_eng_posts
    Serialization.save_obj(clean_data, dataobject + '.clean')
    for author in clean_data:
        print(author, len(clean_data[author]))

def is_valid_lang_name(lang):
    """
    Return True if the given language name (in English) exists in the
    polyglot library; False otherwise.
    :param lang: str (language name in English)
    :return: bool
    """
    return lang.title() in Detector.supported_languages()

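# Quick check, runnable once polyglot is installed; input is normalized with
# .title() so case does not matter.
from polyglot.detect import Detector

print(is_valid_lang_name("english"))  # expected: True
print(is_valid_lang_name("klingon"))  # expected: False
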
async def get_text_language(page_text):
    try:
        detector = Detector(page_text)
    except UnknownLanguage as exc:
        return "un", exc
    return detector.languages[0].code, None

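# Driving the coroutine (assumes polyglot is installed; UnknownLanguage lives
# in polyglot.detect.base, matching the original module's imports).
import asyncio
from polyglot.detect import Detector
from polyglot.detect.base import UnknownLanguage

code, err = asyncio.run(get_text_language("Ceci est un texte en français."))
print(code, err)  # typically ('fr', None); ('un', <exception>) if undetectable
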
def get_sentiment(self, text):
    try:
        try:
            detector = Detector(text)
            language = detector.language.code
        except UnknownLanguage:
            # Very short strings default to English; everything else is
            # routed to the Google-based fallback.
            if len(text) <= 2:
                language = 'en'
            else:
                language = 'google'
        self.language_counter.update({language: 1})
        # print(self.language_counter)
        if language == 'en':
            return self.proccess_eng(text)
        if language == 'google':
            return self.process_google(text)
        try:
            return self.process_poly(text)
        except ZeroDivisionError:
            return 'N'
        except Exception as e:
            print(e)
            return self.process_google(text)
    except Exception as e:
        print(e)
        return self.proccess_eng(text)

def find_langs(raw_text, translation=True):
    """
    :param translation: bool, whether translation posts should be filtered out
    :param raw_text: the raw text from the subreddit
    :return: None if the post is not a code-switch post; otherwise a tuple of
             lang1, lang2, and the confidence of lang2 in the post
    """
    global false_langs
    if "http" in raw_text:
        # Skip posts that contain links: these are too noisy, and it is hard
        # to build a regex that reliably removes them.
        return None
    clean_string = clean_text(raw_text)
    if translation and is_translation(clean_string):
        return None
    detector = Detector(clean_string, quiet=True)
    if "en" != detector.languages[0].code and "en" != detector.languages[1].code:
        # Skip posts that don't contain any English.
        return None
    if detector.languages[0].code in false_langs or \
            detector.languages[1].code in false_langs:
        return None
    if not detector.reliable:
        return None
    lang1 = detector.languages[0].name
    lang2 = detector.languages[1].name
    confidence = detector.languages[1].confidence
    return lang1, lang2, confidence

def get_language(x):
    try:
        return Detector(x).language.code
    except UnknownLanguage:
        return None
    except pycld2.error:
        return None

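# Usage sketch; the imports below mirror what the original module must
# provide for get_language() to run.
import pycld2
from polyglot.detect import Detector
from polyglot.detect.base import UnknownLanguage

print(get_language("Hola, ¿cómo estás? Espero que todo vaya bien."))  # typically 'es'
print(get_language("12345"))  # None when no language can be detected
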
def lang_detect(self, task):
    sentence = task['text']
    languages_result = Detector(sentence, True).languages
    # Aggregate confidence weighted by the number of bytes each language
    # covers, folding locale variants (e.g. 'zh-Hant') into two-letter codes.
    detected = {}
    for lang_result in languages_result:
        code = lang_result.code[:2]
        weight = lang_result.confidence * float(lang_result.read_bytes)
        if code in detected:
            detected[code] += weight
        else:
            detected[code] = weight
    detected_MAX = [max(detected, key=detected.get)]
    # Log
    self.logger.info("Detected:%s:%s" % (str(detected_MAX), sentence))
    for lang in languages_result:
        self.logger.info(lang)
    self.logger.info('---')
    # Build the JSON response. The Language __str__ is fixed-width, so it is
    # normalized and split into fields before indexing.
    json_result = dict()
    json_result['predicted'] = detected_MAX[0]
    json_result['data'] = []
    for ele in languages_result:
        tmp = {}
        tmp_ele = str(ele).replace("  ", " ").split(" ")
        tmp_ele = [item for item in tmp_ele if item != '']
        # tmp['name'] = tmp_ele[1]
        tmp['code'] = tmp_ele[3][:2]
        tmp['score'] = float(detected[tmp['code']])
        tmp['bytes'] = tmp_ele[8]
        json_result['data'].append(tmp)
    return json.dumps(json_result)

def lang_detect(self, text, threshold=90.0):
    # polyglot reports confidence as a percentage (0-100), so the original
    # default of 0.9 accepted almost everything; 90.0 matches the apparent intent.
    detector = Detector(text, quiet=True)
    if detector.language.confidence > threshold:
        return detector.language.code
    raise LanguageNotRecognisedError('Could not recognize the language')

def detect_lang(text, print_error=False, raise_error=False, keep_unreliable=False):
    """
    For detecting language using polyglot, but with exception handling

    Examples:
    >>> detect_lang("This is a test text")
    ('en', 95.0)
    >>> detect_lang(text = "Dette er åbenbart en norsk tekst", keep_unreliable = True)
    ('no', 97.0)
    >>> detect_lang(text = "Dette er åbenbart en norsk tekst. This is also an english text.", keep_unreliable = True)
    """
    from polyglot.detect import Detector
    import numpy as np

    text = str(text)
    try:
        detector = Detector(text, quiet=True)
        if detector.reliable or keep_unreliable:
            lang = detector.language
            return lang.code, lang.confidence
    except Exception as e:
        if print_error and not raise_error:
            print(e)
        if raise_error:
            raise
    return np.nan, np.nan

def langDetect(content):
    global languageDetected
    printmessage("Detecting languages")
    langDict = {}
    try:
        languages = Detector(content).languages
        for language in languages:
            code = language.code
            if code != 'un':
                langDict[code] = language.confidence
    except Exception as e:
        printmessage("Language detection error -->" + str(e))
        # Fall back to langid, whose score is an unbounded log-probability;
        # clamp it into a rough 0-99 "confidence".
        languages = langid.classify(content)
        # printmessage(languages)
        code = languages[0]
        confidence = abs(languages[1])
        if confidence > 99:
            confidence = 99
        else:
            confidence = round(confidence, 0)
        langDict[code] = confidence
    printmessage(langDict)
    return langDict

def detect_langs(text):
    # The original returned None implicitly when the detected code was not
    # whitelisted; return "unk" in that case too, matching the except branch.
    try:
        c = Detector(text, quiet=True).language.code
        if c in LANG_LIST:
            return c
        return "unk"
    except Exception:
        return "unk"

def get_valid_videos(tmp_dir: str, lang_code: str) -> list:
    valid_videos = []
    for tmp_file in os.listdir(tmp_dir):
        tmp_file_path = os.path.join(tmp_dir, tmp_file)
        if not os.path.isfile(tmp_file_path):
            logging.error(f'File {tmp_file} does not exist')
            continue
        with open(tmp_file_path) as tmp_json_file:
            try:
                metadata = json.load(tmp_json_file)
            except ValueError:
                logging.error(f'Failed to decode json from file {tmp_file}')
                delete_file(tmp_file_path)
                continue
        text = metadata['title'].strip() + ' ' + metadata['description'].strip()
        # Remove non-printable characters/symbols, which sometimes cause
        # errors in the polyglot Detector.
        printable_str = ''.join(x for x in text if x.isprintable())
        detector = Detector(printable_str, quiet=True)
        if detector.language.code == lang_code:
            valid_videos.append(metadata['id'])
        delete_file(tmp_file_path)
    return valid_videos

def get_language(reader):
    """
    Gets language of descriptions.
    :param reader: CSV file content
    :return: string
    """
    desc = [x[4] for x in reader]
    text = ' '.join(desc)
    try:
        lang = Detector(text).language.name
        if lang == "un":
            return test_for_language(desc)
        return lang.lower()
    except Exception:
        return test_for_language(desc)

def detect_lang(text1: str, name: Union[bool, int] = False) -> str:
    """
    Detect Chinese and other languages using polyglot.

    Return the lowercased language name if `name` is truthy,
    otherwise the two-letter code.
    """
    if not text1.strip():
        detected = "english" if name else "en"
    else:
        try:
            # detected = Detector(text1).languages[0].code
            lang = Detector(text1).language
            detected = lang.name.lower() if name else lang.code
        except Exception as exc:
            LOGGER.debug(
                " Detector(text1).language failed: %s, setting to 'en'/'english' ",
                exc,
            )
            detected = "english" if name else "en"
    return detected

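# Usage sketch (assumes polyglot plus a module-level LOGGER, e.g. the one below):
import logging
from typing import Union
from polyglot.detect import Detector

LOGGER = logging.getLogger(__name__)

print(detect_lang("Hello, world"))             # 'en'
print(detect_lang("Hello, world", name=True))  # 'english'
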
def recognition_language(line):
    # (The original named the Detector instance `language`, giving the
    # confusing `language.language.name`; renamed for clarity.)
    try:
        detector = Detector(line)
    except Exception:
        print(line)
    else:
        return detector.language.name

def detect():
    query = request.json['text']
    detector = Detector(query, quiet=True)
    locl = detector.language.locale.getName().replace('_', '-')
    conf = detector.language.confidence
    read = detector.language.read_bytes
    parsed = []
    try:
        blob = Text(query)
        for entity in blob.entities:
            eobj = {}
            eobj['tag'] = entity.tag
            eobj['entity'] = entity
            parsed.append(eobj)
    except Exception:
        pass
    return {
        "locale": locl,
        "confidence": conf,
        "read_bytes": read,
        "entities": parsed,
    }

def process_text(text_data):
    # Strip non-printable characters and flatten hashtags/newlines.
    text_data = ''.join(x for x in text_data if x.isprintable())
    text_data = text_data.replace("#", " ")
    text_data = text_data.replace("\n", " ")
    languages = Detector(text_data, quiet=True).languages
    word_list = []
    if languages[0].code in ["ko"]:
        tokens = okt.pos(text_data)
        # print(tokens)
        for token in tokens:
            word = token[0]
            if token[1] in ['Foreign', 'Number', 'URL', 'Email',
                            'ScreenName', 'Hashtag']:
                # all Hashtag remaining are Japanese
                continue
            elif token[1] == 'Alpha':
                word = word.lower()
            # Re-join the split '그램' suffix (e.g. 스타 + 그램 -> 스타그램).
            if word == '그램':
                if len(word_list) > 0:
                    if word_list[-1] == '스타':
                        word_list[-1] = '스타그램'
                    elif word_list[-1] == '맛스타':
                        word_list[-1] = '맛스타그램'
                    else:
                        word_list.append(word)
                else:
                    word_list.append(word)
            else:
                word_list.append(word)
    return word_list

def detect_lang(self):
    result = []
    for line in self.data:
        lang = Detector(line, quiet=True).language.name
        result.append([line, lang])
    self.save(result)

def detect():
    # (Renamed the original `json` local, which shadowed the json module.)
    payload = request.get_json()
    text = payload['text']
    result = {'result': False}
    try:
        detector = Detector(text)
        lc = detector.language.code
        fixCode = languageCodes.get(lc, lc)
        result = {
            'result': True,
            'text': text,
            'language': detector.language.name,
            'code': fixCode,
        }
    except Exception as e:
        print(e)
    return jsonify(result)

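# Hypothetical client call, assuming this handler is registered on a Flask
# app at /detect (the route decorator is outside this snippet):
#
#   curl -X POST http://localhost:5000/detect \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Hola mundo"}'
#
# Expected response shape:
#   {"result": true, "text": "...", "language": "...", "code": "..."}
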
def language_detection(input_keyword):
    language_list = ["English"]
    # text = Text(input_keyword)
    input_keyword = input_keyword.lower()
    detector = Detector(input_keyword, quiet=True)
    detected_lang = detector.language.name
    confidence = detector.language.confidence
    keyword = keyword_lemmatization(input_keyword)
    try:
        if detected_lang in language_list:
            print("Root of Input keyword is {}".format(keyword))
        elif detected_lang == "un":
            raise Exception(
                "Sorry!! cannot detect a language for {}; the BNG Model can "
                "only work with English keywords".format(input_keyword))
        else:
            raise Exception(
                "Detected language is {}; the BNG Model can only work with "
                "English keywords".format(detected_lang))
    except Exception as e:
        # logging.error(e)
        print(e)
        login_function(e, "warning.log")
    return detected_lang, confidence, keyword

def detectLanguage(self, text):
    try:
        detectionResult = Detector(text)
        lang = detectionResult.language.code
    except Exception:
        lang = "un"
    return lang

def save_english_by_paragraph(filename, filename_out):
    """Processes the original corpus, collapses paragraphs into a single
    string and saves English paragraphs

    Args:
        filename: path to the original corpus
        filename_out: path where to store the English-language corpus

    Returns:

    """
    try:
        from polyglot.detect import Detector
        from polyglot.detect.base import UnknownLanguage

        def keep_if_english(paragraph, fout):
            try:
                la = Detector(paragraph, quiet=True).language.code
            except UnknownLanguage:
                la = 'un'
            if la == 'en':
                fout.write('{}\n'.format(preprocess(paragraph)))

        with open(filename) as fin, open(filename_out, 'w') as fout:
            paragraph = ''
            for line in fin:
                line = line.strip()
                if line:
                    paragraph += ' ' + line
                else:
                    if paragraph:
                        keep_if_english(paragraph, fout)
                        paragraph = ''
            # Fix: also flush the final paragraph when the file does not end
            # with a blank line (the original silently dropped it).
            if paragraph:
                keep_if_english(paragraph, fout)
    except ImportError:
        print('Error: polyglot has not been installed')
        print('to install polyglot:')
        print('install icu4c - see instruction in the readme file')
        print('pip install polyglot')

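# Usage sketch with hypothetical file names; `preprocess` must be supplied by
# the surrounding module (it is not defined in this snippet).
save_english_by_paragraph('corpus_raw.txt', 'corpus_en.txt')
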
def detect_language(tweet):
    # tweet = filter(lambda x: x in string.printable, tweet)
    # is_utf8 = isUTF8Strict(tweet)
    # if is_utf8:
    #     tweet = tweet.decode('utf-8')
    try:
        print(tweet)
        languages = Detector(tweet, quiet=True).languages
        is_english = False
        max_confidence = 0
        for language in languages:
            if language.name == "English":
                max_confidence = language.confidence
                print(language.confidence)
                # Require at least 93% confidence in English ...
                if float(language.confidence) >= 93.0:
                    is_english = True
                else:
                    is_english = False
            else:
                # ... and reject the tweet if any other language covers a
                # non-trivial share (>= 10%).
                if float(language.confidence) >= 10.0:
                    is_english = False
        return is_english
    except UnicodeDecodeError:
        print("UnicodeDecodeError")
        return False

def find_error(source):
    # Python 2 worker: scans (index, sentence) pairs and collects the indices
    # of sentences whose detected language matches the global target t_lang.
    delete_list = []
    t0 = time.time()
    found = 0
    proc_name = multiprocessing.current_process().name
    # print "Current process:%s" % proc_name
    for runner, two in enumerate(source):
        index = two[0]
        sentence = two[1]
        if runner % 1000 == 0:
            # Report throughput for the last 1000-line chunk.
            t1 = time.time()
            sys.stdout.write("PROC:" + proc_name + ",Line:" + str(runner) +
                             ",Time Cost:" + str(1000.0 / (t1 - t0)) + "lines/s\r")
            t0 = t1
            sys.stdout.flush()
        tmp_s = sentence.decode('utf8')  # Python 2: bytes -> unicode
        try:
            detected = Detector(tmp_s).language.code[:2]
            if detected == t_lang:
                # print "INDEX:%d,%s" % (index, sentence)
                delete_list.append(index)
                found += 1
        except BaseException:
            pass
    # print "Found ERROR sentences : %d" % found
    return delete_list, found
