def DetectLanguageForScan(filePath, countryLanguage, sample): pdfReader = PyPDF2.PdfFileReader(filePath) pdfWriter = PyPDF2.PdfFileWriter() pagesNumber = CountPagesNumber(filePath) lan1 = "" temporaryFolderPath = "Temporary Folder" if os.path.exists(temporaryFolderPath): shutil.rmtree(temporaryFolderPath) os.makedirs(temporaryFolderPath) while (len(lan1) < sample): randomPage = random.sample(range(0, pagesNumber), 1)[0] pdfWriter.addPage(pdfReader.getPage(randomPage)) temporaryFilePath = temporaryFolderPath +"/out_" + str(randomPage) + ".pdf" stream = open(temporaryFilePath, "wb") pdfWriter.write(stream) stream.close() lan1 = "" + pytesseract.image_to_string(pdf2image.convert_from_path(temporaryFilePath)[0], lang=countryLanguage) if (len(lan1) < sample): os.remove(temporaryFilePath) lan2 = "" + pytesseract.image_to_string(pdf2image.convert_from_path(temporaryFilePath)[0],lang='eng') shutil.rmtree(temporaryFolderPath) out_lan1 = langdetect.detect_langs(lan1)[0] out_lan2 = langdetect.detect_langs(lan2)[0] lang=re.findall(r"[a-zA-Z]+",str(max(out_lan1,out_lan2)))[0] return lang
def get_language():
    """Detect whether the language is Chinese."""
    from Ref_Data import replace_word
    import json
    from langdetect import detect_langs
    from langdetect.lang_detect_exception import LangDetectException

    train = input.read_dataset('train.csv').fillna(replace_word['unknow'])
    test = input.read_dataset('test.csv').fillna(replace_word['unknow'])
    records = {}
    for index, row in tqdm(train.iterrows()):
        try:
            lang_prob = detect_langs(row['comment_text'])
            language = lang_prob[0].lang
            if language != 'en':
                records['tr' + str(index)] = (row['comment_text'], language, lang_prob[0].prob)
        except LangDetectException:
            records['tr' + str(index)] = (row['comment_text'], 'none', 0)
    for index, row in tqdm(test.iterrows()):
        try:
            lang_prob = detect_langs(row['comment_text'])
            language = lang_prob[0].lang
            if language != 'en':
                records['te' + str(index)] = (row['comment_text'], language, lang_prob[0].prob)
        except LangDetectException:
            records['te' + str(index)] = (row['comment_text'], 'none', 0)
    records = sorted(records.items(), key=lambda item: item[1][2], reverse=True)
    with open('language_record.json', 'w') as f:
        f.write(json.dumps(records, indent=4, separators=(',', ': '), ensure_ascii=False))
def language_check(dataframe=None):
    """
    Function responsible for checking whether a song lyric is in the English language or not.
    """
    index_to_remove = []
    progress_bar = tqdm(dataframe[~dataframe['lyrics'].isnull()].index.to_list())
    for index in progress_bar:
        if isinstance(index, tuple):
            progress_bar.set_description("Processing %s" % index[0] + ' , ' + index[1])
        else:
            progress_bar.set_description("Processing %s" % index)
        try:
            if isinstance(index, tuple):
                if 'en' not in [item.lang for item in detect_langs(dataframe['lyrics'].loc[index[0]].loc[index[1]])]:
                    index_to_remove.append(index)
            else:
                if 'en' not in [item.lang for item in detect_langs(dataframe['lyrics'].loc[index])]:
                    index_to_remove.append(index)
        except:
            index_to_remove.append(index)
    return index_to_remove
def ConvertFileToText(path, language): text = ConvertPdftoText(path) pagesNumber = CountPagesNumber(path) scannedFile = 0 if text in ["\x0c" * pagesNumber, ""]: scannedFile = 1 text = ConvertScanToText(path, language) languageEstimated = LanguageName(str(langdetect.detect_langs(text))[1:3]) # If the pdf language is confusing, extract the text with a more precise tool (but less efficient) if ((LanguageName(str(langdetect.detect_langs(text))[1:3]) != language) & (scannedFile == 0)): prm = PDFResourceManager() iob = io.BytesIO() device = TextConverter(prm, iob, codec = "utf-8", laparams = LAParams()) pdf = open(path, "rb") interpreter = PDFPageInterpreter(prm, device) for page in PDFPage.get_pages(pdf, set(), maxpages = 0, password = "", caching = True, check_extractable = True): interpreter.process_page(page) text = iob.getvalue() pdf.close() device.close() iob.close() languageEstimated = LanguageName(str(langdetect.detect_langs(text))[1:3]) return text, scannedFile, languageEstimated
def auto_detect_text(filename): import platform if platform.system() != 'Darwin': pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' text_eng = pytesseract.image_to_string(Image.open(filename), lang='eng') text_spa = pytesseract.image_to_string(Image.open(filename), lang='spa') text_fra = pytesseract.image_to_string(Image.open(filename), lang='fra') text_hin = pytesseract.image_to_string(Image.open(filename), lang='hin') possible_languages = [] lang_text_map = {} if text_eng: possible_languages.append(str(detect_langs(text_eng)[0]).split(":")) lang_text_map['en'] = text_eng if text_spa: possible_languages.append(str(detect_langs(text_spa)[0]).split(":")) lang_text_map['es'] = text_spa if text_fra: possible_languages.append(str(detect_langs(text_fra)[0]).split(":")) lang_text_map['fr'] = text_fra if text_hin: possible_languages.append(str(detect_langs(text_hin)[0]).split(":")) lang_text_map['hi'] = text_hin if possible_languages: res = max(possible_languages, key=lambda li: li[1]) if res[0] not in ACCEPTED_LANGUAGES: return 'XX', 'XX' else: return lang_text_map[res[0]], res[0] else: return 'XX', 'XX'
def which_lan(cap_clean, rownum):
    try:
        lang_temp = Series(str(detect_langs(cap_clean[rownum])[0]).split(':')[0])
        lang_prob_temp = Series(str(detect_langs(cap_clean[rownum])[0]).split(':')[1])
    except:
        return None
    print(rownum)
    res = concat([lang_temp, lang_prob_temp], axis=1)
    return res
def judge_pure_english(self, text):
    try:
        lang1 = detect_langs(text)[0]
    except UnicodeDecodeError:
        lang1 = detect_langs(text.decode("utf-8"))[0]
    prob = lang1.prob
    lang = lang1.lang
    if prob > 0.90:
        return lang
    return None
def data_from_CSV(): conn = establish_DB_connection_SQL() df = pd.read_csv(sys.argv[1], keep_default_na=False) tweets = df.set_index('status_id', drop=False) for index, row in tweets.iterrows(): phone_no = '' hashtag = [] text = str(row['text']) if str(detect_langs(text)[0])[0:2] == 'hi' and text != 'nan': text1 = text authenticator = IAMAuthenticator( 'worUzb_Eb5emCaIs0oL7sR86Fb2LeTJGOk1EN1Q-4Cni') language_translator = LanguageTranslatorV3( version='2018-05-01', authenticator=authenticator) language_translator.set_service_url( 'https://api.eu-gb.language-translator.watson.cloud.ibm.com/instances/cdafbc7e-b59a-40f8-818f-1914f02063cc' ) translation = language_translator.translate( text=text1, model_id='hi-en').get_result() output = json.loads( json.dumps(translation, indent=2, ensure_ascii=False)) text = output['translations'][0]['translation'] if str(detect_langs(text)[0])[0:2] == 'en' and text != 'nan': text = data_cleaning(text) words = text.split() words = set(words) words = list(words) for w in words: if re.match( '^((\+){0,1}91(\s){0,1}(\-){0,1}(\s){0,1}){0,1}0{0,1}[1-9]{1}[0-9]{9}$', w): phone_no = w[-10:] if w.startswith('#'): hashtag.append(w) request_type = processML(text) if request_type != '': name = str(row['screen_name']) if name == '': name = None if text == '': text = None if phone_no == '': phone_no = 9900990099 latitude = 9.9252 longitude = 78.1198 updateDB_SQL(row['screen_name'], text, latitude, longitude, phone_no, request_type, conn) disconnect_DB_SQL()
def _detect_subtitle_language(srt_path): log.debug('Detecting subtitle language') # Load srt file (try first iso-8859-1 with fallback to utf-8) try: subtitle = pysrt.open(path=srt_path, encoding='iso-8859-1') except Exception: try: subtitle = pysrt.open(path=srt_path, encoding='utf-8') except Exception: # If we can't read it, we can't detect, so return return None # Read first 5 subtitle lines to determine the language if len(subtitle) >= 5: text = '' for sub in subtitle[0:5]: text += sub.text # Detect the language with highest probability and return it if it's more than the required minimum probability detected_languages = langdetect.detect_langs(text) log.debug('Detected subtitle language(s): %s', detected_languages) if len(detected_languages) > 0: # Get first detected language (list is sorted according to probability, highest first) detected_language = detected_languages[0] language_probability = detected_language.prob if language_probability >= autosubliminal.DETECTEDLANGUAGEPROBABILITY: log.debug('Probability of detected subtitle language accepted: %s', detected_language) return Language.fromietf(detected_language.lang) else: log.debug('Probability of detected subtitle language too low: %s', detected_language) return None
def return_data(self, **kwargs) -> dict:
    retard_format = detect_langs(kwargs['text'])
    out_dict = {}
    for l in retard_format:
        out_dict[l.lang] = l.prob
    return out_dict
def clean_data(inputFile, cutoff=0.95): """Drops all empty rows, and initializes a number of counter variables. Uses the langdetect library to generate a language code and confidence. This is then split into component parts. If the identifier is 'en' for english, and the confidence is above the cutoff (0.95 used to process data), the index of that row is added to a list. Else if the labels ISO code is not the same as the detected language and the confidence is above the cutoff, that index is also added to the list. A progress counter and timer were added for convenience as the cleaner took a long time to run. Once complete, all rows of the corresponding indices were dropped from the table. This dataframe was then saved to a csv. Relevant statistics are printed at time of termination.""" ISOcodes = {'sk': 0, 'fr': 1, 'es': 2, 'de': 3, 'pl': 4} df = pd.read_csv(inputFile, encoding="utf8") df['text'].replace('', np.nan, inplace=True) df.dropna(subset=['text'], inplace=True) total = len(df) englishCount, misclassifiedCount, count = 0, 0, 0 hitList = [] startTime = time() for line in df.iterrows(): label = line[1]["label"] text = line[1]["text"] try: detectedLanguage = detect_langs(text) language = str(detectedLanguage[0]).split(":") if language[0] == 'en': if float(language[1]) > cutoff: englishCount += 1 hitList.append(count) elif label != ISOcodes[language[0]]: if float(language[1]) > cutoff: misclassifiedCount += 1 hitList.append(count) except: pass count += 1 if count % 1000 == 0: percentComplete = count * 100 / total now = time() timeLeft = (1 - count / total) * ( (now - startTime) / 60) / (count / total) timeLeft = str(round(timeLeft, 2)).split(".") minutes = timeLeft[0] seconds = (float(timeLeft[1]) / 100) * 60 print("Percent Complete: {}%".format(round(percentComplete, 2))) print("Time Left: {}:{:02d}".format(minutes, int(seconds))) df.drop(df.index[hitList], inplace=True) now = time() print("Number of English examples removed: {}".format(englishCount)) print("Number of misclassified examples removed: {}".format( misclassifiedCount)) print("Number of rows originally in dataframe: {}".format(total)) print("Percent of training examples classified as English: {}%".format( round(englishCount * 100 / total, 2))) print("Percent of training examples classified as incorrect: {}%".format( round(misclassifiedCount * 100 / total, 2))) print("New dataframe length: {}".format(len(df))) print("Actual time taken in minutes: {}".format((now - startTime) / 60)) return df
def _detect_message_language(message):
    lang = detect_langs(message)[0]
    if lang.prob > 0.99:
        lang = lang.lang
    else:
        lang = 'en'
    return languages.get(alpha_2=lang).name
def assert_language(text, expected_language): langs = detect_langs(text) langs = [ MatchedLang({ 'cs': 'cze', 'en': 'eng' }.get(x.lang), x.prob) for x in langs ] if not langs or langs[0].prob < 0.50: return expected_language # unable to decide if langs[0].lang != expected_language: if langs[0].lang is None: # unknown language detected return expected_language if len(text) < 15: # text too short to say return expected_language # detected but different log.warning( 'Warning: error: language does not match. Expected %s, has %s, langs %s, value %s', expected_language, langs[0].lang, langs, text) if langs: # return the detected language return langs[0].lang return expected_language
def isLang(post: str, targetLang: str) -> bool:
    lang = None
    try:
        lang = str(detect_langs(post)[0])
    except:
        return False
    return targetLang in lang
def find_language(text): language_list = list(detect_langs(text)) language_text = "<p>" results_list = [] for lang in language_list: l, p = str(lang).split(":") results_list.append(l) language_text += f"<p>{l.upper()}: {round(float(p),2)}</p>" ''' if results_list == ['en', 'fr'] or results_list == ['fr', 'en']: language_text += "EN/FR" elif results_list == ['fr']: language_text += "FR" else: language_text += "EN"''' language_text += "</p>" return language_text
def detect_songs_language(song_lyrics):
    """
    Takes the lyrics of a song and returns the languages that it has
    and the probabilities in a list of tuples

    Args:
        song_lyrics: str
    returns:
        lang_probs = list of tuples (lang, probability)
    """
    try:
        probs = langdetect.detect_langs(song_lyrics)
        lang_probs = list()
        for prob in probs:
            str_lang_prob = str(prob)
            lang_prob = get_lang_probability(str_lang_prob)
            lang_probs.append(lang_prob)
        return lang_probs
    except Exception as e:
        print(e)
        # if error return no english language
        # to delete that particular song
        lang_probs = [("en", 0.9)]
        return lang_probs
def languages_with_examples(self): resp = {} try: for (source, posts) in self.altmetric_api_raw["posts"].iteritems(): for post in posts: for key in ["title", "summary"]: try: num_words_in_post = len(post[key].split(" ")) top_detection = langdetect.detect_langs(post[key])[0] if (num_words_in_post > 7) and (top_detection.prob > 0.90): if top_detection.lang != "en": language_name = get_language_from_abbreviation(top_detection.lang) # print u"LANGUAGE:", language_name, top_detection.prob, post[key] # overwrites. that's ok, we just want one example resp[language_name] = post["url"] except langdetect.lang_detect_exception.LangDetectException: pass except (KeyError, AttributeError, TypeError): pass return resp
def identify( self, text, constrain_to_discussion_locales=SECURE_IDENTIFICATION_LIMIT): "Try to identify locale of text. Boost if one of the expected locales." if not text: return Locale.UNDEFINED, {Locale.UNDEFINED: 1} len_nourl = self.strlen_nourl(text) if len_nourl < 5: return Locale.NON_LINGUISTIC expected_locales = set(( Locale.extract_root_locale(l) for l in self.discussion.discussion_locales)) language_data = detect_langs(text) if constrain_to_discussion_locales and ( len_nourl < constrain_to_discussion_locales): data = [(x.prob, x.lang) for x in language_data if Locale.any_compatible( Locale.extract_root_locale(x.lang), expected_locales)] else: # boost with discussion locales. data = [ (x.prob * ( 5 if Locale.Locale.extract_root_locale(x.lang) in expected_locales else 1 ), x.lang) for x in language_data] data.sort(reverse=True) top = data[0][1] if (data and (data[0][0] > 0.5) ) else Locale.UNDEFINED return top, {lang: prob for (prob, lang) in data}
def ocr_core(filename): im = Image.open(filename) im = im.point(lambda p: p > 75 and p + 100) text = pytesseract.image_to_string(im) lang = str(detect_langs(text)[0])[:2] confidence = float(str(detect_langs(text)[0])[3:]) if lang == 'en': while confidence < 0.99: im = im.rotate(-90) text = pytesseract.image_to_string(im) confidence = float(str(detect_langs(text)[0])[3:]) print('Processing ...') else: print('No English Detected') return text
def data_from_CSV(): conn=establish_DB_connection_SQL() df=pd.read_csv(sys.argv[1], keep_default_na=False) tweets = df.set_index('status_id',drop=False) for index,row in tweets.iterrows(): phone_no='' hashtag=[] text=str(row['text']) if str(detect_langs(text)[0])[0:2] == 'en' and text!='nan': text = data_cleaning(text) words=text.split() words=set(words) words=list(words) for w in words: if re.match('^((\+){0,1}91(\s){0,1}(\-){0,1}(\s){0,1}){0,1}0{0,1}[1-9]{1}[0-9]{9}$',w): phone_no = w[-10:] if w.startswith('#'): hashtag.append(w) request_type = processML(text) if request_type != '': name=str(row['screen_name']) if name == '': name = None if text == '': text = None if phone_no == '': phone_no = 9900990099 latitude = 9.9252 longitude = 78.1198 updateDB_SQL(row['screen_name'],str(row['text']),latitude,longitude,phone_no,request_type, conn) disconnect_DB_SQL()
def check_language(self, msg, target=None): """Check the language of the message. Add the result to the metadata and and trigger the rule if it is present in the config and the languages are not in the ok list. :return True if the message language is unwanted and False otherwise """ prob = self["textcat_acceptable_prob"] results = langdetect.detect_langs(msg.text) self.ctxt.log.debug("TextCat results: %s", results) langs = [lang.lang for lang in results if lang.prob > prob] if len(langs) > self["textcat_max_languages"]: self.ctxt.log.debug("Too many languages.") return False msg.plugin_tags["LANGUAGES"] = " ".join(langs) ok_languages = self["ok_languages"] if "all" in ok_languages: # All good. return False for lang in langs: if lang not in ok_languages: return True return False
def is_lang(text, lang, prob):
    # print(is_lang("Smth", "ru", 0.75))
    try:
        list_of_languages = detect_langs(text)
        return (len(list_of_languages) > 0
                and list_of_languages[0].lang == lang
                and list_of_languages[0].prob > prob)
    except Exception:
        print("error in detecting language for text: \"" + text + "\"")
        return False
def detect(self, strict=True): try: summary = {} for i in range(0, TIMES): results = detect_langs(self.text) for res in results: lang = res.lang prob = float(res.prob) #float if not lang in summary: summary[lang] = prob else: summary[lang] += prob languages = sorted(summary, key=summary.get, reverse=True) language = languages[0] logger.info("language detection: lang = {} ; summary = {}".format( language, summary)) if strict: return language else: return languages except: logger.info( "failed when detecting language for text: {}\nError: {}". format(self.text, traceback.format_exc())) return None
def CheckLanguage(self, text):
    # identifier.set_languages(DETECT_LANGUAGES)
    try:
        langs = langdetect.detect_langs(text)
    except UnicodeDecodeError:
        langs = langdetect.detect_langs(text.decode("utf-8"))
    sorted_lang = sorted(langs)
    for lang in langs:
        prob = lang.prob
        lang = lang.lang
        is_very_probable = (prob > 0.50)
        if (is_very_probable):
            return lang
    return None
def detect_languages(post_text): try: langs = detect_langs(post_text) lang_dict = dict() for lang in langs: lang = lang.__repr__() lang_dict[lang.split(':')[0]] = float(lang.split(':')[1]) except: lang_dict = dict() # Romansh (Rumantsch) cannot be detected by langdetect de = lang_dict.get('de', 0) fr = lang_dict.get('fr', 0) it = lang_dict.get('it', 0) en = lang_dict.get('en', 0) lang = 'unclassified' lang_loading = 0 if de > lang_loading: lang = 'de' lang_loading = de if fr > lang_loading: lang = 'fr' lang_loading = fr if it > lang_loading: lang = 'it' lang_loading = it if en > lang_loading: lang = 'en' return lang, de, fr, it, en
def guess(string):
    if len(string) < 25:
        # we cannot guess accurately on short strings
        return 'UNKNOWN'
    r = langdetect.detect_langs(string)[0]
    return r.lang if r.prob >= 0.75 else 'UNKNOWN'
def find_out_language(candidate_languages, *args):
    candidates = []
    for sample in args:
        candidate = guess_language(sample)
        if candidate != UNKNOWN_LANGUAGE and candidate in candidate_languages:
            candidates.append(candidate)
        try:
            for candidate in detect_langs(sample):
                if candidate.lang in candidate_languages:
                    candidates.append(candidate.lang)
        except LangDetectException:
            continue
    if len(candidates) == 0:
        return None
    # pick the candidate that was detected most often
    leading_candidate = {
        'lang': candidates[0],
        'count': candidates.count(candidates[0])
    }
    for candidate in candidates[1:]:
        if leading_candidate['count'] < candidates.count(candidate):
            leading_candidate['lang'] = candidate
            leading_candidate['count'] = candidates.count(candidate)
    if leading_candidate['lang'] == UNKNOWN_LANGUAGE:
        return None
    return leading_candidate['lang']
def tweeter_user_lang_detect(user: str, limit: int = 10, csv_path: str = pj(TWEETS_DIR, TWEETS_FILENAME), delete_csv: bool = True, scrap: bool = True) -> str: try: if scrap: scrap_tweets(user=user, limit=limit, group='') tweets = pd.read_csv(csv_path, header=0) except: with open('failed.txt', 'a') as myfile: print('failed') myfile.write(user + '\n') return (0, 'bug') lang_probs = defaultdict(list) for tweetLang in chain( *[detect_langs(i) for i in preprocess_tweet(tweets.tweet)]): lang_probs[tweetLang.lang].append(tweetLang.prob) if delete_csv: os.remove(csv_path) os.remove(csv_path.replace('tweets.csv', 'users.csv')) return max((np.mean(v), k) for k, v in lang_probs.items())[1]
def which_language(anchor):
    try:
        langs = detect_langs(anchor)
        return str(langs[0])[:2]
    except:
        # if it is not possible to estimate the language (numbers, acronyms, etc.), assume English
        return 'en'
def detect_all():
    detector = UniversalDetector()
    results = []
    for file in os.listdir(store_dir):
        if file.endswith(".txt"):
            with open(store_dir + '/' + file, 'r') as myfile:
                data = ''
                for line in myfile.readlines():
                    data += line
                    detector.feed(line)
                    # if detector.done: break
                detector.close()
                # If the file's encoding could be detected
                if detector.result['encoding']:
                    print(detect_langs(data.decode(detector.result['encoding'])))
                    dlang = detect(data.decode(detector.result['encoding']))
                    if dlang == 'fr':
                        print(data)
                    results.append({'mail': file, 'lang': dlang})
                else:
                    results.append({'mail': file, 'lang': 'error'})
    stats = {}
    for result in results:
        lang = result['lang']
        if lang in stats:
            stats[lang] = stats[lang] + 1
        else:
            stats[lang] = 1
        if lang != u'en':
            print(str(lang) + ' - ' + str(result['mail']))
    return stats
def receive_feedback(bot, update): feedback_msg = update.message.text valid_lang = False langdetect.DetectorFactory.seed = 0 langs = langdetect.detect_langs(feedback_msg) for lang in langs: if lang.lang in ("en", "zh-tw", "zh-cn"): valid_lang = True break if not valid_lang: update.message.reply_text(_("The feedback you sent is not in English or Chinese. Please try again.")) return 0 install_lang(update.message.from_user.id) update.message.reply_text(_("Thank you for your feedback, I will let my developer know.")) if is_email_feedback: server = smtplib.SMTP(smtp_host) server.ehlo() server.starttls() server.login(dev_email, dev_email_pw) text = "Feedback received from %d\n\n%s" % (update.message.from_user.id, update.message.text) message = "Subject: %s\n\n%s" % ("Telegram Big Two Bot Feedback", text) server.sendmail(dev_email, dev_email, message) else: logger.info("Feedback received from %d: %s" % (update.message.from_user.id, update.message.text)) return ConversationHandler.END
def fun_loc(arg):
    r2 = False
    loc = arg['location']
    pl = arg['place']
    if re.search(string_loc, loc, re.IGNORECASE) != None or re.search(
            r'(\W|\b)India(\W|\b)|हिन्दुस्तान|भारत', pl, re.IGNORECASE) != None:
        # if the location is desirable, check whether the tweet needs translation
        a = 0
        b = len(arg['text']) - 1
        if arg['display_text_range'] != None:
            a = arg['display_text_range'][0]
            b = arg['display_text_range'][1]
        try:
            # obtain the list of possible languages in d
            d = detect_langs(arg['text'][a:b])
        except:
            # if detect_langs() throws an exception, the tweet is useless: it contains only
            # smileys, only a link (a shared tweet), or a language that can't be understood,
            # so for such tweets we return False even if the location is desirable
            return False, None
        for el in d:
            # among the detected languages, check whether any of them matches the desired languages
            if el.lang in tr_lang:
                r2 = True
                break
        return True, r2
    return False, None
def update_languages(): response = gspread.get_sheet_values(SHEET_ID_INSTAGRAM, "hashtag", "FORMULA") label_list, hashtag_list = gspread.convert_to_dict_data(response) for index, hashtag in enumerate(hashtag_list): name = hashtag['name'] print(name) try: detect_list = detect_langs(name) languages = [detect.lang for detect in detect_list] print(languages) except Exception as e: print(e) continue new_data = hashtag_list[index] new_data['languages'] = ','.join(languages) hashtag_list[index] = new_data body = { 'values': gspread.convert_to_sheet_values(label_list, hashtag_list) } gspread.update_sheet_values(SHEET_ID_INSTAGRAM, 'hashtag', body) print("SUCCESS!! update_languages")
def lan_prob(text):
    from langdetect import detect_langs
    try:
        lang_prob = detect_langs(text)
        return lang_prob
    except:
        return 'N/A'
def get_language(text):
    """Classify the language of text.

    Uses Google's language detection algorithm to assign a language to a
    text string.

    Parameters
    ----------
    text : str
        The text to classify.

    Returns
    -------
    str
        The most likely language of the text that meets the
        :const:`~PROBABILTY` threshold, among the list of supported
        :const:`~LANGUAGES`. If no supported language meets the threshold,
        returns the value 'none'.

    Notes
    -----
    When associated with a field called "language", the string "none" tells
    MongoDB's text index to use simple tokenization with no list of stop
    words and no stemming. See
    http://docs.mongodb.org/manual/reference/text-search-languages/ for more
    info.
    """
    results = detect_langs(text)
    for result in results:
        if result.lang in LANGUAGES and result.prob >= PROBABILTY:
            return result.lang
    return 'none'
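The LANGUAGES and PROBABILTY constants referenced above are not part of this snippet. A minimal usage sketch, assuming hypothetical values (MongoDB's text-index language set and a 0.5 threshold), of how they might be defined and how the function then behaves:

from langdetect import detect_langs

# Hypothetical values for the constants assumed by get_language(); the real ones are not shown above.
LANGUAGES = {'da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'nb',
             'pt', 'ro', 'ru', 'es', 'sv', 'tr'}
PROBABILTY = 0.5  # minimum acceptable probability (identifier spelling kept from the original snippet)

print(get_language("The quick brown fox jumps over the lazy dog"))  # usually 'en'
print(get_language("これは日本語の文章です"))  # 'ja' is not in LANGUAGES, so 'none'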
def get_html_response_language(response):
    try:
        raw_body = strip_tags(response.body_as_unicode())
        langs = detect_langs(raw_body)
        return (langs[0].lang, langs[-len(langs) + 1].lang)
        # return detect(raw_body)
    except Exception as e:
        print(str(e))
        return "unknown"
def word_processing(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    langHolder = '\n'.join(fullText)
    return detect_langs(langHolder)
def get_word_chords_in_lang(s):
    langs = detect_langs(s)
    if not langs:
        raise Exception("Can't guess lang (langdetect)")
    # detect_langs() returns Language objects, so compare the .lang attribute, not the object itself
    if langs[0].lang == 'en':
        return CHORDS_EN + " "
    if langs[0].lang == 'ru':
        return CHORDS_RU + " "
    print("Lang guess return " + str(langs) + " (but we want 'ru' or 'en' as first)", file=sys.stderr)
    return CHORDS_RU + " "
def detectLang(a):
    det = detect_langs(a)
    # extract the percentage probability that the string is English
    det = [str(i).split(":") for i in det]
    det = [y for z, y in det if z == "en"]
    if det:
        det = float(det[0])
    else:
        det = 0
    return det
def is_english(text):
    if not only_roman_chars(text):
        return False
    try:
        stats = langdetect.detect_langs(text)
    except LangDetectException:
        return False
    if any(stat.lang == 'en' for stat in stats):
        return True
    return False
def discover_language(self, *sentences):
    text = '. '.join(sentences)
    candidates = {candidate.lang: candidate.prob for candidate in detect_langs(text)}
    lang = max(candidates, key=candidates.get)
    delta = candidates.get(lang) - (candidates.get(self.preferred_language) or 0)
    if delta < 0.15:
        return self.preferred_language
    return lang
def ppt_processing(filename):
    prs = Presentation(filename)
    holder = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    holder.append(run.text)
    langHolder = '\n'.join(holder)
    return detect_langs(langHolder)
def is_valid(self, whitelist): if not self.title.strip(): logger.info("has invalid title: " + self.url) return False if not self.text.strip(): logger.info("has invalid text: " + self.url) return False if len(self.text.split(" ")) < 40: logger.info("text is too short: " + self.url) return False if self.normalized_title in whitelist: logger.info("is whitelisted: " + self.url) return False possible_langs = detect_langs(self.title) + detect_langs(self.normalized_title) if not "en" in map(lambda x: x.lang, possible_langs): logger.info("could not detect english language: " + self.url) return False return True
def excel_processing(filename): workbook = xlrd.open_workbook(filename) holder = [] for sheet in workbook.sheet_names(): current_sheet = workbook.sheet_by_name(sheet) numRows = current_sheet.nrows numCols = current_sheet.ncols # print numRows, numCols for row in range(0,numRows): for column in range(0,numCols): holder.append(current_sheet.cell(row,column).value) langHolder = ''.join(holder) return detect_langs(langHolder)
def prepare_context(self, ctx):
    # if there is no support, put ""
    filter_langs = self.cfg.property("languages").split(",")
    try:
        results = langdetect.detect_langs(ctx["text"])
        if len(results) == 0:
            return
        # keep only the configured languages
        results = [result for result in results if result.lang in filter_langs]
        # take the highest probability
        best = max(results, key=lambda result: result.prob)
        ctx["language"] = best.lang
    except:
        # fails if there is nothing to detect...
        pass
def identify(self, text, constrain_to_discussion_locales=True): "Try to identify locale of text. Boost if one of the expected locales." if not text: return Locale.UNDEFINED, {Locale.UNDEFINED: 1} expected_locales = set(( Locale.extract_root_locale(l) for l in self.discussion.discussion_locales)) language_data = detect_langs(text) if constrain_to_discussion_locales: data = [(x.prob, x.lang) for x in language_data if Locale.extract_root_locale(x.lang) in expected_locales] else: # boost with discussion locales. data = [ (x.prob * ( 5 if Locale.Locale.extract_root_locale(x.lang) in expected_locales else 1 ), x.lang) for x in language_data] data.sort(reverse=True) top = data[0][1] if (data and (data[0][0] > 0.5) ) else Locale.UNDEFINED return top, {lang: prob for (prob, lang) in data}
def convert_pdf_to_txt(path, encoding): rsrcmgr = PDFResourceManager() retstr = StringIO() codec = encoding laparams = LAParams() device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) fp = file(path, "rb") interpreter = PDFPageInterpreter(rsrcmgr, device) password = "" maxpages = 0 caching = True pagenos = set() for page in PDFPage.get_pages( fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True ): interpreter.process_page(page) text = retstr.getvalue() fp.close() device.close() retstr.close() return detect_langs(text)
        f_text = parsed["content"]
        # print(f_text)
        if f_text != None:
            # print("Found the Text to be None and hence Skipping !")
            fjson["text-count"] = len(f_text)
            # fhandle.close()
            # continue
            f_metadata = parsed["metadata"]
            # print(f_metadata)
            fjson["metadata"] = json.dumps(f_metadata)
            if isinstance(f_metadata, dict):
                fjson["metadata_length"] = len(f_metadata.keys())
                # print("Metadata Fields are: "+str(len(f_metadata.keys())))
            try:
                fjson["languages"] = {}
                languages = detect_langs(f_text)
                for l in languages:
                    (lang, probability) = str(l).split(":")
                    fjson["languages"][lang] = probability
            except:
                print("\n Language Detection module encountered error")
            # print(" Languages Detected {l}".format(l=languages))
            # pp.pprint(fjson["languages"])
    except (KeyError, ValueError):
        print("Tika could not get content for {f}".format(f=fpath))
        fjson["languages"] = " "
    fhandle.close()
    fjson["id"] = fname
    fjson["size"] = os.path.getsize(fpath)
    # print("Size of file : "+str(fjson["size"]))
except ValueError:
def textFileProcessing(filepath, encoding):
    fileContent = codecs.open(filepath, "r", encoding).read()
    return detect_langs(fileContent)
def get_similarity_to_english(text):
    return next((lng.prob for lng in langdetect.detect_langs(text) if lng.lang == 'en'), None)
def detect_languages(self, text):
    """Detect the probabilities for the top languages

    Override this method in the subclass(es).
    """
    return langdetect.detect_langs(text)
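Since the docstring above invites overriding detect_languages in a subclass, here is a minimal sketch of what such an override might look like. The class names and the probability threshold are assumptions for illustration, not part of the original code.

import langdetect

class BaseDetector:
    """Stand-in for the class that declares detect_languages() above (hypothetical name)."""
    def detect_languages(self, text):
        return langdetect.detect_langs(text)

class ThresholdedDetector(BaseDetector):
    """Example override: drop low-confidence candidates."""
    def detect_languages(self, text, min_prob=0.2):
        # Keep only candidates whose probability clears the (assumed) threshold.
        return [lang for lang in langdetect.detect_langs(text) if lang.prob >= min_prob]

print(ThresholdedDetector().detect_languages("Ceci est un exemple de texte en français."))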
def detect_language(query):
    print(detect_langs(query))
    print(detect(query))
    return detect(query)[:2]
def get_similarity_to_english(text):
    if ' ' not in text:
        return 0
    # return the probability for English, not the Language object itself
    return next((lng.prob for lng in langdetect.detect_langs(text) if lng.lang == 'en'), None)
def search_result(request): if not request.method == "POST": return redirect('/') query = request.POST.get('query') q_words = query.split() stemmed_words = [] filters = [] for word in q_words: if word.startswith('domain:'): filters.append(('domain', word.replace('domain:', '').lower())) elif word.startswith('lang:'): filters.append(('lang', word.replace('lang:', '').lower())) elif word.startswith('encoding:'): filters.append(('encoding', word.replace('encoding:', '').lower())) else: try: lngs = detect_langs(word) correct_lng = 'english' for lng in lngs: if lng in LANGUAGES and LANGUAGES[lng].lower() in snowballstemmer.algorithms(): correct_lng = LANGUAGES[lng].lower() stemmed_words.append(snowballstemmer.stemmer(correct_lng).stemWord(word)) except: stemmed_words.append(word) doc_ratings = {} for word in stemmed_words: try: stem = Stem.objects.get(stem=word) except: continue term_ratings = {} for relation in DocumentStemMap.objects.filter(stem=stem): corresponding = True for fil in filters: if fil[0] == 'domain': if not fil[1] in relation.doc.domain: corresponding = False elif fil[0] == 'lang': if not fil[1] == relation.doc.language: corresponding = False elif fil[0] == 'encoding': if not fil[1] == relation.doc.encoding: corresponding = False if not corresponding: continue rc = relation.rank_component if rc < 0: rc = 0 if relation.doc_id in term_ratings: term_ratings[relation.doc_id] += rc else: term_ratings[relation.doc_id] = rc for doc_id in term_ratings: term_ratings[doc_id] = term_ratings[doc_id] / (2 + term_ratings[doc_id]) * stem.idf if doc_id in doc_ratings: doc_ratings[doc_id] += term_ratings[doc_id] else: doc_ratings[doc_id] = term_ratings[doc_id] del term_ratings rated_docs = doc_ratings.items() rated_docs.sort(key=lambda x: x[1]) results = [] for doc_id in rated_docs[:10]: results.append(Document.objects.get(id=doc_id[0])) return render(request, 'searchres/search_result.html', { 'documents': results, 'query': query, })
def load(limit = None, host = "localhost", port = 27017, db = "instagram", media_feed_collection = 'media_feeds', **kwargs): # Connect to mongo client = MongoClient(host, int(port)) mongo = client[db][media_feed_collection] # Extract the features that we are interested in and put it into a dataframe data = {} for k in [ 'uid', 'uname', 'mid', 'date', 'text', 'tags', 'tags_count', \ 'likes', 'type', 'locid', 'locname', 'lat', 'long', 'url', 'lang', 'lang_prob' ]: data[k] = [] cnt = 0 for x in mongo.find(): if limit is not None and cnt >= int(limit): break if 'feed' not in x: continue for y in x['feed'][:30]: cnt = cnt + 1 if cnt % 100 == 0: print "%d documents loaded" % cnt data['uid'].append(x['_id']) data['uname'].append(x['name'] if 'name' in x else "instagram_user") data['mid'].append(y['id']) data['date'].append(y['created']) data['text'].append(y['caption']) data['type'].append(y['type']) data['tags'].append(" ".join(sorted(y['tags'], key=lambda x: len(x), reverse=True))) data['tags_count'].append(len(y['tags'])) data['likes'].append(y['like_count']) data['locid'].append(y['location']['id'] if y['location'] is not None else None) data['locname'].append(y['location']['name'] if y['location'] is not None else None) data['lat'].append(y['location']['latitude'] if y['location'] is not None else None) data['long'].append(y['location']['longitude'] if y['location'] is not None else None) data['url'].append(y['images']['standard_resolution']) try: langs = filter(lambda x: x > 0.2, detect_langs(y['caption'].replace('#', ''))) data['lang'].append(langs[0].lang) data['lang_prob'].append(langs[0].prob) except Exception: data['lang'].append("??") data['lang_prob'].append(0.0) client.close() df = pd.DataFrame(data) if limit is not None: df = df[:int(limit)] df['text_cleaned'] = [ reduce(lambda y,z: y.replace('#'+z, ''), x['tags'].split(' '), x['text']) \ for i,x in df.iterrows() ] df['text_length'] = [ len(x) for x in df['text_cleaned'] ] df['tag_length'] = [ len(x) for x in df['tags'] ] - df['tags_count'] df['tt_ratio'] = (df['tag_length'].astype(np.float)+1) / (df['text_length']+1) print df.describe() return df
def langd_englishness(_, text):
    # detect_langs() returns a list of Language objects, so look up English via the .lang attribute
    for lang in langdetect.detect_langs(text):
        if lang.lang == 'en':
            return lang.prob
    return 0