def clean(content):
    ####################
    # Add language testing here and removal of punctuation etc
    # taken from normalisation code
    global stopWords
    reject = True
    match = False
    # If content is detected as Arabic/Farsi with reasonable
    # confidence then consider it
    for l in langid.rank(content):
        if l[0] in targetLangs and l[1] > 0.7:
            reject = False
    if reject:
        return None
    content = content.split(' ')
    content = [re.sub(allRe, '', c) for c in content if c not in stopWords]
    # if any([re.match(hahaRe, c) for c in content]):
    #     match = True
    #     print('\t', ' '.join(content))
    content = [re.sub(hahaRe, u'هه', c) for c in content]
    if match:  # only reachable if the debug block above is re-enabled
        print('\t', ' '.join(content))
        sys.exit(1)
    return ' '.join(content)
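# clean() leans on several module-level names (targetLangs, stopWords,
# allRe, hahaRe) that the original script defines elsewhere. A minimal
# sketch of plausible definitions; all values here are hypothetical
# stand-ins, not taken from the source.
import re
import sys
import langid

targetLangs = ['ar', 'fa']                  # accept Arabic or Farsi
stopWords = set([u'و', u'في'])              # placeholder stopword list
allRe = re.compile(r'[^\w\s]', re.UNICODE)  # punctuation stripper
hahaRe = re.compile(u'ه{3,}')               # laughter runs to collapse to u'هه'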
def rank_language(string):
    import sc.lib.langid as langid
    rank = langid.rank(string.casefold())[:4]
    biased_rank = sorted(
        ((lang, score * lang_bias.get(lang, 1.0)) for lang, score in rank),
        key=lambda t: t[1], reverse=True)
    return biased_rank
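# rank_language() reads a module-level lang_bias table that is not shown
# here. A hedged usage sketch with invented weights, assuming the vendored
# sc.lib.langid module is importable. Note that a multiplicative bias acts
# as a boost only on scores normalized to 0-1; with langid's raw negative
# log-probabilities the effect inverts.
lang_bias = {'en': 1.0, 'la': 0.5, 'pi': 1.5}  # illustrative weights only

print(rank_language("Evam me sutam"))  # up to four (lang, biased_score) pairs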
def classify(self, txt):
    '''
    Classifies text by language. Uses preferred_languages weighting.
    '''
    ranks = []
    for lang, score in langid.rank(txt):
        if lang in self.preferred_languages:
            score *= self.preferred_factor
        ranks.append((lang, score))
    ranks.sort(key=lambda x: x[1], reverse=True)
    return ranks[0][0]
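# A hedged sketch of a host object carrying the two attributes the method
# uses; the attribute values are assumptions, and the def above is treated
# as module-level so it can be attached. With langid's default raw
# log-probability scores (negative numbers), multiplying by a factor above
# 1 pushes a language *down* the ranking, so this weighting presumes
# scores normalized to 0-1.
class _Host:
    preferred_languages = {'en', 'es'}  # assumed values
    preferred_factor = 1.5              # assumed weighting

_Host.classify = classify  # attach the method defined above
print(_Host().classify("Hola mundo"))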
def getProbDist(text):
    try:
        import langid
    except (ImportError, RuntimeError):  # also guard a missing module
        return
    result = langid.rank(text)
    result2 = {}
    for r in result:
        if r[1] > 0.01:  # cutoff presumes scores normalized to 0-1
            result2[r[0]] = r[1]
    return result2
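# Usage sketch. As noted above, the > 0.01 cutoff only passes when scores
# are probabilities, so pair this with a norm_probs=True identifier
# (demonstrated further down) rather than the stock log-probability ranking.
dist = getProbDist("Ceci n'est pas une pipe")
if dist:
    for lang, prob in sorted(dist.items(), key=lambda kv: -kv[1]):
        print(lang, round(prob, 3))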
def classify(self, txt):
    '''
    Classifies text by language. Uses preferred_languages weighting.
    '''
    ranks = []
    for lang, score in langid.rank(txt):
        if lang in self.preferred_languages:
            # Additive variant of the weighting above: adding a constant
            # in log-probability space scales the probability, so this
            # behaves as a boost on langid's default raw scores.
            score += self.preferred_factor
        ranks.append((lang, score))
    ranks.sort(key=lambda x: x[1], reverse=True)
    return ranks[0][0]
def langid_(text, label):
    score = 0
    try:
        sentence = text.strip()
        result = langid.rank(sentence)
        if result[0][0] in label:
            score += 1
        if result[1][0] in label:
            score += 1
        if result[0][0] == label[0]:
            score += 2
        if result[0][0] == label[0] and result[1][0] == label[1]:
            score += 3
    except Exception as e:
        print(e)
    return score
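# Usage sketch: label is a sequence of expected language codes, best guess
# first. The return value ranges from 0 (no agreement) to 7 (langid's top
# two ranks match label exactly: 1 + 1 + 2 + 3).
print(langid_("Guten Morgen, wie geht es dir?", ['de', 'en']))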
def add_lid(transactions, debug):
    known_langs = set(['en', 'es', 'pt'])
    for key, value in transactions.items():
        msg = value['msg_norm']
        if debug > 0:
            print(u"msg: {}".format(msg))
        if len(msg.split()) < 5:
            lang = '--'  # too short to identify reliably
        else:
            lang_list = langid.rank(msg)
            for lang_pr in lang_list:
                if lang_pr[0] in known_langs:
                    break
            # falls back to the lowest-ranked language if none are known
            lang = lang_pr[0]
        value['lid_lui'] = lang
        if debug > 0:
            print("predicted language lui: {}".format(lang))
            print()
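# Hedged usage sketch: the msg_norm/lid_lui keys come from the function
# body, but the sample transactions are invented.
transactions = {
    't1': {'msg_norm': 'this is a perfectly ordinary english sentence'},
    't2': {'msg_norm': 'too short'},
}
add_lid(transactions, debug=0)
print(transactions['t1']['lid_lui'])  # expected: 'en'
print(transactions['t2']['lid_lui'])  # '--' (fewer than 5 tokens)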
def get_user_langs(summaryfile):
    """extract strings and variables for each user; use language ID on
    these to infer the language label (distant supervision)
    """
    user_langs = {}   # map user to language
    ignored = set()   # set of users with too few tokens
    ctr = 0
    for line in open(summaryfile):
        summaries = ujson.loads(line)
        print('Loaded summaries', end=' ')
        for userid in summaries:
            tokens = extract_namestrings(summaries[userid])
            if len(tokens) < 50:
                ignored.add(userid)
                print('*', end=' ')
            else:
                langlist = langid.rank(' '.join(tokens))
                if langlist[0][0] == 'la':
                    # 'la' (Latin) is treated as a misfire on identifier-like
                    # text; fall back to the second-ranked language
                    user_langs[userid] = langlist[1][0]
                else:
                    user_langs[userid] = langlist[0][0]
                if user_langs[userid] == 'da':
                    print(userid)
        print(ctr)
        ctr += 1
    print('Ignored', len(ignored), 'users with too few tokens')
    return user_langs
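# The helper extract_namestrings and the ujson import live elsewhere in
# the original script. A heavily hedged stub showing only the expected
# shape (one user summary in, a flat token list out); the real extraction
# logic is not reproduced here.
import ujson  # third-party: pip install ujson

def extract_namestrings(summary):
    # Hypothetical stand-in: flatten whatever string fields the summary
    # carries into a single token list for language ID.
    return [tok for field in summary.values() for tok in str(field).split()]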
def classify(s):
    rank = langid.rank(s)
    if rank[0][0] == 'la':
        # skip 'la' (Latin), treating it as a misfire; use the runner-up
        return rank[1][0]
    return rank[0][0]
def is_english(text, threshold=0.5):
    ranks = dict(langid.rank(text))
    return ranks.get('en', 0) >= threshold
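# The 0.5 threshold presumes scores in 0-1, but the stock langid.rank
# returns log-probabilities. A sketch that pins the normalization
# explicitly via the identifier API shown later in this section; the
# function name is an invention for illustration.
from langid.langid import LanguageIdentifier, model

_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

def is_english_normalized(text, threshold=0.5):
    ranks = dict(_identifier.rank(text))
    return ranks.get('en', 0) >= threshold

print(is_english_normalized("The quick brown fox jumps over the lazy dog"))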
""" pip install langid langid.py comes pre-trained on 97 languages (ISO 639-1 codes given): af, am, an, ar, as, az, be, bg, bn, br, bs, ca, cs, cy, da, de, dz, el, en, eo, es, et, eu, fa, fi, fo, fr, ga, gl, gu, he, hi, hr, ht, hu, hy, id, is, it, ja, jv, ka, kk, km, kn, ko, ku, ky, la, lb, lo, lt, lv, mg, mk, ml, mn, mr, ms, mt, nb, ne, nl, nn, no, oc, or, pa, pl, ps, pt, qu, ro, ru, rw, se, si, sk, sl, sq, sr, sv, sw, ta, te, th, tl, tr, ug, uk, ur, vi, vo, wa, xh, zh, zu You can also use langdetect (only 55 languages supported) """ import langid print(langid.rank("Questa e una prova")) print(langid.classify("Questa e una prova")) print(langid.classify("I do not speak english")) langid.set_languages(['de', 'fr', 'it']) print(langid.classify("I do not speak english")) print(langid.classify("Je ne parle pas français")) """" The probabilistic model implemented by langid.py involves the multiplication of a large number of probabilities. For computational reasons, the actual calculations are implemented in the log-probability space (a common numerical technique for dealing with vanishingly small probabilities). One side-effect of this is that it is not necessary to compute a full probability in order to determine the most probable language in a set of candidate languages. However, users sometimes find it helpful to have a "confidence" score for the probability prediction. Thus, langid.py implements a re-normalization that produces an output in the 0-1 range. """ from langid.langid import LanguageIdentifier, model langid.set_languages(None) identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) print(identifier.classify("This is a test")) #After setting langid to None
    ,FunctionalTextEssayFeature(feature_name="text_length_2nd_root_raw",
        fun=lambda essay: (text_length(essay.texts["raw"]) ** 0.50) / 1000.0)
    ,FunctionalTextEssayFeature(feature_name="text_length_4th_root_raw",
        fun=lambda essay: (text_length(essay.texts["raw"]) ** 0.25) / 1000.0)
    ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_4_raw",
        fun=lambda essay: n_words_longer_than(essay.texts["raw"], 4) / 100.0)
    ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_6_raw",
        fun=lambda essay: n_words_longer_than(essay.texts["raw"], 6) / 100.0)
    ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_8_raw",
        fun=lambda essay: n_words_longer_than(essay.texts["raw"], 8) / 100.0)
    ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_10_raw",
        fun=lambda essay: n_words_longer_than(essay.texts["raw"], 10) / 100.0)
    ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_12_raw",
        fun=lambda essay: n_words_longer_than(essay.texts["raw"], 12) / 100.0)
    ,FunctionalTextEssayFeature(feature_name="words_length_mean_raw",
        fun=lambda essay: words_length_mean(essay.texts["raw"]) / 100.0)
    ,FunctionalTextEssayFeature(feature_name="words_length_variance_raw",
        fun=lambda essay: words_length_variance(essay.texts["raw"]) / 100.0)
    ,FunctionalTextEssayFeature(feature_name="unique_words_norm_raw",
        fun=lambda essay: unique_words_norm(essay.texts["raw"]) / 100.0)
    ,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_10_raw",
        fun=lambda essay: n_sentences_longer_than(essay.texts["raw"], 10) / 100.0)
    ,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_18_raw",
        fun=lambda essay: n_sentences_longer_than(essay.texts["raw"], 18) / 100.0)
    ,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_25_raw",
        fun=lambda essay: n_sentences_longer_than(essay.texts["raw"], 25) / 100.0)

    # language probability
    ,FunctionalTextEssayFeature(feature_name="langid_lang_prob_en",
        fun=lambda essay: dict(langid.rank(essay.texts["raw"]))["en"])
    ,FunctionalTextEssayFeature(feature_name="langid_lang_prob_fr",
        fun=lambda essay: dict(langid.rank(essay.texts["raw"]))["fr"])
    ,FunctionalTextEssayFeature(feature_name="langid_lang_prob_es",
        fun=lambda essay: dict(langid.rank(essay.texts["raw"]))["es"])

    # wiki coverage
    ,FunctionalTextEssayFeature(feature_name="wiki_1gram_coverage",
        fun=lambda essay: check_1gram_coverage(essay.texts["raw"]))
    ,FunctionalTextEssayFeature(feature_name="wiki_2gram_coverage",
        fun=lambda essay: check_2gram_coverage(essay.texts["raw"]))
    ,FunctionalTextEssayFeature(feature_name="wiki_3gram_coverage",
        fun=lambda essay: check_3gram_coverage(essay.texts["raw"]))

    # sentiment
    ,FunctionalTextEssayFeature(feature_name="sentiment_polarity",
        fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.polarity)
    ,FunctionalTextEssayFeature(feature_name="sentiment_subj",
        fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.subjectivity)

    # ngrams
    ,EssaySkipgram(name="LETTER1", source="raw", base=lambda text: text,
        nskip=0, ngram=1, cumulative=True)  # count of each character
    ,EssaySkipgram(name="LETTER2", source="raw", base=lambda text: text,
        nskip=0, ngram=2)  # count of each character bigram
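# The three langid_lang_prob_* features above each re-run langid.rank on
# the same raw text. A hedged sketch of a cached helper that would share
# one ranking across them; the helper name is invented.
from functools import lru_cache

@lru_cache(maxsize=1024)
def _rank_as_dict(text):
    # One langid.rank call shared by the en/fr/es probability features.
    return dict(langid.rank(text))

# e.g. fun=lambda essay: _rank_as_dict(essay.texts["raw"]).get("en", 0.0)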
def guess(content, config):
    # config is accepted for interface compatibility but unused here
    return langid.rank(content)
def guess_language(text):
    """Returns list of language guesses for the message, with a
    confidence measure (0 to 1 only when the identifier is configured
    with norm_probs=True; the default scores are log-probabilities).
    """
    return langid.rank(text)
import subprocess
try:
    import winsound
except ImportError:
    winsound = None

from talkey.utils import process_options, check_executable

import langid
import contextlib
import audioread
import wave

# Get the list of identifiable languages (rank('') scores every language)
DETECTABLE_LANGS = sorted([a[0] for a in langid.rank('')])


def genrst(label, opt, txt, indent='    '):
    txt += '\n%s%s:\n\n' % (indent, label)
    for key in sorted(opt.keys()):
        val = opt[key]
        txt += indent + '``%s``\n' % key
        txt += indent + '    %s\n\n' % val.get('description', '%s option' % key)
        txt += indent + '    :type: %s\n' % val['type']
        txt += indent + '    :default: %s\n' % val['default']
        if 'min' in val:
            txt += indent + '    :min: %s\n' % val['min']
        if 'max' in val:
            txt += indent + '    :max: %s\n' % val['max']
    return txt  # strings are immutable, so the built-up text must be returned
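# Usage sketch for genrst with an invented options dict; it emits a
# reStructuredText definition list of the kind talkey embeds in its docs.
opts = {
    'language': {
        'type': 'str',
        'default': 'en',
        'description': 'Language of the voice',
    },
}
print(genrst('Options', opts, ''))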