def clean(content):
    ####################
    # Add language testing here and removal of punctuation etc
    # taken from normalisation code
    global stopWords
    reject = True
    match = False

    # If content is detected as Arabic/Farsi with reasonable
    # confidence then consider it
    for l in langid.rank(content):
        if l[0] in targetLangs and l[1] > 0.7:
            reject = False
    if reject:
        return None
    else:
        content = content.split(' ')
        content = [re.sub(allRe, '', c) for c in content if c not in stopWords]
        #        if any([re.match(hahaRe, c) for c in content]):
        #            match = True
        #            print('\t', ' '.join(content))
        content = [re.sub(hahaRe, u'هه', c) for c in content]
        # Debug path: match is only set by the commented-out check above
        if match:
            print('\t', ' '.join(content))
            sys.exit(1)
        content = ' '.join(content)
        return content
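This function relies on several module-level names defined elsewhere in its source repository. A minimal sketch of plausible definitions, for context only (targetLangs, stopWords, allRe and hahaRe are all hypothetical reconstructions inferred from the comments above):

import re
import sys
import langid

# Hypothetical reconstructions, not the original definitions:
targetLangs = {'ar', 'fa'}                  # Arabic/Farsi, per the comment in clean()
stopWords = set()                           # stopword list, loaded elsewhere
allRe = re.compile(r'[^\w\s]', re.UNICODE)  # strips punctuation
hahaRe = re.compile(u'ههه+')                # long runs of laughter characters, collapsed to u'هه'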
Example #2
def rank_language(string):
    import sc.lib.langid as langid
    rank = langid.rank(string.casefold())[:4]

    biased_rank = sorted(
        ((lang, score * lang_bias.get(lang, 1.0)) for lang, score in rank),
        key=lambda t: t[1],
        reverse=True)
    return biased_rank
Example #4
def classify(self, txt):
    '''
    Classifies text by language. Uses preferred_languages weighting.
    '''
    ranks = []
    for lang, score in langid.rank(txt):
        if lang in self.preferred_languages:
            score *= self.preferred_factor
        ranks.append((lang, score))
    ranks.sort(key=lambda x: x[1], reverse=True)
    return ranks[0][0]
Example #5
def getProbDist(text):
    try:
        import langid
    except (ImportError, RuntimeError):
        return None
    result = langid.rank(text)
    # Keep only languages with non-negligible probability mass.
    # Note: this assumes normalized 0-1 probabilities; the module-level
    # langid.rank returns raw log-probabilities by default.
    result2 = {}
    for lang, score in result:
        if score > 0.01:
            result2[lang] = score
    return result2
Example #6
def classify(self, txt):
    '''
    Classifies text by language. Uses preferred_languages weighting.
    '''
    ranks = []
    for lang, score in langid.rank(txt):
        if lang in self.preferred_languages:
            score += self.preferred_factor
        ranks.append((lang, score))
    ranks.sort(key=lambda x: x[1], reverse=True)
    return ranks[0][0]
Example #7
def langid_(text, label):
    score = 0
    try:
        sentence = text.strip()
        result = langid.rank(sentence)
        if result[0][0] in label:
            score += 1
        if result[1][0] in label:
            score += 1
        if result[0][0] == label[0]:
            score += 2
        if result[0][0] == label[0] and result[1][0] == label[1]:
            score += 3
    except Exception as e:
        print(e)
    return score
Example #8
def add_lid(transactions, debug):
    known_langs = set(['en', 'es', 'pt'])
    for key, value in transactions.items():
        msg = value['msg_norm']
        if debug > 0:
            print("msg: {}".format(msg))
        if len(msg.split()) < 5:
            lang = '--'
        else:
            lang_list = langid.rank(msg)
            # Take the highest-ranked guess that is a known language;
            # falls back to the last guess if none match.
            for lang_pr in lang_list:
                if lang_pr[0] in known_langs:
                    break
            lang = lang_pr[0]
        value['lid_lui'] = lang
        if debug > 0:
            print("predicted language lui: {}".format(lang))
            print()
Example #9
def get_user_langs(summaryfile):
    """extract strings and variables for each user;
    use language ID on these to infer the language label
    (distant supervision)
    """
    user_langs = {}  # map user to language

    ignored = set()  # set of users with too few tokens

    ctr = 0
    for line in open(summaryfile):
        summaries = ujson.loads(line)
        print('Loaded summaries', end=' ')

        for userid in summaries:
            tokens = extract_namestrings(summaries[userid])
            if len(tokens) < 50:
                ignored.add(userid)
                print('*', end=' ')
            else:
                langlist = langid.rank(' '.join(tokens))
                # If the top guess is Latin ('la'), fall back to the second guess
                if langlist[0][0] == 'la':
                    user_langs[userid] = langlist[1][0]
                    #if len(tokens)<100:
                    #    print(' '.join(tokens))
                else:
                    user_langs[userid] = langlist[0][0]
                    if user_langs[userid] == 'da':
                        print(userid)
                #print(user_langs[userid], end=' ')

        print(ctr)
        ctr += 1

    print('Ignored', len(ignored), 'users with too few tokens')
    return user_langs
Example #10
def classify(s):
    # If the top guess is Latin ('la'), fall back to the second guess
    rank = langid.rank(s)
    if rank[0][0] == 'la':
        return rank[1][0]
    return rank[0][0]
Example #11
def is_english(text, threshold=0.5):
    # Note: a 0-1 threshold assumes normalized probabilities; the module-level
    # langid.rank returns raw log-probabilities by default (see the notes below).
    ranks = dict(langid.rank(text))
    return ranks.get('en', 0) >= threshold
"""
pip install langid

langid.py comes pre-trained on 97 languages (ISO 639-1 codes given):
af, am, an, ar, as, az, be, bg, bn, br, bs, ca, cs, cy, da, de, dz, el, en, eo, es, et, eu, fa, fi, fo, fr, ga, gl, gu, 
he, hi, hr, ht, hu, hy, id, is, it, ja, jv, ka, kk, km, kn, ko, ku, ky, la, lb, lo, lt, lv, mg, mk, ml, mn, mr, ms, mt, 
nb, ne, nl, nn, no, oc, or, pa, pl, ps, pt, qu, ro, ru, rw, se, si, sk, sl, sq, sr, sv, sw, ta, te, th, tl, tr, ug, uk, 
ur, vi, vo, wa, xh, zh, zu
You can also use langdetect (only 55 languages supported)
"""
import langid

print(langid.rank("Questa e una prova"))
print(langid.classify("Questa e una prova"))
print(langid.classify("I do not speak english"))
langid.set_languages(['de', 'fr', 'it'])
print(langid.classify("I do not speak english"))
print(langid.classify("Je ne parle pas français"))
"""" 
The probabilistic model implemented by langid.py involves the multiplication of a large number of probabilities. For 
computational reasons, the actual calculations are implemented in the log-probability space (a common numerical technique 
for dealing with vanishingly small probabilities). One side-effect of this is that it is not necessary to compute a full 
probability in order to determine the most probable language in a set of candidate languages. However, users sometimes 
find it helpful to have a "confidence" score for the probability prediction. Thus, langid.py implements a re-normalization 
that produces an output in the 0-1 range.
"""
from langid.langid import LanguageIdentifier, model

langid.set_languages(None)  # reset the language constraint applied above
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
print(identifier.classify("This is a test"))  # confidence now in the 0-1 range
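
For contrast, a minimal sketch of the same call without re-normalization (norm_probs=False, which is also the default for the module-level langid functions); the score is then a raw log-probability rather than a 0-1 confidence:

raw_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=False)
print(raw_identifier.classify("This is a test"))  # e.g. ('en', -54.4...), a log-probability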
     ,FunctionalTextEssayFeature(feature_name="text_length_2nd_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.50)/1000.0)
     ,FunctionalTextEssayFeature(feature_name="text_length_4th_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.25)/1000.0)
     ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_4_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],4)/100.0)
     ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_6_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],6)/100.0)
     ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_8_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],8)/100.0)
     ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_10_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],10)/100.0)
     ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_12_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],12)/100.0)
     ,FunctionalTextEssayFeature(feature_name="words_length_mean_raw", fun=lambda essay: words_length_mean(essay.texts["raw"])/100.0)
     ,FunctionalTextEssayFeature(feature_name="words_length_variance_raw", fun=lambda essay: words_length_variance(essay.texts["raw"])/100.0)
     ,FunctionalTextEssayFeature(feature_name="unique_words_norm_raw", fun=lambda essay: unique_words_norm(essay.texts["raw"])/100.0)
     ,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_10_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],10)/100.0)
     ,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_18_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],18)/100.0)
     ,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_25_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],25)/100.0)
     
     # language probability (note: each feature below re-runs langid.rank on the full text)
     ,FunctionalTextEssayFeature(feature_name="langid_lang_prob_en", fun=lambda essay: dict(langid.rank(essay.texts["raw"]))["en"])
     ,FunctionalTextEssayFeature(feature_name="langid_lang_prob_fr", fun=lambda essay: dict(langid.rank(essay.texts["raw"]))["fr"])
     ,FunctionalTextEssayFeature(feature_name="langid_lang_prob_es", fun=lambda essay: dict(langid.rank(essay.texts["raw"]))["es"])
     
     # wiki coverage
     ,FunctionalTextEssayFeature(feature_name="wiki_1gram_coverage", fun=lambda essay: check_1gram_coverage(essay.texts["raw"]))
     ,FunctionalTextEssayFeature(feature_name="wiki_2gram_coverage", fun=lambda essay: check_2gram_coverage(essay.texts["raw"]))
     ,FunctionalTextEssayFeature(feature_name="wiki_3gram_coverage", fun=lambda essay: check_3gram_coverage(essay.texts["raw"]))
 
     # sentiment
     ,FunctionalTextEssayFeature(feature_name="sentiment_polarity", fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.polarity)
     ,FunctionalTextEssayFeature(feature_name="sentiment_subj", fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.subjectivity)
     
     # ngrams    
     ,EssaySkipgram(name="LETTER1",source="raw",base=lambda text: text, nskip=0, ngram=1, cumulative=True) # count of each character        
     ,EssaySkipgram(name="LETTER2",source="raw",base=lambda text: text, nskip=0, ngram=2) # count of each character        
Example #14
def guess(content, config):
    # config is accepted here but unused
    return langid.rank(content)
Example #15
def guess_language(text):
    """Returns a list of language guesses for the message,
    with a confidence measure (0 to 1).
    """

    return langid.rank(text)
Example #17
import subprocess

try:
    import winsound
except ImportError:
    winsound = None

from talkey.utils import process_options, check_executable

import langid
import contextlib
import audioread
import wave

# Get the list of identifiable languages
DETECTABLE_LANGS = sorted([a[0] for a in langid.rank('')])


def genrst(label, opt, txt, indent='    '):
    txt += '\n%s%s:\n\n' % (indent, label)
    for key in sorted(opt.keys()):
        val = opt[key]
        txt += indent + '``%s``\n' % key
        txt += indent + '    %s\n\n' % val.get('description',
                                               '%s option' % key)
        txt += indent + '    :type: %s\n' % val['type']
        txt += indent + '    :default: %s\n' % val['default']
        if 'min' in val.keys():
            txt += indent + '    :min: %s\n' % val['min']
        if 'max' in val.keys():
            txt += indent + '    :max: %s\n' % val['max']
    return txt