Beispiel #1
0
def detect_text_language(text):
    """
    Guess the language of ``text`` and return its language code.

    The text is lower-cased and passed to the module-level
    ``_langidentifier`` classifier (langid-based; langdetect performed
    poorly). The winning language is then converted with
    ``brd.get_marc_code(..., capital=False)``.

    Return value:
      * ``None`` when ``text`` is empty or otherwise falsy;
      * ``u'und'`` (undetermined) when ``text`` is shorter than
        ``MIN_DETECT_SIZE`` or the classifier's score is below
        ``MIN_DETECT_PROB``;
      * otherwise whatever ``brd.get_marc_code`` returns for the detected
        language (three-letter lower-case codes in the examples below).

    >>> detect_text_language(u"review with text too small 30")
    u'und'
    >>> detect_text_language(u'Mixed language etrange et muy bieno super confusing')
    u'und'
    >>> detect_text_language(u"סיפור נפלא ממש. תרגום בסדר פלוס.")
    u'heb'
    >>> detect_text_language(u"J'ai adoré ce livre mais il était long")
    u'fre'
    >>> detect_text_language(u"CLASSIC BOOKS YOU CAN'T BELIEVE ANYONE WOULD EVER READ EXCEPT FOR SCHOOL OR TO LOOK SMART?")
    u'eng'
    """
    # Nothing to classify at all: signalled with None, not 'und'.
    if not text:
        return None

    undetermined = u'und'

    # Too little material for a trustworthy guess.
    if len(text) < MIN_DETECT_SIZE:
        return undetermined

    # Lower-case first: langid does not work well with all-capitalized text.
    classification = _langidentifier.classify(text.lower())

    # Index 1 holds the classifier's score, index 0 the detected language.
    score = classification[1]
    if score < MIN_DETECT_PROB:
        return undetermined

    return brd.get_marc_code(classification[0], capital=False)