コード例 #1
0
def detect_language(text):
    """
    Identify the most probable language of ``text`` and return its 2-letter code
    (see https://cloud.google.com/translate/v2/using_rest#language-params).
    Detection is delegated to the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_
    package; call :func:`cld2.detect()` directly if its optional params are needed.

    When the detector flags its result as unreliable, a warning listing all of
    its best guesses is logged, and the top guess is returned regardless.

    Args:
        text (str)

    Returns:
        str

    Raises:
        ImportError: if ``cld2_detect`` is not defined in this module
    """
    # the detector is an optional dependency: a missing name means it was never imported
    try:
        detect = cld2_detect
    except NameError:
        raise ImportError(
            "`cld2-cffi` must be installed to use textacy's automatic language detection; "
            "you may do so via `pip install cld2-cffi` or `pip install textacy[lang]`."
        )

    # under Python 2 the text is converted to bytes before detection
    payload = compat.unicode_to_bytes(text) if compat.is_python2 else text
    reliable, _, guesses = detect(payload, bestEffort=True)
    if reliable is False:
        LOGGER.warning(
            'Text language detected with low confidence; best guesses: %s', guesses
        )
    top_guess = guesses[0]
    return top_guess[1]
コード例 #2
0
ファイル: text_utils.py プロジェクト: akshayjh/textacy
def detect_language(text):
    """
    Return the 2-letter code of the language that ``text`` is most likely written in
    (see https://cloud.google.com/translate/v2/using_rest#language-params).
    Relies on the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_ package;
    for access to its optional params, call :func:`cld2.detect()` directly.

    A low-confidence detection is reported through a logged warning that lists
    the detector's best guesses; the first guess is still returned.

    Args:
        text (str)

    Returns:
        str

    Raises:
        ImportError: if ``cld2_detect`` is not defined in this module
    """
    # referencing the name is enough to find out whether the optional import succeeded
    try:
        cld2_detect
    except NameError:
        message = (
            "cld2-cffi is not installed, so language detection won't work; "
            "install it individually, or with textacy via `pip install textacy[lang]`"
        )
        raise ImportError(message)

    # Python 2 gets a bytes version of the text; otherwise it is passed as-is
    detector_input = unicode_to_bytes(text) if is_python2 else text
    is_reliable, _, ranked = cld2_detect(detector_input, bestEffort=True)
    if is_reliable is False:
        logger.warning(
            'Text language detected with low confidence; best guesses: %s', ranked
        )
    return ranked[0][1]
コード例 #3
0
def is_good_snippet(snippet, len_range, min_text_frac, exclude_en):
    """
    Decide whether ``snippet`` passes a sequence of quality filters: length,
    share of text characters, absence of math/URLs, absence of citation
    identifiers, and (optionally) not being reliably detected as English.

    Args:
        snippet (str)
        len_range (Tuple[int, int]): Accepted length of ``snippet`` in characters;
            the lower bound is inclusive, the upper bound exclusive.
        min_text_frac (float): Minimum fraction of ``snippet``'s characters that
            must be covered by ``re_text`` matches.
        exclude_en (bool): If True (the literal value), reject snippets that the
            language detector reliably identifies as English.

    Returns:
        bool: True if the snippet passes every filter, False otherwise. An empty
        snippet is always rejected.
    """
    len_snippet = len(snippet)
    if len_snippet < len_range[0] or len_snippet >= len_range[1]:
        return False
    # an empty snippet can slip through when the lower bound is 0; it contains
    # no text, and letting it continue would divide by zero just below
    if len_snippet == 0:
        return False
    # make sure snippet is *mostly* text
    len_text = sum(match.end() - match.start() for match in re_text.finditer(snippet))
    if len_text / len_snippet < min_text_frac:
        return False
    # ugh, math and urls!
    if any(s in snippet for s in (r"\displaystyle", "http://", "https://")):
        return False
    # check for citations/references
    if any(re_pat.search(snippet) for re_pat in (re_doi, re_issn, re_isbn)):
        return False
    # filter out english copy-paste jobs
    if exclude_en is True:
        is_reliable, _, best_guesses = cld2_detect(
            snippet.encode("utf-8"), bestEffort=True
        )
        # only a *reliable* English detection disqualifies the snippet
        if is_reliable is True and best_guesses[0][1] == "en":
            logging.debug(
                "found english-heavy snippet in non-english wiki text:\n%s", snippet
            )
            return False
    return True
コード例 #4
0
def is_english(text):
    """
    Report whether the language detector's top guess for ``text`` is English.

    ``cld2_detect`` is called with ``bestEffort=True`` and the language code of
    its first guess is compared against ``"en"``. The detector's reliability
    flag is not consulted, so a low-confidence English guess still counts.

    This is a best-effort check: any exception raised while detecting (including
    a missing detector) is swallowed and reported as "not English".

    Args:
        text: Passed unchanged to ``cld2_detect``.

    Returns:
        bool: True if the top guess has language code ``"en"``; False otherwise,
        or if detection failed for any reason.
    """
    try:
        _, _, best_guesses = cld2_detect(text, bestEffort=True)
        return best_guesses[0][1] == "en"
    except Exception:
        # deliberate catch-all: callers get a plain boolean, never an error
        return False
コード例 #5
0
ファイル: text_utils.py プロジェクト: henningko/textacy
def detect_language(text):
    """
    Detect the most likely language of a text and return its 2-letter code
    (see https://cloud.google.com/translate/v2/using_rest#language-params).
    Uses the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_ package;
    to take advantage of optional params, call :func:`cld2.detect()` directly.

    If the detector marks its result as unreliable, a warning listing its best
    guesses is printed to stdout and the top guess is returned anyway.

    Args:
        text (str): Text to analyze. Non-string values are converted with
            ``str()`` before detection.

    Returns:
        str
    """
    if PY2:
        # ``type(u'')`` is ``unicode`` on Python 2; spelled this way so the
        # block never references a name that does not exist on Python 3
        unicode_type = type(u'')
        if not isinstance(text, (bytes, unicode_type)):
            text = str(text)
        # Encode unicode text straight to UTF-8. Going through ``str(text)``
        # first would push it through the ASCII codec and fail on any
        # non-ASCII character; byte strings are passed through untouched for
        # the same reason (``.encode`` on them implies an ASCII decode).
        if isinstance(text, unicode_type):
            text = text.encode('utf8')
        is_reliable, _, best_guesses = cld2_detect(text, bestEffort=True)
    else:
        is_reliable, _, best_guesses = cld2_detect(str(text), bestEffort=True)
    if is_reliable is False:
        msg = '**WARNING: Text language detected with low confidence; best guesses: {}'
        print(msg.format(best_guesses))
    # each guess carries its language code in the second position
    return best_guesses[0][1]
コード例 #6
0
def detect_language(text):
    """
    Determine which language ``text`` is most likely written in and return its
    2-letter code (see https://cloud.google.com/translate/v2/using_rest#language-params).
    Built on the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_ package;
    call :func:`cld2.detect()` directly to make use of its optional params.

    If the detector reports that its result is unreliable, a warning with all
    of its best guesses is logged; the top guess is returned either way.

    Args:
        text (str)

    Returns:
        str
    """
    # on Python 2 the detector is given a bytes version of the text
    detector_input = unicode_to_bytes(text) if PY2 else text
    reliable, _, candidates = cld2_detect(detector_input, bestEffort=True)
    if reliable is False:
        logger.warning(
            'Text language detected with low confidence; best guesses: %s', candidates
        )
    top_candidate = candidates[0]
    return top_candidate[1]