コード例 #1
0
ファイル: utils.py プロジェクト: alou4agithub/Xpath
def detect_cloudflare_protection(response):
    """Report whether ``response`` carries a Cloudflare obfuscation marker.

    Some websites use Cloudflare to hide data such as e-mail addresses so
    that automated bots cannot read it. The text is checked, in order, for:

    1. a ``data-cfemail="..."`` attribute,
    2. a ``<script data-cfasync="false" src="...cloudflare...">`` tag,
    3. a ``>[... protected]`` placeholder.

    Returns ``True`` as soon as one marker is found, otherwise ``False``.
    An empty or ``None`` response is reported as not protected.
    """
    if not response:
        return False

    # Order matters only for short-circuiting; any single hit is sufficient.
    markers = (
        r'(?is)(?:data-cfemail="(?P<xpath_data>(.+?))")',
        r'(?is)(?:<script\sdata-cfasync="false"\ssrc="(.+?)cloudflare(.+?)"></script>)',
        r"(?is)(?:>\[(.+?)\sprotected\])",
    )
    return any(re.search(marker, response) for marker in markers)
コード例 #2
0
def search_regex(
    pattern,
    string,
    default=NO_DEFAULT,
    fatal=True,
    flags=0,
    group=None,
):
    """
    Search ``string`` with a single pattern or a sequence of patterns and
    return a value taken from the first pattern that matches.

    Args:
        pattern: a regex string, or an iterable of regex strings that are
            tried in order until one matches.
        string: the text to search.
        default: value returned when nothing matches. Leave it as
            ``NO_DEFAULT`` to get the warning path instead.
        fatal: accepted for interface compatibility. It currently has no
            effect: a failed search logs the same warning either way.
        flags: ``re`` flags forwarded to ``re.search``.
        group: group index/name to extract. When ``None``, the first
            capture group that is not ``None`` is used.

    Returns:
        The extracted value after it has been passed through
        ``value_cleanup``. If the extracted value is empty or missing, the
        placeholder ``"<blank_value>"`` is cleaned up and returned instead.
        If nothing matches, ``default`` is returned when one was supplied;
        otherwise a warning is logged and ``None`` is returned.
    """
    patterns = [pattern] if isinstance(pattern, str) else pattern

    # Start from "no match" so that an empty pattern sequence falls through
    # to the default/warning handling instead of raising UnboundLocalError.
    mobj = None
    for p in patterns:
        mobj = re.search(p, string, flags)
        if mobj:
            break

    if mobj:
        if group is None:
            # First capture group that took part in the match. The None
            # default avoids a StopIteration when the pattern has no such
            # group; the placeholder below then takes over.
            value = next((g for g in mobj.groups() if g is not None), None)
        else:
            value = mobj.group(group)
            # A group that did not participate yields None, which re.sub
            # cannot process; only strip leading "(" from real text.
            if value is not None:
                value = re.sub(r"^\(+", "", value)
        if not value:
            value = "<blank_value>"
        return value_cleanup(value)

    if default is not NO_DEFAULT:
        return default

    # Both the fatal and non-fatal cases only warn; no exception is raised.
    logger.warning("unable to filter out values..")
    return None