def detect_cloudflare_protection(response):
    """
    Report whether ``response`` shows signs of Cloudflare data obfuscation.

    Some websites use Cloudflare to hide data such as email addresses from
    automated bots. The markup is tested, in order, for:

    * a ``data-cfemail="..."`` attribute,
    * a ``<script data-cfasync="false" src="...cloudflare...">`` tag,
    * a ``>[... protected]`` placeholder text.

    Returns ``True`` as soon as one of them is found, ``False`` when none is
    found or when ``response`` is empty/``None``.
    """
    if not response:
        return False

    # Kept in the original probing order; any() stops at the first hit.
    signatures = (
        r'(?is)(?:data-cfemail="(?P<xpath_data>(.+?))")',
        r'(?is)(?:<script\sdata-cfasync="false"\ssrc="(.+?)cloudflare(.+?)"></script>)',
        r"(?is)(?:>\[(.+?)\sprotected\])",
    )
    return any(re.search(signature, response) for signature in signatures)
def search_regex(
    pattern,
    string,
    default=NO_DEFAULT,
    fatal=True,
    flags=0,
    group=None,
):
    """
    Search ``string`` with a single pattern or a sequence of patterns.

    Patterns are tried in order and the first one that matches wins.

    Args:
        pattern: A regex string, or an iterable of regex strings.
        string: The text to search.
        default: Value returned when nothing matches. When left as
            ``NO_DEFAULT`` a warning is logged instead.
        fatal: Accepted for interface compatibility. Both settings currently
            log the same warning on a miss; no exception is raised.
        flags: ``re`` flags forwarded to ``re.search``.
        group: Group index/name to extract. When ``None`` the first capturing
            group that took part in the match is used, falling back to the
            whole match if there is no such group.

    Returns:
        The extracted text with leading ``(`` characters stripped and passed
        through ``value_cleanup``; an empty extraction is replaced by
        ``"<blank_value>"`` before cleanup. On a miss, ``default`` if one was
        supplied, otherwise ``None`` (after logging a warning).
    """
    patterns = (pattern,) if isinstance(pattern, str) else pattern

    # Start from "no match" so an empty pattern sequence reaches the
    # default/warning path instead of raising UnboundLocalError.
    mobj = None
    for candidate in patterns:
        mobj = re.search(candidate, string, flags)
        if mobj:
            break

    if mobj:
        if group is None:
            # A pattern without capturing groups (or whose groups did not
            # participate) would make a bare next() raise StopIteration, so
            # the whole match is used as the fallback.
            value = next(
                (g for g in mobj.groups() if g is not None),
                mobj.group(0),
            )
        else:
            value = mobj.group(group)
        if value is None:
            # The requested optional group did not participate in the match;
            # treat it like an empty extraction rather than crashing re.sub.
            value = ""
        value = re.sub(r"^\(+", "", value)
        if not value:
            value = "<blank_value>"
        return value_cleanup(value)

    if default is not NO_DEFAULT:
        return default

    # The fatal and non-fatal paths were identical in the original code:
    # both only log this warning and yield None.
    logger.warning("unable to filter out values..")
    return None