def verify_code(self): timestemp = time.time() imgurl = 'http://api.chaxun.la/api/seccode/?0.{0}'.format(timestemp) if self.download(imgurl): captcha = Captcha() code_result = captcha.verification(filename='captcha.gif') self.verify = code_result.get('Result')
class Sitedossier(object): """docstring for Sitedossier""" def __init__(self, domain): super(Sitedossier, self).__init__() self.domain = domain self.captcha = Captcha() self.subset = [] def run(self): try: url = 'http://www.sitedossier.com/parentdomain/{0}'.format(self.domain) r = self.get_content(url) self.parser(r) return list(set(self.subset)) except Exception as e: return self.subset def get_content(self, url): r = http_request_get(url).text if self.human_act(r) is True: return r else: self.get_content(url) def parser(self, response): npage = re.search('<a href="/parentdomain/(.*?)"><b>Show', response) if npage: for sub in self.get_subdomain(response): self.subset.append(sub) nurl = 'http://www.sitedossier.com/parentdomain/{0}'.format(npage.group(1)) response = self.get_content(nurl) self.parser(response) else: for sub in self.get_subdomain(response): self.subset.append(sub) def get_subdomain(self, response): domain = re.compile(r'(?<=<a href\=\"/site/).*?(?=\">)') for sub in domain.findall(response): yield sub def human_act(self, response): if 'auditimage' in response or 'blacklisted' in response: imgurl = self.get_audit_img(response) if imgurl is not None: ret = self.captcha.verification(imgurl) if ret.has_key('Result'): self.audit(ret['Result']) return True else: raise Exception("captcha_verification_is_empty") else: raise Exception("audit_img_is_empty") else: return True def audit(self, code): payload = {'w':code} url = 'http://www.sitedossier.com/audit' r = http_request_post(url, payload=payload) def get_audit_img(self, response): auditimg = re.compile(r'(?<=<img src\=\"/auditimage/).*?(?=\?" alt="Please)') imgurl = auditimg.findall(response)[0:] if len(imgurl) >= 1: imgurl = 'http://www.sitedossier.com/auditimage/{0}'.format(imgurl[0]) return imgurl else: return None def __str__(self): handler = lambda e: str(e) return json.dumps(self, indent=2, default=handler)