def login(self):
    """Type the account and password into the login form and submit it.

    Steps, in order:

    1. Locate the account field via ``self.acc_xpath``, clear it on a
       best-effort basis and type ``self.acc``.
    2. Do the same for the password field (``self.pwd_xpath`` /
       ``self.pwd``).
    3. If ``self.cap_xpath`` is truthy, locate the captcha field and type
       the text returned by ``self.get_captcha_text()``.
    4. Call ``self.enter()``; when it returns a falsy value, press ENTER in
       the last field that was filled in.

    Returns:
        None.
    """
    acc_el = self.wait_el_presence_by_xpath(self.acc_xpath)
    try:
        acc_el.clear()  # remove any text left over in the account field
    except Exception:
        # Clearing is best-effort: log it and keep going.
        warning('acc clear error')
    acc_el.send_keys(self.acc)
    time.sleep(1.5)

    pwd_el = self.wait_el_presence_by_xpath(self.pwd_xpath)
    try:
        pwd_el.clear()  # remove any text left over in the password field
    except Exception:
        warning('pwd clear error')
    pwd_el.send_keys(self.pwd)
    end_el = pwd_el  # last field typed into; receives the final ENTER
    time.sleep(1.5)

    # Alphanumeric (text) captcha, only when an XPath is configured for it.
    if self.cap_xpath:
        cap_el = self.wait_el_presence_by_xpath(self.cap_xpath)
        cap_text = self.get_captcha_text()
        cap_el.send_keys(cap_text)
        end_el = cap_el

    # The original comment says self.enter() deals with the slider captcha
    # (not visible here); if it reports nothing done, submit with ENTER.
    if not self.enter():
        end_el.send_keys(Keys.ENTER)
def login(self):
    """Submit account and password first, then handle an optional captcha.

    Steps, in order:

    1. Locate the account field via ``self.acc_xpath``, clear it on a
       best-effort basis and type ``self.acc``.
    2. Do the same for the password field (``self.pwd_xpath`` /
       ``self.pwd``), then press ENTER in it and wait two seconds.
    3. If ``self.cap_xpath`` is truthy, try to locate the captcha field and
       type the text from ``self.get_captcha_text()``. A Selenium
       ``TimeoutException`` here is treated as "no captcha shown".
    4. Call ``self.enter()``; when it returns a falsy value, press ENTER in
       the last field that was filled in.

    Returns:
        None.
    """
    acc_el = self.wait_el_presence_by_xpath(self.acc_xpath)
    try:
        acc_el.clear()  # remove any text left over in the account field
    except Exception:
        # Clearing is best-effort: log it and keep going.
        warning('acc clear error')
    acc_el.send_keys(self.acc)
    time.sleep(1.5)

    pwd_el = self.wait_el_presence_by_xpath(self.pwd_xpath)
    try:
        pwd_el.clear()  # remove any text left over in the password field
    except Exception:
        warning('pwd clear error')
    pwd_el.send_keys(self.pwd)
    end_el = pwd_el  # last field typed into; receives the final ENTER
    time.sleep(1.5)

    # First submit attempt, before any captcha handling.
    end_el.send_keys(Keys.ENTER)
    time.sleep(2)

    # Alphanumeric (text) captcha, only when an XPath is configured for it.
    if self.cap_xpath:
        try:
            cap_el = self.wait_el_presence_by_xpath(self.cap_xpath)
            cap_text = self.get_captcha_text()
            cap_el.send_keys(cap_text)
            end_el = cap_el
        except selenium.common.exceptions.TimeoutException:
            # The captcha element never appeared.
            print('---暂无验证码')

    # The original comment says self.enter() deals with the slider captcha
    # (not visible here); if it reports nothing done, submit with ENTER.
    if not self.enter():
        end_el.send_keys(Keys.ENTER)
def get_data(self, url, headers, timeout=30, data=None):
    """Request ``url`` and return the response body as text.

    A falsy ``data`` results in a GET request; any other value is sent as
    the body of a POST request. Certificate verification is turned off for
    both.

    Args:
        url: Address to request.
        headers: Headers passed to ``requests``.
        timeout: Timeout passed to ``requests`` (default 30).
        data: Optional POST payload.

    Returns:
        ``response.text`` on success, or ``False`` when the request raised
        (the error is logged through ``warning``).
    """
    shared_kwargs = dict(headers=headers, verify=False, timeout=timeout)
    try:
        if data:
            response = requests.post(url, data=data, **shared_kwargs)
        else:
            response = requests.get(url, **shared_kwargs)
    except Exception as err:
        warning('访问报错 plat: {} || {}'.format(self.plat, err))
        return False
    return response.text
def login_task(self, task):
    """Run the login flow for a single task and always release its object.

    The handler class is looked up in ``LOGIN_MAPPING`` by ``task['plat']``
    and instantiated with a shallow copy of ``task`` as keyword arguments.
    Its ``run()`` is called and ``close()`` is then invoked in a ``finally``
    clause, so the object is closed even when ``run()`` raises.

    Args:
        task: Mapping describing the task; must contain the key ``'plat'``.

    Returns:
        None. Any ``Exception`` raised while building, running or closing
        the handler is logged through ``warning`` and not re-raised.

    Raises:
        KeyError: If ``task`` has no ``'plat'`` key (raised before the
            guarded section, as in the original code).
    """
    plat = task['plat']
    try:
        task = copy(task)
        plat_obj = LOGIN_MAPPING[plat](**task)
        try:
            plat_obj.run()
        finally:
            # Close even if run() failed, so the handler is never left open.
            plat_obj.close()
    except Exception as e:
        warning('plat: {} || login_error || {}'.format(plat, e))
def after_login(self):
    """Click the post-login button located by a fixed XPath.

    Waits two seconds, tries to locate the button and click it through an
    ``ActionChains`` sequence, then waits three more seconds. A failure to
    find or click the button is logged and otherwise ignored.

    Returns:
        None.
    """
    button_xpath = '//*[@id="app"]/div/div/div[2]/div/div/div/div[3]/button'

    time.sleep(2)
    try:
        button = self.wait_el_presence_by_xpath(button_xpath)
        actions = ActionChains(self.browser)
        actions.move_to_element(button)
        actions.click()
        actions.perform()
    except Exception:
        warning('bilibili miss click_login_redirct')
    time.sleep(3)
def crawl_task(self, task):
    """Run the crawl for a single task, then close the extractor.

    The extractor class is looked up in ``EXTRACT_MAPPING`` by
    ``task['plat']`` and built with ``task`` as keyword arguments. An
    ``Exception`` from ``start_crawl()`` is logged (including
    ``task['acc']``) and not re-raised; ``close()`` is called afterwards
    either way.

    Args:
        task: Mapping describing the task; ``'plat'`` is required, and
            ``'acc'`` is read when an error is logged.

    Returns:
        None.
    """
    platform = task['plat']
    extractor = EXTRACT_MAPPING[platform](**task)
    try:
        extractor.start_crawl()
    except Exception as err:
        message = 'plat: {} || crawl_data_error || {} || reason: {}'
        warning(message.format(platform, task['acc'], err))
    extractor.close()
def get_captcha_text(self):
    """Screenshot the page, crop out the captcha image and recognise it.

    The captcha element is located by the XPath ``//*[@id="img-captcha"]``.
    A full screenshot is saved to ``DIR_ + 'screenshot.png'``, cropped to
    the element's box and written to ``DIR_ + 'baitong.png'``, which is
    then handed to ``YDMHttp().run``.

    Returns:
        Whatever ``YDMHttp().run`` returns for the cropped image (also
        logged through ``warning``).
    """
    element = self.wait_el_presence_by_xpath('//*[@id="img-captcha"]')

    # Read location and size once each so the crop box comes from a single
    # consistent reading instead of repeated property reads.
    location = element.location
    size = element.size
    left, top = location['x'], location['y']
    crop_box = (left, top, left + size['width'], top + size['height'])

    screenshot_path = DIR_ + 'screenshot.png'
    captcha_path = DIR_ + 'baitong.png'

    self.browser.save_screenshot(screenshot_path)
    # The context manager closes the screenshot file deterministically.
    with Image.open(screenshot_path) as page_image:
        page_image.crop(crop_box).save(captcha_path)

    captcha = YDMHttp().run(captcha_path)
    warning('baitong || captcha: {}'.format(captcha))
    return captcha
def start_crawl(self):
    """Run the crawl workflow, starting from ``self.first_page``.

    Steps, in order:

    1. Pass ``self.get_cookie()`` to ``self.set_headers``.
    2. Fetch the first page with ``self.get_next_page_data``. If the result
       is falsy, log a warning and stop.
    3. Decode the result as JSON, store ``self.get_total_nums(data)`` in
       ``self.total_nums``, call ``self.parse`` and finally
       ``self.deal_end``.

    Returns:
        None.

    Raises:
        ValueError: If the fetched payload is not valid JSON
            (``json.JSONDecodeError`` on Python 3).
    """
    self.set_headers(self.get_cookie())
    data = self.get_next_page_data(self.first_page)
    if not data:
        warning(
            'plat: {}|| classes->extract_data->start_crawl || miss start_crawl_data'
            .format(self.plat))
        return

    # json.loads() lost its ``encoding`` keyword in Python 3.9 (it had been
    # ignored since 3.1); passing it now raises TypeError.
    data = json.loads(data)
    self.total_nums = self.get_total_nums(data)
    self.parse(data, self.first_page, self.headers)
    self.deal_end()
def get_data(self, url, headers, timeout=30, data=None):
    """Request ``url`` through a proxy, falling back to a direct request.

    The first attempt uses the proxies returned by ``self.get_proxy()``.
    If that attempt raises, or answers with a status code other than 200,
    the same request is repeated once without a proxy. The status code of
    the direct retry is not checked.

    A falsy ``data`` results in a GET request; any other value is sent as
    the body of a POST request. Certificate verification is turned off.

    Args:
        url: Address to request.
        headers: Headers passed to ``requests``.
        timeout: Timeout passed to ``requests`` (default 30).
        data: Optional POST payload.

    Returns:
        ``res.text`` of whichever attempt succeeded, or ``False`` when the
        direct retry raised as well (the error is logged through
        ``warning``).
    """

    def _request(**extra):
        # A falsy ``data`` selects GET; anything else is POSTed.
        if not data:
            return requests.get(url, headers=headers, verify=False,
                                timeout=timeout, **extra)
        return requests.post(url, headers=headers, verify=False,
                             data=data, timeout=timeout, **extra)

    try:
        proxy = self.get_proxy()
        res = _request(proxies=proxy)
        if res.status_code != 200:
            # Treat a non-200 answer like a proxy failure to trigger the retry.
            raise Exception('proxy error')
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit are not turned into a silent retry.
        try:
            res = _request()
        except Exception as e:
            warning('访问报错 plat: {} || {}'.format(self.plat, e))
            return False
    return res.text
def to_login(self):
    """Run one login attempt and retry while ``after_login`` reports failure.

    Each attempt prints a progress line, deletes all browser cookies and
    runs ``before_login`` -> ``login`` -> ``after_login`` with two-second
    pauses in between. ``after_login()`` returning exactly ``False`` counts
    as a failed attempt:

    * while ``self.mark`` is below ``self.login_retry_times``, ``self.mark``
      is incremented and the method calls itself again;
    * once the retries are used up, a failure warning is logged.

    Any other return value (including ``None``) ends the method quietly.
    The failure warning is therefore only logged for a failed attempt, not
    for a successful attempt that happens to be the last allowed one.

    Returns:
        None.
    """
    print('---plat:{}, user:{}, mark:{}, login_retry_time:{}'.format(
        self.plat, self.acc, self.mark, self.login_retry_times))
    self.browser.delete_all_cookies()
    self.before_login()
    time.sleep(2)
    self.login()
    time.sleep(2)
    res = self.after_login()

    if res is not False:
        # Not an explicit failure: nothing to retry and nothing to report.
        return

    if self.mark < self.login_retry_times:
        self.mark += 1
        self.to_login()
    else:
        # Explicit failure with no retries left.
        warning('>>>>login faile, plat:{}, user:{}'.format(
            self.plat, self.acc))
def __init__(self,
             first_page=1,
             channel_category_id=None,
             channel_id=None,
             agent_id=None,
             plat='',
             acc=''):
    """Store the crawl parameters and validate the required identifiers.

    Args:
        first_page: Page to start from (default 1).
        channel_category_id: Required; must be truthy.
        channel_id: Required; must be truthy.
        agent_id: Required; must be truthy.
        plat: Platform name; also the single key of ``self.ad_data``.
        acc: Account name.

    Raises:
        AssertionError: If any of ``channel_category_id``, ``channel_id``
            or ``agent_id`` is falsy. The error is logged through
            ``warning`` before being raised.
    """
    self.first_page = first_page
    self.plat = plat
    self.acc = acc
    self.total_nums = 0
    self.headers = {}
    # Creation time as a local, naive timestamp string.
    self.time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    self.ad_data = {self.plat: []}
    self.channel_category_id = channel_category_id
    self.channel_id = channel_id
    self.agent_id = agent_id

    # Explicit check instead of ``assert`` so it still runs under
    # ``python -O``; AssertionError is kept for callers that catch it.
    if not (channel_category_id and channel_id and agent_id):
        error = AssertionError(
            'classes->extract_data->__init__: channel_id')
        warning(error)
        raise error
Tasks(process_nums=num, type='extra').run() def run_plats_to_login(num=PROCESS_NUMS): # plats = PLATS_INFO # now = time.strftime("%Y-%m-%d %H:%M:%S") # print('\n>>>>{}--{}'.format(now, plats)) # Tasks(process_nums=num).run(plats) Tasks(process_nums=num).run() if __name__ == '__main__': import sys if len(sys.argv) < 2: print('####------set argv----------####') sys.exit() try: if sys.argv[1] == 'login': run_plats_to_login() # # os.system("kill -9 $(ps -ef|grep chrom|grep -v grep|awk '{print $2}')") # os.system("kill -9 $(ps -ef|grep phantomjs|grep -v grep|awk '{print $2}')") elif sys.argv[1] == 'extract': crawl_plats_data() else: warning('sys.argv[1] not in [login, extract], invalid argument') except: content = 'location:{}, error:{}'.format(sys.argv[0], traceback.format_exc()) warning(content)