def run(self):
    """Walk every account stored in Redis and generate a fresh cookie for it."""
    pool = RedisPool()
    # NOTE(review): this logs the plaintext password — consider masking it.
    for username, password in pool.get_accounts().items():
        logger.info(f'当前生成cookies账号为:{username},密码:{password}')
        self.get_cookie(username, password)
def recieve_cookie():
    """Flask endpoint: receive a cookie pushed by a remote generator and persist it.

    Expects a POST body of the form ``{"data": {"cookie": ..., "username": ...}}``;
    the cookie is stored in Redis with a retry counter of 3.

    Returns a JSON string with ``code`` 200 on success, 500 on any failure.
    (Function name keeps the historical "recieve" spelling — it is a routed
    endpoint name that callers depend on.)
    """
    if request.method == 'POST':
        try:
            params = json.loads(request.data)
            data = params['data']
            cookie = data['cookie']
            username = data['username']
            con = RedisPool()
            con.save_cookie(username,
                            json.dumps({
                                'cookie': cookie,
                                'counter': 3
                            }))
            # FIX: success is now logged only after the cookie is actually
            # persisted (previously it was logged before the payload was parsed,
            # so failures were also preceded by a bogus success log line).
            logger.info({'code': 200, 'msg': 'cookies接收成功!'})
            return json.dumps({'code': 200, 'msg': 'cookies接收成功!'})
        except Exception as e:
            # warning level for failures, consistent with get_cookie()'s style
            logger.warning({'code': 500, 'msg': f'错误信息{e}'})
            return json.dumps({'code': 500, 'msg': f'错误信息{e}'})
def run(self):
    """Entry point: seed Redis with accounts, then start the enabled worker processes."""
    # Import the configured accounts into Redis.
    con = RedisPool()
    con.save_account(ACCOUNTS)
    # Start the cookie generator process.
    if GENERATOR_PROCESS:
        # FIX: was print(f'...') — an f-string with no placeholders, and
        # inconsistent with the logger-based output used elsewhere.
        logger.info('开启cookies生成器!')
        generate_process = Process(target=Scheduler.generate_cookie)
        generate_process.start()
    # # 开启验证 (cookie validation — currently disabled)
    # if VALID_PROCESS:
    #     logger.info(f'开启cookies验证!')
    #     valid_process = Process(target=Scheduler.valid_cookie)
    #     valid_process.start()
    # Start the Flask API process.
    if API_PROCESS:
        logger.info('开启flask工程成功!')
        api_process = Process(target=Scheduler.api)
        api_process.start()
def get_cookies(self, account=''):
    """Fetch one cookie set from the Redis cookie pool and return it as a dict.

    :param account: optional account name; empty string lets the pool choose.
    :return: cookie dict usable by requests, or None when the pool is empty.
    Side effect: records the chosen account on ``self.account``.
    """
    con = RedisPool()
    cookies_pool = con.cookies_pool(account)
    if cookies_pool is None:
        print('cookies池为空!')
        return None
    self.account = cookies_pool['account']
    token = cookies_pool['cookie_token']  # retrieved but unused here; kept so a missing key fails loudly
    cookie_str = cookies_pool['cookie_str']
    # Randomize the `unb` user id embedded in the cookie to improve reuse.
    # FIX: range(10) so every digit 0-9 can appear — the original used
    # range(9), which silently excluded the digit 9.
    unb_random = ''.join(str(random.choice(range(10))) for _ in range(13))
    pattern = r'unb=(\d+)'
    cookie_unb = re.sub(pattern=pattern,
                        repl='unb=' + unb_random,
                        string=cookie_str)
    # Parse "k1=v1; k2=v2" into a dict.
    # FIX: strip each fragment — splitting on ';' alone left a leading space
    # on every key after the first ("; name=value" -> key " name").
    cookie_dict = dict(
        tuple(x.strip().split('=', 1))
        for x in cookie_unb.split(';') if x.strip())
    print(f'当前账号:{self.account}')
    return cookie_dict
def request_crawler():
    """Flask endpoint: accept a crawl task.

    Two POST payload shapes are supported:
      * ``{"data": {"shop_link": ..., "id": ...}}`` — schedule the scrapyd spider;
      * ``{"data": {"url": ...}}`` — fetch and parse basic shop info immediately.

    Returns a JSON string with ``code`` 200/500.
    """
    if request.method == 'POST':
        params = request.json
        logger.info(f'接收到参数:{params}')
        data = params['data']
        con = RedisPool()
        # Schedule the scrapy spider through scrapyd.
        if 'shop_link' in data.keys():
            try:
                url = data['shop_link']
                print('需要采集的店铺:', url)
                post_data = {}
                s_id = data['id']
                # todo: what sort order is '_sale' exactly? (original TODO)
                sort_type = data.get('sort_type', '_sale')
                post_data['project'] = 'DataItem'
                # Spider to launch.
                post_data['spider'] = 'NewTbShopSpider'
                # Row id of the shop in the shop_fetch_tasks table.
                post_data['s_id'] = s_id
                post_data['sort_type'] = sort_type
                r = requests.post('http://localhost:6800/schedule.json',
                                  data=post_data,
                                  timeout=50)
                # FIX: logger.info('x:', value) silently drops the extra
                # argument (loguru formats with {} placeholders), losing the
                # scrapyd response / job id from the logs — use f-strings.
                logger.info(f'scrapyd状态:{r.text}')
                # Job id assigned by scrapyd.
                jobid = r.json()['jobid']
                logger.info(f'当前任务ID:{jobid}')
                con.hset('all_shop', s_id, json.dumps(data, ensure_ascii=False))
                con.hset('store', jobid, json.dumps(data, ensure_ascii=False))
                logger.info(f'采集爬虫启动成功!详情:{r.text}')
                logger.info({'code': 200, 'msg': '成功接收任务!'})
                return json.dumps({'code': 200, 'msg': '成功接收任务!'})
            except Exception as e:
                logger.info({'code': 500, 'msg': f'错误信息:{e}'})
                return json.dumps({'code': 500, 'msg': f'错误信息:{e}'})
        # Fetch and parse basic shop information synchronously.
        elif 'url' in data.keys():
            try:
                url = data['url']
                print('需要采集的店铺:', url)
                r = requests.get(url)
                s = Selector(text=r.text)
                # Shop id embedded in the page.
                shopId = s.css('#LineZing::attr(shopid)').get()
                shopName = ''
                sellerName = ''
                # Taobao shop: several page layouts, try selectors in turn.
                if 'taobao.com' in url:
                    if s.css('.hd-shop-name a::text').get():
                        shopName = s.css('.hd-shop-name a::text').get()
                    elif s.css('.first-block .shop-name span::text').get():
                        shopName = s.css(
                            '.first-block .shop-name span::text').get()
                    else:
                        shopName = ''
                    if s.css('.tb-box-half.tb-seller-info label::text').get():
                        sellerName = s.css(
                            '.tb-box-half.tb-seller-info label::text').get(
                            ).strip()
                    elif s.css('.seller-name::text').get():
                        sellerName = s.css('.seller-name::text').get().strip(
                            '掌柜:')
                    elif s.css('.shop-more-info p.info-item:nth-child(2)::text'
                               ).get():
                        sellerName = s.css(
                            '.shop-more-info p.info-item:nth-child(2)::text'
                        ).get().strip()
                    else:
                        sellerName = ''
                # Tmall shop.
                elif 'tmall.com' in url:
                    if s.css('.hd-shop-name a::text').get():
                        shopName = s.css('.hd-shop-name a::text').get()
                    else:
                        shopName = s.css('.slogo-shopname strong::text').get()
                    # Tmall convention: seller name equals the shop name.
                    sellerName = shopName
                    # if s.css('.tb-box-half.tb-seller-info label::text').get():
                    #     sellerName = s.css('.tb-box-half.tb-seller-info label::text').get().strip()
                    # else:
                    #     sellerName = s.css('.shopkeeper div a::text').get()
                # Basic shop record.
                shopInfo = {
                    'shopname': shopName,
                    'shopid': shopId,
                    'sellername': sellerName
                }
                print(f'采集店铺基本信息:{shopInfo}')
                # NOTE(review): field/value order looks inverted here (the
                # JSON blob is used as the hash field, the url as the value)
                # — confirm against RedisPool.hset before changing.
                con.hset('ShopInfoQuery',
                         json.dumps(shopInfo, ensure_ascii=False), url)
                logger.info({
                    'code': 200,
                    'msg': '店铺基本信息获取成功!',
                    'shop_info': shopInfo
                })
                return json.dumps(
                    {
                        'code': 200,
                        'msg': '店铺基本信息获取成功!',
                        'shop_info': shopInfo
                    },
                    ensure_ascii=False)
            except Exception as e:
                logger.info({'code': 500, 'msg': f'错误信息:{e}'})
                return json.dumps({'code': 500, 'msg': f'错误信息:{e}'})
        else:
            logger.info({'code': 500, 'msg': '任务接收失败,请检查参数!'})
            return json.dumps({'code': 500, 'msg': '任务接收失败,请检查参数!'})
def get_cookie(self, username, password):
    """Log in to Taobao via linked Weibo SSO with Selenium and harvest cookies
    from the mobile (m.taobao) site, saving them to Redis.

    :param username: Weibo account name.
    :param password: Weibo account password.
    Side effects: drives a Chrome browser; on success stores the cookie string
    via RedisPool.save_cookie; always quits the browser and sleeps 1-4s.
    """
    option = options.Options()
    option.add_argument("disable-infobars")  # hide the info bar
    option.add_experimental_option('excludeSwitches',
                                   ['enable-automation'])  # hide automation flag to reduce bot detection
    option.add_argument('log-level=3')
    # option.add_argument("--proxy-server=http://114.239.254.76:4236")  # proxy
    # option.add_argument('--headless')  # headless mode
    # option.add_argument("window-size=2436, 1125")
    # option.add_argument("--no-sandbox")
    # executable_path = os.path.join(os.getcwd(), 'chromedriver.exe')  # chromedriver path
    # browser = webdriver.Chrome(executable_path=executable_path, options=option)
    browser = webdriver.Chrome(options=option)
    browser.implicitly_wait(20)  # implicit wait for element lookups
    try:
        browser.get('https://weibo.com/')
        browser.maximize_window()
        # Wait for the Weibo login form, then type the credentials.
        wb_locator = '.login_innerwrap .W_login_form[node-type=normal_form] input[name=username]'
        WebDriverWait(browser, 300, 0.5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, wb_locator)))
        logger.info('当前步骤:输入账号密码!')
        browser.find_element_by_css_selector(
            '.login_innerwrap [node-type=normal_form] input[name=username]'
        ).send_keys(username)
        browser.find_element_by_css_selector(
            '.login_innerwrap [node-type=normal_form] input[name=password]'
        ).send_keys(password)
        # Small delay before submitting.
        time.sleep(3)
        yundama = Yundama()
        sel = Selector(text=browser.page_source)
        verify_img = sel.css(
            '[action-type=btn_change_verifycode]::attr(src)').get()
        if not verify_img:
            # No captcha: submit directly.
            browser.find_element_by_css_selector(
                '.login_innerwrap [node-type=normal_form] .W_btn_a').click(
                )
        else:
            # Captcha present: screenshot it and solve via the Yundama service.
            print('出现验证码,开始识别验证码')
            pic = browser.find_element_by_xpath(
                '//*[@id="pl_login_form"]/div/div[3]/div[3]/a/img')
            pic.screenshot('yzm.png')  # element-level screenshot
            time.sleep(1)
            result = yundama.identify(file='yzm.png')
            if not result:
                print('验证码识别失败, 跳过识别')
                return
            else:
                browser.find_element_by_xpath(
                    '//*[@id="pl_login_form"]/div/div[3]/div[3]/div/input'
                ).send_keys(result)
                time.sleep(1)
                browser.find_element_by_css_selector(
                    '.login_innerwrap [node-type=normal_form] .W_btn_a').click()
        logger.info('当前步骤:登陆淘宝!')
        # Wait until the Weibo feed confirms login, then open Taobao login
        # and choose "sign in with Weibo".
        wb_login_locator = '.WB_feed_detail'
        WebDriverWait(browser, 300, 0.5).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, wb_login_locator)))
        browser.get('https://login.taobao.com')
        try:
            browser.find_element_by_css_selector('#J_Quick2Static').click()
        except Exception as e:
            # Toggle to the static form may be absent; log and continue.
            logger.warning(f'错误信息{e}')
        browser.find_element_by_css_selector('.weibo-login').click()
        time.sleep(1)
        # If the quick-login confirmation appears, click it; otherwise re-enter
        # the Weibo password.
        # NOTE(review): find_element_* raises when the element is missing, so
        # the elif branch is only reachable if the first lookup returns a
        # falsy-but-present element — confirm this flow actually works.
        if browser.find_element_by_css_selector('.logged_info .W_btn_g'):
            # tb_submit_locator = '.logged_info .W_btn_g'
            # WebDriverWait(browser, 300, 0.5).until(EC.presence_of_element_located((By.CSS_SELECTOR, tb_submit_locator)))
            browser.find_element_by_css_selector(
                '.logged_info .W_btn_g').click()
        elif browser.find_elements_by_css_selector(
                '[node-type=submitStates]'):
            browser.find_element_by_css_selector('.enter_psw').send_keys(
                password)
            browser.find_element_by_css_selector(
                '[node-type=submitStates]').click()
            # NOTE(review): returning here aborts before any cookie is saved.
            return
        time.sleep(2)
        # Wait for Taobao desktop login to complete, then open the mobile site.
        tb_locator = '.logo-bd'
        WebDriverWait(browser, 300, 0.5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, tb_locator)))
        browser.get('https://h5.m.taobao.com')
        # Once the mobile homepage is loaded, grab the cookies and store them.
        m_tb_locator = '.header-bd'
        WebDriverWait(browser, 300, 0.5).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, m_tb_locator)))
        cookies = browser.get_cookies()
        result = self.cookie_handler(cookies)
        cookie_str = result[0]
        logger.info('成功获取到cookies!')
        # Persist to redis -> cookies_pool.
        con = RedisPool()
        con.save_cookie({"username": username, "cookie": cookie_str})
        browser.quit()
        # Cookie transfer: when running locally, push cookies to production.
        # if LOCAL:
        #     url = 'http://47.56.68.237:8888/recieve_cookie'
        #     headers = {"Content-Type": "application/json"}
        #     data = {
        #         'data': {'cookie': cookie_str, 'username': username, "token": "ce934bc118beedabd789ed5cf6a20dc7"}}
        #     try:
        #         r = requests.post(url=url, headers=headers, json=data)
        #
        #         logger.info(f'状态码:{r.status_code},响应内容:{json.loads(r.text)}')
        #     except Exception as e:
        #         logger.warning(f'cookies传输出错,{e}')
    except Exception as e:
        logger.warning(f'cookie获取失败: {e}')
    finally:
        # Quit is idempotent, so the duplicate quit above is harmless.
        browser.quit()
        # Random pause before the next account to reduce anti-bot detection.
        # todo: too many logins in a row get the IP banned
        time.sleep(random.randint(1, 4))
def __init__(self, **kwargs):
    """Spider state setup: Redis handle, URL templates, UA pool, and an initial
    cookie set drawn from the Redis cookie pool.

    NOTE(review): this body is identical to TestSpider.__init__ below —
    presumably the production spider's version; consider sharing it.
    """
    logger.info(f'----------------开始采集,{datetime.now()}----------------')
    # Redis connection.
    self.con = RedisPool()
    # Request categories, used to distinguish proxies.
    self.request_tp = ('store', 'detail')
    # Template listing all product ids/prices/names for a shop ({0} = widget id).
    self.store_list = '/i/asynSearch.htm?mid=w-{0}-0&wid={0}&pageNo='
    self.baseurl = 'http://shop.m.taobao.com/shop/shopsearch/search_page_json.do?sort=default&type=all&q='
    # Signed h5 API endpoint for shop details.
    self.store_detail = (
        'http://h5api.m.taobao.com/h5/mtop.taobao.geb.shopinfo.queryshopinfo/2.0/?jsv=2.4.2'
        '&appKey={appkey}&t={t}&sign={sign}&api=mtop.taobao.geb.shopinfo.queryshopinfo&v=2.0'
        '&type=originaljson&timeout=3000&AntiCreep=true&dataType=json&H5Request=true&data={data}'
    )
    # Desktop Chrome User-Agent pool for rotation.
    self.chrome_agents = [
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
        "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
        "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
        "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14"
    ]
    # appKey used when computing the Taobao request sign.
    self.appkeys = '12574478'
    # Accumulates every product id discovered.
    self.all_pids = []
    # scrapyd job id (passed in as _job by scrapyd).
    self.job_id = kwargs.get('_job', '1')
    # Key into the all_shop hash; s_id is the shop's database row id.
    self.s_id = kwargs.get('s_id', '1')
    # Memo of how blocked urls were handled.
    self.block_record = []
    self.title = ''
    self.seller_name = ''
    self.shop_id = 0
    logger.info(f'接收到的参数:{kwargs}')
    # Draw an initial cookie set from the pool.
    self.cookie_pool = self.con.cookies_pool()
    self.count = 0
    self.account = self.cookie_pool['account']
    self.token = self.cookie_pool['cookie_token']
    self.cookies = self.cookie_pool['cookie_dict']
    logger.info(f'当前使用的cookies的账号为:{self.account}')
    # Urls that failed to fetch.
    self.fail_urls = []
    super().__init__(**kwargs)
class TestSpider(scrapy.Spider):
    """Scrapy spider that collects a Taobao/Tmall shop's metadata and the ids
    of every product it lists, rotating cookies from a Redis pool.

    Flow: start_requests -> parse_search (shop search json) ->
    parse_store_item (signed h5 shop-info API) -> parse_productId (per page).
    """
    name = 'TestSpider'
    allowed_domains = ['taobao.com', 'tmall.com']
    custom_settings = {
        # 'CRAWLERA_ENABLED': False
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'REDIRECT_ENABLED': False,
        'DOWNLOADER_MIDDLEWARES': {
            # 'DataItem.middlewares.ProxyDownloaderMiddleware': 610,
            'DataItem.middlewares.CookiesPoolMiddleware': 610,
            # 'scrapy_crawlera.CrawleraMiddleware': 610
        }
    }

    def __init__(self, **kwargs):
        """Set up Redis handle, URL templates, UA pool, and an initial cookie set."""
        logger.info(f'----------------开始采集,{datetime.now()}----------------')
        # Redis connection.
        self.con = RedisPool()
        # Request categories, used to distinguish proxies.
        self.request_tp = ('store', 'detail')
        # Template listing all product ids/prices/names for a shop ({0} = widget id).
        self.store_list = '/i/asynSearch.htm?mid=w-{0}-0&wid={0}&pageNo='
        self.baseurl = 'http://shop.m.taobao.com/shop/shopsearch/search_page_json.do?sort=default&type=all&q='
        # Signed h5 API endpoint for shop details.
        self.store_detail = (
            'http://h5api.m.taobao.com/h5/mtop.taobao.geb.shopinfo.queryshopinfo/2.0/?jsv=2.4.2'
            '&appKey={appkey}&t={t}&sign={sign}&api=mtop.taobao.geb.shopinfo.queryshopinfo&v=2.0'
            '&type=originaljson&timeout=3000&AntiCreep=true&dataType=json&H5Request=true&data={data}'
        )
        # Desktop Chrome User-Agent pool for rotation.
        self.chrome_agents = [
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
            "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
            "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
            "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
            "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14"
        ]
        # appKey used when computing the Taobao request sign.
        self.appkeys = '12574478'
        # Accumulates every product id discovered.
        self.all_pids = []
        # scrapyd job id (passed in as _job by scrapyd).
        self.job_id = kwargs.get('_job', '1')
        # Key into the all_shop hash; s_id is the shop's database row id.
        self.s_id = kwargs.get('s_id', '1')
        # Memo of how blocked urls were handled.
        self.block_record = []
        self.title = ''
        self.seller_name = ''
        self.shop_id = 0
        logger.info(f'接收到的参数:{kwargs}')
        # Draw an initial cookie set from the pool.
        self.cookie_pool = self.con.cookies_pool()
        self.count = 0
        self.account = self.cookie_pool['account']
        self.token = self.cookie_pool['cookie_token']
        self.cookies = self.cookie_pool['cookie_dict']
        logger.info(f'当前使用的cookies的账号为:{self.account}')
        # Urls that failed to fetch.
        self.fail_urls = []
        super().__init__(**kwargs)

    def start_requests(self):
        """Resolve the shop's product-list widget id, then kick off the shop search."""
        # resp = self.con.hget('all_shop', self.s_id)
        # # if resp:
        # #     data = json.loads(resp)   # payload previously stored in redis
        # else:
        #     logger.warning(f'redis数据库表:_all_shop出现异常,参数为:{self.s_id}')
        #     return
        # Hard-coded test payload (the redis lookup above is disabled).
        data = {
            "id": 1,
            "shop_id": "59002150",  # shop id
            "title": "哈果超人 haco实用轻",  # shop title
            "name": "behsu网购",  # shopkeeper name
            "platform_id": 1,  # platform id in the product DB: taobao=3, tmall=4
            "shop_link":
            "https://haco.taobao.com/index.htm?spm=a1z10.3-c-s.w5002-14453794708.2.5ae258eaRYkhrS",
            # 'url': 'https://haco.taobao.com/index.htm?spm=a1z10.3-c-s.w5002-14453794708.2.5ae258eaRYkhrS',
            # "callback": "https://pr.test.puget.work/api/screen/shop",  # callback url for results
            "token": "ce934bc118beedabd789ed5cf6a20dc7"  # API auth token
        }
        self.title = data.get('title')
        self.seller_name = data.get('name')
        self.shop_id = data.get('shop_id')
        shop_link = data.get('shop_link')
        shop_search_url = 'https://' + urlparse(shop_link).netloc
        # Fetch the shop's search page to discover the widget id.
        r = requests.get(shop_search_url + '/search.htm')
        sel = Selector(text=r.text)
        if 'tmall.com' in shop_search_url:
            # wid = sel.css('[data-title=搜索列表]::attr(data-widgetid)').get()
            # Tmall (Oct update): widget id is the container's id + 1.
            wid = int(sel.css(
                '#bd > div::attr(data-widgetid)').get()) + 1
            # wid = sel.css('#hd > div::attr(data-widgetid)').get()
            # '//*[@id="bd"]//*[@class="J_TModule"]/@data-widgetid'
            self.shop_search_list = shop_search_url + self.store_list.format(
                str(wid))
        elif 'taobao.com' in shop_link:
            # Taobao: widget id of the product-list module.
            wid = sel.css('[data-title=宝贝列表]::attr(data-widgetid)').get()
            # Endpoint listing all product ids, e.g.
            # 'https://guchun.tmall.com/i/asynSearch.htm?mid=w-18297719823-0&wid=18297719823&pageNo='
            self.shop_search_list = shop_search_url + self.store_list.format(
                wid)
        else:
            logger.warning('店铺信息解析失败!请查看scrapyd日志获取详情!')
            return
        self.con.hset('widget_id', self.title, wid)
        # Login-free json search endpoint, e.g.
        # http://shop.m.taobao.com/shop/shopsearch/search_page_json.do?sort=default&type=all&q=<title>
        self.json_url = self.baseurl + self.title
        yield scrapy.Request(url=self.json_url, callback=self.parse_search)

    # Search for the shop and match it against our shop id.
    def parse_search(self, response):
        """Parse the shop-search json, collect base shop fields into self.re_dt,
        then request the signed h5 shop-info API."""
        self.re_dt = {}
        shopid = self.shop_id
        sid = self.s_id
        doc = unescape(response.text)
        # Payload shape: see fixture 接口1.json.
        jd_data = json.loads(doc)
        if jd_data.get('listItem'):
            for s in jd_data.get('listItem'):
                try:
                    # Crude containment match on the serialized record.
                    if str(shopid) in str(s):
                        self.re_dt['sign'] = str(sid)  # product-DB id
                        self.re_dt['shopid'] = str(
                            s.get('shop').get('id'))  # taobao shopid
                        self.re_dt['is_mall'] = s.get('shop').get(
                            'isMall')  # is a Tmall shop
                        self.re_dt['total_sold'] = str(
                            s.get('shop').get('totalSold'))  # sales volume
                        self.re_dt['level'] = ''.join(
                            re.findall('\d-\d', str(s.get('icon'))))  # shop level
                        self.re_dt['sellerid'] = ''.join(
                            re.findall('userid=(\d+)',
                                       str(s.get('medal'))))  # seller id
                        self.re_dt['goods'] = s.get('favRate')  # rating
                        break
                except Exception as e:
                    logger.warning(f'发生错误的函数:parse_search(),错误信息:{e}')
                    continue
        # Rotate mobile User-Agents — the h5 endpoint expects a phone browser
        # carrying cookies.
        iphone_headers = [
            'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Mobile/15A372 MicroMessenger/6.5.16 NetType/WIFI Language/zh_CN',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Mobile/15A372 wxwork/2.1.5 MicroMessenger/6.3.22',
            'Mozilla/5.0 (iPhone 6s; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 MQQBrowser/7.7.2 Mobile/15A372 Safari/8536.25 MttCustomUA/2 QBWebViewType/1',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Mobile/15A402 MicroMessenger/6.5.16 NetType/WIFI Language/zh_CN'
        ]
        headers = {'user-agent': random.choice(iphone_headers)}
        data = '{"sellerId": %s}' % self.re_dt.get('sellerid')
        cookies = self.cookies
        token = self.token
        account = self.account
        # On retry after a ban, only the cookies need replacing — a single
        # API hit usually succeeds with a fresh cookie set.
        if response.meta.get('change_cookie'):
            logger.info('-----------------更换cookies-----------------')
            cookie_pool = self.con.cookies_pool(account)
            if cookie_pool is None:
                return self.close(TestSpider, 'cookie池为空!')
            account = cookie_pool['account']
            token = cookie_pool['cookie_token']
            cookies = cookie_pool['cookie_dict']
            cookie_str = cookie_pool['cookie_str']
            logger.info(f'更换后的cookies的账号为:{account}')
        # Compute the request signature.
        st = self.sign(token=token,
                       appkey=self.appkeys,
                       t=str(int(time.time() * 1000)),
                       data=data)
        # Live endpoint example:
        '''
        http://h5api.m.taobao.com/h5/mtop.taobao.geb.shopinfo.queryshopinfo/2.0/?jsv=2.4.2
        &appKey=12574478&t=1569315579952&sign=88d9947b4728cc649bc50f3cb66ef2c6&
        api=mtop.taobao.geb.shopinfo.queryshopinfo&v=2.0&type=originaljson&timeout=3000&
        AntiCreep=true&dataType=json&H5Request=true&data={"sellerId": 3790578263}
        '''
        url = self.store_detail.format(**st)
        yield scrapy.Request(url=url,
                             headers=headers,
                             cookies=cookies,
                             callback=self.parse_store_item,
                             meta={'account': account})

    # Enrich the shop record from the signed h5 API, then fan out per page.
    def parse_store_item(self, response):
        """Parse the shop-info API response; on ban retry with new cookies,
        otherwise schedule one request per product-list page."""
        re_data = self.re_dt
        # Payload shape: see fixture 接口2.json.
        doc = json.loads(response.text)
        # API status, e.g. ['SUCCESS::...'] or a ban marker.
        status = doc.get('ret')
        logger.info(f'json接口响应状态{status}')
        if 'SUCCESS' not in status[0]:
            logger.info('处理淘宝封禁!')
            # Banned: go back through parse_search with fresh cookies.
            yield scrapy.Request(url=self.json_url,
                                 callback=self.parse_search,
                                 dont_filter=True,
                                 meta={'change_cookie': True})
        else:
            # Cache the raw json payload.
            self.con.hset('item_json', self.title, response.text)
            tdt = doc.get('data')
            self.count = tdt.get('itemCount')  # total number of products
            re_data['shopname'] = tdt.get('shopName')
            re_data['fans_num'] = str(tdt.get('fansNum'))
            re_data['item_count'] = str(self.count)
            re_data['new_item'] = str(tdt.get('newItem'))
            re_data['golden_seller'] = tdt.get('goldenSeller')
            logger.info(f'店铺信息:{re_data}')
            # url = r'https://shop142840423.taobao.com/i/asynSearch.htm?mid=w-13377183093-0&wid=13377183093&pageNo=1'  # first product page
            # Page counts are derived arithmetically for both platforms
            # (tmall: 90 items/page, taobao: 24 items/page).
            if 'tmall.com' in self.shop_search_list:
                totalPages = math.ceil(int(self.count) / 90) + 1  # round up
            elif 'taobao.com' in self.shop_search_list:
                # s = Selector(text=html)
                # # count = int(s.xpath('//*[@id="shop-search-list"]/div/div[2]/span/text()').extract_first())
                # # if count is not None and count > item_count:
                # #     item_count = count
                totalPages = math.ceil(int(self.count) / 24) + 1  # round up
            logger.info(f'总共需要采集的页面有:{totalPages}页')
            for page in range(1, totalPages + 1):
                url = self.shop_search_list + str(page)
                # Rotate desktop UA per page request.
                headers = {'User-Agent': random.choice(self.chrome_agents)}
                time.sleep(1)
                yield scrapy.Request(url=url,
                                     headers=headers,
                                     callback=self.parse_productId,
                                     meta={'with_cookie': True})

    def parse_productId(self, response):
        """Extract every product id on a product-list page into self.all_pids.

        Retries the same url (flagged as blocked) when the anti-bot marker
        'rgv587_flag' is present.
        NOTE(review): after yielding the retry request the method still parses
        the blocked page body — a `return` after the yield may be intended.
        """
        headers = {'User-Agent': random.choice(self.chrome_agents)}
        if 'rgv587_flag' in response.text:
            yield scrapy.Request(url=response.request.url,
                                 headers=headers,
                                 callback=self.parse_productId,
                                 meta={
                                     'with_cookie': True,
                                     'block': True
                                 })
        # Strip the escaped quotes the endpoint wraps around attributes.
        text = response.text.replace('\\"', '')
        s = Selector(text=text)
        firstPagePids = s.css('.J_TItems div .item::attr(data-id)').getall()
        if len(firstPagePids) == 0:
            # Alternate page layout.
            firstPagePids = s.css(
                '.shop-hesper-bd .item::attr(data-id)').getall()
        logger.info(f'{response.request.url},商品id:{firstPagePids}')
        pids = firstPagePids
        # Accumulate all product ids.
        # todo: add durable storage
        self.all_pids.extend(pids)

    # Compute the Taobao request sign from its constituent parts.
    def sign(self, token, appkey, t, data):
        '''
        :param token: cookie token (_m_h5_tk prefix)
        :param appkey: taobao h5 appKey
        :param t: timestamp in ms, str(int(time.time() * 1000))
        :param data: json string payload
        :return: dict with sign, t, appkey, data for url formatting
        '''
        pp = '&'.join([token, t, appkey, data]).encode()
        sign = hashlib.md5(pp).hexdigest()
        return {'sign': sign, 't': t, 'appkey': appkey, 'data': data}

    # On spider close: report collected ids and any shortfall.
    def close(self, spider, reason):
        """Deduplicate collected product ids and log coverage statistics."""
        all_products_id = list(set(self.all_pids))
        logger.info(all_products_id)
        logger.info(f'采集到了{len(all_products_id)}条商品数据')
        lose_count = int(self.count) - len(all_products_id)
        if lose_count > 0:
            logger.warning(
                f'采集缺失:{lose_count},缺失百分比:{lose_count / int(self.count) * 100}%'
            )
cls.lua_extend = redis.register_script(cls.LUA_EXTEND_SCRIPT) def do_acquire(self, token): timeout = self.timeout and int(self.timeout * 1000) or '' return bool(self.lua_acquire(keys=[self.name], args=[token, timeout], client=self.redis)) def do_release(self, expected_token): if not bool(self.lua_release(keys=[self.name], args=[expected_token], client=self.redis)): raise LockError("Cannot release a lock that's no longer owned") def do_extend(self, additional_time): additional_time = int(additional_time * 1000) if not bool(self.lua_extend(keys=[self.name], args=[self.local.token, additional_time], client=self.redis)): raise LockError("Cannot extend a lock that's no longer owned") return True if __name__ == '__main__': conn = RedisPool().connection() with RedisLuaLock(conn, 'hello_world', timeout=15, blocking=True, blocking_timeout=1): # do something print 'xxx' time.sleep(3)