Example #1
    def run(self):
        # Fetch the accounts and passwords from redis
        con = RedisPool()
        accounts_dict = con.get_accounts()

        for username, password in accounts_dict.items():
            logger.info(f'Generating cookies for account: {username}, password: {password}')
            self.get_cookie(username, password)
Example #2
def recieve_cookie():
    if request.method == 'POST':
        try:
            params = json.loads(request.data)
            data = params['data']
            cookie = data['cookie']
            username = data['username']
            con = RedisPool()
            con.save_cookie(username,
                            json.dumps({
                                'cookie': cookie,
                                'counter': 3
                            }))

            logger.info({'code': 200, 'msg': 'Cookies received successfully!'})
            return json.dumps({'code': 200, 'msg': 'Cookies received successfully!'})
        except Exception as e:
            logger.error({'code': 500, 'msg': f'Error: {e}'})
            return json.dumps({'code': 500, 'msg': f'Error: {e}'})
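
For reference, a minimal client-side sketch of pushing cookies to this endpoint. The host and port are assumptions (the commented-out transfer code in the Selenium example below posts to port 8888), and the cookie value is a dummy:

import json

import requests

url = 'http://127.0.0.1:8888/recieve_cookie'  # assumed host/port
headers = {'Content-Type': 'application/json'}
payload = {
    'data': {
        'cookie': 'unb=1234567890123;t=abc',  # dummy cookie string
        'username': 'demo_account',           # dummy account
        'token': 'ce934bc118beedabd789ed5cf6a20dc7'
    }
}
r = requests.post(url, headers=headers, json=payload)
print(r.status_code, json.loads(r.text))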
Example #3
    def run(self):
        # Load the accounts into redis
        con = RedisPool()
        con.save_account(ACCOUNTS)

        # Start the cookies generator
        if GENERATOR_PROCESS:
            print('Starting cookies generator!')
            generate_process = Process(target=Scheduler.generate_cookie)
            generate_process.start()

        # # Start cookie validation
        # if VALID_PROCESS:
        #     logger.info('Starting cookies validation!')
        #     valid_process = Process(target=Scheduler.valid_cookie)
        #     valid_process.start()

        # Start the Flask app
        if API_PROCESS:
            print('Starting Flask app!')
            api_process = Process(target=Scheduler.api)
            api_process.start()
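
A note on the Process targets above: generate_cookie, valid_cookie and api are referenced without a Scheduler instance, so they are presumably staticmethods (or classmethods). A minimal frame under that assumption, with placeholder bodies:

from multiprocessing import Process

class Scheduler:
    @staticmethod
    def generate_cookie():
        pass  # placeholder: loop over accounts and refresh cookies

    @staticmethod
    def api():
        pass  # placeholder: run the Flask app, e.g. app.run(...)

if __name__ == '__main__':
    Process(target=Scheduler.generate_cookie).start()
    Process(target=Scheduler.api).start()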
Example #4
    def get_cookies(self, account=''):
        con = RedisPool()
        cookies_pool = con.cookies_pool(account)
        if cookies_pool is None:
            print('Cookies pool is empty!')
            return None
        self.account = cookies_pool['account']
        token = cookies_pool['cookie_token']
        # cookie_dict = cookies_pool['cookie_dict']
        cookie_str = cookies_pool['cookie_str']

        # Preprocess to make the cookie more usable: randomize the unb value
        # with 13 random digits (range(10) covers digits 0-9)
        unb_random = ''.join(str(random.choice(range(10))) for _ in range(13))

        pattern = r'unb=(\d+)'
        cookie_unb = re.sub(pattern=pattern,
                            repl='unb=' + unb_random,
                            string=cookie_str)
        cookie_dict = dict(
            [tuple(x.split('=', 1)) for x in cookie_unb.split(';')])
        print(f'Current account: {self.account}')

        return cookie_dict
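
A quick standalone illustration of the unb rewrite and the cookie-string parsing above, using a made-up cookie string:

import random
import re

cookie_str = 'unb=1234567890123;t=abc;_m_h5_tk=token_169'  # made-up cookie string
unb_random = ''.join(str(random.choice(range(10))) for _ in range(13))
cookie_unb = re.sub(r'unb=(\d+)', 'unb=' + unb_random, cookie_str)
cookie_dict = dict(tuple(x.split('=', 1)) for x in cookie_unb.split(';'))
print(cookie_dict)  # {'unb': '<13 random digits>', 't': 'abc', '_m_h5_tk': 'token_169'}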
Example #5
def request_crawler():
    if request.method == 'POST':
        params = request.json
        logger.info(f'Received params: {params}')
        data = params['data']
        con = RedisPool()
        # Kick off the Scrapy crawler via scrapyd
        if 'shop_link' in data:
            try:
                url = data['shop_link']
                print('Shop to crawl:', url)
                post_data = {}

                s_id = data['id']
                # TODO: what sort order is this?
                sort_type = data.get('sort_type', '_sale')

                post_data['project'] = 'DataItem'
                # Spider to launch
                post_data['spider'] = 'NewTbShopSpider'

                # Shop's row id in the shop_fetch_tasks table
                post_data['s_id'] = s_id
                post_data['sort_type'] = sort_type
                r = requests.post('http://localhost:6800/schedule.json',
                                  data=post_data,
                                  timeout=50)
                logger.info(f'scrapyd status: {r.text}')
                # Job id of this crawl
                jobid = r.json()['jobid']
                logger.info(f'Current job id: {jobid}')
                con.hset('all_shop', s_id, json.dumps(data,
                                                      ensure_ascii=False))

                con.hset('store', jobid, json.dumps(data, ensure_ascii=False))

                logger.info(f'Crawler started successfully! Details: {r.text}')
                logger.info({'code': 200, 'msg': 'Task accepted!'})
                return json.dumps({'code': 200, 'msg': 'Task accepted!'})
            except Exception as e:
                logger.error({'code': 500, 'msg': f'Error: {e}'})
                return json.dumps({'code': 500, 'msg': f'Error: {e}'})

        # Fetch basic shop info
        elif 'url' in data:
            try:
                url = data['url']
                print('Shop to crawl:', url)
                r = requests.get(url)
                s = Selector(text=r.text)
                # Shop id
                shopId = s.css('#LineZing::attr(shopid)').get()

                shopName = ''
                sellerName = ''

                # Extract basic info for a Taobao shop
                if 'taobao.com' in url:
                    if s.css('.hd-shop-name a::text').get():
                        shopName = s.css('.hd-shop-name a::text').get()
                    elif s.css('.first-block .shop-name span::text').get():
                        shopName = s.css(
                            '.first-block .shop-name span::text').get()
                    else:
                        shopName = ''
                    if s.css('.tb-box-half.tb-seller-info label::text').get():
                        sellerName = s.css(
                            '.tb-box-half.tb-seller-info label::text').get(
                            ).strip()
                    elif s.css('.seller-name::text').get():
                        sellerName = s.css('.seller-name::text').get().strip(
                            '掌柜:')
                    elif s.css('.shop-more-info p.info-item:nth-child(2)::text'
                               ).get():
                        sellerName = s.css(
                            '.shop-more-info p.info-item:nth-child(2)::text'
                        ).get().strip()
                    else:
                        sellerName = ''

                # Extract basic info for a Tmall shop
                elif 'tmall.com' in url:
                    if s.css('.hd-shop-name a::text').get():
                        shopName = s.css('.hd-shop-name a::text').get()
                    else:
                        shopName = s.css('.slogo-shopname strong::text').get()
                    # On Tmall the seller name defaults to the shop name
                    sellerName = shopName

                    # if s.css('.tb-box-half.tb-seller-info label::text').get():
                    #     sellerName = s.css('.tb-box-half.tb-seller-info label::text').get().strip()
                    # else:
                    #     sellerName = s.css('.shopkeeper div a::text').get()

                # Basic shop info
                shopInfo = {
                    'shopname': shopName,
                    'shopid': shopId,
                    'sellername': sellerName
                }
                print(f'Fetched basic shop info: {shopInfo}')

                con.hset('ShopInfoQuery',
                         json.dumps(shopInfo, ensure_ascii=False), url)

                logger.info({
                    'code': 200,
                    'msg': 'Shop info fetched successfully!',
                    'shop_info': shopInfo
                })
                return json.dumps(
                    {
                        'code': 200,
                        'msg': 'Shop info fetched successfully!',
                        'shop_info': shopInfo
                    },
                    ensure_ascii=False)
            except Exception as e:
                logger.error({'code': 500, 'msg': f'Error: {e}'})
                return json.dumps({'code': 500, 'msg': f'Error: {e}'})
        else:
            logger.info({'code': 500, 'msg': 'Task rejected, please check the parameters!'})
            return json.dumps({'code': 500, 'msg': 'Task rejected, please check the parameters!'})
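
For reference, hedged sample payloads for the two branches above. The route path and port are assumptions; the token mirrors the test data used elsewhere in this listing:

import requests

# Branch 1: schedule the shop spider on scrapyd.
task = {'data': {'shop_link': 'https://haco.taobao.com/index.htm',
                 'id': 1,
                 'sort_type': '_sale',
                 'token': 'ce934bc118beedabd789ed5cf6a20dc7'}}
# Branch 2: only scrape the basic shop info.
info = {'data': {'url': 'https://haco.taobao.com/index.htm'}}

for payload in (task, info):
    r = requests.post('http://127.0.0.1:5000/request_crawler', json=payload)  # assumed route/port
    print(r.json())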
Example #6
    def get_cookie(self, username, password):
        """
        使用微博关联登陆淘宝,进而从m端淘宝页面获取cookie
        """

        option = options.Options()

        option.add_argument("disable-infobars")  # # 去除info框
        option.add_experimental_option('excludeSwitches',
                                       ['enable-automation'])  # 开发者模式,防止被识别
        option.add_argument('log-level=3')
        # option.add_argument("--proxy-server=http://114.239.254.76:4236")  #添加代理
        # option.add_argument('--headless')         # headless模式
        # option.add_argument("window-size=2436, 1125")
        # option.add_argument("--no-sandbox")

        # executable_path = os.path.join(os.getcwd(), 'chromedriver.exe')  # chromedriver路径

        # browser = webdriver.Chrome(executable_path=executable_path, options=option)
        browser = webdriver.Chrome(options=option)

        browser.implicitly_wait(20)  # Implicit wait
        try:
            browser.get('https://weibo.com/')
            browser.maximize_window()
            # Wait for the Weibo login form to load, then enter the credentials
            wb_locator = '.login_innerwrap .W_login_form[node-type=normal_form] input[name=username]'
            WebDriverWait(browser, 300, 0.5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, wb_locator)))
            logger.info('Step: entering username and password!')
            browser.find_element_by_css_selector(
                '.login_innerwrap [node-type=normal_form] input[name=username]'
            ).send_keys(username)
            browser.find_element_by_css_selector(
                '.login_innerwrap [node-type=normal_form] input[name=password]'
            ).send_keys(password)
            # Small delay
            time.sleep(3)

            yundama = Yundama()

            sel = Selector(text=browser.page_source)
            verify_img = sel.css(
                '[action-type=btn_change_verifycode]::attr(src)').get()

            if not verify_img:
                browser.find_element_by_css_selector(
                    '.login_innerwrap [node-type=normal_form] .W_btn_a').click(
                    )
            else:
                print('Captcha detected, starting recognition')

                # Screenshot trick: capture just the captcha element
                pic = browser.find_element_by_xpath(
                    '//*[@id="pl_login_form"]/div/div[3]/div[3]/a/img')
                pic.screenshot('yzm.png')  # Element-level screenshot
                time.sleep(1)
                result = yundama.identify(file='yzm.png')
                if not result:
                    print('Captcha recognition failed, skipping')
                    return
                else:
                    browser.find_element_by_xpath(
                        '//*[@id="pl_login_form"]/div/div[3]/div[3]/div/input'
                    ).send_keys(result)
            time.sleep(1)
            browser.find_element_by_css_selector(
                '.login_innerwrap [node-type=normal_form] .W_btn_a').click()

            logger.info('Step: logging in to Taobao!')
            # After the Weibo login completes, open the Taobao login page and choose Weibo login
            wb_login_locator = '.WB_feed_detail'
            WebDriverWait(browser, 300, 0.5).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, wb_login_locator)))
            browser.get('https://login.taobao.com')
            try:
                browser.find_element_by_css_selector('#J_Quick2Static').click()
            except Exception as e:
                # Log the failure and continue
                logger.warning(f'Error: {e}')

            browser.find_element_by_css_selector('.weibo-login').click()
            time.sleep(1)
            # If the Weibo quick-login dialog appears, click it; otherwise enter the Weibo password
            if browser.find_elements_by_css_selector('.logged_info .W_btn_g'):
                # tb_submit_locator = '.logged_info .W_btn_g'
                # WebDriverWait(browser, 300, 0.5).until(EC.presence_of_element_located((By.CSS_SELECTOR, tb_submit_locator)))
                browser.find_element_by_css_selector(
                    '.logged_info .W_btn_g').click()
            elif browser.find_elements_by_css_selector(
                    '[node-type=submitStates]'):
                browser.find_element_by_css_selector('.enter_psw').send_keys(
                    password)
                browser.find_element_by_css_selector(
                    '[node-type=submitStates]').click()
                return

            time.sleep(2)
            # After the Taobao login completes, go to the Taobao mobile home page
            tb_locator = '.logo-bd'
            WebDriverWait(browser, 300, 0.5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, tb_locator)))
            browser.get('https://h5.m.taobao.com')

            # Wait for the mobile home page to load, then grab the cookies and store them in redis
            m_tb_locator = '.header-bd'
            WebDriverWait(browser, 300, 0.5).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, m_tb_locator)))
            cookies = browser.get_cookies()

            result = self.cookie_handler(cookies)

            cookie_str = result[0]

            logger.info('Successfully fetched cookies!')

            # Store into redis -> cookies_pool
            con = RedisPool()
            con.save_cookie({"username": username, "cookie": cookie_str})


            # Ship the cookies: when this project runs locally, push them to the online service
            # if LOCAL:
            #     url = 'http://47.56.68.237:8888/recieve_cookie'
            #     headers = {"Content-Type": "application/json"}
            #     data = {
            #         'data': {'cookie': cookie_str, 'username': username, "token": "ce934bc118beedabd789ed5cf6a20dc7"}}
            #     try:
            #         r = requests.post(url=url, headers=headers, json=data)
            #
            #         logger.info(f'Status: {r.status_code}, response: {json.loads(r.text)}')
            #     except Exception as e:
            #         logger.warning(f'Failed to ship cookies: {e}')

        except Exception as e:
            logger.warning(f'Failed to fetch cookie: {e}')

        finally:
            browser.quit()
            # Before the next cookie fetch, add a random wait
            # to reduce the chance of anti-bot detection
            # TODO: too many logins get the IP banned
            time.sleep(random.randint(1, 4))
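
The method above relies on a cookie_handler helper that is not shown. A minimal sketch of what it plausibly does, inferred from the call site (result[0] is a 'k=v;k=v' string, matching what the get_cookies example parses) and from Selenium's get_cookies() list-of-dicts format; the body itself is an assumption:

    def cookie_handler(self, cookies):
        # cookies: Selenium's get_cookies() output, a list of {'name': ..., 'value': ...} dicts
        # (sketch; the real helper may filter or reorder fields)
        cookie_dict = {c['name']: c['value'] for c in cookies}
        cookie_str = ';'.join(f'{k}={v}' for k, v in cookie_dict.items())
        return cookie_str, cookie_dict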
Example #7
class TestSpider(scrapy.Spider):
    name = 'TestSpider'
    allowed_domains = ['taobao.com', 'tmall.com']

    custom_settings = {
        # 'CRAWLERA_ENABLED': False
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'REDIRECT_ENABLED': False,
        'DOWNLOADER_MIDDLEWARES': {
            # 'DataItem.middlewares.ProxyDownloaderMiddleware': 610,
            'DataItem.middlewares.CookiesPoolMiddleware': 610,
            # 'scrapy_crawlera.CrawleraMiddleware': 610
        }
    }

    def __init__(self, **kwargs):

        logger.info(f'---------------- Crawl started, {datetime.now()} ----------------')

        # Initialize redis
        self.con = RedisPool()

        # Request categories, used to pick the right proxy
        self.request_tp = ('store', 'detail')

        # Endpoint template: every product id, price and name of a shop
        self.store_list = '/i/asynSearch.htm?mid=w-{0}-0&wid={0}&pageNo='

        self.baseurl = 'http://shop.m.taobao.com/shop/shopsearch/search_page_json.do?sort=default&type=all&q='

        self.store_detail = (
            'http://h5api.m.taobao.com/h5/mtop.taobao.geb.shopinfo.queryshopinfo/2.0/?jsv=2.4.2'
            '&appKey={appkey}&t={t}&sign={sign}&api=mtop.taobao.geb.shopinfo.queryshopinfo&v=2.0'
            '&type=originaljson&timeout=3000&AntiCreep=true&dataType=json&H5Request=true&data={data}'
        )

        self.chrome_agents = [
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
            "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
            "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
            "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
            "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14"
        ]

        # appKey used when computing the Taobao sign
        self.appkeys = '12574478'

        # Collects every product id
        self.all_pids = []

        # Job id assigned by scrapyd
        self.job_id = kwargs.get('_job', '1')

        # Key into all_shop; s_id is the shop's id in the database
        self.s_id = kwargs.get('s_id', '1')

        # Memo recording how blocked urls were handled
        self.block_record = []

        self.title = ''
        self.seller_name = ''
        self.shop_id = 0
        logger.info(f'Received kwargs: {kwargs}')

        # Cookie pool
        self.cookie_pool = self.con.cookies_pool()
        self.count = 0
        self.account = self.cookie_pool['account']
        self.token = self.cookie_pool['cookie_token']
        self.cookies = self.cookie_pool['cookie_dict']
        logger.info(f'Account whose cookies are in use: {self.account}')

        # Failed urls
        self.fail_urls = []
        super().__init__(**kwargs)

    def start_requests(self):

        # resp = self.con.hget('all_shop', self.s_id)
        #
        # if resp:
        #     # Data loaded from redis
        #     data = json.loads(resp)
        # else:
        #     logger.warning(f'Unexpected state in redis table all_shop, param: {self.s_id}')
        #     return

        # Test data

        data = {
            "id": 1,
            "shop_id": "59002150",  # Shop id
            "title": "哈果超人 haco实用轻",  # Shop name
            "name": "behsu网购",  # Shopkeeper name
            "platform_id": 1,  # Platform id from the product db; Taobao is 3, Tmall is 4
            "shop_link":
            "https://haco.taobao.com/index.htm?spm=a1z10.3-c-s.w5002-14453794708.2.5ae258eaRYkhrS",

            # 'url': 'https://haco.taobao.com/index.htm?spm=a1z10.3-c-s.w5002-14453794708.2.5ae258eaRYkhrS',
            # "callback": "https://pr.test.puget.work/api/screen/shop",  # Callback url that receives the crawl results
            "token": "ce934bc118beedabd789ed5cf6a20dc7"  # API auth token
        }

        self.title = data.get('title')
        self.seller_name = data.get('name')
        self.shop_id = data.get('shop_id')

        shop_link = data.get('shop_link')

        shop_search_url = 'https://' + urlparse(shop_link).netloc

        # Request all of the shop's products
        r = requests.get(shop_search_url + '/search.htm')
        sel = Selector(text=r.text)

        # Get the widget id
        if 'tmall.com' in shop_search_url:
            # wid = sel.css('[data-title=搜索列表]::attr(data-widgetid)').get()
            wid = int(sel.css(
                '#bd > div::attr(data-widgetid)').get()) + 1  # Tmall, October update
            # wid = sel.css('#hd > div::attr(data-widgetid)').get()
            # '//*[@id="bd"]//*[@class="J_TModule"]/@data-widgetid'
            self.shop_search_list = shop_search_url + self.store_list.format(
                str(wid))

        elif 'taobao.com' in shop_link:
            # Widget id
            wid = sel.css('[data-title=宝贝列表]::attr(data-widgetid)').get()
            # Endpoint that lists every product id, e.g.
            # 'https://guchun.tmall.com/i/asynSearch.htm?mid=w-18297719823-0&wid=18297719823&pageNo='
            self.shop_search_list = shop_search_url + self.store_list.format(
                wid)
        else:
            logger.warning('Failed to parse the shop info! Check the scrapyd log for details!')
            return

        self.con.hset('widget_id', self.title, wid)

        # JSON endpoint, no login required
        # http://shop.m.taobao.com/shop/shopsearch/search_page_json.do?sort=default&type=all&q=Simple%E9%9F%A9%E5%9B%BD%E5%A5%B3%E8%A3%85
        self.json_url = self.baseurl + self.title

        yield scrapy.Request(url=self.json_url, callback=self.parse_search)

    # Search the shop info and match it
    def parse_search(self, response):

        self.re_dt = {}
        shopid = self.shop_id
        sid = self.s_id

        doc = unescape(response.text)
        # JSON payload; see 接口1.json for a sample
        jd_data = json.loads(doc)

        if jd_data.get('listItem'):
            for s in jd_data.get('listItem'):
                try:
                    if str(shopid) in str(s):
                        self.re_dt['sign'] = str(sid)  # sign: product-db id
                        self.re_dt['shopid'] = str(
                            s.get('shop').get('id'))  # Taobao shop id
                        self.re_dt['is_mall'] = s.get('shop').get(
                            'isMall')  # Whether it is a Tmall shop
                        self.re_dt['total_sold'] = str(
                            s.get('shop').get('totalSold'))  # Sales volume
                        self.re_dt['level'] = ''.join(
                            re.findall(r'\d-\d', str(s.get('icon'))))  # Shop level
                        self.re_dt['sellerid'] = ''.join(
                            re.findall(r'userid=(\d+)',
                                       str(s.get('medal'))))  # Seller id
                        self.re_dt['goods'] = s.get('favRate')  # Positive-feedback rate
                        break
                except Exception as e:
                    logger.warning(f'Error in parse_search(): {e}')
                    continue

        # Rotate the User-Agent at random; this is an h5 endpoint, so hit it with a mobile UA plus cookies
        iphone_headers = [
            'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Mobile/15A372 MicroMessenger/6.5.16 NetType/WIFI Language/zh_CN',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Mobile/15A372 wxwork/2.1.5 MicroMessenger/6.3.22',
            'Mozilla/5.0 (iPhone 6s; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 MQQBrowser/7.7.2 Mobile/15A372 Safari/8536.25 MttCustomUA/2 QBWebViewType/1',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Mobile/15A402 MicroMessenger/6.5.16 NetType/WIFI Language/zh_CN'
        ]

        headers = {'user-agent': random.choice(iphone_headers)}

        data = '{"sellerId": %s}' % self.re_dt.get('sellerid')
        cookies = self.cookies
        token = self.token
        account = self.account

        # Here swapping cookies is usually enough, since this endpoint is hit only once
        if response.meta.get('change_cookie'):
            logger.info('----------------- Swapping cookies -----------------')
            cookie_pool = self.con.cookies_pool(account)
            if cookie_pool is None:
                return self.close(TestSpider, 'Cookies pool is empty!')
            account = cookie_pool['account']
            token = cookie_pool['cookie_token']
            cookies = cookie_pool['cookie_dict']
            cookie_str = cookie_pool['cookie_str']
            logger.info(f'Account of the new cookies: {account}')

        # Sign the request
        st = self.sign(token=token,
                       appkey=self.appkeys,
                       t=str(int(time.time() * 1000)),
                       data=data)

        # Live endpoint; a sample signed url:
        '''
        http://h5api.m.taobao.com/h5/mtop.taobao.geb.shopinfo.queryshopinfo/2.0/?jsv=2.4.2
        &appKey=12574478&t=1569315579952&sign=88d9947b4728cc649bc50f3cb66ef2c6&
        api=mtop.taobao.geb.shopinfo.queryshopinfo&v=2.0&type=originaljson&timeout=3000&
        AntiCreep=true&dataType=json&H5Request=true&data={"sellerId": 3790578263}
        '''
        url = self.store_detail.format(**st)

        yield scrapy.Request(url=url,
                             headers=headers,
                             cookies=cookies,
                             callback=self.parse_store_item,
                             meta={'account': account})

    # Crawl further shop details
    def parse_store_item(self, response):
        re_data = self.re_dt

        # See 接口2.json for a sample
        doc = json.loads(response.text)
        # Endpoint response status
        status = doc.get('ret')

        logger.info(f'JSON endpoint status: {status}')
        if 'SUCCESS' not in status[0]:
            logger.info('Handling a Taobao block!')
            yield scrapy.Request(url=self.json_url,
                                 callback=self.parse_search,
                                 dont_filter=True,
                                 meta={'change_cookie': True})
        else:
            # Store the raw JSON payload
            self.con.hset('item_json', self.title, response.text)
            tdt = doc.get('data')
            self.count = tdt.get('itemCount')  # Total number of items
            re_data['shopname'] = tdt.get('shopName')  # Shop name
            re_data['fans_num'] = str(tdt.get('fansNum'))  # Number of fans
            re_data['item_count'] = str(self.count)  # Number of products
            re_data['new_item'] = str(tdt.get('newItem'))  # Number of new items
            re_data['golden_seller'] = tdt.get('goldenSeller')  # Golden-seller flag
            logger.info(f'Shop info: {re_data}')

            # url = r'https://shop142840423.taobao.com/i/asynSearch.htm?mid=w-13377183093-0&wid=13377183093&pageNo=1'

            # First page of the product listing.
            # Tmall never needs that page: the page count is computed directly.
            # For Taobao the page count is (for now) computed directly as well.

            if 'tmall.com' in self.shop_search_list:
                # Compute the page count
                totalPages = math.ceil(int(self.count) / 90) + 1  # Round up
            elif 'taobao.com' in self.shop_search_list:
                # s = Selector(text=html)
                # count = int(s.xpath('//*[@id="shop-search-list"]/div/div[2]/span/text()').extract_first())
                # if count is not None and count > item_count:
                #     item_count = count
                # Compute the page count
                totalPages = math.ceil(int(self.count) / 24) + 1  # Round up

            logger.info(f'Pages to crawl: {totalPages}')

            for page in range(1, totalPages + 1):
                url = self.shop_search_list + str(page)
                # Preprocess cookies
                headers = {'User-Agent': random.choice(self.chrome_agents)}
                time.sleep(1)
                yield scrapy.Request(url=url,
                                     headers=headers,
                                     callback=self.parse_productId,
                                     meta={'with_cookie': True})

    def parse_productId(self, response):
        '''
        Parse all product ids on the current listing page.
        :param response:
        :return:
        '''
        headers = {'User-Agent': random.choice(self.chrome_agents)}
        if 'rgv587_flag' in response.text:
            # Blocked by Taobao: retry with a fresh UA and stop parsing this response
            yield scrapy.Request(url=response.request.url,
                                 headers=headers,
                                 callback=self.parse_productId,
                                 meta={
                                     'with_cookie': True,
                                     'block': True
                                 })
            return

        text = response.text.replace('\\"', '')
        s = Selector(text=text)
        firstPagePids = s.css('.J_TItems div .item::attr(data-id)').getall()
        if len(firstPagePids) == 0:
            firstPagePids = s.css(
                '.shop-hesper-bd .item::attr(data-id)').getall()
        logger.info(f'{response.request.url}, product ids: {firstPagePids}')
        pids = firstPagePids

        # Collect every product
        # TODO: add persistent storage
        self.all_pids.extend(pids)

    # Compute the sign from the required fields
    def sign(self, token, appkey, t, data):
        '''
        :param token:
        :param appkey:
        :param t: str(int(time.time() * 1000))
        :param data:
        :return:
        '''
        pp = '&'.join([token, t, appkey, data]).encode()
        sign = hashlib.md5(pp).hexdigest()
        return {'sign': sign, 't': t, 'appkey': appkey, 'data': data}

    # When the shop spider closes, store the data in redis and trigger the product-detail spider
    def close(self, spider, reason):
        # Deduplicate and keep all product ids
        all_products_id = list(set(self.all_pids))
        logger.info(all_products_id)

        logger.info(f'Collected {len(all_products_id)} products')
        lose_count = int(self.count) - len(all_products_id)
        if lose_count > 0:
            logger.warning(
                f'Missing items: {lose_count}, missing percentage: {lose_count / int(self.count) * 100}%'
            )
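
To make the signing scheme concrete, a small standalone sketch of how sign() builds the md5 over token, timestamp, appKey and payload, and how the result fills the store_detail template. The token and sellerId are dummy values (the spider reads the real token from the cookie pool as 'cookie_token'):

import hashlib
import time

def sign(token, appkey, t, data):
    pp = '&'.join([token, t, appkey, data]).encode()
    return {'sign': hashlib.md5(pp).hexdigest(), 't': t, 'appkey': appkey, 'data': data}

store_detail = (
    'http://h5api.m.taobao.com/h5/mtop.taobao.geb.shopinfo.queryshopinfo/2.0/?jsv=2.4.2'
    '&appKey={appkey}&t={t}&sign={sign}&api=mtop.taobao.geb.shopinfo.queryshopinfo&v=2.0'
    '&type=originaljson&timeout=3000&AntiCreep=true&dataType=json&H5Request=true&data={data}'
)

st = sign(token='dummy_m_h5_tk_token',     # dummy token
          appkey='12574478',
          t=str(int(time.time() * 1000)),
          data='{"sellerId": 123456789}')  # dummy seller id
print(store_detail.format(**st))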
Example #8
            cls.lua_extend = redis.register_script(cls.LUA_EXTEND_SCRIPT)

    def do_acquire(self, token):
        timeout = self.timeout and int(self.timeout * 1000) or ''
        return bool(self.lua_acquire(keys=[self.name],
                                     args=[token, timeout],
                                     client=self.redis))

    def do_release(self, expected_token):
        if not bool(self.lua_release(keys=[self.name],
                                     args=[expected_token],
                                     client=self.redis)):
            raise LockError("Cannot release a lock that's no longer owned")

    def do_extend(self, additional_time):
        additional_time = int(additional_time * 1000)
        if not bool(self.lua_extend(keys=[self.name],
                                    args=[self.local.token, additional_time],
                                    client=self.redis)):
            raise LockError("Cannot extend a lock that's no longer owned")
        return True


if __name__ == '__main__':
    conn = RedisPool().connection()

    with RedisLuaLock(conn, 'hello_world', timeout=15, blocking=True, blocking_timeout=1):
        # do something
        print('xxx')
        time.sleep(3)