class TestTaobaoIntegrateListPageAction(TestCase):
    """Manual integration checks for the Taobao integrate-list search action.

    Both cases drive the real login bootstrap / HTTP actions for the first
    configured account. They contain no assertions, so they only show that
    the flow runs without raising.
    """
    __cookie_service = get_cookie_service()
    account = global_config.accounts[0]

    def test_login(self):
        """Log in through the Qt5 bootstrap and store the returned cookies."""
        login_url = SpiderUrls.get_sycm_login_url()
        cookies, _origin_cookies = spider_qt5_bootstrap(url=login_url, account=self.account)
        self.__cookie_service.dump(cookies, self.account)

    def test(self):
        """Run the presearch action, then the integrate-list action."""
        cookie_jar = self.__cookie_service.load(account=self.account)

        context = Context()
        context.attach(Context.KEY_IS_UPDATE_COOKIES, True)
        context.attach(Context.KEY_HEADERS, SpiderHttp.get_taobao_headers('https://s.m.taobao.com/h5'))
        context.attach(Context.KEY_COOKIES, cookie_jar)

        # Search subject: brand + model of the product to look up.
        product = Good()
        product.set_brand_name('Flyco/飞科')
        product.set_model_name('FR5218')
        context.attach(Context.KEY_GOOD_DICT, product)

        TaobaoPresearchAction().execute(context=context)

        try:
            TaobaoIntegrateListPageAction().execute(context=context)
        except CookieNeedUpdateException:
            # The action asked for a cookie refresh: persist the jar as it is now.
            self.__cookie_service.dump(cookie_jar, self.account)
Example #2
0
class TestSpiderQt5(TestCase):
    """Manual checks for the Qt5 browser bootstrap (login and cookie capture).

    No assertions are made; each case only exercises the bootstrap and stores
    whatever cookies it returns.
    """
    __cookie_service = get_cookie_service()
    __proxy_service = get_proxy_service()

    def test_login(self):
        """Log in on the SYCM login page with the first regular account, no proxy."""
        login_url = SpiderUrls.get_sycm_login_url()
        login_account = global_config.accounts[0]
        cookies, _origin_cookies = spider_qt5_bootstrap(login_url, login_account, True, None)
        self.__cookie_service.dump(cookies=cookies, account=login_account)
        print(cookies)

    def test_app_login(self):
        """Log in on the Taobao app login page with the first ``s_accounts`` entry, no proxy."""
        login_url = SpiderUrls.get_app_taobao_login_url()
        login_account = global_config.s_accounts[0]
        cookies, _origin_cookies = spider_qt5_bootstrap(login_url, login_account, True, None)
        self.__cookie_service.dump(cookies=cookies, account=login_account)
        print(cookies)

    def test_spider_qt5(self):
        """Open a search URL through the account's static proxy, reusing stored origin cookies."""
        search_url = 'https://s.m.taobao.com/h5?q=Flyco%2BFR5218&search=%E6%8F%90%E4%BA%A4&tab=all'
        search_account = global_config.s_accounts[0]
        static_proxy = self.__proxy_service.get_origin_static_proxy(search_account['username'])
        stored_origin = self.__cookie_service.load(account=search_account, type_=PickleFileType.origin_cookie)
        cookies, origin_cookies = spider_qt5_bootstrap(
            url=search_url,
            account=search_account,
            risk=True,
            proxy=static_proxy,
            cookies=stored_origin,
        )
        # Persist both cookie flavours returned by the bootstrap.
        self.__cookie_service.dump(cookies=cookies, account=search_account)
        self.__cookie_service.dump(cookies=origin_cookies, account=search_account,
                                   type_=PickleFileType.origin_cookie)
Example #3
0
class TestCookieService(TestCase):
    """Smoke checks for the cookie service's load / dump round trip (no assertions)."""
    __cookie_service = get_cookie_service()

    def test_load(self):
        """Load the stored cookies of the first configured account."""
        self.__cookie_service.load(account=global_config.accounts[0])

    def test_dump(self):
        """Store an empty ``PickleCookieJar`` for the second configured account."""
        target_account = global_config.accounts[1]
        empty_jar = PickleCookieJar()
        self.__cookie_service.dump(empty_jar, target_account)
Example #4
0
 def __init__(self, pool_size):
     """
     Set up the executor, the shared services and the per-account counters.

     :param pool_size: forwarded unchanged to ``ExecutorService``
     """
     super().__init__()
     self._counter = 0
     self._executor = ExecutorService(pool_size)
     self._proxy_service = get_proxy_service()
     self._cookie_service = get_cookie_service()
     configured_accounts = global_config.s_accounts
     # Two independent username -> count tables, both starting at zero.
     self._account_counter = {entry['username']: 0 for entry in configured_accounts}
     self._fail_account_counter = {entry['username']: 0 for entry in configured_accounts}
Example #5
0
class ActionService(object):
    _cookie_service = get_cookie_service()

    def __init__(self):
        """Initialise the base class; the service keeps no per-instance state here."""
        super().__init__()

    def get_sycm_product_actions(self):
        """Return new action instances for the SYCM product flow, in run order."""
        return [
            SycmProductGetBrandsAction(),
            SycmProductProdHotRankAction(),
            SycmProductProdHotRankPersistAction(),
        ]

    def get_taobao_integrate_list_actions(self):
        """Return new action instances for the Taobao list-page flow, in run order."""
        return [
            TaobaoPresearchAction(),
            TaobaoIntegrateListPageAction(),
            ThreadSleepAction(),
            TaobaoSaleListPageAction(),
            TaobaoListPagePersistAction(),
            ThreadSleepAction(),
        ]

    def get_taobao_http_detail_actions(self):
        """Return new action instances that fetch a detail page over HTTP and persist the good."""
        return [
            DetailMPageAction(),
            TaobaoDetailPageAction(),
            ThreadSleepAction(),
            GoodPersistAction(),
        ]

    def get_taobao_detail_actions(self):
        """Return the persist-only action list (a single new ``GoodPersistAction``)."""
        return [GoodPersistAction()]

    def get_task_actions(self):
        """Return the task-collection action list (a single new ``TaskCollectAction``)."""
        return [TaskCollectAction()]

    def get_sycm_category_job_init_actions(self):
        """Return the category-job initialisation list (a single new ``SycmCategoryJobInitAction``)."""
        return [SycmCategoryJobInitAction()]

    def execute_direct_good_actions(self, date_str):
        """
        Collect the ``sycm_list`` tasks for ``date_str`` and persist each task's good directly.

        :param date_str: value attached as ``Context.KEY_DIRECT_COLLECT_DATE``
        """
        context = Context()
        context.attach(Context.KEY_DIRECT_COLLECT_DATE, date_str)
        context.attach(Context.KEY_CURRENT_TASK_TYPE, TaobaoTaskType.sycm_list)
        TaskDirectCollectAction().execute(context=context)

        # The same context is reused for every task; only the attached good changes.
        for collected_task in context.get(Context.KEY_CURRENT_TASKS, []):
            current_good = Good(collected_task.raw_data['goodResult'])
            context.attach(Context.KEY_GOOD_DICT, current_good)
            GoodDirectPersistAction().execute(context=context)

    def execute_sycm_category_job_init_actions(self, date_str=None):
        """
        Run the category-job initialisation actions against a fresh context.

        :param date_str: when truthy, attached as ``Context.KEY_SYCM_SPECIFIC_DATE``;
            otherwise nothing is attached
        """
        init_actions = self.get_sycm_category_job_init_actions()
        context = Context()
        if date_str:
            context.attach(Context.KEY_SYCM_SPECIFIC_DATE, date_str)
        for init_action in init_actions:
            init_action.execute(context=context)

    def execute_sycm_product_actions(self, date_str=None):
        """
        Collect and return the current ``sycm_init`` tasks.

        :param date_str: accepted for interface compatibility; not read by this method
        :return: the value stored under ``Context.KEY_CURRENT_TASKS`` (``[]`` when absent)
        """
        task_context = self.execute_task_actions(TaobaoTaskType.sycm_init)
        return task_context.get(Context.KEY_CURRENT_TASKS, [])

    def _execute_legality_token_actions(self, cookies, account):
        """
        Fetch the SYCM home-page token with ``cookies`` and store it for ``account``.

        :param cookies: cookie jar attached to the request context
        :param account: account the token is stored under
        """
        token_context = Context()
        token_context.attach(Context.KEY_IS_UPDATE_COOKIES, False)
        token_context.attach(Context.KEY_HEADERS, SpiderHttp.get_sycm_home_htm_headers())
        token_context.attach(Context.KEY_COOKIES, cookies)

        SycmHomeHtmTokenAction().execute(context=token_context)

        # The token goes through the cookie service's dump(), under its own file type.
        self._cookie_service.dump(
            cookies=token_context.get(Context.KEY_SYCM_HOME_HTM_TOKEN_RESULT),
            account=account,
            type_=PickleFileType.legality_token,
        )

    def _relogin(self, account, cate_id, cate_name):
        """
        Log ``account`` in again, trying up to three times with a 5 second pause.

        On success the legality token and the cookies are both stored and the
        method returns.

        :param account: account mapping; ``account['username']`` is used for logging
        :param cate_id: category id, used only in log output
        :param cate_name: category name, used only in log output
        :raises ExitException: when the third attempt also fails
        """
        attempts = 3
        for attempt in range(attempts):
            final_attempt = attempt == attempts - 1
            try:
                cookies, _origin_cookies = spider_qt5_bootstrap(url=SpiderUrls.get_sycm_login_url(),
                                                                account=account)
                self._execute_legality_token_actions(cookies=cookies, account=account)
                self._cookie_service.dump(cookies, account)
                return
            except CookieExpiredException:
                logger.warning('relogin cookie is expired,cate_id:%s,cate_name:%s,account:%s', cate_id, cate_name,
                               account['username'])
                if final_attempt:
                    raise ExitException('exit when try login util max 3 times retry')
            except Exception as e:
                logger.error('retry error,retry times,%s,%s', attempt, e)
                if final_attempt:
                    raise ExitException('exit when try login util max 3 times retry')
            # Reached only after a failed, non-final attempt.
            time.sleep(5)

    def _execute_sycm_product_actions(self, task):
        """
        Run the SYCM product flow for ``task``, retrying up to three times.

        A missing or expired cookie triggers a re-login before the next
        attempt (the re-login also happens on the last attempt, just before
        the error is re-raised). Any other error is logged and retried.
        Attempts are separated by a 5 second pause.

        :param task: task whose ``raw_data`` holds ``account``, ``cateId`` and ``cateName``
        :raises Exception: the last failure, after being logged
        """
        payload = task.raw_data
        account = payload['account']
        cate_id = payload['cateId']
        cate_name = payload['cateName']
        last_index = 2

        try:
            for attempt in range(last_index + 1):
                try:
                    self.__execute_sycm_product_actions(task)
                    return
                except (CookieNotFoundException, CookieExpiredException) as e:
                    # Same recovery for both; only the log line differs.
                    if isinstance(e, CookieNotFoundException):
                        template = 'cookie is not exist,cate_id:%s,cate_name:%s,account:%s'
                    else:
                        template = 'cookie is expired,cate_id:%s,cate_name:%s,account:%s'
                    logger.warning(template, cate_id, cate_name, account['username'])
                    self._relogin(account=account, cate_id=cate_id, cate_name=cate_name)
                    if attempt == last_index:
                        raise e
                except Exception as e:
                    logger.exception(e)
                    if attempt == last_index:
                        raise e
                time.sleep(5)
        except Exception as e:
            # Also catches errors escaping from _relogin itself.
            logger.exception(e)
            raise e

    def __execute_sycm_product_actions(self, task):
        """
        Build the request context for one SYCM category task and run the product actions.

        :param task: task whose ``raw_data`` holds ``cateId``, ``cateName``,
            ``account`` and ``dateStr``
        :raises CookieNotFoundException: when no stored cookies exist for the account
        """
        payload = task.raw_data
        cate_id = payload['cateId']
        cate_name = payload['cateName']
        account = payload['account']
        target_date = payload['dateStr']

        context = Context()
        context.attach(Context.KEY_CURRENT_TASK, task)
        context.attach(Context.KEY_IS_UPDATE_COOKIES, False)
        context.attach(Context.KEY_HEADERS,
                       SpiderHttp.get_sycm_headers('https://sycm.taobao.com/mc/mq/product_insight'))

        cookie_jar = self._cookie_service.load(account)
        if not cookie_jar:
            raise CookieNotFoundException('cookie not found')

        context.attach(Context.KEY_COOKIES, cookie_jar)
        # Same flag as above, attached a second time; kept so the attach sequence is unchanged.
        context.attach(Context.KEY_IS_UPDATE_COOKIES, False)
        context.attach(Context.KEY_CURRENT_TASK_ACCOUNT, account)

        # Seed record describing the category and date being collected.
        seed_good = Good()
        seed_good.set_category_id(cate_id)
        seed_good.set_category_name(cate_name)
        seed_good.set_flag(str(int(GoodDataType.initial)))
        seed_good.set_date(target_date)
        context.attach(Context.KEY_GOOD_DICT, seed_good)

        brands_url = SpiderUrls.get_sycm_product_get_brands_url(cate_id)
        legality_token = self._cookie_service.load(account=account, type_=PickleFileType.legality_token)
        # The hot-rank query covers a single day: start and end are the same date.
        hot_rank_url = SpiderUrls.get_sycm_product_prod_hot_rank(start_data=target_date,
                                                                 end_date=target_date,
                                                                 cate_id=cate_id,
                                                                 token=legality_token)
        brands_request = HttpRequest(url=brands_url, method=HttpMethod.GET)
        hot_rank_request = HttpRequest(url=hot_rank_url, method=HttpMethod.GET)
        context.attach(Context.KEY_SYCM_PRODUCT_GET_BRANDS_HTTP_REQUEST, brands_request)
        context.attach(Context.KEY_SYCM_PRODUCT_PROD_HOT_RANK_HTTP_REQUEST, hot_rank_request)

        for product_action in self.get_sycm_product_actions():
            product_action.execute(context=context)

    def _execute_taobao_integrate_list_actions(self, task, account, proxy):
        """
        Run the list-page flow for ``task`` and report how it ended; never raises for ``Exception``.

        :param task: task to process
        :param account: account whose cookies are used; echoed back in the result
        :param proxy: proxy attached to the request context
        :return: ``(account, True, True)`` after ``CookieExpiredException``,
            ``(account, True, False)`` after ``InterruptException``,
            ``(account, False, False)`` otherwise (success, proxy error or any other error)
        """
        second_flag = False
        third_flag = False
        try:
            self.__execute_taobao_integrate_list_actions(task, account, proxy)
        except ProxyException as e:
            logger.exception(e)
        except CookieExpiredException as e:
            logger.exception(e)
            second_flag, third_flag = True, True
        except InterruptException as e:
            logger.exception(e)
            second_flag = True
        except Exception as e:
            logger.exception(e)
        return account, second_flag, third_flag

    def _login(self, account, force, risk=False, proxy=None):
        """
        Log ``account`` in when it has no stored cookies, or unconditionally when ``force`` is truthy.

        :param account: account to log in
        :param force: log in even if stored cookies exist
        :param risk: forwarded to ``spider_qt5_bootstrap``
        :param proxy: forwarded to ``spider_qt5_bootstrap``
        """
        stored_jar = self._cookie_service.load(account)
        if stored_jar and not force:
            return
        cookies, origin_cookies = spider_qt5_bootstrap(url=SpiderUrls.get_sycm_login_url(), account=account,
                                                       risk=risk, proxy=proxy)
        # Both cookie flavours returned by the bootstrap are stored.
        self._cookie_service.dump(cookies, account)
        self._cookie_service.dump(cookies=origin_cookies, account=account, type_=PickleFileType.origin_cookie)

    def __execute_taobao_integrate_list_actions(self, task, account, proxy):
        """
        Build the search context for ``task`` and run the list-page actions in order.

        The chain stops early as soon as an action returns a falsy result.

        :param task: task whose ``raw_data['goodResult']`` describes the good to search for
        :param account: account whose stored cookies are used
        :param proxy: proxy attached as ``Context.KEY_CURRENT_PROXY``
        :raises CookieExpiredException: when the account has no stored cookies
        :raises CookieNeedUpdateException: re-raised from an action after the cookie jar is stored
        """
        target_good = Good(task.raw_data['goodResult'])

        context = Context()
        context.attach(Context.KEY_CURRENT_TASK, task)
        context.attach(Context.KEY_CURRENT_TASK_ACCOUNT, account)
        context.attach(Context.KEY_CURRENT_PROXY, proxy)

        cookie_jar = self._cookie_service.load(account)
        if not cookie_jar:
            raise CookieExpiredException('integrate list need first login')

        context.attach(Context.KEY_IS_UPDATE_COOKIES, True)
        context.attach(Context.KEY_HEADERS, SpiderHttp.get_taobao_headers('https://s.m.taobao.com/h5'))
        context.attach(Context.KEY_COOKIES, cookie_jar)
        context.attach(Context.KEY_CURRENT_SLEEP_SECS, 2)
        context.attach(Context.KEY_GOOD_DICT, target_good)

        for step in self.get_taobao_integrate_list_actions():
            try:
                outcome = step.execute(context=context)
            except CookieNeedUpdateException as e:
                # Store the jar before propagating the refresh request.
                self._cookie_service.dump(cookie_jar, account)
                raise e
            if not outcome:
                break

    def execute_taobao_integrate_list_actions(self):
        """
        Collect and return the current ``sycm_list`` tasks.

        :return: the value stored under ``Context.KEY_CURRENT_TASKS`` (``[]`` when absent)
        """
        task_context = self.execute_task_actions(type=TaobaoTaskType.sycm_list)
        return task_context.get(Context.KEY_CURRENT_TASKS, [])

    def execute_task_actions(self, type):
        """
        Run the task-collection actions for the given task type.

        :param type: task type attached as ``Context.KEY_CURRENT_TASK_TYPE``
            (the name shadows the builtin but is part of the keyword interface)
        :return: the context the actions ran against
        """
        task_context = Context()
        task_context.attach(Context.KEY_CURRENT_TASK_TYPE, type)
        for collector in self.get_task_actions():
            collector.execute(task_context)
        return task_context

    def _execut_taobao_detail_actions(self, task, proxy=None):
        """
        Run the detail flow for ``task`` without letting ``Exception`` escape.

        :param task: task to process
        :param proxy: proxy used for the requests
        :return: ``proxy`` when a ``ProxyException`` occurred, otherwise ``None``
            (also ``None`` when another error was logged and swallowed)
        """
        try:
            self.__execut_taobao_detail_actions(task, proxy)
        except ProxyException:
            # Hand the failing proxy back to the caller; deliberately not logged.
            return proxy
        except Exception as e:
            logger.exception(e)
        return None

    def __execut_taobao_detail_actions(self, task, proxy=None):
        """
        Resolve the good described by ``task`` against its two search-result lists.

        The sale list (``raw_data['saleInfos']``) and the integrate list
        (``raw_data['integrateInfos']``) are walked in lockstep. For each
        candidate:

        * if its title contains the model name (case-insensitive) and
          ``Category.check_cate_id`` accepts its category, the list price is
          recorded, the good is flagged ``GoodDataType.success`` and persisted,
          and the search stops;
        * otherwise, for the first few candidates only, the detail page is
          fetched through the HTTP detail actions; if that completes without
          error the search stops.

        If the loop ends without success the good is flagged
        ``GoodDataType.not_found`` and persisted.

        :param task: task whose ``raw_data`` holds ``goodResult``,
            ``integrateInfos`` and ``saleInfos``
        :param proxy: proxy attached to every context as ``Context.KEY_CURRENT_PROXY``
        """
        raw_data = task.raw_data
        good_result = Good(raw_data['goodResult'])
        model_name = good_result.get_model_name()
        cate_id = good_result.get_category_id()
        integrate_infos = raw_data['integrateInfos']
        sale_infos = raw_data['saleInfos']
        # i indexes sale_infos from its first entry; j indexes integrate_infos
        # starting at its SECOND entry (index 0 of integrate_infos is never read).
        i = 0
        j = 1
        # Upper bound on loop iterations; an empty list on either side means
        # the loop body never runs.
        length = min(len(integrate_infos), len(sale_infos))
        is_success = False

        context = Context()
        context.attach(Context.KEY_GOOD_DICT, good_result)
        context.attach(Context.KEY_CURRENT_TASK, task)
        context.attach(Context.KEY_CURRENT_PROXY, proxy)

        # x itself is unused: it only limits the number of passes. A pass that
        # ends in a retry (indices not advanced) still consumes one iteration.
        for x in range(0, length):
            is_need_retry = False
            # ---- candidate from the sale-sorted list ----
            if i < len(sale_infos):
                sale_info = sale_infos[i]
                sale_item_id = sale_info['itemId']
                sale_title = sale_info['title']
                sale_cate_id = sale_info['category']
                sale_price = sale_info['price']
                # str(sale_title).upper()
                # if str(sale_title).upper().find(str(model_name).upper()) != -1 and str(cate_id) == str(sale_cate_id):
                if str(sale_title).upper().find(str(model_name).upper()) != -1 and Category.check_cate_id(cate_id,
                                                                                                          sale_cate_id):
                    # Title/category match: take the price straight from the list entry.
                    actions = self.get_taobao_detail_actions()
                    is_success = True
                    # skuId '-1' marks a price taken from the sale list.
                    price_info = [{
                        'skuId': '-1',
                        'price': yuan_2_cent(sale_price)
                    }]
                    good_result.set_price_info(price_info=price_info)
                    good_result.set_flag(str(int(GoodDataType.success)))
                    for action in actions:
                        action.execute(context=context)
                    break
                elif i < 5:
                    # No match in the list entry: fetch the detail page, but
                    # only for the first five sale candidates.
                    actions = self.get_taobao_http_detail_actions()
                    timestamps = int(datetime.now().timestamp() * 1000)
                    # sign = get_sign('414804c1e894540b7f18f703c74346cf', str(timestamps), '12574478',
                    #                 '{"itemNumId":"%s"' % (sale_item_id))
                    sale_detail_url = SpiderUrls.get_taobao_detail_url(timestamps, '', sale_item_id)
                    # NOTE: rebinds `context` to a fresh one with an empty cookie
                    # jar; later code in this method (including the final
                    # not-found persist) uses whichever context was bound last.
                    context = Context()
                    context.attach(Context.KEY_GOOD_DICT, good_result)
                    context.attach(Context.KEY_CURRENT_TASK, task)
                    context.attach(Context.KEY_COOKIES, RequestsCookieJar())
                    context.attach(Context.KEY_IS_UPDATE_COOKIES, True)
                    context.attach(Context.KEY_CURRENT_PROXY, proxy)

                    detail_m_url = SpiderUrls.get_detail_m_url(sale_info['userType'], sale_item_id)
                    detail_m_http_request = HttpRequest(detail_m_url, method=HttpMethod.GET)
                    context.attach(Context.KEY_DETAIL_M_HTTP_REQUEST, detail_m_http_request)

                    sale_http_request = HttpRequest(url=sale_detail_url, method=HttpMethod.GET)
                    context.attach(Context.KEY_TAOBAO_DETAIL_HTTP_REQUEST, sale_http_request)
                    context.attach(Context.KEY_HEADERS, SpiderHttp.get_taobao_headers(detail_m_url))
                    try:
                        for action in actions:
                            action.execute(context=context)
                        is_success = True
                        break
                    except RetryException as e:
                        # Logged without traceback here (the integrate branch
                        # below uses logger.exception for the same case).
                        logger.error(e)
                        time.sleep(5)
                    except InterruptException as e:
                        logger.exception(e)
                        time.sleep(10)
                        # Keep i/j unchanged so the same candidates are tried
                        # again on the next pass. The integrate branch below
                        # still runs in this pass.
                        is_need_retry = True
                        # raise e
                # if is_success:
                #     break
            # ---- candidate from the integrate (default-sorted) list ----
            if j < len(integrate_infos):
                integrate_info = integrate_infos[j]
                integrate_item_id = integrate_info['itemId']
                integrate_title = integrate_info['title']
                integrate_cate_id = integrate_info['category']
                integrate_price = integrate_info['price']
                # if str(integrate_title).upper().find(
                #         str(model_name).upper()) != -1 and str(cate_id) == str(integrate_cate_id):
                if str(integrate_title).upper().find(
                        str(model_name).upper()) != -1 and Category.check_cate_id(cate_id, integrate_cate_id):
                    actions = self.get_taobao_detail_actions()
                    is_success = True
                    # skuId '-2' marks a price taken from the integrate list.
                    price_info = [{
                        'skuId': '-2',
                        'price': yuan_2_cent(integrate_price)
                    }]
                    good_result.set_price_info(price_info=price_info)
                    good_result.set_flag(str(int(GoodDataType.success)))
                    for action in actions:
                        action.execute(context=context)
                    break
                elif j < 6:
                    # Detail-page fallback, limited to integrate indices 1..5.
                    actions = self.get_taobao_http_detail_actions()
                    timestamps = int(datetime.now().timestamp() * 1000)
                    integrate_detail_url = SpiderUrls.get_taobao_detail_url(timestamps, '', integrate_item_id)
                    # Fresh context again (see the note in the sale branch).
                    context = Context()
                    # referer = 'https://s.m.taobao.com/h5'
                    # context.attach(Context.KEY_HEADERS, SpiderHttp.get_taobao_headers(referer))
                    context.attach(Context.KEY_GOOD_DICT, good_result)
                    context.attach(Context.KEY_CURRENT_TASK, task)
                    context.attach(Context.KEY_COOKIES, RequestsCookieJar())
                    context.attach(Context.KEY_IS_UPDATE_COOKIES, True)
                    context.attach(Context.KEY_CURRENT_PROXY, proxy)

                    detail_m_url = SpiderUrls.get_detail_m_url(integrate_info['userType'], integrate_item_id)
                    detail_m_http_request = HttpRequest(detail_m_url, method=HttpMethod.GET)
                    context.attach(Context.KEY_DETAIL_M_HTTP_REQUEST, detail_m_http_request)

                    integrate_http_request = HttpRequest(url=integrate_detail_url, method=HttpMethod.GET)
                    context.attach(Context.KEY_TAOBAO_DETAIL_HTTP_REQUEST, integrate_http_request)

                    context.attach(Context.KEY_HEADERS, SpiderHttp.get_taobao_headers(detail_m_url))
                    try:
                        for action in actions:
                            action.execute(context=context)
                        is_success = True
                        break
                    except RetryException as e:
                        logger.exception(e)
                        time.sleep(5)
                    except InterruptException as e:
                        logger.exception(e)
                        time.sleep(10)
                        is_need_retry = True
                        # raise e
            # Advance both lists together unless either branch asked for a retry.
            if not is_need_retry:
                i += 1
                j += 1
        if not is_success:
            # Nothing matched: persist the good flagged as not found, using the
            # context bound most recently above.
            actions = self.get_taobao_detail_actions()
            good_result.set_flag(str(int(GoodDataType.not_found)))
            for action in actions:
                action.execute(context=context)

    def execut_taobao_detail_actions(self):
        """
        Collect and return the current ``taobao_list`` tasks.

        :return: the value stored under ``Context.KEY_CURRENT_TASKS`` (``[]`` when absent)
        """
        task_context = self.execute_task_actions(type=TaobaoTaskType.taobao_list)
        return task_context.get(Context.KEY_CURRENT_TASKS, [])
Example #6
0
class TaobaoSaleListPageAction(HttpAction):
    __cookie_service = get_cookie_service()

    def __build_data(self, context):
        """
        Build the compact JSON ``data`` parameter for the sale-sorted search request.

        :param context: must hold the good (``KEY_GOOD_DICT``) and the presearch
            result (``KEY_TAOBAO_PRESEARCH_RESULT``), which is expanded as keyword arguments
        :return: JSON text with every space character removed (including any
            spaces that were inside values)
        """
        search_terms = context.get(Context.KEY_GOOD_DICT).get_query()
        presearch_params = context.get(Context.KEY_TAOBAO_PRESEARCH_RESULT)
        # Always requests the first result page.
        payload = SpiderParams.build_taobao_search_data(
            search_terms, 1, 'list', 'nav,selecthot,onesearch', '_sale', **presearch_params)
        return json.dumps(payload, ensure_ascii=False).replace(' ', '')

    def __build_token(self, context):
        """
        Extract the signing token from the ``_m_h5_tk`` cookie.

        :param context: must hold the cookie jar under ``KEY_COOKIES``
        :return: the text before the first ``_`` of the cookie value, or ``''``
            when the cookie is missing or empty
        """
        token_cookie = context.get(Context.KEY_COOKIES).get('_m_h5_tk')
        if token_cookie:
            return str(token_cookie).partition('_')[0]
        return ''

    def do_execute(self, context):
        """
        Sign and send the sale-sorted search request, decode it, then store the cookies.

        :param context: request context; receives the built request under
            ``KEY_TAOBAO_SALE_LIST_HTTP_REQUEST`` and, via ``unmarshal``, the
            decoded result
        :return: always ``True``
        """
        now_ms = int(datetime.now().timestamp() * 1000)
        payload = self.__build_data(context=context)
        # The signature is computed over the payload with '+' turned into spaces;
        # the URL itself carries the payload unchanged.
        payload_for_sign = payload.replace('+', ' ')
        token = self.__build_token(context=context)
        signature = get_sign(token=token,
                             timestamps=str(now_ms),
                             appKey='12574478',
                             data=payload_for_sign)

        request_url = SpiderUrls.get_taobao_serarch_url(
            type=TaobaoSearchType.sale,
            timestamps=now_ms,
            sign=signature,
            data=payload)
        context.attach(Context.KEY_TAOBAO_SALE_LIST_HTTP_REQUEST,
                       HttpRequest(url=request_url, method=HttpMethod.GET))

        response = self.execute_in_retry(
            context=context,
            http_request=context.get(Context.KEY_TAOBAO_SALE_LIST_HTTP_REQUEST))
        self.unmarshal(context=context, response=response)

        # Store the cookie jar from the context for the task's account.
        jar = context.get(Context.KEY_COOKIES)
        owner = context.get(Context.KEY_CURRENT_TASK_ACCOUNT)
        self.__cookie_service.dump(jar, owner)
        return True

    def unmarshal(self, context, response):
        """
        Decode the JSONP search response and attach its ``data`` member to the context.

        Only the ``mtopjsonp2( ... )`` wrapper is removed before decoding.
        Closing parentheses inside the JSON body (for example in item titles)
        are preserved; stripping every ``)`` would silently alter the data.

        :param context: receives the decoded ``data`` under ``KEY_TAOBAO_SALE_RESULT``
        :param response: HTTP response; only ``response.text`` is read
        :raises InterruptException: the body contains the "please retry later" notice
        :raises CookieExpiredException: the body points to the login host
        :raises CookieNeedUpdateException: the body reports ``FAIL_SYS_TOKEN_EMPTY``
        :raises ValueError: the unwrapped body is not valid JSON
        :raises KeyError: the decoded JSON has no ``data`` member
        """
        result = response.text
        if '请稍后重试' in result:
            raise InterruptException(u'happen to risk,value:%s' % (result))
        if 'login.m.taobao.com' in result:
            raise CookieExpiredException(
                'cookie is expired,msg:taobao sale list page')
        if 'FAIL_SYS_TOKEN_EMPTY' in result:
            raise CookieNeedUpdateException(
                'cookies need update,msg:taobao sale list page')

        callback_prefix = 'mtopjsonp2('
        body = result.strip()
        start = body.find(callback_prefix)
        if start != -1:
            body_start = start + len(callback_prefix)
            # The wrapper's own closing parenthesis is the LAST one in the text.
            body_end = body.rfind(')')
            if body_end < body_start:
                body_end = len(body)
            body = body[body_start:body_end]

        # The body is plain JSON once unwrapped, so it is decoded directly.
        decoded = json.loads(body)
        context.attach(Context.KEY_TAOBAO_SALE_RESULT, decoded['data'])

    def on_create(self, context):
        """No-op; this action does nothing in ``on_create``."""

    def on_start(self, context):
        """No-op; this action does nothing in ``on_start``."""

    def on_complete(self, context):
        """No-op; this action does nothing in ``on_complete``."""

    def on_destroy(self, context):
        """No-op; this action does nothing in ``on_destroy``."""

    def parse_js(self, expr):
        """
        Convert a JavaScript-style object literal into plain Python data.

        Unlike ``json.loads`` this tolerates unquoted keys. Bare identifiers,
        whether used as keys or as values (``true``, ``null``, ...), come back
        as their name text.

        :param expr: source text of the literal, or an already-built ``ast``
            node. For a string-constant node the wrapped string is returned
            unchanged, which is what ``unmarshal`` has relied on.
        :return: dict / list / str / number mirroring the literal
        :raises NotImplementedError: for syntax other than constants, names,
            dicts and lists (the offending node class is the argument)
        :raises SyntaxError: if ``expr`` is text that Python's parser rejects
        """
        import ast

        # ``ast.Constant`` replaces the deprecated ``ast.Str`` / ``ast.Num``.
        # On interpreters that lack it, the empty tuple makes the isinstance()
        # check below evaluate to False.
        constant_type = getattr(ast, 'Constant', ())

        def parse(node):
            if isinstance(node, ast.Expression):
                return parse(node.body)
            if isinstance(node, ast.Expr):
                return parse(node.value)
            if isinstance(node, constant_type):
                return node.value
            # Legacy node classes, matched by name so the deprecated
            # attributes of the ``ast`` module are never touched.
            legacy_kind = type(node).__name__
            if legacy_kind == 'Str':
                return node.s
            if legacy_kind == 'Num':
                return node.n
            if isinstance(node, ast.Name):
                return node.id
            if isinstance(node, ast.Dict):
                return dict(zip(map(parse, node.keys), map(parse, node.values)))
            if isinstance(node, ast.List):
                # A real list, not a one-shot ``map`` iterator.
                return [parse(element) for element in node.elts]
            raise NotImplementedError(node.__class__)

        if isinstance(expr, ast.AST):
            return parse(expr)
        # Leading whitespace would be read as indentation in 'eval' mode.
        return parse(ast.parse(expr.strip(), mode='eval'))