def unmarshal(self, context, response):
     """Parse a Taobao detail-page JSONP response into a dict.

     Raises InterruptException when the page contains the rate-limit
     marker, RetryException when the expected ``apiStack`` payload is
     missing, and re-raises any JSON decode failure of the nested
     ``value`` string after logging it.
     """
     result = response.text
     # '请稍后重试' ("please retry later") marks an anti-crawl / risk page.
     if result.find('请稍后重试') != -1:
         raise InterruptException(u'happen to risk,value:%s' % (result))
     if result.find('apiStack') == -1:
         raise RetryException(u'may happen to risk,value:%s' % (result))
     # Strip the JSONP callback wrapper: 'mtopjsonp3(' prefix and ')' chars.
     result = result.replace('mtopjsonp3(', '')
     # NOTE(review): ast.Str wraps the text in an AST node before it is
     # handed to parse_js -- presumably parse_js expects an AST node;
     # confirm, since a plain str would be the usual choice here.
     result = ast.Str(result.replace(')', ''))
     result = self.parse_js(result)
     # print(result)
     # result = result.replace(':"\\"', ':"')
     # result = result.replace('\\""', '"')
     # result = result.replace('mtopjsonp2(', '')
     # result = result.replace(')', '')
     # result = result.replace('\\', '')
     # result = result.replace('"{', '{')
     # result = result.replace('}"', '}')
     # result = result.replace('"[', '[')
     # result = result.replace(']"', ']')
     result = json.loads(result)
     try:
         # 'apiStack[0].value' arrives as a JSON string; decode it in place.
         value_str = result['data']['apiStack'][0]['value']
         result['data']['apiStack'][0]['value'] = json.loads(value_str)
     except Exception as e:
         logger.error('taobao detail page unmarshal error,result:%s', result)
         raise e
     return result
Beispiel #2
0
 def __get_list_api_param(self, html, soup, param_name):
     """Extract the value of ``param_name`` from an inline <script> tag.

     Any failure (tag not found, pattern mismatch) is logged together
     with the raw response and re-raised.
     """
     try:
         regex = re.compile(param_name + ":'(.*?)'", re.MULTILINE | re.DOTALL)
         tag = soup.find("script", text=regex)
         match = regex.search(tag.text)
         return match.group(1)
     except Exception as e:
         logger.error('list api params error,param_name%s,origin response:%s,exp:%s', param_name, html, e)
         raise e
Beispiel #3
0
def mongo_collection_scope(**kwargs):
    """Yield the mongo collection named by ``collection_name``.

    Generator intended to be wrapped as a context manager: the
    collection is yielded to the caller and any failure raised inside
    the ``with`` body is logged with its traceback and re-raised.
    """
    collection_name = kwargs.get('collection_name')
    try:
        collection = db.get_collection(name=collection_name)
        yield collection
    except BaseException:  # explicit spelling of the old bare ``except:``
        logger.error("failed to finish the mongo commit: %s", traceback.format_exc())
        raise
Beispiel #4
0
 def unmarshal(self, context, response):
     """Extract the ``legalityToken`` value embedded in the login page.

     Raises CookieExpiredException (wrapping the original error) when
     the token cannot be found in the response body.
     """
     page = response.text
     try:
         match = re.search("legalityToken=(.*?);", page)
         return match.group(1)
     except Exception as e:
         logger.error('legalityToken fetch error,origin response:%s,exp:%s',
                      page, e)
         raise CookieExpiredException(e)
Beispiel #5
0
def _session_scope(engine, **kwargs):
    """Provide a transactional session bound to *engine*.

    Commits when the caller's block completes; on ANY failure
    (including KeyboardInterrupt/GeneratorExit, hence BaseException)
    logs the traceback, rolls back and re-raises.  The session is
    always closed.
    """
    session = _Session(bind=engine, **kwargs)
    try:
        yield session
        session.commit()
    except BaseException:  # explicit spelling of the old bare ``except:``
        logger.error("failed to finish the commit: %s", traceback.format_exc())
        session.rollback()
        raise
    finally:
        session.close()
Beispiel #6
0
def get_sign_js():
    """Load and return the bundled ``sign.js`` source.

    Returns None when the file does not exist; logs and re-raises any
    read failure.
    """
    data_file = Path(__file__).parent.joinpath('../../../js/' + 'sign.js')
    if not data_file.exists():
        return None
    try:
        text = data_file.read_text()
        # This log line was previously dead code placed after ``return``.
        logger.info('load sign.js success')
        return text
    except Exception as e:
        logger.error('load sign.js error,exp:%s', e)
        raise
Beispiel #7
0
    def on_error(self, context, exp):
        """Log a structured record for a failed action execution."""
        task = context.get(Context.KEY_CURRENT_TASK, '')
        good = context.get(Context.KEY_GOOD_DICT, dict())
        # When no current task is attached, id/data are reported as None.
        task_id = task.id if task else None
        data = task.raw_data if task else None
        logger.error(
            u'context key:[%s],action:[%s],task_id:[%s],good:[%s],execute error,data:%s,exception:%s',
            context.context_key, self.__class__.__name__, task_id, good, data,
            exp)
Beispiel #8
0
    def on_error(self, context, exp):
        """Log a structured record for a failed action, including the
        sycm hot-rank payload attached to the context."""
        task = context.get(Context.KEY_CURRENT_TASK, '')
        good = context.get(Context.KEY_GOOD_DICT, dict())
        hot_rank_result = context.get(
            Context.KEY_SYCM_PRODUCT_PROD_HOT_RANK_RESULT)
        # When no current task is attached, id/data are reported as None.
        task_id = task.id if task else None
        data = task.raw_data if task else None
        logger.error(
            u'context key:[%s],action:[%s],task_id:[%s],good:[%s],execute error,data:%s,origin data:%s,exception:%s',
            context.context_key, self.__class__.__name__, task_id, good, data,
            hot_rank_result, exp)
 def _relogin(self, account, cate_id, cate_name):
     """Attempt a fresh sycm login up to three times.

     Returns on the first success; raises ExitException once all three
     attempts have failed.  Sleeps 5s between attempts.
     """
     max_tries = 3
     for attempt in range(max_tries):
         last_try = attempt == max_tries - 1
         try:
             cookies, origin_cookies = spider_qt5_bootstrap(url=SpiderUrls.get_sycm_login_url(), account=account)
             self._execute_legality_token_actions(cookies=cookies, account=account)
             self._cookie_service.dump(cookies, account)
             return
         except CookieExpiredException:
             logger.warning('relogin cookie is expired,cate_id:%s,cate_name:%s,account:%s', cate_id, cate_name,
                            account['username'])
             if last_try:
                 raise ExitException('exit when try login util max 3 times retry')
         except Exception as e:
             logger.error('retry error,retry times,%s,%s', attempt, e)
             if last_try:
                 raise ExitException('exit when try login util max 3 times retry')
         time.sleep(5)
Beispiel #10
0
    def execute(self, num):
        """Fan out up to ``num`` detail-page tasks to the executor pool
        and recycle the proxy of every task that reports one back."""
        tasks = self.execut_taobao_detail_actions()
        future_tasks = {}
        for index, task in enumerate(tasks):
            # A proxy is drawn for every task, submitted or not (as before).
            proxy = self._proxy.get_proxy()
            if index < num:
                future = self._executor.submit(
                    self._execut_taobao_detail_actions, task, proxy)
                future_tasks[future] = task
        for future in as_completed(future_tasks):
            try:
                proxy = future.result()
                if proxy:
                    self._proxy.remove_proxy(url=proxy['https'])
            except Exception as e:
                logger.error(e)
Beispiel #11
0
    def load(self, account, path=None, type_=PickleFileType.cookie):
        """Load a pickled artefact (cookies / origin cookies / legality
        token) for *account* from disk.

        Returns None when the backing file does not exist.  Cookie and
        origin-cookie loads log and re-raise on failure; a legality-token
        load degrades to '' instead of raising.
        """
        if not path:
            path = '../../data/'
        account = account['username']
        key = self.gen_by_account(account=account, type_=type_)
        # NOTE(review): assumes gen_by_account is deterministic -- the
        # original called it twice with identical arguments.
        data_file = Path(__file__).parent.joinpath(path + key)
        if not data_file.exists():
            return None

        if type_ == PickleFileType.cookie:
            try:
                cookies = self._read_pickle(data_file)
                logger.info('load cookies success,account:[%s]', account)
                self.cookies_dict[key] = cookies  # warm the in-memory cache
                return cookies
            except Exception as e:
                logger.error('load cookies error,account:[%s],exp:%s', account, e)
                raise
        elif type_ == PickleFileType.origin_cookie:
            try:
                cookies = self._read_pickle(data_file)
                logger.info('load origin cookies success,account:[%s]', account)
                return cookies
            except Exception as e:
                logger.error('load origin cookies error,account:[%s],exp:%s', account, e)
                raise
        else:
            try:
                legality_token = self._read_pickle(data_file)
                logger.info('load legality_token success,account:[%s]', account)
                return legality_token
            except Exception as e:
                # Best-effort: a missing/corrupt token degrades to ''.
                logger.error('load legality_token error,account:[%s],exp:%s', account, e)
                return ''

    @staticmethod
    def _read_pickle(data_file):
        """Unpickle and return the contents of *data_file*."""
        return pickle.loads(data_file.read_bytes())
Beispiel #12
0
    def execute(self, num):
        """Dispatch up to ``num`` integrate-list tasks across the shared
        account pool, then react to each result:

        * hard failure (``force``): mark the account as risky;
        * repeated soft failures: drop its cookies and mark it risky;
        * otherwise refresh its cookies via the QT5 browser bootstrap,
          also done after every 2 consecutive successes.
        """
        cycle_login_num = 0
        tasks = self.execute_taobao_integrate_list_actions()
        i = 0
        future_tasks = {}
        # Accounts already flagged as risky are skipped entirely.
        with read_session_scope() as session:
            _stream_risk_dao = get_stream_risk_dao(session=session)
            rsts = _stream_risk_dao.base_query.all()
            risk_usernames = set(item.raw_data for item in rsts)
        s_accounts = global_config.s_accounts
        for task in tasks:
            # Round-robin over the account pool.
            account = s_accounts[self._counter % len(s_accounts)]
            proxy = self._proxy_service.get_static_proxy(account['username'])
            self._counter += 1
            i += 1
            if account['username'] in risk_usernames:
                continue
            if i < num:
                future_tasks[self._executor.submit(
                    self._execute_taobao_integrate_list_actions, task, account,
                    proxy)] = task

        for future in as_completed(future_tasks):
            try:
                account, flag, force = future.result()
                if flag:
                    if force:
                        # Hard failure: flag the account as risky now.
                        self._mark_stream_risk(account)
                        cycle_login_num += 1
                    else:
                        self._fail_account_counter[account['username']] += 1
                        if self._fail_account_counter[account['username']] > 2:
                            # Too many soft failures: drop cookies and flag.
                            self._cookie_service.remove(account=account)
                            self._mark_stream_risk(account)
                            cycle_login_num += 1
                        else:
                            self._refresh_account_cookies(account)
                    self._account_counter[account['username']] = 0
                else:
                    self._fail_account_counter[account['username']] = 0
                    self._account_counter[account['username']] += 1
                    # Rotate cookies after every 2 consecutive successes.
                    if self._account_counter[account['username']] >= 2:
                        self._refresh_account_cookies(account)
                        self._account_counter[account['username']] = 0

            except Exception as e:
                logger.error(e)

    def _mark_stream_risk(self, account):
        """Persist *account* into the stream-risk table."""
        with write_session_scope() as session:
            _stream_risk_dao = get_stream_risk_dao(session=session)
            self._risk(stream_risk_dao=_stream_risk_dao, account=account)

    def _refresh_account_cookies(self, account):
        """Re-drive the QT5 browser against a fixed search URL to refresh
        and persist both cookie jars for *account*."""
        url = 'https://s.m.taobao.com/h5?q=Flyco%2BFR5218&search=%E6%8F%90%E4%BA%A4&tab=all'
        proxy = self._proxy_service.get_origin_static_proxy(
            account['username'])
        cookies = self._cookie_service.load(
            account=account,
            type_=PickleFileType.origin_cookie)
        time.sleep(5)
        cookies, origin_cookies = spider_qt5_bootstrap(
            url=url,
            account=account,
            risk=False,
            proxy=proxy,
            cookies=cookies)
        self._cookie_service.dump(cookies=cookies, account=account)
        self._cookie_service.dump(
            cookies=origin_cookies,
            account=account,
            type_=PickleFileType.origin_cookie)
Beispiel #13
0
    def __execut_taobao_detail_actions(self, task, proxy=None):
        """Walk the task's sale and integrate candidate lists in lockstep,
        looking for an item whose title contains the model name and whose
        category matches; on a match, run the detail actions and stop.

        Two cursors advance together: ``i`` over saleInfos (from 0) and
        ``j`` over integrateInfos (from 1).  For the first few
        non-matching candidates an HTTP detail fetch is attempted as a
        fallback; InterruptException pauses and retries the same pair
        (``is_need_retry``).  If nothing matches, the good is flagged
        ``not_found`` and the detail actions run once anyway.
        """
        raw_data = task.raw_data
        good_result = Good(raw_data['goodResult'])
        model_name = good_result.get_model_name()
        cate_id = good_result.get_category_id()
        integrate_infos = raw_data['integrateInfos']
        sale_infos = raw_data['saleInfos']
        i = 0
        j = 1
        length = min(len(integrate_infos), len(sale_infos))
        is_success = False

        context = Context()
        context.attach(Context.KEY_GOOD_DICT, good_result)
        context.attach(Context.KEY_CURRENT_TASK, task)
        context.attach(Context.KEY_CURRENT_PROXY, proxy)

        for x in range(0, length):
            is_need_retry = False
            # --- sale-list candidate at cursor i -------------------------
            if i < len(sale_infos):
                sale_info = sale_infos[i]
                sale_item_id = sale_info['itemId']
                sale_title = sale_info['title']
                sale_cate_id = sale_info['category']
                sale_price = sale_info['price']
                # str(sale_title).upper()
                # if str(sale_title).upper().find(str(model_name).upper()) != -1 and str(cate_id) == str(sale_cate_id):
                if str(sale_title).upper().find(str(model_name).upper()) != -1 and Category.check_cate_id(cate_id,
                                                                                                          sale_cate_id):
                    actions = self.get_taobao_detail_actions()
                    is_success = True
                    # skuId '-1' marks a price taken from the sale list.
                    price_info = [{
                        'skuId': '-1',
                        'price': yuan_2_cent(sale_price)
                    }]
                    good_result.set_price_info(price_info=price_info)
                    good_result.set_flag(str(int(GoodDataType.success)))
                    for action in actions:
                        action.execute(context=context)
                    break
                elif i < 5:
                    # Fallback: fetch the item's detail page over HTTP.
                    actions = self.get_taobao_http_detail_actions()
                    timestamps = int(datetime.now().timestamp() * 1000)
                    # sign = get_sign('414804c1e894540b7f18f703c74346cf', str(timestamps), '12574478',
                    #                 '{"itemNumId":"%s"' % (sale_item_id))
                    sale_detail_url = SpiderUrls.get_taobao_detail_url(timestamps, '', sale_item_id)
                    context = Context()
                    context.attach(Context.KEY_GOOD_DICT, good_result)
                    context.attach(Context.KEY_CURRENT_TASK, task)
                    context.attach(Context.KEY_COOKIES, RequestsCookieJar())
                    context.attach(Context.KEY_IS_UPDATE_COOKIES, True)
                    context.attach(Context.KEY_CURRENT_PROXY, proxy)

                    detail_m_url = SpiderUrls.get_detail_m_url(sale_info['userType'], sale_item_id)
                    detail_m_http_request = HttpRequest(detail_m_url, method=HttpMethod.GET)
                    context.attach(Context.KEY_DETAIL_M_HTTP_REQUEST, detail_m_http_request)

                    sale_http_request = HttpRequest(url=sale_detail_url, method=HttpMethod.GET)
                    context.attach(Context.KEY_TAOBAO_DETAIL_HTTP_REQUEST, sale_http_request)
                    context.attach(Context.KEY_HEADERS, SpiderHttp.get_taobao_headers(detail_m_url))
                    try:
                        for action in actions:
                            action.execute(context=context)
                        is_success = True
                        break
                    except RetryException as e:
                        logger.error(e)
                        time.sleep(5)
                    except InterruptException as e:
                        # Risk page: back off and retry the same pair.
                        logger.exception(e)
                        time.sleep(10)
                        is_need_retry = True
                        # raise e
                # if is_success:
                #     break
            # --- integrate-list candidate at cursor j --------------------
            if j < len(integrate_infos):
                integrate_info = integrate_infos[j]
                integrate_item_id = integrate_info['itemId']
                integrate_title = integrate_info['title']
                integrate_cate_id = integrate_info['category']
                integrate_price = integrate_info['price']
                # if str(integrate_title).upper().find(
                #         str(model_name).upper()) != -1 and str(cate_id) == str(integrate_cate_id):
                if str(integrate_title).upper().find(
                        str(model_name).upper()) != -1 and Category.check_cate_id(cate_id, integrate_cate_id):
                    actions = self.get_taobao_detail_actions()
                    is_success = True
                    # skuId '-2' marks a price taken from the integrate list.
                    price_info = [{
                        'skuId': '-2',
                        'price': yuan_2_cent(integrate_price)
                    }]
                    good_result.set_price_info(price_info=price_info)
                    good_result.set_flag(str(int(GoodDataType.success)))
                    for action in actions:
                        action.execute(context=context)
                    break
                elif j < 6:
                    # Fallback: fetch the item's detail page over HTTP.
                    actions = self.get_taobao_http_detail_actions()
                    timestamps = int(datetime.now().timestamp() * 1000)
                    integrate_detail_url = SpiderUrls.get_taobao_detail_url(timestamps, '', integrate_item_id)
                    context = Context()
                    # referer = 'https://s.m.taobao.com/h5'
                    # context.attach(Context.KEY_HEADERS, SpiderHttp.get_taobao_headers(referer))
                    context.attach(Context.KEY_GOOD_DICT, good_result)
                    context.attach(Context.KEY_CURRENT_TASK, task)
                    context.attach(Context.KEY_COOKIES, RequestsCookieJar())
                    context.attach(Context.KEY_IS_UPDATE_COOKIES, True)
                    context.attach(Context.KEY_CURRENT_PROXY, proxy)

                    detail_m_url = SpiderUrls.get_detail_m_url(integrate_info['userType'], integrate_item_id)
                    detail_m_http_request = HttpRequest(detail_m_url, method=HttpMethod.GET)
                    context.attach(Context.KEY_DETAIL_M_HTTP_REQUEST, detail_m_http_request)

                    integrate_http_request = HttpRequest(url=integrate_detail_url, method=HttpMethod.GET)
                    context.attach(Context.KEY_TAOBAO_DETAIL_HTTP_REQUEST, integrate_http_request)

                    context.attach(Context.KEY_HEADERS, SpiderHttp.get_taobao_headers(detail_m_url))
                    try:
                        for action in actions:
                            action.execute(context=context)
                        is_success = True
                        break
                    except RetryException as e:
                        logger.exception(e)
                        time.sleep(5)
                    except InterruptException as e:
                        logger.exception(e)
                        time.sleep(10)
                        is_need_retry = True
                        # raise e
            # Only advance the cursors when no retry was requested.
            if not is_need_retry:
                i += 1
                j += 1
        if not is_success:
            # Nothing matched: record the good as not found.
            actions = self.get_taobao_detail_actions()
            good_result.set_flag(str(int(GoodDataType.not_found)))
            for action in actions:
                action.execute(context=context)
Beispiel #14
0
            if i < num:
                future_tasks[self._executor.submit(
                    self._execut_taobao_detail_actions, task, proxy)] = task
            i += 1
        for future in as_completed(future_tasks):
            try:
                proxy = future.result()
                if proxy:
                    self._proxy.remove_proxy(url=proxy['https'])
            except Exception as e:
                logger.error(e)

    def init(self):
        # Delegate to the base class; no job-specific initialisation.
        super().init()

    def init_argparse(self, parser):
        # Delegate to the base class; no extra CLI arguments added.
        super().init_argparse(parser)

    def process(self):
        # return super().process()
        # One scheduling round: run up to 5 tasks, then pause briefly.
        self.execute(5)
        time.sleep(3)


if __name__ == "__main__":
    # Script entry point: run the detail-page job with a pool size of 40.
    s = TaobaoDetailPageJob(40)
    logger.info("start to execute taobao_detail_page job")
    s.run()
    # s.process()
    logger.error("exit taobao_detail_page job")
Beispiel #15
0
# -*- coding: utf-8 -*-
from apscheduler.schedulers.blocking import BlockingScheduler

from config.config_loader import logger
from mall_spider.spiders.actions.action_service import ActionService


class SycmScheduleJob(ActionService, BlockingScheduler):
    """Blocking scheduler that runs the sycm category-init actions on a
    daily cron schedule (every day at 10:30:00)."""

    def __init__(self):
        super().__init__()

    def handle(self):
        # self.execute_sycm_category_job_init_actions()
        # Register the daily cron trigger for the category init actions.
        self.add_job(self.execute_sycm_category_job_init_actions, 'cron', day_of_week='0-6', hour=10, minute=30,
                     second=0)

    def run(self):
        # Register the job, then block on the scheduler loop.
        self.handle()
        self.start()


if __name__ == "__main__":
    s = SycmScheduleJob()
    logger.info("start to execute sycm_schedule job")
    # start() blocks; the lines below only run once the scheduler exits.
    s.run()
    # jobs = s.get_jobs()
    # print(jobs)
    logger.error("exit sycm_schedule job")
Beispiel #16
0
    def execute_in_retry(self, context, http_request, data=None):
        """Execute *http_request* with retries.

        Retries up to ``default_retry`` times, sleeping
        ``default_retry_interval`` between attempts, until the overall
        25s ``timeout`` budget is exhausted or the attempts run out —
        at which point the last exception is re-raised.  Proxy-level
        failures abort immediately as ProxyException; a non-200 status
        raises StatusCodeException.  When KEY_IS_UPDATE_COOKIES is set,
        cookies from every response are merged back into the jar.
        """
        method = http_request.method
        is_update_cookies = context.get(Context.KEY_IS_UPDATE_COOKIES, False)
        headers = context.get(Context.KEY_HEADERS, '')
        cookies = context.get(Context.KEY_COOKIES, RequestsCookieJar())
        start_time = time.time()
        retry = int(default_retry)
        retry_interval = float(default_retry_interval)
        timeout = 25.0
        connect_time_out = int(default_connect_timeout)

        proxies = context.get(Context.KEY_CURRENT_PROXY, '')
        account = context.get(Context.KEY_CURRENT_TASK_ACCOUNT, {})
        while retry > 0:
            retry = retry - 1
            response = None
            try:
                if proxies:
                    logger.info('context key:[%s],proxy inject,[%s]->[%s]',
                                context.context_key, account, proxies)
                if method == HttpMethod.GET:
                    # NOTE(review): GET passes no timeouts, unlike POST --
                    # confirm this asymmetry is intentional.
                    response = get(url=http_request.url,
                                   params=None,
                                   headers=headers,
                                   cookies=cookies,
                                   proxies=proxies)
                elif method == HttpMethod.POST:
                    response = post(url=http_request.url,
                                    data=data,
                                    headers=headers,
                                    cookies=cookies,
                                    proxies=proxies,
                                    connect_timeout=connect_time_out,
                                    timeout=timeout)
                logger.debug(u'context key:[%s],action:[%s] execute result:%s',
                             context.context_key, self.__class__.__name__,
                             response.text)
                if response.status_code != 200:
                    raise StatusCodeException(response.status_code)
                return response
            except (ProxyError, ConnectTimeoutError) as e:
                # Proxy-level failures are never retried here.
                logger.error('proxy error,[%s]->[%s],exp:%s', account, proxies,
                             e)
                raise ProxyException(e)
            except ReadTimeout:
                if time.time() - start_time > timeout or retry == 0:
                    raise
                logger.error(
                    u'context key:[%s],action:[%s] execute read time out,exception:%s',
                    context.context_key, self.__class__.__name__,
                    traceback.format_exc())
            except ConnectTimeout:
                if time.time() - start_time > timeout or retry == 0:
                    raise
                logger.error(
                    u'context key:[%s],action:[%s] execute connect time out,exception:%s',
                    context.context_key, self.__class__.__name__,
                    traceback.format_exc())
            except Exception:
                if time.time() - start_time > timeout or retry == 0:
                    raise
                logger.error(
                    u'context key:[%s],action:[%s] execute error,exception:%s',
                    context.context_key, self.__class__.__name__,
                    traceback.format_exc())
            finally:
                # Fold any Set-Cookie from this attempt back into the jar.
                if is_update_cookies and response:
                    cookies.update(response.cookies)

            time.sleep(retry_interval)
Beispiel #17
0
    def _risk(self, stream_risk_dao, account):
        """Record *account* as a taobao-search risk unless already stored."""
        username = account['username']
        existing = stream_risk_dao.query_one(_filter=[
            CmmSysStreamRisk.type == int(RiskType.taobao_search),
            CmmSysStreamRisk.raw_data == username
        ])
        if existing:
            return
        entity = CmmSysStreamRisk()
        entity.raw_data = username
        entity.type = int(RiskType.taobao_search)
        stream_risk_dao.insert_entity(entity=entity)

    def init(self):
        # Delegate to the base class; no job-specific initialisation.
        super().init()

    def init_argparse(self, parser):
        # Delegate to the base class; no extra CLI arguments added.
        super().init_argparse(parser)

    def process(self):
        # return super().process()
        # One round: dispatch up to 2 tasks, then back off.
        self.execute(2)
        time.sleep(10)


if __name__ == "__main__":
    # Script entry point: run the list-page job with a pool size of 10.
    s = TaobaoListPageJob(10)
    logger.info("start to execute taobao_list_page job")
    s.run()
    # s.process()
    logger.error("exit taobao_list_page job")
Beispiel #18
0
        self._proxy_service = get_proxy_service()

    def execute(self):
        """Re-login every risk-flagged account (up to ``account_num``)
        and delete its risk record once the login succeeds."""
        with write_session_scope() as session:
            _stream_risk_dao = get_stream_risk_dao(session=session)
            rsts = _stream_risk_dao.base_query.limit(self.account_num).all()
            for item in rsts or []:
                username = item.raw_data
                account = global_config.s_accounts_dict[username]
                proxy = self._proxy_service.get_origin_static_proxy(account['username'])
                self._login(account=account, force=True, risk=True, proxy=proxy)
                # Clear the risk flag and commit per account.
                _stream_risk_dao.delete(_filter=[CmmSysStreamRisk.id == item.id])
                session.commit()

    def init(self):
        # Delegate to the base class; no job-specific initialisation.
        super().init()

    def init_argparse(self, parser):
        # Delegate to the base class; no extra CLI arguments added.
        super().init_argparse(parser)

    def process(self):
        # Single pass: re-login all currently risk-flagged accounts.
        self.execute()


if __name__ == "__main__":
    # Script entry point: run one login round for a single account.
    s = LoginJob(1)
    logger.info("start to execute login job")
    s.process()
    logger.error("exit login job")
Beispiel #19
0
        tasks = self.execute_sycm_product_actions()
        logger.info("start to execute sycm tasks,tasks length:%s", len(tasks))
        i = 0
        for task in tasks:
            if i < num:
                self._execute_sycm_product_actions(task)
                # self._executor.submit(self._execute_sycm_product_actions, task)
                i += 1
                time.sleep(15)

    def init(self):
        # Delegate to the base class; no job-specific initialisation.
        super().init()

    def init_argparse(self, parser):
        # Delegate to the base class; no extra CLI arguments added.
        super().init_argparse(parser)

    def process(self):
        # return super().process()
        # One round: dispatch up to 10 tasks, then back off.
        self.execute(10)
        time.sleep(10)

    # def run(self):
    #     self.execute(self, maxInt)


if __name__ == "__main__":
    # Script entry point: run the sycm job for a single account.
    s = SycmJob(1)
    logger.info("start to execute sycm job")
    s.run()
    logger.error("exit sycm job")