Beispiel #1
0
def release_proxy(group, task):
    """Handle a proxy release task.

    Decodes the task payload and logs it. When the payload carries a
    non-empty ``proxy`` value, that value is passed to ``redis_client.sadd``
    (wrapped by ``redis_execute``) under ``KEY_NAME``.

    ``group`` is part of the handler signature but is not used here.
    Always returns ``None``.
    """
    payload = TaskProtocal(task).get_data()
    logger.info("release %s" % payload)
    proxy = payload['proxy']
    if not proxy:
        return
    redis_execute(redis_client.sadd)(KEY_NAME, proxy)
Beispiel #2
0
def remove_proxy(group, task):
    """Handle a proxy removal task.

    Decodes the task payload and logs it. When the payload carries a
    non-empty ``proxy`` value, that value is passed to ``redis_client.srem``
    (wrapped by ``redis_execute``) under ``KEY_NAME``.

    Returns the task protocol object routed to ``'output'`` when the removal
    call reports a truthy result; otherwise returns ``None``.
    ``group`` is part of the handler signature but is not used here.
    """
    tp = TaskProtocal(task)
    payload = tp.get_data()
    logger.info("remove %s" % payload)
    proxy = payload['proxy']
    if not proxy:
        return None
    removed = redis_execute(redis_client.srem)(KEY_NAME, proxy)
    if not removed:
        return None
    tp.set_to('output')
    return tp
Beispiel #3
0
async def handle_worker(group, task):
    """Handle a statistics task.

    Reads ``extra.stats`` from the task payload. When its ``(tid, step)``
    pair is a key of ``TID_MAP``, the counter stored in ``data_dct`` under
    the mapped statistic name is incremented by one (starting from zero the
    first time the name is seen). Payloads without ``extra.stats`` and
    unknown ``(tid, step)`` pairs are ignored.

    Always returns ``None``.
    """
    payload = TaskProtocal(task).get_data()
    if 'extra' not in payload or 'stats' not in payload['extra']:
        return
    stats = payload['extra']['stats']
    key = (stats.get('tid'), stats.get('step'))
    if key not in TID_MAP:
        return
    stats_name = TID_MAP[key]
    data_dct[stats_name] = data_dct.get(stats_name, 0) + 1
Beispiel #4
0
async def handle_worker(group, task):
    """Handle an amz_bsr_result task.

    Fills in ``detail_info.cat_1_sales`` and lifts the best-seller metadata
    (``bs_cate`` and ``date``) to the top level of the product record.

    [input] task data format:
        JSON:
            {
                # product info
                "extra": {
                    "bsr": {
                        "bs_cate": [item["cate"]],
                        "date": "xxxx-xx-xx"
                    }
                }
            }
    [output] result data format:
        JSON:
            {
                # product info
                +"bs_cate": ["cate"],
                +"date": "2017-09-10",
                -"extra",
            }

    ``cat_1_sales`` stays ``-1`` unless the record has a category name, a
    category rank and a non-empty parameter table for its platform in
    ``popt_map``; in that case it is ``CURVE_FUNC(rank, *params)``, using the
    category's parameters or the table's ``'default'`` entry.

    When ``extra.bsr`` is missing, ``bs_cate`` is built from the first entry
    of ``detail_info.cat_ls`` (names joined with ``':'``, or an empty string
    when the list is empty), ``date`` is today's local date, and ``extra`` is
    left in place.

    Returns a new ``pipeflow.Task`` routed to ``'output'``.
    """
    info = TaskProtocal(task).get_data()
    popt_dct = popt_map.get(info['platform'], {})
    detail = info['detail_info']

    raw_name = detail['cat_1_name']
    cat_name = raw_name.strip().lower() if raw_name else ''
    raw_rank = detail['cat_1_rank']
    cat_rank = -1 if raw_rank is None else raw_rank

    detail['cat_1_sales'] = -1
    if cat_name and cat_rank != -1 and popt_dct:
        popt = popt_dct.get(cat_name, popt_dct['default'])
        detail['cat_1_sales'] = CURVE_FUNC(cat_rank, *popt)

    bsr = (info.get('extra') or {}).get('bsr')
    if bsr:
        info['bs_cate'] = bsr['bs_cate']
        info['date'] = bsr['date']
        del info['extra']
    else:
        cat_ls = detail['cat_ls']
        cate = ':'.join(cat_ls[0]['name_ls']) if cat_ls else ''
        info['bs_cate'] = [cate]
        info['date'] = time.strftime("%Y-%m-%d", time.localtime())

    res = pipeflow.Task(json.dumps(info).encode('utf-8'))
    res.set_to('output')
    return res
Beispiel #5
0
async def handle_worker(group, task):
    """Handle a callback task.

    When the payload contains ``extra.cb``, the whole payload is dumped to
    JSON, UTF-8 encoded, zlib-compressed and POSTed to ``extra.cb.url``.
    A response status other than 200 is logged as an error, and so is any
    exception raised while building or sending the request; neither is
    propagated. Payloads without ``extra.cb`` are ignored.

    Always returns ``None``.
    """
    payload = TaskProtocal(task).get_data()
    if 'extra' not in payload or 'cb' not in payload['extra']:
        return
    url = payload['extra']['cb'].get('url')
    async with aiohttp.ClientSession(conn_timeout=7) as session:
        try:
            body = zlib.compress(json.dumps(payload).encode('utf-8'))
            async with session.post(url, timeout=TIME_OUT,
                                    data=body) as resp:
                # The body is read but not used; only the status matters.
                await resp.read()
                if resp.status != 200:
                    logger.error('[%d] %s' % (resp.status, url))
        except Exception as exc:
            logger.error('Request page fail : %s' % exc)
Beispiel #6
0
async def handle_worker(group, task):
    """Route a task to its next flow step and to any forked flows.

    The task's ``tid`` selects a list of steps in
    ``flow_conf[FLOW_TASK_CONF]``. If a step follows the current one, a copy
    of the task (created with ``next_step=True``) is routed to that step's
    ``name``. For every flow id listed under the current step's ``'fork'``
    key, another copy is created with that ``tid`` and routed to the first
    step of the forked flow.

    Returns the list of routed tasks (possibly empty), or ``None`` after
    logging an error when ``tid`` is not configured.
    """
    tp = TaskProtocal(task)
    f = tp.get_from()
    tid = tp.get_tid()
    step = tp.get_step()
    logger.info("ep: %s, tid: %s, step: %s" % (f, tid, step))

    task_conf = flow_conf[FLOW_TASK_CONF]
    if tid not in task_conf:
        logger.error("Task ID [%s] error" % tid)
        return

    routed = []
    task_data = tp.get_data()
    steps = task_conf[tid]
    following = step + 1
    if following < len(steps):
        endpoint_name = steps[following]['name']
        next_tp = tp.new_task(task_data, next_step=True)
        next_tp.set_to(endpoint_name)
        routed.append(next_tp)
    for f_tid in steps[step].get('fork', []):
        endpoint_name = task_conf[f_tid][0]['name']
        fork_tp = tp.new_task(task_data, tid=f_tid)
        fork_tp.set_to(endpoint_name)
        routed.append(fork_tp)
    return routed
Beispiel #7
0
async def handle_task(group, task):
    """Handle an amz_bsr_product task (entry point of a crawl).

    [input] task data format:
        JSON:
            {
                "platform": "amazon_us",
                "root_url": "https://www.amazon.de/gp/bestsellers",
                "category_filter": ["name1", ... ,"namex"],
                "with_qty": True    # optional
            }
    [notify] task data format:
        BYTES:
            b"task done"

    Tasks from ``'input'``:
      * are dropped when their platform is not in ``PLATFORM_FILTER_LS``;
      * are sent to ``'input_back'`` while a crawl is already running;
      * otherwise start a crawl: the ``'input'`` endpoint is suspended, the
        lower-cased category filter is stored in ``filter_ls``, ``root_url``
        is renamed to ``url``, today's local date is added, and the new task
        is routed to ``'inner_output'`` with ``task_count`` set to 1.

    A ``b'task done'`` message from ``'notify'`` while a crawl is running
    clears ``filter_ls``, marks the crawl as finished and resumes
    ``'input'``.

    Returns the task to forward, or ``None`` when there is nothing to emit.
    """
    global filter_ls, task_start, task_count

    source = task.get_from()
    if source == 'input':
        tp = TaskProtocal(task)
        task_dct = tp.get_data()
        if task_dct['platform'] not in PLATFORM_FILTER_LS:
            return
        if task_start:
            tp.set_to('input_back')
            return tp

        group.suspend_endpoint('input')
        task_start = True
        logger.info(task_dct['root_url'])
        filter_ls = [cate.lower() for cate in task_dct['category_filter']]
        # Rename root_url -> url and drop the filter from the payload.
        task_dct['url'] = task_dct.pop('root_url')
        task_dct['date'] = time.strftime("%Y-%m-%d", time.localtime())
        del task_dct['category_filter']
        seed_tp = tp.new_task(task_dct)
        seed_tp.set_to('inner_output')
        task_count = 1
        return seed_tp

    if (source == 'notify' and task_start
            and task.get_data() == b'task done'):
        filter_ls = []
        task_start = False
        group.resume_endpoint('input')
Beispiel #8
0
async def handle_task(group, task):
    """Handle an amz_keyword task (admission control for keyword searches).

    [input] task data format:
        JSON:
            {
                "platform": "amazon_us",
                "keyword": "xx xxx",
                "end_page": 10,
            }
    [notify] task data format:
        BYTES:
            b"task done"

    Tasks from ``'input'`` are sent to ``'input_back'`` once ``task_count``
    has reached ``MAX_WORKERS``. Otherwise the counter is incremented (and
    ``'input'`` suspended when the limit is reached), ``end_page`` defaults
    to 20, ``page`` is set to 1, ``url`` is set to the search index URL, and
    the new task is routed to ``'inner_output'``.

    A ``b'task done'`` message from ``'notify'`` while the counter is
    non-zero decrements it, and resumes ``'input'`` when the counter has just
    dropped below ``MAX_WORKERS``.

    Returns the task to forward, or ``None`` when there is nothing to emit.
    """
    global task_count

    source = task.get_from()
    if source == 'input':
        tp = TaskProtocal(task)
        if task_count >= MAX_WORKERS:
            tp.set_to('input_back')
            return tp

        task_count += 1
        if task_count >= MAX_WORKERS:
            group.suspend_endpoint('input')
        task_dct = tp.get_data()
        platform, keyword = task_dct['platform'], task_dct['keyword']
        logger.info("%s %s" % (platform, keyword))
        if 'end_page' not in task_dct:
            task_dct['end_page'] = 20
        task_dct['page'] = 1
        task_dct['url'] = get_search_index_url(platform, keyword)
        search_tp = tp.new_task(task_dct)
        search_tp.set_to('inner_output')
        return search_tp

    if (source == 'notify' and task_count
            and task.get_data() == b'task done'):
        task_count -= 1
        if task_count == MAX_WORKERS - 1:
            group.resume_endpoint('input')
Beispiel #9
0
async def handle_task(group, task):
    """Handle an amz_qa task (admission control for Q&A crawls).

    [input] task data format:
        JSON:
            {
                "platform": "amazon_us",
                "asin": "xxxx",
                "till": "qa id",
            }
    [notify] task data format:
        BYTES:
            b"task done"

    Tasks from ``'input'`` are sent to ``'input_back'`` once ``task_count``
    has reached ``MAX_WORKERS``. Otherwise the counter is incremented (and
    ``'input'`` suspended when the limit is reached), ``page`` is set to 1,
    and the new task is routed to ``'inner_output'``.

    A ``b'task done'`` message from ``'notify'`` while the counter is
    non-zero decrements it, and resumes ``'input'`` when the counter has just
    dropped below ``MAX_WORKERS``.

    Returns the task to forward, or ``None`` when there is nothing to emit.
    """
    global task_count

    source = task.get_from()
    if source == 'input':
        tp = TaskProtocal(task)
        if task_count >= MAX_WORKERS:
            tp.set_to('input_back')
            return tp

        task_count += 1
        if task_count >= MAX_WORKERS:
            group.suspend_endpoint('input')
        task_dct = tp.get_data()
        summary = (task_dct['platform'], task_dct['asin'],
                   task_dct.get('till', ''))
        logger.info("%s %s %s" % summary)
        task_dct["page"] = 1
        first_page_tp = tp.new_task(task_dct)
        first_page_tp.set_to('inner_output')
        return first_page_tp

    if (source == 'notify' and task_count
            and task.get_data() == b'task done'):
        task_count -= 1
        if task_count == MAX_WORKERS - 1:
            group.resume_endpoint('input')
Beispiel #10
0
async def handle_worker(group, task):
    """Handle an amz_qa task: crawl one Q&A page.

    [input] task data format:
        JSON:
            {
                "platform": "amazon_us",
                "asin": "xxxx",
                "till": "qa id",
                "page": 1,
            }
    [output] result data format:
        JSON:
            {
                "platform": "amazon_us",
                "asin": "xxxx",
                "page": 1,
                "end": true,      # only present on the last page
                "qas": [
                    {
                        'qa_id': 'xdf',
                        'vote': 5,
                        'question': 'qqq',
                        'answer': 'aaa',
                        'author': 'author',
                        'date': '2017-09-09',
                    }
                ]
            }

    Returns, depending on the outcome:
      * ``[ban_task, task]`` when the proxy was banned (the task goes to
        ``'input_back'``, the proxy report to ``'ban'``);
      * the task itself, routed to ``'inner_output'``, after a request or
        captcha error;
      * the ``b'task done'`` notify task when the page cannot be fetched or
        parsed, or is not a Q&A page;
      * otherwise a list holding either the next-page task or the notify
        task, followed by the result task when any Q&A entries were kept.

    If ``till`` matches an entry on the page, that entry and everything after
    it is dropped and no further page is requested.
    """
    tp = TaskProtocal(task)
    task_dct = tp.get_data()
    handle_cls = get_spider_by_platform(task_dct['platform'])
    done_task = pipeflow.Task(b'task done')
    done_task.set_to('notify')
    url = get_url_by_platform(task_dct['platform'], task_dct['asin'],
                              task_dct['page'])
    current_page = task_dct['page']

    with GetPageSession() as sess:
        try:
            html = await sess.get_page('get', url, timeout=60,
                                       captcha_bypass=True)
            parser = etree.HTMLParser(encoding='utf-8')
            handle = handle_cls(etree.HTML(html, parser=parser))
        except BannedError as exc:
            tp.set_to('input_back')
            # Drops the first seven characters of the proxy string --
            # presumably an "http://" prefix; confirm against BannedError.
            ban_tp = tp.new_task({'proxy': exc.proxy[7:]})
            ban_tp.set_to('ban')
            return [ban_tp, tp]
        except (RequestError, CaptchaError):
            tp.set_to('inner_output')
            return tp
        except Exception as exc:
            exc_info = (type(exc), exc, exc.__traceback__)
            taks_info = ' '.join([task_dct['platform'], url])
            logger.error('Get page handle error\n' + taks_info,
                         exc_info=exc_info)
            exc.__traceback__ = None
            return done_task

    # Anything that is not a Q&A page is abandoned.
    if not handle.is_qa_page():
        return done_task

    try:
        next_page, qa_ls = handle.get_info()
    except Exception as exc:
        exc_info = (type(exc), exc, exc.__traceback__)
        taks_info = ' '.join([task_dct['platform'], url])
        logger.error('Get page info error\n' + taks_info, exc_info=exc_info)
        exc.__traceback__ = None
        return done_task

    qa_ids = [item['qa_id'] for item in qa_ls]
    if 'till' in task_dct and task_dct['till'] in qa_ids:
        next_page = None
        qa_ls = qa_ls[:qa_ids.index(task_dct['till'])]

    emitted = []
    if next_page:
        task_dct['page'] = next_page
        follow_tp = tp.new_task(task_dct)
        follow_tp.set_to('inner_output')
        emitted.append(follow_tp)
    else:
        emitted.append(done_task)

    if qa_ls:
        info = {
            'platform': task_dct['platform'],
            'asin': task_dct['asin'],
            'page': current_page,
            'qas': qa_ls,
        }
        if not next_page:
            info['end'] = True
        result_tp = tp.new_task(info)
        result_tp.set_to('output')
        emitted.append(result_tp)
    return emitted
Beispiel #11
0
async def _crawl_bsr_page(tp, task_dct):
    """Fetch one best-seller page and build the tasks it produces.

    Returns a list of follow-up/result tasks, a single task when the page is
    sent back for a retry, or ``None`` when the page is abandoned.
    """
    global task_count

    handle_cls = get_spider_by_platform(task_dct['platform'])
    url = task_dct['url']
    logger.info("%s" % (url, ))
    with GetPageSession() as sess:
        try:
            html = await sess.get_page('get',
                                       url,
                                       timeout=60,
                                       captcha_bypass=True)
            soup = etree.HTML(html,
                              parser=etree.HTMLParser(encoding='utf-8'))
            handle = handle_cls(soup)
        except BannedError as exc:
            # NOTE(review): unlike the retry branch below, this path routes
            # to 'input_back' and does not restore task_count. Behaviour is
            # kept as in the original -- confirm it is intended.
            tp.set_to('input_back')
            # Drops the first seven characters of the proxy string --
            # presumably an "http://" prefix; confirm against BannedError.
            ban_tp = tp.new_task({'proxy': exc.proxy[7:]})
            ban_tp.set_to('ban')
            return [ban_tp, tp]
        except (RequestError, CaptchaError):
            # The page will be retried, so it counts as pending again.
            tp.set_to('inner_output')
            task_count += 1
            return tp
        except Exception as exc:
            exc_info = (type(exc), exc, exc.__traceback__)
            taks_info = ' '.join([task_dct['platform'], url])
            logger.error('Get page handle error\n' + taks_info,
                         exc_info=exc_info)
            exc.__traceback__ = None
            return None

    # Anything that is not a best-seller page is abandoned.
    if not handle.is_bsr_page():
        return None

    # Remember category ids taken from the URL so a category seen before is
    # reported to the spider as already existing.
    is_exist = False
    reg = re.search(r'/(\d+)/ref=', url)
    if reg:
        cate_id = int(reg.group(1))
        if cate_id in category_id_set:
            is_exist = True
        else:
            category_id_set.add(cate_id)
    try:
        url_ls, asin_ls = handle.get_info(filter_ls, is_exist)
    except Exception as exc:
        exc_info = (type(exc), exc, exc.__traceback__)
        taks_info = ' '.join([task_dct['platform'], url])
        logger.error('Get page info error\n' + taks_info,
                     exc_info=exc_info)
        exc.__traceback__ = None
        return None

    # Only emit ASINs that have not been emitted earlier in this crawl.
    asin_ls = [item for item in asin_ls if item['asin'] not in asin_set]
    asin_set.update(item['asin'] for item in asin_ls)

    task_ls = []
    for next_url in url_ls:
        new_tp = tp.new_task({
            'platform': task_dct['platform'],
            'url': next_url,
            'date': task_dct['date'],
            'with_qty': task_dct.get('with_qty', False)
        })
        new_tp.set_to('inner_output')
        task_ls.append(new_tp)
    task_count += len(url_ls)
    for item in asin_ls:
        new_tp = tp.new_task({
            'platform': task_dct['platform'],
            'asin': item['asin'],
            'with_qty': task_dct.get('with_qty', False),
            'extra': {
                'bsr': {
                    'bs_cate': [item['cate']],
                    'date': task_dct['date']
                }
            }
        })
        new_tp.set_to('output')
        task_ls.append(new_tp)
    return task_ls


async def handle_worker(group, task):
    """Handle an amz_bsr_product task: crawl one best-seller page.

    [inner_input] task data format:
        JSON:
            {
                "platform": "amazon_us",
                "url": "https://www.amazon.de/gp/bestsellers",
                "date": "2017-08-08",
                "with_qty": True    # optional
            }
    [output] result data format:
        JSON:
            {
                "platform": "amazon_us",
                "asin": "xxxx",
                "with_qty": True,   # optional
                "extra": {
                    "bsr": {
                        "bs_cate": [item["cate"]],
                        "date": "xxxx-xx-xx"
                    }
                }
            }

    ``task_count`` tracks the pages still pending: it is decremented for the
    page being handled and incremented for every page queued (or re-queued
    after a request/captcha error).

    When ``group.get_running_cnt() == 1`` and no page is pending after this
    one, the crawl is treated as finished: the de-duplication sets are reset
    and a ``b'task done'`` task routed to ``'notify'`` is emitted *together
    with* this page's tasks. (The previous implementation returned the notify
    task from a ``finally`` clause, which replaced the page's result tasks
    and silently swallowed any in-flight exception.)

    Returns a list of tasks, a single task, or ``None`` when there is nothing
    to emit.
    """
    global task_count
    global category_id_set
    global asin_set

    def crawl_finished():
        return group.get_running_cnt() == 1 and task_count == 0

    tp = TaskProtocal(task)
    task_dct = tp.get_data()
    task_count -= 1
    try:
        result = await _crawl_bsr_page(tp, task_dct)
    except Exception as exc:
        if not crawl_finished():
            raise
        # The crawl still has to be closed with a notify task, so the error
        # is logged here instead of being propagated (or silently lost).
        logger.error('Handle bsr page error',
                     exc_info=(type(exc), exc, exc.__traceback__))
        exc.__traceback__ = None
        result = None

    if not crawl_finished():
        return result

    category_id_set = set()
    asin_set = set()
    done_task = pipeflow.Task(b'task done')
    done_task.set_to('notify')
    if result is None:
        return done_task
    if isinstance(result, list):
        return result + [done_task]
    return [result, done_task]
Beispiel #12
0
async def handle_worker(group, task):
    """Handle an amz_review task: crawl one review page.

    [input] task data format:
        JSON:
            {
                "platform": "amazon_us",
                "asin": "xxxx",
                "till": "review id",
                "url": "xxxx",
            }
    [output] result data format:
        JSON:
            {
                "platform": "amazon_us",
                "asin": "xxxx",
                "page": 1,
                "end": true,      # only present on the last page
                "reviews": [
                    {
                        "review_id": "xdf",
                        "rating": 4.0,
                        "title": "title",
                        "content": "content",
                        "author": "author",
                        "author_id": "author_id",
                        "date": "2017-09-09",
                        "verified_purchase": False,
                        "imgs": [],
                    }
                ]
            }

    Returns, depending on the outcome:
      * ``[ban_task, task]`` when the proxy was banned;
      * the task itself, routed to ``'inner_output'``, after a request or
        captcha error;
      * the ``b'task done'`` notify task when the page cannot be fetched or
        parsed, or is not a review page;
      * a single re-queued task when the current page URL lacks one of the
        ``reviewerType`` / ``pageSize`` / ``sortBy`` query parameters;
      * otherwise a list holding either the next-page task or the notify
        task, followed by the result task when any reviews were kept.

    If ``till`` matches a review on the page, that review and everything
    after it is dropped and no further page is requested.
    """
    tp = TaskProtocal(task)
    task_dct = tp.get_data()
    handle_cls = get_spider_by_platform(task_dct['platform'])
    done_task = pipeflow.Task(b'task done')
    done_task.set_to('notify')
    # Fall back to the platform's default review URL when none is supplied.
    url = task_dct['url'] or get_url_by_platform(task_dct['platform'],
                                                 task_dct['asin'])

    with GetPageSession() as sess:
        try:
            html = await sess.get_page('get', url, timeout=60,
                                       captcha_bypass=True)
            parser = etree.HTMLParser(encoding='utf-8')
            handle = handle_cls(etree.HTML(html, parser=parser))
        except BannedError as exc:
            tp.set_to('input_back')
            # Drops the first seven characters of the proxy string --
            # presumably an "http://" prefix; confirm against BannedError.
            ban_tp = tp.new_task({'proxy': exc.proxy[7:]})
            ban_tp.set_to('ban')
            return [ban_tp, tp]
        except (RequestError, CaptchaError):
            tp.set_to('inner_output')
            return tp
        except Exception as exc:
            exc_info = (type(exc), exc, exc.__traceback__)
            taks_info = ' '.join([task_dct['platform'], url])
            logger.error('Get page handle error\n'+taks_info,
                         exc_info=exc_info)
            exc.__traceback__ = None
            return done_task

    # Anything that is not a review page is abandoned.
    if not handle.is_review_page():
        return done_task

    try:
        page_info, review_ls = handle.get_info()
    except Exception as exc:
        exc_info = (type(exc), exc, exc.__traceback__)
        taks_info = ' '.join([task_dct['platform'], url])
        logger.error('Get page info error\n'+taks_info, exc_info=exc_info)
        exc.__traceback__ = None
        return done_task

    # Per the original note this covers redirected responses: if the page
    # actually served lacks any of the expected query parameters, rebuild the
    # URL from its path and crawl it again.
    cur_page_url = page_info['cur_page_url']
    if cur_page_url:
        pr = parse.urlparse(cur_page_url)
        query_dct = dict(parse.parse_qsl(pr.query))
        required = ('reviewerType', 'pageSize', 'sortBy')
        if any(key not in query_dct for key in required):
            task_dct['url'] = get_url_by_platform(task_dct['platform'],
                                                  task_dct['asin'], pr.path)
            retry_tp = tp.new_task(task_dct)
            retry_tp.set_to('inner_output')
            return retry_tp

    next_url = page_info['next_page_url']
    if next_url:
        next_url = formalize_url(task_dct['platform'], next_url)
    review_ids = [item['review_id'] for item in review_ls]
    if 'till' in task_dct and task_dct['till'] in review_ids:
        next_url = None
        review_ls = review_ls[:review_ids.index(task_dct['till'])]

    emitted = []
    if next_url:
        task_dct['url'] = next_url
        follow_tp = tp.new_task(task_dct)
        follow_tp.set_to('inner_output')
        emitted.append(follow_tp)
    else:
        emitted.append(done_task)

    if review_ls:
        # Reviews without their own asin inherit the task's asin.
        for item in review_ls:
            if not item['asin']:
                item['asin'] = task_dct['asin']
        info = {
            'platform': task_dct['platform'],
            'asin': task_dct['asin'],
            'page': page_info['cur_page'],
            'reviews': review_ls,
        }
        if not next_url:
            info['end'] = True
        result_tp = tp.new_task(info)
        result_tp.set_to('output')
        emitted.append(result_tp)
    return emitted
Beispiel #13
0
async def handle_worker(group, task):
    """Handle an amz_keyword task: crawl one search result page.

    [input] task data format:
        JSON:
            {
                "platform": "amazon_us",
                "keyword": "xx xxx",
                "end_page": 10,
                "page": 1,
                "url": "xxxx",
            }
    [output] result data format:
        JSON:
            {
                "platform": "amazon_us",
                "keyword": "xx xxx",
                "page": 1,
                "end": true,      # only set on the last page
                "status": 0,      # only set on the last page (1 on error)
                "products": [
                    {'is_sponsored': 1, 'rank': 1, 'asin': 'B073S6F9JQ'}
                ],
                "count": 10,
                "category": ['xxx1','xx2'],
                "department": "xxxxx",
            }

    Returns, depending on the outcome:
      * the ``b'task done'`` notify task when ``page`` is already past
        ``end_page``, the page is not a search page, or extracting its data
        fails;
      * ``[ban_task, task]`` when the proxy was banned;
      * the task itself, routed to ``'inner_output'``, after a request or
        captcha error;
      * ``[notify_task, error_result]`` when fetching or parsing the page
        fails for any other reason (``status`` 1);
      * otherwise a list holding either the next-page task or the notify
        task, followed by the result task for this page.
    """
    tp = TaskProtocal(task)
    task_dct = tp.get_data()
    done_task = pipeflow.Task(b'task done')
    done_task.set_to('notify')
    if task_dct['page'] > task_dct['end_page']:
        return done_task
    handle_cls = get_spider_by_platform(task_dct['platform'])

    with GetPageSession() as sess:
        try:
            html = await sess.get_page('get', task_dct['url'], timeout=60,
                                       captcha_bypass=True)
            parser = etree.HTMLParser(encoding='utf-8')
            handle = handle_cls(etree.HTML(html, parser=parser))
        except BannedError as exc:
            tp.set_to('input_back')
            # Drops the first seven characters of the proxy string --
            # presumably an "http://" prefix; confirm against BannedError.
            ban_tp = tp.new_task({'proxy': exc.proxy[7:]})
            ban_tp.set_to('ban')
            return [ban_tp, tp]
        except (RequestError, CaptchaError):
            tp.set_to('inner_output')
            return tp
        except Exception as exc:
            exc_info = (type(exc), exc, exc.__traceback__)
            taks_info = ' '.join([task_dct['platform'], task_dct['url']])
            logger.error('Get page handle error\n' + taks_info,
                         exc_info=exc_info)
            exc.__traceback__ = None

            # Close the search and report the failure to the consumer.
            failed_tp = tp.new_task({
                'platform': task_dct['platform'],
                'keyword': task_dct['keyword'],
                'page': task_dct['page'],
                'end': True,
                'status': 1,
                'message': 'Get Page handle error'
            })
            failed_tp.set_to('output')
            return [done_task, failed_tp]

    if not handle.is_search_page():
        return done_task

    try:
        next_url = handle.get_next_url()
        asin_ls = handle.get_asins()
        result_dct = handle.get_search_result()
        department = handle.get_nav_search()
    except Exception as exc:
        exc_info = (type(exc), exc, exc.__traceback__)
        taks_info = ' '.join([task_dct['platform'], task_dct['url']])
        logger.error('Get page info error\n' + taks_info, exc_info=exc_info)
        exc.__traceback__ = None
        return done_task

    if next_url is not None:
        next_url = formalize_url(task_dct['platform'], next_url)

    # Built before task_dct['page'] is advanced, so it describes this page.
    info = {
        'platform': task_dct['platform'],
        'keyword': task_dct['keyword'],
        'page': task_dct['page'],
        'products': asin_ls,
        'count': result_dct['count'],
        'category': result_dct['category'],
        'department': department,
    }

    emitted = []
    next_page = task_dct['page'] + 1
    if next_url and next_page <= task_dct['end_page']:
        task_dct['page'] = next_page
        task_dct['url'] = next_url
        follow_tp = tp.new_task(task_dct)
        follow_tp.set_to('inner_output')
        emitted.append(follow_tp)
    else:
        info['end'] = True
        info['status'] = 0
        emitted.append(done_task)

    result_tp = tp.new_task(info)
    result_tp.set_to('output')
    emitted.append(result_tp)
    return emitted
Beispiel #14
0
async def handle_worker(group, task):
    """Handle an amz_product task: crawl one product page.

    [input] task data format:
        JSON:
            {
                "platform": "amazon_us",
                "asin": "B02KDI8NID8",
                "with_qty": True,
            }
    [output] result data format:
        JSON:
            {
                'asin': 'B02KDI8NID8',
                'platform': 'amazon_us',
                'parent_asin': 'B02KDI8NID8',
                'title': 'Active Wow Teeth Whitening Charcoal Powder Natural',
                'brand': 'Active Wow',
                'price': 24.79,
                'discount': 0.83,
                'merchant_id': 'A3RJPJ9XCKYOM5',
                'merchant': 'MarketWeb',
                'description': [],
                'category': [],
                "product_info": {
                    "product_dimensions": "2 x 2 x 2 inches ; 0.6 ounces",
                    "shipping_weight": "3.2 ounces ()",
                    "date_first_available": null
                },
                'detail_info': {
                    'cat_1_rank': 5,
                    'cat_1_name': 'Beauty & Personal Care',
                    "cat_ls": [{"rank": 4,
                                "name_ls": ["Health & Household",
                                            "Oral Care",
                                            "Teeth Whitening"]}],
                },
                'relative_info': {
                    'bought_together': [],
                    'also_bought': [],
                    'also_viewed': [],
                    'viewed_also_bought': [],
                    'sponsored_1': [],
                    'sponsored_2': [],
                    'compare_to_similar': [],
                },
                'sku_info': [],
                'fba': 1,
                'review': 4.6,
                'review_count': 9812,
                "review_statistics": {
                    "1": 6, "2": 2, "3": 3, "4": 9, "5": 80
                },
                'img': 'https://images-na.ssl-images-amazon.com/images/I/514RSPIJMKL.jpg',
                'imgs': [],
                'qty': 123,   # or None
            }

    With ``with_qty`` set, and when the page yields an offer listing id, a
    ue id and a session id, two extra POST requests are made in the same
    session and the JSON answer of the second (an add-to-cart request for
    999 units) supplies ``cartQuantity``, which is reported as ``qty``.
    Otherwise ``qty`` is ``None``.

    Returns ``[ban_task, task]`` when the proxy was banned, the task itself
    (routed back to the endpoint it came from) after a request or captcha
    error, the result task routed to ``'output'`` on success, and ``None``
    when the page cannot be fetched or parsed or is not a product page.
    """
    tp = TaskProtocal(task)
    from_end = tp.get_from()
    task_dct = tp.get_data()
    platform, asin = task_dct['platform'], task_dct['asin']
    logger.info("%s %s %s" % (platform, asin,
                              task_dct.get('with_qty', False)))

    handle_cls = get_spider_by_platform(platform)
    url = get_url_by_platform(platform, asin)
    qty_info = {}
    with GetPageSession() as sess:
        try:
            html = await sess.get_page('get', url, timeout=60,
                                       captcha_bypass=True)
            parser = etree.HTMLParser(encoding='utf-8')
            handle = handle_cls(etree.HTML(html, parser=parser))

            if task_dct.get('with_qty'):
                offer_listing_id = handle.get_offer_listing_id()
                ue_id = handle.get_ue_id()
                session_id = handle.get_session_id()
                domain = get_domain_by_platform(platform)
                if offer_listing_id and ue_id and session_id:
                    # Both POSTs send the same header set; each call gets
                    # its own copy of the dict.
                    ajax_headers = {
                        'Referer': url,
                        "X-Requested-With": "XMLHttpRequest",
                        "Content-Type": "application/x-www-form-urlencoded"
                    }

                    # Step 1 (original note: "get ubid-main cookie").
                    coupon_cookies = {
                        'csm-hit': 's-{ue_id:s}|{time:d}'.format(
                            ue_id=ue_id, time=int(time.time() * 1000))
                    }
                    await sess.get_page(
                        'post',
                        COLLECT_COUPON.format(domain=domain),
                        data={"pageReImpType": "aplImpressionPC"},
                        headers=dict(ajax_headers),
                        cookies=coupon_cookies,
                        timeout=30)

                    # Step 2 (original note: "get qty"): add 999 units to
                    # the cart and read the quantity from the answer.
                    cart_url = ADD_TO_CART.format(domain=domain)
                    cart_data = ADD_TO_CART_DATA.format(
                        asin=asin,
                        session_id=session_id,
                        offer_listing_id=offer_listing_id,
                        qty=999)
                    cart_cookies = {
                        'csm-hit': '{ue_id:s}+s-{ue_id:s}|{time:d}'.format(
                            ue_id=ue_id, time=int(time.time() * 1000))
                    }
                    ret = await sess.get_page('post',
                                              cart_url,
                                              data=cart_data,
                                              headers=dict(ajax_headers),
                                              cookies=cart_cookies,
                                              timeout=30)
                    qty_info = json.loads(ret.decode('utf-8'))
        except BannedError as exc:
            tp.set_to(from_end)
            # Drops the first seven characters of the proxy string --
            # presumably an "http://" prefix; confirm against BannedError.
            ban_tp = tp.new_task({'proxy': exc.proxy[7:]})
            ban_tp.set_to('ban')
            return [ban_tp, tp]
        except (RequestError, CaptchaError):
            tp.set_to(from_end)
            return tp
        except Exception as exc:
            exc_info = (type(exc), exc, exc.__traceback__)
            taks_info = ' '.join([platform, asin])
            logger.error('Get page handle error\n' + taks_info,
                         exc_info=exc_info)
            exc.__traceback__ = None
            return

    if not handle.is_product_page():
        return

    try:
        info = handle.get_info()
        cart_qty = qty_info.get('cartQuantity')
        if cart_qty:
            info['qty'] = int(cart_qty)
        else:
            info['qty'] = None
        # extra info
        info['asin'] = asin
        info['platform'] = platform
        result_tp = tp.new_task(info)
        result_tp.set_to('output')
        return result_tp
    except Exception as exc:
        exc_info = (type(exc), exc, exc.__traceback__)
        taks_info = ' '.join([platform, asin])
        logger.error('Get page info error\n' + taks_info, exc_info=exc_info)
        exc.__traceback__ = None
        return
Beispiel #15
0
async def handle_worker(group, task):
    """Handle an amz_product task: crawl one product page.

    [input] task data format:
        JSON:
            {
                "platform": "amazon_us",
                "asin": "B02KDI8NID8"
            }
    [output] result data format:
        JSON:
            {
                'asin': 'B02KDI8NID8',
                'platform': 'amazon_us',
                'title': 'Active Wow Teeth Whitening Charcoal Powder Natural',
                'brand': 'Active Wow',
                'price': 24.79,
                'discount': 0.83,
                'merchant_id': 'A3RJPJ9XCKYOM5',
                'merchant': 'MarketWeb',
                'detail_info': {
                    'cat_1_rank': 5,
                    'cat_1_name': 'Beauty & Personal Care'
                },
                'relative_info': {
                    'bought_together': [],
                    'also_bought': [],
                },
                'fba': 1,
                'review': 4.6,
                'review_count': 9812,
                'img': 'https://images-na.ssl-images-amazon.com/images/I/514RSPIJMKL.jpg'
            }

    A truthy ``extra`` value on the input is copied onto the result.

    After a request or captcha error the task is returned for a retry:
    tasks from ``'routine_input'`` go to ``'routine_input_back'``, tasks from
    ``'input'`` go to ``'input_back'``, and tasks from any other endpoint are
    returned without re-routing. On success the result task is routed to
    ``'output'``. ``None`` is returned when the page cannot be fetched or
    parsed or is not a product page.
    """
    tp = TaskProtocal(task)
    from_end = tp.get_from()
    task_dct = tp.get_data()
    platform, asin = task_dct['platform'], task_dct['asin']
    logger.info("%s %s" % (platform, asin))

    handle_cls = get_spider_by_platform(platform)
    url = get_url_by_platform(platform, asin)
    try:
        soup = await get_page(url, timeout=70)
        handle = handle_cls(soup)
    except (RequestError, CaptchaError):
        back_endpoints = {
            'routine_input': 'routine_input_back',
            'input': 'input_back',
        }
        if from_end in back_endpoints:
            tp.set_to(back_endpoints[from_end])
        return tp.to_task()
    except Exception as exc:
        exc_info = (type(exc), exc, exc.__traceback__)
        taks_info = ' '.join([platform, asin])
        logger.error('Get page handle error\n'+taks_info, exc_info=exc_info)
        exc.__traceback__ = None
        return

    if not handle.is_product_page():
        return

    try:
        info = handle.get_info()
        # extra info
        info['asin'] = asin
        info['platform'] = platform
        extra = task_dct.get('extra')
        if extra:
            info['extra'] = extra
        result_tp = tp.new_task(info)
        result_tp.set_to('output')
        return result_tp.to_task()
    except Exception as exc:
        exc_info = (type(exc), exc, exc.__traceback__)
        taks_info = ' '.join([platform, asin])
        logger.error('Get page info error\n'+taks_info, exc_info=exc_info)
        exc.__traceback__ = None
        return