def get_time_desc(t):
    """
    Build a textual description of a duration.

    The duration is split into whole hours, whole minutes and the remaining
    seconds. The hour and minute parts are emitted only when they are at
    least 1; the seconds part is emitted whenever it is non-negative. The
    parts are concatenated without a separator.

    :param t: duration in seconds (number)
    :return: description string using the labels '小时', '分' and '秒'
    """
    hours = int(t / 3600)
    minutes = int((t - hours * 3600) / 60)
    # The remainder goes through util.number_format with 3 as second argument
    # (presumably 3 decimal places -- confirm against util).
    seconds = util.number_format(t - hours * 3600 - minutes * 60, 3)

    parts = []
    if hours >= 1:
        parts.append('%s 小时' % hours)
    if minutes >= 1:
        parts.append('%s 分' % minutes)
    if seconds >= 0:
        parts.append('%s 秒' % seconds)
    return ''.join(parts)
# Example #2
# 0
def fetch_search_data(keyword=None,
                      id=None,
                      data_dict=None,
                      headers=None,
                      proxy=None,
                      **kwargs):
    """Fetch one search-result page and record what it yields in ``data_dict``.

    Product detail links found on the page are appended to
    ``data_dict['url']``; follow-up result pages and failed requests that
    should be retried are appended to ``data_dict['list']``.

    :param keyword: search keyword. When falsy, ``kwargs['url']`` is fetched
        instead.
    :param id: opaque identifier copied into every record that is appended.
    :param data_dict: dict holding the lists ``'list'`` and ``'url'``. When
        ``None`` a throw-away dict is used, so only the return code is
        meaningful.
    :param headers: optional dict of extra request headers, merged over
        ``default_headers`` after passing through ``util.rfc_headers``.
    :param proxy: optional pair ``(n, addresses)``; one of the first ``n``
        entries of ``addresses`` is picked at random. Ignored when
        ``kwargs['proxies']`` is given.
    :param kwargs: ``url`` (page to fetch when no keyword), ``proxies``
        (ready-made proxies mapping for ``requests``), ``count`` (copied into
        retry records, default 1), ``max_list_num`` (maximum number of
        follow-up pages to queue, default 5; 0 disables the limit).
    :return: ``200`` on success; ``404`` when there is nothing to fetch, the
        product is invalid or the page has no product rows; ``-400`` when the
        request or ``do_search`` fails; ``-405`` on any other non-200
        response; ``-404`` when a detail link does not match
        ``goods_sn_pattern``.
    """
    if keyword:
        print('正在获取 richardsonrfpd 中关键词:%s 的相关数据' % keyword)
        url = 'http://www.richardsonrfpd.com/Pages/AdvanceSearch.aspx'
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404

    # The signature advertises data_dict=None; without this guard the first
    # append below would raise TypeError instead of returning a status code.
    if data_dict is None:
        data_dict = {'list': [], 'url': []}

    def _queue_status(status):
        # Record this URL together with the status it ended in.
        data_dict['list'].append({
            'status': status,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })

    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))

    proxies = kwargs.get('proxies')
    try:
        # Proxy selection stays inside the try so that a malformed ``proxy``
        # argument is reported as -400 like any other request failure.
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        response = requests.get(url,
                                headers=_headers,
                                timeout=30,
                                proxies=proxies)
        resp = do_search(response, keyword)
        # do_search signals failure by returning an int instead of a response.
        if isinstance(resp, int):
            raise ValueError
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:数据请求异常, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        # An invalid URL can never succeed, so it is not recorded for retry.
        if 'Invalid URL' not in str(e):
            _queue_status(-400)
        return -400

    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:无效产品; URL: %s' % url)
            return 404
        # Log the proxy that was actually used, whichever argument supplied
        # it, and tolerate a proxies mapping without an 'http' entry.
        logger.debug('STATUS:-405 ; INFO:请求错误,网页响应码 %s ; PROXY:%s ; URL:%s' %
                     (resp.status_code, (proxies or {}).get('http', ''), url))
        _queue_status(-405)
        return -405

    resp.encoding = 'utf-8'
    if 'Search-Results.aspx' in resp.url:
        # NOTE(review): this result is overwritten by the xpath query just
        # below, so it currently has no effect on the outcome. Kept unchanged
        # because the intended behaviour cannot be determined from here.
        product_list = analyse_product_url(resp)
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    product_list = root.xpath('//tr[@valign="top"][@height=85]')
    if not product_list:
        _queue_status(404)
        return 404

    for product in product_list:
        detail = product.xpath('.//a[@class="lnk12b-blackOff"]')
        detail_url = util.urljoin(
            resp.url, detail[0].xpath('./@href')[0]) if detail else ''
        match = goods_sn_pattern.search(detail_url)
        if not match and detail_url:
            logger.debug(u"无法匹配链接中的goods_sn URL{url}".format(url=detail_url))
            return -404
        # A row without a detail link has an empty URL and therefore no
        # match; fall back to an empty goods_sn instead of raising
        # AttributeError on ``None.group``.
        goods_sn = match.group(1) if match else ''
        goods_name = detail[0].text_content() if detail else ''
        data_dict['url'].append({
            'id': id,
            'url': detail_url,
            'goods_sn': goods_sn,
            'goods_name': goods_name,
        })

    # A "show more" page is already a follow-up request; do not queue again.
    if 'showMore=true' in url:
        return 200

    count_nodes = root.xpath('//td[@class="medtext"]')
    count = util.number_format(count_nodes[0].text, places=0, index=999,
                               smart=True) if count_nodes else 0
    page_num = int(math.ceil(count / 10.0))
    if page_num <= 1:
        return 200

    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    # This response already covers results 1-10, so only page_num - 1 further
    # pages exist. Iterating up to page_num inclusive would queue a range that
    # starts beyond ``count``.
    for x in xrange(1, page_num):
        if max_list_num and x > max_list_num:
            break
        page_url = 'http://shopping.netsuite.com/s.nl/c.402442/sc.2/.f?search={search}&range={start}%2C{end}%2C{total}'.format(
            search=keyword, start=x * 10 + 1, end=(x + 1) * 10, total=count)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200