# Assumed module-level imports; the surrounding module is expected to also
# provide `default_headers`, `logger`, `util`, `goods_sn_pattern`, `do_search`,
# `analyse_product_url` and the `_parse_*` helpers referenced below.
import copy
import json
import math
import random
import re
import urlparse

import lxml.html
import requests


def fetch_data(url, proxy=None, headers=None, **kwargs):
    """Fetch page data.

    @param proxy proxy IPs as [proxy_count, proxy_list]
    @param headers extra header fields, e.g. user_agent
    @param kwargs extended options, e.g. fetch_update marks an update fetch
    @return a negative int on fetch failure, a dict on success
    """
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            # Pick one proxy at random from the pool.
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        # The caller will retry, so this failure is safe to ignore.
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s'
                     % (util.traceback_info(e), url))
        return -400
    # Force utf-8
    resp.encoding = 'utf-8'
    if '404.html' in resp.url:
        return 404
    return _parse_detail_data(resp, headers=_headers, **kwargs)
def fetch_data(url, proxy=None, headers=None, **kwargs):
    """Fetch page data (www.ti.com variant).

    @param url page URL
    @param proxy proxy IPs as [proxy_count, proxy_list]
    @param headers extra header fields, e.g. user_agent
    @param kwargs extended options, e.g. fetch_update marks an update fetch
    @return a negative int on fetch failure, a dict on success
    """
    if 'goods_sn' in kwargs:
        del kwargs['goods_sn']
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    if url[0:2] == '//':
        url = 'http:' + url
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}
        ti_domain = urlparse.urlsplit(url)[1]
        if 'www.ti.com.cn' == ti_domain:
            # Map the Chinese site onto the equivalent www.ti.com product page.
            product_path_pattern = re.compile(r'/cn/(.*)', re.IGNORECASE)
            product_path = product_path_pattern.search(url)
            if product_path:
                url = 'http://www.ti.com/product/{path}'.format(
                    path=product_path.group(1))
        elif 'store.ti.com' in ti_domain:
            kwargs['proxies'] = proxies
            return _parse_store_ti_com(url, **kwargs)
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        # The caller will retry, so this failure is safe to ignore.
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s'
                     % (util.traceback_info(e), url))
        return -400
    # TODO: decide whether an explicit 500 check is needed here
    # Force utf-8
    resp.encoding = 'utf-8'
    if '404.html' in resp.url:
        return 404
    if '/tool/' in resp.url:
        return _parse_tool_detail(resp, **kwargs)
    kwargs['proxies'] = proxies
    return _parse_detail_data(resp, headers=_headers, **kwargs)
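# A minimal usage sketch for fetch_data, assuming the proxy-pool layout the
# functions above index into: proxy[0] is the pool size, proxy[1] the address
# list. The addresses and default URL are placeholders, not real values.
def _demo_fetch(url='http://www.ti.com/product/NE555'):
    proxy_pool = [2, ['10.0.0.1:8080', '10.0.0.2:8080']]
    result = fetch_data(url, proxy=proxy_pool)
    if isinstance(result, int):
        # Negative codes (-400, ...) and 404 mean the fetch failed; callers
        # usually requeue the URL and retry.
        print 'fetch failed with status %d' % result
    else:
        # On success the parsed detail dict from _parse_detail_data comes back.
        print result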
def fetch_search_data(keyword=None, id=None, data_dict=None, headers=None,
                      proxy=None, **kwargs):
    """Fetch search results (richardsonrfpd.com)."""
    if keyword:
        print 'Fetching data for keyword %s from richardsonrfpd' % keyword
        url = 'http://www.richardsonrfpd.com/Pages/AdvanceSearch.aspx'
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        response = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
        resp = do_search(response, keyword)
        if isinstance(resp, int):
            raise ValueError
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s'
                     % (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s'
                     % (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # Parse resp: collect the search hits and the total hit count.
    if 'Search-Results.aspx' in resp.url:
        # analyse_product_url handles result URLs; its return value is
        # superseded by the xpath scan below.
        product_list = analyse_product_url(resp)
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    product_list = root.xpath('//tr[@valign="top"][@height=85]')
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for product in product_list:
        detail = product.xpath('.//a[@class="lnk12b-blackOff"]')
        detail_url = util.urljoin(resp.url, detail[0].xpath('./@href')[0]) if detail else ''
        match = goods_sn_pattern.search(detail_url)
        if not match and detail_url:
            logger.debug('failed to match goods_sn in URL %s' % detail_url)
            return -404
        # Guard: match is None when detail_url is empty.
        goods_sn = match.group(1) if match else ''
        goods_name = detail[0].text_content() if detail else ''
        data_dict['url'].append({
            'id': id,
            'url': detail_url,
            'goods_sn': goods_sn,
            'goods_name': goods_name,
        })
    if 'showMore=true' in url:
        return 200
    count = root.xpath('//td[@class="medtext"]')
    count = util.number_format(count[0].text, places=0, index=999, smart=True) if count else 0
    page_num = int(math.ceil(count / 10.0))
    if page_num <= 1:
        return 200
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    page_list = root.xpath('//td[@class="medtext"]/a/@href')  # collected but currently unused
    for x in xrange(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        # Queue the remaining result pages (10 hits per page).
        page_url = 'http://shopping.netsuite.com/s.nl/c.402442/sc.2/.f?search={search}&range={start}%2C{end}%2C{total}'.format(
            search=keyword, start=x * 10 + 1, end=(x + 1) * 10, total=count)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
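# A sketch of the accumulator contract the fetch_search_data variants share
# (inferred from the appends above, not a documented API): data_dict['url']
# collects detail-page tasks keyed by goods_sn, while data_dict['list']
# collects follow-up page URLs plus failed requests (negative 'status').
# The keyword is a placeholder.
def _demo_search(keyword='RF amplifier'):
    data_dict = {'url': [], 'list': []}
    if fetch_search_data(keyword=keyword, id=1, data_dict=data_dict) == 200:
        for task in data_dict['url']:
            print task['goods_sn'], task['url']
    return data_dict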
def fetch_search_data(keyword=None, id=None, data_dict=None, headers=None,
                      proxy=None, **kwargs):
    """Fetch search results (www.ti.com)."""
    if keyword:
        if not kwargs.get('other_usage', False):
            print 'Fetching data for keyword %s from ti.com' % keyword
        url = 'http://www.ti.com/sitesearch/docs/partnumsearch.tsp?sort=asc&linkId=2&filter=p&sortBy=pstatus&searchTerm=%s' % keyword
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s'
                     % (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s'
                     % (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    resp_json = {}
    try:
        resp_json = json.loads(resp.content)
        product = resp_json.get('response', {}).get('searchResults', {}).get('PartNoArray', [])
    except Exception:
        product = []
        logger.debug('STATUS:-404 ; INFO:malformed response data ; URL:%s' % url)
    if len(product) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for vo in product:
        pn = vo.get('PartNumber', '')
        tn = vo.get('PartType', '')
        if pn:
            # Tools live under /tool/, everything else under /product/.
            link = 'http://www.ti.com/product/%s' % pn
            if 'tool' in tn:
                link = 'http://www.ti.com/tool/%s' % pn
            data_dict['url'].append({'id': id, 'url': link, 'goods_sn': pn})
    if 'startNum=' in resp.url:
        return 200
    try:
        count = resp_json.get('response', {}).get('searchResults', {}).get('filter', {}).get('MaxRecordCount', '')
        count = util.intval(count)
    except Exception:
        count = 0
    page_num = int(math.ceil(count / 25.0))
    if page_num <= 1:
        return 200
    # Queue the remaining result pages (25 hits per page, startNum offsets).
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in xrange(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = 'http://www.ti.com/sitesearch/docs/partnumsearch.tsp?sort=asc&linkId=2&startNum=%d&filter=p&sortBy=pstatus&searchTerm=%s' % (
            25 * x, keyword)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
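# A hedged sketch of a retry driver for the queue the search fetchers fill.
# The give-up threshold and dispatch policy are assumptions for illustration;
# they are not part of this module.
def _drain_queue(data_dict, proxy=None, max_retries=3):
    while data_dict['list']:
        task = data_dict['list'].pop(0)
        if task.get('status', 0) < 0 and task.get('count', 1) >= max_retries:
            continue  # give up on URLs that keep failing
        fetch_search_data(id=task['id'], data_dict=data_dict, proxy=proxy,
                          url=task['url'], count=task.get('count', 1) + 1)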
def fetch_search_data(keyword=None, id=None, data_dict=None, headers=None,
                      proxy=None, **kwargs):
    """Fetch search results (avnet.com)."""
    if keyword:
        print 'Fetching data for keyword %s from avnet' % keyword
        url = ('https://www.avnet.com/search/resources/store/715839038/productview/bySearchTerm/select'
               '?searchType=102&profileName=Avn_findProductsBySearchTermCatNav_Ajax&searchSource=Q'
               '&landingPage=true&storeId=715839038&catalogId=10001&langId=-1&currency=USD&orgEntityId=-2000'
               '&responseFormat=json&pageSize=20&pageNumber=1'
               '&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}^499999.0+inStock:%22true%22^9000.0'
               '+topSellerFlag:%22Yes%22^0.085+newProductFlag:%22Yes%22^0.080+packageTypeCode:%22BKN%22^0.075'
               '&_wcf.search.internal.filterquery=-newProductFlag%3ANPI'
               '&q={keyword}&intentSearchTerm={keyword}&searchTerm={keyword}&wt=json').format(keyword=keyword)
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s'
                     % (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s'
                     % (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # Parse resp: the endpoint returns JSON with the hit count and entries.
    search_dict = {}
    try:
        search_dict = json.loads(resp.text.encode('utf-8'))
        product_list = search_dict.get('catalogEntryView', [])
    except Exception:
        product_list = []
        logger.debug('STATUS:-404 ; INFO:malformed response data ; URL:%s' % url)
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for product in product_list:
        goods_sn = product.get('seo_token_ntk', '')
        base_url = 'https://www.avnet.com/shop/apac/'
        product_url = product.get('avn_pdp_seo_path', '')
        data_dict['url'].append({
            'id': id,
            'url': util.urljoin(base_url, product_url),
            'goods_sn': goods_sn
        })
    if 'showMore=true' in url:
        return 200
    count = search_dict.get('recordSetTotal', 0)
    page_num = int(math.ceil(count / 20.0))
    if page_num <= 1:
        return 200
    # Queue the remaining result pages (20 hits per page).
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in xrange(2, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = ('https://www.avnet.com/search/resources/store/715839038/productview/bySearchTerm/select'
                    '?searchType=102&profileName=Avn_findProductsBySearchTermCatNav_More_Ajax&searchSource=Q'
                    '&landingPage=true&storeId=715839038&catalogId=10001&langId=-1&currency=USD&orgEntityId=-2000'
                    '&responseFormat=json&pageSize=20&pageNumber={next_page}'
                    '&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}^499999.0+inStock:%22true%22^9000.0'
                    '+topSellerFlag:%22Yes%22^0.085+newProductFlag:%22Yes%22^0.080+packageTypeCode:%22BKN%22^0.075'
                    '&_wcf.search.internal.filterquery=-newProductFlag:NPI'
                    '&q={keyword}&intentSearchTerm={keyword}&searchTerm={keyword}&showMore=true&wt=json').format(
            next_page=x, keyword=keyword)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
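# End-to-end sketch under the same assumptions: run one keyword search, then
# fetch every collected detail page. The keyword and proxy pool are
# placeholders, and fetch_data here refers to the last definition above.
if __name__ == '__main__':
    pool = [1, ['10.0.0.1:8080']]
    results = {'url': [], 'list': []}
    if fetch_search_data(keyword='LM358', id=0, data_dict=results, proxy=pool) == 200:
        for task in results['url']:
            detail = fetch_data(task['url'], proxy=pool)
            if not isinstance(detail, int):
                print detail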