def extract(self):
    """Build a ``ContentItem`` from the blog page held in ``self.html``.

    HTML comments are stripped from ``self.html`` first.  The article body is
    taken from ``div#blog_article_content``.  In the stored content every
    literal ``%`` is doubled and the ``src`` of every ``<img>`` is replaced by
    ``%s``; the original image URLs are returned separately in
    ``image_urls``.  (Presumably a later stage %-formats the content with the
    final image locations -- confirm against the consumer.)

    Side effects: sets ``self.html``, ``self.title``, ``self.author``,
    ``self.content`` and ``self.release_time``.

    Returns:
        The populated ``ContentItem``.
    """
    # DOTALL so that comments spanning several lines are removed as well.
    self.html = re.sub("<!--.*?-->", "", self.html, flags=re.DOTALL)
    doc = PyQuery(self.html)
    content_node = doc("div#blog_article_content")

    # Escape literal percent signs before the "%s" placeholders are added.
    content = re.sub("%", "%%", content_node.outerHtml())
    content_doc = PyQuery(content)
    content_doc("img").attr("src", "%s")

    item = ContentItem()
    item["title"] = self.title = doc("div.blog_main_left_content").find("h3").text()
    item["author"] = self.author = doc("div#common_person_blogtitle")("div#title01")("a").text()
    item["content"] = self.content = content_doc.outerHtml()
    self.release_time = doc("div.blog_main_time").find("p").text().strip()
    item["release_time"] = self.release_time
    item["source"] = u"凤凰网"
    item["pic_url"] = ""
    # Image URLs come from the untouched node, not the placeholder copy.
    item["image_urls"] = [img.get("src") for img in content_node("img")]
    return item
def render(self, edit=False):
    """Render the current layout with each mapped widget appended to its column.

    ``self.widget_map`` maps a column id to the ids of the widgets placed in
    it.  Ids that cannot be looked up on ``self`` (``KeyError``) are skipped.

    Args:
        edit: when true, widgets are wrapped in the markup that also shows
            the widget title; otherwise in the plain "view" wrapper.

    Returns:
        The serialized HTML of the layout including the widgets.
    """
    if edit:
        widget_markup = """ <div id="%(wid)s" class="widget"> <div class="widget-head"><h3>%(title)s</h3></div> <div class="widget-content">%(content)s</div> </div> """
    else:
        widget_markup = """ <div id="%(wid)s" class="view-widget"> %(content)s </div> """

    page = PyQuery(self.current_layout.render())
    for column, widget_ids in self.widget_map.items():
        selector = '#%s' % column
        for widget_id in widget_ids:
            try:
                widget = self[widget_id]
            except KeyError:
                # The map references a widget that is not available; skip it.
                continue
            values = dict(
                col=column,
                wid=widget_id,
                title=widget.title,
                content=widget.render(),
                url=widget.absolute_url(),
            )
            page(selector).append(widget_markup % values)
    return page.outerHtml()
def render(self, edit=False):
    """Return the layout HTML with every available widget inserted.

    For each ``column -> widget ids`` entry of ``self.widget_map`` the widget
    is looked up on ``self``, rendered, wrapped in a ``<div>`` and appended to
    the element whose id equals the column name.  Unknown widget ids
    (``KeyError``) are ignored.

    Args:
        edit: choose the edit-mode wrapper (with a title header) instead of
            the view-mode wrapper.

    Returns:
        The serialized HTML of the populated layout.
    """
    view_markup = """ <div id="%(wid)s" class="view-widget"> %(content)s </div> """
    edit_markup = """ <div id="%(wid)s" class="widget"> <div class="widget-head"><h3>%(title)s</h3></div> <div class="widget-content">%(content)s</div> </div> """
    widget_markup = edit_markup if edit else view_markup

    layout_html = self.current_layout.render()
    document = PyQuery(layout_html)
    for column, widget_ids in self.widget_map.items():
        for widget_id in widget_ids:
            try:
                widget = self[widget_id]
            except KeyError:
                continue
            context = {
                'col': column,
                'wid': widget_id,
                'title': widget.title,
                'content': widget.render(),
                'url': widget.absolute_url(),
            }
            document('#%s' % column).append(widget_markup % context)
    return document.outerHtml()
def test_django_templatevar_conversion():
    """A ``{{ variable }}`` inside an attribute must survive PyQuery + repair."""
    producer = create_htmlproducer()
    original = """<div class="something"> <div class="test"></div> <a href="{{ STATIC_URL }}docson/widget.js">text</a> </div>"""
    # Round-trip through PyQuery, then let the producer repair the template tags.
    serialized = PyQuery(original).outerHtml()
    repaired = producer.repair_django_tags(serialized)
    assert repaired == original
def test_django_templatetag_url_conversion():
    """A ``{% url ... %}`` tag inside an attribute must survive PyQuery + repair."""
    producer = create_htmlproducer()
    original = """<div class="something"> <div class="test"></div> <a href="{% url 'schema' release_name 'release_package' %}">text</a> </div>"""
    # Round-trip through PyQuery, then let the producer repair the template tags.
    serialized = PyQuery(original).outerHtml()
    repaired = producer.repair_django_tags(serialized)
    assert repaired == original
def multi(self, url):
    """Fetch a listing page and return the products found on it.

    Args:
        url: address of the listing page.

    Returns:
        The value of ``tool.return_data``: unsuccessful with an error payload
        when the HTTP status is not 200, otherwise successful with a list of
        dicts holding ``url``, ``img``, ``name`` and ``price`` per product.

    Any exception raised while fetching or parsing propagates unchanged (the
    former ``try/except ...: raise`` wrapper was a no-op).
    """
    resp = self.session.get(url)
    status_code = resp.status_code
    # PyQuery needs non-empty input, hence the placeholder.
    pqhtml = PyQuery(resp.text or 'nothing')

    if status_code != 200:
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_error(code=status_code,
                              message='status_code Error',
                              backUrl=resp.url,
                              html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    plist = [
        {
            'url': product('a:first').attr("href"),
            'img': product('picture img:first').attr("srcset"),
            'name': product('hgroup.look-name').text(),
            'price': product("span.price").text(),
        }
        for product in pqhtml("section article.product-look").items()
    ]

    log_info = json.dumps(
        dict(time=time.time(), count=len(plist),
             title=pqhtml('title').text(), url=url))
    self.logger.info(log_info)
    return tool.return_data(successful=True, data=plist)
def convert_md_to_html(self, mdcontent, outer_menu_html, inner_menu_html):
    """Convert Markdown into the rendered page HTML plus its menu.

    The Markdown is converted with the ``footnotes``, ``sane_lists`` and
    ``toc`` extensions.  The table of contents is extracted, included JSON
    and CSV content is inserted, Django template tags are repaired, and the
    result is rendered through ``main/menu_content.html``.

    Returns:
        A ``(rendered_html, rendered_menu)`` tuple.
    """
    dom = PyQuery(markdown(mdcontent, extensions=['footnotes', 'sane_lists', 'toc']))
    rendered_menu = self.extract_toc_to_html(dom)
    self.insert_included_json(dom)
    self.insert_included_csv(dom)

    body_html = self.repair_django_tags(dom.outerHtml())
    context = {
        'outer_menu': outer_menu_html,
        'inner_menu': inner_menu_html,
        'html_content': body_html,
    }
    return render_to_string('main/menu_content.html', context), rendered_menu
def clean_links(obj, url, html_raw_response=False):
    """
    Rewrite (internal) links to make them absolute.

    1. external links are not changed
    2. prepend URL to links that are just fragments (e.g. #section)
    3. prepend URL (without filename) to internal relative links

    Anchors that carry no ``href`` attribute are left untouched.

    :param obj: HTML (anything ``PQ`` accepts) whose links are rewritten.
    :param url: URL of the document the HTML came from.  When ``None`` no
        link is rewritten and the ``PQ`` object is returned as is,
        regardless of ``html_raw_response``.
    :param html_raw_response: return the serialized HTML instead of the
        ``PQ`` object.
    :returns: the ``PQ`` object, or its outer HTML when
        ``html_raw_response`` is true.
    """
    # TODO: do not depend on PyQuery
    obj = PQ(obj)

    if url is None:
        return obj

    document_url = urlparse(url)

    # Base for relative links: the document URL without its trailing filename.
    # e.g. /en/latest/deep/internal/section/page.html
    #   -> /en/latest/deep/internal/section/
    # so that href="../../another.html" resolves against the directory.
    if document_url.path.endswith('/'):
        directory_url = document_url
    else:
        # rpartition (unlike an rsplit unpack) also copes with an empty path.
        parent, _, _ = document_url.path.rpartition('/')
        directory_url = document_url._replace(path=parent + '/')

    fragment_base = document_url.geturl()
    relative_base = directory_url.geturl()

    for link in obj.find('a'):
        # We need to make all internal links, to be absolute
        href = link.attrib.get('href')
        if href is None:
            # <a name="..."> style anchors have nothing to rewrite.
            continue

        parsed_href = urlparse(href)
        if parsed_href.scheme or parsed_href.path.startswith('/'):
            # don't change external links
            continue

        if not parsed_href.path and parsed_href.fragment:
            # href="#section-link"
            link.attrib['href'] = fragment_base + href
            continue

        # internal relative link
        link.attrib['href'] = relative_base + href

    if html_raw_response:
        return obj.outerHtml()

    return obj
def convert_md_to_html(self, mdcontent, outer_menu_html, inner_menu_html):
    """Render Markdown content into the menu/content template.

    Steps: convert the Markdown (extensions ``footnotes``, ``sane_lists``,
    ``toc``), extract the table of contents, insert included JSON and CSV
    content, repair Django template tags, then render
    ``main/menu_content.html`` with the two menus and the content.

    Returns:
        A ``(rendered_html, rendered_menu)`` tuple.
    """
    converted = markdown(mdcontent, extensions=['footnotes', 'sane_lists', 'toc'])
    document = PyQuery(converted)

    rendered_menu = self.extract_toc_to_html(document)
    self.insert_included_json(document)
    self.insert_included_csv(document)

    repaired = self.repair_django_tags(document.outerHtml())
    rendered_html = render_to_string('main/menu_content.html', {
        'outer_menu': outer_menu_html,
        'inner_menu': inner_menu_html,
        'html_content': repaired,
    })
    return rendered_html, rendered_menu
def sanitise(text, markdown=False):
    """Strip unwanted links, tags and attributes from ``text``.

    ``javascript:`` links are replaced by their text, every tag listed in
    ``UNCLEAN_TAGS`` is removed, and every attribute listed in
    ``UNCLEAN_ATTRS`` is dropped.

    Args:
        text: HTML, or Markdown when ``markdown`` is true.
        markdown: convert the input with ``md`` first and convert the cleaned
            HTML back with ``HTML2Text`` afterwards.

    Returns:
        The cleaned HTML, or the ``HTML2Text`` output when ``markdown`` is true.
    """
    source = md(text) if markdown else text
    tree = PyQuery(source)

    for node in tree.find('a[href^="javascript:"]'):
        anchor = PyQuery(node)
        anchor.replaceWith(anchor.text())

    for tag in UNCLEAN_TAGS:
        tree.find(tag).remove()

    for name in UNCLEAN_ATTRS:
        tree.find('[%s]' % name).removeAttr(name)

    cleaned = tree.outerHtml()
    if not markdown:
        return cleaned
    return HTML2Text().handle(cleaned)
def sanitise(self, text, markdown=True):
    """Remove unwanted links, tags and attributes from ``text``.

    ``javascript:`` links are replaced by their text, tags from
    ``UNCLEAN_TAGS`` are removed, and attributes from ``UNCLEAN_ATTRS`` are
    dropped.

    Args:
        text: Markdown by default, or HTML when ``markdown`` is false.
        markdown: run ``md`` on the input first and ``HTML2Text`` on the
            cleaned HTML afterwards.

    Returns:
        The ``HTML2Text`` output when ``markdown`` is true, else the cleaned HTML.
    """
    html = md(text) if markdown else text
    document = PyQuery(html)

    for element in document.find('a[href^="javascript:"]'):
        link = PyQuery(element)
        link.replaceWith(link.text())

    for tag_name in UNCLEAN_TAGS:
        document.find(tag_name).remove()

    for attr_name in UNCLEAN_ATTRS:
        document.find('[%s]' % attr_name).removeAttr(attr_name)

    result = document.outerHtml()
    if markdown:
        converter = HTML2Text()
        result = converter.handle(result)
    return result
def prepare_html(fileobj):
    """Prepare the HTML for WordPress pages.

    Header links are removed, the first ``div.section h1`` is taken as the
    title and removed from the document, a "Last update" footer with today's
    date is appended to ``div.content``, and internal ``.html`` links are
    rewritten to directory-style URLs.

    Args:
        fileobj: iterable of lines, passed through ``strip_if_not_pre``.

    Returns:
        A ``(html, title)`` tuple; ``title`` is ``""`` when there is no heading.
    """
    pq = PyQuery("".join(strip_if_not_pre(fileobj)))
    pq("a.headerlink").remove()

    # Do we want title at all?
    headings = pq("div.section h1")
    if headings:
        title = headings[0].text
        pq("div.section h1:first").remove()
    else:
        title = ""

    # TODO: insert toc (??)
    out = PyQuery(pq("div.content").outerHtml())
    out.append("<p><small>Last update: %s</small></p>" % (
        datetime.datetime.now().strftime("%Y-%m-%d")))
    out = out.outerHtml()

    # replace .html with / and index.html with simple ./
    # Order matters: the index.html rules must run before the generic ones.
    rewrites = (
        (r'(internal" href=".[^"]*)index\.html"', r'\1"'),
        # Keep the `internal" ` prefix so the class attribute stays quoted.
        (r'internal" href="index\.html"', 'internal" href="./"'),
        # The dot is escaped so only a literal ".html" suffix is rewritten.
        (r'(internal" href="[^"]*)\.html"', r'\1/"'),
        (r'(internal" href="[^"]*)\.html#([^"]*)"', r'\1/#\2"'),
        # NOTE(review): group 1 already ends with "/", so this yields "//#";
        # confirm whether the replacement should be r'\1#\2"'.
        (r'(internal" href="[^"]*/)index/#([^"]*)"', r'\1/#\2"'),
    )
    for pattern, replacement in rewrites:
        out = re.sub(pattern, replacement, out)

    return (out, title)
def prepare_html(fileobj):
    """Prepare the HTML for WordPress pages.

    Header links are removed, the first ``div.section h1`` is taken as the
    title and removed from the document, the text of ``p.meta`` is appended
    as a small footer to ``div.content``, and internal ``.html`` links are
    rewritten to directory-style URLs.

    Args:
        fileobj: iterable of lines, passed through ``strip_if_not_pre``.

    Returns:
        A ``(html, title)`` tuple; ``title`` is ``""`` when there is no heading.
    """
    pq = PyQuery("".join(strip_if_not_pre(fileobj)))
    pq("a.headerlink").remove()

    # Do we want title at all?
    headings = pq("div.section h1")
    if headings:
        title = headings[0].text
        pq("div.section h1:first").remove()
    else:
        title = ""

    # TODO: insert toc (??)
    out = PyQuery(pq("div.content").outerHtml())
    out.append("<p><small>%s</small></p>" % pq("p.meta").text())
    out = out.outerHtml()

    # replace .html with / and index.html with simple ./
    # Order matters: the index.html rules must run before the generic ones.
    rewrites = (
        (r'(internal" href=".[^"]*)index\.html"', r'\1"'),
        # Keep the `internal" ` prefix so the class attribute stays quoted.
        (r'internal" href="index\.html"', 'internal" href="./"'),
        # The dot is escaped so only a literal ".html" suffix is rewritten.
        (r'(internal" href="[^"]*)\.html"', r'\1/"'),
        (r'(internal" href="[^"]*)\.html#([^"]*)"', r'\1/#\2"'),
        # NOTE(review): group 1 already ends with "/", so this yields "//#";
        # confirm whether the replacement should be r'\1#\2"'.
        (r'(internal" href="[^"]*/)index/#([^"]*)"', r'\1/#\2"'),
    )
    for pattern, replacement in rewrites:
        out = re.sub(pattern, replacement, out)

    return (out, title)
def detail(self, url):
    """Scrape one product page and return the normalised product record.

    Args:
        url: address of the product page.

    Returns:
        The value of ``tool.return_data``: unsuccessful with an off-shelf
        payload (HTTP 404, or no orderable variants) or an error payload
        (any other non-200 status); otherwise successful with the ``detail``
        dict built below.

    Exceptions raised while fetching or parsing propagate unchanged (the
    former ``try/except ...: raise`` wrapper was a no-op).
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    # PyQuery needs non-empty input, hence the placeholder.
    pqhtml = PyQuery(resp.text or 'nothing')

    # Off the shelf: the page no longer exists.
    if status_code == 404:
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_off_shelf(code=status_code,
                                  message=self.cfg.SOLD_OUT,
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Any other non-200 status is reported as an error.
    if status_code != 200:
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_error(code=status_code,
                              message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                              backUrl=resp.url,
                              html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Preparation.
    area = pqhtml('#container')
    domain = tool.get_domain(url)
    pdata = self.get_pdata(pqhtml, domain)

    # Off the shelf: nothing can be ordered.
    if not pdata['hasOrderableVariants']:
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_off_shelf(code=status_code,
                                  message=self.cfg.SOLD_OUT,
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    detail = dict()

    # Brand
    brand = area('.product-meta').attr('data-brand')
    detail['brand'] = brand

    # Name
    detail['name'] = area('.product-meta').attr('data-productname')

    # Currency (three-letter code embedded in the page scripts)
    currency = re.search(r's\["currencyCode"\]="(\w{3})";',
                         pqhtml('script').text()).groups()[0]
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price and sizes
    price, sizes = self.get_info(pdata)

    # Price
    detail['price'] = price

    # List price: first number in the price note, else the current price.
    # The pattern needs a capture group; without one ``groups()[0]`` always
    # raised IndexError whenever a price note was present.
    ptxt = area('.pricenotebucket').text()
    price_match = re.search(r'(\d[\d\.]*)', ptxt) if ptxt else None
    listPrice = price_match.group(1) if price_match else price
    detail['listPrice'] = listPrice

    # Color
    status, color, imgs = self.get_color(pdata)
    detail['color'] = color
    detail['colorId'] = dict((key, key) for key in color)

    # Keys
    detail['keys'] = color.keys()

    # Images: either a plain list or a mapping of color id -> image list.
    if isinstance(imgs, list):
        detail['img'] = imgs[0]
    else:
        detail['img'] = dict((cId, imgArr[0]) for cId, imgArr in imgs.items())
    detail['imgs'] = imgs

    # Product ID
    productId = area('.product-meta').attr('data-pid')
    detail['productId'] = productId

    # Sizes
    detail['sizes'] = sizes

    # Description
    detail['descr'] = area('section.product-details .longdescription').text()

    # Details
    detail['detail'] = area('section.product-details').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = status

    # Return URL
    detail['backUrl'] = resp.url

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one product page and return the normalised product record.

    Args:
        url: address of the product page.

    Returns:
        The value of ``tool.return_data``: unsuccessful with an off-shelf
        payload (HTTP 404, or no product data in the page scripts) or an
        error payload (any other non-200 status); otherwise successful with
        the ``detail`` dict built below.

    Exceptions raised while fetching or parsing propagate unchanged (the
    former ``try/except ...: raise`` wrapper was a no-op).
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    # PyQuery needs non-empty input, hence the placeholder.
    pqhtml = PyQuery(resp.text or 'nothing')

    # Off the shelf: the page no longer exists.
    if status_code == 404:
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_off_shelf(code=status_code,
                                  message=self.cfg.SOLD_OUT,
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Any other non-200 status is reported as an error.
    if status_code != 200:
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_error(code=status_code,
                              message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                              backUrl=resp.url,
                              html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Preparation: product data is parsed out of the page scripts.
    Jtxt = pqhtml('script').text()
    pdata = self.get_pdata(Jtxt)
    area = pqhtml('#detail-display-wrapper')

    # Off the shelf: no product data (this branch does not log).
    if not pdata:
        data = tool.get_off_shelf(code=status_code,
                                  message=self.cfg.SOLD_OUT,
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    detail = dict()

    # Name
    detail['name'] = pqhtml('h2.detail-title').text()

    # Brand
    detail['brand'] = self.get_brand(area)

    # Currency
    currency = pqhtml('meta[itemprop="priceCurrency"]:first').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Color
    detail['color'] = pqhtml('ul.detail-additional-info:first>li:last').text()

    # Price
    info = pdata['Products']['Info']
    detail['price'] = info['BasePrice'].replace(',', '')
    # NOTE(review): this value is computed but never used; the list price
    # stored below comes from the page text instead.  Kept so a missing
    # 'OldPrice' still raises as before -- confirm which source is intended.
    listPrice = info['OldPrice'].replace(',', '')
    # The first character is dropped (presumably the currency symbol).
    detail['listPrice'] = (
        pqhtml('span.strokeText>span.price').text()
        or pqhtml('div#detail-display-info-wrapper span.price').text()
    )[1:]

    # Images
    imgsTmp = [
        li.attr('data-zoom')
        for li in pqhtml('div#detail-display-icon ul').children('li').items()
    ]
    detail['img'] = imgsTmp[0]
    detail['imgs'] = imgsTmp

    # Sizes
    detail['sizes'] = self.get_sizes(pdata, area)

    # Description
    detail['descr'] = area('p.detail-description:first').text()

    # Product ID (also used as the color id)
    detail['productId'] = info['ParentProductId']
    detail['colorId'] = info['ParentProductId']

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Return URL
    detail['backUrl'] = url

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one product page and return the normalised product record.

    Args:
        url: address of the product page.

    Returns:
        The value of ``tool.return_data``: unsuccessful with an off-shelf
        payload (HTTP 404, a "sold out" add-button, or ``get_sizes``
        reporting ``'sold out'``) or an error payload (any other non-200
        status); otherwise successful with the ``detail`` dict built below.

    Exceptions raised while fetching or parsing propagate unchanged (the
    former ``try/except ...: raise`` wrapper was a no-op).
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    # PyQuery needs non-empty input, hence the placeholder.
    pqhtml = PyQuery(resp.text or 'nothing')

    # Off the shelf: the page no longer exists.
    if status_code == 404:
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_off_shelf(code=status_code,
                                  message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Any other non-200 status is reported as an error.
    if status_code != 200:
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_error(code=status_code,
                              message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                              backUrl=resp.url,
                              html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Drop scripts and styles before querying the document.
    pqhtml.remove('script').remove('style')

    area = pqhtml('div#product-summary')

    # Off the shelf: the add-to-cart button says the product is sold out.
    buttonTxt = area('#product-form .add-button').text().lower()
    if u'售罄' in buttonTxt or u'sold out' in buttonTxt:
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_off_shelf(code=status_code,
                                  message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    detail = dict()

    # All images
    imgs = self.get_imgs(pqhtml)
    detail['imgs'] = imgs
    detail['img'] = imgs[0]

    # Name
    detail['name'] = area('h1.brand').text() + ' ' + area('.name').text()

    # Currency: first whitespace-separated token of the regular price text.
    currency = area('span.regular-price').text().split()[0]
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price
    price, listPrice = self.get_all_price(area)
    detail['price'] = price
    detail['listPrice'] = listPrice

    color, sizes = self.get_sizes(area)

    # Color
    detail['color'] = color

    # Sizes
    detail['sizes'] = sizes

    # Off the shelf: get_sizes signals this with the string 'sold out'.
    if isinstance(detail['sizes'], basestring) and detail['sizes'] == 'sold out':
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_off_shelf(code=status_code,
                                  message=self.cfg.SOLD_OUT,
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Description
    detail['descr'] = area('div#description').text() or pqhtml(
        '#product-details .product-details-section').text()

    # Brand
    detail['brand'] = area('h1.brand').text()

    # Product ID (also used as the color id)
    prodId = area.attr('data-id')
    detail['productId'] = prodId
    detail['colorId'] = prodId

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Return URL
    detail['backUrl'] = resp.url

    # IP and port the response came from.
    # NOTE(review): ``peer`` is read from a private attribute of the raw
    # response and is not set by the standard library -- presumably patched
    # in elsewhere; confirm.
    if resp.raw._original_response.peer:
        detail['ip_port'] = ':'.join(
            map(str, resp.raw._original_response.peer))

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
class KNMISource(Source):
    """Weather source class.

    Fetches the KNMI daily-data page for station 370 and extracts the
    measurements of one past day from its table.
    """

    __baseUrl = 'http://www.knmi.nl/klimatologie/daggegevens/index.cgi?station=370'
    __d = None      # parsed page (PyQuery) of the last request
    __date = None   # date of the last request
    __day = None    # day offset of the last request
    __url = None    # URL of the last request

    def __init__(self):
        """Constructor"""

    def get_weather(self, day):
        """Return the weather of ``day`` days relative to today.

        Args:
            day: negative day offset (``-1`` is yesterday).

        Returns:
            A dict of measurements, or ``None`` when ``day`` is not negative
            or the page could not be parsed.  In both cases a diagnostic is
            written to stderr.
        """
        if day >= 0:
            sys.stderr.write('`day` must be an integer and less than 0\n')
            return None

        date = datetime.datetime.now() + datetime.timedelta(days=day)
        url = self.__get_url(date)

        self.__date = date
        self.__day = day
        self.__url = url
        self.__d = PyQuery(url=url)

        data = None
        try:
            data = self.__parse()
        except Exception:
            # Best effort: dump the page and the traceback, return None.
            # ``Exception`` (not a bare except) so Ctrl-C and SystemExit
            # are not swallowed.
            html = self.__d.outerHtml() or ''
            # Round-trip through ASCII so the dump is writable on any stderr.
            sys.stderr.write(html.encode('ascii', 'replace').decode('ascii'))
            sys.stderr.write('\n\n')
            traceback.print_exc()
        return data

    def __parse(self):
        """Parse the HTML page into a dict of measurements.

        Values are read by fixed row/column position from the rows matched
        by ``#printable > table > tr``.
        """
        rows = self.__d('#printable > table > tr')
        numeric = self.__numeric

        def cell(row, column):
            # Text of the ``column``-th <td> in the ``row``-th table row.
            return rows.eq(row).find('td').eq(column).text()

        w = {}
        w['url'] = self.__url
        w['date'] = self.__date.strftime("%Y-%m-%d")
        w['url_timestamp'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        w['day'] = self.__day
        w['temperature_average'] = numeric(cell(2, 1))
        w['temperature_maximum'] = numeric(cell(3, 1))
        w['temperature_minimum'] = numeric(cell(4, 1))
        # Leading '<' / '-' markers are stripped before conversion.
        w['rain_amount'] = numeric(cell(2, 6).lstrip('<-'))
        w['rain_duration'] = numeric(cell(3, 6).lstrip('-'))
        w['sunshine_duration'] = numeric(cell(7, 1))
        w['sunshine_relative'] = numeric(cell(8, 1))
        w['sky_coverage'] = numeric(cell(9, 1))
        # The misspelled key is part of the returned schema; do not rename.
        w['sky_visibiliy'] = numeric(cell(11, 1).lstrip('<'))
        w['wind_speed_average'] = numeric(cell(7, 6))
        w['wind_speed_maximum_average'] = numeric(cell(8, 6))
        w['wind_speed_maximum'] = numeric(cell(9, 6))
        w['wind_direction'] = numeric(cell(11, 6))
        w['atmosphere_humidity'] = numeric(cell(14, 1))
        w['atmosphere_pressure'] = numeric(cell(14, 6))
        return w

    def __get_url(self, date):
        """Get the remote URL for fetching the weather"""
        params = {
            'year': date.year,
            'month': date.month,
            'day': date.day,
        }
        url = self.__baseUrl
        # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
        for key, value in params.items():
            url += '&' + key + '=' + str(value)
        return url

    def __numeric(self, x):
        """Convert cell text to a number.

        Trailing ``-`` characters are dropped; empty text yields ``0``; text
        containing a ``.`` becomes a float, anything else an int.
        """
        x = x.rstrip('-')
        if not x:
            return 0
        return float(x) if '.' in x else int(x)
def detail(self, url):
    """Scrape one product page and return the normalised product record.

    Args:
        url: address of the product page.

    Returns:
        The value of ``tool.return_data``: unsuccessful with an off-shelf
        payload (HTTP 404, missing product area, or an "out of stock"
        marker) or an error payload (any other non-200 status); otherwise
        successful with the ``detail`` dict built below.

    Exceptions raised while fetching or parsing propagate unchanged (the
    former ``try/except ...: raise`` wrapper was a no-op).
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    # PyQuery needs non-empty input, hence the placeholder.
    pqhtml = PyQuery(resp.text or 'nothing')

    # Off the shelf: the page no longer exists.
    if status_code == 404:
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_off_shelf(code=status_code,
                                  message=self.cfg.SOLD_OUT,
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Any other non-200 status is reported as an error.
    if status_code != 200:
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_error(code=status_code,
                              message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                              backUrl=resp.url,
                              html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Preparation.
    area = pqhtml('.product-detail')
    detail_tab = pqhtml('#product-detail-tabs')
    img_tab = pqhtml('div.images')
    pdata = self.get_pdata(pqhtml)

    # Off the shelf: no product area, or the page says "out of stock".
    if not area or 'out of stock' in area('.out-of-stock').text():
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_off_shelf(code=status_code,
                                  message=self.cfg.SOLD_OUT,
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    video_prefix = 'http://image1.superdry.com/static/images/products/'
    product = pdata['product']

    detail = dict()

    # Total stock of this product.
    detail['stock'] = product['stock']
    detail['video'] = video_prefix + product['video']
    detail['gender'] = product['gender']
    detail['season'] = product['season']
    detail['category'] = product['category']
    detail['productSku'] = product['sku_code']
    detail['size_guide'] = product['size_guide']
    detail['subcategory'] = product['subcategory']
    detail['productCode'] = product['sku_code']

    # Product ID
    productId = product['id']
    detail['productId'] = productId

    # Brand
    brand = 'SUPERDRY'
    detail['brand'] = brand

    # Name
    detail['name'] = product['name']

    # Currency
    currency = product['currency']
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price
    detail['price'] = product['unit_sale_price']
    detail['listPrice'] = product['unit_price']

    # Description
    detail['descr'] = product['description']

    # Details
    detail['detail'] = detail_tab.text()

    # Returns and exchanges
    # NOTE(review): 'tab-page' is matched as a tag name here; confirm it is
    # not meant to be the class selector '.tab-page'.
    detail['returns'] = detail_tab('tab-page:last').text()

    # Color (the same value doubles as the color id)
    detail['color'] = product['color']
    detail['colorId'] = product['color']

    # Images: thumbnails are switched to their zoom variants.
    imgs = [
        ele.attr('src').replace('/productthumbs/', '/zoom/')
        for ele in img_tab('.scroller li img').items()
    ]
    detail['img'] = img_tab('.scroller li img:first').attr('src').replace(
        '/productthumbs/', '/zoom/')
    detail['imgs'] = imgs

    # Sizes
    detail['sizes'] = self.get_sizes(area)

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Return URL
    detail['backUrl'] = resp.url

    # IP and port the response came from.
    # NOTE(review): ``peer`` is read from a private attribute of the raw
    # response and is not set by the standard library -- presumably patched
    # in elsewhere; confirm.
    if resp.raw._original_response.peer:
        detail['ip_port'] = ':'.join(
            map(str, resp.raw._original_response.peer))

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one Yintai product page and return the normalised record.

    Args:
        url: address of the product page.

    Returns:
        The value of ``tool.return_data``: unsuccessful with an off-shelf
        payload (HTTP 404, or no "buy now" button) or an error payload (any
        other non-200 status); otherwise successful with the ``detail``
        dict built below.

    Raises:
        ValueError: when the product data does not contain exactly one
            product, or when ``self._retry`` has reached 10 and another
            exception occurs.

    When an exception whose text contains ``'get YinTai_TagData Fail'`` is
    raised and fewer than 10 retries were made, the call is retried.
    NOTE(review): ``self._retry`` is never reset in this method, so the
    budget is shared across calls -- confirm that is intended.
    """
    try:
        resp = requests.get(url, verify=False)
        status_code = resp.status_code
        # PyQuery needs non-empty input, hence the placeholder.
        pqhtml = PyQuery(resp.text or 'nothing')

        # Off the shelf: the page no longer exists.
        if status_code == 404:
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Any other non-200 status is reported as an error.
        if status_code != 200:
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Preparation.
        area = pqhtml('#bd .grid')
        pdata = self.get_pdata(pqhtml)

        # Off the shelf: there is no "buy now" button.
        if not len(area('.p-buy #addCart .buynow')):
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # There should be exactly one product.
        if len(pdata['prods']) != 1:
            raise ValueError('yintai product data length great than 1')

        prod = pdata['prods'][0]
        detail = dict()

        # Brand
        brand = area('h4.y-pro-cooper-name').text()
        detail['brand'] = brand

        # Name
        detail['name'] = prod['name']

        # Currency
        currency = 'CNY'
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        price = prod['price']

        # Marked-down products: the price is taken from a second, AJAX-style
        # POST to the same URL.  (Original note: this business logic is to
        # be removed later.)
        if u'直降' in area('#Y_ProBen').text():
            self.session.headers['Referer'] = url
            self.session.headers['X-Requested-With'] = 'XMLHttpRequest'
            self.session.headers['Origin'] = url

            subArea = PyQuery(
                requests.post(url,
                              data=dict(),
                              headers=self.session.headers,
                              cookies=resp.cookies).text)

            price = re.search(
                r'(\d[\d\.]*)',
                subArea('.marketPriceNum .yt-num').text()).groups()[0]
            price = price + subArea('.marketPriceNum .yt-num em').text()

        # Price
        detail['price'] = float(price)
        detail['listPrice'] = prod['mPrice']

        # Color: the plain anchor first, else the selected swatch
        # (fallback added 2016-12-15).
        color = area('.productInfo .s-s-color').next()(
            'a[href="Javascript:void(0);"]').text()
        color = color or area('.productInfo .s-s-color').next()(
            '.selected a').text()
        detail['color'] = color
        detail['colorId'] = prod['colorID']

        # Images
        imgs = self.get_imgs(area)
        detail['img'] = imgs[0]
        detail['imgs'] = imgs

        # Product ID
        productId = prod['sku']
        detail['productId'] = productId

        # Sizes
        detail['sizes'] = self.get_sizes(area)

        # Description
        detail['descr'] = area('.yp-con-desc').text()

        # HTTP status code
        detail['status_code'] = status_code

        # Status
        detail['status'] = self.cfg.STATUS_SALE

        # Return URL
        detail['backUrl'] = resp.url

        # Start the next request with fresh cookies and headers.
        self.session.cookies = RequestsCookieJar()
        self.session.headers = tool.get_one_header()

        log_info = json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url))
        self.logger.info(log_info)

        return tool.return_data(successful=True, data=detail)

    except Exception as e:
        if 'get YinTai_TagData Fail' in str(e) and self._retry < 10:
            self._retry += 1
            return self.detail(url)
        elif self._retry >= 10:
            # The limit is ten; the message used to say "five".
            raise ValueError('yintai retry ten times ,{0}'.format(str(e)))
        else:
            raise
def detail(self, url):
    """Scrape one product page and return the normalised product record.

    Args:
        url: address of the product page.

    Returns:
        The value of ``tool.return_data``: unsuccessful with an off-shelf
        payload (HTTP 404, sold-out message or international shipping
        restriction enabled) or an error payload (other non-200 status, or
        an ``.error_message`` element on the page); otherwise successful
        with the ``detail`` dict built below.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def log_page():
        # Timestamped log record identifying the page by its <title>.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))

    def sold_out():
        # Log the page and build the unsuccessful "off the shelf" result.
        log_page()
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off the shelf: the page no longer exists.
    if status_code == 404:
        return sold_out()

    # Any other non-200 status.
    if status_code != 200:
        log_page()
        payload = tool.get_error(
            code=status_code,
            message='status_code:{0},need 200, message:{1}'.format(
                status_code, self.cfg.GET_ERR.get('SCERR', 'ERROR')),
            backUrl=resp.url,
            html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # The page itself reports an error.
    if len(pqhtml('.error_message')) >= 1:
        log_page()
        payload = tool.get_error(code=status_code,
                                 message=self.cfg.GET_ERR.get('SAKERR', 'ERROR'),
                                 backUrl=resp.url,
                                 html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Preparation.
    area = pqhtml('#pdp-content-area')
    pdata = self.get_pdata(area)

    # Off the shelf: sold out, or not shippable internationally.
    if (pdata['sold_out_message']['enabled']
            or pdata['intl_shipping_restriction']['enabled']):
        return sold_out()

    detail = dict()

    # Brand
    if pdata['brand_name']['enabled']:
        brand = pdata['brand_name']['label']
    else:
        brand = ''
    detail['brand'] = brand

    # Name
    detail['name'] = pdata['short_description']

    # Currency
    currency = pdata['price']['list_price']['local_currency_code']
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price (replaced further down by the per-color prices from get_sizes)
    price, listPrice = self.get_all_price(pdata)
    detail['price'] = price
    detail['listPrice'] = listPrice

    # Color: the color's id must be used here (even though an id of 0 is a
    # pitfall) because the prices below are keyed by color id.
    color = {
        str(entry['id']): entry['label']
        for entry in pdata['colors']['colors']
    }
    colorId = {
        str(entry['id']): str(entry['id'])
        for entry in pdata['colors']['colors']
    }
    detail['color'] = color or self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = colorId or self.cfg.DEFAULT_COLOR_SKU

    # Images: either a plain list or a mapping of color id -> image list.
    imgs = self.get_imgs(pdata, area)
    if isinstance(imgs, list):
        detail['img'] = imgs[0]
    else:
        detail['img'] = dict([(cid, Arr[0]) for cid, Arr in imgs.items()])
    detail['imgs'] = imgs

    # Sizes, including the per-color price / listPrice.
    sprice, slistPrice, sizes = self.get_sizes(pdata)

    # Keys: prefer the size keys, fall back to the color keys.
    if sizes.keys():
        detail['keys'] = sizes.keys()
    elif color:
        detail['keys'] = color.keys()

    # Product ID
    productId = pdata['product_code']
    detail['productId'] = productId

    detail['sizes'] = sizes
    detail['price'] = sprice
    detail['listPrice'] = slistPrice

    # Description (HTML reduced to its text)
    detail['descr'] = PyQuery(pdata['description']).text()

    # Returns and exchanges
    detail['returns'] = pdata['simple_shipping_statement']['message']

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Return URL
    detail['backUrl'] = url

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one COS product page and return the normalised record.

    Side effect: stores ``tool.get_domain(url)`` in ``self.domain``.

    Args:
        url: address of the product page.

    Returns:
        The value of ``tool.return_data``: unsuccessful with an off-shelf
        payload (HTTP 404, missing product container, or a disabled
        add-to-basket button) or an error payload (any other non-200
        status); otherwise successful with the ``detail`` dict built below.

    Exceptions raised while fetching or parsing propagate unchanged (the
    former ``try/except ...: raise`` wrapper was a no-op).
    """
    self.domain = tool.get_domain(url)

    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    # PyQuery needs non-empty input, hence the placeholder.
    pqhtml = PyQuery(resp.text or 'nothing')

    # Off the shelf: the page no longer exists.
    if status_code == 404:
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_off_shelf(code=status_code,
                                  message=self.cfg.SOLD_OUT,
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Any other non-200 status is reported as an error.
    if status_code != 200:
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_error(code=status_code,
                              message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                              backUrl=resp.url,
                              html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Preparation.
    area = pqhtml('#content>#productContainer')
    # NOTE(review): the result is not used below; the call is kept in case
    # get_pdata validates the page or has side effects -- confirm.
    pdata = self.get_pdata(pqhtml)

    # Off the shelf: no product container, or add-to-basket is disabled.
    if not area or area('.productButtons #disabledAddtobasket'):
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = tool.get_off_shelf(code=status_code,
                                  message=self.cfg.SOLD_OUT,
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    detail = dict()

    # Brand
    brand = 'COS'
    detail['brand'] = brand

    # Name
    detail['name'] = area('.productInfo h1:first').text()

    # Currency
    currency = pqhtml('meta[property="og:price:currency"]').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price
    price, listPrice = self.get_all_price(pqhtml, area)
    detail['price'] = price
    detail['listPrice'] = listPrice

    # Color
    color = self.get_color(area)
    detail['color'] = color
    detail['colorId'] = dict((key, key) for key in color)

    # Images: either a plain list or a mapping of color id -> image list.
    imgs = self.get_imgs(area)
    if isinstance(imgs, list):
        detail['img'] = imgs[0]
    else:
        detail['img'] = dict((cid, Arr[0]) for cid, Arr in imgs.items())
    detail['imgs'] = imgs

    # Keys
    detail['keys'] = color.keys()

    # Product ID
    productId = area('input[data-product-identifier!=""]').attr(
        'data-product-identifier')
    detail['productId'] = productId

    # Sizes
    detail['sizes'] = self.get_sizes(area)

    # Description
    detail['descr'] = area('.productInfo>.infowrap>dl>dd:first').text()

    # Returns and exchanges
    # NOTE(review): same selector as the description above -- possibly a
    # copy/paste slip; confirm which <dd> holds the returns text.
    detail['returns'] = area('.productInfo>.infowrap>dl>dd:first').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Return URL
    detail['backUrl'] = url

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Download a product page and turn it into a ``detail`` dict.

    Returns a failed result (via ``tool.return_data(successful=False, ...)``)
    for a 404, for any other non-200 status, or when ``self.get_pdata``
    yields nothing; otherwise returns the successful result carrying the
    extracted fields.

    :param url: product page URL; its fourth ``/``-separated segment is
        passed to ``self.get_imgs`` as the image path.
    """
    response = self.session.get(url, verify=False)
    status_code = response.status_code
    page = PyQuery(response.text or 'nothing')

    def fail(make_payload, message):
        # Shared tail of every failure branch: log the title, build, return.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=page('title').text(), url=url)))
        payload = make_payload(code=status_code, message=message,
                               backUrl=response.url, html=page.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off shelf
    if status_code == 404:
        return fail(tool.get_off_shelf, self.cfg.SOLD_OUT)

    # Other errors
    if status_code != 200:
        return fail(tool.get_error, self.cfg.GET_ERR.get('SCERR', 'ERROR'))

    # Preparation
    pdata = self.get_pdata(page)
    summary = page('.productDetailSummary')
    info = page('#productInfo')
    img_path = url.split('/')[3]

    # Off shelf: no product data could be extracted.
    if not pdata:
        return fail(tool.get_off_shelf, self.cfg.SOLD_OUT)

    detail = {}

    # Name and brand
    detail['name'] = page('.productName').text()
    detail['brand'] = page('.productName a').text()

    # Currency
    currency_code = summary('span[itemprop="priceCurrency"]').text()
    detail['currency'] = currency_code
    detail['currencySymbol'] = tool.get_unit(currency_code)

    # Price
    prices = self.get_all_price(summary)
    detail['price'], detail['listPrice'] = prices

    # Images
    images = self.get_imgs(summary, img_path)
    detail['img'], detail['imgs'] = images

    # Video (only present on some pages)
    if len(summary('.MagicScroll .productVideo')):
        video_link = summary('.MagicScroll a.productVideo')
        detail['video'] = video_link.attr('data-video-url')

    # Colors and sizes
    colors, sizes = self.get_colors_sizes(summary, pdata)
    detail['color'] = colors
    detail['sizes'] = sizes
    detail['keys'] = colors.keys()
    detail['colorId'] = dict((cid, cid) for cid in colors.keys())

    # Product ID
    detail['productId'] = summary('input#baseNo').attr('value')

    # Description
    detail['descr'] = info('#overview').text()

    # Details
    detail['detail'] = info('#specs').text()

    # HTTP status code, sale status, return link
    detail['status_code'] = status_code
    detail['status'] = self.cfg.STATUS_SALE
    detail['backUrl'] = url

    summary_line = dict(time=time.time(),
                        productId=detail['productId'],
                        name=detail['name'],
                        currency=detail['currency'],
                        price=detail['price'],
                        listPrice=detail['listPrice'],
                        url=url)
    self.logger.info(json.dumps(summary_line))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and return its extracted details.

    A 404 yields an "off shelf" result and any other non-200 status an error
    result.  Otherwise the product fields are read from the ``#theater``
    node and from the data returned by ``self.get_pdata``.

    :param url: product page URL.
    :return: whatever ``tool.return_data`` returns.
    :raises ValueError: if the currency symbol reported by
        ``self.get_price_info`` is not ``'$'``.

    Other exceptions raised while fetching or parsing propagate unchanged
    (the original ``try/except Exception, e: raise`` wrapper only re-raised).
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def _reject(builder, message):
        # Log the page title, then wrap the builder's payload as a failure.
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = builder(code=status_code, message=message,
                       backUrl=resp.url, html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Off shelf
    if status_code == 404:
        return _reject(tool.get_off_shelf, self.cfg.SOLD_OUT)

    # Other errors
    if status_code != 200:
        return _reject(tool.get_error, self.cfg.GET_ERR.get('SCERR', 'ERROR'))

    # Preparation
    area = pqhtml('#theater')
    pdata = self.get_pdata(pqhtml('script:gt(20)'))

    detail = dict()

    # Brand
    detail['brand'] = area('.brand').text()

    # Name
    detail['name'] = area('h1:first').text()

    currencySymbol, price, listPrice = self.get_price_info(pdata)
    if currencySymbol != '$':
        raise ValueError('currencySymbol is not USD')

    # Currency
    currency = 'USD'
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price
    detail['price'] = price
    detail['listPrice'] = listPrice

    # Color
    color = self.get_color(pdata)
    detail['color'] = color
    detail['colorId'] = dict((cid, cid) for cid in color.keys())

    # Images
    img, imgs = self.get_imgs(pdata)
    detail['img'] = img
    detail['imgs'] = imgs

    # Product ID
    detail['productId'] = pqhtml('input[name="productId"]').attr('value')

    # Sizes
    sizes = self.get_sizes(pdata)
    detail['sizes'] = sizes

    # Description
    detail['descr'] = area('.description').text()

    # Keys: only ids that have both an image and a size entry.
    detail['keys'] = set(img.keys()) & set(sizes.keys())

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Return link
    detail['backUrl'] = resp.url

    # IP and port of the responding peer.  `peer` is not a standard
    # attribute of the underlying HTTP response (presumably patched in
    # elsewhere -- TODO confirm), so read it defensively instead of letting
    # an AttributeError discard an otherwise complete result.
    original = getattr(resp.raw, '_original_response', None)
    peer = getattr(original, 'peer', None)
    if peer:
        detail['ip_port'] = ':'.join(map(str, peer))

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and return its extracted details.

    A 404 yields an "off shelf" result and any other non-200 status an error
    result.  Otherwise fields are taken from the ``#container`` node and
    from ``pdata['product']`` as returned by ``self.get_pdata``.

    :param url: product page URL.
    :return: whatever ``tool.return_data`` returns.

    Exceptions (for example ``IndexError`` when no image link is found, or
    ``KeyError`` for a missing product field) propagate to the caller; the
    original ``try/except Exception, e: raise`` wrapper only re-raised.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')
    # Computed once; the original called tool.get_domain(url) twice.
    domain = tool.get_domain(url)

    def _reject(builder, message):
        # Log the page title, then wrap the builder's payload as a failure.
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = builder(code=status_code, message=message,
                       backUrl=resp.url, html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Off shelf
    if status_code == 404:
        return _reject(tool.get_off_shelf, self.cfg.SOLD_OUT)

    # Other errors
    if status_code != 200:
        return _reject(tool.get_error, self.cfg.GET_ERR.get('SCERR', 'ERROR'))

    # Preparation
    Jtxt = pqhtml('script').text()
    area = pqhtml('#container')
    pdata = self.get_pdata(Jtxt)

    detail = dict()

    # Images: hrefs are prefixed with the site domain.
    imgsTmp = [
        domain + a.attr('href')
        for a in area('form#addToCart ul.alt_imgs:first>li>a').items()
    ]
    detail['img'] = imgsTmp[0]
    detail['imgs'] = imgsTmp

    product = pdata['product']

    # Name
    detail['name'] = product['name']

    # Brand
    detail['brand'] = area('form#addToCart a#sameBrandProduct').text()

    # Price
    detail['price'] = product['unit_sale_price']
    detail['listPrice'] = product['unit_price']

    # Currency
    currency = product['currency']
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Product ID
    productId = product['id']
    detail['productId'] = productId

    # Color: a single default color keyed by the product id.
    detail['color'] = self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = productId

    # Sizes: a single default size carrying the product stock.
    detail['sizes'] = [
        dict(name=self.cfg.DEFAULT_ONE_SIZE,
             inventory=product['stock'],
             sku=productId)
    ]

    # Description
    detail['descr'] = area('.prod_desc').text() + ' ' + area(
        'div#info_tabs>div.wrap>div#tab1_info').text()

    # Details
    detail['detail'] = area('#tab1_info').text()

    # Brand description
    detail['brandDescr'] = area('#tab2_info').text()

    # Warranty
    detail['note'] = area('#tab5_info').text()

    # Delivery
    detail['delivery'] = area('#shippingData').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Return link
    detail['backUrl'] = url

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and return its extracted details.

    A 404 yields an "off shelf" result and any other non-200 status an error
    result.  Otherwise most fields are read from ``span[itemprop=...]``
    ``content`` attributes, and the rest from the ``div.primary-content``
    node.

    :param url: product page URL.
    :return: whatever ``tool.return_data`` returns.

    Exceptions raised while fetching or parsing propagate unchanged (the
    original ``try/except Exception, e: raise`` wrapper only re-raised).
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def _reject(builder, message):
        # Log the page title, then wrap the builder's payload as a failure.
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = builder(code=status_code, message=message,
                       backUrl=resp.url, html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    def _meta(prop):
        # Value of the `content` attribute of <span itemprop="prop">.
        return pqhtml('span[itemprop="%s"]' % prop).attr('content')

    # Off shelf
    if status_code == 404:
        return _reject(tool.get_off_shelf, self.cfg.SOLD_OUT)

    # Other errors
    if status_code != 200:
        return _reject(tool.get_error, self.cfg.GET_ERR.get('SCERR', 'ERROR'))

    # Preparation
    area = pqhtml('div.primary-content')

    detail = dict()

    # Product ID (also used as SKU and product code)
    productId = _meta('productID')
    detail['productId'] = productId
    detail['productSku'] = productId
    detail['productCode'] = productId

    # Brand
    detail['brand'] = _meta('brand')

    # Name
    detail['name'] = _meta('name')

    # Currency
    currency = _meta('priceCurrency')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price: only the list price from get_all_price is used; the current
    # price comes from the itemprop="price" span.
    _unused_price, listPrice = self.get_all_price(area)
    detail['price'] = _meta('price')
    detail['listPrice'] = listPrice

    # Top-level category and subcategory
    size_chart = area('a[data-bigpopup="sizeChart"]')
    detail['category'] = size_chart.attr('data-category')
    detail['subcategory'] = size_chart.attr('data-sub-category')

    # Description
    detail['descr'] = _meta('description')

    # Details
    detail['detail'] = area('#collapseOne').text()

    # Returns
    detail['returns'] = area('#collapseFive').text()

    # Color
    detail['color'] = _meta('color')
    detail['colorId'] = self.cfg.DEFAULT_COLOR_SKU

    # Images
    imgs = [
        img.attr('src')
        for img in area('.product-image-carousel img.primary-image').items()
    ]
    detail['img'] = _meta('image')
    detail['imgs'] = imgs

    # Sizes
    detail['sizes'] = self.get_sizes(area)

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Return link
    detail['backUrl'] = resp.url

    # IP and port of the responding peer.
    # NOTE(review): `peer` is not a standard attribute of the underlying
    # HTTP response; presumably it is patched in elsewhere -- confirm.
    if resp.raw._original_response.peer:
        detail['ip_port'] = ':'.join(
            map(str, resp.raw._original_response.peer))

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and return its extracted details.

    The page is requested, passed through ``self.resp_verify`` (anti-crawler
    check) and requested again when it contains a JavaScript reload stub.
    If the body is still empty the whole fetch is repeated, up to three
    attempts in total.

    The original retried with ``self.detail(url, verify=False)``; ``detail``
    accepts no ``verify`` argument, so an empty body always raised
    ``TypeError``.  The retry is now a bounded loop.

    A 404 or an "Out of stock" availability text yields an "off shelf"
    result; any other non-200 status yields an error result.

    :param url: product page URL.
    :return: whatever ``tool.return_data`` returns.
    :raises ValueError: if a 200 response still has an empty body after
        all attempts.
    """
    max_attempts = 3
    for _attempt in range(max_attempts):
        # First load.
        resp = self.session.get(url, verify=False)
        # Verify the response (anti-crawler protection).
        resp = self.resp_verify(resp)
        if 'window.location.reload(true);' in resp.text:
            # Second load.
            resp = self.session.get(url, verify=False)
        # Sometimes no content is returned at all; try again.
        if resp.text:
            break

    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def _reject(builder, message):
        # Log the page title, then wrap the builder's payload as a failure.
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = builder(code=status_code, message=message,
                       backUrl=resp.url, html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Off shelf
    if status_code == 404:
        return _reject(tool.get_off_shelf, self.cfg.SOLD_OUT)

    # Other errors
    if status_code != 200:
        return _reject(tool.get_error, self.cfg.GET_ERR.get('SCERR', 'ERROR'))

    if not resp.text:
        raise ValueError(
            'empty response body after %d attempts: %s' % (max_attempts, url))

    # Off shelf: the page itself reports no stock.
    if 'Out of stock' in pqhtml('.product-availability').text():
        return _reject(tool.get_off_shelf, self.cfg.SOLD_OUT)

    # Preparation
    area = pqhtml('div#product-view')
    pdata = self.get_pdata(area)

    detail = dict()

    # Brand is the part of the title before the first '-'.
    title = area('.panel-a h1:first').text()
    detail['brand'] = title.split('-')[0].strip()

    # Name
    detail['name'] = title

    # Currency
    currency = pqhtml('meta[itemprop="priceCurrency"]').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Product ID
    productId = pdata['productId']
    detail['productId'] = productId

    # Price (thousands separators removed)
    detail['price'] = pdata['basePrice'].replace(',', '')
    detail['listPrice'] = pdata['oldPrice'].replace(',', '')

    # Color
    detail['color'] = area('button#product-addtocart-button').attr(
        'data-variant') or self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = productId

    # Images
    imgs = [
        img.attr('data-src')
        for img in area('div#mobile-carousel-images a>img').items()
    ]
    detail['img'] = imgs[0]
    detail['imgs'] = imgs

    # Sizes
    detail['sizes'] = self.get_sizes(area, pdata)

    # Description
    detail['descr'] = area('div.tog-desc').text() + area.parent()(
        '.description-section:first').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Return link
    detail['backUrl'] = resp.url

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
class WeeronlineSource(Source):
    """Weather source that scrapes the weeronline.nl forecast page."""

    # Parsed page; fetched on the first get_weather() call and then reused.
    __d = None
    # Date the current request refers to (now + `day` days).
    __date = None
    # Day offset of the current request.
    __day = None
    __url = 'http://www.weeronline.nl/Europa/Nederland/Eindhoven/4058591'

    def __init__(self):
        """Constructor"""

    def get_weather(self, day):
        """Return the forecast for ``day`` days from now.

        :param day: integer day offset.  Values outside 0..13 are rejected,
            and values above 3 are not supported yet.
        :return: dict of forecast values, or ``None`` when ``day`` is
            rejected or the page could not be parsed.  In both cases a
            diagnostic is written to ``sys.stderr``.
        """
        if day < 0 or day > 13:
            sys.stderr.write('`day` must be an integer between 0 and 13\n')
            return None
        if day > 3:
            sys.stderr.write('`day` >= 4 not yet supported\n')
            return None

        self.__date = datetime.datetime.now() + datetime.timedelta(days=day)
        self.__day = day

        if not self.__d:
            self.__d = PyQuery(url=self.__url)

        data = None
        try:
            data = self.__parse()
        except Exception:
            # Best effort: dump the page and the traceback, return None.
            # `except Exception` (not a bare `except`) lets
            # KeyboardInterrupt and SystemExit propagate.
            html = self.__d.outerHtml().encode('ascii', 'replace')
            # Decode back to text so the write also works where stderr
            # only accepts str.
            sys.stderr.write(html.decode('ascii'))
            sys.stderr.write('\n\n')
            traceback.print_exc()
        return data

    def __parse(self):
        """Parse the HTML page"""
        container = self.__d('.weatherforecast.FiveDays')
        rows = container.find('.row_forecast')
        iconRows = container.find('.row_weathericons')
        ratingRows = container.find('.row_weathernumbers')

        # The requested day's cell is at position day + 1 in each row.
        index = self.__day + 1

        def cell(row_set, row):
            # The <td> for the requested day in the given row.
            return row_set.eq(row).find('td').eq(index)

        def ascii_text(row):
            # Cell text with non-ASCII characters (the degree sign) dropped;
            # decoded back to text so __numeric can use str methods.
            raw = cell(rows, row).text()
            return raw.encode('ascii', 'ignore').decode('ascii')

        w = {}
        w['url'] = self.__url
        w['date'] = self.__date.strftime("%Y-%m-%d")
        w['url_timestamp'] = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        w['day'] = self.__day

        icons = cell(iconRows, 0).find('div')
        for i in range(3):
            w['icon_' + str(i + 1)] = icons.eq(i).attr('class')

        w['temperature_minimum'] = self.__numeric(ascii_text(0))
        w['temperature_maximum'] = self.__numeric(ascii_text(1))

        # Strip the trailing '/'.
        w['wind_force'] = self.__numeric(cell(rows, 2).text().rstrip('/'))

        val = cell(rows, 2).find('.windImageDiv.darkImage > div').attr('class')
        val = val.replace('wind_icon_small_', '').replace('_xs darkImage', '')
        w['wind_direction'] = val

        # Strip the trailing '%'.
        w['rain_percentage'] = self.__numeric(cell(rows, 3).text().rstrip('%'))

        # Strip the trailing 'mm'.
        w['rain_amount'] = self.__numeric(cell(rows, 4).text().rstrip('m'))

        w['rating'] = self.__numeric(cell(ratingRows, 0).text())

        return w

    def __numeric(self, x):
        """Convert a numeric string to ``int`` or ``float``.

        A decimal comma is accepted.  An empty or whitespace-only string
        gives ``0``.
        """
        x = x.replace(',', '.').strip()
        if not x:
            return 0
        return float(x) if '.' in x else int(x)
def detail(self, url):
    """Fetch a product page and return its extracted details.

    A 404, or a stock-status text containing the out-of-stock marker,
    yields an "off shelf" result; any other non-200 status yields an error
    result.  Otherwise fields are read from the
    ``.product-detail-container`` node.

    :param url: product page URL.
    :return: whatever ``tool.return_data`` returns.

    Exceptions raised while fetching or parsing propagate unchanged (the
    original ``try/except Exception, e: raise`` wrapper only re-raised).
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def _reject(builder, message):
        # Log the page title, then wrap the builder's payload as a failure.
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = builder(code=status_code, message=message,
                       backUrl=resp.url, html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Off shelf
    if status_code == 404:
        return _reject(tool.get_off_shelf, self.cfg.SOLD_OUT)

    # Other errors
    if status_code != 200:
        return _reject(tool.get_error, self.cfg.GET_ERR.get('SCERR', 'ERROR'))

    # Preparation
    area = pqhtml('.product-detail-container')
    # Read once; used for the off-shelf check and the inventory below.
    stock_txt = area('#stock-status').text()

    # Off shelf: the stock status says "out of stock".
    if u'缺货' in stock_txt:
        return _reject(tool.get_off_shelf, self.cfg.SOLD_OUT)

    detail = dict()

    # Brand
    detail['brand'] = area('#brand:first span').text() or area(
        '#brand a').text()

    # Name
    detail['name'] = area('#name').text()

    # Currency
    currency = area('#price-currency').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price
    price, listPrice = self.get_all_price(area)
    detail['price'] = price
    detail['listPrice'] = listPrice

    # Color
    detail['color'] = self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = self.cfg.DEFAULT_COLOR_SKU

    # Images: thumbnails first, then the zoom images, then the summary link.
    imgs = [
        a.attr('data-large-img')
        for a in area('.image-container .thumbnail-container img').items()
    ] or [
        img.attr('src') for img in area('#iherb-product-zoom img').items()
    ]
    imgs = imgs or [
        area('#product-image .product-summary-image a').attr('href')
    ]
    detail['img'] = imgs[0]
    detail['imgs'] = imgs

    # Product ID
    productId = area('input[name="pid"]').attr('value')
    detail['productId'] = productId

    # Sizes: one default size; inventory is the last quantity option when
    # the page reports stock, otherwise 0.
    if 'In Stock' in stock_txt or u'有库存' in stock_txt:
        inv = area('#ddlQty option:last').attr('value')
    else:
        inv = 0
    detail['sizes'] = [
        dict(name=self.cfg.DEFAULT_ONE_SIZE,
             inventory=inv,
             id=productId,
             sku=productId)
    ]

    # Description
    detail['descr'] = area('#product-specs-list li').text()

    # Details
    detail['detail'] = pqhtml('div[itemprop="description"]').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Return link
    detail['backUrl'] = url

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and return its extracted details.

    A 404 or a missing ``#goodsInfo`` node yields an "off shelf" result;
    any other non-200 status yields an error result.  Otherwise fields are
    read from the ``#goodsForm`` inputs, the page meta tags and the
    ``#goodsInfo`` node.

    :param url: product page URL.
    :return: whatever ``tool.return_data`` returns.

    Exceptions raised while fetching or parsing propagate unchanged (the
    original ``try/except Exception, e: raise`` wrapper only re-raised).
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def _reject(builder, message):
        # Log the page title, then wrap the builder's payload as a failure.
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = builder(code=status_code, message=message,
                       backUrl=resp.url, html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Off shelf
    if status_code == 404:
        return _reject(tool.get_off_shelf, self.cfg.SOLD_OUT)

    # Other errors
    if status_code != 200:
        return _reject(tool.get_error, self.cfg.GET_ERR.get('SCERR', 'ERROR'))

    # Preparation
    area = pqhtml('#goodsInfo')

    # Off shelf: the product block is missing.
    if not area:
        return _reject(tool.get_off_shelf, self.cfg.SOLD_OUT)

    detail = dict()

    # Product ID
    productId = pqhtml('#goodsForm input#bskGodGodNo').attr('value')
    detail['productId'] = productId
    detail['productSku'] = productId
    detail['productCode'] = area('.prd-code').text()

    # Brand
    brand = pqhtml('#goodsForm input#brndNm').attr('value')
    detail['brand'] = brand

    # Name
    detail['name'] = u'{0} {1}'.format(
        brand, pqhtml('#goodsForm input#godNm').attr('value'))

    # Currency and prices
    currency, price, listPrice = self.get_currency_prices(pqhtml, area)
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)
    detail['price'] = price
    detail['listPrice'] = listPrice

    # Description
    meta_descr = pqhtml('meta[name="description"]').attr('content')
    detail['descr'] = meta_descr

    # Details: meta description followed by the description area.  The
    # meta tag may be absent (attr() then returns None), so guard the
    # concatenation instead of raising TypeError.
    detail['detail'] = (meta_descr or '') + area('.desc-area').text()

    # Color
    # NOTE(review): the result of get_color was never used; the call is
    # kept in case it has side effects or is relied on to raise -- confirm.
    self.get_color(area)
    detail['color'] = self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = self.cfg.DEFAULT_COLOR_SKU

    # Images
    imgs = [
        img.attr('src')
        for img in pqhtml('#prdImgWrap .prdImg ul>li>img').items()
    ]
    detail['img'] = pqhtml(
        'meta[property="og:image"][name="og_image"]').attr('content')
    detail['imgs'] = imgs

    # Sizes
    detail['sizes'] = self.get_sizes(area)

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Return link
    detail['backUrl'] = resp.url

    # IP and port of the responding peer.
    # NOTE(review): `peer` is not a standard attribute of the underlying
    # HTTP response; presumably it is patched in elsewhere -- confirm.
    if resp.raw._original_response.peer:
        detail['ip_port'] = ':'.join(
            map(str, resp.raw._original_response.peer))

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and return its extracted details.

    A 404 yields an "off shelf" result and any other non-200 status an error
    result.  Otherwise fields are read from the
    ``.product-single-section-main`` node and from ``pdata['product']`` as
    returned by ``self.get_pdata(pqhtml('head'))``.

    :param url: product page URL.
    :return: whatever ``tool.return_data`` returns.

    Exceptions (for example ``IndexError`` when no image is found) propagate
    to the caller; the original ``try/except Exception, e: raise`` wrapper
    only re-raised.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def _reject(builder, message):
        # Log the page title, then wrap the builder's payload as a failure.
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = builder(code=status_code, message=message,
                       backUrl=resp.url, html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Off shelf
    if status_code == 404:
        return _reject(tool.get_off_shelf, self.cfg.SOLD_OUT)

    # Other errors
    if status_code != 200:
        return _reject(tool.get_error, self.cfg.GET_ERR.get('SCERR', 'ERROR'))

    # Preparation
    area = pqhtml('.product-single-section-main')
    pdata = self.get_pdata(pqhtml('head'))
    product = pdata['product']

    detail = dict()

    # Brand
    detail['brand'] = product['vendor']

    # Name
    detail['name'] = area('h1[itemprop="name"]').text()

    # Currency
    currency = pqhtml('meta[itemprop="priceCurrency"]').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price
    price, listPrice = self.get_all_price(area)
    detail['price'] = price
    detail['listPrice'] = listPrice

    # Color: a single default color keyed by the product id.
    detail['color'] = self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = product['id']

    # Images: src values get an explicit 'http:' scheme prefix.
    imgs = [
        'http:' + img.attr('src')
        for img in area('.super-slider-main img').items()
    ]
    detail['img'] = imgs[0]
    detail['imgs'] = imgs

    # Product ID
    detail['productId'] = product['id']

    # Sizes
    detail['sizes'] = self.get_sizes(pdata, area)

    # Description
    detail['descr'] = area('.product-single-details-dropdown').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Return link
    detail['backUrl'] = resp.url

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Download a REISS product page and turn it into a ``detail`` dict.

    Returns an "off shelf" failure for a 404, for a page without the
    product grid, or when no size entry with an ``input`` is found.
    Returns an error failure for any other non-200 status.  Otherwise
    returns the successful result carrying the extracted fields.

    :param url: product page URL.
    """
    response = self.session.get(url, verify=False)
    status_code = response.status_code
    page = PyQuery(response.text or 'nothing')

    def fail(make_payload, message):
        # Shared tail of every failure branch: log the title, build, return.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=page('title').text(), url=url)))
        payload = make_payload(code=status_code, message=message,
                               backUrl=response.url, html=page.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off shelf
    if status_code == 404:
        return fail(tool.get_off_shelf, self.cfg.SOLD_OUT)

    if status_code != 200:
        return fail(tool.get_error, 'status_code Error')

    grid = page('.container-full--small-only .grid')
    if not grid:
        return fail(tool.get_off_shelf, self.cfg.SOLD_OUT)

    detail = {}

    # Name: title plus both description fragments, space separated.
    name_parts = (grid('h1.product__title').text(),
                  grid('h2.product__desc').text(),
                  grid('span.product__desc').text())
    detail['name'] = name_parts[0] + ' ' + name_parts[1] + ' ' + name_parts[2]

    # Color
    detail['color'] = grid('span[itemprop="color"]').text()

    # Images: prefer the lazy-load source over the plain src.
    gallery = []
    for node in grid('.product-gallery__imgholder a img').items():
        gallery.append(node.attr('data-lazy') or node.attr('src'))
    detail['img'] = gallery[0]
    detail['imgs'] = gallery

    # Currency
    currency_code = grid('meta[itemprop="priceCurrency"]').attr('content')
    detail['currency'] = currency_code
    detail['currencySymbol'] = tool.get_unit(currency_code)

    # Current price
    current_price = grid('meta[itemprop="price"]').attr('content')
    detail['price'] = current_price

    # Original price: parsed from the "old price" node when present,
    # otherwise the current price.
    old_price_node = grid('span.product__price--old')
    if len(old_price_node):
        matched = re.search(r'(\d[\.\d,]*)', old_price_node.text())
        detail['listPrice'] = matched.groups()[0].replace(',', '')
    else:
        detail['listPrice'] = current_price

    info = grid('#product-info')

    # Description
    detail['descr'] = info('#design').text()

    # Brand
    detail['brand'] = 'REISS'

    # Product ID, also used as the color ID
    product_id = grid('span[itemprop="productID"]').text()
    detail['productId'] = product_id
    detail['colorId'] = product_id

    # Delivery and returns
    detail['delivery'] = info('#delivery').text()
    detail['returns'] = info('#delivery').text()

    # Design
    detail['designer'] = info('#design').text()

    # Size and fit
    detail['sizeFit'] = info('#size').text()

    # Fabric
    detail['fabric'] = info('#care').text()

    # Sizes: entries without an <input> are skipped.
    size_entries = []
    size_nodes = grid(
        'form .product-attributes .product-sizes .product-sizes__item')
    for option in size_nodes.items():
        if not len(option('input')):
            continue
        value = option('input').attr('value')
        if option.attr('class') != 'size_not_available':
            stock = self.cfg.DEFAULT_STOCK_NUMBER
        else:
            stock = 0
        size_entries.append(
            dict(name=option.text(), sku=value, id=value, inventory=stock))
    detail['sizes'] = size_entries

    # No sizes at all: treat as off shelf.
    if not detail['sizes']:
        return fail(tool.get_off_shelf, self.cfg.SOLD_OUT)

    # HTTP status code, sale status, return link
    detail['status_code'] = status_code
    detail['status'] = self.cfg.STATUS_SALE
    detail['backUrl'] = response.url

    summary_line = dict(time=time.time(),
                        productId=detail['productId'],
                        name=detail['name'],
                        currency=detail['currency'],
                        price=detail['price'],
                        listPrice=detail['listPrice'],
                        url=url)
    self.logger.info(json.dumps(summary_line))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and return its extracted details.

    A 404 or a missing ``#itemContent`` node yields an "off shelf" result;
    any other non-200 status yields an error result.  Otherwise fields are
    read from the ``#itemContent`` node, from ``tc_vars[...]`` assignments
    found in the page's script text, and from the JSON assigned to
    ``jsInit.item.colorSizeJson``.

    Changes from the original:

    * the product title regex is gone -- its result was never used, yet a
      missing match aborted the whole method;
    * the script-level brand is looked up only when the DOM brand is empty;
    * the no-op ``try/except Exception, e: raise`` wrapper is removed.

    :param url: product page URL.
    :return: whatever ``tool.return_data`` returns.
    :raises AttributeError: when a required script variable or the
        color/size JSON is not found (``re.search`` returned ``None``).
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def _reject(builder, message):
        # Log the page title, then wrap the builder's payload as a failure.
        log_info = json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url))
        self.logger.info(log_info)
        data = builder(code=status_code, message=message,
                       backUrl=resp.url, html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Off shelf
    if status_code == 404:
        return _reject(tool.get_off_shelf, self.cfg.SOLD_OUT)

    # Other errors
    if status_code != 200:
        return _reject(tool.get_error, self.cfg.GET_ERR.get('SCERR', 'ERROR'))

    # Preparation
    Jtxt = pqhtml('script').text()
    area = pqhtml('#itemContent')

    # Off shelf: the product block is missing.
    if not area:
        return _reject(tool.get_off_shelf, self.cfg.SOLD_OUT)

    def _tc_var(key):
        # Value of the `tc_vars["key"] = "...";` assignment in the scripts.
        pattern = r'tc_vars\["%s"\] =\s*"(.*?)";' % key
        return re.search(pattern, Jtxt, re.DOTALL).groups()[0]

    pdata = json.loads(
        re.search(r'jsInit.item.colorSizeJson =\s*(.*?\}\});\s*', Jtxt,
                  re.DOTALL).groups()[0])
    colors = pdata['Colors']

    detail = dict()

    # Name
    detail['name'] = area('#itemTitle').text()

    # Brand: DOM value first, script variable as fallback.
    detail['brand'] = (area('#itemTitle span[itemprop="brand"]').text()
                       or _tc_var('product_brand'))

    # Currency
    currency = _tc_var('nav_currency')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price
    detail['price'] = _tc_var('product_discountprice')
    detail['listPrice'] = _tc_var('product_price')

    # Images: get_imgs returns either a dict of lists keyed by color id or
    # a plain list (both shapes are handled here).
    imgsTmp = self.get_imgs(Jtxt, area, pdata)
    if isinstance(imgsTmp, dict):
        detail['img'] = dict((cid, imgs[0]) for cid, imgs in imgsTmp.items())
    else:
        detail['img'] = imgsTmp[0]
    detail['imgs'] = imgsTmp

    # Sizes
    detail['sizes'] = self.get_sizes(pdata)

    # Product ID: one entry per color, keyed by its Cod10 code.
    detail['productId'] = dict((c['Cod10'], c['Cod10']) for c in colors)

    # Color
    detail['color'] = dict((c['Cod10'], c['Name']) for c in colors)
    detail['colorId'] = dict((c['Cod10'], c['Cod10']) for c in colors)

    # Description (selector changed 2016-09-25).
    # remove('script') drops embedded <script> nodes before reading the text.
    first_info = area('#item-infos li:first').remove('script').text()
    detail['descr'] = first_info

    # Fabric (selector changed 2016-09-25); same node as the description.
    detail['fabric'] = first_info

    # Returns (selector changed 2016-09-25)
    detail['returns'] = area('#item-infos li:last').remove('script').text()

    # Designer
    detail['designer'] = _tc_var('product_author')

    # Keys
    detail['keys'] = [c['Cod10'] for c in colors]

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Return link
    detail['backUrl'] = url

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and assemble its detail record.

    All product fields are read from the ``.fwd_page .fwd_content`` node,
    except the currency, which comes from the page's
    ``meta[itemprop="priceCurrency"]`` element.

    :param url: address of the product page.
    :return: the value of ``tool.return_data``: ``successful=False`` with
        an off-shelf payload for a 404 or when the first ``.stock_info``
        text contains ``Sold Out``, ``successful=False`` with an error
        payload for any other non-200 status, otherwise
        ``successful=True`` with the detail dict.
    :raises IndexError: when the page has no zoom images.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def log_page():
        # Record the time, page title and url of a rejected request.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))

    def off_shelf():
        # Shared "product taken down" answer.
        log_page()
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Taken down: the page no longer exists.
    if status_code == 404:
        return off_shelf()

    # Any other non-200 status is reported as a fetch error.
    if status_code != 200:
        log_page()
        payload = tool.get_error(
            code=status_code,
            message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
            backUrl=resp.url,
            html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Preparation: the product container.
    area = pqhtml('.fwd_page .fwd_content')

    # Taken down: the stock banner says so.
    stock_text = area('.stock_info:first').text()
    if 'Sold Out' in stock_text:
        return off_shelf()

    detail = dict()

    # Brand: two alternative markups are tried in turn.
    brand = area('.product_info:first .designer_brand:first a:first').text()
    if not brand:
        brand = area(
            '.product_info:first .product-titles__brand a:first').text()
    detail['brand'] = brand

    # Name: brand followed by the product title (h2 first, then h1).
    title = area('.product_info:first h2.product_name:first').text()
    if not title:
        title = area('.product_info:first h1.product_name:first').text()
    detail['name'] = brand + ' ' + title

    # Currency code and its symbol.
    currency = pqhtml('meta[itemprop="priceCurrency"]').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Prices.
    price, listPrice = self.get_all_price(area('.eagle .prices'))
    detail['price'] = price
    detail['listPrice'] = listPrice

    # Product id.
    productId = area('button.addtobag').attr('data-code')
    detail['productId'] = productId

    # Colour: single-colour label first, then the first dropdown option.
    colour = area('.color_dd .one_sizeonly').text()
    if not colour:
        colour = area('.color_dd option:first').text()
    detail['color'] = colour
    detail['colorId'] = productId

    # Images.
    zoom_nodes = area('.cycle-slideshow .product-detail-image-zoom img')
    imgs = [node.attr('data-zoom-image') for node in zoom_nodes.items()]
    detail['img'] = imgs[0]
    detail['imgs'] = imgs

    # Sizes.
    detail['sizes'] = self.get_sizes(area)

    # Description, brand description and returns policy.
    detail['descr'] = area('#details').text()
    detail['brandDescr'] = area('#aboutdesigner').text()
    detail['returns'] = area('#free_ship_popup').text()

    # HTTP status, sale status and the final (post-redirect) link.
    detail['status_code'] = status_code
    detail['status'] = self.cfg.STATUS_SALE
    detail['backUrl'] = resp.url

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and assemble its detail record.

    The product data comes from ``self.get_pdata`` applied to the page's
    ``#content`` node. As a side effect ``self.link_area`` is set to the
    two-character code captured from the ``/en-xx/`` segment of ``url``.

    :param url: address of the product page; must contain ``/en-xx/``.
    :return: the value of ``tool.return_data``. ``successful=False`` with
        an off-shelf payload is returned for a 404, when
        ``self.checkSoldOut`` is truthy, when ``self.get_sizes`` returns
        ``None`` or when the request raises ``TooManyRedirects``;
        ``successful=False`` with an error payload for any other non-200
        status; otherwise ``successful=True`` with the detail dict.
    :raises AttributeError: when ``url`` has no ``/en-xx/`` segment
        (``re.search`` returns ``None``).
    """
    try:
        resp = self.session.get(url, verify=False)
        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_page():
            # Record the time, page title and url of a rejected request.
            self.logger.info(json.dumps(
                dict(time=time.time(),
                     title=pqhtml('title').text(),
                     url=url)))

        def off_shelf():
            # Shared "product taken down" answer.
            log_page()
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # Taken down: the page no longer exists.
        if status_code == 404:
            return off_shelf()

        # Any other non-200 status is reported as a fetch error.
        if status_code != 200:
            log_page()
            payload = tool.get_error(
                code=status_code,
                message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                backUrl=resp.url,
                html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        area = pqhtml('#content')
        self.link_area = re.search(r'/en-(\w{2})/', url).groups()[0]

        if self.checkSoldOut(pqhtml):
            return off_shelf()

        pdata = self.get_pdata(area)

        detail = dict()

        # Brand.
        brand = pdata['brand']['name']
        detail['brand'] = brand

        # Name: brand followed by the product name.
        detail['name'] = brand + ' ' + pdata['name']

        # Currency code and its symbol.
        currency = pdata['price']['currency']
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Prices.
        price, listPrice = self.get_all_price(pdata)
        detail['price'] = price
        detail['listPrice'] = listPrice

        # Images.
        imgsTmp = self.get_imgs(area)
        detail['img'] = imgsTmp[0]
        detail['imgs'] = imgsTmp

        # Sizes: ``None`` from get_sizes is reported as taken down.
        sizesTmp = self.get_sizes(pdata)
        if sizesTmp is None:
            return off_shelf()

        # Normalise a lone "one size" entry to the configured label.
        if (len(sizesTmp) == 1
                and sizesTmp[0]['name'].lower() in ['one size', 'onesize']):
            sizesTmp[0]['name'] = self.cfg.DEFAULT_ONE_SIZE

        detail['sizes'] = sizesTmp

        # Video, only when the product data lists any.
        if 'videos' in pdata and pdata['videos']:
            detail['video'] = self.get_video(pdata)

        accordion = 'section.product-accordion--desktop>section'

        # Product note, size & fit, details.
        detail['note'] = area(accordion + ':first').text()
        detail['sizeFit'] = area(accordion + ':eq(1)').text()
        detail['detail'] = area(accordion + ':eq(2)').text()

        # Delivery and returns are both read from the last section.
        detail['delivery'] = area(accordion + ':last').text()
        detail['returns'] = area(accordion + ':last').text()

        # Description.
        detail['descr'] = self.get_descr(area)

        # Product id.
        detail['productId'] = pdata['id']

        # Colour name and id, with configured defaults when the product
        # carries no colour information.
        colour_info = pdata['colourInfo']
        if colour_info:
            detail['color'] = colour_info[0]['colourName']
            detail['colorId'] = (colour_info[0]['colourId']
                                 or self.cfg.DEFAULT_COLOR_SKU)
        else:
            detail['color'] = self.cfg.DEFAULT_ONE_COLOR
            detail['colorId'] = pdata['id']

        # HTTP status, sale status and the final (post-redirect) link.
        detail['status_code'] = status_code
        detail['status'] = self.cfg.STATUS_SALE
        detail['backUrl'] = resp.url

        self.logger.info(json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url)))

        return tool.return_data(successful=True, data=detail)

    except TooManyRedirects as e:
        # A redirect loop is reported as a taken-down product; there is
        # no response object here, so the requested url and the exception
        # text stand in for backUrl and html.
        self.logger.exception(e)
        data = tool.get_off_shelf(code=0,
                                  message=self.cfg.SOLD_OUT,
                                  backUrl=url,
                                  html=str(e))
        return tool.return_data(successful=False, data=data)
def simple_package(package_name):
    '''
    Given a package name, returns all the versions available for
    downloading that package.

    Two kinds of eggs are distinguished:

        * private eggs: the eggs that were uploaded to the private repo.
        * normal eggs: the eggs that are downloaded from PyPI.

    The index of a private egg is always built from the local package
    folder. The index of a normal egg is built from the local folder only
    when the package already exists locally and the ``SHOULD_USE_EXISTING``
    setting is enabled; otherwise the simple page is fetched from the
    configured ``PYPI_URL`` and rewritten.

    Proxying normal eggs matters because of versions. For example, one
    project requires request==1.0.4 and another package uses
    request==1.0.3. If the index were built from the local **request**
    folder, which only holds 1.0.4, the installation of the second
    package would fail.

    :param package_name: the name of the egg package. This is only the
                         name of the package, without the version or
                         anything else.

    :return: the rendered ``simple_package.html`` template for a local
             package, or the rewritten HTML of the proxied index page.
             When the upstream index does not answer 200 the request is
             aborted with the upstream status code.
    '''
    app.logger.debug('Requesting index for: %s', package_name)
    package_folder = get_package_path(package_name)

    use_local = is_private(package_name) or (
        exists(package_name) and app.config['SHOULD_USE_EXISTING'])

    if use_local:
        app.logger.debug('Found information of package: %s in local repository',
                         package_name)
        package_versions = []
        template_data = dict(
            source_letter=package_name[0],
            package_name=package_name,
            versions=package_versions
        )

        for filename in listdir(package_folder):
            if not filename.endswith('.md5'):
                # only .md5 files are read, so the egg (or tar, or zip)
                # files themselves are skipped
                continue

            with open(join(package_folder, filename)) as md5_file:
                md5 = md5_file.read(-1)

            # remove .md5 extension
            name = filename[:-4]
            package_versions.append(VersionData(name, md5))

        return render_template('simple_package.html', **template_data)

    app.logger.debug('Didnt found package: %s in local repository. '
                     'Using proxy.', package_name)
    url = app.config['PYPI_URL'] + 'simple/%s' % package_name
    response = get(url)

    if response.status_code != 200:
        # The two sentences need a separator, otherwise the package name
        # runs straight into "Errors details".
        app.logger.warning('Error while getting proxy info for: %s. '
                           'Errors details: %s', package_name,
                           response.text)
        abort(response.status_code)

    p = PyQuery(response.content)
    external_links = set()
    for anchor in p("a"):
        panchor = PyQuery(anchor)
        href = panchor.attr('href')

        if not href:
            # An anchor without a target cannot be a package link; drop
            # it instead of failing on urlparse(None).
            panchor.remove()
            continue

        # robin-jarry: modified the href to ../../packages/
        # so that it works also for non-source packages (.egg, .exe and .msi)
        parsed = urlparse.urlparse(href)

        if not parsed.hostname:
            # local link to pypi.python.org
            if not href.startswith('../../packages/'):
                # ignore anything else than package links
                panchor.remove()
            continue

        # the link is to an external server.
        if parsed.hostname == 'pypi.python.org':
            # we remove the hostname to make the URL relative
            panchor.attr('href', parsed.path)
            continue

        if panchor.attr('rel') == 'download':
            if url_is_egg_file(parsed.path):
                # href points to a filename
                external_links.add('<a href="%s">%s</a>' %
                                   (href, basename(parsed.path)))
            else:
                # href points to an external page where we will find
                # links to package files
                external_links.update(find_external_links(href))
        # whatever happens, the external anchor is removed for now; the
        # collected external links are added back below
        panchor.remove()

    # after collecting all external links, we insert them in the html page
    for link in external_links:
        plink = PyQuery(link)
        href = plink.attr('href')
        plink.attr('href',
                   convert_to_internal_url(href, package_name,
                                           basename(href)))
        # NOTE(review): ``after`` is called on the whole ``a`` selection,
        # which looks like it adds the link after every remaining anchor
        # (and adds nothing when no anchor is left) -- confirm that this
        # is the intended placement.
        p('a').after(plink)

    return p.outerHtml()
def detail(self, url):
    """Fetch a product page and assemble its detail record.

    All product fields are read from the ``#main #primary`` node; price,
    list price and currency come from ``self.get_all_price``.

    :param url: address of the product page.
    :return: the value of ``tool.return_data``: ``successful=False`` with
        an off-shelf payload for a 404, ``successful=False`` with an
        error payload for any other non-200 status, otherwise
        ``successful=True`` with the detail dict.
    :raises IndexError: when no product image is found.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def log_page():
        # Record the time, page title and url of a rejected request.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))

    # Taken down: the page no longer exists.
    if status_code == 404:
        log_page()
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Any other non-200 status is reported as a fetch error.
    if status_code != 200:
        log_page()
        payload = tool.get_error(
            code=status_code,
            message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
            backUrl=resp.url,
            html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Preparation: the product container.
    area = pqhtml('#main #primary')

    detail = dict()

    # Product id (the leading '#' is stripped); sku and code reuse it.
    productId = area('[itemprop="productID"]:first').text().replace('#', '')
    for key in ('productId', 'productSku', 'productCode'):
        detail[key] = productId

    # Brand.
    brand = area('.brand-name:first').text()
    detail['brand'] = brand

    # Name: brand and product name separated by a space.
    product_name = area('.product-name:first').text()
    detail['name'] = ' '.join([brand, product_name])

    # Prices and currency are returned together.
    price, listPrice, currency = self.get_all_price(area)
    detail['price'] = price
    detail['listPrice'] = listPrice

    # Currency; this channel only uses euro, dollar and pound.
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Description.
    detail['descr'] = area(
        '#pdpMain .product-detail .product-information').text()

    # Colour: always the configured defaults.
    detail['color'] = self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = self.cfg.DEFAULT_COLOR_SKU

    # Images.
    imgs = [node.attr('src')
            for node in area('#pdpMain #product-col-2 img').items()]
    detail['img'] = imgs[0]
    detail['imgs'] = imgs

    # Sizes.
    detail['sizes'] = self.get_sizes(area)

    # HTTP status, sale status and the final (post-redirect) link.
    detail['status_code'] = status_code
    detail['status'] = self.cfg.STATUS_SALE
    detail['backUrl'] = resp.url

    # Peer address of the underlying response, when one is recorded.
    peer = resp.raw._original_response.peer
    if peer:
        detail['ip_port'] = ':'.join(str(part) for part in peer)

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and assemble its per-look detail record.

    Product data is parsed from the text of the page's ``<script>``
    elements through ``self.get_pdata`` and ``self.get_skuJournal``. Most
    fields of the resulting record are dicts keyed by ``productLookId``.

    :param url: address of the product page.
    :return: the value of ``tool.return_data``: ``successful=False`` with
        an off-shelf payload for a 404 or when the script text does not
        contain ``productDetails``, ``successful=False`` with an error
        payload for any other non-200 status, otherwise
        ``successful=True`` with the detail dict.
    :raises IndexError: when a look has no images.
    """
    resp = self.session.get(url, timeout=self.cfg.REQUEST_TIME_OUT)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def log_page():
        # Record the time, page title and url of a rejected request.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))

    def off_shelf():
        # Shared "product taken down" answer.
        log_page()
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Taken down: the page no longer exists.
    if status_code == 404:
        return off_shelf()

    # Any other non-200 status is reported as a fetch error.
    if status_code != 200:
        log_page()
        payload = tool.get_error(
            code=status_code,
            message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
            backUrl=resp.url,
            html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    Jtxt = pqhtml('script').text()

    # Taken down: the page carries no product payload.
    if 'productDetails' not in Jtxt:
        return off_shelf()

    pdata = self.get_pdata(Jtxt)

    # Preparation.
    product = pdata['product']
    allLooks = product['allLooks']
    skuJournal = self.get_skuJournal(Jtxt)

    # Products without a size/colour attribute get a single default value.
    if 'sizeAttribute' in product:
        sizeAttribute = product['sizeAttribute']
    else:
        sizeAttribute = {
            'values': [{'id': 0, 'value': self.cfg.DEFAULT_ONE_SIZE}]
        }
    if 'colorAttribute' in product:
        colorAttribute = product['colorAttribute']
    else:
        colorAttribute = {
            'values': [{'id': 0, 'value': self.cfg.DEFAULT_ONE_COLOR}]
        }

    # lookId -> image urls (the source urls are prefixed with 'http:').
    lookId2ImgArr = {
        look['productLookId']: ['http:' + img['retinaQuickViewLookUrl']
                                for img in look['images']]
        for look in allLooks
    }

    # lookId -> sale price (one price per colour).
    lookId2Price = {
        look['productLookId']: look['pricing']['maxSkuSalePrice']['raw']
        for look in allLooks
    }

    # lookId -> list price (one price per colour).
    lookId2ListPrice = {
        look['productLookId']: look['pricing']['maxSkuMsrpPrice']['raw']
        for look in allLooks
    }

    # lookId -> sku ids.
    lookId2SkuArr = {
        look['productLookId']: [sku['skuId'] for sku in look['skus']]
        for look in allLooks
    }

    # sizeId -> size name, e.g. {2000: u's', 2001: u'm', 2002: u'l'}.
    sizeId2Name = {size['id']: size['value']
                   for size in sizeAttribute['values']}

    # colorId -> colour name, e.g. {1000: u'dark red', 1001: u'true navy'}.
    colorId2Name = {color['id']: color['value']
                    for color in colorAttribute['values']}

    # sku -> inventory, for the skus that are in stock.
    sku2Inventory = self.get_sku2Inventory(skuJournal)

    entries = skuJournal['entries']

    # sku -> inventory, for the skus flagged 'X' or 'U'. The status has to
    # be tested for membership: comparing it with the list itself never
    # matches, which would leave this mapping permanently empty.
    # NOTE(review): 'X'/'U' presumably mark unavailable skus -- confirm
    # against a real sku journal.
    sku2NoInventory = {
        entry['skuId']: entry.get('numberUnitsForSale', 0)
        for entry in entries
        if entry['type'] == 'inventory' and entry['status'] in ('X', 'U')
    }

    # Merge the no-inventory skus into the inventory map.
    sku2Inventory.update(sku2NoInventory)

    # sku -> sale price and list price (prices may differ per size).
    sku2Price = {entry['skuId']: str(entry['salePrice']['raw'])
                 for entry in entries if entry['type'] == 'pricing'}
    sku2ListPrice = {entry['skuId']: str(entry['msrpPrice']['raw'])
                     for entry in entries if entry['type'] == 'pricing'}

    # skuId -> sizeId and skuId -> colorId, from the 'associate' entries.
    skuId2SizeId = {
        entry['skuId']: entry['savId']
        for entry in entries
        if entry['type'] == 'associate' and entry['attribute'] == 'Size'
    }
    skuId2ColorId = {
        entry['skuId']: entry['savId']
        for entry in entries
        if entry['type'] == 'associate' and entry['attribute'] == 'Color'
    }

    # sku -> size name and sku -> colour name.
    sku2SizeName = self.get_sku2SizeName(product, skuId2SizeId, sizeId2Name)
    sku2ColorName = self.get_sku2ColorName(product, skuId2ColorId,
                                           colorId2Name)

    # lookId -> colour id and lookId -> colour name.
    lookId2ColorId = self.get_lookIe2ColorId(lookId2SkuArr, skuId2ColorId)
    lookId2ColorName = self.get_lookIe2ColorName(lookId2SkuArr,
                                                 sku2ColorName)

    # lookId -> sizes.
    lookId2Sizes = self.get_lookId2Sizes(lookId2SkuArr, sku2SizeName,
                                         sku2Inventory, sku2Price,
                                         sku2ListPrice)

    detail = dict()

    # Keys: every look of the product, not only the one named in the url.
    detail['keys'] = list(lookId2SkuArr)

    # Colour.
    detail['color'] = lookId2ColorName
    detail['colorId'] = lookId2ColorId

    # Product id.
    detail['productId'] = product['productId']

    # Images: first image per look, plus the full lists.
    detail['img'] = {look_id: img_arr[0]
                     for look_id, img_arr in lookId2ImgArr.items()}
    detail['imgs'] = lookId2ImgArr

    # Sizes.
    detail['sizes'] = lookId2Sizes

    # Prices.
    detail['price'] = lookId2Price
    detail['listPrice'] = lookId2ListPrice

    # Brand.
    brand = pdata['brand']['name']
    detail['brand'] = brand

    # Name: brand followed by the product name.
    detail['name'] = brand + ' ' + pdata['product']['name']

    # Currency code and its symbol.
    currency = pdata['defaultLook']['pricing']['currencyCode']
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Returns policy.
    detail['returns'] = pdata['returnPolicy']['description']

    # Description, with the <strong> elements removed.
    dtxt = PyQuery(pdata['product']['description'])
    dtxt.remove('strong')
    detail['descr'] = dtxt.text()

    # HTTP status, sale status and the final (post-redirect) link.
    detail['status_code'] = status_code
    detail['status'] = self.cfg.STATUS_SALE
    detail['backUrl'] = resp.url

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and assemble its detail record.

    Images and part of the description come from ``body div.left``; the
    remaining product fields come from ``body .right``. The product id is
    the number following ``goods/`` in ``url`` and the currency is always
    ``CNY``.

    :param url: address of the product page; must contain ``goods/<id>``.
    :return: the value of ``tool.return_data``: ``successful=False`` with
        an off-shelf payload for a 404, for a page without a ``#buy``
        element or without a ``body .right`` node, ``successful=False``
        with an error payload for any other non-200 status, otherwise
        ``successful=True`` with the detail dict.
    :raises AttributeError: when ``url`` has no ``goods/<id>`` part
        (``re.search`` returns ``None``).
    :raises IndexError: when no ``img.small`` image is found.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')
    add_tocart = pqhtml('#buy')

    def log_page():
        # Record the time, page title and url of a rejected request.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))

    def off_shelf():
        # Shared "product taken down" answer.
        log_page()
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Taken down: missing page or no add-to-cart button.
    # NOTE(review): this test runs before the generic status check, so a
    # non-200 page without '#buy' is also reported as off-shelf -- confirm
    # that this is intended.
    if status_code == 404 or not add_tocart:
        return off_shelf()

    # Any other non-200 status is reported as a fetch error.
    if status_code != 200:
        log_page()
        payload = tool.get_error(
            code=status_code,
            message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
            backUrl=resp.url,
            html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Preparation: image column and product column.
    img_area = pqhtml('body div.left')
    prod_area = pqhtml('body .right')

    # Taken down: the product column is missing.
    if not prod_area:
        return off_shelf()

    detail = dict()

    # Product id, taken from the url; sku and code reuse it.
    productId = re.search(r'goods\/(\d+)[\/]?', url).groups()[0]
    for key in ('productId', 'productSku', 'productCode'):
        detail[key] = productId

    # Brand: last paragraph with the link label removed.
    brand_text = prod_area('p:last').text()
    brand = brand_text.replace(u'进入品牌', '').strip()
    detail['brand'] = brand

    # Name: the element just before '#kuriosity_code'.
    detail['name'] = prod_area('#kuriosity_code').prev().text()

    # Currency code and its symbol.
    currency = 'CNY'
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Prices.
    price, listPrice = self.get_all_price(prod_area)
    detail['price'] = price
    detail['listPrice'] = listPrice

    # Returns policy: intentionally left blank.
    detail['returns'] = ''

    # Description: the after-sales block is emptied first so that it does
    # not end up in the text.
    img_area('div:last').empty()
    summary = prod_area('.text').text()
    detail['descr'] = summary + img_area('div:first').text()

    # Colour: always the configured defaults.
    detail['color'] = self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = self.cfg.DEFAULT_COLOR_SKU

    # Images: the site prefix is prepended to each thumbnail source.
    imgs = ['https://www.k11kuriosity.com' + node.attr('src')
            for node in img_area('img.small').items()]
    detail['img'] = imgs[0]
    detail['imgs'] = imgs

    # Sizes.
    detail['sizes'] = self.get_sizes(prod_area)

    # HTTP status, sale status and the final (post-redirect) link.
    detail['status_code'] = status_code
    detail['status'] = self.cfg.STATUS_SALE
    detail['backUrl'] = resp.url

    # Peer address of the underlying response, when one is recorded.
    peer = resp.raw._original_response.peer
    if peer:
        detail['ip_port'] = ':'.join(str(part) for part in peer)

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=detail)