def name_en(cls, input_value):
    """Extract the English part of a movie title from an <h1> HTML fragment.

    Titles look like "vn-part - en-part"; the segment after the last dash is
    returned.  Relies on Python-2 ``str()`` semantics: encoding a non-ASCII
    (non-English) title raises UnicodeEncodeError, in which case ``None`` is
    returned.
    """
    try:
        if isinstance(input_value, list):
            input_value = ''.join(input_value)
        name = Selector(text=input_value).xpath("//h1//text()")
        if len(name.re("-.*-(.*)")) > 0:
            # Double-dash title: keep the trailing segment.
            try:
                return str(''.join(name.re("-.*-(.*)")))
            except UnicodeEncodeError:
                # Non-ASCII -> not an English name.
                return
        elif len(name.re("-(.*)")) > 0:
            # Single dash: return the match list after the dash.
            return name.re("-(.*)")
        else:
            # No dash at all: fall back to the raw <h1> text.
            name = name.extract()
            if isinstance(name, list):
                name = ''.join(name)
            try:
                return str(name)
            except UnicodeEncodeError:
                return
    except Exception as e:
        # BUGFIX: was the Python-2-only statement form ``print e``; the rest
        # of this file uses the print() function, which works on both 2 and 3.
        print(e)
def name_vn(cls, input_value):
    """Extract the Vietnamese part of a movie title from an HTML fragment.

    Relies on Python-2 ``str()`` semantics: encoding a Vietnamese (non-ASCII)
    title raises UnicodeEncodeError, which is the signal that the text really
    is Vietnamese and should be returned; a title that encodes cleanly is
    treated as English and ``None`` is returned instead.
    """
    if isinstance(input_value, list):
        input_value = ''.join(input_value)
    crumbs = Selector(text=input_value).xpath(
        "//span[@itemprop='title']//text()").extract()
    try:
        crumbs = str(''.join(crumbs))
        title = Selector(text=input_value).xpath("//h1//text()")
        double_dash = title.re("(.*-.*-.*)")
        if len(double_dash) > 0:
            try:
                str(''.join(double_dash))
            except UnicodeEncodeError:
                # Non-ASCII: this is the Vietnamese title.
                return ''.join(double_dash)
            # ASCII-only -> English title, no Vietnamese name.
            return
        single_dash = title.re("(.*)-")
        if len(single_dash) > 0:
            # Part before the dash; returned as a match list like the original.
            return single_dash
        raw = title.extract()
        if isinstance(raw, list):
            raw = ''.join(raw)
        try:
            str(raw)
        except UnicodeEncodeError:
            return raw
        return
    except UnicodeEncodeError:
        # Breadcrumb title itself is non-ASCII: return it joined.
        return ''.join(crumbs)
def neirong(self, response):
    """Parse one movie detail page into a DianyingItem (name, cast, dates,
    synopsis, Douban score and download links)."""
    sel = Selector(response)
    item = DianyingItem()
    item['moviename'] = sel.xpath('//h1[@class="font14w"]/text()').extract()
    # Short intro: first two <span>s of the info block next to the title.
    item['jianjie'] = ''.join(
        sel.xpath(
            '//div[@class="info" and child::h1[@class="font14w"]]/span/text()'
        )[0:2].extract()).strip()
    item['actor'] = sel.xpath(
        '//span/a[contains(@href,"actor")]/text()').extract()
    item['kind'] = sel.xpath(
        '//span/a[contains(@href,"----")]/text()').extract()
    item['country'] = sel.xpath(
        '//span[child::span[contains(text(),"地区")]]/a/text()').extract()
    item['language'] = sel.xpath(
        '//span[child::span[contains(text(),"语言")]]/a/text()').extract()
    item['daoyan'] = sel.xpath(
        '//span/a[contains(@href,"dir")]/text()').extract()
    # Release / runtime / update-date come from regexes over the raw body.
    item['sysj'] = sel.re('上映日期:.*?(\d{4}-\d{2}-\d{2})')
    item['pc'] = sel.re('片长:\D+?(\d+[\u4E00-\u9FA5]+)')
    item['gxsj'] = sel.re('更新日期:.*?(\d{4}-\d{2}-\d{2})')
    item['jqjs'] = ''.join(
        sel.xpath('//div[@id="movie_content"]/text()').extract()).strip()
    item['dbpf'] = sel.xpath(
        '//span[child::span[contains(text(),"豆瓣评分")]]/text()').re('\d+.\d+')
    # Download links come from two places: anchor hrefs plus checkbox values.
    links = sel.xpath(
        '//div[@id="cpdl2list"]//a[@rel="nofollow"]/@href').extract()
    links.extend(sel.xpath('//input[@class="checkone"]/@value').extract())
    item['downlink'] = links
    print(item)
    return item
def _enrich_same_part(self, item, response):
    """Fill the fields shared by every variant of this product from the
    inline ``var model_nbr/model/styles`` JS assignments, then count the page."""
    selector = Selector(response)
    item['NBR'] = ''.join(selector.re(r'var model_nbr = (.*);')).strip()
    # model and styles are JSON blobs; strip spaces before parsing.
    for field, pattern in (('model', r'var model = (.*);'),
                           ('styles', r'var styles = (.*);')):
        raw = ''.join(selector.re(pattern)).strip().replace(' ', '')
        item[field] = json.loads(raw)
    meta = response.meta
    self.crawler.stats.inc_crawled_pages(
        crawlid=meta['crawlid'],
        spiderid=meta['spiderid'],
        appid=meta['appid'])
def _enrich_same_part(self, item, response):
    """Populate NBR/model/styles from inline JS variables, then bump the
    crawled-pages counter for this crawl."""
    sel = Selector(response)
    item['NBR'] = ''.join(sel.re(r'var model_nbr = (.*);')).strip()
    model_raw = ''.join(sel.re(r'var model = (.*);')).strip().replace(' ', '')
    item['model'] = json.loads(model_raw)
    styles_raw = ''.join(sel.re(r'var styles = (.*);')).strip().replace(' ', '')
    item['styles'] = json.loads(styles_raw)
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
def _enrich_same_part(self, item, response):
    """Extract current/previous price, the JS price list and the product id."""
    sel = Selector(response)
    now_parts = sel.xpath(
        '//div[@id="productPrice"]/div/span[@class="nowPrice"]/text()').extract()
    item['nowPrice'] = ''.join(now_parts).strip()
    was_parts = sel.xpath(
        '//div[@id="productPrice"]/div/span[@class="wasPrice"]/text()').extract()
    item['wasPrice'] = ''.join(was_parts).strip()
    # price comes straight from the inline JS; note this one stays a list.
    item['price'] = sel.re(r'price: \"(.*?)\"')
    item['product_id'] = ''.join(sel.re(r'"product_id" : \["(.*?)\"'))
def parse_item(self, response):
    """Parse a Finishline product page, then chain a request for the
    scene7 image set of the current color/style."""
    print("FinishlineSpider#parse_item ...")
    self._logger.debug("FinishlineSpider#parse_item ...")
    sel = Selector(response)
    item = FinishlineItem()
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)
    item['title'] = ''.join(
        sel.xpath('//h1[@id="title"]/text()').extract()).strip()
    # Each size becomes an [element-id, label] pair.
    item['size'] = [
        [''.join(size.xpath('@id').extract()),
         ''.join(size.xpath('text()').extract())]
        for size in sel.xpath('//div[@id="productSizes"]/div[@class="size"]')
    ]
    item['productDescription'] = format_html_string(
        ''.join(sel.xpath('//div[@id="productDescription"]').extract()))
    item['product_images'] = json.loads(
        ''.join(sel.re(r"JSON.parse\(\'(.*?)\'")).strip())
    item['links'] = ''.join(sel.re(r"links: \'(.*?)\'")).split(';')
    item['product_color'] = ''.join(sel.re(r'"product_color" : \["(.*?)\"'))
    item['style_color_ids'] = ''.join(sel.xpath(
        '//div[@id="styleColors"]/span[@class="styleColorIds"]/text()'
    ).extract())
    colorid = ''.join(sel.xpath('//h1[@id="title"]/@data-colorid').extract())
    styleid = ''.join(sel.xpath('//h1[@id="title"]/@data-styleid').extract())
    imageset_url = ('http://www.finishline.com/store/api/scene7/imageset/'
                    '?colorId=%s&styleId=%s' % (colorid, styleid))
    # Carry the half-built item along to parse_images via meta.
    meta = response.meta
    meta['item-half'] = item
    request = Request(url=imageset_url,
                      meta=meta,
                      callback=self.parse_images,
                      dont_filter=response.request.dont_filter)
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
    print('self.crawler.stats.inc_crawled_pages::::::::::', )
    yield request
def get_comment_url(response, page_num=1):
    """Build the off-site comment-feed URL for a china.com forum thread.

    The thread id is chopped into path segments (all-but-5, 2, 2, 1 trailing
    characters), mirroring the comment server's directory layout.
    """
    sel = Selector(response)
    forum_id = sel.re(r'var\s+forumid\s*=\s*(.*);')[0]
    thread_id = sel.re(r'var\s+threadid\s*=\s*(.*);')[0]
    thread_path = '/'.join(
        [thread_id[:-5], thread_id[-5:-3], thread_id[-3:-1], thread_id[-1:]])
    return ('http://st01.club.china.com/data/thread/' + forum_id + '/'
            + thread_path + '_' + str(page_num) + '_re.js')
def parse_car_conf(self, response):
    """Locate the price/config AJAX URL embedded in the page JS and follow it."""
    car_id = response.meta.get('id')
    matches = Selector(response).re(r'url:(.*Price.*seriesId.*)')
    # Strip quotes/commas from the JS snippet, then rebase onto the main host.
    raw_url = matches[0].strip().replace('"', '')
    conf_url = self.mian_urls + raw_url.replace(',', '').split('/', 1)[1]
    yield Request(conf_url, callback=self.prase_car_conf, meta={'id': car_id})
def get_comment_list(response, post_id):
    """Yield a CommentItem per comment in a comment-feed JS response.

    The feed is almost-JSON; a series of targeted replacements repairs its
    escaping before json.loads.  Parse failures are printed and swallowed,
    matching the original best-effort behavior.
    """
    sel = Selector(response)
    # noinspection PyBroadException
    try:
        comment_json = sel.re(r'page_obj\s*=\s*(.*);\s*printReCallBack')[0]
    except:
        comment_json = None
        print('comment page error: ' + response.url)
    if not comment_json:
        return
    # noinspection PyBroadException
    try:
        repaired = (comment_json
                    .replace('"%5C%22', '\\"')
                    .replace('%5C%22"', '\\"')
                    .replace('"\\', "")
                    .replace(';" src=', ';\\" src=')
                    .replace('onerror="', "onerror='"))
        comment_obj = json.loads(repaired)
        if comment_obj:
            for comment in comment_obj['l']:
                comment_item = CommentItem()
                comment_item['post_id'] = post_id
                comment_item['comment_id'] = comment['mi']
                comment_item['author_id'] = comment['ui']
                comment_item['author_name'] = comment['nc']
                comment_item['date_time'] = comment['cd']
                comment_item['floor'] = comment['lc']
                comment_item['content'] = comment['nr']
                yield comment_item
    except:
        print('json: ' + comment_json + ' error!!')
def parse_item(self, response):
    """Scrape one Finishline product page and yield a follow-up request for
    its scene7 image set."""
    print("FinishlineSpider#parse_item ...")
    self._logger.debug("FinishlineSpider#parse_item ...")
    sel = Selector(response)
    item = FinishlineItem()
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)
    title_parts = sel.xpath('//h1[@id="title"]/text()').extract()
    item['title'] = ''.join(title_parts).strip()
    size_rows = []
    for size in sel.xpath('//div[@id="productSizes"]/div[@class="size"]'):
        size_id = ''.join(size.xpath('@id').extract())
        size_label = ''.join(size.xpath('text()').extract())
        size_rows.append([size_id, size_label])
    item['size'] = size_rows
    description_html = ''.join(
        sel.xpath('//div[@id="productDescription"]').extract())
    item['productDescription'] = format_html_string(description_html)
    item['product_images'] = json.loads(
        ''.join(sel.re(r"JSON.parse\(\'(.*?)\'")).strip())
    item['links'] = ''.join(sel.re(r"links: \'(.*?)\'")).split(';')
    item['product_color'] = ''.join(sel.re(r'"product_color" : \["(.*?)\"'))
    item['style_color_ids'] = ''.join(sel.xpath(
        '//div[@id="styleColors"]/span[@class="styleColorIds"]/text()'
    ).extract())
    colorid = ''.join(sel.xpath('//h1[@id="title"]/@data-colorid').extract())
    styleid = ''.join(sel.xpath('//h1[@id="title"]/@data-styleid').extract())
    imageset_url = 'http://www.finishline.com/store/api/scene7/imageset/?colorId=%s&styleId=%s' % (
        colorid, styleid)
    # Pass the half-built item along to parse_images.
    meta = response.meta
    meta['item-half'] = item
    self.crawler.stats.inc_crawled_pages(
        crawlid=meta['crawlid'],
        spiderid=meta['spiderid'],
        appid=meta['appid'])
    print('self.crawler.stats.inc_crawled_pages::::::::::', )
    yield Request(url=imageset_url,
                  meta=meta,
                  callback=self.parse_images,
                  dont_filter=response.request.dont_filter)
def _enrich_same_part(self, item, response):
    """Extract title, description and the inline-JS JSON blobs that are the
    same for every variant of the product."""
    sel = Selector(response)
    item['title'] = ' '.join(
        sel.xpath('//*[@id="prdImage"]/h1/*//text()').extract()).strip()
    if len(item['title']) < 2:
        # Alternate layout: title lives under the product stage instead.
        item['title'] = ' '.join(
            sel.xpath('//*[@id="productStage"]/h1/*/text()').extract()).strip()
    item['productDescription'] = format_html_string(
        ''.join(sel.xpath('//div[@id="prdInfoText"]').extract()).strip())
    if len(item['productDescription']) == 0:
        item['productDescription'] = format_html_string(
            ''.join(sel.xpath('//div[@id="productDescription"]').extract()).strip())
    # "var xxx = ...;" JSON blobs, whitespace-stripped before parsing.
    for key, pattern in (
            ('stockJSON', r'var stockJSON =(.*);'),
            ('dimensions', r'var dimensions =(.*);'),
            ('dimToUnitToValJSON', r'var dimToUnitToValJSON =(.*);'),
            ('dimensionIdToNameJson', r'var dimensionIdToNameJson =(.*);'),
            ('valueIdToNameJSON', r'var valueIdToNameJSON =(.*);')):
        item[key] = json.loads(
            ''.join(sel.re(pattern)).strip().replace(' ', ''))
    # These four are matched non-greedily against the raw body instead.
    for key, pattern in (
            ('colorNames', r'var colorNames =(.*?);'),
            ('colorPrices', r'var colorPrices =(.*?);'),
            ('styleIds', r'var styleIds =(.*?);'),
            ('colorIds', r'var colorIds =(.*?);')):
        item[key] = json.loads(re_search(pattern, response.body))
def get_price(self):
    """Fetch the quote page for this symbol and return (timestamp, price);
    both are None when the last-trade-time span is missing."""
    resp = requests.get(_BASE_URL.format(self.symbol), timeout=20)
    sel = Selector(text=resp.text)
    company_id = sel.re(r"_chartConfigObject.companyId = '(\d+)';")[0]
    last_trade = sel.xpath(
        "//span[@id='ref_{}_ltt']/text()".format(company_id)).extract()
    if not last_trade:
        return None, None
    stamp = '{} {}'.format(self.date, last_trade[0].strip())
    price = sel.xpath(
        "//span[@id='ref_{}_l']/text()".format(company_id)).extract()[0]
    return stamp, price
async def get_film(self, url):
    """Resolve a film page into (film_name, bt_name, download_url).

    Returns None implicitly when either the film title or the torrent link
    text is missing from the page.
    """
    source = await html_source(url)
    bt_url = re.findall(r'href="(attach-dialog-fid-.*\.htm)"', source)
    sel = Selector(text=source)
    title_matches = sel.re(r'\[BT下载\].*B\b')
    film_name = title_matches[0] if title_matches else ''
    bt_name = sel.css('td:nth-child(1) > a::text').extract_first()
    if not (film_name and bt_name):
        return
    # The attachment "dialog" URL doubles as the download URL.
    download_url = self.domain + bt_url[0].replace('dialog', 'download')
    return film_name, bt_name, download_url
def _enrich_same_part(self, item, response):
    """Pull suggested/our price and the dtm product id out of the page."""
    sel = Selector(response)
    suggested_parts = sel.xpath(
        '//*[@id="divPricing"]/span[1]/s/text()').extract()
    item['suggested_price'] = ''.join(suggested_parts).strip()
    our_parts = sel.xpath('//*[@id="productprice"]/span/text()').extract()
    item['our_price'] = ''.join(our_parts).strip()
    item['product_id'] = ''.join(
        sel.re(r"dtmProductId = \'(.*?)\'")).strip()
def get_prise_num_dict(self, response):
    """Return the comment-votes mapping embedded in the page JS.

    Reads the single-quoted JSON string assigned to ``commentsVotes`` and
    parses it; returns {} when the captured value is empty.
    """
    sel = Selector(response)
    # Raw string: the original non-raw pattern only worked because '\s'
    # happens to survive as a literal backslash, which is an invalid-escape
    # warning in Python 3.6+; the compiled regex is unchanged.
    comment_vote = sel.re(r"var\s+commentsVotes\s*=\s*\'(.*)\',")[0]
    if comment_vote:
        return json.loads(comment_vote)
    return {}
def get_game_id_name_map(self):
    """Fetch game.js and return a {game name: game id} mapping."""
    url = 'https://m.dailiantong.com/js/game.js'
    resp = requests.get(url)
    doc = Selector(text=resp.text)
    # BUGFIX: escape the dot — the original 'JSON.parse' pattern let '.'
    # match any character, so it could also match e.g. 'JSONxparse'.
    raw = doc.re(r'JSON\.parse\(\'(.*)\'\);')[0]
    games = json.loads(raw)
    return {game['GameName']: game['GameID'] for game in games}
def get_order_list(self, game_code):
    """Fetch the JSONP-wrapped levelling-order list for one game and return
    the decoded 'LevelOrderList' payload."""
    url = 'https://server.dailiantong.com/API/AppService.ashx?Action=LevelOrderList&callback=callback&'
    params = 'IsPub=1&GameID={}&ZoneID=0&ServerID=0&SearchStr=&Sort_Str=&PageIndex=1&PageSize=20000&Price_Str=&PubCancel=0&SettleHour=0&UserID=0&TimeStamp={}&Ver=1.0&AppOS=webapp&AppID=webapp'.format(
        game_code, self.timestamp)
    # Sign is computed over the unsigned query string, then appended.
    sign = self.get_sgin("LevelOrderList", params)
    params += '&Sign={}'.format(sign)
    resp = requests.get(url + params)
    # Strip the callback(...) JSONP wrapper before decoding.
    payload = Selector(text=resp.text).re(r'callback\((.*)\)')[0]
    return json.loads(payload)['LevelOrderList']
def parse_item(self, response):
    """Parse an Amazon product detail page into an AmazonItem: category
    nodes, titles, descriptions, brand and the variation JSON blobs."""
    self._logger.info("start response in parse_item -> response type:%s"
                      % type(response).__name__)
    sel = Selector(response)
    item = AmazonItem()
    self._enrich_base_data(item, response, is_update=False)
    node_id_re = re.compile(r'node=(?P<node_id>\w+)')
    # Breadcrumb category node ids.
    breadcrumb_hrefs = sel.xpath(
        '//div[@id="wayfinding-breadcrumbs_feature_div"]//a/@href').extract()
    item['node_ids'] = [
        node_id_re.search(href).group('node_id')
        for href in breadcrumb_hrefs if node_id_re.search(href)
    ]
    # "Look for Similar Items by Category" link groups.
    similar_link_groups = [
        p.xpath('a/@href').extract()
        for p in sel.xpath('//div[@id="browse_feature_div"]/div/p')
    ]
    item['similar_node_ids'] = [
        [node_id_re.search(href).group('node_id') for href in links]
        for links in similar_link_groups
    ]
    item['parent_asin'] = ''.join(sel.re(r'"parent_asin":"(.*?)"')).strip()
    if len(item['parent_asin']) == 0:
        # Fall back to the add-to-cart form's ASIN field.
        item['parent_asin'] = ''.join(sel.xpath(
            '//form[@id="addToCart"]/input[@id="ASIN"]/@value'
        ).extract()).strip()
    item['title'] = ''.join(
        sel.xpath('//span[@id="productTitle"]/text()').extract()).strip()
    item['product_specifications'] = format_html_string(''.join(sel.xpath(
        '//div[@id="technicalSpecifications_feature_div"]//table'
    ).extract()).strip())
    item['product_description'] = format_html_string(''.join(sel.xpath(
        '//div[@id="productDescription"]//p/text()').extract()).strip())
    # Brand: prefer the /<brand>/b/ href, else the visible link text.
    brand_href = ''.join(sel.xpath('//a[@id="brand"]/@href').extract()).strip()
    brand_match = re.compile(r'^/(?P<brand>.*)/b/').search(brand_href)
    if brand_match:
        item['brand'] = brand_match.group('brand')
    else:
        item['brand'] = ''.join(
            sel.xpath('//a[@id="brand"]/text()').extract()).strip()
    item['feature'] = format_html_string(''.join(
        sel.xpath('//div[@id="feature-bullets"]').extract()).strip())
    item['dimensions_display'] = safely_json_loads(format_html_string(
        ''.join(sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
    item['variations_data'] = safely_json_loads(''.join(
        sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
    enrich_color_images(item, sel)
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
    return item
def get_info(self):
    """Log in via a stored cookie, scrape the market-stats heading and
    persist the registration/application counters through write_sql."""
    # Masked credentials / login-page locators for the cookie helper.
    xpath_info = {
        "username": '******',
        "password": '******',
        "login_button": '//*[@id="login"]',
        "check_code": "",
        "code_image_url": "",
        "success_ele": ""
    }
    session = Session()
    # Obtain the login cookie (no-captcha flow) and attach it to the session.
    cookie = self.no_check_get_cookie(xpath_info)
    print(cookie)
    session.cookies.update(cookie_to_dict(cookie))
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
    }
    resp = session.get(
        "http://demand.lianfen360.com/market?id=mk103BP3r&login=true",
        headers=headers)
    stats_node = Selector(
        text=resp.text).xpath('/html/body/div/div[6]/div/h2/text()')
    print(stats_node)
    self.write_sql({
        "注册人数": stats_node.re('注册数:(\d+)')[0],
        "实名人数": "null",
        "申请人数": stats_node.re('申请数:(\d+)')[0],
        "放款人数": "null",
        "备注": ''
    })
def prase_url(self, response):
    """Reassemble a URL that the page builds piecewise via JS "url += '...'"
    statements, then queue a GET for it."""
    self.captcha_check(response)
    sel = Selector(text=response.text)
    # Grab everything between the url declaration and the first url.replace.
    script = sel.re("var url = '';([\s\S]*?)url.replace")[0]
    script = script.replace("\r\n", "").replace(" ", "")
    pieces = []
    for stmt in script.split(";"):
        if stmt != "":
            # Each non-empty statement is "url+='fragment'".
            pieces.append(stmt.split("+=")[1].replace("\'", ""))
    url = "".join(pieces)
    self.add_request(
        Request(url=url,
                method="GET",
                verify=False,
                headers=self.headers,
                callback=self.prase_content,
                meta=response.meta))
def parse_zsptbs_bbs_no_boardid(self, response):
    """Discover the bbs board id on a GBK-encoded page and request page 1
    of that board's chat bbs."""
    hd_id = response.meta["hd_id"]
    stockcode = response.meta["stockcode"]
    content = response.body.decode('gbk').encode("utf-8")
    sel = Selector(text=content)
    # Raw string avoids the Python-3 invalid-escape warning for '\d';
    # the compiled regex is unchanged.
    rid_pattern = sel.re(r"boardid=\d+']")
    # 'boardid=' is 8 chars; [8:] takes everything after it.
    # NOTE(review): this keeps the trailing "']" of the match inside
    # boardid (and thus inside the URL) — confirm the target server
    # really tolerates that before tightening the slice.
    boardid = rid_pattern[0][8:]
    zsptbs_url = "http://zsptbs.p5w.net/bbs/chatbbs/left.asp?boardid=%s&pageNo=1" % boardid
    yield Request(url=zsptbs_url,
                  meta={
                      "hd_id": hd_id,
                      "pageNo": '1',
                      "boardid": boardid,
                      "stockcode": stockcode
                  },
                  callback=self.parse_zsptbs_bbs)
def parse_no_rid(self, response):
    """Find the numeric rid embedded in the page and post the first
    question-page request for it."""
    hd_id = response.meta["hd_id"]
    sel = Selector(response)
    # Raw string avoids the Python-3 invalid-escape warning for '\d';
    # 'rid=' is 4 chars, so [4:] keeps just the digits.
    rid = sel.re(r"rid=\d+")[0][4:]
    question_url = "http://ircs.p5w.net/ircs/topicInteraction/questionPage.do"
    yield FormRequest(url=question_url,
                      meta={
                          "hd_id": hd_id,
                          "pageNo": '1'
                      },
                      formdata={
                          'pageNo': str(1),
                          'rid': rid
                      },
                      callback=self.parseIndex)
def get_comment_prise(response):
    """Parse the page's 'dingJson' JS blob into a {messageid: ding} dict."""
    sel = Selector(response)
    prise_json = sel.re(r'var\s+dingJson\s*=\s*(.*);')[0]
    # The blob uses unquoted keys and single quotes; repair it into JSON.
    repaired = (('{"l":' + prise_json + '}')
                .replace('messageid', '"messageid"')
                .replace('ding', '"ding"')
                .replace('\'', '"'))
    prise_list = json.loads(repaired)['l']
    return {prise['messageid']: prise['ding'] for prise in prise_list}
def first_parse_item(self, response):
    """Emit a price item per row on page 1, then schedule pages 2..N using
    the v_PageCount JS variable."""
    sel = Selector(response)
    rows = sel.xpath('//section/div/div[1]/table/tr')
    times = rows.xpath("td[1]/text()").extract()
    prices = rows.xpath("td[3]/span/text()").extract()
    for index, tt in enumerate(times):
        res_item = ScprojectItem()
        res_item['time'] = tt
        res_item['price'] = prices[index]
        # Fixed site/market ids for this source.
        res_item['sid'] = '1111'
        res_item['mid'] = '111111'
        yield res_item
    page = sel.re(r'var v_PageCount = (\d*);')
    surl = response.url + '&page=%d'
    # BUGFIX: sel.re() returns a list and is never None, so the original
    # 'if not page is None' was always true and 'page[0]' raised IndexError
    # whenever the page count was absent. Test for a non-empty match.
    if page:
        for x in range(2, int(page[0]) + 1):
            url = surl % x
            print(url)
            yield Request(url, callback=self.parse_item)
def _enrich_same_part(self, item, response):
    """Record the now/was display prices, the JS price list and the id."""
    sel = Selector(response)
    item['nowPrice'] = ''.join(sel.xpath(
        '//div[@id="productPrice"]/div/span[@class="nowPrice"]/text()'
    ).extract()).strip()
    item['wasPrice'] = ''.join(sel.xpath(
        '//div[@id="productPrice"]/div/span[@class="wasPrice"]/text()'
    ).extract()).strip()
    # Note: 'price' intentionally stays the raw match list.
    item['price'] = sel.re(r'price: \"(.*?)\"')
    item['product_id'] = ''.join(sel.re(r'"product_id" : \["(.*?)\"'))
def parse_item_update(self, response):
    # Re-crawl pass for an existing Amazon product: refreshes price, stock
    # and shipping fields and classifies availability. Returns an item, or a
    # follow-up Request when the price lives on the "unqualified buy box" page.
    self._logger.info("start response in parse_item_update -> response type:%s" % type(response).__name__)
    item = AmazonItem()
    meta = response.meta
    self._enrich_base_data(item, response, is_update=True)
    # ASIN comes from the URL path segment .../product/<asin>/...
    item['asin'] = re_search(r'product/(.*)/', response.url)
    sel = Selector(response)
    asin_divs = sel.xpath('//input[@id="ASIN"]/@value').extract()
    if len(asin_divs) > 0:
        item['parent_asin'] = ''.join(asin_divs[0]).strip()
    else:
        item['parent_asin'] = ''
    # Size list for this ASIN, pulled out of the variations JSON blob.
    item['size'] = re_search(r'\"%s\":\[(.*?)\]' % item['asin'], ''.join(sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
    item['dimensions_display'] = safely_json_loads(format_html_string(''.join(sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
    item['merchants'] = sel.xpath('//div[@id="merchant-info"]/a/text()').extract()
    # Third-party (3P) seller name / price / shipping cost.
    item['merchant_3p'] = ''.join(sel.xpath('//div[@id="soldByThirdParty"]/b/text()').extract()).strip()
    item['price_3p'] = ''.join(sel.xpath('//div[@id="soldByThirdParty"]/span[contains(@class, "price3P")]/text()').extract()).strip()
    shipping_cost_3p_string = ''.join(sel.xpath('//div[@id="soldByThirdParty"]/span[contains(@class, "shipping3P")]/text()').extract()).strip()
    item['shipping_cost_3p'] = extract_shipping_cost_price_from_shipping_cost_string(shipping_cost_3p_string)
    item['from_price'] = ''.join(sel.xpath('//div[@id="mbc"]/div[@class="a-box"]/div/span/span[@class="a-color-price"]/text()').extract()).strip()
    # Availability text can appear in several spans depending on page layout.
    availability_divs = [
        ''.join(sel.xpath('//div[@id="availability"]/span/text()').extract()),
        ''.join(sel.xpath('//span[@class="availRed"]/text()').extract()),
        ''.join(sel.xpath('//span[@class="availGreen"]/text()').extract())
    ]
    availability_str = ''.join(availability_divs).strip().lower()
    merchant_info_str = ''.join(sel.xpath('//div[@id="merchant-info"]/text()').extract()).strip().lower()
    # NOTE(review): availability_divs always has 3 elements, so the first
    # clause below is always false — presumably the joined string's length
    # was meant; confirm before changing.
    if (
        (len(availability_divs) <= 0) or
        availability_str.startswith('only') or
        availability_str.startswith('in stock') or
        availability_str.startswith('usually')
    ):
        item['availability'] = 'true'
        item['availability_reason'] = "001: %s" % availability_str
    elif (
        merchant_info_str.startswith('ships from and sold by')
    ):
        item['availability'] = 'true'
        item['availability_reason'] = "002: %s" % merchant_info_str
    elif (
        availability_str.startswith('available from')
    ):
        # Sold only by other sellers; price must be fetched from the buy box.
        item['availability'] = 'other'
        item['availability_reason'] = "003: %s" % availability_str
    elif availability_str.startswith('currently unavailable'):
        item['availability'] = 'false'
        item['availability_reason'] = "004: %s" % availability_str
    else:
        item['availability'] = 'false'
        item['availability_reason'] = '000: _'
    if item['availability'] in ['true']:
        # In stock: gather list price plus the various sale-price spans.
        item['list_price'] = ''.join([
            ''.join(sel.xpath('//div[@id="price"]//tr[1]/td[2]/text()').extract()).strip(),
            ''.join(sel.xpath('//span[@id="listPriceValue"]/text()').extract()).strip()
        ])
        item['price'] = ''.join([
            ''.join(sel.xpath('//span[@id="priceblock_ourprice"]/text()').extract()).strip(),
            ''.join(sel.xpath('//span[@id="priceblock_saleprice"]/text()').extract()).strip(),
            ''.join(sel.xpath('//span[@id="priceblock_dealprice"]/text()').extract()).strip(),
            ''.join(sel.xpath('//span[@id="actualPriceValue"]/b/text()').extract()).strip()
        ])
        if ((len(item['list_price']) + len(item['price'])) <= 0):
            # No price found anywhere: dump the body for offline debugging.
            #self.log("response body ILLEGAL: %s, %d, %d. Dumping ..." % (item['asin'], response.status, len(response.body)))
            self._logger.info("response body ILLEGAL: %s, %d, %d. Dumping ..."
                              % (item['asin'], response.status, len(response.body)))
            dump_response_body(item['asin'], response.body)
        shipping_cost_string_ourprice = ''.join(sel.xpath('//*[@id="ourprice_shippingmessage"]/span/text()').extract()).strip()
        shipping_cost_string_saleprice = ''.join(sel.xpath('//*[@id="saleprice_shippingmessage"]/span/text()').extract()).strip()
        shipping_cost_string = shipping_cost_string_ourprice or shipping_cost_string_saleprice
        item['shipping_cost'] = extract_shipping_cost_price_from_shipping_cost_string(shipping_cost_string)
        self._logger.info("Spiderid: %s Crawlid: %s yield item in parse, asin: %s" % (response.meta['spiderid'],response.meta['crawlid'],item.get("asin", "unknow")))
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid']
        )
        return item
    elif item['availability'] in ['other']:
        # Price only in the "unqualified buy box": follow its link to fetch
        # shipping cost on a second page, carrying the half-built item along.
        item['price'] = ''.join([
            ''.join(sel.xpath('//*[@id="unqualifiedBuyBox"]//span[@class="a-color-price"]/text()').extract()).strip()
        ])
        new_url = ''.join(sel.xpath('//div[@id="unqualifiedBuyBox"]/div/div[1]/a/@href').extract()).strip()
        new_url = urljoin(response.url, new_url)
        meta['item_half'] = item
        req = Request(
            url=new_url,
            meta=meta,
            callback=self.parse_shipping_cost,
            dont_filter=response.request.dont_filter
        )
        self._logger.info("Spiderid: %s Crawlid: %s yield request in parse, asin: %s" % (response.meta['spiderid'],response.meta['crawlid'],req.meta.get("asin", "unknow")))
        return req
    else:
        # Unavailable: return whatever was gathered.
        self._logger.info("yield item in parse, asin: %s" % item.get("asin", "unknow"))
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid']
        )
        return item
def _enrich_same_part(self, item, response):
    """Grab MSRP, the sale price and the dtmProductId JS variable."""
    sel = Selector(response)
    msrp_nodes = sel.xpath('//*[@id="divPricing"]/span[1]/s/text()').extract()
    item['suggested_price'] = ''.join(msrp_nodes).strip()
    price_nodes = sel.xpath('//*[@id="productprice"]/span/text()').extract()
    item['our_price'] = ''.join(price_nodes).strip()
    product_ids = sel.re(r"dtmProductId = \'(.*?)\'")
    item['product_id'] = ''.join(product_ids).strip()
def get_post_item(response):
    # Build a PostItem from a forum post page: ids, breadcrumb path, title,
    # author, level/points, timestamps, content and picture links.
    url = response.url
    sel = Selector(response)
    post_item = PostItem()
    post_item['url'] = url
    # forum/thread ids are defined as JS variables on the page.
    forum_id = sel.re(r'var\s+forumid\s*=\s*(.*);')[0]
    thread_id = sel.re(r'var\s+threadid\s*=\s*(.*);')[0]
    post_item['post_id'] = forum_id + '_' + thread_id
    path_text = sel.xpath(
        '//div[contains(@class, "breadcrumbs")]/a/text()').extract()
    path_href = sel.xpath(
        '//div[contains(@class, "breadcrumbs")]/a/@href').extract()
    post_item['path_text'] = ', '.join(path_text)
    # Breadcrumb hrefs are made absolute before being joined.
    post_item['path_href'] = ', '.join(
        [response.urljoin(p_href) for p_href in path_href if p_href])
    # string(.) flattens any nested markup in the title node to plain text.
    title = sel.xpath('//*[@id="chan_newsTitle"]').xpath(
        'string(.)').extract_first()
    post_item['title'] = check_value(title)
    key_words = sel.xpath('//meta[@name="keywords"]/@content').extract_first()
    post_item['key_words'] = check_value(key_words)
    hot_words = sel.xpath('//div[@class="hotWords"]/a/text()').extract()
    post_item['hot_words'] = ', '.join(hot_words)
    author_id = sel.xpath(
        '//span[@class="author"]/a[@name="onlineIcon"]/@_webim_ppid'
    ).extract_first()
    post_item['author_id'] = check_value(author_id)
    author_name = sel.xpath('//span[@class="author"]/a/text()').extract_first()
    post_item['author_name'] = check_value(author_name)
    level = sel.xpath('//span[@class="level"]/img/@title').extract_first()
    post_item['level'] = check_value(level)
    point = sel.xpath(
        '//div[@class="grade"]/span[not(@class)]/text()').extract_first()
    # Skips a 3-character prefix in the points text (presumably a label —
    # verify against the live page).
    if point and len(point) > 3:
        post_item['point'] = point[3:]
    else:
        post_item['point'] = ''
    date_time = sel.xpath('//li[@class="time"]/span/text()').extract_first()
    # Likewise skips the first 4 characters of the time text.
    if date_time and len(date_time) > 4:
        post_item['date_time'] = date_time[4:]
    else:
        post_item['date_time'] = ''
    # View/reply counters are loaded by a separate script; record its URL.
    num_href = sel.xpath(
        '//div[@class="postStaticData"]/span/script/@src').extract_first()
    post_item['_num_href'] = check_value(num_href)
    content, picture_hrefs = get_content(response)
    post_item['content'] = check_value(content)
    post_item['picture_hrefs'] = picture_hrefs
    # Comment ids are filled in later by the comment parser.
    post_item['comment_ids'] = []
    post_item['parse_time'] = time.time()
    return post_item
def parse_item(self, response):
    # Full-crawl pass for an Amazon product page: category nodes, title,
    # specifications/description, brand and the variation JSON blobs.
    self._logger.info("start response in parse_item -> response type:%s" %
                      type(response).__name__)
    sel = Selector(response)
    item = AmazonItem()
    self._enrich_base_data(item, response, is_update=False)
    node_id_re = re.compile(r'node=(?P<node_id>\w+)')
    # breadcrum
    node_id_hrefs = sel.xpath(
        '//div[@id="wayfinding-breadcrumbs_feature_div"]//a/@href').extract()
    item['node_ids'] = [
        node_id_re.search(x).group('node_id') for x in node_id_hrefs
        if node_id_re.search(x)
    ]
    # Look for Similar Items by Category
    similar_node_id_links = [
        x.xpath('a/@href').extract()
        for x in sel.xpath('//div[@id="browse_feature_div"]/div/p')
    ]
    # NOTE(review): unlike node_ids above, this comprehension does not filter
    # on a successful match, so a non-matching href raises AttributeError.
    item['similar_node_ids'] = [[
        node_id_re.search(x).group('node_id') for x in links
    ] for links in [links for links in similar_node_id_links]]
    item['parent_asin'] = ''.join(sel.re(r'"parent_asin":"(.*?)"')).strip()
    if len(item['parent_asin']) == 0:
        # Fall back to the add-to-cart form's ASIN field.
        item['parent_asin'] = ''.join(
            sel.xpath('//form[@id="addToCart"]/input[@id="ASIN"]/@value').
            extract()).strip()
    item['title'] = ''.join(
        sel.xpath('//span[@id="productTitle"]/text()').extract()).strip()
    item['product_specifications'] = format_html_string(''.join(
        sel.xpath('//div[@id="technicalSpecifications_feature_div"]//table'
                  ).extract()).strip())
    item['product_description'] = format_html_string(''.join(
        sel.xpath('//div[@id="productDescription"]//p/text()').extract()).
        strip())
    # Brand: prefer the /<brand>/b/ href, else the visible link text.
    brand_href = ''.join(
        sel.xpath('//a[@id="brand"]/@href').extract()).strip()
    brand_re = re.compile(r'^/(?P<brand>.*)/b/')
    m = brand_re.search(brand_href)
    if m:
        brand = brand_re.search(brand_href).group('brand')
    else:
        brand = ''.join(
            sel.xpath('//a[@id="brand"]/text()').extract()).strip()
    item['brand'] = brand
    item['feature'] = format_html_string(''.join(
        sel.xpath('//div[@id="feature-bullets"]').extract()).strip())
    item['dimensions_display'] = safely_json_loads(
        format_html_string(''.join(
            sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
    item['variations_data'] = safely_json_loads(''.join(
        sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
    enrich_color_images(item, sel)
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
    return item
def parse_item_update(self, response):
    # Re-crawl pass for an existing Amazon product: refreshes price, stock
    # and shipping fields and classifies availability. Returns an item, or a
    # follow-up Request when the price lives on the "unqualified buy box" page.
    self._logger.info(
        "start response in parse_item_update -> response type:%s" %
        type(response).__name__)
    item = AmazonItem()
    meta = response.meta
    self._enrich_base_data(item, response, is_update=True)
    # ASIN comes from the URL path segment .../product/<asin>/...
    item['asin'] = re_search(r'product/(.*)/', response.url)
    sel = Selector(response)
    asin_divs = sel.xpath('//input[@id="ASIN"]/@value').extract()
    if len(asin_divs) > 0:
        item['parent_asin'] = ''.join(asin_divs[0]).strip()
    else:
        item['parent_asin'] = ''
    # Size list for this ASIN, pulled out of the variations JSON blob.
    item['size'] = re_search(
        r'\"%s\":\[(.*?)\]' % item['asin'],
        ''.join(sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
    item['dimensions_display'] = safely_json_loads(
        format_html_string(''.join(
            sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
    item['merchants'] = sel.xpath(
        '//div[@id="merchant-info"]/a/text()').extract()
    # Third-party (3P) seller name / price / shipping cost.
    item['merchant_3p'] = ''.join(
        sel.xpath(
            '//div[@id="soldByThirdParty"]/b/text()').extract()).strip()
    item['price_3p'] = ''.join(
        sel.xpath(
            '//div[@id="soldByThirdParty"]/span[contains(@class, "price3P")]/text()'
        ).extract()).strip()
    shipping_cost_3p_string = ''.join(
        sel.xpath(
            '//div[@id="soldByThirdParty"]/span[contains(@class, "shipping3P")]/text()'
        ).extract()).strip()
    item[
        'shipping_cost_3p'] = extract_shipping_cost_price_from_shipping_cost_string(
            shipping_cost_3p_string)
    item['from_price'] = ''.join(
        sel.xpath(
            '//div[@id="mbc"]/div[@class="a-box"]/div/span/span[@class="a-color-price"]/text()'
        ).extract()).strip()
    # Availability text can appear in several spans depending on page layout.
    availability_divs = [
        ''.join(
            sel.xpath('//div[@id="availability"]/span/text()').extract()),
        ''.join(sel.xpath('//span[@class="availRed"]/text()').extract()),
        ''.join(sel.xpath('//span[@class="availGreen"]/text()').extract())
    ]
    availability_str = ''.join(availability_divs).strip().lower()
    merchant_info_str = ''.join(
        sel.xpath('//div[@id="merchant-info"]/text()').extract()).strip(
        ).lower()
    # NOTE(review): availability_divs always has 3 elements, so the first
    # clause below is always false — presumably the joined string's length
    # was meant; confirm before changing.
    if ((len(availability_divs) <= 0)
            or availability_str.startswith('only')
            or availability_str.startswith('in stock') or
            availability_str.startswith('usually')):
        item['availability'] = 'true'
        item['availability_reason'] = "001: %s" % availability_str
    elif (merchant_info_str.startswith('ships from and sold by')):
        item['availability'] = 'true'
        item['availability_reason'] = "002: %s" % merchant_info_str
    elif (availability_str.startswith('available from')):
        # Sold only by other sellers; price must be fetched from the buy box.
        item['availability'] = 'other'
        item['availability_reason'] = "003: %s" % availability_str
    elif availability_str.startswith('currently unavailable'):
        item['availability'] = 'false'
        item['availability_reason'] = "004: %s" % availability_str
    else:
        item['availability'] = 'false'
        item['availability_reason'] = '000: _'
    if item['availability'] in ['true']:
        # In stock: gather list price plus the various sale-price spans.
        item['list_price'] = ''.join([
            ''.join(
                sel.xpath('//div[@id="price"]//tr[1]/td[2]/text()').
                extract()).strip(),
            ''.join(
                sel.xpath('//span[@id="listPriceValue"]/text()').
                extract()).strip()
        ])
        item['price'] = ''.join([
            ''.join(
                sel.xpath('//span[@id="priceblock_ourprice"]/text()').
                extract()).strip(),
            ''.join(
                sel.xpath('//span[@id="priceblock_saleprice"]/text()').
                extract()).strip(),
            ''.join(
                sel.xpath('//span[@id="priceblock_dealprice"]/text()').
                extract()).strip(),
            ''.join(
                sel.xpath('//span[@id="actualPriceValue"]/b/text()').
                extract()).strip()
        ])
        if ((len(item['list_price']) + len(item['price'])) <= 0):
            # No price found anywhere: dump the body for offline debugging.
            #self.log("response body ILLEGAL: %s, %d, %d. Dumping ..." % (item['asin'], response.status, len(response.body)))
            self._logger.info(
                "response body ILLEGAL: %s, %d, %d. Dumping ..." %
                (item['asin'], response.status, len(response.body)))
            dump_response_body(item['asin'], response.body)
        shipping_cost_string_ourprice = ''.join(
            sel.xpath('//*[@id="ourprice_shippingmessage"]/span/text()').
            extract()).strip()
        shipping_cost_string_saleprice = ''.join(
            sel.xpath('//*[@id="saleprice_shippingmessage"]/span/text()').
            extract()).strip()
        shipping_cost_string = shipping_cost_string_ourprice or shipping_cost_string_saleprice
        item[
            'shipping_cost'] = extract_shipping_cost_price_from_shipping_cost_string(
                shipping_cost_string)
        self._logger.info(
            "Spiderid: %s Crawlid: %s yield item in parse, asin: %s" %
            (response.meta['spiderid'], response.meta['crawlid'],
             item.get("asin", "unknow")))
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid'])
        return item
    elif item['availability'] in ['other']:
        # Price only in the "unqualified buy box": follow its link to fetch
        # shipping cost on a second page, carrying the half-built item along.
        item['price'] = ''.join([
            ''.join(
                sel.xpath(
                    '//*[@id="unqualifiedBuyBox"]//span[@class="a-color-price"]/text()'
                ).extract()).strip()
        ])
        new_url = ''.join(
            sel.xpath('//div[@id="unqualifiedBuyBox"]/div/div[1]/a/@href').
            extract()).strip()
        new_url = urljoin(response.url, new_url)
        meta['item_half'] = item
        req = Request(url=new_url,
                      meta=meta,
                      callback=self.parse_shipping_cost,
                      dont_filter=response.request.dont_filter)
        self._logger.info(
            "Spiderid: %s Crawlid: %s yield request in parse, asin: %s" %
            (response.meta['spiderid'], response.meta['crawlid'],
             req.meta.get("asin", "unknow")))
        return req
    else:
        # Unavailable: return whatever was gathered.
        self._logger.info("yield item in parse, asin: %s" %
                          item.get("asin", "unknow"))
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid'])
        return item