def test_first_match_unicode_list(self):
    """first_match should return the leading element of a unicode list."""
    selectors = SelectorList()
    selectors.extract = Mock(
        return_value=[u'one', u'two', u'three', u'four'])
    result = Utils.first_match(selectors)
    assert result == u'one', "First item from unicode list incorrect!"
def test_first_match_string_list(self):
    """first_match should return the leading element of a plain-str list."""
    selectors = SelectorList()
    selectors.extract = Mock(
        return_value=['one', 'two', 'three', 'four'])
    result = Utils.first_match(selectors)
    assert result == 'one', "First item from string list incorrect!"
def parse(self, response):
    """Follow every sub-category link under #nav_custom, skipping Accessories."""
    menu = response.css('#nav_custom').xpath('li/ul')
    entries = menu.xpath('li')
    if self.categories:
        # Restrict to <li> nodes whose span text equals a requested category.
        # NOTE: the xpath is still issued for a falsy category name, matching
        # the original comprehension's innermost-`if` evaluation order.
        matched = []
        for cat in self.categories:
            for node in menu.xpath('li[a[span[text() = "' + cat + '"]]]'):
                if cat:
                    matched.append(node)
        entries = SelectorList(matched)
    hrefs = entries.xpath(
        'ul/li/a[span[not(contains(text(), "Accessories"))]]/@href').extract()
    for url in hrefs:
        yield Request("%s?limit=all" % url, callback=self.parse_items)
def parse(self, response):
    """Walk the #top-menu category tree and request each category listing page.

    Skips "All ..." aggregate links; optionally restricts to self.categories.
    """
    menu = response.css('#top-menu').xpath('ul/li/ul')
    # BUG FIX: the original called `menu.xpath('li').extract()`, producing a
    # plain list of strings; the later `lis.xpath(...)` then raised
    # AttributeError whenever self.categories was empty. Keep the selectors
    # instead (consistent with the sibling spiders).
    lis = menu.xpath('li')
    if self.categories:
        lis = SelectorList([
            li for cat in self.categories
            for li in menu.xpath('li[h3[contains(text(), "' + cat + '")]]')
            if cat
        ])
    for url in lis.xpath('ul/li/a[not(contains(text(), "All"))]/@href').extract():
        yield Request(url, callback=self.parse_items)
def parse(self, response):
    """Follow every category link found in the #vmenu_69 menu."""
    menu = get_extracted(response.css('#vmenu_69'))
    entries = menu.xpath('li')
    if self.categories:
        # Keep only <li> nodes whose anchor text equals a requested category.
        # The xpath is still issued for a falsy category name, matching the
        # original comprehension's innermost-`if` evaluation order.
        picked = []
        for cat in self.categories:
            for node in menu.xpath('li[div/a[text() = "' + cat + '"]]'):
                if cat:
                    picked.append(node)
        entries = SelectorList(picked)
    for link in entries.xpath('div/a/@href').extract():
        yield Request(link, callback=self.parse_items)
def test_trim_list_mixed_list(self):
    """trim_list should strip whitespace from a mixed str/unicode list."""
    selectors = SelectorList()
    selectors.extract = Mock(return_value=[
        ' one ', u' two point five ', 'three ', u' four'
    ])
    expected = ['one', 'two point five', 'three', 'four']
    actual = Utils.trim_list(selectors)
    assert expected == actual, "Mixed list incorrect trimmed"
def select_from(self, selector: SelectorList) -> SelectorList:
    """Apply this container's CSS selector to `selector`.

    When nothing matches: raise RuntimeError if self.raise_on_missed is set,
    otherwise log a warning and return an empty SelectorList.
    """
    found = selector.css(self.string_selector)
    if found:
        return found
    msg = 'Not found any "{}" containers.'.format(self.name)
    if self.raise_on_missed:
        raise RuntimeError(msg)
    self.logger.warning(msg)
    return SelectorList([])
def test_trim_list_string_list(self):
    """trim_list should strip surrounding whitespace from every str entry."""
    selectors = SelectorList()
    selectors.extract = Mock(return_value=[
        ' one ', ' two point five ', 'three ', ' four'
    ])
    expected = ['one', 'two point five', 'three', 'four']
    actual = Utils.trim_list(selectors)
    assert expected == actual, "String list incorrect trimmed: {%s} vs {%s}" % (
        ', '.join(map(str, expected)), ', '.join(map(str, actual)))
def test_trim_list_unicode_list(self):
    """trim_list should strip surrounding whitespace from every unicode entry."""
    selectors = SelectorList()
    selectors.extract = Mock(return_value=[
        u' one ', u' two point five ', u'three ', u' four'
    ])
    expected = [u'one', u'two point five', u'three', u'four']
    actual = Utils.trim_list(selectors)
    # `unicode` is the Python 2 text type; kept for parity with the original.
    assert expected == actual, "Unicode list incorrect trimmed: {%s} vs {%s}" % (
        ', '.join(map(unicode, expected)), ', '.join(map(unicode, actual)))
def parse(self, response):
    """Walk the #top-menu category tree and request each category listing page.

    Skips "All ..." aggregate links; optionally restricts to self.categories.
    """
    menu = response.css('#top-menu').xpath('ul/li/ul')
    # BUG FIX: the original called `menu.xpath('li').extract()`, producing a
    # plain list of strings; the later `lis.xpath(...)` then raised
    # AttributeError whenever self.categories was empty. Keep the selectors
    # instead (consistent with the sibling spiders).
    lis = menu.xpath('li')
    if self.categories:
        lis = SelectorList([
            li for cat in self.categories
            for li in menu.xpath('li[h3[contains(text(), "' + cat + '")]]')
            if cat
        ])
    for url in lis.xpath(
            'ul/li/a[not(contains(text(), "All"))]/@href').extract():
        yield Request(url, callback=self.parse_items)
def _make_selector_list(self, elems, is_text, text_recurse, attr):
    """Wrap raw webdriver element(s) into a SelectorList.

    Text mode wraps each text fragment in a _TextNode; otherwise the elements
    are converted via _make_result, optionally projected onto an attribute.
    """
    # Exact `type(...) is list` check preserved: list subclasses are
    # deliberately re-wrapped, as in the original.
    if type(elems) is not list:
        elems = [elems]
    if is_text:
        text_nodes = []
        for elem in elems:
            for fragment in self._text_content(elem, text_recurse):
                text_nodes.append(_TextNode(self.webdriver, fragment))
        return SelectorList(text_nodes)
    converted = self._make_result(elems)
    if attr:
        converted = (_NodeAttribute(s.element, attr) for s in converted)
    return SelectorList(converted)
def parse_comment(self, response):
    """Parse a movie short-comment page and yield one CommentItem per comment."""
    page = Selector(response)
    movie_name = page.xpath('//h1/text()').extract_first().replace('短评', '').strip()
    comments = SelectorList(
        page.xpath('//div[@class="comment"]').extract())
    for raw_comment in comments:
        # Re-parse each comment fragment once instead of per-field.
        fragment = Selector(text=raw_comment)
        shorts = fragment.xpath('//p/span/text()').extract_first()
        votes = fragment.xpath(
            '//h3/span[@class="comment-vote"]/span/text()').extract_first()
        stars = fragment.xpath(
            '//h3/span[@class="comment-info"]/span[contains(@class,"rating")]/@class'
        ).extract_first()
        if stars:
            stars = stars.split()[0].replace('allstar',
                                             '').strip().replace('0', '')
        else:
            stars = 0
        comment_time = fragment.xpath(
            '//h3/span[@class="comment-info"]/span[@class="comment-time "]/text()'
        ).extract_first()
        comment_item = CommentItem()
        comment_item['movie_name'] = movie_name
        comment_item['shorts'] = shorts
        comment_item['stars'] = stars
        comment_item['votes'] = votes
        comment_item['comment_time'] = comment_time
        yield comment_item
def parse_body(obj_name: str, logger: Logger,
               selectors: SelectorList) -> 'str | None':
    """Convert the selected lines of a chapter section into plain text.

    Returns a newline-joined string, or None when a non-body section matched
    nothing. Ruby annotations (<rb>/<rt>) are rendered as `|base《reading》`.
    NOTE(review): original annotation was `str or None`, which evaluates to
    just `str`; replaced with an equivalent string annotation.
    """
    logger.info(
        f'--------------------------parse_body() {obj_name}, selectors[0]: {selectors.get(default="")[:50] or None}'
    )
    # Only the mandatory 'body' section may be empty without aborting.
    if obj_name != 'body' and not selectors.get():
        return

    def parse_line(selector: Selector) -> str:
        # A lone <br> renders as an empty line.
        if selector.css('br').get():
            return ''
        elif selector.css('a[href*="mitemin"]').get():
            # Embedded illustration link; [1] is the rendered-text part of the
            # tuple returned by parse_mitemin_href.
            return parse_mitemin_href(f'an illust in {obj_name}', logger,
                                      selector)[1]
        texts = validate(f"a line of {obj_name}", logger,
                         selector.css("*::text").getall())
        # Single text fragment means no ruby markup on this line.
        if len(texts) == 1:
            return texts[0]
        larges = validate(f"words in a line of {obj_name}", logger,
                          selector.css('rb::text').getall())
        smalls = validate(f"rubies in a line of {obj_name}", logger,
                          selector.css('rt::text').getall())
        ruby_texts = [
            f'|{large}《{small}》'
            for large, small in zip(larges, smalls)
        ]
        # Interleave plain fragments with rendered rubies; zip_longest keeps
        # trailing fragments when counts differ.
        return ''.join([
            ''.join(pair)
            for pair in zip_longest(texts, ruby_texts, fillvalue='')
        ])

    return '\n'.join(map(parse_line, selectors))
def childes(
        selector: SelectorList,
        parent_tag: str,
) -> SelectorList:
    """Collect the direct children of `parent_tag` by probing :nth-child(i).

    Raises TypeError when `parent_tag` is not a str. Stops at the first index
    that matches nothing.
    """
    if not isinstance(parent_tag, str):
        raise TypeError('Given `parent_tag` is not `str` object.')
    collected = SelectorList()
    template = parent_tag + ' > :nth-child({i})'
    index = 1
    while True:
        child = selector.css(template.format(i=index))
        if not child:
            return collected
        collected.append(child)
        index += 1
def parse_href_end_num(obj_name: str, logger: Logger,
                       selector: SelectorList) -> int:
    """Return the second-to-last path segment of the node's href as an int."""
    logger.info('--------------------------parse_href_end_num()')
    # e.g. 'https://hoge.com/huga/114514/' -> segments ['...', '114514', '']
    url: str = validate(f'url of {obj_name}', logger,
                        selector.xpath('@href').get())
    segment = safe_get(url.split('/'), -2)
    return int(validate(f'{obj_name} in url', logger, segment))
def parse_model(self, listing):
    """Scrape one vehicle-detail page into a populated VehicleInfo item.

    Yields a single VehicleInfo. Year/make/model are derived from the page
    URL's last segment; everything else comes from fixed XPath/CSS positions
    on the detail page.
    """
    vehicle = VehicleInfo()
    # Full display name from the title header, e.g. "<year> <make> <model>".
    full_name = SelectorList(
        listing.xpath(
            '//*[@id="vdp-title"]/div/div/div[1]/div[1]/div[1]/h1/text()')
    ).extract_first()
    url_split = listing.url.split('/')
    # Last URL segment with the "detail-" prefix removed.
    # NOTE(review): `url_split[url_split.__len__() - 1]` is just
    # `url_split[-1]`; kept as-is ("ulr_path" typo included).
    ulr_path = re.sub('detail-', '', url_split[url_split.__len__() - 1])
    # First 4-digit run in the slug is taken as the model year.
    vehicle['year'] = re.findall('[0-9]{4}', ulr_path)[0]
    vehicle['make'] = re.sub('_', ' ', re.split('-', ulr_path)[1]).capitalize()
    # Model = display name minus the leading "<year> <make> " prefix.
    vehicle['model'] = re.sub(
        vehicle['year'] + ' ' + vehicle['make'] + ' ', '', full_name)
    vehicle['domain'] = self.domain
    vehicle['trim'] = (
        listing.xpath('//*[@id="vdp-1-toggle"]/div[2]/div/div[2]/text()'
                      ).extract_first()).strip()
    # Details table rows 1-4: exterior color, interior color, stock #, VIN.
    vehicle['ext_color'] = listing.xpath(
        '//*[@id="tab-details"]/div[3]/table/tbody/tr[1]/td[2]/text()'
    ).extract_first().strip()
    vehicle['int_color'] = listing.xpath(
        '//*[@id="tab-details"]/div[3]/table/tbody/tr[2]/td[2]/text()'
    ).extract_first().strip()
    vehicle['stock_no'] = listing.xpath(
        '//*[@id="tab-details"]/div[3]/table/tbody/tr[3]/td[2]/text()'
    ).extract_first().strip()
    # New vehicles: mileage hard-coded to zero.
    vehicle['miles'] = "0"
    vehicle['vin'] = listing.xpath(
        '//*[@id="tab-details"]/div[3]/table/tbody/tr[4]/td[2]/text()'
    ).extract_first()
    vehicle['url'] = listing.url
    # Price shown with a leading "$"; strip it for the numeric field.
    vehicle['price'] = listing.xpath(
        '//*[@id="vdp-price"]/div/h4/text()').extract_first().lstrip('$')
    vehicle['veh_state'] = "new"
    # Specs table rows 3-5: engine, transmission, drivetrain.
    vehicle['engine'] = listing.xpath(
        '//*[@id="tab-details"]/div[2]/table/tbody/tr[3]/td[2]/text()'
    ).extract_first()
    vehicle['transmission'] = listing.xpath(
        '//*[@id="tab-details"]/div[2]/table/tbody/tr[4]/td[2]/text()'
    ).extract_first()
    vehicle['drivetrain'] = listing.xpath(
        '//*[@id="tab-details"]/div[2]/table/tbody/tr[5]/td[2]/text()'
    ).extract_first()
    vehicle['body_type'] = listing.xpath(
        '//*[@id="vdp-title"]/div/div/div[1]/div[1]/div[1]/h4/text()'
    ).extract_first().strip()
    vehicle['title'] = listing.xpath(
        '/html/head/title/text()').extract_first()
    # Slideshow photo hrefs are protocol-relative; prefix "http:" and join
    # into one comma-separated string (trailing comma stripped).
    image_urls = listing.css(
        '#tab-slideshow-photos .swiper-slide a::attr(href)').extract()
    result = ""
    for img in image_urls:
        result = result + "http:" + img + ","
    vehicle['image'] = result.rstrip(',')
    yield vehicle
def parse(self, response):
    """Yield one record per node matched by the configured css/xpath spec.

    `self.record_spec` may carry a 'css' or an 'xpath' expression; with
    neither present, nothing is yielded.
    """
    if self.record_spec.get('css'):
        matched = response.css(self.record_spec['css'])
    elif self.record_spec.get('xpath'):
        matched = response.xpath(self.record_spec['xpath'])
    else:
        matched = SelectorList()
    for record_selector in matched:
        yield self.get_record_fields(record_selector)
def parse_article_item(self, response, extra_info):
    """Build a CommonNewsItem from an article page plus feed metadata.

    `extra_info` values (scr/cid/media_id/release_time/abstract) take
    precedence over page-scraped equivalents. On any scrape error the item
    is flagged by setting recom_time=None and None is returned.
    """
    item = CommonNewsItem()
    item['original_url'] = format_url(response.url)
    item['id'] = hash_digest(item.get('original_url'))
    try:
        item['scr'] = extra_info.get('scr')
        item['cid'] = extra_info.get('cid')
        item['media'] = extra_info.get('media_id')
        item['title'] = response.css(
            'div#article div.articlehead h1::text').extract_first().replace('\r\n', '')
        author = response.css(
            'div#article div.articlehead span.author::text').extract_first() or ''
        item['author'] = author.replace('\r\n', '')
        # Combine the <meta> publication date with the HH:MM from the
        # "updated" timestamp span.
        date_str = '%s %s' % (
            response.xpath('head/meta[contains(@name,"publication_date")]/@content').extract_first(),
            response.css('div#article.bloc span.timestampUpdatedright').re_first('\d+:\d\d'))
        if extra_info.get('release_time'):
            item['release_time'] = extra_info.get('release_time')
        else:
            # Epoch milliseconds; assumes local timezone — TODO confirm.
            item['release_time'] = int(
                (datetime.strptime(date_str, '%Y-%m-%d %H:%M')).timestamp() * 1000)
        item['recom_time'] = int(datetime.now().timestamp() * 1000)
        abstract = extra_info.get('abstract') or response.css(
            'div#article div.articlehead span.kicker::text').extract_first() or ''
        item['abstract'] = abstract.replace('\r\n', '')
        item['content_type'] = 0
        item['url'] = ''
        content_selector = response.css('div#articlebody')
        # og:image becomes a synthetic <img> selector used as the thumbnail.
        thumbnail_src = response.xpath(
            'head/meta[@property = "og:image"]/@content').extract_first()
        thumbnail_selector = SelectorList(
            [Selector(text='<img src="%s">' % thumbnail_src)])
        content, item['img'] = strip_html_imgs(content_selector, thumbnail_selector)
        # extract videos; content_type 1 = has video, 0 = text-only
        html, item['video'] = extract_content_videos(content)
        if item['video']:
            item['content_type'] = 1
        else:
            item['content_type'] = 0
        # format html: strip attributes, then drop all tags except those
        # starting with 'p' or '!' (keeps <p>, comments, etc.)
        html = strip_html_attrs(html)
        html = re.sub(r'</?[^p!][^>]*>', '', html)
        item['content'] = html
        return item
    except Exception as e:
        # Best-effort: log and mark the item invalid instead of propagating.
        print('failed to parse_article, url: %s' % response.url, e)
        traceback.print_exc()
        item['recom_time'] = None  # illegal flag
        return
import logging
def parse_product(self, response):
    """Parse a Tmall product page into an MTSGetdataItem (Python 2 code).

    Separates the SKU property lists into color entries (with images) and
    size entries, extracts the TShop.Setup JSON for the SKU price map, and
    yields the item only when at least one color has pricing.
    """
    # Thumbnail image URLs and the raw attributes block.
    pic = response.xpath('//ul[@id="J_UlThumb"]/li/a/img/@src').extract()
    detail = response.xpath(
        '//div[@class="attributes" and @id="attributes"]').extract()
    # Color properties carry the "tb-img" class; total_list is every SKU prop.
    color_pics = response.xpath(
        '//dd/ul[contains(@class,"tm-clear J_TSaleProp tb-img")]/li')
    total_list = response.xpath(
        '//dd/ul[contains(@class,"tm-clear J_TSaleProp")]/li')
    # Third script tag holds the TShop.Setup(...) payload.
    json_getter = response.xpath('//div[@class="tm-clear"]/script[3]').extract()
    # FIXME(review): the next statement is garbled — the right-hand side of
    # the `extract_pic` assignment appears to have been lost (a `print`
    # statement cannot be an expression in Python 2). Restore the original
    # assignment before running.
    extract_pic = print len(extract_pic)
    # Sizes are the SKU props that are not color props.
    size_list = SelectorList()
    for i in total_list:
        if i.xpath('a/span/text()').extract()[0] not in color_pics.xpath('a/span/text()').extract():
            size_list.append(i)
    # cnt = 0
    # sku_type = 0
    # for i in total_list:
    #     cnt = cnt + 1
    #     if i.xpath('a/span/text()').extract()[0] not in color_pics.xpath('a/span/text()').extract():
    #         size_list.append(i)
    #         sku_type = cnt
    # if sku_type > len(total_list)-len(color_pics):
    #     sku_type = 0
    # else:
    #     sku_type = 1
    #
    # if len(size_list)==0:
    #     sku_type = 3
    #
    # cut_line = len(total_list)-len(color_pics)
    # Map each property's display name to its data-value id.
    str_val_map = {}
    for i in range(len(total_list)):
        v = total_list[i].xpath('@data-value').extract()
        n = total_list[i].xpath('a/span/text()').extract()
        if len(n)>0 and len(v)>0:
            str_val_map[n[0]] = v[0]
        else:
            print "Value Error"
    #size_list = total_list[:cut_line]
    # Everything after "TShop.Setup(" is (approximately) the config JSON.
    st = json_getter[0].split('TShop.Setup(')[1]
    info_dict = self.python_getter(st)
    skuMap = info_dict["valItemInfo"]["skuMap"]
    product_img = []
    for i in pic:
        tmp = self.resize_pic(i)
        product_img.append(tmp)
    # color_set = selen.single_page(response.url)
    color_set = []
    for i in color_pics:
        color = {}
        color['color'] = i.xpath('@title').extract()[0]
        try:
            # Pull the swatch image URL out of the inline style's url(...)
            # and drop the size suffix after the final underscore.
            tmp_str = i.xpath('a/@style').extract()[0]
            tmp_str = tmp_str[tmp_str.find('(')+3:tmp_str.rfind(')')]
            tmp_str = tmp_str[0:tmp_str.rfind('_')]
            color['image_url'] = tmp_str
        except:
            # No usable swatch style; fall back to the first product image.
            color['image_url'] = product_img[0]
        color['alternative_image_urls'] = product_img
        color['pricing_list'] = self.get_pricing(
            skuMap, str_val_map, i, size_list, response.meta['price'])
        # Only keep colors that resolved to at least one price.
        if len(color['pricing_list'])>0:
            color_set.append(color)
    item = MTSGetdataItem()
    item['product_url'] = response.meta['product_url']
    item['item_id'] = response.meta['item_id']
    item['title'] = response.meta['title']
    item['brand'] = 'Midi'
    item['merchant'] = 'Tmall'
    item['product_description'] = ''
    item['product_detail'] = detail[0]
    item['colors'] = color_set
    item['categories'] = info_dict['itemDO']['categoryId']
    if len(item['colors'])>0:
        yield item
def select(selector: SelectorList, string_selector: str) -> SelectorList:
    """Run a single CSS query against `selector` and return the matches."""
    matches = selector.css(string_selector)
    return matches
def select_script(self, script, *args):
    """Return elements produced by executing a JavaScript snippet."""
    raw = self.webdriver.execute_script(script, *args)
    elements = self._make_result(raw)
    return SelectorList(elements)
def test_first_match_empty_list(self):
    """first_match must yield None for an empty SelectorList."""
    empty = SelectorList([])
    result = Utils.first_match(empty)
    assert result is None, "First item from empty list incorrect!"
def test_trim_list_empty_list(self):
    """trim_list must yield None when the selector extracts nothing."""
    selectors = SelectorList()
    selectors.extract = Mock(return_value=[])
    result = Utils.trim_list(selectors)
    assert result is None, "Empty list incorrect trimmed"