def parse(self, response):
    """Extract product data from Fybeca product tiles.

    Yields one loaded ``ProductoFybeca`` item per tile that contains a
    ``div.detail`` section; tiles without it carry no product data.
    """
    # FIX: removed a leftover debug print of the tile count.
    productos = response.css('div.product-tile-inner')
    for producto in productos:
        # Only tiles with a detail section are real products.
        if producto.css('div.detail'):
            producto_loader = ItemLoader(item=ProductoFybeca(),
                                         selector=producto)
            producto_loader.add_css('titulo', 'a.name::text')
            producto_loader.add_xpath(
                'precio_1',
                'div[contains(@class,"detail")]/div[@class="side"]/div[@class="price-member"]/div/@data-bind'
            )
            producto_loader.add_xpath(
                'precio_0',
                'div[contains(@class,"detail")]/div[@class="side"]/div[@class="price"]/@data-bind'
            )
            producto_loader.add_xpath(
                'imagen',
                'div[contains(@class,"detail")]/a[contains(@class,"image")]/img[contains(@id,"gImg")]/@src'
            )
            yield producto_loader.load_item()
def parse_row(self, response, row):
    """Build one IL home-nursing-agency license item from a data row.

    ``row`` is a mapping keyed by the source dataset's column headers
    (e.g. 'Home Nursing Agencies', 'Address', 'License #', 'Zip').
    """
    print(row)
    il = ItemLoader(item=IlHospitalLicensesSpiderItem())
    # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
    il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    il.add_value(
        'url',
        'https://data.illinois.gov/dataset/410idph_hospital_directory/resource/9bdedb85-77f3-490a-9bbd-2f3f5f227981'
    )
    il.add_value('sourceName', 'IL_Home_Nursing_Agency_Licenses')
    il.add_value('permit_type', "nursing_home_license")
    # NOTE(review): _getDBA presumably splits "company - DBA" strings into a
    # (company, dba) pair — confirm against its definition.
    name = self._getDBA(row['Home Nursing Agencies'])
    company_name = str(name[0]).replace(' -', '') if ' -' in str(
        name[0]) else name[0]
    # Zip values may arrive as floats (e.g. "60601.0"); strip the decimal tail.
    address = self.format__address_4(
        row['Address'], row['City'], 'IL',
        str(row['Zip']) if '.' not in str(row['Zip']) else str(
            row['Zip'])[:str(row['Zip']).rfind('.')])
    il.add_value('dba_name', name[1])
    il.add_value('permit_lic_no', row.get('License #', ''))
    il.add_value(
        'permit_lic_exp',
        self.format_date(row.get('Exp. Date', ''))
        if row.get('Exp. Date') else '')
    il.add_value('company_name', company_name)
    il.add_value('location_address_string', address)
    il.add_value('county', row.get('County', ''))
    il.add_value(
        'permit_lic_desc', "Medical License for " +
        company_name if company_name else "Medical License")
    il.add_value('company_phone', row.get('Phone', ''))
    il.add_value(
        'company_subtype',
        row.get('Type', '') if row.get('Type', '') else 'Nursing Home License')
    yield il.load_item()
def parse_busi_art(self, res):
    """Parse a CNN Business article page into a CNN item."""
    main = res.css('.container.js-social-anchor-start')
    loader = ItemLoader(item=CNN(), selector=main)
    loader.add_value('tag', res.meta['tag'])
    loader.add_value('crawled_at', self.crawled_at)
    loader.add_value('url', res.url)
    loader.add_css('title', 'h1.article-title.speakable::text')
    loader.add_xpath('timestamp', './/span[@class="cnnDateStamp"]/text()')
    # Gather all inline story images.
    loader.add_value(
        'image_urls',
        main.xpath('.//div[@id="storytext"]//img/@src').extract())
    loader.add_css('summary', 'h2.speakable::text')
    loader.add_xpath('text', './/p/text()')
    loader.add_value('source', self.source)
    return loader.load_item()
def parse_detail(self, response):
    """Parse a Zhihu question page, then request the first answers batch."""
    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_css('title', '.QuestionHeader-title::text')
    loader.add_css('content', '.QuestionHeader-detail')
    loader.add_value('url', response.url)
    question_id = response.meta.get('question_id', '')
    loader.add_value('question_id', question_id)
    loader.add_css('answer_num', 'h4.List-headerText span::text')
    # NOTE(review): click_num and watch_user_num use the same selector —
    # presumably item processors pick the right value from the pair; confirm.
    loader.add_css('click_num', '.NumberBoard-itemValue::attr(title)')
    loader.add_css('comment_num', '.QuestionHeader-Comment button::text')
    loader.add_css('watch_user_num', '.NumberBoard-itemValue::attr(title)')
    loader.add_css('topics', '.Popover div::text')
    loader.add_value(
        'crawl_time',
        datetime.datetime.now().strftime(SQL_DATETIME_FORMAT))
    question_item = loader.load_item()
    # First answers page: offset 0, page size 20.
    yield scrapy.Request(
        url=self.start_answer_url.format(question_id, 20, 0),
        callback=self.parse_answer)
    yield question_item
def parse(self, response):
    """Yield ProductoFybeca items (title + image) for tiles with details."""
    for tile in response.css('div.product-tile-inner'):
        # Skip tiles that have no detail section.
        if not tile.css('div.detail'):
            continue
        loader = ItemLoader(
            item=ProductoFybeca(),   # item class to populate
            selector=tile            # default selector for relative queries
        )
        # Emit scalar values instead of single-element lists.
        loader.default_output_processor = TakeFirst()
        loader.add_css('titulo', 'a.name::text')
        loader.add_xpath(
            'imagen',
            'div[contains(@class,"detail")]/a[contains(@class,"image")]/img[contains(@id,"gImg")]/@src'
        )
        yield loader.load_item()
def parse(self, response):
    """Parse an Etsy search-results page.

    Yields either bare product-URL items (URLS_ONLY mode) or requests to
    the product pages, then follows the next results page.
    """
    # Listing ids are the 5th path segment of each result href.
    hrefs = response.xpath(
        '//div[@data-search-results=""]/div//li//a/@href').extract()
    product_ids = [href.split("/")[4] for href in hrefs]
    print(f"#### FOUND {len(product_ids)} PRODUCTS")
    if self.URLS_ONLY:
        for product_id in product_ids:
            loader = ItemLoader(item=ProductItem(), response=response)
            loader.add_value('url',
                             f'https://www.etsy.com/listing/{product_id}')
            yield loader.load_item()
    else:
        for product_id in product_ids:
            # NOTE(review): COUNTER is read but never incremented here —
            # presumably updated in parse_product; confirm.
            if self.COUNTER < self.COUNT_MAX:
                yield scrapy.Request(
                    f'https://www.etsy.com/listing/{product_id}',
                    callback=self.parse_product,
                    dont_filter=True)
    # Pagination: the page number follows the last '=' in the URL.
    page_number = int(response.url.split('=')[-1])
    next_page_url = ('='.join(response.url.split('=')[:-1]) + '='
                     + str(page_number + 1))
    # Keep paginating only while the current page yielded results.
    if len(product_ids) > 0:
        yield scrapy.Request(next_page_url)
def parse(self, response):
    """Parse a Weibo friends-feed JSON page and follow the next cursor."""
    data = json.loads(response.text)
    for card in data[0]['card_group']:
        if not card.get('mblog'):
            self.logger.info('No mblog key')
            continue
        loader = ItemLoader(item=WeiboItem())
        loader.default_output_processor = Join()
        try:
            loader.add_value('user_name',
                             card['mblog']['user']['screen_name'])
            loader.add_value('time', card['mblog']['created_at'])
            loader.add_value('comments',
                             str(card['mblog']['comments_count']))
            loader.add_value('likes',
                             str(card['mblog']['attitudes_count']))
            loader.add_value('text', card['mblog']['text'])
            # Classify the post: repost, video, or original.
            if card['mblog'].get('retweeted_status'):
                loader.add_value('type', '转发')
            elif card['mblog'].get('page_info'):
                if card['mblog']['page_info'].get('video_details'):
                    loader.add_value('type', '视频')
                else:
                    loader.add_value('type', '原创')
            else:
                loader.add_value('type', '原创')
            yield loader.load_item()
        except KeyError:
            self.logger.error('KeyError')
    # Follow the next page of the feed via the cursor token.
    next_link = data[0]['next_cursor']
    yield scrapy.Request(
        'https://m.weibo.cn/feed/friends?version=v4&next_cursor={}&page=1'.
        format(str(next_link)),
        callback=self.parse)
def parse_article(self, response, date):
    """Extract title and body text from an article page; skip PDFs."""
    if 'pdf' in response.url:
        return
    loader = ItemLoader(Article())
    loader.default_output_processor = TakeFirst()
    title = response.xpath('//h3[@class="bronze"]/text()').get()
    if title:
        title = title.strip()
    # Keep only meaningful text nodes; '{' filters inline CSS/JS fragments.
    raw = response.xpath(
        '//div[contains(@class,"content")][h3]//text()').getall()
    kept = [text for text in raw if text.strip() and '{' not in text]
    content = "\n".join(kept).strip()
    loader.add_value('title', title)
    loader.add_value('date', date)
    loader.add_value('link', response.url)
    loader.add_value('content', content)
    return loader.load_item()
def parse_item(self, response):
    """ This function parses a property page.

    @url http://scrapybook.s3.amazonaws.com/properties/property_000000.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    loader = ItemLoader(item=PropertiesItem(), response=response)
    # Listing fields, each with its cleaning processors.
    loader.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                     MapCompose(str.strip, str.title))
    loader.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                     MapCompose(lambda i: i.replace(',', ''), float),
                     re='[,.0-9]+')
    loader.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                     MapCompose(str.strip), Join())
    loader.add_xpath('address',
                     '//*[@itemtype="http://schema.org/Place"][1]/text()',
                     MapCompose(str.strip))
    # Resolve relative image URLs against the page URL.
    loader.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                     MapCompose(response.urljoin))
    # Housekeeping fields.
    loader.add_value('url', response.url)
    loader.add_value('project', self.settings.get('BOT_NAME'))
    loader.add_value('spider', self.name)
    loader.add_value('server', socket.gethostname())
    loader.add_value('date', datetime.datetime.now())
    return loader.load_item()
def parse(self, response):
    """Parse a Zillow search-results JSON response and paginate.

    Yields one ZillowItem per listing, then requests the next page while
    pages remain.
    """
    current_page = response.request.meta['currentPage']
    json_resp = json.loads(response.body)
    houses = json_resp.get('searchResults').get('listResults')
    for house in houses:
        loader = ItemLoader(item=ZillowItem())
        loader.add_value('id', house.get('id'))
        loader.add_value('image_urls', house.get('imgSrc'))
        loader.add_value('detail_url', house.get('detailUrl'))
        loader.add_value('status_type', house.get('statusType'))
        loader.add_value('status_text', house.get('statusText'))
        loader.add_value('price', house.get('price'))
        loader.add_value('address', house.get('address'))
        loader.add_value('beds', house.get('beds'))
        loader.add_value('baths', house.get('baths'))
        loader.add_value('area_sqft', house.get('area'))
        loader.add_value('latitude', house.get('latLong').get('latitude'))
        loader.add_value('longitude', house.get('latLong').get('longitude'))
        loader.add_value('broker_name', house.get('brokerName'))
        loader.add_value('broker_phone', house.get('brokerPhone'))
        yield loader.load_item()
    total_pages = json_resp.get('searchList').get('totalPages')
    # BUG FIX: was `<=`, which incremented past the last page and requested
    # page totalPages + 1. Only advance while pages remain.
    if current_page < total_pages:
        current_page += 1
        yield scrapy.Request(
            url=parse_new_url(URL, page_number=current_page),
            callback=self.parse,
            cookies=cookie_parser(),
            meta={'currentPage': current_page}
        )
def parse_article(self, response):
    """Extract title, date (parsed from the title prefix) and body text."""
    if 'pdf' in response.url:
        return
    loader = ItemLoader(Article())
    loader.default_output_processor = TakeFirst()
    title = response.xpath('//h1/text()').get()
    date = ''
    if title:
        title = title.strip()
        # Titles may begin with a numeric date token; take the first word.
        if title[:2].isnumeric():
            date = title.split()[0]
    paragraphs = response.xpath('//div[@class="text"]//text()').getall()
    content = "\n".join(p for p in paragraphs if p.strip()).strip()
    loader.add_value('title', title)
    loader.add_value('date', date)
    loader.add_value('link', response.url)
    loader.add_value('content', content)
    return loader.load_item()
def parse(self, response):
    """This function parses a property page
    @url https://nj.58.com/ershoufang/pn3/?PGTID=0d30000c-000a-c568-cd81-f02b4ffbea21&ClickID=1
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    loader = ItemLoader(item=PropertiesItem(), response=response)
    # Listing fields.
    loader.add_xpath(
        'title', '//div[@class="list-info"][1]/h2[@class="title"]/a/text()')
    loader.add_xpath('price', '//p[@class="sum"][1]/b/text()')
    loader.add_xpath(
        'description',
        '//div[@class="list-info"][1]/p[@class="baseinfo"][1]//text()',
        MapCompose(str.strip), Join())
    loader.add_xpath(
        'address',
        '//div[@class="list-info"][1]/p[@class="baseinfo"][2]/span//text()',
        MapCompose(str.strip), Join())
    loader.add_xpath('image_urls', '//div[@class = "pic"][1]/a/img/@src')
    # Housekeeping fields.
    loader.add_value('url', response.url)
    loader.add_value('project', self.settings.get('BOT_NAME'))
    loader.add_value('spider', self.name)
    loader.add_value('server', socket.gethostname())
    loader.add_value('date', datetime.datetime.now())
    return loader.load_item()
def parse_hotel(self, response):
    """Scrape name, price, description and amenities from a TripAdvisor
    hotel page."""
    loader = ItemLoader(Hotel(), Selector(response))
    loader.add_xpath('nombre', '//h1[@id="HEADING"]/text()')
    loader.add_xpath(
        'precio',
        '//div[@class="ui_columns is-mobile is-multiline is-vcentered is-gapless-vertical _2mWM5u8t" or @class= "ui_columns is-gapless is-mobile"]//div[contains(text(),"$")]'
    )
    loader.add_xpath(
        'descripcion',
        '//div[contains(@data-ssrev-handlers,"load") and contains(@data-ssrev-handlers,"Description")]/div[1]/div[contains(text(),"")]'
    )
    loader.add_xpath(
        'amenities',
        '//div[contains(@data-ssrev-handlers,"amenities")]//text()')
    yield loader.load_item()
    # Run with:
    # scrapy runspider 1_tripadvisor.py -o tripadvisor.csv -t csv
def parse(self, response):
    """Build a ScrapyLeoItem from a listing page, deriving flags from
    fixed character positions in the URL."""
    l = ItemLoader(item=ScrapyLeoItem(), response=response)
    # page_url
    l.add_value('page_url', response.url)
    # rental_or_monthly: encoded 20 characters from the URL's end.
    if response.url[-20] == "r":
        l.add_value('rental_or_monthly', 'rental')
    else:
        l.add_value('rental_or_monthly', 'monthly')
    # leo_or_par: BUG FIX — the original compared a string character to the
    # int 0 (`response.url[-18] == 0`), which is always False, so every item
    # was tagged 'par'. Compare against the character '0' instead.
    if response.url[-18] == '0':
        l.add_value('leo_or_par', 'leo')
    else:
        l.add_value('leo_or_par', 'par')
    # address: prefer the two-column table cell when exactly one matches,
    # otherwise fall back to the inquiry-form table.
    if len(response.xpath('.//td[@colspan="2"][2]/text()')) == 1:
        l.add_xpath('address', './/td[@colspan="2"][2]/text()')
    else:
        l.add_xpath(
            'address',
            '//*[@id="inquiry-form"]/div[1]/div/table/tbody/tr/td[2]/text()'
        )
    # mail_box: strip the sprite prefix from the icon's class attribute.
    l.add_xpath('mail_box', './/ul[2]/li[8]/span/@class',
                MapCompose(lambda i: i.replace('sprite ico ', '')))
    return l.load_item()
def parse_review_container(self, response, **kwargs):
    """Yield one ReviewItem per review container on the page.

    Containers are addressed by 1-based position in a fixed XPath template.
    """
    film_id = kwargs['film_id']
    containers = response.xpath('/html/body/div[1]/div[1]/div')
    template = '/html/body/div[1]/div[1]/div[{0}]/{1}'
    for idx in range(1, len(containers) + 1):
        loader = ItemLoader(item=ReviewItem(), response=response)
        loader.add_value('film_id', film_id)
        loader.add_xpath(
            'user_id',
            template.format(idx, 'div[1]/div[1]/div[2]/span[1]/a/@href'))
        loader.add_xpath('comment_id',
                         template.format(idx, 'div[1]/div[1]/a/@href'))
        loader.add_xpath(
            'date',
            template.format(idx, 'div[1]/div[1]/div[2]/span[2]/text()'))
        loader.add_xpath(
            'star_rating',
            template.format(idx, 'div[1]/div[1]/div[1]/span/span[1]/text()'))
        loader.add_xpath('title',
                         template.format(idx, 'div[1]/div[1]/a/text()'))
        loader.add_xpath(
            'content',
            template.format(
                idx, 'div[1]/div[1]/div[@class="content"]/div[1]/text()'))
        yield loader.load_item()
def parse_content(self, response):
    """Parse a blog post page into a YfspiderspeakItem."""

    def deal_publish_time(parts):
        # Build 'YYYY-MM-DD 00:00:00' from (year, month, day) regex
        # captures, zero-padding one-digit month/day values.
        year, month, day = parts[0], parts[1], parts[2]
        if len(month) < 2:
            month = '0' + month
        if len(day) < 2:
            day = '0' + day
        return year + '-' + month + '-' + day + ' 00:00:00'

    loader = ItemLoader(response=response, item=YfspiderspeakItem())
    loader.add_value('url', response.url)
    loader.add_value('spider_time', time.time())
    loader.add_xpath('title', '//h1[@class="entry-title"]/text()')
    # NOTE(review): the regex captures only a single digit for month and
    # day; multi-digit values may be truncated — confirm against the markup.
    loader.add_value(
        'publish_time',
        response.xpath('//span[@class="entry-date"]').re(
            r'(\d{4}).*?(\d).*?(\d)'), deal_publish_time)
    loader.add_xpath(
        'content',
        '//div[contains(@id,"post")]/div[@class="entry-content"]//text()',
        Join())
    loader.add_value(
        'img_urls',
        response.xpath(
            '//div[contains(@id,"post")]/div[@class="entry-content"]').re(
                r'href="([\S]*?\.jpg)"'))
    loader.add_xpath(
        'id', '//div[@id="content"]/div[contains(@id,"post")]/@id')
    loaded = loader.load_item()
    print(loaded)
    return loaded
def parse_item(self, response):
    """Parse an ad page with an apartment.

    @url https://www.immobilienscout24.de/expose/93354819
    @returns items 1 1
    @scrapes url title address neighborhood cold_rent warm_rent rooms
    """
    self.shutdown_on_error()
    item = ItemLoader(ApartmentItem(), response=response)
    item.add_value('url', response.url)
    item.add_css('title', 'h1#expose-title::text')
    # Each pre-formatted field shares the same div/pre markup; the CSS class
    # in DIV_PRE_MAPPING selects which item field it feeds.
    for field, css_class in self.DIV_PRE_MAPPING.items():
        item.add_xpath(
            field,
            "//div/pre[contains(@class, '{}')]/text()".format(css_class))
    full_address = ''.join(
        response.xpath("//span[@data-qa='is24-expose-address']/div//text()"
                       ).extract()).strip()
    # Split street+zip from the neighborhood on the configured city name;
    # if the city is absent, store the whole string as the address.
    parts = full_address.split(self.CITY)
    if len(parts) == 1:
        item.add_value('address', full_address)
    else:
        street_zip = (parts[0] + self.CITY).strip(' ,').replace(
            ' (zur Karte) ', '')
        item.add_value('address', street_zip)
        item.add_value('neighborhood', ''.join(parts[1:]).strip(' ,'))
    item.add_css('cold_rent', 'div.is24qa-kaltmiete::text')
    item.add_css('warm_rent', 'dd.is24qa-gesamtmiete::text')
    item.add_css('rooms', 'div.is24qa-zi::text')
    # "Angebot..." status headings mark whether the listing is still active.
    item.add_xpath(
        'active', '//div[contains(@class, "status-message")]'
        '/h3[starts-with(normalize-space(.), "Angebot")]/text()')
    yield item.load_item()
def read_news(self, response):
    """Extract a news article (title, body, date) into a News item."""
    body_paths = [
        '//article//section[@class="article-content"]//p/text()',
        '//article//section[@class="article-content"]/text()',
    ]
    titulo = response.xpath(self.tituloPath).get()
    fecha_publicacion = response.xpath(self.fechaPath).get()
    cuerpo = None
    for path in body_paths:
        cuerpo = response.xpath(path).getall()
        if cuerpo:
            break  # stop at the first selector that matches
    # Normalize the date to YYYY-MM-DDTHH:MM:SS.
    fecha_publicacion = self.format_fecha(fecha_publicacion)
    loader = ItemLoader(item=News())
    loader.add_value('titulo', titulo)
    loader.add_value('cuerpo', cuerpo)
    loader.add_value('fecha_publicacion', fecha_publicacion)
    loader.add_value('url', response.url)
    loader.add_value('diario', self.name)
    loader.add_value('page', self.current_page)
    return loader.load_item()
def parse(self, response):
    """Yield ProductoFybeca items with title, image and both price
    data-bind attributes."""
    for tile in response.css('div.product-tile-inner'):
        if not tile.css('div.detail'):
            continue  # tiles without a detail section carry no product data
        loader = ItemLoader(item=ProductoFybeca(), selector=tile)
        # producto_loader.default_output_processor=TakeFirst()
        loader.add_css('titulo', 'a.name::text')
        loader.add_xpath(
            'imagen',
            'div[contains(@class,"detail")]/a[contains(@class,"image")]/img[contains(@id,"gImg")]/@src'
        )
        loader.add_css('precio_normal',
                       'div.side > div.price::attr(data-bind)')
        loader.add_css('precio_descuento',
                       'div.price-member > div::attr(data-bind)')
        yield loader.load_item()
def parse(self, response):
    """Parse a house-listing results page into PropertyItem objects."""
    # FIX: log message said "parese" instead of "parse".
    self.logger.info("start parse url %s" % response.url)
    for div in response.xpath('//div[@class="house-listBox"]/div'):
        l = ItemLoader(item=PropertyItem(), selector=div)
        l.default_output_processor = TakeFirst()
        l.add_xpath("title", '(.//a)[2]/text()',
                    MapCompose(lambda x: self.spc_reg.sub("", x)))
        # Normalize the link to an absolute URL without query/fragment.
        l.add_xpath("url", "(.//a)[2]/@href",
                    MapCompose(lambda x: urljoin(response.url,
                                                 urlparse(x).path)))
        l.add_xpath("price", './/p[@class="price-nub cRed"]/text()', Join())
        l.add_xpath("address", './/a[@class="f000 mr_10"]//text()',
                    MapCompose(lambda x: self.spc_reg.sub("", x)), Join())
        # District and sub-district come from the same "dist-subdist" text.
        l.add_xpath("dist_name", './/p[@class="f7b mb_15"]/text()', Join(),
                    MapCompose(lambda x: x.split("-")[0].strip()))
        l.add_xpath("subdist_name", './/p[@class="f7b mb_15"]/text()',
                    Join(),
                    MapCompose(lambda x: x.split("-")[1].split()[0]))
        # Housekeeping fields.
        l.add_value("source", response.url)
        l.add_value("project", self.settings.get("BOT_NAME"))
        l.add_value("spider", self.name)
        l.add_value("server", socket.gethostname())
        l.add_value("date",
                    datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
        yield l.load_item()
def parse_item(self, response):
    """Parse a loan-detail page into a Jl2763Item, including the invest
    records table."""
    item = ItemLoader(item=Jl2763Item(), response=response)
    url = response.url
    # item_code extracts the item code from the 'code=' query parameter.
    item_list = item_code(url, self.web_name, 'code=(.*?)$')
    item.add_value('web_name', self.web_name)
    item.add_value('web_code', self.name)
    item.add_value('url', url)
    item.add_value('item_code', item_list.get('item_code'))
    item.add_css('title', '.tit_left_invest::text')
    item.add_css('amount', '.dl_left_invest.width-250 span::text')
    item.add_css('rate', 'dd')
    item.add_css('period', 'dd:nth-child(2)')
    # item.add_xpath('loan_using', '//*[contains(text(),"")]/following-sibling::td[1]/text()')
    # item.add_xpath('loaner_info', '//*[contains(text(),"证件号码")]/parent::li[1]')
    item.add_css('pay_type', '.money_left_invest i::text')
    item.add_css('progress', "[src='/mdw/images/repayment_r.png']")
    # invest records: serialize each repayment-table row into the
    # pipe-delimited record template below.
    i_v = []
    invest_records_temp = '{{username={lst[0]}|rate=-1|postmoney={lst[1]}|money={lst[1]}|postdate={lst[2]}|status=全部通过}}'
    invest_records_format = ""
    tr = response.css('div .table02_repay').css('tr')
    # print(tr)
    try:
        # Collect the cell texts of every non-empty table row.
        for i in tr:
            lst = i.css('td::text').extract()
            if lst:
                i_v.append(lst)
        for n in i_v:
            invest_records_format += invest_records_temp.format(lst=n)
        item.add_value('invest_records', invest_records_format)
        # NOTE(review): start=last row, end=first row — presumably the table
        # lists records newest-first; confirm against the page.
        item.add_value('start', i_v[-1][2])
        item.add_value('end', i_v[0][2])
    except Exception:
        # Best-effort: a malformed/empty table must not drop the whole item.
        self.logger.info('invest records is error %s' % url)
    yield item.load_item()
def parse_item(self, response):
    """Parse a Douban group topic page into a HouseRentingDoubanItem."""
    selector = Selector(response=response)
    # FIX: removed a dead `selector.css('div#content div.article
    # div.topic-content')` call whose result was discarded and which had no
    # side effects.
    item_loader = ItemLoader(item=HouseRentingDoubanItem(),
                             selector=selector, response=response)
    # Title: fall back from the infobox to the page <h1>.
    item_loader.add_css(field_name='title', css='table.infobox *::text')
    item_loader.add_css(field_name='title',
                        css='div#content > h1:first-child::text')
    item_loader.add_value(field_name='source', value=self.name)
    item_loader.add_css(field_name='author', css='h3 span.from a::text')
    # item_loader.add_css(field_name='image_urls', css='div.topic-content div#link-report img::attr(src)')
    item_loader.add_css(field_name='author_link',
                        css='h3 span.from a::attr(href)')
    item_loader.add_css(field_name='content',
                        css='div.topic-content div#link-report *::text',
                        re=r'\s*(.*)\s*')
    item_loader.add_value(field_name='source_url', value=response.url)
    item_loader.add_css(
        field_name='publish_time', css='h3 span:last-child::text',
        re=r'\s*(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s*')
    yield item_loader.load_item()
def parse_itemsEb(self, response):
    """Extract one Ebay product, capped at 8 items total.

    Once the cap is reached, ebay.com is removed from allowed_domains so
    no further Ebay pages are crawled.
    """
    loader = ItemLoader(Articulo(), response)
    if self.item_countE < 8:
        loader.add_value('store', 'Ebay')
        loader.add_value('link', response.url)
        loader.add_xpath('imageURL',
                         '//div[@id="mainImgHldr"]/img[@id="icImg"]/@src')
        loader.add_xpath('name', '//h1[@id="itemTitle"]/text()[1]')
        loader.add_xpath('Price', '//span[@id="prcIsum"]/text()',
                         MapCompose(self.priceCleaningE))
        loader.add_xpath('description',
                         '//span[@id="vi-cond-addl-info"]/text()')
        yield loader.load_item()
        self.item_countE = self.item_countE + 1
    else:
        print('Limit reached for Ebay')
        if 'ebay.com' in self.allowed_domains:
            self.allowed_domains.remove('ebay.com')
def parse_book_info(self, response: Response, short_name):
    """Scrape a book's metadata, then request every chapter page."""
    loader = ItemLoader(item=BookInfo(), response=response)
    # Locate the metadata elements.
    loader.add_css(FULL_NAME, BOOK_FULL_NAME_PATH)
    loader.add_css(AUTHOR, BOOK_AUTHOR_PATH)
    loader.add_css(LAST_CHAPTER, BOOK_LAST_CHAPTER_PATH)
    # Extract the loaded values.
    page = loader.load_item()
    last_chapter = int(page.get(LAST_CHAPTER))
    yield {
        SHORT_NAME: short_name,
        FULL_NAME: page.get(FULL_NAME),
        AUTHOR: page.get(AUTHOR),
        LAST_CHAPTER: last_chapter
    }
    # Fan out one request per chapter, 1..last_chapter inclusive.
    for chapter in range(1, last_chapter + 1):
        yield Request(url=CHAPTER_URL.format(short_name, chapter),
                      callback=self.parse_chapter,
                      cb_kwargs=dict(short_name=short_name,
                                     chapter_index=chapter))
def parse_item(self, response):
    """ This function parses a property page

    @url http://localhost:9312/properties/property_000000.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    loader = ItemLoader(item=PropertiesItem(), response=response)
    # Listing fields, each with its cleaning processors.
    loader.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                     MapCompose(str.strip, str.title))
    loader.add_xpath('price', '//*[@itemprop="price"][1]/text()',
                     MapCompose(lambda i: i.replace(',', ''), float),
                     re='[,.0-9]+')
    loader.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                     MapCompose(str.strip,
                                lambda i: i.replace('\r\n', ' ')))
    loader.add_xpath('address',
                     '//*[@itemtype="http://schema.org/Place"][1]/text()',
                     MapCompose(str.strip))
    # Resolve relative image URLs against the page URL.
    loader.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                     MapCompose(lambda i: parse.urljoin(response.url, i)))
    # Housekeeping fields.
    loader.add_value('url', response.url)
    loader.add_value('project', self.settings.get('BOT_NAME'))
    loader.add_value('spider', self.name)
    loader.add_value('server', socket.gethostname())
    loader.add_value('date', datetime.datetime.now())
    yield loader.load_item()
def parse_trainer(self, response):
    """ Parse trainer page.

    @url https://www.oddspark.com/keiba/TrainerDetail.do?trainerNb=018052
    @returns items 1 1
    @returns requests 0 0
    @trainer
    """
    logger.info(f"#parse_trainer: start: url={response.url}")
    # Parse trainer profile fields from the detail table.
    logger.debug("#parse_trainer: parse trainer")
    loader = ItemLoader(item=TrainerItem(), response=response)
    loader.add_value("trainer_url", response.url)
    loader.add_xpath(
        "trainer_name",
        "normalize-space(//div[contains(@class,'section')]/div/span[1]/text())")
    loader.add_xpath(
        "birthday",
        "normalize-space(//table[contains(@class,'tb72')]/tr[1]/td/text())")
    loader.add_xpath(
        "gender",
        "normalize-space(//table[contains(@class,'tb72')]/tr[2]/td/text())")
    loader.add_xpath(
        "belong_to",
        "normalize-space(//table[contains(@class,'tb72')]/tr[3]/td/text())")
    trainer = loader.load_item()
    logger.info(f"#parse_trainer: trainer={trainer}")
    yield trainer
def parse_question(self, response):
    """Parse a Zhihu question page, then request its first answers page."""
    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_value('zhihu_id', response.meta.get('question_id', ''))
    loader.add_css('title', 'h1.QuestionHeader-title::text')
    loader.add_css('content', '.QuestionRichText')
    loader.add_value('url', response.url)
    loader.add_css('answer_num', '.List-headerText span::text')
    loader.add_css('comments_num',
                   '.QuestionHeader-Comment > button::text')
    # NOTE(review): watch_user_num and click_num share a selector —
    # presumably item processors pick the right value from the pair; confirm.
    loader.add_css('watch_user_num',
                   '.NumberBoard-item .NumberBoard-value::text')
    loader.add_css('click_num',
                   '.NumberBoard-item .NumberBoard-value::text')
    loader.add_css('topics', '.TopicLink .Popover div::text')
    question_item = loader.load_item()
    # First answers page: offset 0, page size 20.
    yield scrapy.Request(
        self.start_answer_url.format(response.meta.get('question_id', ''),
                                     20, 0),
        headers=self.headers,
        callback=self.parse_answer)
    yield question_item
def parse_odds_trifecta(self, response):
    """ Parse odds(trifecta) page.

    @url https://www.oddspark.com/keiba/Odds.do?sponsorCd=06&raceDy=20201018&opTrackCd=11&raceNb=7&betType=8&horseNb=1
    @returns items 1
    @returns requests 0 0
    @odds_trifecta
    """
    logger.info(f"#parse_odds_trifecta: start: url={response.url}")
    for row in response.xpath("//table[@summary='odds']/tr"):
        # Header rows carry two <th> cells; data rows do not.
        if len(row.xpath("th")) == 2:
            logger.debug("#parse_odds_trifecta: skip header")
            continue
        loader = ItemLoader(item=OddsTrifectaItem(), selector=row)
        loader.add_value("odds_url", response.url)
        loader.add_xpath("horse_number", "th/text()")
        loader.add_xpath("odds", "td/span/text()")
        odds_item = loader.load_item()
        logger.debug(f"#parse_odds_trifecta: odds trifecta={odds_item}")
        yield odds_item
def parse_auto_page(self, response):
    """Scrape an Avito car listing, then follow its Autoteka teaser
    endpoint (passing the loader along in meta)."""
    item = ItemLoader(AvitoParserItem(), response)
    item.add_xpath(
        'title',
        '//h1[@class="title-info-title"]/span[@class="title-info-title-text"]/text()'
    )
    item.add_xpath(
        'price',
        '//div[@class="item-price-value-wrapper"]//span[@class="js-item-price"]/@content'
    )
    item.add_xpath(
        'params',
        '//div[@class="item-params"]/ul[@class="item-params-list"]/li')
    item.add_xpath(
        'photos',
        '//div[contains(@class, "gallery-img-wrapper")]/div[contains(@class, "gallery-img-frame")]/@data-url'
    )
    autoteka_link_id = response.xpath(
        '//div[@class="js-autoteka-teaser"]/@data-item-id').extract_first()
    autoteka_link = 'https://www.avito.ru/web/1/swaha/v1/autoteka/teaser/'
    # ROBUSTNESS FIX: extract_first() returns None when the teaser block is
    # missing, and `str + None` raised TypeError, losing the whole item.
    # Yield the scraped item directly in that case instead of crashing.
    if autoteka_link_id:
        yield response.follow(autoteka_link + autoteka_link_id,
                              callback=self.get_VIN_official,
                              meta={'item': item})
    else:
        yield item.load_item()
def parse(self, response):
    """Parse a Zillow 'cat1' search-results JSON response and paginate.

    Yields one ZillowItem per listing, then requests the next page while
    pages remain.
    """
    json_resp = json.loads(response.body)
    houses = json_resp.get('cat1').get('searchResults').get('listResults')
    for house in houses:
        loader = ItemLoader(item=ZillowItem())
        loader.add_value('id', house.get('id'))
        loader.add_value('image_urls', house.get('imgSrc'))
        loader.add_value('detail_url', house.get('detailUrl'))
        loader.add_value('status_type', house.get('statusType'))
        loader.add_value('status_text', house.get('statusText'))
        loader.add_value('price', house.get('price'))
        loader.add_value('address', house.get('address'))
        loader.add_value('beds', house.get('beds'))
        loader.add_value('baths', house.get('baths'))
        loader.add_value('area_sqft', house.get('area'))
        loader.add_value('latitude', house.get('latLong').get('latitude'))
        loader.add_value('longitude', house.get('latLong').get('longitude'))
        loader.add_value('broker_name', house.get('brokerName'))
        yield loader.load_item()
    current_page = response.meta['currentPage']
    total_pages = json_resp.get('cat1').get('searchList').get('totalPages')
    # BUG FIX: was `<=`, which requested one page past the last page
    # (totalPages + 1). Only advance while pages remain.
    if current_page < total_pages:
        nxt_pg = current_page + 1
        yield scrapy.Request(
            url=parse_new_url(URL, pg_num=nxt_pg),
            callback=self.parse,
            cookies=cookie_parser(),
            meta={'currentPage': nxt_pg}
        )