def start_scraping(response):
    # WARNING: remove `tbody` from any XPath, otherwise the selector returns an empty list
    # position() > 2 because the first two rows are header rows; don't call get()/extract() here
    for ticket in response.xpath('/html/body/div/table/tr[position() > 2]'):
        item = TicketItem()
        # date string grabbed from every row
        for date in ticket.xpath('normalize-space(./td[11]/div/text())').extract():
            # parsed into a proper datetime object (the site formats it like '27/05/2020 12:17 PM')
            datetime_object = datetime.datetime.strptime(date, '%d/%m/%Y %I:%M %p')
            # compare the parsed date with the current time to produce a "time ago" string
            time_ago = timeago.format(datetime_object, datetime.datetime.now())
            # finally, store "time ago (date)" so the raw date is kept for double-checking
            item['time_ago'] = time_ago + ' (' + date + ')'
        item['id'] = ticket.xpath('normalize-space(./td[6]/div/text())').extract()
        item['subject'] = ticket.xpath('td[7]/div/a/text()').extract()
        yield item
    open_in_browser(response)
def parse(self, response):
    selector = Selector(response)
    articles = selector.xpath('//ul[@class="pt_ul clearfix"]/div')
    print(selector)
    print(articles)
    inspect_response(response, self)
    open_in_browser(response)
def parse(self, response):
    item = get_item(response.meta, "aa")
    xpath_data = response.meta['xpath_data']
    if response.meta["pdf"]:
        open_in_browser(response)
        url_data = response.xpath(xpath_data['url']).getall()
        url_data = " ".join(url_data)
        item = parse_pdf(url_data, item, response.meta)
    else:
        root_data = response.xpath(xpath_data['root'])
        if not root_data:
            open_in_browser(response)
        # open_in_browser(response)
        item = parse_root(root_data, item, response.meta)
    # print(item)
    if response.meta.get("platformname"):
        # If all data is in a single request and needs to be split
        if item:
            for k, v in item.items():
                if k not in excluded_keys and v:
                    single_item = v
                    single_item['invname'] = k
                    single_item['type'] = item['type']
                    single_item['settings'] = item['settings']
                    single_item['platformname'] = item['platformname']
                    # print(single_item)
                    yield yield_item(single_item, self)
    else:
        yield yield_item(item, self)
def parse(self, response):
    open_in_browser(response)
    links = response.xpath(
        '//*[contains(concat( " ", @class, " " ), concat( " ", "r", " " ))]//a/@href'
    ).extract()
    for link in links:
        yield FormRequest(url=link, callback=self.searchOneLink)
def parse_item(self, response):
    super().parse_item(response)
    # response = Selector(response)
    open_in_browser(response)
    tmp = []
    with open('1.json', 'a') as f:
        for i in response.xpath('//code/text()').getall():
            try:
                tmp.append(json.loads(i))
                json.dump(json.loads(i), f)
            except json.decoder.JSONDecodeError:
                tmp.append(i)
                json.dump(i, f)
    set_trace()
    self.logger.info('RESPONSE URL: %s', response.url)
    first_name = response.xpath('//meta[@itemprop="givenName"]/@content').get()
    last_name = response.xpath('//meta[@itemprop="familyName"]/@content').get()
    site = response.xpath('//a[@itemprop="sameAs"]/@href').get()
    city = response.xpath('//meta[@itemprop="addressLocality"]/@content').get()
    phone = response.css('a.profile-view-popup__phone::text').extract_first()
    if first_name and last_name and city and phone:
        self.item['name'] = first_name + ' ' + last_name if first_name and last_name else None
        self.item['city'] = city
        self.item['site'] = site
        self.item['phone'] = phone
        yield self.item
def parse(self, response):
    site = urlparse(response.url).hostname.split('.')[1]
    debug = False
    if debug:
        from scrapy.shell import inspect_response
        inspect_response(response, self)
        from scrapy.utils.response import open_in_browser
        open_in_browser(response)
    row_xpath = PARSE_MAP[site].get('row_xpath', '//table/tbody/tr')
    col_xpath = PARSE_MAP[site].get('col_xpath', 'td')
    ip_pos = PARSE_MAP[site].get('ip_pos', 0)
    port_pos = PARSE_MAP[site].get('port_pos', 1)
    protocal_pos = PARSE_MAP[site].get('protocal_pos', 2)
    rows = response.xpath(row_xpath)
    for row in rows:
        cols = row.xpath(col_xpath)
        ip = cols[ip_pos].xpath('text()').get()
        port = cols[port_pos].xpath('text()').get()
        for protocol in self.get_protocols(cols[protocal_pos].xpath('text()').get().lower()):
            if self.is_valid_proxy(ip, port, protocol):
                yield ProxyUrlItem(url=f'{protocol}://{ip}:{port}')
            else:
                self.logger.error(f'invalid proxy: {protocol}://{ip}:{port}')
def list_quotes(self, response):
    open_in_browser(response)
    for quote in response.css('ul.cote li'):
        auction_url = quote.css('.link-result a::attr(href)').get()
        quote_id = auction_url.split("=")[1] if auction_url else None
        self.quoteid = quote_id
        print("//////////////", self.quoteid)
        if auction_url is not None:
            url = response.urljoin(auction_url)
            yield Request(url, callback=self.analyze_auction)
    next_page = response.css('a.nextItem ::attr(href)').get()
    page_number = next_page.split("page=")[1]
    print("PAAAAGE NUUUUMBEEEER", page_number)
    print(next_page)
    base_url = "https://www.lva-auto.fr/cote.php?cote_php?"
    if next_page != "javascript:void()":
        next_page2 = base_url + str(next_page)
        print(next_page2)
        yield Request(next_page2, callback=self.list_quotes, dont_filter=True)
def parse(self, response):
    open_in_browser(response)
    products_links = response.xpath(
        "//span[@class='a-size-medium a-color-base a-text-normal']/ancestor::a/@href"
    ).extract()
    # products_images = response.xpath("//div[@class='a-section aok-relative s-image-fixed-height']/img/@src").extract()
    products_titles = response.css(".a-color-base.a-text-normal::text").extract()
    products_ratings = response.css(".aok-align-bottom > span.a-icon-alt::text").extract()
    products_no_reviews = response.xpath("//span[@class='a-size-base' and @dir='auto']/text()").extract()
    category = response.css(".a-color-state.a-text-bold::text").extract()
    for product in range(len(products_links)):
        result = products_links[product].find('dp/') + 3
        second = products_links[product].find('/ref')
        product_id = products_links[product][result:second]
        feature = ("https://www.amazon.com/hz/reviews-render/ajax/lazy-widgets/stream?asin="
                   + products_links[product][result:second]
                   + "&lazyWidget=cr-summarization-attributes")
        products_links[product] = "https://www.amazon.com" + products_links[product]
        request = scrapy.Request(url=str(feature), callback=properties)
        # request.cb_kwargs['image_urls'] = products_images[product]
        request.cb_kwargs['link'] = products_links[product]
        request.cb_kwargs['title'] = products_titles[product]
        request.cb_kwargs['rating'] = products_ratings[product]
        request.cb_kwargs['no_of_reviews'] = products_no_reviews[product]
        request.cb_kwargs['product_id'] = product_id
        request.cb_kwargs['category'] = category
        yield request
def parse(self, response):
    item = get_item(response.meta, "_investment_fee")
    xpath_data = response.meta['xpath_data']
    if response.meta["pdf"]:
        url_data = response.xpath(xpath_data['url']).getall()
        url_data = " ".join(url_data)
        if not url_data:
            open_in_browser(response)
        item = parse_pdf(url_data, item, response.meta)
        yield yield_item(item, self)
    else:
        root_data = response.xpath(xpath_data['root'])
        # open_in_browser(response)
        if not root_data:
            open_in_browser(response)
        # print(root_data)
        item = parse_root(root_data, item, response.meta)
        # print(item)
        yield yield_item(item, self)
def scrape_home(self, response):
    # use the response xpath here
    # h1_tag = response.xpath('//h1/a/text()').extract()[0]
    # tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()
    # yield {
    #     'h1': h1_tag,
    #     'Tags': tags
    # }
    open_in_browser(response)  # for debugging
    loader = ItemLoader(item=QuotesSpiderItem(), response=response)
    quotes = response.xpath('//*[@class="quote"]')
    for quote in quotes:
        text = quote.xpath('.//*[@class="text"]/text()').extract_first()
        author = quote.xpath('.//*[@itemprop="author"]/text()').extract_first()
        tags = quote.xpath('.//*[@itemprop="keywords"]/@content').extract_first()
        loader.add_value('text', text)
        loader.add_value('author', author)
        loader.add_value('tags', tags)
        yield loader.load_item()
    next_page_url = response.xpath('//*[@class="next"]/a/@href').extract_first()
    absolut_next_page_url = response.urljoin(next_page_url)
    yield Request(absolut_next_page_url)
def parse(self, response):
    open_in_browser(response)
    json_obj = json.loads(response.body_as_unicode())
    # pull the search results out of the JSON payload
    spider_item = FspiderItem()
    for result in json_obj['ResultSet']:
        spider_item['title'] = result['IndexedFields']['title']['Value']
        spider_item['data_of_transaction'] = result['IndexedFields']['publicationdatetime']['Value']
        spider_item['name_of_company'] = result['IndexedFields']['announcercompany']['Value']
        # the transaction document download links live on a separate detail page
        transaction_detail_page_url = result['IndexedFields']['showurlen']['Value']
        detail_page_response = requests.get(transaction_detail_page_url)
        document_urls = parsel.Selector(text=detail_page_response.text).xpath(
            '//div[@class="pagecontent"]//div[@class="data"]/ul/li/div/a/@href').extract()
        spider_item['transaction_documents'] = []
        for url in document_urls:
            spider_item['transaction_documents'].append(
                "https://oasm.finanstilsynet.dk/dk/vismeddelelse.aspx" + url)
        yield spider_item
def parse_result(self, response):
    self.year = response.css(
        'select#ctl00_ContentPlaceHolder1_ddl_year > option[selected] ::attr(value)').extract_first()
    self.implement = response.css(
        'select#ctl00_ContentPlaceHolder1_ddlImpl > option[selected] ::attr(value)').extract_first()
    self.district = response.css(
        'select#ctl00_ContentPlaceHolder1_ddlDist > option[selected] ::attr(value)').extract_first()
    self.circle = response.css(
        'select#ctl00_ContentPlaceHolder1_ddlDao > option[selected] ::attr(value)').extract_first()
    # self.fw.write(self.year + ' ' + self.implement + ' ' + self.district + ' ' + self.circle + '\n')
    print(self.year, self.implement, self.district, self.circle)
    # h = Selector(response)
    key = []
    print(response.css("tr"))
    # y = key
    value = response.css('td.fmDispData::text')
    for keys in response.css('tr'):
        key = keys.css('td.textLeft::text').extract()
        # print(key)
        print(len(key), len(value))
        # self.fw.write(key)
        # self.fw.write(value)
    # view(response)
    open_in_browser(response)
    # self.fw.write()
    # print(5, response.css('select#ctl00_ContentPlaceHolder1_ddlDao > option[selected] ::attr(value).text()').extract_first())
    # print(response.css('//*[@id="ctl00_ContentPlaceHolder1_grdCitz"]/tbody/tr[1]/td/table/tbody/tr[3]/td[1]').extract())
    return
def analyze_auction(self, response):
    open_in_browser(response)
    for row in response.css('tr'):
        auction_brand = row.css('h2::text').get()
        auction_model = row.css('td:nth-child(3)::text').get()
        auction_organizor = row.xpath('normalize-space(td[4])').get()
        auction_sales_code = row.css('td:nth-child(5) abbr::text').get()
        auction_restauration_code = row.css('td:nth-child(6) abbr::text').get()
        auction_price = row.css('td:nth-child(7)::text').get()
        auction_location = row.css('td:nth-child(8)::text').get()
        mycar = Car()
        mycar['auction_brand'] = auction_brand
        mycar['auction_model'] = auction_model
        mycar['auction_organizor'] = auction_organizor
        mycar['auction_sales_code'] = auction_sales_code
        mycar['auction_restauration_code'] = auction_restauration_code
        mycar['auction_price'] = auction_price
        mycar['auction_location'] = auction_location
        yield mycar
def scrape_home_page(self, response):
    open_in_browser(response)
    l = ItemLoader(item=SpiderItem(), response=response)
    h1_tag = response.xpath('//h1/a/text()').extract_first()
    tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()
    l.add_value('h1_tag', h1_tag)
    l.add_value('tags', tags)
    return l.load_item()
def parse_details(self, response):
    if "item_name" not in response.body:
        open_in_browser(response)
    item = response.meta.get('item', None)
    if item:
        return item
    else:
        self.logger.warning("no item received for %s", response.url)
def start_scraping(self, response):
    open_in_browser(response)
    item = QuotesItem()
    for quote in response.css(".quote"):
        title = quote.css(".text::text").extract_first()
        author = quote.css(".author::text").extract_first()
        item["title"] = title
        item["author"] = author
        yield item
def parse(self, response):
    if self.anime_id == 11554:  # id limit
        pass
    else:
        open_in_browser(response)
        anime_id = self.anime_id
        reviews = response.xpath('//*[@id="anime_detail_reviews"]/div/div[@class="anime_title_eval"]')
        reviews1 = response.xpath('//div[@class="anime_detail_reviews1"]/div[@class="anime_title_eval"]')
        print('___________________________________________________')
        print(reviews1)
        print('___________________________________________________')
        reviews = reviews + reviews1
        for review in reviews:
            timestamp = review.xpath('.//span[@class="ateval_dtreviewed"]/text()').extract()
            reviewer = review.xpath('.//span[@class="ateval_reviewer"]/a/text()').extract()
            review_state = review.xpath('.//span[@class="bold"]/text()').extract()
            reviewer_url = review.xpath('.//span[@class="ateval_reviewer"]/a/@href').extract()
            reading_num = review.xpath('.//span[@class="ateval_reviewer"]/span[@class="red bold"]/text()').extract()
            point = review.xpath('.//span[@class="ateval_rating"]//span/text()').extract()
            point_story = review.xpath('.//span[@class="ateval_ratings"]/span[1]/text()').extract()
            point_animation = review.xpath('.//span[@class="ateval_ratings"]/span[2]/text()').extract()
            point_vc = review.xpath('.//span[@class="ateval_ratings"]/span[3]/text()').extract()
            point_music = review.xpath('.//span[@class="ateval_ratings"]/span[4]/text()').extract()
            point_chara = review.xpath('.//span[@class="ateval_ratings"]/span[5]/text()').extract()
            # if the review text exists, review_url is found here...
            review_url1 = review.xpath('.//h3[@class="ateval_summary"]/a/@href').extract_first()
            # ...if the review text does NOT exist, review_url is found here instead
            review_url2 = review.xpath('.//span[@class="ateval_rating"]//a/@href').extract_first()
            review_url1 = nonetostr(review_url1)
            review_url2 = nonetostr(review_url2)
            review_url = review_url1 + review_url2
            review_id = re.findall('[0-9]+', review_url)[0]
            review_text1 = review.xpath('.//span[@class="review_content"]/text()').extract()
            review_text2 = review.xpath('.//p[@class="ateval_description"]/text()').extract()
            review_text1 = nonetostr(review_text1)
            review_text2 = nonetostr(review_text2)
            review_text = review_text1 + review_text2
            yield {'anime_id': anime_id,
                   'timestamp': timestamp,
                   'reviewer': reviewer,
                   'reviewer_url': reviewer_url,
                   'reviews_state': review_state,
                   'reading_num': reading_num,
                   'point': point,
                   'point_story': point_story,
                   'point_animation': point_animation,
                   'point_vc': point_vc,
                   'point_music': point_music,
                   'point_chara': point_chara,
                   'review_url': review_url,
                   'review_id': review_id,
                   'review_text': review_text}
        self.anime_id += 1
        url = 'https://www.anikore.jp/anime_review/' + str(self.anime_id) + '/'
        yield Request(url, callback=self.parse, dont_filter=True)
def after_login(self, response):
    open_in_browser(response)
    card = response.xpath('//div[@class="row myaccountrow"]')
    if card:
        print('success')
    else:
        print(':(')
def scrape_home_page(self, response):
    open_in_browser(response)
    l = ItemLoader(item=QuotesToscrapeItem(), response=response)
    tags = response.xpath("//*[@class='tag-item']/a/text()").extract()
    h1_tags = response.xpath('//h1/a/text()').extract_first()
    l.add_value('tags', tags)
    l.add_value('h1_tags', h1_tags)
    return l.load_item()
def parse_details(self, response, item=None):
    if item:
        # populate more `item` fields
        return item
    else:
        self.logger.warning('No item received for %s', response.url)
        inspect_response(response, self)
        if "item name" not in response.body:
            open_in_browser(response)
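# A minimal sketch (assumptions flagged below) of how a parse_details callback like the
# one above is usually reached: an earlier callback builds a partial item and forwards
# it through cb_kwargs, so it arrives as the `item` argument. The spider name, start
# URL, item fields and XPaths are illustrative placeholders, not taken from the original.
import scrapy


class ItemDetailsSketchSpider(scrapy.Spider):
    name = "item_details_sketch"
    start_urls = ["http://example.com/items"]  # placeholder listing page

    def parse(self, response):
        for href in response.xpath('//a[@class="item"]/@href').getall():
            item = {"url": response.urljoin(href)}  # partially populated item
            yield scrapy.Request(
                item["url"],
                callback=self.parse_details,
                cb_kwargs={"item": item},  # delivered as the `item` parameter
            )

    def parse_details(self, response, item=None):
        # populate more `item` fields here (see the fuller version above)
        item["title"] = response.css("title::text").get()
        return item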
def parse(self, response):
    open_in_browser(response)
    items = DdspiderItem()
    product_info = response.css('#component_59 li')
    for product in product_info:
        product_name_dd = product.css('.name a::attr(title)').extract()
        product_price_dd = product.css('.price_n::text').extract()
        items['product_name_dd'] = product_name_dd
        items['product_price_dd'] = product_price_dd
        yield items
def parse_repo(self, response):
    """Fetch all the teams and repositories of the user."""
    open_in_browser(response)
    self.record = {
        'Teams': response.css('span.width-fit::text').extract(),
        'All repositories': response.css('a.d-flex::attr(href)').extract(),
    }
    yield self.record
    yield scrapy.Request(url='https://github.com/ali-gillani/test/pulls',
                         callback=self.parse_pull)
def start_scraping(self, response):
    # Parse the download link of the video
    open_in_browser(response)
    print("In Successfully!!!!!!!!!!!!!")
    for lk in response.css('div.info-box'):
        yield {
            'video_title': lk.css('div.row title::text').extract_first(),
            'video_link': lk.css('div.def-btn-box::attr(href)').extract_first(),
        }
    print("Done Successfully!!!!!!!!!!!!!")
def parse(self, response):
    if response.status != 200:
        self.logger.critical('ERROR LOADING PAGE')
        open_in_browser(response)
        raise CloseSpider('bandwidth_exceeded')
    open_in_browser(response)
    links = response.xpath('//table//td/a[@class="town"]/@href').extract()
    # for link in links[:2]:
    for link in links:
        url = response.urljoin(link)
        yield scrapy.Request(url, callback=self.parse_dir_contents)
def scrape_sunset(self, response):
    open_in_browser(response)
    # city_sunset = response.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "day", " " )) and (((count(preceding-sibling::*) + 1) = 6) and parent::*)]//*[contains(concat( " ", @class, " " ), concat( " ", "sunset", " " ))]/@innerText').extract()
    h1 = response.css('h1::text').extract()
    half_h1 = h1[0].split("in")
    list_city = half_h1[1]
    list_city = list_city.split(',')
    city = list_city[0]
    city = city.lstrip()
    yield {'city': city, 'sunset': ""}
def scrape_home_page(self, response):
    # for debugging only
    open_in_browser(response)
    l = ItemLoader(item=QuotesSpiderItem(), response=response)
    h1_tag = response.xpath('//h1/a/text()').extract_first()
    tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()
    l.add_value('h1_tag', h1_tag)
    l.add_value('tags', tags)
    return l.load_item()

# Commenting all out.
# def parse(self, response):
#     l = ItemLoader(item=QuotesSpiderItem(), response=response)
#     # Commenting this out. Just for understanding.
#     h1_tag = response.xpath('//h1/a/text()').extract_first()
#     tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()
#     # yield {'H1 Tag': h1_tag, 'Tags': tags}
#     l.add_value('h1_tag', h1_tag)
#     l.add_value('tags', tags)
#     return l.load_item()
#     # quotes = response.xpath('//*[@class="quote"]')
#     # for quote in quotes:
#     #     text = quote.xpath('.//*[@class="text"]/text()').extract_first()
#     #     author = quote.xpath('.//*[@itemprop="author"]/text()').extract_first()
#     #     tags = quote.xpath('.//*[@itemprop="keywords"]/@content').extract_first()
#     #     # tags = quote.xpath('.//*[@class="tag"]/text()').extract()
#     #     # If you want to print the data:
#     #     # print('\n')
#     #     # print(text)
#     #     # print(author)
#     #     # print(tags)
#     #     # print('\n')
#     #     yield {'Text': text,
#     #            'Author': author,
#     #            'Tags': tags}
#     # next_page_url = response.xpath('//*[@class="next"]/a/@href').extract_first()
#     # absolute_next_page_url = response.urljoin(next_page_url)
#     # yield scrapy.Request(absolute_next_page_url)
def scrape_home_page(self, response):
    open_in_browser(response)
    l = ItemLoader(item=QuotesSpiderItem(), response=response)
    h1_tag = response.xpath('//h1/a/text()').extract_first()
    tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()
    l.add_value('h1_tag', h1_tag)
    l.add_value('tags', tags)
    return l.load_item()
def parse1(self, response):
    # when the login fails, the site redirects with a "msg" query parameter so the browser can show an error
    if "msg=" in response.url:
        logging.info("-----------------------------")
        logging.info("        LOGIN FAILED         ")
        logging.info("-----------------------------")
    else:
        logging.info("-----------------------------")
        logging.info("      LOGIN SUCCESSFUL       ")
        logging.info("-----------------------------")
        open_in_browser(response)
def parse(self, response):
    items = AmazonItem()
    open_in_browser(response)
    title = response.css(".a-color-base.a-text-normal").css("::text").extract()
    price = response.css(".a-price-whole").css("::text").extract()
    dilivery_charge = response.css(".s-align-children-center+ .a-row span").css("::text").extract()
    image_link = response.css(".s-image::attr(src)").extract()
    items['title'] = title
    items['price'] = price
    items['dilivery_charge'] = dilivery_charge
    items['image_link'] = image_link
    yield items
def start_scraping(self, response):
    open_in_browser(response)
    items = QuoteItem()
    all_div_quotes = response.css('div.quote')
    for quotes in all_div_quotes:
        title = quotes.css('span.text::text').extract()
        author = quotes.css('.author::text').extract()
        tag = quotes.css('.tag::text').extract()
        items['title'] = title
        items['author'] = author
        items['tag'] = tag
        yield items
def detect_anti_spider_from_js_obj(self, js_obj, response):
    if 'pageName' not in js_obj or js_obj['pageName'] != u'spulist':
        msg = '\nthis page is not a search result! the url is {}\n' \
              'please consider a retry, or use a random USER_AGENT in settings.py, ' \
              'or load a different cookie, or set a separate cookie for each spider, ' \
              'or try a proxy\n' \
              '{} done scrapes / {} due scrapes'.format(response.url, self.scrape_count, self.total_scrape)
        self.logger.critical(msg)
        self.logger.critical('let me check this in browser...')
        open_in_browser(response)
        return True
    return False
def start_scraping(self, response):
    open_in_browser(response)
    x = response.xpath("//script[starts-with(.,'window._sharedData')]/text()").extract_first()
    json_string = x.strip().split('= ')[1][:-1]
    jsondata = json.loads(json_string)
    user = jsondata['entry_data']['ProfilePage'][0]['graphql']['user']
    full_name = user['full_name']
    biography = user['biography']
    website = user['external_url']
    numberOfollowers = user['edge_followed_by']['count']
    numberOfollowing = user['edge_follow']['count']
    businessacount = user['is_business_account']
    if businessacount:
        typeP = "Business"
    else:
        typeP = "Personal"  # assumed default so typeP is always defined
    isprivate = user['is_private']
    if isprivate:
        public = "private"
    else:
        public = "public"  # assumed default so public is always defined
    numberOfPosts = user['edge_owner_to_timeline_media']['count']
    BasicInfo = [{"full_name": full_name,
                  "biography": biography,
                  "Website": website,
                  "No. followers": numberOfollowers,
                  "No. following": numberOfollowing,
                  "Type of Account": typeP,
                  "Public/Private": public,
                  "No. posts": numberOfPosts}]
    if numberOfPosts != 0 and numberOfPosts > 11:
        numberOfPosts = 11
    PostInfo = []
    for n in range(numberOfPosts):
        node = user['edge_owner_to_timeline_media']['edges'][n]['node']
        typeOfPost = node['__typename']
        dimensions = node['dimensions']
        urlDisplay = node['display_url']
        caption = node['edge_media_to_caption']['edges'][0]['node']['text']
        comments = node['edge_media_to_comment']['count']
        likes = node['edge_liked_by']['count']
        PostInfo.append([{"typeOfPost": typeOfPost,
                          "dimension": dimensions,
                          "url": urlDisplay,
                          "caption": caption,
                          "No. comments": comments,
                          "No. likes": likes}])
    totalInfo = BasicInfo, PostInfo
    json.dumps(totalInfo)
    with open("scrapy.txt", "w", encoding='utf-8') as file:
        file.write(repr(totalInfo))
def login(self, response):
    open_in_browser(response)
    # assumed intent: dump the login page HTML to a local file for inspection
    with open('response.html', 'w') as f:
        f.write(response.text)
    # download the captcha image (handled by the images pipeline)
    image_urls = 'https://upassport.lianjia.com/freshCaptch'
    yield {'image_urls': [image_urls]}
    formdata = {
        'user': self.user,
        'password': self.password,
        'code': self.get_captcha_by_OCR(images.path),  # `images.path` must point at the downloaded captcha file
    }
    yield FormRequest.from_response(response, callback=self.parse_login,
                                    formdata=formdata, dont_filter=True)
def scrape_nutri_info(self, response):
    meal = response.css('div[ng-controller=DishDetailsCtrl]')
    open_in_browser(response)
    foods = meal.css('.ng-scope')
    data = {}
    for food in foods:
        # .get() returns a single string (or None); strip() replaces the non-existent trim()
        name = (food.css('div.attribute.dish-name.ng-binding').get() or '').strip()
        # food_props = food.css('span.attribute-title ')
        food_props = food.css('.attribute .ng-binding :not(.ng-hide)')
        food_data = {}
        for prop in food_props:
            key = (prop.css('span').get() or '').strip().replace(':', '')
            value = (prop.get() or '').strip()
            food_data[key] = value
        name = (food.css('.attribute .dish-name .ng-binding').get() or '').strip()
        data[name] = food_data
    yield data
def parse_list(self, response):
    # build the selector and read the request meta
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    # log a message
    self.log('msg', loglevel=logging.ERROR)
    # for debugging and writing the first xpath
    open_in_browser(response)
    inspect_response(response, self)
    # join the selected strings, then strip
    ''.join(hxs.select('').extract()).strip()
    # url join
    base_url = get_base_url(response)
    n_url = urljoin_rfc(base_url, 'url')
    return item  # `item` is assumed to be built above in a real spider
def view(data):
    if isinstance(data, HtmlResponse) or isinstance(data, TextResponse):
        open_in_browser(data)
    elif isinstance(data, Selector):
        open_in_browser(TextResponse(url="", encoding='utf-8', body=data.extract(), request=None))
    elif isinstance(data, SelectorList):
        content = ""
        for i in data:
            content += "%s <br>" % (i.extract())
        open_in_browser(TextResponse(url="", encoding='utf-8', body=content, request=None))
    else:
        open_in_browser(TextResponse(url="", encoding='utf-8', body=str(data), request=None))
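# A small usage sketch for the view() helper above. The import path is an assumption
# (the helper could live anywhere in the project); everything else uses standard Scrapy
# APIs. It shows that a whole response, a single Selector, or a SelectorList can all be
# handed to view(), which wraps non-response inputs in a TextResponse before calling
# open_in_browser.
import scrapy

from myproject.debug_utils import view  # hypothetical module holding the helper above


class ViewHelperSketchSpider(scrapy.Spider):
    name = "view_helper_sketch"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        quotes = response.css("div.quote")
        view(response)    # full page, opened unchanged
        view(quotes)      # SelectorList, joined with <br> separators
        view(quotes[0])   # single Selector, rendered on its own
        yield {"quote_count": len(quotes)}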
def parse_page(self, response):
    EVENTVALIDATION = response.xpath("//*[@id='__EVENTVALIDATION']/@value").extract()
    VIEWSTATE = response.xpath("//*[@id='__VIEWSTATE']/@value").extract()
    LASTFOCUS = response.xpath("//*[@id='__LASTFOCUS']/@value").extract()
    VIEWSTATEGENERATOR = response.xpath("//*[@id='__VIEWSTATEGENERATOR']/@value").extract()
    for i in range(1, 4):
        data = {
            '__EVENTTARGET': 'GridView1',
            '__EVENTARGUMENT': "'Page$" + str(i) + "'",
            '__LASTFOCUS': LASTFOCUS,
            '__VIEWSTATE': VIEWSTATE,
            '__VIEWSTATEGENERATOR': VIEWSTATEGENERATOR,
            '__EVENTVALIDATION': EVENTVALIDATION,
        }
        currentPage = FormRequest.from_response(
            response,
            formdata=data,
            method='POST',
            callback=self.parse,
        )
        # open_in_browser() expects a Response, not a Request, so inspect the page being paginated
        open_in_browser(response)
        yield currentPage
def parse(self, response):
    self.log("parse url %s" % response.url)
    base_url = get_base_url(response)
    self.log("base_url %s" % base_url)
    open_in_browser(response)
    api_url = '/api/v1/stores?'
    d = {'city': 'temp', 'district': '', 'keyword': ''}
    sel = scrapy.Selector(response)
    sites = sel.xpath('//select[@id="store_city"]/option')
    for site in sites:
        option = site.xpath('text()').extract()
        self.log('option = %s' % option)
        d['city'] = option[0].encode('utf8')
        # self.log('city = %s' % d['city'])
        url = urljoin(base_url, api_url + urllib.urlencode(d))
        self.log('url = %s' % url)
        req = scrapy.Request(url, callback=self.parse_geo)
        yield req
def test_open_in_browser(self):
    url = "http:///www.example.com/some/page.html"
    body = "<html> <head> <title>test page</title> </head> <body>test body</body> </html>"

    def browser_open(burl):
        path = urlparse.urlparse(burl).path
        if not os.path.exists(path):
            path = burl.replace('file://', '')
        bbody = open(path).read()
        assert '<base href="%s">' % url in bbody, "<base> tag not added"
        return True

    response = HtmlResponse(url, body=body)
    assert open_in_browser(response, _openfunc=browser_open), \
        "Browser not called"
    self.assertRaises(TypeError, open_in_browser, Response(url, body=body),
                      debug=True)
def test_open_in_browser(self):
    url = "http:///www.example.com/some/page.html"
    body = b"<html> <head> <title>test page</title> </head> <body>test body</body> </html>"

    def browser_open(burl):
        path = urlparse(burl).path
        if not os.path.exists(path):
            path = burl.replace('file://', '')
        with open(path, "rb") as f:
            bbody = f.read()
        self.assertIn(b'<base href="' + to_bytes(url) + b'">', bbody)
        return True

    response = HtmlResponse(url, body=body)
    assert open_in_browser(response, _openfunc=browser_open), \
        "Browser not called"
    resp = Response(url, body=body)
    self.assertRaises(TypeError, open_in_browser, resp, debug=True)
def end_login(self, response):
    open_in_browser(response)
def game_over(self, response):
    open_in_browser(response)
def parse_10qk(self, response):
    print "parse_10qk"
    open_in_browser(response)
def _print_response(self, response, opts):
    open_in_browser(response)
    # TODO: improve the reservation check and auto-submit logic; distinguish this round's reservation from the next round's
    elif 'tip_NoYY.html' in res.url:
        print 'You forgot to reserve!! Nothing can be done this round, call it a day...'
        # while we're at it, ask whether to automatically reserve for the next round
        yyNext = raw_input('Reserve for the next round? (Y/N): ')
        if yyNext.lower() == 'y':
            return self.start_subscribe()
    elif 'tip_tooMuchTry' in res.url:
        # http://p.www.xiaomi.com/m/activities/open/common/tip_tooMuchTry.html
        print 'Temporarily frozen: too many wrong captcha attempts, just wait...'
    else:
        print 'TODO: unknown error, the page has been opened in the browser window'
        open_in_browser(res)
    return

def process_code4subscribe(self, res):
    print ''
    print '>>>>>> process_code4subscribe:'
    print res.url
    # print res.body
    # 1. save the captcha to a local file
    img = open('tmp/a.png', 'wb')  # binary mode so the PNG bytes are written intact
    img.write(res.body)
    img.close()
    print 'captcha downloaded: ', 'tmp/a.png'
    # 2. render it as ASCII in the console
def get_teacher_info(self, response):
    open_in_browser(response)
def parse1(self, response):
    open_in_browser(response)