def confirm_parse(self, response):
    html_response = HtmlResponse(response.url, body=response.body)
    # Compare against the decoded body; encoding it back to UTF-8 bytes (as the
    # original did) would make the `in` check fail on Python 3.
    body = html_response.body_as_unicode()
    # "Esta solicitação de transferência será gratuita" =
    # "This transfer request will be free of charge"
    if 'Esta solicitação de transferência será gratuita' in body:
        data = {
            'acsrfToken': html_response.xpath(
                '//input[@name="acsrfToken"]/@value')[0].extract()
        }
        yield FormRequest(
            url='https://pagseguro.uol.com.br/operations/startWithdraw.jhtml',
            dont_filter=True,
            callback=self.success_parse,
            formdata=data,
            meta=response.meta)
    else:
        yield Request(
            url='https://pagseguro.uol.com.br/operations/changeWithdraw.jhtml',
            dont_filter=True,
            callback=self.logout_parse,
            meta=response.meta)
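# A note on the snippet above: instead of extracting acsrfToken by hand,
# Scrapy's FormRequest.from_response can prefill hidden form fields from the
# page automatically. A minimal sketch, not the original spider's code; it
# assumes the withdraw form is the one from_response picks up by default.
from scrapy.http import FormRequest

def confirm_parse_via_from_response(self, response):
    # from_response reads the form's action URL and hidden inputs
    # (including acsrfToken) directly from the response.
    yield FormRequest.from_response(
        response,
        dont_filter=True,
        callback=self.success_parse,
        meta=response.meta)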
def parse_station_information(self, response: HtmlResponse):
    data = json.loads(response.body_as_unicode())
    model: StationInformationModel = StationInformationModel.parse(data)
    # Chain the station status request and pass the parsed model along in meta.
    # See https://stackoverflow.com/questions/13910357/how-can-i-use-multiple-requests-and-pass-items-in-between-them-in-scrapy-python
    # TODO: is there an RxJS-forkJoin-like mechanism for Twisted? We could then
    # dispatch multiple requests and await the responses together.
    yield scrapy.Request(
        self.feeds['station_status'],
        callback=self.parse_station_status,
        meta={'station_information': model})
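# On the TODO above: Twisted's closest analogue of RxJS forkJoin is
# twisted.internet.defer.gatherResults (built on DeferredList), which fires
# once every Deferred in the list has fired. A minimal sketch, assuming a
# hypothetical fetch_json(url) helper that returns a Deferred of parsed JSON;
# plain Scrapy callbacks cannot await requests this way, so the chained
# request above remains the idiomatic spider-side approach.
from twisted.internet import defer

@defer.inlineCallbacks
def fetch_both_feeds(fetch_json, information_url, status_url):
    # Dispatch both requests at once and resume when both have completed.
    information, status = yield defer.gatherResults([
        fetch_json(information_url),
        fetch_json(status_url),
    ])
    defer.returnValue((information, status))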
def parse(self, response: HtmlResponse):
    data = json.loads(response.body_as_unicode())
    model: GbfsModel = GbfsModel.parse(data)
    self.feeds = {feed.name: feed.url for feed in model.feeds}
    for name, url in self.feeds.items():
        if name == 'system_information':
            yield scrapy.Request(url, callback=self.parse_system_information)
        elif name == 'station_information':
            yield scrapy.Request(url, callback=self.parse_station_information)
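# For context on the dispatch above: a GBFS discovery document (gbfs.json)
# lists the per-language feed URLs roughly like the sketch below (values are
# illustrative, not from a real system); GbfsModel.parse presumably flattens
# data.<language>.feeds into model.feeds.
#
# {
#   "last_updated": 1640887163,
#   "ttl": 0,
#   "data": {
#     "en": {
#       "feeds": [
#         {"name": "system_information", "url": "https://example.com/gbfs/en/system_information.json"},
#         {"name": "station_information", "url": "https://example.com/gbfs/en/station_information.json"},
#         {"name": "station_status", "url": "https://example.com/gbfs/en/station_status.json"}
#       ]
#     }
#   }
# }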
def process_response(self, request, response, spider):
    # log.msg('%s is type %s' % (response.url, type(response)), level=log.DEBUG)
    # Plain (non-HTML) responses that don't look like file downloads are
    # re-wrapped as HtmlResponse so they can be parsed below.
    if type(response) is Response and not _file_pattern.match(response.url):
        response = HtmlResponse(response.url, body=response.body)
    if hasattr(response, 'body_as_unicode'):
        hdoc = html.fromstring(response.body_as_unicode())
        # Only rewrite anchors that actually carry an href: link.get('href')
        # returns None for bare <a> tags and urljoin would raise on None.
        for link in hdoc.xpath('//a[@href]'):
            href = link.get('href')
            link.set('href', urlparse.urljoin(get_base_url(response), href))
        return response.replace(body=html.tostring(hdoc, encoding='unicode'))
    return response
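# A downloader middleware like the one above only runs once it is registered
# in the project's settings.py. A minimal sketch; the module path and class
# name are placeholders, not taken from the snippet above.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.AbsoluteLinksMiddleware': 543,
}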
def parse_station_status(self, response: HtmlResponse):
    data = json.loads(response.body_as_unicode())
    status_model: StationStatusModel = StationStatusModel.parse(data)
    statuses = {station.station_id: station
                for station in status_model.stations}
    # Retrieve the station information model forwarded from the previous request.
    information_model: StationInformationModel = response.meta['station_information']
    for station in information_model.stations:
        station_id = station.station_id
        # Join the two feeds on station_id; a station may be missing from the
        # status feed, in which case the availability fields stay None.
        status: Optional[StationStatusModel.DataModel.StationModel] = \
            statuses.get(station_id)
        bikes_available = None
        docks_available = None
        bikes_disabled = None
        docks_disabled = None
        is_open = None
        if status:
            bikes_available = status.num_bikes_available
            docks_available = status.num_docks_available
            bikes_disabled = status.num_bikes_disabled
            docks_disabled = status.num_docks_disabled
            is_open = status.is_renting
        yield {
            'item_type': 'station',
            'data': attr.asdict(StationItem(
                scraper_id=self.scraper.id,
                source_id=station_id,
                name=station.name,
                address=station.address,
                latitude=station.lat,
                longitude=station.lon,
                capacity=station.capacity,
                bikes_available=bikes_available,
                docks_available=docks_available,
                bikes_disabled=bikes_disabled,
                docks_disabled=docks_disabled,
                open=is_open)),
        }
def after_post(self, response):
    # The page is encoded in cp949 (Korean), so re-wrap the raw body with an
    # explicit encoding before decoding it.
    html_response = HtmlResponse(url=response.url, body=response.body,
                                 encoding='cp949')
    doc = html_response.body_as_unicode()
    # The chart data is embedded in an inline script as
    # "var chartData = ...;" -- extract it with a regex and parse it as JSON.
    pattern = re.compile(r'var chartData = ([^;]*);')
    results = pattern.findall(doc)
    a = json.loads(results[0])
    print(a)
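# A self-contained illustration of the "var chartData = ...;" extraction used
# above; the sample markup is made up for the demo, not taken from the real
# page.
import json
import re

sample = '<script>var chartData = [{"date": "2024-01-02", "close": 1350}];</script>'
match = re.search(r'var chartData = ([^;]*);', sample)
if match:
    chart_data = json.loads(match.group(1))
    print(chart_data[0]['close'])  # -> 1350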
def parse_system_information(self, response: HtmlResponse):
    data = json.loads(response.body_as_unicode())
    model: SystemInformationModel = SystemInformationModel.parse(data)
    system = model.data
    # Note: source_id is deliberately not passed through, since GBFS sources
    # only contain a single system. The station feeds also lack a system_id,
    # so leaving source_id null makes it easier for the pipeline to find the
    # system.
    yield {
        'item_type': 'system',
        'data': attr.asdict(SystemItem(
            scraper_id=self.scraper.id,
            name=system.name,
            # source_id=system.system_id,
            phone_number=system.phone_number,
            email=system.email,
            timezone=system.timezone,
            url=system.url,
            language=system.language)),
    }
def parse_VIN_db(self, response: HtmlResponse, item, page_id):
    # item is the loader carried over from parse_car_spec via cb_kwargs.
    json_VIN_db = json.loads(response.body_as_unicode())
    item.add_value('json_VIN_db', json_VIN_db)
    yield item.load_item()
def parse_car_spec(self, response: HtmlResponse, item, page_id):
    json_car_spec = json.loads(response.body_as_unicode())
    item.add_value('json_car_spec', json_car_spec)
    # Chain the follow-up VIN request, carrying the loader and page id
    # along via cb_kwargs.
    url_VIN_db = f'https://www.avito.ru/web/1/swaha/v1/autoteka/teaser/{page_id}'
    yield response.follow(
        url_VIN_db,
        callback=self.parse_VIN_db,
        cb_kwargs={'item': item, 'page_id': page_id})