Example #1
    def confirm_parse(self, response):
        html_response = HtmlResponse(response.url, body=response.body)
        body = html_response.body_as_unicode()

        # "Esta solicitação de transferência será gratuita" = "this transfer
        # request will be free of charge"; only then submit the withdrawal form.
        if 'Esta solicitação de transferência será gratuita' in body:
            data = {
                'acsrfToken': response.xpath(
                    '//input[@name="acsrfToken"]/@value')[0].extract()
            }

            yield FormRequest(
                url='https://pagseguro.uol.com.br/operations/startWithdraw.jhtml',
                dont_filter=True,
                callback=self.success_parse,
                formdata=data,
                meta=response.meta)
        else:
            yield Request(
                url='https://pagseguro.uol.com.br/operations/changeWithdraw.jhtml',
                dont_filter=True,
                callback=self.logout_parse,
                meta=response.meta)
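A side note on the token handling above: Scrapy's FormRequest.from_response helper can pick up hidden fields such as acsrfToken automatically, assuming the page's form posts to the withdrawal endpoint. A minimal sketch (the formxpath is an assumption about the page's markup):

yield FormRequest.from_response(
    response,
    formxpath='//form[.//input[@name="acsrfToken"]]',  # assumed form selector
    callback=self.success_parse,
    dont_filter=True,
    meta=response.meta)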
Example #2
    def parse_station_information(self, response: HtmlResponse):
        data = json.loads(response.body_as_unicode())
        model: StationInformationModel = StationInformationModel.parse(data)
        # Chain the station status request, passing the parsed model along in meta.
        # See https://stackoverflow.com/questions/13910357/how-can-i-use-multiple-requests-and-pass-items-in-between-them-in-scrapy-python
        # TODO is there an RxJS-forkJoin-like mechanism for Twisted? It could then
        # dispatch multiple requests and await the responses together.
        yield scrapy.Request(self.feeds['station_status'],
                             callback=self.parse_station_status,
                             meta={'station_information': model})
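On the TODO above: Twisted's closest analog to RxJS forkJoin is twisted.internet.defer.DeferredList, which fires once every Deferred in the list has a result. A minimal sketch, assuming a hypothetical fetch() helper that returns a Deferred per feed (plain Scrapy callbacks do not expose one):

from twisted.internet import defer

@defer.inlineCallbacks
def fetch_all_feeds(fetch):
    # Dispatch both requests at once; DeferredList fires when every
    # Deferred has either succeeded or failed.
    results = yield defer.DeferredList([
        fetch('station_information'),
        fetch('station_status'),
    ], consumeErrors=True)
    defer.returnValue(results)  # list of (success, value) pairs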
Example #3
    def parse(self, response: HtmlResponse):
        data = json.loads(response.body_as_unicode())
        model: GbfsModel = GbfsModel.parse(data)
        self.feeds = {feed.name: feed.url for feed in model.feeds}
        for name, url in self.feeds.items():
            if name == 'system_information':
                yield scrapy.Request(url,
                                     callback=self.parse_system_information)
            elif name == 'station_information':
                yield scrapy.Request(url,
                                     callback=self.parse_station_information)
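The if/elif routing above could also be table-driven; a hypothetical sketch of the same dispatch, not the author's code:

callbacks = {
    'system_information': self.parse_system_information,
    'station_information': self.parse_station_information,
}
for name, url in self.feeds.items():
    if name in callbacks:
        yield scrapy.Request(url, callback=callbacks[name])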
Example #4
    def process_response(self, request, response, spider):
        # Assumed imports: from lxml import html; from urllib.parse import urljoin;
        # from scrapy.http import Response, HtmlResponse;
        # from scrapy.utils.response import get_base_url
        # Re-wrap plain Response objects as HtmlResponse so they can be parsed.
        if type(response) is Response and not _file_pattern.match(response.url):
            response = HtmlResponse(response.url, body=response.body)

        if hasattr(response, 'body_as_unicode'):
            hdoc = html.fromstring(response.body_as_unicode())
            # Rewrite every <a href> to an absolute URL against the page's base.
            for link in hdoc.xpath('//a'):
                href = link.get('href')
                link.set('href', urljoin(get_base_url(response), href))
            return response.replace(body=html.tostring(hdoc, encoding='unicode'))
        return response
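A downloader middleware like this one only runs once it is registered in the project settings. A sketch, where the module path and class name are assumptions:

# settings.py (hypothetical project and class names)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.AbsoluteLinksMiddleware': 543,
}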
Example #5
    def parse_station_status(self, response: HtmlResponse):
        data = json.loads(response.body_as_unicode())
        status_model: StationStatusModel = StationStatusModel.parse(data)
        statuses = {
            station.station_id: station
            for station in status_model.stations
        }

        # Get the model passed along from parse_station_information via meta
        information_model: StationInformationModel = (
            response.meta['station_information'])

        for station in information_model.stations:
            station_id = station.station_id
            status: Optional[StationStatusModel.DataModel.StationModel] = (
                statuses.get(station_id))

            bikes_available = None
            docks_available = None
            bikes_disabled = None
            docks_disabled = None
            is_open = None

            if status:
                bikes_available = status.num_bikes_available
                docks_available = status.num_docks_available
                bikes_disabled = status.num_bikes_disabled
                docks_disabled = status.num_docks_disabled
                is_open = status.is_renting

            yield {
                'item_type': 'station',
                'data': attr.asdict(
                    StationItem(scraper_id=self.scraper.id,
                                source_id=station_id,
                                name=station.name,
                                address=station.address,
                                latitude=station.lat,
                                longitude=station.lon,
                                capacity=station.capacity,
                                bikes_available=bikes_available,
                                docks_available=docks_available,
                                bikes_disabled=bikes_disabled,
                                docks_disabled=docks_disabled,
                                open=is_open)),
            }
Example #6
    def parse_system_information(self, response: HtmlResponse):
        data = json.loads(response.body_as_unicode())
        model: SystemInformationModel = SystemInformationModel.parse(data)
        system = model.data
        # Note - not passing source_id through, as gbfs sources only contain a
        # single system. In addition, the station feeds do not contain the
        # system_id, so it is easier for the pipeline to find the system when
        # its source_id is null.
        yield {
            'item_type': 'system',
            'data': attr.asdict(
                SystemItem(
                    scraper_id=self.scraper.id,
                    name=system.name,
                    # source_id=system.system_id,
                    phone_number=system.phone_number,
                    email=system.email,
                    timezone=system.timezone,
                    url=system.url,
                    language=system.language)),
        }
Example #7
    def after_post(self, response):
        # Re-wrap the response so the cp949-encoded body decodes correctly.
        html_response = HtmlResponse(url=response.url,
                                     body=response.body,
                                     encoding='cp949')
        doc = html_response.body_as_unicode()

        # The chart data is embedded in the page as a JavaScript variable.
        pattern = re.compile(r'var chartData =  ([^;]*);')
        results = pattern.findall(doc)
        chart_data = json.loads(results[0])
        print(chart_data)
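The chartData regex above can be checked in isolation. A self-contained sketch with a made-up sample payload (the real page's data is not shown here):

import json
import re

sample = 'var chartData =  [{"date": "2020-01-02", "close": 1250}];'  # fabricated sample
pattern = re.compile(r'var chartData =  ([^;]*);')
print(json.loads(pattern.findall(sample)[0]))  # [{'date': '2020-01-02', 'close': 1250}]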
Example #8
    def parse_VIN_db(self, response: HtmlResponse, item, page_id):
        json_VIN_db = json.loads(response.body_as_unicode())
        item.add_value('json_VIN_db', json_VIN_db)
        yield item.load_item()
Example #9
    def parse_car_spec(self, response: HtmlResponse, item, page_id):
        json_car_spec = json.loads(response.body_as_unicode())
        item.add_value('json_car_spec', json_car_spec)
        url_VIN_db = f'https://www.avito.ru/web/1/swaha/v1/autoteka/teaser/{page_id}'
        yield response.follow(url_VIN_db,
                              callback=self.parse_VIN_db,
                              cb_kwargs={'item': item, 'page_id': page_id})
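The last two examples thread state between callbacks through cb_kwargs (available since Scrapy 1.7) rather than response.meta as in Example #2; each key in cb_kwargs must match a parameter of the callback. A minimal sketch with hypothetical callback names:

def parse_listing(self, response: HtmlResponse):
    # Each cb_kwargs key becomes a keyword argument of the callback.
    yield response.follow('/detail/1',
                          callback=self.parse_detail,
                          cb_kwargs={'page_id': 1})

def parse_detail(self, response: HtmlResponse, page_id):
    yield {'page_id': page_id, 'title': response.css('title::text').get()}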