Code example #1
    def logged_in(self, response):
        if response.url == 'https://accounts.pixiv.net/login':
            raise CloseSpider('username or password error !')

        yield SplashRequest(self.generate_search_url(), self.parse)
Code example #2
 def start_requests(self):
     for url in self.start_urls:
         # Uncomment below if you want a screenshot of the response
         # yield SplashRequest(url, self.parse, endpoint='render.json', args={"wait": 5, "png": 1, "render_all": 1})
         yield SplashRequest(url, self.parse, args={"wait": 5})
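A note on the commented-out render.json line above: with png=1 Splash returns the rendered page as JSON including a base64-encoded screenshot, and scrapy-splash exposes the decoded JSON as response.data. A minimal sketch of a callback that saves that screenshot (hypothetical; the 'png' field name comes from the Splash HTTP API, not from this project):

 import base64

 def parse(self, response):
     # response.data is the decoded JSON body provided by scrapy-splash
     # for render.json responses; 'png' holds the base64-encoded screenshot.
     with open('screenshot.png', 'wb') as f:
         f.write(base64.b64decode(response.data['png']))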
Code example #3
 def start_requests(self):
     yield SplashRequest(self.start_urls[0],
                         callback=self.scrap_search_result_page,
                         args={'wait': 0.5})
Code example #4
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url=url,
                             callback=self.parse,
                             args={'headers': self.headers, 'wait': 10, 'timeout': 20},
                             encoding='utf-8')
Code example #5
 def start_requests(self):
     yield SplashRequest(
         url="https://groceries.asda.com/shelf/health-beauty/hair-care/shampoo-conditioner/shampoo/103730",
         callback=self.parse,
         endpoint="execute",
         args={'lua_source': self.script})
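This snippet assumes the spider defines self.script, a Splash Lua script for the execute endpoint. The project's actual script is not shown here; a minimal hypothetical script that just loads the page, waits, and returns the HTML might look like:

 # Hypothetical minimal Lua script for the 'execute' endpoint; the real
 # self.script used by the project above is not shown in this collection.
 script = """
 function main(splash, args)
     assert(splash:go(args.url))
     assert(splash:wait(2))
     return {html = splash:html()}
 end
 """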
Code example #6
 def start_requests(self):
     yield SplashRequest(
         url='http://www.rslvic.com.au/rsl-network/victorian-map-of-all-branches/',
         callback=self.parse,
     )
Code example #7
File: scraper.py Project: gobfink/Groceries
 def start_requests(self):
     print("lua script - " + self.expand_and_scroll_lua)
     for url in self.start_urls:
         yield SplashRequest(url, self.parse, args={'wait': 0.5})
Code example #8
File: deadmanrealty.py Project: Tantial/demo
 def go_to_listings(self, response):
     listings_url = response.xpath('//*[@id="menu2"]/ul/li[4]/a/@href').extract_first()
     yield SplashRequest(url=response.urljoin(listings_url), callback=self.parse, args={'wait': 0.5, 'timeout': 60})
Code example #9
File: goodList.py Project: hehanlin/jdDA-spider
 def start_requests(self):
     yield SplashRequest(self.start_url, args={"images": 0, "wait": 3})
Code example #10
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url,
                             self.parse,
                             endpoint='execute',
                             args={'lua_source': script})
Code example #11
File: deadmanrealty.py Project: Tantial/demo
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url, callback=self.login)
Code example #12
File: ieee.py Project: vladlemos/curso-scrapy-aulas
 def splash_request(self, request):
     return SplashRequest(url=request.url,
                          callback=self.parse_conference,
                          args={'wait': 2})
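The splash_request helper above is the kind of adapter commonly wired into a CrawlSpider rule via process_request, so every link the rule follows is fetched through Splash. A hedged sketch of that wiring, assuming a CrawlSpider and a hypothetical link pattern (not taken from the ieee.py project; note that from Scrapy 2.0 on, the process_request callable also receives the response):

 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule

 class ConferenceSpider(CrawlSpider):  # hypothetical spider for illustration
     name = 'ieee_conferences'
     rules = (
         # Route every extracted link through the splash_request method above
         # (referenced by name, so Scrapy looks it up on the spider instance).
         Rule(LinkExtractor(allow=r'/conference/'),
              process_request='splash_request',
              follow=True),
     )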
Code example #13
 def modify_realtime_request(self, request):
     user_url_input = request.meta["url"]
     return SplashRequest(user_url_input,
                          self.parse,
                          args={'lua_source': self.script},
                          endpoint='execute')
Code example #14
File: copadobrasil.py Project: marvinbraga/pybr2017
 def start_requests(self):
     yield SplashRequest(
         url='http://globoesporte.globo.com/mg/futebol/'\
             'copa-do-brasil/jogo/27-09-2017/cruzeiro-flamengo/',
         callback=self.parse, args={'wait': 2}
     )
Code example #15
    def adidas_parse(self, response):
        products = response.xpath('//*[@id="hc-container"]/div')

        for product in products:
            # If the product doesn't have a "coming soon" tag, scrape it
            tag = product.xpath(
                "./div[2]/div[3]/div[2]/span/text()").extract_first()
            if "coming soon" not in tag.lower().strip():
                sneaker = Sneaker()
                root_url = "https://www.adidas.com"

                data = product.xpath("./div/@data-context").extract_first()

                # Name
                m = re.search('name:(.*);', data)
                sneaker["name"] = m.group(1)

                # Model
                m = re.search('model:(.*)', data)
                description = 'Model: ' + m.group(1)

                # Id
                m = re.search('id:(.*);name', data)
                description += ' ID: ' + m.group(1)

                sneaker["description"] = description

                sneaker["image"] = product.xpath(
                    "./div[2]/div[3]/div[3]/a/img[1]/@data-original"
                ).extract_first()

                sneaker["currency"] = product.xpath(
                    "./div[2]/div[3]/div[4]/div[4]/div/span[1]/text()"
                ).extract_first().strip()

                sneaker["price"] = product.xpath(
                    "./div[2]/div[3]/div[4]/div[4]/div/span[2]/text()"
                ).extract_first().strip()

                url = product.xpath(
                    "./div[2]/div[3]/div[3]/a/@href").extract_first()
                sneaker["url"] = root_url + url

                sneaker["tag"] = 'adidas'

                yield sneaker

        self.page += 120
        if products:
            next_page = "http://www.adidas.com/us/men-shoes?sz=120&start=" + str(
                self.page)

            # With proxy
            if self.settings.get('ADIDAS_PROXY_ENABLED'):
                yield SplashRequest(next_page,
                                    self.adidas_parse,
                                    headers=self.adidas_headers(),
                                    args={
                                        'images_enabled': 'false',
                                        'proxy': self.random_proxy()
                                    })

            # Without proxy
            else:
                yield SplashRequest(next_page,
                                    self.adidas_parse,
                                    headers=self.adidas_headers(),
                                    args={'images_enabled': 'false'})
Code example #16
 def start_requests(self):
     yield SplashRequest(url=self.url + '/sapi/category/getAllBrand',
                         callback=self.parse,
                         meta={'splash': {'endpoint': 'render.html'}})
Code example #17
    def start_requests(self):
        urls = list()
        non_shopify_list = list()
        bots_list = list()

        # Get all urls to scrape
        with open(os.path.dirname(__file__) + self.url_file, "rt") as f:
            urls = [url.strip() for url in f.readlines()]

        # Supported non shopify sites list
        with open(os.path.dirname(__file__) + self.non_shopify_file,
                  "rt") as f:
            non_shopify_list = [url.strip() for url in f.readlines()]

        # Supported bots sites list
        with open(os.path.dirname(__file__) + self.bots_file, "rt") as f:
            bots_list = [url.strip() for url in f.readlines()]

        for url in urls:
            t = tldextract.extract(url)
            root = t.domain + '.' + t.suffix
            proxy_enabled = self.settings.get('PROXY_ENABLED')
            adidas_proxy_enabled = self.settings.get('ADIDAS_PROXY_ENABLED')

            # Adidas site (uses scrapy-splash)
            if "adidas.com" in url:
                # With proxy
                if adidas_proxy_enabled:
                    yield SplashRequest(url,
                                        self.adidas_parse,
                                        headers=self.adidas_headers(),
                                        args={
                                            'images_enabled': 'false',
                                            'proxy': self.random_proxy()
                                        })

                # Without proxy
                else:
                    yield SplashRequest(url,
                                        self.adidas_parse,
                                        headers=self.adidas_headers(),
                                        args={'images_enabled': 'false'})

            # Non shopify site
            elif any(root in s for s in non_shopify_list):
                # With proxy
                if proxy_enabled:
                    yield scrapy.Request(url,
                                         self.non_shoify,
                                         meta={'proxy': self.random_proxy()})

                # Without proxy
                else:
                    yield scrapy.Request(url, self.non_shoify)

            # Bots
            elif any(root in s for s in bots_list):
                # With proxy
                if proxy_enabled:
                    yield scrapy.Request(url,
                                         self.bots_parse,
                                         meta={'proxy': self.random_proxy()})

                # Without proxy
                else:
                    yield scrapy.Request(url, self.bots_parse)

            # Shopify sites
            else:
                # With proxy
                if proxy_enabled:
                    yield scrapy.Request(url,
                                         self.shopify_parse,
                                         meta={'proxy': self.random_proxy()})

                # Without proxy
                else:
                    yield scrapy.Request(url, self.shopify_parse)
Code example #18
 def start_requests(self):
     sampleURL = 'https://www.fahasa.com/sach-trong-nuoc/van-hoc-trong-nuoc/page/'
     for i in range(0, self.numOfPage):
         yield SplashRequest(sampleURL + str(i + 1) + '.html',
                             self.parse,
                             args={"wait": 5})
Code example #19
 def start_requests(self):
     yield SplashRequest(
         url='https://angel.co/companies?locations[]=1688-United+States&tab=hiring&stage[]=Series+A&stage[]=Series+B&stage[]=Series+C',
         callback=self.parse,
     )
Code example #20
 def start_requests(self):
     urls = ['http://stock.qq.com/l/stock/ywq/list20150423143546.htm']
     for url in urls:
         yield SplashRequest(url=url, callback=self.parse, args={'wait': 0.5})
Code example #21
File: scraper.py Project: gobfink/Groceries
    def parse(self, response):
        # This callback determines whether the selected menu is
        # at the top of the list. If it is, it adds the URLs
        # to the list and keeps going;
        # if it's not, it calls the Lua script to prepare the page
        # for scraping, and then scrapes it.
        url = response.url

        menu = response.css(".category-filter__link")
        #submenu = response.css("")
        #print ("self.urls - " +str(self.urls))
        print("processing response.url - " + response.url)

        #print ("menu: ")
        #print (menu.getall())
        #print ("len(menu): " + str(len(menu)))
        #print ("menu[0] : " + menu.get())
        #print("name - " + menu[0].css('.category-filter__text ::text').get())
        #inspect_response(response,self)

        if (len(menu) > 0 and menu[0].css('[aria-current="page"]')):
            print(f"inside menu page for url - {url}")
            # The top page is active
            #print ("menu[0] : [aria-current=page] " + menu[0].css('[aria-current="page"]').get())
            # therefore we need to scrape the links and continue searching;
            # we then need to loop through each of the other pages,
            # calling parse on each and scraping the ones that are not the top page
            menu_url = menu[0].css('::attr(href)').get()

            menu_name = menu[0].css('.category-filter__text ::text').get()
            for item in menu:
                heading = item.css('.category-filter__text ::text').get()
                scraped_url = item.css('::attr(href)').get()
                scraped_url = self.base_url + scraped_url
                section = menu_name
                subsection = heading
                category = lookup_category("", section, subsection)
                store_url(self.conn, scraped_url, self.store_id, category,
                          section, subsection)

                #self.section_dict[url]=(menu_name, heading)
                #if self.urls.count(url) == 0:
                #    self.urls.append(url)

            #urls=menu.css('::attr(href)').getall()
            # Remove the first (this) page from the list to parse
            #urls.pop()
            #self.urls.extend(urls)
            #print("urls to scrape - " + str(self.urls))
            #print("local urls - " + str(urls))
            """
            while len(self.urls) != 0:
                url = self.urls.pop()
                self.processedUrls.append(url)
                #url = self.base_url + url_suffix
                #print ("urls - " + str(self.urls))
                #print ("pulling from url - " + url)
                #print ("urls lengths - " + str(len(self.urls)))
                yield SplashRequest(url,
                                self.parse,
                                endpoint='execute',
                                args={'lua_source': self.expand_and_scroll_lua})
            """

        elif (len(menu) == 0):
            inspect_response(response, self)

        else:
            #we are on a subpage, so now we can start scraping
            #

            GROCERY_SELECTOR = '.grid-item'
            NAME_SELECTOR = '.small-type.detail-card-description ::text'
            PRICE_SELECTOR = '.price ::text'
            PRICE_PER_UNIT_SELECTOR = '.sub-headline.detail-card-subtext ::text'

            metadata = get_url_metadata(self.cursor, url)
            section = metadata[0]
            subsection = metadata[1]
            print("subpage - scraping " + url + ", from section - " + section)
            for grocery in response.css(GROCERY_SELECTOR):
                self.name = grocery.css(NAME_SELECTOR).extract_first()
                self.price = grocery.css(PRICE_SELECTOR).extract_first()
                if self.price is not None:
                    self.price = self.price.replace('*', '').replace('$', '')
                self.ppu = grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first()
                if self.ppu is not None:
                    self.ppu = convert_ppu(self.ppu)
                #inspect_response(response, self)
                #parse the ounces off of the name
                yield {
                    'name': self.name,
                    'price': self.price,
                    'price-per-unit': self.ppu,
                    'section': section,
                    'subsection': subsection,
                    'url': response.url
                }
        finish_url(self.conn, self.store_id, url)
        print("finishing url - " + url)
        next_url = get_next_url(self.cursor, 1)
        if next_url is not None:
            print("got next_url - " + next_url)
            yield SplashRequest(
                next_url,
                self.parse,
                endpoint='execute',
                dont_filter=True,
                args={'lua_source': self.expand_and_scroll_lua})
        else:
            print("Next url is none therefore we must be finished ! ")
Code example #22
File: quotes.py Project: L-fly123456/ScrapyProject
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url)
Code example #23
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url,
                             self.parse,
                             endpoint='render.html',
                             args={'wait': 5})
Code example #24
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url,
                             endpoint="render.html",
                             callback=self.parse)
Code example #25
 def get_request(self, url):
     return SplashRequest(url=url,
                          endpoint='execute',
                          cache_args=['lua_source'],
                          args={'lua_source': self.lua_script},
                          cb_kwargs={'provider': self})
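In this example, cache_args=['lua_source'] lets scrapy-splash cache the Lua script on the Splash server instead of re-sending it with every request. That behaviour depends on the standard scrapy-splash middlewares being enabled; a sketch of the usual settings.py entries it assumes (standard scrapy-splash setup, not taken from this project):

 # Standard scrapy-splash configuration assumed by cache_args (sketch only).
 SPLASH_URL = 'http://localhost:8050'  # assumed local Splash instance
 DOWNLOADER_MIDDLEWARES = {
     'scrapy_splash.SplashCookiesMiddleware': 723,
     'scrapy_splash.SplashMiddleware': 725,
     'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
 }
 SPIDER_MIDDLEWARES = {
     'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
 }
 DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'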
Code example #26
File: cool.py Project: yhr-git/study_git
 def start_requests(self):
     for page in range(1, 11):
         url = self.base_url % page
         yield SplashRequest(url=url, args={'wait': 3, 'images': 0})
Code example #27
    def start_requests(self):
        for title in self.start_urls:
            url = self.start_urls[title]
            # identify the stock area, like HK, US, SAR, etc.
            if 'sau' in title.lower():
                stock_area = 'SAU'
                stock_come = 'SAU'
                yield SplashRequest(url,
                                    endpoint='execute',
                                    args={
                                        'lua_source': self.lua_extract_page,
                                        'images': 0,
                                        'timeout':
                                        self.rendering_page_timeout + 30
                                    },
                                    callback=self.extract_page,
                                    meta={
                                        'stock_area': stock_area,
                                        'stock_come': stock_come
                                    })
            elif 'hk' in title.lower():
                stock_area = 'HK'
                stock_come = 'CN'
                if 'sp' not in title.lower():
                    yield SplashRequest(url,
                                        endpoint='execute',
                                        args={
                                            'lua_source':
                                            self.lua_extract_page,
                                            'images': 0,
                                            'timeout':
                                            self.rendering_page_timeout
                                        },
                                        callback=self.parse_page_num,
                                        meta={
                                            'stock_area': stock_area,
                                            'stock_come': stock_come
                                        })
                else:
                    # extract the page directly;
                    # get the stock_id first
                    stock_id = url.split('/')[-1].split('.')[0]
                    yield SplashRequest(url,
                                        endpoint='execute',
                                        args={
                                            'lua_source':
                                            self.lua_extract_page,
                                            'images': 0,
                                            'timeout':
                                            self.rendering_page_timeout
                                        },
                                        callback=self.extract_page,
                                        dont_filter=True,
                                        meta={
                                            'stock_name': 'None',
                                            'stock_id': stock_id,
                                            'stock_area': stock_area,
                                            'stock_come': stock_come
                                        })

            elif 'us_chinese' in title.lower():
                stock_area = 'US'
                stock_come = 'CN'
                yield SplashRequest(url,
                                    endpoint='execute',
                                    args={
                                        'lua_source': self.lua_extract_page,
                                        'images': 0,
                                        'timeout': self.rendering_page_timeout
                                    },
                                    callback=self.parse_page_num,
                                    meta={
                                        'stock_area': stock_area,
                                        'stock_come': stock_come
                                    })
            elif 'united_states' in title.lower():
                stock_area = 'US'
                stock_come = 'US'
                for i in range(1, self.united_states_pages + 1):
                    real_lua_source = self.lua_United_states_pages.format(i)
                    yield SplashRequest(url,
                                        endpoint='execute',
                                        args={
                                            'lua_source': real_lua_source,
                                            'images': 0,
                                            'timeout':
                                            self.rendering_page_timeout
                                        },
                                        callback=self.parse,
                                        meta={
                                            'stock_area': stock_area,
                                            'stock_come': stock_come
                                        })
            elif 'hs_a' in title.lower():
                stock_area = 'CN'
                stock_come = 'CN'
                for i in range(1, self.hsa_default_pages + 1):
                    real_lua_source = self.lua_HSA_pages.format(i)
                    #print(real_lua_source)
                    yield SplashRequest(url,
                                        endpoint='execute',
                                        args={
                                            'lua_source': real_lua_source,
                                            'images': 0,
                                            'timeout':
                                            self.rendering_page_timeout
                                        },
                                        callback=self.parse,
                                        dont_filter=True,
                                        meta={
                                            'stock_area': stock_area,
                                            'stock_come': stock_come
                                        })
Code example #28
 def start_requests(self):
     for url in self.start_url:
         yield SplashRequest(url, callback=self.parse)
Code example #29
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url, self.parse, args={'wait': 0.5})
Code example #30
 def start_requests(self):
     setting = self.settings
     page = setting['CRAWL_PAGE']
     for p in range(1, page + 1):
         yield SplashRequest(self.generate_search_url(page=p), self.parse)