コード例 #1
0
    def start_requests(self):
        """Kick off the crawl: if the start page has already been scraped,
        resume from the next pending url; otherwise request the start page
        and collect the menu."""
        first_url = self.start_urls[0]

        if is_url_scraped(self.cursor, first_url, self.store_id,
                          scrape_urls=True):
            # Start page already handled - resume from the queue.
            #next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
            #                        scrape_urls=True,filter=self.page_string,reverse_filter=True)
            resume_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                                      scrape_urls=True)
            request = create_unfiltered_parse_request(
                resume_url,
                self.handle_first_request,
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#shopping-selector-parent-process-modal-close-click')),
                meta_url=resume_url,
                errback=self.handle_pagination)
        else:
            # Fresh crawl - wait for the catalog nav, then walk the menu.
            request = create_unfiltered_parse_request(
                first_url,
                self.collect_menu,
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '[id="catalog-nav-main-shop.categories"]')),
                meta_url=first_url)

        store_url(self.conn, first_url, self.store_id, "Start", "", "")
        yield request
コード例 #2
0
    def collect_menu(self, response):
        """Walk every department card in the rendered menu, storing each
        aisle link, then chain the next pending request."""
        self.logger.info("inside collect_menu! ")
        self.driver = response.request.meta['driver']
        close_modal(self)
        change_store_location(self)

        for department in self.driver.find_elements_by_css_selector(
                '[category-filter="subcategory"]'):
            dept_name = department.find_element_by_css_selector(
                '[data-test="category-card-"]').text
            links = department.find_elements_by_css_selector('a')
            self.logger.info(f"dept_name: {dept_name}")
            self.aisles = links
            for link in links:
                aisle_name = link.text
                aisle_href = link.get_attribute("href")
                store_url(self.conn, aisle_href, self.store_id,
                          lookup_category("", dept_name, aisle_name),
                          dept_name, aisle_name)

        self.logger.info("finished collect_menu! ")
        finish_url(self.conn, self.store_id, response.url, scrape_urls=True)
        yield self.get_next_request()
コード例 #3
0
    def scrape_urls(self, response):
        """Parse the '"sections"' blob in the response body, store a url
        row for every section/subsection link, then queue the next pending
        url for parsing.

        The payload is not strictly valid JSON, so it is cleaned with
        string surgery instead of json.loads.
        """
        #1. sort through data and extract urls
        #2. put urls together
        #3. Loop to each url, returning @parse
        base_url = "https://www.walmart.com"
        self.raw = response.body_as_unicode()
        # Strip wrapper characters, then split into '"key":"value"' pairs.
        remove = ['{', '}', 'Link', ' ']
        self.cleaned = self.raw
        for char in remove:
            self.cleaned = self.cleaned.replace(char, '')
        self.comma_split = self.cleaned.split('","')
        self.colon_split = [entry.split('":"') for entry in self.comma_split]
        # Drop the leading '"sections' key fragment from the first entry.
        self.colon_split[0].remove('"sections')
        self.urls = [entry[-1] for entry in self.colon_split]

        section = "unset"
        subsection = "unset"

        self.section_dict = {}
        chars_to_remove = ["\'", "&"]
        for entry in self.colon_split:
            # each entry will have a subheading (normally at 0 unless it
            # has a heading entry)
            section = clean_string(entry[0], chars_to_remove)
            url_end = clean_string(entry[-1], "\"")

            # if it's a section header it will contain 3 entries
            #   and all subsequent entries will have the same heading
            if len(entry) > 2:
                section = clean_string(entry[0], chars_to_remove)
                subsection = clean_string(entry[1], chars_to_remove)

            url = base_url + url_end
            category = lookup_category("", section, subsection)
            store_url(self.conn, url, self.store_id, category, section,
                      subsection)

        next_url = get_next_url(self.cursor, 1)
        if next_url is None:
            print("No more urls to parse finishing")
        else:
            # BUG FIX: previously requested `url` (the last menu link from
            # the loop above) instead of the queued `next_url`.
            yield SplashRequest(next_url,
                                self.parse,
                                endpoint='render.html',
                                args={
                                    'wait': 10,
                                    'section': section,
                                    'subsection': subsection
                                })
コード例 #4
0
ファイル: urlScraper.py プロジェクト: gobfink/Groceries
    def parse(self, response):
        """When the active menu is at the top of the category filter,
        store every sibling link under the active heading; then mark this
        url finished and chain a SplashRequest for the next pending url."""
        url = response.url
        menu = response.css(".category-filter__link")
        print("processing response.url - " + response.url)

        if len(menu) > 0 and menu[0].css('[aria-current="page"]'):
            # The top entry is the current page, so scrape the links and
            # record each one under the active menu's heading.
            print(f"inside menu page for url - {url}")
            menu_url = menu[0].css('::attr(href)').get()
            menu_name = menu[0].css('.category-filter__text ::text').get()
            for entry in menu:
                heading = entry.css('.category-filter__text ::text').get()
                link = self.base_url + entry.css('::attr(href)').get()
                category = lookup_category("", menu_name, heading)
                store_url(self.conn, link, self.store_id, category,
                          menu_name, heading)
        elif len(menu) == 0:
            # Unexpected page shape - drop into the scrapy shell.
            inspect_response(response, self)

        finish_url(self.conn, self.store_id, url, True)
        print("finishing url - " + url)

        next_url = get_next_url(self.cursor, 1, self.store_id, True)
        if next_url is None:
            print("Next url is none therefore we must be finished ! ")
            return
        print("got next_url - " + next_url)
        yield SplashRequest(
            next_url,
            self.parse,
            endpoint='execute',
            dont_filter=True,
            args={'lua_source': self.expand_and_scroll_lua})
コード例 #5
0
ファイル: groceryScraper.py プロジェクト: gobfink/Groceries
    def start_requests(self):
        """Record the start url and issue the initial request, waiting for
        the onboarding modal's close button to be clickable."""
        url = self.start_urls[0]

        wait_condition = EC.element_to_be_clickable(
            (By.CSS_SELECTOR,
             '[data-automation-id="onboardingModalCloseBtn"]'))
        initial_request = create_nocookies_request(url,
                                                   self.handle_onboard,
                                                   wait_condition,
                                                   meta_url=url)
        store_url(self.conn, url, self.store_id, "Start", "", "")
        yield initial_request
コード例 #6
0
    def parse_urls(self, response):
        """Collect section/subsection urls from the category page, store
        them, and chain the next request (remaining seed url first,
        otherwise the next pending url from the database)."""
        self.driver = response.request.meta['driver']
        # Read the location from the live DOM via the driver. (A dead
        # duplicate assignment from response.css was removed here - it was
        # immediately overwritten.)
        location = self.driver.find_element_by_css_selector(
            '[data-test="store-button"]').text
        print(f"detected location - {location}")
        if location != self.location:
            self.change_store_location(response)

        self.section_group = response.css(".subcategory.category")
        section_group = response.css(".subcategory.category")
        for section in section_group:
            section_name = section.css(".css-1pita2n ::text").get()
            url_nodes = section.css("ul.children a")
            for url_node in url_nodes:
                subsection_name = url_node.css("::text").get()
                url = self.base_url + url_node.css("::attr(href)").get()

                store_url(self.conn, url, self.store_id,
                          lookup_category("", section_name, subsection_name),
                          section_name, subsection_name)

        finish_url(self.conn, self.store_id, response.url)

        # Prefer any remaining seed url; otherwise pull from the queue.
        function = self.parse
        item_to_find = '[add-to-cart]'
        if len(self.start_urls) != 0:
            next_url = self.start_urls.pop()
            store_url(self.conn, next_url, self.store_id, "", "", "")
            function = self.parse_urls
            item_to_find = '[data-test="store-button"]'
        else:
            next_url = get_next_url(self.cursor, 1)

        if next_url is None:
            print("No more URLs to parse. Finishing")
            return
        request = self.create_parse_request(
            next_url, function,
            EC.element_to_be_clickable((By.CSS_SELECTOR, item_to_find)))

        #FIXME these try except blocks don't actually handle timeout exceptions from navigating to the wrong url
        try:
            yield request
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
            # are not swallowed. Fallback: skip ahead in the queue.
            print(f"Parse -  Errored out processing request for - {next_url} ")
            next_url = get_next_url(self.cursor, 2)
            print(f"Parse - Now handling {next_url}")
            request = self.create_parse_request(
                next_url, self.parse,
                EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))
            yield request
コード例 #7
0
ファイル: urlScraper.py プロジェクト: gobfink/Groceries
    def walk_menu(self, response):
        """Open the navigation panel, click through every department, and
        store the url of every aisle link found; then queue the next
        pending aisle url for pagination handling.

        Requires a Selenium driver attached to the request meta.
        """
        # inspect_response(response,self)
        self.driver = response.request.meta['driver']
        self.logger.info('Inside walk_menu')
        start_url = self.driver.current_url
        menu_button = self.driver.find_element_by_css_selector(
            '[data-automation-id="NavigationBtn"]')
        menu_button.click()

        # Give the panel a moment to open before querying it.
        time.sleep(.5)

        departments = self.driver.find_elements_by_css_selector(
            '.NavigationPanel__department___1DF7d button')
        for department in departments:
            department_name = department.get_attribute('aria-label')
            department.click()
            # Wait for this department's aisle list to render.
            time.sleep(.5)
            aisles = self.driver.find_elements_by_css_selector(
                '.NavigationPanel__aisleLink___309i2')
            for aisle in aisles:
                url = aisle.get_attribute('href')
                aisle_name = aisle.get_attribute('innerText')
                # self.department_name = department_name
                # self.aisle_name = aisle_name
                self.logger.info(
                    f"department_name: {department_name}, aisle_name: {aisle_name}"
                )
                category = lookup_category("", department_name, aisle_name)
                self.logger.info(f"Storing aisle: {aisle_name}, url: {url}")
                store_url(self.conn, url, self.store_id, category,
                          department_name, aisle_name)

        finish_url(self.conn, self.store_id, start_url, scrape_urls=True)
        # Only aisle pages ("aisle=") are eligible for pagination walking.
        next_url = get_next_url(self.cursor,
                                1,
                                store_id=self.store_id,
                                scrape_urls=True,
                                filter="aisle=")
        if next_url is None:
            self.logger.debug(
                "Next_url is None therefore we must be finished!")
            return

        self.next_url = next_url
        pagination_request = create_parse_request(next_url,
                                                  self.handle_pagination,
                                                  EC.element_to_be_clickable(
                                                      (By.CSS_SELECTOR,
                                                       self.PAGE_LOAD)),
                                                  errback=self.retry,
                                                  meta_url=next_url,
                                                  cookies=False)

        yield pagination_request
コード例 #8
0
ファイル: urlScraper.py プロジェクト: gobfink/Groceries
 def walk_through_pages(self, section, subsection):
     """Follow the '.next-arrow' pager until it disappears, storing the
     url (and grocery quantity) of every page visited."""
     category = lookup_category("", section, subsection)
     # Iterative form of the original tail recursion: keep clicking the
     # next arrow until the page no longer has one.
     while True:
         try:
             next_arrow = self.driver.find_element_by_css_selector(
                 '.next-arrow')
         except NoSuchElementException:
             # No pager on this page - we are done.
             return
         self.handle_click(next_arrow, self.delay)
         store_url(self.conn, self.driver.current_url, self.store_id,
                   category, section, subsection, self.get_quantity())
コード例 #9
0
    def start_requests(self):
        """Look up the store id, then request the first seed url, waiting
        for the store button before parsing its urls."""
        self.store_id = find_store_id(self.cursor, self.store_name,
                                      self.location)

        if len(self.start_urls) != 0:
            url = self.start_urls.pop()
            store_url(self.conn, url, self.store_id, "", "", "")
            print(f"Starting requests with - {url}")
            # BUG FIX: wait_until previously used the misspelled selector
            # '[data-test="store_button"]' (underscore) and was then left
            # unused in favor of an inline duplicate; it now holds the
            # correct selector and is actually used.
            wait_until = EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '[data-test="store-button"]'))
            request = self.create_parse_request(url, self.parse_urls,
                                                wait_until)
            yield request
        else:
            print("start_requests - len(start_urls) == 0 : exiting")
コード例 #10
0
ファイル: urlScraper.py プロジェクト: gobfink/Groceries
    def handle_pagination(self, response):
        """If the current page has a next button, store the next page url
        (with its grocery count); then finish this url and chain the next
        pending aisle url."""
        self.logger.info('inside handle_pagination')
        url = self.driver.current_url

        next_buttons = self.driver.find_elements_by_css_selector(
            self.NEXT_BUTTON_SELECTOR)
        # inspect_response(response,self)
        if next_buttons:
            next_page_url = get_next_pagination(self.PAGE_STRING, url)
            category, section, subsection = get_url_metadata(
                self.cursor, url)[:3]
            raw_count = self.driver.find_element_by_css_selector(
                '.Title__browseTotalCount___OWylh').get_attribute('innerText')
            # The counter text contains extra words; keep the first number.
            quantity = re.findall('[0-9]+', raw_count)[0]
            store_url(self.conn,
                      next_page_url,
                      self.store_id,
                      category,
                      section,
                      subsection,
                      grocery_quantity=quantity)

        finish_url(self.conn, self.store_id, url, scrape_urls=True)

        next_url = get_next_url(self.cursor,
                                1,
                                store_id=self.store_id,
                                scrape_urls=True,
                                filter="aisle=")
        if next_url is None:
            self.logger.debug(
                "Next_url is None therefore we must be finished!")
            return
        yield create_parse_request(next_url,
                                   self.handle_pagination,
                                   EC.element_to_be_clickable(
                                       (By.CSS_SELECTOR, self.PAGE_LOAD)),
                                   errback=self.retry,
                                   meta_url=next_url,
                                   cookies=False)
コード例 #11
0
    def scrape_urls(self,response):
        """Persist urls for the main category groups, the sibling aisles,
        and — when a load-more button is present — the next pagination
        page."""
        #TODO can probably infer some categories from location
        for group in response.css(
                '.col-12.col-sm-12.col-md-4.col-lg-4.col-xl-3'):
            view_all = group.css(
                '.text-uppercase.view-all-subcats ::attr(href)').get()
            view_all_url = self.base_url + view_all
            section = group.css(
                '.product-title.text-uppercase ::text').get().strip()
            category = lookup_category("", section, "")
            store_url(self.conn, view_all_url, self.store_id, category,
                      section, "")

        for sibling in response.css('.siblingAisle'):
            sibling_url = self.base_url + sibling.css('::attr(href)').get()
            section = response.css(
                '[aria-current="location"] ::text').get().strip()
            subsection = sibling.css('::text').get().strip()
            category = lookup_category("", section, subsection)
            store_url(self.conn, sibling_url, self.store_id, category,
                      section, subsection)

        # A load-more button means there is another page: store the next
        # pagination url derived from the breadcrumb path.
        load_more = response.css(
            '.primary-btn.btn.btn-default.btn-secondary.bloom-load-button'
        ).get()
        if load_more is not None:
            path = response.css(
                '[aria-current]:not(.menu-nav__sub-item) ::text').getall()
            section = path[1].strip()
            subsection = path[-2].strip()
            category = lookup_category("", section, subsection)
            next_page_url = get_next_pagination(self.page_str, response.url)
            print (f'load-more-button. storing - {next_page_url}, section - {section}, subsection - {subsection}, category - {category}')
            store_url(self.conn, next_page_url, self.store_id, category,
                      section, subsection)
コード例 #12
0
ファイル: urlScraper.py プロジェクト: gobfink/Groceries
    def crawl_2nd_layer_menu(self, section, subsection):
        """Click through every 2nd-layer menu entry under the given
        section/subsection, storing each resulting page url with its
        grocery quantity, then navigate back out via the breadcrumb.

        2nd-layer entries are recorded with the layer-2 name appended to
        the subsection as "subsection: entry".
        """
        section_url = self.driver.current_url
        section_category = lookup_category("", section, subsection)

        #print (f"inside crawl_2nd_layer_menu for {section}:{subsection} with url - {section_url}")

        sections = self.driver.find_elements_by_css_selector(
            '#collapseOne > li > a')
        next_section = self.get_next_2nd_layer_section(section, subsection,
                                                       sections)

        while next_section is not None:
            current_section = next_section
            section_text = current_section.get_attribute('innerText')
            #The trick here is that for 2nd layer sections is to append the layer2 info on the subsection
            subsection_text = subsection + ": " + section_text

            self.handle_click(current_section, self.delay)

            #print (f"subsection_text - {subsection_text}")
            current_url = self.driver.current_url
            category = lookup_category("", section, subsection_text)
            num_groceries = self.get_quantity()
            #We'll need to handle the pagination here, because we don't revisit this spot
            self.walk_through_pages(section, subsection_text)
            store_url(self.conn, current_url, self.store_id, category, section,
                      subsection_text, num_groceries)
            # Re-query the menu links: the click may have rebuilt the DOM,
            # so stale elements cannot be reused.
            sections = self.driver.find_elements_by_css_selector(
                '#collapseOne > li > a')
            next_section = self.get_next_2nd_layer_section(
                section, subsection, sections)

        #Store the section url after so we know we've completed it
        store_url(self.conn, section_url, self.store_id, section_category,
                  section, subsection, self.get_quantity())
        #We then need to click on the section header to get back outside the menu and continue on
        section_button = self.driver.find_element_by_css_selector(
            'li.breadcrumb-item:nth-child(2) > span:nth-child(1) > a:nth-child(1)'
        )
        self.handle_click(section_button, self.delay)
コード例 #13
0
ファイル: urlScraper.py プロジェクト: gobfink/Groceries
    def start_requests(self):
        """Record the start url and issue the initial onboarding request.

        NOTE(review): a second request targeting self.walk_menu is built
        below but never yielded, so only start_request is scheduled —
        confirm whether a `yield request` is missing or the dead code
        should be removed.
        """
        url = self.start_urls[0]

        start_request = create_unfiltered_parse_request(
            url,
            self.handle_onboard,
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '[data-automation-id="onboardingModalCloseBtn"]')),
            errback=self.prompt_blocked,
            meta_url=url,
            cookies=False)
        store_url(self.conn, url, self.store_id, "Start", "", "")
        self.logger.info(f"about to call walk_menu with response.url: {url}")
        # NOTE(review): `request` is constructed here but never yielded.
        request = create_unfiltered_parse_request(
            url,
            self.walk_menu,
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '[data-automation-id="NavigationBtn"]')),
            meta_url=url,
            cookies=False)
        yield start_request
コード例 #14
0
    def handle_pagination(self, response):
        """Read the final page number from the pager widget and store a
        url row for every page of the current listing, then move on."""
        # if it has a page-last class, read that content and interpolate;
        # else take the last pager page and interpolate
        self.logger.info("Inside handle_pagination")
        close_modal(self)
        change_store_location(self)

        # Strip any existing page suffix so page urls can be rebuilt.
        base_url = response.url
        marker = base_url.find(self.page_string)
        if marker != -1:
            base_url = base_url[:marker]

        pag_last = self.driver.find_elements_by_css_selector(
            '.pagination-last.pager-item')
        if pag_last:
            final_page_number = int(pag_last[0].text)
        else:
            pages = self.driver.find_elements_by_css_selector(
                '.pagination-page.pager-item')
            final_page_number = int(pages[-1].text)

        category, section, subsection = get_url_metadata(
            self.cursor, base_url)[:3]

        # Something like -
        # https://shop.wegmans.com/shop/categories/94 ?page= 13
        for page_num in range(1, final_page_number + 1):
            page_url = base_url + self.page_string + str(page_num)
            store_url(self.conn, page_url, self.store_id,
                      category, section, subsection)

        self.logger.info(f"finished handling pagination for {base_url}")
        finish_url(self.conn, self.store_id, response.url, scrape_urls=True)
        yield self.get_next_request()
コード例 #15
0
ファイル: urlScraper.py プロジェクト: gobfink/Groceries
    def crawl_submenu(self, response, section):
        """Iterate every subsection link of the open section menu. When a
        click opens a 2nd-layer menu, descend into it; otherwise walk the
        subsection's pages and store its url with the grocery quantity.

        Always returns None (`ret` is never reassigned).
        """
        ret = None
        subsections = self.driver.find_elements_by_css_selector(
            '#collapseOne > li > a')
        next_subsection = self.get_next_subsection(section, subsections)

        while next_subsection is not None:
            current_subsection = next_subsection
            subsection_text = current_subsection.get_attribute('innerText')

            self.handle_click(current_subsection, self.delay)

            try:
                #From here we should check if we are in a different menu
                clicked_element = self.driver.find_element_by_css_selector(
                    '#collapseOne > li > span')
            except NoSuchElementException:
                clicked_element = None

            if clicked_element is None:
                # The <span> marker is absent: the click descended into a
                # 2nd-layer menu, so crawl it.
                #print(f"Now entered submenu for {subsection_text}")
                self.crawl_2nd_layer_menu(section, subsection_text)
            else:
                # Still in the same menu: this subsection is a leaf page.
                #print(f"Not in submenu for {subsection_text}")
                current_url = self.driver.current_url
                category = lookup_category("", section, subsection_text)
                num_groceries = self.get_quantity()
                self.walk_through_pages(section, subsection_text)
                store_url(self.conn, current_url, self.store_id, category,
                          section, subsection_text, num_groceries)
            #inspect_response(response,self)
            #print (f"subsection_text - {subsection_text}")
            # Re-query the links: the DOM may have been rebuilt by clicks.
            local_subsections = self.driver.find_elements_by_css_selector(
                '#collapseOne > li > a')
            next_subsection = self.get_next_subsection(section,
                                                       local_subsections)
        return ret
コード例 #16
0
ファイル: urlScraper.py プロジェクト: gobfink/Groceries
    def crawl_menu(self, response):
        """Accept cookies, open the navigation menu, and click through
        every top-level category, crawling each one's submenu and storing
        its url with the grocery quantity found."""
        self.driver = response.request.meta['driver']
        actions = ActionChains(self.driver)
        print("inside crawl_menu")

        accept_cookies = self.driver.find_element_by_css_selector(
            '[title="Accept Cookies"]')
        self.handle_click(accept_cookies, self.delay)
        menu_button = self.driver.find_element_by_css_selector('.nav-open')
        self.handle_click(menu_button, self.delay)
        #We then need to scrape all of the (.'category-link') and then hover over each one and scrape the hrefs that appear
        sections = self.driver.find_elements_by_css_selector('.category-link')
        next_section = self.get_next_section(sections)
        self.section_list = sections

        #inspect_response(response,self)
        while next_section is not None:
            actions.move_to_element(next_section)
            section_name = next_section.get_attribute('innerText')
            print(f"using next_section: {section_name}")
            self.handle_click(next_section, self.delay)

            current_url = self.driver.current_url
            category = lookup_category("", section_name, "")
            #While on this page we need to click on all of the subsections
            self.crawl_submenu(response, section_name)
            #inspect_response(response,self)

            num_groceries = self.get_quantity()
            store_url(self.conn, current_url, self.store_id, category,
                      section_name, "", num_groceries)
            # Now we need to reset it and do it again
            self.handle_click(menu_button, self.delay)
            # Re-query: the previous clicks rebuilt the menu DOM.
            sections = self.driver.find_elements_by_css_selector(
                '.category-link')
            next_section = self.get_next_section(sections)
        return
コード例 #17
0
ファイル: urlScraper.py プロジェクト: gobfink/Groceries
    def scrape_urls(self, response):
        """Store urls for the main category groups, aisle categories,
        sibling aisles, and — when a load-more button is present — the
        next pagination page, normalizing every url's store number."""
        # FIXME the links for the hrefs default to 3132 then change to the correct 2635

        mainGroups = response.css(
            '.col-12.col-sm-12.col-md-4.col-lg-4.col-xl-3')
        section = response.css('[aria-current="location"] ::text').get()
        if section is not None:
            section = section.strip()
        self.logger.info("Inside scrape_urls")
        #TODO can probably infer some categories from location
        for mainGroup in mainGroups:
            #It might be coming from here? it looks like the main categories are all having issues
            view_all = mainGroup.css(
                '.text-uppercase.view-all-subcats ::attr(href)').get()
            view_all_url = self.base_url + view_all
            view_all_url = self.replace_store_number(view_all_url)
            section = mainGroup.css(
                '.product-title.text-uppercase ::text').get()
            section = section.strip()
            category = lookup_category("", section, "")
            self.logger.info(
                f"view_all_url - {view_all_url}, section - {section}, category - {category}"
            )
            store_url(self.conn, view_all_url, self.store_id, category,
                      section, "")

        aisleCategories = response.css('a.aisle-category')
        for aisleCategory in aisleCategories:
            aisleName = aisleCategory.css(
                '::attr(data-aisle-name)').get().strip()
            aisleHref = aisleCategory.css('::attr(href)').get()
            aisleUrl = self.base_url + aisleHref
            aisleUrl = self.replace_store_number(aisleUrl)
            subsection = aisleName
            category = lookup_category("", section, subsection)
            self.logger.info(
                f"found aisleCategory with section - {section}, subsection - {subsection} "
            )
            store_url(self.conn, aisleUrl, self.store_id, category, section,
                      subsection)

        siblingAisles = response.css('.siblingAisle')
        for siblingAisle in siblingAisles:
            self.logger.info(f"using siblingAisle - {siblingAisle}")
            href = siblingAisle.css('::attr(href)').get()
            siblingAisleUrl = self.base_url + href
            siblingAisleUrl = self.replace_store_number(siblingAisleUrl)
            section = response.css('[aria-current="location"] ::text').get()
            section = section.strip()
            subsection = siblingAisle.css('::text').get()
            subsection = subsection.strip()
            category = lookup_category("", section, subsection)
            # BUG FIX: this log line referenced the misspelled name
            # `siblinAisleUrl`, which raised NameError at runtime.
            self.logger.info(f"siblingAisle storing: {siblingAisleUrl}")
            store_url(self.conn, siblingAisleUrl, self.store_id, category,
                      section, subsection)

        # check if it has a load-more button and then increment page number on it
        if response.css(
                '.primary-btn.btn.btn-default.btn-secondary.bloom-load-button'
        ).get() is not None:
            path = response.css(
                '[aria-current]:not(.menu-nav__sub-item) ::text').getall()
            #self.logger.info(f"path - {path} for url - {response.url}")
            section = path[1]
            section = section.strip()
            subsection = path[-2]
            subsection = subsection.strip()
            category = lookup_category("", section, subsection)
            next_page_url = get_next_pagination(self.page_str, response.url)
            next_page_url = self.replace_store_number(next_page_url)
            self.logger.info(
                f'load-more-button. storing - {next_page_url}, section - {section}, subsection - {subsection}, category - {category}'
            )
            store_url(self.conn, next_page_url, self.store_id, category,
                      section, subsection)
コード例 #18
0
    def parse(self, response):
        """Scrape a Walmart-style grocery listing page.

        Queues the next pagination URL when a "next" button is present,
        yields one item dict per (regular or sponsored) product tile, then
        marks this URL finished and chains to the next stored URL via a
        SplashRequest.
        """
        GROCERY_SELECTOR = '[data-automation-id="productTile"]'
        SPONSORED_SELECTOR = '[data-automation-id="sponsoredProductTile"]'
        GROCERIES_SELECTOR = GROCERY_SELECTOR + ',' + SPONSORED_SELECTOR
        NEXT_BUTTON = '[data-automation-id="nextButton"]'

        url = response.url
        print(f"working on url - {url}")
        # Per the sibling spiders, get_url_metadata rows hold
        # [1]=section, [2]=subsection for the stored URL.
        metadata = get_url_metadata(self.cursor, url)
        section = metadata[1]
        subsection = metadata[2]

        # Handle pagination: store the next page's URL before scraping tiles.
        next_page = response.css(NEXT_BUTTON).get()
        if next_page is not None:
            page_string = "&page="
            next_page_url = get_next_pagination(page_string, url)
            store_url(self.conn, next_page_url, self.store_id,
                      lookup_category("", section, subsection),
                      section, subsection)

        for grocery in response.css(GROCERIES_SELECTOR):
            NAME_SELECTOR = '[data-automation-id="name"] ::attr(name)'
            name = grocery.css(NAME_SELECTOR).extract_first()

            # Parse size info (oz / lb / count) out of the product name.
            # Raw strings so \d and \s are regex escapes, not (invalid)
            # string escapes.  These are locals, not self.* attributes, so
            # no per-item state leaks onto the spider object.
            decimal_regex = r"([\d]+[.]?[\d]*|[.\d]+)"
            ounces = re.findall(decimal_regex + r"\s*o(?:z|unces?)",
                                name, re.IGNORECASE)
            pounds = re.findall(decimal_regex + r"\s*(?:pound|lb)s?",
                                name, re.IGNORECASE)
            count = re.findall(r"([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))",
                               name, re.IGNORECASE)

            # re.findall returns a (possibly empty) list of matches; take the
            # first match or default to 0.
            ounces = parse_float(ounces[0]) if ounces else 0
            pounds = parse_float(pounds[0]) if pounds else 0
            count = parse_float(count[0]) if count else 0

            # Normalize to ounces: 16 oz per pound, or per-item ounces
            # multiplied by the pack count.
            if pounds != 0:
                ounces = 16 * pounds
            elif count != 0:
                ounces *= count

            SALEPRICE_SELECTOR = '[data-automation-id="salePrice"] ::text'
            PRICE_PER_UNIT_SELECTOR = '[data-automation-id="price-per-unit"] ::text'

            name = clean_string(name, "\"")
            price = str(handle_none(
                grocery.css(SALEPRICE_SELECTOR).extract_first())).replace('$', '')
            ppu = convert_ppu(
                grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first())

            yield {
                'name': name,
                'ounces': ounces,
                'pounds': pounds,
                'count': count,
                'price': price,
                'price-per-unit': ppu,
                'section': section,
                'subsection': subsection,
                'url': url,
            }

        finish_url(self.conn, self.store_id, url)
        next_url = get_next_url(self.cursor, 1)

        print(f"next_url - {next_url}")
        if next_url is None:
            print("No more urls - finishing")
        else:
            yield SplashRequest(next_url,
                                self.parse,
                                endpoint='render.html',
                                args={
                                    'wait': 10,
                                    'section': section,
                                    'subsection': subsection
                                })
コード例 #19
0
    def parse(self, response):
        """Scrape one listing page of product cells.

        Queues the next pagination URL when an enabled "next" button exists,
        yields one item dict per `.cell-content-wrapper`, then chains to the
        next stored URL via a Selenium request.
        """
        url = response.url
        finish_url(self.conn, self.store_id, url)
        items = response.css('.cell-content-wrapper')
        # Metadata row recorded when this URL was stored:
        # [1]=section, [2]=subsection.
        metadata = get_url_metadata(self.cursor, url)
        section = metadata[1]
        subsection = metadata[2]
        #check if it has a next button,
        next_page = response.css('.pagination-next:not(.disabled)').get()
        if next_page is not None:
            #inspect_response(response,self)
            page_string = "?page="
            page_str_len = len(page_string)
            i = url.find(page_string)
            #if yes, check url if it has a page part on it
            if i == -1:
                #if no, add ?page=2 to it
                next_url = url + page_string + "2"
            else:
                #if yes, extract page and add 1
                # NOTE(review): int(url[page_number:]) assumes "?page=N" is
                # the final component of the URL -- confirm no query params
                # ever follow the page number.
                page_number = i + page_str_len
                current_page = int(url[page_number:])
                next_page = current_page + 1
                next_url = url[:page_number] + str(next_page)
            #then add to self.urls
            store_url(self.conn, next_url, self.store_id,
                      lookup_category("", section, subsection), section,
                      subsection)

        # One scraped item per product cell on this page.
        for item in items:
            name = item.css('.cell-title-text ::text').get()
            name = clean_string(name, ['\"'])
            price = item.css('[data-test="amount"] .css-19m8h51 ::text').get()
            price = convert_dollars(price)

            quantity = item.css('[data-test="amount"] .css-cpy6p ::text').get()

            unit = item.css('.cell-product-size ::text').get()
            ounces = convert_to_ounces(unit)

            ppu = item.css('[data-test="per-unit-price"] ::text').get()
            ppu = convert_ppu(ppu)

            print(
                f"name - {name}, price - {price}, quantity - {quantity}, ounces - {ounces}, ppu - {ppu}, url - {url}, section - {section}, subsection - {subsection} "
            )
            #inspect_response(response,self)
            yield {
                "name": name,
                "price": price,
                "ounces": ounces,
                "unit": unit,
                "price-per-unit": ppu,
                "url": url,
                "section": section,
                "subsection": subsection
            }

        # Chain to the next stored URL, or stop when the queue is empty.
        next_url = get_next_url(self.cursor, 1)
        if next_url is None:
            print("No more URLs to parse. Finishing")
            return
        request = self.create_parse_request(
            next_url, self.parse,
            EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))

        # NOTE(review): next_url cannot be None here (early return above),
        # so this guard is always true.
        if next_url is not None:
            try:
                yield request
            # NOTE(review): bare except around a yield rarely fires (download
            # errors surface in errbacks, not here) -- confirm this fallback
            # path is ever reached.
            except:
                print(
                    f"Parse -  Errored out processing request for - {next_url} "
                )
                next_url = get_next_url(self.cursor, 2)
                print(f"Parse - Now handling {next_url}")
                # NOTE(review): this rebuilt request is never yielded; the
                # SeleniumRequest below is what actually gets scheduled.
                request = self.create_parse_request(
                    next_url, self.parse,
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, '[add-to-cart]')))

            # NOTE(review): on the success path this yields a SECOND request
            # for the same next_url as `request` above -- presumably relying
            # on scrapy's duplicate filter to drop one; verify intent.
            yield SeleniumRequest(url=next_url,
                                  callback=self.parse,
                                  wait_time=50,
                                  wait_until=EC.element_to_be_clickable(
                                      (By.CSS_SELECTOR,
                                       '.button.full.cart.add')))
コード例 #20
0
ファイル: scraper.py プロジェクト: gobfink/Groceries
    def parse(self, response):
        """Route a category page.

        If the page's own entry sits at the top of the category menu, this is
        a listing page: store every linked sub-category URL for later
        scraping. Otherwise scrape the product grid on this sub-page. Either
        way, finish this URL and chain to the next stored one via Splash.
        """
        url = response.url

        menu = response.css(".category-filter__link")
        print("processing response.url - " + response.url)

        if len(menu) > 0 and menu[0].css('[aria-current="page"]'):
            # The first menu entry is the current page, so every entry is a
            # sub-category link: record each one for a later crawl pass.
            print(f"inside menu page for url - {url}")
            menu_name = menu[0].css('.category-filter__text ::text').get()
            for entry in menu:
                subsection = entry.css('.category-filter__text ::text').get()
                link = self.base_url + entry.css('::attr(href)').get()
                store_url(self.conn, link, self.store_id,
                          lookup_category("", menu_name, subsection),
                          menu_name, subsection)

        elif len(menu) == 0:
            # A page with no category menu is unexpected -- drop into the
            # scrapy shell for manual inspection.
            inspect_response(response, self)

        else:
            # Sub-page: scrape the product cards on the grid.
            GROCERY_SELECTOR = '.grid-item'
            NAME_SELECTOR = '.small-type.detail-card-description ::text'
            PRICE_SELECTOR = '.price ::text'
            PRICE_PER_UNIT_SELECTOR = '.sub-headline.detail-card-subtext ::text'

            # NOTE(review): the sibling spiders read get_url_metadata() rows
            # as [1]=section, [2]=subsection -- confirm the [0]/[1] indexing
            # here matches this project's helper.
            metadata = get_url_metadata(self.cursor, url)
            section = metadata[0]
            subsection = metadata[1]
            print("subpage - scraping " + url + ", from section - " + section)
            for grocery in response.css(GROCERY_SELECTOR):
                self.name = grocery.css(NAME_SELECTOR).extract_first()
                self.price = grocery.css(PRICE_SELECTOR).extract_first()
                if self.price is not None:
                    self.price = self.price.replace('*', '').replace('$', '')
                self.ppu = grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first()
                if self.ppu is not None:
                    self.ppu = convert_ppu(self.ppu)
                yield {
                    'name': self.name,
                    'price': self.price,
                    'price-per-unit': self.ppu,
                    'section': section,
                    'subsection': subsection,
                    'url': response.url
                }

        finish_url(self.conn, self.store_id, url)
        print("finishing url - " + url)
        next_url = get_next_url(self.cursor, 1)
        if next_url is not None:
            print("got next_url - " + next_url)
            yield SplashRequest(
                next_url,
                self.parse,
                endpoint='execute',
                dont_filter=True,
                args={'lua_source': self.expand_and_scroll_lua})
        else:
            print("Next url is none therefore we must be finished ! ")