Esempio n. 1
0
    def run(self, response):
        """Process a scraped page: validate it against the requested URL,
        harvest its links when valid, then queue the next URL to crawl."""
        page_1_str = self.page_str + "1"
        meta_url = response.meta.get("url")
        # The site redirects us to url + page_1_str, which is not the form
        # stored in our database, so trim it back off before lookups.
        this_url = trim_url(response.url, page_1_str)
        self.logger.info(f"inside run for {this_url}, meta_url: {meta_url}")

        if meta_url == this_url:
            self.scrape_urls(response)
        else:
            self.logger.info(
                f"meta_url: {meta_url} !=  response.url: {response.url}, therefore it must be invalid - skipping"
            )
            this_url = meta_url

        finish_url(self.conn, self.store_id, this_url, scrape_urls=True)
        self.logger.info("finishing url - " + this_url)

        next_url = get_next_url(self.cursor, 1, self.store_id, True)
        if next_url is None:
            self.logger.info(
                "Next url is none therefore we must be finished ! ")
            return

        self.logger.info(f"got next_url - {next_url}")
        yield create_unfiltered_parse_request(
            next_url, self.run,
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#openFulfillmentModalButton')))
Esempio n. 2
0
    def collect_menu(self, response):
        """Walk the category menu rendered by the selenium driver and store
        every aisle URL, tagged with its department and looked-up category."""
        self.logger.info("inside collect_menu! ")
        self.driver = response.request.meta['driver']
        close_modal(self)
        change_store_location(self)

        department_nodes = self.driver.find_elements_by_css_selector(
            '[category-filter="subcategory"]')
        for department in department_nodes:
            dept_name = department.find_element_by_css_selector(
                '[data-test="category-card-"]').text
            aisles = department.find_elements_by_css_selector('a')
            self.logger.info(f"dept_name: {dept_name}")
            # Keep a handle on the last aisle list for interactive debugging.
            self.aisles = aisles
            for aisle in aisles:
                aisle_name = aisle.text
                store_url(self.conn, aisle.get_attribute("href"),
                          self.store_id,
                          lookup_category("", dept_name, aisle_name),
                          dept_name, aisle_name)

        self.logger.info("finished collect_menu! ")
        finish_url(self.conn, self.store_id, response.url, scrape_urls=True)

        yield self.get_next_request()
Esempio n. 3
0
    def no_pagination(self, failure):
        """Errback for pages with no pagination: flag the URL as failed
        (set_val=-1) and advance to the next queued request."""
        url = failure.request.url
        self.logger.info(f"no_pagination for url: {url}, continuing")
        # Mark as unscrapable rather than leaving the url pending forever.
        finish_url(self.conn, self.store_id, url, set_val=-1, scrape_urls=True)
        # TODO add a filter so we don't get the ones with ?page=
        yield self.get_next_request()
Esempio n. 4
0
    def parse(self, response):
        """Route a Splash-rendered page by menu state.

        If the top menu entry is the active page, store every sibling menu
        link in the database; if there is no menu at all, drop into the
        scrapy shell for inspection.  Either way, mark this url finished
        and queue the next url from the database via SplashRequest.
        """
        url = response.url
        menu = response.css(".category-filter__link")
        print("processing response.url - " + response.url)

        if (len(menu) > 0 and menu[0].css('[aria-current="page"]')):
            # The top page is active, so scrape the links and keep going.
            print(f"inside menu page for url - {url}")
            menu_name = menu[0].css('.category-filter__text ::text').get()
            for item in menu:
                heading = item.css('.category-filter__text ::text').get()
                scraped_url = self.base_url + item.css('::attr(href)').get()
                category = lookup_category("", menu_name, heading)
                store_url(self.conn, scraped_url, self.store_id, category,
                          menu_name, heading)
        elif (len(menu) == 0):
            # Unexpected page layout: inspect interactively.
            inspect_response(response, self)

        # NOTE(review): the fourth positional argument lands on finish_url's
        # set_val parameter at other call sites in this project; confirm
        # whether scrape_urls=True was intended here.
        finish_url(self.conn, self.store_id, url, True)
        print("finishing url - " + url)
        next_url = get_next_url(self.cursor, 1, self.store_id, True)

        if next_url is not None:
            print("got next_url - " + next_url)
            yield SplashRequest(
                next_url,
                self.parse,
                endpoint='execute',
                dont_filter=True,
                args={'lua_source': self.expand_and_scroll_lua})
        else:
            print("Next url is none therefore we must be finished ! ")
Esempio n. 5
0
    def parse_urls(self, response):
        """Collect section/subsection URLs from a store page, then queue the
        next page to crawl (remaining start_urls first, then the database).

        Fixes: the original computed ``location`` from the response only to
        overwrite it immediately from the driver (dead work removed), and
        used a bare ``except:`` (narrowed to ``except Exception:``).
        """
        self.driver = response.request.meta['driver']
        # Read the active store location from the live DOM and switch
        # stores if it is not the one we are configured for.
        location = self.driver.find_element_by_css_selector(
            '[data-test="store-button"]').text
        print(f"detected location - {location}")
        if location != self.location:
            self.change_store_location(response)

        section_group = response.css(".subcategory.category")
        # Keep a handle for interactive debugging.
        self.section_group = section_group
        for section in section_group:
            section_name = section.css(".css-1pita2n ::text").get()
            for url_node in section.css("ul.children a"):
                subsection_name = url_node.css("::text").get()
                url = self.base_url + url_node.css("::attr(href)").get()
                store_url(self.conn, url, self.store_id,
                          lookup_category("", section_name, subsection_name),
                          section_name, subsection_name)

        finish_url(self.conn, self.store_id, response.url)

        # Prefer any remaining seed URLs; fall back to the database queue.
        function = self.parse
        item_to_find = '[add-to-cart]'
        if len(self.start_urls) != 0:
            next_url = self.start_urls.pop()
            store_url(self.conn, next_url, self.store_id, "", "", "")
            function = self.parse_urls
            item_to_find = '[data-test="store-button"]'
        else:
            next_url = get_next_url(self.cursor, 1)

        if next_url is None:
            print("No more URLs to parse. Finishing")
            return
        request = self.create_parse_request(
            next_url, function,
            EC.element_to_be_clickable((By.CSS_SELECTOR, item_to_find)))

        #FIXME these try except blocks don't actually handle timeout exceptions from navigating to the wrong url
        try:
            yield request
        except Exception:
            print(f"Parse -  Errored out processing request for - {next_url} ")
            next_url = get_next_url(self.cursor, 2)
            print(f"Parse - Now handling {next_url}")
            request = self.create_parse_request(
                next_url, self.parse,
                EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))
            yield request
Esempio n. 6
0
    def walk_menu(self, response):
        """Open the site's navigation menu with selenium and store every
        department/aisle link, then queue the first pagination request.

        Clicks the nav button, then each department button in turn, reading
        the aisle anchors that appear and persisting each aisle url with its
        looked-up category.  Finally marks the start url finished and pulls
        the next 'aisle=' url from the database for pagination handling.
        """
        # inspect_response(response,self)
        self.driver = response.request.meta['driver']
        self.logger.info('Inside walk_menu')
        start_url = self.driver.current_url
        menu_button = self.driver.find_element_by_css_selector(
            '[data-automation-id="NavigationBtn"]')
        menu_button.click()

        # Give the navigation panel time to render before querying it.
        time.sleep(.5)

        departments = self.driver.find_elements_by_css_selector(
            '.NavigationPanel__department___1DF7d button')
        for department in departments:
            department_name = department.get_attribute('aria-label')
            department.click()
            # Wait for this department's aisle links to render.
            time.sleep(.5)
            aisles = self.driver.find_elements_by_css_selector(
                '.NavigationPanel__aisleLink___309i2')
            for aisle in aisles:
                url = aisle.get_attribute('href')
                aisle_name = aisle.get_attribute('innerText')
                # self.department_name = department_name
                # self.aisle_name = aisle_name
                self.logger.info(
                    f"department_name: {department_name}, aisle_name: {aisle_name}"
                )
                category = lookup_category("", department_name, aisle_name)
                self.logger.info(f"Storing aisle: {aisle_name}, url: {url}")
                store_url(self.conn, url, self.store_id, category,
                          department_name, aisle_name)

        finish_url(self.conn, self.store_id, start_url, scrape_urls=True)
        next_url = get_next_url(self.cursor,
                                1,
                                store_id=self.store_id,
                                scrape_urls=True,
                                filter="aisle=")
        if next_url is None:
            self.logger.debug(
                "Next_url is None therefore we must be finished!")
            return

        self.next_url = next_url
        pagination_request = create_parse_request(next_url,
                                                  self.handle_pagination,
                                                  EC.element_to_be_clickable(
                                                      (By.CSS_SELECTOR,
                                                       self.PAGE_LOAD)),
                                                  errback=self.retry,
                                                  meta_url=next_url,
                                                  cookies=False)

        yield pagination_request
Esempio n. 7
0
 def retry_page(self, failure):
     """Errback that retries a failed page, giving up after MAX_RETRIES
     attempts and then marking the url failed before moving on."""
     url = failure.request.url
     attempt = int(failure.request.meta['attempt']) + 1
     self.logger.info(
         f"retrying url: {url}, on attempt: {attempt}, continuing")
     if attempt > self.MAX_RETRIES:
         # Too many failures: flag the url as bad and reset the counter.
         self.logger.warning(f"Failed {url}, {attempt} times. Skipping")
         finish_url(self.conn, self.store_id, url, set_val=-1)
         attempt = 1
     yield self.get_next_request(attempt=attempt)
Esempio n. 8
0
    def parse(self, response):
        page_1_str=self.page_str+"1"
        this_url = trim_url(response.url,page_1_str)
        print (f"inside parse for {this_url}")
        self.scrape_urls(response)

        # Only scrape pages that have the page_str in the url.
        if this_url.find(self.page_str) != -1:
            print (f"scraping for {this_url}")
            items = response.css('product-item-v2')
            print(f"length of items - {len(items)}")
            metadata=get_url_metadata(self.cursor,this_url)
            section=metadata[1]
            subsection=metadata[2]
            for item in items:
                name = item.css('.product-title ::text').get()
                price_strings = item.css('.product-price ::text').getall()
                price = clean_string(price_strings[-1],['$'])
                ppu = item.css('.product-price-qty ::text').get()
                unit = self.collect_units(name)
                #inspect_response(response,self)

                if unit == "OZ" or unit == "LB":
                    ounces = self.collect_ounces(name)
                else:
                    ounces = 0
                print (f"yielding - {name}, {price}, {ppu}, {ounces}, {unit}")
                yield{
                  "name": name,
                  "price": price,
                  "ounces": ounces,
                  "unit": unit,
                  "price-per-unit": ppu,
                  "url": this_url,
                  "section": section,
                  "subsection": subsection
                }

        #Basically the website redirects us to the url and page_1_str, which isn't added to our database
        # So we trim that off so we can get the url in our database
        finish_url(self.conn,self.store_id,this_url)
        print("finishing url - " + this_url)
        next_url = get_next_url(self.cursor, 1)
        if next_url is None:
            print ("Next url is none therefore we must be finished ! ")
            return
        else:
            next_request = create_parse_request(next_url,
                                                self.check_location,
                                                EC.element_to_be_clickable((By.CSS_SELECTOR,'#openFulfillmentModalButton')))
        print(f"got next_url - {next_url}")
        yield next_request
Esempio n. 9
0
    def parse(self, response):
        """Scrape product tiles from a page and yield one record per product.

        URLs with no stored metadata are finished and skipped.  Each record
        carries name, price, quantity-derived ounces/unit, price-per-unit,
        and the url's section/subsection.

        Fix: the price regex used an unescaped ``.`` (which matches any
        character); it now uses a raw string with an escaped, optional dot
        so both "3.99" and whole-dollar prices parse correctly.
        """
        time.sleep(1)
        url = response.url
        print(f"inside parse for {url}")
        PRODUCTS_CSS = '#product-main'
        metadata = get_url_metadata(self.cursor, url)
        if metadata is None:
            print("Could not find metadata for url - " + url + " - skipping")
            finish_url(self.conn, self.store_id, url)
            return

        section = metadata[1]
        subsection = metadata[2]
        products = response.css(PRODUCTS_CSS)
        for product in products:
            name = product.css('.product-name ::text').get()
            name = name.replace("'", "")
            raw_price = product.css('.product-price ::text').get()
            # Extract the first decimal number, filtering out the $'s and
            # any other text around the price.
            price = re.findall(r"[0-9]+\.?[0-9]*", raw_price)[0]

            # The quantity cell looks like "<amount> | <price-per-unit>".
            quantity = product.css('.product-quantity ::text').get()
            index_split = quantity.find('|')
            ppu = quantity[index_split + 1:]
            amount = quantity[:index_split]

            ounces = self.collect_ounces(amount)
            unit = self.collect_unit(amount)

            print(f"yielding - {name}, {price}, {ppu}, {ounces}, {unit}")
            yield {
                "name": name,
                "price": price,
                "ounces": ounces,
                "unit": unit,
                "price-per-unit": ppu,
                "url": url,
                "section": section,
                "subsection": subsection
            }

        check_subsection_amount(self.cursor, url)
        finish_url(self.conn, self.store_id, url)
Esempio n. 10
0
    def parse(self, response):
        """Parse a selenium-rendered product listing, yielding one record
        per product cell, then advance to the next queued URL."""
        self.driver = response.request.meta['driver']
        close_modal(self)
        change_store_location(self)

        url = response.url
        metadata = get_url_metadata(self.cursor, url)
        section, subsection = metadata[1], metadata[2]

        #check if it has a next button,
        for item in response.css('.cell-content-wrapper'):
            name = clean_string(item.css('.cell-title-text ::text').get(),
                                ['\"'])
            price = convert_dollars(
                item.css('[data-test="amount"] .css-19m8h51 ::text').get())
            quantity = item.css('[data-test="amount"] .css-cpy6p ::text').get()
            unit = item.css('.cell-product-size ::text').get()
            ounces = convert_to_ounces(unit)
            ppu = convert_ppu(
                item.css('[data-test="per-unit-price"] ::text').get())

            self.logger.info(
                f"name - {name}, price - {price}, quantity - {quantity}, ounces - {ounces}, ppu - {ppu}, url - {url}, section - {section}, subsection - {subsection} "
            )
            yield {
                "name": name,
                "price": price,
                "ounces": ounces,
                "unit": unit,
                "price-per-unit": ppu,
                "url": url,
                "section": section,
                "subsection": subsection
            }

        finish_url(self.conn, self.store_id, url)

        yield self.get_next_request()
Esempio n. 11
0
    def handle_pagination(self, response):
        """If the current page has a next button, store the next page url
        (with its grocery count), then finish this url and queue another."""
        self.logger.info('inside handle_pagination')
        url = self.driver.current_url
        # inspect_response(response,self)
        has_next = self.driver.find_elements_by_css_selector(
            self.NEXT_BUTTON_SELECTOR)
        if len(has_next) != 0:
            next_page_url = get_next_pagination(self.PAGE_STRING, url)
            metadata = get_url_metadata(self.cursor, url)
            category, section, subsection = metadata[0], metadata[1], metadata[2]
            # The page header carries the total item count, e.g. "123 items".
            quantity = self.driver.find_element_by_css_selector(
                '.Title__browseTotalCount___OWylh').get_attribute('innerText')
            quantity = re.findall('[0-9]+', quantity)[0]
            store_url(self.conn,
                      next_page_url,
                      self.store_id,
                      category,
                      section,
                      subsection,
                      grocery_quantity=quantity)

        finish_url(self.conn, self.store_id, url, scrape_urls=True)
        next_url = get_next_url(self.cursor,
                                1,
                                store_id=self.store_id,
                                scrape_urls=True,
                                filter="aisle=")
        if next_url is None:
            self.logger.debug(
                "Next_url is None therefore we must be finished!")
            return
        yield create_parse_request(next_url,
                                   self.handle_pagination,
                                   EC.element_to_be_clickable(
                                       (By.CSS_SELECTOR, self.PAGE_LOAD)),
                                   errback=self.retry,
                                   meta_url=next_url,
                                   cookies=False)
Esempio n. 12
0
    def handle_pagination(self, response):
        """Work out the final page number (from the 'last' pager if present,
        otherwise the highest numbered pager) and store a url for every page."""
        self.logger.info("Inside handle_pagination")
        close_modal(self)
        change_store_location(self)

        # Strip any existing page suffix so we can interpolate our own.
        base_url = response.url
        marker = base_url.find(self.page_string)
        if marker != -1:
            base_url = base_url[:marker]

        # Prefer the explicit "last page" control; otherwise fall back to
        # the last numbered pager item.
        pag_last = self.driver.find_elements_by_css_selector(
            '.pagination-last.pager-item')
        if pag_last:
            final_page_number = int(pag_last[0].text)
        else:
            pagers = self.driver.find_elements_by_css_selector(
                '.pagination-page.pager-item')
            final_page_number = int(pagers[-1].text)

        metadata = get_url_metadata(self.cursor, base_url)
        category, section, subsection = metadata[0], metadata[1], metadata[2]

        # Something like -
        # https://shop.wegmans.com/shop/categories/94 ?page= 13
        for page_num in range(1, final_page_number + 1):
            store_url(self.conn, base_url + self.page_string + str(page_num),
                      self.store_id, category, section, subsection)

        self.logger.info(f"finished handling pagination for {base_url}")
        finish_url(self.conn, self.store_id, response.url, scrape_urls=True)
        yield self.get_next_request()
Esempio n. 13
0
    def parse(self, response):
        """Route a Splash-rendered page by menu state.

        If the top menu entry is the active page, store every sibling menu
        link; if no menu is present, drop into the scrapy shell for
        inspection; otherwise we are on a product subpage, so scrape its
        groceries.  Finally mark this url done and queue the next one.

        Cleanup: removed large blocks of dead commented-out code, a dead
        triple-quoted code block, and the unused ``menu_url`` local.
        """
        url = response.url
        menu = response.css(".category-filter__link")
        print("processing response.url - " + response.url)

        if (len(menu) > 0 and menu[0].css('[aria-current="page"]')):
            # The top page is active: scrape the sibling links so they can
            # be crawled later.
            print(f"inside menu page for url - {url}")
            menu_name = menu[0].css('.category-filter__text ::text').get()
            for item in menu:
                heading = item.css('.category-filter__text ::text').get()
                scraped_url = self.base_url + item.css('::attr(href)').get()
                category = lookup_category("", menu_name, heading)
                store_url(self.conn, scraped_url, self.store_id, category,
                          menu_name, heading)

        elif (len(menu) == 0):
            # Unexpected page layout: inspect interactively.
            inspect_response(response, self)

        else:
            # We are on a subpage, so now we can start scraping.
            GROCERY_SELECTOR = '.grid-item'
            NAME_SELECTOR = '.small-type.detail-card-description ::text'
            PRICE_SELECTOR = '.price ::text'
            PRICE_PER_UNIT_SELECTOR = '.sub-headline.detail-card-subtext ::text'

            # NOTE(review): other parse callbacks in this project read
            # section/subsection from metadata[1]/metadata[2]; confirm the
            # 0/1 indices used here are intentional.
            metadata = get_url_metadata(self.cursor, url)
            section = metadata[0]
            subsection = metadata[1]
            print("subpage - scraping " + url + ", from section - " + section)
            for grocery in response.css(GROCERY_SELECTOR):
                self.name = grocery.css(NAME_SELECTOR).extract_first()
                self.price = grocery.css(PRICE_SELECTOR).extract_first()
                if self.price is not None:
                    self.price = self.price.replace('*', '').replace('$', '')
                self.ppu = grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first()
                if self.ppu is not None:
                    self.ppu = convert_ppu(self.ppu)
                yield {
                    'name': self.name,
                    'price': self.price,
                    'price-per-unit': self.ppu,
                    'section': section,
                    'subsection': subsection,
                    'url': response.url
                }

        finish_url(self.conn, self.store_id, url)
        print("finishing url - " + url)
        next_url = get_next_url(self.cursor, 1)
        if next_url is not None:
            print("got next_url - " + next_url)
            yield SplashRequest(
                next_url,
                self.parse,
                endpoint='execute',
                dont_filter=True,
                args={'lua_source': self.expand_and_scroll_lua})
        else:
            print("Next url is none therefore we must be finished ! ")
Esempio n. 14
0
    def parse(self, response):
        """Scrape product tiles (regular and sponsored), deriving an ounce
        figure from the product name, then queue the next aisle url.

        Cleanup: regex patterns are now raw strings (the originals relied
        on invalid escape sequences), the duplicated ``name`` extraction and
        the no-op self-assignments (``ounces=ounces`` etc.) are removed, and
        the unused PRICE_SELECTOR constant is dropped.
        """
        url = response.url
        self.logger.info(f"Inside parse for {url}")

        GROCERY_SELECTOR = '[data-automation-id="productTile"]'
        SPONSORED_SELECTOR = '[data-automation-id="sponsoredProductTile"]'
        GROCERIES_SELECTOR = GROCERY_SELECTOR + ',' + SPONSORED_SELECTOR
        NAME_SELECTOR = '[data-automation-id="name"] ::attr(name)'
        SALEPRICE_SELECTOR = '[data-automation-id="salePrice"] ::text'
        PRICE_PER_UNIT_SELECTOR = '[data-automation-id="price-per-unit"] ::text'

        metadata = get_url_metadata(self.cursor, url)
        section = metadata[1]
        subsection = metadata[2]

        # Size/count patterns parsed out of the product name.
        decimal_regex = r"([\d]+[.]?[\d]*|[.\d]+)"
        for grocery in response.css(GROCERIES_SELECTOR):
            name = grocery.css(NAME_SELECTOR).extract_first()
            ounces = re.findall(decimal_regex + r"\s*o(?:z|unces?)",
                                name, re.IGNORECASE)
            pounds = re.findall(decimal_regex + r"\s*(?:pound|lb)s?",
                                name, re.IGNORECASE)
            count = re.findall(r"([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))",
                               name, re.IGNORECASE)
            # Kept for interactive debugging sessions.
            self.ounce = ounces
            self.pounds = pounds
            self.count = count
            # re.findall returns an empty list when the pattern is absent.
            ounces = parse_float(ounces[0]) if ounces else 0
            pounds = parse_float(pounds[0]) if pounds else 0
            count = parse_float(count[0]) if count else 0

            # Normalize to ounces: 16 oz per pound, or multiply per-item
            # ounces by the pack count.
            if pounds != 0:
                ounces = 16 * pounds
            elif count != 0:
                ounces *= count

            name = clean_string(name, "\"")
            price = str(handle_none(
                grocery.css(SALEPRICE_SELECTOR).extract_first())).replace('$', '')
            ppu = convert_ppu(
                grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first())

            yield {
                'name': name,
                'ounces': ounces,
                'pounds': pounds,
                'count': count,
                'price': price,
                'price-per-unit': ppu,
                'section': section,
                'subsection': subsection,
                'url': url,
            }

        finish_url(self.conn, self.store_id, url)
        next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                                filter="aisle=")

        print(f"next_url - {next_url}")
        if next_url is None:
            print("No more urls - finishing")
        else:
            yield create_parse_request(next_url,
                                       self.parse,
                                       EC.element_to_be_clickable(
                                           (By.CSS_SELECTOR, '[aria-current="page"]')),
                                       meta_url=next_url)
Esempio n. 15
0
    def parse(self, response):
        """Scrape product items from a page= listing.

        Stale responses (meta url mismatch) and urls without the page
        string are skipped; otherwise each product-item-v2 is yielded as a
        record.  The next url is then pulled from the database and queued.

        Fix: the "finishing url" log call passed ``self.store_id`` as a
        stray lazy-format argument with no matching placeholder, which
        broke the log message; it is now a single f-string.  The price
        regex is also a raw string.
        """
        page_1_str = self.page_str + "1"
        meta_url = response.meta.get('url')
        this_url = response.url
        # The site redirects to url + page_1_str, which isn't stored in our
        # database; the trimmed form is what metadata lookups are keyed on.
        trimmed_url = trim_url(response.url, page_1_str)
        self.logger.info(
            f"inside parse for meta_url: {meta_url}, response.url: {response.url}"
        )

        # Only scrape pages that have the page_str in the url.
        if this_url.find(self.page_str) == -1:
            self.logger.info(
                f"Skipping {this_url} because it couldn't find {self.page_str}"
            )
        elif meta_url != response.url:
            self.logger.info(
                f"meta_url: {meta_url} != response.url: {response.url}, and so we are finishing stale {meta_url}"
            )
            this_url = meta_url
        else:
            self.logger.info(f"scraping for {this_url}")
            items = response.css('product-item-v2')
            self.logger.info(f"length of items - {len(items)}")
            ## FIXME For some reason the middleware is returning an empty response for all of the urls that reach here.
            metadata = get_url_metadata(self.cursor, trimmed_url)
            if len(metadata) != 3:
                self.logger.info(
                    f"Couldn't detect metadata: {metadata}, for trimmed_url: {trimmed_url}, defaulting to empty"
                )
                section = ""
                subsection = ""
            else:
                section = metadata[1]
                subsection = metadata[2]

            for item in items:
                name = item.css('.product-title ::text').get()
                price_string = item.css('.product-price').get()
                price = re.findall(r"\$([0-9]+\.[0-9]+)", price_string)[0]
                ppu = item.css('.product-price-qty ::text').get()
                unit = self.collect_units(name)

                if unit == "OZ" or unit == "LB":
                    ounces = self.collect_ounces(name)
                else:
                    ounces = 0
                self.logger.info(
                    f"yielding - {name}, {price}, {ppu}, {ounces}, {unit}")

                yield {
                    "name": name,
                    "price": price,
                    "ounces": ounces,
                    "unit": unit,
                    "price-per-unit": ppu,
                    "url": this_url,
                    "section": section,
                    "subsection": subsection
                }

        finish_url(self.conn, self.store_id, this_url)
        self.logger.info(
            f"finishing url - {this_url}, store_id: {self.store_id}")
        # We only want requests that have the page= string in it because they have the groceries,
        # Also currently we're getting some urls in our database for locations that don't match our default_store_number
        # So filter those out too.
        next_url = get_next_url(self.cursor,
                                1,
                                self.store_id,
                                filter=f"{self.store_number}%page=")
        if next_url is None:
            self.logger.info(
                "Next url is none therefore we must be finished ! ")
            return
        self.logger.info(f"got next_url - {next_url}")
        yield create_unfiltered_parse_request(
            next_url, self.parse,
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'product-item-v2')))
Esempio n. 16
0
    def parse(self, response):
        """Scrape one grocery listing page: queue the next pagination page
        (if any), yield one item per product tile, then move on to the next
        stored URL via Splash.

        :param response: Scrapy/Splash response for a listing page whose URL
                         is already registered in the URL database.
        """
        GROCERY_SELECTOR = '[data-automation-id="productTile"]'
        SPONSORED_SELECTOR = '[data-automation-id="sponsoredProductTile"]'
        GROCERIES_SELECTOR = GROCERY_SELECTOR + ',' + SPONSORED_SELECTOR
        NEXT_BUTTON = '[data-automation-id="nextButton"]'

        # Handle pagination
        url = response.url
        print(f"working on url - {url}")
        metadata = get_url_metadata(self.cursor, url)
        section = metadata[1]
        subsection = metadata[2]

        # If there is a "next" button, queue the following page for a later
        # crawl instead of following it immediately.
        next_page = response.css(NEXT_BUTTON).get()
        if next_page is not None:
            page_string = "&page="
            next_page_url = get_next_pagination(page_string, url)
            store_url(self.conn, next_page_url, self.store_id,
                      lookup_category("", section, subsection), section,
                      subsection)

        for grocery in response.css(GROCERIES_SELECTOR):
            NAME_SELECTOR = '[data-automation-id="name"] ::attr(name)'
            self.name = grocery.css(NAME_SELECTOR).extract_first()
            # Parse size info (ounces / pounds / count) out of the name.
            # Raw strings avoid invalid-escape-sequence warnings from the
            # regex metacharacters.
            decimal_regex = r"([\d]+[.]?[\d]*|[.\d]+)"
            self.ounces = re.findall(decimal_regex + r"\s*o(?:z|unces?)",
                                     self.name, re.IGNORECASE)
            self.pounds = re.findall(decimal_regex + r"\s*(?:pound|lb)s?",
                                     self.name, re.IGNORECASE)
            self.count = re.findall(r"([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))",
                                    self.name, re.IGNORECASE)
            # re.findall returns a list; keep the first match or default to 0.
            self.ounces = parse_float(self.ounces[0]) if self.ounces else 0
            self.pounds = parse_float(self.pounds[0]) if self.pounds else 0
            self.count = parse_float(self.count[0]) if self.count else 0

            # Normalize to ounces: a pounds figure wins outright, otherwise a
            # pack count multiplies the per-item ounces.
            if self.pounds != 0:
                self.ounces = 16 * self.pounds
            elif self.count != 0:
                self.ounces *= self.count

            SALEPRICE_SELECTOR = '[data-automation-id="salePrice"] ::text'
            PRICE_PER_UNIT_SELECTOR = '[data-automation-id="price-per-unit"] ::text'

            # Reuse the name extracted above instead of re-querying the DOM.
            name = clean_string(self.name, "\"")
            ounces = self.ounces
            pounds = self.pounds
            count = self.count
            price = str(handle_none(
                grocery.css(SALEPRICE_SELECTOR).extract_first())).replace('$', '')
            ppu = convert_ppu(
                grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first())
            url = response.url

            yield {
                'name': name,
                'ounces': ounces,
                'pounds': pounds,
                'count': count,
                'price': price,
                'price-per-unit': ppu,
                'section': section,
                'subsection': subsection,
                'url': url,
            }

        finish_url(self.conn, self.store_id, url)
        next_url = get_next_url(self.cursor, 1)

        print(f"next_url - {next_url}")
        if next_url is None:
            print("No more urls - finishing")
        else:
            yield SplashRequest(next_url,
                                self.parse,
                                endpoint='render.html',
                                args={
                                    'wait': 10,
                                    'section': section,
                                    'subsection': subsection
                                })
Esempio n. 17
0
    def parse(self, response):
        """Scrape one grocery listing page: queue the next pagination page
        (if present), yield one item per product cell, then request the next
        stored URL with Selenium.

        :param response: response for a listing page already stored in the
                         URL database.
        """
        url = response.url
        # Mark this URL finished up front so it is not handed out again by
        # get_next_url.
        finish_url(self.conn, self.store_id, url)
        items = response.css('.cell-content-wrapper')
        metadata = get_url_metadata(self.cursor, url)
        section = metadata[1]
        subsection = metadata[2]

        # Check if it has an enabled "next" button; if so, work out the next
        # page's URL and store it for a later crawl.
        next_page = response.css('.pagination-next:not(.disabled)').get()
        if next_page is not None:
            page_string = "?page="
            page_str_len = len(page_string)
            i = url.find(page_string)
            if i == -1:
                # No page component yet, so the next page is page 2.
                next_url = url + page_string + "2"
            else:
                # Extract the current page number and add 1.
                page_number = i + page_str_len
                current_page = int(url[page_number:])
                next_page = current_page + 1
                next_url = url[:page_number] + str(next_page)
            store_url(self.conn, next_url, self.store_id,
                      lookup_category("", section, subsection), section,
                      subsection)

        for item in items:
            name = item.css('.cell-title-text ::text').get()
            name = clean_string(name, ['\"'])
            price = item.css('[data-test="amount"] .css-19m8h51 ::text').get()
            price = convert_dollars(price)

            quantity = item.css('[data-test="amount"] .css-cpy6p ::text').get()

            unit = item.css('.cell-product-size ::text').get()
            ounces = convert_to_ounces(unit)

            ppu = item.css('[data-test="per-unit-price"] ::text').get()
            ppu = convert_ppu(ppu)

            print(
                f"name - {name}, price - {price}, quantity - {quantity}, ounces - {ounces}, ppu - {ppu}, url - {url}, section - {section}, subsection - {subsection} "
            )
            yield {
                "name": name,
                "price": price,
                "ounces": ounces,
                "unit": unit,
                "price-per-unit": ppu,
                "url": url,
                "section": section,
                "subsection": subsection
            }

        next_url = get_next_url(self.cursor, 1)
        if next_url is None:
            print("No more URLs to parse. Finishing")
            return
        request = self.create_parse_request(
            next_url, self.parse,
            EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))

        try:
            yield request
        except Exception:
            # Narrowed from a bare `except:`, which would also have swallowed
            # SystemExit and KeyboardInterrupt.
            print(
                f"Parse -  Errored out processing request for - {next_url} "
            )
            next_url = get_next_url(self.cursor, 2)
            print(f"Parse - Now handling {next_url}")
            if next_url is None:
                # Nothing left to retry with; previously this fell through
                # and built a request for None.
                return
            request = self.create_parse_request(
                next_url, self.parse,
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '[add-to-cart]')))

        # NOTE(review): a second request is yielded for the same next_url
        # (this SeleniumRequest on top of the create_parse_request above),
        # exactly as the original code did - confirm both are intended.
        yield SeleniumRequest(url=next_url,
                              callback=self.parse,
                              wait_time=50,
                              wait_until=EC.element_to_be_clickable(
                                  (By.CSS_SELECTOR,
                                   '.button.full.cart.add')))
Esempio n. 18
0
    def parse(self, response):
        """Handle one category page.

        When the selected menu entry sits at the top of the category list
        (marked aria-current="page") we are on the top-level menu, so no
        products are scraped here. Otherwise this is a subpage: every grid
        item is yielded as an item. In both cases the URL is marked finished
        and the next queued URL is requested through Splash.
        """
        url = response.url
        menu = response.css(".category-filter__link")
        self.logger.info("processing response.url - " + response.url)

        # True when the first menu link is the currently selected page,
        # i.e. we are still on the top-level category listing.
        at_top_menu = bool(menu) and bool(menu[0].css('[aria-current="page"]'))
        if not at_top_menu:
            # Subpage: scrape every product tile on the grid.
            tile_sel = '.grid-item'
            name_sel = '.small-type.detail-card-description ::text'
            price_sel = '.price ::text'
            ppu_sel = '.sub-headline.detail-card-subtext ::text'

            metadata = get_url_metadata(self.cursor, url)
            if metadata is None:
                self.logger.debug(f"Metadata is none for {url}")
                metadata = ["", ""]
            section, subsection = metadata[0], metadata[1]
            self.logger.info("subpage - scraping " + url +
                             ", from section - " + section)
            for tile in response.css(tile_sel):
                self.name = tile.css(name_sel).extract_first()
                self.price = tile.css(price_sel).extract_first()
                if self.price is not None:
                    # Strip promo asterisk and currency sign.
                    self.price = self.price.replace('*', '').replace('$', '')
                self.ppu = tile.css(ppu_sel).extract_first()
                if self.ppu is not None:
                    self.ppu = convert_ppu(self.ppu)
                yield {
                    'name': self.name,
                    'price': self.price,
                    'price-per-unit': self.ppu,
                    'section': section,
                    'subsection': subsection,
                    'url': response.url
                }
        finish_url(self.conn, self.store_id, url)
        self.logger.info("finishing url - " + url)
        next_url = get_next_url(self.cursor, 1, store_id=self.store_id)
        if next_url is None:
            self.logger.info(
                "Next url is none therefore we must be finished ! ")
            return
        self.logger.info("got next_url - " + next_url)
        yield SplashRequest(
            next_url,
            self.parse,
            endpoint='execute',
            dont_filter=True,
            args={'lua_source': self.expand_and_scroll_lua})