def to_best_sale(self, url_item):
     '''
     Open a category page and switch the listing to the 'Bán chạy'
     (best-selling) sort order, so products with high sales/ratings
     come first.
     :param url_item: URL of the category listing page to open
     :return: None
     '''
     self.driver.get(url_item)
     self.driver = scroll_down_and_wait(self.driver)
     # Guard against IndexError: the sort tab can be missing when the
     # page failed to render; skip the click instead of crashing the run.
     sort_tabs = self.driver.find_elements_by_xpath(
         "//*[contains(text(), 'Bán chạy')]")
     if sort_tabs:
         sort_tabs[0].click()
     # Random pause to look less like a bot before re-scrolling.
     time.sleep(random.randint(3, 8))
     self.driver = scroll_down_and_wait(self.driver)
 def get_product_url(self):
     '''
     Collect the product URLs visible on the current listing page.
     :return: list of product URLs found on the page
     '''
     self.driver = scroll_down_and_wait(self.driver)
     # Randomized pause to mimic human browsing.
     pause = random.randint(3, 8)
     time.sleep(pause)
     _, urls = get_link_product(self.driver)
     return urls
Ejemplo n.º 3
0
 def crawl_all_category_rating(self, limit_prod, limit_user):
     '''
     Crawl user ratings for every product category and save one csv
     file per category.
     :param limit_prod: quantity of products to visit for each category
     :param limit_user: quantity of user ratings to collect per product
     :return: csv files containing the collected user ratings
     '''
     self.logger.info(
         'Bot not found index, collect product info from all categorical')
     for ind, url_item in enumerate(self.categorical_url):
         count_prod = 0
         user_rating_all = pd.DataFrame(
             [], columns=['username', 'rating', 'comment', 'sp'])
         label = self.index_category.get(ind)
         self.logger.info(
             'Bot started geting product info from {}'.format(label))
         self.to_best_sale(url_item)
         while True:
             with self.move_from_page_to_page() as list_product_url:
                 self.logger.info(
                     'Shopee returned {} url of product'.format(
                         len(list_product_url)))
                 if len(list_product_url) < 30:
                     # Listing did not fully load; reload and retry page.
                     self.driver.refresh()
                     self.driver = scroll_down_and_wait(self.driver)
                 else:
                     for product_url in list_product_url:
                         self.logger.info(
                             'Bot started get into each product')
                         self.to_product_site(product_url)
                         if 'event3' in self.driver.current_url:
                             self.logger.info(
                                 'Advertising block encountered, moving to next url'
                             )
                             continue
                         self.logger.info('Bot parsed rating of user')
                         user_rating_item = self.get_user_rating(
                             limit_user)
                         # DataFrame.append was removed in pandas 2.0;
                         # pd.concat is the equivalent on every version.
                         user_rating_all = pd.concat(
                             [user_rating_all, user_rating_item],
                             sort=False)
                         user_rating_all = user_rating_all.drop_duplicates(
                         ).reset_index(drop=True)
                         self.logger.info(
                             'Bot collected {} rating of user'.format(
                                 user_rating_all.shape[0]))
                         count_prod += 1
             self.logger.info(
                 'Bot parsed {} product from list'.format(count_prod))
             if count_prod >= limit_prod:
                 save_path = os.path.join(
                     BASE_PATH_CRAWLER,
                     'data_collected/{}_rating.csv'.format(label))
                 # Ensure the output directory exists before writing.
                 os.makedirs(os.path.dirname(save_path), exist_ok=True)
                 user_rating_all.to_csv(save_path, index=False)
                 break
 def to_product_site(self, product_url):
     '''
     Navigate to a single product page so its info and user
     ratings/comments can be scraped.
     :param product_url: URL of the product page
     :return: None
     '''
     self.driver.get(product_url)
     time.sleep(random.randint(3, 8))
     self.driver = scroll_down_and_wait(self.driver)
     # Some products show an age-gate popup; dismiss it when present.
     try:
         confirm_btn = self.driver.find_element_by_xpath(
             "//button[@class='btn btn-solid-primary btn--m btn--inline shopee-alert-popup__btn']"
         )
     except NoSuchElementException:
         pass
     else:
         confirm_btn.click()
         print('Confirmed 18+')
Ejemplo n.º 5
0
 def move_from_page_to_page(self):
     '''
     Context manager that yields the product URLs of the current listing
     page, then — no matter how the body exits — navigates back to the
     listing page and clicks the right-arrow button to advance to the
     next page.

     NOTE(review): callers use ``with self.move_from_page_to_page() as ...``,
     so this generator is presumably wrapped with
     ``@contextlib.contextmanager`` — the decorator is not visible in this
     chunk; confirm it exists at the definition site.
     :return: yields the list of product URLs on the current page
     '''
     try:
         # Remember the listing page so we can return to it after the
         # caller has visited individual product pages.
         current_url = self.driver.current_url
         list_product_url = self.get_product_url()
         yield list_product_url
     finally:
         # Always go back to the listing and move one page forward.
         self.driver.get(current_url)
         self.driver = scroll_down_and_wait(self.driver)
         # NOTE(review): on the last page this next-page button may be
         # absent, which would raise NoSuchElementException from the
         # finally block — verify how callers terminate pagination.
         self.driver.find_element_by_xpath(
             "//button[@class='shopee-icon-button shopee-icon-button--right ']"
         ).click()
Ejemplo n.º 6
0
 def crawl_all_category_prod(self, limit_prod):
     '''
     Crawl best-selling products of every category and dump each
     category's product info into its own csv file.
     :param limit_prod: quantity of products to collect per category
     :return: csv files containing product info of all categories
     '''
     self.logger.info(
         'Bot not found index, collect product info from all categorical')
     for ind, url_item in enumerate(self.categorical_url):
         collected = []
         label = self.index_category.get(ind)
         self.logger.info(
             'Bot started get product info from {}'.format(label))
         self.to_best_sale(url_item)
         done = False
         while not done:
             with self.move_from_page_to_page() as page_urls:
                 self.logger.info(
                     'Shopee returned {} url of product'.format(
                         len(page_urls)))
                 if len(page_urls) >= 30:
                     for prod_url in page_urls:
                         self.logger.info(
                             'Bot started move into each product')
                         self.to_product_site(prod_url)
                         if 'event3' in self.driver.current_url:
                             self.logger.info(
                                 'Advertising block encountered, moving to next url'
                             )
                             continue
                         self.logger.info('Bot parsed info of product')
                         collected.append(self.parse_info_product(label))
             self.logger.info('Bot parsed {} product from list'.format(
                 len(collected)))
             if len(collected) >= limit_prod:
                 save_path = os.path.join(
                     BASE_PATH_CRAWLER,
                     'data_collected/{}_prod.csv'.format(label))
                 pd.DataFrame(collected).to_csv(save_path, index=False)
                 done = True
             # A short page means it did not fully load: handled inside
             # the with-block above by refresh + re-scroll.
Ejemplo n.º 7
0
 def crawl_single_category_prod(self, ind, limit_prod):
     '''
     Crawl products of one category, chosen by index, and write the
     collected product info to a csv file.
     :param ind: index of the product category to crawl
     :param limit_prod: quantity of products we need to crawl
     :return: csv file containing info of the crawled products
     '''
     items = []
     label = self.index_category.get(ind)
     self.logger.info(
         'Bot started getting product info from {}'.format(label))
     self.to_best_sale(self.categorical_url[ind])
     while True:
         with self.move_from_page_to_page() as page_urls:
             self.logger.info('Shopee returned {} url of product'.format(
                 len(page_urls)))
             if len(page_urls) >= 30:
                 for link in page_urls:
                     self.logger.info('Bot started move into each product')
                     self.to_product_site(link)
                     if 'event3' in self.driver.current_url:
                         self.logger.info(
                             'Advertising block encounter, move to next url'
                         )
                         continue
                     self.logger.info('Bot parsed info of product')
                     items.append(self.parse_info_product(label))
             else:
                 # Page came back short — reload it and scroll again.
                 self.driver.refresh()
                 self.driver = scroll_down_and_wait(self.driver)
         self.logger.info('Bot parsed {} product from list'.format(
             len(items)))
         if len(items) >= limit_prod:
             out_file = os.path.join(
                 BASE_PATH_CRAWLER,
                 'data_collected/{}_prod.csv'.format(label))
             pd.DataFrame(items).to_csv(out_file, index=False)
             break