class Run:
    """Refresh the stored profile of one hard-coded Shopee shop.

    ``Crawling`` loads the shop page through the shared Selenium wrapper,
    reads the header statistics and writes them to the ``Merchant``
    collection under the shop URL as ``_id``.
    """

    # Scraped values; overwritten by every successful crawl.
    Merchant_html_path = ''
    Merchant_name = ""
    Merchant_rate = ""
    Merchant_product = 0
    Merchant_established = ""
    Merchant_followers = 0
    Merchant_location = ''
    # The attributes below are not read by the methods defined in this class.
    product_page = 0
    max_product_page = 100
    number_item = 0
    Date_Crawling = 0
    word_separator = "+"
    link = ""
    # Created once, when the class body is executed, and shared by instances.
    Selenium = Selenium()
    Search = "beauty"
    page_ordinal = 100

    # Common prefix of the shop-header statistics on the shop page.
    _HEADER_XPATH = "//*[@id='main']/div/div[2]/div[2]/div[2]/div/div[1]/div/div"
    # (attribute name, XPath) pairs, read in this order.
    _FIELD_XPATHS = (
        ("Merchant_name", _HEADER_XPATH + "[1]/div[3]/div[1]/div/h1"),
        ("Merchant_rate", _HEADER_XPATH + "[2]/div[6]/div[2]/div[2]"),
        ("Merchant_product", _HEADER_XPATH + "[2]/div[1]/div[2]/div[2]"),
        ("Merchant_established", _HEADER_XPATH + "[2]/div[7]/div[2]/div[2]"),
        ("Merchant_followers", _HEADER_XPATH + "[2]/div[5]/div[2]/div[2]"),
        ("Merchant_location",
         "//*[@id='main']/div/div[2]/div[2]/div[2]/div/div[3]/div[3]/div[2]"
         "/div/div[2]/div/div[1]/div/a/div/div[2]/div[5]"),
    )

    def Crawling(self, max_retries=None):
        """Crawl the shop page, retrying after a browser refresh on failure.

        Args:
            max_retries: Maximum number of retries after the first failed
                attempt. ``None`` (the default) retries until an attempt
                succeeds, which matches the previous behaviour.

        Raises:
            Exception: The last error, once ``max_retries`` is exhausted.
        """
        self.link = "https://shopee.co.id/herbal_idr"
        failures = 0
        # Iterative retry: the previous version re-entered Crawling() from the
        # except handler, so a persistent failure ended in RecursionError.
        while True:
            try:
                self._crawl_once()
                return
            except Exception as e:
                print(e)
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise
                self.Selenium.Refresh()

    def _crawl_once(self):
        """Load the shop page once and persist the scraped fields."""
        self.Selenium.Load(self.link)
        time.sleep(4)  # give the page time to render
        # Nothing is scraped or written when checkMerchant() returns None.
        if MongoDB().checkMerchant(self.Selenium.link) is None:
            return
        for attribute, xpath in self._FIELD_XPATHS:
            setattr(self, attribute, self.Selenium.ExtractElementText(xpath))
        detail = {
            'Merchant_name': self.Merchant_name,
            'Merchant_rate': self.Merchant_rate,
            'Merchant_location': self.Merchant_location,
            'Merchant_established': self.Merchant_established,
            'Merchant_product': self.Merchant_product,
            'Merchant_followers': self.Merchant_followers,
            'Merchant_crawlingTime': datetime.datetime.now(),
            'status': 1
        }
        # update_one replaces the legacy Collection.update (removed in
        # PyMongo 4); a single-document $set behaves the same.
        db.Merchant.update_one({"_id": self.link}, {"$set": detail})
class Run:
    """Crawl the product listing of each merchant handed out by the database.

    For every merchant link the shop page is opened, its listing pages are
    walked, and one record per product tile is inserted through
    ``db.insertProduct``.

    NOTE(review): the source of this class had lost its indentation. The
    nesting used here is the most plausible reading; the two spots where it
    matters are marked below.
    """

    # Fields of the product currently being processed.
    idProduct = ""
    namaProduct = ""
    jumlahTerjual = ""
    hargaAsli = ""
    hargaDiskon = ""
    hargaRangeAtas = ""
    hargaRangeBawah = ""
    hargaDisRangeAtas = ""
    hargaDisRangeBawah = ""
    word_separator = "+"
    link = ""
    # Created once, when the class body is executed, and shared by instances.
    Selenium = Selenium()
    db = MongoDB()
    # Number of loop passes Crawling() will make.
    max_merchant = db.countMerchant()
    page_ordinal = 100

    # "All products" section of a shop page.
    _SECTION_XPATH = (
        "//*[@id='main']/div/div[2]/div[2]/div[2]/div/div[3]"
        "/div[@class='shop-page__all-products-section']/div[2]/div")
    # Price area of a product detail page.
    _PRICE_XPATH = (
        "//*[@id='main']/div/div[2]/div[2]/div[2]/div[2]/div[3]"
        "/div/div[3]/div/div/div/div")

    def Crawling(self):
        """Process merchants until ``max_merchant`` passes have been made."""
        while self.max_merchant != 0:
            link = self.db.checkMerchantToCrawl()
            last = self.db.checkLastCrawling(link)
            # Resume from the last logged page when there is one.
            page_product = 0 if last is None else last
            if link is not None:
                self._crawl_merchant(link, page_product)
                self.db.updateStatusEnd(link)
            # Decrement on every pass so the loop ends even when no link is
            # returned.
            self.max_merchant -= 1

    def _crawl_merchant(self, link, page_product):
        """Walk the listing pages of one merchant starting after *page_product*."""
        self.db.updateStatus(link)
        self.Selenium.Load(link)
        time.sleep(4)
        max_page = int(self.Selenium.ExtractElementText(
            self._SECTION_XPATH + "/div[1]/div[2]/div/span[2]"))
        while page_product < max_page:
            self.db.writeLog(link, page_product)
            page_product += 1
            # The first page is the shop page that is already loaded.
            if page_product != 1:
                self.Selenium.Load("".join(
                    [link, "?page=", str(page_product - 1), "&sortBy=pop"]))
                time.sleep(4)
            item_total = len(self.Selenium.ExtractElements(
                self._SECTION_XPATH + "/div[2]/div/div"))
            for item_count in range(1, item_total + 1):
                if item_count <= 30:
                    self.Selenium.scrollDown(5)
                self._crawl_item(link, item_count)

    def _crawl_item(self, link, item_count):
        """Scrape tile number *item_count* (1-based) and insert it."""
        item_xpath = (self._SECTION_XPATH + "/div[2]/div/div[" +
                      str(item_count) + "]")
        self.idProduct = self._read_item_href(item_xpath + "/div/a")
        if self.idProduct is None:
            return
        # The product name is the URL slug between 'id/' and '-i'.
        slug = self.idProduct.split("-i")[0].split('id/')
        self.namaProduct = slug[1].replace('-', ' ')
        self.jumlahTerjual = self.Selenium.ExtractElementText(
            item_xpath + "/div/a/div/div[2]/div[4]/div[3]")
        # Clear the price fields first: previously an unrecognised price
        # layout left the previous product's prices in place and they were
        # inserted for this product.
        self.hargaAsli = None
        self.hargaDiskon = None
        self.hargaRangeAtas = None
        self.hargaRangeBawah = None
        self.hargaDisRangeAtas = None
        self.hargaDisRangeBawah = None
        self._read_listing_price(item_xpath)
        # NOTE(review): read as "first tile is always inserted, later tiles
        # only when checkProduct() returns None"; the lost indentation also
        # allows the `else` to belong to the inner check - confirm.
        if item_count != 1 and self.db.checkProduct(
                link, self.idProduct) is not None:
            return
        self.db.insertProduct(
            self.idProduct, link, self.namaProduct, self.jumlahTerjual,
            self.hargaAsli, self.hargaDiskon, self.hargaRangeAtas,
            self.hargaRangeBawah, self.hargaDisRangeAtas,
            self.hargaDisRangeBawah)

    def _read_item_href(self, anchor_xpath):
        """Read the tile link, scrolling and retrying up to two times."""
        try:
            return self.Selenium.ExtractElementAttribute("href", anchor_xpath)
        except Exception:
            self.Selenium.scrollDown(3)
        try:
            return self.Selenium.ExtractElementAttribute("href", anchor_xpath)
        except Exception:
            self.Selenium.scrollUp(2)
        # Last attempt is deliberately unguarded, as before.
        return self.Selenium.ExtractElementAttribute("href", anchor_xpath)

    def _read_listing_price(self, item_xpath):
        """Fill the price fields from the tile, or from the product page."""
        price_xpath = item_xpath + "/div/a/div/div[2]/div[2]"
        blocks = self.Selenium.ExtractElements(price_xpath + "/div")
        # Two and three child blocks were handled by identical code.
        if len(blocks) not in (2, 3):
            return
        spans = self.Selenium.ExtractElements(price_xpath + "/div[1]/span")
        if len(spans) == 2:
            self.hargaAsli = self.Selenium.ExtractElementText(
                price_xpath + "/div[1]/span[2]")
        elif len(spans) == 4:
            self._read_product_page_prices()

    def _read_product_page_prices(self):
        """Open the product page, read its price range(s), then go back."""
        self.Selenium.Load(self.idProduct)
        time.sleep(4)
        blocks = self.Selenium.ExtractElements(self._PRICE_XPATH)
        # NOTE(review): "Atas" receives the first number of "a - b" and
        # "Bawah" the second, as in the original code; the other product
        # crawler in this file assigns them the other way round - confirm
        # which is intended.
        if len(blocks) == 1:
            self.hargaRangeAtas, self.hargaRangeBawah = self._split_range(
                self.Selenium.ExtractElementText(
                    self._PRICE_XPATH + "/div/div"))
        elif len(blocks) == 2:
            self.hargaRangeAtas, self.hargaRangeBawah = self._split_range(
                self.Selenium.ExtractElementText(
                    self._PRICE_XPATH + "/div[1]"))
            self.hargaDisRangeAtas, self.hargaDisRangeBawah = (
                self._split_range(self.Selenium.ExtractElementText(
                    self._PRICE_XPATH + "/div[2]/div[1]")))
        self.Selenium.BackPage()

    @staticmethod
    def _split_range(text):
        """Split ``"Rp1.000 - Rp2.000"`` into ``("1000", "2000")``.

        A single price yields the same value twice and ``None`` yields
        ``(None, None)``; both cases used to raise.
        """
        if text is None:
            return None, None
        parts = [part.replace('.', '')
                 for part in text.replace('Rp', '').split(' - ')]
        second = parts[1] if len(parts) > 1 else parts[0]
        return parts[0], second
class Run:
    """Crawl product detail pages of merchants and store matching products.

    For every merchant link handed out by the database, each listing page is
    opened, every product is visited, and products whose category text is in
    ``arraySubCategory`` are inserted through ``db.insertProductVerse2``.

    NOTE(review): the source of this class had lost its indentation; the
    nesting used here is the most plausible reading.
    """

    idProduct = ""
    namaProduct = ""
    checkCategory = ""
    # Category texts for which a product is stored.
    arraySubCategory = [
        "Perawatan Tubuh", "Alat Kecantikan", "Alat Rambut",
        "Kecantikan Lainnya", "Kosmetik Mata", "Perawatan Kuku",
        "Perawatan Pria", "Kosmetik Wajah", "Perawatan Rambut", "Parfum",
        "Kosmetik Bibir", "Perawatan Wajah", "Paket Kecantikan"
    ]
    # Per-product fields; reset to these values after each insert.
    jumlahTerjual = 0
    hargaAsli = 0
    hargaAsliTDAtas = 0
    hargaAsliTDBawah = 0
    hargaTanpaDiskonFix = 0
    hargaAsliDisRangeAtas = 0
    hargaAsliDisRangeBawah = 0
    hargaDiskon = 0
    hargaRangeAtas = 0
    hargaRangeBawah = 0
    hargaDisRangeAtas = 0
    hargaDisRangeBawah = 0
    variasi = ""
    variasiFix = ""
    productTerjual = 0
    productRating = ""
    productUlasan = 0
    word_separator = "+"
    link = ""
    # Created once, when the class body is executed, and shared by instances.
    Selenium = Selenium()
    db = MongoDB()
    # Number of loop passes Crawling() will make.
    max_merchant = db.countMerchant()
    page_ordinal = 100

    # "All products" section of a shop page.
    _SECTION_XPATH = (
        "//*[@id='main']/div/div[2]/div[2]/div[2]/div/div[3]"
        "/div[@class='shop-page__all-products-section']/div[2]/div")
    # Breadcrumb entry compared against arraySubCategory.
    _CATEGORY_XPATH = (
        "//*[@id='main']/div/div[2]/div[2]/div[2]/div[3]/div[2]/div[1]"
        "/div[1]/div[1]/div[2]/div[1]/div/a[3]")
    # Summary column of a product detail page.
    _DETAIL_XPATH = "//*[@id='main']/div/div[2]/div[2]/div[2]/div[2]/div[3]/div"
    _PRICE_XPATH = _DETAIL_XPATH + "/div[3]/div/div/div/div"
    # Buttons of the first and second variation group.
    _VARIATION_XPATHS = (
        _DETAIL_XPATH + "/div[4]/div/div[2]/div/div[1]/div/button",
        _DETAIL_XPATH + "/div[4]/div/div[3]/div/div[1]/div/button",
    )

    def Crawling(self):
        """Process merchants until ``max_merchant`` passes have been made."""
        while self.max_merchant != 0:
            link = self.db.checkMerchantToCrawl()
            last = self.db.checkLastCrawling(link)
            page_product = 0 if last is None else last
            if link is not None:
                self._crawl_merchant(link, page_product)
                self.db.updateStatusEnd(link)
            # The counter was never decremented, so the loop could not end;
            # decrement once per pass like the sibling listing crawler.
            self.max_merchant -= 1

    def _crawl_merchant(self, link, page_product):
        """Walk the listing pages of one merchant starting after *page_product*."""
        self.db.updateStatus(link)
        self.Selenium.Load(link)
        time.sleep(4)
        max_page = int(self.Selenium.ExtractElementText(
            self._SECTION_XPATH + "/div[1]/div[2]/div/span[2]"))
        while page_product < max_page:
            page_product += 1
            self.db.writeLog(link, page_product)
            self.Selenium.Load("".join(
                [link, "?page=", str(page_product - 1), "&sortBy=sales"]))
            time.sleep(4)
            item_total = len(self.Selenium.ExtractElements(
                self._SECTION_XPATH + "/div[2]/div/div"))
            for item_count in range(1, item_total + 1):
                self._crawl_product(link, item_count)

    def _crawl_product(self, link, item_count):
        """Visit tile number *item_count* (1-based) and store it if it matches."""
        self.idProduct = self.Selenium.ExtractElementAttribute(
            "href", self._SECTION_XPATH + "/div[2]/div/div[" +
            str(item_count) + "]/div/a")
        if self.idProduct is None:
            return
        # The product name is the URL slug between 'id/' and '-i'.
        slug = self.idProduct.split("-i")[0].split('id/')
        self.namaProduct = slug[1].replace('-', ' ').replace(',', ' ')
        self.Selenium.Load(self.idProduct)
        self.checkCategory = self.Selenium.ExtractElementText(
            self._CATEGORY_XPATH)
        # Membership test instead of a loop that printed one line per list
        # entry; the message is now printed once per product.
        if self.checkCategory in self.arraySubCategory:
            print("category match")
            self._store_product(link)
        else:
            print("category not match")
        self.Selenium.BackPage()
        self.variasiFix = ""

    def _store_product(self, link):
        """Scrape the loaded product page and insert the record."""
        # Price without discount: fixed value or a low-high range.
        (self.hargaTanpaDiskonFix, self.hargaAsliTDBawah,
         self.hargaAsliTDAtas) = self._parse_price(
             self.Selenium.ExtractElementText(self._PRICE_XPATH + "/div/div"))
        # Original price shown next to a discount.
        (self.hargaAsliDiskon, self.hargaAsliDisRangeBawah,
         self.hargaAsliDisRangeAtas) = self._parse_price(
             self.Selenium.ExtractElementText(self._PRICE_XPATH + "/div[1]"))
        # Price after discount.
        (self.hargaDiskon, self.hargaDisRangeBawah,
         self.hargaDisRangeAtas) = self._parse_price(
             self.Selenium.ExtractElementText(
                 self._PRICE_XPATH + "/div[2]/div[1]"))
        self._collect_variations()
        self.productTerjual = self._to_int(self.Selenium.ExtractElementText(
            self._DETAIL_XPATH + "/div[2]/div[3]/div[1]"))
        self.productRating = self.Selenium.ExtractElementText(
            self._DETAIL_XPATH + "/div[2]/div[1]/div[1]")
        # The review count gets the same None guard as the sold count.
        self.productUlasan = self._to_int(self.Selenium.ExtractElementText(
            self._DETAIL_XPATH + "/div[2]/div[2]/div[1]"))
        self.db.insertProductVerse2(
            link, self.idProduct, self.checkCategory, self.namaProduct,
            self.productTerjual, self.productRating, self.productUlasan,
            self.variasiFix, self.hargaTanpaDiskonFix, self.hargaAsliTDAtas,
            self.hargaAsliTDBawah, self.hargaDiskon, self.hargaDisRangeAtas,
            self.hargaDisRangeBawah, self.hargaAsliDiskon,
            self.hargaAsliDisRangeAtas, self.hargaAsliDisRangeBawah)
        self._reset_product_fields()

    def _collect_variations(self):
        """Append every variation label to ``variasiFix`` as ``|label``."""
        try:
            self.jmlvariasi = len(
                self.Selenium.ExtractElements(self._VARIATION_XPATHS[0]))
            self.jmlvariasi2 = len(
                self.Selenium.ExtractElements(self._VARIATION_XPATHS[1]))
            self.variasi = ""
            if self.jmlvariasi != 0:
                group_xpath, total = self._VARIATION_XPATHS[0], self.jmlvariasi
            elif self.jmlvariasi2 != 0:
                # The text used to be read from the first group's XPath
                # although the buttons were counted in the second group.
                group_xpath, total = self._VARIATION_XPATHS[1], self.jmlvariasi2
            else:
                return
            print(total)
            for position in range(1, total + 1):
                # Strip commas before storing; the old code only sanitised
                # `variasi`, which is never persisted.
                self.variasi = self.Selenium.ExtractElementText(
                    group_xpath + "[" + str(position) + "]").replace(',', ' ')
                self.variasiFix = self.variasiFix + "|" + self.variasi
        except Exception:
            # Best effort: keep whatever was collected so far.
            self.variasi = ""

    def _reset_product_fields(self):
        """Restore the per-product fields to their class defaults."""
        self.jumlahTerjual = 0
        self.hargaAsli = 0
        self.hargaAsliTDAtas = 0
        self.hargaAsliTDBawah = 0
        self.hargaTanpaDiskonFix = 0
        self.hargaAsliDisRangeAtas = 0
        self.hargaAsliDisRangeBawah = 0
        self.hargaDiskon = 0
        self.hargaRangeAtas = 0
        self.hargaRangeBawah = 0
        self.hargaDisRangeAtas = 0
        self.hargaDisRangeBawah = 0
        self.variasi = ""
        self.variasiFix = ""
        self.productTerjual = 0
        self.productRating = ""
        self.productUlasan = 0

    @staticmethod
    def _parse_price(text):
        """Return ``(fixed, low, high)`` parsed from a price label.

        ``"Rp10.000"`` gives ``(10000, 0, 0)`` and ``"Rp10.000 - Rp20.000"``
        gives ``(0, 10000, 20000)``. ``None`` or any other shape gives
        ``(0, 0, 0)``. Previously a range left the "fixed" attribute holding
        the list of strings, which was then passed to the insert.
        """
        if text is None:
            return 0, 0, 0
        parts = text.replace("Rp", '').replace(".", '').split(' - ')
        if len(parts) == 1:
            return int(parts[0]), 0, 0
        if len(parts) == 2:
            return 0, int(parts[0]), int(parts[1])
        return 0, 0, 0

    @staticmethod
    def _to_int(text):
        """Convert a counter label to ``int``; ``None`` counts as 0."""
        return 0 if text is None else int(text)
class Run:
    """Discover merchants by walking category listings filtered by city.

    ``main`` repeatedly asks the database for a (link, city) pair and hands
    it to ``Crawling``, which opens every product on each listing page,
    follows the link to its shop and stores shops for which
    ``checkMerchant`` returns ``None``.

    NOTE(review): the source of this class had lost its indentation; the
    nesting used here is the most plausible reading.
    """

    # Fields of the merchant currently being processed.
    Merchant_html_path = ''
    Merchant_name = ""
    Merchant_rate = ""
    Merchant_product = 0
    Merchant_established = ""
    Merchant_followers = 0
    Merchant_location = ''
    # Paging state of the listing being crawled.
    product_page = 0
    max_product_page = 100
    number_item = 0
    Date_Crawling = 0
    # Replacement for spaces in the city name inside the URL.
    word_separator = "%2520"
    link = ""
    # Created once, when the class body is executed, and shared by instances.
    Selenium = Selenium()
    page_ordinal = 100

    # Result container of a listing page.
    _LISTING_XPATH = (
        "//*[@id='main']/div/div[2]/div[2]/div/div"
        "/div[@class='container _1EofO_']/div[2]/div/div")
    # Sections of a product page; the shop link sits in one of them.
    _SHOP_SECTION_XPATH = "//*[@id='main']/div/div[2]/div[2]/div[2]/div[3]/div"
    # Common prefix of the shop-header statistics on the shop page.
    _HEADER_XPATH = "//*[@id='main']/div/div[2]/div[2]/div[2]/div/div[1]/div/div"
    # (attribute name, XPath) pairs, read in this order.
    _MERCHANT_FIELD_XPATHS = (
        ("Merchant_name", _HEADER_XPATH + "[1]/div[3]/div[1]/div/h1"),
        ("Merchant_rate", _HEADER_XPATH + "[2]/div[6]/div[2]/div[2]"),
        ("Merchant_product", _HEADER_XPATH + "[2]/div[1]/div[2]/div[2]"),
        ("Merchant_established", _HEADER_XPATH + "[2]/div[7]/div[2]/div[2]"),
        ("Merchant_followers", _HEADER_XPATH + "[2]/div[5]/div[2]/div[2]"),
    )

    def Crawling(self, Link_Merchant, Code_City):
        """Crawl all listing pages of *Link_Merchant* filtered by *Code_City*.

        Args:
            Link_Merchant: Base URL of the listing to crawl.
            Code_City: City name used as the ``locations`` filter; spaces are
                replaced by ``word_separator``.
        """
        MongoDB().updateRunning(Link_Merchant)
        MongoDB().updateStatusCityRun(Link_Merchant, Code_City)
        while self.product_page == 0 or (
                self.product_page < self.page_ordinal
                and self.product_page < self.max_product_page):
            # The page to crawl is re-read from the database on every pass.
            self.product_page = MongoDB().lastPage(Link_Merchant)
            self.Selenium.link = (
                Link_Merchant + "?locations=" +
                Code_City.replace(" ", self.word_separator) + "&page=" +
                str(self.product_page) + "&sortBy=sales")
            self.Selenium.Load(self.Selenium.link)
            time.sleep(4)
            self.page_ordinal = int(self.Selenium.ExtractElementText(
                self._LISTING_XPATH + "/div[1]/div[2]/div/span[2]")) - 1
            self.product_page += 1
            print("jumlah page ;" + str(self.page_ordinal))
            print('item :' + str(0))
            item_ordinal = self.Selenium.ExtractElements(
                self._LISTING_XPATH + "/div[2]/div")
            print("jumlah item : " + str(len(item_ordinal)))
            for item_count in range(1, len(item_ordinal) + 1):
                self._crawl_listing_item(item_count)
            MongoDB().updatePages(Link_Merchant, self.product_page)
        self.product_page = 0
        self.number_item = 0
        self.Date_Crawling = 0
        self.max_product_page = 100
        MongoDB().updateStatusCityDone(Link_Merchant, Code_City)
        MongoDB().updatePages(Link_Merchant)
        self.Selenium.clear_cache()

    def _crawl_listing_item(self, item_count):
        """Process listing tile number *item_count* (1-based)."""
        # Start from None: tiles from position 51 on are not read, and used
        # to re-process the link of the previous tile.
        product_link = None
        if item_count < 51:
            if item_count >= 11:
                # 11-20 scroll 3, 21-30 scroll 4, 31-40 scroll 5, 41-50 scroll 6.
                self.Selenium.scrollDown(2 + (item_count - 1) // 10)
            item_xpath = (self._LISTING_XPATH + "/div[2]/div[" +
                          str(item_count) + "]")
            product_link = self.Selenium.ExtractElementAttribute(
                'href', item_xpath + "/div/a")
            self.Merchant_location = self.Selenium.ExtractElementText(
                item_xpath + "/div/a/div/div[2]/div[5]")
        else:
            self.Selenium.scrollUp(3)
        print('item :' + str(item_count))
        if product_link is not None:
            self._visit_product(product_link)

    def _visit_product(self, product_link):
        """Open a product page, store its shop if needed, then go back."""
        self.Selenium.Load(product_link)
        time.sleep(3)
        sections = len(
            self.Selenium.ExtractElements(self._SHOP_SECTION_XPATH))
        if sections == 3:
            shop_anchor = self._SHOP_SECTION_XPATH + "[2]/div[1]/div/div[3]/a"
        elif sections == 2:
            shop_anchor = self._SHOP_SECTION_XPATH + "[1]/div[1]/div/div[3]/a"
        else:
            # Unknown layout: report the product and skip the shop lookup
            # instead of continuing with an empty or stale shop URL.
            shop_anchor = None
            print(product_link)
        if shop_anchor is not None:
            self.Merchant_html_path = self.Selenium.ExtractElementAttribute(
                'href', shop_anchor)
            if self.Merchant_html_path:
                self._store_merchant_if_new()
        self.Selenium.BackPage()
        time.sleep(2)

    def _store_merchant_if_new(self):
        """Scrape and insert the shop when ``checkMerchant`` returns None."""
        if MongoDB().checkMerchant(self.Merchant_html_path) is not None:
            return
        self.Selenium.Load(self.Merchant_html_path)
        time.sleep(3)
        for attribute, xpath in self._MERCHANT_FIELD_XPATHS:
            setattr(self, attribute, self.Selenium.ExtractElementText(xpath))
        self.Selenium.BackPage()
        try:
            MongoDB().insert_Shopee(
                self.Merchant_html_path, self.Merchant_name,
                self.Merchant_rate, self.Merchant_product,
                self.Merchant_established, self.Merchant_followers,
                self.Merchant_location)
            self.Merchant_html_path = ''
            self.Merchant_name = ""
            self.Merchant_rate = ""
            self.Merchant_product = 0
            self.Merchant_established = ""
            self.Merchant_followers = 0
            self.Merchant_location = ''
        except Exception:
            # Best effort: a failed insert must not stop the crawl.
            print("skip")

    def main(self):
        """Crawl (link, city) pairs until ``countCategory()`` reports 0.

        If a pass fails, the error is printed and recorded, and the crawl is
        attempted once more; an error in that second attempt propagates.
        """
        Link_Merchant_count = MongoDB().countCategory()
        while Link_Merchant_count != 0:
            try:
                link, city = self._next_target()
                self.Crawling(link, city)
                self.product_page = 0
                self.number_item = 0
                self.Date_Crawling = 0
                self.max_product_page = 100
                print(city)
                # Mark the pair that was actually crawled. The old code
                # queried getLinkToCrawling() again for every argument, which
                # presumably could return a different pair by then.
                MongoDB().updateStatusCityDone(link, city)
                MongoDB().updatedStatusCategory(link)
            except Exception as e:
                print(e)
                MongoDB().timeTrackError()
                link, city = self._next_target()
                self.Crawling(link, city)
            Link_Merchant_count = MongoDB().countCategory()
        MongoDB().updateMerchantTimeEnd()

    @staticmethod
    def _next_target():
        """Fetch the next (link, city) pair with a single database call."""
        target = MongoDB().getLinkToCrawling()
        return target[0], target[1]
class Run:
    """Walk the listing pages for ``Search`` and store each product tile.

    Every tile's name (taken from its link) and location are inserted into
    ``db.product``; the price is read and printed only.

    NOTE(review): the source of this class had lost its indentation; the
    nesting used here is the most plausible reading.
    """

    # Fields of the tile currently being processed.
    Product_name = ""
    Product_price = 0
    Product_price_dis = 0
    Product_location = ""
    Product_sold = 0
    # Paging state.
    product_page = 0
    max_product_page = 84
    number_item = 0
    Date_Crawling = 0
    word_separator = "+"
    link = ""
    # Created once, when the class body is executed, and shared by instances.
    Selenium = Selenium()
    Search = "beauty"

    def Crawling(self):
        """Load the first page, then crawl until ``max_product_page`` is reached."""
        tiles_xpath = ("//*[@id='main']/div/div[2]/div[2]/div/div/div[2]"
                       "/div[2]/div[2]/div/div[2]/div")

        # NOTE(review): the first URL uses the category path while every
        # following page uses the search path - kept as found.
        keyword = Concatenate().InfuseSeparator(
            main=self.Search, separator=self.word_separator)
        self.Selenium.link = ("https://shopee.co.id/Kecantikan-cat.14840" +
                              keyword + "?page=" + str(self.product_page))
        self.Selenium.Load(self.Selenium.link)
        time.sleep(3)
        print(self.Selenium.link)

        while self.product_page < self.max_product_page:
            tiles = self.Selenium.ExtractElements(tiles_xpath)
            print(len(tiles))
            self.product_page += 1

            for position in range(1, len(tiles) + 1):
                anchor_xpath = tiles_xpath + "[" + str(position) + "]/div/a"

                self.Product_location = self.Selenium.ExtractElementText(
                    anchor_xpath + "/div/div[2]/div[5]")
                print(self.Product_location)

                price_text = self.Selenium.ExtractElementText(
                    anchor_xpath + "/div/div[2]/div[2]")
                self.Product_price = price_text.replace('Rp', '')
                print(self.Product_price)

                href = self.Selenium.ExtractElementAttribute(
                    'href', anchor_xpath)
                self.Product_name = href.replace(
                    "https://shopee.co.id/", '').replace('-', ' ')
                # Only the part before the first 'i.' is printed; the full
                # string is what gets stored.
                print(self.Product_name.split('i.')[0])

                db.product.insert_one({
                    'name': self.Product_name,
                    'location': self.Product_location,
                })

            next_keyword = Concatenate().InfuseSeparator(
                main=self.Search, separator=self.word_separator)
            self.Selenium.link = ("https://shopee.co.id/search?keyword=" +
                                  next_keyword + "&page=" +
                                  str(self.product_page))
            self.Selenium.Load(self.Selenium.link)
            time.sleep(3)