Exemple #1
0
 def Crawling(self, max_retries=5):
     """Scrape the merchant profile page and store its details in MongoDB.

     Loads the hard-coded shop URL and, when ``MongoDB().checkMerchant``
     returns something other than ``None`` for ``self.Selenium.link``,
     reads the merchant name, rating, product count, established date,
     follower count and location by XPath, then writes them with ``$set``
     on the ``Merchant`` document whose ``_id`` is the shop URL.

     Any exception is printed and the page is refreshed before the next
     attempt. The previous implementation retried by calling itself
     recursively with no limit, so a persistent failure (site layout
     change, network down) ended in ``RecursionError``; retries are now
     bounded.

     Args:
         max_retries: Maximum number of load/extract attempts. After the
             last failed attempt the method gives up and returns.

     Returns:
         None.
     """
     self.link = "https://shopee.co.id/herbal_idr"

     # Shared XPath prefixes; the concatenated strings are identical to the
     # absolute paths used before.
     shop_root = "//*[@id='main']/div/div[2]/div[2]/div[2]/div"
     header = shop_root + "/div[1]/div"
     stats = header + "/div[2]"

     for _attempt in range(max_retries):
         try:
             self.Selenium.Load(self.link)
             time.sleep(4)  # fixed pause after loading, before reading elements
             if MongoDB().checkMerchant(self.Selenium.link) is not None:
                 extract = self.Selenium.ExtractElementText
                 self.Merchant_name = extract(
                     header + "/div[1]/div[3]/div[1]/div/h1")
                 self.Merchant_rate = extract(
                     stats + "/div[6]/div[2]/div[2]")
                 self.Merchant_product = extract(
                     stats + "/div[1]/div[2]/div[2]")
                 self.Merchant_established = extract(
                     stats + "/div[7]/div[2]/div[2]")
                 self.Merchant_followers = extract(
                     stats + "/div[5]/div[2]/div[2]")
                 self.Merchant_location = extract(
                     shop_root +
                     "/div[3]/div[3]/div[2]/div/div[2]/div/div[1]/div/a/div/div[2]/div[5]"
                 )
                 detail = {
                     'Merchant_name': self.Merchant_name,
                     'Merchant_rate': self.Merchant_rate,
                     'Merchant_location': self.Merchant_location,
                     'Merchant_established': self.Merchant_established,
                     'Merchant_product': self.Merchant_product,
                     'Merchant_followers': self.Merchant_followers,
                     'Merchant_crawlingTime': datetime.datetime.now(),
                     'status': 1,
                 }
                 # NOTE(review): `db` is resolved from the enclosing module.
                 # If it is a PyMongo database, `Collection.update` no longer
                 # exists in PyMongo 4 and `update_one` is the replacement
                 # -- confirm against the installed driver.
                 db.Merchant.update({"_id": self.link}, {"$set": detail})
             return
         except Exception as e:
             # Best-effort scraper: report the problem, refresh, try again.
             print(e)
             self.Selenium.Refresh()
Exemple #2
0
class Run:
    """Crawl the product listing pages of every merchant queued in MongoDB.

    For each merchant link returned by ``db.checkMerchantToCrawl()`` the
    crawler walks the shop's listing pages, reads every product card and
    stores one record per product through ``db.insertProduct``.

    Fixes compared with the previous version:

    * the outer loop stops when no merchant link is available instead of
      spinning forever (the counter was only decremented for non-None links);
    * price fields are cleared for every product, so a card whose layout
      matches none of the known shapes is no longer stored with the prices
      of the previous product;
    * the duplicated handling of cards with two or three price rows is
      merged into one code path.
    """

    # Scratch fields describing the product currently being processed.
    idProduct = ""  # href of the product card
    namaProduct = ""  # product name derived from the URL slug
    jumlahTerjual = ""  # text of the sold-count element on the card
    hargaAsli = ""  # price text taken from the card (single-price layout)
    hargaDiskon = ""  # never scraped by this crawler; stored as None
    hargaRangeAtas = ""  # first value of the "a - b" price range text
    hargaRangeBawah = ""  # second value of the "a - b" price range text
    hargaDisRangeAtas = ""  # first value of the second price range text
    hargaDisRangeBawah = ""  # second value of the second price range text
    # NOTE(review): "Atas"/"Bawah" (upper/lower) receive the first/second
    # value of the range respectively -- confirm this is the intended order.
    word_separator = "+"
    link = ""
    Selenium = Selenium()
    db = MongoDB()

    # Number of merchants to process; counted once when the class is created.
    max_merchant = db.countMerchant()
    page_ordinal = 100

    # XPath building blocks. Concatenations reproduce the absolute paths
    # used before, character for character.
    _ALL_PRODUCTS_XPATH = (
        "//*[@id='main']/div/div[2]/div[2]/div[2]/div/div[3]"
        "/div[@class='shop-page__all-products-section']/div[2]/div")
    _ITEMS_XPATH = _ALL_PRODUCTS_XPATH + "/div[2]/div/div"
    _PRICE_XPATH = (
        "//*[@id='main']/div/div[2]/div[2]/div[2]/div[2]/div[3]"
        "/div/div[3]/div/div/div/div")

    def Crawling(self):
        """Process merchants until the counter reaches zero or none is left.

        Resumes from the page number returned by ``db.checkLastCrawling``
        when there is one, logs every page through ``db.writeLog`` before
        visiting it, and marks the merchant finished with
        ``db.updateStatusEnd``.
        """
        while self.max_merchant != 0:
            link = self.db.checkMerchantToCrawl()
            if link is None:
                # Nothing left to crawl; looping again would only hammer
                # the database without ever changing the counter.
                break
            last = self.db.checkLastCrawling(link)
            page_product = last if last is not None else 0

            self.db.updateStatus(link)
            self.Selenium.Load(link)
            time.sleep(4)
            max_page = int(
                self.Selenium.ExtractElementText(
                    self._ALL_PRODUCTS_XPATH + "/div[1]/div[2]/div/span[2]"))

            while page_product < max_page:
                self.db.writeLog(link, page_product)
                page_product += 1
                # The first page is already displayed by the initial load.
                if page_product != 1:
                    self.Selenium.Load("".join([
                        link, "?page=",
                        str(page_product - 1), "&sortBy=pop"
                    ]))
                    time.sleep(4)
                self._crawl_listing_page(link)

            self.db.updateStatusEnd(link)
            self.max_merchant -= 1

    def _crawl_listing_page(self, link):
        """Scrape and store every product card on the displayed page."""
        item_total = len(self.Selenium.ExtractElements(self._ITEMS_XPATH))
        for item_count in range(1, item_total + 1):
            # Scroll before each of the first 30 cards.
            if item_count <= 30:
                self.Selenium.scrollDown(5)
            item_xpath = "".join(
                [self._ITEMS_XPATH, "[", str(item_count), "]"])

            self.idProduct = self._extract_product_link(item_xpath)
            if self.idProduct is None:
                continue

            # NOTE(review): splitting on the first "-i" truncates slugs that
            # themselves contain "-i" -- confirm against real product URLs.
            slug = self.idProduct.split("-i")[0].split('id/')
            self.namaProduct = slug[1].replace('-', ' ')
            self.jumlahTerjual = self.Selenium.ExtractElementText(
                item_xpath + "/div/a/div/div[2]/div[4]/div[3]")

            self._reset_prices()
            self._scrape_prices(item_xpath)

            # The first card of a page is inserted without the duplicate
            # check; every other card is inserted only when not yet stored.
            if item_count == 1 or self.db.checkProduct(
                    link, self.idProduct) is None:
                self.db.insertProduct(
                    self.idProduct, link, self.namaProduct,
                    self.jumlahTerjual, self.hargaAsli, self.hargaDiskon,
                    self.hargaRangeAtas, self.hargaRangeBawah,
                    self.hargaDisRangeAtas, self.hargaDisRangeBawah)

    def _extract_product_link(self, item_xpath):
        """Read the card's href, scrolling down then up if the read fails."""
        link_xpath = item_xpath + "/div/a"
        try:
            return self.Selenium.ExtractElementAttribute("href", link_xpath)
        except Exception:
            self.Selenium.scrollDown(3)
        try:
            return self.Selenium.ExtractElementAttribute("href", link_xpath)
        except Exception:
            self.Selenium.scrollUp(2)
        # Last attempt: a failure here propagates to the caller.
        return self.Selenium.ExtractElementAttribute("href", link_xpath)

    def _reset_prices(self):
        """Clear all price fields so no value leaks from the previous card."""
        self.hargaAsli = None
        self.hargaDiskon = None
        self.hargaRangeAtas = None
        self.hargaRangeBawah = None
        self.hargaDisRangeAtas = None
        self.hargaDisRangeBawah = None

    def _scrape_prices(self, item_xpath):
        """Fill the price fields from the card, or from the product page."""
        price_box = item_xpath + "/div/a/div/div[2]/div[2]"
        rows = self.Selenium.ExtractElements(price_box + "/div")
        # Cards with two or three rows are handled the same way.
        if len(rows) not in (2, 3):
            return
        spans = self.Selenium.ExtractElements(price_box + "/div[1]/span")
        if len(spans) == 2:
            # Single price: its text is in the second span.
            self.hargaAsli = self.Selenium.ExtractElementText(
                price_box + "/div[1]/span[2]")
        elif len(spans) == 4:
            # Price range: the values are read from the product page.
            self._scrape_price_ranges()

    def _scrape_price_ranges(self):
        """Open the product page, read its price range(s) and go back."""
        self.Selenium.Load(self.idProduct)
        time.sleep(4)
        blocks = self.Selenium.ExtractElements(self._PRICE_XPATH)
        if len(blocks) == 1:
            text = self.Selenium.ExtractElementText(
                self._PRICE_XPATH + "/div/div")
            self.hargaRangeAtas, self.hargaRangeBawah = self._split_range(
                text)
        elif len(blocks) == 2:
            first_text = self.Selenium.ExtractElementText(
                self._PRICE_XPATH + "/div[1]")
            second_text = self.Selenium.ExtractElementText(
                self._PRICE_XPATH + "/div[2]/div[1]")
            self.hargaRangeAtas, self.hargaRangeBawah = self._split_range(
                first_text)
            self.hargaDisRangeAtas, self.hargaDisRangeBawah = (
                self._split_range(second_text))
        self.Selenium.BackPage()

    @staticmethod
    def _split_range(text):
        """Split "Rp1.000 - Rp2.000" into ("1000", "2000").

        Raises IndexError when the text does not contain " - ".
        """
        parts = text.replace('Rp', '').split(' - ')
        return parts[0].replace('.', ''), parts[1].replace('.', '')
Exemple #3
0
class Run:
    """Crawl product detail pages of queued merchants for beauty categories.

    For each merchant link returned by ``db.checkMerchantToCrawl()`` the
    crawler walks the shop's listing pages (sorted by sales), opens every
    product, and -- when the product's category text is one of
    ``arraySubCategory`` -- stores prices, variations, sold count, rating
    and review count through ``db.insertProductVerse2``.

    Fixes compared with the previous version:

    * the merchant counter is decremented and the loop stops when no link
      is available; previously the outer loop never terminated;
    * labels of the second variation group are read from the group that
      was counted (``div[3]``) instead of from the first group;
    * for range prices the scalar price field is 0 instead of a leftover
      list of strings;
    * commas are stripped from the variation labels that are stored, not
      from a value that was never stored;
    * missing price or review-count text yields 0 instead of an exception;
    * "category not match" is printed once per product, not once per
      category name.
    """

    idProduct = ""  # href of the product card
    namaProduct = ""  # product name derived from the URL slug
    checkCategory = ""  # category text read from the product page
    # Category names a product must match to be stored.
    arraySubCategory = [
        "Perawatan Tubuh", "Alat Kecantikan", "Alat Rambut",
        "Kecantikan Lainnya", "Kosmetik Mata", "Perawatan Kuku",
        "Perawatan Pria", "Kosmetik Wajah", "Perawatan Rambut", "Parfum",
        "Kosmetik Bibir", "Perawatan Wajah", "Paket Kecantikan"
    ]
    jumlahTerjual = 0
    hargaAsli = 0
    # Price without discount: upper/lower bound when it is a range,
    # otherwise the single value in hargaTanpaDiskonFix.
    hargaAsliTDAtas = 0
    hargaAsliTDBawah = 0
    hargaTanpaDiskonFix = 0
    # Original price of a discounted product (range bounds).
    hargaAsliDisRangeAtas = 0
    hargaAsliDisRangeBawah = 0
    # Price after discount: single value and range bounds.
    hargaDiskon = 0
    hargaRangeAtas = 0
    hargaRangeBawah = 0
    hargaDisRangeAtas = 0
    hargaDisRangeBawah = 0
    variasi = ""  # label of the variation read most recently
    variasiFix = ""  # all variation labels, each prefixed with "|"
    productTerjual = 0  # sold count
    productRating = ""  # rating text
    productUlasan = 0  # review count
    word_separator = "+"
    link = ""
    Selenium = Selenium()
    db = MongoDB()

    # Number of merchants to process; counted once when the class is created.
    max_merchant = db.countMerchant()
    page_ordinal = 100

    # XPath building blocks. Concatenations reproduce the absolute paths
    # used before, character for character.
    _ALL_PRODUCTS_XPATH = (
        "//*[@id='main']/div/div[2]/div[2]/div[2]/div/div[3]"
        "/div[@class='shop-page__all-products-section']/div[2]/div")
    _ITEMS_XPATH = _ALL_PRODUCTS_XPATH + "/div[2]/div/div"
    _CATEGORY_XPATH = (
        "//*[@id='main']/div/div[2]/div[2]/div[2]/div[3]/div[2]/div[1]"
        "/div[1]/div[1]/div[2]/div[1]/div/a[3]")
    _PRODUCT_XPATH = "//*[@id='main']/div/div[2]/div[2]/div[2]/div[2]/div[3]/div"
    _PRICE_XPATH = _PRODUCT_XPATH + "/div[3]/div/div/div/div"
    _STATS_XPATH = _PRODUCT_XPATH + "/div[2]"
    _VARIATION_XPATH = _PRODUCT_XPATH + "/div[4]/div"
    _VARIATION_BUTTONS = "/div/div[1]/div/button"

    def Crawling(self):
        """Process merchants until the counter reaches zero or none is left.

        Resumes from the page number returned by ``db.checkLastCrawling``
        when there is one, logs every page through ``db.writeLog`` and marks
        the merchant finished with ``db.updateStatusEnd``.
        """
        while self.max_merchant != 0:
            link = self.db.checkMerchantToCrawl()
            if link is None:
                # Nothing left to crawl; looping again would never end.
                break
            last = self.db.checkLastCrawling(link)
            page_product = last if last is not None else 0

            self.db.updateStatus(link)
            self.Selenium.Load(link)
            time.sleep(4)
            max_page = int(
                self.Selenium.ExtractElementText(
                    self._ALL_PRODUCTS_XPATH + "/div[1]/div[2]/div/span[2]"))

            while page_product < max_page:
                page_product += 1
                self.db.writeLog(link, page_product)
                self.Selenium.Load("".join([
                    link, "?page=",
                    str(page_product - 1), "&sortBy=sales"
                ]))
                time.sleep(4)
                self._crawl_listing_page(link)

            self.db.updateStatusEnd(link)
            self.max_merchant -= 1

    def _crawl_listing_page(self, link):
        """Open every product on the displayed page and store the matches."""
        item_total = len(self.Selenium.ExtractElements(self._ITEMS_XPATH))
        for item_count in range(1, item_total + 1):
            self.idProduct = self.Selenium.ExtractElementAttribute(
                "href", "".join(
                    [self._ITEMS_XPATH, "[", str(item_count), "]/div/a"]))
            if self.idProduct is None:
                continue

            # NOTE(review): splitting on the first "-i" truncates slugs that
            # themselves contain "-i" -- confirm against real product URLs.
            slug = self.idProduct.split("-i")[0].split('id/')
            self.namaProduct = slug[1].replace('-', ' ').replace(',', ' ')

            self.Selenium.Load(self.idProduct)
            self.checkCategory = self.Selenium.ExtractElementText(
                self._CATEGORY_XPATH)
            if self.checkCategory in self.arraySubCategory:
                print("category match")
                self._scrape_product_page(link)
            else:
                print("category not match")
            self.Selenium.BackPage()
            self.variasiFix = ""

    def _scrape_product_page(self, link):
        """Read all fields from the open product page, insert, then reset."""
        extract = self.Selenium.ExtractElementText

        # Price without discount: single value or range.
        (self.hargaTanpaDiskonFix, self.hargaAsliTDBawah,
         self.hargaAsliTDAtas) = self._parse_price(
             extract(self._PRICE_XPATH + "/div/div"))

        # Original price of a discounted product.
        (self.hargaAsliDiskon, self.hargaAsliDisRangeBawah,
         self.hargaAsliDisRangeAtas) = self._parse_price(
             extract(self._PRICE_XPATH + "/div[1]"))

        # Price after discount.
        (self.hargaDiskon, self.hargaDisRangeBawah,
         self.hargaDisRangeAtas) = self._parse_price(
             extract(self._PRICE_XPATH + "/div[2]/div[1]"))

        self._collect_variations()

        sold_text = extract(self._STATS_XPATH + "/div[3]/div[1]")
        self.productTerjual = 0 if sold_text is None else int(sold_text)
        self.productRating = extract(self._STATS_XPATH + "/div[1]/div[1]")
        review_text = extract(self._STATS_XPATH + "/div[2]/div[1]")
        self.productUlasan = 0 if review_text is None else int(review_text)

        self.db.insertProductVerse2(
            link, self.idProduct, self.checkCategory, self.namaProduct,
            self.productTerjual, self.productRating, self.productUlasan,
            self.variasiFix, self.hargaTanpaDiskonFix, self.hargaAsliTDAtas,
            self.hargaAsliTDBawah, self.hargaDiskon, self.hargaDisRangeAtas,
            self.hargaDisRangeBawah, self.hargaAsliDiskon,
            self.hargaAsliDisRangeAtas, self.hargaAsliDisRangeBawah)
        self._reset_product_fields()

    @staticmethod
    def _parse_price(text):
        """Parse "Rp1.000" or "Rp1.000 - Rp2.000" into (single, low, high).

        Exactly one of ``single`` or the ``(low, high)`` pair is filled; the
        rest is 0. ``None`` or text with more than one " - " gives all zeros.
        Raises ValueError when a part is not an integer once "Rp" and "."
        are removed.
        """
        if text is None:
            return 0, 0, 0
        parts = text.replace("Rp", '').replace(".", '').split(' - ')
        if len(parts) == 1:
            return int(parts[0]), 0, 0
        if len(parts) == 2:
            return 0, int(parts[0]), int(parts[1])
        return 0, 0, 0

    def _collect_variations(self):
        """Append the labels of the first non-empty variation group.

        Best effort: any failure leaves ``variasiFix`` with whatever was
        collected so far and clears ``variasi``.
        """
        try:
            self.jmlvariasi = len(
                self.Selenium.ExtractElements(
                    self._VARIATION_XPATH + "/div[2]" +
                    self._VARIATION_BUTTONS))
            self.jmlvariasi2 = len(
                self.Selenium.ExtractElements(
                    self._VARIATION_XPATH + "/div[3]" +
                    self._VARIATION_BUTTONS))
            if self.jmlvariasi != 0:
                group, count = "/div[2]", self.jmlvariasi
            elif self.jmlvariasi2 != 0:
                # Read from the group that was counted (div[3]).
                group, count = "/div[3]", self.jmlvariasi2
            else:
                self.variasi = ""
                return
            print(count)
            buttons = self._VARIATION_XPATH + group + self._VARIATION_BUTTONS
            for position in range(1, count + 1):
                label = self.Selenium.ExtractElementText(
                    buttons + "[" + str(position) + "]")
                # Commas are replaced in the stored label, as is done for
                # the product name.
                self.variasi = label.replace(',', ' ')
                self.variasiFix = self.variasiFix + "|" + self.variasi
        except Exception:
            self.variasi = ""

    def _reset_product_fields(self):
        """Restore per-product fields to their class-level defaults."""
        self.jumlahTerjual = 0
        self.hargaAsli = 0
        self.hargaAsliTDAtas = 0
        self.hargaAsliTDBawah = 0
        self.hargaTanpaDiskonFix = 0
        self.hargaAsliDisRangeAtas = 0
        self.hargaAsliDisRangeBawah = 0
        self.hargaDiskon = 0
        self.hargaRangeAtas = 0
        self.hargaRangeBawah = 0
        self.hargaDisRangeAtas = 0
        self.hargaDisRangeBawah = 0
        self.variasi = ""
        self.variasiFix = ""
        self.productTerjual = 0
        self.productRating = ""
        self.productUlasan = 0
Exemple #4
0
    def Crawling(self, Link_Merchant, Code_City):
        """Crawl the listing at ``Link_Merchant`` for one city and store new merchants.

        The listing is requested page by page with ``sortBy=sales``.  Each
        product on a page is opened to find its merchant link; a merchant for
        which ``MongoDB().checkMerchant`` returns ``None`` is scraped from its
        own page and written with ``MongoDB().insert_Shopee``.  After every
        listing page the page number is saved with ``MongoDB().updatePages``.

        Args:
            Link_Merchant: Base URL of the listing; the query string is
                appended to it.
            Code_City: Value of the ``locations`` query parameter; spaces in
                it are replaced with ``self.word_separator``.
        """
        # Every listing XPath shares this container prefix; the per-item
        # paths are built from it instead of being repeated per branch.
        listing_xpath = (
            "//*[@id='main']/div/div[2]/div[2]/div/div"
            "/div[@class='container _1EofO_']/div[2]/div/div")
        items_xpath = listing_xpath + "/div[2]/div"

        MongoDB().updateRunning(Link_Merchant)
        MongoDB().updateStatusCityRun(Link_Merchant, Code_City)
        while self.product_page == 0 or (
                self.product_page < self.page_ordinal
                and self.product_page < self.max_product_page):
            # Presumably the page number written by updatePages() below.
            self.product_page = MongoDB().lastPage(Link_Merchant)
            self.Selenium.link = (
                Link_Merchant + "?locations="
                + Code_City.replace(" ", self.word_separator)
                + "&page=" + str(self.product_page) + "&sortBy=sales")
            self.Selenium.Load(self.Selenium.link)
            time.sleep(4)
            # Page counter shown on the listing, minus one.
            self.page_ordinal = int(
                self.Selenium.ExtractElementText(
                    listing_xpath + "/div[1]/div[2]/div/span[2]")) - 1
            self.product_page += 1
            print("jumlah page ;" + str(self.page_ordinal))
            print('item :' + str(0))
            item_ordinal = self.Selenium.ExtractElements(items_xpath)
            print("jumlah item : " + str(len(item_ordinal)))
            for item_count in range(1, len(item_ordinal) + 1):
                if item_count > 50:
                    # No link is read past the 50th item; clear it so the
                    # previous item's product is not opened a second time.
                    self.Selenium.scrollUp(3)
                    product_link = None
                else:
                    # Items 1-10 are read without scrolling; items 11-20,
                    # 21-30, 31-40 and 41-50 scroll down by 3, 4, 5 and 6.
                    block_index = (item_count - 1) // 10
                    if block_index:
                        self.Selenium.scrollDown(block_index + 2)
                    item_xpath = (
                        items_xpath + "[" + str(item_count) + "]/div/a")
                    product_link = self.Selenium.ExtractElementAttribute(
                        'href', item_xpath)
                    self.Merchant_location = self.Selenium.ExtractElementText(
                        item_xpath + "/div/div[2]/div[5]")
                print('item :' + str(item_count))
                if product_link is None:
                    continue

                self.Selenium.Load(product_link)
                time.sleep(3)
                # The number of sections on the product page decides where
                # the merchant link sits.
                checkformat = len(
                    self.Selenium.ExtractElements(
                        "//*[@id='main']/div/div[2]/div[2]/div[2]/div[3]/div"
                    ))
                if checkformat == 3:
                    self.Merchant_html_path = self.Selenium.ExtractElementAttribute(
                        'href',
                        "//*[@id='main']/div/div[2]/div[2]/div[2]/div[3]/div[2]/div[1]/div/div[3]/a"
                    )
                elif checkformat == 2:
                    self.Merchant_html_path = self.Selenium.ExtractElementAttribute(
                        'href',
                        "//*[@id='main']/div/div[2]/div[2]/div[2]/div[3]/div[1]/div[1]/div/div[3]/a"
                    )
                else:
                    # Unknown layout: report the product and drop any path
                    # left over from an earlier item.
                    print(product_link)
                    self.Merchant_html_path = ''

                # Only visit the merchant page when a link was actually found
                # and the merchant is not stored yet.
                if (self.Merchant_html_path
                        and MongoDB().checkMerchant(
                            self.Merchant_html_path) is None):
                    self.Selenium.Load(self.Merchant_html_path)
                    time.sleep(3)
                    self.Merchant_name = self.Selenium.ExtractElementText(
                        "//*[@id='main']/div/div[2]/div[2]/div[2]/div/div[1]/div/div[1]/div[3]/div[1]/div/h1"
                    )
                    self.Merchant_rate = self.Selenium.ExtractElementText(
                        "//*[@id='main']/div/div[2]/div[2]/div[2]/div/div[1]/div/div[2]/div[6]/div[2]/div[2]"
                    )
                    self.Merchant_product = self.Selenium.ExtractElementText(
                        "//*[@id='main']/div/div[2]/div[2]/div[2]/div/div[1]/div/div[2]/div[1]/div[2]/div[2]"
                    )
                    self.Merchant_established = self.Selenium.ExtractElementText(
                        "//*[@id='main']/div/div[2]/div[2]/div[2]/div/div[1]/div/div[2]/div[7]/div[2]/div[2]"
                    )
                    self.Merchant_followers = self.Selenium.ExtractElementText(
                        "//*[@id='main']/div/div[2]/div[2]/div[2]/div/div[1]/div/div[2]/div[5]/div[2]/div[2]"
                    )
                    # Leave the merchant page (back to the product page).
                    self.Selenium.BackPage()

                    try:
                        MongoDB().insert_Shopee(self.Merchant_html_path,
                                                self.Merchant_name,
                                                self.Merchant_rate,
                                                self.Merchant_product,
                                                self.Merchant_established,
                                                self.Merchant_followers,
                                                self.Merchant_location)
                    except Exception:
                        # Best effort: a failed insert must not stop the crawl.
                        print("skip")
                    else:
                        # Clear the scraped fields once they are stored.
                        self.Merchant_html_path = ''
                        self.Merchant_name = ""
                        self.Merchant_rate = ""
                        self.Merchant_product = 0
                        self.Merchant_established = ""
                        self.Merchant_followers = 0
                        self.Merchant_location = ''
                # Leave the product page (back to the listing).
                self.Selenium.BackPage()
                time.sleep(2)
            MongoDB().updatePages(Link_Merchant, self.product_page)
            # NOTE(review): product_page is reset to 0 before the loop
            # condition is evaluated, so `self.product_page == 0` holds on
            # every check and the loop only ends when something raises.
            # Confirm whether that is intended before changing it.
            self.product_page = 0
            self.number_item = 0
            self.Date_Crawling = 0
            self.max_product_page = 100

        MongoDB().updateStatusCityDone(Link_Merchant, Code_City)
        MongoDB().updatePages(Link_Merchant)
        self.Selenium.clear_cache()
Exemple #5
0
 def main(self):
     """Crawl pending targets until ``MongoDB().countCategory()`` returns 0.

     Each pass fetches one link/city pair from
     ``MongoDB().getLinkToCrawling()``, crawls it, resets the per-run
     counters and marks the city and category through MongoDB.  When a pass
     raises, the error is printed and recorded with
     ``MongoDB().timeTrackError()`` and one retry is attempted; an error
     raised by the retry propagates to the caller.
     """
     Link_Merchant_count = MongoDB().countCategory()
     while Link_Merchant_count != 0:
         try:
             # Fetch the target once so the link and the city come from the
             # same record and the status updates below apply to the pair
             # that was actually crawled.
             target = MongoDB().getLinkToCrawling()
             link, city = target[0], target[1]
             self.Crawling(link, city)
             self.product_page = 0
             self.number_item = 0
             self.Date_Crawling = 0
             self.max_product_page = 100
             print(city)
             MongoDB().updateStatusCityDone(link, city)
             MongoDB().updatedStatusCategory(link)
         except Exception as e:
             print(e)
             MongoDB().timeTrackError()
             # Retry once with a freshly fetched target.
             retry_target = MongoDB().getLinkToCrawling()
             self.Crawling(retry_target[0], retry_target[1])
         Link_Merchant_count = MongoDB().countCategory()
     MongoDB().updateMerchantTimeEnd()
Exemple #6
0
def sample_job_every_1h():
    """Start ``shopeeProduct2.py`` in ``screen`` unless the status count is 5."""
    # NOTE(review): 5 presumably means nothing is left to crawl; confirm
    # against MongoDB.count_status().
    if MongoDB().count_status() == 5:
        return
    print("Repeat")
    launch_command = 'screen ./shopeeProduct2.py &'
    os.system(launch_command)
Exemple #7
0
def startpoint():
    """Start ``shopeeProduct2.py`` in ``screen`` unless the status count is 5."""
    # NOTE(review): 5 presumably means nothing is left to crawl; confirm
    # against MongoDB.count_status().
    if MongoDB().count_status() == 5:
        return
    print('a')
    launch_command = 'screen ./shopeeProduct2.py &'
    os.system(launch_command)
Exemple #8
0
        # Keep crawling until countCategory() reports nothing left.
        Link_Merchant_count = MongoDB().countCategory()
        while Link_Merchant_count != 0:
            try:
                # NOTE(review): link ([0]) and city ([1]) come from two
                # separate getLinkToCrawling() calls, and the calls after the
                # crawl query again; confirm they always return the same
                # record.
                self.Crawling(MongoDB().getLinkToCrawling()[0],
                              MongoDB().getLinkToCrawling()[1])
                # Reset the per-run counters after a completed crawl.
                self.product_page = 0
                self.number_item = 0
                self.Date_Crawling = 0
                self.max_product_page = 100
                print(MongoDB().getLinkToCrawling()[1])
                MongoDB().updateStatusCityDone(
                    MongoDB().getLinkToCrawling()[0],
                    MongoDB().getLinkToCrawling()[1])
                MongoDB().updatedStatusCategory(
                    MongoDB().getLinkToCrawling()[0])
            except Exception as e:
                # Print and record the error, then retry once; an error
                # raised by the retry is not caught here.
                print(e)
                MongoDB().timeTrackError(e)
                self.Crawling(MongoDB().getLinkToCrawling()[0],
                              MongoDB().getLinkToCrawling()[1])
            Link_Merchant_count = MongoDB().countCategory()
        MongoDB().updateMerchantTimeEnd()


# Script entry: run the crawler, and on failure record the error and retry
# once.  The instance is kept so the failure report prints the page that run
# had reached rather than the value of a brand-new Run().
runner = Run()
try:
    runner.main()
except Exception as e:
    print(runner.product_page)
    MongoDB().timeTrackError(e)
    # The retry uses a fresh instance; a second failure is not caught.
    Run().main()