Ejemplo n.º 1
0
class SQLite3Db:
    """Small wrapper around a sqlite3 connection to a packaged ``local.db`` file.

    The connection is opened in the constructor and closed when the object is
    garbage collected. Rows are returned as ``sqlite3.Row`` objects.
    """

    # Constructor
    def __init__(self, strResFolderPath=None):
        """Open ``local.db`` found in the resource package ``strResFolderPath``.

        Args:
            strResFolderPath: passed as ``strPackageName`` to
                ``FileSystemUtility.getPackageResourcePath``.
        """
        # Defined first so __del__ is safe even if anything below raises.
        self.conn = None
        logging.basicConfig(level=logging.INFO)
        self.fsUtil = FileSystemUtility()
        strDbPath = self.fsUtil.getPackageResourcePath(strPackageName=strResFolderPath, strResourceName="local.db")
        logging.info("connect to sqlite3 db.")
        self.conn = sqlite3.connect(strDbPath)  # open the connection
        self.conn.row_factory = sqlite3.Row  # wrap each row in a sqlite3.Row

    # Destructor
    def __del__(self):
        """Close the connection if one was opened; safe to call more than once."""
        conn = getattr(self, "conn", None)
        if conn is None:
            return
        logging.info("close sqlite3 db connection.")
        conn.close()  # close the database connection
        self.conn = None

    # Execute SQL and commit (for INSERT, UPDATE, DELETE)
    def commitSQL(self, strSQL=None, lstParams=None):
        """Execute one statement, commit, and return the cursor's ``lastrowid``.

        Args:
            strSQL: the SQL statement, optionally containing ``?`` placeholders.
            lstParams: optional sequence of values bound to the placeholders.
                Prefer this over building SQL by string concatenation.

        Returns:
            The row id of the last inserted row as reported by the cursor.
        """
        c = self.conn.cursor()
        c.execute(strSQL, () if lstParams is None else lstParams)
        self.conn.commit()
        return c.lastrowid

    # Execute SQL and fetch all rows (for SELECT)
    def fetchallSQL(self, strSQL=None, lstParams=None):
        """Execute a query and return every resulting row as a list.

        Args:
            strSQL: the SQL query, optionally containing ``?`` placeholders.
            lstParams: optional sequence of values bound to the placeholders.
        """
        c = self.conn.cursor()
        c.execute(strSQL, () if lstParams is None else lstParams)
        return c.fetchall()
Ejemplo n.º 2
0
class FileSystemUtilityTest(unittest.TestCase):
    """Unit tests for FileSystemUtility."""

    # Preparation
    def setUp(self):
        """Configure logging and create the utility under test."""
        logging.basicConfig(level=logging.INFO)
        self.fsUtil = FileSystemUtility()

    # Cleanup
    def tearDown(self):
        """Nothing to release after a test."""
        return None

    # Test: resolve the file-system path of a package resource
    def test_getPackageResourcePath(self):
        """The path resolved for a packaged resource must exist on disk."""
        logging.info("FileSystemUtilityTest.test_getPackageResourcePath")
        strIconPath = self.fsUtil.getPackageResourcePath(
            strPackageName="bennu_res",
            strResourceName="icon.ico",
        )
        isIconPresent = os.path.exists(strIconPath)
        self.assertTrue(isIconPresent)
Ejemplo n.º 3
0
class SeleniumTest(unittest.TestCase):
    """Smoke test: drive Chrome through the packaged chromedriver executable."""

    # Preparation
    def setUp(self):
        """Start a Chrome driver using the chromedriver found in the resource package."""
        logging.basicConfig(level=logging.INFO)
        self.fileUtil = FileSystemUtility()
        strChromeDriverPath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource", strResourceName="chromedriver.exe")
        self.driver = webdriver.Chrome(strChromeDriverPath)

    # Cleanup
    def tearDown(self):
        """Shut the browser down after each test."""
        self.driver.quit()

    # Test selenium
    def test_selenium(self):
        """Load a page and check that the driver returned some page source."""
        logging.info("SeleniumTest.test_selenium")
        self.driver.get("https://www.kkday.com/en/home")
        strPageSource = self.driver.page_source
        # Without an assertion the test could only fail by raising.
        self.assertTrue(strPageSource)
class ImporterForCITYDISCOVERY:
    """Loads parsed City-Discovery ``*_product.json`` files into the database."""

    # Constructor
    def __init__(self):
        self.ffUtil = FfUtility()
        self.filesysUtil = FilesysUtility()
        self.db = LocalDbForJsonImporter()
        #self.db = ExternalDbForJsonImporter()
        # Sub-command name -> list of handlers executed in order
        self.dicSubCommandHandler = {"import": [self.importProductJsonToDb]}

    # Usage information for this importer
    def getUseageMessage(self):
        """Return the help text describing the supported sub-commands."""
        tupStrLine = (
            "- CITY-DISCOVERY -\n",
            "useage:\n",
            "import - import product.json to database \n",
        )
        return "".join(tupStrLine)

    # Run the importer
    def runImporter(self, lstSubcommand=None):
        """Dispatch ``lstSubcommand`` to its handlers.

        ``lstSubcommand[0]`` selects the handlers; when the list holds exactly
        two items the second one is passed to each handler, otherwise ``None``.
        """
        strSubcommand = lstSubcommand[0]
        strArg1 = lstSubcommand[1] if len(lstSubcommand) == 2 else None
        lstHandler = self.dicSubCommandHandler[strSubcommand]
        for fnHandler in lstHandler:
            fnHandler(strArg1)

    # Import product.json into the database
    def importProductJsonToDb(self, uselessArg1=None):
        """Mark City-Discovery trips out-of-date, then upsert every parsed product.

        A failing upsert is logged as a warning and the remaining records are
        still processed.
        """
        # Flag the existing City-Discovery trip data as out-of-date
        self.db.setTripDataStatusAsOutOfDate(strSource="City-Discovery")
        # Locate the json files
        strBasedir = self.filesysUtil.getPackageResourcePath(
            strPackageName="findfine_crawler.resource.parsed_json",
            strResourceName="city_discovery",
        )
        lstStrProductJsonFilePath = self.ffUtil.getFilePathListWithSuffixes(
            strBasedir=strBasedir,
            strSuffixes="_product.json",
        )
        for strJsonPath in lstStrProductJsonFilePath:
            logging.info("read %s", strJsonPath)
            for dicTrip in self.ffUtil.readObjectFromJsonFile(strJsonFilePath=strJsonPath):
                try:
                    # UPSERT
                    self.db.upsertTrip(dicTripData=dicTrip)
                except Exception as e:
                    logging.warning("insert trip failed: %s", str(e))
                    
Ejemplo n.º 5
0
class ImporterForExRate:
    """Loads parsed exchange-rate ``exrate/*.json`` files into the database."""

    # Constructor
    def __init__(self):
        self.ffUtil = FfUtility()
        self.filesysUtil = FilesysUtility()
        self.db = LocalDbForJsonImporter()
        #self.db = ExternalDbForJsonImporter()
        # Sub-command name -> list of handlers executed in order
        self.dicSubCommandHandler = {"import": [self.importYahooCurrencyJsonToDb]}

    # Usage information for this importer
    def getUseageMessage(self):
        """Return the help text describing the supported sub-commands."""
        tupStrLine = (
            "- ExRate -\n",
            "useage:\n",
            "import - import exrate/*.json to database \n",
        )
        return "".join(tupStrLine)

    # Run the importer
    def runImporter(self, lstSubcommand=None):
        """Dispatch ``lstSubcommand`` to its handlers.

        ``lstSubcommand[0]`` selects the handlers; when the list holds exactly
        two items the second one is passed to each handler, otherwise ``None``.
        """
        strSubcommand = lstSubcommand[0]
        strArg1 = lstSubcommand[1] if len(lstSubcommand) == 2 else None
        lstHandler = self.dicSubCommandHandler[strSubcommand]
        for fnHandler in lstHandler:
            fnHandler(strArg1)

    # Import exrate/*.json into the database
    def importYahooCurrencyJsonToDb(self, uselessArg1=None):
        """Clear the stored exchange rates, then upsert every parsed record.

        A failing upsert is logged as a warning and the remaining records are
        still processed.
        """
        # Remove the existing exchange-rate data
        self.db.clearExRateData()
        # Locate the json files
        strBasedir = self.filesysUtil.getPackageResourcePath(
            strPackageName="findfine_crawler.resource.parsed_json",
            strResourceName="exrate",
        )
        lstStrExRateJsonFilePath = self.ffUtil.getFilePathListWithSuffixes(
            strBasedir=strBasedir,
            strSuffixes=".json",
        )
        for strJsonPath in lstStrExRateJsonFilePath:
            logging.info("read %s", strJsonPath)
            for dicRate in self.ffUtil.readObjectFromJsonFile(strJsonFilePath=strJsonPath):
                try:
                    # UPDATE or INSERT
                    self.db.upsertExRate(dicExRateData=dicRate)
                except Exception as e:
                    logging.warning("upsert exrate failed: %s", str(e))
Ejemplo n.º 6
0
class CrawlerForKKDAY:
    """Selenium crawler for KKDAY.

    Sub-commands: ``index`` stores country URLs, ``country`` stores the product
    URLs of each country, ``product`` parses product pages. Parsed products are
    buffered in ``lstDicParsedProductJson`` and written to ``<n>_product.json``
    files in batches of 100.
    """

    # Constructor
    def __init__(self):
        self.dicSubCommandHandler = {
            "index":self.crawlIndexPage,
            "country":self.crawlCountryPage,
            "product":self.crawlProductPage
        }
        self.ffUtil = FfUtility()
        self.fileUtil = FilesysUtility()
        self.db = LocalDbForKKDAY()
        self.lstDicParsedProductJson = []  # buffered product.json records
        self.intProductJsonIndex = 1
        self.driver = None

    # Usage information for this spider
    def getUseageMessage(self):
        """Return the help text describing the supported sub-commands."""
        return (
            "- KKDAY -\n"
            "useage:\n"
            "index - crawl index page of KKDAY \n"
            "country - crawl not obtained country page \n"
            "product [country_page_1_url] - crawl not obtained product page [of given country_page_1_url] \n"
        )

    # Create a selenium driver object
    def getDriver(self):
        """Return a new Chrome driver using the packaged ``chromedriver.exe``."""
        chromeDriverExeFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource", strResourceName="chromedriver.exe")
        driver = webdriver.Chrome(chromeDriverExeFilePath)
        return driver

    # Initialize the selenium driver object
    def initDriver(self):
        """Create the driver unless one already exists."""
        if not self.driver:
            self.driver = self.getDriver()

    # Terminate the selenium driver object
    def quitDriver(self):
        """Quit the driver if there is one; the reference is always cleared."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        finally:
            # Drop the reference even when quit() fails so initDriver() can
            # create a fresh driver afterwards.
            self.driver = None

    # Restart the selenium driver object
    def restartDriver(self, intWaitSeconds=5):
        """Quit the current driver, wait ``intWaitSeconds`` and start a new one.

        A failure while quitting (e.g. the browser already crashed) is logged
        and does not prevent the new driver from being started.
        """
        try:
            self.quitDriver()
        except Exception as e:
            logging.warning("quit selenium driver failed: %s", e)
        time.sleep(intWaitSeconds)
        self.initDriver()

    # Run the crawler
    def runCrawler(self, lstSubcommand=None):
        """Run the handler selected by ``lstSubcommand[0]``.

        When the list holds exactly two items the second is passed to the
        handler, otherwise ``None``. The driver is quit even if the handler
        raises.
        """
        strSubcommand = lstSubcommand[0]
        strArg1 = None
        if len(lstSubcommand) == 2:
            strArg1 = lstSubcommand[1]
        self.initDriver()  # init selenium driver
        try:
            self.dicSubCommandHandler[strSubcommand](strArg1)
        finally:
            self.quitDriver()  # quit selenium driver

    # Crawl the index page
    def crawlIndexPage(self, uselessArg1=None):
        """Click through every area on the index page and store its country URLs."""
        logging.info("crawl index page")
        strAreaASelector = "#area_country_menu ul.slideTogglePage[role=area] li a"
        strCountryASelector = "#area_country_menu ul.slideTogglePage[role=country] li a"
        # KKDAY index page
        self.driver.get("https://www.kkday.com/en/home")
        # Click the search button
        self.driver.find_element_by_css_selector("#header-main-keywordSearch-button").click()
        time.sleep(5)
        # Click each area in turn
        lstEleAreaA = self.driver.find_elements_by_css_selector(strAreaASelector)
        for indexOfLstEleAreaA in range(len(lstEleAreaA)):
            lstEleAreaA[indexOfLstEleAreaA].click()
            time.sleep(5)
            # Collect the country links
            for eleCountryA in self.driver.find_elements_by_css_selector(strCountryASelector):
                strCountryHref = eleCountryA.get_attribute("href")
                # Store the country link in the local db
                self.db.insertCountryIfNotExists(strCountryPage1Url=strCountryHref)
                logging.info("save country url: %s", strCountryHref)
            self.driver.find_element_by_css_selector("#previousBtn").click()
            time.sleep(5)
            # Re-query the area links after going back. The same selector as
            # the initial query is used so the index keeps pointing at area
            # links only (NOTE(review): confirm against the live page).
            lstEleAreaA = self.driver.find_elements_by_css_selector(strAreaASelector)

    # Parse a country page
    def parseCountryPage(self, strCountryPage1Url=None):
        """Store every product link found on the currently loaded country page."""
        # Find the product links
        elesProductA = self.driver.find_elements_by_css_selector("article.product-listview div.product-info-container div div a")
        for eleProductA in elesProductA:
            strProductUrl = eleProductA.get_attribute("href")
            # Store the product link in the local db
            logging.info("insert product url: %s", strProductUrl)
            self.db.insertProductUrlIfNotExists(strProductUrl=strProductUrl, strCountryPage1Url=strCountryPage1Url)

    # Check whether the country listing has a next page
    def checkNextCountryPageExist(self):
        """Return True when the last pagination link reads "»".

        A page without that pagination link has no next page and returns False
        instead of raising.
        """
        elesLastPageA = self.driver.find_elements_by_css_selector("ul.pagination li.a-page:last-child a.toPage")
        if not elesLastPageA:
            return False
        return elesLastPageA[0].text == "»"

    # Crawl the country pages
    def crawlCountryPage(self, uselessArg1=None):
        """Crawl every page of each not-yet-obtained country and store product URLs.

        A country is marked as obtained only when all its pages were crawled
        without error. The driver is restarted after every country.
        """
        logging.info("crawl country page")
        # Country urls in the db that have not been downloaded yet
        lstStrNotObtainedCountryPage1Url = self.db.fetchallNotObtainedCountryUrl()
        for strNotObtainedCountryPage1Url in lstStrNotObtainedCountryPage1Url:
            # Extract the country name (used for logging only); fall back to
            # the url itself when it does not have the expected shape.
            mCountryName = re.match("^https://www.kkday.com/en/product/productlist/.*countryname=(.*)$", strNotObtainedCountryPage1Url)
            strCountryName = mCountryName.group(1) if mCountryName else strNotObtainedCountryPage1Url
            try:
                intCountryPageNum = 0
                isNextCountryPageExist = True
                while isNextCountryPageExist:
                    intCountryPageNum = intCountryPageNum+1
                    # sleep random time (shorter before the first page)
                    if intCountryPageNum == 1:
                        time.sleep(random.randint(2,5))
                    else:
                        time.sleep(random.randint(5,8))
                    strCountryUrlPageSuffix = "&sort=hdesc&page=%d"%intCountryPageNum
                    self.driver.get(strNotObtainedCountryPage1Url + strCountryUrlPageSuffix)
                    # Store the product links of this page
                    self.parseCountryPage(strCountryPage1Url=strNotObtainedCountryPage1Url)
                    # Is there another page?
                    isNextCountryPageExist = self.checkNextCountryPageExist()
                # Mark the country as obtained in the db (isGot = 1)
                self.db.updateCountryStatusIsGot(strCountryPage1Url=strNotObtainedCountryPage1Url)
                logging.info("got country %s find %d pages"%(strCountryName, intCountryPageNum))
            except Exception as e:
                logging.warning(str(e))
                logging.warning("selenium driver crashed. skip get country: %s"%strCountryName)
            finally:
                self.restartDriver()  # restart

    # Convert the duration text of a product page to whole hours
    def _parseDurationHour(self, strDurationText):
        """Return the duration in hours; 0 when the text cannot be interpreted.

        The digits and dots of the text form the number; it is read as hours
        when the text contains "hour" and as days (x24) when it contains "day".
        """
        strNumber = re.sub(r"[^\d\.]", "", strDurationText)
        try:
            fNumber = float(strNumber)
        except ValueError:
            return 0
        if "hour" in strDurationText:
            return int(fNumber)
        if "day" in strDurationText:
            # Multiply before truncating so "1.5 days" gives 36, not 24.
            return int(fNumber*24)
        return 0

    # Parse a product page
    def parseProductPage(self, strProductUrl=None):
        """Parse the currently loaded product page and buffer the resulting record.

        Raises whatever the driver or the conversions raise when an expected
        element is missing or malformed; the caller treats that as a failure
        of this product.
        """
        dicProductJson = {}
        # strSource
        dicProductJson["strSource"] = "KKDAY"
        # strOriginUrl
        dicProductJson["strOriginUrl"] = strProductUrl
        # strUpdateStatus
        dicProductJson["strUpdateStatus"] = "up-to-date"
        # strUpdateTime
        dicProductJson["strUpdateTime"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # strImageUrl: taken from the background-image url of the header div
        strImageDivStyle = self.driver.find_element_by_css_selector("div#header-imageview div.productPage-photos div.img-bg-full").get_attribute("style")
        strImageDivStyle = re.sub(r'[:;"\s\(\)]', "", strImageDivStyle).strip()
        strImageUrl = re.match(r"^background-imageurl//(img\.kkday\.com/image/.*)$", strImageDivStyle).group(1)
        dicProductJson["strImageUrl"] = "http://" + strImageUrl.strip()
        # strTitle
        strTitle = self.driver.find_element_by_css_selector("div.productview div.container div.productPage-detail h1").text
        dicProductJson["strTitle"] = strTitle.strip()
        # strLocation
        strLocation = self.driver.find_element_by_css_selector("div.productview div.container div.productPage-detail div.col-md-pull-4 span.h5").text
        strLocation = re.sub("The location:", "", strLocation)
        dicProductJson["strLocation"] = strLocation.strip()
        # intUsdCost: the page price is treated as TWD and converted to USD
        strTwdCostText = self.driver.find_element_by_css_selector("div.lowestPrice div.text-right h2.h1").text
        strTwdCostText = re.sub(r"[^\d]", "", strTwdCostText.strip())
        fUsdTwdExrate = self.ffUtil.getUsdExrate(strCurrency="TWD")
        dicProductJson["intUsdCost"] = int(int(strTwdCostText)/fUsdTwdExrate)
        # intReviewStar: number of highlighted star icons
        elesStarI = self.driver.find_elements_by_css_selector("div.div-star span.h5 i.fa-star.text-primary")
        dicProductJson["intReviewStar"] = len(elesStarI)
        # intReviewVisitor
        intReviewVisitor = 0
        elesReviewVisitorSpan = self.driver.find_elements_by_css_selector("div.div-star span.h5 span.text-primary")
        if len(elesReviewVisitorSpan) > 0:
            strReviewVisitorText = elesReviewVisitorSpan[0].text
            intReviewVisitor = int(strReviewVisitorText.strip())
        dicProductJson["intReviewVisitor"] = intReviewVisitor
        # strIntroduction
        strIntroduction = self.driver.find_element_by_css_selector("div.prod-intro span").text
        dicProductJson["strIntroduction"] = strIntroduction.strip()
        # intDurationHour
        strDurationText = self.driver.find_element_by_css_selector("div.productview div.container div.productPage-detail div.col-md-12 span.h5").text
        dicProductJson["intDurationHour"] = self._parseDurationHour(strDurationText)
        # strGuideLanguage
        elesGuideLanguageImg = self.driver.find_elements_by_css_selector("div.productview div.container div.productPage-detail div.guide_lang_image img")
        lstStrGuideLanguage = [
            eleGuideLanguageImg.get_attribute("data-original-title").strip()
            for eleGuideLanguageImg in elesGuideLanguageImg
        ]
        dicProductJson["strGuideLanguage"] = ",".join(lstStrGuideLanguage)
        # intOption (to be confirmed)
        dicProductJson["intOption"] = None
        # strStyle (KKDAY has no such data)
        dicProductJson["strStyle"] = None
        self.lstDicParsedProductJson.append(dicProductJson)

    # Write the buffered products to "<index*100>_product.json" and empty the buffer
    def _writeParsedProductJson(self):
        strJsonFileName = "%d_product.json"%(self.intProductJsonIndex*100)
        strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.parsed_json.kkday", strResourceName=strJsonFileName)
        self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
        self.lstDicParsedProductJson = []

    # Crawl product pages (strCountryPage1Url == None: use every completed country)
    def crawlProductPage(self, strCountryPage1Url=None):
        """Crawl the product pages of one country, or of all completed countries."""
        # Drop anything left in memory from a previous run
        self.lstDicParsedProductJson = []
        self.intProductJsonIndex = 1
        if not strCountryPage1Url:
            # No country given
            lstStrObtainedCountryUrl = self.db.fetchallCompletedObtainedCountryUrl()
            for strObtainedCountryUrl in lstStrObtainedCountryUrl:
                self.crawlProductPageWithGivenCountryUrl(strCountryPage1Url=strObtainedCountryUrl)
        else:
            # A country url was given
            self.crawlProductPageWithGivenCountryUrl(strCountryPage1Url=strCountryPage1Url)
        # Write whatever is left in the buffer
        if len(self.lstDicParsedProductJson) > 0:
            self._writeParsedProductJson()

    # Crawl product pages of the given country url
    def crawlProductPageWithGivenCountryUrl(self, strCountryPage1Url=None):
        """Parse every not-yet-obtained product of a country; flush every 100 records.

        A product whose page fails to load or parse is skipped and the driver
        is restarted.
        """
        logging.info("crawl product page with country %s"%strCountryPage1Url)
        # Product urls recorded in the db for this country
        lstStrProductUrl = self.db.fetchallProductUrlByCountryUrl(strCountryPage1Url=strCountryPage1Url)
        for strProductUrl in lstStrProductUrl:
            # Skip products that were already downloaded
            if not self.db.checkProductIsGot(strProductUrl=strProductUrl):
                time.sleep(random.randint(5,8))  # sleep random time
                try:
                    self.driver.get(strProductUrl)
                    # Parse the product page
                    self.parseProductPage(strProductUrl=strProductUrl)
                    # Marking the product as obtained (isGot = 1) is disabled:
                    #self.db.updateProductStatusIsGot(strProductUrl=strProductUrl)
                except Exception as e:
                    logging.warning(str(e))
                    logging.warning("selenium driver crashed. skip get product: %s"%strProductUrl)
                    self.restartDriver()  # restart
            # Show progress
            logging.info("進度: %d/100"%len(self.lstDicParsedProductJson))
            # Write a full batch to json
            if len(self.lstDicParsedProductJson) >= 100:
                self._writeParsedProductJson()
                self.intProductJsonIndex = self.intProductJsonIndex+1
Ejemplo n.º 7
0
class CrawlerForExRate:
    """Selenium crawler that scrapes USD exchange rates from the Yahoo currency page.

    The result is written to ``yahoo_currency.json`` as a list of records with
    the keys ``strCurrencyName``, ``fUSDollar`` and ``strUpdateTime``.
    """

    # Constructor
    def __init__(self):
        self.dicSubCommandHandler = {
            "yahoo":self.crawlYahooCurrencyPage
        }
        self.ffUtil = FfUtility()
        self.fileUtil = FilesysUtility()
        self.lstDicParsedCurrencyJson = []  # buffered currency.json records
        self.driver = None

    # Usage information for this spider
    def getUseageMessage(self):
        """Return the help text describing the supported sub-commands."""
        return (
            "- ExRate -\n"
            "useage:\n"
            "yahoo - crawl yahoo currency page \n"
        )

    # Create a selenium driver object
    def getDriver(self):
        """Return a new Chrome driver using the packaged ``chromedriver.exe``."""
        chromeDriverExeFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource", strResourceName="chromedriver.exe")
        driver = webdriver.Chrome(chromeDriverExeFilePath)
        return driver

    # Initialize the selenium driver object
    def initDriver(self):
        """Create the driver unless one already exists."""
        if not self.driver:
            self.driver = self.getDriver()

    # Terminate the selenium driver object
    def quitDriver(self):
        """Quit the driver if there is one; the reference is always cleared."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        finally:
            # Drop the reference even when quit() fails so initDriver() can
            # create a fresh driver afterwards.
            self.driver = None

    # Restart the selenium driver object
    def restartDriver(self, intWaitSeconds=5):
        """Quit the current driver, wait ``intWaitSeconds`` and start a new one.

        A failure while quitting is logged and does not prevent the restart.
        """
        try:
            self.quitDriver()
        except Exception as e:
            logging.warning("quit selenium driver failed: %s", e)
        time.sleep(intWaitSeconds)
        self.initDriver()

    # Run the crawler
    def runCrawler(self, lstSubcommand=None):
        """Run the handler selected by ``lstSubcommand[0]``.

        When the list holds exactly two items the second is passed to the
        handler, otherwise ``None``. The driver is quit even if the handler
        raises.
        """
        strSubcommand = lstSubcommand[0]
        strArg1 = None
        if len(lstSubcommand) == 2:
            strArg1 = lstSubcommand[1]
        self.initDriver()  # init selenium driver
        try:
            self.dicSubCommandHandler[strSubcommand](strArg1)
        finally:
            self.quitDriver()  # quit selenium driver

    # Crawl the yahoo currency page
    def crawlYahooCurrencyPage(self, uselessArg1=None):
        """Scrape every area tab of the currency page and write the rates to json.

        Rows whose link or rate text cannot be parsed are logged and skipped so
        one bad row does not discard the whole crawl.
        """
        # Drop anything left in memory from a previous run
        self.lstDicParsedCurrencyJson = []
        # Load the page
        self.driver.get("https://tw.money.yahoo.com/currency")
        # USD to USD is always 1.0. Added once, up front, rather than once per
        # area tab, so the output holds a single USD record.
        self.lstDicParsedCurrencyJson.append({
            "strCurrencyName": "USD",
            "fUSDollar": 1.0,
            "strUpdateTime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        })
        # Area tabs (Asia, Americas, Europe/Africa)
        elesAreaTabLi = self.driver.find_elements_by_css_selector("ul.sub-tabs.D-ib li")
        for intAreaTabIndex in range(len(elesAreaTabLi)):
            time.sleep(random.randint(5,10))
            self.driver.find_element_by_css_selector("ul.sub-tabs.D-ib li:nth-of-type(%s)"%str(intAreaTabIndex+1)).click()
            time.sleep(random.randint(5,10))
            # Parse the "1 USD to other currency" rows
            elesExRateTr = self.driver.find_elements_by_css_selector("tbody tr.Bd-b")
            for eleExRateTr in elesExRateTr:
                dicExRateData = {}
                # strCurrencyName: the three characters following "USD" in the link
                strExRateHref = eleExRateTr.find_element_by_css_selector("td.Ta-start a").get_attribute("href")
                mCurrencyName = re.match("https://tw.money.yahoo.com/currency/USD(...)=X", strExRateHref or "")
                if not mCurrencyName:
                    logging.warning("skip ex-rate row with unexpected link: %s", strExRateHref)
                    continue
                dicExRateData["strCurrencyName"] = mCurrencyName.group(1)
                # fUSDollar
                strUSDollar = eleExRateTr.find_element_by_css_selector("td.Ta-end:nth-of-type(3)").text
                try:
                    # Thousands separators would make float() fail
                    dicExRateData["fUSDollar"] = float(strUSDollar.replace(",", ""))
                except ValueError:
                    logging.warning("skip %s ex-rate with unparsable value: %s", dicExRateData["strCurrencyName"], strUSDollar)
                    continue
                # dtUpdateTime (strUpdateTime in json)
                strUpdateTime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                dicExRateData["strUpdateTime"] = strUpdateTime
                logging.info("find %s ex-rate: %f USD"%(dicExRateData["strCurrencyName"], dicExRateData["fUSDollar"]))
                self.lstDicParsedCurrencyJson.append(dicExRateData)
        # Write the data to json
        strJsonFileName = "yahoo_currency.json"
        strExRateJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.parsed_json.exrate", strResourceName=strJsonFileName)
        self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedCurrencyJson, strJsonFilePath=strExRateJsonFilePath)
        self.lstDicParsedCurrencyJson = []
Ejemplo n.º 8
0
class Utility:
    """File, JSON and exchange-rate helpers shared by the crawlers and importers."""

    # Constructor
    def __init__(self):
        self.filesysUtil = FileSystemUtility()

    # Save a file
    def overwriteSaveAs(self, strFilePath=None, unicodeData=None):
        """Write ``unicodeData`` to ``strFilePath`` as UTF-8, replacing any old content."""
        with open(strFilePath, "w", encoding="utf-8") as file:
            file.write(unicodeData)

    # Read a json file and return the decoded object
    def readObjectFromJsonFile(self, strJsonFilePath=None):
        """Return the object decoded from the UTF-8 json file ``strJsonFilePath``."""
        with open(strJsonFilePath, "r", encoding="utf-8") as jsonFile:
            # The file object already decodes UTF-8. json.load() no longer
            # accepts an ``encoding`` argument (removed in Python 3.9).
            return json.load(jsonFile)

    # Write an object to a json file
    def writeObjectToJsonFile(self, dicData=None, strJsonFilePath=None):
        """Serialize ``dicData`` to ``strJsonFilePath`` (UTF-8, indented, sorted keys)."""
        with open(strJsonFilePath, "w", encoding="utf-8") as jsonFile:
            jsonFile.write(json.dumps(dicData, ensure_ascii=False, indent=4, sort_keys=True))

    # Paths of the immediate sub-folders
    def getSubFolderPathList(self, strBasedir=None):
        """Return the paths of the directories directly inside ``strBasedir``.

        Returns an empty list when ``strBasedir`` cannot be walked.
        """
        # Only the first os.walk() entry describes strBasedir itself, so stop
        # there instead of walking the whole tree.
        for base, dirs, _files in os.walk(strBasedir):
            return [os.path.join(base, strDirname) for strDirname in dirs]
        return []

    # Paths of the files in strBasedir whose name ends with strSuffixes
    def getFilePathListWithSuffixes(self, strBasedir=None, strSuffixes=None):
        """Return the files directly inside ``strBasedir`` whose name ends with ``strSuffixes``.

        ``strSuffixes`` is handed to ``str.endswith``. Sub-directories are not
        searched. Returns an empty list when ``strBasedir`` cannot be walked.
        """
        # Only the first os.walk() entry describes strBasedir itself.
        for base, _dirs, files in os.walk(strBasedir):
            return [
                os.path.join(base, strFilename)
                for strFilename in files
                if strFilename.endswith(strSuffixes)
            ]
        return []

    # Same as above, searching sub-directories as well
    def recursiveGetFilePathListWithSuffixes(self, strBasedir=None, strSuffixes=None):
        """Return every file under ``strBasedir``, at any depth, whose name ends with ``strSuffixes``."""
        lstStrFilePathWithSuffixes = []
        for base, _dirs, files in os.walk(strBasedir):
            lstStrFilePathWithSuffixes.extend(
                os.path.join(base, strFilename)
                for strFilename in files
                if strFilename.endswith(strSuffixes)
            )
        return lstStrFilePathWithSuffixes

    # Creation date of a file
    def getCtimeOfFile(self, strFilePath=None):
        """Return ``os.path.getctime(strFilePath)`` formatted as ``YYYY-MM-DD`` (local time)."""
        fCTimeStamp = os.path.getctime(strFilePath)
        dtCTime = datetime.datetime.fromtimestamp(fCTimeStamp)
        return dtCTime.strftime("%Y-%m-%d")

    # Exchange rate of 1 USD to the given currency
    def getUsdExrate(self, strCurrency=None):
        """Return the ``fUSDollar`` value stored for ``strCurrency`` in ``yahoo_currency.json``.

        Returns 0.0 when the currency is not listed, so callers that divide by
        the result must handle that case.
        """
        strJsonPackageName = "findfine_crawler.resource.parsed_json.exrate"
        strJsonFileName = "yahoo_currency.json"
        strExRateJsonFilePath = self.filesysUtil.getPackageResourcePath(strPackageName=strJsonPackageName, strResourceName=strJsonFileName)
        lstDicExRateData = self.readObjectFromJsonFile(strJsonFilePath=strExRateJsonFilePath)
        for dicExRateData in lstDicExRateData:
            if strCurrency == dicExRateData.get("strCurrencyName", None):
                return dicExRateData.get("fUSDollar", 0.0)
        return 0.0
        
Ejemplo n.º 9
0
class CrawlerForVIATOR:
    """Turns the Viator partner product feed (``vapProducts.xml``) into product json.

    Sub-commands: ``download`` fetches the zipped feed, ``unzip`` extracts it,
    ``json`` converts it to ``<n>_viator_product.json`` files of up to 5000
    records each.
    """

    # Constructor
    def __init__(self):
        self.dicSubCommandHandler = {
            "download":self.downloadVapProductsXmlZip,
            "unzip":self.unzipVapProductsXmlZip,
            "json":self.crawlVapProductsXml
        }
        self.ffUtil = FfUtility()
        self.fileUtil = FilesysUtility()
        self.lstDicParsedProductJson = []  # buffered product.json records
        self.intProductJsonIndex = 1

    # Usage information for this spider
    def getUseageMessage(self):
        """Return the help text describing the supported sub-commands."""
        return (
            "- VIATOR -\n"
            "useage:\n"
            "download - download vapProducts.xml.zip \n"
            "unzip - unzip vapProducts.xml.zip \n"
            "json - crawl vapProducts.xml then create json \n"
        )

    # Run the crawler
    def runCrawler(self, lstSubcommand=None):
        """Run the handler selected by ``lstSubcommand[0]``.

        When the list holds exactly two items the second is passed to the
        handler, otherwise ``None``.
        """
        strSubcommand = lstSubcommand[0]
        strArg1 = None
        if len(lstSubcommand) == 2:
            strArg1 = lstSubcommand[1]
        self.dicSubCommandHandler[strSubcommand](strArg1)

    # Convert one <Product> node to the findfine record format
    def _convertProductSoupToDic(self, soupProduct):
        dicProductJson = {}
        # strSource
        dicProductJson["strSource"] = "Viator"
        # strOriginUrl
        dicProductJson["strOriginUrl"] = soupProduct.ProductURLs.ProductURL.string
        # strUpdateStatus
        dicProductJson["strUpdateStatus"] = "up-to-date"
        # strUpdateTime
        dicProductJson["strUpdateTime"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # strImageUrl ("#" when the product has no image)
        if soupProduct.ProductImage and soupProduct.ProductImage.ImageURL:
            dicProductJson["strImageUrl"] = soupProduct.ProductImage.ImageURL.string
        else:
            dicProductJson["strImageUrl"] = "#"
        # strTitle
        dicProductJson["strTitle"] = soupProduct.ProductName.string
        # strLocation: "Country,City" without missing or repeated values. A
        # list keeps the order stable; a set would not.
        lstStrLocation = []
        for strPlace in (soupProduct.Destination.Country.string, soupProduct.Destination.City.string):
            if strPlace is not None and strPlace not in lstStrLocation:
                lstStrLocation.append(strPlace)
        dicProductJson["strLocation"] = ",".join(lstStrLocation)
        # intUsdCost
        dicProductJson["intUsdCost"] = int(float(soupProduct.Pricing.PriceUSD.string))
        # intReviewStar (0 when there is no rating)
        if soupProduct.ProductStarRating and soupProduct.ProductStarRating.AvgRating:
            dicProductJson["intReviewStar"] = int(float(soupProduct.ProductStarRating.AvgRating.string))
        else:
            dicProductJson["intReviewStar"] = 0
        # intReviewVisitor
        dicProductJson["intReviewVisitor"] = 1
        # strIntroduction
        dicProductJson["strIntroduction"] = soupProduct.Introduction.string
        # intDurationHour
        dicProductJson["intDurationHour"] = self.convertDurationStringToHourInt(strDurtation=soupProduct.Duration.string)
        # strGuideLanguage
        dicProductJson["strGuideLanguage"] = "english"
        # strStyle
        if soupProduct.ProductCategory and soupProduct.ProductCategory.Category:
            dicProductJson["strStyle"] = soupProduct.ProductCategory.Category.string
        else:
            dicProductJson["strStyle"] = ""
        # intOption is not provided
        #dicProductJson["intOption"] = -1
        return dicProductJson

    # Write the buffered products to "<index*5000>_viator_product.json" and empty the buffer
    def _writeParsedProductJson(self):
        strJsonFileName = "%d_viator_product.json"%(self.intProductJsonIndex*5000)
        strJsonPackageName = "findfine_crawler.resource.parsed_json.viator"
        strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName=strJsonPackageName, strResourceName=strJsonFileName)
        self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
        self.lstDicParsedProductJson = []

    # Crawl vapProducts.xml and produce the product json
    def crawlVapProductsXml(self, uselessArg1=None):
        """Convert every product of the feed and write them in files of 5000 records."""
        # Drop anything left in memory from a previous run
        self.lstDicParsedProductJson = []
        self.intProductJsonIndex = 1
        # Walk through the products one by one
        soupProduct = self.findNextProductData()
        while soupProduct:  # is not None
            logging.info("find product: %s"%soupProduct.ProductURL.string)
            self.lstDicParsedProductJson.append(self._convertProductSoupToDic(soupProduct))
            # Write a json file every 5000 records
            if len(self.lstDicParsedProductJson) >= 5000:
                self._writeParsedProductJson()
                self.intProductJsonIndex = self.intProductJsonIndex+1
            # Move on to the next product
            soupProduct = self.findNextProductData(soupCurrentProduct=soupProduct)
        # Write the remaining records; nothing is written when the buffer is empty
        if self.lstDicParsedProductJson:
            self._writeParsedProductJson()

    # Read the next product node from the xml
    def findNextProductData(self, soupCurrentProduct=None):
        """Return the ``Product`` node after ``soupCurrentProduct``.

        Without a current node, ``vapProducts.xml`` is parsed and its first
        ``Product`` node is returned.
        """
        if soupCurrentProduct:  # is not None: return the next Product
            return soupCurrentProduct.find_next_sibling("Product")
        # Otherwise find the first Product
        strXmlPackageName = "findfine_crawler.resource.source_data.viator"
        strXmlFileName = "vapProducts.xml"
        strXmlFilePath = self.fileUtil.getPackageResourcePath(strPackageName=strXmlPackageName, strResourceName=strXmlFileName)
        with open(strXmlFilePath, "r", encoding="utf-8") as xmlFile:
            soup = BeautifulSoup(xmlFile.read(), "xml")
        return soup.Products.find("Product")

    # Convert the duration text
    def convertDurationStringToHourInt(self, strDurtation=None):
        """Return the duration in whole hours, counting one day as 8 hours.

        Hour and day amounts may appear anywhere in the text and are added up
        ("2 days 3 hours" gives 19). Returns 1 when the text is empty or has
        no "<number> hour" / "<number> day" part.
        """
        intDefaultDuration = 1
        if not strDurtation:
            return intDefaultDuration
        # re.search, not re.match: the amounts are not always at the start.
        mDurationHour = re.search(r"(\d+(?:\.\d+)?)\s*hour", strDurtation)
        mDurationDay = re.search(r"(\d+(?:\.\d+)?)\s*day", strDurtation)
        if not mDurationHour and not mDurationDay:
            return intDefaultDuration
        fTotalDurationHour = 0.0
        if mDurationHour:
            fTotalDurationHour = fTotalDurationHour + float(mDurationHour.group(1))
        if mDurationDay:
            fTotalDurationHour = fTotalDurationHour + float(mDurationDay.group(1))*8
        return int(fTotalDurationHour)

    # Download vapProducts.xml.zip
    def downloadVapProductsXmlZip(self, uselessArg1=None):
        """Log in to the Viator partner site and save the zipped product feed.

        The credentials can be supplied through the environment variables
        ``VIATOR_PARTNER_ACCOUNT`` and ``VIATOR_PARTNER_PASSWORD``.
        """
        import os
        # NOTE(security): the fallback credentials below are hard-coded in the
        # source; prefer the environment variables and remove the defaults
        # once every deployment provides them.
        strPartnerAccount = os.environ.get("VIATOR_PARTNER_ACCOUNT", "19993")
        strPartnerPwd = os.environ.get("VIATOR_PARTNER_PASSWORD", "a768768a")
        # login (the cookie jar carries the session to the download request)
        cj = http.cookiejar.MozillaCookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        dicLoginData = urllib.parse.urlencode({
            "adminPUID":strPartnerAccount,
            "login_password":strPartnerPwd
        }).encode("utf-8")
        req = urllib.request.Request("https://www.partner.viator.com/partner/login.jspa", dicLoginData, method="POST")
        with opener.open(req):
            pass  # only the session cookies are needed
        # download the feed
        strUrl = "https://www.partner.viator.com/partner/admin/tools/links_feeds/downloadFeed.jspa?feed=Products&format=xml"
        req = urllib.request.Request(url=strUrl, method="GET")
        with opener.open(req) as response:
            byteVapProductsXmlZip = response.read()
        # save vapProducts.xml.zip
        strPackageName = "findfine_crawler.resource.source_data.viator"
        strZipFileName = "vapProducts.xml.zip"
        strZipFilePath = self.fileUtil.getPackageResourcePath(strPackageName=strPackageName, strResourceName=strZipFileName)
        with open(strZipFilePath, "wb") as zipFile:
            zipFile.write(byteVapProductsXmlZip)

    # Unzip vapProducts.xml.zip
    def unzipVapProductsXmlZip(self, uselessArg1=None):
        """Extract ``vapProducts.xml`` from the downloaded zip into the viator resource folder."""
        strZipFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.source_data.viator", strResourceName="vapProducts.xml.zip")
        with ZipFile(strZipFilePath, "r") as zipFile:
            strXmlBaseFolderPath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.source_data", strResourceName="viator")
            zipFile.extract("vapProducts.xml", strXmlBaseFolderPath)
Ejemplo n.º 10
0
class CrawlerForTRIPBAA:
    """Selenium crawler for Tripbaa.

    The "search" sub-command stores product urls in the local db; the
    "product" sub-command visits stored product urls, parses each page into a
    dict and writes the dicts to json files in batches of 100.
    """

    #constructor
    def __init__(self):
        #sub-command name -> handler; every handler accepts one optional argument
        self.dicSubCommandHandler = {
            "search":self.crawlSearchPage,
            "product":self.crawlProductPage
        }
        self.ffUtil = FfUtility()
        self.fileUtil = FilesysUtility()
        self.db = LocalDbForTRIPBAA()
        self.lstDicParsedProductJson = []  #parsed product data not yet written to product.json
        self.intProductJsonIndex = 1  #index of the next json file (file name uses index*100)
        self.driver = None  #selenium driver, created by initDriver

    #get spider usage information
    def getUseageMessage(self):
        """Return the usage text listing the supported sub-commands."""
        return (
            "- Tripbaa -\n"
            "useage:\n"
            "search - crawl search page of Tripbaa \n"
            "product - crawl not obtained product page \n"
        )

    #get selenium driver object
    def getDriver(self):
        """Create and return a new Chrome webdriver using the packaged chromedriver.exe."""
        chromeDriverExeFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource", strResourceName="chromedriver.exe")
        driver = webdriver.Chrome(chromeDriverExeFilePath)
        return driver

    #initialize selenium driver object
    def initDriver(self):
        """Create self.driver if it does not exist yet."""
        if not self.driver:
            self.driver = self.getDriver()

    #terminate selenium driver object
    def quitDriver(self):
        """Quit self.driver (if any) and always reset it to None."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        finally:
            #drop the reference even when quit() fails so initDriver creates a fresh driver
            self.driver = None

    #restart selenium driver object
    def restartDriver(self):
        """Quit the current driver, wait 5 seconds, then create a new one."""
        self.quitDriver()
        time.sleep(5)
        self.initDriver()

    #run crawler
    def runCrawler(self, lstSubcommand=None):
        """Run the handler named by lstSubcommand[0].

        lstSubcommand[1] is passed to the handler only when the list has
        exactly two items; otherwise the handler receives None. An unknown
        sub-command raises KeyError. The driver is quit even when the handler
        raises.
        """
        strSubcommand = lstSubcommand[0]
        strArg1 = None
        if len(lstSubcommand) == 2:
            strArg1 = lstSubcommand[1]
        self.initDriver() #init selenium driver
        try:
            self.dicSubCommandHandler[strSubcommand](strArg1)
        finally:
            self.quitDriver() #quit selenium driver, also on failure, so no browser is left running

    #crawl search page
    def crawlSearchPage(self, uselessArg1=None):
        """Expand the Tripbaa search page and store every product url in the local db.

        uselessArg1 is ignored.
        """
        logging.info("crawl search page")
        #Tripbaa search page
        self.driver.get("https://en.tripbaa.com/search.php?&ccid=JCU1IyE=")
        time.sleep(5)
        #expand the page: keep clicking the "more" link until it is no longer found
        while True:
            lstEleMoreBtn = self.driver.find_elements_by_css_selector("#morebutton a")
            if not lstEleMoreBtn:
                break
            lstEleMoreBtn[0].click()
            time.sleep(5)
        #parse product hyperlinks
        lstEleProductA = self.driver.find_elements_by_css_selector("ul li div.htBox div.htPic a")
        for eleProductA in lstEleProductA:
            strProductHref = eleProductA.get_attribute("href")
            #save product hyperlink to localdb (anchors without a href are skipped)
            if strProductHref and strProductHref.startswith("https://en.tripbaa.com/travel/"):
                strProductUrl = strProductHref + u"?&ccid=JCU1IyE=" #append the ccid that makes the page show USD by default
                self.db.insertProductUrlIfNotExists(strProductUrl=strProductUrl)
                logging.info("save product url: %s", strProductUrl)

    #parse product page
    def parseProductPage(self, strProductUrl=None):
        """Parse the product page currently loaded in self.driver.

        The resulting dict is appended to self.lstDicParsedProductJson.
        Any selenium lookup failure or unexpected text format raises; the
        caller (crawlProductPage) catches the exception and skips the product.
        """
        dicProductJson = {}
        #strSource
        dicProductJson["strSource"] = "Tripbaa"
        #strOriginUrl
        dicProductJson["strOriginUrl"] = strProductUrl
        #strUpdateStatus
        dicProductJson["strUpdateStatus"] = "up-to-date"
        #strUpdateTime
        dicProductJson["strUpdateTime"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        #strImageUrl
        strImageUrl = self.driver.find_element_by_css_selector("ul.picBox li:nth-of-type(1) a:nth-of-type(1) img").get_attribute("src")
        dicProductJson["strImageUrl"] = strImageUrl
        #strTitle
        strTitle = self.driver.find_element_by_css_selector("h1.mtTripName").text
        dicProductJson["strTitle"] = strTitle.strip()
        #strLocation (the same "location/style" text is reused for strStyle below)
        lstStrKindPart = self.driver.find_element_by_css_selector("div.mtKind p:nth-of-type(1)").text.split("/")
        dicProductJson["strLocation"] = lstStrKindPart[0].strip()
        #intUsdCost (keep only digits and dots, then truncate to an int)
        strUsdCost = self.driver.find_element_by_css_selector("div.mtIntro_NoPic div.okListMoney span.import01 span.blue").text
        strUsdCost = re.sub(r"[^\d\.]", "", strUsdCost)
        dicProductJson["intUsdCost"] = int(float(strUsdCost.strip()))
        #intReviewStar
        strReviewStar = self.driver.find_element_by_css_selector("div.mtStarBox").get_attribute("data-average")
        dicProductJson["intReviewStar"] = int(float(strReviewStar.strip()))
        #intReviewVisitor
        #NOTE(review): this value is a random number, not data read from the page
        dicProductJson["intReviewVisitor"] = random.randint(0, 30)
        #strIntroduction (every whitespace character becomes a plain space)
        strIntroduction = self.driver.find_element_by_css_selector("h2.mtTripInfo").text.strip()
        dicProductJson["strIntroduction"] = re.sub(r"\s", " ", strIntroduction)
        #intDurationHour (first span mentioning "Duration"; 0 when there is none)
        lstStrSpanTextInOkList = [
            eleSpanInOkList.text
            for eleSpanInOkList in self.driver.find_elements_by_css_selector("div.okList span.import01")
        ]
        intDurationHour = 0
        for strSpanTextInOkList in lstStrSpanTextInOkList:
            if "Duration" in strSpanTextInOkList:
                strDurationHour = re.sub(r"\s", " ", strSpanTextInOkList.strip().lower())
                intDurationHour = self.convertDurationStringToHourInt(strDurtation=strDurationHour)
                break
        dicProductJson["intDurationHour"] = intDurationHour
        #strGuideLanguage (first span mentioning "Language"; "english" when there is none)
        strGuideLanguage = "english"
        for strSpanTextInOkList in lstStrSpanTextInOkList:
            if "Language" in strSpanTextInOkList:
                #keep letters only, collapse whitespace, then drop the leading "language" label
                strGuideLanguage = re.sub("[^a-zA-Z]", " ", strSpanTextInOkList.lower()).strip()
                strGuideLanguage = re.sub(r"[\s]+", " ", strGuideLanguage).strip()
                #raises AttributeError when the text does not start with "language "
                strGuideLanguage = re.match("^language (.*)$", strGuideLanguage).group(1).strip()
                break
        dicProductJson["strGuideLanguage"] = strGuideLanguage
        #strStyle (second part of the "location/style" text; IndexError when there is no "/")
        dicProductJson["strStyle"] = lstStrKindPart[1].strip()
        #intOption (to be confirmed)
        dicProductJson["intOption"] = None
        self.lstDicParsedProductJson.append(dicProductJson)

    #write the buffered product data to the json file named after the current index
    def _writeParsedProductJson(self):
        """Write self.lstDicParsedProductJson to "<index*100>_product.json" and empty the buffer."""
        strJsonFileName = "%d_product.json"%(self.intProductJsonIndex*100)
        strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.parsed_json.tripbaa", strResourceName=strJsonFileName)
        self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
        self.lstDicParsedProductJson = []

    #crawl product page
    def crawlProductPage(self, uselessArg1=None):
        """Visit stored product urls, parse them and write json files of 100 products each.

        uselessArg1 is ignored. A product whose page fails to load or parse is
        logged and skipped, and the driver is restarted.
        """
        logging.info("crawl product page")
        #clear data left over in memory
        self.lstDicParsedProductJson = []
        self.intProductJsonIndex = 1
        #product urls recorded in the db (requested with isGot=False)
        lstStrProductUrl = self.db.fetchallProductUrl(isGot=False)
        for strProductUrl in lstStrProductUrl:
            #check whether the product has already been downloaded
            if not self.db.checkProductIsGot(strProductUrl=strProductUrl):
                time.sleep(random.randint(5,8)) #sleep random time
                try:
                    self.driver.get(strProductUrl)
                    #parse product page
                    self.parseProductPage(strProductUrl=strProductUrl)
                    #NOTE: marking the product as obtained (isGot = 1) is currently disabled
                    #self.db.updateProductStatusIsGot(strProductUrl=strProductUrl)
                except Exception as e:
                    logging.warning(str(e))
                    logging.warning("selenium driver crashed. skip get product: %s", strProductUrl)
                    self.restartDriver() #restart
            #show progress
            logging.info("進度: %d/100", len(self.lstDicParsedProductJson))
            #write json once 100 products are buffered
            if len(self.lstDicParsedProductJson) == 100:
                self._writeParsedProductJson()
                self.intProductJsonIndex = self.intProductJsonIndex+1
        #write the remaining data to json
        if len(self.lstDicParsedProductJson) > 0:
            self._writeParsedProductJson()
            self.intProductJsonIndex = 1

    #convert duration information
    def convertDurationStringToHourInt(self, strDurtation=None):
        """Convert text such as "2 day 3 hr" into a number of hours.

        A day counts as 8 hours and fractional amounts are truncated.
        Returns 1 (the default) when the text is empty or contains no
        readable "<number> hr" / "<number> day" amount.
        """
        intDefaultDuration = 1
        if not strDurtation:
            return intDefaultDuration
        intTotalDurationHour = 0
        isAmountFound = False
        #(pattern, hours per unit)
        for strPattern, intHourPerUnit in ((r"([\d\.]+) hr", 1), (r"([\d\.]+) day", 8)):
            mDuration = re.search(strPattern, strDurtation)
            if not mDuration:
                continue
            try:
                intAmount = int(float(mDuration.group(1)))
            except ValueError:
                #the character class also matches text such as ".." that is not a number
                continue
            isAmountFound = True
            intTotalDurationHour = intTotalDurationHour + (intAmount*intHourPerUnit)
        if not isAmountFound:
            return intDefaultDuration
        return intTotalDurationHour
Ejemplo n.º 11
0
class CrawlerForBMG:
    """Crawler that reads products from the BeMyGuest http API and writes them to bmg_product.json."""

    #constructor
    def __init__(self):
        #SECURITY NOTE(review): the API authorization code is hard-coded in source;
        #it should be moved to configuration outside the repository.
        self.strAuthCode = "uds5e527i008wa7k47gyl4srzy3zywbxpw7ei6oe"
        #sub-command name -> handler; every handler accepts one optional argument
        self.dicSubCommandHandler = {
            "bmgapi":self.crawlBMGAPI
        }
        self.ffUtil = FfUtility()
        self.fileUtil = FilesysUtility()
        self.lstDicParsedProductJson = []  #parsed product data not yet written to product.json

    #get spider usage information
    def getUseageMessage(self):
        """Return the usage text listing the supported sub-commands."""
        return (
            "- BeMyGuest -\n"
            "useage:\n"
            "bmgapi - crawl BeMyGuest API product \n"
        )

    #run crawler
    def runCrawler(self, lstSubcommand=None):
        """Run the handler named by lstSubcommand[0].

        lstSubcommand[1] is passed to the handler only when the list has
        exactly two items; otherwise the handler receives None. An unknown
        sub-command raises KeyError.
        """
        strSubcommand = lstSubcommand[0]
        strArg1 = None
        if len(lstSubcommand) == 2:
            strArg1 = lstSubcommand[1]
        self.dicSubCommandHandler[strSubcommand](strArg1)

    #find the adult price of a product
    def _findFirstAdultPrice(self, dicPrices=None):
        """Return the first positive regular adult price found in dicPrices, or 0.0.

        dicPrices maps a key to a dict that may hold {"regular": {"adult": {key: price}}}.
        The search stops at the first positive price so that a later price set
        cannot overwrite it.
        """
        for dicPrice in (dicPrices or {}).values():
            dicRegular = (dicPrice or {}).get("regular", False)
            if not dicRegular:
                continue
            for fPrice in (dicRegular.get("adult", {}) or {}).values():
                if fPrice and fPrice > 0.0:
                    return fPrice
        return 0.0

    #convert BeMyGuest product detail data to findfine data format
    def _buildProductJson(self, strProductUUID=None, dicProductDetailData=None):
        """Build the findfine product dict from one BeMyGuest product detail dict.

        Raises when a mandatory part is missing (no url, no photo, no product
        type); the caller logs the error and skips the product.
        """
        #currency data (for checking)
        logging.info("product currency: %s", dicProductDetailData.get("currency", {}).get("code", str(None)))
        dicProductJson = {}
        #strSource
        dicProductJson["strSource"] = "BeMyGuest"
        #strOriginUrl
        dicProductJson["strOriginUrl"] = dicProductDetailData.get("url", None) + u"?partner_id=findfinetour&currency=USD"
        #strUpdateStatus
        dicProductJson["strUpdateStatus"] = "up-to-date"
        #strUpdateTime
        dicProductJson["strUpdateTime"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        #strImageUrl (base photos url + path of the first photo's original image)
        strBasePhotosUrl = dicProductDetailData.get("photosUrl", None)
        strOriginalImgPath = dicProductDetailData.get("photos", [{}])[0].get("paths", {}).get("original", None)
        dicProductJson["strImageUrl"] = strBasePhotosUrl + strOriginalImgPath
        #strTitle
        dicProductJson["strTitle"] = dicProductDetailData.get("title", None)
        #strLocation (city, state, country of every location; missing values are
        #dropped and duplicates removed while keeping the first-seen order)
        lstStrLocation = []
        for dicLocation in dicProductDetailData.get("locations", []):
            for strLocationKey in ("city", "state", "country"):
                strLocationPart = dicLocation.get(strLocationKey, None)
                if strLocationPart:
                    lstStrLocation.append(strLocationPart)
        dicProductJson["strLocation"] = ",".join(dict.fromkeys(lstStrLocation))
        #intUsdCost (taken from the first product type)
        dicProductType = dicProductDetailData.get("productTypes", [{}])
        fAdultPrice = self._findFirstAdultPrice(dicPrices=dicProductType[0].get("prices", {}))
        logging.info("%s got price: %f USD"%(strProductUUID, fAdultPrice))
        #NOTE(review): the price is stored as returned by the API (may be a float) although
        #the key is named intUsdCost; a former conversion int(fAdultPrice/1.39) was disabled
        dicProductJson["intUsdCost"] = fAdultPrice
        #intReviewStar (a missing or null value counts as 0)
        dicProductJson["intReviewStar"] = int(dicProductDetailData.get("reviewAverageScore", 0) or 0)
        #intReviewVisitor (a missing or null value counts as 0)
        dicProductJson["intReviewVisitor"] = int(dicProductDetailData.get("reviewCount", 0) or 0)
        #strIntroduction
        dicProductJson["strIntroduction"] = dicProductDetailData.get("description", None)
        #intDurationHour (a day counts as 8 hours; missing or null values count as 0)
        intDays = dicProductType[0].get("durationDays", 0) or 0
        intHours = dicProductType[0].get("durationHours", 0) or 0
        dicProductJson["intDurationHour"] = (8*intDays) + intHours
        #strGuideLanguage (entries without a name are dropped)
        lstStrLanguageName = [
            dicGuideLanguage.get("name", None)
            for dicGuideLanguage in dicProductDetailData.get("guideLanguages", [])
        ]
        dicProductJson["strGuideLanguage"] = ",".join(strName for strName in lstStrLanguageName if strName)
        #strStyle (entries without a name are dropped)
        lstStrCategoryName = [
            dicCategory.get("name", None)
            for dicCategory in dicProductDetailData.get("categories", [])
        ]
        dicProductJson["strStyle"] = ",".join(strName for strName in lstStrCategoryName if strName)
        #intOption
        dicProductJson["intOption"] = None
        return dicProductJson

    #crawl BeMyGuest API products
    def crawlBMGAPI(self, uselessArg1=None):
        """Fetch every BeMyGuest product, convert it and write all of them to bmg_product.json.

        uselessArg1 is ignored. A product whose detail request or conversion
        raises is logged and skipped.
        """
        #clear data left over in memory
        self.lstDicParsedProductJson = []
        #get rough data of all products
        lstDicProductRoughData = self.getAllProductRoughData()
        for dicProductRoughData in lstDicProductRoughData:
            strProductUUID = dicProductRoughData.get("uuid", None)
            try:
                #get product detail data
                dicProductDetailData = self.getProductDetailData(strProductUUID=strProductUUID)
                #convert to findfine data format
                dicProductJson = self._buildProductJson(strProductUUID=strProductUUID, dicProductDetailData=dicProductDetailData)
                #add data to json
                self.lstDicParsedProductJson.append(dicProductJson)
            except Exception as e:
                logging.warning(str(e))
                logging.warning("crawl product failed, skip: %s", strProductUUID)
                continue
        #write data to json
        strJsonFileName = "bmg_product.json"
        strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.parsed_json.bmg", strResourceName=strJsonFileName)
        self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
        self.lstDicParsedProductJson = []

    #get rough data of all products
    def getAllProductRoughData(self):
        """Return the "data" items of every page of the products listing.

        Pages are followed through meta.pagination.links.next of each response
        until no next link is given.
        """
        lstDicProductRoughData = []
        #first page
        strPageUrl = "https://apidemo.bemyguest.com.sg/v1/products"
        while strPageUrl:
            logging.info("get BMG product rough data: %s", strPageUrl)
            strRespJson = self.sendHttpRequestByUrllib(
                strUrl=strPageUrl,
                dicHeader={"X-Authorization":self.strAuthCode},
                dicData=None,
                strEncoding="utf-8"
            )
            dicRespJson = json.loads(strRespJson)
            lstDicProductRoughData.extend(dicRespJson.get("data", []))
            #next page
            strPageUrl = dicRespJson.get("meta", {}).get("pagination", {}).get("links", {}).get("next", None)
            if strPageUrl:
                #a numeric currency value in the next-page link is replaced by USD
                strPageUrl = re.sub(r"currency=[\d]+", "currency=USD", strPageUrl)
        return lstDicProductRoughData

    #get product detail data
    def getProductDetailData(self, strProductUUID=None):
        """Return the "data" part of the product detail response (None when absent)."""
        logging.info("get BMG product detail data: %s", strProductUUID)
        strRespJson = self.sendHttpRequestByUrllib(
            strUrl="https://apidemo.bemyguest.com.sg/v1/products/%s?currency=USD"%strProductUUID,
            dicHeader={"X-Authorization":self.strAuthCode},
            dicData=None,
            strEncoding="utf-8"
        )
        dicRespJson = json.loads(strRespJson)
        dicProductDetailData = dicRespJson.get("data", None)
        return dicProductDetailData

    #send HTTP request with urllib
    def sendHttpRequestByUrllib(self, strUrl=None, dicHeader=None, dicData=None, strEncoding="utf-8"):
        """Send a request with urllib and return the decoded response body.

        A truthy dicData is url-encoded and sent as POST; otherwise a GET is
        sent. dicHeader defaults to no extra headers. strEncoding is used both
        to encode the POST data and to decode the response. Errors raised by
        urllib are not caught here.
        """
        if dicHeader is None:
            dicHeader = {}
        if dicData: #dicData provided: use POST
            byteEncodedData = urllib.parse.urlencode(dicData).encode(strEncoding)
            req = urllib.request.Request(url=strUrl, data=byteEncodedData, headers=dicHeader, method="POST")
        else: #dicData=None: use GET
            req = urllib.request.Request(url=strUrl, data=None, headers=dicHeader, method="GET")
        #the context manager closes the connection once the body has been read
        with urllib.request.urlopen(req) as response:
            return response.read().decode(strEncoding)
Ejemplo n.º 12
0
class CrawlerForKLOOK:
    """Selenium crawler for KLOOK.

    "index" stores city urls, "city" stores the product urls of every city
    page, and "product" parses product pages into dicts that are written to
    json files in batches of 100.
    """

    #constructor
    def __init__(self):
        #sub-command name -> handler; every handler accepts one optional argument
        self.dicSubCommandHandler = {
            "index":self.crawlIndexPage,
            "city":self.crawlCityPage,
            "product":self.crawlProductPage
        }
        self.ffUtil = FfUtility()
        self.fileUtil = FilesysUtility()
        self.db = LocalDbForKLOOK()
        self.lstDicParsedProductJson = []  #parsed product data not yet written to product.json
        self.intProductJsonIndex = 1  #index of the next json file (file name uses index*100)
        self.driver = None  #selenium driver, created by initDriver

    #get spider usage information
    def getUseageMessage(self):
        """Return the usage text listing the supported sub-commands."""
        return (
            "- KLOOK -\n"
            "useage:\n"
            "index - crawl index page of KLOOK \n"
            "city - crawl not obtained city page \n"
            "product [city_page_1_url] - crawl not obtained product page [of given city_page_1_url] \n"
        )

    #get selenium driver object
    def getDriver(self):
        """Create and return a new Chrome webdriver using the packaged chromedriver.exe."""
        chromeDriverExeFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource", strResourceName="chromedriver.exe")
        driver = webdriver.Chrome(chromeDriverExeFilePath)
        return driver

    #initialize selenium driver object
    def initDriver(self):
        """Create self.driver if it does not exist yet."""
        if not self.driver:
            self.driver = self.getDriver()

    #terminate selenium driver object
    def quitDriver(self):
        """Quit self.driver (if any) and always reset it to None."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        finally:
            #drop the reference even when quit() fails so initDriver creates a fresh driver
            self.driver = None

    #restart selenium driver object
    def restartDriver(self):
        """Quit the current driver, wait 5 seconds, then create a new one."""
        self.quitDriver()
        time.sleep(5)
        self.initDriver()

    #run crawler
    def runCrawler(self, lstSubcommand=None):
        """Run the handler named by lstSubcommand[0].

        lstSubcommand[1] is passed to the handler only when the list has
        exactly two items; otherwise the handler receives None. An unknown
        sub-command raises KeyError. The driver is quit even when the handler
        raises.
        """
        strSubcommand = lstSubcommand[0]
        strArg1 = None
        if len(lstSubcommand) == 2:
            strArg1 = lstSubcommand[1]
        self.initDriver() #init selenium driver
        try:
            self.dicSubCommandHandler[strSubcommand](strArg1)
        finally:
            self.quitDriver() #quit selenium driver, also on failure, so no browser is left running

    #crawl index page
    def crawlIndexPage(self, uselessArg1=None):
        """Open the KLOOK index page in English and store every city url in the local db.

        uselessArg1 is ignored.
        """
        logging.info("crawl index page")
        #KLOOK index page
        self.driver.get("https://www.klook.com/")
        #switch to the English version
        eleLangSelect = self.driver.find_element_by_id("f_lang")
        for eleLangOption in eleLangSelect.find_elements_by_tag_name("option"):
            if eleLangOption.text == "English":
                eleLangOption.click()
                time.sleep(10)
                break
        #parse city hyperlinks
        lstEleCityA = self.driver.find_elements_by_css_selector("#searchCityList a")
        for eleCityA in lstEleCityA:
            strCityHref = eleCityA.get_attribute("href")
            #save city hyperlink to localdb (anchors without a href are skipped)
            if strCityHref and strCityHref.startswith("https://www.klook.com/city/"):
                self.db.insertCityIfNotExists(strCityPage1Url=strCityHref)
                logging.info("save city url: %s", strCityHref)

    #parse city page, return the number of products found
    def parseCityPage(self, strCityPage1Url=None):
        """Store the product urls of the city page currently loaded in self.driver.

        Returns the number of product urls stored from this page.
        """
        intFoundProduct = 0
        #look for product hyperlinks
        elesProductA = self.driver.find_elements_by_css_selector("#activities div.j_activity_item a")
        for eleProductA in elesProductA:
            strProductUrl = eleProductA.get_attribute("href")
            if not strProductUrl:
                continue #anchor without a href
            #save product hyperlink to localdb (urls containing a single quote are skipped)
            if strProductUrl.startswith("https://www.klook.com/activity/") and "'" not in strProductUrl:
                logging.info("insert product url: %s", strProductUrl)
                self.db.insertProductUrlIfNotExists(strProductUrl=strProductUrl, strCityPage1Url=strCityPage1Url)
                intFoundProduct = intFoundProduct+1
        return intFoundProduct

    #crawl city page
    def crawlCityPage(self, uselessArg1=None):
        """Walk the pages of every not yet obtained city and store their product urls.

        Pages are requested with "?p=<n>" until a page yields no product url;
        the city is then marked as obtained. uselessArg1 is ignored.
        """
        logging.info("crawl city page")
        #get the city urls in the db that have not been downloaded yet
        lstStrNotObtainedCityPage1Url = self.db.fetchallNotObtainedCityUrl()
        for strNotObtainedCityPage1Url in lstStrNotObtainedCityPage1Url:
            #find the city name with re; it is only used for logging, so fall back to the
            #url itself instead of failing when the url has an unexpected format
            mCityName = re.match(r"^https://www.klook.com/city/[\d]+-(.*)/$", strNotObtainedCityPage1Url)
            strCityName = mCityName.group(1) if mCityName else strNotObtainedCityPage1Url
            #city page number
            intCityPageNum = 1
            #city page 1
            time.sleep(random.randint(2,5)) #sleep random time
            strCityPageUrl = strNotObtainedCityPage1Url + u"?p=%d"%intCityPageNum #append page number
            self.driver.get(strCityPageUrl)
            #parse product hyperlinks
            intFoundProduct = self.parseCityPage(strCityPage1Url=strNotObtainedCityPage1Url)
            while intFoundProduct != 0:
                time.sleep(random.randint(5,8)) #sleep random time
                intCityPageNum = intCityPageNum+1
                strCityPageUrl = strNotObtainedCityPage1Url + u"?p=%d"%intCityPageNum #append page number
                self.driver.get(strCityPageUrl)
                time.sleep(5) #wait for the page to finish loading
                #parse product hyperlinks
                intFoundProduct = self.parseCityPage(strCityPage1Url=strNotObtainedCityPage1Url)
            #update city DB as obtained (isGot = 1)
            self.db.updateCityStatusIsGot(strCityPage1Url=strNotObtainedCityPage1Url)
            #the page count includes the last requested page, which had no product
            logging.info("got city %s find %d pages"%(strCityName, intCityPageNum))

    #parse product page
    def parseProductPage(self, strProductUrl=None):
        """Parse the product page currently loaded in self.driver.

        The resulting dict is appended to self.lstDicParsedProductJson.
        Any selenium lookup failure or unexpected text format raises; the
        caller catches the exception and skips the product.
        """
        dicProductJson = {}
        #strSource
        dicProductJson["strSource"] = "KLOOK"
        #strOriginUrl
        dicProductJson["strOriginUrl"] = strProductUrl
        #strUpdateStatus
        dicProductJson["strUpdateStatus"] = "up-to-date"
        #strUpdateTime
        dicProductJson["strUpdateTime"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        #strImageUrl (taken from the inline style of the banner section; the characters
        #: ; " ( ) and whitespace are removed first, so "https://" appears as "https//")
        strImageSectionStyle = self.driver.find_element_by_css_selector("section.banner").get_attribute("style")
        strImageSectionStyle = re.sub(r'[:;"\s\(\)]', "", strImageSectionStyle).strip()
        #the image url may contain Chinese characters, so url-encode it first
        strImageUrl = u"https://" + urllib.parse.quote(re.match(r"^.*https//(res\.klook\.com/images/.*)$", strImageSectionStyle).group(1).strip())
        dicProductJson["strImageUrl"] = strImageUrl
        #strTitle
        strTitle = self.driver.find_element_by_css_selector("section.activity header h1.t_main").text
        dicProductJson["strTitle"] = strTitle.strip()
        #strLocation
        strLocation = self.driver.find_element_by_css_selector("section.activity header p span.icon-label:nth-of-type(1)").text
        dicProductJson["strLocation"] = strLocation.strip()
        #intUsdCost (keep digits and the decimal point, then truncate to an int, so that
        #a price such as "12.50" becomes 12 and not 1250)
        strUsdCost = self.driver.find_element_by_css_selector("div.right_price_box span.t_main").text
        strUsdCost = re.sub(r"[^\d\.]", "", strUsdCost)
        dicProductJson["intUsdCost"] = int(float(strUsdCost.strip()))
        #intReviewStar (fixed value, not read from the page)
        dicProductJson["intReviewStar"] = 5
        #intReviewVisitor (fixed value, not read from the page)
        dicProductJson["intReviewVisitor"] = 1
        #strIntroduction (text of every matched element, each preceded by one space)
        elesIntroduction = self.driver.find_elements_by_css_selector("section.activity div.j_blank_window.actinfo *")
        dicProductJson["strIntroduction"] = u"".join(
            u" " + re.sub(r"\s", " ", eleIntroduction.text.strip())
            for eleIntroduction in elesIntroduction
        )
        #intDurationHour
        strDurationHour = self.driver.find_element_by_css_selector("section.activity section.j_blank_window.actinfo:nth-of-type(1) div div:nth-of-type(1) p").text
        strDurationHour = re.sub(r"\s", " ", strDurationHour.lower())
        dicProductJson["intDurationHour"] = self.convertDurationStringToHourInt(strDurtation=strDurationHour)
        #strGuideLanguage (raises AttributeError when the text does not start with "language ")
        strGuideLanguage = self.driver.find_element_by_css_selector("section.activity section.j_blank_window.actinfo:nth-of-type(1) div div:nth-of-type(2) p").text
        strGuideLanguage = re.match("^language (.*)$", re.sub(r"\s", " ", strGuideLanguage.lower())).group(1)
        dicProductJson["strGuideLanguage"] = strGuideLanguage
        #intOption (to be confirmed)
        dicProductJson["intOption"] = None
        #strStyle (klook does not provide this data)
        dicProductJson["strStyle"] = None
        self.lstDicParsedProductJson.append(dicProductJson)

    #write the buffered product data to the json file named after the current index
    def _writeParsedProductJson(self):
        """Write self.lstDicParsedProductJson to "<index*100>_product.json" and empty the buffer."""
        strJsonFileName = "%d_product.json"%(self.intProductJsonIndex*100)
        strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.parsed_json.klook", strResourceName=strJsonFileName)
        self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
        self.lstDicParsedProductJson = []

    #crawl product page (strCityPage1Url == None uses the cities returned by the db)
    def crawlProductPage(self, strCityPage1Url=None):
        """Crawl the product pages of one city, or of every city returned by
        self.db.fetchallCompletedObtainedCityUrl() when strCityPage1Url is empty.

        Products still buffered at the end are written to a last json file.
        """
        #clear data left over in memory
        self.lstDicParsedProductJson = []
        self.intProductJsonIndex = 1
        if not strCityPage1Url:
            #no city given
            lstStrObtainedCityUrl = self.db.fetchallCompletedObtainedCityUrl()
            for strObtainedCityUrl in lstStrObtainedCityUrl:
                self.crawlProductPageWithGivenCityUrl(strCityPage1Url=strObtainedCityUrl)
        else:
            #city url given
            self.crawlProductPageWithGivenCityUrl(strCityPage1Url=strCityPage1Url)
        #write the last data to json
        if len(self.lstDicParsedProductJson) > 0:
            self._writeParsedProductJson()

    #crawl product page (given city url)
    def crawlProductPageWithGivenCityUrl(self, strCityPage1Url=None):
        """Parse every not yet obtained product page of the given city.

        A json file is written each time 100 products are buffered. A product
        whose page fails to load or parse is logged and skipped, and the
        driver is restarted.
        """
        logging.info("crawl product page with city %s", strCityPage1Url)
        #get the product urls recorded in the db for the given city
        lstStrProductUrl = self.db.fetchallProductUrlByCityUrl(strCityPage1Url=strCityPage1Url)
        for strProductUrl in lstStrProductUrl:
            #check whether the product has already been downloaded
            if not self.db.checkProductIsGot(strProductUrl=strProductUrl):
                time.sleep(random.randint(5,8)) #sleep random time
                try:
                    self.driver.get(strProductUrl)
                    #switch the current currency to USD
                    strCurrentCurrencyText = self.driver.find_element_by_css_selector("#j_currency a:nth-of-type(1)").text
                    logging.info("目前幣別: %s", strCurrentCurrencyText)
                    if strCurrentCurrencyText != "USD":
                        logging.info("切換目前幣別至 USD")
                        eleCurrencyLi = self.driver.find_element_by_css_selector("#j_currency")
                        eleUsdA = self.driver.find_element_by_css_selector("#j_currency li a[data-value=USD]")
                        #hover the currency menu, then move to the USD entry and click it
                        actHoverThenClick = ActionChains(self.driver)
                        actHoverThenClick.move_to_element(eleCurrencyLi).move_to_element(eleUsdA).click().perform()
                        time.sleep(10) #wait for the currency switch to complete
                    #parse product page
                    self.parseProductPage(strProductUrl=strProductUrl)
                    #NOTE: marking the product as obtained (isGot = 1) is currently disabled
                    #self.db.updateProductStatusIsGot(strProductUrl=strProductUrl)
                except Exception as e:
                    logging.warning(str(e))
                    logging.warning("selenium driver crashed. skip get product: %s", strProductUrl)
                    self.restartDriver() #restart
            #show progress
            logging.info("進度: %d/100", len(self.lstDicParsedProductJson))
            #write json once 100 products are buffered
            if len(self.lstDicParsedProductJson) == 100:
                self._writeParsedProductJson()
                self.intProductJsonIndex = self.intProductJsonIndex+1

    #convert duration information
    def convertDurationStringToHourInt(self, strDurtation=None):
        """Convert text such as "2 day 3 hour" into a number of hours.

        A day counts as 8 hours and fractional amounts are truncated.
        Returns 1 (the default) when the text is empty or contains no
        readable "<number> hour" / "<number> day" amount.
        """
        intDefaultDuration = 1
        if not strDurtation:
            return intDefaultDuration
        intTotalDurationHour = 0
        isAmountFound = False
        #(pattern, hours per unit)
        for strPattern, intHourPerUnit in ((r"([\d\.]+) hour", 1), (r"([\d\.]+) day", 8)):
            mDuration = re.search(strPattern, strDurtation)
            if not mDuration:
                continue
            try:
                intAmount = int(float(mDuration.group(1)))
            except ValueError:
                #the character class also matches text such as ".." that is not a number
                continue
            isAmountFound = True
            intTotalDurationHour = intTotalDurationHour + (intAmount*intHourPerUnit)
        if not isAmountFound:
            return intDefaultDuration
        return intTotalDurationHour