class SQLite3Db:
    """Thin wrapper around a sqlite3 connection stored as a package resource.

    The database file ``local.db`` is resolved inside the resource folder
    *strResFolderPath* via FileSystemUtility, then opened with
    ``sqlite3.Row`` as the row factory so results behave like mappings.
    """

    # constructor
    def __init__(self, strResFolderPath=None):
        logging.basicConfig(level=logging.INFO)
        self.fsUtil = FileSystemUtility()
        strDbPath = self.fsUtil.getPackageResourcePath(strPackageName=strResFolderPath, strResourceName="local.db")
        logging.info("connect to sqlite3 db.")
        self.conn = sqlite3.connect(strDbPath)  # establish connection
        self.conn.row_factory = sqlite3.Row  # wrap rows as sqlite3.Row objects

    # destructor
    def __del__(self):
        # Guard: if __init__ raised before self.conn was assigned (e.g. the
        # resource lookup failed), the original code raised AttributeError
        # from inside __del__; degrade gracefully instead.
        conn = getattr(self, "conn", None)
        if conn is not None:
            logging.info("close sqlite3 db connection.")
            conn.close()  # close db connection

    # Execute SQL and commit (for INSERT / UPDATE / DELETE).
    def commitSQL(self, strSQL=None, lstParams=None):
        """Execute *strSQL*, commit, and return the last inserted row id.

        lstParams -- optional sequence of bind values for ``?`` placeholders.
        Prefer placeholders over string-formatted SQL to avoid SQL injection;
        omitting lstParams preserves the original raw-string behavior.
        """
        c = self.conn.cursor()
        if lstParams is None:
            c.execute(strSQL)
        else:
            c.execute(strSQL, lstParams)
        self.conn.commit()
        return c.lastrowid  # row id of the last INSERT

    # Execute SQL and fetch all rows (for SELECT).
    def fetchallSQL(self, strSQL=None, lstParams=None):
        """Execute *strSQL* and return every result row (list of sqlite3.Row).

        lstParams -- optional sequence of bind values for ``?`` placeholders.
        """
        c = self.conn.cursor()
        if lstParams is None:
            c.execute(strSQL)
        else:
            c.execute(strSQL, lstParams)
        return c.fetchall()
def __init__(self, strResFolderPath=None):
    """Open the package-bundled ``local.db`` sqlite3 database.

    strResFolderPath -- resource package that contains ``local.db``.
    """
    logging.basicConfig(level=logging.INFO)
    self.fsUtil = FileSystemUtility()
    strLocalDbPath = self.fsUtil.getPackageResourcePath(
        strPackageName=strResFolderPath,
        strResourceName="local.db",
    )
    logging.info("connect to sqlite3 db.")
    self.conn = sqlite3.connect(strLocalDbPath)  # establish the connection
    self.conn.row_factory = sqlite3.Row          # fetch results as sqlite3.Row
def __init__(self):
    """Initialise the exchange-rate crawler state."""
    # sub-command name -> handler method
    self.dicSubCommandHandler = {"yahoo": self.crawlYahooCurrencyPage}
    self.ffUtil = FfUtility()
    self.fileUtil = FilesysUtility()
    # accumulated currency.json records
    self.lstDicParsedCurrencyJson = []
    # selenium driver, created lazily
    self.driver = None
def __init__(self):
    """Initialise the BMG API crawler: auth token, dispatch table, utilities."""
    # API auth code sent with BMG requests
    self.strAuthCode = "uds5e527i008wa7k47gyl4srzy3zywbxpw7ei6oe"
    # sub-command name -> handler method
    self.dicSubCommandHandler = {"bmgapi": self.crawlBMGAPI}
    self.ffUtil = FfUtility()
    self.fileUtil = FilesysUtility()
    # accumulated product.json records
    self.lstDicParsedProductJson = []
def __init__(self):
    """Initialise the VAP products crawler state."""
    # sub-command name -> handler method
    self.dicSubCommandHandler = {
        "download": self.downloadVapProductsXmlZip,
        "unzip": self.unzipVapProductsXmlZip,
        "json": self.crawlVapProductsXml,
    }
    self.ffUtil = FfUtility()
    self.fileUtil = FilesysUtility()
    # accumulated product.json records
    self.lstDicParsedProductJson = []
    # counter used to name the emitted *_product.json files
    self.intProductJsonIndex = 1
def __init__(self):
    """Initialise the Tripbaa crawler state."""
    # sub-command name -> handler method
    self.dicSubCommandHandler = {
        "search": self.crawlSearchPage,
        "product": self.crawlProductPage,
    }
    self.ffUtil = FfUtility()
    self.fileUtil = FilesysUtility()
    self.db = LocalDbForTRIPBAA()
    # accumulated product.json records
    self.lstDicParsedProductJson = []
    # counter used to name the emitted *_product.json files
    self.intProductJsonIndex = 1
    # selenium driver, created lazily
    self.driver = None
def __init__(self):
    """Initialise the KKDAY crawler state."""
    # sub-command name -> handler method
    self.dicSubCommandHandler = {
        "index": self.crawlIndexPage,
        "country": self.crawlCountryPage,
        "product": self.crawlProductPage,
    }
    self.ffUtil = FfUtility()
    self.fileUtil = FilesysUtility()
    self.db = LocalDbForKKDAY()
    # accumulated product.json records
    self.lstDicParsedProductJson = []
    # counter used to name the emitted *_product.json files
    self.intProductJsonIndex = 1
    # selenium driver, created lazily
    self.driver = None
class FileSystemUtilityTest(unittest.TestCase):
    """Unit tests for FileSystemUtility."""

    # prepare: configure logging and build the object under test
    def setUp(self):
        logging.basicConfig(level=logging.INFO)
        self.fsUtil = FileSystemUtility()

    # teardown: nothing to clean up
    def tearDown(self):
        pass

    # getPackageResourcePath should resolve to an existing filesystem path
    def test_getPackageResourcePath(self):
        logging.info("FileSystemUtilityTest.test_getPackageResourcePath")
        strResourcePath = self.fsUtil.getPackageResourcePath(
            strPackageName="bennu_res",
            strResourceName="icon.ico",
        )
        self.assertTrue(os.path.exists(strResourcePath))
class SeleniumTest(unittest.TestCase):
    """Smoke test for the bundled chromedriver + selenium setup."""

    # prepare: start a Chrome driver from the packaged chromedriver.exe
    def setUp(self):
        logging.basicConfig(level=logging.INFO)
        self.fileUtil = FileSystemUtility()
        strChromeDriverPath = self.fileUtil.getPackageResourcePath(
            strPackageName="findfine_crawler.resource",
            strResourceName="chromedriver.exe",
        )
        self.driver = webdriver.Chrome(strChromeDriverPath)

    # teardown: always release the browser
    def tearDown(self):
        self.driver.quit()

    # selenium should load a real page and expose its source
    def test_selenium(self):
        logging.info("SeleniumTest.test_selenium")
        self.driver.get("https://www.kkday.com/en/home")
        source = self.driver.page_source
class ImporterForCITYDISCOVERY:
    """Imports parsed City-Discovery ``*_product.json`` files into the trip DB."""

    # constructor
    def __init__(self):
        self.ffUtil = FfUtility()
        self.filesysUtil = FilesysUtility()
        self.db = LocalDbForJsonImporter()
        #self.db = ExternalDbForJsonImporter()
        # sub-command name -> list of handler methods
        self.dicSubCommandHandler = {"import": [self.importProductJsonToDb]}

    # usage text shown to the operator
    def getUseageMessage(self):
        return (
            "- CITY-DISCOVERY -\n"
            "useage:\n"
            "import - import product.json to database \n"
        )

    # dispatch lstSubcommand[0] to its registered handlers
    def runImporter(self, lstSubcommand=None):
        strSubcommand = lstSubcommand[0]
        strArg1 = lstSubcommand[1] if len(lstSubcommand) == 2 else None
        for handler in self.dicSubCommandHandler[strSubcommand]:
            handler(strArg1)

    # import every product.json record into the DB
    def importProductJsonToDb(self, uselessArg1=None):
        """Mark existing City-Discovery trips out-of-date, then upsert each record."""
        self.db.setTripDataStatusAsOutOfDate(strSource="City-Discovery")
        # locate the parsed json directory and every *_product.json inside it
        strBasedir = self.filesysUtil.getPackageResourcePath(
            strPackageName="findfine_crawler.resource.parsed_json",
            strResourceName="city_discovery",
        )
        lstStrProductJsonFilePath = self.ffUtil.getFilePathListWithSuffixes(
            strBasedir=strBasedir,
            strSuffixes="_product.json",
        )
        for strProductJsonFilePath in lstStrProductJsonFilePath:
            logging.info("read %s"%strProductJsonFilePath)
            lstDicProductData = self.ffUtil.readObjectFromJsonFile(strJsonFilePath=strProductJsonFilePath)
            for dicProductData in lstDicProductData:
                try:
                    # UPSERT one trip; a single bad record must not stop the batch
                    self.db.upsertTrip(dicTripData=dicProductData)
                except Exception as e:
                    logging.warning("insert trip failed: %s"%(str(e)))
class ImporterForExRate:
    """Imports crawled exchange-rate json files into the trip_exrate table."""

    # constructor
    def __init__(self):
        self.ffUtil = FfUtility()
        self.filesysUtil = FilesysUtility()
        self.db = LocalDbForJsonImporter()
        #self.db = ExternalDbForJsonImporter()
        # sub-command name -> list of handler methods
        self.dicSubCommandHandler = {"import": [self.importYahooCurrencyJsonToDb]}

    # usage text shown to the operator
    def getUseageMessage(self):
        return (
            "- ExRate -\n"
            "useage:\n"
            "import - import exrate/*.json to database \n"
        )

    # dispatch lstSubcommand[0] to its registered handlers
    def runImporter(self, lstSubcommand=None):
        strSubcommand = lstSubcommand[0]
        strArg1 = lstSubcommand[1] if len(lstSubcommand) == 2 else None
        for handler in self.dicSubCommandHandler[strSubcommand]:
            handler(strArg1)

    # import every exrate/*.json record into the DB
    def importYahooCurrencyJsonToDb(self, uselessArg1=None):
        """Clear trip_exrate, then upsert each rate record from every json file."""
        self.db.clearExRateData()
        # locate the exrate json directory and every .json inside it
        strBasedir = self.filesysUtil.getPackageResourcePath(
            strPackageName="findfine_crawler.resource.parsed_json",
            strResourceName="exrate",
        )
        lstStrExRateJsonFilePath = self.ffUtil.getFilePathListWithSuffixes(
            strBasedir=strBasedir,
            strSuffixes=".json",
        )
        for strExRateJsonFilePath in lstStrExRateJsonFilePath:
            logging.info("read %s"%strExRateJsonFilePath)
            lstDicExRateData = self.ffUtil.readObjectFromJsonFile(strJsonFilePath=strExRateJsonFilePath)
            for dicExRateData in lstDicExRateData:
                try:
                    # UPDATE or INSERT one rate; keep going on failure
                    self.db.upsertExRate(dicExRateData=dicExRateData)
                except Exception as e:
                    logging.warning("upsert exrate failed: %s"%(str(e)))
class CrawlerForKLOOK:
    """Selenium-based crawler for klook.com.

    Workflow: ``index`` harvests city links, ``city`` harvests product links
    per city, ``product`` parses each product page into a dict and dumps
    batches of 100 into ``*_product.json`` resource files.
    """

    # constructor
    def __init__(self):
        # sub-command name -> handler method
        self.dicSubCommandHandler = { "index":self.crawlIndexPage, "city":self.crawlCityPage, "product":self.crawlProductPage }
        self.ffUtil = FfUtility()
        self.fileUtil = FilesysUtility()
        self.db = LocalDbForKLOOK()
        self.lstDicParsedProductJson = []  # product.json record buffer
        self.intProductJsonIndex = 1  # counter naming the emitted json files
        self.driver = None  # selenium driver, created lazily

    # return spider usage text
    def getUseageMessage(self):
        return (
            "- KLOOK -\n"
            "useage:\n"
            "index - crawl index page of KLOOK \n"
            "city - crawl not obtained city page \n"
            "product [city_page_1_url] - crawl not obtained product page [of given city_page_1_url] \n"
        )

    # build a Chrome driver from the bundled chromedriver.exe
    def getDriver(self):
        chromeDriverExeFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource", strResourceName="chromedriver.exe")
        driver = webdriver.Chrome(chromeDriverExeFilePath)
        return driver

    # initialise the selenium driver (no-op if already running)
    def initDriver(self):
        if not self.driver:
            self.driver = self.getDriver()

    # quit the selenium driver
    def quitDriver(self):
        self.driver.quit()
        self.driver = None

    # restart the selenium driver (used after a crash)
    def restartDriver(self):
        self.quitDriver()
        time.sleep(5)
        self.initDriver()

    # run crawler: dispatch lstSubcommand[0] with its optional argument
    def runCrawler(self, lstSubcommand=None):
        strSubcommand = lstSubcommand[0]
        strArg1 = None
        if len(lstSubcommand) == 2:
            strArg1 = lstSubcommand[1]
        self.initDriver()  # init selenium driver
        self.dicSubCommandHandler[strSubcommand](strArg1)
        self.quitDriver()  # quit selenium driver

    # crawl the index page: store every city link into the local db
    def crawlIndexPage(self, uselessArg1=None):
        logging.info("crawl index page")
        # KLOOK index page
        self.driver.get("https://www.klook.com/")
        # switch the site language to English
        eleLangSelect = self.driver.find_element_by_id("f_lang")
        for eleLangOption in eleLangSelect.find_elements_by_tag_name("option"):
            if eleLangOption.text == "English":
                eleLangOption.click()
                time.sleep(10)
                break
        # parse city links
        lstEleCityA = self.driver.find_elements_by_css_selector("#searchCityList a")
        for eleCityA in lstEleCityA:
            strCityHref = eleCityA.get_attribute("href")
            # save city link to the local db (only real /city/ links)
            if strCityHref.startswith("https://www.klook.com/city/"):
                self.db.insertCityIfNotExists(strCityPage1Url=strCityHref)
                logging.info("save city url: %s"%strCityHref)

    # parse one city listing page; returns how many products were found
    def parseCityPage(self, strCityPage1Url=None):
        intFoundProduct = 0
        # find product links
        elesProductA = self.driver.find_elements_by_css_selector("#activities div.j_activity_item a")
        for eleProductA in elesProductA:
            strProductUrl = eleProductA.get_attribute("href")
            # save product link to the local db; single quotes are rejected
            # (the URL is presumably interpolated into SQL downstream — TODO confirm)
            if strProductUrl.startswith("https://www.klook.com/activity/") and "'" not in strProductUrl:
                logging.info("insert product url: %s"%strProductUrl)
                self.db.insertProductUrlIfNotExists(strProductUrl=strProductUrl, strCityPage1Url=strCityPage1Url)
                intFoundProduct = intFoundProduct+1
        return intFoundProduct

    # crawl every not-yet-obtained city, paging until a page yields no products
    def crawlCityPage(self, uselessArg1=None):
        logging.info("crawl city page")
        # city urls not yet fetched according to the db
        lstStrNotObtainedCityPage1Url = self.db.fetchallNotObtainedCityUrl()
        for strNotObtainedCityPage1Url in lstStrNotObtainedCityPage1Url:
            # extract the city name from the url
            strCityName = re.match("^https://www.klook.com/city/[\d]+-(.*)/$", strNotObtainedCityPage1Url).group(1)
            intCityPageNum = 1  # city page 1
            time.sleep(random.randint(2,5))  # sleep random time
            strCityPageUrl = strNotObtainedCityPage1Url + u"?p=%d"%intCityPageNum  # append page number
            self.driver.get(strCityPageUrl)
            # parse product links on the first page
            intFoundProduct = self.parseCityPage(strCityPage1Url=strNotObtainedCityPage1Url)
            # keep paging while the previous page still produced products
            while intFoundProduct != 0:
                time.sleep(random.randint(5,8))  # sleep random time
                intCityPageNum = intCityPageNum+1
                strCityPageUrl = strNotObtainedCityPage1Url + u"?p=%d"%intCityPageNum  # append page number
                self.driver.get(strCityPageUrl)
                time.sleep(5)  # wait for the page to settle
                intFoundProduct = self.parseCityPage(strCityPage1Url=strNotObtainedCityPage1Url)
            # mark the city as fetched (isGot = 1)
            self.db.updateCityStatusIsGot(strCityPage1Url=strNotObtainedCityPage1Url)
            logging.info("got city %s find %d pages"%(strCityName, intCityPageNum))

    # parse the currently loaded product page into a dict and buffer it
    def parseProductPage(self, strProductUrl=None):
        dicProductJson = {}
        #strSource
        dicProductJson["strSource"] = "KLOOK"
        #strOriginUrl
        dicProductJson["strOriginUrl"] = strProductUrl
        #strUpdateStatus
        dicProductJson["strUpdateStatus"] = "up-to-date"
        #strUpdateTime
        dicProductJson["strUpdateTime"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        #strImageUrl -- extracted from the banner's inline background-image style
        strImageSectionStyle = self.driver.find_element_by_css_selector("section.banner").get_attribute("style")
        strImageSectionStyle = re.sub("[:;\"\s\(\)]", "", strImageSectionStyle).strip()
        # the url may contain non-ASCII characters, so urlencode it first
        strImageUrl = u"https://" + urllib.parse.quote(re.match("^.*https//(res\.klook\.com/images/.*)$", strImageSectionStyle).group(1).strip())
        dicProductJson["strImageUrl"] = strImageUrl
        #strTitle
        strTitle = self.driver.find_element_by_css_selector("section.activity header h1.t_main").text
        dicProductJson["strTitle"] = strTitle.strip()
        #strLocation
        strLocation = self.driver.find_element_by_css_selector("section.activity header p span.icon-label:nth-of-type(1)").text
        dicProductJson["strLocation"] = strLocation.strip()
        #intUsdCost -- strip every non-digit from the displayed price
        strUsdCost = self.driver.find_element_by_css_selector("div.right_price_box span.t_main").text
        strUsdCost = re.sub("[^\d]", "", strUsdCost)
        dicProductJson["intUsdCost"] = int(strUsdCost.strip())
        #intReviewStar -- fixed value; KLOOK stars are not scraped
        dicProductJson["intReviewStar"] = 5
        #intReviewVisitor -- fixed value; KLOOK visitor count is not scraped
        dicProductJson["intReviewVisitor"] = 1
        #strIntroduction -- concatenate the text of every intro element
        strIntroduction = u""
        elesIntroduction = self.driver.find_elements_by_css_selector("section.activity div.j_blank_window.actinfo *")
        for eleIntroduction in elesIntroduction:
            strIntroduction = strIntroduction + u" " + re.sub("\s", " ", eleIntroduction.text.strip())
        dicProductJson["strIntroduction"] = strIntroduction
        #intDurationHour -- parsed from the first info block
        strDurationHour = self.driver.find_element_by_css_selector("section.activity section.j_blank_window.actinfo:nth-of-type(1) div div:nth-of-type(1) p").text
        strDurationHour = re.sub("\s", " ", strDurationHour.lower())
        intDurationHour = self.convertDurationStringToHourInt(strDurtation=strDurationHour)
        dicProductJson["intDurationHour"] = intDurationHour
        #strGuideLanguage -- parsed from the second info block ("language ...")
        strGuideLanguage = self.driver.find_element_by_css_selector("section.activity section.j_blank_window.actinfo:nth-of-type(1) div div:nth-of-type(2) p").text
        strGuideLanguage = re.match("^language (.*)$", re.sub("\s", " ", strGuideLanguage.lower())).group(1)
        dicProductJson["strGuideLanguage"] = strGuideLanguage
        #intOption (to be confirmed)
        dicProductJson["intOption"] = None
        #strStyle (KLOOK has no such data)
        dicProductJson["strStyle"] = None
        self.lstDicParsedProductJson.append(dicProductJson)

    # crawl product pages; with strCityPage1Url == None every completed city is used
    def crawlProductPage(self, strCityPage1Url=None):
        # clear leftover in-memory data
        self.lstDicParsedProductJson = []
        self.intProductJsonIndex = 1
        if not strCityPage1Url:
            # no city given: walk every fully obtained city
            lstStrObtainedCityUrl = self.db.fetchallCompletedObtainedCityUrl()
            for strObtainedCountryUrl in lstStrObtainedCityUrl:
                self.crawlProductPageWithGivenCityUrl(strCityPage1Url=strObtainedCountryUrl)
        else:
            # a specific city url was given
            self.crawlProductPageWithGivenCityUrl(strCityPage1Url=strCityPage1Url)
        # flush the remaining (<100) records to json
        if len(self.lstDicParsedProductJson) > 0:
            strJsonFileName = "%d_product.json"%(self.intProductJsonIndex*100)
            strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.parsed_json.klook", strResourceName=strJsonFileName)
            self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
            self.lstDicParsedProductJson = []

    # crawl the products of one given city url
    def crawlProductPageWithGivenCityUrl(self, strCityPage1Url=None):
        logging.info("crawl product page with city %s"%strCityPage1Url)
        # product urls recorded in the db for this city
        lstStrProductUrl = self.db.fetchallProductUrlByCityUrl(strCityPage1Url=strCityPage1Url)
        for strProductUrl in lstStrProductUrl:
            # skip products that were already downloaded
            if not self.db.checkProductIsGot(strProductUrl=strProductUrl):
                time.sleep(random.randint(5,8))  # sleep random time
                try:
                    self.driver.get(strProductUrl)
                    # switch the displayed currency to USD if necessary
                    strCurrentCurrencyText = self.driver.find_element_by_css_selector("#j_currency a:nth-of-type(1)").text
                    logging.info("目前幣別: %s"%strCurrentCurrencyText)
                    if strCurrentCurrencyText != "USD":
                        logging.info("切換目前幣別至 USD")
                        eleCurrencyLi = self.driver.find_element_by_css_selector("#j_currency")
                        eleUsdA = self.driver.find_element_by_css_selector("#j_currency li a[data-value=USD]")
                        actHoverThenClick = ActionChains(self.driver)
                        actHoverThenClick.move_to_element(eleCurrencyLi).move_to_element(eleUsdA).click().perform()
                        time.sleep(10)  # wait for the currency conversion to finish
                    # parse the product page
                    self.parseProductPage(strProductUrl=strProductUrl)
                    # mark product as fetched (isGot = 1) -- intentionally disabled
                    #self.db.updateProductStatusIsGot(strProductUrl=strProductUrl)
                except Exception as e:
                    logging.warning(str(e))
                    logging.warning("selenium driver crashed. skip get product: %s"%strProductUrl)
                    self.restartDriver()  # restart the driver
            # show progress
            logging.info("進度: %d/100"%len(self.lstDicParsedProductJson))
            # write a full batch of 100 records to json
            if len(self.lstDicParsedProductJson) == 100:
                strJsonFileName = "%d_product.json"%(self.intProductJsonIndex*100)
                strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.parsed_json.klook", strResourceName=strJsonFileName)
                self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
                self.intProductJsonIndex = self.intProductJsonIndex+1
                self.lstDicParsedProductJson = []

    # convert a lowercased duration string ("x hour", "y day") to hours;
    # a day counts as 8 hours, unknown formats default to 1
    def convertDurationStringToHourInt(self, strDurtation=None):
        intDefaultDuration = 1
        if not strDurtation or ("hour" not in strDurtation and "day" not in strDurtation):
            return intDefaultDuration
        else:
            intTotalDurationHour = 0
            mDurationHour = re.search("([\d\.]+) hour", strDurtation)
            mDurationDay = re.search("([\d\.]+) day", strDurtation)
            if mDurationHour:
                intDurationHour = int(float(mDurationHour.group(1)))
                intTotalDurationHour = intTotalDurationHour + intDurationHour
            if mDurationDay:
                intDurationDay = int(float(mDurationDay.group(1)))
                intTotalDurationHour = intTotalDurationHour + (intDurationDay*8)
            return intTotalDurationHour
class CrawlerForKKDAY:
    """Selenium-based crawler for kkday.com.

    Workflow: ``index`` harvests country links, ``country`` harvests product
    links per country, ``product`` parses each product page into a dict and
    dumps batches of 100 into ``*_product.json`` resource files.
    """

    # constructor
    def __init__(self):
        # sub-command name -> handler method
        self.dicSubCommandHandler = { "index":self.crawlIndexPage, "country":self.crawlCountryPage, "product":self.crawlProductPage }
        self.ffUtil = FfUtility()
        self.fileUtil = FilesysUtility()
        self.db = LocalDbForKKDAY()
        self.lstDicParsedProductJson = []  # product.json record buffer
        self.intProductJsonIndex = 1  # counter naming the emitted json files
        self.driver = None  # selenium driver, created lazily

    # return spider usage text
    def getUseageMessage(self):
        return (
            "- KKDAY -\n"
            "useage:\n"
            "index - crawl index page of KKDAY \n"
            "country - crawl not obtained country page \n"
            "product [country_page_1_url] - crawl not obtained product page [of given country_page_1_url] \n"
        )

    # build a Chrome driver from the bundled chromedriver.exe
    def getDriver(self):
        chromeDriverExeFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource", strResourceName="chromedriver.exe")
        driver = webdriver.Chrome(chromeDriverExeFilePath)
        return driver

    # initialise the selenium driver (no-op if already running)
    def initDriver(self):
        if not self.driver:
            self.driver = self.getDriver()

    # quit the selenium driver
    def quitDriver(self):
        self.driver.quit()
        self.driver = None

    # restart the selenium driver (used after a crash)
    def restartDriver(self):
        self.quitDriver()
        time.sleep(5)
        self.initDriver()

    # run crawler: dispatch lstSubcommand[0] with its optional argument
    def runCrawler(self, lstSubcommand=None):
        strSubcommand = lstSubcommand[0]
        strArg1 = None
        if len(lstSubcommand) == 2:
            strArg1 = lstSubcommand[1]
        self.initDriver()  # init selenium driver
        self.dicSubCommandHandler[strSubcommand](strArg1)
        self.quitDriver()  # quit selenium driver

    # crawl the index page: store every country link into the local db
    def crawlIndexPage(self, uselessArg1=None):
        logging.info("crawl index page")
        # KKDAY index page
        self.driver.get("https://www.kkday.com/en/home")
        # open the search panel
        self.driver.find_element_by_css_selector("#header-main-keywordSearch-button").click()
        time.sleep(5)
        # click through the area tabs one by one
        lstEleAreaA = self.driver.find_elements_by_css_selector("#area_country_menu ul.slideTogglePage[role=area] li a")
        for indexOfLstEleAreaA in range(len(lstEleAreaA)):
            lstEleAreaA[indexOfLstEleAreaA].click()
            time.sleep(5)
            # parse the country links revealed by this area
            lstEleCountryA = self.driver.find_elements_by_css_selector("#area_country_menu ul.slideTogglePage[role=country] li a")
            for eleCountryA in lstEleCountryA:
                strCountryHref = eleCountryA.get_attribute("href")
                # save country link to the local db
                self.db.insertCountryIfNotExists(strCountryPage1Url=strCountryHref)
                logging.info("save country url: %s"%strCountryHref)
            # go back and re-fetch the (now stale) area elements
            self.driver.find_element_by_css_selector("#previousBtn").click()
            time.sleep(5)
            lstEleAreaA = self.driver.find_elements_by_css_selector("#area_country_menu ul.slideTogglePage li a")

    # parse one country listing page, saving its product links
    def parseCountryPage(self, strCountryPage1Url=None):
        # find product links
        elesProductA = self.driver.find_elements_by_css_selector("article.product-listview div.product-info-container div div a")
        for eleProductA in elesProductA:
            strProductUrl = eleProductA.get_attribute("href")
            # save product link to the local db
            logging.info("insert product url: %s"%strProductUrl)
            self.db.insertProductUrlIfNotExists(strProductUrl=strProductUrl, strCountryPage1Url=strCountryPage1Url)

    # True when the pagination widget shows a next-page arrow
    def checkNextCountryPageExist(self):
        isNextCountryPageExist = False
        strNextPageAText = self.driver.find_element_by_css_selector("ul.pagination li.a-page:last-child a.toPage").text
        if strNextPageAText and strNextPageAText == "»":
            isNextCountryPageExist = True
        return isNextCountryPageExist

    # crawl every not-yet-obtained country, following pagination
    def crawlCountryPage(self, uselessArg1=None):
        logging.info("crawl country page")
        # country urls not yet fetched according to the db
        lstStrNotObtainedCountryPage1Url = self.db.fetchallNotObtainedCountryUrl()
        for strNotObtainedCountryPage1Url in lstStrNotObtainedCountryPage1Url:
            # extract the country name from the url
            strCountryName = re.match("^https://www.kkday.com/en/product/productlist/.*countryname=(.*)$", strNotObtainedCountryPage1Url).group(1)
            try:
                intCountryPageNum = 1  # country page 1
                time.sleep(random.randint(2,5))  # sleep random time
                strCountryUrlPageSuffix = "&sort=hdesc&page=%d"%intCountryPageNum
                self.driver.get(strNotObtainedCountryPage1Url + strCountryUrlPageSuffix)
                # parse product links on the first page
                self.parseCountryPage(strCountryPage1Url=strNotObtainedCountryPage1Url)
                # check for a next page
                isNextCountryPageExist = self.checkNextCountryPageExist()
                while isNextCountryPageExist:
                    time.sleep(random.randint(5,8))  # sleep random time
                    intCountryPageNum = intCountryPageNum+1
                    strCountryUrlPageSuffix = "&sort=hdesc&page=%d"%intCountryPageNum
                    self.driver.get(strNotObtainedCountryPage1Url + strCountryUrlPageSuffix)
                    # parse product links
                    self.parseCountryPage(strCountryPage1Url=strNotObtainedCountryPage1Url)
                    # check for a next page
                    isNextCountryPageExist = self.checkNextCountryPageExist()
                # mark the country as fetched (isGot = 1)
                self.db.updateCountryStatusIsGot(strCountryPage1Url=strNotObtainedCountryPage1Url)
                logging.info("got country %s find %d pages"%(strCountryName, intCountryPageNum))
            except Exception as e:
                logging.warning(str(e))
                logging.warning("selenium driver crashed. skip get country: %s"%strCountryName)
            finally:
                self.restartDriver()  # restart the driver after every country

    # parse the currently loaded product page into a dict and buffer it
    def parseProductPage(self, strProductUrl=None):
        dicProductJson = {}
        #strSource
        dicProductJson["strSource"] = "KKDAY"
        #strOriginUrl
        dicProductJson["strOriginUrl"] = strProductUrl
        #strUpdateStatus
        dicProductJson["strUpdateStatus"] = "up-to-date"
        #strUpdateTime
        dicProductJson["strUpdateTime"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        #strImageUrl -- extracted from the header photo's inline background-image
        strImageDivStyle = self.driver.find_element_by_css_selector("div#header-imageview div.productPage-photos div.img-bg-full").get_attribute("style")
        strImageDivStyle = re.sub("[:;\"\s\(\)]", "", strImageDivStyle).strip()
        strImageUrl = re.match("^background-imageurl//(img\.kkday\.com/image/.*)$", strImageDivStyle).group(1)
        dicProductJson["strImageUrl"] = "http://" + strImageUrl.strip()
        #strTitle
        strTitle = self.driver.find_element_by_css_selector("div.productview div.container div.productPage-detail h1").text
        dicProductJson["strTitle"] = strTitle.strip()
        #strLocation -- drop the "The location:" label prefix
        strLocation = self.driver.find_element_by_css_selector("div.productview div.container div.productPage-detail div.col-md-pull-4 span.h5").text
        strLocation = re.sub("The location:", "", strLocation)
        dicProductJson["strLocation"] = strLocation.strip()
        #intUsdCost -- price is shown in TWD; convert via the USD/TWD rate
        strTwdCostText = self.driver.find_element_by_css_selector("div.lowestPrice div.text-right h2.h1").text
        strTwdCostText = re.sub("[^\d]", "", strTwdCostText.strip())
        fUsdTwdExrate = self.ffUtil.getUsdExrate(strCurrency="TWD")
        dicProductJson["intUsdCost"] = int(int(strTwdCostText)/fUsdTwdExrate)
        #intReviewStar -- one filled star icon per star
        elesStarI = self.driver.find_elements_by_css_selector("div.div-star span.h5 i.fa-star.text-primary")
        dicProductJson["intReviewStar"] = len(elesStarI)
        #intReviewVisitor -- 0 when no review count is shown
        intReviewVisitor = 0
        elesReviewVisitorSpan = self.driver.find_elements_by_css_selector("div.div-star span.h5 span.text-primary")
        if len(elesReviewVisitorSpan) > 0:
            strReviewVisitorText = elesReviewVisitorSpan[0].text
            intReviewVisitor = int(strReviewVisitorText.strip())
        dicProductJson["intReviewVisitor"] = intReviewVisitor
        #strIntroduction
        strIntroduction = self.driver.find_element_by_css_selector("div.prod-intro span").text
        dicProductJson["strIntroduction"] = strIntroduction.strip()
        #intDurationHour -- "day" values are converted at 24 hours per day
        intDurationHour = 0
        strDurationText = self.driver.find_element_by_css_selector("div.productview div.container div.productPage-detail div.col-md-12 span.h5").text
        strIntInDurationHourText = re.sub("[^\d\.]", "", strDurationText)
        if "hour" in strDurationText:
            intDurationHour = int(float(strIntInDurationHourText))
        elif "day" in strDurationText:
            intDurationHour = int(float(strIntInDurationHourText))*24
        else:
            pass  # unknown unit: keep 0
        dicProductJson["intDurationHour"] = intDurationHour
        #strGuideLanguage -- comma-joined tooltip titles of the language flags
        lstStrGuideLanguage = []
        elesGuideLanguageImg = self.driver.find_elements_by_css_selector("div.productview div.container div.productPage-detail div.guide_lang_image img")
        for eleGuideLanguageImg in elesGuideLanguageImg:
            lstStrGuideLanguage.append(eleGuideLanguageImg.get_attribute("data-original-title").strip())
        dicProductJson["strGuideLanguage"] = ",".join(lstStrGuideLanguage)
        #intOption (to be confirmed)
        dicProductJson["intOption"] = None
        #strStyle (KKDAY has no such data)
        dicProductJson["strStyle"] = None
        self.lstDicParsedProductJson.append(dicProductJson)

    # crawl product pages; with strCountryPage1Url == None every completed country is used
    def crawlProductPage(self, strCountryPage1Url=None):
        # clear leftover in-memory data
        self.lstDicParsedProductJson = []
        self.intProductJsonIndex = 1
        if not strCountryPage1Url:
            # no country given: walk every fully obtained country
            lstStrObtainedCountryUrl = self.db.fetchallCompletedObtainedCountryUrl()
            for strObtainedCountryUrl in lstStrObtainedCountryUrl:
                self.crawlProductPageWithGivenCountryUrl(strCountryPage1Url=strObtainedCountryUrl)
        else:
            # a specific country url was given
            self.crawlProductPageWithGivenCountryUrl(strCountryPage1Url=strCountryPage1Url)
        # flush the remaining (<100) records to json
        if len(self.lstDicParsedProductJson) > 0:
            strJsonFileName = "%d_product.json"%(self.intProductJsonIndex*100)
            strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.parsed_json.kkday", strResourceName=strJsonFileName)
            self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
            self.lstDicParsedProductJson = []

    # crawl the products of one given country url
    def crawlProductPageWithGivenCountryUrl(self, strCountryPage1Url=None):
        logging.info("crawl product page with country %s"%strCountryPage1Url)
        # product urls recorded in the db for this country
        lstStrProductUrl = self.db.fetchallProductUrlByCountryUrl(strCountryPage1Url=strCountryPage1Url)
        for strProductUrl in lstStrProductUrl:
            # skip products that were already downloaded
            if not self.db.checkProductIsGot(strProductUrl=strProductUrl):
                time.sleep(random.randint(5,8))  # sleep random time
                try:
                    self.driver.get(strProductUrl)
                    # parse the product page
                    self.parseProductPage(strProductUrl=strProductUrl)
                    # mark product as fetched (isGot = 1) -- intentionally disabled
                    #self.db.updateProductStatusIsGot(strProductUrl=strProductUrl)
                except Exception as e:
                    logging.warning(str(e))
                    logging.warning("selenium driver crashed. skip get product: %s"%strProductUrl)
                    self.restartDriver()  # restart the driver
            # show progress
            logging.info("進度: %d/100"%len(self.lstDicParsedProductJson))
            # write a full batch of 100 records to json
            if len(self.lstDicParsedProductJson) == 100:
                strJsonFileName = "%d_product.json"%(self.intProductJsonIndex*100)
                strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.parsed_json.kkday", strResourceName=strJsonFileName)
                self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
                self.intProductJsonIndex = self.intProductJsonIndex+1
                self.lstDicParsedProductJson = []
def setUp(self):
    """Start a Chrome driver backed by the packaged chromedriver.exe."""
    logging.basicConfig(level=logging.INFO)
    self.fileUtil = FileSystemUtility()
    strDriverExePath = self.fileUtil.getPackageResourcePath(
        strPackageName="findfine_crawler.resource",
        strResourceName="chromedriver.exe",
    )
    self.driver = webdriver.Chrome(strDriverExePath)
def __init__(self):
    """Initialise importer helpers, target database, and command dispatch."""
    self.ffUtil = FfUtility()
    self.filesysUtil = FilesysUtility()
    self.db = LocalDbForJsonImporter()
    #self.db = ExternalDbForJsonImporter()
    # sub-command name -> list of handler methods
    self.dicSubCommandHandler = {"import": [self.importProductJsonToDb]}
class CrawlerForTRIPBAA:
    """Selenium-based crawler for en.tripbaa.com.

    Workflow: ``search`` harvests product links from the search page,
    ``product`` parses each product page into a dict and dumps batches of
    100 into ``*_product.json`` resource files.
    """

    # constructor
    def __init__(self):
        # sub-command name -> handler method
        self.dicSubCommandHandler = { "search":self.crawlSearchPage, "product":self.crawlProductPage }
        self.ffUtil = FfUtility()
        self.fileUtil = FilesysUtility()
        self.db = LocalDbForTRIPBAA()
        self.lstDicParsedProductJson = []  # product.json record buffer
        self.intProductJsonIndex = 1  # counter naming the emitted json files
        self.driver = None  # selenium driver, created lazily

    # return spider usage text
    def getUseageMessage(self):
        return (
            "- Tripbaa -\n"
            "useage:\n"
            "search - crawl search page of Tripbaa \n"
            "product - crawl not obtained product page \n"
        )

    # build a Chrome driver from the bundled chromedriver.exe
    def getDriver(self):
        chromeDriverExeFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource", strResourceName="chromedriver.exe")
        driver = webdriver.Chrome(chromeDriverExeFilePath)
        return driver

    # initialise the selenium driver (no-op if already running)
    def initDriver(self):
        if not self.driver:
            self.driver = self.getDriver()

    # quit the selenium driver
    def quitDriver(self):
        self.driver.quit()
        self.driver = None

    # restart the selenium driver (used after a crash)
    def restartDriver(self):
        self.quitDriver()
        time.sleep(5)
        self.initDriver()

    # run crawler: dispatch lstSubcommand[0] with its optional argument
    def runCrawler(self, lstSubcommand=None):
        strSubcommand = lstSubcommand[0]
        strArg1 = None
        if len(lstSubcommand) == 2:
            strArg1 = lstSubcommand[1]
        self.initDriver()  # init selenium driver
        self.dicSubCommandHandler[strSubcommand](strArg1)
        self.quitDriver()  # quit selenium driver

    # crawl the search page: expand all results, store product links in the db
    def crawlSearchPage(self, uselessArg1=None):
        logging.info("crawl search page")
        # Tripbaa search page (ccid pre-selects USD display)
        self.driver.get("https://en.tripbaa.com/search.php?&ccid=JCU1IyE=")
        time.sleep(5)
        # keep clicking "more" until the button disappears
        isMoreBtnShow = True
        while isMoreBtnShow:
            if len(self.driver.find_elements_by_css_selector("#morebutton a")) == 0:
                isMoreBtnShow = False
            else:
                self.driver.find_element_by_css_selector("#morebutton a").click()
                time.sleep(5)
        # parse product links
        lstEleProductA = self.driver.find_elements_by_css_selector("ul li div.htBox div.htPic a")
        for eleProductA in lstEleProductA:
            strProductHref = eleProductA.get_attribute("href")
            # save product link to the local db
            if strProductHref.startswith("https://en.tripbaa.com/travel/"):
                strProductUrl = strProductHref + u"?&ccid=JCU1IyE="  # append the USD-display ccid
                self.db.insertProductUrlIfNotExists(strProductUrl=strProductUrl)
                logging.info("save product url: %s"%strProductUrl)

    # parse the currently loaded product page into a dict and buffer it
    def parseProductPage(self, strProductUrl=None):
        dicProductJson = {}
        #strSource
        dicProductJson["strSource"] = "Tripbaa"
        #strOriginUrl
        dicProductJson["strOriginUrl"] = strProductUrl
        #strUpdateStatus
        dicProductJson["strUpdateStatus"] = "up-to-date"
        #strUpdateTime
        dicProductJson["strUpdateTime"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        #strImageUrl -- first photo in the picture list
        strImageUrl = self.driver.find_element_by_css_selector("ul.picBox li:nth-of-type(1) a:nth-of-type(1) img").get_attribute("src")
        dicProductJson["strImageUrl"] = strImageUrl
        #strTitle
        strTitle = self.driver.find_element_by_css_selector("h1.mtTripName").text
        dicProductJson["strTitle"] = strTitle.strip()
        #strLocation -- text before the "/" separator
        strLocation = self.driver.find_element_by_css_selector("div.mtKind p:nth-of-type(1)").text.split("/")[0]
        dicProductJson["strLocation"] = strLocation.strip()
        #intUsdCost -- strip everything but digits and the decimal point
        strUsdCost = self.driver.find_element_by_css_selector("div.mtIntro_NoPic div.okListMoney span.import01 span.blue").text
        strUsdCost = re.sub("[^\d\.]", "", strUsdCost)
        intUsdCost = int(float(strUsdCost.strip()))
        dicProductJson["intUsdCost"] = intUsdCost
        #intReviewStar -- star widget exposes its average as an attribute
        strReviewStar = self.driver.find_element_by_css_selector("div.mtStarBox").get_attribute("data-average")
        intReviewStar = int(float(strReviewStar.strip()))
        dicProductJson["intReviewStar"] = intReviewStar
        #intReviewVisitor -- site shows no count; a random value is fabricated
        dicProductJson["intReviewVisitor"] = random.randint(0, 30)
        #strIntroduction
        strIntroduction = self.driver.find_element_by_css_selector("h2.mtTripInfo").text.strip()
        strIntroduction = re.sub("\s", " ", strIntroduction)
        dicProductJson["strIntroduction"] = strIntroduction
        #intDurationHour -- scan the info spans for a "Duration" entry
        intDurationHour = 0
        lstStrSpanTextInOkList = []
        for eleSpanInOkList in self.driver.find_elements_by_css_selector("div.okList span.import01"):
            lstStrSpanTextInOkList.append(eleSpanInOkList.text)
        for strSpanTextInOkList in lstStrSpanTextInOkList:
            if "Duration" in strSpanTextInOkList:
                strDurationHour = strSpanTextInOkList.strip()
                strDurationHour = re.sub("\s", " ", strDurationHour.lower())
                intDurationHour = self.convertDurationStringToHourInt(strDurtation=strDurationHour)
                break
        dicProductJson["intDurationHour"] = intDurationHour
        #strGuideLanguage -- scan for a "Language" entry; default to english
        strGuideLanguage = "english"
        for strSpanTextInOkList in lstStrSpanTextInOkList:
            if "Language" in strSpanTextInOkList:
                strGuideLanguage = strSpanTextInOkList
                strGuideLanguage = re.sub("[^a-zA-Z]", " ", strGuideLanguage.lower()).strip()
                strGuideLanguage = re.sub("[\s]+", " ", strGuideLanguage).strip()
                strGuideLanguage = re.match("^language (.*)$", strGuideLanguage).group(1).strip()
                break
        dicProductJson["strGuideLanguage"] = strGuideLanguage
        #strStyle -- text after the "/" separator
        strStyle = self.driver.find_element_by_css_selector("div.mtKind p:nth-of-type(1)").text.split("/")[1].strip()
        dicProductJson["strStyle"] = strStyle
        #intOption (to be confirmed)
        dicProductJson["intOption"] = None
        self.lstDicParsedProductJson.append(dicProductJson)

    # crawl every not-yet-obtained product page
    def crawlProductPage(self, uselessArg1=None):
        logging.info("crawl product page")
        # clear leftover in-memory data
        self.lstDicParsedProductJson = []
        self.intProductJsonIndex = 1
        # product urls recorded in the db as not yet fetched
        lstStrProductUrl = self.db.fetchallProductUrl(isGot=False)
        for strProductUrl in lstStrProductUrl:
            # skip products that were already downloaded
            if not self.db.checkProductIsGot(strProductUrl=strProductUrl):
                time.sleep(random.randint(5,8))  # sleep random time
                try:
                    self.driver.get(strProductUrl)
                    # parse the product page
                    self.parseProductPage(strProductUrl=strProductUrl)
                    # mark product as fetched (isGot = 1) -- intentionally disabled
                    #self.db.updateProductStatusIsGot(strProductUrl=strProductUrl)
                except Exception as e:
                    logging.warning(str(e))
                    logging.warning("selenium driver crashed. skip get product: %s"%strProductUrl)
                    self.restartDriver()  # restart the driver
            # show progress
            logging.info("進度: %d/100"%len(self.lstDicParsedProductJson))
            # write a full batch of 100 records to json
            if len(self.lstDicParsedProductJson) == 100:
                strJsonFileName = "%d_product.json"%(self.intProductJsonIndex*100)
                strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.parsed_json.tripbaa", strResourceName=strJsonFileName)
                self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
                self.intProductJsonIndex = self.intProductJsonIndex+1
                self.lstDicParsedProductJson = []
        # flush the remaining (<100) records to json
        if len(self.lstDicParsedProductJson) > 0:
            strJsonFileName = "%d_product.json"%(self.intProductJsonIndex*100)
            strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.parsed_json.tripbaa", strResourceName=strJsonFileName)
            self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
            self.lstDicParsedProductJson = []
            self.intProductJsonIndex = 1

    # convert a lowercased duration string ("x hr", "y day") to hours;
    # a day counts as 8 hours, unknown formats default to 1
    def convertDurationStringToHourInt(self, strDurtation=None):
        intDefaultDuration = 1
        if not strDurtation or ("hr" not in strDurtation and "day" not in strDurtation):
            return intDefaultDuration
        else:
            intTotalDurationHour = 0
            mDurationHour = re.search("([\d\.]+) hr", strDurtation)
            mDurationDay = re.search("([\d\.]+) day", strDurtation)
            if mDurationHour:
                intDurationHour = int(float(mDurationHour.group(1)))
                intTotalDurationHour = intTotalDurationHour + intDurationHour
            if mDurationDay:
                intDurationDay = int(float(mDurationDay.group(1)))
                intTotalDurationHour = intTotalDurationHour + (intDurationDay*8)
            return intTotalDurationHour
# Crawler that scrapes 1-USD-to-foreign-currency exchange rates from Yahoo.
class CrawlerForExRate:
    # constructor
    def __init__(self):
        # subcommand name -> handler method
        self.dicSubCommandHandler = {
            "yahoo":self.crawlYahooCurrencyPage
        }
        self.ffUtil = FfUtility()
        self.fileUtil = FilesysUtility()
        self.lstDicParsedCurrencyJson = []  # parsed currency.json data
        self.driver = None

    # usage text for this spider
    def getUseageMessage(self):
        return (
            "- ExRate -\n"
            "useage:\n"
            "yahoo - crawl yahoo currency page \n"
        )

    # create a selenium driver object
    def getDriver(self):
        chromeDriverExeFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource", strResourceName="chromedriver.exe")
        driver = webdriver.Chrome(chromeDriverExeFilePath)
        return driver

    # initialise the selenium driver (no-op when one already exists)
    def initDriver(self):
        if not self.driver:
            self.driver = self.getDriver()

    # terminate the selenium driver
    def quitDriver(self):
        self.driver.quit()
        self.driver = None

    # restart the selenium driver
    def restartDriver(self):
        self.quitDriver()
        time.sleep(5)
        self.initDriver()

    # run the crawler: dispatch the subcommand to its handler
    def runCrawler(self, lstSubcommand=None):
        strSubcommand = lstSubcommand[0]
        strArg1 = None
        if len(lstSubcommand) == 2:
            strArg1 = lstSubcommand[1]
        self.initDriver() #init selenium driver
        self.dicSubCommandHandler[strSubcommand](strArg1)
        self.quitDriver() #quit selenium driver

    # crawl the yahoo currency page
    def crawlYahooCurrencyPage(self, uselessArg1=None):
        # clear data left over in memory
        self.lstDicParsedCurrencyJson = []
        self.driver.get("https://tw.money.yahoo.com/currency")
        # region tabs: Asia / America / Europe-Africa
        elesAreaTabLi = self.driver.find_elements_by_css_selector("ul.sub-tabs.D-ib li")
        for intAreaTabIndex in range(len(elesAreaTabLi)):
            time.sleep(random.randint(5,10))
            self.driver.find_element_by_css_selector("ul.sub-tabs.D-ib li:nth-of-type(%s)"%str(intAreaTabIndex+1)).click()
            time.sleep(random.randint(5,10))
            # add the USD-to-USD rate of 1.0
            # NOTE(review): this runs once per region tab, so the USD row is
            # appended multiple times — confirm whether that is intended.
            dicExRateData = {}
            dicExRateData["strCurrencyName"] = "USD"
            dicExRateData["fUSDollar"] = 1.0
            strUpdateTime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            dicExRateData["strUpdateTime"] = strUpdateTime
            self.lstDicParsedCurrencyJson.append(dicExRateData)
            # parse the 1-USD-to-foreign-currency rate rows
            elesExRateTr = self.driver.find_elements_by_css_selector("tbody tr.Bd-b")
            for eleExRateTr in elesExRateTr:
                dicExRateData = {}
                #strCurrencyName — three-letter code taken from the row's link URL
                strExRateHref = eleExRateTr.find_element_by_css_selector("td.Ta-start a").get_attribute("href")
                dicExRateData["strCurrencyName"] = re.match("https://tw.money.yahoo.com/currency/USD(...)=X", strExRateHref).group(1)
                #fUSDollar
                strUSDollar = eleExRateTr.find_element_by_css_selector("td.Ta-end:nth-of-type(3)").text
                dicExRateData["fUSDollar"] = float(strUSDollar)
                #dtUpdateTime (strUpdateTime in json)
                strUpdateTime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                dicExRateData["strUpdateTime"] = strUpdateTime
                logging.info("find %s ex-rate: %f USD"%(dicExRateData["strCurrencyName"], dicExRateData["fUSDollar"]))
                self.lstDicParsedCurrencyJson.append(dicExRateData)
        # write the collected data to json
        strJsonFileName = "yahoo_currency.json"
        strExRateJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.parsed_json.exrate", strResourceName=strJsonFileName)
        self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedCurrencyJson, strJsonFilePath=strExRateJsonFilePath)
        self.lstDicParsedCurrencyJson = []
def setUp(self): logging.basicConfig(level=logging.INFO) self.fsUtil = FileSystemUtility()
# Crawler for the BeMyGuest partner API.
class CrawlerForBMG:
    # constructor
    def __init__(self):
        # API auth token, sent as the X-Authorization header on every request
        self.strAuthCode = "uds5e527i008wa7k47gyl4srzy3zywbxpw7ei6oe"
        # subcommand name -> handler method
        self.dicSubCommandHandler = {
            "bmgapi":self.crawlBMGAPI
        }
        self.ffUtil = FfUtility()
        self.fileUtil = FilesysUtility()
        self.lstDicParsedProductJson = []  # parsed product.json data

    # usage text for this spider
    def getUseageMessage(self):
        return (
            "- BeMyGuest -\n"
            "useage:\n"
            "bmgapi - crawl BeMyGuest API product \n"
        )

    # run the crawler: dispatch the subcommand to its handler
    def runCrawler(self, lstSubcommand=None):
        strSubcommand = lstSubcommand[0]
        strArg1 = None
        if len(lstSubcommand) == 2:
            strArg1 = lstSubcommand[1]
        self.dicSubCommandHandler[strSubcommand](strArg1)

    # crawl every product exposed by the BeMyGuest API
    def crawlBMGAPI(self, uselessArg1=None):
        """Fetch rough data for all products, then each product's detail,
        convert to the findfine JSON format and write bmg_product.json.
        Any failure on a single product is logged and skipped."""
        # clear data left over in memory
        self.lstDicParsedProductJson = []
        # rough data of all products
        lstDicProductRoughData = self.getAllProductRoughData()
        for dicProductRoughData in lstDicProductRoughData:
            strProductUUID = dicProductRoughData.get("uuid", None)
            try:
                # detailed product data
                dicProductDetailData = self.getProductDetailData(strProductUUID=strProductUUID)
                # currency sanity-check log
                logging.info("product currency: %s"%dicProductDetailData.get("currency", {}).get("code", str(None)))
                # convert to the findfine data format
                dicProductJson = {}
                #strSource
                dicProductJson["strSource"] = "BeMyGuest"
                #strOriginUrl
                # BUGFIX: the query suffix was mojibake ("...findfinetour¤cy=USD"):
                # "&curren" had been rendered as the HTML entity for "¤".
                # Restore the intended "&currency=USD" parameter.
                dicProductJson["strOriginUrl"] = dicProductDetailData.get("url", None) + u"?partner_id=findfinetour&currency=USD"
                #strUpdateStatus
                dicProductJson["strUpdateStatus"] = "up-to-date"
                #strUpdateTime
                dicProductJson["strUpdateTime"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                #strImageUrl — base photos URL + path of the first photo's original size
                strBasePhotosUrl = dicProductDetailData.get("photosUrl", None)
                strOriginalImgPath = dicProductDetailData.get("photos", [{}])[0].get("paths", {}).get("original", None)
                dicProductJson["strImageUrl"] = strBasePhotosUrl + strOriginalImgPath
                #strTitle
                dicProductJson["strTitle"] = dicProductDetailData.get("title", None)
                #strLocation — unique city/state/country names joined by commas
                lstDicLocation = dicProductDetailData.get("locations", [])
                lstStrLocation = []
                for dicLocation in lstDicLocation:
                    strCity = dicLocation.get("city", None)
                    strState = dicLocation.get("state", None)
                    strCountry = dicLocation.get("country", None)
                    lstStrLocation.append(strCity)
                    lstStrLocation.append(strState)
                    lstStrLocation.append(strCountry)
                lstStrLocation = list(set(lstStrLocation))
                dicProductJson["strLocation"] = ",".join(lstStrLocation)
                #intUsdCost — first positive adult "regular" price found
                fAdultPrice = 0.0
                dicProductType = dicProductDetailData.get("productTypes", [{}])
                dicPrices = dicProductType[0].get("prices", {})
                for strPricesKey in dicPrices.keys():
                    dicPrice = dicPrices.get(strPricesKey, {})
                    dicRegular = dicPrice.get("regular", False)
                    if dicRegular != False:
                        dicAdult = dicRegular.get("adult", {})
                        for strAdultKey in dicAdult.keys():
                            fAdultPrice = dicAdult.get(strAdultKey, 0.0)
                            if fAdultPrice > 0.0:
                                # NOTE(review): this break only exits the inner
                                # loop; a later prices key can overwrite the
                                # found price — confirm intent.
                                break
                logging.info("%s got price: %f USD"%(strProductUUID, fAdultPrice))
                #dicProductJson["intUsdCost"] = int(fAdultPrice/1.39)
                dicProductJson["intUsdCost"] = fAdultPrice
                #intReviewStar
                dicProductJson["intReviewStar"] = int(dicProductDetailData.get("reviewAverageScore", 0))
                #intReviewVisitor
                dicProductJson["intReviewVisitor"] = int(dicProductDetailData.get("reviewCount", 0))
                #strIntroduction
                dicProductJson["strIntroduction"] = dicProductDetailData.get("description", None)
                #intDurationHour — a day counts as 8 hours
                intDays = dicProductType[0].get("durationDays", 0)
                intHours = dicProductType[0].get("durationHours", 0)
                if not intDays:
                    intDays = 0
                if not intHours:
                    intHours = 0
                dicProductJson["intDurationHour"] = (8*intDays) + intHours
                #strGuideLanguage — comma-joined guide language names
                lstDicGuideLanguage = dicProductDetailData.get("guideLanguages", [])
                lstStrName = []
                for dicGuideLanguage in lstDicGuideLanguage:
                    strName = dicGuideLanguage.get("name", None)
                    lstStrName.append(strName)
                dicProductJson["strGuideLanguage"] = ",".join(lstStrName)
                #strStyle — comma-joined category names
                lstDicCategory = dicProductDetailData.get("categories", [])
                lstStrName = []
                for dicCategory in lstDicCategory:
                    strName = dicCategory.get("name", None)
                    lstStrName.append(strName)
                dicProductJson["strStyle"] = ",".join(lstStrName)
                #intOption
                dicProductJson["intOption"] = None
                # append the converted product to the json data
                self.lstDicParsedProductJson.append(dicProductJson)
            except Exception as e:
                logging.warning(str(e))
                logging.warning("crawl product failed, skip: %s"%strProductUUID)
                continue
        # write the collected data to json
        strJsonFileName = "bmg_product.json"
        strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName="findfine_crawler.resource.parsed_json.bmg", strResourceName=strJsonFileName)
        self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
        self.lstDicParsedProductJson = []

    # fetch the rough data of all products (follows pagination)
    def getAllProductRoughData(self):
        lstDicProductRoughData = []
        # first page
        strPage1Url = "https://apidemo.bemyguest.com.sg/v1/products"
        logging.info("get BMG product rough data: %s"%strPage1Url)
        strRespJson = self.sendHttpRequestByUrllib(
            strUrl=strPage1Url,
            dicHeader={"X-Authorization":self.strAuthCode},
            dicData=None,
            strEncoding="utf-8"
        )
        dicRespJson = json.loads(strRespJson)
        lstDicProductRoughData = lstDicProductRoughData + dicRespJson.get("data", [])
        # next page
        strNextPageUrl = dicRespJson.get("meta", {}).get("pagination", {}).get("links", {}).get("next", None)
        while strNextPageUrl:
            # NOTE(review): "currency=[\d]+" only matches numeric currency ids;
            # verify pagination links never carry codes like "USD", which this
            # pattern would leave untouched.
            strNextPageUrl = re.sub("currency=[\d]+", "currency=USD", strNextPageUrl)
            logging.info("get BMG product rough data: %s"%strNextPageUrl)
            strRespJson = self.sendHttpRequestByUrllib(
                strUrl=strNextPageUrl,
                dicHeader={"X-Authorization":self.strAuthCode},
                dicData=None,
                strEncoding="utf-8"
            )
            dicRespJson = json.loads(strRespJson)
            lstDicProductRoughData = lstDicProductRoughData + dicRespJson.get("data", [])
            # page after that
            strNextPageUrl = dicRespJson.get("meta", {}).get("pagination", {}).get("links", {}).get("next", None)
        return lstDicProductRoughData

    # fetch the detailed data of one product
    def getProductDetailData(self, strProductUUID=None):
        logging.info("get BMG product detail data: %s"%strProductUUID)
        strRespJson = self.sendHttpRequestByUrllib(
            strUrl="https://apidemo.bemyguest.com.sg/v1/products/%s?currency=USD"%strProductUUID,
            dicHeader={"X-Authorization":self.strAuthCode},
            dicData=None,
            strEncoding="utf-8"
        )
        dicRespJson = json.loads(strRespJson)
        dicProductDetailData = dicRespJson.get("data", None)
        return dicProductDetailData

    # send an HTTP request via urllib; POST when dicData is given, else GET
    def sendHttpRequestByUrllib(self, strUrl=None, dicHeader=None, dicData=None, strEncoding="utf-8"):
        """Return the decoded response body of the request."""
        # BUGFIX: the header argument used to be a mutable default ({});
        # keep the same behaviour with a None sentinel instead.
        if dicHeader is None:
            dicHeader = {}
        req = None
        if dicData: # dicData given: use POST
            byteEncodedData = urllib.parse.urlencode(dicData).encode(strEncoding)
            req = urllib.request.Request(url=strUrl, data=byteEncodedData, headers=dicHeader, method="POST")
        else: # dicData=None: use GET
            req = urllib.request.Request(url=strUrl, data=None, headers=dicHeader, method="GET")
        response = urllib.request.urlopen(req)
        return response.read().decode(strEncoding)
def __init__(self): self.filesysUtil = FileSystemUtility()
# Grab-bag of filesystem / JSON helpers shared by the crawlers.
class Utility:
    # constructor
    def __init__(self):
        self.filesysUtil = FileSystemUtility()

    # save text to a file, overwriting any existing content
    def overwriteSaveAs(self, strFilePath=None, unicodeData=None):
        with open(strFilePath, "w+", encoding="utf-8") as f:
            f.write(unicodeData)

    # read a json file and return its content as a dict/list object
    def readObjectFromJsonFile(self, strJsonFilePath=None):
        # BUGFIX: json.load() no longer accepts an "encoding" argument
        # (removed in Python 3.9) and raised TypeError; the open() call
        # already decodes the file as UTF-8.
        with open(strJsonFilePath, "r", encoding="utf-8") as jsonFile:
            return json.load(jsonFile)

    # write a dict object's content into a json file
    def writeObjectToJsonFile(self, dicData=None, strJsonFilePath=None):
        with open(strJsonFilePath, "w+", encoding="utf-8") as jsonFile:
            jsonFile.write(json.dumps(dicData, ensure_ascii=False, indent=4, sort_keys=True))

    # list the immediate sub-folder paths of strBasedir
    def getSubFolderPathList(self, strBasedir=None):
        lstStrSubFolderPath = []
        for base, dirs, files in os.walk(strBasedir):
            if base == strBasedir:  # only the top level
                for strDirName in dirs:  # renamed: "dir" shadowed the builtin
                    strFolderPath = base + os.sep + strDirName
                    lstStrSubFolderPath.append(strFolderPath)
        return lstStrSubFolderPath

    # list files directly inside strBasedir whose names end with strSuffixes
    def getFilePathListWithSuffixes(self, strBasedir=None, strSuffixes=None):
        lstStrFilePathWithSuffixes = []
        for base, dirs, files in os.walk(strBasedir):
            if base == strBasedir: #just check base dir
                for strFilename in files:
                    if strFilename.endswith(strSuffixes): #find target files
                        strFilePath = base + os.sep + strFilename
                        lstStrFilePathWithSuffixes.append(strFilePath)
        return lstStrFilePathWithSuffixes

    # recursively list files under strBasedir whose names end with strSuffixes
    def recursiveGetFilePathListWithSuffixes(self, strBasedir=None, strSuffixes=None):
        lstStrFilePathWithSuffixes = []
        for base, dirs, files in os.walk(strBasedir):
            for strFilename in files:
                if strFilename.endswith(strSuffixes): #find target files
                    strFilePath = base + os.sep + strFilename
                    lstStrFilePathWithSuffixes.append(strFilePath)
        return lstStrFilePathWithSuffixes

    # return the creation date ("%Y-%m-%d") of a file
    # NOTE(review): on Unix, getctime() is the metadata-change time, not the
    # creation time — confirm which semantics callers expect.
    def getCtimeOfFile(self, strFilePath=None):
        fCTimeStamp = os.path.getctime(strFilePath)
        dtCTime = datetime.datetime.fromtimestamp(fCTimeStamp)
        strCTime = dtCTime.strftime("%Y-%m-%d")
        return strCTime

    # return the 1-USD-to-strCurrency exchange rate from yahoo_currency.json
    # (0.0 when the currency is not found)
    def getUsdExrate(self, strCurrency=None):
        strJsonPackageName = "findfine_crawler.resource.parsed_json.exrate"
        strJsonFileName = "yahoo_currency.json"
        strExRateJsonFilePath = self.filesysUtil.getPackageResourcePath(strPackageName=strJsonPackageName, strResourceName=strJsonFileName)
        lstDicExRateData = self.readObjectFromJsonFile(strJsonFilePath=strExRateJsonFilePath)
        fUSDollar = 0.0
        for dicExRateData in lstDicExRateData:
            if strCurrency == dicExRateData.get("strCurrencyName", None):
                fUSDollar = dicExRateData.get("fUSDollar", 0.0)
                break
        return fUSDollar
# Crawler for the Viator partner product feed (vapProducts.xml).
class CrawlerForVIATOR:
    # constructor
    def __init__(self):
        # subcommand name -> handler method
        self.dicSubCommandHandler = {
            "download":self.downloadVapProductsXmlZip,
            "unzip":self.unzipVapProductsXmlZip,
            "json":self.crawlVapProductsXml
        }
        self.ffUtil = FfUtility()
        self.fileUtil = FilesysUtility()
        self.lstDicParsedProductJson = []  # parsed product.json data
        self.intProductJsonIndex = 1

    # usage text for this spider
    def getUseageMessage(self):
        return (
            "- VIATOR -\n"
            "useage:\n"
            "download - download vapProducts.xml.zip \n"
            "unzip - unzip vapProducts.xml.zip \n"
            "json - crawl vapProducts.xml then create json \n"
        )

    # run the crawler: dispatch the subcommand to its handler
    def runCrawler(self, lstSubcommand=None):
        strSubcommand = lstSubcommand[0]
        strArg1 = None
        if len(lstSubcommand) == 2:
            strArg1 = lstSubcommand[1]
        self.dicSubCommandHandler[strSubcommand](strArg1)

    # parse vapProducts.xml and produce product json batches
    def crawlVapProductsXml(self, uselessArg1=None):
        """Walk the <Product> elements of vapProducts.xml one at a time,
        convert each to the findfine format, and flush to JSON files in
        batches of 5000."""
        # clear data left over in memory
        self.lstDicParsedProductJson = []
        self.intProductJsonIndex = 1
        # read the products one at a time
        soupProduct = self.findNextProductData()
        while soupProduct: #is not None
            logging.info("find product: %s"%soupProduct.ProductURL.string)
            # convert to the findfine data format
            dicProductJson = {}
            #strSource
            dicProductJson["strSource"] = "Viator"
            #strOriginUrl
            dicProductJson["strOriginUrl"] = soupProduct.ProductURLs.ProductURL.string
            #strUpdateStatus
            dicProductJson["strUpdateStatus"] = "up-to-date"
            #strUpdateTime
            dicProductJson["strUpdateTime"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            #strImageUrl — "#" when the product carries no image
            if soupProduct.ProductImage and soupProduct.ProductImage.ImageURL:
                dicProductJson["strImageUrl"] = soupProduct.ProductImage.ImageURL.string
            else:
                dicProductJson["strImageUrl"] = "#"
            #strTitle
            dicProductJson["strTitle"] = soupProduct.ProductName.string
            #strLocation — unique country/city names joined by commas
            setStrLocation = {soupProduct.Destination.Country.string, soupProduct.Destination.City.string}
            if None in setStrLocation:
                setStrLocation.remove(None)
            dicProductJson["strLocation"] = ",".join(setStrLocation)
            #intUsdCost
            dicProductJson["intUsdCost"] = int(float(soupProduct.Pricing.PriceUSD.string))
            #intReviewStar
            if soupProduct.ProductStarRating and soupProduct.ProductStarRating.AvgRating:
                dicProductJson["intReviewStar"] = int(float(soupProduct.ProductStarRating.AvgRating.string))
            else:
                dicProductJson["intReviewStar"] = 0
            #intReviewVisitor
            dicProductJson["intReviewVisitor"] = 1
            #strIntroduction
            dicProductJson["strIntroduction"] = soupProduct.Introduction.string
            #intDurationHour
            dicProductJson["intDurationHour"] = self.convertDurationStringToHourInt(strDurtation=soupProduct.Duration.string)
            #strGuideLanguage
            dicProductJson["strGuideLanguage"] = "english"
            #strStyle
            if soupProduct.ProductCategory and soupProduct.ProductCategory.Category:
                dicProductJson["strStyle"] = soupProduct.ProductCategory.Category.string
            else:
                dicProductJson["strStyle"] = ""
            #intOption
            #dicProductJson["intOption"] = -1
            # append the converted product to the json data
            self.lstDicParsedProductJson.append(dicProductJson)
            # flush every 5000 products to JSON
            if len(self.lstDicParsedProductJson) == 5000:
                strJsonFileName = "%d_viator_product.json"%(self.intProductJsonIndex*5000)
                strJsonPackageName = "findfine_crawler.resource.parsed_json.viator"
                strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName=strJsonPackageName, strResourceName=strJsonFileName)
                self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
                self.intProductJsonIndex = self.intProductJsonIndex+1
                self.lstDicParsedProductJson = []
            # read the next product
            soupProduct = self.findNextProductData(soupCurrentProduct=soupProduct)
        # write the remaining data to JSON
        strJsonFileName = "%d_viator_product.json"%(self.intProductJsonIndex*5000)
        strJsonPackageName = "findfine_crawler.resource.parsed_json.viator"
        strProductJsonFilePath = self.fileUtil.getPackageResourcePath(strPackageName=strJsonPackageName, strResourceName=strJsonFileName)
        self.ffUtil.writeObjectToJsonFile(dicData=self.lstDicParsedProductJson, strJsonFilePath=strProductJsonFilePath)
        self.lstDicParsedProductJson = []

    # read the next <Product> element from the xml
    def findNextProductData(self, soupCurrentProduct=None):
        if soupCurrentProduct: # is not None: return the following Product sibling
            return soupCurrentProduct.find_next_sibling("Product")
        else: # find the first Product
            strXmlPackageName = "findfine_crawler.resource.source_data.viator"
            strXmlFileName = "vapProducts.xml"
            strXmlFilePath = self.fileUtil.getPackageResourcePath(strPackageName=strXmlPackageName, strResourceName=strXmlFileName)
            with open(strXmlFilePath, "r", encoding="utf-8") as xmlFile:
                soup = BeautifulSoup(xmlFile.read(), "xml")
                soupProduct = soup.Products.find("Product")
                return soupProduct

    # convert a Viator duration string to an hour count
    def convertDurationStringToHourInt(self, strDurtation=None):
        """Return the duration in whole hours; a day counts as 8 hours and
        strings without "hour"/"day" fall back to a default of 1 hour."""
        intDefaultDuration = 1
        if not strDurtation or ("hour" not in strDurtation and "day" not in strDurtation):
            return intDefaultDuration
        else:
            intTotalDurationHour = 0
            # BUGFIX: re.match anchors at the start of the string, so the hour
            # part of combined durations such as "1 day 3 hours" was silently
            # dropped.  Use re.search (as the sibling TRIPBAA crawler does);
            # "[\d\.]+" also accepts decimals such as "1.5 hours".
            mDurationHour = re.search("([\d\.]+) hour", strDurtation)
            mDurationDay = re.search("([\d\.]+) day", strDurtation)
            if mDurationHour:
                intDurationHour = int(float(mDurationHour.group(1)))
                intTotalDurationHour = intTotalDurationHour + intDurationHour
            if mDurationDay:
                intDurationDay = int(float(mDurationDay.group(1)))
                intTotalDurationHour = intTotalDurationHour + (intDurationDay*8)
            return intTotalDurationHour

    # download vapProducts.xml.zip from the Viator partner site
    def downloadVapProductsXmlZip(self, uselessArg1=None):
        # SECURITY(review): partner credentials are hard-coded; move them to
        # configuration or environment variables.
        strPartnerAccount = "19993"
        strPartnerPwd = "a768768a"
        # log in, keeping the session cookie in a cookie jar
        cj = http.cookiejar.MozillaCookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        dicLoginData = urllib.parse.urlencode({
            "adminPUID":strPartnerAccount,
            "login_password":strPartnerPwd
        }).encode("utf-8")
        req = urllib.request.Request("https://www.partner.viator.com/partner/login.jspa", dicLoginData, method="POST")
        response = opener.open(req)
        # download the products feed (same as:
        # wget https://www.partner.viator.com/partner/admin/tools/links_feeds/downloadFeed.jspa?feed=Products&format=xml)
        strUrl = "https://www.partner.viator.com/partner/admin/tools/links_feeds/downloadFeed.jspa?feed=Products&format=xml"
        req = urllib.request.Request(url=strUrl, method="GET")
        response = opener.open(req)
        byteVapProductsXmlZip = response.read()
        # save vapProducts.xml.zip
        strPackageName = "findfine_crawler.resource.source_data.viator"
        strZipFileName = "vapProducts.xml.zip"
        strZipFilePath = self.fileUtil.getPackageResourcePath(strPackageName=strPackageName, strResourceName=strZipFileName)
        with open(strZipFilePath, "bw+") as zipFile:
            zipFile.write(byteVapProductsXmlZip)

    # unzip vapProducts.xml.zip into the viator resource folder
    def unzipVapProductsXmlZip(self, uselessArg1=None):
        strPackageName = "findfine_crawler.resource.source_data.viator"
        strZipFileName = "vapProducts.xml.zip"
        strZipFilePath = self.fileUtil.getPackageResourcePath(strPackageName=strPackageName, strResourceName=strZipFileName)
        with ZipFile(strZipFilePath, "r") as zipFile:
            strPackageName = "findfine_crawler.resource.source_data"
            strXmlBaseFolderPath = self.fileUtil.getPackageResourcePath(strPackageName=strPackageName, strResourceName="viator")
            zipFile.extract("vapProducts.xml", strXmlBaseFolderPath)