def merge_rvcamp_and_pixnet(self, rvcamp_json, pixnet_json):
    """Merge campsite records with their matching pixnet blog posts.

    For each campsite in ``rvcamp_json``, scan the pixnet posts with the
    same ``camp_title``, count style-keyword hits in the post content, and
    assign the best-scoring style plus the matched keywords as tags.

    :param rvcamp_json: list of campsite dicts (each has "camp_title")
    :param pixnet_json: list of blog-post dicts ("camp_title", "content")
    :return: list of campsite dicts that received "style"/"tags"
    """
    ret = list()
    logger.debug("rvcamp_json.keys: {}".format(rvcamp_json[0].keys()))
    logger.debug("pixnet_json.keys: {}".format(pixnet_json[0].keys()))
    # style name -> list of keywords that indicate that style
    style_dict = self.get_camp_style_dict()
    for rvcamp in rvcamp_json:
        style = None
        tmp = 0  # best keyword-match count seen so far for this campsite
        tags = list()
        for pixnet in pixnet_json:
            if rvcamp["camp_title"] == pixnet["camp_title"]:
                content = pixnet["content"]
                if content:
                    for k, v in style_dict.items():
                        matches = 0
                        for w in v:
                            cnt = content.count(w)
                            matches += cnt
                            # record each matched keyword once as a tag
                            if cnt > 0 and tags.count(w) == 0:
                                tags.append(w)
                        if matches > tmp:
                            tmp = matches
                            style = k
                            # NOTE(review): stops at the first style that
                            # beats the running best instead of comparing
                            # all styles — presumably intentional; confirm
                            break
        if style:
            rvcamp["style"] = style
            rvcamp["tags"] = " ".join(tags)
            ret.append(rvcamp)
    return ret
def google_search_extract_pixnet_blog(self, camp_list):
    """Google-search each campsite for pixnet blog posts and extract their text.

    :param camp_list: campsite dicts with "camp_title" and "camp_site"
    :return: list of dicts with camp info, blog url and extracted content
    """
    datas = list()

    def search_filter(url_list):
        # keep only links that point at an actual pixnet blog post
        for candidate in url_list:
            if "pixnet.net/blog/post/" in candidate:
                yield candidate

    delays = [9, 10, 5]
    for idx, camp in enumerate(camp_list):
        # throttle: pause from time to time so Google does not block us
        if idx % random.choice(delays) == 0:
            time.sleep(30)
        camp_title = camp["camp_title"]
        camp_site = camp["camp_site"]
        logger.info("idx: {}, camp_site: {}, camp_title: {}".format(idx, camp_site, camp_title))
        collect_cnt = 1
        max_start = 30
        # search_result = self.google_search("\"露營\"+\"痞客邦\"+\"" + camp_site + "\"", search_filter, collect_cnt,
        #                                    max_start)
        search_result = self.google_search("露營+痞客邦+" + camp_site, search_filter, collect_cnt, max_start)
        logger.debug("search_result: {}".format(search_result))
        for url in search_result:
            content = self.extract_pixnet(url)["text_content"]
            datas.append({
                "camp_site": camp_site,
                "camp_title": camp_title,
                "pixnet_url": url,
                "content": content,  # .replace("\"", "")
            })
    return datas
def extract_evshhips(self):
    """Scrape the evshhips pixnet category page into one record per blog
    post, each tagged with every style section it appears under.

    :return: list of dicts with "style" (list), "title", "url", "content"
    """
    data_list = list()
    url = "https://evshhips.pixnet.net/blog/category/4337992"
    response = requests.get(url)
    response.encoding = "utf-8"  # force utf-8 to avoid mojibake
    html = BeautifulSoup(response.text)
    mylink = html.select_one("#mylink")

    def get_data_by_url(url):
        # return the record already collected for this url, or an empty dict
        for existing in data_list:
            if url == existing.get("url"):
                return existing
        return dict()

    for box in mylink.select("div.inner-box"):
        # only boxes containing an image are real style sections
        if not box.select_one("img"):
            continue
        style = box.select_one("h6").text.strip().split(" ")[0]
        for link in box.select("a"):
            title = link.text
            url = link["href"]
            logger.debug("style: {}, title: {}, url: {}".format(style, title, url))
            content = self.extract_pixnet(url)["text_content"]
            data = get_data_by_url(url)
            if not data.get("url"):
                # first time we see this post: create its record
                data["style"] = list()
                data["title"] = title
                data["url"] = url
                data["content"] = content
                data_list.append(data)
            data["style"].append(style)
    return data_list
def process(keyword, search_filter, collect_cnt, max_start):
    """Run a paged Google search and collect filtered result links.

    :param keyword: search keyword (URL-quoted before use)
    :param search_filter: filter generator applied to each page's url list
    :param collect_cnt: number of links to collect
    :param max_start: maximum paging offset (Google pages step by 10)
    :return: list of collected urls (at most ``collect_cnt``)
    """
    keyword = quote(keyword)
    # fixed: log format was "collect_cnt{}" (missing separator)
    logger.debug("keyword: {}, collect_cnt: {}, max_start: {}".format(
        keyword, collect_cnt, max_start))
    ret = list()
    url_pattern = "https://www.google.com/search?q={}&start={}"
    for start in range(0, max_start, 10):
        url = url_pattern.format(keyword, start)
        try:
            logger.debug("url: {}".format(url))
            response = random_requests_get(url)
            html = BeautifulSoup(response.text)
            # result links of this page: strip Google's /url?q= redirect wrapper
            url_list = [
                unquote(d["href"], "utf-8").replace("/url?q=", "").split("&sa=")[0]
                for d in html.select("h3.r > a")
            ]
            ret.extend(search_filter(url_list))
            if len(ret) == 0:
                break  # nothing matched at all -> further pages will not help
            ret = ret[0:collect_cnt] if len(ret) > collect_cnt else ret
            if len(ret) == collect_cnt:
                break  # collected enough links
        except Exception as e:
            logger.error("Error: {}, url: {}".format(e, url))
    return ret
def json_to_mongodb_pixnet(json_data, drop):
    """Upsert pixnet blog documents into MongoDB (test.blog_pixnet).

    :param json_data: iterable of docs with "camp_title" and "pixnet_url"
    :param drop: when truthy, drop the collection before writing
    """
    conn = None
    try:
        conn = MongoClient(db_config["mongodb"])
        db = conn.test
        coll = db.blog_pixnet
        if drop:
            coll.drop()
        for doc in json_data:
            # upsert keyed on (camp_title, pixnet_url) so reruns don't duplicate
            coll.update(
                {
                    "camp_title": doc["camp_title"],
                    "pixnet_url": doc["pixnet_url"]
                },
                doc,
                upsert=True)
    except Exception as e:
        # fixed: logger.error("Error:", e) passed the exception as a
        # %-format argument with no placeholder, so it was never logged
        logger.error("Error: {}".format(e))
    finally:
        if conn:
            conn.close()
            logger.debug("connection closed ...")
def process(url):
    """Fetch the icamping store list and log details of the first few stores.

    :param url: icamping list-API endpoint returning {"items": [...]}
    """
    response = requests.get(url)
    items = json.loads(response.text)["items"]
    logger.debug(items)
    limit_cnt = 3  # cap on how many stores to detail (applied only when > 0)
    for idx in range(len(items)):
        item = items[idx]
        # fixed: logger.info("{}.".format(idx + 1), item) passed `item` as a
        # %-format argument with no placeholder, so it never appeared in logs
        logger.info("{}. {}".format(idx + 1, item))
        logger.info("no: {}, store_id: {}, store_name: {}".format(
            idx + 1, item["store_id"], item["store_name"]))
        logger.info("address: {}, area: {}, city: {}".format(
            item["address"], item["area"], item["city"]))
        # renamed from `url` so the function parameter is not shadowed
        detail_url = ("https://icamping-prod.appspot.com/_ah/api/icamping_guest"
                      "/v2/query_store_by_store_id?store_id=" + item["store_id"])
        response = requests.get(detail_url)
        content = json.loads(response.text)["items"][0]
        logger.info("description: {}".format(content["description"]))
        external_links = json.loads(content["external_links"])
        for ex_link in external_links:
            logger.info("link_name: {}, link: {}".format(
                ex_link["link_name"], ex_link["link"]))
        photo = json.loads(content["photo"])
        for p in photo:
            logger.info("gcs_url: {}".format(p["gcs_url"]))
        if limit_cnt > 0 and idx == limit_cnt - 1:
            break
        logger.info(
            "------------------------------------------------------------")
def process(url):
    """Crawl the campsite index and convert every campsite page to a dict.

    Walks each region in the "#home-menu" menu, follows the region's
    pagination, and calls ``process_content`` on every campsite link found.

    :param url: index page url of the campsite site
    :return: list of campsite dicts
    """
    total = []
    response = requests.get(url)
    html = BeautifulSoup(response.text)
    menus = html.select_one("#home-menu").select("li > a")
    cnt_area = 0
    bk = False  # manual break flag used to exit the nested loops below
    for menu in menus:
        cnt_area = cnt_area + 1
        cnt_campsite = 0
        murl = menu["href"]
        logger.info("區域: {} ----------------------------".format(menu.text))
        logger.debug("murl: {}".format(murl))
        response = requests.get(murl)
        html = BeautifulSoup(response.text)
        nav = html.select_one("div.nav-links")  # pagination nav area
        if nav is not None:
            last_page_num = int(
                nav.select_one("a.page-numbers:nth-last-of-type(2)")
                ["href"].split("/")[-1])  # the second-to-last link is the last page
            logger.info("總共{}頁".format(last_page_num))
            for num in range(last_page_num):
                pnum = str(num + 1)
                logger.info("{} - 第{}頁 ----------------------------".format(
                    menu.text, pnum))
                page_url = murl + "/page/" + pnum
                logger.debug("page_url: {}".format(page_url))
                response = requests.get(page_url)
                html = BeautifulSoup(response.text)
                campsites = html.select("h2.entry-title-list > a")
                for campsite in campsites:
                    cnt_campsite = cnt_campsite + 1
                    row = dict()
                    # row["_id"] = "campsite_" + format(cnt_area, "02d") + "_" + format(cnt_campsite, "04d")
                    row["location"] = menu.text
                    campsite_url = campsite["href"]
                    process_content(campsite_url, row)
                    logger.info("row: {}".format(row))
                    total.append(row)
                    if False and cnt_area == 1 and cnt_campsite == 10:  # limit crawl count (False = no limit)
                        bk = True  # Python has no labeled break, hence this clumsy flag
                    if bk:
                        break  # <<< end of page campsite for loop
                if bk:
                    break  # <<< end of location page for loop
        if bk:
            break  # <<< end of location menu for loop
    logger.info("total count: {}".format(len(total)))
    return total
def extract_fb_comment(self, camp_list):
    """Collect Facebook review comments for each campsite that links to a
    facebook.com page.

    :param camp_list: campsite dicts with "web_site" (list of {name: url})
    :return: list of dicts with camp info, "fb_url" and scraped "comments"
    """
    datas = list()
    for camp in camp_list:
        # find the first facebook.com link among the campsite's web sites
        fb_url = ""
        for web in camp["web_site"]:
            for link in web.values():
                if "facebook.com" in link:
                    fb_url = link
                if fb_url != "":
                    break
            if fb_url != "":
                break
        if fb_url != "":
            datas.append({
                "camp_site": camp["camp_site"],
                "camp_title": camp["camp_title"],
                "fb_url": fb_url,
            })
    driver = self.init_fb()
    delays = [7, 3, 5, 2, 4]
    for data in datas:
        try:
            url_reviews = data["fb_url"] + "reviews/"
            logger.debug("url_reviews: {}".format(url_reviews))
            driver.get(url_reviews)
            time.sleep(random.choice(delays))
            _len = 0
            while True:
                # simulate mouse-wheel scrolling to trigger lazy loading
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(3)
                reviews = driver.find_elements_by_css_selector(
                    "div[class='_5pbx userContent _3576']")
                logger.info("已載入{}筆意見".format(len(reviews)))
                if _len == len(reviews):
                    break  # same count as last round -> nothing more to load
                _len = len(reviews)
            comments = list()
            for review in reviews:
                # logger.info(
                #     "id: {}, comment: {}".format(review.get_attribute("id"),
                #                                  review.find_element_by_tag_name("p").text))
                comment = review.find_element_by_tag_name("p").text
                if comment and comment.strip() != "":
                    comments.append(comment.strip())
            data["comments"] = comments
        except Exception as e:
            logger.error("Error: {}".format(e))
    return datas
def camplist_to_mongodb(self, json_data, drop):
    """Upsert campsite documents into MongoDB (test.camplist).

    :param json_data: iterable of campsite docs with "camp_title"
    :param drop: when truthy, drop the collection before writing
    """
    conn = None
    try:
        conn = MongoClient(db_config["mongodb"])
        db = conn.test
        coll = db.camplist
        if drop:
            coll.drop()
        for doc in json_data:
            # upsert keyed on camp_title so reruns do not create duplicates
            coll.update({"camp_title": doc["camp_title"]}, doc, upsert=True)
    except Exception as e:
        # fixed: logger.error("Error:", e) treated `e` as a %-format
        # argument with no placeholder, so the exception was never logged
        logger.error("Error: {}".format(e))
    finally:
        if conn:
            conn.close()
            logger.debug("connection closed ...")
def process(keyword, prefix, collect_cnt):
    """Google-image-search ``keyword`` and download up to ``collect_cnt`` images.

    Images are stored under ``<crawler path>/images`` as ``<prefix><NNN>.<ext>``.

    :param keyword: search keyword (URL-quoted before use)
    :param prefix: filename prefix for saved images
    :param collect_cnt: maximum number of images to download
    :return: empty list (kept for interface compatibility)
    """
    driver = Chrome("../chromedriver")
    try:
        driver.set_window_rect(10, 10, 1027, 768)
        img_dir = path_config["crawler"] + "/images"
        if not os.path.exists(img_dir):
            os.makedirs(img_dir)
        keyword = quote(keyword)
        # fixed: log format was "collect_cnt{}" (missing separator)
        logger.debug("keyword: {}, collect_cnt: {}".format(keyword, collect_cnt))
        ret = list()
        url_pattern = "https://www.google.com/search?q={}&source=lnms&tbm=isch&sa=X&ved=0ahUKEwi33-bootHhAhVXyIsBHXN5CAMQ_AUIDigB&biw=1920&bih=979"
        url = url_pattern.format(keyword)
        driver.get(url)
        _len = 0
        while True:
            # simulate mouse-wheel scrolling to trigger lazy loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)  # 2s was sometimes too short, so use 3s
            hSRGPd = driver.find_elements_by_css_selector("a[jsname='hSRGPd']")
            logger.info("已載入{}筆資料".format(len(hSRGPd)))
            if _len == len(hSRGPd):
                break  # same count as last round -> nothing more to load
            _len = len(hSRGPd)
        g_urls = [d.get_attribute("href") for d in hSRGPd]
        delay = [1, 2, 3, 1.5, 2.3, 3.2]
        for i in range(len(g_urls)):
            try:
                g_url = g_urls[i]
                driver.get(g_url)
                time.sleep(random.choice(delay))
                img_url = driver.find_element_by_css_selector(
                    "img[class='irc_mi'][src^='http']").get_attribute("src")
                print("img_url=", img_url)
                fpath = img_dir + "/" + prefix + format(
                    i, "03d") + "." + img_url.split(".")[-1]
                urlretrieve(img_url, fpath)
                # fixed off-by-one: `if i > collect_cnt` downloaded two extra
                # images; stop once collect_cnt downloads have been attempted
                if i + 1 >= collect_cnt:
                    break
            except Exception as e:
                print("Error: {}".format(e))
        return ret
    finally:
        # fixed resource leak: the Chrome driver was never closed
        driver.quit()
def __merge_rvcamp_text1(self, datas, row):
    """Map label/value element pairs from a campsite detail table onto
    ``row`` using English keys.

    :param datas: list of element pairs (label element, value element)
    :param row: dict updated in place with tel/addr/camp_site/web_site/latlong
    """
    keymap = {
        "電話": "tel",
        "地址": "addr",
        "名稱": "camp_site",
        "網站": "web_site",
        "座標": "latlong",
    }
    for data in datas:
        title = keymap.get(data[0].text)
        if title is None:
            # unknown label -> skip, but leave a trace for debugging
            logger.debug("title is None: {}".format(data[0].text))
            continue
        value = data[1]
        if title == "tel":
            content = [span.text for span in value.select("span[itemprop='telephone']")]
        elif title == "web_site":
            content = [{a.text: a["href"]} for a in value.select("a")]
        elif title == "latlong":
            gps = value.select_one("#gps_10")
            content = ",".join(gps.text.split("\xa0")) if gps is not None else "N/A"
        else:
            content = value.text
        row[title] = content
def proces_pixnet_blog(camp_list):
    """Find pixnet blog posts for each campsite via Google search.

    :param camp_list: campsite dicts ("camp_title", "camp_site")
    :return: list of dicts with camp info, blog url and extracted content
    """
    datas = list()
    import crawler.test.google_search as google_search
    import crawler.test.pixnet as pixnet

    def search_filter(url_list):
        # keep only links that point into a pixnet blog
        for candidate in url_list:
            if "pixnet.net/blog" in candidate:
                yield candidate

    delays = [9, 10, 2, 5]
    for idx, camp in enumerate(camp_list):
        # throttle: pause from time to time so Google does not block us
        if idx % random.choice(delays) == 0:
            time.sleep(30)
        camp_title = camp["camp_title"]
        camp_site = camp["camp_site"]
        logger.info("idx: {}, camp_site: {}, camp_title: {}".format(
            idx, camp_site, camp_title))
        collect_cnt = 3
        max_start = 30
        search_result = google_search.process(
            "\"露營\"+\"pixnet\"+\"" + camp_site + "\"", search_filter,
            collect_cnt, max_start)
        logger.debug("search_result: {}".format(search_result))
        for url in search_result:
            content = pixnet.process(url)["text_content"]
            datas.append({
                "camp_site": camp_site,
                "camp_title": camp_title,
                "pixnet_url": url,
                "content": content,  # .replace("\"", "")
            })
    return datas
def json_to_mongodb_rvcamp(json_data, drop):
    """Upsert rvcamp campsite documents into MongoDB (test.rvcamp).

    :param json_data: iterable of campsite docs with "camp_title"
    :param drop: when truthy, drop the collection before writing
    """
    conn = None
    try:
        conn = MongoClient(db_config["mongodb"])
        db = conn.test
        coll = db.rvcamp
        if drop:
            coll.drop()
        for doc in json_data:
            # upsert keyed on camp_title so reruns do not create duplicates
            coll.update({"camp_title": doc["camp_title"]}, doc, upsert=True)
    except Exception as e:
        # fixed: logger.error("Error:", e) treated `e` as a %-format
        # argument with no placeholder, so the exception was never logged
        logger.error("Error: {}".format(e))
    finally:
        if conn:
            conn.close()
            logger.debug("connection closed ...")
def process_content(content_url, row):
    """Fetch one campsite page and fill ``row`` with its title, feature
    badges and detail fields (the latter via ``merge_text1``).

    :param content_url: campsite detail page url
    :param row: dict updated in place
    """
    try:
        logger.debug("content_url: {}".format(content_url))
        response = requests.get(content_url)
        html = BeautifulSoup(response.text)
        logger.info("entry-title: {}".format(
            html.select_one("h1.entry-title").text))
        row["camp_title"] = html.select_one("h1.entry-title").text
        # feature badges: class starts with t-camp-, excluding -none
        # variants and the t-camp-area badge
        selector = ("div[class^='t-camp-']"
                    ":not([class$='-none'])"
                    ":not([class='t-camp-area'])")
        row["features"] = [
            badge.select_one("a").text
            for badge in html.select_one("#text0").select(selector)
        ]
        # each #text1 list item holds label/value span pairs
        text1 = [li.select("span[class^=t-]")
                 for li in html.select_one("#text1").select("li")]
        merge_text1(text1, row)
    except Exception as e:
        logger.error("Error: {}, content_url: {}".format(e, content_url))
def extract_rvcamp(self, limit_count):
    """Crawl the rvcamp site and return up to ``limit_count`` campsites.

    Walks every region in the site menu, follows each region's pagination,
    scrapes every campsite page inline and stops once ``limit_count``
    campsites have been extracted.

    :param limit_count: stop after this many campsites
    :return: list of campsite dicts (json array)
    """
    total = []
    response = requests.get(self.__config["url_rvcamp"])
    html = BeautifulSoup(response.text)
    menus = html.select_one("#home-menu").select("li > a")
    cnt_area = 0
    bk = False  # manual break flag used to exit the nested loops below
    extract_count = 0
    for menu in menus:
        cnt_area = cnt_area + 1
        cnt_campsite = 0
        murl = menu["href"]
        logger.info("區域: {} ----------------------------".format(menu.text))
        logger.debug("murl: {}".format(murl))
        response = requests.get(murl)
        html = BeautifulSoup(response.text)
        nav = html.select_one("div.nav-links")  # pagination nav area
        if nav is not None:
            last_page_num = int(
                nav.select_one("a.page-numbers:nth-last-of-type(2)")["href"].split("/")[-1])  # the second-to-last link is the last page
            logger.info("總共{}頁".format(last_page_num))
            for num in range(last_page_num):
                pnum = str(num + 1)
                logger.info("{} - 第{}頁 ----------------------------".format(menu.text, pnum))
                page_url = murl + "/page/" + pnum
                logger.debug("page_url: {}".format(page_url))
                response = requests.get(page_url)
                html = BeautifulSoup(response.text)
                campsites = html.select("h2.entry-title-list > a")
                for campsite in campsites:
                    extract_count += 1
                    cnt_campsite = cnt_campsite + 1
                    row = dict()
                    # row["_id"] = "campsite_" + format(cnt_area, "02d") + "_" + format(cnt_campsite, "04d")
                    row["location"] = menu.text
                    campsite_url = campsite["href"]
                    try:
                        logger.debug("content_url: {}".format(campsite_url))
                        response = requests.get(campsite_url)
                        html = BeautifulSoup(response.text)
                        logger.info("entry-title: {}".format(html.select_one("h1.entry-title").text))
                        row["camp_title"] = html.select_one("h1.entry-title").text
                        text0 = [t.select_one("a").text for t in html.select_one("#text0").select(
                            "div[class^='t-camp-']"  # class starts with t-camp-
                            + ":not([class$='-none'])"  # class does not end with -none
                            + ":not([class='t-camp-area'])"  # and is not exactly t-camp-area
                        )
                        ]
                        row["features"] = text0
                        text1 = [t.select("span[class^=t-]") for t in html.select_one("#text1").select("li")]
                        self.__merge_rvcamp_text1(text1, row)
                    except Exception as e:
                        logger.error("Error: {}, campsite_url: {}".format(e, campsite_url))
                    logger.info("row: {}".format(row)); total.append(row)
                    # if False and cnt_area == 1 and cnt_campsite == 10:  # limit crawl count (False = no limit)
                    if extract_count == limit_count:
                        bk = True  # Python has no labeled break, hence this clumsy flag
                    if bk:
                        break  # <<< end of page campsite for loop
                if bk:
                    break  # <<< end of location page for loop
        if bk:
            break  # <<< end of location menu for loop
    logger.info("total count: {}".format(len(total)))
    return total  # json array
response = random_requests_get(url) html = BeautifulSoup(response.text) url_list = [ unquote(d["href"], "utf-8").replace("/url?q=", "").split("&sa=")[0] for d in html.select("h3.r > a") ] # 該頁搜尋結果連結 ret.extend(search_filter(url_list)) if len(ret) == 0: break ret = ret[0:collect_cnt] if len(ret) > collect_cnt else ret if len(ret) == collect_cnt: break except Exception as e: logger.error("Error: {}, url: {}".format(e, url)) return ret if __name__ == '__main__': # keyword = "\"露營\"+\"痞客邦\"+\"山林鳥日子\"" keyword = "露營+痞客邦+山林鳥日子" def search_filter(url_list): for u in url_list: if u.find("pixnet.net/blog") != -1: yield u collect_cnt = 3 max_start = 30 result = process(keyword, search_filter, collect_cnt, max_start) logger.debug("result: {}".format(result))
def camplist_to_mysql(self, json_data):
    """Rewrite the camp_* MySQL tables from the campsite json.

    Deletes all rows of camp_webs/camp_tels/camp_features/camp_list and
    re-inserts them from ``json_data`` in a single transaction (rolled
    back on any error).

    :param json_data: campsite dicts with camp_title, camp_site, addr,
        latlong, location, style, tags, features, tel, web_site
    """
    conn = None
    try:
        conn = mysql.connector.connect(**db_config["mysql"])
        cur = conn.cursor()
        # clear dependent tables first so the re-insert starts from empty
        for sql in ("delete from camp_webs",
                    "delete from camp_tels",
                    "delete from camp_features",
                    "delete from camp_list"):
            res = cur.execute(sql)
            logger.debug("sql: {}, res: {}".format(sql, res))
        # main table rows
        ins_datas = []
        for data in json_data:
            ins_datas.append((
                data["camp_title"],
                data["camp_site"],
                data["addr"],
                (data["latlong"] if data["latlong"] else "NA"),
                data["location"],
                data["style"],
                data["tags"]))
        sql = ("insert into camp_list ( \n"
               + "camp_title, camp_site, addr, latlong, location, style, tags \n"
               + ") values ( \n"
               + "%s, %s, %s, %s, %s, %s, %s \n"
               + ")")
        res = cur.executemany(sql, ins_datas)
        logger.debug("sql: {}, res: {}".format(sql, res))
        # child tables: features, phone numbers, web links
        feature_datas = []
        tel_datas = []
        web_datas = []
        for data in json_data:
            camp_title = data["camp_title"]
            for feature in data["features"]:
                feature_datas.append((camp_title, feature))
            for tel in data["tel"]:
                # skip blanks and duplicates (camp_tels has no unique key)
                if tel == "" or tel_datas.count((camp_title, tel)) != 0:
                    # was a stray `print(">>>> ", tel)` debug statement
                    logger.debug(">>>> skip tel: {}".format(tel))
                    continue
                tel_datas.append((camp_title, tel))
            for ws in data["web_site"]:
                for name, link in ws.items():
                    web_datas.append((camp_title, name, link))
        sql = ("insert into camp_features ( \n"
               + "camp_title, feature \n"
               + ") values ( \n"
               + "%s, %s \n"
               + ")")
        res = cur.executemany(sql, feature_datas)
        logger.debug("sql: {}, res: {}".format(sql, res))
        sql = ("insert into camp_tels ( \n"
               + "camp_title, tel \n"
               + ") values ( \n"
               + "%s, %s \n"
               + ")")
        res = cur.executemany(sql, tel_datas)
        logger.debug("sql: {}, res: {}".format(sql, res))
        sql = ("insert into camp_webs ( \n"
               + "camp_title, name, url \n"
               + ") values ( \n"
               + "%s, %s, %s \n"
               + ")")
        res = cur.executemany(sql, web_datas)
        logger.debug("sql: {}, res: {}".format(sql, res))
        conn.commit()
    except Exception as e:
        # fixed: e.with_traceback() requires a traceback argument and
        # raised TypeError inside the handler, masking the original error
        logger.error("Error: {}".format(e))
        if conn:
            conn.rollback()
    finally:
        if conn:
            conn.close()
            logger.debug("conn.close() ...")