# Shared imports assumed by the snippets below; logger, db_config and project
# helpers such as random_requests_get / merge_text1 are defined elsewhere.
import os
import random
import time
from urllib.parse import quote, unquote
from urllib.request import urlretrieve

import mysql.connector
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from selenium.webdriver import Chrome


def json_to_mongodb_pixnet(json_data, drop):
    """ Write the JSON documents into MongoDB """
    conn = None
    try:
        conn = MongoClient(db_config["mongodb"])
        db = conn.test
        coll = db.blog_pixnet
        if drop:
            coll.drop()
        for doc in json_data:
            # upsert the full document, keyed by camp_title + pixnet_url
            coll.replace_one(
                {"camp_title": doc["camp_title"], "pixnet_url": doc["pixnet_url"]},
                doc,
                upsert=True)
    except Exception as e:
        logger.error("Error: {}".format(e))
    finally:
        if conn:
            conn.close()
            logger.debug("connection closed ...")
def process(keyword, search_filter, collect_cnt, max_start):
    """
    :param keyword: search keyword
    :param search_filter: filter implementation (generator)
    :param collect_cnt: number of results to collect
    :param max_start: maximum number of paginated results to scan
    :return: list of collected URLs
    """
    keyword = quote(keyword)
    logger.debug("keyword: {}, collect_cnt: {}, max_start: {}".format(
        keyword, collect_cnt, max_start))
    ret = list()
    url_pattern = "https://www.google.com/search?q={}&start={}"
    for start in range(0, max_start, 10):
        url = url_pattern.format(keyword, start)
        try:
            logger.debug("url: {}".format(url))
            response = random_requests_get(url)
            html = BeautifulSoup(response.text, "html.parser")
            # result links on this page
            url_list = [
                unquote(d["href"], "utf-8").replace("/url?q=", "").split("&sa=")[0]
                for d in html.select("h3.r > a")
            ]
            ret.extend(search_filter(url_list))
            if len(ret) == 0:
                break
            ret = ret[0:collect_cnt] if len(ret) > collect_cnt else ret
            if len(ret) == collect_cnt:
                break
        except Exception as e:
            logger.error("Error: {}, url: {}".format(e, url))
    return ret
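# Usage sketch (hypothetical, not part of the original module): `search_filter`
# is any callable that takes one page's list of result URLs and yields only the
# ones to keep. The filter below and the example call are illustrative.
def pixnet_url_filter(url_list):
    for url in url_list:
        if "pixnet.net/blog/post" in url:  # keep only Pixnet blog posts
            yield url

# urls = process("camping review", pixnet_url_filter, collect_cnt=30, max_start=100)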
def extract_fb_comment(self, camp_list):
    datas = list()
    for camp in camp_list:
        web_site = camp["web_site"]
        fb_url = ""
        for web in web_site:
            for v in web.values():
                if v.find("facebook.com") != -1:
                    fb_url = v
                if "" != fb_url:
                    break
            if "" != fb_url:
                break
        if "" != fb_url:
            data = dict()
            data["camp_site"] = camp["camp_site"]
            data["camp_title"] = camp["camp_title"]
            data["fb_url"] = fb_url
            datas.append(data)
    driver = self.init_fb()
    delays = [7, 3, 5, 2, 4]
    for data in datas:
        try:
            url = data["fb_url"]
            url_reviews = url + "reviews/"
            logger.debug("url_reviews: {}".format(url_reviews))
            driver.get(url_reviews)
            time.sleep(random.choice(delays))
            _len = 0
            while True:
                # handle lazy loading: simulate mouse-wheel scrolling via JavaScript
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(3)
                reviews = driver.find_elements_by_css_selector("div[class='_5pbx userContent _3576']")
                logger.info("{} reviews loaded".format(len(reviews)))
                if _len == len(reviews):
                    break  # an unchanged count means there are no more reviews
                _len = len(reviews)
            comments = list()
            for review in reviews:
                comment = review.find_element_by_tag_name("p").text
                if comment and "" != comment.strip():
                    comments.append(comment.strip())
            data["comments"] = comments
        except Exception as e:
            logger.error("Error: {}".format(e))
    return datas
def camplist_to_mongodb(self, json_data, drop):
    """ Write the campsite list into MongoDB """
    conn = None
    try:
        conn = MongoClient(db_config["mongodb"])
        db = conn.test
        coll = db.camplist
        if drop:
            coll.drop()
        for doc in json_data:
            # upsert the full document, keyed by camp_title
            coll.replace_one({"camp_title": doc["camp_title"]}, doc, upsert=True)
    except Exception as e:
        logger.error("Error: {}".format(e))
    finally:
        if conn:
            conn.close()
            logger.debug("connection closed ...")
def extract_pixnet(self, url):
    ret = dict()
    ret["text_content"] = ""
    try:
        response = requests.get(url)
        response.encoding = "utf-8"  # avoid garbled characters
        html = BeautifulSoup(response.text, "html.parser")
        article_content = html.select_one("div#article-content-inner")
        # keep only non-empty lines of the article body
        text_content = "\n".join(
            c.strip() for c in article_content.text.split("\n") if "" != c.strip())
        text_content = text_content.replace("\xa0", " ")
        ret["text_content"] = text_content
    except Exception as e:
        logger.error("Error: {}".format(e))
        ret["text_content"] = "{}".format(e)
    return ret
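# Glue-code sketch (assumed, not in the original source): build one document per
# Pixnet article and hand the list to json_to_mongodb_pixnet above, which upserts
# by camp_title + pixnet_url. The helper name and the `extractor` argument are
# illustrative.
def pixnet_docs_to_mongodb(extractor, camp_title, url_list, drop=False):
    docs = []
    for url in url_list:
        doc = {"camp_title": camp_title, "pixnet_url": url}
        doc.update(extractor.extract_pixnet(url))  # adds "text_content"
        docs.append(doc)
    json_to_mongodb_pixnet(docs, drop)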
def json_to_mongodb_rvcamp(json_data, drop):
    """ Write the JSON documents into MongoDB """
    conn = None
    try:
        conn = MongoClient(db_config["mongodb"])
        db = conn.test
        coll = db.rvcamp
        if drop:
            coll.drop()
        for doc in json_data:
            # upsert the full document, keyed by camp_title
            coll.replace_one({"camp_title": doc["camp_title"]}, doc, upsert=True)
    except Exception as e:
        logger.error("Error: {}".format(e))
    finally:
        if conn:
            conn.close()
            logger.debug("connection closed ...")
def process_content(content_url, row):
    try:
        logger.debug("content_url: {}".format(content_url))
        response = requests.get(content_url)
        html = BeautifulSoup(response.text, "html.parser")
        logger.info("entry-title: {}".format(
            html.select_one("h1.entry-title").text))
        row["camp_title"] = html.select_one("h1.entry-title").text
        text0 = [
            t.select_one("a").text
            for t in html.select_one("#text0").select(
                "div[class^='t-camp-']"            # class starts with t-camp-
                + ":not([class$='-none'])"         # and does not end with -none
                + ":not([class='t-camp-area'])"    # and is not t-camp-area
            )
        ]
        row["features"] = text0
        text1 = [
            t.select("span[class^=t-]")
            for t in html.select_one("#text1").select("li")
        ]
        merge_text1(text1, row)
    except Exception as e:
        logger.error("Error: {}, content_url: {}".format(e, content_url))
def extract_google_images(self, keyword, prefix, collect_cnt, img_dir):
    driver = Chrome("./chromedriver")
    driver.set_window_rect(10, 10, 1027, 768)
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)
    keyword = quote(keyword)
    logger.debug("keyword: {}, collect_cnt: {}".format(keyword, collect_cnt))
    url_pattern = "https://www.google.com/search?q={}&source=lnms&tbm=isch&sa=X&ved=0ahUKEwi33-bootHhAhVXyIsBHXN5CAMQ_AUIDigB&biw=1920&bih=979"
    url = url_pattern.format(keyword)
    driver.get(url)
    _len = 0
    while True:
        # handle lazy loading: simulate mouse-wheel scrolling via JavaScript
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # 2 seconds is sometimes too short, so use 3
        hSRGPd = driver.find_elements_by_css_selector("a[jsname='hSRGPd']")
        logger.info("{} items loaded".format(len(hSRGPd)))
        if _len == len(hSRGPd):
            break  # an unchanged count means nothing more to load
        _len = len(hSRGPd)
    g_urls = []
    for d in hSRGPd:
        g_url = d.get_attribute("href")
        g_urls.append(g_url)
    delay = [1, 2, 3, 1.5, 2.3, 3.2]
    for i in range(len(g_urls)):
        try:
            g_url = g_urls[i]
            driver.get(g_url)
            time.sleep(random.choice(delay))
            img_url = driver.find_element_by_css_selector(
                "img[class='irc_mi'][src^='http']").get_attribute("src")
            logger.debug("img_url: {}".format(img_url))
            fpath = img_dir + "/" + prefix + format(i, "03d") + "." + img_url.split(".")[-1]
            urlretrieve(img_url, fpath)
            if i > collect_cnt:
                break
        except Exception as e:
            logger.error("Error: {}".format(e))
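# Usage sketch (illustrative values only): download roughly 20 images for one
# campsite into ./images with file names like camp_000.jpg; requires a
# chromedriver binary at ./chromedriver as assumed by extract_google_images.
# crawler.extract_google_images("some campsite", prefix="camp_", collect_cnt=20, img_dir="./images")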
def camplist_to_mysql(self, json_data):
    conn = None
    try:
        conn = mysql.connector.connect(**db_config["mysql"])
        cur = conn.cursor()
        # clear the child tables first, then the master list
        sql = "delete from camp_webs"
        res = cur.execute(sql)
        logger.debug("sql: {}, res: {}".format(sql, res))
        sql = "delete from camp_tels"
        res = cur.execute(sql)
        logger.debug("sql: {}, res: {}".format(sql, res))
        sql = "delete from camp_features"
        res = cur.execute(sql)
        logger.debug("sql: {}, res: {}".format(sql, res))
        sql = "delete from camp_list"
        res = cur.execute(sql)
        logger.debug("sql: {}, res: {}".format(sql, res))
        ins_datas = []
        for data in json_data:
            ins_data = (
                data["camp_title"],
                data["camp_site"],
                data["addr"],
                (data["latlong"] if data["latlong"] else "NA"),
                data["location"],
                data["style"],
                data["tags"])
            ins_datas.append(ins_data)
        sql = ("insert into camp_list ( \n"
               + "camp_title, camp_site, addr, latlong, location, style, tags \n"
               + ") values ( \n"
               + "%s, %s, %s, %s, %s, %s, %s \n"
               + ")")
        res = cur.executemany(sql, ins_datas)
        logger.debug("sql: {}, res: {}".format(sql, res))
        feature_datas = []
        tel_datas = []
        web_datas = []
        for data in json_data:
            camp_title = data["camp_title"]
            for feature in data["features"]:
                feature_datas.append((camp_title, feature))
            for tel in data["tel"]:
                if tel == "" or tel_datas.count((camp_title, tel)) != 0:
                    logger.debug("skip empty or duplicate tel: {}".format(tel))
                    continue
                tel_datas.append((camp_title, tel))
            for ws in data["web_site"]:
                for item in ws.items():
                    web_datas.append((camp_title, item[0], item[1]))
        sql = ("insert into camp_features ( \n"
               + "camp_title, feature \n"
               + ") values ( \n"
               + "%s, %s \n"
               + ")")
        res = cur.executemany(sql, feature_datas)
        logger.debug("sql: {}, res: {}".format(sql, res))
        sql = ("insert into camp_tels ( \n"
               + "camp_title, tel \n"
               + ") values ( \n"
               + "%s, %s \n"
               + ")")
        res = cur.executemany(sql, tel_datas)
        logger.debug("sql: {}, res: {}".format(sql, res))
        sql = ("insert into camp_webs ( \n"
               + "camp_title, name, url \n"
               + ") values ( \n"
               + "%s, %s, %s \n"
               + ")")
        res = cur.executemany(sql, web_datas)
        logger.debug("sql: {}, res: {}".format(sql, res))
        conn.commit()
    except Exception as e:
        logger.error("Error: {}".format(e))
        if conn:
            conn.rollback()
    finally:
        if conn:
            conn.close()
            logger.debug("conn.close() ...")
def extract_rvcamp(self, limit_count):
    total = []
    response = requests.get(self.__config["url_rvcamp"])
    html = BeautifulSoup(response.text, "html.parser")
    menus = html.select_one("#home-menu").select("li > a")
    cnt_area = 0
    bk = False
    extract_count = 0
    for menu in menus:
        cnt_area = cnt_area + 1
        cnt_campsite = 0
        murl = menu["href"]
        logger.info("Area: {} ----------------------------".format(menu.text))
        logger.debug("murl: {}".format(murl))
        response = requests.get(murl)
        html = BeautifulSoup(response.text, "html.parser")
        nav = html.select_one("div.nav-links")  # pagination navigation block
        if nav is not None:
            # the second-to-last page-number link points to the last page
            last_page_num = int(
                nav.select_one("a.page-numbers:nth-last-of-type(2)")["href"].split("/")[-1])
            logger.info("{} pages in total".format(last_page_num))
            for num in range(last_page_num):
                pnum = str(num + 1)
                logger.info("{} - page {} ----------------------------".format(menu.text, pnum))
                page_url = murl + "/page/" + pnum
                logger.debug("page_url: {}".format(page_url))
                response = requests.get(page_url)
                html = BeautifulSoup(response.text, "html.parser")
                campsites = html.select("h2.entry-title-list > a")
                for campsite in campsites:
                    extract_count += 1
                    cnt_campsite = cnt_campsite + 1
                    row = dict()
                    row["location"] = menu.text
                    campsite_url = campsite["href"]
                    try:
                        logger.debug("content_url: {}".format(campsite_url))
                        response = requests.get(campsite_url)
                        html = BeautifulSoup(response.text, "html.parser")
                        logger.info("entry-title: {}".format(html.select_one("h1.entry-title").text))
                        row["camp_title"] = html.select_one("h1.entry-title").text
                        text0 = [t.select_one("a").text
                                 for t in html.select_one("#text0").select(
                                     "div[class^='t-camp-']"          # class starts with t-camp-
                                     + ":not([class$='-none'])"       # and does not end with -none
                                     + ":not([class='t-camp-area'])"  # and is not t-camp-area
                                 )]
                        row["features"] = text0
                        text1 = [t.select("span[class^=t-]")
                                 for t in html.select_one("#text1").select("li")]
                        self.__merge_rvcamp_text1(text1, row)
                    except Exception as e:
                        logger.error("Error: {}, campsite_url: {}".format(e, campsite_url))
                    logger.info("row: {}".format(row))
                    total.append(row)
                    if extract_count == limit_count:  # limit the number of crawled entries
                        bk = True  # Python has no labeled break, so use a flag
                    if bk:
                        break  # <<< end of campsite for loop
                if bk:
                    break  # <<< end of page for loop
        if bk:
            break  # <<< end of menu for loop
    logger.info("total count: {}".format(len(total)))
    return total  # JSON array
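# End-to-end sketch (assumed wiring, not part of the original class): crawl the
# campsite list, then persist it with the loaders above. `crawler` and the limit
# value are illustrative.
# camps = crawler.extract_rvcamp(limit_count=100)  # list of camp dicts (camp_title, features, tel, web_site, ...)
# crawler.camplist_to_mongodb(camps, drop=True)    # upsert into the camplist collection
# crawler.camplist_to_mysql(camps)                 # rebuild camp_list and its child tables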