Example #1
def json_to_mongodb_pixnet(json_data, drop):
    """
    Write JSON data into MongoDB.
    """
    # import json
    # from bson import json_util
    conn = None
    try:
        conn = MongoClient(db_config["mongodb"])
        db = conn.test
        coll = db.blog_pixnet
        if drop:
            coll.drop()
        for doc in json_data:
            coll.replace_one(  # upsert one document per (camp_title, pixnet_url)
                {
                    "camp_title": doc["camp_title"],
                    "pixnet_url": doc["pixnet_url"]
                },
                doc,
                upsert=True)
    except Exception as e:
        logger.error("Error:", e)
    finally:
        if conn:
            conn.close()
            logger.debug("connection closed ...")
Example #2
def process(keyword, search_filter, collect_cnt, max_start):
    """
    :param keyword: search keyword
    :param search_filter: filter implementation (a generator)
    :param collect_cnt: number of results to collect
    :param max_start: maximum number of paged results to scan (upper bound of the start offset)
    :return: list of collected URLs
    """
    keyword = quote(keyword)
    logger.debug("keyword: {}, collect_cnt{}, max_start: {}".format(
        keyword, collect_cnt, max_start))
    ret = list()
    url_pattern = "https://www.google.com/search?q={}&start={}"

    for start in range(0, max_start, 10):
        url = url_pattern.format(keyword, start)
        try:
            logger.debug("url: {}".format(url))
            response = random_requests_get(url)
            html = BeautifulSoup(response.text)
            url_list = [
                unquote(d["href"], "utf-8").replace("/url?q=",
                                                    "").split("&sa=")[0]
                for d in html.select("h3.r > a")
            ]  # search-result links on this page
            ret.extend(search_filter(url_list))
            if len(ret) == 0:
                break
            ret = ret[0:collect_cnt] if len(ret) > collect_cnt else ret
            if len(ret) == collect_cnt:
                break
        except Exception as e:
            logger.error("Error: {}, url: {}".format(e, url))
    return ret
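`search_filter` is expected to be a generator that keeps only the URLs of interest from each results page. A minimal usage sketch, assuming the hypothetical `pixnet_filter` below and the helpers (`quote`, `random_requests_get`, `logger`) imported elsewhere:

def pixnet_filter(url_list):
    # Hypothetical filter: keep only links that point at pixnet.net blog posts.
    for url in url_list:
        if "pixnet.net/blog" in url:
            yield url

# Collect up to 10 matching URLs, scanning at most 100 Google results (10 per page).
urls = process("camping pixnet", pixnet_filter, collect_cnt=10, max_start=100)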
Example #3
 def extract_fb_comment(self, camp_list):
     datas = list()
     for camp in camp_list:
         web_site = camp["web_site"]
         fb_url = ""
         for web in web_site:
             for v in web.values():
                 if v.find("facebook.com") != -1:
                     fb_url = v
                 if "" != fb_url:
                     break
             if "" != fb_url:
                 break
         if "" != fb_url:
             data = dict()
             data["camp_site"] = camp["camp_site"]
             data["camp_title"] = camp["camp_title"]
             data["fb_url"] = fb_url
             datas.append(data)
     driver = self.init_fb()
     delays = [7, 3, 5, 2, 4]
     for data in datas:
         try:
             url = data["fb_url"]
             url_reviews = url + "reviews/"
             logger.debug("url_reviews: {}".format(url_reviews))
             driver.get(url_reviews)
             time.sleep(random.choice(delays))
             _len = 0
             while True:
                 driver.execute_script(
                     "window.scrollTo(0, document.body.scrollHeight);")  # 處理延遲載入機制(JavaScript模擬滑鼠滾輪下滾)
                 time.sleep(3)
                 reviews = driver.find_elements_by_css_selector("div[class='_5pbx userContent _3576']")
                 logger.info("已載入{}筆意見".format(len(reviews)))
                 if _len == len(reviews):
                     break
                 _len = len(reviews)  # same count as the last pass means nothing more will load
             comments = list()
             for review in reviews:
                 # logger.info(
                 #     "id: {}, comment: {}".format(review.get_attribute("id"),
                 #                                  review.find_element_by_tag_name("p").text))
                 comment = review.find_element_by_tag_name("p").text
                 if comment and "" != comment.strip():
                     comments.append(comment.strip())
             data["comments"] = comments
         except Exception as e:
             logger.error("Error: {}".format(e))
     return datas
Example #4
 def camplist_to_mongodb(self, json_data, drop):
     conn = None
     try:
         conn = MongoClient(db_config["mongodb"])
         db = conn.test
         coll = db.camplist
         if drop:
             coll.drop()
         for doc in json_data:
             coll.update({"camp_title": doc["camp_title"]}, doc, upsert=True)
     except Exception as e:
         logger.error("Error:", e)
     finally:
         if conn:
             conn.close()
             logger.debug("connection closed ...")
Example #5
 def extract_pixnet(self, url):
     ret = dict()
     ret["text_content"] = ""
     try:
         response = requests.get(url)
         response.encoding = "utf-8"  # 解決亂碼問題
         html = BeautifulSoup(response.text)
         # logger.debug(html)
         article_content = html.select_one("div#article-content-inner")
         # text_content = self.__get_text_content(article_content.select("*"))
         text_content = "\n".join(c.strip() for c in article_content.text.split("\n") if "" != c.strip())
         text_content = text_content.replace("\xa0", " ")
         ret["text_content"] = text_content
         # logger.info(ret["text_content"])
     except Exception as e:
         logger.error("Error: {}".format(e))
         ret["text_content"] = "{}".format(e)
     return ret
Example #6
def json_to_mongodb_rvcamp(json_data, drop):
    """
    Write JSON data into MongoDB.
    """
    # import json
    # from bson import json_util
    conn = None
    try:
        conn = MongoClient(db_config["mongodb"])
        db = conn.test
        coll = db.rvcamp
        if drop:
            coll.drop()
        for doc in json_data:
            # doc = json.loads(json.dumps(doc, default=json_util.default))  # convert dicts in the list to JSON -> unnecessary
            # fdoc = coll.find_one({"camp_title": doc["camp_title"]})
            coll.update({"camp_title": doc["camp_title"]}, doc, upsert=True)
    except Exception as e:
        logger.error("Error:", e)
    finally:
        if conn:
            conn.close()
            logger.debug("connection closed ...")
Example #7
def process_content(content_url, row):
    try:
        logger.debug("content_url: {}".format(content_url))
        response = requests.get(content_url)
        html = BeautifulSoup(response.text)
        logger.info("entry-title: {}".format(
            html.select_one("h1.entry-title").text))
        row["camp_title"] = html.select_one("h1.entry-title").text
        text0 = [
            t.select_one("a").text for t in html.select_one("#text0").select(
                "div[class^='t-camp-']"  # 為t-camp-開頭
                + ":not([class$='-none'])"  # 不為-none結尾
                + ":not([class='t-camp-area'])"  # 不為t-camp-area
            )
        ]
        row["features"] = text0
        text1 = [
            t.select("span[class^=t-]")
            for t in html.select_one("#text1").select("li")
        ]
        merge_text1(text1, row)
    except Exception as e:
        logger.error("Error: {}, content_url: {}".format(e, content_url))
Example #8
 def extract_google_images(self, keyword, prefix, collect_cnt, img_dir):
     driver = Chrome("./chromedriver")
     driver.set_window_rect(10, 10, 1027, 768)
     if not os.path.exists(img_dir):
         os.makedirs(img_dir)
     keyword = quote(keyword)
     logger.debug("keyword: {}, collect_cnt{}".format(keyword, collect_cnt))
     url_pattern = "https://www.google.com/search?q={}&source=lnms&tbm=isch&sa=X&ved=0ahUKEwi33-bootHhAhVXyIsBHXN5CAMQ_AUIDigB&biw=1920&bih=979"
     url = url_pattern.format(keyword)
     driver.get(url)
     _len = 0
     while True:
         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # 處理延遲載入機制(JavaScript模擬滑鼠滾輪下滾)
         time.sleep(3)  # 2 seconds is sometimes not enough, so wait 3
         hSRGPd = driver.find_elements_by_css_selector("a[jsname='hSRGPd']")
         logger.info("已載入{}筆資料".format(len(hSRGPd)))
         if _len == len(hSRGPd):
             break
         _len = len(hSRGPd)  # same count as the last pass means nothing more will load
     g_urls = []
     for d in hSRGPd:
         g_url = d.get_attribute("href")
         g_urls.append(g_url)
     delay = [1, 2, 3, 1.5, 2.3, 3.2]
     for i in range(len(g_urls)):
         try:
             g_url = g_urls[i]
             driver.get(g_url)
             time.sleep(random.choice(delay))
             img_url = driver.find_element_by_css_selector("img[class='irc_mi'][src^='http']").get_attribute("src")
             logger.debug("img_url: {}".format(img_url))
             fpath = img_dir + "/" + prefix + format(i, "03d") + "." + img_url.split(".")[-1]
             urlretrieve(img_url, fpath)
             if i > collect_cnt:
                 break
         except Exception as e:
             logger.error("Error: {}".format(e))
Example #9
 def camplist_to_mysql(self, json_data):
     conn = None
     try:
         # conn = MySQLdb.connect(**db_config["mysql"])
         conn = mysql.connector.connect(**db_config["mysql"])
         # conn.autocommit(False)
         cur = conn.cursor()
         sql = "delete from camp_webs"
         res = cur.execute(sql)
         logger.debug("sql: {}, res: {}".format(sql, res))
         sql = "delete from camp_tels"
         res = cur.execute(sql)
         logger.debug("sql: {}, res: {}".format(sql, res))
         sql = "delete from camp_features"
         res = cur.execute(sql)
         logger.debug("sql: {}, res: {}".format(sql, res))
         sql = "delete from camp_list"
         res = cur.execute(sql)
         logger.debug("sql: {}, res: {}".format(sql, res))
         ins_datas = []
         for data in json_data:
             ins_data = (
                 data["camp_title"], data["camp_site"], data["addr"], (data["latlong"] if data["latlong"] else "NA"),
                 data["location"],
                 data["style"], data["tags"])
             ins_datas.append(ins_data)
         sql = ("insert into camp_list ( \n"
                + "camp_title, camp_site, addr, latlong, location, style, tags \n"
                + ") values ( \n"
                + "%s, %s, %s, %s, %s, %s, %s \n"
                + ")")
         res = cur.executemany(sql, ins_datas)
         logger.debug("sql: {}, res: {}".format(sql, res))
         feature_datas = []
         tel_datas = []
         web_datas = []
         for data in json_data:
             camp_title = data["camp_title"]
             for feature in data["features"]:
                 feature_datas.append((camp_title, feature))
             for tel in data["tel"]:
                 if tel == "" or tel_datas.count((camp_title, tel)) != 0:
                     print(">>>> ", tel)
                     continue
                 tel_datas.append((camp_title, tel))
             for ws in data["web_site"]:
                 for item in ws.items():
                     web_datas.append((camp_title, item[0], item[1]))
         sql = ("insert into camp_features ( \n"
                + "camp_title, feature \n"
                + ") values ( \n"
                + "%s, %s \n"
                + ")")
         res = cur.executemany(sql, feature_datas)
         logger.debug("sql: {}, res: {}".format(sql, res))
         sql = ("insert into camp_tels ( \n"
                + "camp_title, tel \n"
                + ") values ( \n"
                + "%s, %s \n"
                + ")")
         res = cur.executemany(sql, tel_datas)
         logger.debug("sql: {}, res: {}".format(sql, res))
         sql = ("insert into camp_webs ( \n"
               + "camp_title, name, url \n"
               + ") values ( \n"
               + "%s, %s, %s \n"
               + ")")
         res = cur.executemany(sql, web_datas)
         logger.debug("sql: {}, res: {}".format(sql, res))
         conn.commit()
     except Exception as e:
         logger.error("Error: {}".format(e.with_traceback()))
         if conn:
             conn.rollback()
     finally:
         if conn:
             conn.close()
             logger.debug("conn.close() ...")
Example #10
 def extract_rvcamp(self, limit_count):
     total = []
     response = requests.get(self.__config["url_rvcamp"])
     html = BeautifulSoup(response.text)
     menus = html.select_one("#home-menu").select("li > a")
     cnt_area = 0
     bk = False
     extract_count = 0
     for menu in menus:
         cnt_area = cnt_area + 1
         cnt_campsite = 0
         murl = menu["href"]
         logger.info("區域: {} ----------------------------".format(menu.text))
         logger.debug("murl: {}".format(murl))
         response = requests.get(murl)
         html = BeautifulSoup(response.text)
         nav = html.select_one("div.nav-links")  # 分頁導覽區域
         if nav is not None:
             last_page_num = int(
                 nav.select_one("a.page-numbers:nth-last-of-type(2)")["href"].split("/")[-1])  # 倒數第2個才是最後一頁
             logger.info("總共{}頁".format(last_page_num))
             for num in range(last_page_num):
                 pnum = str(num + 1)
                 logger.info("{} - 第{}頁 ----------------------------".format(menu.text, pnum))
                 page_url = murl + "/page/" + pnum
                 logger.debug("page_url: {}".format(page_url))
                 response = requests.get(page_url)
                 html = BeautifulSoup(response.text)
                 campsites = html.select("h2.entry-title-list > a")
                 for campsite in campsites:
                     extract_count += 1
                     cnt_campsite = cnt_campsite + 1
                     row = dict()
                     # row["_id"] = "campsite_" + format(cnt_area, "02d") + "_" + format(cnt_campsite, "04d")
                     row["location"] = menu.text
                     campsite_url = campsite["href"]
                     try:
                         logger.debug("content_url: {}".format(campsite_url))
                         response = requests.get(campsite_url)
                         html = BeautifulSoup(response.text)
                         logger.info("entry-title: {}".format(html.select_one("h1.entry-title").text))
                         row["camp_title"] = html.select_one("h1.entry-title").text
                         text0 = [t.select_one("a").text for t in
                                  html.select_one("#text0").select(
                                      "div[class^='t-camp-']"  # 為t-camp-開頭
                                      + ":not([class$='-none'])"  # 不為-none結尾
                                      + ":not([class='t-camp-area'])"  # 不為t-camp-area
                                  )
                                  ]
                         row["features"] = text0
                         text1 = [t.select("span[class^=t-]") for t in html.select_one("#text1").select("li")]
                         self.__merge_rvcamp_text1(text1, row)
                     except Exception as e:
                         logger.error("Error: {}, campsite_url: {}".format(e, campsite_url))
                     logger.info("row: {}".format(row));
                     total.append(row)
                     # if False and cnt_area == 1 and cnt_campsite == 10:  # limit the crawl size (False means no limit)
                     if extract_count == limit_count:
                         bk = True  # Python has no labeled break, so use this flag instead
                     if bk:
                         break
                 # <<< end of page campsite for loop
                 if bk:
                     break
             # <<< end of location page for loop
         if bk:
             break
     # <<< end of location menu for loop
     logger.info("total count: {}".format(len(total)))
     return total  # json array
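The `bk` flag and the repeated `if bk: break` checks exist only because Python has no labeled break. One alternative sketch is to turn the nested loops into a generator and let `itertools.islice` stop the iteration once enough rows have been produced; the inner loops below are stand-ins, not the real crawling code:

from itertools import islice

def iter_rows(menus):
    # Hypothetical restructuring of extract_rvcamp: yield each row as it is built
    # instead of appending to `total` and bailing out with the bk flag.
    for menu in menus:
        for page in range(1, 3):           # stand-in for the real page loop
            for campsite in range(5):      # stand-in for the real campsite loop
                yield {"location": menu, "page": page, "campsite": campsite}

# islice stops consuming the generator (and therefore the nested loops)
# as soon as limit_count rows have been produced.
total = list(islice(iter_rows(["North", "Central", "South"]), 4))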