Example #1
 def merge_rvcamp_and_pixnet(self, rvcamp_json, pixnet_json):
     ret = list()
     logger.debug("rvcamp_json.keys: {}".format(rvcamp_json[0].keys()))
     logger.debug("pixnet_json.keys: {}".format(pixnet_json[0].keys()))
     style_dict = self.get_camp_style_dict()
     for rvcamp in rvcamp_json:
         style = None
         tmp = 0
         tags = list()
         for pixnet in pixnet_json:
             if rvcamp["camp_title"] == pixnet["camp_title"]:
                 content = pixnet["content"]
                 if content:
                     for k, v in style_dict.items():
                         matches = 0
                         for w in v:
                             cnt = content.count(w)
                             matches += cnt
                             if cnt > 0 and tags.count(w) == 0:
                                 tags.append(w)
                         if matches > tmp:
                             tmp = matches
                             style = k
                 break
         if style:
             rvcamp["style"] = style
             rvcamp["tags"] = " ".join(tags)
             ret.append(rvcamp)
     return ret
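A minimal, self-contained sketch of the keyword-scoring idea above: count how often each style's keywords appear in a blog text, keep the best-scoring style, and collect every matched keyword as a tag. Names and sample data are illustrative, not from the original project.

def pick_style(content, style_dict):
    # Score each style by keyword occurrences in `content`; also gather matched keywords as tags.
    best_style, best_score, tags = None, 0, []
    for style, keywords in style_dict.items():
        score = 0
        for w in keywords:
            cnt = content.count(w)
            score += cnt
            if cnt > 0 and w not in tags:
                tags.append(w)
        if score > best_score:
            best_score, best_style = score, style
    return best_style, tags

# pick_style("森林營地, 近溪流", {"forest": ["森林"], "river": ["溪流", "溯溪"]})
# -> ("forest", ["森林", "溪流"])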
Example #2
    def google_search_extract_pixnet_blog(self, camp_list):
        datas = list()

        def search_filter(url_list):
            for u in url_list:
                if u.find("pixnet.net/blog/post/") != -1: yield u

        delays = [9, 10, 5]
        for idx in range(len(camp_list)):
            if idx % random.choice(delays) == 0:
                time.sleep(30)
            camp = camp_list[idx]
            camp_title = camp["camp_title"]
            camp_site = camp["camp_site"]
            logger.info("idx: {}, camp_site: {}, camp_title: {}".format(idx, camp_site, camp_title))
            collect_cnt = 1
            max_start = 30
            # search_result = self.google_search("\"露營\"+\"痞客邦\"+\"" + camp_site + "\"", search_filter, collect_cnt,
            #                                    max_start)
            search_result = self.google_search("露營+痞客邦+" + camp_site, search_filter, collect_cnt,
                                               max_start)
            logger.debug("search_result: {}".format(search_result))
            for url in search_result:
                content = self.extract_pixnet(url)["text_content"]
                data = dict()
                data["camp_site"] = camp_site
                data["camp_title"] = camp_title
                data["pixnet_url"] = url
                data["content"] = content  # .replace("\"", "")
                datas.append(data)
        return datas
Example #3
    def extract_evshhips(self):
        data_list = list()
        url = "https://evshhips.pixnet.net/blog/category/4337992"
        response = requests.get(url)
        response.encoding = "utf-8"  # fix mojibake (garbled characters) in the response
        html = BeautifulSoup(response.text)
        mylink = html.select_one("#mylink")

        def get_data_by_url(url):
            data = dict()
            for d in data_list:
                if url == d.get("url"):
                    data = d
                    break
            return data

        for box in mylink.select("div.inner-box"):  # `box` instead of shadowing the built-in `type`
            if box.select_one("img"):
                # logger.debug(box)
                style = box.select_one("h6").text.strip().split(" ")[0]
                url_list = box.select("a")
                for u in url_list:
                    title = u.text
                    url = u["href"]
                    logger.debug("style: {}, title: {}, url: {}".format(style, title, url))
                    content = self.extract_pixnet(url)["text_content"]
                    data = get_data_by_url(url)
                    if not data.get("url"):
                        data["style"] = list()
                        data["title"] = title
                        data["url"] = url
                        data["content"] = content
                        data_list.append(data)
                    data["style"].append(style)
        return data_list
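The linear get_data_by_url lookup above can also be expressed as a dict keyed by URL; a small self-contained sketch of the same "accumulate styles per URL" pattern (sample data is illustrative):

def accumulate(entries):
    # entries: iterable of (style, title, url); one output record per distinct url.
    by_url = {}
    for style, title, url in entries:
        data = by_url.setdefault(url, {"style": [], "title": title, "url": url})
        data["style"].append(style)
    return list(by_url.values())

# accumulate([("親子", "A營地", "u1"), ("森林", "A營地", "u1")])
# -> [{"style": ["親子", "森林"], "title": "A營地", "url": "u1"}]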
Example #4
def process(keyword, search_filter, collect_cnt, max_start):
    """
    :param keyword: search keyword
    :param search_filter: filter implementation (a generator)
    :param collect_cnt: number of URLs to collect
    :param max_start: maximum paging offset (upper bound for the start parameter)
    :return: list of matched URLs
    """
    keyword = quote(keyword)
    logger.debug("keyword: {}, collect_cnt{}, max_start: {}".format(
        keyword, collect_cnt, max_start))
    ret = list()
    url_pattern = "https://www.google.com/search?q={}&start={}"

    for start in range(0, max_start, 10):
        url = url_pattern.format(keyword, start)
        try:
            logger.debug("url: {}".format(url))
            response = random_requests_get(url)
            html = BeautifulSoup(response.text)
            url_list = [
                unquote(d["href"], "utf-8").replace("/url?q=",
                                                    "").split("&sa=")[0]
                for d in html.select("h3.r > a")
            ]  # result links on this page
            ret.extend(search_filter(url_list))
            if len(ret) == 0:
                break
            ret = ret[0:collect_cnt] if len(ret) > collect_cnt else ret
            if len(ret) == collect_cnt:
                break
        except Exception as e:
            logger.error("Error: {}, url: {}".format(e, url))
    return ret
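For reference, the href clean-up inside the loop is pure string handling; a standalone sketch with a made-up result link (the real hrefs come from Google's result page):

from urllib.parse import unquote

href = "/url?q=https://foo.pixnet.net/blog/post/123&sa=U&ved=abc"  # hypothetical href
clean = unquote(href, "utf-8").replace("/url?q=", "").split("&sa=")[0]
# clean == "https://foo.pixnet.net/blog/post/123"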
Example #5
def json_to_mongodb_pixnet(json_data, drop):
    """
    Write the JSON documents into MongoDB
    """
    # import json
    # from bson import json_util
    conn = None
    try:
        conn = MongoClient(db_config["mongodb"])
        db = conn.test
        coll = db.blog_pixnet
        if drop:
            coll.drop()
        for doc in json_data:
            coll.update(
                {
                    "camp_title": doc["camp_title"],
                    "pixnet_url": doc["pixnet_url"]
                },
                doc,
                upsert=True)
    except Exception as e:
        logger.error("Error:", e)
    finally:
        if conn:
            conn.close()
            logger.debug("connection closed ...")
Example #6
def process(url):
    response = requests.get(url)
    items = json.loads(response.text)["items"]
    logger.debug(items)
    limit_cnt = 3  # cap on the number of items processed (only applied when > 0)
    for idx in range(len(items)):
        item = items[idx]
        logger.info("{}.".format(idx + 1), item)
        logger.info("no: {}, store_id: {}, store_name: {}".format(
            idx + 1, item["store_id"], item["store_name"]))
        logger.info("address: {}, area: {}, city: {}".format(
            item["address"], item["area"], item["city"]))
        url = "https://icamping-prod.appspot.com/_ah/api/icamping_guest/v2/query_store_by_store_id?store_id=" + item[
            "store_id"]
        response = requests.get(url)
        content = json.loads(response.text)["items"][0]
        logger.info("description: {}".format(content["description"]))
        external_links = json.loads(content["external_links"])
        for ex_link in external_links:
            logger.info("link_name: {}, link: {}".format(
                ex_link["link_name"], ex_link["link"]))
        photo = json.loads(content["photo"])
        for p in photo:
            logger.info("gcs_url: {}".format(p["gcs_url"]))
        if limit_cnt > 0 and idx == limit_cnt - 1:
            break
        logger.info(
            "------------------------------------------------------------")
Example #7
def process(url):
    """
    Convert the crawled pages into a list of JSON records
    """
    total = []
    response = requests.get(url)
    html = BeautifulSoup(response.text)
    menus = html.select_one("#home-menu").select("li > a")
    cnt_area = 0
    bk = False
    for menu in menus:
        cnt_area = cnt_area + 1
        cnt_campsite = 0
        murl = menu["href"]
        logger.info("區域: {} ----------------------------".format(menu.text))
        logger.debug("murl: {}".format(murl))
        response = requests.get(murl)
        html = BeautifulSoup(response.text)
        nav = html.select_one("div.nav-links")  # pagination nav block
        if nav is not None:
            last_page_num = int(
                nav.select_one("a.page-numbers:nth-last-of-type(2)")
                ["href"].split("/")[-1])  # 倒數第2個才是最後一頁
            logger.info("總共{}頁".format(last_page_num))
            for num in range(last_page_num):
                pnum = str(num + 1)
                logger.info("{} - 第{}頁 ----------------------------".format(
                    menu.text, pnum))
                page_url = murl + "/page/" + pnum
                logger.debug("page_url: {}".format(page_url))
                response = requests.get(page_url)
                html = BeautifulSoup(response.text)
                campsites = html.select("h2.entry-title-list > a")
                for campsite in campsites:
                    cnt_campsite = cnt_campsite + 1
                    row = dict()
                    # row["_id"] = "campsite_" + format(cnt_area, "02d") + "_" + format(cnt_campsite, "04d")
                    row["location"] = menu.text
                    campsite_url = campsite["href"]
                    process_content(campsite_url, row)
                    logger.info("row: {}".format(row))
                    total.append(row)
                    if False and cnt_area == 1 and cnt_campsite == 10:  # cap on crawled items (False disables the cap)
                        bk = True  # Python has no labeled break, so use a flag instead
                    if bk:
                        break
                # <<< end of page campsite for loop
                if bk:
                    break
            # <<< end of location page for loop
        if bk:
            break
    # <<< end of location menu for loop
    logger.info("total count: {}".format(len(total)))
    return total
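Python has no labeled break, which is why the bk flag is threaded through the nested loops above. One common alternative is to wrap the loops in a function and return early; a self-contained sketch with plain data:

def take_first_n(groups, n):
    out = []
    for group in groups:
        for item in group:
            out.append(item)
            if len(out) == n:
                return out  # the return replaces the multi-level break
    return out

# take_first_n([[1, 2], [3, 4], [5]], 3) -> [1, 2, 3]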
Example #8
 def extract_fb_comment(self, camp_list):
     datas = list()
     for camp in camp_list:
         web_site = camp["web_site"]
         fb_url = ""
         for web in web_site:
             for v in web.values():
                 if v.find("facebook.com") != -1:
                     fb_url = v
                 if "" != fb_url:
                     break
             if "" != fb_url:
                 break
         if "" != fb_url:
             data = dict()
             data["camp_site"] = camp["camp_site"]
             data["camp_title"] = camp["camp_title"]
             data["fb_url"] = fb_url
             datas.append(data)
     driver = self.init_fb()
     delays = [7, 3, 5, 2, 4]
     for data in datas:
         try:
             url = data["fb_url"]
             url_reviews = url + "reviews/"
             logger.debug("url_reviews: {}".format(url_reviews))
             driver.get(url_reviews)
             time.sleep(random.choice(delays))
             _len = 0
             while True:
                 driver.execute_script(
                     "window.scrollTo(0, document.body.scrollHeight);")  # 處理延遲載入機制(JavaScript模擬滑鼠滾輪下滾)
                 time.sleep(3)
                 reviews = driver.find_elements_by_css_selector("div[class='_5pbx userContent _3576']")
                 logger.info("已載入{}筆意見".format(len(reviews)))
                 if _len == len(reviews):
                     break
                 _len = len(reviews)  # unchanged count means no more reviews were loaded
             comments = list()
             for review in reviews:
                 # logger.info(
                 #     "id: {}, comment: {}".format(review.get_attribute("id"),
                 #                                  review.find_element_by_tag_name("p").text))
                 comment = review.find_element_by_tag_name("p").text
                 if comment and "" != comment.strip():
                     comments.append(comment.strip())
             data["comments"] = comments
         except Exception as e:
             logger.error("Error: {}".format(e))
     return datas
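find_elements_by_css_selector was deprecated and later removed in Selenium 4; the same scroll-until-stable loop with the current API looks roughly like this (a sketch: assumes an already-initialized driver, and the CSS class is the one used above, which Facebook may change):

import time
from selenium.webdriver.common.by import By

def load_all_reviews(driver, css="div[class='_5pbx userContent _3576']", pause=3):
    seen = 0
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        reviews = driver.find_elements(By.CSS_SELECTOR, css)
        if len(reviews) == seen:  # unchanged count means nothing new was lazy-loaded
            return reviews
        seen = len(reviews)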
Example #9
 def camplist_to_mongodb(self, json_data, drop):
     conn = None
     try:
         conn = MongoClient(db_config["mongodb"])
         db = conn.test
         coll = db.camplist
         if drop:
             coll.drop()
         for doc in json_data:
             coll.update({"camp_title": doc["camp_title"]}, doc, upsert=True)
     except Exception as e:
         logger.error("Error:", e)
     finally:
         if conn:
             conn.close()
             logger.debug("connection closed ...")
Example #10
def process(keyword, prefix, collect_cnt):
    driver = Chrome("../chromedriver")
    driver.set_window_rect(10, 10, 1027, 768)
    img_dir = path_config["crawler"] + "/images"
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)
    keyword = quote(keyword)
    logger.debug("keyword: {}, collect_cnt{}".format(keyword, collect_cnt))
    ret = list()
    url_pattern = "https://www.google.com/search?q={}&source=lnms&tbm=isch&sa=X&ved=0ahUKEwi33-bootHhAhVXyIsBHXN5CAMQ_AUIDigB&biw=1920&bih=979"
    url = url_pattern.format(keyword)
    driver.get(url)
    _len = 0
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);"
                              )  # 處理延遲載入機制(JavaScript模擬滑鼠滾輪下滾)
        time.sleep(3)  # 2秒有時會來不及, 所以改用3秒
        hSRGPd = driver.find_elements_by_css_selector("a[jsname='hSRGPd']")
        logger.info("已載入{}筆資料".format(len(hSRGPd)))
        if _len == len(hSRGPd):
            break
        _len = len(hSRGPd)  # unchanged count means nothing new was loaded
    g_urls = []
    for d in hSRGPd:
        g_url = d.get_attribute("href")
        g_urls.append(g_url)
    delay = [1, 2, 3, 1.5, 2.3, 3.2]
    for i in range(len(g_urls)):
        try:
            g_url = g_urls[i]
            # print("g_url=", g_url)
            driver.get(g_url)
            time.sleep(random.choice(delay))
            img_url = driver.find_element_by_css_selector(
                "img[class='irc_mi'][src^='http']").get_attribute("src")
            print("img_url=", img_url)
            fpath = img_dir + "/" + prefix + format(
                i, "03d") + "." + img_url.split(".")[-1]
            urlretrieve(img_url, fpath)
            if i > collect_cnt:
                break
        except Exception as e:
            print("Error: {}".format(e))
    return ret
Example #11
 def __merge_rvcamp_text1(self, datas, row):
     keymap = dict((("電話", "tel"), ("地址", "addr"), ("名稱", "camp_site"), ("網站", "web_site"), ("座標", "latlong")))  # map the site's Chinese labels to field names
     for data in datas:
         title = keymap.get(data[0].text)
         if title is None:
             logger.debug("title is None: {}".format(data[0].text))
             continue
         content = data[1]
         if "tel" == title:
             content = [d.text for d in content.select("span[itemprop='telephone']")]
         elif "web_site" == title:
             content = [{d.text: d["href"]} for d in content.select("a")]
         elif "latlong" == title:
             if content.select_one("#gps_10") is not None:
                 content = ",".join(content.select_one("#gps_10").text.split("\xa0"))
             else:
                 content = "N/A"
         else:
             content = content.text
         row[title] = content
Example #12
def proces_pixnet_blog(camp_list):
    """
    Find the Pixnet blog posts for each campsite in the JSON list
    :param camp_list: campsite info JSON
    :return:
    """
    datas = list()
    import crawler.test.google_search as google_search
    import crawler.test.pixnet as pixnet

    def search_filter(url_list):
        for u in url_list:
            if u.find("pixnet.net/blog") != -1: yield u

    delays = [9, 10, 2, 5]
    for idx in range(len(camp_list)):
        if idx % random.choice(delays) == 0:
            time.sleep(30)
        camp = camp_list[idx]
        camp_title = camp["camp_title"]
        camp_site = camp["camp_site"]
        logger.info("idx: {}, camp_site: {}, camp_title: {}".format(
            idx, camp_site, camp_title))
        collect_cnt = 3
        max_start = 30
        search_result = google_search.process(
            "\"露營\"+\"pixnet\"+\"" + camp_site + "\"", search_filter,
            collect_cnt, max_start)
        logger.debug("search_result: {}".format(search_result))
        for url in search_result:
            content = pixnet.process(url)["text_content"]
            data = dict()
            data["camp_site"] = camp_site
            data["camp_title"] = camp_title
            data["pixnet_url"] = url
            data["content"] = content  # .replace("\"", "")
            datas.append(data)
    return datas
Example #13
def json_to_mongodb_rvcamp(json_data, drop):
    """
    Write the JSON documents into MongoDB
    """
    # import json
    # from bson import json_util
    conn = None
    try:
        conn = MongoClient(db_config["mongodb"])
        db = conn.test
        coll = db.rvcamp
        if drop:
            coll.drop()
        for doc in json_data:
            # doc = json.loads(json.dumps(doc, default=json_util.default))  # trans dict in list to json -> unnecessary
            # fdoc = coll.find_one({"camp_title": doc["camp_title"]})
            coll.update({"camp_title": doc["camp_title"]}, doc, upsert=True)
    except Exception as e:
        logger.error("Error:", e)
    finally:
        if conn:
            conn.close()
            logger.debug("connection closed ...")
Example #14
def process_content(content_url, row):
    try:
        logger.debug("content_url: {}".format(content_url))
        response = requests.get(content_url)
        html = BeautifulSoup(response.text)
        logger.info("entry-title: {}".format(
            html.select_one("h1.entry-title").text))
        row["camp_title"] = html.select_one("h1.entry-title").text
        text0 = [
            t.select_one("a").text for t in html.select_one("#text0").select(
                "div[class^='t-camp-']"  # 為t-camp-開頭
                + ":not([class$='-none'])"  # 不為-none結尾
                + ":not([class='t-camp-area'])"  # 不為t-camp-area
            )
        ]
        row["features"] = text0
        text1 = [
            t.select("span[class^=t-]")
            for t in html.select_one("#text1").select("li")
        ]
        merge_text1(text1, row)
    except Exception as e:
        logger.error("Error: {}, content_url: {}".format(e, content_url))
Example #15
 def extract_rvcamp(self, limit_count):
     total = []
     response = requests.get(self.__config["url_rvcamp"])
     html = BeautifulSoup(response.text)
     menus = html.select_one("#home-menu").select("li > a")
     cnt_area = 0
     bk = False
     extract_count = 0
     for menu in menus:
         cnt_area = cnt_area + 1
         cnt_campsite = 0
         murl = menu["href"]
         logger.info("區域: {} ----------------------------".format(menu.text))
         logger.debug("murl: {}".format(murl))
         response = requests.get(murl)
         html = BeautifulSoup(response.text)
         nav = html.select_one("div.nav-links")  # pagination nav block
         if nav is not None:
             last_page_num = int(
                 nav.select_one("a.page-numbers:nth-last-of-type(2)")["href"].split("/")[-1])  # 倒數第2個才是最後一頁
             logger.info("總共{}頁".format(last_page_num))
             for num in range(last_page_num):
                 pnum = str(num + 1)
                 logger.info("{} - 第{}頁 ----------------------------".format(menu.text, pnum))
                 page_url = murl + "/page/" + pnum
                 logger.debug("page_url: {}".format(page_url))
                 response = requests.get(page_url)
                 html = BeautifulSoup(response.text)
                 campsites = html.select("h2.entry-title-list > a")
                 for campsite in campsites:
                     extract_count += 1
                     cnt_campsite = cnt_campsite + 1
                     row = dict()
                     # row["_id"] = "campsite_" + format(cnt_area, "02d") + "_" + format(cnt_campsite, "04d")
                     row["location"] = menu.text
                     campsite_url = campsite["href"]
                     try:
                         logger.debug("content_url: {}".format(campsite_url))
                         response = requests.get(campsite_url)
                         html = BeautifulSoup(response.text)
                         logger.info("entry-title: {}".format(html.select_one("h1.entry-title").text))
                         row["camp_title"] = html.select_one("h1.entry-title").text
                         text0 = [t.select_one("a").text for t in
                                  html.select_one("#text0").select(
                                      "div[class^='t-camp-']"  # 為t-camp-開頭
                                      + ":not([class$='-none'])"  # 不為-none結尾
                                      + ":not([class='t-camp-area'])"  # 不為t-camp-area
                                  )
                                  ]
                         row["features"] = text0
                         text1 = [t.select("span[class^=t-]") for t in html.select_one("#text1").select("li")]
                         self.__merge_rvcamp_text1(text1, row)
                     except Exception as e:
                         logger.error("Error: {}, campsite_url: {}".format(e, campsite_url))
                     logger.info("row: {}".format(row));
                     total.append(row)
                     # if False and cnt_area == 1 and cnt_campsite == 10:  # cap on crawled items (False disables the cap)
                     if extract_count == limit_count:
                         bk = True  # Python has no labeled break, so use a flag instead
                     if bk:
                         break
                 # <<< end of page campsite for loop
                 if bk:
                     break
             # <<< end of location page for loop
         if bk:
             break
     # <<< end of location menu for loop
     logger.info("total count: {}".format(len(total)))
     return total  # json array
Example #16
            response = random_requests_get(url)
            html = BeautifulSoup(response.text)
            url_list = [
                unquote(d["href"], "utf-8").replace("/url?q=",
                                                    "").split("&sa=")[0]
                for d in html.select("h3.r > a")
            ]  # result links on this page
            ret.extend(search_filter(url_list))
            if len(ret) == 0:
                break
            ret = ret[0:collect_cnt] if len(ret) > collect_cnt else ret
            if len(ret) == collect_cnt:
                break
        except Exception as e:
            logger.error("Error: {}, url: {}".format(e, url))
    return ret


if __name__ == '__main__':
    # keyword = "\"露營\"+\"痞客邦\"+\"山林鳥日子\""
    keyword = "露營+痞客邦+山林鳥日子"

    def search_filter(url_list):
        for u in url_list:
            if u.find("pixnet.net/blog") != -1: yield u

    collect_cnt = 3
    max_start = 30
    result = process(keyword, search_filter, collect_cnt, max_start)
    logger.debug("result: {}".format(result))
Example #17
 def camplist_to_mysql(self, json_data):
     conn = None
     try:
         # conn = MySQLdb.connect(**db_config["mysql"])
         conn = mysql.connector.connect(**db_config["mysql"])
         # conn.autocommit(False)
         cur = conn.cursor()
         sql = "delete from camp_webs"
         res = cur.execute(sql)
         logger.debug("sql: {}, res: {}".format(sql, res))
         sql = "delete from camp_tels"
         res = cur.execute(sql)
         logger.debug("sql: {}, res: {}".format(sql, res))
         sql = "delete from camp_features"
         res = cur.execute(sql)
         logger.debug("sql: {}, res: {}".format(sql, res))
         sql = "delete from camp_list"
         res = cur.execute(sql)
         logger.debug("sql: {}, res: {}".format(sql, res))
         ins_datas = []
         for data in json_data:
             ins_data = (
                 data["camp_title"], data["camp_site"], data["addr"], (data["latlong"] if data["latlong"] else "NA"),
                 data["location"],
                 data["style"], data["tags"])
             ins_datas.append(ins_data)
         sql = ("insert into camp_list ( \n"
                + "camp_title, camp_site, addr, latlong, location, style, tags \n"
                + ") values ( \n"
                + "%s, %s, %s, %s, %s, %s, %s \n"
                + ")")
         res = cur.executemany(sql, ins_datas)
         logger.debug("sql: {}, res: {}".format(sql, res))
         feature_datas = []
         tel_datas = []
         web_datas = []
         for data in json_data:
             camp_title = data["camp_title"]
             for feature in data["features"]:
                 feature_datas.append((camp_title, feature))
             for tel in data["tel"]:
                 if tel == "" or tel_datas.count((camp_title, tel)) != 0:
                     print(">>>> ", tel)
                     continue
                 tel_datas.append((camp_title, tel))
             for ws in data["web_site"]:
                 for item in ws.items():
                     web_datas.append((camp_title, item[0], item[1]))
         sql = ("insert into camp_features ( \n"
                + "camp_title, feature \n"
                + ") values ( \n"
                + "%s, %s \n"
                + ")")
         res = cur.executemany(sql, feature_datas)
         logger.debug("sql: {}, res: {}".format(sql, res))
         sql = ("insert into camp_tels ( \n"
                + "camp_title, tel \n"
                + ") values ( \n"
                + "%s, %s \n"
                + ")")
         res = cur.executemany(sql, tel_datas)
         logger.debug("sql: {}, res: {}".format(sql, res))
         sql = ("insert into camp_webs ( \n"
               + "camp_title, name, url \n"
               + ") values ( \n"
               + "%s, %s, %s \n"
               + ")")
         res = cur.executemany(sql, web_datas)
         logger.debug("sql: {}, res: {}".format(sql, res))
         conn.commit()
     except Exception as e:
         logger.error("Error: {}".format(e.with_traceback()))
         if conn:
             conn.rollback()
     finally:
         if conn:
             conn.close()
             logger.debug("conn.close() ...")