def get_page(self, code, url):
    self.itemArray = []
    res = requests.get(url, timeout=10)
    res.encoding = "gbk"
    try:
        res.raise_for_status()
        if res.status_code == 200:
            contentSoup = bs4.BeautifulSoup(res.text, 'lxml')
            elems = contentSoup.select('#js_ggzx > li,.li_point > ul > li,.col02_22 > ul > li')
            for elem in elems:
                json = {}
                json['code'] = code
                ele = elem.select('span')
                json['date'] = dateutil.format_date(ele[0].getText()[1:-1])
                s = json['date']
                ele = elem.select('a')
                json['title'] = ele[-1].getText()
                loger.info("date:{},title:{}".format(s, json['title']))
                json['href'] = ele[-1].attrs['href']
                ret, content = self.get_content(json['href'])
                if ret != -1:
                    time.sleep(4 * random.random())
                if ret == 0:
                    json['content'] = content
                    self.itemArray.append(json)
    except Exception as err:
        time.sleep(4 * random.random())
        loger.warning(err)
    finally:
        res.close()
def get_content(self, url):
    content = ''
    ret = -1
    self.urlExist = self.mongodbutil.urlIsExist(url)
    if self.urlExist:
        loger.info('This url:{} has existed'.format(url))
        return ret, content
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    res = requests.get(url, headers=header, timeout=10)
    res.encoding = "utf-8"
    try:
        res.raise_for_status()
        if res.status_code == 200:
            soup = bs4.BeautifulSoup(res.text, 'lxml')
            elems = soup.select('#artibody,.entry-content')
            if len(elems) > 0:
                content = elems[0].getText()
                ret = 0
                self.mongodbutil.insertUrls({"url": url})
    except Exception as err:
        loger.warning(err)
    finally:
        res.close()
    return ret, content
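# A self-contained sketch of the list-page parsing that get_page() above relies
# on, run against a made-up HTML fragment shaped like the sina news list (the
# real page layout may differ; dateutil.format_date and MongoDB storage are omitted):
import bs4

_sample_html = """
<div class="col02_22">
  <ul>
    <li><span>(2020-01-02 10:00)</span>
        <a href="https://finance.sina.com.cn/example.shtml">Example headline</a></li>
  </ul>
</div>
"""

_soup = bs4.BeautifulSoup(_sample_html, 'lxml')
for _elem in _soup.select('#js_ggzx > li,.li_point > ul > li,.col02_22 > ul > li'):
    _date = _elem.select('span')[0].getText()[1:-1]   # strip the surrounding parentheses
    _link = _elem.select('a')[-1]                     # the last <a> holds the headline
    print(_date, _link.getText(), _link.attrs['href'])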
def scheduled_history_job():
    loger.info('history_scheduled_job..')
    if not working_history:
        sched.remove_job(timerid_history)
        start_crawl_history()
    else:
        loger.info('pre-history-timer is working')
def process_30birds():
    login()
    urls = [
        "https://www.facebook.com/微笑山丘-223226418074079/",
        "https://www.facebook.com/30birdz/"
    ]
    for url in urls:
        url_reviews = url + "reviews/"
        driver.get(url_reviews)
        time.sleep(1)
        _len = 0
        while True:
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )  # handle lazy loading (use JavaScript to simulate scrolling to the bottom)
            time.sleep(3)  # 2 seconds is sometimes not enough, so wait 3 seconds
            reviews = driver.find_elements_by_css_selector(
                "div[class='_5pbx userContent _3576']")
            logger.info("Loaded {} reviews".format(len(reviews)))
            if _len == len(reviews):
                break
            _len = len(reviews)  # same count as last round means no more reviews
        for review in reviews:
            logger.info("id: {}, comment: {}".format(
                review.get_attribute("id"),
                review.find_element_by_tag_name("p").text))
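# The scroll-until-stable loop above also appears in extract_fb_comment() and
# the Google image crawler below. A hedged refactoring sketch of that shared
# pattern (not part of the original code; it assumes the same Selenium 3
# driver API used throughout this file):
import time


def scroll_until_stable(driver, css_selector, pause=3):
    """Scroll to the bottom until the number of elements matching
    css_selector stops growing, i.e. lazy loading has finished."""
    count = 0
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # give the lazy-loaded content time to arrive
        elems = driver.find_elements_by_css_selector(css_selector)
        if count == len(elems):
            return elems  # same count as the previous round: nothing new was loaded
        count = len(elems)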
def google_search_extract_pixnet_blog(self, camp_list):
    datas = list()

    def search_filter(url_list):
        for u in url_list:
            if u.find("pixnet.net/blog/post/") != -1:
                yield u

    delays = [9, 10, 5]
    for idx in range(len(camp_list)):
        if idx % random.choice(delays) == 0:
            time.sleep(30)
        camp = camp_list[idx]
        camp_title = camp["camp_title"]
        camp_site = camp["camp_site"]
        logger.info("idx: {}, camp_site: {}, camp_title: {}".format(idx, camp_site, camp_title))
        collect_cnt = 1
        max_start = 30
        # search_result = self.google_search("\"露營\"+\"痞客邦\"+\"" + camp_site + "\"", search_filter, collect_cnt,
        #                                    max_start)
        search_result = self.google_search("露營+痞客邦+" + camp_site, search_filter, collect_cnt, max_start)
        logger.debug("search_result: {}".format(search_result))
        for url in search_result:
            content = self.extract_pixnet(url)["text_content"]
            data = dict()
            data["camp_site"] = camp_site
            data["camp_title"] = camp_title
            data["pixnet_url"] = url
            data["content"] = content  # .replace("\"", "")
            datas.append(data)
    return datas
def scheduled_job():
    loger.info('scheduled_job..')
    if not working:
        sched.remove_job(timerid)
        start_crawl()
    else:
        loger.info('pre-timer is working')
def process(url):
    """Convert the crawled content into JSON."""
    total = []
    response = requests.get(url)
    html = BeautifulSoup(response.text)
    menus = html.select_one("#home-menu").select("li > a")
    cnt_area = 0
    bk = False
    for menu in menus:
        cnt_area = cnt_area + 1
        cnt_campsite = 0
        murl = menu["href"]
        logger.info("Region: {} ----------------------------".format(menu.text))
        logger.debug("murl: {}".format(murl))
        response = requests.get(murl)
        html = BeautifulSoup(response.text)
        nav = html.select_one("div.nav-links")  # pagination navigation block
        if nav is not None:
            last_page_num = int(
                nav.select_one("a.page-numbers:nth-last-of-type(2)")
                ["href"].split("/")[-1])  # the second-to-last link is the last page
            logger.info("{} pages in total".format(last_page_num))
            for num in range(last_page_num):
                pnum = str(num + 1)
                logger.info("{} - page {} ----------------------------".format(
                    menu.text, pnum))
                page_url = murl + "/page/" + pnum
                logger.debug("page_url: {}".format(page_url))
                response = requests.get(page_url)
                html = BeautifulSoup(response.text)
                campsites = html.select("h2.entry-title-list > a")
                for campsite in campsites:
                    cnt_campsite = cnt_campsite + 1
                    row = dict()
                    # row["_id"] = "campsite_" + format(cnt_area, "02d") + "_" + format(cnt_campsite, "04d")
                    row["location"] = menu.text
                    campsite_url = campsite["href"]
                    process_content(campsite_url, row)
                    logger.info("row: {}".format(row))
                    total.append(row)
                    if False and cnt_area == 1 and cnt_campsite == 10:  # cap the crawl size (False disables the cap)
                        bk = True  # Python has no labeled break, so use this flag workaround
                    if bk:
                        break  # <<< end of page campsite for loop
                if bk:
                    break  # <<< end of location page for loop
        if bk:
            break  # <<< end of location menu for loop
    logger.info("total count: {}".format(len(total)))
    return total
def extract_fb_comment(self, camp_list):
    datas = list()
    for camp in camp_list:
        web_site = camp["web_site"]
        fb_url = ""
        for web in web_site:
            for v in web.values():
                if v.find("facebook.com") != -1:
                    fb_url = v
                if "" != fb_url:
                    break
            if "" != fb_url:
                break
        if "" != fb_url:
            data = dict()
            data["camp_site"] = camp["camp_site"]
            data["camp_title"] = camp["camp_title"]
            data["fb_url"] = fb_url
            datas.append(data)
    driver = self.init_fb()
    delays = [7, 3, 5, 2, 4]
    for data in datas:
        try:
            url = data["fb_url"]
            url_reviews = url + "reviews/"
            logger.debug("url_reviews: {}".format(url_reviews))
            driver.get(url_reviews)
            time.sleep(random.choice(delays))
            _len = 0
            while True:
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")  # handle lazy loading (simulate scrolling to the bottom)
                time.sleep(3)
                reviews = driver.find_elements_by_css_selector("div[class='_5pbx userContent _3576']")
                logger.info("Loaded {} reviews".format(len(reviews)))
                if _len == len(reviews):
                    break
                _len = len(reviews)  # same count as last round means no more reviews
            comments = list()
            for review in reviews:
                # logger.info(
                #     "id: {}, comment: {}".format(review.get_attribute("id"),
                #                                  review.find_element_by_tag_name("p").text))
                comment = review.find_element_by_tag_name("p").text
                if comment and "" != comment.strip():
                    comments.append(comment.strip())
            data["comments"] = comments
        except Exception as e:
            logger.error("Error: {}".format(e))
    return datas
def start_crawl():
    '''
    retrieve news from sina site
    :return:
    '''
    global working
    working = True
    loger.info('start crawl current news...')
    for market in MARKET:
        data = read_file(market)
        for indexs in data.index:
            market = data.loc[indexs].values[0][0:2]
            code = data.loc[indexs].values[0][3:]
            url = generate_url(market, code)
            loger.info('Current Time:{}, code:{}, url:{}'.format(
                datetime.datetime.now(), code, url))
            try:
                sinanews.get_page(code, url)
                items = sinanews.get_item_array()
                if len(items) > 0:
                    mongodbutil.insertItems(items)
                    time.sleep(4 * random.random())
                    loger.info("store items to mongodb ...")
                else:
                    loger.info("all items exists")
            except Exception as err:
                time.sleep(4 * random.random())
                loger.warning(err)
    working = False
    sched.add_job(scheduled_job, 'interval', seconds=1, id=timerid)
def process(keyword, prefix, collect_cnt):
    driver = Chrome("../chromedriver")
    driver.set_window_rect(10, 10, 1027, 768)
    img_dir = path_config["crawler"] + "/images"
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)
    keyword = quote(keyword)
    logger.debug("keyword: {}, collect_cnt: {}".format(keyword, collect_cnt))
    ret = list()
    url_pattern = "https://www.google.com/search?q={}&source=lnms&tbm=isch&sa=X&ved=0ahUKEwi33-bootHhAhVXyIsBHXN5CAMQ_AUIDigB&biw=1920&bih=979"
    url = url_pattern.format(keyword)
    driver.get(url)
    _len = 0
    while True:
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")  # handle lazy loading (simulate scrolling to the bottom)
        time.sleep(3)  # 2 seconds is sometimes not enough, so wait 3 seconds
        hSRGPd = driver.find_elements_by_css_selector("a[jsname='hSRGPd']")
        logger.info("Loaded {} results".format(len(hSRGPd)))
        if _len == len(hSRGPd):
            break
        _len = len(hSRGPd)  # same count as last round means nothing new was loaded
    g_urls = []
    for d in hSRGPd:
        g_url = d.get_attribute("href")
        g_urls.append(g_url)
    delay = [1, 2, 3, 1.5, 2.3, 3.2]
    for i in range(len(g_urls)):
        try:
            g_url = g_urls[i]
            # print("g_url=", g_url)
            driver.get(g_url)
            time.sleep(random.choice(delay))
            img_url = driver.find_element_by_css_selector(
                "img[class='irc_mi'][src^='http']").get_attribute("src")
            print("img_url=", img_url)
            fpath = img_dir + "/" + prefix + format(
                i, "03d") + "." + img_url.split(".")[-1]
            urlretrieve(img_url, fpath)
            if i > collect_cnt:
                break
        except Exception as e:
            print("Error: {}".format(e))
    return ret
def proces_pixnet_blog(camp_list):
    """
    Find the Pixnet blog posts for each campsite from the JSON data.
    :param camp_list: campsite info JSON
    :return:
    """
    datas = list()
    import crawler.test.google_search as google_search
    import crawler.test.pixnet as pixnet

    def search_filter(url_list):
        for u in url_list:
            if u.find("pixnet.net/blog") != -1:
                yield u

    delays = [9, 10, 2, 5]
    for idx in range(len(camp_list)):
        if idx % random.choice(delays) == 0:
            time.sleep(30)
        camp = camp_list[idx]
        camp_title = camp["camp_title"]
        camp_site = camp["camp_site"]
        logger.info("idx: {}, camp_site: {}, camp_title: {}".format(
            idx, camp_site, camp_title))
        collect_cnt = 3
        max_start = 30
        search_result = google_search.process(
            "\"露營\"+\"pixnet\"+\"" + camp_site + "\"", search_filter,
            collect_cnt, max_start)
        logger.debug("search_result: {}".format(search_result))
        for url in search_result:
            content = pixnet.process(url)["text_content"]
            data = dict()
            data["camp_site"] = camp_site
            data["camp_title"] = camp_title
            data["pixnet_url"] = url
            data["content"] = content  # .replace("\"", "")
            datas.append(data)
    return datas
def process_content(content_url, row):
    try:
        logger.debug("content_url: {}".format(content_url))
        response = requests.get(content_url)
        html = BeautifulSoup(response.text)
        logger.info("entry-title: {}".format(
            html.select_one("h1.entry-title").text))
        row["camp_title"] = html.select_one("h1.entry-title").text
        text0 = [
            t.select_one("a").text
            for t in html.select_one("#text0").select(
                "div[class^='t-camp-']"            # class starts with t-camp-
                + ":not([class$='-none'])"         # and does not end with -none
                + ":not([class='t-camp-area'])"    # and is not exactly t-camp-area
            )
        ]
        row["features"] = text0
        text1 = [
            t.select("span[class^=t-]")
            for t in html.select_one("#text1").select("li")
        ]
        merge_text1(text1, row)
    except Exception as e:
        logger.error("Error: {}, content_url: {}".format(e, content_url))
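# A self-contained illustration of the #text0 selector chain used above, run
# against a made-up fragment (the real rvcamp markup may differ):
from bs4 import BeautifulSoup

_fragment = """
<div id="text0">
  <div class="t-camp-wifi"><a>WiFi</a></div>
  <div class="t-camp-pet-none"><a>Pets</a></div>
  <div class="t-camp-area"><a>North</a></div>
</div>
"""
_html = BeautifulSoup(_fragment, "html.parser")
_features = [t.select_one("a").text
             for t in _html.select_one("#text0").select(
                 "div[class^='t-camp-']"           # class starts with t-camp-
                 ":not([class$='-none'])"          # and does not end with -none
                 ":not([class='t-camp-area'])")]   # and is not exactly t-camp-area
print(_features)  # -> ['WiFi']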
def start_crawl_history():
    '''
    retrieve news from sina site
    :return:
    '''
    global working_history
    working_history = True
    loger.info('start crawl history news...')
    for market in MARKET:
        data = read_file(market)
        for indexs in data.index:
            market = data.loc[indexs].values[0][0:2]
            code = data.loc[indexs].values[0][3:]
            sinanewshistory.clear_item_array()
            loger.info('Current Time:{}, code:{}, market:{},history'.format(
                datetime.datetime.now(), code, market))
            try:
                if market == 'HK':
                    sinanewshistory.get_hk_page(market, code)
                elif market == 'US':
                    sinanewshistory.get_us_page(market, code)
                elif market == 'SZ' or market == 'SH':
                    sinanewshistory.get_chn_page(market, code)
                items = sinanewshistory.get_item_array()
                if len(items) > 0:
                    mongodbutil.insertItems(items)
                    time.sleep(4 * random.random())
                    loger.info("store items to mongodb ...")
                else:
                    loger.info("all items exists")
            except Exception as err:
                time.sleep(4 * random.random())
                loger.warning(err)
    working_history = False
    sched.add_job(scheduled_history_job, 'interval', days=1, id=timerid_history)
def process(url):
    response = requests.get(url)
    items = json.loads(response.text)["items"]
    logger.debug(items)
    limit_cnt = 3  # limit the number of items processed (only enforced when > 0)
    for idx in range(len(items)):
        item = items[idx]
        logger.info("{}. {}".format(idx + 1, item))
        logger.info("no: {}, store_id: {}, store_name: {}".format(
            idx + 1, item["store_id"], item["store_name"]))
        logger.info("address: {}, area: {}, city: {}".format(
            item["address"], item["area"], item["city"]))
        url = ("https://icamping-prod.appspot.com/_ah/api/icamping_guest/v2/query_store_by_store_id?store_id="
               + item["store_id"])
        response = requests.get(url)
        content = json.loads(response.text)["items"][0]
        logger.info("description: {}".format(content["description"]))
        external_links = json.loads(content["external_links"])
        for ex_link in external_links:
            logger.info("link_name: {}, link: {}".format(
                ex_link["link_name"], ex_link["link"]))
        photo = json.loads(content["photo"])
        for p in photo:
            logger.info("gcs_url: {}".format(p["gcs_url"]))
        if limit_cnt > 0 and idx == limit_cnt - 1:
            break
        logger.info(
            "------------------------------------------------------------")
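# The icamping payload handled above appears to nest JSON-encoded strings
# (external_links, photo) inside the outer JSON document, hence the second
# json.loads(); a minimal self-contained illustration with made-up data:
import json

_content = {
    "external_links": '[{"link_name": "Facebook", "link": "https://www.facebook.com/example"}]',
    "photo": '[{"gcs_url": "https://storage.googleapis.com/example.jpg"}]',
}
for _ex_link in json.loads(_content["external_links"]):
    print(_ex_link["link_name"], _ex_link["link"])
for _p in json.loads(_content["photo"]):
    print(_p["gcs_url"])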
def extract_rvcamp(self, limit_count):
    total = []
    response = requests.get(self.__config["url_rvcamp"])
    html = BeautifulSoup(response.text)
    menus = html.select_one("#home-menu").select("li > a")
    cnt_area = 0
    bk = False
    extract_count = 0
    for menu in menus:
        cnt_area = cnt_area + 1
        cnt_campsite = 0
        murl = menu["href"]
        logger.info("Region: {} ----------------------------".format(menu.text))
        logger.debug("murl: {}".format(murl))
        response = requests.get(murl)
        html = BeautifulSoup(response.text)
        nav = html.select_one("div.nav-links")  # pagination navigation block
        if nav is not None:
            last_page_num = int(
                nav.select_one("a.page-numbers:nth-last-of-type(2)")["href"].split("/")[-1])  # the second-to-last link is the last page
            logger.info("{} pages in total".format(last_page_num))
            for num in range(last_page_num):
                pnum = str(num + 1)
                logger.info("{} - page {} ----------------------------".format(menu.text, pnum))
                page_url = murl + "/page/" + pnum
                logger.debug("page_url: {}".format(page_url))
                response = requests.get(page_url)
                html = BeautifulSoup(response.text)
                campsites = html.select("h2.entry-title-list > a")
                for campsite in campsites:
                    extract_count += 1
                    cnt_campsite = cnt_campsite + 1
                    row = dict()
                    # row["_id"] = "campsite_" + format(cnt_area, "02d") + "_" + format(cnt_campsite, "04d")
                    row["location"] = menu.text
                    campsite_url = campsite["href"]
                    try:
                        logger.debug("content_url: {}".format(campsite_url))
                        response = requests.get(campsite_url)
                        html = BeautifulSoup(response.text)
                        logger.info("entry-title: {}".format(html.select_one("h1.entry-title").text))
                        row["camp_title"] = html.select_one("h1.entry-title").text
                        text0 = [t.select_one("a").text
                                 for t in html.select_one("#text0").select(
                                     "div[class^='t-camp-']"            # class starts with t-camp-
                                     + ":not([class$='-none'])"         # and does not end with -none
                                     + ":not([class='t-camp-area'])"    # and is not exactly t-camp-area
                                 )]
                        row["features"] = text0
                        text1 = [t.select("span[class^=t-]") for t in html.select_one("#text1").select("li")]
                        self.__merge_rvcamp_text1(text1, row)
                    except Exception as e:
                        logger.error("Error: {}, campsite_url: {}".format(e, campsite_url))
                    logger.info("row: {}".format(row))
                    total.append(row)
                    # if False and cnt_area == 1 and cnt_campsite == 10:  # cap the crawl size (False disables the cap)
                    if extract_count == limit_count:
                        bk = True  # Python has no labeled break, so use this flag workaround
                    if bk:
                        break  # <<< end of page campsite for loop
                if bk:
                    break  # <<< end of location page for loop
        if bk:
            break  # <<< end of location menu for loop
    logger.info("total count: {}".format(len(total)))
    return total  # json array
loger.info('Starting time: {}'.format(datetime.datetime.now()))
sched.add_job(scheduled_job, 'interval', max_instances=2, seconds=1, id=timerid)
sched.add_job(scheduled_history_job, 'interval', max_instances=2, days=1, id=timerid_history)
sched.start()
loger.info('Ending time: {}'.format(datetime.datetime.now()))