def main():
    """Crawl PTT index pages and download images from well-pushed articles.

    Usage: python beauty_spider2.py [board] [start page] [pages] [min pushes]
      - start page == 0 -> invalid, abort
      - start page  < 0 -> start from the newest index page
    """
    board, start_page, page_term, push_rate = (
        sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]))
    start_time = time.time()
    datetime_format = '%Y%m%d%H%M%S'
    # Timestamped name suffix identifying this crawl session.
    crawler_time = '_PttImg_{:{}}'.format(datetime.datetime.now(), datetime_format)
    if start_page == 0:
        print("請輸入有效數字")
        sys.exit()
    else:
        # Pass the over-18 confirmation (some boards are age-restricted) and
        # read the newest index page number from the navigation button links.
        soup = over18(board)
        all_page_url = soup.select('.btn.wide')[1]['href']
        # BUG FIX: the old code unconditionally overwrote the user-supplied
        # start page with the newest one; per the CLI contract only the -1
        # sentinel (any negative value) means "start from the newest page".
        if start_page < 0:
            start_page = get_page_number(all_page_url)

    print("Analytical download page...")
    index_list = []
    article_list = []
    for page in range(start_page, start_page - page_term, -1):
        page_url = 'https://www.ptt.cc/bbs/{}/index{}.html'.format(board, page)
        index_list.append(page_url)

    # Fetch every index page and collect title / url / push-count entries.
    while index_list:
        index = index_list.pop(0)
        res = rs.get(index, verify=False)
        if res.status_code != 200:
            # Server busy: requeue this page and back off for a second.
            index_list.append(index)
            time.sleep(1)
        else:
            # BUG FIX: accumulate with += instead of assigning with =;
            # assignment discarded every page's articles except the last one.
            article_list += craw_page(res, push_rate)
            time.sleep(0.05)

    total = len(article_list)
    count = 0
    # Visit each qualifying article and store its images.
    while article_list:
        article = article_list.pop(0)
        res = rs.get(article['url'], verify=False)
        if res.status_code != 200:
            # Busy again: requeue the article and retry later.
            article_list.append(article)
            time.sleep(1)
        else:
            count += 1
            download_beauty.store_pic(crawler_time, article['url'],
                                      article['rate'], article['title'])
            print('download: {:.2%}'.format(count / total))
            time.sleep(0.05)

    print("下載完畢...")
    print('execution time: {:.3}s'.format(time.time() - start_time))
def main(crawler_pages=2):
    """Crawl the newest pages of the PTT 'beauty' board, download images
    from articles with at least 10 pushes, and persist them to the DB.

    crawler_pages: how many index pages (newest first) to crawl.
    """
    engine, session = connect_db(DB_connect)
    board, page_term, push_rate = 'beauty', crawler_pages, 10
    start_time = time.time()

    # Clear the over-18 gate, then read the newest index page number.
    landing = over18(board)
    newest_href = landing.select('.btn.wide')[1]['href']
    start_page = get_page_number(newest_href)

    print("Analytical download page...")
    pending_pages = [
        'https://www.ptt.cc/bbs/{}/index{}.html'.format(board, page)
        for page in range(start_page, start_page - page_term, -1)
    ]

    # Collect title / url / push-count entries from every index page.
    article_list = []
    while pending_pages:
        page_url = pending_pages.pop(0)
        resp = rs.get(page_url, verify=False)
        if resp.status_code != 200:
            # Server busy: requeue the page and back off for a second.
            pending_pages.append(page_url)
            time.sleep(1)
            continue
        article_list += craw_page(resp, push_rate)
        time.sleep(0.05)

    total = len(article_list)
    done = 0
    image_seq = []
    # Visit each article, store its images and record them in the DB.
    while article_list:
        post = article_list.pop(0)
        resp = rs.get(post['url'], verify=False)
        if resp.status_code != 200:
            # Busy again: requeue the article and retry later.
            article_list.append(post)
            time.sleep(1)
            continue
        done += 1
        image_seq += download_beauty.store_pic(post['url'])
        # NOTE(review): image_seq keeps growing, so every write_db call also
        # receives all previously written pics — confirm write_db de-duplicates.
        write_db(image_seq, session)
        print('download: {:.2%}'.format(done / total))
        time.sleep(0.05)

    # Release DB resources.
    session.close()
    engine.dispose()
    print("下載完畢...")
    print('execution time: {:.3}s'.format(time.time() - start_time))
# NOTE(review): whitespace-mangled chunk collapsed onto one line; kept
# byte-identical because the leading `except` belongs to a `try` that starts
# before this chunk, so indentation cannot be reconstructed safely.
# Content, as far as visible: (1) the tail of a page-parsing helper — parses
# the push count (falling back to reformalize() when the count is not a plain
# int), keeps articles whose rate >= push_rate, and appends (rate, url)
# tuples to article_list, silently skipping entries without a matching url;
# (2) a __main__ entry point — argv supplies start page / page count / min
# push rate, a negative start page means "newest page" (getFirstPage()), each
# /bbs/Beauty index page is crawled via crawPage(), then images are stored
# with download_beauty.store_pic(article_url, rate).
rate = int(comment_rate.group(2)) except Exception as err: rate = reformalize(comment_rate.group(2)) if rate >= push_rate: # parse each url # get into new page, parse photo try: url = 'https://www.ptt.cc/' + url_regex.search(each_data).group(1) article_list.append((rate, url)) # print rate, url except Exception as err: # print err pass if __name__ == '__main__': start_page, page_term, push_rate = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]) if start_page < 0: start_page = getFirstPage() # print start_page, page_term, push_rate article_list = [] for page in range(start_page, start_page - page_term, -1): page_url = 'https://www.ptt.cc/bbs/Beauty/index' + str(page) + '.html' crawPage(page_url, article_list, push_rate) for hot_rate, article in article_list: download_beauty.store_pic(article, str(hot_rate))
# NOTE(review): whitespace-mangled Python 2 chunk (print statements) collapsed
# onto one line; kept byte-identical because the leading dangling `else:`
# belongs to an `if` that starts before this chunk, so indentation cannot be
# reconstructed safely.
# Content, as far as visible: the tail of a crawler script — finishes the
# index-page loop (crawPage on success), then pops each (rate, url, title)
# tuple from article_list, refetches it with rs.get, requeues it with a
# 1-second back-off when the page title contains 'Service Temporarily'
# (server busy), otherwise stores its images via download_beauty.store_pic
# and prints integer-percent progress (100 * count / total is Python 2
# integer division). Ends with total-time reporting.
else: crawPage(index, push_rate) #print u'OK_URL:', index time.sleep(0.05) total = len(article_list) count = 0 #進入每篇文章分析內容 while article_list: article = article_list.pop(0) # url = article[1] res = rs.get(article[1], verify=False) soup = BeautifulSoup(res.text, 'html.parser') #如網頁忙線中,則先將網頁加入 index_list 並休息1秒後再連接 if (soup.title.text.find('Service Temporarily') > -1): article_list.append(article) #print u'error_URL:',article[1] time.sleep(1) else: count += 1 #print u'OK_URL:', article[1] # rate = article[0], url = article[1], title = article[2] # store_pic(CrawlerTime, url, rate="", title="" ): download_beauty.store_pic(CrawlerTime, article[1], str(article[0]), article[2]) print u"download: " + str(100 * count / total) + " %." time.sleep(0.05) print u"下載完畢..." print u"execution time:" + str(time.time() - start_time) + "s"
# NOTE(review): whitespace-mangled Python 2 chunk (print statements) collapsed
# onto one line; kept byte-identical because the leading dangling `except`
# belongs to a `try` that starts before this chunk, so indentation cannot be
# reconstructed safely.
# Content, as far as visible: (1) the tail of a parsing helper that swallows
# per-entry errors; (2) a __main__ entry point — argv supplies start page /
# page count / min push rate, a negative start page means "newest page"
# (getFirstPage()), each /bbs/Beauty index page is crawled via crawPage(),
# then every (hot_rate, article_url) pair is downloaded with
# download_beauty.store_pic while printing integer-percent progress
# (100 * count / total is Python 2 integer division).
except Exception as err: # print err pass if __name__ == '__main__': start_page, page_term, push_rate = int(sys.argv[1]), int(sys.argv[2]), int( sys.argv[3]) if start_page < 0: start_page = getFirstPage() # print start_page, page_term, push_rate print "解析下載網頁面,統計數量中..." article_list = [] for page in range(start_page, start_page - page_term, -1): page_url = 'https://www.ptt.cc/bbs/Beauty/index' + str(page) + '.html' crawPage(page_url, article_list, push_rate) print "即將開始下載圖片, 請再等一下下 ^_^" total = len(article_list) count = 0 for hot_rate, article in article_list: download_beauty.store_pic(article, str(hot_rate)) count += 1 print "已經下載: " + str(100 * count / total) + " %." print "即將下載完畢,滿滿的正妹圖就要入袋拉!"
# NOTE(review): whitespace-mangled Python 2 chunk (print statements) collapsed
# onto one line; kept byte-identical because the exact nesting of the trailing
# time.sleep(0.05) (inside the else branch vs. at loop level) cannot be
# recovered from the collapsed text.
# Content, as far as visible: the download phase of a crawler script — pops
# each (rate, url, title) tuple from article_list, fetches it with rs.get,
# requeues it with a 1-second back-off when the page title contains
# 'Service Temporarily' (server busy), otherwise stores its images via
# download_beauty.store_pic(CrawlerTime, url, rate, title) and prints
# integer-percent progress (100 * count / total is Python 2 integer
# division). Ends with total-time reporting.
total = len(article_list) count = 0 #進入每篇文章分析內容 while article_list: article = article_list.pop(0) # url = article[1] res = rs.get( article[1], verify = False ) soup = BeautifulSoup(res.text,'html.parser') #如網頁忙線中,則先將網頁加入 index_list 並休息1秒後再連接 if (soup.title.text.find('Service Temporarily') > -1) : article_list.append(article) #print u'error_URL:',article[1] time.sleep(1) else : count += 1 #print u'OK_URL:', article[1] # rate = article[0], url = article[1], title = article[2] # store_pic(CrawlerTime, url, rate="", title="" ): download_beauty.store_pic(CrawlerTime, article[1], str(article[0]), article[2]) print u"download: " + str(100 * count / total ) + " %." time.sleep(0.05) print u"下載完畢..." print u"execution time:" + str(time.time() - start_time)+"s"