import sys
import time
import datetime

# rs (a shared requests.Session), over18, get_page_number, craw_page and
# download_beauty are helpers defined elsewhere in the same project.


def main():
    # usage: python beauty_spider2.py [board] [start page] [pages to crawl] [min push count]
    board, start_page, page_term, push_rate = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4])
    start_time = time.time()
    datetime_format = '%Y%m%d%H%M%S'
    crawler_time = '_PttImg_{:{}}'.format(datetime.datetime.now(), datetime_format)
    if start_page == 0:
        print("Please enter a valid page number")
        sys.exit()
    # Pass the board's age check first; some boards are rated 18+
    soup = over18(board)
    # A start page of -1 means "begin from the newest page"
    if start_page == -1:
        all_page_url = soup.select('.btn.wide')[1]['href']
        start_page = get_page_number(all_page_url)

    print("Analytical download page...")
    index_list = []
    article_list = []
    for page in range(start_page, start_page - page_term, -1):
        page_url = 'https://www.ptt.cc/bbs/{}/index{}.html'.format(board, page)
        index_list.append(page_url)

    # Collect each article's title, URL and push count from the index pages
    while index_list:
        index = index_list.pop(0)
        res = rs.get(index, verify=False)
        # If the server is busy, re-queue the page and retry after one second
        if res.status_code != 200:
            index_list.append(index)
            time.sleep(1)
        else:
            article_list += craw_page(res, push_rate)
        time.sleep(0.05)

    total = len(article_list)
    count = 0
    # Visit each article and parse its content
    while article_list:
        article = article_list.pop(0)
        res = rs.get(article['url'], verify=False)
        # If the server is busy, re-queue the article and retry after one second
        if res.status_code != 200:
            article_list.append(article)
            time.sleep(1)
        else:
            count += 1
            download_beauty.store_pic(crawler_time, article['url'], article['rate'], article['title'])
            print('download: {:.2%}'.format(count / total))
        time.sleep(0.05)

    print("下載完畢...")
    print('execution time: {:.3}s'.format(time.time() - start_time))
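
Every variant in this listing leans on a handful of module-level helpers the excerpt does not show. Below is a minimal sketch of what rs, over18, and get_page_number might look like, assuming rs is a shared requests.Session; the over18=1 cookie is how PTT's web interface passes the age gate, while the "+ 1" offset is a guess based on the newest index page's "上頁" button linking one page back:

import re

import requests
from bs4 import BeautifulSoup

rs = requests.Session()


def over18(board):
    # Set the over18 cookie to satisfy PTT's age-confirmation interstitial,
    # then return the parsed newest index page of the board.
    rs.cookies.set('over18', '1')
    res = rs.get('https://www.ptt.cc/bbs/{}/index.html'.format(board), verify=False)
    return BeautifulSoup(res.text, 'html.parser')


def get_page_number(url):
    # url looks like '/bbs/beauty/index3950.html'; the newest page is
    # assumed to be that index number plus one.
    return int(re.search(r'index(\d+)\.html', url).group(1)) + 1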
Example #2
def main(crawler_pages=2):
    engine, session = connect_db(DB_connect)
    # board, pages to crawl, and minimum push count are fixed here
    # instead of being read from sys.argv as in Example #1
    board, page_term, push_rate = 'beauty', crawler_pages, 10
    start_time = time.time()
    soup = over18(board)
    all_page_url = soup.select('.btn.wide')[1]['href']
    start_page = get_page_number(all_page_url)

    print("Analytical download page...")
    index_list = []
    article_list = []
    for page in range(start_page, start_page - page_term, -1):
        page_url = 'https://www.ptt.cc/bbs/{}/index{}.html'.format(board, page)
        index_list.append(page_url)

    # Collect each article's title, URL and push count from the index pages
    while index_list:
        index = index_list.pop(0)
        res = rs.get(index, verify=False)
        # If the server is busy, re-queue the page and retry after one second
        if res.status_code != 200:
            index_list.append(index)
            time.sleep(1)
        else:
            article_list += craw_page(res, push_rate)
        time.sleep(0.05)

    total = len(article_list)
    count = 0
    image_seq = []
    # Visit each article and parse its content
    while article_list:
        article = article_list.pop(0)
        res = rs.get(article['url'], verify=False)
        # If the server is busy, re-queue the article and retry after one second
        if res.status_code != 200:
            article_list.append(article)
            time.sleep(1)
        else:
            count += 1
            images = download_beauty.store_pic(article['url'])
            image_seq += images
            # Write only this article's images; passing the whole accumulated
            # list would re-insert earlier rows on every iteration
            write_db(images, session)
            print('download: {:.2%}'.format(count / total))
        time.sleep(0.05)

    # disconnect
    session.close()
    engine.dispose()

    print("下載完畢...")
    print('execution time: {:.3}s'.format(time.time() - start_time))
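
Example #2 additionally assumes connect_db and write_db for persisting image URLs. Here is a minimal SQLAlchemy sketch under those assumptions; the Image model, table name, and DB_connect URL are all hypothetical stand-ins for whatever the real project defines:

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

DB_connect = 'sqlite:///ptt_images.db'  # hypothetical connection string


class Image(Base):
    # Hypothetical schema; the real project defines its own model.
    __tablename__ = 'images'
    id = Column(Integer, primary_key=True)
    url = Column(String(255))


def connect_db(db_url):
    engine = create_engine(db_url)
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()
    return engine, session


def write_db(image_urls, session):
    # One row per downloaded image URL
    session.add_all(Image(url=u) for u in image_urls)
    session.commit()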
Example #3

            try:
                rate = int(comment_rate.group(2))
            except Exception as err:
                rate = reformalize(comment_rate.group(2))
            if rate >= push_rate:
                # parse each url
                # get into new page, parse photo
                try:
                    url = 'https://www.ptt.cc/' + url_regex.search(each_data).group(1)
                    article_list.append((rate, url))
                except Exception as err:
                    pass


if __name__ == '__main__':

    start_page, page_term, push_rate = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
    if start_page < 0:
        start_page = getFirstPage()

    article_list = []

    for page in range(start_page, start_page - page_term, -1):
        page_url = 'https://www.ptt.cc/bbs/Beauty/index' + str(page) + '.html'
        crawPage(page_url, article_list, push_rate)

    for hot_rate, article in article_list:
        download_beauty.store_pic(article, str(hot_rate))
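
Example #3 relies on module-level patterns and helpers the excerpt omits: comment_rate, url_regex, reformalize, and getFirstPage. A rough sketch under stated assumptions follows; the exact regexes are guesses, but PTT genuinely renders '爆' for 100-plus pushes and 'X1'..'XX' for heavily downvoted articles, which is what reformalize has to map onto integers:

import re

import requests

# Guessed patterns: comment_rate.group(2) is the push-count marker and
# url_regex.group(1) is the article path without its leading slash.
comment_rate = re.compile(r'<span class="(hl f\d)">\s*(\S{1,2})\s*</span>')
url_regex = re.compile(r'href="/(bbs/Beauty/M[^"]+\.html)"')


def reformalize(marker):
    # Map PTT's non-numeric push markers onto comparable integers.
    if marker == u'爆':             # 100 or more pushes
        return 100
    if marker.startswith('X'):      # net downvotes, "X1" through "XX"
        return -10
    return 0


def getFirstPage():
    # Newest index number of the board, assuming the first index link on
    # the landing page is the "previous page" button (newest minus one).
    res = requests.get('https://www.ptt.cc/bbs/Beauty/index.html',
                       cookies={'over18': '1'}, verify=False)
    return int(re.search(r'index(\d+)\.html', res.text).group(1)) + 1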
Example #4

    # Collect qualifying articles from each index page
    while index_list:
        index = index_list.pop(0)
        res = rs.get(index, verify=False)
        soup = BeautifulSoup(res.text, 'html.parser')
        # If the server is busy, re-queue the page and retry after one second
        if soup.title.text.find('Service Temporarily') > -1:
            index_list.append(index)
            time.sleep(1)
        else:
            crawPage(index, push_rate)
        time.sleep(0.05)

    total = len(article_list)
    count = 0
    # Visit each article and parse its content
    while article_list:
        article = article_list.pop(0)
        res = rs.get(article[1], verify=False)
        soup = BeautifulSoup(res.text, 'html.parser')
        # If the server is busy, re-queue the article and retry after one second
        if soup.title.text.find('Service Temporarily') > -1:
            article_list.append(article)
            time.sleep(1)
        else:
            count += 1
            # rate = article[0], url = article[1], title = article[2]
            # store_pic(CrawlerTime, url, rate="", title="" ):
            download_beauty.store_pic(CrawlerTime, article[1], str(article[0]),
                                      article[2])
            print u"download: " + str(100 * count / total) + " %."
        time.sleep(0.05)

    print u"下載完畢..."
    print u"execution time:" + str(time.time() - start_time) + "s"
Example #5
                try:
                    url = 'https://www.ptt.cc/' + url_regex.search(each_data).group(1)
                    article_list.append((rate, url))
                except Exception as err:
                    pass


if __name__ == '__main__':

    start_page, page_term, push_rate = int(sys.argv[1]), int(sys.argv[2]), int(
        sys.argv[3])
    if start_page < 0:
        start_page = getFirstPage()

    print "解析下載網頁面,統計數量中..."

    article_list = []
    for page in range(start_page, start_page - page_term, -1):
        page_url = 'https://www.ptt.cc/bbs/Beauty/index' + str(page) + '.html'
        crawPage(page_url, article_list, push_rate)

    print "即將開始下載圖片, 請再等一下下 ^_^"

    total = len(article_list)
    count = 0
    for hot_rate, article in article_list:
        download_beauty.store_pic(article, str(hot_rate))
        count += 1
        print "已經下載: " + str(100 * count / total) + " %."

    print "即將下載完畢,滿滿的正妹圖就要入袋拉!"
    