def task(self):
    """Scrape the jandan.net "girl" page and store image links not seen before.

    Downloads the listing page, reads the ``href`` of every
    ``<a class="view_img_link">`` element, and inserts a ``JiandanImage`` row
    with status ``"new"`` for each URL that has no matching row yet. Each new
    row is committed immediately, and the number of inserted rows is logged.

    The scoped session is released in a ``finally`` block so that a failed
    download, parse, query, or commit does not leave the session open.
    """
    self.session = scoped_session(SessionFactory)
    try:
        html = download_html("http://jandan.net/girl")
        soup = BeautifulSoup(html, features="html.parser")
        count = 0
        for _image in soup.find_all("a", class_="view_img_link"):
            # NOTE(review): the href is presumably protocol-relative
            # ("//host/path"), which is why only the scheme is prepended.
            url = "http:{}".format(_image["href"])
            already_stored = self.session.query(JiandanImage).filter(
                JiandanImage.url == url).count() != 0
            if already_stored:
                continue
            self.session.add(
                JiandanImage(url=url, status="new", date=datetime.now()))
            self.session.commit()
            count += 1
        self.logger.info('抓取图片: {} 张'.format(count))
    finally:
        # Always hand the session back, even when scraping fails part-way.
        self.session.remove()
def task(self):
    """Scrape the lvv2.com NSFW listing and store threads not seen before.

    Downloads the listing page with ``proxy=True``. For every
    ``<div class="link show">`` element it reads the thumbnail link's
    ``href``, the title link's text and the ``<h4>`` text, then inserts an
    ``LVV2Thread`` row with status ``"new"`` when no row with that URL exists.
    Each new row is committed immediately, and the number of inserted rows is
    logged.

    The scoped session is released in a ``finally`` block so that a failed
    download, parse, query, or commit does not leave the session open.
    """
    self.session = scoped_session(SessionFactory)
    try:
        html = download_html("https://lvv2.com/nsfw", proxy=True)
        soup = BeautifulSoup(html, features="html.parser")
        count = 0
        for _thread in soup.find_all("div", class_="link show"):
            url = _thread.find("a", class_="thumbnail")["href"]
            title = _thread.find("a", class_="title").text
            tag = _thread.find("h4").text
            already_stored = self.session.query(LVV2Thread).filter(
                LVV2Thread.url == url).count() != 0
            if already_stored:
                continue
            self.session.add(
                LVV2Thread(url=url, status="new", tag=tag, title=title,
                           date=datetime.now()))
            self.session.commit()
            count += 1
        self.logger.info('抓取数据: {} 条'.format(count))
    finally:
        # Always hand the session back, even when scraping fails part-way.
        self.session.remove()
def task(self):
    """Collect image URLs from every LVV2 thread whose status is ``"new"``.

    For each such ``LVV2Thread`` row the thread page is downloaded with
    ``proxy=True`` and the ``data-echo`` attribute of every
    ``<img class="lazy detailImg">`` element is read. An ``LVV2Image`` row
    (status ``"new"``, dated with the thread's date) is inserted when no row
    with the same URL and thread id exists. Storing an image also sets the
    thread's status to ``"download"``. Each new row is committed immediately,
    and the number of inserted image rows is logged.

    The scoped session is released in a ``finally`` block so that a failed
    download, parse, query, or commit does not leave the session open.
    """
    self.session = scoped_session(SessionFactory)
    try:
        threads = self.session.query(LVV2Thread).filter(
            LVV2Thread.status == "new").all()
        count = 0
        for _thread in threads:
            html = download_html(_thread.url, proxy=True)
            soup = BeautifulSoup(html, features="html.parser")
            for _image in soup.find_all("img", "lazy detailImg"):
                url = _image["data-echo"]
                already_stored = self.session.query(LVV2Image).filter(
                    LVV2Image.url == url,
                    LVV2Image.thread_id == _thread.id).count() != 0
                if already_stored:
                    continue
                self.session.add(
                    LVV2Image(url=url, status="new", thread_id=_thread.id,
                              date=_thread.date))
                # NOTE(review): the thread is only marked once a new image is
                # stored, so a thread yielding no new images stays "new" and
                # is fetched again on the next run -- confirm this is intended.
                _thread.status = "download"
                self.session.commit()
                count += 1
        self.logger.info('抓取图片: {} 张'.format(count))
    finally:
        # Always hand the session back, even when scraping fails part-way.
        self.session.remove()
# -*- coding:utf-8 -*-
"""
Download thread HTML pages directly (without login) from the board.

The script downloads page 1 of every thread whose tid appears in
``tids_list``. The list is unpickled from
``<path>/pickle/tids_from_thread_<start_tid>.p``.
"""
import pickle
from time import sleep

from config import path, sleeptime, start_tid
from utils import download_html, generate_thread_url

# pickle.load() needs an open binary file object, not a path string.
# NOTE: unpickling can execute arbitrary code; only load files this project
# produced itself.
tids_path = "%s/pickle/tids_from_thread_%s.p" % (path, start_tid)
with open(tids_path, "rb") as tids_file:
    tids_list = pickle.load(tids_file)

for tid in tids_list:
    try:
        # Optional pause between requests; skipped when sleeptime is falsy.
        if sleeptime:
            print("sleeping...")
            sleep(sleeptime)
        print("downloading:", tid)
        download_html(generate_thread_url(tid, 1))
    except Exception as inst:
        # Best effort: report the failure and carry on with the next thread.
        print("There is an error:")
        print(type(inst), inst.args)
def download_first_page():
    """Announce and download page 1 of the thread identified by ``start_tid``."""
    print('Downloading the first page...')
    first_page_url = generate_thread_url(start_tid, 1)
    download_html(first_page_url)
# -*- coding:utf-8 -*-
'''
Download thread HTML pages directly (without login) from the board.

The script downloads page 1 of every thread whose tid appears in
``tids_list``. The list is unpickled from
``<path>/pickle/tids_from_thread_<start_tid>.p``.
'''
import pickle
from time import sleep

from config import path, sleeptime, start_tid
from utils import download_html, generate_thread_url

# pickle.load() needs an open binary file object, not a path string.
# NOTE: unpickling can execute arbitrary code; only load files this project
# produced itself.
tids_path = '%s/pickle/tids_from_thread_%s.p' % (path, start_tid)
with open(tids_path, 'rb') as tids_file:
    tids_list = pickle.load(tids_file)

for tid in tids_list:
    try:
        # Optional pause between requests; skipped when sleeptime is falsy.
        if sleeptime:
            print('sleeping...')
            sleep(sleeptime)
        print('downloading:', tid)
        download_html(generate_thread_url(tid, 1))
    except Exception as inst:
        # Best effort: report the failure and carry on with the next thread.
        print('There is an error:')
        print(type(inst), inst.args)
def download_first_page():
    """Print a progress message, then fetch page 1 of the ``start_tid`` thread."""
    print('Downloading the first page...')
    page_one_url = generate_thread_url(start_tid, 1)
    download_html(page_one_url)