Exemple #1
0
 def task(self):
     """Scrape the jandan.net "girl" page and store image links not seen before.

     Downloads ``http://jandan.net/girl``, reads the ``href`` of every
     ``<a class="view_img_link">`` element, prefixes it with ``http:`` and
     adds a ``JiandanImage`` row with status ``"new"`` for each URL that is
     not stored yet. The number of rows added is written to the logger.

     The scoped session is released in a ``finally`` clause so that a
     download, parsing or database error cannot leave it registered.
     """
     self.session = scoped_session(SessionFactory)
     try:
         html = download_html("http://jandan.net/girl")
         soup = BeautifulSoup(html, features="html.parser")
         count = 0
         for _image in soup.find_all("a", class_="view_img_link"):
             # NOTE(review): presumably the hrefs are protocol-relative
             # ("//host/path"), hence the scheme prefix -- confirm.
             url = "http:{}".format(_image["href"])
             already_stored = self.session.query(JiandanImage).filter(
                 JiandanImage.url == url).count() != 0
             if already_stored:
                 continue
             self.session.add(
                 JiandanImage(url=url, status="new", date=datetime.now()))
             # Committed one row at a time, so rows added before a later
             # failure stay persisted.
             self.session.commit()
             count += 1
         self.logger.info('抓取图片: {} 张'.format(count))
     finally:
         # Always hand the session back, even when an exception propagates.
         self.session.remove()
Exemple #2
0
 def task(self):
     """Scrape the lvv2.com NSFW listing and store threads not seen before.

     Downloads ``https://lvv2.com/nsfw`` (with ``proxy=True``) and, for every
     ``<div class="link show">`` element, reads the thumbnail link, the title
     text and the ``<h4>`` tag text. A ``LVV2Thread`` row with status
     ``"new"`` is added for each URL that is not stored yet. The number of
     rows added is written to the logger.

     The scoped session is released in a ``finally`` clause so that a
     download, parsing or database error cannot leave it registered.
     """
     self.session = scoped_session(SessionFactory)
     try:
         html = download_html("https://lvv2.com/nsfw", proxy=True)
         soup = BeautifulSoup(html, features="html.parser")
         count = 0
         for _thread in soup.find_all("div", class_="link show"):
             # Each entry is expected to contain all three nodes; a missing
             # one makes the lookup below raise and aborts the task.
             url = _thread.find("a", class_="thumbnail")["href"]
             title = _thread.find("a", class_="title").text
             tag = _thread.find("h4").text
             already_stored = self.session.query(LVV2Thread).filter(
                 LVV2Thread.url == url).count() != 0
             if already_stored:
                 continue
             self.session.add(LVV2Thread(url=url, status="new", tag=tag,
                                         title=title, date=datetime.now()))
             # Committed one row at a time, so rows added before a later
             # failure stay persisted.
             self.session.commit()
             count += 1
         self.logger.info('抓取数据: {} 条'.format(count))
     finally:
         # Always hand the session back, even when an exception propagates.
         self.session.remove()
Exemple #3
0
 def task(self):
     """Collect image URLs for every ``LVV2Thread`` whose status is ``"new"``.

     For each such thread the page at ``thread.url`` is downloaded (with
     ``proxy=True``) and every ``<img class="lazy detailImg">`` element is
     inspected. Its ``data-echo`` attribute is used as the image URL, and a
     ``LVV2Image`` row with status ``"new"`` is added when that URL is not
     yet stored for the thread. The number of rows added is written to the
     logger.

     The scoped session is released in a ``finally`` clause so that a
     download, parsing or database error cannot leave it registered.
     """
     self.session = scoped_session(SessionFactory)
     try:
         threads = self.session.query(LVV2Thread).filter(
             LVV2Thread.status == "new").all()
         count = 0
         for _thread in threads:
             html = download_html(_thread.url, proxy=True)
             soup = BeautifulSoup(html, features="html.parser")
             for _image in soup.find_all("img", "lazy detailImg"):
                 url = _image["data-echo"]
                 already_stored = self.session.query(LVV2Image).filter(
                     LVV2Image.url == url,
                     LVV2Image.thread_id == _thread.id).count() != 0
                 if already_stored:
                     continue
                 self.session.add(
                     LVV2Image(url=url,
                               status="new",
                               thread_id=_thread.id,
                               date=_thread.date))
                 # NOTE(review): the status only changes when at least one
                 # new image is found, so a thread without new images stays
                 # "new" and is fetched again on the next run -- confirm
                 # this is intended.
                 _thread.status = "download"
                 self.session.commit()
                 count += 1
         self.logger.info('抓取图片: {} 张'.format(count))
     finally:
         # Always hand the session back, even when an exception propagates.
         self.session.remove()
# -*- coding:utf-8 -*-

""" Used to download html files directly (without login) from the board.
    This program will download threads with tid in the tids_list.
    tids list is loaded from ./pickle folder """

import pickle
from time import sleep

from config import path, sleeptime, start_tid
from utils import download_html, generate_thread_url

# pickle.load() needs a binary file object, not a path string, so the file is
# opened explicitly and closed again by the context manager.
# NOTE: unpickling can execute arbitrary code; only load files that this
# project wrote itself.
with open("%s/pickle/tids_from_thread_%s.p" % (path, start_tid), "rb") as tids_file:
    tids_list = pickle.load(tids_file)

for tid in tids_list:
    try:
        # Optional pause between requests, controlled by config.sleeptime.
        if sleeptime:
            print("sleeping...")
            sleep(sleeptime)
        print("downloading:", tid)
        download_html(generate_thread_url(tid, 1))
    except Exception as inst:
        # Best effort: report the failure and carry on with the next tid.
        print("There is an error:")
        print(type(inst), inst.args)
def download_first_page():
    """Announce and download page 1 of the thread identified by ``start_tid``."""
    print('Downloading the first page...')
    first_page_url = generate_thread_url(start_tid, 1)
    download_html(first_page_url)
Exemple #6
0
# -*- coding:utf-8 -*-
''' Used to download html files directly (without login) from the board.
    This program will download threads with tid in the tids_list.
    tids list is loaded from ./pickle folder '''

import pickle
from time import sleep

from config import path, sleeptime, start_tid
from utils import download_html, generate_thread_url

# pickle.load() needs a binary file object, not a path string, so the file is
# opened explicitly and closed again by the context manager.
# NOTE: unpickling can execute arbitrary code; only load files that this
# project wrote itself.
with open('%s/pickle/tids_from_thread_%s.p' % (path, start_tid), 'rb') as tids_file:
    tids_list = pickle.load(tids_file)

for tid in tids_list:
    try:
        # Optional pause between requests, controlled by config.sleeptime.
        if sleeptime:
            print('sleeping...')
            sleep(sleeptime)
        print('downloading:', tid)
        download_html(generate_thread_url(tid, 1))
    except Exception as inst:
        # Best effort: report the failure and carry on with the next tid.
        print('There is an error:')
        print(type(inst), inst.args)
Exemple #7
0
def download_first_page():
    """Announce and download page 1 of the thread identified by ``start_tid``."""
    print('Downloading the first page...')
    first_page_url = generate_thread_url(start_tid, 1)
    download_html(first_page_url)