Esempio n. 1
0
    def __init__(self, task_list):
        """Initialise the URL pools, result containers and thread pool.

        Args:
            task_list: iterable of task URLs; duplicates collapse because
                the values are stored in a set.
        """
        pending_tasks = set(task_list)

        # URL pools.
        self.task_set = pending_tasks
        self.work_set = set()  # pool of URLs waiting to be fetched
        # Independent copy, so it can shrink without touching task_set.
        self.info_url_set = set(pending_tasks)

        # Result containers, filled in by other methods.
        self.answer_list = []
        self.question_list = []
        self.info_list = []
        self.extra_index_list = []

        # presumably MAXTHREAD is the worker-thread limit -- TODO confirm
        self.thread_pool = ThreadPool(SettingClass.MAXTHREAD)

        self.add_property()  # add extension properties
        HttpBaseClass.set_cookie()
Esempio n. 2
0
    def __init__(self, task_list):
        """Prepare the crawler state for the given tasks.

        Args:
            task_list: iterable of task URLs. It is converted to a set, so
                repeated entries are kept only once.
        """
        self.task_set = set(task_list)
        # Separate set with the same members as task_set.
        self.info_url_set = set(self.task_set)
        self.work_set = set()  # pool of URLs waiting to be fetched

        # Empty accumulators for results.
        self.question_list = []
        self.answer_list = []
        self.extra_index_list = []
        self.info_list = []

        # NOTE(review): MAXTHREAD looks like the pool size -- confirm
        # against ThreadPool's signature.
        self.thread_pool = ThreadPool(SettingClass.MAXTHREAD)

        self.add_property()  # add extension properties
        HttpBaseClass.set_cookie()
Esempio n. 3
0
 def worker(self, target_url):
     """Fetch one URL and hand its content to ``parse_content``.

     When the fetch yields a falsy result the URL is left in
     ``work_set`` and nothing is parsed.

     Args:
         target_url: URL to fetch.
     """
     page = HttpBaseClass.get_http_content(target_url, timeout=SettingClass.WAITFOR_HTML)
     if page:
         # Drop the URL from the pending pool before parsing.
         self.work_set.discard(target_url)
         self.parse_content(page)
Esempio n. 4
0
 def worker(self, target_url):
     """Download ``target_url`` and parse whatever came back.

     An empty/falsy response ends the call early; the URL then stays in
     ``work_set``.

     Args:
         target_url: URL to fetch.
     """
     wait_limit = SettingClass.WAITFOR_HTML
     fetched = HttpBaseClass.get_http_content(target_url, timeout=wait_limit)
     if not fetched:
         return None
     # The URL is removed from the pending pool before the content is parsed.
     self.work_set.discard(target_url)
     self.parse_content(fetched)
     return None
Esempio n. 5
0
 def catch_info(self, target_url):
     """Fetch the ``/top-answers`` page of a URL and record its extra info.

     On a falsy response nothing is recorded and the URL stays in
     ``info_url_set``.

     Args:
         target_url: base URL; ``'/top-answers'`` is appended to it.
     """
     html = HttpBaseClass.get_http_content(target_url + '/top-answers', timeout=SettingClass.WAITFOR_HTML)
     if html:
         self.info_url_set.discard(target_url)
         extra_info = TopicParser(html).get_extra_info()
         self.info_list.append(extra_info)
Esempio n. 6
0
 def catch_info(self, target_url):
     """Collect extra info from the ``/top-answers`` page of ``target_url``.

     The parsed result of ``TopicParser.get_extra_info`` is appended to
     ``info_list``; a falsy fetch result leaves all state untouched.

     Args:
         target_url: base URL; ``'/top-answers'`` is appended to it.
     """
     info_url = target_url + '/top-answers'
     page = HttpBaseClass.get_http_content(info_url, timeout=SettingClass.WAITFOR_HTML)
     if not page:
         return None
     # Fetch succeeded, so this URL no longer needs its info collected.
     self.info_url_set.discard(target_url)
     topic_parser = TopicParser(page)
     self.info_list.append(topic_parser.get_extra_info())
     return None
 def download(self, image):
     """Fetch one image and write it under ``self.save_path``.

     Args:
         image: mapping with the keys ``'filename'`` (name of the file to
             write) and ``'href'`` (URL to fetch).

     A falsy fetch result aborts the call before anything is written.
     """
     # Read both keys up front so a missing key fails before any fetch.
     filename = image['filename']
     href = image['href']

     payload = HttpBaseClass.get_http_content(url=href, timeout=SettingClass.WAITFOR_PIC)
     if not payload:
         return

     destination = self.save_path + '/' + filename
     # Distinct handle name: avoids rebinding the ``image`` parameter.
     with open(destination, 'wb') as out_file:
         out_file.write(payload)

     # NOTE(review): presumably removes href from a pending-download
     # collection -- confirm against self.delete.
     self.delete(href)
Esempio n. 8
0
 def create_work_set(self, target_url):
     """Expand one task URL into per-page URLs stored in ``work_set``.

     The task URL is fetched, ``parse_max_page`` is run on the response,
     and one URL per page index is added. A falsy response leaves the
     task in ``task_set`` and adds nothing.

     Args:
         target_url: task URL to expand.
     """
     first_page = HttpBaseClass.get_http_content(target_url, timeout=SettingClass.WAITFOR_HTML)
     if not first_page:
         return
     self.task_set.discard(target_url)
     page_count = self.parse_max_page(first_page)
     # NOTE(review): indices run 0 .. page_count - 1; confirm the site's
     # pagination is not 1-based.
     self.work_set.update(
         '{}?nr=1&sort=created&page={}'.format(target_url, index)
         for index in range(page_count)
     )
Esempio n. 9
0
 def create_work_set(self, target_url):
     """Queue every vote-ordered answers page of ``target_url``.

     Fetches ``<target_url>/answers?order_by=vote_num``, derives the page
     count with ``parse_max_page`` and adds one paged URL per index to
     ``work_set``. A falsy response leaves all state untouched.

     Args:
         target_url: task URL to expand.
     """
     listing = HttpBaseClass.get_http_content(target_url + '/answers?order_by=vote_num', timeout=SettingClass.WAITFOR_HTML)
     if not listing:
         return
     self.task_set.discard(target_url)
     total_pages = self.parse_max_page(listing)
     # NOTE(review): indices run 0 .. total_pages - 1; confirm the site's
     # pagination is not 1-based.
     self.work_set.update(
         '{}/answers?order_by=vote_num&page={}'.format(target_url, number)
         for number in range(total_pages)
     )
Esempio n. 10
0
 def create_work_set(self, target_url):
     """Turn a task URL into the set of its paged URLs.

     On a successful fetch the task is removed from ``task_set`` and one
     URL per page index (from ``parse_max_page``) is added to
     ``work_set``. A falsy response leaves everything unchanged.

     Args:
         target_url: task URL to expand.
     """
     response = HttpBaseClass.get_http_content(target_url, timeout=SettingClass.WAITFOR_HTML)
     if response:
         self.task_set.discard(target_url)
         page_template = '{}?nr=1&sort=created&page={}'
         # NOTE(review): numbering starts at 0; confirm the site's
         # pagination is not 1-based.
         for page_no in range(self.parse_max_page(response)):
             self.work_set.add(page_template.format(target_url, page_no))
Esempio n. 11
0
 def create_work_set(self, target_url):
     """Build the paged answers URLs for one task and add them to ``work_set``.

     The vote-ordered answers listing of ``target_url`` is fetched first.
     If the response is falsy the task stays in ``task_set`` and no URLs
     are added.

     Args:
         target_url: task URL to expand.
     """
     answers_url = target_url + '/answers?order_by=vote_num'
     body = HttpBaseClass.get_http_content(answers_url, timeout=SettingClass.WAITFOR_HTML)
     if body:
         self.task_set.discard(target_url)
         paged_template = '{}/answers?order_by=vote_num&page={}'
         # NOTE(review): numbering starts at 0; confirm the site's
         # pagination is not 1-based.
         for page_no in range(self.parse_max_page(body)):
             self.work_set.add(paged_template.format(target_url, page_no))