def get_list(self, url):
    """Scrape one pron91 listing page, queue unseen videos, then recurse
    through pagination.

    Workflow: download the page, extract the next-page URL into the module
    global NEXT_URL, parse every '.listchannel' entry into a vinfo dict,
    queue entries that were not collected before (capped at max_spider via
    MAX_RUNSUM), run thread_sum worker threads over QULIST, and finally
    either recurse into NEXT_URL or reset the pagination globals.

    :param url: listing-page URL to scrape.
    :returns: None (results are delivered through the QULIST queue).
    """
    is_collection = False
    global NEXT_URL
    global QULIST
    global MAX_RUNSUM
    html = download_html(url=url)
    # NOTE: the original test `html != '' and len(html) > 0` was redundant;
    # a plain truthiness check is equivalent for a string result.
    if html:
        soup = BeautifulSoup(html, 'lxml')
        # Next-page link: last <a href> inside the paging nav.
        next_html = soup.find('div', class_='pagingnav')
        next_url = '/v.php' + re.findall(
            r'<a href="(.*?)">', str(next_html), re.S)[-1]
        NEXT_URL = pron91_root + unhtml(next_url)
        # Video list entries.
        list_html = soup.find_all('div', class_='listchannel')
        threads = []
        for item in list_html:
            vinfo = {}
            try:
                info = re.findall(
                    r'<img src="(.*?)" title="(.*?)" width="120"/>',
                    str(item), re.S)
                if info:
                    info = info[0]  # (image URL, title)
                    try:
                        # Video duration ("时长: ..." label).
                        time_long = re.findall(
                            r'<span class="info">时长:</span>(.*?)<br/>',
                            str(item), re.S)
                    except Exception as e:
                        if DEBUG:
                            error(
                                "[pron91] 匹配时长Error ==> [spider.py get_list() ({0})]"
                                .format(e))
                        # BUG FIX: the fallback must be a one-element list —
                        # the consumer below indexes time_long[0]; the old
                        # bare-string fallback would yield only its first
                        # character ('暂').
                        time_long = ['暂无时长']
                    finally:
                        # Play-page link.
                        try:
                            link = re.findall(
                                r'<a href="(.*?)" target="blank">',
                                str(item), re.S)
                            if link and time_long and info:
                                vinfo['vkey'] = ikey()
                                vinfo['title'] = info[1]
                                vinfo['en_title'] = en(vinfo['title'])
                                vinfo['images'] = info[0]
                                vinfo['vod_long'] = strip(time_long[0])
                                vinfo['yuan_url'] = unhtml(link[0])
                                vinfo['play_type'] = online_type[1]
                                vinfo['movie_type'] = movie_type[3]
                                vinfo['inputtime'] = getTime()
                                # Queue only URLs not collected before, and
                                # only while under the crawl cap.
                                if not is_url(vinfo['yuan_url'], vod_type[0]):
                                    if MAX_RUNSUM < max_spider:
                                        QULIST.put(vinfo)
                                        MAX_RUNSUM += 1  # crawl-cap counter
                                    else:
                                        is_collection = True
                                else:
                                    is_collection = True
                        except Exception as e:
                            if DEBUG:
                                error(
                                    "[pron91] 播放页匹配Error ==> [spider.py get_list() ({0})]"
                                    .format(e))
            except Exception as e:
                if DEBUG:
                    error(
                        "[pron91] 匹配信息Error ==> [spider.py get_list() ({0})]"
                        .format(e))
        # Start the worker threads.
        for v in range(thread_sum):
            t = Thread(self.pron91_do)
            t.setDaemon(True)
            t.start()
            threads.append(t)
        # Wait for workers.
        for tobj in threads:
            tobj.join()
        # Wait for the queue to drain before anything else.
        QULIST.join()
        # Decide whether to crawl the next page.
        if not is_collection:
            if NEXT_URL != '':
                self.get_list(NEXT_URL)
            else:
                # Reset pagination state.
                MAX_RUNSUM = 0
                NEXT_URL = ''
        else:
            # Reset pagination state.
            MAX_RUNSUM = 0
            NEXT_URL = ''
def vod_list(self, url, mov_type=None, sex_type=None):
    """Crawl one sex8 forum listing page and queue uncollected threads.

    Extracts the next-page link into the global NEXT_URL, queues every
    'normalthread_*' row whose URL has not been collected yet (capped at
    max_spider via MAX_RUNSUM), fans work out to thread_sum workers chosen
    by sex_type, then recurses into the next page when appropriate.
    """
    is_collection = False
    global NEXT_URL
    global QULIST
    global MAX_RUNSUM
    list_html = download_html(url=url)
    if list_html:
        page = BeautifulSoup(list_html, 'lxml')
        # Next-page anchor (class "nxt"); empty string means last page.
        nxt = page.find('a', class_="nxt")
        NEXT_URL = (sex8_root + '/' + nxt['href']) if nxt else ''
        # Every forum-thread row on this page.
        rows = page.find_all(
            'tbody', attrs={'id': re.compile('normalthread_[0-9]*')})
        pool = []
        for row in rows:
            detail = BeautifulSoup(str(row), 'lxml')
            thread_url = detail.find('a')['href']
            if is_url(thread_url, vod_type[1]):
                is_collection = True          # already collected
            elif MAX_RUNSUM >= max_spider:
                is_collection = True          # crawl cap reached
            else:
                # Enqueue a minimal record for the workers.
                QULIST.put({'yuan_url': thread_url, 'movie_type': mov_type})
                MAX_RUNSUM += 1
        # Pick the worker matching sex_type (4 is the fallback).
        worker = {
            1: self.get_vodbt_1,
            2: self.get_vodbt_2,
            3: self.get_vodbt_3,
        }.get(sex_type, self.get_vodbt_4)
        for _ in range(thread_sum):
            t = Thread(worker)
            t.setDaemon(True)
            t.start()
            pool.append(t)
        for t in pool:
            t.join()
        # Drain the queue before deciding about pagination.
        QULIST.join()
        if not is_collection and NEXT_URL != '':
            self.vod_list(url=NEXT_URL, mov_type=mov_type, sex_type=sex_type)
        else:
            # Reset pagination state.
            MAX_RUNSUM = 0
            NEXT_URL = ''
def vod_list(self, url, movie_type=None):
    """Crawl one papax listing page and queue uncollected videos.

    Stores the next-page link in the global NEXT_URL, queues every
    'article.excerpt' entry not seen before (capped at max_spider via
    MAX_RUNSUM), runs thread_sum get_vod workers over QULIST, and
    recurses into the next page when more remains.
    """
    is_collection = False
    global NEXT_URL
    global QULIST
    global MAX_RUNSUM
    list_html = download_html(url=url, result_type='content')
    if list_html:
        page = BeautifulSoup(list_html, 'lxml')
        # Pagination: <li class="next-page"><a href=...>; '' on last page.
        pager = page.find('li', class_="next-page")
        NEXT_URL = (papax_root + pager.a['href']) if pager else ''
        # One <article class="excerpt"> per video.
        articles = page.find_all('article', class_="excerpt")
        pool = []
        for entry in articles:
            card = BeautifulSoup(str(entry), 'lxml')
            record = {'yuan_url': card.a['href']}
            if is_url(record['yuan_url'], vod_type[0]):
                is_collection = True          # already collected
            else:
                record['title'] = card.img['alt']
                record['images'] = papax_root + card.img['src']
                record['movie_type'] = movie_type
                if MAX_RUNSUM < max_spider:
                    QULIST.put(record)
                    MAX_RUNSUM += 1           # crawl-cap counter
                else:
                    is_collection = True      # crawl cap reached
        for _ in range(thread_sum):
            t = Thread(self.get_vod)
            t.setDaemon(True)
            t.start()
            pool.append(t)
        for t in pool:
            t.join()
        # Drain the queue before deciding about pagination.
        QULIST.join()
        if not is_collection and NEXT_URL != '':
            self.vod_list(url=NEXT_URL, movie_type=movie_type)
        else:
            # Reset pagination state.
            MAX_RUNSUM = 0
            NEXT_URL = ''
def vod_list(self, url, mov_type=None):
    """Crawl one av911 listing page and queue uncollected videos.

    Stores the next-page link in the global NEXT_URL, queues every
    'li.thumb item' entry not seen before (capped at max_spider via
    MAX_RUNSUM), runs thread_sum get_vod workers over QULIST, and
    recurses into the next page when more remains.
    """
    is_collection = False
    global NEXT_URL
    global QULIST
    global MAX_RUNSUM
    list_html = download_html(url=url)
    if list_html:
        page = BeautifulSoup(list_html, 'lxml')
        # Pagination anchor (class "pagelink_a"); '' on the last page.
        pager = page.find('a', class_="pagelink_a")
        NEXT_URL = (av911_root + pager['href']) if pager else ''
        # One <li class="thumb item"> per video.
        cards = page.find_all('li', class_="thumb item")
        pool = []
        for card_html in cards:
            card = BeautifulSoup(str(card_html), 'lxml')
            record = {'yuan_url': card.a['href']}
            if is_url(record['yuan_url'], vod_type[0]):
                is_collection = True          # already collected
            else:
                record['title'] = card.find('span', class_="title").get_text()
                record['images'] = card.img['data-original']
                record['movie_type'] = mov_type
                if MAX_RUNSUM < max_spider:
                    QULIST.put(record)
                    MAX_RUNSUM += 1           # crawl-cap counter
                else:
                    is_collection = True      # crawl cap reached
        for _ in range(thread_sum):
            t = Thread(self.get_vod)
            t.setDaemon(True)
            t.start()
            pool.append(t)
        for t in pool:
            t.join()
        # Drain the queue before deciding about pagination.
        QULIST.join()
        if not is_collection and NEXT_URL != '':
            self.vod_list(url=NEXT_URL, mov_type=mov_type)
        else:
            # Reset pagination state.
            MAX_RUNSUM = 0
            NEXT_URL = ''
def vod_list(self, url=None, video_type=None):
    """Crawl one taoyin listing page and queue uncollected video URLs.

    Stores the next-page link in the global NEXT_URL, queues every link
    found inside '#waterfall' list items that was not seen before (capped
    at max_spider via MAX_RUNSUM), fans work out to thread_sum workers
    (get_vod_av when video_type == 'av', otherwise get_vod), and recurses
    into the next page when more remains.
    """
    is_collection = False
    global NEXT_URL
    global QULIST
    global MAX_RUNSUM
    list_html = download_html(url=url)
    # Keep the explicit ''-comparison of the original (None would pass it).
    if list_html != '':
        page = BeautifulSoup(list_html, 'lxml')
        # Pagination anchor (class "nxt"); '' means last page.
        nxt = page.find('a', class_="nxt")
        NEXT_URL = (taoyin_root + '/' + unhtml(nxt['href'])) if nxt else ''
        # Re-parse the waterfall <ul> block and walk its <li> items.
        waterfall = page.find_all('ul', id="waterfall")
        inner = BeautifulSoup(str(waterfall), 'lxml')
        items = inner.find_all('li')
        pool = []
        for item in items:
            card = BeautifulSoup(str(item), 'lxml')
            video_url = card.a['href']
            if is_url(url=video_url, t=vod_type[0]):
                is_collection = True          # already collected
            elif MAX_RUNSUM >= max_spider:
                is_collection = True          # crawl cap reached
            else:
                QULIST.put(video_url)         # queue the raw URL
                MAX_RUNSUM += 1
        # Choose the worker once, by video type.
        worker = self.get_vod_av if video_type == 'av' else self.get_vod
        for _ in range(thread_sum):
            t = Thread(worker)
            t.setDaemon(True)
            t.start()
            pool.append(t)
        for t in pool:
            t.join()
        # Drain the queue before deciding about pagination.
        QULIST.join()
        if not is_collection and NEXT_URL != '':
            self.vod_list(url=NEXT_URL, video_type=video_type)
        else:
            # Reset pagination state.
            MAX_RUNSUM = 0
            NEXT_URL = ''