def get_list(self, url):
    """Scrape one pron91 listing page, queue unseen videos, then recurse
    through pagination.

    Workflow: download the page, extract the next-page URL into the module
    global NEXT_URL, parse every '.listchannel' entry into a vinfo dict,
    queue entries that were not collected before (capped at max_spider via
    MAX_RUNSUM), run thread_sum worker threads over QULIST, and finally
    either recurse into NEXT_URL or reset the pagination globals.

    :param url: listing-page URL to scrape.
    :returns: None (results are delivered through the QULIST queue).
    """
    is_collection = False
    global NEXT_URL
    global QULIST
    global MAX_RUNSUM
    html = download_html(url=url)
    # NOTE: the original test `html != '' and len(html) > 0` was redundant;
    # a plain truthiness check is equivalent for a string result.
    if html:
        soup = BeautifulSoup(html, 'lxml')
        # Next-page link: last <a href> inside the paging nav.
        next_html = soup.find('div', class_='pagingnav')
        next_url = '/v.php' + re.findall(
            r'<a href="(.*?)">', str(next_html), re.S)[-1]
        NEXT_URL = pron91_root + unhtml(next_url)
        # Video list entries.
        list_html = soup.find_all('div', class_='listchannel')
        threads = []
        for item in list_html:
            vinfo = {}
            try:
                info = re.findall(
                    r'<img src="(.*?)" title="(.*?)" width="120"/>',
                    str(item), re.S)
                if info:
                    info = info[0]  # (image URL, title)
                    try:
                        # Video duration ("时长: ..." label).
                        time_long = re.findall(
                            r'<span class="info">时长:</span>(.*?)<br/>',
                            str(item), re.S)
                    except Exception as e:
                        if DEBUG:
                            error(
                                "[pron91] 匹配时长Error ==> [spider.py get_list() ({0})]"
                                .format(e))
                        # BUG FIX: the fallback must be a one-element list —
                        # the consumer below indexes time_long[0]; the old
                        # bare-string fallback would yield only its first
                        # character ('暂').
                        time_long = ['暂无时长']
                    finally:
                        # Play-page link.
                        try:
                            link = re.findall(
                                r'<a href="(.*?)" target="blank">',
                                str(item), re.S)
                            if link and time_long and info:
                                vinfo['vkey'] = ikey()
                                vinfo['title'] = info[1]
                                vinfo['en_title'] = en(vinfo['title'])
                                vinfo['images'] = info[0]
                                vinfo['vod_long'] = strip(time_long[0])
                                vinfo['yuan_url'] = unhtml(link[0])
                                vinfo['play_type'] = online_type[1]
                                vinfo['movie_type'] = movie_type[3]
                                vinfo['inputtime'] = getTime()
                                # Queue only URLs not collected before, and
                                # only while under the crawl cap.
                                if not is_url(vinfo['yuan_url'], vod_type[0]):
                                    if MAX_RUNSUM < max_spider:
                                        QULIST.put(vinfo)
                                        MAX_RUNSUM += 1  # crawl-cap counter
                                    else:
                                        is_collection = True
                                else:
                                    is_collection = True
                        except Exception as e:
                            if DEBUG:
                                error(
                                    "[pron91] 播放页匹配Error ==> [spider.py get_list() ({0})]"
                                    .format(e))
            except Exception as e:
                if DEBUG:
                    error(
                        "[pron91] 匹配信息Error ==> [spider.py get_list() ({0})]"
                        .format(e))
        # Start the worker threads.
        for v in range(thread_sum):
            t = Thread(self.pron91_do)
            t.setDaemon(True)
            t.start()
            threads.append(t)
        # Wait for workers.
        for tobj in threads:
            tobj.join()
        # Wait for the queue to drain before anything else.
        QULIST.join()
        # Decide whether to crawl the next page.
        if not is_collection:
            if NEXT_URL != '':
                self.get_list(NEXT_URL)
            else:
                # Reset pagination state.
                MAX_RUNSUM = 0
                NEXT_URL = ''
        else:
            # Reset pagination state.
            MAX_RUNSUM = 0
            NEXT_URL = ''
def vod_list(self, url, mov_type=None, sex_type=None):
    """Crawl one sex8 forum listing page and queue uncollected threads.

    Extracts the next-page link into the global NEXT_URL, queues every
    'normalthread_*' row whose URL has not been collected yet (capped at
    max_spider via MAX_RUNSUM), fans work out to thread_sum workers chosen
    by sex_type, then recurses into the next page when appropriate.
    """
    is_collection = False
    global NEXT_URL
    global QULIST
    global MAX_RUNSUM
    list_html = download_html(url=url)
    if list_html:
        page = BeautifulSoup(list_html, 'lxml')
        # Next-page anchor (class "nxt"); empty string means last page.
        nxt = page.find('a', class_="nxt")
        NEXT_URL = (sex8_root + '/' + nxt['href']) if nxt else ''
        # Every forum-thread row on this page.
        rows = page.find_all(
            'tbody', attrs={'id': re.compile('normalthread_[0-9]*')})
        pool = []
        for row in rows:
            detail = BeautifulSoup(str(row), 'lxml')
            thread_url = detail.find('a')['href']
            if is_url(thread_url, vod_type[1]):
                is_collection = True          # already collected
            elif MAX_RUNSUM >= max_spider:
                is_collection = True          # crawl cap reached
            else:
                # Enqueue a minimal record for the workers.
                QULIST.put({'yuan_url': thread_url, 'movie_type': mov_type})
                MAX_RUNSUM += 1
        # Pick the worker matching sex_type (4 is the fallback).
        worker = {
            1: self.get_vodbt_1,
            2: self.get_vodbt_2,
            3: self.get_vodbt_3,
        }.get(sex_type, self.get_vodbt_4)
        for _ in range(thread_sum):
            t = Thread(worker)
            t.setDaemon(True)
            t.start()
            pool.append(t)
        for t in pool:
            t.join()
        # Drain the queue before deciding about pagination.
        QULIST.join()
        if not is_collection and NEXT_URL != '':
            self.vod_list(url=NEXT_URL, mov_type=mov_type, sex_type=sex_type)
        else:
            # Reset pagination state.
            MAX_RUNSUM = 0
            NEXT_URL = ''
def vod_list(self, url, movie_type=None):
    """Crawl one papax listing page and queue uncollected videos.

    Stores the next-page link in the global NEXT_URL, queues every
    'article.excerpt' entry not seen before (capped at max_spider via
    MAX_RUNSUM), runs thread_sum get_vod workers over QULIST, and
    recurses into the next page when more remains.
    """
    is_collection = False
    global NEXT_URL
    global QULIST
    global MAX_RUNSUM
    list_html = download_html(url=url, result_type='content')
    if list_html:
        page = BeautifulSoup(list_html, 'lxml')
        # Pagination: <li class="next-page"><a href=...>; '' on last page.
        pager = page.find('li', class_="next-page")
        NEXT_URL = (papax_root + pager.a['href']) if pager else ''
        # One <article class="excerpt"> per video.
        articles = page.find_all('article', class_="excerpt")
        pool = []
        for entry in articles:
            card = BeautifulSoup(str(entry), 'lxml')
            record = {'yuan_url': card.a['href']}
            if is_url(record['yuan_url'], vod_type[0]):
                is_collection = True          # already collected
            else:
                record['title'] = card.img['alt']
                record['images'] = papax_root + card.img['src']
                record['movie_type'] = movie_type
                if MAX_RUNSUM < max_spider:
                    QULIST.put(record)
                    MAX_RUNSUM += 1           # crawl-cap counter
                else:
                    is_collection = True      # crawl cap reached
        for _ in range(thread_sum):
            t = Thread(self.get_vod)
            t.setDaemon(True)
            t.start()
            pool.append(t)
        for t in pool:
            t.join()
        # Drain the queue before deciding about pagination.
        QULIST.join()
        if not is_collection and NEXT_URL != '':
            self.vod_list(url=NEXT_URL, movie_type=movie_type)
        else:
            # Reset pagination state.
            MAX_RUNSUM = 0
            NEXT_URL = ''
def vod_list(self, url, mov_type=None):
    """Crawl one av911 listing page and queue uncollected videos.

    Stores the next-page link in the global NEXT_URL, queues every
    'li.thumb item' entry not seen before (capped at max_spider via
    MAX_RUNSUM), runs thread_sum get_vod workers over QULIST, and
    recurses into the next page when more remains.
    """
    is_collection = False
    global NEXT_URL
    global QULIST
    global MAX_RUNSUM
    list_html = download_html(url=url)
    if list_html:
        page = BeautifulSoup(list_html, 'lxml')
        # Pagination anchor (class "pagelink_a"); '' on the last page.
        pager = page.find('a', class_="pagelink_a")
        NEXT_URL = (av911_root + pager['href']) if pager else ''
        # One <li class="thumb item"> per video.
        cards = page.find_all('li', class_="thumb item")
        pool = []
        for card_html in cards:
            card = BeautifulSoup(str(card_html), 'lxml')
            record = {'yuan_url': card.a['href']}
            if is_url(record['yuan_url'], vod_type[0]):
                is_collection = True          # already collected
            else:
                record['title'] = card.find('span', class_="title").get_text()
                record['images'] = card.img['data-original']
                record['movie_type'] = mov_type
                if MAX_RUNSUM < max_spider:
                    QULIST.put(record)
                    MAX_RUNSUM += 1           # crawl-cap counter
                else:
                    is_collection = True      # crawl cap reached
        for _ in range(thread_sum):
            t = Thread(self.get_vod)
            t.setDaemon(True)
            t.start()
            pool.append(t)
        for t in pool:
            t.join()
        # Drain the queue before deciding about pagination.
        QULIST.join()
        if not is_collection and NEXT_URL != '':
            self.vod_list(url=NEXT_URL, mov_type=mov_type)
        else:
            # Reset pagination state.
            MAX_RUNSUM = 0
            NEXT_URL = ''
def vod_list(self, url=None, video_type=None):
    """Crawl one taoyin listing page and queue uncollected video URLs.

    Stores the next-page link in the global NEXT_URL, queues every link
    found inside '#waterfall' list items that was not seen before (capped
    at max_spider via MAX_RUNSUM), fans work out to thread_sum workers
    (get_vod_av when video_type == 'av', otherwise get_vod), and recurses
    into the next page when more remains.
    """
    is_collection = False
    global NEXT_URL
    global QULIST
    global MAX_RUNSUM
    list_html = download_html(url=url)
    # Keep the explicit ''-comparison of the original (None would pass it).
    if list_html != '':
        page = BeautifulSoup(list_html, 'lxml')
        # Pagination anchor (class "nxt"); '' means last page.
        nxt = page.find('a', class_="nxt")
        NEXT_URL = (taoyin_root + '/' + unhtml(nxt['href'])) if nxt else ''
        # Re-parse the waterfall <ul> block and walk its <li> items.
        waterfall = page.find_all('ul', id="waterfall")
        inner = BeautifulSoup(str(waterfall), 'lxml')
        items = inner.find_all('li')
        pool = []
        for item in items:
            card = BeautifulSoup(str(item), 'lxml')
            video_url = card.a['href']
            if is_url(url=video_url, t=vod_type[0]):
                is_collection = True          # already collected
            elif MAX_RUNSUM >= max_spider:
                is_collection = True          # crawl cap reached
            else:
                QULIST.put(video_url)         # queue the raw URL
                MAX_RUNSUM += 1
        # Choose the worker once, by video type.
        worker = self.get_vod_av if video_type == 'av' else self.get_vod
        for _ in range(thread_sum):
            t = Thread(worker)
            t.setDaemon(True)
            t.start()
            pool.append(t)
        for t in pool:
            t.join()
        # Drain the queue before deciding about pagination.
        QULIST.join()
        if not is_collection and NEXT_URL != '':
            self.vod_list(url=NEXT_URL, video_type=video_type)
        else:
            # Reset pagination state.
            MAX_RUNSUM = 0
            NEXT_URL = ''