Example #1
    def get_show_page_info(self, url):
        regex = dict(
                info = '<li><span class="tt"></span><span class="xing_vb4"><img class="new" src="/template/www_api_xin/images/new/49.gif">&nbsp<a href="(.*?)" target="_blank">(.*?)<img class="hot" src="/template/www_api_xin/images/hot/hot.gif"></font></a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb6">(.*?)</span></li>'
                )
        info = Spider().get_info(url, encoding='utf-8', **regex)['info']
        
        joint_url = self.domain

        info = [{'url': joint_url + url[1:], 'name': name, 'types': types, 'update_time': update_time}
                for url, name, types, update_time in info]
        
        return {'film_list': info}
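A hedged usage sketch of the method above; `FilmSite` stands in for the unnamed class, and the listing URL shape is illustrative only:

site = FilmSite()
page = site.get_show_page_info(site.domain + '?m=vod-type-id-1.html')  # listing URL shape assumed
for film in page['film_list']:
    print(film['update_time'], film['name'])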
Example #2
 def main():
     reload(sys)
     sys.setdefaultencoding('utf8')
     spider = Spider('python', '杭州')
     spider.setSalay(5.9, 16, 10.9, 31.0)
     spider.addShieldCompany('畅唐网络')
     spider.addShieldCompany('中国亿教亿学网')
     spider.addContainText('C++')
     spider.addContainText('c++')
     #spider.addContainText('爬虫')
     spider.analyse()
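reload(sys) and sys.setdefaultencoding are Python 2 only; under Python 3 both calls fail, and no encoding reset is needed because str is already Unicode. A minimal Python 3 sketch of the same setup, assuming Spider and its setters behave exactly as above:

def main():
    spider = Spider('python', '杭州')
    spider.setSalay(5.9, 16, 10.9, 31.0)   # salary bounds; method name kept as-is from the source
    spider.addShieldCompany('畅唐网络')      # companies to filter out
    spider.addContainText('C++')            # keywords a posting must contain
    spider.analyse()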
Example #3
    def start(self):

        if self.filePath.get() == '' or self.outputPath.get() == '':
            tkinter.messagebox.showerror("Error", message="Please select a keyword file and output path!")
            return

        spider = Spider()
        spider.readKeyWord(filePath=self.filePath.get())  # pass the string value, not the tkinter variable
        spider.searchKeyWord()
        spider.createResultExcel(outputFilePath=self.outputPath.get())
        tkinter.messagebox.showinfo("Success", message="Crawl finished!")
 def crawl(self):
     try:
         start = time.time()
         Spider(self.project_name, self.home_page, self.domain_name,
                self.save_flag)  # constructor primes shared Spider state; the instance itself is not kept
         self._create_workers()
         self._crawl()
         end = time.time()
         print("Time elapsed: ", end - start)
     finally:
         Spider.graph.save()
Example #5
def download(input_file,
             audio_output='audio_output',
             video_output='video_output',
             sources=None):
    if not os.path.isdir(video_output):
        os.makedirs(video_output)
    if not os.path.isdir(audio_output):
        os.makedirs(audio_output)
    spider = Spider()
    spider.load(input_file)
    spider.download(audio_output, video_output, sources)
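A hypothetical invocation of download(); the input filename is illustrative, and Spider.load/Spider.download are assumed to behave as used above:

download('links.txt')                      # output folders are created on demand
download('links.txt', sources=['youku'])   # the shape of `sources` is a guess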
Example #6
    def get_all_page(self, url):

        rst = Spider().get_html(url)

        # The pager reads "共N条数据 当前:1/M页"; capture M, the total page count
        matches = re.findall(
            '<li><div class="pages" style="margin-bottom:10px;">共.*?条数据&nbsp;当前:1/(.*?)页&nbsp;<em>',
            rst, re.S)

        return int(matches[0])
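A more defensive variant of the same lookup, returning a fallback when the pager snippet is absent instead of raising IndexError; the shorter anchor text is taken from the pattern above:

import re

def get_page_count(html, default=1):
    # Match the "当前:1/M页" fragment of the pager and return M
    m = re.search(r'当前:1/(\d+)页', html, re.S)
    return int(m.group(1)) if m else default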
Example #7
    def get_movies(self):  # Fetch 20 random hot-feed videos
        try:
            url = 'http://101.251.217.216/rest/n/feed/hot?isp=CMCC&mod=lemobile%28le%20x620%29&lon=116.41025&country_code=cn&kpf=ANDROID_PHONE&extId=59942a6c1d534a51844dfda37e92afc3&did=ANDROID_72c3ac6bd3184a67&kpn=KUAISHOU&net=WIFI&app=0&oc=MYAPP%2C1&ud=0&hotfix_ver=&c=MYAPP%2C1&sys=ANDROID_5.1.1&appver=6.1.0.8039&ftt=&language=zh-cn&iuid=&lat=39.916411&did_gt=1560736770695&ver=6.1&max_memory=192&type=7&page=1&coldStart=false&count=20&pv=false&id=23&refreshTimes=7&pcursor=&source=1&needInterestTag=false&client_key=3c2cd3f3&os=android&sig=510e56b366931c4cb008c51ee44664c2'
            return Spider().get_html(url)
        except Exception as e:
            print(e.args)
Example #8
 def spider(self):
     # Add Cache-Control to the request headers
     s = Spider(additional_headers={'Cache-Control': 'max-age=0'})
     try:
         s.fetch(self.url)
     except HTTPError as e:
         # Check whether a page exists for this movie
         if e.msg == 'Not Found':
             return
     # Decode explicitly: once Chinese text is utf-8 encoded into escapes like '/u2541',
     # lxml treats the '/' as the end of a tag
     return etree.HTML(s.content.decode('utf-8'))
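The method returns either None (page missing) or an lxml element tree, so callers must handle both; a sketch, with `movie` standing in for an instance of the class above:

tree = movie.spider()
if tree is not None:
    title = tree.xpath('//title/text()')   # illustrative lxml XPath query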
Example #9
def main():
    init_encoding()
    init_logging("log/debug.log", "log/log.log")
    spider = Spider()
    begin_time = time.strftime('%Y-%m-%d %H:%M:%S',
                               time.localtime(time.time()))
    # spider.run()
    spider.run_crontab()
    end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    logging.info("-------begin: %s, end: %s--------" % (begin_time, end_time))
    pass
Example #10
 def film_search(self, keyword, encoding=None):
     post_url = 'http://zy.ataoju.com/index.php?m=vod-search'
     data = {
         'wd': keyword,
         'submit': 'search'
     }
     regex = dict(
         info='<li><span class="tt"></span><span class="xing_vb4"><a href="(.*?)" target="_blank">(.*?)</a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb[67]">(.*?)</span></li>'
     )
     info = Spider().post_info(post_url, data, **regex)
     info = [{'url': self.domain + i[0], 'name': i[1], 'types': i[2], 'update_time': i[3]} for i in info['info']]
     return {'search_list': info, 'search_word': keyword, 'host': self.domain}
Example #11
    def get_film_info(self, url, encoding=None):
        regex = dict(
            intro='<div class="vodplayinfo">(.*?)</div>',
            name='<h2>(.*?)</h2>\s+?<span>(.*?)</span>\s+?<label>(.*?)</label>',
            info='\
<li>别名:<span>(.*?)</span></li>\s+?\
<li>导演:<span>(.*?)</span></li>\s+?\
<li>主演:<span>(.*?)</span></li>\s+?\
<li>类型:<span>(.*?)</span></li>\s+?\
<li class="sm">地区:<span>(.*?)</span></li>\s+?\
<li class="sm">语言:<span>(.*?)</span></li>\s+?\
<li class="sm">上映:<span>(.*?)</span></li>\s+?\
<li class="sm">片长:<span>(.*?)</span></li>\s+?\
<li class="sm">更新:<span>(.*?)</span></li>\s+?\
<li class="sm">总播放量:<span><em id="hits">.*?</script></span></li>\s+?\
<li class="sm">今日播放量:<span>(.*?)</span></li>\s+?\
<li class="sm">总评分数:<span>(.*?)</span></li>\s+?\
<li class="sm">评分次数:<span>(.*?)</span></li>',
            show_list='target=_blank>(.*?)</a>&emsp;',
            imgurl='<img class="lazy" src="(.*?)" alt=".*?" />')
        info = Spider().get_info(url, encoding=encoding, **regex)
        director = self.split_info(info['info'][0][1])
        actor = self.split_info(info['info'][0][2])
        types = self.split_info(info['info'][0][3])
        area = self.split_info(info['info'][0][4])
        language = self.split_info(info['info'][0][5])

        film_info = dict(name=info['name'][0][0],
                         name_info=info['name'][0][1],
                         grader=info['name'][0][2],
                         athour_name=info['info'][0][0],
                         director=director,
                         actor=actor,
                         types=types,
                         area=area,
                         language=language,
                         show_time=info['info'][0][6],
                         lens=info['info'][0][7],
                         up_date=info['info'][0][8],
                         day_palys=info['info'][0][9],
                         total_score=info['info'][0][10],
                         total_score_number=info['info'][0][11],
                         intro=info['intro'][0],
                         m3u8_list=[
                             i.split('$') for i in info['show_list']
                             if i.endswith('index.m3u8')
                         ],
                         yun_list=[
                             i.split('$') for i in info['show_list']
                             if not i.endswith('index.m3u8')
                         ],
                         imgurl=info['imgurl'][0])
        return film_info
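split_info is not shown in this example; a plausible sketch, assuming the detail fields hold several values separated by commas, slashes, or whitespace:

import re

def split_info(self, text):
    # Hypothetical helper: break fields like '动作,冒险 / 剧情' into a clean list
    return [part for part in re.split(r'[,,/\s]+', text) if part]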
Example #12
    def multi_thread(self):
        # Fetch the task list
        task = Task()
        url_list = task.get_task()

        # Crawl the data, one batch of URLs per loop
        spider = Spider()
        for i in range(LOOP_NUM):
            print('Starting next batch', end='\n' * 3)
            per_step_urls = task.get_urls(url_list)
            with futures.ThreadPoolExecutor(64) as executor:
                executor.map(spider.get_resp, per_step_urls)
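executor.map returns its results lazily and only re-raises worker exceptions when consumed; a sketch that collects them, assuming spider.get_resp returns a response object or None on failure:

from concurrent import futures

def fetch_all(spider, urls, workers=64):
    with futures.ThreadPoolExecutor(workers) as executor:
        results = list(executor.map(spider.get_resp, urls))  # block until every URL is fetched
    return [r for r in results if r is not None]             # drop failed fetches (assumption)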
Example #13
    def get_comment(self, args):  # Fetch comments for a post
        comment_url = 'http://api.gifshow.com/rest/n/comment/list/v2?isp=CMCC&mod=vivo%28vivo%20x5m%29&lon=116.41025&country_code=cn&kpf=ANDROID_PHONE&did=ANDROID_d1f47e9473209293&kpn=KUAISHOU&net=WIFI&app=0&oc=MYAPP%2C1&ud=0&hotfix_ver=&c=MYAPP%2C1&sys=ANDROID_5.1.1&appver=6.1.0.8039&ftt=&language=zh-cn&iuid=&lat=39.916411&did_gt=1560817030537&ver=6.1&retryTimes=1&max_memory=192'

        if len(args) == 3:  # First page of comments for the post
            sig_r = '&photoId={}&user_id={}&order=desc&count=10&photoPageType=0&client_key=3c2cd3f3&os=android&sig={}'.format(
                args[0], args[1], args[2])
            return Spider().get_html(comment_url + sig_r)

        if len(args) == 4:  # Later pages, selected via the pcursor parameter
            sig_r = '&photoId={}&user_id={}&order=desc&pcursor={}&count=10&photoPageType=0&client_key=3c2cd3f3&os=android&sig={}'.format(
                args[0], args[1], args[2], args[3])
            return Spider().get_html(comment_url + sig_r)
Example #14
    def get_show_page_info(self, url):
        '''
        url: 'https://www.subo8988.com/?m=vod-type-id-13.html'

        return:
            {
            # type_name: '香港剧'
            film_list: [
                {
                url: 'https://www.subo8988.com/?m=vod-detail-id-25401.html'
                name: '宝宝来了[国语版] 20集全/已完结'
                types: '香港剧'
                update_time: '2019-05-27'
                }
                ...
            ]
            }
        '''
        regex = dict(
            info='\
<li><span class="tt"></span><span class="xing_vb4"> <a href="(.*?)" target="_blank">(.*?)<font color="#FF0000">(.*?)</font>\s+?\
<img class="new" src="/template/172zy/images/Henry_huo.gif"></span><span class="xing_vb51">(.*?)</span><span class="xing_vb54">(.*?)</span>.*?<span class="xing_vb52">(.*?)</span>.*?<span class="xing_vb53"><font color="#f60">(.*?)</font></span>.*?<span class="xing_vb[67]">(.*?)</span></a></li>')

        info = Spider().get_info(url, encoding='utf-8', **regex)['info']

        joint_url = 'http://www.172zy.net'

        info = [dict(url=joint_url + i[0],
                     name=i[1].split('&nbsp;')[0],
                     update_total=i[2],
                     types=i[3],
                     show_time=i[4],
                     area=i[5],
                     pingfen=i[6],
                     update_time=i[7]) for i in info]

        return {'film_list': info}
Example #15
    def get_film_info(self, url, encoding=None):
        regex = dict(
            imgurl=
            '<div class="vodImg"><img class="lazy" src="(.*?)" alt=".*?" />',
            name='<h2>(.*?)</h2>',
            score='<div class="vodh"><h2>.*?</h2><span>(.*?)</span>',
            pingfen='<label>(.*?)</label>',
            info='\
<li>别名:<span>(.*?)</span></li>\
<li>导演:<span>(.*?)</span></li>\
<li>主演:<span>(.*?)</span></li>\
<li>类型:<span>(.*?)</span></li>\
<li>地区:<span>(.*?)</span></li>\
<li>语言:<span>(.*?)</span></li>\
<li>上映:<span>(.*?)</span></li>\
<li>更新:<span>(.*?)</span></li>',
            infro=
            '<strong>剧情介绍:</strong></div><div class="vodplayinfo">(.*?)</div>',
            show_list='checked="" />(.*?)</li>')
        two_info = Spider().get_info(url, encoding, **regex)
        imgurl = two_info['imgurl'][0]
        infro = two_info['infro'][0]
        director = self.split_info(two_info['info'][0][1])
        actor = self.split_info(two_info['info'][0][2])
        types = self.split_info(two_info['info'][0][3])
        area = self.split_info(two_info['info'][0][4])
        language = self.split_info(two_info['info'][0][5])
        m3u8_list = [
            url.split('$') for url in two_info['show_list']
            if url.endswith('.m3u8')
        ]
        yun_list = [
            url.split('$') for url in two_info['show_list']
            if not url.endswith('.m3u8')
        ]

        film_info = dict(imgurl=imgurl,
                         name=two_info['name'][0],
                         name_info=two_info['score'][0],
                         pingfen=two_info['pingfen'][0],
                         author_name=two_info['info'][0][0],
                         director=director,
                         actor=actor,
                         types=types,
                         area=area,
                         language=language,
                         infro=infro,
                         show_time=two_info['info'][0][6],
                         up_date=two_info['info'][0][7],
                         m3u8_list=m3u8_list,
                         yun_list=yun_list)
        return film_info
Example #16
def main():
    # project dir
    create_dir(ROOT)

    Spider(DEFAULT_HEADERS, DEFAULT_TIMEOUT)

    # Read the URL list
    file = open('msglist.json')
    text = file.read()
    file.close()
    urls = json.loads(text)

    urls_visited = []
    if os.path.exists('visited.txt'):
        file = open('visited.txt', 'r')
        for line in file:
            urls_visited.append(line.rstrip())

    urlmap = {}
    for item in urls:
        title = item['title']
        url = item['url']
        if url in urls_visited:
            print 'visited', url
            continue

        urlmap[url] = title
        queue.put(url)

    # start crawling
    file = open('visited.txt', 'a')
    while not queue.empty():
        url = queue.get()
        print "crawl ", url
        logging.info('now crawl %s', url)
        Spider.crawl(url)
        print "analyse ", url
        logging.info('now analyse %s', url)
        images = Spider.analyse()

        queue.task_done()

        urls_visited.append(url)  # record in the visited list loaded above

        save(images, urlmap[url])

        file.write(url+'\n')
        file.flush()

    file.close()
    print 'finished'
    logging.info('finished')
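A sketch of the visited-URL bookkeeping above using a context manager; same file layout, but returning a set for O(1) membership tests:

import os

def load_visited(path='visited.txt'):
    # Return the set of URLs already crawled, if the bookkeeping file exists
    if not os.path.exists(path):
        return set()
    with open(path) as f:
        return {line.rstrip() for line in f}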
Example #17
 def _start_spiders(self):
     """
     Starts all spider processes
     """
     #Start the spider processes
     self.spiders = []
     for i in xrange(self.n_spiders):
         spider_instance = Spider(i, self.frontier, self.document_store)
         spider_process = Process(target=spider_instance)
         spider_process.daemon = True
         self.spiders.append(spider_process)
         spider_process.start()
     print 'All spiders started.'
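Note that self.spiders ends up holding the Process objects rather than the Spider instances; a hedged sketch of a matching shutdown helper:

 def _stop_spiders(self):
     """Hypothetical counterpart: wait for every spider process to exit."""
     for spider_process in self.spiders:
         spider_process.join()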
Example #18
 def attribute(self):
     """Set up widget attributes"""
     # Window icon
     self.window.setWindowIcon(QIcon(RESOURCES_ROOT + "/magnet.ico"))
     # Magnet-link spider
     self.spider = Spider()
     self.spider_thread = None
     # Lock guarding the spider thread
     self.lock = Lock()
     # Right-click context menu
     self.rightMenu = QMenu(self.window.search_out)
     # Clipboard
     self.clipboard = QApplication.clipboard()
Example #19
 def get_show_page_info(self, url):
     regex = dict(
         info=
         '<li><span class="tt"></span><span class="xing_vb4"><img class="new" src="/template/www_api_xin/images/new/49.gif">&nbsp<a href="(.*?)">(.*?)<img class="hot" src="/template/www_api_xin/images/hot/hot.gif"></font></a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb6">(.*?)</span></li>'
     )
     info = Spider().get_info(url, encoding='utf-8', **regex)['info']
     info = [
         dict(url=i[0],
              name=i[1].split('&nbsp')[0],
              types=i[2],
              update_time=i[3]) for i in info
     ]
     return {'film_list': info}
Example #20
def getFund1():
    res_list = []
    url = "http://www.southmoney.com/jijin/jijindaquan/"
    xpath_str = "//tr/td[@class]/a/text()"
    sp = Spider(url=url, parse_str=xpath_str)
    sp()
    for i in sp.result:
        ii = re.findall(r'\d{6}', i)  # six-digit fund codes; raw string avoids the invalid-escape warning
        if not ii:
            continue
        res_list.append(ii[0])

    return res_list
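A quick usage sketch; the printed sample is purely illustrative:

codes = getFund1()
print(len(codes), codes[:5])   # how many six-digit fund codes were scraped, plus a sample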
Example #21
 def get_show_page_info(self, url):
     regex = dict(
         info=
         '<li><span class="tt"></span><span class="xing_vb4"><a href="(.*?)" target="_blank">(.*?)</a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb[67]">(.*?)</span></li>'
     )
     info = Spider().get_info(url, encoding='utf-8', **regex)['info']
     info = [
         dict(url='http://www.ziyuanpian.net' + i[0],
              name=i[1].split('&nbsp')[0],
              types=i[2],
              update_time=i[3]) for i in info
     ]
     return {'film_list': info}
Example #22
 def get_show_page_info(self, url):
     regex = dict(
         info=
         '<li><span class="tt"></span><span class="xing_vb4"><a href="(.*?)" target="_blank">(.*?)</a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb[67]">(.*?)</span></li>'
     )
     info = Spider().get_info(url, **regex)['info']
     info = [{
         'url': self.domain + i[0],
         'name': i[1],
         'types': i[2],
         'update_time': i[3]
     } for i in info]
     return {'film_list': info}
Example #23
 def get_show_page_info(self, url):
     regex = dict(
         href=
         '''<ul><li><span class="tt"></span><span class="xing_vb4"><a href="(.*?)" target="_blank">(.*?)</a></span><span class="xing_vb5">(.*?)</span><span class="xing_vb[67]">(.*?)</span></li></ul>'''
     )
     one_info = Spider().get_info(url, **regex)['href']
     one_info = [
         dict(url=self.domain + i[0],
              name=i[1].split(' ')[0],
              types=i[2],
              update_time=i[3]) for i in one_info
     ]
     return {'film_list': one_info}
Example #24
def main():
    target_page = [1, 2, 150, 300, 450]
    base = "http://www.jjwxc.net/bookbase_slave.php?booktype=&opt=&page={}&endstr=true&orderstr=4"
    base_urls = [base.format(x) for x in target_page]
    book_list = []
    for url in base_urls:
        temp = get_books(url)
        time.sleep(3)
        book_list.extend(temp)
        print('%d books loaded!' % (len(book_list)))
    write_list = ['\t'.join(x) for x in book_list]
    spider = Spider()
    spider.write_list_txt(write_list, 'book_list.txt')
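A hedged sketch of reading the output back, assuming write_list_txt writes one tab-joined record per line in UTF-8:

with open('book_list.txt', encoding='utf-8') as f:
    book_list = [line.rstrip('\n').split('\t') for line in f]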
Example #25
def run():
    arguments = sys.argv
    project_name = arguments[1]
    homepage = arguments[2]
    number_of_threads = int(arguments[3])

    domain_name = get_domain_name(homepage)
    queue_file = project_name + '/queue.txt'
    # crawled_file = project_name + '/crawled.txt'

    Spider(project_name, homepage, domain_name)
    create_workers(number_of_threads)
    crawl(queue_file)
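run() takes its configuration from sys.argv, so a hypothetical launch (script name assumed) looks like:

# python crawler.py myproject https://example.com 8
#                   ^project  ^homepage           ^number of threads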
Example #26
    def __init__(self, master):
        self.master = master

        east_group = LabelFrame(master, text='东部')
        east_group.grid(row=0, column=0, padx=5, pady=5)
        west_group = LabelFrame(master, text='西部')
        west_group.grid(row=1, column=0, padx=5, pady=5)

        # Eastern Conference standings
        east_ranking = LabelFrame(master, text='东部排名')
        east_ranking.grid(row=0, column=1, rowspan=2, padx=5, pady=5, sticky=N)
        self.east_ranking_list = self.creat_teams_ranking_list(east_ranking)

        # Western Conference standings
        west_ranking = LabelFrame(master, text='西部排名')
        west_ranking.grid(row=0, column=2, rowspan=2, padx=5, pady=5, sticky=N)
        self.west_ranking_list = self.creat_teams_ranking_list(west_ranking)

        # Eastern Conference divisions
        atlantic_group = LabelFrame(east_group, text='大西洋区')
        atlantic_group.grid(row=0, column=0, padx=5, pady=5)
        central_group = LabelFrame(east_group, text='中部区')
        central_group.grid(row=0, column=1, padx=5, pady=5)
        southeast_group = LabelFrame(east_group, text='东南区')
        southeast_group.grid(row=0, column=2, padx=5, pady=5)

        # Western Conference divisions
        pacific_group = LabelFrame(west_group, text='太平洋区')
        pacific_group.grid(row=1, column=0, padx=5, pady=5)
        southwest_group = LabelFrame(west_group, text='西南区')
        southwest_group.grid(row=1, column=1, padx=5, pady=5)
        northwest_group = LabelFrame(west_group, text='西北区')
        northwest_group.grid(row=1, column=2, padx=5, pady=5)

        spider = Spider()
        index_data = spider.load_teams_index()
        teams_ranking_data = spider.load_teams_ranking()

        analyzer = Analyzer()
        teams_data = analyzer.analyze_teams_data(index_data)
        self.teams_ranking = analyzer.analyze_teams_ranking(teams_ranking_data)

        self.load_teams_ranking()

        self.teams_logo = utils.load_teams_logos()
        self.load_group(atlantic_group, teams_data[0:5])
        self.load_group(pacific_group, teams_data[5:10])
        self.load_group(central_group, teams_data[10:15])
        self.load_group(southwest_group, teams_data[15:20])
        self.load_group(southeast_group, teams_data[20:25])
        self.load_group(northwest_group, teams_data[25:30])
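A hypothetical launcher for this window; `App` stands in for the unnamed class the __init__ above belongs to:

from tkinter import Tk

root = Tk()
app = App(root)    # builds the conference frames and loads team data
root.mainloop()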
Example #27
    def get_show_page_info(self, url):
        '''
        url: 'https://www.rebozy.com/index.php/index/index/page/6.html'

        return:
            {
            # type_name: '香港剧'
            film_list: [
                {
                url: 'https://www.subo8988.com/?m=vod-detail-id-25401.html'
                name: '宝宝来了[国语版] 20集全/已完结'
                types: '香港剧'
                update_time: '2019-05-27'
                }
                ...
            ]
            }
        '''

        regex = dict(info='<li class="clearfix">\s+?\
	<h3 class="title">\s+?\
		<a href="(.*?)" title="(.*?)">.*? <em>(.*?)</em></a>\s+?\
	</h3>\s+?\
	<span class="type">\s+?\
		<a href=".*?">(.*?)</a>\s+?\
	</span>\s+?\
	<span class="time">\s+?\
		(.*?)	</span>	')

        info = Spider().get_info(url, encoding='utf-8', **regex)['info']

        info = [
            dict(url=i[0], name=i[1], state=i[2], types=i[3], update_time=i[4])
            for i in info
        ]

        return {'film_list': info}
Example #28
 def get_show_page_info(self, url):
     regex = dict(
         info=
         '<li><span class="tt"></span><span class="xing_vb4"><a href="(.*?)" target="_blank">(.*?)</a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb[67]">(.*?)</span></li>'
     )
     info = Spider().get_info(url, **regex)['info']
     joint_url = self.domain
     info = [{
         'url': joint_url + url[1:],
         'name': name,
         'types': types,
         'update_time': update_time
     } for url, name, types, update_time in info]
     return {'film_list': info}
Example #29
def getFund2():
    res_list = []
    url = "http://fund.eastmoney.com/js/fundcode_search.js"
    re_str = r'\["\d{6}","\w+","\w+","\w+","\w+"\]'  # raw string avoids invalid-escape warnings
    sp = Spider(url=url, parse_str=re_str, parse_method="re")
    sp()
    for i in sp.result:
        ii = i[1:-1].replace('"', '')
        ii = ii.split(",")

        if ii[3] == "混合型":
            res_list.append(ii[0])

    return res_list
Example #30
 def film_search(self, keyword, encoding=None):
     post_url = 'http://www.123ku.com/index.php?m=vod-search'
     data = {
         'wd': keyword,
         'submit': 'search',
     }
     regex = dict(
         info='<li><span class="tt"></span><span class="xing_vb4"><a href="(.*?)" target="_blank">(.*?)</a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb6">(.*?)</span></li>'
     )
     joint_url = self.domain
     info = Spider().post_info(post_url, data, encoding, **regex)['info']
     info = [{'url': joint_url + url[1:], 'name': name, 'types': types, 'update_time': update_time}
             for url, name, types, update_time in info]
     return {'search_list': info, 'search_word': keyword, 'host': self.domain}
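A hypothetical call; `site` stands in for an instance of the class above, and the keyword is illustrative:

result = site.film_search('keyword')
for film in result['search_list']:
    print(film['name'], film['url'])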