def get_show_page_info(self, url):
    """Fetch one listing page and return its film entries.

    Each entry carries the absolute detail URL (joined with self.domain),
    the name, the category and the last-update timestamp.
    """
    pattern = '<li><span class="tt"></span><span class="xing_vb4"><img class="new" src="/template/www_api_xin/images/new/49.gif"> <a href="(.*?)" target="_blank">(.*?)<img class="hot" src="/template/www_api_xin/images/hot/hot.gif"></font></a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb6">(.*?)</span></li>'
    rows = Spider().get_info(url, encoding='utf-8', info=pattern)['info']
    base = self.domain
    film_list = []
    for href, name, types, update_time in rows:
        # The scraped href begins with "/"; drop it before joining with the domain.
        film_list.append({'url': base + href[1:],
                          'name': name,
                          'types': types,
                          'update_time': update_time})
    return {'film_list': film_list}
def main():
    """Entry point: configure a job spider for 'python' jobs in Hangzhou and analyse."""
    # Force utf-8 as the default string encoding (Python 2 idiom).
    reload(sys)
    sys.setdefaultencoding('utf8')
    crawler = Spider('python', '杭州')
    crawler.setSalay(5.9, 16, 10.9, 31.0)
    # Companies whose postings are filtered out.
    for company in ('畅唐网络', '中国亿教亿学网'):
        crawler.addShieldCompany(company)
    # Postings must mention one of these keywords (both casings).
    for keyword in ('C++', 'c++'):
        crawler.addContainText(keyword)
    #crawler.addContainText('爬虫')
    crawler.analyse()
def start(self):
    """Validate the chosen paths, then run the keyword spider end to end."""
    # Both a keyword file and an output location must be selected first.
    if not self.filePath.get() or not self.outputPath.get():
        tkinter.messagebox._show("Error", message="please select a keyword file and output path!")
        return
    worker = Spider()
    # NOTE(review): the StringVar itself (not .get()) is handed to readKeyWord —
    # presumably Spider unwraps it; confirm against Spider.readKeyWord.
    worker.readKeyWord(filePath=self.filePath)
    worker.searchKeyWord()
    worker.createResultExcel(outputFilePath=self.outputPath)
    tkinter.messagebox._show("Success", message="已经完成了爬取!")
def crawl(self):
    """Run a timed crawl and persist the link graph even if the crawl fails."""
    try:
        started = time.time()
        # Constructing Spider initialises the project-wide crawl state.
        Spider(self.project_name, self.home_page, self.domain_name, self.save_flag)
        self._create_workers()
        self._crawl()
        finished = time.time()
        print("Time elapsed: ", finished - started)
    finally:
        # Always flush the graph to disk, crash or not.
        Spider.graph.save()
def download(input_file, audio_output='audio_output', video_output='video_output', sources=None):
    """Load a saved spider state and download its audio/video items.

    Parameters:
        input_file: path of the saved state for Spider.load().
        audio_output / video_output: target directories, created if missing.
        sources: optional source filter forwarded to Spider.download().
    """
    # exist_ok=True replaces the isdir()-then-makedirs() pair, which was
    # racy (another process could create the directory between the two calls).
    os.makedirs(video_output, exist_ok=True)
    os.makedirs(audio_output, exist_ok=True)
    spider = Spider()
    spider.load(input_file)
    spider.download(audio_output, video_output, sources)
def get_all_page(self, url):
    """Return the total page count parsed from the "当前:1/N页" pagination text.

    Raises AttributeError if the pagination fragment is not found.
    """
    html = Spider().get_html(url)
    # The original stringified re.findall's list and sliced off "['...']" —
    # fragile (breaks with quotes or multiple matches). Matching the single
    # capture group directly is equivalent for the expected page markup.
    match = re.search(
        '<li><div class="pages" style="margin-bottom:10px;">共.*?条数据 当前:1/(.*?)页 <em>',
        html, re.S)
    return int(match.group(1))
def get_movies(self):
    """Fetch one batch of 20 "hot" feed videos from the Kuaishou API.

    Returns the raw response body, or None when the request raises
    (the exception args are printed).
    """
    feed_url = 'http://101.251.217.216/rest/n/feed/hot?isp=CMCC&mod=lemobile%28le%20x620%29&lon=116.41025&country_code=cn&kpf=ANDROID_PHONE&extId=59942a6c1d534a51844dfda37e92afc3&did=ANDROID_72c3ac6bd3184a67&kpn=KUAISHOU&net=WIFI&app=0&oc=MYAPP%2C1&ud=0&hotfix_ver=&c=MYAPP%2C1&sys=ANDROID_5.1.1&appver=6.1.0.8039&ftt=&language=zh-cn&iuid=&lat=39.916411&did_gt=1560736770695&ver=6.1&max_memory=192&type=7&page=1&coldStart=false&count=20&pv=false&id=23&refreshTimes=7&pcursor=&source=1&needInterestTag=false&client_key=3c2cd3f3&os=android&sig=510e56b366931c4cb008c51ee44664c2'
    try:
        return Spider().get_html(feed_url)
    except Exception as e:
        print(e.args)
def spider(self):
    """Fetch self.url and return it parsed as an lxml HTML tree.

    Returns None when the server answers "Not Found" (page missing);
    other HTTPErrors are swallowed and parsing proceeds on whatever
    content was fetched, matching the original behavior.
    """
    # Send Cache-Control so intermediaries don't hand back a stale copy.
    client = Spider(additional_headers={'Cache-Control': 'max-age=0'})
    try:
        client.fetch(self.url)
    except HTTPError as e:
        # The movie's page does not exist at all.
        if e.msg == 'Not Found':
            return
    # Decode the utf-8 bytes first: lxml mistakes "/" inside escaped
    # sequences for a closing tag when fed raw bytes.
    return etree.HTML(client.content.decode('utf-8'))
def main():
    """Initialise encoding/logging, run the crontab spider, log the time span."""
    init_encoding()
    init_logging("log/debug.log", "log/log.log")
    crawler = Spider()
    stamp = '%Y-%m-%d %H:%M:%S'
    begin_time = time.strftime(stamp, time.localtime(time.time()))
    # spider.run()
    crawler.run_crontab()
    end_time = time.strftime(stamp, time.localtime(time.time()))
    logging.info("-------begin: %s, end: %s--------" % (begin_time, end_time))
def film_search(self, keyword, encoding = None):
    """POST a title search for *keyword* and return the matching films."""
    post_url = 'http://zy.ataoju.com/index.php?m=vod-search'
    payload = {'wd': keyword, 'submit': 'search'}
    pattern = '<li><span class="tt"></span><span class="xing_vb4"><a href="(.*?)" target="_blank">(.*?)</a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb[67]">(.*?)</span></li>'
    result = Spider().post_info(post_url, payload, info=pattern)
    search_list = [
        {'url': self.domain + href,
         'name': name,
         'types': types,
         'update_time': update_time}
        for href, name, types, update_time in result['info']
    ]
    return {'search_list': search_list, 'search_word': keyword, 'host': self.domain}
def get_film_info(self, url, encoding=None):
    """Scrape one film detail page into a metadata dict.

    Returns name/grader/credits fields, the intro text, the poster URL,
    and the play entries split into m3u8 streams vs. "cloud" links.
    NOTE(review): the 'info' pattern embeds literal backslash-space
    sequences (likely remnants of source line continuations) — confirm it
    still matches the live page markup.
    """
    regex = dict(
        intro='<div class="vodplayinfo">(.*?)</div>',
        name='<h2>(.*?)</h2>\s+?<span>(.*?)</span>\s+?<label>(.*?)</label>',
        info='\ <li>别名:<span>(.*?)</span></li>\s+?\ <li>导演:<span>(.*?)</span></li>\s+?\ <li>主演:<span>(.*?)</span></li>\s+?\ <li>类型:<span>(.*?)</span></li>\s+?\ <li class="sm">地区:<span>(.*?)</span></li>\s+?\ <li class="sm">语言:<span>(.*?)</span></li>\s+?\ <li class="sm">上映:<span>(.*?)</span></li>\s+?\ <li class="sm">片长:<span>(.*?)</span></li>\s+?\ <li class="sm">更新:<span>(.*?)</span></li>\s+?\ <li class="sm">总播放量:<span><em id="hits">.*?</script></span></li>\s+?\ <li class="sm">今日播放量:<span>(.*?)</span></li>\s+?\ <li class="sm">总评分数:<span>(.*?)</span></li>\s+?\ <li class="sm">评分次数:<span>(.*?)</span></li>',
        show_list='target=_blank>(.*?)</a> ',
        imgurl='<img class="lazy" src="(.*?)" alt=".*?" />')
    info = Spider().get_info(url, encoding=encoding, **regex)
    # Comma-separated credit fields are split into lists by split_info.
    director = self.split_info(info['info'][0][1])
    actor = self.split_info(info['info'][0][2])
    types = self.split_info(info['info'][0][3])
    area = self.split_info(info['info'][0][4])
    language = self.split_info(info['info'][0][5])
    film_info = dict(
        name=info['name'][0][0],
        name_info=info['name'][0][1],
        grader=info['name'][0][2],
        athour_name=info['info'][0][0],
        director=director,
        actor=actor,
        types=types,
        area=area,
        language=language,
        show_time=info['info'][0][6],
        lens=info['info'][0][7],
        up_date=info['info'][0][8],
        day_palys=info['info'][0][9],
        total_score=info['info'][0][10],
        total_score_number=info['info'][0][11],
        intro=info['intro'][0],
        # Each show_list entry is "label$url"; entries ending in index.m3u8
        # are direct streams, the rest go to the "cloud" player list.
        m3u8_list=[ i.split('$') for i in info['show_list'] if i.endswith('index.m3u8') ],
        yun_list=[ i.split('$') for i in info['show_list'] if not i.endswith('index.m3u8') ],
        imgurl=info['imgurl'][0])
    return film_info
def multi_thread(self):
    """Crawl all task URLs in LOOP_NUM batches using a 64-thread pool."""
    # Pull the full URL list for this job.
    task = Task()
    url_list = task.get_task()
    worker = Spider()
    for _ in range(LOOP_NUM):
        print('开始下一循环', end='\n' * 3)
        batch = task.get_urls(url_list)
        # map() fans the batch out across the pool; leaving the with-block
        # joins all workers before the next loop iteration.
        with futures.ThreadPoolExecutor(64) as pool:
            pool.map(worker.get_resp, batch)
def get_comment(self, args):
    """Fetch one page of comments for a video.

    args is (photoId, user_id, sig) for the first page, or
    (photoId, user_id, pcursor, sig) for subsequent pages; any other
    length returns None.
    """
    base = 'http://api.gifshow.com/rest/n/comment/list/v2?isp=CMCC&mod=vivo%28vivo%20x5m%29&lon=116.41025&country_code=cn&kpf=ANDROID_PHONE&did=ANDROID_d1f47e9473209293&kpn=KUAISHOU&net=WIFI&app=0&oc=MYAPP%2C1&ud=0&hotfix_ver=&c=MYAPP%2C1&sys=ANDROID_5.1.1&appver=6.1.0.8039&ftt=&language=zh-cn&iuid=&lat=39.916411&did_gt=1560817030537&ver=6.1&retryTimes=1&max_memory=192'
    if len(args) == 3:
        # First batch of comments for the photo.
        tail = '&photoId={}&user_id={}&order=desc&count=10&photoPageType=0&client_key=3c2cd3f3&os=android&sig={}'.format(
            args[0], args[1], args[2])
        return Spider().get_html(base + tail)
    if len(args) == 4:
        # Follow-up batch, positioned by the pcursor token.
        tail = '&photoId={}&user_id={}&order=desc&pcursor={}&count=10&photoPageType=0&client_key=3c2cd3f3&os=android&sig={}'.format(
            args[0], args[1], args[2], args[3])
        return Spider().get_html(base + tail)
def get_show_page_info(self,url):
    '''
    Scrape one 172zy listing page into film dicts.

    url example: 'https://www.subo8988.com/?m=vod-type-id-13.html'
    return: {
        film_list:[
            {
                url: 'https://www.subo8988.com/?m=vod-detail-id-25401.html'
                name: '宝宝来了[国语版] 20集全/已完结'
                types: '香港剧'
                update_time:'2019-05-27'
            }
            ...
        ]
    }
    '''
    # NOTE(review): the pattern contains literal backslash-space sequences,
    # likely remnants of source line continuations — confirm it still matches
    # the live markup.
    regex = dict(
        info = '\ <li><span class="tt"></span><span class="xing_vb4"> <a href="(.*?)" target="_blank">(.*?)<font color="#FF0000">(.*?)</font>\s+?\ <img class="new" src="/template/172zy/images/Henry_huo.gif"></span><span class="xing_vb51">(.*?)</span><span class="xing_vb54">(.*?)</span>.*?<span class="xing_vb52">(.*?)</span>.*?<span class="xing_vb53"><font color="#f60">(.*?)</font></span>.*?<span class="xing_vb[67]">(.*?)</span></a></li>'
    )
    info = Spider().get_info(url,encoding = 'utf-8', **regex)['info']
    # Detail links on the page are site-relative; join against the 172zy host.
    ls = 'http://www.172zy.net'
    # name keeps only the text before the first space; the episode/state part
    # is captured separately as update_total.
    info = [dict(url = ls + i[0], name = i[1].split(' ')[0], update_total = i[2], types = i[3], show_time = i[4], area = i[5], pingfen=i[6], update_time = i[7]) for i in info]
    return {'film_list': info}
def get_film_info(self, url, encoding=None):
    """Scrape one film detail page into a metadata dict.

    Returns the poster URL, title/score fields, credit lists, the intro
    text, and the play entries split into .m3u8 streams vs. "cloud" links.
    NOTE(review): the 'info' pattern embeds literal backslash-space
    sequences (likely remnants of source line continuations) — confirm it
    still matches the live page markup.
    """
    regex = dict(
        imgurl= '<div class="vodImg"><img class="lazy" src="(.*?)" alt=".*?" />',
        name='<h2>(.*?)</h2>',
        score='<div class="vodh"><h2>.*?</h2><span>(.*?)</span>',
        pingfen='<label>(.*?)</label>',
        info='\ <li>别名:<span>(.*?)</span></li>\ <li>导演:<span>(.*?)</span></li>\ <li>主演:<span>(.*?)</span></li>\ <li>类型:<span>(.*?)</span></li>\ <li>地区:<span>(.*?)</span></li>\ <li>语言:<span>(.*?)</span></li>\ <li>上映:<span>(.*?)</span></li>\ <li>更新:<span>(.*?)</span></li>',
        infro= '<strong>剧情介绍:</strong></div><div class="vodplayinfo">(.*?)</div>',
        show_list='checked="" />(.*?)</li>')
    two_info = Spider().get_info(url, encoding, **regex)
    imgurl = two_info['imgurl'][0]
    infro = two_info['infro'][0]
    # Comma-separated credit fields are split into lists by split_info.
    director = self.split_info(two_info['info'][0][1])
    actor = self.split_info(two_info['info'][0][2])
    types = self.split_info(two_info['info'][0][3])
    area = self.split_info(two_info['info'][0][4])
    language = self.split_info(two_info['info'][0][5])
    # Each show_list entry is "label$url"; .m3u8 entries are direct streams.
    m3u8_list = [ url.split('$') for url in two_info['show_list'] if url.endswith('.m3u8') ]
    yun_list = [ url.split('$') for url in two_info['show_list'] if not url.endswith('.m3u8') ]
    film_info = dict(imgurl=imgurl,
                     name=two_info['name'][0],
                     name_info=two_info['score'][0],
                     pingfen=two_info['pingfen'][0],
                     author_name=two_info['info'][0][0],
                     director=director,
                     actor=actor,
                     types=types,
                     area=area,
                     language=language,
                     infro=infro,
                     show_time=two_info['info'][0][6],
                     up_date=two_info['info'][0][7],
                     m3u8_list=m3u8_list,
                     yun_list=yun_list)
    return film_info
def main():
    """Crawl every URL listed in msglist.json, skipping those already in
    visited.txt; save scraped images per page title and append each finished
    URL to visited.txt so a rerun can resume. (Python 2: print statements.)"""
    # project dir
    create_dir(ROOT)
    Spider(DEFAULT_HEADERS, DEFAULT_TIMEOUT)
    # Read the url list ({'title', 'url'} items) exported to msglist.json.
    file = open('msglist.json')
    text = file.read()
    file.close()
    urls = json.loads(text)
    urls_visited = []
    if os.path.exists('visited.txt'):
        file = open('visited.txt', 'r')
        for line in file:
            urls_visited.append(line.rstrip())
    # Map url -> title for unvisited pages and enqueue them.
    urlmap = {}
    for item in urls:
        title = item['title']
        url = item['url']
        if url in urls_visited:
            print 'visited', url
            continue
        urlmap[url] = title
        queue.put(url)
    # start crawling; visited.txt is appended and flushed per URL so progress
    # survives a crash.
    file = open('visited.txt', 'a')
    while queue.empty() == False:
        url = queue.get()
        print "crawl ", url
        logging.info('now crawl %s', url)
        Spider.crawl(url)
        print "analyse ", url
        logging.info('now analyse %s', url)
        images = Spider.analyse()
        queue.task_done()
        # NOTE(review): `queue` and `visited` are module-level globals defined
        # elsewhere in this file.
        visited.add(url)
        save(images, urlmap[url])
        file.write(url+'\n')
        file.flush()
    file.close()
    print 'finished'
    logging.info('finished')
def _start_spiders(self): """ Starts all spider processes """ #Start the spider processes self.spiders = [] for i in xrange(self.n_spiders): spider_instance = Spider(i, self.frontier, self.document_store) spider_process = Process(target=spider_instance) spider_process.daemon = True self.spiders.append(spider_process) spider_process.start() print 'Spider all started.'
def attribute(self):
    """Initialise window icon, spider state, thread lock, context menu and clipboard."""
    # Window icon.
    self.window.setWindowIcon(QIcon(RESOURCES_ROOT + "/magnet.ico"))
    # Magnet-link spider plus the thread it will later run on.
    self.spider = Spider()
    self.spider_thread = None
    # Guards spider state shared with the worker thread.
    self.lock = Lock()
    # Right-click context menu attached to the search-output widget.
    self.rightMenu = QMenu(self.window.search_out)
    # System clipboard handle.
    self.clipboard = QApplication.clipboard()
def get_show_page_info(self, url):
    """Scrape one listing page; returns {'film_list': [...]} with the
    page-relative detail URLs left as-is."""
    pattern = '<li><span class="tt"></span><span class="xing_vb4"><img class="new" src="/template/www_api_xin/images/new/49.gif"> <a href="(.*?)">(.*?)<img class="hot" src="/template/www_api_xin/images/hot/hot.gif"></font></a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb6">(.*?)</span></li>'
    rows = Spider().get_info(url, encoding='utf-8', info=pattern)['info']
    films = []
    for href, title, category, updated in rows:
        # Only the text before the first space is the film name.
        films.append(dict(url=href,
                          name=title.split(' ')[0],
                          types=category,
                          update_time=updated))
    return {'film_list': films}
def getFund1():
    """Collect 6-digit fund codes from the southmoney fund index page.

    Returns a list of code strings — the first 6-digit run found in each
    matched table cell; cells without one are skipped.
    """
    url = "http://www.southmoney.com/jijin/jijindaquan/"
    xpath_str = "//tr/td[@class]/a/text()"
    sp = Spider(url=url, parse_str=xpath_str)
    sp()
    res_list = []
    for text in sp.result:
        # Raw string: '\d' in a normal literal is an invalid escape and
        # raises a DeprecationWarning on modern Python.
        found = re.findall(r'\d{6}', text)
        if not found:
            continue
        res_list.append(found[0])
    return res_list
def get_show_page_info(self, url):
    """Scrape one ziyuanpian.net listing page into film dicts with absolute URLs."""
    pattern = '<li><span class="tt"></span><span class="xing_vb4"><a href="(.*?)" target="_blank">(.*?)</a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb[67]">(.*?)</span></li>'
    entries = Spider().get_info(url, encoding='utf-8', info=pattern)['info']
    host = 'http://www.ziyuanpian.net'
    # Only the text before the first space is the film name.
    films = [dict(url=host + path,
                  name=title.split(' ')[0],
                  types=category,
                  update_time=updated)
             for path, title, category, updated in entries]
    return {'film_list': films}
def get_show_page_info(self, url):
    """Scrape one listing page; entry URLs are joined with self.domain."""
    pattern = '<li><span class="tt"></span><span class="xing_vb4"><a href="(.*?)" target="_blank">(.*?)</a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb[67]">(.*?)</span></li>'
    rows = Spider().get_info(url, info=pattern)['info']
    films = []
    for path, title, category, updated in rows:
        films.append({'url': self.domain + path,
                      'name': title,
                      'types': category,
                      'update_time': updated})
    return {'film_list': films}
def get_show_page_info(self, url):
    """Scrape one listing page; entries get absolute URLs via self.domain."""
    pattern = '''<ul><li><span class="tt"></span><span class="xing_vb4"><a href="(.*?)" target="_blank">(.*?)</a></span><span class="xing_vb5">(.*?)</span><span class="xing_vb[67]">(.*?)</span></li></ul>'''
    matches = Spider().get_info(url, href=pattern)['href']
    # Only the text before the first space is the film name.
    film_list = [dict(url=self.domain + path,
                      name=title.split(' ')[0],
                      types=category,
                      update_time=updated)
                 for path, title, category, updated in matches]
    return {'film_list': film_list}
def main():
    """Scrape selected jjwxc book-list pages and dump them to book_list.txt."""
    target_page = [1, 2, 150, 300, 450]
    base = "http://www.jjwxc.net/bookbase_slave.php?booktype=&opt=&page={}&endstr=true&orderstr=4"
    book_list = []
    for page in target_page:
        book_list.extend(get_books(base.format(page)))
        # Be polite to the server: pause between page fetches.
        time.sleep(3)
        print('%d books loaded!' % (len(book_list)))
    spider = Spider()
    # One tab-joined line per book.
    spider.write_list_txt(['\t'.join(row) for row in book_list], 'book_list.txt')
def run():
    """CLI entry: argv = [prog, project_name, homepage, n_threads]."""
    args = sys.argv
    project_name = args[1]
    homepage = args[2]
    number_of_threads = int(args[3])
    domain_name = get_domain_name(homepage)
    queue_file = project_name + '/queue.txt'
    # crawled_file = project_name + '/crawled.txt'
    # Spider's constructor initialises the project-wide crawl state.
    Spider(project_name, homepage, domain_name)
    create_workers(number_of_threads)
    crawl(queue_file)
def __init__(self, master):
    """Build the NBA-teams GUI: east/west division frames plus ranking lists,
    then load team data via Spider/Analyzer and populate every group."""
    self.master = master
    east_group = LabelFrame(master, text='东部')
    east_group.grid(row=0, column=0, padx=5, pady=5)
    west_group = LabelFrame(master, text='西部')
    west_group.grid(row=1, column=0, padx=5, pady=5)
    # Eastern-conference ranking panel.
    east_ranking = LabelFrame(master, text='东部排名')
    east_ranking.grid(row=0, column=1, rowspan=2, padx=5, pady=5, sticky=N)
    self.east_ranking_list = self.creat_teams_ranking_list(east_ranking)
    # Western-conference ranking panel.
    west_ranking = LabelFrame(master, text='西部排名')
    west_ranking.grid(row=0, column=2, rowspan=2, padx=5, pady=5, sticky=N)
    self.west_ranking_list = self.creat_teams_ranking_list(west_ranking)
    # Eastern divisions: Atlantic / Central / Southeast.
    atlantic_group = LabelFrame(east_group, text='大西洋区')
    atlantic_group.grid(row=0, column=0, padx=5, pady=5)
    central_group = LabelFrame(east_group, text='中部区')
    central_group.grid(row=0, column=1, padx=5, pady=5)
    southeast_group = LabelFrame(east_group, text='东南区')
    southeast_group.grid(row=0, column=2, padx=5, pady=5)
    # Western divisions: Pacific / Southwest / Northwest.
    pacific_group = LabelFrame(west_group, text='太平洋区')
    pacific_group.grid(row=1, column=0, padx=5, pady=5)
    southwest_group = LabelFrame(west_group, text='西南区')
    southwest_group.grid(row=1, column=1, padx=5, pady=5)
    northwest_group = LabelFrame(west_group, text='西北区')
    northwest_group.grid(row=1, column=2, padx=5, pady=5)
    # Fetch raw data and analyse it into per-team records and rankings.
    spider = Spider()
    index_data = spider.load_teams_index()
    teams_ranking_data = spider.load_teams_ranking()
    analyzer = Analyzer()
    teams_data = analyzer.analyze_teams_data(index_data)
    self.teams_ranking = analyzer.analyze_teams_ranking(teams_ranking_data)
    self.load_teams_ranking()
    self.teams_logo = utils.load_teams_logos()
    # teams_data is ordered in 5-team division slices; load each into its frame.
    self.load_group(atlantic_group, teams_data[0:5])
    self.load_group(pacific_group, teams_data[5:10])
    self.load_group(central_group, teams_data[10:15])
    self.load_group(southwest_group, teams_data[15:20])
    self.load_group(southeast_group, teams_data[20:25])
    self.load_group(northwest_group, teams_data[25:30])
def get_show_page_info(self, url):
    '''
    Scrape one rebozy.com listing page into film dicts.

    url example: 'https://www.rebozy.com/index.php/index/index/page/6.html'
    return: {
        film_list:[
            {
                url: 'https://www.subo8988.com/?m=vod-detail-id-25401.html'
                name: '宝宝来了[国语版] 20集全/已完结'
                types: '香港剧'
                update_time:'2019-05-27'
            }
            ...
        ]
    }
    '''
    # NOTE(review): the pattern contains literal backslash-space sequences,
    # likely remnants of source line continuations — confirm it still matches
    # the live markup.
    regex = dict(info='<li class="clearfix">\s+?\ <h3 class="title">\s+?\ <a href="(.*?)" title="(.*?)">.*? <em>(.*?)</em></a>\s+?\ </h3>\s+?\ <span class="type">\s+?\ <a href=".*?">(.*?)</a>\s+?\ </span>\s+?\ <span class="time">\s+?\ (.*?) </span> ')
    info = Spider().get_info(url, encoding='utf-8', **regex)['info']
    # return info
    info = [ dict(url=i[0], name=i[1], state=i[2], types=i[3], update_time=i[4]) for i in info ]
    return {'film_list': info}
def get_show_page_info(self, url):
    """Scrape one listing page; strips each entry's leading "/" before
    joining it with self.domain."""
    pattern = '<li><span class="tt"></span><span class="xing_vb4"><a href="(.*?)" target="_blank">(.*?)</a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb[67]">(.*?)</span></li>'
    rows = Spider().get_info(url, info=pattern)['info']
    base = self.domain
    result = [
        {'url': base + path[1:],
         'name': name,
         'types': types,
         'update_time': updated}
        for path, name, types, updated in rows
    ]
    return {'film_list': result}
def getFund2():
    """Return the codes of all 混合型 (hybrid) funds from eastmoney's fund list JS.

    The endpoint serves rows shaped like ["000001","HXCZ","华夏成长","混合型",...];
    field 0 is the fund code and field 3 the fund type.
    """
    url = "http://fund.eastmoney.com/js/fundcode_search.js"
    # Raw string: '\[' and '\d' in a normal literal are invalid escapes and
    # raise DeprecationWarnings on modern Python.
    re_str = r'\["\d{6}","\w+","\w+","\w+","\w+"\]'
    sp = Spider(url=url, parse_str=re_str, parse_method="re")
    sp()
    res_list = []
    for row in sp.result:
        # Strip the surrounding brackets and quotes, then split the fields.
        fields = row[1:-1].replace('"', '').split(",")
        if fields[3] == "混合型":
            res_list.append(fields[0])
    return res_list
def film_search(self,keyword,encoding=None):
    """POST a title search to 123ku and return the matching films."""
    post_url = 'http://www.123ku.com/index.php?m=vod-search'
    payload = {
        'wd': keyword,
        'submit': 'search',
    }
    pattern = '<li><span class="tt"></span><span class="xing_vb4"><a href="(.*?)" target="_blank">(.*?)</a></span> <span class="xing_vb5">(.*?)</span> <span class="xing_vb6">(.*?)</span></li>'
    base = self.domain
    rows = Spider().post_info(post_url, payload, encoding, info=pattern)['info']
    # The scraped href begins with "/"; drop it before joining with the domain.
    search_list = [
        {'url': base + path[1:],
         'name': name,
         'types': types,
         'update_time': updated}
        for path, name, types, updated in rows
    ]
    return {'search_list': search_list, 'search_word': keyword, 'host': self.domain}