def work():
    """Worker-thread loop: crawl URLs from the shared queue until the process exits.

    Marks each queue item done so queue.join() can unblock.
    """
    print('main.py/work()')
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
    # BUG FIX: the original ended with print('main.py/work()/end'), which is
    # unreachable after the infinite loop; removed as dead code.
def main(): """ 程序主入口 获取命令行参数并做判断和处理,根据参数设置logger,创建线程池和 spider,线程池中加入 工作线程 处理线程任务,spider向线程池中加入任务。 """ # 获取命令行参数并处理 args = base.get_arg() if not base.check_args(args): print 'Args error!' sys.exit() base.handle_args(args) # 设置logger if not base.set_logger(args.log_file, args.log_level): print 'Set logger error' sys.exit() logger.debug('Get args :%s' % args) # 程序自检 if args.test_self: base.test_self() sys.exit() database = Sqlite3DB(args.db_file) # 创建 spider 和 线程池。根据 thread_num 向线程池加入多个工作线程。 # 在 spider 中建立多个任务 放入到线程池中。 spider = Spider(args.url, args.depth, args.thread_num, args.key_word, args.down_file, database) main_thread = MainThread(spider) main_thread.start() spider.start()
def work():
    """Worker loop: consume {'url', 'distance'} jobs from thread_queue forever."""
    while True:
        job = thread_queue.get()
        crawl_url = job['url']
        crawl_distance = job['distance']
        Spider.crawl_page(threading.current_thread().name, crawl_url, crawl_distance)
        thread_queue.task_done()
def walk(self, url, outfile):
    """Crawl `url`, then write a sitemaps.org-format XML sitemap to `outfile`.

    Visited pages accumulate in self.pageinfo; fetch failures accumulate in
    self.errors and are reported at the end.
    """
    self.pageinfo = {}
    self.errors = []
    # delegate the traversal itself to the base Spider implementation
    Spider.walk(self, url, self.iswebpage)
    print("\r[ ] Processed %i urls" % (len(self.pageinfo)))
    urlset = ET.Element('urlset', {'xmlns':"http://www.sitemaps.org/schemas/sitemap/0.9"})
    for page in self.pageinfo:
        # one <url> element per crawled page
        url = ET.SubElement(urlset, 'url')
        loc = ET.SubElement(url, 'loc')
        lastmod = ET.SubElement(url, 'lastmod')
        changefreq = ET.SubElement(url, 'changefreq')
        priority = ET.SubElement(url, 'priority')
        loc.text = page
        lastmod.text = self.pageinfo[page]['lastmod']
        changefreq.text = self.pageinfo[page]['change']
        priority.text = '%0.1f' % self.pageinfo[page]['pri']
    tree = ET.ElementTree(urlset)
    tree.write(outfile, encoding='utf-8', xml_declaration=True)
    if len(self.errors) > 0:
        print("[!] The following pages produced errors:")
        for e in self.errors:
            # e appears to be a (url, status) pair -- confirm against Spider.walk
            print(" %i %s" % (e[1], e[0]))
def downloadArchivesList(aList, container, extension='.txt.gz', numThreads=5): '''Set up downloader''' queue = initDownloader(numThreads) import csv f = open(aList, 'rb') reader = csv.reader(f) for row in reader: startURL = row[0] mlName = startURL.split('/')[-2] spider = Spider(startURL) spider.process_page(startURL) '''Only the links to archive files are interesting: mailing list archive file names end with '.txt.gz' ''' urlList = [x for x in sorted(spider.URLs) if x.endswith(extension)] if len(urlList): print '%s: %d archives' % (mlName, len(urlList)) store = os.path.join(container, mlName) if not (os.path.isdir(store)): os.system("mkdir %s" % store) '''Download each archive''' addToQ(queue, urlList, store) '''If here, download finished. Stop threads''' stopDownloader(queue, numThreads)
def __init__(self, **kwargs):
    """Force this subclass's spider options, open the output file, then
    delegate the rest of construction to Spider."""
    kwargs.update(enable_reborn=True, enable_proxy=False, max_login_tries=8)
    #kwargs['ips_obj'] = self.ips_obj
    self.out = open('out.txt', 'w+')
    self.login_status = False
    Spider.__init__(self, **kwargs)
def run(self):
    """Thread body: poll the weibo spider forever; stop if login failed."""
    sp = Spider()
    if not sp.login_succeed:
        self.stop()
    else:
        while True:
            new_stuff = sp.update()
            if len(new_stuff) > 0:
                # Python 2 print statement
                print str(len(new_stuff)) + " weibos to update"
                glob.newswall.notifyCallbacks(new_stuff)
            # crawl_interval is a module/global setting -- seconds between polls
            time.sleep(crawl_interval)
def spider(self):
    """Fetch self.url (with a Cache-Control header) and return the parsed
    lxml tree, or None when the page does not exist."""
    # add Cache-Control to the request headers
    fetcher = Spider(additional_headers={'Cache-Control': 'max-age=0'})
    try:
        fetcher.fetch(self.url)
    except HTTPError as e:
        # the film's page may simply not exist
        if e.msg == 'Not Found':
            return
    # Decode before parsing: utf-8 bytes escaped like '/u2541' make lxml
    # treat '/' as a tag terminator.
    return etree.HTML(fetcher.content.decode('utf-8'))
def main():
    """Read command-line options and launch a crawl of the requested site."""
    args = command_parser()
    # positional options arrive as single-element lists
    target_url = args.target_url[0]
    depth = int(args.depth[0])
    # the remaining options are parsed but not used by the crawl itself
    log_level = int(args.log_level)
    log_file = args.log_file
    thread_number = int(args.thread_number)
    key = args.key
    db_file = args.db_file
    test_self = args.test_self

    crawler = Spider(target_url, depth=depth, thread_number=thread_number)
    crawler.start()
def __init__(self, master):
    """Build the main NBA-teams window: East/West division frames plus the
    two conference ranking lists, then populate them from spider data."""
    self.master = master
    east_group = LabelFrame(master, text='东部')
    east_group.grid(row=0, column=0, padx=5, pady=5)
    west_group = LabelFrame(master, text='西部')
    west_group.grid(row=1, column=0, padx=5, pady=5)
    # Eastern conference ranking panel
    east_ranking = LabelFrame(master, text='东部排名')
    east_ranking.grid(row=0, column=1, rowspan=2, padx=5, pady=5, sticky=N)
    self.east_ranking_list = self.creat_teams_ranking_list(east_ranking)
    # Western conference ranking panel
    west_ranking = LabelFrame(master, text='西部排名')
    west_ranking.grid(row=0, column=2, rowspan=2, padx=5, pady=5, sticky=N)
    self.west_ranking_list = self.creat_teams_ranking_list(west_ranking)
    # Eastern divisions
    atlantic_group = LabelFrame(east_group, text='大西洋区')
    atlantic_group.grid(row=0, column=0, padx=5, pady=5)
    central_group = LabelFrame(east_group, text='中部区')
    central_group.grid(row=0, column=1, padx=5, pady=5)
    southeast_group = LabelFrame(east_group, text='东南区')
    southeast_group.grid(row=0, column=2, padx=5, pady=5)
    # Western divisions
    pacific_group = LabelFrame(west_group, text='太平洋区')
    pacific_group.grid(row=1, column=0, padx=5, pady=5)
    southwest_group = LabelFrame(west_group, text='西南区')
    southwest_group.grid(row=1, column=1, padx=5, pady=5)
    northwest_group = LabelFrame(west_group, text='西北区')
    northwest_group.grid(row=1, column=2, padx=5, pady=5)
    # Fetch raw data and run it through the analyzer
    spider = Spider()
    index_data = spider.load_teams_index()
    teams_ranking_data = spider.load_teams_ranking()
    analyzer = Analyzer()
    teams_data = analyzer.analyze_teams_data(index_data)
    self.teams_ranking = analyzer.analyze_teams_ranking(teams_ranking_data)
    self.load_teams_ranking()
    self.teams_logo = utils.load_teams_logos()
    # teams_data appears ordered by division in 5-team slices -- TODO confirm
    # against Analyzer.analyze_teams_data
    self.load_group(atlantic_group, teams_data[0:5])
    self.load_group(pacific_group, teams_data[5:10])
    self.load_group(central_group, teams_data[10:15])
    self.load_group(southwest_group, teams_data[15:20])
    self.load_group(southeast_group, teams_data[20:25])
    self.load_group(northwest_group, teams_data[25:30])
def work():
    """Worker loop: record a url -> title row for every URL taken off the queue."""
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url,
                          DB_FILE_PATH, 'url_title_rel')
        queue.task_done()
def create_spider(self):
    """Build a Spider from the XML config file: seed pages, allowed domains
    and the maximum crawl depth.

    Returns:
        The configured Spider instance.
    """
    spider = Spider()
    xml = parse(self._filename)
    # BUG FIX: getElementsByTagName returns a (possibly empty) NodeList, never
    # None, so the original `is not None` checks always passed and the
    # subsequent [0] indexing raised IndexError when a tag was missing.
    params = xml.getElementsByTagName(self._parameters)
    if params:
        params = params[0]
        for page in params.getElementsByTagName(self._page):
            print(page.firstChild.data)
            spider.add_url(page.firstChild.data)
        for domain in params.getElementsByTagName(self._domain):
            print(domain.firstChild.data)
            spider.add_domain(domain.firstChild.data)
        depth = params.getElementsByTagName(self._depth)
        if depth:
            depth = depth[0]
            print(depth.firstChild.data)
            spider.set_max_depth(depth.firstChild.data)
    return spider
def grab_crawler(data):
    """Configure a Spider from a request dict and run the image crawl."""
    bot = Spider()
    bot.initial_urls = [data['site_url']]
    bot.total = data['image_count']
    bot.image_type = data['image_type']
    bot.result_status = 'inprogress'
    bot.run()
def work():
    """Worker loop: crawl each URL taken from the shared queue."""
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        # BUG FIX: the original called queue.task.done(), which raises
        # AttributeError -- Queue exposes task_done().
        queue.task_done()
class TestSpider(unittest.TestCase):
    """Unit tests for Spider URL filtering on the aladinfoods.bg domain."""

    def setUp(self):
        self.test_spider = Spider("aladinfoods.bg")

    def test_spider_init(self):
        # a fresh spider has no scanned URLs and remembers its domain
        self.assertEqual(self.test_spider.scaned_url, [])
        self.assertEqual(self.test_spider.domain, "aladinfoods.bg")

    def test_is_outgoing(self):
        # same-domain links are not outgoing
        self.assertFalse(self.test_spider.is_outgoing("http://aladinfoods.bg"))

    def test_is_not_outgoing(self):
        # foreign-domain links are outgoing
        self.assertTrue(self.test_spider.is_outgoing("http://hackbulgaria.com"))

    def test_is_valid(self):
        self.assertTrue(self.test_spider.is_valid("http://aladinfoods.bg/menu"))

    def test_is_not_valid(self):
        self.assertFalse(self.test_spider.is_valid("http://hackbulgaria.com"))
def downloadArchives(startURL, container, lookInsideSubfolders=False, extension='.txt.gz', numThreads=5): '''Crawl <startURL> and find all mailing list archives (given the filename <extension>). Store the files in the folder with the path <container>. If <lookInsideSubfolders>, then go one level deeper (crawl all first-order links as well). ''' '''Set up downloader''' queue = initDownloader(numThreads) print 'Downloading archives from', startURL if not lookInsideSubfolders: spider = Spider(startURL) spider.process_page(startURL) '''Only the links to archive files are interesting: mailing list archive file names end with '.txt.gz' ''' urlList = [x for x in sorted(spider.URLs) if x.endswith(extension)] print '%d archives' % (len(urlList)) addToQ(queue, urlList, container) else: spider = Spider(startURL) spider.process_page(startURL) for link in sorted(spider.URLs): subspider = Spider(link) subspider.process_page(link) mlName = link.split('/')[-2] '''Only the links to archive files are interesting: mailing list archive file names end with '.txt.gz' ''' urlList = [x for x in sorted(subspider.URLs) if x.endswith(extension)] if len(urlList): print '%s: %d archives' % (mlName, len(urlList)) '''Create a folder for the mailing list''' store = os.path.join(container, mlName) if not (os.path.isdir(store)): os.system("mkdir %s" % store) addToQ(queue, urlList, store) '''If here, download finished. Stop threads''' stopDownloader(queue, numThreads)
def __init__(self, url=None, index_path=None): '''Initializes a WebFetcher instance and its base attributes''' # the url which shouldbe fetched self._url = url self._base_url = self._url file_ext = os.path.splitext(url)[1] if file_ext: # trim the file name self._base_url = self._url[:self._url.rindex("/")] # the path where everything will be saved self._index_path = index_path self._spider = Spider() self._CSS_RE = re.compile(r"url\(([^\)]*)\)")
def get(self, *args, **kwargs):
    """Grade-query endpoint.

    Arguments (query string): `query` in {semester, pass, fail}, `stu_no`,
    `pwd`.  Answers with a JSON payload; cached results are served from redis,
    otherwise the grade page is scraped live.
    """
    query = self.get_argument("query")
    student_number = self.get_argument("stu_no")
    password = self.get_argument("pwd")
    spider = Spider(student_number=student_number, password=password)
    spider.authorized()
    student_id = spider.student_id
    self.set_header("Content-Type", "application/json")
    # Map each query kind to its (cached-read, live-parse) pair; this removes
    # the three duplicated branches of the original implementation.
    handlers = {
        "semester": (self.redis_cls.get_semester_grade, spider.parse_semester_grade),
        "pass": (self.redis_cls.get_passed_grade, spider.parse_passed_grade),
        "fail": (self.redis_cls.get_failed_grade, spider.parse_failed_grade),
    }
    try:
        if query not in handlers:
            raise ValueError("Query Operation Out")
        read_cache, parse_live = handlers[query]
        storage = read_cache(student_id)
        if storage is None:
            current = parse_live()
            # non-zero code means the scrape failed -- forward it verbatim
            if current.code != 0:
                self.write(json.dumps(dict(current._asdict())))
                return
            data = current.data
        else:
            data = storage
        self.write(gen_response(code=0x0000, data=data, msg="成功", en_msg="Success"))
    except Exception as err:
        self.write(gen_response(code=0x0001, msg=str(err), en_msg="Unknown Error"))
def __init__(self, master, team_id):
    """Open a Toplevel window showing detailed info for one team."""
    self.team_info_win = Toplevel(master)
    self.team_info_win.resizable(False, False)
    self.team_id = team_id

    self.spider = Spider()
    raw_info = self.spider.load_team_info(team_id)
    analyzer = Analyzer()
    self.team_info = analyzer.analyze_team_info(raw_info)
    self.team_average, self.team_leader = analyzer.analyze_team_data_leader(raw_info)

    self.team_info_win.title(self.team_info.name)
    self.load_team_introduction()
    self.load_team_data()
    self.load_players()
def setUp(self):
    """Set up: seed page list, start five daemon spider threads, pre-fetch soups."""
    self.spider_q = Queue()
    self.db_q = Queue()
    self.url_q = Queue()
    # BUG FIX: pages/start must exist before the spiders that consume them.
    # The original referenced self.start, bare `pages` and bare `soups`
    # before/without assignment, raising AttributeError/NameError.
    self.pages = ['http://exchanges.state.gov/heritage/index.html',
                  'http://exchanges.state.gov/heritage/iraq.html',
                  'http://exchanges.state.gov/heritage/special.html',
                  'http://exchanges.state.gov/heritage/culprop.html',
                  'http://exchanges.state.gov/heritage/afcp.html']
    self.start = self.pages[0]
    for i in range(5):
        self.spider = Spider(self.spider_q, self.db_q, self.url_q, self.start,
                             blacklist=(os.path.abspath('blacklist.txt')))
        self.spider.setDaemon(True)
        self.spider.start()
    self.soups = [BeautifulSoup(requests.get(page).text)
                  for page in self.pages]
    for soup in self.soups:
        # NOTE(review): Queue.get(soup) treats `soup` as the `block` flag --
        # looks unintended, but preserved; confirm against Spider's protocol.
        self.spider_q.get(soup)
    self.spider_q.join()
    self.soup = self.soups[0]
def spider(domain, url, depth):
    """Crawler smoke test: crawl `url` within `domain` down to `depth`."""
    Spider(domain).crawl_page([url], depth)
def get_film_info(self, url, encoding=None):
    """Scrape one film detail page at `url` and return a dict of its fields.

    encoding: optional character encoding forwarded to the fetcher.
    """
    # The target template wraps each field in HTML comment markers
    # (e.g. <!--片名开始-->...<!--片名结束-->); the regexes key on those markers,
    # so any template change silently breaks the capture-group indices below.
    regex = dict(
        intro='<!--简介开始-->(.*?)<!--简介结束-->',
        info='\
<li class="sa">影片名称: <!--片名开始-->(.*?)<!--片名结束--></li>\s+?\
<li class="sa">影片别名: <!--别名开始-->(.*?)<!--别名结束--></li>\s+?\
<li class="sa">影片备注: <!--备注开始-->(.*?)<!--备注结束--></li>\s+?\
<li class="sa">影片主演: <!--主演开始-->(.*?)<!--主演结束--></li>\s+?\
<li class="sa">影片导演: <!--导演开始-->(.*?)<!--导演结束--></li>\s+?\
<li><div class="left">栏目分类: <!--栏目开始-->(.*?)<!--栏目结束--></div><div class="right">影片类型: <!--类型开始-->(.*?)<!--类型结束--></div></li>\s+?\
<li><div class="left">语言分类: <!--语言开始-->(.*?)<!--语言结束--></div><div class="right">影片地区: <!--地区开始-->(.*?)<!--地区结束--></div></li>\s+?\
<li><div class="left">连载状态: <!--连载开始-->(.*?)<!--连载结束--></div><div class="right">上映年份: <!--年代开始-->(.*?)<!--年代结束--></div></li>\s+?\
<li><div class="left">更新时间: <!--时间开始-->(.*?)<!--时间结束--></div><div class="right">豆瓣ID: <!--豆瓣ID开始-->(.*?)<!--豆瓣ID结束--></div></li>\s+?',
        show_list='<input type="checkbox" name="copy_sel" value="(.*?)" checked="">',
        imgurl='<div class="videoPic"><!--图片开始--><img src="(.*?)"/><!--图片结束--></div>')
    info = Spider().get_info(url, encoding=encoding, **regex)
    # indices below refer to the capture order of the `info` regex above
    actor = self.split_info(info['info'][0][3])     # leading actors
    types = self.split_info(info['info'][0][5])     # column/category
    # types = self.split_info(info['info'][0][6])   # film genre (disabled)
    area = self.split_info(info['info'][0][8])      # region
    director = self.split_info(info['info'][0][4])  # director
    language = self.split_info(info['info'][0][7])  # language
    # play sources look like "name$url": split m3u8 streams from cloud links
    m3u8_list = [
        url.split('$')[1] for url in info['show_list']
        if url.endswith('.m3u8')
    ]
    yun_list = [
        url.split('$')[1] for url in info['show_list']
        if not url.endswith('.m3u8')
    ]
    film_info = dict(
        imgurl=info['imgurl'][0],
        name=info['info'][0][0],          # film title
        name_info='',
        grader='',
        another_name=info['info'][0][1],  # alternative title
        director=director,
        actor=actor,
        types=types,
        area=area,
        language=language,
        show_time=info['info'][0][10],    # release year
        length='',
        update_time=info['info'][0][11],  # last update time
        day_palys='',
        total_score='',
        total_score_number='',
        intro=info['intro'][0],
        m3u8_list=m3u8_list,
        yun_list=yun_list,
        film_remark=info['info'][0][2],   # remark
        serial_state=info['info'][0][9],  # serialization state
        halves_id=info['info'][0][12],    # douban id
    )
    return film_info
def work():
    """Endless worker loop: crawl each URL pulled from the shared queue."""
    while True:
        next_url = queue.get()
        worker_name = threading.current_thread().name
        Spider.crawl_page(worker_name, next_url)
        queue.task_done()
def get_film_info(self, url, encoding=None):
    """Scrape one film detail page at `url` and return a dict of its fields.

    encoding: optional character encoding forwarded to the fetcher.
    """
    # Regexes are tied to the target template's <li>/<span> layout; the
    # capture-group indices used below depend on this exact order.
    regex = dict(
        intro='<div class="vodplayinfo">(.*?)</div>',
        name='<h2>(.*?)</h2>\s+?<span>(.*?)</span>\s+?<label>(.*?)</label>',
        info='\
<li>别名:<span>(.*?)</span></li>\s+?\
<li>导演:<span>(.*?)</span></li>\s+?\
<li>主演:<span>(.*?)</span></li>\s+?\
<li>类型:<span>(.*?)</span></li>\s+?\
<li class="sm">地区:<span>(.*?)</span></li>\s+?\
<li class="sm">语言:<span>(.*?)</span></li>\s+?\
<li class="sm">上映:<span>(.*?)</span></li>\s+?\
<li class="sm">片长:<span>(.*?)</span></li>\s+?\
<li class="sm">更新:<span>(.*?)</span></li>\s+?\
<li class="sm">总播放量:<span><em id="hits">.*?</script></span></li>\s+?\
<li class="sm">今日播放量:<span>(.*?)</span></li>\s+?\
<li class="sm">总评分数:<span>(.*?)</span></li>\s+?\
<li class="sm">评分次数:<span>(.*?)</span></li>',
        show_list='checked="" />(.*?)</li>')
    info = Spider().get_info(url, encoding=encoding, **regex)
    # indices refer to the capture order of the `info` regex; note the
    # 总播放量 line has no capture group, so indices skip it
    director = self.split_info(info['info'][0][1])
    actor = self.split_info(info['info'][0][2])
    types = self.split_info(info['info'][0][3])
    area = self.split_info(info['info'][0][4])
    language = self.split_info(info['info'][0][5])
    # play-source entries look like "name$url"
    m3u8_list = [
        url.split('$') for url in info['show_list'] if url.endswith('.m3u8')
    ]
    xunlei_list = [
        url.split('$') for url in info['show_list'] if url.endswith('.mp4')
    ]
    # NOTE(review): 'mp4' (no dot) below differs from '.mp4' above -- confirm
    yun_list = [
        url.split('$') for url in info['show_list']
        if not url.endswith('.m3u8') and url.endswith('mp4')
    ]
    film_info = dict(
        name=info['name'][0][0],
        name_info=info['name'][0][1],
        grade=info['name'][0][2],
        athour_name=info['info'][0][0],
        director=director,
        actor=actor,
        types=types,
        area=area,
        language=language,
        show_time=info['info'][0][6],   # release date
        lens=info['info'][0][7],        # running time
        up_date=info['info'][0][8],     # last update
        #plays = info['info'][0][9],
        day_plays=info['info'][0][9],   # plays today
        total_score=info['info'][0][10],
        total_score_number=info['info'][0][11],
        m3u8_list=m3u8_list,
        xunlei_list=xunlei_list,
        yun_list=yun_list,
    )
    return film_info
#!/usr/bin/env python3.5
# -*- coding: utf-8 -*-
# __author__ = Goodweather Fu
from spider import Spider

# Launch a crawl: page 3, category -1, mode 1 (argument semantics live in
# Spider.getPage).
spider = Spider()
spider.getPage(3, -1, 1)
# coding: utf-8
from spider import Spider

if __name__ == "__main__":
    import time
    # timestamp the run so total crawl duration is visible in the console
    print("Start At:", time.asctime(time.localtime(time.time())))
    crawler = Spider()
    crawler.start()
    print("Stop At:", time.asctime(time.localtime(time.time())))
def main():
    """Entry point: restore an authenticated session and fetch weibo info."""
    session = Spider.verify_cookie()
    session.get_weibo_info()
from flask import url_for
from common import *
from spider import Spider

# Flask application setup: template auto-reload plus the jinja filters the
# site templates rely on.
app = Flask(APP_NAME)
app.jinja_env.auto_reload = True
app.jinja_env.filters['rename'] = rename
app.jinja_env.filters['url_rename'] = url_rename
app.jinja_env.filters['small_img'] = small_img
app.jinja_env.filters['big_img'] = big_img
app.jinja_env.filters['can_play_url'] = can_play_url
app.config['TEMPLATES_AUTO_RELOAD'] = True
# surface exceptions to the web page
app.config['PROPAGATE_EXCEPTIONS'] = True

# single shared spider instance for the whole site
SPIDER = Spider()


def run():
    """Start the local website; optionally open it in a browser tab first."""
    LOGGER.info("website.run")
    if CONFIG.getboolean("base", "debug_mode"):
        app.debug = True
    # open the home page automatically when configured to do so
    if CONFIG.getboolean("website", "auto_open_site_on_run"):
        open_browser_tab(get_local_url())
    app.run(port=CONFIG.getint("base", "port"), processes=1)


# installer page
@app.route('/install', methods=['GET', 'POST'])
def install():
    # NOTE(review): the body of install() is cut off in the source seen here
def __init__(self):
    """Wire up the MySQL client and the page spider used by this component."""
    self.mysql = MysqlClient()
    self.spider = Spider()
from spider import Spider, Content #from model.models import Ip def get_ip_info(html_response): """ 清理内容得到IP信息 """ ips_list = [] soup = BeautifulSoup(html_response.body, "html.parser") ip_list_table = soup.find(id='ip_list') for ip_info in ip_list_table.find_all('tr'): ip_detail = ip_info.find_all('td') if ip_detail: # 注意:为什么我用list和str方法?否则就是bs4对象!!! ips_list.append(dict(ip=str(list(ip_detail)[1].string), port=str(list(ip_detail)[2].string))) return ips_list s = Spider('http://www.xicidaili.com/nn/') response = s.get() ips = get_ip_info(response) # 默认存到运行运行脚本的目录,文件名:data.txt Content().save_to_file(ips) # 存到数据库 #t = Content(Ip) # for ip_data in ips: # t.save(ip_data)
def get_film_info(self, url, encoding=None):
    """Scrape a film's second-level (detail) page and return a dict of fields."""
    print("开始获取二级页面信息...")
    # Regexes key on the template's HTML comment markers; group indices used
    # below depend on this exact capture order.
    regex = dict(
        # film synopsis
        intro='<!--简介开始-->(.*?)<!--简介结束-->',
        # \s+? : one-or-more whitespace, non-greedy, between the <li> rows
        info='\
<li class="sa">影片名称: <!--片名开始-->(.*?)<!--片名结束--></li>\s+?\
<li class="sa">影片别名: <!--别名开始-->(.*?)<!--别名结束--></li>\s+?\
<li class="sa">影片备注: <!--备注开始-->(.*?)<!--备注结束--></li>\s+?\
<li class="sa">影片主演: <!--主演开始-->(.*?)<!--主演结束--></li>\s+?\
<li class="sa">影片导演: <!--导演开始-->(.*?)<!--导演结束--></li>\s+?\
<li><div class="left">栏目分类: <!--栏目开始-->(.*?)<!--栏目结束--></div><div class="right">影片类型: <!--类型开始-->(.*?)<!--类型结束--></div></li>\s+?\
<li><div class="left">语言分类: <!--语言开始-->(.*?)<!--语言结束--></div><div class="right">影片地区: <!--地区开始-->(.*?)<!--地区结束--></div></li>\s+?\
<li><div class="left">连载状态: <!--连载开始-->(.*?)<!--连载结束--></div><div class="right">上映年份: <!--年代开始-->(.*?)<!--年代结束--></div></li>\s+?\
<li><div class="left">更新时间: <!--时间开始-->(.*?)<!--时间结束--></div><div class="right">豆瓣ID: <!--豆瓣ID开始-->(.*?)<!--豆瓣ID结束--></div></li>',
        show_list='<input type="checkbox" name="copy_sel" value="(.*?)" checked="">')
    info = Spider().get_info(url, encoding=encoding, **regex)
    director = self.split_info(info['info'][0][4])
    actor = self.split_info(info['info'][0][3])
    types = self.split_info(info['info'][0][6])
    area = self.split_info(info['info'][0][8])
    language = self.split_info(info['info'][0][7])
    # play-source entries look like "name$url"
    m3u8_list = [
        url.split('$') for url in info['show_list'] if url.endswith('.m3u8')
    ]
    yun_list = [
        url.split('$') for url in info['show_list']
        if not url.endswith('m3u8')
    ]
    film_info = dict(
        name=info['info'][0][0],         # film title
        athour_name=info['info'][0][1],  # alternative title
        remarks=info['info'][0][2],      # remark
        actor=actor,                     # leading actors
        director=director,               # director
        column_class=info['info'][0][5], # column/category
        types=types,                     # genre
        language=language,               # language
        area=area,                       # region
        name_info=info['info'][0][9],    # serialization state
        release_date=info['info'][0][10],# release year
        up_data=info['info'][0][11],     # update time
        # NOTE(review): group 12 is the douban ID per the regex above, not a
        # score -- the original comment said "rating"; confirm before relying
        grade=info['info'][0][12],
        m3u8_list=m3u8_list,
        yun_list=yun_list)
    print("获取成功!!!")
    return film_info
# BUG FIX: create_workers()/work() use the threading module, but it was never
# imported in the original script (NameError at runtime).
import threading
from queue import Queue
from spider import Spider
from domain import *
from utility import *

PROJECT_NAME = str(input('Enter the Project name : '))
#print(PROJECT_NAME)
HOMPAGE = str(input('Enter the Homepage : '))
#print(HOMPAGE)
DOMAIN_NAME = get_domain_name(HOMPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 8
queue = Queue()  # thread queue
Spider(PROJECT_NAME, HOMPAGE, DOMAIN_NAME)  # crawling the homepage


# create worker-spider threads (daemons: die with the main thread)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# do the job queue
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
def __init__(self, proxy=False):
    '''
    proxy: pass a truthy value to enable the proxy; defaults to False
    (proxy disabled).
    '''
    # NOTE(review): the original (Chinese) docstring claimed the proxy was
    # enabled by default, contradicting the `proxy=False` default above.
    self.spider = Spider(proxy)
class TestSpider(unittest.TestCase):
    """Test case for the Spider clas"""

    def setUp(self):
        """Set up"""
        self.spider_q = Queue()
        self.db_q = Queue()
        self.url_q = Queue()
        for i in range(5):
            # NOTE(review): self.start is not assigned until after this loop
            self.spider = Spider(self.spider_q, self.db_q, self.url_q,
                                 self.start,
                                 blacklist=(os.path.abspath(
                                     'blacklist.txt')))
            self.spider.setDaemon(True)
            self.spider.start()
        self.pages = ['http://exchanges.state.gov/heritage/index.html',
                      'http://exchanges.state.gov/heritage/iraq.html',
                      'http://exchanges.state.gov/heritage/special.html',
                      'http://exchanges.state.gov/heritage/culprop.html',
                      'http://exchanges.state.gov/heritage/afcp.html']
        # NOTE(review): presumably should be self.pages[0]
        self.start = pages[0]
        self.soups = [BeautifulSoup(requests.get(page).text)
                      for page in self.pages]
        for soup in self.soups:
            self.spider_q.get(soup)
        self.spider_q.join()
        # NOTE(review): presumably should be self.soups[0]
        self.soup = soups[0]

    def test_get_links(self):
        """Tests only links to web pages are being collected"""
        actual = self.spider.get_links(self.soup)
        expected = set([
            'http://exchanges.state.gov/scho-pro.html',
            'http://www.state.gov/misc/87529.htm#privacy',
            'http://www.state.gov/m/a/ips/',
            'http://exchanges.state.gov/alumni/index.html',
            'http://exchanges.state.gov/student.html',
            'http://exchanges.state.gov/programs/professionals.html',
            'http://exchanges.state.gov/about/assistant-secretary-stock.html',
            'http://exchanges.state.gov/news/index.html',
            'http://exchanges.state.gov/heritage/index.html',
            'http://exchanges.state.gov/heritage/1sindex.html',
            'http://exchanges.state.gov/heritage/culprop.html',
            'http://exchanges.state.gov/mobile/index.html',
            'http://j1visa.state.gov/',
            'http://www.state.gov/misc/415.htm',
            'http://exchanges.state.gov/index.html',
            'http://exchanges.state.gov/sports/index.html',
            'http://exchanges.state.gov/grants/preparing_payment.html',
            'http://state.gov/',
            'http://exchanges.state.gov/grants/faqs.html',
            'http://exchanges.state.gov/heritage/whatsnew.html',
            'http://exchanges.state.gov/',
            'http://exchanges.state.gov/about/program_offices.html',
            'http://exchanges.state.gov/englishteaching/forum-journal.html',
            'http://www.state.gov/misc/60289.htm',
            'http://exchanges.state.gov/heritage/iraq.html',
            'http://exchanges.state.gov/grants/terminology.html',
            'http://exchanges.state.gov/heritage/sindex.html',
            'http://exchanges.state.gov/heritage/special.html',
            'http://exchanges.state.gov/grants/preparing_reports.html',
            'http://exchanges.state.gov/programevaluations/index.html',
            'http://exchanges.state.gov/programs/scholars.html',
            'http://exchanges.state.gov/programs/cultural.html',
            'http://exchanges.state.gov/programs/secondary-school.html',
            'http://www.usa.gov/',
            'http://exchanges.state.gov/about/contact-us.html',
            'http://exchanges.state.gov/programs/university.html',
            'http://www.state.gov/misc/87529.htm#copyright',
            'http://exchanges.state.gov/grants/open2.html',
            'http://exchanges.state.gov/programs/english-language.html',
            'http://exchanges.state.gov/jexchanges/ppp.html',
            'http://exchanges.state.gov/pro-admin.html',
            'http://exchanges.state.gov/search.html',
            'http://exchanges.state.gov/grants/cfda.html',
            'http://www.iawg.gov/',
            'http://exchanges.state.gov/englishteaching/resources-et.html',
            ('http://exchanges.state.gov/heritage/culprop/index/'
             'pdfs/unesco01.pdf'),
            'http://exchanges.state.gov/heritage/afcp.html',
            'http://exchanges.state.gov/features/index.html',
            'http://exchanges.state.gov/host/index.html',
            'http://exchanges.state.gov/about/employment.html',
            'http://exchanges.state.gov/programs/educators.html',
            'http://exchanges.state.gov/a-z.html',
            'http://exchanges.state.gov/about.html',
            ('http://exchanges.state.gov/programevaluations/'
             'program-evaluations.html'),
        ])
        self.assertEqual(actual, expected)

    def test_get_pdfs(self):
        """Tests that pdfs are being found on page"""
        actual = self.spider.get_pdfs(self.soup)
        expected = set([('http://exchanges.state.gov/heritage/culprop/index/'
                         'pdfs/unesco01.pdf')])
        self.assertEqual(actual, expected)

    def test_black_list(self):
        """Tests black list is being pulled in"""
        actual = self.spider.black_list()
        expected = ['http://exchanges.state.gov/heritage/iraq.html',
                    'http://exchanges.state.gov/heritage/special.html']
        self.assertEqual(actual, expected)

    def test_threaded_processing_pdfs(self):
        # NOTE(review): test body is incomplete in the source seen here
        actual = []

    def tearDown(self):
        """Tear down"""
        pass
def startCrawl(self):
    """Launch a Spider over the user list, resuming past processed entries."""
    Spider(self.userList).crawl(self.hasProcessed)
import threading
from queue import Queue
from spider import Spider
from domain import *
from general import *
from linkFinder import LinkFinder

# Crawl configuration for thenewboston.com
PROJECTNAME = 'thenewboston'
HOMEPAGE = 'http://thenewboston.com/'
DOMAINNAME = getDomainName(HOMEPAGE)
QUEUEFILE = PROJECTNAME + '/queue.txt'
CRAWLEDFILE = PROJECTNAME + '/crawled.txt'
NUMBEROFTHREADS = 8
threadQueue = Queue()
Spider(PROJECTNAME, HOMEPAGE, DOMAINNAME)


# create worker threads (will die when main exists)
def createWorkers():
    for _ in range(NUMBEROFTHREADS):
        t = threading.Thread(
            target=work)  # start 8 threads that run the "work()" function
        t.daemon = True  # dies when main exists
        t.start()


# do next job in the queue
def work():
    while True:
        url = threadQueue.get()
        # NOTE(review): the rest of work() is cut off in the source seen here
# NOTE(review): fragment -- the enclosing function's header and the end of the
# posts_data.append(...) call fall outside the source seen here.
temp_dire = os.path.abspath(temp_dire)
'''
Creating workspacein tmp directory of os is creating many probs in osx.
tmp directory change from /var to ./private etc
'''
log_it("TEMP DIR", temp_dire)
# start from a clean workspace: remove any stale copy, then clone the sources
if os.path.exists(temp_dire):
    shutil.rmtree(temp_dire)
distutils.dir_util.copy_tree(src_dir, temp_dire)
owd = os.getcwd()
log_it("LOG", "Crawling started")
spider = Spider(temp_dire)
log_it("LOG", "Crawling done")
# spider.crawl()
log_it("LOG", "Compileing pages started")
posts_data = []
for post_folder in spider.crawl():
    # each post folder carries its metadata in a __pub.lish JSON file
    config = json.load(open(os.path.join(post_folder, "__pub.lish")))
    t_date = time.strptime(config['date'], "%Y-%m-%d")
    posts_data.append({
        'title': config['name'].replace('-', ' '),
        'url': post_folder[len(temp_dire) + 1:],
        'year': time.strftime("%Y", t_date),
        'day': time.strftime("%d", t_date),
        'month': time.strftime("%b", t_date),
        'date': t_date
import threading
from queue import Queue
from spider import Spider
from domain import *

# Crawl configuration
START = 'http://website.address.com/'
DOMAIN_NAME = get_domain_name(START)
NUMBER_OF_THREADS = 8
queue = Queue()
Spider(START, DOMAIN_NAME)


# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()


# Each queued link is a new job
def create_jobs():
    #print("create_jobs()")
    # NOTE(review): the body of create_jobs() is cut off in the source seen here
from spider import Spider
from domain import *
from general import *
from Library import Requirement_Manager

# NOTE(review): `Queue` and `threading` are used below but not imported in the
# source seen here -- confirm the real file imports them.

# This method gets all requirements from the user
# This file module was created because it takes up
# more lines to be in this main.py python module
Requirement_Manager.Initiate()

PROJECT_NAME = Requirement_Manager.getPath() + Requirement_Manager.getProjectName()
DOMAIN_NAME = get_domain_name(Requirement_Manager.getHomePage())
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = Requirement_Manager.getPath() + PROJECT_NAME + '/crawled.txt'
queue = Queue()
Spider(PROJECT_NAME, Requirement_Manager.getHomePage(), DOMAIN_NAME)
print(Requirement_Manager.getCores())


# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(Requirement_Manager.getCores()):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        # NOTE(review): no queue.task_done() follows here -- likely truncated
        # in this view; without it queue.join() would hang forever
class Crawler(object):
    """description of class"""
    # url -> Link bookkeeping map (re-assigned per instance in __init__)
    lookup = None
    # BFS frontier of urls still to fetch
    q = None
    # normalized start url
    feed = None
    # max fetch attempts per url (name kept as-is, typo included)
    MAT_RETREIS = 5
    INFO_LOG = logging.INFO
    ERROR_LOG = logging.ERROR
    DEBUG_LOG = logging.DEBUG

    def __init__(self, link, **kwargs):
        """link: start URL. kwargs: verify, output_path, LOG (log level)."""
        super(Crawler, self).__init__()
        self.lookup = {}
        self.q = Queue()
        # normalize: force http and strip a trailing slash
        self.feed = link.replace('https://', 'http://')
        if self.feed.endswith('/'):
            self.feed = self.feed[0:len(self.feed) - 1]
        self.downloader = Downloader(verify=kwargs.get('verify', True))
        link = link.replace('http://', '')
        link = link.replace('https://', '')
        # default output: ~/<sanitized-link>.txt (illegal filename chars removed)
        self.path = kwargs.get(
            'output_path',
            os.path.join(
                os.path.expanduser('~'),
                link.translate({ord(c): "" for c in "<>:/\\\"|?*"}) + '.txt'))
        self.links_count = 0
        self.parser = Spider()
        log_level = kwargs.get('LOG', Crawler.INFO_LOG)
        # logs live under ~/.crawler/<sanitized-link>.log, echoed to stderr
        if not os.path.exists(os.path.join(os.path.expanduser('~'), '.crawler')):
            os.makedirs(os.path.join(os.path.expanduser('~'), '.crawler'))
        logging.basicConfig(filename=os.path.join(
            os.path.expanduser('~'), '.crawler',
            link.translate({ord(c): "" for c in "<>:/\\\"|?*"}) + '.log'),
            format='%(asctime)s %(levelname)s %(message)s',
            level=log_level)
        logging.getLogger().addHandler(logging.StreamHandler())

    def __enter__(self):
        """Context entry: seed the frontier with the feed, open the output file."""
        if len(self.lookup) == 0:
            self.lookup[self.feed] = Link()
            self.q.put(self.feed)
        self.file = open(self.path, 'w+')
        self.file.write(self.feed + '\n')
        self.links_count = 1
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context exit: close the output file and report output locations."""
        self.file.close()
        logging.info("Links are saved at " + self.path)
        logging.info("Logs are saved at " +
                     os.path.join(os.path.expanduser('~'), '.crawler'))

    def get__stats(self):
        # url -> Link bookkeeping map
        return self.lookup

    def crawl(self, count=-1):
        """Crawl until the frontier is empty or `count` links were extracted
        (count <= 0 means unbounded). Ctrl-C exits the process."""
        try:
            while not self.q.empty():
                link = self.q.get()
                if not self.lookup[link].crawled:
                    clear()
                    logging.info("Links extracted : " + str(self.links_count))
                    if self.links_count >= count and count > 0:
                        return
                    logging.debug(link + ' being crawled')
                    self.__get_links(link)
        except KeyboardInterrupt:
            sys.exit()

    def crawl_next(self):
        """Generator: crawl one frontier entry per step, yielding the set of
        known links after each step."""
        while not self.q.empty():
            link = self.q.get()
            if not self.lookup[link].crawled:
                logging.debug(link + ' being crawled')
                self.__get_links(link)
            links = self.lookup.keys()
            yield links

    def __get_links(self, link):
        """Fetch `link`, enqueue the links found on it, update bookkeeping.
        Failed fetches are re-queued up to MAT_RETREIS times."""
        try:
            response = self.downloader.getContent(link)
            if response.status_code == 200:
                self.lookup[link].crawled = True
                links = self.parser.parseLinks(response.text)
                logging.debug(link + ' crawled' + str(len(links)))
                self.lookup[link].extracted_links_count = len(links)
                # NOTE(review): the loop variable shadows the `link` parameter;
                # harmless here (the parameter is not used after the loop)
                for link in links:
                    self.q.put(link)
                    if not link in self.lookup:
                        self.links_count += 1
                        self.file.write(link + '\n')
                        self.lookup[link] = Link()
            else:
                logging.error(link + ' failed to get response with code ' +
                              str(response.status_code))
                self.lookup[link].res_code = response.status_code
                self.lookup[link].attempts += 1
                if self.lookup[link].attempts < Crawler.MAT_RETREIS:
                    self.q.put(link)
                else:
                    logging.error(link + ' MAX_RETRIES Exceeded')
        except Exception as err:
            logging.error(err)
from spider import Spider
from mongo import Mongo

keyword = 'Python'
spider = Spider(keyword)
mongo = Mongo('taobao', keyword)  # open the MongoDB collection
print('start keyword:', keyword)

total_page = spider.search()  # open the search result page
print('total page:', total_page)

for page_no in range(1, total_page + 1):
    html = spider.next_page(page_no)  # fetch this result page by number
    items = spider.parse(html)        # extract item records
    ids = mongo.insert_many(items)    # persist to MongoDB
    print('current page:', page_no, 'save to mongo items:', len(ids))
def work():
    """Worker loop: crawl each URL pulled from q until the process exits."""
    while True:
        url = q.get()
        # threading.currentThread() is a deprecated alias (removal planned);
        # current_thread() is the supported spelling and behaves identically.
        Spider.crawl_page(threading.current_thread().name, url)
        q.task_done()
import threading
from Queue import Queue  # Python 2 style module name
from spider import Spider
from domain import *
from file_helper import *

# Crawl configuration: project workspace name and the site to crawl.
PROJECT_NAME = 'Micron'
HOME_PAGE = 'https://www.micron.com/'
DOMAIN_NAME = get_domain_name(HOME_PAGE)
# On-disk files backing the pending and finished URL sets.
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWL_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 8

# Shared job queue consumed by the worker threads.
queue = Queue()
# First spider; presumably its constructor sets up the project files — confirm.
spider = Spider(PROJECT_NAME, HOME_PAGE, DOMAIN_NAME, QUEUE_FILE, CRAWL_FILE)
import threading
from queue import Queue
from spider import Spider
from domain import *
from functions import *

PROJECT_NAME = '9gag'
BASE_URL = 'http://9gag.com'
DOMAIN_NAME = getDomainName(BASE_URL)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 4

threadQueue = Queue()
Spider(PROJECT_NAME, BASE_URL, DOMAIN_NAME)


def crawl():
    """Keep scheduling jobs as long as the on-disk queue file holds links."""
    while len(fileToSet(QUEUE_FILE)) > 0:
        createJobs()


def createJobs():
    """Turn each link in the queue file into a worker job."""
    for queued_link in fileToSet(QUEUE_FILE):
        threadQueue.put(queued_link)
import threading
from queue import Queue
from spider import Spider
from domain import *
from web_crawler import *

project = 'jobsearch'
homepage = 'https://www.monster.com/jobs/search/?q=Software-Engineer&intcid=skr_navigation_nhpso_searchMain'
domain = get_domain(homepage)
queue_file = project + '/queue.txt'
craw_file = project + '/crawl.txt'
threads = 8

queue = Queue()
Spider(project, homepage, domain)


def crawl1():
    """Schedule one round of jobs if the queue file holds any links."""
    pending = create_set(queue_file)
    if pending:
        print(str(len(pending)) + ' links in the queue')
        create_jobs()


def create_jobs():
    """Push every queued link onto the worker queue, wait for the workers
    to drain it, then re-check the queue file for newly discovered links."""
    for queued_link in create_set(queue_file):
        queue.put(queued_link)
    queue.join()
    crawl1()
class Crawler:
    '''
    Crawler collects target data using the Spider class, which requests a
    web source from a web site and extracts target data.  It mainly runs
    with threads that execute spiders asynchronously, fed by a queue.
    '''

    def __init__(self, name, base_url, domain_url=None, dbpath=None):
        '''
        Set up the crawler:

        1. Store the base url and resolve the domain url.
        2. Build the crawled and queue storage.  The default storer is
           SQLite with tables named '<name>queue' and '<name>crawled'
           (only created when ``dbpath`` is given).
        3. Create the initial spider.

        :param name: project name, used to prefix the storage tables.
        :param base_url: seed URL for the crawl.
        :param domain_url: crawl domain; derived from base_url when None.
        :param dbpath: path to the SQLite database; storage is skipped
            when None (self.url_storer stays None).
        '''
        self.name = name
        self.base_url = base_url
        # BUG FIX: the original assigned self.domain_url only when
        # domain_url was None, so passing an explicit domain left the
        # attribute unset and crashed later with AttributeError.
        self.domain_url = (domain_url if domain_url is not None
                           else get_domain_url(self.base_url))
        # Defined up front so a missing dbpath yields a clear None value
        # instead of an AttributeError at Spider construction time.
        self.url_storer = None
        self.create_url_storer(dbpath)
        self.queue = Queue()
        self.spider = Spider(self.name, self.base_url, self.domain_url,
                             self.url_storer, 0)
        print('Finished initialize.')

    def create_url_storer(self, dbpath):
        # Build the SQLite-backed url storage; no-op when dbpath is None.
        if dbpath is not None:
            self.url_storer = {
                'db': UrlStorer(dbpath),
                'table': {
                    'queue': self.name + 'queue',
                    'crawled': self.name + 'crawled',
                },
            }
            self.url_storer['db'].create_table(
                self.url_storer['table']['queue'])
            self.url_storer['db'].create_table(
                self.url_storer['table']['crawled'])

    def run(self):
        '''
        Main loop:

        1. Create daemon worker threads (they die with the process).
        2. Store the base url into the queue table to seed the crawl.
        3. Repeatedly move stored urls into the thread queue and wait for
           the workers to drain it (queue.join).
        4. Stop and close the database when the stored queue is empty.
        '''
        self.create_workers(self.work)
        self.store_base_url()
        print('All default settings are done.')
        while self.fetch_urls():
            print('waiting for join')
            self.queue.join()
            print('working done')
        self.url_storer['db'].close()

    def create_workers(self, work=None):
        # BUG FIX: the ``work`` parameter was accepted but silently
        # ignored; honor it, defaulting to self.work as before.
        target = work if work is not None else self.work
        for _ in range(NUMBER_OF_THREADS):
            t = threading.Thread(target=target)
            t.daemon = True  # workers must not outlive the main thread
            t.start()

    def store_base_url(self):
        # Level 0: the crawl starts from the base url.
        self.url_storer['db'].put(self.url_storer['table']['queue'],
                                  self.base_url, 0)

    def fetch_urls(self):
        '''Move stored (url, level) rows into the worker queue.

        Rows at or beyond MAXIMUM_LEVELS are dropped (deleted without
        queuing).  Returns True when at least one row existed, False when
        the stored queue is empty (signals run() to stop).
        '''
        print('fetch_urls called')
        url_list = list(self.url_storer['db'].get(
            self.url_storer['table']['queue']))
        if len(url_list) > 0:
            for url in url_list:
                if url[1] < MAXIMUM_LEVELS:
                    print(url, 'willl put into queue')
                    self.queue.put(url)
                self.url_storer['db'].delete(
                    self.url_storer['table']['queue'], url[0])
            return True
        return False

    def work(self):
        # Worker loop: build a fresh Spider per queued (url, level) pair
        # and run it; task_done() lets run()'s queue.join() make progress.
        while True:
            url = self.queue.get()
            self.spider = Spider(self.name, url[0], self.domain_url,
                                 self.url_storer, url[1])
            print('start spider work', url)
            self.spider.run(threading.current_thread().name, url[0])
            self.queue.task_done()
            print('task is done')
import threading
from queue import Queue
from spider import Spider
from domain import *
from general import *

PROJECT_NAME = ''
HOMEPAGE = ''
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 8

queue = Queue()
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Spawn daemon worker threads; they terminate with the main thread.
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        worker = threading.Thread(target=work)
        worker.daemon = True
        worker.start()


# Endless worker loop: crawl the next queued URL and mark the job done.
def work():
    while True:
        next_url = queue.get()
        Spider.crawl_page(threading.current_thread().name, next_url)
        queue.task_done()
def main():
    # Entry point: constructing the Spider presumably kicks off the crawl
    # for PROJECT_NAME at URL (both defined elsewhere in this file) — confirm.
    Spider(PROJECT_NAME, URL)
import threading
from queue import Queue
from domain import get_domain_name
from spider import Spider

# Crawl configuration.
NUM_SPIDERS = 10
HOMEPAGE = 'https://twitter.com/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
# First Spider instance; presumably initializes project state — confirm.
Spider(DOMAIN_NAME, HOMEPAGE)
# Shared job queue for the spider threads.
q = Queue()


# crawl the next url
def work():
    while True:
        url = q.get()
        Spider.crawl_page(threading.currentThread().name, url)
        q.task_done()


# Create spider threads (will be terminated when main exits)
def create_spiders():
    for x in range(NUM_SPIDERS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Each queued link is a new job
# NOTE(review): this definition is truncated — its body lies beyond this chunk.
def create_jobs():
def run_by_head():
    """Log in via the slaver spider, then drive the page fetch to completion
    on the current event loop."""
    event_loop = asyncio.get_event_loop()
    slaver = SlaverSpider()
    # b/p/f semantics are defined by SlaverSpider.login; they are passed
    # straight through to the Spider constructor.
    b, p, f = event_loop.run_until_complete(slaver.login(**STORE_INFO['YK']))
    spider = Spider(slaver, b, p, f)
    event_loop.run_until_complete(spider.get_page())
# This module exposes the spider as a small HTTP API.
from flask import Flask, request
import flask_restful
from flask_restful import Resource
from spider import Spider
import logging

# Raise werkzeug's log level to WARNING to filter per-request access
# logs and reduce memory/noise.
log = logging.getLogger('werkzeug')
log.setLevel(logging.WARNING)

app = Flask("Spider")
api = flask_restful.Api(app)
sp = Spider()


# Comic search endpoint.
class ComicSearch(Resource):
    def get(self):
        """Return search results for keyword ``kw`` at page ``p``.

        Falls back to an empty string on spider failure so the endpoint
        always answers with a body.
        """
        kw = request.args.get('kw')
        p = request.args.get('p')
        try:
            res = sp.comic_search(kw, p)
        except Exception:
            # BUG FIX: narrowed from a bare ``except:`` (which also caught
            # SystemExit/KeyboardInterrupt) and moved the return out of
            # ``finally`` — a return in finally silently suppresses any
            # in-flight exception (flake8 B012).
            res = ''
        return res


# Comic detail endpoint (definition continues beyond this chunk).
def work():
    """Worker thread body: take a job (link) from the queue, crawl it, and
    repeat until the queue runs dry (thread is a daemon, so it dies with main)."""
    while True:
        next_link = queue.get()
        Spider.crawl_page(threading.current_thread().name, next_link)
        queue.task_done()
def __init__(self, **kwargs):
    """Initialize this spider with reborn and proxy support forced on,
    then delegate to the Spider base initializer."""
    kwargs['enable_reborn'] = True
    kwargs['enable_proxy'] = True
    # NOTE(review): assumes ``ips_obj`` exists on the class/instance before
    # __init__ runs (likely a class attribute) — confirm.
    kwargs['ips_obj'] = self.ips_obj
    Spider.__init__(self, **kwargs)
def work():
    """Worker loop: block on the shared queue for the next URL, crawl it
    under this thread's name, then signal task completion so queue.join()
    can unblock. Runs until the process exits."""
    while True:
        target = queue.get()
        Spider.crawl_page(threading.current_thread().name, target)
        queue.task_done()
def release(redis, pname, db, max_job_count, interval, wait, urls, response,
            referer, tag, qname="spider", debug=False):
    """Build a Spider bound to the given redis/db settings and run it over
    ``urls`` with the supplied response handler, referer and tag."""
    spider = Spider(redis, pname, db, max_job_count=max_job_count,
                    qname=qname, interval=interval, wait=wait, debug=debug)
    spider.run(urls, response, referer=referer, tag=tag)
"""Manual driver: fetch and print one chapter page from biquge5200."""
from spider import Spider

if __name__ == "__main__":
    website = Spider()
    # website.search('变身')
    # website.get_lists("http://www.biquge5200.com/85_85278/")
    page_text = website.get_page(
        "http://www.biquge5200.com/81_81174/149183228.html")
    print(page_text)
class TeamInfoFrame():
    """Toplevel Tk window showing one team's introduction, per-category
    stat leaders, and full player roster.

    Data is fetched with a Spider and shaped by an Analyzer:
    - team_average[i] : (label, value, extra) strings for category i
    - team_leader[i]  : (image-url, name, value, extra) strings for category i
    Categories are indexed 0..4: points, assists, rebounds, steals, blocks.
    """

    def __init__(self, master, team_id):
        # Build the window, fetch and analyze the team data, then populate
        # the three UI sections (intro, stat frames, roster).
        self.team_info_win = Toplevel(master)
        self.team_info_win.resizable(False, False)
        self.team_id = team_id
        self.spider = Spider()
        team_info_data = self.spider.load_team_info(team_id)
        analyzer = Analyzer()
        self.team_info = analyzer.analyze_team_info(team_info_data)
        (self.team_average,
         self.team_leader) = analyzer.analyze_team_data_leader(team_info_data)
        self.team_info_win.title(self.team_info.name)
        self.load_team_introduction()
        self.load_team_data()
        self.load_players()

    def load_team_introduction(self):
        # Header: team logo plus six introduction fields in a 2x3 grid.
        team_frame = LabelFrame(self.team_info_win, text=self.team_info.name)
        team_frame.grid(row=0, column=0, columnspan=5, padx=5, pady=5)
        team_logo_image = utils.load_team_logo(self.team_id)
        team_logo = Label(team_frame, image=team_logo_image)
        team_logo.grid(row=0, column=0, rowspan=2, padx=5, pady=5)
        # Keep a reference so the image is not garbage collected by Tk.
        team_logo.image = team_logo_image
        Label(team_frame, text=self.team_info.introduction[0],
              width=30).grid(row=0, column=1, sticky=W, padx=5, pady=5)
        Label(team_frame, text=self.team_info.introduction[1],
              width=30).grid(row=0, column=2, sticky=W, padx=5, pady=5)
        Label(team_frame, text=self.team_info.introduction[2],
              width=30).grid(row=0, column=3, sticky=W, padx=5, pady=5)
        Label(team_frame, text=self.team_info.introduction[3],
              width=30).grid(row=1, column=1, sticky=W, padx=5, pady=5)
        Label(team_frame, text=self.team_info.introduction[4],
              width=30).grid(row=1, column=2, sticky=W, padx=5, pady=5)
        Label(team_frame, text=self.team_info.introduction[5],
              width=30).grid(row=1, column=3, sticky=W, padx=5, pady=5)

    def load_team_data(self):
        # One stat frame per category: points, assists, rebounds, steals, blocks.
        self.load_points_frame()
        self.load_assists_frame()
        self.load_rebounds_frame()
        self.load_steal_frame()
        self.load_block_frame()
        # Dead layout code kept as in the original (a bare string literal
        # acting as a commented-out block).
        '''team_average_frame = LabelFrame(self.team_info_win)
        team_average_frame.grid(row=1, column=0, padx=5, pady=5)
        Label(team_average_frame,
              text=self.team_average[0][0] + self.team_average[0][1] + '\n' +
              self.team_average[0][2]).grid(row=0, column=0)
        Label(team_average_frame,
              text=self.team_average[1][0] + self.team_average[1][1] + '\n' +
              self.team_average[1][2]).grid(row=0, column=1)
        Label(team_average_frame,
              text=self.team_average[2][0] + self.team_average[2][1] + '\n' +
              self.team_average[2][2]).grid(row=0, column=2)
        Label(team_average_frame,
              text=self.team_average[3][0] + self.team_average[3][1] + '\n' +
              self.team_average[3][2]).grid(row=0, column=3)
        Label(team_average_frame,
              text=self.team_average[4][0] + self.team_average[4][1] + '\n' +
              self.team_average[4][2]).grid(row=0, column=4)
        team_leader_frame = LabelFrame(self.team_info_win)
        team_leader_frame.grid(row=2, column=0, padx=5, pady=5)
        Label(team_leader_frame,
              text=self.team_leader[0][0] + '\n' + self.team_leader[0][1] +
              '\n' + self.team_leader[0][2]).grid(row=1, column=0)
        Label(team_leader_frame,
              text=self.team_leader[1][0] + '\n' + self.team_leader[1][1] +
              '\n' + self.team_leader[1][2]).grid(row=1, column=1)
        Label(team_leader_frame,
              text=self.team_leader[2][0] + '\n' + self.team_leader[2][1] +
              '\n' + self.team_leader[2][2]).grid(row=1, column=2)
        Label(team_leader_frame,
              text=self.team_leader[3][0] + '\n' + self.team_leader[3][1] +
              '\n' + self.team_leader[3][2]).grid(row=1, column=3)
        Label(team_leader_frame,
              text=self.team_leader[4][0] + '\n' + self.team_leader[4][1] +
              '\n' + self.team_leader[4][2]).grid(row=1, column=4)'''

    def load_points_frame(self):
        # Points category frame: team average on top, leader (photo + stats) below.
        points_frame = LabelFrame(self.team_info_win, text='得分')
        points_frame.grid(row=1, column=0, padx=5, pady=5)
        # Team points average
        Label(points_frame,
              text=self.team_average[0][0] + self.team_average[0][1] + '\n' +
              self.team_average[0][2]).grid(row=0, column=0, columnspan=2,
                                            padx=5, pady=5)
        # Individual points leader
        image = self.spider.load_internet_image(self.team_leader[0][0])
        image_label = Label(points_frame, image=image)
        image_label.grid(row=1, column=0, sticky=W, padx=5, pady=5)
        # Keep a reference so the image is not garbage collected by Tk.
        image_label.image = image
        Label(points_frame,
              text=self.team_leader[0][1] + '\n' + self.team_leader[0][2] +
              '\n' + self.team_leader[0][3]).grid(row=1, column=1, sticky=W,
                                                  padx=5, pady=5)

    def load_assists_frame(self):
        # Assists category frame (same layout as points).
        assists_frame = LabelFrame(self.team_info_win, text='助攻')
        assists_frame.grid(row=1, column=1, padx=5, pady=5)
        Label(assists_frame,
              text=self.team_average[1][0] + self.team_average[1][1] + '\n' +
              self.team_average[1][2]).grid(row=0, column=0, columnspan=2,
                                            padx=5, pady=5)
        image = self.spider.load_internet_image(self.team_leader[1][0])
        image_label = Label(assists_frame, image=image)
        image_label.grid(row=1, column=0, sticky=W, padx=5, pady=5)
        image_label.image = image
        Label(assists_frame,
              text=self.team_leader[1][1] + '\n' + self.team_leader[1][2] +
              '\n' + self.team_leader[1][3]).grid(row=1, column=1, sticky=W,
                                                  padx=5, pady=5)

    def load_rebounds_frame(self):
        # Rebounds category frame (same layout as points).
        rebounds_frame = LabelFrame(self.team_info_win, text='篮板')
        rebounds_frame.grid(row=1, column=2, padx=5, pady=5)
        Label(rebounds_frame,
              text=self.team_average[2][0] + self.team_average[2][1] + '\n' +
              self.team_average[2][2]).grid(row=0, column=0, columnspan=2,
                                            padx=5, pady=5)
        image = self.spider.load_internet_image(self.team_leader[2][0])
        image_label = Label(rebounds_frame, image=image)
        image_label.grid(row=1, column=0, sticky=W, padx=5, pady=5)
        image_label.image = image
        Label(rebounds_frame,
              text=self.team_leader[2][1] + '\n' + self.team_leader[2][2] +
              '\n' + self.team_leader[2][3]).grid(row=1, column=1, sticky=W,
                                                  padx=5, pady=5)

    def load_steal_frame(self):
        # Steals category frame (same layout as points).
        steal_frame = LabelFrame(self.team_info_win, text='抢断')
        steal_frame.grid(row=1, column=3, padx=5, pady=5)
        Label(steal_frame,
              text=self.team_average[3][0] + self.team_average[3][1] + '\n' +
              self.team_average[3][2]).grid(row=0, column=0, columnspan=2,
                                            padx=5, pady=5)
        image = self.spider.load_internet_image(self.team_leader[3][0])
        image_label = Label(steal_frame, image=image)
        image_label.grid(row=1, column=0, sticky=W, padx=5, pady=5)
        image_label.image = image
        Label(steal_frame,
              text=self.team_leader[3][1] + '\n' + self.team_leader[3][2] +
              '\n' + self.team_leader[3][3]).grid(row=1, column=1, sticky=W,
                                                  padx=5, pady=5)

    def load_block_frame(self):
        # Blocks category frame (same layout as points).
        block_frame = LabelFrame(self.team_info_win, text='盖帽')
        block_frame.grid(row=1, column=4, padx=5, pady=5)
        Label(block_frame,
              text=self.team_average[4][0] + self.team_average[4][1] + '\n' +
              self.team_average[4][2]).grid(row=0, column=0, columnspan=2,
                                            padx=5, pady=5)
        image = self.spider.load_internet_image(self.team_leader[4][0])
        image_label = Label(block_frame, image=image)
        image_label.grid(row=1, column=0, sticky=W, padx=5, pady=5)
        image_label.image = image
        Label(block_frame,
              text=self.team_leader[4][1] + '\n' + self.team_leader[4][2] +
              '\n' + self.team_leader[4][3]).grid(row=1, column=1, sticky=W,
                                                  padx=5, pady=5)

    def load_players(self):
        # Roster table: one Treeview row per player with nine columns
        # (number, name, position, height, weight, age, birthday, salary,
        # year entered the NBA).
        players_frame = LabelFrame(self.team_info_win, text='球员阵容')
        players_frame.grid(row=3, column=0, columnspan=5, padx=5, pady=5)
        players_list = Treeview(players_frame,
                                columns=('0', '1', '2', '3', '4', '5', '6',
                                         '7', '8'),
                                show='headings',
                                height=len(self.team_info.players))
        players_list.grid(row=0, column=0)
        players_list.column('0', width=50, anchor='center')
        players_list.column('1', width=150, anchor='center')
        players_list.column('2', width=100, anchor='center')
        players_list.column('3', width=70, anchor='center')
        players_list.column('4', width=70, anchor='center')
        players_list.column('5', width=50, anchor='center')
        players_list.column('6', width=100, anchor='center')
        players_list.column('7', width=100, anchor='center')
        players_list.column('8', width=100, anchor='center')
        players_list.heading('0', text='号码')
        players_list.heading('1', text='球员')
        players_list.heading('2', text='位置')
        players_list.heading('3', text='身高')
        players_list.heading('4', text='体重')
        players_list.heading('5', text='年龄')
        players_list.heading('6', text='生日')
        players_list.heading('7', text='工资(美元)')
        players_list.heading('8', text='进入NBA')
        for player in self.team_info.players:
            # NOTE(review): the documented Treeview.insert keyword is
            # ``values=``; ``value=`` appears to rely on Tk option-name
            # abbreviation — confirm it works on all Tk versions.
            players_list.insert('', 'end',
                                value=(player.number, player.name,
                                       player.position, player.height,
                                       player.weight, player.age,
                                       player.birthday, player.salary,
                                       player.time_enter))
# but that proved to be inefficient depth = 1 # Build the root node from argv[1] or default to grackles if len(arguments) > 2: raise ValueError('Too many arguments given') elif len(arguments) == 2: if url_validator(arguments[1]): return spider.build_node(arguments[1], None, depth) else: raise ValueError('Invliad URL given') else: return spider.build_node( _DEFAULT_URL, None, depth) if __name__ == '__main__': logging.basicConfig( level=logging.DEBUG, format='%(relativeCreated)6d %(threadName)s %(message)s') # Enable SIGINT handling see: # https://bugzilla.gnome.org/show_bug.cgi?id=622084 # https://stackoverflow.com/a/16486080/298149 signal.signal(signal.SIGINT, signal.SIG_DFL) wiki_nodes = get_nodes(Spider(),is_valid_url, sys.argv) TreeWindow(wiki_nodes) Gtk.main()
import threading
from queue import Queue
from spider import Spider
from domain import *
from general import *
import time

# Crawl configuration.
PROJECT_NAME = 'Gik'
HOMEPAGE = 'https://www.giki.edu.pk/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 8

queue = Queue()
x = Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)
# Seed the crawl with the homepage; presumably crawl_page returns the set of
# discovered links — confirm against Spider's implementation.
QUEUE = x.crawl_page('First spider', HOMEPAGE)


# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue
# NOTE(review): this definition is truncated — its body continues past this chunk.
def work():
    global QUEUE
    while True:
def __init__(self):
    # Run the base Spider initialization, then tag this spider's target site.
    Spider.__init__(self)
    self.site = 'taobao'