def getMovieHtml( self, filename, name, j, end, z ):
    """Read movie-detail page URLs from *filename* (every other line holds a
    URL) and crawl each one, saving the page as "<j>.html" under dizhi[z].

    Crawling starts when the URL equal to *name* is seen and stops right
    after the URL equal to *end* has been saved.  *j* is the starting index
    for the saved file names.

    NOTE(review): reconstructed from a whitespace-mangled source; the exact
    nesting of the `else`, `i += 1` and the sleep is a best guess -- verify
    against the original file.
    """
    # NOTE(review): handle is never closed, and `file` shadows the builtin.
    file = open(filename, 'r', encoding="UTF-8")
    i = 0
    j = j  # no-op assignment kept verbatim
    flag = False
    for url in file:
        if i % 2 == 0:  # only every other line carries a URL
            # Assumes the line contains an http(s) URL -- TODO confirm;
            # m would be None (and .group() would raise) otherwise.
            m = re.search(r'h.*', url)
            if m.group() == name:
                flag = True  # reached the configured start URL
            if flag:
                html = Spider().getHtml(m.group())
                if html:
                    Spider().saveHtml(html, dizhi[z] + str(j) + ".html")
                    print(str(m) + " :" + str(j) + ".html已存储~")
                    j += 1
                    if m.group() == end:
                        return  # reached the configured end URL
            else:
                print(url.strip())
        i += 1
        if j % 90 == 0:
            # Pause after every ~90 saved pages so requests are not too
            # frequent (anti-ban throttle).
            time.sleep(120)
def going(url):
    # Crawl all 2018 article links found on *url*; insert each article into
    # the Christian table, and record crawl failures in SpiderExcept.
    # NOTE(review): reconstructed from a whitespace-mangled source. Per token
    # order the final `else` belongs to the try (it runs on EVERY successful
    # iteration, so "NO" prints even after "YES") -- confirm against the
    # original file.
    mail = email()
    text = wordsDeal()
    spider1 = Spider(url, 'test')
    hrefs = spider1.hrefFor2018()
    for item in hrefs:
        try:
            content = spider1.contentOfArtical(item['href'])
            if content['content'] != "contents":  # placeholder body == empty
                # NOTE(review): SQL built by string concatenation; injection
                # safety relies entirely on text.sqlEscape.
                query = "insert into Christian(href,title, content, sent) values ( '" + text.sqlEscape(
                    item['href']) + "','" + text.sqlEscape(
                        item['title']) + "','" + text.sqlEscape(
                            content['content'] ) + "','" + config.notSend + "');"
                sqlQuery(query)
                print "YES ", item['title']
                # mail.sendAuto(content['title'], content['content'] + '<p>' + Chinese + '</p>')
        except BaseException as error:
            # Record the failing href and error text for later inspection.
            query = "insert into SpiderExcept(href,except) values ( '" + text.sqlEscape(
                item['href']) + "','" + text.sqlEscape(str(error)) + "');"
            sqlQuery(query)
        else:
            print "NO ", item['title']
    del spider1
    del text
    del mail
def main(): # 根据所带参数,确定使用哪个网站的配置参数 try: website = sys.argv[1] url = sys.argv[2] except Exception as e: print "please choose one website" exit() # 实例化 dic = { "qidian": Qidian, "heiyan": Heiyan, } config = dic[website]() # 获取关键信息 handler = Spider(config.title, config.content, config.next) chapters = config.getList(url) book = open("text.txt", "w") for item in chapters: print "正在下载->", item["title"] content = handler.getContent(item["href"]) book.writelines(item["title"] + "\n") book.writelines(content["content"] + "\n")
def startCB(self): # 保存内容的文件 file = open(self.filePath, "w") # 爬取得规则 titleKlass = {"class": "j_chapterName"} contentKlass = {"class": "j_readContent"} nextKlass = {"id": "j_chapterNext"} page = self.entryUrl.get() # 开始爬取 spider = Spider(titleKlass, contentKlass, nextKlass) if page == "" or self.filePath == "": tkMessageBox.showerror("woolson", "小说名称或链接未填写!") else: # 循环抓取下一章 while page != "": result = spider.getContent(page) try: page = result["nextUrl"] file.write(result["title"] + "\n") file.write(result["content"] + "\n\n") print "正在写入->" + result["title"] except Exception as e: page = "" print "结束", result["error"]
def parse_listing(categories):
    # For each category page, collect all product links and run parse_detail
    # on each, with a 3-second visible countdown between products.
    # NOTE(review): reconstructed from a whitespace-mangled source; the
    # trailing `break` stops after the FIRST category -- looks like leftover
    # debugging, and its exact loop level is a best guess. Confirm before
    # relying on it.
    postman = Postman.init()
    for category in categories:
        print(category)
        spider = Spider(url=category["url"])
        # Debug dump of the raw listing page (appended across runs).
        with open("page.html", "a") as f:
            f.write(spider.get_page())
        # NOTE(review): get_page() is called twice -- if it re-fetches, this
        # doubles the traffic; verify it caches the response.
        soup = BeautifulSoup(spider.get_page(), "html.parser")
        soup_a = soup.find_all("a", class_="item _item")
        products = [a["href"] for a in soup_a]
        print(products)
        for num, url in enumerate(products):
            parse_detail(category, url, postman)
            print(category, num, len(products))
            # Crude rate limit: three 1-second sleeps with a countdown.
            print("sleep 3")
            time.sleep(1)
            print("sleep 2")
            time.sleep(1)
            print("sleep 1")
            time.sleep(1)
        break
def initialization():
    """Prepare a crawl session.

    Returns a (spider, uid_list, user_list) tuple: a Spider driven by the
    crawling accounts, the user-id tasks to crawl, and the accounts
    themselves.
    """
    tasks = get_tasks(TASK_NUM)            # user ids to crawl
    accounts = get_accounts(ACCOUNT_NUM)   # accounts used for crawling
    return Spider(accounts), tasks, accounts
def create_threads(self):
    """Create, start and add threads to a list. Threads run an instance of
    Spider. The amount of threads created depends on the amount of cores
    found in the system."""
    # NOTE(review): range(1, cpu_count()) spawns cpu_count-1 threads, and on
    # a single-core machine spawns none -- confirm this off-by-one is
    # intended.
    for i in range(1, multiprocessing.cpu_count()):
        name = "Thread-%s" % i
        # Spider is presumably a Thread subclass fed by the shared
        # queue/result objects -- verify.
        thread = Spider(name, self.queue, self.result)
        thread.start()
        # NOTE(review): `threads` is not defined in this method -- presumably
        # a module-level list; verify it is not meant to be self.threads.
        threads.append(thread)
def spider_store_details(self):
    """Fetch and parse the contact page of every stored shop, then persist
    the parsed contact info.

    Each row from query_store() appears to be (store_id, store_url) -- TODO
    confirm.  If the shop's landing page yields no contacts, the explicit
    '/shop/company.html' page is fetched and parsed as a fallback.
    """
    stores = self.query_store()
    for store in stores:
        time.sleep(1)  # throttle: at most one store per second
        try:
            contacts = None
            # Fetch the contact page.
            spider_time = time.time()
            self.logger.info('开始爬取:' + store[1])
            result_contacts = Spider().spider_URL(
                url=store[1], is_proxies=True)
            self.logger.info('爬取耗时:' + str(time.time() - spider_time))
            # Parse the contact page.
            interpreting_time = time.time()
            self.logger.info('开始解析:' + store[1])
            contacts = Interpreter().interpreting_contact_info(
                result_contacts)
            self.logger.info('解析耗时:' + str(time.time() - interpreting_time))
            # Fallback: retry against the explicit company page when nothing
            # was found on the landing page.
            if not contacts:
                spider_time = time.time()
                store[1] += '/shop/company.html'
                self.logger.info('开始爬取:' + store[1])
                result_contacts = Spider().spider_URL(
                    url=store[1], is_proxies=True)
                self.logger.info('爬取耗时:' + str(time.time() - spider_time))
                # Parse the fallback contact page.
                interpreting_time = time.time()
                self.logger.info('开始解析:' + store[1])
                contacts = Interpreter().interpreting_contact_info(
                    result_contacts)
                self.logger.info('解析耗时:' +
                                 str(time.time() - interpreting_time))
            # Persist whatever was found (possibly still falsy).
            self.update_contacts(store[0], contacts)
        except BaseException:
            # Log and move on; one bad store must not kill the batch.
            self.logger.error('爬取或更新联系方式出错:' + traceback.format_exc())
            continue
def run(key):
    # Build the search URL for *key*, fetch it, and if the page fails the
    # anti-bot check, refresh cookies/proxies before parsing fund data.
    # NOTE(review): reconstructed from a whitespace-mangled source; the
    # position of the parsing block relative to the `if` is a best guess.
    url = set_url(host, key)
    # NOTE(review): instance is discarded -- presumably the constructor
    # primes a cookie store later read by Cookies.cookie_str; verify.
    Cookies()
    spider = Spider(url)
    html = spider.spider(BASEHEADERS)
    if not verify(html):
        # Page looks blocked: append fresh anti-crawl cookies and rotate
        # the proxy pool.
        BASEHEADERS["Cookie"] = BASEHEADERS["Cookie"] + Cookies.cookie_str(
            ["acw_tc", "PHPSESSID"])
        proxieser.proxies()
        # NOTE(review): the page is NOT re-fetched after refreshing cookies;
        # the blocked html is parsed below -- confirm this is intended.
    parser = HtmlParser(html)
    data = parser.parser("fund")
    print(data)
def main():
    """Crawl Google Scholar author listings tagged `complex_systems` and
    pickle the scraped results to result.pickle."""
    # Entry page: authors carrying the complex_systems label.
    entry = ('https://scholar.google.com.tw/citations'
             '?view_op=search_authors&hl=en&mauthors=label:complex_systems')
    # No positive/negative keyword filtering for this crawl.
    crawler = Spider(entry, [], [], page=5)
    scraped = crawler.crawl()
    with open('result.pickle', 'wb') as handle:
        pickle.dump(scraped, handle, protocol=pickle.HIGHEST_PROTOCOL)
def course():
    """Flask view for course lookup.

    POST: log in with the submitted credentials and return the user's
    course data as JSON (or a failure string).  GET: render the form.
    """
    post_format = {"username": "", "password": ""}
    if request.method != 'POST':
        return render_template("index.html", format=post_format)
    user = Spider(request.form['username'], request.form['password'])
    user.login()
    if not user.login_status:
        return "登录失败"
    return jsonify(user.modify_data())
def grade():
    """Flask view for grade lookup.

    POST: log in with the submitted credentials and return the user's
    grades as JSON (or a failure string).  GET: render the form.
    """
    post_format = {"username": "", "password": ""}
    if request.method == 'POST':
        username = request.form['username']
        password = request.form['password']
        # FIX: the local was named `grade`, shadowing this view function
        # itself; renamed to `user` (matching the sibling course() view).
        user = Spider(username, password)
        user.login()
        if user.login_status:
            info = user.modify_grade()
            print(info)
            return jsonify(info)
        else:
            return "登录失败"
    else:
        return render_template("index.html", format=post_format)
def spider_product(self, category_2nd, start, end):
    """Crawl listing pages [start, end) of a second-level category and save
    every product (plus its company contact) found on them.

    category_2nd appears to be (category_id, category_url) -- TODO confirm.
    Each listing page holds 60 products, fetched in three 20-item requests.
    """
    for i in range(start, end):
        # FIX: pre-bind `req` so the except-logging below cannot raise
        # UnboundLocalError when a failure happens before the first request
        # URL is built (which would mask the real traceback).
        req = category_2nd[1]
        try:
            # Rest 1s before each page to stay polite.
            time.sleep(1)
            # Fetch the page's 60 products in three batches of 20.
            product_list = []
            for j in range(3):
                req = category_2nd[
                    1] + '&ap=A&t=1&afadprenum=0&af=1' + '&ee=' + str(
                        i) + '&afadbeg=' + str(60 * (i - 1) + (j * 20) + 1)
                # Fetch and parse one batch.
                spider_time = time.time()
                self.logger.info('开始爬取:' + req)
                result_product_list = Spider().spider_URL(url=req,
                                                          is_proxies=True)
                self.logger.info('爬取耗时:' + str(time.time() - spider_time))
                interpreting_time = time.time()
                self.logger.info('开始解析:' + req)
                product_list.extend(
                    Interpreter().interpreting_product_list(
                        result_product_list))
                self.logger.info('解析耗时:' +
                                 str(time.time() - interpreting_time))
            # Crawl the whole listing first, then persist product by product
            # (details and company info are fetched later elsewhere).
            for product in product_list:
                # Save the company's contact info and link the product to it.
                contact = {}
                contact['公司名'] = product.get('company')
                contact['公司主页'] = product.get('homepage')
                store_id = self.save_contacts(contact)
                self.save_product(product, category_2nd[0], store_id)
        except BaseException:
            # One bad page must not abort the whole category.
            self.logger.error('爬取或解析' + req + '出错:' + traceback.format_exc())
            continue
def spider_product_details(self):
    """Fetch and parse the detail page of every saved product, then update
    the product record with the parsed details.

    Each row from query_products() appears to be (product_id, product_url)
    -- TODO confirm.
    """
    products = self.query_products()
    for product in products:
        time.sleep(1)  # throttle: at most one product per second
        try:
            # Fetch the product detail page.
            spider_time = time.time()
            self.logger.info('开始爬取:' + product[1])
            result_products = Spider().spider_URL(
                url=product[1], is_proxies=True)
            self.logger.info('爬取耗时:' + str(time.time() - spider_time))
            # Parse the product detail page.
            interpreting_time = time.time()
            self.logger.info('开始解析:' + product[1])
            details = Interpreter().interpreting_product_details(
                result_products)
            self.logger.info('解析耗时:' +
                             str(time.time() - interpreting_time))
            # Disabled second-pass fetch kept verbatim below.
            '''
            # 若产品详情需要再次爬取
            product_bcid = details.get('bcid')
            if product_bcid:
                interpreting_time = time.time()
                xss_filter = 'http://wsdetail.b2b.hc360.com/XssFilter?callback=jQuery&bcid='
                result_product_introduce = Spider().spider_URL(
                    url=xss_filter + product_bcid)
                self.logger.info('开始解析:' + xss_filter + product_bcid)
                details['desc'] = Interpreter(
                ).interpreting_product_details_desc(
                    result_product_introduce)
                # 组装产品详情
                details = Interpreter().assemble_product_details(details)
                self.logger.info('解析耗时:' +
                                 str(time.time() - interpreting_time))
            '''
            # Update the stored product record with the parsed details.
            self.update_products(product[0], details)
        except BaseException:
            # Log and continue with the next product.
            self.logger.error('爬取或更新产品详情出错:' + traceback.format_exc())
            continue
def main():
    """Crawl Google Scholar results for frequency-lowering-algorithm papers,
    scored with hearing-aid keywords, and pickle the results."""
    entry = ('https://scholar.google.com.tw/scholar'
             '?q=frequency+lowering+algorithm&hl=zh-TW&as_sdt=0,5')
    # Keywords marking a hit as relevant...
    positive = [
        'wdrc', 'dynamic range compression', 'hearing aid', 'speech',
        'noise cancellation', 'noise reduction', 'feedback cancellation',
        'sound', 'hearing loss'
    ]
    # ...and keywords marking it as off-topic (imaging/optics work).
    negative = [
        'imagery', 'image', 'visual', 'video', 'optic', 'opto', 'quantum',
        'photon'
    ]
    crawler = Spider(entry, positive, negative, page=5)
    scraped = crawler.crawl()
    with open('result.pickle', 'wb') as handle:
        pickle.dump(scraped, handle, protocol=pickle.HIGHEST_PROTOCOL)
def spider_job():
    """Scheduled job: crawl product categories 1-5; on any failure, send an
    alert e-mail containing the error text."""
    try:
        spider = Spider.Spider()
        # FIX: the five copy-pasted spider.spider(categoryId=N) calls are
        # collapsed into one loop over the same ids.
        for category_id in range(1, 6):
            spider.spider(categoryId=category_id)
        print("success")
        time.sleep(5)
    except Exception as err:
        # Error alerting via e-mail.
        # FIX: err.args may contain non-strings; str() each element so the
        # alert path itself cannot raise TypeError.
        errStr = ",".join(str(arg) for arg in err.args)
        myEmail = MyEmail.MyEmail()
        # FIX: typo in the alert subject (爬去 -> 爬取).
        myEmail.tag = "新发地商品数据爬取异常"
        myEmail.to_list = ["*****@*****.**"]
        myEmail.content = errStr
        myEmail.send()
        print(errStr)
def main(batchSize=25, threads=8, timeout=3.0, maxDepth=1, limitPerSpider=200,
         webpagesLimit=1016, initialize=False, recursive=False,
         allLinks=False, mode='normal'):
    """Run a multi-threaded crawl over the not-yet-visited pages in the DB.

    mode: 'explore'  -> visit only base-domain pages, harvest every link
          'in-depth' -> visit only interior pages of already-known domains
          otherwise  -> visit up to webpagesLimit pending pages
    initialize=True just (re)initializes the database and returns.
    """
    # One distinct (color, style) console combination per spider.
    colorsCombinations = [(color, style) for style in styles
                          for color in colors]
    db = Database()
    if (initialize):
        db.Initialize()
        return 0
    toVisit = []
    if mode == 'explore':
        # Breadth: domain landing pages only, but follow all links.
        recursive = False
        allLinks = True
        toVisit = list(
            db.notVisited.find({'baseDomain': True}, {
                'url': 1,
                'depth': 1
            }))
    elif mode == 'in-depth':
        # Depth: interior pages only, restricted to known domains.
        recursive = True
        allLinks = False
        toVisit = list(
            db.notVisited.find({'baseDomain': False}, {
                'url': 1,
                'depth': 1
            }))
        knownDomains = set(db.GetKnownDomains())
        toVisit = list(
            filter(
                lambda element: GetBaseDomain(element['url']) in knownDomains,
                toVisit))
    else:
        toVisit = list(
            db.notVisited.find({}, {
                'url': 1,
                'depth': 1
            }).limit(webpagesLimit))
    # Never run more threads than there is work.
    threads = len(toVisit) if len(toVisit) < threads else threads
    print(f'{Fore.BLUE}Webpages to visit: {len(toVisit)}{Style.RESET_ALL}')
    print(f'{Fore.BLUE}Threads: {threads}{Style.RESET_ALL}')
    if threads == 0:
        print(f'{Fore.BLUE}Nothing to do...{Style.RESET_ALL}')
        return 0
    # Split the workload into equal chunks, capped per spider.
    webpagesPerSpider = int(math.floor(len(toVisit) / threads))
    webpagesPerSpider = limitPerSpider if webpagesPerSpider > limitPerSpider else webpagesPerSpider
    chunks = [
        toVisit[i:i + webpagesPerSpider]
        for i in range(0, len(toVisit), webpagesPerSpider)
    ]
    # Crawl parameters are shared with workers via Spider CLASS attributes.
    Spider.allLinks = allLinks
    Spider.batchSize = batchSize
    Spider.limit = limitPerSpider
    Spider.maxDepth = maxDepth
    Spider.recursive = recursive
    Spider.timeout = timeout
    print(f'{Fore.BLUE}Spiders: {len(chunks)}{Style.RESET_ALL}')
    spiders = []
    for i in range(len(chunks)):
        spiderColors = colorsCombinations[i % len(colorsCombinations)]
        spider = Spider(str(i), spiderColors[0], spiderColors[1])
        spider.toVisit = chunks[i]
        spiders.append(spider)
    # NOTE(review): len(chunks) may exceed `threads`, so some spiders queue
    # behind others -- presumably intended; verify.
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=threads) as executor:
        futures = [executor.submit(spider.Search) for spider in spiders]
        for future in futures:
            future.result()
# -*- coding: utf-8 -*- import os from flask import Flask, request, Response from flask_uploads import UploadSet, configure_uploads, IMAGES, patch_request_class import json from IRNet import IRNet from Spider import Spider app = Flask(__name__) app.config['UPLOADED_PHOTOS_DEST'] = os.getcwd() + '/upload' net = IRNet() net.load_model() net.predict('1.jpg') spider = Spider() photos = UploadSet('photos', IMAGES) configure_uploads(app, photos) patch_request_class(app) # set maximum file size, default is 16MB #net = IRNet() #net.load_model() html = ''' <!DOCTYPE html> <title>Upload File</title> <h1>图片上传</h1> <form method=post enctype=multipart/form-data> <input type=file name=photo> <input type=submit value=上传> </form>
from Spider import Spider
from Query import Query
import sys

# Command-line entry point:
#   crawl          -> crawl English Wikipedia from the root page
#   query <text>   -> run a query against the crawled index
arguments = sys.argv

# FIX: guard against missing arguments; the original raised a bare
# IndexError when run without a sub-command (or without query text).
if len(arguments) < 2:
    print("usage: %s crawl | query <text>" % arguments[0])
    sys.exit(1)

if arguments[1] == "crawl":
    spider = Spider("https://en.wikipedia.org/")
    spider.crawl()
elif arguments[1] == "query":
    if len(arguments) < 3:
        print("usage: %s query <text>" % arguments[0])
        sys.exit(1)
    query = Query(arguments[2])
    query.query()
    # query.multiWordQuery(["action","design"])
from Course import Course
from Spider import Spider
import sys

if __name__ == '__main__':
    # usage: python <script> <term> <CRN> -- watch one course's registration.
    course = Course(term=sys.argv[1], CRN=sys.argv[2])
    header = {
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0'
    }
    # NOTE(review): hard-coded session cookie -- it will expire, and it leaks
    # account/session identifiers into source control; move it to an
    # environment variable or config file.
    cookie = "JSESSIONID=3AC5E57A46288ADDCFC379DD705B8DFB; _ga=GA1.2.1751626557.1561253718; rollup=GA1.2.2143599923.1561254105; subdirectory=GA1.2.1907215284.1561254106; subdomain=GA1.2.1953509159.1561254106; _mkto_trk=id:558-EBH-425&token:_mch-neu.edu-1561254106843-66694; _hjid=f7bd8518-0e00-4b57-b491-5115c949e1f6; _sp_id.cb6f=219c05ce-514e-4010-a0eb-21cba0580c12.1561254106.4.1565081306.1564370721.6ac23e35-e12c-4820-8032-deff60463f54; _gid=GA1.2.1060020417.1574032199; nubanner-cookie=3676250523.36895.0000; IDMSESSID=B450B8320592C4EDD35CCFAB19035112A99279B3FAB0DB15988A4C35356A1ED4E3F335459F9D7568EC15FB5E07D48BF085D6D41FB53655C6A844236EC2477175"
    spider = Spider(course, cookie, header)
    spider.setTimeGap(5)  # poll interval -- presumably seconds; confirm units
    spider.start()
from util.MysqlManager import MysqlManager
from Spider import Spider
import logging

logging.basicConfig(filename='./log/20181022.txt', level=logging.INFO)

if __name__ == "__main__":
    # Crawl every configured source (url / brand / class) sequentially.
    records = MysqlManager().fetch_all_source()
    for record in records:
        crawler = Spider(record.get("url"), record.get("brand"),
                         record.get("class"))
        crawler.work()
def interpreting_product_details(self, html_doc):
    """Parse a product-detail HTML page into a dict with keys 'imgs',
    optionally 'bcid', and 'details' (a styled HTML fragment).

    NOTE(review): reconstructed from a whitespace-mangled source; nesting
    (especially where `style`/`product['details']` sit relative to the two
    layout branches) is a best guess -- verify against the original file.
    """
    try:
        # Accumulates the parsed product information.
        product = {}
        doc = BeautifulSoup(html_doc, 'html5lib')
        # Product images.
        img = []
        # Multi-image gallery: one thumbnail <li> per image.
        product_li = doc.find_all('li', class_='tab-trigger')
        if product_li:
            for li in product_li:
                product_a = li.find('a', attrs={
                    "data-useractivelogs":
                    "UserBehavior_detail_smallphoto"
                })
                if product_a:
                    # Take the thumbnail's src...
                    product_img = product_a.find('img')
                    product_img_src = product_img.get('src')
                    # ...and strip the trailing '..'-suffixed size variant to
                    # recover the original image URL.
                    last_index = product_img_src.rfind('..')
                    img.append(product_img_src[:last_index])
        # No gallery: fall back to the single default large image.
        else:
            product_img_div = doc.find('div', class_='vertical-img')
            if product_img_div:
                product_img = product_img_div.find(
                    'a',
                    attrs={
                        "data-useractivelogs":
                        "UserBehavior_detail_bigphoto"
                    })
                product_img_hrefs = product_img.get('hrefs')
                if not product_img_hrefs:
                    product_img_src = product_img.find('img').get('src')
                    # Recover the original image URL as above.
                    last_index = product_img_src.rfind('..')
                    product_img_hrefs = product_img_src[:last_index]
                img.append(product_img_hrefs)
        product['imgs'] = img
        ''' 产品详情有两种展示效果,因此需要不同解析 '''
        # Layout 1: full detail container (proDetailCon).
        pdetail = doc.find('div',
                           id='pdetail',
                           class_='proDetailCon tab_content_event_class')
        if pdetail is not None:
            # The product's unique identifier.
            product_bcid = doc.find('input', id='bcid').get('value')
            # Drop the footer block from the detail markup.
            detail_bot = pdetail.find('div', class_='detailBot')
            detail_bot.decompose()
            introduce = pdetail.find('div', id='introduce')
            if product_bcid:
                product['bcid'] = product_bcid
                # The introduction body is served by a separate endpoint
                # keyed on bcid; fetch it and splice it in.
                xss_filter = 'http://wsdetail.b2b.hc360.com/XssFilter?callback=jQuery&bcid='
                result_product_introduce = Spider().spider_URL(
                    url=xss_filter + product_bcid)
                product_introduce = self.interpreting_product_details_desc(
                    result_product_introduce)
                introduce.replace_with(
                    BeautifulSoup(product_introduce, 'html.parser'))
        else:
            # Layout 2: plain detail container (pdetail).
            pdetail = doc.find('div',
                               id='pdetail',
                               class_='pdetail tab_content_event_class')
            if pdetail is not None:
                # Basic-parameters table.
                vopy = pdetail.find('div', class_="d-vopy")
                # Strip image boxes from the basic-parameters list.
                vopyImgBoxs = vopy.find_all('div', class_='d-vopyImgBox')
                for vopyImgBox in vopyImgBoxs:
                    vopyImgBox.decompose()
                # Strip the "similar products" spans.
                span = pdetail.find_all(
                    'span', class_='same-parameter-commodity-hook')
                for s in span:
                    s.decompose()
                # Detailed-description div.
                d_xi_b = pdetail.find('div', class_='d-xi-b').find('div')
                detail_imgs = d_xi_b.find_all('img')
                if detail_imgs:
                    # NOTE(review): this loop variable shadows the `img`
                    # image-URL list built above (already stored in product).
                    for img in detail_imgs:
                        del img['onerror']
                        del img['onload']
                # Bare text nodes directly under the description (no tags):
                # blank them all to remove the "慧聪网" watermark wording.
                content_text = d_xi_b.find_all(text=True, recursive=False)
                if content_text:
                    for text in content_text:
                        text.replace_with('')
        # Inline stylesheet prepended so the extracted fragment renders
        # stand-alone.
        style = '''<style>
        #introduce {font-size: 14px;}
        table {border-collapse: collapse;border-spacing: 0;}
        p {margin: 0;}
        .dvop-title {line-height: 30px;font-size: 14px;color: rgb(51, 51, 51);padding-bottom: 10px;}
        .dvop-title h4 {font-weight: normal;}
        .d-vopy table {width: 100%;float: left;font-size: 12px;margin-bottom: 18px;border-left: 1px solid rgb(237, 237, 237);border-top: 1px solid rgb(237, 237, 237);}
        .d-vopy th {width: 200px;background-color: rgb(245, 245, 245);text-align: center;font-weight: normal;min-height: 34px;line-height: 34px;border-right: 1px solid rgb(237, 237, 237);border-bottom: 1px solid rgb(237, 237, 237);padding: 0px;}
        .d-vopy td {border-right: 1px solid #ededed;border-bottom: 1px solid #ededed;vertical-align: top;}
        .d-vopy td {padding-left: 20px;line-height: 34px;}
        .d-vopy th h4 {font-size: 12px;color: rgb(51, 51, 51);margin: 0px;}
        .d-vopyList {overflow: hidden;}
        .d-vopyList {line-height: 34px;padding-left: 20px;}
        .d-vopyList p {float: left;}
        .d-vopyList p {padding-right: 20px;width: 500px;line-height: 24px;padding: 5px 0;}
        .d-xi-b {padding: 10px 0px;font-size: 12px;}
        </style>
        '''
        # NOTE(review): if neither layout matched, pdetail is None here and
        # .prettify() raises AttributeError, which the handler below logs.
        product['details'] = style + pdetail.prettify()
    except AttributeError:
        self.logger.error('对象没有这个属性:' + traceback.format_exc())
    except KeyError:
        self.logger.error('映射中没有这个键:' + traceback.format_exc())
    except BaseException:
        self.logger.error('解析产品详情出错:' + traceback.format_exc())
    return product
import threading
from queue import Queue
from Spider import Spider
from domain import *
from WebCrawler import *

Project_Name = "The WebCrawler"
Home_Page = "http://codechannels.com/channel/thenewboston/"
Domain_Name = get_full_domain_name(Home_Page)
Queue_File = Project_Name + '_queue.txt'
Crawled_File = Project_Name + '_crawled.txt'
Number_Of_Threads = 8
queue = Queue()
# Instantiating Spider once presumably primes its class-level project state
# (crawling below is done through class methods) -- verify.
Spider(Project_Name, Home_Page, Domain_Name)


# Create worker threads (daemon: they die when main exits).
def create_workers():
    for _ in range(Number_Of_Threads):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue, forever.
def work():
    while True:
        url = queue.get()
        Spider.crawling(threading.current_thread().name, url)
        queue.task_done()
with open('error/error.txt', 'a+') as f: f.write('error/error_server 76') f.write(str(e)+'\n') print('1010') break manage.shutdown() spider_main.save() if __name__ == "__main__": pickle = os.listdir('pickle/') print('当前的已保存搜索文件:', pickle) name = input('输入搜索代号:') path = name + '.pickle' used_path = name + '_used.pickle' spider_main = Spider(name, used_path) if path not in pickle: start = time.time() url = 'https://www.bilibili.com/index/rank/all-30-3.json' try: spider_main.crawl(url, path) except Exception as e: with open('error/error.txt', 'a+') as f: f.write('94'+str(e) + '\n') end = time.time() times = int(end - start) if times > 60: mins = times//60
def StartSpider(self, name):
    """Create a Spider identified by *name* and start it."""
    Spider(name).start()
import threading
from queue import Queue
from Spider import Spider
from domain import *
from general import *

PROJECT_NAME = 'testingOne'
HOMEPAGE = 'https://thenewboston.com/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 4
queue = Queue()
# Instantiating Spider once presumably primes its class-level project state
# (crawling below is done through the class method) -- verify.
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Create worker threads (daemon: they die when main exits).
# `_` because we only want to loop a fixed number of times.
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue, forever.
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
def library():
    """Return the library visit data read by Spider as a JSON response."""
    return jsonify(Spider().read_library())
shieldDir = "D:/Program Files/Streamlabs Chatbot/Services/Twitch/shields.txt" shieldDamageDir = "D:/Program Files/Streamlabs Chatbot/Services/Twitch/shieldDamage.txt" campfireDir = "D:/Program Files/Streamlabs Chatbot/Services/Twitch/flame.txt" attackers = [Vine(60, 1.0, 5, 1.0, 20, 120), # dpm of 5 Vine(60, 1.0, 5, 1.0, 20, 120), Vine(60, 1.0, 5, 1.0, 20, 120), Vine(60, 1.0, 5, 1.0, 20, 120), Vine(60, 1.0, 5, 1.0, 20, 120), Vine(60, 1.0, 5, 1.0, 20, 120), Vine(60, 1.0, 5, 1.0, 20, 120), Vine(60, 1.0, 5, 1.0, 20, 120), Vine(60, 1.0, 5, 1.0, 20, 120), Vine(60, 1.0, 5, 1.0, 20, 120), Spider(60, 1.0, 15, 1.0, 100, 240), # dpm of 15 Spider(60, 1.0, 15, 1.0, 100, 240), Spider(60, 1.0, 15, 1.0, 100, 240), Spider(60, 1.0, 15, 1.0, 100, 240), Spider(60, 1.0, 15, 1.0, 100, 240), Spider(60, 1.0, 15, 1.0, 100, 240), ShadowBoundBear(120, 1.0, 60, 1.0, 300, 300), # dpm of 30 ShadowBoundBear(120, 1.0, 60, 1.0, 300, 300), ShadowBoundBear(120, 1.0, 60, 1.0, 300, 300), ShadowBoundBear(120, 1.0, 60, 1.0, 300, 300), Beast(120, 1.0, 70, 1.0, 100, 300), # dpm of 35, increases over time Beast(120, 1.0, 70, 1.0, 100, 300), Colossus(60, 5.0, 700, 1.0, 2000, 1800), # dpm of 140, increases over time Colossus(60, 5.0, 700, 1.0, 2000, 1800), Dragon(300, 1.0, 1000, 1.0, 2000, 3600), # dpm of 200. Reward increases over time, difficult to kill. Ashvine(60, 1.0, 30, 1.0, 60, 50), # dpm of 30. Increases over time, harder to kill over time, reward increases over time.
from Goblin import Goblin
from Store import Store
from Wizard import Wizard
from Spider import Spider
from Snake import Snake
from Medic import Medic
from Shadow import Shadow
from Zombie import Zombie

if __name__ == "__main__":
    # Fight each enemy in order; shop between victories; any loss ends the
    # run immediately.
    hero = Hero()
    enemies = [
        Goblin(), Wizard(), Medic(), Shadow(), Zombie(), Spider(), Snake()
    ]
    battle_engine = Battle()
    shopping_engine = Store()
    for enemy in enemies:
        if not battle_engine.do_battle(hero, enemy):
            print("YOU LOSE!")
            exit(0)
        shopping_engine.do_shopping(hero)
    print("YOU WIN!")
BASE_URL = input('Enter The website URL:\t') if re.match(regex, BASE_URL) is not None: RESPONSE = urlopen(BASE_URL).getcode() if RESPONSE != 200: WRONG = True print("WRONG URL") else: break else: WRONG = True print("WRONG URL") DOMAIN = get_domain_name(BASE_URL) SEARCH_WORD = input('Enter the search text, if there is none press enter:\t') Spider(PROJECT_NAME, BASE_URL, DOMAIN, 'Spider' + str(SPIDER_ID), SEARCH_WORD) while True: if len(Spider.wait_list) <= 0: break BASE_URL = Spider.wait_list.pop() Spider.wait_list.add(BASE_URL) Spider(PROJECT_NAME, BASE_URL, DOMAIN, 'Spider' + str(SPIDER_ID), SEARCH_WORD) SPIDER_ID += 1 URLS_GATHERED = len(Spider.crawled) print('\n' + "Finished Crawling.\n" + "Number of URLs Gathered:\t" + str(URLS_GATHERED)) if SEARCH_WORD != '': print("\nSearch Results:\nThe Search Word Found in These URLS:\n")