def parse_listing(categories):
    """Crawl each category listing page and hand every product link to parse_detail().

    For each category: fetch the listing page, append its raw HTML to
    page.html, collect product links (<a class="item _item">), parse each
    one, then sleep with a visible countdown before the next category.
    """
    postman = Postman.init()
    for category in categories:
        print(category)
        spider = Spider(url=category["url"])
        # BUG FIX: the original called spider.get_page() twice (once to write
        # the file, once to parse), which may issue a second HTTP request for
        # the same page; fetch once and reuse.
        page = spider.get_page()
        with open("page.html", "a") as f:
            f.write(page)
        soup = BeautifulSoup(page, "html.parser")
        soup_a = soup.find_all("a", class_="item _item")
        products = [a["href"] for a in soup_a]
        print(products)
        for num, url in enumerate(products):
            parse_detail(category, url, postman)
            print(category, num, len(products))
        # Throttle with a countdown before the next category.
        for remaining in (3, 2, 1):
            print("sleep %d" % remaining)
            time.sleep(1)
        break  # NOTE(review): only the first category is ever processed — confirm intended
def test_cls_Spider_scapy__print_OnScapy():
    """Smoke-test Spider.scapy() using the printing scapy callback."""
    target = "http://www.hao123.com/"
    limit = 100
    patterns = producePtns()
    callback = print_OnScapy()
    Spider(target, patterns, max_size=limit, onScapy=callback).scapy()
def startCB(self): # 保存内容的文件 file = open(self.filePath, "w") # 爬取得规则 titleKlass = {"class": "j_chapterName"} contentKlass = {"class": "j_readContent"} nextKlass = {"id": "j_chapterNext"} page = self.entryUrl.get() # 开始爬取 spider = Spider(titleKlass, contentKlass, nextKlass) if page == "" or self.filePath == "": tkMessageBox.showerror("woolson", "小说名称或链接未填写!") else: # 循环抓取下一章 while page != "": result = spider.getContent(page) try: page = result["nextUrl"] file.write(result["title"] + "\n") file.write(result["content"] + "\n\n") print "正在写入->" + result["title"] except Exception as e: page = "" print "结束", result["error"]
def going(url): mail = email() text = wordsDeal() spider1 = Spider(url, 'test') hrefs = spider1.hrefFor2018() for item in hrefs: try: content = spider1.contentOfArtical(item['href']) if content['content'] != "contents": query = "insert into Christian(href,title, content, sent) values ( '" + text.sqlEscape( item['href']) + "','" + text.sqlEscape( item['title']) + "','" + text.sqlEscape( content['content'] ) + "','" + config.notSend + "');" sqlQuery(query) print "YES ", item['title'] # mail.sendAuto(content['title'], content['content'] + '<p>' + Chinese + '</p>') except BaseException as error: query = "insert into SpiderExcept(href,except) values ( '" + text.sqlEscape( item['href']) + "','" + text.sqlEscape(str(error)) + "');" sqlQuery(query) else: print "NO ", item['title'] del spider1 del text del mail
def main(): # 根据所带参数,确定使用哪个网站的配置参数 try: website = sys.argv[1] url = sys.argv[2] except Exception as e: print "please choose one website" exit() # 实例化 dic = { "qidian": Qidian, "heiyan": Heiyan, } config = dic[website]() # 获取关键信息 handler = Spider(config.title, config.content, config.next) chapters = config.getList(url) book = open("text.txt", "w") for item in chapters: print "正在下载->", item["title"] content = handler.getContent(item["href"]) book.writelines(item["title"] + "\n") book.writelines(content["content"] + "\n")
def getMovieHtml(self, filename, name, j, end, z):
    """Read movie-detail URLs from *filename* and save each fetched page to disk.

    filename: file whose even lines (0-based) contain URLs; odd lines are
              echoed and skipped.
    name:     URL at which to start fetching (earlier URLs are skipped).
    j:        running index used to name the saved .html files.
    end:      URL at which to stop (method returns when reached).
    z:        index into the global ``dizhi`` list of output directories —
              TODO confirm against the caller.
    """
    i = 0
    flag = False
    # FIXES: the redundant "j = j" assignment was dropped, and "with" now
    # closes the input file (the original leaked the handle).
    with open(filename, 'r', encoding="UTF-8") as file:
        for url in file:
            if i % 2 == 0:
                m = re.search(r'h.*', url)
                if m.group() == name:
                    flag = True
                if flag:
                    html = Spider().getHtml(m.group())
                    if html:
                        Spider().saveHtml(html, dizhi[z] + str(j) + ".html")
                        print(str(m) + " :" + str(j) + ".html已存储~")
                        j += 1
                    if m.group() == end:
                        return
            else:
                print(url.strip())
            i += 1
            # Pause periodically so requests are not too frequent.
            if j % 90 == 0:
                time.sleep(120)
def test_cls_Spider_scapy__logfile_OnScapy():
    """Smoke-test Spider.scapy() using the log-file scapy callback."""
    target = "http://www.hao123.com/"
    limit = 100
    log_path = "test_result/logfile"
    patterns = producePtns()
    callback = logfile_OnScapy(name=log_path)
    Spider(target, patterns, max_size=limit, onScapy=callback).scapy()
def work():
    """Worker-thread loop: pull URLs off the shared queue and crawl them."""
    while True:
        url = queue.get()
        try:
            Spider.crawlPlayer(url, threading.current_thread().name)
        except Exception:
            # Best-effort: a failed URL must not kill the worker thread.
            pass
        finally:
            # BUG FIX: the original's bare "except: pass" wrapped task_done()
            # too, so a crawl failure skipped the acknowledgement and any
            # queue.join() would hang forever. Acknowledge unconditionally.
            queue.task_done()
def yyetsFinder():
    """Start a Spider over the tracked yyets.com show pages."""
    shows = [
        'http://yyets.com/showresource-juji-1103.html',  # 2 BROKE GIRlS
        'http://yyets.com/showresource-juji-1088.html',  # HOMELAND
        'http://yyets.com/showresource-juji-1007.html',  # MENTALIST
        'http://yyets.com/showresource-juji-974.html',   # NEW GIRL
    ]
    Spider(shows).start()
def create_threads(self):
    """Create, start and add threads to a list.
    Threads run an instance of Spider.
    The amount of threads created depends on the amount of cores
    found in the system."""
    # NOTE(review): range(1, cpu_count()) spawns cpu_count()-1 threads —
    # confirm the off-by-one is intentional.
    for i in range(1, multiprocessing.cpu_count()):
        name = "Thread-%s" % i
        # Spider is a Thread subclass here: it takes a name plus the shared
        # work queue and result container.
        thread = Spider(name, self.queue, self.result)
        thread.start()
        # NOTE(review): "threads" is not defined in this method — it looks
        # like a module-level list (or should be self.threads); verify.
        threads.append(thread)
def run(key):
    """Fetch fund data for *key*, refreshing anti-crawler cookies on failure."""
    url = set_url(host, key)
    Cookies()
    html = Spider(url).spider(BASEHEADERS)
    if not verify(html):
        # Verification failed: append fresh cookie values and refresh the
        # proxy pool (proxieser.proxies — semantics not visible here).
        extra = Cookies.cookie_str(["acw_tc", "PHPSESSID"])
        BASEHEADERS["Cookie"] = BASEHEADERS["Cookie"] + extra
        proxieser.proxies()
    data = HtmlParser(html).parser("fund")
    print(data)
def main():
    """Crawl Google Scholar author-search results and pickle them."""
    # Entry page: authors labelled "complex_systems"; no keyword filters.
    start_url = 'https://scholar.google.com.tw/citations?view_op=search_authors&hl=en&mauthors=label:complex_systems'
    positive_keywords = []
    negative_keywords = []
    crawler = Spider(start_url, positive_keywords, negative_keywords, page=5)
    results = crawler.crawl()
    with open('result.pickle', 'wb') as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
def course():
    """GET: render the login form; POST: log in and return course data as JSON."""
    post_format = {"username": "", "password": ""}
    if request.method != 'POST':
        return render_template("index.html", format=post_format)
    user = Spider(request.form['username'], request.form['password'])
    user.login()
    if not user.login_status:
        return "登录失败"
    return jsonify(user.modify_data())
def main():
    """Entry point: -s runs the spider, -v runs the voter, -h prints usage."""
    try:
        opts, args = getopt.getopt(sys.argv[1:], "svh", ["help", "output="])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    for o, a in opts:
        if o in ("-h", "--help"):
            # BUG FIX: -h/--help were accepted by getopt but fell through to
            # "assert False" below; print usage and exit cleanly instead.
            usage()
            sys.exit()
        elif o == "-s":
            s = Spider()
            s.run()
        elif o == "-v":
            voter = Voter()
            voter.run()
        elif o == "--output":
            # Declared to getopt but never consumed by the original; accept
            # and ignore rather than crash on an advertised option.
            pass
        else:
            assert False, "unhandled option"
def initialization():
    """Prepare a crawl session.

    Returns (spider, uid_list, user_list): a Spider built over the crawling
    accounts, the user-id tasks, and the accounts themselves.
    """
    uid_list = get_tasks(TASK_NUM)          # user ids to crawl
    user_list = get_accounts(ACCOUNT_NUM)   # accounts to crawl with
    return Spider(user_list), uid_list, user_list
def ydyFinder():
    """Log in to bbs.sfile2012.com and spider the tracked show threads."""
    # Login form credentials (redacted) plus the form's hidden hash field.
    auth_data = {
        'username': '******',
        'password': '******',
        'formhash': '592862ac',
    }
    login_page = 'http://bbs.sfile2012.com/logging.php?action=login&loginsubmit=yes'
    auth = FormAuth(auth_data, login_page)
    # good hunting
    threads = [
        'http://bbs.sfile2012.com/viewthread.php?tid=351496&extra=page%3D1',  # BONES
        'http://bbs.sfile2012.com/viewthread.php?tid=348582&extra=page%3D1',  # HOUSE
        'http://bbs.sfile2012.com/viewthread.php?tid=348117&extra=page%3D1',  # MENTALIST
    ]
    Spider(threads, auth).start()
def grade():
    """GET: render the login form; POST: log in and return grades as JSON."""
    post_format = {"username": "", "password": ""}
    if request.method != 'POST':
        return render_template("index.html", format=post_format)
    # Local renamed: the original's "grade" shadowed this view function.
    session = Spider(request.form['username'], request.form['password'])
    session.login()
    if not session.login_status:
        return "登录失败"
    info = session.modify_grade()
    print(info)
    return jsonify(info)
def __init__(self, url, **kwargs):
    """Build the discovery state for *url*.

    url:    page to inspect.
    kwargs: passed straight through to Spider.
    """
    self.results = defaultdict(list)   # discovered url -> list of entries
    self.maxdepth = 2                  # passes of candidate-following to run
    self.URLHandler = URLHandler()
    self.candidates = set()            # candidate URLs discovered so far
    self.url = url
    self.baseurl = self.URLHandler.get_provider(self.url)
    # Spider fetches the page immediately: its response text is parsed on
    # the next line.
    self.spider = Spider(self.url, **kwargs)
    # NOTE(review): BeautifulSoup without an explicit parser warns and may
    # pick different parsers per machine — consider naming one.
    self.mysoup = BeautifulSoup(self.spider.request.text)
def spider_store_details(self):
    """Crawl each stored shop's contact page and update its contact info.

    Tries the shop URL first; when no contacts are parsed, retries with
    '/shop/company.html' appended. Errors are logged and the next store is
    processed. store is indexed as [0]=id, [1]=URL — confirm against
    query_store().
    """
    stores = self.query_store()
    for store in stores:
        time.sleep(1)  # throttle between shops
        try:
            # First attempt: the shop's main page.
            contacts = self._fetch_contacts(store[1])
            # Fallback: the explicit company-info page.
            if not contacts:
                store[1] += '/shop/company.html'
                contacts = self._fetch_contacts(store[1])
            # 更新联系方式
            self.update_contacts(store[0], contacts)
        except BaseException:
            self.logger.error('爬取或更新联系方式出错:' + traceback.format_exc())
            continue

def _fetch_contacts(self, url):
    """Fetch *url* through a proxy and parse contact info, logging timings.

    Extracted helper: the original duplicated this fetch+parse sequence
    verbatim for the main page and the fallback page.
    """
    spider_time = time.time()
    self.logger.info('开始爬取:' + url)
    result_contacts = Spider().spider_URL(url=url, is_proxies=True)
    self.logger.info('爬取耗时:' + str(time.time() - spider_time))
    interpreting_time = time.time()
    self.logger.info('开始解析:' + url)
    contacts = Interpreter().interpreting_contact_info(result_contacts)
    self.logger.info('解析耗时:' + str(time.time() - interpreting_time))
    return contacts
def main():
    """Crawl Google Scholar hits for frequency-lowering papers and pickle them."""
    start_url = 'https://scholar.google.com.tw/scholar?q=frequency+lowering+algorithm&hl=zh-TW&as_sdt=0,5'
    # Keywords that mark a result as relevant ...
    positive = [
        'wdrc', 'dynamic range compression', 'hearing aid', 'speech',
        'noise cancellation', 'noise reduction', 'feedback cancellation',
        'sound', 'hearing loss',
    ]
    # ... and ones that mark it as off-topic.
    negative = [
        'imagery', 'image', 'visual', 'video', 'optic', 'opto',
        'quantum', 'photon',
    ]
    crawler = Spider(start_url, positive, negative, page=5)
    results = crawler.crawl()
    with open('result.pickle', 'wb') as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
def run(self):
    # Producer thread: keeps the shared URL pool topped up while holding the
    # shared condition variable; stops once 100 products have been produced.
    global condition, products, urls
    while True:
        if condition.acquire():
            if urls.url_size() <= 20:
                # Pool is low: crawl one more URL keyed by (keyword, counter).
                urls.add_new_url(Spider.crawl_url(self.keyword, products))
                products += 1
                print("Producer(%s):deliver one, now products:%s" % (self.name, urls.url_size()))
                condition.notify()
                pass
            else:
                # Pool is full: block until a consumer signals.
                condition.wait()
            # stop
            # NOTE(review): breaking here skips condition.release() below, so
            # the thread exits still holding the lock — consumers blocked in
            # wait() may deadlock; confirm this is acceptable at shutdown.
            if products >= 100:
                break
            condition.release()
def spider_product(self, category_2nd, start, end):
    # Crawl listing pages [start, end) of a 2nd-level category, saving every
    # product and a contact record for its company.
    # category_2nd is indexed as [0]=category key, [1]=listing base URL —
    # confirm against the caller that builds it.
    for i in range(start, end):
        try:
            # Throttle before each page (the original comment said 3s but the
            # code sleeps 1s).
            time.sleep(1)
            # Each page holds 60 products, fetched in three 20-item slices.
            product_list = []
            for j in range(3):
                # ee = page number; afadbeg = 1-based offset of this slice.
                req = category_2nd[
                    1] + '&ap=A&t=1&afadprenum=0&af=1' + '&ee=' + str(
                        i) + '&afadbeg=' + str(60 * (i - 1) + (j * 20) + 1)
                # Fetch and parse the slice, logging elapsed times.
                spider_time = time.time()
                self.logger.info('开始爬取:' + req)
                result_product_list = Spider().spider_URL(url=req,
                                                          is_proxies=True)
                self.logger.info('爬取耗时:' + str(time.time() - spider_time))
                interpreting_time = time.time()
                self.logger.info('开始解析:' + req)
                product_list.extend(
                    Interpreter().interpreting_product_list(
                        result_product_list))
                self.logger.info('解析耗时:' +
                                 str(time.time() - interpreting_time))
            '''
            先爬取所有产品列表信息,后再逐个抓取产品详情及公司信息
            '''
            for product in product_list:
                # Save the company record first so the product can reference it.
                contact = {}
                contact['公司名'] = product.get('company')
                contact['公司主页'] = product.get('homepage')
                store_id = self.save_contacts(contact)
                # Save the product under its category and store.
                self.save_product(product, category_2nd[0], store_id)
        except BaseException:
            # NOTE(review): if the exception fires before the first "req"
            # assignment in this iteration, this log line itself raises
            # NameError for "req".
            self.logger.error('爬取或解析' + req + '出错:' + traceback.format_exc())
            continue
def spider_product_details(self):
    # For every saved product, fetch its detail page, parse the details and
    # write them back. Failures are logged and the next product is processed.
    # product is indexed as [0]=id, [1]=detail URL — confirm against
    # query_products().
    products = self.query_products()
    for product in products:
        time.sleep(1)  # throttle between detail pages
        try:
            # Fetch the detail page through a proxy, logging elapsed time.
            spider_time = time.time()
            self.logger.info('开始爬取:' + product[1])
            result_products = Spider().spider_URL(
                url=product[1], is_proxies=True)
            self.logger.info('爬取耗时:' + str(time.time() - spider_time))
            # Parse the detail page.
            interpreting_time = time.time()
            self.logger.info('开始解析:' + product[1])
            details = Interpreter().interpreting_product_details(
                result_products)
            self.logger.info('解析耗时:' + str(time.time() - interpreting_time))
            '''
            # 若产品详情需要再次爬取
            product_bcid = details.get('bcid')
            if product_bcid:
                interpreting_time = time.time()
                xss_filter = 'http://wsdetail.b2b.hc360.com/XssFilter?callback=jQuery&bcid='
                result_product_introduce = Spider().spider_URL(
                    url=xss_filter + product_bcid)
                self.logger.info('开始解析:' + xss_filter + product_bcid)
                details['desc'] = Interpreter(
                ).interpreting_product_details_desc(
                    result_product_introduce)
                # 组装产品详情
                details = Interpreter().assemble_product_details(details)
                self.logger.info('解析耗时:' + str(time.time() - interpreting_time))
            '''
            # 更新产品详情
            self.update_products(product[0], details)
        except BaseException:
            self.logger.error('爬取或更新产品详情出错:' + traceback.format_exc())
            continue
def spider_job():
    """Scheduled job: crawl product categories 1-5 and e-mail an alert on failure."""
    try:
        spider = Spider.Spider()
        # Categories 1..5 (the original spelled out five identical calls).
        for category_id in range(1, 6):
            spider.spider(categoryId=category_id)
        print("success")
        time.sleep(5)
    except Exception as err:
        # 错误预警,发送邮件 — on any failure, e-mail an alert.
        # NOTE(review): join() assumes err.args holds only strings.
        errStr = ",".join(err.args)
        myEmail = MyEmail.MyEmail()
        myEmail.tag = "新发地商品数据爬去异常"
        myEmail.to_list = ["*****@*****.**"]
        myEmail.content = errStr
        myEmail.send()
        print(errStr)
#!/usr/bin/python # -*- coding:utf-8 -*- import re from Spider import Spider import sys reload(sys) sys.setdefaultencoding('utf8') errors = [] spider = Spider() # 爬数据 errors.extend(spider.crawljobs()) # 解析入库 errors.extend(spider.insert_jobs()) # 格式化职位的一些信息(薪资) errors.extend(spider.analyze_jobs()) spider.adapt_job_city() # 关联关键字 errors.extend(spider.associate_key_and_job()) # 输出错误 for e in errors:
from util.MysqlManager import MysqlManager
from Spider import Spider
import logging

logging.basicConfig(filename='./log/20181022.txt', level=logging.INFO)

if __name__ == "__main__":
    # Crawl every source row sequentially. (A multiprocess variant existed
    # here earlier and was commented out before this revision.)
    for record in MysqlManager().fetch_all_source():
        spider = Spider(record.get("url"), record.get("brand"),
                        record.get("class"))
        spider.work()
def work():
    """Worker loop: pull the next URL off the queue and crawl it."""
    while True:
        next_url = queue.get()
        Spider.crawling(threading.current_thread().name, next_url)
        queue.task_done()
# Command-line driver for the caoliu picture/torrent spider.
# BUG FIX: asyncio is used below (get_event_loop, run_until_complete) but was
# never imported in the original.
import asyncio
import argparse

import aiohttp

import PageParse
from Spider import Request
from Spider import Spider

ARGS = argparse.ArgumentParser(description="caoliu spider")
ARGS.add_argument("--pages", action='store', type=int,
                  default=1, help='Limit page to spider')
ARGS.add_argument("--max_tries", action='store', type=int,
                  default=30, help='Limit retries on network errors')
ARGS.add_argument("--root_dir", action='store', default='./download',
                  help='directory store picture and torrent')
ARGS.add_argument("--max_tasks", action='store', type=int,
                  default=20, help='Limit concurrent connections')

# NOTE(review): hard-coded absolute path; appears unused in this script.
ROOT_DIR = "/media/mosaic/软件/git-myspider/cl_spider/source/"

args = ARGS.parse_args()
loop = asyncio.get_event_loop()
spider = Spider(max_tries=args.max_tries, max_tasks=args.max_tasks)
# Queue pages [1, pages] for parsing, then drive the spider to completion.
PageParse.start(spider, 1, args.pages + 1, root_dir=args.root_dir)
loop.run_until_complete(spider.spider())
spider.close()
loop.stop()
loop.run_forever()
loop.close()
def interpreting_product_details(self, html_doc):
    # Parse a product-detail HTML page into a dict with keys 'imgs',
    # optionally 'bcid', and 'details' (styled HTML). Parsing errors are
    # logged and a possibly-partial dict is returned.
    try:
        # Accumulates the parsed product fields.
        product = {}
        doc = BeautifulSoup(html_doc, 'html5lib')
        # Product image URLs.
        img = []
        # Multi-image case: one <li class="tab-trigger"> per thumbnail.
        product_li = doc.find_all('li', class_='tab-trigger')
        if product_li:
            for li in product_li:
                product_a = li.find('a', attrs={
                    "data-useractivelogs":
                    "UserBehavior_detail_smallphoto"
                })
                if product_a:
                    # Thumbnail <img> src points at a scaled variant ...
                    product_img = product_a.find('img')
                    product_img_src = product_img.get('src')
                    # ... stripping from the '..' suffix yields the original.
                    last_index = product_img_src.rfind('..')
                    img.append(product_img_src[:last_index])
        # Single-image case: fall back to the one default big photo.
        else:
            product_img_div = doc.find('div', class_='vertical-img')
            if product_img_div:
                product_img = product_img_div.find(
                    'a',
                    attrs={
                        "data-useractivelogs":
                        "UserBehavior_detail_bigphoto"
                    })
                product_img_hrefs = product_img.get('hrefs')
                if not product_img_hrefs:
                    product_img_src = product_img.find('img').get('src')
                    # Strip the '..' suffix to recover the original image.
                    last_index = product_img_src.rfind('..')
                    product_img_hrefs = product_img_src[:last_index]
                img.append(product_img_hrefs)
        product['imgs'] = img
        '''
        产品详情有两种展示效果,因此需要不同解析
        '''
        # Layout variant 1: proDetailCon container.
        pdetail = doc.find('div', id='pdetail',
                           class_='proDetailCon tab_content_event_class')
        if pdetail is not None:
            # bcid uniquely identifies the product for the XssFilter endpoint.
            product_bcid = doc.find('input', id='bcid').get('value')
            detail_bot = pdetail.find('div', class_='detailBot')
            detail_bot.decompose()
            introduce = pdetail.find('div', id='introduce')
            if product_bcid:
                product['bcid'] = product_bcid
                # The introduction body is served by a separate endpoint;
                # fetch it and splice it into the page in place.
                xss_filter = 'http://wsdetail.b2b.hc360.com/XssFilter?callback=jQuery&bcid='
                result_product_introduce = Spider().spider_URL(
                    url=xss_filter + product_bcid)
                product_introduce = self.interpreting_product_details_desc(
                    result_product_introduce)
                introduce.replace_with(
                    BeautifulSoup(product_introduce, 'html.parser'))
        else:
            # Layout variant 2: pdetail container.
            pdetail = doc.find('div', id='pdetail',
                               class_='pdetail tab_content_event_class')
            if pdetail is not None:
                # Basic-parameters table.
                vopy = pdetail.find('div', class_="d-vopy")
                # Drop the image boxes embedded in the parameter list.
                vopyImgBoxs = vopy.find_all('div', class_='d-vopyImgBox')
                for vopyImgBox in vopyImgBoxs:
                    vopyImgBox.decompose()
                # Drop the "similar products" spans from the parameter list.
                span = pdetail.find_all(
                    'span', class_='same-parameter-commodity-hook')
                for s in span:
                    s.decompose()
                # Detailed-description div.
                d_xi_b = pdetail.find('div', class_='d-xi-b').find('div')
                detail_imgs = d_xi_b.find_all('img')
                if detail_imgs:
                    # NOTE(review): this loop variable shadows the outer
                    # "img" list (harmless — product['imgs'] already holds
                    # the list reference — but confusing).
                    for img in detail_imgs:
                        del img['onerror']
                        del img['onload']
                # Direct text children of the description (no tags):
                content_text = d_xi_b.find_all(text=True, recursive=False)
                if content_text:
                    for text in content_text:
                        # Blank them out to remove the site-name watermark.
                        text.replace_with('')
                # Inline stylesheet prepended so the extracted fragment
                # renders like the original page.
                style = '''<style>
                #introduce {font-size: 14px;}
                table {border-collapse: collapse;border-spacing: 0;}
                p {margin: 0;}
                .dvop-title {line-height: 30px;font-size: 14px;color: rgb(51, 51, 51);padding-bottom: 10px;}
                .dvop-title h4 {font-weight: normal;}
                .d-vopy table {width: 100%;float: left;font-size: 12px;margin-bottom: 18px;border-left: 1px solid rgb(237, 237, 237);border-top: 1px solid rgb(237, 237, 237);}
                .d-vopy th {width: 200px;background-color: rgb(245, 245, 245);text-align: center;font-weight: normal;min-height: 34px;line-height: 34px;border-right: 1px solid rgb(237, 237, 237);border-bottom: 1px solid rgb(237, 237, 237);padding: 0px;}
                .d-vopy td {border-right: 1px solid #ededed;border-bottom: 1px solid #ededed;vertical-align: top;}
                .d-vopy td {padding-left: 20px;line-height: 34px;}
                .d-vopy th h4 {font-size: 12px;color: rgb(51, 51, 51);margin: 0px;}
                .d-vopyList {overflow: hidden;}
                .d-vopyList {line-height: 34px;padding-left: 20px;}
                .d-vopyList p {float: left;}
                .d-vopyList p {padding-right: 20px;width: 500px;line-height: 24px;padding: 5px 0;}
                .d-xi-b {padding: 10px 0px;font-size: 12px;}
                </style>
                '''
                product['details'] = style + pdetail.prettify()
    except AttributeError:
        self.logger.error('对象没有这个属性:' + traceback.format_exc())
    except KeyError:
        self.logger.error('映射中没有这个键:' + traceback.format_exc())
    except BaseException:
        self.logger.error('解析产品详情出错:' + traceback.format_exc())
    # May be partial if an exception was logged above.
    return product
import threading
from queue import Queue

from Spider import Spider
from domain import *
from WebCrawler import *

# Crawl-project configuration.
Project_Name = "The WebCrawler"
Home_Page = "http://codechannels.com/channel/thenewboston/"
Domain_Name = get_full_domain_name(Home_Page)
Queue_File = Project_Name + '_queue.txt'
Crawled_File = Project_Name + '_crawled.txt'
Number_Of_Threads = 8
queue = Queue()

# Instantiating Spider sets up shared crawl state (side effects happen in
# Spider.__init__ — not visible here); the instance itself is not kept.
Spider(Project_Name, Home_Page, Domain_Name)


# Create worker threads; daemons die when main exits.
def create_workers():
    for _ in range(Number_Of_Threads):
        worker = threading.Thread(target=work)
        worker.daemon = True
        worker.start()


# Do the next job in the queue, forever.
def work():
    while True:
        job = queue.get()
        Spider.crawling(threading.current_thread().name, job)
        queue.task_done()
def StartSpider(self, name):
    """Create a Spider for *name* and kick it off."""
    Spider(name).start()
from Spider import Spider
from save_as_opml import save_to_opml

if __name__ == "__main__":
    crawler = Spider()
    entries = crawler.run()
    # Export the crawl results as OPML using the spider's collected metadata.
    save_to_opml(entries, crawler.tag_set, crawler.name_list)
#!/usr/bin/python
# -*- coding:utf-8 -*-
from Spider import Spider

# Entry point: crawl profile and status data for everyone who follows me and
# everyone I follow.
spider = Spider()

fans = spider.get_my_fans()
for fan in fans:
    spider.user_crawl(fan.user_id)
    spider.status_crawl(fan.user_id)

followers = spider.get_my_follower()
for follower in followers:
    # BUG FIX: the original crawled fan.user_id here (i.e. the last fan from
    # the loop above, over and over) instead of each follower's own id.
    spider.user_crawl(follower.user_id)
    spider.status_crawl(follower.user_id)
import threading
from queue import Queue

from Spider import Spider
from domain import *
from general import *

# Crawl-project configuration.
PROJECT_NAME = 'testingOne'
HOMEPAGE = 'https://thenewboston.com/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 4
queue = Queue()

# Instantiating Spider prepares the project's shared state (side effects in
# Spider.__init__ — not visible here); the instance itself is discarded.
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Create worker threads (daemons: they die when the main thread exits).
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        worker = threading.Thread(target=work)
        worker.daemon = True
        worker.start()


# Do the next job in the queue, forever.
def work():
    while True:
        pending = queue.get()
        Spider.crawl_page(threading.current_thread().name, pending)
        queue.task_done()
# -*- Mode: Python; coding: utf-8; indent-tabs-mode: t; c-basic-offset: 4; tab-width: 4 -*-
#
# main.py
# Copyleft 2014 Yuzo(PillowSky) <*****@*****.**>
# Compatible with python3 and pypy
#
# require PyQuery which depend on cssselect, so I pack it in the project to ensure running normally on other computers
import os.path
from datetime import datetime
from Spider import Spider

print("==Welcome to search engine for ZOJ==")
if(os.path.isfile("data.db")):
    spider = Spider()
    print("Local database is update on %s" % datetime.fromtimestamp(os.path.getmtime("data.db")))
    print("Database have %s problems stored now" % spider.getItemsCount())
else:
    print("Local database hasn't build")
    spider = Spider()

print("\nUsage:")
print("[1] update the local database in serial")
print("[2] update the local database in parallel")
print("[3] search problems about Matrix")
print("[4] search generic problems")

# BUG FIX: raw_input() does not exist on Python 3 even though the header
# promises Python 3 compatibility; fall back to input() there.
try:
    choice = raw_input()
except NameError:
    choice = input()

if(choice == str(1)):
    spider.serialFetchAllProblems()
def __init__(self, config):
    """Initialize by delegating straight to the Spider base class.

    config: configuration object passed through to Spider.__init__.
    """
    Spider.__init__(self, config)
BASE_URL = input('Enter The website URL:\t') if re.match(regex, BASE_URL) is not None: RESPONSE = urlopen(BASE_URL).getcode() if RESPONSE != 200: WRONG = True print("WRONG URL") else: break else: WRONG = True print("WRONG URL") DOMAIN = get_domain_name(BASE_URL) SEARCH_WORD = input('Enter the search text, if there is none press enter:\t') Spider(PROJECT_NAME, BASE_URL, DOMAIN, 'Spider' + str(SPIDER_ID), SEARCH_WORD) while True: if len(Spider.wait_list) <= 0: break BASE_URL = Spider.wait_list.pop() Spider.wait_list.add(BASE_URL) Spider(PROJECT_NAME, BASE_URL, DOMAIN, 'Spider' + str(SPIDER_ID), SEARCH_WORD) SPIDER_ID += 1 URLS_GATHERED = len(Spider.crawled) print('\n' + "Finished Crawling.\n" + "Number of URLs Gathered:\t" + str(URLS_GATHERED)) if SEARCH_WORD != '': print("\nSearch Results:\nThe Search Word Found in These URLS:\n")
# -*- coding: utf-8 -*- import os from flask import Flask, request, Response from flask_uploads import UploadSet, configure_uploads, IMAGES, patch_request_class import json from IRNet import IRNet from Spider import Spider app = Flask(__name__) app.config['UPLOADED_PHOTOS_DEST'] = os.getcwd() + '/upload' net = IRNet() net.load_model() net.predict('1.jpg') spider = Spider() photos = UploadSet('photos', IMAGES) configure_uploads(app, photos) patch_request_class(app) # set maximum file size, default is 16MB #net = IRNet() #net.load_model() html = ''' <!DOCTYPE html> <title>Upload File</title> <h1>图片上传</h1> <form method=post enctype=multipart/form-data> <input type=file name=photo> <input type=submit value=上传> </form>
with open('error/error.txt', 'a+') as f: f.write('error/error_server 76') f.write(str(e)+'\n') print('1010') break manage.shutdown() spider_main.save() if __name__ == "__main__": pickle = os.listdir('pickle/') print('当前的已保存搜索文件:', pickle) name = input('输入搜索代号:') path = name + '.pickle' used_path = name + '_used.pickle' spider_main = Spider(name, used_path) if path not in pickle: start = time.time() url = 'https://www.bilibili.com/index/rank/all-30-3.json' try: spider_main.crawl(url, path) except Exception as e: with open('error/error.txt', 'a+') as f: f.write('94'+str(e) + '\n') end = time.time() times = int(end - start) if times > 60: mins = times//60
from Spider import Spider
from Query import Query
import sys

# CLI: "crawl" spiders Wikipedia; "query <term>" searches the built index.
arguments = sys.argv
if len(arguments) < 2:
    # ROBUSTNESS FIX: the original raised IndexError when run without a
    # sub-command; print usage instead.
    print("usage: %s crawl | query <term>" % arguments[0])
elif arguments[1] == "crawl":
    spider = Spider("https://en.wikipedia.org/")
    spider.crawl()
elif arguments[1] == "query":
    query = Query(arguments[2])
    query.query()
    # query.multiWordQuery(["action","design"])
def work():
    """Worker-thread loop: crawl each queued URL and acknowledge it."""
    while True:
        pending = queue.get()
        Spider.crawl_page(threading.current_thread().name, pending)
        queue.task_done()
def __init__(self, config):
    """Initialize the Spider base state and attach the shared logger.

    config: configuration object forwarded to Spider.__init__.
    """
    Spider.__init__(self, config)
    # Logger obtained through the project's LogUtil wrapper.
    self.logger = LogUtil.Logging.getLogger()
def run(self):
    """Thread body: crawl this worker's city with a dedicated Spider."""
    Spider(self.thread_name, self.city_name).getData()
from Goblin import Goblin
from Store import Store
from Wizard import Wizard
from Spider import Spider
from Snake import Snake
from Medic import Medic
from Shadow import Shadow
from Zombie import Zombie
# BUG FIX: Hero and Battle are used below but were never imported; module
# names assumed to follow the same one-class-per-module pattern as the
# other imports — confirm against the project layout.
from Hero import Hero
from Battle import Battle

if __name__ == "__main__":
    # One hero fights each enemy in order; any loss ends the game.
    hero = Hero()
    enemies = [
        Goblin(), Wizard(), Medic(), Shadow(), Zombie(), Spider(), Snake()
    ]
    battle_engine = Battle()
    shopping_engine = Store()
    for enemy in enemies:
        hero_won = battle_engine.do_battle(hero, enemy)
        if not hero_won:
            print("YOU LOSE!")
            exit(0)
        # After each victory the hero may buy upgrades before the next fight.
        shopping_engine.do_shopping(hero)
    print("YOU WIN!")
class Yaff(object):
    """Feed finder: discovers RSS/Atom feeds advertised by or linked from a page."""

    def __init__(self, url, **kwargs):
        """Fetch *url* (via Spider) and prepare the discovery state.

        url:    page to inspect for feeds.
        kwargs: passed through to Spider.
        """
        self.results = defaultdict(list)  # feed url -> [Result, ...]
        self.maxdepth = 2                 # passes of candidate-link following
        self.URLHandler = URLHandler()
        self.candidates = set()           # candidate feed URLs found so far
        self.url = url
        self.baseurl = self.URLHandler.get_provider(self.url)
        self.spider = Spider(self.url, **kwargs)
        # NOTE(review): BeautifulSoup without an explicit parser warns and
        # may pick different parsers per machine; kept for compatibility.
        self.mysoup = BeautifulSoup(self.spider.request.text)

    def getnormalfeeds(self):
        """Collect feeds advertised by <link>/<a> tags carrying a feed MIME type."""
        tags = self.mysoup.findAll(['link', 'a'], {"type": [
            'application/rss+xml', 'application/atom+xml',
            "application/x.atom+xml", "text/xml",
            "application/xhtml+xml"]})
        for tag in tags:
            url = URLHandler.get_full_urls(self.baseurl, tag['href'])
            self.results[url].append(Result(title=tag.get('title', ''),
                                            feedtype=tag.get('type', '')))
        return self

    def gethiddenfeeds(self):
        """Follow feed-looking links for maxdepth passes, recording real feeds."""
        for i in range(self.maxdepth):
            self._getcandidatetags()
            for candidate in self.candidates:
                try:
                    self.spider.make_request(candidate)
                    self.mysoup = BeautifulSoup(self.spider.request.text)
                except ValueError as e:
                    print(e)
                    continue
                if self.isfeed():
                    self.results[self.spider.request.url].append(Result(
                        title=self.mysoup.find('title').text,
                        feedtype=self.spider.contenttype))
            # Re-scan the last fetched page for advertised feeds too.
            self.getnormalfeeds()
        return self

    def getrootrss(self):
        """Probe the conventional <site>/rss endpoint; record it if it is a feed."""
        self.spider.make_request(self.url + '/rss')
        if self.isfeed():
            self.mysoup = BeautifulSoup(self.spider.request.text)
            url = URLHandler.get_full_urls(self.baseurl, self.url + '/rss')
            self.results[url].append(Result(
                title=self.mysoup.find('title').text,
                feedtype=self.spider.contenttype))
        return self

    def _getcandidatetags(self):
        """Scan anchors for hrefs that look like feed URLs and stash them."""
        tags = self.mysoup.findAll('a')
        feedstrings = ['feed', 'rss', 'atom', 'xml']
        for tag in tags:
            try:
                if any(fstring in tag['href'] for fstring in feedstrings):
                    self.candidates.add(
                        URLHandler.get_full_urls(self.baseurl, tag['href']))
            except Exception:
                # Anchors without an href (KeyError) or unresolvable URLs are
                # skipped. FIX: narrowed from a bare "except:" that also
                # swallowed KeyboardInterrupt/SystemExit.
                continue

    def isfeed(self):
        """True when the last response's content type names an XML/Atom feed."""
        if 'xml' in self.spider.contenttype or 'atom' in self.spider.contenttype:
            return True
        return False