def spider(url_list):
    """Crawl *url_list* with a single worker in debug mode.

    Uses the disk-cached Fetch rooted at /tmp; blocks until the
    GSpider runner finishes.
    """
    fetcher = Fetch('/tmp')
    spider = Rolling(fetcher, url_list)
    # Fix: the original assigned debug = False and then immediately
    # debug = True; the first assignment was dead code and is removed.
    debug = True
    spider_runner = GSpider(spider, workers_count=1, debug=debug)
    spider_runner.start()
def main():
    """Crawl the daily Dongxi listing with three workers and no response cache."""
    request_headers = {}
    source = Dongxi()
    fetch = NoCacheFetch(0, headers=request_headers)
    crawl = Rolling(fetch, source.daily_dongxi())
    runner = GSpider(crawl, workers_count=3)
    runner.start()
def spider(url_list):
    """Crawl *url_list* with one debug worker, rotating Cookie identities.

    Builds one header dict per entry in the module-level COOKIE tuple so
    Fetch can rotate identities; 3.33s pacing, zhihu_topic_title parser.
    """
    # Fix: removed the commented-out MultiHeadersFetch line (dead code).
    cookie_headers = tuple({'Cookie': i} for i in COOKIE)
    fetcher = Fetch('/tmp', cookie_headers, 3.33, zhihu_topic_title)
    spider = Rolling(fetcher, url_list)
    # Fix: dead `debug = False` assignment (immediately overwritten) removed.
    debug = True
    spider_runner = GSpider(spider, workers_count=1, debug=debug)
    spider_runner.start()
def spider(url_list):
    """Crawl *url_list* with two debug workers, one Cookie header per identity.

    MultiHeadersFetch receives a tuple of header dicts built from the
    module-level COOKIE sequence.
    """
    fetcher = MultiHeadersFetch(
        headers=tuple({'Cookie': i} for i in COOKIE)
    )
    spider = Rolling(fetcher, url_list)
    # Fix: dead `debug = False` assignment (immediately overwritten) removed.
    debug = True
    spider_runner = GSpider(spider, workers_count=2, debug=debug)
    spider_runner.start()
def spider(url_list):
    """Crawl *url_list* with one debug worker and a single shared Cookie.

    NoCacheFetch is used with zero delay; COOKIE here is passed whole as
    the Cookie header value (unlike the per-identity variants elsewhere
    in this file, which iterate COOKIE).
    """
    # Fix: removed the commented-out alternative-arguments lines
    # ('/home/zuroc/tmp' cache dir, per-cookie tuple, delay) — dead code.
    fetcher = NoCacheFetch(0, {"Cookie": COOKIE})
    spider = Rolling(fetcher, url_list)
    # Fix: dead `debug = False` assignment (immediately overwritten) removed.
    debug = True
    spider_runner = GSpider(spider, workers_count=1, debug=debug)
    spider_runner.start()
def spider(url_list):
    """Crawl *url_list* with one debug worker, rotating cleaned Cookie headers.

    Each COOKIE entry has a leading 'Cookie:' label stripped before being
    used as a header; 25s pacing, cache under /home/zuroc/tmp,
    zhihu_topic_title parser.
    """
    # Strip the 'Cookie:' prefix copied from raw request dumps.
    cookie_headers = tuple(
        {'Cookie': i.replace('Cookie:', '').strip()} for i in COOKIE
    )
    fetcher = Fetch(
        '/home/zuroc/tmp',
        cookie_headers,
        25,
        zhihu_topic_title
    )
    spider = Rolling(fetcher, url_list)
    # Fix: dead `debug = False` assignment (immediately overwritten) removed.
    debug = True
    spider_runner = GSpider(spider, workers_count=1, debug=debug)
    spider_runner.start()
def main():
    """Crawl the Douban widget event index of every configured site."""
    # One (parser, url) task per entry in DOUBAN_SITE_LIST.
    tasks = [
        (ParseEventIndex(user_id, zsite_id),
         'http://site.douban.com/widget/events/%s/' % url)
        for url, user_id, zsite_id in DOUBAN_SITE_LIST
    ]
    request_headers = {
        'Cookie': 'bid=i9gsK/lU40A',
    }
    fetch = NoCacheFetch(10, headers=request_headers)
    runner = GSpider(Rolling(fetch, tasks), workers_count=1)
    runner.start()
def main():
    """Crawl every URL yielded by pagelister() with ten workers.

    Sends browser-like request headers (Firefox 3.5 UA, gb18030/utf-8
    charsets) through an uncached fetcher.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7',
        'Accept': ' text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Accept-Charset': 'gb18030,utf-8;q=0.7,*;q=0.7',
        'Content-type': 'application/x-www-form-urlencoded',
    }
    fetch = NoCacheFetch(0, headers=browser_headers)
    runner = GSpider(Rolling(fetch, pagelister()), workers_count=10)
    runner.start()
def spider(url_list):
    """Crawl *url_list*, then persist the collected question ids.

    After the crawl completes, the module-level QUESTION_ID_SET is frozen
    to a tuple and written to question_id.py as a Python literal so it can
    be imported by later runs.
    """
    # Fix: removed the commented-out MultiHeadersFetch line (dead code).
    cookie_headers = tuple({'Cookie': i} for i in COOKIE)
    fetcher = Fetch('/tmp', cookie_headers, 2.6, zhihu_topic_title)
    spider = Rolling(fetcher, url_list)
    # Fix: dead `debug = False` assignment (immediately overwritten) removed.
    debug = True
    spider_runner = GSpider(spider, workers_count=1, debug=debug)
    spider_runner.start()

    # Freeze and persist whatever the crawl accumulated in the module
    # global QUESTION_ID_SET (presumably filled by the parser — confirm).
    global QUESTION_ID_SET
    QUESTION_ID_SET = tuple(QUESTION_ID_SET)
    with open("question_id.py", "w") as question:
        question.write("QUESTION_ID_SET = ")
        question.write(pformat(QUESTION_ID_SET))