class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        """
        Main crawl loop.
        :param root_url: entry URL
        :return:
        """
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 20:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.downloader(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.output_txt(data)
                print(data)
                print("Crawled {} links".format(self.manager.old_url_size()))
            except Exception as e:
                print("Crawl failed", e)
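The crawler above delegates deduplication and visit tracking to a UrlManager. The original implementation is not shown here; the following is a minimal sketch of that collaborator, assuming only the method names used above (add_new_url, add_new_urls, has_new_url, get_new_url, old_url_size):

class UrlManager(object):
    """Tracks pending (new) and visited (old) URLs with set-based deduplication."""

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        # Only queue URLs that have never been seen before.
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # Pop a pending URL and mark it as visited.
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_url_size(self):
        return len(self.old_urls)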
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = OutputData()

    def crawl(self, root_url):
        """
        Main crawl loop.
        :param root_url: entry URL
        :return:
        """
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_urls_size() < 5:
            new_url = self.manager.get_new_url()
            html = self.downloader.downloader(new_url)
            next_url, data = self.parser.parser(new_url, html)
            self.manager.add_new_url(next_url)
            self.output.outputTxt(data)
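The listings call the downloader two ways: downloader.downloader(url) in the simple crawlers and downloader.download(url, proxy) in the threaded one. The original HtmlDownloader is not shown; below is a minimal requests-based sketch that supports both call styles. The User-Agent header is an assumption, not taken from the original code:

import requests


class HtmlDownloader(object):
    """Fetches a page body as text; proxy support is optional."""

    def download(self, url, proxy=None):
        headers = {'User-Agent': 'Mozilla/5.0'}  # assumed header, not from the original code
        proxies = {'http': proxy, 'https': proxy} if proxy else None
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=20)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        return resp.text

    def downloader(self, url):
        # Alias used by the single-threaded crawlers.
        return self.download(url)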
            # First page layout: look for a 'Rated:' entry in the detail bullets
            data['rating'] = False
            table = content.xpath(".//*[@id='detail-bullets']//b")  # detail information table
            for items in table:  # scan the detail table
                if items.text and items.text.strip() == 'Rated:':  # check for a rating entry
                    data['rating'] = True
                    break
        else:
            # Second page layout
            data['title'] = title[0].text.strip()
            data['rating'] = False
            table = content.xpath(
                ".//*[@id='btf-product-details']/following-sibling::*//th"
            )  # detail information table
            for items in table:  # scan the detail table
                if items.text.strip() == 'MPAA rating':  # check for a rating entry
                    data['rating'] = True
                    break
        return data


if __name__ == "__main__":
    hp = HtmlParser()
    downloader = HtmlDownloader()
    url = "https://www.amazon.com/dp/B002PT1D1E"
    pro = None
    hp.test()
    # html = downloader.download(url, pro)
    # with open("test.txt", "w", errors='ignore') as w:
    #     w.write(html)
    # hp.parser(url, html)
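The rating check above boils down to scanning a product-details table for a label cell. Here is a small standalone illustration of the same xpath idea with lxml; the HTML snippet is invented for the example and is not real Amazon markup:

from lxml import etree

sample_html = """
<div id="detail-bullets">
  <ul>
    <li><b>Rated:</b> PG-13</li>
    <li><b>Studio:</b> Example Studio</li>
  </ul>
</div>
"""

content = etree.HTML(sample_html)
rated = False
for item in content.xpath(".//*[@id='detail-bullets']//b"):
    # Same test as the parser: a <b> label whose text is exactly 'Rated:'
    if item.text and item.text.strip() == 'Rated:':
        rated = True
        break
print(rated)  # True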
import queue
import random
import threading
import time

import requests

# Project modules (UrlManager, HtmlDownloader, HtmlParser, DataOutput,
# getProxy, getFromPool2) are imported from the project's own packages.


class SpiderMan(object):
    def __init__(self):
        # Number of running worker threads
        self.pcount = 1
        # Result output queue
        self.dqueue = queue.Queue()
        # Error message output queue
        self.equeue = queue.Queue()
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
        # self.proxies = getProxy()
        self.proxies = getFromPool2()
        self.inactivepro = []
        self.count = 0
        self.sumSuccess = 0
        self.sumFail = 0
        self.updating = False
        # self.proxies = ['http://127.0.0.1:2740']

    def doCrawl(self, new_url):
        try:
            self.pcount += 1
            count = 1
            # Pick a proxy IP at random
            pro = random.choice(self.proxies)
            # pro = 'http://127.0.0.1:2740'
            while True:
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url, pro)
                # Extract the page data with the HTML parser
                data = self.parser.parser(new_url, html)
                # Storing to file here caused multi-threaded write conflicts, so it was dropped
                # self.output.store_data(data)
                # The page is a robot (CAPTCHA) check
                if data == "robot":
                    if count < 6:
                        count = count + 1
                        # Proxy elimination scheme:
                        #   in self.proxies, not in self.inactivepro : healthy
                        #   in self.proxies and self.inactivepro     : blocked once, under observation
                        #   in self.inactivepro, not in self.proxies : blocked twice, temporarily retired
                        #   in neither                               : revival failed, permanently retired
                        if count == 5 and len(self.proxies) > 100:
                            if pro not in self.inactivepro:
                                # Add to the watch list
                                self.inactivepro.append(pro)
                                pro = random.choice(self.proxies)
                            else:
                                # Temporarily retire it
                                print(str(pro) + " out\n")
                                if pro in self.proxies:
                                    self.proxies.remove(pro)
                        continue
                    else:
                        raise Exception("robot check")
                else:
                    break
            # Queue the result for later storage
            self.dqueue.put(data)
        except Exception as e:
            self.sumFail = self.sumFail + 1
            print("Fail: link %d fail %d times : %s\n" % (self.count, self.sumFail, new_url), e.args)
            # Start the revival scheme: try to bring a retired proxy back
            if len(self.proxies) < 200 or len(self.inactivepro) > 500:
                pro = random.choice(self.inactivepro)
                if pro is not None and pro not in self.proxies and self.testIP(pro):
                    self.proxies.append(pro)
                    print(str(pro) + " in!!!\n")
                # Remove pro regardless of the outcome; the membership check
                # guards against races between concurrent threads
                if pro in self.inactivepro:
                    self.inactivepro.remove(pro)
            self.equeue.put([new_url, e.args])
        else:
            self.sumSuccess = self.sumSuccess + 1
            print("Success: link %d success %d times : %s" % (self.count, self.sumSuccess, new_url))
        finally:
            self.pcount -= 1

    def setProxy(self):
        # self.proxies = getProxy()
        self.proxies = getFromPool2()
        self.updating = False

    # Flush the result and error queues to storage
    def outPutData(self):
        while not self.dqueue.empty():
            data = self.dqueue.get()
            self.output.store_data(data)
        while not self.equeue.empty():
            err = self.equeue.get()
            self.output.store_err(err)

    def testIP(self, pro):
        url = 'https://www.douban.com'
        res = requests.get(url, proxies={'http': pro, 'https': pro}, timeout=20)
        return res.status_code == 200

    def crawl(self):
        threads = []
        preFail = 0
        # Skip the URLs handled in previous runs
        for i in range(22350):
            self.manager.has_new_url()
        while self.manager.has_new_url():
            try:
                self.count = self.count + 1
                # Kick off a proxy-pool refresh
                if self.sumFail - preFail > 46 and not self.updating:
                    self.updating = True
                    print("\n\nstart refreshing proxies\n\n")
                    t = threading.Thread(target=SpiderMan.setProxy, args=[self])
                    t.start()
                    threads.append(t)
                    # p = Pool()
                    # result = p.apply_async(getFromPool2, args=())
                    # p.close()
                    # self.proxies = result.get()
                # Every 50 URLs, flush the buffers and report the success rate
                if self.count % 50 == 0 and self.count != 0:
                    preFail = self.sumFail
                    rate = float(self.sumSuccess) / float(self.count - 1)
                    print("Success Rate: %f" % rate)
                    self.output.store_err([str(self.count), str(rate)])
                    self.output.flush()
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Main crawl step (multi-threaded)
                if self.pcount < 0:
                    pcount = 0
                else:
                    pcount = self.pcount
                # Random delay, scaled by the number of running threads
                time.sleep(random.random() + pcount / 10)
                t = threading.Thread(target=SpiderMan.doCrawl, args=[self, new_url])
                t.start()
                threads.append(t)
                # Flush results and error messages collected so far
                self.outPutData()
            except Exception as e:
                print("weird fail", e)
        # Wait for all worker threads, then flush whatever is left
        [t.join() for t in threads]
        self.outPutData()
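A minimal way to launch the threaded crawler, assuming the UrlManager is pre-loaded with the target URLs by the project's own setup code. This entry point is illustrative and not taken from the original repository:

if __name__ == "__main__":
    spider = SpiderMan()
    # The constructor already pulls a proxy pool via getFromPool2();
    # crawl() then consumes whatever URLs the UrlManager holds.
    spider.crawl()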