class SpiderManager(object):
    def __init__(self):
        self.htmldownload = HtmlDownload()
        self.parse = HtmlParse()
        self.urlmanage = UrlManager()
        self.storedata = StoreData()
        self.province = []

    def crawl(self, url):
        # Seed the URL manager with every link found on the entry page
        html = self.htmldownload.Download(url)
        urls = self.parse.get_all_url(html)
        self.urlmanage.add_urls(urls)
        i = 0
        while self.urlmanage.has_url():
            url = self.urlmanage.get_url()
            html = self.htmldownload.Download(url)
            data = self.parse.get_province(html)
            self.province.append(data)
            i += 1
            print(i)
        # print(self.province)
        # Persist the raw records ("CAS academician info") and the
        # per-province statistics ("province statistics result")
        self.storedata.store(self.province, "中国科学院院士信息.json")
        result = self.storedata.analysis(self.province)
        self.storedata.store(result, "省份统计结果.json")
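A minimal usage sketch for the class above; the entry URL is a placeholder, not the one used by the original project:

# Hypothetical entry point: the real project would pass the CAS
# academician list page as the seed URL.
if __name__ == '__main__':
    spider = SpiderManager()
    spider.crawl('http://www.example.com/academician/list')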
def __init__(self): """ 初始化各个模块 """ self.manager = UrlManager() self.download = HtmlDownload() self.parse = ParseData() self.ouput = DataOutput()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        """
        Main program logic.
        :param root_url: entry URL
        :return:
        """
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 20:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.downloader(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.output_txt(data)
                print(data)
                print("Crawled {} links".format(self.manager.old_url_size()))
            except Exception as e:
                print("Crawl failed", e)
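Several of these spiders share the same UrlManager interface (add_new_url, add_new_urls, has_new_url, get_new_url, old_url_size). A minimal sketch of such a manager, assuming two sets for deduplication; this is an illustration, not the original project's implementation:

class UrlManager(object):
    """Tracks unvisited (new) and visited (old) URLs with two sets."""

    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        # Ignore URLs already seen in either set
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # Move a URL from the new set to the old set and return it
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_url_size(self):
        return len(self.old_urls)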
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_urls() and self.manager.old_url_size() < 100:
            try:
                # Fetch the next unvisited URL
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.download(new_url)
                # Extract new links and the page data
                new_urls, data = self.parser.parser(new_url, html)
                # Feed the new links back into the URL manager
                self.manager.add_new_urls(new_urls)
                # Persist the extracted data
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed", e)
        self.output.out_put_html()
import time


def run():
    # Data crawler
    factory_spider = Spiser()
    # Data parser
    data_parser = dataParser.DataParser()
    data_manager = DataManager()
    url_manager = UrlManager()
    # Three URL lists, all of which still need to be crawled
    url_list, shopurl1_list, shopurl2_list = url_manager.tel_url()
    total_num = len(url_list)
    crawred_url = url_manager.crawred_url()
    company_dataList = []
    for i in range(total_num):
        url = url_list[i]
        shopurl1 = shopurl1_list[i]
        shopurl2 = shopurl2_list[i]
        if url not in crawred_url:
            page_data = factory_spider.get_urlpage(url)
            page_shop1 = factory_spider.get_urlpage(shopurl1)
            page_shop2 = factory_spider.get_urlpage(shopurl2)
            # Parse the three pages with the parser
            companydata = data_parser.get_company_data(page_data, page_shop1, page_shop2, url)
            # Append the parsed data tuple to the list
            company_dataList.append(companydata)
            time.sleep(1.1)
        # elif url in crawred_url:
        #     print('already crawled', url)
        print('=========', i, '==========')
        # Flush the results to a local csv file every 10 URLs
        if i % 10 == 0 and len(company_dataList) > 0:
            data_manager.save_local_tel(company_dataList)
            company_dataList = []
            time.sleep(10)
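The periodic flush to csv could look like the sketch below; save_local_tel's column layout and file name are assumptions for illustration, not the original DataManager code:

import csv


class DataManager(object):
    def save_local_tel(self, rows, path='company_tel.csv'):
        # Append each parsed company tuple as one csv row.
        # 'a' mode lets the repeated batches accumulate in one file.
        with open(path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerows(rows)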
class SpiderMan(): """ 爬虫主模块 """ def __init__(self): """ 初始化各个模块 """ self.manager = UrlManager() self.download = HtmlDownload() self.parse = ParseData() self.ouput = DataOutput() def start(self, city, job): """ 开始爬取 :return: """ # 创建 csv 文件 self.ouput.create_csv(city, job) # 爬 2页 position_url = self.manager.get_position_url() for pn in range(1, 2): """ 买的是动态代理,使用时发现,代理虽然换了,但也常被封,可能是这个代理别人 也拿去爬拉勾了,所以只要失败就重新换代理再请求 """ while True: response = self.download.get_html(pn, position_url, city, job) if 'true' not in response.text: continue else: break data = self.parse.get_info(response.text) if data == None: # 编码错误的页跳过 continue if data == []: print('\n爬取完毕或拉勾上此城市没有相关的职位!!!') break self.ouput.write_to_csv(data, city, job) print('\r第 {} 已爬取'.format(str(pn)), end='') print('\n爬取完毕,正在生成职位信息报表.....')
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = OutputData()

    def crawl(self, root_url):
        """
        Main program.
        :param root_url: entry URL
        :return:
        """
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_urls_size() < 5:
            new_url = self.manager.get_new_url()
            html = self.downloader.downloader(new_url)
            # The parser returns a single follow-up URL here, so each page
            # links to the next one (pagination-style crawl)
            next_url, data = self.parser.parser(new_url, html)
            self.manager.add_new_url(next_url)
            self.output.outputTxt(data)
def login():
    # Resolve the route from the view function name: /api/hello/
    url = url_for('hello')
    url1 = UrlManager.buildUrl('/api')  # /api
    url2 = UrlManager.buildStaticUrl('/css/bootstrap')  # /css/bootstrap?version=20180925
    return 'Hello World,url:%s,url1:%s, url2:%s' % (url, url1, url2)
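From the inline comments, buildStaticUrl appears to append a version query string to a static path for cache busting. A minimal sketch of this Flask-side helper (distinct from the crawler UrlManager above) that is consistent with those expected outputs; the hard-coded version value is an assumption and would normally come from app config:

class UrlManager(object):
    @staticmethod
    def buildUrl(path):
        # A real app might prepend a configurable domain or URL prefix here
        return path

    @staticmethod
    def buildStaticUrl(path):
        # '20180925' matches the expected output in the comment above;
        # assumed to be a release stamp read from configuration in practice
        release_version = '20180925'
        return UrlManager.buildUrl(path) + '?version=' + release_version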