class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        for url in urls:
            try:
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('Crawl failed')
        self.output.output_end()
        print('Crawl finish')
class SpiderMain:
    def __init__(self):
        """Initialisation: instantiate the other components."""
        self.url_manager = UrlManager()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.data_storage = DataStorage()

    def start(self):
        """
        Main entry point of the spider.
        :return:
        """
        # Page numbers
        title = set()
        for a in range(2, 10):
            html = self.html_downloader.download(
                'http://ggzy.foshan.gov.cn/jyxx/fss/zfcg_1108551/zbxx/index_' + str(a) + '.html?1')
            _title = self.html_parser.titleParer(html)
            for i in _title:
                title.add(i)
        for i in title:
            print(i)
            html = self.html_downloader.download(i)
            _product = self.html_parser.contextParer(html)
            self.data_storage.storage(_product)
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = HtmlOutputer()

    def craw(self, root_url, page_amount=5, time_sleep=None):
        count = 1
        # Add the first URL to crawl
        self.urls.add_new_url(root_url)
        # While the set has URLs, take one and request it; leave the loop when there are none left
        while self.urls.has_new_url():
            try:
                # Start crawling
                new_url = self.urls.get_new_url()
                print(f'craw {count}: {new_url}')
                # Request the URL and get the HTML back
                html_content = self.downloader.download(new_url)
                # Parse the HTML with XPath to get the data we need
                new_urls, new_data = self.parser.parse(html_content)
                # Add the <a> links found on this entry page to the URL manager to be crawled
                self.urls.add_new_urls(new_urls)
                self.output.collect_data(new_url, new_data)
                count += 1
                if count > page_amount:
                    break
                time.sleep(2)
            except Exception as e:
                print(e)
                print(f'Crawl failed: {new_url}')
        self.output.output_html()
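# A minimal sketch of how the SpiderMain above might be driven, assuming its
# companion modules (UrlManager, HtmlDownloader, HtmlParser, HtmlOutputer) are
# importable; the entry URL is the Baidu Baike page that also appears in the
# downloader test at the end of this collection.
if __name__ == '__main__':
    root_url = 'https://baike.baidu.com/item/Python/407313'
    spider = SpiderMain()
    spider.craw(root_url, page_amount=5)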
class SpiderWork(object):
    def __init__(self):
        # Register the names used to fetch the queues from the manager
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        print(self.task.qsize())
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        import time
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('The control node told this crawler node to stop working...')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Crawler node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError as e:
                print('Failed to connect to the worker node')
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, url):
        count = 1
        self.urls.add_new_url(url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                html_cont = self.downloader.download(new_url)
                new_urls, html_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(html_data)
                print("%d craw success : %s" % (count, new_url))
                if count >= 10:
                    break
                count = count + 1
            except Exception as e:
                print(str(e))
                print("%d craw failed : %s" % (count, new_url))
        self.outputer.output()
def craw(self):
    # Download the index page
    downloader = HtmlDownloader()
    root_cont = downloader.download(self.url)
    parser = HtmlParser()
    urls, data = parser.parse(self.url, root_cont, True)
    result = ""
    for url in urls:
        cont = downloader.download(url)
        newurls, month = parser.parse(url, cont, False)
        if month is not None:
            result += month.getMonthly()
            month = None
            # print(month.getMonthly())
    f = open("阿里巴巴数据库内核组月报.md", "w+", encoding='utf-8')
    result = "## 阿里巴巴数据库内核月报\n\n" + result
    f.write(result)
    f.close()
class Spider:
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parse_url(root_url, content)
        for url in urls:
            try:
                # http://service.library.mtime.com/Movie.api
                # ?Ajax_CallBack=true
                # &Ajax_CallBackType=Mtime.Library.Services
                # &Ajax_CallBackMethod=GetMovieOverviewRating
                # &Ajax_CrossDomain=1
                # &Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F246526%2F&t=201710117174393728&Ajax_CallBackArgument0=246526
                t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallbackArgument0=%s' % (url[0].replace('://', '%3A%2F%2F')[:-1], t, url[1])
                rank_content = self.downloader.download(rank_url)
                if rank_content is None:
                    print('None')
                data = self.parser.parse_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                raise e
                # print(e)
                # print('Crawl failed')
        self.output.output_end()
        print('Crawl finish')
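# A hedged alternative for building the Ajax_RequestUrl parameter above: instead
# of hand-rolling the '://' replacement, urllib.parse.quote with safe='' percent-
# encodes the whole URL (including '/'), which matches the encoded form shown in
# the comment block. Whether the Mtime API accepts the trailing encoded slash is
# not verified here; this only illustrates the encoding step.
from urllib.parse import quote

movie_url = 'http://movie.mtime.com/246526/'  # example URL from the comment above
encoded = quote(movie_url, safe='')
print(encoded)  # http%3A%2F%2Fmovie.mtime.com%2F246526%2F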
class SpiderWorker:
    def __init__(self, address='127.0.0.1', port=8001, authkey=b'baike'):
        """Set up the worker node's connection in the distributed setup."""
        # Register the names used to fetch the queues
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Connect to the server
        print('Connect to server %s:%s...' % (address, port))
        self.manager = BaseManager(address=(address, port), authkey=authkey)
        # Establish the connection
        self.manager.connect()
        # Obtain the Queue objects
        self.task_q = self.manager.get_task_queue()
        self.result_q = self.manager.get_result_queue()
        # Initialise the downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task_q.empty():
                    url = self.task_q.get()
                    if url == 'end':
                        print('The control node told this crawler node to stop working...')
                        # Pass the stop signal on to the other nodes
                        self.result_q.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Crawler node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parse(url, content)
                    self.result_q.put({'new_urls': new_urls, 'data': data})
                else:
                    print('task queue is empty', self.task_q.empty())
            except EOFError:
                print('Failed to connect to the worker node')
                return
            except Exception as e:
                print(e)
                print('crawl fail')
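# A minimal sketch of the control node that SpiderWork/SpiderWorker above connect
# to: it serves a task queue and a result queue over multiprocessing.managers at
# 127.0.0.1:8001 with authkey b'baike', following the remote-manager pattern from
# the standard library docs. How URLs are fed in and results are drained is left
# out; the seed URL below is only a placeholder assumption.
from multiprocessing.managers import BaseManager
from queue import Queue

task_queue = Queue()
result_queue = Queue()


def get_task_queue():
    return task_queue


def get_result_queue():
    return result_queue


class QueueManager(BaseManager):
    pass


if __name__ == '__main__':
    # Expose the two queues under the names the worker nodes register.
    QueueManager.register('get_task_queue', callable=get_task_queue)
    QueueManager.register('get_result_queue', callable=get_result_queue)
    manager = QueueManager(address=('127.0.0.1', 8001), authkey=b'baike')
    # Seed the task queue, then serve it to the worker nodes from this process.
    task_queue.put('https://baike.baidu.com/item/Python/407313')  # placeholder seed URL
    task_queue.put('end')  # stop marker understood by the workers above
    server = manager.get_server()
    server.serve_forever()  # result handling is omitted in this sketch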
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes
        # self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.path = "/Users/spike/python_项目/get_cd_school/"
        # # Crawl start URL
        # self.root_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages=1'
        # # Used to build the follow-up URLs
        # self.split_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages='
        # # school info
        # self.school_infos = []

    def craw(self, downloading_url):
        try:
            # Record the URL being downloaded/parsed, to make errors easier to trace
            # downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            # Second argument: the base URL used to join relative links
            self.school_infos = self.html_parser.province_parser(html_content)
            # print(self.school_infos)
            # exit()
            if len(self.school_infos) != 20:
                print(downloading_url + " parsed successfully")
                print("Records on this page: " + str(len(self.school_infos)))
            # print(self.province_url_list)
            with open(self.path + "school.txt", "a") as f:
                # print("writing")
                for mc, xd, qy, xz, dh, dz in self.school_infos:
                    f.write(mc + "\t" + xd + "\t" + qy + "\t" + xz + "\t" + dh + "\t" + dz)
            return len(self.school_infos)
        except Exception as e:
            print('[ERROR] Craw failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Crawl start URL
        # self.root_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages=1'
        # # Used to build the follow-up URLs
        # self.split_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages='
        # # school info
        # self.school_infos = []
        # The log file path needs to be adjusted for your machine
        # self.last_log_path = "d:\\log.txt"
        # self.last_log_path = "/Users/spike/spider_log.txt"

    def craw(self, downloading_url):
        try:
            # Record the URL being downloaded/parsed, to make errors easier to trace
            # downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            self.school_infos = self.html_parser.province_parser(html_content)
            # print(self.school_infos)
            if len(self.school_infos) != 20:
                print(downloading_url + " parsed successfully")
                print("Records on this page: " + str(len(self.school_infos)))
            for mc, xd, qy, xz, dh, dz in self.school_infos:
                # print(mc + xd + qy + xz + dh + dz)
                province_id = self.mysql_handler.insert(mc, xd, qy, xz, dh, dz)
                # print(province_id)
                # exit()
            # Record the URL being downloaded/parsed, to make errors easier to trace
            # self.mysql_handler.close()
            return len(self.school_infos)
        except Exception as e:
            print('[ERROR] Craw failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
            time.sleep(60)
class SpiderMain:
    def __init__(self):
        self.url_manager = UrlManager()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.data_storage = DataStorage()

    def start(self):
        """
        Main entry point of the spider.
        :return:
        """
        self.url_manager.add_new_url(
            "http://127.0.0.1:8848/xiaomi-master/index.html")
        # Get a URL from the URL manager
        url = self.url_manager.get_new_url()
        # Download it with the downloader
        html = self.html_downloader.download(url)
        # Parse the HTML
        res = self.html_parser.parser(html)
        # Store the data
        self.data_storage.storage(res)
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()                            # get a new URL
                html_cont = self.downloader.download(new_url)                # download its content
                new_urls, new_data = self.parser.parse(new_url, html_cont)   # parse the content
                self.urls.add_new_urls(new_urls)                             # add the newly found URLs to the URL manager
                self.outputer.collect_data(new_data)                         # collect the parsed data
                if count == 200:
                    break
                count = count + 1
            except:
                print("craw failed")
        self.outputer.output_html()
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Crawl start URL
        self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
        # Used to build the follow-up URLs
        self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        # Province page list
        self.province_url_list = []
        # City page list
        self.city_url_list = []
        # District/county page list
        self.county_url_list = []
        # Town/street page list
        self.town_url_list = []
        self.last_log_path = "d:\\log.txt"

    def craw(self):
        try:
            # Record the URL being downloaded/parsed, to make errors easier to trace
            downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            # Second argument: the base URL used to join relative links
            self.province_url_list = self.html_parser.province_parser(
                html_content, self.split_url)
            # print(self.province_url_list)
            pro = self.province_url_list
            # print(self.province_url_list[0][0])
            with open(self.last_log_path, "r") as r:
                last_log = r.read()
            # print(last_log)
            if last_log != "":
                last_log_index = pro.index(tuple(last_log.split(';')))
                # print("index: " + str(last_log_index))
                for i in range(last_log_index):
                    del self.province_url_list[0]
                print("Remaining after removing already-downloaded provinces: "
                      + str(len(self.province_url_list)) + ", total: 31")
                # print(self.province_url_list)
                # exit()
            # else:
            #     print("Download starting, total: " + str(len(pro)))
            # print(last_log_index)
            # exit()
            for province_name, province_url, province_code in self.province_url_list:
                # print(province_code)
                # Record the last province being downloaded
                last_record = (province_name, province_url, province_code)
                # print(last_record)
                with open(self.last_log_path, "w") as l:
                    # last_name = province_name.encode('utf8')
                    l.write(last_record[0] + ";" + last_record[1] + ";" + last_record[2])
                # exit()
                province_id = self.mysql_handler.insert(
                    province_code + '0000000000', province_name)
                # print(province_id)
                # Record the URL being downloaded/parsed, to make errors easier to trace
                downloading_url = province_url
                html_content = self.html_downloader.download(downloading_url)
                self.city_url_list = self.html_parser.city_parser(
                    html_content, self.split_url)
                for city_name, city_url, city_code in self.city_url_list:
                    city_id = self.mysql_handler.insert(city_code, city_name)
                    # Municipalities, for example, have no lower-level page
                    if city_url is None:
                        continue
                    # Record the URL being downloaded/parsed, to make errors easier to trace
                    downloading_url = city_url
                    html_content = self.html_downloader.download(
                        downloading_url)
                    self.county_url_list = self.html_parser.county_parser(
                        html_content, self.split_url + province_code + "/")
                    for county_name, county_url, county_code in self.county_url_list:
                        county_id = self.mysql_handler.insert(
                            county_code, county_name)
                        if county_url is None:
                            continue
                        # Record the URL being downloaded/parsed, to make errors easier to trace
                        downloading_url = county_url
                        html_content = self.html_downloader.download(
                            downloading_url)
                        self.town_url_list = self.html_parser.town_parser(
                            html_content, self.split_url)
                        for town_name, town_url, town_code in self.town_url_list:
                            # Print the scraped town/street name, link (not actually needed) and code
                            print(town_name, town_url, town_code)
                            self.mysql_handler.insert(town_code, town_name)
            self.mysql_handler.close()
        except Exception as e:
            print('[ERROR] Craw failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
            time.sleep(60)
            return self.craw()
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Crawl start URL
        self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
        # Used to build the follow-up URLs
        self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        # Province page list
        self.province_url_list = []
        # City page list
        self.city_url_list = []
        # District/county page list
        self.county_url_list = []
        # Town/street page list
        self.town_url_list = []

    def craw(self):
        try:
            # Record the URL being downloaded/parsed, to make errors easier to trace
            downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            # Second argument: the base URL used to join relative links
            self.province_url_list = self.html_parser.province_parser(
                html_content, self.split_url)
            for province_name, province_url, province_code in self.province_url_list:
                # First argument: 1 - insert a province; 2 - a city; 3 - a district; 4 - a town/street
                # Second argument: the province/city/district/street name
                # Third argument: the parent id (a province has no parent id)
                # Fourth argument: the administrative division code
                province_id = self.mysql_handler.insert(
                    1, province_name, None, None)
                if province_id == 0:
                    continue
                sleep(5)
                # Record the URL being downloaded/parsed, to make errors easier to trace
                downloading_url = province_url
                try:
                    html_content = self.html_downloader.download(
                        downloading_url)
                except Exception as e:
                    sleep(10)
                    print(e, "re-downloading province page")
                    html_content = self.html_downloader.download(
                        downloading_url)
                self.city_url_list = self.html_parser.city_parser(
                    html_content, self.split_url)
                for city_name, city_url, city_code in self.city_url_list:
                    city_id = self.mysql_handler.insert(
                        2, city_name, province_id, city_code)
                    # Municipalities, for example, have no lower-level page
                    if city_url is None:
                        continue
                    # Record the URL being downloaded/parsed, to make errors easier to trace
                    downloading_url = city_url
                    try:
                        html_content = self.html_downloader.download(
                            downloading_url)
                    except Exception as e:
                        sleep(10)
                        print(e, "re-downloading municipality page")
                        html_content = self.html_downloader.download(
                            downloading_url)
                    self.county_url_list = self.html_parser.county_parser(
                        html_content, self.split_url + province_code + "/")
                    for county_name, county_url, county_code in self.county_url_list:
                        county_id = self.mysql_handler.insert(
                            3, county_name, city_id, county_code)
                        if county_url is None:
                            continue
                        # Record the URL being downloaded/parsed, to make errors easier to trace
                        downloading_url = county_url
                        try:
                            html_content = self.html_downloader.download(
                                downloading_url)
                        except Exception as e:
                            sleep(10)
                            print(e, "re-downloading town page")
                            html_content = self.html_downloader.download(
                                downloading_url)
                        self.town_url_list = self.html_parser.town_parser(
                            html_content, self.split_url)
                        for town_name, town_url, town_code in self.town_url_list:
                            # Print the scraped town/street name, link (not actually needed) and code
                            if town_code == "130408100000":
                                print(town_url)
                            print(town_name, town_url, town_code)
                            self.mysql_handler.insert(4, town_name, county_id,
                                                      town_code)
            self.mysql_handler.close()
        except Exception as e:
            print('[ERROR] Craw failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
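# A minimal sketch of the MysqlHandler interface assumed by the crawler above,
# following the parameter comment in craw(): level (1-4), name, parent id,
# division code, returning the new row id. The table and column names, the
# connection parameters and the use of pymysql are illustrative assumptions,
# not the project's actual schema or driver.
import pymysql


class MysqlHandler(object):
    def __init__(self, host='localhost', user='root', password='', database='region'):
        # Connection parameters are placeholders; adjust for your environment.
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=database, charset='utf8mb4')

    def insert(self, level, name, parent_id, code):
        # Hypothetical table: division(level, name, parent_id, code)
        with self.conn.cursor() as cursor:
            cursor.execute(
                'INSERT INTO division (level, name, parent_id, code) VALUES (%s, %s, %s, %s)',
                (level, name, parent_id, code))
            new_id = cursor.lastrowid
        self.conn.commit()
        return new_id

    def close(self):
        self.conn.close()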
class LinkExtractor(object):
    def __init__(self):
        self.counter = 0
        self.k_count = 0
        self.downloader = HtmlDownloader()

    def get_menu_page_info(self, menu_page_url):
        if menu_page_url is None:
            return None
        html_text = self.downloader.download(menu_page_url)
        if html_text is None:
            return None
        self.counter = (self.counter + 1) % 100
        if self.counter == 0:
            self.k_count += 1
            print('Get Menu Pages: %d00' % self.k_count)
        return self.parse_menu_page_info(html_text)

    def parse_menu_page_info(self, html_text):
        if html_text is None:
            return None
        soup = BeautifulSoup(html_text, 'lxml')
        menu_page_data = []
        for entry in soup.select('.r-ent'):
            data = {
                'title': entry.select('.title')[0].text.strip(),
                'post_url': PTT_HOST_URL + entry.select('.title > a')[0].get('href')
                            if entry.select('.title > a') else None,
                'date': entry.select('.date')[0].text.strip(),
                'author': entry.select('.author')[0].text.strip(),
                'visited': 0
            }
            menu_page_data.append(data)
        return menu_page_data

    # Fetch the post links into the post_url_infos table
    def fetch_menu_page_links(self, menu_page_url):
        menu_page_data = self.get_menu_page_info(menu_page_url)
        if menu_page_data is not None:
            url_manager.add_new_url_infos(menu_page_data)

    def next_page(self, html_text):
        soup = BeautifulSoup(html_text, 'lxml')
        if soup.find_all('a', class_='btn wide', text='下頁 ›'):
            return PTT_HOST_URL + soup.find_all('a', class_='btn wide', text='下頁 ›')[0].get('href')
        return None

    def run(self, root_menu_page, min_menu_page_index=1, max_menu_page_index=6000, threadNum=5):
        print('===================== start run extractor() ========================')
        try:
            pool = threadpool.ThreadPool(threadNum)
            menu_page_urls = [root_menu_page.format(i)
                              for i in range(min_menu_page_index, max_menu_page_index)]
            requests = threadpool.makeRequests(self.fetch_menu_page_links, menu_page_urls)
            [pool.putRequest(req) for req in requests]
            pool.wait()
            print('link extractor done.')
        except:
            print('link_extractor exception')
            raise
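# A minimal sketch of how LinkExtractor might be driven, assuming PTT_HOST_URL is
# defined as in the class above; the board index template and page range are
# illustrative assumptions, not values taken from the project. '{}' is filled in
# by root_menu_page.format(i) inside run().
if __name__ == '__main__':
    root_menu_page = PTT_HOST_URL + '/bbs/Gossiping/index{}.html'  # hypothetical board path
    extractor = LinkExtractor()
    extractor.run(root_menu_page, min_menu_page_index=1, max_menu_page_index=10, threadNum=5)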
print(year)
year_date_list = getAllDayPerYear(year)
# print(year_date_list)
for comregdate in year_date_list:
    print(comregdate)
    errcnt = 0
    pagecnt_tmp = 0
    for pagecnt in range(0, 1000):
        url = (r'https://gongshang.mingluji.com/' + province + r'/riqi/'
               + comregdate + r'?page=' + str(pagecnt))
        # print(url)
        time.sleep(1)
        pagecnt_tmp = pagecnt
        try:
            html_content = hd.download(url)
            hp.cityparase(html_content, cursor, province, comregdate)
            conn.commit()
            print(province, comregdate, pagecnt)
        except Exception as e:
            print(e)
            with open('download.err', 'a') as f:
                f.write(url + '\n')
            if (pagecnt - pagecnt_tmp == 0):
                errcnt += 1
            # print(pagecnt)
            # print(pagecnt_tmp)
            # print(errcnt)
            if (errcnt > 10):
                break
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes
        # self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.path = "D:\\python_work\\get_diqu_dm\\"
        # Crawl start URL
        self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
        # Used to build the follow-up URLs
        self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        # Province page list
        self.province_url_list = []
        # City page list
        self.city_url_list = []
        # District/county page list
        self.county_url_list = []
        # Town/street page list
        self.town_url_list = []

    def craw(self):
        try:
            # Record the URL being downloaded/parsed, to make errors easier to trace
            downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            # Second argument: the base URL used to join relative links
            self.province_url_list = self.html_parser.province_parser(html_content, self.split_url)
            # print(self.province_url_list)
            with open(self.path + "shen_daima.txt", "a") as f:
                for province_name, province_url, province_code in self.province_url_list:
                    province_code = province_code + '0000000000'
                    f.write(province_code + "\t" + province_name + "\n")
                    # First argument: 1 - insert a province; 2 - a city; 3 - a district; 4 - a town/street
                    # Second argument: the province/city/district/street name
                    # Third argument: the parent id (a province has no parent id)
                    # Fourth argument: the administrative division code
                    # province_id = self.mysql_handler.insert(1, province_name, None, None)
                    # Record the URL being downloaded/parsed, to make errors easier to trace
                    downloading_url = province_url
                    html_content = self.html_downloader.download(downloading_url)
                    self.city_url_list = self.html_parser.city_parser(html_content, self.split_url)
                    with open(self.path + "other_daima.txt", "a") as o:
                        for city_name, city_url, city_code in self.city_url_list:
                            o.write(city_code + "\t" + city_name + "\n")
                            # city_id = self.mysql_handler.insert(2, city_name, province_id, city_code)
                            # Municipalities, for example, have no lower-level page
                            if city_url is None:
                                continue
                            # Record the URL being downloaded/parsed, to make errors easier to trace
                            downloading_url = city_url
                            html_content = self.html_downloader.download(downloading_url)
                            self.county_url_list = self.html_parser.county_parser(
                                html_content, self.split_url + province_code + "/")
                            for county_name, county_url, county_code in self.county_url_list:
                                o.write(county_code + "\t" + county_name + "\n")
                                # county_id = self.mysql_handler.insert(3, county_name, city_id, county_code)
                                if county_url is None:
                                    continue
                                # Record the URL being downloaded/parsed, to make errors easier to trace
                                print('To deal with county')
                                downloading_url = county_url
                                html_content = self.html_downloader.download(downloading_url)
                                self.town_url_list = self.html_parser.town_parser(html_content, self.split_url)
                                for town_name, town_url, town_code in self.town_url_list:
                                    # Write the scraped town/street name and code; the link is printed but not needed
                                    o.write(town_code + "\t" + town_name + "\n")
                                    print(town_name, town_url, town_code)
                                    # self.mysql_handler.insert(4, town_name, county_id, town_code)
            # self.mysql_handler.close()
        except Exception as e:
            print('[ERROR] Craw failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
class SpiderMain():
    """Main module of the spider."""

    def __init__(self):
        """Constructor: initialise the components."""
        self.urls = UrlManager()
        self.log = MyLog("spider_main", "logs")
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()
        # self.util = utill.DBConn()

    def craw(self, root_url):
        """Spider entry point."""
        areas = {
            "gulou": 100, "jianye": 72, "qinhuai": 100, "xuanwu": 67,
            "yuhuatai": 32, "qixia": 62, "baijiahu": 33, "chalukou1": 26,
            "jiangningqita11": 3, "dongshanzhen": 29, "jiangningdaxuecheng": 15,
            "jiulonghu": 12, "jiangjundadao11": 22, "kexueyuan": 9,
            "qilinzhen": 42, "tiexinqiao": 9, "pukou": 100, "liuhe": 1,
        }
        # areas = {"gulou": 1}

        # 1. Collect the links of all second-hand-home detail pages and add them to the URL manager
        for area, pg_sum in areas.items():
            for num in range(1, pg_sum + 1):
                # 1.1 Build the listing page URL, e.g. https://nj.lianjia.com/ershoufang/gulou/pg2/
                pg_url = root_url + area + "/pg" + str(num) + "/"
                self.log.logger.info("1.1 Built listing page URL: " + pg_url)
                print("1.1 Built listing page URL: " + pg_url)
                # 1.2 Download the page
                try:
                    html_cont = self.downloader.download(pg_url)
                except Exception as e:
                    self.log.logger.error("1.2 Exception while downloading the page: " + repr(e))
                    time.sleep(60 * 30)
                else:
                    # 1.3 Parse the listing page, collect the detail-page links and add them to the URL manager
                    try:
                        ershoufang_urls = self.parser.get_erhoufang_urls(
                            html_cont)
                    except Exception as e:
                        self.log.logger.error("1.3 Exception while parsing the page: " + repr(e))
                    else:
                        self.urls.add_new_urls(ershoufang_urls)
                        # Pause for a whole number of seconds in the range [0, 3]
                        time.sleep(random.randint(0, 3))
            time.sleep(60 * 20)

        # 2. Parse the detail pages
        id = 1
        stop = 1
        while self.urls.has_new_url():
            # 2.1 Get a URL
            try:
                detail_url = self.urls.get_new_url()
                self.log.logger.info("2.1 Detail page URL: " + detail_url)
                print("2.1 Detail page URL: " + detail_url)
            except Exception as e:
                print("2.1 Exception while getting the URL")
                self.log.logger.error("2.1 Exception while getting the URL: " + detail_url)
            # 2.2 Download the page
            try:
                detail_html = self.downloader.download(detail_url)
            except Exception as e:
                self.log.logger.error("2.2 Exception while downloading the page: " + repr(e))
                self.urls.add_new_url(detail_url)
                time.sleep(60 * 30)
            else:
                # 2.3 Parse the page
                try:
                    ershoufang_data = self.parser.get_ershoufang_data(
                        detail_html, id)
                except Exception as e:
                    self.log.logger.error("2.3 Exception while parsing the page: " + repr(e))
                else:
                    # 2.4 Output the data
                    try:
                        self.outputer.collect_data(ershoufang_data)
                    except Exception as e:
                        self.log.logger.error("2.4 Exception while outputting the data: " + repr(e))
                    else:
                        print(id)
                        id = id + 1
                        stop = stop + 1
                        # Pause for a whole number of seconds in the range [0, 3]
                        time.sleep(random.randint(0, 3))
            if stop == 2500:
                stop = 1
                time.sleep(60 * 20)
# -*- coding: utf-8 -*-
# @Author: cyb
from html_downloader import HtmlDownloader

downloader = HtmlDownloader()
html_content = downloader.download(
    url='https://baike.baidu.com/item/Python/407313')
print(html_content)
# Verified that the returned html_content is a purely static page that already
# contains all the data we want.
# resp.text sometimes guesses the encoding wrongly and produces mojibake,
# so resp.content.decode() is used instead.
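# A minimal sketch of what the html_downloader module above might look like,
# based only on the usage shown in this collection (a download(url) method that
# returns the page text or None); the headers and timeout values are illustrative
# assumptions. It decodes resp.content itself, matching the note above about
# resp.text occasionally producing mojibake.
import requests


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder UA string
        try:
            resp = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            return None
        if resp.status_code != 200:
            return None
        # Decode the raw bytes instead of trusting resp.text's guessed encoding.
        return resp.content.decode('utf-8', errors='ignore')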