def spider_and_save_area_page(area):
    """Fetch the info page of ``area``, save a copy to disk and return it.

    The URL comes from ``urls.gen_url_area_info(area)`` and the target file
    name from ``filenames.get_filename_area(area)``.  The fetched page is
    handed to a ``utils.DataSaver`` bound to that file name and is also
    returned to the caller.
    """
    area_url = urls.gen_url_area_info(area)
    # The saver is built before the fetch, so any work its constructor does
    # happens ahead of the network request.
    page_saver = utils.DataSaver(filenames.get_filename_area(area))
    content = utils.fetch_page(area_url)
    page_saver.save(content)
    return content
def process_area_page(area):
    """Crawl (or reload) the xiaoqu listing pages of ``area`` and store them.

    Steps:
      1. Obtain the area page.
      2. Parse basic information from it, such as ``max_page``.
      3. Generate the URL of each listing page to crawl.
      4. Depending on ``args_mode``, fetch the URL and store the page in a
         file; the page is then always read back from that file.
      5. Parse the page to obtain the data.
      6. Store the data in MySQL.

    When the module-level ``args_mode`` equals ``"from_net"`` pages are
    downloaded and saved first; otherwise previously saved files are used.
    Each parsed result is written to the ``xiaoqu`` table as a
    delete-then-insert pair keyed on ``name``.  Nothing is returned.
    """
    # 1. Obtain the area page.
    if args_mode == "from_net":
        page = spider_and_save_area_page(area)
    else:
        page = utils.read_file(filenames.get_filename_area(area))

    # 2. Parse basic information, such as max_page.
    parser = parse_xiaoqu.XiaoquParser()
    parser.feed(page)
    max_page = parser.max_page

    mclient = mysql.MysqlClient()
    mysql.create_table_xiaoqu()

    for page_index in range(max_page):
        # 3. Generate the URL to crawl (listing pages are numbered from 1).
        xiaoqu_url = urls.gen_url_xiaoqu(area, page_index + 1)

        # 4. Fetch and save the page when running from the network; in
        #    either mode the content is then read from the saved file.
        filename = filenames.get_filename_xiaoqu(area, xiaoqu_url)
        if args_mode == "from_net":
            page = utils.fetch_page(xiaoqu_url)
            saver = utils.DataSaver(filename)
            saver.save(page)
        page = utils.read_file(filename)

        # 5. Parse the page to obtain the data.
        parser = parse_xiaoqu.XiaoquParser()
        parser.feed(page)

        # 6. Store this page's results in MySQL.  Only the rows parsed from
        #    the current page are written; rows from earlier pages were
        #    already stored and are not deleted and re-inserted again.
        for result in parser.output():
            # Single-argument form: same text as ``print "get xiaoqu:", result``.
            print("get xiaoqu: %s" % (result,))
            # NOTE(review): both statements are built by string interpolation
            # from scraped text, so a name containing a quote will break (or
            # alter) the SQL.  Switch to bound parameters if
            # ``mclient.execute`` supports them -- confirm its signature.
            cmd = "delete from xiaoqu where name = '%s'" % (result)
            mclient.execute(cmd)
            cmd = "insert into xiaoqu (name, area) values ('%s', '%s')" % (result, area)
            mclient.execute(cmd)

        # NOTE(review): processing stops after the third page; this looks
        # like a debugging limit -- confirm before relying on full coverage.
        if page_index == 2:
            break