def page_category(url):
    # Crawl a single category page: build three paginated URLs and scrape each.
    print('*****start page_category() begin crawling a single category page')
    recipes = []
    base_url = url
    urls = []  # holds the per-page URLs; three pages are built here
    for x in range(1, 4):
        urls.append(base_url + 'page/' + str(x) + '/')
    # Iterate over the page URLs and issue a request for each
    for url in urls:
        html = meishi_requests.get(url)
        for item in html.xpath('//div[@id="J_list"]//li'):
            temp = {}
            temp['show_img'] = verify_text(
                item.xpath('./div[@class="pic"]//img/@data-src'))
            temp['show_title'] = verify_text(
                item.xpath('./div[@class="detail"]//a/text()'))
            temp['show_username'] = verify_text(
                item.xpath(
                    './div[@class="detail"]//p[@class="subline"]/a/text()'))
            temp['material'] = verify_text(
                item.xpath('.//p[@class="subcontent"]/text()'))
            temp['detail'] = detail.parse_detail_recipe(
                item.xpath('./div[@class="detail"]//a/@href')[0])
            recipes.append(temp)
    return recipes

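# Hedged sketch: the meishi_requests helper used throughout this module is
# defined elsewhere; from the call sites, .get() must return an lxml element
# that supports .xpath(). A minimal stand-in consistent with that contract
# (the class name, headers, and timeout are assumptions, not the project's
# actual implementation):
import requests
from lxml import etree


class _MeishiRequests:
    HEADERS = {'User-Agent': 'Mozilla/5.0'}  # assumed browser-like UA

    def get(self, url):
        # Fetch the page and parse it into an lxml tree for XPath queries.
        resp = requests.get(url, headers=self.HEADERS, timeout=10)
        resp.encoding = resp.apparent_encoding
        return etree.HTML(resp.text)

# meishi_requests = _MeishiRequests()  # how the module-level name might be bound
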
def get_index_categories():
    # Extract the category data shown on the home page (first two categories only)
    print('*****start get_index_categories() begin extracting home-page category data')
    html = meishi_requests.get('https://www.meishichina.com/')
    # Grab the category names (no validation needed; the markup is highly predictable)
    categories = html.xpath('//div[@class="w5"]//h3[position() < 3]/a/text()')
    # Collect the detailed recipe data under each category
    all_recipes = []
    for item in html.xpath(
            '//div[@class="w5"]//div[@class="big4_list clear mt10"]/ul[position() < 3]'
    ):
        recipes = []
        for i in item.xpath('./li'):
            temp = {}
            temp['show_title'] = verify_text(i.xpath('.//p/text()'))
            temp['show_username'] = verify_text(
                i.xpath('.//a[@class="u"]/text()'))
            temp['show_img'] = verify_text(i.xpath('.//img/@data-src'))
            temp['detail'] = detail.parse_detail_recipe(
                verify_text(i.xpath('.//a[1]/@href')))
            recipes.append(temp)
        all_recipes.append(recipes)
    # Assemble the data and write it out
    for index, item in enumerate(all_recipes):
        data = {}
        data['categories'] = categories[index] if categories else ''
        data['recipes'] = item
        write_data('categories/index-categories', str(index + 1), data)
    print('*****done get_index_categories() finished extracting home-page category data')

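# Hedged sketch: verify_text and verify_list (used throughout) are assumed to
# guard against empty XPath results so a missing node degrades to ''/[] instead
# of raising IndexError. Minimal forms consistent with the call sites, not
# necessarily the project's originals:
def verify_text(xpath_result):
    # First matched text/attribute node as a stripped string, or '' if empty.
    return xpath_result[0].strip() if xpath_result else ''


def verify_list(xpath_result):
    # The matched list itself, or [] when nothing matched.
    return xpath_result if xpath_result else []
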
def get_all_categories():
    # Fetch the menus under every category: the first thirty on the page,
    # crawling 3 pages of each
    print('*****start get_all_categories() begin crawling all category pages')
    html = meishi_requests.get('https://home.meishichina.com/recipe-type.html')
    # No validation needed below; the markup is highly predictable
    for index, item in enumerate(
            html.xpath('//div[@class="category_sub clear"][1]//li')):
        data = {}
        data['categories'] = item.xpath('./a/@title')[0]
        data['recipes'] = page_category(item.xpath('./a/@href')[0])
        write_data('categories/all-categories', str(index + 1), data)
    print('*****done get_all_categories() finished crawling all category pages')

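# Hedged sketch: write_data is assumed to serialize a dict as JSON under a
# relative folder, mirroring the manual file writes later in this module
# (the '.txt' extension and path layout are guesses based on those writes):
import json
import os


def write_data(folder, filename, data):
    # Ensure the target folder exists, then dump the dict as UTF-8 JSON.
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, filename + '.txt'), 'w',
              encoding='utf-8') as fp:
        fp.write(json.dumps(data, ensure_ascii=False))
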
def get_detail_categories():
    # Crawl the data for the detailed categories
    print('*****start get_detail_categories() begin extracting detailed category data')
    # Request the recipe category page
    html = meishi_requests.get('https://home.meishichina.com/recipe.html')
    # Extract each category's name (no validation; the markup is highly predictable)
    for index, h3 in enumerate(html.xpath('//div[@class="ui_title"]//h3')):
        data = {}
        data['categories'] = h3.xpath('./a/text()')[0]
        data['recipes'] = get_ajax_data(
            h3.xpath('./a/@data')[0], h3.xpath('./a/@order')[0])
        # Write out the data
        write_data('categories/detail-categories', str(index + 1), data)
    print('*****done get_detail_categories() finished extracting detailed category data')

def get_slider_URL():
    # Collect the URLs from the home-page carousel; the XPath predicate filters
    # out one promo banner by its literal on-site title
    print('*****start get_slider_URL() extracting carousel URLs')
    data = {}
    html = meishi_requests.get('https://www.meishichina.com/')
    slider_pages_url = html.xpath(
        '//div[@id="home_index_slider"]/ul/li/a[@title != "2020,人人都是美食家"]/@href'
    )
    imgs_url = html.xpath(
        '//div[@id="home_index_slider"]/ul/li/a[@title != "2020,人人都是美食家"]/img/@src'
    )
    data['slider_pages_url'] = slider_pages_url
    data['imgs_url'] = imgs_url
    print('*****done get_slider_URL() finished extracting carousel URLs')
    return data

def parse_menu_pages(temporary_data):
    # Process the data on each menu page
    print('*****start parse_menu_pages(temporary_data) begin processing menu page data')
    data = {}
    # Carry over the fields already shown on the home page
    data['home_title'] = temporary_data['home_title']
    data['home_content'] = temporary_data['home_content']
    data['username'] = temporary_data['username']
    # Request the menu page itself
    html = meishi_requests.get(temporary_data['url'])
    # Parse the page data
    data['page_data'] = {}
    # Title
    data['page_data']['title'] = html.xpath(
        '//a[@id="collect_title"]/text()')[0].strip()
    # Creation time
    data['page_data']['creation_time'] = init_time(
        html.xpath('//div[@class="collect_dp"]/span/text()')[0])
    # Username
    data['page_data']['username'] = temporary_data['username']
    # Recipe entries listed on the page
    data['page_data']['recipes'] = []
    for item in html.xpath('//div[@id="J_list"]//li'):
        temp = {}
        temp['show_img'] = verify_text(
            item.xpath('./div[@class="pic"]//img/@data-src'))
        temp['show_title'] = verify_text(
            item.xpath('./div[@class="detail"]//a/text()'))
        temp['show_username'] = verify_text(
            item.xpath('./div[@class="detail"]//p[@class="subline"]/a/text()'))
        temp['material'] = verify_text(
            item.xpath('.//p[@class="subcontent"]/text()'))
        temp['detail'] = detail.parse_detail_recipe(
            item.xpath('./div[@class="detail"]//a/@href')[0])
        data['page_data']['recipes'].append(temp)
    # Write the assembled data to a file
    global INDEX
    with open('./menu_data/' + str(INDEX) + '.txt', 'w',
              encoding='utf-8') as fp:
        print('start writing file')
        fp.write(json.dumps(data, ensure_ascii=False))
        print('finished writing file')
    INDEX += 1
    print('*****done parse_menu_pages(temporary_data) finished processing menu page data')

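# Hedged sketch: parse_menu_pages() and parse_slider_pages() share a
# module-level INDEX counter for output file names, and init_time is assumed
# to normalize the raw creation-time text (its exact input format is unknown
# here, so this is a placeholder, not the project's implementation):
INDEX = 1  # assumed starting value for the sequential output counter


def init_time(raw):
    # Assumed: trim whitespace/newlines around the scraped date string.
    return raw.strip()
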
def parse_slider_pages(slider_page_url, slpage_img_url):
    # Process the page behind each carousel entry
    print('*****start parse_slider_pages(slider_page_url) extracting carousel page data')
    page_data = {}  # one dict under slider_pages_data
    page_data['img_url'] = slpage_img_url
    html = meishi_requests.get(slider_page_url)
    # Page description
    page_data['desc'] = verify_text(html.xpath('//p[@id="mof_desc"]/text()'))
    # Sub-headings and their contents
    page_data['list'] = []  # the list[] inside each slider_pages_data dict
    # Headings
    mo_result = verify_list(
        html.xpath('//div[@class="mo" and position() < last()-1]/h2/text()'))
    # Contents
    p_list = verify_list(html.xpath('//div[@class="msb"]/p/text()'))
    for index, p in enumerate(p_list):
        p_list[index] = p.replace('\n', '').strip()
    # Recipes for each section
    recipes_result = []  # the recipes[] inside each list[] entry
    msb_ul = verify_list(html.xpath('//div[@class="msb_list clear"]/ul'))
    for item in msb_ul:
        # Hand each set of detail-bearing <li> nodes to the detail module,
        # which returns a list of parsed recipes
        recipes_result.append(
            detail.parse_slider_recipes_pages(item.xpath('./li')))
    # Recombine the matching headings, contents, and recipe lists
    for index, item in enumerate(recipes_result):
        temp = {}
        temp['title'] = mo_result[index] if mo_result else ''
        temp['content'] = p_list[index] if p_list else ''
        temp['recipes'] = recipes_result[index]
        page_data['list'].append(temp)
    global INDEX
    with open('./slider_data/' + str(INDEX) + '.txt', 'w',
              encoding='utf-8') as fp:
        print('start writing file')
        fp.write(json.dumps(page_data, ensure_ascii=False))
    INDEX += 1
    print('*****done parse_slider_pages(slider_page_url) finished extracting carousel page data')

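# Hedged usage sketch: how get_slider_URL() and parse_slider_pages() are
# likely wired together; the actual driver lives outside this section.
def crawl_sliders():
    slider = get_slider_URL()
    for page_url, img_url in zip(slider['slider_pages_url'],
                                 slider['imgs_url']):
        parse_slider_pages(page_url, img_url)
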
def get_menus_URL():
    # Collect the URLs of the home-page menus
    print('*****start get_menus_URL() begin extracting home-page menu URLs')
    data_temp = []
    html = meishi_requests.get('https://www.meishichina.com/')
    # No fallback validation below: a failure here is a fatal data error, and
    # the program should crash rather than silently correct it
    for item in html.xpath('//div[@id="w2_slider"]//li'):
        temp = {}
        temp['url'] = item.xpath('.//a/@href')[0]
        temp['home_title'] = item.xpath('.//a/text()')[0].strip()
        temp['home_content'] = init_text(item.xpath('./p/text()'))
        temp['username'] = item.xpath('./p/span/text()')[0].strip()
        data_temp.append(temp)
    print('*****done get_menus_URL() finished extracting home-page menu URLs')
    # Return the parsed URLs along with the data shown on the home page
    return data_temp

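# Hedged sketch: init_text (used above) is assumed to collapse the matched
# text nodes into one cleaned string; the driver below shows how the menu
# URLs might be wired into the page parser (both are illustrations):
def init_text(text_nodes):
    # Assumed: join the text nodes and strip newlines and edge whitespace.
    return ''.join(text_nodes).replace('\n', '').strip()


def crawl_menus():
    for temporary_data in get_menus_URL():
        parse_menu_pages(temporary_data)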