def ping_waimai_meituan_restaurant_by_id(id):
    """Probe one Meituan Waimai restaurant id and store it if the page exists.

    The web page for ``id`` is fetched, cleaned with
    ``html_content_without_special_chars`` and parsed. The restaurant counts
    as existing when the page contains at least one ``.rest-info`` element;
    in that case an ``effective_restaurant`` record is written to the
    ``effective_restaurants`` collection through ``Insert``.

    Args:
        id: Restaurant id; it is appended to the web and wap base URLs.
    """
    id_text = str(id)
    print("******" + id_text + "*****")

    url_web = waimei_meituan_com_restaurant_url_web + id_text
    url_wap = waimei_meituan_com_restaurant_url_wap + id_text

    cleaned_html = html_content_without_special_chars(get_response_by_url(url_web))
    page = BeautifulSoup(cleaned_html)

    # No ".rest-info" element on the page: nothing to record.
    if not page.select(".rest-info"):
        return

    record = effective_restaurant()
    record.id = id
    record.url_web = url_web
    record.url_wap = url_wap
    record.waimei_src = 'meituan'
    record.waimai_src_cn = '美团外卖'

    document = vars(record)
    print(document)
    print("-----------------------------id为:" + id_text + "-----------------------------------")
    Insert(document, "effective_restaurants")
def ping_waimai_taobao_shop_by_id(id):
    """Fetch the Taobao Waimai wap API page and print what was parsed.

    In its current state this function only fetches
    ``taobao_waimai_url_wap_api``, prints the raw and parsed results and then
    returns; the code that would store an ``effective_restaurant`` record is
    unreachable (see the notes below).

    Args:
        id: Shop id. It is printed and used to build ``url_wap``, but it is
            not part of the URL that is actually requested.
    """
    print("******" + str(id) + "*****")
    url_web = taobao_waimai_url_web
    url_wap = taobao_waimai_url_wap + str(id)
    # NOTE(review): the request uses taobao_waimai_url_wap_api unchanged, so
    # the response does not depend on `id` unless that constant already
    # embeds one -- confirm the intended URL.
    html = get_response_by_url(taobao_waimai_url_wap_api)
    print(html)
    html = html_content_without_special_chars(html)
    soup = BeautifulSoup(html)
    # print(soup)
    noexits_soup = soup.select("body .page .shop-info")
    print(noexits_soup)
    # NOTE(review): unconditional return -- everything below is dead code.
    # It looks like a debugging short-circuit; confirm whether the insert
    # branch is meant to be re-enabled before removing either part.
    return
    is_restaurant_exist = len(noexits_soup) > 0
    # print(is_restaurant_exist)
    if (is_restaurant_exist):
        model = effective_restaurant()
        model.id = id
        model.url_web = url_web
        model.url_wap = url_wap
        model.waimei_src = 'taobao'
        model.waimai_src_cn = '淘宝外卖'
        class_model = model.__dict__
        print(class_model)
        print("-----------------------------id为:" + str(id) + "-----------------------------------")
        Insert(class_model, "effective_restaurants")
    return
def ping_waimai_eleme_shop_by_id(id):
    """Probe one Eleme shop id through the wap API and store it if it exists.

    The API response for ``id`` is fetched and cleaned with
    ``html_content_without_special_chars``. A response whose text contains
    the substring ``"message"`` is treated as "shop does not exist";
    otherwise an ``effective_restaurant`` record (including the API URL) is
    written to the ``effective_restaurants`` collection through ``Insert``.

    Args:
        id: Shop id; it is appended to the web, wap and API base URLs.
    """
    id_text = str(id)
    print("******" + id_text + "*****")

    url_web = eleme_shop_url_web + id_text
    url_wap = eleme_shop_url_wap + id_text
    url_api = eleme_shop_url_wap_api + id_text

    response_text = html_content_without_special_chars(get_response_by_url(url_api))
    print(response_text)

    shop_missing = "message" in str(response_text)
    print(shop_missing)
    if shop_missing:
        return

    record = effective_restaurant()
    record.id = id
    record.url_web = url_web
    record.url_wap = url_wap
    record.url_api = url_api
    record.waimei_src = 'eleme'
    record.waimai_src_cn = '饿了么'

    document = vars(record)
    print(document)
    print("-----------------------------id为:" + id_text + "-----------------------------------")
    Insert(document, "effective_restaurants")
    print("*********************************Insert id为:" + id_text + "*********************************")
def get_areas_from_Mysql():
    """Copy every ``MySql_Area`` row into the ``Spider_China_Areas`` collection.

    All rows are loaded through a ``DBSession``, each one is converted to an
    ``Area`` object, and the ``__dict__`` of those objects is bulk-inserted
    with ``Insert``. The session is closed once the rows have been copied,
    even if the query or the conversion raises. Nothing is inserted when the
    table yields no rows.
    """
    area_fields = (
        "id",
        "pid",
        "shortname",
        "name",
        "merger_name",
        "level",
        "pinyin",
        "code",
        "zip_code",
        "first",
        "lng",
        "lat",
    )

    session = DBSession()
    try:
        areas = session.query(MySql_Area).all()
        print(len(areas))

        items = []
        for row in areas:
            # Read from the instance dict (as the original did) so a missing
            # column yields None instead of raising.
            row_values = row.__dict__
            area = Area()
            for field in area_fields:
                setattr(area, field, row_values.get(field))
            items.append(area.__dict__)
    finally:
        # The original never released the session.
        session.close()

    # Skip the bulk insert for an empty result, as the other scrapers in
    # this file do.
    if items:
        Insert(items, "Spider_China_Areas")
def get_meishijie_shiliao_shicai_ji(cid=_cid, category_pinyin=""):
    """Scrape the "avoid" (禁忌) ingredients of one category page and store them.

    The page at ``base_url + str(cid)`` is parsed and every element with
    ``id="ji_more"`` is treated as a heading: its first CSS class becomes the
    ingredient type, its text the type name, its next sibling supplies the
    remark text and its third next sibling supplies the ingredient entries.
    Each entry with non-blank text becomes a ``meishijiie_shiliao_shicai``
    record; all records are bulk-inserted into ``Meishijie_shiliao_shicais``
    when at least one was found.

    Args:
        cid: Category id appended to ``base_url``.
        category_pinyin: Category name in pinyin, stored on every record and
            used in the log output.
    """
    print("获取cid为"+str(cid)+"的禁忌食材,catagory_pinyin:"+category_pinyin+"-----------开始------------")
    soup = BeautifulSoup(get_html_by_url(base_url + str(cid)))

    # NOTE(review): the remark text is never reset, so if a page has several
    # "ji_more" headings later records carry the remarks of earlier headings
    # as well. Kept as in the original -- confirm whether this is intended.
    shicai_remark = ''
    shiliao_shicais = []

    for heading in soup.findAll(id="ji_more"):
        shicai_type = heading["class"][0]
        shicai_type_name = heading.string
        remark_spans = heading.next_sibling
        ji_shicais = heading.next_sibling.next_sibling.next_sibling

        for remark_span in remark_spans:
            shicai_remark = shicai_remark + remark_span.string + "\n\t "

        for shicai_li in ji_shicais:
            cn_name = shicai_li.string
            # Skip children without a single text value and pure-whitespace
            # nodes.
            if cn_name is None or cn_name == "\n" or not str(cn_name).strip():
                continue

            shiliao_shicai = meishijiie_shiliao_shicai()
            shiliao_shicai.cid = cid
            shiliao_shicai.category_pinyin = category_pinyin
            shiliao_shicai.cnName = cn_name
            shiliao_shicai.remark = shicai_remark
            shiliao_shicai.type = shicai_type
            shiliao_shicai.type_name = shicai_type_name
            shiliao_shicai.url = shicai_base_url + shiliao_shicai.cnName

            try:
                spaced = s_pinyin.hanzi2pinyin_split(string=shiliao_shicai.cnName, split=' ')
                shiliao_shicai.pinyin = spaced.replace(' ', '')
            except Exception:
                # Best effort: a failed pinyin conversion must not drop the
                # record. Narrowed from a bare ``except`` so Ctrl-C and
                # SystemExit still stop the crawl.
                pass

            # When several <img> tags are present the last one wins.
            for img in shicai_li.select("img"):
                shiliao_shicai.img_url = img["src"]

            shiliao_shicais.append(shiliao_shicai.__dict__)

    if shiliao_shicais:
        Insert(shiliao_shicais, "Meishijie_shiliao_shicais")
    print("获取cid为" + str(cid) + "的禁忌食材,catagory_pinyin:" + category_pinyin + "-----------结束------------")
def get_cities():
    """Download the Eleme city list, add a geohash to each city and store it.

    The payload fetched from ``eleme_cities`` is expected to be a mapping of
    group key -> list of city dicts. For every group (in sorted key order)
    each city is turned into an ``ElemeCities_Item`` with a 12-character
    geohash, the group's records are appended to ``filepath`` as text, and
    the group is bulk-inserted into ``Spider_Eleme_Cities_WithGeoHash``.

    Raises:
        ValueError, SyntaxError: if the payload is neither valid JSON nor a
            plain Python literal.
    """
    import ast
    import json

    content = get_content_by_url(eleme_cities)
    data = content.decode("utf8", "ignore")

    # SECURITY: the original ran eval() on this downloaded text, which lets
    # the remote side execute arbitrary code (and breaks on JSON
    # true/false/null). Parse it as data instead; literal_eval is kept as a
    # fallback for payloads that are Python-literal shaped rather than JSON.
    try:
        obj = json.loads(data)
    except ValueError:
        obj = ast.literal_eval(data)

    group_keys = sorted(obj.keys())
    print(group_keys)

    for key in group_keys:
        item_list = obj[key]
        print(key)
        print(item_list)
        print("\n")

        cities = []
        for city in item_list:
            print(city)
            eleme_city = ElemeCities_Item()
            eleme_city.or_id = city["id"]
            eleme_city.meta = key
            eleme_city.abbr = city["abbr"]
            eleme_city.latitude = city["latitude"]
            eleme_city.longitude = city["longitude"]
            eleme_city.name = city["name"]
            eleme_city.pinyin = city["pinyin"]
            eleme_city.geohash = geohash.encode(eleme_city.latitude, eleme_city.longitude, 12)
            print("\n")

            record = eleme_city.__dict__
            print(record)
            cities.append(record)

        # Context manager so the handle is closed even if the write fails.
        with open(filepath, 'a') as dump_file:
            dump_file.write(str(cities))

        print(cities)
        Insert(cities, "Spider_Eleme_Cities_WithGeoHash")
def _scrape_dish_menu_page(cid, page_num, cai_menu_types_st, category_pinyin):
    """Scrape and store one dish-menu listing page; return its total page count."""
    print("CID 为:"+str(cid)+" st 为"+cai_menu_types_st+" pagenum 为 " +str(page_num) +" category :"+category_pinyin +"-------开始---------------")

    url = base_url+str(cid)+"&sortby=update&st="+cai_menu_types_st+"&page="+str(page_num)
    soup = BeautifulSoup(get_html_by_url(url))

    # The pager form text is reduced to the bare page count; it stays 0 when
    # the page has no pager.
    total_page = 0
    for pager in soup.select(".gopage form"):
        page_text = str(pager.get_text()).replace('页','').replace('共','').replace('到第','').replace(',','').strip()
        total_page = int(page_text)

    anchors = soup.select(".listtyle1_list .listtyle1 a")

    # The dish type label depends only on cid and st, so resolve it once per
    # page instead of once per dish (and not at all for an empty page).
    dish_type = None
    if anchors:
        cn_pre = ''
        pre_item = get_category_by_cid(cid)
        if pre_item is not None:
            cn_pre = pre_item["cnName"]
        dish_type = cn_pre + dish_types.get(cai_menu_types_st)

    dish_menu_list = []
    for anchor in anchors:
        dish_menu = meishijie_shiliao_dish_menu()
        dish_menu.link_url = anchor["href"]
        dish_menu.cid = cid
        dish_menu.dish_types_st = cai_menu_types_st
        dish_menu.dish_type = dish_type
        dish_menu.dish_cn = anchor["title"]
        dish_menu.img_url = anchor.find("img")["src"]

        for remark in anchor.select(".c2 li"):
            dish_menu.cooking_remark += remark.string + " \r\n"

        try:
            spaced = s_pinyin.hanzi2pinyin_split(string=dish_menu.dish_cn, split=' ')
            dish_menu.dish_pinyin = spaced.replace(' ', '')
        except Exception:
            # Best effort: keep the dish even when pinyin conversion fails.
            # Narrowed from a bare ``except`` so Ctrl-C and SystemExit still
            # stop the crawl.
            pass

        dish_menu.page_num = page_num
        dish_menu_list.append(dish_menu.__dict__)

    if dish_menu_list:
        Insert(dish_menu_list,"Mershijie_shiliao_dishmenus")

    print("CID 为:"+str(cid)+" st 为"+cai_menu_types_st+" pagenum 为 " +str(page_num) +" category :"+category_pinyin +"-------结束---------------")
    return total_page


def get_dish_menus(cid=_cid, page_num=1, cai_menu_types_st="3", category_pinyin=''):
    """Scrape dish menus for a category, from ``page_num`` to the last page.

    Each listing page is fetched from ``base_url`` with the given category id
    and ``st`` filter, its dishes are stored in ``Mershijie_shiliao_dishmenus``
    and the pager on that page decides whether another page follows.

    Pages are walked with a loop rather than one recursive call per page, so
    long listings cannot exhaust the call stack. ``category_pinyin`` is
    passed to every page (the recursive version dropped it after the first),
    and the end-of-page log line reports the page that was just processed.

    Args:
        cid: Category id appended to ``base_url``.
        page_num: First page to fetch.
        cai_menu_types_st: Value of the ``st`` query parameter; also the key
            looked up in ``dish_types``. Must be a string.
        category_pinyin: Category label used only in the log output.
    """
    current_page = page_num
    while True:
        total_page = _scrape_dish_menu_page(cid, current_page, cai_menu_types_st, category_pinyin)
        if current_page + 1 > total_page:
            break
        current_page += 1
def get_meishijie_categories(cid, category_pinyin='', category_cn=''):
    """Scrape one Meishijie category page, then crawl what hangs off it.

    The links matched by ``.listnav_dl_style1 .current a`` on the page at
    ``base_url + str(cid)`` are stored as ``meishijie_shiliao_parant_category``
    records in ``Meishijie_shiliao_Categories``. Afterwards the suitable and
    avoid ingredient lists and the dish menus for every entry of
    ``dish_types_st`` are crawled for the same ``cid``.

    The follow-up crawls use the ``category_pinyin`` / ``category_cn``
    arguments directly. The original read them back from the last record
    built in the loop, which raised ``NameError`` when no link matched.

    Args:
        cid: Category id appended to ``base_url``.
        category_pinyin: Parent category name in pinyin, stored on each record.
        category_cn: Parent category name in Chinese, stored on each record.
    """
    soup = BeautifulSoup(get_html_by_url(base_url + str(cid)))

    categories = []
    for link in soup.select(".listnav_dl_style1 .current a"):
        cn_name = link.string
        spaced = s_pinyin.hanzi2pinyin_split(string=cn_name, split=" ")

        category = meishijie_shiliao_parant_category()
        category.cid = cid
        category.category_pinyin = category_pinyin
        category.category_cn = category_cn
        category.cnName = cn_name
        category.pinyin = spaced.replace(' ', '')
        category.url = link["href"]
        categories.append(category.__dict__)

    print(categories)
    # Skip the bulk insert when nothing matched, as the other scrapers in
    # this file do.
    if categories:
        Insert(categories, collectionName='Meishijie_shiliao_Categories')

    # Ingredients of this category: suitable first, then the ones to avoid.
    get_meishijie_shiliao_shicai_yi(cid, category_pinyin)
    get_meishijie_shiliao_shicai_ji(cid, category_pinyin)

    # Dish menus for every configured dish type. The Chinese category name is
    # passed as ``category_pinyin``, exactly as the original did.
    for st in dish_types_st:
        get_dish_menus(cid, page_num=1, cai_menu_types_st=dish_types_st.get(st), category_pinyin=category_cn)