"请输入爬取场景类型编号(回车查询全部):\n['0-汽车服务', '1-汽车销售', '2-汽车维修', '3-摩托车服务', '4-餐饮服务', '5-购物服务', \n " "'6-生活服务', '7-体育休闲服务', '8-医疗保健服务', '9-住宿服务', '10-风景名胜', '11-商务住宅', '12-政府机构及社会团体', \n " "'13-科教文化服务', '14-交通设施服务', '15-金融保险服务', '16-公司企业', '17-道路附属设施', '18-公共设施', '19-地名地址信息']\n" ) city = city_list[int(cityIndex)] city_id = city_id_list[int(cityIndex)] sqlparam = "" if typeIndex != '': poi_type = poi_type_list[int(typeIndex)] sqlparam = " and ptype = '" + poi_type + "'" else: poi_type = "全部" print("查询" + city + "的" + poi_type + "数据") #查询对应city的grid1600数据 ora = cxOracle(dbname, dbpass, dbaddr) # 先删除异常表中爬取的ID的数据 print("删除异常表中的ID爬取的数据") #ora.Exec("DELETE PY_POI_POINT WHERE CITY='"+city_id+"' "+sqlparam+" AND ID IN(SELECT ID FROM PY_POI_EXCEPTION WHERE CITY ='"+city_id+"' "+sqlparam+") ") #ora.Exec("DELETE PY_POI_EXCEPTION WHERE CITY ='"+city_id+"' "+sqlparam+" ") # 查询栅格表 rs = ora.Query( "SELECT city, gridid, ptype, id, name FROM PY_POI WHERE CITY ='" + city_id + "' " + sqlparam + " " "AND ID NOT IN (SELECT ID FROM PY_POI_POINT WHERE CITY ='" + city_id + "' " + sqlparam + ") AND ISSEL IS NULL " "ORDER BY GRIDID ASC ") proxy_ip = get_proxy()
def _next_proxy():
    """Return a proxy from the pool; raise NameError when the pool yields None."""
    proxy_ip = get_proxy()
    if proxy_ip is None:
        raise NameError('代理池返回ip为空')
    return proxy_ip


def spider(startIndex, endIndex):
    """Crawl boundary points for one row-range slice of the pending POI rows.

    Reads the module-level names ``city``, ``city_id``, ``poi_type``,
    ``dbname``, ``dbpass`` and ``dbaddr``. Selects rows from PY_POI for the
    current city/type that have no entry in PY_POI_POINT yet, restricted to
    row numbers ``startIndex``..``endIndex`` of the GRIDID-ordered result.
    For each row it fetches the boundary via ``getBounById2``, inserts the
    points into py_poi_point when the reported area and the computed area
    differ by less than 2000, and finally flags the row in PY_POI.

    Args:
        startIndex: First row number (inclusive) of the slice. Strings and
            ints are both accepted; the value is embedded in the SQL text.
        endIndex: Last row number (inclusive) of the slice, same typing.

    Raises:
        NameError: When the proxy pool returns no proxy.

    NOTE(review): a fetch that keeps failing is retried indefinitely with a
    fresh proxy each time, and every failure is logged to PY_POI_EXCEPTION.
    NOTE(review): SQL is assembled by string concatenation; the values come
    from module globals and database rows — confirm they are trusted.
    """
    start_text = str(startIndex)
    end_text = str(endIndex)

    sqlparam = " and ptype = '" + poi_type + "'"
    sqlstart = " and tt.rowno >=" + start_text
    sqlend = " and rownum <=" + end_text
    print("查询" + city + "的" + poi_type + "数据")

    # Load the pending POI rows for this city from the grid table.
    ora = cxOracle(dbname, dbpass, dbaddr)
    rs = ora.Query(
        "select tt.* from ("
        "select t.*,rownum as rowno from("
        "SELECT city, gridid, ptype, id, name FROM PY_POI WHERE CITY ='" + city_id + "' " + sqlparam + " "
        "AND ID NOT IN (SELECT ID FROM PY_POI_POINT WHERE CITY ='" + city_id + "' " + sqlparam + ") AND (ISSEL IS NULL or ISBONS ='1')"
        "ORDER BY GRIDID ASC)t where 1=1 " + sqlend + ")tt where 1=1 " + sqlstart)

    proxy_ip = _next_proxy()

    for row in rs:
        grid_id = str(row[1])
        poi_id = str(row[3])
        print("GRID=" + grid_id + ",ID=" + poi_id)

        # Keep retrying this POI (with a new proxy) until a fetch succeeds.
        while True:
            datas = ""
            area = 0
            try:
                bouns = getBounById2(poi_id, proxy_ip)
                print("bouns=" + str(bouns))

                points = [bound for bound in bouns if bound is not None]
                for bound in points:
                    # Each entry carries the area; the last one wins.
                    area = bound[2]
                    datas += str(bound[0]) + "," + str(bound[1]) + ";"
                print("area=" + str(area) + ",datas=" + str(datas))

                # Strip the trailing ';' before handing the ring to ComputeArea.
                computed_area = ComputeArea(datas[:-1])
                print("ComputeArea=" + str(computed_area))

                # Only persist the points when reported and computed areas agree.
                if abs(area - computed_area) < 2000:
                    for bound in points:
                        lon = bound[0]
                        lat = bound[1]
                        lon1, lat1 = gcj02towgs84(float(lon), float(lat))
                        sql2 = ("insert into py_poi_point values('" + city_id + "','" + grid_id
                                + "','" + poi_type + "','" + poi_id + "'," + str(lon) + ","
                                + str(lat) + "," + str(lon1) + "," + str(lat1) + ")")
                        ora.Exec(sql2)
                break
            except Exception as e:
                # Single quotes would terminate the SQL string literal below.
                err_text = str(e).replace('\'', '`')
                print(err_text)
                sql3 = ("insert into PY_POI_EXCEPTION values('" + city_id + "','" + grid_id
                        + "','" + poi_type + "','" + poi_id + "','" + err_text + "',sysdate)")
                ora.Exec(sql3)
                # Switch to another proxy before the next attempt.
                proxy_ip = _next_proxy()

        # Mark the POI as queried; ISBONS records whether an area was found.
        if area == 0:
            sql4 = "UPDATE PY_POI SET ISSEL = '1',ISBONS = '0' WHERE ID = '" + poi_id + "' "
        else:
            sql4 = "UPDATE PY_POI SET ISSEL = '1',ISBONS = '1' WHERE ID = '" + poi_id + "' "
        ora.Exec(sql4)
        time.sleep(random.randint(1, 3))

    print(city + ":" + poi_type + "的" + start_text + "-" + end_text + "的数据爬取完成")