def _save_community(self):
    """Store the current community if it is new and return its COMMUNITY_ID.

    The community is identified by ``self._community_detail["url"]`` together
    with ``self.source_id``. When a matching row already exists its id is
    returned straight away; otherwise the statement(s) produced by
    ``self._insert_community()`` are executed and the id is read back.

    Returns:
        The first column of the first row returned by the id lookup.

    Raises:
        TypeError: if the row still cannot be found after the insert (the
            lookup result is then ``None`` and cannot be indexed).
    """
    # NOTE(review): the URL is interpolated directly into the SQL text; a URL
    # containing a quote breaks the statement. Switch to bound parameters if
    # Dao supports them.
    communityid_sql = "SELECT COMMUNITY_ID FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} ".format(
        self._community_detail["url"], self.source_id)

    # A single lookup answers both "does it exist?" and "what is its id?";
    # a result of None is treated as "no matching row".
    existing = Dao.execute_query(communityid_sql)
    if existing is not None:
        print(" {} is already exists ,so next".format(self._community_detail["name"]))
        return existing[0][0]

    # Not stored yet: insert, then read back the generated id.
    Dao.execute_dmls(self._insert_community())
    return Dao.execute_query(communityid_sql)[0][0]
def _generate_seed_url(self):
    """Visit every unfinished community of this source and record the outcome.

    Selects the communities of ``self.source_id`` whose ``status`` is below 2,
    and for each one:

    * stores its id and the current local timestamp in
      ``self._apartment_detail``,
    * crawls its ``ORIGINAL_URL`` through ``self._visit_pages``,
    * sets ``status`` to ``'2'`` on success, or to ``'-1'`` if anything in
      the crawl (or the success update) raised.

    Returns nothing. When the query yields no rows the method is a no-op.
    """
    # Seed URLs come from the database rather than from a hard-coded URL.
    querysql = "SELECT COMMUNITY_ID,ORIGINAL_URL FROM communities WHERE source_id ='{}' and status<2 ; ".format(
        self.source_id)
    result = Dao.execute_query(querysql)
    if not result:
        # Other callers treat None as "no rows"; iterating it would raise.
        return

    for COMMUNITY_ID, ORIGINAL_URL in result:
        community_id = int(COMMUNITY_ID)
        try:
            self._apartment_detail["COMMUNITY_ID"] = community_id
            # strftime without a time tuple formats the current local time.
            self._apartment_detail["create_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
            self._visit_pages(ORIGINAL_URL)
            sql = "update communities set status = '2' where COMMUNITY_ID = '{}' ".format(
                community_id)
            Dao.execute_dmls(sql)
        except Exception as e:
            # Best effort: log, flag the community as failed, move on.
            print(e)
            sql = "update communities set status = '-1' where COMMUNITY_ID = '{}' ".format(
                community_id)
            Dao.execute_dmls(sql)
def _check_community(self, url):
    """Tell whether a finished community row exists for ``url``.

    Args:
        url: value matched against ``communities.ORIGINAL_URL``.

    Returns:
        ``True`` when the lookup (restricted to ``self.source_id`` and
        ``status = 2``) returns something other than ``None``, else ``False``.
    """
    # NOTE(review): url is interpolated directly into the SQL text; prefer
    # bound parameters if Dao supports them.
    communityid_sql = "SELECT COMMUNITY_ID FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} and status = 2 ".format(
        url, self.source_id)
    # Identity test instead of ``== None``.
    return Dao.execute_query(communityid_sql) is not None
def get_communities():
    """Fetch a batch of buildings whose STATUS is 0.

    Returns:
        Whatever ``Dao.execute_query`` returns for the query; each row holds
        ``(COMMUNITY_ID, BUILDING_NUM, URL)``.
    """
    # NOTE(review): ``LIMIT 1,100`` is "offset 1, count 100" in MySQL syntax,
    # so the first matching row is skipped, and without ORDER BY the batch is
    # not deterministic. Confirm this is intended.
    pending_sql = '''SELECT DISTINCT COMMUNITY_ID, BUILDING_NUM, URL FROM ehdc.buildings WHERE STATUS = 0 LIMIT 1,100; '''
    return Dao().execute_query(pending_sql)
def _save_merchant(self):
    """Insert the current merchant unless a row with the same url exists.

    Reads ``url`` and ``name`` from ``self._merchant_detail``; the insert
    statement(s) come from ``self._insert_merchant()``. Returns nothing.
    """
    merchant = self._merchant_detail
    # NOTE(review): url is interpolated directly into the SQL text.
    lookup_sql = "SELECT * FROM merchant WHERE url = '{}'".format(merchant["url"])

    if Dao.execute_query(lookup_sql) is None:
        # No row yet: store the merchant.
        Dao.execute_dmls(self._insert_merchant())
        return

    # Already stored: report and skip.
    print(" {} is already exists ,so next".format(merchant["name"]))
def _save_community(self):
    """Insert the current community unless its URL is already stored.

    The duplicate check matches ``self._community_detail["url"]`` against
    ``communities.ORIGINAL_URL``. Returns nothing.
    """
    # NOTE(review): url is interpolated directly into the SQL text.
    lookup_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}'".format(
        self._community_detail["url"])
    is_new = Dao.execute_query(lookup_sql) is None

    if not is_new:
        print(" {} is already exists ,so next".format(
            self._community_detail["name"]))
    else:
        # Nothing stored for this URL yet: run the insert.
        Dao.execute_dmls(self._insert_community())
def _save_community(self):
    """Insert the current community unless the same name/area pair exists.

    The duplicate check runs against ``ehdc.communities_llzg`` using the
    ``name`` and ``area_name`` entries of ``self._community_detail``.
    Returns nothing.
    """
    detail = self._community_detail
    # NOTE(review): name and area_name are interpolated directly into the
    # SQL text.
    lookup_sql = "SELECT * FROM ehdc.communities_llzg WHERE NAME = '{}' and AREA_NAME='{}' ".format(
        detail["name"], detail["area_name"])
    found = Dao.execute_query(lookup_sql)

    if found is None:
        # Unknown community: store it.
        Dao.execute_dmls(self._insert_community())
        return

    print(" {} is already exists ,so next".format(detail["name"]))
def _save_apartment(self, apartment_detail):
    """Insert one apartment unless it is already stored.

    Args:
        apartment_detail: mapping that provides at least ``COMMUNITY_ID``,
            ``FLOOR_NUM`` and ``APARTMENT_NUM``; it is also handed to
            ``self._insert_apartment`` to build the insert statement(s).

    Returns nothing. A failing insert is printed and swallowed.
    """
    community_id = apartment_detail["COMMUNITY_ID"]
    community_key = int(community_id)
    floor_num = apartment_detail["FLOOR_NUM"]
    apartment_num = apartment_detail["APARTMENT_NUM"]

    # The community / floor / apartment triple identifies an apartment.
    lookup_sql = "SELECT * FROM apartments WHERE COMMUNITY_ID = {} and FLOOR_NUM ='{}' and APARTMENT_NUM ='{}' ".format(
        community_key, floor_num, apartment_num)

    already_stored = Dao.execute_query(lookup_sql) is not None
    if already_stored:
        label = str(community_id) + floor_num + apartment_num
        print(" {} is already exists ,so next".format(label))
        return

    # New apartment: insert it, logging (not raising) on failure.
    try:
        Dao.execute_dmls(self._insert_apartment(apartment_detail))
    except Exception as err:
        print(err)
def _generate_seed_url(self):
    """Run ``self.execute`` for every building whose STATUS is 0.

    Each row of ``ehdc.buildings`` with ``STATUS = 0`` is passed on as
    ``self.execute(COMMUNITY_ID, BUILDING_NUM, URL)``. Returns nothing.
    When the query yields no rows the method is a no-op.
    """
    # Seed URLs come from the database rather than from a hard-coded URL.
    querysql = "SELECT COMMUNITY_ID, BUILDING_NUM, URL FROM ehdc.buildings WHERE STATUS = 0 ; "
    result = Dao.execute_query(querysql)
    if not result:
        # Other callers treat None as "no rows"; iterating it would raise.
        return

    for community_id, building_num, url in result:
        self.execute(community_id, building_num, url)
display_name,logo,COMMUNITY_ID) VALUES ('', '{}', '{}', '', '{}', '', '{}', '{}', '{}', '', '{}', '{}', '{}', '{}');'''.format(merchant['business_name'], merchant['phone_number'], merchant['address'], '', merchant['lat'], merchant['long'], merchant['sub_title'], param['cat'], merchant['logo'], COMMUNITY_ID) dao.execute_dmls(insertSql) except: pass if __name__ == '__main__': querySql = 'SELECT a.name ,a.COMMUNITY_ID ,b.LATITUDE,b.LONGITUDE FROM shengchan_20140815.communities a ,shengchan_20140815.community_poses b WHERE a.COMMUNITY_ID =b.COMMUNITY_ID AND a.AREA_ID <11 AND a.AREA_ID > 0 ' dao = Dao() result = dao.execute_query(querySql) for name, COMMUNITY_ID, LATITUDE, LONGITUDE in result: getmerchants(LATITUDE, LONGITUDE, COMMUNITY_ID)
# NOTE: from here on the name ``Dao`` refers to an instance, not the class.
Dao = Dao()

# An earlier, disabled variant of this script worked on the ``merchant`` table
# (SOURCE_ID = 16): it looked up coordinates from each row's ``location`` with
# ``xBaiduMap.getLocation`` and wrote LONGITUDE / LATITUDE back by url.

# Reverse-geocode every job_beijing row that has Baidu coordinates and store
# the resulting address in its ``addr`` column.
query_sql = " SELECT id, BAIDU_LATI , BAIDU_LONG FROM job_beijing WHERE baidu_lati IS NOT NULL "
result = Dao.execute_query(query_sql)

# A None result is treated as "no rows" so the loop is simply skipped.
# COMMUNITY_ID holds job_beijing.id here.
for COMMUNITY_ID, LATITUDE, LONGITUDE in result or ():
    bm = xBaiduMap()
    if LATITUDE is None:
        continue
    try:
        location = bm.getAddress(LONGITUDE, LATITUDE)
    except Exception as e:
        # Best effort: log the failed lookup and go on with the next row.
        print(e)
        continue
    if location is None:
        continue
    # NOTE(review): location is interpolated directly into the SQL text; an
    # address containing a quote breaks the statement. Prefer bound
    # parameters if Dao supports them.
    update_sql = "update job_beijing set addr = '{}' where id = '{}' ".format(
        location, COMMUNITY_ID)
    Dao.execute_dmls(update_sql)
class DianpingMerchantCrawler(threading.Thread):
    """Thread that pages through Dianping's map-search JSON and stores shops.

    For every (shopType, categoryId) pair the crawler posts the search form
    in ``self.values`` to ``self.url``, walks all result pages and inserts
    each shop it has not seen before into ``shop_bean``.

    Modes, selected by the constructor arguments (see ``run``):

    * ``LT1 == 0``: crawl without a bounding box, either for all categories
      loaded from the ``category`` table or for one shopId / categoryId.
    * otherwise: sweep the rectangle from (LT1, LG1) to (LT2, LG2), issuing
      one crawl per grid cell.
    """

    def __init__(self, LT1, LG1, LT2, LG2, cityname, cityid, cityenname, name, shopId=0, categoryId=0):
        """Set up the request template and preload categories and known shops.

        Args:
            LT1, LG1: latitude / longitude where the sweep starts (``LT1 == 0``
                disables the sweep).
            LT2, LG2: latitude / longitude where the sweep ends.
            cityname: city name stored with each shop and used to preload the
                shop ids already saved for that city.
            cityid: Dianping city id, sent as ``cityId`` and used in the
                Referer header.
            cityenname: sent as ``cityEnName``.
            name: thread name.
            shopId, categoryId: optional single shop type / category to crawl.
        """
        threading.Thread.__init__(self, name=name)
        # Cooperative stop flag; set by stop() and checked between requests.
        self.thread_stop = False
        self.cityid = cityid
        self.shopId = shopId
        self.categoryId = categoryId
        self.Lat1 = LT1
        self.Lat2 = LT2
        self.Long1 = LG1
        self.Long2 = LG2
        self.city_name = cityname
        # Form fields posted with every request; shopType, categoryId and
        # page are overwritten while crawling.
        self.values = {
            'promoId': '0',
            'shopType': '',
            'categoryId': '',
            'sortMode': '2',
            'shopSortItem': '1',
            'keyword': '',
            'searchType': '1',
            'branchGroupId': '0',
            'shippingTypeFilterValue': '0',
            'page': '1',
            'cityId': cityid,
            'cityEnName': cityenname,
        }
        self.url = "http://www.dianping.com/search/map/ajax/json"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
            'Referer': 'http://www.dianping.com/search/map/category/{}/0'.format(cityid)
        }
        self.dao = Dao()
        self.query_sql = "SELECT shopType ,categoryId,NAME FROM category WHERE categoryId <> shopType AND categoryId <>'None' "
        self.result = self.dao.execute_query(self.query_sql)
        self.query_sql2 = "SELECT shopId FROM shop_bean where city_name ='{}'".format(
            cityname)
        self.result2 = self.dao.execute_query(self.query_sql2)
        # Ids are kept as strings because savePageJson compares against
        # str(shopId); storing the raw DB value could defeat the
        # duplicate check if the column is numeric.
        self.shopIds = [str(row[0]) for row in self.result2 or ()]

    def save_shop(self, shopRecordBean, categoryId):
        """Insert one shop record into ``shop_bean``.

        Args:
            shopRecordBean: one element of the response's
                ``shopRecordBeanList``; the keys read are listed in the
                statement below.
            categoryId: category the shop was found under.
        """
        # Coordinates are converted by the project helper before storing.
        zuobiao = GoogleLatALng2Baidu(shopRecordBean["geoLng"], shopRecordBean["geoLat"])
        # NOTE(review): the statement is assembled by string formatting from
        # scraped data; apart from the shopRecordBean column nothing is
        # escaped, so a quote in e.g. shopName breaks the INSERT. Prefer
        # bound parameters if Dao supports them.
        insert_sql = "insert into shop_bean (address ,poi ,phoneNo ,shopId ,defaultPic,expand ,shopName,geoLat ,shopDealId,geoLng ,addDate ,shopPower ,shopPowerTitle ,avgPrice,memberCardId ," \
                     "bookingSetting ,dishTag ,branchUrl ,promoId ,hasSceneryOrder ,shopRecordBean ,regionList ,categoryId ,LATITUDE , LONGITUDE,city_name ) " \
                     "values( '{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}',\"{}\",'{}','{}',{},{},'{}')".format(
            shopRecordBean["address"], shopRecordBean["poi"], shopRecordBean["phoneNo"],
            shopRecordBean["shopId"], shopRecordBean["defaultPic"], shopRecordBean["expand"],
            shopRecordBean["shopName"], shopRecordBean["geoLat"], shopRecordBean["shopDealId"],
            shopRecordBean["geoLng"], shopRecordBean["addDate"], shopRecordBean["shopPower"],
            shopRecordBean["shopPowerTitle"], shopRecordBean["avgPrice"], shopRecordBean["memberCardId"],
            shopRecordBean["bookingSetting"], shopRecordBean["dishTag"], shopRecordBean["branchUrl"],
            shopRecordBean["promoId"], shopRecordBean["hasSceneryOrder"],
            str(shopRecordBean["shopRecordBean"]).replace('\'', '\\\'').replace('\"', '\\\"'),
            shopRecordBean["regionList"], categoryId, zuobiao["LATITUDE"], zuobiao["LONGITUDE"],
            self.city_name)
        self.dao.execute_dmls(insert_sql)

    def savePageJson(self, page=1, max_retries=5):
        """Fetch one result page and store every shop not seen before.

        Args:
            page: 1-based page number to request.
            max_retries: how many times the whole page is attempted before
                giving up (values below 1 count as 1).

        Returns:
            The decoded JSON response.

        Raises:
            Exception: whatever the last attempt raised, once all attempts
                have failed.
        """
        self.values["page"] = page
        attempts = max(1, max_retries)
        for attempt in range(1, attempts + 1):
            try:
                print(self.values, "begin")
                self.data = urllib.parse.urlencode(
                    self.values).encode(encoding='UTF8')
                request = urllib.request.Request(url=self.url, headers=self.headers, data=self.data)
                # The with-block closes the response even when read() fails.
                with urllib.request.urlopen(request, timeout=500) as m_fp:
                    html_str = m_fp.read().decode('utf-8')
                s = json.loads(html_str)
                # Skip shops that were already stored.
                for shopRecordBean in s["shopRecordBeanList"]:
                    shop_id = str(shopRecordBean["shopId"])
                    if shop_id not in self.shopIds:
                        self.save_shop(shopRecordBean, self.values["categoryId"])
                        self.shopIds.append(shop_id)
                return s
            except Exception as e:
                print(e)
                if attempt == attempts:
                    # Bounded retry: unlimited recursion on a persistent
                    # failure would exhaust the interpreter stack.
                    raise
                # Short, growing pause before the next attempt.
                time.sleep(0.5 * attempt)

    def crawler_each_category(self, result):
        """Crawl all pages of every (shopType, categoryId, NAME) row.

        Args:
            result: iterable of 3-item rows; ``None`` is treated as empty.

        A category whose first page cannot be fetched, or a later page that
        cannot be fetched, is logged and skipped. Returns early once
        ``stop()`` has been called.
        """
        for shopType, categoryId, NAME in result or ():
            if self.thread_stop:
                return
            self.values["shopType"] = shopType
            self.values["categoryId"] = categoryId
            try:
                pageCount = self.savePageJson(1)["pageCount"]
            except Exception as e:
                # Without the first page the page count is unknown.
                print(e)
                continue
            for page in range(2, pageCount + 1):
                if self.thread_stop:
                    return
                try:
                    self.savePageJson(page)
                except Exception as e:
                    # savePageJson has already retried; move on.
                    print(e)

    def crawler_each_category_withzuobiao(self, result, Lat, Long):
        """Crawl ``result`` restricted to one grid cell.

        The cell spans from (Lat, Long) to (Lat - 0.1, Long + 0.2); the
        bounds are sent as glat1 / glong1 / glat2 / glong2.
        """
        self.values["glat1"] = Lat
        self.values["glong1"] = Long
        self.values["glat2"] = Lat - 0.1
        self.values["glong2"] = Long + 0.2
        self.crawler_each_category(result)

    def run(self):
        """Thread entry point: crawl by category or sweep the bounding box."""
        if self.Lat1 == 0:
            if self.shopId == 0:
                # No filter: every category loaded in __init__.
                self.crawler_each_category(self.result)
            elif self.categoryId == 0:
                # Only a shop type was given: use it as the category too.
                self.crawler_each_category([[self.shopId, self.shopId, ""]])
            else:
                self.crawler_each_category([[self.shopId, self.categoryId, ""]])
        else:
            # Steps (0.09 / 0.19) are slightly smaller than the cell size
            # (0.1 / 0.2), so neighbouring cells overlap a little.
            Lat = self.Lat1
            while Lat >= self.Lat2 and not self.thread_stop:
                Long = self.Long1
                while Long <= self.Long2 and not self.thread_stop:
                    self.crawler_each_category_withzuobiao(
                        self.result, Lat, Long)
                    Long += 0.19
                Lat = Lat - 0.09
        # NOTE(review): a statement "update dianping_cities set status = 0
        # where cityId = <cityid>" used to be built here but was never
        # executed; confirm whether the city status should be written back.

    def stop(self):
        """Ask the crawler to finish; honoured between requests."""
        self.thread_stop = True