def get_page_content_str(self, url):
    time.sleep(1)
    try:
        print("Fetching " + url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
        }
        request = urllib.request.Request(url=url, headers=headers)
        m_fp = urllib.request.urlopen(request, timeout=500)
        html_str = m_fp.read().decode('utf-8')
        m_fp.close()
        return html_str
    except urllib.error.URLError as err:
        # logfile = open('test.log', 'a')
        # logfile.write("Error: {} \n in url : {}".format(err, url))
        # logfile.close()
        # print("error in {}.get_page_content_str".format(__name__))
        sql = "UPDATE fetch_list SET times = 0 WHERE url = '{}'".format(
            self._now_url)
        dao = Dao()
        dao.execute_dmls(sql)
        # if url[-3:] == "htm":
        #     time.sleep(120)
        #     return self.get_page_content_str(url)
        return None
    except Exception as err:
        print(err)
        sql = "UPDATE fetch_list SET times = 0 WHERE url = '{}'".format(
            self._now_url)
        dao = Dao()
        dao.execute_dmls(sql)
        return None
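# The UPDATE above splices the URL into the SQL via str.format(), so a URL
# containing a single quote breaks the statement (and is injectable). A
# minimal parameterized sketch, assuming the underlying driver is
# mysql.connector (Dao's internals are not shown in this module, so the
# helper and the connection values below are illustrative placeholders):
import mysql.connector

def reset_fetch_times(conn, url):
    # %s placeholders let the driver escape the value itself, so quotes in
    # the URL can neither break nor inject into the statement
    cursor = conn.cursor()
    cursor.execute("UPDATE fetch_list SET times = 0 WHERE url = %s", (url,))
    conn.commit()
    cursor.close()

# usage (placeholder credentials):
# conn = mysql.connector.connect(host="localhost", user="user",
#                                password="secret", database="ehdc")
# reset_fetch_times(conn, "http://example.com/page?t=it's")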
def _extract_data2(self, doc_str):
    doc = Pq(doc_str)
    a_list = doc(".place>ul>li>a")
    try:
        self._comcode_detail["province"] = doc(a_list[1]).text()
        self._comcode_detail["city"] = doc(a_list[2]).text()
    except IndexError:
        sql = "UPDATE fetch_list2 SET times = 0 WHERE url = '{}'".format(
            self._now_url)
        Dao.execute_dmls(sql)
    doc = Pq(doc_str)
    self._comcode_detail["area"] = doc('.content>ul>li>h1').text()
    doc = Pq(doc_str)
    tr_list = doc('.content>table>tr')
    for tr in tr_list:
        try:
            # time.sleep(1)
            td_list = doc(tr).find("td")
            self._comcode_detail["street"] = doc(td_list[0]).find("a").text()
            a_list = doc(td_list[1]).find("a")
            for a in a_list:
                self._comcode_detail["society_community"] = doc(a).text()
                self._save_comcode()
        except IndexError:
            print("error in " + doc(tr).text())
def save_apartments(COMMUNITY_ID, BUILDING_NUM, URL):
    # URL = 'http://www.szfcweb.com/szfcweb/(S(knmrwg452ea0mu55p2f5zi45))/DataSerach/SaleInfoHouseShow.aspx?PBTAB_ID=YFW003120_MD003&SPJ_ID=a5121bf5-f3af-451d-9e6c-01b1e33b2f7b'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
        'Referer': 'http://www.szfcweb.com/szfcweb/(S(knmrwg452ea0mu55p2f5zi45))/DataSerach/SaleInfoProListIndex.aspx'
    }
    dao = Dao()
    request = urllib.request.Request(url=URL, headers=headers)
    m_fp = urllib.request.urlopen(request, timeout=500)
    html_str = m_fp.read().decode("utf8")
    doc = Pq(html_str)
    try:
        table = doc("table.table_xkb")
        td_list = doc(table).find("div.lfzt>a")
        for td in td_list:
            APARTMENT_NUM = doc(td).text()
            insertSQL = "INSERT INTO apartments (COMMUNITY_ID, BUILDING_NUM, APARTMENT_NUM, STATUS, create_time)" \
                        " VALUES ('{}','{}','{}','{}','{}')".format(
                            COMMUNITY_ID, BUILDING_NUM, APARTMENT_NUM, 2,
                            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            dao.execute_dmls(insertSQL)
    except Exception as e:
        print(e)
    # mark the building as done
    update_sql = "update ehdc.buildings set status=2 where url = '{}' ;".format(URL)
    dao.execute_dmls(update_sql)
def _generate_seed_url(self):
    """ generate all urls to visit """
    # self._seed_url = "http://www.njhouse.com.cn/spf/inf/index.php?prjid=108510"
    # self._visit_pages(self._seed_url)
    # from page 1 to any page which < 200
    # seed from the database
    # self._seed_url = Dao._get_url_by_id(self.source_id)
    querysql = "SELECT COMMUNITY_ID, ORIGINAL_URL FROM communities WHERE source_id = '{}' and status < 2 ; ".format(
        self.source_id)
    result = Dao.execute_query(querysql)
    for COMMUNITY_ID, ORIGINAL_URL in result:
        try:
            self._apartment_detail["COMMUNITY_ID"] = int(COMMUNITY_ID)
            self._apartment_detail["create_time"] = time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            # print("_generate_seed_url func : " + ORIGINAL_URL)
            self._visit_pages(ORIGINAL_URL)
            sql = "update communities set status = '2' where COMMUNITY_ID = '{}' ".format(
                int(COMMUNITY_ID))
            Dao.execute_dmls(sql)
        except Exception as e:
            print(e)
            sql = "update communities set status = '-1' where COMMUNITY_ID = '{}' ".format(
                int(COMMUNITY_ID))
            Dao.execute_dmls(sql)
def _save_comcode(self):
    insert_sql = "INSERT INTO comcode (province, city, area, street, society_community)" \
                 " VALUES ('{}','{}','{}','{}','{}')".format(
                     self._comcode_detail["province"],
                     self._comcode_detail["city"],
                     self._comcode_detail["area"],
                     self._comcode_detail["street"],
                     self._comcode_detail["society_community"])
    Dao.execute_dmls(insert_sql)
def _save_community(self):
    # skip if the record already exists
    query_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}' and source_id = 18 ".format(
        self._community_detail["url"])
    if Dao.execute_query(query_sql) is not None:
        print("{} already exists, skipping".format(self._community_detail["name"]))
        return
    # insert the record
    Dao.execute_dmls(self._insert_community())
def getmerchants(lat, long, COMMUNITY_ID):
    dao = Dao()
    param = {
        'long': long,
        'lat': lat,
        'cat': '',
        'page': '1',
        'order': '1',
        'ondoor': '0',
        'type': 'nine'
    }
    cats = ['街道办', "居委会", '入学']
    for cat in cats:
        param['cat'] = cat
        url = "http://llzg.com/llzgmri/m/p/business/list?" + urllib.parse.urlencode(param)
        r = urllib.request.urlopen(url)
        rlt = json.loads(r.read().decode('UTF-8'))
        try:
            for merchant in rlt['business']:
                # map the API's long/lat onto the LONGITUDE/LATITUDE columns
                insertSql = '''INSERT INTO ehdc.merchant_llzg
                    (city_id, NAME, phone, area_name, location, description, url,
                     LONGITUDE, LATITUDE, source_id, service, display_name, logo, COMMUNITY_ID)
                    VALUES ('', '{}', '{}', '', '{}', '', '{}', '{}', '{}', '', '{}', '{}', '{}', '{}');'''.format(
                    merchant['business_name'], merchant['phone_number'],
                    merchant['address'], '', merchant['long'], merchant['lat'],
                    merchant['sub_title'], param['cat'], merchant['logo'],
                    COMMUNITY_ID)
                dao.execute_dmls(insertSql)
        except Exception:
            # some responses carry no 'business' list; skip them
            pass
def _save_merchant(self):
    # skip if the record already exists
    query_sql = "SELECT * FROM merchant WHERE url = '{}'".format(
        self._merchant_detail["url"])
    if Dao.execute_query(query_sql) is not None:
        print("{} already exists, skipping".format(self._merchant_detail["name"]))
        return
    # insert the record
    Dao.execute_dmls(self._insert_merchant())
def _save_community(self):
    # skip if the record already exists
    query_sql = "SELECT * FROM ehdc.communities_llzg WHERE NAME = '{}' and AREA_NAME = '{}' ".format(
        self._community_detail["name"], self._community_detail["area_name"])
    if Dao.execute_query(query_sql) is not None:
        print("{} already exists, skipping".format(self._community_detail["name"]))
        return
    # insert the record
    Dao.execute_dmls(self._insert_community())
def _visit_pages(self, seed_url):
    """ visit each url and fetch its page content """
    for single_url in seed_url:
        update_sql = "UPDATE fetch_list SET times = times+1 WHERE url = '{}' and source_id = 16".format(
            single_url[0])
        Dao.execute_dmls(update_sql)
        self._now_url = single_url[0]
        html = self.get_page_content_str(single_url[0])
        self._extract_data(html)
def _save_apartment(self, apartment_detail):
    # skip if the record already exists
    query_sql = "SELECT * FROM apartments WHERE COMMUNITY_ID = {} and FLOOR_NUM = '{}' and APARTMENT_NUM = '{}' ".format(
        int(apartment_detail["COMMUNITY_ID"]),
        apartment_detail["FLOOR_NUM"],
        apartment_detail["APARTMENT_NUM"])
    if Dao.execute_query(query_sql) is not None:
        print("{} already exists, skipping".format(
            str(apartment_detail["COMMUNITY_ID"]) +
            apartment_detail["FLOOR_NUM"] +
            apartment_detail["APARTMENT_NUM"]))
        return
    # insert the record
    try:
        Dao.execute_dmls(self._insert_apartment(apartment_detail))
    except Exception as e:
        print(e)
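# The SELECT-then-INSERT check above races when several crawler threads handle
# the same community: two threads can both see "not found" and insert twice.
# A sketch that pushes the dedup into the database instead, assuming a unique
# key over the three columns has been created (index name hypothetical):
#
#   ALTER TABLE apartments ADD UNIQUE KEY uk_apartment
#       (COMMUNITY_ID, FLOOR_NUM, APARTMENT_NUM);
#
# With that key in place, INSERT IGNORE silently skips duplicates; this keeps
# the file's Dao API and str.format() style for consistency:
def _save_apartment_atomic(self, apartment_detail):
    sql = ("INSERT IGNORE INTO apartments "
           "(COMMUNITY_ID, FLOOR_NUM, APARTMENT_NUM, STATUS, create_time) "
           "VALUES ('{}','{}','{}','{}','{}')").format(
        int(apartment_detail["COMMUNITY_ID"]),
        apartment_detail["FLOOR_NUM"],
        apartment_detail["APARTMENT_NUM"],
        apartment_detail["STATUS"],
        apartment_detail["create_time"])
    Dao.execute_dmls(sql)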
def _visit_pages(self, seed_url):
    """ visit each url and fetch its page content """
    for single_url in seed_url:
        update_sql = "UPDATE fetch_list SET times = times+1 WHERE url = '{}' and source_id = 17".format(
            single_url[0])
        Dao.execute_dmls(update_sql)
        self._base_url = single_url[0]
        self._now_url = single_url[0]
        html = self.get_page_content_str(single_url[0])
        try:
            self._extract_data(html)
        except Exception as e:
            print(e)
            update_sql = "UPDATE fetch_list SET status = 1 WHERE url = '{}' and source_id = 17".format(
                single_url[0])
            Dao.execute_dmls(update_sql)
def execute(self, COMMUNITY_ID, BUILDING_NUM, URL):
    try:
        apartment_detail = {
            'COMMUNITY_ID': 0,
            'BUILDING_NUM': '',
            'APARTMENT_NUM': '',
            'STATUS': '2',
            'create_time': ''
        }
        apartment_detail["COMMUNITY_ID"] = int(COMMUNITY_ID)
        apartment_detail["BUILDING_NUM"] = BUILDING_NUM
        apartment_detail["create_time"] = time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        self._visit_pages(URL, apartment_detail)
        sql = "update BUILDINGS set status = '2' where URL = '{}' ; ".format(URL)
        Dao.execute_dmls(sql)
    except Exception as e:
        print(e)
        sql = "update BUILDINGS set status = -1 where URL = '{}' ; ".format(URL)
        Dao.execute_dmls(sql)
def _extract_data(self, doc_str):
    doc = Pq(doc_str)
    self._comcode_detail["province"] = doc('.content>ul>li>h1').text()
    doc = Pq(doc_str)
    tr_list = doc('.content>table>tr')
    for tr in tr_list:
        try:
            # time.sleep(1)
            td_list = doc(tr).find("td")
            self._comcode_detail["city"] = doc(td_list[0]).find("a").text()
            a_list = doc(td_list[1]).find("a")
            for a in a_list:
                self._comcode_detail["area"] = doc(a).text()
                url = self._base_url + doc(a).attr("href")
                # html = self.get_page_content_str(url)
                # self._extract_data2(html)
                insert_sql = "INSERT INTO fetch_list2 (source_id, url, times, page, STATUS) VALUES (98,'{}',0,0,0)".format(url)
                print("insert sql is [" + insert_sql + "]")
                Dao.execute_dmls(insert_sql)
        except IndexError:
            print("error in " + doc(tr).text())
def _visit_pages(self, seed_url):
    """ visit each url and fetch its page content """
    for single_url in seed_url:
        # # get the html source
        # html = self.get_page_content_str(single_url)
        # # choose which extractor to run
        # self._extract_data(html)
        # dao = Dao()
        # insert_sql = "INSERT INTO fetch_list (source_id, url, times, page, STATUS) VALUES (99,'{}',0,0,0)".format(single_url)
        # dao.execute_dmls(insert_sql)
        dao = Dao()
        update_sql = "UPDATE fetch_list2 SET times = times+1 WHERE url = '{}' and source_id = 98 ".format(
            single_url[0])
        dao.execute_dmls(update_sql)
        self._now_url = single_url[0]
        html = self.get_page_content_str(single_url[0])
        self._extract_data2(html)
def _extract_data(self, url):
    community_id = self._save_community()
    doc_str = self.get_page_content_str(url)
    doc = Pq(doc_str)
    tr_list = doc("table>tr")
    try:
        for tr in tr_list:
            floor_num = Pq(tr)("td:eq(0)").text()
            a_list = doc(tr).find("td.preview>a")
            for a in a_list:
                apartment_detail = {
                    'COMMUNITY_ID': community_id,
                    'FLOOR_NUM': floor_num,
                    'APARTMENT_NUM': doc(a).text(),
                    'STATUS': '2',
                    'create_time': time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime(time.time()))
                }
                self._save_apartment(apartment_detail)
        sql = "update communities set status = '2' where ORIGINAL_URL = '{}' ; ".format(url)
        Dao.execute_dmls(sql)
    except Exception as e:
        print(e)
        sql = "update communities set status = -1 where ORIGINAL_URL = '{}' ; ".format(url)
        Dao.execute_dmls(sql)
import csv
import os

from DAO import Dao

filepath = "D:\\用户目录\\我的文档\\Tencent Files\\71913596\\FileRecv\\小区行政区街道A"
file_list = os.listdir(filepath)
for file in file_list:
    file = filepath + "\\" + file
    with open(file) as f:
        f_csv = csv.reader(f)
        headers = next(f_csv)
        body = next(f_csv)  # each file is expected to carry a single data row
        community_name = ''
        community_area = ''
        community_street = ''
        community_sc = ''
        for i in range(0, len(headers)):
            if headers[i] == "区":
                community_area = body[i]
            if headers[i] == "街道":
                community_street = body[i]
            if headers[i] == "社区":
                community_sc = body[i]
            if headers[i] == "小区":
                community_name = body[i]
        try:
            insertSQL = "INSERT INTO ehdc.shenzhen_community_to_street (community_name, community_area, community_street, community_sc) VALUES ('{}', '{}', '{}', '{}');".format(
                community_name, community_area, community_street, community_sc)
            dao = Dao()
            dao.execute_dmls(insertSQL)
        except Exception as e:
            print(e)
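# The loop above matches header names by index; csv.DictReader does the same
# lookup directly. A sketch, assuming each file carries exactly one data row
# (as the single next(f_csv) above implies) and is readable in the platform
# default encoding:
#
# with open(file) as f:
#     row = next(csv.DictReader(f))
#     community_area = row.get("区", "")
#     community_street = row.get("街道", "")
#     community_sc = row.get("社区", "")
#     community_name = row.get("小区", "")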
# query_sql = "SELECT url,location FROM merchant WHERE SOURCE_ID = 16 "
# result = Dao.execute_query(query_sql)
# for url, location in result:
#     bm = xBaiduMap()
#     print(location, url)
#     if location is not None:
#         zuobiao = bm.getLocation(location, "深圳")
#         print(zuobiao)
#         if zuobiao is None:
#             continue
#         LONGITUDE = zuobiao[1]
#         LATITUDE = zuobiao[0]
#         update_sql = "update merchant set LONGITUDE = '{}', LATITUDE = '{}' where url = '{}' and SOURCE_ID = 16 ".format(LONGITUDE, LATITUDE, url)
#         Dao.execute_dmls(update_sql)

query_sql = "SELECT id, BAIDU_LATI, BAIDU_LONG FROM job_beijing WHERE baidu_lati IS NOT NULL "
result = Dao.execute_query(query_sql)
for COMMUNITY_ID, LATITUDE, LONGITUDE in result:
    bm = xBaiduMap()
    if LATITUDE is not None:
        try:
            location = bm.getAddress(LONGITUDE, LATITUDE)
        except Exception as e:
            print(e)
            continue
        if location is None:
            continue
        update_sql = "update job_beijing set addr = '{}' where id = '{}' ".format(
            location, COMMUNITY_ID)
        Dao.execute_dmls(update_sql)
def save_category(self):
    for category in self._category_list:
        sql = "INSERT INTO category (shopType, categoryId, name) VALUES ('{}','{}','{}')".format(
            category["shopType"], category["categoryId"], category["name"])
        Dao.execute_dmls(sql)
def save_building(self, url):
    sql = "INSERT INTO ehdc.buildings (COMMUNITY_ID, BUILDING_NUM, URL, STATUS) VALUES ('{}','{}','{}','{}') ;".format(
        str(self._apartment_detail["COMMUNITY_ID"]),
        self._apartment_detail["BUILDING_NUM"], url,
        self._apartment_detail["STATUS"])
    Dao.execute_dmls(sql)
import urllib.request

from DAO import Dao
from pyquery import PyQuery as Pq

dao = Dao()
querySql = ("SELECT COMMUNITY_ID, ORIGINAL_URL, "
            "(SELECT COUNT(DISTINCT BUILDING_NUM) FROM apartments "
            "WHERE apartments.`COMMUNITY_ID` = communities.`COMMUNITY_ID`) "
            "FROM communities WHERE STATUS = -1 AND SOURCE_ID = 19")
result = dao.execute_query(querySql)
for communityId, url, count in result:
    print(url)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    request = urllib.request.Request(url=url, headers=headers)
    m_fp = urllib.request.urlopen(request, timeout=500)
    try:
        html = m_fp.read().decode('gb2312')
    except Exception as e:
        print(e)
        print(url + " failed to decode")
        continue
    pq = Pq(html)
    li = pq("div.st_tree>ul>li")
    pagecount = len(li)
    if pagecount != count:
        updateSql = "update communities set STATUS = -1 WHERE STATUS = 1 AND SOURCE_ID = 19 and COMMUNITY_ID = {}".format(
            communityId)
        dao.execute_dmls(updateSql)
    else:
        updateSql = "update communities set STATUS = 2 WHERE STATUS = 1 AND SOURCE_ID = 19 and COMMUNITY_ID = {}".format(
            communityId)
        dao.execute_dmls(updateSql)
class DianpingMerchantCrawler(threading.Thread):
    def __init__(self, LT1, LG1, LT2, LG2, cityname, cityid, cityenname,
                 name, shopId=0, categoryId=0):
        threading.Thread.__init__(self, name=name)
        self.cityid = cityid
        self.shopId = shopId
        self.categoryId = categoryId
        self.Lat1 = LT1
        self.Lat2 = LT2
        self.Long1 = LG1
        self.Long2 = LG2
        self.city_name = cityname
        self.values = {
            'promoId': '0',
            'shopType': '',
            'categoryId': '',
            'sortMode': '2',
            'shopSortItem': '1',
            'keyword': '',
            'searchType': '1',
            'branchGroupId': '0',
            'shippingTypeFilterValue': '0',
            'page': '1'
        }
        self.values["cityId"] = cityid
        self.values["cityEnName"] = cityenname
        self.url = "http://www.dianping.com/search/map/ajax/json"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
            'Referer': 'http://www.dianping.com/search/map/category/{}/0'.format(cityid)
        }
        self.dao = Dao()
        self.query_sql = "SELECT shopType, categoryId, NAME FROM category WHERE categoryId <> shopType AND categoryId <> 'None' "
        self.result = self.dao.execute_query(self.query_sql)
        self.query_sql2 = "SELECT shopId FROM shop_bean where city_name = '{}'".format(cityname)
        self.result2 = self.dao.execute_query(self.query_sql2)
        self.shopIds = []
        if self.result2 is not None:
            for shopid in self.result2:
                self.shopIds.append(shopid[0])

    def save_shop(self, shopRecordBean, categoryId):
        zuobiao = GoogleLatALng2Baidu(shopRecordBean["geoLng"], shopRecordBean["geoLat"])
        insert_sql = (
            "insert into shop_bean (address, poi, phoneNo, shopId, defaultPic, expand, shopName, geoLat, "
            "shopDealId, geoLng, addDate, shopPower, shopPowerTitle, avgPrice, memberCardId, "
            "bookingSetting, dishTag, branchUrl, promoId, hasSceneryOrder, shopRecordBean, regionList, "
            "categoryId, LATITUDE, LONGITUDE, city_name) "
            "values('{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}',\"{}\",'{}','{}',{},{},'{}')"
        ).format(
            shopRecordBean["address"], shopRecordBean["poi"], shopRecordBean["phoneNo"],
            shopRecordBean["shopId"], shopRecordBean["defaultPic"], shopRecordBean["expand"],
            shopRecordBean["shopName"], shopRecordBean["geoLat"], shopRecordBean["shopDealId"],
            shopRecordBean["geoLng"], shopRecordBean["addDate"], shopRecordBean["shopPower"],
            shopRecordBean["shopPowerTitle"], shopRecordBean["avgPrice"], shopRecordBean["memberCardId"],
            shopRecordBean["bookingSetting"], shopRecordBean["dishTag"], shopRecordBean["branchUrl"],
            shopRecordBean["promoId"], shopRecordBean["hasSceneryOrder"],
            str(shopRecordBean["shopRecordBean"]).replace('\'', '\\\'').replace('\"', '\\\"'),
            shopRecordBean["regionList"], categoryId,
            zuobiao["LATITUDE"], zuobiao["LONGITUDE"], self.city_name)
        self.dao.execute_dmls(insert_sql)

    def savePageJson(self, page=1):
        try:
            self.values["page"] = page
            print(self.values, "begin")
            self.data = urllib.parse.urlencode(self.values).encode(encoding='UTF8')
            request = urllib.request.Request(url=self.url, headers=self.headers, data=self.data)
            m_fp = urllib.request.urlopen(request, timeout=500)
            html_str = m_fp.read().decode('utf-8')
            m_fp.close()
            s = json.loads(html_str)
            shopRecordBeanList = s["shopRecordBeanList"]
            # skip merchants we have already saved
            for shopRecordBean in shopRecordBeanList:
                if str(shopRecordBean["shopId"]) not in self.shopIds:
                    self.save_shop(shopRecordBean, self.values["categoryId"])
                    self.shopIds.append(str(shopRecordBean["shopId"]))
            # print("one merchant")
            # for key in shopRecordBean:
            #     print(" {} = {}".format(key, shopRecordBean[key]))
            # for key in shopRecordBeanList[0]:
            #     print(" {} = {}".format(key, shopRecordBeanList[0][key]))
            return s
        except Exception as e:
            print(e)
            # retry until the page loads; note this recurses with no depth limit
            return self.savePageJson(page)

    def crawler_each_category(self, result):
        for shopType, categoryId, NAME in result:
            self.values["shopType"] = shopType
            self.values["categoryId"] = categoryId
            s = self.savePageJson(1)
            pageCount = s["pageCount"]
            for page in range(2, pageCount + 1):
                try:
                    self.savePageJson(page)
                except Exception as e:
                    print(e)
                    time.sleep(0.5)
                    self.savePageJson(page)

    def crawler_each_category_withzuobiao(self, result, Lat, Long):
        self.values["glat1"] = Lat
        self.values["glong1"] = Long
        self.values["glat2"] = Lat - 0.1
        self.values["glong2"] = Long + 0.2
        self.crawler_each_category(result)

    def run(self):
        if self.Lat1 == 0:
            if self.shopId == 0:
                self.crawler_each_category(self.result)
            elif self.categoryId == 0:
                self.crawler_each_category([[self.shopId, self.shopId, ""]])
            else:
                self.crawler_each_category([[self.shopId, self.categoryId, ""]])
        else:
            Lat = self.Lat1
            while Lat >= self.Lat2:
                Long = self.Long1
                while Long <= self.Long2:
                    self.crawler_each_category_withzuobiao(self.result, Lat, Long)
                    Long += 0.19
                Lat = Lat - 0.09
        # mark the city as finished
        update_sql = "update dianping_cities set status = 0 where cityId = {} ".format(self.cityid)
        self.dao.execute_dmls(update_sql)

    def stop(self):
        self.thread_stop = True