def _extract_data2(self, doc_str):
    """Parse a district page: read province/city from the breadcrumb,
    the area heading, then each street row and its community links,
    saving one comcode record per community via _save_comcode().
    """
    doc = Pq(doc_str)
    a_list = doc(".place>ul>li>a")
    try:
        # breadcrumb links: index 1 = province, index 2 = city
        self._comcode_detail["province"] = doc(a_list[1]).text()
        self._comcode_detail["city"] = doc(a_list[2]).text()
    except IndexError as er:
        # breadcrumb missing -> reset the retry counter so this URL is re-queued
        sql = " UPDATE fetch_list2 SET times = 0 WHERE url = '{}'".format(
            self._now_url)
        Dao.execute_dmls(sql)
    doc = Pq(doc_str)
    self._comcode_detail["area"] = doc('.content>ul>li>h1').text()
    doc = Pq(doc_str)
    tr_list = doc('.content>table>tr')
    for tr in tr_list:
        try:
            # time.sleep(1)
            td_list = doc(tr).find("td")
            # first cell holds the street link, second cell the community links
            self._comcode_detail["street"] = doc(
                td_list[0]).find("a").text()
            a_list = doc(td_list[1]).find("a")
            for a in a_list:
                self._comcode_detail["society_community"] = doc(a).text()
                self._save_comcode()
        except IndexError as er:
            # malformed row (fewer than 2 cells) — skip it but keep going
            print("error in " + doc(tr).text())
def save_apartments(COMMUNITY_ID, BUILDING_NUM, URL):
    """Fetch the sale-info page at URL, insert one `apartments` row per
    apartment number found, then mark the building as crawled.

    Fixes vs. original: the headers dict was built but never attached to the
    request; the exception handler printed the Exception *class* instead of
    the caught error; the final buildings update was built but never executed
    and lacked quotes around URL; the HTTP response was never closed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
        'Referer': 'http://www.szfcweb.com/szfcweb/(S(knmrwg452ea0mu55p2f5zi45))/DataSerach/SaleInfoProListIndex.aspx'
    }
    dao = Dao()
    request = urllib.request.Request(url=URL, headers=headers)
    m_fp = urllib.request.urlopen(request, timeout=500)
    try:
        html_str = m_fp.read().decode("utf8")
    finally:
        m_fp.close()
    doc = Pq(html_str)
    try:
        table = doc("table.table_xkb")
        td_list = doc(table).find("div.lfzt>a")
        for td in td_list:
            APARTMENT_NUM = doc(td).text()
            insertSQL = "INSERT INTO apartments (COMMUNITY_ID , BUILDING_NUM , APARTMENT_NUM ,STATUS ,create_time )" \
                " VALUES ('{}','{}','{}','{}','{}' )".format(
                    COMMUNITY_ID, BUILDING_NUM, APARTMENT_NUM, 2,
                    time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time())))
            dao.execute_dmls(insertSQL)
    except Exception as e:
        # best-effort insert loop; log the actual error, not the class
        print(e)
    # mark the building done — was dead code (built, never executed) before
    update_sql = "update ehdc.buildings set status=2 where url = '{}' ;".format(
        URL)
    dao.execute_dmls(update_sql)
def _generate_seed_url(self): """ generate all url to visit """ # self._seed_url = "http://www.njhouse.com.cn/spf/inf/index.php?prjid=108510" # self._visit_pages(self._seed_url) # from page 1 to anypage which < 200 # # 从数据库添加 # self._seed_url = Dao._get_url_by_id(self.source_id) querysql = "SELECT COMMUNITY_ID,ORIGINAL_URL FROM communities WHERE source_id ='{}' and status<2 ; ".format( self.source_id) result = Dao.execute_query(querysql) for COMMUNITY_ID, ORIGINAL_URL in result: try: self._apartment_detail["COMMUNITY_ID"] = int(COMMUNITY_ID) self._apartment_detail["create_time"] = time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print("_generate_seed_url func : "+ORIGINAL_URL) self._visit_pages(ORIGINAL_URL) sql = "update communities set status = '2' where COMMUNITY_ID = '{}' ".format( int(COMMUNITY_ID)) Dao.execute_dmls(sql) except Exception as e: print(e) sql = "update communities set status = '-1' where COMMUNITY_ID = '{}' ".format( int(COMMUNITY_ID)) Dao.execute_dmls(sql)
def _save_comcode(self):
    """Persist the currently collected comcode record (province/city/area/
    street/community) into the comcode table."""
    detail = self._comcode_detail
    insert_sql = (
        "INSERT INTO comcode (province ,city,area,street,society_community )"
        " VALUES ('{}','{}','{}','{}','{}' )"
    ).format(detail["province"], detail["city"], detail["area"],
             detail["street"], detail["society_community"])
    Dao.execute_dmls(insert_sql)
def _save_community(self):
    """Insert the current community unless a row with the same URL
    (source_id 18) already exists."""
    # duplicate check first
    query_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}' and source_id = 18 ".format(self._community_detail["url"])
    existing = Dao.execute_query(query_sql)
    if existing is not None:
        print(" {} is already exists ,so next".format(self._community_detail["name"]))
        return
    # not seen before -> insert
    Dao.execute_dmls(self._insert_community())
def _generate_seed_url(self): """ generate all url to visit """ # 从数据库添加 # from page 1 to anypage which < 200 global Dao Dao = Dao() self._seed_url = Dao._get_url_by_id(self.source_id)
def get_communities():
    """Return up to 100 pending building rows (STATUS = 0) as
    (COMMUNITY_ID, BUILDING_NUM, URL) tuples."""
    sql = '''SELECT DISTINCT COMMUNITY_ID, BUILDING_NUM, URL FROM ehdc.buildings WHERE STATUS = 0 LIMIT 1,100; '''
    return Dao().execute_query(sql)
def getmerchants(lat, long, COMMUNITY_ID):
    """Query llzg.com for merchants near (lat, long) in each of three fixed
    categories and insert every result into ehdc.merchant_llzg, tagged with
    COMMUNITY_ID.

    Fix vs. original: the bare `except: pass` silently swallowed *all*
    errors (including KeyboardInterrupt); narrowed to Exception and logged,
    keeping the best-effort per-category semantics.

    NOTE(review): the INSERT maps merchant['lat'] into LONGITUDE and
    merchant['long'] into LATITUDE — looks swapped; confirm against the
    llzg API field semantics before changing.
    """
    dao = Dao()
    param = {
        'long': long,
        'lat': lat,
        'cat': '',
        'page': '1',
        'order': '1',
        'ondoor': '0',
        'type': 'nine'
    }
    cats = ['街道办', "居委会", '入学']
    for cat in cats:
        param['cat'] = cat
        url = "http://llzg.com/llzgmri/m/p/business/list?" + urllib.parse.urlencode(
            param)
        r = urllib.request.urlopen(url)
        rlt = json.loads(r.read().decode('UTF-8'))
        try:
            for merchant in rlt['business']:
                insertSql = '''INSERT INTO ehdc.merchant_llzg (city_id, NAME, phone, area_name, location, description, url, LONGITUDE, LATITUDE, source_id, service, display_name,logo,COMMUNITY_ID) VALUES ('', '{}', '{}', '', '{}', '', '{}', '{}', '{}', '', '{}', '{}', '{}', '{}');'''.format(
                    merchant['business_name'], merchant['phone_number'],
                    merchant['address'], '', merchant['lat'],
                    merchant['long'], merchant['sub_title'], param['cat'],
                    merchant['logo'], COMMUNITY_ID)
                dao.execute_dmls(insertSql)
        except Exception as e:
            # best effort per category: log and continue with the next one
            print(e)
def _save_merchant(self):
    """Insert the current merchant unless its url is already present in
    the merchant table."""
    detail = self._merchant_detail
    # duplicate check keyed on url
    dup_sql = "SELECT * FROM merchant WHERE url = '{}'".format(
        detail["url"])
    if Dao.execute_query(dup_sql) is not None:
        print(" {} is already exists ,so next".format(
            detail["name"]))
        return
    # fresh merchant -> insert
    Dao.execute_dmls(self._insert_merchant())
def convert_to_db(filename, db_filename):
    """Convert spreadsheet `filename` into a database `db_filename`.

    Reads the worksheet, prompts the user for a table name, creates the
    table and one column per sheet header, then bulk-inserts all values.

    Fix vs. original: `raw_input` is Python 2 only and raises NameError on
    Python 3 (which the rest of this codebase targets) — replaced with the
    built-in `input`.
    """
    click.echo("Converting \"{}\" into a database \"{}\"".format(
        filename, db_filename))
    worksheet = get_spreadsheet(filename)
    click.echo("Creating database file...")
    new_dao = Dao(db_filename)
    conn = new_dao.create_connection()
    if conn is not None:
        table_name = input("Give name for the database table: ")
        click.echo()
        columns = get_headers(worksheet)
        values = get_values(worksheet)
        new_dao.create_table(table_name)
        for column in columns:
            new_dao.create_column(table_name, column)
        column_titles = ', '.join(columns)
        new_dao.insert_values(table_name, column_titles, values)
        conn.close()
        click.echo("Database '{}' created.".format(db_filename))
def mining(self, stamp_start, stamp_finish):
    """Scrape twitter search results for self._hastag over successive
    STEPP_TIMESTAMP windows in [stamp_start, stamp_finish], storing each
    tweet id + timestamp into the `manager` table.

    Fix vs. original: the ChromeOptions carrying 'headless' were created
    but never handed to webdriver.Chrome, so the flag was silently ignored
    and a browser window always opened.
    """
    stamp_start, stamp_finish = int(stamp_start) + 1, int(stamp_finish)
    cont = 5
    pivo = 5
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    # FIX: actually pass the options to the driver
    self._driver = webdriver.Chrome(chrome_options=options)
    while stamp_start < stamp_finish:
        url = "https://twitter.com/search?f=tweets&vertical=default&q=%23{}%20since%3A{}%20until%3A{}&l=pt&src=typd".format(
            self._hastag, stamp_start, stamp_start + self.STEPP_TIMESTAMP)
        driver = self._driver
        driver.get(url)
        assert "since" in driver.title
        elementList = driver.find_elements_by_class_name("js-stream-tweet")
        lasttSizeList = len(elementList)
        # scroll to the bottom until no new tweets load, or >100 collected
        while True:
            body = driver.find_element_by_tag_name('body')
            body.send_keys(Keys.END)
            time.sleep(2)
            elementList = driver.find_elements_by_class_name(
                "js-stream-tweet")
            sizeList = len(elementList)
            if lasttSizeList == sizeList:
                break
            elif sizeList > 100:
                break
            lasttSizeList = len(elementList)
        stringList = []
        # newest-last ordering: iterate reversed so inserts go oldest-first
        for tweet in reversed(elementList):
            idTweet = tweet.get_attribute("data-tweet-id")
            tweetTimeStamp = tweet.find_elements_by_class_name(
                "js-short-timestamp")[0].get_attribute("data-time")
            stringList.append(idTweet + ' ' + tweetTimeStamp + '\n')
            dao = Dao()
            dao.insert(
                'manager', ['hastag', 'idTweet', 'idCandidato', 'timeStamp'],
                [self._hastag, idTweet, self._candidato, tweetTimeStamp])
        print('...')
        cont += 1
        # every 5 windows, report progress
        if cont > pivo:
            pivo += 5
            self.get_status(stamp_start)
        stamp_start = stamp_start + self.STEPP_TIMESTAMP + 1
    self._driver.close()
    self._driver.quit()
def _save_community(self):
    """Insert the current llzg community unless a row with the same name
    and area already exists."""
    detail = self._community_detail
    dup_sql = "SELECT * FROM ehdc.communities_llzg WHERE NAME = '{}' and AREA_NAME='{}' ".format(
        detail["name"], detail["area_name"])
    if Dao.execute_query(dup_sql) is not None:
        print(" {} is already exists ,so next".format(
            detail["name"]))
        return
    # new community -> insert
    Dao.execute_dmls(self._insert_community())
def _visit_pages(self, seed_url):
    """
    visit one url,get page content
    """
    for row in seed_url:
        url = row[0]
        # bump the retry counter before fetching (source 16)
        Dao.execute_dmls(
            " UPDATE fetch_list SET times = times+1 WHERE url = '{}'and source_id = 16".format(url))
        self._now_url = url
        page = self.get_page_content_str(url)
        self._extract_data(page)
def __init__(self, LT1, LG1, LT2, LG2, cityname, cityid, cityenname, name, shopId=0, categoryId=0):
    """Worker thread crawling dianping map-search results for one city
    rectangle bounded by (LT1, LG1) and (LT2, LG2)."""
    threading.Thread.__init__(self, name=name)
    self.cityid = cityid
    self.shopId = shopId
    self.categoryId = categoryId
    # bounding-box corners
    self.Lat1, self.Long1 = LT1, LG1
    self.Lat2, self.Long2 = LT2, LG2
    self.city_name = cityname
    # base POST payload; page/category fields are filled in per request
    self.values = {
        'promoId': '0',
        'shopType': '',
        'categoryId': '',
        'sortMode': '2',
        'shopSortItem': '1',
        'keyword': '',
        'searchType': '1',
        'branchGroupId': '0',
        'shippingTypeFilterValue': '0',
        'page': '1',
        'cityId': cityid,
        'cityEnName': cityenname,
    }
    self.url = "http://www.dianping.com/search/map/ajax/json"
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
        'Referer': 'http://www.dianping.com/search/map/category/{}/0'.format(cityid)
    }
    self.dao = Dao()
    # categories to crawl
    self.query_sql = "SELECT shopType ,categoryId,NAME FROM category WHERE categoryId <> shopType AND categoryId <>'None' "
    self.result = self.dao.execute_query(self.query_sql)
    # shop ids already stored for this city, so they can be skipped
    self.query_sql2 = "SELECT shopId FROM shop_bean where city_name ='{}'".format(
        cityname)
    self.result2 = self.dao.execute_query(self.query_sql2)
    self.shopIds = [] if self.result2 is None else [row[0] for row in self.result2]
def get_page_content_str(self, url):
    """Fetch `url` and return its body decoded as UTF-8, or None on error.

    On any failure the fetch_list retry counter for self._now_url is reset
    to 0 so the URL gets re-queued.

    Fixes vs. original: the HTTP response was leaked when read()/decode()
    raised; the two identical error handlers now share one helper.
    """
    time.sleep(1)  # throttle between requests

    def _requeue():
        # reset the retry counter so the current URL is fetched again later
        sql = " UPDATE fetch_list SET times = 0 WHERE url = '{}'".format(
            self._now_url)
        dao = Dao()
        dao.execute_dmls(sql)

    try:
        print("现在开始抓取" + url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
        }
        request = urllib.request.Request(url=url, headers=headers)
        m_fp = urllib.request.urlopen(request, timeout=500)
        try:
            return m_fp.read().decode('utf-8')
        finally:
            m_fp.close()  # FIX: was leaked when read()/decode() raised
    except urllib.error.URLError:
        _requeue()
        return None
    except Exception as err:
        print(err)
        _requeue()
        return None
def start(self):
    """Resume (or begin) mining for self._hastag.

    Looks up the newest timestamp already stored in `manager` for this
    hashtag and continues from there; starts from START_TIMESTAMP when
    nothing has been mined yet.

    Fix vs. original: an empty result set crashed with IndexError on
    `[-1]`; now falls back to START_TIMESTAMP.
    """
    if self._print_status:
        print('start mining #{} ...'.format(self._hastag))
    dao = Dao()
    timeStamp_tweet_list = dao.select(
        '*', "manager",
        "hastag = '{}' ORDER BY 'timeStamp'".format(self._hastag))
    if timeStamp_tweet_list:
        lastTweetTimeStamp = int(timeStamp_tweet_list[-1]['timeStamp'])
    else:
        # nothing mined yet -> start from the configured beginning
        lastTweetTimeStamp = self.START_TIMESTAMP
    if lastTweetTimeStamp > self.START_TIMESTAMP:
        self.mining(lastTweetTimeStamp, self.FINISH_TIMESTAMP)
    else:
        self.mining(self.START_TIMESTAMP, self.FINISH_TIMESTAMP)
def _save_apartment(self, apartment_detail):
    """Insert one apartment row unless the same (community, floor,
    apartment) triple already exists; insert errors are printed."""
    floor = apartment_detail["FLOOR_NUM"]
    num = apartment_detail["APARTMENT_NUM"]
    dup_sql = "SELECT * FROM apartments WHERE COMMUNITY_ID = {} and FLOOR_NUM ='{}' and APARTMENT_NUM ='{}' ".format(
        int(apartment_detail["COMMUNITY_ID"]), floor, num)
    if Dao.execute_query(dup_sql) is not None:
        print(" {} is already exists ,so next".format(
            str(apartment_detail["COMMUNITY_ID"]) + floor + num))
        return
    try:
        Dao.execute_dmls(self._insert_apartment(apartment_detail))
    except Exception as e:
        print(e)
def _check_community(self, url):
    """Return True when `url` was already fully crawled for this source
    (a communities row with status = 2 exists).

    Fix vs. original: `== None` replaced with the identity check
    `is None`, and the if/return pair collapsed to one boolean expression.
    """
    communityid_sql = "SELECT COMMUNITY_ID FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} and status = 2 ".format(
        url, self.source_id)
    return Dao.execute_query(communityid_sql) is not None
def _visit_pages(self, seed_url):
    """
    visit one url,get page content
    """
    for row in seed_url:
        url = row[0]
        # bump the retry counter for this url (source 17)
        Dao.execute_dmls(
            " UPDATE fetch_list SET times = times+1 WHERE url = '{}'and source_id =17".format(url))
        self._base_url = url
        self._now_url = url
        page = self.get_page_content_str(url)
        try:
            self._extract_data(page)
        except Exception as e:
            print(e)
            # extraction failed -> flag the url
            Dao.execute_dmls(
                " UPDATE fetch_list SET status = 1 WHERE url = '{}'and source_id =17".format(url))
def execute(self, COMMUNITY_ID, BUILDING_NUM, URL):
    """Crawl one building URL; set its status to 2 on success, -1 on failure."""
    try:
        detail = {
            'COMMUNITY_ID': int(COMMUNITY_ID),
            'BUILDING_NUM': BUILDING_NUM,
            'APARTMENT_NUM': '',
            'STATUS': '2',
            'create_time': time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(time.time())),
        }
        self._visit_pages(URL, detail)
        # crawl succeeded -> mark the building done
        Dao.execute_dmls(
            "update BUILDINGS set status = '2' where URL = '{}' ; ".format(URL))
    except Exception as e:
        print(e)
        # crawl failed -> flag the building
        Dao.execute_dmls(
            "update BUILDINGS set status = -1 where URL = '{}' ; ".format(URL))
def _visit_pages(self, seed_url):
    """
    visit one url,get page content
    """
    for row in seed_url:
        url = row[0]
        # one Dao per row, as before, to keep the connection behavior
        dao = Dao()
        # bump the retry counter (source 98)
        dao.execute_dmls(
            " UPDATE fetch_list2 SET times = times+1 WHERE url = '{}'and source_id = 98 ".format(url))
        self._now_url = url
        page = self.get_page_content_str(url)
        self._extract_data2(page)
def _extract_data(self, doc_str):
    """Parse the top-level province page: read the province name, then for
    every city row queue each area link into fetch_list2 (source 98) so a
    later pass can crawl it with _extract_data2.
    """
    doc = Pq(doc_str)
    self._comcode_detail["province"] = doc('.content>ul>li>h1').text()
    doc = Pq(doc_str)
    tr_list = doc('.content>table>tr')
    for tr in tr_list:
        try:
            # time.sleep(1)
            td_list = doc(tr).find("td")
            # first cell: city link; second cell: one link per area
            self._comcode_detail["city"] = doc(td_list[0]).find("a").text()
            a_list = doc(td_list[1]).find("a")
            for a in a_list:
                self._comcode_detail["area"] = doc(a).text()
                url = self._base_url + doc(a).attr("href")
                # html = self.get_page_content_str(url)
                # self._extract_data2(html)
                # NOTE(review): url is interpolated straight into SQL —
                # injection-prone if an href ever contains a quote.
                insert_sql = " INSERT INTO fetch_list2 (source_id, url,times,page,STATUS) VALUE(98,'{}',0,0,0)".format(
                    url)
                print("insert sql is [" + insert_sql)
                Dao.execute_dmls(insert_sql)
        except IndexError as er:
            # malformed row (fewer than 2 cells) — skip it but keep going
            print("error in " + doc(tr).text())
def _generate_seed_url(self):
    """
    generate all url to visit
    """
    # self._seed_url = "http://www.njhouse.com.cn/spf/inf/index.php?prjid=108510"
    # self._visit_pages(self._seed_url)
    # from page 1 to anypage which < 200
    # # self._seed_url = Dao._get_url_by_id(self.source_id)
    # pick up every building still waiting to be crawled
    querysql = "SELECT COMMUNITY_ID, BUILDING_NUM, URL FROM ehdc.buildings WHERE STATUS = 0 ; "
    for community_id, building_num, url in Dao.execute_query(querysql):
        self.execute(community_id, building_num, url)
def _extract_data(self, url):
    """Parse one community room-list page, persist its apartments, then
    mark the community done ('2') or failed (-1)."""
    community_id = self._save_community()
    doc = Pq(self.get_page_content_str(url))
    rows = doc("table>tr")
    try:
        for row in rows:
            floor_num = Pq(row)("td:eq(0)").text()
            for link in doc(row).find("td.preview>a"):
                self._save_apartment({
                    'COMMUNITY_ID': community_id,
                    'FLOOR_NUM': floor_num,
                    'APARTMENT_NUM': doc(link).text(),
                    'STATUS': '2',
                    'create_time': time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime(time.time())),
                })
        Dao.execute_dmls(
            "update communities set status = '2' where ORIGINAL_URL = '{}' ; ".format(url))
    except Exception as e:
        print(e)
        Dao.execute_dmls(
            "update communities set status = -1 where ORIGINAL_URL = '{}' ; ".format(url))
def __init__(self):
    """Open the SQLite database and build one Dao accessor per DTO type."""
    self._conn = sqlite3.connect('database.db')
    for attr, dto in (('vaccines', DTO.Vaccine),
                      ('suppliers', DTO.Supplier),
                      ('clinics', DTO.Clinic),
                      ('logistics', DTO.Logistic)):
        setattr(self, attr, Dao(dto, self._conn))
class CommunitiesListCrawler(BaseCrawler, threading.Thread):
    """Crawler for tywsfdc.com (source_id 31): for every community seed it
    POSTs the building list, then each building's unit/floor view, and
    saves every apartment number into the `apartments` table."""
    # NOTE(review): this rebinds the *module-level* name Dao from the class
    # to a single shared instance at class-definition time — confirm that
    # the rest of the module expects the instance, not the class.
    global Dao
    Dao = Dao()

    def __init__(self):
        super().__init__()
        self.detail_info_urls = []
        self.source_id = 31
        self._base_url = "http://www.tywsfdc.com/"
        self._root_url = "http://www.tywsfdc.com/Firsthand/tyfc/publish/p/ProNBList.do?pid"
        # working record reused for every apartment row that gets saved
        self._apartment_detail = {
            'COMMUNITY_ID': 0,
            'BUILDING_NUM': '',
            'APARTMENT_NUM': '',
            'STATUS': '2',
            'create_time': ''
        }

    def _visit_pages(self, seed_url):
        """ visit one url,get page content """
        # single url
        # html = self.get_page_content_str(self._seed_url[0])
        # when seeding from the database:
        # assumes the project id is the tail of the seed url after the last
        # "-" (the dash itself is included in the slice) — TODO confirm
        self._pid = seed_url[seed_url.rindex("-"):]
        seed_url = self._root_url + "=" + self._pid
        # print("_visit_pages " + seed_url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
            'Referer': seed_url
        }
        # POST the paged building list (first 50 buildings)
        values = {'pid': self._pid, 'pageNo': '1', 'pageSize': '50'}
        data = urllib.parse.urlencode(values).encode(encoding='UTF8')
        request = urllib.request.Request(
            url="http://www.tywsfdc.com/Firsthand/tyfc/publish/ProNBList.do",
            headers=headers, data=data)
        m_fp = urllib.request.urlopen(request, timeout=500)
        html_str = m_fp.read().decode("utf8")
        self.findEachBuilding(html_str)
        # b = set(self._resualt)
        # self._resualt=[i for i in b]
        #
        # dao=Dao()
        # insert_sql=""
        # for res1 in b :
        #     insert_sql = "INSERT INTO merchant_tmp (description,url )VALUES ( '{}', 'http://www.youlin.me/category/407')".format(res1)
        #     print( insert_sql )
        #     dao = Dao()
        #     dao.execute_dmls(insert_sql)

    def get_page_content_str(self, url):
        """Fetch `url` and return the raw (undecoded) body; retries
        recursively on an empty body, returns None on URL errors."""
        try:
            print("现在开始抓取" + url)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
            }
            request = urllib.request.Request(url=url, headers=headers)
            m_fp = urllib.request.urlopen(request, timeout=1500)
            html_str_uncode = m_fp.read()
            if html_str_uncode == '':
                # empty body -> retry immediately (unbounded recursion)
                print("出问题了,没出来数据")
                return self.get_page_content_str(url)
            m_fp.close()
            return html_str_uncode
        except urllib.error.URLError as err:
            return None
        except Exception as err:
            print(err)
            return None

    def _generate_seed_url(self):
        """
        generate all url to visit
        """
        # self._seed_url = "http://www.njhouse.com.cn/spf/inf/index.php?prjid=108510"
        # self._visit_pages(self._seed_url)
        # from page 1 to anypage which < 200
        # # load seeds from the database
        # self._seed_url = Dao._get_url_by_id(self.source_id)
        # every community of this source not yet fully crawled (status < 2)
        querysql = "SELECT COMMUNITY_ID,ORIGINAL_URL FROM communities WHERE source_id ='{}' and status<2 ; ".format(
            self.source_id)
        result = Dao.execute_query(querysql)
        for COMMUNITY_ID, ORIGINAL_URL in result:
            try:
                self._apartment_detail["COMMUNITY_ID"] = int(COMMUNITY_ID)
                self._apartment_detail["create_time"] = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                # print("_generate_seed_url func : "+ORIGINAL_URL)
                self._visit_pages(ORIGINAL_URL)
                # success -> mark community crawled
                sql = "update communities set status = '2' where COMMUNITY_ID = '{}' ".format(
                    int(COMMUNITY_ID))
                Dao.execute_dmls(sql)
            except Exception as e:
                print(e)
                # failure -> flag community for inspection/retry
                sql = "update communities set status = '-1' where COMMUNITY_ID = '{}' ".format(
                    int(COMMUNITY_ID))
                Dao.execute_dmls(sql)
        # add directly, for testing:
        # self._seed_url.append(self._base_url)

    def findEachBuilding(self, html):
        """For every building row in the list page, POST the building's
        detail view and hand the response to _extract_data()."""
        doc = Pq(html)
        tr_list = doc("table>tr")
        # print("tr size ")
        for tr in tr_list:
            try:
                # each crawlable row carries an objid attribute
                objid = doc(tr).attr("objid")
                if objid == None:
                    continue
                self._apartment_detail["BUILDING_NUM"] = Pq(tr)(
                    "td:eq(2)").text()
                url = "http://www.tywsfdc.com/Firsthand/tyfc/publish/p/ProNBView.do?proPID={}&nbid={}".format(
                    self._pid, objid)
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
                    'Referer': url
                }
                RequestURL = "http://www.tywsfdc.com/Firsthand/tyfc/publish/probld/NBView.do?nid={}&projectid={}".format(
                    objid, self._pid)
                values = {'nid': objid, 'projectid': self._pid}
                data = urllib.parse.urlencode(values).encode(encoding='UTF8')
                request = urllib.request.Request(url=RequestURL,
                                                 headers=headers, data=data)
                m_fp = urllib.request.urlopen(request, timeout=500)
                html_str = m_fp.read().decode("utf8")
                self._extract_data(html_str)
            except Exception as e:
                # one bad building must not stop the others
                print(e)
                pass

    def _extract_data(self, doc_str):
        """Walk the unit list (ul#bldlist) -> per-unit floor table -> per-floor
        apartment links, saving each non-empty apartment number.
        On any error, sleeps 100s and retries recursively (unbounded)."""
        try:
            doc = Pq(doc_str)
            # each unit of the building
            building_list = doc("ul#bldlist>span")
            for building in building_list:
                bld = doc(building).attr("id")
                bld = bld[3:]  # strip the id prefix to get the table id
                self._apartment_detail["BUILDING_NUM"] = doc(building).text()
                # each floor of this unit:
                xpath = "div.flrlist>table#{}>tr".format(bld)
                tr_list = doc(xpath)
                # total_item =int( doc("").text().strip())
                # count_num = int(total_item) / 12
                for tr in tr_list:
                    self._apartment_detail["FLOOR_NUM"] = Pq(tr)(
                        "td:eq(0)").text()
                    a_list = Pq(tr)("td:eq(1)>span>a")
                    for a in a_list:
                        self._apartment_detail["APARTMENT_NUM"] = doc(a).text()
                        if self._apartment_detail["APARTMENT_NUM"].strip(
                        ) != '':
                            self._apartment_detail[
                                "create_time"] = time.strftime(
                                    '%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
                            self._save_apartments()
        except Exception as err:
            print(err)
            # back off, then retry the same document (unbounded recursion)
            time.sleep(100)
            self._extract_data(doc_str)

    def _insert_community(self):
        # NOTE(review): despite its name this builds the INSERT for the
        # *apartments* table, not communities.
        result = "INSERT INTO apartments (COMMUNITY_ID , BUILDING_NUM ,FLOOR_NUM , APARTMENT_NUM ,STATUS ,create_time )" \
            " VALUES ('{}','{}','{}','{}','{}','{}' )".format(
                self._apartment_detail["COMMUNITY_ID"],
                self._apartment_detail["BUILDING_NUM"],
                self._apartment_detail["FLOOR_NUM"],
                self._apartment_detail["APARTMENT_NUM"],
                self._apartment_detail["STATUS"],
                self._apartment_detail["create_time"])
        return result

    def _save_apartments(self):
        """Insert the current apartment unless the identical
        (community, building, apartment, floor) row already exists."""
        query_sql = "SELECT * FROM apartments WHERE COMMUNITY_ID = {} and BUILDING_NUM ='{}' and APARTMENT_NUM ='{}'and FLOOR_NUM='{}' ; ".format(
            int(self._apartment_detail["COMMUNITY_ID"]),
            self._apartment_detail["BUILDING_NUM"],
            self._apartment_detail["APARTMENT_NUM"],
            self._apartment_detail["FLOOR_NUM"])
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(
                str(self._apartment_detail["COMMUNITY_ID"]) +
                self._apartment_detail["BUILDING_NUM"] +
                self._apartment_detail["APARTMENT_NUM"]))
            return
        # insert the new apartment row
        try:
            Dao.execute_dmls(self._insert_community())
        except Exception as e:
            print(e)

    def craw(self):
        # entry point: seed generation drives the whole crawl
        self._generate_seed_url()
class CommunitiesListCrawler(BaseCrawler):
    """Crawler for njhouse.com.cn pre-sale registrations (source_id 21):
    scrapes the persalereg listing page and stores one communities row per
    project table found."""
    # NOTE(review): rebinds the module-level name Dao from the class to a
    # shared instance at class-definition time.
    global Dao
    Dao = Dao()

    def __init__(self):
        # TODO: run the crawl parameterized and multi-threaded
        super().__init__()
        self.detail_info_urls = []
        self.source_id = 21
        self._base_url = "http://www.njhouse.com.cn/persalereg.php"
        # working record for the community currently being parsed
        self._community_detail = {
            'url': '',
            'name': '',
            'location': '',
            'area_name': '',
            'description': '',
            'latitude': '',
            'longitude': ''
        }

    def _visit_pages(self, seed_url):
        """ visit one url,get page content """
        # for single_url in seed_url:
        #     update_sql = " UPDATE fetch_list SET times = times+1 WHERE url = '{}'and source_id =17".format(
        #         single_url[0])
        #     Dao.execute_dmls(update_sql)
        #     self._base_url = single_url[0]
        #     self._now_url = single_url[0]
        #     html = self.get_page_content_str(single_url[0])
        #     try:
        #         self._extract_data(html)
        #     except Exception as e:
        #         print(e)
        #         update_sql = " UPDATE fetch_list SET status = 1 WHERE url = '{}'and source_id =17".format(
        #             single_url[0])
        #         Dao.execute_dmls(update_sql)
        # single url only: the whole listing lives on one page
        html = self.get_page_content_str(self._seed_url[0])
        self._extract_data(html)
        # b = set(self._resualt)
        # self._resualt=[i for i in b]
        #
        # dao=Dao()
        # insert_sql=""
        # for res1 in b :
        #     insert_sql = "INSERT INTO merchant_tmp (description,url )VALUES ( '{}', 'http://www.youlin.me/category/407')".format(res1)
        #     print( insert_sql )
        #     dao = Dao()
        #     dao.execute_dmls(insert_sql)

    def get_page_content_str(self, url):
        """Fetch `url` and return the raw (undecoded) body, or None on any
        error."""
        time.sleep(1)  # throttle between requests
        try:
            print("现在开始抓取" + url)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
            }
            request = urllib.request.Request(url=url, headers=headers)
            m_fp = urllib.request.urlopen(request, timeout=5500)
            html_str_uncode = m_fp.read()
            m_fp.close()
            return html_str_uncode
        except urllib.error.URLError as err:
            # logfile = open('test.log', 'a')
            # logfile.write("Error: {} \n in url : {}".format(err, url))
            # logfile.close()
            # print("error in {}.get_page_content_str".format(__name__))
            # if url[-3:] == "htm":
            #     time.sleep(120)
            # return self.get_page_content_str(url)
            return None
        except Exception as err:
            print(err)
            return None

    def _generate_seed_url(self):
        """
        generate all url to visit
        """
        # from page 1 to anypage which < 200
        # # load seeds from the database
        # self._seed_url = Dao._get_url_by_id(self.source_id)
        # add the single listing page directly (test mode)
        self._seed_url.append(self._base_url)

    def _extract_data(self, doc_str):
        """Each project is a nested table; pull location (row 1, cell 1),
        name + url (row 2 link) and area (row 8, cell 1), then save."""
        doc = Pq(doc_str)
        tables = doc("table>tr>td>table")
        # total_item =int( doc("").text().strip())
        # count_num = int(total_item) / 12
        for table in tables:
            try:
                doc = Pq(table)
                # test = doc(doc("tr")[1]).find("td")[1].text()
                self._community_detail['location'] = Pq(
                    doc("tr:eq(1)"))("td:eq(1)").text()
                self._community_detail['name'] = Pq(
                    doc("tr:eq(2)"))("a").text()
                self._community_detail['url'] = Pq(
                    doc("tr:eq(2)"))("a").attr("href")
                self._community_detail['area_name'] = Pq(
                    doc("tr:eq(8)"))("td:eq(1)").text()
                self._save_community()
            except Exception as err:
                # malformed project table — report it and continue
                print(table)
                print(err)
                continue

    def _insert_community(self):
        # build the INSERT for the community currently in _community_detail
        result = "INSERT INTO communities (ORIGINAL_URL,NAME,AREA_NAME,LATITUDE,LONGITUDE,location,source_id )" \
            " VALUES ('{}','{}','{}','{}','{}' ,'{}','{}' )".format(
                self._community_detail["url"],
                self._community_detail["name"],
                self._community_detail["area_name"],
                self._community_detail["latitude"],
                self._community_detail["longitude"],
                self._community_detail["location"], self.source_id)
        return result

    def _save_community(self):
        """Insert the current community unless its URL already exists for
        this source."""
        query_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} ".format(
            self._community_detail["url"], self.source_id)
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(
                self._community_detail["name"]))
            return
        # new community -> insert
        Dao.execute_dmls(self._insert_community())
display_name,logo,COMMUNITY_ID) VALUES ('', '{}', '{}', '', '{}', '', '{}', '{}', '{}', '', '{}', '{}', '{}', '{}');'''.format(merchant['business_name'], merchant['phone_number'], merchant['address'], '', merchant['lat'], merchant['long'], merchant['sub_title'], param['cat'], merchant['logo'], COMMUNITY_ID) dao.execute_dmls(insertSql) except: pass if __name__ == '__main__': querySql = 'SELECT a.name ,a.COMMUNITY_ID ,b.LATITUDE,b.LONGITUDE FROM shengchan_20140815.communities a ,shengchan_20140815.community_poses b WHERE a.COMMUNITY_ID =b.COMMUNITY_ID AND a.AREA_ID <11 AND a.AREA_ID > 0 ' dao = Dao() result = dao.execute_query(querySql) for name, COMMUNITY_ID, LATITUDE, LONGITUDE in result: getmerchants(LATITUDE, LONGITUDE, COMMUNITY_ID)
class _Repository:
    """SQLite-backed repository over four Dao-mapped tables: vaccines,
    suppliers, clinics and logistics."""

    def __init__(self):
        self._conn = sqlite3.connect('database.db')
        self.vaccines = Dao(DTO.Vaccine, self._conn)
        self.suppliers = Dao(DTO.Supplier, self._conn)
        self.clinics = Dao(DTO.Clinic, self._conn)
        self.logistics = Dao(DTO.Logistic, self._conn)

    def close(self):
        """Commit any pending work and close the connection."""
        self._conn.commit()
        self._conn.close()

    def create_tables(self):
        """Create all four tables if they do not exist yet."""
        self._conn.executescript("""
        CREATE TABLE IF NOT EXISTS logistics (
            id INTEGER PRIMARY KEY,
            name TEXT NOT NULL,
            count_sent INTEGER NOT NULL ,
            count_received INTEGER NOT NULL
        );
        CREATE TABLE IF NOT EXISTS suppliers (
            id INTEGER PRIMARY KEY,
            name TEXT NOT NULL,
            logistic INTEGER REFERENCES logistics(id)
        );
        CREATE TABLE IF NOT EXISTS clinics (
            id INTEGER PRIMARY KEY,
            location TEXT NOT NULL,
            demand INTEGER NOT NULL ,
            logistic INTEGER REFERENCES logistics(id)
        );
        CREATE TABLE IF NOT EXISTS vaccines (
            id INTEGER PRIMARY KEY,
            date DATE NOT NULL,
            supplier INTEGER REFERENCES suppliers(id),
            quantity INTEGER NOT NULL
        );
        """)

    def receiveShipment(self, nameOfSup, amount, date):
        """Record `amount` vaccines arriving from supplier `nameOfSup` on
        `date` and bump the supplier's logistics received counter."""
        # insert the next vaccine to the vaccine table
        # get the id of the logistics from the suppliers table using the name
        supplier = self.suppliers.find(name=nameOfSup)
        supplierIndex = supplier[0].id
        # get the id of the last inserted line to create a new id for the new vaccine
        lastId = self.vaccines.getLastInsertedId()
        newId = lastId[0] + 1
        newVaccine = DTO.Vaccine(newId, date, supplierIndex, amount)
        self.vaccines.insert(newVaccine)
        idOfLogistics = supplier[0].logistic
        # update the count_received of this logistics company in logistics table
        logistic = self.logistics.find(id=idOfLogistics)
        # NOTE(review): attribute is read as `count_Received` (capital R)
        # while the column is `count_received` — depends on how the DTO
        # names its fields; confirm against DTO.Logistic.
        currCountRec = logistic[0].count_Received
        set_value = {'count_received': currCountRec + int(amount)}
        # only where the id = idOfLogistics we got from the find query
        cond = {'id': idOfLogistics}
        self.logistics.update(set_value, cond)

    def sendShipment(self, locationOfClinic, amount):
        """Ship `amount` vaccines to the clinic at `locationOfClinic`:
        bump the logistics sent counter, consume inventory oldest-first
        (FIFO by date), and reduce the clinic's demand."""
        clinic = self.clinics.find(location=locationOfClinic)
        # get the id of the logistic of this clinic
        idOfLogistics = clinic[0].logistic
        # update the count_sent of this logistics company in logistics table
        logistic = self.logistics.find(id=idOfLogistics)
        # NOTE(review): `count_Sent` casing — see note in receiveShipment.
        currCountSent = logistic[0].count_Sent
        set_value = {'count_sent': currCountSent + int(amount)}
        # only where the id = idOfLogistics we got from the find query
        cond = {"id": idOfLogistics}
        self.logistics.update(set_value, cond)
        # remove amount from inventory, oldest batches first
        allVaccines = self.vaccines.findWithASCOrder('date')
        tempAmount = int(amount)
        for vaccine in allVaccines:
            if tempAmount == 0:
                break
            # we need to delete the line since the quantity will be zero
            if vaccine.quantity <= int(tempAmount):
                self.vaccines.delete(id=vaccine.id)
                tempAmount = tempAmount - int(vaccine.quantity)
            # if we can take amount and not delete
            else:
                set_value = {'quantity': vaccine.quantity - int(tempAmount)}
                cond = {"id": vaccine.id}
                self.vaccines.update(set_value, cond)
                tempAmount = 0
        # remove amount from the demand of location
        currDemand = clinic[0].demand
        set_value = {"demand": currDemand - int(amount)}
        cond = {"location": locationOfClinic}
        self.clinics.update(set_value, cond)
class CommunitiesListCrawler(BaseCrawler, threading.Thread):
    """Threaded crawler for newhouse.hfhome.cn (source_id 30): walks the
    paged permit listing, saves each community, then crawls its room list
    into the apartments table."""
    # NOTE(review): rebinds the module-level name Dao from the class to a
    # shared instance at class-definition time.
    global Dao
    Dao = Dao()

    def __init__(self, page_num):
        # TODO
        threading.Thread.__init__(self, name=page_num)
        # NOTE(review): super().__init__(self) passes `self` as an extra
        # positional argument to BaseCrawler.__init__ — looks like a bug;
        # confirm BaseCrawler's signature before changing.
        super().__init__(self)
        self.detail_info_urls = []
        self.source_id = 30
        # page window this thread is responsible for (30 pages per thread)
        self.min_page = page_num * 30 + 1
        self.max_page = page_num * 30 + 31
        self._base_url = "http://newhouse.hfhome.cn/"
        # working record for the community currently being parsed
        self._community_detail = {
            'url': '',
            'name': '',
            'location': '',
            'area_name': '',
            'description': '',
            'latitude': '',
            'longitude': ''
        }

    def _visit_pages(self, seed_url):
        """ visit one url,get page content """
        # single url per call
        html = self.get_page_content_str(seed_url)
        self.findEachBuilding(html)

    def findEachBuilding(self, html):
        """For every permit row, derive the RoomList url; skip ones already
        crawled, otherwise save the community and extract its rooms."""
        doc = Pq(html)
        tr_list = doc("table#GVFwxkz>tr")
        for tr in tr_list:
            name = Pq(tr)("td:eq(1)").text()
            self._community_detail["name"] = name
            href = doc(tr).find("td>a").attr("href")
            if href == None:
                continue
            # keep only the query string of the detail link
            href = href[href.index("?"):]
            url = "http://newhouse.hfhome.cn/Modal/RoomList.aspx" + href
            if self._check_community(url):
                print(url + " --- 已经爬取过了")
                continue
            self._community_detail["url"] = url
            self._extract_data(url)

    def _extract_data(self, url):
        """Parse one room-list page, persist its apartments, then mark the
        community done ('2') or failed (-1)."""
        community_id = self._save_community()
        doc_str = self.get_page_content_str(url)
        doc = Pq(doc_str)
        tr_list = doc("table>tr")
        try:
            for tr in tr_list:
                # first cell of the row is the floor number
                Floor_num = Pq(tr)("td:eq(0)").text()
                a_list = doc(tr).find("td.preview>a")
                for a in a_list:
                    apartment_detail = {
                        'COMMUNITY_ID': community_id,
                        'FLOOR_NUM': Floor_num,
                        'APARTMENT_NUM': doc(a).text(),
                        'STATUS': '2',
                        'create_time': time.strftime('%Y-%m-%d %H:%M:%S',
                                                     time.localtime(time.time()))
                    }
                    self._save_apartment(apartment_detail)
            sql = "update communities set status = '2' where ORIGINAL_URL = '{}' ; ".format(url)
            Dao.execute_dmls(sql)
        except Exception as e:
            print(e)
            sql = "update communities set status = -1 where ORIGINAL_URL = '{}' ; ".format(url)
            Dao.execute_dmls(sql)

    def _insert_community(self):
        # build the INSERT for the community currently in _community_detail
        result = "INSERT INTO communities (ORIGINAL_URL,NAME,AREA_NAME,LATITUDE,LONGITUDE,address,source_id )" \
            " VALUES ('{}','{}','{}','{}','{}' ,'{}','{}' )".format(
                self._community_detail["url"],
                self._community_detail["name"],
                self._community_detail["area_name"],
                self._community_detail["latitude"],
                self._community_detail["longitude"],
                self._community_detail["location"], self.source_id)
        return result

    def _insert_apartment(self, apartment_detail):
        # build the INSERT for one apartment row
        result = "INSERT INTO apartments (COMMUNITY_ID , APARTMENT_NUM ,STATUS ,FLOOR_NUM,create_time )" \
            " VALUES ('{}','{}','{}','{}','{}' )".format(
                apartment_detail["COMMUNITY_ID"],
                apartment_detail["APARTMENT_NUM"],
                apartment_detail["STATUS"],
                apartment_detail["FLOOR_NUM"],
                apartment_detail["create_time"])
        return result

    def _save_apartment(self, apartment_detail):
        """Insert one apartment unless the same (community, floor,
        apartment) triple already exists; insert errors are printed."""
        query_sql = "SELECT * FROM apartments WHERE COMMUNITY_ID = {} and FLOOR_NUM ='{}' and APARTMENT_NUM ='{}' ".format(
            int(apartment_detail["COMMUNITY_ID"]),
            apartment_detail["FLOOR_NUM"],
            apartment_detail["APARTMENT_NUM"])
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(
                str(apartment_detail["COMMUNITY_ID"]) +
                apartment_detail["FLOOR_NUM"] +
                apartment_detail["APARTMENT_NUM"]))
            return
        # insert the new apartment row
        try:
            Dao.execute_dmls(self._insert_apartment(apartment_detail))
        except Exception as e:
            print(e)

    def _save_community(self):
        """Insert the current community if new; either way return its
        COMMUNITY_ID."""
        query_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} ".format(
            self._community_detail["url"], self.source_id)
        communityid_sql = "SELECT COMMUNITY_ID FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} ".format(
            self._community_detail["url"], self.source_id)
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(self._community_detail["name"]))
            return Dao.execute_query(communityid_sql)[0][0]
        # new community -> insert, then read back its id
        Dao.execute_dmls(self._insert_community())
        return Dao.execute_query(communityid_sql)[0][0]

    def _check_community(self, url):
        """True when `url` was already fully crawled (status = 2)."""
        communityid_sql = "SELECT COMMUNITY_ID FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} and status = 2 ".format(
            url, self.source_id)
        result = Dao.execute_query(communityid_sql)
        if result == None:
            return False
        return True

    def run(self):
        # NOTE(review): the per-thread window (min_page/max_page) is
        # commented out and the range is hard-coded to 363..396 — every
        # thread crawls the same pages. Confirm whether this is a leftover
        # one-off run.
        # for i in range(self.min_page, self.max_page):
        for i in range(363, 397):
            url = "http://newhouse.hfhome.cn/hffd_xkz.aspx?page={}".format(i)
            self._visit_pages(url)