def get_base_info(self):
    driver = webdriver.PhantomJS(
        executable_path=r'D:\python2.7\phantomjs\bin\phantomjs.exe')
    driver.get('http://hotel.elong.com/chengdu/')
    time.sleep(2)
    page = 0
    while page <= 580:
        times = 10
        for i in range(times + 1):
            # Scroll to the bottom so the lazily loaded hotel entries render.
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)
        pageSource = driver.page_source
        soup = BeautifulSoup(pageSource, "html.parser")
        if soup:
            hotel_info = soup.findAll('span', attrs={'class': 'l1'})
            for x in hotel_info:
                hotel_name = x.get('title')
                logitude = x.get('data-lng')
                latitude = x.get('data-lat')
                hotel_id = x.get('data-hotelid')
                hotel_address = x.get('data-hoteladdress')
                print hotel_id, hotel_name, logitude, latitude, hotel_address
                colum_name = [
                    'hotel_id', 'hotel_name', 'hotel_address', 'logitude',
                    'latitude'
                ]
                # Fall back to empty strings so the insert never sees None.
                value_list = [
                    hotel_id if hotel_id else '',
                    hotel_name if hotel_name else '',
                    hotel_address if hotel_address else '',
                    logitude if logitude else '',
                    latitude if latitude else ''
                ]
                if hotel_name:
                    conn_db.dataframe_to_mysql().insert_to_mysql_flow(
                        self.mysql_db_table, colum_name, value_list, 0,
                        len(colum_name) - 1, conn_db.conn_localhost)
        print '---------------------------------------------------------------'
        # Selenium XPath usage: find the <a> tag whose text contains
        # "下一页" ("next page") and click it.
        driver.find_element_by_xpath(
            "//a[contains(text(),'下一页')]").click()
        page = page + 1
        time.sleep(2)  # wait 2s for the next page to load before reading its HTML
    driver.close()
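# An alternative sketch to the fixed time.sleep() calls above: wait
# explicitly until the "next page" link is clickable, so each page turn
# proceeds as soon as the DOM is ready. This helper is an assumption, not
# part of the original spider.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def click_next_page(driver, timeout=10):
    # Block until the <a> containing "下一页" ("next page") is clickable.
    next_link = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable(
            (By.XPATH, u"//a[contains(text(),'下一页')]")))
    next_link.click()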
def get_detail_info(self, url, hotel_id, proxy):
    soup = qunar_spider().download_soup_waitting(url, proxy)
    if soup:
        hotel_info = soup.find('div', attrs={'class': 'hotel-info-left'})
        hotel_name = hotel_info.find('h1').text
        hotel_address = soup.find(
            'div', attrs={'class': 'text-overflow qt-lh hotel-address'})
        if hotel_address:
            hotel_address = hotel_address.text
        else:
            hotel_address = ''
        # data-gpoint holds "longitude,latitude" as a single string.
        location = hotel_info.find('h1').get('data-gpoint')
        if location:
            logitude = location.split(',')[0]
            latitude = location.split(',')[1]
        else:
            logitude = ''
            latitude = ''
        hotel_info = soup.find('div', attrs={
            'class': 'text-overflow qt-lh'
        }).findAll('span')
        print(hotel_info)
        if hotel_info:
            # Match landlines such as (028)85551234 or 028-85551234 as well
            # as 11-digit mobile numbers; join multiple hits with "|".
            pattern = re.findall(r'\(?0\d{2,3}[) -]?\d{7,8}|1\d{10}',
                                 str(hotel_info))
            if pattern:
                hotel_mobile = '|'.join(pattern)
            else:
                hotel_mobile = ''
        else:
            hotel_mobile = ''
        colum_name = [
            'hotel_id', 'hotel_name', 'hotel_address', 'logitude',
            'latitude', 'hotel_mobile'
        ]
        value_list = [
            hotel_id, hotel_name, hotel_address, logitude, latitude,
            hotel_mobile
        ]
        if hotel_name:
            conn_db.dataframe_to_mysql().insert_to_mysql_flow(
                self.mysql_db_table, colum_name, value_list, 0,
                len(colum_name), conn_db.conn_localhost)
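# A quick sanity check of the phone-number regex used in get_detail_info;
# the sample strings below are made up for illustration.
import re

PHONE_RE = r'\(?0\d{2,3}[) -]?\d{7,8}|1\d{10}'

samples = [
    '<span>(028)85551234</span>',  # landline, parenthesised area code
    '<span>028-85551234</span>',   # landline, dashed area code
    '<span>13812345678</span>',    # 11-digit mobile
    '<span>no phone here</span>',  # no match -> empty list
]
for s in samples:
    print re.findall(PHONE_RE, s)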
def get_hotel_id(self):
    # Ids already scraped; GROUP BY deduplicates on the database side.
    sql_hotel_sql = ('select hotel_id from online_db.spider_qunar_data '
                     'group by hotel_id')
    sql_hotel_id = conn_db.dataframe_to_mysql().get_mysql_data(
        conn=conn_db.conn_localhost, sql=sql_hotel_sql)
    # .values yields one-element rows; unwrap the scalar and use a set so
    # the membership test below is O(1) instead of a list scan.
    scraped_ids = set(x[0] for x in sql_hotel_id.values)
    hotel_id = [i for i in range(1, self.n) if i not in scraped_ids]
    random.shuffle(hotel_id)  # randomize the crawl order
    return hotel_id
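# A minimal driver-loop sketch tying get_hotel_id and get_detail_info
# together. Whether these methods live on qunar_spider, and the qunar
# detail-URL pattern below, are assumptions for illustration only.
import random

spider = qunar_spider()
proxies = spider.get_proxy_ip()
for hid in spider.get_hotel_id():
    url = 'http://touch.qunar.com/hotel/detail/%s' % hid  # hypothetical URL pattern
    spider.get_detail_info(url, hid, random.choice(proxies))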
def get_proxy_ip(self):
    sql = 'select proxy_ip,port,get_fun from online_db.proxy_ip_info'
    proxy_info = conn_db.dataframe_to_mysql().get_mysql_data(
        conn=conn_db.conn_localhost, sql=sql)
    proxy_ip = []
    for proxy, port, get_fun in proxy_info.values:
        # Build a requests-style proxy dict, e.g. {'http': 'http://1.2.3.4:8080'}.
        proxy_dic = u"http://" + str(proxy) + u":" + str(port)
        proxy_dic = {get_fun: proxy_dic}
        proxy_ip.append(proxy_dic)
    return proxy_ip
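# Hypothetical usage of get_proxy_ip: try the stored proxies in random
# order until one answers. Only the proxy-dict format ({get_fun: url}) is
# taken from the code above; spider is assumed to be an instance of the
# crawler class these methods belong to.
import random
import requests

proxies = spider.get_proxy_ip()
random.shuffle(proxies)
for proxy_dic in proxies:  # e.g. {'http': 'http://1.2.3.4:8080'}
    try:
        res = requests.get('http://touch.qunar.com/hotel/chengdu',
                           proxies=proxy_dic, timeout=10)
        if res.status_code == 200:
            break  # keep the first proxy that works
    except requests.RequestException:
        continue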
def validateIp(n):
    url = "http://touch.qunar.com/hotel/chengdu"
    socket.setdefaulttimeout(3)
    proxy = getProxyIp(n)
    for i in range(0, len(proxy)):
        try:
            # Each candidate line looks like "ip port scheme".
            ip = proxy[i].strip().split(" ")
            proxy_host = "http://" + ip[0] + ":" + ip[1]
            proxy_temp = {ip[2]: proxy_host}
            print(proxy_temp)
            res = requests.get(
                url, proxies=proxy_temp, headers=headers, timeout=10)
            print('response_code:' + str(res.status_code))
            if res.status_code == 200:
                # The proxy answered; persist it for the spiders to reuse.
                conn_db.dataframe_to_mysql().insert_to_mysql_flow(
                    'online_db.proxy_ip_info',
                    ['proxy_ip', 'port', 'get_fun'],
                    [ip[0], ip[1], ip[2]], 0, 2, conn_db.conn_localhost)
                print proxy[i]
        except Exception, e:
            print e
            continue
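# Checking proxies one at a time is slow; a thread-pool sketch using
# Python 2's multiprocessing.dummy that validates candidates in parallel.
# check_one is a hypothetical helper mirroring the body of validateIp's
# try block; headers and getProxyIp come from the surrounding script.
from multiprocessing.dummy import Pool as ThreadPool

import requests


def check_one(line):
    ip = line.strip().split(" ")
    try:
        res = requests.get("http://touch.qunar.com/hotel/chengdu",
                           proxies={ip[2]: "http://" + ip[0] + ":" + ip[1]},
                           headers=headers, timeout=10)
        return line if res.status_code == 200 else None
    except Exception:
        return None


pool = ThreadPool(8)
good = [p for p in pool.map(check_one, getProxyIp(n)) if p]
pool.close()
pool.join()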
def get_base_info(self):
    driver = webdriver.PhantomJS(
        executable_path=
        r'D:\Python27\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    driver.get('http://hotels.ctrip.com/hotel/chengdu28/')
    time.sleep(2)
    page = 0
    while page <= 634:
        times = 10
        for i in range(times + 1):
            # Scroll to the bottom so the lazily loaded hotel entries render.
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)
        pageSource = driver.page_source
        soup = BeautifulSoup(pageSource, "html.parser")
        if soup:
            hotel_info = soup.findAll('li', attrs={'class': 'hotel_item_name'})
            for x in hotel_info:
                hotel_name_info = x.find('h2', attrs={
                    'class': 'hotel_name'
                }).find('a')
                # tracevalue looks like "...;hotelid=12345;...": take the
                # second field and strip the key.
                hotel_id = hotel_name_info.get('tracevalue').split(
                    ';')[1].replace('hotelid=', '')
                hotel_name = hotel_name_info.get('title')
                url_ctm = hotel_name_info.get('data-ctm')
                # Default check-in/check-out to tomorrow's date.
                today = str(
                    datetime.datetime.strftime(
                        datetime.datetime.today() +
                        datetime.timedelta(days=1), '%Y-%m-%d'))
                hotel_url = ('http://hotels.ctrip.com/' +
                             hotel_name_info.get('href') + '&checkIn=' +
                             today + '&checkOut=' + today + url_ctm)
                hotel_address_info = x.find(
                    'p', attrs={'class': 'hotel_item_htladdress'})
                # The address text reads "【district】address。...": keep the
                # part between 】 and the first 。
                hotel_address = hotel_address_info.text.split(
                    '。')[0].split('】')[-1]
                print hotel_id, hotel_name, hotel_url, hotel_address
                colum_name = [
                    'hotel_id', 'hotel_name', 'hotel_url', 'hotel_address'
                ]
                # Fall back to empty strings so the insert never sees None.
                value_list = [
                    hotel_id if hotel_id else '',
                    hotel_name if hotel_name else '',
                    hotel_url if hotel_url else '',
                    hotel_address if hotel_address else ''
                ]
                if hotel_name:
                    conn_db.dataframe_to_mysql().insert_to_mysql_flow(
                        self.mysql_db_table, colum_name, value_list, 0,
                        len(colum_name) - 1, conn_db.conn_localhost)
        print '---------------------------------------------------------------'
        # Selenium XPath usage: find the <a> tag whose text contains
        # "下一页" ("next page") and click it.
        driver.find_element_by_xpath(
            "//a[contains(text(),'下一页')]").click()
        page = page + 1
        time.sleep(2)
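# The hard-coded page counts (580 for elong, 634 for ctrip) will drift as
# listings change. A sketch of a data-driven stopping test: break out of
# the while loop when no "next page" link is left. Whether the link
# disappears or is merely disabled on the last page is an assumption to
# verify against the live site.
from selenium.common.exceptions import NoSuchElementException


def has_next_page(driver):
    try:
        driver.find_element_by_xpath(u"//a[contains(text(),'下一页')]")
        return True
    except NoSuchElementException:
        return False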