Example #1
0
    def get_base_info(self):
        """Scrape hotel base info (id, name, longitude/latitude, address)
        from elong.com's Chengdu hotel listing and insert each row into MySQL.

        Drives a PhantomJS browser, repeatedly scrolls each result page to
        the bottom so lazily-loaded hotels render, parses the page source
        with BeautifulSoup, then clicks through the "next page" link.
        Python 2 code (uses print statements).
        """

        # NOTE(review): PhantomJS is deprecated in recent Selenium releases,
        # and the executable path is machine-specific.
        driver = webdriver.PhantomJS(
            executable_path=r'D:\python2.7\phantomjs\bin\phantomjs.exe')

        driver.get('http://hotel.elong.com/chengdu/')
        time.sleep(2)
        page = 0
        hotels_inf = []  # NOTE(review): only used by the commented-out file dump below
        while page <= 580:  # presumably 581 listing pages -- TODO confirm
            times = 10
            # Scroll to the bottom (times + 1) times so lazy content loads.
            for i in range(times + 1):
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(5)
            pageSource = driver.page_source
            # html=pageSource.decode("utf-8")
            soup = BeautifulSoup(pageSource, "html.parser")
            if soup:
                # Each <span class="l1"> carries the hotel data as attributes.
                hotel_info = soup.findAll('span', attrs={'class': 'l1'})

                for x in hotel_info:
                    hotel_name = x.get('title')
                    logitude = x.get('data-lng')  # (sic) longitude
                    latitude = x.get('data-lat')
                    hotel_id = x.get('data-hotelid')
                    hotel_address = x.get('data-hoteladdress')
                    print hotel_id, hotel_name, logitude, latitude, hotel_address
                    colum_name = [
                        'hotel_id', 'hotel_name', 'hotel_address', 'logitude',
                        'latitude'
                    ]
                    # Missing attributes are stored as empty strings, not None.
                    value_list = [
                        hotel_id if hotel_id else '',
                        hotel_name if hotel_name else '',
                        hotel_address if hotel_address else '',
                        logitude if logitude else '',
                        latitude if latitude else ''
                    ]
                    if hotel_name:  # skip rows without a usable name
                        conn_db.dataframe_to_mysql().insert_to_mysql_flow(
                            self.mysql_db_table, colum_name, value_list, 0,
                            len(colum_name) - 1, conn_db.conn_localhost)

            print '---------------------------------------------------------------'
            # Selenium XPath usage: find the <a> tag whose text contains the
            # Chinese for "next page" and click it.
            driver.find_element_by_xpath("//a[contains(text(),'下一页')]").click(
            )  # advance to the next listing page
            page = page + 1
            time.sleep(2)  # sleep 2s so the page finishes loading before reading its HTML
            # with open(date+".txt","w") as f:
            #     for hotel_inf in hotels_inf:
            #         for hotel_attr in hotel_inf:
            #             print hotel_attr
            #             f.write(hotel_attr.encode('utf8')+' ')
            #         f.write('\n')
        # driver.get('http://hotel.elong.com/chengdu/')
        # time.sleep(2)
        driver.close()
Example #2
0
    def get_detail_info(self, url, hotel_id, proxy):
        """Fetch one Qunar hotel detail page and insert its info into MySQL.

        Parameters:
            url: detail-page URL to download.
            hotel_id: identifier stored alongside the scraped fields.
            proxy: proxy mapping handed to the downloader.

        Writes one row (hotel_id, name, address, longitude, latitude, phone)
        into ``self.mysql_db_table`` when a hotel name could be parsed.
        Returns None; silently skips pages whose expected layout is missing
        (the original crashed with AttributeError/IndexError in that case).
        """
        soup = qunar_spider().download_soup_waitting(url, proxy)
        if not soup:
            return

        info_div = soup.find('div', attrs={'class': 'hotel-info-left'})
        if info_div is None:
            # Layout changed or an anti-bot page was served; nothing to parse.
            return
        title = info_div.find('h1')
        if title is None:
            return
        hotel_name = title.text

        address_node = soup.find(
            'div', attrs={'class': 'text-overflow qt-lh hotel-address'})
        hotel_address = address_node.text if address_node else ''

        # 'data-gpoint' holds "lng,lat"; guard against a malformed value
        # (the original indexed [1] unconditionally and could IndexError).
        location = title.get('data-gpoint')
        parts = location.split(',') if location else []
        logitude = parts[0] if len(parts) > 0 else ''  # (sic) longitude
        latitude = parts[1] if len(parts) > 1 else ''

        phone_div = soup.find('div', attrs={'class': 'text-overflow qt-lh'})
        hotel_mobile = ''
        if phone_div is not None:
            phone_spans = phone_div.findAll('span')
            print(phone_spans)
            if phone_spans:
                # Landline with optional 3/4-digit area code, or an
                # 11-digit mobile number starting with 1.
                numbers = re.findall(r'\(?0\d{2,3}[) -]?\d{7,8}|1\d{10}',
                                     str(phone_spans))
                if numbers:
                    hotel_mobile = '|'.join(numbers)

        colum_name = [
            'hotel_id', 'hotel_name', 'hotel_address', 'logitude',
            'latitude', 'hotel_mobile'
        ]
        value_list = [
            hotel_id, hotel_name, hotel_address, logitude, latitude,
            hotel_mobile
        ]
        if hotel_name:
            # NOTE(review): sibling scrapers pass len(colum_name) - 1 here;
            # this one passes len(colum_name) -- confirm which is intended.
            conn_db.dataframe_to_mysql().insert_to_mysql_flow(
                self.mysql_db_table, colum_name, value_list, 0,
                len(colum_name), conn_db.conn_localhost)
Example #3
0
    def get_hotel_id(self):
        """Return candidate hotel ids 1..self.n-1 not yet present in the
        online_db.spider_qunar_data table, shuffled into random order.
        """
        sql_hotel_sql = 'select hotel_id from online_db.spider_qunar_data group by hotel_id'
        crawled = conn_db.dataframe_to_mysql().get_mysql_data(conn=conn_db.conn_localhost,sql=sql_hotel_sql)
        # Flatten the one-column DataFrame rows into a set: O(1) membership
        # instead of the original O(m) scan over a list of 1-element ndarrays.
        crawled_ids = {row[0] for row in crawled.values}
        hotel_id = [i for i in range(1, self.n) if i not in crawled_ids]
        random.shuffle(hotel_id)
        return hotel_id
Example #4
0
    def get_proxy_ip(self):
        """Load every proxy stored in online_db.proxy_ip_info and return a
        list of single-entry dicts of the form {scheme: "http://ip:port"},
        the shape expected by requests' ``proxies`` argument.
        """
        sql = 'select proxy_ip,port,get_fun from online_db.proxy_ip_info'
        proxy_info = conn_db.dataframe_to_mysql().get_mysql_data(conn=conn_db.conn_localhost,sql=sql)
        # One dict per DB row: get_fun is the scheme key, proxy_ip:port the value.
        return [
            {scheme: u"http://" + str(host) + u":" + str(port)}
            for host, port, scheme in proxy_info.values
        ]
Example #5
0
def validateIp(n):
    """Probe each proxy returned by getProxyIp(n) against a Qunar page and
    persist the working ones (HTTP 200) into online_db.proxy_ip_info.

    Python 2 code (print statements, `except Exception, e`).
    """
    url = "http://touch.qunar.com/hotel/chengdu"
    # NOTE(review): `f` is opened but never written to or closed, and the
    # path is missing a backslash after "D:" -- likely leftover/buggy code.
    f = open("D:fangb_ip.txt", "w")
    socket.setdefaulttimeout(3)
    proxy = getProxyIp(n)  # entries appear to be "ip port scheme" strings -- TODO confirm
    for i in range(0, len(proxy)):
        try:
            ip = proxy[i].strip().split(" ")
            proxy_host = "http://" + ip[0] + ":" + ip[1]
            proxy_temp = {ip[2]: proxy_host}
            print(proxy_temp)
            # A proxied GET that returns 200 within 10s counts as "working".
            res = requests.get(url,
                               proxies=proxy_temp,
                               headers=headers,
                               timeout=10)
            print('response_code:' + str(res.status_code))
            if res.status_code == 200:
                # Persist ip / port / scheme for later reuse.
                conn_db.dataframe_to_mysql().insert_to_mysql_flow(
                    'online_db.proxy_ip_info', ['proxy_ip', 'port', 'get_fun'],
                    [ip[0], ip[1], ip[2]], 0, 2, conn_db.conn_localhost)
                print proxy[i]
        except Exception, e:
            # NOTE(review): broad catch keeps the loop alive on bad proxies
            # and timeouts, but it also hides programming errors.
            print e
            continue
    def get_base_info(self):
        """Scrape hotel id/name/url/address from Ctrip's Chengdu hotel
        listing and insert each row into MySQL.

        Drives a PhantomJS browser, scrolls each result page to the bottom
        to force lazy content to load, parses with BeautifulSoup, then pages
        via the "next page" link. Python 2 code (print statements).
        NOTE(review): this method may be truncated here -- no driver.close()
        is visible after the paging loop.
        """

        driver = webdriver.PhantomJS(
            executable_path=
            r'D:\Python27\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')

        driver.get('http://hotels.ctrip.com/hotel/chengdu28/')
        time.sleep(2)
        page = 0
        while page <= 634:  # presumably 635 listing pages -- TODO confirm
            times = 10
            # Scroll to the bottom (times + 1) times so lazy content loads.
            for i in range(times + 1):
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                # js="var q=document.body.scrollTop=100000"
                # driver.execute_script(js)
                time.sleep(5)

            pageSource = driver.page_source
            # html=pageSource.decode("utf-8")
            soup = BeautifulSoup(pageSource, "html.parser")
            if soup:
                hotel_info = soup.findAll('li',
                                          attrs={'class': 'hotel_item_name'})
                for x in hotel_info:
                    hotel_name_info = x.find('h2',
                                             attrs={
                                                 'class': 'hotel_name'
                                             }).find('a')
                    # tracevalue looks like "...;hotelid=NNN;..." -- take the id part.
                    hotel_id = hotel_name_info.get('tracevalue').split(
                        ';')[1].replace('hotelid=', '')
                    hotel_name = hotel_name_info.get('title')
                    url_ctm = hotel_name_info.get('data-ctm')
                    # Tomorrow's date, used for check-in/check-out parameters.
                    today = str(
                        datetime.datetime.strftime(
                            datetime.datetime.today() +
                            datetime.timedelta(days=1), '%Y-%m-%d'))
                    # NOTE(review): '&checkIn' / '&checkOut' are missing '='
                    # before the date -- looks like a bug, left as-is here.
                    hotel_url = 'http://hotels.ctrip.com/' + hotel_name_info.get(
                        'href'
                    ) + '&checkIn' + today + '&checkOut' + today + url_ctm

                    hotel_address_info = x.find(
                        'p', attrs={'class': 'hotel_item_htladdress'})
                    # Keep the text before the first Chinese full stop and
                    # drop any leading 【...】 bracketed tag.
                    hotel_address = hotel_address_info.text.split(
                        '。')[0].split('】')[-1]
                    print hotel_id, hotel_name, hotel_url, hotel_address
                    colum_name = [
                        'hotel_id', 'hotel_name', 'hotel_url', 'hotel_address'
                    ]
                    # Missing attributes are stored as empty strings, not None.
                    value_list = [
                        hotel_id if hotel_id else '',
                        hotel_name if hotel_name else '',
                        hotel_url if hotel_url else '',
                        hotel_address if hotel_address else ''
                    ]
                    if hotel_name:  # skip rows without a usable name
                        conn_db.dataframe_to_mysql().insert_to_mysql_flow(
                            self.mysql_db_table, colum_name, value_list, 0,
                            len(colum_name) - 1, conn_db.conn_localhost)

            print '---------------------------------------------------------------'
            # Selenium XPath usage: find the <a> tag whose text contains the
            # Chinese for "next page" and click it.
            driver.find_element_by_xpath("//a[contains(text(),'下一页')]").click(
            )  # advance to the next listing page
            page = page + 1
            time.sleep(2)