Example #1
import datetime
import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup

# Assumed project-local helpers (module paths are guesses): ElementConstant
# (Excel column constants), GetIpProxy (proxy pool), generate_excle (Excel
# writer), hds (User-Agent list), cityMap (city -> lianjia.com subdomain).
from element_constant import ElementConstant
from get_ip_proxy import GetIpProxy
from generate_excle import generate_excle
from constants import hds, cityMap


class chengJiaoInfo:
    # Constructor
    def __init__(self, city, url=None):
        self.city = city
        self.elementConstant = ElementConstant()
        self.getIpProxy = GetIpProxy()
        #        self.url = "https://bj.lianjia.com/ershoufang/pg{}/"
        if url is None:
            self.url = "https://%s.lianjia.com/chengjiao/pg{}/" % cityMap[
                self.city]
        else:
            self.url = url
        self.infos = {}
        self.proxyServer = ()
        # Excel writer used to build the output workbook
        self.generate_excle = generate_excle()

    # Generate the listing-page URLs for the requested number of pages
    def generate_allurl(self, user_in_nub):
        for url_next in range(1, int(user_in_nub) + 1):
            self.page = url_next
            yield self.url.format(url_next)
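
    # e.g. for a city that cityMap resolves to the "bj" subdomain,
    # generate_allurl(2) yields https://bj.lianjia.com/chengjiao/pg1/
    # and then https://bj.lianjia.com/chengjiao/pg2/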

    # Entry point
    def start(self):
        self.generate_excle.addSheetExcle(u'在售列表')
        user_in_nub = 100  # input('Number of pages to generate: ')
        for i in self.generate_allurl(user_in_nub):
            try:
                print('page', i)
                url_count = self.get_allurl(i)
                while url_count == 0:
                    print("error get item url.", i)
                    time.sleep(3600)
                    self.proxyServer = self.getIpProxy.get_random_ip()
                    url_count = self.get_allurl(i)
                if self.page % 5 == 0:
                    self.saveResult()
            except Exception as e:
                print(i, e, 'failed')
        self.saveResult()

    def saveResult(self):
        date = str(datetime.datetime.now().date())
        dirName = 'data/chengjiao-%s' % self.city
        if not os.path.exists(dirName):
            os.makedirs(dirName)
        self.generate_excle.saveExcle(
            '%s/%s-%s.xls' % (dirName, date, self.city.split('/')[-1]))
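
    # e.g. for city '北京' on 2020-01-01 this writes
    # data/chengjiao-北京/2020-01-01-北京.xls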

    def get_allurl(self, generate_allurl):
        geturl = self.requestUrlForRe(generate_allurl)
        url_count = 0
        if geturl.status_code == 200:
            # Extract the detail-page link for each listing
            re_set = re.compile('<li.*?<a.*?class="img.*?".*?href="(.*?)"')
            re_get = re.findall(re_set, geturl.text)
            url_count = len(re_get)
            for index in range(len(re_get)):
                print(re_get[index])
                url_status = self.open_url(re_get[index], index)
                while not url_status:
                    # Rotate the proxy and retry until the detail page loads
                    self.proxyServer = self.getIpProxy.get_random_ip()
                    url_status = self.open_url(re_get[index], index)
        elif geturl.status_code == 404:
            url_count = -1
        return url_count

    def open_url(self, re_get, index):
        print(re_get, index)
        res = self.requestUrlForRe(re_get)
        if res.status_code == 200:
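            # The masked marker below ('*****@*****.**', as published) seems
            # to flag a blocked or placeholder page; returning False tells
            # the caller to rotate the proxy and retry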
            if '*****@*****.**' in res.text:
                return False
            soup = BeautifulSoup(res.text, 'lxml')
            self.infos['网址'] = re_get
            self.infos['标题'] = soup.title.text.split('_')[0]
            self.infos['售价(万)'] = soup.find(class_='dealTotalPrice').text
            self.infos['每平方售价'] = soup.find(
                class_='record_detail').text.split('元')[0][2:]
            pattern = re.compile(
                '<li><span class="label">(.*?)</span>(.*?)</li>')
            result = re.findall(pattern, res.text)
            for item in result:
                if item[0] == '建成年代':
                    self.infos['建成时间:年'] = item[1].strip()
                else:
                    self.infos[item[0]] = item[1].strip()

            pattern = re.compile('<i>></i>(.*?)</a>')
            result = re.findall(pattern, res.text)
            self.infos['所属下辖区'] = result[1].split('>')[1].split('二手房')[0]
            self.infos['所属商圈'] = result[2].split('>')[1].split('二手房')[0]
            self.infos['所属小区'] = result[3].split('>')[1].split('二手房')[0]
            msg = soup.find(class_='msg').contents
            result = [(
                a.contents[1],
                a.contents[0].text,
            ) for a in msg]
            for item in result:
                if item[0] == '建成年代':
                    self.infos['建成时间:年'] = item[1].strip()
                else:
                    self.infos[item[0]] = item[1].strip()
            self.infos['成交时间'] = '-'.join(
                soup.find(
                    class_='wrapper').contents[1].text.split()[0].split('.'))
            row = index + (self.page - 1) * 30
            #self.infos['序号'] = row + 1
            self.infos['城市'] = self.city
            print('row:' + str(row))
            if row == 0:
                for index_item in self.elementConstant.data_constant.keys():
                    self.generate_excle.writeExclePositon(
                        0, self.elementConstant.data_constant.get(index_item),
                        index_item)

                self.wirte_source_data(1)

            else:
                row = row + 1
                self.wirte_source_data(row)
        return True

    # Unified request wrapper: rotates proxies and User-Agent headers to
    # reduce server-side blocking
    def requestUrlForRe(self, url):
        try:
            if len(self.proxyServer) == 0:
                tempProxyServer = self.getIpProxy.get_random_ip()
            else:
                tempProxyServer = self.proxyServer
            proxy_dict = {tempProxyServer[0]: tempProxyServer[1]}
            tempUrl = requests.get(url,
                                   headers=hds[random.randint(0,
                                                              len(hds) - 1)],
                                   proxies=proxy_dict)
            code = tempUrl.status_code
            # print(tempUrl.text)
            if 200 <= code < 300:
                self.proxyServer = tempProxyServer
                return tempUrl
            else:
                # Non-2xx response: rotate the proxy and retry
                self.proxyServer = self.getIpProxy.get_random_ip()
                return self.requestUrlForRe(url)
        except Exception:
            # Request failed: rotate the proxy, disable keep-alive, and retry
            # (recurses without a retry limit, as in the original)
            self.proxyServer = self.getIpProxy.get_random_ip()
            s = requests.session()
            s.keep_alive = False
            return self.requestUrlForRe(url)
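
    # NOTE (assumption): get_random_ip() returns a (scheme, address) pair,
    # e.g. ('http', 'http://1.2.3.4:8080'), since requestUrlForRe builds
    # proxies={scheme: address} from it.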

    # Write the scraped values from the infos dict into the Excel sheet;
    # the output format rules can be adjusted here
    def wirte_source_data(self, row):
        for itemKey in self.infos.keys():
            # print(itemKey + ':' + str(self.infos.get(itemKey)))
            item_valus = self.infos.get(itemKey)
            # Python 3: pass the key as str (the original Python 2 code
            # encoded it to UTF-8 bytes before the lookup)
            tempItemKey = self.elementConstant.unit_check_name(itemKey)
            count = self.elementConstant.data_constant.get(tempItemKey)
            print(tempItemKey, count, item_valus)
            if tempItemKey is not None and count is not None:
                self.generate_excle.writeExclePositon(row, count, item_valus)
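
# Example usage (an addition, not part of the original): run the deals
# spider for Beijing; assumes '北京' is a key in cityMap.
#
#     chengJiaoInfo('北京').start()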

class salingInfo:
    # Constructor
    def __init__(self):
        self.elementConstant = ElementConstant()
        self.getIpProxy = GetIpProxy()
        self.url = "https://bj.lianjia.com/ershoufang/pg{}/"
        #        self.url = "https://bj.lianjia.com/chengjiao/pg{}/"
        self.infos = {}
        self.proxyServer = ()
        # Excel writer used to build the output workbook
        self.generate_excle = generate_excle()

    # Generate the listing-page URLs for the requested number of pages
    def generate_allurl(self, user_in_nub):
        for url_next in range(1, int(user_in_nub) + 1):
            self.page = url_next
            yield self.url.format(url_next)

    # Entry point
    def start(self):
        self.generate_excle.addSheetExcle(u'在售列表')
        user_in_nub = input('Number of pages to generate: ')

        for i in self.generate_allurl(user_in_nub):
            self.get_allurl(i)
            print(i)
        self.generate_excle.saveExcle('LianJiaSpider.xls')

    def get_allurl(self, generate_allurl):
        geturl = self.requestUrlForRe(generate_allurl)
        if geturl.status_code == 200:
            # Extract the detail-page link for each listing
            re_set = re.compile('<li.*?<a.*?class="img.*?".*?href="(.*?)"')
            re_get = re.findall(re_set, geturl.text)
            for index in range(len(re_get)):
                self.open_url(re_get[index], index)
                print(re_get[index])

    def open_url(self, re_get, index):
        res = self.requestUrlForRe(re_get)
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, 'lxml')
            self.infos['网址'] = re_get
            self.infos['标题'] = soup.select('.main')[0].text
            self.infos['总价'] = soup.select('.total')[0].text + u'万'
            self.infos['每平方售价'] = soup.select('.unitPriceValue')[0].text

            self.infos['户型'] = soup.select('.mainInfo')[0].text
            self.infos['朝向'] = soup.select('.mainInfo')[1].text
            self.infos['大小'] = soup.select('.mainInfo')[2].text
            self.infos['楼层'] = soup.select('.subInfo')[0].text
            self.infos['装修'] = soup.select('.subInfo')[1].text
            self.infos['房子类型'] = soup.select('.subInfo')[2].text

            self.infos['小区名称'] = soup.select('.info')[0].text
            self.infos['区域'] = soup.select('.info > a')[0].text
            # infos['地区'] = soup.select('.info > a')[1].text
            self.infos['详细区域'] = soup.select('.info')[1].text
            self.infos['链家编号'] = soup.select('.info')[3].text
            self.infos['关注房源'] = soup.select('#favCount')[0].text + u"人关注"
            self.infos['看过房源'] = soup.select('#cartCount')[0].text + u"人看过"

            pattern = re.compile(
                '<li><span class="label">(.*?)</span>(.*?)</li>')
            result = re.findall(pattern, res.text)

            for item in result:
                if item[0] != u"抵押信息" and item[0] != u"房本备件":
                    self.infos[item[0]] = item[1]
            row = index + (self.page - 1) * 30
            self.infos['序号'] = row + 1
            self.infos['状态'] = u'在售'
            self.infos['城市'] = u'北京'
            print('row:' + str(row))
            if row == 0:
                for index_item in self.elementConstant.data_constant.keys():
                    self.generate_excle.writeExclePositon(
                        0, self.elementConstant.data_constant.get(index_item),
                        index_item)

                self.wirte_source_data(1)

            else:
                row = row + 1
                self.wirte_source_data(row)
        return self.infos

    # Unified request wrapper: rotates proxies and User-Agent headers to
    # reduce server-side blocking
    def requestUrlForRe(self, url):
        try:
            if len(self.proxyServer) == 0:
                tempProxyServer = self.getIpProxy.get_random_ip()
            else:
                tempProxyServer = self.proxyServer

            proxy_dict = {tempProxyServer[0]: tempProxyServer[1]}
            tempUrl = requests.get(url,
                                   headers=hds[random.randint(0,
                                                              len(hds) - 1)],
                                   proxies=proxy_dict)

            code = tempUrl.status_code
            if 200 <= code < 300:
                self.proxyServer = tempProxyServer
                return tempUrl
            else:
                # Non-2xx response: rotate the proxy and retry
                self.proxyServer = self.getIpProxy.get_random_ip()
                return self.requestUrlForRe(url)
        except Exception:
            # Request failed: rotate the proxy, disable keep-alive, and retry
            # (recurses without a retry limit, as in the original)
            self.proxyServer = self.getIpProxy.get_random_ip()
            s = requests.session()
            s.keep_alive = False
            return self.requestUrlForRe(url)

    # Write the scraped values from the infos dict into the Excel sheet;
    # the output format rules can be adjusted here
    def wirte_source_data(self, row):
        for itemKey in self.infos.keys():
            print(itemKey + ':' + str(self.infos.get(itemKey)))

            item_valus = self.infos.get(itemKey)
            if itemKey == '详细区域':
                # The detail-area string splits on whitespace into district,
                # business circle, and ring-road columns
                temps_item_valus = item_valus.split()
                print(temps_item_valus[0], temps_item_valus[1],
                      temps_item_valus[2])
                self.generate_excle.writeExclePositon(
                    row, self.elementConstant.data_constant.get('所属下辖区'),
                    temps_item_valus[0])
                self.generate_excle.writeExclePositon(
                    row, self.elementConstant.data_constant.get('所属商圈'),
                    temps_item_valus[1])
                self.generate_excle.writeExclePositon(
                    row, self.elementConstant.data_constant.get('所属环线'),
                    temps_item_valus[2])
            else:
                # Python 3: pass the key as str (the original Python 2 code
                # encoded it to UTF-8 bytes before the lookup)
                tempItemKey = self.elementConstant.unit_check_name(itemKey)
                count = self.elementConstant.data_constant.get(tempItemKey)
                print(tempItemKey, count, item_valus)
                if tempItemKey is not None and count is not None:
                    # TODO: review these unit-trimming rules and adjust the
                    # logic as the page format changes
                    if tempItemKey == '链家编号':
                        item_valus = item_valus[:-2]
                    elif tempItemKey == '单价(元/平米)':
                        item_valus = item_valus[:-4]
                    elif tempItemKey == '建筑面积:平米':
                        item_valus = item_valus[:-1]
                    elif tempItemKey == '建成时间:年':
                        item_valus = item_valus[:item_valus.index('年')]
                    elif tempItemKey == '关注(人)' or tempItemKey == '看过房源:人':
                        item_valus = item_valus[:-3]
                    elif tempItemKey == '挂牌时间' or tempItemKey == '上次交易时间':
                        item_valus = item_valus.replace('-', '/')

                    self.generate_excle.writeExclePositon(row, count,
                                                          item_valus)
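

# Minimal usage sketch (an addition, not part of the original example):
# prompts for a page count, scrapes Beijing ershoufang listings, and saves
# LianJiaSpider.xls in the working directory.
if __name__ == '__main__':
    salingInfo().start()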