Example 1
    def parse_datas(self, soup):
        totalfind = soup.select("h2.total.fl > span")
        if 0 == ToolsBox.strToInt(totalfind[0].get_text()): return '0'
        page_datas = []
        communitys = soup.select("div.info>div.title>a")
        regions = soup.select('a.district')
        blocks = soup.select('a.bizcircle')
        prices = soup.select('.totalPrice>span')
        forsales = soup.select('.totalSellCount>span')
        buildyears = soup.select('.positionInfo')

        for community, region, block, price, forsale, buildyear in zip(
                communitys, regions, blocks, prices, forsales, buildyears):
            each_data = dict()
            each_data['community_name'] = community.get_text()
            each_data['community_url'] = community.get('href')
            each_data['region'] = region.get_text()
            each_data['block'] = block.get_text()
            each_data['builded_year'] = ToolsBox.strToInt(buildyear.get_text())
            each_data['forsale_num'] = ToolsBox.strToInt(forsale.get_text())
            each_data['price'] = ToolsBox.strToInt(price.get_text())
            # each_data['date']
            each_data['from'] = "LJ"
            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)
            # ToolsBox.printDic(page_datas)

        return page_datas
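
A note on the pattern: Example 1, like most of the parsers below, runs a handful of CSS selectors that each return a parallel list of tags, then zip()s those lists and builds one dict per listing. The sketch below is a self-contained illustration of that select() + zip() technique only; the HTML and field values are invented for the demo and are not taken from any of the sites parsed here.

    from bs4 import BeautifulSoup

    html = '''
    <div class="info"><div class="title"><a href="/c1">Community A</a></div></div>
    <div class="totalPrice"><span>320</span></div>
    <div class="info"><div class="title"><a href="/c2">Community B</a></div></div>
    <div class="totalPrice"><span>455</span></div>
    '''
    soup = BeautifulSoup(html, 'html.parser')

    names = soup.select('div.info > div.title > a')    # one parallel result list per field
    prices = soup.select('.totalPrice > span')

    page_datas = []
    for name, price in zip(names, prices):              # zip() pairs the lists row by row
        page_datas.append({
            'community_name': name.get_text(),
            'community_url': name.get('href'),
            'price': int(price.get_text()),
        })
    print(page_datas)
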
Example 2
    def parse_datas(self,soup):

        page_datas = []

        titles = soup.select("h1 > a")
        infos = soup.select("p.house_info")
        hots = soup.select("p.house_hot")
        areas = soup.select("div.the_area span")
        prices = soup.select("div.the_price span")
        splitby = re.compile(r']|,|\s')

        for title, info, hot, area, price in zip(titles, infos, hots, areas, prices):

            each_data = {'advantage': '', 'builded_year': 0, 'spatial_arrangement': '', 'floor_index': 0,
                         'total_floor': 0}

            each_data['title'] = title.get_text()
            each_data['details_url'] = 'http://xm.maitian.cn' + title.get('href')

            try:
                each_data['total_price'] = ToolsBox.strToInt(price.get_text())
            except Exception as e:
                with open('logtest.txt', 'a+') as fout:
                    fout.write('*************' + str(datetime.datetime.now()) + '*************\n')
                    fout.write('Maitian: failed to parse total_price; raw data: ' + price.get_text())
                    traceback.print_exc(file=fout)
                    print(traceback.format_exc())

            try:
                each_data['block'] = info.get_text().strip()
                each_data['community_name'] = splitby.split(each_data['block'])[-1].strip()
                each_data['block'] = each_data['block'].replace(each_data['community_name'],'')
            except Exception as e:
                with open('logtest.txt', 'a+') as fout:
                    fout.write('*************' + str(datetime.datetime.now()) + '*************\n')
                    fout.write('Failed to parse: %s \n' % info.get_text())
                    traceback.print_exc(file=fout)
                    print(traceback.format_exc())

            # Maitian's format: this section holds the layout, selling points and floor info
            temp = ToolsBox.clearStr(hot.text).split('|')
            for item in temp:
                d1 = self.parse_item(item)
                each_data = self.add_advantage(d1, each_data)   #each_data = dict(each_data, **d1)

            # Parse the floor area
            each_data = dict(each_data, **self.parse_item(area.get_text()))

            each_data['from'] = "MT"

            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data)

        return page_datas
Example 3
    def parse_datas(self, soup):
        i = 1
        page_datas = []

        details = soup.select('dd.detail ')
        hrefs = soup.select('span.c_blue0041d9.aVisited.f14B > a')
        comms = soup.select('span.xuzhentian > a')
        prices = soup.select('span > em')

        for detail, href, comm, price in zip(details, hrefs, comms, prices):

            each_data = dict(advantage='',
                             builded_year=0,
                             spatial_arrangement='',
                             floor_index=0,
                             total_floor=0)
            each_data['title'] = href.get_text().strip()
            each_data['community_name'] = comm.get_text().strip()
            each_data['details_url'] = "http://esf.xmhouse.com" + href.get(
                'href')
            each_data['total_price'] = ToolsBox.strToInt(price.get_text())
            h_infos = re.search(r'<span style="margin-left: 5px; color: #000000">.*</span>(.*) <div', str(detail), re.S) \
                .group(1).replace('<br/>', '').replace('\r\n', '').replace(' ', '').split(',')

            for item in h_infos:
                try:
                    d1 = {}
                    d1 = self.parse_item(item)
                    each_data = self.add_advantage(
                        d1, each_data)  #each_data = dict(each_data, **d1)
                except Exception as e:
                    with open('logtest.txt', 'a+') as fout:
                        fout.write('*************' +
                                   str(datetime.datetime.now()) +
                                   '*************\n')
                        fout.write('      Data retrieved: ')
                        for i1 in h_infos:
                            fout.write(i1 + ',')
                        fout.write('\n      XmParser: the item that failed to parse is: ' +
                                   str(item) + '\n')
                        traceback.print_exc(file=fout)
                        print(traceback.format_exc())

            each_data['from'] = "XMHouse"
            # ToolsBox.printDic(each_data)
            # print('******************{0}******************'.format(i))
            # i += 1

            each_data = self.pipe(each_data)
            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas
Example 4
    def parse_datas(self, soup):
        totalfind = soup.select("span.tit em")
        if 0 == ToolsBox.strToInt(totalfind[1].get_text()): return '0'
        page_datas = []
        communitys = soup.select("h3 > a")
        adds = soup.select('.li-info>address')
        dates = soup.select('p.date')
        prices = soup.select('p>strong')
        forsales = soup.select('p.bot-tag>span>a')
        for community, add, date, price, forsale in zip(
                communitys, adds, dates, prices, forsales):
            each_data = dict()
            each_data['community_name'] = community.get('title')
            each_data['community_url'] = community.get('href')
            add1 = ToolsBox.clearStr(add.get_text())
            addlist = add1.split(']')
            if len(addlist) > 1:
                regionlist = addlist[0].replace('[', '').split('-')
                if len(regionlist) > 1:
                    each_data['region'], each_data['block'] = regionlist
                else:
                    each_data['region'] = regionlist[0]  # take the string itself, not the one-element list
                each_data['address'] = addlist[1]
            else:
                each_data['address'] = add1
            each_data['builded_year'] = ToolsBox.strToInt(date.get_text())
            each_data['forsale_num'] = ToolsBox.strToInt(forsale.get_text())
            each_data['price'] = ToolsBox.strToInt(price.get_text())
            # each_data['date']
            each_data['from'] = "AJK"

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)
            # ToolsBox.printDic(page_datas)
        return page_datas
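
All of these examples lean on ToolsBox.strToInt to pull a number out of strings such as prices or build years. The helper's real implementation is not shown in these snippets; a minimal stand-in consistent with how it is called here might look like this (an assumption, not the project's actual code):

    import re

    def strToInt(text):
        """Hypothetical stand-in for ToolsBox.strToInt: return the first integer in text, else 0."""
        match = re.search(r'\d+', str(text))
        return int(match.group()) if match else 0

    # e.g. strToInt('320万') -> 320, strToInt('共25层') -> 25, strToInt('暂无') -> 0
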
Example 5
    def parse_datas(self, soup):

        page_datas = []

        details = soup.select(".houseInfo")
        comms = soup.select(".positionInfo a")
        prices = soup.select(".totalPrice")
        titles = soup.select("div.title a.CLICKDATA")

        for title, detail, price, comm in zip(titles, details, prices, comms):
            each_data = dict(builded_year=0,
                             spatial_arrangement='',
                             floor_index=0,
                             total_floor=0,
                             details_url=title.get('href'),
                             advantage='')
            each_data['title'] = title.get_text().strip()

            houseInfos = re.split(r'\s*[|,\s]\s*',
                                  ToolsBox.clearStr(detail.get_text()))
            # print(houseInfos)
            # print("1"*20)
            each_data['community_name'] = comm.get_text().strip()
            if len(each_data['community_name']) >= 20:
                input(each_data['community_name'] + ':' +
                      str(len(each_data['community_name'])))

            # houseInfos = houseInfos[1:]         # the first item is the community name; slice it off
            for item in houseInfos:
                # print(item)
                d1 = self.parse_item(item)
                each_data = self.add_advantage(d1, each_data)

            each_data['total_price'] = ToolsBox.strToInt(price.get_text())
            each_data['from'] = "Beike"
            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)
            # print(each_data)

        if not page_datas:
            item_num = soup.select(".fl span")
            if item_num:
                page_datas = item_num[0].get_text().strip()

        return page_datas
Example 6
    def parse_datas(self, soup):

        page_datas = []

        items = soup.select('div.info')
        titles = soup.select('p.title a ')
        comms = soup.select('p.hlistP  a span')
        addresses = soup.select('p.hlistP a.addressChange')
        regions = soup.select('p.hlistP > span')
        mores = soup.select('.moreInfo')
        prices = soup.select('.price')

        for item,title,comm,addr,region,price,more in \
                zip(items,titles,comms,addresses,regions,prices,mores):

            each_data = dict(builded_year=0,
                             spatial_arrangement='',
                             floor_index=0,
                             total_floor=0)

            each_data['title'] = title.get_text()
            each_data['details_url'] = 'http://www.917.com' + title.get('href')

            details = item.select('p')
            for string in details[1].stripped_strings:
                d1 = self.parse_item(string.strip())
                each_data = self.add_advantage(d1, each_data)

            each_data['community_name'] = comm.get_text()
            each_data['community_address'] = addr.get_text()
            each_data['region'] = region.get_text().replace('|', '').replace(
                ' ', '')
            each_data['total_price'] = ToolsBox.strToInt(price.get_text())
            each_data['from'] = "917"

            getP = more.select('p')
            for p in getP:
                if '建筑面积' in p.get_text():  # '建筑面积' = floor area; matched against the Chinese page text
                    d1 = self.parse_item(p.get_text().strip())
                    each_data = self.add_advantage(d1, each_data)

            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)
        return page_datas
Example 7
    def parse_datas(self,soup):
        page_datas = []

        titles = soup.select("div.title > a")
        houseinfo = soup.select("div.houseInfo")
        positionInfo = soup.select("div.positionInfo")
        totalprices = soup.select("div.totalPrice")
        #
        for title, info, position, totalPrice in zip(titles, houseinfo, positionInfo, totalprices):
            each_data = {'builded_year': 0, 'spatial_arrangement': '', 'floor_index': 0, 'total_floor': 0}
            each_data['title'] = title.get_text()
            each_data['details_url'] = title.get('href')
            each_data['total_price'] = ToolsBox.strToInt(totalPrice.get_text())

            info_item = info.get_text().split('|')

            # each_data['community_name'] = info_item[0].strip()  # the first item is always the community name
            for part in info_item:
                d1 = self.parse_item(part.strip())
                each_data = self.add_advantage(d1, each_data)

            position = position.get_text().replace('\t', '').replace('\n', '').split()
            each_data['community_name'] = position[0].strip()  # Oct 21: the community name moved to this position
            # print(position)
            each_data['block'] = position[-1]

            if ')' not in position[0]:  # Lianjia villas use the form '4层2008年建'; insert ')' so the string can be split
                position[0] = position[0].replace('层', '层)')

            for item in position[0].split(')'):  # 2017-04-01: Lianjia changed its format
                d1 = self.parse_item(item.strip())
                each_data = self.add_advantage(d1, each_data)
                # each_data = dict(each_data, **d1)

            each_data['from'] = "lianjia"

            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data)

        if not page_datas:
            total_num = soup.select('.total span')
            if total_num:
                page_datas = total_num[0].get_text().strip()
        return page_datas
Example 8
    def parse_datas(self, soup):

        page_datas = []

        # title = soup.select("title")
        # if len(title) > 0:
        #     print("The page's title is : {0}".format(title[0].get_text()))
        # else:
        #     print("There is no title finded!")

        titles = soup.select(".shop_list > dl h4 a")
        houses = soup.select("p.tel_shop")
        comms = soup.select(".shop_list > dl dd p.add_shop a")
        comm_addresses = soup.select(".shop_list > dl dd p.add_shop span")
        prices = soup.select(".price_right .red b")
        for title, comm, comm_addresse, house, price in zip(
                titles, comms, comm_addresses, houses, prices):
            each_data = dict(builded_year=0,
                             spatial_arrangement='',
                             floor_index=0,
                             total_floor=0,
                             advantage='')

            each_data['title'] = title.get('title')
            each_data['details_url'] = "https://xm.esf.fang.com" + title.get(
                'href')
            for item in house.children:
                if isinstance(item, bs4.element.NavigableString):
                    d1 = self.parse_item(ToolsBox.clearStr(item))
                    each_data = self.add_advantage(d1, each_data)

            each_data['community_name'] = comm.get('title').strip()
            each_data['community_address'] = comm_addresse.get_text().strip()
            each_data['comm_url'] = comm.get('href').strip()
            each_data['total_price'] = ToolsBox.strToInt(price.get_text())
            each_data['from'] = "Soufan"
            #
            each_data = self.pipe(each_data)
            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas
Example 9
    def parse_datas(self,soup):

        page_datas = []

        titles = soup.select("h2.fix a")
        houses = soup.select('p.moudle')
        houses1 = soup.select('td.sm222 p.msg')
        # comms = soup.select('span.comm-address')
        prices = soup.select('div.percent b')
        # print(titles)
        for title,detail,detail1,price in zip(titles,houses,houses1,prices):
            # each_data = {}
            each_data = dict(advantage='', builded_year=0, spatial_arrangement='', floor_index=0, total_floor=0)
            each_data['title'] = title.get_text()
            each_data['details_url'] = 'https://danxia.com' + title.get('href')

            each_data['community_name'] = detail.select('a')[0].get_text()
            temp = detail.select('span')
            for item in temp:
                d1 = self.parse_item(item.get_text())
                each_data = self.add_advantage(d1, each_data)
                # each_data = dict(each_data, **d1)

            temp1 = detail1.select('span')
            for item in temp1:
                d1 = self.parse_item(item.get_text())
                each_data = self.add_advantage(d1, each_data)
                # each_data = dict(each_data, **d1)

            each_data['total_price'] = ToolsBox.strToInt(price.get_text())

            each_data['from'] = "Danxia"

            each_data = self.pipe(each_data)  # 2016-06-04: added a dedicated data-processing step

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data)

        return page_datas
Example 10
    def parse_datas(self, soup):

        page_datas = []

        # details = soup.select("div.house-info")
        # comms = soup.select("div.house-info > a ")
        # positions = soup.select("div.house-position")
        # prices = soup.select("span.georgia")
        # titles = soup.select("h3 > a")
        # regions = soup.select(".region")

        # for title,comm,detail,position,price,region in zip(titles,comms,details,positions,prices,regions):

        # 2019-09-09: Leju redesigned its page layout
        titles = soup.select("div.title_in")
        d_urls = soup.select("div.title_in > a")
        adds = soup.select("div.address")
        infos = soup.select("div.house_info")
        prices = soup.select("div.price > span")
        for title, add, d_url, info, price in zip(titles, adds, d_urls, infos,
                                                  prices):
            each_data = dict(builded_year=0,
                             spatial_arrangement='',
                             floor_index=0,
                             total_floor=0,
                             title=title.get('title'))
            comms = add.select('span')
            each_data['community_name'] = ToolsBox.clearStr(
                comms[0].get_text())
            for comm in comms:
                comm = ToolsBox.clearStr(comm.get_text())
                if '-' != comm:
                    if '-' in comm:
                        c_item = comm.split('-')
                        each_data['region'] = c_item[0]
                        each_data['block'] = c_item[1]
                    if '年' in comm:  # '年' marks the build-year field
                        out = self.parse_item(comm)
                        each_data = self.add_advantage(out, each_data)
            h_info = info.select('span')
            for item in h_info:
                item = ToolsBox.clearStr(item.get_text())
                each_data = self.add_advantage(self.parse_item(item),
                                               each_data)
            each_data['details_url'] = 'https:' + d_url.get('href')
            each_data['total_price'] = ToolsBox.strToInt(price.get_text())

            # , details_url='http://xm.esf.leju.com' + title.get('href')
            # mr20 = detail.select("span.mr20")
            # posi = position.select("span")
            # for j in range(1,len(posi)):
            #     out = self.parse_item(posi[j].get_text())
            #     each_data = self.add_advantage(out, each_data)
            #     # if len(out) > 0:
            #     #     if ('advantage' in each_data.keys()) and ('advantage' in out.keys()):
            #     #         each_data['advantage'] = each_data['advantage'] + ',' + out['advantage']
            #     #     else:
            #     #         each_data = dict(each_data, **out)
            # for item in mr20:
            #     d1 = self.parse_item(item.get_text())
            #     each_data = self.add_advantage(d1, each_data)
            #     # if len(d1) > 0:
            #     #     if ('advantage' in each_data.keys()) and ('advantage' in d1.keys()):
            #     #         each_data['advantage'] = each_data['advantage'] + ',' + d1['advantage']
            #     #     else:
            #     #         each_data = dict(each_data, **d1)
            # each_data['community_address'] = region.get_text().strip()
            # each_data['community_name'] = comm.get_text()
            # each_data['total_price'] =ToolsBox.strToInt(price.get_text())
            # each_data['price'] = round(float(each_data['total_price']*10000/each_data['area']),2)

            each_data['from'] = "lejv"
            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas
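
Most of these parsers funnel every parsed fragment through self.add_advantage. The commented-out lines in this example hint at its behaviour: merge the newly parsed dict into each_data, but concatenate the 'advantage' field instead of overwriting it. A sketch that mirrors that commented-out logic (the project's real method may differ):

    def add_advantage(d1, each_data):
        # d1 is the dict returned by parse_item; each_data is the listing being built up
        if len(d1) > 0:
            if ('advantage' in each_data) and ('advantage' in d1):
                each_data['advantage'] = each_data['advantage'] + ',' + d1['advantage']
            else:
                each_data = dict(each_data, **d1)
        return each_data
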
Example 11
    def parse_page(self, soup):
        page_datas = []
        details_urls = soup.select(".property>a")
        titles = soup.select("h3.property-content-title-name")
        houses = soup.select("div.property-content-detail>section")
        # houses = soup.select('div.house-details')
        comms = soup.select('.property-content-info-comm-name')
        adds = soup.select('.property-content-info-comm-address')
        prices = soup.select('.property-price-total-num')
        for details_url, title, details, comm, price, add in zip(
                details_urls, titles, houses, comms, prices, adds):
            each_data = dict(advantage='',
                             builded_year=0,
                             spatial_arrangement='',
                             floor_index=0,
                             total_floor=0)
            each_data['title'] = title.get_text()
            each_data['details_url'] = details_url.get('href')

            houses = details.select(".property-content-info")
            detail = houses[0].select("p")
            for string in detail:
                d1 = self.parse_item(
                    ToolsBox.clearStr(string.get_text().strip()))
                each_data = self.add_advantage(d1, each_data)

            each_data['community_name'] = comm.get_text().strip()

            add_list = []
            for string in add.strings:
                add_list.append(ToolsBox.clearStr(string.strip()))
            try:
                each_data['region'], each_data['block'], each_data[
                    'community_address'] = add_list
            except Exception as e:
                with open('logtest.txt', 'a+') as fout:
                    fout.write('*************' + str(datetime.datetime.now()) +
                               '*************\n')
                    fout.write('AJK: failed to parse region/block/address; raw data: ')
                    traceback.print_exc(file=fout)
                    print(traceback.format_exc())
            # print(price)
            each_data['total_price'] = ToolsBox.strToInt(price.get_text())
            each_data['from'] = "AJK"
            # try:  # 2016-08-01: parsing here occasionally failed; kept for reference
            #     each_data['total_price'] = ToolsBox.strToInt(price.get_text())
            # except Exception as e:
            #     with open('logtest.txt', 'a+') as fout:
            #         fout.write('*************' + str(datetime.datetime.now()) + '*************\n')
            #         fout.write('AJK: failed to parse total_price; raw data: ' + price.get_text())
            #         traceback.print_exc(file=fout)
            #         print(traceback.format_exc())

            # try:
            #     comminfo = comm.get('title').split()
            #     each_data['community_name'] = comminfo[0]
            #     each_data['region'], each_data['block'], each_data['community_address'] = comminfo[1].split('-', 2)
            # except Exception as e:
            #     with open('logtest.txt', 'a+') as fout:
            #         fout.write('*************' + str(datetime.datetime.now()) + '*************\n')
            #         fout.write('Failed to parse: %s \n' % comm.get('title'))
            #         traceback.print_exc(file=fout)
            #         print(traceback.format_exc())
            # each_data['community_name'] = each_data['community_name'].strip()

            # try:
            #     house = details.select('span')
            #     # 2016-08-17: rewrote field parsing and factored out a parse_item method
            #     for h in house:
            #         if len(h.attrs) == 0:
            #             string = h.get_text().encode('utf8')
            #             d1 = {}
            #             d1 = self.parse_item(string)
            #             each_data = self.add_advantage(d1, each_data)   #each_data = dict(each_data, **d1)
            #     each_data['from'] = "AJK"
            # except Exception as e:
            #     with open('logtest.txt', 'a+') as fout:
            #         fout.write('*************' + str(datetime.datetime.now()) + '*************\n')
            #         fout.write('      Data to parse:\n')
            #         for i1 in house:
            #             fout.write(str(i1) + '\n')
            #         fout.write('\n      Field count: ' + str(len(house)) + '\n')
            #         traceback.print_exc(file=fout)
            #         print(traceback.format_exc())
            each_data = self.pipe(each_data)  # 2016-06-04: added a dedicated data-processing step

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)
        return page_datas
Example 12
    def parse_datas(self, soup):

        page_datas = []

        details = soup.select(".size")
        # comms = soup.select("a span.address-eara")
        # print(comms)
        prices = soup.select(".num")
        titles = soup.select("div.ershoufang-list .title a")
        regions = soup.select("span.area a")
        lists = soup.select(".ershoufang-list")

        for title, detail, list1, price, region in zip(titles, details, lists,
                                                       prices, regions):
            # for title in titles:

            each_data = {
                'builded_year': 0,
                'spatial_arrangement': '',
                'floor_index': 0,
                'total_floor': 0,
                'advantage': '',
                'title': title.get('title'),
                'details_url': 'http:' + title.get('href')
            }
            for item in (detail.stripped_strings):
                d1 = self.parse_item(item)
                each_data = self.add_advantage(d1, each_data)

            each_data['total_price'] = ToolsBox.strToInt(price.get_text())

            address = list1.select("dd.address")

            # print(address[0])
            # print(len(address))
            # for item in comm.stripped_strings:
            #     print(item)
            # print(comm.stripped_strings)
            # print(50*'0')
            if len(address) > 0:
                if len(address[0].select("a.address-eara")) > 0:
                    each_data['region'] = ToolsBox.clearStr(
                        address[0].select("a.address-eara")[0].get_text())

                if len(address[0].select("span.address-eara")) > 0:
                    # each_data['community_name'] = address[0].select("span.address-eara")[0].get_text()
                    # print(each_data['community_name'])
                    each_data['community_name'] = ToolsBox.clearStr(
                        address[0].select("span.address-eara")[0].get_text())

            # try:
            # except (IndexError) as e:
            #     print("****页面数据不规范*****")
            #     input(address)

            # each_data['community_name'] = (comm.get_text())
            # print(comm.children)
            # for name in comm.descendants:
            #     print(name)
            #     # pass
            # print('-'*50)
            # each_data['region'] = ToolsBox.clearStr(region.get_text())

            each_data['from'] = "ganji"
            # print(each_data)
            each_data = self.pipe(each_data)
            #
            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas
Example 13
    def parse_datas(self, soup):

        page_datas = []
        # print(soup)

        titles = soup.select("h2.title > a")
        prices = soup.select('p.sum > b')
        houses = soup.select('.list-info')

        for title, price, house in zip(titles, prices, houses):
            each_data = {'advantage': '', 'builded_year': 0, 'spatial_arrangement': '', 'floor_index': 0,
                         'total_floor': 0, 'title': title.get_text(), 'details_url': title.get('href'),
                         'total_price': ToolsBox.strToInt(price.get_text())}
            details = house.select('p.baseinfo')
            spans = details[0].select('span')
            for span in spans:
                string = ToolsBox.clearStr(span.get_text()).encode('utf8')
                # d1 = {}
                d1 = self.parse_item(string)
                each_data = self.add_advantage(d1, each_data)  # each_data = dict(each_data, **d1)
            comms = details[1].select('a')

            each_data['community_name'] = comms[0].get_text()

            if comms[0].get('href') is None:
                each_data['comm_url'] = ''
            else:
                each_data['comm_url'] = 'http://xm.58.com' + comms[0].get('href')

            each_data['from'] = "58"

            try:
                if len(comms) >= 2:
                    # input('region')
                    each_data['region'] = comms[1].get_text().strip()
            except Exception as e:
                # print('------- This record has no community region ------------')
                # ToolsBox.printDic(each_data)
                print(e)

            try:
                if len(comms) >= 3:
                    # input('address')
                    each_data['community_address'] = comms[2].get_text().strip()
            except Exception as e:
                # print('------- This record has no community address ------------')
                # ToolsBox.printDic(each_data)
                print(e)

            each_data = self.pipe(each_data)

            if each_data:
                match_comm = re.findall(r'^\d+$', each_data['community_name'])
                # For unknown reasons the community name is sometimes all digits; filter such records out
                # print(match_comm)
                if len(match_comm) > 0:
                    print('///////////////// Encountered an all-digit community name! ////////////////////////')
                    ToolsBox.priList(each_data)
                    print(soup)
                    # print(each_data['community_name'])
                    # var1 = input(each_data['community_name'] + ' is an all-digit community name!')
                else:
                    page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data)

        return page_datas
Example 14
    def craw_a_page(self, new_url, retries=3):

        # Compute and print the delay before this crawl
        if self.delay > 0:
            sleepSeconds = random.randint(self.delay, self.delay * 2)
            print('craw {0} after {1} seconds ({2} ~ {3}):'.format(
                self.count, sleepSeconds, self.delay, self.delay * 2))
        else:
            print('craw {0} :'.format(self.count))

        # Build request headers and proxy info; they differ for every page
        # proxy = self.proxy_builder()
        proxy = None
        self.headers_builder()

        # Download the page
        html_cont, code = self.downloader.download(new_url,
                                                   headers=self.headers,
                                                   proxy=proxy)

        # Process the downloaded content
        # 1. Handle 4xx/5xx responses
        if 400 <= code < 600:
            # if isinstance(html_cont, int) and (400 <= (html_cont) < 600):
            self.HTTP404 += 1
            print("Abnormal response (in MassController): {0}".format(code))
            if html_cont is not None:
                self.downloader.getTitle(html_cont)
                new_urls, new_datas = self.parser.page_parse(html_cont)
                if new_datas == 'checkcode':  # the parser detected a CAPTCHA page
                    print(str(datetime.datetime.now()))
                    self.delay = input("CAPTCHA encountered. Enter a delay in seconds to keep the data parsed so far...")
                    if self.delay == '':
                        self.delay = 0
                    else:
                        self.delay = ToolsBox.strToInt(self.delay)
                    self.total = self.total + self.outputer.out_mysql()
                    if retries > 0:
                        return self.craw_a_page(new_url, retries - 1)
            time.sleep(30 * self.HTTP404)  # access blocked; back off for a while
            if self.HTTP404 > self.HTTP404_stop:
                # On Anjuke, a search for '安全局宿舍' returns a not-found error; skip it automatically
                match_comm = re.findall(r'kw=(.*)&from_url', new_url)
                if unquote(match_comm[0], 'utf-8') != '0':
                    print(str(datetime.datetime.now()))
                    self.delay = input("You appear to be blocked. Enter a delay in seconds to keep the data parsed so far...")
                    if self.delay == '':
                        self.delay = 0
                    else:
                        self.delay = ToolsBox.strToInt(self.delay)
                self.total = self.total + self.outputer.out_mysql()
                self.HTTP404 = 0
            else:
                return self.craw_a_page(new_url)
        # 2. Page downloaded normally
        elif html_cont is not None:
            # 2019-03-11: simplified the analysis
            new_urls, new_datas = self.parser.page_parse(html_cont)  # returns the parsed content

            if new_datas == 'checkcode':  # the parser detected a CAPTCHA page
                print(str(datetime.datetime.now()))
                self.delay = input("CAPTCHA encountered. Enter a delay in seconds to keep the data parsed so far...")
                if self.delay == '':
                    self.delay = 0
                else:
                    self.delay = ToolsBox.strToInt(self.delay)
                self.total = self.total + self.outputer.out_mysql()
                if retries > 0:
                    return self.craw_a_page(new_url, retries - 1)
            elif new_datas == '0':  # the query returned no records
                print('This page returned 0 records; it is not a parsing failure')
                print('This page      datas: none, urls: none either')
            elif len(new_datas) == 0 and len(new_urls) == 0:  # nothing was parsed
                self.nodata += 1
                if self.nodata < self.nodata_stop:
                    print("No data parsed from this page; will retry {0} more time(s)".format(
                        self.nodata_stop - self.nodata))
                    print(html_cont)
                    time.sleep(random.randint(3, 7))
                    return self.craw_a_page(new_url)
                else:
                    with open('logtest.txt', 'a+') as fout:
                        fout.write('\n*******' + str(datetime.datetime.now()) +
                                   '*************')
                        fout.write('\n No data on this page: %s. \n' % new_url)
                    if self.nodata < 999:
                        self.delay = input(
                            'Consecutive pages with no data. Check the link above; if it looks fine, enter a delay in seconds to keep the data parsed so far...')
                        if self.delay == '':
                            self.delay = 0
                        else:
                            self.delay = ToolsBox.strToInt(self.delay)
                        self.nodata = 0
                    else:
                        # when self.nodata is 1000 or more (e.g. Ganji), pages without data are ignored
                        self.nodata = 1000
            else:  # normal case: parse the results
                print('This page      datas: {0}, urls: {1}'.format(
                    len(new_datas), len(new_urls)))
                # Add the page links to the URL manager
                self.urls.add_new_urls(new_urls)

                # Add the community names to the community manager
                for data in new_datas:
                    self.add_comm(data)

                # Pass the listings to the outputer; invalid records are dropped and the rest kept in outputer.raw_datas
                self.outputer.collect_data(new_datas)
                data_num = self.outputer.get_datas_quantity()
                print(
                    "Total %6.0f = %6.0f duplicates + %5.0f in data pool + %6.0f stored in database " %
                    (data_num['dupli_count'] + data_num['r_data'] + self.total,
                     data_num['dupli_count'], data_num['r_data'], self.total))

                if 3000 < data_num['r_data']:
                    print("Saving to the database, please wait...")
                    storenum = self.outputer.out_mysql()
                    if storenum:
                        self.total = self.total + storenum
                self.count += 1
                self.nodata = 0 if self.nodata < 999 else 1000  # reset the nodata counter when data was found
                self.HTTP404 = 0  # reset the HTTP404 counter when data was found
        # 3. html_cont is None: the download failed with a 5xx-level error
        else:
            print('Could not download {0} from the server'.format(new_url))
            self.HTTP404 += 1
            time.sleep(15 * self.HTTP404)  # access blocked; back off for a while
            if self.HTTP404 > self.HTTP404_stop:
                self.delay = input(
                    'Repeatedly failed to fetch page content. Check the link above; if it looks fine, enter a delay in seconds to keep the data parsed so far...')
                if self.delay == '':
                    self.delay = 0
                else:
                    self.delay = ToolsBox.strToInt(self.delay)
                self.total = self.total + self.outputer.out_mysql()
                self.HTTP404 = 0
            else:
                # if retries > 0:
                #     return self.craw_a_page(new_url, retries - 1)
                return self.craw_a_page(new_url)

        # Delay module: kept at the end so the first fetch is not delayed
        if self.delay > 0:
            time.sleep(sleepSeconds)  # 2017-05-15: the download delay lives here; this module acts as the controller
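
The politeness logic in craw_a_page boils down to two rules: sleep a random interval between self.delay and twice self.delay before each normal fetch, and back off in proportion to the number of consecutive failures (30 or 15 seconds per failure, depending on the error) when responses suggest the crawler is blocked. A small stand-alone sketch of that pattern (the helper name and signature are illustrative, not from the project):

    import random
    import time

    def polite_sleep(base_delay, failures=0, step=30):
        if failures:
            time.sleep(step * failures)                              # escalating back-off after failed requests
        elif base_delay > 0:
            time.sleep(random.randint(base_delay, base_delay * 2))   # jittered delay between normal page fetches
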
Example 15
    def craw_a_page_of_commPrice(self, new_url, retries=3):

        # Compute and print the delay before this crawl
        if self.delay > 0:
            sleepSeconds = random.randint(self.delay, self.delay * 2)
            print('craw {0} after {1} seconds ({2} ~ {3}):'.format(
                self.count, sleepSeconds, self.delay, self.delay * 2))
        else:
            print('craw {0} :'.format(self.count))

        # Build request headers and proxy info; they differ for every page
        proxy = None
        self.headers_builder()

        # Download the page
        html_cont, code = self.downloader.download(new_url,
                                                   headers=self.headers,
                                                   proxy=proxy)

        # Process the downloaded content
        # 1. Page downloaded normally
        if html_cont is not None:
            new_urls, new_datas = self.parser.page_parse(html_cont)  # returns the parsed content
            if new_datas == 'checkcode':  # the parser detected a CAPTCHA page
                print(str(datetime.datetime.now()))
                self.delay = input("CAPTCHA encountered. Enter a delay in seconds to keep the data parsed so far...")
                if self.delay == '':
                    self.delay = 0
                else:
                    self.delay = ToolsBox.strToInt(self.delay)
                # output needs reworking: self.total = self.total + self.outputer.out_mysql()
                if retries > 0:
                    return self.craw_a_page_of_commPrice(new_url, retries - 1)
            elif new_datas == '0':  # the query returned no records
                print('Community not found')
                print('This page      datas: none, urls: none either')
            elif len(new_datas) == 0 and len(new_urls) == 0:  # nothing was parsed
                self.nodata += 1
                if self.nodata < self.nodata_stop:
                    print("No data parsed from this page; will retry {0} more time(s)".format(
                        self.nodata_stop - self.nodata))
                    time.sleep(random.randint(3, 7))
                    return self.craw_a_page_of_commPrice(new_url)
                else:
                    with open('logtest.txt', 'a+') as fout:
                        fout.write('\n*******' + str(datetime.datetime.now()) +
                                   '*************')
                        fout.write('\n No data on this page: %s. \n' % new_url)
                    if self.nodata < 999:
                        self.delay = input(
                            'Consecutive pages with no data. Check the link above; if it looks fine, enter a delay in seconds to keep the data parsed so far...')
                        if self.delay == '':
                            self.delay = 0
                        else:
                            self.delay = ToolsBox.strToInt(self.delay)
                        self.nodata = 0
                    else:
                        self.nodata = 1000
            else:  # normal case: parse the results
                print('This page      datas: {0}, urls: {1}'.format(
                    len(new_datas), len(new_urls)))

                # Add the page links to the URL manager
                self.urls.add_new_urls(new_urls)
                # ToolsBox.priList(new_urls)
                # Add the community names to the community manager
                # for data in new_datas:
                #     self.add_comm(data)
                # ToolsBox.priList(new_datas)
                # Pass the listings to the outputer; invalid records are dropped and the rest kept in outputer.raw_datas
                # self.outputer.collect_data(new_datas)
                # data_num = self.outputer.get_datas_quantity()
                # print("Total %6.0f = %6.0f duplicates + %5.0f in data pool + %6.0f stored in database " % (
                #     data_num['dupli_count'] + data_num['r_data'] + self.total, data_num['dupli_count'],
                #     data_num['r_data'],
                #     self.total))
                #
                # if 3000 < data_num['r_data']:
                #     print("Saving to the database, please wait...")
                #     storenum = self.outputer.out_mysql()
                #     if storenum:
                #         self.total = self.total + storenum
                self.count += 1
                self.nodata = 0 if self.nodata < 999 else 1000  # reset the nodata counter when data was found
                self.HTTP404 = 0  # reset the HTTP404 counter when data was found
        # 2. html_cont is None
        else:
            print('Could not download {0} from the server'.format(new_url))
            print(
                "Abnormal response (in MassController's craw_a_page_of_commPrice): {0}".format(
                    code))
            self.HTTP404 += 1
            time.sleep(15 * self.HTTP404)  # access blocked; back off for a while
            if self.HTTP404 > self.HTTP404_stop:
                self.delay = input(
                    'Repeatedly failed to fetch page content. Check the link above; if it looks fine, enter a delay in seconds to keep the data parsed so far...')
                if self.delay == '':
                    self.delay = 0
                else:
                    self.delay = ToolsBox.strToInt(self.delay)
                # output needs reworking: self.total = self.total + self.outputer.out_mysql()
                self.HTTP404 = 0
            else:
                return self.craw_a_page_of_commPrice(new_url)

        # Delay module: kept at the end so the first fetch is not delayed
        if self.delay > 0:
            time.sleep(sleepSeconds)