Example #1
    def _get_new_datas_B(self, sel):

        page_datas = []

        houses = sel.select('td.t')
        titles = sel.select('td.t > a.t')
        prices = sel.select('td.tc > b.pri')
        spans = sel.select('td.tc')

        for span, house, title, price in zip(spans, houses, titles, prices):
            each_data = {
                'builded_year': 0,
                'spatial_arrangement': '',
                'floor_index': 0,
                'total_floor': 0
            }
            each_data['title'] = title.get_text()
            each_data['details_url'] = title.get('href')

            each_data['total_price'] = int(
                filter(str.isdigit,
                       price.get_text().encode('utf8')))

            # parse the area
            item = span.select('span.f14')
            d1 = self.parse_item(item[1].get_text())
            each_data = dict(each_data, **d1)

            comm = house.select('.a_xq1')
            each_data['community_name'] = comm[1].get_text()

            hss = house.select('span.c_ccc')

            for hs in hss:
                string = hs.next_sibling
                if string is not None:
                    string = mytools.clearStr(string).encode('utf8')
                    d1 = self.parse_item(string)
                    if 'advantage' in d1 and 'advantage' in each_data:
                        each_data['advantage'] = (each_data['advantage'] +
                                                  '/' + d1['advantage'])
                    else:
                        each_data = dict(each_data, **d1)

            # Convert before dividing: int/int truncates in Python 2.
            each_data['price'] = round(
                float(each_data['total_price']) * 10000 / each_data['area'], 2)
            each_data['from'] = "58"

            each_data = self.pipe(each_data)

            if each_data:
                # if each_data.has_key('total_floor') and each_data.has_key('total_price') and each_data.has_key('area') and each_data.has_key('community_name'):
                page_datas.append(each_data)
            else:
                if mytools.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas
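
All seven examples delegate field extraction to self.parse_item, which none of the snippets include. The sketch below is a guess at its contract, assuming unicode input and the key names used above ('area', 'builded_year', 'spatial_arrangement', 'floor_index', 'total_floor', 'advantage'); the patterns are illustrative, not the original implementation.

import re

def parse_item(item):
    # Hypothetical sketch of self.parse_item -- maps one text fragment
    # to the dict keys used by the _get_new_datas methods above.
    item = item.strip()
    m = re.search(u'(\d+(?:\.\d+)?)平', item)          # "89平米" -> area
    if m:
        return {'area': float(m.group(1))}
    m = re.search(u'(\d{4})年', item)                  # "2008年建" -> builded_year
    if m:
        return {'builded_year': int(m.group(1))}
    if re.search(u'\d+室\d+厅', item):                 # "3室2厅" -> layout
        return {'spatial_arrangement': item}
    m = re.search(u'第?(\d+)层.*共(\d+)层', item)      # "第5层(共20层)" -> floors
    if m:
        return {'floor_index': int(m.group(1)),
                'total_floor': int(m.group(2))}
    if item:
        return {'advantage': item}                     # fallback: treat as a selling point
    return {}
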
Example #2
    def _get_new_datas(self, sel):

        page_datas = []

        titles = sel.xpath('//h2[@class="title"]/a')
        prices = sel.xpath('//p[@class="sum"]/b')
        houses = sel.xpath('//div[@class="list-info"]')

        for title, price, house in zip(titles, prices, houses):
            each_data = {
                'advantage': '',
                'builded_year': 0,
                'spatial_arrangement': '',
                'floor_index': 0,
                'total_floor': 0
            }
            each_data['title'] = title.text
            each_data['details_url'] = title.get('href')
            each_data['total_price'] = int(
                filter(str.isdigit, price.text.encode('utf8')))

            details = house.xpath('./p')
            spans = details[0].xpath('./span')
            for span in spans:
                string = span.text
                if string is not None:
                    string = mytools.clearStr(string).encode('utf8')
                    d1 = self.parse_item(string)
                    each_data = dict(each_data, **d1)

            comms = details[1].xpath('.//a')
            each_data['community_name'] = comms[0].text

            if comms[0].get('href') is None:
                each_data['comm_url'] = ''
            else:
                each_data['comm_url'] = 'http://xm.58.com' + comms[0].get(
                    'href')
            each_data['from'] = "58"

            each_data = self.pipe(each_data)

            if each_data:
                # Convert before dividing: int/int truncates in Python 2.
                each_data['price'] = round(
                    float(each_data['total_price']) * 10000 /
                    each_data['area'], 2)
                # if each_data.has_key('total_floor') and each_data.has_key('total_price') and each_data.has_key('area') and each_data.has_key('community_name'):
                page_datas.append(each_data)
            else:
                if mytools.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        # print('debug:this page have %s datas' %len(page_datas))

        return page_datas
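
Both 58.com variants clean raw strings with mytools.clearStr before parsing. That helper is not shown either; a plausible minimal stand-in, assuming unicode input and that its only job is to drop whitespace (including the full-width CJK space) so parse_item sees compact fragments:

import re

def clearStr(s):
    # Assumed behavior, not the original helper: remove all whitespace,
    # including the full-width CJK space (U+3000), so that
    # ' 89 平米 ' becomes '89平米'.
    return re.sub(u'[\s\u3000]+', '', s)
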
Example #3
    def _get_new_datas(self, soup):

        page_datas = []

        details = soup.select("div.house-info")
        comms = soup.select("div.house-info > a ")
        positions = soup.select("div.house-position")
        prices = soup.select("span.georgia")
        titles = soup.select("h3 > a")

        for title, comm, detail, position, price in zip(titles, comms, details, positions, prices):

            each_data = {
                'builded_year': 0,
                'spatial_arrangement': '',
                'floor_index': 0,
                'total_floor': 0
            }
            each_data['title'] = title.get('title')
            each_data['details_url'] = title.get('href')
            mr20 = detail.select("span.mr20")
            posi = position.select("span")
            for j in range(1, len(posi)):
                out = self.parse_item(posi[j].get_text())
                if len(out) > 0:
                    if 'advantage' in each_data and 'advantage' in out:
                        each_data['advantage'] = each_data['advantage'] + ',' + out['advantage']
                    else:
                        each_data = dict(each_data, **out)
            for item in mr20:
                d1 = self.parse_item(item.get_text())
                if len(d1) > 0:
                    if 'advantage' in each_data and 'advantage' in d1:
                        each_data['advantage'] = each_data['advantage'] + ',' + d1['advantage']
                    else:
                        each_data = dict(each_data, **d1)

            each_data['community_name'] = comm.get_text()
            each_data['total_price'] = int(round(float(price.get_text()), 0))
            # Convert before dividing: int/int truncates in Python 2.
            each_data['price'] = round(float(each_data['total_price']) * 10000 / each_data['area'], 2)
            each_data['from'] = "lejv"

            each_data = self.pipe(each_data)  # 2016.6.4: added a dedicated data-processing step

            if ('total_floor' in each_data and 'total_price' in each_data
                    and 'area' in each_data and 'community_name' in each_data):
                page_datas.append(each_data)
            else:
                if mytools.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas 
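
Every example funnels the record through self.pipe (the "2016.6.4" data-processing step), which is never shown. Its contract is only implied: Examples #1, #2 and #5 test `if each_data:` afterwards, so it apparently returns a falsy value for unusable records. A minimal sketch under that assumption, with field names taken from the examples:

    def pipe(self, each_data):
        # Hypothetical sketch: validate/normalize one record and return
        # None when it is unusable, so callers can test `if each_data:`.
        if each_data.get('area', 0) <= 0:
            return None
        if each_data.get('total_price', 0) <= 0:
            return None
        # Normalize text fields that some sites leave padded.
        if 'community_name' in each_data:
            each_data['community_name'] = each_data['community_name'].strip()
        return each_data
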
Example #4
    def _get_new_datas(self, soup):
        page_datas = []
        # page_comms = []
        details = soup.select('dd.detail ')
        hrefs = soup.select('span.c_blue0041d9.aVisited.f14B > a')
        comms = soup.select('span.xuzhentian > a')
        prices = soup.select('span > em')

        for detail, href, comm, price in zip(details, hrefs, comms, prices):

            each_data = {
                'advantage': '',
                'builded_year': 0,
                'spatial_arrangement': '',
                'floor_index': 0,
                'total_floor': 0
            }
            each_data['title'] = href.get_text().strip().encode('utf8')
            each_data['community_name'] = comm.get_text().strip().encode('utf8')
            each_data['details_url'] = "http://esf.xmhouse.com" + href.get('href')
            each_data['total_price'] = int(price.get_text())
            h_infos = re.search(
                r'<span style="margin-left: 5px; color: #000000">.*</span>(.*) <div',
                str(detail), re.S
            ).group(1).replace('<br/>', '').replace('\r\n', '').replace(' ', '').split(',')

            for item in h_infos:
                try:
                    # 2016.8.22: use the new dedicated parsing function
                    d1 = self.parse_item(item)
                    each_data = dict(each_data, **d1)
                except Exception:
                    with open('logtest.txt', 'a+') as fout:
                        fout.write('*************' + str(datetime.datetime.now()) + '*************\n')
                        fout.write('      fetched data: ')
                        for i1 in h_infos:
                            fout.write(i1 + ',')
                        fout.write('\n      item that errored during XmParser parsing: ' + str(item) + '\n')
                        traceback.print_exc(file=fout)
                        print(traceback.format_exc())
                       
            each_data['from'] = "XMHouse"
            # Convert before dividing: int/int truncates in Python 2.
            each_data['price'] = round(float(each_data['total_price']) * 10000 / each_data['area'], 0)
            # for key,value in each_data.items():
            #     print('%20s : %s' %(key,value))
            each_data = self.pipe(each_data) 
            # print(each_data)
            if ('total_floor' in each_data and 'total_price' in each_data
                    and 'area' in each_data and 'community_name' in each_data):
                page_datas.append(each_data)
            else:
                if mytools.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas
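
Records that fail the checks are offered to mytools.ShowInvalideData, which is also not included. Judging by the call sites it reports the incomplete record and decides, via its return value, whether to keep it anyway; note that some callers pass in the falsy value that self.pipe returned. A stand-in under those assumptions:

def ShowInvalideData(each_data):
    # Assumed behavior, not the original helper. Callers may pass a
    # falsy value when self.pipe rejected the row outright.
    if not each_data:
        return False
    print('--- invalid record ---')
    for key, value in sorted(each_data.items()):
        print('%20s : %s' % (key, value))
    # Callers append the record when this returns True; returning
    # False discards incomplete rows by default.
    return False
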
Example #5
    def _get_new_datas(self, soup):
        page_datas = []
        # Ganji site redesign, 2017.03

        details = soup.select("dd.dd-item.size")
        comms = soup.select("dd.dd-item.address > span ")
        prices = soup.select("span.num.js-price")
        titles = soup.select("dd.dd-item.title > a")

        for title, detail, comm, price in zip(titles, details, comms, prices):

            each_data = {
                'builded_year': 0,
                'spatial_arrangement': '',
                'floor_index': 0,
                'total_floor': 0
            }
            each_data['title'] = title.get('title')
            each_data['details_url'] = 'http://xm.ganji.com' + title.get(
                'href')
            for item in detail.stripped_strings:
                d1 = self.parse_item(item)
                each_data = dict(each_data, **d1)
            each_data['community_name'] = (comm.stripped_strings.next()
                                           .strip().split(' ')[0]
                                           .replace('.', '').encode('utf8'))
            each_data['total_price'] = int(round(float(price.get_text()), 0))
            # Convert before dividing: int/int truncates in Python 2.
            each_data['price'] = round(
                float(each_data['total_price']) * 10000 / each_data['area'], 2)
            each_data['from'] = "ganji"
            each_data = self.pipe(each_data)

            if each_data:
                # if each_data and each_data.has_key('total_floor') and each_data.has_key('total_price') and each_data.has_key('area') and each_data.has_key('community_name'):
                page_datas.append(each_data)
            else:
                if mytools.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas
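
For context, a minimal driver showing how one of the soup-based methods would be called (Example #2 takes an lxml selector instead). The Parser instance, URL, and use of the requests library are assumptions for illustration, not part of the original code:

import requests
from bs4 import BeautifulSoup

def fetch_page_datas(parser, url):
    # Fetch one listing page and hand the soup to the parser.
    resp = requests.get(url, timeout=10)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'html.parser')
    return parser._get_new_datas(soup)

# page_datas = fetch_page_datas(my_parser, 'http://xm.ganji.com/...')
# print('this page has %d records' % len(page_datas))
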
Example #6
    def _get_new_datas(self, soup):
        page_datas = []

        titles = soup.select("div.title > a")
        houseinfo = soup.select("div.houseInfo")
        positionInfo = soup.select("div.positionInfo")
        totalprices = soup.select("div.totalPrice")
        for title, info, position, totalPrice in zip(titles, houseinfo,
                                                     positionInfo,
                                                     totalprices):
            # some each_data fields need initial values
            each_data = {
                'builded_year': 0,
                'spatial_arrangement': '',
                'floor_index': 0,
                'total_floor': 0
            }
            each_data['title'] = title.get_text()
            each_data['details_url'] = title.get('href')
            each_data['total_price'] = int(
                round(
                    float(
                        re.search(u'(\d+\.?\d+)万',
                                  totalPrice.get_text()).group(1)), 0))

            info_item = info.get_text().split('|')

            each_data['community_name'] = info_item[0].strip()  # the first item is always the community name
            for i in range(1, len(info_item)):
                d1 = self.parse_item(info_item[i].strip())
                if 'advantage' in d1 and 'advantage' in each_data:
                    d1['advantage'] = each_data['advantage'] + ',' + d1['advantage']
                each_data = dict(each_data, **d1)

            position = position.get_text().replace('\t', '').replace('\n', '').split()
            each_data['block'] = position[-1]

            if ')' not in position[0]:  # Lianjia lists villas as '4层2008年建'; add ')' so it can be split
                position[0] = position[0].replace('层', '层)')

            for item in position[0].split(')'):  # 2017.4.1: Lianjia changed its format
                d1 = self.parse_item(item.strip())
                each_data = dict(each_data, **d1)

            # Convert before dividing: int/int truncates in Python 2.
            each_data['price'] = float(each_data['total_price']) * 10000 / each_data['area']
            each_data['from'] = "lianjia"

            each_data = self.pipe(each_data)

            if ('total_floor' in each_data and 'total_price' in each_data
                    and 'area' in each_data and 'community_name' in each_data):
                page_datas.append(each_data)
            else:
                if mytools.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas
Example #7
                each_data['from'] = "AJK"
            except Exception:
                with open('logtest.txt', 'a+') as fout:
                    fout.write('*************' + str(datetime.datetime.now()) + '*************\n')
                    fout.write('      data to be parsed:\n')
                    for i1 in house:
                        fout.write(str(i1) + '\n')
                    fout.write('\n      field count: ' + str(len(house)) + '\n')
                    traceback.print_exc(file=fout)
                    print(traceback.format_exc())

            each_data = self.pipe(each_data)  # 2016.6.4: added a dedicated data-processing step

            if each_data and 'community_name' in each_data:
                page_datas.append(each_data)
            elif mytools.ShowInvalideData(each_data):
                page_datas.append(each_data)
                

        return page_datas  # 2016.5.30: pull data straight from the dict instead of keeping a separate list of community names
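
All seven examples end with the same pipe-validate-append tail. If the class were ever refactored, that tail could live in a single helper; a sketch, using the field set required by Examples #3, #4 and #6 and assuming mytools is importable as in the originals:

import mytools  # the project's helper module, as used above

    def _collect(self, each_data, page_datas):
        # Shared tail: run the pipe step, keep the record if the
        # mandatory fields survived, otherwise let ShowInvalideData
        # decide whether to keep the incomplete row.
        each_data = self.pipe(each_data)
        required = ('total_floor', 'total_price', 'area', 'community_name')
        if each_data and all(k in each_data for k in required):
            page_datas.append(each_data)
        elif mytools.ShowInvalideData(each_data):
            page_datas.append(each_data)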