예제 #1
0
                    d1 = self.parse_item(p.get_text().strip())
                    each_data = self.add_advantage(d1, each_data)

            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)
        return page_datas


if __name__ == "__main__":
    # Manual smoke run: fetch one 917.com listing page, parse it, and pass the
    # URLs returned by the parser to ToolsBox.priList.
    page_url = 'https://www.917.com/sell/pn10/'
    request_headers = {
        "Host": "www.917.com",
        "Referer": "http://www.917.com/",
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/7.0)',
    }

    downloader = Downloader.Downloader()
    parser = www917Page()

    # download() returns a pair; only the page content is used below.
    html_cont, code = downloader.download(page_url, headers=request_headers)

    # page_parse() returns (urls, datas); only the URLs are displayed here.
    urls, datas = parser.page_parse(html_cont)
    ToolsBox.priList(urls)
예제 #2
0
            each_data['from'] = "lejv"
            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas


if __name__ == "__main__":
    # Manual smoke run: fetch the Leju (xm.esf.leju.com) listing page, parse it,
    # and pass the records returned by the parser to ToolsBox.priList.
    page_url = 'https://xm.esf.leju.com/house'
    request_headers = {
        "Host": "xm.esf.leju.com",
        "Referer": "http://xm.esf.leju.com/house/",
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/7.0)',
    }

    downloader = Downloader.Downloader()
    parser = LejvPage()

    # download() returns a pair; only the page content is used below.
    html_cont, code = downloader.download(page_url, headers=request_headers)

    # page_parse() returns (urls, datas); only the records are displayed here.
    urls, datas = parser.page_parse(html_cont)
    ToolsBox.priList(datas)
예제 #3
0
    def parse_datas(self, soup):
        """Build one record per house listing found in a parsed result page.

        Titles (``h2.title > a``), prices (``p.sum > b``) and info blocks
        (``.list-info``) are selected independently and paired by position
        with ``zip``, so the three selections are assumed to line up
        one-to-one; surplus elements in any of them are ignored.

        Args:
            soup: Parsed page object exposing a ``select(css_selector)``
                method (presumably a BeautifulSoup document -- confirm
                against the caller).

        Returns:
            list: Records for which ``self.pipe`` returned a truthy value and
            whose community name is not made up solely of digits. A record
            rejected by ``self.pipe`` is still appended when
            ``ToolsBox.ShowInvalideData`` returns a truthy value for it.
            Listings lacking the two ``p.baseinfo`` blocks or a community
            link are reported on stdout and skipped instead of raising
            ``IndexError`` and aborting the whole page.
        """
        page_datas = []

        titles = soup.select("h2.title > a")
        prices = soup.select('p.sum > b')
        houses = soup.select('.list-info')

        for title, price, house in zip(titles, prices, houses):
            each_data = {'advantage': '', 'builded_year': 0, 'spatial_arrangement': '', 'floor_index': 0,
                         'total_floor': 0, 'title': title.get_text(), 'details_url': title.get('href'),
                         'total_price': ToolsBox.strToInt(price.get_text())}

            # The first p.baseinfo holds the attribute spans, the second the
            # community links; both are required to build a usable record.
            details = house.select('p.baseinfo')
            if len(details) < 2:
                print('skipping listing: expected 2 p.baseinfo blocks, found %d' % len(details))
                continue

            comms = details[1].select('a')
            if not comms:
                print('skipping listing: no community link found')
                continue

            for span in details[0].select('span'):
                # NOTE(review): on Python 3 encode() hands bytes to parse_item;
                # kept as in the original -- confirm that is what it expects.
                text = ToolsBox.clearStr(span.get_text()).encode('utf8')
                each_data = self.add_advantage(self.parse_item(text), each_data)

            community = comms[0]
            each_data['community_name'] = community.get_text()

            href = community.get('href')
            each_data['comm_url'] = '' if href is None else 'http://xm.58.com' + href

            each_data['from'] = "58"

            # The second and third links, when present, carry the region and
            # the community address. Extraction stays best-effort: a failure
            # is printed and the field is simply left unset.
            for index, key in ((1, 'region'), (2, 'community_address')):
                if len(comms) <= index:
                    continue
                try:
                    each_data[key] = comms[index].get_text().strip()
                except Exception as e:
                    print(e)

            each_data = self.pipe(each_data)

            if not each_data:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)
                continue

            # For unknown reasons the community name is sometimes all digits;
            # such records are reported and kept out of the result.
            if re.search(r'^\d+$', each_data['community_name']):
                print('/////////////////出现纯数字的小区了!!!!!!////////////////////////')
                ToolsBox.priList(each_data)
                print(soup)
            else:
                page_datas.append(each_data)

        return page_datas