Example #1
0
    def _get_level1_links(self, start_url):
        """Collect the first-level filter links found on ``start_url``.

        The page is fetched with ``get_soup`` (defined elsewhere; presumably
        returns a BeautifulSoup document -- confirm). Only the first element
        matching the position-filter selector is inspected, and every
        ``div > a`` anchor beneath it contributes one entry.

        Args:
            start_url: Address of the page to fetch.

        Returns:
            A list of ``(url, text)`` tuples, where ``url`` is
            ``self.base_url`` concatenated with the anchor's ``href`` and
            ``text`` is the anchor's text.

        Raises:
            IndexError: If nothing matches the filter selector, since the
                first match is taken unconditionally.
        """
        page = get_soup(start_url)
        # Only the first matching container is used.
        container = page.select(
            "body div.m-filter div.position > dl > dd > div")[0]
        return [(self.base_url + anchor["href"], anchor.text)
                for anchor in container.select("div > a")]
Example #2
0
    def _get_level2_links(self, part, start_url):
        """Map the second-level filter links on ``start_url`` to their labels.

        The page is fetched with ``get_soup`` (defined elsewhere; presumably
        returns a BeautifulSoup document -- confirm). Inside the first
        element matching the position-filter selector, the anchors of the
        second nested ``div`` are read.

        Args:
            part: Value stored as the first item of every mapped tuple.
            start_url: Address of the page to fetch.

        Returns:
            A dict keyed by ``self.base_url + href`` whose values are
            ``(part, anchor_text)`` tuples. A repeated URL keeps the entry
            of its last occurrence.

        Raises:
            IndexError: If the filter container or its second nested
                ``div`` is not found.
        """
        page = get_soup(start_url)
        container = page.select(
            "body div.m-filter div.position > dl > dd > div")[0]
        # Index 1: the second nested ``div`` is the one whose anchors we read.
        anchors = container.select("div")[1].select("a")
        return {
            self.base_url + anchor["href"]: (part, anchor.text)
            for anchor in anchors
        }
Example #3
0
    def _extract(self):
        """Scrape the page at ``self.url`` into a nested record.

        The page is fetched with ``get_soup`` (defined elsewhere; presumably
        returns a BeautifulSoup document -- confirm).

        Returns:
            A dict with four entries:

            * ``"identity"`` -- ``self.url``.
            * ``"finance"`` -- ``total`` (the page's total-price figure
              times 1e4), ``down_payment`` and ``tax`` (both NaN
              placeholders) and ``per_m2`` (from ``self._get_per_m2``).
            * ``"location"`` -- ``name``, ``partition`` (``self.part``),
              ``area`` (``self.area``) and ``supplement`` (texts of the
              area-name anchors).
            * ``"property"`` -- values looked up by label in the page's
              base-info list; a label that is absent yields ``None``.

        Raises:
            IndexError: If the total-price element, the community-name
                element, or a ``span`` inside a base-info ``li`` is missing.
            ValueError: If the total-price text cannot be parsed as a float.
        """
        soup = get_soup(self.url)

        record = {"identity": self.url}

        # money
        price_nodes = soup.select(
            "body > div.overview > div.content > div.price > span.total")
        # NOTE(review): the 1e4 factor suggests the page quotes the price in
        # units of ten thousand -- confirm against the site.
        record["finance"] = {
            "total": float(price_nodes[0].text) * 1e4,
            "down_payment": float('nan'),
            "tax": float('nan'),
            "per_m2": self._get_per_m2(soup),
        }

        # location
        community_nodes = soup.select(
            "body > div.overview > div.content > div.aroundInfo > div.communityName > a.info"
        )
        area_anchors = soup.select(
            "body > div.overview > div.content > div.aroundInfo > div.areaName > a"
        )
        record["location"] = {
            "name": community_nodes[0].text,
            "partition": self.part,
            "area": self.area,
            "supplement": [anchor.text for anchor in area_anchors],
        }

        # property
        # Each <li> carries its label in the first <span> and its value as a
        # direct text child; collect them into a label -> value table.
        details = {}
        for item in soup.select(
                "#introduction > div > div > div.base > div.content > ul > li"):
            label = item.select('span')[0].text
            details[label] = item.find(text=True, recursive=False)

        # ``dict.get`` returns None for labels the page does not list.
        record["property"] = {
            "formation": details.get("房屋户型"),
            "floor": details.get("所在楼层"),
            "total_area": self._get_area(details.get("建筑面积")),
            "construct_type": details.get("建筑类型"),
            "structure": details.get("建筑结构"),
            "orientation": details.get("房屋朝向"),
            "construct_date": self._construct_date(soup),
        }

        return record
Example #4
0
 def _generate_count_and_pages(self):
     """Fetch ``self.url`` once and derive both the count and the pages.

     Returns:
         A ``(count, pages)`` tuple holding the results of
         ``self._get_count`` and ``self._get_pages`` applied to the same
         fetched document.
     """
     page = get_soup(self.url)
     count = self._get_count(page)
     pages = self._get_pages(page)
     return count, pages
Example #5
0
 def _get_links(self):
     """Return the ``href`` of every title anchor on the page at ``self.url``.

     The page is fetched with ``get_soup`` (defined elsewhere; presumably
     returns a BeautifulSoup document -- confirm).

     Returns:
         A list of ``href`` attribute values, in document order of the
         matched anchors; empty when nothing matches.
     """
     title_selector = (
         "body > div.content > div.leftContent > ul > li > div.info.clear > div.title > a"
     )
     anchors = get_soup(self.url).select(title_selector)
     return [anchor["href"] for anchor in anchors]