def _get_level1_links(self, start_url):
    """Collect the first-level filter links found on ``start_url``.

    The page is fetched with ``get_soup`` (assumed to return a
    BeautifulSoup-like object -- confirm against its definition). Only the
    first element matching the filter selector is inspected.

    Returns:
        A list of ``(url, text)`` tuples, one per ``div > a`` anchor, where
        ``url`` is ``self.base_url`` concatenated with the anchor's ``href``
        and ``text`` is the anchor's text.

    Raises:
        IndexError: if the filter selector matches nothing on the page.
    """
    page = get_soup(start_url)
    # Only the first matching filter container is used.
    filter_box = page.select(
        "body div.m-filter div.position > dl > dd > div")[0]
    return [(self.base_url + anchor["href"], anchor.text)
            for anchor in filter_box.select("div > a")]
def _get_level2_links(self, part, start_url):
    """Collect the second-level filter links found on ``start_url``.

    The page is fetched with ``get_soup`` (assumed to return a
    BeautifulSoup-like object -- confirm against its definition). Within the
    first element matching the filter selector, the anchors of its second
    ``div`` descendant are read.

    Args:
        part: Value stored alongside every link text in the result.
        start_url: Address of the page to fetch.

    Returns:
        A dict mapping ``self.base_url + href`` to ``(part, anchor_text)``.
        When two anchors share an ``href`` the later one wins.

    Raises:
        IndexError: if the filter selector matches nothing, or the matched
            container has fewer than two ``div`` descendants.
    """
    page = get_soup(start_url)
    filter_box = page.select(
        "body div.m-filter div.position > dl > dd > div")[0]
    # Index 1: the second ``div`` inside the filter container holds the links.
    sub_row = filter_box.select("div")[1]
    return {
        self.base_url + anchor["href"]: (part, anchor.text)
        for anchor in sub_row.select("a")
    }
def _extract(self):
    """Scrape the listing page at ``self.url`` into a nested dict.

    Returns:
        A dict with four keys:

        * ``"identity"`` -- ``self.url``.
        * ``"finance"`` -- ``total`` (page value times 1e4; presumably the
          page quotes the price in units of ten thousand -- confirm),
          ``down_payment`` and ``tax`` (always NaN), and ``per_m2`` from
          ``self._get_per_m2``.
        * ``"location"`` -- ``name``, ``partition`` (``self.part``),
          ``area`` (``self.area``) and ``supplement`` (texts of the
          area-name anchors).
        * ``"property"`` -- fields looked up by label in the page's
          base-info list; a label that is absent yields ``None``
          (``total_area`` is whatever ``self._get_area`` returns for the
          looked-up value, including ``None``).

    Raises:
        IndexError: if the price or community-name selector matches nothing,
            or a base-info ``li`` contains no ``span``.
        ValueError: if the price text cannot be parsed as a float.
    """
    page = get_soup(self.url)
    record = {"identity": self.url}

    # money
    price_node = page.select(
        "body > div.overview > div.content > div.price > span.total")[0]
    record["finance"] = {
        "total": float(price_node.text) * 1e4,
        "down_payment": float('nan'),
        "tax": float('nan'),
        "per_m2": self._get_per_m2(page),
    }

    # location
    community = page.select(
        "body > div.overview > div.content > div.aroundInfo > div.communityName > a.info"
    )[0]
    record["location"] = {
        "name": community.text,
        "partition": self.part,
        "area": self.area,
        "supplement": [
            anchor.text for anchor in page.select(
                "body > div.overview > div.content > div.aroundInfo > div.areaName > a"
            )
        ],
    }

    # property
    # Each <li> holds a <span> label followed by a bare text node; map
    # label -> the li's first direct text node.
    facts = {}
    for item in page.select(
            "#introduction > div > div > div.base > div.content > ul > li"):
        label = item.select('span')[0].text
        # NOTE(review): ``text=`` is the legacy spelling of ``string=`` in
        # Beautiful Soup; kept as-is to avoid depending on the bs4 version.
        facts[label] = item.find(text=True, recursive=False)

    # ``.get`` yields None for labels missing from the page.
    record["property"] = {
        "formation": facts.get("房屋户型"),
        "floor": facts.get("所在楼层"),
        "total_area": self._get_area(facts.get("建筑面积")),
        "construct_type": facts.get("建筑类型"),
        "structure": facts.get("建筑结构"),
        "orientation": facts.get("房屋朝向"),
        "construct_date": self._construct_date(page),
    }
    return record
def _generate_count_and_pages(self):
    """Fetch ``self.url`` once and derive both the count and the pages.

    Returns:
        A 2-tuple ``(count, pages)`` holding the results of
        ``self._get_count`` and ``self._get_pages``, both applied to the
        same parsed page.
    """
    page = get_soup(self.url)
    count = self._get_count(page)
    pages = self._get_pages(page)
    return count, pages
def _get_links(self):
    """Return the ``href`` of every title anchor on the page at ``self.url``.

    Returns:
        A list of ``href`` attribute values, in document order as returned
        by the selector; empty when the selector matches nothing.

    Raises:
        KeyError: if a matched anchor has no ``href`` attribute (assuming
            ``get_soup`` returns BeautifulSoup tags -- confirm).
    """
    page = get_soup(self.url)
    hrefs = []
    for anchor in page.select(
            "body > div.content > div.leftContent > ul > li > div.info.clear > div.title > a"
    ):
        hrefs.append(anchor["href"])
    return hrefs