def __decode_project_name_row(cls, column_nodes):
    '''
    Pull the project name and the land serial number out of one table row.

    The row is read as label/value cell pairs; a value is stored only when
    the label cell in front of it matches the expected label.

    :param column_nodes: cell nodes of the row, each exposing ``.text``.
        Cells 0 and 2 are always read, so at least three cells are needed.
    :return: dict that may contain ``project_name`` and ``land_serial_num``
    '''
    strip = utils.remove_blank_char
    # (index of the label cell, expected label text, key in the result)
    layout = (
        (0, '项目名称', 'project_name'),
        (2, '宗地号', 'land_serial_num'),
    )
    project = {}
    for label_pos, expected_label, key in layout:
        if strip(column_nodes[label_pos].text) == expected_label:
            project[key] = strip(column_nodes[label_pos + 1].text)
    return project
def __decode_building_list(cls, building_table_node):
    '''
    Decode the building table of a project page.

    Each data row yields a dict with the keys ``project_name``,
    ``building_name``, ``plan_license`` and ``build_license`` taken from
    the first four cells, plus ``url`` when the fifth cell contains a link.

    :param building_table_node: the table node holding the building rows,
        or None when the page has no such table
    :return: dict with a single key ``building_list`` (a list of building
        dicts); the list is empty when the table is missing or has no
        usable rows
    '''
    project = {'building_list': []}
    if building_table_node is None:
        # The original evaluated ``project`` here without returning it and
        # then crashed on ``None.find_all``.
        return project

    strip = utils.remove_blank_char
    # The first 3 rows are table headers. Slicing (instead of deleting
    # three times) also copes with tables that have fewer than 3 rows.
    for building_node in building_table_node.find_all('tr')[3:]:
        column_nodes = building_node.find_all('td')
        if len(column_nodes) < 5:
            continue
        building = {
            'project_name': strip(column_nodes[0].text),
            'building_name': strip(column_nodes[1].text),
            'plan_license': strip(column_nodes[2].text),
            'build_license': strip(column_nodes[3].text),
        }
        link_node = column_nodes[4].find('a')
        if link_node is not None:
            # The href is appended to the class-level base URL.
            building['url'] = '{}{}'.format(cls.__url,
                                            strip(link_node['href']))
        project['building_list'].append(building)
    return project
def __decode_contact_num_row(self, column_nodes):
    '''
    Read the land contract number and the usage term from one table row.

    :param column_nodes: cell nodes of the row, each exposing ``.text``.
        Cells 0 and 2 are always read.
    :return: dict that may contain ``land_contact_num`` (cleaned text) and
        ``land_years_limit`` (the cleaned text passed through
        ``utils.get_num``)
    '''
    strip = utils.remove_blank_char
    project = {}

    contract_label = strip(column_nodes[0].text)
    if contract_label == '合同文号':
        project['land_contact_num'] = strip(column_nodes[1].text)

    years_label = strip(column_nodes[2].text)
    if years_label == '使用年限':
        project['land_years_limit'] = utils.get_num(
            strip(column_nodes[3].text))

    return project
def __decode_area(cls, column_nodes):
    '''
    Read the three area figures of a house from one table row.

    Labels are compared after blank characters are removed; each value is
    the raw cell text passed through ``utils.get_num``.

    :param column_nodes: cell nodes of the row, each exposing ``.text``.
        Cells 0, 2 and 4 are always read.
    :return: dict that may contain ``build_area``, ``inside_area`` and
        ``share_area``
    '''
    # (index of the label cell, expected label text, key in the result)
    layout = (
        (0, '建筑面积', 'build_area'),
        (2, '户内面积', 'inside_area'),
        (4, '分摊面积', 'share_area'),
    )
    house = {}
    for label_pos, expected_label, key in layout:
        label = utils.remove_blank_char(column_nodes[label_pos].text)
        if label == expected_label:
            house[key] = utils.get_num(column_nodes[label_pos + 1].text)
    return house
def __decode_contact_and_price(cls, column_nodes):
    '''
    Read the contract code and the registered price from one table row.

    :param column_nodes: cell nodes of the row, each exposing ``.text``.
        Cells 0 and 2 are always read.
    :return: dict that may contain ``contact_code`` (cleaned text) and
        ``price`` (the cleaned text passed through ``utils.get_num``)
    '''
    strip = utils.remove_blank_char
    house = {}

    if strip(column_nodes[0].text) == '合同号':
        house['contact_code'] = strip(column_nodes[1].text)

    if strip(column_nodes[2].text) != '备案价格':
        return house

    # Sample price cell from the site: "58800元 / 平方米(按建筑面积计)",
    # i.e. yuan per square metre by building area. utils.get_num presumably
    # extracts the numeric part - confirm against utils.
    house['price'] = utils.get_num(strip(column_nodes[3].text))
    return house
def __decode_now_sale_row(self, column_nodes):
    '''
    Read the current-sale unit count and total area from one table row.

    :param column_nodes: cell nodes of the row, each exposing ``.text``.
        Cells 0 and 2 are always read.
    :return: dict that may contain ``now_sale_count`` (cleaned text) and
        ``now_area`` (cleaned text, or the integer 0 when that text is
        empty)
    '''
    strip = utils.remove_blank_char
    project = {}

    if strip(column_nodes[0].text) == '现售总套数':
        project['now_sale_count'] = strip(column_nodes[1].text)

    if strip(column_nodes[2].text) == '现售总面积':
        area_text = strip(column_nodes[3].text)
        # An empty area cell is stored as 0 rather than as an empty string.
        project['now_area'] = area_text if len(area_text) > 0 else 0

    return project
def __decode_house(cls, house_node, branch_name):
    '''
    Download and decode the detail page of a single house.

    The house node must contain exactly two ``div`` nodes: the first holds
    the room number, the second holds the link to the detail page.

    NOTE(review): the branch name and room number are only used in log
    messages; the returned value comes solely from
    ``NewHSrcHousePageDecoder.decode``.

    :param house_node: node describing one house in the building table
    :param branch_name: name of the branch the house belongs to
    :return: result of ``NewHSrcHousePageDecoder.decode`` for the detail
        page, or None when the node is malformed or the request fails
    '''
    parts = house_node.find_all('div')
    if len(parts) != 2:
        utils.print('获取房间信息失败: {}, {}'.format(branch_name, house_node.text))
        return None

    room_num = utils.remove_blank_char(parts[0].text)

    anchor = parts[1].find('a')
    if anchor is None:
        utils.print('获取房间的连接信息失败, {}, {}'.format(branch_name, house_node.text))
        return None

    detail_url = '{}{}'.format(cls.__url, anchor['href'])
    utils.print('读取房间 {} {} {} {}的信息...'.format(cls.__project_name,
                                                cls.__building_name,
                                                branch_name,
                                                room_num))

    response = utils.request_with_retry(detail_url)
    if response is None:
        utils.print('读取房屋{}的页面信息失败'.format(room_num))
        return None

    page = BeautifulSoup(response.text, 'lxml')
    return NewHSrcHousePageDecoder.decode(page)
def __crawl_one_page(self, pageindex):
    '''
    Fetch one page of house listings and store the records found on it.

    :param pageindex: page number substituted into the URL template; on
        page 1 the total count is read from the page as well
    :return: whether the caller should go on to the next page. False is
        returned when the page cannot be fetched, the total count cannot
        be read, or the data table is missing; otherwise the result of
        ``orm_ope.insert_item_list`` is returned.
    '''
    utils.print('抓取第{}页...'.format(pageindex))
    url = self.__url.format(pageindex)
    r = utils.request_with_retry(url)
    if r is None:
        # request_with_retry can come back empty-handed (other callers in
        # this file guard for None); treat that as a failed page instead
        # of crashing on ``r.text``.
        utils.print('抓取第{}页失败'.format(pageindex))
        return False

    s = BeautifulSoup(r.text, 'lxml')
    if pageindex == 1 and not self.__get_total_count(s):
        return False

    tablenode = s.find('table', id='DataGrid1')
    if tablenode is None:
        utils.print('查找表格失败')
        return False

    house_list = []
    for house_node in tablenode.find_all('tr'):
        house_properties = house_node.find_all('td')
        if len(house_properties) < 9:
            continue
        # Skip the header row, whose first cell is the column title.
        if house_properties[0].text == '项目名称':
            continue

        house = orm.OldHouseSource()
        # Cell 5 is intentionally not stored.
        house.project_name = utils.remove_blank_char(
            house_properties[0].text)
        house.serial_num = house_properties[1].text
        house.region = utils.remove_blank_char(house_properties[2].text)
        house.area = house_properties[3].text
        house.use_type = house_properties[4].text
        house.code = house_properties[6].text
        house.agency_info = utils.remove_blank_char(
            house_properties[7].text)
        house.thedate = house_properties[8].text
        house_list.append(house)

    return orm_ope.insert_item_list(house_list)
def __decode_branch_info(cls, branch_root_node):
    '''
    Parse the branch selector: the current branch and the other branches.

    :param branch_root_node: node containing a ``font`` node for the
        current branch and one ``a`` node per other branch
    :return: dict with ``list`` (one ``{'url': ..., 'name': ...}`` dict per
        link found) and, when a ``font`` node exists, ``current`` (its
        cleaned text)
    '''
    branch_info = {'list': []}

    current_node = branch_root_node.find('font')
    if current_node is not None:
        branch_info['current'] = utils.remove_blank_char(current_node.text)

    # Link targets are appended to the class-level base URL.
    branch_info['list'].extend(
        {
            'url': '{}{}'.format(cls.__url, anchor['href']),
            'name': utils.remove_blank_char(anchor.text),
        }
        for anchor in branch_root_node.find_all('a')
    )
    return branch_info
def __decode_floor_roomnum_usage(cls, column_nodes):
    '''
    Read the floor, room number and usage of a house from one table row.

    :param column_nodes: cell nodes of the row, each exposing ``.text``.
        Cells 0, 2 and 4 are always read.
    :return: dict that may contain ``floor``, ``room_num`` and ``usage``,
        each as cleaned text
    '''
    strip = utils.remove_blank_char
    # (index of the label cell, expected label text, key in the result)
    layout = (
        (0, '楼层', 'floor'),
        (2, '房号', 'room_num'),
        (4, '用途', 'usage'),
    )
    house = {}
    for label_pos, expected_label, key in layout:
        if strip(column_nodes[label_pos].text) == expected_label:
            house[key] = strip(column_nodes[label_pos + 1].text)
    return house
def __decode_one_row(cls, row_node):
    '''
    Decode one row of the house detail table.

    The cleaned text of the first cell selects which row decoder is used.

    :param row_node: the table row node
    :return: dict produced by the matching row decoder, or an empty dict
        when the row has fewer than two cells or its label is not handled
    '''
    column_nodes = row_node.find_all('td')
    if len(column_nodes) < 2:
        return {}

    # Row label -> decoder for that kind of row.
    decoders = {
        '项目楼栋情况': cls.__decode_project_buiding_branch_row,
        '合同号': cls.__decode_contact_and_price,
        '楼层': cls.__decode_floor_roomnum_usage,
        '建筑面积': cls.__decode_area,
    }
    row_label = utils.remove_blank_char(column_nodes[0].text)
    decoder = decoders.get(row_label)
    if decoder is None:
        return {}
    return decoder(column_nodes)
def __decode_project_buiding_branch_row(cls, column_nodes):
    '''
    Read the building name, branch and house type from one table row.

    :param column_nodes: cell nodes of the row, each exposing ``.text``.
        Cells 0, 2 and 4 are always read.
    :return: dict that may contain ``building_name``, ``branch`` and
        ``house_type``, each as cleaned text
    '''
    strip = utils.remove_blank_char
    # (index of the label cell, expected label text, key in the result)
    layout = (
        (0, '项目楼栋情况', 'building_name'),
        (2, '座号', 'branch'),
        (4, '户型', 'house_type'),
    )
    house = {}
    for label_pos, expected_label, key in layout:
        if strip(column_nodes[label_pos].text) == expected_label:
            house[key] = strip(column_nodes[label_pos + 1].text)
    return house
def __decode_one_project_info_row(cls, row_node):
    '''
    Decode one row of the project info table into a dict.

    The cleaned text of the first cell selects which row decoder is used.

    :param row_node: the table row node
    :return: dict produced by the matching row decoder, or an empty dict
        when the row has fewer than two cells or its label is not handled
    '''
    cells = row_node.find_all('td')
    if len(cells) < 2:
        return {}

    row_label = utils.remove_blank_char(cells[0].text)

    # One guard per known row label; each decoder is looked up only when
    # its label actually matches.
    if row_label == '项目名称':
        return cls.__decode_project_name_row(cells)
    if row_label == '宗地位置':
        return cls.__decode_address_row(cells)
    if row_label == '合同文号':
        return cls.__decode_contact_num_row(cells)
    if row_label == '房屋用途':
        return cls.__decode_house_usage_row(cells)
    if row_label == '土地用途':
        return cls.__decode_land_usage_row(cells)
    if row_label == '预售总套数':
        return cls.__decode_pre_sale_row(cells)
    if row_label == '现售总套数':
        return cls.__decode_now_sale_row(cells)
    return {}
def __decode_land_usage_row(self, column_nodes):
    '''
    Read the land usage from one table row.

    :param column_nodes: cell nodes of the row, each exposing ``.text``;
        cell 0 is the label and cell 1 the value
    :return: ``{'land_usage': <cleaned text>}`` when the label matches,
        otherwise an empty dict
    '''
    row_label = utils.remove_blank_char(column_nodes[0].text)
    if row_label != '土地用途':
        return {}
    return {'land_usage': utils.remove_blank_char(column_nodes[1].text)}
def __decode_address_row(self, column_nodes):
    '''
    Read the land parcel address from one table row.

    :param column_nodes: cell nodes of the row, each exposing ``.text``;
        cell 0 is the label and cell 1 the value
    :return: ``{'address': <cleaned text>}`` when the label matches,
        otherwise an empty dict
    '''
    row_label = utils.remove_blank_char(column_nodes[0].text)
    if row_label != '宗地位置':
        return {}
    return {'address': utils.remove_blank_char(column_nodes[1].text)}