def __decode_project_name_row(cls, column_nodes):
     '''
     Pull the project name and the land serial number out of one table row.

     The row is read as label/value cell pairs. A value is stored only when
     the label cell in front of it, after ``utils.remove_blank_char``, equals
     the expected caption.

     :param column_nodes: cells of the row; indexes 0 to 3 are read.
     :return: dict that may hold ``project_name`` and ``land_serial_num``.
     '''
     # (index of the label cell, expected caption, key in the result dict)
     labelled_fields = (
         (0, '项目名称', 'project_name'),
         (2, '宗地号', 'land_serial_num'),
     )
     row = {}
     for label_index, caption, key in labelled_fields:
         if utils.remove_blank_char(column_nodes[label_index].text) != caption:
             continue
         # The value always sits in the cell right after its label.
         row[key] = utils.remove_blank_char(column_nodes[label_index + 1].text)
     return row
    def __decode_building_list(cls, building_table_node):
        '''
        Parse the building table into a list of building dicts.

        Each building dict carries ``project_name``, ``building_name``,
        ``plan_license`` and ``build_license`` (taken from the first four
        cells of a row), plus ``url`` when the fifth cell contains a link.

        :param building_table_node: the table node, or None when no building
            table was found.
        :return: dict with the single key ``building_list``; the list is empty
            when the table is missing or has no data rows.
        '''
        project = {'building_list': []}
        if building_table_node is None:
            # Nothing to parse: hand back the empty result instead of
            # falling through and calling find_all on None.
            return project

        # Skip the first 3 rows, which hold header information. Slicing is
        # used so that a table with fewer than 3 rows yields an empty list
        # rather than raising IndexError.
        building_nodes = building_table_node.find_all('tr')[3:]
        for building_node in building_nodes:
            column_nodes = building_node.find_all('td')
            if len(column_nodes) < 5:
                # Not a complete data row.
                continue
            building = {
                'project_name': utils.remove_blank_char(column_nodes[0].text),
                'building_name': utils.remove_blank_char(column_nodes[1].text),
                'plan_license': utils.remove_blank_char(column_nodes[2].text),
                'build_license': utils.remove_blank_char(column_nodes[3].text),
            }
            link_node = column_nodes[4].find('a')
            if link_node is not None:
                # The href is appended to the class-level base URL.
                building['url'] = '{}{}'.format(
                    cls.__url, utils.remove_blank_char(link_node['href']))
            project['building_list'].append(building)
        return project
 def __decode_contact_num_row(self, column_nodes):
     '''
     Read the land contract number and the years-of-use limit from one row.

     :param column_nodes: cells of the row; indexes 0 to 3 are read.
     :return: dict that may hold ``land_contact_num`` (cell 1, blank chars
         removed) and ``land_years_limit`` (``utils.get_num`` of cell 3).
     '''
     row = {}

     first_label = utils.remove_blank_char(column_nodes[0].text)
     if first_label == '合同文号':
         row['land_contact_num'] = utils.remove_blank_char(column_nodes[1].text)

     second_label = utils.remove_blank_char(column_nodes[2].text)
     if second_label == '使用年限':
         # The cell text is cleaned first, then reduced to its number.
         row['land_years_limit'] = utils.get_num(
             utils.remove_blank_char(column_nodes[3].text))

     return row
Exemple #4
0
 def __decode_area(cls, column_nodes):
     '''
     Read the three area figures of a house from one table row.

     The row is read as label/value cell pairs; each value cell is passed to
     ``utils.get_num`` when its label cell matches the expected caption.

     :param column_nodes: cells of the row; indexes 0 to 5 are read.
     :return: dict that may hold ``build_area``, ``inside_area`` and
         ``share_area``.
     '''
     # (index of the label cell, expected caption, key in the result dict)
     area_fields = (
         (0, '建筑面积', 'build_area'),
         (2, '户内面积', 'inside_area'),
         (4, '分摊面积', 'share_area'),
     )
     areas = {}
     for label_index, caption, key in area_fields:
         if utils.remove_blank_char(column_nodes[label_index].text) == caption:
             areas[key] = utils.get_num(column_nodes[label_index + 1].text)
     return areas
Exemple #5
0
 def __decode_contact_and_price(cls, column_nodes):
     '''
     Read the contract code and the registered price from one table row.

     :param column_nodes: cells of the row; indexes 0 to 3 are read.
     :return: dict that may hold ``contact_code`` (cell 1, blank chars
         removed) and ``price`` (``utils.get_num`` of the cleaned cell 3).
     '''
     result = {}

     code_label = utils.remove_blank_char(column_nodes[0].text)
     if code_label == '合同号':
         result['contact_code'] = utils.remove_blank_char(column_nodes[1].text)

     price_label = utils.remove_blank_char(column_nodes[2].text)
     if price_label == '备案价格':
         # Example cell text: "58800元 / 平方米(按建筑面积计)", i.e. a price
         # per square metre calculated on building area.
         result['price'] = utils.get_num(
             utils.remove_blank_char(column_nodes[3].text))

     return result
 def __decode_now_sale_row(self, column_nodes):
     '''
     Read the current-sale unit count and total area from one table row.

     :param column_nodes: cells of the row; indexes 0 to 3 are read.
     :return: dict that may hold ``now_sale_count`` (cleaned text of cell 1)
         and ``now_area`` (cleaned text of cell 3, or the integer 0 when that
         text is empty).
     '''
     row = {}

     count_label = utils.remove_blank_char(column_nodes[0].text)
     if count_label == '现售总套数':
         row['now_sale_count'] = utils.remove_blank_char(column_nodes[1].text)

     area_label = utils.remove_blank_char(column_nodes[2].text)
     if area_label == '现售总面积':
         area_text = utils.remove_blank_char(column_nodes[3].text)
         # An empty cell is stored as 0 rather than as an empty string.
         row['now_area'] = area_text if len(area_text) > 0 else 0

     return row
Exemple #7
0
    def __decode_house(cls, house_node, branch_name):
        '''
        Fetch and decode the detail page of one house.

        The house node must contain exactly two ``div`` elements: the first
        holds the room number and the second holds a link to the detail page.

        :param house_node: node describing one house in the listing.
        :param branch_name: name of the branch the house belongs to; used
            only in log messages.
        :return: result of ``NewHSrcHousePageDecoder.decode`` on the detail
            page, or None when the node layout is unexpected, the link is
            missing, or the page request fails.
        '''
        divs = house_node.find_all('div')
        if len(divs) != 2:
            utils.print('获取房间信息失败: {}, {}'.format(branch_name,
                                                  house_node.text))
            return None

        room_num = utils.remove_blank_char(divs[0].text)
        anchor = divs[1].find('a')
        if anchor is None:
            utils.print('获取房间的连接信息失败, {}, {}'.format(branch_name,
                                                     house_node.text))
            return None

        # The href is appended to the class-level base URL.
        detail_url = '{}{}'.format(cls.__url, anchor['href'])
        utils.print('读取房间 {} {} {} {}的信息...'.format(cls.__project_name,
                                                    cls.__building_name,
                                                    branch_name,
                                                    room_num))
        response = utils.request_with_retry(detail_url)
        if response is None:
            utils.print('读取房屋{}的页面信息失败'.format(room_num))
            return None

        return NewHSrcHousePageDecoder.decode(
            BeautifulSoup(response.text, 'lxml'))
Exemple #8
0
    def __crawl_one_page(self, pageindex):
        '''
        Crawl one page of house listings and store the rows found.

        :param pageindex: page number substituted into ``self.__url``.
        :return: whether to go on to the next page. If the current page fails
            (request failed, total count unreadable on page 1, listing table
            missing) or none of the results were written, there is no point
            in looking at the next page. On the normal path this is the
            result of ``orm_ope.insert_item_list``.
        '''
        utils.print('抓取第{}页...'.format(pageindex))
        url = self.__url.format(pageindex)
        r = utils.request_with_retry(url)
        if r is None:
            # request_with_retry signals failure with None; stop paging
            # instead of crashing on ``None.text``.
            utils.print('抓取第{}页失败'.format(pageindex))
            return False

        s = BeautifulSoup(r.text, 'lxml')
        # The total count is read from the first page only.
        if pageindex == 1 and not self.__get_total_count(s):
            return False

        tablenode = s.find('table', id='DataGrid1')
        if tablenode is None:
            utils.print('查找表格失败')
            return False

        house_list = []
        for house_node in tablenode.find_all('tr'):
            house_properties = house_node.find_all('td')
            if len(house_properties) < 9:
                # Not a complete data row.
                continue
            if house_properties[0].text == '项目名称':
                # Header row.
                continue
            house = orm.OldHouseSource()
            # Fields filled: thedate, region, serial_num, project_name, area,
            # use_type, code, agency_info. Cell 5 is not used.
            house.project_name = utils.remove_blank_char(
                house_properties[0].text)
            house.serial_num = house_properties[1].text
            house.region = utils.remove_blank_char(house_properties[2].text)
            house.area = house_properties[3].text
            house.use_type = house_properties[4].text
            house.code = house_properties[6].text
            house.agency_info = utils.remove_blank_char(
                house_properties[7].text)
            house.thedate = house_properties[8].text
            house_list.append(house)

        return orm_ope.insert_item_list(house_list)
Exemple #9
0
 def __decode_branch_info(cls, branch_root_node):
     '''
     Parse the branch list: the current branch plus links to the others.

     :param branch_root_node: node containing the branch selector.
     :return: dict with
         ``current`` - cleaned text of the first ``font`` element (present
         only when such an element exists), e.g. "[A]座";
         ``list`` - one ``{'url': ..., 'name': ...}`` dict per ``a`` element.
     '''
     info = {'list': []}

     font_node = branch_root_node.find('font')
     if font_node is not None:
         info['current'] = utils.remove_blank_char(font_node.text)

     for anchor in branch_root_node.find_all('a'):
         info['list'].append({
             # The href is appended to the class-level base URL.
             'url': '{}{}'.format(cls.__url, anchor['href']),
             'name': utils.remove_blank_char(anchor.text),
         })
     return info
Exemple #10
0
 def __decode_floor_roomnum_usage(cls, column_nodes):
     '''
     Read the floor, room number and usage of a house from one table row.

     The row is read as label/value cell pairs; a value is stored only when
     its label cell matches the expected caption.

     :param column_nodes: cells of the row; indexes 0 to 5 are read.
     :return: dict that may hold ``floor``, ``room_num`` and ``usage``.
     '''
     # (index of the label cell, expected caption, key in the result dict)
     labelled_fields = (
         (0, '楼层', 'floor'),
         (2, '房号', 'room_num'),
         (4, '用途', 'usage'),
     )
     details = {}
     for label_index, caption, key in labelled_fields:
         if utils.remove_blank_char(column_nodes[label_index].text) == caption:
             details[key] = utils.remove_blank_char(
                 column_nodes[label_index + 1].text)
     return details
Exemple #11
0
 def __decode_one_row(cls, row_node):
     '''
     Decode one row of the house detail table by dispatching on its first cell.

     :param row_node: the ``tr`` node to decode.
     :return: dict produced by the matching row decoder, or an empty dict
         when the row has fewer than two cells or its first cell is not one
         of the recognised captions.
     '''
     cells = row_node.find_all('td')
     if len(cells) < 2:
         return {}

     caption = utils.remove_blank_char(cells[0].text)
     if caption == '项目楼栋情况':
         return cls.__decode_project_buiding_branch_row(cells)
     if caption == '合同号':
         return cls.__decode_contact_and_price(cells)
     if caption == '楼层':
         return cls.__decode_floor_roomnum_usage(cells)
     if caption == '建筑面积':
         return cls.__decode_area(cells)
     # Unrecognised row: nothing to extract.
     return {}
Exemple #12
0
 def __decode_project_buiding_branch_row(cls, column_nodes):
     '''
     Read the building name, branch and house type from one table row.

     The row is read as label/value cell pairs; a value is stored only when
     its label cell matches the expected caption.

     :param column_nodes: cells of the row; indexes 0 to 5 are read.
     :return: dict that may hold ``building_name``, ``branch`` and
         ``house_type``.
     '''
     # (index of the label cell, expected caption, key in the result dict)
     labelled_fields = (
         (0, '项目楼栋情况', 'building_name'),
         (2, '座号', 'branch'),
         (4, '户型', 'house_type'),
     )
     details = {}
     for label_index, caption, key in labelled_fields:
         if utils.remove_blank_char(column_nodes[label_index].text) == caption:
             details[key] = utils.remove_blank_char(
                 column_nodes[label_index + 1].text)
     return details
 def __decode_one_project_info_row(cls, row_node):
     '''
     Decode one row of the project info table into a dict, if it is relevant.

     The first cell of the row selects which row decoder is applied.

     :param row_node: the ``tr`` node to decode.
     :return: dict produced by the matching row decoder, or an empty dict
         when the row has fewer than two cells or its first cell is not one
         of the recognised captions.
     '''
     cells = row_node.find_all('td')
     if len(cells) < 2:
         return {}

     caption = utils.remove_blank_char(cells[0].text)
     if caption == '项目名称':
         return cls.__decode_project_name_row(cells)
     if caption == '宗地位置':
         return cls.__decode_address_row(cells)
     if caption == '合同文号':
         return cls.__decode_contact_num_row(cells)
     if caption == '房屋用途':
         return cls.__decode_house_usage_row(cells)
     if caption == '土地用途':
         return cls.__decode_land_usage_row(cells)
     if caption == '预售总套数':
         return cls.__decode_pre_sale_row(cells)
     if caption == '现售总套数':
         return cls.__decode_now_sale_row(cells)
     # Unrecognised row: nothing to extract.
     return {}
 def __decode_land_usage_row(self, column_nodes):
     '''
     Read the land usage from one table row.

     :param column_nodes: cells of the row; indexes 0 and 1 are read.
     :return: ``{'land_usage': <cleaned text of cell 1>}`` when cell 0 holds
         the expected caption, otherwise an empty dict.
     '''
     caption = utils.remove_blank_char(column_nodes[0].text)
     if caption != '土地用途':
         return {}
     return {'land_usage': utils.remove_blank_char(column_nodes[1].text)}
 def __decode_address_row(self, column_nodes):
     '''
     Read the land location (address) from one table row.

     :param column_nodes: cells of the row; indexes 0 and 1 are read.
     :return: ``{'address': <cleaned text of cell 1>}`` when cell 0 holds the
         expected caption, otherwise an empty dict.
     '''
     caption = utils.remove_blank_char(column_nodes[0].text)
     if caption != '宗地位置':
         return {}
     return {'address': utils.remove_blank_char(column_nodes[1].text)}