コード例 #1
0
    def __crawl_one_page_project(self, page_index):
        """Fetch one page of the project list, parse it, and persist summaries.

        Page 1 is a plain GET of the base URL; later pages POST the paging
        form data. Silently returns after logging when the request fails.
        """
        utils.print('正在读取第%d页项目列表...' % page_index)
        if page_index == 1:
            response = utils.request_with_retry(self.__url)
        else:
            response = utils.request_with_retry(
                '{}index.aspx'.format(self.__url),
                self.__create_form_data(page_index))
        if response is None:
            utils.print('读取项目页面失败, page_index = {}'.format(page_index))
            return

        page = BeautifulSoup(response.text, 'lxml')
        # Refresh the ASP.NET form state for the next paged request.
        self.extract_formdata_from_newpage(page)
        if page_index == 1:
            self.__get_total_count(page)

        # Convert each parsed node into a project dict, skipping failures.
        projects = []
        for node in self.__get_project_nodes(page):
            item = self.__convert_project_node_to_project(node)
            if item is None:
                continue
            item['is_crawled'] = False
            projects.append(item)
        utils.print('解析出%d条项目信息' % len(projects))

        written = NewHouseSourceDao.write_project_summary_list(projects)
        utils.print('写入数据库 %d 条记录' % written)
コード例 #2
0
    def __crawl_project_detail(self, project_info):
        '''
        Fetch the detail page of one project plus every building page under
        it, and write the decoded data to the database.
        :param project_info: summary dict captured from the project list page
                             (expects keys 'project_name', 'url', 'id',
                             'building_list', 'presale_license_num')
        :return: False when the project page cannot be read or decoded,
                 True once the project has been marked crawled.
        '''
        utils.print('读取项目{}页面'.format(project_info['project_name']))
        r = utils.request_with_retry(project_info['url'])
        if r is None:
            utils.print('读取项目: {} , 页面失败...'.format(
                project_info['project_name']))
            return False

        s = BeautifulSoup(r.text, 'lxml')
        if not NewHSrcPrjPageDecoder.decode_and_write(s, project_info):
            return False

        for building in project_info['building_list']:
            try:
                utils.print('读取 {} 的 {} 页面...'.format(
                    project_info['project_name'], building['building_name']))
                building['project_id'] = project_info['id']
                building['is_crawled'] = False
                if NewHouseSourceDao.is_building_crawled(building) > 0:
                    continue  # already crawled earlier; skip the network trip

                r = utils.request_with_retry(building['url'])
                if r is None:
                    utils.print('读取项目 {} 的楼栋 {} 页面失败.'.format(
                        project_info['project_name'],
                        building['building_name']))
                    continue

                html_node = BeautifulSoup(r.text, 'lxml')
                house_list = NewHSrcBldPageDecoder.decode(
                    html_node, building['building_name'],
                    project_info['project_name'])

                if NewHouseSourceDao.write_newhouse_building(building) == 0:
                    continue
                building_id = NewHouseSourceDao.get_building_id(building)
                if building_id == 0:
                    # BUGFIX: was a bare print(); use utils.print like every
                    # other message in this method so output reaches the
                    # same sink.
                    utils.print('获取楼栋id失败,{}, {}'.format(
                        project_info['project_name'],
                        building['building_name']))
                    continue
                for house in house_list:
                    house['building_id'] = building_id

                NewHouseSourceDao.write_houselist(house_list)
                NewHouseSourceDao.update_building_state_to_crawled(building_id)
            except Exception as e:
                # Best-effort per building: log and keep crawling the rest.
                utils.print('抓取建筑 {} 失败...'.format(building['building_name']))
                utils.print(str(e))

        NewHouseSourceDao.update_project_state_to_crawled(
            project_info['presale_license_num'])
        # BUGFIX: previously fell through returning None (falsy), so a
        # truthiness check by the caller could not distinguish success
        # from the explicit `return False` failure paths above.
        return True
コード例 #3
0
    def decode(cls, page_node, building_name, project_name):
        '''
        Decode one building page into a list of house dicts, following
        every linked branch page.
        :param page_node: BeautifulSoup node of the building page
        :param building_name: name of the building being decoded
        :param project_name: name of the owning project
        :return: house_list accumulated over the current branch plus each
                 branch listed in the branch section (may be empty).
        '''
        cls.__project_name = project_name
        cls.__building_name = building_name
        branch_node = page_node.find('div', id='divShowBranch')
        branch_info = {}
        if branch_node is not None:
            branch_info = cls.__decode_branch_info(branch_node)

        house_list_node = cls.__get_house_list_node(page_node)
        house_list = []
        # 'current' is absent when the page has no branch section.
        if house_list_node is not None and 'current' in branch_info:
            house_list = cls.__decode_house_list(house_list_node,
                                                 branch_info['current'])

        # BUGFIX: branch_info may be empty ({}) when divShowBranch is
        # missing, so read 'list' defensively instead of raising KeyError.
        for branch in branch_info.get('list', []):
            r = utils.request_with_retry(branch['url'])
            # BUGFIX: request_with_retry returns None on failure; skip the
            # branch instead of crashing on r.text.
            if r is None:
                continue
            html_node = BeautifulSoup(r.text, 'lxml')
            house_list_node = cls.__get_house_list_node(html_node)
            if house_list_node is not None:
                house_list.extend(
                    cls.__decode_house_list(house_list_node, branch['name']))

        return house_list
コード例 #4
0
    def __decode_house(cls, house_node, branch_name):
        '''
        Decode a single house row by following its link to the detail page.
        :param house_node: node containing the house row (expects two divs)
        :param branch_name: branch this house belongs to
        :return: decoded house info from the detail page, or None on failure
        '''
        divs = house_node.find_all('div')
        if len(divs) != 2:
            utils.print('获取房间信息失败: {}, {}'.format(branch_name,
                                                  house_node.text))
            return None

        house = {
            'branch': branch_name,
            'room_num': utils.remove_blank_char(divs[0].text),
        }
        anchor = divs[1].find('a')
        if anchor is None:
            utils.print('获取房间的连接信息失败, {}, {}'.format(branch_name,
                                                     house_node.text))
            return None

        detail_url = '{}{}'.format(cls.__url, anchor['href'])
        utils.print('读取房间 {} {} {} {}的信息...'.format(cls.__project_name,
                                                    cls.__building_name,
                                                    branch_name,
                                                    house['room_num']))
        response = utils.request_with_retry(detail_url)
        if response is None:
            utils.print('读取房屋{}的页面信息失败'.format(house['room_num']))
            return None

        detail_page = BeautifulSoup(response.text, 'lxml')
        return NewHSrcHousePageDecoder.decode(detail_page)
コード例 #5
0
    def __query_one_area(self, area_name):
        '''
        Query the listing page for one administrative area.
        The form data passed differs per area, and so does the response;
        its first and last lines are not valid HTML, so downstream parsing
        must tolerate that.
        :param area_name: area to query; '全市' means city-wide (plain GET,
                          no form data)
        :return: None
        '''
        print('{} query {} info...'.format(dt.now(), area_name))
        if area_name == '全市':
            r = utils.request_with_retry(self.__url)
        else:
            # Merge this area's ASP.NET postback fields into the shared form.
            fromdata = self.areas[area_name]
            self.form_data[
                'ctl00$ContentPlaceHolder1$scriptManager1'] = fromdata[
                    'ctl00$ContentPlaceHolder1$scriptManager1']
            self.form_data['__EVENTTARGET'] = fromdata['__EVENTTARGET']
            r = utils.request_with_retry(self.__url, self.form_data)

        # BUGFIX: request_with_retry returns None on failure; previously
        # this crashed on r.text. Report and bail out instead.
        if r is None:
            print('{} query {} failed...'.format(dt.now(), area_name))
            return

        s = BeautifulSoup(r.text, 'lxml')
        self.extract_formdata_from_newpage(s)
        self.__extract_info_from_page_into_db(s, area_name)
コード例 #6
0
 def run(self):
     """Fetch the project list once and mail an alarm for each new project.

     A project counts as new when writing its summary to the database
     reports at least one affected row.
     """
     utils.print('正在读取项目列表...')
     response = utils.request_with_retry(self.__url)
     if response is None:
         utils.print('读取项目页面失败...')
         return

     page = BeautifulSoup(response.text, 'lxml')
     for node in self.__get_project_nodes(page):
         project = self.__convert_project_node_to_project(node)
         if project is None:
             continue
         project['is_crawled'] = False
         if NewHouseSourceDao.write_project_summary(project) > 0:
             MailSender.send_alarm_message('深圳有新地产项目通过预售', str(project))
コード例 #7
0
    def __crawl_one_page(self, pageindex):
        '''
        Crawl one page of house listings from the DataGrid1 table.
        :param pageindex: 1-based page number substituted into self.__url
        :return: whether to continue with the next page; stop when this
                 page fails or nothing from it was written to the database.
        '''
        utils.print('抓取第{}页...'.format(pageindex))
        url = self.__url.format(pageindex)
        r = utils.request_with_retry(url)
        # BUGFIX: request_with_retry returns None on failure; previously
        # this crashed on r.text. Stop paging instead.
        if r is None:
            utils.print('读取第{}页失败'.format(pageindex))
            return False

        s = BeautifulSoup(r.text, 'lxml')
        if pageindex == 1:
            if not self.__get_total_count(s):
                return False

        tablenode = s.find('table', id='DataGrid1')
        if tablenode is None:
            utils.print('查找表格失败')
            return False

        house_list = []
        for house_node in tablenode.find_all('tr'):
            house_properties = house_node.find_all('td')
            if len(house_properties) < 9:
                continue  # not a data row
            if house_properties[0].text == '项目名称':
                continue  # header row
            house = orm.OldHouseSource()
            # Cell layout: 0=project_name, 1=serial_num, 2=region, 3=area,
            # 4=use_type, 6=code, 7=agency_info, 8=thedate (index 5 unused).
            house.project_name = utils.remove_blank_char(
                house_properties[0].text)
            house.serial_num = house_properties[1].text
            house.region = utils.remove_blank_char(house_properties[2].text)
            house.area = house_properties[3].text
            house.use_type = house_properties[4].text
            house.code = house_properties[6].text
            house.agency_info = utils.remove_blank_char(
                house_properties[7].text)
            house.thedate = house_properties[8].text
            house_list.append(house)

        return orm_ope.insert_item_list(house_list)