def __crawl_one_page_project(self, page_index):
    """Fetch one page of the project listing, parse it and persist summaries.

    Page 1 is fetched with a plain GET; later pages are ASP.NET postbacks
    and need the form data built by __create_form_data.

    :param page_index: 1-based page number of the project list
    """
    utils.print('正在读取第%d页项目列表...' % page_index)
    if page_index == 1:
        response = utils.request_with_retry(self.__url)
    else:
        response = utils.request_with_retry(
            '{}index.aspx'.format(self.__url),
            self.__create_form_data(page_index))
    if response is None:
        utils.print('读取项目页面失败, page_index = {}'.format(page_index))
        return
    page = BeautifulSoup(response.text, 'lxml')
    # refresh the hidden ASP.NET form fields for the next postback
    self.extract_formdata_from_newpage(page)
    if page_index == 1:
        self.__get_total_count(page)
    projects = []
    for node in self.__get_project_nodes(page):
        project = self.__convert_project_node_to_project(node)
        if project is None:
            continue
        project['is_crawled'] = False
        projects.append(project)
    utils.print('解析出%d条项目信息' % len(projects))
    written = NewHouseSourceDao.write_project_summary_list(projects)
    utils.print('写入数据库 %d 条记录' % written)
def __crawl_project_detail(self, project_info):
    """Fetch one project's detail page and every building page, writing the
    decoded results to the database.

    :param project_info: summary dict parsed from the listing page; reads
        'url', 'project_name', 'id', 'building_list', 'presale_license_num'
    :return: False when the project page cannot be fetched or decoded;
        otherwise None (callers only need falsiness for the error path)
    """
    utils.print('读取项目{}页面'.format(project_info['project_name']))
    r = utils.request_with_retry(project_info['url'])
    if r is None:
        utils.print('读取项目: {} , 页面失败...'.format(
            project_info['project_name']))
        return False
    s = BeautifulSoup(r.text, 'lxml')
    if not NewHSrcPrjPageDecoder.decode_and_write(s, project_info):
        return False
    for building in project_info['building_list']:
        try:
            utils.print('读取 {} 的 {} 页面...'.format(
                project_info['project_name'], building['building_name']))
            building['project_id'] = project_info['id']
            building['is_crawled'] = False
            # skip buildings already fully crawled in a previous run
            if NewHouseSourceDao.is_building_crawled(building) > 0:
                continue
            r = utils.request_with_retry(building['url'])
            if r is None:
                utils.print('读取项目 {} 的楼栋 {} 页面失败.'.format(
                    project_info['project_name'], building['building_name']))
                continue
            html_node = BeautifulSoup(r.text, 'lxml')
            house_list = NewHSrcBldPageDecoder.decode(
                html_node, building['building_name'],
                project_info['project_name'])
            if NewHouseSourceDao.write_newhouse_building(building) == 0:
                continue
            building_id = NewHouseSourceDao.get_building_id(building)
            if building_id == 0:
                # was a bare print(); routed through utils.print so all
                # crawler logging goes through the same channel
                utils.print('获取楼栋id失败,{}, {}'.format(
                    project_info['project_name'], building['building_name']))
                continue
            for house in house_list:
                house['building_id'] = building_id
            NewHouseSourceDao.write_houselist(house_list)
            NewHouseSourceDao.update_building_state_to_crawled(building_id)
        except Exception as e:
            # best-effort: a single failed building must not abort the
            # whole project; log and move on to the next building
            utils.print('抓取建筑 {} 失败...'.format(building['building_name']))
            utils.print(str(e))
    NewHouseSourceDao.update_project_state_to_crawled(
        project_info['presale_license_num'])
def decode(cls, page_node, building_name, project_name):
    """Decode a building page into a list of house dicts, following every
    branch (分支) link found on the page.

    :param page_node: BeautifulSoup node of the building page
    :param building_name: stored on the class for downstream logging
    :param project_name: stored on the class for downstream logging
    :return: house_list accumulated across the current view and all branches
    """
    cls.__project_name = project_name
    cls.__building_name = building_name
    branch_node = page_node.find('div', id='divShowBranch')
    branch_info = {}
    if branch_node is not None:
        branch_info = cls.__decode_branch_info(branch_node)
    house_list_node = cls.__get_house_list_node(page_node)
    house_list = []
    # 'current' in branch_info replaces branch_info.__contains__('current')
    if house_list_node is not None and 'current' in branch_info:
        house_list = cls.__decode_house_list(house_list_node,
                                             branch_info['current'])
        for branch in branch_info['list']:
            r = utils.request_with_retry(branch['url'])
            if r is None:
                # request_with_retry returns None after exhausting retries;
                # keep what we decoded so far instead of crashing on r.text
                utils.print('读取分支 {} 页面失败'.format(branch['name']))
                continue
            html_node = BeautifulSoup(r.text, 'lxml')
            house_list_node = cls.__get_house_list_node(html_node)
            if house_list_node is not None:
                house_list.extend(
                    cls.__decode_house_list(house_list_node, branch['name']))
    return house_list
def __decode_house(cls, house_node, branch_name):
    """Decode one house row: read the room number, follow the room link and
    hand the detail page to NewHSrcHousePageDecoder.

    :param house_node: row node expected to contain exactly two <div>s
    :param branch_name: branch this house belongs to
    :return: decoded house dict, or None when the row is malformed or the
        detail page cannot be fetched
    """
    divs = house_node.find_all('div')
    if len(divs) != 2:
        utils.print('获取房间信息失败: {}, {}'.format(branch_name, house_node.text))
        return None
    house = {
        'branch': branch_name,
        'room_num': utils.remove_blank_char(divs[0].text),
    }
    anchor = divs[1].find('a')
    if anchor is None:
        utils.print('获取房间的连接信息失败, {}, {}'.format(branch_name, house_node.text))
        return None
    detail_url = '{}{}'.format(cls.__url, anchor['href'])
    utils.print('读取房间 {} {} {} {}的信息...'.format(
        cls.__project_name, cls.__building_name, branch_name,
        house['room_num']))
    response = utils.request_with_retry(detail_url)
    if response is None:
        utils.print('读取房屋{}的页面信息失败'.format(house['room_num']))
        return None
    return NewHSrcHousePageDecoder.decode(BeautifulSoup(response.text, 'lxml'))
def __query_one_area(self, area_name):
    """Query one area's info and write it into the database.

    The form data passed differs per area, and the server's response is
    not clean HTML (the first and last lines are not markup), so the
    downstream parser must stay tolerant.

    :param area_name: '全市' queries the whole city with a plain GET;
        any other name triggers an ASP.NET postback for that district
    """
    print('{} query {} info...'.format(dt.now(), area_name))
    if area_name == '全市':
        r = utils.request_with_retry(self.__url)
    else:
        # copy the postback target controls recorded for this district
        # into the live form data before posting
        fromdata = self.areas[area_name]
        self.form_data[
            'ctl00$ContentPlaceHolder1$scriptManager1'] = fromdata[
                'ctl00$ContentPlaceHolder1$scriptManager1']
        self.form_data['__EVENTTARGET'] = fromdata['__EVENTTARGET']
        r = utils.request_with_retry(self.__url, self.form_data)
    if r is None:
        # request_with_retry returns None after exhausting retries; bail
        # out instead of crashing on r.text (matches the other crawlers)
        utils.print('查询 {} 页面失败'.format(area_name))
        return
    s = BeautifulSoup(r.text, 'lxml')
    self.extract_formdata_from_newpage(s)
    self.__extract_info_from_page_into_db(s, area_name)
def run(self):
    """Fetch the project listing once and send a mail alert for every
    project that was not already in the database."""
    utils.print('正在读取项目列表...')
    response = utils.request_with_retry(self.__url)
    if response is None:
        utils.print('读取项目页面失败...')
        return
    page = BeautifulSoup(response.text, 'lxml')
    for node in self.__get_project_nodes(page):
        project = self.__convert_project_node_to_project(node)
        if project is None:
            continue
        project['is_crawled'] = False
        # write_project_summary returns the number of rows written;
        # a positive count means this project is new — raise the alarm
        if NewHouseSourceDao.write_project_summary(project) > 0:
            MailSender.send_alarm_message('深圳有新地产项目通过预售', str(project))
def __crawl_one_page(self, pageindex):
    """Crawl one page of second-hand house listings into the database.

    :param pageindex: 1-based page number, substituted into self.__url
    :return: whether the caller should continue with the next page; False
        when this page failed or nothing was written
    """
    utils.print('抓取第{}页...'.format(pageindex))
    url = self.__url.format(pageindex)
    r = utils.request_with_retry(url)
    if r is None:
        # request failed even after retries — no point trying more pages
        # (previously this fell through and crashed on r.text)
        utils.print('读取第{}页失败'.format(pageindex))
        return False
    s = BeautifulSoup(r.text, 'lxml')
    if pageindex == 1:
        if not self.__get_total_count(s):
            return False
    tablenode = s.find('table', id='DataGrid1')
    if tablenode is None:
        utils.print('查找表格失败')
        return False
    house_list = []
    for house_node in tablenode.find_all('tr'):
        house_properties = house_node.find_all('td')
        if len(house_properties) < 9:
            continue
        if house_properties[0].text == '项目名称':
            # header row — skip
            continue
        house = orm.OldHouseSource()
        house.project_name = utils.remove_blank_char(
            house_properties[0].text)
        house.serial_num = house_properties[1].text
        house.region = utils.remove_blank_char(house_properties[2].text)
        house.area = house_properties[3].text
        house.use_type = house_properties[4].text
        # NOTE(review): column index 5 is not persisted — confirm this is
        # deliberate against the source table layout
        house.code = house_properties[6].text
        house.agency_info = utils.remove_blank_char(
            house_properties[7].text)
        house.thedate = house_properties[8].text
        house_list.append(house)
    return orm_ope.insert_item_list(house_list)