Example #1
    def __extract_by_type(self, node, area_name):
        '''
        Extract all rows grouped by house type.
        :param node:
        :param area_name:
        :return:
        '''
        utils.print('Extracting data grouped by house type...')
        table = node.find('table')
        if table is None:
            utils.print('No data grouped by house type found')
            return []

        row_node_list = table.find_all('tr')
        house_list = []
        # The first row is the table header, so skip it.
        for row in row_node_list[1:]:
            columns = row.find_all('td')
            if len(columns) < 6:
                continue
            house = {}
            house['region'] = area_name
            house['house_type'] = columns[0].text
            house['deal_count'] = columns[1].text
            house['area'] = utils.get_num(columns[2].text)
            house['price'] = utils.get_num(columns[3].text)
            house['availableforsalecount'] = utils.get_num(columns[4].text)
            house['availableforsalearea'] = utils.get_num(columns[5].text)
            house_list.append(house)
        return house_list
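
Nearly every example leans on a utils.get_num helper that this page does not show. A minimal sketch, assuming it extracts the first numeric value from a table cell's text (name and behavior are inferred from usage, not taken from the project's source):

import re


def get_num(text):
    # Hypothetical reconstruction of utils.get_num: pull the first number
    # out of a string such as '123.45万平方米' -> 123.45, returning 0 when
    # the cell contains no number. The real helper may behave differently.
    match = re.search(r'\d+(?:\.\d+)?', text)
    return float(match.group()) if match else 0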
Example #2
    @classmethod
    def decode(cls, page_node):
        '''
        Target table columns:
        id serial not null,
        building_name character varying(255), -- which building
        branch character varying(10),         -- block number
        house_type character varying(255),
        contact_code character varying(255),
        price double precision,
        floor integer,
        room_num character varying(50),
        usage character varying(50),
        build_area double precision,
        inside_area double precision,
        share_area double precision,
        :param page_node:
        :return:
        '''
        tr_nodes = page_node.find_all('tr', class_='a1')
        house_info = {}
        try:
            for tr_node in tr_nodes:
                temp_house = cls.__decode_one_row(tr_node)
                house_info.update(temp_house)
            return house_info
        except Exception as e:
            utils.print('Error while decoding the house page, error: {}'.format(str(e)))
            return None
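
The docstring above is essentially the column list of the target table. Written out as DDL it would read roughly as below; the table name is hypothetical, since the snippet never names it:

-- Hypothetical reconstruction; the column list is taken verbatim from the
-- docstring above, but the table name is a guess.
CREATE TABLE new_house_source (
    id serial NOT NULL,
    building_name character varying(255), -- which building
    branch character varying(10),         -- block number
    house_type character varying(255),
    contact_code character varying(255),
    price double precision,
    floor integer,
    room_num character varying(50),
    usage character varying(50),
    build_area double precision,
    inside_area double precision,
    share_area double precision
);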
Example #3

    def __extract_by_type(self, node, area_name):
        '''
        Extract all rows grouped by house type.
        :param node:
        :param area_name:
        :return:
        '''
        utils.print('Extracting data grouped by house type...')
        table = node.find('table')
        if table is None:
            utils.print('No data grouped by house type found')
            return []

        row_node_list = table.find_all('tr')
        house_list = []
        # The first row is the table header, so skip it.
        for row in row_node_list[1:]:
            columns = row.find_all('td')
            if len(columns) < 6:
                continue
            house = orm.NewHouseByType()
            house.thedate = dt.now()
            house.region = area_name
            house.house_type = columns[0].text
            house.deal_count = columns[1].text
            house.area = utils.get_num(columns[2].text)
            house.price = utils.get_num(columns[3].text)
            house.availableforsalecount = utils.get_num(columns[4].text)
            house.availableforsalearea = utils.get_num(columns[5].text)
            house_list.append(house)
        return house_list
Example #4

    def __extract_by_use(self, node, area_name):
        utils.print('Extracting data grouped by usage...')
        table = node.find('table')
        if table is None:
            utils.print('No data grouped by usage found')
            return []

        row_node_list = table.find_all('tr')
        house_list = []
        # The first row is the table header, so skip it.
        for row in row_node_list[1:]:
            columns = row.find_all('td')
            if len(columns) < 3:
                continue
            house = {}
            house['region'] = area_name
            house['use_type'] = columns[0].text
            house['deal_count'] = utils.get_num(columns[1].text)
            house['area'] = utils.get_num(columns[2].text)
            house_list.append(house)
        return house_list
Example #5
    def crawl(self):
        project = NewHouseSourceDao.get_one_project()
        if project is None:
            utils.print('No project left to crawl...')
            return False
        self.__crawl_project_detail(project)
        return True
Example #6
    def __extract_by_use(self, node, area_name):
        utils.print('Extracting data grouped by usage...')
        table = node.find('table')
        if table is None:
            utils.print('No data grouped by usage found')
            return []

        row_node_list = table.find_all('tr')
        house_list = []
        # The first row is the table header, so skip it.
        for row in row_node_list[1:]:
            columns = row.find_all('td')
            if len(columns) < 3:
                continue
            house = orm.OldHouseByUse()
            house.thedate = dt.now()
            house.region = area_name
            house.use_type = columns[0].text
            house.deal_count = utils.get_num(columns[1].text)
            house.area = utils.get_num(columns[2].text)
            house_list.append(house)
        return house_list
Example #7
    def __extract_by_area(self, node, area_name):
        '''
        Extract rows grouped by floor-area range.
        :param node:
        :param area_name:
        :return:
        '''
        utils.print('Extracting data grouped by floor area...')
        table = node.find('table')
        if table is None:
            utils.print('No data grouped by floor area found')
            return []

        row_node_list = table.find_all('tr')
        house_list = []
        # The first row is the table header, so skip it.
        for row in row_node_list[1:]:
            columns = row.find_all('td')
            if len(columns) < 5:
                continue
            house = {}
            house['region'] = area_name
            house['area_level'] = columns[0].text
            house['deal_count'] = columns[1].text
            house['area'] = utils.get_num(columns[2].text)
            house['price'] = utils.get_num(columns[3].text)
            house['total_price'] = utils.get_num(columns[4].text)
            house_list.append(house)
        return house_list
Example #8

    def __extract_by_area(self, node, area_name):
        '''
        Extract rows grouped by floor-area range.
        :param node:
        :param area_name:
        :return:
        '''
        utils.print('Extracting data grouped by floor area...')
        table = node.find('table')
        if table is None:
            utils.print('No data grouped by floor area found')
            return []

        row_node_list = table.find_all('tr')
        house_list = []
        # The first row is the table header, so skip it.
        for row in row_node_list[1:]:
            columns = row.find_all('td')
            if len(columns) < 5:
                continue
            house = orm.NewHouseByArea()
            house.thedate = dt.now()
            house.region = area_name
            house.area_level = columns[0].text
            house.deal_count = columns[1].text
            house.area = utils.get_num(columns[2].text)
            house.price = utils.get_num(columns[3].text)
            house.total_price = utils.get_num(columns[4].text)
            house_list.append(house)
        return house_list
Example #9

def job1():
    utils.print('job1')
    counter = 0
    # Test job: tick once per second for about two minutes.
    while True:
        time.sleep(1)
        counter += 1
        if counter > 120:
            break
Example #10
    def crawl(self):
        '''
        url=
        Three tables store the results: grouped by house type, by floor area,
        and by usage. For each grouping, the data is crawled for seven
        regions: the whole city, Nanshan, Futian, Luohu, Yantian, Bao'an
        and Longgang.
        :return:
        '''
        utils.print('--- Start crawling Shenzhen new-house transaction data ---')
        for area in self.areas:
            self.__query_one_area(area)
Example #11

    @classmethod
    def decode_and_write(cls, page_node, project_info):
        project_info = cls.__decode(page_node, project_info)
        # Don't check the write result directly; instead treat the write as
        # successful if the project id can be read back afterwards.
        NewHouseSourceDao.write_project(project_info)
        project_id = NewHouseSourceDao.get_project_id(project_info)

        if project_id == 0:
            utils.print('Failed to get project id, {}'.format(project_info['project_name']))
            return False
        project_info['id'] = project_id
        return True
Example #12

    def crawl(self):
        utils.print('Start crawling second-hand listing data...')
        pageindex = self.__page_index
        while True:
            try:
                if not self.__crawl_one_page(pageindex):
                    break
            except Exception as e:
                utils.print('Failed to crawl page {}, {}'.format(pageindex, str(e)))
                # Stop here rather than retrying the same page forever.
                break

            if self.__total_count < self.__page_size * (pageindex - 1):
                break
            pageindex += 1
Example #13
    @classmethod
    def __decode_house(cls, house_node, branch_name):
        '''
        :param house_node:
        :param branch_name:
        :return:
        '''
        div_nodes = house_node.find_all('div')
        if len(div_nodes) != 2:
            utils.print('Failed to get house info: {}, {}'.format(branch_name,
                                                                  house_node.text))
            return None
        house = {}
        house['branch'] = branch_name
        house['room_num'] = utils.remove_blank_char(div_nodes[0].text)
        href_node = div_nodes[1].find('a')
        if href_node is None:
            utils.print('Failed to get the house detail link, {}, {}'.format(branch_name,
                                                                             house_node.text))
            return None

        url = '{}{}'.format(cls.__url, href_node['href'])
        utils.print('Reading info for house {} {} {} {}...'.format(cls.__project_name,
                                                                   cls.__building_name,
                                                                   branch_name,
                                                                   house['room_num']))
        r = utils.request_with_retry(url)
        if r is None:
            utils.print('Failed to read the page for house {}'.format(house['room_num']))
            return None

        html_node = BeautifulSoup(r.text, 'lxml')
        return NewHSrcHousePageDecoder.decode(html_node)
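
Most examples fetch pages through utils.request_with_retry, which this page also does not show. A minimal sketch, assuming it issues a GET (or a POST when form data is given) and returns None once its retries are exhausted:

import time

import requests


def request_with_retry(url, form_data=None, retries=3):
    # Hypothetical reconstruction of utils.request_with_retry; the real
    # helper's retry policy and timeouts are unknown.
    for _ in range(retries):
        try:
            if form_data is None:
                r = requests.get(url, timeout=30)
            else:
                r = requests.post(url, data=form_data, timeout=30)
            if r.status_code == 200:
                return r
        except requests.RequestException:
            time.sleep(1)
    return None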
Example #14

    def __crawl_one_page_project(self, page_index):
        r = None
        utils.print('Reading page {} of the project list...'.format(page_index))
        if page_index == 1:
            r = utils.request_with_retry(self.__url)
        else:
            r = utils.request_with_retry('{}index.aspx'.format(self.__url), self.__create_form_data(page_index))
        if r is None:
            utils.print('Failed to read the project page, page_index = {}'.format(page_index))
            return
        html_node = BeautifulSoup(r.text, 'lxml')
        self.extract_formdata_from_newpage(html_node)
        if page_index == 1:
            self.__get_total_count(html_node)

        project_nodes = self.__get_project_nodes(html_node)
        project_list = []
        for project_node in project_nodes:
            project = self.__convert_project_node_to_project(project_node)
            if project is None:
                continue
            project['is_crawled'] = False
            project_list.append(project)
        utils.print('Decoded {} project records'.format(len(project_list)))
        written_count = NewHouseSourceDao.write_project_summary_list(project_list)
        utils.print('Wrote {} records to the database'.format(written_count))
Example #15

    def __get_total_count(self, node):
        '''
        Extract the total record count from the first page.
        :param node:
        :return:
        '''
        spans = node.find_all('span', class_='a1')
        if len(spans) != 2:
            utils.print('Failed to find the total record count')
            return False
        nums = re.findall(r'\d+', spans[1].text)
        if len(nums) != 1:
            utils.print('Failed to extract the total record count from string {}'.format(spans[1].text))
            return False
        self.__total_count = int(nums[0])
        return True
Example #16

    def run(self):
        utils.print('Reading the project list...')
        r = utils.request_with_retry(self.__url)
        if r is None:
            utils.print('Failed to read the project page...')
            return
        html_node = BeautifulSoup(r.text, 'lxml')
        project_nodes = self.__get_project_nodes(html_node)
        for project_node in project_nodes:
            project = self.__convert_project_node_to_project(project_node)
            if project is None:
                continue
            project['is_crawled'] = False
            written_count = NewHouseSourceDao.write_project_summary(project)
            if written_count > 0:
                MailSender.send_alarm_message('New Shenzhen property project granted a presale permit',
                                              str(project))
Example #17

    def __get_project_nodes(self, node):
        '''Find all presale-project rows in the html.'''
        table_node = node.find('table', id='DataList1')
        if table_node is None:
            utils.print('Failed to get the project list table...')
            return []
        sub_table_node = table_node.find('table')
        if sub_table_node is None:
            utils.print('Failed to get the nested project list table...')
            return []
        project_nodes = sub_table_node.find_all('tr')
        if len(project_nodes) < 3:
            return []

        # The first two rows are the header and a blank row.
        del project_nodes[0]
        del project_nodes[0]
        return project_nodes
Example #18
    def __crawl_one_page(self, pageindex):
        '''
        Crawl one page of house listings.
        :param pageindex:
        :return: whether to continue with the next page; if the current page
                 failed, or nothing from it was written, there is no point in
                 fetching further pages
        '''
        utils.print('Crawling page {}...'.format(pageindex))
        url = self.__url.format(pageindex)
        r = utils.request_with_retry(url)
        if r is None:
            utils.print('Failed to read page {}'.format(pageindex))
            return False
        s = BeautifulSoup(r.text, 'lxml')
        if pageindex == 1:
            if not self.__get_total_count(s):
                return False

        tablenode = s.find('table', id='DataGrid1')
        if tablenode is None:
            utils.print('Failed to find the data table')
            return False
        house_list = []

        house_nodes = tablenode.find_all('tr')
        for house_node in house_nodes:
            house_properties = house_node.find_all('td')
            if len(house_properties) < 9:
                continue
            # Skip the header row; its first cell reads '项目名称' ("project name").
            if house_properties[0].text == '项目名称':
                continue
            house = orm.OldHouseSource()
            # columns = ['thedate', 'region', 'serial_num', 'project_name', 'area', 'use_type', 'code', 'agency_info']
            house.project_name = utils.remove_blank_char(
                house_properties[0].text)
            house.serial_num = house_properties[1].text
            house.region = utils.remove_blank_char(house_properties[2].text)
            house.area = house_properties[3].text
            house.use_type = house_properties[4].text
            house.code = house_properties[6].text
            house.agency_info = utils.remove_blank_char(
                house_properties[7].text)
            house.thedate = house_properties[8].text
            house_list.append(house)

        return orm_ope.insert_item_list(house_list)
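
orm_ope.insert_item_list, used as the return value above, is not shown on this page either. A sketch of what such a helper could look like, assuming the orm module is SQLAlchemy-based and the database is PostgreSQL (both inferred, not confirmed, from the model classes and the column types in Example #2):

# Hypothetical sketch of orm_ope.insert_item_list; the real helper and its
# engine configuration are not shown in these examples.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('postgresql://user:pass@localhost/housedb')  # placeholder DSN
Session = sessionmaker(bind=engine)


def insert_item_list(item_list):
    '''Insert a list of ORM objects in one transaction; return the count written.'''
    if not item_list:
        return 0
    session = Session()
    try:
        session.add_all(item_list)
        session.commit()
        return len(item_list)
    except Exception:
        session.rollback()
        return 0
    finally:
        session.close()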
Example #19

    def __query_one_area(self, area_name):
        '''
        Each area needs different parameters in the form data, and the
        responses differ as well: their first and last lines are not valid
        html, so the parsing has to tolerate that.
        :param area_name:
        :return:
        '''
        utils.print('query {} info...'.format(area_name))
        r = None
        if area_name == '全市':  # '全市' = the whole city
            r = utils.request_with_retry(self.__url)
        else:
            formdata = self.areas[area_name]
            self.form_data[
                'ctl00$ContentPlaceHolder1$scriptManager1'] = formdata[
                    'ctl00$ContentPlaceHolder1$scriptManager1']
            self.form_data['__EVENTTARGET'] = formdata['__EVENTTARGET']
            r = utils.request_with_retry(self.__url, self.form_data)

        if r is None:
            utils.print('Failed to query {} info'.format(area_name))
            return
        s = BeautifulSoup(r.text, 'lxml')
        self.extract_formdata_from_newpage(s)
        self.__extract_info_from_page_into_db(s, area_name)
Example #20
    @classmethod
    def __read_mailer_sender_info(cls):
        mail_info = {}
        config = configparser.ConfigParser()
        try:
            # os.path.join (needs 'import os') avoids the unescaped
            # backslash in the original '{}\config.ini' Windows-only path.
            read_ok = config.read(os.path.join(sys.path[0], 'config.ini'),
                                  encoding='utf-8-sig')
            # config.read returns the list of files successfully read.
            print(str(read_ok))
            section = 'mail'
            mail_info['host'] = config.get(section, 'host')
            mail_info['user'] = config.get(section, 'user')
            mail_info['pass'] = config.get(section, 'pass')
            mail_info['sender'] = config.get(section, 'sender')
            mail_info['receivers'] = config.get(section, 'receivers')
            if not all(mail_info.values()):
                utils.print('Failed to read the mail config; content: {}'.format(str(mail_info)))
                return None
            return mail_info

        except Exception as e:
            print('Error reading the config file.')
            print(e)
            return None
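
The reader above expects a [mail] section in a config.ini next to the script. A sample with placeholder values (the key names come from the code, the values do not):

[mail]
host = smtp.example.com
user = crawler@example.com
pass = your-password
sender = crawler@example.com
receivers = you@example.com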
Example #21
    def query_every_day_data(self):
        try:
            utils.print('--------------- polling started -------------------')
            new_deal = NewHouseDealInfoCrawler()
            new_deal.crawl()
            old_deal = OldHouseDealInfoCrawler()
            old_deal.crawl()
            old_source = OldHouseSourceCrawler()
            old_source.crawl()
            utils.print('--------------- polling finished -------------------')
            print('')
        except Exception as e:
            utils.print('Unexpected error while polling, {}'.format(str(e)))
            traceback.print_exc()
Example #22

def job2():
    utils.print('job2')
Example #23
    def __crawl_project_detail(self, project_info):
        '''
        Fetch the detail page of the given project and write it to the
        database.
        :param project_info: the project summary taken from the list page
        :return:
        '''
        utils.print('Reading the page of project {}'.format(project_info['project_name']))
        r = utils.request_with_retry(project_info['url'])
        if r is None:
            utils.print('Failed to read the page of project: {}...'.format(
                project_info['project_name']))
            return False

        s = BeautifulSoup(r.text, 'lxml')
        if not NewHSrcPrjPageDecoder.decode_and_write(s, project_info):
            return False

        for building in project_info['building_list']:
            try:
                utils.print('Reading building {} of project {}...'.format(
                    building['building_name'], project_info['project_name']))
                building['project_id'] = project_info['id']
                building['is_crawled'] = False
                if NewHouseSourceDao.is_building_crawled(building) > 0:
                    continue

                r = utils.request_with_retry(building['url'])
                if r is None:
                    utils.print('Failed to read the page of building {} in project {}.'.format(
                        building['building_name'],
                        project_info['project_name']))
                    continue

                html_node = BeautifulSoup(r.text, 'lxml')
                house_list = NewHSrcBldPageDecoder.decode(
                    html_node, building['building_name'],
                    project_info['project_name'])

                if NewHouseSourceDao.write_newhouse_building(building) == 0:
                    continue
                building_id = NewHouseSourceDao.get_building_id(building)
                if building_id == 0:
                    utils.print('Failed to get building id, {}, {}'.format(
                        project_info['project_name'],
                        building['building_name']))
                    continue
                for house in house_list:
                    house['building_id'] = building_id

                NewHouseSourceDao.write_houselist(house_list)
                NewHouseSourceDao.update_building_state_to_crawled(building_id)
            except Exception as e:
                utils.print('Failed to crawl building {}...'.format(building['building_name']))
                utils.print(str(e))

        NewHouseSourceDao.update_project_state_to_crawled(
            project_info['presale_license_num'])
Example #24
    def crawl_new_house_source_projects(self):
        if NewHouseSourceDao.get_one_project(None) is not None:
            utils.print('Listing data has already been crawled...')
            return
        new_house_source_crawler = NewHSrcProjectCrawler()
        new_house_source_crawler.crawl()
Example #25
    @classmethod
    def crawl_new_house_source_projects(cls):
        if NewHouseSourceDao.get_one_project(None) is not None:
            utils.print('Listing data has already been crawled...')
            return
        new_house_source_crawler = NewHSrcProjectCrawler()
        new_house_source_crawler.crawl()

    @classmethod
    def query_and_mail_new_house_info(cls):
        monitor = NewHouseMonitor()
        monitor.run()


utils.print('----------- program started --------------')

new_house_detail_querier = NewHouseDetailQuerier()


def schedule_task():
    # Check every second whether a new listing is waiting to be crawled.
    schedule.every().second.do(new_house_detail_querier.query_one_project)
    # Check every hour whether a new project has been granted a presale permit.
    schedule.every().hour.do(
        ShenzhenHouseCrawler.query_and_mail_new_house_info)
    # At 12:00 every day, crawl that day's new-house and second-hand
    # transaction data.
    schedule.every().day.at('12:00').do(
        ShenzhenHouseCrawler.query_every_day_data)