Beispiel #1
0
    def test_add_info_log_to_list(self):
        """A single ``info`` call stores exactly one entry at INFO level."""
        log = LogHelper()
        log.info(self.INFO_MESSAGE)

        # Separate assertions so a failure reports the offending value
        # instead of a bare "False != True".
        self.assertEqual(len(log.logs), 1)
        self.assertEqual(log.logs[0].log_level, LogHelper.INFO)
Beispiel #2
0
    def test_not_add_logs_that_do_not_comply_with_the_data_types(self):
        """Invalid external input must leave the stored log count unchanged.

        Five entries are logged, then ``add_external_logs`` is called with a
        list of ints/strings and with a plain string; the count must stay 5.
        """
        log = LogHelper()

        log.debug(self.DEBUG_MESSAGE)
        log.info(self.INFO_MESSAGE)
        log.warning(self.WARNING_MESSAGE)
        log.error(self.ERROR_MESSAGE)
        log.fatal(self.FATAL_MESSAGE)

        log.add_external_logs([1, 2, 3, 'uno', 'dos'])
        log.add_external_logs('datos')

        # assertEqual reports the actual count on failure.
        self.assertEqual(len(log.logs), 5)
Beispiel #3
0
    def test_print_in_file_only_info_or_higher_levels(self):
        """Saving with ``log_level=LogHelper.INFO`` omits the debug message.

        The written file must contain the info, warning, error and fatal
        messages but not the debug one.
        """
        log = LogHelper()

        log.debug(self.DEBUG_MESSAGE)
        log.info(self.INFO_MESSAGE)
        log.warning(self.WARNING_MESSAGE)
        log.error(self.ERROR_MESSAGE)
        log.fatal(self.FATAL_MESSAGE)

        log_name = 'test_print_in_file_only_info_or_higher_levels.log'
        self.assertEqual(len(log.logs), 5)
        self.assertEqual(log.save_logs(log_name, log_level=LogHelper.INFO),
                         True)

        # Read the file fully, then assert with the handle already closed.
        with open(f'./{log_name}') as file:
            data = file.read()

        self.assertNotIn(self.DEBUG_MESSAGE, data)
        self.assertIn(self.INFO_MESSAGE, data)
        self.assertIn(self.WARNING_MESSAGE, data)
        self.assertIn(self.ERROR_MESSAGE, data)
        self.assertIn(self.FATAL_MESSAGE, data)
Beispiel #4
0
    def test_add_external_logs(self):
        """Entries of another ``LogHelper`` can be merged and then saved.

        Five local and five external entries are combined; saving with
        ``log_level=LogHelper.DEBUG`` must write all ten messages.
        """
        log = LogHelper()
        external_logs = LogHelper()

        log.debug(self.DEBUG_MESSAGE)
        log.info(self.INFO_MESSAGE)
        log.warning(self.WARNING_MESSAGE)
        log.error(self.ERROR_MESSAGE)
        log.fatal(self.FATAL_MESSAGE)

        external_msg_debug = 'Mensaje de debug externo'
        external_msg_info = 'Mensaje de info externo'
        external_msg_warning = 'Mensaje de warning externo'
        external_msg_error = 'Mensaje de error externo'
        external_msg_fatal = 'Mensaje de fatal externo'

        external_logs.debug(external_msg_debug)
        external_logs.info(external_msg_info)
        external_logs.warning(external_msg_warning)
        external_logs.error(external_msg_error)
        external_logs.fatal(external_msg_fatal)

        log.add_external_logs(external_logs.logs)
        self.assertEqual(len(log.logs), 10)

        log_name = 'test_add_external_logs.log'
        self.assertEqual(log.save_logs(log_name, log_level=LogHelper.DEBUG),
                         True)

        # Read the file fully, then assert with the handle already closed.
        with open(f'./{log_name}') as file:
            data = file.read()

        expected_messages = (
            self.DEBUG_MESSAGE,
            self.INFO_MESSAGE,
            self.WARNING_MESSAGE,
            self.ERROR_MESSAGE,
            self.FATAL_MESSAGE,
            external_msg_debug,
            external_msg_info,
            external_msg_warning,
            external_msg_error,
            external_msg_fatal,
        )
        for message in expected_messages:
            # assertIn names the missing message when the check fails.
            self.assertIn(message, data)
Beispiel #5
0
class AnjukeHouseSpider(object):
    """Crawl Anjuke house listing pages and store listings not seen before.

    Listings are de-duplicated through an MD5 fingerprint; the fingerprints
    already held by the pipeline are loaded once when the spider is created.
    """

    def __init__(self):
        self.pipeline = AnjukeHousePipeline()
        # Fingerprints of the listings the pipeline already holds.
        self.md5_set = self.get_md5()
        self.log = LogHelper(log_file='log/anjuke.log',
                             log_name='anjuke').get_logger()

    def spider(self):
        """Crawl every city in ``ANJUKE_CITY`` in an endless loop.

        For each city the area links inside the ``elems-l`` span are
        collected.  For every sort order in ``SEACHER_SORT`` pages 1-19 of
        an area are fetched and handed to :meth:`parse`; paging for that
        sort order stops as soon as a page yields no new listing.  After a
        full round the spider sleeps 1800 seconds.  Never returns.
        """
        while True:
            # NOTE(review): this sleeps once for 1000 s and then crawls even
            # if it is still late at night -- confirm that is intended.
            if is_late_at_night():
                sleep(1000)
            for city in ANJUKE_CITY:
                url_city = INIT_URL.format(city)
                response = requests.get(url_city,
                                        headers=HEADERS).content.decode('utf8')
                soup = BeautifulSoup(response, 'lxml')
                span = soup.find('span', {'class': 'elems-l'})
                items = span.find_all('a')
                for item in items:
                    # The last two characters of the href are dropped before
                    # the sort/page suffix is appended below.
                    area_url = item.get('href')[0:-2]
                    area = item.get_text()
                    # Do not crawl surrounding cities: only areas whose name
                    # contains '南湖' are kept.
                    if '南湖' not in area:
                        continue
                    for o in SEACHER_SORT:
                        for p in range(1, 20, 1):
                            url = '%s%s-p%s/#filtersort' % (area_url, o, p)
                            rsp = requests.get(
                                url, headers=HEADERS).content.decode('utf8')
                            # The captcha prompt means the crawler has been
                            # blocked; stall for a very long time.
                            if '验证码必须填写' in rsp:
                                self.log.error('已被屏蔽,暂停抓取')
                                sleep(100000000)
                            save_flag = self.parse(rsp, city, area)
                            # Random 3-7 s pause between page requests.
                            sleep_time = round(random.uniform(3, 7), 2)
                            sleep(sleep_time)
                            if not save_flag:
                                self.log.info('***** break circulation ******')
                                break
            self.log.info("run around and sleep 1800s")
            sleep(1800)

    def parse(self, response, city, area):
        """Extract the listings of one result page and save the new ones.

        Listings that lack a title link, a details block, a second details
        span or either price element are skipped rather than aborting the
        page.  Missing tags or broker name become empty strings.

        Args:
            response: Decoded HTML text of a listing page.
            city: City value passed through to :meth:`save`.
            area: Area name passed through to :meth:`save`.

        Returns:
            True if at least one listing of the page was newly saved,
            False otherwise (including pages without any listing).
        """
        soup = BeautifulSoup(response, 'lxml')
        # Whether this page contained a listing that was not stored yet.
        flag = False
        for item in soup.find_all('li', {'class': 'list-item'}):
            title_div = item.find('div', {'class': 'house-title'})
            title_link = title_div.find('a') if title_div is not None else None
            details_item_list = item.find_all('div', {'class': 'details-item'})
            price_det = item.find('span', {'class': 'price-det'})
            unit_price = item.find('span', {'class': 'unit-price'})
            if (title_link is None or price_det is None
                    or unit_price is None or len(details_item_list) < 2):
                continue

            detail_second = details_item_list[1].find('span')
            if detail_second is None:
                continue

            title = title_link.get_text().strip()
            url = title_link.get('href')

            # First details row: up to four spans; absent ones map to ''.
            first_texts = [
                node.get_text()
                for node in details_item_list[0].find_all('span')[:4]
            ]
            first_texts.extend([''] * (4 - len(first_texts)))
            house_type, building_area, floor, building_time = first_texts
            building_area = building_area.replace('m²', '')

            # Second details row: community on the first line, address on
            # the second (if present).
            dst_list = detail_second.get_text().strip().split('\n')
            community = dst_list[0].strip()
            address = dst_list[1].strip() if len(dst_list) > 1 else ''

            # A dict is required here; a set literal would be interpreted
            # by Beautiful Soup as a list of alternative class names.
            tags_bottom = item.find('div', {'class': 'tags-bottom'})
            tags_span = (tags_bottom.find_all('span')
                         if tags_bottom is not None else [])
            advantage = '|'.join(ts.get_text() for ts in tags_span)

            broker = item.find('span', {'class': 'broker-name broker-text'})
            salesman = broker.get_text() if broker is not None else ''

            total_price = price_det.get_text().replace('万', '')
            avg_price = unit_price.get_text().replace('元/m²', '')

            # Despite the name, the fingerprint is derived from listing
            # fields, not from the URL.
            md5_str = '%s%s%s%s' % (community, house_type, building_area,
                                    total_price)
            url_md5 = hashlib.md5(md5_str.encode("utf8")).hexdigest()

            if self.save(total_price, avg_price, title, house_type,
                         building_area, floor, building_time, community,
                         city, area, address, advantage, salesman, url,
                         url_md5):
                flag = True
        return flag

    def save(self, total_price, avg_price, title, house_type, building_area,
             floor, building_time, community, city, area, address, advantage,
             salesman, url, url_md5):
        """Insert a listing through the pipeline unless it is already known.

        Args:
            url_md5: Fingerprint used for de-duplication against
                ``self.md5_set``; all other arguments are forwarded to
                ``self.pipeline.insert`` unchanged.

        Returns:
            True if the listing was inserted, False if its fingerprint was
            already in ``self.md5_set``.
        """
        if url_md5 in self.md5_set:
            return False
        self.pipeline.insert(total_price, avg_price, title, house_type,
                             building_area, floor, building_time,
                             community, city, area, address, advantage,
                             salesman, url, url_md5)
        self.md5_set.add(url_md5)
        return True

    def get_md5(self):
        """Return the set of ``url_md5`` values reported by the pipeline."""
        return {item['url_md5'] for item in self.pipeline.query_md5()}