Example #1
    def __init__(self, config: typing.Dict[str, typing.Any]) -> None:
        super().__init__()
        self.config = config
        self.session_factory = create_session_factory(config)
        self.s3_client = S3Client(config)
        self.slack_client = SlackClient(
            config.get("SLACK_CHANNEL"), config.get("SLACK_API_TOKEN")
        )
        self.region_level_1 = self.config["REGION_REGEX_LEVEL_1"]
        self.region_level_2 = self.config["REGION_REGEX_LEVEL_2"]
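
This constructor (and the variants in the following examples) takes a single flat config mapping. As a point of reference, a minimal sketch of such a dict, limited to keys that actually appear in the snippet; the values are illustrative placeholders, and whatever keys create_session_factory() and S3Client() read are not shown.

config = {
    "SLACK_CHANNEL": "#crawler-alerts",    # illustrative value
    "SLACK_API_TOKEN": "xoxb-...",         # illustrative value
    "REGION_REGEX_LEVEL_1": "서울",         # sido filter, used with re.search()
    "REGION_REGEX_LEVEL_2": ".*",          # gugun filter
    # keys consumed by create_session_factory() and S3Client() are omitted here
}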
Example #2
    def __init__(self, config: typing.Dict[str, typing.Any]) -> None:
        super().__init__()
        self.config = config
        self.session_factory = create_session_factory(config)
        self.s3_client = S3Client(config)
        self.slack_client = SlackClient(
            config.get("SLACK_CHANNEL"), config.get("SLACK_API_TOKEN")
        )
        self.storing_date: datetime.datetime = tznow(
            pytz.timezone("Asia/Seoul")
        )
        self.region_level_1 = self.config["REGION_REGEX_LEVEL_1"]
        self.region_level_2 = self.config["REGION_REGEX_LEVEL_2"]
        self.region_level_3 = self.config["REGION_REGEX_LEVEL_3"]
        self.completed_sido_ids: typing.Dict[str, int] = dict()
        self.completed_gugun_ids: typing.Dict[str, int] = dict()
Example #3
    def __init__(
        self,
        config: typing.Dict[str, typing.Any],
    ):
        super().__init__()
        self.config = config

        self.slack_client = SlackClient(config.get("SLACK_CHANNEL"),
                                        config.get("SLACK_API_TOKEN"))
        self.info_care_client = InfocareClient(config)
        self.s3_client = S3Client(config)
        self.total_statistics = CrawlerStatistics()
        self.failure_statistics = CrawlerStatistics()
        self.crawling_date: datetime.datetime = tznow(
            pytz.timezone("Asia/Seoul"))
        self.crawling_start_time: str = str(timestamp(self.crawling_date))
Example #4
    def __init__(self, config: typing.Dict[str, typing.Any]) -> None:
        super().__init__()
        self.config = config
        self.session_factory = create_session_factory(config)
        self.s3_client = S3Client(config)
        self.slack_client = SlackClient(
            config.get("SLACK_CHANNEL"), config.get("SLACK_API_TOKEN")
        )
        self.region_level_1 = self.config["REGION_REGEX_LEVEL_1"]
        self.region_level_2 = self.config["REGION_REGEX_LEVEL_2"]
        self.region_level_3 = self.config["REGION_REGEX_LEVEL_3"]
        self.completed_sido_statistics: typing.Dict[
            str, typing.List[int]] = dict()
        self.completed_gugun_statistics: typing.Dict[
            str, typing.List[int]] = dict()
        self.completed_sido_ids: typing.Dict[str, int] = dict()
        self.completed_gugun_ids: typing.Dict[str, int] = dict()
Example #5
    def __init__(
        self,
        config: typing.Dict[str, typing.Any],
    ):
        super().__init__()
        self.config = config
        self.slack_client = SlackClient(
            config.get("SLACK_CHANNEL"), config.get("SLACK_API_TOKEN")
        )
        self.taein_client = TaeinClient(
            client_delay=self.config['CLIENT_DELAY'],
            proxy=random.choice(self.config['PROXY_HOST_LIST']))
        self.s3_client = S3Client(config)
        self.total_statistics = CrawlerStatistics()
        self.failure_statistics = CrawlerStatistics()
        self.crawling_date: datetime.datetime = tznow(
            pytz.timezone("Asia/Seoul"))
        self.crawling_start_time: str = str(
            timestamp(tznow(pytz.timezone("Asia/Seoul"))))
Example #6
    def __init__(
        self,
        config: typing.Dict[str, typing.Any],
    ) -> None:
        super().__init__()
        self.config = config
        self.slack_client = SlackClient(
            config.get("SLACK_CHANNEL"), config.get("SLACK_API_TOKEN")
        )
        self.nsdi_client = NsdiClient(config)
        self.s3_client = S3Client(config)
        self.region_land_use_dict: typing.Dict[str, str] = dict()
        self.region_land_feature_dict: typing.Dict[str, str] = dict()
        self.total_statistics = CrawlerStatistics()
        self.failure_statistics = CrawlerStatistics()
        self.crawling_date: datetime.datetime = tznow(
            pytz.timezone("Asia/Seoul")
        )
        self.crawling_start_time: str = str(
            timestamp(tznow(pytz.timezone("Asia/Seoul")))
        )
Example #7
class TaeinCrawler(object):
    def __init__(
        self,
        config: typing.Dict[str, typing.Any],
    ):
        super().__init__()
        self.config = config
        self.slack_client = SlackClient(config.get("SLACK_CHANNEL"),
                                        config.get("SLACK_API_TOKEN"))
        self.taein_client = TaeinClient(
            client_delay=self.config['CLIENT_DELAY'],
            proxy=random.choice(self.config['PROXY_HOST_LIST']))
        self.s3_client = S3Client(config)
        self.total_statistics = CrawlerStatistics()
        self.failure_statistics = CrawlerStatistics()
        self.crawling_date: datetime.datetime = tznow(
            pytz.timezone("Asia/Seoul"))
        self.crawling_start_time: str = str(
            timestamp(tznow(pytz.timezone("Asia/Seoul"))))

    def run(self, run_by: str) -> None:
        self.slack_client.send_info_slack(
            f"TIME_STAMP: {self.crawling_start_time}\n"
            f"크롤링 시작합니다 "
            f"({self.config['ENVIRONMENT']}, {run_by})")

        self.crawl()
        self.upload_crawler_log_to_s3(run_by)

        statistics = slack_failure_percentage_statistics(
            self.total_statistics, self.failure_statistics)

        self.slack_client.send_info_slack(
            f"크롤링 완료\n"
            f"TIME_STAMP: {self.crawling_start_time}\n\n"
            f"statistics:\n"
            f"statistics_count\n{statistics['statistics_count']}\n\n"
            f"bids_count\n{statistics['bids_count']}")

    def crawl(self) -> None:
        login_id = self.config["LOGIN_ID"]
        login_pw = self.config["LOGIN_PW"]

        self.taein_client.fetch_main_page()
        self.taein_client.fetch_login_page()
        self.taein_client.login(login_id, login_pw)

        try:
            logger.info("Crawling region list")

            region = self.taein_client.fetch_region_list()
            if not region:
                raise TaeinCrawlerNotFoundError("not found region list")

            logger.info("Crawling mulgun kind")
            self.crawl_mulgun_kind(region)
        except Exception as e:
            raise e
        finally:
            self.taein_client.logout()

    def crawl_mulgun_kind(self, region: TaeinRegion) -> None:
        end_date_format = self.crawling_date.strftime("%Y-%m-%d")
        end_date = datetime.datetime.strptime(end_date_format, "%Y-%m-%d")
        start_date = end_date - relativedelta(months=1)

        mulgun = self.taein_client.fetch_mulgun_kind_list(start_date, end_date)

        if not mulgun:
            raise TaeinCrawlerNotFoundError("not found mulgun list")

        for mulgun_text, mulgun_value in mulgun.mulgun_kind_dict.items():
            if re.search(f"{self.config['MULGUN_KIND']}$", mulgun_text):
                self.crawl_sido_region(region, mulgun_text, mulgun_value)

    def crawl_sido_region(self, region: TaeinRegion, mulgun_text: str,
                          mulgun_value: str) -> None:
        sido_list = region.taein_sido_list

        for sido in sido_list:
            sido_name = sido.sido_name
            if re.search(self.config["REGION_REGEX_LEVEL_1"], sido_name):
                gugun = sido.taein_gugun
                self.crawl_gugun_region(sido_name, gugun, mulgun_text,
                                        mulgun_value)

    def crawl_gugun_region(
        self,
        sido_name: str,
        gugun: TaeinGugun,
        mulgun_text: str,
        mulgun_value: str,
    ) -> None:
        gugun_name = gugun.gugun_name
        if re.search(self.config["REGION_REGEX_LEVEL_2"], gugun_name):
            self.taein_client = TaeinClient(
                client_delay=self.config['CLIENT_DELAY'],
                proxy=random.choice(self.config['PROXY_HOST_LIST']))
            self.taein_client.login(self.config["LOGIN_ID"],
                                    self.config["LOGIN_PW"])
            dong_list = gugun.dong_list
            self.crawl_dong_region(
                sido_name,
                gugun_name,
                dong_list,
                mulgun_text,
                mulgun_value,
            )
            self.taein_client.logout()
            time.sleep(60)

    def crawl_dong_region(
        self,
        sido_name: str,
        gugun_name: str,
        dong_list: typing.List[str],
        mulgun_text: str,
        mulgun_value: str,
    ) -> None:
        for dong_name in dong_list:
            if re.search(self.config["REGION_REGEX_LEVEL_3"], dong_name):
                area_step = self.config["BUILDING_AREA_STEP"]
                area_start = self.config["BUILDING_AREA_START"]
                area_end = self.config["BUILDING_AREA_END"]
                for area_range in range(area_start, area_end + area_step,
                                        area_step):
                    start_area = "최소" if area_range == 0 else area_range
                    end_area = "최대" if area_range == 400 else area_range + 20
                    try:
                        self.crawl_statistics_page(
                            sido_name,
                            gugun_name,
                            dong_name,
                            start_area,
                            end_area,
                            mulgun_text,
                            mulgun_value,
                        )
                    except Exception as e:
                        self.failure_statistics.statistics_count += 1
                        raise e

                self.crawl_bid_page(
                    sido_name,
                    gugun_name,
                    dong_name,
                    mulgun_text,
                    mulgun_value,
                )

    def crawl_statistics_page(
        self,
        sido_name: str,
        gugun_name: str,
        dong_name: str,
        start_area: int,
        end_area: int,
        mulgun_text: str,
        mulgun_value: str,
    ) -> None:
        end_date_format = self.crawling_date.strftime("%Y-%m-%d")
        end_date = datetime.datetime.strptime(end_date_format, "%Y-%m-%d")
        start_date = end_date - relativedelta(months=1)

        statistics_response = self.taein_client.fetch_statistics_page(
            sido_name,
            gugun_name,
            dong_name,
            str(start_area),
            str(end_area),
            start_date,
            end_date,
            mulgun_value,
        )

        if not statistics_response:
            raise TaeinCrawlerNotFoundError("not found statistics response")

        logger.info("Crawling statistics page",
                    sido=sido_name,
                    gugun=gugun_name,
                    dong=dong_name,
                    start_area=start_area,
                    end_area=end_area,
                    mulgun_text=mulgun_text,
                    proxy=self.taein_client.session.proxies)

        if not statistics_response.dong_statistics_exist:
            logger.info("Not exist dong statistiscs")
            return

        self.total_statistics.statistics_count += 1

        data = statistics_response.raw_data

        file_name = (f"{sido_name}_"
                     f"{gugun_name}_"
                     f"{dong_name}_"
                     f"{mulgun_text}_"
                     f"{start_area}_"
                     f"{end_area}_"
                     f"statistics.html")

        with TempDir() as temp_dir:
            temp_path = str(temp_dir) + "\\"
            file_path = temp_path + file_name
            write_file(file_path, data)
            self.upload_page_to_s3(
                sido_name,
                gugun_name,
                dong_name,
                mulgun_text,
                file_path,
                file_name,
                "statistics",
            )

    def crawl_bid_page(
        self,
        sido_name: str,
        gugun_name: str,
        dong_name: str,
        mulgun_text: str,
        mulgun_value: str,
    ) -> None:
        end_date_format = self.crawling_date.strftime("%Y-%m-%d")
        end_date = datetime.datetime.strptime(end_date_format, "%Y-%m-%d")
        start_date = end_date - relativedelta(months=1)

        statistics_response = self.taein_client.fetch_statistics_page(
            sido_name, gugun_name, dong_name, "최소", "최대", start_date, end_date,
            mulgun_value)

        if not statistics_response:
            raise TaeinCrawlerNotFoundError("not found statistics response")

        bid_count = statistics_response.bid_count
        total_page = statistics_response.bid_total_page

        if bid_count > 0:
            try:
                for index in range(1, total_page + 1):
                    bid_response = self.taein_client.fetch_bid_list_page(
                        sido_name, gugun_name, dong_name, start_date, end_date,
                        mulgun_text, bid_count, index)

                    if not bid_response:
                        raise TaeinCrawlerNotFoundError(
                            "not found bid response")

                    self.total_statistics.bids_count += 1

                    data = bid_response.raw_data

                    logger.info("Crawling bid page",
                                sido=sido_name,
                                gugun=gugun_name,
                                dong=dong_name,
                                mulgun_text=mulgun_text,
                                page_index=index,
                                proxy=self.taein_client.session.proxies)

                    file_name = (f"{sido_name}_"
                                 f"{gugun_name}_"
                                 f"{dong_name}_"
                                 f"{mulgun_text}_"
                                 f"bid_{index}.html")

                    with TempDir() as temp_dir:
                        temp_path = str(temp_dir) + "\\"
                        file_path = temp_path + file_name
                        write_file(file_path, data)
                        self.upload_page_to_s3(
                            sido_name,
                            gugun_name,
                            dong_name,
                            mulgun_text,
                            file_path,
                            file_name,
                            "bid",
                        )
            except Exception as e:
                self.failure_statistics.bids_count += 1
                raise e

    def upload_page_to_s3(
        self,
        sido_name: str,
        gugun_name: str,
        dong_name: str,
        mulgun_text: str,
        file_path: str,
        file_name: str,
        data_type: str,
    ) -> None:
        folder_name = (f"{self.config['ENVIRONMENT']}/"
                       f"{self.crawling_date.year}/"
                       f"{self.crawling_date.month:02}/"
                       f"{self.crawling_date.day:02}/"
                       f"{str(self.crawling_start_time)}/"
                       f"data/"
                       f"{sido_name}/"
                       f"{gugun_name}/"
                       f"{dong_name}/"
                       f"{mulgun_text}")

        if data_type == "bid":
            folder_name += f"/{data_type}"

        self.s3_client.upload_any_file(
            folder_name=folder_name,
            file_name=file_name,
            file_path=file_path,
            mime_type="text/html",
            mode="rb",
        )

        logger.info(
            "Upload page to s3",
            sido=sido_name,
            gugun=gugun_name,
            dong=dong_name,
            mulgun_text=mulgun_text,
            data_type=data_type,
        )

    def upload_crawler_log_to_s3(self, run_by: str) -> None:
        total_statistics = attr.asdict(self.total_statistics)
        area_step = self.config["BUILDING_AREA_STEP"]
        area_start = self.config["BUILDING_AREA_START"]
        area_end = self.config["BUILDING_AREA_END"]
        area_range = [{
            "start_area": area_end,
            "end_area": 1000
        } if x == area_end else {
            "start_area": x,
            "end_area": x + area_step
        } for x in range(area_start, area_end + area_step, area_step)]
        data = {
            "time_stamp": self.crawling_start_time,
            "run_by": run_by,
            "finish_time_stamp": str(timestamp(tznow())),
            "total_statistics": total_statistics,
            "area_range": area_range,
        }

        folder_name = (f"{self.config['ENVIRONMENT']}/"
                       f"{self.crawling_date.year}/"
                       f"{self.crawling_date.month:02}/"
                       f"{self.crawling_date.day:02}/"
                       f"{str(self.crawling_start_time)}/"
                       f"crawler-log")

        file_name = f"{self.crawling_start_time}.json"

        self.s3_client.upload_json(folder_name=folder_name,
                                   file_name=file_name,
                                   data=data)

        logger.info(
            "Upload crawler log to s3",
            folder_name=folder_name,
            file_name=file_name,
        )
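
A hypothetical driver for TaeinCrawler, shown only to make the entry point concrete. Every key below is read somewhere in the class above, but the values are placeholders and are not taken from the original project.

if __name__ == "__main__":
    config = {
        "ENVIRONMENT": "local",
        "SLACK_CHANNEL": "#crawler-alerts",
        "SLACK_API_TOKEN": "xoxb-...",
        "LOGIN_ID": "user",
        "LOGIN_PW": "password",
        "CLIENT_DELAY": 1.0,
        "PROXY_HOST_LIST": ["http://proxy-1:8080", "http://proxy-2:8080"],
        "MULGUN_KIND": "아파트",            # matched against the mulgun kind text
        "REGION_REGEX_LEVEL_1": "서울",
        "REGION_REGEX_LEVEL_2": ".*",
        "REGION_REGEX_LEVEL_3": ".*",
        "BUILDING_AREA_START": 0,
        "BUILDING_AREA_STEP": 20,
        "BUILDING_AREA_END": 400,
    }
    TaeinCrawler(config).run(run_by="manual")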
Example #8
class InfocareStore(object):
    def __init__(self, config: typing.Dict[str, typing.Any]) -> None:
        super().__init__()
        self.config = config
        self.session_factory = create_session_factory(config)
        self.s3_client = S3Client(config)
        self.slack_client = SlackClient(
            config.get("SLACK_CHANNEL"), config.get("SLACK_API_TOKEN")
        )
        self.storing_date: datetime.datetime = tznow(
            pytz.timezone("Asia/Seoul")
        )
        self.region_level_1 = self.config["REGION_REGEX_LEVEL_1"]
        self.region_level_2 = self.config["REGION_REGEX_LEVEL_2"]
        self.region_level_3 = self.config["REGION_REGEX_LEVEL_3"]
        self.completed_sido_ids: typing.Dict[str, int] = dict()
        self.completed_gugun_ids: typing.Dict[str, int] = dict()

    def run(self, run_by: str) -> None:
        if self.config["ENVIRONMENT"] == "local":
            session = self.session_factory()
            try:
                init_loan_db_schema(session)
            except Exception:
                raise
            finally:
                session.close()

        self.slack_client.send_info_slack(
            f"Store 시작합니다. ({self.config['ENVIRONMENT']}, {run_by})"
        )
        crawler_log_id = self.config["CRAWLER_LOG_ID"]

        if crawler_log_id:
            self.fetch_received_log_folder()  # use the manually supplied log id folder
        else:
            self.fetch_latest_log_folder()  # use the latest log id folder

        self.slack_client.send_info_slack(
            f"Store 종료합니다. ({self.config['ENVIRONMENT']}, {run_by})"
        )

    def fetch_latest_log_folder(self) -> None:
        env_prefix = f"{self.config['ENVIRONMENT']}/"
        year_prefix = self.fetch_latest_date_folder(env_prefix)
        month_prefix = self.fetch_latest_date_folder(year_prefix)
        day_prefix = self.fetch_latest_date_folder(month_prefix)
        log_id_prefix = self.fetch_latest_date_folder(day_prefix)
        self.fetch_sido_region_folder(log_id_prefix)

    def fetch_latest_date_folder(self, base_prefix: str) -> str:
        date_list: typing.List[str] = list()
        for response in self.s3_client.get_objects(base_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise InfocareStoreS3NotFound("not found date list")
            for date_prefix in prefixes:
                date = (
                    date_prefix["Prefix"]
                    .replace(base_prefix, "")
                    .replace("/", "")
                    .strip()
                )
                date_list.append(date)
            date_list.sort()
        base_prefix += date_list[-1] + "/"

        return base_prefix

    def fetch_received_log_folder(self) -> None:
        crawler_log_id = self.config["CRAWLER_LOG_ID"]
        crawler_date = tzfromtimestamp(float(crawler_log_id))
        log_id_prefix = (
            f"{self.config['ENVIRONMENT']}/"
            f"{crawler_date.year}/"
            f"{crawler_date.month}/"
            f"{crawler_date.day}/"
            f"{crawler_log_id}/"
        )
        self.fetch_sido_region_folder(log_id_prefix)

    def fetch_sido_region_folder(self, log_id_prefix: str) -> None:
        data_prefix = log_id_prefix + "data/"
        sido_check: bool = False
        for response in self.s3_client.get_objects(data_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise InfocareStoreS3NotFound("not found sido region list")
            for sido_prefix in prefixes:
                sido_name = (
                    sido_prefix["Prefix"]
                    .replace(data_prefix, "")
                    .replace("/", "")
                    .strip()
                )
                if re.search(self.region_level_1, sido_name):
                    sido_check = True
                    self.fetch_gugun_region_folder(sido_prefix["Prefix"])

        if not sido_check:
            raise InfocareStoreRegionNotFound(
                f"not found sido({self.region_level_1})"
            )

    def fetch_gugun_region_folder(self, sido_prefix: str) -> None:
        gugun_check: bool = False
        for response in self.s3_client.get_objects(sido_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise InfocareStoreS3NotFound("not found gugun region list")
            for gugun_prefix in prefixes:
                gugun_name = (
                    gugun_prefix["Prefix"]
                    .replace(sido_prefix, "")
                    .replace("/", "")
                    .strip()
                )
                if re.search(self.region_level_2, gugun_name):
                    gugun_check = True
                    self.fetch_dong_region_folder(gugun_prefix["Prefix"])

        if not gugun_check:
            raise InfocareStoreRegionNotFound(
                f"not found gugun({self.region_level_2})"
            )

    def fetch_dong_region_folder(self, gugun_prefix: str) -> None:
        dong_check: bool = False
        for response in self.s3_client.get_objects(
            gugun_prefix, Delimiter="/"
        ):
            prefixes = response.common_prefixes
            if not prefixes:
                raise InfocareStoreS3NotFound("not found dong region list")
            for dong_prefix in prefixes:
                dong_name = (
                    dong_prefix["Prefix"]
                    .replace(gugun_prefix, "")
                    .replace("/", "")
                    .strip()
                )
                if re.search(self.region_level_3, dong_name):
                    dong_check = True
                    self.fetch_main_using_type_folder(dong_prefix["Prefix"])

        if not dong_check:
            raise InfocareStoreRegionNotFound(
                f"not found dong({self.region_level_3})"
            )

    def fetch_main_using_type_folder(self, dong_prefix: str) -> None:
        for response in self.s3_client.get_objects(dong_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise InfocareStoreS3NotFound("not found main using list")
            for main_using_prefix in prefixes:
                self.fetch_sub_using_type_folder(main_using_prefix["Prefix"])

    def fetch_sub_using_type_folder(self, main_using_prefix: str) -> None:
        for response in self.s3_client.get_objects(
            main_using_prefix, Delimiter="/"
        ):
            prefixes = response.common_prefixes
            if not prefixes:
                raise InfocareStoreS3NotFound("not found sub using list")
            for sub_using_prefix in prefixes:
                self.fetch_statistics_folder(sub_using_prefix["Prefix"])

    def fetch_statistics_folder(self, sub_using_prefix: str) -> None:
        """
        인포케어 통계 html 파일에는 선택된 시도, 시군구, 동읍면에 대한 통계가 드롭다운형식으로
        저장되어있음. 해당 드롭다운의 인덱스를 활용하여 중복데이터 저장을 방지합니다.

        만약에 시도가 바뀌었다면 드롭다운에서의 시군구와 동읍면의 인덱스는 첫번째일것이며
        만약에 시군구가 바뀌었다면 드롭다운에서의 동읍면의 인덱스는 첫번째일것입니다.

        해당 html 파일에서 데이터가 선택된 시,도에 대한
        첫번째 시군구 and 첫번째 동읍면이라면 시,도가 바뀐것으로
        시도, 시군구, 동읍면의 통계를 전부 저장합니다.

        해당 html 파일에서 데이터가 선택된 시군구에 대한
        첫번째 동읍면이라면 시군구가 바뀐것으로
        시군구, 동읍면 통계를 전부 저장합니다.

        위의 2가지 케이스에 해당하지 않으면 동읍면 통계만 저장합니다.
        """

        for response in self.s3_client.get_objects(
            sub_using_prefix, Delimiter="/"
        ):
            contents = response.contents
            prefixes = response.common_prefixes
            if not contents:
                raise InfocareStoreS3NotFound("not found statistics data")
            for content in contents:
                file_prefix = content["Key"]
                s3_response = self.s3_client.get_object(file_prefix)
                statistics_data = s3_response.body.read().decode("utf-8")
                statistics = InfocareStatisticResponse.from_html(
                    statistics_data
                )

                # store sido statistics
                if (
                    statistics.first_gugun_name == statistics.gugun_name
                    and statistics.first_dong_name == statistics.dong_name
                ):
                    db_sido_id = self.store_sido_region(statistics.sido_name)
                    self.store_statistics_data(
                        statistics,
                        db_sido_id=db_sido_id,
                    )
                    # cache the sido id
                    self.completed_sido_ids.update(
                        {statistics.sido_name: db_sido_id}
                    )
                # store gugun statistics
                if statistics.first_dong_name == statistics.dong_name:
                    sido_name = statistics.sido_name
                    db_sido_id = self.completed_sido_ids[sido_name]
                    db_gugun_id = self.store_gugun_region(
                        statistics.gugun_name, db_sido_id
                    )
                    self.store_statistics_data(
                        statistics,
                        db_gugun_id=db_gugun_id,
                    )
                    # cache the gugun id
                    self.completed_gugun_ids.update(
                        {statistics.gugun_name: db_gugun_id}
                    )
                # store dong statistics
                gugun_name = statistics.gugun_name
                db_gugun_id = self.completed_gugun_ids[gugun_name]
                db_dong_id = self.store_dong_region(
                    statistics.dong_name, db_gugun_id
                )
                self.store_statistics_data(
                    statistics,
                    db_dong_id=db_dong_id,
                )

                if prefixes:  # store winning-bid pages
                    for bid_prefix in prefixes:
                        self.fetch_bid_folder(
                            bid_prefix["Prefix"], statistics, db_dong_id
                        )
                else:  # no winning bids for this dong, so expire previously stored bids
                    self.store_bid_expired_check(
                        statistics_data=statistics,
                        db_dong_id=db_dong_id,
                        bid_list=[],
                    )

    def fetch_bid_folder(
        self,
        bid_prefix: str,
        statistics_data: InfocareStatisticResponse,
        db_dong_id: int,
    ) -> None:  # handles a winning-bid page
        for response in self.s3_client.get_objects(bid_prefix, Delimiter="/"):
            contents = response.contents
            if not contents:
                raise InfocareStoreS3NotFound("not found bid data")
            for content in contents:
                file_prefix = content["Key"]
                s3_response = self.s3_client.get_object(file_prefix)
                bid_data = s3_response.body.read().decode("utf-8")
                bid_response = InfocareBidResponse.from_html(bid_data)
                bid_list = bid_response.infocare_bid_list
                self.store_bid_expired_check(
                    bid_list=bid_list,
                    statistics_data=statistics_data,
                    db_dong_id=db_dong_id,
                )  # expire bids for this dong that are no longer listed
                self.store_bid_data(bid_list, statistics_data, db_dong_id)

    def store_sido_region(self, sido_name: str) -> int:
        session = self.session_factory()
        try:
            db_sido = InfocareSido.create_or_update(session, sido_name)
            db_sido_id = db_sido.id
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

        return db_sido_id

    def store_gugun_region(self, gugun_name: str, db_sido_id: int) -> int:
        session = self.session_factory()
        try:
            db_gugun = InfocareGugun.create_or_update(
                session, gugun_name, db_sido_id
            )
            db_gugun_id = db_gugun.id
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

        return db_gugun_id

    def store_dong_region(self, dong_name: str, db_gugun_id: int) -> int:
        session = self.session_factory()
        try:
            db_dong = InfocareDong.create_or_update(
                session, dong_name, db_gugun_id
            )
            db_dong_id = db_dong.id
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

        return db_dong_id

    def store_statistics_data(
        self,
        data: InfocareStatisticResponse,
        *,
        db_sido_id: typing.Optional[int] = None,
        db_gugun_id: typing.Optional[int] = None,
        db_dong_id: typing.Optional[int] = None,
    ) -> None:
        session = self.session_factory()
        # Store sido statistics
        if db_sido_id:
            if data.sido_year_bid_count == 0:
                return
            try:
                InfocareStatistics.create_or_update(
                    session,
                    data.start_date,
                    data.end_date,
                    MAIN_USAGE_TYPE[data.main_usage_type],
                    data.sub_usage_type,
                    data.sido_year_avg_price_rate,
                    data.sido_year_avg_bid_rate,
                    data.sido_year_bid_count,
                    data.sido_six_month_avg_price_rate,
                    data.sido_six_month_avg_bid_rate,
                    data.sido_six_month_bid_count,
                    data.sido_three_month_avg_price_rate,
                    data.sido_three_month_avg_bid_rate,
                    data.sido_three_month_bid_count,
                    db_sido_id,
                    db_gugun_id,
                    db_dong_id,
                )
                session.commit()
            except Exception:
                session.rollback()
                raise
            finally:
                session.close()
            logger.info("Store Sido Statistics", sido=data.sido_name)

        # Store gugun statistics
        elif db_gugun_id:
            if data.gugun_year_bid_count == 0:
                return
            try:
                InfocareStatistics.create_or_update(
                    session,
                    data.start_date,
                    data.end_date,
                    MAIN_USAGE_TYPE[data.main_usage_type],
                    data.sub_usage_type,
                    data.gugun_year_avg_price_rate,
                    data.gugun_year_avg_bid_rate,
                    data.gugun_year_bid_count,
                    data.gugun_six_month_avg_price_rate,
                    data.gugun_six_month_avg_bid_rate,
                    data.gugun_six_month_bid_count,
                    data.gugun_three_month_avg_price_rate,
                    data.gugun_three_month_avg_bid_rate,
                    data.gugun_three_month_bid_count,
                    db_sido_id,
                    db_gugun_id,
                    db_dong_id,
                )
                session.commit()
            except Exception:
                session.rollback()
                raise
            finally:
                session.close()
            logger.info(
                "Store Gugun Statistics",
                sido=data.sido_name,
                gugun=data.gugun_name,
            )

        # Store dong statistics
        elif db_dong_id:
            if data.dongli_year_bid_count == 0:
                return
            try:
                InfocareStatistics.create_or_update(
                    session,
                    data.start_date,
                    data.end_date,
                    MAIN_USAGE_TYPE[data.main_usage_type],
                    data.sub_usage_type,
                    data.dongli_year_avg_price_rate,
                    data.dongli_year_avg_bid_rate,
                    data.dongli_year_bid_count,
                    data.dongli_six_month_avg_price_rate,
                    data.dongli_six_month_avg_bid_rate,
                    data.dongli_six_month_bid_count,
                    data.dongli_three_month_avg_price_rate,
                    data.dongli_three_month_avg_bid_rate,
                    data.dongli_three_month_bid_count,
                    db_sido_id,
                    db_gugun_id,
                    db_dong_id,
                )
                session.commit()
            except Exception:
                session.rollback()
                raise
            finally:
                session.close()
            logger.info(
                "Store Dong Statistics",
                sido=data.sido_name,
                gugun=data.gugun_name,
                dong=data.dong_name,
            )

    def store_bid_data(
        self,
        bid_list: typing.List[InfocareBid],
        statistics_data: InfocareStatisticResponse,
        db_dong_id: int,
    ) -> None:
        for bid in bid_list:
            session = self.session_factory()
            try:
                InfocareBid.create_or_update(
                    session,
                    bid.case_number,
                    bid.address,
                    MAIN_USAGE_TYPE[statistics_data.main_usage_type],
                    statistics_data.sub_usage_type,
                    bid.bid_date,
                    bid.estimated_price,
                    bid.lowest_price,
                    bid.success_price,
                    bid.success_bid_rate,
                    db_dong_id,
                )
                session.commit()
            except Exception:
                session.rollback()
                raise
            finally:
                session.close()
            logger.info("Store Bid Statistics", bid=bid.address)

    def store_bid_expired_check(
        self,
        *,
        statistics_data: InfocareStatisticResponse,
        db_dong_id: int,
        bid_list: typing.List[InfocareBid],
    ) -> None:
        session = self.session_factory()

        try:
            db_bid_list = (
                session.query(InfocareBid)
                .filter(
                    InfocareBid.infocare_dong_id == db_dong_id,
                    InfocareBid.expired_date.is_(None),
                )
                .all()
            )

            for db_bid in db_bid_list:
                expired_check = True
                if bid_list:
                    for bid in bid_list:
                        if (
                            db_bid.bid_date == bid.bid_date.date()
                            and db_bid.case_number == bid.case_number
                            and db_bid.address == bid.address
                            and db_bid.main_usage_type.name
                            == MAIN_USAGE_TYPE[statistics_data.main_usage_type]
                            and db_bid.sub_usage_type
                            == statistics_data.sub_usage_type
                        ):
                            expired_check = False

                if expired_check:  # the stored bid is no longer listed, so expire it
                    InfocareBid.update_expired_date(
                        session, db_bid, self.storing_date
                    )
                    session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()
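
The store methods above repeat the same session lifecycle: create a session, commit, roll back on error, and always close. A minimal sketch, assuming nothing beyond the session factory already used in the class, of how that boilerplate could be factored into a context manager; the helper name session_scope is not part of the original code.

import contextlib

@contextlib.contextmanager
def session_scope(session_factory):
    """Provide a commit/rollback/close scope around a series of operations."""
    session = session_factory()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

# store_sido_region() could then be reduced to:
#     with session_scope(self.session_factory) as session:
#         db_sido = InfocareSido.create_or_update(session, sido_name)
#         db_sido_id = db_sido.id
#     return db_sido_id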
Example #9
class NsdiCrawler(object):
    def __init__(
        self,
        config: typing.Dict[str, typing.Any],
    ) -> None:
        super().__init__()
        self.config = config
        self.slack_client = SlackClient(
            config.get("SLACK_CHANNEL"), config.get("SLACK_API_TOKEN")
        )
        self.nsdi_client = NsdiClient(config)
        self.s3_client = S3Client(config)
        self.region_land_use_dict: typing.Dict[str, str] = dict()
        self.region_land_feature_dict: typing.Dict[str, str] = dict()
        self.total_statistics = CrawlerStatistics()
        self.failure_statistics = CrawlerStatistics()
        self.crawling_date: datetime.datetime = tznow(
            pytz.timezone("Asia/Seoul")
        )
        self.crawling_start_time: str = str(
            timestamp(tznow(pytz.timezone("Asia/Seoul")))
        )

    def run(self, run_by: str) -> None:
        """
        토지이용계획정보와 토지특성정보 2가지를 크롤링합니다.
        만약 크롤러 로그가 S3에 없다면 2019-01-01 이후 데이터를 크롤링 합니다.
        만약 크롤러 로그가 있다면 로그를 통해 지역별로 날짜를 비교하며 중복데이터는 저장하지 않습니다
            현재 날짜 기준 6개월 전으로 시작날짜를 잡고 크롤링 합니다.
        크롤러 로그는 정상적으로 모든 프로세스가 완료되었을때만 작성됩니다.
        수집한 데이터가 없어도 크롤러 로그는 항상 s3에 지역별 최신으로 올려줍니다.
        """

        self.slack_client.send_info_slack(
            f"TIME_STAMP: {self.crawling_start_time}\n"
            f"업데이트할 데이터를 찾습니다"
            f"({self.config['ENVIRONMENT']}, {run_by})"
        )

        self.crawl(run_by)

        statistics = slack_failure_percentage_statistics(
            self.total_statistics, self.failure_statistics
        )

        if (
            self.total_statistics.land_use_zip_count == 0
            and self.total_statistics.land_feature_zip_count == 0
        ):
            self.slack_client.send_info_slack(
                f"업데이트할 데이터가 없습니다\n"
                f"TIME_STAMP: {self.crawling_start_time}\n\n"
                f"statistics: {statistics}",
            )
        else:
            self.slack_client.send_info_slack(
                f"데이터 업데이트 완료\n"
                f"TIME_STAMP: {self.crawling_start_time}\n\n"
                f"statistics: {statistics}",
            )

    def crawl(self, run_by: str) -> None:
        land_use_log_none = self.fetch_region_crawler_log(
            prov_org="NIDO",
            gubun="F",
            svc_se="F",
            svc_id="F014",
            name_type="토지이용계획정보",
        )

        land_feature_log_none = self.fetch_region_crawler_log(
            prov_org="SCOS",
            gubun="F",
            svc_se="F",
            svc_id="F024",
            name_type="토지특성정보",
        )

        try:
            self.crawl_land_use(
                prov_org="NIDO",
                svc_se="F",
                svc_id="F014",
                crawler_log_none=land_use_log_none,
            )
        except NsdiCrawlerNotFoundError:
            logger.info("해당하는 날짜의 데이터가 없습니다")

        try:
            self.crawl_land_feature(
                prov_org="SCOS",
                svc_se="F",
                svc_id="F024",
                crawler_log_none=land_feature_log_none,
            )
        except NsdiCrawlerNotFoundError:
            logger.info("해당하는 날짜의 데이터가 없습니다")

        if self.total_statistics.land_use_zip_count > 0:
            self.update_crawler_log(run_by, "토지이용계획정보")

        if self.total_statistics.land_feature_zip_count > 0:
            self.update_crawler_log(run_by, "토지특성정보")

        if (
            self.total_statistics.land_use_zip_count == 0
            and self.total_statistics.land_feature_zip_count != 0
        ):
            self.update_crawler_log(run_by, "토지이용계획정보없음")
        if (
            self.total_statistics.land_feature_zip_count == 0
            and self.total_statistics.land_use_zip_count != 0
        ):
            self.update_crawler_log(run_by, "토지특성정보없음")

    def crawl_land_use(
        self,
        *,
        prov_org: str,
        svc_se: str,
        svc_id: str,
        crawler_log_none: bool,
    ) -> None:  # crawl land-use plan information
        data_type = self.config["DATA_TYPE"]
        extrc_se_search = "AL" if data_type == "전체데이터" else "CH"

        if crawler_log_none:
            start_date = "2019-01-01"
        else:
            start_date = (
                self.crawling_date - datetime.timedelta(weeks=25)
            ).strftime("%Y-%m-%d")

        end_date = (self.crawling_date + datetime.timedelta(days=1)).strftime(
            "%Y-%m-%d"
        )

        self.crawl_land_info(
            svc_se, svc_id, start_date, end_date, extrc_se_search, prov_org
        )

        if self.total_statistics.land_use_zip_count == 0:
            logger.info("수집된 데이터가 없습니다")

    def crawl_land_feature(
        self,
        *,
        prov_org: str,
        svc_se: str,
        svc_id: str,
        crawler_log_none: bool,
    ) -> None:  # crawl land characteristics information
        data_type = self.config["DATA_TYPE"]
        extrc_se_search = "AL" if data_type == "전체데이터" else "CH"

        if crawler_log_none:
            start_date = "2019-01-01"
        else:
            start_date = (
                self.crawling_date - datetime.timedelta(weeks=25)
            ).strftime("%Y-%m-%d")

        end_date = (self.crawling_date + datetime.timedelta(days=1)).strftime(
            "%Y-%m-%d"
        )

        self.crawl_land_info(
            svc_se, svc_id, start_date, end_date, extrc_se_search, prov_org
        )

        if self.total_statistics.land_feature_zip_count == 0:
            logger.info("수집된 데이터가 없습니다")

    def crawl_land_info(
        self,
        svc_se: str,
        svc_id: str,
        start_date: str,
        end_date: str,
        extrc_se_search: str,
        prov_org: str,
    ) -> None:

        try:
            page = self.nsdi_client.fetch_land_using_info_table(
                svc_se,
                svc_id,
                start_date,
                end_date,
                extrc_se_search,
                prov_org,
                1,
            )
        except TypeError:
            raise NsdiCrawlerNotFoundError("해당하는 날짜의 데이터가 없습니다")

        with tempfile.TemporaryDirectory() as temp_dir:  # temporary working directory
            for i in range(page.total_page, 0, -1):
                info_list = self.nsdi_client.fetch_land_using_info_table(
                    svc_se,
                    svc_id,
                    start_date,
                    end_date,
                    extrc_se_search,
                    prov_org,
                    i,
                ).land_using_info

                for info in reversed(info_list):
                    region_dict_date = info.base_date
                    if info.city_type == "인천광역시 남구":
                        if info.name_type == "토지이용계획정보":
                            region_dict_date = self.region_land_use_dict[
                                "인천광역시 미추홀구"
                            ]
                        elif info.name_type == "토지특성정보":
                            region_dict_date = self.region_land_feature_dict[
                                "인천광역시 미추홀구"
                            ]
                    else:
                        if info.name_type == "토지이용계획정보":
                            region_dict_date = self.region_land_use_dict[
                                info.city_type
                            ]
                        elif info.name_type == "토지특성정보":
                            region_dict_date = self.region_land_feature_dict[
                                info.city_type
                            ]
                    log_datetime = datetime.datetime.strptime(
                        region_dict_date, "%Y-%m-%d"
                    )
                    nsdi_datetime = datetime.datetime.strptime(
                        info.base_date, "%Y-%m-%d"
                    )

                    if nsdi_datetime > log_datetime and (
                        info.name_type == "토지이용계획정보"
                        or info.name_type == "토지특성정보"
                    ):  # newer than the logged date, so download
                        logger.info(
                            "Crawling Data",
                            data_type=info.data_type,
                            city_type=info.city_type,
                            name_type=info.name_type,
                            base_date=info.base_date,
                            file_size=info.file_size,
                        )
                        self.download_zip_data(info, temp_dir, prov_org)
                        if info.name_type == "토지이용계획정보":
                            self.region_land_use_dict[
                                info.city_type
                            ] = info.base_date
                        elif info.name_type == "토지특성정보":
                            self.region_land_feature_dict[
                                info.city_type
                            ] = info.base_date

                    else:
                        continue

    def download_zip_data(
        self,
        nsdi_land_using_info: NsdiLandUsingInfo,
        temp_dir: str,
        prov_org: str,
    ) -> None:
        table_data = nsdi_land_using_info.table_data
        temp_path = str(temp_dir) + "\\"
        file_name = table_data.file_nm_dialog

        if self.config["DOWNLOAD"] == "ON":
            # download the ZIP archive
            response = self.nsdi_client.fetch_download_response(
                table_data, prov_org
            )
            download_from_response(temp_path, file_name, response)
            path = temp_path + file_name
        else:
            path = resource.get_resource("/csv/nsdi_csv.zip")

        self.upload_zip_data(nsdi_land_using_info, path)

        if nsdi_land_using_info.name_type == "토지이용계획정보":
            self.total_statistics.land_use_zip_count += 1
        elif nsdi_land_using_info.name_type == "토지특성정보":
            self.total_statistics.land_feature_zip_count += 1

    def upload_zip_data(
        self, nsdi_land_using_info: NsdiLandUsingInfo, temp_path: str
    ) -> None:

        if len(nsdi_land_using_info.city_type.split()) > 1:  # sigungu-level data
            region_split = nsdi_land_using_info.city_type.split()
            sido_name = region_split[0]
            gugun_name = nsdi_land_using_info.city_type.replace(sido_name, "")
        else:  # sido-level data
            sido_name = nsdi_land_using_info.city_type
            gugun_name = "ALL"

        folder_name = (
            f"{self.config['ENVIRONMENT']}/"
            f"{self.crawling_date.year}/"
            f"{self.crawling_date.month:02}/"
            f"{self.crawling_date.day:02}/"
            f"{str(self.crawling_start_time)}/"
            f"{nsdi_land_using_info.name_type}/"
            f"data/"
            f"{nsdi_land_using_info.data_type}/"
            f"{sido_name}/"
            f"{gugun_name}/"
            f"base_date_{nsdi_land_using_info.base_date}"
        )
        file_name = nsdi_land_using_info.table_data.file_nm_dialog

        self.s3_client.upload_s3_zip(
            folder_name=folder_name,
            file_name=file_name,
            temp_path=temp_path,
            mime_type="application/zip",
        )

    def update_crawler_log(self, run_by: str, name_type: str) -> None:
        """
        크롤러 로그는 기존 크롤러 로그를 업데이트하는 방식으로 작성되어집니다.
        """
        region_date_list: typing.List[CrawlerRegionDate] = []
        total_statistics = attr.asdict(self.total_statistics)
        if name_type == "토지이용계획정보":
            total_statistics["land_feature_zip_count"] = 0
        elif name_type == "토지특성정보":
            total_statistics["land_use_zip_count"] = 0
        elif name_type == "토지이용계획정보없음":
            total_statistics["land_feature_zip_count"] = 0
            total_statistics["land_use_zip_count"] = 0
            name_type = "토지이용계획정보"
        elif name_type == "토지특성정보없음":
            total_statistics["land_feature_zip_count"] = 0
            total_statistics["land_use_zip_count"] = 0
            name_type = "토지특성정보"

        if name_type == "토지이용계획정보":
            for region, date in self.region_land_use_dict.items():
                if date != "0001-01-01":
                    region_date_list.append(
                        CrawlerRegionDate(region=region, date=date)
                    )
        elif name_type == "토지특성정보":
            for region, date in self.region_land_feature_dict.items():
                if date != "0001-01-01":
                    region_date_list.append(
                        CrawlerRegionDate(region=region, date=date)
                    )

        data = {
            "time_stamp": self.crawling_start_time,
            "run_by": run_by,
            "finish_time_stamp": str(timestamp(tznow())),
            "total_statistics": total_statistics,
            "region_date": [vars(x) for x in region_date_list],
        }

        folder_name = (
            f"{self.config['ENVIRONMENT']}/"
            f"{self.crawling_date.year}/"
            f"{self.crawling_date.month:02}/"
            f"{self.crawling_date.day:02}/"
            f"{str(self.crawling_start_time)}/"
            f"{name_type}/"
            f"crawler-log"
        )

        file_name = f"{self.crawling_start_time}.json"

        self.s3_client.upload_s3(
            folder_name, file_name, data, "application/json", encoding="utf-8"
        )

    def fetch_region_crawler_log(
        self,
        *,
        prov_org: str,
        gubun: str,
        svc_se: str,
        svc_id: str,
        name_type: str,
    ) -> bool:
        """
        지역별 날짜를 딕셔너리에 저장하고 크롤러 로그의 유무를 반환합니다.
        토지특성정보 데이터의 경우에는 시,군,구에 대한 최신 데이터 날짜도 가져옵니다.
        """
        response = self.nsdi_client.init_page(prov_org, gubun, svc_se, svc_id)
        region_list = self.nsdi_client.fetch_region_list(response)

        for region in region_list:
            if name_type == "토지특성정보":
                self.region_land_feature_dict.update(
                    {region.adm_code_nm: "0001-01-01"}
                )
                region_detail_list = self.nsdi_client.fetch_region_detail_list(
                    region.adm_code
                )
                for region_detail in region_detail_list:
                    self.region_land_feature_dict.update(
                        {region_detail.adm_code_nm: "0001-01-01"}
                    )
            elif name_type == "토지이용계획정보":
                self.region_land_use_dict.update(
                    {region.adm_code_nm: "0001-01-01"}
                )

        try:
            crawler_log = self.fetch_crawler_log(name_type)
            crawler_log_none = False
            for sido in crawler_log.region_date:
                if name_type == "토지이용계획정보":
                    self.region_land_use_dict[sido.region] = sido.date
                elif name_type == "토지특성정보":
                    self.region_land_feature_dict[sido.region] = sido.date
        except TypeError:
            crawler_log_none = True

        return crawler_log_none

    def fetch_crawler_log(self, name_type: str) -> CrawlerLogResponse:
        log_id_prefix = self.fetch_crawler_log_path(name_type)
        response = self.s3_client.get_object(log_id_prefix)
        json_log = json.loads(response.body.read())
        return CrawlerLogResponse.from_json(json_log)

    def fetch_crawler_log_path(self, name_type: str) -> str:  # latest log folder path
        env_prefix = f"{self.config['ENVIRONMENT']}/"
        year_list: typing.List[str] = []
        month_list: typing.List[str] = []
        day_list: typing.List[str] = []
        time_stamp_list: typing.List[str] = []
        log_id_list: typing.List[str] = []

        for response in self.s3_client.get_objects(env_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            for year_prefix in prefixes:
                year = (
                    year_prefix["Prefix"]
                    .replace(env_prefix, "")
                    .replace("/", "")
                    .strip()
                )
                year_list.append(year)
            year_list.sort()
        year_prefix = env_prefix + year_list[-1] + "/"

        for response in self.s3_client.get_objects(year_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            for month_prefix in prefixes:
                month = (
                    month_prefix["Prefix"]
                    .replace(year_prefix, "")
                    .replace("/", "")
                    .strip()
                )
                month_list.append(month)
            month_list.sort()
        month_prefix = year_prefix + month_list[-1] + "/"

        for response in self.s3_client.get_objects(
            month_prefix, Delimiter="/"
        ):
            prefixes = response.common_prefixes

            for day_prefix in prefixes:
                day = (
                    day_prefix["Prefix"]
                    .replace(month_prefix, "")
                    .replace("/", "")
                    .strip()
                )
                day_list.append(day)
            day_list.sort()
        day_prefix = month_prefix + day_list[-1] + "/"

        for response in self.s3_client.get_objects(day_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            for time_stamp_prefix in prefixes:
                time_stamp = (
                    time_stamp_prefix["Prefix"]
                    .replace(day_prefix, "")
                    .replace("/", "")
                    .strip()
                )
                time_stamp_list.append(time_stamp)
            time_stamp_list.sort()

        time_stamp_prefix = day_prefix + time_stamp_list[-1] + "/"

        log_id_prefix = f"{time_stamp_prefix}" f"{name_type}/" f"crawler-log/"

        for response in self.s3_client.get_objects(log_id_prefix):
            for content in response.contents:
                log_id = content["Key"].split("/")[-1].replace(".json", "")
                log_id_list.append(log_id)
            log_id_list.sort()

        log_id_prefix += log_id_list[-1] + ".json"

        return log_id_prefix
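
For reference, the shape of the crawler log that update_crawler_log() uploads, reconstructed from the code above. All values are illustrative placeholders, and CrawlerStatistics fields other than the two ZIP counters are omitted.

crawler_log = {
    "time_stamp": "1617235200.0",          # crawling_start_time
    "run_by": "scheduler",
    "finish_time_stamp": "1617238800.0",
    "total_statistics": {                  # attr.asdict(CrawlerStatistics)
        "land_use_zip_count": 12,
        "land_feature_zip_count": 0,       # zeroed for the other name_type
    },
    "region_date": [
        {"region": "서울특별시", "date": "2021-04-01"},
        {"region": "부산광역시", "date": "2021-03-15"},
    ],
}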
Example #10
class NsdiStore(object):
    def __init__(self, config: typing.Dict[str, typing.Any]) -> None:
        super().__init__()
        self.config = config
        self.session_factory = create_session_factory(config)
        self.s3_client = S3Client(config)
        self.slack_client = SlackClient(config.get("SLACK_CHANNEL"),
                                        config.get("SLACK_API_TOKEN"))
        self.region_level_1 = self.config["REGION_REGEX_LEVEL_1"]
        self.region_level_2 = self.config["REGION_REGEX_LEVEL_2"]

    def run(self, run_by: str) -> None:

        self.slack_client.send_info_slack(
            f"Store 시작합니다. ({self.config['ENVIRONMENT']}, {run_by})")

        crawler_log_id = self.config["CRAWLER_LOG_ID"]

        if crawler_log_id:
            self.fetch_received_log_folder()  # use the manually supplied log id folder
        else:
            self.fetch_latest_log_folder()  # use the latest log id folder

        self.slack_client.send_info_slack(
            f"Store 종료합니다. ({self.config['ENVIRONMENT']}, {run_by})")

    def fetch_received_log_folder(self) -> None:
        crawler_log_id = self.config["CRAWLER_LOG_ID"]
        crawler_date = tzfromtimestamp(float(crawler_log_id))
        log_id_prefix = (f"{self.config['ENVIRONMENT']}/"
                         f"{crawler_date.year}/"
                         f"{crawler_date.month}/"
                         f"{crawler_date.day}/"
                         f"{crawler_log_id}/")
        self.fetch_name_type_folder(log_id_prefix)

    def fetch_latest_log_folder(self) -> None:
        env_prefix = f"{self.config['ENVIRONMENT']}/"
        year_prefix = self.fetch_latest_folder(env_prefix)
        month_prefix = self.fetch_latest_folder(year_prefix)
        day_prefix = self.fetch_latest_folder(month_prefix)
        log_id_prefix = self.fetch_latest_folder(day_prefix)
        self.fetch_name_type_folder(log_id_prefix)

    def fetch_latest_folder(self, base_prefix: str) -> str:
        date_list: typing.List[str] = list()
        for response in self.s3_client.get_objects(base_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise NsdiStoreS3NotFound("not found date list")
            for date_prefix in prefixes:
                date = (date_prefix["Prefix"].replace(base_prefix,
                                                      "").replace("/",
                                                                  "").strip())
                date_list.append(date)
            date_list.sort()
        base_prefix += date_list[-1] + "/"

        return base_prefix

    def fetch_name_type_folder(self, log_id_prefix: str) -> None:
        for response in self.s3_client.get_objects(log_id_prefix,
                                                   Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise NsdiStoreS3NotFound("not found name type folder")
            for name_type_prefix in prefixes:
                name_type = (name_type_prefix["Prefix"].replace(
                    log_id_prefix, "").replace("/", "").strip())
                if "토지이용계획정보" == name_type:
                    self.fetch_sido_region_folder(name_type_prefix['Prefix'],
                                                  "토지이용계획정보")
                elif "토지특성정보" == name_type:
                    self.fetch_sido_region_folder(name_type_prefix['Prefix'],
                                                  "토지특성정보")

    def fetch_sido_region_folder(self, name_type_prefix: str,
                                 name_type: str) -> None:
        name_type_prefix += "data/전체데이터/"
        sido_check: bool = False
        for response in self.s3_client.get_objects(name_type_prefix,
                                                   Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise NsdiStoreS3NotFound("not found sido region list")
            for sido_prefix in prefixes:
                sido_name = (sido_prefix["Prefix"].replace(
                    name_type_prefix, "").replace("/", "").strip())
                if re.search(self.region_level_1, sido_name):
                    sido_check = True
                    self.fetch_gugun_region_folder(sido_prefix["Prefix"],
                                                   name_type)

        if not sido_check:
            raise NsdiStoreRegionNotFound(
                f"not found sido({self.region_level_1})")

    def fetch_gugun_region_folder(self, sido_prefix: str,
                                  name_type: str) -> None:
        gugun_check: bool = False
        for response in self.s3_client.get_objects(sido_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise NsdiStoreS3NotFound("not found gugun region list")
            for gugun_prefix in prefixes:
                gugun_name = (gugun_prefix["Prefix"].replace(
                    sido_prefix, "").replace("/", "").strip())
                if re.search(self.region_level_2, gugun_name):
                    gugun_check = True
                    self.fetch_base_date_folder(gugun_prefix["Prefix"],
                                                name_type)

        if not gugun_check:
            raise NsdiStoreRegionNotFound(
                f"not found gugun({self.region_level_2})")

    def fetch_base_date_folder(self, gugun_prefix: str,
                               name_type: str) -> None:
        for response in self.s3_client.get_objects(gugun_prefix,
                                                   Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise NsdiStoreS3NotFound("not found base date list")
            for base_date_prefix in prefixes:
                self.fetch_zip_data_folder(base_date_prefix["Prefix"],
                                           name_type)

    def fetch_zip_data_folder(self, base_date_prefix: str,
                              name_type: str) -> None:
        for response in self.s3_client.get_objects(base_date_prefix,
                                                   Delimiter="/"):
            contents = response.contents
            if not contents:
                raise NsdiStoreS3NotFound("not found statistics data")
            for content in contents:
                file_prefix = content["Key"]
                file_name = (file_prefix.replace(base_date_prefix,
                                                 "").replace("/", "").strip())
                with tempfile.TemporaryDirectory() as temp_dir:
                    folder_path = str(temp_dir) + "/"
                    logger.info(folder_path)
                    file_path = folder_path + file_name
                    logger.info("S3 ZIP DOWNLOAD", file_name=file_name)
                    self.s3_client.download_object(file_prefix, file_path)
                    file_name = file_name.replace(".zip", ".csv")
                    extract_zip_file(file_path, folder_path, file_name)
                    file_path = file_path.replace(".zip", ".csv")

                    converted_csv_path = self.convert_csv_file(
                        file_path, folder_path, file_name, name_type)
                    self.store_csv_data(converted_csv_path, name_type)

    def convert_csv_file(self, file_path: str, folder_path: str,
                         file_name: str, name_type: str) -> str:
        logger.info("Convert start", file_name=file_name)
        if name_type == "토지이용계획정보":
            converted_csv_path = convert_land_csv(file_path, folder_path,
                                                  file_name, NSDI_USE_DICT)
        elif name_type == "토지특성정보":
            converted_csv_path = convert_land_csv(file_path, folder_path,
                                                  file_name, NSDI_FEATURE_DICT)
        else:
            raise NsdiStoreError("not found name type")
        logger.info("Convert finish", file_name=file_name)

        return converted_csv_path

    def store_csv_data(self, file_path: str, name_type: str) -> None:
        """
        우선은 10,000줄 단위로 읽은 후 upsert하도록 하였습니다
        만약 bulk insert를 원할경우 store_bulk_insert 메소드를 사용해주세요
        """
        rows = list()
        if name_type == "토지이용계획정보":
            # self.store_land_use_bulk_insert(file_path)
            for idx, row in enumerate(read_csv(file_path)):
                rows.append(row)
                if idx % 10000 == 0 and idx != 0:
                    self.store_land_use_bulk_upsert(rows)
                    rows.clear()
            if rows:
                self.store_land_use_bulk_upsert(rows)
        elif name_type == "토지특성정보":
            # self.store_land_feature_bulk_insert(file_path)
            for idx, row in enumerate(read_csv(file_path)):
                rows.append(row)
                if idx % 10000 == 0 and idx != 0:
                    self.store_land_feature_bulk_upsert(rows)
                    rows.clear()
            if rows:
                self.store_land_feature_bulk_upsert(rows)

    def store_land_use_bulk_insert(self, file_path: str) -> None:
        """
        csv 파일을 한번에 읽어서 bulk insert를 해줍니다
        """
        bulk_values: typing.List[typing.Dict[str, str]]
        session = self.session_factory()
        with open(file_path, "r", encoding="utf-8-sig") as csv_file:
            csv_dict_reader = DictReader(csv_file)
            bulk_values = list(csv_dict_reader)
        logger.info("bulk insert start")
        try:
            session.bulk_insert_mappings(NsdiLandUse, bulk_values)
            session.commit()
        except Exception:
            raise NsdiStoreError("Store land use error")
        finally:
            session.close()
        logger.info("bulk insert finish")

    def store_land_feature_bulk_insert(self, file_path: str) -> None:
        """
        csv 파일을 한번에 읽어서 bulk insert를 해줍니다
        """
        bulk_values: typing.List[typing.Dict[str, str]]
        session = self.session_factory()
        with open(file_path, "r", encoding="utf-8-sig") as csv_file:
            csv_dict_reader = DictReader(csv_file)
            bulk_values = list(csv_dict_reader)
        logger.info("bulk insert start")
        try:
            session.bulk_insert_mappings(NsdiLandFeature, bulk_values)
            session.commit()
        except Exception:
            raise NsdiStoreError("Store land feature error")
        finally:
            session.close()
        logger.info("bulk insert finish")

    def store_land_use_bulk_upsert(self, bulk_values: list) -> None:
        """
        이 부분을 사용하려면 loan-model에 bulk_create_or_update를 추가해야합니다.
        """
        session = self.session_factory()
        logger.info("bulk upsert start")
        try:
            NsdiLandUse.bulk_create_or_update(session, bulk_values)
            session.commit()
        except Exception:
            raise NsdiStoreError("Store land use error")
        finally:
            session.close()
        logger.info("bulk upsert finish")

    def store_land_feature_bulk_upsert(self, bulk_values: list) -> None:
        """
        이 부분을 사용하려면 loan-model에 bulk_create_or_update를 추가해야합니다.
        """
        session = self.session_factory()
        logger.info("bulk upsert start")
        try:
            NsdiLandFeature.bulk_create_or_update(session, bulk_values)
            session.commit()
        except Exception:
            raise NsdiStoreError("Store land feature error")
        finally:
            session.close()
        logger.info("bulk upsert finish")
Exemple #11
0
class InfoCareCrawler(object):
    def __init__(
        self,
        config: typing.Dict[str, typing.Any],
    ):
        super().__init__()
        self.config = config

        self.slack_client = SlackClient(config.get("SLACK_CHANNEL"),
                                        config.get("SLACK_API_TOKEN"))
        self.info_care_client = InfocareClient(config)
        self.s3_client = S3Client(config)
        self.total_statistics = CrawlerStatistics()
        self.failure_statistics = CrawlerStatistics()
        self.crawling_date: datetime.datetime = tznow(
            pytz.timezone("Asia/Seoul"))
        self.crawling_start_time: str = str(timestamp(self.crawling_date))

    def run(self, run_by: str) -> None:
        self.slack_client.send_info_slack(
            f"TIME_STAMP: {self.crawling_start_time}\n"
            f"크롤링 시작합니다 "
            f"({self.config['ENVIRONMENT']}, {run_by})")
        self.crawl()
        self.update_crawler_log(run_by)

        statistics = slack_failure_percentage_statistics(
            self.total_statistics, self.failure_statistics)

        self.slack_client.send_info_slack(
            f"크롤링 완료\n"
            f"TIME_STAMP: {self.crawling_start_time}\n\n"
            f"statistics:\n"
            f"region_count\n{statistics['region_count']}\n\n"
            f"statistics_count\n{statistics['statistics_count']}\n\n"
            f"bids_count\n{statistics['bids_count']}")

    def crawl(self) -> None:
        login_id = self.config["LOGIN_ID"]
        login_pw = self.config["LOGIN_PW"]

        chk_id = self.info_care_client.fetch_chk_id().chk_id
        self.info_care_client.login(login_id, login_pw, chk_id)

        # fetch the do, sigungu, and eup/myeon/dong region lists and search each
        try:
            self.crawl_sido_region()
        except Exception as e:
            raise e
        finally:
            self.info_care_client.logout()

    def crawl_sido_region(self) -> None:
        try:
            do_list = self.info_care_client.fetch_sido_list()
        except Exception as e:
            self.failure_statistics.region_count += 1
            raise e

        for do in do_list:
            if re.search(self.config["SIDO"], do.sido_name):
                self.crawl_sigungu_region(do.sido_name)

    def crawl_sigungu_region(self, sido: str) -> None:
        try:
            si_list = self.info_care_client.fetch_sigungu_list(sido)
        except Exception as e:
            self.failure_statistics.region_count += 1
            raise e

        for si in si_list:
            if re.search(self.config["SIGUNGU"], si.sigungu_name):
                self.crawl_dongli_region(sido, si.sigungu_name)

    def crawl_dongli_region(self, sido: str, sigungu: str) -> None:
        try:
            dongli_list = self.info_care_client.fetch_dongli_list(
                sido, sigungu)
        except Exception as e:
            self.failure_statistics.region_count += 1
            raise e

        for dong in dongli_list:
            if re.search(self.config["DONGLI"], dong.dongli_name):
                self.crawl_main_using_type(sido, sigungu, dong.dongli_name)

    def crawl_main_using_type(self, sido: str, sigungu: str,
                              dongli: str) -> None:
        try:
            main_using_list = self.info_care_client.fetch_main_using_type()
        except Exception as e:
            self.failure_statistics.region_count += 1
            raise e

        for main_using in main_using_list:
            main_using_type = main_using.main_using_type

            if re.search(self.config["MAIN_USING_TYPE"], main_using_type):
                self.crawl_sub_using_type(sido, sigungu, dongli,
                                          main_using_type)

    def crawl_sub_using_type(self, sido: str, sigungu: str, dongli: str,
                             main_using_type: str) -> None:
        try:
            sub_using_list = self.info_care_client.fetch_sub_using_type(
                main_using_type)
        except Exception as e:
            self.failure_statistics.region_count += 1
            raise e

        for sub_using in sub_using_list:
            sub_using_type = sub_using.sub_using_type

            if re.search(self.config["SUB_USING_TYPE"], sub_using_type):

                logger.info(
                    "Crawling Statistics",
                    sido=sido,
                    sigungu=sigungu,
                    dong=dongli,
                    main_using_type=main_using_type,
                    sub_using_type=sub_using_type,
                )
                data = self.info_care_client.fetch_statistics_page(
                    sido, sigungu, dongli, main_using_type, sub_using_type)
                self.crawl_html_data(
                    data,
                    sido,
                    sigungu,
                    dongli,
                    main_using_type,
                    sub_using_type,
                )

        self.total_statistics.region_count += 1

    def crawl_html_data(
        self,
        search_data: InfocareSearchResponse,
        sido: str,
        sigungu: str,
        dongli: str,
        main_using_type: str,
        sub_using_type: str,
    ) -> None:

        start_date = search_data.term1
        end_date = search_data.term2
        try:
            self.download_html_data(
                search_data.raw_data,
                sido,
                sigungu,
                dongli,
                main_using_type,
                sub_using_type,
                "statistics",
            )
        except Exception as e:
            self.failure_statistics.statistics_count += 1
            raise e

        self.total_statistics.statistics_count += 1

        # if there is at least one successful bid, download the "more" button page
        if search_data.bids_count > 0:
            try:
                big_page = self.info_care_client.fetch_bid_page(
                    sido,
                    sigungu,
                    dongli,
                    main_using_type,
                    sub_using_type,
                    start_date,
                    end_date,
                    search_data.category,
                )
                self.download_html_data(
                    big_page.raw_data,
                    sido,
                    sigungu,
                    dongli,
                    main_using_type,
                    sub_using_type,
                    "bid",
                )
            except Exception as e:
                self.failure_statistics.bids_count += 1
                raise e

            self.total_statistics.bids_count += 1

    def download_html_data(
        self,
        data: str,
        sido: str,
        sigungu: str,
        dongli: str,
        main_using_type: str,
        sub_using_type: str,
        data_type: str,
    ) -> None:

        file_name = (f"{sido}_"
                     f"{sigungu}_"
                     f"{dongli}_"
                     f"{main_using_type}_"
                     f"{sub_using_type}_"
                     f"{data_type}")

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = str(temp_dir) + "/"
            file_name = f"{file_name}.html"
            file_path = temp_path + file_name
            write_file(file_path, data)

            self.store_detail(
                sido,
                sigungu,
                dongli,
                main_using_type,
                sub_using_type,
                file_path,
                file_name,
                data_type,
            )

    def store_detail(
        self,
        sido: str,
        sigungu: str,
        dongli: str,
        main_using_type: str,
        sub_using_type: str,
        file_path: str,
        file_name: str,
        data_type: str,
    ) -> None:

        folder_name = (f"{self.config['ENVIRONMENT']}/"
                       f"{self.crawling_date.year}/"
                       f"{self.crawling_date.month:02}/"
                       f"{self.crawling_date.day:02}/"
                       f"{str(self.crawling_start_time)}/"
                       f"data/"
                       f"{sido}/"
                       f"{sigungu}/"
                       f"{dongli}/"
                       f"{main_using_type}/"
                       f"{sub_using_type}")

        if data_type == "bid":
            folder_name += f"/{data_type}"

        self.s3_client.upload_any_file(
            folder_name=folder_name,
            file_name=file_name,
            file_path=file_path,
            mime_type="text/html",
            mode="rb",
        )

    def update_crawler_log(self, run_by: str) -> None:
        total_statistics = attr.asdict(self.total_statistics)

        data = {
            "time_stamp": self.crawling_start_time,
            "run_by": run_by,
            "finish_time_stamp": str(timestamp(tznow())),
            "total_statistics": total_statistics,
        }

        folder_name = (f"{self.config['ENVIRONMENT']}/"
                       f"{self.crawling_date.year}/"
                       f"{self.crawling_date.month:02}/"
                       f"{self.crawling_date.day:02}/"
                       f"{str(self.crawling_start_time)}/"
                       f"crawler-log")

        file_name = f"{self.crawling_start_time}.json"

        self.s3_client.upload_json(folder_name=folder_name,
                                   file_name=file_name,
                                   data=data)
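
For reference, a hedged usage sketch for the crawler above. Only the config keys the class reads directly are shown; SlackClient, InfocareClient and S3Client will need additional keys (credentials, bucket, endpoints) that are not visible in this example, and every value below is a placeholder.

config = {
    "ENVIRONMENT": "development",
    "SLACK_CHANNEL": "#crawler-logs",  # placeholder
    "SLACK_API_TOKEN": "xoxb-...",     # placeholder
    "LOGIN_ID": "user",                # placeholder
    "LOGIN_PW": "password",            # placeholder
    # region/type filters are regular expressions matched with re.search
    "SIDO": "서울",
    "SIGUNGU": ".*",
    "DONGLI": ".*",
    "MAIN_USING_TYPE": ".*",
    "SUB_USING_TYPE": ".*",
}

crawler = InfoCareCrawler(config)
crawler.run(run_by="manual")
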
Exemple #12
0
class TaeinStore(object):
    def __init__(self, config: typing.Dict[str, typing.Any]) -> None:
        super().__init__()
        self.config = config
        self.session_factory = create_session_factory(config)
        self.s3_client = S3Client(config)
        self.slack_client = SlackClient(config.get("SLACK_CHANNEL"),
                                        config.get("SLACK_API_TOKEN"))
        self.region_level_1 = self.config["REGION_REGEX_LEVEL_1"]
        self.region_level_2 = self.config["REGION_REGEX_LEVEL_2"]
        self.region_level_3 = self.config["REGION_REGEX_LEVEL_3"]
        self.completed_sido_statistics: typing.Dict[str,
                                                    typing.List[int]] = dict()
        self.completed_gugun_statistics: typing.Dict[
            str, typing.List[int]] = dict()
        self.completed_sido_ids: typing.Dict[str, int] = dict()
        self.completed_gugun_ids: typing.Dict[str, int] = dict()

    def run(self, run_by: str) -> None:
        """
        로컬에서 실행 시 DB에 taein_area_range를 덤프해야 제대로 작동합니다
        """

        self.slack_client.send_info_slack(
            f"Store 시작합니다. ({self.config['ENVIRONMENT']}, {run_by})")

        crawler_log_id = self.config["CRAWLER_LOG_ID"]

        self.check_area_range_valid_or_not()  # check the crawled area_range values

        if crawler_log_id:
            self.fetch_received_log_folder()  # store from the manual log id folder
        else:
            self.fetch_latest_log_folder()  # store from the latest log id folder

        self.slack_client.send_info_slack(
            f"Store 종료합니다. ({self.config['ENVIRONMENT']}, {run_by})")

    def check_area_range_valid_or_not(self) -> None:
        crawler_log = self.fetch_crawler_log()
        area_range_list = crawler_log.area_range_list
        session = self.session_factory()
        try:
            for area_range in area_range_list:
                session.query(TaeinAreaRange).filter(
                    TaeinAreaRange.start_area == area_range.start_area,
                    TaeinAreaRange.end_area == area_range.end_area,
                ).one()
        except Exception:
            raise
        finally:
            session.close()

    def fetch_received_log_folder(self) -> None:
        crawler_log_id = self.config["CRAWLER_LOG_ID"]
        crawler_date = tzfromtimestamp(float(crawler_log_id))
        log_id_prefix = (f"{self.config['ENVIRONMENT']}/"
                         f"{crawler_date.year}/"
                         f"{crawler_date.month}/"
                         f"{crawler_date.day}/"
                         f"{crawler_log_id}/")
        self.fetch_sido_region_folder(log_id_prefix)

    def fetch_latest_log_folder(self) -> None:
        env_prefix = f"{self.config['ENVIRONMENT']}/"
        year_prefix = self.fetch_latest_folder(env_prefix)
        month_prefix = self.fetch_latest_folder(year_prefix)
        day_prefix = self.fetch_latest_folder(month_prefix)
        log_id_prefix = self.fetch_latest_folder(day_prefix)
        self.fetch_sido_region_folder(log_id_prefix)

    def fetch_latest_folder(self, base_prefix: str) -> str:
        date_list: typing.List[str] = list()
        for response in self.s3_client.get_objects(base_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise TaeinStoreS3NotFound("not found date list")
            for date_prefix in prefixes:
                date = (date_prefix["Prefix"].replace(base_prefix,
                                                      "").replace("/",
                                                                  "").strip())
                date_list.append(date)
            date_list.sort()
        base_prefix += date_list[-1] + "/"

        return base_prefix

    def fetch_sido_region_folder(self, log_id_prefix: str) -> None:
        data_prefix = log_id_prefix + "data/"
        sido_check: bool = False
        for response in self.s3_client.get_objects(data_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise TaeinStoreS3NotFound("not found sido region list")
            for sido_prefix in prefixes:
                sido_name = (sido_prefix["Prefix"].replace(
                    data_prefix, "").replace("/", "").strip())
                if re.search(self.region_level_1, sido_name):
                    sido_check = True
                    self.fetch_gugun_region_folder(sido_prefix["Prefix"])

        if not sido_check:
            raise TaeinStoreRegionNotFound(
                f"not found sido({self.region_level_1})")

    def fetch_gugun_region_folder(self, sido_prefix: str) -> None:
        """
        인천 남구와 인천 미추홀구는 같은 지역입니다. 태인경매 홈페이지에서는 해당 두 지역에 대한
        데이터를 모두 가지고 있는데 남구의 경우 데이터가 부정확하며 미추홀구로 검색해야 데이터가
        정확합니다. 따라서 인천 남구는 스킵시켜줍니다.
        """
        gugun_check: bool = False
        for response in self.s3_client.get_objects(sido_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise TaeinStoreS3NotFound("not found gugun region list")
            for gugun_prefix in prefixes:
                gugun_name = (gugun_prefix["Prefix"].replace(
                    sido_prefix, "").replace("/", "").strip())
                if "인천" in sido_prefix and gugun_name == "남구":
                    continue
                if re.search(self.region_level_2, gugun_name):
                    gugun_check = True
                    self.fetch_dong_region_folder(gugun_prefix["Prefix"])

        if not gugun_check:
            raise TaeinStoreRegionNotFound(
                f"not found gugun({self.region_level_2})")

    def fetch_dong_region_folder(self, gugun_prefix: str) -> None:
        dong_check: bool = False
        for response in self.s3_client.get_objects(gugun_prefix,
                                                   Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise TaeinStoreS3NotFound("not found dong region list")
            for dong_prefix in prefixes:
                dong_name = (dong_prefix["Prefix"].replace(
                    gugun_prefix, "").replace("/", "").strip())
                if re.search(self.region_level_3, dong_name):
                    dong_check = True
                    self.fetch_mulgun_kind_folder(dong_prefix["Prefix"])

        if not dong_check:
            raise TaeinStoreRegionNotFound(
                f"not found dong({self.region_level_3})")

    def fetch_mulgun_kind_folder(self, dong_prefix: str) -> None:
        for response in self.s3_client.get_objects(dong_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            if not prefixes:
                raise TaeinStoreS3NotFound("not found main using list")
            for mulgun_kind_prefix in prefixes:
                self.fetch_statistics_folder(mulgun_kind_prefix["Prefix"])

    def fetch_statistics_folder(self, mulgun_kind_prefix: str) -> None:
        for response in self.s3_client.get_objects(mulgun_kind_prefix,
                                                   Delimiter="/"):
            db_dong_id: typing.Optional[int] = None
            contents = response.contents
            prefixes = response.common_prefixes
            if not contents:
                raise TaeinStoreS3NotFound("not found statistics data")
            for content in contents:
                file_prefix = content["Key"]
                s3_response = self.s3_client.get_object(file_prefix)
                statistics_data = s3_response.body.read().decode("utf-8")
                statistics = TaeinStatisticsResponse.from_html(statistics_data)

                sido_name = statistics.sido_name
                gugun_name = statistics.gugun_name
                dong_name = statistics.dong_name

                db_area_range_id = self.get_area_range(
                    statistics.building_start_area,
                    statistics.building_end_area,
                )

                self.completed_sido_statistics.setdefault(sido_name, list())

                if (db_area_range_id
                        not in self.completed_sido_statistics[sido_name]):
                    db_sido_id = self.store_sido_region(sido_name)
                    self.store_statistics_data(statistics,
                                               db_area_range_id,
                                               db_sido_id=db_sido_id)
                    # cache the sido id
                    self.completed_sido_ids.update({sido_name: db_sido_id})
                    # mark sido statistics for this exclusive area range as stored
                    self.completed_sido_statistics[sido_name].append(
                        db_area_range_id)

                self.completed_gugun_statistics.setdefault(
                    sido_name + gugun_name, list())

                if (db_area_range_id
                        not in self.completed_gugun_statistics[sido_name +
                                                               gugun_name]):
                    db_sido_id = self.completed_sido_ids[sido_name]
                    db_gugun_id = self.store_gugun_region(
                        gugun_name, db_sido_id)
                    self.store_statistics_data(statistics,
                                               db_area_range_id,
                                               db_gugun_id=db_gugun_id)
                    # cache the gugun id
                    self.completed_gugun_ids.update(
                        {sido_name + gugun_name: db_gugun_id})
                    # mark gugun statistics for this exclusive area range as stored
                    self.completed_gugun_statistics[
                        sido_name + gugun_name].append(db_area_range_id)

                # store eup/myeon/dong (town-level) statistics
                db_gugun_id = self.completed_gugun_ids[sido_name + gugun_name]
                db_dong_id = self.store_dong_region(dong_name, db_gugun_id)
                self.store_statistics_data(statistics,
                                           db_area_range_id,
                                           db_dong_id=db_dong_id)

            if prefixes:
                for bid_prefix in prefixes:
                    self.fetch_bid_folder(bid_prefix["Prefix"], db_dong_id)

    def fetch_bid_folder(self, data_type_prefix: str, db_dong_id: int) -> None:
        for response in self.s3_client.get_objects(data_type_prefix,
                                                   Delimiter="/"):
            contents = response.contents
            if not contents:
                raise TaeinStoreS3NotFound("not found bid data")
            for content in contents:
                file_prefix = content["Key"]
                s3_response = self.s3_client.get_object(file_prefix)
                bid_data = s3_response.body.read().decode("utf-8")
                bid = TaeinBidResponse.from_html(bid_data)
                self.store_bid_data(bid.taein_bid_list, db_dong_id)

    def fetch_crawler_log(self) -> CrawlerLogResponse:
        log_prefix = self.fetch_crawler_log_path()
        response = self.s3_client.get_object(log_prefix)
        json_log = json.loads(response.body.read())
        return CrawlerLogResponse.from_json(json_log)

    def fetch_crawler_log_path(self) -> str:  # path of the latest crawler log
        env_prefix = f"{self.config['ENVIRONMENT']}/"
        year_list: typing.List[str] = []
        month_list: typing.List[str] = []
        day_list: typing.List[str] = []
        time_stamp_list: typing.List[str] = []
        log_id_list: typing.List[str] = []

        for response in self.s3_client.get_objects(env_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            for year_prefix in prefixes:
                year = (year_prefix["Prefix"].replace(env_prefix,
                                                      "").replace("/",
                                                                  "").strip())
                year_list.append(year)
            year_list.sort()
        year_prefix = env_prefix + year_list[-1] + "/"

        for response in self.s3_client.get_objects(year_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            for month_prefix in prefixes:
                month = (month_prefix["Prefix"].replace(
                    year_prefix, "").replace("/", "").strip())
                month_list.append(month)
            month_list.sort()
        month_prefix = year_prefix + month_list[-1] + "/"

        for response in self.s3_client.get_objects(month_prefix,
                                                   Delimiter="/"):
            prefixes = response.common_prefixes

            for day_prefix in prefixes:
                day = (day_prefix["Prefix"].replace(month_prefix,
                                                    "").replace("/",
                                                                "").strip())
                day_list.append(day)
            day_list.sort()
        day_prefix = month_prefix + day_list[-1] + "/"

        for response in self.s3_client.get_objects(day_prefix, Delimiter="/"):
            prefixes = response.common_prefixes
            for time_stamp_prefix in prefixes:
                time_stamp = (time_stamp_prefix["Prefix"].replace(
                    day_prefix, "").replace("/", "").strip())
                time_stamp_list.append(time_stamp)
            time_stamp_list.sort()

        time_stamp_prefix = day_prefix + time_stamp_list[-1] + "/"

        log_prefix = f"{time_stamp_prefix}crawler-log/"

        for response in self.s3_client.get_objects(log_prefix):
            for content in response.contents:
                log_id = content["Key"].split("/")[-1].replace(".json", "")
                log_id_list.append(log_id)
            log_id_list.sort()

        log_prefix += log_id_list[-1] + ".json"

        return log_prefix

    def get_area_range(self, start_area: str, end_area: str) -> int:
        if start_area == "최소":
            start_area = 0
        if end_area == "최대":
            end_area = 1000

        session = self.session_factory()
        try:
            db_area_range = (session.query(TaeinAreaRange).filter(
                TaeinAreaRange.start_area == start_area,
                TaeinAreaRange.end_area == end_area,
            ).one())
            db_area_range_id = db_area_range.id
            session.commit()
        except Exception:
            raise
        finally:
            session.close()

        return db_area_range_id

    def store_sido_region(self, sido_name: str) -> int:
        session = self.session_factory()
        beautified_name = SIDO_REGION_DICT[sido_name]
        try:
            db_sido = TaeinSido.create_or_update(session, sido_name,
                                                 beautified_name)
            db_sido_id = db_sido.id
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

        return db_sido_id

    def store_gugun_region(self, gugun_name: str, db_sido_id: int) -> int:
        session = self.session_factory()
        try:
            db_gugun = TaeinGugun.create_or_update(session, gugun_name,
                                                   db_sido_id)
            db_gugun_id = db_gugun.id
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

        return db_gugun_id

    def store_dong_region(self, dong_name: str, db_gugun_id: int) -> int:
        session = self.session_factory()
        try:
            db_dong = TaeinDong.create_or_update(session, dong_name,
                                                 db_gugun_id)
            db_dong_id = db_dong.id
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

        return db_dong_id

    def store_statistics_data(
        self,
        data: TaeinStatisticsResponse,
        db_area_range_id: int,
        *,
        db_sido_id: typing.Optional[int] = None,
        db_gugun_id: typing.Optional[int] = None,
        db_dong_id: typing.Optional[int] = None,
    ) -> None:
        session = self.session_factory()
        # Store sido statistics
        if db_sido_id:
            try:
                TaeinStatistics.create_or_update(
                    session,
                    data.start_date_str,
                    data.start_date,
                    data.end_date_str,
                    data.end_date,
                    str(data.sido_year_avg_price_rate),
                    data.sido_year_avg_price_rate,
                    str(data.sido_year_avg_bid_rate),
                    data.sido_year_avg_bid_rate,
                    data.sido_year_bid_count,
                    str(data.sido_six_month_avg_price_rate),
                    data.sido_six_month_avg_price_rate,
                    str(data.sido_six_month_avg_bid_rate),
                    data.sido_six_month_avg_bid_rate,
                    data.sido_six_month_bid_count,
                    db_sido_id,
                    db_gugun_id,
                    db_dong_id,
                    db_area_range_id,
                )
                session.commit()
            except Exception:
                session.rollback()
                raise
            finally:
                session.close()
            logger.info("Store Sido Statistics", sido=data.sido_name)

        # Store gugun statistics
        elif db_gugun_id:
            try:
                TaeinStatistics.create_or_update(
                    session,
                    data.start_date_str,
                    data.start_date,
                    data.end_date_str,
                    data.end_date,
                    str(data.gugun_year_avg_price_rate),
                    data.gugun_year_avg_price_rate,
                    str(data.gugun_year_avg_bid_rate),
                    data.gugun_year_avg_bid_rate,
                    data.gugun_year_bid_count,
                    str(data.gugun_six_month_avg_price_rate),
                    data.gugun_six_month_avg_price_rate,
                    str(data.gugun_six_month_avg_bid_rate),
                    data.gugun_six_month_avg_bid_rate,
                    data.gugun_six_month_bid_count,
                    db_sido_id,
                    db_gugun_id,
                    db_dong_id,
                    db_area_range_id,
                )
                session.commit()
            except Exception:
                session.rollback()
                raise
            finally:
                session.close()
            logger.info(
                "Store Gugun Statistics",
                sido=data.sido_name,
                gugun=data.gugun_name,
            )

        # Store dong statistics
        elif db_dong_id:
            try:
                TaeinStatistics.create_or_update(
                    session,
                    data.start_date_str,
                    data.start_date,
                    data.end_date_str,
                    data.end_date,
                    str(data.dong_year_avg_price_rate),
                    data.dong_year_avg_price_rate,
                    str(data.dong_year_avg_bid_rate),
                    data.dong_year_avg_bid_rate,
                    data.dong_year_bid_count,
                    str(data.dong_six_month_avg_price_rate),
                    data.dong_six_month_avg_price_rate,
                    str(data.dong_six_month_avg_bid_rate),
                    data.dong_six_month_avg_bid_rate,
                    data.dong_six_month_bid_count,
                    db_sido_id,
                    db_gugun_id,
                    db_dong_id,
                    db_area_range_id,
                )
                session.commit()
            except Exception:
                session.rollback()
                raise
            finally:
                session.close()
            logger.info(
                "Store Dong Statistics",
                sido=data.sido_name,
                gugun=data.gugun_name,
                dong=data.dong_name,
            )
        else:
            session.close()
            raise ValueError(
                "one of db_sido_id, db_gugun_id or db_dong_id is required")

    def store_bid_data(self, bid_list: typing.List[TaeinBidData],
                       db_dong_id: int) -> None:
        for bid in bid_list:
            session = self.session_factory()
            try:
                TaeinBid.create_or_update(
                    session,
                    bid.bid_date_str,
                    bid.bid_date,
                    bid.bid_event_number,
                    bid.address,
                    bid.bid_judged_price,
                    bid.bid_success_price,
                    bid.average_bid_rate_str,
                    bid.average_bid_rate,
                    bid.bidder_count,
                    bid.bid_kind,
                    db_dong_id,
                )
                session.commit()
            except Exception:
                session.rollback()
                raise
            finally:
                session.close()
            logger.info("Store Bid statistics", bid=bid.address)