Example 1
class BaseCrawler(metaclass=ABCMeta):
    """
    만약 posts_soup 와 soup(post_soup) 를 분리해서 사용하고 싶다면
    set_posts_soup 에서 self.soup = ... -> self.posts_soup = ...
    이후 self.posts_soup 로 get_posts에서 독립적인 사용이 가능하다.
    """
    def __init__(
        self,
        table: NamedTuple,
        brand_name: str,
        main_url: str,
        base_page_url: str = "",
        model_date_format: str = "%Y-%m-%d",
        encoding: Optional[str] = None,
        custom_config: Optional[Config] = None,
    ):
        self.brand_name = brand_name
        if custom_config is None:
            self.config = Config()
        else:
            self.config = custom_config
        self.init_logger()
        self.base_page_url = base_page_url
        self.curr_ctgr_url = None
        self.curr_post_url = None
        self.table = table
        if self.config.is_test is True:
            self.table_name = 'test.' + self.table.__name__
        else:
            self.table_name = 'public.' + self.table.__name__
        self.conn = DB(table=table, table_name=self.table_name)
        self.encoding = encoding
        self.session = RetrySession(encoding=encoding)
        self.main_url = main_url
        self.model_date_format = model_date_format
        self.soup = None
        self.posts_soup = None
        self.metas = {}

        if not self.config.is_valid():
            raise ValueError(f"Invalid Config:\n{self.config!r}")

    def __getstate__(self):
        d = self.__dict__.copy()
        if "logger" in d:
            d["logger"] = d["logger"].name
        return d

    def __setstate__(self, d):
        if "logger" in d:
            d["logger"] = getLogger(d["logger"])
        self.__dict__.update(d)

    # add_query must filter on media_name: news_id is only unique together with the brand.
    def get_exist_ids(self):
        rows = self.conn.select(
            fields=["news_id"],
            table=self.table_name,
            add_query=f" WHERE media_name='{self.brand_name}'",
        )
        return set(map(lambda r: int(r[0]), rows))

    # ======================== FLOW =============================
    def go(self):
        self.logger.info(
            f"{self.brand_name} Crawler Start at {datetime.now()}")
        self.page_parse()

    def page_parse(self) -> None:
        datas = []
        stop_crawling = False
        for page in self.gen_pages():
            if stop_crawling:
                break

            self.curr_ctgr_url = page
            self.set_posts_soup(page)
            posts = self.get_posts()
            self.logger.debug(
                f"Num of Crawled Posts: {len(posts)} in Page: {page} ")
            for url in posts:
                self.curr_post_url = url
                try:
                    data: NamedTuple = self.detail_parse()
                except Exception as e:
                    self.logger.error(
                        f"Error During Detail Parse \nURL: {url} \nDetail: {e}"
                    )
                    continue

                # Stop without saving once the crawled item's date is older than the limit (end) date.
                if self.get_date() < self.config.end_date:
                    stop_crawling = True
                    break

                datas.append(tuple(data))

        self.save(datas=datas)

    @retry(8, 3)
    def set_soup(self, url: str) -> None:
        res = self.session.get(url)
        try:
            # RetrySession may already return decoded text/bytes.
            self.soup = BeautifulSoup(res, "html.parser")
        except TypeError:
            # Fall back to the response body when a Response object was returned.
            self.soup = BeautifulSoup(res.text, "html.parser")

    @retry(5, 3)
    def set_posts_soup(self, url: str) -> None:
        self.set_soup(url)

    def set_metas(self) -> None:
        metas = {}
        for m in self.soup.find_all("meta"):
            if not isinstance(m, Tag):
                continue
            k = m.get("property", m.get("name", m.get("http-equiv", None)))
            v = m.get("content", None)
            if k is not None and v is not None:
                metas[k] = v
        self.metas = metas

    @staticmethod
    def get_id(url: str) -> str:
        return re.findall(r"\d+", url)[-1]

    def data_cleaning(self, datas: List[Tuple]) -> List[Tuple]:
        result = []
        id_idx = 1
        unique_ids = set(map(lambda d: d[id_idx], datas))
        exist_ids = self.get_exist_ids()
        new_ids = unique_ids.difference(exist_ids)

        for d in datas:
            new_id = d[id_idx]
            if new_id in new_ids:
                result.append(d)
                new_ids.remove(new_id)
        return result

    def save(self, datas: List[Tuple]):
        self.logger.info(
            f"{self.brand_name} Crawler Finished at {datetime.now()}")
        self.logger.debug(f"Length Of Datas Before Cleaning: {len(datas)}")
        clean_datas = self.data_cleaning(datas)
        self.logger.debug(
            f"Length Of Datas After Cleaning: {len(clean_datas)}")

        if self.config.is_test is True:
            self.save_datas_as_pickle(clean_datas)
            assert self.table_name == f'test.{self.table.__name__}'
        self.conn.insert_magazine(clean_datas)
        self.logger.debug(f"Insert To Data to DB \n {self.conn.__dict__}")

    def detail_parse(self):
        post_url = self.curr_post_url
        self.set_soup(post_url)
        self.set_metas()
        return self.table(
            self.brand_name,
            int(self.get_id(post_url)),
            self.get_date(),
            self.get_title(),
            self.get_content(),
            self.curr_post_url,
            self.get_keywords(),
            self.get_post_type(),
        )

    def gen_pages(self) -> Generator:
        page = 0
        while True:
            page += 1
            yield self.base_page_url + str(page)

    @abstractmethod
    def get_date(self) -> date:
        pass

    @abstractmethod
    def get_posts(self) -> List[str]:
        pass

    @abstractmethod
    def get_title(self) -> str:
        pass

    @abstractmethod
    def get_content(self) -> str:
        pass

    @abstractmethod
    def get_post_type(self) -> Optional[str]:
        pass

    @abstractmethod
    def get_keywords(self) -> str:
        pass

    # ======================== FLOW END =============================
    # ======================== Utils[Optional] ======================

    def save_datas_as_pickle(self, datas):
        pkl_save_path = Path(self.config.TESTFILE_SAVE_PATH)
        pkl_save_path.mkdir(parents=True, exist_ok=True)
        with open(pkl_save_path / f"{self.brand_name}.pkl", "wb") as f:
            pickle.dump(datas, f)

    @staticmethod
    def clean_date_txt(txt):
        return txt.lower()

    @staticmethod
    def get_clean_txt(txt):
        return "".join(txt.split())

    @check_return(date)
    def extract_time(self, text: str) -> Optional[date]:
        clean_txt = self.clean_date_txt(text)
        date_time = next(datefinder.find_dates(clean_txt), None)
        if date_time is None:
            return None

        return date_time.date()

    def furbish_link(self, link: str, prefix=None) -> str:
        if link.startswith("//"):
            link = "http:" + link

        if prefix is not None and isinstance(prefix, str):
            link = prefix + link

        if "http" not in link or link.startswith("/"):
            link = urljoin(self.main_url, link)

        return link

    def get_links(self, attrs) -> Dict[str, str]:
        # attrs: a dict, or any object with a get() method
        suspectors = ["href", "src", "ec-data-src"]
        links = defaultdict(str)
        for s in suspectors:
            link = attrs.get(s, None)
            if link is not None and isinstance(link, str):
                links[s] = self.furbish_link(link)
        return links

    @staticmethod
    def attrs_to_text(attrs) -> str:
        # attrs: a dict, or any object with a values() method
        all_hints = []
        for v in attrs.values():
            if isinstance(v, list):
                all_hints.append(" ".join(v))
            else:
                all_hints.append(v)
        return " ".join(all_hints)

    def init_logger(self):
        if self.config is None or self.brand_name is None:
            raise RuntimeError(
                "init_logger must be called after config and brand_name are initialized")

        l_path = self.config.LOG_SAVE_PATH
        if not os.path.exists(l_path):
            os.makedirs(l_path, exist_ok=True)

        self.logger = named_logger(f"{opjoin(l_path, self.brand_name)}.log",
                                   "crawl_logger")