import asyncio
import datetime
import json
import logging
import math
import re
import time
from typing import List, Optional

import requests
from bs4 import BeautifulSoup

# Project-internal dependencies (import paths depend on the repository layout):
# DatabaseManager, ConfigManager, CrawlConfiguration, ProductBase, CategoryBase,
# PRODUCT, CATEGORY, Utils


async def get_products(db: DatabaseManager) -> List[ProductBase]:
    """Load every product document and wrap each row in ProductBase."""
    products: List[ProductBase] = []
    rows = db.find(PRODUCT, {})
    if rows:
        for row in rows:
            products.append(ProductBase(**row))
    return products
async def get_categories(db: DatabaseManager) -> List[CategoryBase]:
    """Load every category document and wrap each row in CategoryBase."""
    categories: List[CategoryBase] = []
    rows = db.find(CATEGORY, {})
    if rows:
        for row in rows:
            categories.append(CategoryBase(**row))
    return categories
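# Usage sketch (illustrative only): how the two readers above might be driven.
# Assumes DatabaseManager() is directly constructible, as ProductCrawl below
# also does; the function name is hypothetical.
async def _example_load_all() -> None:
    db = DatabaseManager()
    products = await get_products(db)
    categories = await get_categories(db)
    logging.info("loaded %d products / %d categories", len(products), len(categories))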
class ProductCrawl:
    """Crawler that collects product information from category product listings.

    Attributes:
        driver
    """

    PRODUCT_COLLECTION = "product"
    CRAWL_CONFIG_COLLECTION = "crawl_config"

    _excepted_data_count = 0

    def __init__(self):
        logging.info('start product crawl')
        # Database manager - all data reads and writes go through here (singleton).
        self.database_manager = DatabaseManager()
        self.crawl_config: CrawlConfiguration = ConfigManager().crawl_config
        # Default start page and page size.
        self._paging_start: int = 1
        self._view_size: int = 80  # self.crawl_config.crawl_count
        # Must be checked first: on re-collection, load config state from the DB.
        # TODO: handle this later.
        # self._check_crawl_configuration()
        self._result: list = []
        self._category: Optional[dict] = None
        self.productInfo_arr = []
        self._current_page: int = 0
        self.last_crawled_date_time = datetime.datetime.now()

    def _upsert_crawl_configuration(self, start_page):
        """Update the crawl config once all analysis is finished."""
        _filter = {}      # match condition
        _config = dict()  # changed fields
        _config['start_page'] = start_page
        self.database_manager.update(self.CRAWL_CONFIG_COLLECTION, _filter, _config)

    def _check_crawl_configuration(self):
        """Load crawl configuration from the DB."""
        _config: dict = self.database_manager.find_one(self.CRAWL_CONFIG_COLLECTION)
        if _config.get('start_page') is not None:
            self._paging_start = _config['start_page']
        if _config.get('crawl_category_list') is not None:
            self.crawl_config.crawl_category = _config['crawl_category_list']

    def _category_getter(self, crawl_category: list) -> list:
        """Look up the category documents to crawl.

        :return: list of category documents
        """
        _categories: list = []
        if crawl_category is None:
            crawl_category = self.crawl_config.crawl_category
        for item in crawl_category:
            query = self.database_manager.keyword_query('paths', item)
            _categories.extend(list(self.database_manager.find('category', query=query)))
        return _categories

    def make_url(self, paging_index: int, frm: str = "NVSHMDL", _filter: str = "") -> str:
        """Build a search URL from the category id, page size, and page number."""
        _url = ("https://search.shopping.naver.com/search/category"
                "?catId={0}&frm={1}{2}&origQuery&pagingIndex={3}&pagingSize={4}"
                "&productSet=model&query&sort=rel&timestamp=&viewType=list")
        _cid = self._category['cid']
        return _url.format(_cid, frm, _filter, paging_index, self._view_size)

    async def parse(self, identifier: str, context: dict, crawl_category: Optional[list] = None):
        """Entry point called from outside to run the parsing."""
        _categories: list = self._category_getter(crawl_category)
        for category in _categories:
            await asyncio.sleep(1)
            jobs = context['jobs']
            job_info = jobs[identifier]
            job_info['status'] = 'in Progress'
            job_info['category'] = category.get('name')
            self._category = category
            # Start the parsing process.
            self._current_page = 0
            # Default page index = 1.
            _url = self.make_url(paging_index=1)
            _total_count, _filter = self._get_base_data(_url)
            # The page window must be narrowed when the result set is too large.
            _is_oversize = _total_count > 8000
            # Compute the number of pages.
            _page_size = Utils.calc_page(_total_count, self._view_size)
            if _is_oversize:
                await self._filter_parse(_filter)
            else:
                await self._execute_parse(_page_size)
            logging.info('>>> end childCategory: ' + self._category.get('name')
                         + ' Pg.' + str(self._current_page))
        # Mark the job finished once every category has been processed.
        context['jobs'][identifier]['status'] = 'done'

    def _make_list(self, _min, _max, _half):
        """Split [min, max] into two halves at the midpoint."""
        return [[_min, _half], [_half, _max]]

    async def _filter_parse_recursive(self, min_value, max_value):
        # Declared async: the original was a plain def but awaited inside.
        _param = "&maxPrice={0}&minPrice={1}".format(str(max_value), str(min_value))
        _url = self.make_url(1, "NVSHPRC", _param)
        _total_count, _filter = self._get_base_data(_url)
        _is_oversize = _total_count > 8000
        _page_size = Utils.calc_page(_total_count, self._view_size)
        if _is_oversize:
            # Still too many results: bisect the price range and recurse.
            half_price = math.ceil((min_value + max_value) / 2)
            _range = self._make_list(min_value, max_value, half_price)
            for value in _range:
                await self._filter_parse_recursive(value[0], value[1])
        else:
            await self._execute_parse(_page_size, _param)

    async def _filter_parse(self, filters: list):
        # Called only once per oversized category; filters may be None.
        for _filter in filters or []:
            _filterAction = _filter.get('filterAction')
            _separator = "-"  # default separator
            if _filterAction is not None:
                _separator = _filterAction.get('separator')
            # Split the price range.
            _value: str = _filter.get('value')
            _min = 0
            _max = 0
            if _value is not None:
                _min, _max = (int(_price) for _price in _value.split(_separator))
            logging.info("Filter Parse >> min {0} / max {1}".format(_min, _max))
            await self._filter_parse_recursive(_min, _max)

    async def _execute_parse(self, page_count, filter_param: str = ""):
        # The original iterated range(1, page_number), shadowing its own
        # parameter and skipping the last page; loop through page_count inclusive.
        for page_number in range(1, page_count + 1):
            try:
                _url = self.make_url(page_number, _filter=filter_param)
                self.parse_data(self._get_product_json(_url))
                logging.info(">>> URL : " + _url)
                logging.info('>>> start parsing: ' + self._category.get('name')
                             + ' Pg.' + str(page_number))
                self._current_page = page_number
            except Exception as e:
                logging.debug(">>> category collect error " + str(self._current_page)
                              + " name: " + self._category.get('name') + " err: " + str(e))

    def _get_product_json(self, url) -> dict:
        """Fetch product information.

        :param url: request URL
        :return: data_dict with the product information
        """
        # TODO: more request headers may be needed.
        try:
            _headers = {'Content-Type': 'application/json;'}
            # The original passed _headers positionally (where requests expects
            # query params); it must go through the headers keyword.
            req = requests.get(url, headers=_headers)
            html = req.text
            soup = BeautifulSoup(html, 'html.parser')
            json_data = soup.find('script', string=re.compile('application/json'))
            data_dict = json.loads(str(json_data.contents[0]))
        except Exception as e:
            # The sleep duration may need tuning - 8 seconds can be too short.
            time.sleep(8)
            # An abnormal request was detected - request the URL again
            # (the original discarded the retry's return value).
            logging.error("data not found, request error >> {0} | URL >> {1}".format(e, url))
            return self._get_product_json(url)
        return data_dict

    def parse_data(self, data_dict):
        """Parse the fetched data."""
        product_info: dict = self._get_data(data_dict, 'products')
        if product_info is not None:
            # Some data was collected.
            product_list: list = product_info.get('list')
            self._excepted_data_count = 0
            logging.info("collection started - product count: " + str(len(product_list)))
            if len(product_list) > 0:
                for product in product_list:
                    product_data = dict()
                    product_item = product.get('item')
                    if product_item.get('adId') is None:
                        # Collect only non-advertisement items.
                        self._set_category_info(product_data)               # category fields
                        self._set_product_info(product_data, product_item)  # product fields
                        self._insert_product_info(product_data)
                    else:
                        self._excepted_data_count += 1
            else:
                logging.error('!!! Exception: no product information found.')
            # TODO: sanity check - collected plus excluded counts should add up
            # to len(product_list); log both counts when they do not.
        else:
            logging.error('!!! Exception: no data was collected.')
    def _set_category_info(self, product_data: dict):
        """Attach the current category's fields to the product document.

        :param product_data: product document being built
        """
        product_data['n_cid'] = self._category.get('cid')
        product_data['cid'] = self._category.get('_id')
        product_data['paths'] = self._category.get('paths')
        product_data['cname'] = self._category.get('name')

    def _set_product_info(self, product_data: dict, product_item):
        product_data['n_id'] = product_item.get('id')
        product_data['imageUrl'] = product_item.get('imageUrl')
        product_data['title'] = product_item.get('productTitle')
        product_data['price'] = product_item.get('price')
        _attribute: str = product_item.get('attributeValue', "")
        _attribute_value: str = product_item.get('characterValue', "")
        if (_attribute != "") and (_attribute_value != ""):
            # Option information is present: zip option keys with option values.
            product_option_key: list = _attribute.split('|')
            product_option_value: list = _attribute_value.split('|')
            product_data['option'] = dict(zip(product_option_key, product_option_value))

    def _insert_product_info(self, value: dict):
        """Upsert one product document into the DB."""
        try:
            # TODO: where should the value comparison happen?
            _selection = self.database_manager.find_query("n_id", value.get("n_id"))
            self.database_manager.update(self.PRODUCT_COLLECTION, _selection, value)
        except Exception as e:
            logging.error('!!! Fail: insert data into DB: %s', e)

    def _get_base_data(self, url):
        _data = self._get_product_json(url)
        _total_count = 0
        value_filters: Optional[dict] = None
        if _data is not None:
            products = self._get_data(_data, 'products')
            if products is not None:
                _total_count = products.get('total')
                _total_count = int(_total_count) if _total_count is not None else 0
            filters = self._get_data(_data, 'mainFilters')
            if filters is not None:
                value_filters = self._get_filter(filters)
        return _total_count, value_filters

    def _get_data(self, data: dict, _type: str):
        # Guard against a None payload from a failed fetch.
        return (data or {}).get('props', {}).get('pageProps', {}).get('initialState', {}).get(_type)

    def _get_filter(self, main_filters: dict) -> dict:
        value_filters = None
        for _filter in main_filters:
            _filterType: str = _filter.get('filterType')
            if _filterType == 'price':
                value_filters = _filter.get('filterValues')
                if value_filters is not None:
                    break
        return value_filters
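# Minimal driver sketch (illustrative, not part of the crawler): shows how
# parse() might be invoked. The shape of the 'jobs' context mirrors what
# parse() reads and writes; the identifier value and function name here are
# assumptions, not from the source.
async def _example_run_crawl() -> None:
    crawler = ProductCrawl()
    identifier = "job-1"  # hypothetical job id
    context = {'jobs': {identifier: {}}}  # parse() records status/category here
    await crawler.parse(identifier, context, crawl_category=None)

# asyncio.run(_example_run_crawl())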