class CategoryCrawl(object):
    """Crawl Naver Shopping category pages over HTTP and sync them to MongoDB.

    ``parse`` requests eleven consecutive root category pages starting at
    ``CATEGORY_ID``, walks the nested ``ul``/``li`` markup and stores one
    document per category (``cid``, ``name``, ``paths``) in ``COLLECTION``.
    Documents loaded at construction time that are not seen again during the
    crawl are deleted at the end of ``parse``.

    NOTE(review): this file also contains a Selenium-based class with the
    same name; if both live in one module the later definition replaces this
    one at import time -- confirm which variant is meant to be used.
    """

    CATEGORY_ID = 50000000  # Default value
    COLLECTION = 'category'
    _DELIMITER = 'category?catId='
    _PATH_TOKEN = '#'

    def __init__(self):
        # Crawl settings (singleton).
        self.crawl_config: CrawlConfiguration = ConfigManager().crawl_config
        # Database manager (singleton): every read and write goes through it.
        self.database_manager = DatabaseManager()
        # Snapshot of the stored documents, used for duplicate detection.
        # Entries are removed as the crawl meets them again, so whatever is
        # left after the crawl is treated as stale.
        self._category_list: list = list(
            self.database_manager.find_all_mongo(self.COLLECTION))
        # The configured start id overrides the class-level default.
        self.CATEGORY_ID = self.crawl_config.category_id

    def _update(self, cid, name, paths: str):
        """Overwrite ``name`` and ``paths`` of the document whose cid is *cid*.

        Returns whatever ``database_manager.update`` returns.
        """
        _query = self.database_manager.find_query('cid', cid)
        _update_data = {
            'name': name,
            'paths': paths,
            'update_time': datetime.now(),
        }
        # MongoDB update operators start with "$"; the former "&set" key was
        # not an operator.
        return self.database_manager.update(
            self.COLLECTION, _query, {"$set": _update_data})

    def _insert(self, cid, name, paths: str, is_root: bool = False):
        """Insert a category, or reconcile it with an already stored one.

        Root categories are matched against the snapshot by ``name``; all
        others by ``cid``. A matched entry is removed from the snapshot (it
        is still in use) and, for non-root entries whose name or paths
        changed, the stored document is updated. Only unmatched categories
        are inserted.

        Returns the result of ``insert_one_mongo`` for a new category and
        ``None`` when an existing entry was matched.
        """
        for item in self._category_list:
            _name, _cid, _paths = item['name'], item['cid'], item['paths']
            if is_root:
                if not eq(_name, name):
                    continue
            else:
                if not eq(_cid, cid):
                    continue
                if not (eq(_name, name) and eq(_paths, paths)):
                    # Same cid but different data: refresh the document.
                    self._update(cid, name, paths)
            # Safe although we are iterating: we return right after.
            self._category_list.remove(item)
            return None

        _category_document = {
            'cid': cid,
            'name': name,
            'paths': paths,
            'insert_time': datetime.now(),
        }
        return self.database_manager.insert_one_mongo(
            self.COLLECTION, _category_document)

    def _is_exists(self, field, value: str):
        """Return True when a stored document has *field* equal to *value*."""
        _query = self.database_manager.find_query(field, value)
        return self.database_manager.count_document(
            self.COLLECTION, _query) > 0

    def _parse_category(self, element: 'HtmlElement', root_paths: str):
        """Recursively store the categories listed under *element*.

        Looks at the first ``ul`` child of *element*; every ``li`` with an
        anchor becomes one category whose path is *root_paths* joined with
        the anchor text. Nested ``div`` and ``ul`` children of that ``li``
        are descended into with the new path as their root.
        """
        ul_tag = element.find('ul')
        if ul_tag is None:
            return
        for li in ul_tag.findall('li'):
            li_a_tag = li.find('a')
            if li_a_tag is None:
                # No anchor means no name/cid, hence no path to recurse with.
                continue
            _name = li_a_tag.text
            _cid = Utils.separate_right(li_a_tag.get('href'), self._DELIMITER)
            _paths = Utils.join_path(self._PATH_TOKEN, root_paths, _name)
            self._insert(_cid, _name, _paths)

            div_tag = li.find('div')
            if div_tag is not None:
                self._parse_category(div_tag, _paths)
            if li.find('ul') is not None:
                self._parse_category(li, _paths)

    def parse(self):
        """Crawl the root category pages and prune stale stored categories.

        A non-200 response aborts the whole run before the pruning step, so
        a failed download does not lead to deletions. Errors while parsing a
        page are logged and the crawl moves on to the next category id.
        """
        _url = 'https://search.shopping.naver.com/category/category/{0}'
        header_xpath = '//*[@id="__next"]/div/div[2]/h2'
        xpath = '//*[@id="__next"]/div/div[2]/div/div'

        for category_id in range(self.CATEGORY_ID, self.CATEGORY_ID + 11):
            logging.info("PID >> %s | CategoryID >> %d ",
                         os.getpid(), category_id)
            request = requests.get(_url.format(category_id))
            Utils.take_a_sleep(0, 1)
            # Status check.
            if request.status_code != 200:
                return
            try:
                tree = html.fromstring(request.content)
                _root_name = tree.xpath(header_xpath)[0].text
                self._insert(str(category_id), _root_name, None, True)

                for element in tree.xpath(xpath):
                    if element.find('div') is None:
                        logging.info('Element is not Exists')
                        continue
                    a_tag = element[0].find('h3').find('a')
                    _name = a_tag.find('strong').text
                    _cid = Utils.separate_right(
                        a_tag.get('href'), self._DELIMITER)
                    _paths = Utils.join_path(
                        self._PATH_TOKEN, _root_name, _name)
                    self._insert(_cid, _name, _paths)
                    self._parse_category(element[0], _paths)
            except Exception as e:
                logging.error(str(e))

        # Remove category documents that are no longer needed: anything
        # still in the snapshot was not seen during this crawl.
        for item in self._category_list:
            _query = self.database_manager.find_query('_id', item['_id'])
            self.database_manager.delete_one(self.COLLECTION, _query)

    def run(self):
        """Placeholder entry point; intentionally does nothing."""
        pass
class CategoryCrawl(object):
    """Crawl the Naver Shopping category menu with Selenium into MongoDB.

    ``parse`` opens ``URL``, expands every root category of the menu and
    stores one document per category (``cid``, ``name``, ``paths``) in
    ``COLLECTION``. Documents loaded at construction time that are not seen
    again during the crawl are deleted afterwards.

    NOTE(review): this file also contains a requests/lxml-based class with
    the same name; if both live in one module this later definition is the
    one that stays bound -- confirm which variant is meant to be used.
    """

    # NOTE(review): URL still carries a "{0}" placeholder that is never
    # formatted, and CATEGORY is not referenced anywhere in this class --
    # confirm the intended start page before changing either.
    URL = 'https://search.shopping.naver.com/category/category/{0}'
    CATEGORY = 50000000
    DELIMITER = 'cat_id='
    COLLECTION = 'category'
    # Upper bound on re-click attempts while a root category panel is
    # still missing from the page.
    _MAX_CLICK_ATTEMPTS = 5

    def __init__(self):
        # Chrome Selenium driver (singleton).
        self.driver = Selenium().driver
        # Crawl settings (singleton).
        self.crawl_config: CrawlConfiguration = ConfigManager().crawl_config
        # Database manager (singleton): every read and write goes through it.
        self.database_manager = DatabaseManager()
        # Snapshot of the stored documents, used for duplicate detection.
        # Entries are removed as the crawl meets them again, so whatever is
        # left after the crawl is treated as stale.
        self._category_list: list = list(
            self.database_manager.find_all_mongo(self.COLLECTION))

    def _update(self, cid, name, paths: str):
        """Overwrite ``name`` and ``paths`` of the document whose cid is *cid*.

        Returns whatever ``database_manager.update`` returns.
        """
        _query = self.database_manager.find_query('cid', cid)
        _update_data = {
            'name': name,
            'paths': paths,
            'update_time': datetime.now(),
        }
        # MongoDB update operators start with "$"; the former "&set" key was
        # not an operator.
        return self.database_manager.update(
            self.COLLECTION, _query, {"$set": _update_data})

    def _insert(self, cid, name, paths: str, is_root: bool = False):
        """Insert a category, or reconcile it with an already stored one.

        Root categories are matched against the snapshot by ``name``; all
        others by ``cid``. A matched entry is removed from the snapshot (it
        is still in use) and, for non-root entries whose name or paths
        changed, the stored document is updated. Only unmatched categories
        are inserted.

        Returns the result of ``insert_one_mongo`` for a new category and
        ``None`` when an existing entry was matched.
        """
        for item in self._category_list:
            _name, _cid, _paths = item['name'], item['cid'], item['paths']
            if is_root:
                if not eq(_name, name):
                    continue
            else:
                if not eq(_cid, cid):
                    continue
                if not (eq(_name, name) and eq(_paths, paths)):
                    # Same cid but different data: refresh the document.
                    self._update(cid, name, paths)
            # Safe although we are iterating: we return right after.
            self._category_list.remove(item)
            return None

        _category_document = {
            'cid': cid,
            'name': name,
            'paths': paths,
            'insert_time': datetime.now(),
        }
        return self.database_manager.insert_one_mongo(
            self.COLLECTION, _category_document)

    def _is_exists(self, field, value: str):
        """Return True when a stored document has *field* equal to *value*."""
        _query = self.database_manager.find_query(field, value)
        return self.database_manager.count_document(
            self.COLLECTION, _query) > 0

    def parse(self):
        """Crawl every root category and prune stale stored categories.

        Any exception raised while crawling is logged and ends the run;
        because the pruning step sits inside the same ``try``, nothing is
        deleted after a failed crawl.
        """
        self.driver.get(self.URL)
        try:
            root_items = self.driver.find_elements(
                By.XPATH, '//*[@id="home_category_area"]/div[1]/ul/li')
            for category in root_items:
                time.sleep(1)
                self._parse_root(category)
            # Remove category documents that are no longer needed: anything
            # still in the snapshot was not seen during this crawl.
            for item in self._category_list:
                _query = self.database_manager.find_query('_id', item['_id'])
                self.database_manager.delete_one(self.COLLECTION, _query)
        except Exception as e:
            logging.error(str(e))

    def _parse_root(self, category: 'WebElement'):
        """Open one root category panel and store everything below it.

        Roots whose text equals an entry of ``crawl_config.exclude_category``
        are skipped. The panel is opened by sending ENTER to its menu item;
        if the panel is not present afterwards the key press is repeated up
        to ``_MAX_CLICK_ATTEMPTS`` times, and a final plain lookup raises the
        driver's own "not found" error when it never appears.
        """
        # Root name.
        root_name: str = category.text
        logging.info('rootName : ' + root_name)
        if any(eq(root_name, excluded)
               for excluded in self.crawl_config.exclude_category):
            return None

        class_att = category.get_attribute('class')
        click_xpath = '//*[@id="home_{0}"]'.format(class_att)
        self.driver.implicitly_wait(5)
        # Try a first click right away.
        self.driver.find_element(By.XPATH, click_xpath).send_keys(Keys.ENTER)
        # Inner XPath derived from the class attribute.
        time.sleep(1)
        xpath_cate = '//*[@id="home_{0}_inner"]/div[1]'.format(class_att)

        # Root category panel. Keep clicking while the click event does not
        # seem to have registered; find_elements returns an empty list
        # instead of raising, which is what makes retrying possible.
        element = None
        for _ in range(self._MAX_CLICK_ATTEMPTS):
            self.driver.find_element(
                By.XPATH, click_xpath).send_keys(Keys.ENTER)
            self.driver.implicitly_wait(4)
            time.sleep(1)
            found = self.driver.find_elements(By.XPATH, xpath_cate)
            if found:
                element = found[0]
                break
        if element is None:
            # Out of attempts: let the driver raise its usual lookup error.
            element = self.driver.find_element(By.XPATH, xpath_cate)

        self._insert(None, root_name, None, True)
        # Root -> sub.
        co_col_elements = element.find_elements(By.CLASS_NAME, 'co_col')
        self._parse_co_col(co_col_elements, root_name)

    def _parse_co_cel(self, co_cel_elements, root_name):
        """Store each ``co_cel`` element as a category under *root_name*,
        then store its ``li`` descendants as children of that category."""
        for co_cel in co_cel_elements:
            # href of the first anchor carries the category id.
            sub_href = co_cel.find_element(
                By.TAG_NAME, 'a').get_attribute('href')
            _cid = Utils.separate_right(sub_href, self.DELIMITER)
            # The display name sits in the anchor inside <strong>; the
            # "전체보기" label is stripped from it.
            sub_element = co_cel.find_element(By.TAG_NAME, 'strong')
            _name = sub_element.find_element(By.TAG_NAME, 'a').text
            _name = re.sub("전체보기", "", _name)
            _paths = Utils.join_path(token='#', source=root_name, value=_name)
            self._insert(_cid, _name, _paths)
            # Child category list.
            child_items = co_cel.find_elements(By.TAG_NAME, 'li')
            self._parse_child(child_items, _paths)

    def _parse_co_col(self, sub_category, root_name):
        """Process every ``co_cel`` element found in each given column."""
        for co_col in sub_category:
            time.sleep(1)
            # Mid-level categories.
            co_cel_elements = co_col.find_elements(By.CLASS_NAME, 'co_cel')
            self._parse_co_cel(co_cel_elements, root_name)

    def _parse_child(self, child_items, sub_paths):
        """Store each item of *child_items* as a category below *sub_paths*."""
        for child_item in child_items:
            time.sleep(1)
            _href = child_item.find_element(
                By.TAG_NAME, 'a').get_attribute('href')
            _cid = Utils.separate_right(_href, self.DELIMITER)
            _name = child_item.text
            _paths = Utils.join_path(token='#', source=sub_paths, value=_name)
            self._insert(_cid, _name, _paths)