Code Example #1
    def __init__(self):
        # Crawl configuration manager - singleton
        self.crawl_config: CrawlConfiguration = ConfigManager().crawl_config
        # Database manager - data reads and writes happen here - singleton
        self.database_manager = DatabaseManager()

        # Pre-load the stored results so duplicate data can be detected.
        self._category_list: list = list(self.database_manager.find_all_mongo(self.COLLECTION))
        self.CATEGORY_ID = self.crawl_config.category_id
Code Example #2
    def _update(self, cid, name, paths: str):
        # Database manager - data reads and writes happen here - singleton
        self.database_manager = DatabaseManager()

        _query = self.database_manager.find_query('cid', cid)

        _update_data = dict()
        _update_data['name'] = name
        _update_data['paths'] = paths
        _update_data['update_time'] = datetime.now()

        return self.database_manager.update(self.COLLECTION, _query, {"$set": _update_data})
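
DatabaseManager.update is not shown in these examples. Assuming it wraps pymongo's update_one, the call above corresponds roughly to this sketch (the database name, collection name, and field values here are hypothetical):

from datetime import datetime
from pymongo import MongoClient

client = MongoClient()                    # assumes a local MongoDB
collection = client['crawl']['category']  # hypothetical db / collection names

query = {'cid': '50000234'}               # equivalent of find_query('cid', cid)
update_data = {'name': 'office', 'paths': 'root#office',
               'update_time': datetime.now()}

# "$set" updates only the listed fields without replacing the whole document.
collection.update_one(query, {'$set': update_data})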
Code Example #3
 def test_get_categories_config(self):
     '''Fetches the list of categories to crawl, as declared in the configuration.'''
     _categories: list = []
     try:
         crawl_config: CrawlConfiguration = ConfigManager().crawl_config
         database_manager: DatabaseManager = DatabaseManager()
     except Exception as e:
         return False
Code Example #4
async def get_products(db: DatabaseManager) -> List[ProductBase]:
    categories: List[ProductBase] = []
    rows = db.find(PRODUCT, {})

    if rows:
        for row in rows:
            categories.append(ProductBase(**row))

    return categories
Code Example #5
async def get_categories(db: DatabaseManager) -> List[CategoryBase]:
    categories: List[CategoryBase] = []
    rows = db.find(CATEGORY, {})

    if rows:
        for row in rows:
            categories.append(CategoryBase(**row))

    return categories
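
ProductBase and CategoryBase are not shown here; the CategoryBase(**row) pattern suggests Pydantic models built from Mongo documents. A minimal sketch, with field names assumed from the crawler code below (cid, name, paths) rather than taken from the actual models:

from typing import Optional
from pydantic import BaseModel

class CategoryBase(BaseModel):
    cid: str
    name: str
    paths: Optional[str] = None

row = {'cid': '50000234', 'name': 'office', 'paths': 'root#office'}
category = CategoryBase(**row)  # each Mongo row unpacks into one model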
Code Example #6
File: productcrawl.py | Project: PBBOY/productCrawling
    def __init__(self):
        logging.info('start product crawl')
        # Database manager - data reads and writes happen here - singleton
        self.database_manager = DatabaseManager()
        self.crawl_config: CrawlConfiguration = ConfigManager().crawl_config
        # Default start page
        self._paging_start: int = 1
        self._view_size: int = 80  # self.crawl_config.crawl_count


        # Check this first: on re-crawl, load config values from the DB.
        # TODO: handle this later.
        # self._check_crawl_configuration()

        self._result: list = []

        self._category: Optional[dict] = None

        self.productInfo_arr = []
        self._current_page: int = 0
        self.last_crawled_date_time = datetime.datetime.now()
Code Example #7
    def test_get_categories(self):
        '''
        Fetch category information from the DB
        :return:
        '''
        crawl_category = ['사무용품']
        database_manager: DatabaseManager = DatabaseManager()
        _categories: list = []

        for item in crawl_category:
            query = database_manager.keyword_query('paths', item)
            _categories.extend(list(database_manager.find('category', query=query)))
        if not _categories:
            return False
Code Example #8
def main():
    logger = make_logger()

    logger.info('Crawl Test')

    # driver = Selenium().driver

    ConfigManager()
    db = DatabaseManager()

    # Category parsing (commented out)
    # CategoryCrawl().parse()

    ProductCrawl().parse()

    logger.info('Crawl Test End')
Code Example #9
def main():
    logger = make_logger()

    logger.info('Crawl Test')

    # driver = Selenium().driver

    ConfigManager()
    DatabaseManager()

    # pickle.dumps(DatabaseManager())
    # Category parsing
    CategoryCrawl().run()

    # ProductCrawl()

    logger.info('Crawl Test End')
Code Example #10
File: productcrawl.py | Project: PBBOY/productCrawling
class ProductCrawl:
    '''
    Crawler that collects product information from product listing pages.
    Attributes:
        driver
    '''
    PRODUCT_COLLECTION = "product"
    CRAWL_CONFIG_COLLECTION = "crawl_config"

    _excepted_data_count = 0

    def __init__(self):
        logging.info('start product crawl')
        # Database manager - data reads and writes happen here - singleton
        self.database_manager = DatabaseManager()
        self.crawl_config: CrawlConfiguration = ConfigManager().crawl_config
        # Default start page
        self._paging_start: int = 1
        self._view_size: int = 80  # self.crawl_config.crawl_count


        # Check this first: on re-crawl, load config values from the DB.
        # TODO: handle this later.
        # self._check_crawl_configuration()

        self._result: list = []

        self._category: Optional[dict] = None

        self.productInfo_arr = []
        self._current_page: int = 0
        self.last_crawled_date_time = datetime.datetime.now()

    def _upsert_crawl_configuration(self, start_page):
        """모든 분석이 끝나고 Config 정보 update"""
        # Match condition
        _filter = {}
        # Data to update
        _config = dict()
        _config['start_page'] = start_page

        self.database_manager.update(self.CRAWL_CONFIG_COLLECTION, _filter, _config)

    def _check_crawl_configuration(self):
        """Config 정보 set"""
        _config: dict = self.database_manager.find_one(self.CRAWL_CONFIG_COLLECTION)

        if _config.get('start_page') is not None:
            self._paging_start = _config['start_page']

        if _config.get('crawl_category_list') is not None:
            self.crawl_config.crawl_category = _config['crawl_category_list']

    def _category_getter(self, crawl_category: list) -> list:
        """ 카테고리 목록 조회해서 분석
        :return category 목록들"""
        _categories: list = []
        if crawl_category is None:
            crawl_category = self.crawl_config.crawl_category

        for item in crawl_category:
            query = self.database_manager.keyword_query('paths', item)
            _categories.extend(list(self.database_manager.find('category', query=query)))

        return _categories

    def make_url(self, paging_index: int, frm: str = "NVSHMDL", _filter: str = "") -> str:
        """category id, 페이지 사이즈, 페이지 넘버를 조합하여 url 생성"""
        _url = ("https://search.shopping.naver.com/search/category?catId={0}&frm={1}{2}&origQuery&pagingIndex={3}&pagingSize={4}&productSet=model&query&sort=rel&timestamp=&viewType=list")
        _cid = self._category['cid']
        return _url.format(_cid, frm, _filter, paging_index, self._view_size)
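
    # Illustrative output of make_url (assuming cid 50000000, the default
    # frm, no filter, page 1, and view size 80):
    # https://search.shopping.naver.com/search/category?catId=50000000&frm=NVSHMDL&origQuery&pagingIndex=1&pagingSize=80&productSet=model&query&sort=rel&timestamp=&viewType=list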

    async def parse(self, identifier: str, context: dict, crawl_category: list = None):
        """ 외부에서 파싱을 하기 위해 호출하는 함수 """

        _categories: list = self._category_getter(crawl_category)

        for category in _categories:
            await asyncio.sleep(1)

            jobs = context['jobs']

            job_info = jobs[identifier]
            job_info['status'] = 'in Progress'
            job_info['category'] = category.get('name')

            self._category = category
            """파싱 프로세스 시작"""
            self._current_page = 0
            # Default = 1
            _url = self.make_url(paging_index=1)
            _total_count, _filter = self._get_base_data(_url)

            # TODO: the page-count threshold may need adjusting.
            _is_oversize = _total_count > 8000
            # Compute the number of pages.
            _page_size = Utils.calc_page(_total_count, self._view_size)

            if _is_oversize:
                await self._filter_parse(_filter)

            else:
                await self._execute_parse(_page_size)

            logging.info('>>> end childCategory: ' + self._category.get('name') + ' Pg.' + str(self._current_page))

        job_info['status'] = 'done'

    def _make_list(self, _min, _max, _half):
        result = []
        a = [_min, _half]
        b = [_half, _max]
        result.append(a)
        result.append(b)

        return result
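
    # Illustrative example: _make_list(0, 100000, 50000) returns
    # [[0, 50000], [50000, 100000]] - the two half price ranges that
    # _filter_parse_recursive below bisects into when a range still
    # matches more than 8000 products.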

    async def _filter_parse_recursive(self, min_value, max_value):
        # Declared async so _execute_parse can be awaited below.
        _param = "&maxPrice={0}&minPrice={1}".format(str(max_value), str(min_value))
        _url = self.make_url(1, "NVSHPRC", _param)
        _total_count, _filter = self._get_base_data(_url)
        _is_oversize = _total_count > 8000
        _page_size = Utils.calc_page(_total_count, self._view_size)
        if _is_oversize:
            # Still more than 8000 results: bisect the price range and recurse.
            half_price = math.ceil((min_value + max_value) / 2)
            _range = self._make_list(min_value, max_value, half_price)

            for value in _range:
                await self._filter_parse_recursive(value[0], value[1])

        else:
            await self._execute_parse(_page_size, _param)

    async def _filter_parse(self, filters: list):
        # Called only once per category.
        for _filter in filters:
            _filterAction = _filter.get('filterAction')
            _separator = "-"  # default = -
            if _filterAction is not None:
                _separator = _filterAction.get('separator')
                # price split
            _value: str = _filter.get('value')
            _param = ""
            _min = 0
            _max = 0
            if _value is not None:
                _min, _max = (int(_price) for _price in _value.split(_separator))

            logging.info("Filter Parse >> min{0} / max{1}".format(_min, _max))

            await self._filter_parse_recursive(_min, _max)

    async def _execute_parse(self, page_count, filter_param: str = ""):

        for page_number in range(1, page_count + 1):
            try:
                _url = self.make_url(page_number, _filter=filter_param)

                self.parse_data(self._get_product_json(_url))

                logging.info(">>> URL : " + _url)
                logging.info('>>> start parsing: ' + self._category.get('name') + ' Pg.' + str(page_number))

                self._current_page = page_number
            except Exception as e:
                logging.debug(">>> Category Collect Err " + str(self._current_page)
                              + "  name: " + self._category.get('name') + "  Err :" + str(e))

    def _get_product_json(self, url) -> dict:
        """
        Fetch product information.
        :param url: request URL
        :return: data_dict with the product information
        """
        # TODO: additional headers may be needed.
        try:
            _headers = {'Content-Type': 'application/json;'}
            req = requests.get(url, headers=_headers)

            html = req.text
            soup = BeautifulSoup(html, 'html.parser')  # parse the HTML with html.parser

            json_data = soup.find('script', text=re.compile('application/json'))

            data_dict = json.loads(str(json_data.contents[0]))
        except Exception as e:
            # TODO: tune the sleep time - 8 seconds may not be enough.
            time.sleep(8)
            # An abnormal request was detected - request the URL again
            # (note: this retry recurses without a bound).
            logging.error("no find Data request Error >> {0} | URL >> {1}".format(e, url))
            return self._get_product_json(url)

        return data_dict

    def parse_data(self, data_dict):
        """ 데이터 파싱 """
        product_info: dict = self._get_data(data_dict, 'products')
        if product_info is not None:
            # Data was collected.

            product_list: list = product_info.get('list')

            self._excepted_data_count = 0

            logging.info("수집 시작 - 상품 데이터 수: " + str(len(product_list)))
            if len(product_list) > 0:
                for product in product_list:
                    product_data = dict()

                    product_item = product.get('item')
                    if product_item.get('adId') is None:

                        # Collect only items that are not ads.
                        # Set category info
                        self._set_category_info(product_data)
                        # Set product info
                        self._set_product_info(product_data, product_item)

                        self._insert_product_info(product_data)
                    else:
                        self._excepted_data_count += 1
            else:
                logging.error('!!! Exception: no product information found.')
            # if len(product_list) != len(products_data) + self._excepted_data_count:
            #     logging.error("!!! Exception: the data counts need checking.")
            #     logging.info("Collected items: " + str(len(products_data)))
            #     logging.info("Excluded items: " + str(self._excepted_data_count))
        else:
            logging.error('!!! Exception: no data was collected.')

    def _set_category_info(self, product_data: dict):
        '''Set category info on the product record.
            arg:
                product_data: the product record dict
        '''
        product_data['n_cid'] = self._category.get('cid')
        product_data['cid'] = self._category.get('_id')
        product_data['paths'] = self._category.get('paths')
        product_data['cname'] = self._category.get('name')

    def _set_product_info(self, product_data: dict, product_item):
        product_data['n_id'] = product_item.get('id')
        product_data['imageUrl'] = product_item.get('imageUrl')
        product_data['title'] = product_item.get('productTitle')
        product_data['price'] = product_item.get('price')

        # product_data['option'] = {}

        _attribute: str = product_item.get('attributeValue', "")
        _attribute_value: str = product_item.get('characterValue', "")

        if (_attribute != "") and (_attribute_value != ""):
            # Option info is present.
            product_option_key: list = _attribute.split('|')  # option keys
            product_option_value: list = _attribute_value.split('|')  # option values


            product_data['option'] = dict(zip(product_option_key, product_option_value))
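        # Illustrative example (values assumed): with attributeValue
        # '사이즈|색상' and characterValue 'A4|백색', the zip above yields
        # product_data['option'] == {'사이즈': 'A4', '색상': '백색'}.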

    def _insert_product_info(self, value: dict):
        """db data insert"""
        try:
            # TODO: where should the value comparison happen?
            _selection = self.database_manager.find_query("n_id", value.get("n_id"))
            self.database_manager.update(self.PRODUCT_COLLECTION, _selection, value)
        except Exception as e:
            logging.error('!!! Fail: Insert data to DB: %s', e)

    def _get_base_data(self, url):
        _data = self._get_product_json(url)

        _total_count = 0
        value_filters: Optional[dict] = None

        if _data is not None:
            products = self._get_data(_data, 'products')
            if products is not None:

                _total_count = products.get('total')
                if _total_count is not None:
                    _total_count = int(_total_count)
                else:
                    _total_count = 0

            filters = self._get_data(_data, 'mainFilters')
            if filters is not None:
                value_filters = self._get_filter(filters)

        return _total_count, value_filters

    def _get_data(self, data: dict, _type: str):
        return data.get('props', {}).get('pageProps', {}).get('initialState', {}).get(_type)
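
    # The chained .get above digs into the page's embedded Next.js state;
    # the assumed shape (inferred from this code path, not from the live
    # site) is roughly:
    # {"props": {"pageProps": {"initialState": {
    #     "products": {"total": ..., "list": [...]},
    #     "mainFilters": [...]}}}}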

    def _get_filter(self, main_filters: dict) -> dict:
        value_filters = None

        for _filter in main_filters:
            _filterType: str = _filter.get('filterType')
            if (_filterType is not None) and (eq(_filterType, 'price')):
                value_filters = _filter.get('filterValues')
                if value_filters is not None:
                    break

        return value_filters
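
Utils.calc_page is not included in these examples. A minimal sketch consistent with how the crawler uses it (dividing the total item count into pages of _view_size items); the implementation is assumed, not taken from the project:

import math

def calc_page(total_count: int, view_size: int) -> int:
    # Assumed behavior: number of pages needed to cover total_count items
    # at view_size items per page, e.g. calc_page(8000, 80) == 100.
    return math.ceil(total_count / view_size)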
Code Example #11
class CategoryCrawl(object):

    CATEGORY_ID = 50000000  # Default value

    COLLECTION = 'category'
    _DELIMITER = 'category?catId='
    _PATH_TOKEN = '#'

    def __init__(self):
        # Crawl configuration manager - singleton
        self.crawl_config: CrawlConfiguration = ConfigManager().crawl_config
        # Database manager - data reads and writes happen here - singleton
        self.database_manager = DatabaseManager()

        # Pre-load the stored results so duplicate data can be detected.
        self._category_list: list = list(
            self.database_manager.find_all_mongo(self.COLLECTION))
        self.CATEGORY_ID = self.crawl_config.category_id

    def _update(self, cid, name, paths: str):
        # Database manager - data reads and writes happen here - singleton
        self.database_manager = DatabaseManager()

        _query = self.database_manager.find_query('cid', cid)

        _update_data = dict()
        _update_data['name'] = name
        _update_data['paths'] = paths
        _update_data['update_time'] = datetime.now()

        return self.database_manager.update(self.COLLECTION, _query,
                                            {"&set": _update_data})

    def _insert(self, cid, name, paths: str, is_root: bool = False):
        """ Mongo Database Insert """
        _is_exists: bool = False

        for item in self._category_list:
            _name = item['name']
            _cid = item['cid']
            _paths = item['paths']
            if is_root:
                if eq(_name, name):
                    self._category_list.remove(item)
                    return
            else:
                if eq(_cid, cid):
                    if eq(_name, name) and eq(_paths, paths):
                        self._category_list.remove(item)
                        return
                    else:
                        self._update(cid, name, paths)
                        self._category_list.remove(item)
                        return

        _category_document = dict()
        _category_document['cid'] = cid
        _category_document['name'] = name
        _category_document['paths'] = paths
        _category_document['insert_time'] = datetime.now()

        return self.database_manager.insert_one_mongo(self.COLLECTION,
                                                      _category_document)

    def _is_exists(self, field, value: str):
        """MongoDB에 cid 값을 조회하여 조건에 맞는 document가 있는지 확인"""
        _query = self.database_manager.find_query(field, value)
        return self.database_manager.count_document('category', _query) > 0

    def _parse_category(self, element: HtmlElement, root_paths: str):
        ul_tag: HtmlElement = element.find('ul')

        if ul_tag is not None:
            li_tags = ul_tag.findall('li')

            li: HtmlElement
            for li in li_tags:
                li_a_tag = li.find('a')
                if li_a_tag is not None:
                    _name = li_a_tag.text
                    _href = li_a_tag.get('href')
                    _cid = Utils.separate_right(_href, self._DELIMITER)
                    _paths = Utils.join_path(self._PATH_TOKEN, root_paths,
                                             _name)

                    self._insert(_cid, _name, _paths)
                    div_tag = li.find('div')
                    if div_tag is not None:
                        self._parse_category(div_tag, _paths)

                    if li.find('ul') is not None:
                        self._parse_category(li, _paths)

    def parse(self):
        for category_id in range(self.CATEGORY_ID, self.CATEGORY_ID + 11):
            _url = 'https://search.shopping.naver.com/category/category/{0}'
            logging.info("PID >> %s | CategoryID >> %d " %
                         (os.getpid(), category_id))

            request = requests.get(_url.format(category_id))
            Utils.take_a_sleep(0, 1)
            # Status check
            if request.status_code != 200:
                return
            try:
                _content = request.content
                tree: HtmlElement = html.fromstring(_content)
                header_xpath = '//*[@id="__next"]/div/div[2]/h2'
                _root_name = tree.xpath(header_xpath)[0].text

                self._insert(str(category_id), _root_name, None, True)

                xpath = '//*[@id="__next"]/div/div[2]/div/div'
                elements: list = tree.xpath(xpath)  # list of HtmlElement

                element: HtmlElement
                for element in elements:
                    if element.find('div') is not None:
                        a_tag: HtmlElement = element[0].find('h3').find('a')
                        _name = a_tag.find('strong').text
                        _href = a_tag.get('href')
                        _cid = Utils.separate_right(_href, self._DELIMITER)
                        _paths = Utils.join_path(self._PATH_TOKEN, _root_name,
                                                 _name)

                        self._insert(_cid, _name, _paths)
                        self._parse_category(element[0], _paths)
                    else:
                        logging.info('Element is not Exists')

            except Exception as e:
                logging.error(str(e))

            # Remove category items that are no longer needed.
            for item in self._category_list:
                _query = self.database_manager.find_query('_id', item['_id'])
                self.database_manager.delete_one(self.COLLECTION, _query)

    def run(self):
        pass
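
Utils.separate_right and Utils.join_path are also external to these examples. Sketches consistent with their usage here (pulling the cid out of an href, and joining path segments with the '#' token); both are assumptions, not the project's code:

def separate_right(source: str, delimiter: str) -> str:
    # Assumed behavior: the substring to the right of the delimiter, e.g.
    # separate_right('...category?catId=50000234', 'category?catId=')
    # returns '50000234'.
    return source.split(delimiter)[-1]

def join_path(token: str, source: str, value: str) -> str:
    # Assumed behavior: append a path segment, e.g.
    # join_path('#', 'root', 'sub') returns 'root#sub'.
    if source is None:
        return value
    return source + token + value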
Code Example #12
class CategoryCrawl(object):
    URL = 'https://search.shopping.naver.com/category/category/{0}'
    CATEGORY = 50000000
    DELIMITER = 'cat_id='
    COLLECTION = 'category'

    def __init__(self):
        # Chrome Selenium driver - singleton
        self.driver = Selenium().driver
        # Crawl configuration manager - singleton
        self.crawl_config: CrawlConfiguration = ConfigManager().crawl_config
        # Database manager - data reads and writes happen here - singleton
        self.database_manager = DatabaseManager()
        # Pre-load the stored results so duplicate data can be detected.
        self._category_list: list = list(
            self.database_manager.find_all_mongo(self.COLLECTION))

    def _update(self, cid, name, paths: str):
        _query = self.database_manager.find_query('cid', cid)

        _update_data = dict()
        _update_data['name'] = name
        _update_data['paths'] = paths
        _update_data['update_time'] = datetime.now()

        return self.database_manager.update(self.COLLECTION, _query,
                                            {"&set": _update_data})

    def _insert(self, cid, name, paths: str, is_root: bool = False):
        """ Mongo Database Insert """
        _is_exists: bool = False
        for item in self._category_list:
            _name = item['name']
            _cid = item['cid']
            _paths = item['paths']
            if is_root:
                if eq(_name, name):
                    self._category_list.remove(item)
                    return
            else:
                if eq(_cid, cid):
                    if eq(_name, name) and eq(_paths, paths):
                        self._category_list.remove(item)
                        return
                    else:
                        self._update(cid, name, paths)
                        self._category_list.remove(item)
                        return

        _category_document = dict()
        _category_document['cid'] = cid
        _category_document['name'] = name
        _category_document['paths'] = paths
        _category_document['insert_time'] = datetime.now()

        return self.database_manager.insert_one_mongo(self.COLLECTION,
                                                      _category_document)

    def _is_exists(self, field, value: str):
        """MongoDB에 cid 값을 조회하여 조건에 맞는 document가 있는지 확인"""
        _query = self.database_manager.find_query(field, value)
        return self.database_manager.count_document('category', _query) > 0

    def parse(self):
        self.driver.get(self.URL)

        try:
            for category in self.driver.find_elements_by_xpath(
                    '//*[@id="home_category_area"]/div[1]/ul/li'):
                time.sleep(1)
                self._parse_root(category)

            # Remove category items that are no longer needed.
            for item in self._category_list:
                _query = self.database_manager.find_query('_id', item['_id'])
                self.database_manager.delete_one(self.COLLECTION, _query)

        except Exception as e:
            logging.error(str(e))

    def _parse_root(self, category: WebElement):
        # Root name
        root_name: str = category.text
        # root_name = text.replace('/', '-')

        logging.info('rootName : ' + root_name)

        for exclude_category in self.crawl_config.exclude_category:
            if eq(root_name, exclude_category):
                return None

        class_att = category.get_attribute('class')
        click_xpath = '//*[@id="home_{0}"]'.format(class_att)

        self.driver.implicitly_wait(5)
        # Try clicking first.
        self.driver.find_element_by_xpath(click_xpath).send_keys(Keys.ENTER)
        # Build the inner xpath to match class_att.
        time.sleep(1)

        xpath_cate = '//*[@id="home_{0}_inner"]/div[1]'.format(class_att)

        # Root Category
        element: WebElement = None
        while element is None:
            # If the click event did not register, keep clicking...
            self.driver.find_element_by_xpath(click_xpath).send_keys(
                Keys.ENTER)
            self.driver.implicitly_wait(4)
            time.sleep(1)
            element = self.driver.find_element_by_xpath(xpath_cate)

        self._insert(None, root_name, None, True)
        # Root -> sub
        co_col_elements = element.find_elements(By.CLASS_NAME, 'co_col')

        self._parse_co_col(co_col_elements, root_name)

    def _parse_co_cel(self, co_cel_elements, root_name):
        co_cel: WebElement
        for co_cel in co_cel_elements:
            # href
            sub_href = co_cel.find_element_by_tag_name('a').get_attribute(
                'href')
            # cid
            _cid = Utils.separate_right(sub_href, self.DELIMITER)

            sub_element: WebElement = co_cel.find_element_by_tag_name('strong')

            # name
            _name = sub_element.find_element_by_tag_name('a').text
            _name = re.sub("전체보기", "", _name)
            # paths
            _paths = Utils.join_path(token='#', source=root_name, value=_name)

            # cid, name, paths
            self._insert(_cid, _name, _paths)

            # Child category list
            child_items: list = co_cel.find_elements(By.TAG_NAME, 'li')
            self._parse_child(child_items, _paths)

    def _parse_co_col(self, sub_category, root_name):
        co_col: WebElement
        for co_col in sub_category:
            time.sleep(1)
            # Mid-level categories
            co_cel_elements = co_col.find_elements_by_class_name('co_cel')
            self._parse_co_cel(co_cel_elements, root_name)

    def _parse_child(self, child_items, sub_paths):
        child_item: WebElement
        for child_item in child_items:
            time.sleep(1)
            # href
            _href = child_item.find_element_by_tag_name('a').get_attribute(
                'href')
            # cid
            _cid = Utils.separate_right(_href, self.DELIMITER)
            # name
            _name = child_item.text  # name
            # paths
            _paths = Utils.join_path(token='#', source=sub_paths, value=_name)
            self._insert(_cid, _name, _paths)