Beispiel #1
0
    async def parse(self, identifier: str, context: dict):
        """Crawl the Naver Shopping category tree and report progress on a job.

        For each of the eleven ids starting at ``self.CATEGORY_ID`` the
        category page is requested, the root category is stored through
        ``self._insert`` and its sub categories through
        ``self._parse_category``. After each category, every entry still
        present in ``self._category_list`` is deleted from
        ``self.COLLECTION``.

        Args:
            identifier: Key of this job inside ``context['jobs']``.
            context: Shared dict; ``context['jobs'][identifier]`` receives
                the ``status`` and ``name`` updates.

        Returns:
            None. When a category page does not answer with HTTP 200 the
            crawl stops, the failure is logged and the job status is set to
            ``'failed'`` instead of being left in its previous state.
        """
        logging.info("Category Crawl Start >> WEB")

        _url = 'https://search.shopping.naver.com/category/category/{0}'

        for category_id in range(self.CATEGORY_ID, self.CATEGORY_ID + 11):
            # One-second pause before every category request.
            await asyncio.sleep(1)
            logging.info("PID >> %s | CategoryID >> %d ", os.getpid(), category_id)

            job_info = context['jobs'][identifier]

            # NOTE(review): requests.get is a blocking call, so the event loop
            # is stalled while the request is in flight.
            request = requests.get(_url.format(category_id))
            # Status check
            if request.status_code != 200:
                logging.error("CategoryID >> %d | unexpected HTTP status %s, crawl aborted",
                              category_id, request.status_code)
                # Give the job a terminal state so it does not look like it
                # is still running.
                job_info['status'] = 'failed'
                return
            try:
                _content = request.content
                tree: HtmlElement = html.fromstring(_content)
                header_xpath = '//*[@id="__next"]/div/div[2]/h2'
                _root_name = tree.xpath(header_xpath)[0].text
                job_info['status'] = 'in progress'
                job_info['name'] = _root_name

                self.crawl_status(str(category_id), _root_name, request.status_code)

                self._insert(str(category_id), _root_name, None, True)

                xpath = '//*[@id="__next"]/div/div[2]/div/div'
                elements = tree.xpath(xpath)

                for element in elements:
                    if element.find('div') is not None:
                        a_tag: HtmlElement = element[0].find('h3').find('a')
                        _name = a_tag.find('strong').text
                        _href = a_tag.get('href')
                        _cid = Utils.separate_right(_href, self._DELIMITER)
                        _paths = Utils.join_path(self._PATH_TOKEN, _root_name, _name)

                        self._insert(_cid, _name, _paths)
                        self._parse_category(element[0], _paths)
                    else:
                        logging.info('Element is not Exists')

            except Exception as e:
                # Best effort per category: record the failure together with
                # its traceback and continue with the cleanup below.
                logging.exception("CategoryID >> %d | parse failed: %s", category_id, e)

            # Remove the category items that are no longer needed.
            # NOTE(review): this cleanup runs once per category id (it sits
            # inside the loop) - confirm that is intended.
            for item in self._category_list:
                _query = self.database_manager.find_query('_id', item['_id'])
                self.database_manager.delete_one(self.COLLECTION, _query)
        logging.info("Category Crawl END >> WEB")

        context['jobs'][identifier]['status'] = 'done'
 def _parse_child(self, child_items, sub_paths):
     """Store every child category listed under one parent category.

     Args:
         child_items: Iterable of Selenium elements; each one is searched
             for an ``<a>`` tag whose ``href`` carries the category id.
         sub_paths: Path of the parent category; the child name is joined
             onto it with the ``'#'`` token.
     """
     # Local import keeps this method self-contained; selenium is already a
     # dependency of this code (WebElement / find_element API).
     from selenium.webdriver.common.by import By

     child_item: WebElement
     for child_item in child_items:
         # One-second pause per child item (presumably throttling - confirm).
         time.sleep(1)
         # href - find_element(By.TAG_NAME, ...) replaces the
         # find_element_by_tag_name helper that Selenium 4.3 removed.
         _href = child_item.find_element(By.TAG_NAME, 'a').get_attribute('href')
         # cid
         _cid = Utils.separate_right(_href, self.DELIMITER)
         # name
         _name = child_item.text
         # paths
         _paths = Utils.join_path(token='#', source=sub_paths, value=_name)
         self._insert(_cid, _name, _paths)
Beispiel #3
0
    async def parse(self, identifier: str, context: dict, crawl_category: list = None):
        """Entry point called from outside to run the parsing.

        Iterates over the categories returned by ``self._category_getter``
        and, for each one, either runs the filter based parse (more than
        8000 results) or the plain paged parse.

        Args:
            identifier: Key of this job inside ``context['jobs']``.
            context: Shared dict; ``context['jobs'][identifier]`` receives
                the ``status`` and ``category`` updates.
            crawl_category: Passed through to ``self._category_getter``.

        Returns:
            None. The job status is set to ``'done'`` at the end, also when
            there was no category to process.
        """
        _categories: list = self._category_getter(crawl_category)

        for category in _categories:
            await asyncio.sleep(1)

            job_info = context['jobs'][identifier]
            job_info['status'] = 'in Progress'
            job_info['category'] = category.get('name')

            self._category = category
            # Start of the parsing process for this category.
            self._current_page = 0
            # Default = 1
            _url = self.make_url(paging_index=1)
            _total_count, _filter = self._get_base_data(_url)

            # The page condition still needs to be revised.
            _is_oversize = _total_count > 8000
            # Page calculation
            _page_size = Utils.calc_page(_total_count, self._view_size)

            if _is_oversize:
                # NOTE(review): if _filter_parse is a coroutine function it
                # has to be awaited here - confirm against its definition.
                self._filter_parse(_filter)

            else:
                await self._execute_parse(_page_size)

            logging.info('>>> end childCategory: %s Pg.%s',
                         self._category.get('name'), self._current_page)

        # Looked up again instead of reusing the loop-local ``job_info``:
        # that name is unbound when ``_categories`` is empty, which used to
        # raise UnboundLocalError on this line.
        context['jobs'][identifier]['status'] = 'done'
Beispiel #4
0
    def parse(self):
        """Crawl the Naver Shopping category tree.

        For each of the eleven ids starting at ``self.CATEGORY_ID`` the
        category page is requested, the root category is stored through
        ``self._insert`` and its sub categories through
        ``self._parse_category``. After each category, every entry still
        present in ``self._category_list`` is deleted from
        ``self.COLLECTION``.

        Returns:
            None. The crawl stops at the first category page that does not
            answer with HTTP 200; the failure is logged.
        """
        _url = 'https://search.shopping.naver.com/category/category/{0}'

        for category_id in range(self.CATEGORY_ID, self.CATEGORY_ID + 11):
            logging.info("PID >> %s | CategoryID >> %d ", os.getpid(), category_id)

            request = requests.get(_url.format(category_id))
            Utils.take_a_sleep(0, 1)
            # Status check
            if request.status_code != 200:
                # Make the abort visible instead of returning silently.
                logging.error("CategoryID >> %d | unexpected HTTP status %s, crawl aborted",
                              category_id, request.status_code)
                return
            try:
                _content = request.content
                tree: HtmlElement = html.fromstring(_content)
                header_xpath = '//*[@id="__next"]/div/div[2]/h2'
                _root_name = tree.xpath(header_xpath)[0].text

                self._insert(str(category_id), _root_name, None, True)

                xpath = '//*[@id="__next"]/div/div[2]/div/div'
                elements = tree.xpath(xpath)

                for element in elements:
                    if element.find('div') is not None:
                        a_tag: HtmlElement = element[0].find('h3').find('a')
                        _name = a_tag.find('strong').text
                        _href = a_tag.get('href')
                        _cid = Utils.separate_right(_href, self._DELIMITER)
                        _paths = Utils.join_path(self._PATH_TOKEN, _root_name,
                                                 _name)

                        self._insert(_cid, _name, _paths)
                        self._parse_category(element[0], _paths)
                    else:
                        logging.info('Element is not Exists')

            except Exception as e:
                # Best effort per category: keep the traceback in the log and
                # continue with the cleanup below.
                logging.exception("CategoryID >> %d | parse failed: %s", category_id, e)

            # Remove the category items that are no longer needed.
            # NOTE(review): this cleanup runs once per category id (it sits
            # inside the loop) - confirm that is intended.
            for item in self._category_list:
                _query = self.database_manager.find_query('_id', item['_id'])
                self.database_manager.delete_one(self.COLLECTION, _query)
Beispiel #5
0
def category(i):
    """Fetch a Naver Shopping page and walk its top-level category blocks.

    The page is parsed twice: once with BeautifulSoup to probe for an
    embedded JSON ``<script>``, and once with lxml to read the header and
    the category blocks, each of which is handed to ``sub_category``.

    Args:
        i: Category id. NOTE(review): currently unused because the request
            URL below is pinned to the "too-many-request" page - confirm
            whether the per-category URL should be restored.

    Returns:
        None. Returns early when the page has no category header.
    """
    # URL = "https://search.shopping.naver.com/category/category/" + str(i)
    URL = "https://search.shopping.naver.com/too-many-request"

    headers = {'Content-Type': 'application/json;'}

    # Passed by keyword: the second positional parameter of requests.get is
    # ``params`` (query string), so the dict was never sent as headers.
    req = requests.get(URL, headers=headers)

    content = req.content
    soup = BeautifulSoup(content, 'html.parser')  # parse with html.parser

    json_data = soup.find('script', text=re.compile('application/json'))
    try:
        # Only probes whether the script content is valid JSON; the parsed
        # value is not used further in this function.
        json.loads(str(json_data.contents[0]))
    except Exception as e:
        print('embedded JSON could not be read: ' + repr(e))

    tree: HtmlElement = html.fromstring(content)
    header_xpath = '//*[@id="__next"]/div/div[2]/h2'
    header_nodes = tree.xpath(header_xpath)
    if not header_nodes:
        # Without this guard a page lacking the header raised IndexError.
        print('category header not found: ' + URL)
        return
    header = header_nodes[0].text

    xpath = '//*[@id="__next"]/div/div[2]/div/div'
    elements = tree.xpath(xpath)

    # ``index`` instead of ``i`` so the parameter is no longer shadowed.
    for index, element in enumerate(elements):
        print(index)
        try:
            if element.find('div') is not None:
                a_tag: HtmlElement = element[0].find('h3').find('a')
                href = a_tag.get('href')
                # Extracted like in the crawler classes; not used further here.
                _cid = Utils.separate_right(href, "category?catId=")
                h3_tag = a_tag.find('strong').text
                paths = Utils.join_path('#', header, h3_tag)
                sub_category(element[0], paths)

        except Exception as e:
            # Best effort per block: report the problem and move on.
            print('category block ' + str(index) + ' skipped: ' + repr(e))
Beispiel #6
0
    def _parse_category(self, element: HtmlElement, root_paths: str):
        """Recursively store the categories listed in the ``<ul>`` of *element*.

        Every ``<li>`` that contains an ``<a>`` is inserted through
        ``self._insert`` with its id (taken from the link's ``href``), its
        name and its path. Nested levels are followed through a ``<div>``
        child and through a ``<ul>`` child of the ``<li>``.

        Args:
            element: Element whose direct ``<ul>`` child is scanned.
            root_paths: Path of the parent category; child names are joined
                onto it with ``self._PATH_TOKEN``.
        """
        category_list = element.find('ul')
        if category_list is None:
            return

        for entry in category_list.findall('li'):
            link = entry.find('a')
            if link is None:
                continue

            entry_name = link.text
            entry_href = link.get('href')
            entry_cid = Utils.separate_right(entry_href, self._DELIMITER)
            entry_paths = Utils.join_path(self._PATH_TOKEN, root_paths, entry_name)
            self._insert(entry_cid, entry_name, entry_paths)

            # Nested categories wrapped in a <div>.
            wrapper = entry.find('div')
            if wrapper is not None:
                self._parse_category(wrapper, entry_paths)

            # Nested categories placed directly in a <ul> under the <li>.
            if entry.find('ul') is not None:
                self._parse_category(entry, entry_paths)
    def _parse_co_cel(self, co_cel_elements, root_name):
        """Store each category cell and hand its child items to ``_parse_child``.

        Args:
            co_cel_elements: Iterable of Selenium elements, one per category
                cell; each is searched for ``<a>``, ``<strong>`` and ``<li>``
                tags.
            root_name: Name of the root category; the cell name is joined
                onto it with the ``'#'`` token.
        """
        co_cel: WebElement
        for co_cel in co_cel_elements:
            # href - find_element(By.TAG_NAME, ...) replaces the
            # find_element_by_tag_name helper that Selenium 4.3 removed and
            # matches the find_elements(By.TAG_NAME, ...) call below.
            sub_href = co_cel.find_element(By.TAG_NAME, 'a').get_attribute('href')
            # cid
            _cid = Utils.separate_right(sub_href, self.DELIMITER)

            sub_element: WebElement = co_cel.find_element(By.TAG_NAME, 'strong')

            # name, with the "전체보기" link label stripped out
            _name = sub_element.find_element(By.TAG_NAME, 'a').text
            _name = _name.replace("전체보기", "")
            # paths
            _paths = Utils.join_path(token='#', source=root_name, value=_name)

            # cid, name, paths
            self._insert(_cid, _name, _paths)

            # list of sub categories
            child_items = co_cel.find_elements(By.TAG_NAME, 'li')
            self._parse_child(child_items, _paths)
Beispiel #8
0
    async def _filter_parse_recursive(self, min_value, max_value):
        """Parse one price range, narrowing it while it has too many results.

        Requests the base data for the ``minPrice``/``maxPrice`` window. If
        the window reports more than 8000 results, ``self._make_list`` is
        asked for sub ranges around the midpoint and each of them is
        processed recursively; otherwise the pages of the window are parsed
        with ``self._execute_parse``.

        This is a coroutine: the original body used ``await`` inside a plain
        ``def``, which is a SyntaxError. Callers must ``await`` it.

        Args:
            min_value: Lower price bound of the window.
            max_value: Upper price bound of the window.
        """
        _param = "&maxPrice={0}&minPrice={1}".format(str(max_value), str(min_value))
        _url = self.make_url(1, "NVSHPRC", _param)
        _total_count, _filter = self._get_base_data(_url)
        _is_oversize = _total_count > 8000
        _page_size = Utils.calc_page(_total_count, self._view_size)
        if _is_oversize:
            half_price = math.ceil((min_value + max_value) / 2)
            _range = self._make_list(min_value, max_value, half_price)

            # NOTE(review): termination depends on _make_list returning
            # strictly narrower ranges - confirm for windows that cannot be
            # narrowed any further but are still over the limit.
            for value in _range:
                await self._filter_parse_recursive(value[0], value[1])

        else:
            await self._execute_parse(_page_size, _param)
    def _start_parsing_process(self):
        """Run the parsing process for the category in ``self._category``.

        Resets the page counter, reads the total result count and the filter
        data from the first page, then either runs the filter based parse
        (more than 8000 results) or the plain paged parse. Finishes by
        logging the category name and the last page counter value.
        """
        self._current_page = 0

        # Paging index defaults to 1.
        first_page_url = self.make_url(paging_index=1)
        total, filter_data = self._get_base_data(first_page_url)

        # The page condition still needs to be revised.
        too_many_results = total > 8000
        # Page calculation
        page_count = Utils.calc_page(total, self._view_size)

        if not too_many_results:
            self._execute_parse(page_count)
        else:
            self._filter_parse(filter_data)

        category_name = self._category.get('name')
        summary = '>>> end childCategory: ' + category_name + ' Pg.' + str(self._current_page)
        logging.info(summary)
Beispiel #10
0
def sub_category(element: HtmlElement, root_path: str):
    """Recursively walk the ``<ul>`` below *element* and build category paths.

    For every ``<li>`` that contains an ``<a>``, the link text is joined
    onto *root_path* with the ``'#'`` token, and the walk continues into a
    ``<div>`` child and into a ``<ul>`` child of the ``<li>``. Any exception
    raised while handling one ``<li>`` is swallowed after printing an empty
    line.

    Args:
        element: Element whose direct ``<ul>`` child is scanned.
        root_path: Path of the parent category.
    """
    listing = element.find('ul')
    if listing is None:
        return

    for item in listing.findall('li'):
        try:
            anchor = item.find('a')
            if anchor is None:
                continue

            # The href is read but not used further in this function.
            link_target = anchor.get('href')
            label = anchor.text
            item_path = Utils.join_path('#', root_path, label)

            nested_block = item.find('div')
            if nested_block is not None:
                sub_category(nested_block, item_path)

            if item.find('ul') is not None:
                sub_category(item, item_path)
        except Exception as e:
            print('')