class MusicCategory:
    """Crawl one listing page of the midiex.net score board and save each
    post's cleaned text to MusicCategory/MusicCategoryNNNNN.txt.

    All work happens in __init__; instantiating the class performs the crawl.
    """

    crawlingUtil = CrawUtil()  # shared scraping helper

    def __init__(self, pageNum, rootPath):
        """Crawl listing page *pageNum*; *rootPath* contains the output dir."""
        path = rootPath + '/MusicCategory'
        URL = 'http://midiex.net/bbs/board.php?bo_table=score&page=' + str(
            pageNum)
        links = self.crawlingUtil.music_get_link(URL)
        # Count of files already present, so new files continue the sequence.
        fileNum = self.crawlingUtil.isInDirectory(path)
        skipped = 0  # posts empty after cleaning; keeps numbering dense
        for count, link in enumerate(links):
            result_text = self.crawlingUtil.music_get_text(
                'http://midiex.net' + link)
            # Strip unwanted symbols.  NOTE(review): this non-raw pattern also
            # matches the literal letters 'n', 't', 'r' and backslashes, and
            # the .replace() calls below remove the literal substrings
            # 'xa0'/'u200b' rather than the characters '\xa0'/'\u200b'.
            # Looks accidental but is preserved as-is -- confirm intent.
            result_text = re.sub(
                "[-=+,#/\?:%$.@*\"※~&%!r\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]",
                "", result_text)
            result_text = result_text.replace('xa0', '')
            result_text = result_text.replace('u200b', '')
            if not result_text.strip():
                skipped += 1
                continue
            # NOTE(review): written relative to the CWD while the directory
            # check above used rootPath -- assumes CWD == rootPath; confirm.
            OUTPUT_FILE_NAME = 'MusicCategory/MusicCategory%05d.txt' % (
                count + fileNum - skipped)
            print(OUTPUT_FILE_NAME)
            # Context manager guarantees the handle closes even if write fails.
            with open(OUTPUT_FILE_NAME, 'w', -1, "utf-8") as output_file:
                output_file.write(result_text)
class PaintingCategory:
    """Crawl one listing page of the ruliweb hobby board and save each post's
    cleaned text to PaintCategory/PaintCategoryNNNNN.txt.

    All work happens in __init__; instantiating the class performs the crawl.
    """

    crawlingUtil = CrawUtil()  # shared scraping helper

    def __init__(self, pageNum, rootPath):
        """Crawl listing page *pageNum*; *rootPath* contains the output dir."""
        path = rootPath + '/PaintCategory'
        URL = 'https://bbs.ruliweb.com/hobby/board/300066?page=' + str(pageNum)
        links = self.crawlingUtil.paint_get_link(URL)
        # Count of files already present, so new files continue the sequence.
        fileNum = self.crawlingUtil.isInDirectory(path)
        # BUG FIX: the original reset the skip counter to 0 on every loop
        # iteration, so after any empty post the file numbering collided with
        # (and overwrote) earlier files.  It now accumulates across the loop,
        # matching the sibling category crawlers.
        skipped = 0  # posts empty after cleaning; keeps numbering dense
        print(links)
        for count, link in enumerate(links):
            result_text = self.crawlingUtil.paint_get_text(link)
            # Strip unwanted symbols.  NOTE(review): this non-raw pattern also
            # matches the literal letters 'n', 't', 'r' and backslashes, and
            # the .replace() calls below remove the literal substrings
            # 'xa0'/'u200b' rather than the characters '\xa0'/'\u200b'.
            # Looks accidental but is preserved as-is -- confirm intent.
            result_text = re.sub(
                "[-=+,#/\?:%$.@*\"※~&%!r\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]",
                "", result_text)
            result_text = result_text.replace('xa0', '')
            result_text = result_text.replace('u200b', '')
            if not result_text.strip():
                skipped += 1
                continue
            # NOTE(review): written relative to the CWD while the directory
            # check above used rootPath -- assumes CWD == rootPath; confirm.
            OUTPUT_FILE_NAME = 'PaintCategory/PaintCategory%05d.txt' % (
                count + fileNum - skipped)
            print(OUTPUT_FILE_NAME)
            # Context manager guarantees the handle closes even if write fails.
            with open(OUTPUT_FILE_NAME, 'w', -1, "utf-8") as output_file:
                output_file.write(result_text)
class TextCategory:
    """Crawl one listing page of the joara.com book list and save each work's
    cleaned text to TextCategory/TextCategoryNNNNN.txt.

    All work happens in __init__; instantiating the class performs the crawl.
    """

    crawlingUtil = CrawUtil()  # shared scraping helper

    def __init__(self, pageNum, rootPath):
        """Crawl listing page *pageNum*; *rootPath* contains the output dir."""
        path = rootPath + '/TextCategory'
        URL = ('http://www.joara.com/literature/view/book_list.html?page_no='
               + str(pageNum)
               + '&bookpart=&sl_type=&sl_chkcost=&sl_category=&sl_search='
                 '&sl_keyword=&sl_chk=&sl_minchapter=&sl_maxchapter='
                 '&sl_redate=&sl_orderby=&sl_othercategory=&list_type=normal'
                 '&sub_category=')
        links = self.crawlingUtil.text_get_link(URL)
        # Count of files already present, so new files continue the sequence.
        fileNum = self.crawlingUtil.isInDirectory(path)
        skipped = 0  # works empty after cleaning; keeps numbering dense
        for count, link in enumerate(links):
            result_text = self.crawlingUtil.text_get_text(
                'http://www.joara.com' + link)
            # Strip unwanted symbols.  NOTE(review): this non-raw pattern also
            # matches the literal letters 'n', 't', 'r' and backslashes, and
            # the .replace() calls below remove the literal substrings
            # 'xa0'/'u200b' rather than the characters '\xa0'/'\u200b'.
            # Looks accidental but is preserved as-is -- confirm intent.
            result_text = re.sub(
                "[-=+,#/\?:%$.@*\"※~&%!r\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]",
                "", result_text)
            result_text = result_text.replace('xa0', '')
            result_text = result_text.replace('u200b', '')
            if not result_text.strip():
                skipped += 1
                continue
            # NOTE(review): written relative to the CWD while the directory
            # check above used rootPath -- assumes CWD == rootPath; confirm.
            OUTPUT_FILE_NAME = 'TextCategory/TextCategory%05d.txt' % (
                count + fileNum - skipped)
            print(OUTPUT_FILE_NAME)
            # Context manager guarantees the handle closes even if write fails.
            with open(OUTPUT_FILE_NAME, 'w', -1, "utf-8") as output_file:
                output_file.write(result_text)
class TravelPhotoCategory:
    """Crawl one listing page of the tourtips.com column board and save each
    column's cleaned text to TravelPhotoCategory/TravelPhotoCategoryNNNNN.txt.

    All work happens in __init__; instantiating the class performs the crawl.
    """

    crawlingUtil = CrawUtil()  # shared scraping helper

    def __init__(self, pageNum, rootPath):
        """Crawl listing page *pageNum*; *rootPath* contains the output dir."""
        path = rootPath + '/TravelPhotoCategory'
        URL = 'http://www.tourtips.com/ap/column/list/?&page=' + str(pageNum)
        links = self.crawlingUtil.travel_photo_get_link(URL)
        # Count of files already present, so new files continue the sequence.
        fileNum = self.crawlingUtil.isInDirectory(path)
        skipped = 0  # columns empty after cleaning; keeps numbering dense
        for count, link in enumerate(links):
            result_text = self.crawlingUtil.travel_photo_get_text(
                'http://www.tourtips.com' + link)
            # Strip unwanted symbols.  NOTE(review): this non-raw pattern also
            # matches the literal letters 'n', 't', 'r' and backslashes, and
            # the .replace() calls below remove the literal substrings
            # 'xa0'/'u200b' rather than the characters '\xa0'/'\u200b'.
            # Looks accidental but is preserved as-is -- confirm intent.
            result_text = re.sub(
                "[-=+,#/\?:%$.@*\"※~&%!r\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]",
                "", result_text)
            result_text = result_text.replace('xa0', '')
            result_text = result_text.replace('u200b', '')
            if not result_text.strip():
                skipped += 1
                continue
            # NOTE(review): written relative to the CWD while the directory
            # check above used rootPath -- assumes CWD == rootPath; confirm.
            OUTPUT_FILE_NAME = 'TravelPhotoCategory/TravelPhotoCategory%05d.txt' % (
                count + fileNum - skipped)
            print(OUTPUT_FILE_NAME)
            # Context manager guarantees the handle closes even if write fails.
            with open(OUTPUT_FILE_NAME, 'w', -1, "utf-8") as output_file:
                output_file.write(result_text)
class FoodCategory:
    """Crawl one listing page of the recipekorea.com board (previously the
    82cook board) and save each post's cleaned text to
    FoodCategory/FoodCategoryNNNNN.txt.

    All work happens in __init__; instantiating the class performs the crawl.
    """

    crawlingUtil = CrawUtil()  # shared scraping helper

    def __init__(self, pageNum, rootPath):
        """Crawl listing page *pageNum*; *rootPath* contains the output dir."""
        path = rootPath + '/FoodCategory'
        # NOTE(review): 'ld_0502?&page=' contains a second '?' -- possibly a
        # typo for '&', but preserved byte-for-byte since the server may
        # tolerate it; confirm before changing.
        URL = 'http://recipekorea.com/bbs/board.php?bo_table=ld_0502?&page=' + str(pageNum)
        links = self.crawlingUtil.food_get_link(URL)
        # Count of files already present, so new files continue the sequence.
        fileNum = self.crawlingUtil.isInDirectory(path)
        skipped = 0  # posts empty after cleaning; keeps numbering dense
        for count, link in enumerate(links):
            result_text = self.crawlingUtil.food_get_text(link)
            # Strip unwanted symbols.  NOTE(review): this non-raw pattern also
            # matches the literal letters 'n', 't', 'r' and backslashes, and
            # the .replace() calls below remove the literal substrings
            # 'xa0'/'u200b' rather than the characters '\xa0'/'\u200b'.
            # Looks accidental but is preserved as-is -- confirm intent.
            result_text = re.sub(
                "[-=+,#/\?:%$.@*\"※~&%!r\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]",
                "", result_text)
            result_text = result_text.replace('xa0', '')
            result_text = result_text.replace('u200b', '')
            if not result_text.strip():
                skipped += 1
                continue
            # NOTE(review): written relative to the CWD while the directory
            # check above used rootPath -- assumes CWD == rootPath; confirm.
            OUTPUT_FILE_NAME = 'FoodCategory/FoodCategory%05d.txt' % (
                count + fileNum - skipped)
            print(OUTPUT_FILE_NAME)
            # Context manager guarantees the handle closes even if write fails.
            with open(OUTPUT_FILE_NAME, 'w', -1, "utf-8") as output_file:
                output_file.write(result_text)
class GameCategory:
    """Log into Naver with Selenium and crawl a Naver cafe game board,
    saving each post's text to GameCategory/GameCategoryNNNNN.txt.

    All work happens in __init__; instantiating the class performs the crawl.
    """

    crawlingUtil = CrawUtil()  # shared scraping helper
    # Chromedriver path,
    # e.g. 'C://Program Files (x86)/Google/Chrome/Application/chromedriver.exe'
    chromePath = ''
    # NOTE(review): launching the browser at class-definition (import) time is
    # a side effect; preserved for compatibility with code that may reference
    # GameCategory.driver, but consider moving into __init__.
    driver = webdriver.Chrome(chromePath)
    ID = ''    # Naver login id
    PASS = ''  # Naver login password

    def __init__(self, count, rootPath):
        """Crawl *count* board pages; *rootPath* contains the output dir."""
        # Log into Naver first; the cafe pages need an authenticated session.
        self.seleniumSetting()
        path = rootPath + '/GameCategory'
        # Create the folder if needed and count existing files, so new files
        # continue the numbering sequence.
        fileNum = self.crawlingUtil.isInDirectory(path)
        p = 0  # posts skipped because their body could not be read
        increaseCount = 0  # running article counter across all pages
        for i in range(count):
            html = urlopen(
                'https://cafe.naver.com/ArticleList.nhn?search.clubid=26377973'
                '&search.menuid=58&search.boardtype=L&search.totalCount=151'
                '&search.page=' + str(i + 1)).read()
            soup = bs(html, 'html.parser', from_encoding='MS949')
            information_list = soup.find(
                'div', class_='article-board m-tcol-c').find_all(
                'a', class_='m-tcol-c')
            for link in information_list:
                if link['href'] != '#':  # skip placeholder anchors
                    increaseCount += 1
                    p = self.detailFinder(link['href'], increaseCount,
                                          fileNum, p)

    def detailFinder(self, detailURL, count, fileNum, p):
        """Open one post, extract its body text and write it to disk.

        Returns the updated skip counter *p* (incremented when the post body
        could not be located), which keeps the file numbering dense.
        """
        self.driver.get('https://cafe.naver.com' + detailURL)
        time.sleep(1)
        # BUG FIX: the original referenced .default_content without calling
        # it (a no-op attribute access); it must be invoked to return to the
        # top-level document before selecting the cafe frame.
        self.driver.switch_to.default_content()
        # Naver cafes render the post inside the 'cafe_main' iframe.
        self.driver.switch_to.frame('cafe_main')
        time.sleep(1)
        try:
            upName = self.driver.find_element_by_class_name(
                'NHN_Writeform_Main')
            pText = upName.find_elements_by_tag_name('table')
            OUTPUT_FILE_NAME = 'GameCategory/GameCategory%05d.txt' % (
                count + fileNum - p)
            print(OUTPUT_FILE_NAME)
            # Context manager guarantees the handle closes even on failure.
            with open(OUTPUT_FILE_NAME, 'w', -1, "utf-8") as output_file:
                output_file.write(pText[3].text)
        except Exception:
            # Post layout did not match (element or table missing); skip it.
            # Narrowed from a bare except so SystemExit and friends propagate.
            p += 1
        time.sleep(1)
        return p

    def copy_input(self, xpath, input):
        """Paste *input* into the element at *xpath* via the clipboard.

        Clipboard paste avoids per-keystroke automation detection on the
        Naver login form.
        """
        pyperclip.copy(input)
        self.driver.find_element_by_xpath(xpath).click()
        # Paste the clipboard contents (Ctrl+V).
        ActionChains(self.driver).key_down(Keys.CONTROL).send_keys(
            'v').key_up(Keys.CONTROL).perform()
        time.sleep(1)

    def seleniumSetting(self):
        """Log into naver.com using self.ID / self.PASS."""
        self.driver.implicitly_wait(3)
        self.driver.get('https://www.naver.com/')
        loginButton = self.driver.find_element_by_xpath(
            '//*[@id="account"]/div/a/i')
        loginButton.click()
        time.sleep(3)
        self.copy_input('//*[@id="id"]', self.ID)
        time.sleep(1)
        self.copy_input('//*[@id="pw"]', self.PASS)
        time.sleep(1)
        self.driver.find_element_by_xpath(
            '//*[@id="frmNIDLogin"]/fieldset/input').click()
        time.sleep(1)