class MusicCategory:
    """Crawl one listing page of the midiex.net score board and save each
    post's cleaned text to MusicCategory/MusicCategoryNNNNN.txt.

    All work happens in __init__; instantiating the class performs the crawl.
    """

    crawlingUtil = CrawUtil()  # shared scraping helper

    def __init__(self, pageNum, rootPath):
        """Crawl listing page *pageNum*; *rootPath* contains the output dir."""
        path = rootPath + '/MusicCategory'
        URL = 'http://midiex.net/bbs/board.php?bo_table=score&page=' + str(
            pageNum)
        links = self.crawlingUtil.music_get_link(URL)
        # Count of files already present, so new files continue the sequence.
        fileNum = self.crawlingUtil.isInDirectory(path)
        skipped = 0  # posts empty after cleaning; keeps numbering dense
        for count, link in enumerate(links):
            result_text = self.crawlingUtil.music_get_text(
                'http://midiex.net' + link)
            # Strip unwanted symbols.  NOTE(review): this non-raw pattern also
            # matches the literal letters 'n', 't', 'r' and backslashes, and
            # the .replace() calls below remove the literal substrings
            # 'xa0'/'u200b' rather than the characters '\xa0'/'\u200b'.
            # Looks accidental but is preserved as-is -- confirm intent.
            result_text = re.sub(
                "[-=+,#/\?:%$.@*\"※~&%!r\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]",
                "", result_text)
            result_text = result_text.replace('xa0', '')
            result_text = result_text.replace('u200b', '')
            if not result_text.strip():
                skipped += 1
                continue
            # NOTE(review): written relative to the CWD while the directory
            # check above used rootPath -- assumes CWD == rootPath; confirm.
            OUTPUT_FILE_NAME = 'MusicCategory/MusicCategory%05d.txt' % (
                count + fileNum - skipped)
            print(OUTPUT_FILE_NAME)
            # Context manager guarantees the handle closes even if write fails.
            with open(OUTPUT_FILE_NAME, 'w', -1, "utf-8") as output_file:
                output_file.write(result_text)
class PaintingCategory:
    """Crawl one listing page of the ruliweb hobby board and save each post's
    cleaned text to PaintCategory/PaintCategoryNNNNN.txt.

    All work happens in __init__; instantiating the class performs the crawl.
    """

    crawlingUtil = CrawUtil()  # shared scraping helper

    def __init__(self, pageNum, rootPath):
        """Crawl listing page *pageNum*; *rootPath* contains the output dir."""
        path = rootPath + '/PaintCategory'
        URL = 'https://bbs.ruliweb.com/hobby/board/300066?page=' + str(pageNum)
        links = self.crawlingUtil.paint_get_link(URL)
        # Count of files already present, so new files continue the sequence.
        fileNum = self.crawlingUtil.isInDirectory(path)
        # BUG FIX: the original reset the skip counter to 0 on every loop
        # iteration, so after any empty post the file numbering collided with
        # (and overwrote) earlier files.  It now accumulates across the loop,
        # matching the sibling category crawlers.
        skipped = 0  # posts empty after cleaning; keeps numbering dense
        print(links)
        for count, link in enumerate(links):
            result_text = self.crawlingUtil.paint_get_text(link)
            # Strip unwanted symbols.  NOTE(review): this non-raw pattern also
            # matches the literal letters 'n', 't', 'r' and backslashes, and
            # the .replace() calls below remove the literal substrings
            # 'xa0'/'u200b' rather than the characters '\xa0'/'\u200b'.
            # Looks accidental but is preserved as-is -- confirm intent.
            result_text = re.sub(
                "[-=+,#/\?:%$.@*\"※~&%!r\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]",
                "", result_text)
            result_text = result_text.replace('xa0', '')
            result_text = result_text.replace('u200b', '')
            if not result_text.strip():
                skipped += 1
                continue
            # NOTE(review): written relative to the CWD while the directory
            # check above used rootPath -- assumes CWD == rootPath; confirm.
            OUTPUT_FILE_NAME = 'PaintCategory/PaintCategory%05d.txt' % (
                count + fileNum - skipped)
            print(OUTPUT_FILE_NAME)
            # Context manager guarantees the handle closes even if write fails.
            with open(OUTPUT_FILE_NAME, 'w', -1, "utf-8") as output_file:
                output_file.write(result_text)
class TextCategory:
    """Crawl one listing page of the joara.com book list and save each work's
    cleaned text to TextCategory/TextCategoryNNNNN.txt.

    All work happens in __init__; instantiating the class performs the crawl.
    """

    crawlingUtil = CrawUtil()  # shared scraping helper

    def __init__(self, pageNum, rootPath):
        """Crawl listing page *pageNum*; *rootPath* contains the output dir."""
        path = rootPath + '/TextCategory'
        URL = ('http://www.joara.com/literature/view/book_list.html?page_no='
               + str(pageNum)
               + '&bookpart=&sl_type=&sl_chkcost=&sl_category=&sl_search='
                 '&sl_keyword=&sl_chk=&sl_minchapter=&sl_maxchapter='
                 '&sl_redate=&sl_orderby=&sl_othercategory=&list_type=normal'
                 '&sub_category=')
        links = self.crawlingUtil.text_get_link(URL)
        # Count of files already present, so new files continue the sequence.
        fileNum = self.crawlingUtil.isInDirectory(path)
        skipped = 0  # works empty after cleaning; keeps numbering dense
        for count, link in enumerate(links):
            result_text = self.crawlingUtil.text_get_text(
                'http://www.joara.com' + link)
            # Strip unwanted symbols.  NOTE(review): this non-raw pattern also
            # matches the literal letters 'n', 't', 'r' and backslashes, and
            # the .replace() calls below remove the literal substrings
            # 'xa0'/'u200b' rather than the characters '\xa0'/'\u200b'.
            # Looks accidental but is preserved as-is -- confirm intent.
            result_text = re.sub(
                "[-=+,#/\?:%$.@*\"※~&%!r\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]",
                "", result_text)
            result_text = result_text.replace('xa0', '')
            result_text = result_text.replace('u200b', '')
            if not result_text.strip():
                skipped += 1
                continue
            # NOTE(review): written relative to the CWD while the directory
            # check above used rootPath -- assumes CWD == rootPath; confirm.
            OUTPUT_FILE_NAME = 'TextCategory/TextCategory%05d.txt' % (
                count + fileNum - skipped)
            print(OUTPUT_FILE_NAME)
            # Context manager guarantees the handle closes even if write fails.
            with open(OUTPUT_FILE_NAME, 'w', -1, "utf-8") as output_file:
                output_file.write(result_text)
class TravelPhotoCategory:
    """Crawl one listing page of the tourtips.com column board and save each
    column's cleaned text to TravelPhotoCategory/TravelPhotoCategoryNNNNN.txt.

    All work happens in __init__; instantiating the class performs the crawl.
    """

    crawlingUtil = CrawUtil()  # shared scraping helper

    def __init__(self, pageNum, rootPath):
        """Crawl listing page *pageNum*; *rootPath* contains the output dir."""
        path = rootPath + '/TravelPhotoCategory'
        URL = 'http://www.tourtips.com/ap/column/list/?&page=' + str(pageNum)
        links = self.crawlingUtil.travel_photo_get_link(URL)
        # Count of files already present, so new files continue the sequence.
        fileNum = self.crawlingUtil.isInDirectory(path)
        skipped = 0  # columns empty after cleaning; keeps numbering dense
        for count, link in enumerate(links):
            result_text = self.crawlingUtil.travel_photo_get_text(
                'http://www.tourtips.com' + link)
            # Strip unwanted symbols.  NOTE(review): this non-raw pattern also
            # matches the literal letters 'n', 't', 'r' and backslashes, and
            # the .replace() calls below remove the literal substrings
            # 'xa0'/'u200b' rather than the characters '\xa0'/'\u200b'.
            # Looks accidental but is preserved as-is -- confirm intent.
            result_text = re.sub(
                "[-=+,#/\?:%$.@*\"※~&%!r\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]",
                "", result_text)
            result_text = result_text.replace('xa0', '')
            result_text = result_text.replace('u200b', '')
            if not result_text.strip():
                skipped += 1
                continue
            # NOTE(review): written relative to the CWD while the directory
            # check above used rootPath -- assumes CWD == rootPath; confirm.
            OUTPUT_FILE_NAME = 'TravelPhotoCategory/TravelPhotoCategory%05d.txt' % (
                count + fileNum - skipped)
            print(OUTPUT_FILE_NAME)
            # Context manager guarantees the handle closes even if write fails.
            with open(OUTPUT_FILE_NAME, 'w', -1, "utf-8") as output_file:
                output_file.write(result_text)
class FoodCategory:
    """Crawl one listing page of the recipekorea.com board (previously the
    82cook board) and save each post's cleaned text to
    FoodCategory/FoodCategoryNNNNN.txt.

    All work happens in __init__; instantiating the class performs the crawl.
    """

    crawlingUtil = CrawUtil()  # shared scraping helper

    def __init__(self, pageNum, rootPath):
        """Crawl listing page *pageNum*; *rootPath* contains the output dir."""
        path = rootPath + '/FoodCategory'
        # NOTE(review): 'ld_0502?&page=' contains a second '?' -- possibly a
        # typo for '&', but preserved byte-for-byte since the server may
        # tolerate it; confirm before changing.
        URL = 'http://recipekorea.com/bbs/board.php?bo_table=ld_0502?&page=' + str(pageNum)
        links = self.crawlingUtil.food_get_link(URL)
        # Count of files already present, so new files continue the sequence.
        fileNum = self.crawlingUtil.isInDirectory(path)
        skipped = 0  # posts empty after cleaning; keeps numbering dense
        for count, link in enumerate(links):
            result_text = self.crawlingUtil.food_get_text(link)
            # Strip unwanted symbols.  NOTE(review): this non-raw pattern also
            # matches the literal letters 'n', 't', 'r' and backslashes, and
            # the .replace() calls below remove the literal substrings
            # 'xa0'/'u200b' rather than the characters '\xa0'/'\u200b'.
            # Looks accidental but is preserved as-is -- confirm intent.
            result_text = re.sub(
                "[-=+,#/\?:%$.@*\"※~&%!r\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]",
                "", result_text)
            result_text = result_text.replace('xa0', '')
            result_text = result_text.replace('u200b', '')
            if not result_text.strip():
                skipped += 1
                continue
            # NOTE(review): written relative to the CWD while the directory
            # check above used rootPath -- assumes CWD == rootPath; confirm.
            OUTPUT_FILE_NAME = 'FoodCategory/FoodCategory%05d.txt' % (
                count + fileNum - skipped)
            print(OUTPUT_FILE_NAME)
            # Context manager guarantees the handle closes even if write fails.
            with open(OUTPUT_FILE_NAME, 'w', -1, "utf-8") as output_file:
                output_file.write(result_text)
class GameCategory:
    """Log into Naver with Selenium and crawl a Naver cafe game board,
    saving each post's text to GameCategory/GameCategoryNNNNN.txt.

    All work happens in __init__; instantiating the class performs the crawl.
    """

    crawlingUtil = CrawUtil()  # shared scraping helper
    # Chromedriver path,
    # e.g. 'C://Program Files (x86)/Google/Chrome/Application/chromedriver.exe'
    chromePath = ''
    # NOTE(review): launching the browser at class-definition (import) time is
    # a side effect; preserved for compatibility with code that may reference
    # GameCategory.driver, but consider moving into __init__.
    driver = webdriver.Chrome(chromePath)
    ID = ''    # Naver login id
    PASS = ''  # Naver login password

    def __init__(self, count, rootPath):
        """Crawl *count* board pages; *rootPath* contains the output dir."""
        # Log into Naver first; the cafe pages need an authenticated session.
        self.seleniumSetting()
        path = rootPath + '/GameCategory'
        # Create the folder if needed and count existing files, so new files
        # continue the numbering sequence.
        fileNum = self.crawlingUtil.isInDirectory(path)
        p = 0  # posts skipped because their body could not be read
        increaseCount = 0  # running article counter across all pages
        for i in range(count):
            html = urlopen(
                'https://cafe.naver.com/ArticleList.nhn?search.clubid=26377973'
                '&search.menuid=58&search.boardtype=L&search.totalCount=151'
                '&search.page=' + str(i + 1)).read()
            soup = bs(html, 'html.parser', from_encoding='MS949')
            information_list = soup.find(
                'div', class_='article-board m-tcol-c').find_all(
                'a', class_='m-tcol-c')
            for link in information_list:
                if link['href'] != '#':  # skip placeholder anchors
                    increaseCount += 1
                    p = self.detailFinder(link['href'], increaseCount,
                                          fileNum, p)

    def detailFinder(self, detailURL, count, fileNum, p):
        """Open one post, extract its body text and write it to disk.

        Returns the updated skip counter *p* (incremented when the post body
        could not be located), which keeps the file numbering dense.
        """
        self.driver.get('https://cafe.naver.com' + detailURL)
        time.sleep(1)
        # BUG FIX: the original referenced .default_content without calling
        # it (a no-op attribute access); it must be invoked to return to the
        # top-level document before selecting the cafe frame.
        self.driver.switch_to.default_content()
        # Naver cafes render the post inside the 'cafe_main' iframe.
        self.driver.switch_to.frame('cafe_main')
        time.sleep(1)
        try:
            upName = self.driver.find_element_by_class_name(
                'NHN_Writeform_Main')
            pText = upName.find_elements_by_tag_name('table')
            OUTPUT_FILE_NAME = 'GameCategory/GameCategory%05d.txt' % (
                count + fileNum - p)
            print(OUTPUT_FILE_NAME)
            # Context manager guarantees the handle closes even on failure.
            with open(OUTPUT_FILE_NAME, 'w', -1, "utf-8") as output_file:
                output_file.write(pText[3].text)
        except Exception:
            # Post layout did not match (element or table missing); skip it.
            # Narrowed from a bare except so SystemExit and friends propagate.
            p += 1
        time.sleep(1)
        return p

    def copy_input(self, xpath, input):
        """Paste *input* into the element at *xpath* via the clipboard.

        Clipboard paste avoids per-keystroke automation detection on the
        Naver login form.
        """
        pyperclip.copy(input)
        self.driver.find_element_by_xpath(xpath).click()
        # Paste the clipboard contents (Ctrl+V).
        ActionChains(self.driver).key_down(Keys.CONTROL).send_keys(
            'v').key_up(Keys.CONTROL).perform()
        time.sleep(1)

    def seleniumSetting(self):
        """Log into naver.com using self.ID / self.PASS."""
        self.driver.implicitly_wait(3)
        self.driver.get('https://www.naver.com/')
        loginButton = self.driver.find_element_by_xpath(
            '//*[@id="account"]/div/a/i')
        loginButton.click()
        time.sleep(3)
        self.copy_input('//*[@id="id"]', self.ID)
        time.sleep(1)
        self.copy_input('//*[@id="pw"]', self.PASS)
        time.sleep(1)
        self.driver.find_element_by_xpath(
            '//*[@id="frmNIDLogin"]/fieldset/input').click()
        time.sleep(1)