# Imports shared by the parsers in this file (BeautifulSoup + Selenium).
# The project-internal helpers they call (chromedriver, Domain_check,
# date_cut, post_wash, img_size, push_lastly_post, everytime, daum, ...)
# are defined elsewhere in this repo.
import datetime
import time

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def Parsing_list_url(URL, page_url):
    List = []
    domain = Domain_check(URL['url'])

    # If a driver is already running, quit it; otherwise just continue
    try:
        driver.quit()
    except:
        pass

    driver = chromedriver()
    driver.get(page_url)
    try:
        # Assume AJAX loading is complete once "td.aL" appears
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "td.aL")))
    except:
        # Give the page one more chance to finish loading
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "td.aL")))
        except:
            return (driver, List)

    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')
    posts = bs.find("div", {"class": "tbl_container"}).find("tbody").findAll("tr")
    for post in posts:
        if post.find("th") != None:
            continue
        if len(post.find("td").text) <= 1:
            continue
        url_done = domain + "/" + post.find("td", {"class": "aL"}).find("a")['href']
        List.append(url_done)
    data = (driver, List)
    driver.quit()
    return data
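# All of the parsers in this file call chromedriver() and Domain_check()
# without defining them, so a minimal sketch of both follows. This is an
# illustrative assumption of what they do, not the project's actual
# implementation: the Chrome options chosen here and the urlparse-based
# domain extraction are guesses consistent with how the functions use the
# return values (a live WebDriver, and a "scheme://host" prefix).
from urllib.parse import urlparse

from selenium import webdriver


def chromedriver_sketch():
    # Hypothetical stand-in for the project's chromedriver() helper:
    # returns a headless Chrome session.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    return webdriver.Chrome(options=options)


def Domain_check_sketch(url):
    # Hypothetical stand-in for Domain_check(): reduce a full URL to its
    # scheme + host, e.g. "https://cafe.daum.net/a/b" -> "https://cafe.daum.net".
    parsed = urlparse(url)
    return parsed.scheme + "://" + parsed.netloc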
def Parsing_list_url(URL, page_url):
    List = []
    domain = Domain_check(URL['url'])

    # If a driver is already running, quit it; otherwise just continue
    try:
        driver.quit()
    except:
        pass

    driver = chromedriver()
    driver.get(page_url)
    WebDriverWait(driver, 100).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "span.li_list")))
    time.sleep(2)
    '''
    for i in range(int(num)):
        driver.find_element_by_xpath('//*[@id="paging"]/li[4]/a').click()
        WebDriverWait(driver, 100).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "span.li_num")))
    '''
    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')
    try:
        posts1 = bs.find("ul", {"class": 'listContent'}).findAll("li")
        posts2 = bs.find("ul", {"class": 'listContent mb20'}).findAll("li")
        posts = posts1 + posts2
    except:
        data = (driver, List)
        return data

    try:
        for post in posts:
            url = post.find("span", {"class": "li_subject li_list2"}).find("a")['onclick']
            url = url.split("'")[1]
            url = domain + url
            List.append(url)
    except:
        List = []

    data = (driver, List)
    return data
def Parsing_list_url(URL, page_url):
    List = []
    domain = Domain_check(URL['url'])

    # If a driver is already running, quit it; otherwise just continue
    try:
        driver.quit()
    except:
        pass

    driver = chromedriver()
    List.append(page_url)
    data = (driver, List)
    return data
def Parsing_list_url(URL, page_url):
    List = []

    # If a driver is already running, quit it; otherwise just continue
    try:
        driver.quit()
    except:
        pass

    driver = chromedriver()
    driver.get(page_url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "searchForm")))
    time.sleep(2)
    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')
    try:
        posts = bs.find("div", {"class": "table_wrap"}).find("tbody").find_all("tr")
    except:
        data = (driver, List)
        return data

    try:
        for post in posts:
            url = (post.find("a")["href"]).split("'")[1]
            url = ("https://www.youthcenter.go.kr/board/boardDetail.do?bbsNo=3&ntceStno="
                   + url + "&pageUrl=board%2Fboard&orderBy=REG_DTM&orderMode=DESC")
            List.append(url)
    except:
        List = []

    data = (driver, List)
    return data
def Parsing_list_url(URL, page_url):
    List = []
    domain = Domain_check(URL['url'])

    # If a driver is already running, quit it; otherwise just continue
    try:
        driver.quit()
    except:
        pass

    driver = chromedriver()
    driver = everytime.login(driver)

    # In case the Everytime board has been removed
    try:
        driver.get(page_url)
        driver.implicitly_wait(3)
    except:
        data = (driver, List)
        return data

    WebDriverWait(driver, 100).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "a.article")))
    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')
    posts = bs.find("div", {"class": 'wrap articles'}).findAll("article")
    if len(posts) == 1:
        # The board has no posts at all
        pass
    else:
        for post in posts:
            url = post.find("a")['href']
            url = domain + url
            List.append(url)
    data = (driver, List)
    return data
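# Several parsers in this file call everytime.login(driver) before they can
# read a board. A minimal sketch of such a helper follows; the login URL is
# real, but the form field names ("userid", "password") and the
# EVERYTIME_ID / EVERYTIME_PW environment variables are illustrative
# assumptions, not the project's actual code.
import os


def everytime_login_sketch(driver):
    # Hypothetical stand-in for everytime.login(): submit the login form
    # and hand the authenticated driver back to the caller.
    driver.get("https://everytime.kr/login")
    driver.find_element_by_name("userid").send_keys(os.environ["EVERYTIME_ID"])
    driver.find_element_by_name("password").send_keys(os.environ["EVERYTIME_PW"])
    driver.find_element_by_xpath("//input[@type='submit']").click()
    driver.implicitly_wait(3)
    return driver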
def Parsing_post_data(driver, post_url, URL, recent_post):
    post_data_prepare = []
    domain = Domain_check(URL['url'])
    end_date = date_cut(URL['info'])
    now_num = 0
    repeat_num = 0
    post_driver = chromedriver()  # separate driver for the individual post pages

    driver.get(post_url)
    if URL['info'].split("_")[2] == "campustown":
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.card")))
        driver.find_element_by_xpath(
            '//*[@id="ct"]/div[5]/div/div[1]/div/button[2]').click()
        # //*[@id="ct"]/div[4]/div/div[1]/div/button[2]
    else:
        # Assume AJAX loading is complete once "div.area_text" appears
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.area_text")))

    last_posts = [0]
    while 1:
        driver.find_element_by_tag_name("body").send_keys(Keys.END)
        time.sleep(1)
        html = driver.page_source
        bs = BeautifulSoup(html, 'html.parser')
        posts = bs.find("div", {"class": 'wrap_postlist'}).findAll("div", {"class": "item"})

        # If the page cannot scroll any further, stop
        if len(last_posts) == len(posts):
            break
        else:
            last_posts = posts

        for post in posts[now_num:]:
            try:
                post_data = {}
                url = post.find("a", {"class": "link"})['href']
                url = domain + url
                try:
                    post_driver.get(url)
                except:
                    if len(post_data_prepare) == 0:
                        recent_post = None
                    else:
                        recent_post = post_data_prepare[0]['title']
                    data = (post_data_prepare, recent_post)
                    return data
                try:
                    # Assume AJAX loading is complete once "div.txt_area" appears
                    WebDriverWait(post_driver, 30).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "div.txt_area")))
                except:
                    if len(post_data_prepare) == 0:
                        recent_post = None
                    else:
                        recent_post = post_data_prepare[0]['title']
                    data = (post_data_prepare, recent_post)
                    return data

                html_post = post_driver.page_source
                bs_post = BeautifulSoup(html_post, 'html.parser')

                if URL['info'].split("_")[2] == "campustown":
                    title = bs_post.find("h3", {"class": "tit_h3"}).get_text(" ", strip=True)
                else:
                    if bs_post.find("div", {"class": "se-module se-module-text se-title-text"}) == None:
                        title = bs_post.find("h3", {"class": "tit_h3"}).get_text(" ", strip=True)
                    else:
                        title = bs_post.find("div", {
                            "class": "se-module se-module-text se-title-text"
                        }).find("span").get_text(" ", strip=True)

                if bs_post.find("p", {"class": "blog_date"}) == None:
                    date = bs_post.find("p", {"class": "se_date"}).get_text(" ", strip=True)
                else:
                    date = bs_post.find("p", {"class": "blog_date"}).get_text(" ", strip=True)

                # A relative date ("N hours/minutes/seconds ago") means the post is from today
                if date.find("시간") != -1 or date.find("분") != -1 or date.find("초") != -1:
                    now = datetime.datetime.now().strftime("%Y-%m-%d")
                    date = now + " 00:00:00"
                    date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
                else:
                    date = date + ":00"
                    date = str(datetime.datetime.strptime(date, "%Y. %m. %d. %H:%M:%S"))

                if URL['info'].split("_")[2] == "campustown":
                    phrase = bs_post.find("div", {'class': "post_ct"}).get_text(" ", strip=True)
                else:
                    phrase = bs_post.find("div", {'class': "se-main-container"}).get_text(" ", strip=True)
                phrase = post_wash(phrase)  # strip all extra whitespace from the post body

                if URL['info'].split("_")[2] == "campustown":
                    if bs_post.find("div", {'class': "post_ct"}).find("img", {"id": "img_1"}) is None:
                        img = 3
                    else:
                        # Take the first image in the post
                        img = bs_post.find("div", {"class": "post_ct"}).find("img", {"id": "img_1"})['src']
                        if 1000 <= len(img):
                            img = 3
                        else:
                            # Decide whether img is an internal or an external link
                            if img.startswith("http://") or img.startswith("https://"):
                                pass
                            elif img.startswith("//"):
                                img = "http:" + img
                            else:
                                img = domain + img
                else:
                    if bs_post.find("div", {"class": "se-main-container"}).find("img", {"id": "img_2"}) is None:
                        img = 3
                    else:
                        # Take the first image in the post
                        img = bs_post.find("div", {"class": "se-main-container"}).find("img", {"id": "img_2"})['src']
                        if 1000 <= len(img):
                            img = 3
                        else:
                            # Decide whether img is an internal or an external link
                            if img.startswith("http://") or img.startswith("https://"):
                                pass
                            elif img.startswith("//"):
                                img = "http:" + img
                            else:
                                img = domain + img

                if img != 3:
                    if img_size(img):
                        pass
                    else:
                        img = 3

                post_data['title'] = title.upper()
                post_data['author'] = "0"
                post_data['date'] = date
                post_data['post'] = phrase.lower()
                post_data['img'] = img
                post_data['url'] = "https://" + url[10:]  # strip the mobile "m." prefix
                print(date, "::::", title)

                # Stop once the post is older than end_date or matches the previous newest post
                if (date < end_date) or (title.upper() == recent_post):
                    break
                else:
                    post_data_prepare.append(post_data)
            except:
                continue

        now_num = len(posts)
        repeat_num += 1
        if (date <= end_date) or (title.upper() == recent_post):
            break

    if len(post_data_prepare) == 0:
        recent_post = None
    else:
        recent_post = post_data_prepare[0]['title']
    data = (post_data_prepare, recent_post)
    post_driver.close()
    return data
def Parsing_post_data(driver, post_url, URL, recent_post):
    post_data_prepare = []
    domain = Domain_check(URL['url'])
    end_date = date_cut(URL['info'])
    now_num = 0
    repeat_num = 0
    post_driver = chromedriver()  # separate driver for the individual post pages

    driver.get(post_url)
    last_posts = [0]
    while 1:
        driver.find_element_by_tag_name("body").send_keys(Keys.END)
        time.sleep(1)
        html = driver.page_source
        bs = BeautifulSoup(html, 'html.parser')
        posts = bs.find("div", {"class": 'articlelist'}).find("ol", {"class": 'group'}).find_all("li")

        # If the page cannot scroll any further, stop
        if len(last_posts) == len(posts):
            break
        else:
            last_posts = posts

        for post in posts[now_num:]:
            try:
                post_data = {}
                url = post.find("a", {"class": "article"})['href']
                url = domain + url
                try:
                    post_driver.get(url)
                except:
                    if len(post_data_prepare) == 0:
                        recent_post = None
                    else:
                        recent_post = post_data_prepare[0]['title']
                    data = (post_data_prepare, recent_post)
                    return data
                try:
                    # Assume AJAX loading is complete once a <time> tag appears
                    WebDriverWait(post_driver, 30).until(
                        EC.presence_of_element_located((By.TAG_NAME, "time")))
                except:
                    if len(post_data_prepare) == 0:
                        recent_post = None
                    else:
                        recent_post = post_data_prepare[0]['title']
                    data = (post_data_prepare, recent_post)
                    return data

                html_post = post_driver.page_source
                bs_post = BeautifulSoup(html_post, 'html.parser')
                title = str(post.find("p", {"class": "text short"})
                            .get_text(" ", strip=True)).split("<br>")[0]
                date = bs_post.find("p", {"class": "profile"}).find("time").get_text(" ", strip=True)
                date_len = len(date.split("/"))
                # A two-part date means the year was omitted: the post is from this year
                if date_len == 2:
                    current_year = str(datetime.datetime.now().year)
                    date = current_year + '/' + date + ":00"
                    date = str(datetime.datetime.strptime(date, "%Y/%m/%d %H:%M:%S"))
                else:
                    date = str(datetime.datetime.strptime(date, "%Y/%m/%d %H:%M:%S"))

                phrase = bs_post.find("div", {"class": "articleitem"}).find("p", {"class": "text"}).get_text(" ", strip=True)
                phrase = post_wash(phrase)  # strip all extra whitespace from the post body

                if bs_post.find("div", {"class": "attaches full"}) is None:
                    img = 3
                else:
                    # Take the first image in the post
                    img = bs_post.find("div", {"class": "attaches full"}).find("img")["src"]
                    if 1000 <= len(img):
                        img = 3
                    else:
                        # Decide whether img is an internal or an external link
                        if img.startswith("http://") or img.startswith("https://"):
                            pass
                        elif img.startswith("//"):
                            img = "http:" + img
                        else:
                            img = domain + img
                if img != 3:
                    if img_size(img):
                        pass
                    else:
                        img = 3

                post_data['title'] = title.upper()
                post_data['author'] = ""
                post_data['date'] = date
                post_data['post'] = phrase.lower()
                post_data['img'] = img
                post_data['url'] = url
                print(date, "::::", title)
                # Stop once the post is older than end_date or matches the previous newest post
                if (date < end_date) or (title.upper() == recent_post):
                    break
                else:
                    post_data_prepare.append(post_data)
            except:
                continue

        now_num = len(posts)
        repeat_num += 1
        if (date <= end_date) or (title.upper() == recent_post):
            break

    if len(post_data_prepare) == 0:
        recent_post = None
    else:
        recent_post = post_data_prepare[0]['title']
    data = (post_data_prepare, recent_post)
    post_driver.close()
    return data
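# The Parsing_post_data variants lean on three more shared helpers:
# date_cut() produces the end_date cutoff string, post_wash() normalizes the
# body text, and img_size() filters out unusable images. The sketches below
# show plausible implementations; the seven-day window, the whitespace rule,
# and the 10 KB threshold are assumptions for illustration only.
import datetime
import re

import requests


def date_cut_sketch(info):
    # Hypothetical date_cut(): posts older than this string are skipped.
    cutoff = datetime.datetime.now() - datetime.timedelta(days=7)
    return cutoff.strftime("%Y-%m-%d %H:%M:%S")


def post_wash_sketch(phrase):
    # Hypothetical post_wash(): collapse every whitespace run to one space.
    return re.sub(r"\s+", " ", phrase).strip()


def img_size_sketch(img_url):
    # Hypothetical img_size(): accept the image only if its Content-Length
    # clears an assumed 10 KB threshold.
    try:
        resp = requests.head(img_url, timeout=5)
        return int(resp.headers.get("Content-Length", 0)) >= 10240
    except requests.RequestException:
        return False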
def Parsing_list_url(URL, page_url, lastly_post, db, driver):
    List = []
    domain = Domain_check(URL['url'])
    end_date = date_cut(URL['info'])
    lastly_num = 0  # flag so the lastly_post update below runs only once
    # lastly_post = get_lastly_post(URL)  # fetch lastly_post

    try:
        driver.get(page_url)
    except:
        driver = chromedriver()
        driver = daum.login(driver)
        driver.get(page_url)

    # Skip a board whose page structure changed
    if URL['info'] == "sj30_sejongstation_news":
        data = (driver, List)
        return data

    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "td.headcate")))
    except:
        data = (driver, List)
        return data

    num = 1
    while 1:
        cnt = 0
        if URL['info'].split("_")[2] == 'qna':
            query = '//*[@id="primaryContent"]/table/tbody/tr[2]/td[2]/div[3]/div/a[' + str(num) + ']'
        else:
            query = '//*[@id="primaryContent"]/table/tbody/tr[2]/td[2]/div[2]/div/a[' + str(num) + ']'
        try:
            driver.find_element_by_xpath(query).click()
        except:
            data = (driver, List)
            return data
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "td.headcate")))
        except:
            driver.refresh()
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "td.headcate")))

        html = driver.page_source
        bs = BeautifulSoup(html, 'html.parser')
        posts = bs.find("table", {"class": "bbsList"}).find("tbody").findAll("tr")
        for post in posts:
            if post.find("td", {"class": "num"}).find("img") != None:
                continue
            title = post.find("td", {"class": "subject"}).find("a").get_text(" ", strip=True)
            if post.find("td", {"class": "date"}) == None:
                date = str(datetime.datetime.now())
            else:
                date = post.find("td", {"class": "date"}).text.strip()
            if date.find(":") != -1:
                now = datetime.datetime.now().strftime("%Y-%m-%d")
                date = now + " 00:00:00"
            else:
                date = "20" + date + " 00:00:00"
                date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
            if date + title == lastly_post:
                # Once we meet the previously newest post, set cnt = 0 and break
                # so only the List collected before it is sent
                cnt = 0
                lastly_num = 1
                break
            elif end_date <= date:
                url = post.find("td", {"class": "subject"}).find("a")['href']
                url = domain + url
                List.append(url)
                cnt += 1
        time.sleep(3)

        # Always pin lastly_post to the first non-notice post on the first page
        if lastly_num == 1 or lastly_post == 0:
            for post in posts:
                if post.find("td", {"class": "num"}).find("img") != None:
                    continue
                title = post.find("td", {"class": "subject"}).find("a").get_text(" ", strip=True)
                date = post.find("td", {"class": "date"}).text.strip()
                if date.find(":") != -1:
                    now = datetime.datetime.now().strftime("%Y-%m-%d")
                    date = now + " 00:00:00"
                else:
                    date = "20" + date + " 00:00:00"
                    date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
                lastly_post = date + title
                push_lastly_post(URL, lastly_post, db)
                break

        if cnt == 0:
            # Every date on the page was too old, so stop
            break
        else:
            # num drives the pager link; once it reaches 7, keep it fixed at 7
            if num == 7:
                pass
            else:
                num += 1

    data = (driver, List)
    time.sleep(2)
    return data
def everytime_all_board(URL, end_date, db):
    main_url = URL['url']
    board_search_url = "https://everytime.kr/community/search?keyword="
    board_search_word = ['게시판', '갤러리']  # search keywords: "board", "gallery"
    board_list = []

    # Connect the driver
    try:
        driver = chromedriver()
        driver = everytime.login(driver)
    except Exception as e:
        error_handler(e, URL, main_url, db)
        return

    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "a.article")))
    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')

    # Scrape the dynamic boards from the Everytime top menu =====================
    board_group_list = bs.find("div", {"id": "submenu"}).findAll('div', {"class": "group"})
    for board_group in board_group_list:
        try:
            board_li_list = board_group.find("ul").findAll("li")
            for board_li in board_li_list:
                board_li_dic = {}
                board_li_dic['tag'] = board_li.find("a").text
                if board_li.find("a").text.strip() == "더 보기":  # "more" link
                    continue
                else:
                    board_li_dic['url'] = main_url + board_li.find("a")['href']
                if board_li_dic['tag'].find("찾기") != -1:  # skip "find ..." entries
                    continue
                board_list.append(board_li_dic)
        except:
            continue

    # Scrape additional dynamic boards via keyword search
    for search_word in board_search_word:
        try:
            board_search_url_done = board_search_url + search_word
            driver.get(board_search_url_done)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "a.result")))
            html = driver.page_source
            bs = BeautifulSoup(html, 'html.parser')
            board_a_list = bs.find("div", {"class": "searchresults"}).findAll('a')
            for board_a in board_a_list:
                board_li_dic = {}
                board_li_dic['tag'] = board_a.find("h3").text
                board_li_dic['url'] = main_url + board_a.get('href')
                board_list.append(board_li_dic)
        except:
            continue
    # ===========================================================================

    # Loop over the collected boards
    for board in board_list:
        page = 1
        page_flag = 0
        board_url = board['url']
        page_url = Change_page(board_url, page)  # URL of the current list page
        print("\nTarget : ", URL['info'], " :: ", board['tag'])
        continue_handler(URL['info'] + " :: " + board['tag'], URL, page_url)

        # Page loop
        while True:
            # Restart the driver every 50 pages
            if page_flag == 50:
                page_flag = 0
                driver.quit()
                time.sleep(3)
                driver = chromedriver()
                driver = everytime.login(driver)
            try:
                print("page_url :::: ", page_url)  # current list URL
                print("Page : ", page)  # current page number
                post_urls = Parsing_list_url(main_url, page_url, driver, db)
                # Everytime lists are chronically flaky, so retry once
                if len(post_urls) == 0:
                    time.sleep(2)
                    post_urls = Parsing_list_url(main_url, page_url, driver, db)

                post_data_prepare = []
                # Post loop
                for post_url in post_urls:
                    get_post_data = Parsing_post_data(driver, post_url, URL, board['tag'], db)
                    if get_post_data == "error":
                        break
                    title = get_post_data[1]
                    date = get_post_data[2]
                    print(date, "::::", title)  # date and title of the post just crawled
                    # Skip posts older than end_date, append newer ones
                    if str(date) <= end_date:
                        continue
                    else:
                        post_data_prepare.append(get_post_data[0])

                add_cnt = db_manager(URL, post_data_prepare, db)
                print("add_OK : ", add_cnt)  # number of posts stored in the DB
                # If nothing new was stored, break; otherwise go to the next page
                if add_cnt == 0:
                    page_flag = 0
                    break
                else:
                    page_flag += 1
                    page += 1
                    page_url = Change_page(board_url, page)
            except Exception as e:
                error_handler(e, URL, page_url, db)
                driver.quit()
                time.sleep(3)
                driver = chromedriver()
                driver = everytime.login(driver)
                break

    # Disconnect the driver
    driver.quit()
def Parsing_post_data(driver, post_url, URL, recent_post):
    post_data_prepare = []
    domain = Domain_check(URL['url'])
    end_date = date_cut(URL['info'])
    now_num = 0
    repeat_num = 0
    post_driver = chromedriver()  # separate driver for the individual post pages

    driver.get(post_url)
    last_posts = [0]
    while 1:
        driver.find_element_by_tag_name("body").send_keys(Keys.END)
        time.sleep(1)
        html = driver.page_source
        bs = BeautifulSoup(html, 'html.parser')
        posts = bs.find("div", {"class": 'grid'}).find("div", {"class": 'grid-item'}).find_all("div", {"class": "grid-item"})

        # If the page cannot scroll any further, stop
        if len(last_posts) == len(posts):
            break
        else:
            last_posts = posts

        for post in posts[now_num:]:
            if post.find("div", {"class": "item onclick"}) is None or post.find("div", {"class": "item_wrap"}) is None:
                pass
            else:
                try:
                    post_data = {}
                    url = post.find("div", {"class": "item onclick"}).get("onclick").split("'")[1]
                    try:
                        post_driver.get(url)
                    except:
                        if len(post_data_prepare) == 0:
                            recent_post = None
                        else:
                            recent_post = post_data_prepare[0]['title']
                        data = (post_data_prepare, recent_post)
                        return data
                    try:
                        # Assume AJAX loading is complete once a <time> tag appears
                        WebDriverWait(post_driver, 30).until(
                            EC.presence_of_element_located((By.TAG_NAME, "time")))
                    except:
                        if len(post_data_prepare) == 0:
                            recent_post = None
                        else:
                            recent_post = post_data_prepare[0]['title']
                        data = (post_data_prepare, recent_post)
                        return data

                    html_post = post_driver.page_source
                    bs_post = BeautifulSoup(html_post, 'html.parser')
                    title = bs_post.find("div", {"class": "col-md-12 start_article_info mobile_pre_article"}).find("h1").get_text(" ", strip=True)
                    date = bs_post.find("div", {"class": "col-md-12 start_article_info mobile_pre_article"}).find("h3").get_text(" ", strip=True)
                    date = date + " 00:00:00"
                    date = str(datetime.datetime.strptime(date, "%Y . %m . %d %H:%M:%S"))
                    phrase = bs_post.find("div", {"class": "col-md-12 content_start"}).get_text(" ", strip=True)
                    phrase = post_wash(phrase)  # strip all extra whitespace from the post body

                    if bs_post.find("div", {"class": "col-md-12 single_header no_pc"}) is None:
                        img = 3
                    else:
                        # Take the first image in the post
                        img = bs_post.find("div", {"class": "col-md-12 single_header no_pc"}).find("img")["src"]
                        if 1000 <= len(img):
                            img = 3
                        else:
                            # Decide whether img is an internal or an external link
                            if img.startswith("http://") or img.startswith("https://"):
                                pass
                            elif img.startswith("//"):
                                img = "http:" + img
                            else:
                                img = domain + img
                    if img != 3:
                        if img_size(img):
                            pass
                        else:
                            img = 3

                    post_data['title'] = title.upper()
                    post_data['author'] = ""
                    post_data['date'] = date
                    post_data['post'] = phrase.lower()
                    post_data['img'] = img
                    post_data['url'] = url
                    print(date, "::::", title)
                    # Stop once the post is older than end_date or matches the previous newest post
                    if (date < end_date) or (title.upper() == recent_post):
                        break
                    else:
                        post_data_prepare.append(post_data)
                except:
                    continue

        now_num = len(posts)
        repeat_num += 1
        if (date <= end_date) or (title.upper() == recent_post):
            break

    if len(post_data_prepare) == 0:
        recent_post = None
    else:
        recent_post = post_data_prepare[0]['title']
    data = (post_data_prepare, recent_post)
    post_driver.close()
    return data
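# A hedged example of how one list/post parser pair is typically wired
# together, mirroring the loop in everytime_all_board() above. The URL record
# shape and Change_page() come from that function; the board URL and 'info'
# value below are placeholders, and the two-argument Parsing_list_url /
# four-argument Parsing_post_data variants from this file are assumed.
if __name__ == "__main__":
    URL = {"url": "https://example-board.kr", "info": "ex00_example_board"}
    driver, post_urls = Parsing_list_url(URL, Change_page(URL["url"], 1))
    recent_post = None
    for post_url in post_urls:
        posts, recent_post = Parsing_post_data(driver, post_url, URL, recent_post)
        print(len(posts), "posts collected from", post_url)
    driver.quit()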