def Crawling(URL, db):
	driver = None
	info_name = URL['info'].split('_')
	crawling_name = info_name[0]	#select which board crawler module to use
	page = 1
	main_url = URL['url']	#board URL, used when switching pages
	page_url = eval(crawling_name + '.Change_page(main_url, page)')	#URL of the current list page
	end_date = date_cut(URL['info'])	# extract end_date
	if crawling_name in ["sj34"]:		# exception: dynamic board
		sj34.everytime_all_board(URL, end_date, db)
		return
	if crawling_name in ["sj20"]:		# excluded board
		return

	#print the info of the board currently being crawled
	print("Target : ", URL['info'])
	continue_handler(URL['info'], URL, page_url)

	#decide whether this board should be crawled
	if not is_crawling(db, URL['info']):
		return

	while True:
		if crawling_name in ["sj23", "sj26", "sj27", "sj28", "sj30", "sj44"]:
			lastly_post = get_lastly_post(URL, db)
		try:
			print("\npage_url :::: ", page_url)	#현재 url 출력
			print("Page : ", page)				#현재 페이지 출력
			#driver_page 생성---------------------------
			if crawling_name in ['sj10']:
				driver_page = URLparser_EUCKR(page_url)
			elif crawling_name in ['sj12']:
				driver_page = URLparser_UTF8(page_url)
			else:
				driver_page = URLparser(page_url)
			#-------------------------------------------
			#boards crawled with Selenium----------------------------------------------------------------------------------------------
			if crawling_name in ["sj23", "sj26", "sj27", "sj28", "sj29", "sj38", "sj44"]:
				data = eval(crawling_name + '.Parsing_list_url(URL, page_url)')
				driver = data[0]
				post_urls = data[1]
			elif crawling_name in ["sj30"]:#---------------------------세종대역 예외처리
				data = eval(crawling_name + '.Parsing_list_url(URL, page_url, lastly_post, db, driver)')
				driver = data[0]
				post_urls = data[1]
			#boards crawled with Requests----------------------------------------------------------------------------------------------
			else:
				#boards that require login-------------------------------------------------------------------------------
				if URL['login'] == 1:
					post_urls = eval(crawling_name + '.Parsing_list_url(URL, page_url)')
				#boards that do not require login---------------------------------------------------------------------------
				else:
					if driver_page is None:		#break if the connection failed
						error_handler("driver_none", URL, page_url, db)
						break
					else:
						#parser selection--------------------------------------------------
						if crawling_name in ['sj10']:
							bs_page = BeautifulSoup(driver_page, 'lxml')
						else:
							bs_page = BeautifulSoup(driver_page, 'html.parser')
						#--------------------------------------------------------------
					post_urls = eval(crawling_name + '.Parsing_list_url(URL, bs_page)')
				#-----------------------------------------------------------------------------------------------
			#-----------------------------------------------------------------------------------------------------------------
			#get_post_data format : [post info dictionary, title, date]-------------------------------------------------------------------------------------------------------
			#date format is "0000-00-00 00:00:00"
			post_data_prepare = []
			for post_url in post_urls:
				#boards crawled with Selenium--------------------------------------------------------------------------------------------------------------------
				if crawling_name in ['sj29', 'sj30']:#------------------boards that follow the standard layout
					get_post_data = eval(crawling_name + '.Parsing_post_data(driver, post_url, URL)')
				#---------------------------------------------------------------------------------------------------boards that do not follow the standard layout
				elif crawling_name in ['sj23', 'sj26', 'sj27', 'sj28', 'sj44']:
					data = eval(crawling_name + '.Parsing_post_data(driver, post_url, URL, lastly_post)')
					post_data_prepare = data[0]
					lastly_post = data[1]
					if lastly_post is None:
						pass
					else:
						push_lastly_post(URL, lastly_post, db)
				#boards crawled with Requests--------------------------------------------------------------------------------------------------------------------
				else:
					#create driver_post--------------------------------
					if crawling_name in ["sj21", "sj4", "sj5", "sj8", "sj16"]: #---boards that do not need driver_post
						pass
					elif crawling_name in ['sj10', 'sj33']:
						driver_post = URLparser_EUCKR(post_url)
					elif crawling_name in ['sj12']:
						driver_post = URLparser_UTF8(post_url)
					else:
						driver_post = URLparser(post_url)
					#------------------------------------------------
					#-----------------------------------------------------------------------------------------------Wikipedia-style structure
					if crawling_name in ['sj21']:
						get_post_data = eval(crawling_name + '.Parsing_post_data(post_url, URL)')
					#-----------------------------------------------------------------------------------------------structure that does not follow the standard board layout
					elif crawling_name in ["sj4", "sj5", "sj8", "sj16"]:
						post_data_prepare = eval(crawling_name + '.Parsing_post_data(post_url, URL)')
						break
					#-----------------------------------------------------------------------------------------------standard board layout
					else:
						if driver_post is None:		#break if the connection failed
							error_handler("driver_none", URL, page_url, db)
							break
						else:
							#parser selection-------------------------------------------
							if crawling_name in ['sj10']:
								bs_post = BeautifulSoup(driver_post, 'lxml')
							elif crawling_name in ['sj12']:
								bs_post = driver_post
							else:
								bs_post = BeautifulSoup(driver_post, 'html.parser')
							#-------------------------------------------------------
						get_post_data = eval(crawling_name + '.Parsing_post_data(bs_post, post_url, URL)')
				#-----------------------------------------------------------------------------------------------------------------------------------
				
				#post_data_prepare is already complete-----------------------------------------------------------------------
				if crawling_name in ["sj4", "sj5", "sj8", "sj16", "sj23", "sj26", "sj27", "sj28", "sj44"]:
					pass
				#post_data_prepare is not complete yet---------------------------------------------------------------------
				else:
					if get_post_data is None:	#invalid post data
						continue
					title = get_post_data[1]
					date = get_post_data[2]
		
					print(date, "::::", title)	#현재 크롤링한 포스트의 date, title 출력
		
					#if the post is older than end_date skip it, otherwise append it
					if str(date) <= end_date:
						continue
					else:
						post_data_prepare.append(get_post_data[0])
			#----------------------------------------------------------------------------------------------------------
			#--------------------------------------------------------------------------------------------------------------------------------------------------------------
			add_cnt = db_manager(URL, post_data_prepare, db)
			print("add_OK : ", add_cnt)	#DB에 저장된 게시글 수 출력
		
			#quit the driver (when Selenium was used)
			if crawling_name in ["sj23", "sj26", "sj27", "sj28", "sj29", "sj30", "sj38", "sj44"]:
				driver.quit()
			
			#if no posts were added to the DB break, otherwise go to the next page
			if add_cnt == 0:
				break
			else:
				page += 1
				page_url = eval(crawling_name + '.Change_page(main_url, page)')
		# Error handler : if crawling fails, log the error and stop crawling.
		except Exception as e:
			error_handler(e, URL, page_url, db)
			break
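
For context, a minimal usage sketch (not part of the original project): it assumes a URL record carrying the 'info', 'url' and 'login' keys the function reads above, and a db handle compatible with whatever db_manager() and is_crawling() expect; the concrete values below are hypothetical.

if __name__ == "__main__":
	db = ...  # hypothetical DB handle, e.g. whatever database object db_manager() expects
	URL = {
		"info": "sj30_sejongstation_news",   # "<crawler module>_<board>_<category>"
		"url": "https://example.com/board",  # hypothetical board list URL
		"login": 0,                          # 1 if the board requires a login session
	}
	Crawling(URL, db)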
Example #2
def Parsing_list_url(URL, page_url, lastly_post, db, driver):
    List = []
    domain = Domain_check(URL['url'])
    end_date = date_cut(URL['info'])
    lastly_num = 0  #flag so the following block runs only once
    #lastly_post = get_lastly_post(URL)	#fetch lastly_post
    try:
        driver.get(page_url)
    except:
        driver = chromedriver()
        driver = daum.login(driver)
        driver.get(page_url)

    #exception: this board's page structure changed
    if URL['info'] == "sj30_sejongstation_news":
        data = (driver, List)
        return data

    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "td.headcate")))
    except:
        data = (driver, List)
        return data
    num = 1
    while 1:
        cnt = 0
        if URL['info'].split("_")[2] == 'qna':
            query = '//*[@id="primaryContent"]/table/tbody/tr[2]/td[2]/div[3]/div/a[' + str(
                num) + ']'
        else:
            query = '//*[@id="primaryContent"]/table/tbody/tr[2]/td[2]/div[2]/div/a[' + str(
                num) + ']'
        try:
            driver.find_element_by_xpath(query).click()
        except:
            data = (driver, List)
            return data
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "td.headcate")))
        except:
            driver.refresh()
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "td.headcate")))
        html = driver.page_source
        bs = BeautifulSoup(html, 'html.parser')
        posts = bs.find("table", {
            "class": "bbsList"
        }).find("tbody").findAll("tr")

        for post in posts:
            if post.find("td", {"class": "num"}).find("img") != None:
                continue
            title = post.find("td", {
                "class": "subject"
            }).find("a").get_text(" ", strip=True)
            if post.find("td", {"class": "date"}) == None:
                date = datetime.now()
            else:
                date = post.find("td", {"class": "date"}).text.strip()
            if date.find(":") != -1:
                now = datetime.now().strftime("%Y-%m-%d")
                date = now + " 00:00:00"
            else:
                date = "20" + date + " 00:00:00"
                date = str(datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))

            if date + title == lastly_post:  #when the previously newest post is reached, set cnt = 0 and break so only the posts collected before it are returned
                cnt = 0
                lastly_num = 1
                break
            elif end_date <= date:
                url = post.find("td", {"class": "subject"}).find("a")['href']
                url = domain + url
                List.append(url)
                cnt += 1
        time.sleep(3)

        #always make the first non-notice post on the first page the new lastly_post
        if lastly_num == 1 or lastly_post == 0:
            for post in posts:
                if post.find("td", {"class": "num"}).find("img") != None:
                    continue
                title = post.find("td", {
                    "class": "subject"
                }).find("a").get_text(" ", strip=True)
                date = post.find("td", {"class": "date"}).text.strip()
                if date.find(":") != -1:
                    now = datetime.now().strftime("%Y-%m-%d")
                    date = now + " 00:00:00"
                else:
                    date = "20" + date + " 00:00:00"
                    date = str(datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
                lastly_post = date + title
                push_lastly_post(URL, lastly_post, db)
                break

        if cnt == 0:  #break when every date on the page is older than end_date
            break
        else:  #num moves to the next page; once it reaches 7, keep it fixed at 7
            if num == 7:
                pass
            else:
                num += 1

    data = (driver, List)
    time.sleep(2)
    return data
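
The two date branches above (a bare time such as "14:32" for today's posts, a short "yy.mm.dd" date for older ones) boil down to one normalization step. Below is a self-contained sketch of that same normalization, not a function from the original module:

from datetime import datetime

def normalize_board_date(raw):
    """Sketch: normalize a board list date to 'YYYY-MM-DD 00:00:00'.
    Today's posts show only a time ('14:32'); older posts show 'yy.mm.dd'."""
    if ":" in raw:  # today's post: keep today's date, drop the time
        return datetime.now().strftime("%Y-%m-%d") + " 00:00:00"
    full = "20" + raw + " 00:00:00"  # e.g. '21.03.02' -> '2021.03.02 00:00:00'
    return str(datetime.strptime(full, "%Y.%m.%d %H:%M:%S"))

# normalize_board_date("21.03.02") -> '2021-03-02 00:00:00'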