def kai_scan():
    """Scrape the AI industry association (지능정보산업협회) notice board and store new posts."""
    name = '지능정보산업협회'
    try:
        # Fetch the list of open-application notices.
        response = requests.get('http://www.k-ai.or.kr/kr/information/notice.php')
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        # Pull the listing apart via CSS selectors.
        pinned = soup.select('span.notice-icon')   # markers on pinned notices
        titles = soup.select('td > a')
        dates = soup.select('tr > td:nth-child(3)')

        # Load the previous checkpoint, then store the newest non-pinned title
        # as the new checkpoint (pinned notices occupy the first len(pinned) rows).
        check_point = mongo.check_point_read(name)['title']
        mongo.check_point_save(name, titles[len(pinned)].text.strip())

        for idx, anchor in enumerate(titles):
            title = anchor.text.strip()
            if check_point == title:
                # Everything from here on was already seen in a previous run.
                break
            link = 'http://www.k-ai.or.kr' + anchor.get('href')
            date = dates[idx].text
            pieces = date.split("~")
            try:
                edate = pieces[1]
                sdate = pieces[0]
            # Date text without a "start ~ end" range.
            except Exception:
                sdate = date
                edate = ''
            # Pinned notices may already be in the DB; skip duplicates.
            if mongo.is_saved(title) is None:
                mongo.post_save(name, title, link, sdate, edate)
                print('이름: ' + name + '\n제목:' + title + '\n링크: ' + link +
                      '\n날짜: ' + date + '\n')
    except Exception:
        message.site_error_push(name)
def btp_scan():
    """Scrape the Busan Technopark (부산테크노파크) business and notice boards.

    Walks pages 1-2 of each of the two boards; posts already present in the
    DB (``mongo.is_saved``) are skipped.  Any failure is reported through
    ``message.site_error_push``.
    """
    try:
        names = ['부산테크노파크_사업공고', '부산테크노파크_공지사항']
        urls = [
            'http://www.btp.or.kr/index.php?action=BD0000M&pagecode=P000000010&language=KR',
            'http://www.btp.or.kr/index.php?action=BD0000M&pagecode=P000000013&language=KR'
        ]
        for j in range(2):
            for pageindex in range(1, 3):  # pages 1-2
                req = requests.get(urls[j] + "&pageIndex=" + str(pageindex))
                req.encoding = 'utf-8'
                soup = BeautifulSoup(req.text, 'html.parser')

                titles = soup.select(' tr > td.ui-pleft20 > a')
                # The date column differs between the two boards.
                if j == 0:
                    dates = soup.select('tr > td:nth-child(3)')
                else:
                    dates = soup.select('tr > td:nth-child(4)')

                for i in range(len(titles)):
                    title = titles[i].text.strip()
                    # Pinned notices may already be stored; skip duplicates.
                    if mongo.is_saved(title) is None:
                        # href only carries the numeric post id; rebuild a view link.
                        # FIX: raw string for the regex — "\d" in a plain string is an
                        # invalid escape sequence (SyntaxWarning on modern Python).
                        param = re.findall(r"\d+", titles[i].get('href'))
                        link = urls[j] + '&command=View&idx=' + param[0]
                        # Strip a trailing "(...)" qualifier before parsing the range.
                        date = dates[i].text.split("(").pop(0).strip()
                        try:
                            edate = date.split("~").pop(1)
                            sdate = date.split("~").pop(0)
                        # No "start ~ end" range: treat the whole text as the end date.
                        except Exception:
                            sdate = ''
                            edate = date
                        mongo.post_save(names[j], title, link, sdate, edate)
                        print('이름: ' + names[j] + '\n제목:' + title + '\n링크: ' +
                              link + '\n날짜: ' + sdate + edate + '\n')
    except Exception:
        # FIX: typo in the site name ("부산테크노파트" -> "부산테크노파크") so the
        # error alert matches the board names used above.
        message.site_error_push(name="부산테크노파크")
def kotra_scan():
    """Scrape the KOTRA business board and store posts not yet in the DB."""
    name = 'Kotra'
    try:
        base = 'http://www.kotra.or.kr'
        # Fetch the list of open-application notices.
        response = requests.get(
            'http://www.kotra.or.kr/kh/business/busiList.do?&MENU_CD=T0503&TOP_MENU_CD=T0500&LEFT_MENU_CD=T0503&PARENT_MENU_CD=&CO_TYPE=undefined&boardType=0'
        )
        soup = BeautifulSoup(response.text, 'html.parser')

        # Pull the listing apart via CSS selectors.
        titles = soup.select('td > a')
        dates = soup.select('tr > td:nth-child(3)')

        # Remember the newest post as the checkpoint for the next run.
        mongo.check_point_save(name, titles[0].text.strip())

        for idx, anchor in enumerate(titles):
            title = anchor.text
            # href is a JS call; the real path is its first quoted argument.
            link = base + anchor.get('href').split("'").pop(1)
            date = dates[idx].text
            try:
                sdate = date.split(" ~ ").pop(0)
                edate = date.split(" ~ ").pop(1)
            # No range separator: rolling (open-ended) recruitment.
            except Exception:
                sdate = date
                edate = ''
            if mongo.is_saved(title) is None:
                mongo.post_save(name, title, link, sdate, edate)
                print('이름: ' + name + '\n제목: ' + title + '\n링크: ' + link +
                      '\n신청기간: ' + date + '\n')
    except Exception:
        message.site_error_push(name)
def nipa_scan():
    """Scrape both NIPA (정보통신산업진흥원) boards and store new posts."""
    names = ['정보통신산업진흥원1', '정보통신산업진흥원2']
    try:
        base = 'http://www.nipa.kr'
        uris = ['', '/biz/']
        urls = [
            'http://www.nipa.kr/board/boardList.it?boardNo=103&menuNo=32&page=1',
            'http://www.nipa.kr/biz/bizNotice.it?menuNo=18&page=1'
        ]
        for board_name, uri, list_url in zip(names, uris, urls):
            response = requests.get(list_url)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Pull the listing apart via CSS selectors.
            titles = soup.select('td > a')
            dates = soup.select('tr > td.date')

            # NOTE(review): check_point is read but never consulted below; it is
            # kept so a missing checkpoint record still fails the same way as
            # before — confirm before removing.
            check_point = mongo.check_point_read(board_name)['title']
            mongo.check_point_save(board_name, titles[0].text)

            for idx, anchor in enumerate(titles):
                title = anchor.text
                # onclick is a JS call; the target path is its first quoted argument.
                link = base + uri + anchor.get('onclick').split("'").pop(1)
                date = dates[idx].text
                if mongo.is_saved(title) is None:
                    mongo.post_save(board_name, title, link, date, '')
                    print('이름: ' + board_name + '\n제목: ' + title + '\n링크: ' +
                          link + '\n등록일: ' + date + '\n')
    except Exception:
        message.site_error_push(names[0])
def kstartup_scan(driver):
    """Scrape the K-Startup announcement board (pinned + regular notices) via Selenium.

    ``driver`` is a Selenium WebDriver already configured for the site.  New
    posts are stored through ``mongo.post_save``; a failure in either section
    is reported via ``message.site_error_push`` without aborting the other.
    """
    name = 'kstartup'
    # List page (filtered to open applications) and detail-page URL prefix.
    url = 'http://www.k-startup.go.kr/common/announcement/announcementList.do?mid=30004&bid=701&searchAppAt=A'
    board_url = 'http://www.k-startup.go.kr/common/announcement/announcementDetail.do?mid=30004&bid=701&searchPrefixCode=BOARD_701_001&searchPostSn='

    driver.get(url)
    driver.implicitly_wait(3)
    driver.find_element_by_tag_name('body').send_keys(Keys.END)

    # NOTE(review): read but never used below; kept so a missing checkpoint
    # record still fails the same way as before — confirm before removing.
    # (The unused `point_flag` local that sat next to it has been dropped.)
    check_point = mongo.check_point_read(name)['title']

    # Click "more" five times so five extra pages of notices are loaded.
    for _ in range(5):
        driver.find_element_by_xpath('//*[@id="listPlusAdd"]/a').click()
        driver.find_element_by_tag_name('body').send_keys(Keys.END)
        time.sleep(1)

    try:
        # Pinned (important) notices: ~10 entries, not in chronological order,
        # so every entry is checked against the DB rather than stopping early.
        impo_board = driver.find_element_by_class_name('ann_list_impor')
        for item in impo_board.find_elements_by_xpath('./li'):
            title = item.find_element_by_tag_name('a').text.strip()
            if mongo.is_saved(title) is None:
                _kstartup_save_post(name, board_url, item, title)
    except Exception:
        message.site_error_push(name + " < 중요공지부분 > ")

    try:
        # Regular notices: chronological, so stop at the first known/empty title.
        ann_board = driver.find_element_by_class_name('ann_list')
        for item in ann_board.find_elements_by_xpath('./li'):
            title = item.find_element_by_tag_name('a').text.strip()
            if mongo.is_saved(title) is None and title != "":
                _kstartup_save_post(name, board_url, item, title)
            else:
                break
    except Exception:
        message.site_error_push(name + " < 일반 공지 부분 > ")


def _kstartup_save_post(name, board_url, item, title):
    """Extract due date and link from one notice <li> and persist it (shared by both loops)."""
    # Due date may be absent — treated as rolling recruitment ("상시모집").
    try:
        due_date = re.findall(
            r"\d{4}-\d{2}-\d{2}",  # raw string: avoids invalid-escape warning
            item.find_element_by_xpath('./ul/li[3]').text.strip())
        date = due_date[0]
    except Exception:
        date = "상시모집"

    # The anchor's href is a JS call; its numeric arguments decide the target:
    # exactly two numbers -> bi.go.kr post (seq + regist date), more -> k-startup post.
    params = re.findall(r"\d+",
                        item.find_element_by_tag_name('a').get_attribute('href'))
    if len(params) == 2:
        # BUG FIX: the "&registDate=" query parameter had been mojibake'd to
        # "(R)istDate=" (HTML entity "&reg" decoded to the registered-sign
        # character), producing broken bi.go.kr links.
        link = ("http://www.bi.go.kr/board/editView.do?boardVO.viewFlag=view"
                "&boardID=NOTICE&postSeq=" + params[0] +
                "&registDate=" + params[1])
    elif len(params) > 2:
        link = board_url + params[2]
    else:
        link = "링크오류"

    mongo.post_save(name, title, link, '', date)
    print('이름: ' + name + '\n제목:' + title + '\n링크: ' + link +
          '\n마감일: ' + date + '\n')