def report_main(jm_code, rcp_no):
    """Store the compensation/audit committee flags of one DART filing.

    Opens the DART viewer page for ``rcp_no``, reads the two committee flags
    through ``get_tab``/``get_board_yn`` and inserts a single row into
    ``proxy700_tmp``.

    Args:
        jm_code: Stock code written to the first column of the row.
        rcp_no: DART receipt number used to build the viewer URL.
    """
    # Pre-bind every resource so the cleanup below never hits an unbound
    # name when an earlier step fails.
    driver = None
    conn = None
    cursor = None
    try:
        # Driver setup (resolution / notice viewer page).
        driver = get_driver(
            'C:\\Users\\admin\\PycharmProjects\\webCrawl\\chromedriver.exe',
            'http://dart.fss.or.kr/dsaf001/main.do?rcpNo={0}'.format(rcp_no))

        # Receipt-number history of the filing.
        rcpno_list = get_rcpno_list(driver)
        # Year of the first document. Not used below; the lookup still
        # raises IndexError on an empty history.
        first_rcp_yy = rcpno_list[0][:4]

        conn = get_dbcon('esg')
        cursor = conn.cursor()

        # Compensation committee present?
        get_tab(driver, 'b')
        bosang_yn = get_board_yn(driver, 'b')
        print(bosang_yn)
        driver.switch_to_default_content()

        # Audit committee present?
        get_tab(driver, 'g')
        gamsa_yn = get_board_yn(driver, 'g')
        print(gamsa_yn)

        # DB insert.
        # NOTE(review): the year column is hard-coded to '2018' -- confirm
        # whether it should come from first_rcp_yy instead.
        # NOTE(review): the query is built by string formatting; switch to
        # driver-side parameter binding once the DB driver's paramstyle is
        # confirmed.
        insert_qry = """insert into proxy700_tmp values('{0}', '{1}', '{2}', '{3}')""".format(
            jm_code, '2018', bosang_yn, gamsa_yn)
        cursor.execute(insert_qry)
    finally:
        # Release in reverse order of acquisition; each step runs even when
        # the previous one raises.
        try:
            if cursor is not None:
                cursor.close()
        finally:
            try:
                if conn is not None:
                    close_dbcon(conn)
            finally:
                if driver is not None:
                    close_driver(driver)
def get_search_options():
    """Save the option URLs of every search filter to JSON files.

    Logs in with a Chrome driver, then for each filter (in sorted key order)
    fetches its options through ``search.get_filter_options_url`` and writes
    them to ``search/filt_<key>_urls.json``. The session is logged out and
    the driver closed afterwards, whether or not the loop succeeded.
    """
    browser = utils.start_driver('chrome')
    credential_path = str(pathlib.Path.home()) + '/plst.credential.json'
    session.login(browser, credential_path)
    utils.wait(3)

    # Filter key -> heading passed to the search helper.
    headings = dict(
        level='SKILL LEVELS',
        role='ROLES',
        subject='SUBJECTS TO LEARN',
        tool='TOOLS',
        cert='CERTIFICATIONS',
        author='AUTHORS',
    )

    try:
        for key in sorted(headings):
            options = search.get_filter_options_url(browser, headings[key])
            target = 'search/filt_{}_urls.json'.format(key)
            utils.save_json(options, target)
            utils.wait(10)
    finally:
        session.logout(browser)
        utils.wait(3)
        utils.close_driver(browser)
def tearDown(self):
    """ Cleanup the environment.

    Closes the test's driver and stops the proxy. The proxy is stopped even
    when closing the driver raises.
    """
    print('\nTearing down.')
    try:
        utils.close_driver(self.driver)
    finally:
        # Always stop the proxy, even if closing the driver failed.
        self.stop_proxy()
def get_course_videos(course_list, ncourse_max):
    """Download the videos of the courses listed in a text file.

    Each non-comment line of ``course_list`` is a course id. Courses whose
    output directory already holds the expected number of videos are
    skipped. The session is logged out and the driver closed when the
    function ends, including on error.

    Args:
        course_list: Path of a text file with one course id per line; lines
            starting with ``#`` and blank lines are ignored.
        ncourse_max: Stop after this many courses have been downloaded.

    Raises:
        RuntimeError: When the number of downloaded videos of a course does
            not match the number of clips in its video list.
    """
    cache_dir = 'cache'
    driver = utils.start_driver('chrome', download_dir=cache_dir)
    home_dir = str(pathlib.Path.home())
    session.login(driver, home_dir + '/plst.credential.json')
    utils.wait(3)

    try:
        ncourse = 0
        # Context manager so the list file is closed on every exit path.
        with open(course_list) as course_file:
            for line in course_file:
                if line.startswith('#'):
                    continue
                course_id = line.rstrip()
                if not course_id:
                    # A blank line is not a course id.
                    continue

                video_list, nmodule, nclip = load_course_video_list(course_id)
                nvideo = len(video_list)
                out_dir = 'courses/{}/videos'.format(course_id)
                os.makedirs(out_dir, exist_ok=True)
                if count_videos(out_dir) == nvideo:
                    # Already complete.
                    continue

                utils.print_message('get video of "{}" ({}/{}): {} modules and '
                                    '{} clips'.format(course_id, ncourse + 1,
                                                      ncourse_max, nmodule,
                                                      nclip))
                for module_id, clip_id, clip_url in video_list:
                    video_url = course.get_video_url(driver, clip_url)
                    # File name = last path segment of the URL, query dropped.
                    video_basename = video_url.split('?')[0].split('/')[-1]
                    video_name = '{}/{}.{}.{}'.format(out_dir, module_id,
                                                      clip_id, video_basename)
                    utils.download_file(video_url, video_name, verbose=True)

                ndownload = count_videos(out_dir)
                if ndownload != nvideo:
                    message = ('*ERROR*: course "{}", expected {} clips, '
                               'downloaded {}'.format(course_id, nvideo,
                                                      ndownload))
                    utils.print_message(message)
                    # Explicit exception instead of a bare ``raise`` with no
                    # active exception (which also surfaced as RuntimeError).
                    raise RuntimeError(message)

                ncourse += 1
                utils.print_message(
                    '----------------------------------------------')
                utils.wait(3)
                if ncourse == ncourse_max:
                    break
    finally:
        session.logout(driver)
        utils.wait(3)
        utils.close_driver(driver)
def get_rcpNo(jm_code, keywod, st_dt, ed_dt):
    """Search DART and return the receipt number of the top result.

    Only the first row of the result list is inspected, so the search has to
    be run at short intervals to avoid missing filings.

    Args:
        jm_code: Stock code typed into the company field.
        keywod: Keyword typed into the report-name field.
        st_dt: Currently unused (the date fields are not filled in).
        ed_dt: Currently unused (the date fields are not filled in).

    Returns:
        The 14-character receipt number, or ``0`` when there is no result,
        the extracted number is not 14 characters long, or the row's label
        contains '첨부' (attachment correction, excluded from collection).
    """
    driver = get_driver(
        'C:\\Users\\admin\\PycharmProjects\\webCrawl\\chromedriver.exe',
        'http://dart.fss.or.kr/dsab002/main.do#')
    # try/finally so the driver is closed on every return path and on error.
    try:
        driver.implicitly_wait(10)
        driver.find_element_by_name('textCrpNm').send_keys(jm_code)  # stock code
        # Period preset button.
        driver.find_element_by_xpath(
            '//*[@id="searchForm"]/fieldset/div/p[3]/span[2]/a[7]').click()
        driver.find_element_by_name('reportName').send_keys(keywod)  # keyword
        time.sleep(1)
        driver.find_element_by_xpath(
            '//*[@id="searchForm"]/fieldset/div/p[8]/input').click()  # search

        res_list = driver.find_elements_by_xpath(
            '//*[@id="listContents"]/div[1]/table/tbody/tr')  # result rows
        if not res_list:
            print('검색 결과가 없습니다.')
            return 0

        # Report link in the third cell of the top row.
        link = res_list[0].find_elements_by_tag_name(
            'td')[2].find_element_by_tag_name('a')
        rcp_no = link.get_attribute('href')[-14:]  # receipt number
        rcp_yn = link.find_element_by_tag_name('span').text  # correction label

        if len(rcp_no) != 14:
            print('rcpNo 형식이 다릅니다.')
            return 0
        if '첨부' in rcp_yn:
            print('첨부정정은 수집대상 제외')
            return 0
        return rcp_no
    finally:
        close_driver(driver)
def hando_main(jm_code, rcp_no, gijun_yy):
    """Print every item ``get_hando`` extracts from one DART filing.

    Args:
        jm_code: Stock code forwarded to ``get_hando``.
        rcp_no: DART receipt number used to build the viewer URL.
        gijun_yy: Base year forwarded to ``get_hando``.
    """
    # Driver setup.
    driver = get_driver(
        'C:\\Users\\admin\\PycharmProjects\\webCrawl\\chromedriver.exe',
        'http://dart.fss.or.kr/dsaf001/main.do?rcpNo={0}'.format(rcp_no))
    try:
        hando = list(get_hando(driver, jm_code, gijun_yy))
        for h in hando:
            print(h)
    finally:
        # Close the driver even when extraction fails.
        close_driver(driver)
def get_rcpNo(jm_code, keywod, target_ym='2018.12'):
    """Search DART (1-year period) and return a matching receipt number.

    The first result row is used by default. If any row's link text contains
    ``target_ym``, the last such row is used instead.

    Args:
        jm_code: Stock code typed into the company field.
        keywod: Keyword typed into the report-name field.
        target_ym: Substring looked for in each row's link text; a falsy
            value disables the preference. Defaults to ``'2018.12'``.

    Returns:
        The 14-character receipt number, or ``0`` when there is no result,
        the extracted number is not 14 characters long, or the row's label
        contains '첨부' (attachment correction, excluded from collection).
    """
    driver = get_driver(
        'C:\\Users\\admin\\PycharmProjects\\webCrawl\\chromedriver.exe',
        'http://dart.fss.or.kr/dsab002/main.do#')
    # try/finally so the driver is closed on every return path and on error.
    try:
        driver.find_element_by_name('textCrpNm').send_keys(jm_code)  # stock code
        driver.find_element_by_xpath(
            '//*[@id="searchForm"]/fieldset/div/p[3]/span[2]/a[4]').click()  # period: 1 year
        driver.find_element_by_id('reportName').send_keys(keywod)  # keyword
        driver.find_element_by_xpath(
            '//*[@id="searchForm"]/fieldset/div/p[8]/input').click()  # search

        res_list = driver.find_elements_by_xpath(
            '//*[@id="listContents"]/div[1]/table/tbody/tr')  # result rows
        if not res_list:
            print('검색 결과가 없습니다.')
            return 0

        def report_link(row):
            # Report link in the third cell of a result row.
            return row.find_elements_by_tag_name(
                'td')[2].find_element_by_tag_name('a')

        item = res_list[0]
        for row in res_list:
            # No break: the last matching row wins.
            if target_ym and target_ym in report_link(row).text:
                item = row

        link = report_link(item)
        rcp_no = link.get_attribute('href')[-14:]  # receipt number
        rcp_yn = link.find_element_by_tag_name('span').text  # correction label

        if len(rcp_no) != 14:
            print('rcpNo 형식이 다릅니다.')
            return 0
        if '첨부' in rcp_yn:
            print('첨부정정은 수집대상 제외')
            return 0
        return rcp_no
    finally:
        close_driver(driver)
def get_search_option_courses():
    """Save the course ids of every option of the selected search filters.

    For each filter (sorted by key) the option->URL mapping is loaded from
    ``search/filt_<key>_urls.json``. Each option (sorted by name) gets a
    1-based index, and its course ids are written to
    ``search/filt_<key>_courses/<index>.json`` unless that file exists.
    The session is logged out and the driver closed in all cases.
    """
    browser = utils.start_driver('chrome')
    credential_path = str(pathlib.Path.home()) + '/plst.credential.json'
    session.login(browser, credential_path)
    utils.wait(3)

    # Filters to process; 'level' and 'author' are not included.
    headings = dict(
        role='ROLES',
        subject='SUBJECTS TO LEARN',
        tool='TOOLS',
        cert='CERTIFICATIONS',
    )

    try:
        for key in sorted(headings):
            option_urls = utils.load_json(
                'search/filt_{}_urls.json'.format(key))
            target_dir = 'search/filt_{}_courses'.format(key)
            os.makedirs(target_dir, exist_ok=True)

            total = len(option_urls)
            ordered = sorted(option_urls.items())
            for position, (option, option_url) in enumerate(ordered, start=1):
                json_path = '{}/{}.json'.format(target_dir, position)
                if os.path.isfile(json_path):
                    # Already fetched.
                    continue

                utils.print_message(
                    'get all courses with filt={}, option={} ({}/{})'.format(
                        key, option, position, total))
                courses = search.get_all_courses_per_option(
                    browser, option_url, wait_time=10)
                utils.save_json({option: courses}, json_path)
                utils.wait(20)
    finally:
        session.logout(browser)
        utils.wait(3)
        utils.close_driver(browser)
def get_rcpNo(jm_code, keywod):
    """Search KIND (1-year period) and return data of the top result row.

    Only the first row of the result list is inspected, so the search has to
    be run at short intervals to avoid missing filings.

    Args:
        jm_code: Stock code typed into the company field.
        keywod: Keyword typed into the report-name field.

    Returns:
        ``(rcp_no, rcp_yn, rcp_gb)``: the 14-character receipt number, the
        correction label (always ``''`` here, it is not extracted) and the
        ``alt`` text of the image in the row's third cell.

    Raises:
        SystemExit: When there is no result or the extracted receipt number
            is not 14 characters long.
    """
    driver = get_driver(
        'C:\\Users\\admin\\PycharmProjects\\webCrawl\\chromedriver.exe',
        'http://kind.krx.co.kr/disclosure/details.do?method=searchDetailsMain#viewer')
    # try/finally so the driver is closed on sys.exit and on error as well.
    try:
        driver.find_element_by_xpath('//*[@id="AKCKwd"]').send_keys(jm_code)  # stock code
        time.sleep(1)
        driver.find_element_by_xpath(
            '//*[@id="search-btn-dates"]/ul/li[5]/a').send_keys(Keys.ENTER)  # period: 1 year
        time.sleep(1)

        # Untick "final report only" when it is ticked.
        last_report = driver.find_element_by_xpath('//*[@id="lastReport"]')
        if last_report.get_attribute('checked'):
            last_report.send_keys(Keys.ENTER)

        driver.find_element_by_id('reportNmTemp').send_keys(keywod)  # keyword
        time.sleep(1)
        driver.find_element_by_xpath(
            '//*[@id="searchForm"]/section[1]/div/div[3]/a[1]').send_keys(Keys.ENTER)  # search
        time.sleep(1)

        res_list = driver.find_elements_by_xpath(
            '//*[@id="main-contents"]/section[1]/table/tbody/tr')  # result rows
        if not res_list:
            print('검색 결과가 없습니다.')
            sys.exit(0)

        cells = res_list[0].find_elements_by_tag_name('td')
        # Receipt number is embedded near the end of the link's onclick text.
        rcp_no = cells[3].find_element_by_tag_name('a').get_attribute('onclick')[-19:-5]
        print(rcp_no)
        # Correction label is not extracted on this site.
        rcp_yn = ''
        # Market classification.
        rcp_gb = cells[2].find_element_by_tag_name('img').get_attribute('alt')

        if len(rcp_no) != 14:
            print('rcpNo 형식이 다릅니다.')
            sys.exit(0)
        # Never true while rcp_yn stays empty.
        if '첨부' in rcp_yn:
            print('첨부정정은 수집대상 제외')
            sys.exit(0)
        return rcp_no, rcp_yn, rcp_gb
    finally:
        close_driver(driver)
def get_rcpNo(jm_code, keywod, st_dt, en_dt):
    """Search DART within a date range and return the top result.

    Only the first row of the result list is inspected, so the search has to
    be run at short intervals to avoid missing filings.

    Args:
        jm_code: Stock code typed into the company field.
        keywod: Keyword typed into the report-name field.
        st_dt: Text typed into the start-date field.
        en_dt: Text typed into the end-date field.

    Returns:
        ``(rcp_no, rcp_yn)``: the 14-character receipt number and the text
        of the correction label inside the report link.

    Raises:
        SystemExit: When there is no result, the extracted number is not 14
            characters long, or the label contains '첨부' (attachment
            correction, excluded from collection).
    """
    driver = get_driver(
        'C:\\Users\\admin\\PycharmProjects\\webCrawl\\chromedriver.exe',
        'http://dart.fss.or.kr/dsab002/main.do#')
    # try/finally so the driver is closed on sys.exit and on error as well.
    try:
        driver.find_element_by_name('textCrpNm').send_keys(jm_code)  # stock code
        driver.find_element_by_xpath('//*[@id="startDate"]').send_keys(
            st_dt)  # period start
        driver.find_element_by_xpath('//*[@id="endDate"]').send_keys(
            en_dt)  # period end
        driver.find_element_by_id('reportName').send_keys(keywod)  # keyword
        driver.find_element_by_xpath(
            '//*[@id="searchForm"]/fieldset/div/p[8]/input').click()  # search

        res_list = driver.find_elements_by_xpath(
            '//*[@id="listContents"]/div[1]/table/tbody/tr')  # result rows
        if not res_list:
            print('검색 결과가 없습니다.')
            sys.exit(0)

        # Report link in the third cell of the top row.
        link = res_list[0].find_elements_by_tag_name(
            'td')[2].find_element_by_tag_name('a')
        rcp_no = link.get_attribute('href')[-14:]  # receipt number
        rcp_yn = link.find_element_by_tag_name('span').text  # correction label

        if len(rcp_no) != 14:
            print('rcpNo 형식이 다릅니다.')
            sys.exit(0)
        if '첨부' in rcp_yn:
            print('첨부정정은 수집대상 제외')
            sys.exit(0)
        return rcp_no, rcp_yn
    finally:
        close_driver(driver)
def bd_main(jm_code, rcp_no):
    """Upsert the ``proxy080`` row read from one DART filing.

    Reads five values via ``get_bd_table`` and updates the row with the same
    ``jm_code`` and ``bd_gijun_ymd`` if one exists, otherwise inserts it.

    Args:
        jm_code: Stock code, part of the row key.
        rcp_no: DART receipt number used to build the viewer URL.
    """
    # Pre-bind every resource so the cleanup below never hits an unbound
    # name when an earlier step fails.
    driver = None
    conn = None
    cursor = None
    try:
        # Driver setup (resolution / notice viewer page).
        driver = get_driver(
            'C:\\Users\\admin\\PycharmProjects\\webCrawl\\chromedriver.exe',
            'http://dart.fss.or.kr/dsaf001/main.do?rcpNo={0}'.format(rcp_no))
        driver.implicitly_wait(10)

        bd_gubun, bd_kind, bd_gum, bd_total, bd_gijun_ymd = get_bd_table(
            driver)

        conn = get_dbcon('esg')
        cursor = conn.cursor()

        # NOTE(review): queries are built by string formatting from scraped
        # text; switch to driver-side parameter binding once the DB driver's
        # paramstyle is confirmed.
        # Duplicate check, then update or insert.
        dup_select = """select * from proxy080
                        where jm_code = '{0}' and bd_gijun_ymd = '{1}'
                     """.format(jm_code, bd_gijun_ymd)
        cursor.execute(dup_select)

        if cursor.rowcount > 0:
            insert_qry = """update proxy080
                            set bd_gubun = '{2}', bd_kind = '{3}', bd_gum = {4}, bd_total = {5}
                            where jm_code = '{0}' and bd_gijun_ymd = '{1}'
                         """.format(jm_code, bd_gijun_ymd, bd_gubun, bd_kind,
                                    bd_gum, bd_total)
        else:
            insert_qry = """insert into proxy080 values('{0}', '{1}', '{2}', '{3}', {4}, {5})
                         """.format(jm_code, bd_gijun_ymd, bd_gubun, bd_kind,
                                    bd_gum, bd_total)
        cursor.execute(insert_qry)
    finally:
        # Release in reverse order of acquisition; each step runs even when
        # the previous one raises.
        try:
            if cursor is not None:
                cursor.close()
        finally:
            try:
                if conn is not None:
                    close_dbcon(conn)
            finally:
                if driver is not None:
                    close_driver(driver)
def get_all_htmls():
    '''Fetch every chapter page of the Chinese Bible.

    Walks book templates 1..73, collects the chapter links of each book and
    hands every chapter to ``get_content`` together with the target HTML and
    MP3 file names under ``<work_dir>/data/chinese_cn``. Any failure prints
    an error line and is re-raised; the driver is closed in all cases.
    '''
    browser = utils.start_driver('phantomjs', verbose=True)
    target_dir = '{}/data/chinese_cn'.format(work_dir)
    os.makedirs(target_dir, exist_ok=True)

    try:
        for book_no in range(1, 74):
            book_url = ('http://xiaozhushou.org/index.php/?m=bible&template={}'
                        .format(book_no))
            utils.open_url(browser, book_url, verbose=True)

            # Collect the hrefs first; the driver navigates away below.
            anchors = browser.find_elements_by_xpath(
                '//ul[@id="chapter_list"]/li/a')
            chapter_urls = [anchor.get_attribute('href') for anchor in anchors]

            book_tag = str(book_no).zfill(3)
            for chapter_url in chapter_urls:
                # Chapter id = text after the last '=' of the URL, zero padded.
                chapter_tag = chapter_url.split('=')[-1].zfill(3)
                html_path = ('{}/{}_{}_chapter.html'
                             .format(target_dir, book_tag, chapter_tag))
                audio_path = ('{}/{}_{}_audio.mp3'
                              .format(target_dir, book_tag, chapter_tag))
                get_content(browser, chapter_url, html_path, audio_path)
    except BaseException:
        print('*ERROR* something wrong')
        raise
    finally:
        utils.close_driver(browser, verbose=True)
def get_notice(jm_code, rcp_no, cursor):
    """Crawl one shareholder-meeting notice filing and store its sections.

    First creates the master row (``proxy011``) and the derived row for the
    notice, then runs the per-section extractors. Failures are logged to
    ``error_logger`` and not propagated.

    Args:
        jm_code: Stock code of the company.
        rcp_no: DART receipt number of the notice filing.
        cursor: Open DB cursor used for every query.

    Returns:
        ``0`` when a ``proxy011`` row with this ``rcp_no`` already exists,
        otherwise ``None``.
    """
    # Bound before the try so the finally block can tell whether a driver
    # was created (get_driver itself may raise).
    driver = None
    try:
        # Driver setup.
        driver = get_driver(
            'C:\\Users\\admin\\PycharmProjects\\webCrawl\\chromedriver.exe',
            'http://dart.fss.or.kr/dsaf001/main.do?rcpNo={0}'.format(rcp_no))

        # Receipt-number history of the notice.
        rcpno_list = get_rcpno_list(driver)

        # Year of the first document in the history.
        first_rcp_no = rcpno_list[0]
        first_rcp_yy = first_rcp_no[:4]

        # Receipt number that precedes rcp_no in the history ('' if none).
        pre_rcp_no = ''
        for i, candidate in enumerate(rcpno_list):
            if rcp_no == candidate and i > 0:
                pre_rcp_no = rcpno_list[i - 1]
                break
        print(rcpno_list, pre_rcp_no)

        # ------------------------- meeting notice -------------------------
        try:
            notice_gb, notice_tb, notice_ref, notice_etc = get_notice_data(
                rcp_no, driver)

            # NOTE(review): queries below are built by string formatting;
            # switch to parameter binding once the paramstyle is confirmed.
            # Duplicate check.
            dup_select = """select * from proxy011 where rcp_no = '{0}'""".format(
                rcp_no)
            cursor.execute(dup_select)
            dup_cnt = cursor.rowcount
            if dup_cnt > 0:
                return 0

            # Receipt number of the matching resolution, looked up only when
            # the first notice field is 8 characters long.
            res_rcpno = ''
            if len(notice_tb[0]) == 8:
                res_select = """select first_rcpno from proxy001
                                where jm_code = '{0}' and meet_ymd = '{1}'
                                and meet_gb = '{2}' and meet_time = '{3}'
                             """.format(jm_code, notice_tb[0], notice_gb,
                                        notice_tb[1])
                cursor.execute(res_select)
                if cursor.rowcount > 0:
                    res_rcpno = cursor.fetchone()[0]

            # Number of distinct meet_seq values for this company and year.
            max_select = """select * from proxy011
                            where left(first_rcpno, 4) = '{0}' and jm_code = '{1}'
                            group by meet_seq
                         """.format(first_rcp_yy, jm_code)
            cursor.execute(max_select)
            max_seq = cursor.rowcount

            # Reuse the sequence of an existing row with the same first
            # receipt number, otherwise take the next one.
            seq_select = """select meet_seq from proxy011
                            where first_rcpno = '{0}'
                         """.format(first_rcp_no)
            cursor.execute(seq_select)
            seq = cursor.fetchone()
            if cursor.rowcount < 1:
                seq = str(max_seq + 1).zfill(2)
            else:
                seq = "".join(seq)
                seq = seq[-2:]

            # Year part of the key; falls back to the current year.
            yyyy = make_ymd(notice_tb[0])
            if yyyy is not None and yyyy != '':
                yyyy = yyyy[:4]
            else:
                yyyy = time.strftime('%Y')

            meet_seq = jm_code + yyyy + seq

            notice_qry = notice_mst_ins(meet_seq, rcp_no, jm_code, notice_gb,
                                        rcpno_list[0], notice_tb, notice_ref,
                                        res_rcpno)
            cursor.execute(notice_qry)

            # Derived row.
            ymdstr = get_full_ymdstr(notice_tb[0], notice_tb[1])
            deri_qry = deri_ins(meet_seq, rcp_no, pre_rcp_no, jm_code,
                                notice_tb[0], notice_gb, ymdstr, notice_tb[2])
            cursor.execute(deri_qry)

            driver.switch_to_default_content()
            info_logger.info('[0] Key creation success.')
        except Exception as e:
            error_logger.error('[0] Key creation fail. [{0}] : {1}'.format(
                rcp_no, e))
            # NOTE(review): when this step fails before meet_seq is assigned,
            # the first extractor below raises on the unbound name and the
            # outer handler logs it -- confirm whether an early return is
            # wanted here instead.

        # --------------- director and committee activity ---------------
        get_isa_act(driver, meet_seq, rcp_no, cursor)
        driver.switch_to_default_content()

        # --------------- outside director remuneration ---------------
        get_isa_bosu(driver, meet_seq, rcp_no, cursor)
        driver.switch_to_default_content()

        # --------------- single transactions above threshold ---------------
        get_transaction_single(driver, meet_seq, rcp_no, cursor)
        driver.switch_to_default_content()

        # --------------- total transactions above threshold ---------------
        get_transaction_total(driver, meet_seq, rcp_no, cursor)
        driver.switch_to_default_content()

        # --------------- financial statements ---------------
        get_financial_table(driver, meet_seq, rcp_no, cursor)

        # --------------- amendment of articles ---------------
        get_change_article(driver, meet_seq, rcp_no, cursor)

        # --------------- election of directors ---------------
        get_elect_isa(driver, meet_seq, rcp_no, cursor)

        # --------------- director remuneration limit ---------------
        get_limit_bosu(driver, meet_seq, rcp_no, cursor)

        # --------------- stock options ---------------
        get_stockoption(driver, meet_seq, rcp_no, cursor)
    except Exception as e:
        error_logger.error('[Notice] crawling fail. [{0}] : {1}'.format(
            rcp_no, e))
    finally:
        if driver is not None:
            close_driver(driver)
description = utils.get_course_description(course_url) curriculum = utils.get_course_curriculum(course_url) content = template.format( preview_img=utils.get_course_preview_image(course_url), title=slug, short_description=course_url, description=description, description_ru=utils.translate_2_ru(description), curriculum=curriculum, curriculum_ru=utils.translate_2_ru(curriculum), url=course_url, video=utils.get_course_youtube_share_id(course_url) ) pyperclip.copy(content) open('./tmp/result.txt', 'w').write(content) print('Done') if __name__ == '__main__': try: main() except Exception as inst: print("Unexpected error:", sys.exc_info()[0]) print(type(inst)) print(inst.args) print(inst) finally: time.sleep(100) utils.close_driver()
course_meta = course.find_element_by_xpath( './/*[@class="courses-list__item-meta"]') out['author'] = course_meta.find_element_by_xpath( './p[@class="courses-list__item-authors"]/span/a').text out['level'] = course_meta.find_element_by_xpath( './p[@class="courses-list__item-level"]').text out['date'] = course_meta.find_element_by_xpath( './time[@class="courses-list__item-date"]').text out['level'] = course_meta.find_element_by_xpath( './time[@class="courses-list__item-duration"]').text ncourse += 1 out_list.append(out) utils.save_json(out_list, json_name) if __name__ == '__main__': import session driver = utils.start_driver('chrome') out_json = sys.argv[1] out_html = out_json.rsplit('.', 1)[0] + '.html' try: session.login(driver, 'input/credential.json') load_all_courses(driver, out_html, num_load=None) get_all_courses(driver, out_json) finally: utils.close_driver(driver)
def get_resolution(jm_code, rcp_no, rcp_gb, cursor):
    """Crawl one shareholder-meeting resolution filing and store it.

    Reads the meeting table and the optional director-election and
    business-purpose sections from the DART viewer, then inserts the master
    row (``proxy001``) and the detail rows. Failures are logged to
    ``error_logger`` and not propagated.

    Args:
        jm_code: Stock code of the company.
        rcp_no: DART receipt number of the resolution filing.
        rcp_gb: Market code; ``'Y'`` and ``'K'`` select their own element
            ids, anything else uses the third set.
        cursor: Open DB cursor used for every query.

    Returns:
        ``0`` when a ``proxy001`` row with this ``rcp_no`` already exists,
        otherwise ``None``.
    """
    # Bound before the try so the finally block can tell whether a driver
    # was created (get_driver itself may raise).
    driver = None
    try:
        # Driver setup (resolution / notice viewer page).
        driver = get_driver(
            'C:\\Users\\admin\\PycharmProjects\\webCrawl\\chromedriver.exe',
            'http://dart.fss.or.kr/dsaf001/main.do?rcpNo={0}'.format(rcp_no))
        driver.implicitly_wait(10)

        # Receipt-number history of the filing.
        rcpno_list = get_rcpno_list(driver)

        # Year of the first document in the history.
        first_rcp_no = rcpno_list[0]
        first_rcp_yy = first_rcp_no[:4]

        # The document body lives inside an iframe.
        driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))

        # The master table id depends on the market code.
        is_y = 'Y' in rcp_gb
        if is_y:
            tb_mst = driver.find_elements_by_xpath(
                '//*[@id="XFormD52_Form0_Table0"]/tbody/tr')
        else:
            tb_mst = driver.find_elements_by_xpath(
                '//*[@id="XFormD2_Form0_Table0"]/tbody/tr')

        def cell_text(row, col):
            # Text of one <td> of the master table.
            return tb_mst[row].find_elements_by_tag_name('td')[col].text

        # Meeting data, 9 slots.
        meet_tb = [0 for x in range(9)]
        if is_y:
            meet_tb[0] = cell_text(1, 1)  # date
            meet_tb[1] = cell_text(1, 2)  # time
            meet_tb[8] = cell_text(0, 1)  # meeting type
        else:
            meet_tb[0] = cell_text(0, 2)  # date
            meet_tb[1] = cell_text(1, 1)  # time
            meet_tb[8] = cell_text(8, 1)  # meeting type
        meet_tb[2] = cell_text(2, 1)  # place
        meet_tb[3] = cell_text(3, 1)  # agenda
        meet_tb[4] = cell_text(4, 1)  # board resolution date
        meet_tb[5] = cell_text(5, 2)  # outside directors present
        meet_tb[6] = cell_text(6, 1)  # outside directors absent
        meet_tb[7] = cell_text(7, 1)  # auditor attendance

        # Element ids of the four election sections (director, outside
        # director, audit committee member, auditor) and of the
        # business-purpose section, per market code.
        if 'Y' in rcp_gb:
            isa_ids = ('LIB_L9019', 'LIB_L9018', 'LIB_L9016', 'LIB_L9015')
            biz_id = 'LIB_L9017'
        elif 'K' in rcp_gb:
            isa_ids = ('LIB_L7021', 'LIB_L7020', 'LIB_L7018', 'LIB_L7017')
            biz_id = 'LIB_L7019'
        else:
            isa_ids = ('LIB_L3025', 'LIB_L3024', 'LIB_L3022', 'LIB_L3021')
            biz_id = 'LIB_L3023'

        isa_arr = []
        biz_arr = []

        # Probe all four election sections first, then extract the ones
        # that exist.
        isa_found = [
            (div_id,
             driver.find_elements_by_xpath('//*[@id="{0}"]'.format(div_id)))
            for div_id in isa_ids
        ]
        for div_id, elems in isa_found:
            if elems:
                isa_arr.extend(get_isa(driver, div_id))

        # Business purpose.
        tb_biz = driver.find_elements_by_xpath(
            '//*[@id="{0}"]'.format(biz_id))
        if tb_biz:
            biz_arr.extend(get_biz(driver, biz_id))

        # ------------------------------------------------------------- #
        # DB insert
        # NOTE(review): queries below are built by string formatting;
        # switch to parameter binding once the paramstyle is confirmed.
        # Duplicate check.
        dup_select = """select * from proxy001 where rcp_no = '{0}'""".format(
            rcp_no)
        cursor.execute(dup_select)
        dup_cnt = cursor.rowcount
        if dup_cnt > 0:
            return 0

        # Number of distinct meet_seq values for this company and year.
        max_select = """select * from proxy001
                        where left(first_rcpno, 4) = '{0}' and jm_code = '{1}'
                        group by meet_seq
                     """.format(first_rcp_yy, jm_code)
        cursor.execute(max_select)
        max_seq = cursor.rowcount

        # Reuse the sequence of an existing row with the same first receipt
        # number, otherwise take the next one.
        seq_select = """select meet_seq from proxy001
                        where first_rcpno = '{0}'
                     """.format(first_rcp_no)
        cursor.execute(seq_select)
        seq = cursor.fetchone()
        if cursor.rowcount < 1:
            seq = str(max_seq + 1).zfill(2)
        else:
            seq = "".join(seq)
            seq = seq[-2:]

        # Year part of the key; falls back to the current year.
        yyyy = make_ymd(meet_tb[0][:4])
        if yyyy is not None and yyyy != '':
            yyyy = yyyy[:4]
        else:
            yyyy = time.strftime('%Y')

        meet_seq = jm_code + yyyy + seq

        # Resolution master row.
        in_qry = resolution_mst_ins(meet_seq, meet_tb, jm_code, rcp_no,
                                    rcpno_list[0])
        cursor.execute(in_qry)

        # Director elections, with optional career / concurrent-post rows.
        for i, isa in enumerate(isa_arr):
            ins_isa_info = isa_info_ins(meet_seq, isa, rcp_no, i)
            cursor.execute(ins_isa_info)

            if chk_no_data(isa[4]):
                ins_isa_car = isa_car_ins(meet_seq, isa, rcp_no, i)  # career
                cursor.execute(ins_isa_car)

            if chk_no_data(isa[5]):
                ins_isa_dup = isa_dup_ins(meet_seq, isa, rcp_no,
                                          i)  # concurrent posts
                cursor.execute(ins_isa_dup)

        # Business-purpose changes.
        for biz in biz_arr:
            ins_biz = biz_ins(meet_seq, biz, rcp_no)
            cursor.execute(ins_biz)
    except Exception as e:
        error_logger.error(
            'Resolution crawling fail. : [{0}] [{1}] {2}'.format(
                jm_code, rcp_no, e))
    finally:
        if driver is not None:
            close_driver(driver)
def resolution_main(jm_code, rcp_no, rcp_yn, rcp_gb):
    """Crawl one shareholder-meeting resolution filing and store it.

    Reads the meeting table and the optional director-election and
    business-purpose sections from the DART viewer, then inserts the master
    and detail rows. Any failure in the DB step appends ``jm_code`` to the
    error log file and is not propagated.

    Args:
        jm_code: Stock code of the company.
        rcp_no: DART receipt number; its first 8 characters prefix the
            generated ``report_ver`` key.
        rcp_yn: Correction label; when empty, an existing row for the same
            meeting is treated as a duplicate.
        rcp_gb: Market name; '유가' and '코스닥' select their own element
            ids, anything else uses the third set.
    """
    # Driver setup (resolution / notice viewer page).
    driver = get_driver(
        'C:\\Users\\admin\\PycharmProjects\\webCrawl\\chromedriver.exe',
        'http://dart.fss.or.kr/dsaf001/main.do?rcpNo={0}'.format(rcp_no))
    # try/finally so the driver is closed when scraping or the DB step fails.
    try:
        # The document body lives inside an iframe.
        driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))

        # The master table id depends on the market.
        is_kospi = '유가' in rcp_gb
        if is_kospi:
            tb_mst = driver.find_elements_by_xpath(
                '//*[@id="XFormD52_Form0_Table0"]/tbody/tr')
        else:
            tb_mst = driver.find_elements_by_xpath(
                '//*[@id="XFormD2_Form0_Table0"]/tbody/tr')

        def cell_text(row, col):
            # Text of one <td> of the master table.
            return tb_mst[row].find_elements_by_tag_name('td')[col].text

        # Meeting data, 9 slots.
        meet_tb = [0 for x in range(9)]
        if is_kospi:
            meet_tb[0] = cell_text(1, 1)  # date
            meet_tb[1] = cell_text(1, 2)  # time
            meet_tb[8] = cell_text(0, 1)  # meeting type
        else:
            meet_tb[0] = cell_text(0, 2)  # date
            meet_tb[1] = cell_text(1, 1)  # time
            meet_tb[8] = cell_text(8, 1)  # meeting type
        meet_tb[2] = cell_text(2, 1)  # place
        meet_tb[3] = cell_text(3, 1)  # agenda
        meet_tb[4] = cell_text(4, 1)  # board resolution date
        meet_tb[5] = cell_text(5, 2)  # outside directors present
        meet_tb[6] = cell_text(6, 1)  # outside directors absent
        meet_tb[7] = cell_text(7, 1)  # auditor attendance

        # Element ids of the four election sections (director, outside
        # director, audit committee member, auditor) and of the
        # business-purpose section, per market.
        if '유가' in rcp_gb:
            isa_ids = ('LIB_L9019', 'LIB_L9018', 'LIB_L9016', 'LIB_L9015')
            biz_id = 'LIB_L9017'
        elif '코스닥' in rcp_gb:
            isa_ids = ('LIB_L7021', 'LIB_L7020', 'LIB_L7018', 'LIB_L7017')
            biz_id = 'LIB_L7019'
        else:
            isa_ids = ('LIB_L3025', 'LIB_L3024', 'LIB_L3022', 'LIB_L3021')
            biz_id = 'LIB_L3023'

        isa_arr = []
        biz_arr = []

        # Probe all four election sections first, then extract the ones
        # that exist.
        isa_found = [
            (div_id,
             driver.find_elements_by_xpath('//*[@id="{0}"]'.format(div_id)))
            for div_id in isa_ids
        ]
        for div_id, elems in isa_found:
            if elems:
                isa_arr.extend(get_isa(driver, div_id))

        # Business purpose.
        tb_biz = driver.find_elements_by_xpath(
            '//*[@id="{0}"]'.format(biz_id))
        if tb_biz:
            biz_arr.extend(get_biz(driver, biz_id))

        # DB insert. The connection is opened before the try so the finally
        # clause never sees an unbound name.
        conn = get_dbcon('esg')
        try:
            cursor = conn.cursor()
            try:
                # Lookup values for the meeting.
                ymd = make_ymd(meet_tb[0])
                gb = get_regYn(meet_tb[8])

                # NOTE(review): query built by string formatting; switch to
                # parameter binding once the paramstyle is confirmed.
                seq_select = """select * from proxy001
                                where meet_ymd = '{0}' and jm_code = '{1}'
                                and meet_gb = '{2}'
                             """.format(ymd, jm_code, gb)
                cursor.execute(seq_select)
                rows = cursor.rowcount

                # Duplicate check, skipped for corrections.
                if rcp_yn == '' and rows > 0:
                    print('중복 데이터가 있습니다.')
                    sys.exit(0)

                # report_ver key = revision date + sequence.
                report_ver = rcp_no[:8] + str(rows + 1).zfill(2)

                # Resolution master row.
                in_qry = resolution_mst_ins(meet_tb, jm_code, report_ver,
                                            rcp_no)
                cursor.execute(in_qry)
                print(in_qry)

                # Director elections.
                if isa_arr:
                    ins_isa, dup_isa = isa_mst_ins(isa_arr, meet_tb[0],
                                                   jm_code, gb, report_ver)
                    for i in range(0, len(ins_isa)):
                        # Skip directors that are already stored.
                        cursor.execute(dup_isa[i])
                        dup_cnt = cursor.rowcount
                        if dup_cnt > 0:
                            print('중복된 이사가 있습니다.')
                            continue

                        cursor.execute(ins_isa[i])
                        print(str(i) + " : " + ins_isa[i])

                        if chk_no_data(isa_arr[i][4]):
                            ins_isa_car = isa_car_ins(
                                isa_arr[i], meet_tb[0], jm_code, gb,
                                report_ver, i)  # career
                            cursor.execute(ins_isa_car)
                            print(str(i) + " : " + ins_isa_car)

                        if chk_no_data(isa_arr[i][5]):
                            ins_isa_dup = isa_dup_ins(
                                isa_arr[i], meet_tb[0], jm_code, gb,
                                report_ver, i)  # concurrent posts
                            cursor.execute(ins_isa_dup)
                            print(str(i) + " : " + ins_isa_dup)

                # Business-purpose changes.
                if biz_arr:
                    for i in range(0, len(biz_arr)):
                        # NOTE(review): this call looks loop-invariant; hoist
                        # it once biz_ins is confirmed to be side-effect free.
                        ins_biz = biz_ins(biz_arr, meet_tb[0], jm_code, gb,
                                          report_ver)
                        cursor.execute(ins_biz[i])
                        print(str(i) + " : " + str(ins_biz[i]))
            except (Exception, SystemExit):
                # SystemExit is caught on purpose: the original bare except
                # also swallowed the sys.exit(0) of the duplicate check, so
                # callers keep running. KeyboardInterrupt now propagates.
                with open("C:\\Users\\rmffo\\PycharmProjects\\log\\error_log.txt",
                          'a') as f:
                    f.write(jm_code + '\n')
            finally:
                cursor.close()
        finally:
            close_dbcon(conn)
    finally:
        # Driver close.
        close_driver(driver)