def setup_and_test_proxy():
    """Enter the configured proxy domain in the NextBox UI and verify the login page.

    Switches to NextBox sub-page 5, disables an already active proxy if the
    domain input is not present, types ``conf["proxy_domain"]`` into the input
    and presses the button.

    Returns:
        ``False`` if the sub-page cannot be reached, the text input fails or
        the button cannot be clicked; otherwise whatever
        ``test_for_valid_login_page`` returns for the proxy domain over https.
    """
    if not nextbox_sub_ensure(5):
        return False

    domain_input_xp = '//*[@id="app-content-vue"]/div/div[2]/input'
    toggle_button_xp = '//*[@id="app-content-vue"]/div/div[2]/button'

    toggle_button = get_xpath(toggle_button_xp)
    domain_input = get_xpath(domain_input_xp)

    # A missing input field means the proxy is currently active; the button
    # then disables it so the input becomes available again.
    if not domain_input:
        chat("proxy is enabled, disabling first")
        toggle_button.click()

    if not input_into_text_field(domain_input_xp, conf["proxy_domain"]):
        chat("failed input of proxy_domain")
        return False

    if test_and_click_button(toggle_button_xp):
        chat("checking 'new' login page now")
        return test_for_valid_login_page(conf["proxy_domain"], "https")

    chat("failed button click")
    return False
def parse_town(self, url, params, target, elemType, town_code=None, town_name=None, _isRecent=False):
    """Parse the per-constituency electorate table of one town page.

    Fetches every ``<tr>`` of the page, keeps only the rows whose first cell
    carries a ``rowspan`` attribute and has no text, maps selected cells of
    each such row onto ``self.attrs_constituency`` and passes the resulting
    dicts through ``self.parse_tr_xhtml``.

    ``elemType``, ``town_code`` and ``_isRecent`` are accepted but not used
    here.

    Returns:
        list of the values returned by ``self.parse_tr_xhtml``, one per kept row.
    """
    tr_list = get_xpath(url, params, '//tr')
    # Result is not used below; the call is kept because it may have side
    # effects (e.g. a fetch) in get_xpath.
    th_list = get_xpath(url, params, '//th')
    td_columns = len(tr_list[1])

    raw_rows = []
    for row in tr_list:
        first_cell = row[0]
        # No rowspan on the leftmost cell: this row holds the 'male'/'female'
        # figures rather than the 'total' figures, so it is skipped.
        if first_cell.get('rowspan') is None:
            continue
        # Only rows whose leftmost cell is empty are used; the constituency
        # name is then in the second cell.
        if first_cell.text is not None:
            continue

        # Unlike local_provincePage.py, every elemType handled here is
        # 'constituency_in_municipal_division', so no branching is needed.
        cells = (
            row[1],               # constituency (basic local government or constituency name)
            row[2],               # villages: number of eup/myeon/dong
            row[3],               # pollStations: number of polling districts
            row[4],               # population
            # TODO: the position of the electorates column keeps changing; verify.
            row[6],               # electorates
            # The last two are indexed from the end because "past election"
            # and "recent election" tables use different column layouts.
            row[td_columns - 2],  # popul_elector_ratio: electorate / population ratio
            row[td_columns - 1],  # households
        )
        raw_rows.append(dict(zip(self.attrs_constituency, cells)))

    parse_result = [self.parse_tr_xhtml(raw, town_name=town_name) for raw in raw_rows]

    _elemType_str = '선거구별 선거인수'
    print('crawled %s #%d - %s, %s(%d)...' % (target, self.nth, _elemType_str, town_name, len(parse_result)))
    return parse_result
def backup_test():
    """Run a full backup through the NextBox UI and report whether it succeeded.

    Steps: switch to sub-page 3, clean up a leftover finished backup if its
    continue button is present and enabled, pick the first storage from the
    dropdown, enter the backup name, start the backup, poll until the
    continue button becomes enabled, read the banner message and click the
    continue button so the backup view is cleaned up again.

    Returns:
        ``True`` only if the banner reads "Completed Backup successfully";
        ``False`` on any failed step or a different banner text.
    """
    if not nextbox_sub_ensure(3):
        return False

    cont_button_xp = '//*[@id="app-content-vue"]/div/div/button'

    # If this button exists (and is enabled), a previous backup was not
    # cleaned up. get_xpath results are checked for falsiness elsewhere in
    # this code, so a missing button must not be dereferenced.
    wait_for_xpath(cont_button_xp, 5)
    continue_button = get_xpath(cont_button_xp)
    if continue_button and continue_button.is_enabled():
        chat("someone left his stuff here, cleaning up...")
        click_xpath(cont_button_xp)

    xp_dropdown = '//*[@id="app-content-vue"]/div/div[1]/div[1]/div[2]/input'
    if not wait_for_xpath(xp_dropdown, 5):
        return False

    # storage dropdown input field
    click_xpath(xp_dropdown)
    chat("chooosing target storage - 1st click the dropdown")

    # now select first storage
    click_xpath(
        '//*[@id="app-content-vue"]/div/div[1]/div[1]/div[3]/ul/li[1]/span/div/span[1]'
    )
    chat("now click to choose")

    # now click into input field
    input_into_text_field('//*[@id="app-content-vue"]/div/div[1]/input',
                          "xxxtest_backup_name")
    chat("enter our designated backup name")

    if not test_and_click_button(
            '//*[@id="app-content-vue"]/div/div[1]/button'):
        return False

    wait_for_xpath(cont_button_xp, 5)
    continue_button = get_xpath(cont_button_xp)
    if not continue_button:
        # Without the button there is nothing to poll; fail instead of
        # raising AttributeError on a missing element.
        chat("continue button not found after starting the backup")
        return False

    log("waiting for backup...", " B ", end=" ", no_pad=False, flush=True)
    # NOTE(review): this poll has no upper bound; it relies on the backup
    # eventually enabling the button.
    while not continue_button.is_enabled():
        log(".", "", end="", no_pad=True, flush=True)
        sleep(0.5)
    print()

    banner = get_xpath('//*[@id="app-content-vue"]/div/div/span')
    banner_msg = banner.text if banner else None

    # don't leave the backup un-cleaned-up (done even if the banner is missing)
    click_xpath(cont_button_xp)

    return banner_msg == "Completed Backup successfully"
def storages_info():
    """Collect the storage buttons shown on NextBox sub-page 2.

    Returns:
        ``False`` if the sub-page cannot be reached; otherwise a dict with
        the keys ``"top"`` and ``"bottom"`` holding the button lists found in
        the first and second section (an empty list when nothing is found).
    """
    if not nextbox_sub_ensure(2):
        return False

    mounted_xp = '//*[@id="app-content-vue"]/div/div[1]/*/button'
    unmounted_xp = '//*[@id="app-content-vue"]/div/div[2]/*/button'

    top = get_xpath(mounted_xp, as_list=True)
    bottom = get_xpath(unmounted_xp, as_list=True)
    chat(f"storages: top: #{top} - bottom: #{bottom}")

    # Normalise falsy lookups to empty lists for the caller.
    result = {}
    result["top"] = top if top else []
    result["bottom"] = bottom if bottom else []
    return result
def logout():
    """Open the user menu and click the logout link.

    Returns:
        ``False`` if the logout entry or its link cannot be located;
        otherwise the return value of the link's ``click()`` call.
        NOTE(review): with Selenium, ``click()`` returns ``None``, so success
        is also falsy - confirm how callers interpret the result.
    """
    chat("logging out now, just because I can!")
    click_xpath('//*[@id="expand"]')

    chat("expanding, just for the show...")
    get_xpath('//*[@id="expanddiv"]')

    # Look up the <li> first: calling find_element on a failed lookup would
    # raise before the failure branch below could ever run.
    logout_item = get_xpath('//li[@data-id="logout"]')
    res = logout_item.find_element(By.TAG_NAME, "a") if logout_item else None
    if not res:
        chat("failed logging out...")
        return False

    return res.click()
def parse_city(self, url, params, target, elemType, city_name=None, _isRecent=False):
    """Parse the electorate table of one city page.

    Fetches every ``<tr>`` of the page, keeps the rows whose first cell has a
    ``rowspan`` attribute, maps selected cells of each such row onto
    ``self.attrs_municipal`` and passes the resulting dicts through
    ``self.parse_tr_xhtml``. Which cells are read depends on ``elemType``
    ('local_division' versus anything else, i.e. 'constituency_in_province')
    and, for 'local_division', on ``_isRecent``.

    Returns:
        list of the values returned by ``self.parse_tr_xhtml``, one per kept row.
    """
    tr_list = get_xpath(url, params, '//tr')
    # Result is not used below; the call is kept because it may have side
    # effects (e.g. a fetch) in get_xpath.
    th_list = get_xpath(url, params, '//th')
    td_columns = len(tr_list[1])
    is_local_division = (elemType == 'local_division')

    raw_rows = []
    for row in tr_list:
        # Name cell: basic local government name or constituency name.
        municipal = row[0]
        # No rowspan on the leftmost cell: this row holds the 'male'/'female'
        # figures rather than the 'total' figures, so it is skipped.
        if municipal.get('rowspan') is None:
            continue

        # villages = number of eup/myeon/dong, pollStations = number of
        # polling districts.
        # TODO: the position of the electorates column keeps changing; verify.
        if is_local_division:
            villages, pollStations, population = row[1], row[2], row[3]
            electorates = row[td_columns - 3] if _isRecent else row[5]
        else:  # elemType == 'constituency_in_province'
            villages, pollStations, population = row[2], row[3], row[4]
            electorates = row[6]

        # Indexed from the end because "past election" and "recent election"
        # tables use different column layouts.
        popul_elector_ratio = row[td_columns - 2]  # electorate / population ratio
        households = row[td_columns - 1]           # number of households

        cells = (municipal, villages, pollStations, population,
                 electorates, popul_elector_ratio, households)
        raw_rows.append(dict(zip(self.attrs_municipal, cells)))

    parse_result = [self.parse_tr_xhtml(raw, city_name=city_name) for raw in raw_rows]

    if is_local_division:
        _elemType_str = '행정구역별(시군구) 선거인수(국내거소미신고 재외국민 포함)'
    else:  # elemType == 'constituency_in_province'
        _elemType_str = '선거구별 선거인수'
    print('crawled %s #%d - %s, %s(%d)...' % (target, self.nth, _elemType_str, city_name, len(parse_result)))
    return parse_result
def _parse_item(self, ele):
    """Build one item dict from ``ele`` using the ``self.config['attr']`` mapping.

    For every ``name -> expression`` pair in the mapping:

    * an expression containing ``'+'`` is deferred and evaluated at the end
      with ``eval`` against a copy of the fields parsed so far;
    * an expression ending in ``'/@RAW'`` is looked up without that suffix
      and converted with ``utils.tree2md``;
    * any other expression is looked up with ``utils.get_xpath`` and reduced
      to text.

    Values of fields whose name ends in ``'url'`` are prefixed with
    ``self.base_url`` unless they already start with ``'http'``. The key
    ``'_crawl_time'`` is set to ``datetime.now()``.

    Returns:
        dict: the parsed item.
    """
    raw_suffix = '/@RAW'
    item = {}
    deferred = []

    for field, expr in self.config['attr'].items():
        if '+' in expr:
            deferred.append((field, expr))
            continue

        wants_raw = expr.endswith(raw_suffix)
        if wants_raw:
            expr = expr[:-len(raw_suffix)]

        value = utils.get_xpath(ele, expr)
        if wants_raw:
            value = utils.tree2md(value)
        elif hasattr(value, 'itertext'):
            value = ' '.join([piece.strip() for piece in value.itertext()])
        elif hasattr(value, 'text'):
            value = value.text()

        value = unicode(value).strip()
        if field.endswith('url') and not value.startswith('http'):
            value = self.base_url.rstrip('/') + '/' + value.lstrip('/')
        item[field] = value

    item['_crawl_time'] = datetime.now()

    # NOTE(security): the deferred expressions are executed with eval(); this
    # is only safe as long as self.config comes from a trusted source.
    for field, expr in deferred:
        item[field] = eval(expr, item.copy())

    logger.debug("[%s] parsed item: %s", self.config['name'], item)
    return item
def parse(self, url, city_name=None):
    """Parse all ``<td>`` cells of ``url`` into member records.

    The cells are split into consecutive groups of ``len(self.attrs)``; each
    group is zipped with ``self.attrs`` into a dict and handed to
    ``self.parse_member`` together with ``city_name``. Trailing cells that do
    not fill a whole group are ignored.

    Written so that it behaves the same on Python 2 and Python 3 (floor
    division, ``range``, single-argument ``print(...)``).

    Returns:
        list of the values returned by ``self.parse_member``.
    """
    elems = get_xpath(url, '//td')
    num_attrs = len(self.attrs)
    # Floor division keeps the group count an integer on Python 3 as well.
    num_members = len(elems) // num_attrs

    members = [
        self.parse_member(
            dict(zip(self.attrs, elems[i * num_attrs:(i + 1) * num_attrs])),
            city_name)
        for i in range(num_members)
    ]

    print('crawled #%d - %s(%d)...' % (self.nth, city_name, len(members)))
    return members
def parse(self, url):
    """Parse all ``<td>`` cells of ``url`` into member records.

    The cells are split into consecutive groups of ``len(self.attrs)``; each
    group is zipped with ``self.attrs`` into a dict and handed to
    ``self.parse_member``. Trailing cells that do not fill a whole group are
    ignored.

    Written so that it behaves the same on Python 2 and Python 3 (floor
    division, ``range``, single-argument ``print(...)``).

    Returns:
        list of the values returned by ``self.parse_member``.
    """
    elems = get_xpath(url, '//td')
    num_attrs = len(self.attrs)
    # Floor division keeps the group count an integer on Python 3 as well.
    num_members = len(elems) // num_attrs

    members = [
        self.parse_member(
            dict(zip(self.attrs, elems[i * num_attrs:(i + 1) * num_attrs])))
        for i in range(num_members)
    ]

    print('crawled #%d (%d)...' % (self.nth, len(members)))
    return members
def parse(self, url, city_name=None):
    """Parse all ``<td>`` cells of ``url`` into member records.

    The cells are split into consecutive groups of ``len(self.attrs)``; each
    group is zipped with ``self.attrs`` into a dict and handed to
    ``self.parse_member`` with ``city_name`` as keyword argument. Trailing
    cells that do not fill a whole group are ignored. When ``city_name`` is
    falsy the progress line prints a fixed fallback label instead.

    Written so that it behaves the same on Python 2 and Python 3 (floor
    division, ``range``, single-argument ``print(...)``).

    Returns:
        list of the values returned by ``self.parse_member``.
    """
    elems = get_xpath(url, '//td')
    num_attrs = len(self.attrs)
    # Floor division keeps the group count an integer on Python 3 as well.
    num_members = len(elems) // num_attrs

    members = [
        self.parse_member(
            dict(zip(self.attrs, elems[i * num_attrs:(i + 1) * num_attrs])),
            city_name=city_name)
        for i in range(num_members)
    ]

    print('crawled #%d - %s(%d)...' % (self.nth, city_name or '비례대표', len(members)))
    return members
def parse_elected(self, url, params, target, city_code, consti_toCode):
    """Count elected candidates per constituency from the table ``table01``.

    Side effects: sets ``params['cityCode']`` (``-1`` when
    ``params['electionCode']`` is 7, else ``city_code``) and, outside the
    election-code-7 case, removes the first code of every matched name from
    ``consti_toCode``.

    For election code 7 every constituency in ``consti_toCode`` gets the
    number of table rows minus the header row. Otherwise consecutive rows
    with the same name are counted as one constituency; names that are not
    keys of ``consti_toCode`` are skipped.

    Returns:
        tuple ``(code_toNumElected, consti_seq)``: a dict mapping
        constituency code to the counted number, and the list of codes in the
        order they were encountered.
    """
    params['cityCode'] = -1 if (params['electionCode'] == 7) else city_code

    table = get_xpath(url, params, './/table[@id="table01"]')[0]
    rows = table.findall('.//tr')  # each <tr> holds one row of <td> cells
    total_rows = len(rows)
    header_rows = 1

    code_toNumElected = {}
    consti_seq = []

    election_code = params['electionCode']
    if election_code == 7:  # 'assembly_PR'
        num_elected = total_rows - header_rows
        for codes in consti_toCode.values():
            code = codes[0]
            code_toNumElected[code] = num_elected
            consti_seq.append(code)
        return (code_toNumElected, consti_seq)

    # Column that holds the name, and offset at which the name text starts.
    if election_code == 8:  # 'local-pp_PR'
        name_index, string_read_index = 0, 0
    elif election_code != 9 and target in ('local-pp', 'local-mp'):
        name_index, string_read_index = 1, 1
    else:  # election code 9 ('local-mp_PR') and everything else
        name_index, string_read_index = 0, 1

    def row_name(row_index):
        return rows[row_index][name_index].text[string_read_index:]

    pos = header_rows
    while pos < total_rows:
        district_name = row_name(pos)
        if district_name not in consti_toCode:
            pos += 1
            continue

        district_code = consti_toCode[district_name][0]
        num_elected = 1
        pos += 1
        # Every directly following row with the same name is one more seat.
        while pos < total_rows and district_name == row_name(pos):
            num_elected += 1
            pos += 1

        code_toNumElected[district_code] = num_elected
        consti_seq.append(district_code)
        # Consume the code so a later run with the same name gets the next one.
        del consti_toCode[district_name][0]

    return (code_toNumElected, consti_seq)
def goto_nextbox_nav(idx):
    """Click the ``idx``-th (1-based) entry of the NextBox app navigation.

    Opens the NextBox app first when the current URL is not already its
    path.

    Returns:
        ``False`` if the app could not be opened or the click raised;
        ``True`` otherwise.
    """
    already_there = url_paths_equalish(conf["dom"].current_url,
                                       "/apps/nextbox/")
    if not already_there and not goto_nextbox():
        return False

    # Give the page a moment before reading the navigation.
    sleep(1)

    nav_root = get_xpath('//*[@id="app-navigation-vue"]')
    links = []
    for entry in nav_root.find_elements(By.TAG_NAME, "li"):
        links.append(entry.find_element(By.TAG_NAME, "a"))

    try:
        links[idx - 1].click()
    except Exception:
        return False
    else:
        return True
def input_into_text_field(xpath: str, text: str, clear: bool = True) -> bool:
    """Type ``text`` into the element found at ``xpath``.

    Args:
        xpath: XPath of the input element.
        text: Text to send to the element.
        clear: When true, the current value is removed first by sending one
            backspace per character.

    Returns:
        ``False`` if the element cannot be found, ``True`` after the text
        was sent.
    """
    el = get_xpath(xpath)
    if not el:
        return False

    wait_for_xpath(xpath, 5)
    el.click()

    if clear:
        # get_attribute() yields None when the attribute is absent; treat
        # that as an empty value instead of failing in len().
        current_value = el.get_attribute("value") or ""
        el.send_keys("\b" * len(current_value))
        wait_for_xpath(xpath, 5)

    wait_for_xpath(xpath, 5)
    el.send_keys(text)
    return True
def test_and_click_button(el_or_xpath, click=True):
    """Check that a button is enabled and, optionally, click it.

    Args:
        el_or_xpath: Either an element object or an XPath string that is
            resolved with ``get_xpath``.
        click: When true (default) the button is clicked after the check.
            When false only the enabled-check is performed.

    Returns:
        ``False`` if the button is missing or disabled, ``True`` otherwise.
    """
    sleep(1)

    button = el_or_xpath
    if isinstance(el_or_xpath, str):
        button = get_xpath(el_or_xpath)

    # A failed lookup is reported the same way as a disabled button instead
    # of raising on the missing element.
    if not button or not button.is_enabled():
        chat(
            "something went wrong, we would expect that the button is not disabled!"
        )
        chat(str(el_or_xpath))
        return False

    if click:
        chat("all conditions are met, let's push the button!!!")
        button.click()
        sleep(1)
    return True
def where_in_nextbox():
    """Identify the current NextBox sub-page from its heading.

    Returns:
        ``False`` if the heading element is not found, the page number
        (1-8) for a known heading text, or ``None`` (after logging an error)
        for an unknown heading.
    """
    heading = get_xpath('//*[@id="app-content-vue"]/div/div[1]/h2')
    if not heading:
        err("not even inside nextbox app?")
        return False

    page_by_title = {
        "Remote Access for Your NextBox": 1,
        "Mounted Storages": 2,
        "Full System Backup": 3,
        "Remote Access - Status": 4,
        "Backwards Proxy Remote Access for Your NextBox": 5,
        "Static Domain Configuration": 6,
        "HTTPS / TLS Configuration": 7,
        "System Logs": 8
    }

    title = heading.text
    if title in page_by_title:
        return page_by_title[title]

    err("could not determine where we are inside the nextbox app")
    return None
def parse_constant_candiNum(self, url, params, target, target_kor, city_name, city_code, city_index, townCode_JSON):
    """Parse a result table in which every row has the same candidate columns.

    (Original author's note: this currently applies to proportional
    representation only.)

    Reads ``table01`` from the page. The first data row is treated as the
    region total (name '합계', code -1); every following row is one district
    whose code is looked up by name via ``self.town_toCode``. Each row is
    mapped onto ``self.attrs_district`` and passed through
    ``self.parse_consti``.

    Returns:
        A one-element list holding a dict with the keys ``region_name``,
        ``region_code``, ``region_result`` and ``district_result``. For some
        target / ``self.nth`` combinations the district results are sorted by
        ``district_code``.
    """
    table = get_xpath(url, params, './/table[@id="table01"]')[0]
    rows = table.findall('.//tr')
    headers = table.findall('.//th')

    district_toCode = self.town_toCode(city_index, townCode_JSON)
    if target in ('assembly_PR', 'local-pp_PR'):
        code_toNumElected = self.PR_code_toNumElected(city_index, townCode_JSON)
    else:
        code_toNumElected = self.code_toNumElected(city_index, townCode_JSON)
    # Ends up holding the value of the last entry of the mapping.
    for code in code_toNumElected:
        num_elected = code_toNumElected[code]

    # Columns left of the per-party/candidate votes: district name,
    # electorate, votes cast. Usually 3.
    num_ths_left = 3
    if headers[0].get('rowspan') is not None:
        # Not a "recent election" page: the colspan of the "votes by
        # party/candidate" header counts the candidates plus one total column.
        max_candidate_num = int(headers[3].get('colspan')) - 1
        candi_name_list = headers[6:6 + max_candidate_num]  # e.g. <th><strong>party</strong></th>
    else:
        # "Recent election" page: the colspan equals the number of candidates
        # and the names sit in the second table row.
        max_candidate_num = int(headers[3].get('colspan'))
        candi_name_list = rows[1][3:3 + max_candidate_num]  # e.g. <td><strong>party</strong></td>
    row_head = 2  # index of the first row that is read

    def pair_up(name_cell, vote_cell):
        # {'name': <name cell>, 'vote': <vote cell>} according to attrs_result
        return dict(zip(self.attrs_result, [name_cell, vote_cell]))

    region_info = ()    # result for the region as a whole
    district_list = []  # results of the individual districts in the region
    candidate_num = max_candidate_num
    totals_start = num_ths_left + max_candidate_num

    for offset, row in enumerate(rows[row_head:]):
        is_region_total = (offset == 0)
        if is_region_total:
            district_name = '합계'
            district_code = -1
        else:
            district_name = row[0].text  # constituency name
            district_code = district_toCode[district_name]

        electorates = row[1]
        counted_vote = row[2]
        votes_num_percent = row[num_ths_left:num_ths_left + candidate_num]  # e.g. <td>1,940,259<br>(42.28)</td>
        cand_list = list(map(pair_up, candi_name_list, votes_num_percent))
        valid_vote = row[totals_start]
        undervote = row[totals_start + 1]
        blank_ballot = row[totals_start + 2]

        district_info = dict(zip(self.attrs_district, (
            district_name, district_code, num_elected, electorates, counted_vote,
            candidate_num, cand_list, valid_vote, undervote, blank_ballot)))
        parsed = self.parse_consti(district_info, city_name=city_name, city_code=city_code)

        if is_region_total:
            region_info = parsed
        else:
            district_list.append(parsed)

    return_result = [{'region_name': city_name, 'region_code': city_code,
                      'region_result': region_info, 'district_result': district_list}]

    # Highest election number per target for which the districts are sorted.
    sort_until = {'president': 15, 'assembly': 16, 'local-ma': 3,
                  'local-mp': 3, 'local-pa': 3, 'local-pp': 3}
    if target in sort_until and self.nth <= sort_until[target]:
        district_list.sort(key=operator.itemgetter('district_code'))

    print('\x1b[1;31mcrawled %s election #%d - \x1b[1;m%s, %s(%d)' %
          (target, self.nth, target_kor+' 구시군별 득표', city_name, len(district_list)))
    return return_result
def parse_various_candiNum(self, url, params, target, target_kor, city_name, city_code, city_index, townCode_JSON):
    """Parse a result table in which the candidate columns differ per constituency.

    (Original author's note: this currently applies to district seats only.)

    Reads ``table01`` from the page and walks its rows together with the
    expected constituency code sequence. A row whose second cell has no text
    is treated as a candidate-name row; the row below it supplies the vote
    figures. Codes of the sequence that do not match the current row name are
    recorded through ``self.noElection_consti`` (printed under the label
    '무투표 선거구'), as are the codes left over after the last row.

    Returns:
        A one-element list holding a dict with the keys ``region_name``,
        ``region_code`` and ``district_result``.
    """
    xpath = get_xpath(url, params, './/table[@id="table01"]')[0]
    tr_list = xpath.findall('.//tr')  # each <tr> holds one row of <td> cells
    num_trs = len(tr_list)
    thead_list = xpath.findall('.//th')
    num_theads = len(thead_list)

    # The 'local-mp_PR' target uses the PR_* variants of the lookup helpers.
    code_toDistrict = self.PR_code_toConsti(city_index, townCode_JSON) if target=='local-mp_PR' \
        else self.code_toConsti(city_index, townCode_JSON)
    district_codeSeq = self.PR_consti_Seq(city_index, townCode_JSON) if target=='local-mp_PR' \
        else self.consti_Seq(city_index,townCode_JSON)
    max_candidate_num = len(tr_list[2]) - len(thead_list)  # +1-1, because of the 'total' column in the candidate section
    code_toNumElected = self.PR_code_toNumElected(city_index, townCode_JSON) if target=='local-mp_PR' \
        else self.code_toNumElected(city_index, townCode_JSON)
    town_toCode = self.town_toCode(city_index, townCode_JSON)

    # Index of the first header cell carrying a colspan; cells before it are
    # the columns left of the candidate columns.
    # NOTE(review): num_ths_left stays undefined if no header has a colspan.
    for i in range(num_theads):
        if thead_list[i].get('colspan') != None:
            num_ths_left = i
            break

    consti_list = []
    seq_index = 0          # position in district_codeSeq
    num_nonElection = 0    # number of entries recorded via noElection_consti
    for i in range(num_trs):
        if len(tr_list[i]) < 2:
            pass
        elif tr_list[i][1].text == None:
            # A row whose electorate cell is blank is followed by the row
            # with the actual vote counts.
            district_code = district_codeSeq[seq_index]
            district_name = code_toDistrict[str(district_code)]
            num_elected = code_toNumElected[str(district_code)]

            # Advance through the code sequence until its name matches this
            # row; every code passed on the way is recorded as having no
            # election.
            while tr_list[i][0].text != district_name:
                if (target=='local-mp' and 1 <= self.nth <= 3):
                    # For these elections the name is prefixed with the first
                    # cell text of the row below or above.
                    if seq_index == num_nonElection:
                        district_name = tr_list[i+1][0].text + ' ' + district_name
                    elif tr_list[i+1][0].text != tr_list[i-1][0].text:
                        # NOTE(review): the arithmetic below presumably
                        # derives comparable town codes - confirm the code
                        # layout.
                        townCode_next = town_toCode[tr_list[i+1][0].text] // 10
                        townCode_seq = district_code // 1000 - 60000
                        print("%d, %d" %(townCode_next, townCode_seq))
                        if townCode_next == townCode_seq:
                            district_name = tr_list[i+1][0].text + ' ' + district_name
                        else:
                            district_name = tr_list[i-1][0].text + ' ' + district_name
                    else:
                        district_name = tr_list[i+1][0].text + ' ' + district_name
                print("\x1b[1;31m%s\x1b[1;m" % district_name)
                consti_list.append(self.noElection_consti(city_name, district_name, district_code, num_elected))
                num_nonElection = num_nonElection+1
                seq_index = seq_index+1
                district_code = district_codeSeq[seq_index]
                district_name = code_toDistrict[str(district_code)]
                num_elected = code_toNumElected[str(district_code)]

            if (target=='local-mp' and 1 <= self.nth <= 3):
                district_name = tr_list[i+1][0].text + ' ' + district_name

            candidate_num = 0
            candi_name_list = []
            votes_num_percent = []
            electorates = tr_list[i+1][num_ths_left-2]
            counted_vote = tr_list[i+1][num_ths_left-1]
            # Collect candidate cells until the first one without <strong> text.
            for j in range(max_candidate_num):
                j_index = j+num_ths_left
                if (tr_list[i][j_index].findtext('strong') == None) :
                    break
                candidate_num = candidate_num+1
                candi_name_list.append(tr_list[i][j_index])  # e.g. <td><strong>party<br>name</strong></td>
                votes_num_percent.append(tr_list[i+1][j_index])  # e.g. <td>3,050<br>(4.09)</td>
            # One dict per candidate, keyed by attrs_result: name cell and vote cell.
            cand_list = list(map(lambda x, y: dict(list(zip(self.attrs_result, [x, y]))), candi_name_list, votes_num_percent))
            valid_vote = tr_list[i+1][num_ths_left + max_candidate_num+0]
            undervote = tr_list[i+1][num_ths_left + max_candidate_num+1]
            blank_ballot = tr_list[i+1][num_ths_left + max_candidate_num+2]

            district_info = (district_name, district_code, num_elected, electorates, counted_vote, candidate_num, cand_list, valid_vote, undervote, blank_ballot)
            district_info = dict(list(zip(self.attrs_district, district_info)))
            consti_list.append(self.parse_consti(district_info, city_name=city_name, city_code=city_code))
            print("\x1b[1;32m%s\x1b[1;m" % district_name)
            seq_index = seq_index+1

    # Codes left over after the last table row are recorded as having no
    # election as well. `i` still holds the index of the last table row.
    while seq_index < len(district_codeSeq):
        district_code = district_codeSeq[seq_index]
        district_name = code_toDistrict[str(district_code)]
        if (target=='local-mp' and 1 <= self.nth <= 3):
            district_name = tr_list[i][0].text + ' ' + district_name
        num_elected = code_toNumElected[str(district_code)]
        print("\x1b[1;33m%s\x1b[1;m" % district_name)
        consti_list.append(self.noElection_consti(city_name, district_name, district_code, num_elected))
        num_nonElection = num_nonElection+1
        seq_index = seq_index+1

    return_result = [{'region_name': city_name, 'region_code': city_code, 'district_result': consti_list}]
    print('\x1b[1;31mcrawled %s election #%d - \x1b[1;m%s, %s(%d)' % (target, self.nth, target_kor+' 선거구별 득표', city_name, seq_index))
    print('\t└ %s, %s(%d)...' % ('무투표 선거구', city_name, num_nonElection))
    return return_result