def _main(self):
    """Log in, search for ``self.addr`` and scrape the resulting companies.

    The run signs in, submits the search, opens every first-page result in
    its own window and scrapes it. When the first page yields at least 20
    links, further result pages are opened from ``params_encode`` and the
    links found on the window at index 1 are scraped as well.

    Returns:
        A JSON string ``{"result": result_data}`` on success, or ``None``
        when an exception aborted the run (the error is logged).
        ``result_data`` is a name defined outside this method.

    The browser is always shut down exactly once, in ``finally``.
    """
    try:
        self._sign_in()
        driver = self.driver

        # Submit the search.
        driver.find_element_by_xpath(
            '//*[@id="searchkey"]').send_keys(self.addr)
        time.sleep(1.5)
        driver.find_element_by_xpath('//*[@id="V3_Search_bt"]').click()

        # Open every detail link of the first result page in its own window.
        urls = [
            link.get_attribute('href')
            for link in driver.find_elements_by_class_name('ma_h1')
        ]
        for url in urls:
            time.sleep(0.1)
            driver.execute_script("window.open('{}')".format(url))
        all_handles = driver.window_handles
        self._scrape_detail_windows(all_handles[1:])

        if len(urls) >= 20:
            # Back to the first (search) window, then open the next pages.
            driver.switch_to.window(all_handles[0])
            for next_url_key in params_encode(addr=self.addr):
                next_url = 'https://www.qichacha.com/search?' + next_url_key
                driver.execute_script(
                    "window.open('{}', '_blank')".format(next_url))
            next_page_handles = driver.window_handles
            # A length check replaces the original `handles[1]` truthiness
            # test, which raised IndexError when no extra window existed and
            # thereby discarded the first-page results.
            if len(next_page_handles) > 1:
                driver.switch_to.window(next_page_handles[1])
                detail_urls = [
                    link.get_attribute('href')
                    for link in driver.find_elements_by_class_name('ma_h1')
                ]
                for detail_url in detail_urls:
                    driver.execute_script(
                        "window.open('{}')".format(detail_url))
                detail_handles = driver.window_handles
                # NOTE(review): only the links of the window at index 1 are
                # opened; the remaining result-page windows are still part
                # of the slice below and presumably fail the detail-table
                # wait and are simply closed - confirm this is intended.
                self._scrape_detail_windows(
                    detail_handles[1:], return_to=detail_handles[0])

        return json.dumps({'result': result_data})
    except Exception as e:
        # Exception (not BaseException) so Ctrl-C / SystemExit still
        # propagate after the driver has been shut down.
        logging.error(msg=e)
        return None
    finally:
        self.driver.quit()


def _sign_in(self):
    """Open the site, submit the login form and dismiss the bind dialog.

    Failures while waiting for optional elements are logged and followed
    by a page refresh; any other Selenium error propagates to the caller.
    """
    driver = self.driver

    # Start from the home page: per the original author, opening the login
    # URL directly may be treated as malicious access.
    driver.get('https://www.qichacha.com/')
    driver.maximize_window()
    driver.find_element_by_xpath(
        '/html/body/header/div/ul[2]/li[9]/a').click()
    try:
        WebDriverWait(driver, 2).until(
            EC.visibility_of_element_located((By.ID, 'normalLogin')))
    except Exception as e:
        logging.error(msg=e)
        driver.refresh()
    driver.find_element_by_xpath('//*[@id="normalLogin"]').click()
    time.sleep(1)

    # NOTE(review): account and password are hard-coded in source; move
    # them to configuration or the environment.
    driver.find_element_by_id('nameNormal').send_keys('13535045165')
    driver.find_element_by_id('pwdNormal').send_keys('l1198659788')
    time.sleep(1)

    # Slider captcha followed by an image captcha.
    data = ''
    try:
        slider_xpath = '//*[@id="nc_1__scale_text"]/span'
        WebDriverWait(driver, 4).until(
            EC.visibility_of_element_located((By.XPATH, (slider_xpath))))
        driver.find_element_by_xpath(slider_xpath).click()
        self._move(driver=driver,
                   slider=driver.find_element_by_xpath(slider_xpath),
                   xoffset=308)
        WebDriverWait(driver, 4).until(
            EC.visibility_of_element_located(
                (By.CLASS_NAME, 'imgCaptcha_img')))
        captcha_img = driver.find_element_by_xpath(
            '//*[@id="nc_1__imgCaptcha_img"]/img')
        # Drop the first 22 characters of the src attribute - presumably
        # the "data:image/png;base64," prefix - to keep the base64 payload.
        data = captcha_img.get_attribute('src')[22:]
    except Exception as e:
        logging.error(msg=e)
        driver.refresh()

    # TODO(CLay): a captcha-solving service or image recognition could be
    # plugged in here.
    # get_image is called once; the original invoked it twice for the same
    # data (once for the test, once for the assignment).
    image_path = get_image(data)
    if image_path:
        # Per the original author's note this is the absolute path of the
        # image written locally from the base64 data.
        self.FILE_NAME = image_path
        # The captcha is typed in by hand.
        captcha_key = input(''' 请手动输入验证码: ''').strip()
        time.sleep(6)
        driver.find_element_by_xpath(
            '//*[@id="nc_1_captcha_input"]').send_keys(captcha_key)
        driver.find_element_by_xpath(
            '//*[@id="nc_1_scale_submit"]/span').click()
        captcha_btn = driver.find_element_by_class_name('imgCaptcha_btn')
        if captcha_btn.get_attribute('style') == 'border-top-color: red;':
            driver.refresh()

    time.sleep(1)
    driver.find_element_by_xpath(
        '//*[@id="user_login_normal"]/button').click()
    driver.maximize_window()

    # Close the "bindwxModal" dialog when it shows up.
    try:
        modal_button = '//*[@id="bindwxModal"]/div/div/div/button'
        WebDriverWait(driver, 3).until(
            EC.visibility_of_element_located((By.XPATH, modal_button)))
        driver.find_element_by_xpath(modal_button).click()
    except Exception as e:
        logging.error(msg=e)
        driver.refresh()  # TODO


def _scrape_detail_windows(self, handles, return_to=None):
    """Scrape and close each window in *handles*.

    Args:
        handles: window handles of the pages to visit.
        return_to: optional handle to switch back to after every window,
            whether scraping it succeeded or not.

    Scraped items are appended to ``result_data`` unless their
    ``entFddbr`` value is already listed in ``duplicate_entFddr``; both
    names are defined outside this method. A window that fails to scrape
    is logged, closed and skipped.
    """
    import copy

    for handle in handles:
        self.driver.switch_to.window(handle)
        # Deep-copy the template: the original reused self._ITEM itself, so
        # every stored result was the same dict and the change-record list
        # kept growing across companies.
        item = copy.deepcopy(self._ITEM)
        try:
            self._fill_company_item(item)
            if item['entFddbr'] not in duplicate_entFddr:
                result_data.append(item)
                duplicate_entFddr.append(item['entFddbr'])
            self.driver.close()  # close the current window
        except Exception as e:
            logging.error(msg=e)
            self.driver.close()
        finally:
            if return_to is not None:
                self.driver.switch_to.window(return_to)


def _fill_company_item(self, item):
    """Fill *item* from the company detail page in the current window.

    Raises whatever Selenium raises when the detail table does not become
    visible within one second or an expected element is missing.
    """
    driver = self.driver
    top = '//*[@id="company-top"]/div[2]/div[2]'
    info = '//*[@id="Cominfo"]/table[2]/tbody'
    address_xpath = top + '/div[3]/div[2]/span[3]/a[1]'

    def text_at(xpath):
        return driver.find_element_by_xpath(xpath).text

    # The second "ntable" element decides whether this page is a company
    # that should be scraped at all.
    WebDriverWait(driver, 1).until(
        EC.visibility_of_element_located(
            (By.XPATH, ('//*[@class="ntable"][2]'))))

    item['cxDate'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    item['entName'] = text_at(top + '/div[1]/h1')
    item['entFddbr'] = text_at(
        '//*[@id="Cominfo"]/table[1]/tbody/tr[2]/td[1]/div/div[1]/div[2]/a/h2')
    item['entAddress'] = text_at(address_xpath)
    item['qyxydm'] = text_at(info + '/tr[3]/td[2]')

    registration_fields = (
        ('entZczb', info + '/tr[1]/td[2]'),
        ('entZczbIs', info + '/tr[1]/td[4]'),
        ('entzt', info + '/tr[2]/td[2]'),
        ('entclsj', info + '/tr[2]/td[4]'),
        ('entXydm', info + '/tr[3]/td[2]'),
        ('entSbh', info + '/tr[3]/td[4]'),
        ('entZch', info + '/tr[4]/td[2]'),
        # NOTE(review): same cell as entSbh in the original code; possibly
        # a copy-paste slip - confirm against the page layout.
        ('entJgdm', info + '/tr[3]/td[4]'),
        ('entGsType', info + '/tr[5]/td[2]'),
        ('entXy', info + '/tr[5]/td[4]'),
        ('entclsjIs', info + '/tr[6]/td[2]'),
        ('entRegister', info + '/tr[6]/td[4]'),
        ('entRegion', info + '/tr[7]/td[2]'),
        ('entEname', info + '/tr[7]/td[4]'),
        ('entOutName', info + '/tr[8]/td[2]/span'),
        ('entPledgeCount', info + '/tr[8]/td[4]'),
        ('entManMany', info + '/tr[9]/td[2]'),
        ('entStopTime', info + '/tr[9]/td[4]'),
        ('entAddress', address_xpath),
        ('entScope', info + '/tr[11]/td[2]'),
    )
    registration = item['gsxxResultList'][0]
    for key, xpath in registration_fields:
        registration[key] = text_at(xpath)

    # Change records: every row of "Changelist" except the first one.
    change_keys = ('bgNo', 'bgDate', 'bgpri', 'bgBefore', 'bgLater')
    rows = driver.find_element_by_id(
        'Changelist').find_elements_by_tag_name('tr')[1:]
    for row in rows:
        item['bgjlResultList'].append({
            key: row.find_element_by_xpath('td[{}]'.format(col)).text
            for col, key in enumerate(change_keys, 1)
        })
}  # NOTE(review): closes a literal opened before the visible part of this file
# Build the detection network and move it to the GPU.
net = tinyyolonet.Yolo_tiny()
# Alternative backbone left disabled by the author:
#net = models.resnet18(pretrained=False)
#net.fc = nn.Linear(512, 784)
net.cuda()
# Restore the weights stored under the checkpoint's 'state_dict' key.
checkpoint_load = torch.load('model_best/Best_Models.pth')
#net.load_state_dict(checkpoint_load['state_dict'])
net.load_state_dict(checkpoint_load['state_dict'])
# Switch the network to evaluation mode before running it.
net.eval()
#hyper_parameter
# `i` is the argument handed to func.get_image / func.get_image_raw below.
# NOTE(review): unclear from here whether it is a sample index or an image
# size - confirm against `func`.
i = 448
# Not used in the visible part of the script.
threshold = 0.005
x = func.get_image(i)
print(x)
ima_raw = func.get_image_raw(i)
# Add a leading dimension, wrap in the legacy autograd Variable and move the
# input to the GPU alongside the network.
x = x.unsqueeze(0)
x = Variable(x)
x = x.cuda()
#print(x)
pre = net(x)
# Split the first output row into three consecutive segments and reshape:
#   [0, 294)   -> (49, 6)
#   [294, 392) -> (49, 2)
#   [392, end) -> (49, 2, 4)
# NOTE(review): presumably 49 grid cells with 6 class scores, 2 box
# confidences and 2 boxes of 4 coordinates each - confirm against the model.
probs = pre[0,:294].contiguous().view(49, 6)
confs = pre[0,294:392].contiguous().view(49, 2)
coords = pre[0,392:].contiguous().view(49, 2, 4)
# One column per entry of the (49, 2) confidence view.
confs_1 = confs[:, 0]
confs_2 = confs[:, 1]
def collect_image(url):
    """Pass the result of ``get_image(url)`` straight to ``show_images``.

    Args:
        url: forwarded unchanged to ``get_image``.

    Returns:
        None.
    """
    show_images(get_image(url))
def test_should_get_image(self, mock_get_image):
    """``get_image`` called with the test URL should yield a NumPy array."""
    # Prime the injected mock with the fixture data before the call.
    mock_get_image.return_value = self.data

    fetched = get_image(self.test_url)

    self.assertIsInstance(fetched, np.ndarray)