def sele_app_id():
    """Return the first column of every row read from the local ``app_info`` database.

    The rows come from ``hu_utils.select_one`` called on a connection opened
    with ``hu_utils.open_local_db("app_info")``.

    :return: list holding ``row[0]`` for each fetched row, in fetch order.
    """
    connection = hu_utils.open_local_db("app_info")
    return [row[0] for row in hu_utils.select_one(connection)]
def main():
    """Scrape the WeChat public-account table on qimai.cn and store the rows.

    Opens https://www.qimai.cn/weixin in Chrome, waits until the account
    names are present, reads the name, the third-column text (``et_name``)
    and the fourth-column text (``strength_value``) of every table row, and
    upserts the collected dicts into the ``public_accounts_info`` table of
    the local ``app_info`` database.

    The browser is always shut down, whether scraping succeeds or fails.
    The database write happens only after a successful scrape.
    """
    # Create the driver before entering the try block: if it cannot be
    # created there is nothing to shut down, and the real error must not be
    # masked by a NameError on ``browser`` in the cleanup code.
    browser = get_chrome(True)
    try:
        browser.maximize_window()  # full-screen browser window
        url = "https://www.qimai.cn/weixin"
        browser.get(url)
        browser.implicitly_wait(10)
        wait = WebDriverWait(browser, 10)
        wait.until(EC.presence_of_all_elements_located(
            ("xpath", "//p[@class='medium-txt']")))
        # Original author's note: the page lists 100 entries.
        trs = browser.find_elements("xpath", "//tr[@class='ivu-table-row']")
        public_accounts_infos = []
        for tr in trs:
            public_accounts_info = {
                "name": tr.find_element(
                    "xpath", "td//p[@class='medium-txt']").text,
                "et_name": tr.find_element("xpath", "td[3]/div/span").text,
                "strength_value": tr.find_element(
                    "xpath", "td[4]/div/span").text,
            }
            print(public_accounts_info)
            public_accounts_infos.append(public_accounts_info)
    finally:
        logger.info("关闭浏览器")
        # quit() ends the whole WebDriver session (all windows plus the
        # driver process); close() would only close the current window.
        browser.quit()

    # Reached only when scraping finished without raising.
    conn = hu_utils.open_local_db(db="app_info")
    hu_utils.insert_update_many(conn, public_accounts_infos,
                                "public_accounts_info")
def get_etid():
    """Copy enterprise ids and names into the local ``et_name_status`` table.

    Reads batches of rows with ``hu_utils.select_ones`` from the connection
    returned by ``hu_utils.open_line_db()``. For every batch, the rows whose
    second column (the name) is truthy are turned into
    ``{"etid": row[0], "et_name": row[1]}`` dicts and written with
    ``hu_utils.insert_ignore_many`` through a fresh local connection.

    :return: None
    """
    source_conn = hu_utils.open_line_db()
    for batch in hu_utils.select_ones(source_conn):
        # A parenthesised single-argument print produces the same output on
        # Python 2 and Python 3; the bare print statement is a SyntaxError
        # on Python 3.
        print(len(batch))
        # Rows without a name are skipped.
        rows = [
            {"etid": record[0], "et_name": record[1]}
            for record in batch
            if record[1]
        ]
        # One local connection per batch, as the insert helper is handed a
        # newly opened connection every time.
        local_conn = hu_utils.open_local_db()
        hu_utils.insert_ignore_many(local_conn, rows, "et_name_status")
def main():
    """Main workflow: build the search URLs, parse them and persist the results.

    1. Turn every record from ``get_etid()`` that has a name into a
       tianyancha search-URL entry and hand the list to ``parse_start_url``.
    2. Run ``Parse_url_two(dt_url_twos).main()`` to obtain the ten result
       collections.
    3. Write each collection to its own table, then apply ``update_status``
       to ``et_name_status``.

    :return: None
    """
    start_urls1 = [
        {
            "start_url": "https://www.tianyancha.com/search?searchType=company&key=%s" % record[1],
            "etid": record[0],
            "et_name": record[1],
        }
        for record in get_etid()
        if record[1]
    ]
    parse_start_url(start_urls1)

    parser = Parse_url_two(dt_url_twos)
    (et_host_infos,
     et_busi_infos,
     et_shareholder_infos,
     et_foreign_investment_infos,
     et_branch_offices,
     wechat_list_infos,
     et_container_copyright_infos,
     et_container_icp_infos,
     et_trademark_infos,
     et_rongzi_infos) = parser.main()

    # Store every result set in its own table, one fresh connection each.
    insert_plan = (
        (et_busi_infos, "et_busi_info"),
        (et_host_infos, "et_host_info"),
        (et_shareholder_infos, "et_shareholder_info"),
        (et_foreign_investment_infos, "et_foreign_investment_info"),
        (et_branch_offices, "et_branch_office"),
        (wechat_list_infos, "et_wechat_list_info"),
        (et_container_copyright_infos, "et_container_copyright_info"),
        (et_container_icp_infos, "et_container_icp_info"),
        (et_trademark_infos, "et_trademark_info"),
        (et_rongzi_infos, "et_rongzi_infos"),
    )
    for records, table_name in insert_plan:
        hu_utils.insert_ignore_many(hu_utils.open_local_db(), records,
                                    table_name)

    # Finally record the processing status of the handled enterprises.
    hu_utils.insert_update_many(hu_utils.open_local_db(), update_status,
                                "et_name_status")
def get_etid():
    """Fetch the enterprise rows from the local database and return them.

    Opens a connection with ``hu_utils.open_local_db()``, reads the rows
    with ``hu_utils.select_one`` and prints how many were fetched.

    :return: whatever ``hu_utils.select_one`` returned (it must support
        ``len()``).
    """
    conn = hu_utils.open_local_db()
    et_names = hu_utils.select_one(conn)
    # Formatting into a single string prints the same text ("<label> <count>")
    # on Python 2 and Python 3; the original bare print statement is a
    # SyntaxError on Python 3.
    print("获取的企业数 %s" % len(et_names))
    return et_names
def _body_html(browser):
    """Return the innerHTML of the ``<body>`` of the page the browser is showing."""
    return browser.find_element("xpath", "//body").get_attribute("innerHTML")


def _store_app_records(ios_main, android_main, ios_android_ID,
                       ios_Leaderboardls, android_Leaderboardls,
                       android_rating):
    """Upsert everything collected for one app into the local ``app_info`` database.

    Empty collections are skipped and logged. ``android_main`` is only
    written when it carries an ``android_id``.
    """
    if ios_main:
        conn = hu_utils.open_local_db("app_info")
        hu_utils.insert_update_one(conn, ios_main, "ios_main")
    else:
        logger.info("ios_main 无数据")

    if "android_id" in android_main:
        conn = hu_utils.open_local_db("app_info")
        hu_utils.insert_update_one(conn, android_main, "android_main")

    if ios_android_ID:
        conn = hu_utils.open_local_db("app_info")
        hu_utils.insert_update_one(conn, ios_android_ID, "ios_android_ID")
    else:
        logger.info("ios_android_ID 无数据")

    if ios_Leaderboardls:
        conn = hu_utils.open_local_db("app_info")
        hu_utils.insert_update_many(conn, ios_Leaderboardls,
                                    "ios_Leaderboardls")
    else:
        logger.info("ios_Leaderboardls 无数据")

    if android_Leaderboardls:
        conn = hu_utils.open_local_db("app_info")
        hu_utils.insert_update_many(conn, android_Leaderboardls,
                                    "android_Leaderboardls")
    else:
        logger.info("android_Leaderboardls 无数据")

    if android_rating:
        conn = hu_utils.open_local_db("app_info")
        hu_utils.insert_update_many(conn, android_rating, "android_rating")
    else:
        logger.info("android_rating 无数据")


def main():
    """Crawl the qimai.cn iPhone free-app rankings and store per-app details.

    For each of the first 25 ranking lists in the "more-item-list" menu the
    function loads the list, scrolls to trigger lazy loading, and walks up
    to 200 table rows. For every app whose iOS id is not already returned by
    ``sele_app_id()`` it visits the company page, the app detail tabs and,
    when available, the Android counterpart. The HTML is handed to the
    ``parse_html_two*`` helpers and the results are stored in the local
    ``app_info`` database.

    A failure on one row is logged and does not stop the crawl. Whatever was
    collected for that row is still stored. The browser is always quit at
    the end.
    """
    global shibai_num

    # Set membership is O(1) per row; the ids are only used for "in" tests.
    known_ids = set(sele_app_id())
    browser = get_chrome()
    try:
        browser.maximize_window()
        url = "https://www.qimai.cn/rank/index/brand/free/device/iphone/country/cn/genre/5000"
        browser.get(url)
        browser.implicitly_wait(10)
        wait = WebDriverWait(browser, 10)
        browser.get_screenshot_as_file('1.png')
        wait.until(EC.presence_of_element_located(
            ('xpath', "//tr/td/div")))  # wait for the ranking table
        time.sleep(1)

        shibai_num += 1
        if shibai_num == 1:
            # Only on the first call: click the "icon-shibai" element
            # (presumably dismisses a popup -- TODO confirm).
            shibai = browser.find_element(
                "xpath", "//span[contains(@class,'icon-shibai')]")
            shibai.click()
            time.sleep(1)

        for page_index in range(25):
            ul = browser.find_element(
                "xpath", '//ul[contains(@class,"more-item-list")]')
            lis = ul.find_elements('xpath', "li")
            li = lis[page_index]
            time.sleep(2)
            logger.info("开始爬取第{}页排行榜".format(page_index + 1))
            browser.execute_script('window.scrollTo(0,0)')  # scroll to top
            li.click()  # switch the first-level page to another ranking
            time.sleep(3)
            browser.refresh()
            wait.until(EC.presence_of_element_located(
                ('xpath', "//tr/td/div")))  # wait for the ranking table
            time.sleep(3.1)
            for _ in range(3):
                # Scrolling to the bottom loads rows 50 to 200.
                browser.execute_script(
                    'window.scrollTo(0,document.body.scrollHeight)')
                time.sleep(5)
            browser.execute_script('window.scrollTo(0,0)')  # scroll to top
            time.sleep(5)
            trs = browser.find_elements("xpath", "//table/tbody/tr")

            for aa, tr in enumerate(trs, 1):
                if aa > 200:
                    continue
                logger.info("开始爬取第{}条数据".format(aa))
                ios_android_ID = {}
                android_main = {}
                ios_main = {}
                ios_Leaderboardls = []
                android_Leaderboardls = []
                android_rating = []
                android_id = 0
                try:
                    # Move the viewport down by one row.
                    browser.execute_script('window.scrollBy(0,67)')
                    time.sleep(0.3)

                    # ---- iOS info ----
                    tds = tr.find_elements("xpath", "td")
                    app_name = tds[1].find_element("xpath", 'div/div/a')
                    ios_url = app_name.get_attribute("href")  # iOS url
                    ios_id = re.search(
                        r"\/app\/rank\/appid\/(\d+)\/country\/cn",
                        ios_url).group(1)  # iOS id
                    print("ios_id:", ios_id)
                    if int(ios_id) in known_ids:
                        logger.info("{}已存储到数据库".format(ios_id))
                        continue

                    try:
                        company_ulsz = tds[7].find_element("xpath", "a")
                        company_ulsz.click()
                        time.sleep(5)
                        browser.switch_to.window(
                            browser.window_handles[1])  # company page
                        HTML_twoC = _body_html(browser)
                        ios_main = parse_html_twoC(ios_main, HTML_twoC)
                        browser.close()
                        time.sleep(2)
                        browser.switch_to.window(
                            browser.window_handles[0])  # back to the list
                    except Exception:
                        logger.debug("获取元素出错,没有找到公司页面")

                    action_click(browser, app_name)
                    time.sleep(5)
                    browser.switch_to.window(
                        browser.window_handles[1])  # app detail page
                    try:
                        HTML_twoA7 = _body_html(browser)
                        ios_Leaderboardls = parse_html_twoA7(
                            ios_Leaderboardls, HTML_twoA7, ios_id)
                    except Exception:
                        logger.info("parse_html_twoA7解析失败")

                    uls = browser.find_elements(
                        "xpath", "//ul[@class='select-list']")
                    lis0 = uls[0].find_elements("xpath", 'li')
                    li1 = lis0[1].find_element("xpath", 'a')
                    li3 = lis0[3].find_element("xpath", 'a')
                    li1.click()
                    time.sleep(5)
                    try:
                        HTML_twoA1 = _body_html(browser)
                        ios_main, android_main = parse_html_twoA1(
                            ios_main, android_main, HTML_twoA1)
                    except Exception:
                        logger.info("解析失败")
                    li3.click()
                    time.sleep(5)
                    HTML_twoA3 = _body_html(browser)
                    ios_main = parse_html_twoA3(ios_main, HTML_twoA3)

                    # ---- Android info ----
                    button_a = browser.find_element(
                        "xpath", '//button[contains(@class,"btn-android")]')
                    button_text = button_a.text
                    if "发现安卓版" not in button_text:
                        button_a.click()
                        time.sleep(5)
                        # The browser is now on the Android page.
                        android_url = browser.current_url
                        android_main["caiji_url"] = android_url
                        android_id = re.search(
                            r"\/andapp\/baseinfo\/appid\/(\d+)",
                            android_url).group(1)  # Android id
                        android_main["android_id"] = android_id
                        uls = browser.find_elements(
                            "xpath", "//ul[@class='select-list']")
                        lis1 = uls[0].find_elements("xpath", 'li')
                        lis2 = uls[1].find_elements("xpath", 'li')
                        li1 = lis1[1].find_element("xpath", 'a')
                        li3 = lis1[3].find_element("xpath", 'a')
                        li5 = lis2[1].find_element("xpath", 'a')
                        li1.click()
                        time.sleep(5)
                        HTML_twoB1 = _body_html(browser)
                        android_main = parse_html_twoB1(
                            android_main, HTML_twoB1)
                        li3.click()
                        time.sleep(5)
                        try:
                            HTML_twoB3 = _body_html(browser)
                            android_rating = parse_html_twoB3(
                                android_rating, HTML_twoB3, android_id)
                        except Exception:
                            logger.info("解析失败")
                        li5.click()
                        time.sleep(5)
                        # Maps the active tab's label to its rank-info HTML.
                        HTML_twoB5 = {}
                        try:
                            towB5 = browser.find_element(
                                "xpath",
                                "//div[contains(@class,'ivu-tabs-tab-active')]"
                            ).text
                            HTML_twoB5[towB5] = browser.find_element(
                                "xpath", "//div[@id='rank-info']"
                            ).get_attribute("innerHTML")
                            ivu_tabs = browser.find_elements(
                                "xpath", "//div[@class='ivu-tabs-tab']")
                            for position, ivu_tab in enumerate(ivu_tabs, 1):
                                # Only every second inactive tab is opened
                                # (reason not visible here -- TODO confirm).
                                if position % 2 == 0:
                                    ivu_tab.click()
                                    time.sleep(4)
                                    azz = browser.find_element(
                                        "xpath",
                                        "//div[contains(@class,'ivu-tabs-tab-active')]"
                                    ).text
                                    HTML_twoB5[azz] = browser.find_element(
                                        "xpath", "//div[@id='rank-info']"
                                    ).get_attribute("innerHTML")
                            android_Leaderboardls = parse_html_twoB5(
                                android_Leaderboardls, HTML_twoB5, android_id)
                        except Exception:
                            logger.debug("无排名数据")

                    browser.switch_to.window(browser.window_handles[1])
                    browser.close()  # close the extra detail window
                    time.sleep(1)
                    browser.switch_to.window(
                        browser.window_handles[0])  # back to the list

                    # ---- id bookkeeping ----
                    ios_main["ios_id"] = ios_id
                    ios_main["caiji_url"] = ios_url
                    ios_android_ID["ios_id"] = ios_id
                    ios_android_ID["android_id"] = android_id
                    logger.info("解析结束,开始存储数据")
                except Exception:
                    # ``Exception`` rather than a bare except, so that
                    # Ctrl-C / SystemExit can still stop the crawl.
                    logger.info("-----第{}条数据获取失败-----".format(aa))
                    # Close every window except the ranking list so the next
                    # row starts from a known state.
                    while len(browser.window_handles) > 1:
                        logger.info('关闭一个多余窗口!')
                        browser.switch_to.window(browser.window_handles[1])
                        browser.close()
                    browser.switch_to.window(browser.window_handles[0])
                finally:
                    # Store whatever was gathered, even after a failure or
                    # when the app was skipped as already known.
                    logger.info("开始数据存储")
                    _store_app_records(
                        ios_main, android_main, ios_android_ID,
                        ios_Leaderboardls, android_Leaderboardls,
                        android_rating)
    finally:
        logger.info("关闭浏览器")
        browser.quit()