Example #1
0
def sele_app_id():
    """Return the first column of every row read from the local ``app_info`` database.

    The rows are whatever ``hu_utils.select_one`` yields for a connection
    opened with ``hu_utils.open_local_db("app_info")``.

    :return: list holding element ``[0]`` of each row, in row order
    """
    connection = hu_utils.open_local_db("app_info")
    rows = hu_utils.select_one(connection)
    return [row[0] for row in rows]
Example #2
0
def main():
    """Scrape the table on https://www.qimai.cn/weixin and store its rows.

    For every ``tr`` with class ``ivu-table-row`` three texts are collected:
    the ``p.medium-txt`` text (stored as ``name``), the third column's span
    (stored as ``et_name``) and the fourth column's span (stored as
    ``strength_value``). Each record is printed and, at the end, all records
    are written to the ``public_accounts_info`` table of the local
    ``app_info`` database via ``hu_utils.insert_update_many``.

    Records gathered before a failure are still persisted; the original
    exception then propagates to the caller.

    :return: None
    """
    # Both names are bound before the ``try`` so the ``finally`` clause can
    # never raise a NameError that would mask the real failure.
    browser = None
    public_accounts_infos = []
    try:
        browser = get_chrome(True)  # obtain the Chrome driver
        browser.maximize_window()  # maximise the browser window
        url = "https://www.qimai.cn/weixin"
        browser.get(url)
        browser.implicitly_wait(10)
        wait = WebDriverWait(browser, 10)
        wait.until(EC.presence_of_all_elements_located(("xpath", "//p[@class='medium-txt']")))
        # Table rows (100 entries according to the original author's note).
        trs = browser.find_elements("xpath", "//tr[@class='ivu-table-row']")
        for tr in trs:
            public_accounts_info = {
                "name": tr.find_element("xpath", "td//p[@class='medium-txt']").text,
                "et_name": tr.find_element("xpath", "td[3]/div/span").text,
                "strength_value": tr.find_element("xpath", "td[4]/div/span").text,
            }
            print(public_accounts_info)
            public_accounts_infos.append(public_accounts_info)
    finally:
        try:
            if browser is not None:
                logger.info("关闭浏览器")
                # quit() ends the whole driver session, not just the window.
                browser.quit()
        finally:
            # Nested ``finally``: a failing browser shutdown must not prevent
            # the already-collected records from being stored.
            if public_accounts_infos:
                conn = hu_utils.open_local_db(db="app_info")
                hu_utils.insert_update_many(conn, public_accounts_infos, "public_accounts_info")
Example #3
0
def get_etid():
    """Copy ``(etid, et_name)`` pairs into the local ``et_name_status`` table.

    Batches are read with ``hu_utils.select_ones`` from the connection
    returned by ``hu_utils.open_line_db()``. For each batch its length is
    printed, rows whose second element is falsy are dropped, and the rest are
    written with ``hu_utils.insert_ignore_many`` on a freshly opened local
    connection.

    :return: None
    """
    source_conn = hu_utils.open_line_db()
    et_namess = hu_utils.select_ones(source_conn)
    for et_names in et_namess:
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3; the bare ``print x`` statement is a syntax error on 3.
        print(len(et_names))
        et_nams = [
            {"etid": et_name[0], "et_name": et_name[1]}
            for et_name in et_names
            if et_name[1]
        ]
        # A new local connection is opened for every batch, as before.
        local_conn = hu_utils.open_local_db()
        hu_utils.insert_ignore_many(local_conn, et_nams, "et_name_status")
def main():
    """
    Main pipeline: build the search tasks, run the parsers and persist
    every result list to the local database.

    :return: None
    """
    search_tasks = []
    for row in get_etid():
        # Rows without a name (second element) produce no search task.
        if not row[1]:
            continue
        search_tasks.append({
            "start_url": "https://www.tianyancha.com/search?searchType=company&key=%s" % row[1],
            "etid": row[0],
            "et_name": row[1],
        })

    parse_start_url(search_tasks)
    parser = Parse_url_two(dt_url_twos)
    (host_rows, busi_rows, shareholder_rows, foreign_investment_rows,
     branch_office_rows, wechat_rows, copyright_rows, icp_rows,
     trademark_rows, rongzi_rows) = parser.main()

    # Store each result list in its own table. The order below is the order
    # in which the inserts are issued; every insert gets a fresh connection.
    insert_ignore_targets = (
        (busi_rows, "et_busi_info"),
        (host_rows, "et_host_info"),
        (shareholder_rows, "et_shareholder_info"),
        (foreign_investment_rows, "et_foreign_investment_info"),
        (branch_office_rows, "et_branch_office"),
        (wechat_rows, "et_wechat_list_info"),
        (copyright_rows, "et_container_copyright_info"),
        (icp_rows, "et_container_icp_info"),
        (trademark_rows, "et_trademark_info"),
        (rongzi_rows, "et_rongzi_infos"),
    )
    for rows, table_name in insert_ignore_targets:
        connection = hu_utils.open_local_db()
        hu_utils.insert_ignore_many(connection, rows, table_name)

    # Finally write the status records back with insert-or-update semantics.
    connection = hu_utils.open_local_db()
    hu_utils.insert_update_many(connection, update_status, "et_name_status")
def get_etid():
    """Read rows from the local database, print how many there are, return them.

    The rows are the result of ``hu_utils.select_one`` on a connection opened
    with ``hu_utils.open_local_db()``.

    :return: the rows exactly as returned by ``hu_utils.select_one``
    """
    conn = hu_utils.open_local_db()
    et_names = hu_utils.select_one(conn)
    # One pre-formatted string prints "<label> <count>" identically on
    # Python 2 and Python 3; ``print a, b`` is a syntax error on Python 3.
    print("获取的企业数 %d" % len(et_names))
    return et_names
Example #6
0
def _body_html(browser):
    """Return the ``innerHTML`` of the ``<body>`` of the currently focused window."""
    return browser.find_element("xpath", "//body").get_attribute("innerHTML")


def _active_rank_tab(browser):
    """Return ``(label of the active tab, innerHTML of the #rank-info div)``."""
    label = browser.find_element(
        "xpath", "//div[contains(@class,'ivu-tabs-tab-active')]").text
    html = browser.find_element(
        "xpath", "//div[@id='rank-info']").get_attribute("innerHTML")
    return label, html


def _close_extra_windows(browser):
    """Close every window except the first and switch focus back to the first."""
    while len(browser.window_handles) > 1:
        logger.info('关闭一个多余窗口!')
        browser.switch_to.window(browser.window_handles[1])
        browser.close()
    browser.switch_to.window(browser.window_handles[0])


def _store_app_records(ios_main, android_main, id_mapping, ios_leaderboards,
                       android_leaderboards, android_ratings):
    """Write whatever was collected for one app to the local ``app_info`` database."""
    logger.info("开始数据存储")
    if ios_main:
        conn = hu_utils.open_local_db("app_info")
        hu_utils.insert_update_one(conn, ios_main, "ios_main")
    else:
        logger.info("ios_main 无数据")
    # Android data is stored only once an android id has been recorded.
    if "android_id" in android_main:
        conn = hu_utils.open_local_db("app_info")
        hu_utils.insert_update_one(conn, android_main, "android_main")
    if id_mapping:
        conn = hu_utils.open_local_db("app_info")
        hu_utils.insert_update_one(conn, id_mapping, "ios_android_ID")
    else:
        logger.info("ios_android_ID 无数据")
    if ios_leaderboards:
        conn = hu_utils.open_local_db("app_info")
        hu_utils.insert_update_many(conn, ios_leaderboards, "ios_Leaderboardls")
    else:
        logger.info("ios_Leaderboardls 无数据")
    if android_leaderboards:
        conn = hu_utils.open_local_db("app_info")
        hu_utils.insert_update_many(conn, android_leaderboards,
                                    "android_Leaderboardls")
    else:
        logger.info("android_Leaderboardls 无数据")
    if android_ratings:
        conn = hu_utils.open_local_db("app_info")
        hu_utils.insert_update_many(conn, android_ratings, "android_rating")
    else:
        logger.info("android_rating 无数据")


def main():
    """Walk the qimai.cn rank boards and store per-app iOS/Android data.

    Flow, as implemented below:

    1. Load the ids returned by ``sele_app_id()``; apps whose iOS id is among
       them are skipped.
    2. Open the rank page, then click through the first 25 entries of the
       ``more-item-list`` menu one after another.
    3. For up to 200 table rows per board, open the company page and the app
       detail page in new windows, feed their HTML to the ``parse_html_*``
       helpers, and optionally follow the Android button.
    4. Store whatever was collected for the row, even when the row failed.

    The browser is always shut down with ``quit()`` when the function exits.

    :return: None
    """
    global shibai_num
    # A set gives O(1) membership tests inside the row loop.
    ids = set(sele_app_id())
    browser = get_chrome()
    # The ``try`` starts right after the driver exists so that a failure in
    # the page set-up below can no longer leak a running browser.
    try:
        browser.maximize_window()
        url = "https://www.qimai.cn/rank/index/brand/free/device/iphone/country/cn/genre/5000"
        browser.get(url)
        browser.implicitly_wait(10)
        wait = WebDriverWait(browser, 10)
        browser.get_screenshot_as_file('1.png')
        wait.until(EC.presence_of_element_located(('xpath', "//tr/td/div")))  # wait for the table
        time.sleep(1)
        shibai_num += 1
        if shibai_num == 1:
            # Only on the first call: click the 'icon-shibai' element.
            # NOTE(review): presumably this dismisses an overlay - confirm.
            shibai = browser.find_element(
                "xpath", "//span[contains(@class,'icon-shibai')]")
            shibai.click()
            time.sleep(1)

        for board_index in range(25):
            ul = browser.find_element(
                "xpath", '//ul[contains(@class,"more-item-list")]')
            lis = ul.find_elements('xpath', "li")
            li = lis[board_index]
            time.sleep(2)
            logger.info("开始爬取第{}页排行榜".format(board_index + 1))
            browser.execute_script('window.scrollTo(0,0)')  # scroll back to the top
            li.click()  # switch to another rank board on the top-level page
            time.sleep(3)
            browser.refresh()
            wait.until(EC.presence_of_element_located(
                ('xpath', "//tr/td/div")))  # wait for the table
            time.sleep(3.1)
            # Scroll to the bottom three times so that the entries after the
            # first 50 (up to 200, per the original note) get loaded.
            for _ in range(3):
                browser.execute_script(
                    'window.scrollTo(0,document.body.scrollHeight)')
                time.sleep(5)
            browser.execute_script('window.scrollTo(0,0)')  # scroll back to the top
            time.sleep(5)
            trs = browser.find_elements("xpath", "//table/tbody/tr")

            for aa, tr in enumerate(trs, start=1):
                if aa > 200:
                    # Every later row would be skipped as well.
                    break
                logger.info("开始爬取第{}条数据".format(aa))
                ios_android_ID = {}
                android_main = {}
                ios_main = {}
                ios_Leaderboardls = []
                android_Leaderboardls = []
                android_rating = []
                android_id = 0
                try:
                    browser.execute_script('window.scrollBy(0,67)')  # nudge the page down
                    time.sleep(0.3)

                    # ---- iOS info ----
                    tds = tr.find_elements("xpath", "td")
                    app_name = tds[1].find_element("xpath", 'div/div/a')
                    ios_url = app_name.get_attribute("href")  # iOS detail URL
                    # Raw string: same pattern, no invalid-escape warnings.
                    ios_id = re.search(
                        r"\/app\/rank\/appid\/(\d+)\/country\/cn",
                        ios_url).group(1)  # iOS app id
                    print("ios_id:", ios_id)
                    if int(ios_id) in ids:
                        logger.info("{}已存储到数据库".format(ios_id))
                        continue

                    try:
                        company_ulsz = tds[7].find_element("xpath", "a")
                        company_ulsz.click()
                        time.sleep(5)
                        browser.switch_to.window(
                            browser.window_handles[1])  # company page
                        HTML_twoC = _body_html(browser)
                        ios_main = parse_html_twoC(ios_main, HTML_twoC)
                        browser.close()
                        time.sleep(2)
                        browser.switch_to.window(
                            browser.window_handles[0])  # back to the rank page
                    except Exception:
                        logger.debug("获取元素出错,没有找到公司页面")
                        # A company window may still be open if the failure
                        # happened after the click; close it so that
                        # window_handles[1] below is the app detail page.
                        _close_extra_windows(browser)

                    action_click(browser, app_name)
                    time.sleep(5)
                    browser.switch_to.window(
                        browser.window_handles[1])  # app detail page
                    try:
                        HTML_twoA7 = _body_html(browser)
                        ios_Leaderboardls = parse_html_twoA7(
                            ios_Leaderboardls, HTML_twoA7, ios_id)
                    except Exception:
                        logger.info("parse_html_twoA7解析失败")

                    uls = browser.find_elements("xpath",
                                                "//ul[@class='select-list']")
                    lis0 = uls[0].find_elements("xpath", 'li')
                    li1 = lis0[1].find_element("xpath", 'a')
                    li3 = lis0[3].find_element("xpath", 'a')
                    li1.click()
                    time.sleep(5)
                    try:
                        HTML_twoA1 = _body_html(browser)
                        ios_main, android_main = parse_html_twoA1(
                            ios_main, android_main, HTML_twoA1)
                    except Exception:
                        logger.info("解析失败")
                    li3.click()
                    time.sleep(5)
                    HTML_twoA3 = _body_html(browser)
                    ios_main = parse_html_twoA3(ios_main, HTML_twoA3)

                    # ---- Android info ----
                    button_a = browser.find_element(
                        "xpath", '//button[contains(@class,"btn-android")]')
                    button_text = button_a.text
                    # The button is followed only when its label does not
                    # contain the text below.
                    if "发现安卓版" not in button_text:
                        button_a.click()
                        time.sleep(5)
                        android_url = browser.current_url  # URL of the Android page
                        android_main["caiji_url"] = android_url
                        android_id = re.search(
                            r"\/andapp\/baseinfo\/appid\/(\d+)",
                            android_url).group(1)  # Android app id
                        android_main["android_id"] = android_id
                        uls = browser.find_elements(
                            "xpath", "//ul[@class='select-list']")
                        lis1 = uls[0].find_elements("xpath", 'li')
                        lis2 = uls[1].find_elements("xpath", 'li')
                        li1 = lis1[1].find_element("xpath", 'a')
                        li3 = lis1[3].find_element("xpath", 'a')
                        li5 = lis2[1].find_element("xpath", 'a')
                        li1.click()
                        time.sleep(5)
                        HTML_twoB1 = _body_html(browser)
                        android_main = parse_html_twoB1(
                            android_main, HTML_twoB1)
                        li3.click()
                        time.sleep(5)
                        try:
                            HTML_twoB3 = _body_html(browser)
                            android_rating = parse_html_twoB3(
                                android_rating, HTML_twoB3, android_id)
                        except Exception:
                            logger.info("解析失败")
                        li5.click()
                        time.sleep(5)
                        HTML_twoB5 = {}
                        try:
                            tab_label, tab_html = _active_rank_tab(browser)
                            HTML_twoB5[tab_label] = tab_html
                            ivu_tabs = browser.find_elements(
                                "xpath", "//div[@class='ivu-tabs-tab']")
                            # Visit every second inactive tab (2nd, 4th, ...).
                            for ivu_tab in ivu_tabs[1::2]:
                                ivu_tab.click()
                                time.sleep(4)
                                tab_label, tab_html = _active_rank_tab(browser)
                                HTML_twoB5[tab_label] = tab_html
                            android_Leaderboardls = parse_html_twoB5(
                                android_Leaderboardls, HTML_twoB5, android_id)
                        except Exception:
                            logger.debug("无排名数据")

                    browser.switch_to.window(browser.window_handles[1])
                    browser.close()  # close the extra window
                    time.sleep(1)
                    browser.switch_to.window(browser.window_handles[0])  # back to the rank page

                    ios_main["ios_id"] = ios_id
                    ios_main["caiji_url"] = ios_url
                    ios_android_ID["ios_id"] = ios_id
                    ios_android_ID["android_id"] = android_id
                    logger.info("解析结束,开始存储数据")
                except Exception:
                    # ``Exception`` rather than a bare ``except`` so that
                    # Ctrl-C / SystemExit can still stop the scraper.
                    logger.info("-----第{}条数据获取失败-----".format(aa))
                    _close_extra_windows(browser)
                finally:
                    # Runs for successful, failed and skipped rows alike.
                    _store_app_records(ios_main, android_main, ios_android_ID,
                                       ios_Leaderboardls,
                                       android_Leaderboardls, android_rating)
    finally:
        logger.info("关闭浏览器")
        browser.quit()