Esempio n. 1
0
 def __init__(self):
     """Set the database connection settings and obtain a logger."""
     # Connection parameters; presumably for a MySQL server (3306 is the
     # MySQL default port) -- TODO confirm against the enclosing class.
     self._host = ''
     self._port = 3306
     # NOTE(review): looks like a masked placeholder rather than a real
     # user name -- confirm where the actual credentials come from.
     self._user = '******'
     self._password = ''
     self._db = 'google_play'
     # Logger handed out by the project's GetSetting helper.
     self.printf = GetSetting().get_logger()
Esempio n. 2
0
 def run(self):
     """Collect candidate proxies, check them and store the usable ones.

     The session and event loop are taken from ``GetSetting``. Every
     candidate produced by ``build_async_tasks`` is passed to
     ``check_proxy``; each truthy check result is handed to ``save_mysql``
     and also collected for the caller.

     Returns:
         list: the truthy results of ``check_proxy``; empty when there was
         nothing to check or nothing passed the check.
     """
     proxies = []
     setting = GetSetting()
     session = setting.get_session()
     loop = setting.get_loop()
     # ``run`` is a plain function, so the coroutine has to be driven through
     # the loop; a bare ``await`` here is a SyntaxError.
     # NOTE(review): assumes build_async_tasks is a coroutine that resolves
     # to finished tasks/futures (their .result() is read right below).
     tasks = loop.run_until_complete(self.build_async_tasks(session))
     check_tasks = [
         asyncio.ensure_future(self.check_proxy(ipdic, session), loop=loop)
         for proxy in tasks
         for ipdic in proxy.result()
     ]
     # asyncio.wait() raises ValueError when given an empty collection.
     if not check_tasks:
         return proxies
     loop.run_until_complete(asyncio.wait(check_tasks))
     mysql_tasks = []
     for checked in check_tasks:
         result = checked.result()
         if result:
             mysql_tasks.append(
                 asyncio.ensure_future(self.save_mysql(result), loop=loop))
             proxies.append(result)
     # Same empty-collection guard for the persistence step.
     if mysql_tasks:
         loop.run_until_complete(asyncio.wait(mysql_tasks))
     return proxies
Esempio n. 3
0
    def __init__(self):
        """Create the crawler state: result sets, shared helpers and seed URLs."""
        # Accumulators filled while crawling.
        self.apk_names = set()
        self.son_category_url = set()
        # Shared session, event loop, redis handle and logger all come from
        # the project's GetSetting helper (a new GetSetting per lookup).
        self.session = GetSetting().get_session()
        self.crawl_proxy = crawl_fn()
        self.loop = GetSetting().get_loop()
        self.lock = asyncio.Lock()
        self.rcon = GetSetting().get_redis()
        self._Request = InitiateRequest()
        self.printf = GetSetting().get_logger()
        # Starts empty; presumably filled with proxies elsewhere in the
        # class -- TODO confirm.
        self.proxies = []
        self.get_redis = RedisOption()
        self.host = "https://play.google.com/"

        # Google Play collection (chart) pages used as crawl entry points.
        self.urls = {
            'url_newgame_free': 'https://play.google.com/store/apps/category/GAME/collection/topselling_new_free?authuser=0',
            # new top free games (POST url)
            'url_newcolletion_free': 'https://play.google.com/store/apps/collection/topselling_new_free?authuser=0',
            # new top free apps
            "url_app_topsell": "https://play.google.com/store/apps/collection/topselling_free",
            'url_paid_topcol': 'https://play.google.com/store/apps/collection/topselling_paid?authuser=0',
            'url_topgross_col': 'https://play.google.com/store/apps/collection/topgrossing?authuser=0',
            'url_topgame_free': 'https://play.google.com/store/apps/category/GAME/collection/topselling_free?authuser=0',
            'url_topgross_game': 'https://play.google.com/store/apps/category/GAME/collection/topgrossing?authuser=0',
            "url_new_app": "https://play.google.com/store/apps/collection/topselling_new_free",
            "url_new_game": "https://play.google.com/store/apps/category/GAME/collection/topselling_new_free",
        }
 def __init__(self):
     """Wire up settings, helpers and the database pool for the checker."""
     self.setting = GetSetting()
     self.loop = self.setting.get_loop()
     self.session = self.setting.get_session()
     self.lock = asyncio.Lock()
     self.crawl_proxy = crawl_fn()
     self.parsing = ParsingData()
     self.get_pool = GetMysqlPool()
     # Block here until init_pool() has finished, i.e. the pool is set up
     # synchronously during construction.
     self.loop.run_until_complete(
         asyncio.ensure_future(self.get_pool.init_pool()))
     self.get_redis = RedisOption()
     self._Request = InitiateRequest()
     self.apknames = set()
     self.proxies = []
     self.all_data_list = []
     self.printf = self.setting.get_logger()
     # Maps a short locale key to the query-string suffix that selects the
     # store language (hl) and region (gl). The 'us' entry is disabled.
     self.country_dict = {
         # 'us': '&hl=en&gl=us',
         'zh': '&hl=zh&gl=us',
         'zhtw': '&hl=zh_TW&gl=us',
         'ko': '&hl=ko&gl=us',
         'ar': '&hl=ar&gl=us',
         'jp': '&hl=ja&gl=us',
     }
Esempio n. 5
0
class GetMysqlPool:
    """Small wrapper around an ``aiomysql`` connection pool for app records.

    ``init_pool`` has to be awaited once before ``insert_mysql_`` or
    ``find_pkgname`` is used.
    """

    def __init__(self):
        """Store the connection settings and obtain a logger."""
        self._host = ''
        self._port = 3306
        self._user = '******'
        self._password = ''
        self._db = 'google_play'
        # Created by init_pool(); declared here so the attribute always exists.
        self.pool = None
        self.printf = GetSetting().get_logger()

    async def init_pool(self):
        """Create the connection pool (utf8, autocommit enabled)."""
        self.pool = await aiomysql.create_pool(host=self._host,
                                               port=self._port,
                                               user=self._user,
                                               password=self._password,
                                               db=self._db,
                                               charset='utf8',
                                               autocommit=True)

    async def insert_mysql_(self, data):
        """Insert one app record, updating the row if the key already exists.

        Args:
            data: dict with the keys read below (``country``, ``size``,
                ``category``, ``content_rating``, ``app_version``,
                ``description``, ``provider``, ``what_news``,
                ``developer_url``, ``installs``, ``is_busy``,
                ``update_time``, ``min_os_version``, ``name``, ``pkgname``,
                ``url``).

        Errors raised while building the parameters or executing the
        statement are logged and swallowed; nothing is returned.
        """
        async with self.pool.acquire() as conn:
            async with conn.cursor() as cur:
                # "us" rows live in the base table, every other country in a
                # table suffixed with the country key.
                # NOTE(review): the table name cannot be a bound parameter, so
                # data["country"] must only ever hold trusted keys.
                if data["country"] == "us":
                    to_mysql = "crawl_google_play_app_info"
                else:
                    to_mysql = "crawl_google_play_app_info_" + data["country"]
                sql_google = """

                                        insert into {} (language,appsize,category,contentrating,current_version,description,developer,whatsnew,developer_url,instalations,isbusy,last_updatedate,minimum_os_version,name,pkgname,url) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                                        ON DUPLICATE KEY UPDATE appsize=VALUES(appsize),category=VALUES(category),contentrating=VALUES(contentrating),current_version=VALUES(current_version),
                                        description=VALUES(description),developer=VALUES(developer),whatsnew=VALUES(whatsnew),
                                        instalations=VALUES(instalations),last_updatedate=VALUES(last_updatedate),minimum_os_version=VALUES(minimum_os_version),name=VALUES(name)
        """.format(to_mysql)
                try:
                    params = (data["country"], data["size"], data["category"],
                              data["content_rating"], data["app_version"],
                              data["description"], data["provider"],
                              data["what_news"], data["developer_url"],
                              data["installs"], data["is_busy"],
                              data["update_time"], data["min_os_version"],
                              data["name"], data["pkgname"], data["url"])
                    await cur.execute(sql_google, params)
                except Exception as e:
                    self.printf.info("数据库语句:" + sql_google)
                    self.printf.info("错误时候的数据" + str(data))
                    self.printf.info('数据库错误信息:' + str(e))

    async def find_pkgname(self, pkgname):
        """Return the stored ``current_version`` for ``pkgname``.

        Args:
            pkgname: package identifier to look up in the base (US) table.

        Returns:
            The first column of the matching row, or ``None`` when no row
            matches or the query fails (failures are logged).
        """
        # The package name is passed as a bound parameter rather than being
        # formatted into the statement: names come from scraped pages, so a
        # quote in one must not be able to alter the query.
        sql = """
                        select current_version from crawl_google_play_app_info as f where  f.pkgname = %s
                    """
        async with self.pool.acquire() as conn:
            async with conn.cursor() as cur:
                try:
                    await cur.execute(sql, (pkgname,))
                    result = await cur.fetchone()
                    if result:
                        return result[0]
                    return None
                except Exception as e:
                    self.printf.info("数据库语句:" + sql)
                    self.printf.info("错误时候的包名" + str(pkgname))
                    self.printf.info('数据库错误信息:' + str(e))
                    return None
class CheckUpdateApkname:
    """Check queued packages for new versions and store the refreshed data.

    Packages are read from redis, compared against the version stored for
    the US listing, and - when the version changed - re-fetched for every
    locale in ``country_dict`` before being written to MySQL.
    """

    def __init__(self):
        """Wire up settings, helpers and the database pool."""
        self.setting = GetSetting()
        self.loop = self.setting.get_loop()
        self.session = self.setting.get_session()
        self.lock = asyncio.Lock()
        self.crawl_proxy = crawl_fn()
        self.parsing = ParsingData()
        self.get_pool = GetMysqlPool()
        # Block until the pool is ready so later tasks can use it directly.
        self.loop.run_until_complete(
            asyncio.ensure_future(self.get_pool.init_pool()))
        self.get_redis = RedisOption()
        self._Request = InitiateRequest()
        self.apknames = set()
        self.proxies = []
        self.all_data_list = []
        self.printf = self.setting.get_logger()
        # Locale key -> query-string suffix selecting store language/region.
        self.country_dict = {
            # 'us': '&hl=en&gl=us',
            'zh': '&hl=zh&gl=us',
            'zhtw': '&hl=zh_TW&gl=us',
            'ko': '&hl=ko&gl=us',
            'ar': '&hl=ar&gl=us',
            'jp': '&hl=ja&gl=us',
        }

    async def _get_proxy(self):
        """Return a random proxy, refreshing the pool when it runs low.

        The pool is refreshed when fewer than three proxies are left and the
        refresh is repeated until at least one proxy is available.
        """
        async with self.lock:
            if not self.proxies or len(self.proxies) < 3:
                self.proxies = await self.crawl_proxy.run(self.session)
            # Retry inside the lock with a loop. asyncio.Lock is not
            # reentrant, so calling this method again while holding the lock
            # would wait on itself forever.
            while not self.proxies:
                self.proxies = await self.crawl_proxy.run(self.session)
            return choice(self.proxies)

    async def check_app_version(self, data, time=3, proxy=None):
        """Check whether the US listing of a package has a new version.

        Args:
            data: dict holding at least ``pkgname``.
            time: unused; kept for interface compatibility.
            proxy: proxy to use; one is picked when ``None``.

        Returns:
            tuple: ``(data_return, analysis_data)``. ``data_return`` carries
            ``pkgname`` and ``is_update`` (0 = no update, 1 = updated,
            2 = all three attempts failed). ``analysis_data`` is the parsed
            listing, or ``None`` when nothing could be parsed.
        """
        now_pkgname = data["pkgname"]
        now_app_version = await self.get_pool.find_pkgname(now_pkgname)
        apk_url = "https://play.google.com/store/apps/details?id=" + now_pkgname
        for _ in range(3):
            if proxy is None:
                proxy = await self._get_proxy()
            try:
                datas = await self._Request.get_request(
                    self.session, apk_url, proxy)
                if not datas:
                    self.printf.info("data is none")
                    continue
                analysis_data = self.parsing.analysis_country_data(datas)
                # Nothing parsable: report "no update" without listing data.
                if analysis_data is None:
                    data_return = {}
                    data_return["pkgname"] = now_pkgname
                    data_return["is_update"] = 0
                    return data_return, None
                analysis_data["country"] = "us"
                analysis_data["pkgname"] = now_pkgname
                analysis_data["url"] = apk_url
                check_app_version = analysis_data["app_version"]
                change_time = self.parsing.change_time(
                    'us', analysis_data["update_time"])
                if change_time is not None:
                    analysis_data["update_time"] = change_time
                # No update when a version is stored and the fetched version
                # is either identical to it or missing.
                unchanged = now_app_version is not None and (
                    check_app_version == now_app_version
                    or check_app_version is None)
                data_return = {}
                if unchanged:
                    data_return["app_version"] = now_app_version
                    data_return["pkgname"] = now_pkgname
                    data_return["is_update"] = 0
                else:
                    data_return["app_version"] = check_app_version
                    data_return["pkgname"] = now_pkgname
                    data_return["is_update"] = 1
                return data_return, analysis_data
            except Exception as e:
                if str(e) == "":
                    self.printf.info("错误数据" + str(data))
                self.printf.info(str(e))
        # Three failed attempts: flag the package so it is queued again.
        self.printf.info('失败三次重新放入redis')
        data_return = {}
        data_return["pkgname"] = now_pkgname
        data_return["is_update"] = 2
        return data_return, None

    async def check_other_coutry(self, data, time=3, proxy=None):
        """Fetch the listing of a package for every configured locale.

        Parsed listings are appended to ``self.all_data_list``. Each locale
        gets up to three attempts; when all three fail the method stops and
        the remaining locales are skipped.

        Args:
            data: dict holding at least ``pkgname``.
            time: unused; kept for interface compatibility.
            proxy: proxy to use; one is picked when ``None``.
        """
        for country in self.country_dict:
            pkgname = data["pkgname"]
            apk_url = ("https://play.google.com/store/apps/details?id="
                       + pkgname + self.country_dict[country])
            if proxy is None:
                proxy = await self._get_proxy()
            for _ in range(3):
                try:
                    datas = await self._Request.get_request(
                        self.session, apk_url, proxy)
                    if datas:
                        check_app_data = self.parsing.analysis_country_data(
                            datas)
                        if check_app_data is None:
                            break
                        check_app_data["pkgname"] = pkgname
                        check_app_data["country"] = country
                        check_app_data["url"] = apk_url
                        change_time = self.parsing.change_time(
                            country, check_app_data["update_time"])
                        if change_time is not None:
                            check_app_data["update_time"] = change_time
                        self.all_data_list.append(check_app_data)
                        break
                except Exception as e:
                    if str(e) == "":
                        self.printf.info("错误数据" + str(data))
                    self.printf.info(str(e))
            else:
                # No attempt succeeded for this locale: give up entirely.
                return None

    def _get_pkgdata_redis(self, start):
        """Read up to 100 package entries from redis.

        Reading stops early once more than 20 seconds have passed since
        ``start`` (a ``time.time()`` timestamp).
        """
        # NOTE(review): whatever get_redis_pkgname() returns is appended as
        # is - confirm it never yields None for an empty queue.
        pkg_datas = []
        for _ in range(100):
            if (time.time() - start) > 20:
                return pkg_datas
            pkg_datas.append(self.get_redis.get_redis_pkgname())
        return pkg_datas

    def _build_check_tasks(self, results):
        """Schedule one ``check_app_version`` task per package entry.

        :param results: package entries read from redis
        :return: list of scheduled tasks
        """
        return [
            asyncio.ensure_future(self.check_app_version(result))
            for result in results
        ]

    def _task_ensure_future(self, func, data, tasks):
        """Schedule ``func(data)`` and append the task to ``tasks``."""
        tasks.append(asyncio.ensure_future(func(data)))

    def _build_other_insert(self, check_results):
        """Turn the US check results into follow-up work.

        Packages whose check failed (``is_update == 2``) are pushed back to
        redis, parsed US listings are scheduled for insertion, and updated
        packages (``is_update == 1``) are scheduled for the other locales.

        :param check_results: ``(data_return, analysis_data)`` tuples
        :return: (tasks inserting into MySQL, tasks checking other locales)
        """
        save_mysql_tasks = []
        check_other_tasks = []
        for check_result in check_results:
            try:
                data_return, analysis_data = check_result
                if data_return is not None and data_return["is_update"] == 2:
                    self.get_redis.update_pkgname_redis(data_return)
                if analysis_data is not None:
                    self._task_ensure_future(self.get_pool.insert_mysql_,
                                             analysis_data, save_mysql_tasks)
                if data_return is not None and data_return["is_update"] == 1:
                    self._task_ensure_future(self.check_other_coutry,
                                             data_return, check_other_tasks)
            except Exception as e:
                self.printf.info('错误信息:' + str(e))
        return save_mysql_tasks, check_other_tasks

    def run(self):
        """Run the update loop forever.

        Each round: read packages from redis -> check the US listing for a
        new version -> update redis -> fetch the other locales of updated
        packages -> store everything in the database.
        """
        while True:
            start = time.time()
            pkg_datas = self._get_pkgdata_redis(start)
            check_tasks = self._build_check_tasks(pkg_datas)
            if not check_tasks:
                continue
            check_results = self.loop.run_until_complete(
                asyncio.gather(*check_tasks))
            save_mysql_tasks, check_other_tasks = self._build_other_insert(
                check_results)
            if check_other_tasks:
                self.loop.run_until_complete(asyncio.wait(check_other_tasks))
                for result_list in self.all_data_list:
                    if result_list is not None:
                        # asyncio.wait() needs tasks/futures; bare coroutine
                        # objects are rejected on Python 3.11+.
                        self._task_ensure_future(self.get_pool.insert_mysql_,
                                                 result_list,
                                                 save_mysql_tasks)
                self.all_data_list = []
            if save_mysql_tasks:
                self.loop.run_until_complete(asyncio.wait(save_mysql_tasks))
Esempio n. 7
0
class CrawlApkName:
    """Collect package names from Google Play listing pages into redis."""

    # XPath selecting the package id of every app card on a listing page.
    _APKNAME_XPATH = (
        "//div[@class='card no-rationale square-cover apps small']"
        "//span[@class='preview-overlay-container']/@data-docid")

    def __init__(self):
        """Create the crawler state: result sets, shared helpers, seed URLs."""
        self.apk_names = set()
        self.son_category_url = set()
        self.session = GetSetting().get_session()
        self.crawl_proxy = crawl_fn()
        self.loop = GetSetting().get_loop()
        self.lock = asyncio.Lock()
        self.rcon = GetSetting().get_redis()
        self._Request = InitiateRequest()
        self.printf = GetSetting().get_logger()
        self.proxies = []
        self.get_redis = RedisOption()
        self.host = "https://play.google.com/"

        # Collection (chart) pages used as crawl entry points.
        self.urls = {
            'url_newgame_free': 'https://play.google.com/store/apps/category/GAME/collection/topselling_new_free?authuser=0',
            # new top free games (POST url)
            'url_newcolletion_free': 'https://play.google.com/store/apps/collection/topselling_new_free?authuser=0',
            # new top free apps
            "url_app_topsell": "https://play.google.com/store/apps/collection/topselling_free",
            'url_paid_topcol': 'https://play.google.com/store/apps/collection/topselling_paid?authuser=0',
            'url_topgross_col': 'https://play.google.com/store/apps/collection/topgrossing?authuser=0',
            'url_topgame_free': 'https://play.google.com/store/apps/category/GAME/collection/topselling_free?authuser=0',
            'url_topgross_game': 'https://play.google.com/store/apps/category/GAME/collection/topgrossing?authuser=0',
            "url_new_app": "https://play.google.com/store/apps/collection/topselling_new_free",
            "url_new_game": "https://play.google.com/store/apps/category/GAME/collection/topselling_new_free",
        }

    def get_apknames_tasks(self):
        """Schedule the POST requests that page through every seed URL.

        For each URL one initial request is scheduled, followed by 19 paged
        requests with ``start`` = 60, 120, ... and 60 entries per page.

        Returns:
            list: the scheduled tasks.
        """
        tasks = []
        for url in self.urls.values():
            post_data_first = {
                'ipf': '1',
                'xhr': '1'
            }
            tasks.append(asyncio.ensure_future(
                self.fetch_post_apkname(url, post_data_first)))
            for i in range(1, 20):
                post_data = {
                    'start': i * 60,
                    'num': '60',
                    'numChildren': '0',
                    'cctcss': 'square-cover',
                    'cllayout': 'NORMAL',
                    'ipf': '1',
                    'xhr': '1',
                }
                tasks.append(asyncio.ensure_future(
                    self.fetch_post_apkname(url, post_data)))
        return tasks

    def build_async_tasks(self, urls):
        """Schedule one ``get_web_data`` task per URL and return the tasks."""
        return [asyncio.ensure_future(self.get_web_data(url)) for url in urls]

    def _collect_apknames(self, html):
        """Parse a listing page and add its package ids to ``apk_names``."""
        analysis_data = etree.HTML(html)
        self.apk_names.update(analysis_data.xpath(self._APKNAME_XPATH))

    def _discard_proxy(self, proxy):
        """Drop a failing proxy; it may already have been removed."""
        try:
            self.proxies.remove(proxy)
        except ValueError:
            pass

    async def fetch_post_apkname(self, url, data):
        """POST ``data`` to ``url`` and record the package ids of the reply.

        Up to three attempts are made, each with a freshly picked proxy; a
        proxy that fails is removed from the pool.
        """
        for _ in range(3):
            proxy = await self.get_proxy()
            try:
                datas = await self._Request.post_request(
                    self.session, url, proxy, data)
                self._collect_apknames(datas)
                break
            except Exception as e:
                self.printf.info(str(e))
                self._discard_proxy(proxy)

    async def fetch_get_apkname(self, url):
        """GET ``url`` and record the package ids found on the page.

        Up to three attempts are made, each with a freshly picked proxy; a
        proxy that fails is removed from the pool.
        """
        for _ in range(3):
            proxy = await self.get_proxy()
            try:
                data = await self._Request.get_request(
                    self.session, url, proxy)
                self._collect_apknames(data)
                break
            except Exception as e:
                self.printf.info(str(e))
                self._discard_proxy(proxy)

    async def get_category_url(self, data):
        """Extract the sub-category URLs of interest from a category page.

        Args:
            data: HTML of a category page.

        Returns:
            set: absolute URLs from the submenu whose address contains one
            of the selected category names.
        """
        wanted = ("GAME", "SOCIAL", "SPORTS", "SHOPPING",
                  "HEALTH_AND_FITNESS", "COMICS")
        analysis_data = etree.HTML(data)
        hrefs = analysis_data.xpath("//div[@class='dropdown-submenu']//a/@href")
        return {
            self.host + href
            for href in hrefs
            if any(name in self.host + href for name in wanted)
        }

    async def get_proxy(self):
        """Return a random proxy, refreshing the pool when it runs low.

        The pool is refreshed when fewer than three proxies are left and the
        refresh is repeated until at least one proxy is available.
        """
        async with self.lock:
            if not self.proxies or len(self.proxies) < 3:
                self.proxies = await self.crawl_proxy.run(self.session)
            # Retry inside the lock with a loop. asyncio.Lock is not
            # reentrant, so calling this method again while holding the lock
            # would wait on itself forever.
            while not self.proxies:
                self.proxies = await self.crawl_proxy.run(self.session)
            return choice(self.proxies)

    async def get_web_data(self, url):
        """Download ``url`` through a proxy.

        Returns:
            The response body, or ``None`` when the request failed (the
            proxy is then removed from the pool).
        """
        proxy = await self.get_proxy()
        try:
            return await self._Request.get_request(self.session, url, proxy)
        except Exception as e:
            self.printf.info(str(e))
            self._discard_proxy(proxy)
            return None

    async def get_main_url(self):
        """Fetch the store front page and return its top-level category links.

        Returns:
            list of hrefs, or ``None`` when the request or parsing failed
            (the proxy is then removed from the pool).
        """
        proxy = await self.get_proxy()
        url = "https://play.google.com/store/apps"
        try:
            data = await self._Request.get_request(self.session, url, proxy)
            analysis_data = etree.HTML(data)
            return analysis_data.xpath("//div[@class='g4kCYe']/a/@href")
        except Exception as e:
            self.printf.info(str(e))
            self._discard_proxy(proxy)
            return None

    def run(self):
        """Crawl package names and push every one of them to redis."""
        self.apk_names.clear()
        # Package names from the top-level chart pages.
        tasks = self.get_apknames_tasks()
        self.loop.run_until_complete(asyncio.wait(tasks))
        # Top-level category URLs (eight kinds); None means the fetch failed.
        urls = self.loop.run_until_complete(self.get_main_url()) or []
        tasks = self.build_async_tasks(urls)
        results = self.loop.run_until_complete(asyncio.gather(*tasks))
        # Sub-category URLs are read from the first category page only; skip
        # the step when that page could not be downloaded.
        first_page = results[0] if results else None
        if first_page:
            allurls = self.loop.run_until_complete(
                asyncio.ensure_future(self.get_category_url(first_page)))
            self.son_category_url.update(allurls)
        get_apkname_tasks = [
            asyncio.ensure_future(self.fetch_get_apkname(url))
            for url in self.son_category_url
        ]
        self.loop.run_until_complete(asyncio.gather(*get_apkname_tasks))
        for apkname in self.apk_names:
            self.get_redis.save_pkgname_redis(apkname)