def __init__(self):
    """Store the hard-coded database connection settings and grab a logger.

    Nothing is connected here; the attributes are only recorded on the
    instance for later use.
    """
    # Server endpoint (host is left blank in this file).
    self._host, self._port = '', 3306
    # Credentials (password is left blank in this file).
    self._user, self._password = '******', ''
    # Name of the database to select.
    self._db = 'google_play'
    # Logger obtained from the project settings object.
    logger = GetSetting().get_logger()
    self.printf = logger
def run(self):
    """Gather candidate proxies, check each one, save the good ones, return them.

    Steps, as written:
      1. Obtain the session and event loop from ``GetSetting``.
      2. Get a collection of finished task-like objects from
         ``build_async_tasks(session)``; each ``.result()`` is iterated to
         yield the items handed to ``check_proxy``.
      3. Run all ``check_proxy`` tasks to completion on the loop.
      4. For every check whose result is truthy, schedule ``save_mysql`` with
         that result and add the result to the returned list.

    Returns:
        list: the truthy ``check_proxy`` results.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost -- confirm against the real file.
    """
    proxies = []
    setting = GetSetting()
    session = setting.get_session()
    loop = setting.get_loop()
    # NOTE(review): `await` inside a plain `def` is a SyntaxError. Other code
    # in this file calls `await self.crawl_proxy.run(self.session)`, which
    # suggests this method was meant to be `async def run(self, session)` --
    # but then the `loop.run_until_complete` calls below could not be used
    # from inside the running loop. Needs a decision by the owner.
    tasks = await self.build_async_tasks(session)
    check_tasks = []
    for proxy in tasks:
        # Each element is expected to expose `.result()` returning an iterable.
        for ipdic in proxy.result():
            task = asyncio.ensure_future(self.check_proxy(ipdic,session))
            check_tasks.append(task)
    # NOTE(review): asyncio.wait() raises ValueError on an empty collection,
    # and this call happens before the `if check_tasks:` guard below.
    loop.run_until_complete(asyncio.wait(check_tasks))
    if check_tasks:
        mysql_tasks = []
        for proxy in check_tasks:
            # Only truthy check results are persisted and returned.
            if proxy.result():
                task = asyncio.ensure_future(self.save_mysql(proxy.result()))
                mysql_tasks.append(task)
                proxies.append(proxy.result())
        # NOTE(review): same empty-collection hazard if no check succeeded.
        loop.run_until_complete(asyncio.wait(mysql_tasks))
    return proxies
def __init__(self):
    """Initialise crawler state, shared services and the chart URL table.

    Plain containers and constants are assigned first. The service objects
    follow in their original relative order, because the lock is created
    right after the event loop has been fetched from the settings.
    """
    # --- plain state -----------------------------------------------------
    self.apk_names = set()
    self.son_category_url = set()
    self.proxies = []
    self.host = "https://play.google.com/"
    # Chart endpoints, keyed by a short label.
    self.urls = {
        # new top free games (POST URL)
        'url_newgame_free': 'https://play.google.com/store/apps/category/GAME/collection/topselling_new_free?authuser=0',
        # new top free apps
        'url_newcolletion_free': 'https://play.google.com/store/apps/collection/topselling_new_free?authuser=0',
        "url_app_topsell": "https://play.google.com/store/apps/collection/topselling_free",
        'url_paid_topcol': 'https://play.google.com/store/apps/collection/topselling_paid?authuser=0',
        'url_topgross_col': 'https://play.google.com/store/apps/collection/topgrossing?authuser=0',
        'url_topgame_free': 'https://play.google.com/store/apps/category/GAME/collection/topselling_free?authuser=0',
        'url_topgross_game': 'https://play.google.com/store/apps/category/GAME/collection/topgrossing?authuser=0',
        "url_new_app": "https://play.google.com/store/apps/collection/topselling_new_free",
        "url_new_game": "https://play.google.com/store/apps/category/GAME/collection/topselling_new_free",
    }

    # --- services (order preserved) --------------------------------------
    http_session = GetSetting().get_session()
    self.session = http_session
    self.crawl_proxy = crawl_fn()
    event_loop = GetSetting().get_loop()
    self.loop = event_loop
    self.lock = asyncio.Lock()
    self.rcon = GetSetting().get_redis()
    self._Request = InitiateRequest()
    self.printf = GetSetting().get_logger()
    self.get_redis = RedisOption()
def __init__(self):
    """Prepare checker state, open the MySQL pool and wire up helpers.

    The constructor blocks on the event loop until the database pool has
    been initialised, so the instance is ready to query once it returns.
    """
    # --- plain state -----------------------------------------------------
    self.apknames = set()
    self.proxies = []
    self.all_data_list = []
    # Query-string suffix per locale. There is no 'us' entry: the original
    # keeps that locale ('&hl=en&gl=us') commented out.
    self.country_dict = {
        'zh': '&hl=zh&gl=us',
        'zhtw': '&hl=zh_TW&gl=us',
        'ko': '&hl=ko&gl=us',
        'ar': '&hl=ar&gl=us',
        'jp': '&hl=ja&gl=us',
    }

    # --- services (order preserved) --------------------------------------
    settings = GetSetting()
    self.setting = settings
    self.loop = settings.get_loop()
    self.session = settings.get_session()
    self.lock = asyncio.Lock()
    self.crawl_proxy = crawl_fn()
    self.parsing = ParsingData()
    self.get_pool = GetMysqlPool()
    # Drive the pool initialisation to completion before continuing.
    pool_ready = asyncio.ensure_future(self.get_pool.init_pool())
    self.loop.run_until_complete(pool_ready)
    self.get_redis = RedisOption()
    self._Request = InitiateRequest()
    self.printf = settings.get_logger()
class GetMysqlPool:
    """Async helper around an aiomysql connection pool for ``google_play``.

    ``init_pool`` must be awaited before any query method is called; until
    then ``self.pool`` does not exist.
    """

    def __init__(self):
        # Connection settings; host and password are blank in this file.
        self._host = ''
        self._port = 3306
        self._user = '******'
        self._password = ''
        self._db = 'google_play'
        self.printf = GetSetting().get_logger()

    async def init_pool(self):
        """Create the aiomysql pool and store it on ``self.pool``."""
        self.pool = await aiomysql.create_pool(
            host=self._host,
            port=self._port,
            user=self._user,
            password=self._password,
            db=self._db,
            charset='utf8',
            autocommit=True,
        )

    async def insert_mysql_(self, data):
        """Insert one app record, or update it if the key already exists.

        The target table is ``crawl_google_play_app_info`` for country
        ``"us"`` and ``crawl_google_play_app_info_<country>`` otherwise.

        Args:
            data: mapping with the keys read below (``country``, ``size``,
                ``category``, ``content_rating``, ``app_version``,
                ``description``, ``provider``, ``what_news``,
                ``developer_url``, ``installs``, ``is_busy``,
                ``update_time``, ``min_os_version``, ``name``, ``pkgname``,
                ``url``).

        Errors while building the parameters or executing the statement are
        logged and swallowed; nothing is returned.
        """
        async with self.pool.acquire() as conn:
            async with conn.cursor() as cur:
                if data["country"] == "us":
                    to_mysql = "crawl_google_play_app_info"
                else:
                    to_mysql = "crawl_google_play_app_info_" + data["country"]
                # A table name cannot be sent as a bound parameter, so it is
                # formatted in. NOTE(review): the callers visible in this file
                # only pass "us" or a key of CheckUpdateApkname.country_dict;
                # keep it that way or validate the value here.
                sql_google = """
                    insert into {}
                    (language,appsize,category,contentrating,current_version,description,developer,whatsnew,developer_url,instalations,isbusy,last_updatedate,minimum_os_version,name,pkgname,url)
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                    ON DUPLICATE KEY UPDATE
                    appsize=VALUES(appsize),category=VALUES(category),contentrating=VALUES(contentrating),current_version=VALUES(current_version),
                    description=VALUES(description),developer=VALUES(developer),whatsnew=VALUES(whatsnew),
                    instalations=VALUES(instalations),last_updatedate=VALUES(last_updatedate),minimum_os_version=VALUES(minimum_os_version),name=VALUES(name)
                """.format(to_mysql)
                try:
                    params = (
                        data["country"], data["size"], data["category"],
                        data["content_rating"], data["app_version"],
                        data["description"], data["provider"],
                        data["what_news"], data["developer_url"],
                        data["installs"], data["is_busy"],
                        data["update_time"], data["min_os_version"],
                        data["name"], data["pkgname"], data["url"],
                    )
                    await cur.execute(sql_google, params)
                except Exception as e:
                    self.printf.info("数据库语句:" + sql_google)
                    self.printf.info("错误时候的数据" + str(data))
                    self.printf.info('数据库错误信息:' + str(e))

    async def find_pkgname(self, pkgname):
        """Return the stored ``current_version`` for *pkgname* (US table).

        Args:
            pkgname: package identifier to look up.

        Returns:
            The first column of the matching row, or ``None`` when no row
            matches or the query fails (failures are logged).
        """
        # The package name originates from crawled pages, so it is sent as a
        # bound parameter instead of being formatted into the SQL text.
        sql = """
            select current_version
            from crawl_google_play_app_info as f
            where f.pkgname = %s
        """
        async with self.pool.acquire() as conn:
            async with conn.cursor() as cur:
                try:
                    await cur.execute(sql, (pkgname,))
                    result = await cur.fetchone()
                    return result[0] if result else None
                except Exception as e:
                    self.printf.info("数据库语句:" + sql)
                    self.printf.info("错误时候的包名" + str(pkgname))
                    self.printf.info('数据库错误信息:' + str(e))
                    return None
class CheckUpdateApkname:
    """Detect Play Store version changes for packages queued in redis.

    Pipeline (see ``run``): pull package records from redis, compare the US
    store page version with the one stored in MySQL, push failures back to
    redis, fetch the other locales for packages that changed, and write
    everything to MySQL.
    """

    def __init__(self):
        self.setting = GetSetting()
        self.loop = self.setting.get_loop()
        self.session = self.setting.get_session()
        self.lock = asyncio.Lock()
        self.crawl_proxy = crawl_fn()
        self.parsing = ParsingData()
        self.get_pool = GetMysqlPool()
        # The pool must exist before any check task touches the database.
        self.loop.run_until_complete(
            asyncio.ensure_future(self.get_pool.init_pool()))
        self.get_redis = RedisOption()
        self._Request = InitiateRequest()
        self.apknames = set()
        self.proxies = []
        # Locale records collected by check_other_coutry, flushed in run().
        self.all_data_list = []
        self.printf = self.setting.get_logger()
        # Query-string suffix per extra locale. 'us' is not listed because
        # check_app_version fetches the US page itself.
        self.country_dict = {
            'zh': '&hl=zh&gl=us',
            'zhtw': '&hl=zh_TW&gl=us',
            'ko': '&hl=ko&gl=us',
            'ar': '&hl=ar&gl=us',
            'jp': '&hl=ja&gl=us',
        }

    async def _get_proxy(self):
        """Return a random proxy, refilling the pool when it runs low.

        The pool is refreshed whenever fewer than three proxies remain. The
        retry is a loop *inside* the lock: ``asyncio.Lock`` is not reentrant,
        so re-entering this coroutine while holding it would never complete.
        """
        async with self.lock:
            while True:
                if len(self.proxies) < 3:
                    # Treat a None result like an empty list and retry.
                    self.proxies = (
                        await self.crawl_proxy.run(self.session)) or []
                if self.proxies:
                    return choice(self.proxies)

    async def check_app_version(self, data, time=3, proxy=None):
        """Check whether the US store version of a package has changed.

        Args:
            data: mapping with at least ``"pkgname"``.
            time: unused; kept for interface compatibility. (It shadows the
                ``time`` module inside this method only.)
            proxy: proxy to use; one is drawn from the pool when ``None``.

        Returns:
            tuple: ``(data_return, analysis_data)``. ``data_return`` always
            has ``pkgname`` and ``is_update``: ``0`` = unchanged or page not
            parseable, ``1`` = new version, ``2`` = all three attempts
            failed. ``analysis_data`` is the parsed US record or ``None``.
        """
        now_pkgname = data["pkgname"]
        now_app_version = await self.get_pool.find_pkgname(now_pkgname)
        apk_url = "https://play.google.com/store/apps/details?id=" + now_pkgname
        for _ in range(3):
            if proxy is None:
                proxy = await self._get_proxy()
            try:
                datas = await self._Request.get_request(
                    self.session, apk_url, proxy)
                if not datas:
                    self.printf.info("data is none")
                    continue
                analysis_data = self.parsing.analysis_country_data(datas)
                # Nothing parseable on the page: report "no update".
                if analysis_data is None:
                    return {"pkgname": now_pkgname, "is_update": 0}, None
                analysis_data["country"] = "us"
                analysis_data["pkgname"] = now_pkgname
                analysis_data["url"] = apk_url
                check_app_version = analysis_data["app_version"]
                change_time = self.parsing.change_time(
                    'us', analysis_data["update_time"])
                if change_time is not None:
                    analysis_data["update_time"] = change_time
                # No update when a stored version exists and the fetched one
                # is either identical or missing.
                unchanged = now_app_version is not None and (
                    check_app_version == now_app_version
                    or check_app_version is None)
                if unchanged:
                    data_return = {
                        "app_version": now_app_version,
                        "pkgname": now_pkgname,
                        "is_update": 0,
                    }
                else:
                    data_return = {
                        "app_version": check_app_version,
                        "pkgname": now_pkgname,
                        "is_update": 1,
                    }
                return data_return, analysis_data
            except Exception as e:
                if str(e) == "":
                    self.printf.info("错误数据" + str(data))
                self.printf.info(str(e))
        # Three failed attempts: flag the package so it is re-queued in redis.
        self.printf.info('失败三次重新放入redis')
        return {"pkgname": now_pkgname, "is_update": 2}, None

    async def check_other_coutry(self, data, time=3, proxy=None):
        """Fetch the non-US locale pages of a package.

        Each successfully parsed record is appended to ``self.all_data_list``.
        If a locale fails three times in a row, the remaining locales are
        skipped.

        Args:
            data: mapping with at least ``"pkgname"``.
            time: unused; kept for interface compatibility.
            proxy: proxy to use; one is drawn from the pool when ``None``.

        Returns:
            None
        """
        pkgname = data["pkgname"]
        for country, suffix in self.country_dict.items():
            apk_url = ("https://play.google.com/store/apps/details?id="
                       + pkgname + suffix)
            if proxy is None:
                proxy = await self._get_proxy()
            for _ in range(3):
                try:
                    datas = await self._Request.get_request(
                        self.session, apk_url, proxy)
                    if datas:
                        check_app_data = self.parsing.analysis_country_data(
                            datas)
                        if check_app_data is None:
                            # Page not parseable: move on to the next locale.
                            break
                        check_app_data["pkgname"] = pkgname
                        check_app_data["country"] = country
                        check_app_data["url"] = apk_url
                        change_time = self.parsing.change_time(
                            country, check_app_data["update_time"])
                        if change_time is not None:
                            check_app_data["update_time"] = change_time
                        self.all_data_list.append(check_app_data)
                        break
                except Exception as e:
                    if str(e) == "":
                        self.printf.info("错误数据" + str(data))
                    self.printf.info(str(e))
            else:
                # All three attempts failed for this locale: give up.
                return None

    def _get_pkgdata_redis(self, start):
        """Pull up to 100 package records from redis.

        Stops early once more than 20 seconds have passed since *start*.
        ``None`` results are skipped so that later stages never index into
        ``None``.

        Args:
            start: reference timestamp from ``time.time()``.

        Returns:
            list: the records fetched (possibly empty).
        """
        pkg_datas = []
        for _ in range(100):
            if (time.time() - start) > 20:
                break
            pkg_data = self.get_redis.get_redis_pkgname()
            if pkg_data is None:
                continue
            pkg_datas.append(pkg_data)
        return pkg_datas

    def _build_check_tasks(self, results):
        """Schedule one US version check per package record.

        :param results: iterable of package records
        :return: list of scheduled ``check_app_version`` tasks
        """
        return [
            asyncio.ensure_future(self.check_app_version(result))
            for result in results
        ]

    def _task_ensure_future(self, func, data, tasks):
        """Schedule ``func(data)`` on the loop and append the task to *tasks*."""
        task = asyncio.ensure_future(func(data))
        tasks.append(task)

    def _build_other_insert(self, check_results):
        """Act on the US check results.

        For each ``(data_return, analysis_data)`` pair: re-queue the package
        in redis when ``is_update == 2``, schedule a MySQL insert of the US
        record when one was parsed, and schedule the other-locale fetch when
        ``is_update == 1``.

        :param check_results: results of the ``check_app_version`` tasks
        :return: (MySQL insert tasks, other-locale check tasks)
        """
        save_mysql_tasks = []
        check_other_tasks = []
        for check_result in check_results:
            try:
                data_return, analysis_data = check_result
                if data_return is not None and data_return["is_update"] == 2:
                    self.get_redis.update_pkgname_redis(data_return)
                if analysis_data is not None:
                    self._task_ensure_future(self.get_pool.insert_mysql_,
                                             analysis_data, save_mysql_tasks)
                if data_return is not None and data_return["is_update"] == 1:
                    self._task_ensure_future(self.check_other_coutry,
                                             data_return, check_other_tasks)
            except Exception as e:
                self.printf.info('错误信息:' + str(e))
        return save_mysql_tasks, check_other_tasks

    def run(self):
        """Run the check pipeline forever.

        redis -> US version check -> redis re-queue on failure -> other
        locales for updated packages -> MySQL.
        """
        while True:
            start = time.time()
            pkg_datas = self._get_pkgdata_redis(start)
            check_tasks = self._build_check_tasks(pkg_datas)
            if not check_tasks:
                # Nothing queued: back off instead of spinning on redis.
                time.sleep(1)
                continue
            check_results = self.loop.run_until_complete(
                asyncio.gather(*check_tasks))
            save_mysql_tasks, check_other_tasks = self._build_other_insert(
                check_results)
            if check_other_tasks:
                self.loop.run_until_complete(
                    asyncio.wait(check_other_tasks))
            for result_list in self.all_data_list:
                if result_list is not None:
                    # Wrap as a task: asyncio.wait() must not be handed bare
                    # coroutine objects.
                    self._task_ensure_future(self.get_pool.insert_mysql_,
                                             result_list, save_mysql_tasks)
            self.all_data_list = []
            if save_mysql_tasks:
                self.loop.run_until_complete(
                    asyncio.wait(save_mysql_tasks))
class CrawlApkName:
    """Collect package names from Play Store charts and category pages.

    ``run`` gathers the names into ``self.apk_names`` and stores each one in
    redis via ``RedisOption.save_pkgname_redis``.
    """

    # XPath selecting the package id of every app card on a listing page.
    _APKNAME_XPATH = (
        "//div[@class='card no-rationale square-cover apps small']"
        "//span[@class='preview-overlay-container']/@data-docid")
    # Sub-category URLs are kept only if they contain one of these markers.
    _WANTED_CATEGORIES = ("GAME", "SOCIAL", "SPORTS", "SHOPPING",
                          "HEALTH_AND_FITNESS", "COMICS")

    def __init__(self):
        self.apk_names = set()
        self.son_category_url = set()
        self.session = GetSetting().get_session()
        self.crawl_proxy = crawl_fn()
        self.loop = GetSetting().get_loop()
        self.lock = asyncio.Lock()
        self.rcon = GetSetting().get_redis()
        self._Request = InitiateRequest()
        self.printf = GetSetting().get_logger()
        self.proxies = []
        self.get_redis = RedisOption()
        self.host = "https://play.google.com/"
        # Chart endpoints, keyed by a short label.
        self.urls = {
            # new top free games (POST URL)
            'url_newgame_free': 'https://play.google.com/store/apps/category/GAME/collection/topselling_new_free?authuser=0',
            # new top free apps
            'url_newcolletion_free': 'https://play.google.com/store/apps/collection/topselling_new_free?authuser=0',
            "url_app_topsell": "https://play.google.com/store/apps/collection/topselling_free",
            'url_paid_topcol': 'https://play.google.com/store/apps/collection/topselling_paid?authuser=0',
            'url_topgross_col': 'https://play.google.com/store/apps/collection/topgrossing?authuser=0',
            'url_topgame_free': 'https://play.google.com/store/apps/category/GAME/collection/topselling_free?authuser=0',
            'url_topgross_game': 'https://play.google.com/store/apps/category/GAME/collection/topgrossing?authuser=0',
            "url_new_app": "https://play.google.com/store/apps/collection/topselling_new_free",
            "url_new_game": "https://play.google.com/store/apps/category/GAME/collection/topselling_new_free",
        }

    def get_apknames_tasks(self):
        """Schedule the chart POST requests.

        For every chart URL: one request without paging fields, followed by
        19 paged requests (``start`` = 60, 120, ..., 1140; 60 items each).

        Returns:
            list: the scheduled ``fetch_post_apkname`` tasks.
        """
        tasks = []
        for url in self.urls.values():
            post_data_first = {'ipf': '1', 'xhr': '1'}
            tasks.append(asyncio.ensure_future(
                self.fetch_post_apkname(url, post_data_first)))
            for i in range(1, 20):
                post_data = {
                    'start': i * 60,
                    'num': '60',
                    'numChildren': '0',
                    'cctcss': 'square-cover',
                    'cllayout': 'NORMAL',
                    'ipf': '1',
                    'xhr': '1',
                }
                tasks.append(asyncio.ensure_future(
                    self.fetch_post_apkname(url, post_data)))
        return tasks

    def build_async_tasks(self, urls):
        """Schedule one ``get_web_data`` task per URL and return the list."""
        return [asyncio.ensure_future(self.get_web_data(url)) for url in urls]

    def _collect_apknames(self, html):
        """Parse a listing page and add every package id to ``apk_names``."""
        analysis_data = etree.HTML(html)
        self.apk_names.update(analysis_data.xpath(self._APKNAME_XPATH))

    def _discard_proxy(self, proxy):
        """Remove *proxy* from the pool; a proxy already gone is not an error."""
        try:
            self.proxies.remove(proxy)
        except ValueError:
            pass

    async def fetch_post_apkname(self, url, data):
        """POST *data* to *url* and harvest package names (up to 3 attempts).

        A failed attempt is logged and its proxy removed from the pool before
        the next attempt draws a new one.
        """
        for _ in range(3):
            proxy = await self.get_proxy()
            try:
                datas = await self._Request.post_request(
                    self.session, url, proxy, data)
                self._collect_apknames(datas)
                break
            except Exception as e:
                self.printf.info(str(e))
                self._discard_proxy(proxy)

    async def fetch_get_apkname(self, url):
        """GET *url* and harvest package names (up to 3 attempts).

        A failed attempt is logged and its proxy removed from the pool before
        the next attempt draws a new one.
        """
        for _ in range(3):
            proxy = await self.get_proxy()
            try:
                data = await self._Request.get_request(
                    self.session, url, proxy)
                self._collect_apknames(data)
                break
            except Exception as e:
                self.printf.info(str(e))
                self._discard_proxy(proxy)

    async def get_category_url(self, data):
        """Extract the wanted sub-category URLs from a store page.

        Links are read from the ``dropdown-submenu`` block, prefixed with
        ``self.host`` and kept when they contain one of the wanted category
        markers.

        Returns:
            set: the matching absolute URLs.
        """
        analysis_data = etree.HTML(data)
        hrefs = analysis_data.xpath(
            "//div[@class='dropdown-submenu']//a/@href")
        feasible_url = set()
        for href in hrefs:
            url = self.host + href
            if any(marker in url for marker in self._WANTED_CATEGORIES):
                feasible_url.add(url)
        return feasible_url

    async def get_proxy(self):
        """Return a random proxy, refilling the pool when it runs low.

        The pool is refreshed whenever fewer than three proxies remain. The
        retry is a loop *inside* the lock: ``asyncio.Lock`` is not reentrant,
        so re-entering this coroutine while holding it would never complete.
        """
        async with self.lock:
            while True:
                if len(self.proxies) < 3:
                    # Treat a None result like an empty list and retry.
                    self.proxies = (
                        await self.crawl_proxy.run(self.session)) or []
                if self.proxies:
                    return choice(self.proxies)

    async def get_web_data(self, url):
        """GET *url* through a proxy.

        Returns:
            The response body, or ``None`` when the request fails (the proxy
            is then removed from the pool).
        """
        proxy = await self.get_proxy()
        try:
            return await self._Request.get_request(self.session, url, proxy)
        except Exception as e:
            self.printf.info(str(e))
            self._discard_proxy(proxy)
            return None

    async def get_main_url(self):
        """Fetch the store front page and return its top-level category links.

        Returns:
            list of hrefs, or ``None`` when the request or parsing fails (the
            proxy is then removed from the pool).
        """
        proxy = await self.get_proxy()
        url = "https://play.google.com/store/apps"
        try:
            data = await self._Request.get_request(self.session, url, proxy)
            analysis_data = etree.HTML(data)
            return analysis_data.xpath("//div[@class='g4kCYe']/a/@href")
        except Exception as e:
            self.printf.info(str(e))
            self._discard_proxy(proxy)
            return None

    def run(self):
        """Crawl charts and categories, then store every package name in redis.

        A failure to load the front page or the first category page no longer
        aborts the run: names already collected are still saved.
        """
        self.apk_names.clear()
        # Package names from the chart listings.
        tasks = self.get_apknames_tasks()
        self.loop.run_until_complete(asyncio.wait(tasks))
        # Top-level category URLs from the store front page.
        urls = self.loop.run_until_complete(self.get_main_url())
        if urls:
            tasks = self.build_async_tasks(urls)
            results = self.loop.run_until_complete(asyncio.gather(*tasks))
            # Sub-category URLs are taken from the first fetched page only.
            if results and results[0]:
                allurls = self.loop.run_until_complete(
                    asyncio.ensure_future(self.get_category_url(results[0])))
                self.son_category_url.update(allurls)
        get_apkname_tasks = [
            asyncio.ensure_future(self.fetch_get_apkname(url))
            for url in self.son_category_url
        ]
        self.loop.run_until_complete(asyncio.gather(*get_apkname_tasks))
        for apkname in self.apk_names:
            self.get_redis.save_pkgname_redis(apkname)