def test_getCompleteAppInfoPartial():
    """A partially filled app record must not be reported as complete.

    Stores an app that carries only name, id, rating, install_fee and
    app_icon, then checks that ``AppAccessor.getCompleteAppInfo`` returns
    ``None`` for that id.
    """
    partialFields = {
        'name': 'test app',
        'id': 'testid',
        'rating': 1,
        'install_fee': 0,
        'app_icon': '',
    }
    partialApp = AppItem()
    for field, value in partialFields.items():
        partialApp[field] = value

    accessor = AppAccessor()
    accessor.insertOrUpdateApp(partialApp)

    completeInfo = accessor.getCompleteAppInfo('testid')
    assert completeInfo is None, "Database only has partial info."
def getCompleteAppInfo(app_ids: List[str]) -> List[AppItem]:
    """
    Look up complete app info for each id, scraping the ids the database cannot answer.

    Every id is first looked up in the database (skipped when the database holds no apps).
    Ids without complete info are passed to the scraper (``../scraper/Program.py -p id1,id2,...``)
    and then looked up again. If an id still has no complete info after the scraper ran,
    its entry is a plain dict holding only ``id``.

    :param app_ids: list of app_id
    :return: list of app_info, one entry per distinct app_id, in the order the ids first
        appear in app_ids. Duplicate ids collapse into a single entry, so the result can be
        shorter than the input. Entries for failed ids are ``{'id': app_id}`` dicts.
    :raises requests.exceptions.SSLError: if the scraper exits with
        GooglePlayAdvancedSearch.Errors.sslErrorCode.
    """
    # Local import so this function brings its own dependency into scope.
    import subprocess

    if not app_ids:
        return []

    # https://stackoverflow.com/a/39537308/746461
    # Python 3.6 and up keeps dict insertion order.
    # Python 3.7 formalizes it to a language specification.
    if sys.version_info < (3, 6):
        # sys.version_info is a named tuple. https://docs.python.org/3/glossary.html#term-named-tuple
        print(f'Python {tuple(sys.version_info)} may not keep dictionary insertion order. Upgrade to at least version 3.6.', file=sys.stderr)

    appAccessor = AppAccessor()
    with connection.cursor() as cursor:
        # very important!
        # connection.cursor() gives django cursor, connection.cursor().connection.cursor() gives underlying sqlite cursor.
        # django cursor and sqlite cursor uses different parameter style!
        if GooglePlayAdvancedSearch.DBUtils.getAppCountInDatabase(cursor) > 0:
            # search database first pass. If the app isn't in database, leave it none. We will fill in the second pass.
            app_infos = {app_id: appAccessor.getCompleteAppInfo(app_id) for app_id in app_ids}
        else:
            app_infos = {app_id: None for app_id in app_ids}

    # search database second pass
    # for first-pass non-found apps, pass into scraper
    appsMissingInDatabase = [app_id for app_id, info in app_infos.items() if info is None]
    if appsMissingInDatabase:
        # Pass the ids as one argv element instead of splicing them into a shell command line:
        # they originate from external search results and must never be interpreted by a shell.
        command = ['../scraper/Program.py', '-p', ','.join(appsMissingInDatabase)]
        if sys.platform == 'win32':
            # Windows cannot execute the script directly; run it through the interpreter.
            command.insert(0, 'python')

        try:
            # returncode is the plain exit code on every platform, no WEXITSTATUS decoding needed.
            exitCode = subprocess.run(command).returncode
        except OSError as e:
            # Keep the best-effort behaviour: an unstartable scraper only means the ids below
            # stay unresolved and fall back to {'id': ...}.
            print(f'Unable to start the scraper: {e}', file=sys.stderr)
            exitCode = None

        if exitCode == GooglePlayAdvancedSearch.Errors.sslErrorCode:
            raise requests.exceptions.SSLError()

        # A fresh accessor is created after the scraper has run, as in the original flow.
        appAccessor = AppAccessor()
        scraper_fail_id = []
        for app_id in appsMissingInDatabase:
            scraped = appAccessor.getCompleteAppInfo(app_id)
            if scraped:
                app_infos[app_id] = scraped
            else:
                # if scraper fails, just pass "id" to appDetails to display
                app_infos[app_id] = {'id': app_id}
                scraper_fail_id.append(app_id)

        print("Scraper failed %d times: %s" % (len(scraper_fail_id), ", ".join(scraper_fail_id)))
        print("There were %d ids not in our database or stale. %d are now added" % (len(appsMissingInDatabase), len(appsMissingInDatabase) - len(scraper_fail_id)))

    print(f'total results: {len(app_ids)}')
    assert None not in app_infos.values(), "Every app id returned from Google should have an app detail."
    return list(app_infos.values())
def search(request: django.http.HttpRequest):
    """
    Search apps by keyword, first in our database and then, if needed, on Google Play.

    Query parameters read from ``request.GET``:
      * ``q``    - search keyword (surrounding whitespace is stripped).
      * ``pids`` - comma-separated integer ids, passed to ``filterApps`` as excludedPIds.
      * ``cids`` - comma-separated integer ids, passed to ``filterApps`` as excludedCIds.
      * ``sort`` - ``rlh``/``rhl`` (rating ascending/descending) or
                   ``plh``/``phl`` (number of permissions ascending/descending).

    :return: a JsonResponse with an ``apps`` list on success, or with an ``error`` message when
        the rate limit is hit, ``pids``/``cids`` are malformed, or an SSL error is raised
        during the search.
    """
    # Requests carrying the `_gaload` cookie (set when Google Analytics is loaded, presumably)
    # are not rate limited here; Nginx handles the rate limit for them.
    if not request.COOKIES.get('_gaload') and limitRate(getClientIP(request)):
        return JsonResponse({'error': 'Rate limit reached. Wait 60 seconds.'})

    keyword = request.GET.get('q', '').strip()

    with connection.cursor() as cursor:
        try:
            logSearch(cursor, keyword, request)
        except django.db.utils.OperationalError:
            # Create the search-log table with the project's SQL, then retry once.
            cursor.execute(apiHelper.getSqlCreateTableSearch())
            try:
                logSearch(cursor, keyword, request)
            except Exception as e:
                # Logging is best effort: a failure here must not abort the search itself.
                print(str(e))

    try:
        excludedPIds = [int(n) for n in request.GET.get('pids', '').split(',') if n != '']
        excludedCIds = [int(n) for n in request.GET.get('cids', '').split(',') if n != '']
    except ValueError:
        # Malformed ids used to escape as an unhandled ValueError (HTTP 500).
        return JsonResponse({'error': 'pids and cids must be comma-separated integers.'})

    try:
        appAccessor = AppAccessor()
        appInfos = appAccessor.searchApps(keyword)

        needCompleteInfo = determineAppInfoCompleteness(request)
        if needCompleteInfo:
            appInfos = getCompleteAppInfo([a['id'] for a in appInfos])

        appInfos = filterApps(appInfos, excludedCIds, excludedPIds, request)

        # If we cannot find 200 matches from our database, we try to find more matches from Google.
        if len(appInfos) < 200 and cache.get('searchkey-' + keyword) is None:
            cache.set('searchkey-' + keyword, '', timeout=60 * 5)  # do not search the same keyword in 5 minutes
            appInfos2 = apiHelper.searchGooglePlay(keyword)
            if needCompleteInfo:
                appInfos2 = getCompleteAppInfo([a['id'] for a in appInfos2])

            appInfos2 = filterApps(appInfos2, excludedCIds, excludedPIds, request)

            # Append only the Google Play results that the database search did not already return.
            appInfoIds = {a['id'] for a in appInfos}
            appInfos.extend([a for a in appInfos2 if a['id'] not in appInfoIds])

        # sort value -> (sort key, reverse)
        sortOptions = {
            'rlh': (lambda a: a['rating'], False),  # rating low to high
            'rhl': (lambda a: a['rating'], True),  # rating high to low
            'plh': (lambda a: len(a['permissions']), False),  # number of permissions low to high
            'phl': (lambda a: len(a['permissions']), True),  # number of permissions high to low
        }
        sortType = request.GET.get('sort')
        if sortType in sortOptions:
            sortKey, descending = sortOptions[sortType]
            appInfos = sorted(appInfos, key=sortKey, reverse=descending)

        response = JsonResponse({'apps': [dict(a) for a in appInfos]}, safe=False)
        response['Cache-Control'] = "public, max-age=3600"
        return response
    except requests.exceptions.SSLError as e:
        # In getCompleteAppInfo, we throw our own SSLError where we don't have request object.
        if e.request:
            url = urlparse(e.request.url)
            return JsonResponse({
                'error': f'Searching is aborted because secure connection to https://{url.netloc} is compromised.\nAttacker is attacking us, but we didn\'t leak your data!'
            })
        else:
            return JsonResponse({
                'error': 'Searching is aborted because secure connection is compromised.\nAttacker is attacking us, but we didn\'t leak your data!'
            })
def search(request):
    """
    Search apps by keyword, first in our database and then, if needed, on Google Play.

    Query parameters read from ``request.GET``:
      * ``q``    - search keyword; a missing ``q`` is treated as an empty keyword.
      * ``pids`` - comma-separated integer ids; apps whose ``permissions`` are excluded
                   according to ``isExcluded`` are dropped.
      * ``cids`` - comma-separated integer ids; apps whose ``categories`` are excluded
                   according to ``isExcluded`` are dropped.
      * ``sort`` - ``rlh``/``rhl`` (rating ascending/descending) or
                   ``plh``/``phl`` (number of permissions ascending/descending).

    :return: a JsonResponse with ``executionSeconds`` and an ``apps`` list on success, or with
        an ``error`` message when ``pids``/``cids`` are malformed or an SSL error is raised
        during the search.
    """
    startTime = time.time()

    # A missing `q` used to raise from request.GET['q']; fall back to an empty keyword instead.
    keyword = request.GET.get('q', '')

    try:
        excludedPIds = [int(n) for n in request.GET.get('pids', '').split(',') if n != '']
        excludedCIds = [int(n) for n in request.GET.get('cids', '').split(',') if n != '']
    except ValueError:
        # Malformed ids used to escape as an unhandled ValueError (HTTP 500).
        return JsonResponse({'error': 'pids and cids must be comma-separated integers.'})

    def applyExclusions(apps):
        """Drop the apps rejected by the permission and category exclusion lists."""
        if excludedPIds:
            apps = [a for a in apps if not isExcluded(a['permissions'], excludedPIds)]
        if excludedCIds:
            apps = [a for a in apps if not isExcluded(a['categories'], excludedCIds)]
        return apps

    try:
        appAccessor = AppAccessor(1)
        appInfos = appAccessor.searchApps(keyword)

        needCompleteInfo = determineAppInfoCompleteness(request)
        if needCompleteInfo:
            appInfos = getCompleteAppInfo([a['id'] for a in appInfos])

        appInfos = applyExclusions(appInfos)

        # If we cannot find 200 matches from our database, we try to find more matches from Google.
        if len(appInfos) < 200:
            appInfos2 = searchGooglePlay(keyword)
            if needCompleteInfo:
                appInfos2 = getCompleteAppInfo([a['id'] for a in appInfos2])

            appInfos2 = applyExclusions(appInfos2)

            # Append only the Google Play results that the database search did not already return.
            appInfoIds = {a['id'] for a in appInfos}
            appInfos.extend([a for a in appInfos2 if a['id'] not in appInfoIds])

        # sort value -> (sort key, reverse)
        sortOptions = {
            'rlh': (lambda a: a['rating'], False),  # rating low to high
            'rhl': (lambda a: a['rating'], True),  # rating high to low
            'plh': (lambda a: len(a['permissions']), False),  # number of permissions low to high
            'phl': (lambda a: len(a['permissions']), True),  # number of permissions high to low
        }
        sortType = request.GET.get('sort')
        if sortType in sortOptions:
            sortKey, descending = sortOptions[sortType]
            appInfos = sorted(appInfos, key=sortKey, reverse=descending)

        response = JsonResponse(
            {
                'executionSeconds': time.time() - startTime,
                'apps': [dict(a) for a in appInfos]
            },
            safe=False)
        response['Cache-Control'] = "private, max-age=3600"
        return response
    except requests.exceptions.SSLError as e:
        # In getCompleteAppInfo, we throw our own SSLError where we don't have request object.
        if e.request:
            url = urlparse(e.request.url)
            return JsonResponse({
                'error': f'Searching is aborted because secure connection to https://{url.netloc} is compromised.\nAttacker is attacking us, but we didn\'t leak your data!'
            })
        else:
            return JsonResponse({
                'error': 'Searching is aborted because secure connection is compromised.\nAttacker is attacking us, but we didn\'t leak your data!'
            })