def _snippet_ids(d, section):
    """Return the app ids linked from one snippet section of an app page.

    ``d`` is the parsed page object returned by ``pq``; ``section`` is the
    value of the ``data-analyticsid`` attribute of the enclosing ``div``
    (e.g. ``'related'``).  Each matching ``.common-snippet-title`` element's
    ``href`` is converted with ``urlutil.getidfromurl``.
    """
    selector = "div[data-analyticsid='%s'] .common-snippet-title" % section
    return [urlutil.getidfromurl(entry.get('href')) for entry in d(selector)]


def parse_googleplay_apppage(page, id=None):
    """Parse a Google Play app page into a dict describing the app.

    Args:
        page: Markup of the app page; it is handed to ``pq`` for parsing.
        id: Id of the app the page belongs to, if known.  When it is falsy
            no ``'id'`` key is stored in the result (extracting the id from
            the page itself is still a TODO).  The name shadows the builtin
            but is kept because callers may pass it by keyword.

    Returns:
        A dict with the keys ``title``, ``description``, ``meta``,
        ``developer``, ``iconurl``, ``id`` (only when ``id`` was given),
        ``rate``, ``usernum``, ``alsoinstalled``, ``samedev``, ``related``
        and ``timestamp``.  ``rate`` and ``usernum`` are ``-1`` when the
        page has no matching element.

    Raises:
        IDError: If ``page`` is shorter than ``config.pagesize_threshold``.

    The ``[0]`` lookup on ``.doc-header-link`` fails if the page contains no
    such element; that error is not handled here.
    """
    if len(page) < config.pagesize_threshold:
        raise IDError(id)

    d = pq(page)
    result = {
        'title': d('.doc-banner-title').text(),
        'description': d('#doc-original-text').text(),
        'meta': d('.doc-metadata-list').text(),
        # The developer id is taken from the header link; '+' in it is
        # turned back into a space.
        'developer': urlutil.getidfromurl(
            d('.doc-header-link')[0].attrib['href']).replace('+', ' '),
        'iconurl': d('.doc-banner-icon').find('img').attr('src'),
    }

    # TODO: parse the id from the page when the caller did not supply one.
    if id:
        result['id'] = id

    # Rating and vote count may not always appear in an app page.
    avg_item = d('.average-rating-value')
    result['rate'] = avg_item[0].text if len(avg_item) else -1
    vote_item = d('.votes')
    result['usernum'] = vote_item[0].text if len(vote_item) else -1

    related_ids = _snippet_ids(d, 'related')
    installed_ids = _snippet_ids(d, 'users-also-installed')

    # Prefer the developer page for the "same developer" list; fall back to
    # the section embedded in the app page if that lookup fails for any
    # reason.  Both paths exclude the app being parsed.
    try:
        samedev_ids = dev_downloader.get_ids_from_devid_by_developerpage(
            result['developer'])
        samedev_ids = [i for i in samedev_ids if i != id]
    except Exception as e:
        redprint("Exception in parsing devids from devpage")
        redprint(str(e))
        samedev_ids = [i for i in _snippet_ids(d, 'more-from-developer')
                       if i != id]

    result['alsoinstalled'] = installed_ids
    result['samedev'] = samedev_ids
    result['related'] = related_ids
    # Record when the page was parsed.
    result['timestamp'] = time.time()
    return result
def parse_get_devid(page):
    """Return the developer id found in the header link of ``page``.

    The page is parsed with ``pq`` and must contain exactly one
    ``.doc-header-link`` element.  That element's ``href`` is converted with
    ``urlutil.getidfromurl`` and every ``'+'`` in the result is replaced by
    a space.

    Raises:
        Exception: If ``page`` is shorter than 100 characters, or if the
            number of ``.doc-header-link`` elements is not exactly one.
            An error line is printed before raising in both cases.
    """
    too_small = len(page) < 100
    if too_small:
        print("ERROR, page size too small")
        raise Exception("page size too small")

    header_links = pq(page)('.doc-header-link')
    if len(header_links) != 1:
        print("ERROR in parsing")
        raise Exception('Exception in Parsing dev id')

    href = header_links[0].attrib['href']
    return urlutil.getidfromurl(href).replace('+', ' ')
def parse_googleplay_searchpage(page, query):
    """Return the app ids listed on a Google Play search-result page.

    Args:
        page: Markup of the search page; it is handed to ``pq``.
        query: The search query.  It is currently unused: the only code
            that referenced it was a disabled result-count sanity check.

    Returns:
        A list with one id per ``a.title`` link inside the search-result
        items, each obtained via ``urlutil.getidfromurl`` from the link's
        ``href``.  An empty list is returned (after printing an error) when
        ``page`` is shorter than 2000 characters.
    """
    if len(page) >= 2000:
        anchors = pq(page)(
            '.search-results-list .search-results-item a.title')
        return [urlutil.getidfromurl(anchor.attrib['href'])
                for anchor in anchors]

    print("ERROR, page size too small")
    return []
def parse_googleplay_developerpage(page):
    """Return the app ids listed on a Google Play developer page.

    The page is parsed with ``pq``; every element matching
    ``.num-pagination-page .snippet-list .details .title`` contributes one
    id, obtained from its ``href`` via ``urlutil.getidfromurl``.

    This is a best-effort parser: an empty list is returned when ``page`` is
    shorter than ``config.pagesize_threshold`` or when parsing fails, and
    the failure is printed rather than raised.

    Unlike a ``return`` inside ``finally`` (which discards *every* in-flight
    exception), only ``IDError`` and ordinary ``Exception`` subclasses are
    absorbed here, so ``KeyboardInterrupt`` and ``SystemExit`` still
    propagate to the caller.
    """
    try:
        if len(page) < config.pagesize_threshold:
            return []
        d = pq(page)
        matchedli = d('.num-pagination-page .snippet-list .details .title')
        return [urlutil.getidfromurl(item.get('href')) for item in matchedli]
    except IDError as e:
        print(str(e))
    except Exception as e:
        # Keep the "never raise on a bad page" contract, but make the
        # failure visible instead of swallowing it silently.
        print("Exception in parsing developer page: %s" % e)
    return []