def runf(self, job, debug=False):
    """
    Process one crawl job.

    A job is a tuple (id, url, isvalidurl).  The page at `url` is fetched
    and, when an url flagged as invalid still returns content, the page and
    job are dumped to stdout for inspection.

    assuming all pages are utf-8:
        page = page.decode('utf-8')

    NOTE(review): saving to disk is currently disabled (mkdir is commented
    out and nothing is ever written to savefilepath) -- confirm intended.
    """
    # NOTE(review): `id` shadows the builtin of the same name.
    id, url, isvalidurl = job
    # Per-task save location: <config.here>/<taskname>/<escaped id>/home.html
    savefolder = os.path.join(config.here, self.state.taskname)
    esc_id = id.replace('/', '@##@')  # make the id safe as a directory name
    savefolder = os.path.join(savefolder, esc_id)
    if not os.path.exists(savefolder):
        pass  # os.mkdir(savefolder)  -- directory creation deliberately disabled
    savefilepath = os.path.join(savefolder, 'home.html')
    # `True or ...` forces a re-fetch on every run; the on-disk cache check
    # is disabled (presumably for debugging -- TODO confirm).
    if True or not os.path.exists(savefilepath):
        page = urlutil.getpage(url)
        if not isvalidurl and page:
            # An url flagged invalid still produced content -- dump for inspection.
            print(page)
            print(id, url)
    """
    if toplevelp:
        newurls = myparser.get_urls(page)
        for u in newurls:
            newjob = (id, u, False)
    """
    # I'm curious about -- why empty url could get some page saved """
def get_ids_from_devid_by_developerpage(devid, max_start=400):
    """
    Collect app ids from a developer's Google Play pages, paging until empty.

    Args:
        devid: developer id used to build each page url.
        max_start: exclusive upper bound on the paging offset.  Defaults to
            400, the previously hard-coded limit, so existing callers are
            unaffected.

    Returns:
        List of app ids scraped from the developer's pages (possibly empty).
    """
    ids = []
    num = config.nid_per_devpage  # page size, configured elsewhere
    for start in range(0, max_start, num):
        url = urlutil.makeurlfromdevid(devid, start, num)
        page = urlutil.getpage(url)
        if not page:
            # Fetch failed / empty response: stop paging instead of
            # crashing on None.decode().
            break
        newids = myparser.parse_googleplay_developerpage(page.decode('utf-8'))
        if not newids:
            break  # an empty page means no more results
        ids += newids
    return ids
def download_googleplay_single_page(query, start, num):
    """
    Fetch one Google Play search-result page and return the app ids on it.

    Args:
        query: raw (unescaped) search query string.
        start: result offset for paging.
        num: number of results requested for this page.

    Returns:
        An id list, as produced by myparser.parse_searchpage.
    """
    # Bug fix: the old code did query.replace(' ', '+') *before* quote_plus,
    # which made quote_plus encode the '+' as '%2B' -- searching for a
    # literal plus sign.  quote_plus alone already maps spaces to '+'.
    query = urllib.parse.quote_plus(query)
    url = 'https://play.google.com/store/search?q=%s&c=apps&start=%d&num=%d' % (query, start, num)
    greenprint(url)
    page = urlutil.getpage(url)
    page = page.decode('utf-8')
    blueprint('page size is %d' % len(page))
    return myparser.parse_searchpage(page, query)
def main():
    """
    Fetch the app-store home page, extract category names and urls, and
    dump them as "category url" lines to ios-home-res.info.
    """
    page = urlutil.getpage(baseurl)
    page = page.decode('utf-8')
    print('size of page fetched is %d' % len(page))
    # res is a list of category urls extracted from the home page.
    res = parse_appstore_home(page)
    print(len(res))
    # Offset just past the 'genre' marker in the url.  NOTE(review): stp is
    # computed from res[0] only and applied to every url -- assumes all
    # urls share an identical prefix up to 'genre'; confirm against the
    # actual page format.
    stp = res[0].find('genre') + 6
    cats = [url[stp:] for url in res]
    # Strip a 4-char prefix and everything from the next '/' onward to get
    # the bare category name (the magic 4 presumably skips something like
    # 'ios/' -- TODO confirm).
    cats = [url[4:url.find('/')] for url in cats]
    zipped = list(zip(cats, res))
    print(zipped)
    with open('ios-home-res.info', 'w') as f:
        dumpstr = '\n'.join(['%s %s' % (cat, url) for cat, url in zipped])
        f.write(dumpstr)
def test():
    """Smoke test: fetch a known developer page and print the parsed result."""
    from urlutil import getpage
    dev_url = 'https://play.google.com/store/apps/developer?id=Binary+Helix&hl=en'
    html = getpage(dev_url).decode('utf-8')
    print(parse_topdev_devhome(html))