Ejemplo n.º 1
0
def runf(self, job, debug=False):
    """Fetch the page for one crawl job and prepare its save location.

    NOTE(review): this example is truncated by the scrape — the body
    continues past what is visible here, so only the visible part is
    documented.

    Parameters:
        job: a (id, url, isvalidurl) triple; ``id`` may contain '/'
            characters, which are escaped for use as a folder name.
        debug: unused in the visible portion of the body.
    """
    id, url, isvalidurl = job
    # Save under <config.here>/<taskname>/<escaped-id>/
    savefolder = os.path.join(config.here, self.state.taskname)
    # '/' in the id would create nested directories; escape it.
    esc_id = id.replace('/', '@##@')
    savefolder = os.path.join(savefolder, esc_id)
    if not os.path.exists(savefolder):
        # NOTE(review): directory creation is disabled — the save below
        # would fail for a missing folder; confirm this is intentional.
        pass
        # os.mkdir(savefolder)

    savefilepath = os.path.join(savefolder, 'home.html')
    # NOTE(review): 'True or' forces a re-fetch every time, bypassing the
    # exists() cache check — looks like a debugging leftover; confirm.
    if True or not os.path.exists(savefilepath):
        page = urlutil.getpage(url)

        # Pages reached through an invalid url are dumped for inspection.
        if not isvalidurl and page:
            print(page)
            print(id, url)

        """
        if toplevelp:
            newurls = myparser.get_urls(page)
            for u in newurls:
                newjob = (id, u, False)
        """

        # I'm curious about -- why empty url could get some page saved
        """
Ejemplo n.º 2
0
def get_ids_from_devid_by_developerpage(devid, max_results=400):
    """Collect app ids listed on a developer's Google Play page.

    Pages through the developer listing ``config.nid_per_devpage`` ids at
    a time, stopping as soon as a page yields no ids or ``max_results``
    ids have been requested.

    Parameters:
        devid: developer identifier used to build each listing URL.
        max_results: upper bound on ids to request; defaults to 400,
            the previously hard-coded limit, so existing callers are
            unaffected.

    Returns:
        A (possibly empty) list of app id strings, in listing order.
    """
    ids = []
    num = config.nid_per_devpage
    for start in range(0, max_results, num):
        url = urlutil.makeurlfromdevid(devid, start, num)
        # Pages are assumed utf-8, consistent with the other examples.
        page = urlutil.getpage(url).decode('utf-8')
        newids = myparser.parse_googleplay_developerpage(page)
        if not newids:
            # An empty page means the listing is exhausted.
            break
        ids += newids
    return ids
Ejemplo n.º 3
0
def download_googleplay_single_page(query, start, num):
    """Fetch one Google Play search-result page and parse it for app ids.

    Parameters:
        query: raw search string; URL-encoded here before use.
        start: result offset for paging.
        num: number of results requested on this page.

    Returns:
        The id list produced by ``myparser.parse_searchpage``.
    """
    # BUG FIX: quote_plus already encodes spaces as '+'. The old code did
    # query.replace(' ', '+') first, so quote_plus then re-encoded that
    # '+' as '%2B', corrupting multi-word queries ("a b" -> "a%2Bb"
    # instead of the intended "a+b").
    query = urllib.parse.quote_plus(query)
    url = 'https://play.google.com/store/search?q=%s&c=apps&start=%d&num=%d' % (query, start, num)
    greenprint(url)
    page = urlutil.getpage(url)
    # Assume the result page is utf-8, as the other examples do.
    page = page.decode('utf-8')
    blueprint('page size is %d' % len(page))
    return myparser.parse_searchpage(page, query)
Ejemplo n.º 4
0
def main():
    """Fetch the App Store home page, extract category urls, and write
    one "<category> <url>" line per entry to ios-home-res.info.
    """
    page = urlutil.getpage(baseurl)
    # Assume the home page is utf-8, consistent with the other examples.
    page = page.decode('utf-8')

    print('size of page fetched is %d' % len(page))
    res = parse_appstore_home(page)
    print(len(res))

    # Robustness: an empty parse result would have crashed the res[0]
    # offset computation below with an IndexError.
    if not res:
        print('no category urls parsed; nothing to save')
        return

    # All urls are assumed to share the same '.../genre/' prefix, so the
    # slice offset is computed once from the first url ('genre' is 5
    # chars, +1 for the following '/').
    stp = res[0].find('genre') + 6
    cats = [url[stp:] for url in res]
    # NOTE(review): drops a fixed 4-char prefix and keeps up to the next
    # '/' — assumes a stable 'idNN' segment layout; confirm against the
    # parser's output.
    cats = [url[4:url.find('/')] for url in cats]
    zipped = list(zip(cats, res))
    print(zipped)

    with open('ios-home-res.info', 'w') as f:
        dumpstr = '\n'.join(['%s %s' % (cat, url) for cat, url in zipped])
        f.write(dumpstr)
Ejemplo n.º 5
0
def test():
    """Smoke-test parse_topdev_devhome against a live developer page."""
    from urlutil import getpage
    raw = getpage('https://play.google.com/store/apps/developer?id=Binary+Helix&hl=en')
    html = raw.decode('utf-8')
    print(parse_topdev_devhome(html))