Example #1
0
def parse_googleplay_apppage(page, id=None):
    """Parse a Google Play app page into a dict of app attributes.

    Args:
        page: Page markup; it is measured with ``len`` and handed to ``pq``.
        id: App id to store under ``'id'``. When falsy, no ``'id'`` key is
            written (reading the id out of the page is not implemented).

    Returns:
        A dict with the keys ``title``, ``description``, ``meta``,
        ``developer``, ``iconurl``, ``rate``, ``usernum``, ``alsoinstalled``,
        ``samedev``, ``related`` and ``timestamp``, plus ``id`` when one was
        supplied. ``rate`` and ``usernum`` are ``-1`` when the page has no
        matching element.

    Raises:
        IDError: If ``page`` is shorter than ``config.pagesize_threshold``.
    """
    if len(page) < config.pagesize_threshold:
        raise IDError(id)

    doc = pq(page)

    def first_text_or_default(selector):
        # Text of the first match, or -1 when the selector matches nothing.
        matches = doc(selector)
        return matches[0].text if len(matches) != 0 else -1

    def ids_for(selector):
        # Gather every href first, then convert each one to an id.
        hrefs = [node.get('href') for node in doc(selector)]
        return [urlutil.getidfromurl(href) for href in hrefs]

    # NOTE(review): the developer entry indexes the first '.doc-header-link'
    # match, so a page without such an element fails at that point.
    result = {
        'title': doc('.doc-banner-title').text(),
        'description': doc('#doc-original-text').text(),
        'meta': doc('.doc-metadata-list').text(),
        'developer': urlutil.getidfromurl(
            doc('.doc-header-link')[0].attrib['href']
        ).replace('+', ' '),
        'iconurl': doc('.doc-banner-icon').find('img').attr('src'),
    }

    # TODO: parse the id from the page when the caller does not supply one.
    if id:
        result['id'] = id

    # Rating and vote count do not always appear on an app page.
    result['rate'] = first_text_or_default('.average-rating-value')
    result['usernum'] = first_text_or_default('.votes')

    related = ids_for("div[data-analyticsid='related'] .common-snippet-title")
    also_installed = ids_for("div[data-analyticsid='users-also-installed'] .common-snippet-title")

    try:
        # Preferred source: the developer's own page, minus the current app.
        from_dev_page = dev_downloader.get_ids_from_devid_by_developerpage(result['developer'])
        same_developer = [app_id for app_id in from_dev_page if app_id != id]
    except Exception as err:
        redprint("Exception in parsing devids from devpage")
        redprint(str(err))
        # Fall back to the "more from developer" section of this page.
        same_developer = ids_for("div[data-analyticsid='more-from-developer'] .common-snippet-title")

    # Keyword order fixes the key insertion order; the timestamp goes last.
    result.update(
        alsoinstalled=also_installed,
        samedev=same_developer,
        related=related,
        timestamp=time.time(),
    )
    return result
Example #2
0
def parse_get_devid(page):
    """Return the developer id taken from the page's header link.

    The id is whatever ``urlutil.getidfromurl`` returns for the link's
    ``href``, with every ``'+'`` replaced by a space.

    Args:
        page: Page markup; it is measured with ``len`` and handed to ``pq``.

    Raises:
        Exception: If ``page`` is shorter than 100, or if the page does not
            contain exactly one ``.doc-header-link`` element. A message is
            printed before either exception is raised.
    """
    if len(page) < 100:
        print("ERROR, page size too small")
        raise Exception("page size too small")

    header_links = pq(page)('.doc-header-link')
    if len(header_links) == 1:
        href = header_links[0].attrib['href']
        return urlutil.getidfromurl(href).replace('+', ' ')

    # Zero or several header links: the developer cannot be identified.
    print("ERROR in parsing")
    raise Exception('Exception in Parsing dev id')
Example #3
0
def parse_googleplay_searchpage(page, query):
    """Return the ids of the result entries on a Google Play search page.

    Args:
        page: Page markup; it is measured with ``len`` and handed to ``pq``.
        query: Not used by the current body; kept in the signature. It was
            only referenced by a result-count sanity check that is disabled.

    Returns:
        A list with one ``urlutil.getidfromurl`` result per matched title
        link, or ``[]`` (after printing an error) when ``page`` is shorter
        than 2000.
    """
    if len(page) < 2000:
        print("ERROR, page size too small")
        return []

    title_links = pq(page)('.search-results-list .search-results-item a.title')
    return [urlutil.getidfromurl(link.attrib['href']) for link in title_links]
Example #4
0
def parse_googleplay_developerpage(page):
    """Collect the app ids listed on a Google Play developer page.

    Parsing is best-effort: any ordinary error (an ``Exception``) is printed
    and an empty list is returned. Exceptions outside the ``Exception``
    hierarchy, such as ``KeyboardInterrupt`` and ``SystemExit``, propagate
    to the caller.

    Args:
        page: Page markup; it is measured with ``len`` and handed to ``pq``.

    Returns:
        A list with one ``urlutil.getidfromurl`` result per matched title
        link, or ``[]`` when ``page`` is shorter than
        ``config.pagesize_threshold`` or when parsing fails.
    """
    try:
        if len(page) < config.pagesize_threshold:
            # Too small to be a real developer page.
            return []
        d = pq(page)
        matchedli = d('.num-pagination-page .snippet-list .details .title')
        return [urlutil.getidfromurl(item.get('href')) for item in matchedli]
    except Exception as e:
        # Deliberately not a `return` inside `finally`: that form discards
        # every in-flight exception, including KeyboardInterrupt and
        # SystemExit. Ordinary failures still degrade to an empty result.
        # NOTE(review): assumes IDError derives from Exception - confirm.
        print(str(e))
        return []