def video_read(video_id, app_id, video_href):
    """Fetch a YouTube watch page and store the counters scraped from it.

    The page is requested through youtube_http_get. On a 404 the
    sql_video_update_404 statement is executed and nothing else is done; any
    other non-200 status raises Exception. Otherwise the view, like, dislike
    and comment figures are scraped (the last matching element wins, and a
    missing element leaves '') and written with sql_video_update.
    """
    query = urllib.urlencode({'v': video_id})
    url = '/%s?%s' % (youtube_root, query)
    print('** youtube : %s **' % (url))
    status, body = youtube_http_get(url)

    if status == 404:
        db_app.db_execute_g(
            db_sql.sql_video_update_404,
            (str(datetime.now()), 1, app_id, video_href))
        return
    if status != 200:
        raise Exception('youtube http connection status:%s' % (str(status)))

    soup = BeautifulSoup(body)

    view_total = ''
    for span in soup.find_all(name='span', attrs={'class': 'watch-view-count'}):
        strong = span.strong
        if strong is not None and strong.text is not None:
            view_total = strong.text.strip()

    view_likes = ''
    for span in soup.find_all(name='span', attrs={'class': 'likes'}):
        view_likes = span.text.strip()

    view_dislikes = ''
    for span in soup.find_all(name='span', attrs={'class': 'dislikes'}):
        view_dislikes = span.text.strip()

    comments = ''
    for span in soup.find_all(name='span', attrs={'class': 'comments-section-stat'}):
        # The counter is rendered in parentheses; keep only the figure.
        comments = span.text.replace('(', '').replace(')', '').strip()

    print('%s %s %s %s' % (view_total, view_likes, view_dislikes, comments))
    db_app.db_execute_g(
        db_sql.sql_video_update,
        (view_total, view_likes, view_dislikes, comments,
         str(datetime.now()), 1, app_id, video_href))
def review_read_loop(app_id, page_num, review_type, review_sort_order):
    """Request one page of reviews for an app and hand it to review_read.

    Returns a (status, page_num) tuple. page_num is the value passed in,
    except after a 200 response whose body was parsed and recorded without
    an exception, in which case it is int(page_num) + 1.
    """
    url = '/store/getreviews'
    param = urllib.urlencode({
        'id': app_id,
        'reviewSortOrder': review_sort_order,
        'reviewType': review_type,
        'pageNum': page_num
    })
    print('%s %s' % (param, url))
    status, body = android_https_post(url, param)

    if status != 200:
        if status == 404:
            print('==: 404')
            db_app.db_execute_g(db_sql.sql_review_read_status_update, (app_id, ))
        else:
            print('app review https connection error: %s' % (str(status)))
        return status, page_num

    # Drop any leading ")", "]", "}" or "'" characters that precede the JSON.
    payload = body.lstrip(")]}'").strip()
    try:
        review_read(app_id, payload)
        db_app.db_execute_g(db_sql.sql_review_read_update, (page_num, app_id, ))
        page_num = int(page_num) + 1
    except Exception as e:
        # Errors are reported and the unchanged page number is returned.
        err.except_p(e)
    return status, page_num
def s_task_review_read_main(client_id, limit=1):
    """Build review-read jobs from the server task query and mark them taken.

    Returns a dict keyed 1..n (row order) whose values are dicts with the
    keys 'app_id', 'pageNum', 'review_type' and 'review_sort_order', taken
    from columns 0-3 of each row. Each job's app is then updated with
    client_id and the current timestamp.
    """
    rows = db_app.db_get_g(db_sql.sql_review_read_get_server_task, (limit, ))

    jobs = {}
    for seq, row in enumerate(rows, 1):
        jobs[seq] = {
            'app_id': row[0],
            'pageNum': row[1],
            'review_type': row[2],
            'review_sort_order': row[3]
        }
    print(jobs)

    for seq in jobs:
        db_app.db_execute_g(
            db_sql.sql_review_read_update_server_task,
            (client_id, str(datetime.now()), jobs[seq]['app_id'], ))
    return jobs
def app_read_screenshot(app_id, soup):
    """Insert one row per screenshot image found in the overview section.

    Looks inside every div with class 'doc-overview-screenshots' for img
    elements with itemprop 'screenshots' and stores the stripped 'src' of
    each one that has that attribute.
    """
    for gallery in soup.find_all(name='div', attrs={'class': 'doc-overview-screenshots'}):
        for img in gallery.find_all(name='img', attrs={'itemprop': 'screenshots'}):
            if not img.has_key('src'):
                continue
            src = img['src'].strip()
            db_app.db_execute_g(db_sql.sql_app_screenshot_insert, (app_id, src))
def s_cron_all_review_read():
    """Stamp every app returned by the cron query and return the row count."""
    # NOTE(review): client_id is neither a parameter nor a local here (the
    # local assignment "client_id = 'dtc'" was commented out), so it has to
    # resolve to a module-level name -- confirm that one exists.
    rows = db_app.db_get_g(db_sql.sql_review_read_get_cron_all, (client_id, ))
    for row in rows:
        target = row[0]
        db_app.db_execute_g(
            db_sql.sql_review_read_update_cron,
            (str(datetime.now()), target, ))
    return len(rows)
def app_read_video(app_id, soup):
    """Insert one row per embedded video found in the overview section.

    Looks inside every div with class 'doc-overview-videos' for param
    elements named 'movie' and stores the stripped 'value' of each one that
    has that attribute.
    """
    for container in soup.find_all(name='div', attrs={'class': 'doc-overview-videos'}):
        for movie in container.find_all(name='param', attrs={'name': 'movie'}):
            if not movie.has_key('value'):
                continue
            href = movie['value'].strip()
            db_app.db_execute_g(db_sql.sql_app_video_insert, (app_id, href))
def review_read(app_id, body):
    """Parse one reviews response and insert every review that has an id.

    body is JSON text. If it has an 'htmlContent' key, that HTML is parsed
    and each div with class 'doc-review' is turned into one row for
    sql_review_insert. Fields that cannot be found stay ''. A review whose
    id cannot be extracted from its permalink is not inserted.

    Raises whatever json.loads raises on malformed input.
    """
    j = json.loads(body)
    if 'htmlContent' not in j:
        return

    soup = BeautifulSoup(j['htmlContent'].strip())
    for review_f in soup.find_all(name='div', attrs={'class': 'doc-review'}):
        review_author = ''
        review_date = ''
        review_device = ''
        review_version = ''
        review_id = ''
        review_rating = ''
        review_title = ''

        for author_f in review_f.find_all(name='span', attrs={'class': 'doc-review-author'}):
            # An author span without a strong child is skipped instead of
            # raising AttributeError and aborting the whole page.
            if author_f.strong is not None:
                review_author = author_f.strong.text.strip()

        for date_f in review_f.find_all(name='span', attrs={'class': 'doc-review-date'}):
            review_date = date_f.text.replace('-', '').strip()
            trailer = date_f.next_sibling
            # Only a plain text node directly after the date is parsed. The
            # exact type test is kept from the original so that subclasses of
            # NavigableString are still ignored.
            if trailer is None or type(trailer) is not bs4.element.NavigableString:
                continue
            parts = trailer.replace('-', '').strip().split('with version')
            if len(parts) == 2:
                review_device = parts[0].strip()
                review_version = parts[1].strip()
            elif len(parts) == 1:
                review_device = parts[0].strip()

        for id_f in review_f.find_all(name='div', attrs={'class': 'goog-inline-block review-permalink'}):
            if id_f.parent.has_key('href'):
                review_id = _review_id_from_href(id_f.parent['href'])

        for rating_f in review_f.find_all(name='div', attrs={'class': 'ratings goog-inline-block'}):
            if rating_f.has_key('title'):
                review_rating = _review_rating_from_title(rating_f['title'])

        for title_f in review_f.find_all(name='h4', attrs={'class': 'review-title'}):
            review_title = title_f.text.strip()

        # Every text paragraph is appended, each followed by one space.
        review_text = ''.join(
            text_f.text.strip() + ' '
            for text_f in review_f.find_all(name='p', attrs={'class': 'review-text'}))

        if review_id != '':
            db_app.db_execute_g(
                db_sql.sql_review_insert,
                (review_id, app_id, review_author, review_date, review_device,
                 review_version, review_title, review_text, review_rating,
                 str(datetime.now()),))


def _review_id_from_href(href):
    """Return the first 'reviewId' query value of a permalink URL, or ''."""
    query = urlparse.urlparse(href.strip()).query
    values = urlparse.parse_qs(query).get('reviewId')
    if values:
        return values[0]
    return ''


def _review_rating_from_title(title):
    """Return the second space-separated token of a rating title, or ''.

    Previously a title with fewer than two tokens left the split list in
    place, and that list was then passed to the insert statement.
    """
    tokens = title.strip().split(' ')
    if len(tokens) >= 2:
        return tokens[1].strip()
    return ''
def s_sync_review_read_main(client_id, results):
    """Write back the read progress reported for each app.

    results is a mapping whose values carry 'app_id', 'pageNum' and
    'read_status'. One sql_review_read_update_server_sync statement is
    executed per entry. Returns the number of entries processed.
    client_id is accepted but not used here.
    """
    processed = 0
    for key in results:
        processed += 1
        entry = results[key]
        app_id = entry['app_id']
        page_num = entry['pageNum']
        read_status = entry['read_status']
        db_app.db_execute_g(
            db_sql.sql_review_read_update_server_sync,
            (page_num, read_status, str(datetime.now()), app_id, ))
    return processed
def app_read_tab_permission(app_id, soup):
    """Insert one row per permission description found on the permission tab.

    The first div with class 'doc-specs padded-content2' is scanned. Within
    each li of class 'doc-permission-group', a child carrying the class
    'doc-permission-group-title' sets the current group title, and each
    child carrying 'doc-permission-description' is inserted together with
    that title. The title carries over until the next title is seen.

    Raises Exception if the permission tab div is not present.
    """
    tabs = soup.find_all(name='div', attrs={'class': 'doc-specs padded-content2'})
    if len(tabs) <= 0:
        raise Exception('app tab permission len <= 0')

    perm_group_title = ''
    for group in tabs[0].find_all(name='li', attrs={'class': 'doc-permission-group'}):
        # Only direct child tags are visited. Iterating .contents also yields
        # text nodes (for example whitespace between elements), and those do
        # not support has_key, so the attribute lookup raised on them.
        for child in group.find_all(True, recursive=False):
            if not child.has_key('class'):
                continue
            classes = child['class']
            if 'doc-permission-group-title' in classes:
                perm_group_title = child.text.strip()
            if 'doc-permission-description' in classes:
                perm_each_desc = child.text.strip()
                db_app.db_execute_g(
                    db_sql.sql_app_perm_insert,
                    (app_id, perm_group_title, perm_each_desc))
def app_read_banner(app_id, soup):
    """Scrape the banner area of an app page and store it.

    Collects the title, developer link and name, icon URL, rating figure,
    rater count and price, then executes sql_app_banner_update once with
    those values. Anything not found stays ''. Each badge title found is
    inserted separately with sql_app_awards_insert.
    """
    banner_title = ''
    banner_developer_href = ''
    banner_developer_name = ''
    banner_icon_src = ''
    rating_figure = ''
    raters = ''
    price = ''

    # Title and developer are only read when exactly one container exists.
    title_cells = soup.find_all(name='td', attrs={'class': 'doc-banner-title-container'})
    if len(title_cells) == 1:
        title_cell = title_cells[0]
        if title_cell.h1 is not None:
            banner_title = title_cell.h1.text
        developer = title_cell.a
        if developer is not None:
            if developer.has_key('href'):
                banner_developer_href = developer['href'].strip()
            if developer.text is not None:
                banner_developer_name = developer.text.strip()

    for banner_icon in soup.find_all(name='div', attrs={'class': 'doc-banner-icon'}):
        icon_img = banner_icon.img
        if icon_img is not None and icon_img.has_key('src'):
            banner_icon_src = icon_img['src'].strip()

    for badge in soup.find_all(name='div', attrs={'class': 'badges-badge-title goog-inline-block'}):
        db_app.db_execute_g(db_sql.sql_app_awards_insert, (app_id, badge.text.strip()))

    rating_cells = soup.find_all(name='td', attrs={'class': 'doc-details-ratings-price'})
    if len(rating_cells) == 1:
        rating_cell = rating_cells[0]
        for rating_f in rating_cell.find_all(name='div', attrs={'class': 'ratings goog-inline-block'}):
            if rating_f.has_key('title'):
                rating_figure = _banner_rating_from_title(rating_f['title'])
            raters_f = rating_f.next_sibling
            if raters_f is not None:
                # The sibling may be a plain text node without a usable
                # .text attribute; getattr avoids raising in that case.
                raters_text = getattr(raters_f, 'text', None)
                if raters_text is not None:
                    raters = raters_text.replace('(', '').replace(')', '').strip()
        for price_f in rating_cell.find_all(name='span', attrs={'class': 'buy-button-price'}):
            price = price_f.text.upper().replace('BUY', '').strip()

    db_app.db_execute_g(
        db_sql.sql_app_banner_update,
        (banner_title, banner_icon_src, banner_developer_name,
         banner_developer_href, rating_figure, raters, price, app_id))


def _banner_rating_from_title(title):
    """Return the second space-separated token of a rating title, or ''.

    Previously a title with fewer than two tokens left the split list in
    place, and that list was then passed to the update statement.
    """
    tokens = title.strip().split(' ')
    if len(tokens) >= 2:
        return tokens[1].strip()
    return ''
def app_read_metadata(app_id, soup):
    """Scrape the metadata column of an app page and store it.

    For every div with class 'doc-metadata', each Google+ share link found
    is inserted with sql_app_google_plus_insert, and the values following
    the 'Updated:', 'Current Version:', 'Requires Android:', 'Category:',
    'Installs:', 'Size:' and 'Content Rating:' labels are collected.
    sql_app_metadata_update is executed once at the end; values that were
    not found stay ''.
    """
    meta_update = ''
    meta_current = ''
    meta_require = ''
    meta_install = ''
    meta_size = ''
    meta_category = ''
    meta_rating = ''

    for metadata_f in soup.find_all(name='div', attrs={'class': 'doc-metadata'}):
        for share in metadata_f.find_all(name='div', attrs={'class': 'plus-share-container'}):
            if len(share.contents) > 0 and share.contents[0].has_key('href'):
                share_href = share.contents[0]['href'].strip()
                db_app.db_execute_g(db_sql.sql_app_google_plus_insert, (app_id, share_href))

        node = _metadata_value_node(metadata_f, 'Updated:')
        if node is not None and node.time is not None:
            meta_update = node.time.text.strip()

        node = _metadata_value_node(metadata_f, 'Current Version:')
        if node is not None:
            meta_current = node.text.strip()

        node = _metadata_value_node(metadata_f, 'Requires Android:')
        if node is not None:
            meta_require = node.text.strip()

        node = _metadata_value_node(metadata_f, 'Category:')
        if node is not None:
            meta_category = node.text.strip()

        node = _metadata_value_node(metadata_f, 'Installs:')
        if node is not None:
            # The 'LAST 30 DAYS' caption is removed from the install figure.
            meta_install = node.text.upper().replace('LAST 30 DAYS', '').strip()

        node = _metadata_value_node(metadata_f, 'Size:')
        if node is not None:
            meta_size = node.text.strip()

        node = _metadata_value_node(metadata_f, 'Content Rating:')
        if node is not None:
            meta_rating = node.text.strip()

    db_app.db_execute_g(
        db_sql.sql_app_metadata_update,
        (meta_update, meta_current, meta_require, meta_install,
         meta_size, meta_category, meta_rating, app_id))


def _metadata_value_node(metadata_f, label):
    """Return the node right after the first dt whose text is label, or None."""
    matches = metadata_f.find_all(name='dt', text=label)
    if len(matches) > 0:
        return matches[0].next_sibling
    return None
def s_sync_review_main(client_id, results):
    """Insert every review reported in results and return how many were seen.

    results is a mapping whose values carry the keys listed in
    field_order below; they are passed to sql_review_insert_server_sync in
    that order. client_id is only printed.
    """
    print(client_id)
    field_order = ('review_id', 'app_id', 'reviewer', 'date', 'device',
                   'version', 'title', 'comment', 'review_star')
    processed = 0
    for key in results:
        processed += 1
        record = results[key]
        values = tuple(record[field] for field in field_order)
        db_app.db_execute_g(db_sql.sql_review_insert_server_sync, values)
    return processed
def s_task_category_read_main():
    """Build category-read jobs from the server query and mark them taken.

    Returns a dict keyed 1..n (row order) whose values are dicts with
    'cate_path' (row column 0) and 'cate_param' (row column 1). Each job's
    category path is then updated with the fixed client id 'dtc'.
    """
    client_id = 'dtc'
    limit = 10
    # NOTE(review): rows are read through db_zoom but the update below goes
    # through db_app -- confirm that this split is intended.
    rows = db_zoom.db_get_g(db_sql.sql_zoom_cate_read_get_server, (limit, ))
    total = len(rows)

    jobs = {}
    for i, row in enumerate(rows, 1):
        print('%d of %d' % (i, total))
        cate_path = row[0]
        cate_param = row[1]
        print('%s %s' % (cate_path, cate_param))
        # 'cate_param' previously received cate_path by mistake.
        jobs[i] = {'cate_path': cate_path, 'cate_param': cate_param}

    for i in jobs:
        db_app.db_execute_g(
            db_sql.sql_zoom_cate_read_update_server_task,
            (client_id, jobs[i]['cate_path'], ))
    return jobs
def google_plus_read(app_id, google_plus_href):
    """Fetch the +1 button page for a link and store the count shown on it.

    On a 404 the figure '-1' is stored. Any other non-200 status raises
    Exception. Otherwise the stripped text of each div with id
    'aggregateCount' is printed and stored; when no such div is present
    nothing is written.
    """
    query = urllib.urlencode({'url': google_plus_href})
    url = '/u/0/_/+1/fastbutton?%s' % (query)
    status, body = plus_https_get(url)

    if status == 404:
        print('==: 404')
        db_app.db_execute_g(
            db_sql.sql_app_google_plus_update,
            ('-1', str(datetime.now()), app_id, google_plus_href, ))
        return
    if status != 200:
        raise Exception('app google plus https connection error: %s' % (str(status)))

    soup = BeautifulSoup(body)
    for counter in soup.find_all(name='div', attrs={'id': 'aggregateCount'}):
        google_plus_figure = counter.text.strip()
        print(google_plus_figure)
        db_app.db_execute_g(
            db_sql.sql_app_google_plus_update,
            (google_plus_figure, str(datetime.now()), app_id, google_plus_href, ))
def app_read(app_id):
    """Fetch an app's detail page, scrape it and mark the app as read.

    A 404 marks the app as read without scraping. A 200 runs the banner,
    overview, review and permission scrapers and then marks the app as
    read. Any other status raises inside the try block. Every exception
    raised here is passed to err.except_p and not propagated.
    """
    try:
        url = '/%s/details?id=%s' % (android_root, app_id)
        print('** app %s **' % (url))
        status, body = android_https_get(url)

        if status not in (200, 404):
            raise Exception('app read https connection error: %s' % (str(status)))

        if status == 404:
            print('== 404')
        else:
            soup = BeautifulSoup(body)
            app_read_banner(app_id, soup)
            app_read_tab_overview(app_id, soup)
            app_read_tab_review(app_id, soup)
            app_read_tab_permission(app_id, soup)

        # Both the 404 and the scraped case record the same read marker.
        db_app.db_execute_g(db_sql.sql_app_read_update, (1, str(datetime.now()), app_id))
    except Exception as e:
        err.except_p(e)
def categories_read_main():
    """Fetch the zoom home page and insert every category link listed on it.

    Inside each div with id 'categories-list', every direct child div is
    treated as one category group: its h3 text is the group name and every
    direct child of its ul that contains an anchor with an href yields one
    sql_zoom_cate_insert row.

    Raises Exception on a non-200 status or when the page body contains the
    access-denied message.
    """
    url = '/'
    print('** categories main %s **' % (url))
    status, body = zoom_http_get(url)
    if status != 200:
        raise Exception('zoom app home http connection errir:%s' % (str(status)))

    soup = BeautifulSoup(body)
    denied = 'Access not allowed. If you think this is an error, please contact us at [email protected]'
    # Substring test instead of find(...) > 0: the old comparison missed the
    # message when it sat at index 0 of the stripped body text.
    if denied in soup.body.text.strip():
        raise Exception(denied)

    for listing in soup.body.find_all(name='div', attrs={'id': 'categories-list'}):
        # recursive=False limits the search to direct children and returns
        # tags only, so text nodes between elements are skipped.
        for group in listing.find_all(name='div', recursive=False):
            cate_group_name = group.h3.text.strip()
            for li in group.ul.find_all(True, recursive=False):
                if li.a is not None and li.a.has_key('href'):
                    cate_name = li.a.text.strip()
                    cate_path = li.a['href'].strip()
                    print('%s %s %s' % (cate_group_name, cate_name, cate_path))
                    db_app.db_execute_g(
                        db_sql.sql_zoom_cate_insert,
                        (cate_group_name, cate_name, cate_path, str(datetime.now())))
def app_read_tab_review(app_id, soup):
    """Scrape the per-star rating histogram from the review tab and store it.

    The first div with class 'doc-reviews padded-content2' is scanned. For
    each histogram label, the 'data-rating' attribute selects the star
    bucket ('0' to '5') and the text of the element following the label's
    parent is its figure. Buckets not seen stay ''. The function returns
    without writing anything if a heading container has no 'user-ratings'
    div.

    Raises Exception if the review tab div is not present.
    """
    tabs = soup.find_all(name='div', attrs={'class': 'doc-reviews padded-content2'})
    if len(tabs) <= 0:
        raise Exception('app tab review len <= 0')

    figures = {'0': '', '1': '', '2': '', '3': '', '4': '', '5': ''}
    for heading in tabs[0].find_all(name='div', attrs={'class': 'reviews-heading-container'}):
        user_ratings = heading.find_all(name='div', attrs={'class': 'user-ratings'})
        if len(user_ratings) <= 0:
            return
        for label in user_ratings[0].find_all(name='span', attrs={'class': 'histogram-label'}):
            # Both default to the literal string 'None' when not found.
            star = figure = 'None'
            if label.has_key('data-rating'):
                star = label['data-rating'].strip()
            holder = label.parent
            if holder is not None and holder.next_sibling is not None:
                figure = holder.next_sibling.text.strip()
            if star in figures:
                figures[star] = figure

    db_app.db_execute_g(
        db_sql.sql_app_rating_update,
        (figures['0'], figures['1'], figures['2'], figures['3'],
         figures['4'], figures['5'], app_id))
def app_read_overview(app_id, soup):
    """Scrape the description and developer links from the overview section.

    For every div with class 'doc-overview', reads the text of the
    'doc-original-text' div and the href of the "Visit Developer's Website",
    'Email Developer' and 'Privacy Policy' anchors (the last match wins).
    sql_app_overview_update is executed once at the end; values that were
    not found stay ''.
    """
    desc = ''
    developer_website = ''
    developer_email = ''
    developer_privacy = ''

    for overview_f in soup.find_all(name='div', attrs={'class': 'doc-overview'}):
        for desc_f in overview_f.find_all(name='div', attrs={'id': 'doc-original-text'}):
            desc = desc_f.text.strip()

        href = _overview_link_href(overview_f, "Visit Developer's Website")
        if href is not None:
            developer_website = href.strip()

        href = _overview_link_href(overview_f, 'Email Developer')
        if href is not None:
            # Keep only the address part of the mailto link.
            developer_email = href.replace('mailto:', '').strip()

        href = _overview_link_href(overview_f, 'Privacy Policy')
        if href is not None:
            developer_privacy = href.strip()

    db_app.db_execute_g(
        db_sql.sql_app_overview_update,
        (desc, developer_website, developer_email, developer_privacy, app_id))


def _overview_link_href(overview_f, label):
    """Return the href of the last anchor with text label that has one, or None."""
    href = None
    for anchor in overview_f.find_all(name='a', text=label):
        if anchor.has_key('href'):
            href = anchor['href']
    return href
def review_read_main_init():
    """Execute sql_review_read_insert once for each app id returned by
    sql_review_read_app_get (column 0 of every row)."""
    for row in db_app.db_get_g(db_sql.sql_review_read_app_get, ()):
        db_app.db_execute_g(db_sql.sql_review_read_insert, (row[0],))