def category_read(cate_path, cate_name, cate_type, cate_start):
    """Fetch one page of a category collection and store each app with its rank.

    Requests ``<cate_path>/collection/<cate_type>?start=<cate_start>&num=24``
    through ``android_https_get``, parses the returned HTML and, for every
    ``div`` whose class is ``snippet snippet-medium``, passes
    ``(app_id, rank)`` to ``db_sql.sql_app_insert_with_rank``.

    Args:
        cate_path: URL path of the category; used as the URL prefix.
        cate_name: Not used by this function; kept so existing callers
            keep working.
        cate_type: Collection name placed in the URL.
        cate_start: Value of the ``start`` query parameter.

    Returns:
        None. A 404 response is reported on stdout and ends the call early.

    Raises:
        Exception: If the status is neither 200 nor 404, or if a snippet
            does not contain exactly one ``ordinal-value`` div, exactly one
            ``a.title`` link with an ``href``, or an ``id`` query parameter
            in that ``href``.
    """
    url = '%s/collection/%s?start=%s&num=24' % (cate_path, cate_type, cate_start)
    print('** category %s **' % url)

    status, body = android_https_get(url)
    if status == 404:
        # Not every category has more than 480 items, so a 404 here is
        # not treated as an error.
        print('==: %s ' % str(status))
        return
    if status != 200:
        raise Exception('app category https connection error: %s' % str(status))

    soup = BeautifulSoup(body)
    for div in soup.find_all(name='div', attrs={'class': 'snippet snippet-medium'}):
        rank_divs = div.find_all(name='div', attrs={'class': 'ordinal-value'})
        if len(rank_divs) != 1:
            raise Exception('category div ordinal-value len != 1')
        rank = rank_divs[0].text.strip()

        title_links = div.find_all(name='a', attrs={'class': 'title'})
        if len(title_links) != 1:
            raise Exception('category div a href len != 1')
        link = title_links[0]
        # has_attr replaces the deprecated Tag.has_key.
        if not link.has_attr('href'):
            raise Exception('category div a href is empty')

        # The app id is the first value of the 'id' query parameter.
        query = urlparse.urlparse(link['href']).query
        id_values = urlparse.parse_qs(query).get('id')
        if not id_values:
            raise Exception('category div a href urlparse wrong')
        app_id = id_values[0].strip()

        db_play.db_execute_g(db_sql.sql_app_insert_with_rank, (app_id, rank))
def category_read_main():
    """Run category_read for every row returned by db_sql.sql_cate_read_get.

    The first four columns of each row are read as category name, category
    path, start parameter and collection type. The row is handed to
    ``category_read`` and, when that call does not raise, the same four
    values are passed to ``db_sql.sql_cate_read_update``. Any exception
    from either step is forwarded to ``err.except_p``.

    Returns:
        True when the query produced no rows, False otherwise.
    """
    no_rows_seen = True
    for row in db_play.db_get_g(db_sql.sql_cate_read_get, ()):
        no_rows_seen = False
        cate_name, cate_path, cate_param, cate_type = row[0], row[1], row[2], row[3]
        try:
            category_read(cate_path, cate_name, cate_type, cate_param)
            db_play.db_execute_g(
                db_sql.sql_cate_read_update,
                (cate_name, cate_path, cate_param, cate_type),
            )
        except Exception as e:
            err.except_p(e)
        # NOTE(review): the original layout was flattened onto one line; the
        # sleep is assumed to run once per row, after the try/except --
        # confirm against the upstream file.
        util.sleep()
    return no_rows_seen
def categories_read_main():
    """Read the categories page and queue every category for page reads.

    Fetches ``/<android_root>/<android_categories>`` through
    ``android_https_get`` and walks each ``div`` whose class is
    ``padded-content3 app-home-nav``. The first child of such a div supplies
    the group name and the second child holds the ``li.category-item``
    entries. For every entry this function:

    * passes ``(group name, category name, category path, current time)``
      to ``db_sql.sql_cate_insert``;
    * passes ``(category name, category path, offset, collection type)`` to
      ``db_sql.sql_cate_read_insert`` for each offset 0, 24, ..., 480 and
      for both ``topselling_free`` and ``topselling_paid``.

    Returns:
        None.

    Raises:
        Exception: If the status is not 200, if a nav div does not have
            exactly two children, if it contains no category items, or if
            an item has no ``a`` tag or that tag has no ``href``.
    """
    # The step of 24 matches the num=24 page size category_read puts in its
    # URL; offsets stop before 504, i.e. the last one queued is 480.
    page_size = 24
    offset_limit = 504
    collection_types = ('topselling_free', 'topselling_paid')

    url = '/%s/%s' % (android_root, android_categories)
    print('** categories main %s **' % url)

    status, body = android_https_get(url)
    if status != 200:
        raise Exception('app home https connection error: %s' % str(status))

    soup = BeautifulSoup(body)
    nav_divs = soup.body.find_all(
        name='div', attrs={'class': 'padded-content3 app-home-nav'})
    for div in nav_divs:
        if len(div.contents) != 2:
            raise Exception('app home nav length != 2')
        heading, item_list = div.contents
        cate_group_name = heading.text.strip()

        lis = item_list.find_all(name='li', attrs={'class': 'category-item'})
        if not lis:
            raise Exception('app home nav li length <= 0')

        for li in lis:
            a = li.a
            if a is None:
                raise Exception('app home nav li a == None')
            # has_attr replaces the deprecated Tag.has_key.
            if not a.has_attr('href'):
                raise Exception('app home nav li a href has not href')

            cate_path = urlparse.urlparse(a['href']).path.strip()
            cate_name = a.text.strip()
            db_play.db_execute_g(
                db_sql.sql_cate_insert,
                (cate_group_name, cate_name, cate_path, str(datetime.now())))

            # Same insertion order as before: free, then paid, per offset.
            for cate_i in range(0, offset_limit, page_size):
                for cate_type in collection_types:
                    db_play.db_execute_g(
                        db_sql.sql_cate_read_insert,
                        (cate_name, cate_path, cate_i, cate_type))