def home_read_max(lang_href, menu_href, menu_name): url = menu_href print '** home cate max %s %s **'%(menu_name, url) status, body = lib_http_get(url) if status != 200: raise Exception('lib home_read_sub http error %s'%(str(status))) soup = BeautifulSoup(body) div_fa = soup.find_all(name='div', attrs={'class':'itmCatLstContent'}) if len(div_fa) > 0: div_f = div_fa[0] if div_f.a != None and div_f.a.has_key('href'): cate_href = div_f.a['href'].strip() cate_name = div_f.a.text.strip() p_num_read(lang_href, menu_href, cate_href, cate_name) util.sleep_i(1)
def cate_link_read_main():
    # Walk every pending category-link row and read it, reusing a single
    # HTTP connection per host (reconnect only when the host changes).
    global lib_host_http
    global lib_conn_http
    rows = db_lib.db_get_g(db_sql.sql_lib_lang_cate_link_read_get, ())
    for row in rows:
        lang_href = row[0]
        cate_path = row[1]
        cate_param = row[2]
        link_href = row[3]
        host = lang_href.replace('http://', '').replace('/', '').strip()
        if lib_host_http != host:
            lib_host_http = host
            lib_conn_http = http.get_conn_http(lib_host_http)
        try:
            cate_link_read(link_href, lang_href, cate_path, cate_param)
            util.sleep_i(1)
        except Exception as e:
            # Log and keep going; one bad link must not abort the batch.
            err.except_p(e)
def home_read_main():
    # Read the home page of every language row, then mark the row processed.
    global lib_host_http
    global lib_conn_http
    rows = db_lib.db_get_g(db_sql.sql_lib_lang_get, ())
    for row in rows:
        lang_href = row[0]
        lang_title = row[1]
        # Open a fresh connection for each language host.
        lib_host_http = lang_href.replace('http://', '').replace('/', '').strip()
        lib_conn_http = http.get_conn_http(lib_host_http)
        # NOTE: when working on the horizon PC, comment out the two lines
        # above and use this reconnect-only-on-host-change variant instead:
        # host = lang_href.replace('http://', '').replace('/', '').strip()
        # if lib_host_http != host:
        #     lib_host_http = host
        #     lib_conn_http = http.get_conn_http(lib_host_http)
        try:
            home_read(lang_href)
            db_lib.db_execute_g(db_sql.sql_lib_lang_update, (lang_href,))
            util.sleep_i(1)
        except Exception as e:
            # Log and continue with the next language.
            err.except_p(e)
def cate_read_main(): global lib_host_http global lib_conn_http rows = db_lib.db_get_g(db_sql.sql_lib_lang_cate_read_get, ()) i_t = len(rows) i = 0 for row in rows: i = i + 1 print '%d of %d'%(i, i_t), lang_href = row[0] cate_path = row[1] cate_param = row[2] if lib_host_http != lang_href.replace('http://', '').replace('/', '').strip(): lib_host_http = lang_href.replace('http://', '').replace('/', '').strip() lib_conn_http = http.get_conn_http(lib_host_http) try: cate_read(lang_href, cate_path, cate_param) util.sleep_i(1) except Exception as e: err.except_p(e)
def category_read(cate_name, cate_path, cate_param): #cate_param = 16270 status = 200 while status == 200: url = '%s/?p=%s'%(cate_path, cate_param) #url = '%s/?p=%s&nav=halloween'%(cate_path, cate_param) # for special cases print '** zoom category %s %s **'%(cate_param, cate_path) status, body = zoom_http_get(url) if status == 404: print '==: %s '%(str(status)) db_zoom.db_execute_g(db_sql.sql_zoom_cate_read_param_update, (cate_param, cate_path, )) db_zoom.db_execute_g(db_sql.sql_zoom_cate_read_update, (cate_path, )) break if status != 200: raise Exception('zoom app category https connection error: %s'%(str(status))) soup = BeautifulSoup(body) if soup.body.text.strip().find('Access not allowed. If you think this is an error, please contact us at [email protected]') > 0: raise Exception('Access not allowed. If you think this is an error, please contact us at [email protected]') ul_fa = soup.find_all(name='ul', attrs={'id':'apps-list'}) for li_f in ul_fa: a_fa = li_f.find_all(name='a', attrs={'class':'goTo'}) for a_f in a_fa: if a_f.has_key('href'): a_href = a_f['href'].strip() a_title = a_f.text.strip() db_zoom.db_execute_g(db_sql.sql_zoom_app_insert, (a_title, a_href, )) db_zoom.db_execute_g(db_sql.sql_zoom_cate_read_param_update, (cate_param, cate_path, )) # 16290 finish = True next_fa = soup.find_all(name='li', attrs={'class':'next'}) for next_f in next_fa: finish = False if finish == True: print '== no next' db_zoom.db_execute_g(db_sql.sql_zoom_cate_read_update, (cate_path, )) break cate_param = str(int(cate_param)+10) # update cate_param util.sleep_i(10) #break print cate_path
def app_read_main(): finish = True rows = db_zoom.db_get_g(db_sql.sql_zoom_app_get, ()) i_t = len(rows) i = 0 for row in rows: i = i + 1 print '%d of %d'%(i, i_t), finish = False app_name = row[0] app_path = row[1] app_id = row[2] app_read_status = row[3] url = app_path.replace('.html', '_download.html').strip() #url = app_path.replace('.html', '_download.html?nav=halloween').strip() print '** zoom app %s **'%(url) status, body = zoom_http_get(url) if status == 404: print '== 404' db_zoom.db_execute_g(db_sql.sql_zoom_app_update, ('', app_path, )) continue if status != 200: print 'exception' continue soup = BeautifulSoup(body) if soup.body.text.strip().find('Access not allowed. If you think this is an error, please contact us at [email protected]') > 0: raise Exception('Access not allowed. If you think this is an error, please contact us at [email protected]') ### from here app_id = None divs_fa = soup.find_all(name='span', attrs={'class':'package'}) for divs_f in divs_fa: app_id = divs_f.text.replace('Package ', '').strip() print app_id db_zoom.db_execute_g(db_sql.sql_app_insert, (app_id, )) db_zoom.db_execute_g(db_sql.sql_zoom_app_update, (app_id, app_path)) util.sleep_i(10) return finish
# Module-level side effects: initialize the combined app db and merge the
# three per-source databases (play, lib, zoom) into it.
db_app.db_init()
print '** start db_merge **'
db_app.db_merge(db_play.db_path, db_app.db_path)
print
db_app.db_merge(db_lib.db_path, db_app.db_path)
print
db_app.db_merge(db_zoom.db_path, db_app.db_path)
print
print '** end db_merge **'

def app_read():
    # Run the per-app detail passes; the review pass is currently disabled.
    app_read_overview.main()
    app_read_youtube.main()
    app_read_google_plus.main()
    #app_read_review.main()

def cate_read():
    # Run the category crawls for the three sources.
    cate_read_google_play.main()
    cate_read_android_zoom.main()
    cate_read_androlib.main()

if __name__ == '__main__':
    #main()
    # Run main() twice with a pause between rounds.
    # NOTE(review): main() is defined elsewhere in this file — not visible here.
    for i in range(1, 3):
        main()
        util.sleep_i(10)
        print '==== main %d ====='%(i)