def download_price(theater_id, movie_id):
    """Download the showtime/price timetable for one movie at one theater
    from bj.nuomi.com, cache the raw HTML, and write a CSV summary.

    Each CSV row: day_index,theater_id,movie_id,time,screen,price,order_url

    Fix vs. original: removed the dead local ``rows = []`` accumulator,
    which was created per-day but never appended to or read.
    """
    # Live endpoint example (cache-buster query args trimmed):
    # http://bj.nuomi.com/pcindex/main/timetable?cinemaid=1c2e250a3e9691059ee32187&mid=9762&needMovieInfo=0&tploption=5&_=1448004690864#j-movie-list1
    url = "http://bj.nuomi.com/pcindex/main/timetable?cinemaid=%s&mid=%s&needMovieInfo=0" % (theater_id, movie_id)
    lfile = '%snuomi/price_%s_%s.html' % (download_dir, theater_id, movie_id)
    respHtml = browser.downad_and_save(url, lfile)
    soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
    li = []
    # One <div class="list"> per day of the schedule; `day` is its 0-based index.
    dom_divs = soup.findAll('div', attrs={'class': 'list'})
    for day, div in enumerate(dom_divs):
        for tr in div.findAll('tr'):
            tds = tr.findAll('td')
            if not tds:
                continue  # header rows contain <th>, not <td>
            # Price cell: a <span> whose text contains an HTML entity (e.g.
            # "&yen;35"); keep only the part after the last ';'.
            p = tds[3].find('span')
            pp = p.contents[0].split(';')[-1]
            order_url = completeInnerUrl("http://bj.nuomi.com/", tds[4].find('a')['href'])
            li.append(','.join([str(day), theater_id, movie_id,
                                tds[0].contents[0].strip(), tds[1].contents[0],
                                pp.strip(), order_url]))
    csv_file = '%snuomi/price_%s_%s.csv' % (download_dir, theater_id, movie_id)
    save_file(csv_file, li)
def download_list(): proxies = util.load_proxies() for i in range(1,802): #802 url = 'http://www.cnhm.net/plant/index/page/%d' % (i) lfile = '%scnhm/list_%d.html' % (config.local_dir,i) proxy = random.choice(proxies).strip() print i,url,proxy respHtml = browser.downad_and_save(url,lfile) soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset) table = soup.find('div',attrs = {'class': 'main'}) if not table: os.system('rm -f %s'%(lfile)) print 'err' continue xitems = [] tr_list = table.findAll('li') for tr in tr_list: plant_name = tr.find('div',attrs = {'class': 'PlantName'}) namea = plant_name.find('a') uid = util.completeInnerUrl(url,namea['href']) cn_name = namea['title'] lating_name = tr.find('div',attrs = {'class': 'LatinName'})['title'] section = tr.find('div',attrs = {'class': 'section'}).contents[0] genera = tr.find('div',attrs = {'class': 'genera'}).contents[0] xitems.append(','.join([uid,cn_name,lating_name,section,genera ])) with open('%scnhm/xlist_%s.csv'%(config.local_dir,i),'w') as f: f.write('\r\n'.join(xitems)+'\r\n') f.close()
def download_movie_playing():
    """Fetch the now-playing movie list from bj.nuomi.com, cache the HTML,
    and write a CSV of ``Movie,uid,uri,name`` rows.
    """
    url = "http://bj.nuomi.com/movie/"
    lfile = '%smovies_playing.html' % (root_dir)
    respHtml = browser.downad_and_save(url, lfile)
    soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
    # All movie links live inside this single container div.
    container = soup.find('div', attrs={'class': 'section-item clearfix no-top-border'})
    records = []
    for anchor in container.findAll('a'):
        href = anchor['href']
        uid = href.split('/')[-1]  # last path segment is the movie id
        uri = completeInnerUrl("http://bj.nuomi.com/", href)
        name = anchor.contents[0]
        records.append("Movie,%s,%s,%s" % (uid, uri, name))
    csv_file = '%snuomi/movie_result.csv' % (download_dir)
    save_file(csv_file, records)
def download_mtheater_list(page_index=1, proxy=None):
    """Fetch one page of Beijing movie-theater listings from t.dianping.com,
    cache the HTML, and write a CSV of ``MovieTheater,city,uid,uri,name`` rows.

    page_index -- 1-based page number of the listing (default 1)
    proxy      -- optional proxy forwarded to the download helper
    """
    url = "http://t.dianping.com/movie/beijing/tab_cinema?pageno=%d" % (page_index)
    lfile = '%smtheater_list_%d.html' % (root_dir, page_index)
    respHtml = browser.downad_and_save(url, lfile, proxy)
    soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
    # Markup: <div class="index-cinema-list"> containing <li class="item Fix"> entries.
    listing = soup.find('div', attrs={'class': 'index-cinema-list'})
    rows = []
    for item in listing.findAll('li', attrs={'class': 'item Fix'}):
        link = item.findAll('a')[1]  # the second anchor is the theater detail link
        uid = link['href'].split('/')[-1]
        uri = completeInnerUrl("http://t.dianping.com/", link['href'])
        name = link.contents[0]
        rows.append("MovieTheater,北京,%s,%s,%s" % (uid, uri, name))
    csv_file = '%smtheater_result_%d.csv' % (root_dir, page_index)
    save_file(csv_file, rows)