def process_fecth(process_name, process_codes):
    """Fetch ``process_codes`` with one worker thread per fixed-size chunk.

    The codes are split into consecutive chunks of ``THREAD_NUM`` items.
    Each chunk is handed to ``thread_fetch`` in its own thread, together
    with the thread name and a logger obtained from ``LOG(thread_name)``.
    The function returns once every worker thread has finished.

    Args:
        process_name: Prefix used to build the worker thread names
            (``"<process_name>_thread_<index>"``).
        process_codes: Sliceable sequence of codes to distribute; an empty
            sequence starts no threads.

    Returns:
        None.
    """
    THREAD_NUM = 30  # number of codes handed to each thread (chunk size)
    total = len(process_codes)

    threads = []
    # Stepping the range by the chunk size replaces the original float-prone
    # "total / THREAD_NUM" arithmetic (a TypeError in range() on Python 3),
    # and slicing clamps the final, possibly shorter, chunk on its own.
    for index, start in enumerate(range(0, total, THREAD_NUM)):
        thread_name = "%s_thread_%d" % (process_name, index)
        thread_codes = process_codes[start:start + THREAD_NUM]
        mylogger = LOG(thread_name).get_logger()
        threads.append(
            threading.Thread(
                name=thread_name,
                target=thread_fetch,
                args=(thread_codes, thread_name, mylogger),
            )
        )

    # Start every worker before waiting on any of them so the chunks are
    # actually fetched concurrently, then wait for all of them to finish.
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()
my_codes_start = i * 100 my_codes_end = i * 100 + 100 start = 0 end = 0 if my_codes_start <= total: start = my_codes_start if my_codes_end <= total: end = my_codes_end else: end = total else: break thread_name = "thread_%s->%s" % (start, end) logger = LOG(thread_name) mylogger = logger.get_logger() th = threading.Thread(name=thread_name, target=fetch, args=(start, end, thread_name, mylogger)) threads.append(th) for t in threads: t.start() while True: th_num = threading.activeCount() - 1 #print "Current Has %d threads active crawlering datas waiting ..." %th_num #current_th_names = [item.name for item in threading.enumerate() if item.name !='MainThread'] if th_num == 0: break time.sleep(5) #print "Finshed!"
class MAOYAN:
    """Crawler for movie and cinema-showing data on maoyan.com.

    Pages are downloaded with ``requests``, parsed with ``BeautifulSoup`` and
    the extracted records are written through ``self.orm``.
    """

    # Names of the values returned by ``request_movie_info``, in order.
    _DETAIL_FIELDS = (
        'movie_english_name',
        'movie_cat',
        'movie_country',
        'movie_dur',
        'movie_rt',
        'movie_ticket',
        'movie_summary',
        'movie_dir',
        'movie_star',
    )

    # Glyph names in the local reference font ``basefont.woff`` paired with
    # the digit each glyph draws.
    _BASE_GLYPH_DIGITS = (
        ('uniF66E', '4'),
        ('uniE944', '1'),
        ('uniE4BE', '3'),
        ('uniEF0F', '0'),
        ('uniEF8D', '5'),
        ('uniE963', '6'),
        ('uniE142', '7'),
        ('uniE023', '9'),
        ('uniE995', '2'),
        ('uniF3A0', '8'),
    )

    def __init__(self):
        """Set up the request headers, the ORM and the logger, and log a start banner."""
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }
        self.orm = ORM()
        self.log = LOG()
        self.logger = self.log.get_logger()
        self.logger.info('*' * 25)
        self.logger.info("Star:" + datetime.datetime.now().strftime("%Y-%m-%d"))
        self.logger.info('*' * 25)

    @staticmethod
    def _parse_number(text):
        """Return the first decimal number found in ``text`` as a float.

        Raises:
            ValueError: if ``text`` contains no digits.
        """
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            raise ValueError("no number in %r" % (text,))
        return float(match.group())

    def request_movies_info(self):
        """Crawl the film list page and store every movie not yet known to the ORM.

        For each new movie the detail page is fetched, the detail fields are
        merged into the list-page record, and the record is logged and
        inserted with ``self.orm.insert_movie_info``. Movies whose detail page
        could not be parsed are skipped.
        """
        url = "http://maoyan.com/films"
        html = requests.get(url=url, headers=self.headers).text
        # Guard against a falsy result so a list-page failure ends the crawl
        # quietly instead of raising on iteration.
        movies = self.fetch_movie_name_and_url(html) or []
        for movie in tqdm(movies):
            details = self.request_movie_info(movie['movie_url'])
            if details is None:
                # request_movie_info already logged the failure; unpacking
                # None here used to abort the whole crawl with a TypeError.
                continue
            movie.update(zip(self._DETAIL_FIELDS, details))
            self.logger.info(movie)
            self.orm.insert_movie_info(movie)

    def fetch_movie_name_and_url(self, html):
        """Extract name, URL, poster and score for movies not already stored.

        Args:
            html: Markup of the film list page.

        Returns:
            A list of dicts with the keys ``movie_name``, ``movie_url``,
            ``movie_image`` and ``movie_sc``. An empty list is returned (and
            an error logged) if anything goes wrong while parsing.
        """
        try:
            movie_names = set(self.orm.query_movie_name())
            bs = BeautifulSoup(html, 'html.parser')
            movies = []
            for dd in bs.find_all('dd'):
                movie_name = dd.find('div', 'channel-detail movie-item-title')['title']
                if movie_name in movie_names:
                    continue
                link = dd.find('div', 'movie-item').a
                movie_sc = dd.find('div', 'channel-detail channel-detail-orange').text
                movies.append({
                    'movie_name': movie_name,
                    'movie_url': 'http://maoyan.com' + link['href'],
                    'movie_image': link.find('div', 'movie-poster').find_all('img')[1]['data-src'],
                    # The page shows this placeholder text when there is no score yet.
                    'movie_sc': 0.0 if '暂无评分' in movie_sc else float(movie_sc),
                })
            return movies
        except Exception:
            self.logger.error("猫眼热门电影错误")
            return []

    def request_movie_info(self, url):
        """Fetch one movie detail page and extract its fields.

        Args:
            url: Address of the movie detail page.

        Returns:
            A 9-tuple ``(movie_english_name, movie_cat, movie_country,
            movie_dur, movie_rt, movie_ticket, movie_summary, movie_dir,
            movie_star)``, or ``None`` (after logging an error) when the page
            cannot be fetched or parsed.
        """
        try:
            # Turn "&#x....;" character references into "0x....;" tokens so
            # get_movie_ticket can substitute the obfuscated digits.
            html = requests.get(url=url, headers=self.headers).text.replace('&#', '0')
            bs = BeautifulSoup(html, 'html.parser')

            brief = bs.find('div', 'movie-brief-container')
            movie_english_name = brief.find('div', 'ename ellipsis').text.strip()
            items = brief.ul.find_all('li')
            movie_cat = items[0].text.strip()
            country_and_duration = items[1].text
            if ' / ' in country_and_duration:
                parts = country_and_duration.split(' / ')
                movie_country = parts[0].strip()
                movie_dur = self._parse_number(parts[-1].strip())
            else:
                movie_country = ""
                movie_dur = 0.0
            movie_rt = items[-1].text.strip()

            stats = bs.find('div', 'movie-stats-container').find('div', 'movie-index-content box')
            span_texts = [span.text for span in stats.find_all('span')]
            if '暂无' in span_texts:
                # A span containing exactly this placeholder means no figure is shown.
                movie_ticket = 0.0
            else:
                movie_ticket = self.unite_ticket(self.get_movie_ticket(html))

            movie_summary = bs.find('div', 'mod-content').span.text.strip()

            persons = [
                [info.a.text.strip() for info in group.find_all('div', 'info')]
                for group in bs.find_all('div', 'module')[1].find_all('ul', 'celebrity-list clearfix')
            ]
            movie_dir = ','.join(persons[0])
            movie_star = ','.join(persons[1])

            return (movie_english_name, movie_cat, movie_country, movie_dur, movie_rt,
                    movie_ticket, movie_summary, movie_dir, movie_star)
        except Exception:
            self.logger.error("电影主业信息爬取错误")
            return None

    def get_movie_ticket(self, html, flag=False):
        """Replace font-obfuscated digits in ``html`` with real digits.

        The woff font referenced by the page is downloaded to ``maoyan.woff``
        and each of its glyphs is compared with the glyphs of the local
        reference font ``basefont.woff`` to learn which digit it draws. Every
        ``0x....;`` token in ``html`` is then replaced by that digit.

        Args:
            html: Page markup in which ``&#`` has already been replaced by ``0``.
            flag: When true, return the decoded markup itself; otherwise
                return the box-office string extracted by ``find_ticket``.

        Returns:
            The decoded HTML (``flag`` true) or the box-office text.
        """
        font_urls = re.findall(r"url\('(.*?)'\) format\('woff'\);", html)
        url = 'http:%s' % font_urls[0]
        resp = requests.get(url)
        with open('maoyan.woff', 'wb') as fontfile:
            fontfile.write(resp.content)

        # Local reference font; it has to be prepared by hand beforehand.
        base_glyf = TTFont('basefont.woff')['glyf']
        base_glyphs = [(base_glyf[name], digit) for name, digit in self._BASE_GLYPH_DIGITS]

        online_font = TTFont('maoyan.woff')  # font file just downloaded
        # The first and last glyph names are skipped; the ones in between
        # are treated as the digit glyphs.
        online_names = online_font.getGlyphNames()[1:-1]
        online_glyf = online_font['glyf']

        digit_by_token = {}
        for glyph_name in online_names[:10]:
            online_glyph = online_glyf[glyph_name]
            for base_glyph, digit in base_glyphs:
                if online_glyph == base_glyph:
                    digit_by_token[glyph_name.replace('uni', '0x').lower()] = digit

        for token, digit in digit_by_token.items():
            html = html.replace(token + ';', digit)

        if flag:
            return html
        return self.find_ticket(html)

    def find_ticket(self, html):
        """Return the box-office figure together with its unit text.

        Args:
            html: Decoded page markup.

        Returns:
            Number text followed by unit text taken from the first
            ``movie-index-content box`` div, or ``''`` (after logging an
            error) when they cannot be found.
        """
        soup = BeautifulSoup(html, 'html.parser')
        boxes = soup.find_all('div', 'movie-index-content box')
        try:
            return boxes[0].span.text + boxes[0].find('span', 'unit').text
        except Exception:
            self.logger.error("电影票房爬取错误")
            return ''

    def unite_ticket(self, ticket):
        """Convert a box-office string to a number in units of 万 (ten thousand).

        Amounts marked as US dollars (美元) are additionally multiplied by a
        hard-coded factor of 6.7.

        Args:
            ticket: Text such as ``'1.5亿'`` or ``'5000万'``.

        Returns:
            The converted amount as a float, or ``None`` (after logging an
            error) when ``ticket`` holds no number.
        """
        try:
            num = self._parse_number(ticket)
            if '亿美元' in ticket:
                return num * 10000 * 6.7
            elif '亿' in ticket:
                # 1 亿 = 10000 万, the same scale the 亿美元 branch uses.
                return num * 10000
            elif '千万' in ticket:
                # 1 千万 = 1000 万.
                return num * 1000
            elif '十万' in ticket:
                return num * 10
            elif '万美元' in ticket:
                return num * 6.7
            else:
                return num
        except Exception:
            self.logger.error("票房价格转换错误")
            return None

    def get_cinema(self, url):
        """Crawl a cinema page and store one record per listed showing.

        Args:
            url: Address of the cinema page.
        """
        html = requests.get(url=url, headers=self.headers).text.replace('&#', '0')
        html = self.get_movie_ticket(html, flag=True)
        bs = BeautifulSoup(html, 'html.parser')
        current_year = datetime.datetime.now().strftime('%Y')
        for show in bs.find_all('div', 'show-list'):
            movie_name = show.find('h3', 'movie-name').text
            movie_time = current_year + '年' + show.find('span', 'date-item active').text.split()[-1]
            for row in show.find('tbody').find_all('tr'):
                cells = row.find_all('td')[:-1]
                movie_lan = cells[1].span.text
                is_2d, is_3d = self.is3D(movie_lan)
                # Build a fresh dict per showing so records handed to the ORM
                # never alias one another.
                cinema = {
                    'movie_name': movie_name,
                    'movie_time': movie_time,
                    'movie_open_time': cells[0].find('span', 'begin-time').text,
                    'movie_close_time': cells[0].find('span', 'end-time').text,
                    'movie_lan': movie_lan,
                    'movie_address': cells[2].span.text,
                    'movie_price': float(cells[3].find('span', 'stonefont').text),
                    'is_2d': is_2d,
                    'is_3d': is_3d,
                }
                self.orm.insert_movie_cinema(cinema)
                self.logger.info(cinema)

    def is3D(self, value):
        """Classify a showing's language/format label.

        Args:
            value: Label text such as ``'国语2D'``.

        Returns:
            ``(is_2d, is_3d)``: ``(True, False)`` when the label contains
            ``2D`` (case-insensitive), ``(False, True)`` for any other string,
            and ``(False, False)`` when ``value`` cannot be upper-cased.
        """
        try:
            if '2D' in value.upper():
                return True, False
            return False, True
        except Exception:
            return False, False