class Scrapyer(object):
    """Crawler helpers for http://www.tvmao.com.

    1. search programs on tvmao and collect related-program info
    2. crawl program detail pages from tvmao to help classify programs

    NOTE(review): relies on module-level names defined elsewhere in this
    file: ``ProxyPool``, ``requests``, ``BeautifulSoup``, ``quote``,
    ``re``, ``time``, ``randint``, ``choice``, ``DEBUG``,
    ``empty_times`` and ``all_categories``.
    """

    def __init__(self):
        self.retry_count = 3          # remaining attempts for the current request
        self.empty_count = 0          # consecutive empty search results seen
        self.pre_empty_flag = False   # whether the previous result was empty
        self.enabled_programs = []    # programs whose search returned results
        self.unabled_programs = []    # programs whose search came back empty
        self.collected_programs = []
        self.proxypool = ProxyPool()
        self.proxy = self.proxypool.get_proxy()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/'
                          '537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }

    def change_proxy(self):
        """Discard the current proxy and fetch a fresh one from the pool."""
        self.proxypool.delete_proxy(self.proxy)
        self.proxy = self.proxypool.get_proxy()

    def _fetch_soup(self, url, require_ok=False):
        """Fetch *url* through the current proxy and parse it with bs4.

        Retries until a page is obtained, rotating the proxy after every
        3 consecutive failures (the original retry scheme: the counter was
        reset to 3 after each proxy change, so the loop never gave up).

        :param url: URL to fetch
        :param require_ok: treat a non-200 status as a failure
        :return: a BeautifulSoup of the response body
        """
        self.retry_count = 3
        while True:
            try:
                response = requests.get(url, proxies={'http': self.proxy},
                                        headers=self.headers, timeout=2)
                if require_ok and response.status_code != 200:
                    raise IOError('bad status %s' % response.status_code)
                return BeautifulSoup(response.text, 'html.parser')
            except Exception:
                self.retry_count -= 1
                if self.retry_count <= 0:
                    if DEBUG:
                        print("waiting...")
                    self.change_proxy()
                    self.retry_count = 3

    def check_empty(self, num, source_programs, lock):
        """Track consecutive empty search results to detect a dead proxy.

        When 5 empty results arrive in a row the proxy is assumed dead:
        the 5 most recently failed programs are re-queued (at most twice
        each, tracked in the module-level ``empty_times``) and the proxy
        is rotated.

        :param num: number of columns parsed from the current result page
        :param source_programs: queue of programs still to crawl
        :param lock: lock guarding access to source_programs
        """
        if num == 0:
            if self.pre_empty_flag:
                self.empty_count += 1
                if self.empty_count >= 5:
                    # BUGFIX: the original indexed unabled_programs[5..1]
                    # (the *oldest* entries, off-by-one from the front) and
                    # popped by shifting index; re-queue the 5 newest instead.
                    for program in self.unabled_programs[-5:]:
                        if empty_times[program] < 2:
                            self.unabled_programs.remove(program)
                            with lock:
                                source_programs.put(program)
                            empty_times[program] += 1
                    self.change_proxy()
                    self.empty_count = 0
            else:
                self.pre_empty_flag = True
                self.empty_count = 1
        elif self.pre_empty_flag:
            # a non-empty result breaks the streak
            self.pre_empty_flag = False
            self.empty_count = 0

    def collect_programs(self, page_uls, page_columns):
        """Parse programs from a search-result page, grouped by column.

        :param page_uls: the <ul> result lists, one per category column
        :param page_columns: the category names, aligned with page_uls
        :return: {column: [(href, name), ...]} for the recognized columns
        """
        prefix = 'http://www.tvmao.com'
        collected = {}
        for column, uls in zip(page_columns, page_uls):
            lis = uls.find_all('li', class_='mr10')
            if not lis:
                continue
            if re.search('^(电视剧|电影)', column):
                href_names = [(prefix + li.p.a['href'], li.p.a.get_text())
                              for li in lis]
            elif re.search('^(综艺|明星|赛事)', column):
                href_names = [(prefix + li.a['href'], li.a['title'])
                              for li in lis]
            else:
                continue
            # BUGFIX: the original appended to a list and zipped it against
            # *all* columns at the end, so any skipped column shifted every
            # later (column -> programs) pairing; key the dict directly.
            collected[column] = href_names
        return collected

    def crawl_relative_program(self, program, source_programs, lock):
        """Search *program* on tvmao and collect its related programs.

        :param program: program name to search for
        :param source_programs: queue of programs still to crawl
        :param lock: lock guarding access to source_programs
        :return: {program: {column: [(href, name), ...]}} on success,
                 None when the result page was empty or unparsable
        """
        url = 'http://www.tvmao.com/query.jsp?keys=%s&ed=' % quote(program) + \
              'bOWkp%2BeZveWkq%2BWmh%2BS4iua8lOazoeayq%2BS5i%2BWQu28%3D'
        bsObj = self._fetch_soup(url)

        try:
            page_content = bsObj.find_all('div', class_='page-content')[0]
            page_columns = [item.a.get_text()
                            for item in page_content.dl.find_all('dd')]
            # drop the schedule/channel pseudo-columns
            page_columns = [column for column in page_columns
                            if not re.search('^(播出时间|电视频道)', column)]
            page_content_uls = page_content.div.find_all(
                'ul', class_=re.compile('^.+qtable$'), recursive=False)

            # BUGFIX: the original called check_empty after an unconditional
            # return, so the dead-proxy detector never ran (and its
            # "non-empty result" reset branch was unreachable). Run it on
            # every parsed page, empty or not.
            self.check_empty(len(page_columns), source_programs, lock)

            if not page_columns:
                self.unabled_programs.append(program)
                return None
            self.enabled_programs.append(program)
            column_programs = self.collect_programs(page_content_uls,
                                                    page_columns)
            return {program: column_programs}
        except Exception:
            # parsing failed (e.g. a block page served by a dying proxy):
            # re-queue the program and rotate the proxy
            with lock:
                source_programs.put(program)
            self.change_proxy()
            return None

    def run_crawl_relative_programs(self, source_programs, lock, limit=False):
        """Worker loop: drain *source_programs* and crawl each program.

        :param source_programs: queue of programs still to crawl
        :param lock: lock guarding access to source_programs
        :param limit: kept for interface compatibility (currently unused)
        :return: (collected program info, enabled programs, unabled programs)
        """
        collected_programs = []
        while True:
            try:
                with lock:
                    program = source_programs.get_nowait()
                    if DEBUG:
                        print(source_programs.qsize())
                # stop early once the queue has been drained far enough
                if source_programs.qsize() < 1500:
                    return (collected_programs, self.enabled_programs,
                            self.unabled_programs)
                result = self.crawl_relative_program(program,
                                                     source_programs, lock)
                if result:
                    collected_programs.append(result)
                time.sleep(randint(0, 1))
            except Exception:
                # queue.Empty (or any other failure) terminates the worker
                return (collected_programs, self.enabled_programs,
                        self.unabled_programs)

    def _keyword_classify(self, text):
        """Shared keyword lookup behind category/intro classification.

        :return: a category name, or None when nothing matched
        """
        for keyword, category in (('军旅', '军事'), ('纪录片', '纪实'),
                                  ('动漫', '少儿'), ('戏剧', '戏曲'),
                                  ('真人秀', '综艺')):
            if re.search(keyword, text):
                return category
        res = re.search('|'.join(all_categories), text)
        return res.group() if res else None

    def category_classify(self, category):
        """Classify by the category string from xingchen.

        :param category: program intro or program category from xingchen
        :return: a category name, or None when nothing matched
        """
        return self._keyword_classify(category)

    def intro_classify(self, intro):
        """Refine the coarse '生活' category using the introduction text.

        :param intro: introduction of the related program in xingchen
        :return: a category name, defaulting to '生活'
        """
        return self._keyword_classify(intro) or "生活"

    def search_to_classify_program(self, href):
        """Classify one program by crawling its tvmao detail page.

        :param href: link of the related program in xingchen
        :return: a category name; a random guess between '综艺' and
                 '电视剧' when parsing fails; None for unknown link kinds
        """
        bsObj = self._fetch_soup(href, require_ok=True)

        try:
            if re.search('tvcolumn', href):
                res_1 = bsObj.find_all('td', class_='gray pl15')
                if not res_1:
                    return '综艺'
                category = res_1[0].findNext('td').get_text()
                if category != "生活":
                    category = self.category_classify(category)
                    return category if category else '综艺'
                # '生活' is too coarse: refine it from the introduction text
                div = bsObj.find_all('div', class_='clear more_c')[0]
                intro = '; '.join([p.get_text() for p in div.find_all('p')])
                return self.intro_classify(intro)
            elif re.search('drama', href):
                mark = bsObj.find(text='类别:')
                td = mark.parent.findNext('td')
                category = ' '.join([a.get_text() for a in
                                     td.find_all('a', recursive=False)])
                category = self.category_classify(category)
                return category if category else '电视剧'
        except Exception:
            if DEBUG:
                print("f**k", href)
            return choice(['综艺', '电视剧'])

    def run_search_to_classify_programs(self, source_items, lock):
        """Worker loop: classify every item drained from *source_items*.

        :param source_items: queue of item tuples; item[0] is the program
                             name, item[2] its detail-page href
        :param lock: lock guarding access to source_items
        :return: list of (program, category) pairs
        """
        program_category = []  # fixed local typo: was 'program_cateogry'
        while True:
            try:
                with lock:
                    item = source_items.get_nowait()
                    if DEBUG:
                        print(source_items.qsize())
                category = self.search_to_classify_program(item[2])
                program_category.append((item[0], category))
                time.sleep(randint(0, 1))
            except Exception:
                # queue.Empty ends the worker; return what we have so far
                return program_category