def detail(self):
    try:
        logging.info(f'fetch tv {self.ft} detail init start...')
        self.__init_url()
        urls = [f'{self.web_root}{u[1:]}' for u in self.WEB_INDEX_URL_LIST]
        self.batch_(urls, self.fetch_html, self.parse_detail_html)
        logging.info(f'fetch tv {self.ft} detail init end...')
    except Exception as e:
        logging.error(repr(e))
@staticmethod
async def fetch_html(url):
    """Fetch a single page and return (html_text, final_url), or None on error."""
    try:
        async with aiohttp.ClientSession() as session:
            headers = {'User-Agent': random.choice(config.UAS)}
            # certificate verification is disabled on purpose for these sources
            async with session.get(url, headers=headers, verify_ssl=False) as response:
                return await response.text(errors='ignore'), response.url
    except Exception as e:
        logging.error(repr(e))
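# Usage sketch (illustrative only; _crawl and results are hypothetical names,
# not part of this module): fetch_html is a coroutine, so a driver such as
# batch_ would typically schedule many fetches concurrently with asyncio,
# roughly like:
#
#     async def _crawl(urls):
#         # failed fetches return None and are filtered out
#         pages = await asyncio.gather(*(fetch_html(u) for u in urls))
#         return [p for p in pages if p is not None]
#
#     results = asyncio.run(_crawl(urls))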
@staticmethod
def __build_urls(tv_id, url_list):
    """Build one tv_url row per play url for the given tv_id."""
    try:
        u_list = []
        if url_list:
            for u in url_list:
                u_list.append({
                    'id': str(uuid.uuid4()),
                    'tv_id': tv_id,
                    'tv_url': str(u).replace(' ', ''),
                })
        return u_list
    except Exception as e:
        logging.error(repr(e))
def save_init(self):
    """Read the init url file and insert every record into the MySQL tables."""
    with open(self.urls_file, 'r', encoding='GB18030') as ff:
        tvs = ff.readlines()
    logging.info(f'read init tv_url data record:{len(tvs)}')
    logging.info(f'start save init {self.ft} data to mysql db')
    try:
        # map() submits one insert_tv task per line; the context manager
        # waits for all of them to finish before continuing
        with ThreadPoolExecutor(max_workers=25) as executor:
            executor.map(self.insert_tv, tvs)
    except Exception as e:
        logging.error(e)
    logging.info(f'end save init {self.ft} data to mysql db')
def insert_tv(self, tv):
    try:
        tv_json = dict(json.loads(tv))
        tv_name = tv_json['tv_name']
        if tv_name:
            urls = list(tv_json.get('urls', []))
            urls = [u for u in urls if u != ' ' and u != '\t']
            to = CSV2MD.__build_tv(tv_json)
            uo = CSV2MD.__build_urls(tv_json.get('tv_id'), urls)
            self.db.insert(self.tv_table_name, to)
            if uo:
                self.db.insert_many(self.tv_urls_table_name, uo)
    except Exception as e:
        # keep the raw line that failed so it can be replayed later
        with open('error.txt', 'a', encoding='utf-8') as f:
            f.write(tv)
        logging.error(e)
@staticmethod
def __deal_main_detail(ft, detail):
    """Split the '@'-joined detail string into a dict of tv_* fields.

    :param ft: fetch type (config.TV_TYPE_MAIN or other)
    :param detail: raw detail string scraped from the page
    :return: dict mapping field name to extracted value
    """
    try:
        do = {}
        if ft == config.TV_TYPE_MAIN:
            details = detail.split('@')
            # the last segment is the introduction text
            do['tv_intro'] = details[-1]
            t_detail = '@'.join(details[:-1]) + '@'
            tv_detail_list = [('影片名称:', 'tv_name'), ('影片演员:', 'tv_actors'),
                              ('影片导演:', 'tv_director'), ('影片类型:', 'tv_type'),
                              ('影片地区:', 'tv_area'), ('影片语言:', 'tv_lang'),
                              ('上映日期:', 'tv_year'), ('影片备注:', 'tv_remark')]
        else:
            t_detail = detail
            tv_detail_list = [('影片名称: ', 'tv_name'), ('影片主演: ', 'tv_actors'),
                              ('影片导演: ', 'tv_director'), ('栏目分类: ', 'tv_type'),
                              ('影片地区: ', 'tv_area'), ('语言分类:', 'tv_lang'),
                              ('上映年份: ', 'tv_year'), ('影片备注: ', 'tv_remark')]
        t_d_t = [x[0] for x in tv_detail_list]
        for td in tv_detail_list:
            do[td[1]] = ''
            if td[0] in t_detail:
                try:
                    d_i = t_detail.index(td[0])
                    if d_i >= 0:
                        # the value sits between the two '@' marks that follow the label
                        f_h_i_1 = t_detail.index('@', d_i)
                        if f_h_i_1 >= 0:
                            f_h_i_2 = t_detail.index('@', f_h_i_1 + 1)
                            tt_d = t_detail[f_h_i_1 + 1:f_h_i_2]
                            # accept the value only if it contains none of the labels
                            if len([y for y in t_d_t if y not in tt_d]) == 8:
                                do[td[1]] = tt_d
                            else:
                                do[td[1]] = ''
                except ValueError:
                    do[td[1]] = ''
                    continue
        return do
    except Exception as e:
        logging.error(e)
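# Worked example (hypothetical input, for illustration only): in the
# config.TV_TYPE_MAIN format each label appears to be followed by '@value@',
# e.g. '影片名称:@某某剧@影片导演:@张三@...@剧情简介文字'. The final segment
# becomes tv_intro, and for each label the text between the next two '@'
# marks is taken as the value; the len(...) == 8 check rejects a candidate
# value that still contains any of the eight labels (i.e. a mis-aligned slice).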
def detail(self):
    try:
        logging.info(f'fetch tv {self.ft} detail timing start...')
        # record the time of this timed fetch
        with open(self.timing_file, 'w') as fff:
            fff.write(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        # collect the video index urls
        self.__timing_url(self.web_root)
        # start fetching the details
        urls = [f'{self.web_root}{u[1:]}' for u in self.CUR_2_LAST_URLS]
        self.batch_(urls, self.fetch_html, self.parse_detail_html)
        logging.info(f'fetch tv {self.ft} detail timing end...')
    except Exception as e:
        # on failure, roll the recorded fetch time back to the previous value
        with open(self.timing_file, 'w') as fff:
            fff.write(self.timing)
        logging.error(repr(e))
@staticmethod
def __build_tv(tv_json):
    """Build the tv row from the parsed json record, dropping the urls field."""
    try:
        tv_o = {}
        tv_json = dict(tv_json)
        for k in tv_json.keys():
            if k == 'urls':
                continue
            v = tv_json.get(k)
            if k == 'tv_intro':
                # strip blanks and cap the introduction at 2000 characters
                v = str(v).replace(' ', '').replace('\t', '')
                v = v[:2000] + '...' if len(v) > 2000 else v
            elif k == 'tv_name':
                # strip blanks and both half- and full-width punctuation variants
                v = (str(v).replace(' ', '').replace('~', '').replace('～', '')
                     .replace('[T]', '').replace('?', '').replace('？', '')
                     .replace('·', ''))
            tv_o[k] = v
        return tv_o
    except Exception as e:
        logging.error(repr(e))
def save_timing(self):
    with open(self.urls_file, 'r', encoding='GB18030') as ff:
        tvs = ff.readlines()
    logging.info(f'read timing {self.ft} url data record:{len(tvs)}')
    logging.info(f'start save {self.ft} timing csv data to mysql db')
    try:
        for tv in tvs:
            tv = dict(json.loads(tv))
            tv_name = tv.get('tv_name')
            if not tv_name:
                continue
            m_tv = self.db.find_one(self.tv_table_name, " tv_name=%s ", tv_name)
            if m_tv:
                # the tv already exists: only refresh its update_time
                tv_id = m_tv.get('tv_id')
                self.db.update_tv(self.tv_table_name, " update_time=%s ",
                                  tv.get('update_time'), tv_id)
            else:
                # new tv: insert the full record
                tv_id = tv.get('tv_id')
                self.db.insert(self.tv_table_name, CSV2MD.__build_tv(tv))
            # rebuild the play urls for this tv from scratch
            self.db.delete(self.tv_urls_table_name, tv_id)
            urls = [u for u in list(tv.get('urls')) if u != ' ' and u != '\t']
            u_list = [{'id': str(uuid.uuid4()),
                       'tv_id': tv_id,
                       'tv_url': str(u).replace(' ', '')} for u in urls]
            self.db.insert_many(self.tv_urls_table_name, u_list)
    except Exception as e:
        logging.error(e)
    logging.info(f'end save timing {self.ft} data to mysql db')