def __timing_url(self, web_root):
    """Collect index urls whose update time is newer than the last crawl time (self.timing)."""
    logging.info(f'fetch tv {self.ft} index url timing start...')
    r = requests.get(web_root, headers={'User-Agent': random.choice(config.UAS)})
    html = etree.HTML(r.text)
    index_url = html.xpath(
        config.TV_FS_XPATH_MAP.get(self.ft).get('tv_index_url_xpath'))
    times = html.xpath(
        config.TV_FS_XPATH_MAP.get(self.ft).get('tv_index_fetch_date_xpath'))
    i = 0
    for n, ti in enumerate(times):
        ti = str(ti).strip()
        if self.__date_str_compare(ti, self.timing) > 0:
            # entry is newer than the last crawl: remember its url and update time
            self.CUR_2_LAST_URLS.append(index_url[n])
            self.WEB_INDEX_URL_TIME_MAP[index_url[n][1:]] = ti
            i += 1
        else:
            # entries are listed by update time, so stop at the first stale one
            break
    if i == len(times):
        # every entry on this page was new, so the next page may hold more
        self.CUR_2_LAST_PAGE += 1
        self.__timing_url(
            f'{self.web_root}?m=vod-index-pg-{self.CUR_2_LAST_PAGE}.html')
    logging.info(f'fetch tv {self.ft} index url timing end...')
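# __date_str_compare is used above but not defined in this file. A minimal sketch
# of how such a comparison could work, assuming both values share the
# '%Y-%m-%d %H:%M:%S' format that detail() writes to the timing file below
# (hypothetical helper for illustration, not the actual implementation):
def _date_str_compare_sketch(a, b):
    """Return >0 if a is later than b, 0 if equal, <0 if earlier."""
    from datetime import datetime
    fmt = '%Y-%m-%d %H:%M:%S'
    da, db = datetime.strptime(a, fmt), datetime.strptime(b, fmt)
    return (da > db) - (da < db)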
def detail(self):
    try:
        logging.info(f'fetch tv {self.ft} detail init start...')
        self.__init_url()
        urls = [f'{self.web_root}{u[1:]}' for u in self.WEB_INDEX_URL_LIST]
        self.batch_(urls, self.fetch_html, self.parse_detail_html)
        logging.info(f'fetch tv {self.ft} detail init end...')
    except Exception as e:
        logging.error(repr(e))
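# batch_(urls, fetch, parse) drives every crawl above but is defined elsewhere.
# A minimal sketch of the assumed pipeline: fetch all pages concurrently with a
# thread pool (as save_init() below does for db writes), then parse each result
# (hypothetical, for illustration only):
def _batch_sketch(urls, fetch_html, parse_html, max_workers=25):
    """Fetch every url concurrently and parse each page that came back."""
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for html in executor.map(fetch_html, urls):
            if html:
                parse_html(html)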
def __init_url(self):
    """Build the paginated index urls from the start page to the end page and crawl them."""
    logging.info(f'fetch tv {self.ft} index url start...')
    web_index_url = [
        f'{self.web_root}?m=vod-index-pg-{i}.html'
        for i in range(self.sp, self.ep)
    ]
    self.batch_(web_index_url, self.fetch_html, self.parse_index_html)
    logging.info(f'fetch tv {self.ft} index url end...')
def detail(self):
    try:
        logging.info(f'fetch tv {self.ft} detail timing start...')
        # record the new crawl time first
        with open(self.timing_file, 'w') as fff:
            fff.write(
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(time.time())))
        # collect the index urls updated since the last crawl
        self.__timing_url(self.web_root)
        # crawl the detail pages for those urls
        urls = [f'{self.web_root}{u[1:]}' for u in self.CUR_2_LAST_URLS]
        self.batch_(urls, self.fetch_html, self.parse_detail_html)
        logging.info(f'fetch tv {self.ft} detail timing end...')
    except Exception as e:
        # on failure, roll the crawl time back to the previous value
        with open(self.timing_file, 'w') as fff:
            fff.write(self.timing)
        logging.error(repr(e))
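# The timing flow above reads self.timing (the previous crawl time) and rewrites
# self.timing_file. A minimal sketch of how that value could be loaded at
# start-up, assuming the file holds a single '%Y-%m-%d %H:%M:%S' timestamp
# (hypothetical helper, not the actual initialisation code):
def _load_timing_sketch(timing_file):
    """Return the last recorded crawl time, or a distant-past default."""
    import os
    if os.path.exists(timing_file):
        with open(timing_file, 'r') as f:
            return f.read().strip()
    return '1970-01-01 00:00:00'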
def save_init(self):
    """Load the initial crawl result file and write every record to the MySQL db."""
    with open(self.urls_file, 'r', encoding='GB18030') as ff:
        tvs = ff.readlines()
    logging.info(f'read init tv_url data record:{len(tvs)}')
    logging.info(f'start save init {self.ft} data to mysql db')
    try:
        with ThreadPoolExecutor(max_workers=25) as executor:
            # consume the map iterator so worker exceptions surface here
            list(executor.map(self.insert_tv, tvs))
    except Exception as e:
        logging.error(e)
    logging.info(f'end save init {self.ft} data to mysql db')
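# insert_tv is mapped over every line of the init file but is not shown here.
# A minimal sketch of what one worker could do, assuming each line holds the
# same JSON record format that save_timing() below parses (hypothetical sketch;
# the db helper calls mirror the ones used in save_timing()):
def _insert_tv_sketch(db, tv_table_name, tv_urls_table_name, line):
    """Parse one JSON line and write the tv row plus one row per play url."""
    import json
    import uuid
    tv = json.loads(line)
    # assumption: the record (minus 'urls') maps onto the tv table columns
    db.insert(tv_table_name, {k: v for k, v in tv.items() if k != 'urls'})
    u_list = [{'id': str(uuid.uuid4()),
               'tv_id': tv.get('tv_id'),
               'tv_url': str(u).replace(' ', '')}
              for u in tv.get('urls', []) if str(u).strip()]
    if u_list:
        db.insert_many(tv_urls_table_name, u_list)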
def save_timing(self):
    with open(self.urls_file, 'r', encoding='GB18030') as ff:
        tvs = ff.readlines()
    logging.info(f'read timing {self.ft} url data record:{len(tvs)}')
    logging.info(f'start save {self.ft} timing csv data to mysql db')
    try:
        if tvs:
            for tv in tvs:
                tv = dict(json.loads(tv))
                tv_name = tv.get('tv_name')
                if tv_name:
                    m_tv = self.db.find_one(self.tv_table_name,
                                            " tv_name=%s ", tv_name)
                    if m_tv:
                        # record already exists: only refresh its update time
                        tv_id = m_tv.get('tv_id')
                        self.db.update_tv(self.tv_table_name,
                                          " update_time=%s ",
                                          tv.get('update_time'), tv_id)
                    else:
                        # new record: insert the tv row itself
                        tv_id = tv.get('tv_id')
                        self.db.insert(self.tv_table_name, CSV2MD.__build_tv(tv))
                    # replace the play urls stored for this tv
                    self.db.delete(self.tv_urls_table_name, tv_id)
                    urls = list(tv.get('urls'))
                    urls = [u for u in urls if u != ' ' and u != '\t']
                    u_list = []
                    for u in urls:
                        u_list.append({
                            'id': str(uuid.uuid4()),
                            'tv_id': tv_id,
                            'tv_url': str(u).replace(' ', '')
                        })
                    self.db.insert_many(self.tv_urls_table_name, u_list)
    except Exception as e:
        logging.error(e)
    logging.info(f'end save timing {self.ft} data to mysql db')