コード例 #1
0
ファイル: tv_spider_timing.py プロジェクト: xwl5242/tvspider
 def __timing_url(self, web_root):
     """
     Collect the index entries that are newer than the last collection time.

     Starting at ``web_root``, each index page is fetched and its entries are
     read in document order. Every entry whose date compares greater than
     ``self.timing`` has its URL appended to ``self.CUR_2_LAST_URLS`` and its
     date stored in ``self.WEB_INDEX_URL_TIME_MAP`` (keyed by the URL without
     its first character). Scanning stops at the first entry that is not
     newer. When every entry of a page is newer, ``self.CUR_2_LAST_PAGE`` is
     incremented and the next page is scanned too.

     Pages are walked in a loop rather than by recursion, and a page without
     any date nodes ends the scan, so running past the last page cannot
     request pages forever.

     :param web_root: URL of the first index page to scan.
     :return: None
     """
     logging.info(f'fetch tv {self.ft} index url timing start...')
     xpaths = config.TV_FS_XPATH_MAP.get(self.ft)
     page_url = web_root
     while True:
         r = requests.get(page_url,
                          headers={'User-Agent': random.choice(config.UAS)})
         # Parse the response once and run both XPath queries on that tree.
         page = etree.HTML(r.text)
         index_url = page.xpath(xpaths.get('tv_index_url_xpath'))
         times = page.xpath(xpaths.get('tv_index_fetch_date_xpath'))
         newer = 0
         for n, ti in enumerate(times):
             ti = str(ti).strip()
             if self.__date_str_compare(ti, self.timing) > 0:
                 self.CUR_2_LAST_URLS.append(index_url[n])
                 self.WEB_INDEX_URL_TIME_MAP[index_url[n][1:]] = ti
                 newer += 1
             else:
                 # Presumably pages list the newest entries first, so the
                 # first non-newer entry ends the scan -- TODO confirm.
                 break
         # Stop on an empty page or as soon as a page held a non-newer entry.
         if not times or newer < len(times):
             break
         self.CUR_2_LAST_PAGE += 1
         page_url = (f'{self.web_root}'
                     f'?m=vod-index-pg-{self.CUR_2_LAST_PAGE}.html')
     logging.info(f'fetch tv {self.ft} index url timing end...')
コード例 #2
0
ファイル: tv_spider_init.py プロジェクト: xwl5242/tvspider
 def detail(self):
     """
     Run the initial crawl: gather index URLs, then fetch every detail page.

     ``__init_url`` is called first; afterwards each entry of
     ``self.WEB_INDEX_URL_LIST`` (minus its first character) is joined onto
     ``self.web_root`` and the resulting URLs are handed to ``batch_`` with
     ``fetch_html`` and ``parse_detail_html``. Any exception is logged via
     ``repr`` and swallowed.

     :return: None
     """
     try:
         logging.info(f'fetch tv {self.ft} detail init start...')
         self.__init_url()
         detail_urls = []
         for path in self.WEB_INDEX_URL_LIST:
             # The first character of the stored path is dropped before
             # it is appended to the site root.
             detail_urls.append(f'{self.web_root}{path[1:]}')
         self.batch_(detail_urls, self.fetch_html, self.parse_detail_html)
         logging.info(f'fetch tv {self.ft} detail init end...')
     except Exception as err:
         logging.error(repr(err))
コード例 #3
0
ファイル: tv_spider_init.py プロジェクト: xwl5242/tvspider
 def __init_url(self):
     """
     Build the index-page URLs and process them through ``batch_``.

     One URL is generated for every page number in
     ``range(self.sp, self.ep)`` (``self.ep`` itself is excluded) and the
     list is passed to ``batch_`` with ``fetch_html`` and
     ``parse_index_html``.

     :return: None
     """
     logging.info(f'fetch tv {self.ft} index url start...')
     page_urls = []
     for page_no in range(self.sp, self.ep):
         page_urls.append(f'{self.web_root}?m=vod-index-pg-{page_no}.html')
     self.batch_(page_urls, self.fetch_html, self.parse_index_html)
     logging.info(f'fetch tv {self.ft} index url end...')
コード例 #4
0
ファイル: tv_spider_timing.py プロジェクト: xwl5242/tvspider
 def detail(self):
     """
     Run the incremental crawl for entries newer than the last run.

     The current local time is written to ``self.timing_file`` first, then
     ``__timing_url`` collects the new index URLs and ``batch_`` fetches and
     parses their detail pages. If anything raises, ``self.timing`` is
     written back to ``self.timing_file`` and the exception is logged via
     ``repr`` and swallowed.

     :return: None
     """
     try:
         logging.info(f'fetch tv {self.ft} detail timing start...')
         # Update the stored collection time to "now".
         with open(self.timing_file, 'w') as timing_fh:
             # strftime without a time tuple formats the current local time.
             timing_fh.write(time.strftime('%Y-%m-%d %H:%M:%S'))
         # Collect the video index URLs.
         self.__timing_url(self.web_root)
         # Start collecting the detail pages.
         detail_urls = []
         for path in self.CUR_2_LAST_URLS:
             detail_urls.append(f'{self.web_root}{path[1:]}')
         self.batch_(detail_urls, self.fetch_html, self.parse_detail_html)
         logging.info(f'fetch tv {self.ft} detail timing end...')
     except Exception as err:
         # On failure, roll the collection time back to the previous value.
         with open(self.timing_file, 'w') as timing_fh:
             timing_fh.write(self.timing)
         logging.error(repr(err))
コード例 #5
0
ファイル: save2db.py プロジェクト: xwl5242/tvspider
 def save_init(self):
     """
     Load the initial URL dump and insert every line into the database.

     Each line of ``self.urls_file`` (read as GB18030) is passed to
     ``self.insert_tv`` on a pool of 25 worker threads. An exception raised
     by one worker is logged and does not stop the other lines from being
     processed.

     :return: None
     """
     with open(self.urls_file, 'r', encoding='GB18030') as ff:
         tvs = ff.readlines()
     logging.info(f'read init tv_url data record:{len(tvs)}')
     logging.info(f'start save init {self.ft} data to mysql db')
     try:
         with ThreadPoolExecutor(max_workers=25) as pool:
             futures = [pool.submit(self.insert_tv, tv) for tv in tvs]
         # Leaving the ``with`` block waits for all workers, so every future
         # is finished here. Worker exceptions are stored on the futures and
         # would be lost unless they are inspected explicitly.
         for future in futures:
             err = future.exception()
             if err is not None:
                 logging.error(err)
     except Exception as e:
         logging.error(e)
     logging.info(f'end save init {self.ft} data to mysql db')
コード例 #6
0
ファイル: save2db.py プロジェクト: xwl5242/tvspider
 def save_timing(self):
     """
     Load the incremental dump and upsert each record into the database.

     Every non-blank line of ``self.urls_file`` (read as GB18030) is parsed
     as a JSON object. Records without a ``tv_name`` are skipped. For the
     others, an existing row with the same ``tv_name`` gets its
     ``update_time`` refreshed, otherwise a new row is inserted; then the
     URL rows stored for that ``tv_id`` are deleted and replaced by the
     record's ``urls``.

     Each record is handled independently: a failure is logged and the next
     record is processed. The new URL list is built before any write, so a
     record with a missing or malformed ``urls`` value fails before its
     existing URL rows are deleted.

     :return: None
     """
     with open(self.urls_file, 'r', encoding='GB18030') as ff:
         tvs = ff.readlines()
     logging.info(f'read timing {self.ft} url data record:{len(tvs)}')
     logging.info(f'start save {self.ft} timing csv data to mysql db')
     for line in tvs:
         if not line.strip():
             # Blank lines (e.g. a trailing newline) are not records.
             continue
         try:
             tv = dict(json.loads(line))
             tv_name = tv.get('tv_name')
             if not tv_name:
                 continue
             # Validate and clean the URLs first: this raises for a missing
             # ``urls`` value before anything is written or deleted.
             tv_urls = [
                 str(u).replace(' ', '')
                 for u in tv.get('urls')
                 if u != ' ' and u != '\t'
             ]
             m_tv = self.db.find_one(self.tv_table_name,
                                     " tv_name=%s ", tv_name)
             if m_tv:
                 # Already stored: keep its id and refresh the update time.
                 tv_id = m_tv.get('tv_id')
                 self.db.update_tv(self.tv_table_name,
                                   " update_time=%s ",
                                   tv.get('update_time'), tv_id)
             else:
                 # Not stored yet: insert it under the id from the record.
                 tv_id = tv.get('tv_id')
                 self.db.insert(self.tv_table_name,
                                CSV2MD.__build_tv(tv))
             # Replace the URL rows that belong to this tv_id.
             self.db.delete(self.tv_urls_table_name, tv_id)
             u_list = [
                 {
                     'id': str(uuid.uuid4()),
                     'tv_id': tv_id,
                     'tv_url': u,
                 }
                 for u in tv_urls
             ]
             self.db.insert_many(self.tv_urls_table_name, u_list)
         except Exception as e:
             logging.error(e)
     logging.info(f'end save timing {self.ft} data to mysql db')