Ejemplo n.º 1
0
 def detail(self):
     """Crawl the detail pages behind every index URL of this tv type.

     Builds absolute detail-page URLs from ``WEB_INDEX_URL_LIST`` (each
     entry starts with '/', which is stripped before joining onto
     ``web_root``) and hands them to ``batch_`` for fetching and parsing.
     Any failure is logged and swallowed.
     """
     try:
         logging.info(f'fetch tv {self.ft} detail init start...')
         self.__init_url()
         # every index path begins with '/', drop it before appending
         absolute_urls = []
         for path in self.WEB_INDEX_URL_LIST:
             absolute_urls.append(f'{self.web_root}{path[1:]}')
         self.batch_(absolute_urls, self.fetch_html, self.parse_detail_html)
         logging.info(f'fetch tv {self.ft} detail init end...')
     except Exception as e:
         logging.error(repr(e))
Ejemplo n.º 2
0
 async def fetch_html(url):
     """Download *url* and return ``(html_text, final_url)``.

     A fresh aiohttp session is opened per call with a randomly chosen
     User-Agent from ``config.UAS``; SSL verification is disabled and
     decode errors in the body are ignored.  On any error the exception
     is logged and ``None`` is returned.
     """
     try:
         headers = {'User-Agent': random.choice(config.UAS)}
         async with aiohttp.ClientSession() as session:
             async with session.get(url, headers=headers,
                                    verify_ssl=False) as response:
                 body = await response.text(errors='ignore')
                 return body, response.url
     except Exception as e:
         logging.error(repr(e))
Ejemplo n.º 3
0
 def __build_urls(tv_id, url_list):
     """Turn a list of raw play URLs into row dicts for the tv-urls table.

     Each row gets a fresh uuid4 ``id``, the owning ``tv_id``, and the URL
     with all spaces stripped.  A falsy/empty input yields an empty list;
     on an unexpected error the exception is logged and ``None`` is
     returned.
     """
     try:
         rows = []
         for raw in (url_list or []):
             rows.append({
                 'id': str(uuid.uuid4()),
                 'tv_id': tv_id,
                 'tv_url': str(raw).replace(' ', ''),
             })
         return rows
     except Exception as e:
         logging.error(repr(e))
Ejemplo n.º 4
0
 def save_init(self):
     """Bulk-load the initial crawl results into MySQL.

     Reads one JSON record per line from ``self.urls_file`` (GB18030
     encoded) and fans the lines out to ``self.insert_tv`` on a thread
     pool.  Each worker handles its own per-record errors; only a failure
     of the pool itself is logged here.

     :return: None
     """
     with open(self.urls_file, 'r', encoding='GB18030') as ff:
         tvs = ff.readlines()
     logging.info(f'read init tv_url data record:{len(tvs)}')
     logging.info(f'start save init {self.ft} data to mysql db')
     try:
         # NOTE(review): the executor was previously bound to ``e`` and
         # shadowed by ``except Exception as e`` below — renamed for clarity.
         with ThreadPoolExecutor(max_workers=25) as executor:
             # map() submits every line immediately and the with-block waits
             # for completion on exit; insert_tv swallows its own exceptions,
             # so the unconsumed result iterator drops nothing of interest.
             executor.map(self.insert_tv, tvs)
     except Exception as e:
         logging.error(e)
     logging.info(f'end save init {self.ft} data to mysql db')
Ejemplo n.º 5
0
 def insert_tv(self, tv):
     """Parse one JSON line and insert the tv row plus its play URLs.

     Skips records without a ``tv_name``.  A record that fails to parse or
     insert is appended verbatim to ``error.txt`` and the exception is
     logged; nothing is raised.
     """
     try:
         record = dict(json.loads(tv))
         name = record['tv_name']
         if not name:
             return
         play_urls = [u for u in list(record.get('urls', []))
                      if u not in (' ', '\t')]
         tv_row = CSV2MD.__build_tv(record)
         url_rows = CSV2MD.__build_urls(record.get('tv_id'), play_urls)
         self.db.insert(self.tv_table_name, tv_row)
         # __build_urls may return None on failure, so truthiness covers both
         if url_rows:
             self.db.insert_many(self.tv_urls_table_name, url_rows)
     except Exception as e:
         with open('error.txt', 'a', encoding='utf-8') as f:
             f.write(tv)
         logging.error(e)
Ejemplo n.º 6
0
 def __deal_main_detail(ft, detail):
     """Split a '@'-delimited scraped detail string into a dict of tv fields.

     :param ft: fetch type; ``config.TV_TYPE_MAIN`` selects the "main"
         label set and treats the final '@'-segment as the intro text.
     :param detail: raw detail string with Chinese field labels
         (e.g. '影片名称:') and '@' delimiters.
     :return: dict of field keys (tv_name, tv_actors, ...) mapped to the
         extracted values ('' when absent or rejected); ``None`` if an
         unexpected error occurred (it is logged).
     """
     try:
         do = {}
         if ft == config.TV_TYPE_MAIN:
             details = detail.split('@')
             # on the main site the last segment is the free-text intro
             tv_intro = details[-1]
             do['tv_intro'] = tv_intro
             # re-join the labelled fields, keeping a trailing '@' so the
             # last value is still terminated by a delimiter
             t_detail = '@'.join(details[:-1]) + '@'
             # (label, output key) pairs for the main site — note labels
             # here have no trailing space, unlike the other branch
             tv_detail_list = [('影片名称:', 'tv_name'), ('影片演员:', 'tv_actors'),
                               ('影片导演:', 'tv_director'),
                               ('影片类型:', 'tv_type'), ('影片地区:', 'tv_area'),
                               ('影片语言:', 'tv_lang'), ('上映日期:', 'tv_year'),
                               ('影片备注:', 'tv_remark')]
         else:
             t_detail = detail
             # alternate site uses slightly different labels, most with a
             # trailing space
             tv_detail_list = [('影片名称: ', 'tv_name'),
                               ('影片主演: ', 'tv_actors'),
                               ('影片导演: ', 'tv_director'),
                               ('栏目分类: ', 'tv_type'), ('影片地区: ', 'tv_area'),
                               ('语言分类:', 'tv_lang'), ('上映年份: ', 'tv_year'),
                               ('影片备注: ', 'tv_remark')]
         # all label strings, used below to reject values that still contain
         # another label (i.e. a mis-aligned extraction)
         t_d_t = [x[0] for x in tv_detail_list]
         for td in tv_detail_list:
             # default every field to '' so the dict always has all keys
             do[td[1]] = ''
             if td[0] in t_detail:
                 try:
                     d_i = t_detail.index(td[0])
                     if d_i >= 0:
                         # value is taken between the first and second '@'
                         # at/after the label — assumes the source places the
                         # value in the segment following the label;
                         # TODO(review): confirm against a real detail string
                         f_h_i_1 = t_detail.index('@', d_i)
                         if f_h_i_1 >= 0:
                             f_h_i_2 = t_detail.index('@', f_h_i_1 + 1)
                             tt_d = t_detail[f_h_i_1 + 1:f_h_i_2]
                             # accept only when none of the 8 labels leaked
                             # into the candidate value
                             if len([y for y in t_d_t
                                     if y not in tt_d]) == 8:
                                 do[td[1]] = tt_d
                             else:
                                 do[td[1]] = ''
                 except ValueError as e:
                     # index() raises ValueError when a '@' terminator is
                     # missing — treat the field as absent
                     do[td[1]] = ''
                     continue
         return do
     except Exception as e:
         logging.error(e)
Ejemplo n.º 7
0
 def detail(self):
     """Run one timed crawl pass over recently-updated detail pages.

     Writes the current timestamp to the timing file before crawling; if
     anything fails, the previous timestamp (``self.timing``) is written
     back and the error is logged.
     """
     try:
         logging.info(f'fetch tv {self.ft} detail timing start...')
         # record the new crawl time first
         now = time.strftime('%Y-%m-%d %H:%M:%S',
                             time.localtime(time.time()))
         with open(self.timing_file, 'w') as fff:
             fff.write(now)
         # collect the index urls updated since the last run
         self.__timing_url(self.web_root)
         # crawl each detail page (paths start with '/', strip it)
         urls = [f'{self.web_root}{p[1:]}' for p in self.CUR_2_LAST_URLS]
         self.batch_(urls, self.fetch_html, self.parse_detail_html)
         logging.info(f'fetch tv {self.ft} detail timing end...')
     except Exception as e:
         # on failure, roll the timing file back to the previous timestamp
         with open(self.timing_file, 'w') as fff:
             fff.write(self.timing)
         logging.error(repr(e))
Ejemplo n.º 8
0
 def __build_tv(tv_json):
     """Build the row dict for the tv table from one parsed JSON record.

     Copies every key except 'urls'.  'tv_intro' has spaces/tabs stripped
     and is truncated to 2000 chars (suffixed with '...'); 'tv_name' has
     spaces and a set of punctuation characters removed.

     :param tv_json: dict-like parsed record for one tv.
     :return: the cleaned row dict, or ``None`` if an unexpected error
         occurred (it is logged).
     """
     try:
         tv_o = {}
         tv_json = dict(tv_json)
         for k in tv_json.keys():
             if k != 'urls':
                 if k == 'tv_intro':
                     v = tv_json.get(k)
                     # strip whitespace, then cap the intro at 2000 chars
                     v = str(v).replace(' ', '').replace('\t', '')
                     v = v[:2000] + '...' if len(v) > 2000 else v
                 elif k == 'tv_name':
                     v = tv_json.get(k)
                     # NOTE(review): the seemingly duplicated replace() pairs
                     # appear to target full-width vs ASCII variants of the
                     # same character ('~'/'?') — do not "deduplicate" them
                     v = str(v).replace(' ', '').replace('~', '')\
                         .replace('~', '').replace('[T]', '').replace('?', '').replace('?', '').replace('·', '')
                 else:
                     v = tv_json.get(k)
                 tv_o[k] = v
         return tv_o
     except Exception as e:
         logging.error(repr(e))
Ejemplo n.º 9
0
 def save_timing(self):
     """Merge one timed-crawl batch from the urls file into MySQL.

     Reads one JSON record per line from ``self.urls_file`` (GB18030).
     For each named tv: refresh ``update_time`` when the name already
     exists, otherwise insert a new row; then replace all of its play
     urls (delete + bulk insert).

     Fixes over the previous version: errors are handled per record so a
     single bad line no longer aborts the rest of the batch; a missing
     'urls' key no longer raises; and ``insert_many`` is skipped for an
     empty url list (consistent with ``insert_tv``).

     :return: None
     """
     with open(self.urls_file, 'r', encoding='GB18030') as ff:
         tvs = ff.readlines()
     logging.info(f'read timing {self.ft} url data record:{len(tvs)}')
     logging.info(f'start save {self.ft} timing csv data to mysql db')
     for line in tvs:
         try:
             tv = dict(json.loads(line))
             tv_name = tv.get('tv_name')
             if not tv_name:
                 continue
             m_tv = self.db.find_one(self.tv_table_name,
                                     f" tv_name=%s ", tv_name)
             if m_tv:
                 # already present: only refresh the update_time
                 tv_id = m_tv.get('tv_id')
                 self.db.update_tv(self.tv_table_name,
                                   f" update_time=%s ",
                                   tv.get('update_time'), tv_id)
             else:
                 # new tv: insert the cleaned row
                 tv_id = tv.get('tv_id')
                 self.db.insert(self.tv_table_name,
                                CSV2MD.__build_tv(tv))
             # replace this tv's url rows wholesale
             self.db.delete(self.tv_urls_table_name, tv_id)
             urls = [u for u in list(tv.get('urls', []))
                     if u != ' ' and u != '\t']
             u_list = [{
                 'id': str(uuid.uuid4()),
                 'tv_id': tv_id,
                 'tv_url': str(u).replace(' ', '')
             } for u in urls]
             if u_list:
                 self.db.insert_many(self.tv_urls_table_name, u_list)
         except Exception as e:
             logging.error(e)
     logging.info(f'end save timing {self.ft} data to mysql db')