import asyncio
import json
import logging
import random
import uuid
from concurrent.futures import ThreadPoolExecutor

import aiohttp
import requests
from lxml import etree

import config
from db import DB  # assumption: the DB wrapper lives in a local db module
class FetchTVAll:
    """Merge t_tv and t_tv_3part into the aggregate table t_tv_all."""

    def __init__(self):
        self.db = DB()

    def update(self):
        # Union both source tables, group rows by tv_name, then upsert the
        # result into t_tv_all (refreshing update_time on duplicates).
        sql = (
            "insert into t_tv_all(tv_id,tv_ids,tv_name,tv_actors,tv_director,tv_type,tv_area,"
            "tv_lang,tv_year,tv_img,tv_intro,update_time,img_save) "
            "select B.tv_id,B.tv_ids,B.tv_name,B.tv_actors,B.tv_director,B.tv_type,B.tv_area,"
            "B.tv_lang,B.tv_year,B.tv_img,B.tv_intro,B.update_time,'0' as img_save "
            "from (select A.tv_id,group_concat(A.tv_id) tv_ids,A.tv_name,A.tv_img,A.tv_actors,"
            "A.tv_director,A.tv_type,A.tv_area,A.tv_lang,A.tv_year,A.tv_intro,A.update_time "
            "from (select * from t_tv union select * from t_tv_3part) A group by A.tv_name) B "
            "on duplicate key update update_time=B.update_time"
        )
        self.db.execute(sql)
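# Minimal usage sketch (not part of the original module): run the merge once.
# Assumes DB() picks up its MySQL settings from config and that t_tv,
# t_tv_3part and t_tv_all already exist, with a unique key on t_tv_all so the
# "on duplicate key" clause can fire.
def run_fetch_tv_all():
    FetchTVAll().update()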
class TopSpider:
    """Scrape the carousel ("top") banners from Tencent Video channel pages."""

    def __init__(self):
        self.db = DB()
        self.tv_type_url_map = {
            'mv': [
                'https://v.qq.com/channel/movie',
                'https://v.qq.com/channel/doco'
            ],
            'dsj': ['https://v.qq.com/channel/tv'],
            'zy': ['https://v.qq.com/channel/variety'],
            'dm': [
                'https://v.qq.com/channel/cartoon',
                'https://v.qq.com/channel/child'
            ],
            'banner': ['https://v.qq.com']
        }
        self.tops = []

    def parse_top(self, html):
        # Pull the slide titles and background images out of the page carousel.
        if html and isinstance(html, str):
            root = etree.HTML(html)
            name = root.xpath(
                "//div[starts-with(@class,'site_slider ')]/div[2]//a/span/text()")
            img = root.xpath(
                "//div[starts-with(@class,'site_slider ')]/div[2]//a/@data-bgimage")
            # zip() pairs names with images and tolerates unequal list lengths
            self.tops = list(zip(name, img))

    def fetch_top(self, tv_type):
        try:
            urls = self.tv_type_url_map.get(tv_type, [])
            if urls:
                self.db.delete_by_tv_type('t_b_t', tv_type)
                for u in urls:
                    r = requests.get(
                        u, headers={'User-Agent': random.choice(config.UAS)})
                    self.parse_top(r.content.decode('utf-8'))
                    for top in self.tops:
                        # skip the generic "大家在看" ("everyone is watching") slide
                        if top[0] != '大家在看':
                            tv_banner = {
                                'id': str(uuid.uuid4()),
                                'tv_type': tv_type,
                                'tv_name': top[0],
                                'tv_img': top[1]
                            }
                            self.db.insert('t_b_t', tv_banner)
        except Exception as e:
            logging.error(repr(e))
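# Minimal usage sketch (an assumption, not in the original module): rebuild the
# banner table t_b_t for every channel key defined in tv_type_url_map.
def run_top_spider():
    spider = TopSpider()
    for tv_type in spider.tv_type_url_map:
        spider.fetch_top(tv_type)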
class ImgSpider:
    """Asynchronously download cover images for rows in t_tv_all."""

    def __init__(self):
        self.db = DB()

    async def fetch_html(self, url, tv_id):
        """Fetch one image; returns (bytes, tv_id), or None on failure."""
        try:
            async with aiohttp.ClientSession() as session:
                headers = {'User-Agent': random.choice(config.UAS)}
                async with session.get(url, headers=headers,
                                       verify_ssl=False) as response:
                    return await response.read(), tv_id
        except Exception as e:
            logging.error(repr(e))

    def parse_content(self, resp):
        # Done-callback: persist the image and flag the row as saved.
        result = resp.result() if resp else None
        if result and len(result) == 2:
            content, tv_id = result
            with open(f'/home/imgs/{tv_id}.jpg', 'wb') as f:
                f.write(content)
            self.db.update_tv('t_tv_all', " img_save=%s ", '1', tv_id)

    def batch_(self):
        """Download every unsaved cover image in chunks of 50."""
        tvs = self.db.find_all('t_tv_all', " img_save=%s ", '0')
        tvs = [(tv['tv_id'], tv['tv_img']) for tv in tvs]
        batch = int(len(tvs) / 50) + 1
        for i in range(batch):
            end = len(tvs) if i == (batch - 1) else (i + 1) * 50
            self.download(tvs[i * 50:end])

    def download(self, tvs):
        loop = asyncio.get_event_loop()
        tasks = []
        for tv_id, url in tvs:
            if url:
                task = asyncio.ensure_future(self.fetch_html(url, tv_id))
                task.add_done_callback(self.parse_content)
                tasks.append(task)
        loop.run_until_complete(asyncio.gather(*tasks))
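# Minimal usage sketch (an assumption, not in the original module). batch_()
# re-queries t_tv_all for rows with img_save='0' and writes each cover to
# /home/imgs/<tv_id>.jpg, so that directory must exist and be writable.
def run_img_spider():
    ImgSpider().batch_()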
class CSV2MD:
    """Load crawled TV records (one JSON object per line) into MySQL."""

    def __init__(self, ft):
        self.ft = ft
        self.urls_file = config.TV_FS_FILE_MAP.get(ft)
        self.db = DB()
        self.tv_table_name = {
            config.TV_TYPE_MAIN: 't_tv',
            config.TV_TYPE_3PART: 't_tv_3part'
        }.get(ft)
        self.tv_urls_table_name = {
            config.TV_TYPE_MAIN: 't_tv_urls',
            config.TV_TYPE_3PART: 't_tv_urls_3part'
        }.get(ft)

    @staticmethod
    def __build_tv(tv_json):
        # Copy every field except 'urls', normalizing the intro and the name.
        try:
            tv_o = {}
            tv_json = dict(tv_json)
            for k in tv_json.keys():
                if k != 'urls':
                    if k == 'tv_intro':
                        v = str(tv_json.get(k)).replace(' ', '').replace('\t', '')
                        v = v[:2000] + '...' if len(v) > 2000 else v
                    elif k == 'tv_name':
                        # the duplicated '~'/'?' pairs in the source presumably
                        # targeted the full-width variants (～, ？) as well
                        v = str(tv_json.get(k)).replace(' ', '') \
                            .replace('～', '').replace('~', '').replace('[T]', '') \
                            .replace('？', '').replace('?', '').replace('·', '')
                    else:
                        v = tv_json.get(k)
                    tv_o[k] = v
            return tv_o
        except Exception as e:
            logging.error(repr(e))

    @staticmethod
    def __build_urls(tv_id, url_list):
        # Turn the raw URL list into rows for the urls table.
        try:
            u_list = []
            if url_list:
                for u in url_list:
                    u_list.append({
                        'id': str(uuid.uuid4()),
                        'tv_id': tv_id,
                        'tv_url': str(u).replace(' ', '')
                    })
            return u_list
        except Exception as e:
            logging.error(repr(e))

    def insert_tv(self, tv):
        try:
            tv_json = dict(json.loads(tv))
            tv_name = tv_json['tv_name']
            if tv_name:
                urls = list(tv_json.get('urls', []))
                urls = [u for u in urls if u != ' ' and u != '\t']
                to = CSV2MD.__build_tv(tv_json)
                uo = CSV2MD.__build_urls(tv_json.get('tv_id'), urls)
                self.db.insert(self.tv_table_name, to)
                if uo:
                    self.db.insert_many(self.tv_urls_table_name, uo)
        except Exception as e:
            # keep the offending line around for later inspection
            with open('error.txt', 'a', encoding='utf-8') as f:
                f.write(tv)
            logging.error(e)

    def save_init(self):
        """Bulk-load a fresh export: every line is inserted as a new row."""
        with open(self.urls_file, 'r', encoding='GB18030') as ff:
            tvs = ff.readlines()
        logging.info(f'read init tv_url data record:{len(tvs)}')
        logging.info(f'start save init {self.ft} data to mysql db')
        try:
            with ThreadPoolExecutor(max_workers=25) as e:
                e.map(self.insert_tv, tvs)
        except Exception as e:
            logging.error(e)
        logging.info(f'end save init {self.ft} data to mysql db')

    def save_timing(self):
        """Incremental load: refresh existing rows (matched by tv_name),
        insert new ones, and rebuild each show's URL list."""
        with open(self.urls_file, 'r', encoding='GB18030') as ff:
            tvs = ff.readlines()
        logging.info(f'read timing {self.ft} url data record:{len(tvs)}')
        logging.info(f'start save {self.ft} timing csv data to mysql db')
        try:
            for tv in tvs:
                tv = dict(json.loads(tv))
                tv_name = tv.get('tv_name')
                if tv_name:
                    m_tv = self.db.find_one(self.tv_table_name,
                                            " tv_name=%s ", tv_name)
                    if m_tv:  # record already exists: refresh its timestamp
                        tv_id = m_tv.get('tv_id')
                        self.db.update_tv(self.tv_table_name, " update_time=%s ",
                                          tv.get('update_time'), tv_id)
                    else:  # record does not exist yet: insert it
                        tv_id = tv.get('tv_id')
                        self.db.insert(self.tv_table_name, CSV2MD.__build_tv(tv))
                    # rebuild the show's URL list wholesale
                    self.db.delete(self.tv_urls_table_name, tv_id)
                    urls = [u for u in list(tv.get('urls'))
                            if u != ' ' and u != '\t']
                    self.db.insert_many(self.tv_urls_table_name,
                                        CSV2MD.__build_urls(tv_id, urls))
        except Exception as e:
            logging.error(e)
        logging.info(f'end save timing {self.ft} data to mysql db')
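# Minimal usage sketch (an assumption, not in the original module). Pick one
# mode: save_init() bulk-loads a fresh export through a thread pool, while
# save_timing() applies an incremental file line by line. config.TV_TYPE_MAIN
# must map to a readable GB18030-encoded file in config.TV_FS_FILE_MAP.
def run_csv2md(initial=False):
    loader = CSV2MD(config.TV_TYPE_MAIN)
    if initial:
        loader.save_init()    # one-off bulk import
    else:
        loader.save_timing()  # periodic incremental refresh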