コード例 #1
0
ファイル: save2db.py プロジェクト: xwl5242/tvspider
 def __init__(self, ft):
     """Bind the feed type to its source file and target MySQL tables."""
     self.ft = ft
     # Feed type selects the spidered CSV/JSON-lines file to load.
     self.urls_file = config.TV_FS_FILE_MAP.get(ft)
     self.db = DB()
     # Main feed goes to t_tv/t_tv_urls, third-party feed to the *_3part twins.
     tv_tables = {
         config.TV_TYPE_MAIN: 't_tv',
         config.TV_TYPE_3PART: 't_tv_3part',
     }
     url_tables = {
         config.TV_TYPE_MAIN: 't_tv_urls',
         config.TV_TYPE_3PART: 't_tv_urls_3part',
     }
     self.tv_table_name = tv_tables.get(ft)
     self.tv_urls_table_name = url_tables.get(ft)
コード例 #2
0
class FetchTVAll:
    """Merge rows from t_tv and t_tv_3part into the aggregate t_tv_all table."""

    def __init__(self):
        # Project DB wrapper; presumably opens the MySQL connection — TODO confirm.
        self.db = DB()

    def update(self):
        """Upsert the union of both TV tables into t_tv_all.

        Rows are grouped by tv_name (group_concat collects the duplicate
        tv_ids); on a duplicate key only update_time is refreshed, and
        img_save is seeded to '0' for freshly inserted rows.
        """
        # Plain strings: the original used f-strings with no placeholders
        # (lint F541); the runtime SQL text is byte-identical.
        sql = ("insert into t_tv_all(tv_id,tv_ids,tv_name,tv_actors,tv_director,tv_type,tv_area,"
               "tv_lang,tv_year,tv_img,tv_intro,update_time,img_save) select B.tv_id,B.tv_ids,B.tv_name,"
               "B.tv_actors,B.tv_director,B.tv_type,B.tv_area,B.tv_lang,B.tv_year,B.tv_img,B.tv_intro,B.update_time,"
               "'0' as img_save from(select A.tv_id,group_concat(A.tv_id) tv_ids,A.tv_name,A.tv_img,A.tv_actors,"
               "A.tv_director,A.tv_type,A.tv_area,A.tv_lang,A.tv_year,A.tv_intro,A.update_time from "
               "(select * from t_tv union select * from t_tv_3part) A group by A.tv_name) B on duplicate "
               "key update update_time=B.update_time")

        self.db.execute(sql)
コード例 #3
0
ファイル: topspider.py プロジェクト: xwl5242/tvspider
class TopSpider:
    """Scrape Tencent Video channel pages for the featured-slider entries."""

    def __init__(self):
        self.db = DB()
        # Channel landing pages to scrape, keyed by internal tv_type code.
        self.tv_type_url_map = {
            'mv': [
                'https://v.qq.com/channel/movie',
                'https://v.qq.com/channel/doco'
            ],
            'dsj': ['https://v.qq.com/channel/tv'],
            'zy': ['https://v.qq.com/channel/variety'],
            'dm': [
                'https://v.qq.com/channel/cartoon',
                'https://v.qq.com/channel/child'
            ],
            'banner': ['https://v.qq.com']
        }
        # (name, image-url) pairs produced by the most recent parse_top().
        self.tops = []

    def parse_top(self, html):
        """Extract (title, background-image URL) pairs from a channel page.

        Results are stored in self.tops; nothing is returned.
        """
        # Reset first: previously a failed/empty fetch left the prior
        # page's entries in self.tops and they were inserted again.
        self.tops = []
        if html and isinstance(html, str):
            root = etree.HTML(html)
            names = root.xpath(
                "//div[starts-with(@class,'site_slider ')]/div[2]//a/span/text()"
            )
            imgs = root.xpath(
                "//div[starts-with(@class,'site_slider ')]/div[2]//a/@data-bgimage"
            )
            # zip() pairs only as many entries as both lists provide,
            # avoiding the IndexError the original img[i] lookup risked
            # when the two xpath result lists differ in length.
            self.tops = list(zip(names, imgs))

    def fetch_top(self, tv_type):
        """Refresh the t_b_t banner table for one tv_type.

        Best-effort: any exception is printed and swallowed, matching the
        original behaviour.
        """
        try:
            urls = self.tv_type_url_map.get(tv_type, '')
            if urls:
                # Replace-all semantics: clear old rows for this type first.
                self.db.delete_by_tv_type('t_b_t', tv_type)
                for u in urls:
                    r = requests.get(
                        u,
                        headers={'User-Agent': random.choice(config.UAS)},
                        timeout=30)  # avoid hanging forever on a dead host
                    self.parse_top(r.content.decode('utf-8'))
                    for top in self.tops:
                        # Skip the generic "everyone is watching" slot.
                        if top[0] != '大家在看':
                            tv_banner = {
                                'id': str(uuid.uuid4()),
                                'tv_type': tv_type,
                                'tv_name': top[0],
                                'tv_img': top[1],
                            }
                            self.db.insert('t_b_t', tv_banner)
        except Exception as e:
            print(repr(e))
コード例 #4
0
ファイル: topspider.py プロジェクト: xwl5242/tvspider
 def __init__(self):
     """Create the DB handle and the tv_type -> channel-URL lookup table."""
     self.db = DB()
     base = 'https://v.qq.com'
     # Channel pages scraped per tv_type code; 'banner' uses the home page.
     self.tv_type_url_map = {
         'mv': [base + '/channel/movie', base + '/channel/doco'],
         'dsj': [base + '/channel/tv'],
         'zy': [base + '/channel/variety'],
         'dm': [base + '/channel/cartoon', base + '/channel/child'],
         'banner': [base],
     }
     # Filled by parse_top() with (name, image-url) pairs.
     self.tops = []
コード例 #5
0
ファイル: imgspider.py プロジェクト: xwl5242/tvspider
class ImgSpider:
    """Download cover images for t_tv_all rows still flagged img_save='0'."""

    def __init__(self):
        self.db = DB()

    async def fetch_html(self, url, tv_id):
        """Fetch *url* and return (raw bytes, tv_id).

        Returns None implicitly when any error occurs (logged, swallowed).
        """
        try:
            async with aiohttp.ClientSession() as session:
                headers = {'User-Agent': random.choice(config.UAS)}
                async with session.get(url, headers=headers,
                                       verify_ssl=False) as response:
                    return await response.read(), tv_id
        except Exception as e:
            logging.error(repr(e))

    def parse_content(self, resp):
        """Task done-callback: persist the image and flip img_save to '1'."""
        result = resp.result()
        # result is None when fetch_html swallowed an exception.
        if result and len(result) == 2:
            content, tv_id = result
            # NOTE(review): target directory is hard-coded; assumes
            # /home/imgs exists and is writable — confirm on deploy.
            with open(f'/home/imgs/{tv_id}.jpg', 'wb') as f:
                f.write(content)
            self.db.update_tv('t_tv_all', " img_save=%s ", '1', tv_id)

    def batch_(self):
        """Download every pending image in batches of 50."""
        tvs = self.db.find_all('t_tv_all', " img_save=%s ", '0')
        tvs = [(tv['tv_id'], tv['tv_img']) for tv in tvs]
        # Stepped slicing replaces the old int(len/50)+1 arithmetic, which
        # scheduled a spurious empty batch whenever len(tvs) % 50 == 0
        # (including the empty-result case).
        for start in range(0, len(tvs), 50):
            self.download(tvs[start:start + 50])

    def download(self, tvs):
        """Fetch one batch of (tv_id, tv_img) pairs concurrently."""
        loop = asyncio.get_event_loop()
        tasks = []
        for tid, url in tvs:
            if url:  # skip rows with no image URL
                task = asyncio.ensure_future(self.fetch_html(url, tid))
                task.add_done_callback(self.parse_content)
                tasks.append(task)
        loop.run_until_complete(asyncio.gather(*tasks))
コード例 #6
0
ファイル: save2db.py プロジェクト: xwl5242/tvspider
class CSV2MD:
    """Load spidered TV records (one JSON object per text line) into MySQL.

    ``ft`` selects the feed: main feed -> t_tv / t_tv_urls, third-party
    feed -> t_tv_3part / t_tv_urls_3part.
    """

    def __init__(self, ft):
        # Feed type; keys both the source file and the target-table lookups.
        self.ft = ft
        self.urls_file = config.TV_FS_FILE_MAP.get(ft)
        self.db = DB()
        self.tv_table_name = {
            config.TV_TYPE_MAIN: 't_tv',
            config.TV_TYPE_3PART: 't_tv_3part'
        }.get(ft)
        self.tv_urls_table_name = {
            config.TV_TYPE_MAIN: 't_tv_urls',
            config.TV_TYPE_3PART: 't_tv_urls_3part'
        }.get(ft)

    @staticmethod
    def __build_tv(tv_json):
        """Copy every field except 'urls', normalising tv_intro and tv_name.

        Returns the cleaned dict, or None (implicitly) if anything raised.
        """
        try:
            tv_o = {}
            tv_json = dict(tv_json)
            for k in tv_json.keys():
                if k != 'urls':
                    if k == 'tv_intro':
                        v = tv_json.get(k)
                        # Strip whitespace and cap the synopsis at 2000 chars.
                        v = str(v).replace(' ', '').replace('\t', '')
                        v = v[:2000] + '...' if len(v) > 2000 else v
                    elif k == 'tv_name':
                        v = tv_json.get(k)
                        # Remove punctuation (note: the pairs of '~'/'?' are
                        # full-width and ASCII variants) that breaks lookups.
                        v = str(v).replace(' ', '').replace('~', '')\
                            .replace('~', '').replace('[T]', '').replace('?', '').replace('?', '').replace('·', '')
                    else:
                        v = tv_json.get(k)
                    tv_o[k] = v
            return tv_o
        except Exception as e:
            logging.error(repr(e))

    @staticmethod
    def __build_urls(tv_id, url_list):
        """Build row dicts for the tv-urls table from a list of raw URLs.

        Returns a list (possibly empty), or None (implicitly) on error.
        """
        try:
            u_list = []
            if url_list and len(url_list) > 0:
                for u in url_list:
                    tv_url = dict()
                    tv_url['id'] = str(uuid.uuid4())
                    tv_url['tv_id'] = tv_id
                    tv_url['tv_url'] = str(u).replace(' ', '')
                    u_list.append(tv_url)
            return u_list
        except Exception as e:
            logging.error(repr(e))

    def insert_tv(self, tv):
        """Parse one JSON line and insert the TV row plus its URL rows.

        Failed lines are appended to error.txt and logged, not re-raised.
        """
        try:
            tv_json = dict(json.loads(tv))
            tv_name = tv_json['tv_name']
            if tv_name:
                urls = list(tv_json.get('urls', []))
                # Drop placeholder whitespace-only entries.
                urls = [u for u in urls if u != ' ' and u != '\t']
                to = CSV2MD.__build_tv(tv_json)
                uo = CSV2MD.__build_urls(tv_json.get('tv_id'), urls)
                self.db.insert(self.tv_table_name, to)
                if uo and len(uo) > 0:
                    self.db.insert_many(self.tv_urls_table_name, uo)
        except Exception as e:
            # Keep the raw line so the bad record can be replayed later.
            with open('error.txt', 'a', encoding='utf-8') as f:
                f.write(tv)
            logging.error(e)

    def save_init(self):
        """Bulk-load the whole source file using a 25-worker thread pool.

        :return: None
        """
        with open(self.urls_file, 'r', encoding='GB18030') as ff:
            tvs = ff.readlines()
        logging.info(f'read init tv_url data record:{len(tvs)}')
        logging.info(f'start save init {self.ft} data to mysql db')
        try:
            with ThreadPoolExecutor(max_workers=25) as e:
                e.map(self.insert_tv, tvs)
        except Exception as e:
            logging.error(e)
        logging.info(f'end save init {self.ft} data to mysql db')

    def save_timing(self):
        """Incremental load: upsert TV rows and rebuild their URL rows."""
        with open(self.urls_file, 'r', encoding='GB18030') as ff:
            tvs = ff.readlines()
        logging.info(f'read timing {self.ft} url data record:{len(tvs)}')
        logging.info(f'start save {self.ft} timing csv data to mysql db')
        try:
            if tvs and len(tvs) > 0:
                for tv in tvs:
                    tv = dict(json.loads(tv))
                    tv_name = tv.get('tv_name')
                    if tv_name:
                        m_tv = self.db.find_one(self.tv_table_name,
                                                f" tv_name=%s ", tv_name)
                        if m_tv:
                            # Record already exists: refresh update_time only.
                            tv_id = m_tv.get('tv_id')
                            self.db.update_tv(self.tv_table_name,
                                              f" update_time=%s ",
                                              tv.get('update_time'), tv_id)
                        else:
                            # New record: insert the full cleaned row.
                            tv_id = tv.get('tv_id')
                            self.db.insert(self.tv_table_name,
                                           CSV2MD.__build_tv(tv))
                        # URL rows are always rebuilt from scratch.
                        self.db.delete(self.tv_urls_table_name, tv_id)
                        # NOTE(review): tv.get('urls') may be None if the key
                        # is absent; list(None) raises and is caught below.
                        urls = list(tv.get('urls'))
                        urls = [u for u in urls if u != ' ' and u != '\t']
                        u_list = []
                        for u in urls:
                            u_list.append({
                                'id': str(uuid.uuid4()),
                                'tv_id': tv_id,
                                'tv_url': str(u).replace(' ', '')
                            })
                        self.db.insert_many(self.tv_urls_table_name, u_list)
        except Exception as e:
            logging.error(e)
        logging.info(f'end save timing {self.ft} data to mysql db')
コード例 #7
0
ファイル: imgspider.py プロジェクト: xwl5242/tvspider
 def __init__(self):
     """Create the project DB wrapper used for all reads and updates."""
     self.db = DB()