Beispiel #1
0
 def gen_tasks(self, urls):
     """Turn ``urls`` into a list of ``{'url': ..., 'name': ...}`` task dicts.

     An entry is dropped (and ``self.result['skip']`` is incremented) when
     ``self.overwrite`` is falsy and ``helper.is_file_ok`` accepts its name.

     :param urls: either a list of urls (the name is the text after the
         last ``/``) or a dict mapping ``name -> url``
     :type urls: list | dict
     :return: the tasks that still have to be processed
     :rtype: list
     :raises CrawlerParamsError: when ``urls`` is neither a list nor a dict
     """
     if isinstance(urls, list):
         # lazily derive the name from the last path segment of every url
         pairs = ((link.split('/')[-1], link) for link in urls)
     elif isinstance(urls, dict):
         # {k: v} => k is name, v is url value
         pairs = iter(urls.items())
     else:
         raise CrawlerParamsError('urls should be list/dict')

     pending = []
     for file_name, link in pairs:
         task = {'url': link, 'name': file_name}
         # keep the task when overwriting is on or no usable file exists
         if self.overwrite or not helper.is_file_ok(file_name):
             pending.append(task)
         else:
             self.result['skip'] += 1
     return pending
Beispiel #2
0
    def update_song_meta(self, name, dat=None):
        """Update the ID3 text frames and the ``APIC:cover`` picture of ``name``.

        :param name: path of the file opened with ``ID3``
        :type name: str
        :param dat: optional values keyed by ``TIT2`` / ``TALB`` / ``TPE1``
            (frame text) and ``APIC`` (path of an image file to embed)
        :type dat: dict
        :return: nothing; progress is reported through ``self.CP``
        :rtype: None
        """
        meta = dat or {}
        audio = ID3(name)
        # is_file_ok's result is used as a number below -- presumably the
        # file size in bytes; confirm against helper
        size_before = helper.is_file_ok(name)

        for tag in ('TIT2', 'TALB', 'TPE1'):
            wanted = meta.get(tag)
            if wanted and wanted != audio.get(tag):
                frame_cls = getattr(id3, tag)
                audio.add(frame_cls(encoding=Encoding.UTF16, text=meta[tag]))

        # only embed a picture when the file has none yet and one was supplied
        if not audio.get('APIC:cover') and meta.get('APIC'):
            self.CP.G('update album picture')
            with open(meta.get('APIC'), 'rb') as picture:
                cover_raw = picture.read()
            if cover_raw:
                audio.add(APIC(encoding=Encoding.UTF16, mime="image/jpeg",
                               desc="cover", type=PictureType.COVER_FRONT,
                               data=cover_raw))

        audio.save()

        rule = '-' * 32
        self.CP.C(rule)
        size_after = helper.is_file_ok(name)
        for key, value in audio.items():
            # picture frames are skipped when listing the stored tags
            if 'APIC' in key:
                continue
            self.CP.W(key, value)
        self.CP.G('update done: size from {} to {}, pic took {}'.format(
            size_before, size_after, size_after - size_before))
        self.CP.C(rule)
Beispiel #3
0
    def sync_save(self, res, overwrite=False):
        """Fetch ``res['url']`` synchronously and store it as ``res['name']``.

        :param res: {'url': '', 'name': ''}
        :type res: dict
        :param overwrite: when falsy, a file accepted by
            ``helper.is_file_ok`` is kept and nothing is fetched
        :type overwrite: bool
        :return: ``True`` when the file was kept or written, ``False`` when
            ``self.do_sess_get`` returned nothing
        :rtype: bool
        :raises CrawlerParamsError: when ``res`` is not a dict, or when its
            ``url`` or ``name`` is missing/empty
        """
        if not isinstance(res, dict):
            raise CrawlerParamsError('res must be dict')

        source = res.get('url', '')
        target = res.get('name', '')
        if not (source and target):
            raise CrawlerParamsError('url&name is needed!')

        # a usable local copy is good enough unless overwriting is requested
        must_fetch = overwrite or not helper.is_file_ok(target)
        if not must_fetch:
            return True

        payload = self.do_sess_get(source)
        if not payload:
            # fetching failed or produced an empty body
            return False

        with open(target, 'wb') as out:
            out.write(payload)
        zlog.debug('Sync Done {}'.format(target))
        return True
Beispiel #4
0
    def download_image(self, params):
        """
            Download the image described by ``params`` below ``self.base_dir``.

            The target is ``<base_dir>/<folder>/<title>-<NN>.jpg`` where
            ``folder`` comes from ``gen_fd_from_image_url`` and ``NN`` is the
            url's last path segment (up to the first dot) zero-padded to two
            characters.  Nothing happens when ``helper.is_file_ok`` already
            accepts the target or when ``self.crawl`` returns nothing.
        :param params: needs the keys ``img_url`` and ``title``
        :type params: dict
        :return: always ``None``
        :rtype: None
        """
        img_url, title = params.get('img_url'), params.get('title')

        stem = img_url.split('/')[-1].split('.')[0].zfill(2)

        folder = gen_fd_from_image_url(img_url)
        relative = '{}/{}.jpg'.format(folder, stem)
        target = '{}/{}/{}-{}.jpg'.format(self.base_dir, folder, title, stem)

        if helper.is_file_ok(target):
            return

        if not os.path.exists(target):
            helper.mkdir_p(target)
        else:
            # the path exists but was not accepted above, so fetch it again
            log.debug('RETRY: ({})'.format(relative))

        payload = self.crawl(img_url)
        if payload:
            with open(target, 'wb') as out:
                out.write(payload)
Beispiel #5
0
 def load_cookies(ck_pth):
     """Load the LWP cookie file ``ck_pth`` into a fresh requests cookie jar.

     :param ck_pth: path of the cookie file
     :type ck_pth: str
     :return: a cookie jar rebuilt from the stored name/value pairs, or
         ``None`` when ``helper.is_file_ok`` rejects ``ck_pth``
     """
     if helper.is_file_ok(ck_pth):
         stored = cookiejar.LWPCookieJar(ck_pth)
         # expired and session-only cookies are loaded as well
         stored.load(ck_pth, ignore_expires=True, ignore_discard=True)
         as_dict = requests.utils.dict_from_cookiejar(stored)
         return requests.utils.cookiejar_from_dict(as_dict)
     return None
Beispiel #6
0
    def is_file_id3_ok(self, song_name):
        """Check the stored mp3 for ``song_name`` and report its picture flag.

        :param song_name: file name inside ``self._music_dir``; ``.mp3`` is
            appended when it is missing
        :type song_name: str
        :return: ``(has_pic, song_pth)`` as reported by
            ``self._song_metas.get_song_meta``, or ``(False, None)`` when
            ``helper.is_file_ok`` rejects the file
        :rtype: tuple
        """
        song_pth = os.path.join(self._music_dir, song_name)
        if not song_pth.endswith('.mp3'):
            song_pth = song_pth + '.mp3'

        if not helper.is_file_ok(song_pth):
            return False, None

        # the second element of the meta result is not needed here
        has_pic, _ = self._song_metas.get_song_meta(song_pth)
        return has_pic, song_pth
Beispiel #7
0
 def _download(self, src, save_to):
     """Download ``src`` to ``save_to`` with ``wget``.

     :param src: url handed to ``wget.download``
     :param save_to: destination path
     :type save_to: str
     :return: ``save_to`` when the file was already there (and
         ``self._override`` is falsy) or was downloaded; ``''`` when the
         download raised
     :rtype: str
     """
     override = self._override
     if helper.is_file_ok(save_to):
         if not override:
             zlog.info('{} is downloaded.'.format(save_to))
             return save_to
         # overriding: drop the old copy before fetching again
         zlog.info('force remove exist file: ({})'.format(
             helper.C.format(save_to)))
         os.remove(save_to)

     zlog.debug('try get {}'.format(save_to))
     try:
         wget.download(src, out=save_to)
         # wget output end without new line
         print()
         short_name = save_to.split('/')[-1]
         zlog.info('downloaded {}'.format(helper.G.format(short_name)))
         return save_to
     except Exception as err:
         # best effort: any failure is logged and reported as ''
         zlog.error('Download {}({}) Failed: {}'.format(
             save_to.split('/')[-1], src, err))
         return ''
Beispiel #8
0
    def download_and_save(self, params, force_write=False):
        """Fetch ``params['img_url']`` and write it to the path ``params['title']``.

        :param params: needs the keys ``img_url`` and ``title``
        :type params: dict
        :param force_write: when truthy, fetch even if
            ``helper.is_file_ok`` accepts the existing file
        :type force_write: bool
        :return: ``self.save_status['skip']`` when the file was kept,
            ``self.save_status['fail']`` when ``self.crawl`` returned
            nothing, ``self.save_status['ok']`` after writing
        """
        img_url = params.get('img_url')
        title = params.get('title')

        keep_existing = not force_write
        if keep_existing and helper.is_file_ok(title):
            return self.save_status['skip']

        payload = self.crawl(img_url)
        if payload:
            with open(title, 'wb') as out:
                out.write(payload)
            return self.save_status['ok']
        return self.save_status['fail']
Beispiel #9
0
    def spawn(self, url, force_spawn=False):
        """Prepare the per-site cache for ``url`` and fetch its cookies.

        - set ``cache['site_dir']`` to ``<base>/<self.domain>``
        - create ``cache['site_raw']`` and ``cache['site_media']`` below it
        - reuse the cookies in ``<site_dir>/cookie.txt`` when the cached
          ``homepage`` file is usable and ``force_spawn`` is falsy
        - otherwise request ``url``, store the body as ``homepage``, replace
          every entry of ``self.headers`` with the sent request headers and
          dump the session cookies to ``cookie.txt``

        :param url: page used to derive the domain and to fetch cookies
        :type url: str
        :param force_spawn: request ``url`` even if a cached homepage exists
        :type force_spawn: bool
        :return: ``False`` when no home url could be derived or the response
            status is not 200, otherwise ``True``
        :rtype: bool
        """
        # NOTE(review): the unpacked domain is unused; self.domain is read
        # below -- presumably set by get_domain_home_from_url, confirm.
        home_url, _domain = self.get_domain_home_from_url(url)
        if not home_url:
            return False

        cache = self.cache
        cache['site_dir'] = os.path.join(cache['base'], self.domain)
        for kind in ('raw', 'media'):
            slot = 'site_' + kind
            cache[slot] = os.path.join(cache['site_dir'], kind)
            helper.mkdir_p(cache[slot], True)

        cookie_file = os.path.join(cache['site_dir'], 'cookie.txt')
        helper.mkdir_p(cookie_file)

        homepage = os.path.join(cache['site_raw'], 'homepage')
        # not force spawn and file ok
        if not force_spawn and helper.is_file_ok(homepage):
            self.sess.cookies = self.load_cookies(cookie_file)
            return True
        zlog.debug('{} not exist!'.format(homepage))

        res = self.sess.get(url, headers=self.__header__)
        if res.status_code != 200:
            return False
        if res:
            helper.write_file(res.content, homepage)

        # every header slot is replaced by the headers that were sent
        for slot, _ in self.headers.items():
            self.headers[slot] = res.request.headers

        self.dump_cookies(cookies=self.sess.cookies, save_to=cookie_file)

        return True
Beispiel #10
0
    def download_and_save(self, params, force_write=False):
        """Fetch ``params['img_url']`` and store it as ``<title>-<NN>.jpg``.

        ``NN`` is the url's last path segment up to the first dot,
        zero-padded to two characters.

        :param params: needs the keys ``img_url`` and ``title``
        :type params: dict
        :param force_write: when truthy, fetch even if
            ``helper.is_file_ok`` accepts the existing file
        :type force_write: bool
        :return: ``0`` when the file was kept, ``1`` when ``self.crawl``
            returned nothing, ``3`` after writing the file
        :rtype: int
        """
        img_url, title = params.get('img_url'), params.get('title')

        # zero-pad names shorter than two characters (e.g. 7 -> 07)
        stem = img_url.split('/')[-1].split('.')[0].zfill(2)
        filename = '{}-{}.jpg'.format(title, stem)

        if not force_write and helper.is_file_ok(filename):
            return 0

        payload = self.crawl(img_url)
        if payload:
            with open(filename, 'wb') as out:
                out.write(payload)
            return 3
        return 1
Beispiel #11
0
        def down(lec, which_type):
            """Download the ``which_type`` resource of the lecture ``lec``.

            The file goes to ``<base_pth>/<which_type>/<name>.<which_type>``
            and is skipped when ``helper.is_file_ok`` already accepts it.

            :param lec: needs ``name`` plus a url under the ``which_type`` key
            :type lec: dict
            :param which_type: ``'mp3'`` or ``'doc'``; anything else is
                logged and ignored
            :type which_type: str
            """
            if which_type not in ('mp3', 'doc'):
                log.warn('Not Supported of ({})'.format(which_type))
                return

            file_name = '{}.{}'.format(lec['name'], which_type)
            save_name = os.path.join(
                self.catalog_info['base_pth'], which_type, file_name)
            helper.mkdir_p(save_name)
            if helper.is_file_ok(save_name):
                print()
                log.debug('SKIP {}'.format(save_name))
                return

            # wait 2s by default; mp3 downloads wait for abc.randint(3, 6)
            # seconds instead (presumably a random delay -- confirm `abc`)
            delay = abc.randint(3, 6) if which_type == 'mp3' else 2

            url = lec[which_type]
            log.debug('[WAIT] {}s for ({}:{})...'.format(delay, save_name, url))
            time.sleep(delay)
            wget.download(url, out=save_name)
Beispiel #12
0
    def __init__(self, pth, dat=None, enable_default_log=True):
        """Initialise the configuration backed by the file ``pth``.

        - when ``helper.is_file_ok`` rejects ``pth``, an empty file is
          written there first
        - the file is then opened and read through ``profig.Config``

        :param dat: ``dictionary`` of custom values passed to ``__do_init``
        :type dat: dict
        :param pth: ``path where the file is stored``
        :type pth: str
        :param enable_default_log: ``whether to set up the default log options``
        :type enable_default_log: bool
        """
        try:
            if not helper.is_file_ok(pth):
                helper.write_file('', pth)
            folder, file_name = os.path.split(pth)
            self._pth = folder
            # config name = file name up to its first dot
            self._cfg_name = file_name.partition('.')[0]
        except Exception:
            # fall back to fixed defaults when the path cannot be prepared
            self._pth = '/tmp'
            self._cfg_name = 'izen'

        self.cfg = profig.Config(pth, encoding='utf-8')

        # read the configuration
        self.cfg.read()

        # set up the default log fields
        if enable_default_log:
            self.__spawn()

        # set up the caller-supplied dictionary
        if dat:
            self.__do_init(dat)

        # when the config file does not exist, the values initialised in
        # memory above are synced to disk
        expanded = os.path.expanduser(pth)
        if not os.path.exists(expanded):
            self.cfg.sync()
Beispiel #13
0
 def load_from_cache(name):
     """Return the content of the cached file ``name``.

     :param name: path of the cached file
     :return: whatever ``helper.read_file`` yields, or ``''`` when
         ``helper.is_file_ok`` rejects ``name``
     """
     return helper.read_file(name) if helper.is_file_ok(name) else ''
Beispiel #14
0
 def load_chapters(self):
     """Load ``self.da2017`` from the JSON file at ``catalog_info['cache_file']``.

     Leaves ``self.da2017`` untouched when ``helper.is_file_ok`` rejects
     the cache file.
     """
     cache_file = self.catalog_info['cache_file']
     if not helper.is_file_ok(cache_file):
         return
     raw = helper.read_file(cache_file)
     self.da2017 = json.loads(helper.to_str(raw))
Beispiel #15
0
 def get_size(self, name):
     """Return the size of ``name`` inside ``self._music_dir``, scaled by 1024 twice.

     :param name: file name below ``self._music_dir``
     :return: ``helper.is_file_ok``'s result divided by 1024 * 1024 and
         rounded to two decimals (presumably bytes -> MiB; confirm the
         helper returns a byte count)
     :rtype: float
     """
     song_path = self._music_dir + '/{}'.format(name)
     raw_size = helper.is_file_ok(song_path)
     scaled = raw_size / 1024 / 1024
     return round(scaled, 2)