Example #1
0
    def load(self, url, use_cache=True, show_log=False, **kwargs):
        """Fetch the ``raw info`` for *url*, preferring the local cache.

        Looks up the cache entry mapped from the URL first; on a miss the
        content is fetched over the network and written back to the cache.

        :param url: the URL to fetch
        :param use_cache: when True, try the cache before the network
        :param show_log: when True, emit debug logs about the lookup
        :param kwargs: extra options forwarded to ``do_sess_get``
        :return: the ``raw info`` of the url ('' when nothing could be found)
        :rtype: ``str``
        """
        _name = self.map_url_to_cache_id(url)
        raw = ''
        hit = False

        if use_cache:
            raw = self.load_from_cache(_name)
            # Only count as a hit when the cache actually returned content.
            # The original set ``hit = True`` unconditionally, so a cache
            # miss that fell through to the network was logged as 'Cache'.
            hit = bool(raw)

        if not raw:
            if show_log:
                zlog.debug('from cache got nothing {}'.format(_name))
            raw = self.do_sess_get(url, **kwargs)
            if raw:
                helper.write_file(raw, _name)

        if show_log:
            zlog.debug('[{}:{:>8}] get {}'.format('Cache' if hit else 'Net',
                                                  len(raw), url))
        return raw
Example #2
0
    def load_post(self,
                  url,
                  data,
                  headers=None,
                  ret='json',
                  use_cache=True,
                  show_log=False):
        """POST *data* to *url*, caching the response locally.

        The cache key is derived from the URL plus the sorted form fields,
        so identical payloads always map to the same cache entry.

        :param url: endpoint to POST to
        :param data: dict of form fields (may be falsy)
        :param headers: optional extra request headers
        :param ret: expected response type; 'json' responses are serialized
            with ``json.dumps`` before being written to the cache
        :param use_cache: when True, try the cache before the network
        :param show_log: when True, emit debug logs about the lookup
        :return: the raw response (JSON-encoded when ``ret == 'json'``),
            or '' when nothing could be fetched
        """
        name = self.map_url_to_cache_id(url)
        if data:
            # fold the payload into the cache key, sorted for determinism
            for k in sorted(data.keys()):
                name += '{}{}'.format(k, data[k])

        raw = ''
        hit = False

        if use_cache:
            raw = self.load_from_cache(name)
            # A hit requires actual cached content. The original flagged
            # ``hit = True`` before the lookup, so misses were logged as
            # 'hit' in the summary line below.
            hit = bool(raw)

        if not raw:
            if show_log:
                zlog.debug('cache miss: ({})'.format(name))
            raw = self.do_sess_post(url, data, headers, ret)
            if raw:
                if ret == 'json':
                    raw = json.dumps(raw)
                zlog.debug('write ({}) to {}'.format(len(raw), name))
                helper.write_file(raw, name)

        if show_log:
            zlog.debug('[cache {}:{:>8}] post {}'.format(
                'hit' if hit else 'miss', len(raw or ''), name))
        return raw
Example #3
0
    def search_name(self, name):
        """Search the site for *name* and return the parsed movie list."""
        cache_pth, page = self.load_cache(SiteURL.search,
                                          data={'keywords': name},
                                          use_str=True)
        # re-fetch via a headless browser when nothing is cached,
        # or when an overwrite was explicitly requested
        needs_fetch = not page or self.overwrite
        if needs_fetch:
            page = search_by_chrome(SiteURL.home, ('input.formhue', name))
            helper.write_file(page, cache_pth)

        result_parser = DygodParser(raw_data=page)
        result_parser.do_parse()
        return result_parser.data['movies']
Example #4
0
 def do_get(self, url, save_to):
     """Download *url* into *save_to* and return the HTTP status code.

     Returns 200 immediately (without any network traffic) when the
     target file already exists and looks valid.
     """
     if helper.is_file_ok(save_to):
         return 200

     response = self.sess.get(url,
                              headers=self.headers,
                              cookies=self.cookies,
                              timeout=30)
     status = response.status_code
     if status == 404:
         zlog.error('[{}]: {}'.format(status, url))
     else:
         # NOTE(review): non-404 error responses (e.g. 500) are still
         # written to disk — confirm that is intentional.
         helper.write_file(response.content, save_to)
         zlog.debug('[{}] saved {}'.format(status, url))
     return status
Example #5
0
    def spawn(self, url, force_spawn=False):
        """use the url for creation of domain and fetch cookies

        - init cache dir by the url domain as ``<base>/domain``
        - save the cookies to file ``<base>/domain/cookie.txt``
        - init ``headers.get/post/json`` with the request headers actually sent
        - init ``site_dir/site_raw/site_media``

        :param url: any URL on the target site
        :param force_spawn: when True, re-fetch the homepage even if a cached
            copy exists
        :return: True on success; False when the URL yields no domain or the
            homepage request does not return 200
        :rtype: bool
        """
        _url, domain = self.get_domain_home_from_url(url)
        if not _url:
            return False

        self.cache['site_dir'] = os.path.join(self.cache['base'], self.domain)
        for k in ['raw', 'media']:
            self.cache['site_' + k] = os.path.join(self.cache['site_dir'], k)
            helper.mkdir_p(self.cache['site_' + k], True)

        ck_pth = os.path.join(self.cache['site_dir'], 'cookie.txt')
        helper.mkdir_p(ck_pth)

        name = os.path.join(self.cache['site_raw'], 'homepage')
        # reuse cached homepage + cookies unless a refresh is forced
        if not force_spawn and helper.is_file_ok(name):
            self.sess.cookies = self.load_cookies(ck_pth)
            return True
        else:
            zlog.debug('{} not exist!'.format(name))

        res = self.sess.get(url, headers=self.__header__)
        if res.status_code != 200:
            return False
        # status is 200 here, so the response is always written
        helper.write_file(res.content, name)

        # Seed every header profile with its own copy of the headers the
        # request actually sent. The original loop assigned the same
        # headers mapping object to every key (and ignored the value),
        # aliasing one mutable mapping across get/post/json.
        for k in self.headers:
            self.headers[k] = dict(res.request.headers)

        self.dump_cookies(cookies=self.sess.cookies, save_to=ck_pth)

        return True
Example #6
0
    def get_album_images(self, index=1):
        """Collect the image URLs of every album on album page *index*.

        Parses the locally cached album page, extracts each album id,
        gathers the available images per album, and writes the aggregate
        to ``logs/<girl_id>-<index>.txt``.

        :param index: 1-based album page number
        :return: list of available image URLs (also written to the log file)
        """
        self.get_album_pages(index)
        gap = GirlAlbumPageParser(
            file_name='{}/album_page/{}.html'.format(self.girl_id, index))
        gap.do_parse()

        # album URLs look like .../<album-id>/, keep just the id; guard
        # against a page that yielded no 'albums' entry at all (the
        # original crashed iterating None in that case)
        albums = [x.split('/')[-2] for x in gap.data.get('albums') or []]
        zlog.debug('{} => {}/{}\n{}'.format(self.girl_id, index, len(albums),
                                            albums))
        avail_images = []
        for album in albums:
            avail_images += self.get_single_album_images(album)

        helper.write_file(
            '\n'.join(avail_images),
            self.base_dir / 'logs/{}-{}.txt'.format(self.girl_id, index))
        # the original computed this list but never returned it
        return avail_images
Example #7
0
    def dump_failure_songs(self, song, action='add'):
        """Persist the failure list after applying *action* for *song*.

        Supported actions: 'add' (append), 'del' (remove if present),
        'clear' (drop everything). Any other action is logged and the
        stored list is left untouched.
        """
        songs = [s for s in (self.load_failure_songs() or []) if s]

        if action == 'add':
            songs.append(song)
        elif action == 'del':
            if song in songs:
                songs.remove(song)
        elif action == 'clear':
            songs = []
        else:
            zlog.error('unsupported action: ({})'.format(action))
            return

        # de-duplicate before serializing back out
        serialized = yaml.dump(list(set(songs)))
        helper.write_file(serialized, os.path.expanduser(self.failure_store))
Example #8
0
def run(girl_id, page_num, generate_album_images, process, force, view, whom,
        delete_girl, album_id, auto_mode):
    """Drive one crawl/download cycle for a girl's image pages.

    Dispatches on the mutually exclusive mode flags first (view / delete /
    generate-albums / single-album), then downloads the image URLs listed
    in the per-page log file, skipping URLs previously recorded as 404.

    NOTE(review): presumably invoked as a CLI command (parameters look like
    parsed command-line options) — confirm against the caller/decorators.
    """
    gc = GirlCrawler(girl_id, header_file=str(SAVE_DIR / 'header.txt'))
    # read-only mode: show stored state and stop (whom narrows the view)
    if view or whom is not None:
        gc.store.show(whom)
        return
    if delete_girl:
        gc.store.delete()
        return

    if auto_mode:
        # auto mode picks the next girl id from the store itself
        girl_id = gc.store.get_minimum_girl_id()
        zlog.debug('start with girl-id: {}'.format(girl_id))

    if generate_album_images:
        gc.get_all_albums()
        return

    if album_id:
        # single-album mode: dump that album's image URLs to its log file,
        # then fall through to download them below
        images = gc.get_single_album_images(album_id)
        helper.write_file(
            '\n'.join(images),
            gc.base_dir / 'logs/{}-{}.txt'.format(girl_id, album_id))

    # when the store has no page count for this girl, fall back to page 0
    girl_info = gc.store.girls.get(girl_id, {})
    if not girl_info.get('pages', 0):
        page_num = 0

    # in single-album mode the log files are keyed by album id, not page
    if album_id:
        page_num = album_id

    img_file = str(SAVE_DIR / 'logs/{}-{}.txt'.format(girl_id, page_num))
    e404_file = str(SAVE_DIR / 'logs/{}-{}-404.txt'.format(girl_id, page_num))
    # print(img_file, e404_file)

    # previously recorded 404 URLs are skipped unless force is set
    e404 = []
    if not force and helper.is_file_ok(e404_file):
        e404 = [
            x for x in helper.read_file(e404_file, use_str=True).split('\n')
            if x
        ]

    images = [
        x for x in helper.read_file(img_file, use_str=True).split('\n') if x
    ]
    # gc = GirlCrawler(girl_id)
    # keep only URLs that still need downloading (gen_pth_by_url appears to
    # return a truthy target path when the file is missing — TODO confirm)
    todo = []
    for img in images:
        d = gc.gen_pth_by_url(img)
        if img in e404:
            continue
        if d:
            todo.append(str(img))

    zlog.debug('all/todo/404: {}/{}/{}'.format(len(images), len(todo),
                                               len(e404)))
    if not todo:
        zlog.info('all image of {} downloaded'.format(girl_id))
        gc.store.mark_done(page_num)
        return

    e404_, ok = gc.parallel_download(urls=todo, procs=process)
    if e404_:
        # remember newly discovered 404s so future runs skip them
        e404 += e404_
        helper.write_file('\n'.join(sorted(e404)), e404_file)
    else:
        # only mark the page done when every queued download succeeded
        if ok == len(todo):
            zlog.info('all image of {} downloaded'.format(girl_id))
            gc.store.mark_done(page_num)
Example #9
0
 def sync(self):
     """Flush the in-memory girls mapping to its JSON backing file."""
     payload = json.dumps(self.girls)
     helper.write_file(payload, self._file)