Exemple #1
0
 def process_object_image(self, task_name, collection, obj, image_field,
                          image_url, base_dir, ext='jpg', skip_existing=True):
     """
     Record an already-downloaded image on `obj`, or queue its download.

     The local file location is `base_dir` joined with the result of
     `hashed_path(image_url, ext=ext)`.

     :param task_name: name given to the `Task` created when a download
         is needed
     :param collection: object whose `update()` is called with a
         `{'_id': ...}` selector and a `$set` document
     :param obj: document being processed; `obj['_id']` is read
     :param image_field: prefix of the `<image_field>_path` and
         `<image_field>_url` keys written to the document
     :param image_url: URL of the image
     :param base_dir: directory the hashed path is joined onto
     :param ext: extension forwarded to `hashed_path` and to the task
     :param skip_existing: when false, a task is queued even if the file
         is already present on disk
     """
     path = os.path.join(base_dir, hashed_path(image_url, ext=ext))

     if not os.path.exists(path) or not skip_existing:
         # File is missing (or a re-download was requested): hand the work
         # over to a new task instead of touching the collection.
         download = Task(task_name,
                         url=image_url,
                         obj=obj,
                         disable_cache=True,
                         image_field=image_field,
                         collection=collection,
                         base_dir=base_dir,
                         ext=ext)
         self.add_task(download)
         return

     # File is already on disk: just store its location and source URL.
     selector = {'_id': obj['_id']}
     changes = {
         '%s_path' % image_field: path,
         '%s_url' % image_field: image_url,
     }
     collection.update(selector, {'$set': changes})
Exemple #2
0
    def handler(self, collection, obj, set_field, base_dir, task_args=None, grab_args=None, callback=None):
        """
        Yield download tasks for the images of `obj[set_field]` that are
        missing on disk, and fix stored paths of those already present.

        For every image in `obj.get(set_field, [])` the local path is built
        with `hashed_path(image["url"], base_dir=base_dir)`:

        * if the file exists and differs from `image["path"]`, the matching
          array element is updated in `db[collection]`;
        * otherwise a `Task` carrying a configured `Grab` instance is
          yielded.

        :param collection: key used to look the collection up in `db`
        :param obj: document owning the image list; `obj["_id"]` is read
        :param set_field: name of the list field holding image dicts with
            "url" and "path" keys
        :param base_dir: forwarded to `hashed_path`
        :param task_args: optional dict, deep-copied and passed to `Task`
            as extra keyword arguments
        :param grab_args: optional dict of extra `Grab.setup()` options
        :param callback: task callback; `image_set_handler` is used when
            this is falsy
        """
        for image in obj.get(set_field, []):
            path = hashed_path(image["url"], base_dir=base_dir)
            if os.path.exists(path):
                if path != image["path"]:
                    # The query matches the array element by its url, so the
                    # positional `$` in the update key must be prefixed with
                    # the real field name; the unformatted "%s.$.path" key
                    # did not address the matched element at all.
                    db[collection].update(
                        {"_id": obj["_id"], ("%s.url" % set_field): image["url"]},
                        {"$set": {("%s.$.path" % set_field): path}},
                    )
            else:
                kwargs = {}
                if task_args:
                    # Copy so per-task state never leaks into the caller's dict.
                    kwargs = deepcopy(task_args)

                g = Grab()
                g.setup(url=image["url"])
                if grab_args:
                    g.setup(**grab_args)
                # Applied last, so it presumably takes precedence over any
                # referer supplied through grab_args.
                g.setup(referer=build_image_hosting_referer(image["url"]))

                yield Task(
                    callback=callback or image_set_handler,
                    grab=g,
                    collection=collection,
                    path=path,
                    obj=obj,
                    image=image,
                    set_field=set_field,
                    disable_cache=True,
                    backup=g.dump_config(),
                    **kwargs
                )
Exemple #3
0
    def handler(self, url, collection, obj, path_field, base_dir, task_args=None, grab_args=None, callback=None):
        """
        Sync the stored path of a single image, or yield a task to fetch it.

        The local path comes from `hashed_path(url, base_dir=base_dir)`.
        When that file exists, `obj`'s `path_field` is updated in
        `db[collection]` if it differs and nothing is yielded. Otherwise one
        `Task` with a configured `Grab` instance is yielded.

        :param url: image URL
        :param collection: key used to look the collection up in `db`
        :param obj: document being processed; `obj["_id"]` is read
        :param path_field: document key that stores the local path
        :param base_dir: forwarded to `hashed_path`
        :param task_args: optional dict, deep-copied and passed to `Task`
            as extra keyword arguments
        :param grab_args: optional dict of extra `Grab.setup()` options
        :param callback: task callback; `image_handler` is used when this
            is falsy
        """
        path = hashed_path(url, base_dir=base_dir)

        if os.path.exists(path):
            # Already downloaded: only repair the stored path if it is stale.
            if path != obj.get(path_field, None):
                selector = {"_id": obj["_id"]}
                db[collection].update(selector, {"$set": {path_field: path}})
            return

        extra = deepcopy(task_args) if task_args else {}

        grab = Grab()
        grab.setup(url=url)
        if grab_args:
            grab.setup(**grab_args)
        # Set after grab_args, so this is the last referer value applied.
        grab.setup(referer=build_image_hosting_referer(url))

        yield Task(
            callback=callback or image_handler,
            grab=grab,
            collection=collection,
            path=path,
            obj=obj,
            path_field=path_field,
            disable_cache=True,
            backup=grab.dump_config(),
            **extra
        )
Exemple #4
0
    def handler(self, collection, obj, set_field, base_dir, task_args=None,
                grab_args=None, callback=None):
        """
        Walk the image list stored in `obj[set_field]`; repair stale paths
        and yield a download task for each image missing on disk.

        The local path of an image is
        `hashed_path(image['url'], base_dir=base_dir)`. If the file exists
        and the path differs from `image['path']`, the matching array
        element is updated in `db[collection]`. If it does not exist, a
        `Task` with a configured `Grab` instance is yielded.

        :param collection: key used to look the collection up in `db`
        :param obj: document owning the image list; `obj['_id']` is read
        :param set_field: name of the list field holding image dicts with
            'url' and 'path' keys
        :param base_dir: forwarded to `hashed_path`
        :param task_args: optional dict, deep-copied and passed to `Task`
            as extra keyword arguments
        :param grab_args: optional dict of extra `Grab.setup()` options
        :param callback: task callback; `image_set_handler` is used when
            this is falsy
        """
        from database import db

        for image in obj.get(set_field, []):
            path = hashed_path(image['url'], base_dir=base_dir)

            if os.path.exists(path):
                if path != image['path']:
                    # Match the array element by url, then rewrite its path
                    # through the positional `$` operator.
                    selector = {
                        '_id': obj['_id'],
                        ('%s.url' % set_field): image['url'],
                    }
                    changes = {('%s.$.path' % set_field): path}
                    db[collection].update(selector, {'$set': changes})
                continue

            extra = deepcopy(task_args) if task_args else {}

            grab = Grab()
            grab.setup(url=image['url'])
            if grab_args:
                grab.setup(**grab_args)
            # Set after grab_args, so this is the last referer value applied.
            grab.setup(referer=build_image_hosting_referer(image['url']))

            yield Task(
                callback=callback or image_set_handler,
                grab=grab,
                collection=collection,
                path=path,
                obj=obj,
                image=image,
                set_field=set_field,
                disable_cache=True,
                backup=grab.dump_config(),
                **extra
            )
Exemple #5
0
 def process_object_image(self,
                          task_name,
                          collection,
                          obj,
                          image_field,
                          image_url,
                          base_dir,
                          ext='jpg',
                          skip_existing=True):
     """
     Store the location of an image that is already on disk, or schedule
     a task that will download it.

     The file is looked up at `base_dir` joined with
     `hashed_path(image_url, ext=ext)`.

     :param task_name: name given to the `Task` created when a download
         is needed
     :param collection: object whose `update()` is called with a
         `{'_id': ...}` selector and a `$set` document
     :param obj: document being processed; `obj['_id']` is read
     :param image_field: prefix of the `<image_field>_path` and
         `<image_field>_url` keys written to the document
     :param image_url: URL of the image
     :param base_dir: directory the hashed path is joined onto
     :param ext: extension forwarded to `hashed_path` and to the task
     :param skip_existing: when false, a task is queued even if the file
         is already present on disk
     """
     relative = hashed_path(image_url, ext=ext)
     path = os.path.join(base_dir, relative)

     already_saved = os.path.exists(path) and skip_existing
     if already_saved:
         # Nothing to fetch: remember where the file lives and its source.
         selector = {'_id': obj['_id']}
         path_key = '%s_path' % image_field
         url_key = '%s_url' % image_field
         collection.update(selector,
                           {'$set': {path_key: path, url_key: image_url}})
     else:
         task_options = dict(url=image_url,
                             obj=obj,
                             disable_cache=True,
                             image_field=image_field,
                             collection=collection,
                             base_dir=base_dir,
                             ext=ext)
         self.add_task(Task(task_name, **task_options))
0
    def save_hash(self, location, basedir, ext=None):
        """
        Write the cached response body to a file whose path is derived
        from a hash of `location`.

        Spreading files over hash-named sub-directories keeps the number
        of files per directory low. An existing file is left untouched.

        :param location: URL of the file, or any other string; it is the
            input of `hashed_path` (documented upstream as SHA1-based).
            Unicode values are encoded to UTF-8 first.
        :param basedir: root directory for saved files. The file is never
            written directly into it, only into a sub-directory.
        :param ext: extension passed to `hashed_path`; the dot between
            file name and extension is added automatically.
        :returns: path of the file relative to `basedir`

        Example::

            >>> url = 'http://yandex.ru/logo.png'
            >>> g.go(url)
            >>> g.response.save_hash(url, 'some_dir', ext='png')
            'e8/dc/f2918108788296df1facadc975d32b361a6a.png'
            # the file was saved to $PWD/some_dir/e8/dc/...

        TODO: replace `basedir` with two options: root and save_to, and
        return save_to + path
        """

        if isinstance(location, unicode):
            location = location.encode('utf-8')
        rel_path = hashed_path(location, ext=ext)
        target = os.path.join(basedir, rel_path)

        if os.path.exists(target):
            return rel_path

        try:
            os.makedirs(os.path.dirname(target))
        except OSError:
            # Best effort: the directory may already exist. Any other
            # failure shows up when the file is opened below.
            pass

        with open(target, 'wb') as out:
            body = self._cached_body
            # The file is opened in binary mode, so text must be encoded.
            if isinstance(body, unicode):
                body = body.encode('utf-8')
            out.write(body)
        return rel_path
Exemple #7
0
    def save_hash(self, location, basedir, ext=None):
        """
        Save the response body under `basedir`, in a file named after a
        hash of `location`.

        The hash-based layout limits how many files end up in a single
        directory. If the target file already exists nothing is written.

        :param location: URL of the file, or any other string; it is fed
            to `hashed_path` (documented upstream as SHA1-based). Unicode
            values are encoded to UTF-8 first.
        :param basedir: base directory for saved files. The file goes
            into a sub-directory of `basedir`, never directly into it.
        :param ext: extension passed to `hashed_path`; the dot between
            file name and extension is inserted automatically.
        :returns: path of the file relative to `basedir`

        Example::

            >>> url = 'http://yandex.ru/logo.png'
            >>> g.go(url)
            >>> g.response.save_hash(url, 'some_dir', ext='png')
            'e8/dc/f2918108788296df1facadc975d32b361a6a.png'
            # the file was saved to $PWD/some_dir/e8/dc/...

        TODO: replace `basedir` with two options: root and save_to, and
        return save_to + path
        """

        hash_source = location
        if isinstance(hash_source, unicode):
            hash_source = hash_source.encode('utf-8')

        rel_path = hashed_path(hash_source, ext=ext)
        full_path = os.path.join(basedir, rel_path)

        if not os.path.exists(full_path):
            parent_dir = os.path.dirname(full_path)
            try:
                os.makedirs(parent_dir)
            except OSError:
                # Ignored on purpose (e.g. the directory is already
                # there); open() below fails if it is really unusable.
                pass

            with open(full_path, 'wb') as out:
                payload = self._cached_body
                # Binary file: unicode bodies are written as UTF-8 bytes.
                if isinstance(payload, unicode):
                    payload = payload.encode('utf-8')
                out.write(payload)

        return rel_path
Exemple #8
0
    def handler(self,
                collection,
                obj,
                set_field,
                base_dir,
                task_args=None,
                grab_args=None,
                callback=None):
        """
        Yield download tasks for the images of `obj[set_field]` that are
        missing on disk, and fix stored paths of those already present.

        For every image in `obj.get(set_field, [])` the local path is built
        with `hashed_path(image['url'], base_dir=base_dir)`:

        * if the file exists and differs from `image['path']`, the matching
          array element is updated in `db[collection]`;
        * otherwise a `Task` carrying a configured `Grab` instance is
          yielded.

        :param collection: key used to look the collection up in `db`
        :param obj: document owning the image list; `obj['_id']` is read
        :param set_field: name of the list field holding image dicts with
            'url' and 'path' keys
        :param base_dir: forwarded to `hashed_path`
        :param task_args: optional dict, deep-copied and passed to `Task`
            as extra keyword arguments
        :param grab_args: optional dict of extra `Grab.setup()` options
        :param callback: task callback; `image_set_handler` is used when
            this is falsy
        """
        from database import db

        for image in obj.get(set_field, []):
            path = hashed_path(image['url'], base_dir=base_dir)
            if os.path.exists(path):
                if path != image['path']:
                    # The query matches the array element by its url, so the
                    # positional `$` in the update key must be prefixed with
                    # the real field name; the unformatted '%s.$.path' key
                    # did not address the matched element at all.
                    db[collection].update(
                        {
                            '_id': obj['_id'],
                            ('%s.url' % set_field): image['url']
                        }, {'$set': {
                            ('%s.$.path' % set_field): path
                        }})
            else:
                kwargs = {}
                if task_args:
                    # Copy so per-task state never leaks into the caller's dict.
                    kwargs = deepcopy(task_args)

                g = Grab()
                g.setup(url=image['url'])
                if grab_args:
                    g.setup(**grab_args)
                # Applied last, so it presumably takes precedence over any
                # referer supplied through grab_args.
                g.setup(referer=build_image_hosting_referer(image['url']))

                yield Task(callback=callback or image_set_handler,
                           grab=g,
                           collection=collection,
                           path=path,
                           obj=obj,
                           image=image,
                           set_field=set_field,
                           disable_cache=True,
                           backup=g.dump_config(),
                           **kwargs)
Exemple #9
0
    def handler(self, url, collection, obj, path_field, base_dir,
                task_args=None, grab_args=None, callback=None):
        """
        Sync the stored path of a single image, or yield a task to fetch it.

        The local path comes from `hashed_path(url, base_dir=base_dir)`.
        When that file exists, `obj`'s `path_field` is updated in
        `db[collection]` if it differs and nothing is yielded. Otherwise one
        `Task` with a configured `Grab` instance is yielded.

        :param url: image URL
        :param collection: key used to look the collection up in `db`
        :param obj: document being processed; `obj['_id']` is read
        :param path_field: document key that stores the local path
        :param base_dir: forwarded to `hashed_path`
        :param task_args: optional dict, deep-copied and passed to `Task`
            as extra keyword arguments
        :param grab_args: optional dict of extra `Grab.setup()` options
        :param callback: task callback; `image_handler` is used when this
            is falsy
        """
        from database import db
        path = hashed_path(url, base_dir=base_dir)

        if os.path.exists(path):
            # Already downloaded: only repair the stored path if it is stale.
            if path != obj.get(path_field, None):
                selector = {'_id': obj['_id']}
                db[collection].update(selector, {'$set': {path_field: path}})
            return

        extra = deepcopy(task_args) if task_args else {}

        grab = Grab()
        grab.setup(url=url)
        if grab_args:
            grab.setup(**grab_args)
        # Set after grab_args, so this is the last referer value applied.
        grab.setup(referer=build_image_hosting_referer(url))

        yield Task(callback=callback or image_handler,
                   grab=grab,
                   collection=collection,
                   path=path,
                   obj=obj,
                   path_field=path_field,
                   disable_cache=True,
                   backup=grab.dump_config(),
                   **extra)