def validate_page(self, url):
    """
    Request the page at `url` through the Requester service.

    If the first request raises ro.Exception the failure is printed and
    the same request is made one more time; an exception raised by that
    second attempt is not caught here.
    """
    print('validating page: %s' % url)

    def fetch():
        # a single request for the page, sent with our stored cookies
        with connect(Requester) as client:
            return client.urlopen(ro.Request(url, cookies=self.request_cookies))

    try:
        r = fetch()
    except ro.Exception as ex:
        print('oException validating, retrying: %s %s' % (url, ex.msg))
        r = fetch()
    # NOTE(review): the response `r` is neither inspected nor returned in
    # the code visible here -- confirm what callers expect back
def GET(self, user_id_string):
    """
    Look up the info for the next set of images for a user.

    Expects to receive the user id string. The starting point is the
    user's last viewed image id as stored in redis (0 when there is none).
    """
    # a user string is required
    if not user_id_string:
        log.warning('ImageDetails GET [%s]: no user id string' % user_id_string)
        # NOTE(review): nothing is returned or raised explicitly here, so
        # unless web.badrequest() itself raises, execution continues below
        web.badrequest()

    # the user's last viewed id: redis hands it back as a string, and
    # when there is no stored value we start from 0
    details_key = '%s:user_details:%s' % (NS, user_id_string)
    stored_id = rc.hget(details_key, 'last_viewed_id')
    last_viewed_id = int(stored_id) if stored_id else 0

    # ask the Images service for the next set of images
    try:
        with connect(Images) as client:
            images = client.get_images_since(image_id=last_viewed_id,
                                             timestamp=None,
                                             limit=10,
                                             offset=0)
    except io.Exception:
        log.exception('ImageDetails GET [%s] [%s]: getting images' % (user_id_string, last_viewed_id))
        # NOTE(review): `images` is unbound after this branch -- confirm
        # web.internalerror() stops the request
        web.internalerror()
def populate_image_stats(self, image):
    """
    returns the Image w/ its stats filled out

    An image without data is handed back untouched. Otherwise `size` is
    set to the length of the data and `shahash` to the hash Blobby
    reports for that data.

    raises o.Exception if Blobby fails while hashing the data
    """
    # nothing to measure without data
    if not image.data:
        return image

    image.size = len(image.data)

    try:
        with connect(Blobby) as c:
            image.shahash = c.get_data_bhash(image.data)
    except o.Exception as ex:
        raise o.Exception('oException getting shahash: %s' % ex.msg)

    # hand the image back on the success path too; previously only the
    # no-data path returned it and this path fell through to None
    return image
def download_image_data(self, url, cookies={}):
    """
    Request the image at `url` through the Requester service.

    The request is sent with self.request_cookies; the `cookies`
    argument is not read by this method. If the first request fails it
    is printed and tried once more; if the second attempt fails as well
    'refailed' is printed and None is returned.
    """
    # we want to download the image
    with connect(Requester) as c:
        try:
            img_r = c.urlopen(ro.Request(url, cookies=self.request_cookies))
        except Exception as ex:
            # fail, try again ?
            print('exception getting img: %s' % ex)
            try:
                # retry the same url; this used to reference an undefined
                # name, so the retry could never succeed
                img_r = c.urlopen(ro.Request(url, cookies=self.request_cookies))
            except Exception:
                print('refailed')
                return None
    # NOTE(review): the response `img_r` is not returned or read in the
    # code visible here -- confirm what callers expect back
for page_url in self.generate_page_urls(): # make sure it's a valid page try: if not self.validate_page(page_url): # we've hit an invalid page, done return added except ro.Exception, ex: print 'oException validating: %s %s' % (page_url,ex.msg) return self.validate_page(page_url) except Exception, ex: print 'Exception validating: %s %s' % (page_url,ex) return self.validate_page(page_url) # get all the pics on the page with connect(Scraper) as c: print 'getting page images' try: # TODO: be able to re-use cookies img_urls = c.get_images(page_url) except so.Exception, ex: print 'oException getting images: %s %s' % (page_url,ex.msg) if not sync: raise ex except Exception, ex: print 'Exception getting images: %s %s' % (page_url,ex) if not sync: raise ex print 'images: %s' % len(img_urls)
def _set_image_data(self, image): if image.data is not None: with connect(Blobby) as c: image.shahash = c.set_data(image.data) return image
def _populate_image_data(self, image): if not image.shahash: return None with connect(Blobby) as c: image.data = c.get_data(image.shahash) return image
class ImagesHandler(object):
    """
    Keeps image meta data in redis and the raw image data in Blobby,
    firing revent events when images are added or deleted.
    """

    def __init__(self, redis_host='127.0.0.1'):
        self.redis_host = redis_host
        self.rc = Redis(redis_host)
        self.revent = ReventClient(redis_host=self.redis_host)

        # redis keys
        # incr this for the next image id
        # images:next_id = next_id

        # all the images for the given sha
        # images:datainstances:<shahash> = (ids)

        # timestamp of when image was added
        # images:ids:timestamps = sorted (ids,timestamp)

        # all the image ids for the page
        # images:page_ids:<page_url> = sorted (ids,timestamp)

        # last time an image was added from page
        # images:pages:timestamps = sorted (url,timestamp)

        # images meta data
        # images:id = {}

    def _image_to_dict(self, image):
        """
        returns a dict of the image's non-None thrift attributes,
        leaving out the raw data
        """
        data = {}
        ignored_attrs = ['data']
        for attrs in image.thrift_spec[1:]:
            # the attribute name is the third entry of each spec
            attr = attrs[2]
            if attr in ignored_attrs:
                continue
            v = getattr(image, attr)
            if v is not None:
                data[attr] = v
        return data

    def _dict_to_image(self, data):
        """
        returns an Image built from a dict of attribute values, such as
        the hash read back from redis
        """
        image = o.Image()
        for attrs in image.thrift_spec[1:]:
            attr = attrs[2]
            v = data.get(attr)
            if v is None:
                continue
            # we might need to update the value
            # type, since all values come back
            # from redis as strings
            attr_type = attrs[1]
            if attr_type == 4:
                # float
                setattr(image, attr, float(v))
            elif attr_type == 8:
                # int
                setattr(image, attr, int(v))
            else:
                setattr(image, attr, v)
        return image

    def _event_data(self, image):
        """ returns the image fields sent along w/ the revent events """
        return {
            'source_page_url': image.source_page_url,
            'source_url': image.source_url,
            'shahash': image.shahash,
            'vhash': image.vhash,
            'xdim': image.xdim,
            'ydim': image.ydim,
        }

    def _delete_from_redis(self, image):
        """ removes the image's redis entries, returns True """
        # queue these up in one pipeline
        pipe = self.rc.pipeline()

        # remove it from the id set
        pipe.zrem('images:ids:timestamps', image.id)

        # remove its hash
        pipe.delete('images:%s' % image.id)

        # remove it from the tracker for its image data
        pipe.srem('images:datainstances:%s' % image.shahash, image.id)

        # remove image from the page's id set
        if image.source_page_url:
            pipe.zrem('images:page_ids:%s' % image.source_page_url, image.id)

        # make it happen
        pipe.execute()

        return True

    def _save_to_redis(self, image):
        """
        writes the image's meta data and index entries to redis, giving
        the image a new id first if it doesn't have one. returns image
        """
        # queue the writes up in one pipeline
        pipe = self.rc.pipeline()

        # if our image doesn't have an id, set it up w/ one
        if not image.id:
            print('got new image: %s' % image.shahash)
            image.id = self.rc.incr('images:next_id')

        # check and see if we used to have a different shahash; only
        # touch the old tracker when there really was an old hash, a new
        # image has none
        old_shahash = self.rc.hget('images:%s' % image.id, 'shahash')
        if old_shahash and old_shahash != image.shahash:
            # remove our id from the old shahash tracker
            pipe.srem('images:datainstances:%s' % old_shahash, image.id)

        # make sure we're in the tracker for our current shahash
        pipe.sadd('images:datainstances:%s' % image.shahash, image.id)

        # update / set our timestamp
        da = image.downloaded_at if image.downloaded_at else time.time()
        # NOTE(review): zadd is called as (key, member, score) throughout,
        # presumably the redis client's legacy signature -- confirm
        # against the installed client
        pipe.zadd('images:ids:timestamps', image.id, da)

        # add this image to the page's id set
        if image.source_page_url:
            pipe.zadd('images:page_ids:%s' % image.source_page_url,
                      image.id, da)

            # update our last scrape time for the page; the score is the
            # timestamp (it used to be the image id)
            pipe.zadd('images:pages:timestamps',
                      image.source_page_url, da)

        # take our image and make a dict
        image_data = self._image_to_dict(image)

        # set our data to redis
        key = 'images:%s' % image.id
        pipe.hmset(key, image_data)

        # execute our pipe
        pipe.execute()

        return image

    def _get_from_redis(self, image_id):
        """ returns the Image stored for the id, None if there isn't one """
        # only ids in the id set have details to pull
        if self.rc.zrank('images:ids:timestamps', image_id) is None:
            return None

        # get the image data from redis
        key = 'images:%s' % image_id
        image_data = self.rc.hgetall(key)
        if not image_data:
            print('redis had no image data')
            return None
        return self._dict_to_image(image_data)

    def _populate_image_data(self, image):
        """
        fills in image.data from Blobby, returns the image or None if
        the image has no shahash
        """
        if not image.shahash:
            return None
        with connect(Blobby) as c:
            image.data = c.get_data(image.shahash)
        return image

    def _set_image_data(self, image):
        """
        stores image.data in Blobby and sets image.shahash to what
        Blobby hands back; images w/o data are left alone
        """
        if image.data is not None:
            with connect(Blobby) as c:
                image.shahash = c.set_data(image.data)
        return image

    def get_image(self, image_id):
        """
        returns Image for given id, including its data

        raises o.ImageNotFound if redis has no image for the id
        """
        # see if we have an image
        image = self._get_from_redis(image_id)

        if not image:
            raise o.ImageNotFound('Could not get image', image_id)

        # pull the actual image data
        self._populate_image_data(image)

        return image

    def add_image(self, image):
        """
        like set but if we already have this image from this page
        we're not going to add it again. will also fill out image
        stats (size, dimension)

        raises o.Exception if the image has an id, has no data or has
        no source page url
        """
        # we're only for new images, no ids allowed
        # if u want to set an id by hand use set_image
        if image.id:
            raise o.Exception('Can not add image with id')

        if not image.data:
            raise o.Exception('Image must have data')

        if not image.source_page_url:
            raise o.Exception('Image must have source page url')

        # update its stats
        image = self.populate_image_stats(image)

        # only add the image if we haven't seen it before.
        # if we've seen it before there will be an id which is both in
        # the set of images w/ this data and in the page's ids. the
        # page's ids are a sorted set, so we can't intersect the two
        # directly; check each id w/ this data against the page instead
        page_key = 'images:page_ids:%s' % image.source_page_url
        same_data_ids = self.rc.smembers('images:datainstances:%s'
                                         % image.shahash)
        ids = set(i for i in same_data_ids
                  if self.rc.zscore(page_key, i) is not None)

        # we don't need to continue
        # we'll return back their original msg, w/o the id set
        if ids:
            print('image already exists [%s], not setting' % ids)
            return image

        # so the image appears to be new, good for it
        return self.set_image(image)

    def set_image(self, image):
        """ sets image data, returns image """
        # would be better if we only saved if it didn't exist
        if image.data:
            # save the images data
            self._set_image_data(image)

        # could be an update, could be new
        image = self._save_to_redis(image)

        # let the world know we have added a new image
        self.revent.fire('image_added', self._event_data(image))

        return image

    def delete_image(self, image_id):
        """
        removes an image, returns False if there is no image for the
        id and True once it has been removed
        """
        # get its image obj
        try:
            image = self.get_image(image_id)
        except o.ImageNotFound:
            return False

        # delete the redis data
        self._delete_from_redis(image)

        # see if we need to remove the image data
        if self.rc.scard('images:datainstances:%s' % image.shahash) == 0:
            # no more images w/ the same data, remove image data
            with connect(Blobby) as c:
                c.delete_data(image.shahash)

        # it's gone, let'm know
        self.revent.fire('image_deleted', self._event_data(image))

        # and we're done!
        return True