def test():
    from Httpy import Httpy
    httpy = Httpy()
    # Check we can hit the host
    url = 'http://twitter.com'
    r = httpy.get(url)
    if len(r.strip()) == 0:
        # Raise exception because the site is *very* broken, definitely can't rip from it if we can't hit the home page.
        raise Exception('unable to retrieve data from %s' % url)
    # Check ripper gets all images in an album
    url = SiteTwitter.get_sample_url()
    s = SiteTwitter(url)
    SiteTwitter.MAX_REQUESTS_PER_RIP = 1
    urls = s.get_urls()
    expected = 5
    if len(urls) < expected:
        # Returning non-None string since this may be a transient error.
        # Maybe the album was deleted but the ripper is working as expected.
        return 'expected at least %d images, got %d. url: %s' % (expected, len(urls), url)
    # Returning None because the ripper is working as expected. No issues found.
    return None
def get_urls_user_albums(self):
    if self.url.endswith('/all'):
        # Images, not albums
        return self.get_urls_user_images()
    from Httpy import Httpy
    httpy = Httpy()
    user = self.url.split('//')[1].split('.')[0]
    r = httpy.get(self.url)
    result = []
    for (index, cover) in enumerate(httpy.between(r, '<div class="cover">', '</div>')):
        if not '<a href="' in cover:
            continue
        album = httpy.between(cover, '<a href="', '"')[0]
        if album.startswith('//'):
            album = 'http:%s' % album
        albumid = album.split('/')[4]
        album = 'http://imgur.com/a/%s' % albumid
        for image in self.get_urls_album(album):
            # Tack this album's index/albumid to image
            image['saveas'] = '%03d_%s_%s' % (index + 1, albumid, image['saveas'])
            result.append(image)
        sleep(2)
        if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
            break
    return result
def get_urls(self):
    self.api_key = self.db.get_config('tumblr_key')
    if self.api_key == None:
        raise Exception('unable to rip album (%s), tumblr key not found in database' % self.url)
    from Httpy import Httpy
    httpy = Httpy()
    result = []
    offset = 0
    while True:
        url = self.get_api_url(offset=offset)
        r = httpy.get(url)
        json = loads(r)
        if not 'response' in json or not 'posts' in json['response']:
            #raise Exception('no posts found at %s' % self.url)
            break
        posts = json['response']['posts']
        if len(posts) == 0:
            break
        for post in posts:
            for photos in post['photos']:
                result.append(photos['original_size']['url'])
        if self.post_type == 'post':
            break
        if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
            break
        offset += 20
        sleep(1)
    return result
def get_urls_user_images(self):
    from Httpy import Httpy
    httpy = Httpy()
    result = []
    url = self.url.replace('/all', '')
    page = total = index = 0
    while True:
        page += 1
        next_page = '%s/ajax/images?sort=0&order=1&album=0&page=%d&perPage=60' % (url, page)
        r = httpy.get(next_page)
        json = loads(r)
        data = json['data']
        if total == 0 and 'count' in data:
            total = data['count']
        # TODO report progress
        for image in data['images']:
            result.append('http://i.imgur.com/%s%s' % (image['hash'], image['ext']))
            index += 1  # Count images seen so the loop terminates once 'count' is reached
        if index >= total:
            break
        if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
            break
        sleep(1)
    return result
def test():
    ''' Test that ripper is working as expected. Raise exception if necessary. '''
    from Httpy import Httpy
    httpy = Httpy()
    # Check we can hit the host
    url = 'http://imgur.com'
    r = httpy.get(url)
    if len(r.strip()) == 0:
        raise Exception('unable to retrieve data from %s' % url)
    # Check ripper gets all images in an album
    #url = 'http://markedone911.imgur.com/'
    #url = 'http://imgur.com/r/nsfw_oc/top/all'
    url = SiteImgur.get_sample_url()
    s = SiteImgur(url)
    urls = s.get_urls()
    for (i, u) in enumerate(urls):
        print i, u
    expected = 4
    if len(urls) < expected:
        return 'expected at least %d images, got %d. url: %s' % (expected, len(urls), url)
    return None
def search_img_url(query, distance):
    if ' ' in query:
        query = query.replace(' ', '%20')
    if not is_valid_url(query):
        raise Exception("Invalid query: '%s'" % query)
    try:
        hash = db.get_image_hash_from_url(url=query)
        if not hash:
            # Download image
            web = Httpy()
            try:
                image_buffer = web.download(url=query)
            except:
                raise Exception('unable to download image at %s' % query)
            try:
                im = image_from_buffer(image_buffer)
                hash = get_hash(im)
            except:
                raise Exception("Could not identify image")
        images = db.get_similar_images(hash, distance=distance)
        results = build_results_for_images(images)
    except Exception as e:
        return Response(json.dumps({'error': str(e)}), mimetype="application/json")
    return Response(results.json(), mimetype="application/json")
def test():
    '''
        Test that ripper is working as expected.
        StatusManager.py uses the results of this method
        to show what rippers are working/broken on the main page.

        Returns:
            None - if ripper is working as expected
            str  - Warning message if the ripper may not be working properly.

        Raises:
            Exception - if ripper is definitely broken.
                        Exception message is used to display on site.
    '''
    from Httpy import Httpy
    httpy = Httpy()
    # Check we can hit the host
    url = 'http://hostname.com'
    r = httpy.get(url)
    if len(r.strip()) == 0:
        # Raise exception because the site is *very* broken, definitely can't rip from it if we can't hit the home page.
        raise Exception('unable to retrieve data from %s' % url)
    # Check ripper gets all images in an album
    url = _SampleSite.get_sample_url()
    s = _SampleSite(url)
    urls = s.get_urls()
    expected = 10
    if len(urls) < expected:
        # Returning non-None string since this may be a transient error.
        # Maybe the album was deleted but the ripper is working as expected.
        return 'expected at least %d images, got %d. url: %s' % (expected, len(urls), url)
    # Returning None because the ripper is working as expected. No issues found.
    return None
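# Illustrative sketch (not part of the rippers above): how a caller in the style of
# the StatusManager.py mentioned in the docstring might consume test() results.
# The `rippers` argument and the status strings are assumptions for illustration;
# None means OK, a returned string is a warning, and a raised Exception marks the
# ripper as broken.
def check_rippers(rippers):
    statuses = {}
    for ripper in rippers:
        try:
            warning = ripper.test()
            if warning is None:
                statuses[ripper.__name__] = 'ok'
            else:
                statuses[ripper.__name__] = 'warning: %s' % warning
        except Exception as e:
            statuses[ripper.__name__] = 'broken: %s' % str(e)
    return statuses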
def search_vid_url(query, distance, frame_count):
    if ' ' in query:
        query = query.replace(' ', '%20')
    try:
        video_id = db.get_video_from_url(url=query)
        if not video_id:
            # Download video
            web = Httpy()
            video_buffer = web.download(url=query)
            if not video_buffer:
                raise Exception('unable to download video at %s' % query)
            try:
                frames, info = info_from_video_buffer(video_buffer, os.path.splitext(query)[1][1:])
            except:
                raise Exception("Could not identify video")
            videos = db.get_similar_videos_by_hash(frames, distance, frame_count)
        else:
            hashes = db.get_video_hashes(video_id)
            videos = db.get_similar_videos_by_hash(hashes, distance, frame_count)
        results = SearchResults(db.build_results_for_videos(videos))
    except Exception as e:
        return Response(json.dumps({'error': str(e)}), mimetype="application/json")
    return Response(results.json(), mimetype="application/json")
def test():
    ''' Test that ripper is working as expected. Raise exception if necessary. '''
    from Httpy import Httpy
    httpy = Httpy()
    # Check we can hit the host
    url = 'http://8muses.com/'
    r = httpy.get(url)
    if len(r.strip()) == 0:
        raise Exception('unable to retrieve data from %s' % url)
    # Check ripper gets all images in an album
    url = 'http://www.8muses.com/index/category/hotassneighbor7'
    s = Site8muses(url)
    urls = s.get_urls()
    for (i, u) in enumerate(urls):
        print i, u
    expected = 21
    if len(urls) != expected:
        return 'expected %d images, got %d. url: %s' % (expected, len(urls), url)
    return None
def get_image_count_for_album(url):
    from Httpy import Httpy  # local import, matching the other helpers in this collection
    url = url.replace('m.imgur.com', 'imgur.com').replace('https://', '').replace('http://', '')
    aid = url.split('/')[2]
    url = 'http://imgur.com/a/%s/noscript' % aid
    httpy = Httpy()
    r = httpy.get(url)
    return r.count('src="//i.imgur.com')
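# Usage sketch for get_image_count_for_album (the album id below is a made-up
# placeholder, not taken from the source):
#   n = get_image_count_for_album('http://imgur.com/a/abc12')
#   print 'album has %d images' % n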
def test():
    from Httpy import Httpy
    httpy = Httpy()
    try:
        r = httpy.get('http://www.vimeo.com/')
        if len(r.strip()) == 0:
            raise Exception('empty response from vimeo.com')
    except Exception, e:
        raise e
def get_urls(self):
    from Httpy import Httpy
    httpy = Httpy()
    r = httpy.get(self.url)
    result = []
    for link in httpy.between(r, '/img.php?path=', '"'):
        result.append(link)
    return result
def sanitize_url(self):
    if '/image.php?id=' in self.url:
        from Httpy import Httpy
        httpy = Httpy()
        r = httpy.get(self.url)
        if not 'View complete gallery: <a href="' in r:
            raise Exception('no gallery found at %s' % self.url)
        self.url = 'http://imagearn.com/%s' % httpy.between(r, 'View complete gallery: <a href="', '"')[0]
    if not '/gallery.php?id=' in self.url:
        raise Exception('expected /gallery.php?id= not found in URL')
def sanitize_url(self):
    if '/image/' in self.url:
        from Httpy import Httpy
        httpy = Httpy()
        r = httpy.get(self.url)
        if not "class='gallery_title'><a href='" in r:
            raise Exception('no gallery found at %s' % self.url)
        self.url = httpy.between(r, "class='gallery_title'><a href='", "'")[0]
    if not '/gallery/' in self.url:
        raise Exception('expected /gallery/ not found in URL')
    if not self.url.endswith('/'):
        self.url += '/'
def get_urls(self):
    from Httpy import Httpy
    httpy = Httpy()
    r = httpy.get(self.url)
    result = []
    for post in httpy.between(r, 'daposts">', '</div> </div> </div>'):
        images = httpy.between(post, 'href="', '"')
        if len(images) > 0 and 'javascript:' not in images[0]:
            result.append('http://www.chansluts.com%s' % images[0])
    return result
def get_urls(self):
    from threading import Thread
    from time import sleep
    from Httpy import Httpy
    httpy = Httpy()
    # Sign in so we can get restricted content
    self.flickr_signin()
    r = httpy.get(self.url)
    self.result = []
    index = 0
    while True:
        for link in self.httpy.between(r, '><a data-track="photo-click" href="', '"'):
            if link == '{{photo_url}}':
                continue
            link = 'http://www.flickr.com%s' % link
            while not link.endswith('/'):
                link += '/'
            link += 'sizes/o/'  # Default to 'original' size
            # Find and download image at this page
            while len(self.threads) >= self.max_threads:
                sleep(0.1)  # Wait for threads
            self.threads.append(None)
            t = Thread(target=self.get_url_from_page, args=(link, index,))
            t.start()
            index += 1
            if len(self.result) + len(self.threads) > self.MAX_IMAGES_PER_RIP:
                break
        if len(self.result) + len(self.threads) > self.MAX_IMAGES_PER_RIP:
            break
        # Look for 'next' button
        if 'data-track="next" href="' in r:
            nextpage = self.httpy.between(r, 'data-track="next" href="', '"')[0]
            if not 'flickr.com' in nextpage:
                nextpage = 'http://flickr.com%s' % nextpage
            r = self.httpy.get(nextpage)
        else:
            # No more pages, we're done
            break
    # Wait for threads to finish
    while len(self.threads) > 0:
        sleep(0.1)
    return self.result
def get_urls(self):
    from Httpy import Httpy
    httpy = Httpy()
    fields = self.url.split('/')
    url = 'http://api.4chan.org/%s/res/%s.json' % (fields[3], fields[5])
    try:
        r = httpy.get(url)
        json = loads(r)
        posts = json['posts']
    except Exception, e:
        raise Exception('failed to load %s: %s' % (url, str(e)))
def get_urls(self):
    from Httpy import Httpy
    httpy = Httpy()
    r = httpy.get(self.url)
    r = r[r.find('showMoreGalleries'):]  # To ignore user icon
    links = httpy.between(r, 'border=0 src="', '"')
    result = []
    for link in links:
        link = 'http://%s' % link[link.find('.') + 1:].replace('/images/thumb/', '/images/full/')
        result.append(link)
        if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
            break
    return result
def get_urls(self):
    ''' Returns list of URLs from album. Does not download them. '''
    from Httpy import Httpy
    httpy = Httpy()
    r = httpy.get(self.url)
    result = []
    for link in httpy.between(r, '<img src="', '"'):
        link = 'http://hostname.com%s' % link
        result.append(link)
        if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
            break
    return result
def searchImages(unfiltered_search_text, start_index, source_ip='127.0.0.1', safe='off'):
    search_text = unfiltered_search_text.replace(' ', '%20')
    url = 'https://ajax.googleapis.com/ajax/services/search/images?v=1.0'
    url += '&q=%s' % search_text.replace(' ', '%20')
    url += '&start=%d' % start_index
    url += '&userip=%s' % source_ip
    url += '&safe=%s' % safe
    from Httpy import Httpy
    httpy = Httpy()
    try:
        response = httpy.get(url)
    except Exception, e:
        raise e
def get_urls(self):
    from Httpy import Httpy
    httpy = Httpy()
    r = httpy.get(self.url)
    result = []
    for link in httpy.between(r, 'src="', '"'):
        if not 'http://' in link:
            continue
        if not 'imgur.com' in link:
            continue
        doti = link.rfind('.') - 1
        if link[doti] == 'm':
            link = link.replace(link[doti:], link[doti + 1:])
        result.append(link)
        if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
            break
    return result
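# Worked example of the thumbnail-stripping logic above (the link value is
# illustrative, not from the source): an 'm'-suffixed imgur thumbnail becomes
# the full-size URL.
#   link = 'http://i.imgur.com/abcdem.jpg'
#   doti = link.rfind('.') - 1           # index of the 'm' before '.jpg'
#   link[doti] == 'm'                    # True, so...
#   link.replace('m.jpg', '.jpg')        # -> 'http://i.imgur.com/abcde.jpg'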
def get_urls(self):
    from Httpy import Httpy
    httpy = Httpy()
    r = httpy.get(self.url)
    chunks = httpy.between(r, '<article class="', '</article>')
    if len(chunks) == 0:
        raise Exception('unable to find "article class" at %s' % self.url)
    r = chunks[0]
    result = []
    for link in httpy.between(r, '<a href="', '"'):
        if link.startswith('//'):
            link = 'http:%s' % link
        link = link.replace(' ', '%20')
        result.append(link)
    return result
def get_urls(self):
    from Httpy import Httpy
    httpy = Httpy()
    r = httpy.get(self.url)
    result = []
    for link in httpy.between(r, 'data-cfsrc="', '"'):
        if link.startswith('//'):
            link = 'http:%s' % link
        link = link.replace(' ', '%20')
        if '-cu_' in link:
            temp = link[:link.find('-cu_')]
            temp = '%s-me.%s' % (temp, link.split('.')[-1])
            link = temp
        result.append(link)
    return result
def __init__(self, url):
    if not self.can_rip(url):
        # Don't instantiate if we can't rip it
        raise Exception('ripper (%s) cannot rip URL (%s)' % (self.__class__.__name__, url))
    self.url = url
    self.sanitize_url()
    self.album_name = self.get_album_name()
    self.db = DB()
    self.httpy = Httpy()
    self.max_threads = self.MAX_THREADS
    self.threads = []
    self.album_id = self.db.select_one('rowid', 'albums', 'host = ? and name = ?',
                                       [self.get_host(), self.get_album_name()])
    self.path = self.db.select_one('path', 'albums', 'host = ? and name = ?',
                                   [self.get_host(), self.get_album_name()])
    if self.path == None:
        # Album does not exist.
        self.album_exists = False
        self.path = '%s_%s' % (self.get_host(), self.album_name)
    else:
        # Album already exists
        self.album_exists = True
def get_urls_album(url):
    ''' Requires URL in the format: http://imgur.com/a/[albumid] '''
    from Httpy import Httpy
    httpy = Httpy()
    try:
        r = httpy.get('http://api.imgur.com/2/album/%s.json' % url.split('/')[-1])
        json = loads(r)
        if 'error' in json:
            # Error, fall back to noscript method
            raise Exception(json['error'])
    except Exception, e:
        # Got exception, fall back to noscript method
        return SiteImgur.get_urls_album_noscript(url)
def sanity_check(db, spamtype, spamtext):
    '''
        Ensures the spam filter is not malicious.
        Raises:
            Exception if filter is malicious and should not be added.
    '''
    spamtext = spamtext.lower()
    whitelist = [
        # URLS
        'http://reddit.com/r/',
        'http://reddit.com/comments/',
        'http://www.reddit.com/r/',
        'http://www.reddit.com/comments',
        'http://imgur.com/',
        'http://imgur.com/a/',
        'http://i.imgur.com/',
        'http://www.imgur.com/',
        'http://www.imgur.com/a/',
        'http://i.rarchives.com/',
        'http://www.rarchives.com/',
        'http://rip.rarchives.com/',
        # TEXT - TODO Get a better text whitelist
        '''the quick brown fox jumped over the lazy dog'''
    ]
    if spamtype == 'link' or spamtype == 'text':
        if len(spamtext) <= 3:
            raise Exception('[**!**] `%s` filter "`%s`" was **not** added because it is not long enough (must be more than 3 characters long).\n\n' % (spamtype, spamtext))
        for whitelisted in whitelist:
            if spamtext in whitelisted.lower():
                raise Exception('[**!**] `%s` filter "`%s`" was **not** added because it might remove relevant posts/comments (e.g. `%s...`).\n\n' % (spamtype, spamtext, whitelisted))
    elif spamtype == 'tld':
        if spamtext in ['com', 'net', 'org']:
            raise Exception('[**!**] TLD `%s` was **not** added because it might remove relevant links (e.g. `.com` or `.net` or `.org`).\n\n' % spamtext)
    elif spamtype == 'user':
        if db.count('admins', 'username like ?', [spamtext]) > 0:
            raise Exception('[**!**] User `%s` was **not** added because you cannot add an admin to the spam filter\n\n' % spamtext)
    elif spamtype == 'thumb':
        # To validate the thumb-spam filter, load a non-spam imgur album and test the filter on that
        httpy = Httpy()
        unicode_resp = httpy.get('http://imgur.com/a/RdXNa')
        r = unicode_resp.decode('UTF-8').encode('ascii', 'ignore')
        if spamtext in r:
            raise Exception('[**!**] Thumb-spam filter `%s` was **not** added because the bot detected a false-positive (non-spam imgur albums would be detected as spam).\n\n' % spamtext)
def get_urls_album_noscript(url):
    ''' Requires URL in the format: http://imgur.com/a/[albumid] '''
    from Httpy import Httpy
    httpy = Httpy()
    r = httpy.get('%s/noscript' % url)
    result = []
    for link in httpy.between(r, 'img src="//i.', '"'):
        link = 'http://i.%s' % link
        try:
            link = self.get_highest_res(link)
        except Exception, e:
            # Image is gone.
            # Add it anyway so RipManager will mark the image as 'errored'
            pass
        result.append(link)
    return result
def test():
    from Httpy import Httpy
    httpy = Httpy()
    # Check we can hit the host
    url = 'http://deviantart.com'
    r = httpy.get(url)
    if len(r.strip()) == 0:
        raise Exception('unable to retrieve data from %s' % url)
    url = 'http://www.imagefap.com/pictures/3802288/asdf'
    s = SiteImagefap(url)
    urls = s.get_urls()
    expected = 10
    if len(urls) != expected:
        return 'expected %d images, got %d. url: %s' % (expected, len(urls), url)
    return None
def get_urls(self):
    from threading import Thread
    from time import sleep
    from Httpy import Httpy
    httpy = Httpy()
    r = httpy.get(self.url)
    result = []
    already_got = []
    while True:
        for chunk in httpy.between(r, '<a class="thumb', '>'):
            if not 'href="' in chunk:
                continue
            link = httpy.between(chunk, 'href="', '"')[0]
            if link in already_got:
                continue
            already_got.append(link)
            # Get image from page
            while len(self.threads) >= self.max_threads:
                sleep(0.1)
            self.threads.append(None)
            t = Thread(target=self.get_url_from_page, args=(httpy, result, link,))
            t.start()
        # Go to next page
        nexts = httpy.between(r, '<li class="next">', '</li>')
        if len(nexts) == 0 or not 'href="' in nexts[0]:
            break
        next_page = httpy.between(nexts[0], 'href="', '"')[0]
        if not 'offset=' in next_page:
            break
        r = httpy.get(next_page)
    while len(self.threads) > 0:
        sleep(0.1)
    return result
def __init__(self, user='', password='', user_agent=None):
    """
        Initializes instance fields, sets user agent.
        Logs into reddit if user and password are given.
    """
    # Default user agent is awesome!
    if user_agent is None:
        user_agent = 'ReddiWrap'
    # Create object we will use to communicate with reddit's servers
    self.web = Httpy(user_agent=user_agent)

    self.modhash = ''       # Hash used to authenticate/interact with user account
    self.last_url = ''      # The last URL retrieved
    self.before = None      # ID pointing to 'previous' page
    self.after = None       # ID pointing to 'next' page
    self.logged_in = False  # Flag to detect if we are logged in or not
class InstagramWrapper:
    httpy = Httpy()
    CLIENT_ID = 'ada2177105f94b05b21c3839c21d3794'

    @staticmethod
    def get_user_id(username):
        url = 'https://api.instagram.com/v1/users/search?q=%s' % username
        url += '&client_id=%s' % InstagramWrapper.CLIENT_ID
        json = loads(InstagramWrapper.httpy.get(url))
        users = json['data']
        for user in users:
            if user['username'] == username:
                return user['id']
        raise Exception("Username '%s' not found" % username)

    @staticmethod
    def get_posts(user_id, max_id=None, min_id=None):
        url = 'https://api.instagram.com/v1/users/%s/media/recent/' % user_id
        url += '?client_id=%s' % InstagramWrapper.CLIENT_ID
        if max_id:
            url += '&max_id=%s' % max_id
        if min_id:
            url += '&min_id=%s' % min_id
        json = loads(InstagramWrapper.httpy.get(url))
        results = []
        for post in json['data']:
            result = {
                'id': post['id'],
                'likes': post['likes']['count'],
                'images': post['images'],
                'link': post['link'],
                'tags': post['tags'],
                'type': post['type'],
                'created': post['created_time'],
            }
            if 'caption' in post and post['caption'] != None and 'text' in post['caption']:
                result['caption'] = post['caption']['text']
            if post['type'] == 'video' and 'videos' in post:
                result['videos'] = post['videos']
            results.append(result)
        return results

    @staticmethod
    def get_user_info(user_id):
        url = 'https://api.instagram.com/v1/users/%s' % user_id
        url += '?client_id=%s' % InstagramWrapper.CLIENT_ID
        json = loads(InstagramWrapper.httpy.get(url))
        data = json['data']
        return {
            'bio': data['bio'],
            'website': data['website'],
            'profile_picture': data['profile_picture'],
            'full_name': data['full_name'],
            'total_media': data['counts']['media']
        }
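# Illustrative sketch (not part of the original wrapper): chaining the
# InstagramWrapper static methods above to page through a user's recent media.
# The username is a placeholder, and using the last post's 'id' as max_id for
# the next page is an assumption based on the max_id parameter of get_posts().
def fetch_recent_posts(username, max_pages=5):
    user_id = InstagramWrapper.get_user_id(username)
    posts = []
    max_id = None
    for _ in range(max_pages):
        page = InstagramWrapper.get_posts(user_id, max_id=max_id)
        if not page:
            break
        posts.extend(page)
        max_id = page[-1]['id']  # continue paging from the oldest post seen so far
    return posts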
def get_urls(self):
    self.api_key = self.db.get_config('tumblr_key')
    if self.api_key == None:
        raise Exception('unable to rip album (%s), tumblr key not found in database' % self.url)
    from Httpy import Httpy
    httpy = Httpy()
    result = []
    for posttype in ['photo', 'video']:
        offset = 0
        while True:
            url = self.get_api_url(offset=offset, posttype=posttype)
            r = httpy.get(url)
            json = None
            try:
                json = loads(r)
            except:
                pass
            if json == None or 'response' not in json or 'posts' not in json['response']:
                #raise Exception('no posts found at %s' % self.url)
                break
            posts = json['response']['posts']
            if len(posts) == 0:
                break
            for post in posts:
                if 'photos' in post:
                    for photos in post['photos']:
                        result.append(photos['original_size']['url'])
                elif 'video_url' in post:
                    result.append(post['video_url'])
            if self.post_type == 'post':
                break
            if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
                break
            offset += 20
            sleep(1)
        if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
            break
    return result
def get_urls_subreddit(self):
    from Httpy import Httpy
    httpy = Httpy()
    page = 0
    result = []
    while True:
        r = httpy.get('%s/page/%d' % (self.url, page))
        links = httpy.between(r, ' src="//i.', '"')
        if len(links) == 0:
            # Hit end of pages
            return result
        for link in links:
            if link in result:
                # Pages started repeating
                return result
            link = self.get_highest_res(link)
            result.append(link)
        page += 1
def get_urls_album_noscript(url):
    ''' Requires URL in the format: http://imgur.com/a/[albumid] '''
    from Httpy import Httpy
    httpy = Httpy()
    r = httpy.get('%s/noscript' % url)
    result = []
    for link in httpy.between(r, 'img src="//i.', '"'):
        link = 'http://i.%s' % link
        try:
            link = self.get_highest_res(link)
        except Exception, e:
            # Image is gone.
            # Add it anyway so RipManager will mark the image as 'errored'
            pass
        result.append({'url': link, 'saveas': link[link.rfind('/') + 1:]})
        if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
            break
    return result
def test():
    from Httpy import Httpy
    httpy = Httpy()
    # Check we can hit the host
    url = 'http://xhamster.com'
    r = httpy.get(url)
    if len(r.strip()) == 0:
        raise Exception('unable to retrieve data from %s' % url)
    # Check ripper gets all images in an album
    url = SiteXhamster.get_sample_url()
    s = SiteXhamster(url)
    urls = s.get_urls()
    expected = 10
    if len(urls) < expected:
        return 'expected at least %d images, got %d. url: %s' % (expected, len(urls), url)
    return None
def get_urls(self):
    from Httpy import Httpy
    httpy = Httpy()
    result = []
    page = 1
    r = httpy.get(self.url)
    while True:
        for chunk in httpy.between(r, "class='slideTool'", 'Related Galleries'):
            for link in httpy.between(chunk, "' src='", "'"):
                link = link.replace('_160.', '_1000.').replace('http://p2.', 'http://up.')
                result.append(link)
            break
        page += 1
        next_page = self.url.replace('.html', '-%d.html' % page)
        if next_page in r:
            r = httpy.get(next_page)
        else:
            break
    return result
def get_urls(self):
    from Httpy import Httpy
    httpy = Httpy()
    url = self.url
    result = []
    while True:
        r = httpy.get(url)
        for chunk in httpy.between(r, '<a name="', '</li>'):
            if not '<img src="' in chunk:
                continue
            image = httpy.between(chunk, '<img src="', '"')[0]
            image = image.replace('_stream', '_max')
            if image.startswith('//'):
                image = 'http:%s' % image
            result.append(image)
        if '<li class="next"><a href="' in r:
            url = httpy.between(r, '<li class="next"><a href="', '"')[0]
        else:
            break
    return result
def __init__(self):
    self.db = DB(DBFILE, **SCHEMA)
    self.web = Httpy()
    self._rabbitmq = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
    self._rabbitmq_channel = self._rabbitmq.channel()
    self._rabbitmq_channel.exchange_declare(exchange='reddit', exchange_type='topic')
    self._rabbitmq_queue = self._rabbitmq_channel.queue_declare('', exclusive=True)
    self._q = Queue()
def _message_callback_worker(self):
    logger.info("Started message callback worker")
    web = Httpy()
    while True:
        try:
            body = self._q.get()
            self._message_callback(body, web)
        except Exception as e:
            logger.error(e)
        finally:
            self._q.task_done()
def get_urls_subreddit(self):
    from Httpy import Httpy
    httpy = Httpy()
    page = 0
    result = []
    while True:
        r = httpy.get('%s/page/%d' % (self.url, page))
        links = httpy.between(r, ' src="//i.', '"')
        if len(links) == 0:
            # Hit end of pages
            return result
        for link in links:
            if link in result:
                # Pages started repeating
                return result
            link = self.get_highest_res(link)
            result.append(link)
            if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
                # Hit the image limit; stop here and return what was collected
                return result
        page += 1
def get_urls_user_images(self):
    from Httpy import Httpy
    httpy = Httpy()
    result = []
    url = self.url.replace('/all', '')
    page = total = index = 0
    while True:
        page += 1
        next_page = '%s/ajax/images?sort=0&order=1&album=0&page=%d&perPage=60' % (url, page)
        r = httpy.get(next_page)
        json = loads(r)
        data = json['data']
        if total == 0 and 'count' in data:
            total = data['count']
        # TODO report progress
        for image in data['images']:
            result.append('http://i.imgur.com/%s%s' % (image['hash'], image['ext']))
            index += 1  # Count images seen so the loop terminates once 'count' is reached
        if index >= total or self.hit_image_limit():
            break
        sleep(1)
    return result
def test():
    ''' Test that ripper is working as expected. Raise exception if necessary. '''
    from Httpy import Httpy
    httpy = Httpy()
    # Check we can hit the host
    url = 'http://8muses.com/'
    r = httpy.get(url)
    if len(r.strip()) == 0:
        raise Exception('unable to retrieve data from %s' % url)
    # Check ripper gets all images in an album
    url = 'http://www.8muses.com/index/category/hotassneighbor7'
    s = Site8muses(url)
    urls = s.get_urls()
    expected = 21
    if len(urls) != expected:
        return 'expected %d images, got %d. url: %s' % (expected, len(urls), url)
    return None