def scrape(self, all=True, max_depth=10):
    """Fetch this blog's posts from Tumblr and save the photos they contain.

    Posts are requested in pages of 20. Paging ends when a page comes back
    empty, when a page contains a post older than ``self.last_scraped``
    (that page is still kept in full), or after ``max_depth`` pages.
    ``self.last_scraped`` is then set to the current time.

    Args:
        all: When true, ``max_depth`` is ignored and paging is unbounded.
        max_depth: Maximum number of pages requested when ``all`` is false.
    """
    page_limit = max_depth
    if all:
        # Scraping everything: no cap on the number of pages
        page_limit = float('inf')

    client = Tumblpy(TUMBLR_KEYS['consumer'], TUMBLR_KEYS['secret'])

    collected = []
    page = 0
    reached_known_posts = False
    while not reached_known_posts and page < page_limit:
        query = {'offset': page * 20, 'limit': 20, 'notes_info': True}
        batch = client.get('posts', self.url, params=query)['posts']
        if not batch:
            # The blog has no more posts
            break
        # A post from before the previous scrape makes this the last page
        reached_known_posts = any(
            tz.make_aware(dt.fromtimestamp(entry['timestamp'])) < self.last_scraped
            for entry in batch
        )
        collected += batch
        page += 1

    self.last_scraped = tz.now()

    # Turn every collected post into saved photos with their tags
    for entry in collected:
        for item in scraping.models.photos.Photo.from_tumblr_api(entry, self):
            picture, tag_names = item['photo'], item['raw tags']
            picture.save()
            picture.tags_from_ary(tag_names)
def from_api(cls, name):
    """Build an unsaved instance from the Tumblr API data for blog ``name``.

    Args:
        name: Blog identifier passed to the ``info`` and ``avatar`` endpoints.

    Returns:
        A new ``cls`` instance with url, name, description and avatar_url
        filled in. It is not saved to the database.

    Raises:
        TumblpyError: if the blog info or the avatar cannot be fetched.
    """
    client = Tumblpy(TUMBLR_KEYS['consumer'], TUMBLR_KEYS['secret'])
    try:
        blog_info = client.get('info', name)['blog']
        avatar = client.get('avatar', name, params={'size': 512})
    except TumblpyError:
        host = name + '.tumblr.com'
        raise TumblpyError('Could not connect to {}'.format(host))

    # Populate a fresh instance; persisting it is left to the caller
    blog = cls()
    blog.url = blog_info['url']
    blog.name = blog_info['title']
    blog.description = blog_info['description']
    blog.avatar_url = avatar['url']
    return blog
def get_url(self, string):
    """Resolve user input to a Tumblr blog.

    ``string`` may be a Tumblr URL (optionally prefixed with ``http://`` and
    followed by any path) or a bare blog name made of letters, digits and
    hyphens.

    Returns:
        ``{'type': TumblrBlog, 'url': <str>}`` when the input is a Tumblr URL
        or the name of a blog the API can find, otherwise ``None``.
    """
    # Full Tumblr URL: keep only the part captured by the "url" group
    tumblr_url_regex = r'(?P<url>(http\:\/\/)?[A-Za-z0-9\-]+\.tumblr\.com).*'
    tumblr_url_match = re.fullmatch(tumblr_url_regex, string)
    if tumblr_url_match:
        return {'type': TumblrBlog, 'url': tumblr_url_match.group('url')}

    # Bare blog name: anything else cannot be a blog, so stop before
    # spending an API call on it
    tumblr_name_regex = r'[A-Za-z0-9\-]+'
    if re.fullmatch(tumblr_name_regex, string) is None:
        return None

    tumblr_agent = Tumblpy(TUMBLR['consumer'], TUMBLR['secret'])
    try:
        tumblr_agent.get('info', string)
    except TumblpyError:
        # The lookup failed, so the blog is treated as non-existent
        return None
    # tumblpy did not raise, so the blog exists
    return {'type': TumblrBlog, 'url': 'http://' + string + '.tumblr.com/'}
def getPosts():
    """Print the date and URL of the posts returned for the user's first blog.

    Only the posts returned by a single ``posts`` request are listed.
    """
    t = Tumblpy(app_key=APP_KEY, app_secret=APP_SECRET,
                oauth_token=OAUTH_TOKEN,
                oauth_token_secret=OAUTH_TOKEN_SECRET)

    # The authenticated user's info lists their blogs; use the first one
    user_info = t.post('user/info')
    blog_url = user_info['user']['blogs'][0]['url']
    # Single-argument print(...) behaves the same on Python 2 and Python 3
    print("url: %s" % blog_url)

    posts = t.get('posts', blog_url=blog_url)
    for post in posts['posts']:
        print("%s %s" % (post['date'], post['post_url']))
def main():
    """Request posts tagged 'asoiaf' from before a fixed timestamp and print them.

    Prints the number of posts returned, then one (post_url, timestamp, type)
    tuple per post. Uses Python 2 print statements.
    """
    setup()
    t = Tumblpy(app_key = my_app_key, app_secret = my_app_secret, oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)
    # NOTE(review): `blog` is only referenced by the commented-out lines below
    blog='staff.tumblr.com'
    #followers = t.get('posts', blog_url=blog)
    #print followers
    # The tagged endpoint is not tied to a blog, hence blog_url=None
    posts = t.get('tagged',blog_url=None, params={'tag':'asoiaf', 'before': 1363820400})
    print len(posts)
    # NOTE(review): `a` is never read in the visible code
    a = 0
    for p in posts:
        print (p['post_url'], p['timestamp'], p['type'])
# NOTE(review): the triple-quoted string opened below is not closed within the visible source
'''
def dump_tumblr_json():
    """Back up 20 pages of 10 posts each from the configured blog as JSON.

    The raw API response for page ``n`` (0-19) is written to
    ``data/backups/tumblr_prod_<n>.json``.
    """
    client = Tumblpy(
        app_key=app_config.TUMBLR_KEY,
        app_secret=os.environ['TUMBLR_APP_SECRET'],
        oauth_token=os.environ['TUMBLR_OAUTH_TOKEN'],
        oauth_token_secret=os.environ['TUMBLR_OAUTH_TOKEN_SECRET'])

    page_size = 10
    for page_number in range(20):
        query = {'limit': page_size, 'offset': page_number * page_size}
        response = client.get('posts', blog_url=app_config.TUMBLR_URL, params=query)

        backup_path = 'data/backups/tumblr_prod_%s.json' % page_number
        with open(backup_path, 'w') as backup_file:
            backup_file.write(json.dumps(response))
def deletePosts(tag):
    """Delete the posts tagged ``tag`` on the authenticated user's first blog.

    Only the posts returned by a single ``posts`` request are processed.

    Args:
        tag: Tag used to select the posts to delete.
    """
    t = Tumblpy(app_key=APP_KEY, app_secret=APP_SECRET,
                oauth_token=OAUTH_TOKEN,
                oauth_token_secret=OAUTH_TOKEN_SECRET)

    # The authenticated user's info lists their blogs; use the first one
    user_info = t.post('user/info')
    blog_url = user_info['user']['blogs'][0]['url']
    # Single-argument print(...) behaves the same on Python 2 and Python 3
    print("url: %s" % blog_url)

    posts = t.get('posts', blog_url=blog_url, params={'tag': tag})
    for post in posts['posts']:
        print("%s %s" % (post['date'], post['post_url']))
        # The response is not needed; the loop variable is left untouched
        t.post('post/delete', blog_url=blog_url, params={'id': post['id']})
        print("post deleted!!")
def dump_tumblr_json():
    """Back up 20 pages of 10 posts each from the configured blog as JSON.

    Credentials are read from environment variables prefixed with
    ``app_config.CONFIG_NAME``. The raw API response for page ``n`` (0-19)
    is written to ``data/backups/tumblr_prod_<n>.json``.
    """
    def from_env(template):
        # Environment variable names carry the active configuration's name
        return os.environ[template % app_config.CONFIG_NAME]

    client = Tumblpy(
        app_key=app_config.TUMBLR_KEY,
        app_secret=from_env("%s_TUMBLR_APP_SECRET"),
        oauth_token=from_env("%s_TUMBLR_OAUTH_TOKEN"),
        oauth_token_secret=from_env("%s_TUMBLR_OAUTH_TOKEN_SECRET"),
    )

    per_page = 10
    page_index = 0
    while page_index < 20:
        response = client.get(
            "posts",
            blog_url=app_config.TUMBLR_URL,
            params={"limit": per_page, "offset": page_index * per_page},
        )
        with open("data/backups/tumblr_prod_%s.json" % page_index, "w") as out:
            out.write(json.dumps(response))
        page_index += 1
def hidePosts():
    """Set the posts tagged "JustMigrate" on the user's first blog to private.

    Only the posts returned by a single ``posts`` request are processed.
    """
    t = Tumblpy(app_key=APP_KEY, app_secret=APP_SECRET,
                oauth_token=OAUTH_TOKEN,
                oauth_token_secret=OAUTH_TOKEN_SECRET)

    # The authenticated user's info lists their blogs; use the first one
    user_info = t.post('user/info')
    blog_url = user_info['user']['blogs'][0]['url']
    # Single-argument print(...) behaves the same on Python 2 and Python 3
    print("url: %s" % blog_url)

    posts = t.get('posts', blog_url=blog_url, params={'tag': "JustMigrate"})
    for post in posts['posts']:
        print("%s %s" % (post['date'], post['post_url']))
        edited = t.post('edit', blog_url=blog_url,
                        params={'id': post['id'], 'state': 'private'})
        # NOTE(review): assumes the edit response carries a 'state' key — confirm
        print("ahora está en estado %s" % edited['state'])
def do():
    """Fetch one page of video posts for ``self.blog`` and pass them to the handler.

    Hitting the API rate limit is logged and ends the job. A 404 from the API
    marks the blog as dead; other Tumblpy errors are ignored. The HTTP client
    is closed and ``session.remove()`` is called on every path.
    """
    t = None
    try:
        t = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
        resp = t.get('posts/video', blog_url=self.blog.url,
                     params={"offset": self.offset})
        # A video that does not exist leaves the url field empty
        posts = resp.get('posts')
        video_posts_handler(posts, self.blog)
    except TumblpyRateLimitError:
        spider_log.info("Key调用次数达到上限,本线程退出")
    except TumblpyError as e:
        if e.error_code == 404:
            mark_dead_blog(self.blog)
    finally:
        try:
            # Close the client even when the request failed
            if t is not None:
                t.client.close()
        finally:
            session.remove()
def do():
    """Fetch one page of photo posts for ``self.blog`` and pass them to the handler.

    Hitting the API rate limit is logged and ends the job. A 404 from the API
    marks the blog as dead; other Tumblpy errors are ignored. The HTTP client
    is closed and ``session.remove()`` is called on every path.
    """
    t = None
    try:
        t = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
        resp = t.get('posts/photo', blog_url=self.blog.url,
                     params={"offset": self.offset})
        posts = resp.get('posts')
        post_handler(posts, self.blog)
    except TumblpyRateLimitError:
        spider_log.info("Key调用次数达到上限,本线程退出")
    except TumblpyError as e:
        if e.error_code == 404:
            mark_dead_blog(self.blog)
    finally:
        try:
            # Close the client even when the request failed
            if t is not None:
                t.client.close()
        finally:
            session.remove()
def do():
    """Refresh name, url and post count of every stored blog from the Tumblr API.

    Each blog is committed individually. A 404 marks the blog as dead; other
    Tumblpy errors skip the blog. Hitting the API rate limit stops the whole
    job, as the log message announces.
    """
    spider_log.info("开始获取博客信息!")
    blogs = load_all_blog()
    spider_log.info("加载Blog列表完成!")
    try:
        for blog in blogs:
            t = None
            try:
                t = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
                resp = t.get('info', blog_url=urlparse(blog.url).netloc)
                b = resp.get("blog")
                blog.name = b.get("name")
                blog.url = b.get("url")
                blog.posts = b.get("posts")
                spider_log.info("BlogId:{} 已更新".format(blog.id))
            except TumblpyRateLimitError:
                spider_log.info("Key达到上限,本线程退出")
                # Every further request would hit the same limit
                return
            except TumblpyError as e:
                if e.error_code == 404:
                    mark_dead_blog(blog)
            finally:
                try:
                    # Close the client even when the request failed
                    if t is not None:
                        t.client.close()
                finally:
                    session.commit()
    finally:
        # NOTE(review): presumably `session` is a scoped session that the
        # loaded blogs are attached to, so it is released once, after the
        # loop — confirm
        session.remove()
# Read every CSV row up front, then look up the Tumblr post behind each one.
rows = list(csv.reader(f))
for row in rows:
    # Each row must hold exactly three columns, otherwise unpacking raises
    svg_url, status, tumblr_url = row
    if not tumblr_url:
        # No Tumblr post for this row: pad it with two empty columns
        row.append('')
        row.append('')
        continue
    # The post id is the last path segment of the Tumblr URL
    post_id = tumblr_url.split('/')[-1]
    try:
        print post_id
        post = t.get('posts', blog_url=app_config.TUMBLR_URL, params={'id': post_id})
    except TumblpyError, e:
        # Lookup failed: report it and pad the row like a missing URL
        print 'GET error %s: %s %s' % (post_id, e.error_code, e.msg)
        row.append('')
        row.append('')
        continue
    caption = post['posts'][0]['caption']
    # Pull the text of the "signature-name" paragraph out of the caption HTML
    attribution = re.search('<p class=\"signature-name\">(.*)<\/p>', caption)
    # NOTE(review): re.search returns None when the caption has no such
    # paragraph, and .group(1) would then raise — confirm captions always have one
    details = attribution.group(1)
    if ',' in details:
        # Split on the first comma only: "name, location"
        name, location = details.split(',', 1)
def download_images_from_tumblr(tag, max_count=10, before_timestamp=None, saved_path=None, is_face_detect=None, is_animeface=None):
    """Download photos from Tumblr posts carrying ``tag``.

    Batches of tagged posts are requested, each batch older than the last
    post seen, until at least ``max_count`` images have been downloaded or
    the API returns nothing (or raises).

    Args:
        tag: Tag to search for.
        max_count: Stop requesting batches once this many images are saved.
            A batch in progress is finished, so the count can be exceeded.
        before_timestamp: If given, only posts before this timestamp are
            requested in the first batch.
        saved_path: Output directory; defaults to ``./dl_tumblr/<tag>``.
        is_face_detect: When not ``None``, face detection runs on each
            downloaded image.
        is_animeface: Passed through to ``save_detected_faces``.

    Returns:
        ``(dl_count, fc_count, last_timestamp)``: images downloaded, the sum
        of the ``save_detected_faces`` results, and the timestamp of the last
        post looked at (0 if none).
    """
    print("download_image_from_tumblr/tag:", tag, ", before_timestamp:", before_timestamp)
    dl_count = 0
    fc_count = 0
    last_timestamp = 0
    t = Tumblpy(CONSUMER_KEY, CONSUMER_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

    params = {'tag': tag}
    if before_timestamp is not None:
        # The tagged endpoint pages with "before", the key the loop also uses
        params["before"] = before_timestamp

    output_path = saved_path
    if saved_path is None:
        output_path = "./dl_tumblr/" + tag

    while dl_count < max_count:
        if last_timestamp > 0:
            # Continue from the oldest post seen so far
            params["before"] = last_timestamp

        posts = None
        try:
            posts = t.get('tagged', params=params)
        except Exception as e:
            # Best effort: report the failure and end the run below
            print("error: ", str(e))

        if posts is None or len(posts) == 0:
            print("end: tags is void")
            break

        for post in posts:
            if post is None:
                break
            last_timestamp = post["timestamp"]
            if "photos" not in post:
                continue
            for photo in post["photos"]:
                image_url = photo["original_size"]["url"]
                file_path = download_image(image_url, output_path)
                if not file_path:
                    # A failed download skips the rest of this post's photos
                    break
                dl_count += 1
                if is_face_detect is not None:
                    fc_count += save_detected_faces(
                        image_path=file_path, is_animeface=is_animeface)

    return dl_count, fc_count, last_timestamp
from tumblpy import Tumblpy
import time
import Configuration

config = Configuration.Configuration('config.ini')

# Get the final tokens from the database or wherever you have them stored
t = Tumblpy(config.ConsumerKey, config.ConsumerSecret, config.OauthToken, config.OauthTokenSecret)

# Fetch the user info and take the URL of the user's first blog
blog_url = t.post('user/info')
blog_url = blog_url['user']['blogs'][0]['url']
posts = t.get('posts', blog_url=blog_url)


class UserInfo:
    """Pairs a user name with that user's blog URL."""

    def __init__(self, name, blogURL):
        self.Name = name
        self.BlogURL = blogURL


class AutoFollow:
    # File that collects the names of followed users, one per line
    USERNAMEFILE = "followedUser.txt"

    def WriteNameToFile(self, username):
        """Append ``username`` and a newline to ``USERNAMEFILE``."""
        # The context manager closes the file even if a write fails
        with open(self.USERNAMEFILE, 'a') as userFile:
            userFile.write(username)
            userFile.write('\n')
# Script body: collect the text of a blog's own (non-reblogged) text posts.
blog = flags.blog
blog_url = "http://" + blog + ".tumblr.com/"
consumer = flags.consumer
secret = flags.secret
# Authenticate on Tumblr
t = Tumblpy(consumer, secret)
auth_props = t.get_authentication_tokens()
# NOTE(review): auth_url and OAUTH_TOKEN_SECRET are not used in the visible code
auth_url = auth_props['auth_url']
OAUTH_TOKEN_SECRET = auth_props['oauth_token_secret']
print "You're into Tumblr!"
# Get the number of posts to evaluate
posts = t.get('info', blog_url=blog_url)
num_posts = posts['blog']['posts']
print "There are " + str(num_posts) + " posts to examine..."
# Iterate through posts, searching for self-text posts, saving body of each post
# NOTE(review): this file handle is not closed in the visible code
f = open((blog + '.txt'),'w')
count = 0
texts = []
# Step through the posts 20 at a time via the offset parameter
for i in range(0,num_posts,20):
    # Only text posts, only raw text, and include reblog information
    posts = t.get('posts', blog_url=blog_url, params = {'offset':str(i),'type':'text','filter':'text','reblog_info':'true'})
    theposts = posts['posts']
    for j in theposts:
        # NOTE(review): .encode() runs before the None check below, so a post
        # without a 'body' key raises AttributeError here — confirm
        body = j.get('body').encode('utf8')
        # Keep only posts that are not reblogs (no reblogged_root_url)
        if body != None and j.get('reblogged_root_url') == None:
from tumblpy import Tumblpy
from flask import Flask, request, render_template, g, redirect, Response

tmpl_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'templates')
app = Flask(__name__, template_folder=tmpl_dir)

# Tumblr API credentials are read from the environment so that no secret is
# stored in the source tree. A missing variable raises KeyError at import
# time rather than failing later with an authentication error.
CONSUMER_KEY = os.environ['TUMBLR_CONSUMER_KEY']
CONSUMER_SECRET = os.environ['TUMBLR_CONSUMER_SECRET']
OAUTH_TOKEN = os.environ['TUMBLR_OAUTH_TOKEN']
OAUTH_TOKEN_SECRET = os.environ['TUMBLR_OAUTH_TOKEN_SECRET']

t = Tumblpy(CONSUMER_KEY, CONSUMER_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

# Posts are fetched once, when the module is imported
posts = t.get('posts', blog_url="www.cloktahwho.tumblr.com")


@app.before_request
def before_request():
    """Run at the beginning of every web request.

    Currently this only prints a trace message; no per-request state is
    set up here.
    """
    print("Hello this is trying to work.")
from tumblpy import Tumblpy
import yaml

# Tumblr credentials live under the "tumblr" key of this YAML file
settings_fn = 'settings.yml'

with open(settings_fn) as f:
    # safe_load only builds plain Python types; yaml.load without an explicit
    # Loader can construct arbitrary objects and is rejected by current PyYAML
    settings = yaml.safe_load(f)

t = Tumblpy(app_key=settings['tumblr']['consumer_key'],
            app_secret=settings['tumblr']['consumer_secret'],
            oauth_token=settings['tumblr']['oauth_token'],
            oauth_token_secret=settings['tumblr']['oauth_secret'])

posts = t.get('posts', blog_url='okbot.tumblr.com')
# Single-argument print(...) behaves the same on Python 2 and Python 3
print(posts)
# Read every row of the review CSV, then look up the Tumblr post behind each one.
with open('data/review.csv') as f:
    rows = list(csv.reader(f))
for row in rows:
    # Each row must hold exactly three columns, otherwise unpacking raises
    svg_url, status, tumblr_url = row
    if not tumblr_url:
        # No Tumblr post for this row: pad it with two empty columns
        row.append('')
        row.append('')
        continue
    # The post id is the last path segment of the Tumblr URL
    post_id = tumblr_url.split('/')[-1]
    try:
        print post_id
        post = t.get('posts', blog_url=app_config.TUMBLR_URL, params={ 'id': post_id })
    except TumblpyError, e:
        # Lookup failed: report it and pad the row like a missing URL
        print 'GET error %s: %s %s' % (post_id, e.error_code, e.msg)
        row.append('')
        row.append('')
        continue
    caption = post['posts'][0]['caption']
    # Pull the text of the "signature-name" paragraph out of the caption HTML
    attribution = re.search('<p class=\"signature-name\">(.*)<\/p>', caption)
    # NOTE(review): re.search returns None when the caption has no such
    # paragraph, and .group(1) would then raise — confirm captions always have one
    details = attribution.group(1)
    if ',' in details:
        # Split on the first comma only: "name, location"
        name, location = details.split(',', 1)
    else:
def get_tumblr_table(self):
    """Build a gtk.Table with one row per post of the user's second blog.

    Each row shows an image next to the post date, the post's tags (only
    when it has any) and the post text after it has been passed through
    ``MLStripper``.

    Tumblr credentials are read from the environment variables
    TUMBLR_CONSUMER_KEY, TUMBLR_CONSUMER_SECRET, TUMBLR_OAUTH_TOKEN and
    TUMBLR_OAUTH_TOKEN_SECRET; a missing variable raises KeyError.

    Returns:
        The populated gtk.Table.
    """
    import os  # local import keeps this method self-contained

    # Credentials come from the environment, not from the source code
    t = Tumblpy(os.environ["TUMBLR_CONSUMER_KEY"],
                os.environ["TUMBLR_CONSUMER_SECRET"],
                os.environ["TUMBLR_OAUTH_TOKEN"],
                os.environ["TUMBLR_OAUTH_TOKEN_SECRET"])

    # The authenticated user's info lists their blogs; use the second one
    user_info = t.post('user/info')
    blog_url = user_info['user']['blogs'][1]['url']
    posts = t.get('posts', blog_url=blog_url)
    posts_count = posts["total_posts"]

    table = gtk.Table(posts_count, 1, False)
    # set the spacing to 10 on x and 10 on y
    table.set_row_spacings(10)
    table.set_col_spacings(10)

    for i, cur_post in enumerate(posts["posts"]):
        text = ""
        if cur_post["type"] == "photo":
            # Use the last entry of the first photo's alt_sizes as thumbnail
            img_url = cur_post["photos"][0]["alt_sizes"][-1]["url"]
            cur_image_fac = catImageBox.catImageBox(img_url, 75, 75)
            text = cur_post["caption"]
        else:
            # Placeholder image for posts without a photo
            cur_image_fac = catImageBox.catImageBox("http://www.linux.org.ru/tango/img/opensource-logo.png", 50, 50)
            if cur_post["type"] == "text":
                text = cur_post["body"]

        s = MLStripper()
        s.feed(text)
        label = gtk.Label(s.get_data())
        label.set_line_wrap(True)
        label.set_justify(gtk.JUSTIFY_LEFT)
        label.set_width_chars(30)
        label.show()

        # date box: post image on the left, date on the right
        date_box = gtk.HBox(True, 1)
        cur_image = cur_image_fac.image
        cur_image.show()
        fdate = cur_post["date"]
        # Only the part of the date before the first space is shown
        date_label = gtk.Label(fdate.split(" ")[0])
        date_label.set_line_wrap(True)
        date_label.show()
        date_box.pack_start(cur_image, True, True, 1)
        date_box.pack_end(date_label, True, True, 1)
        date_box.show()

        # tag box: tag icon on the left, tags on the right
        tag_box = gtk.HBox(True, 1)
        tag_icon = gtk.Image()
        tag_icon.set_from_file("resources/tag.png")
        tag_icon.show()
        # Every tag is followed by one space, including the last one
        ftags = "".join(cur_tag + " " for cur_tag in cur_post["tags"])
        tag_label = gtk.Label(ftags)
        tag_label.set_line_wrap(True)
        tag_label.show()
        tag_box.pack_start(tag_icon, True, True, 1)
        tag_box.pack_end(tag_label, True, True, 1)
        tag_box.show()

        separator = gtk.HSeparator()
        separator.show()

        box = gtk.VBox(True, 1)
        box.pack_start(date_box, True, True, 1)
        # Show the tag row only for posts that actually have tags
        if cur_post["tags"]:
            box.pack_start(tag_box, True, True, 1)
        box.pack_start(label, True, True, 0)
        box.pack_end(separator, True, True, 0)
        box.show()

        table.attach(box, 1, 2, i, i + 1)

    return table