Ejemplo n.º 1
0
 def scrape(self, all=True, max_depth=10):
     """Pull posts for this blog from Tumblr and save the photos they hold.

     Posts are requested in pages of 20. Paging ends when a page comes back
     empty, when a page contains a post older than ``self.last_scraped``
     (that page is still kept in full), or once ``max_depth`` pages have
     been requested. Afterwards ``self.last_scraped`` is set to the current
     time and every photo built from the collected posts is saved and
     tagged.

     Args:
         all: When true, ``max_depth`` is ignored and paging is unbounded.
         max_depth: Number of pages to request at most when ``all`` is
             false.
     """
     page_size = 20
     page_limit = max_depth if not all else float('inf')
     client = Tumblpy(TUMBLR_KEYS['consumer'], TUMBLR_KEYS['secret'])

     collected = []
     page = 0
     caught_up = False
     while not caught_up and page < page_limit:
         response = client.get('posts', self.url,
                               params={'offset': page * page_size,
                                       'limit': page_size,
                                       'notes_info': True})
         batch = response['posts']
         if not batch:
             # The blog has no further posts.
             break
         # Seeing a post from before the previous scrape makes this the
         # last page requested.
         caught_up = any(
             tz.make_aware(dt.fromtimestamp(entry['timestamp']))
             < self.last_scraped
             for entry in batch)
         collected += batch
         page += 1

     self.last_scraped = tz.now()

     # Turn every collected post into saved, tagged photos.
     for entry in collected:
         built = scraping.models.photos.Photo.from_tumblr_api(entry, self)
         for item in built:
             photo, raw_tags = item['photo'], item['raw tags']
             photo.save()
             photo.tags_from_ary(raw_tags)
Ejemplo n.º 2
0
 def from_api(cls, name):
     """Build an unsaved instance populated from the Tumblr API.

     Requests the blog info and a 512px avatar for ``name`` and copies the
     blog's url, title and description, plus the avatar url, onto a fresh
     ``cls()`` instance. Nothing is written to the database.

     Args:
         name: Blog name passed to the API calls.

     Returns:
         The new, unsaved instance.

     Raises:
         TumblpyError: If either API call fails.
     """
     client = Tumblpy(TUMBLR_KEYS['consumer'], TUMBLR_KEYS['secret'])
     try:
         blog = client.get('info', name)['blog']
         avatar = client.get('avatar', name, params={'size': 512})
     except TumblpyError:
         host = name + '.tumblr.com'
         raise TumblpyError('Could not connect to {}'.format(host))

     blog_obj = cls()
     blog_obj.url = blog['url']
     blog_obj.name = blog['title']
     blog_obj.description = blog['description']
     blog_obj.avatar_url = avatar['url']
     return blog_obj
Ejemplo n.º 3
0
 def get_url(self, string):
     """Resolve a user-supplied string to a Tumblr blog.

     The string is first tried as a ``*.tumblr.com`` URL (with an optional
     ``http://`` or ``https://`` prefix); anything after the host is
     ignored. Otherwise, if it looks like a bare blog name, the Tumblr API
     is asked whether a blog with that name exists.

     Args:
         string: The text to interpret.

     Returns:
         ``{'type': TumblrBlog, 'url': <str>}`` when a blog is identified,
         otherwise ``None``.
     """
     # Raw strings keep the regex escapes away from Python's own string
     # escape processing.
     tumblr_url_regex = r'(?P<url>(https?://)?[A-Za-z0-9\-]+\.tumblr\.com).*'
     tumblr_url_match = re.fullmatch(tumblr_url_regex, string)
     if tumblr_url_match:
         # Return the captured URL text, not the match object itself.
         return {'type': TumblrBlog, 'url': tumblr_url_match.group('url')}

     tumblr_name_regex = r'[A-Za-z0-9\-]+'
     if re.fullmatch(tumblr_name_regex, string) is None:
         # Neither a tumblr URL nor a plausible blog name.
         return None

     tumblr_agent = Tumblpy(TUMBLR['consumer'], TUMBLR['secret'])
     try:
         tumblr_agent.get('info', string)
     except TumblpyError:
         # The lookup failed, so no blog is reported for this name.
         return None
     return {'type': TumblrBlog,
             'url': 'http://' + string + '.tumblr.com/'}
Ejemplo n.º 4
0
    def getPosts():
        """Print the date and URL of posts on the authenticated user's first blog.

        Looks up the user's blogs via ``user/info``, takes the first one and
        prints one ``<date> <post_url>`` line for each post returned by a
        single ``posts`` request.
        """
        t = Tumblpy(app_key=APP_KEY,
                    app_secret=APP_SECRET,
                    oauth_token=OAUTH_TOKEN,
                    oauth_token_secret=OAUTH_TOKEN_SECRET)

        # The first blog listed for the user is the one inspected.
        user_info = t.post('user/info')
        blog_url = user_info['user']['blogs'][0]['url']
        # Parenthesised single-argument prints behave the same on Python 2
        # and Python 3.
        print("url: %s" % blog_url)

        posts = t.get('posts', blog_url=blog_url)
        for post in posts['posts']:
            print("%s %s" % (post['date'], post['post_url']))
Ejemplo n.º 5
0
def main():

    setup()
    t = Tumblpy(app_key = my_app_key, app_secret = my_app_secret,
                oauth_token=oauth_token, oauth_token_secret=oauth_token_secret)

    blog='staff.tumblr.com'
    #followers = t.get('posts', blog_url=blog)
    #print followers
    posts = t.get('tagged',blog_url=None, params={'tag':'asoiaf', 'before': 1363820400})
    print len(posts)
    a = 0
    for p in posts:
        print (p['post_url'], p['timestamp'], p['type'])

    '''
Ejemplo n.º 6
0
def dump_tumblr_json():
    """Back up 20 pages of 10 posts each from the configured blog.

    Every page of the ``posts`` response is serialised as JSON into
    ``data/backups/tumblr_prod_<page>.json``.
    """
    client = Tumblpy(
        app_key=app_config.TUMBLR_KEY,
        app_secret=os.environ['TUMBLR_APP_SECRET'],
        oauth_token=os.environ['TUMBLR_OAUTH_TOKEN'],
        oauth_token_secret=os.environ['TUMBLR_OAUTH_TOKEN_SECRET'])

    page_size = 10
    page_count = 20

    for page_index in range(page_count):
        query = {'limit': page_size, 'offset': page_index * page_size}
        response = client.get('posts',
                              blog_url=app_config.TUMBLR_URL,
                              params=query)

        backup_path = 'data/backups/tumblr_prod_%s.json' % page_index
        with open(backup_path, 'w') as backup:
            backup.write(json.dumps(response))
Ejemplo n.º 7
0
    def deletePosts(tag):
        """Delete the posts carrying ``tag`` on the authenticated user's first blog.

        Only the posts returned by a single ``posts`` request are processed.
        Each post's date and URL is printed before it is deleted.

        Args:
            tag: Tag used to filter the posts that are deleted.
        """
        t = Tumblpy(app_key=APP_KEY,
                    app_secret=APP_SECRET,
                    oauth_token=OAUTH_TOKEN,
                    oauth_token_secret=OAUTH_TOKEN_SECRET)

        # The first blog listed for the user is the one operated on.
        user_info = t.post('user/info')
        blog_url = user_info['user']['blogs'][0]['url']
        # Parenthesised single-argument prints behave the same on Python 2
        # and Python 3.
        print("url: %s" % blog_url)

        posts = t.get('posts', blog_url=blog_url, params={'tag': tag})
        for post in posts['posts']:
            print("%s %s" % (post['date'], post['post_url']))
            # Keep the post id in its own name rather than shadowing the
            # ``id`` builtin or rebinding the loop variable.
            post_id = post['id']
            t.post('post/delete', blog_url=blog_url, params={'id': post_id})
            print("post deleted!!")
Ejemplo n.º 8
0
def dump_tumblr_json():
    """Write 20 pages of 10 posts each from the configured blog to JSON backups.

    The OAuth secrets are looked up in environment variables whose names
    are prefixed with ``app_config.CONFIG_NAME``. Each page of the
    ``posts`` response ends up in ``data/backups/tumblr_prod_<page>.json``.
    """
    credentials = {
        "app_key": app_config.TUMBLR_KEY,
        "app_secret": os.environ["%s_TUMBLR_APP_SECRET" % app_config.CONFIG_NAME],
        "oauth_token": os.environ["%s_TUMBLR_OAUTH_TOKEN" % app_config.CONFIG_NAME],
        "oauth_token_secret": os.environ["%s_TUMBLR_OAUTH_TOKEN_SECRET" % app_config.CONFIG_NAME],
    }
    client = Tumblpy(**credentials)

    per_page = 10
    for page_no in range(20):
        query = {"limit": per_page, "offset": page_no * per_page}
        payload = client.get("posts", blog_url=app_config.TUMBLR_URL, params=query)

        target = "data/backups/tumblr_prod_%s.json" % page_no
        with open(target, "w") as out:
            out.write(json.dumps(payload))
Ejemplo n.º 9
0
    def hidePosts():
        """Set posts tagged ``JustMigrate`` on the user's first blog to private.

        Only the posts returned by a single ``posts`` request are processed.
        Each post's date and URL is printed, the post is edited to the
        ``private`` state, and the resulting state is printed.
        """
        t = Tumblpy(app_key=APP_KEY,
                    app_secret=APP_SECRET,
                    oauth_token=OAUTH_TOKEN,
                    oauth_token_secret=OAUTH_TOKEN_SECRET)

        # The first blog listed for the user is the one operated on.
        user_info = t.post('user/info')
        blog_url = user_info['user']['blogs'][0]['url']
        # Parenthesised single-argument prints behave the same on Python 2
        # and Python 3.
        print("url: %s" % blog_url)

        new_state = 'private'
        posts = t.get('posts', blog_url=blog_url, params={'tag': "JustMigrate"})
        for post in posts['posts']:
            print("%s %s" % (post['date'], post['post_url']))
            post_id = post['id']
            # The blog-level edit endpoint is 'post/edit' (compare
            # 'post/delete'); a bare 'edit' is not a Tumblr API route.
            result = t.post('post/edit', blog_url=blog_url,
                            params={'id': post_id, 'state': new_state})
            # NOTE(review): the edit response may not echo the state back;
            # fall back to the state that was requested - confirm against
            # the API response.
            print("ahora está en estado %s" % result.get('state', new_state))
Ejemplo n.º 10
0
 def do():
     """Fetch one page of video posts for ``self.blog`` and process them.

     ``self`` comes from the enclosing scope. The posts found at
     ``self.offset`` are handed to ``video_posts_handler``. A rate-limit
     error is logged and ends the call; a 404 marks the blog as dead.
     The database session is always removed on the way out.
     """
     try:
         t = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
         try:
             resp = t.get('posts/video',
                          blog_url=self.blog.url,
                          params={"offset": self.offset})
             # A video that no longer exists leaves the url field empty.
             posts = resp.get('posts')
             video_posts_handler(posts, self.blog)
         finally:
             # Release the HTTP session even when the request or the
             # handler raises.
             t.client.close()
     except TumblpyRateLimitError:
         spider_log.info("Key调用次数达到上限,本线程退出")
     except TumblpyError as e:
         if e.error_code == 404:
             mark_dead_blog(self.blog)
     finally:
         session.remove()
Ejemplo n.º 11
0
 def do():
     """Fetch one page of photo posts for ``self.blog`` and process them.

     ``self`` comes from the enclosing scope. The posts found at
     ``self.offset`` are handed to ``post_handler``. A rate-limit error is
     logged and ends the call; a 404 marks the blog as dead. The database
     session is always removed on the way out.
     """
     try:
         t = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
         try:
             resp = t.get('posts/photo',
                          blog_url=self.blog.url,
                          params={"offset": self.offset})
             posts = resp.get('posts')
             post_handler(posts, self.blog)
         finally:
             # Release the HTTP session even when the request or the
             # handler raises.
             t.client.close()
     except TumblpyRateLimitError:
         spider_log.info("Key调用次数达到上限,本线程退出")
     except TumblpyError as e:
         if e.error_code == 404:
             mark_dead_blog(self.blog)
     finally:
         session.remove()
Ejemplo n.º 12
0
 def do():
     """Refresh name, url and post count for every stored blog.

     ``self`` comes from the enclosing scope. Each blog's ``info`` is
     requested from Tumblr and copied onto the blog object; the session is
     committed after every blog. A 404 marks the blog as dead. A rate-limit
     error stops the loop, because further requests with the same key
     would fail as well. The session is removed at the end.
     """
     spider_log.info("开始获取博客信息!")
     blogs = load_all_blog()
     spider_log.info("加载Blog列表完成!")
     for blog in blogs:
         try:
             t = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
             try:
                 resp = t.get('info', blog_url=urlparse(blog.url).netloc)
             finally:
                 # Release the HTTP session even when the request raises.
                 t.client.close()
             b = resp.get("blog")
             blog.name = b.get("name")
             blog.url = b.get("url")
             blog.posts = b.get("posts")
             spider_log.info("BlogId:{} 已更新".format(blog.id))
         except TumblpyRateLimitError:
             spider_log.info("Key达到上限,本线程退出")
             # The log message announces that the thread exits, so stop
             # instead of hitting the API again for the remaining blogs.
             break
         except TumblpyError as e:
             if e.error_code == 404:
                 mark_dead_blog(blog)
         finally:
             session.commit()
     session.remove()
Ejemplo n.º 13
0
    rows = list(csv.reader(f))

for row in rows:
    svg_url, status, tumblr_url = row

    if not tumblr_url:
        row.append('')
        row.append('')
        continue

    post_id = tumblr_url.split('/')[-1]

    try:
        print post_id
        post = t.get('posts',
                     blog_url=app_config.TUMBLR_URL,
                     params={'id': post_id})
    except TumblpyError, e:
        print 'GET error %s: %s %s' % (post_id, e.error_code, e.msg)
        row.append('')
        row.append('')
        continue

    caption = post['posts'][0]['caption']

    attribution = re.search('<p class=\"signature-name\">(.*)<\/p>', caption)

    details = attribution.group(1)

    if ',' in details:
        name, location = details.split(',', 1)
Ejemplo n.º 14
0
def download_images_from_tumblr(tag,
                                max_count=10,
                                before_timestamp=None,
                                saved_path=None,
                                is_face_detect=None,
                                is_animeface=None):
    """Download photos from Tumblr posts carrying ``tag``.

    Pages of the ``tagged`` endpoint are requested, each one starting
    before the timestamp of the last post seen, until ``max_count`` images
    have been downloaded, the API returns nothing, or a request fails.

    Args:
        tag: Tag to search for.
        max_count: Stop once this many images have been downloaded.
        before_timestamp: If given, only posts before this timestamp are
            requested on the first page.
        saved_path: Directory passed to ``download_image``; defaults to
            ``./dl_tumblr/<tag>``.
        is_face_detect: Any value other than ``None`` runs
            ``save_detected_faces`` on every downloaded image.
        is_animeface: Forwarded to ``save_detected_faces``.

    Returns:
        A tuple ``(dl_count, fc_count, last_timestamp)``: images
        downloaded, the summed results of ``save_detected_faces``, and the
        timestamp of the last post examined (0 if none was).
    """
    print("download_image_from_tumblr/tag:", tag, ", before_timestamp:",
          before_timestamp)

    dl_count = 0
    fc_count = 0
    last_timestamp = 0

    t = Tumblpy(CONSUMER_KEY, CONSUMER_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

    params = {'tag': tag}
    if before_timestamp is not None:
        # The tagged endpoint's cut-off parameter is "before" (the same key
        # used for paging below); "timestamp" was ignored by the API.
        params["before"] = before_timestamp

    output_path = saved_path
    if saved_path is None:
        output_path = "./dl_tumblr/" + tag

    while dl_count < max_count:

        if last_timestamp > 0:
            params["before"] = last_timestamp

        posts = None
        try:
            posts = t.get('tagged', params=params)
        except Exception as e:
            # Best effort: report the failure and stop paging below.
            print("error: ", str(e))

        if not posts:
            print("end: tags is void")
            break

        # The loop variable is deliberately not called ``tag`` so the
        # parameter is not shadowed.
        for post in posts:
            if post is None:
                break

            last_timestamp = post["timestamp"]

            if "photos" not in post:
                # Posts without photos are skipped; the rest of the page is
                # still processed.
                continue

            for photo in post["photos"]:
                image_url = photo["original_size"]["url"]
                file_path = download_image(image_url, output_path)
                if not file_path:
                    continue
                dl_count += 1
                if is_face_detect is not None:
                    fc_count += save_detected_faces(
                        image_path=file_path,
                        is_animeface=is_animeface)
                if dl_count >= max_count:
                    # Honour the cap inside a page as well.
                    return dl_count, fc_count, last_timestamp

    return dl_count, fc_count, last_timestamp
Ejemplo n.º 15
0
from tumblpy import Tumblpy
import time
import Configuration

# API credentials are loaded from config.ini.
config = Configuration.Configuration('config.ini')

# Client built from the stored consumer key/secret and OAuth tokens.
t = Tumblpy(
    config.ConsumerKey,
    config.ConsumerSecret,
    config.OauthToken,
    config.OauthTokenSecret,
)

# The first blog listed for the authenticated user is the one queried.
blog_url = t.post('user/info')['user']['blogs'][0]['url']
posts = t.get('posts', blog_url=blog_url)


class UserInfo:
    """Plain record pairing a user's name with their blog URL."""

    def __init__(self, name, blogURL):
        self.Name, self.BlogURL = name, blogURL


class AutoFollow:
    # File that collects the names of followed users, one per line.
    USERNAMEFILE = "followedUser.txt"

    def WriteNameToFile(self, username):
        """Append ``username`` and a newline to ``USERNAMEFILE``.

        The file is opened in append mode and closed again even if a write
        fails.
        """
        with open(self.USERNAMEFILE, 'a') as userFile:
            userFile.write(username)
            userFile.write('\n')
Ejemplo n.º 16
0
blog = flags.blog
blog_url = "http://" + blog + ".tumblr.com/"
consumer = flags.consumer
secret = flags.secret

# Authenticate on Tumblr
t = Tumblpy(consumer, secret)

auth_props = t.get_authentication_tokens()
auth_url = auth_props['auth_url']

OAUTH_TOKEN_SECRET = auth_props['oauth_token_secret']
print "You're into Tumblr!"

# Get the number of posts to evaluate
posts = t.get('info', blog_url=blog_url)
num_posts = posts['blog']['posts']
print "There are " + str(num_posts) + " posts to examine..."
 
# Iterate through posts, searching for self-text posts, saving body of each post
f = open((blog + '.txt'),'w')

count = 0
texts = []
for i in range(0,num_posts,20):
    # Only text posts, only raw text, and include reblog information
    posts = t.get('posts', blog_url=blog_url, params = {'offset':str(i),'type':'text','filter':'text','reblog_info':'true'})
    theposts = posts['posts']
    for j in theposts:
        body = j.get('body').encode('utf8')
        if body != None and j.get('reblogged_root_url') == None:
Ejemplo n.º 17
0
from tumblpy import Tumblpy
from flask import Flask, request, render_template, g, redirect, Response

tmpl_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'templates')
app = Flask(__name__, template_folder=tmpl_dir)

# Tumblr API credentials are read from the environment rather than being
# committed to source control. A missing variable raises KeyError at import
# time. Credentials that were previously hard-coded here should be treated
# as compromised and rotated.
CONSUMER_KEY = os.environ['TUMBLR_CONSUMER_KEY']
CONSUMER_SECRET = os.environ['TUMBLR_CONSUMER_SECRET']
OAUTH_TOKEN = os.environ['TUMBLR_OAUTH_TOKEN']
OAUTH_TOKEN_SECRET = os.environ['TUMBLR_OAUTH_TOKEN_SECRET']

t = Tumblpy(CONSUMER_KEY, CONSUMER_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

# Fetched once, when the module is imported.
posts = t.get('posts', blog_url="www.cloktahwho.tumblr.com")

#print(posts)


@app.before_request
def before_request():
    """Hook registered with ``app.before_request``.

    At present it only prints a debug message. No database connection or
    other per-request state is set up here.
    """
    print("Hello this is trying to work.")
Ejemplo n.º 18
0
from tumblpy import Tumblpy
import yaml
settings_fn = 'settings.yml'

# safe_load builds only plain Python objects; yaml.load without an explicit
# Loader can construct arbitrary objects and is rejected by current PyYAML.
with open(settings_fn) as f:
    settings = yaml.safe_load(f)

t = Tumblpy(app_key=settings['tumblr']['consumer_key'],
            app_secret=settings['tumblr']['consumer_secret'],
            oauth_token=settings['tumblr']['oauth_token'],
            oauth_token_secret=settings['tumblr']['oauth_secret'])

posts = t.get('posts', blog_url='okbot.tumblr.com')
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(posts)
Ejemplo n.º 19
0
with open('data/review.csv') as f:
    rows = list(csv.reader(f))
    
for row in rows:
    svg_url, status, tumblr_url = row

    if not tumblr_url:
	row.append('')
	row.append('')
        continue

    post_id = tumblr_url.split('/')[-1]

    try:
        print post_id
        post = t.get('posts', blog_url=app_config.TUMBLR_URL, params={ 'id': post_id })
    except TumblpyError, e:
        print 'GET error %s: %s %s' % (post_id, e.error_code, e.msg)
        row.append('')
        row.append('')
	continue

    caption = post['posts'][0]['caption']

    attribution = re.search('<p class=\"signature-name\">(.*)<\/p>', caption)

    details = attribution.group(1) 

    if ',' in details:
        name, location = details.split(',', 1)
    else:
Ejemplo n.º 20
0
	def get_tumblr_table(self):
		"""Build a gtk.Table holding one row per post of the user's second blog.

		Each row shows a thumbnail next to the post date, the post's tags
		(only when it has any), the post text with markup removed by
		MLStripper, and a separator.

		The Tumblr credentials are read from the environment variables
		TUMBLR_CONSUMER_KEY, TUMBLR_CONSUMER_SECRET, TUMBLR_OAUTH_TOKEN and
		TUMBLR_OAUTH_TOKEN_SECRET instead of being embedded in the source.

		Returns:
			The populated gtk.Table.

		Raises:
			KeyError: If one of the credential variables is not set.
		"""
		# Local import so this method does not depend on the module's
		# import block.
		import os

		t = Tumblpy(os.environ["TUMBLR_CONSUMER_KEY"],
			os.environ["TUMBLR_CONSUMER_SECRET"],
			os.environ["TUMBLR_OAUTH_TOKEN"],
			os.environ["TUMBLR_OAUTH_TOKEN_SECRET"])

		# The second blog listed for the user is the one displayed.
		user_info = t.post('user/info')
		blog_url = user_info['user']['blogs'][1]['url']

		posts = t.get('posts', blog_url=blog_url)
		table = gtk.Table(posts["total_posts"], 1, False)

		# set the spacing to 10 on x and 10 on y
		table.set_row_spacings(10)
		table.set_col_spacings(10)

		for row, cur_post in enumerate(posts["posts"]):
			# Placeholder thumbnail and empty text unless the post type
			# provides its own.
			text = ""
			img_url = "http://www.linux.org.ru/tango/img/opensource-logo.png"
			img_size = 50

			if cur_post["type"] == "text":
				text = cur_post["body"]
			elif cur_post["type"] == "photo":
				# The last entry of alt_sizes of the first photo is used
				# as the thumbnail.
				img_url = cur_post["photos"][0]["alt_sizes"][-1]["url"]
				img_size = 75
				text = cur_post["caption"]

			# Build the image box once, for the URL actually shown.
			cur_image = catImageBox.catImageBox(img_url, img_size, img_size).image
			cur_image.show()

			# Post text with the markup stripped.
			s = MLStripper()
			s.feed(text)
			label = gtk.Label(s.get_data())
			label.set_line_wrap(True)
			label.set_justify(gtk.JUSTIFY_LEFT)
			label.set_width_chars(30)
			label.show()

			# date box: thumbnail on the left, first part of the date on the right
			date_box = gtk.HBox(True, 1)
			date_label = gtk.Label(cur_post["date"].split(" ")[0])
			date_label.set_line_wrap(True)
			date_label.show()
			date_box.pack_start(cur_image, True, True, 1)
			date_box.pack_end(date_label, True, True, 1)
			date_box.show()

			separator = gtk.HSeparator()
			separator.show()

			box = gtk.VBox(True, 1)
			box.pack_start(date_box, True, True, 1)

			tags = cur_post["tags"]
			# Test the list itself; comparing the bound ``count`` method
			# with 0 never reflected whether the post has tags.
			if tags:
				tag_box = gtk.HBox(True, 1)
				tag_icon = gtk.Image()
				tag_icon.set_from_file("resources/tag.png")
				tag_icon.show()
				tag_label = gtk.Label("".join(tag + " " for tag in tags))
				tag_label.set_line_wrap(True)
				tag_label.show()
				tag_box.pack_start(tag_icon, True, True, 1)
				tag_box.pack_end(tag_label, True, True, 1)
				tag_box.show()
				box.pack_start(tag_box, True, True, 1)

			box.pack_start(label, True, True, 0)
			box.pack_end(separator, True, True, 0)
			box.show()
			table.attach(box, 1, 2, row, row + 1)
		return table
Ejemplo n.º 21
0
from tumblpy import Tumblpy
from flask import Flask, request, render_template, g, redirect, Response

tmpl_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'templates')
app = Flask(__name__, template_folder=tmpl_dir)

# Tumblr API credentials are read from the environment rather than being
# committed to source control. A missing variable raises KeyError at import
# time. Credentials that were previously hard-coded here should be treated
# as compromised and rotated.
CONSUMER_KEY        = os.environ['TUMBLR_CONSUMER_KEY']
CONSUMER_SECRET     = os.environ['TUMBLR_CONSUMER_SECRET']
OAUTH_TOKEN         = os.environ['TUMBLR_OAUTH_TOKEN']
OAUTH_TOKEN_SECRET  = os.environ['TUMBLR_OAUTH_TOKEN_SECRET']

t = Tumblpy(CONSUMER_KEY, CONSUMER_SECRET,
            OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

# Fetched once, when the module is imported.
posts = t.get('posts', blog_url="www.cloktahwho.tumblr.com")

#print(posts)


@app.before_request
def before_request():
  """Hook registered with ``app.before_request``.

  At present it only prints a debug message. No database connection or
  other per-request state is set up here.
  """
  print("Hello this is trying to work.")