def _widest_fitting(candidates, max_width=700):
    """Return the widest candidate that is at most ``max_width`` pixels wide.

    ``candidates`` is an iterable of dicts that each carry a ``'width'`` key
    (the entries of a video post's ``player`` list or of a photo's
    ``alt_sizes`` list).

    When no candidate fits, the narrowest one is returned instead of failing:
    an oversized embed is preferable to losing the whole post. Raises
    ``ValueError`` when ``candidates`` is empty.
    """
    candidates = list(candidates)
    fitting = [cand for cand in candidates if cand['width'] <= max_width]
    if fitting:
        return max(fitting, key=lambda cand: cand['width'])
    # min() of an empty list raises ValueError, which is what the unguarded
    # max() did before for an empty candidate list.
    return min(candidates, key=lambda cand: cand['width'])


def object_from_postdata(postdata):
    """Look up or create the ``Object`` for one Tumblr JSON post dictionary.

    ``postdata`` is a single decoded post. The keys ``id``, ``post_url``,
    ``date``, ``blog_name`` and ``type`` are always read; every branch below
    reads the additional fields of its own post type.

    Returns a two-item tuple ``(really_a_share, obj)``:

    * ``(False, obj)`` -- ``obj`` is the post's ``Object``: either the one
      already stored for this Tumblr id, or a new one that has been saved.
    * ``(True, obj)`` -- the post is a link post without a description, so
      it is reported as a share of the linked ``obj``; no ``Object`` is
      saved for the post itself.
    * ``(None, None)`` -- the post type is not handled; the post is skipped.

    Raises ``KeyError`` when a required key is missing and ``ValueError``
    when the date does not parse or a video/photo offers no sizes at all.
    """
    tumblr_id = postdata['id']
    try:
        return False, Object.objects.get(service='tumblr.com', foreign_id=tumblr_id)
    except Object.DoesNotExist:
        pass

    obj = Object(
        service='tumblr.com',
        foreign_id=tumblr_id,
        permalink_url=postdata['post_url'],
        title='',
        body='',
        render_mode='mixed',
        time=datetime.strptime(postdata['date'], '%Y-%m-%d %H:%M:%S GMT'),
        author=account_for_tumblr_shortname(postdata['blog_name']),
    )

    # NOTE: values are read as ``postdata.get(key) or fallback`` rather than
    # ``postdata.get(key, fallback)`` throughout, because a key that is
    # present with a JSON null would otherwise leak ``None`` into the
    # title/body.
    post_type = postdata['type']
    if post_type in ('regular', 'text'):
        # Tumblr's v2 API reports text posts as 'text'; 'regular' is the
        # older name and is still accepted.
        obj.title = postdata.get('title') or ''
        obj.body = postdata.get('body') or ''
    elif post_type == 'video':
        player = _widest_fitting(postdata['player'])
        # Join only the pieces that are actually there, so a missing caption
        # or an empty/non-string embed code cannot break the join.
        pieces = [piece for piece in (player['embed_code'], postdata.get('caption'))
                  if piece]
        obj.body = u'\n\n'.join(pieces)
    elif post_type == 'audio':
        title = postdata.get('track_name') or ''
        artist = postdata.get('artist') or ''
        if artist and title:
            title = u'%s \u2013 %s' % (artist, title)
        elif artist:
            title = artist
        obj.title = title

        body = postdata.get('player') or ''
        album_art = postdata.get('album_art')
        if album_art:
            body = u'\n\n'.join((u'<p><img src="%s"></p>' % album_art, body))
        caption = postdata.get('caption')
        if caption:
            body = u'\n\n'.join((body, caption))
        obj.body = body
    elif post_type == 'photo' and len(postdata['photos']) > 1:
        # photoset: render every photo inline, each followed by its own
        # caption, then the caption of the post as a whole.
        photobodies = []
        for photo in postdata['photos']:
            photosize = _widest_fitting(photo['alt_sizes'])
            photobodies.append(
                u'<p><img src="%(url)s" width="%(width)s" height="%(height)s"></p>' % photosize)
            caption = photo.get('caption')
            if caption:
                photobodies.append(u'<p>%s</p>' % caption)
        caption = postdata.get('caption')
        if caption:
            photobodies.append(caption)
        obj.body = u'\n\n'.join(photobodies)
    elif post_type == 'photo':
        # single photo: the photo becomes the object's image and the caption
        # its body.
        photo = postdata['photos'][0]
        photosize = _widest_fitting(photo['alt_sizes'])
        image = Media(
            image_url=photosize['url'],
            width=photosize['width'],
            height=photosize['height'],
        )
        image.save()
        obj.image = image
        obj.render_mode = 'image'
        obj.body = postdata.get('caption') or ''
    elif post_type == 'link':
        # TODO: display the link if we can't make an in_reply_to object.
        # handle the Page manually to always provide an in_reply_to?
        # should this just be a render_mode=link object itself instead
        # of a reply?
        link_url = postdata['url']
        in_reply_to = None
        try:
            in_reply_to_page = leapfrog.poll.embedlam.Page(link_url)
        except ValueError:
            # No usable page for the link: keep the post, without a target.
            pass
        else:
            try:
                in_reply_to = in_reply_to_page.to_object()
            except ValueError:
                in_reply_to = None
            if in_reply_to is None:
                # The page gave us no object of its own, so fall back to a
                # bare link object built from its URL and title.
                in_reply_to = Object(
                    service='',
                    foreign_id=in_reply_to_page.url,
                    render_mode='link',
                    title=in_reply_to_page.title,
                    permalink_url=in_reply_to_page.url,
                    time=datetime.utcnow(),
                )
                in_reply_to.save()
            obj.in_reply_to = in_reply_to

        obj.title = postdata.get('title') or link_url
        desc = postdata.get('description')
        if desc:
            obj.body = desc
        # If we added no description, make this a share instead.
        elif in_reply_to:
            return True, in_reply_to
    elif post_type == 'quote':
        # 'quote' is the key this code has always read; Tumblr's v2 API
        # carries the quote body under 'text', so fall back to that.
        quote_text = postdata.get('quote') or postdata.get('text') or ''
        body = u'<blockquote><p>%s</p></blockquote>' % (quote_text,)
        quote_source = postdata.get('source')
        if quote_source:
            body = u'\n\n'.join((body, u'<p>\u2014%s</p>' % quote_source))
        obj.body = body
    # TODO: handle chat posts (i guess)
    else:
        log.debug("Unhandled Tumblr post type %r for post #%s; skipping", post_type, tumblr_id)
        return None, None

    # TODO: make reblogs into replies

    obj.save()
    return False, obj
def object_from_post_element(post_el, tumblelog_el): tumblr_id = post_el.attrib['id'] try: return False, Object.objects.get(service='tumblr.com', foreign_id=tumblr_id) except Object.DoesNotExist: pass obj = Object( service='tumblr.com', foreign_id=tumblr_id, permalink_url=post_el.attrib['url-with-slug'], title='', body='', render_mode='mixed', time=datetime.strptime(post_el.attrib['date-gmt'], '%Y-%m-%d %H:%M:%S GMT'), author=account_for_tumblelog_element(tumblelog_el), ) post_type = post_el.attrib['type'] if post_type == 'regular': title_el = post_el.find('./regular-title') if title_el is not None: obj.title = title_el.text body_el = post_el.find('./regular-body') if body_el is not None: obj.body = body_el.text elif post_type == 'video': body = post_el.find('./video-player').text video_caption_el = post_el.find('./video-caption') if video_caption_el is not None: video_caption = video_caption_el.text body = '\n\n'.join((body, video_caption)) obj.body = body elif post_type == 'audio': title_el = post_el.find('./id3-title') if title_el is not None: obj.title = title_el.text artist_el = post_el.find('./id3-artist') if artist_el is not None: obj.title = u'%s \u2013 %s' % (artist_el.text, obj.title) body = post_el.find('./audio-player').text audio_art_el = post_el.find('./id3-album-art') if audio_art_el is not None: audio_art_url = audio_art_el.text body = u'\n\n'.join((u'<p><img src="%s"></p>' % audio_art_url, body)) audio_caption_el = post_el.find('./audio-caption') if audio_caption_el is not None: audio_caption = audio_caption_el.text body = u'\n\n'.join((body, audio_caption)) obj.body = body elif post_type == 'photo': # TODO: if there's a photo-link-url, is this really a "photo reply"? 
photo_el = sorted(post_el.findall('./photo-url'), key=lambda x: int(x.attrib['max-width']), reverse=True)[0] photo_el_width = int(photo_el.attrib['max-width']) try: width, height = post_el.attrib['width'], post_el.attrib['height'] except KeyError: width, height = None, None else: width, height = int(width), int(height) if width > photo_el_width: height = photo_el_width * height / width width = photo_el_width image = Media( image_url=photo_el.text, width=width, height=height, ) image.save() obj.image = image obj.render_mode = 'image' caption_el = post_el.find('./photo-caption') if caption_el is not None: obj.body = caption_el.text elif post_type == 'link': # TODO: display the link if we can't make an in_reply_to object. # handle the Page manually to always provide an in_reply_to? # should this just be a render_mode=link object itself instead # of a reply? link_url = post_el.find('./link-url').text try: in_reply_to_page = leapfrog.poll.embedlam.Page(link_url) except ValueError: pass else: try: in_reply_to = in_reply_to_page.to_object() except ValueError: in_reply_to = None if in_reply_to is None: in_reply_to = Object( service='', foreign_id=in_reply_to_page.url, render_mode='link', title=in_reply_to_page.title, permalink_url=in_reply_to_page.url, time=datetime.utcnow(), ) in_reply_to.save() obj.in_reply_to = in_reply_to title_el = post_el.find('./link-text') obj.title = link_url if title_el is None else title_el.text desc_el = post_el.find('./link-description') if desc_el is not None: obj.body = desc_el.text # If we added no description, make this a share. 
if obj.in_reply_to and not obj.body: return True, obj.in_reply_to elif post_type == 'quote': quote_text = post_el.find('./quote-text').text body = u"""<blockquote><p>%s</p></blockquote>""" % (quote_text,) quote_source_el = post_el.find('./quote-source') if quote_source_el is not None: quote_source = quote_source_el.text body = u'\n\n'.join((body, u"<p>\u2014%s</p>" % quote_source)) obj.body = body # TODO: handle chat posts (i guess) else: log.debug("Unhandled Tumblr post type %r for post #%s; skipping", post_type, tumblr_id) return None, None try: orig_url = post_el.attrib['reblogged-root-url'] except KeyError: log.debug("Post #%s is not a reblog, leave it alone", tumblr_id) else: log.debug("Post #%s is a reblog of %s; let's try walking up", tumblr_id, orig_url) really_a_share, orig_obj = False, None try: really_a_share, orig_obj = object_from_url(orig_url) except ValueError, exc: # meh log.debug("Couldn't walk up to reblog reference %s: %s", orig_url, str(exc)) if not really_a_share and orig_obj is not None: # Patch up the upstream author's userpic if necessary, since we # don't get those from /api/read, evidently. if orig_obj.author.person.avatar is None and 'reblogged-root-avatar-url-64' in post_el.attrib: avatar = Media( image_url=post_el.attrib['reblogged-root-avatar-url-64'], width=64, height=64, ) avatar.save() orig_obj.author.person.avatar = avatar orig_obj.author.person.save() log.debug("Fixed up post #%s's author's avatar to %s", orig_obj.foreign_id, avatar.image_url) remove_reblog_boilerplate_from_obj(obj, orig_obj) if not obj.body: return True, orig_obj obj.in_reply_to = orig_obj
foreign_id=target_url, render_mode='link', title=resource.get('title', ''), body=resource.get('html', ''), author=account_for_embed_resource(resource), permalink_url=resource.get('url') or target_url, # might be given anyway time=datetime.utcnow(), ) if 'thumbnail_url' in resource: image = Media( image_url=resource['thumbnail_url'], width=resource.get('thumbnail_width'), height=resource.get('thumbnail_height'), ) image.save() obj.image = image obj.save() return obj raise ValueError('Unknown OEmbed resource type %r' % resource_type) def title_from_html_head(head): og_title_elem = head.find("meta", property="og:title") old_facebook_title_elem = head.find("meta", {"name":"title"}) title_elem = head.find("title") title = value_for_meta_elems((og_title_elem, old_facebook_title_elem, title_elem), "") return title def object_from_html_head(url, orig_url, head):