def _cache(blog, *args, **kwargs):
    class_name = blog.__class__.__name__
    method_name = method.func_name
    strargs = '%s----%s' % (
        '--'.join(slugify(str(arg)) for arg in args),
        '--'.join(
            '%s=%s' % (slugify(key), slugify(str(value)))
            for key, value in kwargs.items()),
    )

    cache_path = os.path.join(
        'cache', class_name, method_name, strargs)

    if blog.use_cache:
        if os.path.exists(cache_path):
            print ' Loading %s.%s(%s, %s) from cache %s' % (
                class_name, method_name, args, kwargs, cache_path)

            return pickle.load(open(cache_path))

    result = method(blog, *args, **kwargs)
    dirname = os.path.dirname(cache_path)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(cache_path, 'w+') as f:
        pickle.dump(result, f)

    return result
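# A minimal sketch, not taken from the source: `_cache` above closes over a
# free variable `method`, so it is presumably the inner function of a caching
# decorator shaped roughly like this. The decorator name `cache` is an
# assumption; only the closure structure is implied by the code above.
def cache(method):
    def _cache(blog, *args, **kwargs):
        # Body as defined above: build cache_path from the class, method and
        # slugified arguments, return the pickled result if blog.use_cache and
        # the cache file exists, otherwise call the real method, pickle the
        # result to cache_path, and return it.
        return method(blog, *args, **kwargs)

    return _cache


# Hypothetical usage on a Blog XML-RPC wrapper method (illustrative only):
#
#     class Blog(object):
#         @cache
#         def get_media_library(self):
#             ...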
def from_metaweblog(cls, struct, post_type='post', is_edit=False):
    """Receive metaWeblog RPC struct and initialize a Post.

    Used both by migrate_from_wordpress and when receiving a new or
    edited post from MarsEdit.
    """
    title = struct.get('title', '')
    meta_description = struct.get('mt_excerpt', '')
    if len(meta_description) > 155:
        raise ValueError(
            "Description is %d chars, max 155" % len(meta_description))

    if 'mt_keywords' in struct:
        tags = [
            tag.strip() for tag in struct['mt_keywords'].split(',')
            if tag.strip()
        ]
    else:
        tags = None

    slug = (slugify.slugify(struct['wp_slug'])
            if struct.get('wp_slug')
            else slugify.slugify(title))

    description = struct.get('description', '')
    status = (struct.get('post_status')
              or struct.get('page_status')
              or 'publish')

    if 'date_modified_gmt' in struct:
        tup = struct['date_modified_gmt'].timetuple()
        mod = utc_tz.localize(datetime.datetime(*tup[0:6]))
    else:
        mod = datetime.datetime.utcnow()

    body = markup.markup(description)

    rv = cls(
        title=title,
        # Format for display
        body=body,
        plain=plain.plain(body),
        summary=summarize.summarize(body, 200),
        original=description,
        meta_description=meta_description,
        tags=tags,
        slug=slug,
        type=post_type,
        status=status,
        wordpress_id=struct.get('postid'),
        mod=mod)

    if not is_edit and 'date_created_gmt' in struct:
        # TODO: can fail if two posts created in same second, add random
        #   suffix to ObjectId
        date_created = datetime.datetime.strptime(
            struct['date_created_gmt'].value, "%Y%m%dT%H:%M:%S")

        rv.id = ObjectId.from_datetime(date_created)

    return rv
def from_metaweblog(
        cls, struct, post_type='post', is_edit=False):
    """Receive metaWeblog RPC struct and initialize a Post.

    Used both by migrate_from_wordpress and when receiving a new or
    edited post from MarsEdit.
    """
    title = struct.get('title', '')

    # We expect MarsEdit to set categories with mt_setPostCategories()
    assert 'categories' not in struct

    if 'mt_keywords' in struct:
        tags = [
            tag.strip() for tag in struct['mt_keywords'].split(',')
            if tag.strip()
        ]
    else:
        tags = None

    slug = (
        slugify.slugify(struct['wp_slug'])
        if struct.get('wp_slug')
        else slugify.slugify(title))

    description = struct.get('description', '')
    status = struct.get('post_status', 'publish')

    if 'date_modified_gmt' in struct:
        tup = struct['date_modified_gmt'].timetuple()
        mod = utc_tz.localize(datetime.datetime(*tup[0:6]))
    else:
        mod = datetime.datetime.utcnow()

    body = markup.markup(description)

    rv = cls(
        title=title,
        # Format for display
        body=body,
        summary=summarize.summarize(body, 200),
        original=description,
        tags=tags,
        slug=slug,
        type=post_type,
        status=status,
        wordpress_id=struct.get('postid'),
        mod=mod)

    if not is_edit and 'date_created_gmt' in struct:
        # TODO: can fail if two posts created in same second, add random
        #   suffix to ObjectId
        date_created = datetime.datetime.strptime(
            struct['date_created_gmt'].value, "%Y%m%dT%H:%M:%S")

        rv.id = ObjectId.from_datetime(date_created)

    return rv
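# A hypothetical input, for illustration only. The field names follow the
# metaWeblog conventions used above; the date fields arrive as
# xmlrpclib.DateTime instances, which is why .value and .timetuple() are
# used on them. 'Post' matches the call sites elsewhere in this code, but
# the struct contents here are made up.
#
#     import xmlrpclib
#
#     struct = {
#         'title': 'My Post',
#         'description': 'Hello **world**',
#         'mt_keywords': 'python, mongodb',
#         'wp_slug': 'my-post',
#         'post_status': 'publish',
#         'date_created_gmt': xmlrpclib.DateTime('20140101T12:00:00'),
#     }
#     post = Post.from_metaweblog(struct)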
def test_new_post(self):
    start = datetime.datetime.utcnow()
    post_id = self.new_post(
        title='the title',
        description=meta_description,
        body='the body')

    end = datetime.datetime.utcnow()
    post = self.fetch_rpc(
        'metaWeblog.getPost',
        (post_id, tornado_options.user, tornado_options.password))

    title_slug = slugify.slugify('the title')
    expected_url = self.reverse_url_absolute('post', title_slug)
    self.assertEqual(post_id, post['id'])
    self.assertEqual(expected_url, post['link'])
    self.assertEqual(expected_url, post['permaLink'])
    self.assertEqual('a tag,another tag', post['mt_keywords'])
    self.assertEqual('publish', post['status'])
    self.assertEqual('the title', post['title'])
    self.assertEqual(meta_description, post['mt_excerpt'])
    self.assertEqual('the body', post['description'])  # Confusing I know.
    self.assertTrue(
        start <= post['date_created_gmt'] <= end,
        "Post's date_created_gmt %s isn't between %s and %s" % (
            post['date_created_gmt'], start, end))
def replace_media_links(body, media_library, db, destination_url,
                        source_base_url):
    for link in media_library:
        if link in body:
            # This is making some big assumptions about the structure
            # of the media URL, that it's like
            # http://emptysquare.net/blog/wp-content/uploads/2011/10/img.png
            url = link.split('/uploads/')[-1]
            media_doc = db.media.find_one({'_id': link})
            if not media_doc:
                # TODO: remove
                cache_path = os.path.join('cache', slugify(link))
                if os.path.exists(cache_path):
                    content, content_type = pickle.load(open(cache_path))
                else:
                    r = requests.get(link)
                    content = r.content
                    content_type = r.headers['content-type']
                    if not os.path.exists('cache'):
                        os.mkdir('cache')
                    with open(cache_path, 'w+') as f:
                        pickle.dump((content, content_type), f)

                db.media.insert({
                    'content': bson.Binary(content),
                    'length': len(content),
                    'type': content_type,
                    '_id': url,
                    'mod': datetime.datetime.utcnow(),
                })

            body = body.replace(
                link, os.path.join(destination_url, 'media', url))

    return body
def replace_media_links(body, media_library, db, destination_url,
                        source_base_url):
    for link in media_library:
        if link in body:
            # This is making some big assumptions about the structure
            # of the media URL, that it's like
            # http://emptysquare.net/blog/wp-content/uploads/2011/10/img.png
            url = link.split('/uploads/')[-1]
            media_doc = db.media.find_one({'_id': link})
            if not media_doc:
                # TODO: remove
                cache_path = os.path.join('cache', slugify(link))
                if os.path.exists(cache_path):
                    content, content_type = pickle.load(open(cache_path))
                else:
                    r = requests.get(link)
                    content = r.content
                    content_type = r.headers['content-type']
                    if not os.path.exists('cache'):
                        os.mkdir('cache')
                    with open(cache_path, 'w+') as f:
                        pickle.dump((content, content_type), f)

                db.media.insert({
                    'content': bson.Binary(content),
                    'type': content_type,
                    '_id': url,
                    'mod': datetime.datetime.utcnow(),
                })

            body = body.replace(
                link, os.path.join(destination_url, 'media', url))

    return body
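# Worked example of the rewrite above, using the URL shape from the comment in
# the function and an assumed destination_url of '/blog' (values illustrative):
#
#     link = 'http://emptysquare.net/blog/wp-content/uploads/2011/10/img.png'
#     url  = '2011/10/img.png'                 # link.split('/uploads/')[-1]
#     body = body.replace(link, '/blog/media/2011/10/img.png')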
def test_if_modified_since_microseconds(self):
    # If-Modified-Since is rounded down to the second.
    post_id = self.new_post(title='title')
    doc = self.sync_db.posts.find_one({'_id': ObjectId(post_id)})
    dt = doc['mod']
    slug = slugify.slugify('title')
    url = self.reverse_url('post', slug)
    response = self.fetch(
        url, if_modified_since=dt.replace(microsecond=0))

    self.assertEqual(304, response.code)
def test_post_page(self):
    self.new_post()
    title_slug = slugify.slugify('the title')
    post_page = self.fetch(self.reverse_url('post', title_slug))
    self.assertEqual(200, post_page.code)
    soup = BeautifulSoup(post_page.body)
    description_tag = soup.find('meta', attrs={'name': 'description'})
    self.assertTrue(description_tag)
    self.assertEqual(self.meta_description, description_tag['content'])
def test_category_feed(self):
    slug = slugify.slugify('category 0')
    response = self.fetch(self.reverse_url('category-feed', slug))
    self.assertEqual(200, response.code)
    feed = fromstring(response.body)
    entries = list(feed.findall(ns + 'entry'))
    self.assertEqual(1, len(entries))

    # Post with 'the title' is in this category, not 'other title'.
    self.assertEqual('the title', entries[0].find(ns + 'title').text)
def test_feed(self):
    response = self.fetch(self.reverse_url('feed'))
    self.assertEqual(200, response.code)
    feed = fromstring(response.body)
    entries = list(feed.findall(ns + 'entry'))
    self.assertEqual(2, len(entries))

    # Most recent first.
    self.assertEqual('other title', entries[0].find(ns + 'title').text)
    self.assertEqual(
        self.reverse_url_absolute('post', slugify.slugify('other title')),
        entries[0].find(ns + 'id').text)

    # Second post.
    self.assertEqual('the title', entries[1].find(ns + 'title').text)
    self.assertEqual(
        self.reverse_url_absolute('post', slugify.slugify('the title')),
        entries[1].find(ns + 'id').text)
def test_single_post_mod_date(self):
    one_id = self.new_post(
        title='title 1', created=datetime.datetime(2014, 1, 1))

    self.new_post(
        title='title 2', created=datetime.datetime(2014, 1, 2))

    title_2_slug = slugify.slugify('title 2')
    url = self.reverse_url('post', title_2_slug)
    self.assert_modified(url, datetime.datetime(2014, 1, 2))

    self.new_post(
        title='title 3', created=datetime.datetime(2014, 1, 3))

    self.assert_modified(url, datetime.datetime(2014, 1, 3))

    self.edit_post(
        one_id, 'title 1', updated=datetime.datetime(2014, 1, 4))

    self.assert_modified(url, datetime.datetime(2014, 1, 4))
def main(args):
    start = time.time()
    opts = options.options()
    destination_url = '/' + opts.base_url.lstrip('/')
    parts = urlparse(args.source_url)
    source_base_url = urljoin(
        '%s://%s' % (parts[0], parts[1]), parts[2].split('/xmlrpc.php')[0])

    print 'Base URL', source_base_url

    db = pymongo.Connection(safe=True).motorblog
    motordb = motor.MotorClient().open_sync().motorblog
    if args.wipe:
        print 'Wiping motorblog database'
        db.connection.drop_database('motorblog')
        print 'Creating capped collection "events"'
        create_events_collection(motordb)
        print 'Recreating indexes'
        ensure_indexes(db)

    source = Blog(
        args.source_url, args.source_username, args.source_password,
        use_cache=not args.refresh, verbose=args.verbose)

    print 'Getting media library'
    media_library = set([m['link'] for m in source.get_media_library()])
    print ' %s assets\n' % len(media_library)

    print 'Getting posts and pages'
    post_structs = source.get_recent_posts(args.nposts)
    print ' %s posts' % len(post_structs)
    page_structs = source.get_pages()
    print ' %s pages' % len(page_structs)
    print

    for structs, post_type in [
        (post_structs, 'post'),
        (page_structs, 'page'),
    ]:
        print '%sS' % post_type.upper()
        for struct in structs:
            categories = struct.pop('categories', [])
            struct['description'] = wordpress_to_markdown(
                struct, media_library, db, destination_url, source_base_url)

            post = Post.from_metaweblog(struct, post_type)
            print '%-34s %s' % (post.title, post.status.upper())

            for category_name in categories:
                doc = db.categories.find_one({'name': category_name})
                if doc:
                    category = Category(**doc)
                else:
                    category = Category(
                        name=category_name, slug=slugify(category_name))
                    category.id = db.categories.insert(category.to_python())

                print ' %-30s %s' % (category_name, ' NEW' if not doc else '')
                post.categories.append(category)

            db.posts.insert(post.to_python())

        print '\nFinished %s %ss' % (len(structs), post_type)

    print 'Posting "categories_changed" event'
    db.events.insert(
        {'ts': datetime.datetime.utcnow(), 'name': 'categories_changed'},
        manipulate=False)  # No need to add _id

    print '\nFinished in %.2f seconds' % (time.time() - start)
def media_link(year, month, filename):
    base, extension = os.path.splitext(filename)
    return '%04d/%02d/%s' % (year, month, slugify.slugify(base)) + extension
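# For example, assuming the slugify module lowercases and hyphenates the
# basename while os.path.splitext leaves the extension untouched:
#
#     >>> media_link(2011, 10, 'My Photo.PNG')
#     '2011/10/my-photo.PNG'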
def _from_rpc(cls, struct, name):
    _id = ObjectId(struct['categoryId']) if 'categoryId' in struct else None
    return cls(name=name, slug=slugify.slugify(name), id=_id)
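# Hypothetical call, assuming this is exposed as a classmethod on Category and
# that the RPC struct carries the category's ObjectId as a hex string (the id
# value below is made up):
#
#     Category._from_rpc(
#         {'categoryId': '51b8e7e7e7a1f40f16000000'}, name='Python')
#
# yields a Category with slug 'python'; without 'categoryId' the id is None.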
def main(args):
    start = time.time()
    opts = options.options()
    destination_url = '/' + opts.base_url.lstrip('/')
    parts = urlparse(args.source_url)
    source_base_url = urljoin(
        '%s://%s' % (parts[0], parts[1]), parts[2].split('/xmlrpc.php')[0])

    print 'Base URL', source_base_url

    db = pymongo.Connection(safe=True).motorblog
    motordb = motor.MotorConnection().open_sync().motorblog
    if args.wipe:
        print 'Wiping motorblog database'
        db.connection.drop_database('motorblog')
        print 'Creating capped collection "events"'
        create_events_collection(motordb)
        print 'Recreating indexes'
        ensure_indexes(db)

    source = Blog(
        args.source_url, args.source_username, args.source_password,
        use_cache=not args.refresh, verbose=args.verbose)

    print 'Getting media library'
    media_library = set([
        m['link'] for m in source.get_media_library()])

    print ' %s assets\n' % len(media_library)

    print 'Getting posts and pages'
    post_structs = source.get_recent_posts(args.nposts)
    print ' %s posts' % len(post_structs)
    page_structs = source.get_pages()
    print ' %s pages' % len(page_structs)
    print

    for structs, type in [
        (post_structs, 'post'),
        (page_structs, 'page'),
    ]:
        print '%sS' % type.upper()
        for struct in structs:
            categories = struct.pop('categories', [])
            struct['description'] = wordpress_to_markdown(
                struct, media_library, db, destination_url, source_base_url)

            post = Post.from_metaweblog(struct, type)
            print '%-34s %s' % (post.title, post.status.upper())

            for category_name in categories:
                doc = db.categories.find_one({'name': category_name})
                if doc:
                    category = Category(**doc)
                else:
                    category = Category(
                        name=category_name, slug=slugify(category_name))
                    category.id = db.categories.insert(category.to_python())

                print ' %-30s %s' % (
                    category_name, ' NEW' if not doc else '')

                post.categories.append(category)

            db.posts.insert(post.to_python())

        print '\nFinished %s %ss' % (len(structs), type)

    print 'Posting "categories_changed" event'
    db.events.insert(
        {'ts': datetime.datetime.utcnow(), 'name': 'categories_changed'},
        manipulate=False)  # No need to add _id

    print '\nFinished in %.2f seconds' % (time.time() - start)