'wordpress': process_wordpress_post, 'typepad': process_typepad_post # livejournal': process_livejournal_post, # 'newsvine': process_newsvine_post } # API: # def process_[blog_type]_blog( file_path ) # #Find html files for each of the blog posts within a blog of type blog_type stored at file_path # #Return a list of filenames # # def process_[blog_type]_post( file_name ) # #Parse an html blog post into xml # #Return xml for the blog post # # decided to put the directory syntax (/) in the input rather than the actual code # if __name__=="__main__": #Testing scripts go here: #files = process_newsvine_blog( '/scratch/unmirrored5/agong/blog_panel_crawl/' + 'vanessa-wilson73.newsvine.com/') #print len( files ) #print files[:5] profiler3.testBlogParser( '/scratch/unmirrored5/agong/blog_panel_crawl/' + 'witchdoctorrepellent.blogspot.com/2005/12/no-tears-for-monster.html/'+ 'vanessa-wilson73.newsvine.com/', "blogger", sample=20 )
'blogger': process_blogger_post, 'wordpress': process_wordpress_post, 'typepad': process_typepad_post, 'livejournal': process_livejournal_post, 'newsvine': process_newsvine_post } # API: # def process_[blog_type]_blog( file_path ) # #Find html files for each of the blog posts within a blog of type blog_type stored at file_path # #Return a list of filenames # # def process_[blog_type]_post( file_name ) # #Parse an html blog post into xml # #Return xml for the blog post # # decided to put the directory syntax (/) in the input rather than the actual code # if __name__=="__main__": #Testing scripts go here: #files = process_newsvine_blog( '/scratch/unmirrored5/agong/blog_panel_crawl/' + 'vanessa-wilson73.newsvine.com/') #print len( files ) #print files[:5] profiler3.testBlogParser( 'carloz.newsvine.com', "newsvine", sample=20 )