def run(self):
    """
    Organize the parsing of a single tumblr post.

    Builds the tumblr api query for the post, fetches it, and hands the
    response to ``self.extractBasicInfo``. Unless the post's reblog key is
    already stored in ``ReblogRelations``, a ``NotesScraper`` is then created
    for the post to collect its notes.

    Side effects:
        * ``self.urls_found`` and ``self.reblog_map`` are set from the
          ``NotesScraper`` (or to empty containers when notes are skipped).
        * ``self.post.processed`` is set to True.

    Returns None. If the api query fails, the method prints a message and
    returns early, before extracting anything.
    """
    # We can parse tumblr blogs by adding "/{id}" to a regular tumblr query
    # or by changing the "id" value in the api interface, i.e.
    #   http://{blog_name}.tumblr.com/{id}   or
    #   http://{blog_name}.tumblr.com/api/read?id={id}
    # Here, we use the api interface in the form of
    #   "http://{blog_name}.tumblr.com/api/read/?id={id}"

    # Build query string for the tumblr api.
    # The slice keeps the url up to 5 characters before its last "/"
    # (presumably this drops a trailing "/post/{id}" - TODO confirm).
    post_url = self.post.url
    blog_root = post_url[: post_url.rfind("/") - 5]
    post_query = "%s%s%s" % (blog_root, r"/api/read/?id=", self.post.id)

    # Extract basic post features
    try:
        print("Querying %s" % post_query)
        source = HttpLib.openAndRead(post_query)
    except Exception:
        print("Query failed")
        return

    # Extracts easy features, then sends off to notescraper for more work
    self.extractBasicInfo(source)

    # Check whether this reblog key has already been processed.
    # If it has, no need to process its notes/relations again - they're the
    # same for all reblog relations!
    reblog_key_in_db = (
        self.s.query(ReblogRelations)
        .filter_by(reblog_key=self.post.reblog_key)
        .first()
    )
    if reblog_key_in_db is None:
        # passed in post object automatically updated too
        ns = NotesScraper(self.post)
        self.urls_found = ns.all_outgoing_urls
        self.reblog_map = ns.reblog_map
    else:
        print("Reblog key already in DB! No need to process notes")
        # No scraper was created on this path, so there is nothing new to
        # report. (Previously this branch read from an unbound `ns`.)
        self.urls_found = []
        self.reblog_map = {}

    self.post.processed = True

    # Now, we have a fully-filled out post object
    try:
        print("postscraper finished processing %s" % self.post.url)
        print("reblog map: %s" % (self.reblog_map,))
        if self.reblog_map:
            print("woo!")
        print(self.post)
    except Exception:
        # Printing is best-effort only (e.g. encoding problems in the post).
        print("couldn't print %s\n" % self.post.url)
def parseNotes(self, url, num_likes=0, num_comments=0):
    """
    Parse all of the notes (reblogs, likes, and comments) for a post.

    Starting at ``url``, each page of notes is fetched and processed, then
    ``self.getNextPageOfNotes`` is asked for the following page until it
    answers -1 or a page without notes is reached.

    Reblog relations are NOT returned: they are accumulated on
    ``self.reblog_map`` as ``{poster_url: [reblogger_url, ...]}``.
    (Do not turn the map into a default argument - a mutable default would
    be shared between calls.)

    Parameters:
        url          -- url of the first page of notes to process
        num_likes    -- running total of likes to start counting from
        num_comments -- running total of comments to start counting from

    Returns tuple: (num_likes, num_comments)
    """
    # Pages are walked iteratively so that posts with very many pages of
    # notes cannot exhaust the interpreter's recursion limit.
    while True:
        print("reblog dict at start of parsenotes: %s" % (self.reblog_map,))
        content = HttpLib.openAndRead(url, lower=False)
        soup = BeautifulSoup(content)

        # Otherwise, parse the links and likes, etc. off the page
        try:
            notes = soup.ol.findAll('li')
        except AttributeError:
            # No ordered list on the page means there are no notes here.
            # Keep whatever was counted on earlier pages.
            print("no notes on page %s" % url)
            return (num_likes, num_comments)

        print("processing %s notes from %s" % (len(notes), url))
        for note in notes:
            try:
                # the second word of the 'class' attribute is the note type
                note_type = note["class"].split()[1]
            except (KeyError, AttributeError, IndexError):
                continue
            print("\nNote type: %s" % note_type)

            # The convention will be "{person 1} reblogged {person 2}"
            # We find those two links and extract their URLs
            # (Where the 'x reblogged y' info comes from)
            if note_type == "reblog":
                # Bound up front so the error message below can always use it
                details = None
                try:
                    details = note.span.findAll('a')
                    print("details: %s\n" % (details,))
                    poster = details[1]['href']
                    print("poster: %s" % poster)
                    reblogger = details[0]['href']
                    print("reblogger: %s" % reblogger)

                    # Add to results (reblog_map)
                    print("adding to map")
                    rebloggers = self.reblog_map.setdefault(poster, [])
                    if reblogger not in rebloggers:
                        rebloggers.append(reblogger)
                except Exception:
                    # Probably a superfluous 'posted by' note, which only
                    # carries a single link
                    print("Couldn't extract reblog relation(%s). Skipping for now"
                          % (details,))
            elif note_type == "like":
                num_likes += 1
            elif note_type == "more_notes_link_container":
                # NOTE(review): this class name is counted as a comment;
                # confirm that is the intended meaning.
                num_comments += 1

        # Figure out what to do next
        next_query = self.getNextPageOfNotes(content, url)
        print("\nreturning %s, %s, %s notes processed for %s"
              % (num_likes, num_comments, self.reblog_map, url))

        if next_query == -1:
            print("nothing else to query")
            return (num_likes, num_comments)

        print("more pages to query - getting more")
        url = next_query
def run(self):
    """
    Generate the populated blog object.

    Organizes the parsing of a tumblr blog through the tumblr api, which
    hands out posts 50 at a time. The first query is also used to extract
    the blog's basic info (including its number of posts); the blog is then
    walked page by page until that many posts have been requested.

    Side effects:
        * post urls found on each page are appended to ``self.urls_found``
        * a ``Post`` per new url is appended to ``self.blog.posts``
        * ``self.blog.processed`` is set to True on completion

    Raises:
        RuntimeError -- when ``self.redundancyCheck`` rejects a page
        Any exception raised by ``HttpLib.openAndRead`` is re-raised.
    """
    # We can parse tumblr blogs by adding "/page/n" to a regular tumblr query
    # or by changing the "start=n" value in the api interface.
    # Here, we use the api interface in the form of
    #   "http://(blogname).tumblr.com/api/read/?num=50&start=n"
    # We put in the blog's name, and go through the posts by 50 until we run out
    page_size = 50
    offset = 0  # the post # we start querying the next 50 from

    # Everything but the offset stays fixed, so the query is rebuilt from
    # this base instead of trimming the old offset off the previous query.
    base_query = "http://%s.tumblr.com/api/read/?num=%d&start=" % (
        self.blog.name, page_size)
    blog_query = base_query + str(offset)

    # Extract basic blog features
    content = HttpLib.openAndRead(blog_query)
    soup = BeautifulSoup(content)
    basic_info = self.extractBasicTumblrInfo(content, soup, self.blog.name)
    num_posts = basic_info['num_posts']
    print("processing until %s posts\n" % num_posts)

    # Walk through each set of 50 posts of the blog until we reach the end.
    # Offsets are 0-based, so offset == num_posts is already past the last post.
    while offset < num_posts:
        try:
            print("querying %s" % blog_query)
            source = HttpLib.openAndRead(blog_query)
        except Exception:
            print("sanity check failed")
            raise

        # The actual feature extraction
        new_posts = self.getBlogPosts(source, self.blog.name)
        self.urls_found.extend(new_posts)  # for scraper
        print("%s new posts urls for %s:" % (len(new_posts), self.blog.name))
        print("%s old posts in %s:" % (len(self.blog.posts), self.blog.name))

        if not self.redundancyCheck(self.blog.posts, new_posts):
            print("redundancy check failed - same content returned as prev. run")
            raise RuntimeError(
                "redundancy check failed for %s" % blog_query)

        for post_url in new_posts:
            self.blog.posts.append(Post(post_url))

        # Set-up for next run
        offset += page_size
        blog_query = base_query + str(offset)

    # self.blog modified
    self.blog.processed = True
    print("finished processing blog %s" % self.blog.url)
def next(self):
    """
    Parse the next page of notes (reblogs, likes, ...) for the post.

    Fetches ``self.next_url_to_scrape``, extracts the reblog relations and
    counts the likes found on that page, then updates
    ``self.next_url_to_scrape`` to the following page of notes.

    Returns:
        On a page with notes, the tuple
            (list_of_relations, num_reblogs, num_likes, next_url, source)
        where ``list_of_relations`` holds (poster, reblogger) ``Blogger``
        pairs and ``source`` is the ``Blogger`` of a single-link reblog
        note (taken to be the original poster), or None if no such note
        was on this page.

        On a page without notes, an empty list, after setting
        ``self.next_url_to_scrape`` to -1.
        NOTE(review): the two return shapes differ; kept as-is because
        callers are not visible from here.
    """
    # The things we will return
    list_of_relations = []
    num_reblogs = 0
    num_likes = 0
    source = None

    # Open the URL and convert to BeautifulSoup
    content = HttpLib.openAndRead(self.next_url_to_scrape, lower=False)
    soup = BeautifulSoup(content)

    # All of the "notes" are stored as list items inside an ordered list.
    # We can use BeautifulSoup to extract them
    try:
        notes = soup.ol.findAll('li')
    except AttributeError:
        # No ordered list on the page: nothing to process
        print("no notes on page %s" % self.next_url_to_scrape)
        self.next_url_to_scrape = -1
        return []

    # If we haven't returned by now, we have notes to process!
    print("processing %s notes from %s" % (len(notes), self.next_url_to_scrape))

    # Each "note" is a line that says "reblogged from" or "liked" or something
    # Here, we will process each note and collect the reblog relations
    for note in notes:
        # Identify the "type" of note (reblog, like, etc.)
        # It is the second word of the note's "class" attribute
        try:
            note_type = note["class"].split()[1]
            print("\nnote_type: %s" % note_type)
        except (KeyError, AttributeError, IndexError):
            continue

        # If the note type is a "reblog", try to extract the graph
        # information about who is reblogging who.
        # The convention will be "{person 1} reblogged {person 2}"
        # We find those two links and extract their URLs
        # (Where the 'x reblogged y' info comes from)
        if note_type == "reblog":
            # Bound up front so the error message below can always use it,
            # even when the lookup of the links itself is what failed.
            details = None
            try:
                # Extract relevant information from the "note"
                details = note.span.findAll('a')
                print("details: %s" % (details,))

                # If it's a "reblog" but only one link long - that means
                # we've found the original poster!
                if len(details) == 1:
                    source = Blogger(name=details[0]['title'].encode("utf-8"),
                                     url=details[0]['href'])
                    continue

                poster = Blogger(name=details[1]['title'].encode("utf-8"),
                                 url=details[1]['href'])
                reblogger = Blogger(name=details[0]['title'].encode("utf-8"),
                                    url=details[0]['href'])
                print("poster: %s" % poster)
                print("reblogger: %s" % reblogger)

                # Add to results
                list_of_relations.append((poster, reblogger))
                num_reblogs += 1

            # If couldn't find information - whoops!
            except Exception as e:
                print("Couldn't extract reblog relation(%s). Skipping for now"
                      % (details,))
                print("error:  %s" % (e,))

        # A "note" may not be a reblog relation - it could be a "like"
        # If so, count it
        elif note_type == "like":
            num_likes += 1

    # Find the next page of notes to process
    # [have to do this manually - no tumblr api for this]
    self.next_url_to_scrape = self._getUrlForNextPageOfNotes(
        content, self.next_url_to_scrape)

    return (list_of_relations, num_reblogs, num_likes,
            self.next_url_to_scrape, source)