def run(self): """ Organizes the parsing of a tumblr post. For Tumblr posts, you can use their api to get some basic info from html tags, but we probably will also need to look at the actual source code to get the rest of the info """ # We can parse tumblr blogs by adding "/<id>" to a regular tumblr query # or by changing the "id" value in the api interface. # i.e. http://<blog_name>.tumblr.com/<id> or http://<blog_name>.tumblr.com/api/read?id=<id> # Here, we use the api interface in the form of # "http://<blog_name>.tumblr.com/api/read/?id=<id>" # Build query string for the tumblr api post_query = "%s%s%s" % (self.post.url[: self.post.url.rfind("/") - 5], r"/api/read/?id=", self.post.id) # Extract basic post features try: print "Querying %s" % post_query source = HttpLib.openAndRead(post_query) except: print "Query failed" return # Extracts easy features, then sends off to notescraper for more work # Fills out post obj. & returns new links for spider self.extractBasicInfo(source) # Check whether this reblog key has already been processed. # If it has, no need to process its notes/relations again - they're the same for # All reblog relations! reblog_key_in_db = self.s.query(ReblogRelations).filter_by(reblog_key=self.post.reblog_key).first() if reblog_key_in_db == None: ns = NotesScraper(self.post) else: print "Reblog key already in DB! No need to process notes" self.urls_found = ns.all_outgoing_urls self.reblog_map = ns.reblog_map # passed in post object automatically updated too self.post.processed = True # Now, we have a fully-filled out post object try: print "postscraper finished processing %s" % self.post.url print "reblog map: " % self.reblog_map if len(self.reblog_map) > 0: print "woo!" print self.post except: print "couldn't print %s\n" % self.post.url
def assimilateNewUrls(self, urls): """adds urls to the UnvisitedURLs table if we don't have information on them in any of our tables""" # First, 'clean' the new urls - remove duplicates, and trailing slashes, etc. urls = HttpLib.cleanLinks(urls) print "\nAssimilating %s new urls\n" % len(urls) if len(urls) <= 0: return s = Session() # For each URL, for url in urls: # Query the Database & find out if URL already in it tables_to_query = [Blog, Post, UnvisitedURLs] to_save = True for table in tables_to_query: # Only care about entries that are fully processed - if they're not, we'll need to scrape them! entries = s.query(table).filter_by(url=url,processed=True).first() # If we found entries (!= None), don't add it to our 'unvisited urls' table if entries != None: to_save = False break if not to_save: continue # If didn't break by this point, then we should add the url to the # 'To save' session forthe UnvisitedURLs table #print "adding %s to be saved to the unvisited urls table" % url u = UnvisitedURLs(url=url) u.url_type = self.getUrlType(url) s.add(u) print "committing all unsaved urls to the database" s.commit() print "%s urls now in unvisitedURLs DB\n" % s.query(func.count(UnvisitedURLs)) urls = [] self.unvisited_urls = []
def getNextPageOfNotes(self, content, url): """ Parses the page, finds the link to "view more notes" (or equivalent), and returns a full url that points to the next page of notes """ print "processing next page of notes.. (%s)" % url # Find tumblr's link to the next 'notes' page (hidden away in javascript crap) note_url = re.search(r"tumblrreq.open\('get','(?P<note_url>.*?)',true\);", content, re.I) try: # Extract the url to the note note_url = note_url.group('note_url') except: print "No more pages of notes to process" return -1 # get "http://(name).tumblr.com" - so we can add the extracted notes url to it # (no trailing slash "/" included) base_url = HttpLib.extractBaseUrl(url) next_query = base_url + note_url # "note url" is case sEnSiTiVe!! #print "note url: %s\nnext query: %s\n" % (note_url, next_query) return next_query
def getBlogPosts(self, content, blogname):
    """
    Returns the urls of all posts on the page that belong to the
    'mother blog' -- essentially the page's internal links.  Links that
    resolve to other blogs are dropped.

    Args:
        content: raw source of the api page
        blogname: the title of the blog (no url), ie. 'naivemelody'
    """
    soup = BeautifulSoup(content)
    internal_urls = []
    for tag in soup.findAll('post'):
        cleaned = HttpLib.cleanLinks([tag["url"]])[0]
        # Only keep posts whose url resolves back to this blog
        if self.extractBlogName(cleaned) == blogname:
            internal_urls.append(cleaned)
    return internal_urls
def parseNotes(self, url, num_likes=0, num_comments=0): # AAHHH DONT PASS IN dictionary here!! values transfer over :( (it's not mutable)) """ Parses all of the notes (reblogs, likes, and comments) for the given post URL. Returns tuple: (num_likes, num_comments, reblog_map) """ print "reblog dict at start of parsenotes:" % self.reblog_map content = HttpLib.openAndRead(url, lower=False) soup = BeautifulSoup(content) # Otherwise, parse the links and likes, etc. off the page try: notes = soup.ol.findAll('li') except: print "no notes on page %s" % url return (0, 0) print "processing %s notes from %s" % (len(notes), url) for note in notes: try: words = note["class"].split() # extract the contents of the 'class' tag except: continue note_type = words[1] print "\nNote type: %s" % note_type # The convention will be "<person 1> reblogged <person 2>" # We find those two links and extract their URLs # (Where the 'x reblogged y' info comes from) if note_type == "reblog": try: details = note.span.findAll('a') print "details: %s\n" % details poster = details[1]['href'] print "poster: %s" % poster reblogger = details[0]['href'] print "reblogger: %s" % reblogger # Add to results (reblog_map) print "adding to map" self.reblog_map[poster] = self.reblog_map.get(poster, []) if reblogger not in self.reblog_map[poster]: self.reblog_map[poster].append(reblogger) except: print "Couldn't extract reblog relation(%s). 
Skipping for now" % details # Probably a 'posted by' superflous error - as details = note.span.findAll('a') elif note_type == "like": num_likes += 1 elif note_type == "more_notes_link_container": num_comments += 1 # Figure out what to do next next_query = self.getNextPageOfNotes(content, url) print "\nreturning %s, %s, %s notes processed for %s" \ % (num_likes, num_comments, self.reblog_map, url) if next_query == -1: print "nothing else to query" return (num_likes, num_comments) else: print "more pages to query - getting more" (num_likes, num_comments) = self.parseNotes(next_query, num_likes, num_comments) return (num_likes, num_comments)
def run(self): """ Generates the populated blog object Organizes the parsing of a tumblr blog. For Tumblr blogs, you can use their api to get 50 posts at a time - scraping until we get no more posts """ # We can parse tumblr blogs by adding "/page/n" to a regular tumblr query # or by changing the "start=n" value in the api interface._ # Here, we use the api interface in the form of # "http://(blogname).tumblr.com/api/read/?num=50start=n" # We put in the blog's name, and go through the posts by 50 until we run out offset = 0 # the post # we start querying the next 50 from # Extract blog title and build query string for the tumblr api blog_query = r"http://" + self.blog.name + r".tumblr.com/api/read/?num=50&start=" + str(offset) # Extract basic blog feature content = HttpLib.openAndRead(blog_query) soup = BeautifulSoup(content) basic_info = self.extractBasicTumblrInfo(content, soup, self.blog.name) num_posts = basic_info['num_posts'] print "processing until %s posts\n" % num_posts # Walk through each set of 50 posts of the blog until get to end post while offset <= num_posts: try: print "querying %s" % blog_query source = HttpLib.openAndRead(blog_query) except: print "sanity check failed" raise new_posts = self.getBlogPosts(source, self.blog.name) ## The actual feature extraction self.urls_found.extend(new_posts) # for scraper print "%s new posts urls for %s:" % (len(new_posts), self.blog.name) print "%s old posts in %s:" % (len(self.blog.posts), self.blog.name) if self.redundancyCheck(self.blog.posts, new_posts): for i, post in enumerate(new_posts): self.blog.posts.append(Post(post)) else: print "redundancy check failed - same content returned as prev. 
run" raise # Set-up for next run blog_query = blog_query.rstrip(str(offset)) offset += 50 blog_query += str(offset) #print "\n%s of the posts: " % len(new_posts) #print new_posts #self.urls_found = list(new_posts) # make new list # self.blog modified self.blog.processed=True print "finished processing blog %s" % self.blog.url
def next(self): """ Parses the next set of notes (reblogs, likes, and comments) for the given URL. returns a list of (poster, reblogger) urls """ # The thing we will return list_of_relations = [] num_reblogs = 0 num_likes = 0 source = None # Open the URL and convert to BeautifulSoup content = HttpLib.openAndRead(self.next_url_to_scrape, lower=False) soup = BeautifulSoup(content) # All of the "notes" are stored as <li> objects # We can use BeautifulSoup to extract them try: notes = soup.ol.findAll('li') except: print "no notes on page %s" % self.next_url_to_scrape self.next_url_to_scrape = -1 return [] # If We haven't returned by now, we have notes to process! print "processing %s notes from %s" % (len(notes), self.next_url_to_scrape) # Each "note" is a line that says "reblogged from" or "liked" or something # Here, we will process each note and build the reblog_map for note in notes: # Idenfity the "type" of note (reblog, like, etc.) # Do this by extracting info from the "class" tag try: note_type = note["class"].split()[1] # extract the contents of the 'class' tag print "\nnote_type:", note_type except: continue # If the Note type is a "reblog", try to extract the graph information # about who is reblogging who # The convention will be "<person 1> reblogged <person 2>" # We find those two links and extract their URLs # (Where the 'x reblogged y' info comes from) if note_type == "reblog": try: # Extract relevant information from the "note" details = note.span.findAll('a') print "details: %s" % details # If it's a "reblog" but only one long - that means we've found # The original poster! 
if len(details) == 1: source = Blogger(name=details[0]['title'].encode("utf-8"), url=details[0]['href']) continue poster = Blogger(name=details[1]['title'].encode("utf-8"), url=details[1]['href']) reblogger = Blogger(name=details[0]['title'].encode("utf-8"), url=details[0]['href']) print "poster: %s" % poster print "reblogger: %s" % reblogger # Add to results (reblog_map) list_of_relations.append( (poster, reblogger) ) num_reblogs += 1 # If couldn't find information - whoops! except Exception as e: print "Couldn't extract reblog relation(%s). Skipping for now" % details print "error: ", e # A "Note" may not be a reblog relation - it could be a "like" # If so, count it elif note_type == "like": num_likes += 1 # Find the next page of Notes to process [have to do this manually - no tumblr api for this] self.next_url_to_scrape = self._getUrlForNextPageOfNotes(content, self.next_url_to_scrape) return (list_of_relations, num_reblogs, num_likes, self.next_url_to_scrape, source)