def run(self):
        """    
        Organizes the parsing of a tumblr post. For Tumblr posts, you can use
        their api to get some basic info from html tags, but we probably will 
        also need to look at the actual source code to get the rest of the info
        """
        # We can parse tumblr posts by adding "/<id>" to a regular tumblr query
        # or by changing the "id" value in the api interface,
        # i.e. http://<blog_name>.tumblr.com/<id> or http://<blog_name>.tumblr.com/api/read?id=<id>

        # Here, we use the api interface in the form of
        # "http://<blog_name>.tumblr.com/api/read/?id=<id>"

        # Build the query string for the tumblr api: strip the trailing
        # "/post/<id>" from the post URL (assumes that URL form) and append the api path
        post_query = "%s%s%s" % (self.post.url[: self.post.url.rfind("/") - 5], r"/api/read/?id=", self.post.id)

        # Extract basic post features
        try:
            print "Querying %s" % post_query
            source = HttpLib.openAndRead(post_query)
        except Exception as e:
            print "Query failed: %s" % e
            return

        # Extracts easy features, then sends off to notescraper for more work
        # Fills out post obj. & returns new links for spider
        self.extractBasicInfo(source)

        # Check whether this reblog key has already been processed.
        # If it has, there's no need to process its notes/relations again - they're
        # the same for all posts that share the reblog key!
        reblog_key_in_db = self.s.query(ReblogRelations).filter_by(reblog_key=self.post.reblog_key).first()
        if reblog_key_in_db is None:
            ns = NotesScraper(self.post)
            self.urls_found = ns.all_outgoing_urls
            self.reblog_map = ns.reblog_map
            # passed-in post object is automatically updated too
        else:
            print "Reblog key already in DB! No need to process notes"

        self.post.processed = True  # Now, we have a fully-filled out post object

        try:
            print "postscraper finished processing %s" % self.post.url
            print "reblog map: " % self.reblog_map
            if len(self.reblog_map) > 0:
                print "woo!"
            print self.post

        except:
            print "couldn't print %s\n" % self.post.url
 def parseNotes(self, url, num_likes=0, num_comments=0):
     # NOTE: don't pass a dictionary in as a default argument here - mutable
     # defaults persist across calls, so values would carry over between posts
     """ Parses all of the notes (reblogs, likes, and comments) for the given
         post URL. Reblog relations are accumulated in self.reblog_map.

         Returns tuple: (num_likes, num_comments) """
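     # reblog_map maps a poster's URL to the list of URLs that reblogged it,
     # e.g. (hypothetical values):
     #   { "http://poster.tumblr.com/": ["http://reblogger.tumblr.com/"] }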
         
     print "reblog dict at start of parsenotes:" % self.reblog_map
         
     content = HttpLib.openAndRead(url, lower=False)
     soup = BeautifulSoup(content)
     
     # Parse the links, likes, etc. off the page
     
     try:
         notes = soup.ol.findAll('li')
     except:
         print "no notes on page %s" % url
         return (0, 0)
     
     print "processing %s notes from %s" % (len(notes), url)
     
     for note in notes:
         try:
             words = note["class"].split()   # extract the contents of the 'class' tag
         except:
             continue
         note_type = words[1]
         print "\nNote type: %s" % note_type
         
         # The convention will be "<person 1> reblogged <person 2>"
         # We find those two links and extract their URLs
         # (Where the 'x reblogged y' info comes from)
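         # A reblog note's markup looks roughly like this (structure only; names hypothetical):
         #   <li class="note reblog"><span>
         #     <a href="http://reblogger.tumblr.com/">reblogger</a> reblogged
         #     <a href="http://poster.tumblr.com/">poster</a>
         #   </span></li>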
         
         if note_type == "reblog":
             try:
                 details = note.span.findAll('a')         
                 print "details: %s\n" % details                
 
                 poster = details[1]['href']
                 print "poster: %s" % poster
 
                 reblogger = details[0]['href']
                 print "reblogger: %s" % reblogger
                 
                 
                 # Add to results (reblog_map)
                 print "adding to map"
                 self.reblog_map[poster] = self.reblog_map.get(poster, [])
                 if reblogger not in self.reblog_map[poster]:
                     self.reblog_map[poster].append(reblogger)
                 
             except Exception:
                 # Probably a superfluous 'posted by' note rather than a real reblog relation
                 print "Couldn't extract reblog relation from note (%s). Skipping for now" % note
             
         elif note_type == "like":
             num_likes += 1
             
         elif note_type == "more_notes_link_container":
             num_comments += 1
            
 
     # Figure out what to do next
     next_query = self.getNextPageOfNotes(content, url)
 
     print "\nreturning %s, %s, %s notes processed for %s" \
         % (num_likes, num_comments, self.reblog_map, url)
     
     if next_query == -1:
         print "nothing else to query"
         return (num_likes, num_comments)
     else:
         print "more pages to query - getting more"
         (num_likes, num_comments) = self.parseNotes(next_query, num_likes, num_comments)
         return (num_likes, num_comments)
 def run(self):
     """ Generates the populated blog object 
     
     Organizes the parsing of a tumblr blog. For Tumblr blogs, you can use
     their api to get 50 posts at a time - scraping until we get no more posts
     """
     
     # We can parse tumblr blogs by adding "/page/n" to a regular tumblr query
     # or by changing the "start=n" value in the api interface.

     # Here, we use the api interface in the form of
     # "http://<blog_name>.tumblr.com/api/read/?num=50&start=n"
     # We put in the blog's name and walk through the posts 50 at a time until we run out
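     # For example, the first few queries for a blog named "someblog" (hypothetical) would be:
     #   http://someblog.tumblr.com/api/read/?num=50&start=0
     #   http://someblog.tumblr.com/api/read/?num=50&start=50
     #   http://someblog.tumblr.com/api/read/?num=50&start=100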
     offset = 0      # the post # we start querying the next 50 from
     
     # Extract blog title and build query string for the tumblr api
     blog_query = r"http://" + self.blog.name + r".tumblr.com/api/read/?num=50&start=" + str(offset)
     
     # Extract basic blog features
     content = HttpLib.openAndRead(blog_query)
     soup = BeautifulSoup(content)
     basic_info = self.extractBasicTumblrInfo(content, soup, self.blog.name)
     
     num_posts = basic_info['num_posts']
     print "processing until %s posts\n" % num_posts
     
     
     # Walk through each set of 50 posts of the blog until get to end post
     while offset <= num_posts:
         try:
             print "querying %s" % blog_query
             source = HttpLib.openAndRead(blog_query)
         except Exception as e:
             print "blog query failed: %s" % e
             raise
     
         new_posts = self.getBlogPosts(source, self.blog.name)    ## The actual feature extraction
         self.urls_found.extend(new_posts) # for scraper
         print "%s new posts urls for %s:" % (len(new_posts), self.blog.name)
         
         print "%s old posts in %s:" % (len(self.blog.posts), self.blog.name)
         
         if self.redundancyCheck(self.blog.posts, new_posts):
             for post in new_posts:
                 self.blog.posts.append(Post(post))
         else:
             print "redundancy check failed - same content returned as prev. run"
             raise Exception("redundancy check failed for %s" % self.blog.name)
         
         # Set up the next query (rebuild the URL rather than trimming the old offset off the end)
         offset += 50
         blog_query = r"http://" + self.blog.name + r".tumblr.com/api/read/?num=50&start=" + str(offset)
         
         #print "\n%s of the posts: " % len(new_posts)
         #print new_posts
     
     
     #self.urls_found = list(new_posts) # make new list
     # self.blog modified
     
     self.blog.processed = True
     print "finished processing blog %s" % self.blog.url
    def next(self):
        """ Parses the next set of notes (reblogs, likes, and comments) for the given URL.
            
            returns a list of (poster, reblogger) urls """
            
        # The values we will return
        list_of_relations = []
        num_reblogs = 0
        num_likes = 0
        source = None
            
        # Open the URL and convert to BeautifulSoup
        content = HttpLib.openAndRead(self.next_url_to_scrape, lower=False)
        soup = BeautifulSoup(content)
        
        # All of the "notes" are stored as <li> objects
        # We can use BeautifulSoup to extract them
        try:
            notes = soup.ol.findAll('li')
        except:
            print "no notes on page %s" % self.next_url_to_scrape
            self.next_url_to_scrape = -1
            return []
        
        # If we haven't returned by now, we have notes to process!
        print "processing %s notes from %s" % (len(notes), self.next_url_to_scrape)
        
        # Each "note" is a line that says "reblogged from" or "liked" or something
        # Here, we will process each note and build the reblog_map
        for note in notes:
            # Identify the "type" of note (reblog, like, etc.)
            # Do this by extracting info from the "class" attribute
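            # e.g. a class value like "note reblog" or "note like" (hypothetical markup)
            # yields a note_type of "reblog" or "like" via split()[1]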
            try:
                note_type = note["class"].split()[1]   # extract the contents of the 'class' tag
                print "\nnote_type:", note_type
            except:
                continue


            # If the note type is a "reblog", try to extract the graph information
            # about who is reblogging whom
            
            # The convention will be "<person 1> reblogged <person 2>"
            # We find those two links and extract their URLs
            # (Where the 'x reblogged y' info comes from)
            if note_type == "reblog":
                try:
                    # Extract relevant information from the "note"
                    details = note.span.findAll('a')   
                    print "details: %s" % details
                    
                    # If it's a "reblog" but only one long - that means we've found
                    # The original poster!
                    if len(details) == 1:
                        source = Blogger(name=details[0]['title'].encode("utf-8"), url=details[0]['href'])
                        continue
                    
                    poster = Blogger(name=details[1]['title'].encode("utf-8"), url=details[1]['href'])
                    reblogger = Blogger(name=details[0]['title'].encode("utf-8"), url=details[0]['href'])
                    
                    
                    print "poster: %s" % poster
                    print "reblogger: %s" % reblogger
                    
                    # Add to results (reblog_map)
                    list_of_relations.append( (poster, reblogger) )
                    num_reblogs += 1
                    
                # If we couldn't extract the information - whoops!
                except Exception as e:
                    print "Couldn't extract reblog relation from note (%s). Skipping for now" % note
                    print "error: ", e
            
            # A "Note" may not be a reblog relation - it could be a "like"
            # If so, count it
            elif note_type == "like":
                num_likes += 1
                
               
    
        # Find the next page of Notes to process [have to do this manually - no tumblr api for this]
        self.next_url_to_scrape = self._getUrlForNextPageOfNotes(content, self.next_url_to_scrape)
        
        return (list_of_relations, num_reblogs, num_likes, self.next_url_to_scrape, source)
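
# A minimal driver sketch (not from the original source) showing one way the next()
# method above could be called page by page. It assumes the scraper instance exposes
# next_url_to_scrape, which next() updates and sets to -1 when no pages remain; the
# function name scrapeAllNotes is hypothetical.
def scrapeAllNotes(notes_scraper):
    all_relations = []      # accumulated (poster, reblogger) pairs
    total_reblogs = 0
    total_likes = 0
    original_poster = None
    while notes_scraper.next_url_to_scrape != -1:
        result = notes_scraper.next()
        if not result:      # next() returns [] when a page has no notes
            break
        relations, num_reblogs, num_likes, _, source = result
        all_relations.extend(relations)
        total_reblogs += num_reblogs
        total_likes += num_likes
        if source is not None:
            original_poster = source
    return (all_relations, total_reblogs, total_likes, original_poster)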