def run(self):
        """    
        Organizes the parsing of a tumblr post. For Tumblr posts, you can use
        their api to get some basic info from html tags, but we probably will 
        also need to look at the actual source code to get the rest of the info
        """
        # We can parse tumblr blogs by adding "/<id>" to a regular tumblr query
        # or by changing the "id" value in the api interface.
        # i.e. http://<blog_name>.tumblr.com/<id> or http://<blog_name>.tumblr.com/api/read?id=<id>

        # Here, we use the api interface in the form of
        # "http://<blog_name>.tumblr.com/api/read/?id=<id>"

        # Build query string for the tumblr api
        # (the slice trims the trailing "/post/<id>" off the post url so we're
        # left with the blog's base url before appending the api path)
        post_query = "%s%s%s" % (self.post.url[: self.post.url.rfind("/") - 5], r"/api/read/?id=", self.post.id)

        # Extract basic post features
        try:
            print "Querying %s" % post_query
            source = HttpLib.openAndRead(post_query)
        except:
            print "Query failed"
            return

        # Extracts easy features, then sends off to notescraper for more work
        # Fills out post obj. & returns new links for spider
        self.extractBasicInfo(source)

        # Check whether this reblog key has already been processed.
        # If it has, there's no need to process its notes/relations again - they're
        # the same for all posts sharing the reblog key!
        reblog_key_in_db = self.s.query(ReblogRelations).filter_by(reblog_key=self.post.reblog_key).first()
        if reblog_key_in_db is None:
            ns = NotesScraper(self.post)
            self.urls_found = ns.all_outgoing_urls
            self.reblog_map = ns.reblog_map
            # the passed-in post object is automatically updated too
        else:
            print "Reblog key already in DB! No need to process notes"

        self.post.processed = True  # Now, we have a fully-filled out post object

        try:
            print "postscraper finished processing %s" % self.post.url
            print "reblog map: " % self.reblog_map
            if len(self.reblog_map) > 0:
                print "woo!"
            print self.post

        except:
            print "couldn't print %s\n" % self.post.url
    def assimilateNewUrls(self, urls):
        """adds urls to the UnvisitedURLs table if we don't have information on them
           in any of our tables"""
           
        # First, 'clean' the new urls - remove duplicates, and trailing slashes, etc.
        urls = HttpLib.cleanLinks(urls)
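        # e.g. (illustrative) cleanLinks would fold "http://example.tumblr.com/post/1/"
        # and "http://example.tumblr.com/post/1" into a single entry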
           
        print "\nAssimilating %s new urls\n" % len(urls)
        if len(urls) <= 0:
            return
            
        s = Session()
        
        
        # For each URL, 
        for url in urls:
            
            # Query the Database & find out if URL already in it
            tables_to_query = [Blog, Post, UnvisitedURLs]
            to_save = True
            for table in tables_to_query:
                # Only care about entries that are fully processed - if they're not, we'll need to scrape them!
                entries = s.query(table).filter_by(url=url, processed=True).first()

                # If we found an entry, don't add the url to our 'unvisited urls' table
                if entries is not None:
                    to_save = False
                    break
                
            if not to_save:
                continue
            
            # If we didn't break by this point, then we should add the url to the
            # 'to save' session for the UnvisitedURLs table
            #print "adding %s to be saved to the unvisited urls table" % url
            u = UnvisitedURLs(url=url)
            u.url_type = self.getUrlType(url)
            s.add(u)
        
            
        print "committing all unsaved urls to the database"
        s.commit()
        print "%s urls now in unvisitedURLs DB\n" % s.query(func.count(UnvisitedURLs))
        urls = []
        self.unvisited_urls = []         
 def getNextPageOfNotes(self, content, url):
     """ Parses the page, finds the link to "view more notes" (or equivalent),
         and returns a full url that points to the next page of notes """
     print "processing next page of notes.. (%s)" % url
     
     # Find tumblr's link to the next 'notes' page (hidden away in javascript crap)
     note_url = re.search(r"tumblrreq.open\('get','(?P<note_url>.*?)',true\);", content, re.I)
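     # For illustration (hypothetical values) - the captured link tends to look
     # something like "/notes/123456789/AbCdE?from_c=1234567890", which is why we
     # glue it onto the blog's base url below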
 
     try:
         # Extract the url to the note
         note_url = note_url.group('note_url')
     except:
         print "No more pages of notes to process"
         return -1
     
     # get "http://(name).tumblr.com" - so we can add the extracted notes url to it
     # (no trailing slash "/" included)
     base_url = HttpLib.extractBaseUrl(url)     
     next_query = base_url + note_url    # "note url" is case sEnSiTiVe!!
     
     #print "note url: %s\nnext query: %s\n" % (note_url, next_query)
     return next_query
 def getBlogPosts(self, content, blogname):
     """ Returns 'Post' objects with only their 'url' attribute filled out
         Gets all of the posts on a page, but only the ones that link to the
         'mother blog' - not to others. Essentially a page's internal links 
         
         Args:
             blog: the title of the blog (no url), ie. 'naivemelody' """
     soup = BeautifulSoup(content)
     post_urls = soup.findAll('post')
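     # The api's xml response lists each post as a <post ...> element carrying a
     # "url" attribute, roughly (illustrative):
     #   <post id="..." url="http://name.tumblr.com/post/..." type="...">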
     
     posts = [] # a list of 'Post' objects that we'll insert into the DB
     
     for post in post_urls:
         # Only save posts that belong to the 'mother blog'
         url = post["url"]
         url = HttpLib.cleanLinks([url])[0]
         if self.extractBlogName(url) == blogname:
             #posts.append(Post(url=url))
             posts.append(url)
 
     #print "found posts for %s!" % blogname
     #print posts
     return posts
 def parseNotes(self, url, num_likes=0, num_comments=0):
     # Note: don't make the reblog map a default argument here - mutable defaults
     # are created once and shared across calls, so relations would leak between
     # posts. The reblog map lives on self.reblog_map instead.
     """ Parses all of the notes (reblogs, likes, and comments) for the given
         post URL. Reblog relations are accumulated in self.reblog_map.

         Returns tuple: (num_likes, num_comments) """

     print "reblog dict at start of parsenotes: %s" % self.reblog_map
         
     content = HttpLib.openAndRead(url, lower=False)
     soup = BeautifulSoup(content)
     
     # Otherwise, parse the links and likes, etc. off the page
     
     try:
         notes = soup.ol.findAll('li')
     except:
         print "no notes on page %s" % url
         return (0, 0)
     
     print "processing %s notes from %s" % (len(notes), url)
     
     for note in notes:
         try:
             words = note["class"].split()   # extract the contents of the 'class' tag
         except:
             continue
         note_type = words[1]
         print "\nNote type: %s" % note_type
         
         # The convention will be "<person 1> reblogged <person 2>"
         # We find those two links and extract their URLs
         # (Where the 'x reblogged y' info comes from)
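         # A reblog note's markup is roughly (illustrative, simplified):
         #   <li class="note reblog"><span class="action">
         #     <a href="http://reblogger.tumblr.com/">X</a> reblogged this from
         #     <a href="http://poster.tumblr.com/">Y</a></span></li>
         # so details[0] is the reblogger and details[1] is the poster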
         
         if note_type == "reblog":
             try:
                 details = note.span.findAll('a')         
                 print "details: %s\n" % details                
 
                 poster = details[1]['href']
                 print "poster: %s" % poster
 
                 reblogger = details[0]['href']
                 print "reblogger: %s" % reblogger
                 
                 
                 # Add to results (reblog_map)
                 print "adding to map"
                 self.reblog_map[poster] = self.reblog_map.get(poster, [])
                 if reblogger not in self.reblog_map[poster]:
                     self.reblog_map[poster].append(reblogger)
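                 # e.g. after a few notes the map might look like (hypothetical urls):
                 #   {"http://poster.tumblr.com/": ["http://reblogger1.tumblr.com/",
                 #                                  "http://reblogger2.tumblr.com/"]}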
                 
             except:
                 # Probably a superfluous 'posted by' note rather than a real
                 # "x reblogged y" relation
                 print "Couldn't extract reblog relation (%s). Skipping for now" % note
             
         elif note_type == "like":
             num_likes += 1
             
         elif note_type == "more_notes_link_container":
             num_comments += 1
            
 
     # Figure out what to do next
     next_query = self.getNextPageOfNotes(content, url)
 
     print "\nreturning %s, %s, %s notes processed for %s" \
         % (num_likes, num_comments, self.reblog_map, url)
     
     if next_query == -1:
         print "nothing else to query"
         return (num_likes, num_comments)
     else:
         print "more pages to query - getting more"
         (num_likes, num_comments) = self.parseNotes(next_query, num_likes, num_comments)
         return (num_likes, num_comments)
 def run(self):
     """ Generates the populated blog object 
     
     Organizes the parsing of a tumblr blog. For Tumblr blogs, you can use
     their api to get 50 posts at a time - scraping until we get no more posts
     """
     
     # We can parse tumblr blogs by adding "/page/n" to a regular tumblr query
     # or by changing the "start=n" value in the api interface.

     # Here, we use the api interface in the form of
     # "http://(blogname).tumblr.com/api/read/?num=50&start=n"
     # We put in the blog's name, and go through the posts 50 at a time until we run out
     offset = 0      # the post # we start querying the next 50 from
     
     # Extract blog title and build query string for the tumblr api
     blog_query = r"http://" + self.blog.name + r".tumblr.com/api/read/?num=50&start=" + str(offset)
     
     # Extract basic blog feature
     content = HttpLib.openAndRead(blog_query)
     soup = BeautifulSoup(content)
     basic_info = self.extractBasicTumblrInfo(content, soup, self.blog.name)
     
     num_posts = basic_info['num_posts']
     print "processing until %s posts\n" % num_posts
     
     
     # Walk through each set of 50 posts of the blog until get to end post
     while offset <= num_posts:
         try:
             print "querying %s" % blog_query
             source = HttpLib.openAndRead(blog_query)
         except:
             print "sanity check failed"
             raise
     
         new_posts = self.getBlogPosts(source, self.blog.name)    ## The actual feature extraction
         self.urls_found.extend(new_posts) # for scraper
         print "%s new posts urls for %s:" % (len(new_posts), self.blog.name)
         
         print "%s old posts in %s:" % (len(self.blog.posts), self.blog.name)
         
         if self.redundancyCheck(self.blog.posts, new_posts):
             for post in new_posts:
                 self.blog.posts.append(Post(post))
         else:
             print "redundancy check failed - same content returned as prev. run"
             raise RuntimeError("redundancy check failed for %s" % self.blog.name)
         
         # Set-up for next run: rebuild the query string with the new offset
         offset += 50
         blog_query = r"http://" + self.blog.name + r".tumblr.com/api/read/?num=50&start=" + str(offset)
         
         #print "\n%s of the posts: " % len(new_posts)
         #print new_posts
     
     
     #self.urls_found = list(new_posts) # make new list
     # self.blog modified
     
     self.blog.processed=True
     print "finished processing blog %s" % self.blog.url
    def next(self):
        """ Parses the next set of notes (reblogs, likes, and comments) for the given URL.
            
            returns a list of (poster, reblogger) urls """
            
        # The thing we will return
        list_of_relations = []
        num_reblogs = 0
        num_likes = 0
        source = None
            
        # Open the URL and convert to BeautifulSoup
        content = HttpLib.openAndRead(self.next_url_to_scrape, lower=False)
        soup = BeautifulSoup(content)
        
        # All of the "notes" are stored as <li> objects
        # We can use BeautifulSoup to extract them
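        # Roughly (illustrative), the notes page looks like:
        #   <ol class="notes">
        #     <li class="note reblog">...</li>
        #     <li class="note like">...</li>
        #   </ol>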
        try:
            notes = soup.ol.findAll('li')
        except:
            print "no notes on page %s" % self.next_url_to_scrape
            self.next_url_to_scrape = -1
            # keep the return shape consistent with the normal case below
            return ([], 0, 0, self.next_url_to_scrape, None)
        
        # If We haven't returned by now, we have notes to process!
        print "processing %s notes from %s" % (len(notes), self.next_url_to_scrape)
        
        # Each "note" is a line that says "reblogged from" or "liked" or something
        # Here, we will process each note and build the reblog_map
        for note in notes:
            # Identify the "type" of note (reblog, like, etc.)
            # Do this by extracting info from the "class" tag
            try:
                note_type = note["class"].split()[1]   # extract the contents of the 'class' tag
                print "\nnote_type:", note_type
            except:
                continue


            # If the Note type is a "reblog", try to extract the graph information
            # about who is reblogging who
            
            # The convention will be "<person 1> reblogged <person 2>"
            # We find those two links and extract their URLs
            # (Where the 'x reblogged y' info comes from)
            if note_type == "reblog":
                try:
                    # Extract relevant information from the "note"
                    details = note.span.findAll('a')   
                    print "details: %s" % details
                    
                    # If it's a "reblog" but only one long - that means we've found
                    # The original poster!
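                    # (such a note typically reads "<name> posted this", with a
                    #  single link back to the original poster - illustrative)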
                    if len(details) == 1:
                        source = Blogger(name=details[0]['title'].encode("utf-8"), url=details[0]['href'])
                        continue
                    
                    poster = Blogger(name=details[1]['title'].encode("utf-8"), url=details[1]['href'])
                    reblogger = Blogger(name=details[0]['title'].encode("utf-8"), url=details[0]['href'])
                    
                    
                    print "poster: %s" % poster
                    print "reblogger: %s" % reblogger
                    
                    # Add to results (reblog_map)
                    list_of_relations.append( (poster, reblogger) )
                    num_reblogs += 1
                    
                # If couldn't find information - whoops!
                except Exception as e:
                    print "Couldn't extract reblog relation(%s). Skipping for now" % details
                    print "error: ", e
            
            # A "Note" may not be a reblog relation - it could be a "like"
            # If so, count it
            elif note_type == "like":
                num_likes += 1
                
               
    
        # Find the next page of Notes to process [have to do this manually - no tumblr api for this]
        self.next_url_to_scrape = self._getUrlForNextPageOfNotes(content, self.next_url_to_scrape)
        
        return (list_of_relations, num_reblogs, num_likes, self.next_url_to_scrape, source)
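
# A minimal usage sketch (illustrative, not from the original source): drive
# next() in a loop until there are no more pages of notes. "scraper" is assumed
# to be a NotesScraper-like object exposing next() and next_url_to_scrape as above.
def collect_all_notes(scraper):
    all_relations = []      # accumulated (poster, reblogger) pairs
    total_reblogs = 0
    total_likes = 0
    original_poster = None

    # next() advances scraper.next_url_to_scrape itself; -1 marks "no more pages"
    while scraper.next_url_to_scrape != -1:
        result = scraper.next()
        if not result:      # defensive: an empty result also means we're done
            break
        relations, num_reblogs, num_likes, _, source = result
        all_relations.extend(relations)
        total_reblogs += num_reblogs
        total_likes += num_likes
        if source is not None:      # remember the original poster once found
            original_poster = source

    return (all_relations, total_reblogs, total_likes, original_poster)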