Esempio n. 1
0
    def update(self, url = ''):
        """Retrieve HTML from first page and scrape basic info 
        """
   
        if not url:
            url = self.url 

        self.url = vbutils.cleanURL(url)
        self.id = vbutils.findThreadID(self.url)
        page = []
        print "Scraping %s ..." % self.url
        page.append(getPage(self.url))
        self.numpages = int(vbscrape.scrapeNumPages(page[0]))
        print "Found %s pages." % str(self.numpages)
        for p in range(1, self.numpages):
            print "Scraping page %s of %s ..." % (str(p+1), str(self.numpages))
            page.append(getPage(self.url, (p + 1)))

        print "Importing data from HTML ..."
        self.importHTML(page, self.url)

        self.lastupdate = vbutils.getDateTime()   
        print "Thread update completed at %s" % self.lastupdate
Esempio n. 2
0
    def update(self, platform_ = ''):
        """Download latest version of all threads
            and update the JSON summary file.
        """
        # Very important that platform_ is set correctly
        # Filename transformations depend on this setting
        if not platform_:
            if self.platform:
                platform_ = self.platform
            else:
                platform_ = 'unix'
        
        # Loop over each thread in each forum
        #   and download latest data
        for forumid, forumobj in self.forum.iteritems():
            print "Checking %s threads in %s for updates ..." % (len(forumobj.thread), forumid)
            for threadid, threadobj in forumobj.thread.iteritems():
                # TODO Problem: Thread objects created from
                #   arbitrary HTML/JSON may not have URL 
                #   Maybe we can implement a smarter URL guessing
                #   heuristic based on other things 
                #   Even google search? :-?
                if not downloadThread(threadobj, self.localdir, platform = platform_):
                    print "Failed to download thread %s" % threadobj.title
                    print "Attempting to proceed anyway ..."

        # Sync this Archive object with new data on disk 
        # os.walk gives us an iterator of a dir tree
        # TODO Need to catch errors
        print "Syncing new data in %s" % self.localdir

        currentforum = ''
        currentthreadlist = [] 
        currentthread = ''
        currentinstance = ''
        # Iterate through the rest of the subdirs
        # They won't be a reliable order so we need to
        #   figure out where we are in the tree each time
        for root, dirs, files in os.walk(self.localdir, topdown=True):
            lastdir = os.path.split(root.rstrip('/'))[1]
            nextlastdir = os.path.split(os.path.split(root.rstrip('/'))[0]) 
            if (root == self.localdir):
                # We are in archive root
                # Create a Forum object for each subdir
                for d in dirs:
                    print "Found forum %s" % d
                    self._addForum(d)
            elif (lastdir in self.forum.keys()):
                # We are in a forum dir
                currentforum = lastdir
                # Assume all files are subdirs named by slugs
                # TODO Need to validate these dirs
                currentthreadlist = dirs 
                for t in currentthreadlist:
                    print "Found thread %s" % t
                currentinstance = ''            
            elif (lastdir in currentthreadlist):
                # We are in a thread dir
                currentthread = lastdir
                # Assume all files are subdirs with instances
                # Sort by date 
                print "Reviewing thread %s" % currentthread
                print "Found %s saved instances" % len(dirs)
                dirs.sort()
                dirs = [dirs[0]]
                currentinstance = dirs[0]
                print "Latest update was on %s" % currentinstance
            elif (lastdir == currentinstance):
                # We are in the dir of an instance
                # Find an original html page,
                #   e.g. showthread.php@t=01235.orig
                # TODO this is vB convention need to be template
                try:
                    for f in files:
                        print f
                    orig_file = (f for f in files if (f[:4] == 'show' and f[-4:] == 'orig')).next()
                    print "Found source file: %s" % orig_file
                except:
                    orig_file = ''
                    print "Source HTML file not found"
                    break

                # TODO Much easier if subdirs = IDs instead of slugs
                # TODO Is this something we should change?
                id = vbutils.findThreadID(orig_file)

                # Is there already a thread object?
                if not id in self.forum[currentforum].thread.keys():
                    # Read the original html
                    subdir = root 
                    print "Attempting to read from %s" % subdir
                    with open(os.path.join(subdir, orig_file), 'r') as f:
                        orig_html = f.read()

                    # Find the URL in this HTML 
                    # TODO sometimes the URL is not discoverable
                    url = vbutils.findThreadURL(orig_html, id)
 
                    # Create this thread object 
                    # TODO Try/except might be cleaner
                    if not url:
                        print "No valid URL found in HTML."
                        print "Trying to make Thread object anyway from the HTML."
                        self.addThread(rawhtml_ = orig_html)
                    else:
                        if not self.addThread(url):
                            print "Tried and failed to create Thread object for URL: %s" % url

        # Last update is now!
        self.lastupdate = vbutils.getDateTime()        

        # If we survive that insane loop,
        #   the data model should be up to date
        # Store this updated data to a JSON summary
        self.writeSummary()