def assimilateNewUrls(self, urls):
    """Adds urls to the UnvisitedURLs table if we don't have
    information on them in any of our tables."""
    # First, 'clean' the new urls - remove duplicates, trailing slashes, etc.
    urls = HttpLib.cleanLinks(urls)
    print "\nAssimilating %s new urls\n" % len(urls)
    if len(urls) <= 0:
        return
    s = Session()
    tables_to_query = [Blog, Post, UnvisitedURLs]
    # For each URL, query the database to find out if it's already in it
    for url in urls:
        to_save = True
        for table in tables_to_query:
            # Only care about entries that are fully processed - if they're
            # not, we'll still need to scrape them!
            entry = s.query(table).filter_by(url=url, processed=True).first()
            # If we found an entry, don't add the url to our
            # 'unvisited urls' table
            if entry is not None:
                to_save = False
                break
        if not to_save:
            continue
        # If we didn't break by this point, add the url to the
        # session to be saved to the UnvisitedURLs table
        u = UnvisitedURLs(url=url)
        u.url_type = self.getUrlType(url)
        s.add(u)
    print "committing all unsaved urls to the database"
    s.commit()
    print "%s urls now in unvisitedURLs DB\n" % s.query(UnvisitedURLs).count()
    urls = []
    self.unvisited_urls = []
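# For context, a minimal sketch of the SQLAlchemy setup this method leans
# on. The real Blog, Post, and UnvisitedURLs models live elsewhere in the
# project; the columns here ('url', 'processed', 'url_type') are inferred
# from their usage above, and the table name, surrogate key, and SQLite
# backend are assumptions, not the project's actual configuration.
from sqlalchemy import Column, Integer, String, Boolean, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

class UnvisitedURLs(Base):
    __tablename__ = 'unvisited_urls'            # assumed table name
    id = Column(Integer, primary_key=True)      # assumed surrogate key
    url = Column(String, unique=True)           # cleaned, deduplicated url
    url_type = Column(String)                   # set via self.getUrlType(url)
    processed = Column(Boolean, default=False)  # flipped once scraped

engine = create_engine('sqlite:///scraper.db')  # assumed backend
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)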
def getBlogPosts(self, content, blogname):
    """Returns the urls of all the posts on a page, but only the ones
    that link to the 'mother blog' - not to others. Essentially a
    page's internal links.

    Args:
        content: the raw page content to parse
        blogname: the title of the blog (no url), ie. 'naivemelody'
    """
    soup = BeautifulSoup(content)
    post_urls = soup.findAll('post')
    posts = []  # a list of post urls that we'll insert into the DB
    for post in post_urls:
        # Only save posts that belong to the 'mother blog'
        url = post["url"]
        url = HttpLib.cleanLinks([url])[0]
        if self.extractBlogName(url) == blogname:
            posts.append(url)
    return posts
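# A minimal usage sketch tying the two methods together, assuming a
# Scraper class that holds them. The class name, the example blog, and
# the HttpLib.get() fetch helper are all assumptions for illustration -
# only HttpLib.cleanLinks appears in the original code.
if __name__ == '__main__':
    scraper = Scraper()
    # Fetch a page of the blog (hypothetical helper)
    content = HttpLib.get('http://naivemelody.tumblr.com/page/1')
    # Extract only the posts that belong to this blog
    post_urls = scraper.getBlogPosts(content, 'naivemelody')
    # Queue any of those posts we haven't already processed
    scraper.assimilateNewUrls(post_urls)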