def create_map():
    # Get the url to scrape for reblogging information
    url = request.args.get('url', '', type=str)
    # Fall back to a default url for testing purposes
    url = url if len(url) > 0 else 'http://ioncewishediwasawitch.tumblr.com/post/15710423803'

    # Scrape the url for reblogging information
    a = PostScraper(url)
    if not a.has_next():
        # Guard against posts with no notes: without this, the names below
        # would be unbound and the view would raise a NameError
        return jsonify(list_of_relations=[], num_reblogs=0, num_likes=0,
                       next_url_to_scrape=None, source=None)
    (list_of_relations, num_reblogs, num_likes, next_url_to_scrape, source) = a.next()

    # Return the reblogging information in json format
    list_of_relations = [{'poster': _blgr2dict(poster), 'reblogger': _blgr2dict(reblogger)}
                         for poster, reblogger in list_of_relations]
    return jsonify(
        list_of_relations=list_of_relations,
        num_reblogs=num_reblogs,
        num_likes=num_likes,
        next_url_to_scrape=next_url_to_scrape,
        source=_blgr2dict(source))
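
# A minimal sketch of how this view might be exposed, assuming a standard
# Flask app object; the route path is an assumption -- the original routing
# code is not part of this excerpt.
from flask import Flask, request, jsonify

app = Flask(__name__)

# Register create_map at an assumed path, e.g. GET /create_map?url=<post url>
app.add_url_rule('/create_map', 'create_map', create_map)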
if __name__ == "__main__":
    try:
        url = sys.argv[1]
    except IndexError:
        #url = "http://ioncewishediwasawitch.tumblr.com/post/15710423803"  # 1 page of notes
        #url = "http://chanaaaa.tumblr.com/post/5326278147"                # 2,500 notes
        #url = "http://beepbeepfagg0tt.tumblr.com/post/15727549088"        # 171 notes
        url = "http://beepbeepfagg0tt.tumblr.com/post/15553114034"         # 151 notes
        #url = "http://beepbeepfagg0tt.tumblr.com/post/15478680896"

    a = PostScraper(url)
    posters = {}
    num_likes = 0
    num_reblogs = 0
    post_source = None

    # Iterate through all notes and build the reblog map
    while a.has_next():
        (new_relations, likes, reblogs, next_url, ret_source) = a.next()
        num_likes += likes
        num_reblogs += reblogs
        post_source = post_source or ret_source
        # merge new_relations into the posters map (a sketch follows below)

    print "post-source: ", post_source

    # Print the reblog map: each poster followed by everyone who reblogged from them
    for poster in posters.values():
        print "\n", poster.name, ":"
        for reblogger in poster.rebloggers:
            print "\t", reblogger.name
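
# A sketch of the merge step elided above, assuming new_relations is a list
# of (poster, reblogger) pairs as in create_map(), and that each blogger
# object carries a .name and a .rebloggers list. merge_relations is a
# hypothetical helper, not part of the original code.
def merge_relations(posters, new_relations):
    for poster, reblogger in new_relations:
        # Keep one canonical object per poster name
        canonical = posters.setdefault(poster.name, poster)
        canonical.rebloggers.append(reblogger)
    return posters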
def sendScraper(self, scr_type=None):
    """Chooses a url from the unvisited set and sends out an appropriate
    scraper. Returns -1 if there are no unvisited urls left."""
    print "\nRequesting to send out a %s scraper" % scr_type
    s = Session()  # Connection to the DB

    # Check whether there are any urls left to scrape
    q = s.query(UnvisitedURLs).first()
    if q is None:
        print "No URLs to scrape"
        print "%s threads still running" % len(self.threads_out)
        return -1

    url_obj_to_scrape = None
    if scr_type == "blog":
        url_obj_to_scrape = s.query(UnvisitedURLs).filter_by(url_type=0).first()
    elif scr_type == "post":
        url_obj_to_scrape = s.query(UnvisitedURLs).filter_by(url_type=1).first()

    # The filtered queries return None when no url of the requested type
    # exists; in that case, fall back to the first unvisited url of any type
    if not url_obj_to_scrape:
        print "Couldn't find a %s to process - sending out other type instead" % scr_type
        url_obj_to_scrape = s.query(UnvisitedURLs).first()

    print "sending out scraper to ", url_obj_to_scrape

    # Decide which scraper to send out
    if self.isPostUrl(url_obj_to_scrape.url):
        scraper = PostScraper(url_obj_to_scrape.url)
    else:
        scraper = BlogScraper(url_obj_to_scrape.url)

    # Remove the url from the unvisited set
    s.delete(url_obj_to_scrape)
    s.commit()

    print "sending out scraper for", url_obj_to_scrape.url
    scraper.start()
    self.threads_out.append(scraper)
    return scraper

def processReblogRelations(self, post, reblog_map):
    """For each post object returned, try to update the ReblogRelations
    table with all of the included information."""
    print "\nProcessing %s Reblog Relations for %s" % (len(reblog_map), post.url)
    print reblog_map
    if len(reblog_map) == 0:
        print "nothing to process. returning"
        return

    # If the post's reblog_key already appears anywhere in this table, we
    # can assume we've already processed it completely (because of the way
    # Tumblr works) and can return without doing anything
    s = Session()
    any_entries_in_db = s.query(ReblogRelations).filter_by(reblog_key=post.reblog_key).count()
    if any_entries_in_db > 0:
        print "Reblog relations for %s already processed!" % post.url
        print "Doing nothing."
        return

    # Otherwise, iterate through the reblog map:
    # {'poster': ['reblogger1', 'reblogger2', ...]}
    for poster, reblogger_list in reblog_map.iteritems():
        for reblogger in reblogger_list:
            new_relation = ReblogRelations(post.reblog_key, poster, reblogger)
            s.add(new_relation)
    s.commit()

def updateDB(self, to_add):
    """Updates the correct table of the database with the given object."""
    try:
        print "\nupdating DB to include ", to_add.url
        table_type = self.getUrlType(to_add.url)
        s = Session()
        if table_type == 0:  # blog
            old_entry = s.query(Blog).filter_by(url=to_add.url).first()
            if old_entry:
                print "ahhh! we processed a blog we shouldn't have"
                print old_entry
                raise Exception("duplicate blog entry")
        else:  # post
            print "getting post from db"
            old_entry = s.query(Post).filter_by(url=to_add.url).first()
            print "got post from db"

        # If the entry already exists, update it in place
        if old_entry:
            old_entry.update(to_add)
            s.commit()
            print "successfully updated old entry"
        else:
            print "%s not in database yet. Nothing to remove" % to_add.url
            print "trying to add ", to_add.url, " to Database"
            s.add(to_add)
            s.commit()
            print "successfully added %s to Database" % to_add.url
    except Exception:
        print "couldn't add %s to Database" % to_add.url
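
# A sketch of the two models the methods above rely on. The column names are
# inferred from the filter_by() calls and constructors used above; the types,
# table names, and declarative setup are assumptions.
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class UnvisitedURLs(Base):
    __tablename__ = 'unvisited_urls'
    id = Column(Integer, primary_key=True)
    url = Column(String)
    url_type = Column(Integer)  # 0 = blog, 1 = post

class ReblogRelations(Base):
    __tablename__ = 'reblog_relations'
    id = Column(Integer, primary_key=True)
    reblog_key = Column(String)
    poster = Column(String)
    reblogger = Column(String)

    def __init__(self, reblog_key, poster, reblogger):
        self.reblog_key = reblog_key
        self.poster = poster
        self.reblogger = reblogger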
def scrape(url):
    # Resolve the page id and access token for the given url
    page_id, access_token = get_access_token_page_id(url)

    # Make sure the output directories for this page exist
    create_directories(page_id)

    # Scrape the page's feed statuses
    PostScraper(page_id, access_token).scrapeFacebookPageFeedStatus()
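
# Example usage, assuming get_access_token_page_id() can resolve a page id
# and access token from a public page url; the url below is illustrative only.
if __name__ == "__main__":
    scrape('https://www.facebook.com/nytimes')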