Example #1
from flask import request, jsonify # PostScraper and _blgr2dict come from elsewhere in this project

def create_map():
	url = request.args.get('url', '', type=str) # Get url to scrape for reblogging information
	url = url if len(url) > 0 else 'http://ioncewishediwasawitch.tumblr.com/post/15710423803' # A default url for testing purposes
	
	# Scrape url for reblogging information
	a = PostScraper(url)
	if not a.has_next():
		# Guard against posts with no notes; without it the names below would be unbound
		return jsonify(error='no reblog information found for %s' % url)
	(list_of_relations, num_reblogs, num_likes, next_url_to_scrape, source) = a.next()
		
	# Return reblogging information in json format
	list_of_relations = [{'poster': _blgr2dict(poster), 'reblogger': _blgr2dict(reblogger)} for poster, reblogger in list_of_relations]
	return jsonify(	list_of_relations = list_of_relations, 
					num_reblogs = num_reblogs, 
					num_likes = num_likes,
					next_url_to_scrape = next_url_to_scrape,
					source = _blgr2dict(source))
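The helper _blgr2dict is referenced but not shown in this example. A minimal sketch of what it might do, assuming the poster, reblogger and source objects expose name and url attributes; those attribute names are an assumption for illustration, not taken from the snippet:

def _blgr2dict(blogger):
	# Hypothetical helper: serialize a blogger object so jsonify() can handle it.
	# Assumes .name and .url attributes; adjust to the real blogger model.
	if blogger is None:
		return None
	return {'name': blogger.name, 'url': blogger.url}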
Example #2
    for poster in posters.values():
        print "\n", poster.name, ":"
        for reblogger in poster.rebloggers:
            print "\t",reblogger.name

if __name__ == "__main__":
    try:
        url = sys.argv[1]
    except IndexError: # no url given on the command line
        #url = "http://ioncewishediwasawitch.tumblr.com/post/15710423803" #1 page of notes
        #url = "http://chanaaaa.tumblr.com/post/5326278147" #2,500 notes
        #url = "http://beepbeepfagg0tt.tumblr.com/post/15727549088" #171 notes
        url = "http://beepbeepfagg0tt.tumblr.com/post/15553114034" #151 notes
        #url = "http://beepbeepfagg0tt.tumblr.com/post/15478680896"
    
    a = PostScraper(url)
    
    posters = {}
    num_likes = 0
    num_reblogs = 0    
    post_source = None
    
    # Iterate through all notes and build reblog map
    while a.has_next():
        (new_relations, likes, reblogs, next_url, ret_source) = a.next()
        
        num_likes += likes
        num_reblogs += reblogs
        post_source = post_source or ret_source
        print "post-source: ", post_source
        
    def sendScraper(self, scr_type=None):
        """Chooses a url from self.urls_unvisited and sends out an appropriate
        scraper. Returns -1 if there are no unvisited urls left."""
        print "\nRequesting to send out a %s scraper" % scr_type
        s = Session() # Connection to DB
        q = s.query(UnvisitedURLs).first() # Check if there are any urls to scrape
        
        if q is None:
            print "No URLs to scrape"
            print "%s threads still running" % len(self.threads_out)
            return -1
            
        else:
            url_obj_to_scrape = None
            if scr_type == "blog":
                url_obj_to_scrape = s.query(UnvisitedURLs).filter_by(url_type=0).first()

            elif scr_type == "post":
                url_obj_to_scrape = s.query(UnvisitedURLs).filter_by(url_type=1).first()
            
            # The type-specific query returns None when no URL of that type
            # (or no scr_type at all) is available. In that case, just take
            # the first unvisited URL of any type.
            if not url_obj_to_scrape:
                print "Couldn't find a %s to process - sending out other type instead" % scr_type
                url_obj_to_scrape = s.query(UnvisitedURLs).first()
                
            print "sending out scraper to ", url_obj_to_scrape
                
            # Decide which scraper to send out
            if self.isPostUrl(url_obj_to_scrape.url):
                scraper = PostScraper(url_obj_to_scrape.url)
            else:                
                scraper = BlogScraper(url_obj_to_scrape.url)
                
            # Remove from to_send_out list
            s.delete(url_obj_to_scrape)
            s.commit()
                
            print "sending out scraper for", url_obj_to_scrape.url
            scraper.start()
            self.threads_out.append(scraper)
            return scraper
            
            
    def processReblogRelations(self, post, reblog_map):
        """ For each post object returned, try to update the ReblogRelations 
            table with all of the information included """

        print "\nProcessing %s Reblog Relations for %s" % (len(reblog_map.keys()), post.url)
        print reblog_map
        
        if len(reblog_map.keys()) <= 0:
            print "nothing to process. returning"            
            return

        # If the post reblog_key is already in this table (anywhere),
        # We can assume that we've already completely processed it 
        # (Because of the way Tumblr works)
        # And can return without doing anything
        s = Session()
        any_entries_in_db = s.query(ReblogRelations).filter_by(reblog_key=post.reblog_key).count()
        if any_entries_in_db > 0:
            print "Reblog relations for %s already processed!" % post.url
            print "Doing nothing."
            return

        # Otherwise, 
        # Iterate through reblog map
        # {'poster':['reblogger1', 'reblogger2',...]}        
        for poster, reblogger_list in reblog_map.iteritems():
            for reblogger in reblogger_list:
                new_relation = ReblogRelations(post.reblog_key, poster, reblogger)
                s.add(new_relation)
                
        s.commit()
        
        
        
        
    def updateDB(self, to_add):
        """ Updates the database (in the correct table) with the given object """
        try:
            print "\nupdating DB to include ", to_add.url
            table_type = self.getUrlType(to_add.url)
            s = Session()
            
            if table_type == 0: #blog
                old_entry = s.query(Blog).filter_by(url=to_add.url).first()
                if old_entry:
                    print "ahhh! we processed a blog we shouldn'tve"
                    print old_entry
                    raise
            else: # post
                print "getting post from db"
                old_entry = s.query(Post).filter_by(url=to_add.url).first()
                print "got post from db"
                
            # if the entry exists at all
            if old_entry:
                old_entry.update(to_add)
                s.commit()
            
                print "successfully updated old entry"
            else:
                print "%s not in database yet. Nothing to remove" % to_add.url
                print "trying to add ", to_add.url, " to Database"
                s.add(to_add)
                s.commit()
                print "successfully added %s to Database" % (to_add.url)
        except:
            print "couldn't add %s to Database" % to_add.url
def scrape(url):
    page_id, access_token = get_access_token_page_id(url)
    create_directories(page_id)
    PostScraper(page_id, access_token).scrapeFacebookPageFeedStatus()
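The helpers get_access_token_page_id and create_directories are not shown here. A hedged sketch of what create_directories might do, plus a sample call; the 'data/<page_id>' layout and the page URL are placeholders for illustration only:

import os

def create_directories(page_id):
    # Hypothetical helper: make a per-page output folder before scraping.
    path = os.path.join('data', str(page_id))
    if not os.path.isdir(path):
        os.makedirs(path)

# Example call with a placeholder page URL.
scrape('https://www.facebook.com/SomePublicPage')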