def run(self):
        """Run the Spark job that builds the raw Parquet data and player stats.

        Steps, in order:
          1. create a SparkContext and an SQLContext on top of it;
          2. fetch the games for ``self.getYearsMonths()`` through
             ``scrape.get_games`` with ``scrape.filesystem_scraper`` as source;
          3. parallelize them into a cached RDD;
          4. call ``createRawParquet``, ``createHitterStats`` and
             ``createPitcherStats``.

        Progress is printed to stdout. The SparkContext is stopped in a
        ``finally`` clause so it is released even when a step raises.
        """
        sc = SparkContext()
        try:
            sqlContext = SQLContext(sc)

            # NOTE(review): the results are not used in this method; the calls
            # are kept in case get_boundary validates self.begin / self.end
            # (e.g. by raising on bad input) -- confirm and drop if it does not.
            scrape.get_boundary(self.begin)
            scrape.get_boundary(self.end)

            print("here")
            all_years_months_days = self.getYearsMonths()
            print("all_years= %s" % (all_years_months_days,))

            game_ids = scrape.get_games(all_years_months_days,
                                        source=scrape.filesystem_scraper)
            print("games= %s" % (game_ids,))

            gamesRDD = sc.parallelize(game_ids)
            # Cached because the RDD is counted here and then passed on to
            # createRawParquet.
            gamesRDD.cache()
            print("fileRDD= %s" % (gamesRDD,))

            print("# parttions: %s" % (gamesRDD.getNumPartitions(),))
            print("count= %s" % (gamesRDD.count(),))

            # create RDDs
            self.createRawParquet(sc, sqlContext, gamesRDD)

            # Hitter Stats (return value is not needed here)
            self.createHitterStats(sqlContext)

            # create Pitcher Stats
            self.createPitcherStats(sqlContext)
        finally:
            # Always release the Spark resources, including on failure.
            print("STOPPING")
            sc.stop()
    def run(self):
        """Download the files for every game using Spark.

        Creates a SparkContext and a ``requests`` session, obtains the games
        from ``scrape.get_games`` for
        ``self.getYearsMonths(self.WEB_ROOT, session)``, parallelizes them and
        reduces ``getFiles`` over the RDD with ``summarize``.  The reduced
        result is indexed as ``(count, fails)``: the count is logged together
        with the elapsed time, and every entry of ``fails`` is logged as a
        failed download.

        When there are no games the reduce step is skipped, because
        ``RDD.reduce`` raises ``ValueError`` on an empty RDD, and zero
        downloads are reported instead.  The session is closed and the
        SparkContext stopped even when a step raises.
        """
        sc = SparkContext()
        try:
            start_scrape = datetime.now()

            # NOTE(review): the results are not used in this method; the calls
            # are kept in case get_boundary validates self.begin / self.end
            # (e.g. by raising on bad input) -- confirm and drop if it does not.
            scrape.get_boundary(self.begin)
            scrape.get_boundary(self.end)

            # The context manager closes the session on every exit path.
            with requests.Session() as session:
                print("here")
                all_years_months_days = self.getYearsMonths(self.WEB_ROOT,
                                                            session)

                games = scrape.get_games(all_years_months_days,
                                         session=session)

                gamesRDD = sc.parallelize(games)
                print("fileRDD= %s" % (gamesRDD,))

                gamesRDD.foreach(dump)
                print("# parttions: %s" % (gamesRDD.getNumPartitions(),))
                game_count = gamesRDD.count()
                print("count= %s" % (game_count,))

                if game_count:
                    res = gamesRDD.map(getFiles).reduce(summarize)
                    print("res= %s" % (res,))
                    count = res[0]
                    fails = res[1]
                else:
                    # Nothing to reduce: report an empty run instead of
                    # letting reduce() fail on the empty RDD.
                    count = 0
                    fails = []

            end_scrape = datetime.now()
            self.log.info("%d files downloaded in %s", count,
                          end_scrape - start_scrape)
            if fails:
                for url in fails:
                    self.log.error("failed to download %s", url)
        finally:
            # Always release the Spark resources, including on failure.
            sc.stop()