from datetime import datetime

import requests

from pyspark import SparkContext
from pyspark.sql import SQLContext

# Variant 1: build the stats pipeline from games already scraped to the
# local filesystem.
def run(self):
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    #sqlContext = HiveContext(sc)
    start_scrape = datetime.now()
    begin, begin_parts = scrape.get_boundary(self.begin)
    end, end_parts = scrape.get_boundary(self.end)
    print "here"
    all_years_months_days = self.getYearsMonths()
    print "all_years=", all_years_months_days
    game_ids = scrape.get_games(all_years_months_days,
                                source=scrape.filesystem_scraper)
    print "games=", game_ids

    # Distribute the game ids across the cluster and cache them: the RDD
    # feeds three separate jobs below.
    gamesRDD = sc.parallelize(game_ids)
    gamesRDD.cache()
    print "fileRDD=", gamesRDD
    print "# partitions:", gamesRDD.getNumPartitions()
    print "count=", gamesRDD.count()

    # create RDDs
    self.createRawParquet(sc, sqlContext, gamesRDD)

    # Hitter Stats
    batter_games = self.createHitterStats(sqlContext)

    # create Pitcher Stats
    self.createPitcherStats(sqlContext)

    print "STOPPING"
    sc.stop()
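The cache() call matters here because gamesRDD is consumed by three separate jobs (raw parquet, hitter stats, pitcher stats); without it, Spark would rebuild the RDD for each action. A minimal, standalone sketch of the pattern, independent of this project's helpers (the data is made up; parallelize, cache, and getNumPartitions are the real PySpark API):

from pyspark import SparkContext

sc = SparkContext()
rdd = sc.parallelize(range(1000), numSlices=8)  # toy data, explicit partition count
rdd.cache()                      # mark the RDD for in-memory reuse
print rdd.getNumPartitions()     # 8
print rdd.count()                # first action materializes the cache
print rdd.sum()                  # later actions reuse the cached partitions
sc.stop()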
# Variant 2: scrape games from the web, downloading files in parallel on the
# workers. dump, getFiles, and summarize are module-level helpers defined
# elsewhere (a sketch of them follows below).
def run(self):
    sc = SparkContext()
    start_scrape = datetime.now()
    begin, begin_parts = scrape.get_boundary(self.begin)
    end, end_parts = scrape.get_boundary(self.end)
    session = requests.Session()
    print "here"
    all_years_months_days = self.getYearsMonths(self.WEB_ROOT, session)
    games = scrape.get_games(all_years_months_days, session=session)

    gamesRDD = sc.parallelize(games)
    print "fileRDD=", gamesRDD
    gamesRDD.foreach(dump)
    print "# partitions:", gamesRDD.getNumPartitions()
    print "count=", gamesRDD.count()

    # Each worker downloads the files for its games; the reduce step merges
    # the per-game (count, fails) pairs into a single total.
    res = gamesRDD.map(getFiles).reduce(summarize)
    print "res=", res
    count, fails = res
    #files = scrape.get_files(games, session=session)
    #count, fails = scrape.download(files, self.cache)

    end_scrape = datetime.now()
    self.log.info("%d files downloaded in %s", count,
                  str(end_scrape - start_scrape))
    if fails:
        for url in fails:
            self.log.error("failed to download %s", url)
    sc.stop()
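The helpers dump, getFiles, and summarize are not shown in this section. A plausible sketch, inferred from the commented-out scrape.get_files/scrape.download calls and from how res is unpacked; the per-game call shape and the CACHE_DIR constant are assumptions, not the project's actual code:

import requests

def dump(game):
    # debug helper: runs on the workers, so output lands in the executor logs
    print game

def getFiles(game):
    # assumed mapper: download one game's files and report a
    # (downloaded_count, failed_urls) pair for the reduce step
    session = requests.Session()
    files = scrape.get_files([game], session=session)  # assumed per-game variant
    count, fails = scrape.download(files, CACHE_DIR)   # CACHE_DIR is an assumption
    return (count, fails)

def summarize(a, b):
    # reducer: merge two (count, fails) pairs element-wise
    return (a[0] + b[0], a[1] + b[1])

Under this reading, reduce(summarize) folds the per-game pairs into one (total_count, all_failed_urls) tuple, which is exactly what the logging code at the end of run() expects.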