def scrape_date(str_date, cur_date, passive=False): #directory = str_date games_short_name = str_date + '.all.tar.bz2' saved_games_bundle = games_short_name return_code = ERROR if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE): if DEBUG: print 'skipping because exists', str_date, saved_games_bundle, \ 'and not small (size=', os.stat(saved_games_bundle).st_size, ')' return_code = GOOD else: RemoveSmallFileIfExists(saved_games_bundle) if passive: return_code = MISSING elif not download_date(str_date, cur_date, saved_games_bundle): return_code = ERROR else: return_code = DOWNLOADED # Repackage an existing file, if found if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE) and \ not os.path.exists(repackage_filename(saved_games_bundle)): repackage_archive(saved_games_bundle) return_code = REPACKAGED return return_code
def scrape_date(str_date, cur_date, passive=False): #directory = str_date games_short_name = str_date + '.all.tar.bz2' saved_games_bundle = games_short_name return_code = ERROR if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE): if DEBUG: print 'skipping because exists', str_date, saved_games_bundle, \ 'and not small (size=', os.stat(saved_games_bundle).st_size, ')' return_code = GOOD else: RemoveSmallFileIfExists(saved_games_bundle) if passive: return_code = MISSING elif not download_date(str_date, cur_date, saved_games_bundle): return_code = ERROR return_code = DOWNLOADED # Repackage an existing file, if found if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE) and \ not os.path.exists(repackage_filename(saved_games_bundle)): repackage_archive(saved_games_bundle) return_code = REPACKAGED return return_code
# Driver script fragment. Its statements are collapsed onto one line and the
# original indentation is lost, so the nesting described below is the apparent
# one -- NOTE(review): confirm against the properly formatted file.
#
# Steps, in statement order:
#   * Parse the command line with parser.parse_args().
#   * Call utils.ensure_exists('static/scrape_data') and chdir into that
#     directory.
#   * Loop over utils.daterange(datetime.date(2010, 10, 15), today); each
#     cur_date is formatted as YYYYMMDD into str_date.
#   * Dates rejected by utils.includes_day(args, str_date) are reported and
#     skipped with `continue`.
#   * The bundle path is '<str_date>/<str_date>.all.tar.bz2'. When
#     utils.at_least_as_big_as(bundle, SMALL_FILE_SIZE) is true the date is
#     reported (with the size from os.stat) and skipped.
#   * Otherwise the per-date directory is created if absent,
#     RemoveSmallFileIfExists is called on the bundle path, and the bundle is
#     fetched from IsotropicGamesCollectionUrl(cur_date) via
#     MyURLOpener().retrieve(url, bundle). The returned filename/headers are
#     not used in the visible code.
#   * time.sleep(5) follows each retrieval (presumably to throttle requests
#     -- TODO confirm), then the script chdirs into the per-date directory
#     and builds a 'tar -xjvf <bundle>' command string.
#
# The fragment ends after `cmd` is built; how the command is run and whether
# the working directory is restored afterwards is not visible here.
args = parser.parse_args() utils.ensure_exists('static/scrape_data') os.chdir('static/scrape_data') for cur_date in utils.daterange(datetime.date(2010, 10, 15), datetime.date.today()): str_date = time.strftime("%Y%m%d", cur_date.timetuple()) if not utils.includes_day(args, str_date): print 'skipping', str_date, 'because not in cmd line arg daterange' continue directory = str_date print str_date games_short_name = str_date + '.all.tar.bz2' saved_games_bundle = directory + '/' + games_short_name if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE): print 'skipping because exists', str_date, saved_games_bundle, \ 'and not small (size=', os.stat(saved_games_bundle).st_size, ')' continue if not os.path.exists(directory): os.mkdir(directory) RemoveSmallFileIfExists(saved_games_bundle) url = IsotropicGamesCollectionUrl(cur_date) print 'getting', saved_games_bundle, 'at', url filename, headers = MyURLOpener().retrieve(url, saved_games_bundle) time.sleep(5) os.chdir(directory) cmd = 'tar -xjvf ' + games_short_name
# Driver script fragment. Its statements are collapsed onto one line and the
# original indentation is lost, so the nesting described below is the apparent
# one -- NOTE(review): confirm against the properly formatted file.
#
# The leading `os.unlink(fn)` is the tail of a preceding definition whose
# header is not visible in this fragment; it is not part of the driver logic.
#
# Steps, in statement order:
#   * Parse the command line with parser.parse_args().
#   * Loop over utils.daterange(datetime.date(2010, 10, 15), today); each
#     cur_date is formatted as YYYYMMDD into str_date.
#   * Dates rejected by utils.IncludesDay(args, str_date) are reported and
#     skipped with `continue`.
#   * The bundle path is '<str_date>/<str_date>.all.tar.bz2'. When
#     utils.at_least_as_big_as(bundle, SMALL_FILE_SIZE) is true the date is
#     only reported (with the size from os.stat).
#   * In the `else` branch the per-date directory is created if absent and
#     RemoveSmallFileIfExists is called on the bundle path. Candidate URLs are
#     then tried in list order: CouncilroomGamesCollectionUrl(cur_date) first,
#     IsotropicGamesCollectionUrl(cur_date) second.
#   * For each URL the whole response body is read into `contents` with
#     urllib.urlopen(url).read().
#
# The fragment ends inside the URL loop; how `contents` is validated or
# written to the bundle path is not visible here.
os.unlink(fn) args = parser.parse_args() for cur_date in utils.daterange(datetime.date(2010, 10, 15), datetime.date.today()): str_date = time.strftime("%Y%m%d", cur_date.timetuple()) if not utils.IncludesDay(args, str_date): print 'skipping', str_date, 'because not in cmd line arg daterange' continue directory = str_date print str_date games_short_name = str_date + '.all.tar.bz2' saved_games_bundle = directory + '/' + games_short_name if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE): print 'skipping because exists', str_date, saved_games_bundle, \ 'and not small (size=', os.stat(saved_games_bundle).st_size, ')' else: if not os.path.exists(directory): os.mkdir(directory) RemoveSmallFileIfExists(saved_games_bundle) urls_by_priority = [ CouncilroomGamesCollectionUrl(cur_date), IsotropicGamesCollectionUrl(cur_date) ] for url in urls_by_priority: print 'getting', saved_games_bundle, 'at', url contents = urllib.urlopen(url).read()