def go(sTargetDirectory, iMaxFiles):
    """Run the whole pipeline: fetch the archives, then build the 'test' and 'train10k' sets.

    Args:
        sTargetDirectory: working directory; it is created when it does not
            exist yet and is passed on to every pipeline step.
        iMaxFiles: accepted for the caller's sake; this function does not
            read it.
    """
    def build_partition(label, samples):
        # Convert the drawn samples, then merge them into one file for `label`.
        zipsToDats(sTargetDirectory, samples, label)
        createSingleDat(sTargetDirectory, label, samples)

    print('go')
    directory_exists = os.path.isdir(sTargetDirectory)
    if not directory_exists:
        os.makedirs(sTargetDirectory)

    # The return value of get_fileInfos is not used here; the call is kept
    # for whatever work it does on the target directory.
    index_processor.get_fileInfos(sTargetDirectory)
    zip_downloader.downloadFiles(sTargetDirectory)

    # The test partition is drawn and written before the training partition.
    build_partition('test', dataset_partitioner.draw_test_samples(sTargetDirectory))
    build_partition('train10k', dataset_partitioner.draw_training_10k(sTargetDirectory))
def draw_samples(dataDirectory, numSamples):
    """Draw `numSamples` distinct (filename, game index) pairs from the games up to 2014.

    One flat list of (filename, gameindex) pairs is built from the file infos
    reported by `index_processor.get_fileInfos`, restricted to files whose
    year is 2014 or earlier so that the set of games stays fixed.  Samples
    are then drawn without replacement using `random` seeded with 0.

    Args:
        dataDirectory: directory handed to `index_processor.get_fileInfos`.
        numSamples: number of distinct samples to draw.

    Returns:
        A list of (filename, gameindex) tuples.  The list is produced from a
        set, so its order is the set's iteration order, not the draw order.

    Raises:
        ValueError: if `numSamples` is larger than the number of available
            games (the draw loop could otherwise never terminate).
    """
    availableGames = []
    for fileinfo in index_processor.get_fileInfos(dataDirectory):
        filename = fileinfo['filename']
        # The year sits between the first '-' and the following '_'.
        year = int(filename.split('-')[1].split('_')[0])
        if year > 2014:
            continue  # ignore after 2014, to keep the set of games fixed
        availableGames.extend((filename, i) for i in range(fileinfo['numGames']))
    print('total num games: ' + str(len(availableGames)))

    # Without this guard the loop below spins forever once every game has
    # been drawn, or fails inside random.choice when there are no games.
    if numSamples > len(availableGames):
        raise ValueError(
            'cannot draw %s distinct samples from only %s available games'
            % (numSamples, len(availableGames)))

    # Seed first so that the same samples are drawn on every run.
    random.seed(0)
    samplesSet = set()
    while len(samplesSet) < numSamples:
        # set.add ignores duplicates, so repeated draws simply retry.
        samplesSet.add(random.choice(availableGames))
    print('Drawn ' + str(numSamples) + ' samples:')
    return list(samplesSet)
def draw_all_training(dataDirectory):
    """Return every game up to 2014 that is not one of the test samples.

    Args:
        dataDirectory: directory handed to `draw_test_samples` and to
            `index_processor.get_fileInfos`.

    Returns:
        A list of (filename, gameindex) tuples.  The list is produced from a
        set, so its order is the set's iteration order.
    """
    # Hash the test samples once: each exclusion check is then a constant-time
    # lookup instead of a scan of the whole test-sample sequence per game.
    # NOTE(review): assumes the test samples are hashable (filename, index)
    # tuples -- confirm against draw_test_samples.
    test_samples = set(draw_test_samples(dataDirectory))

    availableGames = []
    for fileinfo in index_processor.get_fileInfos(dataDirectory):
        filename = fileinfo['filename']
        # The year sits between the first '-' and the following '_'.
        year = int(filename.split('-')[1].split('_')[0])
        if year > 2014:
            continue  # ignore after 2014, to keep the set of games fixed
        availableGames.extend((filename, i) for i in range(fileinfo['numGames']))
    print('total num games: ' + str(len(availableGames)))

    # Nothing random is drawn below; the reseed is kept so the module-level
    # generator is left in the same state as before this change.
    random.seed(0)

    # Everything that is not a test sample is a training sample.
    samplesSet = set(availableGames) - test_samples
    print('Drawn all samples, ie ' + str(len(samplesSet)) + ' samples:')
    return list(samplesSet)
def draw_samples(dataDirectory, numSamples):
    """Pick `numSamples` unique (filename, game index) pairs, seeded for repeatability.

    The candidate pool is every game of every file reported by
    `index_processor.get_fileInfos` whose filename year is at most 2014;
    later files are ignored to keep the pool static.  Drawing is done with
    `random.choice` after `random.seed(0)`, and duplicates are discarded
    until enough distinct samples have been collected.

    Args:
        dataDirectory: directory handed to `index_processor.get_fileInfos`.
        numSamples: how many distinct samples to return.

    Returns:
        A list of (filename, gameindex) tuples, in set iteration order.

    Raises:
        ValueError: when fewer than `numSamples` games are available, since
            the request can never be satisfied.
    """
    availableGames = []
    fileinfos = index_processor.get_fileInfos(dataDirectory)
    for fileinfo in fileinfos:
        filename = fileinfo['filename']
        # Filenames carry the year after the first '-' and before the next '_'.
        year = int(filename.split('-')[1].split('_')[0])
        if year > 2014:
            continue  # ignore after 2014, to keep the set of games fixed
        numgames = fileinfo['numGames']
        availableGames.extend((filename, i) for i in range(numgames))
    print('total num games: ' + str(len(availableGames)))

    # Asking for more distinct samples than exist would keep the draw loop
    # running forever (or crash in random.choice on an empty pool).
    if numSamples > len(availableGames):
        raise ValueError(
            'cannot draw %s distinct samples from only %s available games'
            % (numSamples, len(availableGames)))

    # need to seed random first
    random.seed(0)
    samplesSet = set()
    while len(samplesSet) < numSamples:
        # Adding an already-drawn sample is a no-op, so the loop just redraws.
        samplesSet.add(random.choice(availableGames))
    print('Drawn ' + str(numSamples) + ' samples:')
    # copy to list
    return list(samplesSet)
def draw_all_training(dataDirectory):
    """Collect all games up to 2014, leaving out the ones reserved for testing.

    Args:
        dataDirectory: directory handed to `draw_test_samples` and to
            `index_processor.get_fileInfos`.

    Returns:
        A list of (filename, gameindex) tuples, in set iteration order.
    """
    # Build a set of the reserved test samples up front so the exclusion
    # below does not rescan the test-sample sequence for every single game.
    # NOTE(review): relies on the test samples being hashable
    # (filename, index) tuples -- confirm against draw_test_samples.
    test_samples = set(draw_test_samples(dataDirectory))

    availableGames = []
    fileinfos = index_processor.get_fileInfos(dataDirectory)
    for fileinfo in fileinfos:
        filename = fileinfo['filename']
        # Filenames carry the year after the first '-' and before the next '_'.
        year = int(filename.split('-')[1].split('_')[0])
        if year > 2014:
            continue  # ignore after 2014, to keep the set of games fixed
        numgames = fileinfo['numGames']
        availableGames.extend((filename, i) for i in range(numgames))
    print('total num games: ' + str(len(availableGames)))

    # No random draw happens in this function; the seed call is retained so
    # the global generator state after the call is the same as it used to be.
    random.seed(0)

    # Skip the test samples; everything else is training data.
    samplesSet = set(availableGames) - test_samples
    print('Drawn all samples, ie ' + str(len(samplesSet)) + ' samples:')
    # copy to list
    return list(samplesSet)
def downloadFiles(sTargetDirectory):
    """Download every indexed file that is not yet present in `sTargetDirectory`.

    The file infos come from `index_processor.get_fileInfos`; each missing
    file is handed to `worker` as a `(url, target_path)` tuple through a pool
    of 16 processes.

    Args:
        sTargetDirectory: directory that is checked for existing files and
            used as the prefix of every download target path.

    Raises:
        SystemExit: with status -1 when interrupted by Ctrl-C, after the
            worker processes have been terminated.
        Exception: whatever surfaces while iterating the pool results is
            re-raised after the worker processes have been terminated.
    """
    fileInfos = index_processor.get_fileInfos(sTargetDirectory)
    urlsToDo = []
    for fileinfo in fileInfos:
        url = fileinfo['url']
        print(url)
        # Build the target path once; it is both tested and queued.
        sTargetPath = sTargetDirectory + '/' + fileinfo['filename']
        if not os.path.isfile(sTargetPath):
            urlsToDo.append((url, sTargetPath))
            print('to do: ' + url + ' ... ')

    if not urlsToDo:
        # Nothing is missing, so there is no point in starting 16 processes.
        return

    pool = multiprocessing.Pool(processes=16)
    try:
        # Drain the iterator so that every download runs and any error
        # raised by a worker surfaces here.
        for _ in pool.imap(worker, urlsToDo):
            pass
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating workers")
        pool.terminate()
        pool.join()
        sys.exit(-1)
    except Exception:
        # Do not leave worker processes behind when a download fails.
        pool.terminate()
        pool.join()
        raise
def draw_training_games(dataDirectory):
    """Return all non-test games that are no later than Dec 2014.

    Every (filename, gameindex) pair of every file reported by
    `index_processor.get_fileInfos` with a filename year of at most 2014 is
    kept unless it is contained in the module-level `testGames`.

    Args:
        dataDirectory: directory handed to `index_processor.get_fileInfos`.

    Returns:
        A list of (filename, gameindex) tuples, in file-info order.  Earlier
        versions built this list but returned None.
    """
    global testGames
    train_games = []
    for fileinfo in index_processor.get_fileInfos(dataDirectory):
        filename = fileinfo['filename']
        # The year sits between the first '-' and the following '_'.
        year = int(filename.split('-')[1].split('_')[0])
        if year > 2014:
            continue  # ignore after 2014, to keep the set of games fixed
        for i in range(fileinfo['numGames']):
            sample = (filename, i)
            if sample not in testGames:
                train_games.append(sample)
    print('total num training games: ' + str(len(train_games)))
    # Hand the list back to the caller instead of discarding it.
    return train_games
def draw_training_games(dataDirectory):
    """List the games up to Dec 2014 that are not part of the test games.

    The candidates are the (filename, gameindex) pairs of all files returned
    by `index_processor.get_fileInfos` whose filename year does not exceed
    2014; pairs found in the module-level `testGames` are left out.

    Args:
        dataDirectory: directory handed to `index_processor.get_fileInfos`.

    Returns:
        A list of (filename, gameindex) tuples, ordered as the file infos
        are.  The list used to be computed and then dropped (None returned).
    """
    global testGames
    train_games = []
    fileinfos = index_processor.get_fileInfos(dataDirectory)
    for fileinfo in fileinfos:
        filename = fileinfo['filename']
        # Filenames carry the year after the first '-' and before the next '_'.
        year = int(filename.split('-')[1].split('_')[0])
        if year > 2014:
            continue  # ignore after 2014, to keep the set of games fixed
        numgames = fileinfo['numGames']
        for i in range(numgames):
            sample = (filename, i)
            if sample in testGames:
                continue  # reserved for testing
            train_games.append(sample)
    print('total num training games: ' + str(len(train_games)))
    # The computed list is the result of this function; return it.
    return train_games
def downloadFiles(sTargetDirectory):
    """Fetch all indexed files that are missing from `sTargetDirectory`.

    For each file info from `index_processor.get_fileInfos` whose target file
    does not exist yet, a `(url, target_path)` tuple is queued; the queue is
    then processed by `worker` in a pool of 16 processes.

    Args:
        sTargetDirectory: directory that is searched for already-downloaded
            files and that prefixes every target path.

    Raises:
        SystemExit: with status -1 after a KeyboardInterrupt; the workers are
            terminated first.
        Exception: any other error raised while consuming the pool results
            is propagated once the workers have been terminated.
    """
    fileInfos = index_processor.get_fileInfos(sTargetDirectory)
    urlsToDo = []
    for fileinfo in fileInfos:
        url = fileinfo['url']
        print(url)
        sFilename = fileinfo['filename']
        # Compute the destination once and reuse it for the check and the job.
        sDestination = sTargetDirectory + '/' + sFilename
        if os.path.isfile(sDestination):
            continue  # already downloaded
        urlsToDo.append((url, sDestination))
        print('to do: ' + url + ' ... ')

    if not urlsToDo:
        # Every file is already present: avoid spawning an idle pool.
        return

    pool = multiprocessing.Pool(processes=16)
    try:
        # Consuming the iterator waits for each job and re-raises worker errors.
        for _ in pool.imap(worker, urlsToDo):
            pass
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating workers")
        pool.terminate()
        pool.join()
        sys.exit(-1)
    except Exception:
        # A failed job must not leave the other worker processes running.
        pool.terminate()
        pool.join()
        raise