def go(sTargetDirectory, iMaxFiles):
    """Run the whole preparation pipeline inside ``sTargetDirectory``.

    Creates the directory when it is missing, asks ``index_processor`` for
    the file infos, lets ``zip_downloader`` download the files, and then
    builds the 'test' set followed by the 'train10k' set.

    ``iMaxFiles`` is part of the signature but is not used by this function.
    """
    print('go')
    if not os.path.isdir(sTargetDirectory):
        os.makedirs(sTargetDirectory)
    # The return value is discarded here; presumably the call is made for its
    # side effect of fetching/caching the index -- TODO confirm.
    index_processor.get_fileInfos(sTargetDirectory)
    zip_downloader.downloadFiles(sTargetDirectory)

    def buildSet(sSetName, samples):
        # Every set goes through the same two steps, with the same samples
        # and set name: zipsToDats first, then createSingleDat.
        zipsToDats(sTargetDirectory, samples, sSetName)
        createSingleDat(sTargetDirectory, sSetName, samples)

    # Samples for a set are drawn only once the previous set is finished.
    buildSet('test', dataset_partitioner.draw_test_samples(sTargetDirectory))
    buildSet('train10k', dataset_partitioner.draw_training_10k(sTargetDirectory))
def draw_samples(dataDirectory, numSamples):
    """Draw ``numSamples`` distinct ``(filename, gameIndex)`` pairs at random.

    The pool is every game index ``0 .. numGames - 1`` of every file info
    returned by ``index_processor.get_fileInfos(dataDirectory)`` whose
    filename carries a year of 2014 or earlier; later files are ignored so
    the set of candidate games stays fixed.

    The global ``random`` generator is re-seeded with 0 before drawing, and
    the samples are returned in the order they were drawn, so the result
    does not depend on set iteration order (which varies with string hash
    randomisation).

    Raises:
        ValueError: if ``numSamples`` is larger than the number of distinct
            games available. The draw is without replacement, so it could
            otherwise never finish.
    """
    availableGames = []
    for fileinfo in index_processor.get_fileInfos(dataDirectory):
        filename = fileinfo['filename']
        # The year is the second '-'-separated field, up to its first '_'.
        year = int(filename.split('-')[1].split('_')[0])
        if year > 2014:
            continue  # ignore after 2014, to keep the set of games fixed
        availableGames.extend((filename, i) for i in range(fileinfo['numGames']))
    print('total num games: ' + str(len(availableGames)))

    # Count distinct games: a duplicated index entry must not make the
    # request look satisfiable when it is not.
    numDistinct = len(set(availableGames))
    if numSamples > numDistinct:
        raise ValueError(
            'cannot draw ' + str(numSamples) + ' distinct samples from '
            + str(numDistinct) + ' available games')

    # need to seed random first, so the same games are drawn on every run
    random.seed(0)

    samples = []
    alreadyDrawn = set()
    while len(samples) < numSamples:
        sample = random.choice(availableGames)
        if sample not in alreadyDrawn:
            alreadyDrawn.add(sample)
            samples.append(sample)
    print('Drawn ' + str(numSamples) + ' samples:')
    return samples
def draw_all_training(dataDirectory):
    """Return every available game that is not one of the test samples.

    The test samples come from ``draw_test_samples(dataDirectory)``. The
    candidates are every game index ``0 .. numGames - 1`` of every file info
    returned by ``index_processor.get_fileInfos(dataDirectory)`` whose
    filename carries a year of 2014 or earlier.

    Returns a list of distinct ``(filename, gameIndex)`` tuples in the order
    they appear in the index, so the result is the same on every run.
    """
    test_samples = draw_test_samples(dataDirectory)

    availableGames = []
    for fileinfo in index_processor.get_fileInfos(dataDirectory):
        filename = fileinfo['filename']
        # The year is the second '-'-separated field, up to its first '_'.
        year = int(filename.split('-')[1].split('_')[0])
        if year > 2014:
            continue  # ignore after 2014, to keep the set of games fixed
        availableGames.extend((filename, i) for i in range(fileinfo['numGames']))
    print('total num games: ' + str(len(availableGames)))

    # Nothing random is drawn below; the re-seed is kept only so the global
    # generator is left in the same state as before for later callers.
    random.seed(0)

    # One set does both jobs: it starts out holding the test samples (so they
    # are skipped with an O(1) lookup) and then collects what has been emitted
    # (so a duplicated index entry is not returned twice).
    excluded = set(test_samples)
    samples = []
    for sample in availableGames:
        if sample not in excluded:
            excluded.add(sample)
            samples.append(sample)
    print('Drawn all samples, ie ' + str(len(samples)) + ' samples:')
    return samples
def draw_samples(dataDirectory, numSamples):
    """Pick ``numSamples`` distinct games at random, reproducibly.

    Candidates are the ``(filename, gameIndex)`` pairs for every game index
    ``0 .. numGames - 1`` of each file info from
    ``index_processor.get_fileInfos(dataDirectory)`` whose filename carries
    a year no later than 2014, which keeps the candidate set static.

    The global ``random`` generator is seeded with 0 first. Samples are
    returned in draw order rather than via a set, because set iteration
    order changes with string hash randomisation and would make the output
    differ between runs.

    Raises:
        ValueError: if fewer than ``numSamples`` distinct games exist;
            sampling without replacement would otherwise loop forever.
    """
    availableGames = []
    for fileinfo in index_processor.get_fileInfos(dataDirectory):
        filename = fileinfo['filename']
        # Year = second '-'-separated field of the name, cut at its first '_'.
        year = int(filename.split('-')[1].split('_')[0])
        if year > 2014:
            continue  # ignore after 2014, to keep the set of games fixed
        for gameIndex in range(fileinfo['numGames']):
            availableGames.append((filename, gameIndex))
    print('total num games: ' + str(len(availableGames)))

    # Compare against the distinct count so that repeated index entries
    # cannot hide an unsatisfiable request.
    distinctCount = len(set(availableGames))
    if numSamples > distinctCount:
        raise ValueError(
            'requested ' + str(numSamples) + ' samples but only '
            + str(distinctCount) + ' distinct games are available')

    # need to seed random first
    random.seed(0)

    drawn = set()
    samples = []
    while len(samples) < numSamples:
        candidate = random.choice(availableGames)
        if candidate in drawn:
            continue  # without replacement: redraw on a repeat
        drawn.add(candidate)
        samples.append(candidate)
    print('Drawn ' + str(numSamples) + ' samples:')
    return samples
def draw_all_training(dataDirectory):
    """Collect all games up to 2014 that were not drawn as test samples.

    Calls ``draw_test_samples(dataDirectory)`` for the games to leave out,
    then walks the file infos from
    ``index_processor.get_fileInfos(dataDirectory)``, skipping files whose
    filename carries a year after 2014.

    Returns a list of distinct ``(filename, gameIndex)`` tuples in index
    order. The order is deterministic, unlike a list built from a set of
    tuples containing strings.
    """
    test_samples = draw_test_samples(dataDirectory)

    availableGames = []
    for fileinfo in index_processor.get_fileInfos(dataDirectory):
        filename = fileinfo['filename']
        # Year = second '-'-separated field of the name, cut at its first '_'.
        year = int(filename.split('-')[1].split('_')[0])
        if year > 2014:
            continue  # ignore after 2014, to keep the set of games fixed
        for gameIndex in range(fileinfo['numGames']):
            availableGames.append((filename, gameIndex))
    print('total num games: ' + str(len(availableGames)))

    # No random draw happens in this function; the seed call is retained so
    # the global generator state after the call is unchanged for callers.
    random.seed(0)

    # Set lookup instead of scanning the test-sample list for every game.
    # Emitted samples are added as well, which removes duplicates.
    skip = set(test_samples)
    samples = []
    for sample in availableGames:
        if sample in skip:
            continue
        skip.add(sample)
        samples.append(sample)
    print('Drawn all samples, ie ' + str(len(samples)) + ' samples:')
    return samples
def downloadFiles(sTargetDirectory):
    """Download every indexed file that is not yet in ``sTargetDirectory``.

    For each file info from ``index_processor.get_fileInfos`` the url is
    printed, and a ``(url, targetPath)`` pair is queued when ``targetPath``
    is not an existing file. The queued pairs are handed to ``worker`` on a
    pool of 16 processes.

    On ``KeyboardInterrupt`` the pool is terminated and the process exits
    with status -1. On any other exception the pool is terminated as well,
    so no worker processes are left behind, and the exception is re-raised.
    """
    urlsToDo = []
    for fileinfo in index_processor.get_fileInfos(sTargetDirectory):
        url = fileinfo['url']
        print(url)
        sTargetPath = os.path.join(sTargetDirectory, fileinfo['filename'])
        if not os.path.isfile(sTargetPath):
            urlsToDo.append((url, sTargetPath))
            print('to do: ' + url + ' ... ')

    if not urlsToDo:
        return  # everything is already on disk; no need to spawn workers

    pool = multiprocessing.Pool(processes=16)
    try:
        # Consuming the iterator is what waits for the workers and surfaces
        # any exception raised inside one of them.
        for _ in pool.imap(worker, urlsToDo):
            pass
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating workers")
        pool.terminate()
        pool.join()
        sys.exit(-1)
    except Exception:
        # A failed download must not leave the remaining workers running.
        pool.terminate()
        pool.join()
        raise
def draw_training_games(dataDirectory):
    """Return all non-test games that are no later than Dec 2014.

    Walks the file infos from ``index_processor.get_fileInfos(dataDirectory)``,
    skips files whose filename carries a year after 2014, and collects each
    ``(filename, gameIndex)`` pair that is not in the module-level
    ``testGames``.

    Returns the list of training games. Previously the list was built and
    then discarded, so callers received ``None``.
    """
    # testGames is only read, so no ``global`` declaration is needed. It is
    # copied into a set because it is consulted once per game.
    testGameSet = set(testGames)
    train_games = []
    for fileinfo in index_processor.get_fileInfos(dataDirectory):
        filename = fileinfo['filename']
        # The year is the second '-'-separated field, up to its first '_'.
        year = int(filename.split('-')[1].split('_')[0])
        if year > 2014:
            continue  # ignore after 2014, to keep the set of games fixed
        for i in range(fileinfo['numGames']):
            sample = (filename, i)
            if sample not in testGameSet:
                train_games.append(sample)
    print('total num training games: ' + str(len(train_games)))
    return train_games
def draw_training_games(dataDirectory):
    """List every game up to Dec 2014 that is not a test game.

    Each file info from ``index_processor.get_fileInfos(dataDirectory)``
    contributes the pairs ``(filename, 0) .. (filename, numGames - 1)``,
    unless its filename carries a year after 2014. Pairs found in the
    module-level ``testGames`` are left out.

    Returns the resulting list. The original code computed it but had no
    ``return`` statement, so the result was lost.
    """
    # Read-only use of the module-level testGames; a set gives O(1) lookups
    # for the per-game membership test below.
    excluded = set(testGames)
    train_games = []
    for fileinfo in index_processor.get_fileInfos(dataDirectory):
        filename = fileinfo['filename']
        # Year = second '-'-separated field of the name, cut at its first '_'.
        year = int(filename.split('-')[1].split('_')[0])
        if year > 2014:
            continue  # ignore after 2014, to keep the set of games fixed
        train_games.extend(
            (filename, gameIndex)
            for gameIndex in range(fileinfo['numGames'])
            if (filename, gameIndex) not in excluded
        )
    print('total num training games: ' + str(len(train_games)))
    return train_games
def downloadFiles(sTargetDirectory):
    """Fetch the indexed files that are missing from ``sTargetDirectory``.

    Prints the url of every file info from ``index_processor.get_fileInfos``
    and queues ``(url, targetPath)`` for each one whose ``targetPath`` is not
    an existing file. The queue is processed by ``worker`` on a
    16-process pool.

    ``KeyboardInterrupt`` terminates the pool and exits the process with
    status -1. Any other exception also terminates the pool, instead of
    leaking its worker processes, and is then re-raised.
    """
    fileInfos = index_processor.get_fileInfos(sTargetDirectory)
    urlsToDo = []
    for fileinfo in fileInfos:
        url = fileinfo['url']
        print(url)
        sFilePath = os.path.join(sTargetDirectory, fileinfo['filename'])
        if os.path.isfile(sFilePath):
            continue  # already downloaded
        urlsToDo.append((url, sFilePath))
        print('to do: ' + url + ' ... ')

    if not urlsToDo:
        # Nothing to fetch, so do not start 16 idle processes.
        return

    pool = multiprocessing.Pool(processes=16)
    try:
        # Drain the result iterator: this blocks until every download is
        # done and re-raises anything a worker raised.
        for _ in pool.imap(worker, urlsToDo):
            pass
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating workers")
        pool.terminate()
        pool.join()
        sys.exit(-1)
    except Exception:
        # Clean up the pool before letting the failure propagate.
        pool.terminate()
        pool.join()
        raise