def main():
    parser = argparse.ArgumentParser(description='Train a NN predictor from config', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('config', help='config file for model / training')
    args = parser.parse_args()
    tstart = time.time()
    with open(args.config) as f:
        config = yaml.safe_load(f.read())
    collection_name = os.path.basename(os.path.dirname(args.config)).replace('configs_', '')
    name = os.path.basename(args.config).split('.')[0]
    outputDir = os.path.join('models', collection_name, name)
    os.makedirs(outputDir, exist_ok=True)
    tensorboard_writer = maia_chess_backend.torch.TB_wrapper(name, log_dir=os.path.join('runs', collection_name))
    with torch.cuda.device(config['device']):
        maia_chess_backend.printWithDate(f"Loading model: {config['model']}")
        net = maia_chess_backend.torch.NetFromConfigNew(config['model'])
        train_loader, test_loader, val_loader = setupLoaders(config)
        try:
            train_loop(net, config, train_loader, test_loader, val_loader, tensorboard_writer, outputDir)
        except KeyboardInterrupt:
            net.save(os.path.join(outputDir, "net-final.pt"))
        maia_chess_backend.printWithDate(f"Done everything in {humanize.naturaldelta(time.time() - tstart)}, exiting")
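# Illustrative sketch (not from the repo): a minimal config of the shape main()
# and train_loop() read. Keys are inferred from the lookups in this file; all
# values are placeholders, and anything not referenced here is an assumption.
EXAMPLE_TRAIN_CONFIG = {
    'device': 0,                       # CUDA device index used by torch.cuda.device
    'model': {},                       # passed verbatim to NetFromConfigNew
    'training': {
        'lr_intial': 0.001,            # note: key is spelled 'lr_intial' in train_loop
        'lr_steps': [100000, 200000],  # MultiStepLR milestones
        'lr_gamma': 0.1,               # MultiStepLR decay factor
        'total_steps': 300000,
        'test_steps': 5000,            # how often to run step_test / save_results
        'validate_steps': 10000,       # how often to run step_validate
        'test_size': 64,               # batches per test pass
        'batch_size': 256,
        'auto_stop': 10,               # optional: stop after this many tests without improvement
    },
}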
def writerWorker(outputFile, inputQueue, num_readers, name):
    i = -1
    num_kill_remaining = num_readers
    tstart = time.time()
    maia_chess_backend.printWithDate("Writer created")
    with bz2.open(outputFile, 'wb') as f:
        maia_chess_backend.printWithDate(f"Created: {outputFile}")
        f.write((','.join(maia_chess_backend.full_csv_header) + '\n').encode('utf8'))
        tLast = time.time()
        while True:
            try:
                dat = inputQueue.get()
            except queue.Empty:
                # Should never happen: get() blocks with no timeout
                break
            try:
                f.write(dat)
            except TypeError:
                if dat == 'kill':
                    num_kill_remaining -= 1
                    if num_kill_remaining <= 0:
                        break
                else:
                    raise
            else:
                i += 1
                if i % 1000 == 0 and time.time() - tLast > logging_delay:
                    tLast = time.time()
                    maia_chess_backend.printWithDate(f"{name} Written {i} games in {humanize.naturaldelta(time.time() - tstart)}, doing {(i + 1) / (time.time() - tstart):.0f} games a second", flush=True)
    maia_chess_backend.printWithDate("Received shutdown signal to writer")
    maia_chess_backend.printWithDate(f"Done a total of {i} games in {humanize.naturaldelta(time.time() - tstart)}")
def main():
    parser = argparse.ArgumentParser(description='Make mmapped version of csv', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('inputs', nargs='+', help='input csv')
    parser.add_argument('outputDir', help='output dir of mmapped files')
    parser.add_argument('--nrows', type=int, help='number of rows to read in, FOR TESTING', default=None)
    parser.add_argument('--min_elo', type=int, help='min active elo', default=1000)
    parser.add_argument('--max_elo', type=int, help='max active elo', default=4000)
    # type=bool is an argparse pitfall (any non-empty string is True), so these are flags
    parser.add_argument('--allow_negative_loss', action='store_true', default=False, help='allow winrate losses below 0')
    parser.add_argument('--allow_low_time', action='store_true', default=False, help='include low time moves')
    parser.add_argument('--min_ply', type=int, help='min move ply to consider', default=6)
    parser.add_argument('--nb_to_b_ratio', type=float, help='ratio of non blunders to blunders in dataset', default=1.5)
    #parser.add_argument('split_column', help='what to split the csvs on, i.e. is_blunder')
    #parser.add_argument('y_vals', nargs='+', help='columns to treat as y vals')
    args = parser.parse_args()
    maia_chess_backend.printWithDate(f"Starting mmap of {', '.join(args.inputs)} writing to {args.outputDir} with {', '.join(mmap_columns)}")
    with multiprocessing.Pool(32) as pool:
        pool.starmap(run_path, [(p, args) for p in args.inputs])
    maia_chess_backend.printWithDate("Done")
def run_path(path, args):
    # Helper for multiprocessing
    try:
        mmap_csv(
            path,
            load_csv(path, args.nrows),
            args.outputDir,
            args,
        )
    except EOFError:
        maia_chess_backend.printWithDate(f"EOF error in: {path}")
def gamesConverter(inputQueue, outputQueue, allow_non_sf):
    #maia_chess_backend.printWithDate("Converter created")
    while True:
        try:
            #print('qsize', inputQueue.qsize())
            dat = inputQueue.get()
        except queue.Empty:
            break
        if dat == 'kill':
            outputQueue.put('kill', True, 1000)
            break
        else:
            try:
                s = maia_chess_backend.gameToCSVlines(dat, allow_non_sf=allow_non_sf)
            except maia_chess_backend.NoStockfishEvals:
                pass
            except:
                maia_chess_backend.printWithDate('error:')
                maia_chess_backend.printWithDate(dat)
                maia_chess_backend.printWithDate(traceback.format_exc())
                raise
            else:
                if len(s) > 0:
                    lines = '\n'.join(s) + '\n'
                    outputQueue.put(lines.encode('utf8'), True, 1000)
    maia_chess_backend.printWithDate("Received shutdown signal to Converter", flush=True)
def main():
    parser = argparse.ArgumentParser(description='Process PGN file with stockfish annotations into a csv file', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help='input PGNs')
    parser.add_argument('outputDir', help='output CSVs dir')
    parser.add_argument('--pool', type=int, help='number of simultaneous jobs running per file', default=30)
    parser.add_argument('--allow_non_sf', help='allow games with no stockfish info', default=False, action="store_true")
    #parser.add_argument('--debug', help='DEBUG MODE', default=False, action="store_true")
    #parser.add_argument('--readers', type=int, help='number of simultaneous readers running per input file', default=24)
    parser.add_argument('--queueSize', type=int, help='max number of games to cache', default=1000)
    args = parser.parse_args()
    maia_chess_backend.printWithDate(f"Starting CSV conversion of {args.input} writing to {args.outputDir}")
    os.makedirs(args.outputDir, exist_ok=True)
    name = os.path.basename(args.input).split('.')[0]
    outputName = os.path.join(args.outputDir, f"{name}.csv.bz2")
    #names[n] = (name, outputName)
    maia_chess_backend.printWithDate(f"Loading file: {name}")
    maia_chess_backend.printWithDate("Starting main loop")
    tstart = time.time()
    with multiprocessing.Manager() as manager:
        with multiprocessing.Pool(args.pool) as workers_pool, multiprocessing.Pool(3) as io_pool:
            pgnReader, gameReaders, writer, unproccessedQueue, resultsQueue = processPGN(args.input, name, outputName, args.queueSize, args.pool, args.allow_non_sf, manager, workers_pool, io_pool)
            maia_chess_backend.printWithDate(f"Done loading Queues in {humanize.naturaldelta(time.time() - tstart)}, waiting for reading to finish")
            cleanup(pgnReader, gameReaders, writer)
def readerWorker(inputPath, unproccessedQueue, resultsQueue, name, num_readers):
    tstart = time.time()
    gamesFile = maia_chess_backend.LightGamesFile(inputPath, just_games=True)
    try:
        tLast = time.time()
        for i, (_, gs) in enumerate(gamesFile):
            unproccessedQueue.put(gs, True, 1000)
            if i % 1000 == 0 and time.time() - tLast > logging_delay:
                tLast = time.time()
                maia_chess_backend.printWithDate(f"{name} Loaded {i} games, input queue depth: {unproccessedQueue.qsize()}, output queue depth: {resultsQueue.qsize()}", flush=True)
    except (EOFError, StopIteration):
        pass
    maia_chess_backend.printWithDate(f"{name} Done loading Queue in {humanize.naturaldelta(time.time() - tstart)}, sending kills")
    # One 'kill' sentinel per converter so every worker shuts down
    for i in range(num_readers):
        #maia_chess_backend.printWithDate(f"Putting kill number {i} in queue")
        unproccessedQueue.put('kill', True, 100)
def processPGN(gamesPath, inputName, outputName, queueSize, poolSize, allow_non_sf, manager, workers_pool, io_pool):
    unproccessedQueue = manager.Queue(queueSize)
    resultsQueue = manager.Queue(queueSize)
    readers = []
    for _ in range(poolSize - 1):
        reader = workers_pool.apply_async(gamesConverter, (unproccessedQueue, resultsQueue, allow_non_sf))
        readers.append(reader)
    maia_chess_backend.printWithDate(f"{inputName} Started {len(readers)} readers", flush=True)
    pgnReader = io_pool.apply_async(readerWorker, (gamesPath, unproccessedQueue, resultsQueue, inputName, len(readers)))
    maia_chess_backend.printWithDate(f"{inputName} loader created")
    writer = io_pool.apply_async(writerWorker, (outputName, resultsQueue, len(readers), inputName))
    maia_chess_backend.printWithDate(f"{inputName} Started writer for: {inputName}", flush=True)
    return pgnReader, readers, writer, unproccessedQueue, resultsQueue
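# Minimal, self-contained sketch (not part of the repo) of the shutdown protocol
# the pipeline above relies on: readerWorker enqueues one 'kill' sentinel per
# converter, each converter forwards one 'kill' downstream, and the writer exits
# once it has counted a sentinel from every converter. All names here are
# hypothetical demo stand-ins.
import multiprocessing

def _demo_converter(in_q, out_q):
    while True:
        dat = in_q.get()
        if dat == 'kill':
            out_q.put('kill')   # forward the sentinel downstream
            break
        out_q.put(dat.upper())  # stand-in for gameToCSVlines

def _demo_shutdown_protocol(num_converters=2):
    with multiprocessing.Manager() as manager:
        in_q, out_q = manager.Queue(), manager.Queue()
        procs = [multiprocessing.Process(target=_demo_converter, args=(in_q, out_q))
                 for _ in range(num_converters)]
        for p in procs:
            p.start()
        for game in ['e4 e5', 'd4 d5']:
            in_q.put(game)
        for _ in range(num_converters):
            in_q.put('kill')    # one sentinel per converter, as in readerWorker
        remaining = num_converters
        while remaining > 0:    # the writer's exit condition
            dat = out_q.get()
            if dat == 'kill':
                remaining -= 1
            else:
                print(dat)
        for p in procs:
            p.join()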
def make_df_mmaps(df, name, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    mmaps = {}
    maia_chess_backend.printWithDate(f"Making y_vals mmaps for: {name}", flush=True)
    for y_name in mmap_columns:
        make_var_mmap(y_name, output_dir, mmaps, df)
        #print(y_name, end=' ', flush=True)
    make_game_id_mmap(output_dir, mmaps, df)
    maia_chess_backend.printWithDate(f"Making move array mmaps for: {name}", flush=True)
    make_move_mmap(output_dir, mmaps, df)
    maia_chess_backend.printWithDate(f"Making boards array mmaps for: {name}", flush=True)
    make_board_mmap(output_dir, mmaps, df)
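# Illustrative sketch (not the repo's implementation): one plausible shape for a
# helper like make_var_mmap, written with numpy.memmap. Only the signature
# matches the call site above; the file naming scheme and dtype handling are
# assumptions.
import os
import numpy as np

def make_var_mmap_sketch(y_name, output_dir, mmaps, df):
    values = df[y_name].to_numpy()
    mm = np.memmap(os.path.join(output_dir, f"{y_name}.mm"),  # hypothetical file name
                   dtype=values.dtype, mode='w+', shape=values.shape)
    mm[:] = values[:]  # copy the column into the on-disk buffer
    mm.flush()
    mmaps[y_name] = mm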
def cleanup(pgnReaders, gameReaders, writers):
    #time.sleep(10)
    # Poll each converter with a short timeout until all of them have finished
    while len(gameReaders) > 0:
        for i in range(len(gameReaders)):
            #maia_chess_backend.printWithDate(f"Checking {i} of {len(gameReaders)}", flush=True)
            try:
                gameReaders[i].get(1)
            except multiprocessing.TimeoutError:
                pass
            else:
                del gameReaders[i]
                break
    maia_chess_backend.printWithDate("Done processing")
    pgnReaders.get()
    maia_chess_backend.printWithDate("Done reading")
    writers.get()
    maia_chess_backend.printWithDate("Done cleanup")
def get_latest_chunks(path):
    chunks = []
    maia_chess_backend.printWithDate(f"found {glob.glob(path)} chunk dirs")
    for d in glob.glob(path):
        maia_chess_backend.printWithDate(f"found {len(chunks)} chunks", end='\r')
        chunks += glob.glob(os.path.join(d, '*.gz'))
    maia_chess_backend.printWithDate(f"found {len(chunks)} chunks total")
    if len(chunks) < 10:
        print("Not enough chunks {}".format(len(chunks)))
        sys.exit(1)
    if len(chunks) < 1000:
        print("There are not very many chunks so results may be unstable")
    print("sorting {} chunks...".format(len(chunks)), end='')
    chunks.sort(key=os.path.getmtime, reverse=True)
    print("[done]")
    print("{} - {}".format(os.path.basename(chunks[-1]), os.path.basename(chunks[0])))
    random.shuffle(chunks)
    return chunks
def train_loop(net, config, train_loader, test_loader, val_loader, tensorboard_writer, outputDir):
    maia_chess_backend.printWithDate("Starting training loop")
    if torch.cuda.is_available():
        net.cuda()
    lastFewAcs = []
    optimizer = torch.optim.Adam(
        net.parameters(),
        lr=config['training']['lr_intial'],  # key spelling matches the config files
        #momentum = 0.9,
        weight_decay=0.0001,
        betas=(0.9, 0.999),
        eps=1e-8,
        #nesterov = True,
    )
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=config['training']['lr_steps'],
        gamma=config['training']['lr_gamma'],
    )
    loss_reg = torch.nn.MSELoss(reduction='mean')
    loss_class = torch.nn.CrossEntropyLoss(ignore_index=-1)
    epoch_losses = {'count': 0}
    step_durations = []
    tstart = time.time()
    tests = 0
    last_save = time.time()
    for step in range(config['training']['total_steps']):
        delta_t = step_train(net, train_loader, optimizer, loss_reg, loss_class, epoch_losses)
        step_durations.append(delta_t)
        if step % 100 == 0:
            i = step % config['training']['test_steps']
            maia_chess_backend.printWithDate(f"Step {step} {i / config['training']['test_steps'] * 100:02.0f}% {make_info_str(epoch_losses)} {(i + 1) / (time.time() - tstart):03.2f} steps/second", end='\r')
        if step > 0 and step % config['training']['validate_steps'] == 0:
            val_results = step_validate(net, val_loader, config['training']['test_size'] * 10)
        else:
            val_results = None
        if step > 0 and step % config['training']['test_steps'] == 0:
            tests += 1
            maia_chess_backend.printWithDate(f"Training step {step} losses: {make_info_str(epoch_losses)}" + ' ' * 10)
            test_losses, accuracies = step_test(net, test_loader, loss_reg, loss_class, config['training']['test_size'])
            maia_chess_backend.printWithDate(f"Testing {tests} step {step} losses: {make_info_str(test_losses)} accuracy: {make_info_str(accuracies)}")
            last_save, batch_acc = save_results(step, tests, last_save, net, tensorboard_writer, epoch_losses, test_losses, accuracies, val_results, optimizer, step_durations, config['training']['batch_size'], train_loader.num_blunders, train_loader.num_nonblunders, outputDir)
            if tests == 1:
                # Log the model graph to TensorBoard once, on the first test pass
                t_x, t_y = next(train_loader)
                if net.has_extras:
                    tensorboard_writer.add_graph(net, input_to_model=(t_x, t_y))
                else:
                    tensorboard_writer.add_graph(net, input_to_model=t_x)
            if config['training'].get('auto_stop', None) is not None and batch_acc is not None:
                lastFewAcs.append(batch_acc)
                # Stop early if accuracy has not improved in the last auto_stop tests
                if len(lastFewAcs) - np.argmax(lastFewAcs) > config['training']['auto_stop'] - 1:
                    break
            epoch_losses = {'count': 0}
            step_durations = []
            tstart = time.time()
        scheduler.step()
    test_losses, accuracies = step_test(net, test_loader, loss_reg, loss_class, config['training']['test_size'])
    val_results = step_validate(net, val_loader, config['training']['test_size'] * 10)
    net.save(os.path.join(outputDir, f"net-final-{step}.pt"))
    last_save, batch_acc = save_results(step, tests, last_save, net, tensorboard_writer, epoch_losses, test_losses, accuracies, val_results, optimizer, step_durations, config['training']['batch_size'], train_loader.num_blunders, train_loader.num_nonblunders, outputDir)
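# Small runnable demo (not from the repo) of how the MultiStepLR schedule used
# in train_loop behaves: the learning rate is multiplied by lr_gamma each time
# the scheduler's step counter passes a milestone. All values are placeholders.
import torch

def _demo_lr_schedule():
    params = [torch.nn.Parameter(torch.zeros(1))]
    opt = torch.optim.Adam(params, lr=0.01)
    sched = torch.optim.lr_scheduler.MultiStepLR(opt, milestones=[3, 6], gamma=0.1)
    for step in range(8):
        opt.step()
        sched.step()
        print(step, opt.param_groups[0]['lr'])
        # lr stays 0.01 until the counter reaches milestone 3, drops to 0.001,
        # then to 0.0001 once it reaches milestone 6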
def main(config_path, name, collection_name):
    output_name = os.path.join('models', collection_name, name + '.txt')
    with open(config_path) as f:
        cfg = yaml.safe_load(f.read())
    maia_chess_backend.printWithDate(yaml.dump(cfg, default_flow_style=False))
    experimental_parser = cfg['dataset'].get('experimental_v4_only_dataset', False)
    train_chunks = get_latest_chunks(cfg['dataset']['input_train'])
    test_chunks = get_latest_chunks(cfg['dataset']['input_test'])
    shuffle_size = cfg['training']['shuffle_size']
    total_batch_size = cfg['training']['batch_size']
    batch_splits = cfg['training'].get('num_batch_splits', 1)
    if total_batch_size % batch_splits != 0:
        raise ValueError('num_batch_splits must divide batch_size evenly')
    split_batch_size = total_batch_size // batch_splits
    # Load data with the split batch size; tfprocess recombines the splits into the total batch size.
    maia_chess_backend.maia.ChunkParser.BATCH_SIZE = split_batch_size
    root_dir = os.path.join('models', collection_name, name)
    if not os.path.exists(root_dir):
        os.makedirs(root_dir)
    tfprocess = maia_chess_backend.maia.TFProcess(cfg, name, collection_name)
    if experimental_parser:
        train_dataset = tf.data.Dataset.from_tensor_slices(train_chunks).shuffle(len(train_chunks)).repeat() \
            .interleave(lambda x: tf.data.FixedLengthRecordDataset(x, 8292, compression_type='GZIP', num_parallel_reads=1).filter(sample), num_parallel_calls=tf.data.experimental.AUTOTUNE) \
            .shuffle(shuffle_size) \
            .batch(split_batch_size).map(extract_inputs_outputs).prefetch(4)
    else:
        train_parser = maia_chess_backend.maia.ChunkParser(
            FileDataSrc(train_chunks.copy()),
            shuffle_size=shuffle_size,
            sample=SKIP,
            batch_size=maia_chess_backend.maia.ChunkParser.BATCH_SIZE)
        train_dataset = tf.data.Dataset.from_generator(
            train_parser.parse,
            output_types=(tf.string, tf.string, tf.string, tf.string))
        train_dataset = train_dataset.map(maia_chess_backend.maia.ChunkParser.parse_function)
        train_dataset = train_dataset.prefetch(4)
    shuffle_size = int(shuffle_size)
    if experimental_parser:
        test_dataset = tf.data.Dataset.from_tensor_slices(test_chunks).shuffle(len(test_chunks)).repeat() \
            .interleave(lambda x: tf.data.FixedLengthRecordDataset(x, 8292, compression_type='GZIP', num_parallel_reads=1).filter(sample), num_parallel_calls=tf.data.experimental.AUTOTUNE) \
            .shuffle(shuffle_size) \
            .batch(split_batch_size).map(extract_inputs_outputs).prefetch(4)
    else:
        test_parser = maia_chess_backend.maia.ChunkParser(
            FileDataSrc(test_chunks),
            shuffle_size=shuffle_size,
            sample=SKIP,
            batch_size=maia_chess_backend.maia.ChunkParser.BATCH_SIZE)
        test_dataset = tf.data.Dataset.from_generator(
            test_parser.parse,
            output_types=(tf.string, tf.string, tf.string, tf.string))
        test_dataset = test_dataset.map(maia_chess_backend.maia.ChunkParser.parse_function)
        test_dataset = test_dataset.prefetch(4)
    tfprocess.init_v2(train_dataset, test_dataset)
    tfprocess.restore_v2()
    # If the number of test positions is not given, sweep through all test chunks,
    # assuming an average of 10 samples per test game. For simplicity, testing uses
    # the split batch size instead of the total batch size; this does not affect
    # results, because test results are simple averages independent of batch size.
    num_evals = cfg['training'].get('num_test_positions', len(test_chunks) * 10)
    num_evals = max(1, num_evals // maia_chess_backend.maia.ChunkParser.BATCH_SIZE)
    print("Using {} evaluation batches".format(num_evals))
    tfprocess.process_loop_v2(total_batch_size, num_evals, batch_splits=batch_splits)
    if cfg['training'].get('swa_output', False):
        tfprocess.save_swa_weights_v2(output_name)
    else:
        tfprocess.save_leelaz_weights_v2(output_name)
    if not experimental_parser:
        # The ChunkParsers only exist on the non-experimental path
        train_parser.shutdown()
        test_parser.shutdown()
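# Worked example (made-up numbers) of the num_evals arithmetic above: the
# default is len(test_chunks) * 10 positions, divided into evaluation batches.
def _demo_num_evals(num_test_chunks=50000, batch_size=256):
    num_evals = num_test_chunks * 10        # 500000 positions with these placeholders
    return max(1, num_evals // batch_size)  # -> 1953 evaluation batches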
def mmap_csv(target_path, df, outputDir, args):
    maia_chess_backend.printWithDate(f"Loading: {target_path}")
    name = os.path.basename(target_path).split('.')[0]
    maia_chess_backend.printWithDate(f"Filtering data starting at {len(df)} rows")
    df = df[df['move_ply'] >= args.min_ply]
    if not args.allow_low_time:
        df = df[df['low_time'].eq(False)]
    if not args.allow_negative_loss:
        df = df[df['winrate_loss'] > 0]
    df = df[df['active_elo'] > args.min_elo]
    df = df[df['active_elo'] < args.max_elo]
    df = df.dropna()
    maia_chess_backend.printWithDate(f"Filtered data down to {len(df)} rows")
    df_blunder = df[df['is_blunder_wr']]
    maia_chess_backend.printWithDate(f"Found {len(df_blunder)} blunders")
    df_blunder = df_blunder.sample(frac=1).reset_index(drop=True)
    df_non_blunder = df[df['is_blunder_wr'].eq(False)]
    maia_chess_backend.printWithDate(f"Found {len(df_non_blunder)} non blunders")
    # Shuffle, then downsample non blunders to nb_to_b_ratio times the blunder count
    df_non_blunder = df_non_blunder.sample(frac=1).reset_index(drop=True).iloc[:int(len(df_blunder) * args.nb_to_b_ratio)]
    del df
    maia_chess_backend.printWithDate(f"Reduced to {len(df_non_blunder)} non blunders")
    maia_chess_backend.printWithDate("Starting mmaping")
    os.makedirs(outputDir, exist_ok=True)
    make_df_mmaps(df_blunder, name, os.path.join(outputDir, name, 'blunder'))
    del df_blunder
    make_df_mmaps(df_non_blunder, name, os.path.join(outputDir, name, 'nonblunder'))
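# Worked example (made-up numbers) of the class balancing in mmap_csv above: the
# shuffled non blunder rows are truncated to nb_to_b_ratio times the blunder count.
def _demo_non_blunder_cap(num_blunders=10000, nb_to_b_ratio=1.5):
    return int(num_blunders * nb_to_b_ratio)  # -> 15000 non blunders kept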
def load_csv(target_path, nrows):
    maia_chess_backend.printWithDate(f"Loading: {target_path}", flush=True)
    return pandas.read_csv(target_path, usecols=target_columns, nrows=nrows)