def main(): parser = OptionParser() parser.add_option("-p", "--protocol", type="string", default="betaori_closed_hand") parser.add_option("-i", "--input", type="string", help="The input directory name") parser.add_option("-e", "--epochs", type="int", default=16) parser.add_option("--load", type="int", help="What epoch to load", default=0) parser.add_option( "--print", action="store_true", help="Do we need to print predictions or not", default=False, ) parser.add_option("--visualize", action="store_true", default=False) opts, _ = parser.parse_args() load_epoch = opts.load epochs = opts.epochs protocol_string = opts.protocol visualize = opts.visualize input_directory_name = opts.input print_predictions = opts.print root_dir = os.path.dirname(os.path.realpath(__file__)) data_dir = os.path.join(root_dir, "..", "processed_data", input_directory_name) if not os.path.exists(data_dir): print("Directory with data is not exists. Run prepare_data.py") return if not os.listdir(data_dir): print("Directory with data is empty. Run prepare_data.py") return protocols = { "betaori_closed_hand": BetaoriClosedHandModel, "betaori_open_hand": BetaoriOpenHandModel, "hand_cost_open": OpenHandCostModel, "hand_cost_closed": ClosedHandCostModel, } protocol = protocols.get(protocol_string) if not protocol: parser.error("Possible values for protocol are: {}.".format(", ".join(protocols.keys()))) set_up_logging("training_{}".format(protocol_string)) model = protocol(input_directory_name, data_dir, print_predictions, epochs, visualize, load_epoch) model.run()
def main(): parser = OptionParser() parser.add_option("-p", "--protocol", type="string") parser.add_option("-e", "--epochs", type="int", default=16) parser.add_option("--load", type="int", help="What epoch to load", default=0) parser.add_option( "--print", action="store_true", help="Do we need to print predictions or not", default=False, ) parser.add_option("--visualize", action="store_true", default=False) opts, _ = parser.parse_args() load_epoch = opts.load epochs = opts.epochs protocol_string = opts.protocol visualize = opts.visualize print_predictions = opts.print data_dir = pathlib.Path( __file__).parent / ".." / "processed_data" / protocol_string if not os.path.exists(data_dir): print("Directory with data doesn't exist. Run prepare_data.py") return if not os.listdir(data_dir): print("Directory with data is empty. Run prepare_data.py") return protocols = { "agari_riichi_cost": AgariRiichiCostModel, } model_class = protocols.get(protocol_string) if not model_class: parser.error( f"Possible values for protocol are: {', '.join(protocols.keys())}." ) set_up_logging("training_{}".format(protocol_string)) model = model_class(protocol_string, data_dir, print_predictions, epochs, visualize, load_epoch) model.run()
def main(): parser = OptionParser() parser.add_option("-p", "--protocol", type="string", default="betaori_closed_hand") parser.add_option("-o", "--output", type="string", help="The output directory name") parser.add_option("-d", "--train-path", type="string", help="Path to .csv with train data.") parser.add_option("-t", "--test-path", type="string", help="Path to .csv with test data.") parser.add_option("--chunk", type="int", help="chunk size", default=100000) parser.add_option("--test-chunk", type="int", help="test file chunk size", default=50000) parser.add_option("--percentage", type="int", help="test data percentage", default=20) opts, _ = parser.parse_args() data_path = opts.train_path test_path = opts.test_path chunk_size = opts.chunk test_file_chunk_size = opts.test_chunk test_data_percentage = opts.percentage output_directory_name = opts.output if not data_path: parser.error("Path to .csv with train data is not given.") if not test_path: parser.error("Path to .csv with test data is not given.") protocol_string = opts.protocol protocols = { "betaori_closed_hand": BetaoriClosedHandProtocol, "betaori_open_hand": BetaoriOpenHandProtocol, "hand_cost_open": OpenHandCostProtocol, "hand_cost_closed": ClosedHandCostProtocol, } protocol = protocols.get(protocol_string) if not protocol: parser.error("Possible values for protocol are: {}".format(", ".join( protocols.keys()))) set_up_logging("prepare_data") logger.info("{} protocol will be used.".format(protocol_string)) logger.info("Chunk size: {}. Test data percentage: {}".format( chunk_size, test_data_percentage)) root_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "processed_data") if not os.path.exists(root_dir): os.mkdir(root_dir) data_dir = os.path.join(root_dir, output_directory_name) if os.path.exists(data_dir): logger.info("Data directory already exists. 
It was deleted.") shutil.rmtree(data_dir) os.mkdir(data_dir) total_count = line_count(data_path) test_count = int((total_count / 100.0) * test_data_percentage) logger.info("Train data size: {}".format(total_count)) logger.info("Test data size: {}".format(test_count)) # our test data had to be in separate file header = CSVExporter.header() # test_data = pd.read_csv(test_path, names=header, nrows=test_count) # test_data = test_data.replace([None, np.nan, 'None', 'NaN', 'nan'], '') # # protocol.parse_new_data(test_data.iterrows()) for i, chunk in enumerate( pd.read_csv(test_path, chunksize=test_file_chunk_size, names=header, nrows=test_count)): file_name = "test_chunk_{:03}.hkl".format(i) logger.info("Processing {}...".format(file_name)) protocol = protocols.get(protocol_string) protocol = protocol() chunk = chunk.replace([None, np.nan, "None", "NaN", "nan"], "") protocol.parse_new_data(chunk.iterrows()) test_path = os.path.join(data_dir, file_name) hickle.dump(protocol, test_path, mode="w") gc.collect() logger.info("") logger.info("Processing train data...") for i, chunk in enumerate( pd.read_csv(data_path, chunksize=chunk_size, names=header)): file_name = "chunk_{:03}.h5".format(i) logger.info("Processing {}...".format(file_name)) protocol = protocols.get(protocol_string) protocol = protocol() chunk = chunk.replace([None, np.nan, "None", "NaN", "nan"], "") protocol.parse_new_data(chunk.iterrows()) with h5py.File(os.path.join(data_dir, file_name), "w") as f: f.create_dataset("input_data", data=protocol.input_data, dtype="float32") f.create_dataset("output_data", data=protocol.output_data, dtype="float32") logger.info("Data size = {}".format(len(protocol.input_data))) gc.collect()
def main(): """ Walk through tenhou logs and extract information needed for training. :return: """ parser = OptionParser() parser.add_option("-p", "--protocol", type="string", help="The output protocol") parser.add_option("-o", "--output", type="string", help="The output file") parser.add_option("-d", "--data", type="string", help="Path to .sqlite3 db with logs content") parser.add_option("--limit", type="int", help="How many logs to load", default=None) parser.add_option("--offset", type="int", help="Point from where to load logs", default=0) opts, _ = parser.parse_args() db_path = opts.data limit = opts.limit offset = opts.offset output_format = opts.protocol output_file = opts.output if not db_path: parser.error("Path to db is not given.") allowed_outputs = { "betaori_closed_hand": BetaoriClosedHandParser(), "betaori_open_hand": BetaoriOpenHandParser(), } if not allowed_outputs.get(output_format): parser.error("Not correct output format. Available options: {}".format( ", ".join(allowed_outputs.keys()))) parser = allowed_outputs.get(output_format) if os.path.exists(output_file): os.remove(output_file) logger.warning(f"File {output_file} already exists! It was removed") set_up_logging("parser") logger.info("Data file: {}".format(db_path)) logger.info("{} protocol will be used".format(output_format)) logger.info("Loading and decompressing logs content...") logs = load_logs(db_path, limit, offset) logs_count = 0 samples_count = 0 count_of_logs = len(logs) logger.info("Starting processing {} logs...".format(count_of_logs)) bar = tqdm(logs) for log_data in bar: if logs_count > 0 and logs_count % 1000 == 0: logger.info("Processed logs: {}/{}".format(logs_count, count_of_logs)) logger.info(f"Samples: {samples_count}") game = parser.get_game_rounds(log_data["log_content"], log_data["log_id"]) records = parser.parse_game_rounds(game) samples_count += len(records) with open(output_file, "a") as f: writer = csv.writer(f) writer.writerow(CSVExporter.header()) for record in records: writer.writerow(record) logs_count += 1 bar.set_description(f"Samples: {samples_count}") logger.info("Shuffle output file") # subprocess.run( # "shuf -o {} < {}".format(os.path.abspath(output_file), os.path.abspath(output_file)), shell=True, # ) logger.info("End") logger.info("Total samples: {}".format(samples_count))
def main(): parser = OptionParser() parser.add_option("-p", "--protocol", type="string") parser.add_option("-d", "--train-path", type="string", help="Path to .csv with train data.") parser.add_option("-t", "--test-path", type="string", help="Path to .csv with test data.") parser.add_option("-c", "--chunk", type="int", help="chunk size", default=100000) opts, _ = parser.parse_args() data_path = opts.train_path test_path = opts.test_path chunk_size = opts.chunk if not data_path: parser.error("Path to .csv with train data is not given.") if not test_path: parser.error("Path to .csv with test data is not given.") protocol_string = opts.protocol protocols = { "agari_riichi_cost": AgariRiichiCostProtocol, } protocol_class = protocols.get(protocol_string) if not protocol_class: parser.error( f"Possible values for protocol are: {', '.join(protocols.keys())}") set_up_logging("prepare_data") logger.info(f"{protocol_class.__name__} protocol will be used.") logger.info(f"Chunk size: {chunk_size}") processed_folder = pathlib.Path(__file__).parent / ".." / "processed_data" if not processed_folder.exists(): os.mkdir(processed_folder) data_dir = processed_folder / protocol_string if data_dir.exists(): logger.info("Data directory already exists. It was deleted.") shutil.rmtree(data_dir) os.mkdir(data_dir) for i, chunk in enumerate( pd.read_csv(test_path, chunksize=chunk_size, names=protocol_class.CSV_HEADER)): file_name = f"test_chunk_{i:03}.hkl" logger.info(f"Processing {file_name}...") protocol = protocol_class() chunk = chunk.replace([None, np.nan, "None", "NaN", "nan"], "") protocol.parse_new_data(chunk.iterrows()) test_path = os.path.join(data_dir, file_name) hickle.dump( { "input_data": protocol.input_data, "output_data": protocol.output_data, "verification_data": protocol.verification_data, }, test_path, mode="w", ) logger.info(f"Test size = {len(protocol.input_data)}") del protocol gc.collect() logger.info("") logger.info("Processing train data...") for i, chunk in enumerate( pd.read_csv(data_path, chunksize=chunk_size, names=protocol_class.CSV_HEADER)): file_name = f"chunk_{i:03}.hkl" logger.info(f"Processing {file_name}...") protocol = protocols.get(protocol_string) protocol = protocol() chunk = chunk.replace([None, np.nan, "None", "NaN", "nan"], "") protocol.parse_new_data(chunk.iterrows()) with h5py.File(os.path.join(data_dir, file_name), "w") as f: f.create_dataset("input_data", data=protocol.input_data, dtype="float32") f.create_dataset("output_data", data=protocol.output_data, dtype="float32") logger.info(f"Data size = {len(protocol.input_data)}") del protocol gc.collect()
def main():
    parser = OptionParser()
    parser.add_option('-p', '--protocol', type='string', help='The output protocol')
    parser.add_option('-o', '--output', type='string', help='The output file')
    parser.add_option('-d', '--data', type='string', help='Path to .sqlite3 db with logs content')
    parser.add_option('-l', '--limit', type='string', help='For debugging', default='unlimited')
    opts, _ = parser.parse_args()

    db_path = opts.data
    limit = opts.limit
    output_format = opts.protocol
    output_file = opts.output

    if not db_path:
        parser.error('Path to db is not given.')

    allowed_outputs = {
        'closed_hand': BetaoriClosedHandParser(),
        'open_hand': BetaoriOpenHandParser(),
    }
    if not allowed_outputs.get(output_format):
        parser.error('Incorrect output format. Available options: {}'.format(', '.join(allowed_outputs.keys())))
    # a separate name, so the OptionParser instance isn't shadowed
    log_parser = allowed_outputs.get(output_format)

    set_up_logging('parser')

    if os.path.exists(output_file):
        logger.warning('File {} already exists! New data will be appended to it.'.format(output_file))

    logger.info('Data file: {}'.format(db_path))
    logger.info('{} protocol will be used'.format(output_format))
    logger.info('Loading and decompressing logs content...')

    logs = load_logs(db_path, limit)

    logs_count = 0
    samples_count = 0
    count_of_logs = len(logs)
    logger.info('Starting processing {} logs...'.format(count_of_logs))

    for log_data in logs:
        if logs_count > 0 and logs_count % 1000 == 0:
            logger.info('Processed logs: {}/{}'.format(logs_count, count_of_logs))
            logger.info('Samples: {}'.format(samples_count))

        game = log_parser.get_game_rounds(log_data['log_content'], log_data['log_id'])
        records = log_parser.parse_game_rounds(game)
        samples_count += len(records)

        with open(output_file, 'a') as f:
            writer = csv.writer(f)
            for record in records:
                writer.writerow(record)

        logs_count += 1

    logger.info('Shuffle output file')
    subprocess.run(
        'shuf -o {} < {}'.format(os.path.abspath(output_file), os.path.abspath(output_file)),
        shell=True,
    )

    logger.info('End')
    logger.info('Total samples: {}'.format(samples_count))
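A hypothetical invocation of this older parser (the script and file names are illustrative); note that the final shuf step assumes a Unix shell with coreutils available:

    python parse_data.py -p closed_hand -d logs.sqlite3 -o parsed_data.csv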