import logging
import os
import random
import sys

import preprocessing as pp  # assumed module name; provides the read/convert helpers used below


def main(argv):
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    train_folder_name = argv[1]
    dict_file_name = argv[2]
    out_dir = argv[3]
    seq_max_len = int(argv[4])  # 32 seems ok with tags
    lstm_size = int(argv[5])
    epochs = int(argv[6])
    batch_size = int(argv[7])  # 2304 for a GeForce 780; 40 for CPU, or 128

    dictionary = pp.read_gzip_data(dict_file_name)
    dict_size = len(dictionary)

    if epochs > 0:
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        weights_path = out_dir + 'weights.h5'
        model_path = out_dir + 'model.json'
        model = prepareModel(seq_max_len, dict_size, lstm_size, model_path)
        if os.path.isfile(weights_path):
            logging.info("Reading weights from existing file...")
            model.load_weights(weights_path)

        train_files = os.listdir(train_folder_name)
        if len(train_files) > 1:
            logging.info("Training on batches...")
            for e in range(epochs):
                logging.info('Starting epoch {0}...'.format(e))
                random.shuffle(train_files)
                for train_file_name in train_files:
                    if '.swp' not in train_file_name:  # skip editor swap files
                        train_data = pp.read_gzip_data(train_folder_name + train_file_name)
                        train_data = pp.convertDataToKerasFormat(train_data, seq_max_len, dictionary)
                        # next-step prediction: inputs are all samples but the
                        # last, targets the same samples shifted by one
                        model.train_on_batch(train_data[:-1], train_data[1:])
                logging.info("Saving weights...")
                model.save_weights(weights_path, overwrite=True)  # checkpoint after each epoch
        elif len(train_files) == 1:
            logging.info("Fitting model...")
            train_data = pp.read_gzip_data(train_folder_name + train_files[0])
            train_data = pp.convertDataToKerasFormat(train_data, seq_max_len, dictionary)
            model.fit(train_data[:-1], train_data[1:],
                      batch_size=batch_size, nb_epoch=epochs)  # nb_epoch: Keras 1.x keyword
            logging.info("Saving weights...")
            model.save_weights(weights_path, overwrite=True)
        else:
            logging.error("There are no files to train on!")
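# The prepareModel helper called above is not defined in this snippet. A
# minimal sketch follows, assuming Keras 1.x (matching the nb_epoch and
# predict_proba calls elsewhere) and input batches shaped
# (samples, seq_max_len, dict_size); the layer stack and optimizer here are
# illustrative assumptions, not the project's actual choices.
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation


def prepareModel(seq_max_len, dict_size, lstm_size, model_path):
    model = Sequential()
    # one LSTM layer over one-hot encoded sequences
    model.add(LSTM(lstm_size, input_shape=(seq_max_len, dict_size)))
    # project the final hidden state onto the dictionary and normalize
    model.add(Dense(dict_size))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    # persist the architecture so the evaluation script can reload it
    with open(model_path, 'w') as model_file:
        model_file.write(model.to_json())
    return model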
# Evaluation entry point; originally a separate script from the training one
# above, reusing the same logging/sys/pp imports.
from keras.models import model_from_json


def main(argv):
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    data_file_name = argv[1]
    dict_file_name = argv[2]
    working_dir = argv[3]
    seq_max_len = int(argv[4])
    suffix_len = int(argv[5])
    batch_size = 2304
    it_size = batch_size

    logging.info('Reading model...')
    with open(working_dir + 'model.json', 'r') as model_file:
        json_model = model_file.read()
    model = model_from_json(json_model)
    logging.info('Reading weights...')
    model.load_weights(working_dir + 'weights.h5')

    dictionary = pp.read_gzip_data(dict_file_name)
    # note: prepareTestData takes the file name itself, so this raw read is
    # not used further down
    data = pp.read_gzip_data(data_file_name)
    test_data, test_data_count = pp.prepareTestData(data_file_name)
    test_data = pp.prepareData(test_data, seq_max_len, dictionary, suffix_len)
    pp.save_gzip_data(working_dir + 'test_data.txt.gz', test_data)
    test_data = pp.convertDataToKerasFormat(test_data, seq_max_len, dictionary)

    logging.info('Testing model on data...')
    full_size = len(test_data)
    it = 0
    res_win = 0
    res_los = 0
    res_inc = 0
    res_all = 0
    it_count = 0
    while it < full_size:
        starting_it_count = it_count
        it2 = it
        # extend the slice by whole candidate groups until adding the next
        # group would push it past it_size samples
        for one_count in range(it_count, len(test_data_count)):
            if it2 + test_data_count[one_count] > min(full_size, it + it_size):
                break
            else:
                it_count += 1
                it2 += test_data_count[one_count]
        tmp_test_data = test_data[it:it2]
        tmp_test_data_count = test_data_count[starting_it_count:it_count]
        tmp_results = model.predict_proba(tmp_test_data, batch_size=batch_size)
        win, los, inc, al = testModel(tmp_test_data, tmp_results, tmp_test_data_count)
        res_win += win
        res_los += los
        res_inc += inc
        res_all += al
        it = it2
    printFinalResults(res_win, res_los, res_inc, res_all)
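# testModel and printFinalResults are not defined in this snippet either.
# The sketches below rest on an assumed convention: each entry of
# test_data_count gives the number of consecutive candidate rows for one
# token, the gold candidate is the first row of its group, each row is
# scored by its highest output probability, and ties count as inconclusive.
import numpy as np


def testModel(test_data, results, test_data_count):
    win = los = inc = 0
    offset = 0
    for count in test_data_count:
        group = np.asarray(results[offset:offset + count])
        # one score per candidate row
        scores = group.max(axis=1) if group.ndim > 1 else group
        top = scores.max()
        if (scores == top).sum() > 1:
            inc += 1  # several candidates share the top score
        elif int(scores.argmax()) == 0:
            win += 1  # gold candidate ranked first (assumed convention)
        else:
            los += 1
        offset += count
    return win, los, inc, len(test_data_count)


def printFinalResults(win, los, inc, al):
    logging.info('won: {0}, lost: {1}, inconclusive: {2}, all: {3}'.format(win, los, inc, al))
    if al > 0:
        logging.info('accuracy: {0:.2f}%'.format(100.0 * win / al))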