def predict_lstm(X, y, y_original, timesteps, bs, alphabet_size, model_name, final_step=False):
    """Arithmetic-encode the targets of ``X`` using the module-level PyTorch ``model``.

    With ``final_step=False`` the rows of ``X`` are split into ``bs`` streams
    that start ``num_iters`` rows apart; stream ``i`` is written to
    ``args.temp_file_prefix + '.' + str(i)``.  With ``final_step=True`` every
    row is coded one at a time into ``args.temp_file_prefix + '.last'``.
    In both modes the leading symbols of a stream, for which no model
    prediction exists, are coded with a uniform distribution.

    Args:
        X: 2-D array of context windows, indexed as ``X[row, position]``.
        y: Unused; kept so existing callers keep working.
        y_original: Target symbols, indexed ``y_original[row]`` in batch mode
            and ``y_original[row][0]`` in final-step mode.
        timesteps: Window length used to reshape the model input.
        bs: Number of parallel streams (batch mode only).
        alphabet_size: Number of distinct symbols.
        model_name: Unused; kept so existing callers keep working.
        final_step: Select the single-stream ``.last`` mode.

    Returns:
        None.  The result is the set of files written to disk.
    """
    # Probabilities are scaled to integer frequencies; the "+ 1" guarantees
    # that no symbol ends up with a zero-width coding interval.
    prob = np.ones(alphabet_size) / alphabet_size
    uniform_cumul = np.zeros(alphabet_size + 1, dtype=np.uint64)
    uniform_cumul[1:] = np.cumsum(prob * 10000000 + 1)

    if not final_step:
        num_iters = int((len(X) + timesteps) / bs)
        ind = np.array(range(bs)) * num_iters
        f = []
        try:
            # Open one compressed file per stream; the list is filled
            # incrementally so a failed open() still closes earlier handles.
            for i in range(bs):
                f.append(open(args.temp_file_prefix + '.' + str(i), 'wb'))
            bitout = [arithmeticcoding_fast.BitOutputStream(handle) for handle in f]
            enc = [arithmeticcoding_fast.ArithmeticEncoder(32, stream) for stream in bitout]

            # Compress the first few characters with the uniform distribution.
            for i in range(bs):
                for j in range(min(timesteps, num_iters)):
                    enc[i].write(uniform_cumul, X[ind[i], j])

            cumul = np.zeros((bs, alphabet_size + 1), dtype=np.uint64)
            # Pure inference: no autograd graph is needed.
            with torch.no_grad():
                for j in range(num_iters - timesteps):
                    x = torch.Tensor(X[ind, :])
                    x = x.reshape(-1, timesteps, args.input_size).to(device)
                    outputs = model(x)
                    # dim=-1 is what the deprecated implicit-dim call picked
                    # for the 2-D (batch, classes) output required below.
                    prob = F.softmax(outputs, dim=-1).data.cpu().numpy()
                    cumul[:, 1:] = np.cumsum(prob * 10000000 + 1, axis=1)
                    for i in range(bs):
                        enc[i].write(cumul[i, :], y_original[ind[i]])
                    ind = ind + 1

            for i in range(bs):
                enc[i].finish()
                bitout[i].close()
        finally:
            # Closing an already-closed file object is a no-op.
            for handle in f:
                handle.close()
    else:
        with open(args.temp_file_prefix + '.last', 'wb') as f:
            bitout = arithmeticcoding_fast.BitOutputStream(f)
            enc = arithmeticcoding_fast.ArithmeticEncoder(32, bitout)
            for j in range(timesteps):
                enc.write(uniform_cumul, X[0, j])

            cumul = np.zeros(alphabet_size + 1, dtype=np.uint64)
            with torch.no_grad():
                for i in range(len(X)):
                    x = torch.Tensor(X[i, :])
                    x = x.reshape(-1, timesteps, args.input_size).to(device)
                    outputs = model(x)
                    prob = F.softmax(outputs, dim=-1).data.cpu().numpy()
                    cumul[1:] = np.cumsum(prob * 10000000 + 1)
                    enc.write(cumul, y_original[i][0])
            enc.finish()
            bitout.close()
    return
def predict_lstm(X, y, y_original, timesteps, bs, alphabet_size, model_name, final_step=False):
    """Arithmetic-encode the targets of ``X`` with a model built from ``models``.

    The model is constructed as ``getattr(models, model_name)(bs, timesteps,
    alphabet_size)`` and its weights are loaded from
    ``args.model_weights_file``.

    With ``final_step=False`` the rows of ``X`` are split into ``bs`` streams
    that start ``num_iters`` rows apart; stream ``i`` is written to
    ``args.temp_file_prefix + '.' + str(i)``.  With ``final_step=True`` every
    row is coded one at a time into ``args.temp_file_prefix + '.last'``.
    In both modes the leading symbols of a stream are coded with a uniform
    distribution.

    Args:
        X: 2-D array of context windows, indexed as ``X[row, position]``.
        y: Unused; kept so existing callers keep working.
        y_original: Target symbols, indexed ``y_original[row]`` in batch mode
            and ``y_original[row][0]`` in final-step mode.
        timesteps: Window length passed to the model constructor.
        bs: Number of parallel streams and prediction batch size.
        alphabet_size: Number of distinct symbols.
        model_name: Name of the model factory looked up on ``models``.
        final_step: Select the single-stream ``.last`` mode.

    Returns:
        None.  The result is the set of files written to disk.
    """
    model = getattr(models, model_name)(bs, timesteps, alphabet_size)
    model.load_weights(args.model_weights_file)

    # Probabilities are scaled to integer frequencies; the "+ 1" guarantees
    # that no symbol ends up with a zero-width coding interval.
    prob = np.ones(alphabet_size) / alphabet_size
    uniform_cumul = np.zeros(alphabet_size + 1, dtype=np.uint64)
    uniform_cumul[1:] = np.cumsum(prob * 10000000 + 1)

    if not final_step:
        num_iters = int((len(X) + timesteps) / bs)
        ind = np.array(range(bs)) * num_iters
        f = []
        try:
            # Open one compressed file per stream; the list is filled
            # incrementally so a failed open() still closes earlier handles.
            for i in range(bs):
                f.append(open(args.temp_file_prefix + '.' + str(i), 'wb'))
            bitout = [arithmeticcoding_fast.BitOutputStream(handle) for handle in f]
            enc = [arithmeticcoding_fast.ArithmeticEncoder(32, stream) for stream in bitout]

            # Compress the first few characters with the uniform distribution.
            for i in range(bs):
                for j in range(min(timesteps, num_iters)):
                    enc[i].write(uniform_cumul, X[ind[i], j])

            cumul = np.zeros((bs, alphabet_size + 1), dtype=np.uint64)
            for j in range(num_iters - timesteps):
                prob = model.predict(X[ind, :], batch_size=bs)
                cumul[:, 1:] = np.cumsum(prob * 10000000 + 1, axis=1)
                for i in range(bs):
                    enc[i].write(cumul[i, :], y_original[ind[i]])
                ind = ind + 1

            for i in range(bs):
                enc[i].finish()
                bitout[i].close()
        finally:
            # Closing an already-closed file object is a no-op.
            for handle in f:
                handle.close()
    else:
        with open(args.temp_file_prefix + '.last', 'wb') as f:
            bitout = arithmeticcoding_fast.BitOutputStream(f)
            enc = arithmeticcoding_fast.ArithmeticEncoder(32, bitout)
            for j in range(timesteps):
                enc.write(uniform_cumul, X[0, j])

            cumul = np.zeros(alphabet_size + 1, dtype=np.uint64)
            for i in range(len(X)):
                prob = model.predict(X[i, :].reshape(1, -1), batch_size=1)
                cumul[1:] = np.cumsum(prob * 10000000 + 1)
                enc.write(cumul, y_original[i][0])
            enc.finish()
            bitout.close()
    return
def predict_lstm(X, y_original, timesteps, bs, alphabet_size, model_name):
    """Arithmetic-encode ``y_original`` into ``args.file_prefix`` using ``PRNN``.

    The first ``timesteps`` symbols (taken from ``X[0]``) are coded with a
    uniform distribution.  Every following symbol is coded with the
    distribution ``PRNN`` predicts for its context window, one window per
    ``predict`` call.  Rows are visited in minibatches of ``bs`` via
    ``iterate_minibatches``; rows beyond the last full minibatch are handled
    in a trailing loop.

    Args:
        X: 2-D array of context windows, indexed as ``X[row, position]``.
        y_original: Target symbol for each row of ``X``.
        timesteps: Window length passed to the model factory.
        bs: Minibatch size.
        alphabet_size: Number of distinct symbols.
        model_name: Expression naming the model factory (see warning below).

    Returns:
        None.  The compressed stream is written to ``args.file_prefix``.
    """
    # SECURITY NOTE(review): eval() executes arbitrary code.  model_name must
    # only ever come from a trusted source (e.g. a fixed CLI choice list).
    ARNN, PRNN = eval(model_name)(bs, timesteps, alphabet_size)
    PRNN.load_weights(args.model_weights_file)

    # Number of rows covered by full minibatches.
    l = int(len(X) / bs) * bs

    with open(args.file_prefix, 'wb') as f:
        bitout = arithmeticcoding_fast.BitOutputStream(f)
        enc = arithmeticcoding_fast.ArithmeticEncoder(32, bitout)

        # Probabilities are scaled to integer frequencies; the "+ 1"
        # guarantees that no symbol gets a zero-width coding interval.
        prob = np.ones(alphabet_size) / alphabet_size
        cumul = np.zeros(alphabet_size + 1, dtype=np.uint64)
        cumul[1:] = np.cumsum(prob * 10000000 + 1)
        for j in range(timesteps):
            enc.write(cumul, X[0, j])

        cumul = np.zeros((1, alphabet_size + 1), dtype=np.uint64)
        progress = 0
        for bx, by in iterate_minibatches(X[:l], y_original[:l], bs):
            for j in range(bs):
                prob = PRNN.predict(bx[j:j + 1], batch_size=1)
                cumul[:, 1:] = np.cumsum(prob * 10000000 + 1, axis=1)
                enc.write(cumul[0, :], int(by[j]))
                # NOTE(review): counted per symbol, matching the symbol-count
                # denominator printed below.
                progress = progress + 1
                sys.stdout.flush()
                print("{}/{}".format(progress, len(X) + timesteps), end="\r")

        # Rows that did not fill a complete minibatch.
        tail_x = X[l:]
        tail_y = y_original[l:]
        if len(tail_x) > 0:
            cumul = np.zeros((1, alphabet_size + 1), dtype=np.uint64)
            for i in range(len(tail_y)):
                prob = PRNN.predict(tail_x[i:i + 1], batch_size=1)
                cumul[:, 1:] = np.cumsum(prob * 10000000 + 1, axis=1)
                enc.write(cumul[0, :], int(tail_y[i]))

        enc.finish()
        bitout.close()
def predict_lstm(X, y_original, timesteps, bs, alphabet_size, model_name):
    """Arithmetic-encode ``y_original`` while adapting ``ARNN`` on the data.

    Weights are loaded into ``PRNN``; ``ARNN`` is compiled with Adam
    (learning rate 5e-4) and ``loss_fn``.  The first ``timesteps`` symbols
    (taken from ``X[0]``) are coded with a uniform distribution.  Each
    minibatch is first encoded symbol by symbol with ``ARNN``'s predictions
    and only afterwards used for one ``train_on_batch`` step, so a symbol is
    always coded before the model is trained on it.  Rows beyond the last
    full minibatch are encoded without further training.

    Args:
        X: 2-D array of context windows, indexed as ``X[row, position]``.
        y_original: Target symbol for each row of ``X``.
        timesteps: Window length passed to the model factory.
        bs: Minibatch size.
        alphabet_size: Number of distinct symbols.
        model_name: Expression naming the model factory (see warning below).

    Returns:
        None.  The compressed stream is written to ``args.file_prefix``.
    """
    # SECURITY NOTE(review): eval() executes arbitrary code.  model_name must
    # only ever come from a trusted source (e.g. a fixed CLI choice list).
    ARNN, PRNN = eval(model_name)(bs, timesteps, alphabet_size)
    PRNN.load_weights(args.model_weights_file)
    optim = tf.train.AdamOptimizer(learning_rate=5e-4)
    ARNN.compile(loss=loss_fn, optimizer=optim, metrics=['acc'])

    # Number of rows covered by full minibatches.
    l = int(len(X) / bs) * bs

    with open(args.file_prefix, 'wb') as f:
        bitout = arithmeticcoding_fast.BitOutputStream(f)
        enc = arithmeticcoding_fast.ArithmeticEncoder(32, bitout)

        # Probabilities are scaled to integer frequencies; the "+ 1"
        # guarantees that no symbol gets a zero-width coding interval.
        prob = np.ones(alphabet_size) / alphabet_size
        cumul = np.zeros(alphabet_size + 1, dtype=np.uint64)
        cumul[1:] = np.cumsum(prob * 10000000 + 1)
        for j in range(timesteps):
            enc.write(cumul, X[0, j])

        cumul = np.zeros((1, alphabet_size + 1), dtype=np.uint64)
        progress = 0
        for bx, by in iterate_minibatches(X[:l], y_original[:l], bs):
            for j in range(bs):
                prob = ARNN.predict(bx[j:j + 1], batch_size=1)
                cumul[:, 1:] = np.cumsum(prob * 10000000 + 1, axis=1)
                enc.write(cumul[0, :], int(by[j]))
                # NOTE(review): counted per symbol, matching the symbol-count
                # denominator printed below.
                progress = progress + 1
                sys.stdout.flush()
                print("{}/{}".format(progress, len(X) + timesteps), end="\r")
            # Train only after the whole minibatch has been encoded.
            onehot = keras.utils.to_categorical(by, num_classes=alphabet_size)
            ARNN.train_on_batch(bx, onehot)

        # Rows that did not fill a complete minibatch.
        tail_x = X[l:]
        tail_y = y_original[l:]
        if len(tail_x) > 0:
            cumul = np.zeros((1, alphabet_size + 1), dtype=np.uint64)
            for i in range(len(tail_y)):
                prob = ARNN.predict(tail_x[i:i + 1], batch_size=1)
                cumul[:, 1:] = np.cumsum(prob * 10000000 + 1, axis=1)
                enc.write(cumul[0, :], int(tail_y[i]))

        enc.finish()
        bitout.close()
def compress_model(args,device):
    """Compress ``args.sequence_npy_file`` with a trained model and arithmetic coding.

    Steps:
      1. Load the integer sequence, derive the batch size
         (``len(series) / 10000``, at least 1) and write the run parameters
         to ``args.output_file_prefix + '.params'``.
      2. Build the model selected by ``args.model_name``, extract the zipped
         weights and load them (optionally as a dynamically quantized model).
      3. Encode the sequence as ``batch_size`` parallel streams plus one
         ``.last`` stream for the remainder.
      4. Concatenate all streams, each prefixed with its variable-length
         encoded byte count, into ``args.output_file_prefix + '.combined'``
         and delete ``args.temp_dir``.

    Args:
        args: Namespace with the paths and hyper-parameters read below.
            ``args.batch_size`` is overwritten by this function.
        device: Torch device used for the model and its inputs.

    Raises:
        ValueError: If ``args.model_name`` is not one of the supported names.
    """

    class LSTM(nn.Module):
        """Embedding -> stacked LSTM -> two linear layers producing class scores."""

        def __init__(self, num_classes, hidden_size1=32, hidden_size2=32, num_layers=2):
            super(LSTM, self).__init__()
            self.hidden_size1 = hidden_size1
            self.hidden_size2 = hidden_size2
            self.num_layers = num_layers
            self.embedding = nn.Embedding(num_classes, 32).to(device)
            self.lstm = nn.LSTM(32, hidden_size1, num_layers, batch_first=True).to(device)
            self.fc1 = nn.Linear(hidden_size1, hidden_size2).to(device)
            self.fc2 = nn.Linear(hidden_size2, num_classes).to(device)

        def forward(self, x):
            # Zero initial hidden and cell state for every sequence in the batch.
            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size1).to(device)
            c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size1).to(device)
            out = self.embedding(x[:, :, 0].long())
            out, (h_n, h_c) = self.lstm(out, (h0, c0))
            # Only the output at the last time step feeds the classifier.
            out = self.fc1(out[:, -1, :])
            out = self.fc2(nn.ReLU()(out))
            return out

    # load the data
    np.random.seed(0)
    series = np.load(args.sequence_npy_file)
    series = series.reshape(-1, 1)

    onehot_encoder = OneHotEncoder(sparse=False)
    onehot_encoder.fit(series)

    # Sequences shorter than 10000 symbols used to yield a batch size of 0
    # and a ZeroDivisionError further down; fall back to a single stream.
    args.batch_size = max(1, int(len(series)/10000))
    batch_size = args.batch_size
    timesteps = 64

    with open(args.params_file, 'r') as f:
        params = json.load(f)

    params['len_series'] = len(series)
    params['bs'] = batch_size
    params['timesteps'] = timesteps

    with open(args.output_file_prefix+'.params','w') as f:
        json.dump(params, f, indent=4)

    alphabet_size = len(params['id2char_dict'])

    def strided_app(a, L, S):
        """Return a read-only view of all length-``L`` windows of ``a`` taken every ``S`` items."""
        nrows = ((a.size - L) // S) + 1
        n = a.strides[0]
        return np.lib.stride_tricks.as_strided(
            a, shape=(nrows, L), strides=(S * n, n), writeable=False)

    series = series.reshape(-1)
    data = strided_app(series, timesteps+1, 1)

    X = data[:, :-1]
    Y_original = data[:, -1:]
    Y = onehot_encoder.transform(Y_original)

    l = int(len(series)/batch_size)*batch_size

    # Hyper Parameters
    hidden_size1 = args.hidden_size1
    hidden_size2 = args.hidden_size2
    num_layers = args.num_layers
    num_classes = alphabet_size

    # An if/elif chain (not a dict) so that only the selected class name is
    # ever evaluated.
    if args.model_name=="LSTM":
        model = LSTM(num_classes,hidden_size1, hidden_size2, num_layers)
    elif args.model_name=="GRU":
        model = GRU(num_classes,hidden_size1, hidden_size2, num_layers)
    elif args.model_name=="biLSTM":
        model = biLSTM(num_classes,hidden_size1, hidden_size2, num_layers)
    elif args.model_name=="biGRU":
        model = biGRU(num_classes,hidden_size1, hidden_size2, num_layers)
    else:
        raise ValueError("unsupported model_name: {!r}".format(args.model_name))

    # unzip the compressed models
    zip_path="../data/trained_models/"+args.data_name+"/"+args.file_path+args.append+ ".zip"
    save_path="../data/trained_models/"+args.data_name
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(save_path)

    model=model.to('cpu')
    if args.quantization=="yes":
        # The state dict was saved from a quantized model, so the module must
        # be converted before the weights are loaded.
        model = torch.quantization.quantize_dynamic(
            model, {nn.LSTM, nn.Linear}, dtype=torch.qint8
        )
        model.load_state_dict(torch.load(args.model_weights_file))
    else:
        # load the model weights
        model.load_state_dict(torch.load(args.model_weights_file))
    # NOTE(review): applied for both branches — confirm this is intended for
    # the quantized model.
    model=model.to(device)

    def predict_lstm(X, y, y_original, timesteps, bs, alphabet_size, model_name, final_step=False):
        """Encode the targets of ``X`` with ``model``.

        Batch mode writes ``bs`` stream files ``<temp_file_prefix>.<i>``;
        final-step mode writes ``<temp_file_prefix>.last``.  ``y`` and
        ``model_name`` are unused.
        """
        # Probabilities are scaled to integer frequencies; the "+ 1"
        # guarantees that no symbol gets a zero-width coding interval.
        prob = np.ones(alphabet_size)/alphabet_size
        uniform_cumul = np.zeros(alphabet_size+1, dtype = np.uint64)
        uniform_cumul[1:] = np.cumsum(prob*10000000 + 1)

        if not final_step:
            num_iters = int((len(X)+timesteps)/bs)
            ind = np.array(range(bs))*num_iters

            f = []
            try:
                # open compressed files and compress first few characters
                # using uniform distribution
                for i in range(bs):
                    f.append(open(args.temp_file_prefix+'.'+str(i),'wb'))
                bitout = [arithmeticcoding_fast.BitOutputStream(handle) for handle in f]
                enc = [arithmeticcoding_fast.ArithmeticEncoder(32, stream) for stream in bitout]

                for i in range(bs):
                    for j in range(min(timesteps, num_iters)):
                        enc[i].write(uniform_cumul, X[ind[i],j])

                cumul = np.zeros((bs, alphabet_size+1), dtype = np.uint64)
                # Pure inference: no autograd graph is needed.
                with torch.no_grad():
                    for j in range(num_iters - timesteps):
                        x = torch.Tensor(X[ind,:])
                        x = x.reshape(-1,timesteps, args.input_size).to(device)
                        outputs = model(x)
                        # dim=-1 is what the deprecated implicit-dim call
                        # picked for a 2-D (batch, classes) output.
                        prob = F.softmax(outputs, dim=-1).data.cpu().numpy()
                        cumul[:,1:] = np.cumsum(prob*10000000 + 1, axis = 1)
                        for i in range(bs):
                            enc[i].write(cumul[i,:], y_original[ind[i]])
                        ind = ind + 1

                for i in range(bs):
                    enc[i].finish()
                    bitout[i].close()
            finally:
                # Closing an already-closed file object is a no-op.
                for handle in f:
                    handle.close()
        else:
            with open(args.temp_file_prefix+'.last','wb') as f:
                bitout = arithmeticcoding_fast.BitOutputStream(f)
                enc = arithmeticcoding_fast.ArithmeticEncoder(32, bitout)
                for j in range(timesteps):
                    enc.write(uniform_cumul, X[0,j])

                cumul = np.zeros(alphabet_size+1, dtype = np.uint64)
                with torch.no_grad():
                    for i in range(len(X)):
                        x = torch.Tensor(X[i,:])
                        x = x.reshape(-1,timesteps, args.input_size).to(device)
                        outputs = model(x)
                        prob = F.softmax(outputs, dim=-1).data.cpu().numpy()
                        cumul[1:] = np.cumsum(prob*10000000 + 1)
                        enc.write(cumul, y_original[i][0])
                enc.finish()
                bitout.close()
        return

    # variable length integer encoding http://www.codecodex.com/wiki/Variable-Length_Integers
    def var_int_encode(byte_str_len, f):
        """Write ``byte_str_len`` to ``f`` in 7-bit groups; bit 7 marks a continuation."""
        while True:
            this_byte = byte_str_len&127
            byte_str_len >>= 7
            if byte_str_len == 0:
                f.write(struct.pack('B',this_byte))
                break
            f.write(struct.pack('B',this_byte|128))
            byte_str_len -= 1

    def append_stream(out, path):
        """Append the file at ``path`` to ``out``, prefixed with its encoded length."""
        with open(path, 'rb') as f_in:
            byte_str = f_in.read()
        var_int_encode(len(byte_str), out)
        out.write(byte_str)

    # compress the data
    predict_lstm(X, Y, Y_original, timesteps, batch_size, alphabet_size, args.model_name)
    if l < len(series)-timesteps:
        predict_lstm(X[l:,:], Y[l:,:], Y_original[l:], timesteps, 1, alphabet_size, args.model_name, final_step = True)
    else:
        # Too few symbols remain for a model context: code them uniformly.
        with open(args.temp_file_prefix+'.last','wb') as f:
            bitout = arithmeticcoding_fast.BitOutputStream(f)
            enc = arithmeticcoding_fast.ArithmeticEncoder(32, bitout)
            prob = np.ones(alphabet_size)/alphabet_size
            cumul = np.zeros(alphabet_size+1, dtype = np.uint64)
            cumul[1:] = np.cumsum(prob*10000000 + 1)
            for j in range(l, len(series)):
                enc.write(cumul, series[j])
            enc.finish()
            bitout.close()

    # combine files into one file
    with open(args.output_file_prefix+'.combined','wb') as f:
        for i in range(batch_size):
            append_stream(f, args.temp_file_prefix+'.'+str(i))
        append_stream(f, args.temp_file_prefix+'.last')

    shutil.rmtree(args.temp_dir)
def compress(model, X, Y, bs, vocab_size, timesteps, device, optimizer, scheduler, final_step=False):
    """Arithmetic-encode ``Y`` with ``model`` while training it on already-coded data.

    With ``final_step=False`` the rows are split into ``bs`` streams that
    start ``num_iters`` rows apart; stream ``i`` is written to
    ``FLAGS.temp_file_prefix + '.' + str(i)``.  Symbols are processed in
    blocks of ``block_len`` (20) steps: every ``block_len``-th step the last
    ``block_len`` positions of all streams are predicted in one forward pass
    and encoded, and afterwards the model takes one optimizer step on those
    same positions.  Steps left over after the last full block are encoded
    one step at a time without training.

    With ``final_step=True`` all rows are encoded one at a time into
    ``FLAGS.temp_file_prefix + '.last'`` without training.

    In both modes the leading symbols of a stream are coded with a uniform
    distribution.

    Args:
        model: Callable returning two outputs per batch.  The first output is
            exponentiated to obtain probabilities, so it is presumably
            log-probabilities — confirm against the model definition.
        X: 2-D array of context windows, indexed as ``X[row, position]``.
        Y: Target symbol for each row of ``X``.
        bs: Number of parallel streams (batch mode only).
        vocab_size: Number of distinct symbols.
        timesteps: Number of leading symbols coded uniformly per stream.
        device: Torch device the input tensors are moved to.
        optimizer: Optimizer stepped once per block in batch mode.
        scheduler: Not used in this function.
        final_step: Select the single-stream ``.last`` mode.

    Returns:
        None.  The result is the set of files written to disk.
    """
    if not final_step:
        num_iters = (len(X) + timesteps) // bs
        print(num_iters)
        # ind[i] is the current row of stream i.
        ind = np.array(range(bs)) * num_iters
        # back_ind is not read anywhere below.
        back_ind = np.array(range(bs)) * num_iters

        f = [
            open(FLAGS.temp_file_prefix + '.' + str(i), 'wb') for i in range(bs)
        ]
        bitout = [
            arithmeticcoding_fast.BitOutputStream(f[i]) for i in range(bs)
        ]
        enc = [
            arithmeticcoding_fast.ArithmeticEncoder(32, bitout[i]) for i in range(bs)
        ]

        # Uniform table: probabilities are scaled to integer frequencies and
        # the "+ 1" keeps every symbol's interval non-empty.
        prob = np.ones(vocab_size) / vocab_size
        cumul = np.zeros(vocab_size + 1, dtype=np.uint64)
        cumul[1:] = np.cumsum(prob * 10000000 + 1)

        # Leading symbols of each stream are coded uniformly.
        for i in range(bs):
            for j in range(min(timesteps, num_iters)):
                enc[i].write(cumul, X[ind[i], j])

        block_len = 20
        # One cumulative table per (stream, position-in-block) pair.
        cumul = np.zeros((bs * block_len, vocab_size + 1), dtype=np.uint64)

        test_loss = 0
        batch_loss = 0
        train_loss = 0
        for j in (range(num_iters - timesteps)):
            # Encode phase: runs once a full block of steps has accumulated.
            if (j + 1) % block_len == 0:
                # Rows of the last block_len steps, oldest first; the entry
                # for stream i at block offset s sits at index i + bs * s.
                indices = np.concatenate(
                    [ind - p for p in range(block_len - 1, -1, -1)], axis=0)

                bx = Variable(torch.from_numpy(X[indices, :])).to(device)
                by = Variable(torch.from_numpy(Y[indices])).to(device)
                with torch.no_grad():
                    model.eval()
                    pred, _ = model(bx)
                    loss = loss_function(pred, by)
                    test_loss += loss.item() * block_len
                    batch_loss += loss.item() * block_len
                    prob = torch.exp(pred).detach().cpu().numpy()
                cumul[:, 1:] = np.cumsum(prob * 10000000 + 1, axis=1)

                # Write each stream's block in chronological order.
                for i in range(bs):
                    for s in range(block_len):
                        enc[i].write(cumul[i + bs * s, :], Y[indices[i + bs * s]])

            # Periodic progress report; the moving counters restart afterwards.
            if (j + 1) % 100 == 0:
                print(
                    "Iter {} Loss {:.4f} Moving Loss {:.4f} Train Loss {:.4f}".
                    format(j + 1, test_loss / (j + 1), batch_loss / 100,
                           train_loss / 100),
                    flush=True)
                batch_loss = 0
                train_loss = 0

            # Train phase: only after the block above has been encoded, so the
            # model is never updated on symbols that are not yet written.
            if (j + 1) % block_len == 0:
                indices = np.concatenate([ind - p for p in range(block_len)], axis=0)

                bx = Variable(torch.from_numpy(X[indices, :])).to(device)
                by = Variable(torch.from_numpy(Y[indices])).to(device)
                model.train()
                optimizer.zero_grad()
                pred1, pred2 = model(bx)
                loss2 = loss_function(pred2, by)
                loss = loss_function(pred1, by) + loss2
                train_loss += loss2.item() * block_len
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 5)
                optimizer.step()

            ind = ind + 1

        # Steps after the last complete block: encode one step at a time,
        # without any further training.
        cumul = np.zeros((bs, vocab_size + 1), dtype=np.uint64)
        start_ind = ((num_iters - timesteps) // block_len) * block_len
        ind = np.array(range(bs)) * num_iters + start_ind
        for j in (range(start_ind, num_iters - timesteps)):
            bx = Variable(torch.from_numpy(X[ind, :])).to(device)
            by = Variable(torch.from_numpy(Y[ind])).to(device)
            with torch.no_grad():
                model.eval()
                pred, _ = model(bx)
                loss = loss_function(pred, by)
                test_loss += loss.item()
                batch_loss += loss.item()
                prob = torch.exp(pred).detach().cpu().numpy()
            cumul[:, 1:] = np.cumsum(prob * 10000000 + 1, axis=1)
            for i in range(bs):
                enc[i].write(cumul[i, :], Y[ind[i]])
            ind = ind + 1

        # close files
        for i in range(bs):
            enc[i].finish()
            bitout[i].close()
            f[i].close()

    else:
        f = open(FLAGS.temp_file_prefix + '.last', 'wb')
        bitout = arithmeticcoding_fast.BitOutputStream(f)
        enc = arithmeticcoding_fast.ArithmeticEncoder(32, bitout)
        prob = np.ones(vocab_size) / vocab_size
        cumul = np.zeros(vocab_size + 1, dtype=np.uint64)
        cumul[1:] = np.cumsum(prob * 10000000 + 1)

        # Leading symbols are coded uniformly.
        for j in range(timesteps):
            enc.write(cumul, X[0, j])
        # Remaining symbols: one forward pass per row, no training.
        for i in (range(len(X))):
            bx = Variable(torch.from_numpy(X[i:i + 1, :])).to(device)
            with torch.no_grad():
                model.eval()
                pred, _ = model(bx)
                prob = torch.exp(pred).detach().cpu().numpy()
            cumul[1:] = np.cumsum(prob * 10000000 + 1)
            enc.write(cumul, Y[i])
        enc.finish()
        bitout.close()
        f.close()

    return
def main():
    """Compress ``FLAGS.file_name + '.npy'`` with a bootstrap + combined model.

    Steps:
      1. Read ``params_<file_name>``, reset the ``temp`` working directory
         and load the sequence.
      2. Write the run parameters to ``FLAGS.output + '.params'``.
      3. Size ``BootstrapNN`` / ``CombinedNN`` from the vocabulary size, load
         the bootstrap weights from ``FLAGS.model_weights_path`` and freeze
         every parameter whose name contains ``"bs"``.
      4. Encode the sequence with ``compress`` as ``FLAGS.bs`` parallel
         streams plus one ``.last`` stream for the remainder.
      5. Concatenate all streams, each prefixed with its variable-length
         encoded byte count, into ``FLAGS.output + '.combined'`` and delete
         the working directory.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu

    batch_size = FLAGS.bs
    timesteps = FLAGS.timesteps
    use_cuda = True

    FLAGS.params = "params_" + FLAGS.file_name
    with open(FLAGS.params, 'r') as f:
        params = json.load(f)

    # Start from an empty working directory.  rmtree alone is sufficient; the
    # former extra "rm -r" shell call was redundant and not portable.
    FLAGS.temp_dir = 'temp'
    if os.path.exists(FLAGS.temp_dir):
        shutil.rmtree(FLAGS.temp_dir)
    FLAGS.temp_file_prefix = FLAGS.temp_dir + "/compressed"
    os.makedirs(FLAGS.temp_dir, exist_ok=True)

    use_cuda = use_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    sequence = np.load(FLAGS.file_name + ".npy")
    vocab_size = len(np.unique(sequence))
    sequence = sequence.reshape(-1)
    series = sequence.copy()
    data = strided_app(series, timesteps + 1, 1)
    X = data[:, :-1]
    Y = data[:, -1]
    print("array type", X.dtype)

    params['len_series'] = len(series)
    params['bs'] = batch_size
    params['timesteps'] = timesteps

    with open(FLAGS.output + '.params', 'w') as f:
        json.dump(params, f, indent=4)

    # Default model sizes; overridden per vocabulary-size tier below.
    bsdic = {
        'vocab_size': vocab_size,
        'emb_size': 8,
        'length': timesteps,
        'jump': 16,
        'hdim1': 8,
        'hdim2': 16,
        'n_layers': 2,
        'bidirectional': True
    }
    comdic = {
        'vocab_size': vocab_size,
        'emb_size': 32,
        'length': timesteps,
        'hdim': 8
    }

    if vocab_size >= 1 and vocab_size <= 3:
        bsdic['hdim1'] = 8
        bsdic['hdim2'] = 16
        comdic['emb_size'] = 16
        comdic['hdim'] = 1024

    if vocab_size >= 4 and vocab_size <= 9:
        bsdic['hdim1'] = 32
        bsdic['hdim2'] = 16
        comdic['emb_size'] = 16
        comdic['hdim'] = 2048

    if vocab_size >= 10 and vocab_size < 128:
        bsdic['hdim1'] = 128
        bsdic['hdim2'] = 128
        bsdic['emb_size'] = 16
        comdic['emb_size'] = 32
        comdic['hdim'] = 2048

    if vocab_size >= 128:
        bsdic['hdim1'] = 128
        bsdic['hdim2'] = 256
        bsdic['emb_size'] = 16
        comdic['emb_size'] = 32
        comdic['hdim'] = 2048

    bsmodel = BootstrapNN(**bsdic).to(device)
    bsmodel.load_state_dict(torch.load(FLAGS.model_weights_path))
    comdic['bsNN'] = bsmodel
    commodel = CombinedNN(**comdic).to(device)

    # Freeze every parameter whose name contains "bs".
    for name, p in commodel.named_parameters():
        if "bs" in name:
            p.requires_grad = False

    optimizer = optim.Adam(commodel.parameters(), lr=5e-4, betas=(0.0, 0.999))
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, threshold=1e-2, patience=1000, cooldown=10000, min_lr=1e-4, verbose=True)

    def append_stream(out, path):
        """Append the file at ``path`` to ``out``, prefixed with its encoded length."""
        with open(path, 'rb') as f_in:
            byte_str = f_in.read()
        var_int_encode(len(byte_str), out)
        out.write(byte_str)

    l = int(len(series) / batch_size) * batch_size

    compress(commodel, X, Y, batch_size, vocab_size, timesteps, device, optimizer, scheduler)
    if l < len(series) - timesteps:
        compress(commodel, X[l:], Y[l:], 1, vocab_size, timesteps, device, optimizer, scheduler, final_step=True)
    else:
        # Too few symbols remain for a model context: code them uniformly.
        with open(FLAGS.temp_file_prefix + '.last', 'wb') as f:
            bitout = arithmeticcoding_fast.BitOutputStream(f)
            enc = arithmeticcoding_fast.ArithmeticEncoder(32, bitout)
            prob = np.ones(vocab_size) / vocab_size
            cumul = np.zeros(vocab_size + 1, dtype=np.uint64)
            cumul[1:] = np.cumsum(prob * 10000000 + 1)
            for j in range(l, len(series)):
                enc.write(cumul, series[j])
            enc.finish()
            bitout.close()

    # combine files into one file
    with open(FLAGS.output + '.combined', 'wb') as f:
        for i in range(batch_size):
            append_stream(f, FLAGS.temp_file_prefix + '.' + str(i))
        append_stream(f, FLAGS.temp_file_prefix + '.last')

    shutil.rmtree(FLAGS.temp_dir)
    print("Done")
def main():
    """Compress ``args.sequence_npy_file`` with ``predict_lstm`` and arithmetic coding.

    Steps:
      1. Create a temporary working directory and load the sequence.
      2. Write the run parameters to ``args.output_file_prefix + '.params'``.
      3. Encode the sequence as ``args.batch_size`` parallel streams plus one
         ``.last`` stream for the remainder.
      4. Concatenate all streams, each prefixed with its variable-length
         encoded byte count, into ``args.output_file_prefix + '.combined'``.

    The temporary directory is removed even when compression fails.
    """
    args.temp_dir = tempfile.mkdtemp()
    args.temp_file_prefix = args.temp_dir + "/compressed"
    try:
        tf.set_random_seed(42)
        np.random.seed(0)

        series = np.load(args.sequence_npy_file)
        series = series.reshape(-1, 1)

        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder.fit(series)

        batch_size = args.batch_size
        timesteps = 64

        with open(args.params_file, 'r') as f:
            params = json.load(f)

        params['len_series'] = len(series)
        params['bs'] = batch_size
        params['timesteps'] = timesteps

        with open(args.output_file_prefix+'.params','w') as f:
            json.dump(params, f, indent=4)

        alphabet_size = len(params['id2char_dict'])

        series = series.reshape(-1)
        data = strided_app(series, timesteps+1, 1)

        X = data[:, :-1]
        Y_original = data[:, -1:]
        Y = onehot_encoder.transform(Y_original)

        def append_stream(out, path):
            """Append the file at ``path`` to ``out``, prefixed with its encoded length."""
            with open(path, 'rb') as f_in:
                byte_str = f_in.read()
            var_int_encode(len(byte_str), out)
            out.write(byte_str)

        l = int(len(series)/batch_size)*batch_size

        predict_lstm(X, Y, Y_original, timesteps, batch_size, alphabet_size, args.model_name)
        if l < len(series)-timesteps:
            predict_lstm(X[l:,:], Y[l:,:], Y_original[l:], timesteps, 1, alphabet_size, args.model_name, final_step = True)
        else:
            # Too few symbols remain for a model context: code them uniformly.
            with open(args.temp_file_prefix+'.last','wb') as f:
                bitout = arithmeticcoding_fast.BitOutputStream(f)
                enc = arithmeticcoding_fast.ArithmeticEncoder(32, bitout)
                prob = np.ones(alphabet_size)/alphabet_size
                cumul = np.zeros(alphabet_size+1, dtype = np.uint64)
                cumul[1:] = np.cumsum(prob*10000000 + 1)
                for j in range(l, len(series)):
                    enc.write(cumul, series[j])
                enc.finish()
                bitout.close()

        # combine files into one file
        with open(args.output_file_prefix+'.combined','wb') as f:
            for i in range(batch_size):
                append_stream(f, args.temp_file_prefix+'.'+str(i))
            append_stream(f, args.temp_file_prefix+'.last')
    finally:
        shutil.rmtree(args.temp_dir)