def test_encode(self):
    ds = ["hello", '']
    bc = ByteCode("byte_values.txt")
    for s in ds:
        indices = bc.to_int_seq(s)
        self.assertEqual(indices[-1], bc.STOP_CODE)
        self.assertEqual(s, bc.to_string(indices))
def sample(model_path, model_config, num_samples, byte_value_path, maxlen=200, temperature=1.0):
    """Draw num_samples strings from a trained model and print them.

    model_path: path to the model state dict file.
    model_config: path to the model config JSON.
    num_samples: number of samples to draw from the model.
    byte_value_path: path to a txt file holding the list of unicode byte values.
    maxlen: int, max number of bytes to draw per sample.
    temperature: float > 0, sampling temperature. Lower temperature means more
        conservative sampling.
    """
    if not os.path.exists(model_path):
        print(f"Model file {model_path} not found.", file=sys.stderr)
        return 1
    if not os.path.exists(model_config):
        print(f"Model config file {model_config} not found.", file=sys.stderr)
        return 2
    if not os.path.exists(byte_value_path):
        print(f"Byte value file {byte_value_path} not found.", file=sys.stderr)
        return 3
    bc = ByteCode(byte_value_path)
    with open(model_config) as f:
        config = json.load(f)
    stop_token = config['stop_token']
    model = LayerRNN(config['input_size'],
                     hidden_size=config['hidden_size'],
                     num_layers=config['num_layers'],
                     dropout=config['dropout'])
    if torch.cuda.is_available():
        print("Using gpu")
        device = torch.device("cuda:0")
    else:
        device = torch.device('cpu')
    model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    print(f"Loaded model from {model_path}.")
    model.eval()
    for __ in range(num_samples):
        bytestring, probs, entropy = model.sample(stop_token, maxlen=maxlen,
                                                  temperature=temperature)
        print(bc.to_string(bytestring))
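# Usage sketch for sample() above. The file names are hypothetical; the config
# keys are the ones read by the function:
#
#   sample("tst/model_final_0", "tst/config.json", num_samples=5,
#          byte_value_path="byte_values.txt", maxlen=200, temperature=0.8)
#
# Temperatures below 1.0 sharpen the softmax, so samples stick to the model's
# highest-probability bytes; temperatures above 1.0 flatten it toward uniform.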
def test_dl_formats(self):
    bc = ByteCode("byte_values.txt")
    ds = ByteDataset("_bios.json", bc)
    dl = ByteDataLoader(ds, pack_onehot=False)
    onehot, targets = next(iter(dl))
    self.assertTrue(isinstance(onehot, list))
    dl = ByteDataLoader(ds, pack_onehot=True)
    onehot, targets = next(iter(dl))
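# What the two pack_onehot modes are assumed to produce (a sketch, not tied to
# ByteDataLoader's real internals): pack_onehot=False yields a plain list of
# per-sample one-hot tensors, while pack_onehot=True presumably packs them with
# torch.nn.utils.rnn.pack_sequence so an RNN can consume the batch directly.
import torch
from torch.nn.utils.rnn import pack_sequence

seqs = [torch.eye(256)[:5], torch.eye(256)[:3]]  # two one-hot "byte" sequences
packed = pack_sequence(seqs, enforce_sorted=False)
print(type(packed).__name__)  # PackedSequence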
            opt_fname = os.path.join(expt_dir, f"opt_epoch_{ep}")
            torch.save(model.state_dict(), model_fname)
            torch.save(optimizer.state_dict(), opt_fname)
        model_fname = os.path.join(expt_dir, f"model_final_{ep}")
        opt_fname = os.path.join(expt_dir, f"opt_final_{ep}")
        torch.save(model.state_dict(), model_fname)
        torch.save(optimizer.state_dict(), opt_fname)
    finally:
        log()


if __name__ == "__main__":
    fname = "_bios.json"
    bc = ByteCode("byte_values.txt")
    ds = ByteDataset(fname, bc, device=torch.device('cpu'))
    print(f"Loaded {len(ds)} samples")
    dl = ByteDataLoader(ds, batch_size=1)
    rnn = RNN(bc.num_codes)
    rnn.train()
    epochs = 1
    lr = 1e-3
    losses = []
    lossfn = nn.CrossEntropyLoss(reduction='none')
    optimizer = Adam(rnn.parameters(), lr=lr)
    train(dl, rnn, optimizer,
          dict(epochs=epochs, expt_dir="tst", sample_step=1),
          torch.device('cpu'), bc)
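# Resume sketch for the checkpoints written above. The per-epoch model file name
# is assumed to be f"model_epoch_{ep}", mirroring opt_fname; its assignment
# falls just above this excerpt.
import os
import torch

def load_checkpoint(model, optimizer, expt_dir, ep):
    model.load_state_dict(torch.load(os.path.join(expt_dir, f"model_epoch_{ep}")))
    optimizer.load_state_dict(torch.load(os.path.join(expt_dir, f"opt_epoch_{ep}")))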
            inp = h[i]
        # Apply the output linear layer to the top hidden state and sample from
        # the byte distribution.
        logits = self.linear(h[self.num_layers - 1]).squeeze() / temperature
        probs = logits.softmax(0).detach()
        probs[probs < 1e-12] = 1e-12  # clamp so the entropy log2 never sees zero
        entropies.append(-(probs * probs.log2()).sum().item())
        output = Categorical(logits=logits).sample().item()
        bytestring.append(output)
        probs_sampled.append(probs[output].item())
        if len(bytestring) == maxlen - 1:
            print(f"Warning - max length {maxlen} reached")
    if bytestring[-1] != stop_token:
        bytestring.append(stop_token)
    return bytestring, probs_sampled, entropies


if __name__ == "__main__":
    from data import ByteCode

    byte_code = ByteCode("byte_values.txt")
    model = LayerRNN(input_size=byte_code.num_codes)
    b, p, e = model.sample(byte_code.STOP_CODE, maxlen=20)
    print(byte_code.to_string(b))
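# Standalone illustration of the temperature scaling and entropy bookkeeping in
# sample() above (the logits are made up):
import torch

logits = torch.tensor([2.0, 0.5, -1.0])
for temperature in (0.5, 1.0, 2.0):
    probs = (logits / temperature).softmax(0)
    entropy = -(probs * probs.log2()).sum().item()
    print(f"T={temperature}: probs={[round(p, 3) for p in probs.tolist()]}, "
          f"entropy={entropy:.3f} bits")
# Lower T concentrates mass on the argmax (lower entropy); higher T flattens it.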
def test__to_code(self):
    bc = ByteCode("byte_values.txt")
    not_a_byteval = -3
    self.assertEqual(bc._to_code(not_a_byteval), bc._byte_value_map[bc.MISSING])
def test_to_int_seq(self):
    s = ' '
    bc = ByteCode("byte_values.txt")
    self.assertEqual(bc.to_int_seq(s),
                     [bc.STOP_CODE, bc._byte_value_map[32], bc.STOP_CODE])
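# Interface the tests above rely on, as a sketch (ByteCodeSketch is hypothetical;
# the real ByteCode lives in data.py): to_int_seq wraps a string's byte codes in
# STOP_CODE on both ends, to_string inverts it, and unknown byte values fall back
# to a reserved MISSING code.
class ByteCodeSketch:
    MISSING = "<missing>"

    def __init__(self, path):
        with open(path) as f:
            byte_values = [int(line) for line in f if line.strip()]
        self.STOP_CODE = 0  # code 0 reserved as the string delimiter
        self._byte_value_map = {v: i + 1 for i, v in enumerate(byte_values)}
        self._byte_value_map[self.MISSING] = len(byte_values) + 1
        self.num_codes = len(byte_values) + 2  # byte codes + STOP + MISSING

    def _to_code(self, byte_value):
        return self._byte_value_map.get(byte_value, self._byte_value_map[self.MISSING])

    def to_int_seq(self, s):
        codes = [self._to_code(b) for b in s.encode("utf-8")]
        return [self.STOP_CODE] + codes + [self.STOP_CODE]

    def to_string(self, indices):
        inverse = {c: v for v, c in self._byte_value_map.items() if v != self.MISSING}
        return bytes(inverse[i] for i in indices if i in inverse).decode("utf-8")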