Example #1
0
 def test_encode(self):
     """Round-trip strings through ByteCode, including the empty string.

     Each encoded sequence must end with ``STOP_CODE`` and must decode
     back to the original string.
     """
     bc = ByteCode("byte_values.txt")
     for text in ("hello", ''):
         # subTest labels a failure with the offending input and lets the
         # remaining inputs run instead of stopping at the first failure.
         with self.subTest(text=text):
             indices = bc.to_int_seq(text)
             self.assertEqual(indices[-1], bc.STOP_CODE)
             self.assertEqual(text, bc.to_string(indices))
Example #2
0
def sample(model_path,
           model_config,
           num_samples,
           byte_value_path,
           maxlen=200,
           temperature=1.0):
    """Draw samples from a saved LayerRNN and print each decoded string.

    Args:
        model_path: path to the model state dict file.
        model_config: path to the model config JSON. The keys read are
            'stop_token', 'input_size', 'hidden_size', 'num_layers' and
            'dropout'.
        num_samples: number of samples to draw from the model.
        byte_value_path: path to the txt file holding the list of unicode
            byte values.
        maxlen: int, max number of bytes to draw per sample.
        temperature: float > 0, sampling temperature. Lower temperature
            means more conservative sampling.

    Returns:
        1 if the model file is missing, 2 if the config file or the byte
        value file is missing (a message is written to stderr in each
        case). None once all samples have been printed.
    """
    # Check every input path up front so nothing is loaded on bad input.
    if not os.path.exists(model_path):
        print(f"Model file {model_path} not found.", file=sys.stderr)
        return 1
    if not os.path.exists(model_config):
        print(f"Model config file {model_config} not found.", file=sys.stderr)
        return 2
    if not os.path.exists(byte_value_path):
        print(f"Byte value file {byte_value_path} not found.", file=sys.stderr)
        return 2

    bc = ByteCode(byte_value_path)

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(model_config, encoding="utf-8") as f:
        config = json.load(f)

    stop_token = config['stop_token']
    model = LayerRNN(config['input_size'],
                     hidden_size=config['hidden_size'],
                     num_layers=config['num_layers'],
                     dropout=config['dropout'])

    if torch.cuda.is_available():
        print("Using gpu")
        device = torch.device("cuda:0")
    else:
        device = torch.device('cpu')
    model.to(device)

    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # that come from a trusted source.
    model.load_state_dict(torch.load(model_path, map_location=device))
    print(f"Loaded model from {model_path}.")
    model.eval()

    # Sampling is inference only: disable autograd so no graph is recorded
    # while the model is stepped.
    with torch.no_grad():
        for _ in range(num_samples):
            # model.sample returns three values; only the sampled codes
            # are needed here.
            bytestring, _probs, _entropies = model.sample(
                stop_token,
                maxlen=maxlen,
                temperature=temperature)
            print(bc.to_string(bytestring))
Example #3
0
    def test_dl_formats(self):
        """Check what ByteDataLoader yields for each ``pack_onehot`` setting.

        With ``pack_onehot=False`` the first element of a batch must be a
        list. With ``pack_onehot=True`` a batch is only drawn and unpacked;
        nothing about its type is asserted here.
        """
        bc = ByteCode("byte_values.txt")
        ds = ByteDataset("_bios.json", bc)

        dl = ByteDataLoader(ds, pack_onehot=False)
        onehot, targets = next(iter(dl))
        # assertIsInstance reports the actual type on failure, whereas
        # assertTrue(isinstance(...)) only says "False is not true".
        self.assertIsInstance(onehot, list)

        dl = ByteDataLoader(ds, pack_onehot=True)
        # Smoke check: drawing a batch and unpacking it as a pair must not
        # raise.
        onehot, targets = next(iter(dl))
Example #4
0
                opt_fname = os.path.join(expt_dir, f"opt_epoch_{ep}")
                torch.save(model.state_dict(), model_fname)
                torch.save(optimizer.state_dict(), opt_fname)

        model_fname = os.path.join(expt_dir, f"model_final_{ep}")
        opt_fname = os.path.join(expt_dir, f"opt_final_{ep}")
        torch.save(model.state_dict(), model_fname)
        torch.save(optimizer.state_dict(), opt_fname)
    finally:
        log()


# Script entry point: a small CPU-only training run of RNN on "_bios.json".
if __name__ == "__main__":

    # Build the byte code table, the dataset (placed on CPU) and a loader
    # that yields one sample per batch.
    fname = "_bios.json"
    bc = ByteCode("byte_values.txt")
    ds = ByteDataset(fname, bc, device=torch.device('cpu'))
    print(f"Loaded {len(ds)} samples")
    dl = ByteDataLoader(ds, batch_size=1)
    # The model is constructed from the number of codes in the table and
    # switched to training mode.
    rnn = RNN(bc.num_codes)
    rnn.train()
    epochs = 1
    lr = 1e-3
    # NOTE(review): `losses` and `lossfn` are not used anywhere in this
    # block. train() is not passed a loss function, so it may read them as
    # module globals — confirm before removing or renaming them.
    losses = []
    lossfn = nn.CrossEntropyLoss(reduction='none')

    optimizer = Adam(rnn.parameters(), lr=lr)

    # Run configuration is passed as a plain dict: one epoch, experiment
    # directory "tst", sample_step of 1; training runs on the CPU device.
    train(dl, rnn, optimizer, dict(epochs=epochs,
                                   expt_dir="tst",
                                   sample_step=1), torch.device('cpu'), bc)
Example #5
0
                inp = h[i]

            # apply linear to the upper hidden state and sample from the byte distribution.
            
            logits = self.linear(h[self.num_layers-1]).squeeze() / temperature
            probs = logits.softmax(0).detach()
            probs[probs<1e-12] = 1e-12
            entropies.append(-(probs * probs.log2()).sum().item())
            output = Categorical(logits=logits).sample().item()
            bytestring.append(output)
            probs_sampled.append(probs[output].item())

        if len(bytestring) == maxlen - 1:
            print(f"Warning - max length {maxlen} reached")
            if bytestring[-1] != stop_token:
                bytestring.append(stop_token)

        return bytestring, probs_sampled, entropies


if __name__ == "__main__":
    # Smoke run: build a LayerRNN with no weights loaded, draw a single
    # sample of at most 20 bytes, and print the decoded string.
    from data import ByteCode

    code_table = ByteCode("byte_values.txt")
    net = LayerRNN(input_size=code_table.num_codes)
    # Only the sampled codes are printed; the other two returned values
    # are not used here.
    sampled, _probs, _entropies = net.sample(code_table.STOP_CODE, maxlen=20)
    decoded = code_table.to_string(sampled)
    print(decoded)

  


Example #6
0
 def test__to_code(self):
     """``_to_code(-3)`` must equal the code stored under ``MISSING``.

     -3 stands in for a value that is not a byte value (presumably absent
     from the table loaded from byte_values.txt).
     """
     code_table = ByteCode("byte_values.txt")
     actual = code_table._to_code(-3)
     expected = code_table._byte_value_map[code_table.MISSING]
     self.assertEqual(actual, expected)
Example #7
0
 def test_to_int_seq(self):
     """A single space encodes as STOP_CODE, the code for byte 32, STOP_CODE."""
     code_table = ByteCode("byte_values.txt")
     encoded = code_table.to_int_seq(' ')
     stop = code_table.STOP_CODE
     # 32 is the byte value of the space character.
     expected = [stop, code_table._byte_value_map[32], stop]
     self.assertEqual(encoded, expected)