def text_generator(state_dict, given_starting_letter):
    """Generate and print a continuation of EXAMPLE_LETTER with freshly seeded GPT-2.

    NOTE(review): ``given_starting_letter`` is accepted but never used — the
    prompt is always the module-level EXAMPLE_LETTER; confirm against callers.
    """
    # Re-seed every RNG source so each call samples differently.
    rng_seed = random.randint(0, 2147483647)
    np.random.seed(rng_seed)
    torch.random.manual_seed(rng_seed)
    torch.cuda.manual_seed(rng_seed)

    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the model and load the supplied pretrained weights.
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(run_device)
    model.eval()

    context_tokens = enc.encode(EXAMPLE_LETTER)
    generated = 0
    out = sample_sequence(
        model=model,
        length=config.n_ctx // 2,
        context=context_tokens,
        start_token=None,
        batch_size=1,
        temperature=0.7,
        top_k=40,
        device=run_device,
    )
    # Drop the echoed prompt tokens; decode only the generated continuation.
    out = out[:, len(context_tokens):].tolist()
    text = enc.decode(out[0])
    print(text)
    return text
def main():
    """Encode the input text corpus and write it as a compressed .npz archive."""
    args = parser.parse_args()
    bpe = encoder.get_encoder()
    print('Reading files')
    chunks = load_dataset(bpe, args.in_text, args.combine)
    print('Writing', args.out_npz)
    np.savez_compressed(args.out_npz, *chunks)
def text_generator(state_dict):
    """Sample text from GPT-2 driven by module-level settings.

    Reads the globals args, args_quiet, args_nsamples, args_batch_size,
    args_length, args_unconditional, args_temperature, args_top_k and
    GPT2_seed_text; stores the last generated sample in the global
    GPT2_output and prints every sample.
    """
    global GPT2_output

    if args_quiet is False:
        print(args)
    assert args_nsamples % args_batch_size == 0

    # Fresh seed for all RNG sources on every call.
    chosen_seed = random.randint(0, 2147483647)
    np.random.seed(chosen_seed)
    torch.random.manual_seed(chosen_seed)
    torch.cuda.manual_seed(chosen_seed)

    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build GPT-2 and load the supplied weights.
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(target)
    model.eval()

    context_tokens = enc.encode(GPT2_seed_text)
    generated = 0
    for _ in range(args_nsamples // args_batch_size):
        out = sample_sequence(
            model=model,
            length=args_length,
            context=context_tokens if not args_unconditional else None,
            start_token=enc.encoder['<|endoftext|>'] if args_unconditional else None,
            batch_size=args_batch_size,
            temperature=args_temperature,
            top_k=args_top_k,
            device=target,
        )
        # Keep only the newly generated tokens.
        out = out[:, len(context_tokens):].tolist()
        for i in range(args_batch_size):
            generated += 1
            text = enc.decode(out[i])
            if args_quiet is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            GPT2_output = text
            print(text)
def text_generator(state_dict, param_prompt, param_nsamples, param_batch_size, param_length, param_temperature, param_top_k):
    """Sample GPT-2 continuations of ``param_prompt``.

    Returns the prompt concatenated with the *last* generated sample, or the
    prompt unchanged when no sampling iteration runs.
    """
    param_quiet = False          # hard-wired in the original; kept for parity
    param_unconditional = None   # always conditional generation

    if param_batch_size == -1:
        param_batch_size = 1
    assert param_nsamples % param_batch_size == 0

    # Fresh seed for every RNG source on each call.
    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load Model
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()

    if param_length == -1:
        param_length = config.n_ctx // 2
    elif param_length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % config.n_ctx)

    response = param_prompt
    context_tokens = enc.encode(param_prompt)
    generated = 0
    for _ in range(param_nsamples // param_batch_size):
        out = sample_sequence(
            model=model,
            length=param_length,
            context=context_tokens if not param_unconditional else None,
            start_token=enc.encoder['<|endoftext|>'] if param_unconditional else None,
            batch_size=param_batch_size,
            temperature=param_temperature,
            top_k=param_top_k,
            device=device,
        )
        out = out[:, len(context_tokens):].tolist()
        for i in range(param_batch_size):
            generated += 1
            text = enc.decode(out[i])
            # Each sample overwrites the previous one; the last sample wins.
            response = param_prompt + text
    return response
def text_generator(input_text):
    """Generate one continuation of ``input_text`` using the global gpt2_parameters.

    Returns the prompt plus the first generated sample; '<|endoftext|>' is
    rendered as ' (END-OF-TEXT)', otherwise '...' is appended. Uses the module
    globals gpt2_parameters (mutated in place), model_file and state_dict.
    """
    params = gpt2_parameters  # alias to the module-level settings dict
    if params.get("quiet") is False:
        print('GPT-2 parameters used: ' + str(params))
    if params.get("batch_size") == -1:
        params["batch_size"] = 1
    assert params.get("nsamples") % params.get("batch_size") == 0

    # Fresh seed for all RNG sources.
    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = get_encoder()
    # Echo the model configuration, then build/load the model.
    print(GPT2Config(model_file).output_config())
    config = GPT2Config(model_file)
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()

    if params.get("length") == -1:
        params["length"] = config.n_ctx // 2
    elif params.get("length") > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % config.n_ctx)

    print('TEXT INPUT: ' + input_text)
    context_tokens = enc.encode(input_text)
    generated = 0
    for _ in range(params.get("nsamples") // params.get("batch_size")):
        out = sample_sequence(
            model=model,
            length=params.get("length"),
            context=context_tokens if not params.get("unconditional") else None,
            start_token=enc.encoder['<|endoftext|>'] if params.get("unconditional") else None,
            batch_size=params.get("batch_size"),
            temperature=params.get("temperature"),
            top_k=params.get("top_k"),
            device=device,
        )
        out = out[:, len(context_tokens):].tolist()
        for i in range(params.get("batch_size")):
            generated += 1
            text = enc.decode(out[i])
            # Feed the sample back in as the next context.
            context_tokens = enc.encode(text)
            if params.get("quiet") is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            # Returns on the very first sample, exactly like the original.
            if '<|endoftext|>' in text:
                result = input_text + text.replace("<|endoftext|>", ' (END-OF-TEXT)')
            else:
                result = input_text + text + '...'
            print(result)
            return result
def calculateGroundValidityTensor(groundStrings: iter):
    """Compute the per-string error series ("ground validity") tensor.

    Each ground string is BPE-encoded and passed through errorSeries; the
    resulting radii sequences are collected in input order.

    Args:
        groundStrings: iterable of strings. Materialized up front so the
            progress-bar total is known even for generators (the original
            called len() directly, which fails on non-sized iterables).

    Returns:
        list of errorSeries(...) results, one per input string.
    """
    groundStrings = list(groundStrings)
    coder = get_encoder()
    gvTen = []
    gvBar = tqdm(total=len(groundStrings), desc="GroundValidity", position=0)
    try:
        for gs in groundStrings:
            tokens = coder.encode(gs)
            # errorSeries semantics are project-defined; the None second
            # argument mirrors the original call.
            radii = errorSeries(tokens, None)
            gvTen.append(radii)
            gvBar.update()
    finally:
        # BUG FIX: the bar was previously leaked (never closed).
        gvBar.close()
    return gvTen
def text_generator(state_dict, args):
    """Print args.nsamples GPT-2 continuations of args.text to stdout."""
    if args.quiet is False:
        print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    # Re-seed all RNG sources with a fresh random seed per invocation.
    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the model and load the pretrained weights.
    tokenizer = get_encoder()
    cfg = GPT2Config()
    lm = GPT2LMHeadModel(cfg)
    lm = load_weight(lm, state_dict)
    lm.to(run_device)
    lm.eval()

    if args.length == -1:
        args.length = cfg.n_ctx // 2
    elif args.length > cfg.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % cfg.n_ctx)

    print(args.text)
    context_tokens = tokenizer.encode(args.text)

    generated = 0
    for _ in range(args.nsamples // args.batch_size):
        out = sample_sequence(
            model=lm,
            length=args.length,
            context=context_tokens if not args.unconditional else None,
            start_token=tokenizer.encoder['<|endoftext|>'] if args.unconditional else None,
            batch_size=args.batch_size,
            temperature=args.temperature,
            top_k=args.top_k,
            device=run_device)
        # Strip the echoed prompt before decoding.
        out = out[:, len(context_tokens):].tolist()
        for i in range(args.batch_size):
            generated += 1
            sample_text = tokenizer.decode(out[i])
            if args.quiet is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(sample_text)
def text_generator_for_out(text, model, device, length=200, temperature=0.7, top_k=40, path_to_model=path):
    """Generate a continuation of ``text`` with an already-loaded GPT-2 model.

    Args:
        text: prompt string to continue.
        model: a loaded, eval-mode GPT2LMHeadModel.
        device: torch device the model lives on.
        length: number of tokens to generate; -1 means half the 1024 window.
        temperature, top_k: sampling controls passed to sample_sequence.
        path_to_model: unused legacy parameter, kept for caller compatibility.

    Returns:
        The decoded continuation (prompt tokens stripped).
    """
    print("text_generator_for_out", path)
    # NOTE(review): this existence check only prints — generation proceeds
    # either way; confirm whether a missing checkpoint should abort.
    if os.path.exists(path + '/' + 'gpt2-pytorch_model.bin'):
        print(path + '/' + 'gpt2-pytorch_model.bin')
    enc = get_encoder()
    quiet = False
    # BUG FIX: the original unconditionally reset length to 200 here, which
    # silently ignored the caller's `length` argument. The override is
    # removed; the default remains 200, so existing callers are unaffected.
    print("text_generator_for_out 1")
    if length == -1:
        length = 1024 // 2
    elif length > 1024:
        raise ValueError("Can't get samples longer than window size: %s" % 1024)
    context_tokens = enc.encode(text)
    generated = 0
    print("text_generator_for_out 2")
    for _ in range(1):
        out = sample_sequence(model=model,
                              length=length,
                              context=context_tokens,
                              start_token=None,
                              batch_size=1,
                              temperature=temperature,
                              top_k=top_k,
                              device=device)
        print("text_generator_for_out 3")
        # Drop the echoed prompt tokens before decoding.
        out = out[:, len(context_tokens):].tolist()
        for i in range(1):
            generated += 1
            text = enc.decode(out[i])
            print("text_generator_for_out 4")
            if quiet is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print("in big gen2", text)
    return text
def text_generator(seed, unconditional=False, nsamples=1, batch_size=-1, length=-1, temperature=0.7, top_k=40):
    """Generate a continuation of ``seed`` and return it wrapped in an HTML <div>.

    Relies on the module-level globals `model`, `config`, `device`, and the
    HTML helpers `add_content`, `header`, and `box`.
    """
    enc = get_encoder()
    context_tokens = enc.encode(seed)

    if batch_size == -1:
        batch_size = 1
    assert nsamples % batch_size == 0

    if length == -1:
        length = config.n_ctx // 2
    elif length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % config.n_ctx)

    out = sample_sequence(
        model=model,
        length=length,
        context=context_tokens if not unconditional else None,
        start_token=enc.encoder['<|endoftext|>'] if unconditional else None,
        batch_size=batch_size,
        temperature=temperature,
        top_k=top_k,
        device=device)

    # Strip the prompt, then decode and concatenate every sample.
    out = out[:, len(context_tokens):].tolist()
    text = ''.join(enc.decode(out[i]) for i in range(batch_size))

    # Assemble the response fragment: header, then seed + generated text box.
    html = add_content('', header('Input Seed ', color='black', gen_text='Network Output'))
    html = add_content(html, box(seed, text))
    return f'<div>{html}</div>'
def __init__(self):
    """Load the GPT-2 checkpoint from disk, build the model, and stash sampling state."""
    weights = torch.load(
        path.join(path.dirname(path.abspath(__file__)), 'gpt-2-Pytorch',
                  'gpt2-pytorch_model.bin'),
        map_location='cpu' if not torch.cuda.is_available() else None)

    # Re-seed every RNG source so each construction samples differently.
    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the model and load the checkpoint weights.
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, weights)
    model.to(device)
    model.eval()

    self.enc = enc
    self.batch_size = 1
    self.model = model
    # NOTE(review): the original computed a config.n_ctx // 2 default and
    # discarded it; the effective sample length has always been 20.
    self.length = 20
    self.device = device
def generate_ground_strings(
    model=model,        # NOTE(review): default bound to the module-level model at def time
    length=-1,          # -1 -> half the context window (resolved below)
    sentences=-1,       # -1 -> no sentence limit
    start_token=None,
    batch_size=1,
    context=None,
    temperature=1,
    top_k=0,            # 0 disables top-k filtering in top_k_logits
    device=device,      # NOTE(review): default bound at def time as well
    sample=True,        # True: multinomial sampling; False: greedy argmax
):
    """Autoregressively sample text and return it trimmed to full sentences.

    Exactly one of `start_token` / `context` must be provided. Token by
    token, decoded fragments are appended to `elaboration`; when `sentences`
    is set, generation stops once that many sentences are complete and the
    trimmed elaboration is returned.

    NOTE(review): when `sentences` == -1 (or the limit is never reached)
    there is no return statement after the loop, so the function returns
    None — confirm whether callers rely on this.
    """
    if start_token is None:
        assert context is not None, "Specify exactly one of start_token and context!"
        # Replicate the prompt across the batch: shape (batch_size, len(context)).
        context = (torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(
            batch_size, 1))
    else:
        assert context is None, "Specify exactly one of start_token and context!"
        context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)
    prev = context
    output = context
    past = None
    enc = get_encoder()
    elaboration = ""
    with torch.no_grad():
        # -- Check Context Size -- #
        if length == -1:
            length = gpt2_config.n_ctx // 2
        elif length > gpt2_config.n_ctx:
            raise ValueError("Can't get samples longer than window size: %s" %
                             gpt2_config.n_ctx)
        for i in range(length):
            # One decoding step; `past` caches attention state between steps.
            logits, past = model(prev, past=past)
            logits = logits[:, -1, :] / temperature
            logits = top_k_logits(logits, k=top_k)
            log_probs = F.softmax(logits, dim=-1)
            if sample:
                prev = torch.multinomial(log_probs, num_samples=1)
            else:
                _, prev = torch.topk(log_probs, k=1, dim=-1)
            output = torch.cat((output, prev), dim=1)
            # Decode only the most recent token of the first batch row.
            elab_fragment = enc.decode([output[0].tolist()[-1]])
            if (elab_fragment == "<|endoftext|>") or (
                (i + 1) == length) or ("\n" in elab_fragment):
                # try again
                # NOTE(review): rebinding the loop variable (`i = 0`) does NOT
                # restart a Python for-loop — this only resets `prev` to the
                # prompt and skips the fragment; confirm intended behavior.
                prev = context
                i = 0
                continue
            elaboration += elab_fragment
            # break at endoftext tag or length limit reached
            # limit number of sentences if parameter set
            if sentences != -1:
                possible_sentence = False
                # see if this is possibly the end of a sentence
                for char in elab_fragment:
                    if char in ".?!":  # checking for end punctuation
                        possible_sentence = True
                        break
                if possible_sentence:
                    sentence_count = count_sentences(elaboration)
                    if sentence_count >= sentences:
                        # stop elaborating, trim, and return.
                        # trim message to last punctuation, and return it.
                        elaboration = trim_to_sentence(elaboration)
                        return elaboration
import quotes
import json
from GPT2.model import GPT2LMHeadModel
from GPT2.utils import load_weight
from GPT2.config import GPT2Config
from GPT2.sample import sample_sequence
from GPT2.encoder import get_encoder

# Load Model
# NOTE(review): `torch` is used below but not imported in this chunk — it is
# presumably imported elsewhere in the file; confirm.
state_dict = torch.load(
    "gpt2-pytorch_model.bin",
    map_location="cpu" if not torch.cuda.is_available() else None,
)
enc = get_encoder()
config = GPT2Config()
model = GPT2LMHeadModel(config)
model = load_weight(model, state_dict)


def text_generator(model, text):
    """Set up fixed sampling parameters for generation.

    NOTE(review): this function appears truncated in this chunk — it
    normalizes batch_size but never encodes `text` or calls
    sample_sequence; confirm against the full file before relying on it.
    """
    nsamples = 1           # number of samples to draw
    batch_size = -1        # -1 is normalized to 1 below
    length = 200           # tokens to generate
    temperature = .7
    top_k = 40
    unconditional = False  # conditional generation on `text`
    if batch_size == -1:
        batch_size = 1
def main():
    """Fine-tune GPT-2 on a custom dataset.

    Parses command-line options, restores or initializes model/optimizer
    state, then runs an endless training loop that periodically writes
    checkpoints, generates unconditional samples, and reports validation
    loss. Stop with Ctrl-C; a final checkpoint is saved on interrupt.
    """
    parser = argparse.ArgumentParser(
        description='Fine-tune GPT-2 on your custom dataset.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset', metavar='PATH', type=str, required=True,
        help='Input file, directory, or glob pattern (utf-8 text, or preencoded .npz files).')
    parser.add_argument('--model_name', metavar='MODEL', type=str,
                        default='117M', help='Pretrained model name')
    parser.add_argument(
        '--combine', metavar='CHARS', type=int, default=50000,
        help='Concatenate input files with <|endoftext|> separator into chunks of this minimum size')
    parser.add_argument('--batch_size', metavar='SIZE', type=int, default=1,
                        help='Batch size')
    parser.add_argument('--learning_rate', metavar='LR', type=float,
                        default=0.00002, help='Learning rate for Adam')
    parser.add_argument('--accumulate_gradients', metavar='N', type=int,
                        default=1, help='Accumulate gradients across N minibatches.')
    parser.add_argument('--only_train_transformer_layers', default=False,
                        action='store_true',
                        help='Restrict training to the transformer blocks.')
    parser.add_argument('--optimizer', type=str, default='adam',
                        help='Optimizer. <adam|sgd>.')
    parser.add_argument(
        '--noise', type=float, default=0.0,
        help='Add noise to input training data to regularize against typos.')
    parser.add_argument('--top_k', type=int, default=40,
                        help='K for top-k sampling.')
    parser.add_argument(
        '--top_p', type=float, default=0.0,
        help='P for top-p sampling. Overrides top_k if set > 0.')
    parser.add_argument(
        '--restore_from', type=str, default='latest',
        help='Either "latest", "fresh", or a path to a checkpoint file')
    parser.add_argument(
        '--run_name', type=str, default='run1',
        help='Run id. Name of subdirectory in finetuned_models/')
    parser.add_argument('--sample_every', metavar='N', type=int, default=100,
                        help='Generate samples every N steps')
    parser.add_argument('--sample_length', metavar='TOKENS', type=int,
                        default=1023, help='Sample this many tokens')
    parser.add_argument('--sample_num', metavar='N', type=int, default=1,
                        help='Generate this many samples')
    parser.add_argument('--save_every', metavar='N', type=int, default=1000,
                        help='Write a checkpoint every N steps')
    parser.add_argument(
        '--val_dataset', metavar='PATH', type=str, default=None,
        help='Dataset for validation loss, defaults to --dataset.')
    parser.add_argument('--val_batch_size', metavar='SIZE', type=int,
                        default=2, help='Batch size for validation.')
    parser.add_argument('--val_batch_count', metavar='N', type=int,
                        default=40, help='Number of batches for validation.')
    parser.add_argument('--val_every', metavar='STEPS', type=int, default=0,
                        help='Calculate validation loss every STEPS steps.')

    # settings
    args = parser.parse_args()
    print(args)
    enc = get_encoder()
    config = get_config(args.model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GPT2LMHeadModel(config)

    # error checking
    if args.sample_length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         config.n_ctx)
    if args.model_name == '345M':
        args.memory_saving_gradients = True
        if args.optimizer == 'adam':
            args.only_train_transformer_layers = True

    # select variables to update while training
    all_vars = [tensor for tensor in model.parameters()]
    transformer_vars = [
        tensor for name, tensor in model.named_parameters()
        if 'transformer.h.' in name
    ]
    train_vars = transformer_vars if args.only_train_transformer_layers else all_vars

    # create optimizer
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(train_vars, lr=args.learning_rate)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(train_vars, lr=args.learning_rate)
    else:
        # BUG FIX: the original called exit('Bad optimizer:', args.optimizer);
        # exit() accepts a single argument, so that call itself raised
        # TypeError instead of reporting the bad flag.
        exit('Bad optimizer: %s' % args.optimizer)

    # load model
    if args.restore_from == 'latest':
        ckpt_path = get_latest_ckpt(os.path.join(CHECKPOINT_DIR, args.run_name))
        if ckpt_path is None:
            # No checkpoint yet: start from the pretrained weights.
            state_dict = get_state_dict(args.model_name)
            model = load_model(model, state_dict, device)
            counter = 1
        else:
            ckpt = torch.load(ckpt_path)
            model = load_model(model, ckpt['model_state_dict'], device)
            optimizer.load_state_dict(ckpt['optimizer_state_dict'])
            counter = ckpt['counter']
    elif args.restore_from == 'fresh':
        state_dict = get_state_dict(args.model_name)
        model = load_model(model, state_dict, device)
        counter = 1
    else:
        # path to a checkpoint tar file
        ckpt = torch.load(args.restore_from)
        model = load_model(model, ckpt['model_state_dict'], device)
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        counter = ckpt['counter']

    # load datasets
    print('load training dataset...')
    chunks = load_dataset(enc, args.dataset, args.combine)
    data_sampler = Sampler(chunks)
    print('dataset has {} tokens'.format(data_sampler.total_size))
    if args.val_every > 0:
        # Sample from validation set once with fixed seed to make
        # it deterministic during training as well as across runs.
        print('load validation dataset...')
        val_chunks = load_dataset(enc, args.val_dataset,
                                  args.combine) if args.val_dataset else chunks
        val_data_sampler = Sampler(val_chunks, seed=1)
        val_batches = torch.tensor([[
            val_data_sampler.sample(1024) for _ in range(args.val_batch_size)
        ] for _ in range(args.val_batch_count)])

    def save():
        """Write model/optimizer/counter to a checkpoint tar under CHECKPOINT_DIR."""
        maketree(os.path.join(CHECKPOINT_DIR, args.run_name))
        save_path = os.path.join(CHECKPOINT_DIR, args.run_name,
                                 'ckpt-{}.tar'.format(counter))
        torch.save(
            {
                'counter': counter,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
            }, save_path)

    def generate_samples():
        """Generate unconditional samples."""
        print('Generating samples...')
        generated = 0
        all_text = []
        for _ in range(args.sample_num):
            out = sample_sequence(model=model,
                                  length=args.sample_length,
                                  context=None,
                                  start_token=enc.encoder['<|endoftext|>'],
                                  batch_size=1,
                                  temperature=1.0,
                                  top_k=args.top_k,
                                  device=device)
            out = out[:, :].tolist()[0]
            generated += 1
            text = enc.decode(out)
            print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(text)
            all_text.append(text)
        maketree(os.path.join(SAMPLE_DIR, args.run_name))
        with open(
                os.path.join(SAMPLE_DIR, args.run_name,
                             'samples-{}.txt'.format(counter)), 'w') as fp:
            fp.write('\n'.join(all_text))

    def validation():
        """Report mean validation loss over the pre-sampled validation batches."""
        print('Calculating validation loss...')
        losses = []
        # BUG FIX: run without autograd and store python floats — the
        # original appended live loss tensors, retaining every batch's
        # computation graph in memory for the whole validation pass.
        with torch.no_grad():
            for batch in tqdm.tqdm(val_batches):
                loss = model(batch[:, :-1].to(device),
                             lm_labels=batch[:, 1:].to(device))
                losses.append(loss.item())
        v_val_loss = torch.mean(torch.tensor(losses))
        print('[{counter} | {time:2.2f}] validation loss = {loss:2.2f}'.format(
            counter=counter, time=time.time() - start_time, loss=v_val_loss))

    def sample_batch():
        """Draw one training batch of 1024-token sequences."""
        return torch.tensor(
            [data_sampler.sample(1024) for _ in range(args.batch_size)])

    avg_loss = (0.0, 0.0)
    start_time = time.time()

    # training
    try:
        while True:
            if counter % args.save_every == 0:
                save()
            if counter % args.sample_every == 0:
                generate_samples()
            if args.val_every > 0 and (counter % args.val_every == 0 or
                                       counter == 1):
                validation()

            if args.accumulate_gradients > 1:
                optimizer.zero_grad()
                # NOTE(review): gradients are summed (not averaged) over the
                # N accumulated minibatches, matching the original behavior.
                for _ in range(args.accumulate_gradients):
                    batch = sample_batch()
                    loss = model(batch[:, :-1].to(device),
                                 lm_labels=batch[:, 1:].to(device))
                    loss.backward()
                optimizer.step()
            else:
                optimizer.zero_grad()
                batch = sample_batch()
                loss = model(batch[:, :-1].to(device),
                             lm_labels=batch[:, 1:].to(device))
                loss.backward()
                optimizer.step()

            # BUG FIX: fold a detached float into the running average — the
            # original accumulated the loss *tensor*, keeping each step's
            # autograd graph alive for the lifetime of the run.
            loss_value = loss.item()
            avg_loss = (avg_loss[0] * 0.99 + loss_value,
                        avg_loss[1] * 0.99 + 1.0)
            print('[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}'.
                  format(counter=counter,
                         time=time.time() - start_time,
                         loss=loss_value,
                         avg=avg_loss[0] / avg_loss[1]))
            counter += 1
    except KeyboardInterrupt:
        print('interrupt')
        save()
def text_generator(state_dict):
    """Fetch a BBC headline, generate a GPT-2 continuation of it, and tweet the result.

    Relies on module globals: `requests`, `main_url` (news API endpoint),
    and `api` (presumably an authenticated tweepy client — confirm).
    """
    parser = argparse.ArgumentParser()
    #parser.add_argument("--text", type = file, required=True)
    parser.add_argument('filename')
    parser.add_argument("--quiet", type=bool, default=False)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument('--unconditional', action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=40)
    parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--top_k", type=int, default=40)
    args = parser.parse_args()

    # Pull current article titles from the news API; the second title becomes
    # the generation prompt.
    open_bbc_page = requests.get(main_url).json()
    article = open_bbc_page["articles"]
    results = []
    for ar in article:
        results.append(ar["title"])
    print(results[1])
    text1 = results[1]
    # NOTE(review): the positional `filename` is opened but never read (the
    # read is commented out), so the prompt stays the headline — confirm
    # whether file input was intended.
    with open(args.filename) as file:
        #text1 = file.read()
        print(text1)

    if args.quiet is False:
        print(args)
    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    # Seed every RNG source with a fresh random seed.
    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load Model
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = config.n_ctx // 2
    elif args.length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         config.n_ctx)

    print(text1)
    context_tokens = enc.encode(text1)
    generated = 0
    for _ in range(args.nsamples // args.batch_size):
        out = sample_sequence(
            model=model,
            length=args.length,
            context=context_tokens if not args.unconditional else None,
            start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
            batch_size=args.batch_size,
            temperature=args.temperature,
            top_k=args.top_k,
            device=device)
        # Drop the echoed prompt tokens; keep only the continuations.
        out = out[:, len(context_tokens):].tolist()
        for i in range(args.batch_size):
            generated += 1
            text = enc.decode(out[i])
            if args.quiet is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(text)
            # Prepend the headline and post the combined text.
            text = text1 + text
            api.update_status(status=text)
def generator(text):
    """Generate and return one GPT-2 continuation of ``text``."""
    state_dict = torch.load(
        'gpt2-pytorch_model.bin',
        map_location='cpu' if not torch.cuda.is_available() else None)

    # Fixed sampling settings (the argparse CLI version of this is retired).
    quiet = False
    nsamples = 1
    unconditional = False
    batch_size = 1  # the original normalized -1 -> 1; net effect is 1
    temperature = 0.7
    top_k = 40
    assert nsamples % batch_size == 0

    # Fresh seed for all RNG sources per call.
    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load Model
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()

    # length was hard-wired to -1, so generation always uses half the window.
    length = config.n_ctx // 2

    print(text)
    context_tokens = enc.encode(text)
    generated = 0
    for _ in range(nsamples // batch_size):
        out = sample_sequence(
            model=model,
            length=length,
            context=context_tokens if not unconditional else None,
            start_token=enc.encoder['<|endoftext|>'] if unconditional else None,
            batch_size=batch_size,
            temperature=temperature,
            top_k=top_k,
            device=device)
        out = out[:, len(context_tokens):].tolist()
        for i in range(batch_size):
            generated += 1
            text = enc.decode(out[i])
            if quiet is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            # Returns on the first sample, exactly like the original.
            return text
def text_generator(state_dict):
    """CLI entry point: sample GPT-2 continuations of --text and print them.

    Parses sampling options from the command line, loads the supplied
    weights into a fresh GPT2LMHeadModel, and prints --nsamples generated
    continuations of --text (prompt tokens stripped).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--text", type=str, required=True)
    parser.add_argument("--quiet", type=bool, default=False)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument('--unconditional', action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--top_k", type=int, default=40)
    args = parser.parse_args()

    if args.quiet is False:
        print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    # Fresh seed for all RNG sources per invocation.
    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load Model
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = config.n_ctx // 2
    elif args.length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         config.n_ctx)

    print(args.text)
    context_tokens = enc.encode(args.text)

    generated = 0
    for _ in range(args.nsamples // args.batch_size):
        out = sample_sequence(
            model=model,
            length=args.length,
            context=context_tokens if not args.unconditional else None,
            start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
            batch_size=args.batch_size,
            temperature=args.temperature,
            top_k=args.top_k,
            device=device)
        # Keep only the newly generated tokens (drop the echoed prompt).
        len_ctx_tokens = len(context_tokens)
        out = out[:, len_ctx_tokens:].tolist()
        for i in range(args.batch_size):
            generated += 1
            indexed_out = out[i]
            text = enc.decode(indexed_out)
            print("text", text)
            # BUG FIX: the original had a stray bare name `afaf` here
            # (leftover debug marker) that raised NameError at runtime;
            # it has been removed.
            if args.quiet is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(text)
import json
import tokensToUTF
from GPT2.encoder import get_encoder

inPath = ""
inFileName = "chapter 1"
tokensPerChunk = 65  # upper bound on tokens a chunk should contain
lowTokenBoundary = 20  # chunks at or below this count are flagged as too short

# Read the pre-chunked JSON produced by the chunking step.
inJSONfilePath = f"{inPath}{inFileName}_{tokensPerChunk}tkChunks.json"
with open(inJSONfilePath, "r", encoding="utf-8") as inFile:
    chunkList = json.loads(inFile.read())

fixEncodes = tokensToUTF.getFixEncodes()
encoder = get_encoder()

# Report chunks over the token budget and chunks that look suspiciously short.
for chunk in chunkList:
    chunkTokens = encoder.encode(chunk)
    if len(chunkTokens) > tokensPerChunk:
        print(
            f"'{chunk}'\nhas {len(chunkTokens)} tokens, which are {len(chunkTokens) - tokensPerChunk} too many!\n"
        )
    if len(chunkTokens) <= lowTokenBoundary:
        print(
            f"'{chunk}'\nhas {len(chunkTokens)} tokens, which is very little!\n"
        )
def tokenize(text: str):
    """Encode ``text`` into a list of GPT-2 BPE token ids."""
    return get_encoder().encode(text)
def detokenize(tokens: iter):
    """Decode a sequence of GPT-2 BPE token ids back into text."""
    return get_encoder().decode(tokens)
def calculateValidityTensor(
    groundTokens: iter,           # per-vector-index token lists for the ground strings
    groundValidityTensor: iter,   # per-vector-index error series for the ground strings
    perterbationTensor: iter,     # [sic] list of (symbol, plane) perturbation entries
    checkpoint: str = None,       # optional path of a previously saved partial result
):
    """Compute validity deltas for every perturbed string versus its ground string.

    For each (symbol, plane) entry, each vector's perturbed strings are
    tokenized and compared with the ground tokens; where they diverge, the
    validity (summed error) from the divergence point onward is recomputed
    and the delta against the ground validity is recorded (lower validity
    is better, so a positive delta means the perturbation improved it).

    Progress is shown on three stacked tqdm bars. After every symbol the
    partial result is written to args["vo"] (NOTE(review): `args` is a
    module-level dict here — confirm it is populated before this runs), so
    a later call with `checkpoint` set can resume where it stopped.

    Returns:
        list of (symbol, plane-of-validity-deltas) tuples.
    """
    validityTensor = []
    # Three nested progress bars: symbols / vectors within a symbol / strings.
    totalBar = tqdm(total=len(perterbationTensor), desc="Total", position=0)
    symbolBar = tqdm(total=len(perterbationTensor[0][1]), desc="TBD", position=1)
    vectorBar = tqdm(total=len(perterbationTensor[0][1][0]), desc="Vector", position=2)
    if checkpoint:
        with open(checkpoint, "r") as f:
            validityTensor = json.load(f)
        # don't recalculate any symbols that have already been done already
        already = len(validityTensor)
        perterbationTensor = perterbationTensor[already::]
        totalBar.update(already)
    coder = get_encoder()
    for sym, plane in perterbationTensor:
        logging.info("Started Symbol: " + sym)
        symbolBar.reset()
        symbolBar.set_description(sym)
        vPlane = []
        for i, vector in enumerate(plane):
            vVector = []
            vectorBar.reset(total=len(vector))
            for pString in vector:
                # tokenize pString
                pTokens = coder.encode(pString)
                # locate departure form ground tokens
                departure = firstMismatch(pTokens, groundTokens[i])
                if departure is not None:
                    # sum error up to agreement with groundTokens
                    agreement = sum(groundValidityTensor[i][:departure])
                    # calculate validity of peterbed string from departure onward
                    departureValidity = partialErrorSeries(pTokens, departure)
                    # calculate total validity
                    validity = agreement + sum(departureValidity)
                    # compare to ground validity
                    validity_delta = (sum(groundValidityTensor[i]) - validity
                                      )  # lower validity is better
                else:
                    # Identical token streams: no change in validity.
                    validity_delta = 0
                vVector.append(validity_delta)
                vectorBar.update()
            vPlane.append(vVector)
            symbolBar.update()
        validityTensor.append((sym, vPlane))
        totalBar.update()
        logging.info("Finished Symbol: " + sym)
        with open(args["vo"], "w") as f:  # save checkpoint
            json.dump(validityTensor, f)
    vectorBar.close()
    symbolBar.close()
    totalBar.close()
    return validityTensor