Example #1
def text_generator(state_dict, given_starting_letter):
    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()
    context_tokens = enc.encode(EXAMPLE_LETTER)
    generated = 0
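    # Generate up to n_ctx // 2 new tokens conditioned on the encoded EXAMPLE_LETTER prompt.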
    out = sample_sequence(
        model=model,
        length=config.n_ctx // 2,
        context=context_tokens,
        start_token=None,
        batch_size=1,
        temperature=0.7,
        top_k=40,
        device=device,
    )
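    # Keep only the newly generated tokens; the prompt occupies the first len(context_tokens) positions.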
    out = out[:, len(context_tokens):].tolist()
    text = enc.decode(out[0])
    print(text)
    return text
Example #2
def main():
    args = parser.parse_args()
    enc = encoder.get_encoder()
    print('Reading files')
    chunks = load_dataset(enc, args.in_text, args.combine)
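    # Each chunk is an array of token ids; write them all to one compressed .npz archive.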
    print('Writing', args.out_npz)
    np.savez_compressed(args.out_npz, *chunks)
Example #3
def text_generator(state_dict):
    # The original argparse block is replaced by hardcoded settings that mirror
    # its defaults, so the function can be called with only a state_dict.
    args_quiet = False
    args_nsamples = 1
    args_unconditional = False
    args_batch_size = 1
    args_temperature = 0.7
    args_top_k = 40

    if args_quiet is False:
        print("nsamples=%d, batch_size=%d, temperature=%.2f, top_k=%d" %
              (args_nsamples, args_batch_size, args_temperature, args_top_k))

    assert args_nsamples % args_batch_size == 0

    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load Model
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()

    # Generate up to half the model's context window.
    args_length = config.n_ctx // 2

    # GPT2_seed_text is a module-level global holding the prompt text.
    context_tokens = enc.encode(GPT2_seed_text)

    generated = 0
    for _ in range(args_nsamples // args_batch_size):
        out = sample_sequence(
            model=model, length=args_length,
            context=context_tokens if not args_unconditional else None,
            start_token=enc.encoder['<|endoftext|>'] if args_unconditional else None,
            batch_size=args_batch_size,
            temperature=args_temperature, top_k=args_top_k, device=device
        )
        out = out[:, len(context_tokens):].tolist()
        for i in range(args_batch_size):
            generated += 1
            text = enc.decode(out[i])
            if args_quiet is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            global GPT2_output
            GPT2_output = text
            print(text)
Example #4
File: test.py Project: pmerrill/ai
    def text_generator(state_dict, param_prompt, param_nsamples, param_batch_size, param_length, param_temperature, param_top_k):

        #param_prompt = "Peter was a man"
        param_quiet = False
        #param_nsamples = 1
        param_unconditional = None
        #param_batch_size = 1
        #param_length = 5
        #param_temperature = 0.95
        #param_top_k = 100

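        # -1 means "use the default": batch_size falls back to 1 and length to half the context window.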
        if param_batch_size == -1:
            param_batch_size = 1
        assert param_nsamples % param_batch_size == 0

        seed = random.randint(0, 2147483647)
        np.random.seed(seed)
        torch.random.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load Model
        enc = get_encoder()
        config = GPT2Config()
        model = GPT2LMHeadModel(config)
        model = load_weight(model, state_dict)
        model.to(device)
        model.eval()

        if param_length == -1:
            param_length = config.n_ctx // 2
        elif param_length > config.n_ctx:
            raise ValueError("Can't get samples longer than window size: %s" % config.n_ctx)

        response = param_prompt
        #print(param_prompt)
        context_tokens = enc.encode(param_prompt)

        generated = 0
        for _ in range(param_nsamples // param_batch_size):
            out = sample_sequence(
                model=model, length=param_length,
                context=context_tokens if not param_unconditional else None,
                start_token=enc.encoder['<|endoftext|>'] if param_unconditional else None,
                batch_size=param_batch_size,
                temperature=param_temperature, top_k=param_top_k, device=device
            )
            out = out[:, len(context_tokens):].tolist()
            for i in range(param_batch_size):
                generated += 1
                text = enc.decode(out[i])
                if param_quiet is False:
                    response = "=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40
                    #return("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                response = param_prompt + text
                #return(text)

        return response
Example #5
def text_generator(input_text):
    if gpt2_parameters.get("quiet") is False:
        print('GPT-2 parameters used: ' + str(gpt2_parameters))

    if gpt2_parameters.get("batch_size") == -1:
        gpt2_parameters["batch_size"] = 1
    assert gpt2_parameters.get("nsamples") % gpt2_parameters.get("batch_size") == 0

    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = get_encoder()

    config = GPT2Config(model_file)
    print(config.output_config())

    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()

    if gpt2_parameters.get("length") == -1:
        gpt2_parameters["length"] = config.n_ctx // 2
    elif gpt2_parameters.get("length") > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % config.n_ctx)

    print('TEXT INPUT: ' + input_text)
    context_tokens = enc.encode(input_text)

    generated = 0
    for _ in range(gpt2_parameters.get("nsamples") // gpt2_parameters.get("batch_size")):
        out = sample_sequence(
            model=model, length=gpt2_parameters.get("length"),
            context=context_tokens if not gpt2_parameters.get("unconditional") else None,
            start_token=enc.encoder['<|endoftext|>'] if gpt2_parameters.get("unconditional") else None,
            batch_size=gpt2_parameters.get("batch_size"),
            temperature=gpt2_parameters.get("temperature"), top_k=gpt2_parameters.get("top_k"), device=device
        )
        out = out[:, len(context_tokens):].tolist()
        for i in range(gpt2_parameters.get("batch_size")):
            generated += 1
            text = enc.decode(out[i])
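            # Re-encode the sample so it would seed the next batch of generations.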
            context_tokens = enc.encode(text)
            if gpt2_parameters.get("quiet") is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            if '<|endoftext|>' in text:
                print(input_text + text.replace("<|endoftext|>",' (END-OF-TEXT)'))
                return input_text + text.replace("<|endoftext|>",' (END-OF-TEXT)')
            else:
                print(input_text + text + '...')
                return input_text + text + '...'
Example #6
def calculateGroundValidityTensor(groundStrings: iter):
    gvBar = tqdm(total=len(groundStrings), desc="GroundValidity", position=0)
    gvTen = []
    coder = get_encoder()
    for gs in groundStrings:
        tokens = coder.encode(gs)
        radii = errorSeries(tokens, None)
        gvTen.append(radii)
        gvBar.update()
    return gvTen
Example #7
def text_generator(state_dict, args):
    if args.quiet is False:
        print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load Model
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()

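    # A length of -1 defaults to half the model's context window; anything longer than n_ctx is rejected.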
    if args.length == -1:
        args.length = config.n_ctx // 2
    elif args.length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         config.n_ctx)

    print(args.text)
    context_tokens = enc.encode(args.text)

    generated = 0
    for _ in range(args.nsamples // args.batch_size):
        out = sample_sequence(
            model=model,
            length=args.length,
            context=context_tokens if not args.unconditional else None,
            start_token=enc.encoder['<|endoftext|>']
            if args.unconditional else None,
            batch_size=args.batch_size,
            temperature=args.temperature,
            top_k=args.top_k,
            device=device)
        out = out[:, len(context_tokens):].tolist()
        for i in range(args.batch_size):
            generated += 1
            text = enc.decode(out[i])
            if args.quiet is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(text)
Example #8
def text_generator_for_out(text,
                           model,
                           device,
                           length=200,
                           temperature=0.7,
                           top_k=40,
                           path_to_model=path):
    print("text_generator_for_out", path)
    if os.path.exists(path + '/' + 'gpt2-pytorch_model.bin'):
        print(path + '/' + 'gpt2-pytorch_model.bin')
        enc = get_encoder()
        quiet = False
        length = 200
        print("text_generator_for_out 1")
        if length == -1:
            length = 1024 // 2
        elif length > 1024:
            raise ValueError("Can't get samples longer than window size: %s" %
                             1024)

        context_tokens = enc.encode(text)

        generated = 0
        print("text_generator_for_out 2")
        for _ in range(1):
            out = sample_sequence(model=model,
                                  length=length,
                                  context=context_tokens,
                                  start_token=None,
                                  batch_size=1,
                                  temperature=temperature,
                                  top_k=top_k,
                                  device=device)
            print("text_generator_for_out 3")
            out = out[:, len(context_tokens):].tolist()
            for i in range(1):
                generated += 1
                text = enc.decode(out[i])
                print("text_generator_for_out 4")
                if quiet is False:
                    print("=" * 40 + " SAMPLE " + str(generated) + " " +
                          "=" * 40)
                print("in big gen2", text)
                return text
Example #9
def text_generator(seed,
                   unconditional=False,
                   nsamples=1,
                   batch_size=-1,
                   length=-1,
                   temperature=0.7,
                   top_k=40):

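    # model, config, and device are module-level globals assumed to be initialized elsewhere.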
    enc = get_encoder()
    context_tokens = enc.encode(seed)

    if batch_size == -1:
        batch_size = 1
    assert nsamples % batch_size == 0

    if length == -1:
        length = config.n_ctx // 2
    elif length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         config.n_ctx)

    out = sample_sequence(
        model=model,
        length=length,
        context=context_tokens if not unconditional else None,
        start_token=enc.encoder['<|endoftext|>'] if unconditional else None,
        batch_size=batch_size,
        temperature=temperature,
        top_k=top_k,
        device=device)

    text = ''

    out = out[:, len(context_tokens):].tolist()
    for i in range(batch_size):
        text += enc.decode(out[i])

    html = ''
    html = add_content(
        html, header('Input Seed ', color='black', gen_text='Network Output'))
    html = add_content(html, box(seed, text))
    return f'<div>{html}</div>'
Example #10
    def __init__(self):

        state_dict = torch.load(
            (path.join(path.dirname(path.abspath(__file__)), 'gpt-2-Pytorch',
                       'gpt2-pytorch_model.bin')),
            map_location='cpu' if not torch.cuda.is_available() else None)

        batch_size = 1

        # assert nsamples % batch_size == 0

        seed = random.randint(0, 2147483647)
        np.random.seed(seed)
        torch.random.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load Model
        enc = get_encoder()
        config = GPT2Config()
        model = GPT2LMHeadModel(config)
        model = load_weight(model, state_dict)
        model.to(device)
        model.eval()

        length = -1
        if length == -1:
            length = config.n_ctx // 2
        elif length > config.n_ctx:
            raise ValueError("Can't get samples longer than window size: %s" %
                             config.n_ctx)

        self.enc = enc
        self.batch_size = batch_size
        self.model = model
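        # Generation length is fixed at 20 tokens here, overriding the n_ctx // 2 value computed above.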
        self.length = 20
        self.device = device
Example #11
def generate_ground_strings(
    model=model,
    length=-1,
    sentences=-1,
    start_token=None,
    batch_size=1,
    context=None,
    temperature=1,
    top_k=0,
    device=device,
    sample=True,
):
    if start_token is None:
        assert context is not None, "Specify exactly one of start_token and context!"
        context = (torch.tensor(context, device=device,
                                dtype=torch.long).unsqueeze(0).repeat(
                                    batch_size, 1))
    else:
        assert context is None, "Specify exactly one of start_token and context!"
        context = torch.full((batch_size, 1),
                             start_token,
                             device=device,
                             dtype=torch.long)
    prev = context
    output = context
    past = None
    enc = get_encoder()
    elaboration = ""
    with torch.no_grad():
        # -- Check Context Size -- #
        if length == -1:
            length = gpt2_config.n_ctx // 2
        elif length > gpt2_config.n_ctx:
            raise ValueError("Can't get samples longer than window size: %s" %
                             gpt2_config.n_ctx)
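        # Autoregressive decoding: feed the newest token back in and reuse the cached attention state in `past`.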
        for i in range(length):
            logits, past = model(prev, past=past)
            logits = logits[:, -1, :] / temperature
            logits = top_k_logits(logits, k=top_k)
            log_probs = F.softmax(logits, dim=-1)
            if sample:
                prev = torch.multinomial(log_probs, num_samples=1)
            else:
                _, prev = torch.topk(log_probs, k=1, dim=-1)
            output = torch.cat((output, prev), dim=1)
            elab_fragment = enc.decode([output[0].tolist()[-1]])
            if (elab_fragment == "<|endoftext|>") or (
                (i + 1) == length) or ("\n" in elab_fragment):
                # try again
                prev = context
                i = 0
                continue
            elaboration += elab_fragment
            # break at endoftext tag or length limit reached
            # limit number of sentences if parameter set
            if sentences != -1:
                possible_sentence = False
                # see if this is possibly the end of a sentence
                for char in elab_fragment:
                    if char in ".?!":  # checking for end punctuation
                        possible_sentence = True
                        break
                if possible_sentence:
                    sentence_count = count_sentences(elaboration)
                    if sentence_count >= sentences:  # stop elaborating, trim, and return.
                        # trim message to last punctuation, and return it.
                        elaboration = trim_to_sentence(elaboration)
                        return elaboration
Example #12
import quotes
import json

import torch

from GPT2.model import GPT2LMHeadModel
from GPT2.utils import load_weight
from GPT2.config import GPT2Config
from GPT2.sample import sample_sequence
from GPT2.encoder import get_encoder

# Load Model
state_dict = torch.load(
    "gpt2-pytorch_model.bin",
    map_location="cpu" if not torch.cuda.is_available() else None,
)
enc = get_encoder()
config = GPT2Config()
model = GPT2LMHeadModel(config)
model = load_weight(model, state_dict)


def text_generator(model, text):
    nsamples = 1
    batch_size = -1
    length = 200
    temperature = .7
    top_k = 40
    unconditional = False

    if batch_size == -1:
        batch_size = 1
Example #13
def main():
    parser = argparse.ArgumentParser(
        description='Fine-tune GPT-2 on your custom dataset.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        '--dataset',
        metavar='PATH',
        type=str,
        required=True,
        help=
        'Input file, directory, or glob pattern (utf-8 text, or preencoded .npz files).'
    )
    parser.add_argument('--model_name',
                        metavar='MODEL',
                        type=str,
                        default='117M',
                        help='Pretrained model name')
    parser.add_argument(
        '--combine',
        metavar='CHARS',
        type=int,
        default=50000,
        help=
        'Concatenate input files with <|endoftext|> separator into chunks of this minimum size'
    )

    parser.add_argument('--batch_size',
                        metavar='SIZE',
                        type=int,
                        default=1,
                        help='Batch size')
    parser.add_argument('--learning_rate',
                        metavar='LR',
                        type=float,
                        default=0.00002,
                        help='Learning rate for Adam')
    parser.add_argument('--accumulate_gradients',
                        metavar='N',
                        type=int,
                        default=1,
                        help='Accumulate gradients across N minibatches.')
    parser.add_argument('--only_train_transformer_layers',
                        default=False,
                        action='store_true',
                        help='Restrict training to the transformer blocks.')
    parser.add_argument('--optimizer',
                        type=str,
                        default='adam',
                        help='Optimizer. <adam|sgd>.')
    parser.add_argument(
        '--noise',
        type=float,
        default=0.0,
        help='Add noise to input training data to regularize against typos.')

    parser.add_argument('--top_k',
                        type=int,
                        default=40,
                        help='K for top-k sampling.')
    parser.add_argument(
        '--top_p',
        type=float,
        default=0.0,
        help='P for top-p sampling. Overrides top_k if set > 0.')

    parser.add_argument(
        '--restore_from',
        type=str,
        default='latest',
        help='Either "latest", "fresh", or a path to a checkpoint file')
    parser.add_argument(
        '--run_name',
        type=str,
        default='run1',
        help='Run id. Name of subdirectory in finetuned_models/')
    parser.add_argument('--sample_every',
                        metavar='N',
                        type=int,
                        default=100,
                        help='Generate samples every N steps')
    parser.add_argument('--sample_length',
                        metavar='TOKENS',
                        type=int,
                        default=1023,
                        help='Sample this many tokens')
    parser.add_argument('--sample_num',
                        metavar='N',
                        type=int,
                        default=1,
                        help='Generate this many samples')
    parser.add_argument('--save_every',
                        metavar='N',
                        type=int,
                        default=1000,
                        help='Write a checkpoint every N steps')

    parser.add_argument(
        '--val_dataset',
        metavar='PATH',
        type=str,
        default=None,
        help='Dataset for validation loss, defaults to --dataset.')
    parser.add_argument('--val_batch_size',
                        metavar='SIZE',
                        type=int,
                        default=2,
                        help='Batch size for validation.')
    parser.add_argument('--val_batch_count',
                        metavar='N',
                        type=int,
                        default=40,
                        help='Number of batches for validation.')
    parser.add_argument('--val_every',
                        metavar='STEPS',
                        type=int,
                        default=0,
                        help='Calculate validation loss every STEPS steps.')

    # settings
    args = parser.parse_args()
    print(args)

    enc = get_encoder()
    config = get_config(args.model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GPT2LMHeadModel(config)

    # error checking
    if args.sample_length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         config.n_ctx)

    if args.model_name == '345M':
        args.memory_saving_gradients = True
        if args.optimizer == 'adam':
            args.only_train_transformer_layers = True

    # select variables to update while training
    all_vars = [tensor for tensor in model.parameters()]
    transformer_vars = [
        tensor for name, tensor in model.named_parameters()
        if 'transformer.h.' in name
    ]
    train_vars = transformer_vars if args.only_train_transformer_layers else all_vars

    # create optimizer
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(train_vars, lr=args.learning_rate)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(train_vars, lr=args.learning_rate)
    else:
        exit('Bad optimizer: %s' % args.optimizer)

    # load model
    if args.restore_from == 'latest':
        ckpt_path = get_latest_ckpt(os.path.join(CHECKPOINT_DIR,
                                                 args.run_name))

        if ckpt_path is None:
            state_dict = get_state_dict(args.model_name)
            model = load_model(model, state_dict, device)
            counter = 1

        else:
            ckpt = torch.load(ckpt_path)
            model = load_model(model, ckpt['model_state_dict'], device)
            optimizer.load_state_dict(ckpt['optimizer_state_dict'])
            counter = ckpt['counter']

    elif args.restore_from == 'fresh':
        state_dict = get_state_dict(args.model_name)
        model = load_model(model, state_dict, device)
        counter = 1

    else:  # path to a checkpoint tar file
        ckpt = torch.load(args.restore_from)
        model = load_model(model, ckpt['model_state_dict'], device)
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        counter = ckpt['counter']

    # load datasets
    print('load training dataset...')
    chunks = load_dataset(enc, args.dataset, args.combine)
    data_sampler = Sampler(chunks)
    print('dataset has {} tokens'.format(data_sampler.total_size))

    if args.val_every > 0:
        # Sample from validation set once with fixed seed to make
        # it deterministic during training as well as across runs.
        print('load validation dataset...')
        val_chunks = load_dataset(enc, args.val_dataset,
                                  args.combine) if args.val_dataset else chunks
        val_data_sampler = Sampler(val_chunks, seed=1)
        val_batches = torch.tensor([[
            val_data_sampler.sample(1024) for _ in range(args.val_batch_size)
        ] for _ in range(args.val_batch_count)])

    def save():
        maketree(os.path.join(CHECKPOINT_DIR, args.run_name))
        save_path = os.path.join(CHECKPOINT_DIR, args.run_name,
                                 'ckpt-{}.tar'.format(counter))
        torch.save(
            {
                'counter': counter,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
            }, save_path)

    def generate_samples():
        """Generate unconditional samples."""
        print('Generating samples...')

        generated = 0
        all_text = []

        for _ in range(args.sample_num):
            out = sample_sequence(model=model,
                                  length=args.sample_length,
                                  context=None,
                                  start_token=enc.encoder['<|endoftext|>'],
                                  batch_size=1,
                                  temperature=1.0,
                                  top_k=args.top_k,
                                  device=device)

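            # Unconditional samples start from <|endoftext|>; keep the whole sequence (there is no prompt to strip).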
            out = out[:, :].tolist()[0]
            generated += 1
            text = enc.decode(out)
            print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(text)
            all_text.append(text)

        maketree(os.path.join(SAMPLE_DIR, args.run_name))
        with open(
                os.path.join(SAMPLE_DIR, args.run_name,
                             'samples-{}.txt'.format(counter)), 'w') as fp:
            fp.write('\n'.join(all_text))

    def validation():
        print('Calculating validation loss...')
        losses = []
        for batch in tqdm.tqdm(val_batches):
            loss = model(batch[:, :-1].to(device),
                         lm_labels=batch[:, 1:].to(device))
            losses.append(loss)
        v_val_loss = torch.mean(torch.tensor(losses))
        print('[{counter} | {time:2.2f}] validation loss = {loss:2.2f}'.format(
            counter=counter, time=time.time() - start_time, loss=v_val_loss))

    def sample_batch():
        return torch.tensor(
            [data_sampler.sample(1024) for _ in range(args.batch_size)])

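    # Running loss as an exponential moving average: (weighted loss sum, weighted count).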
    avg_loss = (0.0, 0.0)
    start_time = time.time()

    # training
    try:
        while True:
            if counter % args.save_every == 0:
                save()
            if counter % args.sample_every == 0:
                generate_samples()
            if args.val_every > 0 and (counter % args.val_every == 0
                                       or counter == 1):
                validation()

            if args.accumulate_gradients > 1:
                optimizer.zero_grad()

                # Accumulate gradients over N minibatches, then take one step.
                for _ in range(args.accumulate_gradients):
                    batch = sample_batch()
                    loss = model(batch[:, :-1].to(device),
                                 lm_labels=batch[:, 1:].to(device))
                    (loss / args.accumulate_gradients).backward()
                optimizer.step()

            else:
                optimizer.zero_grad()
                batch = sample_batch()
                loss = model(batch[:, :-1].to(device),
                             lm_labels=batch[:, 1:].to(device))
                loss.backward()
                optimizer.step()

            # Use .item() so the running average holds plain floats rather than
            # tensors that keep the autograd graph alive.
            avg_loss = (avg_loss[0] * 0.99 + loss.item(),
                        avg_loss[1] * 0.99 + 1.0)

            print('[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}'.
                  format(counter=counter,
                         time=time.time() - start_time,
                         loss=loss.item(),
                         avg=avg_loss[0] / avg_loss[1]))

            counter += 1

    except KeyboardInterrupt:
        print('interrupt')
        save()
Example #14
def text_generator(state_dict):
    parser = argparse.ArgumentParser()
    #parser.add_argument("--text", type = file, required=True)
    parser.add_argument('filename')

    parser.add_argument("--quiet", type=bool, default=False)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=40)
    parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--top_k", type=int, default=40)
    args = parser.parse_args()

    open_bbc_page = requests.get(main_url).json()

    article = open_bbc_page["articles"]

    results = []

    for ar in article:
        results.append(ar["title"])

    print(results[1])
    text1 = results[1]
    with open(args.filename) as file:
        #text1 = file.read()
        print(text1)

        if args.quiet is False:
            print(args)

        if args.batch_size == -1:
            args.batch_size = 1
        assert args.nsamples % args.batch_size == 0

        seed = random.randint(0, 2147483647)
        np.random.seed(seed)
        torch.random.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load Model
        enc = get_encoder()
        config = GPT2Config()
        model = GPT2LMHeadModel(config)
        model = load_weight(model, state_dict)
        model.to(device)
        model.eval()

        if args.length == -1:
            args.length = config.n_ctx // 2
        elif args.length > config.n_ctx:
            raise ValueError("Can't get samples longer than window size: %s" %
                             config.n_ctx)

        print(text1)
        context_tokens = enc.encode(text1)

        generated = 0
        for _ in range(args.nsamples // args.batch_size):
            out = sample_sequence(
                model=model,
                length=args.length,
                context=context_tokens if not args.unconditional else None,
                start_token=enc.encoder['<|endoftext|>']
                if args.unconditional else None,
                batch_size=args.batch_size,
                temperature=args.temperature,
                top_k=args.top_k,
                device=device)
            out = out[:, len(context_tokens):].tolist()
            for i in range(args.batch_size):
                generated += 1
                text = enc.decode(out[i])
                if args.quiet is False:
                    print("=" * 40 + " SAMPLE " + str(generated) + " " +
                          "=" * 40)
                print(text)
                text = text1 + text
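                # `api` (defined elsewhere, presumably a Twitter client) posts the headline plus its generated continuation.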
                api.update_status(status=text)
Example #15
def generator(text):
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--text", type=str, required=True)
    # parser.add_argument("--quiet", type=bool, default=False)
    # parser.add_argument("--nsamples", type=int, default=1)
    # parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
    # parser.add_argument("--batch_size", type=int, default=-1)
    # parser.add_argument("--length", type=int, default=-1)
    # parser.add_argument("--temperature", type=float, default=0.7)
    # parser.add_argument("--top_k", type=int, default=40)
    # args = parser.parse_args()

    state_dict = torch.load(
        'gpt2-pytorch_model.bin',
        map_location='cpu' if not torch.cuda.is_available() else None)

    input = text
    quiet = False
    nsamples = 1
    unconditional = False
    batch_size = -1
    length = -1
    temperature = 0.7
    top_k = 40

    if batch_size == -1:
        batch_size = 1
    assert nsamples % batch_size == 0

    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load Model
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()

    if length == -1:
        length = config.n_ctx // 2
    elif length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         config.n_ctx)

    print(text)
    context_tokens = enc.encode(text)

    generated = 0
    for _ in range(nsamples // batch_size):
        out = sample_sequence(
            model=model,
            length=length,
            context=context_tokens if not unconditional else None,
            start_token=enc.encoder['<|endoftext|>']
            if unconditional else None,
            batch_size=batch_size,
            temperature=temperature,
            top_k=top_k,
            device=device)
        out = out[:, len(context_tokens):].tolist()
        for i in range(batch_size):
            generated += 1
            text = enc.decode(out[i])
            if quiet is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            return text
Example #16
def text_generator(state_dict):
    parser = argparse.ArgumentParser()
    parser.add_argument("--text", type=str, required=True)
    parser.add_argument("--quiet", type=bool, default=False)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--top_k", type=int, default=40)
    args = parser.parse_args()

    # ================================================================================
    if args.quiet is False:
        print(args)

    # ================================================================================
    if args.batch_size == -1:
        args.batch_size = 1

    # ================================================================================
    assert args.nsamples % args.batch_size == 0

    # ================================================================================
    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ================================================================================
    # Load Model
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)

    # ================================================================================
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()

    # ================================================================================
    if args.length == -1:
        args.length = config.n_ctx // 2
    elif args.length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         config.n_ctx)

    # ================================================================================
    print(args.text)
    # I use computer

    # ================================================================================
    context_tokens = enc.encode(args.text)
    # afaf 2: context_tokens = enc.encode(args.text)
    # print("context_tokens",context_tokens)
    # [40, 779, 3644]

    # ================================================================================
    # print("args.length",args.length)
    # 512

    generated = 0
    for _ in range(args.nsamples // args.batch_size):
        out = sample_sequence(
            model=model,
            length=args.length,
            context=context_tokens if not args.unconditional else None,
            start_token=enc.encoder['<|endoftext|>']
            if args.unconditional else None,
            batch_size=args.batch_size,
            temperature=args.temperature,
            top_k=args.top_k,
            device=device)
        # afaf 5: out = sample_sequence(

        # print("out",out)
        # tensor([[   40,   779,  3644,  1143,  3788,   284,  2198,   262,  2033,   286,
        #           1321,   287,   262,  2393,    11,   290,   788,  4866,   340,   284,

        # print("out",out.shape)
        # torch.Size([1, 515])

        len_ctx_tokens = len(context_tokens)
        # print("len_ctx_tokens",len_ctx_tokens)
        # 3

        out = out[:, len_ctx_tokens:].tolist()

        # ================================================================================
        # print("args.batch_size",args.batch_size)
        # 1
        for i in range(args.batch_size):
            generated += 1

            # ================================================================================
            # print("out",out)
            # [[3783, 11, 543, 318, 257, 1688, 636, 286, 616, 3047, 290, 318, 257, 845,
            # print("out",len(out))
            # 1

            # ================================================================================
            indexed_out = out[i]
            # print("indexed_out",indexed_out)
            # [5479, 588, 9678, 290, 24134, 284, 16481, 1366, 287, 257, 30117, 13, 383, 1917, 318, 326,
            # print("indexed_out",len(indexed_out))
            # 512

            # ================================================================================
            text = enc.decode(indexed_out)
            print("text", text)
            # terminals with Ethernet cable to connect the computer to a computer system that has a computer terminal.
            # An additional feature

            # ================================================================================
            if args.quiet is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(text)
Example #17
import json
import tokensToUTF
from GPT2.encoder import get_encoder

inPath = ""
inFileName = "chapter 1"

tokensPerChunk = 65  # number of tokens a chunk should have at most

lowTokenBoundary = 20

inJSONfilePath = f"{inPath}{inFileName}_{tokensPerChunk}tkChunks.json"

inJSON = open(inJSONfilePath, "r", encoding="utf-8").read()
chunkList = json.loads(inJSON)

fixEncodes = tokensToUTF.getFixEncodes()
encoder = get_encoder()

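# Re-tokenize every chunk and flag those over the token budget or suspiciously short.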
for chunk in chunkList:
    chunkTokens = encoder.encode(chunk)

    if len(chunkTokens) > tokensPerChunk:
        print(
            f"'{chunk}'\nhas {len(chunkTokens)} tokens, which are {len(chunkTokens) - tokensPerChunk} too many!\n"
        )

    if len(chunkTokens) <= lowTokenBoundary:
        print(
            f"'{chunk}'\nhas {len(chunkTokens)} tokens, which is very little!\n"
        )
Example #18
def tokenize(text: str):
    enc = get_encoder()
    tokens = enc.encode(text)
    return tokens
Example #19
def detokenize(tokens: iter):
    enc = get_encoder()
    text = enc.decode(tokens)
    return text
Example #20
def calculateValidityTensor(
    groundTokens: iter,
    groundValidityTensor: iter,
    perterbationTensor: iter,
    checkpoint: str = None,
):
    validityTensor = []
    totalBar = tqdm(total=len(perterbationTensor), desc="Total", position=0)
    symbolBar = tqdm(total=len(perterbationTensor[0][1]),
                     desc="TBD",
                     position=1)
    vectorBar = tqdm(total=len(perterbationTensor[0][1][0]),
                     desc="Vector",
                     position=2)

    if checkpoint:
        with open(checkpoint, "r") as f:
            validityTensor = json.load(f)
        # don't recalculate any symbols that have already been done
        already = len(validityTensor)
        perterbationTensor = perterbationTensor[already::]
        totalBar.update(already)

    coder = get_encoder()
    for sym, plane in perterbationTensor:
        logging.info("Started Symbol: " + sym)
        symbolBar.reset()
        symbolBar.set_description(sym)
        vPlane = []
        for i, vector in enumerate(plane):
            vVector = []
            vectorBar.reset(total=len(vector))
            for pString in vector:
                # tokenize pString
                pTokens = coder.encode(pString)
                # locate departure from ground tokens
                departure = firstMismatch(pTokens, groundTokens[i])
                if departure is not None:
                    # sum error up to agreement with groundTokens
                    agreement = sum(groundValidityTensor[i][:departure])
                    # calculate validity of the perturbed string from departure onward
                    departureValidity = partialErrorSeries(pTokens, departure)
                    # calculate total validity
                    validity = agreement + sum(departureValidity)
                    # compare to ground validity
                    validity_delta = (sum(groundValidityTensor[i]) - validity
                                      )  # lower validity is better
                else:
                    validity_delta = 0
                vVector.append(validity_delta)
                vectorBar.update()
            vPlane.append(vVector)
            symbolBar.update()
        validityTensor.append((sym, vPlane))
        totalBar.update()
        logging.info("Finished Symbol: " + sym)
        with open(args["vo"], "w") as f:  # save checkpoint
            json.dump(validityTensor, f)
    vectorBar.close()
    symbolBar.close()
    totalBar.close()
    return validityTensor