Example 1
}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        default='openai-gpt',
                        help='model name or path')
    args = parser.parse_args()

    config = OpenAIGPTConfig.from_pretrained(args.model)
    model = OpenAIGPTModel.from_pretrained(args.model, config=config)

    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model)
    special_tokens_dict = {'pad_token': '<pad>'}
    tokenizer.add_special_tokens(special_tokens_dict=special_tokens_dict)

    model.resize_token_embeddings(len(tokenizer))

    params_senteval['model'] = model.cuda().eval()
    params_senteval['tokenizer'] = tokenizer

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA',
        'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment',
        'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth',
        'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
        'OddManOut', 'CoordinationInversion', 'ImageCaptionRetrieval', 'SNLI'
    ]
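# After closing the task list, the script would typically run `results = se.eval(transfer_tasks)`.
# Below is a minimal sketch (an assumption, not part of the original snippet) of the `prepare`
# and `batcher` callbacks that SentEval expects; sentences are embedded by mean-pooling the
# GPT hidden states, and the pooling choice is itself an assumption. In a real script these
# would be defined above the __main__ block.
import numpy as np
import torch

def prepare(params, samples):
    return

def batcher(params, batch):
    model, tokenizer = params['model'], params['tokenizer']
    embeddings = []
    with torch.no_grad():
        for tokens in batch:
            text = ' '.join(tokens) if tokens else '.'
            ids = torch.tensor([tokenizer.encode(text)]).to(next(model.parameters()).device)
            hidden = model(ids)[0]  # (1, seq_len, hidden_size) last hidden states
            embeddings.append(hidden.mean(dim=1).squeeze(0).cpu().numpy())
    return np.vstack(embeddings)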
Example 2
def load_model(name: str) -> Tuple[OpenAIGPTLMHeadModel, OpenAIGPTTokenizer]:
    model = OpenAIGPTLMHeadModel.from_pretrained(name)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(name)
    model.eval()
    return model, tokenizer
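# A hypothetical usage sketch (the prompt sentence is illustrative, not from the original
# example): score a sentence by its language-model loss.
import torch

model, tokenizer = load_model("openai-gpt")
input_ids = torch.tensor([tokenizer.encode("the cat sat on the mat")])
with torch.no_grad():
    lm_loss = model(input_ids, labels=input_ids)[0]  # mean cross-entropy over the sequence
print(lm_loss.item())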
Example 3
def generate_from_history(history: List[Tuple[bool, str]], tokenizer: OpenAIGPTTokenizer,
                          model: OpenAIGPTDoubleHeadsModel, device,
                          token_blacklist: Optional[List[str]] = None,) -> List[str]:
    """Generates an utterance given a set of messages preceding it.

    :argument history: a list of (user, message) tuples, where
                            user is a boolean indicating whether the sender is the user
                            and message is a string.
    :argument tokenizer: the tokenizer
    :argument model: the model
    :argument device: pytorch device to run on
    :argument token_blacklist: a list of tokens the network must not generate"""

    model.to(device)

    # build the network inputs
    output = []
    inputs = [bos]
    token_types = [speaker_other if len(history) > 0 and not history[0][0] else speaker_self]
    for user, text in history:
        inputs.append(speaker_self if user else speaker_other)
        token_types.append(speaker_self if user else speaker_other)
        for token in tokenizer.tokenize(text):
            inputs.append(token)
            token_types.append(speaker_self if user else speaker_other)
    inputs.append(speaker_self)
    token_types.append(speaker_self)

    input_ids = tokenizer.convert_tokens_to_ids(inputs)
    token_type_ids = tokenizer.convert_tokens_to_ids(token_types)

    model.eval()

    eos_token = tokenizer.convert_tokens_to_ids(eos)
    speaker_self_token = tokenizer.convert_tokens_to_ids(speaker_self)
    speaker_other_token = tokenizer.convert_tokens_to_ids(speaker_other)

    cutoff = config["bot"]["max_token_history"]
    for i in range(config["bot"]["token_limit"]):
        model_out = model(torch.tensor([input_ids[-cutoff:]], dtype=torch.long).to(device),
                          token_type_ids=torch.tensor([token_type_ids[-cutoff:]], dtype=torch.long).to(device))
        logits = model_out.logits[0, -1, :] / config["eval"]["temperature"]
        blacklist = [bos, eos, pad] + (token_blacklist or [])
        logits = filter_logits(logits, tokenizer, False, blacklist=blacklist)
        logits = top_p_sample(logits, config["eval"]["top_p"])
        # print("{} -> {}".format(tokenizer.convert_ids_to_tokens(output[-5:]), tokenizer.convert_ids_to_tokens(torch.topk(logits, 5)[1])))
        probs = F.softmax(logits, dim=-1)
        prev = torch.multinomial(probs, 1).item()
        input_ids.append(prev)
        token_type_ids.append(speaker_self_token)
        output.append(prev)
        if prev in (speaker_other_token, eos_token):
            break

    output = tokenizer.convert_ids_to_tokens(output)
    current_msg = []
    messages = []
    for i in output:
        if i in (speaker_self, eos, speaker_other):
            messages.append(tokenizer.convert_tokens_to_string(current_msg))
            current_msg = []
        else:
            current_msg.append(i)
    if len(current_msg) > 0:
        messages.append(tokenizer.convert_tokens_to_string(current_msg))
    return messages
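# The function above relies on module-level names defined elsewhere in the project
# (bos, eos, pad, speaker_self, speaker_other, config, filter_logits, top_p_sample).
# Plausible values for the special tokens, given only as an assumption for illustration:
# bos, eos, pad = "<bos>", "<eos>", "<pad>"
# speaker_self, speaker_other = "<speaker_self>", "<speaker_other>"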
Example 4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name",
                        type=str,
                        default="openai-gpt",
                        help="pretrained model name")
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument("--train_dataset", type=str, default="")
    parser.add_argument("--eval_dataset", type=str, default="")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--num_train_epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=8)
    parser.add_argument("--eval_batch_size", type=int, default=16)
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", type=int, default=1)
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training \
                        steps to perform. Override num_train_epochs.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before\
                        performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--lm_coef", type=float, default=0.9)
    parser.add_argument("--n_valid", type=int, default=374)

    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ["_start_", "_delimiter_", "_classify_"]
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_tokens(special_tokens)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Load and encode the datasets
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(
        len(story[:max_length]) +
        max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
        for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (
                len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(
                train_dataloader
            ) // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                args.weight_decay,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids,
                               mc_token_ids=mc_token_ids,
                               lm_labels=lm_labels,
                               mc_labels=mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (loss.item() if exp_average_loss is None
                                    else 0.7 * exp_average_loss +
                                    0.3 * loss.item())
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, "module") else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(input_ids,
                                                 mc_token_ids=mc_token_ids,
                                                 lm_labels=lm_labels,
                                                 mc_labels=mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to("cpu").numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            "eval_loss": eval_loss,
            "eval_accuracy": eval_accuracy,
            "train_loss": train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
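# Note: load_rocstories_dataset and pre_process_datasets are helpers defined elsewhere in
# this script; Example 17 below documents the layout pre_process_datasets builds, i.e.
# input_ids[batch, alternative, :] = [start_token] + story + [delimiter_token] + continuation + [clf_token].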
Example 5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="openai-gpt", help="pretrained model name")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_test", action="store_true", help="fix the theoretical lowest loss")
    parser.add_argument("--do_save", action="store_true", help="Save the model")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument("--train_dataset", type=str, default="/cloze_test_val__spring2016 - cloze_test_ALL_val.csv")
    parser.add_argument("--eval_dataset", type=str, default="")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--num_train_epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=8)
    parser.add_argument("--eval_batch_size", type=int, default=16)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", type=int, default=1)
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training \
                        steps to perform. Override num_train_epochs.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before\
                        performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--lm_coef", type=float, default=0.9)
    parser.add_argument("--n_valid", type=int, default=374)

    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()
    #print(args)
    

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval and not args.do_test:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ["_start_", "_delimiter_", "_classify_"]
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_tokens(special_tokens)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Load and encode the datasets
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(
        len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
        for dataset in encoded_datasets
        for story, cont1, cont2, _ in dataset
    )
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train or args.do_test:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        global optimizer_grouped_parameters
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": args.weight_decay,
            },
            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
        )

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (
                    loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])
    if args.do_test:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        ##for _ in (0,)):
        ##
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lambda x: 1e-2 ** x, -1)
        tr_loss = 0
        nb_tr_steps = 0
        tqdm_bar = tqdm(train_dataloader, desc="Testing")
        maxloop=0 
        avrgloops=0
        loop=0 
        prog=""        
        for step, batch in enumerate(tqdm_bar):
            stage=0
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
            loss = args.lm_coef * losses[0] + losses[1]
            loss.backward()
            lowloss = loss.item()
            tqdm.write("resetting lowloss")
            tqdm_bar.set_description("Testing {} loss:{}".format(loop, lowloss))
            scheduler.step(-1)
            optimizer.step()
            optimizer.zero_grad()
            if loop>maxloop:
                maxloop=loop
            avrgloops +=loop
            loop=0
            newloss=loss.item()
            intloss=math.inf
            oldloss=intloss
            bad=0
            if math.isnan(loss.item()):
                tqdm_bar.write("beeping NaN")
            while True:
                tqdm_bar.set_description("Testing {} loss:{}".format(loop,newloss))
                loop = loop + 1
                if intloss < newloss:
                    tqdm_bar.write("{} counter productive:{} > {}".format(bad,newloss,intloss))
                    scheduler.step()
                    if intloss>lowloss:
                        tqdm_bar.write("this run didn't beat the old loss{}".format(lowloss))
                        stage=1
                if oldloss == newloss:
                    tqdm_bar.write("\nlooped {} as good as it gets: {}".format(loop, loss))
                    break
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.zero_grad()
                oldloss=intloss
                intloss=newloss
                newloss=loss.item()
                if newloss < lowloss:
                    bad = 0
                    lowloss = newloss
            tr_loss += lowloss
            avrgloops += loop
            exp_average_loss = (
                loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
            )
Example 6
def get_gpt_token_num():
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    tokenizer.add_tokens(GPT_SPECIAL_TOKENS)
    return len(tokenizer)
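# GPT_SPECIAL_TOKENS is defined elsewhere in the project; a plausible value and a usage
# note, both given as assumptions for illustration only:
# GPT_SPECIAL_TOKENS = ['_start_', '_delimiter_', '_classify_']
# get_gpt_token_num()  # base openai-gpt vocabulary size plus the added special tokens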
Example 7
def main():

    config = get_config(mode="test")

    if config.data_name == "cornell2":
        vocab = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        special_tokens = {
            'pad_token': PAD_TOKEN,
            'bos_token': SOS_TOKEN,
            'eos_token': EOS_TOKEN,
            'sep_token': SEP_TOKEN,
        }
        vocab.add_special_tokens(special_tokens)
        config.vocab_size = len(vocab)
        config.vocab = vocab
        config.pad_id = vocab.pad_token_id
        config.eos_id = vocab.eos_token_id 
        config.sos_id = vocab.bos_token_id 

        convs = [
            # [["u0", "how's the weather today in Daejeon?"], ["u1", "It's rainy... "], ["u0", "Did you take your umbrella?"], ["u1", "Sure I did"]],
            [["u0", "how's the weather today?"], ["u1", "Sure I did"]],
            [["u0", "did you have a nice weekends?"], ["u1", "sure"], ["u0", "where did you go?"]],
            # [["u0", "did you have a nice weekends?"], ["u1", "sure, It was wonderful :)"]],
            [["u0", "did you take your umbrella?"], ["u1", "sure, It was wonderful :)"]], 
            [["u0", "I hurt my legs"], ["u1", "oh,, i'm sorry to hear that"]],
            [["u200", "Do u love me?"], ["u1", "oh,, i'm sorry to hear that"]],
            [["u0", "I hurt my legs"], ["u1", "oh,, i'm sorry to hear that"], ["u0", "thanks"]],
            [["u0", "how's the weather today in Daejeon?"], ["u1", "Sure I did"]],
            # [["u0", "how's the weather today in Daejeon?"], ["u1", "It's sunny today!"], ["u0", "Did you take your umbrella?"], ["u1", "Sure I did"]],
            # [["u0", "hello"], ["u1", "i hate you"], ["u0", "what??"]],
            # [["u0", "hello"], ["u1", "i love you"], ["u0", "what??"]],
            [["u0", "hello"], ["u1", "i dont't have a girlfriend likes you"], ["u0", "i know"]]
        ]
    
    else: 
        raise ValueError("{} Sorry... We don't support that data".format(config.data_name))   

    models_path = os.path.join(config.dataset_dir, "model_infos.json")
    with open(models_path) as f: 
        models = json.load(f)["models"]

    project_dir = config.dataset_dir.parent.parent

    total_outputs = []
    model_names = []
    
    for model_i, model in enumerate(models):
        config.model = model["name"]
        config.checkpoint = os.path.join(project_dir, "results", config.data_name, model["name"], model["path"])
        model_names.append(model["name"] + "/" + model["path"])

        if model.get('config'):
            for key in model["config"]:
                setattr(config, key, model["config"][key])
        
        data_loader = get_loader(convs=convs,
                                vocab=vocab,
                                batch_size=1,
                                model=config.model,
                                dataset=config.data_name,
                                config=config,
                                shuffle=False)

        model_solver = getattr(solvers, "Solver{}".format(config.model))

        solver = model_solver(config, None, data_loader, vocab=vocab, is_train=False)

        solver.build()
        inputs, outputs = solver.export_samples(config.beam_size, file_write=False)

        for i, utter in enumerate(outputs):
            if model_i == 0: 
                total_outputs.append([utter])
            else:
                total_outputs[i].append(utter)

    result_path = os.path.join(project_dir, "results", config.data_name, "qualitative_samples.txt")

    with open(result_path, 'w') as fw:
        for input_utter, outputs in zip(inputs, total_outputs): 
            # print(input_utter, file=fw)
            # for i, output in enumerate(outputs):
            #     print("{} : {}".format(model_names[i], output), file=fw)
            # print('============================', file=fw)
            print(input_utter)
            for i, output in enumerate(outputs):
                print("{} : {}".format(model_names[i], output.split('<eos>')[0]))
            print('============================')
Example 8
def train(dataset_path: str):
    device = torch.device(config["train"]["device"])

    print("Device: {}".format(device))

    # device = torch.device("cpu")  # gpu not enough memory :(

    model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
    model.to(device)
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens +
                                  num_added_tokens)

    # dataloader = get_data_loader(dataset_path, tokenizer, batch_size=4, shuffle=False, num_workers=1)
    full_dataset = get_dataset(dataset_path, tokenizer)
    assert len(full_dataset) > 0
    train_size = int(
        len(full_dataset) * config["train"]["train_dataset_proportion"] + 1)
    test_size = len(full_dataset) - train_size
    print("Full dataset has {} dialogs. Splitting into train: {} and test: {}".
          format(len(full_dataset), train_size, test_size))
    train_dataset, test_dataset = random_split(
        full_dataset, [train_size, test_size],
        torch.Generator().manual_seed(42))
    print(len(train_dataset), len(test_dataset))

    train_loader = get_data_loader(train_dataset, tokenizer,
                                   config["train"]["batch_size"], True, 0)
    test_loader = get_data_loader(test_dataset, tokenizer, 1, False, 0)

    lr = config["train"]["learning_rate"]
    print("lr: {}".format(lr))
    optimizer = AdamW(model.parameters(), lr=lr)

    # init logging
    start_time = datetime.datetime.now()
    save_path = os.path.join(
        os.path.dirname(__file__),
        "log/log-{}.txt".format(start_time.strftime("%y-%m-%d-%H-%M-%S")))
    print(os.path.dirname(__file__), save_path)
    f = open(save_path, "w+")
    f.close()

    epochs = config["train"]["num_epochs"]
    eval_every = config["train"]["evaluate_interval_iters"]
    num_tests = config["train"]["num_tests"]
    last_model_save = datetime.datetime.now()
    iteration = 0

    for epoch in range(epochs):
        print("Starting epoch {}/{}".format(epoch, epochs))
        for batch in train_loader:

            if iteration % eval_every == 0:
                results = evaluate_model(model, test_loader, device, num_tests)
                add_log(
                    save_path,
                    "test,{0},{1},{2[mc_correct]},{2[num_tests]},{2[lm_correct]},{2[lm_tested]}\n"
                    .format(iteration, epoch, results))

            model.train()
            input_ids = batch["input_ids"].to(device)
            mc_token_ids = batch["mc_token_ids"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            lm_labels = batch["lm_labels"].to(device)
            mc_labels = batch["correct"].to(device)

            try:
                model_output = model(input_ids,
                                     token_type_ids=token_type_ids,
                                     mc_token_ids=mc_token_ids,
                                     mc_labels=mc_labels,
                                     labels=lm_labels)
            except Exception as e:
                print(input_ids,
                      token_type_ids,
                      mc_token_ids,
                      lm_labels,
                      mc_labels,
                      sep="\n")
                raise e

            # print("input_ids: {}\ntoken_type_ids: {}\nmc_token_ids: {}\nlm_labels: {}\nmc_labels: {}"
            #       .format(input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels))

            # print(model_output.loss.item(), model_output.mc_loss.item())
            lm_loss = model_output.loss
            mc_loss = model_output.mc_loss

            loss = lm_loss * config["train"]["lm_coeff"] + mc_loss * config[
                "train"]["mc_coeff"]

            add_log(
                save_path,
                "train,{},{},{},{},{}\n".format(iteration, epoch, loss,
                                                lm_loss, mc_loss))

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(),
                                     config["train"]["max_norm"])
            optimizer.step()
            optimizer.zero_grad()

            # TODO: evaluation

            if iteration % 50 == 0:
                print(
                    "Time: {} Epoch: {}/{} Iteration: {}/{} Loss: {} ({} {})".
                    format(
                        datetime.datetime.now() - start_time, epoch, epochs,
                        iteration,
                        epochs *
                        (len(train_dataset) // config["train"]["batch_size"]),
                        loss.item(), lm_loss.item(), mc_loss.item()))

            if datetime.datetime.now() - last_model_save > datetime.timedelta(
                    minutes=config["train"]["save_interval_mins"]):
                print("Saving model...")
                torch.save(
                    model.state_dict(),
                    os.path.join(os.path.dirname(__file__),
                                 "checkpoints/model-{}-iter{}.pt").format(
                                     start_time.strftime("%y-%m-%d-%H-%M-%S"),
                                     iteration))
                last_model_save = datetime.datetime.now()

            iteration += 1

    print("Saving model...")
    torch.save(
        model.state_dict(),
        os.path.join(os.path.dirname(__file__),
                     "checkpoints/model-{}-iter{}.pt").format(
                         start_time.strftime("%y-%m-%d-%H-%M-%S"), iteration))
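# This training script depends on project helpers defined elsewhere (config, get_dataset,
# get_data_loader, evaluate_model, add_log, SPECIAL_TOKENS). Since SPECIAL_TOKENS is passed
# to tokenizer.add_special_tokens, it is presumably a dict; a hypothetical example:
# SPECIAL_TOKENS = {"bos_token": "<bos>", "eos_token": "<eos>", "pad_token": "<pad>",
#                   "additional_special_tokens": ["<speaker_self>", "<speaker_other>"]}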
Example 9
def main():
    bleu_list = list()
    length_history = list()
    rouge_history = list()
    embedding_list = list()
    dist1_list = list()
    meteor_list = list()
    conv_idx_match = 0
    convs_top_answer = list()
    convs_ground_truth = list()
    num_answers = 1

    if dataset != "cornell":
        if model_name == "DialoGPT":
            vocab = GPT2Tokenizer.from_pretrained('gpt2')
        else:
            vocab = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
            special_tokens = {
                'pad_token': PAD_TOKEN,
                'bos_token': SOS_TOKEN,
                'eos_token': EOS_TOKEN,
                'sep_token': SEP_TOKEN,
            }
            vocab.add_special_tokens(special_tokens)
        state_dict = torch.load(checkpoint_path)

        embedding_weight_name = None
        for key in state_dict.keys():
            if key.endswith("tok_embedding.weight"):
                embedding_weight_name = key
                break
            elif key.endswith("transformer.tokens_embed.weight"):
                embedding_weight_name = key
                break
            elif key.endswith("encoder.embedding.weight"):
                embedding_weight_name = key
                num_answers = int(target_file_path.split('_')[-2])
                break
            elif key.endswith("wte.weight"):
                embedding_weight_name = key
        assert embedding_weight_name is not None
        weight_tensor = state_dict[embedding_weight_name]
        embedding = nn.Embedding.from_pretrained(weight_tensor).to("cpu")
    else:
        with open(id2word_path, 'rb') as f:
            id2word = pickle.load(f)
            word2id = {v: k for k, v in id2word.items()}

        with open(pretrained_wv_path, 'rb') as f:
            weight_tensor = to_var(torch.FloatTensor(pickle.load(f)))
        embedding = nn.Embedding.from_pretrained(weight_tensor,
                                                 freeze=False).to("cpu")

    with codecs.open(target_file_path, "r", "utf-8") as csv_f:
        for line in csv_f:
            try:
                conv_idx = int(line.strip().split()[-1])
            except:
                print(line)
                print(conv_idx)
            if conv_idx_match != conv_idx:
                print("What?!")
                return
            conv_idx_match += 1
            context_utter = csv_f.readline().strip()
            # print(context_utter)

            answers = list()
            # for _ in range(num_answers):
            answers.append(csv_f.readline().strip())
            # print(answers)

            if '<eos>' in answers[-1]:
                top_answer = answers[-1].split('<eos>')[0].strip()
            else:
                top_answer = answers[-1].strip()

            ground_truth_utter = csv_f.readline().strip()

            if ground_truth_utter.split()[-1].startswith('u'):
                ground_truth_utter = ' '.join(ground_truth_utter.split()[:-1])

            if '<eos>' in ground_truth_utter:
                ground_truth_utter = ground_truth_utter.split('<eos>')[0]

            length_history.append(len(top_answer.split()))

            if context_utter == "" or top_answer == "" or ground_truth_utter == "":
                continue

            dist1_list += top_answer.split()

            try:
                ground_truth_utter_ids = vocab.encode(ground_truth_utter)
                top_answer_utter_ids = vocab.encode(top_answer)
                embedding_list.append(
                    embedding_compute(ground_truth_utter_ids,
                                      top_answer_utter_ids, embedding))
            except ValueError:
                embedding_list.append(0)

            try:
                bleu_list.append(bleu_compute(ground_truth_utter, top_answer))
            except ZeroDivisionError:
                bleu_list.append(0)

            try:
                rouge_history.append(
                    rouge_compute(ground_truth_utter, top_answer))
            except ValueError:
                rouge_history.append(np.zeros(3))

            meteor_list.append(meteor_compute(ground_truth_utter, top_answer))

    length_mat = np.array(length_history)
    bleu_mat = np.array(bleu_list)
    rouge_mat = np.stack(rouge_history, axis=0)
    embedding_mat = np.array(embedding_list)
    meteor_mat = np.array(meteor_list)

    avg_length = np.mean(length_mat)
    avg_bleu = np.mean(bleu_mat)
    avg_rouge = np.mean(rouge_mat, axis=0)
    avg_embedding = np.mean(embedding_mat)
    avg_meteor = np.mean(meteor_mat)

    stderr_bleu = sem(bleu_mat, axis=0)
    stderr_length = sem(length_mat)
    stderr_rouge = sem(rouge_mat, axis=0)
    stderr_embedding = sem(embedding_mat, axis=0)
    stderr_meteor = sem(meteor_mat, axis=0)

    dist1 = dist_compute(dist1_list)
    dist2 = dist_compute(dist1_list, 2)

    output_str_list = list()
    output_str_list.append(["Length", avg_length, stderr_length])
    output_str_list.append(["BLEU", avg_bleu, stderr_bleu])
    output_str_list.append(["Embedding", avg_embedding, stderr_embedding])
    output_str_list.append(["METEOR", avg_meteor, stderr_meteor])
    output_str_list.append(["Dist1", dist1, '-'])
    output_str_list.append(["Dist2", dist2, '-'])

    for one_name, one_avg, one_stderr in zip(rouge_names(), avg_rouge,
                                             stderr_rouge):
        output_str_list.append([one_name, one_avg, one_stderr])

    output_str = tabulate.tabulate(
        output_str_list, headers=["Metric", "Average", "Standard Error"])
    print(output_str)
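# bleu_compute, rouge_compute, meteor_compute, embedding_compute and dist_compute are
# project helpers not shown here. A minimal sketch of what bleu_compute might look like
# with NLTK (an assumption, not the original implementation):
from nltk.translate.bleu_score import sentence_bleu

def bleu_compute(ground_truth: str, answer: str) -> float:
    # sentence-level BLEU of the generated answer against the single reference
    return sentence_bleu([ground_truth.split()], answer.split())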
Example 10
    # Tell pytorch to run this model on the GPU.
    if use_gpu:
        device = torch.device('cuda:' + str(gpu_id))
    else:
        device = torch.device("cpu")

    parent_to_child = pickle.load(open(pkl_dump_dir + "parent_to_child.pkl", "rb"))

    fine_label_path = base_fine_path
    fine_tok_path = fine_label_path + "/tokenizer"
    fine_model_path = fine_label_path + "/model/"

    pad_token_dict = pickle.load(open(pkl_dump_dir + "/pad_token_dict.pkl", "rb"))

    fine_tokenizer = OpenAIGPTTokenizer.from_pretrained(fine_tok_path, do_lower_case=True)
    fine_model = torch.load(fine_model_path + "coarse_fine.pt", map_location=device)

    all_sents = []
    all_labels = []
    for p in [parent_label]:
        children = parent_to_child[p]
        for ch in children:
            sentences = generate(ch, fine_tokenizer, fine_model, pad_token_dict, num_samples=num)
            sentences = post_process(sentences)
            labels = [ch] * num
            all_sents += sentences
            all_labels += labels

        df = pd.DataFrame.from_dict({"text": all_sents, "label": all_labels})
        pickle.dump(df, open(pkl_dump_dir + algo + "/df_gen_" + p + ".pkl", "wb"))
Example 11
    print('\33]0;SimpleTOD\a', end='')
    sys.stdout.flush()

    model_checkpoint = sys.argv[1]
    decoding = sys.argv[2]
    if decoding == 'nucleus':
        TOP_P = float(sys.argv[3])

    delay = 0.5
    multiwoz_db = MultiWozDB()

    print('\nLoading Model', end="")

    if 'openai' in model_checkpoint:
        tokenizer = OpenAIGPTTokenizer.from_pretrained(model_checkpoint)
        model = OpenAIGPTLMHeadModel.from_pretrained(model_checkpoint)
    else:
        tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)
        model = GPT2LMHeadModel.from_pretrained(model_checkpoint)

    # model.load_state_dict(torch.load(model_checkpoint))
    model.eval()
    model.to('cuda')

    break_tokens = tokenizer.encode(
        tokenizer._eos_token) + tokenizer.encode('?') + tokenizer.encode('!')
    # break_tokens = tokenizer.encode(tokenizer._eos_token)
    MAX_LEN = model.config.n_ctx

    if 'openai-gpt' in model_checkpoint:
Example 12
    iteration = int(sys.argv[3])
    # iteration = 1

    # Tell pytorch to run this model on the GPU.
    if use_gpu:
        device = torch.device('cuda:' + str(gpu_id))
    else:
        device = torch.device("cpu")

    df = pickle.load(open(pkl_dump_dir + "df_coarse.pkl", "rb"))
    parent_to_child = pickle.load(
        open(pkl_dump_dir + "parent_to_child.pkl", "rb"))

    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        'openai-gpt',
        bos_token='<|startoftext|>',
        pad_token='<|pad|>',
        additional_special_tokens=['<|labelsep|>', '<|labelpad|>'])

    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    child_to_parent = {}
    for p in parent_to_child:
        for ch in parent_to_child[p]:
            child_to_parent[ch] = p

    parent_labels = []
    child_labels = []
    for p in parent_to_child:
Example 13
def setup_gpt(model_name="openai-gpt"):
    model = OpenAIGPTLMHeadModel.from_pretrained(model_name)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name)
    return model, tokenizer
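# A hypothetical usage sketch (prompt and greedy decoding are illustrative assumptions):
import torch

model, tokenizer = setup_gpt()
model.eval()
ids = torch.tensor([tokenizer.encode("the weather today is")])
with torch.no_grad():
    logits = model(ids)[0]  # (1, seq_len, vocab_size)
next_id = logits[0, -1].argmax().item()  # greedy choice of the next token
print(tokenizer.decode([next_id]))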
Example 14
    def setup_class(self):
        self.use_gpu = torch.cuda.is_available()
        self.test_dir = Path(tempfile.mkdtemp())

        self.base_tokenizer = OpenAIGPTTokenizer.from_pretrained(
            'openai-gpt', do_lower_case=True, cache_dir=self.test_dir)
        self.rust_tokenizer = PyOpenAiGptTokenizer(
            get_from_cache(
                self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
                ['openai-gpt']),
            get_from_cache(
                self.base_tokenizer.pretrained_vocab_files_map['merges_file']
                ['openai-gpt']),
            do_lower_case=True)
        self.model = OpenAIGPTModel.from_pretrained(
            'openai-gpt', output_attentions=False).eval()
        if self.use_gpu:
            self.model.cuda()
        #     Extracted from https://en.wikipedia.org/wiki/Deep_learning
        self.sentence_list = [
            'Deep learning (also known as deep structured learning or hierarchical learning) is part of a broader family of machine learning methods based on artificial neural networks.Learning can be supervised, semi-supervised or unsupervised.',
            'Deep learning is a class of machine learning algorithms that[11](pp199–200) uses multiple layers to progressively extract higher level features from the raw input.',
            'For example, in image processing, lower layers may identify edges, while higher layers may identify the concepts relevant to a human such as digits or letters or faces.',
            'Most modern deep learning models are based on artificial neural networks, specifically, Convolutional Neural Networks (CNN)s, although they can also include propositional formulas organized layer-wise in deep generative models.',
            'In deep learning, each level learns to transform its input data into a slightly more abstract and composite representation.',
            'In an image recognition application, the raw input may be a matrix of pixels; the first representational layer may abstract the pixels and encode edges; the second layer may compose and encode arrangements of edges;',
            'he third layer may encode a nose and eyes; and the fourth layer may recognize that the image contains a face. Importantly, a deep learning process can learn which features to optimally place in which level on its own.',
            '(Of course, this does not completely eliminate the need for hand-tuning; for example, varying numbers of layers and layer sizes can provide different degrees of abstraction.)[',
            'The word "deep" in "deep learning" refers to the number of layers through which the data is transformed. More precisely, deep learning systems have a substantial credit assignment path (CAP) depth. The CAP is the chain of transformations from input to output.',
            'CAPs describe potentially causal connections between input and output. For a feedforward neural network, the depth of the CAPs is that of the network and is the number of hidden layers plus one (as the output layer is also parameterized).',
            'For recurrent neural networks, in which a signal may propagate through a layer more than once, the CAP depth is potentially unlimited.[2] No universally agreed upon threshold of depth divides shallow learning from deep learning.',
            'CAP of depth 2 has been shown to be a universal approximator in the sense that it can emulate any function.[14] Beyond that, more layers do not add to the function approximator ability of the network.',
            'Deep models (CAP > 2) are able to extract better features than shallow models and hence, extra layers help in learning the features effectively. Deep learning architectures can be constructed with a greedy layer-by-layer method.',
            'Deep learning helps to disentangle these abstractions and pick out which features improve performance.[1]. For supervised learning tasks, deep learning methods eliminate feature engineering, by translating the data into compact intermediate representations',
            'Deep learning algorithms can be applied to unsupervised learning tasks. This is an important benefit because unlabeled data are more abundant than the labeled data. Examples of deep structures that can be trained in an unsupervised manner are neural history compressors and deep belief networks.',
            'Deep neural networks are generally interpreted in terms of the universal approximation theorem or probabilistic inference. The classic universal approximation theorem concerns the capacity of feedforward neural networks with a single hidden layer of finite size to approximate continuous functions.',
            'In 1989, the first proof was published by George Cybenko for sigmoid activation functions and was generalised to feed-forward multi-layer architectures in 1991 by Kurt Hornik.Recent work also showed that universal approximation also holds for non-bounded activation functions such as the rectified linear unit.',
            'he universal approximation theorem for deep neural networks concerns the capacity of networks with bounded width but the depth is allowed to grow. Lu et al. proved that if the width of a deep neural network with ReLU activation is strictly larger than the input dimension, then the network can approximate any Lebesgue integrable function',
            'The probabilistic interpretation[24] derives from the field of machine learning. It features inference, as well as the optimization concepts of training and testing, related to fitting and generalization, respectively',
            'More specifically, the probabilistic interpretation considers the activation nonlinearity as a cumulative distribution function. The probabilistic interpretation led to the introduction of dropout as regularizer in neural networks.',
            'The probabilistic interpretation was introduced by researchers including Hopfield, Widrow and Narendra and popularized in surveys such as the one by Bishop. The term Deep Learning was introduced to the machine learning community by Rina Dechter in 1986',
            'The first general, working learning algorithm for supervised, deep, feedforward, multilayer perceptrons was published by Alexey Ivakhnenko and Lapa in 1965.[32] A 1971 paper described already a deep network with 8 layers trained by the group method of data handling algorithm.',
            'Other deep learning working architectures, specifically those built for computer vision, began with the Neocognitron introduced by Kunihiko Fukushima in 1980.[34] In 1989, Yann LeCun et al. applied the standard backpropagation algorithm',
            'By 1991 such systems were used for recognizing isolated 2-D hand-written digits, while recognizing 3-D objects was done by matching 2-D images with a handcrafted 3-D object model. Weng et al. suggested that a human brain does not use a monolithic 3-D object model and in 1992 they published Cresceptron',
            'Because it directly used natural images, Cresceptron started the beginning of general-purpose visual learning for natural 3D worlds. Cresceptron is a cascade of layers similar to Neocognitron. But while Neocognitron required a human programmer to hand-merge features, Cresceptron learned an open number of features in each layer without supervision',
            'Cresceptron segmented each learned object from a cluttered scene through back-analysis through the network. Max pooling, now often adopted by deep neural networks (e.g. ImageNet tests), was first used in Cresceptron to reduce the position resolution by a factor of (2x2) to 1 through the cascade for better generalization',
            'In 1994, André de Carvalho, together with Mike Fairhurst and David Bisset, published experimental results of a multi-layer boolean neural network, also known as a weightless neural network, composed of a 3-layers self-organising feature extraction neural network module (SOFT) followed by a multi-layer classification neural network module (GSN)',
            'n 1995, Brendan Frey demonstrated that it was possible to train a network containing six fully connected layers and several hundred hidden units using the wake-sleep algorithm, co-developed with Peter Dayan and Hinton. Many factors contribute to the slow speed, including the vanishing gradient problem analyzed in 1991 by Sepp Hochreiter',
            'Simpler models that use task-specific handcrafted features such as Gabor filters and support vector machines (SVMs) were a popular choice in the 1990s and 2000s, because of artificial neural network\'s (ANN) computational cost and a lack of understanding of how the brain wires its biological networks.',
            'Both shallow and deep learning (e.g., recurrent nets) of ANNs have been explored for many years.[47][48][49] These methods never outperformed non-uniform internal-handcrafting Gaussian mixture model/Hidden Markov model (GMM-HMM) technology based on generative models of speech trained discriminatively.',
            'Key difficulties have been analyzed, including gradient diminishing[45] and weak temporal correlation structure in neural predictive models.[51][52] Additional difficulties were the lack of training data and limited computing power. Most speech recognition researchers moved away from neural nets to pursue generative modeling.',
            'An exception was at SRI International in the late 1990s. Funded by the US government\'s NSA and DARPA, SRI studied deep neural networks in speech and speaker recognition. The speaker recognition team led by Larry Heck achieved the first significant success with deep neural networks.',
            'While SRI experienced success with deep neural networks in speaker recognition, they were unsuccessful in demonstrating similar success in speech recognition. The principle of elevating "raw" features over hand-crafted optimization was first explored successfully in the architecture of deep autoencoder on the "raw" spectrogram'
        ]

        # Pre-allocate GPU memory
        tokens_list = [
            self.base_tokenizer.tokenize(sentence)
            for sentence in self.sentence_list
        ]
        features = [
            self.base_tokenizer.convert_tokens_to_ids(tokens)
            for tokens in tokens_list
        ]
        features = [
            self.base_tokenizer.prepare_for_model(input,
                                                  None,
                                                  add_special_tokens=True,
                                                  max_length=128)
            for input in features
        ]
        max_len = max([len(f['input_ids']) for f in features])
        features = [[
            f['input_ids'] + [0] * (max_len - len(f['input_ids']))
            for f in features
        ]]
        all_input_ids = torch.tensor(features, dtype=torch.long)

        if self.use_gpu:
            all_input_ids = all_input_ids.cuda()

        with torch.no_grad():
            _ = self.model(all_input_ids)[0].cpu().numpy()
Example 15
from .evals import get_metrics
from .helpers import *
from .models import ElmoSCLSTM
from .util import get_module_or_attr

""" NEW: reranking snippets """
# (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
import torch
from torch.nn import CrossEntropyLoss

HFACE_batch_size = 8
RERANKER = "GPT-2"  # GPT/GPT-2/CTRL/Transformer-XL/XLNet
if RERANKER == "GPT":
    from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

    gpt2Tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    gpt2LMHeadModel = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    gpt2Tokenizer.add_special_tokens({'pad_token': "[PAD]"})
    gpt2LMHeadModel.resize_token_embeddings(len(gpt2Tokenizer))
    assert gpt2Tokenizer.pad_token == '[PAD]'
elif "GPT-2":
    from transformers import GPT2Tokenizer, GPT2LMHeadModel

    gpt2Tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    gpt2LMHeadModel = GPT2LMHeadModel.from_pretrained('gpt2-medium')
    gpt2Tokenizer.pad_token = gpt2Tokenizer.eos_token
elif "Transformer-XL":
    from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel

    txlTokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    txlLMHeadModel = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
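# A minimal sketch of how such a reranker could score a candidate correction by
# language-model loss (an assumption rather than the original scoring code; it
# takes the GPT-2 branch above, so gpt2Tokenizer and gpt2LMHeadModel are in
# scope, and a lower loss means a more fluent candidate):
def hypothetical_lm_score(candidate: str) -> float:
    input_ids = torch.tensor([gpt2Tokenizer.encode(candidate)])
    with torch.no_grad():
        outputs = gpt2LMHeadModel(input_ids, labels=input_ids)
    return outputs[0].item()  # first output is the average token cross-entropy

# candidates with a lower score would be ranked first, e.g.:
# ranked = sorted(["I am going home", "I am going gnome"], key=hypothetical_lm_score)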
Example no. 16
# import argparse
import ingest
import csv
from tqdm import tqdm
from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, AdamW,
                          cached_path, WEIGHTS_NAME, CONFIG_NAME,
                          get_linear_schedule_with_warmup)
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
import transformers
import numpy as np
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = OpenAIGPTDoubleHeadsModel.from_pretrained("log/")
tokenizer = OpenAIGPTTokenizer.from_pretrained("log/")
special_tokens = ['_start_', '_delimiter_', '_classify_']
special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)
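# Note: despite its name, accuracy() returns the number of correct predictions in
# the batch rather than a ratio; callers are expected to divide the accumulated
# total by the number of evaluated examples.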


def load_rocstories_dataset(dataset_path, loadLabel=False):
    """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
    with open(dataset_path, encoding='utf_8') as f:
        f = csv.reader(f)
        output = []
        next(f)  # skip the first line
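        # The loop that fills `output` is cut off in this excerpt; assuming the
        # standard ROCStories cloze CSV layout (id, four story sentences, two
        # candidate endings, 1-indexed answer), a typical completion would be:
        for line in tqdm(f):
            story = ' '.join(line[1:5])
            if loadLabel:
                output.append((story, line[5], line[6], int(line[-1]) - 1))
            else:
                output.append((story, line[5], line[6]))
    return output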
Example no. 17
def load_gpt_input_tensors(statement_jsonl_path, max_seq_length):
    def _truncate_seq_pair(tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def load_qa_dataset(dataset_path):
        """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
        with open(dataset_path, "r", encoding="utf-8") as fin:
            output = []
            for line in fin:
                input_json = json.loads(line)
                label = ord(input_json.get("answerKey", "A")) - ord("A")
                output.append(
                    (input_json['id'], input_json["question"]["stem"], *[
                        ending["text"]
                        for ending in input_json["question"]["choices"]
                    ], label))
        return output

    def pre_process_datasets(encoded_datasets, num_choices, max_seq_length,
                             start_token, delimiter_token, clf_token):
        """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)

            To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
            input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
        """
        tensor_datasets = []
        for dataset in encoded_datasets:
            n_batch = len(dataset)
            input_ids = np.zeros((n_batch, num_choices, max_seq_length),
                                 dtype=np.int64)
            mc_token_ids = np.zeros((n_batch, num_choices), dtype=np.int64)
            lm_labels = np.full((n_batch, num_choices, max_seq_length),
                                fill_value=-1,
                                dtype=np.int64)
            mc_labels = np.zeros((n_batch, ), dtype=np.int64)
            for i, data in enumerate(dataset):
                q, mc_label = data[0], data[-1]
                choices = data[1:-1]
                for j in range(len(choices)):
                    _truncate_seq_pair(q, choices[j], max_seq_length - 3)
                    qa = [start_token] + q + [delimiter_token] + choices[j] + [clf_token]
                    input_ids[i, j, :len(qa)] = qa
                    mc_token_ids[i, j] = len(qa) - 1
                    lm_labels[i, j, :len(qa) - 1] = qa[1:]
                mc_labels[i] = mc_label
            all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
            tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
        return tensor_datasets

    def tokenize_and_encode(tokenizer, obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        else:
            return list(tokenize_and_encode(tokenizer, o) for o in obj)

    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    tokenizer.add_tokens(GPT_SPECIAL_TOKENS)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(GPT_SPECIAL_TOKENS)

    dataset = load_qa_dataset(statement_jsonl_path)
    examples_ids = [data[0] for data in dataset]
    dataset = [data[1:] for data in dataset]  # discard example ids
    num_choices = len(dataset[0]) - 2

    encoded_dataset = tokenize_and_encode(tokenizer, dataset)

    (input_ids, mc_token_ids, lm_labels,
     mc_labels), = pre_process_datasets([encoded_dataset], num_choices,
                                        max_seq_length, *special_tokens_ids)
    return examples_ids, mc_labels, input_ids, mc_token_ids, lm_labels
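# Hypothetical usage of the loader above (a sketch only: GPT_SPECIAL_TOKENS is
# normally defined elsewhere in this module, and the .jsonl path below is a
# placeholder rather than a file from the original project):
if __name__ == "__main__":
    GPT_SPECIAL_TOKENS = ['_start_', '_delimiter_', '_classify_']
    qids, mc_labels, input_ids, mc_token_ids, lm_labels = load_gpt_input_tensors(
        'statements.dev.jsonl', max_seq_length=128)
    print(input_ids.shape)  # (n_examples, n_choices, max_seq_length)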
Example no. 18
    def setup_python_tokenizer(self):
        self.base_tokenizer = OpenAIGPTTokenizer.from_pretrained(
            'openai-gpt', do_lower_case=True, cache_dir=self.test_dir)
Example no. 19
from typing import *

import torch
from transformers import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer

from itertools import chain

from special_tokens import bos, eos, speaker_self, speaker_other, lsep, pad, SPECIAL_TOKENS

model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

# history = [[(True, "hello"), (True, "how"), (True, "are"), (True, "you"), (True, "?")],
#            [(False, "i"), (False, "am"), (False, "fine"), (False, "thanks"), (False, ".")]]

history = [(True, tokenizer.tokenize("hello how are you?")),
           (False, tokenizer.tokenize("i am fine thanks."))]

reply = (True, ["good", "to", "hear", "."])

orig_num_tokens = len(tokenizer.encoder)
print(orig_num_tokens)
num_added_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS)
model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)


def build_inputs(history: List[Tuple[bool, List[str]]], reply: Tuple[bool, List[str]]):
    history = history + [reply]
    sequence = list(map(lambda x: [speaker_self if x[0] else speaker_other] + x[1], history))
    # print(sequence)
    sequence[0] = [bos] + sequence[0]
Example no. 20
def initialize():
    global model, tokenizer
    model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-gpt")
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
Example no. 21
def test_gpt_embeddings():
    gpt_model: str = "openai-gpt"

    tokenizer = OpenAIGPTTokenizer.from_pretrained(gpt_model)
    model = OpenAIGPTModel.from_pretrained(
        pretrained_model_name_or_path=gpt_model, output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize(s)

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #     0             1           2            3          4         5         6        7       8       9        10        11         12
    #
    # 'berlin</w>', 'and</w>', 'munich</w>', 'have</w>', 'a</w>', 'lot</w>', 'of</w>', 'pupp', 'ete', 'er</w>', 'to</w>', 'see</w>', '.</w>'
    #     |             |           |            |          |         |         |         \      |      /          |         |          |
    #   Berlin         and        Munich        have        a        lot        of           puppeteer             to       see         .
    #
    #     0             1           2            3          4         5         6                7                  8        9          10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = OpenAIGPTEmbeddings(
            pretrained_model_name_or_path=gpt_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation="first")

    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[7].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation="last")

    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[0],
                                           first_layer[0]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[7], first_layer[9]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[0]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[7], first_layer[8], first_layer[9]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")

    ref_embedding_size = 4 * 768
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * 768
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
Example no. 22
parser.add_argument('--data_dir', type=str, default='../../data')
parser.add_argument('--n_batch', type=int, default=1)
parser.add_argument('--beam', type=int, default=10)
parser.add_argument('--filter_decode', type=bool, default=True)
parser.add_argument('--mem_k', type=int, default=1)
args = parser.parse_args()
print(args)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
use_mem = args.use_mem
device = torch.device(device)
text_encoder = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
encoder = text_encoder.encoder
decoder = text_encoder.decoder

#sentence-level special tokens
encoder['<|sent0|>'] = len(encoder)
decoder[len(decoder)] = '<|sent0|>'

encoder['<|sent1|>'] = len(encoder)
decoder[len(decoder)] = '<|sent1|>'

encoder['<|sent2|>'] = len(encoder)
decoder[len(decoder)] = '<|sent2|>'

encoder['<|sent3|>'] = len(encoder)
decoder[len(decoder)] = '<|sent3|>'
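# Sanity check (a sketch, not part of the original script): the manually
# registered sentence markers should round-trip through the tokenizer's
# encoder/decoder maps.
for tok in ('<|sent0|>', '<|sent1|>', '<|sent2|>', '<|sent3|>'):
    assert decoder[encoder[tok]] == tok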
Example no. 23
import torch.nn as nn
# import a config from transformers
from transformers import Trainer, TrainingArguments
from transformers import TextDataset
# OpenAI GPT for text generation
from transformers import OpenAIGPTConfig, OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
from transformers import DataCollatorForLanguageModeling
from process_data import *

# initialize a model from config
config = OpenAIGPTConfig(vocab_size=100000, n_positions=512, n_layer=6)
model = OpenAIGPTLMHeadModel(config)  # fresh (untrained) LM-head model built from the config above

# the pretrained tokenizer
tname = "Jojo_Tokenizer"
tokenizer = OpenAIGPTTokenizer.from_pretrained(tname)

# initialize a data collator
# https://github.com/huggingface/transformers/blob/1af58c07064d8f4580909527a8f18de226b226ee/src/transformers/data/data_collator.py#L68
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # causal LM, no masking

# initialize dataset - process_data
# https://github.com/huggingface/transformers/blob/1af58c07064d8f4580909527a8f18de226b226ee/src/transformers/data/datasets/language_modeling.py#L16
dataset = TextDataset(tokenizer=tokenizer, file_path="train.txt", block_size=128)  # "train.txt" and block_size are placeholders

output = "output"

# initialize training arguments
training_args = TrainingArguments(
    output_dir="./" + output,
    overwrite_output_dir=True,
Example no. 24
    # Save a trained model
    if args.do_train or args.do_save:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, "module") else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(
                    input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels
                )

            mc_logits = mc_logits.detach().cpu().numpy()