Example #1
def main(test_file, config):
    params = load_config(config)
    print(params)
    set_seed(params.seed)

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        params.output.tokenizer_dir)
    model = transformers.TFAutoModelWithLMHead.from_pretrained(
        params.output.model_dir)
    test_texts = load_dataset(test_file)

    x_test, y_test = build_data(tokenizer, test_texts, params.block_size)

    # Loss and metrics for evaluation (no optimizer is needed just to evaluate,
    # so Keras falls back to its default)
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # optimizer = keras.optimizers.Adam()
    model.compile(
        loss=[loss, *[None] * model.config.n_layer],
        metrics=[
            keras.metrics.SparseCategoricalCrossentropy(from_logits=True),
            keras.metrics.SparseCategoricalAccuracy(),
        ],
    )

    # Evaluate the best model on the test set
    res = model.evaluate(x_test, y_test)
    print(res)
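
The snippet relies on a `build_data` helper that is not shown. Below is a minimal sketch of what such a helper might look like, assuming it simply concatenates the tokenized texts and cuts them into `block_size` chunks with next-token labels; the real implementation may differ.

import numpy as np

def build_data(tokenizer, texts, block_size):
    # Hypothetical helper: concatenate all token ids, then slice fixed-size blocks
    # where y is x shifted by one position (next-token prediction targets).
    ids = []
    for text in texts:
        ids.extend(tokenizer.encode(text))
    x, y = [], []
    for i in range(0, len(ids) - block_size, block_size):
        chunk = ids[i:i + block_size + 1]
        x.append(chunk[:-1])
        y.append(chunk[1:])
    return np.array(x), np.array(y)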
Example #2
def main():
    args = get_args()
    args.n_gpu = 1

    set_seed(args)

    # Construct tokenizer
    tokenizer = CharTokenizer([])
    tokenizer.load(args.load_vocab)
    args.vocab_size = len(tokenizer)

    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    # GPU setting
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Construct model
    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    # Load data
    noisy_sents = read_strings(os.path.join('sejong_corpus', args.noisy_file))
    clean_sents = read_strings(os.path.join('sejong_corpus', args.clean_file))
    sents_annotation = ['None'] * len(noisy_sents)

    pairs = [{
        "noisy": noisy,
        "clean": clean,
        "annotation": annot
    } for noisy, clean, annot in zip(noisy_sents, clean_sents,
                                     sents_annotation)]

    # Train-validation split
    train_data, valid_data = train_test_split(
        pairs, test_size=args.val_ratio,
        random_state=args.seed)  # test: about 1000
    logger.info(f"# of train data: {len(train_data)}")
    logger.info(f"# of valid data: {len(valid_data)}")

    train(model, tokenizer, train_data, valid_data, args, eos=args.eos_setting)
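
`set_seed(args)` is defined elsewhere in this project. A typical implementation, shown here as an assumption that mirrors the common pattern in Transformers example scripts, seeds every RNG the run touches:

import random
import numpy as np
import torch

def set_seed(args):
    # Seed Python, NumPy and PyTorch (CPU and GPU) for reproducible runs.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)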
Example #3
def main():

    set_seed(seed=1234)
    NUM_WORKERS = os.cpu_count() * 3
    classes = ["car", "motorcycle", "bus", "bicycle", "truck", "pedestrian", "other_vehicle", "animal", "emergency_vehicle"]
        
    #train_dataset = LyftDataset(data_path='.', json_path='../input/3d-object-detection-for-autonomous-vehicles/train_data', verbose=True)
    #level5data = LyftTestDataset(data_path='.', json_path='../input/3d-object-detection-for-autonomous-vehicles/test_data', verbose=True)
   
    class_heights = {
        'animal': 0.51, 'bicycle': 1.44, 'bus': 3.44, 'car': 1.72,
        'emergency_vehicle': 2.39, 'motorcycle': 1.59,
        'other_vehicle': 3.23, 'pedestrian': 1.78, 'truck': 3.44,
    }

    # load data
    data_folder = os.path.join(OUTPUT_ROOT, "bev_test_1024")
    # choose test samples
    input_filepaths = sorted(glob.glob(os.path.join(data_folder, "*_input.png")))
    sample_tokens = [x.split("/")[-1].replace("_input.png","") for x in input_filepaths]
    sample_tokens = [x.replace("bev_data\\","") for x in sample_tokens] 
    df = pd.read_csv('folds/test_host_scenes.csv')
    print(df.head())
   
    # model 
    model = get_smp_model(encoder='resnext101', num_classes=len(classes)+1)
    # load model checkpoint  
    checkpoint= f'{OUTPUT_ROOT}/checkpoints/unet_4_32_768_fold_3/unet_4_32_1024_fold_3_epoch_15.pth'   
    load_model(model, checkpoint)
    model = model.to(device)
    model.eval()

    # sample_tokens and data_folder come from the preparation steps above
    test_dataset = BEVTestDataset(sample_tokens=sample_tokens,
                                  debug=True, img_size=IMG_SIZE,
                                  input_dir=data_folder,
                                  transforms=albu_test_transforms)
    im, sample_token = test_dataset[1]
    im = im.numpy()

    plt.figure(figsize=(16,8))
    # Transpose the input volume CXY to XYC order, which is what matplotlib requires.
    plt.imshow(im.transpose(1,2,0)[...,:3])
    plt.title(sample_token)
    plt.show()

    visualize_lidar_of_sample(sample_token)

    box_scale = 0.8
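
`load_model` is external to this snippet. A minimal sketch, assuming the checkpoint was written with `torch.save(model.state_dict(), path)`; if the checkpoint also stores optimizer state or metadata, the layout would differ:

import torch

def load_model(model, checkpoint_path):
    # Hypothetical loader: restore weights saved as a plain state_dict.
    state_dict = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(state_dict)
    return model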
Example #4
parser.add_argument("--local_rank", type=int, default=-1,
                    help="For distributed training: local_rank")


args = parser.parse_args()

device = torch.device("cpu")
args.n_gpu = torch.cuda.device_count()

print(device)
print(args.n_gpu)

args.device = device

set_seed(args)

dataProcessor = DataProcessor()

dev_eg = dataProcessor.get_dev_examples()

train_eg = dataProcessor.get_train_examples()

test_eg = dataProcessor.get_test_dataset()

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

train_dataset = convert_features_to_dataset(convert_examples_to_features(
    examples=train_eg, label2id=LABEL2ID, max_seq_length=121, tokenizer=tokenizer))

dev_dataset = convert_features_to_dataset(convert_examples_to_features(
    examples=dev_eg, label2id=LABEL2ID, max_seq_length=121, tokenizer=tokenizer))
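
Neither `convert_examples_to_features` nor `convert_features_to_dataset` is defined in this snippet. As a rough sketch, the latter usually just stacks the per-example feature fields into tensors; the field names used below (`input_ids`, `attention_mask`, `token_type_ids`, `label_id`) are assumptions based on the common BERT feature layout:

import torch
from torch.utils.data import TensorDataset

def convert_features_to_dataset(features):
    # Pack the per-example feature fields into one TensorDataset.
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    return TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_label_ids)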
Example #5
def run():
    parser = ArgumentParser()
    parser.add_argument("--run_name", type=str, default='run1', help="The name of the run (subdirectory in ./runs)")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=80, help="Maximum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=1.0, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.8, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument("--no_info", action='store_true', default=False, help="Only show conversation output")
    args = parser.parse_args()

    # set seed
    set_seed(args)

    logger.info("Get pretrained model and tokenizer")
    model_path = os.path.join('runs', args.run_name)
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(model_path)
    model = model_class.from_pretrained(model_path)
    model.to(args.device)
    history = []
    personality = []
    speaker1_tag = '<speaker1>'
    speaker2_tag = '<speaker2>'
    speaker1_tag_id = tokenizer.convert_tokens_to_ids(speaker1_tag)
    speaker2_tag_id = tokenizer.convert_tokens_to_ids(speaker2_tag)
    history = f"""
{speaker2_tag} Hi!
{speaker1_tag} Hello
{speaker2_tag} Are you ready?
{speaker1_tag} Yes!
{speaker2_tag} Ok let's start chatting
{speaker1_tag} Sure, what do you want to talk about?"""
    print(history)
    print('\n[Chat with the model! Send "h" to see the full history]\n')
    history = history.split('\n')
    while True: 
        message = None
        while not message:
            message = input(f'{speaker2_tag} ')
            if message == 'h':
                print('\n'.join(history))
                message = None
        # add new message to history
        history.append(f'{speaker2_tag} {message}')
        # keep only most recent conversation as input to the model
        recent_history = history[-(2*args.max_history):]
        # concatenate history into single string and add trigger word "bot:"
        history_str = '{}\n{}'.format('\n'.join(recent_history), speaker1_tag)
        # tokenize text and convert into vocabulary ids (input ids)
        history_enc = tokenizer.encode(history_str, add_special_tokens=True)
        with torch.no_grad():
            out_ids = sample_sequence(history_enc, model, args)
        out_ids = out_ids[:, len(history_enc):].tolist()[0]
        if not args.no_info:
            print(20*'-')
            print('Output of model:')
            full_output = tokenizer.decode(out_ids, clean_up_tokenization_spaces=True)
            print(full_output)
            print('\nInput to the model:')
            print(history_str)
            print(20*'-' + '\n')
        # Select part before speaker tags as answer
        for i, out_id in enumerate(out_ids):
            if out_id in [speaker1_tag_id, speaker2_tag_id]:
                break
        answer = '{} {}'.format(speaker1_tag, tokenizer.decode(out_ids[:i]))
        print(answer)
        # add answer to history
        history.append(answer)
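
`sample_sequence` is imported from elsewhere in the project. The sketch below shows one plausible implementation under the arguments this script defines (`temperature`, `top_k`, `top_p`, `no_sample`, `max_length`); it is an assumption rather than the original helper, and it omits early stopping on special tokens.

import torch

def top_filtering(logits, top_k=0, top_p=0.0, filter_value=-float("inf")):
    # Keep only the top-k tokens and/or the smallest set of tokens whose
    # cumulative probability exceeds top_p (nucleus filtering).
    if top_k > 0:
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits

def sample_sequence(history_enc, model, args):
    # Autoregressively extend the encoded history by up to args.max_length tokens.
    input_ids = torch.tensor([history_enc], dtype=torch.long, device=args.device)
    for _ in range(args.max_length):
        logits = model(input_ids)[0][0, -1, :] / max(args.temperature, 1e-5)
        logits = top_filtering(logits, top_k=args.top_k, top_p=args.top_p)
        probs = torch.softmax(logits, dim=-1)
        next_token = (torch.argmax(probs) if args.no_sample
                      else torch.multinomial(probs, 1)).unsqueeze(0)
        input_ids = torch.cat([input_ids, next_token.view(1, 1)], dim=-1)
    return input_ids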
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch-size', default=50, type=int)
    parser.add_argument('--dropout', default=0.5, type=float)
    parser.add_argument('--epoch', default=20, type=int)
    parser.add_argument('--learning-rate', default=0.1, type=float)
    parser.add_argument("--mode",
                        default="non-static",
                        help="available models: rand, static, non-static")
    parser.add_argument('--num-feature-maps', default=100, type=int)
    parser.add_argument("--pretrained-word-vectors",
                        default="fasttext",
                        help="available models: fasttext, Word2Vec")
    parser.add_argument("--save-word-vectors",
                        action='store_true',
                        default=False,
                        help='save trained word vectors')
    parser.add_argument("--predict",
                        action='store_true',
                        default=False,
                        help='classify your sentence')
    args = parser.parse_args()

    # load data
    print("Load data...\n")
    texts, labels = dataset.load_data()

    print("Tokenizing...\n")
    tokenized_texts, word2idx, max_len = dataset.tokenize(texts)
    input_ids = dataset.encode(tokenized_texts, word2idx, max_len)
    train_inputs, val_inputs, train_labels, val_labels = train_test_split(
        input_ids, labels, test_size=0.1, random_state=42)

    print("Creating Dataloader...\n")
    train_dataloader, val_dataloader = dataset.data_loader(
        train_inputs,
        val_inputs,
        train_labels,
        val_labels,
        batch_size=args.batch_size)

    if args.mode == 'rand':
        # CNN-rand: Word vectors are randomly initialized.
        train.set_seed(42)
        cnn_model, optimizer = model.initilize_model(
            vocab_size=len(word2idx),
            embed_dim=300,
            learning_rate=args.learning_rate,
            dropout=args.dropout)
        train.train(cnn_model,
                    optimizer,
                    train_dataloader,
                    val_dataloader,
                    epochs=args.epoch)

    elif args.mode == 'static':
        # CNN-static: fastText pretrained word vectors are used and kept frozen during training.
        train.set_seed(42)
        embeddings = pretrained_vectors.get_embeddings(
            word2idx, args.pretrained_word_vectors)
        cnn_model, optimizer = model.initilize_model(
            pretrained_embedding=embeddings,
            freeze_embedding=True,
            learning_rate=args.learning_rate,
            dropout=args.dropout)
        train.train(cnn_model,
                    optimizer,
                    train_dataloader,
                    val_dataloader,
                    epochs=args.epoch)

    else:
        # CNN-non-static: fastText pretrained word vectors are fine-tuned during training.
        train.set_seed(42)
        embeddings = pretrained_vectors.get_embeddings(
            word2idx, args.pretrained_word_vectors)
        cnn_model, optimizer = model.initilize_model(
            pretrained_embedding=embeddings,
            freeze_embedding=False,
            learning_rate=args.learning_rate,
            dropout=args.dropout)
        train.train(cnn_model,
                    optimizer,
                    train_dataloader,
                    val_dataloader,
                    epochs=args.epoch)

    if args.save_word_vectors:
        save_embeddings.write_embeddings(
            'trained_embeddings_{}.txt'.format(args.mode),
            cnn_model.embedding.weight.data, word2idx)

    if args.predict:
        x = input('Enter English text!: ')
        x = str(x)
        train.predict(x, cnn_model, word2idx)
        while True:
            conti = input('Continue? (y/n): ')
            if conti == 'y':
                x1 = input('Enter English text!: ')
                x1 = str(x1)
                train.predict(x1, cnn_model, word2idx)
            else:
                break
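
`save_embeddings.write_embeddings` is not shown. A minimal sketch, assuming a plain word2vec-style text format with one `<word> <vector>` line per vocabulary entry:

def write_embeddings(path, embedding_weights, word2idx):
    # Dump the learned embedding matrix, one "<word> v1 v2 ... vd" line per word.
    vectors = embedding_weights.cpu().numpy()
    with open(path, "w", encoding="utf-8") as f:
        for word, idx in word2idx.items():
            values = " ".join(f"{v:.6f}" for v in vectors[idx])
            f.write(f"{word} {values}\n")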
Example #7
def train(args, train_dataset, logger, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Main loop for training the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    train_sampler = (RandomSampler(train_dataset) if args.local_rank == -1
                     else DistributedSampler(train_dataset))
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate,
                                  drop_last=True)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Take care of distributed/parallel training
    model = model.module if hasattr(model, "module") else model
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuples in transformers (see docs)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if (args.local_rank in [-1, 0] and args.logging_steps > 0
                        and global_step % args.logging_steps == 0):
                    # Log metrics
                    # Only evaluate on a single GPU, otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if (args.local_rank in [-1, 0] and args.save_steps > 0
                        and global_step % args.save_steps == 0):
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
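
`_rotate_checkpoints` comes from the surrounding script and is not reproduced here. The sketch below captures the usual behaviour, assuming an `args.save_total_limit` attribute that caps how many `checkpoint-<step>` directories are kept; it is an illustration, not the original helper.

import glob
import os
import shutil

def _rotate_checkpoints(args, checkpoint_prefix="checkpoint"):
    # Keep only the newest `save_total_limit` checkpoints, deleting older ones.
    limit = getattr(args, "save_total_limit", None)
    if not limit or limit <= 0:
        return
    paths = glob.glob(os.path.join(args.output_dir, f"{checkpoint_prefix}-*"))
    # Sort by the global step encoded at the end of each directory name.
    paths = sorted(paths, key=lambda p: int(p.rstrip("/").split("-")[-1]))
    for path in paths[:-limit]:
        shutil.rmtree(path, ignore_errors=True)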