def initialize(self, ctx):

        self.manifest = ctx.manifest

        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        serialized_file = self.manifest["model"]["serializedFile"]
        model_pt_path = os.path.join(model_dir, serialized_file)
        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() else "cpu")

        # Read the serialized model (.pt) file
        self.tokenizer = BertTokenizer.from_pretrained(
            self.config.model_name_or_path)
        self.processor = TRADEPreprocessor(self.slot_meta, self.tokenizer)

        tokenized_slot_meta = []
        for slot in self.slot_meta:
            tokenized_slot_meta.append(
                self.tokenizer.encode(slot.replace("-", " "),
                                      add_special_tokens=False))

        self.model = TRADE(self.config, tokenized_slot_meta)
        ckpt = torch.load(model_pt_path, map_location="cpu")

        self.model.load_state_dict(ckpt)
        self.model.to(self.device)
        print("Model is loaded")

        self.initialized = True
Example 2
    # Data Loading
    train_data_file = f"{args.data_dir}/train_dials.json"
    slot_meta = json.load(open(f"{args.data_dir}/slot_meta.json"))
    train_data, dev_data, dev_labels = load_dataset(train_data_file)

    train_examples = get_examples_from_dialogues(
        train_data, user_first=False, dialogue_level=False
    )
    dev_examples = get_examples_from_dialogues(
        dev_data, user_first=False, dialogue_level=False
    )

    # Define Preprocessor
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    processor = TRADEPreprocessor(slot_meta, tokenizer)
    args.vocab_size = len(tokenizer)
    args.n_gate = len(processor.gating2id)  # number of gating classes: none, dontcare, ptr
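    # (Hypothetical illustration) processor.gating2id would then map gate labels to ids,
    # e.g. {"none": 0, "dontcare": 1, "ptr": 2}, giving n_gate == 3 for this setup.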

    # Extracting Features
    train_features = processor.convert_examples_to_features(train_examples)
    dev_features = processor.convert_examples_to_features(dev_examples)
    
    # Slot Meta tokenizing for the decoder initial inputs
    tokenized_slot_meta = []
    for slot in slot_meta:
        tokenized_slot_meta.append(
            tokenizer.encode(slot.replace("-", " "), add_special_tokens=False)
        )
    
    # Declare the model
    parser.add_argument("--model_dir", type=str, default=None)
    parser.add_argument("--output_dir", type=str, default=None)
    parser.add_argument("--eval_batch_size", type=int, default=32)
    args = parser.parse_args()
    args.data_dir = os.environ['SM_CHANNEL_EVAL']
    args.model_dir = os.environ['SM_CHANNEL_MODEL']
    args.output_dir = os.environ['SM_OUTPUT_DATA_DIR']
    
    model_dir_path = os.path.dirname(args.model_dir)
    eval_data = json.load(open(f"{args.data_dir}/eval_dials.json", "r"))
    config = json.load(open(f"{model_dir_path}/exp_config.json", "r"))
    config = argparse.Namespace(**config)
    slot_meta = json.load(open(f"{model_dir_path}/slot_meta.json", "r"))

    tokenizer = BertTokenizer.from_pretrained(config.model_name_or_path)
    processor = TRADEPreprocessor(slot_meta, tokenizer)

    eval_examples = get_examples_from_dialogues(
        eval_data, user_first=False, dialogue_level=False
    )

    # Extracting Features
    eval_features = processor.convert_examples_to_features(eval_examples)
    eval_data = WOSDataset(eval_features)
    eval_sampler = SequentialSampler(eval_data)
    eval_loader = DataLoader(
        eval_data,
        batch_size=args.eval_batch_size,
        sampler=eval_sampler,
        collate_fn=processor.collate_fn,
    )
Example 4
    # args.data_dir = os.environ['SM_CHANNEL_TRAIN']
    # args.model_dir = os.environ['SM_MODEL_DIR']

    output_dir = increment_output_dir(args.model_dir)

    # Fix the random seed
    set_seed(args.random_seed)

    # Data Loading
    train_data_file = f"{args.data_dir}/train_dials.json"
    slot_meta = json.load(open(f"{args.data_dir}/slot_meta.json"))

    # Define Preprocessor
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    processor = TRADEPreprocessor(
        slot_meta, tokenizer,
        word_drop=args.word_dropout)  # apply word dropout in the preprocessor
    _config = BertConfig.from_pretrained(args.model_name_or_path)
    args.hidden_act = _config.hidden_act
    args.layer_norm_eps = _config.layer_norm_eps
    args.vocab_size = len(tokenizer)
    args.n_gate = len(processor.gating2id)  # number of gating classes: none, dontcare, ptr, yes, no

    feature_path = '/opt/ml/code/p3-dst-chatting-day/features/' + args.feature_model
    if os.path.exists(feature_path):
        # Load the previously saved feature pickles
        with open(feature_path + '/train_features.pickle', 'rb') as f:
            train_features = pickle.load(f)
        with open(feature_path + '/dev_features.pickle', 'rb') as f:
            dev_features = pickle.load(f)
Example 5
def train(args):
    # Define Tokenizer
    tokenizer_module = getattr(import_module("transformers"),
                               f"{args.model_name}Tokenizer")
    tokenizer = tokenizer_module.from_pretrained(args.pretrained_name_or_path)

    slot_meta, train_examples, dev_examples, dev_labels = train_data_loading(
        args, isUserFirst=False, isDialogueLevel=False)
    # Define Preprocessor
    processor = TRADEPreprocessor(slot_meta,
                                  tokenizer,
                                  max_seq_length=args.max_seq_length,
                                  use_n_gate=args.use_n_gate)

    train_features = processor.convert_examples_to_features(train_examples)
    dev_features = processor.convert_examples_to_features(dev_examples)

    train_loader = get_data_loader(processor, train_features,
                                   args.train_batch_size)
    dev_loader = get_data_loader(processor, dev_features, args.eval_batch_size)

    args.vocab_size = len(tokenizer)
    args.n_gate = len(processor.gating2id)  # number of gating classes: (none, dontcare, ptr) or (none, yes, no, dontcare, ptr)

    # Slot Meta tokenizing for the decoder initial inputs
    tokenized_slot_meta = []
    for slot in slot_meta:
        tokenized_slot_meta.append(
            tokenizer.encode(slot.replace("-", " "), add_special_tokens=False))

    # Declare the model
    model = TRADE(args, tokenized_slot_meta)
    # model.set_subword_embedding(args)  # initialize subword embeddings
    print(f"Subword embeddings are loaded from {args.pretrained_name_or_path}")
    model.to(device)
    print("Model is initialized")

    # Declare the optimizer and scheduler
    n_epochs = args.epochs
    t_total = len(train_loader) * n_epochs
    # Changed so that warmup_steps is computed automatically in the get_optimizer part (the original code is below)
    # warmup_steps = int(t_total * args.warmup_ratio)

    optimizer = get_optimizer(model, args)  # Adam, SGD, AdamP, ...

    scheduler = get_scheduler(optimizer, t_total, args)  # custom, linear, cosine, ...
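    # A minimal sketch of the assumed warmup handling (this project's get_optimizer/get_scheduler
    # are not shown here): warmup_steps is presumably derived from args.warmup_ratio, e.g.
    #   from transformers import get_linear_schedule_with_warmup
    #   scheduler = get_linear_schedule_with_warmup(
    #       optimizer,
    #       num_warmup_steps=int(t_total * args.warmup_ratio),
    #       num_training_steps=t_total)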

    loss_fnc_1 = masked_cross_entropy_for_value  # generation loss; number of classes = vocab_size
    loss_fnc_2 = nn.CrossEntropyLoss()
    # loss_fnc_2 = LabelSmoothingLoss(classes=model.decoder.n_gate,smoothing=args.smoothing_factor)
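    # Rough sketch (assumption, not the project's actual implementation) of what
    # masked_cross_entropy_for_value computes: the negative log-likelihood of the gold
    # value tokens, averaged over non-PAD positions only:
    #   mask = targets.ne(pad_idx)
    #   nll = -torch.gather(probs.log(), -1, targets.unsqueeze(-1)).squeeze(-1)
    #   loss = (nll * mask.float()).sum() / mask.sum()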

    json.dump(
        vars(args),
        open(f"{args.model_dir}/{args.model_fold}/exp_config.json", "w"),
        indent=2,
        ensure_ascii=False,
    )
    json.dump(
        slot_meta,
        open(f"{args.model_dir}/{args.model_fold}/slot_meta.json", "w"),
        indent=2,
        ensure_ascii=False,
    )

    best_score, best_checkpoint = 0, 0
    for epoch in range(n_epochs):
        model.train()
        for step, batch in enumerate(train_loader):
            input_ids, segment_ids, input_masks, gating_ids, target_ids, guids = [
                b.to(device) if not isinstance(b, list) else b for b in batch
            ]

            # teacher forcing
            if (args.teacher_forcing_ratio > 0.0
                    and random.random() < args.teacher_forcing_ratio):
                tf = target_ids
            else:
                tf = None

            all_point_outputs, all_gate_outputs = model(
                input_ids, segment_ids, input_masks, target_ids.size(-1), tf)

            # generation loss
            loss_1 = loss_fnc_1(
                all_point_outputs.contiguous(),
                target_ids.contiguous().view(-1),
                tokenizer.pad_token_id,
            )

            # gating loss
            loss_2 = loss_fnc_2(
                all_gate_outputs.contiguous().view(-1, args.n_gate),
                gating_ids.contiguous().view(-1),
            )
            loss = loss_1 + loss_2

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()
            for learning_rate in scheduler.get_lr():
                wandb.log({"learning_rate": learning_rate})

            optimizer.zero_grad()

            if step % 100 == 0:
                print(
                    f"[{epoch}/{n_epochs}] [{step}/{len(train_loader)}] loss: {loss.item()} gen: {loss_1.item()} gate: {loss_2.item()}"
                )
                wandb.log({
                    "epoch": epoch,
                    "Train epoch loss": loss.item(),
                    "Train epoch gen loss": loss_1.item(),
                    "Train epoch gate loss": loss_2.item(),
                })

        predictions = inference_TRADE(model, dev_loader, processor, device)
        eval_result = _evaluation(predictions, dev_labels, slot_meta)
        for k, v in eval_result.items():
            if k in ("joint_goal_accuracy", 'turn_slot_accuracy',
                     'turn_slot_f1'):
                print(f"{k}: {v}")

        if best_score < eval_result["joint_goal_accuracy"]:
            print("Update Best checkpoint!")
            best_score = eval_result["joint_goal_accuracy"]
            best_checkpoint = epoch

            wandb.log({
                "epoch": epoch,
                "Best joint goal accuracy": best_score,
                "Best turn slot accuracy": eval_result["turn_slot_accuracy"],
                "Best turn slot f1": eval_result["turn_slot_f1"],
            })

        if args.logging_accuracy_per_domain_slot:
            wandb.log({
                k: v
                for k, v in eval_result.items()
                if k not in ("joint_goal_accuracy", 'turn_slot_accuracy',
                             'turn_slot_f1')
            })

        torch.save(model.state_dict(),
                   f"{args.model_dir}/{args.model_fold}/model-{epoch}.bin")

    print(f"Best checkpoint: {args.model_dir}/model-{best_checkpoint}.bin")
    wandb.log(
        {"Best checkpoint": f"{args.model_dir}/model-{best_checkpoint}.bin"})
def main_inference(args, config):
    slot_meta = json.load(
        open(f"{args.model_dir}/{args.model_fold}/slot_meta.json", "r"))
    ontology = json.load(open(f"{CFG.TrainOntology}", "r"))

    if config.replace_word_data:
        slot_meta = [meta.replace('택시', '버스') for meta in slot_meta]
        ontology = {
            domain_slot_key.replace('택시', '버스'): domain_slot_value
            for domain_slot_key, domain_slot_value in ontology.items()
        }

    # Define Tokenizer
    tokenizer_module = getattr(import_module("transformers"),
                               f"{config.model_name}Tokenizer")
    tokenizer = tokenizer_module.from_pretrained(
        config.pretrained_name_or_path)

    # Extracting Features
    if config.dst == 'TRADE':
        eval_examples = test_data_loading(args,
                                          isUserFirst=False,
                                          isDialogueLevel=False)
        processor = TRADEPreprocessor(slot_meta, tokenizer)

        tokenized_slot_meta = []
        for slot in slot_meta:
            tokenized_slot_meta.append(
                tokenizer.encode(slot.replace("-", " "),
                                 add_special_tokens=False))

        # Declare the model
        model = TRADE(config, tokenized_slot_meta)
        model.set_subword_embedding(config)  # initialize subword embeddings

    elif config.dst == 'SUMBT':
        eval_examples = test_data_loading(args,
                                          isUserFirst=True,
                                          isDialogueLevel=True)
        max_turn = max([len(e) * 2 for e in eval_examples])
        processor = SUMBTPreprocessor(
            slot_meta,
            tokenizer,
            ontology=ontology,  # predefined ontology
            max_seq_length=config.max_seq_length,  # max sequence length per turn
            max_turn_length=max_turn)  # max number of turns per dialogue

        slot_type_ids, slot_values_ids = tokenize_ontology(
            ontology, tokenizer, config.max_label_length)
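        # (Assumed shapes) slot_type_ids: token ids of each domain-slot name; slot_values_ids:
        # one list per slot with the token ids of its candidate values, padded/truncated to
        # config.max_label_length.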

        # Declare the model
        num_labels = [len(s) for s in slot_values_ids]  # number of candidate values per slot

        model = SUMBT(config, num_labels, device)
        model.initialize_slot_value_lookup(
            slot_values_ids,
            slot_type_ids)  # pre-encode the tokenized ontology using BERT_SV

    eval_features = processor.convert_examples_to_features(eval_examples)
    eval_loader = get_data_loader(processor, eval_features,
                                  config.eval_batch_size)
    print("# eval:", len(eval_loader))

    ckpt = torch.load(
        f'{args.model_dir}/{args.model_fold}/model-{args.chkpt_idx}.bin',
        map_location="cpu")
    model.load_state_dict(ckpt)
    model.to(device)
    print("Model is loaded")

    inference_module = getattr(import_module("inference"),
                               f"inference_{config.dst}")
    predictions = inference_module(model, eval_loader, processor, device)

    os.makedirs(args.output_dir, exist_ok=True)

    json.dump(
        predictions,
        open(f"{args.output_dir}/{args.model_fold}-predictions.csv", "w"),
        indent=2,
        ensure_ascii=False,
    )
class TRADEHandler(BaseHandler, ABC):
    """
    Transformers text classifier handler class. This handler takes a text (string) and
    as input and returns the classification text based on the serialized transformers checkpoint.
    """
    def __init__(self):
        super(TRADEHandler, self).__init__()
        self.initialized = False

        self.config, self.slot_meta = self.load_json_data(
            "./exp_config.json", "./slot_meta.json")

    def load_json_data(self, exp_config_path, slot_meta_path):

        config = json.load(open(exp_config_path, "r"))
        config = argparse.Namespace(**config)

        slot_meta = json.load(open(slot_meta_path, "r"))

        return config, slot_meta

    def initialize(self, ctx):

        self.manifest = ctx.manifest

        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        serialized_file = self.manifest["model"]["serializedFile"]
        model_pt_path = os.path.join(model_dir, serialized_file)
        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() else "cpu")

        # Read the serialized model (.pt) file
        self.tokenizer = BertTokenizer.from_pretrained(
            self.config.model_name_or_path)
        self.processor = TRADEPreprocessor(self.slot_meta, self.tokenizer)

        tokenized_slot_meta = []
        for slot in self.slot_meta:
            tokenized_slot_meta.append(
                self.tokenizer.encode(slot.replace("-", " "),
                                      add_special_tokens=False))

        self.model = TRADE(self.config, tokenized_slot_meta)
        ckpt = torch.load(model_pt_path, map_location="cpu")

        self.model.load_state_dict(ckpt)
        self.model.to(self.device)
        print("Model is loaded")

        self.initialized = True

    def preprocess(self, requests):
        """ Very basic preprocessing code - only tokenizes.
            Extend with your own preprocessing steps as needed.
        """
        input_batch = []
        for idx, data in enumerate(requests):
            input_text = data.get("data")
            if input_text is None:
                input_text = data.get("body")
            if isinstance(input_text, (bytes, bytearray)):
                input_text = input_text.decode('utf-8')

            input_text = json.loads(input_text)
            input_batch.extend(input_text)

        eval_examples = get_examples_from_dialogues(input_batch,
                                                    user_first=False,
                                                    dialogue_level=False)
        eval_features = self.processor.convert_examples_to_features(
            eval_examples)
        eval_data = WOSDataset(eval_features)
        eval_sampler = SequentialSampler(eval_data)
        eval_loader = DataLoader(
            eval_data,
            batch_size=1,
            sampler=eval_sampler,
            collate_fn=self.processor.collate_fn,
        )

        return eval_loader

    def postprocess_state(self, state):
        """Normalize spacing around ':' and ',' in the recovered slot-value strings."""
        for i, s in enumerate(state):
            s = s.replace(" : ", ":")
            state[i] = s.replace(" , ", ", ")
        return state

    def inference(self, inputs):
        self.model.eval()
        output_lst = []
        predictions = {}
        for batch in inputs:
            input_ids, segment_ids, input_masks, gating_ids, target_ids, guids = [
                b.to(self.device) if not isinstance(b, list) else b
                for b in batch
            ]

            with torch.no_grad():
                # 9 is presumably the maximum number of value tokens to decode per slot
                o, g = self.model(input_ids, segment_ids, input_masks, 9)

                _, generated_ids = o.max(-1)
                _, gated_ids = g.max(-1)

            for guid, gate, gen in zip(guids, gated_ids.tolist(),
                                       generated_ids.tolist()):
                prediction = self.processor.recover_state(gate, gen)
                prediction = self.postprocess_state(prediction)
                predictions[guid] = prediction

        output_lst.append(predictions)
        return output_lst

    # def inference(self, inputs):
    #     """
    #     Predict the class of a text using a trained transformer model.
    #     """
    #     # NOTE: This makes the assumption that your model expects text to be tokenized
    #     # with "input_ids" and "token_type_ids" - which is true for some popular transformer models, e.g. bert.
    #     # If your transformer model expects different tokenization, adapt this code to suit
    #     # its expected input format.
    #     prediction = self.model(
    #         inputs['input_ids'].to(self.device),
    #         token_type_ids=inputs['token_type_ids'].to(self.device)
    #     )[0].argmax().item()
    #     logger.info("Model predicted: '%s'", prediction)
    #
    #     if self.mapping:
    #         prediction = self.mapping[str(prediction)]
    #
    #     return [prediction]

    def postprocess(self, inference_output):
        # TODO: Add any needed post-processing of the model predictions here
        return inference_output
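

# A minimal local smoke test for TRADEHandler. This is a sketch only: MockContext and the
# file paths below are hypothetical stand-ins for TorchServe's runtime context, and it
# assumes exp_config.json, slot_meta.json, and the serialized model file are present in the
# working directory. In production the handler is packaged with torch-model-archiver and
# run by TorchServe.
if __name__ == "__main__":
    class MockContext:
        # Stand-in for TorchServe's context object (hypothetical values).
        manifest = {"model": {"serializedFile": "model.bin"}}
        system_properties = {"model_dir": ".", "gpu_id": 0}

    handler = TRADEHandler()
    handler.initialize(MockContext())

    # Dialogues in the same format as eval_dials.json from the earlier examples.
    dialogues = json.load(open("eval_dials.json", "r"))
    request = [{"data": json.dumps(dialogues, ensure_ascii=False)}]

    loader = handler.preprocess(request)
    outputs = handler.inference(loader)
    print(handler.postprocess(outputs))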