def preprocess(self, requests):
    """
    Very basic preprocessing code - only tokenizes.
    Extend with your own preprocessing steps as needed.
    """
    input_batch = []
    for data in requests:
        input_text = data.get("data")
        if input_text is None:
            input_text = data.get("body")
        if isinstance(input_text, (bytes, bytearray)):
            input_text = input_text.decode("utf-8")
        input_text = json.loads(input_text)
        input_batch.extend(input_text)

    eval_examples = get_examples_from_dialogues(
        input_batch, user_first=False, dialogue_level=False
    )
    eval_features = self.processor.convert_examples_to_features(eval_examples)
    eval_data = WOSDataset(eval_features)
    eval_sampler = SequentialSampler(eval_data)
    eval_loader = DataLoader(
        eval_data,
        batch_size=1,
        sampler=eval_sampler,
        collate_fn=self.processor.collate_fn,
    )
    return eval_loader
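# Hypothetical usage sketch (not part of the handler itself): what a
# TorchServe-style request to preprocess() might look like. The dialogue dict
# below is an assumption about the WOS-style input that
# get_examples_from_dialogues() consumes; field names may differ in the
# actual dataset.
import json

sample_dialogue = {
    "dialogue_idx": "demo-0",  # hypothetical id
    "dialogue": [
        {"role": "user", "text": "I'm looking for a place to stay in Seoul."},
        {"role": "sys", "text": "What price range would you like?"},
    ],
}
# TorchServe passes a list of request dicts; the payload sits under
# "data" or "body" and may arrive as raw bytes.
sample_requests = [{"body": json.dumps([sample_dialogue]).encode("utf-8")}]
# eval_loader = handler.preprocess(sample_requests)  # yields batches of size 1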
parser.add_argument("--teacher_forcing_ratio", type=float, default=0.5)
args = parser.parse_args()
args.data_dir = os.environ["SM_CHANNEL_TRAIN"]
args.model_dir = os.environ["SM_MODEL_DIR"]

# Fix the random seed
set_seed(args.random_seed)

# Data Loading
train_data_file = f"{args.data_dir}/train_dials.json"
slot_meta = json.load(open(f"{args.data_dir}/slot_meta.json"))
train_data, dev_data, dev_labels = load_dataset(train_data_file)

train_examples = get_examples_from_dialogues(
    train_data, user_first=False, dialogue_level=False
)
dev_examples = get_examples_from_dialogues(
    dev_data, user_first=False, dialogue_level=False
)

# Define Preprocessor
tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
processor = TRADEPreprocessor(slot_meta, tokenizer)
args.vocab_size = len(tokenizer)
args.n_gate = len(processor.gating2id)  # number of gates: none, dontcare, ptr

# Extracting Features
train_features = processor.convert_examples_to_features(train_examples)
dev_features = processor.convert_examples_to_features(dev_examples)
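# The script above calls set_seed() without showing its body. A minimal
# sketch of such a helper is given below (an assumption, not necessarily the
# project's actual implementation): it seeds Python's, NumPy's, and PyTorch's
# RNGs so training runs are reproducible.
import random

import numpy as np
import torch


def set_seed(seed: int) -> None:
    """Seed every RNG the training loop touches."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)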
args = parser.parse_args()
args.data_dir = os.environ["SM_CHANNEL_EVAL"]
args.model_dir = os.environ["SM_CHANNEL_MODEL"]
args.output_dir = os.environ["SM_OUTPUT_DATA_DIR"]
model_dir_path = os.path.dirname(args.model_dir)

eval_data = json.load(open(f"{args.data_dir}/eval_dials.json", "r"))
config = json.load(open(f"{model_dir_path}/exp_config.json", "r"))
config = argparse.Namespace(**config)
slot_meta = json.load(open(f"{model_dir_path}/slot_meta.json", "r"))

tokenizer = BertTokenizer.from_pretrained(config.model_name_or_path)
processor = TRADEPreprocessor(slot_meta, tokenizer)

eval_examples = get_examples_from_dialogues(
    eval_data, user_first=False, dialogue_level=False
)

# Extracting Features
eval_features = processor.convert_examples_to_features(eval_examples)
eval_data = WOSDataset(eval_features)
eval_sampler = SequentialSampler(eval_data)
eval_loader = DataLoader(
    eval_data,
    batch_size=args.eval_batch_size,
    sampler=eval_sampler,
    collate_fn=processor.collate_fn,
)
print("# eval:", len(eval_data))

tokenized_slot_meta = []
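# The excerpt stops right after initializing tokenized_slot_meta. A plausible
# continuation is sketched below: an assumption based on how TRADE typically
# embeds slot names, not necessarily this project's exact code. Each
# "domain-slot" string is encoded without special tokens; replacing "-" with
# a space is likewise an assumption about the separator.
for slot in slot_meta:
    tokenized_slot_meta.append(
        tokenizer.encode(slot.replace("-", " "), add_special_tokens=False)
    )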
from tqdm import tqdm
from transformers import BertTokenizer, BertConfig
from data_utils import get_examples_from_dialogues, convert_state_dict, load_dataset
from data_utils import OntologyDSTFeature, DSTPreprocessor, _truncate_seq_pair
from transformers.modeling_bert import BertOnlyMLMHead

"""## Data Loading"""

train_data_file = "/opt/ml/input/data/train_dataset/train_dials.json"
slot_meta = json.load(open("/opt/ml/input/data/train_dataset/slot_meta.json"))
ontology = json.load(open("/opt/ml/input/data/train_dataset/ontology.json"))
train_data, dev_data, dev_labels = load_dataset(train_data_file)

train_examples = get_examples_from_dialogues(
    data=train_data, user_first=True, dialogue_level=True
)
dev_examples = get_examples_from_dialogues(
    data=dev_data, user_first=True, dialogue_level=True
)

len(train_data)

max_turn = max([len(e["dialogue"]) for e in train_data])
tokenizer = BertTokenizer.from_pretrained("dsksd/bert-ko-small-minimal")

"""## TODO-1: Define the SUMBT Preprocessor

You need to define a Preprocessor that builds the InputFeatures for SUMBT, an ontology-based DST model. <br>
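# A quick, illustrative peek at the ontology loaded above (an assumption: in
# ontology-based DST, ontology.json is expected to map each "domain-slot"
# name to its list of candidate values, which SUMBT scores at every turn).
example_slot = slot_meta[0]
print(example_slot, "->", ontology[example_slot][:5])  # first few candidate values
print("slots:", len(slot_meta), "| max dialogue length:", max_turn)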