def read_one_example(self, inputs):
    """ inputs keys: sequence_a and sequence_b """
    sequence_a = utils.get_sequence_a(inputs)
    sequence_b = inputs.get("sequence_b", None)

    sequence_a_sub_tokens = self.subword_tokenizer.tokenize(sequence_a)
    bert_input = [self.CLS_TOKEN] + sequence_a_sub_tokens + [self.SEP_TOKEN]

    if sequence_b:
        sequence_b_sub_tokens = self.subword_tokenizer.tokenize(sequence_b)
        bert_input += sequence_b_sub_tokens + [self.SEP_TOKEN]

    # at predict time, over-length inputs are truncated while keeping the final [SEP]
    if len(bert_input) > self.sequence_max_length:
        bert_input = bert_input[:self.sequence_max_length - 1] + [self.SEP_TOKEN]

    token_type = utils.make_bert_token_type(bert_input, SEP_token=self.SEP_TOKEN)

    features = []
    features.append({
        "bert_input": bert_input,
        "token_type": {"feature": token_type, "text": ""},  # TODO: fix hard-code
    })
    return features, {}
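# Hedged sketch of utils.make_bert_token_type, which is assumed to build BERT
# segment ids from the assembled token list: everything up to and including the
# first [SEP] gets segment id 0, the rest gets 1. This is an illustration of the
# assumed behaviour, not the actual helper.
def make_bert_token_type_sketch(bert_input, SEP_token="[SEP]"):
    token_type = []
    segment_id = 0
    for token in bert_input:
        token_type.append(segment_id)
        if token == SEP_token and segment_id == 0:
            segment_id = 1  # switch to the second segment after the first [SEP]
    return token_type

# e.g. ["[CLS]", "hi", "[SEP]", "there", "[SEP]"] -> [0, 0, 0, 1, 1]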
def read_one_example(self, inputs):
    """ inputs keys: sequence_a and sequence_b """
    sequence_a = utils.get_sequence_a(inputs)
    sequence_b = inputs.get("sequence_b", None)

    bert_feature = BertFeature()
    bert_feature.set_input_with_speical_token(
        sequence_a,
        sequence_b,
        self.tokenizer,
        max_seq_length=self.sequence_max_length,
        data_type="predict",
        cls_token=self.cls_token,
        sep_token=self.sep_token,
        input_type=self.input_type,
    )

    features = [bert_feature.to_dict()]
    helper = {}
    return features, helper
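# Hedged sketch of the predict-time truncation that BertFeature is assumed to
# apply internally, mirroring the manual clipping in the older reader above:
# clip to max_seq_length - 1 tokens and re-append the final [SEP]. The function
# name is hypothetical and not part of BertFeature.
def truncate_for_predict(bert_input, max_seq_length, sep_token="[SEP]"):
    if max_seq_length is not None and len(bert_input) > max_seq_length:
        bert_input = bert_input[:max_seq_length - 1] + [sep_token]
    return bert_input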
def _read(self, file_path, data_type=None):
    """
    .json file structure should be something like this:

    {
        "data": [
            {
                "sequence_a": "what a wonderful day!",
                "sequence_b": "what a great day!",
                "score": 0.9
            },
            ...
        ]
    }
    """
    data = self._get_data(file_path, data_type=data_type)

    helper = {
        "file_path": file_path,
        "examples": {},
        "cls_token": self.CLS_TOKEN,
        "sep_token": self.SEP_TOKEN,
        "unk_token": self.UNK_TOKEN,
        "model": {},
        "predict_helper": {},
    }

    features, labels = [], []

    for example in tqdm(data, desc=data_type):
        sequence_a = utils.get_sequence_a(example)
        sequence_b = example.get("sequence_b", None)

        sequence_a_sub_tokens = self.subword_tokenizer.tokenize(sequence_a)
        sequence_b_sub_tokens = None

        bert_input = [self.CLS_TOKEN] + sequence_a_sub_tokens + [self.SEP_TOKEN]
        if sequence_b is not None:
            sequence_b_sub_tokens = self.subword_tokenizer.tokenize(sequence_b)
            bert_input += sequence_b_sub_tokens + [self.SEP_TOKEN]

        # skip over-length examples while training
        if (self.sequence_max_length is not None
                and data_type == "train"
                and len(bert_input) > self.sequence_max_length):
            continue

        if "uid" in example:
            data_uid = example["uid"]
        else:
            data_uid = str(uuid.uuid1())

        feature_row = {
            "id": data_uid,
            "bert_input": bert_input,
        }
        features.append(feature_row)

        score = example[self.label_key]
        label_row = {
            "id": data_uid,
            "score": score,
        }
        labels.append(label_row)

        helper["examples"][data_uid] = {
            "sequence_a": sequence_a,
            "sequence_a_sub_tokens": sequence_a_sub_tokens,
            "sequence_b": sequence_b,
            "sequence_b_sub_tokens": sequence_b_sub_tokens,
            "score": score,
        }

        if self.is_test and len(features) >= 10:
            break

    return make_batch(features, labels), helper
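# Hedged sketch of utils.get_sequence_a, assumed to pull the first sequence out
# of an example dict. The fallback to a plain "sequence" key is inferred from the
# classification reader's .json layout further below, not confirmed.
def get_sequence_a_sketch(example):
    if "sequence_a" in example:
        return example["sequence_a"]
    return example["sequence"]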
def _read(self, file_path, data_type=None):
    """
    .json file structure should be something like this:

    {
        "data": [
            {
                "sequence_a": "what a wonderful day!",
                "sequence_b": "what a great day!",
                "score": 0.9
            },
            ...
        ]
    }
    """
    data = self._get_data(file_path, data_type=data_type)

    helper = Helper(**{
        "file_path": file_path,
        "cls_token": self.cls_token,
        "sep_token": self.sep_token,
    })

    features, labels = [], []

    for example in tqdm(data, desc=data_type):
        sequence_a = utils.get_sequence_a(example)
        sequence_b = example.get("sequence_b", None)

        sequence_a_tokens = self.tokenizer.tokenize(sequence_a)
        sequence_b_tokens = None
        if sequence_b:
            sequence_b_tokens = self.tokenizer.tokenize(sequence_b)

        bert_input = utils.make_bert_input(
            sequence_a,
            sequence_b,
            self.tokenizer,
            max_seq_length=self.sequence_max_length,
            data_type=data_type,
            cls_token=self.cls_token,
            sep_token=self.sep_token,
            input_type=self.input_type,
        )
        if bert_input is None:
            continue

        if "uid" in example:
            data_uid = example["uid"]
        else:
            data_uid = str(uuid.uuid1())

        feature_row = {
            "id": data_uid,
            "bert_input": bert_input,
        }
        features.append(feature_row)

        score = example[self.label_key]
        label_row = {
            "id": data_uid,
            "score": score,
        }
        labels.append(label_row)

        helper.set_example(data_uid, {
            "sequence_a": sequence_a,
            "sequence_a_tokens": sequence_a_tokens,
            "sequence_b": sequence_b,
            "sequence_b_tokens": sequence_b_tokens,
            "score": score,
        })

        if self.is_test and len(features) >= 10:
            break

    return utils.make_batch(features, labels), helper.to_dict()
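# Hedged sketch of utils.make_bert_input, under the assumptions visible in the
# readers above: assemble [CLS] a [SEP] (b [SEP]), skip over-length examples
# during training by returning None, and truncate them at predict time.
# input_type handling is omitted; the real helper may differ.
def make_bert_input_sketch(sequence_a, sequence_b, tokenizer, max_seq_length,
                           data_type="train", cls_token="[CLS]", sep_token="[SEP]"):
    bert_input = [cls_token] + tokenizer.tokenize(sequence_a) + [sep_token]
    if sequence_b:
        bert_input += tokenizer.tokenize(sequence_b) + [sep_token]

    if max_seq_length is not None and len(bert_input) > max_seq_length:
        if data_type == "train":
            return None  # drop over-length examples while training
        bert_input = bert_input[:max_seq_length - 1] + [sep_token]
    return bert_input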
def _read(self, file_path, data_type=None):
    """
    .json file structure should be something like this:

    {
        "data": [
            {
                "sequence": "what a wonderful day!",
                "emotion": "happy"
            },
            ...
        ],
        "emotion": [  // class_key
            "angry",
            "happy",
            "sad",
            ...
        ]
    }
    """
    data = self._get_data(file_path, data_type=data_type)
    class_idx2text, class_text2idx = self._get_class_dicts(data=data)

    helper = Helper(**{
        "file_path": file_path,
        "class_idx2text": class_idx2text,
        "class_text2idx": class_text2idx,
        "cls_token": self.cls_token,
        "sep_token": self.sep_token,
        "dataset": SeqClsBertDataset,
        "metric_key": self.METRIC_KEY,
    })
    helper.set_model_parameter({
        "num_classes": len(class_idx2text),
    })
    helper.set_predict_helper({
        "class_idx2text": class_idx2text,
    })

    features, labels = [], []

    for example in tqdm(data, desc=data_type):
        sequence_a = utils.get_sequence_a(example)
        sequence_b = example.get("sequence_b", None)

        sequence_a_tokens = self.tokenizer.tokenize(sequence_a)
        sequence_b_tokens = None
        if sequence_b:
            sequence_b_tokens = self.tokenizer.tokenize(sequence_b)

        bert_input = utils.make_bert_input(
            sequence_a,
            sequence_b,
            self.tokenizer,
            max_seq_length=self.sequence_max_length,
            data_type=data_type,
            cls_token=self.cls_token,
            sep_token=self.sep_token,
            input_type=self.input_type,
        )
        if bert_input is None:
            continue

        if "uid" in example:
            data_uid = example["uid"]
        else:
            data_uid = str(uuid.uuid1())

        # token_type (segment_ids) will be added in dataset
        feature_row = {
            "id": data_uid,
            "bert_input": bert_input,
        }
        features.append(feature_row)

        class_text = example[self.class_key]
        label_row = {
            "id": data_uid,
            "class_idx": class_text2idx[class_text],
            "class_text": class_text,
        }
        labels.append(label_row)

        helper.set_example(data_uid, {
            "sequence_a": sequence_a,
            "sequence_a_tokens": sequence_a_tokens,
            "sequence_b": sequence_b,
            "sequence_b_tokens": sequence_b_tokens,
            "class_idx": class_text2idx[class_text],
            "class_text": class_text,
        })

        if self.is_test and len(features) >= 10:
            break

    return utils.make_batch(features, labels), helper.to_dict()
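# Hedged sketch of the class-dict construction done by _get_class_dicts. Since
# `data` is iterated as plain examples in the reader above, this sketch derives
# the label set from each example's class_key value; the real reader may instead
# read the explicit class list shown under the class_key in the .json layout.
def get_class_dicts_sketch(data, class_key):
    class_texts = sorted({example[class_key] for example in data})
    class_idx2text = {idx: text for idx, text in enumerate(class_texts)}
    class_text2idx = {text: idx for idx, text in class_idx2text.items()}
    return class_idx2text, class_text2idx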