def test_make_batch():
    features = {
        "f1": 0,
        "f2": 1,
        "f3": 3,
    }
    labels = {
        "l1": 0,
        "l2": 1,
        "l3": 2,
    }

    batch = make_batch(features, labels)

    assert batch.features == features
    assert batch.labels == labels
def test_batch_sort_by_key():
    features = [{"f1": "long long long"}, {"f1": "short"}, {"f1": "mid mid"}]
    labels = [
        {"l1": 3},
        {"l1": 1},
        {"l1": 2},
    ]

    batch = make_batch(features, labels)
    batch.sort_by_key("f1")

    assert batch.features == sorted(features, key=lambda x: len(x["f1"]))
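# A minimal sketch of the Batch/make_batch contract the tests above exercise,
# inferred only from their assertions. This is an assumption for illustration,
# not the library's actual implementation (the real sort order, for example,
# may not be length-based).
from dataclasses import dataclass
from typing import Any


@dataclass
class _SketchBatch:
    features: Any
    labels: Any

    def sort_by_key(self, key):
        # Reorder parallel feature/label lists by the length of features[i][key].
        order = sorted(range(len(self.features)),
                       key=lambda i: len(self.features[i][key]))
        self.features = [self.features[i] for i in order]
        self.labels = [self.labels[i] for i in order]


def _sketch_make_batch(features, labels):
    return _SketchBatch(features, labels)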
def _read(self, file_path, data_type=None):
    word_tokenized_error_count, sub_level_tokenized_error_count = 0, 0

    data = self.data_handler.read(file_path)
    squad = json.loads(data)
    if "data" in squad:
        squad = squad["data"]

    helper = Helper(**{
        "file_path": file_path,
        "raw_dataset": squad,
        "cls_token": self.cls_token,
        "sep_token": self.sep_token,
        "dataset": SQuADBertDataset,
    })
    helper.set_model_parameter({
        "lang_code": self.lang_code,
    })

    features, labels = [], []

    is_training = data_type == "train"

    for article in tqdm(squad, desc=data_type):
        for paragraph in article["paragraphs"]:
            context_text = paragraph["context"].replace("``", '" ').replace("''", '" ')
            context_tokens = self.word_tokenizer.tokenize(context_text)
            context_spans, char_to_word_offset = self._convert_to_spans(context_text, context_tokens)

            context_tokens = [
                Token(text, span)
                for (text, span) in zip(context_tokens, context_spans)
            ]

            context_sub_tokens = []
            for token in context_tokens:
                for sub_token in self.sub_level_tokenizer.tokenize(token.text):
                    context_sub_tokens.append(Token(sub_token, token.text_span))

            for qa in paragraph["qas"]:
                question_text = qa["question"]
                question_text = " ".join(self.word_tokenizer.tokenize(question_text))
                question_sub_tokens = [
                    Token(sub_token)
                    for sub_token in self.sub_level_tokenizer.tokenize(question_text)
                ]

                id_ = qa["id"]
                answers = qa["answers"]

                answer_texts, answer_indices = [], []

                if qa.get("is_impossible", None):
                    answers = qa["plausible_answers"]
                    answerable = 0
                else:
                    answers = qa["answers"]
                    answerable = 1

                for answer in answers:
                    answer_start = answer["answer_start"]
                    answer_end = answer_start + len(answer["text"]) - 1

                    answer_texts.append(answer["text"])
                    answer_indices.append((answer_start, answer_end))

                if len(answer_indices) > 0:
                    answer_char_start, answer_char_end = self._find_one_most_common(answer_indices)
                    answer_word_start = char_to_word_offset[answer_char_start]
                    answer_word_end = char_to_word_offset[answer_char_end]

                    char_answer_text = context_text[answer_char_start:answer_char_end + 1]
                    word_answer_text = context_text[
                        context_spans[answer_word_start][0]:context_spans[answer_word_end][1]
                    ]

                    if not self._is_rebuild(char_answer_text, word_answer_text):
                        logger.warning(
                            f"word_tokenized_error: {char_answer_text} ### {word_answer_text}"
                        )
                        word_tokenized_error_count += 1
                else:  # Unanswerable
                    answers = ["<noanswer>"]
                    answer_char_start, answer_char_end = -1, -1
                    answer_word_start, answer_word_end = -1, -1

                bert_features, bert_labels = self._make_features_and_labels(
                    context_sub_tokens,
                    question_sub_tokens,
                    answer_char_start,
                    answer_char_end + 1,
                )

                for (index, (feature, label)) in enumerate(zip(bert_features, bert_labels)):
                    bert_tokens = feature
                    answer_start, answer_end = label

                    if is_training and (
                            answer_start < 0
                            or answer_start >= len(bert_tokens)
                            or answer_end >= len(bert_tokens)
                            or bert_tokens[answer_start].text_span is None
                            or bert_tokens[answer_end].text_span is None):
                        continue

                    if is_training:
                        char_start = bert_tokens[answer_start].text_span[0]
                        char_end = bert_tokens[answer_end].text_span[1]
                        bert_answer = context_text[char_start:char_end]
                        if char_answer_text != bert_answer:
                            logger.warning(
                                f"sub_level_tokenized_error: {char_answer_text} ### {word_answer_text}"
                            )
                            sub_level_tokenized_error_count += 1

                    feature_row = {
                        "bert_input": [token.text for token in bert_tokens],
                        "bert_token": bert_tokens,
                    }
                    features.append(feature_row)

                    bert_id = id_ + f"#{index}"
                    label_row = {
                        "id": bert_id,  # question_id + bert_index
                        "answer_texts": "\t".join(answer_texts),
                        "answer_start": answer_start,
                        "answer_end": answer_end,
                        "answerable": answerable,
                    }
                    labels.append(label_row)

                    if id_ not in helper.examples:
                        helper.set_example(id_, {
                            "context": context_text,
                            "question": question_text,
                            "answers": answer_texts,
                        })
                    helper.set_example(id_, {
                        f"bert_tokens_{index}": bert_tokens,
                    }, update=True)

    logger.info(
        f"tokenized_error_count - word: {word_tokenized_error_count} | sub_level: {sub_level_tokenized_error_count}"
    )
    return utils.make_batch(features, labels), helper.to_dict()
def _read(self, file_path, data_type=None): """ .json file structure should be something like this: { "data": [ { "sequence": "what a wonderful day!", "emotion": "happy" }, ... ], "emotion": [ // class_key "angry", "happy", "sad", ... ] } """ data = self._get_data(file_path, data_type=data_type) class_idx2text, class_text2idx = self._get_class_dicts(data=data) helper = Helper(**{ "file_path": file_path, "class_idx2text": class_idx2text, "class_text2idx": class_text2idx, }) helper.set_model_parameter({ "num_classes": len(class_idx2text), }) helper.set_predict_helper({ "class_idx2text": class_idx2text, }) features, labels = [], [] for example in tqdm(data, desc=data_type): sequence = example["sequence"].strip().replace("\n", "") sequence_words = self.word_tokenizer.tokenize(sequence) if ( self.sequence_max_length is not None and data_type == "train" and len(sequence_words) > self.sequence_max_length ): continue if "uid" in example: data_uid = example["uid"] else: data_uid = str(uuid.uuid1()) feature_row = { "id": data_uid, "sequence": sequence, } features.append(feature_row) class_text = example[self.class_key] label_row = { "id": data_uid, "class_idx": class_text2idx[class_text], "class_text": class_text, } labels.append(label_row) helper.set_example(data_uid, { "sequence": sequence, "class_idx": class_text2idx[class_text], "class_text": class_text, }) return utils.make_batch(features, labels), helper.to_dict()
def __call__(self, features, labels, apply_pad_labels=(), apply_pad_values=()):
    self.collate(features)
    self.collate(
        labels,
        apply_pad=False,
        apply_pad_labels=apply_pad_labels,
        apply_pad_values=apply_pad_values,
    )
    return utils.make_batch(features, labels)
def __call__(self, features, labels):
    self.collate(features, pad_value=self.pad_value)
    self.collate(labels, apply_pad=False, pad_value=self.pad_value)
    return utils.make_batch(features, labels)
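# Hedged sketch of what the two collators above do conceptually: features are
# padded to the longest sequence in the batch, labels are collated without
# padding. The helper below is illustrative only, not the library's collate()
# implementation.
def _sketch_pad_batch(sequences, pad_value=0):
    # Right-pad every sequence to the batch maximum length.
    max_len = max(len(seq) for seq in sequences)
    return [seq + [pad_value] * (max_len - len(seq)) for seq in sequences]

# _sketch_pad_batch([[1, 2, 3], [4]], pad_value=0) -> [[1, 2, 3], [4, 0, 0]]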
def _read(self, file_path, data_type=None): """ .json file structure should be something like this: { "data": [ { "sequence": "i'm looking for a flight from New York to London.", "slots": ["O", "O", "O", "O", "O", "O", "B-city.dept", "I-city.dept" "O", "B-city.dest"] // the number of tokens in sequence.split() and tags must match }, ... ], "slots": [ // tag_key "O", // tags should be in IOB format "B-city.dept", "I-city.dept", "B-city.dest", "I-city.dest", ... ] } """ data = self._get_data(file_path) tag_idx2text, tag_text2idx = self._get_tag_dicts(data=data) helper = Helper( **{ "file_path": file_path, "tag_idx2text": tag_idx2text, "ignore_tag_idx": self.ignore_tag_idx, "cls_token": self.cls_token, "sep_token": self.sep_token, }) helper.set_model_parameter({ "num_tags": len(tag_idx2text), "ignore_tag_idx": self.ignore_tag_idx, }) helper.set_predict_helper({ "tag_idx2text": tag_idx2text, }) features, labels = [], [] for example in tqdm(data, desc=data_type): sequence_text = example["sequence"].strip().replace("\n", "") sequence_tokens = self.word_tokenizer.tokenize(sequence_text) naive_tokens = sequence_text.split() is_head_word = utils.get_is_head_of_word(naive_tokens, sequence_tokens) sequence_sub_tokens = [] tagged_sub_token_idxs = [] curr_sub_token_idx = 1 # skip CLS_TOKEN for token_idx, token in enumerate(sequence_tokens): for sub_token_pos, sub_token in enumerate( self.subword_tokenizer.tokenize(token, unit="word")): sequence_sub_tokens.append(sub_token) if is_head_word[token_idx] and sub_token_pos == 0: tagged_sub_token_idxs.append(curr_sub_token_idx) curr_sub_token_idx += 1 bert_input = [self.cls_token ] + sequence_sub_tokens + [self.sep_token] if (self.sequence_max_length is not None and data_type == "train" and len(bert_input) > self.sequence_max_length): continue if "uid" in example: data_uid = example["uid"] else: data_uid = str(uuid.uuid1()) tag_texts = example[self.tag_key] tag_idxs = [tag_text2idx[tag_text] for tag_text in tag_texts] utils.sanity_check_iob(naive_tokens, tag_texts) assert len(naive_tokens) == len(tagged_sub_token_idxs), \ f"""Wrong tagged_sub_token_idxs: followings mismatch. naive_tokens: {naive_tokens} sequence_sub_tokens: {sequence_sub_tokens} tagged_sub_token_idxs: {tagged_sub_token_idxs}""" feature_row = { "id": data_uid, "bert_input": bert_input, "tagged_sub_token_idxs": tagged_sub_token_idxs, "num_tokens": len(naive_tokens), } features.append(feature_row) label_row = { "id": data_uid, "tag_idxs": tag_idxs, "tag_texts": tag_texts, } labels.append(label_row) helper.set_example( data_uid, { "sequence": sequence_text, "sequence_sub_tokens": sequence_sub_tokens, "tag_idxs": tag_idxs, "tag_texts": tag_texts, }) return utils.make_batch(features, labels), helper.to_dict()
def _read(self, file_path, data_type=None):
    tokenized_error_count = 0

    data = self.data_handler.read(file_path)
    squad = json.loads(data)
    if "data" in squad:
        squad = squad["data"]

    helper = Helper(**{
        "file_path": file_path,
        "raw_dataset": squad,
    })
    helper.set_model_parameter({
        "lang_code": self.lang_code,
    })

    features, labels = [], []

    for article in tqdm(squad, desc=data_type):
        for paragraph in article["paragraphs"]:
            context = paragraph["context"].replace("``", '" ').replace("''", '" ')
            context_words = self.word_tokenizer.tokenize(context)

            if (self.context_max_length is not None
                    and data_type == "train"
                    and len(context_words) > self.context_max_length):
                continue

            for qa in paragraph["qas"]:
                question = qa["question"].strip().replace("\n", "")
                id_ = qa["id"]

                answer_texts, answer_indices = [], []

                if qa.get("is_impossible", None):
                    answers = qa["plausible_answers"]
                    answerable = 0
                else:
                    answers = qa["answers"]
                    answerable = 1

                for answer in answers:
                    answer_start = answer["answer_start"]
                    answer_end = answer_start + len(answer["text"])

                    answer_texts.append(answer["text"])
                    answer_indices.append((answer_start, answer_end))

                feature_row = {
                    "context": self._clean_text(context),
                    "question": question,
                }
                features.append(feature_row)

                if len(answer_indices) > 0:
                    answer_start, answer_end = self._find_one_most_common(answer_indices)
                    text_spans = self._convert_to_spans(context, context_words)
                    word_idxs = self._get_word_span_idxs(text_spans, answer_start, answer_end)

                    word_answer_start = word_idxs[0]
                    word_answer_end = word_idxs[-1]

                    # To check rebuild answer: char_answer_text - word_answer_text
                    char_answer_text = context[answer_start:answer_end]
                    word_answer_text = context[
                        text_spans[word_answer_start][0]:text_spans[word_answer_end][1]
                    ]

                    if not self._is_rebuild(char_answer_text, word_answer_text):
                        logger.warning(
                            f"word_tokenized_error: {char_answer_text} ### {word_answer_text}"
                        )
                        tokenized_error_count += 1
                else:  # Unanswerable
                    answers = ["<noanswer>"]
                    text_spans = []
                    answer_start, answer_end = 0, 0
                    word_answer_start, word_answer_end = 0, 0

                label_row = {
                    "id": id_,
                    "answer_start": word_answer_start,
                    "answer_end": word_answer_end,
                    "answerable": answerable,
                }
                labels.append(label_row)

                helper.set_example(id_, {
                    "context": context,
                    "text_span": text_spans,
                    "question": question,
                    "answers": answer_texts,
                })

    logger.info(f"tokenized_error_count: {tokenized_error_count}")
    return utils.make_batch(features, labels), helper.to_dict()
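# Hedged illustration of the char-to-word span mapping above. The spans are
# assumed to be half-open (start, end) character offsets, as the slicing in the
# code suggests; the real values come from _convert_to_spans.
#
#   context       = "Seoul is the capital."
#   context_words = ["Seoul", "is", "the", "capital", "."]
#   text_spans    = [(0, 5), (6, 8), (9, 12), (13, 20), (20, 21)]
#   answer "capital": answer_start=13, answer_end=20
#   -> word_answer_start = word_answer_end = 3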
def _read(self, file_path, data_type=None): """ .json file structure should be something like this: { "data": [ { "sequence_a": "what a wonderful day!", "sequence_b": "what a great day!", "score": 0.9 }, ... ] } """ data = self._get_data(file_path, data_type=data_type) helper = Helper(**{ "file_path": file_path, "cls_token": self.cls_token, "sep_token": self.sep_token, }) features, labels = [], [] for example in tqdm(data, desc=data_type): sequence_a = utils.get_sequence_a(example) sequence_b = example.get("sequence_b", None) sequence_a_tokens = self.tokenizer.tokenize(sequence_a) sequence_b_tokens = None if sequence_b: sequence_b_tokens = self.tokenizer.tokenize(sequence_b) bert_input = utils.make_bert_input( sequence_a, sequence_b, self.tokenizer, max_seq_length=self.sequence_max_length, data_type=data_type, cls_token=self.cls_token, sep_token=self.sep_token, input_type=self.input_type, ) if bert_input is None: continue if "uid" in example: data_uid = example["uid"] else: data_uid = str(uuid.uuid1()) feature_row = { "id": data_uid, "bert_input": bert_input, } features.append(feature_row) score = example[self.label_key] label_row = { "id": data_uid, "score": score, } labels.append(label_row) helper.set_example(data_uid, { "sequence_a": sequence_a, "sequence_a_tokens": sequence_a_tokens, "sequence_b": sequence_b, "sequence_b_tokens": sequence_b_tokens, "score": score, }) if self.is_test and len(features) >= 10: break return utils.make_batch(features, labels), helper.to_dict()
def _read(self, file_path, data_type=None): """ .json file structure should be something like this: { "data": [ { "sequence": "what a wonderful day!", "emotion": "happy" }, ... ], "emotion": [ // class_key "angry", "happy", "sad", ... ] } """ data = self._get_data(file_path, data_type=data_type) class_idx2text, class_text2idx = self._get_class_dicts(data=data) helper = Helper(**{ "file_path": file_path, "class_idx2text": class_idx2text, "class_text2idx": class_text2idx, "cls_token": self.cls_token, "sep_token": self.sep_token, "dataset": SeqClsBertDataset, "metric_key": self.METRIC_KEY, }) helper.set_model_parameter({ "num_classes": len(class_idx2text), }) helper.set_predict_helper({ "class_idx2text": class_idx2text, }) features, labels = [], [] for example in tqdm(data, desc=data_type): sequence_a = utils.get_sequence_a(example) sequence_b = example.get("sequence_b", None) sequence_a_tokens = self.tokenizer.tokenize(sequence_a) sequence_b_tokens = None if sequence_b: sequence_b_tokens = self.tokenizer.tokenize(sequence_b) bert_input = utils.make_bert_input( sequence_a, sequence_b, self.tokenizer, max_seq_length=self.sequence_max_length, data_type=data_type, cls_token=self.cls_token, sep_token=self.sep_token, input_type=self.input_type, ) if bert_input is None: continue if "uid" in example: data_uid = example["uid"] else: data_uid = str(uuid.uuid1()) # token_type(segment_ids) will be added in dataset feature_row = { "id": data_uid, "bert_input": bert_input, } features.append(feature_row) class_text = example[self.class_key] label_row = { "id": data_uid, "class_idx": class_text2idx[class_text], "class_text": class_text, } labels.append(label_row) helper.set_example(data_uid, { "sequence_a": sequence_a, "sequence_a_tokens": sequence_a_tokens, "sequence_b": sequence_b, "sequence_b_tokens": sequence_b_tokens, "class_idx": class_text2idx[class_text], "class_text": class_text, }) if self.is_test and len(features) >= 10: break return utils.make_batch(features, labels), helper.to_dict()
def _read(self, file_path, data_type=None):
    file_path = self.data_handler.read(file_path, return_path=True)
    file_path = Path(file_path)

    data_dir = file_path.parent
    file_name = file_path.stem

    db_path = data_dir / f"{file_name}.db"
    table_path = data_dir / f"{file_name}.tables.jsonl"

    self.dbengine = DBEngine(db_path)

    helper = Helper(**{
        "file_path": file_path,
        "db_path": db_path,
    })

    features, labels = [], []

    sql_datas, table_data = self.load_data(file_path, table_path, data_type=data_type)

    for sql_data in tqdm(sql_datas, desc=data_type):
        question = sql_data["question"]
        table_id = sql_data["table_id"]
        column_headers = table_data[table_id]["header"]

        feature_row = {"column": column_headers, "question": question}

        data_uid = str(uuid.uuid1())

        conditions_value_position = self.get_coditions_value_position(
            sql_data["question"], [x[2] for x in sql_data["sql"]["conds"]])

        sql_query = Query.from_dict(sql_data["sql"], ordered=True)
        execution_result = self.dbengine.execute_query(table_id, sql_query, lower=True)

        label_row = {
            "id": data_uid,
            "table_id": table_id,
            "tokenized_question": self.word_tokenizer.tokenize(question),
            "aggregator_idx": sql_data["sql"]["agg"],
            "select_column_idx": sql_data["sql"]["sel"],
            "conditions_num": len(sql_data["sql"]["conds"]),
            "conditions_column_idx": [x[0] for x in sql_data["sql"]["conds"]],
            "conditions_operator_idx": [x[1] for x in sql_data["sql"]["conds"]],
            "conditions_value_string": [str(x[2]) for x in sql_data["sql"]["conds"]],
            "conditions_value_position": conditions_value_position,
            "sql_query": sql_query,
            "execution_result": execution_result,
        }

        features.append(feature_row)
        labels.append(label_row)

        helper.set_example(data_uid, {
            "question": question,
            "sql_query": sql_query,
            "execution_result": execution_result,
        })

        if self.is_test and len(labels) == 10:
            break

    return utils.make_batch(features, labels), helper.to_dict()
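# Hedged illustration of the WikiSQL-style record consumed above. The values are
# made up; each entry of "conds" is read as [column_idx, operator_idx, value],
# matching the x[0]/x[1]/x[2] indexing in the loop.
#
#   sql_data = {
#       "question": "What is the capital of South Korea?",
#       "table_id": "1-1000181-1",
#       "sql": {"agg": 0, "sel": 2, "conds": [[0, 0, "South Korea"]]},
#   }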