# Module-level imports assumed by the functions below (this file is an
# excerpt; InputExample, TrainingInstance, create_instances_from_document,
# and merge_subtokens are defined elsewhere in the project).
import os
import pickle
import re

import numpy as np
import pandas as pd
import tensorflow as tf

import tokenization


def get_sentence_examples(self, questions):
    for index, data in enumerate(questions):
        guid = 'test-%d' % index
        text_a = tokenization.convert_to_unicode(str(data[0]))
        text_b = tokenization.convert_to_unicode(str(data[1]))
        label = str(0)  # dummy label for test-time examples
        yield InputExample(guid=guid, text_a=text_a, text_b=text_b,
                           label=label)

def _create_examples(self, lines, set_type):
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        text_a = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[0])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    print("%d %s data creation is finished!" % (len(examples), set_type))
    return examples

def get_dev_examples(self, data_dir):
    file_path = os.path.join(data_dir, 'dev.csv')
    dev_df = pd.read_csv(file_path, encoding='utf-8')
    dev_data = []
    for index, dev in enumerate(dev_df.values):
        guid = 'dev-%d' % index
        text_a = tokenization.convert_to_unicode(str(dev[0]))
        text_b = tokenization.convert_to_unicode(str(dev[1]))
        label = str(dev[2])
        dev_data.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b,
                         label=label))
    return dev_data

def read_examples(input_file):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:
            line = tokenization.convert_to_unicode(reader.readline())
            if not line:
                break
            line = line.strip()
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line)
            if m is None:
                text_a = line
            else:
                text_a = m.group(1)
                text_b = m.group(2)
            examples.append(
                InputExample(unique_id=unique_id, text_a=text_a,
                             text_b=text_b))
            unique_id += 1
    print("num_examples", len(examples))
    return examples

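# A minimal usage sketch for read_examples. "sentences.txt" is a
# hypothetical file name; each of its lines holds either a single text, or
# a sentence pair separated by " ||| ":
#
#   how do I install zsh ||| sudo apt-get install zsh
#   check out xrandr as the control mechanism for video
#
# examples = read_examples("sentences.txt")
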
def get_sentence_statistics(self):
    """Computes token-length statistics for dialog contexts and responses."""
    for (i, dialog_data) in enumerate(self.data_l):
        text_dialog_context = tokenization.convert_to_unicode(dialog_data[0])
        text_response = tokenization.convert_to_unicode(dialog_data[1])

        tokens_dialog_context = self.tokenizer.tokenize(text_dialog_context)
        tokens_response = self.tokenizer.tokenize(text_response)
        tok_dialog_len = len(tokens_dialog_context)
        tok_response_len = len(tokens_response)

        # Accumulate running sums; they are averaged after the loop.
        self.avg_dialog_context += tok_dialog_len
        self.avg_response += tok_response_len

        # Track the extremes seen so far.
        if self.max_dialog_context < tok_dialog_len:
            self.max_dialog_context = tok_dialog_len
        if self.max_response < tok_response_len:
            self.max_response = tok_response_len
        if self.min_dialog_context > tok_dialog_len:
            self.min_dialog_context = tok_dialog_len
        if self.min_response > tok_response_len:
            self.min_response = tok_response_len

        # Report intermediate statistics every 1000 examples.
        if (i + 1) % 1000 == 0:
            print(i + 1, "th text stat info")
            print("avg_dialog_context", self.avg_dialog_context / (i + 1))
            print("avg_response", self.avg_response / (i + 1))
            print("max_dialog_context", self.max_dialog_context)
            print("max_response", self.max_response)
            print("min_dialog_context", self.min_dialog_context)
            print("min_response", self.min_response)
            print('-' * 200)

    self.avg_dialog_context /= len(self.data_l)
    self.avg_response /= len(self.data_l)

def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences,
    #     not entire paragraphs or arbitrary spans of text, because the
    #     sentence boundaries are used for the "next sentence prediction"
    #     task.
    # (2) Blank lines between documents. Document boundaries are needed so
    #     that the "next sentence prediction" task doesn't span documents.
    index = 0
    for input_file in input_files:
        with tf.gfile.GFile(input_file, "r") as reader:
            while True:
                if (index + 1) % 100000 == 0:
                    tf.logging.info("%d lines have been tokenized!" %
                                    (index + 1))
                index += 1
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()

                # Empty lines are used as document delimiters.
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents.
    all_documents = [x for x in all_documents if x]
    tf.logging.info("tokenization finished: %d documents in total" %
                    len(all_documents))
    rng.shuffle(all_documents)

    # Cache the tokenized documents so later runs can skip tokenization.
    # NOTE: the cache path is hardcoded.
    tokenized_data_path = "/mnt/raid5/taesun/data/ResSel/advising/tokenized_train_data_320_eot.pkl"
    with open(tokenized_data_path, "wb") as document_f_handle:
        pickle.dump(all_documents, document_f_handle)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    # Duplicate the corpus `dupe_factor` times so each document yields
    # instances with different masks and segment splits.
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, max_seq_length,
                    short_seq_prob, masked_lm_prob, max_predictions_per_seq,
                    vocab_words, rng))

    rng.shuffle(instances)
    return instances

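# A minimal usage sketch for create_training_instances, assuming BERT's
# FullTokenizer; "vocab.txt" and "corpus.txt" are hypothetical paths, and
# the hyperparameter values are illustrative only.
#
# import random
# tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt",
#                                        do_lower_case=True)
# rng = random.Random(12345)
# instances = create_training_instances(
#     input_files=["corpus.txt"], tokenizer=tokenizer, max_seq_length=320,
#     dupe_factor=10, short_seq_prob=0.1, masked_lm_prob=0.15,
#     max_predictions_per_seq=20, rng=rng)
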
def _create_seperate_examples(self, inputs, set_type):
    dialog_examples = []
    response_examples = []
    for (i, dialog_data) in enumerate(inputs):
        guid = "%s-%d" % (set_type, i + 1)
        dialog = tokenization.convert_to_unicode(dialog_data[0])
        response = tokenization.convert_to_unicode(dialog_data[1])
        label = tokenization.convert_to_unicode(dialog_data[2])
        dialog_examples.append(
            InputExample(guid=guid, text_a=dialog, label=label))
        response_examples.append(
            InputExample(guid=guid, text_a=response, label=label))
    print("%s data creation is finished! %d" %
          (set_type, len(dialog_examples)))
    return dialog_examples, response_examples

def _create_examples(self, inputs, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, dialog_data) in enumerate(inputs):
        guid = "%s-%d" % (set_type, i + 1)
        text_a = tokenization.convert_to_unicode(dialog_data[0])
        text_b = tokenization.convert_to_unicode(dialog_data[1])
        label = tokenization.convert_to_unicode(str(dialog_data[2]))
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b,
                         label=label))
    print("%s data creation is finished! %d" % (set_type, len(examples)))
    return examples

def _create_knowledge_examples(self, inputs, set_type):
    examples = []
    for (i, dialog_data) in enumerate(inputs):
        guid = "%s-%d" % (set_type, i + 1)
        text_a = tokenization.convert_to_unicode(dialog_data[3])
        examples.append(InputExample(guid=guid, text_a=text_a))
    print("%s data creation is finished! %d" % (set_type, len(examples)))
    return examples

def _create_knowledge_separate_examples(self, inputs, set_type):
    examples = []
    for (i, dialog_data) in enumerate(inputs):
        # One InputExample per (knowledge, response) pair, grouped by dialog.
        separate_examples = []
        text_b = tokenization.convert_to_unicode(dialog_data[1])
        for j, (knowledge, knowledge_label) in enumerate(
                zip(dialog_data[3], dialog_data[4])):
            guid = "%s-%d-%d" % (set_type, i + 1, j + 1)
            text_a = tokenization.convert_to_unicode(knowledge)
            label = tokenization.convert_to_unicode(str(knowledge_label))
            separate_examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b,
                             label=label))
        examples.append(separate_examples)
    print("%s data creation is finished! %d" % (set_type, len(examples)))
    return examples

def _create_similar_examples(self, inputs, set_type):
    examples = []
    for (i, dialog_data) in enumerate(inputs):
        sub_examples = []
        for sub_i, each_similar_dialog in enumerate(dialog_data[3]):
            guid = "%s-%d-%d" % (set_type, i + 1, sub_i + 1)
            text_a = tokenization.convert_to_unicode(each_similar_dialog)
            sub_examples.append(InputExample(guid=guid, text_a=text_a))
        examples.append(sub_examples)
    print("%s data creation is finished! %d" % (set_type, len(examples)))
    return examples

def _create_knowledge_examples(ubuntu_manual_dict):
    examples = []
    knowledge_name_list = []
    for i, knowledge_name in enumerate(ubuntu_manual_dict.keys()):
        guid = "knowledge-%d" % (i + 1)
        text_a = tokenization.convert_to_unicode(
            knowledge_name + " : " + ubuntu_manual_dict[knowledge_name])
        knowledge_name_list.append(knowledge_name)
        examples.append(InputExample(guid=guid, text_a=text_a))
    print("knowledge description data creation is finished! %d" %
          len(examples))
    return examples, knowledge_name_list

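# A minimal usage sketch for _create_knowledge_examples; the man-page
# dictionary below is a made-up stand-in for the real Ubuntu manual data.
#
# ubuntu_manual_dict = {
#     "zsh": "a shell designed for interactive use",
#     "xrandr": "a command-line interface to the RandR extension",
# }
# examples, names = _create_knowledge_examples(ubuntu_manual_dict)
# # Each example's text_a reads "name : description", e.g.
# # "zsh : a shell designed for interactive use".
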
def _create_train_data_examples(ubuntu_data):
    examples = []
    count = 0
    print("total length of ubuntu dialog data: %d" % len(ubuntu_data))
    for i, dialog_data in enumerate(ubuntu_data):
        # Keep only even-indexed entries.
        if i % 2 != 0:
            continue
        count += 1
        guid = "train_dialog-%d" % count
        text_a = tokenization.convert_to_unicode(dialog_data[0])
        examples.append(InputExample(guid=guid, text_a=text_a))
    print("train dialog data creation is finished! %d" % len(examples))
    return examples

def quantity_analysis_run_evaluate(self):
    examples = [
        "have you tried the regular way to start with do you run ubuntu with gnome",
        "probably ca n't do that on the xbox can you get a command prompt on it at all try rhythmbox",
        " afroken check out xrandr as the control mechanism for video ..",
        "certainly sudo apt-get install zsh zsh-doc zsh-lovers"
    ]
    self._logger.info("Evaluation batch iteration per epoch is %d" %
                      len(examples))
    if self.train_setup_vars["do_evaluate"]:
        print("Evaluation batch iteration per epoch is %d" % len(examples))

    dialog_examples = []
    for (i, dialog_data) in enumerate(examples):
        text_a = tokenization.convert_to_unicode(dialog_data)
        dialog_examples.append(InputExample(guid=i, text_a=text_a))

    [input_ids, input_mask, segment_ids, (text_a_lengths, text_b_lengths),
     label_ids, position_ids] = \
        self.processor.get_analysis_bert_data(dialog_examples)

    # Single forward pass; dropout is disabled (keep probability 1.0) at
    # evaluation time.
    feed_dict = {
        self.input_ids_ph: input_ids,
        self.input_mask_ph: input_mask,
        self.segment_ids_ph: segment_ids,
        self.dropout_keep_prob_ph: 1.0,
        self.dialog_position_ids_ph: position_ids,
        self.dialog_len_ph: text_a_lengths,
        self.response_len_ph: text_b_lengths,
        self.label_ids_ph: label_ids,
    }

    sequence_out_val = self.sess.run([self.seq_outputs],
                                     feed_dict=feed_dict)  # batch, 768 * 4
    print(np.array(sequence_out_val).shape)
    seq_merged_embeddings = merge_subtokens(examples, self.tokenizer,
                                            sequence_out_val[0])