def _create_example(self, lines, set_type):
  """Creates `InputExample`s from rows of (label, text)."""
  examples = []
  for (i, line) in enumerate(lines):
    guid = "%s-%s" % (set_type, i)
    text = tokenization.convert_to_unicode(line[1])
    label = tokenization.convert_to_unicode(line[0])
    if i == 0:
      print(label)
    examples.append(InputExample(guid=guid, text=text, label=label))
  return examples
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "%s-%s" % (set_type,
                      tokenization.convert_to_unicode(line[0]))
    text_a = tokenization.convert_to_unicode(line[8])
    text_b = tokenization.convert_to_unicode(line[9])
    label = tokenization.convert_to_unicode(line[-1])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    # Skip the header row.
    if i == 0:
      continue
    guid = "%s-%s" % (set_type, i)
    if set_type == "test":
      text_a = tokenization.convert_to_unicode(line[0])
      label = "0"
    else:
      text_a = tokenization.convert_to_unicode(line[0])
      label = tokenization.convert_to_unicode(line[1])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
  return examples
def get_dev_examples(self, data_dir):
  """See base class."""
  lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "dev-%d" % (i)
    language = tokenization.convert_to_unicode(line[0])
    if language != tokenization.convert_to_unicode(self.language):
      continue
    text_a = tokenization.convert_to_unicode(line[6])
    text_b = tokenization.convert_to_unicode(line[7])
    label = tokenization.convert_to_unicode(line[1])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    if len(line) != 4:
      print(line)
    guid = "%s-%s-%s" % (
        set_type, tokenization.convert_to_unicode(line[0]), str(i))
    target = tokenization.convert_to_unicode(line[1])
    text = tokenization.convert_to_unicode(line[2])
    label = tokenization.convert_to_unicode(line[3])
    examples.append(
        InputExample(guid=guid, text_a=target, text_b=text, label=label))
  return examples
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    # Skip header.
    if i == 0:
      continue
    guid = int(line[0])
    group = line[1]
    text_left = tokenization.convert_to_unicode(line[2])
    text_right = tokenization.convert_to_unicode(line[3])
    label = tokenization.convert_to_unicode(line[4])
    negs = [
        tokenization.convert_to_unicode(neg) for neg in line[5:] if neg.strip()
    ]
    examples.append(
        InputExample(guid=guid, group=group, text_left=text_left,
                     text_right=text_right, label=label, negs=negs))
  return examples
def create_examples(data, label_list, set_type, labels_available=True):
  """Creates examples for the training, dev and test sets.

  `data` is a list of rows.
  """
  examples = []
  for (i, line) in enumerate(data):
    guid = "%s" % (i)
    text_a = tokenization.convert_to_unicode(line[0].strip("\n"))
    if labels_available:
      labels = tokenization.convert_to_unicode(line[0])
      labels = labels.split(", ")  # Format: a list such as ['a'] or ['a', 'b'].
      labels = Multi_hot_label(labels, label_list)
    else:
      labels = [0] * len(label_list)
    examples.append(InputExample(guid=guid, text_a=text_a, labels=labels))
  return examples
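# `Multi_hot_label` is not defined in this snippet. A minimal sketch of a
# multi-hot encoder with the same call signature might look like the
# hypothetical helper below; it only illustrates the expected output shape
# (a 0/1 vector aligned with `label_list`), not the original implementation.
def multi_hot_label(labels, label_list):
  """Encodes a list of label strings as a multi-hot vector over label_list."""
  label_to_index = {label: idx for idx, label in enumerate(label_list)}
  encoded = [0] * len(label_list)
  for label in labels:
    encoded[label_to_index[label]] = 1
  return encoded

# Example: multi_hot_label(['a', 'c'], ['a', 'b', 'c']) returns [1, 0, 1].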
def get_dev_examples(self, data_dir):
  """See base class."""
  lines = self._read_tsv(os.path.join(curr_path, data_dir))
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "dev-%d" % (i)
    text_a = tokenization.convert_to_unicode(line[2])
    text_b = tokenization.convert_to_unicode(line[3])
    label = tokenization.convert_to_unicode(line[1])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  print("length of lines:", len(lines))
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "%s-%s" % (set_type, i)
    try:
      label = tokenization.convert_to_unicode(line[2])
      text_a = tokenization.convert_to_unicode(line[0])
      text_b = tokenization.convert_to_unicode(line[1])
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    except Exception:  # pylint: disable=broad-except
      print("###error.i:", i, line)
  return examples
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  sentence1_index = 0
  sentence2_index = 0
  label0_index = None
  label1_index = None
  label2_index = None
  for (i, line) in enumerate(lines):
    if i == 0:
      # Identify the column index of each field from the header row.
      for j, token in enumerate(line):
        if token.strip() == "sentence1":
          sentence1_index = j
        elif token.strip() == "sentence2":
          sentence2_index = j
        elif token.strip() == "label0_prob":
          label0_index = j
        elif token.strip() == "label1_prob":
          label1_index = j
        elif token.strip() == "label2_prob":
          label2_index = j
      continue
    guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0]))
    text_a = tokenization.convert_to_unicode(line[sentence1_index])
    text_b = tokenization.convert_to_unicode(line[sentence2_index])
    if (label0_index is not None and label1_index is not None
        and label2_index is not None):
      # The three indices correspond to probabilities of contradiction,
      # entailment and neutral.
      label = [
          float(line[label0_index]),
          float(line[label1_index]),
          float(line[label2_index])
      ]
    else:
      label = [1.0, 0, 0]
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples
def get_train_examples(self, data_dir):
  """Reads training examples from the tab-separated train.txt file."""
  file_path = os.path.join(data_dir, 'train.txt')
  train_df = pd.read_csv(file_path, encoding='utf-8', sep='\t', header=None)
  train_data = []
  for index, train in enumerate(train_df.values):
    guid = 'train-%d' % index
    text_a = tokenization.convert_to_unicode(str(train[0]))
    text_b = tokenization.convert_to_unicode(str(train[1]))
    label = str(train[2])
    train_data.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return train_data
def load_test_data(self, path):
  x1, x2, y = [], [], []
  for data in self.read_csv(path):
    x1.append(convert_to_unicode(data[1]))
    x2.append(None)
    # If the test data has no ground-truth labels, this is just a placeholder
    # and can hold any value.
    y.append(None)
  return x1, x2, y
def _create_examples(self, lines):
  """Creates the example data set."""
  examples = []
  for index, line in enumerate(lines):
    guid = 'train-%d' % index
    split_line = line.strip().split(',')
    if len(split_line) == 3 and (split_line[0] == '1' or split_line[0] == '0'):
      text_a = tokenization.convert_to_unicode(split_line[1])
      text_b = tokenization.convert_to_unicode(split_line[2])
      label = split_line[0]
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples
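# Based on the split above, each input line is presumably a comma-separated
# row of the form "label,text_a,text_b" with a binary label, e.g. the
# hypothetical line:
#
#   1,how old are you,what is your age
#
# Rows that do not have exactly three fields, or whose label is not '0'/'1',
# are silently dropped.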
def _create_example(self, lines, labels, set_type):
  examples = []
  for idx, (line, label) in enumerate(zip(lines, labels)):
    guid = "{}-{}".format(set_type, idx)
    text_a = tokenization.convert_to_unicode(line)
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
  return examples
def convert_to_examples(texts):
  examples = []
  for _id, text in enumerate(texts):
    line = tokenization.convert_to_unicode(text)
    if not line:
      break
    line = line.strip()
    examples.append(InputExample(unique_id=_id, text_a=line, text_b=None))
  return examples
def get_example(self, index):
  data = self.get_data(index)  # left, pron, right, candidates, selected
  src_left = self.tokenizer.tokenize(convert_to_unicode(data.left))
  src_right = self.tokenizer.tokenize(convert_to_unicode(data.right))
  pronoun_tokens = self.tokenizer.tokenize(convert_to_unicode(data.pron))
  candidates = [
      self.tokenizer.tokenize(convert_to_unicode(c))
      for c in data.candidates.split(',')
  ]
  selected_idx = int(data.selected)
  assert len(candidates) <= self.max_candidates, data
  candidates.extend([None] * (self.max_candidates - len(candidates)))
  selected = [0] * len(candidates)
  selected[selected_idx] = 1
  return self._make_inputs(src_left, src_right, pronoun_tokens, candidates,
                           selected)
def get_dev_examples(self, data_dir):
  """Reads dev examples from the tab-separated dev.txt file."""
  file_path = os.path.join(data_dir, 'dev.txt')
  dev_df = pd.read_csv(file_path, encoding='utf-8', sep='\t', header=None)
  dev_data = []
  for index, dev in enumerate(dev_df.values):
    guid = 'dev-%d' % index
    text_a = tokenization.convert_to_unicode(str(dev[1]))
    text_b = tokenization.convert_to_unicode(str(dev[2]))
    label = str(dev[3])
    dev_data.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return dev_data
def get_train_examples(self, data_dir):
  """See base class."""
  lines = self._read_tsv(
      os.path.join(data_dir, "multinli",
                   "multinli.train.%s.tsv" % self.language))
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "train-%d" % (i)
    text_a = tokenization.convert_to_unicode(line[0])
    text_b = tokenization.convert_to_unicode(line[1])
    label = tokenization.convert_to_unicode(line[2])
    if label == tokenization.convert_to_unicode("contradictory"):
      label = tokenization.convert_to_unicode("contradiction")
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples
def bert_tokenize(vocab_fname, corpus_fname, output_fname):
  tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
  with open(corpus_fname, 'r', encoding='utf-8') as f1, \
      open(output_fname, 'w', encoding='utf-8') as f2:
    for line in f1:
      sentence = line.replace('\n', '').strip()
      tokens = tokenizer.tokenize(convert_to_unicode(sentence))
      tokenized_sent = ' '.join(tokens)
      f2.write(tokenized_sent + '\n')
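# A minimal usage sketch for the function above, assuming `FullTokenizer` and
# `convert_to_unicode` come from the BERT `tokenization` module. The file
# names below are hypothetical placeholders, not part of the snippet.
bert_tokenize(vocab_fname='vocab.txt',
              corpus_fname='corpus.txt',
              output_fname='corpus.tokenized.txt')
# Each line of corpus.txt is written to the output file as a single line of
# whitespace-joined WordPiece tokens.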
def get_train_examples(self, data_dir):
  file_path = os.path.join(data_dir, 'train.txt')
  train_data = []
  with open(file_path, 'r', encoding='utf-8') as f:
    for index, line in enumerate(f.readlines()):
      guid = 'train-%d' % index  # guid uniquely identifies each example.
      line = line.replace("\n", "").split("\t")
      text_a = tokenization.convert_to_unicode(line[1])  # Sentence A.
      text_b = tokenization.convert_to_unicode(line[2])  # Sentence B.
      label = line[0]  # Class label for the sentence pair.
      train_data.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return train_data
def get_test_examples(self, data_dir):
  file_path = os.path.join(data_dir, 'test.txt')
  test_data = []
  with open(file_path, 'r', encoding='utf-8') as f:
    for index, line in enumerate(f.readlines()):
      guid = 'test-%d' % index
      line = line.replace("\n", "").split("\t")
      text_a = tokenization.convert_to_unicode(line[1])
      text_b = tokenization.convert_to_unicode(line[2])
      label = line[0]
      test_data.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return test_data
def get_dev_examples(self, data_dir):
  """Loads the dev examples.

  :param data_dir: path to the data directory
  :return: list of InputExample
  """
  # 1. Load the dev data.
  lines = self._read_txt(data_dir, file_name=FLAGS.dev_file_names)
  # 2. Iterate over the lines and convert each one into an example.
  examples = []
  for idx, line in enumerate(lines):
    guid = "dev-%d" % idx
    text_a = tokenization.convert_to_unicode(line[0])
    label = tokenization.convert_to_unicode(line[1])
    examples.append(InputExample(guid=guid, text_a=text_a, label=label))
  return examples
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = line[1]
    text_a = tokenization.convert_to_unicode(line[2])
    text_b = tokenization.convert_to_unicode(line[3])
    if set_type == "test":
      label = self.get_labels()[-1]
    else:
      label = tokenization.convert_to_unicode(line[0])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples
def convert_line(line, label_list, max_seq_length, tokenizer):
  """Converts a single input line into BERT input features for prediction."""
  label = tokenization.convert_to_unicode("0")  # Mock label.
  text_a = tokenization.convert_to_unicode(line)
  example = rc.InputExample(guid=0, text_a=text_a, text_b=None, label=label)
  feature = rc.convert_single_example(0, example, label_list, max_seq_length,
                                      tokenizer)
  # All three arrays get a leading batch dimension of 1.
  input_ids = np.reshape([feature.input_ids], (1, max_seq_length))
  input_mask = np.reshape([feature.input_mask], (1, max_seq_length))
  segment_ids = np.reshape([feature.segment_ids], (1, max_seq_length))
  label_ids = [feature.label_id]
  return input_ids, input_mask, segment_ids, label_ids
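# A minimal usage sketch, assuming a loaded WordPiece `tokenizer` and some
# prediction callable; `predict_fn` and the label list below are hypothetical
# and not part of the snippet above.
#
#   input_ids, input_mask, segment_ids, label_ids = convert_line(
#       "this movie was great", label_list=["0", "1"],
#       max_seq_length=128, tokenizer=tokenizer)
#   probabilities = predict_fn(input_ids, input_mask, segment_ids)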
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    # Skip header.
    if i == 0:
      continue
    guid = line[0]
    text_a = tokenization.convert_to_unicode(line[1])
    if set_type == "test":
      label = self.get_labels()[-1]
    else:
      try:
        label = tokenization.convert_to_unicode(line[2])
      except IndexError:
        logging.exception(line)
        exit(1)
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
  return examples
def get_train_examples(self, data_dir):
  file_path = os.path.join(data_dir, 'train.csv')
  with open(file_path, 'r', encoding="utf-8") as f:
    reader = f.readlines()
  examples = []
  for index, line in enumerate(reader):
    guid = 'train-%d' % index
    split_line = line.strip().split("\t")
    # print(split_line)
    text_a = tokenization.convert_to_unicode(split_line[1])
    text_b = tokenization.convert_to_unicode(split_line[2])
    label = split_line[3]
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    guid = '%s-%s' % (set_type, i)
    # The input text is the title followed by the content.
    try:
      text_a = tokenization.convert_to_unicode(line[1] + line[2])
      if set_type == "test":
        label = u"美食"
      else:
        label = tokenization.convert_to_unicode(line[0])
      if label not in self.labels:
        continue
    except Exception:  # Skip malformed lines.
      continue
    examples.append(InputExample(guid=guid, text_a=text_a, label=label))
  return examples
def preproc_doc(document):
  """Converts a document to a list of TF Examples for binary order classification.

  Args:
    document: a Wikipedia article as a list of lines.

  Returns:
    A list of tf.Examples encoding binary orderings of pairs of sentences in
    the document. The examples are serialized to string so they can be written
    directly to a TFRecord.
  """
  # Each document is a list of lines.
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  # Set a random seed for reproducibility.
  hash_object = hashlib.md5(document[0])
  rng = random.Random(int(hash_object.hexdigest(), 16) % (10**8))

  # Each document is composed of a list of text lines. Each text line is a
  # paragraph. We split the line into sentences but keep the paragraph
  # grouping. The utility functions below expect the document to be split
  # by paragraphs.
  list_of_paragraphs = []
  for line in document:
    line = tokenization.convert_to_unicode(line)
    line = line.replace(u"\u2018", "'").replace(u"\u2019", "'")
    sents = split_line_by_sentences(line)
    sent_tokens = [tokenizer.tokenize(sent) for sent in sents if sent]
    list_of_paragraphs.append(sent_tokens)

  # Remove any empty paragraphs.
  list_of_paragraphs = [x for x in list_of_paragraphs if x]

  # Convert the list of paragraphs into TrainingInstance objects.
  # See preprocessing_utils.py for the definition.
  if FLAGS.format == FORMAT_BINARY:
    instances = create_instances_from_document(list_of_paragraphs,
                                               FLAGS.max_seq_length, rng)
  elif FLAGS.format == FORMAT_PARAGRAPH:
    instances = create_paragraph_order_from_document(list_of_paragraphs,
                                                     FLAGS.max_seq_length, rng)

  # Convert token lists into ids, adding any needed tokens and padding
  # for BERT.
  tf_examples = [
      convert_instance_to_tf_example(tokenizer, instance,
                                     FLAGS.max_seq_length)[0]
      for instance in instances
  ]

  # Serialize each tf.Example for writing to file.
  tf_examples = [example.SerializeToString() for example in tf_examples]

  return tf_examples
def get_train_examples(self, data_dir):
  """See base class."""
  lines = self._read_tsv(os.path.join(data_dir, "sct_v2.train.tsv"))
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "train-%s" % (line[0])
    label = int(tokenization.convert_to_unicode(line[1]))
    text_a = tokenization.convert_to_unicode(line[2])
    text_b_pos = tokenization.convert_to_unicode(line[3])
    text_b_neg = tokenization.convert_to_unicode(line[4])
    vs_sent1 = self._string_to_array(line[5][1:-1])
    vs_sent2 = self._string_to_array(line[6][1:-1])
    vs_sent3 = self._string_to_array(line[7][1:-1])
    vs_sent4 = self._string_to_array(line[8][1:-1])
    vs_sent5_pos = self._string_to_array(line[9][1:-1])
    vs_sent5_neg = self._string_to_array(line[10][1:-1])
    cs_dist_pos = self._string_to_array(line[11][1:-1])
    cs_dist_neg = self._string_to_array(line[12][1:-1])
    examples.append(
        InputExample(guid=guid, label=label, text_a=text_a,
                     text_b_pos=text_b_pos, text_b_neg=text_b_neg,
                     vs_sent1=vs_sent1, vs_sent2=vs_sent2, vs_sent3=vs_sent3,
                     vs_sent4=vs_sent4, vs_sent5_pos=vs_sent5_pos,
                     vs_sent5_neg=vs_sent5_neg, cs_dist_pos=cs_dist_pos,
                     cs_dist_neg=cs_dist_neg))
  return examples
def read_examples(input_file):
  """Reads a list of stripped input lines from an input file."""
  examples = []
  with tf.gfile.GFile(input_file, "r") as reader:
    while True:
      line = tokenization.convert_to_unicode(reader.readline())
      if not line:
        break
      line = line.strip()
      examples.append(line)
  return examples
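# A minimal usage sketch; the path below is a hypothetical placeholder and the
# function requires TensorFlow 1.x (`tf.gfile`) plus the BERT `tokenization`
# module.
lines = read_examples("input.txt")
print("Read %d lines" % len(lines))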
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "%s-%s" % (set_type, i)
    split_line = line.strip().split("\t")
    text_a = tokenization.convert_to_unicode(split_line[1])
    text_b = None
    if set_type == "test":
      label = "体育"
    else:
      label = tokenization.convert_to_unicode(split_line[0])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples