def _create_examples(self, lines, set_type):
    """Creates examples for the training, dev, and test sets."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        if set_type == "test":
            # The test set carries no gold label; use a placeholder.
            text_a = tokenization.convert_to_unicode(line[1])
            label = "0"
        else:
            text_a = tokenization.convert_to_unicode(line[1])
            label = tokenization.convert_to_unicode(line[0])
        examples.append(
            InputExample(guid=guid, text_a=text_a, label=label))
    return examples
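# For reference, a minimal sketch of the InputExample container that the
# processors in this file construct. It mirrors the class of the same name
# in BERT's run_classifier.py; treat the exact fields as an assumption if
# your copy of the codebase differs.
class InputExample(object):
    """A single training/test example for sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        # guid: unique id for the example, e.g. "train-12".
        # text_a: untokenized text of the first sequence.
        # text_b: untokenized text of the second sequence (pair tasks only).
        # label: string label; may be a placeholder for test examples.
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label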
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    # Builds the list of InputExamples from the raw TSV rows.
    examples = []
    print("length of lines:", len(lines))
    for (i, line) in enumerate(lines):
        if i == 0:
            continue  # Skip the header row.
        guid = "%s-%s" % (set_type, i)
        try:
            label = tokenization.convert_to_unicode(line[2])
            text_a = tokenization.convert_to_unicode(line[0])
            text_b = tokenization.convert_to_unicode(line[1])
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        except Exception:
            # Report malformed rows instead of aborting the whole run.
            print('###error.i:', i, line)
    return examples
def get_dev_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue  # Skip the header row.
        guid = "dev-%d" % (i)
        language = tokenization.convert_to_unicode(line[0])
        if language != tokenization.convert_to_unicode(self.language):
            continue
        text_a = tokenization.convert_to_unicode(line[6])
        text_b = tokenization.convert_to_unicode(line[7])
        label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def get_train_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(
        os.path.join(data_dir, "multinli",
                     "multinli.train.%s.tsv" % self.language))
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue  # Skip the header row.
        guid = "train-%d" % (i)
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[2])
        # Some MultiNLI translations spell the label "contradictory";
        # normalize it to the XNLI label "contradiction".
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
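# The get_*_examples methods above rely on a _read_tsv helper that is not
# shown in this file. A minimal sketch, modeled on the classmethod in
# BERT's run_classifier.py (the original reads via tf.gfile.Open; plain
# open() is used here as an assumption to keep the sketch self-contained):
import csv

class DataProcessor(object):
    """Base class sketch; only the TSV reader is shown."""

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab-separated value file into a list of rows."""
        with open(input_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            return [line for line in reader]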
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        # Only the test set has a header row, so skip it there.
        if set_type == "test" and i == 0:
            continue
        guid = "%s-%s" % (set_type, i)
        if set_type == "test":
            text_a = tokenization.convert_to_unicode(line[1])
            label = "__label__unknown"
        else:
            text_a = tokenization.convert_to_unicode(line[1])
            label = tokenization.convert_to_unicode(line[0])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def process(self, text):
    # Preprocess: normalize to unicode and tokenize.
    text = tokenization.convert_to_unicode(text)
    text = self.tokenizer.tokenize(text)
    # Reserve two positions for [CLS] and [SEP].
    if len(text) > self.max_seq_length - 2:
        text = text[0:self.max_seq_length - 2]
    # Convert the tokens to input ids.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in text:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the maximum sequence length.
    while len(input_ids) < self.max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    assert len(input_ids) == self.max_seq_length
    assert len(input_mask) == self.max_seq_length
    assert len(segment_ids) == self.max_seq_length
    label_id = 0.0  # Dummy label, only needed to satisfy the model's inputs.
    # Feed the same sentence into both towers of the two-tower model;
    # only the "a" side's encoding is used as the output embedding.
    output = self.model({
        'a_input_ids': [input_ids],
        'a_input_mask': [input_mask],
        'a_segment_ids': [segment_ids],
        'b_input_ids': [input_ids],
        'b_input_mask': [input_mask],
        'b_segment_ids': [segment_ids],
        'label_ids': [label_id]
    })['a_output_layer']
    return output
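# Usage sketch for process(): embed two sentences and compare them by
# cosine similarity. The SentenceEncoder name and its constructor
# arguments are hypothetical stand-ins for whichever class owns process().
import numpy as np

def cosine_similarity(a, b):
    """Cosine similarity between two embedding vectors."""
    a = np.asarray(a).reshape(-1)
    b = np.asarray(b).reshape(-1)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# encoder = SentenceEncoder(model_dir="...", max_seq_length=128)  # hypothetical
# vec_a = encoder.process("how do I reset my password")
# vec_b = encoder.process("password reset instructions")
# print(cosine_similarity(vec_a, vec_b))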