def get_sentence_examples(self, questions):
    examples = []
    for index, data in enumerate(questions):
        guid = 'test-%d' % index
        text_a = tokenization.convert_to_unicode(str(data[0]))
        text_b = tokenization.convert_to_unicode(str(data[1]))
        label = str(0)
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
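# All of the snippets below build `InputExample` objects. For reference, a
# minimal sketch of that container, matching the class defined in Google's
# BERT run_classifier.py, so the snippets read self-contained:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid      # unique id for the example
        self.text_a = text_a  # untokenized text of the first sequence
        self.text_b = text_b  # optional second sequence, for pair tasks
        self.label = label    # label string; required for train/dev examples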
def make_examples_fn(samples, set_type):
    """Builds single-sentence InputExamples; `samples` is a list of (text, label) tuples."""
    examples = []
    for (i, sample) in enumerate(samples):
        guid = "%s-%s" % (set_type, i)
        text = tokenization.convert_to_unicode(sample[0])
        if set_type == "test":
            label = "0"
        else:
            label = tokenization.convert_to_unicode(sample[1])
        examples.append(InputExample(guid=guid, text_a=text, label=label))
    return examples
def create_examples(lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        # Only the test set has a header
        if set_type == "test" and i == 0:
            continue
        guid = "%s-%s" % (set_type, i)
        if set_type == "test":
            text_a = tokenization.convert_to_unicode(line[1])
            label = "0"
        else:
            text_a = tokenization.convert_to_unicode(line[3])
            label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def _create_examples(self, lines, set_type, set_id):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s-%s" % (set_type, set_id, i)
        text_a = tokenization.convert_to_unicode(line[0])
        if set_type == "test":
            label = "-1"
        else:
            label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def create_examples(self, lines, set_type, file_base=True):
    """Creates examples for the training and dev sets.

    Each line is label + \t + text_a (any further \t + text_b column is ignored here).
    """
    examples = []
    for (i, line) in tqdm(enumerate(lines)):
        if file_base:
            if i == 0:
                continue  # skip the header row when reading from a file
        guid = "%s-%s" % (set_type, i)
        text = tokenization.convert_to_unicode(line[1])
        if set_type == "test" or set_type == "pred":
            label = "0"
        else:
            label = tokenization.convert_to_unicode(line[0])
        examples.append(InputExample(guid=guid, text_a=text, label=label))
    return examples
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue  # skip the header row
        guid = "%s-%s" % (set_type, i)
        text_a = tokenization.convert_to_unicode(line[3])
        text_b = tokenization.convert_to_unicode(line[4])
        if set_type == "test":
            label = "0"
        else:
            label = tokenization.convert_to_unicode(line[0])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
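# Hypothetical usage of the MRPC-style _create_examples above. `processor` and
# the column layout ([Quality, #1 ID, #2 ID, #1 String, #2 String]) are
# assumptions matching the GLUE MRPC TSV format, not part of the original code:
sample_lines = [
    ["Quality", "#1 ID", "#2 ID", "#1 String", "#2 String"],  # header row
    ["1", "702876", "702977",
     "Amrozi accused his brother of distorting his evidence.",
     "Referring to him as only 'the witness', Amrozi accused his brother."],
]
train_examples = processor._create_examples(sample_lines, "train")
for ex in train_examples:
    print(ex.guid, ex.label, ex.text_a[:40], '|', ex.text_b[:40])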
def class_predict_service():
    global graph
    with graph.as_default():
        result = {}
        result['code'] = 0
        try:
            sentence = request.args['text']
            result['text'] = sentence
            start = datetime.now()
            sentence = tokenizer.tokenize(sentence)
            sentence = " ".join(sentence)
            print('your input is: {}'.format(sentence))
            example = InputExample(guid=None, text_a=sentence, text_b=None)
            feature = convert_single_example(0, example, labels,
                                             config.max_seq_length, tokenizer)
            input_ids, input_mask, segment_ids, label_ids = convert(feature)
            print(input_ids)
            feed_dict = {input_ids_p: input_ids, input_mask_p: input_mask}
            # Run the session on the current feed_dict to get class probabilities.
            pred_probabilities_result = sess.run([probabilities], feed_dict)[0]
            # Threshold each class probability at 0.5 for multi-label output.
            label_ids = np.where(pred_probabilities_result > 0.5, 1, 0)
            pred_label_result = mlb.inverse_transform(label_ids)[0]
            print(label_ids)
            # TODO: combination strategy for merging the two label outputs
            result['data'] = pred_label_result
            result["data2"] = convert_id2label(labels, pred_probabilities_result)
            print('time used: {} sec'.format(
                (datetime.now() - start).total_seconds()))
            return json.dumps(result, ensure_ascii=False)
        except Exception:
            traceback.print_exc()
            result['code'] = -1
            result['data'] = 'error'
            return json.dumps(result, ensure_ascii=False)
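# The service above calls a `convert` helper that is not shown. A plausible
# minimal implementation (an assumption, not the original code) that turns one
# InputFeatures object into batch-of-one numpy arrays for feeding the session:
import numpy as np

def convert(feature):
    input_ids = np.array(feature.input_ids).reshape(1, -1)
    input_mask = np.array(feature.input_mask).reshape(1, -1)
    segment_ids = np.array(feature.segment_ids).reshape(1, -1)
    label_ids = np.array(feature.label_ids).reshape(1, -1)
    return input_ids, input_mask, segment_ids, label_ids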
def get_dev_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue  # skip the header row
        guid = "dev-%d" % (i)
        language = tokenization.convert_to_unicode(line[0])
        if language != tokenization.convert_to_unicode(self.language):
            continue
        text_a = tokenization.convert_to_unicode(line[6])
        text_b = tokenization.convert_to_unicode(line[7])
        label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def get_train_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(
        os.path.join(data_dir, "multinli",
                     "multinli.train.%s.tsv" % self.language))
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue  # skip the header row
        guid = "train-%d" % (i)
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[2])
        # MultiNLI uses "contradictory" in some files; normalize the label.
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
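# Both XNLI readers above rely on `_read_tsv`; in Google's BERT repo it is a
# classmethod on DataProcessor, roughly equivalent to this TF 1.x sketch:
import csv
import tensorflow as tf

def _read_tsv(input_file, quotechar=None):
    """Reads a tab-separated-values file."""
    with tf.gfile.Open(input_file, "r") as f:
        reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
        return list(reader)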
# Assumes the BERT repo's `tokenization` module and a `convert_single_example`
# helper are importable alongside this script.
import numpy as np
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants

def main():
    max_seq_len = 64
    label_list = ['0', '1']
    # Two Chinese customer-service sentences used as a sample sentence pair.
    sentences = ("您好.麻烦您截图全屏辛苦您了.", "麻烦您截图大一点辛苦您了.最好可以全屏.")
    guid = 'test-%d' % 1
    text_a = tokenization.convert_to_unicode(str(sentences[0]))
    text_b = tokenization.convert_to_unicode(str(sentences[1]))
    label = str(0)
    predict_examples = InputExample(guid=guid, text_a=text_a, text_b=text_b,
                                    label=label)
    tokenizer = tokenization.FullTokenizer(
        vocab_file='./albert_config/vocab.txt', do_lower_case=True)
    features = convert_single_example(predict_examples, label_list, max_seq_len,
                                      tokenizer)
    export_dir = './export/1576720765'
    graph = tf.Graph()
    with graph.as_default():
        with tf.Session() as sess:
            tf.saved_model.loader.load(sess, [tag_constants.SERVING], export_dir)
            tensor_input_ids = graph.get_tensor_by_name('input_ids_1:0')
            tensor_input_mask = graph.get_tensor_by_name('input_mask_1:0')
            tensor_label_ids = graph.get_tensor_by_name('label_ids_1:0')
            tensor_segment_ids = graph.get_tensor_by_name('segment_ids_1:0')
            tensor_outputs = graph.get_tensor_by_name('loss/Softmax:0')
            result = sess.run(
                tensor_outputs,
                feed_dict={
                    tensor_input_ids:
                        np.array(features.input_ids).reshape(-1, max_seq_len),
                    tensor_input_mask:
                        np.array(features.input_mask).reshape(-1, max_seq_len),
                    tensor_label_ids: np.array([features.label_id]),
                    tensor_segment_ids:
                        np.array(features.segment_ids).reshape(-1, max_seq_len),
                })
            print(*(result[0]), sep='\t')
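# The tensor names hard-coded in main() ('input_ids_1:0', 'loss/Softmax:0',
# ...) are specific to this particular export. If they differ for another
# SavedModel, its signatures can be listed from Python like this (a sketch
# using standard TF 1.x APIs, not part of the original script):
def show_signatures(export_dir):
    with tf.Session(graph=tf.Graph()) as sess:
        meta = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], export_dir)
        for name, sig in meta.signature_def.items():
            print(name)
            for key, tensor in sig.inputs.items():
                print('  input  %s -> %s' % (key, tensor.name))
            for key, tensor in sig.outputs.items():
                print('  output %s -> %s' % (key, tensor.name))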
# Assumes `model`, `tokenizer`, `args`, `device`, `label_list`, `get_datas`,
# and `convert_examples_to_features` are defined earlier in the script.
from collections import defaultdict

import torch
from tqdm import tqdm

datasets = get_datas(args.test_file)
output_dict = {}
model.to(device)
print('Predicting by Bert....')
predicts = []
d = defaultdict(str)
for dataset in tqdm(datasets):
    if not len(dataset):
        # output_dict[pid] = ''
        continue
    examples = []
    for i, data in enumerate(dataset):
        if i < 64:  # cap each dataset at 64 candidate pairs
            examples.append(InputExample(i, data[1], data[2], '0'))
    eval_features = convert_examples_to_features(examples, label_list,
                                                 args.max_seq_length, tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long)
    input_ids = all_input_ids.to(device)
    input_mask = all_input_mask.to(device)
    segment_ids = all_segment_ids.to(device)
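    # The loop above stops after moving tensors to the device. A minimal
    # sketch of the missing forward pass, assuming `model` is a
    # BertForSequenceClassification-style module that returns logits when no
    # labels are passed (an assumption, not part of the original snippet):
    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask)
        probs = torch.softmax(logits, dim=-1)
    predicts.append(torch.argmax(probs, dim=-1).cpu().tolist())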
# Assumes the BERT repo's `tokenization` module, `InputExample`, and
# `convert_single_example` are importable alongside this script.
label_list = ['0', '1', '2']
max_seq_length = 70
vocab_file = "../data_model/chinese_L-12_H-768_A-12/vocab.txt"
do_lower_case = True
fix_label = "2"
# A Chinese laptop review: "Switching from Linux to XP took a whole afternoon.
# Cooling is poor: the CPU never drops below 50, and hits 70 in games, which is
# worrying. The touchpad cannot be disabled and gets hit while typing."
text = "linux改xp花了一下午时间。散热不好,cpu温度就没下过50,玩游戏能上70,比较吓人。触摸板关不掉,打字经常碰到。"
index = 0
ex_index = 0
guid = 'train-%d' % index  # guid distinguishes each example
text_a = tokenization.convert_to_unicode(text)  # the text to classify
# label = str(line[2])  # the sentiment class of the text
example = InputExample(guid=guid, text_a=text_a, text_b=None, label=fix_label)
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)
feature = convert_single_example(ex_index, example, label_list, max_seq_length,
                                 tokenizer)
label_ids = [0] * max_seq_length
input_dict = {}
input_dict["input_ids"] = [feature.input_ids]
input_dict["input_mask"] = [feature.input_mask]
input_dict["segment_ids"] = [feature.segment_ids]
input_dict["label_ids"] = [label_ids]
prob = predict_fn(input_dict)
print(prob)
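# `predict_fn` used above is not defined in the snippet; it is presumably a
# TF 1.x contrib predictor loaded from a SavedModel export. A sketch of how it
# would be created, with the export path being an assumption:
from tensorflow.contrib import predictor

predict_fn = predictor.from_saved_model("../data_model/exported_model")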