def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Compute the union of the current variables and checkpoint variables."""
    initialized_variable_names = {}

    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match("^(.*):\\d+$", name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    logger.info(init_checkpoint)
    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        if name not in name_to_variable:
            continue
        # https://github.com/google-research/bert/issues/383 -- map checkpoint names to the
        # graph variables to avoid the model-loading error described there.
        # assignment_map[name] = name
        assignment_map[name] = name_to_variable[name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ":0"] = 1

    return assignment_map, initialized_variable_names
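# Usage sketch (hedged; `bert_init_checkpoint` is a placeholder path, not a value defined
# in this repo): the returned assignment_map is normally handed to
# tf.train.init_from_checkpoint, exactly as model_fn does further below. Variables that are
# not present in the checkpoint (e.g. a freshly added CRF or classification head) keep
# their random initialization.
def restore_bert_weights_example(bert_init_checkpoint):
    tvars = tf.trainable_variables()
    assignment_map, initialized_names = get_assignment_map_from_checkpoint(
        tvars, bert_init_checkpoint)
    tf.train.init_from_checkpoint(bert_init_checkpoint, assignment_map)
    return initialized_names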
def train_ner():
    args = get_args_parser()
    param_str = '\n'.join(['%20s = %s' % (k, v)
                           for k, v in sorted(vars(args).items())])
    logger.info('usage: %s\n%20s %s\n%s\n%s\n' %
                (' '.join(sys.argv), 'ARG', 'VALUE', '_' * 50, param_str))
    logger.info(args)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    train(args=args)
def _read_tsv(cls, input_file, quotechar=None):
    """Reads a delimited value file (despite the name, the delimiter used here is ',')."""
    logger.info("_read_tsv file: %s" % input_file)
    reader = csv.reader(open(input_file, encoding="utf-8", mode="r"),
                        delimiter=",",
                        quotechar=quotechar)
    lines = []
    for line in reader:
        # logger.info(line)
        lines.append(line)
    # For debugging: uncomment to check that the model pipeline runs end to end.
    # lines = lines[:32]
    return lines
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
        feature = convert_single_example(ex_index, example, label_list,
                                         max_seq_length, tokenizer)
        features.append(feature)
    return features
def get_last_checkpoint(model_path):
    if not os.path.exists(os.path.join(model_path, 'checkpoint')):
        logger.info('checkpoint file does not exist: {}'.format(
            os.path.join(model_path, 'checkpoint')))
        return None
    last = None
    with codecs.open(os.path.join(model_path, 'checkpoint'), 'r',
                     encoding='utf-8') as fd:
        for line in fd:
            line = line.strip().split(':')
            if len(line) != 2:
                continue
            if line[0] == 'model_checkpoint_path':
                last = line[1][2:-1]
                break
    return last
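# Illustration of the plain-text `checkpoint` file that TF1 writes next to its checkpoints
# and that get_last_checkpoint() parses; the slice line[1][2:-1] strips the leading ' "'
# and the trailing '"':
#
#   model_checkpoint_path: "model.ckpt-500"
#   all_model_checkpoint_paths: "model.ckpt-0"
#   all_model_checkpoint_paths: "model.ckpt-500"
#
# so get_last_checkpoint(model_path) would return "model.ckpt-500" here.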
def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
    """Checks whether the casing config is consistent with the checkpoint name."""

    # The casing has to be passed in by the user and there is no explicit check
    # as to whether it matches the checkpoint. The casing information probably
    # should have been stored in the bert_config.json file, but it's not, so
    # we have to heuristically detect it to validate.
    if not init_checkpoint:
        logger.info("init_checkpoint is None")
        return
    logger.info(init_checkpoint)
    # m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
    # logger.info(m.group())
    # if m is None:
    #     logger.info("invalid init_checkpoint: %s" % init_checkpoint)
    #     return
    # model_name = m.group(1)
    model_name = "chinese_L-12_H-768_A-12"

    lower_models = [
        "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
        "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
    ]

    cased_models = [
        "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
        "multi_cased_L-12_H-768_A-12"
    ]
    logger.info("init_checkpoint model name: %s" % model_name)

    # Check that the model and the do_lower_case flag match.
    is_bad_config = False
    if model_name in lower_models and not do_lower_case:
        is_bad_config = True
        actual_flag = "False"
        case_name = "lowercased"
        opposite_flag = "True"

    if model_name in cased_models and do_lower_case:
        is_bad_config = True
        actual_flag = "True"
        case_name = "cased"
        opposite_flag = "False"

    if is_bad_config:
        raise ValueError(
            "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
            "However, `%s` seems to be a %s model, so you "
            "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
            "how the model was pre-trained. If this error is wrong, please "
            "just comment out this check." %
            (actual_flag, init_checkpoint, model_name, case_name, opposite_flag))
    logger.info("do_lower_case matches the model")
def filed_based_convert_examples_to_features(examples, label_list,
                                             max_seq_length, tokenizer,
                                             output_file, output_dir,
                                             mode=None):
    """
    Convert the examples into TFRecord format to be used as model input.
    :param examples: samples
    :param label_list: list of labels
    :param max_seq_length: predefined maximum sequence length
    :param tokenizer: tokenizer object
    :param output_file: output path of the tf.record file
    :param output_dir: output directory (used to persist the label map)
    :param mode:
    :return:
    """
    writer = tf.python_io.TFRecordWriter(output_file)
    # Iterate over the training data.
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
        # Convert each training sample into an InputFeatures object.
        feature = convert_single_example(ex_index, example, label_list,
                                         max_seq_length, tokenizer, output_dir,
                                         mode)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(
                value=list(values)))
            return f

        # Build an ordered feature dict and write it into the record.
        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_int_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["label_ids"] = create_int_feature(feature.label_ids)
        # features["label_mask"] = create_int_feature(feature.label_mask)
        # tf.train.Example/Feature is a protobuf schema that makes serialization easy.
        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()
def get_labels(self, labels=None):
    logger.info(self.labels)
    # Assemble the final label set used for prediction; this logic is messy and
    # partly redundant.
    if labels is not None:
        try:
            # Labels can be read from a file...
            if os.path.exists(labels) and os.path.isfile(labels):
                with codecs.open(labels, 'r', encoding='utf-8') as fd:
                    for line in fd:
                        self.labels.append(line.strip())
            else:
                # ...otherwise they are taken from the argument, comma separated.
                self.labels = labels.split(',')
            self.labels = set(self.labels)  # to set
        except Exception as e:
            print(e)
    # Deriving the labels from the train file carries some risk (rare labels may be missing).
    if os.path.exists(os.path.join(self.output_dir, 'label_list.pkl')):
        with codecs.open(os.path.join(self.output_dir, 'label_list.pkl'),
                         'rb') as rf:
            self.labels = pickle.load(rf)
        logger.info(self.labels)
    else:
        logger.info(self.labels)
        if len(self.labels) > 0:
            # X: used for WordPiece sub-word suffixes (e.g. English word pieces).
            self.labels = self.labels.union({"X", "[CLS]", "[SEP]"})
            with codecs.open(
                    os.path.join(self.output_dir, 'label_list.pkl'),
                    'wb') as rf:
                pickle.dump(self.labels, rf)
        else:
            raise Exception("the label set to output is invalid")
    return self.labels
def file_based_convert_examples_to_features(examples, label_list,
                                            max_seq_length, tokenizer,
                                            output_file):
    """
    Serialize the InputFeatures into a tfrecord file.
    :param examples:
    :param label_list:
    :param max_seq_length:
    :param tokenizer:
    :param output_file:
    :return:
    """
    writer = tf.python_io.TFRecordWriter(output_file)
    logger.info("preparing data....")
    for (ex_index, example) in tqdm(enumerate(examples)):
        feature = convert_single_example(ex_index, example, label_list,
                                         max_seq_length, tokenizer)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(
                value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_int_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["label_ids"] = create_int_feature([feature.label_id])
        features["is_real_example"] = create_int_feature(
            [int(feature.is_real_example)])
        features["Q_mask"] = create_int_feature(feature.Q_mask)

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()
    logger.info("data preparation finished!")
def data_for_sentimental(): # test: 将序列化标注的test数据解析作为模型的输入,利用到序列化标注的结果 columns = [ "ID", "AspectTerms", "Opinions", "Polarities", "Categories", "Review" ] path = "zhejiang/data_ner/ner_res.xlsx" df = pd.read_excel(path) df = df[columns].fillna(value="_") df.to_csv("./zhejiang/data_sentimental/test.csv", index=False) print(df[:3]) # train:将训练数据对应的label opinion提取并作为序列化标注的结果 df = pd.read_csv(open("zhejiang/data_sentimental/Train_labels.csv", encoding="utf-8"), header=0) df = df[["id", "AspectTerms", "OpinionTerms", "Polarities", "Categories"]] sentiment_ids = collections.OrderedDict() for index, senti in enumerate(set(df["Polarities"].values)): sentiment_ids[senti] = index logger.info(sentiment_ids) pd.Series(sentiment_ids).to_csv( "zhejiang/data_sentimental/sentiment_ids.csv") df["Polarities"] = df["Polarities"].apply( lambda x: sentiment_ids[x.strip()]) df.columns = columns[:-1] print(df[:3]) # 给训练数据添加review df_review = pd.read_csv(open("zhejiang/data_sentimental/Train_reviews.csv", encoding="utf8"), header=0, index_col=["id"], dtype=str) # print(df_review[:3]) f = lambda x: " ".join(list(sentence_clean(x))) df_review["Reviews"] = df_review["Reviews"].apply(f).values tmp = [df_review.loc[id]["Reviews"] for id in df["ID"].values] # logger.info(tmp) df["Review"] = tmp print(df_review[:3]) indexes = list(range(len(df))) random.shuffle(indexes) df = df.iloc[indexes] num_row = len(df) split_index = int(num_row * 0.2) df_dev = df[:split_index] df_train = df[split_index:] logger.info(len(df_dev)) logger.info(len(df_train)) df_train.to_csv("zhejiang/data_sentimental/train.csv", encoding="utf-8", index=False) df_dev.to_csv("zhejiang/data_sentimental/dev.csv", encoding="utf-8", index=False)
def _read_data(self, input_file, data_type=None):
    """Reads BIO data (overrides the parent-class method)."""
    logger.info(input_file)
    with codecs.open(input_file, 'r', encoding='utf-8') as f:
        lines = []
        words = []
        labels = []
        for line in f:
            contends = line.strip()
            tokens = contends.split('\t')
            if len(contends) != 0:
                if len(tokens) == 2:
                    # train / dev / test with gold labels
                    words.append(tokens[0])
                    labels.append(tokens[1])
                    self.labels.add(tokens[1])
                elif (len(tokens) == 1) and (data_type == "test"):
                    # test without gold labels
                    words.append(tokens[0])
                    labels.append("O")  # placeholder label only
                else:
                    logger.info(line)
                    logger.info(tokens)
                    raise Exception("malformed data sample")
            else:
                # An empty line marks the end of a sentence; join tokens with spaces.
                l = ' '.join([label for label in labels if len(label) > 0])
                w = ' '.join([word for word in words if len(word) > 0])
                lines.append([l, w])
                words = []
                labels = []
                continue
    # Each sample is stored as a [label_sequence, word_sequence] pair.
    logger.info(lines)
    return lines
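# Example of the BIO input that _read_data expects: one "token<TAB>label" pair per line and
# a blank line between sentences. The labels shown are illustrative and follow the simpler
# B_at/I_at scheme produced by data_for_squence (data_for_squence2 additionally appends a
# category id, e.g. B_at_0):
#
#   价	B_at
#   格	I_at
#   很	B_ot
#   便	I_ot
#   宜	I_ot
#
# For this sentence the function returns [["B_at I_at B_ot I_ot I_ot", "价 格 很 便 宜"]].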
def count_category(output_file, data_dir):
    df_labels = pd.read_csv(output_file,
                            encoding="utf-8",
                            delimiter=",",
                            header=0)
    cates = df_labels.Categories.values
    logger.info(Counter(cates))
    cates_ids = collections.OrderedDict()
    for v, k in enumerate(sorted(list(set(cates)))):
        cates_ids[k] = v
    logger.info(cates)
    pd.Series(cates_ids).to_csv(data_dir + "/category_ids.csv")
    logger.info(cates_ids)
    return cates_ids
def count_train_data(file):
    df = pd.read_csv(file, encoding="utf-8", delimiter=",", header=0)
    logger.info(df.columns)
    reviews = df["Reviews"].values
    logger.info(len(reviews))
    words = []
    for line in reviews:
        line = line.strip()
        if line:
            logger.info(line)
            line = re.sub(r"[a-zA-Z]+", "@", line)  # collapse Latin letter runs
            line = re.sub(r"\d+", "&", line)        # collapse digit runs
            line = re.sub(r"\s|\.", "", line)       # drop whitespace and dots
            words.extend(list(line))
    return Counter(words)
def count_predcited_aspect_opinion(): file = "./output/label_test.txt" with open(file, encoding="utf-8", mode="r") as file: ots = [] ats = [] flag = None word = "" for line in file.readlines(): # logger.info(line) line = line.strip() items = re.split("\s+", line) if len(items) != 3: if word: if flag == "at": ats.append(word) logger.info(word) elif flag == "ot": ots.append(word) logger.info(word) else: raise Exception("bug") flag = None word = "" continue # logger.info(items) # 寻找目标词汇 if line.endswith("B_at"): # 旧的目标 if word: if flag == "at": ats.append(word) logger.info(word) elif flag == "ot": ots.append(word) logger.info(word) else: raise Exception("bug") # 新目标 flag = "at" word = items[0] continue if word and flag == "at" and line.endswith("I_at"): # 寻找仅仅接着的 word += items[0] continue if word and flag == "at" and not line.endswith("I_at"): ats.append(word) logger.info(word) flag = None word = "" if line.endswith("B_ot"): # 旧的目标 if word: if flag == "at": ats.append(word) logger.info(word) elif flag == "ot": ots.append(word) logger.info(word) else: raise Exception("bug") # 新目标 flag = "ot" word = items[0] continue if word and flag == "ot" and line.endswith("I_ot"): # 寻找仅仅接着的 word += items[0] continue if word and flag == "ot" and not line.endswith("I_ot"): flag = None ots.append(word) logger.info(word) word = "" logger.info(ats) logger.info(ots) logger.info(Counter(ats)) logger.info(Counter(ots))
def get_dev_examples(self, data_dir):
    logger.info("get_dev_examples...>>>...")
    return self._create_example(
        self._read_data(os.path.join(data_dir, "dev.txt")), "dev")
def train(args): os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map processors = {"ner": NerProcessor} bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (args.max_seq_length, bert_config.max_position_embeddings)) # 在re train 的时候,才删除上一轮产出的文件,在predicted 的时候不做clean if args.clean and args.do_train: if os.path.exists(args.output_dir): def del_file(path): ls = os.listdir(path) for i in ls: c_path = os.path.join(path, i) if os.path.isdir(c_path): del_file(c_path) else: os.remove(c_path) try: del_file(args.output_dir) except Exception as e: print(e) print('pleace remove the files of output dir and data.conf') exit(-1) # check output dir exists if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) processor = processors[args.ner](args.output_dir) logger.info(args.data_dir) # 加载字典 tokenizer = FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) session_config = tf.ConfigProto(log_device_placement=False, inter_op_parallelism_threads=0, intra_op_parallelism_threads=0, allow_soft_placement=True) run_config = tf.estimator.RunConfig( model_dir=args.output_dir, save_summary_steps=500, # 这里写死了 前面定义无用 save_checkpoints_steps=500, session_config=session_config) train_examples = None eval_examples = None num_train_steps = None num_warmup_steps = None if args.do_train and args.do_eval: # 加载训练数据 train_examples = processor.get_train_examples(args.data_dir) logger.info(len(train_examples)) num_train_steps = int( len(train_examples) * 1.0 / args.batch_size * args.num_train_epochs) if num_train_steps < 1: raise AttributeError('training data is so small...') num_warmup_steps = int(num_train_steps * args.warmup_proportion) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.batch_size) logger.info(" Num steps = %d", num_train_steps) # 加载测试数据 eval_examples = processor.get_dev_examples(args.data_dir) # 打印验证集数据信息 logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.batch_size) # labels = ["B_at", "I_at", "B_ot", "I_ot", "O"] # label_list = processor.get_labels(labels) label_list = processor.get_labels() # 返回的model_dn 是一个函数,其定义了模型,训练,评测方法,并且使用钩子参数,加载了BERT模型的参数进行了自己模型的参数初始化过程 # tf 新的架构方法,通过定义model_fn 函数,定义模型,然后通过EstimatorAPI进行模型的其他工作,Es就可以控制模型的训练,预测,评估工作等。 model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list) + 1, init_checkpoint=args.init_checkpoint, learning_rate=args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, args=args) params = {'batch_size': args.batch_size} # 不同场景的dropout设置????, 如何实现 estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config) if args.do_train and args.do_eval: # 1. 
将数据转化为tf_record 数据 train_file = os.path.join(args.output_dir, "train.tf_record") if not os.path.exists(train_file): filed_based_convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, train_file, args.output_dir) # 2.读取record 训练数据,组成batch train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=args.max_seq_length, is_training=True, drop_remainder=True) # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) # eval的record eval_file = os.path.join(args.output_dir, "eval.tf_record") if not os.path.exists(eval_file): filed_based_convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, eval_file, args.output_dir) eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=args.max_seq_length, is_training=False, drop_remainder=False) # train and eval togither # early stop hook early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook( estimator=estimator, metric_name='loss', # loss没有提升的时候提前结束, 为啥不合适dev loss??? max_steps_without_decrease=num_train_steps, # 这里设置了最大值????? eval_dir=None, min_steps=0, run_every_secs=None, run_every_steps=args.save_checkpoints_steps) train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=num_train_steps, hooks=[early_stopping_hook]) eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn) tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) if args.do_predict: token_path = os.path.join(args.output_dir, "token_test.txt") if os.path.exists(token_path): os.remove(token_path) with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'), 'rb') as rf: label2id = pickle.load(rf) id2label = {value: key for key, value in label2id.items()} predict_examples = processor.get_test_examples(args.data_dir) predict_file = os.path.join(args.output_dir, "predict.tf_record") filed_based_convert_examples_to_features(predict_examples, label_list, args.max_seq_length, tokenizer, predict_file, args.output_dir, mode="test") logger.info("***** Running prediction*****") logger.info(" Num examples = %d", len(predict_examples)) logger.info(" Batch size = %d", args.batch_size) predict_drop_remainder = False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=args.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) # 这里没有进行维特比解码 如何获取序列化标注的结果???? 
result = estimator.predict(input_fn=predict_input_fn) logger.info(result) output_predict_file = os.path.join(args.output_dir, "label_test.txt") def result_to_pair(writer): for predict_line, prediction in zip(predict_examples, result): idx = 0 line = '' line_token = str(predict_line.text).split(' ') label_token = str(predict_line.label).split(' ') len_seq = len(label_token) if len(line_token) != len(label_token): logger.info(predict_line.text) logger.info(predict_line.label) break for id in prediction: if idx >= len_seq: break if id == 0: continue curr_labels = id2label[id] if curr_labels in ['[CLS]', '[SEP]', 'X']: continue try: line += line_token[idx] + ' ' + label_token[ idx] + ' ' + curr_labels + '\n' except Exception as e: logger.info(e) logger.info(predict_line.text) logger.info(predict_line.label) line = '' break idx += 1 writer.write(line + '\n') with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer: result_to_pair(writer) eval_result = return_report(output_predict_file) print(''.join(eval_result)) # 写结果到文件中 with codecs.open(os.path.join(args.output_dir, 'predict_score.txt'), 'a', encoding='utf-8') as fd: fd.write(''.join(eval_result)) # filter model if args.filter_adam_var: adam_filter(args.output_dir)
def data_for_squence2(input_file, output_file=None, data_dir="zhejiang/data_ner"): """ NER识别的数据准备 :param input_file: :param output_file: :return: """ max_len = 0 df_reviews = pd.read_csv(input_file, encoding="utf-8", delimiter=",", header=0) reviews = df_reviews["Reviews"].values # 句子清洗 sentences = list(map(sentence_clean, reviews)) df_reviews["Reviews"] = sentences # 序列化文本 if not output_file: # 句子序列化: test data sentences = list(map(list, sentences)) f = lambda list_words: "\n".join(list_words) sentences = list(map(f, sentences)) max_len = max([len(v) for v in sentences]) with open(data_dir + "/test.txt", mode="w", encoding="utf-8") as file: file.write("\n\n".join(sentences)) file.close() else: df_labels = pd.read_csv(output_file, encoding="utf-8", delimiter=",", header=0) cates_id = count_category(output_file, data_dir) logger.info(Counter(df_labels["Categories"].values)) # print(df_reviews.info()) # print(df_labels.info()) logger.info(cates_id) text = "" cols_name = "AspectTerms,A_start,OpinionTerms,O_start,Categories".split( ",") for col_id, col_review in tqdm(df_reviews[["id", "Reviews"]].values): # logger.info(col_id) # logger.info(col_review) col_id_df = df_labels.loc[df_labels.id == col_id] # logger.info(col_id_df) col_review = list(col_review) if len(col_review) > max_len: max_len = len(col_review) col_review_label = " ".join(col_review) # 用空格进行分开 logger # print(cols_name) for AspectTerms, A_start, OpinionTerms, O_start, Categories in col_id_df[ cols_name].values: cate_id = cates_id.get(Categories) # logger.info(AspectTerms) # logger.info(OpinionTerms) if AspectTerms != "_": suffix = "at_%d" % cate_id A_replaced = "B" + "I" * (len(AspectTerms) - 1) A_replaced = " ".join( [v + "_" + suffix for v in A_replaced]) col_review_label = col_review_label.replace( " ".join(list(AspectTerms)), A_replaced) # logger.info(col_review_label) if OpinionTerms != "_": obf = "m" # 修饰自身 try: A_start = int(A_start) O_start = int(O_start) if A_start < O_start: obf = "f" # 修饰前面aspect else: obf = "b" # 修饰后面aspect except: pass suffix = "ot_%d_%s" % (cate_id, obf) # logger.info(suffix) O_replaced = "B" + "I" * (len(OpinionTerms) - 1) O_replaced = " ".join( [v + "_" + suffix for v in O_replaced]) # logger.info(O_replaced) col_review_label = col_review_label.replace( " ".join(list(OpinionTerms)), O_replaced) # logger.info(col_review_label) col_review_label = col_review_label.split(" ") # logger.info(col_review) # logger.info(col_review_label) try: assert (len(col_review_label) == len(col_review)) # 其他地方已经进行过处理 # col_review = ["[CLS]"] + col_review # col_review_label = ["C"] + col_review_label tmp = [] for k, v in zip(col_review, col_review_label): v = v if v != k else "O" tmp.append(v) # print(k, v) text += k + "\t" + v + "\n" text += "\n" if sum([v == "O" for v in tmp]) == len(tmp): raise Exception except: logger.info(col_review) logger.info(col_review_label) # continue logger.info("数据存在问题") break text = text.strip().split("\n\n") random.shuffle(text) num_doc = len(text) split_index = int(num_doc * 0.2) text_dev = text[:split_index] text_train = text[split_index:] logger.info(len(text_dev)) logger.info(len(text_train)) with open(data_dir + "/dev.txt", mode="w", encoding="utf-8") as file: file.write("\n\n".join(text_dev)) file.close() with open(data_dir + "/train.txt", mode="w", encoding="utf-8") as file: file.write("\n\n".join(text_train)) file.close() return max_len
def model_fn(features, labels, mode, params): """features: 字典 mode:代表 train dev test 输入固定的吗?必须这么定义模型的输入函数""" logger.info("*** Features ***") for name in sorted(features.keys()): logger.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] label_ids = features["label_ids"] # logger.info('shape of input_ids', input_ids.shape) # label_mask = features["label_mask"] is_training = (mode == tf.estimator.ModeKeys.TRAIN) # 使用参数构建模型,input_idx 就是输入的样本idx表示,label_ids 就是标签的idx表示 # trans 为转移矩阵 total_loss, logits, trans, pred_ids = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, False, args.dropout_rate, args.lstm_size, args.cell, args.num_layers) # 加载BERT模型: 已经有训练的参数就加载,后面新加入的crf参数就没有??? tvars = tf.trainable_variables() # 获取所有能训练的参数 if init_checkpoint: # 获取能加载的参数 (assignment_map, initialized_variable_names) = get_assignment_map_from_checkpoint( tvars, init_checkpoint) # 加载能加载的参数 tf.train.init_from_checkpoint(init_checkpoint, assignment_map) # 打印加载模型的参数 logger.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: # 这里代表能加载的参数,否则不是加载 init_string = ", *INIT_FROM_CKPT*" logger.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) if mode == tf.estimator.ModeKeys.TRAIN: # train 优化器定义 # train_op = optimizer.optimizer(total_loss, learning_rate, num_train_steps) train_op = create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, False) hook_dict = {} hook_dict['loss'] = total_loss hook_dict['global_steps'] = tf.train.get_or_create_global_step() logging_hook = tf.train.LoggingTensorHook( hook_dict, every_n_iter=args.save_summary_steps) output_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, training_hooks=[logging_hook]) elif mode == tf.estimator.ModeKeys.EVAL: # dev # 针对NER ,进行了修改 def metric_fn(label_ids, pred_ids): return { # 采用均方误差???? "eval_loss": tf.metrics.mean_squared_error(labels=label_ids, predictions=pred_ids), } eval_metrics = metric_fn(label_ids, pred_ids) output_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, eval_metric_ops=eval_metrics) else: # test output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=pred_ids) return output_spec # 这个是什么????
def embedding_lookup(input_ids, vocab_size, embedding_size=128, initializer_range=0.02, word_embedding_name="word_embeddings", use_one_hot_embeddings=False): """Looks up words embeddings for id tensor. Args: input_ids: int32 Tensor of shape [batch_size, seq_length] containing word ids. vocab_size: int. Size of the embedding vocabulary. embedding_size: int. Width of the word embeddings. initializer_range: float. Embedding initialization range. 用于初始化时候embedding限制 word_embedding_name: string. Name of the embedding table. use_one_hot_embeddings: bool. If True, use one-hot method for word embeddings. If False, use `tf.gather()`. Returns: float Tensor of shape [batch_size, seq_length, embedding_size]. """ # This function assumes that the input is of shape [batch_size, seq_length, num_inputs]. ??? # If the input is a 2D tensor of shape [batch_size, seq_length], we # reshape to [batch_size, seq_length, 1]. if input_ids.shape.ndims == 2: input_ids = tf.expand_dims(input_ids, axis=[-1]) logger.info(get_shape_list(input_ids)) # 初始化embedding的值 # Q K V 前面的线性层 embedding_table = tf.get_variable( name=word_embedding_name, shape=[vocab_size, embedding_size], initializer=create_initializer(initializer_range)) logger.info(get_shape_list(embedding_table)) # 多种维度的特征 按最后一个维度规整到一维? flat_input_ids = tf.reshape(input_ids, [-1]) logger.info(get_shape_list(flat_input_ids)) if use_one_hot_embeddings: one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) logger.info(get_shape_list(one_hot_input_ids)) # 查表映射下 output = tf.matmul(one_hot_input_ids, embedding_table) else: # 按下标取出字集合 output = tf.gather(embedding_table, flat_input_ids) input_shape = get_shape_list(input_ids) logger.info(input_shape) logger.info(input_shape[0:-1] + [input_shape[-1] * embedding_size]) # 最后一个维度在这里没有了 output = tf.reshape(output, input_shape[0:-1] + [input_shape[-1] * embedding_size]) return output, embedding_table
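# Minimal numpy sketch (toy sizes, not part of the training code) of why the two lookup
# branches in embedding_lookup are equivalent: multiplying one-hot rows by the embedding
# table selects exactly the rows that tf.gather picks out directly.
def _lookup_equivalence_demo():
    import numpy as np
    vocab_size, embedding_size = 5, 3
    table = np.arange(vocab_size * embedding_size,
                      dtype=np.float32).reshape(vocab_size, embedding_size)
    flat_ids = np.array([1, 4, 0])
    one_hot = np.eye(vocab_size, dtype=np.float32)[flat_ids]  # [3, vocab_size]
    via_matmul = one_hot @ table                               # one-hot lookup
    via_gather = table[flat_ids]                               # direct row gather
    assert np.allclose(via_matmul, via_gather)
    return via_gather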
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode): """ 将一个样本进行分析,然后将字转化为id, 标签转化为id,然后结构化到InputFeatures对象中 :param ex_index: index :param example: 一个样本 :param label_list: 标签列表 :param max_seq_length: :param tokenizer: :param output_dir :param mode: :return: """ label_map = {} # 1表示从1开始对label进行index化 for (i, label) in enumerate(label_list, 1): label_map[label] = i # logger.info(label_map) # 保存label->index 的map if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')): with codecs.open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as w: pickle.dump(label_map, w) textlist = example.text.split(' ') labellist = example.label.split(' ') # 前面解析时候使用的空格 tokens = [] labels = [] for i, word in enumerate(textlist): # 分词,如果是中文,就是分字,但是对于一些不在BERT的vocab.txt中得字符会被进行WordPice处理(例如中文的引号),可以将所有的分字操作替换为list(input) token = tokenizer.tokenize(word) # token = [word] # 这里不用wordPiece tokens.extend(token) label_1 = labellist[i] for m in range(len(token)): # 对 wordpiece 词干后缀用 X 填充序列 反序列化时候解析需要用上 if m == 0: labels.append(label_1) else: labels.append("X") # tokens = tokenizer.tokenize(example.text) # 序列截断 if len(tokens) >= max_seq_length - 1: tokens = tokens[0:(max_seq_length - 2)] # -2 的原因是因为序列需要加一个句首和句尾标志 labels = labels[0:(max_seq_length - 2)] # 输入的特征维度 这里给处两个维度,一个 词汇的 token 一个是句子编号 ntokens = [] segment_ids = [] # 输出的信号 label_ids = [] # 分别均添加句子开头 ntokens.append("[CLS]") # 句子开始设置CLS 标志 segment_ids.append(0) # append("O") or append("[CLS]") not sure! # [CLS] 对应 [CLS] 的label label_ids.append( label_map["[CLS]"] ) # O OR CLS 没有任何影响,不过我觉得O 会减少标签个数,不过拒收和句尾使用不同的标志来标注,使用LCS 也没毛病 for i, token in enumerate(tokens): ntokens.append(token) segment_ids.append(0) label_ids.append(label_map[labels[i]]) # 添加句子的结尾 ntokens.append("[SEP]") # 句尾添加[SEP] 标志 segment_ids.append(0) # append("O") or append("[SEP]") not sure! label_ids.append(label_map["[SEP]"]) # 输入的词汇转化为词典定义好的id # logger.info(ntokens) input_ids = tokenizer.convert_tokens_to_ids( ntokens) # 将序列中的字(ntokens)转化为ID形式 input_mask = [1] * len(input_ids) # 此处何用啊???? 对于真实存在输入信号的位置为1, 不足填充0 # label_mask = [1] * len(input_ids) # padding, 使用, 长度不够进行填充0 while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) # we don't concerned about it! label_ids.append(0) ntokens.append("**NULL**") # label_mask.append(0) # print(len(input_ids)) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length assert len(label_ids) == max_seq_length # assert len(label_mask) == max_seq_length # 打印部分样本数据信息 if ex_index < 3: logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) # logger.info("tokens: %s" % " ".join([printable_text(x) for x in tokens])) logger.info("tokens: %s" % " ".join(ntokens)) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) logger.info("label_ids: %s" % " ".join([str(x) for x in label_ids])) # logger.info("label_mask: %s" % " ".join([str(x) for x in label_mask])) # 结构化为一个类 feature = InputFeatures( input_ids=input_ids, # 序列化标注的时候从第2个采用用 解析的时候 input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids, # label_mask = label_mask ) # mode='test'的时候才有效 write_tokens(ntokens, output_dir, mode) return feature
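# Toy illustration (hypothetical tokens, for documentation only) of the "X" rule in
# convert_single_example: when WordPiece splits a word, only the first sub-token keeps the
# real label and the remaining pieces get "X", which is skipped again when predictions are
# written out.
#
#   textlist = ["playing", "well"]        labellist = ["B_ot", "I_ot"]
#   tokens   = ["play", "##ing", "well"]  labels    = ["B_ot", "X",  "I_ot"]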
def get_test_examples(self, data_dir):
    logger.info("get_test_examples...>>>...")
    return self._create_example(
        self._read_data(os.path.join(data_dir, "test.txt"), data_type="test"),
        "test")
def __init__( self, sequence1, sequence2, num_classes=3, d_a_size=3, r_size=4, fc_size=10, ): # Placeholders for input, output and dropout self._length, sequence_length, hidden_size = get_shape_list(sequence1) logger.info(get_shape_list(sequence1)) initializer = tf.contrib.layers.xavier_initializer() self.output_fw = sequence1 self.output_bw = sequence2 self.H = tf.concat([self.output_fw, self.output_bw], axis=2) logger.info(get_shape_list(self.H)) H_reshape = tf.reshape(self.H, [-1, 2 * hidden_size]) logger.info(get_shape_list(H_reshape)) with tf.variable_scope("self-attention"): with tf.variable_scope("attention_A"): self.W_s1 = tf.get_variable("W_s1", shape=[2 * hidden_size, d_a_size], initializer=initializer) _H_s1 = tf.nn.tanh(tf.matmul(H_reshape, self.W_s1)) self.W_s2 = tf.get_variable("W_s2", shape=[d_a_size, r_size], initializer=initializer) _H_s2 = tf.matmul(_H_s1, self.W_s2) _H_s2_reshape = tf.transpose( tf.reshape(_H_s2, [-1, sequence_length, r_size]), [0, 2, 1]) self.A = tf.nn.softmax(_H_s2_reshape, name="attention") with tf.variable_scope("sentence-embedding"): self.M = tf.matmul(self.A, self.H) with tf.variable_scope("fully-connected"): # self.M_pool = tf.reduce_mean(self.M, axis=1) # W_fc = tf.get_variable("W_fc", shape=[2 * hidden_size, fc_size], initializer=initializer) self.M_flat = tf.reshape(self.M, shape=[-1, 2 * hidden_size * r_size]) W_fc = tf.get_variable( "W_fc", shape=[2 * hidden_size * r_size, fc_size], initializer=initializer) b_fc = tf.Variable(tf.constant(0.1, shape=[fc_size]), name="b_fc") self.fc = tf.nn.relu(tf.nn.xw_plus_b(self.M_flat, W_fc, b_fc), name="fc") with tf.variable_scope("output"): W_output = tf.get_variable("W_output", shape=[fc_size, num_classes], initializer=initializer) b_output = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b_output") self.logits = tf.nn.xw_plus_b(self.fc, W_output, b_output, name="logits") self.predictions = tf.argmax(self.logits, 1, name="predictions") with tf.variable_scope("penalization"): self.AA_T = tf.matmul(self.A, tf.transpose(self.A, [0, 2, 1])) self.I = tf.reshape( tf.tile(tf.eye(r_size), [tf.shape(self.A)[0], 1]), [-1, r_size, r_size]) self.P = tf.square( tf.norm(self.AA_T - self.I, axis=[-2, -1], ord="fro"))
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument # features ??? """The `model_fn` for TPUEstimator.""" logger.info( "******************************* Features ***********************************" ) for name in sorted(features.keys()): logger.info(" name = %s, shape = %s" % (name, features[name].shape)) # 获取 输入的的对应参数 input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] label_ids = features["label_ids"] Q_mask = features["Q_mask"] is_real_example = None if "is_real_example" in features: is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) else: is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) is_training = (mode == tf.estimator.ModeKeys.TRAIN) # 模型搭建 logger.info(Q_mask) total_loss, per_example_loss, logits, probabilities = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, use_one_hot_embeddings, Q_mask=Q_mask) tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: # 加载pre train的模型 assignment_map, initialized_variable_names = modeling.get_assignment_map_from_checkpoint( tvars, init_checkpoint) # 打印加载模型的参数 logger.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: # 这里代表能加载的参数,否则不是加载 init_string = ", *INIT_FROM_CKPT*" logger.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) logger.info("**** Trainable Variables ****") for var in tvars: # 将加载的模型的参数进行输出展示 init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" logger.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: # 训练 train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) output_spec = tf.contrib.tpu.TPUEstimatorSpec( # 黑科技?????????? mode=mode, loss=total_loss, train_op=train_op, # ???? scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(per_example_loss, label_ids, logits, is_real_example): # 计算一个eval的准确率,以及一个平均loss predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) accuracy = tf.metrics.accuracy(labels=label_ids, predictions=predictions, weights=is_real_example) loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) return { "eval_accuracy": accuracy, "eval_loss": loss, } eval_metrics = (metric_fn, [ per_example_loss, label_ids, logits, is_real_example ]) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=eval_metrics, ## ??? scaffold_fn=scaffold_fn) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions={"probabilities": probabilities}, scaffold_fn=scaffold_fn) return output_spec
def create_model( bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings, Q_mask, p_coef=0.004 # 注意机制的惩罚系数 ): """Creates a classification model.""" model = modeling.BertModel(config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, Q_mask=Q_mask) # In the demo, we are doing a simple classification task on the entire segment. # # If you want to use the token-level output, use model.get_sequence_output() # instead. output_layer = model.get_sequence_output() # 隐藏层最后的输出 logger.info(modeling.get_shape_list(output_layer)) output_layer2 = model.get_sequence_output2() logger.info(modeling.get_shape_list(output_layer2)) if is_training: # I.e., 0.1 dropout output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) output_layer2 = tf.nn.dropout(output_layer2, keep_prob=0.9) # 添加一个住一层输出机制 # 这里几个参数 d_a 是超参数,代表网络的复杂程度,就像网络的层数一样 # fc 全连接的层的节点的大小 # r_size 代表对句子embedding的维度的序列长度大小 aspect_opinion_attention = SelfAttention2(output_layer, output_layer2, num_classes=num_labels, d_a_size=64, r_size=10, fc_size=10) logits, panel = aspect_opinion_attention.get_output() probabilities = tf.nn.softmax(logits, axis=-1) log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) # 每个样本的交叉信息熵 per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) # 求均值损失 + 注意的损失惩罚 loss = tf.reduce_mean(per_example_loss) + tf.reduce_mean(panel * p_coef) with tf.Session() as sess: tf.summary.FileWriter("logs/run_classifier/", sess.graph) logger.info( "=======================save graph===========================") return loss, per_example_loss, logits, probabilities
def data_for_squence(input_file, output_file=None): df_reviews = pd.read_csv(input_file, encoding="utf-8", delimiter=",", header=0) reviews = df_reviews["Reviews"].values # 句子清洗 sentences = list(map(sentence_clean, reviews)) df_reviews["Reviews"] = sentences # 序列化文本 if not output_file: # 句子序列化: test data sentences = list(map(list, sentences)) f = lambda list_words: "\n".join(list_words) sentences = list(map(f, sentences)) with open(r"D:\projects_py\bert\zhejiang\data\test.txt", mode="w", encoding="utf-8") as file: file.write("\n\n".join(sentences)) file.close() else: df_labels = pd.read_csv(output_file, encoding="utf-8", delimiter=",", header=0) logger.info(Counter(df_labels["Categories"].values)) # print(df_reviews.info()) # print(df_labels.info()) text = "" for col_id, col_review in tqdm(df_reviews[["id", "Reviews"]].values): # logger.info(col_id) # logger.info(col_review) col_id_df = df_labels.loc[df_labels.id == col_id] # print(col_id_df) col_id_aspects = [ v for v in col_id_df["AspectTerms"].values if "_" != v ] # logger.info(col_id_aspects) col_review_label = col_review for v in col_id_aspects: # logger.info(v) if v: v_replaced = "[B_at]" + "[I_at]" * (len(v) - 1) col_review_label = re.sub(v, v_replaced, col_review_label, 1) # logger.info(col_review_label) col_id_opinions = [ v for v in col_id_df["OpinionTerms"].values if "_" != v ] for v in col_id_opinions: if v: v_replaced = "[B_ot]" + "[I_ot]" * (len(v) - 1) col_review_label = re.sub(v, v_replaced, col_review_label, 1) # logger.info(col_review_label) tmp = [v for v in re.split("\]|\[", col_review_label) if v] # logger.info(tmp) col_review_label = [[v] if v.endswith("t") else list(v) for v in tmp] # logger.info(col_review_label) # logger.info(col_review) tmp = [] for v in col_review_label: tmp.extend(v) col_review_label = tmp # logger.info(tmp) col_review = list(col_review) logger.info(col_review) logger.info(col_review_label) try: assert (len(col_review_label) == len(col_review)) # 其他地方已经进行过处理 # col_review = ["[CLS]"] + col_review # col_review_label = ["C"] + col_review_label for k, v in zip(col_review, col_review_label): v = v if v != k else "O" print(k, v) text += k + "\t" + v + "\n" text += "\n" except: logger.info(col_review) logger.info(col_review_label) # continue break text = text.strip().split("\n\n") random.shuffle(text) num_doc = len(text) split_index = int(num_doc * 0.2) text_dev = text[:split_index] text_train = text[split_index:] logger.info(len(text_dev)) logger.info(len(text_train)) with open(r"D:\projects_py\bert\zhejiang\data\dev.txt", mode="w", encoding="utf-8") as file: file.write("\n\n".join(text_dev)) file.close() with open(r"D:\projects_py\bert\zhejiang\data\train.txt", mode="w", encoding="utf-8") as file: file.write("\n\n".join(text_train)) file.close()
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer): """ 将一个InputExample 计算序列化 转化为 InputExample :param ex_index: :param example: :param label_list: :param max_seq_length: :param tokenizer: :return: """ if isinstance(example, PaddingInputExample): # 最后一个batch可能存在数量不够用 空的来充数 return InputFeatures(input_ids=[0] * max_seq_length, input_mask=[0] * max_seq_length, segment_ids=[0] * max_seq_length, label_id=0, Q_mask=[0] * max_seq_length, is_real_example=False) label_map = {} for (i, label) in enumerate(label_list): label_map[label] = i tokens_a = tokenizer.tokenize(example.text_a) tokens_b = None if example.text_b: tokens_b = tokenizer.tokenize(example.text_b) if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: tokens_a = tokens_a[0:(max_seq_length - 2)] # The convention in BERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = [] segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in tokens_a: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) if tokens_b: for token in tokens_b: tokens.append(token) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # 计算 Q_mask Q_mask = "\t".join(tokens) if example.aspect: aspect = "\t".join(list(example.aspect)) Q_mask = Q_mask.replace(aspect, "\t".join(["mask"] * len(example.aspect))) if example.opinion: opinion = "\t".join(list(example.opinion)) Q_mask = Q_mask.replace(opinion, "\t".join(["mask"] * len(example.opinion))) Q_mask = [1 if v == "mask" else 0 for v in Q_mask.split("\t")] # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. 
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        Q_mask.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(Q_mask) == max_seq_length

    label_id = label_map[example.label]
    if ex_index < 3:
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("tokens: %s" %
                    " ".join([tokenization.printable_text(x) for x in tokens]))
        logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logger.info("Q_mask: %s" % " ".join([str(x) for x in Q_mask]))
        logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        logger.info("label: %s (id = %d)" % (example.label, label_id))

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            Q_mask=Q_mask,
                            is_real_example=True)
    return feature
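# Toy demo of the Q_mask construction in convert_single_example (the phrase "价格很便宜"
# with aspect "价格" and opinion "很便宜" is borrowed from the field descriptions in
# parse_ner_predict; real inputs use tokenizer output, which for Chinese is essentially one
# character per token): marked aspect/opinion tokens become 1, everything else 0.
def _q_mask_demo():
    tokens = list("价格很便宜")
    aspect, opinion = "价格", "很便宜"
    q = "\t".join(tokens)
    q = q.replace("\t".join(list(aspect)), "\t".join(["mask"] * len(aspect)))
    q = q.replace("\t".join(list(opinion)), "\t".join(["mask"] * len(opinion)))
    return [1 if v == "mask" else 0 for v in q.split("\t")]  # -> [1, 1, 1, 1, 1]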
def parse_ner_predict(predicted_file, category_ids_file, data_dir): category_ids = {} for k, v in pd.read_csv(open(category_ids_file), header=None).values: category_ids[v] = k logger.info(category_ids) assert (len(category_ids) != 0) # (1)评论ID(ID):ID是每一条用户评论的唯一标识。 # # (2)用户评论(Reviews):用户对商品的评论原文。 # # (3)属性特征词(AspectTerms):评论原文中的商品属性特征词。例如“价格很便宜”中的“价格”。该字段结果须与评论原文中的表述保持一致。 # # (4)观点词(OpinionTerms):评论原文中,用户对商品某一属性所持有的观点。例如“价格很便宜”中的“很便宜”。该字段结果须与评论原文中的表述保持一致。 # # (5)观点极性(Polarity):用户对某一属性特征的观点所蕴含的情感极性,即负面、中性或正面三类。 # # (6)属性种类(Category):相似或同类的属性特征词构成的属性种类。例如“快递”和“物流”两个属性特征词都可归入“物流”这一属性种类 res = [] with open(predicted_file, encoding="utf-8", mode="r") as file: items = file.read().strip().split("\n\n") logger.info(len(items)) # ID AspectTerms Opinions Polarities Categories patt = re.compile(". O O") for id, item in enumerate(items, 1): # 分析每个句子 review = " ".join([line[0] for line in item.split("\n")]) logger.info(id) logger.info(review) ner_tokens_fake = [ v.strip() for v in patt.split(item.strip()) if v.strip() ] # 这里还没有将标注的分开出来 # logger.info(ner_tokens_fake) ner_tokens = [] for fake in ner_tokens_fake: # 存在识别的结果存在连续现象 fake_lines = fake.split("\n") # ner识别群每个行 token = [] for line in fake_lines: if line[4] == "B" and token: # 以B字母为分割 ner_tokens.append("\n".join(token)) token = [line] else: token.append(line) if token: ner_tokens.append("\n".join(token)) # logger.info(ner_tokens) # 将token拆解成词汇 和 词汇第一个字符对应的标注信息 word_info_pairs = [] for token in ner_tokens: token_lines = token.split("\n") word = "".join([l[0] for l in token_lines]) # 去第一个序列标注化后的结果提取信息 if "at" in token_lines[0]: info = token_lines[0][6:].split("_") else: info = token_lines[-1][6:].split("_") word_info_pairs.append([word, info]) # 解析结果到df的行 ner_tokens_res = [] for index, word_info in enumerate(word_info_pairs): word, info = word_info category = category_ids.get(int(info[1])) # logger.info(category) # logger.info(info) if info[0] == "at": aspect = word # aspect opinion = None if index > 0: # 向前寻找修饰的情感词汇 former_word, former_info = word_info_pairs[index - 1] if former_info[0] == "ot" and former_info[-1] == "b": opinion = former_word if not opinion and index < (len(word_info_pairs) - 1): # 向后寻找修饰词汇 next_word, next_info = word_info_pairs[index + 1] if next_info[0] == "ot" and next_info[-1] == "f": opinion = next_word row = [id, review, aspect, opinion, None, category, item] ner_tokens_res.append(row) if info[0] == "ot" and info[-1] == "m": opinion = word row = [id, review, None, opinion, None, category, item] ner_tokens_res.append(row) res.extend(ner_tokens_res) # break df = pd.DataFrame(data=res, columns=[ "ID", "Review", "AspectTerms", "Opinions", "Polarities", "Categories", "Ner" ]) df[["ID", "AspectTerms", "Opinions", "Polarities", "Categories"]].to_csv(data_dir + "/ner_res.csv", index=False)
def main(_): # tf.logging.set_verbosity(logger.info) processors = { "zhejiang": ZhejiangProcesser, } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) # fine tuning 句子的长度不能比 pre train 长 if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) # 创建输出文件夹 tf.gfile.MakeDirs(FLAGS.output_dir) # 必须为每一task定义一个processor task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) # 选取指定的数据 logger.info("数据集合是: %s" % task_name) processor = processors[task_name]() # 事先定义好分类类别的label集合 label_list = processor.get_labels() logger.info(label_list) # 加载词典,并将词典编号处理 tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) # 忽略tpu tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 tpu_config = tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host) # RunConfig的设置 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tpu_config) train_examples = None num_train_steps = None num_warmup_steps = None # 获取训练的数据 if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) # 学习速率的确定??? num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) # 获取函数 model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) logger.info("是否加载训练数据:") logger.info(FLAGS.do_train) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", FLAGS.train_batch_size) logger.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) logger.info("是否加载DEV数据:") logger.info(FLAGS.do_eval) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(PaddingInputExample()) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) logger.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) logger.info("是否加载TEST数据:") logger.info(FLAGS.do_eval) if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. 
while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") file_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) logger.info("***** Running prediction*****") logger.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) logger.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as writer: num_written_lines = 0 logger.info("***** Predict results *****") for (i, prediction) in enumerate(result): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" writer.write(output_line) num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
def __init__(self, sequence_length=128, num_classes=3, vocab_size=20000, embedding_size=768, hidden_size=768, d_a_size=3, r_size=4, fc_size=10, p_coef=0.004): # Placeholders for input, output and dropout self.input_text = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_text') self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y') text_length = self._length(self.input_text) initializer = tf.contrib.layers.xavier_initializer() # Embeddings with tf.device('/cpu:0'), tf.name_scope("embedding"): self.W_text = tf.Variable(tf.random_uniform( [vocab_size, embedding_size], -1.0, 1.0), name="W_text") self.embedded_chars = tf.nn.embedding_lookup( self.W_text, self.input_text) # Bidirectional(Left&Right) Recurrent Structure with tf.name_scope("bi-lstm"): fw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size) bw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size) (self.output_fw, self.output_bw), states = tf.nn.bidirectional_dynamic_rnn( cell_fw=fw_cell, cell_bw=bw_cell, inputs=self.embedded_chars, sequence_length=text_length, dtype=tf.float32) self.H = tf.concat([self.output_fw, self.output_bw], axis=2) logger.info(get_shape_list(self.H)) H_reshape = tf.reshape(self.H, [-1, 2 * hidden_size]) logger.info(get_shape_list(H_reshape)) with tf.name_scope("self-attention"): self.W_s1 = tf.get_variable("W_s1", shape=[2 * hidden_size, d_a_size], initializer=initializer) _H_s1 = tf.nn.tanh(tf.matmul(H_reshape, self.W_s1)) self.W_s2 = tf.get_variable("W_s2", shape=[d_a_size, r_size], initializer=initializer) _H_s2 = tf.matmul(_H_s1, self.W_s2) _H_s2_reshape = tf.transpose( tf.reshape(_H_s2, [-1, sequence_length, r_size]), [0, 2, 1]) self.A = tf.nn.softmax(_H_s2_reshape, name="attention") with tf.name_scope("sentence-embedding"): self.M = tf.matmul(self.A, self.H) with tf.name_scope("fully-connected"): # self.M_pool = tf.reduce_mean(self.M, axis=1) # W_fc = tf.get_variable("W_fc", shape=[2 * hidden_size, fc_size], initializer=initializer) self.M_flat = tf.reshape(self.M, shape=[-1, 2 * hidden_size * r_size]) W_fc = tf.get_variable("W_fc", shape=[2 * hidden_size * r_size, fc_size], initializer=initializer) b_fc = tf.Variable(tf.constant(0.1, shape=[fc_size]), name="b_fc") self.fc = tf.nn.relu(tf.nn.xw_plus_b(self.M_flat, W_fc, b_fc), name="fc") with tf.name_scope("output"): W_output = tf.get_variable("W_output", shape=[fc_size, num_classes], initializer=initializer) b_output = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b_output") self.logits = tf.nn.xw_plus_b(self.fc, W_output, b_output, name="logits") self.predictions = tf.argmax(self.logits, 1, name="predictions") with tf.name_scope("penalization"): self.AA_T = tf.matmul(self.A, tf.transpose(self.A, [0, 2, 1])) self.I = tf.reshape( tf.tile(tf.eye(r_size), [tf.shape(self.A)[0], 1]), [-1, r_size, r_size]) self.P = tf.square( tf.norm(self.AA_T - self.I, axis=[-2, -1], ord="fro")) # Calculate mean cross-entropy loss with tf.name_scope("loss"): losses = tf.nn.softmax_cross_entropy_with_logits( logits=self.logits, labels=self.input_y) self.loss_P = tf.reduce_mean(self.P * p_coef) self.loss = tf.reduce_mean(losses) + self.loss_P # Accuracy with tf.name_scope("accuracy"): correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, axis=1)) self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name="accuracy")
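# Minimal numpy sketch (toy 2x3 attention matrix, batch size 1 assumed) of the penalization
# term P = ||A·A^T - I||_F^2 computed in the "penalization" scope above: it stays small when
# the r_size attention rows focus on different positions (near-orthogonal rows) and grows
# when the attention hops collapse onto the same tokens.
def _penalization_demo():
    import numpy as np
    A = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.2, 0.7]], dtype=np.float32)  # rows sum to 1, like softmax output
    AA_T = A @ A.T                                     # [r_size, r_size]
    P = np.square(np.linalg.norm(AA_T - np.eye(2), ord="fro"))
    return P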