Code example #1
 def result_to_pair(writer):
     for predict_line, prediction in zip(predict_examples, result):
         idx = 0
         line = ''
         line_token = str(predict_line.text).split(' ')
         label_token = str(predict_line.label).split(' ')
         len_seq = len(label_token)
         if len(line_token) != len(label_token):
             logger.info(predict_line.text)
             logger.info(predict_line.label)
             break
         for id in prediction:
             if idx >= len_seq:
                 break
             if id == 0:
                 continue
             curr_labels = id2label[id]
             if curr_labels in ['[CLS]', '[SEP]', 'X']:
                 continue
             try:
                 line += line_token[idx] + ' ' + label_token[
                     idx] + ' ' + curr_labels + '\n'
             except Exception as e:
                 logger.info(e)
                 logger.info(predict_line.text)
                 logger.info(predict_line.label)
                 line = ''
                 break
             idx += 1
         writer.write(line + '\n')
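The writer above emits one `token gold_label predicted_label` triple per line, with a blank line between sentences; this is the file later consumed by return_report in example #17. A hypothetical fragment (labels invented for illustration):

手 B_at B_at
机 I_at I_at
很 O O
好 B_ot O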
Code example #2
def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Compute the union of the current variables and checkpoint variables."""
    assignment_map = {}
    initialized_variable_names = {}

    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match("^(.*):\\d+$", name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var
    logger.info(init_checkpoint)

    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        if name not in name_to_variable:
            continue

        # https://github.com/google-research/bert/issues/383  fixes a model-loading error
        # assignment_map[name] = name
        assignment_map[name] = name_to_variable[name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ":0"] = 1

    return assignment_map, initialized_variable_names
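The `"^(.*):\\d+$"` match above strips TensorFlow's `:0` output-index suffix so checkpoint variable names can be compared against graph variable names. A minimal standalone sketch (not from the source) of that normalization step:

import re

def strip_output_suffix(name):
    # "bert/embeddings/word_embeddings:0" -> "bert/embeddings/word_embeddings"
    m = re.match("^(.*):\\d+$", name)
    return m.group(1) if m is not None else name

print(strip_output_suffix("bert/encoder/layer_0/attention/self/query/kernel:0"))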
Code example #3
File: run.py  Project: wangbq18/bert
def train_ner():
    args = get_args_parser()
    param_str = '\n'.join(['%20s = %s' % (k, v) for k, v in sorted(vars(args).items())])
    logger.info('usage: %s\n%20s   %s\n%s\n%s\n' % (' '.join(sys.argv), 'ARG', 'VALUE', '_' * 50, param_str))
    logger.info(args)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    train(args=args)
Code example #4
 def _read_tsv(cls, input_file, quotechar=None):
     """Reads a tab separated value file."""
     logger.info("_read_tsv file: %s" % input_file)
     reader = csv.reader(open(input_file, encoding="utf-8", mode="r"),
                         delimiter=",",
                         quotechar=quotechar)
     lines = []
     for line in reader:
         # logger.info(line)
         lines.append(line)
     # for test: check whether the model pipeline runs end to end
     # lines = lines[:32]
     return lines
Code example #5
def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        feature = convert_single_example(ex_index, example, label_list,
                                         max_seq_length, tokenizer)

        features.append(feature)
    return features
Code example #6
def get_last_checkpoint(model_path):
    if not os.path.exists(os.path.join(model_path, 'checkpoint')):
        logger.info('checkpoint file does not exist: {}'.format(
            os.path.join(model_path, 'checkpoint')))
        return None
    last = None
    with codecs.open(os.path.join(model_path, 'checkpoint'),
                     'r',
                     encoding='utf-8') as fd:
        for line in fd:
            line = line.strip().split(':')
            if len(line) != 2:
                continue
            if line[0] == 'model_checkpoint_path':
                last = line[1][2:-1]
                break
    return last
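A minimal sketch (with a hypothetical checkpoint entry, not taken from the source) of how the parsing above pulls the latest checkpoint name out of a line of TensorFlow's `checkpoint` file:

line = 'model_checkpoint_path: "model.ckpt-1000"'
parts = line.strip().split(':')
# parts == ['model_checkpoint_path', ' "model.ckpt-1000"']
last = parts[1][2:-1]  # drop the leading space and the surrounding quotes
print(last)            # model.ckpt-1000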
Code example #7
def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
    """Checks whether the casing config is consistent with the checkpoint name."""

    # The casing has to be passed in by the user and there is no explicit check
    # as to whether it matches the checkpoint. The casing information probably
    # should have been stored in the bert_config.json file, but it's not, so
    # we have to heuristically detect it to validate.

    if not init_checkpoint:
        logger.info("初始化的checkpoint为 None")
        return

    logger.info(init_checkpoint)
    # m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
    # logger.info(m.group())
    # if m is None:
    #     logger.info("invalid init_checkpoint: %s" % init_checkpoint)
    #     return

    # model_name = m.group(1)
    model_name = "chinese_L-12_H-768_A-12"
    lower_models = [
        "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
        "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
    ]

    cased_models = [
        "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
        "multi_cased_L-12_H-768_A-12"
    ]
    logger.info("初始化的checkpoint为 %s" % model_name)

    # check that the model and the do_lower_case flag are consistent
    is_bad_config = False
    if model_name in lower_models and not do_lower_case:
        is_bad_config = True
        actual_flag = "False"
        case_name = "lowercased"
        opposite_flag = "True"

    if model_name in cased_models and do_lower_case:
        is_bad_config = True
        actual_flag = "True"
        case_name = "cased"
        opposite_flag = "False"

    if is_bad_config:
        raise ValueError(
            "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
            "However, `%s` seems to be a %s model, so you "
            "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
            "how the model was pre-training. If this error is wrong, please "
            "just comment out this check." %
            (actual_flag, init_checkpoint, model_name, case_name,
             opposite_flag))
    logger.info("do_lower_case 与模型匹配")
Code example #8
def filed_based_convert_examples_to_features(examples,
                                             label_list,
                                             max_seq_length,
                                             tokenizer,
                                             output_file,
                                             output_dir,
                                             mode=None):
    """
    将数据转化为TF_Record 结构,作为模型数据输入
    :param examples:  样本
    :param label_list:标签list
    :param max_seq_length: 预先设定的最大序列长度
    :param tokenizer: tokenizer 对象
    :param output_file: tf.record 输出路径
    :param mode:
    :return:
    """
    writer = tf.python_io.TFRecordWriter(output_file)
    # iterate over the training examples
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
        # for each training example,
        feature = convert_single_example(ex_index, example, label_list,
                                         max_seq_length, tokenizer, output_dir,
                                         mode)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(
                value=list(values)))
            return f

        # build an ordered dict of features and write it into the record
        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_int_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["label_ids"] = create_int_feature(feature.label_ids)
        # features["label_mask"] = create_int_feature(feature.label_mask)

        # tf.train.Example/Feature is a protocol buffer format that makes serialization straightforward
        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()
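The records written above are read back by file_based_input_fn_builder (used in example #17). A minimal sketch of the decoding side (not the project's actual implementation, which presumably looks similar), assuming the same feature names and max_seq_length:

import tensorflow as tf

max_seq_length = 128  # must match the value used when writing

name_to_features = {
    "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
    "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
    "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
    "label_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
}

def _decode_record(record):
    # parse one serialized tf.train.Example back into a dict of int64 tensors
    return tf.parse_single_example(record, name_to_features)

dataset = tf.data.TFRecordDataset("train.tf_record").map(_decode_record)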
Code example #9
    def get_labels(self, labels=None):
        logger.info(self.labels)
        # assemble the final label set used for prediction; this part is messy and partly unnecessary
        if labels is not None:
            try:
                # labels can be read from a file
                if os.path.exists(labels) and os.path.isfile(labels):
                    with codecs.open(labels, 'r', encoding='utf-8') as fd:
                        for line in fd:
                            self.labels.append(line.strip())
                else:
                    # otherwise split the passed-in string on commas
                    self.labels = labels.split(',')
                self.labels = set(self.labels)  # to set
            except Exception as e:
                print(e)

        # deriving the labels by reading the train file carries some risk
        if os.path.exists(os.path.join(self.output_dir, 'label_list.pkl')):
            with codecs.open(os.path.join(self.output_dir, 'label_list.pkl'),
                             'rb') as rf:
                self.labels = pickle.load(rf)
                logger.info(self.labels)
        else:
            logger.info(self.labels)
            if len(self.labels) > 0:
                # X: for WordPiece suffixes of English words
                self.labels = self.labels.union({"X", "[CLS]", "[SEP]"})
                with codecs.open(
                        os.path.join(self.output_dir, 'label_list.pkl'),
                        'wb') as rf:
                    pickle.dump(self.labels, rf)
            else:
                raise Exception("输出的label存在问题")
        return self.labels
Code example #10
def file_based_convert_examples_to_features(examples, label_list,
                                            max_seq_length, tokenizer,
                                            output_file):
    """
    Convert the serialized InputFeatures into a TFRecord file.
    :param examples:
    :param label_list:
    :param max_seq_length:
    :param tokenizer:
    :param output_file:
    :return:
    """
    writer = tf.python_io.TFRecordWriter(output_file)
    logger.info("数据准备....")
    for (ex_index, example) in tqdm(enumerate(examples)):
        feature = convert_single_example(ex_index, example, label_list,
                                         max_seq_length, tokenizer)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(
                value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_int_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["label_ids"] = create_int_feature([feature.label_id])
        features["is_real_example"] = create_int_feature(
            [int(feature.is_real_example)])
        features["Q_mask"] = create_int_feature(feature.Q_mask)

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()
    logger.info("数据准备完成!")
Code example #11
def data_for_sentimental():
    # test: parse the sequence-labeling output on the test data and use it as model input
    columns = [
        "ID", "AspectTerms", "Opinions", "Polarities", "Categories", "Review"
    ]
    path = "zhejiang/data_ner/ner_res.xlsx"
    df = pd.read_excel(path)
    df = df[columns].fillna(value="_")
    df.to_csv("./zhejiang/data_sentimental/test.csv", index=False)
    print(df[:3])

    # train: extract the label/opinion columns of the training data as the sequence-labeling targets
    df = pd.read_csv(open("zhejiang/data_sentimental/Train_labels.csv",
                          encoding="utf-8"),
                     header=0)
    df = df[["id", "AspectTerms", "OpinionTerms", "Polarities", "Categories"]]
    sentiment_ids = collections.OrderedDict()
    for index, senti in enumerate(set(df["Polarities"].values)):
        sentiment_ids[senti] = index
    logger.info(sentiment_ids)
    pd.Series(sentiment_ids).to_csv(
        "zhejiang/data_sentimental/sentiment_ids.csv")
    df["Polarities"] = df["Polarities"].apply(
        lambda x: sentiment_ids[x.strip()])
    df.columns = columns[:-1]
    print(df[:3])
    # attach the review text to the training data
    df_review = pd.read_csv(open("zhejiang/data_sentimental/Train_reviews.csv",
                                 encoding="utf8"),
                            header=0,
                            index_col=["id"],
                            dtype=str)
    # print(df_review[:3])
    f = lambda x: " ".join(list(sentence_clean(x)))
    df_review["Reviews"] = df_review["Reviews"].apply(f).values
    tmp = [df_review.loc[id]["Reviews"] for id in df["ID"].values]
    # logger.info(tmp)
    df["Review"] = tmp
    print(df_review[:3])

    indexes = list(range(len(df)))
    random.shuffle(indexes)
    df = df.iloc[indexes]
    num_row = len(df)
    split_index = int(num_row * 0.2)
    df_dev = df[:split_index]
    df_train = df[split_index:]
    logger.info(len(df_dev))
    logger.info(len(df_train))

    df_train.to_csv("zhejiang/data_sentimental/train.csv",
                    encoding="utf-8",
                    index=False)
    df_dev.to_csv("zhejiang/data_sentimental/dev.csv",
                  encoding="utf-8",
                  index=False)
Code example #12
    def _read_data(self, input_file, data_type=None):
        # overrides the parent-class method
        """Reads a BIO data."""
        logger.info(input_file)
        with codecs.open(input_file, 'r', encoding='utf-8') as f:
            lines = []
            words = []
            labels = []
            for line in f:
                contends = line.strip()
                tokens = contends.split('\t')
                if len(contends) != 0:
                    if len(tokens) == 2:
                        # train, dev, or test data that comes with gold labels
                        words.append(tokens[0])
                        labels.append(tokens[1])
                        self.labels.add(tokens[1])
                    elif (len(tokens) == 1) and (data_type == "test"):
                        # test data without gold labels
                        words.append(tokens[0])
                        labels.append("O")  # 仅仅为了填充下
                    else:
                        logger.info(line)
                        logger.info(tokens)
                        raise Exception("数据样本准备错误")

                else:
                    # a sentence is complete; join its labels and tokens with spaces
                    l = ' '.join(
                        [label for label in labels if len(label) > 0])
                    w = ' '.join([word for word in words if len(word) > 0])
                    lines.append([l, w])
                    words = []
                    labels = []
                    continue
            # each sample is stored in the list as a [label_string, word_string] pair
            logger.info(lines)
            return lines
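An illustration (hypothetical tokens and labels, not from the source) of the BIO input this reader expects, one `character<TAB>label` pair per line with a blank line between sentences, and the `[label_string, word_string]` pair it appends to `lines`:

# input, e.g. a fragment of train.txt produced by data_for_squence2 (example #18):
#   手   B_at_0
#   机   I_at_0
#   很   O
#   好   B_ot_0_f
#
# parsed result appended to `lines`:
#   ['B_at_0 I_at_0 O B_ot_0_f', '手 机 很 好']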
Code example #13
def count_category(output_file, data_dir):
    df_labels = pd.read_csv(output_file,
                            encoding="utf-8",
                            delimiter=",",
                            header=0)
    cates = df_labels.Categories.values
    logger.info(Counter(cates))
    cates_ids = collections.OrderedDict()
    for v, k in enumerate(sorted(list(set(cates)))):
        cates_ids[k] = v
    logger.info(cates)
    pd.Series(cates_ids).to_csv(data_dir + "/category_ids.csv")
    logger.info(cates_ids)
    return cates_ids
Code example #14
def count_train_data(file):
    df = pd.read_csv(file, encoding="utf-8", delimiter=",", header=0)
    logger.info(df.columns)
    reviews = df["Reviews"].values
    logger.info(len(reviews))
    words = []
    for line in reviews:
        line = line.strip()
        if line:
            logger.info(line)
            line = re.sub("[a-zA-Z]+", "@", line)
            line = re.sub("\d+", "&", line)
            line = re.sub("\s|\.", "", line)
            words.extend(list(line))
    return Counter(words)
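A small illustration (hypothetical review text, not from the source) of the normalization above: letter runs collapse to '@', digit runs to '&', and whitespace and dots are dropped before the characters are counted:

import re
from collections import Counter

line = "iPhone 11 很好用."
line = re.sub("[a-zA-Z]+", "@", line)  # '@ 11 很好用.'
line = re.sub(r"\d+", "&", line)       # '@ & 很好用.'
line = re.sub(r"\s|\.", "", line)      # '@&很好用'
print(Counter(list(line)))             # each remaining character counted once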
Code example #15
def count_predcited_aspect_opinion():
    file = "./output/label_test.txt"
    with open(file, encoding="utf-8", mode="r") as file:
        ots = []
        ats = []
        flag = None
        word = ""
        for line in file.readlines():
            # logger.info(line)
            line = line.strip()
            items = re.split(r"\s+", line)
            if len(items) != 3:
                if word:
                    if flag == "at":
                        ats.append(word)
                        logger.info(word)
                    elif flag == "ot":
                        ots.append(word)
                        logger.info(word)
                    else:
                        raise Exception("bug")
                flag = None
                word = ""
                continue
            # logger.info(items)
            # look for target terms
            if line.endswith("B_at"):
                # flush the previous term
                if word:
                    if flag == "at":
                        ats.append(word)
                        logger.info(word)
                    elif flag == "ot":
                        ots.append(word)
                        logger.info(word)
                    else:
                        raise Exception("bug")
                # start a new term
                flag = "at"
                word = items[0]
                continue
            if word and flag == "at" and line.endswith("I_at"):
                # only continue if it immediately follows
                word += items[0]
                continue
            if word and flag == "at" and not line.endswith("I_at"):
                ats.append(word)
                logger.info(word)
                flag = None
                word = ""

            if line.endswith("B_ot"):
                # flush the previous term
                if word:
                    if flag == "at":
                        ats.append(word)
                        logger.info(word)
                    elif flag == "ot":
                        ots.append(word)
                        logger.info(word)
                    else:
                        raise Exception("bug")
                # start a new term
                flag = "ot"
                word = items[0]
                continue
            if word and flag == "ot" and line.endswith("I_ot"):
                # only continue if it immediately follows
                word += items[0]
                continue
            if word and flag == "ot" and not line.endswith("I_ot"):
                flag = None
                ots.append(word)
                logger.info(word)
                word = ""
        logger.info(ats)
        logger.info(ots)
        logger.info(Counter(ats))
        logger.info(Counter(ots))
Code example #16
 def get_dev_examples(self, data_dir):
     logger.info("get_dev_examples...>>>...")
     return self._create_example(
         self._read_data(os.path.join(data_dir, "dev.txt")), "dev")
Code example #17
def train(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map

    processors = {"ner": NerProcessor}
    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # only delete the previous run's output files when re-training; do not clean when predicting
    if args.clean and args.do_train:
        if os.path.exists(args.output_dir):

            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files in the output dir and data.conf')
                exit(-1)

    # check output dir exists
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    processor = processors[args.ner](args.output_dir)
    logger.info(args.data_dir)

    # load the vocabulary
    tokenizer = FullTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)

    run_config = tf.estimator.RunConfig(
        model_dir=args.output_dir,
        save_summary_steps=500,  # hard-coded here, so the setting defined earlier has no effect
        save_checkpoints_steps=500,
        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if args.do_train and args.do_eval:
        # load the training data
        train_examples = processor.get_train_examples(args.data_dir)
        logger.info(len(train_examples))
        num_train_steps = int(
            len(train_examples) * 1.0 / args.batch_size *
            args.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is too small...')
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        # load the evaluation (dev) data
        eval_examples = processor.get_dev_examples(args.data_dir)

        # log information about the dev set
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.batch_size)

    # labels = ["B_at", "I_at", "B_ot", "I_ot", "O"]
    # label_list = processor.get_labels(labels)
    label_list = processor.get_labels()
    # The returned model_fn is a function that defines the model together with its training and
    # evaluation logic, and uses the assignment-map hook to initialize this model's parameters
    # from the pre-trained BERT checkpoint.
    # This is the newer TF pattern: define the model in model_fn, then let the Estimator API drive
    # training, prediction and evaluation.
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=args.init_checkpoint,
                                learning_rate=args.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)

    params = {'batch_size': args.batch_size}

    # how should dropout differ between training and eval/predict, and how is that implemented?
    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)

    if args.do_train and args.do_eval:
        # 1. convert the data into TFRecord files
        train_file = os.path.join(args.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(train_examples,
                                                     label_list,
                                                     args.max_seq_length,
                                                     tokenizer, train_file,
                                                     args.output_dir)
        # 2. read the TFRecord training data and assemble batches
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)
        # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

        # TFRecord file for eval
        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, eval_file,
                                                     args.output_dir)
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # train and eval together
        # early stop hook
        early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name='loss',  # stop early when the loss stops decreasing; why not the dev loss?
            max_steps_without_decrease=num_train_steps,  # set to the full number of training steps
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=args.save_checkpoints_steps)

        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])

        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)

        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    if args.do_predict:
        token_path = os.path.join(args.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'),
                         'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(args.data_dir)
        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples,
                                                 label_list,
                                                 args.max_seq_length,
                                                 tokenizer,
                                                 predict_file,
                                                 args.output_dir,
                                                 mode="test")

        logger.info("***** Running prediction*****")
        logger.info("  Num examples = %d", len(predict_examples))
        logger.info("  Batch size = %d", args.batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        # no Viterbi decoding is done here; how are the sequence-labeling results obtained?
        result = estimator.predict(input_fn=predict_input_fn)
        logger.info(result)
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    logger.info(predict_line.text)
                    logger.info(predict_line.label)
                    break
                for id in prediction:
                    if idx >= len_seq:
                        break
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]', 'X']:
                        continue
                    try:
                        line += line_token[idx] + ' ' + label_token[
                            idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        logger.info(e)
                        logger.info(predict_line.text)
                        logger.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)

        eval_result = return_report(output_predict_file)
        print(''.join(eval_result))

        # write the results to a file
        with codecs.open(os.path.join(args.output_dir, 'predict_score.txt'),
                         'a',
                         encoding='utf-8') as fd:
            fd.write(''.join(eval_result))

    # filter model
    if args.filter_adam_var:
        adam_filter(args.output_dir)
Code example #18
def data_for_squence2(input_file,
                      output_file=None,
                      data_dir="zhejiang/data_ner"):
    """
    Data preparation for NER.
    :param input_file:
    :param output_file:
    :return:
    """
    max_len = 0
    df_reviews = pd.read_csv(input_file,
                             encoding="utf-8",
                             delimiter=",",
                             header=0)
    reviews = df_reviews["Reviews"].values

    # clean the sentences
    sentences = list(map(sentence_clean, reviews))
    df_reviews["Reviews"] = sentences

    # build the sequence-labeling text
    if not output_file:
        # character-level serialization: test data
        sentences = list(map(list, sentences))
        f = lambda list_words: "\n".join(list_words)
        sentences = list(map(f, sentences))
        max_len = max([len(v) for v in sentences])

        with open(data_dir + "/test.txt", mode="w", encoding="utf-8") as file:
            file.write("\n\n".join(sentences))
            file.close()
    else:
        df_labels = pd.read_csv(output_file,
                                encoding="utf-8",
                                delimiter=",",
                                header=0)
        cates_id = count_category(output_file, data_dir)
        logger.info(Counter(df_labels["Categories"].values))
        # print(df_reviews.info())
        # print(df_labels.info())
        logger.info(cates_id)
        text = ""
        cols_name = "AspectTerms,A_start,OpinionTerms,O_start,Categories".split(
            ",")
        for col_id, col_review in tqdm(df_reviews[["id", "Reviews"]].values):
            # logger.info(col_id)
            # logger.info(col_review)
            col_id_df = df_labels.loc[df_labels.id == col_id]
            # logger.info(col_id_df)

            col_review = list(col_review)

            if len(col_review) > max_len:
                max_len = len(col_review)

            col_review_label = " ".join(col_review)  # 用空格进行分开
            logger
            # print(cols_name)
            for AspectTerms, A_start, OpinionTerms, O_start, Categories in col_id_df[
                    cols_name].values:
                cate_id = cates_id.get(Categories)
                # logger.info(AspectTerms)
                # logger.info(OpinionTerms)
                if AspectTerms != "_":
                    suffix = "at_%d" % cate_id
                    A_replaced = "B" + "I" * (len(AspectTerms) - 1)
                    A_replaced = " ".join(
                        [v + "_" + suffix for v in A_replaced])
                    col_review_label = col_review_label.replace(
                        " ".join(list(AspectTerms)), A_replaced)
                    # logger.info(col_review_label)

                if OpinionTerms != "_":
                    obf = "m"  # 修饰自身
                    try:
                        A_start = int(A_start)
                        O_start = int(O_start)
                        if A_start < O_start:
                            obf = "f"  # 修饰前面aspect
                        else:
                            obf = "b"  # 修饰后面aspect
                    except:
                        pass
                    suffix = "ot_%d_%s" % (cate_id, obf)
                    # logger.info(suffix)
                    O_replaced = "B" + "I" * (len(OpinionTerms) - 1)
                    O_replaced = " ".join(
                        [v + "_" + suffix for v in O_replaced])
                    # logger.info(O_replaced)
                    col_review_label = col_review_label.replace(
                        " ".join(list(OpinionTerms)), O_replaced)
                    # logger.info(col_review_label)

            col_review_label = col_review_label.split(" ")
            # logger.info(col_review)
            # logger.info(col_review_label)

            try:
                assert (len(col_review_label) == len(col_review))
                # already handled elsewhere
                # col_review = ["[CLS]"] + col_review
                # col_review_label = ["C"] + col_review_label
                tmp = []
                for k, v in zip(col_review, col_review_label):
                    v = v if v != k else "O"
                    tmp.append(v)
                    # print(k, v)
                    text += k + "\t" + v + "\n"
                text += "\n"

                if sum([v == "O" for v in tmp]) == len(tmp):
                    raise Exception

            except:
                logger.info(col_review)
                logger.info(col_review_label)
                # continue
                logger.info("数据存在问题")
                break

        text = text.strip().split("\n\n")
        random.shuffle(text)

        num_doc = len(text)
        split_index = int(num_doc * 0.2)

        text_dev = text[:split_index]
        text_train = text[split_index:]

        logger.info(len(text_dev))
        logger.info(len(text_train))

        with open(data_dir + "/dev.txt", mode="w", encoding="utf-8") as file:
            file.write("\n\n".join(text_dev))
            file.close()

        with open(data_dir + "/train.txt", mode="w", encoding="utf-8") as file:
            file.write("\n\n".join(text_train))
            file.close()
    return max_len
Code example #19
    def model_fn(features, labels, mode, params):
        """features: 字典
        mode:代表 train dev test
        输入固定的吗?必须这么定义模型的输入函数"""
        logger.info("*** Features ***")
        for name in sorted(features.keys()):
            logger.info("  name = %s, shape = %s" %
                        (name, features[name].shape))
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        # logger.info('shape of input_ids', input_ids.shape)
        # label_mask = features["label_mask"]
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        # build the model from the inputs: input_ids is the id representation of the sample,
        # label_ids is the id representation of the labels; trans is the CRF transition matrix
        total_loss, logits, trans, pred_ids = create_model(
            bert_config, is_training, input_ids, input_mask, segment_ids,
            label_ids, num_labels, False, args.dropout_rate, args.lstm_size,
            args.cell, args.num_layers)

        # load the BERT model: parameters present in the pre-trained checkpoint are loaded;
        # the newly added CRF parameters are not
        tvars = tf.trainable_variables()  # all trainable variables
        initialized_variable_names = {}  # so the logging below also works when no checkpoint is given
        if init_checkpoint:
            # collect the parameters that can be loaded
            (assignment_map,
             initialized_variable_names) = get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            # initialize those parameters from the checkpoint
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        # log which variables were loaded from the checkpoint
        logger.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                # marks variables initialized from the checkpoint; others are trained from scratch
                init_string = ", *INIT_FROM_CKPT*"
            logger.info("  name = %s, shape = %s%s", var.name, var.shape,
                        init_string)

        if mode == tf.estimator.ModeKeys.TRAIN:
            # define the optimizer for training
            # train_op = optimizer.optimizer(total_loss, learning_rate, num_train_steps)
            train_op = create_optimizer(total_loss, learning_rate,
                                        num_train_steps, num_warmup_steps,
                                        False)
            hook_dict = {}
            hook_dict['loss'] = total_loss
            hook_dict['global_steps'] = tf.train.get_or_create_global_step()
            logging_hook = tf.train.LoggingTensorHook(
                hook_dict, every_n_iter=args.save_summary_steps)

            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                training_hooks=[logging_hook])

        elif mode == tf.estimator.ModeKeys.EVAL:
            # dev
            # modified for NER
            def metric_fn(label_ids, pred_ids):
                return {
                    # why mean squared error here?
                    "eval_loss":
                    tf.metrics.mean_squared_error(labels=label_ids,
                                                  predictions=pred_ids),
                }

            eval_metrics = metric_fn(label_ids, pred_ids)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode, loss=total_loss, eval_metric_ops=eval_metrics)

        else:
            # test
            output_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                     predictions=pred_ids)
        return output_spec  # the EstimatorSpec for the current mode
Code example #20
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
    """Looks up words embeddings for id tensor.

    Args:
      input_ids: int32 Tensor of shape [batch_size, seq_length] containing word ids.
      vocab_size: int. Size of the embedding vocabulary.
      embedding_size: int. Width of the word embeddings.
      initializer_range: float. Embedding initialization range, used to bound the embedding initializer.
      word_embedding_name: string. Name of the embedding table.
      use_one_hot_embeddings: bool. If True, use one-hot method for word embeddings. If False, use `tf.gather()`.

    Returns:
      float Tensor of shape [batch_size, seq_length, embedding_size].
    """
    # This function assumes that the input is of shape [batch_size, seq_length, num_inputs].  ???
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape to [batch_size, seq_length, 1].
    if input_ids.shape.ndims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])
    logger.info(get_shape_list(input_ids))
    # initialize the embedding values  # the linear layer in front of Q, K, V
    embedding_table = tf.get_variable(
        name=word_embedding_name, shape=[vocab_size, embedding_size], initializer=create_initializer(initializer_range))
    logger.info(get_shape_list(embedding_table))
    # flatten the multi-dimensional ids into a single dimension
    flat_input_ids = tf.reshape(input_ids, [-1])
    logger.info(get_shape_list(flat_input_ids))
    if use_one_hot_embeddings:
        one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
        logger.info(get_shape_list(one_hot_input_ids))
        # table lookup via one-hot matmul
        output = tf.matmul(one_hot_input_ids, embedding_table)
    else:
        # gather the embedding rows by index
        output = tf.gather(embedding_table, flat_input_ids)

    input_shape = get_shape_list(input_ids)
    logger.info(input_shape)
    logger.info(input_shape[0:-1] + [input_shape[-1] * embedding_size])
    # the extra last dimension is folded into the embedding dimension here
    output = tf.reshape(output, input_shape[0:-1] + [input_shape[-1] * embedding_size])
    return output, embedding_table
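A toy numpy sketch (not from the source) of the `tf.gather` branch above: each id selects a row of the embedding table, and the flat result is reshaped back to `[batch_size, seq_length, embedding_size]`:

import numpy as np

vocab_size, embedding_size = 5, 3
embedding_table = np.arange(vocab_size * embedding_size, dtype=np.float32).reshape(
    vocab_size, embedding_size)

input_ids = np.array([[1, 4], [0, 2]])         # [batch_size=2, seq_length=2]
flat_input_ids = input_ids.reshape(-1)         # [4]
output = embedding_table[flat_input_ids]       # gather -> [4, 3]
output = output.reshape(2, 2, embedding_size)  # [batch_size, seq_length, embedding_size]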
Code example #21
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer, output_dir, mode):
    """
    Analyze one example: convert its characters to ids and its labels to ids, then pack them into an InputFeatures object.
    :param ex_index: index
    :param example: one sample
    :param label_list: list of labels
    :param max_seq_length:
    :param tokenizer:
    :param output_dir
    :param mode:
    :return:
    """
    label_map = {}
    # start label indexing from 1
    for (i, label) in enumerate(label_list, 1):
        label_map[label] = i
    # logger.info(label_map)
    # save the label->index map
    if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')):
        with codecs.open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as w:
            pickle.dump(label_map, w)

    textlist = example.text.split(' ')
    labellist = example.label.split(' ')  # split on the spaces used earlier during parsing

    tokens = []
    labels = []
    for i, word in enumerate(textlist):
        # tokenize: for Chinese this is character-level, but characters missing from BERT's vocab.txt (e.g. Chinese quotation marks) get WordPiece-processed; all of this could be replaced with list(input)
        token = tokenizer.tokenize(word)
        # token = [word]  # skip WordPiece here
        tokens.extend(token)
        label_1 = labellist[i]
        for m in range(len(token)):
            # label WordPiece suffix pieces with X; this is needed when decoding the sequence back
            if m == 0:
                labels.append(label_1)
            else:
                labels.append("X")

    # tokens = tokenizer.tokenize(example.text)
    # truncate the sequence
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]  # -2 because the sequence needs a start and an end marker
        labels = labels[0:(max_seq_length - 2)]

    # input features: two parallel sequences, the token sequence and the segment ids
    ntokens = []
    segment_ids = []
    # output labels
    label_ids = []
    # prepend the sentence-start marker to each sequence
    ntokens.append("[CLS]")  # [CLS] marks the start of the sentence
    segment_ids.append(0)
    # append("O") or append("[CLS]") not sure!
    # [CLS] gets the [CLS] label
    label_ids.append(
        label_map["[CLS]"]
    )  # O or [CLS] makes no real difference; O would reduce the number of labels, but using distinct start/end markers also works fine

    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])

    # append the sentence-end marker
    ntokens.append("[SEP]")  # [SEP] marks the end of the sentence
    segment_ids.append(0)
    # append("O") or append("[SEP]") not sure!
    label_ids.append(label_map["[SEP]"])

    # map the input tokens to their vocabulary ids
    # logger.info(ntokens)
    input_ids = tokenizer.convert_tokens_to_ids(
        ntokens)  # convert the characters in ntokens to id form
    input_mask = [1] * len(input_ids)  # 1 marks positions with real input; padded positions get 0
    # label_mask = [1] * len(input_ids)
    # padding: pad with 0 up to max_seq_length
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        # we don't concerned about it!
        label_ids.append(0)
        ntokens.append("**NULL**")
        # label_mask.append(0)
    # print(len(input_ids))
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    # assert len(label_mask) == max_seq_length

    # log a few sample examples
    if ex_index < 3:
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        # logger.info("tokens: %s" % " ".join([printable_text(x) for x in tokens]))
        logger.info("tokens: %s" % " ".join(ntokens))
        logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logger.info("segment_ids: %s" % " ".join([str(x)
                                                  for x in segment_ids]))
        logger.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
        # logger.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))

    # pack everything into an InputFeatures object
    feature = InputFeatures(
        input_ids=input_ids,  # when decoding the sequence labels, parsing starts from the 2nd position
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
        # label_mask = label_mask
    )
    # only takes effect when mode == 'test'
    write_tokens(ntokens, output_dir, mode)
    return feature
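A hypothetical illustration (not from the source) of the alignment the function builds, assuming the tokenizer splits "playing" into "play" + "##ing"; WordPiece suffix pieces receive the X label, and [CLS]/[SEP] get their own labels:

# textlist  = ["playing", "football"]
# labellist = ["B_at",    "O"]
# tokens    = ["play", "##ing", "football"]
# labels    = ["B_at", "X",     "O"]
# ntokens   = ["[CLS]", "play", "##ing", "football", "[SEP]"]
# label_ids = [label_map[l] for l in ["[CLS]", "B_at", "X", "O", "[SEP]"]] + padding zeros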
Code example #22
 def get_test_examples(self, data_dir):
     logger.info("get_test_examples...>>>...")
     return self._create_example(
         self._read_data(os.path.join(data_dir, "test.txt"),
                         data_type="test"), "test")
Code example #23
    def __init__(
        self,
        sequence1,
        sequence2,
        num_classes=3,
        d_a_size=3,
        r_size=4,
        fc_size=10,
    ):
        # Placeholders for input, output and dropout
        self._length, sequence_length, hidden_size = get_shape_list(sequence1)
        logger.info(get_shape_list(sequence1))
        initializer = tf.contrib.layers.xavier_initializer()
        self.output_fw = sequence1
        self.output_bw = sequence2
        self.H = tf.concat([self.output_fw, self.output_bw], axis=2)
        logger.info(get_shape_list(self.H))

        H_reshape = tf.reshape(self.H, [-1, 2 * hidden_size])
        logger.info(get_shape_list(H_reshape))

        with tf.variable_scope("self-attention"):
            with tf.variable_scope("attention_A"):
                self.W_s1 = tf.get_variable("W_s1",
                                            shape=[2 * hidden_size, d_a_size],
                                            initializer=initializer)
                _H_s1 = tf.nn.tanh(tf.matmul(H_reshape, self.W_s1))
                self.W_s2 = tf.get_variable("W_s2",
                                            shape=[d_a_size, r_size],
                                            initializer=initializer)
                _H_s2 = tf.matmul(_H_s1, self.W_s2)
                _H_s2_reshape = tf.transpose(
                    tf.reshape(_H_s2, [-1, sequence_length, r_size]),
                    [0, 2, 1])
                self.A = tf.nn.softmax(_H_s2_reshape, name="attention")

            with tf.variable_scope("sentence-embedding"):
                self.M = tf.matmul(self.A, self.H)

            with tf.variable_scope("fully-connected"):
                # self.M_pool = tf.reduce_mean(self.M, axis=1)
                # W_fc = tf.get_variable("W_fc", shape=[2 * hidden_size, fc_size], initializer=initializer)
                self.M_flat = tf.reshape(self.M,
                                         shape=[-1, 2 * hidden_size * r_size])
                W_fc = tf.get_variable(
                    "W_fc",
                    shape=[2 * hidden_size * r_size, fc_size],
                    initializer=initializer)
                b_fc = tf.Variable(tf.constant(0.1, shape=[fc_size]),
                                   name="b_fc")
                self.fc = tf.nn.relu(tf.nn.xw_plus_b(self.M_flat, W_fc, b_fc),
                                     name="fc")

            with tf.variable_scope("output"):
                W_output = tf.get_variable("W_output",
                                           shape=[fc_size, num_classes],
                                           initializer=initializer)
                b_output = tf.Variable(tf.constant(0.1, shape=[num_classes]),
                                       name="b_output")
                self.logits = tf.nn.xw_plus_b(self.fc,
                                              W_output,
                                              b_output,
                                              name="logits")
                self.predictions = tf.argmax(self.logits,
                                             1,
                                             name="predictions")

            with tf.variable_scope("penalization"):
                self.AA_T = tf.matmul(self.A, tf.transpose(self.A, [0, 2, 1]))
                self.I = tf.reshape(
                    tf.tile(tf.eye(r_size), [tf.shape(self.A)[0], 1]),
                    [-1, r_size, r_size])
                self.P = tf.square(
                    tf.norm(self.AA_T - self.I, axis=[-2, -1], ord="fro"))
Code example #24
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        # features: dict of input tensors
        """The `model_fn` for TPUEstimator."""
        logger.info(
            "******************************* Features ***********************************"
        )
        for name in sorted(features.keys()):
            logger.info("  name = %s, shape = %s" %
                        (name, features[name].shape))

        # fetch the corresponding input tensors
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]
        Q_mask = features["Q_mask"]
        is_real_example = None
        if "is_real_example" in features:
            is_real_example = tf.cast(features["is_real_example"],
                                      dtype=tf.float32)
        else:
            is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        # build the model
        logger.info(Q_mask)
        total_loss, per_example_loss, logits, probabilities = create_model(
            bert_config,
            is_training,
            input_ids,
            input_mask,
            segment_ids,
            label_ids,
            num_labels,
            use_one_hot_embeddings,
            Q_mask=Q_mask)

        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            # load the pre-trained model parameters
            assignment_map, initialized_variable_names = modeling.get_assignment_map_from_checkpoint(
                tvars, init_checkpoint)

            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold

            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        # log which variables were initialized from the pre-trained checkpoint
        logger.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                # marks variables initialized from the checkpoint; others are trained from scratch
                init_string = ", *INIT_FROM_CKPT*"
            logger.info("  name = %s, shape = %s%s", var.name, var.shape,
                        init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            # training
            train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                     num_train_steps,
                                                     num_warmup_steps, use_tpu)

            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn)

        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, label_ids, logits,
                          is_real_example):
                # compute eval accuracy and the mean loss
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                accuracy = tf.metrics.accuracy(labels=label_ids,
                                               predictions=predictions,
                                               weights=is_real_example)
                loss = tf.metrics.mean(values=per_example_loss,
                                       weights=is_real_example)
                return {
                    "eval_accuracy": accuracy,
                    "eval_loss": loss,
                }

            eval_metrics = (metric_fn, [
                per_example_loss, label_ids, logits, is_real_example
            ])
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn)
        else:
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                predictions={"probabilities": probabilities},
                scaffold_fn=scaffold_fn)
        return output_spec
Code example #25
def create_model(
        bert_config,
        is_training,
        input_ids,
        input_mask,
        segment_ids,
        labels,
        num_labels,
        use_one_hot_embeddings,
        Q_mask,
        p_coef=0.004  # penalty coefficient for the attention mechanism
):
    """Creates a classification model."""
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings,
                               Q_mask=Q_mask)

    # In the demo, we are doing a simple classification task on the entire segment.
    #
    # If you want to use the token-level output, use model.get_sequence_output()
    # instead.

    output_layer = model.get_sequence_output()  # final hidden-layer output
    logger.info(modeling.get_shape_list(output_layer))
    output_layer2 = model.get_sequence_output2()
    logger.info(modeling.get_shape_list(output_layer2))

    if is_training:
        # I.e., 0.1 dropout
        output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
        output_layer2 = tf.nn.dropout(output_layer2, keep_prob=0.9)

    # add a self-attention output layer on top
    # d_a is a hyperparameter controlling the capacity of the attention network, similar to the number of layers
    # fc_size is the width of the fully connected layer
    # r_size is the number of attention hops used for the sentence embedding
    aspect_opinion_attention = SelfAttention2(output_layer,
                                              output_layer2,
                                              num_classes=num_labels,
                                              d_a_size=64,
                                              r_size=10,
                                              fc_size=10)
    logits, panel = aspect_opinion_attention.get_output()

    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    # per-example cross-entropy
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    # mean loss plus the attention penalty
    loss = tf.reduce_mean(per_example_loss) + tf.reduce_mean(panel * p_coef)

    with tf.Session() as sess:
        tf.summary.FileWriter("logs/run_classifier/", sess.graph)
        logger.info(
            "=======================save graph===========================")
        return loss, per_example_loss, logits, probabilities
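Restating the loss assembled above (a sketch consistent with this function and with SelfAttention2 in example #23): the mean per-example cross-entropy plus a weighted self-attention penalty.

# per_example_loss_i = -sum_c onehot(label_i)[c] * log softmax(logits_i)[c]
# P                  = || A A^T - I ||_F^2      (penalization term from example #23)
# loss               = mean_i(per_example_loss_i) + p_coef * mean(P)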
Code example #26
def data_for_squence(input_file, output_file=None):
    df_reviews = pd.read_csv(input_file,
                             encoding="utf-8",
                             delimiter=",",
                             header=0)
    reviews = df_reviews["Reviews"].values

    # clean the sentences
    sentences = list(map(sentence_clean, reviews))
    df_reviews["Reviews"] = sentences

    # build the sequence-labeling text
    if not output_file:
        # character-level serialization: test data
        sentences = list(map(list, sentences))
        f = lambda list_words: "\n".join(list_words)
        sentences = list(map(f, sentences))
        with open(r"D:\projects_py\bert\zhejiang\data\test.txt",
                  mode="w",
                  encoding="utf-8") as file:
            file.write("\n\n".join(sentences))
            file.close()
    else:
        df_labels = pd.read_csv(output_file,
                                encoding="utf-8",
                                delimiter=",",
                                header=0)
        logger.info(Counter(df_labels["Categories"].values))
        # print(df_reviews.info())
        # print(df_labels.info())
        text = ""
        for col_id, col_review in tqdm(df_reviews[["id", "Reviews"]].values):
            # logger.info(col_id)
            # logger.info(col_review)
            col_id_df = df_labels.loc[df_labels.id == col_id]
            # print(col_id_df)

            col_id_aspects = [
                v for v in col_id_df["AspectTerms"].values if "_" != v
            ]
            # logger.info(col_id_aspects)
            col_review_label = col_review
            for v in col_id_aspects:
                # logger.info(v)
                if v:
                    v_replaced = "[B_at]" + "[I_at]" * (len(v) - 1)
                    col_review_label = re.sub(v, v_replaced, col_review_label,
                                              1)
                # logger.info(col_review_label)

            col_id_opinions = [
                v for v in col_id_df["OpinionTerms"].values if "_" != v
            ]
            for v in col_id_opinions:
                if v:
                    v_replaced = "[B_ot]" + "[I_ot]" * (len(v) - 1)
                    col_review_label = re.sub(re.escape(v), v_replaced,
                                              col_review_label, 1)

            # logger.info(col_review_label)
            tmp = [v for v in re.split(r"\]|\[", col_review_label) if v]
            # logger.info(tmp)

            col_review_label = [[v] if v.endswith("t") else list(v)
                                for v in tmp]
            # logger.info(col_review_label)
            # logger.info(col_review)
            tmp = []
            for v in col_review_label:
                tmp.extend(v)
            col_review_label = tmp
            # logger.info(tmp)

            col_review = list(col_review)
            logger.info(col_review)
            logger.info(col_review_label)
            try:
                assert (len(col_review_label) == len(col_review))
                # [CLS] prepending is already handled elsewhere
                # col_review = ["[CLS]"] + col_review
                # col_review_label = ["C"] + col_review_label
                for k, v in zip(col_review, col_review_label):
                    v = v if v != k else "O"
                    print(k, v)
                    text += k + "\t" + v + "\n"
                text += "\n"

            except:
                logger.info(col_review)
                logger.info(col_review_label)
                # continue
                break

        text = text.strip().split("\n\n")
        random.shuffle(text)

        num_doc = len(text)
        split_index = int(num_doc * 0.2)

        text_dev = text[:split_index]
        text_train = text[split_index:]

        logger.info(len(text_dev))
        logger.info(len(text_train))

        with open(r"D:\projects_py\bert\zhejiang\data\dev.txt",
                  mode="w",
                  encoding="utf-8") as file:
            file.write("\n\n".join(text_dev))

        with open(r"D:\projects_py\bert\zhejiang\data\train.txt",
                  mode="w",
                  encoding="utf-8") as file:
            file.write("\n\n".join(text_train))
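The files written above contain one character per line with its tag separated by a tab, and a blank line between reviews. A small check of that format (path taken from the code above, output illustrative):

# Read back the "char<TAB>tag" training file and print the first few pairs.
with open(r"D:\projects_py\bert\zhejiang\data\train.txt", encoding="utf-8") as f:
    first_review = f.read().strip().split("\n\n")[0]
for line in first_review.split("\n")[:5]:
    char, tag = line.split("\t")  # tag is one of O, B_at, I_at, B_ot, I_ot
    print(char, tag)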
コード例 #27
0
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
    """
    将一个InputExample 计算序列化 转化为 InputExample
    :param ex_index:
    :param example:
    :param label_list:
    :param max_seq_length:
    :param tokenizer:
    :return:
    """

    if isinstance(example, PaddingInputExample):
        # the last batch may be short, so it is padded with empty examples
        return InputFeatures(input_ids=[0] * max_seq_length,
                             input_mask=[0] * max_seq_length,
                             segment_ids=[0] * max_seq_length,
                             label_id=0,
                             Q_mask=[0] * max_seq_length,
                             is_real_example=False)

    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)
    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # build Q_mask: mark aspect and opinion tokens with 1
    Q_mask = "\t".join(tokens)
    if example.aspect:
        aspect = "\t".join(list(example.aspect))
        Q_mask = Q_mask.replace(aspect,
                                "\t".join(["mask"] * len(example.aspect)))

    if example.opinion:
        opinion = "\t".join(list(example.opinion))
        Q_mask = Q_mask.replace(opinion,
                                "\t".join(["mask"] * len(example.opinion)))
    Q_mask = [1 if v == "mask" else 0 for v in Q_mask.split("\t")]

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        Q_mask.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(Q_mask) == max_seq_length

    label_id = label_map[example.label]
    if ex_index < 3:
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("tokens: %s" %
                    " ".join([tokenization.printable_text(x) for x in tokens]))
        logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logger.info("Q_mask: %s" % " ".join([str(x) for x in Q_mask]))
        logger.info("segment_ids: %s" % " ".join([str(x)
                                                  for x in segment_ids]))
        logger.info("label: %s (id = %d)" % (example.label, label_id))

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            Q_mask=Q_mask,
                            is_real_example=True)
    return feature
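A minimal usage sketch for the function above. The vocab path, the label names and the assumption that this repository's InputExample accepts aspect/opinion keyword arguments are placeholders, not taken from the source:

# Hypothetical call (placeholder vocab path, labels and InputExample fields).
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
example = InputExample(guid="demo-0", text_a="价格很便宜", text_b=None,
                       aspect="价格", opinion="很便宜", label="正面")
feature = convert_single_example(0, example, label_list=["负面", "中性", "正面"],
                                 max_seq_length=128, tokenizer=tokenizer)
assert len(feature.input_ids) == 128
print(sum(feature.Q_mask))  # number of aspect/opinion characters masked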
コード例 #28
0
def parse_ner_predict(predicted_file, category_ids_file, data_dir):
    category_ids = {}
    for k, v in pd.read_csv(open(category_ids_file), header=None).values:
        category_ids[v] = k
    logger.info(category_ids)
    assert (len(category_ids) != 0)

    # (1) Review ID (ID): the unique identifier of each user review.
    #
    # (2) Review text (Reviews): the original review text written by the user.
    #
    # (3) Aspect terms (AspectTerms): product aspect terms appearing in the review,
    #     e.g. "价格" (price) in "价格很便宜"; must match the review text exactly.
    #
    # (4) Opinion terms (OpinionTerms): the user's opinion about an aspect,
    #     e.g. "很便宜" (very cheap) in "价格很便宜"; must match the review text exactly.
    #
    # (5) Polarity: the sentiment polarity of the opinion toward the aspect,
    #     one of negative, neutral or positive.
    #
    # (6) Category: the aspect category grouping similar aspect terms, e.g. both
    #     "快递" (courier) and "物流" (logistics) map to the "物流" category.
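    # An illustrative (hypothetical) labels row matching the fields above:
    #   id=1, AspectTerms="价格", OpinionTerms="很便宜", Polarity="正面", Category="价格"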
    res = []
    with open(predicted_file, encoding="utf-8", mode="r") as file:
        items = file.read().strip().split("\n\n")
        logger.info(len(items))

        # ID AspectTerms Opinions Polarities Categories
        patt = re.compile(". O O")

        for id, item in enumerate(items, 1):
            # parse each sentence
            review = " ".join([line[0] for line in item.split("\n")])
            logger.info(id)
            logger.info(review)
            ner_tokens_fake = [
                v.strip() for v in patt.split(item.strip()) if v.strip()
            ]  # labelled chunks are not split apart yet
            # logger.info(ner_tokens_fake)

            ner_tokens = []
            for fake in ner_tokens_fake:
                # predicted entity spans can be contiguous, so split them again
                fake_lines = fake.split("\n")  # lines of this predicted chunk
                token = []
                for line in fake_lines:
                    if line[4] == "B" and token:
                        # a B tag starts a new span
                        ner_tokens.append("\n".join(token))
                        token = [line]
                    else:
                        token.append(line)
                if token:
                    ner_tokens.append("\n".join(token))

            # logger.info(ner_tokens)

            # split each chunk into the word and the tag info of its first character
            word_info_pairs = []
            for token in ner_tokens:
                token_lines = token.split("\n")
                word = "".join([l[0] for l in token_lines])
                # aspect chunks carry the info on the first line, opinion chunks on the last
                if "at" in token_lines[0]:
                    info = token_lines[0][6:].split("_")
                else:
                    info = token_lines[-1][6:].split("_")
                word_info_pairs.append([word, info])

            # turn the parsed results into DataFrame rows
            ner_tokens_res = []
            for index, word_info in enumerate(word_info_pairs):
                word, info = word_info
                category = category_ids.get(int(info[1]))
                # logger.info(category)
                # logger.info(info)
                if info[0] == "at":
                    aspect = word
                    # aspect
                    opinion = None
                    if index > 0:
                        # look backwards for the opinion word modifying this aspect
                        former_word, former_info = word_info_pairs[index - 1]
                        if former_info[0] == "ot" and former_info[-1] == "b":
                            opinion = former_word
                    if not opinion and index < (len(word_info_pairs) - 1):
                        # otherwise look forwards for the modifying opinion word
                        next_word, next_info = word_info_pairs[index + 1]
                        if next_info[0] == "ot" and next_info[-1] == "f":
                            opinion = next_word
                    row = [id, review, aspect, opinion, None, category, item]
                    ner_tokens_res.append(row)

                if info[0] == "ot" and info[-1] == "m":
                    opinion = word
                    row = [id, review, None, opinion, None, category, item]
                    ner_tokens_res.append(row)
            res.extend(ner_tokens_res)
            # break
        df = pd.DataFrame(data=res,
                          columns=[
                              "ID", "Review", "AspectTerms", "Opinions",
                              "Polarities", "Categories", "Ner"
                          ])
        df[["ID", "AspectTerms", "Opinions", "Polarities",
            "Categories"]].to_csv(data_dir + "/ner_res.csv", index=False)
コード例 #29
0
def main(_):
    # tf.logging.set_verbosity(logger.info)

    processors = {
        "zhejiang": ZhejiangProcesser,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # the fine-tuning sequence length cannot exceed the pre-training length
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model was only trained up to sequence length %d"
            % (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # create the output directory
    tf.gfile.MakeDirs(FLAGS.output_dir)

    # a processor must be defined for every task
    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    # pick the processor for the requested dataset
    logger.info("Dataset: %s" % task_name)
    processor = processors[task_name]()

    # the label set for classification is defined up front
    label_list = processor.get_labels()
    logger.info(label_list)

    # load the vocabulary and build the token-to-id mapping
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # TPU settings (ignored unless use_tpu is set)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    tpu_config = tf.contrib.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=is_per_host)

    # RunConfig setup
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tpu_config)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    # load the training data
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        # number of warmup steps for the learning-rate schedule
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    # build the model function
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    logger.info("是否加载训练数据:")
    logger.info(FLAGS.do_train)
    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", FLAGS.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    logger.info("是否加载DEV数据:")
    logger.info(FLAGS.do_eval)
    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d (%d actual, %d padding)",
                    len(eval_examples), num_actual_eval_examples,
                    len(eval_examples) - num_actual_eval_examples)
        logger.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    logger.info("是否加载TEST数据:")
    logger.info(FLAGS.do_eval)
    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        logger.info("***** Running prediction*****")
        logger.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
        logger.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            logger.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
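A small follow-up sketch for turning the probability rows in test_results.tsv back into predicted labels; the helper name is hypothetical, and output_dir / label_list correspond to FLAGS.output_dir and processor.get_labels() above:

# Hypothetical post-processing: pick the argmax label for each written row.
import os

def read_predictions(output_dir, label_list):
    preds = []
    with open(os.path.join(output_dir, "test_results.tsv"), encoding="utf-8") as f:
        for line in f:
            probs = [float(x) for x in line.strip().split("\t")]
            preds.append(label_list[probs.index(max(probs))])
    return preds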
コード例 #30
0
    def __init__(self,
                 sequence_length=128,
                 num_classes=3,
                 vocab_size=20000,
                 embedding_size=768,
                 hidden_size=768,
                 d_a_size=3,
                 r_size=4,
                 fc_size=10,
                 p_coef=0.004):
        # Placeholders for input, output and dropout
        self.input_text = tf.placeholder(tf.int32,
                                         shape=[None, sequence_length],
                                         name='input_text')
        self.input_y = tf.placeholder(tf.float32,
                                      shape=[None, num_classes],
                                      name='input_y')

        text_length = self._length(self.input_text)
        initializer = tf.contrib.layers.xavier_initializer()

        # Embeddings
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.W_text = tf.Variable(tf.random_uniform(
                [vocab_size, embedding_size], -1.0, 1.0),
                                      name="W_text")
            self.embedded_chars = tf.nn.embedding_lookup(
                self.W_text, self.input_text)

        # Bidirectional(Left&Right) Recurrent Structure
        with tf.name_scope("bi-lstm"):
            fw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
            bw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
            (self.output_fw,
             self.output_bw), states = tf.nn.bidirectional_dynamic_rnn(
                 cell_fw=fw_cell,
                 cell_bw=bw_cell,
                 inputs=self.embedded_chars,
                 sequence_length=text_length,
                 dtype=tf.float32)
            self.H = tf.concat([self.output_fw, self.output_bw], axis=2)
            logger.info(get_shape_list(self.H))
            H_reshape = tf.reshape(self.H, [-1, 2 * hidden_size])
            logger.info(get_shape_list(H_reshape))

        with tf.name_scope("self-attention"):
            self.W_s1 = tf.get_variable("W_s1",
                                        shape=[2 * hidden_size, d_a_size],
                                        initializer=initializer)
            _H_s1 = tf.nn.tanh(tf.matmul(H_reshape, self.W_s1))
            self.W_s2 = tf.get_variable("W_s2",
                                        shape=[d_a_size, r_size],
                                        initializer=initializer)
            _H_s2 = tf.matmul(_H_s1, self.W_s2)
            _H_s2_reshape = tf.transpose(
                tf.reshape(_H_s2, [-1, sequence_length, r_size]), [0, 2, 1])
            self.A = tf.nn.softmax(_H_s2_reshape, name="attention")

        with tf.name_scope("sentence-embedding"):
            self.M = tf.matmul(self.A, self.H)

        with tf.name_scope("fully-connected"):
            # self.M_pool = tf.reduce_mean(self.M, axis=1)
            # W_fc = tf.get_variable("W_fc", shape=[2 * hidden_size, fc_size], initializer=initializer)
            self.M_flat = tf.reshape(self.M,
                                     shape=[-1, 2 * hidden_size * r_size])
            W_fc = tf.get_variable("W_fc",
                                   shape=[2 * hidden_size * r_size, fc_size],
                                   initializer=initializer)
            b_fc = tf.Variable(tf.constant(0.1, shape=[fc_size]), name="b_fc")
            self.fc = tf.nn.relu(tf.nn.xw_plus_b(self.M_flat, W_fc, b_fc),
                                 name="fc")

        with tf.name_scope("output"):
            W_output = tf.get_variable("W_output",
                                       shape=[fc_size, num_classes],
                                       initializer=initializer)
            b_output = tf.Variable(tf.constant(0.1, shape=[num_classes]),
                                   name="b_output")
            self.logits = tf.nn.xw_plus_b(self.fc,
                                          W_output,
                                          b_output,
                                          name="logits")
            self.predictions = tf.argmax(self.logits, 1, name="predictions")

        with tf.name_scope("penalization"):
            self.AA_T = tf.matmul(self.A, tf.transpose(self.A, [0, 2, 1]))
            self.I = tf.reshape(
                tf.tile(tf.eye(r_size), [tf.shape(self.A)[0], 1]),
                [-1, r_size, r_size])
            self.P = tf.square(
                tf.norm(self.AA_T - self.I, axis=[-2, -1], ord="fro"))

        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.input_y)
            self.loss_P = tf.reduce_mean(self.P * p_coef)
            self.loss = tf.reduce_mean(losses) + self.loss_P

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.input_y, axis=1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                   tf.float32),
                                           name="accuracy")