Example #1
    def __init__(self,
                 nums_chars,
                 nums_tags,
                 buckets_char,
                 counts=None,
                 batch_size=10,
                 crf=1,
                 ngram=None,
                 sent_seg=False,
                 is_space=True,
                 emb_path=None,
                 tag_scheme='BIES'):
        self.nums_chars = nums_chars
        self.nums_tags = nums_tags
        self.buckets_char = buckets_char
        self.counts = counts
        self.crf = crf
        self.ngram = ngram
        self.emb_path = emb_path
        self.emb_layer = None
        self.tag_scheme = tag_scheme
        self.gram_layers = []
        self.batch_size = batch_size
        self.l_rate = None
        self.decay = None
        self.train_step = None
        self.saver = None
        self.decode_holders = None
        self.scores = None
        self.params = None
        self.pixels = None
        self.is_space = is_space
        self.sent_seg = sent_seg
        self.updates = []
        self.bucket_dit = {}
        self.input_v = []
        self.input_w = []
        self.input_p = None
        self.output = []
        self.output_ = []
        self.output_p = []

        if self.crf > 0:
            self.transition_char = tf.get_variable(
                'transitions_char', [self.nums_tags + 1, self.nums_tags + 1])
        else:
            self.transition_char = None

        while len(self.buckets_char) > len(self.counts):
            self.counts.append(1)

        self.real_batches = toolbox.get_real_batch(self.counts,
                                                   self.batch_size)
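
The snippets here only create the `transitions_char` variable; they do not show how it is later used for training or Viterbi decoding. The following is a minimal TF 1.x sketch of how such a transition matrix is typically plugged into `tf.contrib.crf`; the placeholder names, shapes, and constants are assumptions for illustration, not code from this repository.

import tensorflow as tf

# Assumed shapes: unary_scores are per-character tag scores from the encoder,
# gold_tags the ground-truth tag ids, seq_lens the true sentence lengths
# inside a padded bucket.
num_tags = 5      # e.g. the four BIES tags plus the extra slot used above
bucket_len = 50   # assumed padded sentence length of one bucket
unary_scores = tf.placeholder(tf.float32, [None, bucket_len, num_tags])
gold_tags = tf.placeholder(tf.int32, [None, bucket_len])
seq_lens = tf.placeholder(tf.int32, [None])

transitions_char = tf.get_variable('transitions_char', [num_tags, num_tags])

# Training objective: negative CRF log-likelihood.
log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
    unary_scores, gold_tags, seq_lens, transition_params=transitions_char)
loss = tf.reduce_mean(-log_likelihood)

# Prediction: Viterbi decoding with the same transition matrix.
decoded_tags, _ = tf.contrib.crf.crf_decode(unary_scores, transitions_char, seq_lens)
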
Example #2
    def __init__(self, nums_chars, nums_tags, buckets_char, counts=None, pic_size=None, font=None, batch_size=10,
                 tag_scheme='BIES', word_vec=True, radical=False, graphic=False, crf=1, ngram=None, metric='F1-score',
                 mode='RNN'):
        self.nums_chars = nums_chars
        self.nums_tags = nums_tags
        self.buckets_char = buckets_char
        self.counts = counts
        self.tag_scheme = tag_scheme
        self.graphic = graphic
        self.word_vec = word_vec
        self.radical = radical
        self.crf = crf
        self.ngram = ngram
        self.emb_layer = None
        self.radical_layer = None
        self.pos_emb_f, self.pos_emb_b = None, None
        self.gram_layers = []
        self.font = font
        self.pic_size = pic_size
        self.batch_size = batch_size
        self.l_rate, self.decay = None, None
        self.train_step = None
        self.saver = None
        self.decode_holders = None
        self.scores = None
        self.params = None
        self.pixels = None
        self.metric = metric
        self.mode = mode
        self.updates = []
        self.bucket_dit = {}
        self.input_v = []
        self.input_w = []
        self.input_p = None
        self.output, self.output_, self.output_p, self.output_w, self.output_w_ = [], [], [], [], []
        if self.crf > 0:
            self.transition_char = []
            for i in range(len(self.nums_tags)):
                self.transition_char.append(tf.get_variable('transitions_char' + str(i), [self.nums_tags[i] + 1,
                                                                                          self.nums_tags[i] + 1]))

        self.all_metrics = ['Precision', 'Recall', 'F1-score', 'True-Negative-Rate', 'Boundary-F1-score']

        while len(self.buckets_char) > len(self.counts):
            self.counts.append(1)

        self.real_batches = toolbox.get_real_batch(self.counts, self.batch_size)
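
Each of these constructors defaults to `tag_scheme='BIES'`. As a small illustration (not taken from this repository), the usual Begin/Inside/End/Single convention labels each character of an already-segmented sentence like this:

def to_bies(words):
    """Tag every character of a pre-segmented sentence with the BIES scheme."""
    tags = []
    for word in words:
        if len(word) == 1:
            tags.append('S')                                       # Single-character word
        else:
            tags.extend(['B'] + ['I'] * (len(word) - 2) + ['E'])   # Begin ... Inside ... End
    return tags

print(to_bies(['我', '爱', '北京']))  # ['S', 'S', 'B', 'E']
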
Example #3
    def __init__(self, nums_chars, nums_tags, buckets_char, counts=None, pic_size=None, font=None, batch_size=10, tag_scheme='BIES', word_vec=True, radical=False, graphic=False, crf=1, ngram=None):
        self.nums_chars = nums_chars
        self.nums_tags = nums_tags
        self.buckets_char = buckets_char
        self.counts = counts
        self.tag_scheme = tag_scheme
        self.graphic = graphic
        self.word_vec = word_vec
        self.radical = radical
        self.crf = crf
        self.ngram = ngram
        self.emb_layer = None
        self.radical_layer = None
        self.gram_layers = []
        self.font = font
        self.pic_size = pic_size
        self.batch_size = batch_size
        self.l_rate = None
        self.decay = None
        self.train_step = None
        self.saver = None
        self.decode_holders = None
        self.scores = None
        self.params = None
        self.pixels = None
        self.updates = []
        self.bucket_dit = {}
        self.input_v = []
        self.input_w = []
        self.input_p = None
        self.output = []
        self.output_ = []
        self.output_p = []
        self.output_w = []
        self.output_w_ = []
        if self.crf > 0:
            self.transition_char = []
            for i in range(len(self.nums_tags)):
                self.transition_char.append(tf.get_variable('transitions_char' + str(i), [self.nums_tags[i] + 1, self.nums_tags[i] + 1]))

        while len(self.buckets_char) > len(self.counts):
            self.counts.append(1)

        self.real_batches = toolbox.get_real_batch(self.counts, self.batch_size)
Example #4
    def __init__(self, nums_chars, nums_tags, buckets_char, window_size=0, filters_number=0, counts=None, pic_size=None,
                 font=None, batch_size=10, tag_scheme='BIES', word_vec=True, radical=False, graphic=False, crf=1,
                 ngram=None, metric='F1-score'):
        self.window_size = window_size
        self.filters_number = filters_number
        # number of distinct character types
        self.nums_chars = nums_chars
        # number of tag types, e.g. [18]; it is not clear why a list is used here rather than a plain int
        self.nums_tags = nums_tags
        # sentence length of each bucket
        self.buckets_char = buckets_char
        # number of training sentences in each bucket
        self.counts = counts
        self.tag_scheme = tag_scheme
        self.graphic = graphic
        self.word_vec = word_vec
        # whether to use radical (character component) information
        self.radical = radical
        # defaults to 1, i.e. a first-order CRF is used
        self.crf = crf
        self.ngram = ngram
        self.emb_layer = None
        self.radical_layer = None
        self.gram_layers = []
        self.font = font
        self.pic_size = pic_size
        self.batch_size = batch_size
        self.l_rate = None
        self.decay = None
        self.train_steps = None
        self.saver = None
        self.decode_holders = None
        self.scores = None
        self.params = None
        self.pixels = None
        self.drop_out = None
        self.drop_out_v = None
        # defaults to F1-score
        self.metric = metric
        self.updates = []
        self.bucket_dit = {}
        # shape = (number of buckets, sentences per bucket, sentence length)
        self.input_v = []
        self.input_w = []
        self.input_p = None
        # LSTM outputs after the fully connected layer
        self.output = []
        # labels, i.e. the ground truth
        self.output_ = []
        self.output_p = []
        self.output_w = []
        self.output_w_ = []

        self.lm_predictions = []
        self.lm_groundtruthes = []

        self.merged_summary = None
        self.summaries = []
        # use Viterbi decoding
        if self.crf > 0:
            self.transition_char = []
            for i in range(len(self.nums_tags)):
                self.transition_char.append(
                    # tag transition matrix; it is not clear why the dimensions are padded by one extra tag
                    tf.get_variable('transitions_char' + str(i), [self.nums_tags[i] + 1, self.nums_tags[i] + 1])
                )

        self.all_metrics = ['Precision', 'Recall', 'F1-score', 'True-Negative-Rate', 'Boundary-F1-score']

        while len(self.buckets_char) > len(self.counts):
            self.counts.append(1)

        # Determine the size of each batch.
        # One bucket is one batch: if a bucket holds fewer sentences than the configured batch_size,
        # that batch's size is the bucket's sentence count; otherwise it is batch_size.
        # In other words, batches are capped at batch_size (a minimal sketch follows this snippet).
        self.real_batches = toolbox.get_real_batch(self.counts, self.batch_size)
        self.losses = []
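
The comment above fully describes the behaviour of `toolbox.get_real_batch`: one bucket forms one batch, capped at `batch_size`. A minimal sketch consistent with that description (the actual toolbox implementation may differ):

def get_real_batch(counts, batch_size):
    # One batch per bucket, never larger than batch_size.
    return [min(count, batch_size) for count in counts]

print(get_real_batch([3, 25, 40], 10))  # [3, 10, 10]
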
Example #5
    def __init__(self,
                 nums_chars,
                 nums_tags,
                 buckets_char,
                 counts=None,
                 batch_size=10,
                 tag_scheme='BIES',
                 crf=1,
                 metric='F1-score',
                 ngram=None,
                 co_train=False,
                 highway_layers=1,
                 lambda0=0,
                 lambda1=0,
                 char_freq_loss=False):
        self.ngram = ngram
        self.gram_layers = []

        # number of distinct character types
        self.nums_chars = nums_chars
        # number of tag types, e.g. [18]; it is not clear why a list is used here rather than a plain int
        self.nums_tags = nums_tags
        # sentence length of each bucket
        self.buckets_char = buckets_char
        # number of training sentences in each bucket
        self.counts = counts
        self.tag_scheme = tag_scheme
        # defaults to 1, i.e. a first-order CRF is used
        self.crf = crf
        self.emb_layer = None
        self.batch_size = batch_size
        self.l_rate = None
        self.decay = None
        self.train_steps = None
        self.saver = None
        self.decode_holders = None
        self.scores = None
        self.params = None
        self.drop_out = None
        self.drop_out_v = None
        # defaults to F1-score
        self.metric = metric
        self.updates = []
        self.bucket_dit = {}
        # shape = (number of buckets, sentences per bucket, sentence length); see the placeholder sketch after this example
        self.input_v = []
        self.input_w = []
        self.input_p = None
        # LSTM outputs after the fully connected layer
        self.output = []
        # labels, i.e. the ground truth
        self.output_ = []
        self.output_p = []
        self.output_w = []
        self.output_w_ = []

        self.co_train = co_train
        self.highway_layers = highway_layers
        self.char_freq_loss = char_freq_loss

        if self.char_freq_loss:
            self.char_freq_predictions = []
            self.char_freq_groundtruthes = []
            self.lambda1 = lambda1

        if self.co_train:
            self.lm_fw_predictions = []
            self.lm_bw_predictions = []
            self.lm_fw_groundtruthes = []
            self.lm_bw_groundtruthes = []
            self.lambda0 = lambda0

        self.summaries = []
        # use Viterbi decoding
        if self.crf > 0:
            self.transition_char = []
            for i in range(len(self.nums_tags)):
                self.transition_char.append(
                    # tag transition matrix; it is not clear why the dimensions are padded by one extra tag
                    tf.get_variable(
                        'transitions_char' + str(i),
                        [self.nums_tags[i] + 1, self.nums_tags[i] + 1]))

        self.all_metrics = [
            'Precision', 'Recall', 'F1-score', 'True-Negative-Rate',
            'Boundary-F1-score'
        ]

        while len(self.buckets_char) > len(self.counts):
            self.counts.append(1)

        # Determine the size of each batch.
        # One bucket is one batch: if a bucket holds fewer sentences than the configured batch_size,
        # that batch's size is the bucket's sentence count; otherwise it is batch_size.
        # In other words, batches are capped at batch_size (see the sketch after Example #4).
        self.real_batches = toolbox.get_real_batch(self.counts,
                                                   self.batch_size)
        self.losses = []
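
The shape comment on `input_v` (number of buckets × sentences per bucket × sentence length) suggests one padded tensor per bucket. A speculative sketch of how such per-bucket placeholders could be created; the function and names below are invented for illustration and are not part of the repository:

import tensorflow as tf

def build_bucket_placeholders(buckets_char):
    """One (characters, tags) placeholder pair per bucket, padded to that bucket's length."""
    input_v, output_ = [], []
    for i, bucket_len in enumerate(buckets_char):
        input_v.append(tf.placeholder(tf.int32, [None, bucket_len], name='input_%d' % i))
        output_.append(tf.placeholder(tf.int32, [None, bucket_len], name='tags_%d' % i))
    return input_v, output_
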