Ejemplo n.º 1
0
    def test(self, sess, t_x, t_y, idx2tag, idx2char, outpath=None, ensemble=None, batch_size=200, tag_num=1):
        """Score the model on a labelled set and print the results.

        Gold tags are unpadded and decoded, predictions come from the model
        at index 0, and both are rendered with ``toolbox.generate_output``
        before being compared by ``toolbox.evaluator`` (``metric='All'``).

        Args:
            sess: forwarded to ``self.predict``.
            t_x: model inputs; ``t_x[0]`` is decoded to characters with
                ``idx2char``.
            t_y: gold tag indices, still zero padded.
            idx2tag: mapping used by ``toolbox.decode_tags``.
            idx2char: mapping used by ``toolbox.decode_chars``.
            outpath: when not None, ``prediction_out[0]`` is written there
                through ``toolbox.printer``.
            ensemble: forwarded to ``self.predict``.
            batch_size: forwarded to ``self.predict``.
            tag_num: forwarded to ``toolbox.evaluator``.
        """
        scheme = self.tag_scheme

        # Reference output: strip the padding, decode the tags, render them.
        gold_tags = toolbox.decode_tags(toolbox.unpad_zeros(t_y), idx2tag, scheme)
        chars = toolbox.decode_chars(t_x[0], idx2char)
        gold_out = toolbox.generate_output(chars, gold_tags, scheme)

        # The extra placeholder is only supplied when self.graphic is set.
        pt_holder = self.input_p[0] if self.graphic else None

        raw_prediction = self.predict(
            data=t_x,
            sess=sess,
            model=self.input_v[0] + self.output[0],
            index=0,
            pt_h=pt_holder,
            pt=self.pixels,
            ensemble=ensemble,
            batch_size=batch_size,
        )
        decoded = toolbox.decode_tags(raw_prediction, idx2tag, scheme)
        prediction_out = toolbox.generate_output(chars, decoded, scheme)

        scores = toolbox.evaluator(prediction_out, gold_out, metric='All', verbose=True, tag_num=tag_num)

        print('Best scores: ')

        # (format string, position in ``scores``), in the order they are shown.
        report = (
            ('Segmentation F1-score: %f', 2),
            ('Segmentation Precision: %f', 0),
            ('Segmentation Recall: %f', 1),
            ('Segmentation True Negative Rate: %f', 6),
            ('Segmentation Boundary-F1-score: %f\n', 10),
            ('Joint POS tagging F-score: %f', 5),
            ('Joint POS tagging Precision: %f', 3),
            ('Joint POS tagging Recall: %f', 4),
            ('Joint POS True Negative Rate: %f', 7),
            ('Joint POS tagging Boundary-F1-score: %f\n', 13),
        )
        for template, position in report:
            print(template % scores[position])

        if outpath is not None:
            toolbox.printer(prediction_out[0], outpath)
Ejemplo n.º 2
0
    def test(self,
             sess,
             t_x,
             t_y,
             idx2tag,
             idx2char,
             outpath=None,
             ensemble=None,
             batch_size=200):
        """Evaluate the model on labelled data, print scores, optionally save output.

        Args:
            sess: forwarded to ``self.predict``.
            t_x: model inputs; ``t_x[0]`` is decoded to characters.
            t_y: zero-padded gold tag indices.
            idx2tag: mapping used by ``toolbox.decode_tags``.
            idx2char: mapping used by ``toolbox.decode_chars``.
            outpath: when not None, one of the rendered predictions is
                written there with ``toolbox.printer``; which one depends on
                ``self.tag_scheme``.
            ensemble: forwarded to ``self.predict``.
            batch_size: forwarded to ``self.predict``.
        """
        scheme = self.tag_scheme

        gold_tags = toolbox.decode_tags(toolbox.unpad_zeros(t_y), idx2tag,
                                        scheme)
        chars = toolbox.decode_chars(t_x[0], idx2char)
        gold_out = toolbox.generate_output(chars, gold_tags, scheme)

        # The extra placeholder is only supplied when self.graphic is set.
        pt_holder = self.input_p[0] if self.graphic else None

        raw_prediction = self.predict(data=t_x,
                                      sess=sess,
                                      model=self.input_v[0] + self.output[0],
                                      index=0,
                                      pt_h=pt_holder,
                                      pt=self.pixels,
                                      ensemble=ensemble,
                                      batch_size=batch_size)
        decoded = toolbox.decode_tags(raw_prediction, idx2tag, scheme)
        prediction_out = toolbox.generate_output(chars, decoded, scheme)

        score_table = np.asarray(
            toolbox.evaluator(prediction_out,
                              gold_out,
                              tag_scheme=scheme,
                              verbose=True))

        # Row whose column 1 (printed below as the joint POS F-score) is
        # highest; it only decides which output gets written.
        best_idx = int(np.argmax(score_table[:, 1]))

        # NOTE(review): the report below always shows row 0, not row
        # ``best_idx``, despite the 'Best scores' heading - confirm intent.
        shown = score_table[0]

        print('Best scores: ')
        for template, column in (('Segmentation F-score: %f', 0),
                                 ('Segmentation Precision: %f', 2),
                                 ('Segmentation Recall: %f\n', 3),
                                 ('Joint POS tagging F-score: %f', 1),
                                 ('Joint POS tagging Precision: %f', 4),
                                 ('Joint POS tagging Recall: %f', 5)):
            print(template % shown[column])

        if outpath is None:
            return

        if scheme == 'parallel':
            chosen = best_idx + 1
        elif scheme == 'mul':
            chosen = best_idx
        else:
            chosen = 0
        toolbox.printer(prediction_out[chosen], outpath)
Ejemplo n.º 3
0
    def tag(self, sess, r_x, idx2tag, idx2char, expected_scheme='BIES', outpath='out.txt', ensemble=None, batch_size=200, large_file=False):
        """Tag raw input and either return the result or write it to a file.

        Args:
            sess: forwarded to ``self.predict``.
            r_x: model inputs; ``r_x[0]`` is decoded to characters.
            idx2tag: mapping used by ``toolbox.decode_tags``.
            idx2char: mapping used by ``toolbox.decode_chars``.
            expected_scheme: key selecting which rendered output to keep when
                ``toolbox.generate_output`` yields more than one. An unknown
                key raises ``KeyError``.
            outpath: destination for ``toolbox.printer`` when ``large_file``
                is false.
            ensemble: forwarded to ``self.predict``.
            batch_size: forwarded to ``self.predict``.
            large_file: when true the selected output is returned instead of
                being written.

        Returns:
            The selected output if ``large_file`` is true, otherwise None.
        """
        chars = toolbox.decode_chars(r_x[0], idx2char)

        # The extra placeholder is only supplied when self.graphic is set.
        pt_holder = self.input_p[0] if self.graphic else None

        raw_prediction = self.predict(
            data=r_x,
            sess=sess,
            model=self.input_v[0] + self.output[0],
            index=0,
            pt_h=pt_holder,
            pt=self.pixels,
            ensemble=ensemble,
            batch_size=batch_size,
        )
        decoded = toolbox.decode_tags(raw_prediction, idx2tag, self.tag_scheme)
        prediction_out = toolbox.generate_output(chars, decoded, self.tag_scheme)

        # The number of rendered outputs decides which lookup table applies.
        variant_count = len(prediction_out)
        if variant_count > 2:
            position = {'BI': 1, 'BIE': 2, 'BIES': 3, 'Voting': 4}[expected_scheme]
        elif variant_count == 2:
            position = {'BIES': 0, 'long': 1}[expected_scheme]
        else:
            position = 0
        final_out = prediction_out[position]

        if large_file:
            return final_out
        toolbox.printer(final_out, outpath)
Ejemplo n.º 4
0
    def tag(self, sess, r_x, idx2tag, idx2char, char2idx, outpath='out.txt', ensemble=None, batch_size=200,
            large_file=False):
        """Tag raw input with the model matching its sequence length.

        Characters are decoded from ``r_x[0]`` first; afterwards every index
        in ``r_x[0]`` above the largest known one is replaced by the
        ``'<UNK>'`` index before prediction.

        Args:
            sess: forwarded to ``self.predict``.
            r_x: model inputs; converted with ``np.asarray``.
            idx2tag: mapping used by ``toolbox.decode_tags``.
            idx2char: mapping used by ``toolbox.decode_chars``.
            char2idx: character-to-index mapping; must contain ``'<UNK>'``.
            outpath: destination for ``toolbox.printer`` when ``large_file``
                is false.
            ensemble: forwarded to ``self.predict``.
            batch_size: base batch size, rescaled by ``300 / c_len``.
            large_file: when true the output is returned instead of written.

        Returns:
            ``prediction_out[0]`` if ``large_file`` is true, otherwise None.
        """
        chars = toolbox.decode_chars(r_x[0], idx2char)

        vocab_size = len(set(char2idx.values()))

        r_x = np.asarray(r_x)
        unk_id = char2idx['<UNK>']
        # Any index beyond the vocabulary is mapped to the unknown character.
        r_x[0][r_x[0] > vocab_size - 1] = unk_id

        # The extra placeholder is only supplied when self.graphic is set.
        pt_holder = self.input_p[0] if self.graphic else None

        # Length of the first sequence selects the model to run.
        c_len = len(r_x[0][0])
        idx = self.bucket_dit[c_len]

        # Scale the batch size against a reference length of 300.
        # NOTE(review): with integer operands this is floor division on
        # Python 2 but would yield a float on Python 3 - confirm the target.
        real_batch = batch_size * 300 / c_len

        raw_prediction = self.predict(
            data=r_x,
            sess=sess,
            model=self.input_v[idx] + self.output[idx],
            index=idx,
            pt_h=pt_holder,
            pt=self.pixels,
            ensemble=ensemble,
            batch_size=real_batch,
        )
        decoded = toolbox.decode_tags(raw_prediction, idx2tag, self.tag_scheme)
        prediction_out = toolbox.generate_output(chars, decoded, self.tag_scheme)

        final_out = prediction_out[0]
        if large_file:
            return final_out
        toolbox.printer(final_out, outpath)
Ejemplo n.º 5
0
    def tag(self, r_x, r_x_raw, idx2tag, idx2char, unk_chars, sub_dict, trans_dict, sess, transducer, ensemble=None,
            batch_size=100, outpath=None, sent_seg=False, seg_large=False, form='conll'):
        """Segment raw input and either print the result or return it.

        ``r_x[0]`` is modified in place: indices found in ``sub_dict`` are
        replaced by their mapped value, indices found in ``unk_chars`` by 1.

        Args:
            r_x: model inputs; ``r_x[0]`` holds the character indices.
            r_x_raw: raw input, forwarded to ``toolbox.mlp_post`` and
                ``toolbox.printer``.
            idx2tag: mapping used by ``toolbox.decode_tags``.
            idx2char: mapping used by ``toolbox.decode_chars``; inverted when
                a transducer is supplied.
            unk_chars: container of indices to replace with 1.
            sub_dict: mapping of indices to their substitutes.
            trans_dict: forwarded to ``toolbox.generate_output``.
            sess: forwarded to ``self.predict``; ``sess[-1]`` is handed to
                ``self.define_transducer_dict``.
            transducer: when not None, a callback wrapping
                ``self.define_transducer_dict`` is given to
                ``toolbox.generate_output``.
            ensemble: forwarded to ``self.predict``.
            batch_size: base batch size, rescaled by ``300 / c_len``.
            outpath: forwarded to ``toolbox.printer``.
            sent_seg: forwarded to ``toolbox.printer``.
            seg_large: when true the results are returned instead of printed.
            form: output format; ``'mlp1'`` and ``'mlp2'`` trigger
                ``toolbox.mlp_post``.

        Returns:
            ``(prediction_out, multi_out)`` as flat lists when ``seg_large``
            is true, otherwise None.
        """
        # Decode before the indices are rewritten below.
        chars = toolbox.decode_chars(r_x[0], idx2char)

        for row in r_x[0]:
            for pos, char_id in enumerate(row):
                if char_id in sub_dict:
                    row[pos] = sub_dict[char_id]
                elif char_id in unk_chars:
                    row[pos] = 1

        # Length of the first sequence selects the model to run.
        c_len = len(r_x[0][0])
        idx = self.bucket_dit[c_len]

        # Scale the batch size against a reference length of 300.
        real_batch = batch_size * 300 / c_len

        if transducer is None:
            transducer_dict = None
        else:
            char2idx = dict((char, index) for index, char in idx2char.items())

            def transducer_dict(trans_str):
                return self.define_transducer_dict(trans_str, char2idx, sess[-1], transducer)

        raw_prediction = self.predict(
            data=r_x,
            sess=sess,
            model=self.input_v[idx] + self.output[idx],
            index=idx,
            argmax=True,
            batch_size=real_batch,
            ensemble=ensemble,
        )

        predictions = toolbox.decode_tags(raw_prediction, idx2tag)

        if self.is_space == 'sea':
            nested_out, _raw_out = toolbox.generate_output_sea(chars, predictions)
            nested_multi = nested_out
        else:
            nested_out, _raw_out, nested_multi = toolbox.generate_output(
                chars, predictions, trans_dict, transducer_dict, multi_tok=True)

        # Collapse the per-group lists into one flat list each.
        prediction_out = [item for group in nested_out for item in group]
        multi_out = [item for group in nested_multi for item in group]

        if form in ('mlp1', 'mlp2'):
            prediction_out = toolbox.mlp_post(r_x_raw, prediction_out, self.is_space, form)

        if seg_large:
            return prediction_out, multi_out

        toolbox.printer(r_x_raw, prediction_out, multi_out, outpath, sent_seg, form)
Ejemplo n.º 6
0
    def tag(self, r_x, idx2tag, idx2char, unk_chars, sub_dict,
            sess, ensemble=None,
            batch_size=100):
        """Segment raw input and return the result of ``generate_output_offsets``.

        ``r_x[0]`` is modified in place: indices found in ``sub_dict`` are
        replaced by their mapped value, indices found in ``unk_chars`` by 1.

        Args:
            r_x: model inputs; ``r_x[0]`` holds the character indices.
            idx2tag: mapping used by ``toolbox.decode_tags``.
            idx2char: mapping used by ``toolbox.decode_chars``.
            unk_chars: container of indices to replace with 1.
            sub_dict: mapping of indices to their substitutes.
            sess: forwarded to ``self.predict``.
            ensemble: forwarded to ``self.predict``.
            batch_size: base batch size, rescaled by ``300 // c_len``.

        Returns:
            Whatever ``toolbox.generate_output_offsets`` produces for the
            decoded characters and predicted tags.
        """
        # Decode before the indices are rewritten below.
        chars = toolbox.decode_chars(r_x[0], idx2char)

        for row in r_x[0]:
            for j, n in enumerate(row):
                if n in sub_dict:
                    row[j] = sub_dict[n]
                elif n in unk_chars:
                    row[j] = 1

        # Length of the first sequence selects the model to run.
        c_len = len(r_x[0][0])
        idx = self.bucket_dit[c_len]

        # Floor division keeps the batch size an integer on Python 3, where
        # ``/`` would produce a float; on Python 2 the result is unchanged
        # for integer operands.
        real_batch = batch_size * 300 // c_len

        prediction = self.predict(data=r_x,
                                  sess=sess,
                                  model=self.input_v[idx] + self.output[idx],
                                  index=idx,
                                  argmax=True,
                                  batch_size=real_batch,
                                  ensemble=ensemble)

        predictions = toolbox.decode_tags(prediction, idx2tag)
        sentences = toolbox.generate_output_offsets(chars, predictions)

        return sentences
Ejemplo n.º 7
0
    def test(self,
             t_x1,
             t_x2,
             t_y_raw,
             t_y_gold,
             idx2tag,
             idx2char,
             unk_chars,
             sub_dict,
             trans_dict,
             sess,
             transducer,
             ensemble=None,
             batch_size=100,
             sent_seg=False,
             bias=-1,
             outpath=None,
             trans_type='mix',
             test_result_path=None):
        """Evaluate the model on a test set, print the scores, optionally save them.

        ``t_x1[0]`` and ``t_x2[0]`` are modified in place: indices found in
        ``sub_dict`` are replaced by their mapped value, indices found in
        ``unk_chars`` by 1.

        Args:
            t_x1: first input group; ``t_x1[0]`` is decoded to characters.
            t_x2: second input group, concatenated after ``t_x1`` for
                prediction.
            t_y_raw: forwarded to ``evaluation.evaluator`` when ``sent_seg``
                is true.
            t_y_gold: gold output the predictions are scored against.
            idx2tag: mapping used by ``toolbox.decode_tags``.
            idx2char: mapping used by ``toolbox.decode_chars_new``; inverted
                when a transducer is supplied.
            unk_chars: container of indices to replace with 1.
            sub_dict: mapping of indices to their substitutes.
            trans_dict: forwarded to ``toolbox.generate_output``.
            sess: forwarded to ``self.predict``; ``sess[-1]`` is handed to
                ``self.define_transducer_dict``.
            transducer: when not None, a callback wrapping
                ``self.define_transducer_dict`` is given to
                ``toolbox.generate_output``.
            ensemble: forwarded to ``self.predict``.
            batch_size: forwarded to ``self.predict``.
            sent_seg: when true, sentence segmentation is scored and
                reported in addition to word segmentation.
            bias: a negative value requests argmax predictions; otherwise
                the prediction goes through ``toolbox.biased_out`` when
                ``self.crf == 0``.
            outpath: when not None, the lines of ``prediction_out[0]`` are
                written there as UTF-8.
            trans_type: forwarded to ``toolbox.generate_output``.
            test_result_path: when not None, the scores are written there
                as UTF-8.
        """
        # Decode before the indices are rewritten below.
        chars = toolbox.decode_chars_new(t_x1[0], idx2char)
        gold_out = t_y_gold

        for inputs in (t_x1, t_x2):
            for row in inputs[0]:
                for j, n in enumerate(row):
                    if n in sub_dict:
                        row[j] = sub_dict[n]
                    elif n in unk_chars:
                        row[j] = 1

        transducer_dict = None
        if transducer is not None:
            char2idx = {v: k for k, v in idx2char.items()}

            def transducer_dict(trans_str):
                return self.define_transducer_dict(trans_str, char2idx,
                                                   sess[-1], transducer)

        argmax = bias < 0
        t_x = t_x1 + t_x2
        prediction = self.predict(data_v=t_x,
                                  sess=sess,
                                  model=self.input_v1[0] + self.input_v2[0] +
                                  self.output[0],
                                  index=0,
                                  argmax=argmax,
                                  batch_size=batch_size,
                                  ensemble=ensemble)

        if bias >= 0 and self.crf == 0:
            prediction = [toolbox.biased_out(prediction[0], bias)]

        predictions = toolbox.decode_tags(prediction, idx2tag)
        if self.is_space == 'sea':
            prediction_out, raw_out = toolbox.generate_output_sea(
                chars, predictions)
        else:
            prediction_out, raw_out = toolbox.generate_output(
                chars,
                predictions,
                trans_dict,
                transducer_dict,
                trans_type=trans_type)

        if sent_seg:
            scores = evaluation.evaluator(prediction_out, gold_out, raw_out,
                                          t_y_raw)
        else:
            scores = evaluation.evaluator(prediction_out,
                                          gold_out,
                                          verbose=True)

        if outpath is not None:
            with codecs.open(outpath, 'w', encoding='utf-8') as wt:
                for pre in prediction_out[0]:
                    wt.write(pre + '\n')

        if test_result_path is not None:
            # The context manager closes the file on both branches; the
            # explicit close() used to run only when sent_seg was false.
            with codecs.open(test_result_path, 'w', encoding='utf-8') as wt:
                if sent_seg:
                    wt.write('Sentence segmentation:' + '\n')
                    wt.write('F score: %f' % scores[5] + '\n')
                    wt.write('Precision: %f' % scores[3] + '\n')
                    wt.write('Recall: %f\n' % scores[4] + '\n')
                    wt.write('Word segmentation:' + '\n')
                    wt.write('F score: %f' % scores[2] + '\n')
                    wt.write('Precision: %f' % scores[0] + '\n')
                    wt.write('Recall: %f\n' % scores[1] + '\n')
                else:
                    wt.write('F score: %f' % scores[2] + '\n')
                    wt.write('Precision: %f' % scores[0] + '\n')
                    wt.write('Recall: %f\n' % scores[1] + '\n')
                    wt.write('True negative rate: %f' % scores[3] + '\n')

        # Single-argument parenthesised prints produce the same output as the
        # print statement on Python 2.
        print('Evaluation scores:')

        if sent_seg:
            print('Sentence segmentation:')
            print('F score: %f' % scores[5])
            print('Precision: %f' % scores[3])
            print('Recall: %f\n' % scores[4])
            print('Word segmentation:')
            print('F score: %f' % scores[2])
            print('Precision: %f' % scores[0])
            print('Recall: %f\n' % scores[1])
        else:
            print('Precision: %f' % scores[0])
            print('Recall: %f' % scores[1])
            print('F score: %f' % scores[2])
            print('True negative rate: %f' % scores[3])
Ejemplo n.º 8
0
    def train(self,
              t_x1,
              t_x2,
              t_y,
              v_x1,
              v_x2,
              v_y_raw,
              v_y_gold,
              idx2tag,
              idx2char,
              unk_chars,
              trans_dict,
              sess,
              epochs,
              trained_model,
              transducer=None,
              lr=0.05,
              decay=0.05,
              decay_step=1,
              sent_seg=False,
              outpath=None):
        """Train for ``epochs`` epochs, keeping the best model seen on validation.

        After every epoch the validation inputs are predicted and scored;
        when the score beats the best so far the model is saved to
        ``trained_model`` and, if ``outpath`` is given, the validation
        predictions are written there.

        The training and validation inputs are modified in place: every
        index found in ``unk_chars`` is replaced by 1.

        Args:
            t_x1: first training input group; ``t_x1[0]`` is a sequence of
                groups of rows of character indices.
            t_x2: second training input group, same layout as ``t_x1``.
            t_y: training targets, concatenated after the inputs.
            v_x1: first validation input group; ``v_x1[0]`` holds rows of
                character indices and is decoded to characters.
            v_x2: second validation input group.
            v_y_raw: forwarded to ``evaluation.evaluator`` when ``sent_seg``
                is true.
            v_y_gold: gold output the validation predictions are scored
                against.
            idx2tag: mapping used by ``toolbox.decode_tags``.
            idx2char: mapping used by ``toolbox.decode_chars_new``; inverted
                when a transducer is supplied.
            unk_chars: container of indices to replace with 1.
            trans_dict: forwarded to ``toolbox.generate_output``.
            sess: ``sess[0]`` is used for training and saving, ``sess[-1]``
                for the transducer; the whole object goes to
                ``self.predict``.
            epochs: number of passes over the training data.
            trained_model: path given to ``self.saver.save``.
            transducer: when not None, a callback wrapping
                ``self.define_transducer_dict`` is given to
                ``toolbox.generate_output``.
            lr: initial learning rate.
            decay: learning-rate decay factor; no decay when not positive.
            decay_step: the rate is recomputed every ``decay_step`` epochs.
            sent_seg: when true, model selection uses the product of
                ``scores[2]`` and ``scores[5]`` instead of ``scores[2]``.
            outpath: when not None, receives the lines of
                ``prediction_out[0]`` each time a new best model is saved.
        """
        lr_r = lr

        best_epoch = 0
        best_score = [0] * 6

        # Decode before the indices are rewritten below.
        chars = toolbox.decode_chars_new(v_x1[0], idx2char)

        # Validation inputs: rows of indices.
        for inputs in (v_x1, v_x2):
            for row in inputs[0]:
                for j, n in enumerate(row):
                    if n in unk_chars:
                        row[j] = 1

        # Training inputs carry one more level of nesting.
        # NOTE: a debugging session recorded t_x1[0] as 7 groups shaped
        # (5719, 50), (5473, 100), (3135, 150), (1323, 200), (538, 250),
        # (351, 300) and (3, 300), i.e. one 2-D block per sequence length.
        for inputs in (t_x1, t_x2):
            for group in inputs[0]:
                for row in group:
                    for j, n in enumerate(row):
                        if n in unk_chars:
                            row[j] = 1

        transducer_dict = None
        if transducer is not None:
            char2idx = {char: index for index, char in idx2char.items()}

            def transducer_dict(trans_str):
                return self.define_transducer_dict(trans_str, char2idx,
                                                   sess[-1], transducer)

        for epoch in range(epochs):
            print('epoch: %d' % (epoch + 1))
            t = time()
            # The guard makes ``epoch / decay_step`` an exact division.
            if epoch % decay_step == 0 and decay > 0:
                lr_r = lr / (1 + decay * (epoch / decay_step))

            data_list = t_x1 + t_x2 + t_y

            # One sample per position across all inputs and targets,
            # visited in random order.
            samples = list(zip(*data_list))
            random.shuffle(samples)

            for sample in samples:
                # The sequence length of the sample selects the model.
                c_len = len(sample[0][0])
                idx = self.bucket_dit[c_len]
                model = (self.input_v1[idx] + self.input_v2[idx] +
                         self.output_[idx])
                Batch.train(sess=sess[0],
                            model=model,
                            batch_size_h=self.batch_size_h,
                            batch_size=self.real_batches[idx],
                            config=self.train_step[idx],
                            lr=self.l_rate,
                            lrv=lr_r,
                            dr=self.drop_out,
                            drv=self.drop_out_v,
                            data=list(sample),
                            verbose=False,
                            num_gpus=self.num_gpus)

            # Validation pass over the whole validation set in one call.
            c_len = len(v_x1[0][0])
            idx = self.bucket_dit[c_len]
            data_v = v_x1 + v_x2
            b_prediction = self.predict(data_v,
                                        sess=sess,
                                        model=self.input_v1[idx] +
                                        self.input_v2[idx] + self.output[idx],
                                        index=idx,
                                        argmax=True,
                                        batch_size=200)
            b_prediction = toolbox.decode_tags(b_prediction, idx2tag)

            predictions = list(zip(*[b_prediction]))
            predictions = toolbox.merge_bucket(predictions)

            if self.is_space == 'sea':
                prediction_out, raw_out = toolbox.generate_output_sea(
                    chars, predictions)
            else:
                prediction_out, raw_out = toolbox.generate_output(
                    chars, predictions, trans_dict, transducer_dict)

            if sent_seg:
                scores = evaluation.evaluator(prediction_out, v_y_gold,
                                              raw_out, v_y_raw)
                c_score = scores[2] * scores[5]
                c_best_score = best_score[2] * best_score[5]
            else:
                scores = evaluation.evaluator(prediction_out, v_y_gold)
                c_score = scores[2]
                c_best_score = best_score[2]

            if c_score > c_best_score:
                best_epoch = epoch + 1
                best_score = scores
                self.saver.save(sess[0], trained_model, write_meta_graph=False)

                if outpath is not None:
                    # The context manager closes the file even if a write
                    # fails.
                    with codecs.open(outpath, 'w', encoding='utf-8') as wt:
                        for pre in prediction_out[0]:
                            wt.write(pre + '\n')

            if sent_seg:
                print('Sentence segmentation:')
                print('F score: %f\n' % scores[5])
                print('Word segmentation:')
                print('F score: %f' % scores[2])
            else:
                print('F score: %f' % c_score)
            print('Time consumed: %d seconds' % int(time() - t))

        print('Training is finished!')
        if sent_seg:
            print('Sentence segmentation:')
            print('Best F score: %f' % best_score[5])
            print('Best Precision: %f' % best_score[3])
            print('Best Recall: %f\n' % best_score[4])
            print('Word segmentation:')
            print('Best F score: %f' % best_score[2])
            print('Best Precision: %f' % best_score[0])
            print('Best Recall: %f\n' % best_score[1])
        else:
            print('Best F score: %f' % best_score[2])
            print('Best Precision: %f' % best_score[0])
            print('Best Recall: %f\n' % best_score[1])
        print('Best epoch: %d' % best_epoch)
Ejemplo n.º 9
0
    def train(self, t_x, t_y, v_x, v_y, idx2tag, idx2char, sess, epochs, trained_model, lr=0.05, decay=0.05, decay_step=1):
        """Train for ``epochs`` epochs and save the best model seen on validation.

        After each epoch every validation group is predicted and scored with
        ``toolbox.evaluator``. The selection score is the product of the
        column-0 maximum and the column-1 maximum of the score table; a
        model is only saved when that product improves and at least five
        epochs have completed.

        Args:
            t_x: training inputs.
            t_y: training targets, concatenated after the inputs.
            v_x: validation inputs; ``v_x[0]`` is merged and decoded to
                characters.
            v_y: validation targets; merged and unpadded to build the gold
                output.
            idx2tag: mapping used by ``toolbox.decode_tags``.
            idx2char: mapping used by ``toolbox.decode_chars``.
            sess: ``sess[0]`` is used for training and saving; the whole
                object is forwarded to ``self.predict``.
            epochs: number of passes over the training data.
            trained_model: path given to ``self.saver.save``.
            lr: initial learning rate.
            decay: learning-rate decay factor; no decay when not positive.
            decay_step: the rate is recomputed every ``decay_step`` epochs.
        """
        lr_r = lr
        best_epoch = 0
        best_score = 0
        best_seg = 0
        best_pos = 0

        # Gold reference output, built once from the validation targets.
        gold_ids = toolbox.unpad_zeros(toolbox.merge_bucket(v_y))
        gold_tags = toolbox.decode_tags(gold_ids, idx2tag, self.tag_scheme)
        merged_inputs = toolbox.merge_bucket([v_x[0]])
        chars = toolbox.decode_chars(merged_inputs[0], idx2char)
        gold_out = toolbox.generate_output(chars, gold_tags, self.tag_scheme)

        for epoch in range(epochs):
            print('epoch: %d' % (epoch + 1))
            started = time()
            if epoch % decay_step == 0 and decay > 0:
                lr_r = lr/(1 + decay*(epoch/decay_step))

            # One sample per position across inputs and targets, visited in
            # random order.
            samples = zip(*(t_x + t_y))
            random.shuffle(samples)

            for sample in samples:
                # The sequence length of the sample selects the model.
                idx = self.bucket_dit[len(sample[0][0])]
                real_batch_size = self.real_batches[idx]
                model = self.input_v[idx] + self.output_[idx]
                pt_holder = self.input_p[idx] if self.graphic else None
                Batch.train(
                    sess=sess[0],
                    model=model,
                    batch_size=real_batch_size,
                    config=self.train_step[idx],
                    lr=self.l_rate,
                    lrv=lr_r,
                    dr=self.drop_out,
                    drv=self.drop_out_v,
                    data=list(sample),
                    pt_h=pt_holder,
                    pixels=self.pixels,
                    verbose=False,
                )

            # Validation: predict group by group, then merge.
            predictions = []
            for v_b_x in zip(*v_x):
                idx = self.bucket_dit[len(v_b_x[0][0])]
                pt_holder = self.input_p[idx] if self.graphic else None
                group_prediction = self.predict(
                    data=v_b_x,
                    sess=sess,
                    model=self.input_v[idx] + self.output[idx],
                    index=idx,
                    pt_h=pt_holder,
                    pt=self.pixels,
                    batch_size=100,
                )
                predictions.append(
                    toolbox.decode_tags(group_prediction, idx2tag, self.tag_scheme))

            predictions = toolbox.merge_bucket(zip(*predictions))

            prediction_out = toolbox.generate_output(chars, predictions, self.tag_scheme)

            scores = np.asarray(
                toolbox.evaluator(prediction_out, gold_out, tag_scheme=self.tag_scheme))

            top_pos = np.max(scores[:, 1])
            top_seg = np.max(scores[:, 0])
            c_score = top_pos * top_seg
            # Checkpoints are only written from the sixth epoch onwards.
            if c_score > best_score and epoch > 4:
                best_epoch = epoch + 1
                best_score = c_score
                best_seg = top_seg
                best_pos = top_pos
                self.saver.save(sess[0], trained_model, write_meta_graph=False)
            print('Time consumed: %d seconds' % int(time() - started))

        print('Training is finished!')
        print('Best segmentation score: %f' % best_seg)
        print('Best POS tag score: %f' % best_pos)
        print('Best epoch: %d' % best_epoch)
Ejemplo n.º 10
0
    def train(self,
              t_x,
              t_y,
              v_x,
              v_y_raw,
              v_y_gold,
              idx2tag,
              idx2char,
              unk_chars,
              trans_dict,
              sess,
              epochs,
              trained_model,
              transducer=None,
              lr=0.05,
              decay=0.05,
              decay_step=1,
              sent_seg=False,
              outpath=None):
        """Train for ``epochs`` epochs, keeping the best model seen on validation.

        After every epoch the validation inputs are predicted and scored;
        when the score beats the best so far the model is saved to
        ``trained_model`` and, if ``outpath`` is given, the validation
        predictions are written there.

        ``v_x[0]`` and ``t_x[0]`` are modified in place: every index found
        in ``unk_chars`` is replaced by 1.

        Args:
            t_x: training inputs; ``t_x[0]`` is a sequence of groups of rows
                of character indices.
            t_y: training targets, concatenated after the inputs.
            v_x: validation inputs; ``v_x[0]`` holds rows of character
                indices and is decoded to characters.
            v_y_raw: forwarded to ``evaluation.evaluator`` when ``sent_seg``
                is true.
            v_y_gold: gold output the validation predictions are scored
                against.
            idx2tag: mapping used by ``toolbox.decode_tags``.
            idx2char: mapping used by ``toolbox.decode_chars``; inverted
                when a transducer is supplied.
            unk_chars: container of indices to replace with 1.
            trans_dict: forwarded to ``toolbox.generate_output``.
            sess: ``sess[0]`` is used for training and saving, ``sess[-1]``
                for the transducer; the whole object goes to
                ``self.predict``.
            epochs: number of passes over the training data.
            trained_model: path given to ``self.saver.save``.
            transducer: when not None, a callback wrapping
                ``self.define_transducer_dict`` is given to
                ``toolbox.generate_output``.
            lr: initial learning rate.
            decay: learning-rate decay factor; no decay when not positive.
            decay_step: the rate is recomputed every ``decay_step`` epochs.
            sent_seg: when true, model selection uses the product of
                ``scores[2]`` and ``scores[5]`` instead of ``scores[2]``.
            outpath: when not None, receives the lines of
                ``prediction_out[0]`` each time a new best model is saved.
        """
        lr_r = lr

        best_epoch = 0
        best_score = [0] * 6

        # Decode before the indices are rewritten below.
        chars = toolbox.decode_chars(v_x[0], idx2char)

        # Validation inputs: rows of indices.
        for row in v_x[0]:
            for j, n in enumerate(row):
                if n in unk_chars:
                    row[j] = 1

        # Training inputs carry one more level of nesting.
        for group in t_x[0]:
            for row in group:
                for j, n in enumerate(row):
                    if n in unk_chars:
                        row[j] = 1

        transducer_dict = None
        if transducer is not None:
            char2idx = {char: index for index, char in idx2char.items()}

            def transducer_dict(trans_str):
                return self.define_transducer_dict(trans_str, char2idx,
                                                   sess[-1], transducer)

        for epoch in range(epochs):
            print('epoch: %d' % (epoch + 1))
            sys.stdout.flush()
            t = time()
            # The guard makes ``epoch / decay_step`` an exact division.
            if epoch % decay_step == 0 and decay > 0:
                lr_r = lr / (1 + decay * (epoch / decay_step))

            data_list = t_x + t_y

            # One sample per position across inputs and targets, visited in
            # random order.
            samples = list(zip(*data_list))
            random.shuffle(samples)

            for sample in samples:
                # The sequence length of the sample selects the model.
                c_len = len(sample[0][0])
                idx = self.bucket_dit[c_len]
                real_batch_size = self.real_batches[idx]
                model = self.input_v[idx] + self.output_[idx]
                Batch.train(sess=sess[0],
                            model=model,
                            batch_size=real_batch_size,
                            config=self.train_step[idx],
                            lr=self.l_rate,
                            lrv=lr_r,
                            dr=self.drop_out,
                            drv=self.drop_out_v,
                            data=list(sample),
                            verbose=False)

            # Validation pass over the whole validation set in one call.
            c_len = len(v_x[0][0])
            idx = self.bucket_dit[c_len]
            b_prediction = self.predict(data=v_x,
                                        sess=sess,
                                        model=self.input_v[idx] +
                                        self.output[idx],
                                        index=idx,
                                        argmax=True,
                                        batch_size=200)
            b_prediction = toolbox.decode_tags(b_prediction, idx2tag)

            predictions = list(zip(*[b_prediction]))
            predictions = toolbox.merge_bucket(predictions)

            if self.is_space == 'sea':
                prediction_out, raw_out = toolbox.generate_output_sea(
                    chars, predictions)
            else:
                prediction_out, raw_out = toolbox.generate_output(
                    chars, predictions, trans_dict, transducer_dict)

            if sent_seg:
                scores = evaluation.evaluator(prediction_out, v_y_gold,
                                              raw_out, v_y_raw)
                c_score = scores[2] * scores[5]
                c_best_score = best_score[2] * best_score[5]
            else:
                scores = evaluation.evaluator(prediction_out, v_y_gold)
                c_score = scores[2]
                c_best_score = best_score[2]

            if c_score > c_best_score:
                best_epoch = epoch + 1
                best_score = scores
                self.saver.save(sess[0], trained_model, write_meta_graph=True)

                if outpath is not None:
                    # The context manager closes the file even if a write
                    # fails.
                    with codecs.open(outpath, 'w', encoding='utf-8') as wt:
                        for pre in prediction_out[0]:
                            wt.write(pre + '\n')

            if sent_seg:
                print('Sentence segmentation F-score: %f' % scores[5])
                print('Word segmentation     F-score: %f' % scores[2])
            else:
                print('F score: %f' % c_score)
            print('Time consumed: %d seconds\n' % int(time() - t))
            sys.stdout.flush()

        print('Training is finished!')
        if sent_seg:
            print('Sentence segmentation:')
            print('Best F score: %f' % best_score[5])
            print('Best Precision: %f' % best_score[3])
            print('Best Recall: %f\n' % best_score[4])
            print('Word segmentation:')
            print('Best F score: %f' % best_score[2])
            print('Best Precision: %f' % best_score[0])
            print('Best Recall: %f\n' % best_score[1])
        else:
            print('Best F score: %f' % best_score[2])
            print('Best Precision: %f' % best_score[0])
            print('Best Recall: %f\n' % best_score[1])
        print('Best epoch: %d' % best_epoch)
Ejemplo n.º 11
0
    def train(self, t_x, t_y, v_x, v_y, idx2tag, idx2char, sess, epochs, trained_model, lr=0.05, decay=0.05,
              decay_step=1, tag_num=1):
        lr_r = lr

        best_epoch, best_score, best_seg, best_pos, c_tag, c_seg, c_score = {}, {}, {}, {}, {}, {}, {}

        pindex = 0

        metric = self.metric

        for m in self.all_metrics:
            best_epoch[m] = 0
            best_score[m] = 0

            best_seg[m] = 0
            best_pos[m] = 0

            c_tag[m] = 0
            c_seg[m] = 0
            c_score[m] = 0

        v_y = toolbox.merge_bucket(v_y)
        v_y = toolbox.unpad_zeros(v_y)

        gold = toolbox.decode_tags(v_y, idx2tag, self.tag_scheme)
        input_chars = toolbox.merge_bucket([v_x[0]])

        chars = toolbox.decode_chars(input_chars[0], idx2char)

        gold_out = toolbox.generate_output(chars, gold, self.tag_scheme)

        for epoch in range(epochs):
            print 'epoch: %d' % (epoch + 1)
            t = time()
            if epoch % decay_step == 0 and decay > 0:
                lr_r = lr/(1 + decay*(epoch/decay_step))

            data_list = t_x + t_y

            samples = zip(*data_list)

            random.shuffle(samples)

            for sample in samples:
                c_len = len(sample[0][0])
                idx = self.bucket_dit[c_len]
                real_batch_size = self.real_batches[idx]
                model = self.input_v[idx] + self.output_[idx]
                pt_holder = None
                if self.graphic:
                    pt_holder = self.input_p[idx]
                Batch.train(sess=sess[0], model=model, batch_size=real_batch_size, config=self.train_step[idx],
                            lr=self.l_rate, lrv=lr_r, dr=self.drop_out, drv=self.drop_out_v, data=list(sample),
                            pt_h=pt_holder, pixels=self.pixels, verbose=False)

            predictions = []

            for v_b_x in zip(*v_x):
                c_len = len(v_b_x[0][0])
                idx = self.bucket_dit[c_len]
                pt_holder = None
                if self.graphic:
                    pt_holder = self.input_p[idx]
                b_prediction = self.predict(data=v_b_x, sess=sess, model=self.input_v[idx] + self.output[idx],
                                            index=idx, pt_h=pt_holder, pt=self.pixels, batch_size=200)
                b_prediction = toolbox.decode_tags(b_prediction, idx2tag, self.tag_scheme)
                predictions.append(b_prediction)

            predictions = zip(*predictions)
            predictions = toolbox.merge_bucket(predictions)

            prediction_out = toolbox.generate_output(chars, predictions, self.tag_scheme)

            scores = toolbox.evaluator(prediction_out, gold_out, metric=metric, verbose=True, tag_num=tag_num)
            scores = np.asarray(scores)

            #Score_seg * Score_seg&tag
            c_seg['Precision'] = scores[0]
            c_seg['Recall'] = scores[1]
            c_seg['F1-score'] = scores[2]
            c_seg['True-Negative-Rate'] = scores[6]
            c_seg['Boundary-F1-score'] = scores[10]
            if self.tag_scheme != 'seg':
                c_tag['Precision'] = scores[3]
                c_tag['Recall'] = scores[4]
                c_tag['F1-score'] = scores[5]
                c_tag['True-Negative-Rate'] = scores[7]
                c_tag['Boundary-F1-score'] = scores[13]
            else:
                c_tag['Precision'] = 1
                c_tag['Recall'] = 1
                c_tag['F1-score'] = 1
                c_tag['True-Negative-Rate'] = 1
                c_tag['Boundary-F1-score'] = 1

            if metric == 'All':
                for m in self.all_metrics:
                    print 'Segmentation ' + m + ': %f' % c_seg[m]
                    print 'POS Tagging ' + m + ': %f\n' % c_tag[m]
                pindex = trained_model.rindex('/') + 1
            else:
                print 'Segmentation ' + metric + ': %f' % c_seg[metric]
                if self.tag_scheme != 'seg':
                    print 'POS Tagging ' + metric + ': %f\n' % c_tag[metric]

            for m in self.all_metrics:
                c_score[m] = c_seg[m] * c_tag[m]

            if metric == 'All':
                for m in self.all_metrics:
                    if c_score[m] > best_score[m] and epoch > 4:
                        best_epoch[m] = epoch + 1
                        best_score[m] = c_score[m]
                        best_seg[m] = c_seg[m]
                        best_pos[m] = c_tag[m]
                        self.saver.save(sess[0],  trained_model[:pindex] + m + '_' + trained_model[pindex:],
                                        write_meta_graph=False)

            elif c_score[metric] > best_score[metric] and epoch > 4:
                best_epoch[metric] = epoch + 1
                best_score[metric] = c_score[metric]
                best_seg[metric] = c_seg[metric]
                best_pos[metric] = c_tag[metric]
                self.saver.save(sess[0], trained_model, write_meta_graph=False)
            print 'Time consumed: %d seconds' % int(time() - t)
        print 'Training is finished!'

        if metric == 'All':
            for m in self.all_metrics:
                print 'Best segmentation ' + m + ': %f' % best_seg[m]
                print 'Best POS Tagging ' + m + ': %f' % best_pos[m]
                print 'Best epoch: %d\n' % best_epoch[m]
        else:
            print 'Best segmentation ' + metric + ': %f' % best_seg[metric]
            print 'Best POS Tagging ' + metric + ': %f' % best_pos[metric]
            print 'Best epoch: %d\n' % best_epoch[metric]
Ejemplo n.º 12
0
    def train(self, t_x, t_y, v_x, v_y, idx2tag, idx2char, sess,
              epochs, trained_model, lr=0.05, decay=0.05, decay_step=1, tag_num=1):
        """Train the per-bucket models and checkpoint the best dev-set result.

        Each epoch shuffles the training buckets, runs ``Batch.train`` on every
        bucket, predicts the development buckets, scores the predictions with
        ``toolbox.evaluator`` and saves a checkpoint whenever the combined
        score (segmentation score * tagging score) improves. No checkpoint is
        written during the first five epochs (``epoch > 4`` guard).

        When ``self.metric`` is ``'All'`` one checkpoint is kept per metric in
        ``self.all_metrics``; the metric name followed by ``'_'`` is inserted in
        front of the file-name part of ``trained_model``. Otherwise a single
        checkpoint is written to ``trained_model``.

        :param t_x: bucketed training inputs (b_train_x)
        :param t_y: bucketed training labels (b_train_y)
        :param v_x: bucketed development inputs (b_dev_x)
        :param v_y: bucketed development labels (b_dev_y)
        :param idx2tag: mapping passed to ``toolbox.decode_tags``
        :param idx2char: mapping passed to ``toolbox.decode_chars``
        :param sess: sequence of sessions; ``sess[0]`` is used for training and
            saving, the whole sequence is handed to ``self.predict``
        :param epochs: number of training epochs
        :param trained_model: path the checkpoint(s) are saved to
        :param lr: initial learning rate
        :param decay: learning-rate decay rate; values <= 0 disable decay
        :param decay_step: the learning rate is recomputed every ``decay_step``
            epochs
        :param tag_num: number of tag types, forwarded to the evaluator
        """
        log_dir = "./train_log"
        # Start from an empty log directory. On a first run the directory does
        # not exist yet, which must not abort training.
        shutil.rmtree(log_dir, ignore_errors=True)
        train_writer = tf.summary.FileWriter(log_dir, sess[0].graph)

        lr_r = lr
        metric = self.metric
        track_all = metric == 'All'
        has_tags = self.tag_scheme != 'seg'

        # Positions of each figure inside the evaluator's score vector.
        seg_score_idx = (('Precision', 0), ('Recall', 1), ('F1-score', 2),
                         ('True-Negative-Rate', 6), ('Boundary-F1-score', 10))
        tag_score_idx = (('Precision', 3), ('Recall', 4), ('F1-score', 5),
                         ('True-Negative-Rate', 7), ('Boundary-F1-score', 13))

        # Every metric keeps its own best result.
        best_epoch = dict.fromkeys(self.all_metrics, 0)
        best_score = dict.fromkeys(self.all_metrics, 0)
        best_seg = dict.fromkeys(self.all_metrics, 0)
        best_pos = dict.fromkeys(self.all_metrics, 0)
        c_tag = dict.fromkeys(self.all_metrics, 0)
        c_seg = dict.fromkeys(self.all_metrics, 0)
        c_score = dict.fromkeys(self.all_metrics, 0)

        # Index just past the last '/', i.e. where the file name starts.
        # rfind() yields 0 for a bare file name instead of raising.
        pindex = trained_model.rfind('/') + 1 if track_all else 0

        v_y = toolbox.merge_bucket(v_y)
        v_y = toolbox.unpad_zeros(v_y)
        gold = toolbox.decode_tags(v_y, idx2tag, self.tag_scheme)

        # Per the original author's notes: feature 0 is the character itself,
        # 1 the radical, 2 and 3 the 2-gram and 3-gram features.
        input_chars = toolbox.merge_bucket([v_x[0]])
        chars = toolbox.decode_chars(input_chars[0], idx2char)
        # Gold output rebuilt from ids (original author's note: it could
        # presumably be read straight from the dev file instead).
        gold_out = toolbox.generate_output(chars, gold, self.tag_scheme)

        try:
            for epoch in range(epochs):
                print('epoch: %d' % (epoch + 1))
                t = time()
                # Recompute the decayed learning rate every decay_step epochs.
                if epoch % decay_step == 0 and decay > 0:
                    lr_r = lr / (1 + decay * (epoch // decay_step))

                # t_x + t_y is indexed (feature, bucket, ...); zipping turns it
                # into one tuple of features per bucket. Shapes per the
                # original notes: (5, buckets, sentences, sentence length).
                data_list = t_x + t_y
                samples = list(zip(*data_list))
                random.shuffle(samples)

                for sample in samples:
                    # Sentence length of this bucket selects the bucket index.
                    c_len = len(sample[0][0])
                    idx = self.bucket_dit[c_len]
                    real_batch_size = self.real_batches[idx]
                    # Every bucket has its own placeholders / train op.
                    model_placeholders = (self.input_v[idx] + self.output_[idx]
                                          + self.lm_groundtruthes[idx])
                    pt_holder = self.input_p[idx] if self.graphic else None
                    # Original author's notes: sess[0] is the main session,
                    # sess[1] the decode session (when CRF is used).
                    # Batch.train feeds the bucket in chunks of
                    # real_batch_size and runs train_step=self.train_steps[idx],
                    # which performs the parameter update.
                    Batch.train(sess=sess[0], placeholders=model_placeholders,
                                batch_size=real_batch_size,
                                train_step=self.train_steps[idx], loss=self.losses[idx],
                                lr=self.l_rate, lrv=lr_r, dr=self.drop_out,
                                drv=self.drop_out_v, data=list(sample),
                                pt_h=pt_holder, pixels=self.pixels, verbose=False,
                                merged_summary=self.merged_summary, log_writer=train_writer,
                                single_summary=self.summaries[idx], epoch_index=epoch)

                # Evaluate on the development set, bucket by bucket.
                predictions = []
                for v_b_x in zip(*v_x):
                    c_len = len(v_b_x[0][0])
                    idx = self.bucket_dit[c_len]
                    pt_holder = self.input_p[idx] if self.graphic else None
                    b_prediction = self.predict(data=v_b_x, sess=sess,
                                                model=self.input_v[idx] + self.output[idx],
                                                index=idx, pt_h=pt_holder, pt=self.pixels,
                                                batch_size=100)
                    b_prediction = toolbox.decode_tags(b_prediction, idx2tag, self.tag_scheme)
                    predictions.append(b_prediction)

                predictions = list(zip(*predictions))
                predictions = toolbox.merge_bucket(predictions)
                prediction_out = toolbox.generate_output(chars, predictions, self.tag_scheme)

                scores = toolbox.evaluator(prediction_out, gold_out, metric=metric,
                                           verbose=True, tag_num=tag_num)
                scores = np.asarray(scores)

                for name, pos in seg_score_idx:
                    c_seg[name] = scores[pos]
                # Pure segmentation has no tagging score; use the neutral
                # factor 1 so the product below equals the segmentation score.
                for name, pos in tag_score_idx:
                    c_tag[name] = scores[pos] if has_tags else 1

                if track_all:
                    for m in self.all_metrics:
                        print('Segmentation ' + m + ': %f' % c_seg[m])
                        print('POS Tagging ' + m + ': %f\n' % c_tag[m])
                else:
                    print('Segmentation ' + metric + ': %f' % c_seg[metric])
                    if has_tags:
                        print('POS Tagging ' + metric + ': %f\n' % c_tag[metric])

                # Combined score: segmentation score * joint seg&tag score.
                for m in self.all_metrics:
                    c_score[m] = c_seg[m] * c_tag[m]

                # Checkpoints are only considered after the first five epochs.
                if epoch > 4:
                    for m in (self.all_metrics if track_all else [metric]):
                        if c_score[m] > best_score[m]:
                            best_epoch[m] = epoch + 1
                            best_score[m] = c_score[m]
                            best_seg[m] = c_seg[m]
                            best_pos[m] = c_tag[m]
                            if track_all:
                                save_path = trained_model[:pindex] + m + '_' + trained_model[pindex:]
                            else:
                                save_path = trained_model
                            self.saver.save(sess[0], save_path, write_meta_graph=False)
                print('Time consumed: %d seconds' % int(time() - t))
        finally:
            # Flush and release the event file even if training is interrupted.
            train_writer.close()
        print('Training is finished!')

        for m in (self.all_metrics if track_all else [metric]):
            print('Best segmentation ' + m + ': %f' % best_seg[m])
            print('Best POS Tagging ' + m + ': %f' % best_pos[m])
            print('Best epoch: %d\n' % best_epoch[m])