Exemple #1
0
        pixels = toolbox.read_chars_pixels(path, font_name, args.picture_size)

    char2idx, idx2char, tag2idx, idx2tag = toolbox.get_dic(chars, tags)

    # train_x: shape=(2, number of sentences); the 2 stands for the characters themselves + their radicals
    train_x, train_y, train_max_slen_c, train_max_slen_w, train_max_wlen = \
        toolbox.get_input_vec(path, train_file, char2idx, tag2idx, rad_dic=rad_dic, tag_scheme=args.tag_scheme)
    dev_x, dev_y, dev_max_slen_c, dev_max_slen_w, dev_max_wlen = \
        toolbox.get_input_vec(path, dev_file, char2idx, tag2idx, rad_dic=rad_dic, tag_scheme=args.tag_scheme)

    # Read the n-gram vectors
    nums_grams = None
    ng_embeddings = None

    if args.ngram > 1:
        gram2idx = toolbox.get_ngram_dic(ngram)
        train_gram = toolbox.get_gram_vec(path, train_file, gram2idx)
        dev_gram = toolbox.get_gram_vec(path, dev_file, gram2idx)
        # After this statement train_x: shape=(4, number of sentences), because the 2-gram and 3-gram vectors were added
        train_x += train_gram
        dev_x += dev_gram
        nums_grams = []
        for dic in gram2idx:
            nums_grams.append(len(dic.keys()))

        if args.ngram_embeddings is not None:
            print 'Reading N-gram Embeddings...'
            short_ng_emb = args.ngram_embeddings[args.ngram_embeddings.
                                                 index('/') + 1:]
            if not os.path.isfile(path + '/' + short_ng_emb + '_' +
                                  str(args.ngram) + 'gram_sub.txt'):
    def tokenize(self, text):
        """Tokenize the input text.

        The text is written to a temporary file whose name is handed to the
        ``toolbox`` readers. The character dictionaries returned by
        ``toolbox.get_dicts`` are updated through
        ``toolbox.update_char_dict``, the model updates are defined and run,
        and the padded input vectors are passed to ``self.model.tag``.

        :param text: the text to tokenize.
        :return: a list of the sentences contained in the text. Each element is
        a list of tokens and each token is a tuple made of its raw string and
        its offset in the text.
        """
        # The same length bound is used for vectorizing and for padding.
        padded_limit = self.sent_limit + 100

        (char2idx, unk_chars_idx,
         idx2char, _tag2idx, idx2tag, _) = toolbox.get_dicts(self.path,
                                                             True,
                                                             self.tag_scheme,
                                                             self.crf)

        grams = None
        if self.ngram > 1:
            grams = toolbox.read_ngrams(self.path, self.ngram)

        # The context manager guarantees the temporary file is closed (and
        # therefore removed) even when one of the toolbox calls raises.
        with tempfile.NamedTemporaryFile(mode='w') as raw_file_f:
            raw_file_f.write(text)
            # The calls below only receive the file name, so the buffered
            # text has to be on disk before they run.
            raw_file_f.flush()
            raw_file = raw_file_f.name

            new_chars = toolbox.get_new_chars(raw_file, char2idx,
                                              self.is_space)

            if self.emb_path is not None:
                valid_chars = toolbox.get_valid_chars(new_chars,
                                                      self.emb_path)
            else:
                valid_chars = None

            (char2idx, idx2char,
             unk_chars_idx, sub_dict) = toolbox.update_char_dict(
                 char2idx, new_chars, unk_chars_idx, valid_chars)

            raw_x, _ = toolbox.get_input_vec_tag(None, raw_file, char2idx,
                                                 limit=padded_limit,
                                                 is_space=self.is_space)

            if self.ngram > 1:
                gram2idx = toolbox.get_ngram_dic(grams)
                # NOTE(review): the result of get_new_grams was never used;
                # the call is kept in case it has side effects -- confirm
                # and drop it if it is a pure function.
                toolbox.get_new_grams(raw_file, gram2idx, is_raw=True,
                                      is_space=self.is_space)

                raw_grams = toolbox.get_gram_vec_tag(None, raw_file, gram2idx,
                                                     limit=padded_limit,
                                                     is_space=self.is_space)

                raw_x += raw_grams

        for k, vectors in enumerate(raw_x):
            raw_x[k] = toolbox.pad_zeros(vectors, padded_limit)

        with self.main_graph.as_default():
            self.model.define_updates(new_chars=new_chars,
                                      emb_path=self.emb_path,
                                      char2idx=char2idx)

        with tf.device(self.gpu_config):
            # NOTE(review): updates run on self.main_sess while tagging below
            # uses self.sess -- confirm that two distinct sessions are
            # intended here.
            self.model.run_updates(self.main_sess)

        sentences = self.model.tag(raw_x,
                                   idx2tag,
                                   idx2char,
                                   unk_chars_idx,
                                   sub_dict,
                                   self.sess,
                                   batch_size=self.tag_batch)

        return sentences