# Report corpus statistics collected during preprocessing.
print('Longest sentence by character is %d. ' % max_step_c)
print('Longest sentence by word is %d. ' % max_step_w)
print('Longest word is %d. ' % max_w_len)

# Group sentences into length buckets so padding waste stays small.
# b_train_x: shape = (4, num_buckets,)
b_train_x, b_train_y = toolbox.buckets(train_x, train_y, size=args.bucket_size)
b_dev_x, b_dev_y = toolbox.buckets(dev_x, dev_y, size=args.bucket_size)

# Pad sentences with trailing zeros so every sentence inside a bucket
# has the same length.
# b_train_x: shape = (4, num_buckets, sentences_per_bucket, sentence_length)
# b_train_y: shape = (1, num_buckets, sentences_per_bucket, sentence_length);
#            the leading 1 means each character maps to exactly one tag.
b_train_x, b_train_y, b_buckets, b_counts = toolbox.pad_bucket(
    b_train_x, b_train_y)

# b_buckets: the (aligned) sentence length of each bucket.
# b_counts: the number of sentences in each bucket.
# Dev data is padded against the training bucket lengths for consistency.
b_dev_x, b_dev_y, b_buckets, _ = toolbox.pad_bucket(b_dev_x, b_dev_y,
                                                    bucket_len_c=b_buckets)

print('Training set: %d instances; Dev set: %d instances.' % (len(
    train_x[0]), len(dev_x[0])))

nums_tags = toolbox.get_nums_tags(tag2idx, args.tag_scheme)

# Session configuration; allow_soft_placement lets TF fall back to an
# available device when the requested one does not exist.
config = tf.ConfigProto(allow_soft_placement=True)
gpu_config = "/gpu:" + str(args.gpu)

print('Initialization....')
# Start the wall-clock timer for the initialization phase.
t = time()
limit=args.sent_limit, sent_seg=args.sent_seg, is_space=is_space, ignore_space=args.ignore_space) train_x += train_gram dev_x += dev_gram nums_grams = [] for dic in gram2idx: nums_grams.append(len(dic.keys())) max_len = max(max_len_train, max_len_dev) b_train_x, b_train_y = toolbox.buckets(train_x, train_y, size=args.bucket_size) b_train_x, b_train_y, b_lens, b_count = toolbox.pad_bucket( b_train_x, b_train_y, max_len) b_dev_x = [toolbox.pad_zeros(dev_x_i, max_len) for dev_x_i in dev_x] b_dev_y_gold = [ line.strip() for line in codecs.open( path + '/tag_dev_gold.txt', 'r', encoding='utf-8') ] nums_tag = len(tag2idx) config = tf.ConfigProto(allow_soft_placement=True) gpu_config = "/gpu:" + str(args.gpu) transducer = None transducer_graph = None