Example #1
def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
    global last_f1
    global lr
    time0 = time.time()
    batch_indexs = np.random.permutation(n_tr_batches)  # shuffle the training data
    for batch in tqdm(range(n_tr_batches)):
        global_step = sess.run(model.global_step)
        if 0 == (global_step + 1) % FLAGS.valid_step:
            valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
            print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
                global_step, valid_cost, precision, recall, f1, time.time() - time0))
            time0 = time.time()
            if f1 > last_f1:
                last_f1 = f1
                saving_path = model.saver.save(sess, model_path, global_step+1)
                print('saved new model to %s ' % saving_path)
        # training
        batch_id = batch_indexs[batch]
        [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id)
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                     model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob}
        summary, _cost, _, _ = sess.run(train_fetches, feed_dict)  # the cost is the mean cost of one batch
        # valid per 500 steps
        if 0 == (global_step + 1) % 500:
            train_writer.add_summary(summary, global_step)
            batch_id = np.random.randint(0, n_va_batches)  # randomly pick one validation batch
            [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
            y_batch = to_categorical(y_batch)
            _batch_size = len(y_batch)
            feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                         model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
            summary, _cost = sess.run(valid_fetches, feed_dict)
            test_writer.add_summary(summary, global_step)
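A note on the to_categorical used in Examples #1 through #7: it cannot be the Keras helper of the same name, since y_batch holds one label list per sample (e.g. the ground-truth [8, 15, 307, 0] shown further down) and len(y_batch) is then used as the batch size. It presumably builds a multi-hot matrix. A minimal sketch of such an encoder, with n_classes as an assumed parameter:

import numpy as np

def to_categorical(label_lists, n_classes):
    # Hypothetical multi-hot encoder: one row per sample, 1.0 at each
    # of that sample's label indices.
    out = np.zeros((len(label_lists), n_classes), dtype=np.float32)
    for row, labels in enumerate(label_lists):
        out[row, labels] = 1.0
    return out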
Example #2
def valid_epoch(data_path, sess, model):
    """Test on the valid data."""
    va_batches = os.listdir(data_path)
    n_va_batches = len(va_batches)
    _costs = 0.0
    predict_labels_list = list()  # all predictions
    marked_labels_list = list()
    for i in range(n_va_batches):
        [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
        marked_labels_list.extend(y_batch)
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        fetches = [model.loss, model.y_pred]
        feed_dict = {
            model.X1_inputs: X1_batch,
            model.X2_inputs: X2_batch,
            model.y_inputs: y_batch,
            model.batch_size: _batch_size,
            model.tst: True,
            model.keep_prob: 1.0
        }
        _cost, predict_labels = sess.run(fetches, feed_dict)
        _costs += _cost
        predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1],
                                  predict_labels))  # indices of the top-5 scores
        predict_labels_list.extend(predict_labels)
    predict_label_and_marked_label_list = zip(predict_labels_list,
                                              marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    mean_cost = _costs / n_va_batches
    return mean_cost, precision, recall, f1
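A quick aside on the slice that recurs in every one of these loops: label.argsort() returns class indices in ascending score order, so the reversed slice [-1:-6:-1] walks the last five entries from highest score to lowest. A standalone check:

import numpy as np

scores = np.array([0.05, 0.40, 0.10, 0.30, 0.15])
print(scores.argsort())            # [0 2 4 3 1], indices in ascending score order
print(scores.argsort()[-1:-6:-1])  # [1 3 4 2 0], indices of the top-5 scores, descending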
Example #3
def valid_epoch(data_path, sess, model):
    # ?? These two lines look redundant; they were already set up earlier.
    va_batches = os.listdir(data_path)
    n_va_batches = len(va_batches)
    _costs = 0.0
    predict_labels_list = list()
    marked_labels_list = list()
    for i in range(n_va_batches):
        [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
        marked_labels_list.extend(y_batch)
        # convert labels to one-hot form
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        fetches = [model.loss, model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}

        _cost, predict_labels = sess.run(fetches, feed_dict)
        # accumulate the loss over the epoch
        _costs += _cost
        # note: predict_labels has shape [batch_size, n_classes]
        predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))
        predict_labels_list.extend(predict_labels)
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    mean_cost = _costs / n_va_batches
    return mean_cost, precision, recall, f1
Example #4
def valid_epoch():
    """Testing or valid."""
    data_valid._index_in_epoch = 0  # reset the pointer to the first sample
    _batch_size = te_batch_size
    fetches = [cost, y_pred]
    batch_num = int(valid_data_size / _batch_size)
    start_time = time.time()
    _costs = 0.0
    predict_labels_list = list()  # all predictions
    for i in range(batch_num):
        X_batch, y_batch = data_valid.next_batch(_batch_size)
        X1_batch = X_batch[:, :n_step1]
        X2_batch = X_batch[:, n_step1:]
        y_batch = to_categorical(y_batch)
        feed_dict = {
            X1_inputs: X1_batch,
            X2_inputs: X2_batch,
            y_inputs: y_batch,
            lr: 1e-5,
            batch_size: _batch_size,
            keep_prob: 1.0,
            fc_keep_prob: 1.0,
            tst: True,
            n_updates: global_step
        }
        _cost, predict_labels = sess.run(fetches, feed_dict)
        _costs += _cost
        predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1],
                                  predict_labels))  # indices of the top-5 scores
        predict_labels_list.extend(predict_labels)
    predict_label_and_marked_label_list = zip(predict_labels_list,
                                              marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    mean_cost = _costs / batch_num
    return mean_cost, precision, recall, f1
def valid_epoch(data_path=data_valid_path):
    """Test on the valid data."""
    _costs = 0.0
    predict_labels_list = list()  # all predictions
    marked_labels_list = list()  # ground-truth labels
    _global_step = sess.run(global_step)
    for i in range(n_va_batches):
        [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
        marked_labels_list.extend(y_batch)
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        fetches = [merged, cost, y_pred]
        feed_dict = {
            X1_inputs: X1_batch,
            X2_inputs: X2_batch,
            y_inputs: y_batch,
            batch_size: _batch_size,
            keep_prob: 1.0,
            tst: True,
            n_updates: _global_step
        }
        summary, _cost, predict_labels = sess.run(fetches, feed_dict)
        _costs += _cost
        predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1],
                                  predict_labels))  # indices of the top-5 scores
        predict_labels_list.extend(predict_labels)
    predict_label_and_marked_label_list = zip(predict_labels_list,
                                              marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    mean_cost = _costs / n_va_batches
    return mean_cost, precision, recall, f1
def predict_test(sess, model):
    """Test on the valid data."""
    time0 = time.time()
    predict_labels_list = list()  # all predictions
    predict_score20_list = list() # top-20 predicted scores
    predict_labels_list2 = list() # top-5 results
    marked_labels_list = list()
    topic_num = list()
    predict_scores = list()
    for i in tqdm(range(n_te_batches)):
        [X1_batch, X2_batch, y_batch] = get_batch_t(i)
        marked_labels_list.extend(y_batch)  # ground-truth labels (not shifted by -1)
        y_batch = to_categorical(y_batch)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]  # per-class scores
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_labels = softmax(predict_labels)  # shape: [batch_size, n_classes]
        predict_scores.append(predict_labels)  # per-class scores


        predict_top5score = list(map(lambda label: np.sort(label, axis=-1)[-1:-6:-1], predict_labels))  # top-5 scores per sample

        index = list(map(findindex, predict_top5score))  # one entry per sample
        #print (index,'index.type:',type(index),'len.index',len(index))

        predict_toplabels = list()

        for j in range(len(index)):  # j avoids clobbering the batch index i
            if index[j] is None:
                toplabel = predict_labels[j].argsort()[-1:-6:-1]
            elif index[j] == 0:
                toplabel = predict_labels[j].argsort()[-1:-2:-1]
            else:
                toplabel = predict_labels[j].argsort()[-1:-1 * index[j] - 1:-1]
            predict_toplabels.append(toplabel)

        predict_labels_list.extend(predict_toplabels) 
        #print('predict_toplabels:',predict_toplabels,type(predict_toplabels),len(predict_toplabels))

    predict_label_and_marked_label_list = list(zip(predict_labels_list, marked_labels_list))  # both shifted by -1; unclear why

    print(predict_label_and_marked_label_list[0:2])
    # e.g. (array([ 15, 327, 307, 478,  10]), [8, 15, 307, 0]); the ground truth is [9, 16, 308, 1]
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)  # compute the scores
    print('Local test p=%g, r=%g, f1=%g' % (precision, recall, f1))
    predict_scores = np.vstack(np.asarray(predict_scores))
    print('predict_scores:',predict_scores.shape)
    local_scores_name = local_scores_path + model_name + '_test.npy'
    np.save(local_scores_name, predict_scores)  # save the per-class scores
    print('local_scores.shape=', predict_scores.shape)
    print('Wrote the test scores into %s, time %g s' % (local_scores_name, time.time() - time0))
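The helper findindex is never defined in any of these snippets, so its exact logic is unknown. From the call sites, it maps a descending top-5 score vector to the number of labels worth keeping: None falls back to the full top 5, 0 keeps only the best label, and k keeps the top k. One purely hypothetical reconstruction, based on a score-gap heuristic with an assumed threshold:

def findindex(top5_scores, gap_threshold=0.2):
    # Hypothetical: return the position of the first large drop between
    # consecutive scores, or None when the scores decay smoothly.
    # `gap_threshold` is an assumed value, not from the original code.
    for k in range(1, len(top5_scores)):
        if top5_scores[k - 1] - top5_scores[k] > gap_threshold:
            return k
    return None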
Example #7
def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
    global last_f1
    global lr
    time0 = time.time()
    batch_indexs = np.random.permutation(n_tr_batches)  # shuffle the training data
    # iterate over the training batches with a tqdm progress bar
    for batch in tqdm(range(n_tr_batches)):
        # check global_step and validate every FLAGS.valid_step steps
        global_step = sess.run(model.global_step)
        if (global_step + 1) % FLAGS.valid_step == 0:
            valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
            print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
                global_step, valid_cost, precision, recall, f1, time.time() - time0))
            time0 = time.time()
            if f1 > last_f1:
                last_f1 = f1
                saving_path = model.saver.save(sess, model_path, global_step + 1)
                print('saved new model to %s ' % saving_path)
        # training
        batch_id = batch_indexs[batch]
        [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id)
        # convert labels to one-hot form
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                     model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}  # tst=False during training (the original passed True, likely a bug)
        summary, _cost, _, _ = sess.run(train_fetches, feed_dict)
        # valid per 500 steps
        if (global_step + 1) % 500 == 0:
            train_writer.add_summary(summary, global_step)
            batch_id = np.random.randint(0, n_va_batches)  # randomly pick one validation batch
            [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
            y_batch = to_categorical(y_batch)
            _batch_size = len(y_batch)
            feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                         model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
            summary, _cost = sess.run(valid_fetches, feed_dict)
            test_writer.add_summary(summary, global_step)
def valid_epoch(data_path, sess, model):
    """Test on the valid data."""
    va_batches = os.listdir(data_path)
    n_va_batches = len(va_batches)
    _costs = 0.0
    predict_labels_list = list()  # all predictions
    marked_labels_list = list()
    for i in range(n_va_batches):
        [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
        marked_labels_list.extend(y_batch)
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        fetches = [model.loss, model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        _cost, predict_labels = sess.run(fetches, feed_dict)
        _costs += _cost
        predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the top-5 scores
        predict_labels_list.extend(predict_labels)
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    mean_cost = _costs / n_va_batches
    return mean_cost, precision, recall, f1
def train_epoch(data_path, sess, model, train_fetches, valid_fetches,
                train_writer, test_writer):
    global last_score
    global lr
    time0 = time.time()
    batch_indexs = np.random.permutation(
        n_tr_batches)  # shuffle the training data
    for batch in range(n_tr_batches):
        global_step = sess.run(model.global_step)
        if 0 == (global_step + 1) % FLAGS.valid_step:
            valid_cost, score = valid_epoch(data_valid_path, sess, model)
            print('\n')
            print('Global_step=%d: valid cost=%g; score=%g, time=%g s' %
                  (global_step, valid_cost, score, time.time() - time0))
            logging.info('Global_step=%d: valid cost=%g; score=%g, time=%g s' %
                         (global_step, valid_cost, score, time.time() - time0))
            time0 = time.time()
            if score > last_score:
                last_score = score
                saving_path = model.saver.save(sess, model_path,
                                               global_step + 1)
                print('\n')
                print('saved new model to %s ' % saving_path)
                logging.info('saved new model to %s ' % saving_path)
        # training
        batch_id = batch_indexs[batch]
        [X_batch, y_batch] = get_batch(data_train_path, batch_id)
        y_batch = to_categorical(y_batch)  # convert labels to one-hot form
        _batch_size = len(y_batch)
        feed_dict = {
            model.X_inputs: X_batch,
            model.y_inputs: y_batch,
            model.batch_size: _batch_size,
            model.tst: False,
            model.keep_prob: FLAGS.keep_prob
        }
        summary, _cost, _accuracy, _, _ = sess.run(
            train_fetches, feed_dict)  # the cost is the mean cost of one batch
        time_str = datetime.datetime.now().isoformat()
        #         print("{}: step {}, loss {:g}, acc {:g}".format(time_str, global_step, _cost, _accuracy))
        logging.info("{}: step {}, loss {:g}, acc {:g}".format(
            time_str, global_step, _cost, _accuracy))
        # valid per 500 steps
        if 0 == (global_step + 1) % 500:
            train_writer.add_summary(summary, global_step)
            batch_id = np.random.randint(0, n_va_batches)  # randomly pick one validation batch
            [X_batch, y_batch] = get_batch(data_valid_path, batch_id)
            y_batch = to_categorical(y_batch)  # convert the law-article labels to one-hot form
            _batch_size = len(y_batch)  # was `batch_size`, which left a stale `_batch_size` in the feed_dict below
            feed_dict = {
                model.X_inputs: X_batch,
                model.y_inputs: y_batch,
                model.batch_size: _batch_size,
                model.tst: True,
                model.keep_prob: 1.0
            }
            summary, _cost, _accuracy = sess.run(valid_fetches, feed_dict)
            time_str = datetime.datetime.now().isoformat()
            #             print("valid: {}: step {}, loss {:g}, acc {:g}".format(time_str, global_step, _cost, _accuracy))
            logging.info("valid: {}: step {}, loss {:g}, acc {:g}".format(
                time_str, global_step, _cost, _accuracy))
            test_writer.add_summary(summary, global_step)
def valid_epoch(data_path, sess, model):
    """Test on the valid data."""
    va_batches = os.listdir(data_path)
    n_va_batches = len(va_batches)
    _costs = 0.0
    predict_labels_list = list()  # all predictions
    marked_labels_list = list()
    #     n_va_batches = 10
    for i in range(n_va_batches):
        [X_batch, y_batch1] = get_batch(data_path, i)
        marked_labels_list.extend(y_batch1)
        y_batch = to_categorical(y_batch1)  # convert the law-article or charge labels to one-hot form
        _batch_size = len(y_batch)
        fetches = [model.loss, model.y_pred, model.accuracy]
        feed_dict = {
            model.X_inputs: X_batch,
            model.y_inputs: y_batch,
            model.batch_size: _batch_size,
            model.tst: True,
            model.keep_prob: 1.0
        }
        _cost, predict_labels, _accuracy = sess.run(fetches, feed_dict)
        train_batch_predict(predict_labels,
                            y_batch1,
                            batch_path + config.MISSION + "/",
                            i,
                            batch_size=config.BATCH_SIZE)
        #         train_batch_predict(predict_labels,y_batch1, batch_path+'accu/', i, batch_size=config.BATCH_SIZE)
        predict_labelsnew = []
        if config.LAST_LAYER == "sigmoid":
            for label in predict_labels:
                xitem = np.argwhere(label > config.SIGMOID_THRESHOLD).flatten()
                if (len(xitem) > 0):
                    predict_labelsnew.append(xitem)
                else:
                    predict_labelsnew.append(label.argsort()[-1:-2:-1])
        elif config.LAST_LAYER == "softmax":
            for label in predict_labels:
                order = np.argsort(label)[::-1]  # class indices, highest probability first
                pred = [order[0]]
                # keep adding labels while the cumulative probability stays below the
                # threshold; the original subtracted sort *indices* instead of the
                # probabilities and reused `i`, clobbering the batch index used below
                total_prob = config.SOFTMAX_THRESHHOLD - label[order[0]]
                for idx in order[1:]:
                    total_prob -= label[idx]
                    if total_prob > 0:
                        pred.append(idx)
                    else:
                        break
                xitem = np.array(pred)
                predict_labelsnew.append(xitem)

        _costs += _cost
        if i == 0:
            logging.info(predict_labelsnew)
        #         predict_labels = map(lambda label: label.argsort()[-1:-2:-1], predict_labels)  # take the top-1 index
        predict_labels_list.extend(predict_labelsnew)

    predict_label_and_marked_label_list = zip(predict_labels_list,
                                              marked_labels_list)
    score = get_task_score(predict_label_and_marked_label_list)  # law-article prediction score
    mean_cost = _costs / n_va_batches
    return mean_cost, score
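To make the sigmoid branch above concrete: every class whose score clears config.SIGMOID_THRESHOLD is emitted, with a top-1 fallback when nothing clears it. A standalone sketch with an assumed threshold of 0.5:

import numpy as np

def decode_sigmoid(scores, threshold=0.5):
    # Multi-label decode: all classes above `threshold`, falling back
    # to the single best class when none qualify.
    picked = np.argwhere(scores > threshold).flatten()
    return picked if len(picked) > 0 else scores.argsort()[-1:-2:-1]

print(decode_sigmoid(np.array([0.1, 0.8, 0.6, 0.2])))  # [1 2]
print(decode_sigmoid(np.array([0.1, 0.3, 0.2])))       # [1]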
            valid_cost, precision, recall, f1 = valid_epoch()
            print(
                'Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' %
                (_global_step, valid_cost, precision, recall, f1,
                 time.time() - time0))
            time0 = time.time()
            if f1 > last_f1:
                last_f1 = f1
                model_num += 1
                save_path = saver.save(sess, model_path, global_step=model_num)
                print('the save path is ', save_path)

        batch_id = batch_indexs[batch]
        [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id,
                                                  n_step1)
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        fetches = [merged, cost, train_op, update_op]
        feed_dict = {
            X1_inputs: X1_batch,
            X2_inputs: X2_batch,
            y_inputs: y_batch,
            batch_size: _batch_size,
            keep_prob: 0.5,
            tst: False,
            n_updates: _global_step
        }
        summary, _cost, _, _ = sess.run(
            fetches, feed_dict)  # the cost is the mean cost of one batch
        if _global_step % 100 == 0:  # the original `% 100` fired on every step *not* divisible by 100
            train_writer.add_summary(summary, _global_step)
def predict_dev(sess, model):
    """Test on the valid data."""
    time0 = time.time()
    predict_labels_list = list()  # all predictions
    predict_score20_list = list() # top-20 predicted scores
    predict_labels_list2 = list() # top-5 results
    marked_labels_list = list()
    topic_num = list()
    predict_scores = list()
    for i in tqdm(range(n_va_batches)):  # validation set
        [X1_batch, X2_batch, y_batch] = get_batch(i)
        X1_length, X2_length = get_sequence_length(X1_batch, X2_batch)
        marked_labels_list.extend(y_batch)  # ground-truth labels (not shifted by -1)
        y_batch = to_categorical(y_batch)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]  # per-class scores
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.X1_length: X1_length, model.X2_length: X2_length,
                     model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_labels = softmax(predict_labels)  # shape: [batch_size, n_classes]
        predict_scores.append(predict_labels)  # per-class scores


        predict_top5score = list(map(lambda label: np.sort(label, axis=-1)[-1:-6:-1], predict_labels))  # top-5 scores per sample
        #predict_top20score = map(lambda label: np.sort(label,axis=-1)[-1:-21:-1], predict_labels)  # top-20 scores per sample
        #print (type(predict_score20_list))
        #print (type(predict_top20score))
        #predict_score20_list.extend(predict_top20score)  # all of them
        #list, predict_score_list1[0]=[ 0.63514245  0.09193601  0.0417341   0.02742104  0.02721145]

        index = list(map(findindex, predict_top5score))  # one entry per sample
        #print (index,'index.type:',type(index),'len.index',len(index))

        predict_toplabels = list()

        for j in range(len(index)):  # j avoids clobbering the batch index i
            if index[j] is None:
                toplabel = predict_labels[j].argsort()[-1:-6:-1]
            elif index[j] == 0:
                toplabel = predict_labels[j].argsort()[-1:-2:-1]
            else:
                toplabel = predict_labels[j].argsort()[-1:-1 * index[j] - 1:-1]
            predict_toplabels.append(toplabel)

        predict_labels_list.extend(predict_toplabels) 
        #print('predict_toplabels:',predict_toplabels,type(predict_toplabels),len(predict_toplabels))


        #predict_top5labels = map(lambda label: label.argsort()[-1:-6:-1], predict_labels)  # indices of the top-5 scores
        #predict_labels_list2.extend(predict_top5labels)
        
        #predict_labels_list2.to_csv('predict_labels_list2.csv')


    #predict_score20_list = DataFrame(predict_score20_list)
    #predict_labels_list2 = DataFrame(predict_labels_list2)
    #predict_score20_list.to_csv('score20list.csv')
    #predict_labels_list2.to_csv('predict_labels_list2.csv')
    #topic_num = map(tolen,marked_labels_list)
    #topic_num = DataFrame(topic_num)
    #topic_num.to_csv('topic_num.csv')
    predict_label_and_marked_label_list = list(zip(predict_labels_list, marked_labels_list))  # both shifted by -1; unclear why

    print(predict_label_and_marked_label_list[0:2])
    # e.g. (array([ 15, 327, 307, 478,  10]), [8, 15, 307, 0]); the ground truth is [9, 16, 308, 1]
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)  # compute the scores
    print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
    predict_scores = np.vstack(np.asarray(predict_scores))
    print('predict_scores:',predict_scores.shape)
    local_scores_name = local_scores_path + model_name + '_dev.npy'
    np.save(local_scores_name, predict_scores)  # save the per-class scores
    print('local_scores.shape=', predict_scores.shape)
    print('Wrote the dev scores into %s, time %g s' % (local_scores_name, time.time() - time0))
Example #13
        max_features = np.max(list(indice_token.keys())) + 1

        # Augmenting x_train and x_test with n-grams features
        x_train = add_ngram(list(x_train), token_indice, ngram_range)
        x_dev = add_ngram(list(x_dev), token_indice, ngram_range)
        x_test = add_ngram(list(x_test), token_indice, ngram_range)
        print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
        print('Average dev sequence length: {}'.format(np.mean(list(map(len, x_dev)), dtype=int)))
        print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_dev = sequence.pad_sequences(x_dev, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    testy = y_test
    y_train = np.array([to_categorical(s) for s in y_train])
    y_dev = np.array([to_categorical(s) for s in y_dev])
    y_test = np.array([to_categorical(s) for s in y_test])
    print('x_train shape:', x_train.shape)
    print('x_dev shape:', x_dev.shape)
    print('x_test shape:', x_test.shape)
    print(x_train[1])
    print('Build model...')

    model = Sequential()

    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(Embedding(max_features,
                        embedding_dims,
                        input_length=maxlen))
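add_ngram is not defined in this snippet; its usage matches the helper from the Keras FastText example, where token_indice maps n-gram tuples to integer ids above the unigram vocabulary and each matching n-gram's id is appended to the sequence. A sketch along those lines:

def add_ngram(sequences, token_indice, ngram_range=2):
    # For each sequence, append the id of every n-gram (n = 2..ngram_range)
    # found in `token_indice`, FastText-style.
    new_sequences = []
    for input_list in sequences:
        new_list = list(input_list)
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(input_list) - ngram_value + 1):
                ngram = tuple(input_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences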
Example #14
# ==================================================

# Load data
print("Loading data...")
(X_train, y_train), (X_test, y_test), WordEm = data_helpers.loadData(
    path='../corpus/wordseq/mr_new.p')
# Randomly shuffle data
np.random.seed(1933)
max_features = (WordEm.shape[0])
embedding_size = WordEm.shape[1]
sequence_length = X_train.shape[1]
print("Vocabulary Size: {:d}".format(max_features))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_test)))
print("Sequnence Length: {:d}".format(sequence_length))

train_label = data_helpers.to_categorical(y_train, 2)
test_label = data_helpers.to_categorical(y_test, 2)

# Training
# ==================================================

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(sequence_length=sequence_length,
                      num_classes=2,
                      vocab_size=max_features,
                      embedding_size=FLAGS.embedding_dim,