def train():
    # Load the training data: character ids for each document and
    # the one-hot form of the corresponding labels
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, 600)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, 600)
    # Use either an LSTM or a CNN
    model = TextRNN()
    model.train()
    # model = TextCNN()
    # Choose a loss function
    Loss = nn.MultiLabelSoftMarginLoss()
    # Loss = nn.BCELoss()
    # Loss = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    best_val_acc = 0
    for epoch in range(100):
        i = 0
        print('epoch:{}'.format(epoch))
        batch_train = batch_iter(x_train, y_train, 64)
        for x_batch, y_batch in batch_train:
            i += 1
            x = torch.LongTensor(np.array(x_batch))
            y = torch.Tensor(np.array(y_batch))
            # y = torch.LongTensor(y)
            out = model(x)
            loss = Loss(out, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Validate the model every 90 batches
            if i % 90 == 0:
                # The optimizer is not needed here
                los, accuracy = evaluate(model, Loss, x_val, y_val)
                print('loss:{}, accuracy:{}'.format(los, accuracy))
                if accuracy > best_val_acc:
                    torch.save(model.state_dict(), 'model_params.pkl')
                    best_val_acc = accuracy
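# The evaluate() helper called above is not shown in this snippet. Below is a
# minimal sketch of what it might look like, assuming the model returns
# per-class scores and the labels are one-hot encoded; the accuracy
# definition and all names here are assumptions, not the original code.
def evaluate(model, Loss, x_val, y_val):
    model.eval()
    with torch.no_grad():
        x = torch.LongTensor(np.array(x_val))
        y = torch.Tensor(np.array(y_val))
        out = model(x)
        loss = Loss(out, y)
        # A prediction counts as correct when the highest-scoring class
        # matches the position of the 1 in the one-hot label.
        correct = (torch.argmax(out, dim=1) == torch.argmax(y, dim=1)).sum().item()
        accuracy = correct / len(y)
    model.train()
    return loss.item(), accuracy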
def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    # tf.train.Saver() must be created both when saving and when restoring a
    # model, not only when saving.
    saver.restore(sess=session, save_path=save_path)

    print('Testing...')
    # Returns the average cross-entropy loss and average accuracy over all
    # 10,000 test samples.
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1
    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0  # no dropout at test time
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # Gives precision, recall and F1-score per class
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, args.SEQ_LENGTH)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1
    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # holds the predictions
    for i in range(num_batch):  # process batch by batch
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
def test():
    print("loading test data ...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)

    print("testing...")
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = "Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}"
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1
    # argmax over axis 1: pick the index of the true class for each sample
    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # Evaluation
    print('precision, recall and f1')
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("confusion matrix")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("time usage ", time_dif)
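# All three test() variants above call an evaluate(session, x_, y_) helper
# that is not shown. A plausible sketch follows, assuming the model exposes
# input_x, input_y, keep_prob, loss and acc tensors as in the other snippets,
# and a batch_iter helper like the one used in train(); the batching and
# size-weighted averaging are assumptions, not the original implementation.
def evaluate(session, x_, y_):
    data_len = len(x_)
    batch_eval = batch_iter(x_, y_, 128)
    total_loss = 0.0
    total_acc = 0.0
    for x_batch, y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = {
            model.input_x: x_batch,
            model.input_y: y_batch,
            model.keep_prob: 1.0  # no dropout during evaluation
        }
        loss, acc = session.run([model.loss, model.acc], feed_dict=feed_dict)
        # Weight each batch by its size so the last, smaller batch
        # does not skew the averages.
        total_loss += loss * batch_len
        total_acc += acc * batch_len
    return total_loss / data_len, total_acc / data_len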
def train():
    print("Configuring TensorBoard and Saver...")
    # Configure TensorBoard. Delete the tensorboard folder before retraining,
    # otherwise the new graph is written on top of the old one.
    tensorboard_dir = 'tensorboard/textrnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # Configure Saver
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    # Load the training and validation sets
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # Create the session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0              # total number of batches processed
    best_acc_val = 0.0           # best validation accuracy so far
    last_improved = 0            # batch at which the last improvement occurred
    require_improvement = 1000   # stop early after 1000 batches without improvement
    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # Periodically write the training summaries to TensorBoard
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # Periodically report performance on the training and validation sets
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)  # todo

                if acc_val > best_acc_val:
                    # Save the best result
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            session.run(model.optim, feed_dict=feed_dict)  # run the optimization step
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # Validation accuracy has not improved for a long time; stop early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # break out of the batch loop
        if flag:
            break  # break out of the epoch loop as well
def train():
    print("Configuring TensorBoard and Saver...")
    tensorboard_dir = 'tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    # tf.summary saves log data for TensorBoard visualization.
    # tf.summary.scalar records scalars, typically the loss, accuracy and learning rate.
    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    # tf.summary.merge_all() merges all summary ops;
    # tf.summary.FileWriter writes the data to local disk.
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0
    best_acc_val = 0.0
    last_improved = 0
    # Stop early after 1000 batches without improvement, to guard against overfitting
    require_improvement = 1000
    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)  # todo

                if acc_val > best_acc_val:
                    # Save the best result
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            session.run(model.optim, feed_dict=feed_dict)  # run the optimization step
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # Validation accuracy has not improved for a long time; stop early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
def train():
    print("configuring tensorboard and saver")
    tensorboard_dir = "tensorboard/textcnn"
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("loading training and validation data..")
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("time usage: ", time_dif)

    # Create the session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print("training and evaluating...")
    start_time = time.time()
    total_batch = 0
    best_acc_val = 0.0
    last_improved = 0
    require_improvement = 1000
    flag = False
    for epoch in range(config.num_epochs):
        print("epoch: ", epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)
            if total_batch % config.save_per_batch == 0:
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)
            if total_batch % config.print_per_batch == 0:
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)

                # Save the best result; record the batch of the last improvement
                # so the early-stopping counter is reset correctly
                if acc_val > best_acc_val:
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = "*"
                else:
                    improved_str = ""

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            session.run(model.optim, feed_dict=feed_dict)
            total_batch += 1
            if total_batch - last_improved > require_improvement:
                print("no optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
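# The train() variants above also depend on feed_data() and batch_iter(),
# which are not shown. Minimal sketches follow, under the same assumptions
# about the model's placeholders; the per-epoch shuffling strategy is an
# assumption, not the original implementation.
def feed_data(x_batch, y_batch, keep_prob):
    return {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob
    }

def batch_iter(x, y, batch_size=64):
    # Shuffle once per epoch, then yield consecutive batches.
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = np.array(x)[indices]
    y_shuffle = np.array(y)[indices]
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]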
num_classes = 10  # number of classes

# Preparing data is usually the most time-consuming part of machine learning.
base_dir = r'C:\Users\weke\Desktop\语料搜集\cnews'
train_dir = os.path.join(base_dir, 'cnews.train.txt')
test_dir = os.path.join(base_dir, 'cnews.test.txt')
val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')

if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
    build_vocab(train_dir, vocab_dir, config.vocab_size)
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)
vocab_size = len(words)

X_train, y_train = process_file(train_dir, word_to_id, cat_to_id, seq_length)
X_test, y_test = process_file(val_dir, word_to_id, cat_to_id, seq_length)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

## Padding input data
# Models in Keras (and elsewhere) usually take as input batches of sentences of
# the same length. Since sentences usually have different sizes, we "pad" them
# (we add a dummy "padding" token at the end of the shorter sentences). The
# input thus has shape (batch_size, maxseqlen), where maxseqlen is the maximum
# length of a sentence in the batch.
'''
maxlen = 80  # cut texts after this number of words (among the top vocab_size most common words)
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
'''
test_file = 'cnews.test.txt'
val_file = 'cnews.val.txt'

# Dictionary mapping the text categories to their ids
categories, cat_to_id = read_category()
# print(categories)

# Every character that appears in the training text and its id
words, word_to_id = read_vocab('cnews.vocab.txt')
# print(words)
# print(word_to_id)

# Vocabulary size
vocab_size = len(words)

# Data loading and batching:
# character ids for each training document and one-hot labels
x_train, y_train = process_file('cnews.train1.txt', word_to_id, cat_to_id, 600)
# print('x_train=', x_train)
x_val, y_val = process_file('cnews.val.txt', word_to_id, cat_to_id, 600)

# Select the GPU
cuda = torch.device('cuda')

x_train, y_train = torch.LongTensor(x_train), torch.Tensor(y_train)
x_val, y_val = torch.LongTensor(x_val), torch.Tensor(y_val)

train_dataset = Data.TensorDataset(x_train, y_train)
train_loader = Data.DataLoader(dataset=train_dataset, batch_size=1280, shuffle=True)
val_dataset = Data.TensorDataset(x_val, y_val)
val_loader = Data.DataLoader(dataset=val_dataset, batch_size=1280)
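# The loaders above are not consumed in this snippet. Below is a minimal
# sketch of how they might drive one training epoch, assuming a TextRNN model
# with one-hot labels and nn.MultiLabelSoftMarginLoss as in the earlier
# PyTorch train(); the model choice and the .to(cuda) placement are
# assumptions, not the original code.
model = TextRNN().to(cuda)
criterion = nn.MultiLabelSoftMarginLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
for x_batch, y_batch in train_loader:
    x_batch, y_batch = x_batch.to(cuda), y_batch.to(cuda)
    out = model(x_batch)
    loss = criterion(out, y_batch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()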
    })
    text = features["text"]
    title = tf.cast(features["title"], tf.int32)
    label = tf.cast(features["label"], tf.int32)
    return text, title, label


if __name__ == "__main__":
    # Data format conversion
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    text_test, x_test, y_test = process_file2(test_dir, word_to_id, cat_to_id, config.seq_length)
    convert_to_TFRecords([x_train, y_train], base_dir + "/train")
    # convert_to_TFRecords_withText([text_test, x_test, y_test], base_dir + "/test")

    # Test reading the records back
    # read_TFRecords_test("sexText/train10.tfrecords")
    # queue = tf.train.string_input_producer(["sexText/test.tfrecords"], num_epochs=10)
    # text, title, label = read_example(queue)
    # text_batch, title_batch, label_batch = tf.train.batch(
    #     [text, title, label], batch_size=1, capacity=5000, num_threads=1)
    # count = 0
    # with tf.Session() as sess:
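# convert_to_TFRecords() is called above but not defined in this snippet.
# A minimal sketch using the TF1 tf.python_io API, assuming each example is a
# padded id sequence plus a one-hot label reduced to an integer class index;
# the field names loosely mirror read_example's "text" and "label" features,
# and everything else here is an assumption.
def convert_to_TFRecords(data, path):
    x, y = data
    writer = tf.python_io.TFRecordWriter(path + ".tfrecords")
    for ids, label in zip(x, y):
        example = tf.train.Example(features=tf.train.Features(feature={
            "text": tf.train.Feature(
                int64_list=tf.train.Int64List(value=list(ids))),
            "label": tf.train.Feature(
                int64_list=tf.train.Int64List(value=[int(np.argmax(label))]))
        }))
        writer.write(example.SerializeToString())
    writer.close()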
""" words, word_to_id = read_vocab(vocab_dir) categories, cat_to_id = read_category() # vocab_size = len(words) return words, word_to_id, categories, cat_to_id if __name__ == "__main__": test_dir = 'file/cnews/cnews.test.txt' vocab_dir = 'file/cnews/vocab.txt' train_dir = 'file/cnews/cnews.train.txt' words, word_to_id, categories, cat_to_id = load_data(vocab_dir) x_pad, y_pad = process_file(train_dir, word_to_id, cat_to_id, max_length=5000) x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, max_length=5000) X = x_pad.T Y = y_pad.T x_test = x_test.T y_test = y_test.T learning_rate = 1e-3 num_interantion = 100 print("---------------start------------------------")
train_epochs = 1000
batch_size = 256
lr = 0.001

#########################################
# Dictionary mapping the text categories to their ids
categories, cat_to_id = read_category()
print(categories)

# Every character that appears in the training text and its id
words, word_to_id = read_vocab('cnews.vocab.txt')
vocab_size = len(words)

# Data loading and batching:
# character ids for each training document and one-hot labels
x_train, y_train = process_file('cnews.train.txt', word_to_id, cat_to_id, 600)
x_test, y_test = process_file('cnews.val.txt', word_to_id, cat_to_id, 600)

cuda = torch.device('cuda')
x_train, y_train = torch.LongTensor(x_train), torch.LongTensor(y_train)
x_test, y_test = torch.LongTensor(x_test), torch.LongTensor(y_test)

train_dataset = Data.TensorDataset(x_train, y_train)
train_loader = Data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = Data.TensorDataset(x_test, y_test)
test_loader = Data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
#################################################################################
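# These loaders are likewise left unconsumed. A minimal sketch of an accuracy
# check over test_loader: since the labels were loaded as one-hot LongTensors,
# both predictions and targets are reduced with argmax. The model variable is
# an assumption (e.g., a trained TextRNN/TextCNN from the earlier snippets).
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        out = model(x_batch)
        pred = torch.argmax(out, dim=1)
        target = torch.argmax(y_batch, dim=1)
        correct += (pred == target).sum().item()
        total += y_batch.size(0)
print('test accuracy: {:.2%}'.format(correct / total))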