Example #1
    def load_model(self):
        config = self.load_config()
        vocabproc = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(
            os.path.join(self.root, "text.vocab"))
        print("Text Vocabulary Size: {:d}".format(len(vocabproc.vocabulary_)))

        rcnn = TextRCNN(
            sequence_length=config["sequence_length"],
            num_classes=config["classes"],
            vocab_size=len(vocabproc.vocabulary_),
            word_embedding_size=config["word_embedding_size"],
            context_embedding_size=config["context_embedding_size"],
            cell_type=config["cell_type"],
            hidden_size=config["hidden_size"],
            l2_reg_lambda=config["l2_reg_lambda"],
            W_text_trainable=config["W_text_trainable"]
        )
        self.sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        restore_from_lastest(self.sess, saver, self.root)
        return rcnn, vocabproc
Example #2
def main(config):

    if not os.path.exists(config.model_dir):
        os.makedirs(config.model_dir)

    if not os.path.exists(config.log_dir):
        os.makedirs(config.log_dir)

    print("\t \t \t the model name is {}".format(config.model_name))
    device, n_gpu = get_device()

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)
        torch.backends.cudnn.deterministic = True  # make cudnn use deterministic algorithms so runs are reproducible
    """ sst2 数据准备 """
    text_field = data.Field(tokenize='spacy',
                            lower=True,
                            include_lengths=True,
                            fix_length=config.sequence_length)
    label_field = data.LabelField(dtype=torch.long)

    train_iterator, dev_iterator, test_iterator = load_sst2(
        config.data_path, text_field, label_field, config.batch_size, device,
        config.glove_word_file, config.cache_path)
    """ 词向量准备 """
    pretrained_embeddings = text_field.vocab.vectors

    model_file = os.path.join(config.model_dir, 'model1.pt')
    """ Model preparation """
    if config.model_name == "TextCNN":
        from TextCNN import TextCNN
        filter_sizes = [int(val) for val in config.filter_sizes.split()]
        model = TextCNN.TextCNN(config.glove_word_dim, config.filter_num,
                                filter_sizes, config.output_dim,
                                config.dropout, pretrained_embeddings)
    elif config.model_name == "TextRNN":
        from TextRNN import TextRNN
        model = TextRNN.TextRNN(config.glove_word_dim, config.output_dim,
                                config.hidden_size, config.num_layers,
                                config.bidirectional, config.dropout,
                                pretrained_embeddings)

    elif config.model_name == "LSTMATT":
        from LSTM_ATT import LSTMATT
        model = LSTMATT.LSTMATT(config.glove_word_dim, config.output_dim,
                                config.hidden_size, config.num_layers,
                                config.bidirectional, config.dropout,
                                pretrained_embeddings)
    elif config.model_name == 'TextRCNN':
        from TextRCNN import TextRCNN
        model = TextRCNN.TextRCNN(config.glove_word_dim, config.output_dim,
                                  config.hidden_size, config.num_layers,
                                  config.bidirectional, config.dropout,
                                  pretrained_embeddings)

    elif config.model_name == "TransformerText":
        from TransformerText import TransformerText
        model = TransformerText.TransformerText(
            config.head_num, config.encode_layer, config.glove_word_dim,
            config.d_model, config.d_ff, config.output_dim, config.dropout,
            pretrained_embeddings)

    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    if config.do_train:
        train(config.epoch_num, model, train_iterator, dev_iterator, optimizer,
              criterion, ['0', '1'], model_file, config.log_dir,
              config.print_step, 'word')

    model.load_state_dict(torch.load(model_file))

    test_loss, test_acc, test_report = evaluate(model, test_iterator,
                                                criterion, ['0', '1'], 'word')
    print("-------------- Test -------------")
    print("\t Loss: {} | Acc: {} | Macro avg F1: {} | Weighted avg F1: {}".
          format(test_loss, test_acc, test_report['macro avg']['f1-score'],
                 test_report['weighted avg']['f1-score']))
Example #3
def aSampleTest(choose_model):
    x, y, vocabulary, vocabulary_inv, labelToindex, sentenceToindex, labelNumdict = data_processing.load_input_data(
        MAXLENGTH)
    # pre-trained word2vec weights
    weight_array = pickle.load(
        open(os.path.join(DATA_PATH, 'weight_array'), 'rb'))

    test_sample_x = '价格公正,物流很快,但有些污垢!'
    test_sample_y = 1
    test_sample_seg = []

    # strip punctuation, digits, and Latin letters
    punctuation = re.compile(
        u"[-~!@#$%^&*()_+`=\[\]\\\{\}\"|;':,./<>?·!@#¥%……&*()——+【】、;‘:“”,。、《》?「『」』 ^┻]"
    )
    digit = re.compile(u"[0-9]")
    number = re.compile(u"[a-zA-Z]")

    test_sample_x = punctuation.sub("", test_sample_x)
    test_sample_x = digit.sub("", test_sample_x)
    test_sample_x = number.sub("", test_sample_x)

    for word in jieba.cut(test_sample_x):
        if (word not in data_processing.get_stop_words()
                and word in vocabulary):
            test_sample_seg.append(word)
    test_sample_seg_pad = data_processing.pad_sentences([test_sample_seg],
                                                        MAXLENGTH)
    test_x, test_y = data_processing.build_input_data(test_sample_seg_pad,
                                                      test_sample_y,
                                                      vocabulary)

    test_x = Variable(torch.LongTensor(test_x))
    test_y = Variable(torch.LongTensor(test_y))
    if use_cuda:
        test_x = test_x.cuda()
        test_y = test_y.cuda()

    # choose the model to test
    if choose_model == 'TextCNN':
        model = TextCNN(1, KERNEL_NUM, len(vocabulary), EMBEDDING_DIM,
                        len(labelToindex))
    elif choose_model == 'BiLSTM':
        model = BiLSTM(len(vocabulary), EMBEDDING_DIM, HIDDEN_SIZE,
                       len(labelToindex))
    elif choose_model == 'TextCNN_BN':
        model = TextCNN_BN(len(vocabulary), EMBEDDING_DIM, KERNEL_SIZES,
                           KERNEL_NUM, len(labelToindex), MAXLENGTH)
    elif choose_model == 'BiLSTM_b':
        model = BiLSTM_b(len(vocabulary), EMBEDDING_DIM, HIDDEN_SIZE,
                         len(labelToindex), MAXLENGTH)
    elif choose_model == 'CNN_BiLSTM_a':
        model = CNN_BiLSTM_a(len(vocabulary), EMBEDDING_DIM,
                             KERNEL_SIZES, KERNEL_NUM, HIDDEN_SIZE,
                             len(labelToindex), MAXLENGTH)
    elif choose_model == 'BiGRU':
        model = BiGRU(len(vocabulary), EMBEDDING_DIM, HIDDEN_SIZE,
                      len(labelToindex), MAXLENGTH)
    elif choose_model == 'CNN_with_pretrained_embedding':
        model = TextCNN_BN_with_pretrained_embed(len(vocabulary),
                                                 EMBEDDING_DIM,
                                                 KERNEL_SIZES, KERNEL_NUM,
                                                 len(labelToindex), MAXLENGTH,
                                                 weight_array)
    elif choose_model == 'TextRCNN':
        model = TextRCNN(len(vocabulary), EMBEDDING_DIM,
                         KERNEL_SIZES, HIDDEN_SIZE, KERNEL_NUM,
                         len(labelToindex), MAXLENGTH, weight_array)
    elif choose_model == 'TextCNN_multi_channel':
        model = TextCNN_multi_channel(len(vocabulary), EMBEDDING_DIM,
                                      KERNEL_SIZES, KERNEL_NUM,
                                      len(labelToindex), MAXLENGTH,
                                      weight_array)

    model.load_state_dict(
        torch.load(os.path.join(MODEL_PATH,
                                choose_model + '_201807102300.pkl')))  # the date suffix must match the saved checkpoint
    if use_cuda:
        model = model.cuda()

    model.eval()  # inference mode: disable dropout / batch-norm updates
    model_out = model(test_x)  # (1, 3)
    _, pre_y = torch.max(model_out, 1)
    print("Predicted label:", pre_y.item())
Example #4
def solver(mydata, config):
    #output dir
    timestamp = time.strftime('%Y-%m-%d-%Hh-%Mm-%Ss')
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
    #get RCNN
    rcnn = TextRCNN(sequence_length=config["sequence_length"],
                    num_classes=mydata.getClasses(),
                    vocab_size=mydata.vocabSize,
                    word_embedding_size=config["word_embedding_size"],
                    context_embedding_size=config["context_embedding_size"],
                    cell_type=config["cell_type"],
                    hidden_size=config["hidden_size"],
                    l2_reg_lambda=config["l2_reg_lambda"],
                    W_text_trainable=config["W_text_trainable"],
                    out_dir=out_dir)
    ## summary
    sess = rcnn.sess

    # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    #save vocab/config/category
    mydata.saveCategory2Index(os.path.join(out_dir, "category_index"))
    mydata.vocabproc.save(os.path.join(out_dir, "text.vocab"))
    Utils.showAndSaveConfig(config, os.path.join(out_dir, "config.txt"))

    print("[*]parameter number: %s" % (getParameterNumbers()))

    saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)

    # Initialize all variables
    sess.run(tf.global_variables_initializer())

    restore_from = config["restore_from"]
    if restore_from is not None:
        saver.restore(sess, restore_from)
        print("[*]restore success")
    # Pre-trained word2vec
    wordInit = {}
    if config["LoadGoogleModel"] and restore_from == None:
        print("[*]Loading Google Pre-trained Model")
        # initial matrix with random uniform
        initW = np.random.uniform(
            -0.25, 0.25, (mydata.vocabSize, config["word_embedding_size"]))
        # load any vectors from the word2vec
        word2vec = config["Word2Vec"]
        print("  [*]Load word2vec file {0}".format(word2vec))
        cnt_word_in_word2vec = 0
        with open(word2vec, "rb") as f:
            header = f.readline()
            vocab_size, layer1_size = map(int, header.split())
            print("  [*]Google:vocab_size:%s" % (vocab_size))
            binary_len = np.dtype('float32').itemsize * layer1_size
            for line in range(vocab_size):
                word = []
                while True:
                    ch = f.read(1).decode('latin-1')
                    if ch == ' ':
                        word = ''.join(word)
                        break
                    if ch != '\n':
                        word.append(ch)
                idx = mydata.vocabproc.vocabulary_.get(word.lower())
                if idx != 0:
                    if idx not in wordInit:
                        wordInit[idx] = word
                        initW[idx] = np.fromstring(f.read(binary_len),
                                                   dtype='float32')
                        cnt_word_in_word2vec += 1
                    elif word == word.lower():
                        # prefer the lowercase form when several casings share an index
                        wordInit[idx] = word
                        initW[idx] = np.fromstring(f.read(binary_len),
                                                   dtype='float32')
                    else:
                        # still consume this vector so the file position stays aligned
                        f.read(binary_len)
                else:
                    f.read(binary_len)
            print(
                "  [*]Load Google Model success: word in Word2Vec :%s total word:%s"
                % (cnt_word_in_word2vec, mydata.vocabSize))

        sess.run(rcnn.W_text.assign(initW))
        print("[*]Success to load pre-trained word2vec model!\n")

    # start training
    # step & learning-rate schedule
    stlr = STLR(1e-3, 1e-2, 200, 600)
    step = 0
    while True:
        batch = mydata.nextBatch(config["BatchSize"])
        learning_r = stlr.getLearningRate(step)
        feed_dict = {
            rcnn.input_text: batch[0],
            rcnn.input_y: batch[1],
            rcnn.dropout_keep_prob: config["droupout"],
            rcnn.learning_rate: learning_r
        }
        _, step, summaries, loss, accuracy = sess.run([
            rcnn.train_op, rcnn.global_step, rcnn.train_summary_op, rcnn.loss,
            rcnn.accuracy
        ], feed_dict)
        rcnn.summary_writer.add_summary(summaries, step)
        # Training log display
        if step % config["TraingLogEverySteps"] == 0:
            time_str = datetime.datetime.now().isoformat()
            print("  [*] step %s;  loss %s;  acc %s; lr %.6f " %
                  (step, loss, accuracy, learning_r))

        # Evaluation
        if step % config["TestEverySteps"] == 0:
            test_data = mydata.getTestData()
            test_size = len(test_data[0])
            correct_predict_count = 0
            dev_loss = 0
            for i in range(0, test_size, 500):
                x_test = test_data[0][i:i + 500]
                y_test = test_data[1][i:i + 500]
                feed_dict_dev = {
                    rcnn.input_text: x_test,
                    rcnn.input_y: y_test,
                    rcnn.dropout_keep_prob: 1.0
                }
                summaries_dev, loss, accuracy = sess.run(
                    [rcnn.dev_summary_op, rcnn.loss, rcnn.accuracy],
                    feed_dict_dev)
                #rcnn.summary_writer.add_summary(summaries_dev, step)
                #
                correct_predict_count += int(0.5 + accuracy * len(x_test))
                dev_loss += loss * len(x_test) / test_size
            #dev summary
            dev_accuracy = correct_predict_count / test_size

            rcnn.summary_writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag="dev_loss", simple_value=dev_loss)
                ]), step)
            rcnn.summary_writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag="dev_accu", simple_value=dev_accuracy)
                ]), step)
            time_str = datetime.datetime.now().isoformat()
            print("\n[*]Test:%s step %s, loss %.6f, acc %.6f " %
                  (time_str, step, dev_loss, dev_accuracy))

        # Model checkpoint
        if step % 1000 == 0:
            path = saver.save(sess, checkpoint_prefix, global_step=step)
            print("Saved model checkpoint to {}\n".format(path))
Example #5
def getBadCases(choose_model):
    badcases_contents = []
    badcases_scores = []
    badcases_true_labels = []
    badcases_pred_labels = []

    x, y, vocabulary, vocabulary_inv, labelToindex, _, labelNumdict = data_processing.load_input_data(
        MAXLENGTH)
    # pre-trained word2vec weights
    weight_array = pickle.load(
        open(os.path.join(DATA_PATH, 'weight_array'), 'rb'))

    # choose the model to test
    if choose_model == 'TextCNN':
        model = TextCNN(1, KERNEL_NUM, len(vocabulary), EMBEDDING_DIM,
                        len(labelToindex))
    elif choose_model == 'BiLSTM':
        model = BiLSTM(len(vocabulary), EMBEDDING_DIM, HIDDEN_SIZE,
                       len(labelToindex))
    elif choose_model == 'TextCNN_BN':
        model = TextCNN_BN(len(vocabulary),
                           EMBEDDING_DIM,
                           KERNEL_SIZES,
                           KERNEL_NUM,
                           len(labelToindex),
                           MAXLENGTH,
                           weight_array=None)
    elif choose_model == 'BiLSTM_b':
        model = BiLSTM_b(len(vocabulary), EMBEDDING_DIM, HIDDEN_SIZE,
                         len(labelToindex), MAXLENGTH)
    elif choose_model == 'CNN_BiLSTM_a':
        model = CNN_BiLSTM_a(len(vocabulary), EMBEDDING_DIM,
                             KERNEL_SIZES, KERNEL_NUM, HIDDEN_SIZE,
                             len(labelToindex), MAXLENGTH)
    elif choose_model == 'CNN_with_pretrained_embedding':
        model = TextCNN_BN_with_pretrained_embed(len(vocabulary),
                                                 EMBEDDING_DIM,
                                                 KERNEL_SIZES, KERNEL_NUM,
                                                 len(labelToindex), MAXLENGTH,
                                                 weight_array)
    elif choose_model == 'TextRCNN':
        model = TextRCNN(len(vocabulary), EMBEDDING_DIM,
                         KERNEL_SIZES, HIDDEN_SIZE, KERNEL_NUM,
                         len(labelToindex), MAXLENGTH, weight_array)
    elif choose_model == 'TextCNN_multi_channel':
        model = TextCNN_multi_channel(len(vocabulary), EMBEDDING_DIM,
                                      KERNEL_SIZES, KERNEL_NUM,
                                      len(labelToindex), MAXLENGTH,
                                      weight_array)

    model.load_state_dict(
        torch.load(os.path.join(MODEL_PATH,
                                choose_model + '_201807110957.pkl')))  # the date suffix must match the saved checkpoint
    if use_cuda:
        model = model.cuda()
    print("Model loaded!")

    # all labeled samples
    all_samples = pd.read_csv(os.path.join(DATA_PATH, 'all_labeled_datas.csv'))
    all_samples_contents = all_samples['content']
    all_samples_scores = all_samples['score']
    all_samples_labels = all_samples['label']

    all_samples_pro_contents = []
    all_samples_pro_scores = []
    all_samples_pro_labels = []

    # strip punctuation, digits, and Latin letters (patterns compiled once, outside the loop)
    punctuation = re.compile(
        u"[-~!@#$%^&*()_+`=\[\]\\\{\}\"|;':,./<>?·!@#¥%……&*()——+【】、;‘:“”,。、《》?「『」』 ]"
    )
    digit = re.compile(u"[0-9]")
    number = re.compile(u"[a-zA-Z]")

    for content, score, label in zip(all_samples_contents, all_samples_scores,
                                     all_samples_labels):

        content = punctuation.sub('', content)
        content = digit.sub("", content)
        content = number.sub("", content)
        if content != '':
            all_samples_pro_contents.append(content)
            all_samples_pro_scores.append(score)
            all_samples_pro_labels.append(label)

    all_pro_seg_contents = []
    all_pro_seg_scores = []
    all_pro_seg_labels = []
    sentenceToindex = {}
    for content, score, label in zip(all_samples_pro_contents,
                                     all_samples_pro_scores,
                                     all_samples_pro_labels):
        seg_content = jieba.cut(content)
        seg_con = []
        for word in seg_content:
            if (word not in data_processing.get_stop_words()
                    and word in vocabulary):
                seg_con.append(word)

        # deduplicate identical texts
        tmpSentence = ''.join(seg_con)
        if tmpSentence != '':
            if tmpSentence in sentenceToindex:
                continue
            else:
                sentenceToindex[tmpSentence] = len(sentenceToindex)

            all_pro_seg_contents.append(seg_con)
            all_pro_seg_scores.append(score)
            all_pro_seg_labels.append(label)

    model.eval()  # inference mode: disable dropout / batch-norm updates
    for i, ct in enumerate(all_pro_seg_contents):
        ct_pad = data_processing.pad_sentences([ct], MAXLENGTH)
        input_x, input_y = data_processing.build_input_data(
            ct_pad, all_pro_seg_labels[i], vocabulary)

        input_x = Variable(torch.LongTensor(input_x))
        input_y = Variable(torch.LongTensor(input_y))
        if use_cuda:
            input_x = input_x.cuda()
            input_y = input_y.cuda()

        model_out = model(input_x)
        _, pre_y = torch.max(model_out, 1)

        if pre_y.item() != input_y.item():
            badcases_contents.append(' '.join(all_pro_seg_contents[i]))
            badcases_scores.append(all_pro_seg_scores[i])
            badcases_true_labels.append(all_pro_seg_labels[i])
            badcases_pred_labels.append(pre_y.item())

    dataframe = pd.DataFrame({
        "content": badcases_contents,
        "user_score": badcases_scores,
        "true_label": badcases_true_labels,
        "pred_label": badcases_pred_labels
    })
    dataframe.to_csv(os.path.join(DATA_PATH, 'badcases.csv'),
                     index=False,
                     sep=',')
    print("Badcases done!")
Example #6
def train(config):
    print('parameters:')
    print(config)

    # load data
    print('load data')
    X, y = data_helper.process_data(config)  # X=[[seq1],[seq2]]   y=[,,,,]

    # make vocab
    print('make vocab...')
    word2index, label2index = data_helper.generate_vocab(X, y, config)

    # padding data
    print('padding data')
    input_x, input_y = data_helper.padding(X, y, config, word2index,
                                           label2index)

    # split data
    print('split data...')
    x_train, y_train, x_test, y_test, x_dev, y_dev = data_helper.split_data(
        input_x, input_y, config)

    print('length train: {}'.format(len(x_train)))
    print('length test: {}'.format(len(x_test)))
    print('length dev: {}'.format(len(x_dev)))

    print('training...')

    with tf.Graph().as_default():
        sess_config = tf.ConfigProto(
            allow_soft_placement=config['allow_soft_placement'],
            log_device_placement=config['log_device_placement'])
        sess = tf.Session(config=sess_config)
        rcnn = TextRCNN(config)

        # training procedure
        global_step = tf.Variable(0, name='global_step', trainable=False)
        train_op = tf.train.AdamOptimizer(config['learning_rate']).minimize(
            rcnn.loss, global_step=global_step)

        # output dir for models
        timestamp = str(int(time.time()))
        outdir = os.path.abspath(
            os.path.join(os.path.curdir, 'runs', timestamp))
        if not os.path.exists(os.path.join(os.path.curdir, 'runs')):
            os.mkdir(os.path.join(os.path.curdir, 'runs'))
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        print('writing to {}'.format(outdir))

        # checkpoint directory
        checkpoint_dir = os.path.abspath(os.path.join(outdir, 'checkpoints'))
        checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

        if not os.path.exists(checkpoint_dir):
            os.mkdir(checkpoint_dir)

        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=config['num_checkpoints'])

        sess.run(tf.global_variables_initializer())

        def train_step(x_batch, y_batch):
            feed_dict = {
                rcnn.input_x: x_batch,
                rcnn.input_y: y_batch,
                rcnn.dropout_keep_prob: config['dropout_keep_prob']
            }

            _, step, loss, accuracy = sess.run(
                [train_op, global_step, rcnn.loss, rcnn.accuracy],
                feed_dict=feed_dict)

            time_str = datetime.datetime.now().isoformat()
            print('{}: step {}, loss {}, acc {}'.format(
                time_str, step, loss, accuracy))

        def dev_step(x_batch, y_batch):
            feed_dict = {
                rcnn.input_x: x_batch,
                rcnn.input_y: y_batch,
                rcnn.dropout_keep_prob: 1.0
            }

            step, loss, accuracy = sess.run(
                [global_step, rcnn.loss, rcnn.accuracy], feed_dict=feed_dict)

            time_str = datetime.datetime.now().isoformat()
            print('{}: step {}, loss {}, acc {}'.format(
                time_str, step, loss, accuracy))

        # generate batches
        batches = data_helper.generate_batchs(x_train, y_train, config)
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            print(y_batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % config['evaluate_every'] == 0:
                print('Evaluation:')
                dev_step(x_dev, y_dev)

            if current_step % config['checkpoint_every'] == 0:
                path = saver.save(sess,
                                  checkpoint_prefix,
                                  global_step=current_step)
                print('save model checkpoint to {}'.format(path))

        # test accuracy
        test_accuracy = sess.run(
            rcnn.accuracy,
            feed_dict={
                rcnn.input_x: x_test,
                rcnn.input_y: y_test,
                rcnn.dropout_keep_prob: 1.0
            })
        print('Test dataset accuracy: {}'.format(test_accuracy))
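For completeness, a hypothetical config covering only the keys train(config) reads directly; the data_helper calls almost certainly require additional keys (vocabulary size, padding length, batch size, split ratios, ...) that this snippet does not show, and all values are assumptions.

# Hypothetical config for train(config); only the keys read directly in the snippet are listed.
config = {
    'allow_soft_placement': True,
    'log_device_placement': False,
    'learning_rate': 1e-3,
    'num_checkpoints': 5,
    'dropout_keep_prob': 0.5,
    'evaluate_every': 100,
    'checkpoint_every': 1000,
    # ...plus whatever data_helper.process_data / generate_vocab / padding / split_data expect
}
# train(config)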
Example #7
def train_and_test(choose_model):
    # write progress to a log file
    logger = logging.getLogger(__name__)
    logger.setLevel(level=logging.DEBUG)

    runTime = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
    handler = logging.FileHandler('./logs/'+choose_model+'_'+runTime+'.log.txt')
    handler.setLevel(level=logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.info("************************************************************")

    x, y, vocabulary, vocabulary_inv, labelToindex, sentenceToindex, labelNumdict = data_processing.load_input_data(MAXLENGTH)
    logger.info("The number of samples is: {}".format(len(sentenceToindex)))
    logger.info("The distribution of the all dataset label(With: 0-bad, 1-mid, 2-good):{}".format(labelNumdict))
    # pre-trained word2vec weights
    weight_array = pickle.load(open(os.path.join(DATA_PATH, 'weight_array'), 'rb'))

    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=42)
    print("Train Sample's distribution: {}".format(data_processing.get_labelNumdict(train_y)))
    print("Test Sample's distribution: {}".format(data_processing.get_labelNumdict(test_y)))
    logger.info("Train Sample's distribution: {}".format(data_processing.get_labelNumdict(train_y)))
    logger.info("Test Sample's distribution: {}".format(data_processing.get_labelNumdict(test_y)))
    logger.info("Some hyperparameters with lr:{}, wd:{}, embed:{}".format(LEARNING_RATE, WEIGHT_DECAY, EMBEDDING_DIM))

    train_x = torch.LongTensor(train_x)
    test_x = torch.LongTensor(test_x)
    train_y = torch.LongTensor(train_y)
    test_y = torch.LongTensor(test_y)

    trainDataset = data_processing.JDataset(train_x, train_y)
    testDataset = data_processing.JDataset(test_x, test_y)
    trainDataLoader = DataLoader(trainDataset, batch_size=BATCH_SIZE, shuffle=True)
    testDataLoader = DataLoader(testDataset, batch_size=BATCH_SIZE, shuffle=False)

    # choose the model to train
    if choose_model == 'TextCNN':
        model = TextCNN(1, KERNEL_NUM, len(vocabulary), EMBEDDING_DIM, len(labelToindex))
    elif choose_model == 'BiLSTM':
        model = BiLSTM(len(vocabulary), EMBEDDING_DIM, HIDDEN_SIZE, len(labelToindex))
    elif choose_model == 'TextCNN_BN':
        model = TextCNN_BN(len(vocabulary), EMBEDDING_DIM, KERNEL_SIZES, KERNEL_NUM, len(labelToindex), MAXLENGTH, weight_array=None)
    elif choose_model == 'BiLSTM_b':
        model = BiLSTM_b(len(vocabulary), EMBEDDING_DIM, HIDDEN_SIZE, len(labelToindex), MAXLENGTH, weight_array=weight_array)
    elif choose_model == 'CNN_BiLSTM_a':
        model = CNN_BiLSTM_a(len(vocabulary), EMBEDDING_DIM, KERNEL_SIZES, KERNEL_NUM, HIDDEN_SIZE, len(labelToindex), MAXLENGTH, weight_array=weight_array)
    elif choose_model == 'BiGRU':
        model = BiGRU(len(vocabulary), EMBEDDING_DIM, HIDDEN_SIZE, len(labelToindex), MAXLENGTH)
    elif choose_model == 'CNN_with_pretrained_embedding':
        model = TextCNN_BN_with_pretrained_embed(len(vocabulary), EMBEDDING_DIM, KERNEL_SIZES, KERNEL_NUM, len(labelToindex), MAXLENGTH, weight_array)
    elif choose_model == 'TextRCNN':
        model = TextRCNN(len(vocabulary), EMBEDDING_DIM, KERNEL_SIZES, HIDDEN_SIZE, KERNEL_NUM, len(labelToindex), MAXLENGTH, weight_array)
    elif choose_model == 'TextCNN_multi_channel':
        model = TextCNN_multi_channel(len(vocabulary), EMBEDDING_DIM, KERNEL_SIZES, KERNEL_NUM, len(labelToindex), MAXLENGTH, weight_array)
    elif choose_model == 'Attention_rnn':
        model = Attention_RNN_model(len(vocabulary), EMBEDDING_DIM, HIDDEN_SIZE, len(labelToindex), weight_array)
    # print model info
    print(model)
    logger.info(model)
    if use_cuda:
        model = model.cuda()

    # loss function and optimizer
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(parameters, lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    best_acc = 0
    best_model = None
    for step in range(N_STEPS):
        model.train()
        train_loss = 0.0
        train_acc = 0
        for i, data in enumerate(trainDataLoader):
            tr_x, tr_y = data
            #print("Tr_X's size is: ", tr_x.size())
            #print("Tr_Y size: ", tr_y.size())
            if use_cuda:
                tr_x = Variable(tr_x).cuda()
                tr_y = Variable(tr_y).cuda()
            else:
                tr_x = Variable(tr_x)
                tr_y = Variable(tr_y)

            # forward
            out = model(tr_x)
            loss = criterion(out, tr_y)
            train_loss += loss.item() * len(tr_y)
            _, pre = torch.max(out, 1)
            #print("***", pre.size())
            #print(pre)
            num_acc = (pre==tr_y).sum()
            train_acc += num_acc.item()
            #print(train_acc)

            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i+1) % 100 == 0:
                print('[{}/{}], train loss is: {:.6f}, train acc is: {:.6f}'.format(i+1, len(trainDataLoader),
                                                                                    train_loss/((i+1)*BATCH_SIZE),
                                                                                    train_acc/((i+1)*BATCH_SIZE)))

                logger.info('[{}/{}], train loss is: {:.6f}, train acc is: {:.6f}'.format(i+1, len(trainDataLoader),
                                                                                          train_loss/((i+1)*BATCH_SIZE),
                                                                                          train_acc/((i+1)*BATCH_SIZE)))

        print('Step:[{}], train loss is: {:.6f}, train acc is: {:.6f}'.format(step,
                                                                              train_loss/(len(trainDataLoader)*BATCH_SIZE),
                                                                              train_acc/(len(trainDataLoader)*BATCH_SIZE)))

        logger.info('Step:[{}], train loss is: {:.6f}, train acc is: {:.6f}'.format(step,
                                                                                    train_loss / (len(trainDataLoader) * BATCH_SIZE),
                                                                                    train_acc / (len(trainDataLoader) * BATCH_SIZE)))

        model.eval()
        eval_loss = 0
        eval_acc = 0
        for i, data in enumerate(testDataLoader):
            te_x, te_y = data
            if use_cuda:
                te_x = Variable(te_x).cuda()
                te_y = Variable(te_y).cuda()
            else:
                te_x = Variable(te_x)
                te_y = Variable(te_y)
            out = model(te_x)
            loss = criterion(out, te_y)
            eval_loss += loss.item() * len(te_y)
            _, pre = torch.max(out, 1)
            num_acc=(pre==te_y).sum()
            eval_acc += num_acc.item()
        print('test loss is: {:.6f}, test acc is: {:.6f}'.format(eval_loss/(len(testDataLoader)*BATCH_SIZE),
                                                                 eval_acc/(len(testDataLoader)*BATCH_SIZE)))

        logger.info('test loss is: {:.6f}, test acc is: {:.6f}'.format(eval_loss / (len(testDataLoader) * BATCH_SIZE),
                                                                       eval_acc / (len(testDataLoader) * BATCH_SIZE)))

        if best_acc < (eval_acc/(len(testDataLoader)*BATCH_SIZE)):
            best_acc = eval_acc/(len(testDataLoader)*BATCH_SIZE)
            best_model = model.state_dict()
            print('best acc is {:.6f}, best model is changed.'.format(best_acc))

            logger.info('best acc is {:.6f}, best model is changed.'.format(best_acc))

    logger.info("Best acc is: {}".format(best_acc))
    logger.info("************************************************************")

    # save the best weights tracked during training (fall back to the final state if none were recorded)
    torch.save(best_model if best_model is not None else model.state_dict(),
               os.path.join(MODEL_PATH, choose_model+'_'+runTime+'.pkl'))
Example #8
random.seed(2020)
np.random.seed(2020)
torch.manual_seed(2020)
torch.cuda.manual_seed(2020)

device = torch.device("cuda:0")

training_set = MyDataset(url_train_data)

train_loader = data.DataLoader(dataset=training_set,
                               batch_size=128, shuffle=True)

valid_data = MyDataset(url_valid_data)
valid_loader = DataLoader(dataset=valid_data, batch_size=128)

model = TextRCNN()

if torch.cuda.is_available():
    model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adagrad(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

# Train the model
for epoch in range(20):
    loss_mean = 0
    correct = 0
    total = 0

    total0 = 0