Example 1
def main():

    ###########################
    #### create dictionary ####
    ###########################

    if os.path.exists('./data/corpus/dictionary.dict'):
        corpus = JaConvCorpus(file_path=None,
                              batch_size=batchsize,
                              size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        corpus = JaConvCorpus(file_path=data_file,
                              batch_size=batchsize,
                              size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))

    ##########################
    #### create ID corpus ####
    ##########################

    input_mat = []
    output_mat = []
    batch_num = text_num = max_input_ren = max_output_ren = 0

    if not os.path.exists('./data/corpus/input_mat0.npy'):
        for input_text, output_text in zip(corpus.rough_posts,
                                           corpus.rough_cmnts):

            # convert to list
            input_text.reverse()  # encode words in a reverse order
            input_text.insert(0, corpus.dic.token2id["<eos>"])
            output_text.append(corpus.dic.token2id["<eos>"])

            # update max sentence length
            max_input_ren = max(max_input_ren, len(input_text))
            max_output_ren = max(max_output_ren, len(output_text))

            input_mat.append(input_text)
            output_mat.append(output_text)
            batch_num += 1

            if batch_num % 10000 == 0:
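                # every 10,000 sentence pairs: pad the chunk to the current max lengths,
                # transpose it, save it as a .npy file, and clear the buffers to free memory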
                # padding
                for li in input_mat:
                    insert_num = max_input_ren - len(li)
                    for _ in range(insert_num):
                        li.insert(0, corpus.dic.token2id['<pad>'])
                for li in output_mat:
                    insert_num = max_output_ren - len(li)
                    for _ in range(insert_num):
                        li.append(corpus.dic.token2id['<pad>'])

                # create batch matrix
                input_mat = np.array(input_mat, dtype=np.int32).T
                output_mat = np.array(output_mat, dtype=np.int32).T

                # save matrix and free memory
                print('save data ... number', text_num)
                np.save('./data/corpus/input_mat' + str(text_num) + '.npy',
                        input_mat)
                np.save('./data/corpus/output_mat' + str(text_num) + '.npy',
                        output_mat)
                text_num += 1
                del input_mat
                del output_mat
                gc.collect()
                input_mat = []
                output_mat = []

    else:
        print(
            'You already have matrix files! '
            'To rebuild the corpus, remove the old files in the "data/corpus" directory and rerun this script.'
        )
Example 2
def main():

    ###########################
    #### create dictionary ####
    ###########################

    if os.path.exists('./data/corpus/dictionary.dict'):
        if args.lang == 'ja':
            corpus = JaConvCorpus(file_path=None,
                                  batch_size=batchsize,
                                  size_filter=True)
        else:
            corpus = ConvCorpus(file_path=None,
                                batch_size=batchsize,
                                size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        if args.lang == 'ja':
            corpus = JaConvCorpus(file_path=data_file,
                                  batch_size=batchsize,
                                  size_filter=True)
        else:
            corpus = ConvCorpus(file_path=data_file,
                                batch_size=batchsize,
                                size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('Emotion size: ', len(corpus.emotion_set))

    # search for word_threshold (the ID boundary between general and emotional words)
    ma = 0
    mi = 999999
    for word in corpus.emotion_set:
        wid = corpus.dic.token2id[word]
        if wid > ma:
            ma = wid
        if wid < mi:
            mi = wid
    # print(corpus.dic.token2id['<start>'], corpus.dic.token2id['<eos>'], corpus.dic.token2id['happy'], mi, ma)
    word_threshold = mi

    ######################
    #### create model ####
    ######################

    model = Seq2Seq(all_vocab_size=len(corpus.dic.token2id),
                    emotion_vocab_size=len(corpus.emotion_set),
                    feature_num=feature_num,
                    hidden_num=hidden_num,
                    batch_size=batchsize,
                    label_num=label_num,
                    label_embed_num=label_embed,
                    gpu_flg=args.gpu)

    if args.gpu >= 0:
        model.to_gpu()
    optimizer = optimizers.Adam(alpha=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    ##########################
    #### create ID corpus ####
    ##########################

    input_mat = []
    output_mat = []
    input_mat_rev = []
    label_mat = []
    max_input_ren = max_output_ren = 0
    print('start making corpus matrix...')
    for input_text, output_text in zip(corpus.rough_posts, corpus.rough_cmnts):

        # add an eos tag to the end of each output (the reversed inputs are built separately below)
        output_text.append(corpus.dic.token2id["<eos>"])  # insert <eos> at the end of the output

        # update max sentence length
        max_input_ren = max(max_input_ren, len(input_text))
        max_output_ren = max(max_output_ren, len(output_text))

        # make a list of lists
        input_mat.append(input_text)
        output_mat.append(output_text)

        # make label lists TODO: 3-class classification
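        # labels: 1 = neutral (no sentiment words), 2 = positive (pos count >= neg count), 0 = negative (neg count > pos count)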
        n_num = p_num = 0
        for word in output_text:
            if corpus.dic[word] in corpus.neg_words:
                n_num += 1
            if corpus.dic[word] in corpus.pos_words:
                p_num += 1
        if (n_num + p_num) == 0:
            label_mat.append([1 for _ in range(len(output_text))])
        elif n_num <= p_num:
            label_mat.append([2 for _ in range(len(output_text))])
        elif n_num > p_num:
            label_mat.append([0 for _ in range(len(output_text))])
        else:
            raise ValueError

    # make reverse corpus
    for input_text in input_mat:
        input_mat_rev.append(input_text[::-1])

    # padding (pad the tail of inputs, outputs, and labels; pad the head of the reversed inputs)
    print('start padding...')
    for li in input_mat:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    for li in output_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    for li in input_mat_rev:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.insert(0, corpus.dic.token2id['<pad>'])
    for li in label_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    if len(output_mat) != len(label_mat):
        print('Output matrix and label matrix should have the same dimension.')
        raise ValueError

    # create batch matrix
    print('transpose...')
    input_mat = np.array(input_mat, dtype=np.int32).T
    input_mat_rev = np.array(input_mat_rev, dtype=np.int32).T
    output_mat = np.array(output_mat, dtype=np.int32).T
    label_mat = np.array(label_mat, dtype=np.int32).T

    # separate corpus into Train and Test TODO: split into training and test sets when running experiments
    print('split train and test...')
    train_input_mat = input_mat
    train_output_mat = output_mat
    train_input_mat_rev = input_mat_rev
    train_label_mat = label_mat

    #############################
    #### train seq2seq model ####
    #############################

    accum_loss = 0
    train_loss_data = []
    print('start training...')
    for num, epoch in enumerate(range(n_epoch)):
        total_loss = 0
        batch_num = 0
        perm = np.random.permutation(len(corpus.rough_posts))

        # for training
        for i in range(0, len(corpus.rough_posts), batchsize):

            # select batch data
            input_batch = remove_extra_padding(
                train_input_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            input_batch_rev = remove_extra_padding(
                train_input_mat_rev[:, perm[i:i + batchsize]],
                reverse_flg=True)
            output_batch = remove_extra_padding(
                train_output_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            label_batch = remove_extra_padding(
                train_label_mat[:, perm[i:i + batchsize]], reverse_flg=False)

            # Encode a sentence
            model.initialize(
                batch_size=input_batch.shape[1])  # initialize cell
            model.encode(input_batch, input_batch_rev,
                         train=True)  # encode (output: hidden Variable)

            # Decode from encoded context
            input_ids = xp.array(
                [corpus.dic.token2id["<start>"] for _ in range(batchsize)])
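            # teacher forcing: at each step the ground-truth word IDs (w_ids) become the next decoder input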
            for w_ids, l_ids in zip(output_batch, label_batch):
                loss, predict_mat = model.decode(input_ids,
                                                 w_ids,
                                                 label_id=l_ids,
                                                 word_th=word_threshold,
                                                 train=True)
                input_ids = w_ids
                accum_loss += loss

            # learn model
            model.cleargrads()  # initialize all grad to zero
            accum_loss.backward()  # back propagation
            optimizer.update()
            total_loss += float(accum_loss.data)
            batch_num += 1
            print('Epoch: ', num, 'Batch_num', batch_num,
                  'batch loss: {:.2f}'.format(float(accum_loss.data)))
            accum_loss = 0

        train_loss_data.append(float(total_loss / batch_num))

        # save model and optimizer
        print('-----', epoch + 1, ' epochs -----')
        print('save the model and optimizer')
        serializers.save_hdf5('data/' + str(epoch) + '.model', model)
        serializers.save_hdf5('data/' + str(epoch) + '.state', optimizer)

    # save loss data
    with open('./data/loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)
Example 3
def main():

    ###########################
    #### create dictionary ####
    ###########################

    if os.path.exists('./data/corpus/dictionary.dict'):
        corpus = JaConvCorpus(file_path=None,
                              batch_size=batchsize,
                              size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        corpus = JaConvCorpus(file_path=data_file,
                              batch_size=batchsize,
                              size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))

    ##################################
    #### create model (copy data) ####
    ##################################
    rough_model = './data/199_rough.model'
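    # load the weights of a previously trained rough model as the starting point for fine-tuning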
    model = Seq2Seq(len(corpus.dic.token2id),
                    feature_num=feature_num,
                    hidden_num=hidden_num,
                    batch_size=batchsize,
                    gpu_flg=args.gpu)
    serializers.load_hdf5(rough_model, model)
    if args.gpu >= 0:
        model.to_gpu()
    optimizer = optimizers.Adam(alpha=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(5))
    # optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    ##########################
    #### create ID corpus ####
    ##########################

    input_mat = []
    output_mat = []
    max_input_ren = max_output_ren = 0

    for input_text, output_text in zip(corpus.fine_posts, corpus.fine_cmnts):

        # convert to list
        input_text.reverse()  # encode words in a reverse order
        input_text.insert(0, corpus.dic.token2id["<eos>"])
        output_text.append(corpus.dic.token2id["<eos>"])

        # update max sentence length
        max_input_ren = max(max_input_ren, len(input_text))
        max_output_ren = max(max_output_ren, len(output_text))

        input_mat.append(input_text)
        output_mat.append(output_text)

    # padding
    for li in input_mat:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.insert(0, corpus.dic.token2id['<pad>'])
    for li in output_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])

    # create batch matrix
    input_mat = np.array(input_mat, dtype=np.int32).T
    output_mat = np.array(output_mat, dtype=np.int32).T

    # separate corpus into Train and Test
    perm = np.random.permutation(len(corpus.fine_posts))
    test_input_mat = input_mat[:, perm[0:0 + testsize]]
    test_output_mat = output_mat[:, perm[0:0 + testsize]]
    train_input_mat = input_mat[:, perm[testsize:]]
    train_output_mat = output_mat[:, perm[testsize:]]

    list_of_references = []
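    # build the reference lists (one single-reference list of word IDs per test response) for BLEU and WER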
    for text_ndarray in test_output_mat.T:
        reference = text_ndarray.tolist()
        references = [[w_id for w_id in reference if w_id != -1]]
        list_of_references.append(references)

    #############################
    #### train seq2seq model ####
    #############################

    accum_loss = 0
    train_loss_data = []
    test_loss_data = []
    bleu_score_data = []
    wer_score_data = []
    for num, epoch in enumerate(range(n_epoch)):
        total_loss = test_loss = 0
        batch_num = 0
        perm = np.random.permutation(len(corpus.fine_posts) - testsize)

        # for training
        for i in range(0, len(corpus.fine_posts) - testsize, batchsize):

            # select batch data
            input_batch = train_input_mat[:, perm[i:i + batchsize]]
            output_batch = train_output_mat[:, perm[i:i + batchsize]]

            # Encode a sentence
            model.initialize()  # initialize cell
            model.encode(input_batch,
                         train=True)  # encode (output: hidden Variable)

            # Decode from encoded context
            end_batch = xp.array(
                [corpus.dic.token2id["<start>"] for _ in range(batchsize)])
            first_words = output_batch[0]
            loss, predict_mat = model.decode(end_batch,
                                             first_words,
                                             train=True)
            next_ids = first_words
            accum_loss += loss
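            # teacher forcing: the ground-truth IDs of each step are fed back as the next decoder input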
            for w_ids in output_batch[1:]:
                loss, predict_mat = model.decode(next_ids, w_ids, train=True)
                next_ids = w_ids
                accum_loss += loss

            # learn model
            model.cleargrads()  # initialize all grad to zero
            accum_loss.backward()  # back propagation
            optimizer.update()
            total_loss += float(accum_loss.data)
            batch_num += 1
            print('Epoch: ', num, 'Batch_num', batch_num,
                  'batch loss: {:.2f}'.format(float(accum_loss.data)))
            accum_loss = 0

        # for testing
        list_of_hypotheses = []
        for i in range(0, testsize, batchsize):

            # select test batch data
            input_batch = test_input_mat[:, i:i + batchsize]
            output_batch = test_output_mat[:, i:i + batchsize]

            # Encode a sentence
            model.initialize()  # initialize cell
            model.encode(input_batch,
                         train=True)  # encode (output: hidden Variable)

            # Decode from encoded context
            end_batch = xp.array(
                [corpus.dic.token2id["<start>"] for _ in range(batchsize)])
            first_words = output_batch[0]
            loss, predict_mat = model.decode(end_batch,
                                             first_words,
                                             train=True)
            next_ids = xp.argmax(predict_mat.data, axis=1)
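            # greedy decoding at test time: the model's own argmax prediction is fed back as the next input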
            test_loss += loss.data
            if args.gpu >= 0:
                hypotheses = [cuda.to_cpu(next_ids)]
            else:
                hypotheses = [next_ids]
            for w_ids in output_batch[1:]:
                loss, predict_mat = model.decode(next_ids, w_ids, train=True)
                next_ids = xp.argmax(predict_mat.data, axis=1)
                test_loss += loss.data
                if args.gpu >= 0:
                    hypotheses.append(cuda.to_cpu(next_ids))
                else:
                    hypotheses.append(next_ids)

            # collect hypotheses for calculating BLEU score
            hypotheses = np.array(hypotheses).T
            for hypothesis in hypotheses:
                text_list = hypothesis.tolist()
                list_of_hypotheses.append(
                    [w_id for w_id in text_list if w_id != -1])

        # calculate BLEU score from test (develop) data
        bleu_score = nltk.translate.bleu_score.corpus_bleu(
            list_of_references, list_of_hypotheses,
            weights=(0.25, 0.25, 0.25, 0.25))
        bleu_score_data.append(bleu_score)
        print('Epoch: ', num, 'BLEU SCORE: ', bleu_score)

        # calculate WER score from test (develop) data
        wer_score = 0
        for index, references in enumerate(list_of_references):
            wer_score += wer(references[0], list_of_hypotheses[index])
        wer_score /= len(list_of_references)
        wer_score_data.append(wer_score)
        print('Epoch: ', num, 'WER SCORE: ', wer_score)

        # save model and optimizer
        if (epoch + 1) % 10 == 0:
            print('-----', epoch + 1, ' epochs -----')
            print('save the model and optimizer')
            serializers.save_hdf5('data/' + str(epoch) + '_fine.model', model)
            serializers.save_hdf5('data/' + str(epoch) + '_fine.state',
                                  optimizer)

        # display the on-going status
        print('Epoch: ', num, 'Train loss: {:.2f}'.format(total_loss),
              'Test loss: {:.2f}'.format(float(test_loss)))
        train_loss_data.append(float(total_loss / batch_num))
        test_loss_data.append(float(test_loss))

        # evaluate a test loss
        check_loss = test_loss_data[-10:]  # check out the last 10 loss data
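        # early stopping: break if the test loss increased at every step over the last 10 epochs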
        end_flg = [
            j for j in range(len(check_loss) - 1)
            if check_loss[j] < check_loss[j + 1]
        ]
        if len(end_flg) >= 9:
            print('The model is probably over-fitting, so training is stopped early...')
            break

    # save loss data
    with open('./data/fine_loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)
    with open('./data/fine_loss_test_data.pkl', 'wb') as f:
        pickle.dump(test_loss_data, f)
    with open('./data/fine_bleu_score_data.pkl', 'wb') as f:
        pickle.dump(bleu_score_data, f)
    with open('./data/fine_wer_score_data.pkl', 'wb') as f:
        pickle.dump(wer_score_data, f)
Example 4
def main():

    ###########################
    #### create dictionary ####
    ###########################

    if os.path.exists('./data/corpus/dictionary.dict'):
        corpus = JaConvCorpus(file_path=None,
                              batch_size=batchsize,
                              size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        corpus = JaConvCorpus(file_path=data_file,
                              batch_size=batchsize,
                              size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))

    ######################
    #### create model ####
    ######################

    model = Seq2Seq(len(corpus.dic.token2id),
                    feature_num=feature_num,
                    hidden_num=hidden_num,
                    batch_size=batchsize,
                    gpu_flg=args.gpu)
    if args.gpu >= 0:
        model.to_gpu()
    optimizer = optimizers.Adam(alpha=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(5))
    # optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    ##########################
    #### create ID corpus ####
    ##########################

    test_input_mat = []
    test_output_mat = []
    train_input_mats = []
    train_output_mats = []

    if not os.path.exists('./data/corpus/input_mat0.npy'):
        print(
            "You don't have any input matrices. Run 'preprocess.py' before running this script."
        )
        raise ValueError
    else:
        for index, text_name in enumerate(glob.glob('data/corpus/input_mat*')):
            batch_input_mat = np.load(text_name)
            if index == 0:
                # separate corpus into Train and Test
                perm = np.random.permutation(batch_input_mat.shape[1])
                test_input_mat = batch_input_mat[:, perm[0:0 + testsize]]
                train_input_mats.append(batch_input_mat[:, perm[testsize:]])
            else:
                train_input_mats.append(batch_input_mat)
        for index, text_name in enumerate(
                glob.glob('data/corpus/output_mat*')):
            batch_output_mat = np.load(text_name)
            if index == 0:
                # separate corpus into Train and Test
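                # reuse the permutation from the input-matrix loop so the train/test split stays aligned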
                test_output_mat = batch_output_mat[:, perm[0:0 + testsize]]
                train_output_mats.append(batch_output_mat[:, perm[testsize:]])
            else:
                train_output_mats.append(batch_output_mat)

    list_of_references = []
    for text_ndarray in test_output_mat.T:
        reference = text_ndarray.tolist()
        references = [[w_id for w_id in reference if w_id != -1]]
        list_of_references.append(references)

    #############################
    #### train seq2seq model ####
    #############################

    matrix_row_size = train_input_mats[0].shape[1] - testsize
    accum_loss = 0
    train_loss_data = []
    test_loss_data = []
    bleu_score_data = []
    wer_score_data = []
    for num, epoch in enumerate(range(n_epoch)):
        total_loss = test_loss = batch_num = 0

        # for training by each corpus matrix
        for mat_index in range(len(train_input_mats)):
            perm = np.random.permutation(matrix_row_size)

            # by each batch size
            for i in range(0, matrix_row_size, batchsize):

                # select batch data
                input_batch = train_input_mats[mat_index][:, perm[i:i + batchsize]]
                output_batch = train_output_mats[mat_index][:, perm[i:i + batchsize]]

                # Encode a sentence
                model.initialize()  # initialize cell
                model.encode(input_batch,
                             train=True)  # encode (output: hidden Variable)

                # Decode from encoded context
                end_batch = xp.array(
                    [corpus.dic.token2id["<start>"] for _ in range(batchsize)])
                first_words = output_batch[0]
                loss, predict_mat = model.decode(end_batch,
                                                 first_words,
                                                 train=True)
                next_ids = first_words
                accum_loss += loss
                for w_ids in output_batch[1:]:
                    loss, predict_mat = model.decode(next_ids,
                                                     w_ids,
                                                     train=True)
                    next_ids = w_ids
                    accum_loss += loss

                # learn model
                model.cleargrads()
                accum_loss.backward()
                #accum_loss.unchain_backward()
                optimizer.update()
                total_loss += float(accum_loss.data)
                print('Epoch: ', num, 'Matrix_num: ', mat_index, 'Batch_num',
                      batch_num,
                      'batch loss: {:.2f}'.format(float(accum_loss.data)))
                batch_num += 1
                accum_loss = 0

        # # for testing by 1epoch
        # list_of_hypotheses = []
        # for i in range(0, testsize, batchsize):
        #
        #     # select test batch data
        #     input_batch = test_input_mat[:, i:i + batchsize]
        #     output_batch = test_output_mat[:, i:i + batchsize]
        #
        #     # Encode a sentence
        #     model.initialize()                     # initialize cell
        #     model.encode(input_batch, train=True)  # encode (output: hidden Variable)
        #
        #     # Decode from encoded context
        #     end_batch = xp.array([corpus.dic.token2id["<start>"] for _ in range(batchsize)])
        #     first_words = output_batch[0]
        #     loss, predict_mat = model.decode(end_batch, first_words, train=True)
        #     next_ids = xp.argmax(predict_mat.data, axis=1)
        #     test_loss += loss
        #     if args.gpu >= 0:
        #         hypotheses = [cuda.to_cpu(next_ids)]
        #     else:
        #         hypotheses = [next_ids]
        #     for w_ids in output_batch[1:]:
        #         loss, predict_mat = model.decode(next_ids, w_ids, train=True)
        #         next_ids = xp.argmax(predict_mat.data, axis=1)
        #         test_loss += loss
        #         if args.gpu >= 0:
        #             hypotheses.append(cuda.to_cpu(next_ids))
        #         else:
        #             hypotheses.append(next_ids)
        #
        #     # collect hypotheses for calculating BLEU score
        #     hypotheses = np.array(hypotheses).T
        #     for hypothesis in hypotheses:
        #         text_list = hypothesis.tolist()
        #         list_of_hypotheses.append([w_id for w_id in text_list if w_id is not -1])
        #
        # # calculate BLEU score from test (develop) data
        # bleu_score = nltk.translate.bleu_score.corpus_bleu(list_of_references, list_of_hypotheses,
        #                                                    weights=(0.25, 0.25, 0.25, 0.25))
        # bleu_score_data.append(bleu_score)
        # print('Epoch: ', num, 'BLEU SCORE: ', bleu_score)
        #
        # # calculate WER score from test (develop) data
        # wer_score = 0
        # for index, references in enumerate(list_of_references):
        #     wer_score += wer(references[0], list_of_hypotheses[index])
        # wer_score /= len(list_of_references)
        # wer_score_data.append(wer_score)
        # print('Epoch: ', num, 'WER SCORE: ', wer_score)
        #
        # # evaluate a test loss
        # check_loss = test_loss_data[-10:]           # check out the last 10 loss data
        # end_flg = [j for j in range(len(check_loss) - 1) if check_loss[j] < check_loss[j + 1]]
        # if len(end_flg) > 9:
        #     print('Probably it is over-fitting. So stop to learn...')
        #     break

        # save model and optimizer
        if (epoch + 1) % 10 == 0:
            print('-----', epoch + 1, ' epochs -----')
            print('save the model and optimizer')
            serializers.save_hdf5('data/' + str(epoch) + '_rough.model', model)
            serializers.save_hdf5('data/' + str(epoch) + '_rough.state',
                                  optimizer)

        # display the on-going status
        print('Epoch: ', num, 'Train loss: {:.2f}'.format(total_loss),
              'Test loss: {:.2f}'.format(float(test_loss)))
        train_loss_data.append(float(total_loss / batch_num))
        test_loss_data.append(float(test_loss))

    # save loss data
    with open('./data/rough_loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)
    with open('./data/rough_loss_test_data.pkl', 'wb') as f:
        pickle.dump(test_loss_data, f)
    with open('./data/rough_bleu_score_data.pkl', 'wb') as f:
        pickle.dump(bleu_score_data, f)
    with open('./data/rough_wer_score_data.pkl', 'wb') as f:
        pickle.dump(wer_score_data, f)
Example 5
def main():

    ###########################
    #### create dictionary ####
    ###########################

    if os.path.exists('./data/corpus/dictionary.dict'):
        corpus = JaConvCorpus(file_path=None,
                              batch_size=batchsize,
                              size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        corpus = JaConvCorpus(file_path=data_file,
                              batch_size=batchsize,
                              size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))

    ##################################
    #### create model (copy data) ####
    ##################################
    rough_model = './data/199_rough.model'
    model = Seq2Seq(len(corpus.dic.token2id),
                    feature_num=feature_num,
                    hidden_num=hidden_num,
                    batch_size=batchsize,
                    gpu_flg=args.gpu)
    serializers.load_hdf5(rough_model, model)
    if args.gpu >= 0:
        model.to_gpu()

    ##########################
    #### create ID corpus ####
    ##########################

    input_mat = []
    output_mat = []
    max_input_ren = max_output_ren = 0

    for input_text, output_text in zip(corpus.fine_posts, corpus.fine_cmnts):

        # convert to list
        input_text.reverse()  # encode words in a reverse order
        input_text.insert(0, corpus.dic.token2id["<eos>"])
        output_text.append(corpus.dic.token2id["<eos>"])

        # update max sentence length
        max_input_ren = max(max_input_ren, len(input_text))
        max_output_ren = max(max_output_ren, len(output_text))

        input_mat.append(input_text)
        output_mat.append(output_text)

    # padding
    for li in input_mat:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.insert(0, corpus.dic.token2id['<pad>'])
    for li in output_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])

    # create batch matrix
    input_mat = np.array(input_mat, dtype=np.int32).T
    output_mat = np.array(output_mat, dtype=np.int32).T

    # separate corpus into Train and Test
    train_input_mat = input_mat
    train_output_mat = output_mat

    #############################
    #### train seq2seq model ####
    #############################

    accum_loss = 0
    train_loss_data = []
    for num, epoch in enumerate(range(n_epoch)):
        total_loss = 0
        batch_num = 0
        perm = np.random.permutation(len(corpus.fine_posts) - testsize)

        # initialize optimizer
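        # note: a fresh Adam optimizer (with reset moment estimates) is created at every epoch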
        optimizer = optimizers.Adam(alpha=0.001)
        optimizer.setup(model)
        # optimizer.add_hook(chainer.optimizer.GradientClipping(5))
        optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

        # for training
        for i in range(0, len(corpus.fine_posts) - testsize, batchsize):

            # select batch data
            input_batch = train_input_mat[:, perm[i:i + batchsize]]
            output_batch = train_output_mat[:, perm[i:i + batchsize]]

            # Encode a sentence
            model.initialize()  # initialize cell
            model.encode(input_batch,
                         train=True)  # encode (output: hidden Variable)

            # Decode from encoded context
            end_batch = xp.array(
                [corpus.dic.token2id["<start>"] for _ in range(batchsize)])
            first_words = output_batch[0]
            loss, predict_mat = model.decode(end_batch,
                                             first_words,
                                             train=True)
            next_ids = first_words
            accum_loss += loss
            for w_ids in output_batch[1:]:
                loss, predict_mat = model.decode(next_ids, w_ids, train=True)
                next_ids = w_ids
                accum_loss += loss

            # learn model
            model.cleargrads()  # initialize all grad to zero
            accum_loss.backward()  # back propagation
            optimizer.update()
            total_loss += float(accum_loss.data)
            batch_num += 1
            print('Epoch: ', num, 'Batch_num', batch_num,
                  'batch loss: {:.2f}'.format(float(accum_loss.data)))
            accum_loss = 0

        train_loss_data.append(float(total_loss / batch_num))

        # save model and optimizer
        if (epoch + 1) % 5 == 0:
            print('-----', epoch + 1, ' epochs -----')
            print('save the model and optimizer')
            serializers.save_hdf5('data/' + str(epoch) + '_fine.model', model)
            serializers.save_hdf5('data/' + str(epoch) + '_fine.state',
                                  optimizer)

    # save loss data
    with open('./data/fine_loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)