	def __init__(self, output_dim, hidden_dim, output_length, depth=1, bidirectional=True, dropout=0.25, **kwargs):
		if bidirectional and hidden_dim % 2 != 0:
			raise Exception("hidden_dim for AttentionSeq2seq must be even (because of the bidirectional RNN).")
		super(AttentionSeq2seq, self).__init__()
		if type(depth) not in [list, tuple]:
			depth = (depth, depth)
		if bidirectional:
			encoder = Bidirectional(LSTMEncoder(output_dim=int(hidden_dim / 2), state_input=False, return_sequences=True, **kwargs))
		else:
			encoder = LSTMEncoder(output_dim=hidden_dim, state_input=False, return_sequences=True, **kwargs)
		decoder = AttentionDecoder(hidden_dim=hidden_dim, output_length=output_length, state_input=False, **kwargs)
		lstms = []
		for i in range(1, depth[0]):
			if bidirectional:
				layer = Bidirectional(LSTMEncoder(output_dim=int(hidden_dim / 2), state_input=False, return_sequences=True, **kwargs))
			else:
				layer = LSTMEncoder(output_dim=hidden_dim, state_input=False, return_sequences=True, **kwargs)
			self.add(layer)
			lstms.append(layer)
			self.add(Dropout(dropout))
		self.add(encoder)
		self.add(Dropout(dropout))
		self.add(TimeDistributedDense(hidden_dim if depth[1] > 1 else output_dim))
		lstms.append(encoder)
		self.add(decoder)
		lstms = [decoder]
		for i in range(1, depth[1]):
			layer = LSTMEncoder(output_dim=hidden_dim, state_input=False, return_sequences=True, **kwargs)
			self.add(layer)
			lstms.append(layer)
			self.add(Dropout(dropout))
		if depth[1] > 1:
			self.add(TimeDistributedDense(output_dim))
		self.encoder = encoder
		self.decoder = decoder
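A minimal usage sketch for the constructor above, mirroring the keyword arguments used in the commented-out call in the next example; the dimensions are illustrative only, and AttentionSeq2seq is assumed to behave like a Keras Sequential model.

# Hypothetical dimensions; hidden_dim must be even because bidirectional=True by default.
model = AttentionSeq2seq(input_dim=100, input_length=50,
                         hidden_dim=256, output_dim=1000,
                         output_length=20, depth=2)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')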
Example #2
def get_nn_model(token_dict_size):
    '''model = Sequential()
    seq2seq = AttentionSeq2seq(
    #seq2seq = SimpleSeq2seq(
        input_dim = TOKEN_REPRESENTATION_SIZE,
        input_length = INPUT_SEQUENCE_LENGTH,
        hidden_dim = HIDDEN_LAYER_DIMENSION,
        output_dim = token_dict_size,
        output_length = ANSWER_MAX_TOKEN_LENGTH,
        depth = 4
    )
    model.add(seq2seq)
    model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
    '''
    dropout = 0.1
    model = Sequential()
    encoder_top_layer = LSTM(HIDDEN_LAYER_DIMENSION,
                             input_dim=TOKEN_REPRESENTATION_SIZE,
                             input_length=INPUT_SEQUENCE_LENGTH,
                             return_sequences=True)

    decoder_top_layer = AttentionDecoder(hidden_dim=HIDDEN_LAYER_DIMENSION,
                                         output_dim=HIDDEN_LAYER_DIMENSION,
                                         output_length=ANSWER_MAX_TOKEN_LENGTH,
                                         state_input=False,
                                         return_sequences=True)
    #model.add(Embedding(input_dim=TOKEN_REPRESENTATION_SIZE,output_dim=HIDDEN_LAYER_DIMENSION,input_length=INPUT_SEQUENCE_LENGTH))
    model.add(encoder_top_layer)
    model.add(Dropout(dropout))
    model.add(LSTM(HIDDEN_LAYER_DIMENSION, return_sequences=False))
    model.add(RepeatVector(ANSWER_MAX_TOKEN_LENGTH))
    model.add(decoder_top_layer)
    model.add(Dropout(dropout))
    model.add(LSTM(HIDDEN_LAYER_DIMENSION, return_sequences=True))
    model.add(Dropout(dropout))
    model.add(TimeDistributed(Dense(token_dict_size)))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    #if os.path.isfile(NN_MODEL_PATH):
    #    model.load_weights(NN_MODEL_PATH)

    return model
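A hypothetical smoke test for get_nn_model, assuming the module-level constants referenced above (TOKEN_REPRESENTATION_SIZE, INPUT_SEQUENCE_LENGTH, ANSWER_MAX_TOKEN_LENGTH, HIDDEN_LAYER_DIMENSION) are defined; the array shapes follow the first and last layers of the model.

import numpy as np

token_dict_size = 1000  # assumed vocabulary size
model = get_nn_model(token_dict_size)
# Encoder input: (samples, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE)
x = np.zeros((2, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype='float32')
# Decoder target: one-hot, (samples, ANSWER_MAX_TOKEN_LENGTH, token_dict_size)
y = np.zeros((2, ANSWER_MAX_TOKEN_LENGTH, token_dict_size), dtype='float32')
model.fit(x, y, batch_size=2, nb_epoch=1)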
Example #3
	def __init__(self, output_dim, hidden_dim, output_length, depth=1, bidirectional=True, dropout=0.1, **kwargs):
		if bidirectional and hidden_dim % 2 != 0:
			raise Exception("hidden_dim for AttentionSeq2seq must be even (because of the bidirectional RNN).")
		super(AttentionSeq2seq, self).__init__()
		if type(depth) not in [list, tuple]:
			depth = (depth, depth)
		if 'batch_input_shape' in kwargs:
			shape = kwargs['batch_input_shape']
			del kwargs['batch_input_shape']
		elif 'input_shape' in kwargs:
			shape = (None,) + tuple(kwargs['input_shape'])
			del kwargs['input_shape']
		elif 'input_dim' in kwargs:
			if 'input_length' in kwargs:
				input_length = kwargs['input_length']
			else:
				input_length = None
			shape = (None, input_length, kwargs['input_dim'])
			del kwargs['input_dim']
		self.add(Layer(batch_input_shape=shape))
		if bidirectional:
			self.add(Bidirectional(LSTMEncoder(output_dim=int(hidden_dim / 2), state_input=False, return_sequences=True, **kwargs)))
		else:
			self.add(LSTMEncoder(output_dim=hidden_dim, state_input=False, return_sequences=True, **kwargs))
		for i in range(0, depth[0] - 1):
			self.add(Dropout(dropout))
			if bidirectional:
				self.add(Bidirectional(LSTMEncoder(output_dim=int(hidden_dim / 2), state_input=False, return_sequences=True, **kwargs)))
			else:
				self.add(LSTMEncoder(output_dim=hidden_dim, state_input=False, return_sequences=True, **kwargs))
		encoder = self.layers[-1]
		self.add(Dropout(dropout))
		self.add(TimeDistributed(Dense(hidden_dim if depth[1] > 1 else output_dim)))
		decoder = AttentionDecoder(hidden_dim=hidden_dim, output_length=output_length, state_input=False, **kwargs)
		self.add(Dropout(dropout))
		self.add(decoder)
		for i in range(0, depth[1] - 1):
			self.add(Dropout(dropout))
			self.add(LSTMEncoder(output_dim=hidden_dim, state_input=False, return_sequences=True, **kwargs))
		self.add(Dropout(dropout))
		self.add(TimeDistributed(Dense(output_dim)))
		self.encoder = encoder
		self.decoder = decoder
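The constructor above accepts the input shape in three forms (batch_input_shape, input_shape, or input_dim plus input_length); a sketch with illustrative values, assuming the class is importable as AttentionSeq2seq:

m1 = AttentionSeq2seq(output_dim=1000, hidden_dim=64, output_length=10,
                      batch_input_shape=(None, 20, 100))
m2 = AttentionSeq2seq(output_dim=1000, hidden_dim=64, output_length=10,
                      input_shape=(20, 100))
m3 = AttentionSeq2seq(output_dim=1000, hidden_dim=64, output_length=10,
                      input_dim=100, input_length=20)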
Example #4
def train():
    # input_text = ['1 2 3 4 5'
    #               , '6 7 8 9 10'
    #               , '11 12 13 14 15'
    #               , '16 17 18 19 20'
    #               , '21 22 23 24 25']
    # tar_text = ['one two three four five'
    #             , 'six seven eight nine ten'
    #             , 'eleven twelve thirteen fourteen fifteen'
    #             , 'sixteen seventeen eighteen nineteen twenty'
    #             , 'twenty_one twenty_two twenty_three twenty_four twenty_five']


    # input_text = ['床 前 明 月 光 ,'
    #               , '举 头 望 明 月 ,'
    #               , '敝 笱 在 梁 ,'
    #               , '齐 子 归 止 ,'
    #               , '21 22 23 24 25']
    # tar_text = ['疑 是 地 上 霜。'
    #             , '低 头 思 故 乡。'
    #             , '其 鱼 鲂 鳏 。'
    #             , '其 从 如 云 。'
    #             , 'twenty_one twenty_two twenty_three twenty_four twenty_five']


    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))

    vocab = loaddic('./corpus/dic.txt')

    print('-----------')
    # print(vocab)
    print('-----------')
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # Keras Embedding needs len(vocab) + 1 because index 0 is reserved for masking
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 97
    tar_maxlen = 20
    output_dim = vocab_size
    hidden_dim = 500

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict((c, i + 1) for i, c in enumerate(vocab))  # encoding: map each token to an integer index
    idx_to_word = dict((i + 1, c) for i, c in enumerate(vocab))  # decoding: map each integer index back to its token

    decoder_mode = 3  # 0: simplest mode, 1: [1] feed-back mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                        , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim
                                         , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                             , output_length=tar_maxlen, state_input=False, return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size,
                              output_dim=hidden_dim,
                              input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time:%fsecond!' % (time_end - time_start))

    # input_text = loadfile('./corpus/content-12147.txt')
    # tar_text = loadfile('./corpus/title-12147.txt')
    input_text = loadfile('./corpus/content-12147.txt')
    tar_text = loadfile('./corpus/title-12147.txt')
    time1 = time.time()
    for iter_num in range(1):
        for i in range(0, 58213, 20):
            if i == 58200:
                break
            input_list = []
            tar_list = []
            for tmp_input in input_text[i:i+20]:
                input_list.append(chtokenize(tmp_input))
            for tmp_tar in tar_text[i:i+20]:
                tar_list.append(chtokenize(tmp_tar))
            inputs_train, tars_train = vectorize_stories(input_list, tar_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size)
            en_de_model.fit(inputs_train, tars_train, batch_size=2, nb_epoch=1, show_accuracy=True)
            print('Current line: ' + str(i))
            print('Current iter_num is:%d' % iter_num)
        # out_predicts = en_de_model.predict(inputs_train)
        # for i_idx, out_predict in enumerate(out_predicts):
        #     predict_sequence = []
        #     for predict_vector in out_predict:
        #         next_index = np.argmax(predict_vector)
        #         next_token = idx_to_word[next_index]
        #         predict_sequence.append(next_token)
        #     print('Target output:', tar_text[i_idx])
        #     print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
    en_de_model.save_weights('en_de_weights.h5')
    print('Train Ended')
    time2 = time.time() - time1
    print(time2)
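loaddic and loadfile are not defined in this snippet; plausible line-per-entry readers, consistent with the inline dictionary loading shown in a later example, might look like this (paths and exact behaviour are assumptions):

def loaddic(path):
    # One vocabulary token per line, e.g. ./corpus/dic.txt
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

def loadfile(path):
    # One sentence per line, e.g. ./corpus/content-12147.txt
    with open(path) as f:
        return [line.strip() for line in f]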
Example #5
def main():
    input_text = [
        '1 2 3 4 5', '6 7 8 9 10', '11 12 13 14 15', '16 17 18 19 20',
        '21 22 23 24 25'
    ]
    tar_text = [
        'one two three four five', 'six seven eight nine ten',
        'eleven twelve thirteen fourteen fifteen',
        'sixteen seventeen eighteen nineteen twenty',
        'twenty_one twenty_two twenty_three twenty_four twenty_five'
    ]

    input_list = []
    tar_list = []

    for tmp_input in input_text:
        input_list.append(tokenize(tmp_input))
    for tmp_tar in tar_text:
        tar_list.append(tokenize(tmp_tar))

    vocab = sorted(
        reduce(lambda x, y: x | y,
               (set(tmp_list) for tmp_list in input_list + tar_list)))
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # Keras Embedding needs len(vocab) + 1 because index 0 is reserved for masking
    input_maxlen = max(map(len, (x for x in input_list)))
    tar_maxlen = max(map(len, (x for x in tar_list)))
    output_dim = vocab_size
    hidden_dim = 20

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    print('Number of training stories:', len(input_list))
    print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict(
        (c, i + 1) for i, c in enumerate(vocab))  # encoding: map each token to an integer index
    idx_to_word = dict(
        (i + 1, c) for i, c in enumerate(vocab))  # decoding: map each integer index back to its token
    inputs_train, tars_train = vectorize_stories(input_list, tar_list,
                                                 word_to_idx, input_maxlen,
                                                 tar_maxlen, vocab_size)

    decoder_mode = 1  # 0: simplest mode, 1: [1] feed-back mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=tar_maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=tar_maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=tar_maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time:%fsecond!' % (time_end - time_start))
    for iter_num in range(5000):
        en_de_model.fit(inputs_train,
                        tars_train,
                        batch_size=3,
                        nb_epoch=1,
                        show_accuracy=True)
        out_predicts = en_de_model.predict(inputs_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                predict_sequence.append(next_token)
            print('Target output:', tar_text[i_idx])
            print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
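vectorize_stories is not included in these snippets; a plausible sketch consistent with how it is called in this main() (padded integer-index inputs for the Embedding layer, one-hot targets for the softmax output) could be:

import numpy as np
from keras.preprocessing.sequence import pad_sequences

def vectorize_stories(input_list, tar_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size):
    # Map tokens to integer indices and pad encoder inputs (0 is the mask index).
    # Assumes every token is present in word_to_idx.
    x = [[word_to_idx[w] for w in tokens] for tokens in input_list]
    x = pad_sequences(x, maxlen=input_maxlen)
    # One-hot encode decoder targets, truncated/padded to tar_maxlen.
    y = np.zeros((len(tar_list), tar_maxlen, vocab_size), dtype='float32')
    for i, tokens in enumerate(tar_list):
        for t, w in enumerate(tokens[:tar_maxlen]):
            y[i, t, word_to_idx[w]] = 1.0
    return x, y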
Example #6
def train():
    # input_text = ['1 2 3 4 5'
    #               , '6 7 8 9 10'
    #               , '11 12 13 14 15'
    #               , '16 17 18 19 20'
    #               , '21 22 23 24 25']
    # tar_text = ['one two three four five'
    #             , 'six seven eight nine ten'
    #             , 'eleven twelve thirteen fourteen fifteen'
    #             , 'sixteen seventeen eighteen nineteen twenty'
    #             , 'twenty_one twenty_two twenty_three twenty_four twenty_five']

    # input_text = ['Hello this is Tom speaking Is that John ?'
    #               , 'Would you like to go swimming with me ?'
    #               , 'Ok see you then Bye'
    #               , 'Yeah I am free What time shall we meet ?'
    #               , 'How does it taste ?']
    # tar_text = ['Yes this is What s up ?'
    #             , 'That sounds great It s good weather for swimming  I d love to .'
    #             , 'See you'
    #             , 'At 3:00PM'
    #             , 'It tastes good you should try some.']

    # Sort and build the vocabulary (a list) from the data
    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))

    vocab = []
    dicfile = open('./corpus/dic.txt','r')
    line = dicfile.readline()
    while line:
        vocab.append(line.strip())
        line = dicfile.readline()
    dicfile.close()

    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # Keras Embedding needs len(vocab) + 1 because index 0 is reserved for masking
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 99
    tar_maxlen = 22
    output_dim = vocab_size
    hidden_dim = 50

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict((c, i + 1) for i, c in enumerate(vocab))  # encoding: map each token to an integer index
    idx_to_word = dict((i + 1, c) for i, c in enumerate(vocab))  # decoding: map each integer index back to its token


    decoder_mode = 3  # 0: simplest mode, 1: [1] feed-back mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                        , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim
                                         , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                             , output_length=tar_maxlen, state_input=False, return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size,
                              output_dim=hidden_dim,
                              input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time:%fsecond!' % (time_end - time_start))

    for iter_num in range(5000):
        # Build the training data and feed it in batches
        input_text = loadfile('./corpus/content-12147.txt')
        tar_text = loadfile('./corpus/title-12147.txt')
        for i in range(0, 58213, 20):
            if i == 58200:
                break
            input_list = []
            tar_list = []
            for tmp_input in input_text[i:i+20]:
                input_list.append(chtokenize(tmp_input, vocab))
            for tmp_tar in tar_text[i:i+20]:
                tar_list.append(chtokenize(tmp_tar, vocab))
            inputs_train, tars_train = vectorize_stories(input_list, tar_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size)
            en_de_model.fit(inputs_train, tars_train, batch_size=4, nb_epoch=1, show_accuracy=True)

        # out_predicts = en_de_model.predict(inputs_train)
        # for i_idx, out_predict in enumerate(out_predicts):
        #     predict_sequence = []
        #     for predict_vector in out_predict:
        #         next_index = np.argmax(predict_vector)
        #         next_token = idx_to_word[next_index]
        #         predict_sequence.append(next_token)
        #     print('Target output:', tar_text[i_idx].decode('utf8'))
        #     print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
    en_de_model.save_weights('en_de_weights1-40.h5')
    print ('Train Ended')
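chtokenize is not shown; given that it is called here with the vocabulary as a second argument and the corpus strings are whitespace-separated, a plausible sketch is (the out-of-vocabulary handling is an assumption):

def chtokenize(text, vocab):
    # Split on whitespace and keep only in-vocabulary tokens.
    return [tok for tok in text.strip().split() if tok in vocab]

Converting vocab to a set before the loop would make the membership test considerably cheaper for a large dictionary.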
Example #7
def main():
    filep = pjoin(m_path, "cut_recovery_10", "Paging")
    file_num = len(os.listdir(pjoin(filep, "input")))

    file_names_idx = list(range(file_num))
    random.shuffle(file_names_idx)
    # train_names_idx = file_names_idx[:file_num/7]
    # test_names_idx = file_names_idx[file_num/7+1:]
    train_names_idx = file_names_idx[:5000]
    test_names_idx = file_names_idx[5001:6000]

    vocab = load_vocab(
        pjoin(m_path, "BaseLine-BigData_1kUE_20ENB_paging-Case_Group_1-Case_1",
              "dic.txt"))
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # Keras Embedding needs len(vocab) + 1 because index 0 is reserved for masking

    #input_maxlen = max(map(len, (x for x in input_list)))
    #output_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 200
    output_maxlen = 200
    output_dim = vocab_size
    hidden_dim = 300

    # print('-')
    # print('Vocab size:', vocab_size, 'unique words')
    # print('Input max length:', input_maxlen, 'words')
    # print('Target max length:', tar_maxlen, 'words')
    # print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    # print('-')
    # print('Vectorizing the word sequences...')
    word_to_idx = dict(
        (c, i + 1) for i, c in enumerate(vocab))  # encoding: map each token to an integer index
    idx_to_word = dict(
        (i + 1, c) for i, c in enumerate(vocab))  # decoding: map each integer index back to its token

    decoder_mode = 1  # 0: simplest mode, 1: [1] feed-back mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=output_maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=output_maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=output_maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)

    if decoder_mode == 0:
        en_de_model.add(RepeatVector(output_maxlen))

    en_de_model.add(decoder_top_layer)
    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time:%fsecond!' % (time_end - time_start))

    for iter_num in range(10):
        input_sen, output_sen = load_data(filep, train_names_idx)
        input_list, output_list = io_list(input_sen, output_sen)
        inputs_train, outputs_train = vectorize_stories(
            input_list, output_list, vocab, word_to_idx, input_maxlen,
            output_maxlen, vocab_size)

        en_de_model.fit(inputs_train,
                        outputs_train,
                        batch_size=50,
                        nb_epoch=5,
                        show_accuracy=True)
        out_predicts = en_de_model.predict(inputs_train)

        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                predict_sequence.append(next_token)
            #print('Target output:', toutput_text[i_idx])
            #print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)

    print("test")
Example #8
elif decoder_mode == 1:
    decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                    output_dim=hidden_dim,
                                    output_length=tar_maxlen,
                                    state_input=False,
                                    return_sequences=True)
elif decoder_mode == 2:
    decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                     output_dim=hidden_dim,
                                     output_length=tar_maxlen,
                                     state_input=False,
                                     return_sequences=True)
elif decoder_mode == 3:
    decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=tar_maxlen,
                                         state_input=False,
                                         return_sequences=True)

en_de_model = Sequential()
print('Test 001 - jacoxu')
en_de_model.add(
    Embedding(input_dim=vocab_size,
              output_dim=hidden_dim,
              input_length=input_maxlen))

print('Test 002 - jacoxu')
en_de_model.add(encoder_top_layer)
if decoder_mode == 0:
    en_de_model.add(RepeatVector(tar_maxlen))
en_de_model.add(decoder_top_layer)
Example #9
def main():
    f = open("X_train.pkl", 'rb')  # binary mode for pickle
    X_train = pickle.load(f)
    '''
    f=open('word2index.pkl','r')
    word2index=pickle.load(f)
    f=open('index2word.pkl','r')
    index2word=pickle.load(f)

    inputs_train, tars_train = vectorize_stories(X_train, X_train, word2index, maxlen, maxlen, vocab_size)
    '''
    X_train = pad_sequences(X_train, maxlen=maxlen)

    decoder_mode = 1  # 0: simplest mode, 1: [1] feed-back mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(vocab_size))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f second!' % (time_end - time_start))
    for iter_num in range(5000):
        en_de_model.fit(X_train,
                        X_train,
                        batch_size=3,
                        nb_epoch=1,
                        show_accuracy=True)
        out_predicts = en_de_model.predict(X_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            '''
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = index2word[next_index]
                predict_sequence.append(next_token)
            '''
            print('Target output:', X_train[i_idx])
            print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
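Note that fit() above receives the 2-D padded index matrix X_train as targets, while the TimeDistributedDense softmax layer produces (samples, maxlen, vocab_size); a conversion along these lines would likely be needed (the helper name is hypothetical):

import numpy as np

def to_one_hot(index_sequences, maxlen, vocab_size):
    # Expand padded index rows into one-hot targets matching the softmax output shape.
    y = np.zeros((len(index_sequences), maxlen, vocab_size), dtype='float32')
    for i, seq in enumerate(index_sequences):
        for t, idx in enumerate(seq[:maxlen]):
            y[i, t, idx] = 1.0
    return y

# Y_train = to_one_hot(X_train, maxlen, vocab_size)
# en_de_model.fit(X_train, Y_train, batch_size=3, nb_epoch=1, show_accuracy=True)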
Example #10
def train():
    # input_text = ['1 2 3 4 5'
    #               , '6 7 8 9 10'
    #               , '11 12 13 14 15'
    #               , '16 17 18 19 20'
    #               , '21 22 23 24 25']
    # tar_text = ['one two three four five'
    #             , 'six seven eight nine ten'
    #             , 'eleven twelve thirteen fourteen fifteen'
    #             , 'sixteen seventeen eighteen nineteen twenty'
    #             , 'twenty_one twenty_two twenty_three twenty_four twenty_five']

    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))

    vocab = loaddic('./corpus/smalldic.txt')

    print('-----------')
    # print vocab
    print('-----------')
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # Keras Embedding needs len(vocab) + 1 because index 0 is reserved for masking
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 70
    tar_maxlen = 17
    output_dim = vocab_size
    hidden_dim = 100

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict(
        (c, i + 1) for i, c in enumerate(vocab))  # encoding: map each token to an integer index
    idx_to_word = dict(
        (i + 1, c) for i, c in enumerate(vocab))  # decoding: map each integer index back to its token

    decoder_mode = 3  # 0: simplest mode, 1: [1] feed-back mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=tar_maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=tar_maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=tar_maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))

    en_de_model.load_weights('en_de_weights1-40.h5')

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time:%fsecond!' % (time_end - time_start))

    # # input_text = loadfile('./corpus/content-12147.txt')
    # input_text = loadfile('./corpus/content1-500.txt')
    #
    # input_list = []
    # for tmp_input in input_text:
    #     input_list.append(chtokenize(tmp_input))
    #
    # inputs_train = vectorize_x(input_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size)
    #
    # out_predicts = en_de_model.predict(inputs_train)
    # for i_idx, out_predict in enumerate(out_predicts):
    #     predict_sequence = []
    #     tempstr = ''
    #     for predict_vector in out_predict:
    #         next_index = np.argmax(predict_vector)
    #         next_token = idx_to_word[next_index]
    #         # print next_token
    #         tempstr += next_token
    #         predict_sequence.append(next_token)
    #     print tempstr
    #     # print('Predict output:', predict_sequence)
    #
    # print ('Train Ended')

    # def predict(input_text):
    import socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    host = socket.gethostbyname(socket.gethostname())
    port = 50008
    sock.bind((host, port))
    sock.listen(5)
    while True:
        conn, addr = sock.accept()
        data = conn.recv(1024).decode('utf-8')  # decode the raw bytes received from the socket
        text_list = []  # renamed from 'list' to avoid shadowing the builtin
        # input_text = '实际上,上周主管部门就和大唐打过招呼了,内部消息人士透露,国资委已经就李小琳任职问题和大唐进行沟通,但李小琳本人至今未报到。情况比较复杂,上述人士表示,目前还不敢完全确定,不排除后续还有变化。'
        tmp = 'BEG ' + data + ' END'
        tmp = jiebacut(tmp)
        text_list.append(tmp)
        result = ''
        input_list = []
        for tmp_input in text_list:
            print(tmp_input)
            print('---!--!---')
            input_list.append(chtokenize(tmp_input))
        inputs_train = vectorize_x(input_list, word_to_idx, input_maxlen)
        out_predicts = en_de_model.predict(inputs_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            tempstr = ''
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                # print next_token
                tempstr += next_token
                predict_sequence.append(next_token)
            print(tempstr)
            result = tempstr

            print('Predict output:', predict_sequence)
        reply = result
        conn.send(reply.encode())
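A minimal client sketch for querying the socket server above; the host and port are assumed to match the server's settings, and the input text is expected to be segmented the same way as the training corpus.

import socket

client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
client.connect((socket.gethostbyname(socket.gethostname()), 50008))
client.send(u'...input text...'.encode('utf-8'))   # one request per connection
print(client.recv(4096).decode('utf-8'))           # predicted title returned by the server
client.close()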