def __init__(self, output_dim, hidden_dim, output_length, depth=1,
             bidirectional=True, dropout=0.25, **kwargs):
    if bidirectional and hidden_dim % 2 != 0:
        raise Exception("hidden_dim for AttentionSeq2seq should be even "
                        "(because of the bidirectional RNN).")
    super(AttentionSeq2seq, self).__init__()
    if type(depth) not in [list, tuple]:
        depth = (depth, depth)
    # Encoder: a (bidirectional) LSTM that returns the full output sequence.
    if bidirectional:
        encoder = Bidirectional(LSTMEncoder(output_dim=hidden_dim // 2, state_input=False,
                                            return_sequences=True, **kwargs))
    else:
        encoder = LSTMEncoder(output_dim=hidden_dim, state_input=False,
                              return_sequences=True, **kwargs)
    decoder = AttentionDecoder(hidden_dim=hidden_dim, output_length=output_length,
                               state_input=False, **kwargs)
    lstms = []
    # Stack depth[0] - 1 additional encoder layers below the top encoder.
    for i in range(1, depth[0]):
        if bidirectional:
            layer = Bidirectional(LSTMEncoder(output_dim=hidden_dim // 2, state_input=False,
                                              return_sequences=True, **kwargs))
        else:
            layer = LSTMEncoder(output_dim=hidden_dim, state_input=False,
                                return_sequences=True, **kwargs)
        self.add(layer)
        lstms.append(layer)
        self.add(Dropout(dropout))
    self.add(encoder)
    self.add(Dropout(dropout))
    self.add(TimeDistributedDense(hidden_dim if depth[1] > 1 else output_dim))
    lstms.append(encoder)
    self.add(decoder)
    lstms = [decoder]
    # Stack depth[1] - 1 additional decoder-side LSTM layers.
    for i in range(1, depth[1]):
        layer = LSTMEncoder(output_dim=hidden_dim, state_input=False,
                            return_sequences=True, **kwargs)
        self.add(layer)
        lstms.append(layer)
        self.add(Dropout(dropout))
    if depth[1] > 1:
        self.add(TimeDistributedDense(output_dim))
    self.encoder = encoder
    self.decoder = decoder
def get_nn_model(token_dict_size):
    '''Alternative: use the library's AttentionSeq2seq wrapper directly.
    model = Sequential()
    seq2seq = AttentionSeq2seq(
        # seq2seq = SimpleSeq2seq(
        input_dim=TOKEN_REPRESENTATION_SIZE,
        input_length=INPUT_SEQUENCE_LENGTH,
        hidden_dim=HIDDEN_LAYER_DIMENSION,
        output_dim=token_dict_size,
        output_length=ANSWER_MAX_TOKEN_LENGTH,
        depth=4)
    model.add(seq2seq)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    '''
    dropout = 0.1
    model = Sequential()
    encoder_top_layer = LSTM(HIDDEN_LAYER_DIMENSION,
                             input_dim=TOKEN_REPRESENTATION_SIZE,
                             input_length=INPUT_SEQUENCE_LENGTH,
                             return_sequences=True)
    decoder_top_layer = AttentionDecoder(hidden_dim=HIDDEN_LAYER_DIMENSION,
                                         output_dim=HIDDEN_LAYER_DIMENSION,
                                         output_length=ANSWER_MAX_TOKEN_LENGTH,
                                         state_input=False,
                                         return_sequences=True)
    # model.add(Embedding(input_dim=TOKEN_REPRESENTATION_SIZE, output_dim=HIDDEN_LAYER_DIMENSION,
    #                     input_length=INPUT_SEQUENCE_LENGTH))
    model.add(encoder_top_layer)
    model.add(Dropout(dropout))
    model.add(LSTM(HIDDEN_LAYER_DIMENSION, return_sequences=False))
    model.add(RepeatVector(ANSWER_MAX_TOKEN_LENGTH))
    model.add(decoder_top_layer)
    model.add(Dropout(dropout))
    model.add(LSTM(HIDDEN_LAYER_DIMENSION, return_sequences=True))
    model.add(Dropout(dropout))
    model.add(TimeDistributed(Dense(token_dict_size)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    # if os.path.isfile(NN_MODEL_PATH):
    #     model.load_weights(NN_MODEL_PATH)
    return model
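# A minimal usage sketch for get_nn_model(), assuming the surrounding module defines
# TOKEN_REPRESENTATION_SIZE, INPUT_SEQUENCE_LENGTH and ANSWER_MAX_TOKEN_LENGTH, and that
# training pairs are already vectorized as 3-D tensors of shape
# (n_samples, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE) for inputs and
# (n_samples, ANSWER_MAX_TOKEN_LENGTH, token_dict_size) one-hot targets. The dummy data
# below only illustrates the expected shapes; it is not part of the original script.
import numpy as np

token_dict_size = 1000
model = get_nn_model(token_dict_size)
X = np.zeros((8, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE), dtype='float32')
Y = np.zeros((8, ANSWER_MAX_TOKEN_LENGTH, token_dict_size), dtype='float32')
Y[:, :, 0] = 1.0  # trivially one-hot so categorical_crossentropy is well defined
model.fit(X, Y, batch_size=4, nb_epoch=1)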
def __init__(self, output_dim, hidden_dim, output_length, depth=1,
             bidirectional=True, dropout=0.1, **kwargs):
    if bidirectional and hidden_dim % 2 != 0:
        raise Exception("hidden_dim for AttentionSeq2seq should be even "
                        "(because of the bidirectional RNN).")
    super(AttentionSeq2seq, self).__init__()
    if type(depth) not in [list, tuple]:
        depth = (depth, depth)
    # Work out the input shape from whichever keyword the caller supplied.
    if 'batch_input_shape' in kwargs:
        shape = kwargs['batch_input_shape']
        del kwargs['batch_input_shape']
    elif 'input_shape' in kwargs:
        shape = (None,) + tuple(kwargs['input_shape'])
        del kwargs['input_shape']
    elif 'input_dim' in kwargs:
        if 'input_length' in kwargs:
            input_length = kwargs['input_length']
        else:
            input_length = None
        shape = (None, input_length, kwargs['input_dim'])
        del kwargs['input_dim']
    self.add(Layer(batch_input_shape=shape))
    if bidirectional:
        self.add(Bidirectional(LSTMEncoder(output_dim=int(hidden_dim / 2), state_input=False,
                                           return_sequences=True, **kwargs)))
    else:
        self.add(LSTMEncoder(output_dim=hidden_dim, state_input=False,
                             return_sequences=True, **kwargs))
    for i in range(0, depth[0] - 1):
        self.add(Dropout(dropout))
        if bidirectional:
            self.add(Bidirectional(LSTMEncoder(output_dim=int(hidden_dim / 2), state_input=False,
                                               return_sequences=True, **kwargs)))
        else:
            self.add(LSTMEncoder(output_dim=hidden_dim, state_input=False,
                                 return_sequences=True, **kwargs))
    encoder = self.layers[-1]
    self.add(Dropout(dropout))
    self.add(TimeDistributed(Dense(hidden_dim if depth[1] > 1 else output_dim)))
    decoder = AttentionDecoder(hidden_dim=hidden_dim, output_length=output_length,
                               state_input=False, **kwargs)
    self.add(Dropout(dropout))
    self.add(decoder)
    for i in range(0, depth[1] - 1):
        self.add(Dropout(dropout))
        self.add(LSTMEncoder(output_dim=hidden_dim, state_input=False,
                             return_sequences=True, **kwargs))
    self.add(Dropout(dropout))
    self.add(TimeDistributed(Dense(output_dim)))
    self.encoder = encoder
    self.decoder = decoder
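# A minimal sketch of how this constructor is typically used, assuming the surrounding
# seq2seq package (AttentionSeq2seq subclasses Sequential and accepts the input_dim /
# input_length keywords handled above). All sizes are illustrative only.
model = AttentionSeq2seq(input_dim=128,      # feature size of each input timestep
                         input_length=20,    # encoder sequence length
                         hidden_dim=64,      # must be even when bidirectional=True
                         output_dim=1000,    # e.g. target vocabulary size
                         output_length=15,   # decoder sequence length
                         depth=2)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')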
def train():
    # Toy examples kept from earlier experiments:
    # input_text = ['1 2 3 4 5', '6 7 8 9 10', '11 12 13 14 15',
    #               '16 17 18 19 20', '21 22 23 24 25']
    # tar_text = ['one two three four five', 'six seven eight nine ten',
    #             'eleven twelve thirteen fourteen fifteen',
    #             'sixteen seventeen eighteen nineteen twenty',
    #             'twenty_one twenty_two twenty_three twenty_four twenty_five']
    # input_text = ['床 前 明 月 光 ,', '举 头 望 明 月 ,', '敝 笱 在 梁 ,', '齐 子 归 止 ,', '21 22 23 24 25']
    # tar_text = ['疑 是 地 上 霜。', '低 头 思 故 乡。', '其 鱼 鲂 鳏 。', '其 从 如 云 。',
    #             'twenty_one twenty_two twenty_three twenty_four twenty_five']
    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))
    vocab = loaddic('./corpus/dic.txt')
    print('-----------')
    # print(vocab)
    print('-----------')

    # Reserve 0 for masking via pad_sequences.
    vocab_size = len(vocab) + 1  # the Keras Embedding layer needs len(vocab) + 1
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 97
    tar_maxlen = 20
    output_dim = vocab_size
    hidden_dim = 500

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict((c, i + 1) for i, c in enumerate(vocab))  # encoding: token -> index
    idx_to_word = dict((i + 1, c) for i, c in enumerate(vocab))  # decoding: index -> token

    decoder_mode = 3  # 0: plain seq2seq, 1: feed-previous decoder [1], 2: peek decoder [2], 3: attention decoder [3]
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                        output_length=tar_maxlen, state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                         output_length=tar_maxlen, state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                             output_length=tar_maxlen, state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size, output_dim=hidden_dim,
                              input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)
    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f second!' % (time_end - time_start))

    input_text = loadfile('./corpus/content-12147.txt')
    tar_text = loadfile('./corpus/title-12147.txt')

    time1 = time.time()
    for iter_num in range(1):
        # Feed the corpus in windows of 20 lines.
        for i in range(0, 58213, 20):
            if i == 58200:
                break
            input_list = []
            tar_list = []
            for tmp_input in input_text[i:i + 20]:
                input_list.append(chtokenize(tmp_input))
            for tmp_tar in tar_text[i:i + 20]:
                tar_list.append(chtokenize(tmp_tar))
            inputs_train, tars_train = vectorize_stories(input_list, tar_list, word_to_idx,
                                                         input_maxlen, tar_maxlen, vocab_size)
            en_de_model.fit(inputs_train, tars_train, batch_size=2, nb_epoch=1,
                            show_accuracy=True)
            print('Current line: ' + str(i))
            print('Current iter_num is: %d' % iter_num)
            # out_predicts = en_de_model.predict(inputs_train)
            # for i_idx, out_predict in enumerate(out_predicts):
            #     predict_sequence = []
            #     for predict_vector in out_predict:
            #         next_index = np.argmax(predict_vector)
            #         next_token = idx_to_word[next_index]
            #         predict_sequence.append(next_token)
            #     print('Target output:', tar_text[i_idx])
            #     print('Predict output:', predict_sequence)
        print('Current iter_num is: %d' % iter_num)
        en_de_model.save_weights('en_de_weights.h5')
    print('Train Ended')
    time2 = time.time() - time1
    print(time2)
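# The helpers loaddic(), loadfile(), chtokenize() and vectorize_stories() are called above
# but are not part of this snippet. The sketch below is an assumption about their behavior,
# inferred only from how they are used here: one vocabulary token per line in dic.txt,
# whitespace-separated tokens per corpus line, padded index sequences for the encoder input,
# and one-hot tensors for the decoder targets.
import numpy as np
from keras.preprocessing.sequence import pad_sequences


def loaddic(path):
    # One vocabulary token per line.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


def loadfile(path):
    # One training sentence per line.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


def chtokenize(sentence):
    # The corpus is assumed to be pre-segmented with spaces (e.g. by jieba).
    return sentence.split()


def vectorize_stories(input_list, tar_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size):
    # Inputs become padded index sequences; targets become one-hot tensors so they can be
    # scored with categorical_crossentropy against the softmax over the vocabulary.
    x = [[word_to_idx.get(w, 0) for w in tokens] for tokens in input_list]
    x = pad_sequences(x, maxlen=input_maxlen)
    y = np.zeros((len(tar_list), tar_maxlen, vocab_size), dtype='float32')
    for i, tokens in enumerate(tar_list):
        for t, w in enumerate(tokens[:tar_maxlen]):
            y[i, t, word_to_idx.get(w, 0)] = 1.0
    return x, y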
def main():
    input_text = ['1 2 3 4 5',
                  '6 7 8 9 10',
                  '11 12 13 14 15',
                  '16 17 18 19 20',
                  '21 22 23 24 25']
    tar_text = ['one two three four five',
                'six seven eight nine ten',
                'eleven twelve thirteen fourteen fifteen',
                'sixteen seventeen eighteen nineteen twenty',
                'twenty_one twenty_two twenty_three twenty_four twenty_five']

    input_list = []
    tar_list = []
    for tmp_input in input_text:
        input_list.append(tokenize(tmp_input))
    for tmp_tar in tar_text:
        tar_list.append(tokenize(tmp_tar))
    vocab = sorted(reduce(lambda x, y: x | y,
                          (set(tmp_list) for tmp_list in input_list + tar_list)))
    # Reserve 0 for masking via pad_sequences.
    vocab_size = len(vocab) + 1  # the Keras Embedding layer needs len(vocab) + 1
    input_maxlen = max(map(len, (x for x in input_list)))
    tar_maxlen = max(map(len, (x for x in tar_list)))
    output_dim = vocab_size
    hidden_dim = 20

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    print('Number of training stories:', len(input_list))
    print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict((c, i + 1) for i, c in enumerate(vocab))  # encoding: token -> index
    idx_to_word = dict((i + 1, c) for i, c in enumerate(vocab))  # decoding: index -> token
    inputs_train, tars_train = vectorize_stories(input_list, tar_list, word_to_idx,
                                                 input_maxlen, tar_maxlen, vocab_size)

    decoder_mode = 1  # 0: plain seq2seq, 1: feed-previous decoder [1], 2: peek decoder [2], 3: attention decoder [3]
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                        output_length=tar_maxlen, state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                         output_length=tar_maxlen, state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                             output_length=tar_maxlen, state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size, output_dim=hidden_dim,
                              input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)
    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f second!' % (time_end - time_start))
    for iter_num in range(5000):
        en_de_model.fit(inputs_train, tars_train, batch_size=3, nb_epoch=1,
                        show_accuracy=True)
        out_predicts = en_de_model.predict(inputs_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                predict_sequence.append(next_token)
            print('Target output:', tar_text[i_idx])
            print('Predict output:', predict_sequence)
        print('Current iter_num is: %d' % iter_num)
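# tokenize() is referenced above but not included in this snippet. A minimal sketch,
# assuming simple whitespace-delimited toy data as in input_text/tar_text above; under
# Python 3, reduce() used in the vocab construction must also be imported from functools.
from functools import reduce


def tokenize(sentence):
    # Split a toy sentence like '1 2 3 4 5' into its tokens.
    return sentence.split()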
def train():
    # Toy examples kept from earlier experiments:
    # input_text = ['1 2 3 4 5', '6 7 8 9 10', '11 12 13 14 15',
    #               '16 17 18 19 20', '21 22 23 24 25']
    # tar_text = ['one two three four five', 'six seven eight nine ten',
    #             'eleven twelve thirteen fourteen fifteen',
    #             'sixteen seventeen eighteen nineteen twenty',
    #             'twenty_one twenty_two twenty_three twenty_four twenty_five']
    # input_text = ['Hello this is Tom speaking Is that John ?',
    #               'Would you like to go swimming with me ?',
    #               'Ok see you then Bye',
    #               'Yeah I am free What time shall we meet ?',
    #               'How does it taste ?']
    # tar_text = ['Yes this is What s up ?',
    #             'That sounds great It s good weather for swimming I d love to .',
    #             'See you',
    #             'At 3:00PM',
    #             'It tastes good you should try some.']

    # Build the vocabulary as a list, one token per line of dic.txt.
    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))
    vocab = []
    dicfile = open('./corpus/dic.txt', 'r')
    line = dicfile.readline()
    while line:
        vocab.append(line.strip())
        line = dicfile.readline()
    dicfile.close()

    # Reserve 0 for masking via pad_sequences.
    vocab_size = len(vocab) + 1  # the Keras Embedding layer needs len(vocab) + 1
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 99
    tar_maxlen = 22
    output_dim = vocab_size
    hidden_dim = 50

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict((c, i + 1) for i, c in enumerate(vocab))  # encoding: token -> index
    idx_to_word = dict((i + 1, c) for i, c in enumerate(vocab))  # decoding: index -> token

    decoder_mode = 3  # 0: plain seq2seq, 1: feed-previous decoder [1], 2: peek decoder [2], 3: attention decoder [3]
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                        output_length=tar_maxlen, state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                         output_length=tar_maxlen, state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                             output_length=tar_maxlen, state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size, output_dim=hidden_dim,
                              input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)
    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f second!' % (time_end - time_start))

    for iter_num in range(5000):
        # Build the data and feed it in mini-batches of 20 corpus lines.
        input_text = loadfile('./corpus/content-12147.txt')
        tar_text = loadfile('./corpus/title-12147.txt')
        # Step by the window size; the original trailing 'i += 20' had no effect inside a for loop.
        for i in range(0, 58213, 20):
            if i == 58200:
                break
            input_list = []
            tar_list = []
            for tmp_input in input_text[i:i + 20]:
                input_list.append(chtokenize(tmp_input, vocab))
            for tmp_tar in tar_text[i:i + 20]:
                tar_list.append(chtokenize(tmp_tar, vocab))
            inputs_train, tars_train = vectorize_stories(input_list, tar_list, word_to_idx,
                                                         input_maxlen, tar_maxlen, vocab_size)
            en_de_model.fit(inputs_train, tars_train, batch_size=4, nb_epoch=1,
                            show_accuracy=True)
            # out_predicts = en_de_model.predict(inputs_train)
            # for i_idx, out_predict in enumerate(out_predicts):
            #     predict_sequence = []
            #     for predict_vector in out_predict:
            #         next_index = np.argmax(predict_vector)
            #         next_token = idx_to_word[next_index]
            #         predict_sequence.append(next_token)
            #     print('Target output:', tar_text[i_idx].decode('utf8'))
            #     print('Predict output:', predict_sequence)
        print('Current iter_num is: %d' % iter_num)
        en_de_model.save_weights('en_de_weights1-40.h5')
    print('Train Ended')
def main():
    filep = pjoin(m_path, "cut_recovery_10", "Paging")
    file_num = len(os.listdir(pjoin(filep, "input")))
    file_names_idx = list(range(file_num))
    random.shuffle(file_names_idx)
    # train_names_idx = file_names_idx[:file_num/7]
    # test_names_idx = file_names_idx[file_num/7+1:]
    train_names_idx = file_names_idx[:5000]
    test_names_idx = file_names_idx[5001:6000]
    vocab = load_vocab(pjoin(m_path, "BaseLine-BigData_1kUE_20ENB_paging-Case_Group_1-Case_1",
                             "dic.txt"))

    # Reserve 0 for masking via pad_sequences.
    vocab_size = len(vocab) + 1  # the Keras Embedding layer needs len(vocab) + 1
    # input_maxlen = max(map(len, (x for x in input_list)))
    # output_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 200
    output_maxlen = 200
    output_dim = vocab_size
    hidden_dim = 300

    # print('-')
    # print('Vocab size:', vocab_size, 'unique words')
    # print('Input max length:', input_maxlen, 'words')
    # print('Target max length:', tar_maxlen, 'words')
    # print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    # print('-')
    # print('Vectorizing the word sequences...')
    word_to_idx = dict((c, i + 1) for i, c in enumerate(vocab))  # encoding: token -> index
    idx_to_word = dict((i + 1, c) for i, c in enumerate(vocab))  # decoding: index -> token

    decoder_mode = 1  # 0: plain seq2seq, 1: feed-previous decoder [1], 2: peek decoder [2], 3: attention decoder [3]
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                        output_length=output_maxlen, state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                         output_length=output_maxlen, state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                             output_length=output_maxlen, state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size, output_dim=hidden_dim,
                              input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(output_maxlen))
    en_de_model.add(decoder_top_layer)
    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f second!' % (time_end - time_start))

    for iter_num in range(10):
        input_sen, output_sen = load_data(filep, train_names_idx)
        input_list, output_list = io_list(input_sen, output_sen)
        inputs_train, outputs_train = vectorize_stories(input_list, output_list, vocab,
                                                        word_to_idx, input_maxlen,
                                                        output_maxlen, vocab_size)
        en_de_model.fit(inputs_train, outputs_train, batch_size=50, nb_epoch=5,
                        show_accuracy=True)
        out_predicts = en_de_model.predict(inputs_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                predict_sequence.append(next_token)
            # print('Target output:', toutput_text[i_idx])
            # print('Predict output:', predict_sequence)
        print('Current iter_num is: %d' % iter_num)
    print("test")
elif decoder_mode == 1:
    decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                    output_length=tar_maxlen, state_input=False,
                                    return_sequences=True)
elif decoder_mode == 2:
    decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                     output_length=tar_maxlen, state_input=False,
                                     return_sequences=True)
elif decoder_mode == 3:
    decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                         output_length=tar_maxlen, state_input=False,
                                         return_sequences=True)

en_de_model = Sequential()
print('Test 001 - jacoxu')
en_de_model.add(Embedding(input_dim=vocab_size, output_dim=hidden_dim,
                          input_length=input_maxlen))
print('Test 002 - jacoxu')
en_de_model.add(encoder_top_layer)
if decoder_mode == 0:
    en_de_model.add(RepeatVector(tar_maxlen))
en_de_model.add(decoder_top_layer)
def main():
    f = open("X_train.pkl", 'rb')
    X_train = pickle.load(f)
    '''
    f = open('word2index.pkl', 'r')
    word2index = pickle.load(f)
    f = open('index2word.pkl', 'r')
    index2word = pickle.load(f)
    inputs_train, tars_train = vectorize_stories(X_train, X_train, word2index,
                                                 maxlen, maxlen, vocab_size)
    '''
    X_train = pad_sequences(X_train, maxlen=maxlen)

    decoder_mode = 1  # 0: plain seq2seq, 1: feed-previous decoder [1], 2: peek decoder [2], 3: attention decoder [3]
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                        output_length=maxlen, state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                         output_length=maxlen, state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                             output_length=maxlen, state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size, output_dim=hidden_dim,
                              input_length=maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(maxlen))
    en_de_model.add(decoder_top_layer)
    en_de_model.add(TimeDistributedDense(vocab_size))
    en_de_model.add(Activation('softmax'))

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f second!' % (time_end - time_start))
    for iter_num in range(5000):
        en_de_model.fit(X_train, X_train, batch_size=3, nb_epoch=1, show_accuracy=True)
        out_predicts = en_de_model.predict(X_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            '''
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = index2word[next_index]
                predict_sequence.append(next_token)
            '''
            print('Target output:', X_train[i_idx])
            print('Predict output:', predict_sequence)
        print('Current iter_num is: %d' % iter_num)
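# Note: categorical_crossentropy against the TimeDistributedDense/softmax output expects
# one-hot targets of shape (n_samples, maxlen, vocab_size), whereas X_train above is a 2-D
# matrix of token indices. A minimal sketch of that conversion, assuming X_train has
# already been padded; the helper name to_one_hot is illustrative only.
import numpy as np


def to_one_hot(index_matrix, vocab_size):
    # (n_samples, maxlen) integer indices -> (n_samples, maxlen, vocab_size) one-hot targets.
    n_samples, seq_len = index_matrix.shape
    one_hot = np.zeros((n_samples, seq_len, vocab_size), dtype='float32')
    for i in range(n_samples):
        for t, idx in enumerate(index_matrix[i]):
            one_hot[i, t, idx] = 1.0
    return one_hot

# e.g. en_de_model.fit(X_train, to_one_hot(X_train, vocab_size), batch_size=3, nb_epoch=1)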
def train():
    # Despite its name, this variant loads pre-trained weights and serves predictions.
    # Toy examples kept from earlier experiments:
    # input_text = ['1 2 3 4 5', '6 7 8 9 10', '11 12 13 14 15',
    #               '16 17 18 19 20', '21 22 23 24 25']
    # tar_text = ['one two three four five', 'six seven eight nine ten',
    #             'eleven twelve thirteen fourteen fifteen',
    #             'sixteen seventeen eighteen nineteen twenty',
    #             'twenty_one twenty_two twenty_three twenty_four twenty_five']
    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))
    vocab = loaddic('./corpus/smalldic.txt')
    print('-----------')
    # print(vocab)
    print('-----------')

    # Reserve 0 for masking via pad_sequences.
    vocab_size = len(vocab) + 1  # the Keras Embedding layer needs len(vocab) + 1
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 70
    tar_maxlen = 17
    output_dim = vocab_size
    hidden_dim = 100

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict((c, i + 1) for i, c in enumerate(vocab))  # encoding: token -> index
    idx_to_word = dict((i + 1, c) for i, c in enumerate(vocab))  # decoding: index -> token

    decoder_mode = 3  # 0: plain seq2seq, 1: feed-previous decoder [1], 2: peek decoder [2], 3: attention decoder [3]
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                        output_length=tar_maxlen, state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                         output_length=tar_maxlen, state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                             output_length=tar_maxlen, state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size, output_dim=hidden_dim,
                              input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)
    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))
    en_de_model.load_weights('en_de_weights1-40.h5')

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f second!' % (time_end - time_start))

    # Batch prediction over a whole file, kept from an earlier version:
    # input_text = loadfile('./corpus/content-12147.txt')
    # input_text = loadfile('./corpus/content1-500.txt')
    # input_list = []
    # for tmp_input in input_text:
    #     input_list.append(chtokenize(tmp_input))
    # inputs_train = vectorize_x(input_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size)
    # out_predicts = en_de_model.predict(inputs_train)
    # for i_idx, out_predict in enumerate(out_predicts):
    #     predict_sequence = []
    #     tempstr = ''
    #     for predict_vector in out_predict:
    #         next_index = np.argmax(predict_vector)
    #         next_token = idx_to_word[next_index]
    #         # print(next_token)
    #         tempstr += next_token
    #         predict_sequence.append(next_token)
    #     print(tempstr)
    #     # print('Predict output:', predict_sequence)
    # print('Train Ended')

    # def predict(input_text):
    # Serve predictions over a simple TCP socket instead.
    import socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    host = socket.gethostbyname(socket.gethostname())
    port = 50008
    sock.bind((host, port))
    sock.listen(5)
    while True:
        conn, addr = sock.accept()
        data = conn.recv(1024)
        list = []
        # input_text = '实际上,上周主管部门就和大唐打过招呼了,内部消息人士透露,国资委已经就李小琳任职问题和大唐进行沟通,但李小琳本人至今未报到。情况比较复杂,上述人士表示,目前还不敢完全确定,不排除后续还有变化。'
        tmp = 'BEG ' + data + ' END'
        tmp = jiebacut(tmp)
        list.append(tmp)
        result = ''
        input_list = []
        for tmp_input in list:
            print(tmp_input)
            print('---!--!---')
            input_list.append(chtokenize(tmp_input))
        inputs_train = vectorize_x(input_list, word_to_idx, input_maxlen)
        out_predicts = en_de_model.predict(inputs_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            tempstr = ''
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                # print(next_token)
                tempstr += next_token
                predict_sequence.append(next_token)
            print(tempstr)
            result = tempstr
            print('Predict output:', predict_sequence)
        reply = result
        conn.send(reply.encode())
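# A minimal client-side sketch for the TCP service above, assuming it is reachable on the
# same machine at port 50008 and that the request and reply are UTF-8 encoded text. This
# client is not part of the original script; query_summarizer is an illustrative name.
import socket


def query_summarizer(text, host='127.0.0.1', port=50008):
    # Send one sentence and read back the predicted sequence.
    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    client.connect((host, port))
    client.send(text.encode('utf-8'))
    reply = client.recv(1024)
    client.close()
    return reply.decode('utf-8')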