import numpy
from keras.layers import Dense, Input, LSTM
from keras.models import Model

# NOTE: this code relies on helpers and globals defined elsewhere in the repo:
# data_util/data_util3 (zero_vec), word2vec (embedding loading), and the
# module-level labelencoder, debug, max_seq_len, vecfile, and label_set.

def encode_labels(data, labenc=None, max_len=50, pad=True):
    """One-hot encode a list of label sequences, optionally truncating and
    padding each sequence to max_len vectors."""
    if labenc is None:
        labenc = labelencoder  # Fall back to the module-level encoder
    if labenc is None:
        print("Error: labelencoder must be trained before it can be used!")
        return None
    data2 = []
    num_labels = len(labenc.classes_)
    zero_vec = data_util.zero_vec(num_labels)
    if debug:
        print("data:", str(len(data)))
    for item in data:
        new_item = []
        if len(item) > 0:
            # Map each label string to its integer index, then one-hot encode it
            item2 = labenc.transform(item)
            for lab in item2:
                onehot = [0] * num_labels
                onehot[lab] = 1
                new_item.append(onehot)
        # Truncate and pad the sequence to exactly max_len vectors
        if pad:
            if len(new_item) > max_len:
                new_item = new_item[0:max_len]
            while len(new_item) < max_len:
                new_item.append(zero_vec)
        data2.append(new_item)
    return data2
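# A minimal usage sketch (not part of the original module): with a toy
# LabelEncoder standing in for the trained global `labelencoder`, each label
# sequence becomes max_len one-hot/zero vectors of width len(classes_)
# (assuming data_util.zero_vec(n) returns a list of n zeros).
def _example_encode_labels():
    from sklearn.preprocessing import LabelEncoder
    enc = LabelEncoder()
    enc.fit(['B', 'I', 'O'])
    encoded = encode_labels([['B', 'O']], labenc=enc, max_len=4)
    # encoded[0] -> [[1, 0, 0], [0, 0, 1], [0, 0, 0], [0, 0, 0]]
    return encoded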
def decode_sequence(encoder_model, decoder_model, input_seq, output_seq_len, output_dim, vec_labels=False):
    """Greedily decode one input sequence using the inference encoder/decoder."""
    # Encode the input sequence into the initial decoder state vectors [h, c].
    states_value = encoder_model.predict(input_seq, batch_size=1)

    # Generate an empty target sequence of length 1 and populate its first
    # timestep with the start label ('O', or a zero vector for vector labels).
    target_seq = numpy.zeros((1, 1, int(output_dim)))
    zero_lab = data_util.zero_vec(output_dim)
    if vec_labels:
        target_seq[0, 0] = zero_lab
    else:
        zero_lab = encode_labels([['O']])[0][0]
        index = zero_lab.index(1)
        target_seq[0, 0, index] = 1

    # Sampling loop (to simplify, we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = []
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        token = output_tokens[0, -1]
        if vec_labels:
            # Keep the raw predicted label vector
            decoded_sentence.append(token.tolist())
        else:
            # Sample the most likely label and decode it back to its string form
            encoded_label = numpy.zeros((output_dim,), dtype=int).tolist()
            ind = numpy.argmax(token)
            encoded_label[ind] = 1
            sampled_lab = decode_labels([encoded_label])[0]
            print("sampled_lab:", str(sampled_lab))
            decoded_sentence.append(sampled_lab)

        # Exit condition: the output has reached the maximum length.
        if len(decoded_sentence) >= output_seq_len:
            stop_condition = True

        # Feed the predicted distribution back in as the next decoder input.
        target_seq = numpy.zeros((1, 1, output_dim))
        for x in range(output_dim):
            target_seq[0, 0, x] = token[x]

        # Update states
        states_value = [h, c]
    return decoded_sentence
def get_feats(seqs, pad=True, train=False):
    """Convert (word, label) sequences into word-vector features and one-hot labels."""
    print("get_feats")
    vec_model, dim = word2vec.load(vecfile)
    zero_vec = data_util.zero_vec(dim)
    feats = []
    labels = []
    global label_set
    label_set = set()
    for s in seqs:
        s_feats = []
        s_labels = []
        for pair in s:
            word = pair[0]
            vector = word2vec.get(word, vec_model)
            s_feats.append(vector)
            s_labels.append(pair[1])
            label_set.add(pair[1])
        feats.append(s_feats)
        labels.append(s_labels)

    # At training time, fit the label encoder on the observed label set
    if train:
        num_labels = len(label_set)
        if debug:
            print("num original labels:", str(num_labels))
        label_list = list(label_set)
        label_list.append('IE')  # Add the IE label in case it wasn't in the training data
        print('label_list:', str(label_list))
        create_labelencoder(label_list, num_labels)

    global max_seq_len  # Module-level maximum sequence length
    print("max_seq_len:", str(max_seq_len))

    # Split sequences longer than max_seq_len into chunks, then pad every
    # chunk to exactly max_seq_len (zero vectors for feats, 'O' for labels)
    if pad:
        padded_feats = []
        padded_labels = []
        for feat in feats:
            while len(feat) > max_seq_len:
                feat_part = feat[0:max_seq_len]
                padded_feats.append(pad_feat(feat_part, max_seq_len, zero_vec))
                feat = feat[max_seq_len:]
            padded_feats.append(pad_feat(feat, max_seq_len, zero_vec))
        for labs in labels:
            while len(labs) > max_seq_len:
                labs_part = labs[0:max_seq_len]
                padded_labels.append(pad_feat(labs_part, max_seq_len, 'O'))
                labs = labs[max_seq_len:]
            padded_labels.append(pad_feat(labs, max_seq_len, 'O'))
        feats = padded_feats
        labels = padded_labels

    # One-hot encode the label sequences
    print("labels[0]:", str(labels[0]))
    encoded_labels = encode_labels(labels, max_len=max_seq_len, pad=pad)
    print("encoded_labels[0]:", str(encoded_labels[0]))
    print("feats:", str(len(feats)), " labels:", str(len(encoded_labels)))
    return feats, encoded_labels
def train_seq2seq(trainx, trainy, num_nodes=100, vec_labels=False, loss_function="cosine_proximity", num_epochs=10):
    """Train an encoder-decoder LSTM and build the inference-time models."""
    trainx = numpy.array(trainx)
    print("trainx shape:", str(trainx.shape))
    trainy = numpy.array(trainy)
    print("trainy shape:", str(trainy.shape))
    input_dim = trainx.shape[-1]
    output_dim = trainy.shape[-1]

    # Create the decoder target data: the decoder input labels shifted left by
    # one timestep, with the 'O' label (or a zero vector) appended at the end.
    trainy_target = []
    zero_lab = data_util.zero_vec(output_dim)
    if not vec_labels:
        zero_lab = encode_labels([['O']])[0][0]
    print("zero_lab shape:", str(numpy.asarray(zero_lab).shape))
    for i in range(trainy.shape[0]):
        row = trainy[i].tolist()
        new_row = row[1:]
        new_row.append(zero_lab)
        trainy_target.append(new_row)
    trainy_target = numpy.asarray(trainy_target)
    print("trainy_target shape:", str(trainy_target.shape))

    # Set up the encoder; only its final [h, c] states are passed on
    latent_dim = num_nodes
    encoder_inputs = Input(shape=(None, input_dim))
    encoder = LSTM(latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    encoder_states = [state_h, state_c]

    # Set up the decoder, using `encoder_states` as its initial state
    decoder_inputs = Input(shape=(None, output_dim))
    decoder_rnn = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, d_state_h, d_state_c = decoder_rnn(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(output_dim, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Define the model that turns `encoder_input_data` & `decoder_input_data`
    # into `decoder_target_data`, and train it with teacher forcing
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='rmsprop', loss=loss_function)
    model.fit([trainx, trainy], trainy_target, epochs=num_epochs)
    model.summary()
    model.save('seq2seq.model')

    # Create separate encoder and decoder models for inference
    encoder_model = Model(encoder_inputs, encoder_states)
    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_state_input_c = Input(shape=(latent_dim,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_rnn(decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    return model, encoder_model, decoder_model, output_dim
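# End-to-end usage sketch (not in the original file): featurize annotated
# sentences, train, and greedily decode one test sequence. Assumes `train_seqs`
# and `test_seqs` are lists of [(word, label), ...] sentences and that the
# module globals (vecfile, max_seq_len, labelencoder) are set up as above.
def _example_train_and_decode(train_seqs, test_seqs):
    trainx, trainy = get_feats(train_seqs, train=True)
    testx, _testy = get_feats(test_seqs)
    model, enc_model, dec_model, out_dim = train_seq2seq(trainx, trainy)
    test_seq = numpy.array([testx[0]])  # shape: (1, max_seq_len, input_dim)
    return decode_sequence(enc_model, dec_model, test_seq,
                           output_seq_len=max_seq_len, output_dim=out_dim)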
def get_vec(word, model):
    """Return the embedding for word as a list, or a zero vector if it is out of vocabulary."""
    dim = model.vector_size
    if word in model:  # Membership check against the model's vocabulary
        return model[word].tolist()
    else:
        return data_util3.zero_vec(dim)
def get(word, model):
    """Return the embedding for word as a list, or a zero vector if it is out of vocabulary."""
    dim = model.vector_size
    if word in model:  # Membership check against the model's vocabulary
        return list(model[word])
    else:
        return data_util.zero_vec(dim)