def encode_labels(data, labenc=None, max_len=50):
    # Fall back to the module-level label encoder if none is provided
    if labenc is None:
        labenc = labelencoder
    if labenc is None:  # or onehotencoder is None:
        print "Error: labelencoder must be trained before it can be used!"
        return None
    #return onehotencoder.transform(labelencoder.transform(data))

    data2 = []
    num_labels = len(labenc.classes_)
    zero_vec = data_util.zero_vec(num_labels)
    print "data: " + str(len(data))
    for item in data:
        #print "item len: " + str(len(item))
        new_item = []
        if len(item) > 0:
            # Convert label strings to integer indices, then to one-hot vectors
            item2 = labenc.transform(item)
            for lab in item2:
                onehot = [0] * num_labels
                onehot[lab] = 1
                new_item.append(onehot)
        # Truncate and pad each sequence of one-hot vectors to max_len
        if len(new_item) > max_len:
            new_item = new_item[0:max_len]
        while len(new_item) < max_len:
            new_item.append(zero_vec)
        data2.append(new_item)
        #else:
        #    data2.append([])
    return data2
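# Example usage (a minimal sketch): assumes create_labelencoder(), defined
# elsewhere in this module (see get_feats), fits the module-level labelencoder
# on the label set. The label strings below are hypothetical.
def _example_encode_labels():
    create_labelencoder(['O', 'B-problem', 'I-problem'], 3)
    # Two sequences of string labels -> padded sequences of one-hot vectors
    encoded = encode_labels([['O', 'B-problem'], ['I-problem']], max_len=4)
    print "encoded[0]: " + str(encoded[0])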
def decode_sequence(encoder_model, decoder_model, input_seq, output_seq_len, output_dim, vec_labels=False):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, batch_size=1)

    # Generate empty target sequence of length 1.
    #output_dim = 5
    #print "output_dim: " + str(output_dim)
    target_seq = numpy.zeros((1, 1, int(output_dim)))

    # Populate the first character of target sequence with the start character.
    zero_lab = data_util.zero_vec(output_dim)
    if vec_labels:
        target_seq[0, 0] = zero_lab
    else:
        zero_lab = encode_labels([['O']])[0][0]
        index = zero_lab.index(1)
        target_seq[0, 0, index] = 1

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = []
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        #sampled_token_index = np.argmax(output_tokens[0, -1, :])
        #sampled_lab = reverse_target_char_index[sampled_token_index]
        #print "output_tokens shape: " + str(output_tokens.shape)
        token = output_tokens[0, -1]
        #print "token: " + str(token)
        encoded_label = numpy.zeros((output_dim, ), dtype=numpy.int).tolist()
        if vec_labels:
            # Keep the raw predicted vector as the label
            decoded_sentence.append(token.tolist())
        else:
            # Take the argmax and decode it back to a label string
            ind = numpy.argmax(token)
            encoded_label[ind] = 1
            #print "encoded_label: " + str(encoded_label)
            sampled_lab = decode_labels([encoded_label])[0]
            print "sampled_lab: " + str(sampled_lab)
            decoded_sentence.append(sampled_lab)

        # Exit condition: either hit max length or find stop character.
        if len(decoded_sentence) > output_seq_len:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = numpy.zeros((1, 1, output_dim))
        for x in range(output_dim):
            target_seq[0, 0, x] = token[x]

        # Update states
        states_value = [h, c]

    return decoded_sentence
def get_feats(seqs, train=False):
    print "get_feats"
    vec_model, dim = word2vec.load(vecfile)
    zero_vec = data_util.zero_vec(dim)
    feats = []
    labels = []
    global label_set
    label_set = set([])
    for s in seqs:
        s_feats = []
        s_labels = []
        for pair in s:
            word = pair[0]
            vector = word2vec.get(word, vec_model)
            s_feats.append(vector)
            s_labels.append(pair[1])
            label_set.add(pair[1])
        feats.append(s_feats)
        labels.append(s_labels)
    if train:
        num_labels = len(list(label_set))
        create_labelencoder(list(label_set), num_labels)
    global max_seq_len
    #max_seq_len = max([len(txt) for txt in feats])
    print "max_seq_len: " + str(max_seq_len)

    # Pad sequences
    #feats = pad_sequences(numpy.array(feats), maxlen=max_seq_len, dtype='float32', padding="pre")
    #labels = pad_sequences(numpy.array(labels), maxlen=max_seq_len, dtype='str', padding="pre", value='O')
    padded_feats = []
    padded_labels = []
    for feat in feats:
        #print "seq len: " + str(len(feat))
        # Split sequences longer than max_seq_len into max_seq_len chunks
        while len(feat) > max_seq_len:
            feat_part = feat[0:max_seq_len]
            padded_feats.append(pad_feat(feat_part, max_seq_len, zero_vec))
            feat = feat[max_seq_len:]
        new_feat = pad_feat(feat, max_seq_len, zero_vec)
        padded_feats.append(new_feat)
    for labs in labels:
        while len(labs) > max_seq_len:
            labs_part = labs[0:max_seq_len]
            padded_labels.append(pad_feat(labs_part, max_seq_len, 'O'))
            labs = labs[max_seq_len:]
        padded_labels.append(pad_feat(labs, max_seq_len, 'O'))
    feats = padded_feats
    labels = padded_labels

    # Encode labels
    encoded_labels = encode_labels(labels, max_len=max_seq_len)
    print "labels[0]: " + str(encoded_labels[0])
    #for row in labels:
    #    encoded_row = encode_labels(row)
    #    encoded_labels.append(encoded_row)
    print "feats: " + str(len(feats)) + " labels: " + str(len(encoded_labels))
    return feats, encoded_labels
def vectorize(phrase, vec_model, dim):
    stopwords = ['and', 'or', 'with', 'in', 'of', 'at', 'had', 'ho']
    words = phrase.split(' ')
    vecs = []
    zero_vec = numpy.asarray(data_util.zero_vec(dim))
    for word in words:
        if word not in stopwords:
            vecs.append(word2vec.get(word, vec_model))
    # Average vectors
    if len(vecs) > 0:
        avg_vec = numpy.average(numpy.asarray(vecs), axis=0)
        return avg_vec
    else:
        return zero_vec
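# Example usage (a minimal sketch): assumes the module global vecfile points at
# a word2vec model loadable via the word2vec helper module used above; the
# phrase is hypothetical.
def _example_vectorize():
    vec_model, dim = word2vec.load(vecfile)
    vec = vectorize("shortness of breath", vec_model, dim)
    print "phrase vector shape: " + str(vec.shape)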
def read_cluster_file(clusterfile, word2vec, dim, cluster_names=None):
    train = False
    if cluster_names is None:
        cluster_names = set()
        cluster_names.add(0)
        train = True
    keywords = []
    kw_vecs = []
    kw_clusters = []
    zero_vec = numpy.array(data_util.zero_vec(dim))
    with open(clusterfile, 'r') as f:
        for line in f:
            cols = line.split(',')
            kw = cols[0]
            clust = int(cols[1].strip())
            # Look up keyword in word2vec
            vec = zero_vec
            for word in kw.split(' '):
                vec2 = zero_vec
                # ignore stopwords
                #if word not in stopwords and word in word2vec:
                if word in word2vec:
                    vec2 = numpy.array(word2vec[word])
                vec = vec + vec2
            keywords.append(kw)
            kw_vecs.append(vec)
            kw_clusters.append(clust)
            if train:
                cluster_names.add(clust)
    # Convert cluster names to numbers 0 to num_clusters
    if train:
        cluster_names = list(cluster_names)
    for x in range(len(kw_clusters)):
        val = kw_clusters[x]
        if val in cluster_names:
            kw_clusters[x] = cluster_names.index(val)
        else:
            kw_clusters[x] = 0
    return keywords, kw_clusters, kw_vecs, cluster_names
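# Example usage (a minimal sketch): the cluster file names are hypothetical
# "keyword,cluster_id" CSV files, and word_vectors is assumed to be a dict-like
# mapping from word to vector (e.g. a loaded gensim KeyedVectors object).
def _example_read_cluster_file(word_vectors, dim):
    # First call (no cluster_names) builds the cluster-id mapping on the training file
    kws, clusters, vecs, names = read_cluster_file('clusters_train.csv', word_vectors, dim)
    # Reuse the learned mapping so test clusters get consistent indices
    test_kws, test_clusters, test_vecs, _ = read_cluster_file('clusters_test.csv', word_vectors, dim, cluster_names=names)
    return clusters, test_clusters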
def get(word, model):
    # Return the word's embedding, or a zero vector for out-of-vocabulary words
    dim = model.vector_size
    if word in model:  #.wv.vocab:
        return list(model[word])
    else:
        return data_util.zero_vec(dim)
def train_seq2seq(trainx, trainy, num_nodes=100, vec_labels=False, loss_function="cosine_proximity", num_epochs=10):
    trainx = numpy.array(trainx)
    print "trainx shape: " + str(trainx.shape)
    trainy = numpy.array(trainy)
    print "trainy shape: " + str(trainy.shape)
    input_dim = trainx.shape[-1]
    output_dim = trainy.shape[-1]
    input_seq_len = trainx.shape[1]
    output_seq_len = trainy.shape[1]

    # Create decoder target data: the decoder inputs shifted left by one
    # timestep, padded at the end with the 'O' (or zero) label
    trainy_target = []
    zero_lab = data_util.zero_vec(output_dim)
    if not vec_labels:
        zero_lab = encode_labels([['O']])[0][0]
    print "zero_lab shape: " + str(numpy.asarray(zero_lab).shape)
    for i in range(trainy.shape[0]):
        row = trainy[i].tolist()
        new_row = row[1:]
        new_row.append(zero_lab)
        trainy_target.append(new_row)
    trainy_target = numpy.asarray(trainy_target)
    print "trainy_target shape: " + str(trainy_target.shape)

    # Set up the encoder
    latent_dim = num_nodes
    dropout = 0.1
    encoder_inputs = Input(shape=(None, input_dim))  #seq_len
    encoder = LSTM(latent_dim, return_state=True)

    # Encoder-Decoder model
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    encoder_states = [state_h, state_c]

    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = Input(shape=(None, output_dim))
    decoder_rnn = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, d_state_h, d_state_c = decoder_rnn(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(output_dim, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='rmsprop', loss=loss_function)
    model.fit([trainx, trainy], trainy_target, epochs=num_epochs)

    # Normal RNN
    #rnn_out = GRU(latent_dim, return_sequences=False)(encoder_inputs)
    #dropout_out = Dropout(dropout)(rnn_out)
    #prediction = Dense(output_dim, activation='softmax')(dropout_out)
    #model = Model(inputs=encoder_inputs, outputs=prediction)
    #model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    #model.fit(trainx, trainy, nb_epoch=20)

    model.summary()
    model.save('seq2seq.model')

    # Create separate encoder and decoder models for inference
    encoder_model = Model(encoder_inputs, encoder_states)
    decoder_state_input_h = Input(shape=(latent_dim, ))
    decoder_state_input_c = Input(shape=(latent_dim, ))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_rnn(decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

    return model, encoder_model, decoder_model, output_dim
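# Example end-to-end usage (a minimal sketch): assumes train_seqs and test_seqs
# are lists of [(word, label), ...] sequences as expected by get_feats, and
# that the module globals vecfile and max_seq_len have been set.
def _example_train_and_decode(train_seqs, test_seqs):
    trainx, trainy = get_feats(train_seqs, train=True)
    model, encoder_model, decoder_model, output_dim = train_seq2seq(trainx, trainy, num_epochs=10)
    testx, testy = get_feats(test_seqs)
    output_seq_len = numpy.array(testy).shape[1]
    # Decode one test sequence at a time (decode_sequence assumes batch size 1)
    predictions = []
    for seq in testx:
        input_seq = numpy.array([seq])
        predictions.append(decode_sequence(encoder_model, decoder_model, input_seq, output_seq_len, output_dim))
    return predictions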
def cluster_embeddings(labels, clusterfile, vecfile, return_phrases=False, max_length=10):
    print "getting cluster embeddings"
    word2vec, dim = data_util.load_word2vec(vecfile)
    cluster_map = {}
    cluster_names = {}
    cluster_embeddings = []
    with open(clusterfile, 'r') as f:
        for line in f.readlines():
            phrases = line.strip().strip(',').split(',')
            key = int(phrases[0])
            phrases = phrases[1:]
            cluster_map[key] = phrases

    # Get cluster centers
    cluster_centers = {}
    cluster_vecs = {}
    center_file = clusterfile + ".centers"
    calculate_centers = True
    if os.path.exists(center_file):
        cluster_centers = get_cluster_centers(clusterfile)
        calculate_centers = False

    # Get the vectors for each phrase in the clusters
    for num in cluster_map.keys():
        #print "cluster " + str(num)
        vecs = []
        phrases = cluster_map[num]
        for phrase in phrases:
            words = phrase.split(' ')
            word_vecs = []
            for word in words:
                vec = data_util.zero_vec(dim)
                if word in word2vec:
                    vec = word2vec[word]
                word_vecs.append(vec)
            if len(word_vecs) == 0:
                #print "ZERO VEC: " + phrase
                phrase_vec = data_util.zero_vec(dim)
            else:
                phrase_vec = numpy.average(numpy.asarray(word_vecs), axis=0)
            vecs.append(phrase_vec)
        cluster_vecs[num] = vecs
        if calculate_centers:
            cluster_vec = numpy.average(numpy.asarray(vecs), axis=0)
            #print "cluster " + str(num) + " vec shape: " + str(cluster_vec.shape)
            cluster_centers[num] = cluster_vec

    # Get closest phrase
    if return_phrases:
        for num in cluster_map.keys():
            cluster_vec = cluster_centers[num]
            phrases = cluster_map[num]
            vecs = cluster_vecs[num]
            #print 'phrases: ' + str(len(phrases)) + ', vecs: ' + str(len(vecs))
            best_vec = data_util.zero_vec(dim)
            best_phrase = ""
            best_dist = 10000000.0
            for x in range(len(phrases)):
                phrase = phrases[x]
                phrase_len = len(phrase.split(' '))
                phrase_vec = vecs[x]
                dist_temp = numpy.linalg.norm(phrase_vec - cluster_vec)
                # Length penalty
                dist = dist_temp * phrase_len
                #print "phrase: " + phrase + ", dist: " + str(dist)
                if dist < best_dist:
                    best_dist = dist
                    best_vec = phrase_vec
                    best_phrase = phrase
            #print "best phrase: " + best_phrase
            cluster_names[num] = best_phrase

    zero_vec = data_util.zero_vec(dim)
    kw_names = []
    for kw_list in labels:
        #print "kw_list: " + str(type(kw_list)) + " : " + str(kw_list)
        if type(kw_list) is str:
            kw_list = kw_list.split(',')
        kw_embeddings = []
        kw_text = []
        for cluster_num in kw_list:
            if cluster_num != '':
                #print "converting cluster " + cluster_num
                num = int(cluster_num)
                vec = cluster_centers[num]
                #print "vec: " + str(len(vec))
                kw_embeddings.append(vec)
                if return_phrases:
                    name = cluster_names[num]
                    kw_text.append(name)
        kw_names.append(kw_text)
        # Pad vectors
        while len(kw_embeddings) < max_length:
            kw_embeddings.insert(0, zero_vec)
        if len(kw_embeddings) > max_length:
            kw_embeddings = kw_embeddings[:max_length]
        #print "kw_embeddings: " + str(len(kw_embeddings))
        cluster_embeddings.append(kw_embeddings)

    if return_phrases:
        # Write cluster name mapping to file
        outname = clusterfile + ".names"
        outfile = open(outname, 'w')
        for key in cluster_names.keys():
            name = cluster_names[key]
            phrases = cluster_map[key]
            outfile.write(str(key) + " : " + name + " : " + str(phrases) + "\n")
        outfile.close()
        return cluster_embeddings, kw_names
    else:
        return cluster_embeddings
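# Example usage (a minimal sketch): the file names are hypothetical; each entry
# in labels is a comma-separated string of cluster ids, and cluster_file /
# vec_file point at the cluster list and word2vec files described above.
def _example_cluster_embeddings(labels, cluster_file, vec_file):
    embeddings, names = cluster_embeddings(labels, cluster_file, vec_file, return_phrases=True, max_length=10)
    # embeddings: one list of max_length phrase vectors per input row
    # names: the representative phrase chosen for each cluster in that row
    print "rows: " + str(len(embeddings)) + ", first row names: " + str(names[0])
    return embeddings, names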